diff options
Diffstat (limited to 'src/backend/executor/nodeSamplescan.c')
-rw-r--r-- | src/backend/executor/nodeSamplescan.c | 437 |
1 files changed, 391 insertions, 46 deletions
diff --git a/src/backend/executor/nodeSamplescan.c b/src/backend/executor/nodeSamplescan.c index 4c1c5237b7d..dbe84b0baa8 100644 --- a/src/backend/executor/nodeSamplescan.c +++ b/src/backend/executor/nodeSamplescan.c @@ -3,7 +3,7 @@ * nodeSamplescan.c * Support routines for sample scans of relations (table sampling). * - * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -14,22 +14,23 @@ */ #include "postgres.h" -#include "access/tablesample.h" +#include "access/hash.h" +#include "access/relscan.h" +#include "access/tsmapi.h" #include "executor/executor.h" #include "executor/nodeSamplescan.h" #include "miscadmin.h" -#include "parser/parsetree.h" #include "pgstat.h" -#include "storage/bufmgr.h" #include "storage/predicate.h" #include "utils/rel.h" -#include "utils/syscache.h" #include "utils/tqual.h" -static void InitScanRelation(SampleScanState *node, EState *estate, - int eflags, TableSampleClause *tablesample); +static void InitScanRelation(SampleScanState *node, EState *estate, int eflags); static TupleTableSlot *SampleNext(SampleScanState *node); - +static void tablesample_init(SampleScanState *scanstate); +static HeapTuple tablesample_getnext(SampleScanState *scanstate); +static bool SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, + HeapScanDesc scan); /* ---------------------------------------------------------------- * Scan Support @@ -45,23 +46,26 @@ static TupleTableSlot *SampleNext(SampleScanState *node); static TupleTableSlot * SampleNext(SampleScanState *node) { - TupleTableSlot *slot; - TableSampleDesc *tsdesc; HeapTuple tuple; + TupleTableSlot *slot; /* - * get information from the scan state + * if this is first call within a scan, initialize */ - slot = node->ss.ss_ScanTupleSlot; - tsdesc = node->tsdesc; + if (!node->begun) + tablesample_init(node); + + /* + * get the next tuple, and store it in our result slot + */ + tuple = tablesample_getnext(node); - tuple = tablesample_getnext(tsdesc); + slot = node->ss.ss_ScanTupleSlot; if (tuple) ExecStoreTuple(tuple, /* tuple to store */ slot, /* slot to store in */ - tsdesc->heapScan->rs_cbuf, /* buffer associated - * with this tuple */ + node->ss.ss_currentScanDesc->rs_cbuf, /* tuple's buffer */ false); /* don't pfree this pointer */ else ExecClearTuple(slot); @@ -75,7 +79,10 @@ SampleNext(SampleScanState *node) static bool SampleRecheck(SampleScanState *node, TupleTableSlot *slot) { - /* No need to recheck for SampleScan */ + /* + * No need to recheck for SampleScan, since like SeqScan we don't pass any + * checkable keys to heap_beginscan. + */ return true; } @@ -103,8 +110,7 @@ ExecSampleScan(SampleScanState *node) * ---------------------------------------------------------------- */ static void -InitScanRelation(SampleScanState *node, EState *estate, int eflags, - TableSampleClause *tablesample) +InitScanRelation(SampleScanState *node, EState *estate, int eflags) { Relation currentRelation; @@ -113,19 +119,13 @@ InitScanRelation(SampleScanState *node, EState *estate, int eflags, * open that relation and acquire appropriate lock on it. */ currentRelation = ExecOpenScanRelation(estate, - ((SampleScan *) node->ss.ps.plan)->scanrelid, + ((SampleScan *) node->ss.ps.plan)->scan.scanrelid, eflags); node->ss.ss_currentRelation = currentRelation; - /* - * Even though we aren't going to do a conventional seqscan, it is useful - * to create a HeapScanDesc --- many of the fields in it are usable. - */ - node->ss.ss_currentScanDesc = - heap_beginscan_sampling(currentRelation, estate->es_snapshot, 0, NULL, - tablesample->tsmseqscan, - tablesample->tsmpagemode); + /* we won't set up the HeapScanDesc till later */ + node->ss.ss_currentScanDesc = NULL; /* and report the scan tuple slot's rowtype */ ExecAssignScanType(&node->ss, RelationGetDescr(currentRelation)); @@ -140,12 +140,11 @@ SampleScanState * ExecInitSampleScan(SampleScan *node, EState *estate, int eflags) { SampleScanState *scanstate; - RangeTblEntry *rte = rt_fetch(node->scanrelid, - estate->es_range_table); + TableSampleClause *tsc = node->tablesample; + TsmRoutine *tsm; Assert(outerPlan(node) == NULL); Assert(innerPlan(node) == NULL); - Assert(rte->tablesample != NULL); /* * create state structure @@ -165,10 +164,17 @@ ExecInitSampleScan(SampleScan *node, EState *estate, int eflags) * initialize child expressions */ scanstate->ss.ps.targetlist = (List *) - ExecInitExpr((Expr *) node->plan.targetlist, + ExecInitExpr((Expr *) node->scan.plan.targetlist, (PlanState *) scanstate); scanstate->ss.ps.qual = (List *) - ExecInitExpr((Expr *) node->plan.qual, + ExecInitExpr((Expr *) node->scan.plan.qual, + (PlanState *) scanstate); + + scanstate->args = (List *) + ExecInitExpr((Expr *) tsc->args, + (PlanState *) scanstate); + scanstate->repeatable = + ExecInitExpr(tsc->repeatable, (PlanState *) scanstate); /* @@ -180,7 +186,7 @@ ExecInitSampleScan(SampleScan *node, EState *estate, int eflags) /* * initialize scan relation */ - InitScanRelation(scanstate, estate, eflags, rte->tablesample); + InitScanRelation(scanstate, estate, eflags); scanstate->ss.ps.ps_TupFromTlist = false; @@ -190,7 +196,25 @@ ExecInitSampleScan(SampleScan *node, EState *estate, int eflags) ExecAssignResultTypeFromTL(&scanstate->ss.ps); ExecAssignScanProjectionInfo(&scanstate->ss); - scanstate->tsdesc = tablesample_init(scanstate, rte->tablesample); + /* + * If we don't have a REPEATABLE clause, select a random seed. We want to + * do this just once, since the seed shouldn't change over rescans. + */ + if (tsc->repeatable == NULL) + scanstate->seed = random(); + + /* + * Finally, initialize the TABLESAMPLE method handler. + */ + tsm = GetTsmRoutine(tsc->tsmhandler); + scanstate->tsmroutine = tsm; + scanstate->tsm_state = NULL; + + if (tsm->InitSampleScan) + tsm->InitSampleScan(scanstate, eflags); + + /* We'll do BeginSampleScan later; we can't evaluate params yet */ + scanstate->begun = false; return scanstate; } @@ -207,7 +231,8 @@ ExecEndSampleScan(SampleScanState *node) /* * Tell sampling function that we finished the scan. */ - tablesample_end(node->tsdesc); + if (node->tsmroutine->EndSampleScan) + node->tsmroutine->EndSampleScan(node); /* * Free the exprcontext @@ -223,7 +248,8 @@ ExecEndSampleScan(SampleScanState *node) /* * close heap scan */ - heap_endscan(node->ss.ss_currentScanDesc); + if (node->ss.ss_currentScanDesc) + heap_endscan(node->ss.ss_currentScanDesc); /* * close the heap relation. @@ -232,11 +258,6 @@ ExecEndSampleScan(SampleScanState *node) } /* ---------------------------------------------------------------- - * Join Support - * ---------------------------------------------------------------- - */ - -/* ---------------------------------------------------------------- * ExecReScanSampleScan * * Rescans the relation. @@ -246,12 +267,336 @@ ExecEndSampleScan(SampleScanState *node) void ExecReScanSampleScan(SampleScanState *node) { - heap_rescan(node->ss.ss_currentScanDesc, NULL); + /* Remember we need to do BeginSampleScan again (if we did it at all) */ + node->begun = false; + + ExecScanReScan(&node->ss); +} + + +/* + * Initialize the TABLESAMPLE method: evaluate params and call BeginSampleScan. + */ +static void +tablesample_init(SampleScanState *scanstate) +{ + TsmRoutine *tsm = scanstate->tsmroutine; + ExprContext *econtext = scanstate->ss.ps.ps_ExprContext; + Datum *params; + Datum datum; + bool isnull; + uint32 seed; + bool allow_sync; + int i; + ListCell *arg; + + params = (Datum *) palloc(list_length(scanstate->args) * sizeof(Datum)); + + i = 0; + foreach(arg, scanstate->args) + { + ExprState *argstate = (ExprState *) lfirst(arg); + + params[i] = ExecEvalExprSwitchContext(argstate, + econtext, + &isnull, + NULL); + if (isnull) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT), + errmsg("TABLESAMPLE parameter cannot be null"))); + i++; + } + + if (scanstate->repeatable) + { + datum = ExecEvalExprSwitchContext(scanstate->repeatable, + econtext, + &isnull, + NULL); + if (isnull) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TABLESAMPLE_REPEAT), + errmsg("TABLESAMPLE REPEATABLE parameter cannot be null"))); + + /* + * The REPEATABLE parameter has been coerced to float8 by the parser. + * The reason for using float8 at the SQL level is that it will + * produce unsurprising results both for users used to databases that + * accept only integers in the REPEATABLE clause and for those who + * might expect that REPEATABLE works like setseed() (a float in the + * range from -1 to 1). + * + * We use hashfloat8() to convert the supplied value into a suitable + * seed. For regression-testing purposes, that has the convenient + * property that REPEATABLE(0) gives a machine-independent result. + */ + seed = DatumGetUInt32(DirectFunctionCall1(hashfloat8, datum)); + } + else + { + /* Use the seed selected by ExecInitSampleScan */ + seed = scanstate->seed; + } + + /* Set default values for params that BeginSampleScan can adjust */ + scanstate->use_bulkread = true; + scanstate->use_pagemode = true; + + /* Let tablesample method do its thing */ + tsm->BeginSampleScan(scanstate, + params, + list_length(scanstate->args), + seed); + + /* We'll use syncscan if there's no NextSampleBlock function */ + allow_sync = (tsm->NextSampleBlock == NULL); + + /* Now we can create or reset the HeapScanDesc */ + if (scanstate->ss.ss_currentScanDesc == NULL) + { + scanstate->ss.ss_currentScanDesc = + heap_beginscan_sampling(scanstate->ss.ss_currentRelation, + scanstate->ss.ps.state->es_snapshot, + 0, NULL, + scanstate->use_bulkread, + allow_sync, + scanstate->use_pagemode); + } + else + { + heap_rescan_set_params(scanstate->ss.ss_currentScanDesc, NULL, + scanstate->use_bulkread, + allow_sync, + scanstate->use_pagemode); + } + + pfree(params); + + /* And we're initialized. */ + scanstate->begun = true; +} + +/* + * Get next tuple from TABLESAMPLE method. + * + * Note: an awful lot of this is copied-and-pasted from heapam.c. It would + * perhaps be better to refactor to share more code. + */ +static HeapTuple +tablesample_getnext(SampleScanState *scanstate) +{ + TsmRoutine *tsm = scanstate->tsmroutine; + HeapScanDesc scan = scanstate->ss.ss_currentScanDesc; + HeapTuple tuple = &(scan->rs_ctup); + Snapshot snapshot = scan->rs_snapshot; + bool pagemode = scan->rs_pageatatime; + BlockNumber blockno; + Page page; + bool all_visible; + OffsetNumber maxoffset; + + if (!scan->rs_inited) + { + /* + * return null immediately if relation is empty + */ + if (scan->rs_nblocks == 0) + { + Assert(!BufferIsValid(scan->rs_cbuf)); + tuple->t_data = NULL; + return NULL; + } + if (tsm->NextSampleBlock) + { + blockno = tsm->NextSampleBlock(scanstate); + if (!BlockNumberIsValid(blockno)) + { + tuple->t_data = NULL; + return NULL; + } + } + else + blockno = scan->rs_startblock; + Assert(blockno < scan->rs_nblocks); + heapgetpage(scan, blockno); + scan->rs_inited = true; + } + else + { + /* continue from previously returned page/tuple */ + blockno = scan->rs_cblock; /* current page */ + } /* - * Tell sampling function to reset its state for rescan. + * When not using pagemode, we must lock the buffer during tuple + * visibility checks. */ - tablesample_reset(node->tsdesc); + if (!pagemode) + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); + + page = (Page) BufferGetPage(scan->rs_cbuf); + all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery; + maxoffset = PageGetMaxOffsetNumber(page); + + for (;;) + { + OffsetNumber tupoffset; + bool finished; + + CHECK_FOR_INTERRUPTS(); + + /* Ask the tablesample method which tuples to check on this page. */ + tupoffset = tsm->NextSampleTuple(scanstate, + blockno, + maxoffset); + + if (OffsetNumberIsValid(tupoffset)) + { + ItemId itemid; + bool visible; + + /* Skip invalid tuple pointers. */ + itemid = PageGetItemId(page, tupoffset); + if (!ItemIdIsNormal(itemid)) + continue; + + tuple->t_data = (HeapTupleHeader) PageGetItem(page, itemid); + tuple->t_len = ItemIdGetLength(itemid); + ItemPointerSet(&(tuple->t_self), blockno, tupoffset); + + if (all_visible) + visible = true; + else + visible = SampleTupleVisible(tuple, tupoffset, scan); + + /* in pagemode, heapgetpage did this for us */ + if (!pagemode) + CheckForSerializableConflictOut(visible, scan->rs_rd, tuple, + scan->rs_cbuf, snapshot); + + if (visible) + { + /* Found visible tuple, return it. */ + if (!pagemode) + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); + break; + } + else + { + /* Try next tuple from same page. */ + continue; + } + } + + /* + * if we get here, it means we've exhausted the items on this page and + * it's time to move to the next. + */ + if (!pagemode) + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); + + if (tsm->NextSampleBlock) + { + blockno = tsm->NextSampleBlock(scanstate); + Assert(!scan->rs_syncscan); + finished = !BlockNumberIsValid(blockno); + } + else + { + /* Without NextSampleBlock, just do a plain forward seqscan. */ + blockno++; + if (blockno >= scan->rs_nblocks) + blockno = 0; + + /* + * Report our new scan position for synchronization purposes. + * + * Note: we do this before checking for end of scan so that the + * final state of the position hint is back at the start of the + * rel. That's not strictly necessary, but otherwise when you run + * the same query multiple times the starting position would shift + * a little bit backwards on every invocation, which is confusing. + * We don't guarantee any specific ordering in general, though. + */ + if (scan->rs_syncscan) + ss_report_location(scan->rs_rd, blockno); + + finished = (blockno == scan->rs_startblock); + } + + /* + * Reached end of scan? + */ + if (finished) + { + if (BufferIsValid(scan->rs_cbuf)) + ReleaseBuffer(scan->rs_cbuf); + scan->rs_cbuf = InvalidBuffer; + scan->rs_cblock = InvalidBlockNumber; + tuple->t_data = NULL; + scan->rs_inited = false; + return NULL; + } + + Assert(blockno < scan->rs_nblocks); + heapgetpage(scan, blockno); + + /* Re-establish state for new page */ + if (!pagemode) + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); + + page = (Page) BufferGetPage(scan->rs_cbuf); + all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery; + maxoffset = PageGetMaxOffsetNumber(page); + } + + /* Count successfully-fetched tuples as heap fetches */ + pgstat_count_heap_getnext(scan->rs_rd); + + return &(scan->rs_ctup); +} - ExecScanReScan(&node->ss); +/* + * Check visibility of the tuple. + */ +static bool +SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, HeapScanDesc scan) +{ + if (scan->rs_pageatatime) + { + /* + * In pageatatime mode, heapgetpage() already did visibility checks, + * so just look at the info it left in rs_vistuples[]. + * + * We use a binary search over the known-sorted array. Note: we could + * save some effort if we insisted that NextSampleTuple select tuples + * in increasing order, but it's not clear that there would be enough + * gain to justify the restriction. + */ + int start = 0, + end = scan->rs_ntuples - 1; + + while (start <= end) + { + int mid = (start + end) / 2; + OffsetNumber curoffset = scan->rs_vistuples[mid]; + + if (tupoffset == curoffset) + return true; + else if (tupoffset < curoffset) + end = mid - 1; + else + start = mid + 1; + } + + return false; + } + else + { + /* Otherwise, we have to check the tuple individually. */ + return HeapTupleSatisfiesVisibility(tuple, + scan->rs_snapshot, + scan->rs_cbuf); + } } |