diff options
Diffstat (limited to 'src/backend/access')
-rw-r--r-- | src/backend/access/Makefile | 3 | ||||
-rw-r--r-- | src/backend/access/heap/heapam.c | 41 | ||||
-rw-r--r-- | src/backend/access/tablesample/Makefile | 17 | ||||
-rw-r--r-- | src/backend/access/tablesample/bernoulli.c | 235 | ||||
-rw-r--r-- | src/backend/access/tablesample/system.c | 186 | ||||
-rw-r--r-- | src/backend/access/tablesample/tablesample.c | 368 |
6 files changed, 837 insertions, 13 deletions
diff --git a/src/backend/access/Makefile b/src/backend/access/Makefile index 21721b48f04..bd93a6a8d1e 100644 --- a/src/backend/access/Makefile +++ b/src/backend/access/Makefile @@ -8,6 +8,7 @@ subdir = src/backend/access top_builddir = ../../.. include $(top_builddir)/src/Makefile.global -SUBDIRS = brin common gin gist hash heap index nbtree rmgrdesc spgist transam +SUBDIRS = brin common gin gist hash heap index nbtree rmgrdesc spgist \ + tablesample transam include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 1a8d2f2d0b5..f0c2394e600 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -80,8 +80,9 @@ bool synchronize_seqscans = true; static HeapScanDesc heap_beginscan_internal(Relation relation, Snapshot snapshot, int nkeys, ScanKey key, - bool allow_strat, bool allow_sync, - bool is_bitmapscan, bool temp_snap); + bool allow_strat, bool allow_sync, bool allow_pagemode, + bool is_bitmapscan, bool is_samplescan, + bool temp_snap); static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid, CommandId cid, int options); static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, @@ -294,9 +295,10 @@ initscan(HeapScanDesc scan, ScanKey key, bool is_rescan) /* * Currently, we don't have a stats counter for bitmap heap scans (but the - * underlying bitmap index scans will be counted). + * underlying bitmap index scans will be counted) or sample scans (we only + * update stats for tuple fetches there) */ - if (!scan->rs_bitmapscan) + if (!scan->rs_bitmapscan && !scan->rs_samplescan) pgstat_count_heap_scan(scan->rs_rd); } @@ -315,7 +317,7 @@ heap_setscanlimits(HeapScanDesc scan, BlockNumber startBlk, BlockNumber numBlks) * In page-at-a-time mode it performs additional work, namely determining * which tuples on the page are visible. 
*/ -static void +void heapgetpage(HeapScanDesc scan, BlockNumber page) { Buffer buffer; @@ -1310,6 +1312,9 @@ heap_openrv_extended(const RangeVar *relation, LOCKMODE lockmode, * HeapScanDesc for a bitmap heap scan. Although that scan technology is * really quite unlike a standard seqscan, there is just enough commonality * to make it worth using the same data structure. + * + * heap_beginscan_sampling is an alternate entry point for setting up a + * HeapScanDesc for a TABLESAMPLE scan. * ---------------- */ HeapScanDesc @@ -1317,7 +1322,7 @@ heap_beginscan(Relation relation, Snapshot snapshot, int nkeys, ScanKey key) { return heap_beginscan_internal(relation, snapshot, nkeys, key, - true, true, false, false); + true, true, true, false, false, false); } HeapScanDesc @@ -1327,7 +1332,7 @@ heap_beginscan_catalog(Relation relation, int nkeys, ScanKey key) Snapshot snapshot = RegisterSnapshot(GetCatalogSnapshot(relid)); return heap_beginscan_internal(relation, snapshot, nkeys, key, - true, true, false, true); + true, true, true, false, false, true); } HeapScanDesc @@ -1336,7 +1341,8 @@ heap_beginscan_strat(Relation relation, Snapshot snapshot, bool allow_strat, bool allow_sync) { return heap_beginscan_internal(relation, snapshot, nkeys, key, - allow_strat, allow_sync, false, false); + allow_strat, allow_sync, true, + false, false, false); } HeapScanDesc @@ -1344,14 +1350,24 @@ heap_beginscan_bm(Relation relation, Snapshot snapshot, int nkeys, ScanKey key) { return heap_beginscan_internal(relation, snapshot, nkeys, key, - false, false, true, false); + false, false, true, true, false, false); +} + +HeapScanDesc +heap_beginscan_sampling(Relation relation, Snapshot snapshot, + int nkeys, ScanKey key, + bool allow_strat, bool allow_pagemode) +{ + return heap_beginscan_internal(relation, snapshot, nkeys, key, + allow_strat, false, allow_pagemode, + false, true, false); } static HeapScanDesc heap_beginscan_internal(Relation relation, Snapshot snapshot, int nkeys, ScanKey key, 
- bool allow_strat, bool allow_sync, - bool is_bitmapscan, bool temp_snap) + bool allow_strat, bool allow_sync, bool allow_pagemode, + bool is_bitmapscan, bool is_samplescan, bool temp_snap) { HeapScanDesc scan; @@ -1373,6 +1389,7 @@ heap_beginscan_internal(Relation relation, Snapshot snapshot, scan->rs_snapshot = snapshot; scan->rs_nkeys = nkeys; scan->rs_bitmapscan = is_bitmapscan; + scan->rs_samplescan = is_samplescan; scan->rs_strategy = NULL; /* set in initscan */ scan->rs_allow_strat = allow_strat; scan->rs_allow_sync = allow_sync; @@ -1381,7 +1398,7 @@ heap_beginscan_internal(Relation relation, Snapshot snapshot, /* * we can use page-at-a-time mode if it's an MVCC-safe snapshot */ - scan->rs_pageatatime = IsMVCCSnapshot(snapshot); + scan->rs_pageatatime = allow_pagemode && IsMVCCSnapshot(snapshot); /* * For a seqscan in a serializable transaction, acquire a predicate lock diff --git a/src/backend/access/tablesample/Makefile b/src/backend/access/tablesample/Makefile new file mode 100644 index 00000000000..46eeb59f9c4 --- /dev/null +++ b/src/backend/access/tablesample/Makefile @@ -0,0 +1,17 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for access/tablesample +# +# IDENTIFICATION +# src/backend/access/tablesample/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/access/tablesample +top_builddir = ../../../.. 
+include $(top_builddir)/src/Makefile.global + +OBJS = tablesample.o system.o bernoulli.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/tablesample/bernoulli.c b/src/backend/access/tablesample/bernoulli.c new file mode 100644 index 00000000000..c91f3f593e5 --- /dev/null +++ b/src/backend/access/tablesample/bernoulli.c @@ -0,0 +1,235 @@ +/*------------------------------------------------------------------------- + * + * bernoulli.c + * interface routines for BERNOULLI tablesample method + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/access/tablesample/bernoulli.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "fmgr.h" + +#include "access/tablesample.h" +#include "access/relscan.h" +#include "nodes/execnodes.h" +#include "nodes/relation.h" +#include "optimizer/clauses.h" +#include "storage/bufmgr.h" +#include "utils/sampling.h" + + +/* Private sampler state, stored in tsdesc->tsmdata */ +typedef struct +{ + uint32 seed; /* random seed */ + BlockNumber startblock; /* starting block, we use this for syncscan support */ + BlockNumber nblocks; /* number of blocks */ + BlockNumber blockno; /* current block */ + float4 probability; /* probability that tuple will be returned (0.0-1.0) */ + OffsetNumber lt; /* last tuple returned from current block */ + SamplerRandomState randstate; /* random generator state */ +} BernoulliSamplerData; + +/* + * Initialize the state. + */ +Datum +tsm_bernoulli_init(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + uint32 seed = PG_GETARG_UINT32(1); + float4 percent = PG_ARGISNULL(2) ? 
-1 : PG_GETARG_FLOAT4(2); + HeapScanDesc scan = tsdesc->heapScan; + BernoulliSamplerData *sampler; + + if (percent < 0 || percent > 100) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("invalid sample size"), + errhint("Sample size must be numeric value between 0 and 100 (inclusive)."))); + + sampler = palloc0(sizeof(BernoulliSamplerData)); + + /* Remember initial values for reinit */ + sampler->seed = seed; + sampler->startblock = scan->rs_startblock; + sampler->nblocks = scan->rs_nblocks; + sampler->blockno = InvalidBlockNumber; + sampler->probability = percent / 100; + sampler->lt = InvalidOffsetNumber; + sampler_random_init_state(sampler->seed, sampler->randstate); + + tsdesc->tsmdata = (void *) sampler; + + PG_RETURN_VOID(); +} + +/* + * Get next block number to read or InvalidBlockNumber if we are at the + * end of the relation. + */ +Datum +tsm_bernoulli_nextblock(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + BernoulliSamplerData *sampler = + (BernoulliSamplerData *) tsdesc->tsmdata; + + /* + * Bernoulli sampling scans all blocks on the table and supports + * syncscan so loop from startblock to startblock instead of + * from 0 to nblocks. + */ + if (sampler->blockno == InvalidBlockNumber) + sampler->blockno = sampler->startblock; + else + { + sampler->blockno++; + + if (sampler->blockno >= sampler->nblocks) + sampler->blockno = 0; + + if (sampler->blockno == sampler->startblock) + PG_RETURN_UINT32(InvalidBlockNumber); + } + + PG_RETURN_UINT32(sampler->blockno); +} + +/* + * Get next tuple from current block. + * + * This method implements the main logic in bernoulli sampling. + * The algorithm simply generates new random number (in 0.0-1.0 range) and if + * it falls within user specified probability (in the same range) return the + * tuple offset. + * + * It is ok here to return tuple offset without knowing if tuple is visible + * and not check it via examinetuple. 
The reason for that is that we do the + * coinflip (random number generation) for every tuple in the table. Since all + * tuples have the same probability of being returned, the visible and invisible + * tuples will be returned in the same ratio as they have in the actual table. + * This means that there is no skew towards either visible or invisible tuples + * and the number of visible tuples returned from the executor node is the + * fraction of visible tuples which was specified in input. + * + * This is faster than doing the coinflip in the examinetuple because we don't + * have to do visibility checks on uninteresting tuples. + * + * If we reach end of the block return InvalidOffsetNumber which tells + * SampleScan to go to next block. + */ +Datum +tsm_bernoulli_nexttuple(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + OffsetNumber maxoffset = PG_GETARG_UINT16(2); + BernoulliSamplerData *sampler = + (BernoulliSamplerData *) tsdesc->tsmdata; + OffsetNumber tupoffset = sampler->lt; + float4 probability = sampler->probability; + + if (tupoffset == InvalidOffsetNumber) + tupoffset = FirstOffsetNumber; + else + tupoffset++; + + /* + * Loop over tuple offsets until the random generator returns value that + * is within the probability of returning the tuple or until we reach + * end of the block. + * + * (This is our implementation of bernoulli trial) + */ + while (sampler_random_fract(sampler->randstate) > probability) + { + tupoffset++; + + if (tupoffset > maxoffset) + break; + } + + if (tupoffset > maxoffset) + /* Tell SampleScan that we want next block. */ + tupoffset = InvalidOffsetNumber; + + sampler->lt = tupoffset; + + PG_RETURN_UINT16(tupoffset); +} + +/* + * Cleanup method. + */ +Datum +tsm_bernoulli_end(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + + pfree(tsdesc->tsmdata); + + PG_RETURN_VOID(); +} + +/* + * Reset state (called by ReScan). 
+ */ +Datum +tsm_bernoulli_reset(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + BernoulliSamplerData *sampler = + (BernoulliSamplerData *) tsdesc->tsmdata; + + sampler->blockno = InvalidBlockNumber; + sampler->lt = InvalidOffsetNumber; + sampler_random_init_state(sampler->seed, sampler->randstate); + + PG_RETURN_VOID(); +} + +/* + * Costing function. + */ +Datum +tsm_bernoulli_cost(PG_FUNCTION_ARGS) +{ + PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); + Path *path = (Path *) PG_GETARG_POINTER(1); + RelOptInfo *baserel = (RelOptInfo *) PG_GETARG_POINTER(2); + List *args = (List *) PG_GETARG_POINTER(3); + BlockNumber *pages = (BlockNumber *) PG_GETARG_POINTER(4); + double *tuples = (double *) PG_GETARG_POINTER(5); + Node *pctnode; + float4 samplesize; + + *pages = baserel->pages; + + pctnode = linitial(args); + pctnode = estimate_expression_value(root, pctnode); + + if (IsA(pctnode, RelabelType)) + pctnode = (Node *) ((RelabelType *) pctnode)->arg; + + if (IsA(pctnode, Const)) + { + samplesize = DatumGetFloat4(((Const *) pctnode)->constvalue); + samplesize /= 100.0; + } + else + { + /* Default samplesize if the estimation didn't return Const. 
*/ + samplesize = 0.1f; + } + + *tuples = path->rows * samplesize; + path->rows = *tuples; + + PG_RETURN_VOID(); +} diff --git a/src/backend/access/tablesample/system.c b/src/backend/access/tablesample/system.c new file mode 100644 index 00000000000..1412e511faf --- /dev/null +++ b/src/backend/access/tablesample/system.c @@ -0,0 +1,186 @@ +/*------------------------------------------------------------------------- + * + * system.c + * interface routines for SYSTEM tablesample method + * + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/access/tablesample/system.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "fmgr.h" + +#include "access/tablesample.h" +#include "access/relscan.h" +#include "nodes/execnodes.h" +#include "nodes/relation.h" +#include "optimizer/clauses.h" +#include "storage/bufmgr.h" +#include "utils/sampling.h" + + +/* + * Private sampler state, stored in tsdesc->tsmdata + */ +typedef struct +{ + BlockSamplerData bs; + uint32 seed; /* random seed */ + BlockNumber nblocks; /* number of blocks in relation */ + int samplesize; /* number of blocks to return */ + OffsetNumber lt; /* last tuple returned from current block */ +} SystemSamplerData; + + +/* + * Initializes the state. + */ +Datum +tsm_system_init(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + uint32 seed = PG_GETARG_UINT32(1); + float4 percent = PG_ARGISNULL(2) ? 
-1 : PG_GETARG_FLOAT4(2); + HeapScanDesc scan = tsdesc->heapScan; + SystemSamplerData *sampler; + + if (percent < 0 || percent > 100) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("invalid sample size"), + errhint("Sample size must be numeric value between 0 and 100 (inclusive)."))); + + sampler = palloc0(sizeof(SystemSamplerData)); + + /* Remember initial values for reinit */ + sampler->seed = seed; + sampler->nblocks = scan->rs_nblocks; + sampler->samplesize = 1 + (int) (sampler->nblocks * (percent / 100.0)); + sampler->lt = InvalidOffsetNumber; + + BlockSampler_Init(&sampler->bs, sampler->nblocks, sampler->samplesize, + sampler->seed); + + tsdesc->tsmdata = (void *) sampler; + + PG_RETURN_VOID(); +} + +/* + * Get next block number or InvalidBlockNumber when we're done. + * + * Uses the same logic as ANALYZE for picking the random blocks. + */ +Datum +tsm_system_nextblock(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata; + BlockNumber blockno; + + if (!BlockSampler_HasMore(&sampler->bs)) + PG_RETURN_UINT32(InvalidBlockNumber); + + blockno = BlockSampler_Next(&sampler->bs); + + PG_RETURN_UINT32(blockno); +} + +/* + * Get next tuple offset in current block or InvalidOffsetNumber if we are done + * with this block. + */ +Datum +tsm_system_nexttuple(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + OffsetNumber maxoffset = PG_GETARG_UINT16(2); + SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata; + OffsetNumber tupoffset = sampler->lt; + + if (tupoffset == InvalidOffsetNumber) + tupoffset = FirstOffsetNumber; + else + tupoffset++; + + if (tupoffset > maxoffset) + tupoffset = InvalidOffsetNumber; + + sampler->lt = tupoffset; + + PG_RETURN_UINT16(tupoffset); +} + +/* + * Cleanup method. 
+ */ +Datum +tsm_system_end(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + + pfree(tsdesc->tsmdata); + + PG_RETURN_VOID(); +} + +/* + * Reset state (called by ReScan). + */ +Datum +tsm_system_reset(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata; + + sampler->lt = InvalidOffsetNumber; + BlockSampler_Init(&sampler->bs, sampler->nblocks, sampler->samplesize, + sampler->seed); + + PG_RETURN_VOID(); +} + +/* + * Costing function. + */ +Datum +tsm_system_cost(PG_FUNCTION_ARGS) +{ + PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); + Path *path = (Path *) PG_GETARG_POINTER(1); + RelOptInfo *baserel = (RelOptInfo *) PG_GETARG_POINTER(2); + List *args = (List *) PG_GETARG_POINTER(3); + BlockNumber *pages = (BlockNumber *) PG_GETARG_POINTER(4); + double *tuples = (double *) PG_GETARG_POINTER(5); + Node *pctnode; + float4 samplesize; + + pctnode = linitial(args); + pctnode = estimate_expression_value(root, pctnode); + + if (IsA(pctnode, RelabelType)) + pctnode = (Node *) ((RelabelType *) pctnode)->arg; + + if (IsA(pctnode, Const)) + { + samplesize = DatumGetFloat4(((Const *) pctnode)->constvalue); + samplesize /= 100.0; + } + else + { + /* Default samplesize if the estimation didn't return Const. 
*/ + samplesize = 0.1f; + } + + *pages = baserel->pages * samplesize; + *tuples = path->rows * samplesize; + path->rows = *tuples; + + PG_RETURN_VOID(); +} diff --git a/src/backend/access/tablesample/tablesample.c b/src/backend/access/tablesample/tablesample.c new file mode 100644 index 00000000000..ef55d062e75 --- /dev/null +++ b/src/backend/access/tablesample/tablesample.c @@ -0,0 +1,368 @@ +/*------------------------------------------------------------------------- + * + * tablesample.c + * TABLESAMPLE internal API + * + * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/tablesample/tablesample.c + * + * TABLESAMPLE is the SQL standard clause for sampling the relations. + * + * The API is interface between the Executor and the TABLESAMPLE Methods. + * + * TABLESAMPLE Methods are implementations of actual sampling algorithms which + * can be used for returning a sample of the source relation. + * Methods don't read the table directly but are asked for block number and + * tuple offset which they want to examine (or return) and the tablesample + * interface implemented here does the reading for them. + * + * We currently only support sampling of the physical relations, but in the + * future we might extend the API to support subqueries as well. + * + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/tablesample.h" + +#include "catalog/pg_tablesample_method.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/bufmgr.h" +#include "storage/predicate.h" +#include "utils/rel.h" +#include "utils/tqual.h" + + +static bool SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, HeapScanDesc scan); + + +/* + * Initialize the TABLESAMPLE Descriptor and the TABLESAMPLE Method. 
+ */ +TableSampleDesc * +tablesample_init(SampleScanState *scanstate, TableSampleClause *tablesample) +{ + FunctionCallInfoData fcinfo; + int i; + List *args = tablesample->args; + ListCell *arg; + ExprContext *econtext = scanstate->ss.ps.ps_ExprContext; + TableSampleDesc *tsdesc = (TableSampleDesc *) palloc0(sizeof(TableSampleDesc)); + + /* Load functions */ + fmgr_info(tablesample->tsminit, &(tsdesc->tsminit)); + fmgr_info(tablesample->tsmnextblock, &(tsdesc->tsmnextblock)); + fmgr_info(tablesample->tsmnexttuple, &(tsdesc->tsmnexttuple)); + if (OidIsValid(tablesample->tsmexaminetuple)) + fmgr_info(tablesample->tsmexaminetuple, &(tsdesc->tsmexaminetuple)); + else + tsdesc->tsmexaminetuple.fn_oid = InvalidOid; + fmgr_info(tablesample->tsmreset, &(tsdesc->tsmreset)); + fmgr_info(tablesample->tsmend, &(tsdesc->tsmend)); + + InitFunctionCallInfoData(fcinfo, &tsdesc->tsminit, + list_length(args) + 2, + InvalidOid, NULL, NULL); + + tsdesc->tupDesc = scanstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor; + tsdesc->heapScan = scanstate->ss.ss_currentScanDesc; + + /* First argument for init function is always TableSampleDesc */ + fcinfo.arg[0] = PointerGetDatum(tsdesc); + fcinfo.argnull[0] = false; + + /* + * Second arg for init function is always REPEATABLE + * When tablesample->repeatable is NULL then REPEATABLE clause was not + * specified. + * When specified, the expression cannot evaluate to NULL. + */ + if (tablesample->repeatable) + { + ExprState *argstate = ExecInitExpr((Expr *) tablesample->repeatable, + (PlanState *) scanstate); + fcinfo.arg[1] = ExecEvalExpr(argstate, econtext, + &fcinfo.argnull[1], NULL); + if (fcinfo.argnull[1]) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("REPEATABLE clause must be NOT NULL numeric value"))); + } + else + { + fcinfo.arg[1] = UInt32GetDatum(random()); + fcinfo.argnull[1] = false; + } + + /* Rest of the arguments come from user. 
*/ + i = 2; + foreach(arg, args) + { + Expr *argexpr = (Expr *) lfirst(arg); + ExprState *argstate = ExecInitExpr(argexpr, (PlanState *) scanstate); + + if (argstate == NULL) + { + fcinfo.argnull[i] = true; /* no expression state: pass NULL */ + fcinfo.arg[i] = (Datum) 0; + } + else /* never hand a NULL state to ExecEvalExpr */ + fcinfo.arg[i] = ExecEvalExpr(argstate, econtext, + &fcinfo.argnull[i], NULL); + i++; + } + Assert(i == fcinfo.nargs); + + (void) FunctionCallInvoke(&fcinfo); + + return tsdesc; +} + +/* + * Get next tuple from TABLESAMPLE Method. + */ +HeapTuple +tablesample_getnext(TableSampleDesc *desc) +{ + HeapScanDesc scan = desc->heapScan; + HeapTuple tuple = &(scan->rs_ctup); + bool pagemode = scan->rs_pageatatime; + BlockNumber blockno; + Page page; + bool page_all_visible; + ItemId itemid; + OffsetNumber tupoffset, + maxoffset; + + if (!scan->rs_inited) + { + /* + * return null immediately if relation is empty + */ + if (scan->rs_nblocks == 0) + { + Assert(!BufferIsValid(scan->rs_cbuf)); + tuple->t_data = NULL; + return NULL; + } + blockno = DatumGetInt32(FunctionCall1(&desc->tsmnextblock, + PointerGetDatum(desc))); + if (!BlockNumberIsValid(blockno)) + { + tuple->t_data = NULL; + return NULL; + } + + heapgetpage(scan, blockno); + scan->rs_inited = true; + } + else + { + /* continue from previously returned page/tuple */ + blockno = scan->rs_cblock; /* current page */ + } + + /* + * When pagemode is disabled, the scan will do visibility checks for each + * tuple it finds so the buffer needs to be locked. + */ + if (!pagemode) + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); + + page = (Page) BufferGetPage(scan->rs_cbuf); + page_all_visible = PageIsAllVisible(page); + maxoffset = PageGetMaxOffsetNumber(page); + + for (;;) + { + CHECK_FOR_INTERRUPTS(); + + tupoffset = DatumGetUInt16(FunctionCall3(&desc->tsmnexttuple, + PointerGetDatum(desc), + UInt32GetDatum(blockno), + UInt16GetDatum(maxoffset))); + + if (OffsetNumberIsValid(tupoffset)) + { + bool visible; + bool found; + + /* Skip invalid tuple pointers. 
*/ + itemid = PageGetItemId(page, tupoffset); + if (!ItemIdIsNormal(itemid)) + continue; + + tuple->t_data = (HeapTupleHeader) PageGetItem((Page) page, itemid); + tuple->t_len = ItemIdGetLength(itemid); + ItemPointerSet(&(tuple->t_self), blockno, tupoffset); + + if (page_all_visible) + visible = true; + else + visible = SampleTupleVisible(tuple, tupoffset, scan); + + /* + * Let the sampling method examine the actual tuple and decide if we + * should return it. + * + * Note that we let it examine even invisible tuples for + * statistical purposes, but not return them since user should + * never see invisible tuples. + */ + if (OidIsValid(desc->tsmexaminetuple.fn_oid)) + { + found = DatumGetBool(FunctionCall4(&desc->tsmexaminetuple, + PointerGetDatum(desc), + UInt32GetDatum(blockno), + PointerGetDatum(tuple), + BoolGetDatum(visible))); + /* Should not happen if sampling method is well written. */ + if (found && !visible) + elog(ERROR, "Sampling method wanted to return invisible tuple"); + } + else + found = visible; + + /* Found visible tuple, return it. */ + if (found) + { + if (!pagemode) + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); + break; + } + else + { + /* Try next tuple from same page. */ + continue; + } + } + + + if (!pagemode) + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); + + blockno = DatumGetInt32(FunctionCall1(&desc->tsmnextblock, + PointerGetDatum(desc))); + + /* + * Report our new scan position for synchronization purposes. We + * don't do that when moving backwards, however. That would just + * mess up any other forward-moving scanners. + * + * Note: we do this before checking for end of scan so that the + * final state of the position hint is back at the start of the + * rel. That's not strictly necessary, but otherwise when you run + * the same query multiple times the starting position would shift + * a little bit backwards on every invocation, which is confusing. + * We don't guarantee any specific ordering in general, though. 
+ */ + if (scan->rs_syncscan) + ss_report_location(scan->rs_rd, BlockNumberIsValid(blockno) ? + blockno : scan->rs_startblock); + + /* + * Reached end of scan. + */ + if (!BlockNumberIsValid(blockno)) + { + if (BufferIsValid(scan->rs_cbuf)) + ReleaseBuffer(scan->rs_cbuf); + scan->rs_cbuf = InvalidBuffer; + scan->rs_cblock = InvalidBlockNumber; + tuple->t_data = NULL; + scan->rs_inited = false; + return NULL; + } + + heapgetpage(scan, blockno); + + if (!pagemode) + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); + + page = (Page) BufferGetPage(scan->rs_cbuf); + page_all_visible = PageIsAllVisible(page); + maxoffset = PageGetMaxOffsetNumber(page); + } + + pgstat_count_heap_getnext(scan->rs_rd); + + return &(scan->rs_ctup); +} + +/* + * Reset the sampling to starting state + */ +void +tablesample_reset(TableSampleDesc *desc) +{ + (void) FunctionCall1(&desc->tsmreset, PointerGetDatum(desc)); +} + +/* + * Signal the sampling method that the scan has finished. + */ +void +tablesample_end(TableSampleDesc *desc) +{ + (void) FunctionCall1(&desc->tsmend, PointerGetDatum(desc)); +} + +/* + * Check visibility of the tuple. + */ +static bool +SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, HeapScanDesc scan) +{ + /* + * If this scan is reading whole pages at a time, there is already + * visibility info present in rs_vistuples so we can just search it + * for the tupoffset. + */ + if (scan->rs_pageatatime) + { + int start = 0, + end = scan->rs_ntuples - 1; + + /* + * Do the binary search over rs_vistuples, it's already sorted by + * OffsetNumber so we don't need to do any sorting ourselves here. + * + * We could use bsearch() here but it's slower for integers because + * of the function call overhead and because it needs boiler plate code + * it would not save us anything code-wise anyway. 
+ */ + while (start <= end) + { + int mid = start + (end - start) / 2; + OffsetNumber curoffset = scan->rs_vistuples[mid]; + + if (curoffset == tupoffset) + return true; + else if (curoffset > tupoffset) + end = mid - 1; + else + start = mid + 1; + } + + return false; + } + else + { + /* No pagemode, we have to check the tuple itself. */ + Snapshot snapshot = scan->rs_snapshot; + Buffer buffer = scan->rs_cbuf; + + bool visible = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer); + + CheckForSerializableConflictOut(visible, scan->rs_rd, tuple, buffer, + snapshot); + + return visible; + } +} |