diff options
author | Simon Riggs <simon@2ndQuadrant.com> | 2015-05-15 14:37:10 -0400 |
---|---|---|
committer | Simon Riggs <simon@2ndQuadrant.com> | 2015-05-15 14:37:10 -0400 |
commit | f6d208d6e51810c73f0e02c477984a6b44627f11 (patch) | |
tree | 99d540d0b7bda73ff60479f15444f554403d4679 /src/backend/access/tablesample | |
parent | 11a83bbedd73800db70f6f2af5a8eb10d15d39d7 (diff) | |
download | postgresql-f6d208d6e51810c73f0e02c477984a6b44627f11.tar.gz postgresql-f6d208d6e51810c73f0e02c477984a6b44627f11.zip |
TABLESAMPLE, SQL Standard and extensible
Add a TABLESAMPLE clause to SELECT statements that allows
user to specify random BERNOULLI sampling or block level
SYSTEM sampling. Implementation allows for extensible
sampling functions to be written, using a standard API.
Basic version follows SQLStandard exactly. Usable
concrete use cases for the sampling API follow in later
commits.
Petr Jelinek
Reviewed by Michael Paquier and Simon Riggs
Diffstat (limited to 'src/backend/access/tablesample')
-rw-r--r-- | src/backend/access/tablesample/Makefile | 17 | ||||
-rw-r--r-- | src/backend/access/tablesample/bernoulli.c | 235 | ||||
-rw-r--r-- | src/backend/access/tablesample/system.c | 186 | ||||
-rw-r--r-- | src/backend/access/tablesample/tablesample.c | 368 |
4 files changed, 806 insertions, 0 deletions
diff --git a/src/backend/access/tablesample/Makefile b/src/backend/access/tablesample/Makefile new file mode 100644 index 00000000000..46eeb59f9c4 --- /dev/null +++ b/src/backend/access/tablesample/Makefile @@ -0,0 +1,17 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for utils/tablesample +# +# IDENTIFICATION +# src/backend/utils/tablesample/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/access/tablesample +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = tablesample.o system.o bernoulli.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/tablesample/bernoulli.c b/src/backend/access/tablesample/bernoulli.c new file mode 100644 index 00000000000..c91f3f593e5 --- /dev/null +++ b/src/backend/access/tablesample/bernoulli.c @@ -0,0 +1,235 @@ +/*------------------------------------------------------------------------- + * + * bernoulli.c + * interface routines for BERNOULLI tablesample method + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/utils/tablesample/bernoulli.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "fmgr.h" + +#include "access/tablesample.h" +#include "access/relscan.h" +#include "nodes/execnodes.h" +#include "nodes/relation.h" +#include "optimizer/clauses.h" +#include "storage/bufmgr.h" +#include "utils/sampling.h" + + +/* tsdesc */ +typedef struct +{ + uint32 seed; /* random seed */ + BlockNumber startblock; /* starting block, we use ths for syncscan support */ + BlockNumber nblocks; /* number of blocks */ + BlockNumber blockno; /* current block */ + float4 probability; /* probabilty that tuple will be returned (0.0-1.0) */ + OffsetNumber lt; /* last tuple returned from current block */ + SamplerRandomState randstate; /* random generator tsdesc */ +} BernoulliSamplerData; + +/* + * Initialize the state. + */ +Datum +tsm_bernoulli_init(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + uint32 seed = PG_GETARG_UINT32(1); + float4 percent = PG_ARGISNULL(2) ? -1 : PG_GETARG_FLOAT4(2); + HeapScanDesc scan = tsdesc->heapScan; + BernoulliSamplerData *sampler; + + if (percent < 0 || percent > 100) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("invalid sample size"), + errhint("Sample size must be numeric value between 0 and 100 (inclusive)."))); + + sampler = palloc0(sizeof(BernoulliSamplerData)); + + /* Remember initial values for reinit */ + sampler->seed = seed; + sampler->startblock = scan->rs_startblock; + sampler->nblocks = scan->rs_nblocks; + sampler->blockno = InvalidBlockNumber; + sampler->probability = percent / 100; + sampler->lt = InvalidOffsetNumber; + sampler_random_init_state(sampler->seed, sampler->randstate); + + tsdesc->tsmdata = (void *) sampler; + + PG_RETURN_VOID(); +} + +/* + * Get next block number to read or InvalidBlockNumber if we are at the + * end of the relation. + */ +Datum +tsm_bernoulli_nextblock(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + BernoulliSamplerData *sampler = + (BernoulliSamplerData *) tsdesc->tsmdata; + + /* + * Bernoulli sampling scans all blocks on the table and supports + * syncscan so loop from startblock to startblock instead of + * from 0 to nblocks. + */ + if (sampler->blockno == InvalidBlockNumber) + sampler->blockno = sampler->startblock; + else + { + sampler->blockno++; + + if (sampler->blockno >= sampler->nblocks) + sampler->blockno = 0; + + if (sampler->blockno == sampler->startblock) + PG_RETURN_UINT32(InvalidBlockNumber); + } + + PG_RETURN_UINT32(sampler->blockno); +} + +/* + * Get next tuple from current block. + * + * This method implements the main logic in bernoulli sampling. + * The algorithm simply generates new random number (in 0.0-1.0 range) and if + * it falls within user specified probability (in the same range) return the + * tuple offset. + * + * It is ok here to return tuple offset without knowing if tuple is visible + * and not check it via examinetuple. The reason for that is that we do the + * coinflip (random number generation) for every tuple in the table. Since all + * tuples have same probability of being returned the visible and invisible + * tuples will be returned in same ratio as they have in the actual table. + * This means that there is no skew towards either visible or invisible tuples + * and the number returned visible tuples to from the executor node is the + * fraction of visible tuples which was specified in input. + * + * This is faster than doing the coinflip in the examinetuple because we don't + * have to do visibility checks on uninteresting tuples. + * + * If we reach end of the block return InvalidOffsetNumber which tells + * SampleScan to go to next block. + */ +Datum +tsm_bernoulli_nexttuple(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + OffsetNumber maxoffset = PG_GETARG_UINT16(2); + BernoulliSamplerData *sampler = + (BernoulliSamplerData *) tsdesc->tsmdata; + OffsetNumber tupoffset = sampler->lt; + float4 probability = sampler->probability; + + if (tupoffset == InvalidOffsetNumber) + tupoffset = FirstOffsetNumber; + else + tupoffset++; + + /* + * Loop over tuple offsets until the random generator returns value that + * is within the probability of returning the tuple or until we reach + * end of the block. + * + * (This is our implementation of bernoulli trial) + */ + while (sampler_random_fract(sampler->randstate) > probability) + { + tupoffset++; + + if (tupoffset > maxoffset) + break; + } + + if (tupoffset > maxoffset) + /* Tell SampleScan that we want next block. */ + tupoffset = InvalidOffsetNumber; + + sampler->lt = tupoffset; + + PG_RETURN_UINT16(tupoffset); +} + +/* + * Cleanup method. + */ +Datum +tsm_bernoulli_end(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + + pfree(tsdesc->tsmdata); + + PG_RETURN_VOID(); +} + +/* + * Reset tsdesc (called by ReScan). + */ +Datum +tsm_bernoulli_reset(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + BernoulliSamplerData *sampler = + (BernoulliSamplerData *) tsdesc->tsmdata; + + sampler->blockno = InvalidBlockNumber; + sampler->lt = InvalidOffsetNumber; + sampler_random_init_state(sampler->seed, sampler->randstate); + + PG_RETURN_VOID(); +} + +/* + * Costing function. + */ +Datum +tsm_bernoulli_cost(PG_FUNCTION_ARGS) +{ + PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); + Path *path = (Path *) PG_GETARG_POINTER(1); + RelOptInfo *baserel = (RelOptInfo *) PG_GETARG_POINTER(2); + List *args = (List *) PG_GETARG_POINTER(3); + BlockNumber *pages = (BlockNumber *) PG_GETARG_POINTER(4); + double *tuples = (double *) PG_GETARG_POINTER(5); + Node *pctnode; + float4 samplesize; + + *pages = baserel->pages; + + pctnode = linitial(args); + pctnode = estimate_expression_value(root, pctnode); + + if (IsA(pctnode, RelabelType)) + pctnode = (Node *) ((RelabelType *) pctnode)->arg; + + if (IsA(pctnode, Const)) + { + samplesize = DatumGetFloat4(((Const *) pctnode)->constvalue); + samplesize /= 100.0; + } + else + { + /* Default samplesize if the estimation didn't return Const. */ + samplesize = 0.1f; + } + + *tuples = path->rows * samplesize; + path->rows = *tuples; + + PG_RETURN_VOID(); +} diff --git a/src/backend/access/tablesample/system.c b/src/backend/access/tablesample/system.c new file mode 100644 index 00000000000..1412e511faf --- /dev/null +++ b/src/backend/access/tablesample/system.c @@ -0,0 +1,186 @@ +/*------------------------------------------------------------------------- + * + * system.c + * interface routines for system tablesample method + * + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/utils/tablesample/system.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "fmgr.h" + +#include "access/tablesample.h" +#include "access/relscan.h" +#include "nodes/execnodes.h" +#include "nodes/relation.h" +#include "optimizer/clauses.h" +#include "storage/bufmgr.h" +#include "utils/sampling.h" + + +/* + * State + */ +typedef struct +{ + BlockSamplerData bs; + uint32 seed; /* random seed */ + BlockNumber nblocks; /* number of block in relation */ + int samplesize; /* number of blocks to return */ + OffsetNumber lt; /* last tuple returned from current block */ +} SystemSamplerData; + + +/* + * Initializes the state. + */ +Datum +tsm_system_init(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + uint32 seed = PG_GETARG_UINT32(1); + float4 percent = PG_ARGISNULL(2) ? -1 : PG_GETARG_FLOAT4(2); + HeapScanDesc scan = tsdesc->heapScan; + SystemSamplerData *sampler; + + if (percent < 0 || percent > 100) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("invalid sample size"), + errhint("Sample size must be numeric value between 0 and 100 (inclusive)."))); + + sampler = palloc0(sizeof(SystemSamplerData)); + + /* Remember initial values for reinit */ + sampler->seed = seed; + sampler->nblocks = scan->rs_nblocks; + sampler->samplesize = 1 + (int) (sampler->nblocks * (percent / 100.0)); + sampler->lt = InvalidOffsetNumber; + + BlockSampler_Init(&sampler->bs, sampler->nblocks, sampler->samplesize, + sampler->seed); + + tsdesc->tsmdata = (void *) sampler; + + PG_RETURN_VOID(); +} + +/* + * Get next block number or InvalidBlockNumber when we're done. + * + * Uses the same logic as ANALYZE for picking the random blocks. + */ +Datum +tsm_system_nextblock(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata; + BlockNumber blockno; + + if (!BlockSampler_HasMore(&sampler->bs)) + PG_RETURN_UINT32(InvalidBlockNumber); + + blockno = BlockSampler_Next(&sampler->bs); + + PG_RETURN_UINT32(blockno); +} + +/* + * Get next tuple offset in current block or InvalidOffsetNumber if we are done + * with this block. + */ +Datum +tsm_system_nexttuple(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + OffsetNumber maxoffset = PG_GETARG_UINT16(2); + SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata; + OffsetNumber tupoffset = sampler->lt; + + if (tupoffset == InvalidOffsetNumber) + tupoffset = FirstOffsetNumber; + else + tupoffset++; + + if (tupoffset > maxoffset) + tupoffset = InvalidOffsetNumber; + + sampler->lt = tupoffset; + + PG_RETURN_UINT16(tupoffset); +} + +/* + * Cleanup method. + */ +Datum +tsm_system_end(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + + pfree(tsdesc->tsmdata); + + PG_RETURN_VOID(); +} + +/* + * Reset state (called by ReScan). + */ +Datum +tsm_system_reset(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata; + + sampler->lt = InvalidOffsetNumber; + BlockSampler_Init(&sampler->bs, sampler->nblocks, sampler->samplesize, + sampler->seed); + + PG_RETURN_VOID(); +} + +/* + * Costing function. + */ +Datum +tsm_system_cost(PG_FUNCTION_ARGS) +{ + PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); + Path *path = (Path *) PG_GETARG_POINTER(1); + RelOptInfo *baserel = (RelOptInfo *) PG_GETARG_POINTER(2); + List *args = (List *) PG_GETARG_POINTER(3); + BlockNumber *pages = (BlockNumber *) PG_GETARG_POINTER(4); + double *tuples = (double *) PG_GETARG_POINTER(5); + Node *pctnode; + float4 samplesize; + + pctnode = linitial(args); + pctnode = estimate_expression_value(root, pctnode); + + if (IsA(pctnode, RelabelType)) + pctnode = (Node *) ((RelabelType *) pctnode)->arg; + + if (IsA(pctnode, Const)) + { + samplesize = DatumGetFloat4(((Const *) pctnode)->constvalue); + samplesize /= 100.0; + } + else + { + /* Default samplesize if the estimation didn't return Const. */ + samplesize = 0.1f; + } + + *pages = baserel->pages * samplesize; + *tuples = path->rows * samplesize; + path->rows = *tuples; + + PG_RETURN_VOID(); +} diff --git a/src/backend/access/tablesample/tablesample.c b/src/backend/access/tablesample/tablesample.c new file mode 100644 index 00000000000..ef55d062e75 --- /dev/null +++ b/src/backend/access/tablesample/tablesample.c @@ -0,0 +1,368 @@ +/*------------------------------------------------------------------------- + * + * tablesample.c + * TABLESAMPLE internal API + * + * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/tablesample/tablesample.c + * + * TABLESAMPLE is the SQL standard clause for sampling the relations. + * + * The API is interface between the Executor and the TABLESAMPLE Methods. + * + * TABLESAMPLE Methods are implementations of actual sampling algorithms which + * can be used for returning a sample of the source relation. + * Methods don't read the table directly but are asked for block number and + * tuple offset which they want to examine (or return) and the tablesample + * interface implemented here does the reading for them. + * + * We currently only support sampling of the physical relations, but in the + * future we might extend the API to support subqueries as well. + * + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/tablesample.h" + +#include "catalog/pg_tablesample_method.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/bufmgr.h" +#include "storage/predicate.h" +#include "utils/rel.h" +#include "utils/tqual.h" + + +static bool SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, HeapScanDesc scan); + + +/* + * Initialize the TABLESAMPLE Descriptor and the TABLESAMPLE Method. + */ +TableSampleDesc * +tablesample_init(SampleScanState *scanstate, TableSampleClause *tablesample) +{ + FunctionCallInfoData fcinfo; + int i; + List *args = tablesample->args; + ListCell *arg; + ExprContext *econtext = scanstate->ss.ps.ps_ExprContext; + TableSampleDesc *tsdesc = (TableSampleDesc *) palloc0(sizeof(TableSampleDesc)); + + /* Load functions */ + fmgr_info(tablesample->tsminit, &(tsdesc->tsminit)); + fmgr_info(tablesample->tsmnextblock, &(tsdesc->tsmnextblock)); + fmgr_info(tablesample->tsmnexttuple, &(tsdesc->tsmnexttuple)); + if (OidIsValid(tablesample->tsmexaminetuple)) + fmgr_info(tablesample->tsmexaminetuple, &(tsdesc->tsmexaminetuple)); + else + tsdesc->tsmexaminetuple.fn_oid = InvalidOid; + fmgr_info(tablesample->tsmreset, &(tsdesc->tsmreset)); + fmgr_info(tablesample->tsmend, &(tsdesc->tsmend)); + + InitFunctionCallInfoData(fcinfo, &tsdesc->tsminit, + list_length(args) + 2, + InvalidOid, NULL, NULL); + + tsdesc->tupDesc = scanstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor; + tsdesc->heapScan = scanstate->ss.ss_currentScanDesc; + + /* First argument for init function is always TableSampleDesc */ + fcinfo.arg[0] = PointerGetDatum(tsdesc); + fcinfo.argnull[0] = false; + + /* + * Second arg for init function is always REPEATABLE + * When tablesample->repeatable is NULL then REPEATABLE clause was not + * specified. + * When specified, the expression cannot evaluate to NULL. + */ + if (tablesample->repeatable) + { + ExprState *argstate = ExecInitExpr((Expr *) tablesample->repeatable, + (PlanState *) scanstate); + fcinfo.arg[1] = ExecEvalExpr(argstate, econtext, + &fcinfo.argnull[1], NULL); + if (fcinfo.argnull[1]) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("REPEATABLE clause must be NOT NULL numeric value"))); + } + else + { + fcinfo.arg[1] = UInt32GetDatum(random()); + fcinfo.argnull[1] = false; + } + + /* Rest of the arguments come from user. */ + i = 2; + foreach(arg, args) + { + Expr *argexpr = (Expr *) lfirst(arg); + ExprState *argstate = ExecInitExpr(argexpr, (PlanState *) scanstate); + + if (argstate == NULL) + { + fcinfo.argnull[i] = true; + fcinfo.arg[i] = (Datum) 0;; + } + + fcinfo.arg[i] = ExecEvalExpr(argstate, econtext, + &fcinfo.argnull[i], NULL); + i++; + } + Assert(i == fcinfo.nargs); + + (void) FunctionCallInvoke(&fcinfo); + + return tsdesc; +} + +/* + * Get next tuple from TABLESAMPLE Method. + */ +HeapTuple +tablesample_getnext(TableSampleDesc *desc) +{ + HeapScanDesc scan = desc->heapScan; + HeapTuple tuple = &(scan->rs_ctup); + bool pagemode = scan->rs_pageatatime; + BlockNumber blockno; + Page page; + bool page_all_visible; + ItemId itemid; + OffsetNumber tupoffset, + maxoffset; + + if (!scan->rs_inited) + { + /* + * return null immediately if relation is empty + */ + if (scan->rs_nblocks == 0) + { + Assert(!BufferIsValid(scan->rs_cbuf)); + tuple->t_data = NULL; + return NULL; + } + blockno = DatumGetInt32(FunctionCall1(&desc->tsmnextblock, + PointerGetDatum(desc))); + if (!BlockNumberIsValid(blockno)) + { + tuple->t_data = NULL; + return NULL; + } + + heapgetpage(scan, blockno); + scan->rs_inited = true; + } + else + { + /* continue from previously returned page/tuple */ + blockno = scan->rs_cblock; /* current page */ + } + + /* + * When pagemode is disabled, the scan will do visibility checks for each + * tuple it finds so the buffer needs to be locked. + */ + if (!pagemode) + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); + + page = (Page) BufferGetPage(scan->rs_cbuf); + page_all_visible = PageIsAllVisible(page); + maxoffset = PageGetMaxOffsetNumber(page); + + for (;;) + { + CHECK_FOR_INTERRUPTS(); + + tupoffset = DatumGetUInt16(FunctionCall3(&desc->tsmnexttuple, + PointerGetDatum(desc), + UInt32GetDatum(blockno), + UInt16GetDatum(maxoffset))); + + if (OffsetNumberIsValid(tupoffset)) + { + bool visible; + bool found; + + /* Skip invalid tuple pointers. */ + itemid = PageGetItemId(page, tupoffset); + if (!ItemIdIsNormal(itemid)) + continue; + + tuple->t_data = (HeapTupleHeader) PageGetItem((Page) page, itemid); + tuple->t_len = ItemIdGetLength(itemid); + ItemPointerSet(&(tuple->t_self), blockno, tupoffset); + + if (page_all_visible) + visible = true; + else + visible = SampleTupleVisible(tuple, tupoffset, scan); + + /* + * Let the sampling method examine the actual tuple and decide if we + * should return it. + * + * Note that we let it examine even invisible tuples for + * statistical purposes, but not return them since user should + * never see invisible tuples. + */ + if (OidIsValid(desc->tsmexaminetuple.fn_oid)) + { + found = DatumGetBool(FunctionCall4(&desc->tsmexaminetuple, + PointerGetDatum(desc), + UInt32GetDatum(blockno), + PointerGetDatum(tuple), + BoolGetDatum(visible))); + /* Should not happen if sampling method is well written. */ + if (found && !visible) + elog(ERROR, "Sampling method wanted to return invisible tuple"); + } + else + found = visible; + + /* Found visible tuple, return it. */ + if (found) + { + if (!pagemode) + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); + break; + } + else + { + /* Try next tuple from same page. */ + continue; + } + } + + + if (!pagemode) + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); + + blockno = DatumGetInt32(FunctionCall1(&desc->tsmnextblock, + PointerGetDatum(desc))); + + /* + * Report our new scan position for synchronization purposes. We + * don't do that when moving backwards, however. That would just + * mess up any other forward-moving scanners. + * + * Note: we do this before checking for end of scan so that the + * final state of the position hint is back at the start of the + * rel. That's not strictly necessary, but otherwise when you run + * the same query multiple times the starting position would shift + * a little bit backwards on every invocation, which is confusing. + * We don't guarantee any specific ordering in general, though. + */ + if (scan->rs_syncscan) + ss_report_location(scan->rs_rd, BlockNumberIsValid(blockno) ? + blockno : scan->rs_startblock); + + /* + * Reached end of scan. + */ + if (!BlockNumberIsValid(blockno)) + { + if (BufferIsValid(scan->rs_cbuf)) + ReleaseBuffer(scan->rs_cbuf); + scan->rs_cbuf = InvalidBuffer; + scan->rs_cblock = InvalidBlockNumber; + tuple->t_data = NULL; + scan->rs_inited = false; + return NULL; + } + + heapgetpage(scan, blockno); + + if (!pagemode) + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); + + page = (Page) BufferGetPage(scan->rs_cbuf); + page_all_visible = PageIsAllVisible(page); + maxoffset = PageGetMaxOffsetNumber(page); + } + + pgstat_count_heap_getnext(scan->rs_rd); + + return &(scan->rs_ctup); +} + +/* + * Reset the sampling to starting state + */ +void +tablesample_reset(TableSampleDesc *desc) +{ + (void) FunctionCall1(&desc->tsmreset, PointerGetDatum(desc)); +} + +/* + * Signal the sampling method that the scan has finished. + */ +void +tablesample_end(TableSampleDesc *desc) +{ + (void) FunctionCall1(&desc->tsmend, PointerGetDatum(desc)); +} + +/* + * Check visibility of the tuple. + */ +static bool +SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, HeapScanDesc scan) +{ + /* + * If this scan is reading whole pages at a time, there is already + * visibility info present in rs_vistuples so we can just search it + * for the tupoffset. + */ + if (scan->rs_pageatatime) + { + int start = 0, + end = scan->rs_ntuples - 1; + + /* + * Do the binary search over rs_vistuples, it's already sorted by + * OffsetNumber so we don't need to do any sorting ourselves here. + * + * We could use bsearch() here but it's slower for integers because + * of the function call overhead and because it needs boiler plate code + * it would not save us anything code-wise anyway. + */ + while (start <= end) + { + int mid = start + (end - start) / 2; + OffsetNumber curoffset = scan->rs_vistuples[mid]; + + if (curoffset == tupoffset) + return true; + else if (curoffset > tupoffset) + end = mid - 1; + else + start = mid + 1; + } + + return false; + } + else + { + /* No pagemode, we have to check the tuple itself. */ + Snapshot snapshot = scan->rs_snapshot; + Buffer buffer = scan->rs_cbuf; + + bool visible = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer); + + CheckForSerializableConflictOut(visible, scan->rs_rd, tuple, buffer, + snapshot); + + return visible; + } +} |