aboutsummaryrefslogtreecommitdiff
path: root/src/backend/executor/nodeBitmapHeapscan.c
diff options
context:
space:
mode:
authorMelanie Plageman <melanieplageman@gmail.com>2024-10-25 10:11:58 -0400
committerMelanie Plageman <melanieplageman@gmail.com>2024-10-25 10:11:58 -0400
commitde380a62b5dae610b3504b5036e5d5b1150cc4a4 (patch)
treee1e874e8a346f1e7107c5b935f8e12c7b741ab58 /src/backend/executor/nodeBitmapHeapscan.c
parent7bd7aa4d30676de006636bb2c9c079c363d9d56c (diff)
downloadpostgresql-de380a62b5dae610b3504b5036e5d5b1150cc4a4.tar.gz
postgresql-de380a62b5dae610b3504b5036e5d5b1150cc4a4.zip
Make table_scan_bitmap_next_block() async-friendly
Move all responsibility for indicating a block is exhuasted into table_scan_bitmap_next_tuple() and advance the main iterator in heap-specific code. This flow control makes more sense and is a step toward using the read stream API for bitmap heap scans. Previously, table_scan_bitmap_next_block() returned false to indicate table_scan_bitmap_next_tuple() should not be called for the tuples on the page. This happened both when 1) there were no visible tuples on the page and 2) when the block returned by the iterator was past the end of the table. BitmapHeapNext() (generic bitmap table scan code) handled the case when the bitmap was exhausted. It makes more sense for table_scan_bitmap_next_tuple() to return false when there are no visible tuples on the page and table_scan_bitmap_next_block() to return false when the bitmap is exhausted or there are no more blocks in the table. As part of this new design, TBMIterateResults are no longer used as a flow control mechanism in BitmapHeapNext(), so we removed table_scan_bitmap_next_tuple's TBMIterateResult parameter. Note that the prefetch iterator is still saved in the BitmapHeapScanState node and advanced in generic bitmap table scan code. This is because 1) it was not necessary to change the prefetch iterator location to change the flow control in BitmapHeapNext() 2) modifying prefetch iterator management requires several more steps better split over multiple commits and 3) the prefetch iterator will be removed once the read stream API is used. Author: Melanie Plageman Reviewed-by: Tomas Vondra, Andres Freund, Heikki Linnakangas, Mark Dilger Discussion: https://postgr.es/m/063e4eb4-32d9-439e-a0b1-75565a9835a8%40iki.fi
Diffstat (limited to 'src/backend/executor/nodeBitmapHeapscan.c')
-rw-r--r--src/backend/executor/nodeBitmapHeapscan.c246
1 files changed, 139 insertions, 107 deletions
diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c
index f4690a20bb1..89a16f142b7 100644
--- a/src/backend/executor/nodeBitmapHeapscan.c
+++ b/src/backend/executor/nodeBitmapHeapscan.c
@@ -51,8 +51,7 @@
static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node);
static inline void BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate);
-static inline void BitmapAdjustPrefetchIterator(BitmapHeapScanState *node,
- BlockNumber blockno);
+static inline void BitmapAdjustPrefetchIterator(BitmapHeapScanState *node);
static inline void BitmapAdjustPrefetchTarget(BitmapHeapScanState *node);
static inline void BitmapPrefetch(BitmapHeapScanState *node,
TableScanDesc scan);
@@ -71,9 +70,6 @@ BitmapHeapNext(BitmapHeapScanState *node)
ExprContext *econtext;
TableScanDesc scan;
TIDBitmap *tbm;
- TBMIterator *tbmiterator = NULL;
- TBMSharedIterator *shared_tbmiterator = NULL;
- TBMIterateResult *tbmres;
TupleTableSlot *slot;
ParallelBitmapHeapState *pstate = node->pstate;
dsa_area *dsa = node->ss.ps.state->es_query_dsa;
@@ -85,11 +81,6 @@ BitmapHeapNext(BitmapHeapScanState *node)
slot = node->ss.ss_ScanTupleSlot;
scan = node->ss.ss_currentScanDesc;
tbm = node->tbm;
- if (pstate == NULL)
- tbmiterator = node->tbmiterator;
- else
- shared_tbmiterator = node->shared_tbmiterator;
- tbmres = node->tbmres;
/*
* If we haven't yet performed the underlying index scan, do it, and begin
@@ -105,6 +96,9 @@ BitmapHeapNext(BitmapHeapScanState *node)
*/
if (!node->initialized)
{
+ TBMIterator *tbmiterator = NULL;
+ TBMSharedIterator *shared_tbmiterator = NULL;
+
if (!pstate)
{
tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));
@@ -113,8 +107,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
elog(ERROR, "unrecognized result from subplan");
node->tbm = tbm;
- node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm);
- node->tbmres = tbmres = NULL;
+ tbmiterator = tbm_begin_iterate(tbm);
#ifdef USE_PREFETCH
if (node->prefetch_maximum > 0)
@@ -166,9 +159,8 @@ BitmapHeapNext(BitmapHeapScanState *node)
}
/* Allocate a private iterator and attach the shared state to it */
- node->shared_tbmiterator = shared_tbmiterator =
+ shared_tbmiterator =
tbm_attach_shared_iterate(dsa, pstate->tbmiterator);
- node->tbmres = tbmres = NULL;
#ifdef USE_PREFETCH
if (node->prefetch_maximum > 0)
@@ -207,47 +199,23 @@ BitmapHeapNext(BitmapHeapScanState *node)
node->ss.ss_currentScanDesc = scan;
}
+ scan->st.bitmap.rs_iterator = tbmiterator;
+ scan->st.bitmap.rs_shared_iterator = shared_tbmiterator;
node->initialized = true;
+
+ goto new_page;
}
for (;;)
{
- CHECK_FOR_INTERRUPTS();
-
- /*
- * Get next page of results if needed
- */
- if (tbmres == NULL)
- {
- if (!pstate)
- node->tbmres = tbmres = tbm_iterate(tbmiterator);
- else
- node->tbmres = tbmres = tbm_shared_iterate(shared_tbmiterator);
- if (tbmres == NULL)
- {
- /* no more entries in the bitmap */
- break;
- }
-
- BitmapAdjustPrefetchIterator(node, tbmres->blockno);
-
- if (!table_scan_bitmap_next_block(scan, tbmres,
- &node->stats.lossy_pages,
- &node->stats.exact_pages))
- {
- /* AM doesn't think this block is valid, skip */
- continue;
- }
-
- /* Adjust the prefetch target */
- BitmapAdjustPrefetchTarget(node);
- }
- else
+ while (table_scan_bitmap_next_tuple(scan, slot))
{
/*
* Continuing in previously obtained page.
*/
+ CHECK_FOR_INTERRUPTS();
+
#ifdef USE_PREFETCH
/*
@@ -268,45 +236,64 @@ BitmapHeapNext(BitmapHeapScanState *node)
SpinLockRelease(&pstate->mutex);
}
#endif /* USE_PREFETCH */
+
+ /*
+ * We issue prefetch requests *after* fetching the current page to
+ * try to avoid having prefetching interfere with the main I/O.
+ * Also, this should happen only when we have determined there is
+ * still something to do on the current page, else we may
+ * uselessly prefetch the same page we are just about to request
+ * for real.
+ */
+ BitmapPrefetch(node, scan);
+
+ /*
+ * If we are using lossy info, we have to recheck the qual
+ * conditions at every tuple.
+ */
+ if (node->recheck)
+ {
+ econtext->ecxt_scantuple = slot;
+ if (!ExecQualAndReset(node->bitmapqualorig, econtext))
+ {
+ /* Fails recheck, so drop it and loop back for another */
+ InstrCountFiltered2(node, 1);
+ ExecClearTuple(slot);
+ continue;
+ }
+ }
+
+ /* OK to return this tuple */
+ return slot;
}
- /*
- * We issue prefetch requests *after* fetching the current page to try
- * to avoid having prefetching interfere with the main I/O. Also, this
- * should happen only when we have determined there is still something
- * to do on the current page, else we may uselessly prefetch the same
- * page we are just about to request for real.
- */
- BitmapPrefetch(node, scan);
+new_page:
+
+ BitmapAdjustPrefetchIterator(node);
/*
- * Attempt to fetch tuple from AM.
+ * Returns false if the bitmap is exhausted and there are no further
+ * blocks we need to scan.
*/
- if (!table_scan_bitmap_next_tuple(scan, tbmres, slot))
- {
- /* nothing more to look at on this page */
- node->tbmres = tbmres = NULL;
- continue;
- }
+ if (!table_scan_bitmap_next_block(scan, &node->blockno,
+ &node->recheck,
+ &node->stats.lossy_pages,
+ &node->stats.exact_pages))
+ break;
/*
- * If we are using lossy info, we have to recheck the qual conditions
- * at every tuple.
+ * If serial, we can error out if the the prefetch block doesn't stay
+ * ahead of the current block.
*/
- if (tbmres->recheck)
- {
- econtext->ecxt_scantuple = slot;
- if (!ExecQualAndReset(node->bitmapqualorig, econtext))
- {
- /* Fails recheck, so drop it and loop back for another */
- InstrCountFiltered2(node, 1);
- ExecClearTuple(slot);
- continue;
- }
- }
-
- /* OK to return this tuple */
- return slot;
+ if (node->pstate == NULL &&
+ node->prefetch_iterator &&
+ node->prefetch_blockno < node->blockno)
+ elog(ERROR,
+ "prefetch and main iterators are out of sync. pfblockno: %d. blockno: %d",
+ node->prefetch_blockno, node->blockno);
+
+ /* Adjust the prefetch target */
+ BitmapAdjustPrefetchTarget(node);
}
/*
@@ -332,13 +319,17 @@ BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate)
/*
* BitmapAdjustPrefetchIterator - Adjust the prefetch iterator
+ *
+ * We keep track of how far the prefetch iterator is ahead of the main
+ * iterator in prefetch_pages. For each block the main iterator returns, we
+ * decrement prefetch_pages.
*/
static inline void
-BitmapAdjustPrefetchIterator(BitmapHeapScanState *node,
- BlockNumber blockno)
+BitmapAdjustPrefetchIterator(BitmapHeapScanState *node)
{
#ifdef USE_PREFETCH
ParallelBitmapHeapState *pstate = node->pstate;
+ TBMIterateResult *tbmpre;
if (pstate == NULL)
{
@@ -351,15 +342,22 @@ BitmapAdjustPrefetchIterator(BitmapHeapScanState *node,
}
else if (prefetch_iterator)
{
- /* Do not let the prefetch iterator get behind the main one */
- TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);
-
- if (tbmpre == NULL || tbmpre->blockno != blockno)
- elog(ERROR, "prefetch and main iterators are out of sync");
+ tbmpre = tbm_iterate(prefetch_iterator);
+ node->prefetch_blockno = tbmpre ? tbmpre->blockno :
+ InvalidBlockNumber;
}
return;
}
+ /*
+ * XXX: There is a known issue with keeping the prefetch and current block
+ * iterators in sync for parallel bitmap table scans. This can lead to
+ * prefetching blocks that have already been read. See the discussion
+ * here:
+ * https://postgr.es/m/20240315211449.en2jcmdqxv5o6tlz%40alap3.anarazel.de
+ * Note that moving the call site of BitmapAdjustPrefetchIterator()
+ * exacerbates the effects of this bug.
+ */
if (node->prefetch_maximum > 0)
{
TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator;
@@ -384,7 +382,11 @@ BitmapAdjustPrefetchIterator(BitmapHeapScanState *node,
* case.
*/
if (prefetch_iterator)
- tbm_shared_iterate(prefetch_iterator);
+ {
+ tbmpre = tbm_shared_iterate(prefetch_iterator);
+ node->prefetch_blockno = tbmpre ? tbmpre->blockno :
+ InvalidBlockNumber;
+ }
}
}
#endif /* USE_PREFETCH */
@@ -462,6 +464,7 @@ BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan)
break;
}
node->prefetch_pages++;
+ node->prefetch_blockno = tbmpre->blockno;
/*
* If we expect not to have to actually read this heap page,
@@ -519,6 +522,8 @@ BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan)
break;
}
+ node->prefetch_blockno = tbmpre->blockno;
+
/* As above, skip prefetch if we expect not to need page */
skip_fetch = (!(scan->rs_flags & SO_NEED_TUPLES) &&
!tbmpre->recheck &&
@@ -575,17 +580,32 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
{
PlanState *outerPlan = outerPlanState(node);
- /* rescan to release any page pin */
- if (node->ss.ss_currentScanDesc)
+ TableScanDesc scan = node->ss.ss_currentScanDesc;
+
+ if (scan)
+ {
+ /*
+ * End iteration on iterators saved in scan descriptor.
+ */
+ if (scan->st.bitmap.rs_shared_iterator)
+ {
+ tbm_end_shared_iterate(scan->st.bitmap.rs_shared_iterator);
+ scan->st.bitmap.rs_shared_iterator = NULL;
+ }
+
+ if (scan->st.bitmap.rs_iterator)
+ {
+ tbm_end_iterate(scan->st.bitmap.rs_iterator);
+ scan->st.bitmap.rs_iterator = NULL;
+ }
+
+ /* rescan to release any page pin */
table_rescan(node->ss.ss_currentScanDesc, NULL);
+ }
/* release bitmaps and buffers if any */
- if (node->tbmiterator)
- tbm_end_iterate(node->tbmiterator);
if (node->prefetch_iterator)
tbm_end_iterate(node->prefetch_iterator);
- if (node->shared_tbmiterator)
- tbm_end_shared_iterate(node->shared_tbmiterator);
if (node->shared_prefetch_iterator)
tbm_end_shared_iterate(node->shared_prefetch_iterator);
if (node->tbm)
@@ -593,13 +613,13 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
if (node->pvmbuffer != InvalidBuffer)
ReleaseBuffer(node->pvmbuffer);
node->tbm = NULL;
- node->tbmiterator = NULL;
- node->tbmres = NULL;
node->prefetch_iterator = NULL;
node->initialized = false;
- node->shared_tbmiterator = NULL;
node->shared_prefetch_iterator = NULL;
node->pvmbuffer = InvalidBuffer;
+ node->recheck = true;
+ node->blockno = InvalidBlockNumber;
+ node->prefetch_blockno = InvalidBlockNumber;
ExecScanReScan(&node->ss);
@@ -653,28 +673,40 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node)
*/
ExecEndNode(outerPlanState(node));
+ if (scanDesc)
+ {
+ /*
+ * End iteration on iterators saved in scan descriptor.
+ */
+ if (scanDesc->st.bitmap.rs_shared_iterator)
+ {
+ tbm_end_shared_iterate(scanDesc->st.bitmap.rs_shared_iterator);
+ scanDesc->st.bitmap.rs_shared_iterator = NULL;
+ }
+
+ if (scanDesc->st.bitmap.rs_iterator)
+ {
+ tbm_end_iterate(scanDesc->st.bitmap.rs_iterator);
+ scanDesc->st.bitmap.rs_iterator = NULL;
+ }
+
+ /*
+ * close table scan
+ */
+ table_endscan(scanDesc);
+ }
+
/*
* release bitmaps and buffers if any
*/
- if (node->tbmiterator)
- tbm_end_iterate(node->tbmiterator);
if (node->prefetch_iterator)
tbm_end_iterate(node->prefetch_iterator);
if (node->tbm)
tbm_free(node->tbm);
- if (node->shared_tbmiterator)
- tbm_end_shared_iterate(node->shared_tbmiterator);
if (node->shared_prefetch_iterator)
tbm_end_shared_iterate(node->shared_prefetch_iterator);
if (node->pvmbuffer != InvalidBuffer)
ReleaseBuffer(node->pvmbuffer);
-
- /*
- * close heap scan
- */
- if (scanDesc)
- table_endscan(scanDesc);
-
}
/* ----------------------------------------------------------------
@@ -707,8 +739,6 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
scanstate->ss.ps.ExecProcNode = ExecBitmapHeapScan;
scanstate->tbm = NULL;
- scanstate->tbmiterator = NULL;
- scanstate->tbmres = NULL;
scanstate->pvmbuffer = InvalidBuffer;
/* Zero the statistics counters */
@@ -718,9 +748,11 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
scanstate->prefetch_pages = 0;
scanstate->prefetch_target = 0;
scanstate->initialized = false;
- scanstate->shared_tbmiterator = NULL;
scanstate->shared_prefetch_iterator = NULL;
scanstate->pstate = NULL;
+ scanstate->recheck = true;
+ scanstate->blockno = InvalidBlockNumber;
+ scanstate->prefetch_blockno = InvalidBlockNumber;
/*
* Miscellaneous initialization