author    Melanie Plageman <melanieplageman@gmail.com>  2025-03-15 10:34:42 -0400
committer Melanie Plageman <melanieplageman@gmail.com>  2025-03-15 10:34:42 -0400
commit 2b73a8cd33b745c5b8a7f44322f86642519e3a40
tree   43c4cb7aa83247b634ff15d4df76a75f16221230
parent 944e81bf99db2b5b70b8a389d4f273534da73f74
BitmapHeapScan uses the read stream API
Make Bitmap Heap Scan use the read stream API instead of invoking
ReadBuffer() for each block indicated by the bitmap.

The read stream API handles prefetching, so remove all of the explicit
prefetching from bitmap heap scan code.

Now the heap table AM implements a read stream callback which uses the
bitmap iterator to return the next required block to the read stream
code.

Tomas Vondra conducted extensive regression testing of this feature.
Andres Freund, Thomas Munro, and I analyzed regressions and Thomas
Munro patched the read stream API.

Author: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Tomas Vondra <tomas@vondra.me>
Tested-by: Tomas Vondra <tomas@vondra.me>
Tested-by: Andres Freund <andres@anarazel.de>
Tested-by: Thomas Munro <thomas.munro@gmail.com>
Tested-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Discussion: https://postgr.es/m/flat/CAAKRu_ZwCwWFeL_H3ia26bP2e7HiKLWt0ZmGXPVwPO6uXq0vaA%40mail.gmail.com
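For context, the callback the commit message describes has roughly the
following shape. This is a minimal illustrative sketch against the read
stream API in storage/read_stream.h, not the actual heapam code: the names
BitmapStreamState and bitmap_stream_read_next are hypothetical, and it
assumes the stream was created with per_buffer_data_size =
sizeof(TBMIterateResult).

#include "postgres.h"
#include "nodes/tidbitmap.h"
#include "storage/read_stream.h"

/* Hypothetical per-scan state passed to the stream as callback_private_data. */
typedef struct BitmapStreamState
{
	TBMIterator tbmiterator;	/* iterator over the bitmap's blocks */
} BitmapStreamState;

/*
 * Read stream callback: return the next block the bitmap says the scan
 * needs, or InvalidBlockNumber once the bitmap is exhausted.  The stream
 * calls this ahead of the consumer to build I/O lookahead, which is what
 * replaces the explicit PrefetchBuffer() machinery deleted below.
 */
static BlockNumber
bitmap_stream_read_next(ReadStream *stream,
						void *callback_private_data,
						void *per_buffer_data)
{
	BitmapStreamState *state = (BitmapStreamState *) callback_private_data;
	TBMIterateResult *tbmres = (TBMIterateResult *) per_buffer_data;

	if (!tbm_iterate(&state->tbmiterator, tbmres))
		return InvalidBlockNumber;	/* bitmap exhausted: end the stream */

	/* The TBMIterateResult (recheck flag etc.) rides along with the buffer. */
	return tbmres->blockno;
}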
Diffstat (limited to 'src/backend/executor/nodeBitmapHeapscan.c')
-rw-r--r--  src/backend/executor/nodeBitmapHeapscan.c  341
1 file changed, 1 insertion(+), 340 deletions(-)
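On the consuming side, the executor then pulls pinned buffers from the
stream instead of calling ReadBuffer() per block. A hedged sketch, assuming
a variable `stream` was built with read_stream_begin_relation() over the
heap relation using the callback above:

	Buffer		buf;
	void	   *per_buffer_data;

	/* The stream reads, and prefetches ahead, blocks chosen by the callback. */
	while ((buf = read_stream_next_buffer(stream, &per_buffer_data)) != InvalidBuffer)
	{
		TBMIterateResult *tbmres = (TBMIterateResult *) per_buffer_data;

		/* ... process tuples on this page, honoring tbmres->recheck ... */
		ReleaseBuffer(buf);
	}
	read_stream_end(stream);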
diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c
index 3b4ea0f6144..6df34094a13 100644
--- a/src/backend/executor/nodeBitmapHeapscan.c
+++ b/src/backend/executor/nodeBitmapHeapscan.c
@@ -51,10 +51,6 @@
static void BitmapTableScanSetup(BitmapHeapScanState *node);
static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node);
static inline void BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate);
-static inline void BitmapAdjustPrefetchIterator(BitmapHeapScanState *node);
-static inline void BitmapAdjustPrefetchTarget(BitmapHeapScanState *node);
-static inline void BitmapPrefetch(BitmapHeapScanState *node,
- TableScanDesc scan);
static bool BitmapShouldInitializeSharedState(ParallelBitmapHeapState *pstate);
@@ -62,14 +58,6 @@ static bool BitmapShouldInitializeSharedState(ParallelBitmapHeapState *pstate);
* Do the underlying index scan, build the bitmap, set up the parallel state
* needed for parallel workers to iterate through the bitmap, and set up the
* underlying table scan descriptor.
- *
- * For prefetching, we use *two* iterators, one for the pages we are actually
- * scanning and another that runs ahead of the first for prefetching.
- * node->prefetch_pages tracks exactly how many pages ahead the prefetch
- * iterator is. Also, node->prefetch_target tracks the desired prefetch
- * distance, which starts small and increases up to the
- * node->prefetch_maximum. This is to avoid doing a lot of prefetching in a
- * scan that stops after a few tuples because of a LIMIT.
*/
static void
BitmapTableScanSetup(BitmapHeapScanState *node)
@@ -102,14 +90,6 @@ BitmapTableScanSetup(BitmapHeapScanState *node)
*/
pstate->tbmiterator = tbm_prepare_shared_iterate(node->tbm);
-#ifdef USE_PREFETCH
- if (node->prefetch_maximum > 0)
- {
- pstate->prefetch_iterator =
- tbm_prepare_shared_iterate(node->tbm);
- }
-#endif /* USE_PREFETCH */
-
/* We have initialized the shared state so wake up others. */
BitmapDoneInitializingSharedState(pstate);
}
@@ -119,15 +99,6 @@ BitmapTableScanSetup(BitmapHeapScanState *node)
pstate->tbmiterator :
InvalidDsaPointer);
-#ifdef USE_PREFETCH
- if (node->prefetch_maximum > 0)
- node->prefetch_iterator =
- tbm_begin_iterate(node->tbm, dsa,
- pstate ?
- pstate->prefetch_iterator :
- InvalidDsaPointer);
-#endif /* USE_PREFETCH */
-
/*
* If this is the first scan of the underlying table, create the table
* scan descriptor and begin the scan.
@@ -158,7 +129,6 @@ BitmapTableScanSetup(BitmapHeapScanState *node)
node->initialized = true;
}
-
/* ----------------------------------------------------------------
* BitmapHeapNext
*
@@ -172,10 +142,6 @@ BitmapHeapNext(BitmapHeapScanState *node)
TableScanDesc scan;
TupleTableSlot *slot;
-#ifdef USE_PREFETCH
- ParallelBitmapHeapState *pstate = node->pstate;
-#endif
-
/*
* extract necessary information from index scan node
*/
@@ -204,37 +170,6 @@ BitmapHeapNext(BitmapHeapScanState *node)
CHECK_FOR_INTERRUPTS();
-#ifdef USE_PREFETCH
-
- /*
- * Try to prefetch at least a few pages even before we get to the
- * second page if we don't stop reading after the first tuple.
- */
- if (!pstate)
- {
- if (node->prefetch_target < node->prefetch_maximum)
- node->prefetch_target++;
- }
- else if (pstate->prefetch_target < node->prefetch_maximum)
- {
- /* take spinlock while updating shared state */
- SpinLockAcquire(&pstate->mutex);
- if (pstate->prefetch_target < node->prefetch_maximum)
- pstate->prefetch_target++;
- SpinLockRelease(&pstate->mutex);
- }
-#endif /* USE_PREFETCH */
-
- /*
- * We issue prefetch requests *after* fetching the current page to
- * try to avoid having prefetching interfere with the main I/O.
- * Also, this should happen only when we have determined there is
- * still something to do on the current page, else we may
- * uselessly prefetch the same page we are just about to request
- * for real.
- */
- BitmapPrefetch(node, scan);
-
/*
* If we are using lossy info, we have to recheck the qual
* conditions at every tuple.
@@ -257,31 +192,15 @@ BitmapHeapNext(BitmapHeapScanState *node)
new_page:
- BitmapAdjustPrefetchIterator(node);
-
/*
* Returns false if the bitmap is exhausted and there are no further
* blocks we need to scan.
*/
- if (!table_scan_bitmap_next_block(scan, &node->blockno,
+ if (!table_scan_bitmap_next_block(scan,
&node->recheck,
&node->stats.lossy_pages,
&node->stats.exact_pages))
break;
-
- /*
- * If serial, we can error out if the prefetch block doesn't stay
- * ahead of the current block.
- */
- if (node->pstate == NULL &&
- !tbm_exhausted(&node->prefetch_iterator) &&
- node->prefetch_blockno < node->blockno)
- elog(ERROR,
- "prefetch and main iterators are out of sync. pfblockno: %d. blockno: %d",
- node->prefetch_blockno, node->blockno);
-
- /* Adjust the prefetch target */
- BitmapAdjustPrefetchTarget(node);
}
/*
@@ -306,225 +225,6 @@ BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate)
}
/*
- * BitmapAdjustPrefetchIterator - Adjust the prefetch iterator
- *
- * We keep track of how far the prefetch iterator is ahead of the main
- * iterator in prefetch_pages. For each block the main iterator returns, we
- * decrement prefetch_pages.
- */
-static inline void
-BitmapAdjustPrefetchIterator(BitmapHeapScanState *node)
-{
-#ifdef USE_PREFETCH
- ParallelBitmapHeapState *pstate = node->pstate;
- TBMIterateResult tbmpre;
-
- if (pstate == NULL)
- {
- TBMIterator *prefetch_iterator = &node->prefetch_iterator;
-
- if (node->prefetch_pages > 0)
- {
- /* The main iterator has closed the distance by one page */
- node->prefetch_pages--;
- }
- else if (!tbm_exhausted(prefetch_iterator))
- {
- tbm_iterate(prefetch_iterator, &tbmpre);
- node->prefetch_blockno = tbmpre.blockno;
- }
- return;
- }
-
- /*
- * XXX: There is a known issue with keeping the prefetch and current block
- * iterators in sync for parallel bitmap table scans. This can lead to
- * prefetching blocks that have already been read. See the discussion
- * here:
- * https://postgr.es/m/20240315211449.en2jcmdqxv5o6tlz%40alap3.anarazel.de
- * Note that moving the call site of BitmapAdjustPrefetchIterator()
- * exacerbates the effects of this bug.
- */
- if (node->prefetch_maximum > 0)
- {
- TBMIterator *prefetch_iterator = &node->prefetch_iterator;
-
- SpinLockAcquire(&pstate->mutex);
- if (pstate->prefetch_pages > 0)
- {
- pstate->prefetch_pages--;
- SpinLockRelease(&pstate->mutex);
- }
- else
- {
- /* Release the mutex before iterating */
- SpinLockRelease(&pstate->mutex);
-
- /*
- * In case of shared mode, we can not ensure that the current
- * blockno of the main iterator and that of the prefetch iterator
- * are same. It's possible that whatever blockno we are
- * prefetching will be processed by another process. Therefore,
- * we don't validate the blockno here as we do in non-parallel
- * case.
- */
- if (!tbm_exhausted(prefetch_iterator))
- {
- tbm_iterate(prefetch_iterator, &tbmpre);
- node->prefetch_blockno = tbmpre.blockno;
- }
- }
- }
-#endif /* USE_PREFETCH */
-}
-
-/*
- * BitmapAdjustPrefetchTarget - Adjust the prefetch target
- *
- * Increase prefetch target if it's not yet at the max. Note that
- * we will increase it to zero after fetching the very first
- * page/tuple, then to one after the second tuple is fetched, then
- * it doubles as later pages are fetched.
- */
-static inline void
-BitmapAdjustPrefetchTarget(BitmapHeapScanState *node)
-{
-#ifdef USE_PREFETCH
- ParallelBitmapHeapState *pstate = node->pstate;
-
- if (pstate == NULL)
- {
- if (node->prefetch_target >= node->prefetch_maximum)
- /* don't increase any further */ ;
- else if (node->prefetch_target >= node->prefetch_maximum / 2)
- node->prefetch_target = node->prefetch_maximum;
- else if (node->prefetch_target > 0)
- node->prefetch_target *= 2;
- else
- node->prefetch_target++;
- return;
- }
-
- /* Do an unlocked check first to save spinlock acquisitions. */
- if (pstate->prefetch_target < node->prefetch_maximum)
- {
- SpinLockAcquire(&pstate->mutex);
- if (pstate->prefetch_target >= node->prefetch_maximum)
- /* don't increase any further */ ;
- else if (pstate->prefetch_target >= node->prefetch_maximum / 2)
- pstate->prefetch_target = node->prefetch_maximum;
- else if (pstate->prefetch_target > 0)
- pstate->prefetch_target *= 2;
- else
- pstate->prefetch_target++;
- SpinLockRelease(&pstate->mutex);
- }
-#endif /* USE_PREFETCH */
-}
-
-/*
- * BitmapPrefetch - Prefetch, if prefetch_pages are behind prefetch_target
- */
-static inline void
-BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan)
-{
-#ifdef USE_PREFETCH
- ParallelBitmapHeapState *pstate = node->pstate;
-
- if (pstate == NULL)
- {
- TBMIterator *prefetch_iterator = &node->prefetch_iterator;
-
- if (!tbm_exhausted(prefetch_iterator))
- {
- while (node->prefetch_pages < node->prefetch_target)
- {
- TBMIterateResult tbmpre;
- bool skip_fetch;
-
- if (!tbm_iterate(prefetch_iterator, &tbmpre))
- {
- /* No more pages to prefetch */
- Assert(!BlockNumberIsValid(tbmpre.blockno));
- tbm_end_iterate(prefetch_iterator);
- break;
- }
- node->prefetch_pages++;
- node->prefetch_blockno = tbmpre.blockno;
-
- /*
- * If we expect not to have to actually read this heap page,
- * skip this prefetch call, but continue to run the prefetch
- * logic normally. (Would it be better not to increment
- * prefetch_pages?)
- */
- skip_fetch = (!(scan->rs_flags & SO_NEED_TUPLES) &&
- !tbmpre.recheck &&
- VM_ALL_VISIBLE(node->ss.ss_currentRelation,
- tbmpre.blockno,
- &node->pvmbuffer));
-
- if (!skip_fetch)
- PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre.blockno);
- }
- }
-
- return;
- }
-
- if (pstate->prefetch_pages < pstate->prefetch_target)
- {
- TBMIterator *prefetch_iterator = &node->prefetch_iterator;
-
- if (!tbm_exhausted(prefetch_iterator))
- {
- while (1)
- {
- TBMIterateResult tbmpre;
- bool do_prefetch = false;
- bool skip_fetch;
-
- /*
- * Recheck under the mutex. If some other process has already
- * done enough prefetching then we need not to do anything.
- */
- SpinLockAcquire(&pstate->mutex);
- if (pstate->prefetch_pages < pstate->prefetch_target)
- {
- pstate->prefetch_pages++;
- do_prefetch = true;
- }
- SpinLockRelease(&pstate->mutex);
-
- if (!do_prefetch)
- return;
-
- if (!tbm_iterate(prefetch_iterator, &tbmpre))
- {
- Assert(!BlockNumberIsValid(tbmpre.blockno));
- /* No more pages to prefetch */
- tbm_end_iterate(prefetch_iterator);
- break;
- }
-
- node->prefetch_blockno = tbmpre.blockno;
-
- /* As above, skip prefetch if we expect not to need page */
- skip_fetch = (!(scan->rs_flags & SO_NEED_TUPLES) &&
- !tbmpre.recheck &&
- VM_ALL_VISIBLE(node->ss.ss_currentRelation,
- tbmpre.blockno,
- &node->pvmbuffer));
-
- if (!skip_fetch)
- PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre.blockno);
- }
- }
- }
-#endif /* USE_PREFETCH */
-}
-
-/*
* BitmapHeapRecheck -- access method routine to recheck a tuple in EvalPlanQual
*/
static bool
@@ -580,24 +280,12 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
table_rescan(node->ss.ss_currentScanDesc, NULL);
}
- /* If we did not already clean up the prefetch iterator, do so now. */
- if (!tbm_exhausted(&node->prefetch_iterator))
- tbm_end_iterate(&node->prefetch_iterator);
-
/* release bitmaps and buffers if any */
if (node->tbm)
tbm_free(node->tbm);
- if (node->pvmbuffer != InvalidBuffer)
- ReleaseBuffer(node->pvmbuffer);
node->tbm = NULL;
node->initialized = false;
- node->pvmbuffer = InvalidBuffer;
node->recheck = true;
- /* Only used for serial BHS */
- node->blockno = InvalidBlockNumber;
- node->prefetch_blockno = InvalidBlockNumber;
- node->prefetch_pages = 0;
- node->prefetch_target = -1;
ExecScanReScan(&node->ss);
@@ -666,17 +354,11 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node)
table_endscan(scanDesc);
}
- /* If we did not already clean up the prefetch iterator, do so now. */
- if (!tbm_exhausted(&node->prefetch_iterator))
- tbm_end_iterate(&node->prefetch_iterator);
-
/*
* release bitmaps and buffers if any
*/
if (node->tbm)
tbm_free(node->tbm);
- if (node->pvmbuffer != InvalidBuffer)
- ReleaseBuffer(node->pvmbuffer);
}
/* ----------------------------------------------------------------
@@ -709,18 +391,13 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
scanstate->ss.ps.ExecProcNode = ExecBitmapHeapScan;
scanstate->tbm = NULL;
- scanstate->pvmbuffer = InvalidBuffer;
/* Zero the statistics counters */
memset(&scanstate->stats, 0, sizeof(BitmapHeapScanInstrumentation));
- scanstate->prefetch_pages = 0;
- scanstate->prefetch_target = -1;
scanstate->initialized = false;
scanstate->pstate = NULL;
scanstate->recheck = true;
- scanstate->blockno = InvalidBlockNumber;
- scanstate->prefetch_blockno = InvalidBlockNumber;
/*
* Miscellaneous initialization
@@ -760,13 +437,6 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
scanstate->bitmapqualorig =
ExecInitQual(node->bitmapqualorig, (PlanState *) scanstate);
- /*
- * Maximum number of prefetches for the tablespace if configured,
- * otherwise the current value of the effective_io_concurrency GUC.
- */
- scanstate->prefetch_maximum =
- get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace);
-
scanstate->ss.ss_currentRelation = currentRelation;
/*
@@ -870,12 +540,9 @@ ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node,
sinstrument = (SharedBitmapHeapInstrumentation *) ptr;
pstate->tbmiterator = 0;
- pstate->prefetch_iterator = 0;
/* Initialize the mutex */
SpinLockInit(&pstate->mutex);
- pstate->prefetch_pages = 0;
- pstate->prefetch_target = -1;
pstate->state = BM_INITIAL;
ConditionVariableInit(&pstate->cv);
@@ -912,17 +579,11 @@ ExecBitmapHeapReInitializeDSM(BitmapHeapScanState *node,
return;
pstate->state = BM_INITIAL;
- pstate->prefetch_pages = 0;
- pstate->prefetch_target = -1;
if (DsaPointerIsValid(pstate->tbmiterator))
tbm_free_shared_area(dsa, pstate->tbmiterator);
- if (DsaPointerIsValid(pstate->prefetch_iterator))
- tbm_free_shared_area(dsa, pstate->prefetch_iterator);
-
pstate->tbmiterator = InvalidDsaPointer;
- pstate->prefetch_iterator = InvalidDsaPointer;
}
/* ----------------------------------------------------------------