aboutsummaryrefslogtreecommitdiff
path: root/src/backend/access/nbtree/nbtree.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/access/nbtree/nbtree.c')
-rw-r--r--src/backend/access/nbtree/nbtree.c202
1 files changed, 118 insertions, 84 deletions
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 41df1027d2d..686a3206f72 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -40,6 +40,9 @@
/*
* BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
*
+ * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
+ * scan to advance it via another call to _bt_first.
+ *
* BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
* a new page; others must wait.
*
@@ -47,11 +50,11 @@
* to a new page; some process can start doing that.
*
* BTPARALLEL_DONE indicates that the scan is complete (including error exit).
- * We reach this state once for every distinct combination of array keys.
*/
typedef enum
{
BTPARALLEL_NOT_INITIALIZED,
+ BTPARALLEL_NEED_PRIMSCAN,
BTPARALLEL_ADVANCING,
BTPARALLEL_IDLE,
BTPARALLEL_DONE,
@@ -67,10 +70,14 @@ typedef struct BTParallelScanDescData
BTPS_State btps_pageStatus; /* indicates whether next page is
* available for scan. see above for
* possible states of parallel scan. */
- int btps_arrayKeyCount; /* count indicating number of array scan
- * keys processed by parallel scan */
- slock_t btps_mutex; /* protects above variables */
+ slock_t btps_mutex; /* protects above variables, btps_arrElems */
ConditionVariable btps_cv; /* used to synchronize parallel scan */
+
+ /*
+ * btps_arrElems is used when scans need to schedule another primitive
+ * index scan. Holds BTArrayKeyInfo.cur_elem offsets for scan keys.
+ */
+ int btps_arrElems[FLEXIBLE_ARRAY_MEMBER];
} BTParallelScanDescData;
typedef struct BTParallelScanDescData *BTParallelScanDesc;
@@ -204,21 +211,7 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
/* btree indexes are never lossy */
scan->xs_recheck = false;
- /*
- * If we have any array keys, initialize them during first call for a
- * scan. We can't do this in btrescan because we don't know the scan
- * direction at that time.
- */
- if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
- {
- /* punt if we have any unsatisfiable array keys */
- if (so->numArrayKeys < 0)
- return false;
-
- _bt_start_array_keys(scan, dir);
- }
-
- /* This loop handles advancing to the next array elements, if any */
+ /* Each loop iteration performs another primitive index scan */
do
{
/*
@@ -260,8 +253,8 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
/* If we have a tuple, return it ... */
if (res)
break;
- /* ... otherwise see if we have more array keys to deal with */
- } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));
+ /* ... otherwise see if we need another primitive index scan */
+ } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir));
return res;
}
@@ -276,19 +269,7 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
int64 ntids = 0;
ItemPointer heapTid;
- /*
- * If we have any array keys, initialize them.
- */
- if (so->numArrayKeys)
- {
- /* punt if we have any unsatisfiable array keys */
- if (so->numArrayKeys < 0)
- return ntids;
-
- _bt_start_array_keys(scan, ForwardScanDirection);
- }
-
- /* This loop handles advancing to the next array elements, if any */
+ /* Each loop iteration performs another primitive index scan */
do
{
/* Fetch the first page & tuple */
@@ -318,8 +299,8 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
ntids++;
}
}
- /* Now see if we have more array keys to deal with */
- } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));
+ /* Now see if we need another primitive index scan */
+ } while (so->numArrayKeys && _bt_start_prim_scan(scan, ForwardScanDirection));
return ntids;
}
@@ -348,10 +329,10 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
else
so->keyData = NULL;
- so->arrayKeyData = NULL; /* assume no array keys for now */
- so->arraysStarted = false;
- so->numArrayKeys = 0;
+ so->needPrimScan = false;
+ so->scanBehind = false;
so->arrayKeys = NULL;
+ so->orderProcs = NULL;
so->arrayContext = NULL;
so->killedItems = NULL; /* until needed */
@@ -391,7 +372,8 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
}
so->markItemIndex = -1;
- so->arrayKeyCount = 0;
+ so->needPrimScan = false;
+ so->scanBehind = false;
BTScanPosUnpinIfPinned(so->markPos);
BTScanPosInvalidate(so->markPos);
@@ -425,9 +407,7 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
scankey,
scan->numberOfKeys * sizeof(ScanKeyData));
so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
-
- /* If any keys are SK_SEARCHARRAY type, set up array-key info */
- _bt_preprocess_array_keys(scan);
+ so->numArrayKeys = 0; /* ditto */
}
/*
@@ -455,7 +435,7 @@ btendscan(IndexScanDesc scan)
/* Release storage */
if (so->keyData != NULL)
pfree(so->keyData);
- /* so->arrayKeyData and so->arrayKeys are in arrayContext */
+ /* so->arrayKeys and so->orderProcs are in arrayContext */
if (so->arrayContext != NULL)
MemoryContextDelete(so->arrayContext);
if (so->killedItems != NULL)
@@ -490,10 +470,6 @@ btmarkpos(IndexScanDesc scan)
BTScanPosInvalidate(so->markPos);
so->markItemIndex = -1;
}
-
- /* Also record the current positions of any array keys */
- if (so->numArrayKeys)
- _bt_mark_array_keys(scan);
}
/*
@@ -504,10 +480,6 @@ btrestrpos(IndexScanDesc scan)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
- /* Restore the marked positions of any array keys */
- if (so->numArrayKeys)
- _bt_restore_array_keys(scan);
-
if (so->markItemIndex >= 0)
{
/*
@@ -546,6 +518,12 @@ btrestrpos(IndexScanDesc scan)
if (so->currTuples)
memcpy(so->currTuples, so->markTuples,
so->markPos.nextTupleOffset);
+ /* Reset the scan's array keys (see _bt_steppage for why) */
+ if (so->numArrayKeys)
+ {
+ _bt_start_array_keys(scan, so->currPos.dir);
+ so->needPrimScan = false;
+ }
}
else
BTScanPosInvalidate(so->currPos);
@@ -556,9 +534,10 @@ btrestrpos(IndexScanDesc scan)
* btestimateparallelscan -- estimate storage for BTParallelScanDescData
*/
Size
-btestimateparallelscan(void)
+btestimateparallelscan(int nkeys, int norderbys)
{
- return sizeof(BTParallelScanDescData);
+ /* Pessimistically assume all input scankeys will be output with arrays */
+ return offsetof(BTParallelScanDescData, btps_arrElems) + sizeof(int) * nkeys;
}
/*
@@ -572,7 +551,6 @@ btinitparallelscan(void *target)
SpinLockInit(&bt_target->btps_mutex);
bt_target->btps_scanPage = InvalidBlockNumber;
bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
- bt_target->btps_arrayKeyCount = 0;
ConditionVariableInit(&bt_target->btps_cv);
}
@@ -598,7 +576,6 @@ btparallelrescan(IndexScanDesc scan)
SpinLockAcquire(&btscan->btps_mutex);
btscan->btps_scanPage = InvalidBlockNumber;
btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
- btscan->btps_arrayKeyCount = 0;
SpinLockRelease(&btscan->btps_mutex);
}
@@ -608,23 +585,26 @@ btparallelrescan(IndexScanDesc scan)
* or _bt_parallel_done().
*
* The return value is true if we successfully seized the scan and false
- * if we did not. The latter case occurs if no pages remain for the current
- * set of scankeys.
+ * if we did not. The latter case occurs if no pages remain.
*
* If the return value is true, *pageno returns the next or current page
* of the scan (depending on the scan direction). An invalid block number
- * means the scan hasn't yet started, and P_NONE means we've reached the end.
+ * means the scan hasn't yet started, or that caller needs to start the next
+ * primitive index scan (if it's the latter case we'll set so.needPrimScan).
* The first time a participating process reaches the last page, it will return
* true and set *pageno to P_NONE; after that, further attempts to seize the
* scan will return false.
*
* Callers should ignore the value of pageno if the return value is false.
+ *
+ * Callers that are in a position to start a new primitive index scan must
+ * pass first=true (all other callers pass first=false). We just return false
+ * for first=false callers that require another primitive index scan.
*/
bool
-_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
+_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
- BTPS_State pageStatus;
bool exit_loop = false;
bool status = true;
ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
@@ -632,28 +612,69 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
*pageno = P_NONE;
+ if (first)
+ {
+ /*
+ * Initialize array related state when called from _bt_first, assuming
+ * that this will either be the first primitive index scan for the
+ * scan, or a previous explicitly scheduled primitive scan.
+ *
+ * Note: so->needPrimScan is only set when a scheduled primitive index
+ * scan is set to be performed in caller's worker process. It should
+ * not be set here by us for the first primitive scan, nor should we
+ * ever set it for a parallel scan that has no array keys.
+ */
+ so->needPrimScan = false;
+ so->scanBehind = false;
+ }
+ else
+ {
+ /*
+ * Don't attempt to seize the scan when backend requires another
+ * primitive index scan unless we're in a position to start it now
+ */
+ if (so->needPrimScan)
+ return false;
+ }
+
btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
parallel_scan->ps_offset);
while (1)
{
SpinLockAcquire(&btscan->btps_mutex);
- pageStatus = btscan->btps_pageStatus;
- if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
+ if (btscan->btps_pageStatus == BTPARALLEL_DONE)
{
- /* Parallel scan has already advanced to a new set of scankeys. */
+ /* We're done with this parallel index scan */
status = false;
}
- else if (pageStatus == BTPARALLEL_DONE)
+ else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
{
+ Assert(so->numArrayKeys);
+
/*
- * We're done with this set of scankeys. This may be the end, or
- * there could be more sets to try.
+ * If we can start another primitive scan right away, do so.
+ * Otherwise just wait.
*/
- status = false;
+ if (first)
+ {
+ btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
+ for (int i = 0; i < so->numArrayKeys; i++)
+ {
+ BTArrayKeyInfo *array = &so->arrayKeys[i];
+ ScanKey skey = &so->keyData[array->scan_key];
+
+ array->cur_elem = btscan->btps_arrElems[i];
+ skey->sk_argument = array->elem_values[array->cur_elem];
+ }
+ so->needPrimScan = true;
+ so->scanBehind = false;
+ *pageno = InvalidBlockNumber;
+ exit_loop = true;
+ }
}
- else if (pageStatus != BTPARALLEL_ADVANCING)
+ else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
{
/*
* We have successfully seized control of the scan for the purpose
@@ -677,6 +698,12 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
* _bt_parallel_release() -- Complete the process of advancing the scan to a
* new page. We now have the new value btps_scanPage; some other backend
* can now begin advancing the scan.
+ *
+ * Callers whose scan uses array keys must save their scan_page argument so
+ * that it can be passed to _bt_parallel_primscan_schedule, should caller
+ * determine that another primitive index scan is required. If that happens,
+ * scan_page won't be scanned by any backend (unless the next primitive index
+ * scan lands on scan_page).
*/
void
_bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
@@ -704,7 +731,6 @@ _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
void
_bt_parallel_done(IndexScanDesc scan)
{
- BTScanOpaque so = (BTScanOpaque) scan->opaque;
ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
BTParallelScanDesc btscan;
bool status_changed = false;
@@ -717,13 +743,11 @@ _bt_parallel_done(IndexScanDesc scan)
parallel_scan->ps_offset);
/*
- * Mark the parallel scan as done for this combination of scan keys,
- * unless some other process already did so. See also
- * _bt_advance_array_keys.
+ * Mark the parallel scan as done, unless some other process did so
+ * already
*/
SpinLockAcquire(&btscan->btps_mutex);
- if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
- btscan->btps_pageStatus != BTPARALLEL_DONE)
+ if (btscan->btps_pageStatus != BTPARALLEL_DONE)
{
btscan->btps_pageStatus = BTPARALLEL_DONE;
status_changed = true;
@@ -736,29 +760,39 @@ _bt_parallel_done(IndexScanDesc scan)
}
/*
- * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
- * keys.
+ * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
*
- * Updates the count of array keys processed for both local and parallel
- * scans.
+ * Caller passes the block number most recently passed to _bt_parallel_release
+ * by its backend. Caller successfully schedules the next primitive index scan
+ * if the shared parallel state hasn't been seized since caller's backend last
+ * advanced the scan.
*/
void
-_bt_parallel_advance_array_keys(IndexScanDesc scan)
+_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
BTParallelScanDesc btscan;
+ Assert(so->numArrayKeys);
+
btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
parallel_scan->ps_offset);
- so->arrayKeyCount++;
SpinLockAcquire(&btscan->btps_mutex);
- if (btscan->btps_pageStatus == BTPARALLEL_DONE)
+ if (btscan->btps_scanPage == prev_scan_page &&
+ btscan->btps_pageStatus == BTPARALLEL_IDLE)
{
btscan->btps_scanPage = InvalidBlockNumber;
- btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
- btscan->btps_arrayKeyCount++;
+ btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
+
+ /* Serialize scan's current array keys */
+ for (int i = 0; i < so->numArrayKeys; i++)
+ {
+ BTArrayKeyInfo *array = &so->arrayKeys[i];
+
+ btscan->btps_arrElems[i] = array->cur_elem;
+ }
}
SpinLockRelease(&btscan->btps_mutex);
}