diff options
Diffstat (limited to 'src/backend/access/nbtree/nbtree.c')
-rw-r--r-- | src/backend/access/nbtree/nbtree.c | 202 |
1 files changed, 118 insertions, 84 deletions
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 41df1027d2d..686a3206f72 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -40,6 +40,9 @@ /* * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started. * + * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the + * scan to advance it via another call to _bt_first. + * * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to * a new page; others must wait. * @@ -47,11 +50,11 @@ * to a new page; some process can start doing that. * * BTPARALLEL_DONE indicates that the scan is complete (including error exit). - * We reach this state once for every distinct combination of array keys. */ typedef enum { BTPARALLEL_NOT_INITIALIZED, + BTPARALLEL_NEED_PRIMSCAN, BTPARALLEL_ADVANCING, BTPARALLEL_IDLE, BTPARALLEL_DONE, @@ -67,10 +70,14 @@ typedef struct BTParallelScanDescData BTPS_State btps_pageStatus; /* indicates whether next page is * available for scan. see above for * possible states of parallel scan. */ - int btps_arrayKeyCount; /* count indicating number of array scan - * keys processed by parallel scan */ - slock_t btps_mutex; /* protects above variables */ + slock_t btps_mutex; /* protects above variables, btps_arrElems */ ConditionVariable btps_cv; /* used to synchronize parallel scan */ + + /* + * btps_arrElems is used when scans need to schedule another primitive + * index scan. Holds BTArrayKeyInfo.cur_elem offsets for scan keys. + */ + int btps_arrElems[FLEXIBLE_ARRAY_MEMBER]; } BTParallelScanDescData; typedef struct BTParallelScanDescData *BTParallelScanDesc; @@ -204,21 +211,7 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) /* btree indexes are never lossy */ scan->xs_recheck = false; - /* - * If we have any array keys, initialize them during first call for a - * scan. We can't do this in btrescan because we don't know the scan - * direction at that time. - */ - if (so->numArrayKeys && !BTScanPosIsValid(so->currPos)) - { - /* punt if we have any unsatisfiable array keys */ - if (so->numArrayKeys < 0) - return false; - - _bt_start_array_keys(scan, dir); - } - - /* This loop handles advancing to the next array elements, if any */ + /* Each loop iteration performs another primitive index scan */ do { /* @@ -260,8 +253,8 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) /* If we have a tuple, return it ... */ if (res) break; - /* ... otherwise see if we have more array keys to deal with */ - } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir)); + /* ... otherwise see if we need another primitive index scan */ + } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir)); return res; } @@ -276,19 +269,7 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) int64 ntids = 0; ItemPointer heapTid; - /* - * If we have any array keys, initialize them. - */ - if (so->numArrayKeys) - { - /* punt if we have any unsatisfiable array keys */ - if (so->numArrayKeys < 0) - return ntids; - - _bt_start_array_keys(scan, ForwardScanDirection); - } - - /* This loop handles advancing to the next array elements, if any */ + /* Each loop iteration performs another primitive index scan */ do { /* Fetch the first page & tuple */ @@ -318,8 +299,8 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) ntids++; } } - /* Now see if we have more array keys to deal with */ - } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection)); + /* Now see if we need another primitive index scan */ + } while (so->numArrayKeys && _bt_start_prim_scan(scan, ForwardScanDirection)); return ntids; } @@ -348,10 +329,10 @@ btbeginscan(Relation rel, int nkeys, int norderbys) else so->keyData = NULL; - so->arrayKeyData = NULL; /* assume no array keys for now */ - so->arraysStarted = false; - so->numArrayKeys = 0; + so->needPrimScan = false; + so->scanBehind = false; so->arrayKeys = NULL; + so->orderProcs = NULL; so->arrayContext = NULL; so->killedItems = NULL; /* until needed */ @@ -391,7 +372,8 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, } so->markItemIndex = -1; - so->arrayKeyCount = 0; + so->needPrimScan = false; + so->scanBehind = false; BTScanPosUnpinIfPinned(so->markPos); BTScanPosInvalidate(so->markPos); @@ -425,9 +407,7 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, scankey, scan->numberOfKeys * sizeof(ScanKeyData)); so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */ - - /* If any keys are SK_SEARCHARRAY type, set up array-key info */ - _bt_preprocess_array_keys(scan); + so->numArrayKeys = 0; /* ditto */ } /* @@ -455,7 +435,7 @@ btendscan(IndexScanDesc scan) /* Release storage */ if (so->keyData != NULL) pfree(so->keyData); - /* so->arrayKeyData and so->arrayKeys are in arrayContext */ + /* so->arrayKeys and so->orderProcs are in arrayContext */ if (so->arrayContext != NULL) MemoryContextDelete(so->arrayContext); if (so->killedItems != NULL) @@ -490,10 +470,6 @@ btmarkpos(IndexScanDesc scan) BTScanPosInvalidate(so->markPos); so->markItemIndex = -1; } - - /* Also record the current positions of any array keys */ - if (so->numArrayKeys) - _bt_mark_array_keys(scan); } /* @@ -504,10 +480,6 @@ btrestrpos(IndexScanDesc scan) { BTScanOpaque so = (BTScanOpaque) scan->opaque; - /* Restore the marked positions of any array keys */ - if (so->numArrayKeys) - _bt_restore_array_keys(scan); - if (so->markItemIndex >= 0) { /* @@ -546,6 +518,12 @@ btrestrpos(IndexScanDesc scan) if (so->currTuples) memcpy(so->currTuples, so->markTuples, so->markPos.nextTupleOffset); + /* Reset the scan's array keys (see _bt_steppage for why) */ + if (so->numArrayKeys) + { + _bt_start_array_keys(scan, so->currPos.dir); + so->needPrimScan = false; + } } else BTScanPosInvalidate(so->currPos); @@ -556,9 +534,10 @@ btrestrpos(IndexScanDesc scan) * btestimateparallelscan -- estimate storage for BTParallelScanDescData */ Size -btestimateparallelscan(void) +btestimateparallelscan(int nkeys, int norderbys) { - return sizeof(BTParallelScanDescData); + /* Pessimistically assume all input scankeys will be output with arrays */ + return offsetof(BTParallelScanDescData, btps_arrElems) + sizeof(int) * nkeys; } /* @@ -572,7 +551,6 @@ btinitparallelscan(void *target) SpinLockInit(&bt_target->btps_mutex); bt_target->btps_scanPage = InvalidBlockNumber; bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; - bt_target->btps_arrayKeyCount = 0; ConditionVariableInit(&bt_target->btps_cv); } @@ -598,7 +576,6 @@ btparallelrescan(IndexScanDesc scan) SpinLockAcquire(&btscan->btps_mutex); btscan->btps_scanPage = InvalidBlockNumber; btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; - btscan->btps_arrayKeyCount = 0; SpinLockRelease(&btscan->btps_mutex); } @@ -608,23 +585,26 @@ btparallelrescan(IndexScanDesc scan) * or _bt_parallel_done(). * * The return value is true if we successfully seized the scan and false - * if we did not. The latter case occurs if no pages remain for the current - * set of scankeys. + * if we did not. The latter case occurs if no pages remain. * * If the return value is true, *pageno returns the next or current page * of the scan (depending on the scan direction). An invalid block number - * means the scan hasn't yet started, and P_NONE means we've reached the end. + * means the scan hasn't yet started, or that caller needs to start the next + * primitive index scan (if it's the latter case we'll set so.needPrimScan). * The first time a participating process reaches the last page, it will return * true and set *pageno to P_NONE; after that, further attempts to seize the * scan will return false. * * Callers should ignore the value of pageno if the return value is false. + * + * Callers that are in a position to start a new primitive index scan must + * pass first=true (all other callers pass first=false). We just return false + * for first=false callers that require another primitive index scan. */ bool -_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno) +_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first) { BTScanOpaque so = (BTScanOpaque) scan->opaque; - BTPS_State pageStatus; bool exit_loop = false; bool status = true; ParallelIndexScanDesc parallel_scan = scan->parallel_scan; @@ -632,28 +612,69 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno) *pageno = P_NONE; + if (first) + { + /* + * Initialize array related state when called from _bt_first, assuming + * that this will either be the first primitive index scan for the + * scan, or a previous explicitly scheduled primitive scan. + * + * Note: so->needPrimScan is only set when a scheduled primitive index + * scan is set to be performed in caller's worker process. It should + * not be set here by us for the first primitive scan, nor should we + * ever set it for a parallel scan that has no array keys. + */ + so->needPrimScan = false; + so->scanBehind = false; + } + else + { + /* + * Don't attempt to seize the scan when backend requires another + * primitive index scan unless we're in a position to start it now + */ + if (so->needPrimScan) + return false; + } + btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, parallel_scan->ps_offset); while (1) { SpinLockAcquire(&btscan->btps_mutex); - pageStatus = btscan->btps_pageStatus; - if (so->arrayKeyCount < btscan->btps_arrayKeyCount) + if (btscan->btps_pageStatus == BTPARALLEL_DONE) { - /* Parallel scan has already advanced to a new set of scankeys. */ + /* We're done with this parallel index scan */ status = false; } - else if (pageStatus == BTPARALLEL_DONE) + else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN) { + Assert(so->numArrayKeys); + /* - * We're done with this set of scankeys. This may be the end, or - * there could be more sets to try. + * If we can start another primitive scan right away, do so. + * Otherwise just wait. */ - status = false; + if (first) + { + btscan->btps_pageStatus = BTPARALLEL_ADVANCING; + for (int i = 0; i < so->numArrayKeys; i++) + { + BTArrayKeyInfo *array = &so->arrayKeys[i]; + ScanKey skey = &so->keyData[array->scan_key]; + + array->cur_elem = btscan->btps_arrElems[i]; + skey->sk_argument = array->elem_values[array->cur_elem]; + } + so->needPrimScan = true; + so->scanBehind = false; + *pageno = InvalidBlockNumber; + exit_loop = true; + } } - else if (pageStatus != BTPARALLEL_ADVANCING) + else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING) { /* * We have successfully seized control of the scan for the purpose @@ -677,6 +698,12 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno) * _bt_parallel_release() -- Complete the process of advancing the scan to a * new page. We now have the new value btps_scanPage; some other backend * can now begin advancing the scan. + * + * Callers whose scan uses array keys must save their scan_page argument so + * that it can be passed to _bt_parallel_primscan_schedule, should caller + * determine that another primitive index scan is required. If that happens, + * scan_page won't be scanned by any backend (unless the next primitive index + * scan lands on scan_page). */ void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page) @@ -704,7 +731,6 @@ _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page) void _bt_parallel_done(IndexScanDesc scan) { - BTScanOpaque so = (BTScanOpaque) scan->opaque; ParallelIndexScanDesc parallel_scan = scan->parallel_scan; BTParallelScanDesc btscan; bool status_changed = false; @@ -717,13 +743,11 @@ _bt_parallel_done(IndexScanDesc scan) parallel_scan->ps_offset); /* - * Mark the parallel scan as done for this combination of scan keys, - * unless some other process already did so. See also - * _bt_advance_array_keys. + * Mark the parallel scan as done, unless some other process did so + * already */ SpinLockAcquire(&btscan->btps_mutex); - if (so->arrayKeyCount >= btscan->btps_arrayKeyCount && - btscan->btps_pageStatus != BTPARALLEL_DONE) + if (btscan->btps_pageStatus != BTPARALLEL_DONE) { btscan->btps_pageStatus = BTPARALLEL_DONE; status_changed = true; @@ -736,29 +760,39 @@ _bt_parallel_done(IndexScanDesc scan) } /* - * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array - * keys. + * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan. * - * Updates the count of array keys processed for both local and parallel - * scans. + * Caller passes the block number most recently passed to _bt_parallel_release + * by its backend. Caller successfully schedules the next primitive index scan + * if the shared parallel state hasn't been seized since caller's backend last + * advanced the scan. */ void -_bt_parallel_advance_array_keys(IndexScanDesc scan) +_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page) { BTScanOpaque so = (BTScanOpaque) scan->opaque; ParallelIndexScanDesc parallel_scan = scan->parallel_scan; BTParallelScanDesc btscan; + Assert(so->numArrayKeys); + btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, parallel_scan->ps_offset); - so->arrayKeyCount++; SpinLockAcquire(&btscan->btps_mutex); - if (btscan->btps_pageStatus == BTPARALLEL_DONE) + if (btscan->btps_scanPage == prev_scan_page && + btscan->btps_pageStatus == BTPARALLEL_IDLE) { btscan->btps_scanPage = InvalidBlockNumber; - btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; - btscan->btps_arrayKeyCount++; + btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN; + + /* Serialize scan's current array keys */ + for (int i = 0; i < so->numArrayKeys; i++) + { + BTArrayKeyInfo *array = &so->arrayKeys[i]; + + btscan->btps_arrElems[i] = array->cur_elem; + } } SpinLockRelease(&btscan->btps_mutex); } |