Diffstat (limited to 'src/backend/access/nbtree')
-rw-r--r--  src/backend/access/nbtree/nbtree.c    | 32
-rw-r--r--  src/backend/access/nbtree/nbtsearch.c | 70
-rw-r--r--  src/backend/access/nbtree/nbtsort.c   |  2
-rw-r--r--  src/backend/access/nbtree/nbtutils.c  | 97
4 files changed, 119 insertions(+), 82 deletions(-)
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 765659887af..fdff960c130 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -228,6 +228,8 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     bool        res;
 
+    Assert(scan->heapRelation != NULL);
+
     /* btree indexes are never lossy */
     scan->xs_recheck = false;
 
@@ -289,6 +291,8 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
     int64       ntids = 0;
     ItemPointer heapTid;
 
+    Assert(scan->heapRelation == NULL);
+
     /* Each loop iteration performs another primitive index scan */
     do
     {
@@ -393,6 +397,34 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
         BTScanPosInvalidate(so->currPos);
     }
 
+    /*
+     * We prefer to eagerly drop leaf page pins before btgettuple returns.
+     * This avoids making VACUUM wait to acquire a cleanup lock on the page.
+     *
+     * We cannot safely drop leaf page pins during index-only scans due to a
+     * race condition involving VACUUM setting pages all-visible in the VM.
+     * It's also unsafe for plain index scans that use a non-MVCC snapshot.
+     *
+     * When we drop pins eagerly, the mechanism that marks so->killedItems[]
+     * index tuples LP_DEAD has to deal with concurrent TID recycling races.
+     * The scheme used to detect unsafe TID recycling won't work when scanning
+     * unlogged relations (since it involves saving an affected page's LSN).
+     * Opt out of eager pin dropping during unlogged relation scans for now
+     * (this is preferable to opting out of kill_prior_tuple LP_DEAD setting).
+     *
+     * Also opt out of dropping leaf page pins eagerly during bitmap scans.
+     * Pins cannot be held for more than an instant during bitmap scans either
+     * way, so we might as well avoid wasting cycles on acquiring page LSNs.
+     *
+     * See nbtree/README section on making concurrent TID recycling safe.
+     *
+     * Note: so->dropPin should never change across rescans.
+     */
+    so->dropPin = (!scan->xs_want_itup &&
+                   IsMVCCSnapshot(scan->xs_snapshot) &&
+                   RelationNeedsWAL(scan->indexRelation) &&
+                   scan->heapRelation != NULL);
+
     so->markItemIndex = -1;
     so->needPrimScan = false;
     so->scanBehind = false;
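The btrescan() hunk above packs four independent safety conditions into a
single so->dropPin assignment.  Restated as a standalone predicate, purely as
a reading aid (the ScanFlags struct and can_drop_pin_eagerly() are
illustrative names, not part of the patch):

#include <stdbool.h>

/* Illustrative mirror of the fields that btrescan() consults */
typedef struct ScanFlags
{
    bool    index_only;     /* scan->xs_want_itup */
    bool    mvcc_snapshot;  /* IsMVCCSnapshot(scan->xs_snapshot) */
    bool    wal_logged;     /* RelationNeedsWAL(scan->indexRelation) */
    bool    plain_scan;     /* scan->heapRelation != NULL (not a bitmap scan) */
} ScanFlags;

/*
 * Eager pin dropping is safe only when all four conditions hold: index-only
 * scans race with VACUUM setting visibility map bits, non-MVCC snapshots
 * need the pin as an interlock, unlogged relations defeat the page-LSN
 * scheme used to detect TID recycling, and bitmap scans never hold pins
 * long enough for eager dropping to be worth anything.
 */
static bool
can_drop_pin_eagerly(ScanFlags f)
{
    return !f.index_only && f.mvcc_snapshot && f.wal_logged && f.plain_scan;
}

When any one condition fails, the scan simply falls back to the classic
pin-until-done behavior, which remains safe in all cases.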
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index fe9a3886913..070f14c8b91 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -25,7 +25,7 @@
 #include "utils/rel.h"
 
 
-static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
+static inline void _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so);
 static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
                             Buffer buf, bool forupdate, BTStack stack,
                             int access);
@@ -57,24 +57,29 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
 /*
  * _bt_drop_lock_and_maybe_pin()
  *
- * Unlock the buffer; and if it is safe to release the pin, do that, too.
- * This will prevent vacuum from stalling in a blocked state trying to read a
- * page when a cursor is sitting on it.
- *
- * See nbtree/README section on making concurrent TID recycling safe.
+ * Unlock so->currPos.buf.  If scan is so->dropPin, drop the pin, too.
+ * Dropping the pin prevents VACUUM from blocking on acquiring a cleanup lock.
  */
-static void
-_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
+static inline void
+_bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so)
 {
-    _bt_unlockbuf(scan->indexRelation, sp->buf);
-
-    if (IsMVCCSnapshot(scan->xs_snapshot) &&
-        RelationNeedsWAL(scan->indexRelation) &&
-        !scan->xs_want_itup)
+    if (!so->dropPin)
     {
-        ReleaseBuffer(sp->buf);
-        sp->buf = InvalidBuffer;
+        /* Just drop the lock (not the pin) */
+        _bt_unlockbuf(rel, so->currPos.buf);
+        return;
     }
+
+    /*
+     * Drop both the lock and the pin.
+     *
+     * Have to set so->currPos.lsn so that _bt_killitems has a way to detect
+     * when concurrent heap TID recycling by VACUUM might have taken place.
+     */
+    Assert(RelationNeedsWAL(rel));
+    so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
+    _bt_relbuf(rel, so->currPos.buf);
+    so->currPos.buf = InvalidBuffer;
 }
 
@@ -866,8 +871,8 @@ _bt_compare(Relation rel,
  *  if backwards scan, the last item) in the tree that satisfies the
  *  qualifications in the scan key.  On success exit, data about the
  *  matching tuple(s) on the page has been loaded into so->currPos.  We'll
- *  drop all locks and hold onto a pin on page's buffer, except when
- *  _bt_drop_lock_and_maybe_pin dropped the pin to avoid blocking VACUUM.
+ *  drop all locks and hold onto a pin on page's buffer, except during
+ *  so->dropPin scans, when we drop both the lock and the pin.
  *  _bt_returnitem sets the next item to return to scan on success exit.
  *
  * If there are no matching items in the index, we return false, with no
@@ -1610,7 +1615,13 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
     so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
     so->currPos.prevPage = opaque->btpo_prev;
     so->currPos.nextPage = opaque->btpo_next;
+    /* delay setting so->currPos.lsn until _bt_drop_lock_and_maybe_pin */
+    so->currPos.dir = dir;
+    so->currPos.nextTupleOffset = 0;
+    /* either moreRight or moreLeft should be set now (may be unset later) */
+    Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight :
+           so->currPos.moreLeft);
 
     Assert(!P_IGNORE(opaque));
     Assert(BTScanPosIsPinned(so->currPos));
     Assert(!so->needPrimScan);
@@ -1626,14 +1637,6 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
                   so->currPos.currPage);
     }
 
-    /* initialize remaining currPos fields related to current page */
-    so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
-    so->currPos.dir = dir;
-    so->currPos.nextTupleOffset = 0;
-    /* either moreLeft or moreRight should be set now (may be unset later) */
-    Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight :
-           so->currPos.moreLeft);
-
     PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot);
 
     /* initialize local variables */
@@ -2107,10 +2110,9 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
  *
  * Wrapper on _bt_readnextpage that performs final steps for the current page.
  *
- * On entry, if so->currPos.buf is valid the buffer is pinned but not locked.
- * If there's no pin held, it's because _bt_drop_lock_and_maybe_pin dropped
- * the pin eagerly earlier on.  The scan must have so->currPos.currPage set to
- * a valid block, in any case.
+ * On entry, so->currPos must be valid.  Its buffer will be pinned, though
+ * never locked.  (Actually, when so->dropPin there won't even be a pin held,
+ * though so->currPos.currPage must still be set to a valid block number.)
  */
 static bool
 _bt_steppage(IndexScanDesc scan, ScanDirection dir)
@@ -2251,12 +2253,14 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
      */
     if (_bt_readpage(scan, dir, offnum, true))
     {
+        Relation    rel = scan->indexRelation;
+
         /*
         * _bt_readpage succeeded.  Drop the lock (and maybe the pin) on
         * so->currPos.buf in preparation for btgettuple returning tuples.
         */
         Assert(BTScanPosIsPinned(so->currPos));
-        _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+        _bt_drop_lock_and_maybe_pin(rel, so);
         return true;
     }
 
@@ -2294,8 +2298,8 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
  *
  * On success exit, so->currPos is updated to contain data from the next
  * interesting page, and we return true.  We hold a pin on the buffer on
- * success exit, except when _bt_drop_lock_and_maybe_pin decided it was safe
- * to eagerly drop the pin (to avoid blocking VACUUM).
+ * success exit (except during so->dropPin index scans, when we drop the pin
+ * eagerly to avoid blocking VACUUM).
  *
 * If there are no more matching records in the given direction, we drop all
 * locks and pins, invalidate so->currPos, and return false.
@@ -2413,7 +2417,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
      */
     Assert(so->currPos.currPage == blkno);
     Assert(BTScanPosIsPinned(so->currPos));
-    _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+    _bt_drop_lock_and_maybe_pin(rel, so);
 
     return true;
 }
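The rewritten _bt_drop_lock_and_maybe_pin() above is the capture half of an
LSN handshake: the page LSN is stamped into so->currPos.lsn at the moment the
pin is given up, so that _bt_killitems() can later detect any intervening
page modification.  A generic sketch of that capture step, using hypothetical
stand-ins (ScanPos, drop_lock_and_maybe_pin) rather than PostgreSQL APIs:

#include <stdbool.h>
#include <stdint.h>

typedef uint64_t LSN;           /* stand-in for XLogRecPtr */

typedef struct ScanPos
{
    bool    pinned;             /* still holding a pin on the leaf page? */
    LSN     saved_lsn;          /* plays the role of so->currPos.lsn */
} ScanPos;

/*
 * page_lsn_now stands in for BufferGetLSNAtomic(): the page's current LSN,
 * read while the page is still pinned (and therefore stable).
 */
static void
drop_lock_and_maybe_pin(ScanPos *pos, bool drop_pin, LSN page_lsn_now)
{
    /* (the real function unlocks the buffer in both branches) */
    if (!drop_pin)
        return;                 /* pin retained: TID recycling impossible */

    /* stamp the page version first, only then give up the pin */
    pos->saved_lsn = page_lsn_now;
    pos->pinned = false;
}

Note that !dropPin scans never pay for the BufferGetLSNAtomic() call at all,
which is the cycle-saving point of delaying the LSN read until here.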
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 3794cc924ad..9d70e89c1f3 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -105,7 +105,7 @@ typedef struct BTShared
     int         scantuplesortstates;
 
     /* Query ID, for report in worker processes */
-    uint64      queryid;
+    int64       queryid;
 
     /*
     * workersdonecv is used to monitor the progress of workers.  All parallel
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 1a15dfcb7d3..c71d1b6f2e1 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -3330,87 +3330,85 @@ _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate,
 * current page and killed tuples thereon (generally, this should only be
 * called if so->numKilled > 0).
 *
- * The caller does not have a lock on the page and may or may not have the
- * page pinned in a buffer.  Note that read-lock is sufficient for setting
- * LP_DEAD status (which is only a hint).
+ * Caller should not have a lock on the so->currPos page, but must hold a
+ * buffer pin when !so->dropPin.  When we return, it still won't be locked.
+ * It'll continue to hold whatever pins were held before calling here.
 *
- * We match items by heap TID before assuming they are the right ones to
- * delete.  We cope with cases where items have moved right due to insertions.
- * If an item has moved off the current page due to a split, we'll fail to
- * find it and do nothing (this is not an error case --- we assume the item
- * will eventually get marked in a future indexscan).
+ * We match items by heap TID before assuming they are the right ones to set
+ * LP_DEAD.  If the scan is one that holds a buffer pin on the target page
+ * continuously from initially reading the items until applying this function
+ * (if it is a !so->dropPin scan), VACUUM cannot have deleted any items on the
+ * page, so the page's TIDs can't have been recycled by now.  There's no risk
+ * that we'll confuse a new index tuple that happens to use a recycled TID
+ * with a now-removed tuple with the same TID (that used to be on this same
+ * page).  We can't rely on that during scans that drop buffer pins eagerly
+ * (so->dropPin scans), though, so we must condition setting LP_DEAD bits on
+ * the page LSN having not changed since back when _bt_readpage saw the page.
+ * We totally give up on setting LP_DEAD bits when the page LSN changed.
 *
- * Note that if we hold a pin on the target page continuously from initially
- * reading the items until applying this function, VACUUM cannot have deleted
- * any items from the page, and so there is no need to search left from the
- * recorded offset.  (This observation also guarantees that the item is still
- * the right one to delete, which might otherwise be questionable since heap
- * TIDs can get recycled.)  This holds true even if the page has been modified
- * by inserts and page splits, so there is no need to consult the LSN.
- *
- * If the pin was released after reading the page, then we re-read it.  If it
- * has been modified since we read it (as determined by the LSN), we dare not
- * flag any entries because it is possible that the old entry was vacuumed
- * away and the TID was re-used by a completely different heap tuple.
+ * We give up much less often during !so->dropPin scans, but it still happens.
+ * We cope with cases where items have moved right due to insertions.  If an
+ * item has moved off the current page due to a split, we'll fail to find it
+ * and just give up on it.
 */
 void
 _bt_killitems(IndexScanDesc scan)
 {
+    Relation    rel = scan->indexRelation;
     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     Page        page;
     BTPageOpaque opaque;
     OffsetNumber minoff;
     OffsetNumber maxoff;
-    int         i;
     int         numKilled = so->numKilled;
     bool        killedsomething = false;
-    bool        droppedpin PG_USED_FOR_ASSERTS_ONLY;
+    Buffer      buf;
 
+    Assert(numKilled > 0);
     Assert(BTScanPosIsValid(so->currPos));
+    Assert(scan->heapRelation != NULL); /* can't be a bitmap index scan */
 
-    /*
-     * Always reset the scan state, so we don't look for same items on other
-     * pages.
-     */
+    /* Always invalidate so->killedItems[] before leaving so->currPos */
     so->numKilled = 0;
 
-    if (BTScanPosIsPinned(so->currPos))
+    if (!so->dropPin)
     {
         /*
         * We have held the pin on this page since we read the index tuples,
         * so all we need to do is lock it.  The pin will have prevented
-        * re-use of any TID on the page, so there is no need to check the
-        * LSN.
+        * concurrent VACUUMs from recycling any of the TIDs on the page.
         */
-        droppedpin = false;
-        _bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
-
-        page = BufferGetPage(so->currPos.buf);
+        Assert(BTScanPosIsPinned(so->currPos));
+        buf = so->currPos.buf;
+        _bt_lockbuf(rel, buf, BT_READ);
     }
     else
     {
-        Buffer      buf;
+        XLogRecPtr  latestlsn;
 
-        droppedpin = true;
-        /* Attempt to re-read the buffer, getting pin and lock. */
-        buf = _bt_getbuf(scan->indexRelation, so->currPos.currPage, BT_READ);
+        Assert(!BTScanPosIsPinned(so->currPos));
+        Assert(RelationNeedsWAL(rel));
+        buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
 
-        page = BufferGetPage(buf);
-        if (BufferGetLSNAtomic(buf) == so->currPos.lsn)
-            so->currPos.buf = buf;
-        else
+        latestlsn = BufferGetLSNAtomic(buf);
+        Assert(!XLogRecPtrIsInvalid(so->currPos.lsn));
+        Assert(so->currPos.lsn <= latestlsn);
+        if (so->currPos.lsn != latestlsn)
         {
-            /* Modified while not pinned means hinting is not safe. */
-            _bt_relbuf(scan->indexRelation, buf);
+            /* Modified, give up on hinting */
+            _bt_relbuf(rel, buf);
             return;
         }
+
+        /* Unmodified, hinting is safe */
     }
 
+    page = BufferGetPage(buf);
     opaque = BTPageGetOpaque(page);
     minoff = P_FIRSTDATAKEY(opaque);
     maxoff = PageGetMaxOffsetNumber(page);
 
-    for (i = 0; i < numKilled; i++)
+    for (int i = 0; i < numKilled; i++)
     {
         int         itemIndex = so->killedItems[i];
         BTScanPosItem *kitem = &so->currPos.items[itemIndex];
@@ -3442,7 +3440,7 @@ _bt_killitems(IndexScanDesc scan)
             * correctness.
             *
             * Note that the page may have been modified in almost any way
-            * since we first read it (in the !droppedpin case), so it's
+            * since we first read it (in the !so->dropPin case), so it's
             * possible that this posting list tuple wasn't a posting list
             * tuple when we first encountered its heap TIDs.
             */
@@ -3458,7 +3456,7 @@ _bt_killitems(IndexScanDesc scan)
                 * though only in the common case where the page can't
                 * have been concurrently modified
                 */
-                Assert(kitem->indexOffset == offnum || !droppedpin);
+                Assert(kitem->indexOffset == offnum || !so->dropPin);
 
                 /*
                 * Read-ahead to later kitems here.
@@ -3522,10 +3520,13 @@ _bt_killitems(IndexScanDesc scan)
     if (killedsomething)
     {
         opaque->btpo_flags |= BTP_HAS_GARBAGE;
-        MarkBufferDirtyHint(so->currPos.buf, true);
+        MarkBufferDirtyHint(buf, true);
     }
 
-    _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+    if (!so->dropPin)
+        _bt_unlockbuf(rel, buf);
+    else
+        _bt_relbuf(rel, buf);
 }
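The _bt_killitems() rewrite above is the matching verify half of the
handshake: for so->dropPin scans the page is re-read, and hinting is
abandoned outright if the LSN advanced, since any change could mean VACUUM
recycled one of the page's TIDs.  A self-contained sketch of that check,
under the same hypothetical names as the capture sketch after the
nbtsearch.c diff (not PostgreSQL APIs); saved_lsn corresponds to
so->currPos.lsn, and the comparison mirrors the patch's
"if (so->currPos.lsn != latestlsn)" test:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t LSN;

/*
 * Returns true only when it is still safe to set LP_DEAD-style hints.
 * saved_lsn is the version captured when the pin was dropped; latest_lsn
 * is the page's LSN as seen after re-locking the page.
 */
static bool
safe_to_set_hints(bool drop_pin, LSN saved_lsn, LSN latest_lsn)
{
    if (!drop_pin)
        return true;            /* pin held throughout: always safe */

    assert(saved_lsn <= latest_lsn);    /* LSNs only ever advance */

    /* any advance means the page was modified: give up entirely */
    return saved_lsn == latest_lsn;
}

int
main(void)
{
    printf("%d\n", safe_to_set_hints(true, 100, 100)); /* 1: unchanged */
    printf("%d\n", safe_to_set_hints(true, 100, 108)); /* 0: modified */
    printf("%d\n", safe_to_set_hints(false, 0, 0));    /* 1: pin held */
    return 0;
}

This is deliberately coarse, as the patch's own comments note: any LSN
advance at all forfeits hinting for the whole page, which is why unlogged
relations (whose page LSNs don't reliably advance) opt out of so->dropPin.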