Diffstat (limited to 'src/backend/access')
-rw-r--r--  src/backend/access/brin/brin.c              2
-rw-r--r--  src/backend/access/common/reloptions.c     17
-rw-r--r--  src/backend/access/gist/gistutil.c         14
-rw-r--r--  src/backend/access/gist/gistvalidate.c      6
-rw-r--r--  src/backend/access/heap/heapam.c           35
-rw-r--r--  src/backend/access/heap/heapam_handler.c    2
-rw-r--r--  src/backend/access/heap/vacuumlazy.c        3
-rw-r--r--  src/backend/access/nbtree/nbtree.c         30
-rw-r--r--  src/backend/access/nbtree/nbtsearch.c      70
-rw-r--r--  src/backend/access/nbtree/nbtsort.c         2
-rw-r--r--  src/backend/access/nbtree/nbtutils.c       93
-rw-r--r--  src/backend/access/transam/xact.c          28
12 files changed, 196 insertions, 106 deletions
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index 01e1db7f856..4204088fa0d 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -68,7 +68,7 @@ typedef struct BrinShared
int scantuplesortstates;
/* Query ID, for report in worker processes */
- uint64 queryid;
+ int64 queryid;
/*
* workersdonecv is used to monitor the progress of workers. All parallel
diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c
index 46c1dce222d..50747c16396 100644
--- a/src/backend/access/common/reloptions.c
+++ b/src/backend/access/common/reloptions.c
@@ -1243,8 +1243,9 @@ transformRelOptions(Datum oldOptions, List *defList, const char *namspace,
}
else
{
- text *t;
+ const char *name;
const char *value;
+ text *t;
Size len;
/*
@@ -1291,11 +1292,19 @@ transformRelOptions(Datum oldOptions, List *defList, const char *namspace,
* have just "name", assume "name=true" is meant. Note: the
* namespace is not output.
*/
+ name = def->defname;
if (def->arg != NULL)
value = defGetString(def);
else
value = "true";
+ /* Insist that name not contain "=", else "a=b=c" is ambiguous */
+ if (strchr(name, '=') != NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("invalid option name \"%s\": must not contain \"=\"",
+ name)));
+
/*
* This is not a great place for this test, but there's no other
* convenient place to filter the option out. As WITH (oids =
@@ -1303,7 +1312,7 @@ transformRelOptions(Datum oldOptions, List *defList, const char *namspace,
* amount of ugly.
*/
if (acceptOidsOff && def->defnamespace == NULL &&
- strcmp(def->defname, "oids") == 0)
+ strcmp(name, "oids") == 0)
{
if (defGetBoolean(def))
ereport(ERROR,
@@ -1313,11 +1322,11 @@ transformRelOptions(Datum oldOptions, List *defList, const char *namspace,
continue;
}
- len = VARHDRSZ + strlen(def->defname) + 1 + strlen(value);
+ len = VARHDRSZ + strlen(name) + 1 + strlen(value);
/* +1 leaves room for sprintf's trailing null */
t = (text *) palloc(len + 1);
SET_VARSIZE(t, len);
- sprintf(VARDATA(t), "%s=%s", def->defname, value);
+ sprintf(VARDATA(t), "%s=%s", name, value);
astate = accumArrayResult(astate, PointerGetDatum(t),
false, TEXTOID,
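
The new check rejects "=" in option names because transformRelOptions flattens each option into a single "name=value" text datum (the sprintf above), and readers later split that string on the first "=". A minimal standalone C sketch, not part of the patch, showing how two different options would otherwise flatten to the same stored string:

#include <stdio.h>

/* Flatten an option the way transformRelOptions does: "name=value". */
static void
flatten(const char *name, const char *value)
{
    printf("name \"%s\", value \"%s\"  ->  stored \"%s=%s\"\n",
           name, value, name, value);
}

int
main(void)
{
    /*
     * If '=' were allowed in option names, these two distinct options
     * would both be stored as the identical string "a=b=c".
     */
    flatten("a", "b=c");
    flatten("a=b", "c");
    return 0;
}

Rejecting such names up front keeps the flattened representation unambiguous without changing its format.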
diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c
index a6b701943d3..c0aa7d0222f 100644
--- a/src/backend/access/gist/gistutil.c
+++ b/src/backend/access/gist/gistutil.c
@@ -1058,11 +1058,11 @@ gistGetFakeLSN(Relation rel)
}
/*
- * This is a stratnum support function for GiST opclasses that use the
- * RT*StrategyNumber constants.
+ * This is a stratnum translation support function for GiST opclasses that use
+ * the RT*StrategyNumber constants.
*/
Datum
-gist_stratnum_common(PG_FUNCTION_ARGS)
+gist_translate_cmptype_common(PG_FUNCTION_ARGS)
{
CompareType cmptype = PG_GETARG_INT32(0);
@@ -1090,9 +1090,9 @@ gist_stratnum_common(PG_FUNCTION_ARGS)
/*
* Returns the opclass's private stratnum used for the given compare type.
*
- * Calls the opclass's GIST_STRATNUM_PROC support function, if any,
- * and returns the result.
- * Returns InvalidStrategy if the function is not defined.
+ * Calls the opclass's GIST_TRANSLATE_CMPTYPE_PROC support function, if any,
+ * and returns the result. Returns InvalidStrategy if the function is not
+ * defined.
*/
StrategyNumber
gisttranslatecmptype(CompareType cmptype, Oid opfamily)
@@ -1101,7 +1101,7 @@ gisttranslatecmptype(CompareType cmptype, Oid opfamily)
Datum result;
/* Check whether the function is provided. */
- funcid = get_opfamily_proc(opfamily, ANYOID, ANYOID, GIST_STRATNUM_PROC);
+ funcid = get_opfamily_proc(opfamily, ANYOID, ANYOID, GIST_TRANSLATE_CMPTYPE_PROC);
if (!OidIsValid(funcid))
return InvalidStrategy;
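
The renamed support function translates a generic CompareType into the opclass's private RT* strategy number, and gisttranslatecmptype falls back to InvalidStrategy when the opfamily provides no such function. A standalone C sketch of that translate-or-fall-back pattern; the enum values and the single EQ mapping are illustrative assumptions, not taken from this hunk:

#include <stdio.h>

/* Local stand-ins; the numeric values are illustrative only. */
typedef enum CompareType { COMPARE_LT = 1, COMPARE_EQ, COMPARE_GT } CompareType;

enum { InvalidStrategy = 0, RTEqualStrategyNumber = 18 };

/* Models an opclass's translate support function. */
static int
translate_cmptype(CompareType cmptype)
{
    switch (cmptype)
    {
        case COMPARE_EQ:
            return RTEqualStrategyNumber;
        default:
            return InvalidStrategy;     /* no matching private strategy */
    }
}

/* Models gisttranslatecmptype(): a missing support function means InvalidStrategy. */
static int
lookup_and_translate(int (*support_fn) (CompareType), CompareType cmptype)
{
    if (support_fn == NULL)
        return InvalidStrategy;
    return support_fn(cmptype);
}

int
main(void)
{
    printf("with support fn, EQ -> %d\n",
           lookup_and_translate(translate_cmptype, COMPARE_EQ));
    printf("no support fn, EQ   -> %d\n",
           lookup_and_translate(NULL, COMPARE_EQ));
    return 0;
}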
diff --git a/src/backend/access/gist/gistvalidate.c b/src/backend/access/gist/gistvalidate.c
index 2a49e6d20f0..2ed6f74fce9 100644
--- a/src/backend/access/gist/gistvalidate.c
+++ b/src/backend/access/gist/gistvalidate.c
@@ -138,7 +138,7 @@ gistvalidate(Oid opclassoid)
ok = check_amproc_signature(procform->amproc, VOIDOID, true,
1, 1, INTERNALOID);
break;
- case GIST_STRATNUM_PROC:
+ case GIST_TRANSLATE_CMPTYPE_PROC:
ok = check_amproc_signature(procform->amproc, INT2OID, true,
1, 1, INT4OID) &&
procform->amproclefttype == ANYOID &&
@@ -265,7 +265,7 @@ gistvalidate(Oid opclassoid)
if (i == GIST_DISTANCE_PROC || i == GIST_FETCH_PROC ||
i == GIST_COMPRESS_PROC || i == GIST_DECOMPRESS_PROC ||
i == GIST_OPTIONS_PROC || i == GIST_SORTSUPPORT_PROC ||
- i == GIST_STRATNUM_PROC)
+ i == GIST_TRANSLATE_CMPTYPE_PROC)
continue; /* optional methods */
ereport(INFO,
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
@@ -336,7 +336,7 @@ gistadjustmembers(Oid opfamilyoid,
case GIST_FETCH_PROC:
case GIST_OPTIONS_PROC:
case GIST_SORTSUPPORT_PROC:
- case GIST_STRATNUM_PROC:
+ case GIST_TRANSLATE_CMPTYPE_PROC:
/* Optional, so force it to be a soft family dependency */
op->ref_is_hard = false;
op->ref_is_family = true;
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 9ec8cda1c68..0dcd6ee817e 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -213,6 +213,27 @@ static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
#define TUPLOCK_from_mxstatus(status) \
(MultiXactStatusLock[(status)])
+/*
+ * Check that we have a valid snapshot if we might need TOAST access.
+ */
+static inline void
+AssertHasSnapshotForToast(Relation rel)
+{
+#ifdef USE_ASSERT_CHECKING
+
+ /* bootstrap mode in particular breaks this rule */
+ if (!IsNormalProcessingMode())
+ return;
+
+ /* if the relation doesn't have a TOAST table, we are good */
+ if (!OidIsValid(rel->rd_rel->reltoastrelid))
+ return;
+
+ Assert(HaveRegisteredOrActiveSnapshot());
+
+#endif /* USE_ASSERT_CHECKING */
+}
+
/* ----------------------------------------------------------------
* heap support routines
* ----------------------------------------------------------------
@@ -2066,6 +2087,8 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
Assert(HeapTupleHeaderGetNatts(tup->t_data) <=
RelationGetNumberOfAttributes(relation));
+ AssertHasSnapshotForToast(relation);
+
/*
* Fill in tuple header fields and toast the tuple if necessary.
*
@@ -2343,6 +2366,8 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
/* currently not needed (thus unsupported) for heap_multi_insert() */
Assert(!(options & HEAP_INSERT_NO_LOGICAL));
+ AssertHasSnapshotForToast(relation);
+
needwal = RelationNeedsWAL(relation);
saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
HEAP_DEFAULT_FILLFACTOR);
@@ -2765,6 +2790,8 @@ heap_delete(Relation relation, ItemPointer tid,
Assert(ItemPointerIsValid(tid));
+ AssertHasSnapshotForToast(relation);
+
/*
* Forbid this during a parallel operation, lest it allocate a combo CID.
* Other workers might need that combo CID for visibility checks, and we
@@ -3260,6 +3287,8 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
Assert(HeapTupleHeaderGetNatts(newtup->t_data) <=
RelationGetNumberOfAttributes(relation));
+ AssertHasSnapshotForToast(relation);
+
/*
* Forbid this during a parallel operation, lest it allocate a combo CID.
* Other workers might need that combo CID for visibility checks, and we
@@ -4953,7 +4982,7 @@ l3:
case LockWaitError:
if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
status, infomask, relation,
- NULL, log_lock_failure))
+ NULL, log_lock_failures))
ereport(ERROR,
(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
errmsg("could not obtain lock on row in relation \"%s\"",
@@ -4991,7 +5020,7 @@ l3:
}
break;
case LockWaitError:
- if (!ConditionalXactLockTableWait(xwait, log_lock_failure))
+ if (!ConditionalXactLockTableWait(xwait, log_lock_failures))
ereport(ERROR,
(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
errmsg("could not obtain lock on row in relation \"%s\"",
@@ -5256,7 +5285,7 @@ heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode,
break;
case LockWaitError:
- if (!ConditionalLockTupleTuplock(relation, tid, mode, log_lock_failure))
+ if (!ConditionalLockTupleTuplock(relation, tid, mode, log_lock_failures))
ereport(ERROR,
(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
errmsg("could not obtain lock on row in relation \"%s\"",
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index ac082fefa77..cb4bc35c93e 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -464,7 +464,7 @@ tuple_lock_retry:
return TM_WouldBlock;
break;
case LockWaitError:
- if (!ConditionalXactLockTableWait(SnapshotDirty.xmax, log_lock_failure))
+ if (!ConditionalXactLockTableWait(SnapshotDirty.xmax, log_lock_failures))
ereport(ERROR,
(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
errmsg("could not obtain lock on row in relation \"%s\"",
diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index 708674d8fcf..09416450af9 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -757,7 +757,6 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
vacrel->vm_new_visible_pages = 0;
vacrel->vm_new_visible_frozen_pages = 0;
vacrel->vm_new_frozen_pages = 0;
- vacrel->rel_pages = orig_rel_pages = RelationGetNumberOfBlocks(rel);
/*
* Get cutoffs that determine which deleted tuples are considered DEAD,
@@ -776,7 +775,9 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
* to increase the number of dead tuples it can prune away.)
*/
vacrel->aggressive = vacuum_get_cutoffs(rel, params, &vacrel->cutoffs);
+ vacrel->rel_pages = orig_rel_pages = RelationGetNumberOfBlocks(rel);
vacrel->vistest = GlobalVisTestFor(rel);
+
/* Initialize state used to track oldest extant XID/MXID */
vacrel->NewRelfrozenXid = vacrel->cutoffs.OldestXmin;
vacrel->NewRelminMxid = vacrel->cutoffs.OldestMxact;
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 765659887af..03a1d7b027a 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -228,6 +228,8 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
BTScanOpaque so = (BTScanOpaque) scan->opaque;
bool res;
+ Assert(scan->heapRelation != NULL);
+
/* btree indexes are never lossy */
scan->xs_recheck = false;
@@ -289,6 +291,8 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
int64 ntids = 0;
ItemPointer heapTid;
+ Assert(scan->heapRelation == NULL);
+
/* Each loop iteration performs another primitive index scan */
do
{
@@ -393,6 +397,32 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
BTScanPosInvalidate(so->currPos);
}
+ /*
+ * We prefer to eagerly drop leaf page pins before btgettuple returns.
+ * This avoids making VACUUM wait to acquire a cleanup lock on the page.
+ *
+ * We cannot safely drop leaf page pins during index-only scans due to a
+ * race condition involving VACUUM setting pages all-visible in the VM.
+ * It's also unsafe for plain index scans that use a non-MVCC snapshot.
+ *
+ * When we drop pins eagerly, the mechanism that marks so->killedItems[]
+ * index tuples LP_DEAD has to deal with concurrent TID recycling races.
+ * The scheme used to detect unsafe TID recycling won't work when scanning
+ * unlogged relations (since it involves saving an affected page's LSN).
+ * Opt out of eager pin dropping during unlogged relation scans for now
+ * (this is preferable to opting out of kill_prior_tuple LP_DEAD setting).
+ *
+ * Also opt out of dropping leaf page pins eagerly during bitmap scans.
+ * Pins cannot be held for more than an instant during bitmap scans either
+ * way, so we might as well avoid wasting cycles on acquiring page LSNs.
+ *
+ * See nbtree/README section on making concurrent TID recycling safe.
+ */
+ so->dropPin = (!scan->xs_want_itup &&
+ IsMVCCSnapshot(scan->xs_snapshot) &&
+ RelationNeedsWAL(scan->indexRelation) &&
+ scan->heapRelation != NULL);
+
so->markItemIndex = -1;
so->needPrimScan = false;
so->scanBehind = false;
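
The so->dropPin flag computed above distills the comment's conditions into one boolean: drop pins eagerly only for plain index scans (a heap relation is attached, so not a bitmap scan) that use an MVCC snapshot, are not index-only scans, and run against a WAL-logged index. A standalone C sketch of the same eligibility test, with stub fields standing in for the real scan descriptor:

#include <stdbool.h>
#include <stdio.h>

struct scan_desc
{
    bool xs_want_itup;          /* index-only scan? */
    bool snapshot_is_mvcc;      /* stub for IsMVCCSnapshot(xs_snapshot) */
    bool index_needs_wal;       /* stub for RelationNeedsWAL(indexRelation) */
    bool has_heap_relation;     /* false for bitmap index scans */
};

static bool
compute_drop_pin(const struct scan_desc *scan)
{
    return !scan->xs_want_itup &&
        scan->snapshot_is_mvcc &&
        scan->index_needs_wal &&
        scan->has_heap_relation;
}

int
main(void)
{
    struct scan_desc plain = {false, true, true, true};
    struct scan_desc index_only = {true, true, true, true};

    printf("plain index scan: dropPin = %d\n", compute_drop_pin(&plain));
    printf("index-only scan:  dropPin = %d\n", compute_drop_pin(&index_only));
    return 0;
}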
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index fe9a3886913..070f14c8b91 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -25,7 +25,7 @@
#include "utils/rel.h"
-static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
+static inline void _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so);
static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
Buffer buf, bool forupdate, BTStack stack,
int access);
@@ -57,24 +57,29 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
/*
* _bt_drop_lock_and_maybe_pin()
*
- * Unlock the buffer; and if it is safe to release the pin, do that, too.
- * This will prevent vacuum from stalling in a blocked state trying to read a
- * page when a cursor is sitting on it.
- *
- * See nbtree/README section on making concurrent TID recycling safe.
+ * Unlock so->currPos.buf. If scan is so->dropPin, drop the pin, too.
+ * Dropping the pin prevents VACUUM from blocking on acquiring a cleanup lock.
*/
-static void
-_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
+static inline void
+_bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so)
{
- _bt_unlockbuf(scan->indexRelation, sp->buf);
-
- if (IsMVCCSnapshot(scan->xs_snapshot) &&
- RelationNeedsWAL(scan->indexRelation) &&
- !scan->xs_want_itup)
+ if (!so->dropPin)
{
- ReleaseBuffer(sp->buf);
- sp->buf = InvalidBuffer;
+ /* Just drop the lock (not the pin) */
+ _bt_unlockbuf(rel, so->currPos.buf);
+ return;
}
+
+ /*
+ * Drop both the lock and the pin.
+ *
+ * Have to set so->currPos.lsn so that _bt_killitems has a way to detect
+ * when concurrent heap TID recycling by VACUUM might have taken place.
+ */
+ Assert(RelationNeedsWAL(rel));
+ so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
+ _bt_relbuf(rel, so->currPos.buf);
+ so->currPos.buf = InvalidBuffer;
}
/*
@@ -866,8 +871,8 @@ _bt_compare(Relation rel,
* if backwards scan, the last item) in the tree that satisfies the
* qualifications in the scan key. On success exit, data about the
* matching tuple(s) on the page has been loaded into so->currPos. We'll
- * drop all locks and hold onto a pin on page's buffer, except when
- * _bt_drop_lock_and_maybe_pin dropped the pin to avoid blocking VACUUM.
+ * drop all locks and hold onto a pin on page's buffer, except during
+ * so->dropPin scans, when we drop both the lock and the pin.
* _bt_returnitem sets the next item to return to scan on success exit.
*
* If there are no matching items in the index, we return false, with no
@@ -1610,7 +1615,13 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
so->currPos.prevPage = opaque->btpo_prev;
so->currPos.nextPage = opaque->btpo_next;
+ /* delay setting so->currPos.lsn until _bt_drop_lock_and_maybe_pin */
+ so->currPos.dir = dir;
+ so->currPos.nextTupleOffset = 0;
+ /* either moreRight or moreLeft should be set now (may be unset later) */
+ Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight :
+ so->currPos.moreLeft);
Assert(!P_IGNORE(opaque));
Assert(BTScanPosIsPinned(so->currPos));
Assert(!so->needPrimScan);
@@ -1626,14 +1637,6 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
so->currPos.currPage);
}
- /* initialize remaining currPos fields related to current page */
- so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
- so->currPos.dir = dir;
- so->currPos.nextTupleOffset = 0;
- /* either moreLeft or moreRight should be set now (may be unset later) */
- Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight :
- so->currPos.moreLeft);
-
PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot);
/* initialize local variables */
@@ -2107,10 +2110,9 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
*
* Wrapper on _bt_readnextpage that performs final steps for the current page.
*
- * On entry, if so->currPos.buf is valid the buffer is pinned but not locked.
- * If there's no pin held, it's because _bt_drop_lock_and_maybe_pin dropped
- * the pin eagerly earlier on. The scan must have so->currPos.currPage set to
- * a valid block, in any case.
+ * On entry, so->currPos must be valid. Its buffer will be pinned, though
+ * never locked. (Actually, when so->dropPin there won't even be a pin held,
+ * though so->currPos.currPage must still be set to a valid block number.)
*/
static bool
_bt_steppage(IndexScanDesc scan, ScanDirection dir)
@@ -2251,12 +2253,14 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
*/
if (_bt_readpage(scan, dir, offnum, true))
{
+ Relation rel = scan->indexRelation;
+
/*
* _bt_readpage succeeded. Drop the lock (and maybe the pin) on
* so->currPos.buf in preparation for btgettuple returning tuples.
*/
Assert(BTScanPosIsPinned(so->currPos));
- _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+ _bt_drop_lock_and_maybe_pin(rel, so);
return true;
}
@@ -2294,8 +2298,8 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
*
* On success exit, so->currPos is updated to contain data from the next
* interesting page, and we return true. We hold a pin on the buffer on
- * success exit, except when _bt_drop_lock_and_maybe_pin decided it was safe
- * to eagerly drop the pin (to avoid blocking VACUUM).
+ * success exit (except during so->dropPin index scans, when we drop the pin
+ * eagerly to avoid blocking VACUUM).
*
* If there are no more matching records in the given direction, we drop all
* locks and pins, invalidate so->currPos, and return false.
@@ -2413,7 +2417,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
*/
Assert(so->currPos.currPage == blkno);
Assert(BTScanPosIsPinned(so->currPos));
- _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+ _bt_drop_lock_and_maybe_pin(rel, so);
return true;
}
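
As rewritten above, _bt_drop_lock_and_maybe_pin always releases the buffer lock but gives up the pin only for so->dropPin scans, and in that case it first captures the page LSN so that _bt_killitems can later detect concurrent modification. A standalone C sketch of that control flow, using stub buffer handles and stub lock/pin functions rather than the real bufmgr API:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t lsn_t;

struct scan_pos
{
    int buf;        /* stub buffer handle; 0 means "no pin held" */
    lsn_t lsn;      /* page LSN saved when the pin is dropped */
};

static lsn_t stub_page_lsn = 1234;                      /* stands in for BufferGetLSNAtomic() */

static void unlock_buffer(int buf) { (void) buf; }      /* stub */
static void release_pin(int buf) { (void) buf; }        /* stub */

static void
drop_lock_and_maybe_pin(bool drop_pin, struct scan_pos *pos)
{
    if (!drop_pin)
    {
        unlock_buffer(pos->buf);    /* keep the pin: page TIDs can't be recycled */
        return;
    }

    /* Save the LSN first, then give up both the lock and the pin. */
    pos->lsn = stub_page_lsn;
    unlock_buffer(pos->buf);
    release_pin(pos->buf);
    pos->buf = 0;
}

int
main(void)
{
    struct scan_pos pos = {7, 0};

    drop_lock_and_maybe_pin(true, &pos);
    printf("pin held? %s, saved lsn = %llu\n",
           pos.buf != 0 ? "yes" : "no", (unsigned long long) pos.lsn);
    return 0;
}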
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 3794cc924ad..9d70e89c1f3 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -105,7 +105,7 @@ typedef struct BTShared
int scantuplesortstates;
/* Query ID, for report in worker processes */
- uint64 queryid;
+ int64 queryid;
/*
* workersdonecv is used to monitor the progress of workers. All parallel
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 1a15dfcb7d3..29f0dca1b08 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -63,7 +63,7 @@ static bool _bt_check_compare(IndexScanDesc scan, ScanDirection dir,
bool *continuescan, int *ikey);
static bool _bt_check_rowcompare(ScanKey skey,
IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
- ScanDirection dir, bool forcenonrequired, bool *continuescan);
+ ScanDirection dir, bool *continuescan);
static void _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate,
int tupnatts, TupleDesc tupdesc);
static int _bt_keep_natts(Relation rel, IndexTuple lastleft,
@@ -2902,8 +2902,10 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir,
/* row-comparison keys need special processing */
if (key->sk_flags & SK_ROW_HEADER)
{
+ Assert(!forcenonrequired); /* forbidden by _bt_set_startikey */
+
if (_bt_check_rowcompare(key, tuple, tupnatts, tupdesc, dir,
- forcenonrequired, continuescan))
+ continuescan))
continue;
return false;
}
@@ -3060,8 +3062,7 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir,
*/
static bool
_bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
- TupleDesc tupdesc, ScanDirection dir,
- bool forcenonrequired, bool *continuescan)
+ TupleDesc tupdesc, ScanDirection dir, bool *continuescan)
{
ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
int32 cmpresult = 0;
@@ -3101,11 +3102,7 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
if (isNull)
{
- if (forcenonrequired)
- {
- /* treating scan's keys as non-required */
- }
- else if (subkey->sk_flags & SK_BT_NULLS_FIRST)
+ if (subkey->sk_flags & SK_BT_NULLS_FIRST)
{
/*
* Since NULLs are sorted before non-NULLs, we know we have
@@ -3159,12 +3156,8 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
*/
Assert(subkey != (ScanKey) DatumGetPointer(skey->sk_argument));
subkey--;
- if (forcenonrequired)
- {
- /* treating scan's keys as non-required */
- }
- else if ((subkey->sk_flags & SK_BT_REQFWD) &&
- ScanDirectionIsForward(dir))
+ if ((subkey->sk_flags & SK_BT_REQFWD) &&
+ ScanDirectionIsForward(dir))
*continuescan = false;
else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
ScanDirectionIsBackward(dir))
@@ -3216,7 +3209,7 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
break;
}
- if (!result && !forcenonrequired)
+ if (!result)
{
/*
* Tuple fails this qual. If it's a required qual for the current
@@ -3342,75 +3335,71 @@ _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate,
*
* Note that if we hold a pin on the target page continuously from initially
* reading the items until applying this function, VACUUM cannot have deleted
- * any items from the page, and so there is no need to search left from the
- * recorded offset. (This observation also guarantees that the item is still
- * the right one to delete, which might otherwise be questionable since heap
- * TIDs can get recycled.) This holds true even if the page has been modified
- * by inserts and page splits, so there is no need to consult the LSN.
- *
- * If the pin was released after reading the page, then we re-read it. If it
- * has been modified since we read it (as determined by the LSN), we dare not
- * flag any entries because it is possible that the old entry was vacuumed
- * away and the TID was re-used by a completely different heap tuple.
+ * any items on the page, so the page's TIDs can't have been recycled by now.
+ * There's no risk that we'll confuse a new index tuple that happens to use a
+ * recycled TID with a now-removed tuple with the same TID (that used to be on
+ * this same page). We can't rely on that during scans that drop pins eagerly
+ * (so->dropPin scans), though, so we must condition setting LP_DEAD bits on
+ * the page LSN having not changed since back when _bt_readpage saw the page.
*/
void
_bt_killitems(IndexScanDesc scan)
{
+ Relation rel = scan->indexRelation;
BTScanOpaque so = (BTScanOpaque) scan->opaque;
Page page;
BTPageOpaque opaque;
OffsetNumber minoff;
OffsetNumber maxoff;
- int i;
int numKilled = so->numKilled;
bool killedsomething = false;
- bool droppedpin PG_USED_FOR_ASSERTS_ONLY;
+ Assert(numKilled > 0);
Assert(BTScanPosIsValid(so->currPos));
+ Assert(scan->heapRelation != NULL); /* can't be a bitmap index scan */
- /*
- * Always reset the scan state, so we don't look for same items on other
- * pages.
- */
+ /* Always invalidate so->killedItems[] before leaving so->currPos */
so->numKilled = 0;
- if (BTScanPosIsPinned(so->currPos))
+ if (!so->dropPin)
{
/*
* We have held the pin on this page since we read the index tuples,
* so all we need to do is lock it. The pin will have prevented
- * re-use of any TID on the page, so there is no need to check the
- * LSN.
+ * concurrent VACUUMs from recycling any of the TIDs on the page.
*/
- droppedpin = false;
- _bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
-
- page = BufferGetPage(so->currPos.buf);
+ Assert(BTScanPosIsPinned(so->currPos));
+ _bt_lockbuf(rel, so->currPos.buf, BT_READ);
}
else
{
Buffer buf;
+ XLogRecPtr latestlsn;
- droppedpin = true;
- /* Attempt to re-read the buffer, getting pin and lock. */
- buf = _bt_getbuf(scan->indexRelation, so->currPos.currPage, BT_READ);
+ Assert(!BTScanPosIsPinned(so->currPos));
+ Assert(RelationNeedsWAL(rel));
+ buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
- page = BufferGetPage(buf);
- if (BufferGetLSNAtomic(buf) == so->currPos.lsn)
- so->currPos.buf = buf;
- else
+ latestlsn = BufferGetLSNAtomic(buf);
+ Assert(!XLogRecPtrIsInvalid(so->currPos.lsn));
+ Assert(so->currPos.lsn <= latestlsn);
+ if (so->currPos.lsn != latestlsn)
{
- /* Modified while not pinned means hinting is not safe. */
- _bt_relbuf(scan->indexRelation, buf);
+ /* Modified, give up on hinting */
+ _bt_relbuf(rel, buf);
return;
}
+
+ /* Unmodified, hinting is safe */
+ so->currPos.buf = buf;
}
+ page = BufferGetPage(so->currPos.buf);
opaque = BTPageGetOpaque(page);
minoff = P_FIRSTDATAKEY(opaque);
maxoff = PageGetMaxOffsetNumber(page);
- for (i = 0; i < numKilled; i++)
+ for (int i = 0; i < numKilled; i++)
{
int itemIndex = so->killedItems[i];
BTScanPosItem *kitem = &so->currPos.items[itemIndex];
@@ -3442,7 +3431,7 @@ _bt_killitems(IndexScanDesc scan)
* correctness.
*
* Note that the page may have been modified in almost any way
- * since we first read it (in the !droppedpin case), so it's
+ * since we first read it (in the !so->dropPin case), so it's
* possible that this posting list tuple wasn't a posting list
* tuple when we first encountered its heap TIDs.
*/
@@ -3458,7 +3447,7 @@ _bt_killitems(IndexScanDesc scan)
* though only in the common case where the page can't
* have been concurrently modified
*/
- Assert(kitem->indexOffset == offnum || !droppedpin);
+ Assert(kitem->indexOffset == offnum || !so->dropPin);
/*
* Read-ahead to later kitems here.
@@ -3525,7 +3514,7 @@ _bt_killitems(IndexScanDesc scan)
MarkBufferDirtyHint(so->currPos.buf, true);
}
- _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+ _bt_unlockbuf(rel, so->currPos.buf);
}
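
The restructured _bt_killitems makes the safety argument explicit: when the pin was held the whole time, VACUUM cannot have recycled any of the page's TIDs, so hinting is always safe; for so->dropPin scans, the page LSN saved at pin-drop time must still match the page's current LSN, and any mismatch means the LP_DEAD hints are simply abandoned. A standalone C sketch of that decision, with a stub LSN type in place of XLogRecPtr:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t lsn_t;

static bool
safe_to_set_lp_dead(bool drop_pin, lsn_t saved_lsn, lsn_t current_lsn)
{
    if (!drop_pin)
        return true;            /* pin held throughout: TIDs can't have been recycled */
    return saved_lsn == current_lsn;    /* page unmodified since _bt_readpage saw it */
}

int
main(void)
{
    printf("pin held:             %d\n", safe_to_set_lp_dead(false, 100, 250));
    printf("dropped, unchanged:   %d\n", safe_to_set_lp_dead(true, 100, 100));
    printf("dropped, modified:    %d\n", safe_to_set_lp_dead(true, 100, 250));
    return 0;
}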
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index b885513f765..2e67e998adb 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -1045,6 +1045,34 @@ TransactionStartedDuringRecovery(void)
}
/*
+ * GetTopReadOnlyTransactionNestLevel
+ *
+ * Note: this will return zero when not inside any transaction or when neither
+ * a top-level transaction nor subtransactions are read-only, one when the
+ * top-level transaction is read-only, two when one level of subtransaction is
+ * read-only, etc.
+ *
+ * Note: subtransactions of the topmost read-only transaction are also
+ * read-only, because they inherit read-only mode from the transaction, and
+ * thus can't change to read-write mode. See check_transaction_read_only().
+ */
+int
+GetTopReadOnlyTransactionNestLevel(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ if (!XactReadOnly)
+ return 0;
+ while (s->nestingLevel > 1)
+ {
+ if (!s->prevXactReadOnly)
+ return s->nestingLevel;
+ s = s->parent;
+ }
+ return s->nestingLevel;
+}
+
+/*
* EnterParallelMode
*/
void
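
GetTopReadOnlyTransactionNestLevel walks from the current subtransaction toward the top-level transaction and returns the nesting level of the outermost state in which read-only mode was switched on, or 0 when the transaction is not read-only at all. A standalone C sketch of that walk; the transaction-state structs and the example scenario are assumptions for illustration:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct xact_state
{
    int nesting_level;              /* 1 = top-level transaction */
    bool prev_xact_read_only;       /* was read-only already set on entry? */
    struct xact_state *parent;
};

static int
top_read_only_nest_level(bool xact_read_only, struct xact_state *s)
{
    if (!xact_read_only)
        return 0;
    while (s->nesting_level > 1)
    {
        if (!s->prev_xact_read_only)
            return s->nesting_level;    /* read-only mode began here */
        s = s->parent;
    }
    return s->nesting_level;            /* read-only top-level transaction */
}

int
main(void)
{
    /*
     * Read-write top-level transaction, read-only mode set in a first-level
     * subtransaction, then one more nested subtransaction.
     */
    struct xact_state top = {1, false, NULL};
    struct xact_state sub = {2, false, &top};
    struct xact_state nested = {3, true, &sub};

    printf("nest level = %d\n", top_read_only_nest_level(true, &nested));   /* prints 2 */
    return 0;
}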