diff options
Diffstat (limited to 'src/backend')
45 files changed, 640 insertions, 241 deletions
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 01e1db7f856..4204088fa0d 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -68,7 +68,7 @@ typedef struct BrinShared int scantuplesortstates; /* Query ID, for report in worker processes */ - uint64 queryid; + int64 queryid; /* * workersdonecv is used to monitor the progress of workers. All parallel diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c index 46c1dce222d..50747c16396 100644 --- a/src/backend/access/common/reloptions.c +++ b/src/backend/access/common/reloptions.c @@ -1243,8 +1243,9 @@ transformRelOptions(Datum oldOptions, List *defList, const char *namspace, } else { - text *t; + const char *name; const char *value; + text *t; Size len; /* @@ -1291,11 +1292,19 @@ transformRelOptions(Datum oldOptions, List *defList, const char *namspace, * have just "name", assume "name=true" is meant. Note: the * namespace is not output. */ + name = def->defname; if (def->arg != NULL) value = defGetString(def); else value = "true"; + /* Insist that name not contain "=", else "a=b=c" is ambiguous */ + if (strchr(name, '=') != NULL) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid option name \"%s\": must not contain \"=\"", + name))); + /* * This is not a great place for this test, but there's no other * convenient place to filter the option out. As WITH (oids = @@ -1303,7 +1312,7 @@ transformRelOptions(Datum oldOptions, List *defList, const char *namspace, * amount of ugly. */ if (acceptOidsOff && def->defnamespace == NULL && - strcmp(def->defname, "oids") == 0) + strcmp(name, "oids") == 0) { if (defGetBoolean(def)) ereport(ERROR, @@ -1313,11 +1322,11 @@ transformRelOptions(Datum oldOptions, List *defList, const char *namspace, continue; } - len = VARHDRSZ + strlen(def->defname) + 1 + strlen(value); + len = VARHDRSZ + strlen(name) + 1 + strlen(value); /* +1 leaves room for sprintf's trailing null */ t = (text *) palloc(len + 1); SET_VARSIZE(t, len); - sprintf(VARDATA(t), "%s=%s", def->defname, value); + sprintf(VARDATA(t), "%s=%s", name, value); astate = accumArrayResult(astate, PointerGetDatum(t), false, TEXTOID, diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index a6b701943d3..c0aa7d0222f 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -1058,11 +1058,11 @@ gistGetFakeLSN(Relation rel) } /* - * This is a stratnum support function for GiST opclasses that use the - * RT*StrategyNumber constants. + * This is a stratnum translation support function for GiST opclasses that use + * the RT*StrategyNumber constants. */ Datum -gist_stratnum_common(PG_FUNCTION_ARGS) +gist_translate_cmptype_common(PG_FUNCTION_ARGS) { CompareType cmptype = PG_GETARG_INT32(0); @@ -1090,9 +1090,9 @@ gist_stratnum_common(PG_FUNCTION_ARGS) /* * Returns the opclass's private stratnum used for the given compare type. * - * Calls the opclass's GIST_STRATNUM_PROC support function, if any, - * and returns the result. - * Returns InvalidStrategy if the function is not defined. + * Calls the opclass's GIST_TRANSLATE_CMPTYPE_PROC support function, if any, + * and returns the result. Returns InvalidStrategy if the function is not + * defined. */ StrategyNumber gisttranslatecmptype(CompareType cmptype, Oid opfamily) @@ -1101,7 +1101,7 @@ gisttranslatecmptype(CompareType cmptype, Oid opfamily) Datum result; /* Check whether the function is provided. */ - funcid = get_opfamily_proc(opfamily, ANYOID, ANYOID, GIST_STRATNUM_PROC); + funcid = get_opfamily_proc(opfamily, ANYOID, ANYOID, GIST_TRANSLATE_CMPTYPE_PROC); if (!OidIsValid(funcid)) return InvalidStrategy; diff --git a/src/backend/access/gist/gistvalidate.c b/src/backend/access/gist/gistvalidate.c index 2a49e6d20f0..2ed6f74fce9 100644 --- a/src/backend/access/gist/gistvalidate.c +++ b/src/backend/access/gist/gistvalidate.c @@ -138,7 +138,7 @@ gistvalidate(Oid opclassoid) ok = check_amproc_signature(procform->amproc, VOIDOID, true, 1, 1, INTERNALOID); break; - case GIST_STRATNUM_PROC: + case GIST_TRANSLATE_CMPTYPE_PROC: ok = check_amproc_signature(procform->amproc, INT2OID, true, 1, 1, INT4OID) && procform->amproclefttype == ANYOID && @@ -265,7 +265,7 @@ gistvalidate(Oid opclassoid) if (i == GIST_DISTANCE_PROC || i == GIST_FETCH_PROC || i == GIST_COMPRESS_PROC || i == GIST_DECOMPRESS_PROC || i == GIST_OPTIONS_PROC || i == GIST_SORTSUPPORT_PROC || - i == GIST_STRATNUM_PROC) + i == GIST_TRANSLATE_CMPTYPE_PROC) continue; /* optional methods */ ereport(INFO, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), @@ -336,7 +336,7 @@ gistadjustmembers(Oid opfamilyoid, case GIST_FETCH_PROC: case GIST_OPTIONS_PROC: case GIST_SORTSUPPORT_PROC: - case GIST_STRATNUM_PROC: + case GIST_TRANSLATE_CMPTYPE_PROC: /* Optional, so force it to be a soft family dependency */ op->ref_is_hard = false; op->ref_is_family = true; diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 9ec8cda1c68..0dcd6ee817e 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -213,6 +213,27 @@ static const int MultiXactStatusLock[MaxMultiXactStatus + 1] = #define TUPLOCK_from_mxstatus(status) \ (MultiXactStatusLock[(status)]) +/* + * Check that we have a valid snapshot if we might need TOAST access. + */ +static inline void +AssertHasSnapshotForToast(Relation rel) +{ +#ifdef USE_ASSERT_CHECKING + + /* bootstrap mode in particular breaks this rule */ + if (!IsNormalProcessingMode()) + return; + + /* if the relation doesn't have a TOAST table, we are good */ + if (!OidIsValid(rel->rd_rel->reltoastrelid)) + return; + + Assert(HaveRegisteredOrActiveSnapshot()); + +#endif /* USE_ASSERT_CHECKING */ +} + /* ---------------------------------------------------------------- * heap support routines * ---------------------------------------------------------------- @@ -2066,6 +2087,8 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, Assert(HeapTupleHeaderGetNatts(tup->t_data) <= RelationGetNumberOfAttributes(relation)); + AssertHasSnapshotForToast(relation); + /* * Fill in tuple header fields and toast the tuple if necessary. * @@ -2343,6 +2366,8 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, /* currently not needed (thus unsupported) for heap_multi_insert() */ Assert(!(options & HEAP_INSERT_NO_LOGICAL)); + AssertHasSnapshotForToast(relation); + needwal = RelationNeedsWAL(relation); saveFreeSpace = RelationGetTargetPageFreeSpace(relation, HEAP_DEFAULT_FILLFACTOR); @@ -2765,6 +2790,8 @@ heap_delete(Relation relation, ItemPointer tid, Assert(ItemPointerIsValid(tid)); + AssertHasSnapshotForToast(relation); + /* * Forbid this during a parallel operation, lest it allocate a combo CID. * Other workers might need that combo CID for visibility checks, and we @@ -3260,6 +3287,8 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, Assert(HeapTupleHeaderGetNatts(newtup->t_data) <= RelationGetNumberOfAttributes(relation)); + AssertHasSnapshotForToast(relation); + /* * Forbid this during a parallel operation, lest it allocate a combo CID. * Other workers might need that combo CID for visibility checks, and we @@ -4953,7 +4982,7 @@ l3: case LockWaitError: if (!ConditionalMultiXactIdWait((MultiXactId) xwait, status, infomask, relation, - NULL, log_lock_failure)) + NULL, log_lock_failures)) ereport(ERROR, (errcode(ERRCODE_LOCK_NOT_AVAILABLE), errmsg("could not obtain lock on row in relation \"%s\"", @@ -4991,7 +5020,7 @@ l3: } break; case LockWaitError: - if (!ConditionalXactLockTableWait(xwait, log_lock_failure)) + if (!ConditionalXactLockTableWait(xwait, log_lock_failures)) ereport(ERROR, (errcode(ERRCODE_LOCK_NOT_AVAILABLE), errmsg("could not obtain lock on row in relation \"%s\"", @@ -5256,7 +5285,7 @@ heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode, break; case LockWaitError: - if (!ConditionalLockTupleTuplock(relation, tid, mode, log_lock_failure)) + if (!ConditionalLockTupleTuplock(relation, tid, mode, log_lock_failures)) ereport(ERROR, (errcode(ERRCODE_LOCK_NOT_AVAILABLE), errmsg("could not obtain lock on row in relation \"%s\"", diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index ac082fefa77..cb4bc35c93e 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -464,7 +464,7 @@ tuple_lock_retry: return TM_WouldBlock; break; case LockWaitError: - if (!ConditionalXactLockTableWait(SnapshotDirty.xmax, log_lock_failure)) + if (!ConditionalXactLockTableWait(SnapshotDirty.xmax, log_lock_failures)) ereport(ERROR, (errcode(ERRCODE_LOCK_NOT_AVAILABLE), errmsg("could not obtain lock on row in relation \"%s\"", diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 708674d8fcf..09416450af9 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -757,7 +757,6 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, vacrel->vm_new_visible_pages = 0; vacrel->vm_new_visible_frozen_pages = 0; vacrel->vm_new_frozen_pages = 0; - vacrel->rel_pages = orig_rel_pages = RelationGetNumberOfBlocks(rel); /* * Get cutoffs that determine which deleted tuples are considered DEAD, @@ -776,7 +775,9 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, * to increase the number of dead tuples it can prune away.) */ vacrel->aggressive = vacuum_get_cutoffs(rel, params, &vacrel->cutoffs); + vacrel->rel_pages = orig_rel_pages = RelationGetNumberOfBlocks(rel); vacrel->vistest = GlobalVisTestFor(rel); + /* Initialize state used to track oldest extant XID/MXID */ vacrel->NewRelfrozenXid = vacrel->cutoffs.OldestXmin; vacrel->NewRelminMxid = vacrel->cutoffs.OldestMxact; diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 765659887af..03a1d7b027a 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -228,6 +228,8 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) BTScanOpaque so = (BTScanOpaque) scan->opaque; bool res; + Assert(scan->heapRelation != NULL); + /* btree indexes are never lossy */ scan->xs_recheck = false; @@ -289,6 +291,8 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) int64 ntids = 0; ItemPointer heapTid; + Assert(scan->heapRelation == NULL); + /* Each loop iteration performs another primitive index scan */ do { @@ -393,6 +397,32 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, BTScanPosInvalidate(so->currPos); } + /* + * We prefer to eagerly drop leaf page pins before btgettuple returns. + * This avoids making VACUUM wait to acquire a cleanup lock on the page. + * + * We cannot safely drop leaf page pins during index-only scans due to a + * race condition involving VACUUM setting pages all-visible in the VM. + * It's also unsafe for plain index scans that use a non-MVCC snapshot. + * + * When we drop pins eagerly, the mechanism that marks so->killedItems[] + * index tuples LP_DEAD has to deal with concurrent TID recycling races. + * The scheme used to detect unsafe TID recycling won't work when scanning + * unlogged relations (since it involves saving an affected page's LSN). + * Opt out of eager pin dropping during unlogged relation scans for now + * (this is preferable to opting out of kill_prior_tuple LP_DEAD setting). + * + * Also opt out of dropping leaf page pins eagerly during bitmap scans. + * Pins cannot be held for more than an instant during bitmap scans either + * way, so we might as well avoid wasting cycles on acquiring page LSNs. + * + * See nbtree/README section on making concurrent TID recycling safe. + */ + so->dropPin = (!scan->xs_want_itup && + IsMVCCSnapshot(scan->xs_snapshot) && + RelationNeedsWAL(scan->indexRelation) && + scan->heapRelation != NULL); + so->markItemIndex = -1; so->needPrimScan = false; so->scanBehind = false; diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index fe9a3886913..070f14c8b91 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -25,7 +25,7 @@ #include "utils/rel.h" -static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); +static inline void _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so); static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access); @@ -57,24 +57,29 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); /* * _bt_drop_lock_and_maybe_pin() * - * Unlock the buffer; and if it is safe to release the pin, do that, too. - * This will prevent vacuum from stalling in a blocked state trying to read a - * page when a cursor is sitting on it. - * - * See nbtree/README section on making concurrent TID recycling safe. + * Unlock so->currPos.buf. If scan is so->dropPin, drop the pin, too. + * Dropping the pin prevents VACUUM from blocking on acquiring a cleanup lock. */ -static void -_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp) +static inline void +_bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so) { - _bt_unlockbuf(scan->indexRelation, sp->buf); - - if (IsMVCCSnapshot(scan->xs_snapshot) && - RelationNeedsWAL(scan->indexRelation) && - !scan->xs_want_itup) + if (!so->dropPin) { - ReleaseBuffer(sp->buf); - sp->buf = InvalidBuffer; + /* Just drop the lock (not the pin) */ + _bt_unlockbuf(rel, so->currPos.buf); + return; } + + /* + * Drop both the lock and the pin. + * + * Have to set so->currPos.lsn so that _bt_killitems has a way to detect + * when concurrent heap TID recycling by VACUUM might have taken place. + */ + Assert(RelationNeedsWAL(rel)); + so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf); + _bt_relbuf(rel, so->currPos.buf); + so->currPos.buf = InvalidBuffer; } /* @@ -866,8 +871,8 @@ _bt_compare(Relation rel, * if backwards scan, the last item) in the tree that satisfies the * qualifications in the scan key. On success exit, data about the * matching tuple(s) on the page has been loaded into so->currPos. We'll - * drop all locks and hold onto a pin on page's buffer, except when - * _bt_drop_lock_and_maybe_pin dropped the pin to avoid blocking VACUUM. + * drop all locks and hold onto a pin on page's buffer, except during + * so->dropPin scans, when we drop both the lock and the pin. * _bt_returnitem sets the next item to return to scan on success exit. * * If there are no matching items in the index, we return false, with no @@ -1610,7 +1615,13 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf); so->currPos.prevPage = opaque->btpo_prev; so->currPos.nextPage = opaque->btpo_next; + /* delay setting so->currPos.lsn until _bt_drop_lock_and_maybe_pin */ + so->currPos.dir = dir; + so->currPos.nextTupleOffset = 0; + /* either moreRight or moreLeft should be set now (may be unset later) */ + Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight : + so->currPos.moreLeft); Assert(!P_IGNORE(opaque)); Assert(BTScanPosIsPinned(so->currPos)); Assert(!so->needPrimScan); @@ -1626,14 +1637,6 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, so->currPos.currPage); } - /* initialize remaining currPos fields related to current page */ - so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf); - so->currPos.dir = dir; - so->currPos.nextTupleOffset = 0; - /* either moreLeft or moreRight should be set now (may be unset later) */ - Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight : - so->currPos.moreLeft); - PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot); /* initialize local variables */ @@ -2107,10 +2110,9 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so) * * Wrapper on _bt_readnextpage that performs final steps for the current page. * - * On entry, if so->currPos.buf is valid the buffer is pinned but not locked. - * If there's no pin held, it's because _bt_drop_lock_and_maybe_pin dropped - * the pin eagerly earlier on. The scan must have so->currPos.currPage set to - * a valid block, in any case. + * On entry, so->currPos must be valid. Its buffer will be pinned, though + * never locked. (Actually, when so->dropPin there won't even be a pin held, + * though so->currPos.currPage must still be set to a valid block number.) */ static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir) @@ -2251,12 +2253,14 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir) */ if (_bt_readpage(scan, dir, offnum, true)) { + Relation rel = scan->indexRelation; + /* * _bt_readpage succeeded. Drop the lock (and maybe the pin) on * so->currPos.buf in preparation for btgettuple returning tuples. */ Assert(BTScanPosIsPinned(so->currPos)); - _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + _bt_drop_lock_and_maybe_pin(rel, so); return true; } @@ -2294,8 +2298,8 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir) * * On success exit, so->currPos is updated to contain data from the next * interesting page, and we return true. We hold a pin on the buffer on - * success exit, except when _bt_drop_lock_and_maybe_pin decided it was safe - * to eagerly drop the pin (to avoid blocking VACUUM). + * success exit (except during so->dropPin index scans, when we drop the pin + * eagerly to avoid blocking VACUUM). * * If there are no more matching records in the given direction, we drop all * locks and pins, invalidate so->currPos, and return false. @@ -2413,7 +2417,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, */ Assert(so->currPos.currPage == blkno); Assert(BTScanPosIsPinned(so->currPos)); - _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + _bt_drop_lock_and_maybe_pin(rel, so); return true; } diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 3794cc924ad..9d70e89c1f3 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -105,7 +105,7 @@ typedef struct BTShared int scantuplesortstates; /* Query ID, for report in worker processes */ - uint64 queryid; + int64 queryid; /* * workersdonecv is used to monitor the progress of workers. All parallel diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 1a15dfcb7d3..29f0dca1b08 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -63,7 +63,7 @@ static bool _bt_check_compare(IndexScanDesc scan, ScanDirection dir, bool *continuescan, int *ikey); static bool _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, - ScanDirection dir, bool forcenonrequired, bool *continuescan); + ScanDirection dir, bool *continuescan); static void _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, int tupnatts, TupleDesc tupdesc); static int _bt_keep_natts(Relation rel, IndexTuple lastleft, @@ -2902,8 +2902,10 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, /* row-comparison keys need special processing */ if (key->sk_flags & SK_ROW_HEADER) { + Assert(!forcenonrequired); /* forbidden by _bt_set_startikey */ + if (_bt_check_rowcompare(key, tuple, tupnatts, tupdesc, dir, - forcenonrequired, continuescan)) + continuescan)) continue; return false; } @@ -3060,8 +3062,7 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, */ static bool _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, - TupleDesc tupdesc, ScanDirection dir, - bool forcenonrequired, bool *continuescan) + TupleDesc tupdesc, ScanDirection dir, bool *continuescan) { ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument); int32 cmpresult = 0; @@ -3101,11 +3102,7 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, if (isNull) { - if (forcenonrequired) - { - /* treating scan's keys as non-required */ - } - else if (subkey->sk_flags & SK_BT_NULLS_FIRST) + if (subkey->sk_flags & SK_BT_NULLS_FIRST) { /* * Since NULLs are sorted before non-NULLs, we know we have @@ -3159,12 +3156,8 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, */ Assert(subkey != (ScanKey) DatumGetPointer(skey->sk_argument)); subkey--; - if (forcenonrequired) - { - /* treating scan's keys as non-required */ - } - else if ((subkey->sk_flags & SK_BT_REQFWD) && - ScanDirectionIsForward(dir)) + if ((subkey->sk_flags & SK_BT_REQFWD) && + ScanDirectionIsForward(dir)) *continuescan = false; else if ((subkey->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)) @@ -3216,7 +3209,7 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, break; } - if (!result && !forcenonrequired) + if (!result) { /* * Tuple fails this qual. If it's a required qual for the current @@ -3342,75 +3335,71 @@ _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, * * Note that if we hold a pin on the target page continuously from initially * reading the items until applying this function, VACUUM cannot have deleted - * any items from the page, and so there is no need to search left from the - * recorded offset. (This observation also guarantees that the item is still - * the right one to delete, which might otherwise be questionable since heap - * TIDs can get recycled.) This holds true even if the page has been modified - * by inserts and page splits, so there is no need to consult the LSN. - * - * If the pin was released after reading the page, then we re-read it. If it - * has been modified since we read it (as determined by the LSN), we dare not - * flag any entries because it is possible that the old entry was vacuumed - * away and the TID was re-used by a completely different heap tuple. + * any items on the page, so the page's TIDs can't have been recycled by now. + * There's no risk that we'll confuse a new index tuple that happens to use a + * recycled TID with a now-removed tuple with the same TID (that used to be on + * this same page). We can't rely on that during scans that drop pins eagerly + * (so->dropPin scans), though, so we must condition setting LP_DEAD bits on + * the page LSN having not changed since back when _bt_readpage saw the page. */ void _bt_killitems(IndexScanDesc scan) { + Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; Page page; BTPageOpaque opaque; OffsetNumber minoff; OffsetNumber maxoff; - int i; int numKilled = so->numKilled; bool killedsomething = false; - bool droppedpin PG_USED_FOR_ASSERTS_ONLY; + Assert(numKilled > 0); Assert(BTScanPosIsValid(so->currPos)); + Assert(scan->heapRelation != NULL); /* can't be a bitmap index scan */ - /* - * Always reset the scan state, so we don't look for same items on other - * pages. - */ + /* Always invalidate so->killedItems[] before leaving so->currPos */ so->numKilled = 0; - if (BTScanPosIsPinned(so->currPos)) + if (!so->dropPin) { /* * We have held the pin on this page since we read the index tuples, * so all we need to do is lock it. The pin will have prevented - * re-use of any TID on the page, so there is no need to check the - * LSN. + * concurrent VACUUMs from recycling any of the TIDs on the page. */ - droppedpin = false; - _bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ); - - page = BufferGetPage(so->currPos.buf); + Assert(BTScanPosIsPinned(so->currPos)); + _bt_lockbuf(rel, so->currPos.buf, BT_READ); } else { Buffer buf; + XLogRecPtr latestlsn; - droppedpin = true; - /* Attempt to re-read the buffer, getting pin and lock. */ - buf = _bt_getbuf(scan->indexRelation, so->currPos.currPage, BT_READ); + Assert(!BTScanPosIsPinned(so->currPos)); + Assert(RelationNeedsWAL(rel)); + buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ); - page = BufferGetPage(buf); - if (BufferGetLSNAtomic(buf) == so->currPos.lsn) - so->currPos.buf = buf; - else + latestlsn = BufferGetLSNAtomic(buf); + Assert(!XLogRecPtrIsInvalid(so->currPos.lsn)); + Assert(so->currPos.lsn <= latestlsn); + if (so->currPos.lsn != latestlsn) { - /* Modified while not pinned means hinting is not safe. */ - _bt_relbuf(scan->indexRelation, buf); + /* Modified, give up on hinting */ + _bt_relbuf(rel, buf); return; } + + /* Unmodified, hinting is safe */ + so->currPos.buf = buf; } + page = BufferGetPage(so->currPos.buf); opaque = BTPageGetOpaque(page); minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); - for (i = 0; i < numKilled; i++) + for (int i = 0; i < numKilled; i++) { int itemIndex = so->killedItems[i]; BTScanPosItem *kitem = &so->currPos.items[itemIndex]; @@ -3442,7 +3431,7 @@ _bt_killitems(IndexScanDesc scan) * correctness. * * Note that the page may have been modified in almost any way - * since we first read it (in the !droppedpin case), so it's + * since we first read it (in the !so->dropPin case), so it's * possible that this posting list tuple wasn't a posting list * tuple when we first encountered its heap TIDs. */ @@ -3458,7 +3447,7 @@ _bt_killitems(IndexScanDesc scan) * though only in the common case where the page can't * have been concurrently modified */ - Assert(kitem->indexOffset == offnum || !droppedpin); + Assert(kitem->indexOffset == offnum || !so->dropPin); /* * Read-ahead to later kitems here. @@ -3525,7 +3514,7 @@ _bt_killitems(IndexScanDesc scan) MarkBufferDirtyHint(so->currPos.buf, true); } - _bt_unlockbuf(scan->indexRelation, so->currPos.buf); + _bt_unlockbuf(rel, so->currPos.buf); } diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index b885513f765..2e67e998adb 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -1045,6 +1045,34 @@ TransactionStartedDuringRecovery(void) } /* + * GetTopReadOnlyTransactionNestLevel + * + * Note: this will return zero when not inside any transaction or when neither + * a top-level transaction nor subtransactions are read-only, one when the + * top-level transaction is read-only, two when one level of subtransaction is + * read-only, etc. + * + * Note: subtransactions of the topmost read-only transaction are also + * read-only, because they inherit read-only mode from the transaction, and + * thus can't change to read-write mode. See check_transaction_read_only(). + */ +int +GetTopReadOnlyTransactionNestLevel(void) +{ + TransactionState s = CurrentTransactionState; + + if (!XactReadOnly) + return 0; + while (s->nestingLevel > 1) + { + if (!s->prevXactReadOnly) + return s->nestingLevel; + s = s->parent; + } + return s->nestingLevel; +} + +/* * EnterParallelMode */ void diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 5fbbcdaabb1..c95eb945016 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -1065,16 +1065,41 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) /* Check that the chosen locales are valid, and get canonical spellings */ if (!check_locale(LC_COLLATE, dbcollate, &canonname)) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("invalid LC_COLLATE locale name: \"%s\"", dbcollate), - errhint("If the locale name is specific to ICU, use ICU_LOCALE."))); + { + if (dblocprovider == COLLPROVIDER_BUILTIN) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("invalid LC_COLLATE locale name: \"%s\"", dbcollate), + errhint("If the locale name is specific to the builtin provider, use BUILTIN_LOCALE."))); + else if (dblocprovider == COLLPROVIDER_ICU) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("invalid LC_COLLATE locale name: \"%s\"", dbcollate), + errhint("If the locale name is specific to the ICU provider, use ICU_LOCALE."))); + else + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("invalid LC_COLLATE locale name: \"%s\"", dbcollate))); + } dbcollate = canonname; if (!check_locale(LC_CTYPE, dbctype, &canonname)) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("invalid LC_CTYPE locale name: \"%s\"", dbctype), - errhint("If the locale name is specific to ICU, use ICU_LOCALE."))); + { + if (dblocprovider == COLLPROVIDER_BUILTIN) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("invalid LC_CTYPE locale name: \"%s\"", dbctype), + errhint("If the locale name is specific to the builtin provider, use BUILTIN_LOCALE."))); + else if (dblocprovider == COLLPROVIDER_ICU) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("invalid LC_CTYPE locale name: \"%s\"", dbctype), + errhint("If the locale name is specific to the ICU provider, use ICU_LOCALE."))); + else + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("invalid LC_CTYPE locale name: \"%s\"", dbctype))); + } + dbctype = canonname; check_encoding_locale_matches(encoding, dbcollate, dbctype); diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index bfa83fbc3fe..7e2792ead71 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -811,14 +811,10 @@ ExplainPrintPlan(ExplainState *es, QueryDesc *queryDesc) * the queryid in any of the EXPLAIN plans to keep stable the results * generated by regression test suites. */ - if (es->verbose && queryDesc->plannedstmt->queryId != UINT64CONST(0) && + if (es->verbose && queryDesc->plannedstmt->queryId != INT64CONST(0) && compute_query_id != COMPUTE_QUERY_ID_REGRESS) { - /* - * Output the queryid as an int64 rather than a uint64 so we match - * what would be seen in the BIGINT pg_stat_statements.queryid column. - */ - ExplainPropertyInteger("Query Identifier", NULL, (int64) + ExplainPropertyInteger("Query Identifier", NULL, queryDesc->plannedstmt->queryId, es); } } diff --git a/src/backend/commands/foreigncmds.c b/src/backend/commands/foreigncmds.c index c14e038d54f..8d2d7431544 100644 --- a/src/backend/commands/foreigncmds.c +++ b/src/backend/commands/foreigncmds.c @@ -71,15 +71,26 @@ optionListToArray(List *options) foreach(cell, options) { DefElem *def = lfirst(cell); + const char *name; const char *value; Size len; text *t; + name = def->defname; value = defGetString(def); - len = VARHDRSZ + strlen(def->defname) + 1 + strlen(value); + + /* Insist that name not contain "=", else "a=b=c" is ambiguous */ + if (strchr(name, '=') != NULL) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid option name \"%s\": must not contain \"=\"", + name))); + + len = VARHDRSZ + strlen(name) + 1 + strlen(value); + /* +1 leaves room for sprintf's trailing null */ t = palloc(len + 1); SET_VARSIZE(t, len); - sprintf(VARDATA(t), "%s=%s", def->defname, value); + sprintf(VARDATA(t), "%s=%s", name, value); astate = accumArrayResult(astate, PointerGetDatum(t), false, TEXTOID, diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index d962fe392cd..c3ec2076a52 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -4226,7 +4226,7 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein false); /* - * Updating pg_index might involve TOAST table access, so ensure we + * Swapping the indexes might involve TOAST table access, so ensure we * have a valid snapshot. */ PushActiveSnapshot(GetTransactionSnapshot()); diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 54ad38247aa..ea96947d813 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -430,8 +430,8 @@ static void AlterConstrUpdateConstraintEntry(ATAlterConstraint *cmdcon, Relation static ObjectAddress ATExecValidateConstraint(List **wqueue, Relation rel, char *constrName, bool recurse, bool recursing, LOCKMODE lockmode); -static void QueueFKConstraintValidation(List **wqueue, Relation conrel, Relation rel, - HeapTuple contuple, LOCKMODE lockmode); +static void QueueFKConstraintValidation(List **wqueue, Relation conrel, Relation fkrel, + Oid pkrelid, HeapTuple contuple, LOCKMODE lockmode); static void QueueCheckConstraintValidation(List **wqueue, Relation conrel, Relation rel, char *constrName, HeapTuple contuple, bool recurse, bool recursing, LOCKMODE lockmode); @@ -11858,6 +11858,7 @@ AttachPartitionForeignKey(List **wqueue, if (queueValidation) { Relation conrel; + Oid confrelid; conrel = table_open(ConstraintRelationId, RowExclusiveLock); @@ -11865,9 +11866,11 @@ AttachPartitionForeignKey(List **wqueue, if (!HeapTupleIsValid(partcontup)) elog(ERROR, "cache lookup failed for constraint %u", partConstrOid); + confrelid = ((Form_pg_constraint) GETSTRUCT(partcontup))->confrelid; + /* Use the same lock as for AT_ValidateConstraint */ - QueueFKConstraintValidation(wqueue, conrel, partition, partcontup, - ShareUpdateExclusiveLock); + QueueFKConstraintValidation(wqueue, conrel, partition, confrelid, + partcontup, ShareUpdateExclusiveLock); ReleaseSysCache(partcontup); table_close(conrel, RowExclusiveLock); } @@ -12463,9 +12466,12 @@ ATExecAlterConstrEnforceability(List **wqueue, ATAlterConstraint *cmdcon, /* * Tell Phase 3 to check that the constraint is satisfied by existing - * rows. + * rows. Only applies to leaf partitions, and (for constraints that + * reference a partitioned table) only if this is not one of the + * pg_constraint rows that exist solely to support action triggers. */ - if (rel->rd_rel->relkind == RELKIND_RELATION) + if (rel->rd_rel->relkind == RELKIND_RELATION && + currcon->confrelid == pkrelid) { AlteredTableInfo *tab; NewConstraint *newcon; @@ -12919,7 +12925,8 @@ ATExecValidateConstraint(List **wqueue, Relation rel, char *constrName, { if (con->contype == CONSTRAINT_FOREIGN) { - QueueFKConstraintValidation(wqueue, conrel, rel, tuple, lockmode); + QueueFKConstraintValidation(wqueue, conrel, rel, con->confrelid, + tuple, lockmode); } else if (con->contype == CONSTRAINT_CHECK) { @@ -12952,8 +12959,8 @@ ATExecValidateConstraint(List **wqueue, Relation rel, char *constrName, * for the specified relation and all its children. */ static void -QueueFKConstraintValidation(List **wqueue, Relation conrel, Relation rel, - HeapTuple contuple, LOCKMODE lockmode) +QueueFKConstraintValidation(List **wqueue, Relation conrel, Relation fkrel, + Oid pkrelid, HeapTuple contuple, LOCKMODE lockmode) { Form_pg_constraint con; AlteredTableInfo *tab; @@ -12964,7 +12971,17 @@ QueueFKConstraintValidation(List **wqueue, Relation conrel, Relation rel, Assert(con->contype == CONSTRAINT_FOREIGN); Assert(!con->convalidated); - if (rel->rd_rel->relkind == RELKIND_RELATION) + /* + * Add the validation to phase 3's queue; not needed for partitioned + * tables themselves, only for their partitions. + * + * When the referenced table (pkrelid) is partitioned, the referencing + * table (fkrel) has one pg_constraint row pointing to each partition + * thereof. These rows are there only to support action triggers and no + * table scan is needed, therefore skip this for them as well. + */ + if (fkrel->rd_rel->relkind == RELKIND_RELATION && + con->confrelid == pkrelid) { NewConstraint *newcon; Constraint *fkconstraint; @@ -12983,15 +13000,16 @@ QueueFKConstraintValidation(List **wqueue, Relation conrel, Relation rel, newcon->qual = (Node *) fkconstraint; /* Find or create work queue entry for this table */ - tab = ATGetQueueEntry(wqueue, rel); + tab = ATGetQueueEntry(wqueue, fkrel); tab->constraints = lappend(tab->constraints, newcon); } /* * If the table at either end of the constraint is partitioned, we need to - * recurse and handle every constraint that is a child of this constraint. + * recurse and handle every unvalidate constraint that is a child of this + * constraint. */ - if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE || + if (fkrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE || get_rel_relkind(con->confrelid) == RELKIND_PARTITIONED_TABLE) { ScanKeyData pkey; @@ -13023,8 +13041,12 @@ QueueFKConstraintValidation(List **wqueue, Relation conrel, Relation rel, childrel = table_open(childcon->conrelid, lockmode); - QueueFKConstraintValidation(wqueue, conrel, childrel, childtup, - lockmode); + /* + * NB: Note that pkrelid should be passed as-is during recursion, + * as it is required to identify the root referenced table. + */ + QueueFKConstraintValidation(wqueue, conrel, childrel, pkrelid, + childtup, lockmode); table_close(childrel, NoLock); } @@ -13032,7 +13054,11 @@ QueueFKConstraintValidation(List **wqueue, Relation conrel, Relation rel, } /* - * Now update the catalog, while we have the door open. + * Now mark the pg_constraint row as validated (even if we didn't check, + * notably the ones for partitions on the referenced side). + * + * We rely on transaction abort to roll back this change if phase 3 + * ultimately finds violating rows. This is a bit ugly. */ copyTuple = heap_copytuple(contuple); copy_con = (Form_pg_constraint) GETSTRUCT(copyTuple); @@ -20964,9 +20990,17 @@ ATExecDetachPartition(List **wqueue, AlteredTableInfo *tab, Relation rel, tab->rel = rel; } + /* + * Detaching the partition might involve TOAST table access, so ensure we + * have a valid snapshot. + */ + PushActiveSnapshot(GetTransactionSnapshot()); + /* Do the final part of detaching */ DetachPartitionFinalize(rel, partRel, concurrent, defaultPartOid); + PopActiveSnapshot(); + ObjectAddressSet(address, RelationRelationId, RelationGetRelid(partRel)); /* keep our lock until commit */ diff --git a/src/backend/commands/vacuumparallel.c b/src/backend/commands/vacuumparallel.c index 2b9d548cdeb..0feea1d30ec 100644 --- a/src/backend/commands/vacuumparallel.c +++ b/src/backend/commands/vacuumparallel.c @@ -63,7 +63,7 @@ typedef struct PVShared */ Oid relid; int elevel; - uint64 queryid; + int64 queryid; /* * Fields for both index vacuum and cleanup. diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 2bc89bf84dc..54da8e7995b 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -64,6 +64,7 @@ #include "nodes/nodeFuncs.h" #include "optimizer/optimizer.h" #include "rewrite/rewriteHandler.h" +#include "rewrite/rewriteManip.h" #include "storage/lmgr.h" #include "utils/builtins.h" #include "utils/datum.h" @@ -3735,6 +3736,7 @@ ExecInitMerge(ModifyTableState *mtstate, EState *estate) switch (action->commandType) { case CMD_INSERT: + /* INSERT actions always use rootRelInfo */ ExecCheckPlanOutput(rootRelInfo->ri_RelationDesc, action->targetList); @@ -3774,9 +3776,23 @@ ExecInitMerge(ModifyTableState *mtstate, EState *estate) } else { - /* not partitioned? use the stock relation and slot */ - tgtslot = resultRelInfo->ri_newTupleSlot; - tgtdesc = RelationGetDescr(resultRelInfo->ri_RelationDesc); + /* + * If the MERGE targets an inherited table, we insert + * into the root table, so we must initialize its + * "new" tuple slot, if not already done, and use its + * relation descriptor for the projection. + * + * For non-inherited tables, rootRelInfo and + * resultRelInfo are the same, and the "new" tuple + * slot will already have been initialized. + */ + if (rootRelInfo->ri_newTupleSlot == NULL) + rootRelInfo->ri_newTupleSlot = + table_slot_create(rootRelInfo->ri_RelationDesc, + &estate->es_tupleTable); + + tgtslot = rootRelInfo->ri_newTupleSlot; + tgtdesc = RelationGetDescr(rootRelInfo->ri_RelationDesc); } action_state->mas_proj = @@ -3809,6 +3825,114 @@ ExecInitMerge(ModifyTableState *mtstate, EState *estate) } } } + + /* + * If the MERGE targets an inherited table, any INSERT actions will use + * rootRelInfo, and rootRelInfo will not be in the resultRelInfo array. + * Therefore we must initialize its WITH CHECK OPTION constraints and + * RETURNING projection, as ExecInitModifyTable did for the resultRelInfo + * entries. + * + * Note that the planner does not build a withCheckOptionList or + * returningList for the root relation, but as in ExecInitPartitionInfo, + * we can use the first resultRelInfo entry as a reference to calculate + * the attno's for the root table. + */ + if (rootRelInfo != mtstate->resultRelInfo && + rootRelInfo->ri_RelationDesc->rd_rel->relkind != RELKIND_PARTITIONED_TABLE && + (mtstate->mt_merge_subcommands & MERGE_INSERT) != 0) + { + ModifyTable *node = (ModifyTable *) mtstate->ps.plan; + Relation rootRelation = rootRelInfo->ri_RelationDesc; + Relation firstResultRel = mtstate->resultRelInfo[0].ri_RelationDesc; + int firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex; + AttrMap *part_attmap = NULL; + bool found_whole_row; + + if (node->withCheckOptionLists != NIL) + { + List *wcoList; + List *wcoExprs = NIL; + + /* There should be as many WCO lists as result rels */ + Assert(list_length(node->withCheckOptionLists) == + list_length(node->resultRelations)); + + /* + * Use the first WCO list as a reference. In the most common case, + * this will be for the same relation as rootRelInfo, and so there + * will be no need to adjust its attno's. + */ + wcoList = linitial(node->withCheckOptionLists); + if (rootRelation != firstResultRel) + { + /* Convert any Vars in it to contain the root's attno's */ + part_attmap = + build_attrmap_by_name(RelationGetDescr(rootRelation), + RelationGetDescr(firstResultRel), + false); + + wcoList = (List *) + map_variable_attnos((Node *) wcoList, + firstVarno, 0, + part_attmap, + RelationGetForm(rootRelation)->reltype, + &found_whole_row); + } + + foreach(lc, wcoList) + { + WithCheckOption *wco = lfirst_node(WithCheckOption, lc); + ExprState *wcoExpr = ExecInitQual(castNode(List, wco->qual), + &mtstate->ps); + + wcoExprs = lappend(wcoExprs, wcoExpr); + } + + rootRelInfo->ri_WithCheckOptions = wcoList; + rootRelInfo->ri_WithCheckOptionExprs = wcoExprs; + } + + if (node->returningLists != NIL) + { + List *returningList; + + /* There should be as many returning lists as result rels */ + Assert(list_length(node->returningLists) == + list_length(node->resultRelations)); + + /* + * Use the first returning list as a reference. In the most common + * case, this will be for the same relation as rootRelInfo, and so + * there will be no need to adjust its attno's. + */ + returningList = linitial(node->returningLists); + if (rootRelation != firstResultRel) + { + /* Convert any Vars in it to contain the root's attno's */ + if (part_attmap == NULL) + part_attmap = + build_attrmap_by_name(RelationGetDescr(rootRelation), + RelationGetDescr(firstResultRel), + false); + + returningList = (List *) + map_variable_attnos((Node *) returningList, + firstVarno, 0, + part_attmap, + RelationGetForm(rootRelation)->reltype, + &found_whole_row); + } + rootRelInfo->ri_returningList = returningList; + + /* Initialize the RETURNING projection */ + rootRelInfo->ri_projectReturning = + ExecBuildProjectionInfo(returningList, econtext, + mtstate->ps.ps_ResultTupleSlot, + &mtstate->ps, + RelationGetDescr(rootRelation)); + } + } } /* diff --git a/src/backend/libpq/be-secure-gssapi.c b/src/backend/libpq/be-secure-gssapi.c index 717ba9824f9..3534f0b8111 100644 --- a/src/backend/libpq/be-secure-gssapi.c +++ b/src/backend/libpq/be-secure-gssapi.c @@ -46,11 +46,18 @@ * don't want the other side to send arbitrarily huge packets as we * would have to allocate memory for them to then pass them to GSSAPI. * - * Therefore, these two #define's are effectively part of the protocol + * Therefore, this #define is effectively part of the protocol * spec and can't ever be changed. */ -#define PQ_GSS_SEND_BUFFER_SIZE 16384 -#define PQ_GSS_RECV_BUFFER_SIZE 16384 +#define PQ_GSS_MAX_PACKET_SIZE 16384 /* includes uint32 header word */ + +/* + * However, during the authentication exchange we must cope with whatever + * message size the GSSAPI library wants to send (because our protocol + * doesn't support splitting those messages). Depending on configuration + * those messages might be as much as 64kB. + */ +#define PQ_GSS_AUTH_BUFFER_SIZE 65536 /* includes uint32 header word */ /* * Since we manage at most one GSS-encrypted connection per backend, @@ -210,12 +217,12 @@ be_gssapi_write(Port *port, const void *ptr, size_t len) errno = ECONNRESET; return -1; } - if (output.length > PQ_GSS_SEND_BUFFER_SIZE - sizeof(uint32)) + if (output.length > PQ_GSS_MAX_PACKET_SIZE - sizeof(uint32)) { ereport(COMMERROR, (errmsg("server tried to send oversize GSSAPI packet (%zu > %zu)", (size_t) output.length, - PQ_GSS_SEND_BUFFER_SIZE - sizeof(uint32)))); + PQ_GSS_MAX_PACKET_SIZE - sizeof(uint32)))); errno = ECONNRESET; return -1; } @@ -346,12 +353,12 @@ be_gssapi_read(Port *port, void *ptr, size_t len) /* Decode the packet length and check for overlength packet */ input.length = pg_ntoh32(*(uint32 *) PqGSSRecvBuffer); - if (input.length > PQ_GSS_RECV_BUFFER_SIZE - sizeof(uint32)) + if (input.length > PQ_GSS_MAX_PACKET_SIZE - sizeof(uint32)) { ereport(COMMERROR, (errmsg("oversize GSSAPI packet sent by the client (%zu > %zu)", (size_t) input.length, - PQ_GSS_RECV_BUFFER_SIZE - sizeof(uint32)))); + PQ_GSS_MAX_PACKET_SIZE - sizeof(uint32)))); errno = ECONNRESET; return -1; } @@ -517,10 +524,13 @@ secure_open_gssapi(Port *port) * that will never use them, and we ensure that the buffers are * sufficiently aligned for the length-word accesses that we do in some * places in this file. + * + * We'll use PQ_GSS_AUTH_BUFFER_SIZE-sized buffers until transport + * negotiation is complete, then switch to PQ_GSS_MAX_PACKET_SIZE. */ - PqGSSSendBuffer = malloc(PQ_GSS_SEND_BUFFER_SIZE); - PqGSSRecvBuffer = malloc(PQ_GSS_RECV_BUFFER_SIZE); - PqGSSResultBuffer = malloc(PQ_GSS_RECV_BUFFER_SIZE); + PqGSSSendBuffer = malloc(PQ_GSS_AUTH_BUFFER_SIZE); + PqGSSRecvBuffer = malloc(PQ_GSS_AUTH_BUFFER_SIZE); + PqGSSResultBuffer = malloc(PQ_GSS_AUTH_BUFFER_SIZE); if (!PqGSSSendBuffer || !PqGSSRecvBuffer || !PqGSSResultBuffer) ereport(FATAL, (errcode(ERRCODE_OUT_OF_MEMORY), @@ -568,16 +578,16 @@ secure_open_gssapi(Port *port) /* * During initialization, packets are always fully consumed and - * shouldn't ever be over PQ_GSS_RECV_BUFFER_SIZE in length. + * shouldn't ever be over PQ_GSS_AUTH_BUFFER_SIZE in total length. * * Verify on our side that the client doesn't do something funny. */ - if (input.length > PQ_GSS_RECV_BUFFER_SIZE) + if (input.length > PQ_GSS_AUTH_BUFFER_SIZE - sizeof(uint32)) { ereport(COMMERROR, - (errmsg("oversize GSSAPI packet sent by the client (%zu > %d)", + (errmsg("oversize GSSAPI packet sent by the client (%zu > %zu)", (size_t) input.length, - PQ_GSS_RECV_BUFFER_SIZE))); + PQ_GSS_AUTH_BUFFER_SIZE - sizeof(uint32)))); return -1; } @@ -631,12 +641,12 @@ secure_open_gssapi(Port *port) { uint32 netlen = pg_hton32(output.length); - if (output.length > PQ_GSS_SEND_BUFFER_SIZE - sizeof(uint32)) + if (output.length > PQ_GSS_AUTH_BUFFER_SIZE - sizeof(uint32)) { ereport(COMMERROR, (errmsg("server tried to send oversize GSSAPI packet (%zu > %zu)", (size_t) output.length, - PQ_GSS_SEND_BUFFER_SIZE - sizeof(uint32)))); + PQ_GSS_AUTH_BUFFER_SIZE - sizeof(uint32)))); gss_release_buffer(&minor, &output); return -1; } @@ -692,11 +702,28 @@ secure_open_gssapi(Port *port) } /* + * Release the large authentication buffers and allocate the ones we want + * for normal operation. + */ + free(PqGSSSendBuffer); + free(PqGSSRecvBuffer); + free(PqGSSResultBuffer); + PqGSSSendBuffer = malloc(PQ_GSS_MAX_PACKET_SIZE); + PqGSSRecvBuffer = malloc(PQ_GSS_MAX_PACKET_SIZE); + PqGSSResultBuffer = malloc(PQ_GSS_MAX_PACKET_SIZE); + if (!PqGSSSendBuffer || !PqGSSRecvBuffer || !PqGSSResultBuffer) + ereport(FATAL, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + PqGSSSendLength = PqGSSSendNext = PqGSSSendConsumed = 0; + PqGSSRecvLength = PqGSSResultLength = PqGSSResultNext = 0; + + /* * Determine the max packet size which will fit in our buffer, after * accounting for the length. be_gssapi_write will need this. */ major = gss_wrap_size_limit(&minor, port->gss->ctx, 1, GSS_C_QOP_DEFAULT, - PQ_GSS_SEND_BUFFER_SIZE - sizeof(uint32), + PQ_GSS_MAX_PACKET_SIZE - sizeof(uint32), &PqGSSMaxPktSize); if (GSS_ERROR(major)) diff --git a/src/backend/nodes/gen_node_support.pl b/src/backend/nodes/gen_node_support.pl index 77659b0f760..c8595109b0e 100644 --- a/src/backend/nodes/gen_node_support.pl +++ b/src/backend/nodes/gen_node_support.pl @@ -1039,6 +1039,11 @@ _read${n}(void) print $off "\tWRITE_UINT_FIELD($f);\n"; print $rff "\tREAD_UINT_FIELD($f);\n" unless $no_read; } + elsif ($t eq 'int64') + { + print $off "\tWRITE_INT64_FIELD($f);\n"; + print $rff "\tREAD_INT64_FIELD($f);\n" unless $no_read; + } elsif ($t eq 'uint64' || $t eq 'AclMode') { diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index ceac3fd8620..25e08ba3426 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -51,6 +51,12 @@ static void outDouble(StringInfo str, double d); #define WRITE_UINT_FIELD(fldname) \ appendStringInfo(str, " :" CppAsString(fldname) " %u", node->fldname) +/* Write a signed integer field (anything written with INT64_FORMAT) */ +#define WRITE_INT64_FIELD(fldname) \ + appendStringInfo(str, \ + " :" CppAsString(fldname) " " INT64_FORMAT, \ + node->fldname) + /* Write an unsigned integer field (anything written with UINT64_FORMAT) */ #define WRITE_UINT64_FIELD(fldname) \ appendStringInfo(str, " :" CppAsString(fldname) " " UINT64_FORMAT, \ diff --git a/src/backend/nodes/queryjumblefuncs.c b/src/backend/nodes/queryjumblefuncs.c index d1e82a63f09..ac3cb3d9caf 100644 --- a/src/backend/nodes/queryjumblefuncs.c +++ b/src/backend/nodes/queryjumblefuncs.c @@ -56,7 +56,7 @@ int compute_query_id = COMPUTE_QUERY_ID_AUTO; bool query_id_enabled = false; static JumbleState *InitJumble(void); -static uint64 DoJumble(JumbleState *jstate, Node *node); +static int64 DoJumble(JumbleState *jstate, Node *node); static void AppendJumble(JumbleState *jstate, const unsigned char *value, Size size); static void FlushPendingNulls(JumbleState *jstate); @@ -141,12 +141,12 @@ JumbleQuery(Query *query) * If we are unlucky enough to get a hash of zero, use 1 instead for * normal statements and 2 for utility queries. */ - if (query->queryId == UINT64CONST(0)) + if (query->queryId == INT64CONST(0)) { if (query->utilityStmt) - query->queryId = UINT64CONST(2); + query->queryId = INT64CONST(2); else - query->queryId = UINT64CONST(1); + query->queryId = INT64CONST(1); } return jstate; @@ -197,7 +197,7 @@ InitJumble(void) * Jumble the given Node using the given JumbleState and return the resulting * jumble hash. */ -static uint64 +static int64 DoJumble(JumbleState *jstate, Node *node) { /* Jumble the given node */ @@ -208,9 +208,9 @@ DoJumble(JumbleState *jstate, Node *node) FlushPendingNulls(jstate); /* Process the jumble buffer and produce the hash value */ - return DatumGetUInt64(hash_any_extended(jstate->jumble, - jstate->jumble_len, - 0)); + return DatumGetInt64(hash_any_extended(jstate->jumble, + jstate->jumble_len, + 0)); } /* @@ -256,10 +256,10 @@ AppendJumbleInternal(JumbleState *jstate, const unsigned char *item, if (unlikely(jumble_len >= JUMBLE_SIZE)) { - uint64 start_hash; + int64 start_hash; - start_hash = DatumGetUInt64(hash_any_extended(jumble, - JUMBLE_SIZE, 0)); + start_hash = DatumGetInt64(hash_any_extended(jumble, + JUMBLE_SIZE, 0)); memcpy(jumble, &start_hash, sizeof(start_hash)); jumble_len = sizeof(start_hash); } diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index 64d3a09f765..8c90ab54af8 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -68,6 +68,12 @@ token = pg_strtok(&length); /* get field value */ \ local_node->fldname = atoui(token) +/* Read a signed integer field (anything written using INT64_FORMAT) */ +#define READ_INT64_FIELD(fldname) \ + token = pg_strtok(&length); /* skip :fldname */ \ + token = pg_strtok(&length); /* get field value */ \ + local_node->fldname = strtoi64(token, NULL, 10) + /* Read an unsigned integer field (anything written using UINT64_FORMAT) */ #define READ_UINT64_FIELD(fldname) \ token = pg_strtok(&length); /* skip :fldname */ \ diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 981be42e3af..451fb90a610 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -2234,6 +2234,12 @@ do_autovacuum(void) get_namespace_name(classForm->relnamespace), NameStr(classForm->relname)))); + /* + * Deletion might involve TOAST table access, so ensure we have a + * valid snapshot. + */ + PushActiveSnapshot(GetTransactionSnapshot()); + object.classId = RelationRelationId; object.objectId = relid; object.objectSubId = 0; @@ -2246,6 +2252,7 @@ do_autovacuum(void) * To commit the deletion, end current transaction and start a new * one. Note this also releases the locks we took. */ + PopActiveSnapshot(); CommitTransactionCommand(); StartTransactionCommand(); diff --git a/src/backend/replication/logical/launcher.c b/src/backend/replication/logical/launcher.c index 10677da56b2..1c3c051403d 100644 --- a/src/backend/replication/logical/launcher.c +++ b/src/backend/replication/logical/launcher.c @@ -1016,7 +1016,7 @@ logicalrep_launcher_attach_dshmem(void) last_start_times_dsa = dsa_attach(LogicalRepCtx->last_start_dsa); dsa_pin_mapping(last_start_times_dsa); last_start_times = dshash_attach(last_start_times_dsa, &dsh_params, - LogicalRepCtx->last_start_dsh, 0); + LogicalRepCtx->last_start_dsh, NULL); } MemoryContextSwitchTo(oldcontext); diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index 4151a4b2a96..a23262957ac 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -4626,8 +4626,16 @@ run_apply_worker() walrcv_startstreaming(LogRepWorkerWalRcvConn, &options); StartTransactionCommand(); + + /* + * Updating pg_subscription might involve TOAST table access, so + * ensure we have a valid snapshot. + */ + PushActiveSnapshot(GetTransactionSnapshot()); + UpdateTwoPhaseState(MySubscription->oid, LOGICALREP_TWOPHASE_STATE_ENABLED); MySubscription->twophasestate = LOGICALREP_TWOPHASE_STATE_ENABLED; + PopActiveSnapshot(); CommitTransactionCommand(); } else @@ -4843,7 +4851,15 @@ DisableSubscriptionAndExit(void) /* Disable the subscription */ StartTransactionCommand(); + + /* + * Updating pg_subscription might involve TOAST table access, so ensure we + * have a valid snapshot. + */ + PushActiveSnapshot(GetTransactionSnapshot()); + DisableSubscription(MySubscription->oid); + PopActiveSnapshot(); CommitTransactionCommand(); /* Ensure we remove no-longer-useful entry for worker's start time */ @@ -4948,6 +4964,12 @@ clear_subscription_skip_lsn(XLogRecPtr finish_lsn) } /* + * Updating pg_subscription might involve TOAST table access, so ensure we + * have a valid snapshot. + */ + PushActiveSnapshot(GetTransactionSnapshot()); + + /* * Protect subskiplsn of pg_subscription from being concurrently updated * while clearing it. */ @@ -5005,6 +5027,8 @@ clear_subscription_skip_lsn(XLogRecPtr finish_lsn) heap_freetuple(tup); table_close(rel, NoLock); + PopActiveSnapshot(); + if (started_tx) CommitTransactionCommand(); } diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 9fa8beb6103..f2c33250e8b 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -3449,8 +3449,16 @@ XLogSendLogical(void) if (flushPtr == InvalidXLogRecPtr || logical_decoding_ctx->reader->EndRecPtr >= flushPtr) { + /* + * For cascading logical WAL senders, we use the replay LSN instead of + * the flush LSN, since logical decoding on a standby only processes + * WAL that has been replayed. This distinction becomes particularly + * important during shutdown, as new WAL is no longer replayed and the + * last replayed LSN marks the furthest point up to which decoding can + * proceed. + */ if (am_cascading_walsender) - flushPtr = GetStandbyFlushRecPtr(NULL); + flushPtr = GetXLogReplayRecPtr(NULL); else flushPtr = GetFlushRecPtr(NULL); } diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c index f0bce5f9ed9..2ef0e7fbf3a 100644 --- a/src/backend/rewrite/rewriteHandler.c +++ b/src/backend/rewrite/rewriteHandler.c @@ -4544,7 +4544,7 @@ build_generation_expression(Relation rel, int attrno) List * QueryRewrite(Query *parsetree) { - uint64 input_query_id = parsetree->queryId; + int64 input_query_id = parsetree->queryId; List *querylist; List *results; ListCell *l; diff --git a/src/backend/storage/aio/aio.c b/src/backend/storage/aio/aio.c index c64d815ebd1..6c6c0a908e2 100644 --- a/src/backend/storage/aio/aio.c +++ b/src/backend/storage/aio/aio.c @@ -752,7 +752,7 @@ pgaio_io_wait_for_free(void) { int reclaimed = 0; - pgaio_debug(DEBUG2, "waiting for free IO with %d pending, %d in-flight, %d idle IOs", + pgaio_debug(DEBUG2, "waiting for free IO with %d pending, %u in-flight, %u idle IOs", pgaio_my_backend->num_staged_ios, dclist_count(&pgaio_my_backend->in_flight_ios), dclist_count(&pgaio_my_backend->idle_ios)); @@ -797,7 +797,7 @@ pgaio_io_wait_for_free(void) if (dclist_count(&pgaio_my_backend->in_flight_ios) == 0) ereport(ERROR, errmsg_internal("no free IOs despite no in-flight IOs"), - errdetail_internal("%d pending, %d in-flight, %d idle IOs", + errdetail_internal("%d pending, %u in-flight, %u idle IOs", pgaio_my_backend->num_staged_ios, dclist_count(&pgaio_my_backend->in_flight_ios), dclist_count(&pgaio_my_backend->idle_ios))); @@ -828,7 +828,7 @@ pgaio_io_wait_for_free(void) case PGAIO_HS_COMPLETED_IO: case PGAIO_HS_SUBMITTED: pgaio_debug_io(DEBUG2, ioh, - "waiting for free io with %d in flight", + "waiting for free io with %u in flight", dclist_count(&pgaio_my_backend->in_flight_ios)); /* @@ -1252,7 +1252,7 @@ pgaio_closing_fd(int fd) break; pgaio_debug_io(DEBUG2, ioh, - "waiting for IO before FD %d gets closed, %d in-flight IOs", + "waiting for IO before FD %d gets closed, %u in-flight IOs", fd, dclist_count(&pgaio_my_backend->in_flight_ios)); /* see comment in pgaio_io_wait_for_free() about raciness */ @@ -1288,7 +1288,7 @@ pgaio_shutdown(int code, Datum arg) uint64 generation = ioh->generation; pgaio_debug_io(DEBUG2, ioh, - "waiting for IO to complete during shutdown, %d in-flight IOs", + "waiting for IO to complete during shutdown, %u in-flight IOs", dclist_count(&pgaio_my_backend->in_flight_ios)); /* see comment in pgaio_io_wait_for_free() about raciness */ diff --git a/src/backend/storage/aio/method_io_uring.c b/src/backend/storage/aio/method_io_uring.c index c719ba2727a..cc312b641ca 100644 --- a/src/backend/storage/aio/method_io_uring.c +++ b/src/backend/storage/aio/method_io_uring.c @@ -126,7 +126,7 @@ pgaio_uring_shmem_size(void) static void pgaio_uring_shmem_init(bool first_time) { - int TotalProcs = MaxBackends + NUM_AUXILIARY_PROCS - MAX_IO_WORKERS; + int TotalProcs = pgaio_uring_procs(); bool found; pgaio_uring_contexts = (PgAioUringContext *) diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index f93131a645e..667aa0c0c78 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -7320,7 +7320,7 @@ buffer_readv_report(PgAioResult result, const PgAioTargetData *td, affected_count > 1 ? errdetail("Block %u held first zeroed page.", first + first_off) : 0, - errhint("See server log for details about the other %u invalid block(s).", + errhint("See server log for details about the other %d invalid block(s).", affected_count + checkfail_count - 1)); return; } diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 63101d56a07..ba26627f7b0 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -629,7 +629,7 @@ InvalidateLocalBuffer(BufferDesc *bufHdr, bool check_unreferenced) */ if (check_unreferenced && (LocalRefCount[bufid] != 0 || BUF_STATE_GET_REFCOUNT(buf_state) != 0)) - elog(ERROR, "block %u of %s is still referenced (local %u)", + elog(ERROR, "block %u of %s is still referenced (local %d)", bufHdr->tag.blockNum, relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag), MyProcNumber, diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c index f50962983c3..3f6bf70bd3c 100644 --- a/src/backend/storage/lmgr/lmgr.c +++ b/src/backend/storage/lmgr/lmgr.c @@ -717,7 +717,10 @@ XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid, * through, to avoid slowing down the normal case.) */ if (!first) + { + CHECK_FOR_INTERRUPTS(); pg_usleep(1000L); + } first = false; xid = SubTransGetTopmostTransaction(xid); } @@ -757,7 +760,10 @@ ConditionalXactLockTableWait(TransactionId xid, bool logLockFailure) /* See XactLockTableWait about this case */ if (!first) + { + CHECK_FOR_INTERRUPTS(); pg_usleep(1000L); + } first = false; xid = SubTransGetTopmostTransaction(xid); } diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index 86b06b9223f..2776ceb295b 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -51,7 +51,7 @@ /* GUC variables */ int max_locks_per_xact; /* used to set the lock table size */ -bool log_lock_failure = false; +bool log_lock_failures = false; #define NLOCKENTS() \ mul_size(max_locks_per_xact, add_size(MaxBackends, max_prepared_xacts)) diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index c242c8170b5..2f8c3d5f918 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -1682,7 +1682,7 @@ exec_bind_message(StringInfo input_message) { Query *query = lfirst_node(Query, lc); - if (query->queryId != UINT64CONST(0)) + if (query->queryId != INT64CONST(0)) { pgstat_report_query_id(query->queryId, false); break; @@ -2034,7 +2034,7 @@ exec_bind_message(StringInfo input_message) { PlannedStmt *plan = lfirst_node(PlannedStmt, lc); - if (plan->planId != UINT64CONST(0)) + if (plan->planId != INT64CONST(0)) { pgstat_report_plan_id(plan->planId, false); break; @@ -2174,7 +2174,7 @@ exec_execute_message(const char *portal_name, long max_rows) { PlannedStmt *stmt = lfirst_node(PlannedStmt, lc); - if (stmt->queryId != UINT64CONST(0)) + if (stmt->queryId != INT64CONST(0)) { pgstat_report_query_id(stmt->queryId, false); break; @@ -2185,7 +2185,7 @@ exec_execute_message(const char *portal_name, long max_rows) { PlannedStmt *stmt = lfirst_node(PlannedStmt, lc); - if (stmt->planId != UINT64CONST(0)) + if (stmt->planId != INT64CONST(0)) { pgstat_report_plan_id(stmt->planId, false); break; diff --git a/src/backend/utils/activity/backend_status.c b/src/backend/utils/activity/backend_status.c index e1576e64b6d..a290cc4c975 100644 --- a/src/backend/utils/activity/backend_status.c +++ b/src/backend/utils/activity/backend_status.c @@ -320,8 +320,8 @@ pgstat_bestart_initial(void) lbeentry.st_state = STATE_STARTING; lbeentry.st_progress_command = PROGRESS_COMMAND_INVALID; lbeentry.st_progress_command_target = InvalidOid; - lbeentry.st_query_id = UINT64CONST(0); - lbeentry.st_plan_id = UINT64CONST(0); + lbeentry.st_query_id = INT64CONST(0); + lbeentry.st_plan_id = INT64CONST(0); /* * we don't zero st_progress_param here to save cycles; nobody should @@ -599,8 +599,8 @@ pgstat_report_activity(BackendState state, const char *cmd_str) beentry->st_activity_start_timestamp = 0; /* st_xact_start_timestamp and wait_event_info are also disabled */ beentry->st_xact_start_timestamp = 0; - beentry->st_query_id = UINT64CONST(0); - beentry->st_plan_id = UINT64CONST(0); + beentry->st_query_id = INT64CONST(0); + beentry->st_plan_id = INT64CONST(0); proc->wait_event_info = 0; PGSTAT_END_WRITE_ACTIVITY(beentry); } @@ -662,8 +662,8 @@ pgstat_report_activity(BackendState state, const char *cmd_str) */ if (state == STATE_RUNNING) { - beentry->st_query_id = UINT64CONST(0); - beentry->st_plan_id = UINT64CONST(0); + beentry->st_query_id = INT64CONST(0); + beentry->st_plan_id = INT64CONST(0); } if (cmd_str != NULL) @@ -683,7 +683,7 @@ pgstat_report_activity(BackendState state, const char *cmd_str) * -------- */ void -pgstat_report_query_id(uint64 query_id, bool force) +pgstat_report_query_id(int64 query_id, bool force) { volatile PgBackendStatus *beentry = MyBEEntry; @@ -702,7 +702,7 @@ pgstat_report_query_id(uint64 query_id, bool force) * command, so ignore the one provided unless it's an explicit call to * reset the identifier. */ - if (beentry->st_query_id != 0 && !force) + if (beentry->st_query_id != INT64CONST(0) && !force) return; /* @@ -722,7 +722,7 @@ pgstat_report_query_id(uint64 query_id, bool force) * -------- */ void -pgstat_report_plan_id(uint64 plan_id, bool force) +pgstat_report_plan_id(int64 plan_id, bool force) { volatile PgBackendStatus *beentry = MyBEEntry; @@ -1134,7 +1134,7 @@ pgstat_get_crashed_backend_activity(int pid, char *buffer, int buflen) * * Return current backend's query identifier. */ -uint64 +int64 pgstat_get_my_query_id(void) { if (!MyBEEntry) @@ -1154,7 +1154,7 @@ pgstat_get_my_query_id(void) * * Return current backend's plan identifier. */ -uint64 +int64 pgstat_get_my_plan_id(void) { if (!MyBEEntry) diff --git a/src/backend/utils/activity/pgstat_shmem.c b/src/backend/utils/activity/pgstat_shmem.c index 2e33293b000..53e7d534270 100644 --- a/src/backend/utils/activity/pgstat_shmem.c +++ b/src/backend/utils/activity/pgstat_shmem.c @@ -183,7 +183,7 @@ StatsShmemInit(void) p += MAXALIGN(pgstat_dsa_init_size()); dsa = dsa_create_in_place(ctl->raw_dsa_area, pgstat_dsa_init_size(), - LWTRANCHE_PGSTATS_DSA, 0); + LWTRANCHE_PGSTATS_DSA, NULL); dsa_pin(dsa); /* @@ -255,7 +255,8 @@ pgstat_attach_shmem(void) dsa_pin_mapping(pgStatLocal.dsa); pgStatLocal.shared_hash = dshash_attach(pgStatLocal.dsa, &dsh_params, - pgStatLocal.shmem->hash_handle, 0); + pgStatLocal.shmem->hash_handle, + NULL); MemoryContextSwitchTo(oldcontext); } diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index 5d9e04d6823..4da68312b5f 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -401,6 +401,7 @@ SerialSLRU "Waiting to access the serializable transaction conflict SLRU cache." SubtransSLRU "Waiting to access the sub-transaction SLRU cache." XactSLRU "Waiting to access the transaction status SLRU cache." ParallelVacuumDSA "Waiting for parallel vacuum dynamic shared memory allocation." +AioUringCompletion "Waiting for another process to complete IO via io_uring." # No "ABI_compatibility" region here as WaitEventLWLock has its own C code. diff --git a/src/backend/utils/adt/datetime.c b/src/backend/utils/adt/datetime.c index 793d8a9adcc..680fee2a844 100644 --- a/src/backend/utils/adt/datetime.c +++ b/src/backend/utils/adt/datetime.c @@ -702,9 +702,18 @@ ParseFraction(char *cp, double *frac) } else { + /* + * On the other hand, let's reject anything that's not digits after + * the ".". strtod is happy with input like ".123e9", but that'd + * break callers' expectation that the result is in 0..1. (It's quite + * difficult to get here with such input, but not impossible.) + */ + if (strspn(cp + 1, "0123456789") != strlen(cp + 1)) + return DTERR_BAD_FORMAT; + errno = 0; *frac = strtod(cp, &cp); - /* check for parse failure */ + /* check for parse failure (probably redundant given prior check) */ if (*cp != '\0' || errno != 0) return DTERR_BAD_FORMAT; } @@ -2959,30 +2968,27 @@ DecodeNumberField(int len, char *str, int fmask, char *cp; /* + * This function was originally meant to cope only with DTK_NUMBER fields, + * but we now sometimes abuse it to parse (parts of) DTK_DATE fields, + * which can contain letters and other punctuation. Reject if it's not a + * valid DTK_NUMBER, that is digits and decimal point(s). (ParseFraction + * will reject if there's more than one decimal point.) + */ + if (strspn(str, "0123456789.") != len) + return DTERR_BAD_FORMAT; + + /* * Have a decimal point? Then this is a date or something with a seconds * field... */ if ((cp = strchr(str, '.')) != NULL) { - /* - * Can we use ParseFractionalSecond here? Not clear whether trailing - * junk should be rejected ... - */ - if (cp[1] == '\0') - { - /* avoid assuming that strtod will accept "." */ - *fsec = 0; - } - else - { - double frac; + int dterr; - errno = 0; - frac = strtod(cp, NULL); - if (errno != 0) - return DTERR_BAD_FORMAT; - *fsec = rint(frac * 1000000); - } + /* Convert the fraction and store at *fsec */ + dterr = ParseFractionalSecond(cp, fsec); + if (dterr) + return dterr; /* Now truncate off the fraction for further processing */ *cp = '\0'; len = strlen(str); diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index 97af7c6554f..1c12ddbae49 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -640,10 +640,10 @@ pg_stat_get_activity(PG_FUNCTION_ARGS) values[28] = BoolGetDatum(false); /* GSS credentials not * delegated */ } - if (beentry->st_query_id == 0) + if (beentry->st_query_id == INT64CONST(0)) nulls[30] = true; else - values[30] = UInt64GetDatum(beentry->st_query_id); + values[30] = Int64GetDatum(beentry->st_query_id); } else { @@ -1510,7 +1510,7 @@ pg_stat_io_build_tuples(ReturnSetInfo *rsinfo, bktype_stats->bytes[io_obj][io_context][io_op]; /* Convert to numeric */ - snprintf(buf, sizeof buf, UINT64_FORMAT, byte); + snprintf(buf, sizeof buf, INT64_FORMAT, byte); values[byte_idx] = DirectFunctionCall3(numeric_in, CStringGetDatum(buf), ObjectIdGetDatum(0), diff --git a/src/backend/utils/cache/funccache.c b/src/backend/utils/cache/funccache.c index 150c502a612..afc048a051e 100644 --- a/src/backend/utils/cache/funccache.c +++ b/src/backend/utils/cache/funccache.c @@ -491,6 +491,7 @@ cached_function_compile(FunctionCallInfo fcinfo, CachedFunctionHashKey hashkey; bool function_valid = false; bool hashkey_valid = false; + bool new_function = false; /* * Lookup the pg_proc tuple by Oid; we'll need it in any case @@ -570,13 +571,15 @@ recheck: /* * Create the new function struct, if not done already. The function - * structs are never thrown away, so keep them in TopMemoryContext. + * cache entry will be kept for the life of the backend, so put it in + * TopMemoryContext. */ Assert(cacheEntrySize >= sizeof(CachedFunction)); if (function == NULL) { function = (CachedFunction *) MemoryContextAllocZero(TopMemoryContext, cacheEntrySize); + new_function = true; } else { @@ -585,17 +588,36 @@ recheck: } /* - * Fill in the CachedFunction part. fn_hashkey and use_count remain - * zeroes for now. + * However, if function compilation fails, we'd like not to leak the + * function struct, so use a PG_TRY block to prevent that. (It's up + * to the compile callback function to avoid its own internal leakage + * in such cases.) Unfortunately, freeing the struct is only safe if + * we just allocated it: otherwise there are probably fn_extra + * pointers to it. */ - function->fn_xmin = HeapTupleHeaderGetRawXmin(procTup->t_data); - function->fn_tid = procTup->t_self; - function->dcallback = dcallback; + PG_TRY(); + { + /* + * Do the hard, language-specific part. + */ + ccallback(fcinfo, procTup, &hashkey, function, forValidator); + } + PG_CATCH(); + { + if (new_function) + pfree(function); + PG_RE_THROW(); + } + PG_END_TRY(); /* - * Do the hard, language-specific part. + * Fill in the CachedFunction part. (We do this last to prevent the + * function from looking valid before it's fully built.) fn_hashkey + * will be set by cfunc_hashtable_insert; use_count remains zero. */ - ccallback(fcinfo, procTup, &hashkey, function, forValidator); + function->fn_xmin = HeapTupleHeaderGetRawXmin(procTup->t_data); + function->fn_tid = procTup->t_self; + function->dcallback = dcallback; /* * Add the completed struct to the hash table. diff --git a/src/backend/utils/fmgr/dfmgr.c b/src/backend/utils/fmgr/dfmgr.c index 603632581d0..4bb84ff7087 100644 --- a/src/backend/utils/fmgr/dfmgr.c +++ b/src/backend/utils/fmgr/dfmgr.c @@ -99,6 +99,14 @@ load_external_function(const char *filename, const char *funcname, void *lib_handle; void *retval; + /* + * If the value starts with "$libdir/", strip that. This is because many + * extensions have hardcoded '$libdir/foo' as their library name, which + * prevents using the path. + */ + if (strncmp(filename, "$libdir/", 8) == 0) + filename += 8; + /* Expand the possibly-abbreviated filename to an exact path name */ fullname = expand_dynamic_library_name(filename); @@ -456,14 +464,6 @@ expand_dynamic_library_name(const char *name) Assert(name); - /* - * If the value starts with "$libdir/", strip that. This is because many - * extensions have hardcoded '$libdir/foo' as their library name, which - * prevents using the path. - */ - if (strncmp(name, "$libdir/", 8) == 0) - name += 8; - have_slash = (first_dir_separator(name) != NULL); if (!have_slash) diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 2f8cbd86759..f04bfedb2fd 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -1602,11 +1602,11 @@ struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, { - {"log_lock_failure", PGC_SUSET, LOGGING_WHAT, + {"log_lock_failures", PGC_SUSET, LOGGING_WHAT, gettext_noop("Logs lock failures."), NULL }, - &log_lock_failure, + &log_lock_failures, false, NULL, NULL, NULL }, diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 87ce76b18f4..341f88adc87 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -624,7 +624,7 @@ # %% = '%' # e.g. '<%u%%%d> ' #log_lock_waits = off # log lock waits >= deadlock_timeout -#log_lock_failure = off # log lock failures +#log_lock_failures = off # log lock failures #log_recovery_conflict_waits = off # log standby recovery conflict waits # >= deadlock_timeout #log_parameter_max_length = -1 # when logging statements, limit logged |