aboutsummaryrefslogtreecommitdiff
path: root/src/backend/utils/cache/relcache.c
diff options
context:
space:
mode:
author Noah Misch <noah@leadboat.com> 2020-03-21 09:38:26 -0700
committer Noah Misch <noah@leadboat.com> 2020-03-21 09:38:26 -0700
commit cb2fd7eac285b1b0a24eeb2b8ed4456b66c5a09f (patch)
tree 99e6db118b05237d646fbca98ae13ec1a1970aa0 /src/backend/utils/cache/relcache.c
parent d3e572855be1e15c7e0a6adc8db52b9fd4f71be0 (diff)
download postgresql-cb2fd7eac285b1b0a24eeb2b8ed4456b66c5a09f.tar.gz
postgresql-cb2fd7eac285b1b0a24eeb2b8ed4456b66c5a09f.zip
Skip WAL for new relfilenodes, under wal_level=minimal.
Until now, only selected bulk operations (e.g. COPY) did this. If a given relfilenode received both a WAL-skipping COPY and a WAL-logged operation (e.g. INSERT), recovery could lose tuples from the COPY. See src/backend/access/transam/README section "Skipping WAL for New RelFileNode" for the new coding rules. Maintainers of table access methods should examine that section.

To maintain data durability, just before commit, we choose between an fsync of the relfilenode and copying its contents to WAL. A new GUC, wal_skip_threshold, guides that choice. If this change slows a workload that creates small, permanent relfilenodes under wal_level=minimal, try adjusting wal_skip_threshold. Users setting a timeout on COMMIT may need to adjust that timeout, and log_min_duration_statement analysis will reflect time consumption moving to COMMIT from commands like COPY.

Internally, this requires a reliable determination of whether RollbackAndReleaseCurrentSubTransaction() would unlink a relation's current relfilenode. Introduce rd_firstRelfilenodeSubid. Amend the specification of rd_createSubid such that the field is zero when a new rel has an old rd_node. Make relcache.c retain entries for certain dropped relations until end of transaction.

Back-patch to 9.5 (all supported versions). This introduces a new WAL record type, XLOG_GIST_ASSIGN_LSN, without bumping XLOG_PAGE_MAGIC. As always, update standby systems before master systems. This changes sizeof(RelationData) and sizeof(IndexStmt), breaking binary compatibility for affected extensions. (The most recent commit to affect the same class of extensions was 089e4d405d0f3b94c74a2c6a54357a84a681754b.)

Kyotaro Horiguchi, reviewed (in earlier, similar versions) by Robert Haas. Heikki Linnakangas and Michael Paquier implemented earlier designs that materially clarified the problem. Reviewed, in earlier designs, by Andrew Dunstan, Andres Freund, Alvaro Herrera, Tom Lane, Fujii Masao, and Simon Riggs.
Reported by Martijn van Oosterhout. Discussion: https://postgr.es/m/20150702220524.GA9392@svana.org
Diffstat (limited to 'src/backend/utils/cache/relcache.c')
-rw-r--r-- src/backend/utils/cache/relcache.c 268
1 files changed, 216 insertions, 52 deletions
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index 76f41dbe36c..9ee9dc8cc0c 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -257,6 +257,9 @@ static void RelationReloadIndexInfo(Relation relation);
static void RelationReloadNailed(Relation relation);
static void RelationFlushRelation(Relation relation);
static void RememberToFreeTupleDescAtEOX(TupleDesc td);
+#ifdef USE_ASSERT_CHECKING
+static void AssertPendingSyncConsistency(Relation relation);
+#endif
static void AtEOXact_cleanup(Relation relation, bool isCommit);
static void AtEOSubXact_cleanup(Relation relation, bool isCommit,
SubTransactionId mySubid, SubTransactionId parentSubid);
@@ -1093,6 +1096,8 @@ RelationBuildDesc(Oid targetRelId, bool insertIt)
relation->rd_isnailed = false;
relation->rd_createSubid = InvalidSubTransactionId;
relation->rd_newRelfilenodeSubid = InvalidSubTransactionId;
+ relation->rd_firstRelfilenodeSubid = InvalidSubTransactionId;
+ relation->rd_droppedSubid = InvalidSubTransactionId;
switch (relation->rd_rel->relpersistence)
{
case RELPERSISTENCE_UNLOGGED:
@@ -1817,6 +1822,8 @@ formrdesc(const char *relationName, Oid relationReltype,
relation->rd_isnailed = true;
relation->rd_createSubid = InvalidSubTransactionId;
relation->rd_newRelfilenodeSubid = InvalidSubTransactionId;
+ relation->rd_firstRelfilenodeSubid = InvalidSubTransactionId;
+ relation->rd_droppedSubid = InvalidSubTransactionId;
relation->rd_backend = InvalidBackendId;
relation->rd_islocaltemp = false;
@@ -1989,6 +1996,13 @@ RelationIdGetRelation(Oid relationId)
if (RelationIsValid(rd))
{
+ /* return NULL for dropped relations */
+ if (rd->rd_droppedSubid != InvalidSubTransactionId)
+ {
+ Assert(!rd->rd_isvalid);
+ return NULL;
+ }
+
RelationIncrementReferenceCount(rd);
/* revalidate cache entry if necessary */
if (!rd->rd_isvalid)
@@ -2092,7 +2106,7 @@ RelationClose(Relation relation)
#ifdef RELCACHE_FORCE_RELEASE
if (RelationHasReferenceCountZero(relation) &&
relation->rd_createSubid == InvalidSubTransactionId &&
- relation->rd_newRelfilenodeSubid == InvalidSubTransactionId)
+ relation->rd_firstRelfilenodeSubid == InvalidSubTransactionId)
RelationClearRelation(relation, false);
#endif
}
@@ -2131,10 +2145,11 @@ RelationReloadIndexInfo(Relation relation)
HeapTuple pg_class_tuple;
Form_pg_class relp;
- /* Should be called only for invalidated indexes */
+ /* Should be called only for invalidated, live indexes */
Assert((relation->rd_rel->relkind == RELKIND_INDEX ||
relation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) &&
- !relation->rd_isvalid);
+ !relation->rd_isvalid &&
+ relation->rd_droppedSubid == InvalidSubTransactionId);
/* Ensure it's closed at smgr level */
RelationCloseSmgr(relation);
@@ -2430,6 +2445,13 @@ RelationClearRelation(Relation relation, bool rebuild)
return;
}
+ /* Mark it invalid until we've finished rebuild */
+ relation->rd_isvalid = false;
+
+ /* See RelationForgetRelation(). */
+ if (relation->rd_droppedSubid != InvalidSubTransactionId)
+ return;
+
/*
* Even non-system indexes should not be blown away if they are open and
* have valid index support information. This avoids problems with active
@@ -2442,15 +2464,11 @@ RelationClearRelation(Relation relation, bool rebuild)
relation->rd_refcnt > 0 &&
relation->rd_indexcxt != NULL)
{
- relation->rd_isvalid = false; /* needs to be revalidated */
if (IsTransactionState())
RelationReloadIndexInfo(relation);
return;
}
- /* Mark it invalid until we've finished rebuild */
- relation->rd_isvalid = false;
-
/*
* If we're really done with the relcache entry, blow it away. But if
* someone is still using it, reconstruct the whole deal without moving
@@ -2508,13 +2526,13 @@ RelationClearRelation(Relation relation, bool rebuild)
* problem.
*
* When rebuilding an open relcache entry, we must preserve ref count,
- * rd_createSubid/rd_newRelfilenodeSubid, and rd_toastoid state. Also
- * attempt to preserve the pg_class entry (rd_rel), tupledesc,
- * rewrite-rule, partition key, and partition descriptor substructures
- * in place, because various places assume that these structures won't
- * move while they are working with an open relcache entry. (Note:
- * the refcount mechanism for tupledescs might someday allow us to
- * remove this hack for the tupledesc.)
+ * rd_*Subid, and rd_toastoid state. Also attempt to preserve the
+ * pg_class entry (rd_rel), tupledesc, rewrite-rule, partition key,
+ * and partition descriptor substructures in place, because various
+ * places assume that these structures won't move while they are
+ * working with an open relcache entry. (Note: the refcount
+ * mechanism for tupledescs might someday allow us to remove this hack
+ * for the tupledesc.)
*
* Note that this process does not touch CurrentResourceOwner; which
* is good because whatever ref counts the entry may have do not
@@ -2594,6 +2612,8 @@ RelationClearRelation(Relation relation, bool rebuild)
/* creation sub-XIDs must be preserved */
SWAPFIELD(SubTransactionId, rd_createSubid);
SWAPFIELD(SubTransactionId, rd_newRelfilenodeSubid);
+ SWAPFIELD(SubTransactionId, rd_firstRelfilenodeSubid);
+ SWAPFIELD(SubTransactionId, rd_droppedSubid);
/* un-swap rd_rel pointers, swap contents instead */
SWAPFIELD(Form_pg_class, rd_rel);
/* ... but actually, we don't have to update newrel->rd_rel */
@@ -2672,12 +2692,12 @@ static void
RelationFlushRelation(Relation relation)
{
if (relation->rd_createSubid != InvalidSubTransactionId ||
- relation->rd_newRelfilenodeSubid != InvalidSubTransactionId)
+ relation->rd_firstRelfilenodeSubid != InvalidSubTransactionId)
{
/*
* New relcache entries are always rebuilt, not flushed; else we'd
- * forget the "new" status of the relation, which is a useful
- * optimization to have. Ditto for the new-relfilenode status.
+ * forget the "new" status of the relation. Ditto for the
+ * new-relfilenode status.
*
* The rel could have zero refcnt here, so temporarily increment the
* refcnt to ensure it's safe to rebuild it. We can assume that the
@@ -2699,10 +2719,7 @@ RelationFlushRelation(Relation relation)
}
/*
- * RelationForgetRelation - unconditionally remove a relcache entry
- *
- * External interface for destroying a relcache entry when we
- * drop the relation.
+ * RelationForgetRelation - caller reports that it dropped the relation
*/
void
RelationForgetRelation(Oid rid)
@@ -2717,7 +2734,19 @@ RelationForgetRelation(Oid rid)
if (!RelationHasReferenceCountZero(relation))
elog(ERROR, "relation %u is still open", rid);
- /* Unconditionally destroy the relcache entry */
+ Assert(relation->rd_droppedSubid == InvalidSubTransactionId);
+ if (relation->rd_createSubid != InvalidSubTransactionId ||
+ relation->rd_firstRelfilenodeSubid != InvalidSubTransactionId)
+ {
+ /*
+ * In the event of subtransaction rollback, we must not forget
+ * rd_*Subid. Mark the entry "dropped" so RelationClearRelation()
+ * invalidates it in lieu of destroying it. (If we're in a top
+ * transaction, we could opt to destroy the entry.)
+ */
+ relation->rd_droppedSubid = GetCurrentSubTransactionId();
+ }
+
RelationClearRelation(relation, false);
}
@@ -2757,11 +2786,10 @@ RelationCacheInvalidateEntry(Oid relationId)
* relation cache and re-read relation mapping data.
*
* This is currently used only to recover from SI message buffer overflow,
- * so we do not touch new-in-transaction relations; they cannot be targets
- * of cross-backend SI updates (and our own updates now go through a
- * separate linked list that isn't limited by the SI message buffer size).
- * Likewise, we need not discard new-relfilenode-in-transaction hints,
- * since any invalidation of those would be a local event.
+ * so we do not touch relations having new-in-transaction relfilenodes; they
+ * cannot be targets of cross-backend SI updates (and our own updates now go
+ * through a separate linked list that isn't limited by the SI message
+ * buffer size).
*
* We do this in two phases: the first pass deletes deletable items, and
* the second one rebuilds the rebuildable items. This is essential for
@@ -2812,7 +2840,7 @@ RelationCacheInvalidate(void)
* pending invalidations.
*/
if (relation->rd_createSubid != InvalidSubTransactionId ||
- relation->rd_newRelfilenodeSubid != InvalidSubTransactionId)
+ relation->rd_firstRelfilenodeSubid != InvalidSubTransactionId)
continue;
relcacheInvalsReceived++;
@@ -2924,6 +2952,84 @@ RememberToFreeTupleDescAtEOX(TupleDesc td)
EOXactTupleDescArray[NextEOXactTupleDescNum++] = td;
}
+#ifdef USE_ASSERT_CHECKING
+static void
+AssertPendingSyncConsistency(Relation relation)
+{
+	bool		relcache_verdict = false;
+
+	/*
+	 * Work out what relcache.c believes: WAL is skipped only for a permanent
+	 * relation that either has a first-relfilenode subid recorded, or was
+	 * created in this transaction and is of a kind that has storage.
+	 */
+	if (relation->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT)
+	{
+		if (relation->rd_firstRelfilenodeSubid != InvalidSubTransactionId)
+			relcache_verdict = true;
+		else if (relation->rd_createSubid != InvalidSubTransactionId)
+			relcache_verdict = RELKIND_HAS_STORAGE(relation->rd_rel->relkind);
+	}
+
+	/* relcache.c's view must match what storage.c reports for rd_node. */
+	Assert(relcache_verdict == RelFileNodeSkippingWAL(relation->rd_node));
+
+	/* The remaining check applies only to entries marked as dropped. */
+	if (relation->rd_droppedSubid == InvalidSubTransactionId)
+		return;
+
+	/*
+	 * A dropped entry must be invalid and must carry a creation or
+	 * first-relfilenode subid.
+	 */
+	Assert(!relation->rd_isvalid &&
+		   (relation->rd_createSubid != InvalidSubTransactionId ||
+			relation->rd_firstRelfilenodeSubid != InvalidSubTransactionId));
+}
+
+/*
+ * AssertPendingSyncs_RelationCache
+ *
+ *	Assert that relcache.c and storage.c agree on whether to skip WAL.
+ *
+ * Opens every relation this transaction has locked, so that a relcache
+ * entry exists for each, then runs AssertPendingSyncConsistency() on every
+ * entry in RelationIdCache.  The references taken here are released, and
+ * the snapshot pushed here is popped, before returning.
+ */
+void
+AssertPendingSyncs_RelationCache(void)
+{
+	HASH_SEQ_STATUS status;
+	LOCALLOCK  *locallock;
+	Relation   *rels;
+	int			maxrels;
+	int			nrels;
+	RelIdCacheEnt *idhentry;
+	int			i;
+
+	/*
+	 * Open every relation that this transaction has locked.  If, for some
+	 * relation, storage.c is skipping WAL and relcache.c is not skipping WAL,
+	 * a CommandCounterIncrement() typically yields a local invalidation
+	 * message that destroys the relcache entry.  By recreating such entries
+	 * here, we detect the problem.
+	 */
+	PushActiveSnapshot(GetTransactionSnapshot());
+	maxrels = 1;
+	rels = palloc(maxrels * sizeof(*rels));
+	nrels = 0;
+	hash_seq_init(&status, GetLockMethodLocalHash());
+	while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL)
+	{
+		Oid			relid;
+		Relation	r;
+
+		/* Ignore local lock entries with no lock count. */
+		if (locallock->nLocks <= 0)
+			continue;
+		/* Only relation lock tags identify a relation to open. */
+		if ((LockTagType) locallock->tag.lock.locktag_type !=
+			LOCKTAG_RELATION)
+			continue;
+
+		/*
+		 * The tag field is used as the relation OID as-is.  It must not be
+		 * passed through ObjectIdGetDatum(): that converts an Oid to a
+		 * Datum, which is the wrong type for relid.
+		 */
+		relid = locallock->tag.lock.locktag_field2;
+		r = RelationIdGetRelation(relid);
+		/* No usable entry (e.g. the relation was dropped): nothing to hold. */
+		if (!RelationIsValid(r))
+			continue;
+		/* Double the array whenever it is full. */
+		if (nrels >= maxrels)
+		{
+			maxrels *= 2;
+			rels = repalloc(rels, maxrels * sizeof(*rels));
+		}
+		rels[nrels++] = r;
+	}
+
+	/* Check every relcache entry, not just the ones opened above. */
+	hash_seq_init(&status, RelationIdCache);
+	while ((idhentry = (RelIdCacheEnt *) hash_seq_search(&status)) != NULL)
+		AssertPendingSyncConsistency(idhentry->reldesc);
+
+	/* Release the references acquired by RelationIdGetRelation(). */
+	for (i = 0; i < nrels; i++)
+		RelationClose(rels[i]);
+	PopActiveSnapshot();
+}
+#endif
+
/*
* AtEOXact_RelationCache
*
@@ -3006,6 +3112,8 @@ AtEOXact_RelationCache(bool isCommit)
static void
AtEOXact_cleanup(Relation relation, bool isCommit)
{
+ bool clear_relcache = false;
+
/*
* The relcache entry's ref count should be back to its normal
* not-in-a-transaction state: 0 unless it's nailed in cache.
@@ -3031,17 +3139,31 @@ AtEOXact_cleanup(Relation relation, bool isCommit)
#endif
/*
- * Is it a relation created in the current transaction?
+ * Is the relation live after this transaction ends?
*
- * During commit, reset the flag to zero, since we are now out of the
- * creating transaction. During abort, simply delete the relcache entry
- * --- it isn't interesting any longer.
+ * During commit, clear the relcache entry if it is preserved after
+ * relation drop, in order not to orphan the entry. During rollback,
+ * clear the relcache entry if the relation is created in the current
+ * transaction since it isn't interesting any longer once we are out of
+ * the transaction.
+ */
+ clear_relcache =
+ (isCommit ?
+ relation->rd_droppedSubid != InvalidSubTransactionId :
+ relation->rd_createSubid != InvalidSubTransactionId);
+
+ /*
+ * Since we are now out of the transaction, reset the subids to zero.
+ * That also lets RelationClearRelation() drop the relcache entry.
*/
- if (relation->rd_createSubid != InvalidSubTransactionId)
+ relation->rd_createSubid = InvalidSubTransactionId;
+ relation->rd_newRelfilenodeSubid = InvalidSubTransactionId;
+ relation->rd_firstRelfilenodeSubid = InvalidSubTransactionId;
+ relation->rd_droppedSubid = InvalidSubTransactionId;
+
+ if (clear_relcache)
{
- if (isCommit)
- relation->rd_createSubid = InvalidSubTransactionId;
- else if (RelationHasReferenceCountZero(relation))
+ if (RelationHasReferenceCountZero(relation))
{
RelationClearRelation(relation, false);
return;
@@ -3056,16 +3178,10 @@ AtEOXact_cleanup(Relation relation, bool isCommit)
* eventually. This must be just a WARNING to avoid
* error-during-error-recovery loops.
*/
- relation->rd_createSubid = InvalidSubTransactionId;
elog(WARNING, "cannot remove relcache entry for \"%s\" because it has nonzero refcount",
RelationGetRelationName(relation));
}
}
-
- /*
- * Likewise, reset the hint about the relfilenode being new.
- */
- relation->rd_newRelfilenodeSubid = InvalidSubTransactionId;
}
/*
@@ -3129,15 +3245,28 @@ AtEOSubXact_cleanup(Relation relation, bool isCommit,
/*
* Is it a relation created in the current subtransaction?
*
- * During subcommit, mark it as belonging to the parent, instead. During
- * subabort, simply delete the relcache entry.
+ * During subcommit, mark it as belonging to the parent, instead, as long
+ * as it has not been dropped. Otherwise simply delete the relcache entry
+ * --- it isn't interesting any longer.
*/
if (relation->rd_createSubid == mySubid)
{
- if (isCommit)
+ /*
+ * A valid rd_droppedSubid means the corresponding relation was dropped
+ * but its relcache entry is preserved for at-commit pending sync. We
+ * need to drop the entry explicitly here so as not to orphan it.
+ */
+ Assert(relation->rd_droppedSubid == mySubid ||
+ relation->rd_droppedSubid == InvalidSubTransactionId);
+ if (isCommit && relation->rd_droppedSubid == InvalidSubTransactionId)
relation->rd_createSubid = parentSubid;
else if (RelationHasReferenceCountZero(relation))
{
+ /* allow the entry to be removed */
+ relation->rd_createSubid = InvalidSubTransactionId;
+ relation->rd_newRelfilenodeSubid = InvalidSubTransactionId;
+ relation->rd_firstRelfilenodeSubid = InvalidSubTransactionId;
+ relation->rd_droppedSubid = InvalidSubTransactionId;
RelationClearRelation(relation, false);
return;
}
@@ -3157,7 +3286,8 @@ AtEOSubXact_cleanup(Relation relation, bool isCommit,
}
/*
- * Likewise, update or drop any new-relfilenode-in-subtransaction hint.
+ * Likewise, update or drop any new-relfilenode-in-subtransaction record
+ * or drop record.
*/
if (relation->rd_newRelfilenodeSubid == mySubid)
{
@@ -3166,6 +3296,22 @@ AtEOSubXact_cleanup(Relation relation, bool isCommit,
else
relation->rd_newRelfilenodeSubid = InvalidSubTransactionId;
}
+
+ if (relation->rd_firstRelfilenodeSubid == mySubid)
+ {
+ if (isCommit)
+ relation->rd_firstRelfilenodeSubid = parentSubid;
+ else
+ relation->rd_firstRelfilenodeSubid = InvalidSubTransactionId;
+ }
+
+ if (relation->rd_droppedSubid == mySubid)
+ {
+ if (isCommit)
+ relation->rd_droppedSubid = parentSubid;
+ else
+ relation->rd_droppedSubid = InvalidSubTransactionId;
+ }
}
@@ -3255,6 +3401,7 @@ RelationBuildLocalRelation(const char *relname,
/* it's being created in this transaction */
rel->rd_createSubid = GetCurrentSubTransactionId();
rel->rd_newRelfilenodeSubid = InvalidSubTransactionId;
+ rel->rd_firstRelfilenodeSubid = InvalidSubTransactionId;
/*
* create a new tuple descriptor from the one passed in. We do this
@@ -3552,14 +3699,29 @@ RelationSetNewRelfilenode(Relation relation, char persistence)
*/
CommandCounterIncrement();
- /*
- * Mark the rel as having been given a new relfilenode in the current
- * (sub) transaction. This is a hint that can be used to optimize later
- * operations on the rel in the same transaction.
- */
+ RelationAssumeNewRelfilenode(relation);
+}
+
+/*
+ * RelationAssumeNewRelfilenode
+ *
+ * Code that modifies pg_class.reltablespace or pg_class.relfilenode must call
+ * this. The call shall precede any code that might insert WAL records whose
+ * replay would modify bytes in the new RelFileNode, and the call shall follow
+ * any WAL modifying bytes in the prior RelFileNode. See struct RelationData.
+ * Ideally, call this as near as possible to the CommandCounterIncrement()
+ * that makes the pg_class change visible (before it or after it); that
+ * minimizes the chance of future development adding a forbidden WAL insertion
+ * between RelationAssumeNewRelfilenode() and CommandCounterIncrement().
+ *
+ * Effect on the relcache entry: rd_newRelfilenodeSubid is always set to the
+ * current subtransaction ID. rd_firstRelfilenodeSubid is set to that same
+ * value only when it is still InvalidSubTransactionId, so it keeps the value
+ * from the first call that found it unset. The relation is then added to
+ * the end-of-transaction list so that these fields get cleared.
+ */
+void
+RelationAssumeNewRelfilenode(Relation relation)
+{
relation->rd_newRelfilenodeSubid = GetCurrentSubTransactionId();
+ if (relation->rd_firstRelfilenodeSubid == InvalidSubTransactionId)
+ relation->rd_firstRelfilenodeSubid = relation->rd_newRelfilenodeSubid;
- /* Flag relation as needing eoxact cleanup (to remove the hint) */
+ /* Flag relation as needing eoxact cleanup (to clear these fields) */
EOXactListAdd(relation);
}
@@ -5625,6 +5787,8 @@ load_relcache_init_file(bool shared)
rel->rd_fkeylist = NIL;
rel->rd_createSubid = InvalidSubTransactionId;
rel->rd_newRelfilenodeSubid = InvalidSubTransactionId;
+ rel->rd_firstRelfilenodeSubid = InvalidSubTransactionId;
+ rel->rd_droppedSubid = InvalidSubTransactionId;
rel->rd_amcache = NULL;
MemSet(&rel->pgstat_info, 0, sizeof(rel->pgstat_info));