aboutsummaryrefslogtreecommitdiff
path: root/src/backend/utils/cache
diff options
context:
space:
mode:
authorNoah Misch <noah@leadboat.com>2021-10-23 18:36:38 -0700
committerNoah Misch <noah@leadboat.com>2021-10-23 18:36:38 -0700
commitfdd965d074d46765c295223b119ca437dbcac973 (patch)
tree5fb2acb761849c3bf6f5f15ac783ddd561c6186b /src/backend/utils/cache
parent1e9475694b0ae2cf1204d01d2ef6ad86f3c7cac8 (diff)
downloadpostgresql-fdd965d074d46765c295223b119ca437dbcac973.tar.gz
postgresql-fdd965d074d46765c295223b119ca437dbcac973.zip
Avoid race in RelationBuildDesc() affecting CREATE INDEX CONCURRENTLY.
CIC and REINDEX CONCURRENTLY assume backends see their catalog changes no later than each backend's next transaction start. That failed to hold when a backend absorbed a relevant invalidation in the middle of running RelationBuildDesc() on the CIC index. Queries that use the resulting index can silently fail to find rows. Fix this for future index builds by making RelationBuildDesc() loop until it finishes without accepting a relevant invalidation. It may be necessary to reindex to recover from past occurrences; REINDEX CONCURRENTLY suffices. Back-patch to 9.6 (all supported versions). Noah Misch and Andrey Borodin, reviewed (in earlier versions) by Andres Freund. Discussion: https://postgr.es/m/20210730022548.GA1940096@gust.leadboat.com
Diffstat (limited to 'src/backend/utils/cache')
-rw-r--r--src/backend/utils/cache/inval.c12
-rw-r--r--src/backend/utils/cache/relcache.c115
2 files changed, 118 insertions, 9 deletions
diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c
index 9352c680900..dd8586ab4d9 100644
--- a/src/backend/utils/cache/inval.c
+++ b/src/backend/utils/cache/inval.c
@@ -628,7 +628,7 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
int i;
if (msg->rc.relId == InvalidOid)
- RelationCacheInvalidate();
+ RelationCacheInvalidate(false);
else
RelationCacheInvalidateEntry(msg->rc.relId);
@@ -686,11 +686,17 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
void
InvalidateSystemCaches(void)
{
+ InvalidateSystemCachesExtended(false);
+}
+
+void
+InvalidateSystemCachesExtended(bool debug_discard)
+{
int i;
InvalidateCatalogSnapshot();
ResetCatalogCaches();
- RelationCacheInvalidate(); /* gets smgr and relmap too */
+ RelationCacheInvalidate(debug_discard); /* gets smgr and relmap too */
for (i = 0; i < syscache_callback_count; i++)
{
@@ -759,7 +765,7 @@ AcceptInvalidationMessages(void)
if (recursion_depth < debug_discard_caches)
{
recursion_depth++;
- InvalidateSystemCaches();
+ InvalidateSystemCachesExtended(true);
recursion_depth--;
}
}
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index 13d9994af3e..b54c9117669 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -151,6 +151,24 @@ bool criticalSharedRelcachesBuilt = false;
static long relcacheInvalsReceived = 0L;
/*
+ * in_progress_list is a stack of ongoing RelationBuildDesc() calls. CREATE
+ * INDEX CONCURRENTLY makes catalog changes under ShareUpdateExclusiveLock.
+ * It critically relies on each backend absorbing those changes no later than
+ * next transaction start. Hence, RelationBuildDesc() loops until it finishes
+ * without accepting a relevant invalidation. (Most invalidation consumers
+ * don't do this.)
+ */
+typedef struct inprogressent
+{
+ Oid reloid; /* OID of relation being built */
+ bool invalidated; /* whether an invalidation arrived for it */
+} InProgressEnt;
+
+static InProgressEnt *in_progress_list;
+static int in_progress_list_len;
+static int in_progress_list_maxlen;
+
+/*
* eoxact_list[] stores the OIDs of relations that (might) need AtEOXact
* cleanup work. This list intentionally has limited size; if it overflows,
* we fall back to scanning the whole hashtable. There is no value in a very
@@ -1000,6 +1018,7 @@ equalRSDesc(RowSecurityDesc *rsdesc1, RowSecurityDesc *rsdesc2)
static Relation
RelationBuildDesc(Oid targetRelId, bool insertIt)
{
+ int in_progress_offset;
Relation relation;
Oid relid;
HeapTuple pg_class_tuple;
@@ -1033,6 +1052,21 @@ RelationBuildDesc(Oid targetRelId, bool insertIt)
}
#endif
+ /* Register to catch invalidation messages */
+ if (in_progress_list_len >= in_progress_list_maxlen)
+ {
+ int allocsize;
+
+ allocsize = in_progress_list_maxlen * 2;
+ in_progress_list = repalloc(in_progress_list,
+ allocsize * sizeof(*in_progress_list));
+ in_progress_list_maxlen = allocsize;
+ }
+ in_progress_offset = in_progress_list_len++;
+ in_progress_list[in_progress_offset].reloid = targetRelId;
+retry:
+ in_progress_list[in_progress_offset].invalidated = false;
+
/*
* find the tuple in pg_class corresponding to the given relation id
*/
@@ -1051,6 +1085,8 @@ RelationBuildDesc(Oid targetRelId, bool insertIt)
MemoryContextDelete(tmpcxt);
}
#endif
+ Assert(in_progress_offset + 1 == in_progress_list_len);
+ in_progress_list_len--;
return NULL;
}
@@ -1214,6 +1250,21 @@ RelationBuildDesc(Oid targetRelId, bool insertIt)
heap_freetuple(pg_class_tuple);
/*
+ * If an invalidation arrived mid-build, start over. Between here and the
+ * end of this function, don't add code that does or reasonably could read
+ * system catalogs. That range must be free from invalidation processing
+ * for the !insertIt case. For the insertIt case, RelationCacheInsert()
+ * will enroll this relation in ordinary relcache invalidation processing,
+ */
+ if (in_progress_list[in_progress_offset].invalidated)
+ {
+ RelationDestroyRelation(relation, false);
+ goto retry;
+ }
+ Assert(in_progress_offset + 1 == in_progress_list_len);
+ in_progress_list_len--;
+
+ /*
* Insert newly created relation into relcache hash table, if requested.
*
* There is one scenario in which we might find a hashtable entry already
@@ -2566,6 +2617,14 @@ RelationClearRelation(Relation relation, bool rebuild)
/* Build temporary entry, but don't link it into hashtable */
newrel = RelationBuildDesc(save_relid, false);
+
+ /*
+ * Between here and the end of the swap, don't add code that does or
+ * reasonably could read system catalogs. That range must be free
+ * from invalidation processing. See RelationBuildDesc() manipulation
+ * of in_progress_list.
+ */
+
if (newrel == NULL)
{
/*
@@ -2805,6 +2864,14 @@ RelationCacheInvalidateEntry(Oid relationId)
relcacheInvalsReceived++;
RelationFlushRelation(relation);
}
+ else
+ {
+ int i;
+
+ for (i = 0; i < in_progress_list_len; i++)
+ if (in_progress_list[i].reloid == relationId)
+ in_progress_list[i].invalidated = true;
+ }
}
/*
@@ -2813,11 +2880,11 @@ RelationCacheInvalidateEntry(Oid relationId)
* and rebuild those with positive reference counts. Also reset the smgr
* relation cache and re-read relation mapping data.
*
- * This is currently used only to recover from SI message buffer overflow,
- * so we do not touch relations having new-in-transaction relfilenodes; they
- * cannot be targets of cross-backend SI updates (and our own updates now go
- * through a separate linked list that isn't limited by the SI message
- * buffer size).
+ * Apart from debug_discard_caches, this is currently used only to recover
+ * from SI message buffer overflow, so we do not touch relations having
+ * new-in-transaction relfilenodes; they cannot be targets of cross-backend
+ * SI updates (and our own updates now go through a separate linked list
+ * that isn't limited by the SI message buffer size).
*
* We do this in two phases: the first pass deletes deletable items, and
* the second one rebuilds the rebuildable items. This is essential for
@@ -2835,9 +2902,14 @@ RelationCacheInvalidateEntry(Oid relationId)
* second pass processes nailed-in-cache items before other nondeletable
* items. This should ensure that system catalogs are up to date before
* we attempt to use them to reload information about other open relations.
+ *
+ * After those two phases of work having immediate effects, we normally
+ * signal any RelationBuildDesc() on the stack to start over. However, we
+ * don't do this if called as part of debug_discard_caches. Otherwise,
+ * RelationBuildDesc() would become an infinite loop.
*/
void
-RelationCacheInvalidate(void)
+RelationCacheInvalidate(bool debug_discard)
{
HASH_SEQ_STATUS status;
RelIdCacheEnt *idhentry;
@@ -2845,6 +2917,7 @@ RelationCacheInvalidate(void)
List *rebuildFirstList = NIL;
List *rebuildList = NIL;
ListCell *l;
+ int i;
/*
* Reload relation mapping data before starting to reconstruct cache.
@@ -2931,6 +3004,11 @@ RelationCacheInvalidate(void)
RelationClearRelation(relation, true);
}
list_free(rebuildList);
+
+ if (!debug_discard)
+ /* Any RelationBuildDesc() on the stack must start over. */
+ for (i = 0; i < in_progress_list_len; i++)
+ in_progress_list[i].invalidated = true;
}
/*
@@ -3082,6 +3160,13 @@ AtEOXact_RelationCache(bool isCommit)
int i;
/*
+ * Forget in_progress_list. This is relevant when we're aborting due to
+ * an error during RelationBuildDesc().
+ */
+ Assert(in_progress_list_len == 0 || !isCommit);
+ in_progress_list_len = 0;
+
+ /*
* Unless the eoxact_list[] overflowed, we only need to examine the rels
* listed in it. Otherwise fall back on a hash_seq_search scan.
*
@@ -3228,6 +3313,14 @@ AtEOSubXact_RelationCache(bool isCommit, SubTransactionId mySubid,
int i;
/*
+ * Forget in_progress_list. This is relevant when we're aborting due to
+ * an error during RelationBuildDesc(). We don't commit subtransactions
+ * during RelationBuildDesc().
+ */
+ Assert(in_progress_list_len == 0 || !isCommit);
+ in_progress_list_len = 0;
+
+ /*
* Unless the eoxact_list[] overflowed, we only need to examine the rels
* listed in it. Otherwise fall back on a hash_seq_search scan. Same
* logic as in AtEOXact_RelationCache.
@@ -3775,6 +3868,7 @@ void
RelationCacheInitialize(void)
{
HASHCTL ctl;
+ int allocsize;
/*
* make sure cache memory context exists
@@ -3791,6 +3885,15 @@ RelationCacheInitialize(void)
&ctl, HASH_ELEM | HASH_BLOBS);
/*
+ * reserve enough in_progress_list slots for many cases
+ */
+ allocsize = 4;
+ in_progress_list =
+ MemoryContextAlloc(CacheMemoryContext,
+ allocsize * sizeof(*in_progress_list));
+ in_progress_list_maxlen = allocsize;
+
+ /*
* relation mapper needs to be initialized too
*/
RelationMapInitialize();