author     Robert Haas <rhaas@postgresql.org>   2013-07-02 09:47:01 -0400
committer  Robert Haas <rhaas@postgresql.org>   2013-07-02 09:47:01 -0400
commit     568d4138c646cd7cd8a837ac244ef2caf27c6bb8
tree       82022e9bd58a217976f94fea942f24b0c40278c0  /src/backend/commands/dbcommands.c
parent     384f933046dc9e9a2b416f5f7b3be30b93587c63
Use an MVCC snapshot, rather than SnapshotNow, for catalog scans.
SnapshotNow scans have the undesirable property that, in the face of
concurrent updates, the scan can fail to see either the old or the new
versions of the row. In many cases, we work around this by requiring
DDL operations to hold AccessExclusiveLock on the object being
modified; in some cases, the existing locking is inadequate and random
failures occur as a result. This commit doesn't change anything
related to locking, but will hopefully pave the way to allowing lock
strength reductions in the future.
The major issue that has held us back from making this change in the past
is that taking an MVCC snapshot is significantly more expensive than
using a static special snapshot such as SnapshotNow. However, testing
of various worst-case scenarios reveals that this problem is not
severe except under fairly extreme workloads. To mitigate those
problems, we avoid retaking the MVCC snapshot for each new scan;
instead, we take a new snapshot only when invalidation messages have
been processed. The catcache machinery already requires that
invalidation messages be sent before releasing the related heavyweight
lock; else other backends might rely on locally-cached data rather
than scanning the catalog at all. Thus, making snapshot reuse
dependent on the same guarantees shouldn't break anything that wasn't
already subtly broken.
Patch by me. Review by Michael Paquier and Andres Freund.
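
As a rough illustration of the snapshot-reuse rule described in the paragraph above, here is a minimal sketch in C. The names (GetCatalogSnapshotSketch, MarkCatalogSnapshotStale, TakeFreshMVCCSnapshot) are hypothetical stand-ins rather than functions this commit adds; the sketch only captures the caching rule: reuse the last catalog snapshot until invalidation messages have been processed.

```c
#include <stdbool.h>
#include <stddef.h>

/* Hypothetical stand-ins for backend types and helpers (not real APIs). */
typedef struct SnapshotData *Snapshot;
extern Snapshot TakeFreshMVCCSnapshot(void);

static Snapshot CatalogSnapshot = NULL;
static bool CatalogSnapshotStale = true;

/* Invoked whenever invalidation messages are processed. */
static void
MarkCatalogSnapshotStale(void)
{
	CatalogSnapshotStale = true;
}

/* Invoked at the start of each catalog scan. */
static Snapshot
GetCatalogSnapshotSketch(void)
{
	/*
	 * Reuse the previous snapshot unless invalidation messages have arrived
	 * since it was taken.  Because senders must emit their invalidations
	 * before releasing the related heavyweight lock, a reused snapshot
	 * cannot hide a committed catalog change the scan is entitled to see.
	 */
	if (CatalogSnapshotStale || CatalogSnapshot == NULL)
	{
		CatalogSnapshot = TakeFreshMVCCSnapshot();
		CatalogSnapshotStale = false;
	}
	return CatalogSnapshot;
}
```

The sketch captures only the amortization argument; it omits snapshot registration, resource-owner tracking, and the rest of the real snapshot machinery.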
Diffstat (limited to 'src/backend/commands/dbcommands.c')
-rw-r--r--   src/backend/commands/dbcommands.c   66
1 file changed, 8 insertions, 58 deletions
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 0e10a752180..a3a150d7008 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -133,7 +133,6 @@ createdb(const CreatedbStmt *stmt)
 	int			notherbackends;
 	int			npreparedxacts;
 	createdb_failure_params fparms;
-	Snapshot	snapshot;
 
 	/* Extract options from the statement node tree */
 	foreach(option, stmt->options)
@@ -538,29 +537,6 @@ createdb(const CreatedbStmt *stmt)
 	RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT);
 
 	/*
-	 * Take an MVCC snapshot to use while scanning through pg_tablespace.  For
-	 * safety, register the snapshot (this prevents it from changing if
-	 * something else were to request a snapshot during the loop).
-	 *
-	 * Traversing pg_tablespace with an MVCC snapshot is necessary to provide
-	 * us with a consistent view of the tablespaces that exist.  Using
-	 * SnapshotNow here would risk seeing the same tablespace multiple times,
-	 * or worse not seeing a tablespace at all, if its tuple is moved around
-	 * by a concurrent update (eg an ACL change).
-	 *
-	 * Inconsistency of this sort is inherent to all SnapshotNow scans, unless
-	 * some lock is held to prevent concurrent updates of the rows being
-	 * sought.  There should be a generic fix for that, but in the meantime
-	 * it's worth fixing this case in particular because we are doing very
-	 * heavyweight operations within the scan, so that the elapsed time for
-	 * the scan is vastly longer than for most other catalog scans.  That
-	 * means there's a much wider window for concurrent updates to cause
-	 * trouble here than anywhere else.  XXX this code should be changed
-	 * whenever a generic fix is implemented.
-	 */
-	snapshot = RegisterSnapshot(GetLatestSnapshot());
-
-	/*
 	 * Once we start copying subdirectories, we need to be able to clean 'em
 	 * up if we fail.  Use an ENSURE block to make sure this happens.  (This
 	 * is not a 100% solution, because of the possibility of failure during
@@ -577,7 +553,7 @@ createdb(const CreatedbStmt *stmt)
 	 * each one to the new database.
 	 */
 	rel = heap_open(TableSpaceRelationId, AccessShareLock);
-	scan = heap_beginscan(rel, snapshot, 0, NULL);
+	scan = heap_beginscan_catalog(rel, 0, NULL);
 	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
 	{
 		Oid			srctablespace = HeapTupleGetOid(tuple);
@@ -682,9 +658,6 @@ createdb(const CreatedbStmt *stmt)
 	PG_END_ENSURE_ERROR_CLEANUP(createdb_failure_callback,
 								PointerGetDatum(&fparms));
 
-	/* Free our snapshot */
-	UnregisterSnapshot(snapshot);
-
 	return dboid;
 }
 
@@ -1214,7 +1187,7 @@ movedb(const char *dbname, const char *tblspcname)
 				BTEqualStrategyNumber, F_NAMEEQ,
 				NameGetDatum(dbname));
 	sysscan = systable_beginscan(pgdbrel, DatabaseNameIndexId, true,
-								 SnapshotNow, 1, &scankey);
+								 NULL, 1, &scankey);
 	oldtuple = systable_getnext(sysscan);
 	if (!HeapTupleIsValid(oldtuple))	/* shouldn't happen... */
 		ereport(ERROR,
@@ -1403,7 +1376,7 @@ AlterDatabase(AlterDatabaseStmt *stmt, bool isTopLevel)
 				BTEqualStrategyNumber, F_NAMEEQ,
 				NameGetDatum(stmt->dbname));
 	scan = systable_beginscan(rel, DatabaseNameIndexId, true,
-							  SnapshotNow, 1, &scankey);
+							  NULL, 1, &scankey);
 	tuple = systable_getnext(scan);
 	if (!HeapTupleIsValid(tuple))
 		ereport(ERROR,
@@ -1498,7 +1471,7 @@ AlterDatabaseOwner(const char *dbname, Oid newOwnerId)
 				BTEqualStrategyNumber, F_NAMEEQ,
 				NameGetDatum(dbname));
 	scan = systable_beginscan(rel, DatabaseNameIndexId, true,
-							  SnapshotNow, 1, &scankey);
+							  NULL, 1, &scankey);
 	tuple = systable_getnext(scan);
 	if (!HeapTupleIsValid(tuple))
 		ereport(ERROR,
@@ -1637,7 +1610,7 @@ get_db_info(const char *name, LOCKMODE lockmode,
 					NameGetDatum(name));
 
 		scan = systable_beginscan(relation, DatabaseNameIndexId, true,
-								  SnapshotNow, 1, &scanKey);
+								  NULL, 1, &scanKey);
 
 		tuple = systable_getnext(scan);
 
@@ -1751,20 +1724,9 @@ remove_dbtablespaces(Oid db_id)
 	Relation	rel;
 	HeapScanDesc scan;
 	HeapTuple	tuple;
-	Snapshot	snapshot;
-
-	/*
-	 * As in createdb(), we'd better use an MVCC snapshot here, since this
-	 * scan can run for a long time.  Duplicate visits to tablespaces would be
-	 * harmless, but missing a tablespace could result in permanently leaked
-	 * files.
-	 *
-	 * XXX change this when a generic fix for SnapshotNow races is implemented
-	 */
-	snapshot = RegisterSnapshot(GetLatestSnapshot());
 
 	rel = heap_open(TableSpaceRelationId, AccessShareLock);
-	scan = heap_beginscan(rel, snapshot, 0, NULL);
+	scan = heap_beginscan_catalog(rel, 0, NULL);
 	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
 	{
 		Oid			dsttablespace = HeapTupleGetOid(tuple);
@@ -1810,7 +1772,6 @@ remove_dbtablespaces(Oid db_id)
 
 	heap_endscan(scan);
 	heap_close(rel, AccessShareLock);
-	UnregisterSnapshot(snapshot);
 }
 
 /*
@@ -1832,19 +1793,9 @@ check_db_file_conflict(Oid db_id)
 	Relation	rel;
 	HeapScanDesc scan;
 	HeapTuple	tuple;
-	Snapshot	snapshot;
-
-	/*
-	 * As in createdb(), we'd better use an MVCC snapshot here; missing a
-	 * tablespace could result in falsely reporting the OID is unique, with
-	 * disastrous future consequences per the comment above.
-	 *
-	 * XXX change this when a generic fix for SnapshotNow races is implemented
-	 */
-	snapshot = RegisterSnapshot(GetLatestSnapshot());
 
 	rel = heap_open(TableSpaceRelationId, AccessShareLock);
-	scan = heap_beginscan(rel, snapshot, 0, NULL);
+	scan = heap_beginscan_catalog(rel, 0, NULL);
 	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
 	{
 		Oid			dsttablespace = HeapTupleGetOid(tuple);
@@ -1870,7 +1821,6 @@ check_db_file_conflict(Oid db_id)
 
 	heap_endscan(scan);
 	heap_close(rel, AccessShareLock);
-	UnregisterSnapshot(snapshot);
 
 	return result;
 }
@@ -1927,7 +1877,7 @@ get_database_oid(const char *dbname, bool missing_ok)
 				BTEqualStrategyNumber, F_NAMEEQ,
 				CStringGetDatum(dbname));
 	scan = systable_beginscan(pg_database, DatabaseNameIndexId, true,
-							  SnapshotNow, 1, entry);
+							  NULL, 1, entry);
 
 	dbtuple = systable_getnext(scan);
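
For readers skimming the hunks above, the change can be condensed into a before/after sketch. The two functions below are illustrative only: the function names are hypothetical, the per-tuple work is elided, and the code assumes the PostgreSQL 9.3-era backend headers. They restate the calling-convention change the diff makes: heap_beginscan_catalog() takes over the job of supplying an MVCC snapshot for whole-table catalog scans, so the caller no longer registers one itself.

```c
#include "postgres.h"

#include "access/heapam.h"
#include "catalog/pg_tablespace.h"
#include "utils/snapmgr.h"

/* Removed pattern: the caller takes and registers its own MVCC snapshot. */
static void
scan_tablespaces_old_style(void)
{
	Snapshot	snapshot;
	Relation	rel;
	HeapScanDesc scan;
	HeapTuple	tuple;

	snapshot = RegisterSnapshot(GetLatestSnapshot());
	rel = heap_open(TableSpaceRelationId, AccessShareLock);
	scan = heap_beginscan(rel, snapshot, 0, NULL);
	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
	{
		/* ... per-tablespace work ... */
	}
	heap_endscan(scan);
	heap_close(rel, AccessShareLock);
	UnregisterSnapshot(snapshot);
}

/* New pattern: heap_beginscan_catalog() supplies the snapshot itself. */
static void
scan_tablespaces_new_style(void)
{
	Relation	rel;
	HeapScanDesc scan;
	HeapTuple	tuple;

	rel = heap_open(TableSpaceRelationId, AccessShareLock);
	scan = heap_beginscan_catalog(rel, 0, NULL);
	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
	{
		/* ... per-tablespace work ... */
	}
	heap_endscan(scan);
	heap_close(rel, AccessShareLock);
}
```

Indexed catalog lookups follow the same pattern in the hunks above: where the old code passed SnapshotNow to systable_beginscan(), the new code passes NULL and lets the systable machinery choose an appropriate catalog snapshot.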