author     Robert Haas <rhaas@postgresql.org>   2013-07-02 09:47:01 -0400
committer  Robert Haas <rhaas@postgresql.org>   2013-07-02 09:47:01 -0400
commit     568d4138c646cd7cd8a837ac244ef2caf27c6bb8
tree       82022e9bd58a217976f94fea942f24b0c40278c0  /src/backend/commands/dbcommands.c
parent     384f933046dc9e9a2b416f5f7b3be30b93587c63
Use an MVCC snapshot, rather than SnapshotNow, for catalog scans.
SnapshotNow scans have the undesirable property that, in the face of
concurrent updates, the scan can fail to see either the old or the new
versions of the row. In many cases, we work around this by requiring
DDL operations to hold AccessExclusiveLock on the object being
modified; in some cases, the existing locking is inadequate and random
failures occur as a result. This commit doesn't change anything
related to locking, but will hopefully pave the way to allowing lock
strength reductions in the future.
The major issue that has held us back from making this change in the past
is that taking an MVCC snapshot is significantly more expensive than
using a static special snapshot such as SnapshotNow. However, testing
of various worst-case scenarios reveals that this problem is not
severe except under fairly extreme workloads. To mitigate those
problems, we avoid retaking the MVCC snapshot for each new scan;
instead, we take a new snapshot only when invalidation messages have
been processed. The catcache machinery already requires that
invalidation messages be sent before releasing the related heavyweight
lock; else other backends might rely on locally-cached data rather
than scanning the catalog at all. Thus, making snapshot reuse
dependent on the same guarantees shouldn't break anything that wasn't
already subtly broken.
Patch by me. Review by Michael Paquier and Andres Freund.
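
As a rough illustration of the snapshot-reuse rule described in the paragraph above, here is a minimal sketch in C. The names (GetCatalogSnapshotSketch, MarkCatalogSnapshotStale, TakeFreshMVCCSnapshot) are hypothetical stand-ins rather than functions this commit adds; the sketch only captures the caching rule: reuse the last catalog snapshot until invalidation messages have been processed.

```c
#include <stdbool.h>
#include <stddef.h>

/* Hypothetical stand-ins for backend types and helpers (not real APIs). */
typedef struct SnapshotData *Snapshot;
extern Snapshot TakeFreshMVCCSnapshot(void);

static Snapshot CatalogSnapshot = NULL;
static bool CatalogSnapshotStale = true;

/* Invoked whenever invalidation messages are processed. */
static void
MarkCatalogSnapshotStale(void)
{
	CatalogSnapshotStale = true;
}

/* Invoked at the start of each catalog scan. */
static Snapshot
GetCatalogSnapshotSketch(void)
{
	/*
	 * Reuse the previous snapshot unless invalidation messages have arrived
	 * since it was taken.  Because senders must emit their invalidations
	 * before releasing the related heavyweight lock, a reused snapshot
	 * cannot hide a committed catalog change the scan is entitled to see.
	 */
	if (CatalogSnapshotStale || CatalogSnapshot == NULL)
	{
		CatalogSnapshot = TakeFreshMVCCSnapshot();
		CatalogSnapshotStale = false;
	}
	return CatalogSnapshot;
}
```

The sketch captures only the amortization argument; it omits snapshot registration, resource-owner tracking, and the rest of the real snapshot machinery.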
Diffstat (limited to 'src/backend/commands/dbcommands.c')
-rw-r--r--   src/backend/commands/dbcommands.c   66
1 file changed, 8 insertions, 58 deletions
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 0e10a752180..a3a150d7008 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -133,7 +133,6 @@ createdb(const CreatedbStmt *stmt)
 	int			notherbackends;
 	int			npreparedxacts;
 	createdb_failure_params fparms;
-	Snapshot	snapshot;
 
 	/* Extract options from the statement node tree */
 	foreach(option, stmt->options)
@@ -538,29 +537,6 @@ createdb(const CreatedbStmt *stmt)
 	RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT);
 
 	/*
-	 * Take an MVCC snapshot to use while scanning through pg_tablespace.  For
-	 * safety, register the snapshot (this prevents it from changing if
-	 * something else were to request a snapshot during the loop).
-	 *
-	 * Traversing pg_tablespace with an MVCC snapshot is necessary to provide
-	 * us with a consistent view of the tablespaces that exist.  Using
-	 * SnapshotNow here would risk seeing the same tablespace multiple times,
-	 * or worse not seeing a tablespace at all, if its tuple is moved around
-	 * by a concurrent update (eg an ACL change).
-	 *
-	 * Inconsistency of this sort is inherent to all SnapshotNow scans, unless
-	 * some lock is held to prevent concurrent updates of the rows being
-	 * sought.  There should be a generic fix for that, but in the meantime
-	 * it's worth fixing this case in particular because we are doing very
-	 * heavyweight operations within the scan, so that the elapsed time for
-	 * the scan is vastly longer than for most other catalog scans.  That
-	 * means there's a much wider window for concurrent updates to cause
-	 * trouble here than anywhere else.  XXX this code should be changed
-	 * whenever a generic fix is implemented.
-	 */
-	snapshot = RegisterSnapshot(GetLatestSnapshot());
-
-	/*
 	 * Once we start copying subdirectories, we need to be able to clean 'em
 	 * up if we fail.  Use an ENSURE block to make sure this happens.  (This
 	 * is not a 100% solution, because of the possibility of failure during
@@ -577,7 +553,7 @@ createdb(const CreatedbStmt *stmt)
 	 * each one to the new database.
 	 */
 	rel = heap_open(TableSpaceRelationId, AccessShareLock);
-	scan = heap_beginscan(rel, snapshot, 0, NULL);
+	scan = heap_beginscan_catalog(rel, 0, NULL);
 	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
 	{
 		Oid			srctablespace = HeapTupleGetOid(tuple);
@@ -682,9 +658,6 @@ createdb(const CreatedbStmt *stmt)
 	PG_END_ENSURE_ERROR_CLEANUP(createdb_failure_callback,
 								PointerGetDatum(&fparms));
 
-	/* Free our snapshot */
-	UnregisterSnapshot(snapshot);
-
 	return dboid;
 }
 
@@ -1214,7 +1187,7 @@ movedb(const char *dbname, const char *tblspcname)
 				BTEqualStrategyNumber, F_NAMEEQ,
 				NameGetDatum(dbname));
 	sysscan = systable_beginscan(pgdbrel, DatabaseNameIndexId, true,
-								 SnapshotNow, 1, &scankey);
+								 NULL, 1, &scankey);
 	oldtuple = systable_getnext(sysscan);
 	if (!HeapTupleIsValid(oldtuple))	/* shouldn't happen... */
 		ereport(ERROR,
@@ -1403,7 +1376,7 @@ AlterDatabase(AlterDatabaseStmt *stmt, bool isTopLevel)
 				BTEqualStrategyNumber, F_NAMEEQ,
 				NameGetDatum(stmt->dbname));
 	scan = systable_beginscan(rel, DatabaseNameIndexId, true,
-							  SnapshotNow, 1, &scankey);
+							  NULL, 1, &scankey);
 	tuple = systable_getnext(scan);
 	if (!HeapTupleIsValid(tuple))
 		ereport(ERROR,
@@ -1498,7 +1471,7 @@ AlterDatabaseOwner(const char *dbname, Oid newOwnerId)
 				BTEqualStrategyNumber, F_NAMEEQ,
 				NameGetDatum(dbname));
 	scan = systable_beginscan(rel, DatabaseNameIndexId, true,
-							  SnapshotNow, 1, &scankey);
+							  NULL, 1, &scankey);
 	tuple = systable_getnext(scan);
 	if (!HeapTupleIsValid(tuple))
 		ereport(ERROR,
@@ -1637,7 +1610,7 @@ get_db_info(const char *name, LOCKMODE lockmode,
 					NameGetDatum(name));
 
 		scan = systable_beginscan(relation, DatabaseNameIndexId, true,
-								  SnapshotNow, 1, &scanKey);
+								  NULL, 1, &scanKey);
 
 		tuple = systable_getnext(scan);
 
@@ -1751,20 +1724,9 @@ remove_dbtablespaces(Oid db_id)
 	Relation	rel;
 	HeapScanDesc scan;
 	HeapTuple	tuple;
-	Snapshot	snapshot;
-
-	/*
-	 * As in createdb(), we'd better use an MVCC snapshot here, since this
-	 * scan can run for a long time.  Duplicate visits to tablespaces would be
-	 * harmless, but missing a tablespace could result in permanently leaked
-	 * files.
-	 *
-	 * XXX change this when a generic fix for SnapshotNow races is implemented
-	 */
-	snapshot = RegisterSnapshot(GetLatestSnapshot());
 
 	rel = heap_open(TableSpaceRelationId, AccessShareLock);
-	scan = heap_beginscan(rel, snapshot, 0, NULL);
+	scan = heap_beginscan_catalog(rel, 0, NULL);
 	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
 	{
 		Oid			dsttablespace = HeapTupleGetOid(tuple);
@@ -1810,7 +1772,6 @@ remove_dbtablespaces(Oid db_id)
 
 	heap_endscan(scan);
 	heap_close(rel, AccessShareLock);
-	UnregisterSnapshot(snapshot);
 }
 
 /*
@@ -1832,19 +1793,9 @@ check_db_file_conflict(Oid db_id)
 	Relation	rel;
 	HeapScanDesc scan;
 	HeapTuple	tuple;
-	Snapshot	snapshot;
-
-	/*
-	 * As in createdb(), we'd better use an MVCC snapshot here; missing a
-	 * tablespace could result in falsely reporting the OID is unique, with
-	 * disastrous future consequences per the comment above.
-	 *
-	 * XXX change this when a generic fix for SnapshotNow races is implemented
-	 */
-	snapshot = RegisterSnapshot(GetLatestSnapshot());
 
 	rel = heap_open(TableSpaceRelationId, AccessShareLock);
-	scan = heap_beginscan(rel, snapshot, 0, NULL);
+	scan = heap_beginscan_catalog(rel, 0, NULL);
 	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
 	{
 		Oid			dsttablespace = HeapTupleGetOid(tuple);
@@ -1870,7 +1821,6 @@ check_db_file_conflict(Oid db_id)
 
 	heap_endscan(scan);
 	heap_close(rel, AccessShareLock);
-	UnregisterSnapshot(snapshot);
 
 	return result;
 }
@@ -1927,7 +1877,7 @@ get_database_oid(const char *dbname, bool missing_ok)
 				BTEqualStrategyNumber, F_NAMEEQ,
 				CStringGetDatum(dbname));
 	scan = systable_beginscan(pg_database, DatabaseNameIndexId, true,
-							  SnapshotNow, 1, entry);
+							  NULL, 1, entry);
 
 	dbtuple = systable_getnext(scan);
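
For readers skimming the hunks above, the change can be condensed into a before/after sketch. The two functions below are illustrative only: the function names are hypothetical, the per-tuple work is elided, and the code assumes the PostgreSQL 9.3-era backend headers. They restate the calling-convention change the diff makes: heap_beginscan_catalog() takes over the job of supplying an MVCC snapshot for whole-table catalog scans, so the caller no longer registers one itself.

```c
#include "postgres.h"

#include "access/heapam.h"
#include "catalog/pg_tablespace.h"
#include "utils/snapmgr.h"

/* Removed pattern: the caller takes and registers its own MVCC snapshot. */
static void
scan_tablespaces_old_style(void)
{
	Snapshot	snapshot;
	Relation	rel;
	HeapScanDesc scan;
	HeapTuple	tuple;

	snapshot = RegisterSnapshot(GetLatestSnapshot());
	rel = heap_open(TableSpaceRelationId, AccessShareLock);
	scan = heap_beginscan(rel, snapshot, 0, NULL);
	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
	{
		/* ... per-tablespace work ... */
	}
	heap_endscan(scan);
	heap_close(rel, AccessShareLock);
	UnregisterSnapshot(snapshot);
}

/* New pattern: heap_beginscan_catalog() supplies the snapshot itself. */
static void
scan_tablespaces_new_style(void)
{
	Relation	rel;
	HeapScanDesc scan;
	HeapTuple	tuple;

	rel = heap_open(TableSpaceRelationId, AccessShareLock);
	scan = heap_beginscan_catalog(rel, 0, NULL);
	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
	{
		/* ... per-tablespace work ... */
	}
	heap_endscan(scan);
	heap_close(rel, AccessShareLock);
}
```

Indexed catalog lookups follow the same pattern in the hunks above: where the old code passed SnapshotNow to systable_beginscan(), the new code passes NULL and lets the systable machinery choose an appropriate catalog snapshot.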