-rw-r--r--  src/backend/commands/tablecmds.c   | 11
-rw-r--r--  src/backend/storage/smgr/md.c      | 27
-rw-r--r--  src/backend/utils/cache/relcache.c | 66
-rw-r--r--  src/bin/pg_dump/pg_dump.c          | 31
4 files changed, 120 insertions, 15 deletions
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index 7fbee0c1f71..e7aef2f6b08 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -40,6 +40,7 @@
 #include "catalog/pg_depend.h"
 #include "catalog/pg_foreign_table.h"
 #include "catalog/pg_inherits.h"
+#include "catalog/pg_largeobject.h"
 #include "catalog/pg_namespace.h"
 #include "catalog/pg_opclass.h"
 #include "catalog/pg_statistic_ext.h"
@@ -2185,7 +2186,15 @@ truncate_check_rel(Oid relid, Form_pg_class reltuple)
                 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
                  errmsg("\"%s\" is not a table", relname)));

-    if (!allowSystemTableMods && IsSystemClass(relid, reltuple))
+    /*
+     * Most system catalogs can't be truncated at all, or at least not unless
+     * allow_system_table_mods=on. As an exception, however, we allow
+     * pg_largeobject to be truncated as part of pg_upgrade, because we need
+     * to change its relfilenode to match the old cluster, and allowing a
+     * TRUNCATE command to be executed is the easiest way of doing that.
+     */
+    if (!allowSystemTableMods && IsSystemClass(relid, reltuple)
+        && (!IsBinaryUpgrade || relid != LargeObjectRelationId))
         ereport(ERROR,
                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
                  errmsg("permission denied: \"%s\" is a system catalog",
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 3998296a62f..3deac496eed 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -319,6 +319,7 @@ mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forkNum, bool isRedo)
 {
     char       *path;
     int         ret;
+    BlockNumber segno = 0;

     path = relpath(rlocator, forkNum);

@@ -353,8 +354,22 @@ mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forkNum, bool isRedo)
         /* Prevent other backends' fds from holding on to the disk space */
         ret = do_truncate(path);

-        /* Register request to unlink first segment later */
-        register_unlink_segment(rlocator, forkNum, 0 /* first seg */ );
+        /*
+         * Except during a binary upgrade, register request to unlink first
+         * segment later, rather than now.
+         *
+         * If we're performing a binary upgrade, the dangers described in the
+         * header comments for mdunlink() do not exist, since after a crash
+         * or even a simple ERROR, the upgrade fails and the whole new cluster
+         * must be recreated from scratch. And, on the other hand, it is
+         * important to remove the files from disk immediately, because we
+         * may be about to reuse the same relfilenumber.
+         */
+        if (!IsBinaryUpgrade)
+        {
+            register_unlink_segment(rlocator, forkNum, 0 /* first seg */ );
+            ++segno;
+        }
     }

     /*
@@ -363,15 +378,17 @@ mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forkNum, bool isRedo)
     if (ret >= 0)
     {
         char       *segpath = (char *) palloc(strlen(path) + 12);
-        BlockNumber segno;

         /*
          * Note that because we loop until getting ENOENT, we will correctly
          * remove all inactive segments as well as active ones.
          */
-        for (segno = 1;; segno++)
+        for (;; segno++)
         {
-            sprintf(segpath, "%s.%u", path, segno);
+            if (segno == 0)
+                strcpy(segpath, path);
+            else
+                sprintf(segpath, "%s.%u", path, segno);

             if (!RelFileLocatorBackendIsTemp(rlocator))
             {
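The md.c change above relies on the segment naming scheme: segment 0 is the bare relation file and segment N > 0 is "<path>.N". The standalone sketch below is not PostgreSQL code; a fixed count and printf stand in for the real unlink-until-ENOENT loop, and "base/5/16384" is just an illustrative path. It shows which names the cleanup loop visits when segno starts at 1 (the normal case, where the base file is left to the deferred register_unlink_segment() request) versus 0 (the binary-upgrade case, where the base file is removed immediately).

#include <stdio.h>
#include <string.h>

/*
 * Mirror of the segment-path logic in the mdunlinkfork() hunk above:
 * segment 0 has no suffix, later segments are "<path>.<segno>".
 */
static void
show_segments_removed(const char *path, unsigned int start_segno,
                      unsigned int nsegs)
{
    char        segpath[64];
    unsigned int segno;

    for (segno = start_segno; segno < nsegs; segno++)
    {
        if (segno == 0)
            strcpy(segpath, path);      /* first segment: bare path */
        else
            sprintf(segpath, "%s.%u", path, segno);
        printf("  would unlink: %s\n", segpath);
    }
}

int
main(void)
{
    printf("normal case, loop starts at segment 1:\n");
    show_segments_removed("base/5/16384", 1, 3);

    printf("binary upgrade, loop starts at segment 0:\n");
    show_segments_removed("base/5/16384", 0, 3);
    return 0;
}

Starting at 0 only routes the base file through the same immediate-unlink path as the higher segments; as the new comment explains, that is safe during pg_upgrade because any failure means the new cluster is thrown away, and it matters because the same relfilenumber may be reassigned right afterwards.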
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index bdb771d278f..00dc0f24037 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -41,6 +41,7 @@
 #include "access/tupdesc_details.h"
 #include "access/xact.h"
 #include "access/xlog.h"
+#include "catalog/binary_upgrade.h"
 #include "catalog/catalog.h"
 #include "catalog/indexing.h"
 #include "catalog/namespace.h"
@@ -3707,9 +3708,36 @@ RelationSetNewRelfilenumber(Relation relation, char persistence)
     TransactionId freezeXid = InvalidTransactionId;
     RelFileLocator newrlocator;

-    /* Allocate a new relfilenumber */
-    newrelfilenumber = GetNewRelFileNumber(relation->rd_rel->reltablespace,
-                                           NULL, persistence);
+    if (!IsBinaryUpgrade)
+    {
+        /* Allocate a new relfilenumber */
+        newrelfilenumber = GetNewRelFileNumber(relation->rd_rel->reltablespace,
+                                               NULL, persistence);
+    }
+    else if (relation->rd_rel->relkind == RELKIND_INDEX)
+    {
+        if (!OidIsValid(binary_upgrade_next_index_pg_class_relfilenumber))
+            ereport(ERROR,
+                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                     errmsg("index relfilenumber value not set when in binary upgrade mode")));
+
+        newrelfilenumber = binary_upgrade_next_index_pg_class_relfilenumber;
+        binary_upgrade_next_index_pg_class_relfilenumber = InvalidOid;
+    }
+    else if (relation->rd_rel->relkind == RELKIND_RELATION)
+    {
+        if (!OidIsValid(binary_upgrade_next_heap_pg_class_relfilenumber))
+            ereport(ERROR,
+                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                     errmsg("heap relfilenumber value not set when in binary upgrade mode")));
+
+        newrelfilenumber = binary_upgrade_next_heap_pg_class_relfilenumber;
+        binary_upgrade_next_heap_pg_class_relfilenumber = InvalidOid;
+    }
+    else
+        ereport(ERROR,
+                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                 errmsg("unexpected request for new relfilenumber in binary upgrade mode")));

     /*
      * Get a writable copy of the pg_class tuple for the given relation.
@@ -3724,9 +3752,37 @@ RelationSetNewRelfilenumber(Relation relation, char persistence)
     classform = (Form_pg_class) GETSTRUCT(tuple);

     /*
-     * Schedule unlinking of the old storage at transaction commit.
+     * Schedule unlinking of the old storage at transaction commit, except
+     * when performing a binary upgrade, when we must do it immediately.
      */
-    RelationDropStorage(relation);
+    if (IsBinaryUpgrade)
+    {
+        SMgrRelation srel;
+
+        /*
+         * During a binary upgrade, we use this code path to ensure that
+         * pg_largeobject and its index have the same relfilenumbers as in
+         * the old cluster. This is necessary because pg_upgrade treats
+         * pg_largeobject like a user table, not a system table. It is however
+         * possible that a table or index may need to end up with the same
+         * relfilenumber in the new cluster as what it had in the old cluster.
+         * Hence, we can't wait until commit time to remove the old storage.
+         *
+         * In general, this function needs to have transactional semantics,
+         * and removing the old storage before commit time surely isn't.
+         * However, it doesn't really matter, because if a binary upgrade
+         * fails at this stage, the new cluster will need to be recreated
+         * anyway.
+         */
+        srel = smgropen(relation->rd_locator, relation->rd_backend);
+        smgrdounlinkall(&srel, 1, false);
+        smgrclose(srel);
+    }
+    else
+    {
+        /* Not a binary upgrade, so just schedule it to happen later. */
+        RelationDropStorage(relation);
+    }

     /*
      * Create storage for the main fork of the new relfilenumber. If it's a
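The new IsBinaryUpgrade branch in RelationSetNewRelfilenumber() assumes a one-shot handshake: something earlier in the session stashes the desired relfilenumber in binary_upgrade_next_heap_pg_class_relfilenumber or binary_upgrade_next_index_pg_class_relfilenumber, and the next rewrite of a heap or index consumes it exactly once, failing if nothing is pending. Below is a simplified standalone model of that pattern, not the PostgreSQL implementation: the setter name and the value 16406 are invented for illustration, and only the heap case is modeled.

#include <stdio.h>
#include <stdlib.h>

typedef unsigned int RelFileNumber;     /* simplified stand-in for the real type */
#define InvalidRelFileNumber 0

static RelFileNumber next_heap_relfilenumber = InvalidRelFileNumber;

/* Roughly what a "set next relfilenumber" support call would do: stash it. */
static void
set_next_heap_relfilenumber(RelFileNumber rfn)
{
    next_heap_relfilenumber = rfn;
}

/* Roughly the shape of the new binary-upgrade branch: use it once, then reset. */
static RelFileNumber
consume_next_heap_relfilenumber(void)
{
    RelFileNumber rfn = next_heap_relfilenumber;

    if (rfn == InvalidRelFileNumber)
    {
        fprintf(stderr, "heap relfilenumber value not set when in binary upgrade mode\n");
        exit(1);
    }
    next_heap_relfilenumber = InvalidRelFileNumber;     /* one-shot */
    return rfn;
}

int
main(void)
{
    set_next_heap_relfilenumber(16406); /* value carried over from the old cluster */
    printf("rewrite uses relfilenumber %u\n", consume_next_heap_relfilenumber());

    /* A second rewrite without a fresh setter call would hit the error path. */
    return 0;
}

Resetting the variable after use is what forces each preserved relfilenumber to be set explicitly right before the rewrite that should consume it.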
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index f9c51d1e679..25742a0e2ad 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -3141,6 +3141,7 @@ dumpDatabase(Archive *fout)
         PGresult   *lo_res;
         PQExpBuffer loFrozenQry = createPQExpBuffer();
         PQExpBuffer loOutQry = createPQExpBuffer();
+        PQExpBuffer loVacQry = createPQExpBuffer();
         int         i_relfrozenxid,
                     i_relfilenode,
                     i_oid,
@@ -3167,15 +3168,36 @@ dumpDatabase(Archive *fout)
         i_relfilenode = PQfnumber(lo_res, "relfilenode");
         i_oid = PQfnumber(lo_res, "oid");

-        appendPQExpBufferStr(loOutQry, "\n-- For binary upgrade, preserve values for pg_largeobject and its index\n");
+        appendPQExpBufferStr(loOutQry, "\n-- For binary upgrade, set pg_largeobject relfrozenxid and relminmxid\n");
+        appendPQExpBufferStr(loVacQry, "\n-- For binary upgrade, preserve pg_largeobject and index relfilenodes\n");
         for (int i = 0; i < PQntuples(lo_res); ++i)
+        {
+            Oid         oid;
+            RelFileNumber relfilenumber;
+
             appendPQExpBuffer(loOutQry, "UPDATE pg_catalog.pg_class\n"
-                              "SET relfrozenxid = '%u', relminmxid = '%u', relfilenode = '%u'\n"
+                              "SET relfrozenxid = '%u', relminmxid = '%u'\n"
                               "WHERE oid = %u;\n",
                               atooid(PQgetvalue(lo_res, i, i_relfrozenxid)),
                               atooid(PQgetvalue(lo_res, i, i_relminmxid)),
-                              atooid(PQgetvalue(lo_res, i, i_relfilenode)),
-                              atooid(PQgetvalue(lo_res, i, i_oid)));
+                              atooid(PQgetvalue(lo_res, i, i_oid)));
+
+            oid = atooid(PQgetvalue(lo_res, i, i_oid));
+            relfilenumber = atooid(PQgetvalue(lo_res, i, i_relfilenode));
+
+            if (oid == LargeObjectRelationId)
+                appendPQExpBuffer(loVacQry,
+                                  "SELECT pg_catalog.binary_upgrade_set_next_heap_relfilenode('%u'::pg_catalog.oid);\n",
+                                  relfilenumber);
+            else if (oid == LargeObjectLOidPNIndexId)
+                appendPQExpBuffer(loVacQry,
+                                  "SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('%u'::pg_catalog.oid);\n",
+                                  relfilenumber);
+        }
+
+        appendPQExpBufferStr(loVacQry,
+                             "TRUNCATE pg_catalog.pg_largeobject;\n");
+        appendPQExpBufferStr(loOutQry, loVacQry->data);

         ArchiveEntry(fout, nilCatalogId, createDumpId(),
                      ARCHIVE_OPTS(.tag = "pg_largeobject",
@@ -3187,6 +3209,7 @@ dumpDatabase(Archive *fout)

         destroyPQExpBuffer(loFrozenQry);
         destroyPQExpBuffer(loOutQry);
+        destroyPQExpBuffer(loVacQry);
     }

     PQclear(res);
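For reference, this is roughly the restore-script fragment the new pg_dump code assembles into the "pg_largeobject" archive entry. The statement shapes come from the format strings in the hunk above; plain printf stands in for the PQExpBuffer calls, only one UPDATE is shown although the loop emits one per row returned for pg_largeobject and its index, and every numeric value (xid, mxid, OID, relfilenode) is a made-up example.

#include <stdio.h>

int
main(void)
{
    /* loOutQry part: carry over the freeze horizons. */
    printf("\n-- For binary upgrade, set pg_largeobject relfrozenxid and relminmxid\n");
    printf("UPDATE pg_catalog.pg_class\n"
           "SET relfrozenxid = '%u', relminmxid = '%u'\n"
           "WHERE oid = %u;\n", 717u, 1u, 2613u);

    /* loVacQry part: queue the old relfilenodes, then force a rewrite via TRUNCATE. */
    printf("\n-- For binary upgrade, preserve pg_largeobject and index relfilenodes\n");
    printf("SELECT pg_catalog.binary_upgrade_set_next_heap_relfilenode('%u'::pg_catalog.oid);\n",
           16406u);
    printf("SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('%u'::pg_catalog.oid);\n",
           16409u);
    printf("TRUNCATE pg_catalog.pg_largeobject;\n");
    return 0;
}

The ordering matters: both setters run before the single TRUNCATE, which ties the pieces of this patch together, since the TRUNCATE is only permitted because of the tablecmds.c change, the rewrite consumes the queued relfilenumbers through the relcache.c change, and the md.c change unlinks the old files immediately so those numbers can be reused.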