diff options
author | Robert Haas <rhaas@postgresql.org> | 2022-09-27 13:25:21 -0400 |
---|---|---|
committer | Robert Haas <rhaas@postgresql.org> | 2022-09-27 13:25:21 -0400 |
commit | 05d4cbf9b6ba708858984b01ca0fc56d59d4ec7c (patch) | |
tree | 645e3ac17f002ae33e086dbf871c330986452c35 /src/backend/access/transam/xlog.c | |
parent | 2f47715cc8649f854b1df28dfc338af9801db217 (diff) | |
download | postgresql-05d4cbf9b6ba708858984b01ca0fc56d59d4ec7c.tar.gz postgresql-05d4cbf9b6ba708858984b01ca0fc56d59d4ec7c.zip |
Increase width of RelFileNumbers from 32 bits to 56 bits.
RelFileNumbers are now assigned using a separate counter, instead of
being assigned from the OID counter. This counter never wraps around:
if all 2^56 possible RelFileNumbers are used, an internal error
occurs. As the cluster is limited to 2^64 total bytes of WAL, this
limitation should not cause a problem in practice.
If the counter were 64 bits wide rather than 56 bits wide, we would
need to increase the width of the BufferTag, which might adversely
impact buffer lookup performance. Also, this lets us use bigint for
pg_class.relfilenode and other places where these values are exposed
at the SQL level without worrying about overflow.
This should remove the need to keep "tombstone" files around until
the next checkpoint when relations are removed. We do that to keep
RelFileNumbers from being recycled, but now that won't happen
anyway. However, this patch doesn't actually change anything in
this area; it just makes it possible for a future patch to do so.
Dilip Kumar, based on an idea from Andres Freund, who also reviewed
some earlier versions of the patch. Further review and some
wordsmithing by me. Also reviewed at various points by Ashutosh
Sharma, Vignesh C, Amul Sul, Álvaro Herrera, and Tom Lane.
Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com
Diffstat (limited to 'src/backend/access/transam/xlog.c')
-rw-r--r-- | src/backend/access/transam/xlog.c | 60 |
1 files changed, 60 insertions, 0 deletions
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 1dd6df0fe15..dff9b8d2366 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -4712,6 +4712,7 @@ BootStrapXLOG(void) checkPoint.nextXid = FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId); checkPoint.nextOid = FirstGenbkiObjectId; + checkPoint.nextRelFileNumber = FirstNormalRelFileNumber; checkPoint.nextMulti = FirstMultiXactId; checkPoint.nextMultiOffset = 0; checkPoint.oldestXid = FirstNormalTransactionId; @@ -4725,7 +4726,11 @@ BootStrapXLOG(void) ShmemVariableCache->nextXid = checkPoint.nextXid; ShmemVariableCache->nextOid = checkPoint.nextOid; + ShmemVariableCache->nextRelFileNumber = checkPoint.nextRelFileNumber; ShmemVariableCache->oidCount = 0; + ShmemVariableCache->loggedRelFileNumber = checkPoint.nextRelFileNumber; + ShmemVariableCache->flushedRelFileNumber = checkPoint.nextRelFileNumber; + ShmemVariableCache->loggedRelFileNumberRecPtr = InvalidXLogRecPtr; MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); AdvanceOldestClogXid(checkPoint.oldestXid); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); @@ -5191,7 +5196,10 @@ StartupXLOG(void) /* initialize shared memory variables from the checkpoint record */ ShmemVariableCache->nextXid = checkPoint.nextXid; ShmemVariableCache->nextOid = checkPoint.nextOid; + ShmemVariableCache->nextRelFileNumber = checkPoint.nextRelFileNumber; ShmemVariableCache->oidCount = 0; + ShmemVariableCache->loggedRelFileNumber = checkPoint.nextRelFileNumber; + ShmemVariableCache->flushedRelFileNumber = checkPoint.nextRelFileNumber; MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); AdvanceOldestClogXid(checkPoint.oldestXid); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); @@ -6663,6 +6671,24 @@ CreateCheckPoint(int flags) checkPoint.nextOid += ShmemVariableCache->oidCount; LWLockRelease(OidGenLock); + /* + * If this 
is a shutdown checkpoint then we can safely start allocating + * relfilenumber from the nextRelFileNumber value after the restart because + * no one else can use the relfilenumber beyond that number before the + * shutdown. OTOH, if it is a normal checkpoint then if there is a crash + * after this point then we might end up reusing the same relfilenumbers + * after the restart so we need to set the nextRelFileNumber to the already + * logged relfilenumber as no one will use number beyond this limit without + * logging again. + */ + LWLockAcquire(RelFileNumberGenLock, LW_SHARED); + if (shutdown) + checkPoint.nextRelFileNumber = ShmemVariableCache->nextRelFileNumber; + else + checkPoint.nextRelFileNumber = ShmemVariableCache->loggedRelFileNumber; + + LWLockRelease(RelFileNumberGenLock); + MultiXactGetCheckptMulti(shutdown, &checkPoint.nextMulti, &checkPoint.nextMultiOffset, @@ -7541,6 +7567,24 @@ XLogPutNextOid(Oid nextOid) } /* + * Similar to the XLogPutNextOid but instead of writing NEXTOID log record it + * writes a NEXT_RELFILENUMBER log record. It also returns the XLogRecPtr of + * the currently logged relfilenumber record, so that the caller can flush it + * at the appropriate time. + */ +XLogRecPtr +LogNextRelFileNumber(RelFileNumber nextrelnumber) +{ + XLogRecPtr recptr; + + XLogBeginInsert(); + XLogRegisterData((char *) (&nextrelnumber), sizeof(RelFileNumber)); + recptr = XLogInsert(RM_XLOG_ID, XLOG_NEXT_RELFILENUMBER); + + return recptr; +} + +/* * Write an XLOG SWITCH record. * * Here we just blindly issue an XLogInsert request for the record. 
@@ -7755,6 +7799,17 @@ xlog_redo(XLogReaderState *record) ShmemVariableCache->oidCount = 0; LWLockRelease(OidGenLock); } + if (info == XLOG_NEXT_RELFILENUMBER) + { + RelFileNumber nextRelFileNumber; + + memcpy(&nextRelFileNumber, XLogRecGetData(record), sizeof(RelFileNumber)); + LWLockAcquire(RelFileNumberGenLock, LW_EXCLUSIVE); + ShmemVariableCache->nextRelFileNumber = nextRelFileNumber; + ShmemVariableCache->loggedRelFileNumber = nextRelFileNumber; + ShmemVariableCache->flushedRelFileNumber = nextRelFileNumber; + LWLockRelease(RelFileNumberGenLock); + } else if (info == XLOG_CHECKPOINT_SHUTDOWN) { CheckPoint checkPoint; @@ -7769,6 +7824,11 @@ xlog_redo(XLogReaderState *record) ShmemVariableCache->nextOid = checkPoint.nextOid; ShmemVariableCache->oidCount = 0; LWLockRelease(OidGenLock); + LWLockAcquire(RelFileNumberGenLock, LW_EXCLUSIVE); + ShmemVariableCache->nextRelFileNumber = checkPoint.nextRelFileNumber; + ShmemVariableCache->loggedRelFileNumber = checkPoint.nextRelFileNumber; + ShmemVariableCache->flushedRelFileNumber = checkPoint.nextRelFileNumber; + LWLockRelease(RelFileNumberGenLock); MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); |