aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/backend/access/transam/multixact.c117
-rw-r--r--src/backend/access/transam/xlog.c44
-rw-r--r--src/backend/commands/vacuum.c8
-rw-r--r--src/include/access/multixact.h3
4 files changed, 112 insertions, 60 deletions
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 2cdfed4945e..9f259bb54eb 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -45,14 +45,17 @@
* anything we saw during replay.
*
* We are able to remove segments no longer necessary by carefully tracking
- * each table's used values: during vacuum, any multixact older than a
- * certain value is removed; the cutoff value is stored in pg_class.
- * The minimum value in each database is stored in pg_database, and the
- * global minimum is part of pg_control. Any vacuum that is able to
- * advance its database's minimum value also computes a new global minimum,
- * and uses this value to truncate older segments. When new multixactid
- * values are to be created, care is taken that the counter does not
- * fall within the wraparound horizon considering the global minimum value.
+ * each table's used values: during vacuum, any multixact older than a certain
+ * value is removed; the cutoff value is stored in pg_class. The minimum value
+ * across all tables in each database is stored in pg_database, and the global
+ * minimum across all databases is part of pg_control and is kept in shared
+ * memory. At checkpoint time, after the value is known to be flushed to WAL,
+ * any files that correspond to multixacts older than that value are removed.
+ * (These files are also removed when a restartpoint is executed.)
+ *
+ * When new multixactid values are to be created, care is taken that the
+ * counter does not fall within the wraparound horizon considering the global
+ * minimum value.
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
@@ -91,7 +94,7 @@
* Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF,
* MultiXact page numbering also wraps around at
* 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at
- * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_SEGMENTS_PER_PAGE. We need
+ * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need
* take no explicit notice of that fact in this module, except when comparing
* segment and page numbers in TruncateMultiXact (see
* MultiXactOffsetPagePrecedes).
@@ -188,16 +191,20 @@ typedef struct MultiXactStateData
/* next-to-be-assigned offset */
MultiXactOffset nextOffset;
- /* the Offset SLRU area was last truncated at this MultiXactId */
- MultiXactId lastTruncationPoint;
-
/*
- * oldest multixact that is still on disk. Anything older than this
- * should not be consulted.
+ * Oldest multixact that is still on disk. Anything older than this
+ * should not be consulted. These values are updated by vacuum.
*/
MultiXactId oldestMultiXactId;
Oid oldestMultiXactDB;
+ /*
+ * This is what the previous checkpoint stored as the truncate position.
+ * This value is the oldestMultiXactId that was valid when a checkpoint
+ * was last executed.
+ */
+ MultiXactId lastCheckpointedOldest;
+
/* support for anti-wraparound measures */
MultiXactId multiVacLimit;
MultiXactId multiWarnLimit;
@@ -234,12 +241,20 @@ typedef struct MultiXactStateData
* than its own OldestVisibleMXactId[] setting; this is necessary because
* the checkpointer could truncate away such data at any instant.
*
- * The checkpointer can compute the safe truncation point as the oldest
- * valid value among all the OldestMemberMXactId[] and
- * OldestVisibleMXactId[] entries, or nextMXact if none are valid.
- * Clearly, it is not possible for any later-computed OldestVisibleMXactId
- * value to be older than this, and so there is no risk of truncating data
- * that is still needed.
+ * The oldest valid value among all of the OldestMemberMXactId[] and
+ * OldestVisibleMXactId[] entries is considered by vacuum as the earliest
+ * possible value still having any live member transaction. Subtracting
+ * vacuum_multixact_freeze_min_age from that value, we obtain the freezing
+ * point for multixacts for that table. Any value older than that is
+ * removed from tuple headers (or "frozen"; see FreezeMultiXactId). Note
+ * that multis that have member xids that are older than the cutoff point
+ * for xids must also be frozen, even if the multis themselves are newer
+ * than the multixid cutoff point. Whenever a full table vacuum happens,
+ * the freezing point so computed is used as the new pg_class.relminmxid
+ * value. The minimum of all those values in a database is stored as
+ * pg_database.datminmxid. In turn, the minimum of all of those values is
+ * stored in pg_control and used as truncation point for pg_multixact. At
+ * checkpoint or restartpoint, unneeded segments are removed.
*/
MultiXactId perBackendXactIds[1]; /* VARIABLE LENGTH ARRAY */
} MultiXactStateData;
@@ -1121,8 +1136,8 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
* We check known limits on MultiXact before resorting to the SLRU area.
*
* An ID older than MultiXactState->oldestMultiXactId cannot possibly be
- * useful; it should have already been removed by vacuum. We've truncated
- * the on-disk structures anyway. Returning the wrong values could lead
+ * useful; it has already been removed, or will be removed shortly, by
+ * truncation. Returning the wrong values could lead
* to an incorrect visibility result. However, to support pg_upgrade we
* need to allow an empty set to be returned regardless, if the caller is
* willing to accept it; the caller is expected to check that it's an
@@ -1932,14 +1947,14 @@ TrimMultiXact(void)
LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
/*
- * (Re-)Initialize our idea of the latest page number.
+ * (Re-)Initialize our idea of the latest page number for offsets.
*/
pageno = MultiXactIdToOffsetPage(multi);
MultiXactOffsetCtl->shared->latest_page_number = pageno;
/*
* Zero out the remainder of the current offsets page. See notes in
- * StartupCLOG() for motivation.
+ * TrimCLOG() for motivation.
*/
entryno = MultiXactIdToOffsetEntry(multi);
if (entryno != 0)
@@ -1962,7 +1977,7 @@ TrimMultiXact(void)
LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
/*
- * (Re-)Initialize our idea of the latest page number.
+ * (Re-)Initialize our idea of the latest page number for members.
*/
pageno = MXOffsetToMemberPage(offset);
MultiXactMemberCtl->shared->latest_page_number = pageno;
@@ -2241,6 +2256,18 @@ MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB)
}
/*
+ * Set the "safe truncation point" consulted by TruncateMultiXact: the newest
+ * value of oldestMulti known to be flushed as part of a checkpoint record.
+ */
+void
+MultiXactSetSafeTruncate(MultiXactId safeTruncateMulti)
+{
+ LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); /* exclusive: we store below */
+ MultiXactState->lastCheckpointedOldest = safeTruncateMulti;
+ LWLockRelease(MultiXactGenLock);
+}
+
+/*
* Make sure that MultiXactOffset has room for a newly-allocated MultiXactId.
*
* NB: this is called while holding MultiXactGenLock. We want it to be very
@@ -2478,25 +2505,31 @@ SlruScanDirCbFindEarliest(SlruCtl ctl, char *filename, int segpage, void *data)
* Remove all MultiXactOffset and MultiXactMember segments before the oldest
* ones still of interest.
*
- * On a primary, this is called by vacuum after it has successfully advanced a
- * database's datminmxid value; the cutoff value we're passed is the minimum of
- * all databases' datminmxid values.
- *
- * During crash recovery, it's called from CreateRestartPoint() instead. We
- * rely on the fact that xlog_redo() will already have called
- * MultiXactAdvanceOldest(). Our latest_page_number will already have been
- * initialized by StartupMultiXact() and kept up to date as new pages are
- * zeroed.
+ * On a primary, this is called by the checkpointer process after a checkpoint
+ * has been flushed; during crash recovery, it's called from
+ * CreateRestartPoint(). In the latter case, we rely on the fact that
+ * xlog_redo() will already have called MultiXactAdvanceOldest(). Our
+ * latest_page_number will already have been initialized by StartupMultiXact()
+ * and kept up to date as new pages are zeroed.
*/
void
-TruncateMultiXact(MultiXactId oldestMXact)
+TruncateMultiXact(void)
{
+ MultiXactId oldestMXact;
MultiXactOffset oldestOffset;
MultiXactOffset nextOffset;
mxtruncinfo trunc;
MultiXactId earliest;
MembersLiveRange range;
+ Assert(AmCheckpointerProcess() || AmStartupProcess() ||
+ !IsPostmasterEnvironment);
+
+ LWLockAcquire(MultiXactGenLock, LW_SHARED);
+ oldestMXact = MultiXactState->lastCheckpointedOldest;
+ LWLockRelease(MultiXactGenLock);
+ Assert(MultiXactIdIsValid(oldestMXact));
+
/*
* Note we can't just plow ahead with the truncation; it's possible that
* there are no segments to truncate, which is a problem because we are
@@ -2507,6 +2540,8 @@ TruncateMultiXact(MultiXactId oldestMXact)
trunc.earliestExistingPage = -1;
SlruScanDirectory(MultiXactOffsetCtl, SlruScanDirCbFindEarliest, &trunc);
earliest = trunc.earliestExistingPage * MULTIXACT_OFFSETS_PER_PAGE;
+ if (earliest < FirstMultiXactId)
+ earliest = FirstMultiXactId;
/* nothing to do */
if (MultiXactIdPrecedes(oldestMXact, earliest))
@@ -2514,8 +2549,7 @@ TruncateMultiXact(MultiXactId oldestMXact)
/*
* First, compute the safe truncation point for MultiXactMember. This is
- * the starting offset of the multixact we were passed as MultiXactOffset
- * cutoff.
+ * the starting offset of the oldest multixact.
*/
{
int pageno;
@@ -2538,10 +2572,6 @@ TruncateMultiXact(MultiXactId oldestMXact)
LWLockRelease(MultiXactOffsetControlLock);
}
- /* truncate MultiXactOffset */
- SimpleLruTruncate(MultiXactOffsetCtl,
- MultiXactIdToOffsetPage(oldestMXact));
-
/*
* To truncate MultiXactMembers, we need to figure out the active page
* range and delete all files outside that range. The start point is the
@@ -2559,6 +2589,11 @@ TruncateMultiXact(MultiXactId oldestMXact)
range.rangeEnd = MXOffsetToMemberPage(nextOffset);
SlruScanDirectory(MultiXactMemberCtl, SlruScanDirCbRemoveMembers, &range);
+
+ /* Now we can truncate MultiXactOffset */
+ SimpleLruTruncate(MultiXactOffsetCtl,
+ MultiXactIdToOffsetPage(oldestMXact));
+
}
/*
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index abc5682e7f9..e5640793eb8 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -6264,6 +6264,7 @@ StartupXLOG(void)
MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
+ MultiXactSetSafeTruncate(checkPoint.oldestMulti);
XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
XLogCtl->ckptXid = checkPoint.nextXid;
@@ -8273,6 +8274,12 @@ CreateCheckPoint(int flags)
END_CRIT_SECTION();
/*
+ * Now that the checkpoint is safely on disk, we can update the point to
+ * which multixact can be truncated.
+ */
+ MultiXactSetSafeTruncate(checkPoint.oldestMulti);
+
+ /*
* Let smgr do post-checkpoint cleanup (eg, deleting old files).
*/
smgrpostckpt();
@@ -8305,6 +8312,11 @@ CreateCheckPoint(int flags)
if (!RecoveryInProgress())
TruncateSUBTRANS(GetOldestXmin(NULL, false));
+ /*
+ * Truncate pg_multixact too.
+ */
+ TruncateMultiXact();
+
/* Real work is done, but log and update stats before releasing lock. */
LogCheckpointEnd(false);
@@ -8579,21 +8591,6 @@ CreateRestartPoint(int flags)
LWLockRelease(ControlFileLock);
/*
- * Due to an historical accident multixact truncations are not WAL-logged,
- * but just performed everytime the mxact horizon is increased. So, unless
- * we explicitly execute truncations on a standby it will never clean out
- * /pg_multixact which obviously is bad, both because it uses space and
- * because we can wrap around into pre-existing data...
- *
- * We can only do the truncation here, after the UpdateControlFile()
- * above, because we've now safely established a restart point, that
- * guarantees we will not need need to access those multis.
- *
- * It's probably worth improving this.
- */
- TruncateMultiXact(lastCheckPoint.oldestMulti);
-
- /*
* Delete old log files (those no longer needed even for previous
* checkpoint/restartpoint) to prevent the disk holding the xlog from
* growing full.
@@ -8652,6 +8649,21 @@ CreateRestartPoint(int flags)
}
/*
+ * Due to an historical accident multixact truncations are not WAL-logged,
+ * but just performed every time the mxact horizon is increased. So, unless
+ * we explicitly execute truncations on a standby it will never clean out
+ * /pg_multixact which obviously is bad, both because it uses space and
+ * because we can wrap around into pre-existing data...
+ *
+ * We can only do the truncation here, after the UpdateControlFile()
+ * above, because we've now safely established a restart point. That
+ * guarantees we will not need to access those multis.
+ *
+ * It's probably worth improving this.
+ */
+ TruncateMultiXact();
+
+ /*
* Truncate pg_subtrans if possible. We can throw away all data before
* the oldest XMIN of any running transaction. No future transaction will
* attempt to reference any pg_subtrans entry older than that (see Asserts
@@ -9117,6 +9129,7 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
checkPoint.nextMultiOffset);
SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
+ MultiXactSetSafeTruncate(checkPoint.oldestMulti);
/*
* If we see a shutdown checkpoint while waiting for an end-of-backup
@@ -9217,6 +9230,7 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
checkPoint.oldestXidDB);
MultiXactAdvanceOldest(checkPoint.oldestMulti,
checkPoint.oldestMultiDB);
+ MultiXactSetSafeTruncate(checkPoint.oldestMulti);
/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index 3d2c73902c6..8822a154dcc 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -969,9 +969,11 @@ vac_truncate_clog(TransactionId frozenXID, MultiXactId minMulti)
return;
}
- /* Truncate CLOG and Multi to the oldest computed value */
+ /*
+ * Truncate CLOG to the oldest computed value. Note we don't truncate
+ * multixacts; that will be done by the next checkpoint.
+ */
TruncateCLOG(frozenXID);
- TruncateMultiXact(minMulti);
/*
* Update the wrap limit for GetNewTransactionId and creation of new
@@ -980,7 +982,7 @@ vac_truncate_clog(TransactionId frozenXID, MultiXactId minMulti)
* signalling twice?
*/
SetTransactionIdLimit(frozenXID, oldestxid_datoid);
- MultiXactAdvanceOldest(minMulti, minmulti_datoid);
+ SetMultiXactIdLimit(minMulti, minmulti_datoid);
}
diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h
index 448ec100d39..f6d2e0418b1 100644
--- a/src/include/access/multixact.h
+++ b/src/include/access/multixact.h
@@ -119,12 +119,13 @@ extern void MultiXactGetCheckptMulti(bool is_shutdown,
Oid *oldestMultiDB);
extern void CheckPointMultiXact(void);
extern MultiXactId GetOldestMultiXactId(void);
-extern void TruncateMultiXact(MultiXactId cutoff_multi);
+extern void TruncateMultiXact(void);
extern void MultiXactSetNextMXact(MultiXactId nextMulti,
MultiXactOffset nextMultiOffset);
extern void MultiXactAdvanceNextMXact(MultiXactId minMulti,
MultiXactOffset minMultiOffset);
extern void MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB);
+extern void MultiXactSetSafeTruncate(MultiXactId safeTruncateMulti);
extern void multixact_twophase_recover(TransactionId xid, uint16 info,
void *recdata, uint32 len);