From f741300c90141ee274f19a13629ae03a9806b598 Mon Sep 17 00:00:00 2001
From: Alvaro Herrera <alvherre@alvh.no-ip.org>
Date: Fri, 27 Jun 2014 14:43:53 -0400
Subject: Have multixact be truncated by checkpoint, not vacuum

Instead of truncating pg_multixact at vacuum time, do it only at
checkpoint time.  The reason for doing it this way is twofold: first, we
want it to delete only segments that we're certain will not be required
if there's a crash immediately after the removal; and second, we want to
do it relatively often so that older files are not left behind if
there's an untimely crash.

Per my proposal in
http://www.postgresql.org/message-id/20140626044519.GJ7340@eldon.alvh.no-ip.org
we now execute the truncation in the checkpointer process rather than as
part of vacuum.  Vacuum is in only charge of maintaining in shared
memory the value to which it's possible to truncate the files; that
value is stored as part of checkpoints also, and so upon recovery we can
reuse the same value to re-execute truncate and reset the
oldest-value-still-safe-to-use to one known to remain after truncation.

Per bug reported by Jeff Janes in the course of his tests involving
bug #8673.

While at it, update some comments that hadn't been updated since
multixacts were changed.

Backpatch to 9.3, where persistency of pg_multixact files was
introduced by commit 0ac5ad5134f2.
---
 src/backend/access/transam/xlog.c | 44 ++++++++++++++++++++++++++-------------
 1 file changed, 29 insertions(+), 15 deletions(-)

(limited to 'src/backend/access/transam/xlog.c')

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index abc5682e7f9..e5640793eb8 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -6264,6 +6264,7 @@ StartupXLOG(void)
 	MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
 	SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
 	SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
+	MultiXactSetSafeTruncate(checkPoint.oldestMulti);
 	XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
 	XLogCtl->ckptXid = checkPoint.nextXid;
 
@@ -8272,6 +8273,12 @@ CreateCheckPoint(int flags)
 	 */
 	END_CRIT_SECTION();
 
+	/*
+	 * Now that the checkpoint is safely on disk, we can update the point to
+	 * which multixact can be truncated.
+	 */
+	MultiXactSetSafeTruncate(checkPoint.oldestMulti);
+
 	/*
 	 * Let smgr do post-checkpoint cleanup (eg, deleting old files).
 	 */
@@ -8305,6 +8312,11 @@ CreateCheckPoint(int flags)
 	if (!RecoveryInProgress())
 		TruncateSUBTRANS(GetOldestXmin(NULL, false));
 
+	/*
+	 * Truncate pg_multixact too.
+	 */
+	TruncateMultiXact();
+
 	/* Real work is done, but log and update stats before releasing lock. */
 	LogCheckpointEnd(false);
 
@@ -8578,21 +8590,6 @@ CreateRestartPoint(int flags)
 	}
 	LWLockRelease(ControlFileLock);
 
-	/*
-	 * Due to an historical accident multixact truncations are not WAL-logged,
-	 * but just performed everytime the mxact horizon is increased. So, unless
-	 * we explicitly execute truncations on a standby it will never clean out
-	 * /pg_multixact which obviously is bad, both because it uses space and
-	 * because we can wrap around into pre-existing data...
-	 *
-	 * We can only do the truncation here, after the UpdateControlFile()
-	 * above, because we've now safely established a restart point, that
-	 * guarantees we will not need need to access those multis.
-	 *
-	 * It's probably worth improving this.
-	 */
-	TruncateMultiXact(lastCheckPoint.oldestMulti);
-
 	/*
 	 * Delete old log files (those no longer needed even for previous
 	 * checkpoint/restartpoint) to prevent the disk holding the xlog from
@@ -8651,6 +8648,21 @@ CreateRestartPoint(int flags)
 			ThisTimeLineID = 0;
 	}
 
+	/*
+	 * Due to an historical accident multixact truncations are not WAL-logged,
+	 * but just performed everytime the mxact horizon is increased. So, unless
+	 * we explicitly execute truncations on a standby it will never clean out
+	 * /pg_multixact which obviously is bad, both because it uses space and
+	 * because we can wrap around into pre-existing data...
+	 *
+	 * We can only do the truncation here, after the UpdateControlFile()
+	 * above, because we've now safely established a restart point.  That
+	 * guarantees we will not need to access those multis.
+	 *
+	 * It's probably worth improving this.
+	 */
+	TruncateMultiXact();
+
 	/*
 	 * Truncate pg_subtrans if possible.  We can throw away all data before
 	 * the oldest XMIN of any running transaction.  No future transaction will
@@ -9117,6 +9129,7 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
 							  checkPoint.nextMultiOffset);
 		SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
 		SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
+		MultiXactSetSafeTruncate(checkPoint.oldestMulti);
 
 		/*
 		 * If we see a shutdown checkpoint while waiting for an end-of-backup
@@ -9217,6 +9230,7 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
 								  checkPoint.oldestXidDB);
 		MultiXactAdvanceOldest(checkPoint.oldestMulti,
 							   checkPoint.oldestMultiDB);
+		MultiXactSetSafeTruncate(checkPoint.oldestMulti);
 
 		/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
 		ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
-- 
cgit v1.2.3