5 files changed, 100 insertions, 38 deletions
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index d521fa6e7b1..401d805a8f5 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.343 2009/06/11 14:48:54 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.344 2009/06/25 21:36:00 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -4912,6 +4912,7 @@ exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
 {
 	char		recoveryPath[MAXPGPATH];
 	char		xlogpath[MAXPGPATH];
+	XLogRecPtr	InvalidXLogRecPtr = {0, 0};
 
 	/*
 	 * We are no longer in archive recovery state.
@@ -4919,6 +4920,11 @@ exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
 	InArchiveRecovery = false;
 
 	/*
+	 * Update min recovery point one last time.
+	 */
+	UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
+
+	/*
 	 * We should have the ending log segment currently open.  Verify, and then
 	 * close it (to avoid problems on Windows with trying to rename or delete
 	 * an open file).
@@ -5156,6 +5162,7 @@ StartupXLOG(void)
 	XLogRecord *record;
 	uint32		freespace;
 	TransactionId oldestActiveXID;
+	bool		bgwriterLaunched = false;
 
 	XLogCtl->SharedRecoveryInProgress = true;
 
@@ -5472,7 +5479,11 @@ StartupXLOG(void)
 			 * process in addition to postmaster!
 			 */
 			if (InArchiveRecovery && IsUnderPostmaster)
+			{
+				SetForwardFsyncRequests();
 				SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
+				bgwriterLaunched = true;
+			}
 
 			/*
 			 * main redo apply loop
@@ -5709,12 +5720,6 @@ StartupXLOG(void)
 	/* Pre-scan prepared transactions to find out the range of XIDs present */
 	oldestActiveXID = PrescanPreparedTransactions();
 
-	/*
-	 * Allow writing WAL for us, so that we can create a checkpoint record.
-	 * But not yet for other backends!
-	 */
-	LocalRecoveryInProgress = false;
-
 	if (InRecovery)
 	{
 		int			rmid;
@@ -5743,7 +5748,12 @@ StartupXLOG(void)
 		 * the rule that TLI only changes in shutdown checkpoints, which
 		 * allows some extra error checking in xlog_redo.
 		 */
-		CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
+		if (bgwriterLaunched)
+			RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
+							  CHECKPOINT_IMMEDIATE |
+							  CHECKPOINT_WAIT);
+		else
+			CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
 
 		/*
 		 * And finally, execute the recovery_end_command, if any.
@@ -5806,7 +5816,7 @@ StartupXLOG(void)
 	}
 
 	/*
-	 * All done. Allow others to write WAL.
+	 * All done. Allow backends to write WAL.
 	 */
 	XLogCtl->SharedRecoveryInProgress = false;
 }
@@ -6123,12 +6133,13 @@ LogCheckpointStart(int flags, bool restartpoint)
 	 * the main message, but what about all the flags?
 	 */
 	if (restartpoint)
-		msg = "restartpoint starting:%s%s%s%s%s%s";
+		msg = "restartpoint starting:%s%s%s%s%s%s%s";
 	else
-		msg = "checkpoint starting:%s%s%s%s%s%s";
+		msg = "checkpoint starting:%s%s%s%s%s%s%s";
 
 	elog(LOG, msg,
 		 (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
+		 (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
 		 (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
 		 (flags & CHECKPOINT_FORCE) ? " force" : "",
 		 (flags & CHECKPOINT_WAIT) ? " wait" : "",
@@ -6190,10 +6201,12 @@ LogCheckpointEnd(bool restartpoint)
  *
  * flags is a bitwise OR of the following:
  *	CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
+ *	CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
  *	CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
  *		ignoring checkpoint_completion_target parameter.
  *	CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occured
- *		since the last one (implied by CHECKPOINT_IS_SHUTDOWN).
+ *		since the last one (implied by CHECKPOINT_IS_SHUTDOWN and
+ *		CHECKPOINT_END_OF_RECOVERY).
  *
  * Note: flags contains other bits, of interest here only for logging purposes.
  * In particular note that this routine is synchronous and does not pay
@@ -6202,7 +6215,7 @@ LogCheckpointEnd(bool restartpoint)
 void
 CreateCheckPoint(int flags)
 {
-	bool		shutdown = (flags & CHECKPOINT_IS_SHUTDOWN) != 0;
+	bool		shutdown;
 	CheckPoint	checkPoint;
 	XLogRecPtr	recptr;
 	XLogCtlInsert *Insert = &XLogCtl->Insert;
@@ -6212,36 +6225,54 @@ CreateCheckPoint(int flags)
 	uint32		_logSeg;
 	TransactionId *inCommitXids;
 	int			nInCommit;
+	bool		OldInRecovery = InRecovery;
 
-	/* shouldn't happen */
-	if (RecoveryInProgress())
-		elog(ERROR, "can't create a checkpoint during recovery");
+	/*
+	 * An end-of-recovery checkpoint is really a shutdown checkpoint, just
+	 * issued at a different time.
+	 */
+	if (flags & ((CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY) != 0))
+		shutdown = true;
+	else
+		shutdown = false;
 
 	/*
-	 * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
-	 * During normal operation, bgwriter is the only process that creates
-	 * checkpoints, but at the end of archive recovery, the bgwriter can be
-	 * busy creating a restartpoint while the startup process tries to perform
-	 * the startup checkpoint.
+	 * A startup checkpoint is created before anyone else is allowed to
+	 * write WAL. To allow us to write the checkpoint record, set
+	 * LocalRecoveryInProgress to false. This lets us write WAL, but others
+	 * are still not allowed to do so.
 	 */
-	if (!LWLockConditionalAcquire(CheckpointLock, LW_EXCLUSIVE))
+	if (flags & CHECKPOINT_END_OF_RECOVERY)
 	{
-		Assert(InRecovery);
+		Assert(RecoveryInProgress());
+		LocalRecoveryInProgress = false;
+		InitXLOGAccess();
 
 		/*
-		 * A restartpoint is in progress. Wait until it finishes. This can
-		 * cause an extra restartpoint to be performed, but that's OK because
-		 * we're just about to perform a checkpoint anyway. Flushing the
-		 * buffers in this restartpoint can take some time, but that time is
-		 * saved from the upcoming checkpoint so the net effect is zero.
+		 * Before 8.4, end-of-recovery checkpoints were always performed by
+		 * the startup process, and InRecovery was set true. InRecovery is not
+		 * normally set in bgwriter, but we set it here temporarily to avoid
+		 * confusing old code in the end-of-recovery checkpoint code path that
+		 * rely on it.
 		 */
-		ereport(DEBUG2, (errmsg("hurrying in-progress restartpoint")));
-		RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT);
-
-		LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
+		InRecovery = true;
+	}
+	else
+	{
+		/* shouldn't happen */
+		if (RecoveryInProgress())
+			elog(ERROR, "can't create a checkpoint during recovery");
 	}
 
 	/*
+	 * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
+	 * (This is just pro forma, since in the present system structure there is
+	 * only one process that is allowed to issue checkpoints at any given
+	 * time.)
+	 */
+	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
+
+	/*
 	 * Prepare to accumulate statistics.
 	 *
 	 * Note: because it is possible for log_checkpoints to change while a
@@ -6298,7 +6329,8 @@ CreateCheckPoint(int flags)
 	 * the end of the last checkpoint record, and its redo pointer must point
 	 * to itself.
 	 */
-	if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_FORCE)) == 0)
+	if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
+				  CHECKPOINT_FORCE)) == 0)
 	{
 		XLogRecPtr	curInsert;
 
@@ -6542,6 +6574,9 @@ CreateCheckPoint(int flags)
 									 CheckpointStats.ckpt_segs_recycled);
 
 	LWLockRelease(CheckpointLock);
+
+	/* Restore old value */
+	InRecovery = OldInRecovery;
 }
 
 /*
diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index 629565a8104..b5fd31532e6 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -37,7 +37,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.60 2009/06/11 14:49:01 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.61 2009/06/25 21:36:00 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -449,6 +449,13 @@ BackgroundWriterMain(void)
 			SpinLockRelease(&bgs->ckpt_lck);
 
 			/*
+			 * The end-of-recovery checkpoint is a real checkpoint that's
+			 * performed while we're still in recovery.
+			 */
+			if (flags & CHECKPOINT_END_OF_RECOVERY)
+				do_restartpoint = false;
+
+			/*
 			 * We will warn if (a) too soon since last checkpoint (whatever
 			 * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
 			 * since the last checkpoint start.  Note in particular that this
@@ -895,10 +902,12 @@ BgWriterShmemInit(void)
  *
  * flags is a bitwise OR of the following:
  *	CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
+ *	CHECKPOINT_END_OF_RECOVERY: checkpoint is to finish WAL recovery.
  *	CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
  *		ignoring checkpoint_completion_target parameter.
  *	CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occured
- *		since the last one (implied by CHECKPOINT_IS_SHUTDOWN).
+ *		since the last one (implied by CHECKPOINT_IS_SHUTDOWN and
+ *		CHECKPOINT_END_OF_RECOVERY).
  *	CHECKPOINT_WAIT: wait for completion before returning (otherwise,
  *		just signal bgwriter to do it, and return).
  *	CHECKPOINT_CAUSE_XLOG: checkpoint is requested due to xlog filling.
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 8655e90d29f..18402a6ad61 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.146 2009/06/11 14:49:02 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.147 2009/06/25 21:36:00 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -204,6 +204,21 @@ mdinit(void)
 }
 
 /*
+ * In archive recovery, we rely on bgwriter to do fsyncs(), but we don't
+ * know that we do archive recovery at process startup when pendingOpsTable
+ * has already been created. Calling this function drops pendingOpsTable
+ * and causes any subsequent requests to be forwarded to bgwriter.
+ */
+void
+SetForwardFsyncRequests(void)
+{
+	/* Perform any pending ops we may have queued up */
+	if (pendingOpsTable)
+		mdsync();
+	pendingOpsTable = NULL;
+}
+
+/*
  *	mdexists() -- Does the physical file exist?
  *
  * Note: this will return true for lingering files, with pending deletions
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index ca1d2361544..ea9e232a08c 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.91 2009/02/18 15:58:41 heikki Exp $
+ * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.92 2009/06/25 21:36:00 heikki Exp $
  */
 #ifndef XLOG_H
 #define XLOG_H
@@ -166,6 +166,8 @@ extern bool XLOG_DEBUG;
 /* These indicate the cause of a checkpoint request */
 #define CHECKPOINT_CAUSE_XLOG	0x0010	/* XLOG consumption */
 #define CHECKPOINT_CAUSE_TIME	0x0020	/* Elapsed time */
+#define CHECKPOINT_END_OF_RECOVERY	0x0040	/* Like shutdown checkpoint, but
+											 * issued at end of WAL recovery */
 
 /* Checkpoint statistics */
 typedef struct CheckpointStatsData
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 1b4b06ac967..1761f1c8089 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.67 2009/06/11 14:49:12 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.68 2009/06/25 21:36:00 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -109,6 +109,7 @@ extern void mdpreckpt(void);
 extern void mdsync(void);
 extern void mdpostckpt(void);
 
+extern void SetForwardFsyncRequests(void);
 extern void RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum,
 					 BlockNumber segno);
 extern void ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum);