about | summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Heikki Linnakangas <heikki.linnakangas@iki.fi> 2010-04-13 14:17:46 +0000
committer: Heikki Linnakangas <heikki.linnakangas@iki.fi> 2010-04-13 14:17:46 +0000
commit: 361bd1662eb1edbc24a9e26dd413e5f38c68fcdf (patch)
tree: 1ed8f32f93926b77336ec11ebbe3517046b22158 /src
parent: ea9c103237ae9a42cc8e7acdf837df959c03c107 (diff)
download: postgresql-361bd1662eb1edbc24a9e26dd413e5f38c68fcdf.tar.gz
postgresql-361bd1662eb1edbc24a9e26dd413e5f38c68fcdf.zip
Allow Hot Standby to begin from a shutdown checkpoint.
Patch by Simon Riggs & me
Diffstat (limited to 'src')
-rw-r--r-- src/backend/access/transam/twophase.c 85
-rw-r--r-- src/backend/access/transam/xlog.c 198
-rw-r--r-- src/include/access/twophase.h 3
3 files changed, 224 insertions, 62 deletions
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index b1bf2c4f260..faafc7e5c18 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.59 2010/02/26 02:00:34 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.60 2010/04/13 14:17:46 heikki Exp $
*
* NOTES
* Each global transaction is associated with a global transaction
@@ -1719,6 +1719,89 @@ PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p)
}
/*
+ * StandbyRecoverPreparedTransactions
+ *
+ * Scan the pg_twophase directory and set up all the required information to
+ * allow standby queries to treat prepared transactions as still active.
+ * This is never called at the end of recovery - we use
+ * RecoverPreparedTransactions() at that point.
+ *
+ * Currently we simply call SubTransSetParent() for any subxids of prepared
+ * transactions. If overwriteOK is true, it's OK if some XIDs have already
+ * been marked in pg_subtrans.
+ */
+void
+StandbyRecoverPreparedTransactions(bool overwriteOK)
+{
+ DIR *cldir;
+ struct dirent *clde;
+
+ cldir = AllocateDir(TWOPHASE_DIR);
+ while ((clde = ReadDir(cldir, TWOPHASE_DIR)) != NULL)
+ {
+ if (strlen(clde->d_name) == 8 &&
+ strspn(clde->d_name, "0123456789ABCDEF") == 8)
+ {
+ TransactionId xid;
+ char *buf;
+ TwoPhaseFileHeader *hdr;
+ TransactionId *subxids;
+ int i;
+
+ xid = (TransactionId) strtoul(clde->d_name, NULL, 16);
+
+ /* Already processed? */
+ if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
+ {
+ ereport(WARNING,
+ (errmsg("removing stale two-phase state file \"%s\"",
+ clde->d_name)));
+ RemoveTwoPhaseFile(xid, true);
+ continue;
+ }
+
+ /* Read and validate file */
+ buf = ReadTwoPhaseFile(xid, true);
+ if (buf == NULL)
+ {
+ ereport(WARNING,
+ (errmsg("removing corrupt two-phase state file \"%s\"",
+ clde->d_name)));
+ RemoveTwoPhaseFile(xid, true);
+ continue;
+ }
+
+ /* Deconstruct header */
+ hdr = (TwoPhaseFileHeader *) buf;
+ if (!TransactionIdEquals(hdr->xid, xid))
+ {
+ ereport(WARNING,
+ (errmsg("removing corrupt two-phase state file \"%s\"",
+ clde->d_name)));
+ RemoveTwoPhaseFile(xid, true);
+ pfree(buf);
+ continue;
+ }
+
+ /*
+ * Examine subtransaction XIDs ... they should all follow main
+ * XID.
+ */
+ subxids = (TransactionId *)
+ (buf + MAXALIGN(sizeof(TwoPhaseFileHeader)));
+ for (i = 0; i < hdr->nsubxacts; i++)
+ {
+ TransactionId subxid = subxids[i];
+
+ Assert(TransactionIdFollows(subxid, xid));
+ SubTransSetParent(xid, subxid, overwriteOK);
+ }
+ }
+ }
+ FreeDir(cldir);
+}
+
+/*
* RecoverPreparedTransactions
*
* Scan the pg_twophase directory and reload shared-memory state for each
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 379c6f11750..5fd4b870bef 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.393 2010/04/12 10:40:42 heikki Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.394 2010/04/13 14:17:46 heikki Exp $
*
*-------------------------------------------------------------------------
*/
@@ -496,6 +496,7 @@ static TimeLineID lastPageTLI = 0;
static XLogRecPtr minRecoveryPoint; /* local copy of
* ControlFile->minRecoveryPoint */
static bool updateMinRecoveryPoint = true;
+static bool reachedMinRecoveryPoint = false;
static bool InRedo = false;
@@ -551,6 +552,7 @@ static void ValidateXLOGDirectoryStructure(void);
static void CleanupBackupHistory(void);
static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt);
+static void CheckRecoveryConsistency(void);
static bool ValidXLOGHeader(XLogPageHeader hdr, int emode);
static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt);
static List *readTimeLineHistory(TimeLineID targetTLI);
@@ -5591,7 +5593,6 @@ StartupXLOG(void)
uint32 freespace;
TransactionId oldestActiveXID;
bool bgwriterLaunched = false;
- bool backendsAllowed = false;
/*
* Read control file and check XLOG status looks valid.
@@ -5838,6 +5839,8 @@ StartupXLOG(void)
if (InRecovery)
{
int rmid;
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
/*
* Update pg_control to show that we are recovering and to show the
@@ -5930,6 +5933,33 @@ StartupXLOG(void)
StartupMultiXact();
ProcArrayInitRecoveryInfo(oldestActiveXID);
+
+ /*
+ * If we're beginning at a shutdown checkpoint, we know that
+ * nothing except prepared transactions was running on the master
+ * at this point. So fake up a running-xacts record listing only
+ * those, use it here and now, and recover their standby state.
+ */
+ if (wasShutdown)
+ {
+ RunningTransactionsData running;
+
+ /*
+ * Construct a RunningTransactions snapshot representing a shut
+ * down server, with only prepared transactions still alive.
+ * We're never overflowed at this point because all subxids
+ * are listed with their parent prepared transactions.
+ */
+ running.xcnt = nxids;
+ running.subxid_overflow = false;
+ running.nextXid = checkPoint.nextXid;
+ running.oldestRunningXid = oldestActiveXID;
+ running.xids = xids;
+
+ ProcArrayApplyRecoveryInfo(&running);
+
+ StandbyRecoverPreparedTransactions(false);
+ }
}
/* Initialize resource managers */
@@ -5940,6 +5970,46 @@ StartupXLOG(void)
}
/*
+ * Initialize shared replayEndRecPtr and recoveryLastRecPtr.
+ *
+ * This is slightly confusing if we're starting from an online
+ * checkpoint; we've just read and replayed the checkpoint record,
+ * but we're going to start replay from its redo pointer, which
+ * precedes the location of the checkpoint record itself. So even
+ * though the last record we've replayed is indeed ReadRecPtr, we
+ * haven't replayed all the preceding records yet. That's OK for
+ * the current use of these variables.
+ */
+ SpinLockAcquire(&xlogctl->info_lck);
+ xlogctl->replayEndRecPtr = ReadRecPtr;
+ xlogctl->recoveryLastRecPtr = ReadRecPtr;
+ SpinLockRelease(&xlogctl->info_lck);
+
+ /*
+ * Let postmaster know we've started redo now, so that it can
+ * launch bgwriter to perform restartpoints. We don't bother
+ * during crash recovery as restartpoints can only be performed
+ * during archive recovery. And we'd like to keep crash recovery
+ * simple, to avoid introducing bugs that could keep you from
+ * recovering after crash.
+ *
+ * After this point, we can no longer assume that we're the only
+ * process in addition to postmaster! Also, fsync requests are
+ * subsequently to be handled by the bgwriter, not locally.
+ */
+ if (InArchiveRecovery && IsUnderPostmaster)
+ {
+ SetForwardFsyncRequests();
+ SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
+ bgwriterLaunched = true;
+ }
+
+ /*
+ * Allow read-only connections immediately if we're consistent already.
+ */
+ CheckRecoveryConsistency();
+
+ /*
* Find the first record that logically follows the checkpoint --- it
* might physically precede it, though.
*/
@@ -5958,18 +6028,8 @@ StartupXLOG(void)
{
bool recoveryContinue = true;
bool recoveryApply = true;
- bool reachedMinRecoveryPoint = false;
ErrorContextCallback errcontext;
- /* use volatile pointer to prevent code rearrangement */
- volatile XLogCtlData *xlogctl = XLogCtl;
-
- /* initialize shared replayEndRecPtr and recoveryLastRecPtr */
- SpinLockAcquire(&xlogctl->info_lck);
- xlogctl->replayEndRecPtr = ReadRecPtr;
- xlogctl->recoveryLastRecPtr = ReadRecPtr;
- SpinLockRelease(&xlogctl->info_lck);
-
InRedo = true;
ereport(LOG,
@@ -5977,25 +6037,6 @@ StartupXLOG(void)
ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
/*
- * Let postmaster know we've started redo now, so that it can
- * launch bgwriter to perform restartpoints. We don't bother
- * during crash recovery as restartpoints can only be performed
- * during archive recovery. And we'd like to keep crash recovery
- * simple, to avoid introducing bugs that could you from
- * recovering after crash.
- *
- * After this point, we can no longer assume that we're the only
- * process in addition to postmaster! Also, fsync requests are
- * subsequently to be handled by the bgwriter, not locally.
- */
- if (InArchiveRecovery && IsUnderPostmaster)
- {
- SetForwardFsyncRequests();
- SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
- bgwriterLaunched = true;
- }
-
- /*
* main redo apply loop
*/
do
@@ -6024,32 +6065,8 @@ StartupXLOG(void)
/* Handle interrupt signals of startup process */
HandleStartupProcInterrupts();
- /*
- * Have we passed our safe starting point?
- */
- if (!reachedMinRecoveryPoint &&
- XLByteLE(minRecoveryPoint, EndRecPtr) &&
- XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
- {
- reachedMinRecoveryPoint = true;
- ereport(LOG,
- (errmsg("consistent recovery state reached at %X/%X",
- EndRecPtr.xlogid, EndRecPtr.xrecoff)));
- }
-
- /*
- * Have we got a valid starting snapshot that will allow
- * queries to be run? If so, we can tell postmaster that the
- * database is consistent now, enabling connections.
- */
- if (standbyState == STANDBY_SNAPSHOT_READY &&
- !backendsAllowed &&
- reachedMinRecoveryPoint &&
- IsUnderPostmaster)
- {
- backendsAllowed = true;
- SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
- }
+ /* Allow read-only connections if we're consistent now */
+ CheckRecoveryConsistency();
/*
* Have we reached our recovery target?
@@ -6399,6 +6416,44 @@ StartupXLOG(void)
}
/*
+ * Checks if recovery has reached a consistent state. When consistency is
+ * reached and we have a valid starting standby snapshot, tell postmaster
+ * that it can start accepting read-only connections.
+ */
+static void
+CheckRecoveryConsistency(void)
+{
+ static bool backendsAllowed = false;
+
+ /*
+ * Have we passed our safe starting point?
+ */
+ if (!reachedMinRecoveryPoint &&
+ XLByteLE(minRecoveryPoint, EndRecPtr) &&
+ XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
+ {
+ reachedMinRecoveryPoint = true;
+ ereport(LOG,
+ (errmsg("consistent recovery state reached at %X/%X",
+ EndRecPtr.xlogid, EndRecPtr.xrecoff)));
+ }
+
+ /*
+ * Have we got a valid starting snapshot that will allow
+ * queries to be run? If so, we can tell postmaster that the
+ * database is consistent now, enabling connections.
+ */
+ if (standbyState == STANDBY_SNAPSHOT_READY &&
+ !backendsAllowed &&
+ reachedMinRecoveryPoint &&
+ IsUnderPostmaster)
+ {
+ backendsAllowed = true;
+ SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
+ }
+}
+
+/*
* Is the system still in recovery?
*
* Unlike testing InRecovery, this works in any process that's connected to
@@ -7657,13 +7712,36 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
if (standbyState != STANDBY_DISABLED)
CheckRequiredParameterValues(checkPoint);
+ /*
+ * If we see a shutdown checkpoint, we know that nothing except
+ * prepared transactions was running on the master at this point.
+ * So fake up a running-xacts record listing only those, use it
+ * here and now, and recover their additional standby state.
+ */
if (standbyState >= STANDBY_INITIALIZED)
{
+ TransactionId *xids;
+ int nxids;
+ TransactionId oldestActiveXID;
+ RunningTransactionsData running;
+
+ oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
+
/*
- * Remove stale transactions, if any.
+ * Construct a RunningTransactions snapshot representing a shut
+ * down server, with only prepared transactions still alive.
+ * We're never overflowed at this point because all subxids
+ * are listed with their parent prepared transactions.
*/
- ExpireOldKnownAssignedTransactionIds(checkPoint.nextXid);
- StandbyReleaseOldLocks(checkPoint.nextXid);
+ running.xcnt = nxids;
+ running.subxid_overflow = false;
+ running.nextXid = checkPoint.nextXid;
+ running.oldestRunningXid = oldestActiveXID;
+ running.xids = xids;
+
+ ProcArrayApplyRecoveryInfo(&running);
+
+ StandbyRecoverPreparedTransactions(true);
}
/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h
index 61b92244fb9..ea3c9966c73 100644
--- a/src/include/access/twophase.h
+++ b/src/include/access/twophase.h
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/access/twophase.h,v 1.14 2010/01/02 16:58:00 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/access/twophase.h,v 1.15 2010/04/13 14:17:46 heikki Exp $
*
*-------------------------------------------------------------------------
*/
@@ -44,6 +44,7 @@ extern bool StandbyTransactionIdIsPrepared(TransactionId xid);
extern TransactionId PrescanPreparedTransactions(TransactionId **xids_p,
int *nxids_p);
+extern void StandbyRecoverPreparedTransactions(bool overwriteOK);
extern void RecoverPreparedTransactions(void);
extern void RecreateTwoPhaseFile(TransactionId xid, void *content, int len);