aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/backend/access/transam/twophase.c 229
-rw-r--r-- src/backend/access/transam/xact.c 15
-rw-r--r-- src/backend/access/transam/xlog.c 4
-rw-r--r-- src/include/access/twophase.h 7
4 files changed, 197 insertions, 58 deletions
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index 30bc21b5244..1aa9ce9b025 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.3 2005/06/18 19:33:41 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.4 2005/06/19 20:00:38 tgl Exp $
*
* NOTES
* Each global transaction is associated with a global transaction
@@ -49,12 +49,12 @@
#include "catalog/pg_type.h"
#include "funcapi.h"
#include "miscadmin.h"
+#include "pgstat.h"
#include "storage/fd.h"
#include "storage/proc.h"
#include "storage/procarray.h"
#include "storage/smgr.h"
#include "utils/builtins.h"
-#include "pgstat.h"
/*
@@ -105,6 +105,7 @@ typedef struct GlobalTransactionData
{
PGPROC proc; /* dummy proc */
TimestampTz prepared_at; /* time of preparation */
+ XLogRecPtr prepare_lsn; /* XLOG offset of prepare record */
AclId owner; /* ID of user that executed the xact */
TransactionId locking_xid; /* top-level XID of backend working on xact */
bool valid; /* TRUE if fully prepared */
@@ -281,6 +282,9 @@ MarkAsPreparing(TransactionId xid, const char *gid,
gxact->proc.subxids.nxids = 0;
gxact->prepared_at = prepared_at;
+ /* initialize LSN to 0 (start of WAL) */
+ gxact->prepare_lsn.xlogid = 0;
+ gxact->prepare_lsn.xrecoff = 0;
gxact->owner = owner;
gxact->locking_xid = xid;
gxact->valid = false;
@@ -324,7 +328,7 @@ GXactLoadSubxactData(GlobalTransaction gxact, int nsubxacts,
* MarkAsPrepared
* Mark the GXACT as fully valid, and enter it into the global ProcArray.
*/
-void
+static void
MarkAsPrepared(GlobalTransaction gxact)
{
/* Lock here may be overkill, but I'm not convinced of that ... */
@@ -434,6 +438,40 @@ RemoveGXact(GlobalTransaction gxact)
}
/*
+ * TransactionIdIsPrepared
+ * Returns true iff some gxact in TwoPhaseState->prepXacts is marked
+ * "valid" and its dummy proc carries the given xid; false otherwise.
+ *
+ * Note: only gxacts marked "valid" are considered; but notice we do not
+ * check the locking status. The array is scanned while holding
+ * TwoPhaseStateLock in shared mode; the lock is released before return.
+ * This is not currently exported, because it is only needed internally.
+ */
+static bool
+TransactionIdIsPrepared(TransactionId xid)
+{
+ bool result = false;
+ int i;
+
+ LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
+
+ for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
+ {
+ GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
+
+ if (gxact->valid && gxact->proc.xid == xid)
+ {
+ result = true;
+ break;
+ }
+ }
+
+ LWLockRelease(TwoPhaseStateLock);
+
+ return result;
+}
+
+/*
* Returns an array of all prepared transactions for the user-level
* function pg_prepared_xact.
*
@@ -790,7 +828,6 @@ EndPrepare(GlobalTransaction gxact)
TwoPhaseFileHeader *hdr;
char path[MAXPGPATH];
XLogRecData *record;
- XLogRecPtr recptr;
pg_crc32 statefile_crc;
pg_crc32 bogus_crc;
int fd;
@@ -841,14 +878,9 @@ EndPrepare(GlobalTransaction gxact)
FIN_CRC32(statefile_crc);
/*
- * Write a deliberately bogus CRC to the state file, and flush it to disk.
- * This is to minimize the odds of failure within the critical section
- * below --- in particular, running out of disk space.
- *
- * On most filesystems, write() rather than fsync() detects out-of-space,
- * so the fsync might be considered optional. Using it means there
- * are three fsyncs not two associated with preparing a transaction; is
- * the risk of an error from fsync high enough to justify that?
+ * Write a deliberately bogus CRC to the state file; this is just
+ * paranoia to catch the case where four more bytes will run us out of
+ * disk space.
*/
bogus_crc = ~ statefile_crc;
@@ -860,14 +892,6 @@ EndPrepare(GlobalTransaction gxact)
errmsg("could not write twophase state file: %m")));
}
- if (pg_fsync(fd) != 0)
- {
- close(fd);
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not fsync twophase state file: %m")));
- }
-
/* Back up to prepare for rewriting the CRC */
if (lseek(fd, -((off_t) sizeof(pg_crc32)), SEEK_CUR) < 0)
{
@@ -881,28 +905,34 @@ EndPrepare(GlobalTransaction gxact)
* The state file isn't valid yet, because we haven't written the correct
* CRC yet. Before we do that, insert entry in WAL and flush it to disk.
*
- * Between the time we have written the WAL entry and the time we
- * flush the correct state file CRC to disk, we have an inconsistency:
- * the xact is prepared according to WAL but not according to our on-disk
- * state. We use a critical section to force a PANIC if we are unable to
- * complete the flush --- then, WAL replay should repair the
- * inconsistency.
+ * Between the time we have written the WAL entry and the time we write
+ * out the correct state file CRC, we have an inconsistency: the xact is
+ * prepared according to WAL but not according to our on-disk state.
+ * We use a critical section to force a PANIC if we are unable to complete
+ * the write --- then, WAL replay should repair the inconsistency. The
+ * odds of a PANIC actually occurring should be very tiny given that we
+ * were able to write the bogus CRC above.
*
* We have to lock out checkpoint start here, too; otherwise a checkpoint
* starting immediately after the WAL record is inserted could complete
- * before we've finished flushing, meaning that the WAL record would not
- * get replayed if a crash follows.
+ * without fsync'ing our state file. (This is essentially the same kind
+ * of race condition as the COMMIT-to-clog-write case that
+ * RecordTransactionCommit uses CheckpointStartLock for; see notes there.)
+ *
+ * We save the PREPARE record's location in the gxact for later use by
+ * CheckPointTwoPhase.
*/
START_CRIT_SECTION();
LWLockAcquire(CheckpointStartLock, LW_SHARED);
- recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_PREPARE, records.head);
- XLogFlush(recptr);
+ gxact->prepare_lsn = XLogInsert(RM_XACT_ID, XLOG_XACT_PREPARE,
+ records.head);
+ XLogFlush(gxact->prepare_lsn);
/* If we crash now, we have prepared: WAL replay will fix things */
- /* write correct CRC, flush, and close file */
+ /* write correct CRC and close file */
if ((write(fd, &statefile_crc, sizeof(pg_crc32))) != sizeof(pg_crc32))
{
close(fd);
@@ -911,19 +941,29 @@ EndPrepare(GlobalTransaction gxact)
errmsg("could not write twophase state file: %m")));
}
- if (pg_fsync(fd) != 0)
- {
- close(fd);
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not fsync twophase state file: %m")));
- }
-
if (close(fd) != 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not close twophase state file: %m")));
+ /*
+ * Mark the prepared transaction as valid. As soon as xact.c marks
+ * MyProc as not running our XID (which it will do immediately after
+ * this function returns), others can commit/rollback the xact.
+ *
+ * NB: a side effect of this is to make a dummy ProcArray entry for the
+ * prepared XID. This must happen before we clear the XID from MyProc,
+ * else there is a window where the XID is not running according to
+ * TransactionIdInProgress, and onlookers would be entitled to assume
+ * the xact crashed. Instead we have a window where the same XID
+ * appears twice in ProcArray, which is OK.
+ */
+ MarkAsPrepared(gxact);
+
+ /*
+ * Now we can release the checkpoint start lock: a checkpoint starting
+ * after this will certainly see the gxact as a candidate for fsyncing.
+ */
LWLockRelease(CheckpointStartLock);
END_CRIT_SECTION();
@@ -1119,6 +1159,8 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
* In case we fail while running the callbacks, mark the gxact invalid
* so no one else will try to commit/rollback, and so it can be recycled
* properly later. It is still locked by our XID so it won't go away yet.
+ *
+ * (We assume it's safe to do this without taking TwoPhaseStateLock.)
*/
gxact->valid = false;
@@ -1248,7 +1290,10 @@ RecreateTwoPhaseFile(TransactionId xid, void *content, int len)
errmsg("could not write twophase state file: %m")));
}
- /* Sync and close the file */
+ /*
+ * We must fsync the file because the end-of-replay checkpoint will
+ * not do so, there being no GXACT in shared memory yet to tell it to.
+ */
if (pg_fsync(fd) != 0)
{
close(fd);
@@ -1264,6 +1309,103 @@ RecreateTwoPhaseFile(TransactionId xid, void *content, int len)
}
/*
+ * CheckPointTwoPhase -- handle 2PC component of checkpointing.
+ * redo_horizon is the checkpoint's redo pointer (CreateCheckPoint passes
+ * checkPoint.redo). Returns nothing; I/O failures raise ereport(ERROR).
+ *
+ * We must fsync the state file of any GXACT that is valid and has a PREPARE
+ * LSN <= the checkpoint's redo horizon. (If the gxact isn't valid yet or
+ * has a later LSN, this checkpoint is not responsible for fsyncing it.)
+ *
+ * This is deliberately run as late as possible in the checkpoint sequence,
+ * because GXACTs ordinarily have short lifespans, and so it is quite
+ * possible that GXACTs that were valid at checkpoint start will no longer
+ * exist if we wait a little bit. A GXACT that remains valid across
+ * multiple checkpoints is fsynced each time; that is considered unusual
+ * enough that no extra code is spent avoiding the redundant fsyncs. (They
+ * should be reasonably cheap anyway, since they won't cause I/O.)
+ */
+void
+CheckPointTwoPhase(XLogRecPtr redo_horizon)
+{
+ TransactionId *xids;
+ int nxids;
+ char path[MAXPGPATH];
+ int i;
+
+ /*
+ * We don't want to hold the TwoPhaseStateLock while doing I/O, so we
+ * grab it just long enough to list the XIDs that require fsyncing,
+ * and then do the I/O afterwards. (The list has max_prepared_xacts
+ * slots; presumably numPrepXacts never exceeds that -- TODO confirm.)
+ * This approach creates a race condition: someone else could delete
+ * a GXACT between the time we release TwoPhaseStateLock and the time
+ * we try to open its state file. We handle this by special-casing
+ * ENOENT failures: if we see that, we verify that the GXACT is no
+ * longer valid, and if so ignore the failure.
+ */
+ if (max_prepared_xacts <= 0)
+ return; /* nothing to do */
+ xids = (TransactionId *) palloc(max_prepared_xacts * sizeof(TransactionId));
+ nxids = 0;
+
+ LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
+
+ for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
+ {
+ GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
+
+ if (gxact->valid &&
+ XLByteLE(gxact->prepare_lsn, redo_horizon))
+ xids[nxids++] = gxact->proc.xid;
+ }
+
+ LWLockRelease(TwoPhaseStateLock);
+
+ for (i = 0; i < nxids; i++)
+ {
+ TransactionId xid = xids[i];
+ int fd;
+
+ TwoPhaseFilePath(path, xid);
+
+ fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0);
+ if (fd < 0)
+ {
+ if (errno == ENOENT)
+ {
+ /* File is gone: fine if the gxact is no longer valid */
+ if (!TransactionIdIsPrepared(xid))
+ continue;
+ /* Still prepared: restore errno in case the check changed it */
+ errno = ENOENT;
+ }
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open twophase state file \"%s\": %m",
+ path)));
+ }
+
+ if (pg_fsync(fd) != 0)
+ {
+ close(fd);
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not fsync twophase state file \"%s\": %m",
+ path)));
+ }
+
+ if (close(fd) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not close twophase state file \"%s\": %m",
+ path)));
+ }
+
+ pfree(xids);
+}
+
+/*
* PrescanPreparedTransactions
*
* Scan the pg_twophase directory and determine the range of valid XIDs
@@ -1492,6 +1634,13 @@ RecoverPreparedTransactions(void)
/*
* Recreate its GXACT and dummy PGPROC
+ *
+ * Note: since we don't have the PREPARE record's WAL location
+ * at hand, we leave prepare_lsn zeroes. This means the GXACT
+ * will be fsync'd on every future checkpoint. We assume this
+ * situation is infrequent enough that the performance cost is
+ * negligible (especially since we know the state file has
+ * already been fsynced).
*/
gxact = MarkAsPreparing(xid, hdr->gid,
hdr->prepared_at,
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 2f73ee10c06..98e56c40020 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -10,7 +10,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.206 2005/06/18 19:33:41 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.207 2005/06/19 20:00:38 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -1742,19 +1742,6 @@ PrepareTransaction(void)
EndPrepare(gxact);
/*
- * Mark the prepared transaction as valid. As soon as we mark ourselves
- * not running in MyProc below, others can commit/rollback the xact.
- *
- * NB: a side effect of this is to make a dummy ProcArray entry for the
- * prepared XID. This must happen before we clear the XID from MyProc,
- * else there is a window where the XID is not running according to
- * TransactionIdInProgress, and onlookers would be entitled to assume
- * the xact crashed. Instead we have a window where the same XID
- * appears twice in ProcArray, which is OK.
- */
- MarkAsPrepared(gxact);
-
- /*
* Now we clean up backend-internal state and release internal
* resources.
*/
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 15b82ee9be8..b15949b2854 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.201 2005/06/17 22:32:43 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.202 2005/06/19 20:00:38 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -5110,6 +5110,8 @@ CreateCheckPoint(bool shutdown, bool force)
CheckPointSUBTRANS();
CheckPointMultiXact();
FlushBufferPool();
+ /* We deliberately delay 2PC checkpointing as long as possible */
+ CheckPointTwoPhase(checkPoint.redo);
START_CRIT_SECTION();
diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h
index d7c64f2b12e..08f1f9bd9fe 100644
--- a/src/include/access/twophase.h
+++ b/src/include/access/twophase.h
@@ -7,14 +7,14 @@
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/access/twophase.h,v 1.2 2005/06/18 19:33:42 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/twophase.h,v 1.3 2005/06/19 20:00:39 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef TWOPHASE_H
#define TWOPHASE_H
-#include "storage/lock.h"
+#include "storage/proc.h"
#include "utils/timestamp.h"
@@ -35,7 +35,6 @@ extern PGPROC *TwoPhaseGetDummyProc(TransactionId xid);
extern GlobalTransaction MarkAsPreparing(TransactionId xid, const char *gid,
TimestampTz prepared_at,
AclId owner, Oid databaseid);
-extern void MarkAsPrepared(GlobalTransaction gxact);
extern void StartPrepare(GlobalTransaction gxact);
extern void EndPrepare(GlobalTransaction gxact);
@@ -46,6 +45,8 @@ extern void RecoverPreparedTransactions(void);
extern void RecreateTwoPhaseFile(TransactionId xid, void *content, int len);
extern void RemoveTwoPhaseFile(TransactionId xid, bool giveWarning);
+extern void CheckPointTwoPhase(XLogRecPtr redo_horizon);
+
extern void FinishPreparedTransaction(const char *gid, bool isCommit);
#endif /* TWOPHASE_H */