aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/backend/access/transam/README113
-rw-r--r--src/backend/access/transam/clog.c78
-rw-r--r--src/backend/access/transam/multixact.c24
-rw-r--r--src/backend/access/transam/slru.c112
-rw-r--r--src/backend/access/transam/subtrans.c8
-rw-r--r--src/backend/access/transam/transam.c122
-rw-r--r--src/backend/access/transam/twophase.c8
-rw-r--r--src/backend/access/transam/xact.c90
-rw-r--r--src/backend/access/transam/xlog.c48
-rw-r--r--src/backend/commands/dbcommands.c14
-rw-r--r--src/backend/commands/tablespace.c18
-rw-r--r--src/backend/commands/vacuum.c22
-rw-r--r--src/backend/utils/init/flatfiles.c10
-rw-r--r--src/backend/utils/misc/guc.c14
-rw-r--r--src/backend/utils/misc/postgresql.conf.sample1
-rw-r--r--src/backend/utils/time/tqual.c334
-rw-r--r--src/include/access/clog.h6
-rw-r--r--src/include/access/gist_private.h4
-rw-r--r--src/include/access/slru.h25
-rw-r--r--src/include/access/transam.h7
-rw-r--r--src/include/access/xact.h6
-rw-r--r--src/include/access/xlog.h5
-rw-r--r--src/include/access/xlogdefs.h4
23 files changed, 798 insertions, 275 deletions
diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README
index cec93e6f766..6e7e132acab 100644
--- a/src/backend/access/transam/README
+++ b/src/backend/access/transam/README
@@ -1,4 +1,4 @@
-$PostgreSQL: pgsql/src/backend/access/transam/README,v 1.5 2006/03/31 23:32:05 tgl Exp $
+$PostgreSQL: pgsql/src/backend/access/transam/README,v 1.6 2007/08/01 22:45:07 tgl Exp $
The Transaction System
----------------------
@@ -409,4 +409,113 @@ two separate WAL records. The replay code has to remember "unfinished" split
operations, and match them up to subsequent insertions in the parent level.
If no matching insert has been found by the time the WAL replay ends, the
replay code has to do the insertion on its own to restore the index to
-consistency.
+consistency. Such insertions occur after WAL is operational, so they can
+and should write WAL records for the additional generated actions.
+
+
+Asynchronous Commit
+-------------------
+
+As of PostgreSQL 8.3 it is possible to perform asynchronous commits - i.e.,
+we don't wait while the WAL record for the commit is fsync'ed.
+We perform an asynchronous commit when synchronous_commit = off. Instead
+of performing an XLogFlush() up to the LSN of the commit, we merely note
+the LSN in shared memory. The backend then continues with other work.
+We record the LSN only for an asynchronous commit, not an abort; there's
+never any need to flush an abort record, since the presumption after a
+crash would be that the transaction aborted anyway.
+
+We always force synchronous commit when the transaction is deleting
+relations, to ensure the commit record is down to disk before the relations
+are removed from the filesystem. Also, certain utility commands that have
+non-roll-backable side effects (such as filesystem changes) force sync
+commit to minimize the window in which the filesystem change has been made
+but the transaction isn't guaranteed committed.
+
+Every wal_writer_delay milliseconds, the walwriter process performs an
+XLogBackgroundFlush(). This checks the location of the last completely
+filled WAL page. If that has moved forwards, then we write all the changed
+buffers up to that point, so that under full load we write only whole
+buffers. If there has been a break in activity and the current WAL page is
+the same as before, then we find out the LSN of the most recent
+asynchronous commit, and flush up to that point, if required (i.e.,
+if it's in the current WAL page). This arrangement in itself would
+guarantee that an async commit record reaches disk during at worst the
+second walwriter cycle after the transaction completes. However, we also
+allow XLogFlush to flush full buffers "flexibly" (i.e., not wrapping around
+at the end of the circular WAL buffer area), so as to minimize the number
+of writes issued under high load when multiple WAL pages are filled per
+walwriter cycle. This makes the worst-case delay three walwriter cycles.
+
+There are some other subtle points to consider with asynchronous commits.
+First, for each page of CLOG we must remember the LSN of the latest commit
+affecting the page, so that we can enforce the same flush-WAL-before-write
+rule that we do for ordinary relation pages. Otherwise the clog update
+recording the commit might reach disk before the commit's WAL record does.
+Again, abort records need not factor into this consideration.
+
+In fact, we store more than one LSN for each clog page. This relates to
+the way we set transaction status hint bits during visibility tests.
+We must not set a transaction-committed hint bit on a relation page and
+have that record make it to disk prior to the WAL record of the commit.
+Since visibility tests are normally made while holding buffer share locks,
+we do not have the option of changing the page's LSN to guarantee WAL
+synchronization. Instead, we defer the setting of the hint bit if we have
+not yet flushed WAL as far as the LSN associated with the transaction.
+This requires tracking the LSN of each unflushed async commit. It is
+convenient to associate this data with clog buffers: because we will flush
+WAL before writing a clog page, we know that we do not need to remember a
+transaction's LSN longer than the clog page holding its commit status
+remains in memory. However, the naive approach of storing an LSN for each
+clog position is unattractive: the LSNs are 32x bigger than the two-bit
+commit status fields, and so we'd need 256K of additional shared memory for
+each 8K clog buffer page. We choose instead to store a smaller number of
+LSNs per page, where each LSN is the highest LSN associated with any
+transaction commit in a contiguous range of transaction IDs on that page.
+This saves storage at the price of some possibly-unnecessary delay in
+setting transaction hint bits.
+
+How many transactions should share the same cached LSN (N)? If the
+system's workload consists only of small async-commit transactions, then
+it's reasonable to have N similar to the number of transactions per
+walwriter cycle, since that is the granularity with which transactions will
+become truly committed (and thus hintable) anyway. The worst case is where
+a sync-commit xact shares a cached LSN with an async-commit xact that
+commits a bit later; even though we paid to sync the first xact to disk,
+we won't be able to hint its outputs until the second xact is sync'd, up to
+three walwriter cycles later. This argues for keeping N (the group size)
+as small as possible. For the moment we are setting the group size to 32,
+which makes the LSN cache space the same size as the actual clog buffer
+space (independently of BLCKSZ).
+
+It is useful that we can run both synchronous and asynchronous commit
+transactions concurrently, but the safety of this is perhaps not
+immediately obvious. Assume we have two transactions, T1 and T2. A Log
+Sequence Number (LSN) identifies a position in the WAL sequence; let LSN1
+and LSN2 be the positions of the commit records of T1 and T2,
+respectively. If T2 can see changes made by T1 then when T2 commits it
+must be true that LSN2 follows LSN1. Thus when T2 commits it is certain
+that all of the changes made by T1 are also now recorded in the WAL. This
+is true whether T1 was asynchronous or synchronous. As a result, it is
+safe for asynchronous commits and synchronous commits to work concurrently
+without endangering data written by synchronous commits. Sub-transactions
+are not important here since the final write to disk only occurs at the
+commit of the top level transaction.
+
+Changes to data blocks cannot reach disk unless WAL is flushed up to the
+point of the LSN of the data blocks. Any attempt to write unsafe data to
+disk will first trigger a WAL flush, which ensures the safety of all data
+written by that and prior transactions. Data blocks and clog pages are both
+protected by LSNs.
+
+Changes to a temp table are not WAL-logged, hence could reach disk in
+advance of T1's commit, but we don't care since temp table contents don't
+survive crashes anyway.
+
+Database writes made via any of the paths we have introduced to avoid WAL
+overhead for bulk updates are also safe. In these cases it's entirely
+possible for the data to reach disk before T1's commit, because T1 will
+fsync it down to disk without any sort of interlock, as soon as it finishes
+the bulk update. However, all these paths are designed to write data that
+no other transaction can see until after T1 commits. The situation is thus
+not different from ordinary WAL-logged updates.
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index 5bafef1be34..9665d129541 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -14,17 +14,19 @@
* CLOG page is initialized to zeroes. Other writes of CLOG come from
* recording of transaction commit or abort in xact.c, which generates its
* own XLOG records for these events and will re-perform the status update
- * on redo; so we need make no additional XLOG entry here. Also, the XLOG
- * is guaranteed flushed through the XLOG commit record before we are called
- * to log a commit, so the WAL rule "write xlog before data" is satisfied
- * automatically for commits, and we don't really care for aborts. Therefore,
- * we don't need to mark CLOG pages with LSN information; we have enough
- * synchronization already.
+ * on redo; so we need make no additional XLOG entry here. For synchronous
+ * transaction commits, the XLOG is guaranteed flushed through the XLOG commit
+ * record before we are called to log a commit, so the WAL rule "write xlog
+ * before data" is satisfied automatically. However, for async commits we
+ * must track the latest LSN affecting each CLOG page, so that we can flush
+ * XLOG that far and satisfy the WAL rule. We don't have to worry about this
+ * for aborts (whether sync or async), since the post-crash assumption would
+ * be that such transactions failed anyway.
*
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.42 2007/01/05 22:19:23 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.43 2007/08/01 22:45:07 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -57,6 +59,13 @@
#define TransactionIdToByte(xid) (TransactionIdToPgIndex(xid) / CLOG_XACTS_PER_BYTE)
#define TransactionIdToBIndex(xid) ((xid) % (TransactionId) CLOG_XACTS_PER_BYTE)
+/* We store the latest async LSN for each group of transactions */
+#define CLOG_XACTS_PER_LSN_GROUP 32 /* keep this a power of 2 */
+#define CLOG_LSNS_PER_PAGE (CLOG_XACTS_PER_PAGE / CLOG_XACTS_PER_LSN_GROUP)
+
+#define GetLSNIndex(slotno, xid) ((slotno) * CLOG_LSNS_PER_PAGE + \
+ ((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) / CLOG_XACTS_PER_LSN_GROUP)
+
/*
* Link to shared-memory data structures for CLOG control
@@ -75,11 +84,16 @@ static void WriteTruncateXlogRec(int pageno);
/*
* Record the final state of a transaction in the commit log.
*
+ * lsn must be the WAL location of the commit record when recording an async
+ * commit. For a synchronous commit it can be InvalidXLogRecPtr, since the
+ * caller guarantees the commit record is already flushed in that case. It
+ * should be InvalidXLogRecPtr for abort cases, too.
+ *
* NB: this is a low-level routine and is NOT the preferred entry point
* for most uses; TransactionLogUpdate() in transam.c is the intended caller.
*/
void
-TransactionIdSetStatus(TransactionId xid, XidStatus status)
+TransactionIdSetStatus(TransactionId xid, XidStatus status, XLogRecPtr lsn)
{
int pageno = TransactionIdToPage(xid);
int byteno = TransactionIdToByte(xid);
@@ -94,7 +108,16 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status)
LWLockAcquire(CLogControlLock, LW_EXCLUSIVE);
- slotno = SimpleLruReadPage(ClogCtl, pageno, xid);
+ /*
+ * If we're doing an async commit (ie, lsn is valid), then we must wait
+ * for any active write on the page slot to complete. Otherwise our
+ * update could reach disk in that write, which will not do since we
+ * mustn't let it reach disk until we've done the appropriate WAL flush.
+ * But when lsn is invalid, it's OK to scribble on a page while it is
+ * write-busy, since we don't care if the update reaches disk sooner than
+ * we think. Hence, pass write_ok = XLogRecPtrIsInvalid(lsn).
+ */
+ slotno = SimpleLruReadPage(ClogCtl, pageno, XLogRecPtrIsInvalid(lsn), xid);
byteptr = ClogCtl->shared->page_buffer[slotno] + byteno;
/* Current state should be 0, subcommitted or target state */
@@ -110,22 +133,48 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status)
ClogCtl->shared->page_dirty[slotno] = true;
+ /*
+ * Update the group LSN if the transaction completion LSN is higher.
+ *
+ * Note: lsn will be invalid when supplied during InRecovery processing,
+ * so we don't need to do anything special to avoid LSN updates during
+ * recovery. After recovery completes the next clog change will set the
+ * LSN correctly.
+ */
+ if (!XLogRecPtrIsInvalid(lsn))
+ {
+ int lsnindex = GetLSNIndex(slotno, xid);
+
+ if (XLByteLT(ClogCtl->shared->group_lsn[lsnindex], lsn))
+ ClogCtl->shared->group_lsn[lsnindex] = lsn;
+ }
+
LWLockRelease(CLogControlLock);
}
/*
* Interrogate the state of a transaction in the commit log.
*
+ * Aside from the actual commit status, this function returns (into *lsn)
+ * an LSN that is late enough to be able to guarantee that if we flush up to
+ * that LSN then we will have flushed the transaction's commit record to disk.
+ * The result is not necessarily the exact LSN of the transaction's commit
+ * record! For example, for long-past transactions (those whose clog pages
+ * already migrated to disk), we'll return InvalidXLogRecPtr. Also, because
+ * we group transactions on the same clog page to conserve storage, we might
+ * return the LSN of a later transaction that falls into the same group.
+ *
* NB: this is a low-level routine and is NOT the preferred entry point
* for most uses; TransactionLogFetch() in transam.c is the intended caller.
*/
XidStatus
-TransactionIdGetStatus(TransactionId xid)
+TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn)
{
int pageno = TransactionIdToPage(xid);
int byteno = TransactionIdToByte(xid);
int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
int slotno;
+ int lsnindex;
char *byteptr;
XidStatus status;
@@ -136,6 +185,9 @@ TransactionIdGetStatus(TransactionId xid)
status = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
+ lsnindex = GetLSNIndex(slotno, xid);
+ *lsn = ClogCtl->shared->group_lsn[lsnindex];
+
LWLockRelease(CLogControlLock);
return status;
@@ -148,14 +200,14 @@ TransactionIdGetStatus(TransactionId xid)
Size
CLOGShmemSize(void)
{
- return SimpleLruShmemSize(NUM_CLOG_BUFFERS);
+ return SimpleLruShmemSize(NUM_CLOG_BUFFERS, CLOG_LSNS_PER_PAGE);
}
void
CLOGShmemInit(void)
{
ClogCtl->PagePrecedes = CLOGPagePrecedes;
- SimpleLruInit(ClogCtl, "CLOG Ctl", NUM_CLOG_BUFFERS,
+ SimpleLruInit(ClogCtl, "CLOG Ctl", NUM_CLOG_BUFFERS, CLOG_LSNS_PER_PAGE,
CLogControlLock, "pg_clog");
}
@@ -240,7 +292,7 @@ StartupCLOG(void)
int slotno;
char *byteptr;
- slotno = SimpleLruReadPage(ClogCtl, pageno, xid);
+ slotno = SimpleLruReadPage(ClogCtl, pageno, false, xid);
byteptr = ClogCtl->shared->page_buffer[slotno] + byteno;
/* Zero so-far-unused positions in the current byte */
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 704bf6a0ba6..3ce6f14bcf6 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -42,7 +42,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/backend/access/transam/multixact.c,v 1.23 2007/01/05 22:19:23 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/multixact.c,v 1.24 2007/08/01 22:45:07 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -749,7 +749,7 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
* enough that a MultiXactId is really involved. Perhaps someday we'll
* take the trouble to generalize the slru.c error reporting code.
*/
- slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, multi);
+ slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
offptr += entryno;
@@ -773,7 +773,7 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
if (pageno != prev_pageno)
{
- slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, multi);
+ slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi);
prev_pageno = pageno;
}
@@ -993,7 +993,7 @@ retry:
pageno = MultiXactIdToOffsetPage(multi);
entryno = MultiXactIdToOffsetEntry(multi);
- slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, multi);
+ slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
offptr += entryno;
offset = *offptr;
@@ -1025,7 +1025,7 @@ retry:
entryno = MultiXactIdToOffsetEntry(tmpMXact);
if (pageno != prev_pageno)
- slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, tmpMXact);
+ slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, tmpMXact);
offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
offptr += entryno;
@@ -1061,7 +1061,7 @@ retry:
if (pageno != prev_pageno)
{
- slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, multi);
+ slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi);
prev_pageno = pageno;
}
@@ -1289,8 +1289,8 @@ MultiXactShmemSize(void)
mul_size(sizeof(MultiXactId) * 2, MaxBackends))
size = SHARED_MULTIXACT_STATE_SIZE;
- size = add_size(size, SimpleLruShmemSize(NUM_MXACTOFFSET_BUFFERS));
- size = add_size(size, SimpleLruShmemSize(NUM_MXACTMEMBER_BUFFERS));
+ size = add_size(size, SimpleLruShmemSize(NUM_MXACTOFFSET_BUFFERS, 0));
+ size = add_size(size, SimpleLruShmemSize(NUM_MXACTMEMBER_BUFFERS, 0));
return size;
}
@@ -1306,10 +1306,10 @@ MultiXactShmemInit(void)
MultiXactMemberCtl->PagePrecedes = MultiXactMemberPagePrecedes;
SimpleLruInit(MultiXactOffsetCtl,
- "MultiXactOffset Ctl", NUM_MXACTOFFSET_BUFFERS,
+ "MultiXactOffset Ctl", NUM_MXACTOFFSET_BUFFERS, 0,
MultiXactOffsetControlLock, "pg_multixact/offsets");
SimpleLruInit(MultiXactMemberCtl,
- "MultiXactMember Ctl", NUM_MXACTMEMBER_BUFFERS,
+ "MultiXactMember Ctl", NUM_MXACTMEMBER_BUFFERS, 0,
MultiXactMemberControlLock, "pg_multixact/members");
/* Initialize our shared state struct */
@@ -1442,7 +1442,7 @@ StartupMultiXact(void)
int slotno;
MultiXactOffset *offptr;
- slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, multi);
+ slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
offptr += entryno;
@@ -1472,7 +1472,7 @@ StartupMultiXact(void)
int slotno;
TransactionId *xidptr;
- slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, offset);
+ slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset);
xidptr = (TransactionId *) MultiXactMemberCtl->shared->page_buffer[slotno];
xidptr += entryno;
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index e68ed7e331e..bf3990bc299 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -41,7 +41,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/backend/access/transam/slru.c,v 1.40 2007/01/05 22:19:23 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/slru.c,v 1.41 2007/08/01 22:45:07 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -140,6 +140,8 @@ static SlruErrorCause slru_errcause;
static int slru_errno;
+static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno);
+static void SimpleLruWaitIO(SlruCtl ctl, int slotno);
static bool SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno);
static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno,
SlruFlush fdata);
@@ -152,7 +154,7 @@ static int SlruSelectLRUPage(SlruCtl ctl, int pageno);
*/
Size
-SimpleLruShmemSize(int nslots)
+SimpleLruShmemSize(int nslots, int nlsns)
{
Size sz;
@@ -165,18 +167,21 @@ SimpleLruShmemSize(int nslots)
sz += MAXALIGN(nslots * sizeof(int)); /* page_lru_count[] */
sz += MAXALIGN(nslots * sizeof(LWLockId)); /* buffer_locks[] */
+ if (nlsns > 0)
+ sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); /* group_lsn[] */
+
return BUFFERALIGN(sz) + BLCKSZ * nslots;
}
void
-SimpleLruInit(SlruCtl ctl, const char *name, int nslots,
+SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
LWLockId ctllock, const char *subdir)
{
SlruShared shared;
bool found;
shared = (SlruShared) ShmemInitStruct(name,
- SimpleLruShmemSize(nslots),
+ SimpleLruShmemSize(nslots, nlsns),
&found);
if (!IsUnderPostmaster)
@@ -193,6 +198,7 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots,
shared->ControlLock = ctllock;
shared->num_slots = nslots;
+ shared->lsn_groups_per_page = nlsns;
shared->cur_lru_count = 0;
@@ -212,8 +218,14 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots,
offset += MAXALIGN(nslots * sizeof(int));
shared->buffer_locks = (LWLockId *) (ptr + offset);
offset += MAXALIGN(nslots * sizeof(LWLockId));
- ptr += BUFFERALIGN(offset);
+ if (nlsns > 0)
+ {
+ shared->group_lsn = (XLogRecPtr *) (ptr + offset);
+ offset += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));
+ }
+
+ ptr += BUFFERALIGN(offset);
for (slotno = 0; slotno < nslots; slotno++)
{
shared->page_buffer[slotno] = ptr;
@@ -266,6 +278,9 @@ SimpleLruZeroPage(SlruCtl ctl, int pageno)
/* Set the buffer to zeroes */
MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
+ /* Set the LSNs for this new page to zero */
+ SimpleLruZeroLSNs(ctl, slotno);
+
/* Assume this page is now the latest active page */
shared->latest_page_number = pageno;
@@ -273,8 +288,27 @@ SimpleLruZeroPage(SlruCtl ctl, int pageno)
}
/*
+ * Zero all the LSNs we store for this slru page.
+ *
+ * This should be called each time we create a new page, and each time we read
+ * in a page from disk into an existing buffer. (Such an old page cannot
+ * have any interesting LSNs, since we'd have flushed them before writing
+ * the page in the first place.)
+ */
+static void
+SimpleLruZeroLSNs(SlruCtl ctl, int slotno)
+{
+ SlruShared shared = ctl->shared;
+
+ if (shared->lsn_groups_per_page > 0)
+ MemSet(&shared->group_lsn[slotno * shared->lsn_groups_per_page], 0,
+ shared->lsn_groups_per_page * sizeof(XLogRecPtr));
+}
+
+/*
* Wait for any active I/O on a page slot to finish. (This does not
- * guarantee that new I/O hasn't been started before we return, though.)
+ * guarantee that new I/O hasn't been started before we return, though.
+ * In fact the slot might not even contain the same page anymore.)
*
* Control lock must be held at entry, and will be held at exit.
*/
@@ -305,8 +339,7 @@ SimpleLruWaitIO(SlruCtl ctl, int slotno)
/* indeed, the I/O must have failed */
if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS)
shared->page_status[slotno] = SLRU_PAGE_EMPTY;
- else
- /* write_in_progress */
+ else /* write_in_progress */
{
shared->page_status[slotno] = SLRU_PAGE_VALID;
shared->page_dirty[slotno] = true;
@@ -320,6 +353,11 @@ SimpleLruWaitIO(SlruCtl ctl, int slotno)
* Find a page in a shared buffer, reading it in if necessary.
* The page number must correspond to an already-initialized page.
*
+ * If write_ok is true then it is OK to return a page that is in
+ * WRITE_IN_PROGRESS state; it is the caller's responsibility to be sure
+ * that modification of the page is safe. If write_ok is false then we
+ * will not return the page until it is not undergoing active I/O.
+ *
* The passed-in xid is used only for error reporting, and may be
* InvalidTransactionId if no specific xid is associated with the action.
*
@@ -329,7 +367,8 @@ SimpleLruWaitIO(SlruCtl ctl, int slotno)
* Control lock must be held at entry, and will be held at exit.
*/
int
-SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid)
+SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok,
+ TransactionId xid)
{
SlruShared shared = ctl->shared;
@@ -346,8 +385,13 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid)
if (shared->page_number[slotno] == pageno &&
shared->page_status[slotno] != SLRU_PAGE_EMPTY)
{
- /* If page is still being read in, we must wait for I/O */
- if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS)
+ /*
+ * If page is still being read in, we must wait for I/O. Likewise
+ * if the page is being written and the caller said that's not OK.
+ */
+ if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
+ (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
+ !write_ok))
{
SimpleLruWaitIO(ctl, slotno);
/* Now we must recheck state from the top */
@@ -383,6 +427,9 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid)
/* Do the read */
ok = SlruPhysicalReadPage(ctl, pageno, slotno);
+ /* Set the LSNs for this newly read-in page to zero */
+ SimpleLruZeroLSNs(ctl, slotno);
+
/* Re-acquire control lock and update page state */
LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
@@ -443,7 +490,7 @@ SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, TransactionId xid)
LWLockRelease(shared->ControlLock);
LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
- return SimpleLruReadPage(ctl, pageno, xid);
+ return SimpleLruReadPage(ctl, pageno, true, xid);
}
/*
@@ -622,6 +669,47 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
int fd = -1;
/*
+ * Honor the write-WAL-before-data rule, if appropriate, so that we do
+ * not write out data before associated WAL records. This is the same
+ * action performed during FlushBuffer() in the main buffer manager.
+ */
+ if (shared->group_lsn != NULL)
+ {
+ /*
+ * We must determine the largest async-commit LSN for the page.
+ * This is a bit tedious, but since this entire function is a slow
+ * path anyway, it seems better to do this here than to maintain
+ * a per-page LSN variable (which'd need an extra comparison in the
+ * transaction-commit path).
+ */
+ XLogRecPtr max_lsn;
+ int lsnindex, lsnoff;
+
+ lsnindex = slotno * shared->lsn_groups_per_page;
+ max_lsn = shared->group_lsn[lsnindex++];
+ for (lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
+ {
+ XLogRecPtr this_lsn = shared->group_lsn[lsnindex++];
+
+ if (XLByteLT(max_lsn, this_lsn))
+ max_lsn = this_lsn;
+ }
+
+ if (!XLogRecPtrIsInvalid(max_lsn))
+ {
+ /*
+ * As noted above, elog(ERROR) is not acceptable here, so if
+ * XLogFlush were to fail, we must PANIC. This isn't much of
+ * a restriction because XLogFlush is just about all critical
+ * section anyway, but let's make sure.
+ */
+ START_CRIT_SECTION();
+ XLogFlush(max_lsn);
+ END_CRIT_SECTION();
+ }
+ }
+
+ /*
* During a Flush, we may already have the desired file open.
*/
if (fdata)
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c
index 6205d43820d..b3836c5231c 100644
--- a/src/backend/access/transam/subtrans.c
+++ b/src/backend/access/transam/subtrans.c
@@ -22,7 +22,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/backend/access/transam/subtrans.c,v 1.18 2007/01/05 22:19:23 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/subtrans.c,v 1.19 2007/08/01 22:45:07 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -78,7 +78,7 @@ SubTransSetParent(TransactionId xid, TransactionId parent)
LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE);
- slotno = SimpleLruReadPage(SubTransCtl, pageno, xid);
+ slotno = SimpleLruReadPage(SubTransCtl, pageno, true, xid);
ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno];
ptr += entryno;
@@ -165,14 +165,14 @@ SubTransGetTopmostTransaction(TransactionId xid)
Size
SUBTRANSShmemSize(void)
{
- return SimpleLruShmemSize(NUM_SUBTRANS_BUFFERS);
+ return SimpleLruShmemSize(NUM_SUBTRANS_BUFFERS, 0);
}
void
SUBTRANSShmemInit(void)
{
SubTransCtl->PagePrecedes = SubTransPagePrecedes;
- SimpleLruInit(SubTransCtl, "SUBTRANS Ctl", NUM_SUBTRANS_BUFFERS,
+ SimpleLruInit(SubTransCtl, "SUBTRANS Ctl", NUM_SUBTRANS_BUFFERS, 0,
SubtransControlLock, "pg_subtrans");
/* Override default assumption that writes should be fsync'd */
SubTransCtl->do_fsync = false;
diff --git a/src/backend/access/transam/transam.c b/src/backend/access/transam/transam.c
index c2ad0c11a0f..3466b50ef24 100644
--- a/src/backend/access/transam/transam.c
+++ b/src/backend/access/transam/transam.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/transam/transam.c,v 1.69 2007/01/05 22:19:23 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/transam.c,v 1.70 2007/08/01 22:45:07 tgl Exp $
*
* NOTES
* This file contains the high level access-method interface to the
@@ -27,14 +27,17 @@
static XidStatus TransactionLogFetch(TransactionId transactionId);
static void TransactionLogUpdate(TransactionId transactionId,
- XidStatus status);
+ XidStatus status, XLogRecPtr lsn);
-/* ----------------
- * Single-item cache for results of TransactionLogFetch.
- * ----------------
+/*
+ * Single-item cache for results of TransactionLogFetch.
*/
static TransactionId cachedFetchXid = InvalidTransactionId;
static XidStatus cachedFetchXidStatus;
+static XLogRecPtr cachedCommitLSN;
+
+/* Handy constant for an invalid xlog recptr */
+static const XLogRecPtr InvalidXLogRecPtr = {0, 0};
/* ----------------------------------------------------------------
@@ -52,6 +55,7 @@ static XidStatus
TransactionLogFetch(TransactionId transactionId)
{
XidStatus xidstatus;
+ XLogRecPtr xidlsn;
/*
* Before going to the commit log manager, check our single item cache to
@@ -73,9 +77,9 @@ TransactionLogFetch(TransactionId transactionId)
}
/*
- * Get the status.
+ * Get the transaction status.
*/
- xidstatus = TransactionIdGetStatus(transactionId);
+ xidstatus = TransactionIdGetStatus(transactionId, &xidlsn);
/*
* DO NOT cache status for unfinished or sub-committed transactions! We
@@ -84,8 +88,9 @@ TransactionLogFetch(TransactionId transactionId)
if (xidstatus != TRANSACTION_STATUS_IN_PROGRESS &&
xidstatus != TRANSACTION_STATUS_SUB_COMMITTED)
{
- TransactionIdStore(transactionId, &cachedFetchXid);
+ cachedFetchXid = transactionId;
cachedFetchXidStatus = xidstatus;
+ cachedCommitLSN = xidlsn;
}
return xidstatus;
@@ -93,16 +98,19 @@ TransactionLogFetch(TransactionId transactionId)
/* --------------------------------
* TransactionLogUpdate
+ *
+ * Store the new status of a transaction. The commit record LSN must be
+ * passed when recording an async commit; else it should be InvalidXLogRecPtr.
* --------------------------------
*/
-static void
-TransactionLogUpdate(TransactionId transactionId, /* trans id to update */
- XidStatus status) /* new trans status */
+static inline void
+TransactionLogUpdate(TransactionId transactionId,
+ XidStatus status, XLogRecPtr lsn)
{
/*
* update the commit log
*/
- TransactionIdSetStatus(transactionId, status);
+ TransactionIdSetStatus(transactionId, status, lsn);
}
/*
@@ -111,15 +119,16 @@ TransactionLogUpdate(TransactionId transactionId, /* trans id to update */
* Update multiple transaction identifiers to a given status.
* Don't depend on this being atomic; it's not.
*/
-static void
-TransactionLogMultiUpdate(int nxids, TransactionId *xids, XidStatus status)
+static inline void
+TransactionLogMultiUpdate(int nxids, TransactionId *xids,
+ XidStatus status, XLogRecPtr lsn)
{
int i;
Assert(nxids != 0);
for (i = 0; i < nxids; i++)
- TransactionIdSetStatus(xids[i], status);
+ TransactionIdSetStatus(xids[i], status, lsn);
}
/* ----------------------------------------------------------------
@@ -269,31 +278,49 @@ TransactionIdDidAbort(TransactionId transactionId)
void
TransactionIdCommit(TransactionId transactionId)
{
- TransactionLogUpdate(transactionId, TRANSACTION_STATUS_COMMITTED);
+ TransactionLogUpdate(transactionId, TRANSACTION_STATUS_COMMITTED,
+ InvalidXLogRecPtr);
+}
+
+/*
+ * TransactionIdAsyncCommit
+ * Same as above, but for async commits. The commit record LSN is needed.
+ */
+void
+TransactionIdAsyncCommit(TransactionId transactionId, XLogRecPtr lsn)
+{
+ TransactionLogUpdate(transactionId, TRANSACTION_STATUS_COMMITTED, lsn);
}
+
/*
* TransactionIdAbort
* Aborts the transaction associated with the identifier.
*
* Note:
* Assumes transaction identifier is valid.
+ * No async version of this is needed.
*/
void
TransactionIdAbort(TransactionId transactionId)
{
- TransactionLogUpdate(transactionId, TRANSACTION_STATUS_ABORTED);
+ TransactionLogUpdate(transactionId, TRANSACTION_STATUS_ABORTED,
+ InvalidXLogRecPtr);
}
/*
* TransactionIdSubCommit
* Marks the subtransaction associated with the identifier as
* sub-committed.
+ *
+ * Note:
+ * No async version of this is needed.
*/
void
TransactionIdSubCommit(TransactionId transactionId)
{
- TransactionLogUpdate(transactionId, TRANSACTION_STATUS_SUB_COMMITTED);
+ TransactionLogUpdate(transactionId, TRANSACTION_STATUS_SUB_COMMITTED,
+ InvalidXLogRecPtr);
}
/*
@@ -309,10 +336,24 @@ void
TransactionIdCommitTree(int nxids, TransactionId *xids)
{
if (nxids > 0)
- TransactionLogMultiUpdate(nxids, xids, TRANSACTION_STATUS_COMMITTED);
+ TransactionLogMultiUpdate(nxids, xids, TRANSACTION_STATUS_COMMITTED,
+ InvalidXLogRecPtr);
}
/*
+ * TransactionIdAsyncCommitTree
+ * Same as above, but for async commits. The commit record LSN is needed.
+ */
+void
+TransactionIdAsyncCommitTree(int nxids, TransactionId *xids, XLogRecPtr lsn)
+{
+ if (nxids > 0)
+ TransactionLogMultiUpdate(nxids, xids, TRANSACTION_STATUS_COMMITTED,
+ lsn);
+}
+
+
+/*
* TransactionIdAbortTree
* Marks all the given transaction ids as aborted.
*
@@ -323,7 +364,8 @@ void
TransactionIdAbortTree(int nxids, TransactionId *xids)
{
if (nxids > 0)
- TransactionLogMultiUpdate(nxids, xids, TRANSACTION_STATUS_ABORTED);
+ TransactionLogMultiUpdate(nxids, xids, TRANSACTION_STATUS_ABORTED,
+ InvalidXLogRecPtr);
}
/*
@@ -389,3 +431,43 @@ TransactionIdFollowsOrEquals(TransactionId id1, TransactionId id2)
diff = (int32) (id1 - id2);
return (diff >= 0);
}
+
+/*
+ * TransactionIdGetCommitLSN
+ *
+ * This function returns an LSN that is late enough to be able
+ * to guarantee that if we flush up to the LSN returned then we
+ * will have flushed the transaction's commit record to disk.
+ *
+ * The result is not necessarily the exact LSN of the transaction's
+ * commit record! For example, for long-past transactions (those whose
+ * clog pages already migrated to disk), we'll return InvalidXLogRecPtr.
+ * Also, because we group transactions on the same clog page to conserve
+ * storage, we might return the LSN of a later transaction that falls into
+ * the same group.
+ */
+XLogRecPtr
+TransactionIdGetCommitLSN(TransactionId xid)
+{
+ XLogRecPtr result;
+
+ /*
+ * Currently, all uses of this function are for xids that were just
+ * reported to be committed by TransactionLogFetch, so we expect that
+ * checking TransactionLogFetch's cache will usually succeed and avoid an
+ * extra trip to shared memory.
+ */
+ if (TransactionIdEquals(xid, cachedFetchXid))
+ return cachedCommitLSN;
+
+ /* Special XIDs are always known committed */
+ if (!TransactionIdIsNormal(xid))
+ return InvalidXLogRecPtr;
+
+ /*
+ * Get the transaction status.
+ */
+ (void) TransactionIdGetStatus(xid, &result);
+
+ return result;
+}
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index 7fdf5a7eed3..2ae81e823d5 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.31 2007/05/27 03:50:39 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.32 2007/08/01 22:45:07 tgl Exp $
*
* NOTES
* Each global transaction is associated with a global transaction
@@ -1706,7 +1706,11 @@ RecordTransactionCommitPrepared(TransactionId xid,
XLOG_XACT_COMMIT_PREPARED | XLOG_NO_TRAN,
rdata);
- /* we don't currently try to sleep before flush here ... */
+ /*
+ * We don't currently try to sleep before flush here ... nor is there
+ * any support for async commit of a prepared xact (the very idea is
+ * probably a contradiction)
+ */
/* Flush XLOG to disk */
XLogFlush(recptr);
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 72a7cf40a63..117525b5ac4 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -10,7 +10,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.245 2007/06/07 21:45:58 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.246 2007/08/01 22:45:07 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -55,6 +55,8 @@ int XactIsoLevel;
bool DefaultXactReadOnly = false;
bool XactReadOnly;
+bool XactSyncCommit = true;
+
int CommitDelay = 0; /* precommit delay in microseconds */
int CommitSiblings = 5; /* # concurrent xacts needed to sleep */
@@ -175,6 +177,11 @@ static TimestampTz xactStopTimestamp;
static char *prepareGID;
/*
+ * Some commands want to force synchronous commit.
+ */
+static bool forceSyncCommit = false;
+
+/*
* Private context for transaction-abort work --- we reserve space for this
* at startup to ensure that AbortTransaction and AbortSubTransaction can work
* when we've run out of memory.
@@ -554,6 +561,18 @@ CommandCounterIncrement(void)
AtStart_Cache();
}
+/*
+ * ForceSyncCommit
+ *
+ * Interface routine to allow commands to force a synchronous commit of the
+ * current top-level transaction
+ */
+void
+ForceSyncCommit(void)
+{
+ forceSyncCommit = true;
+}
+
/* ----------------------------------------------------------------
* StartTransaction stuff
@@ -724,6 +743,7 @@ RecordTransactionCommit(void)
{
TransactionId xid = GetCurrentTransactionId();
bool madeTCentries;
+ bool isAsyncCommit = false;
XLogRecPtr recptr;
/* Tell bufmgr and smgr to prepare for commit */
@@ -810,21 +830,44 @@ RecordTransactionCommit(void)
if (MyXactMadeXLogEntry)
{
/*
- * Sleep before flush! So we can flush more than one commit
- * records per single fsync. (The idea is some other backend may
- * do the XLogFlush while we're sleeping. This needs work still,
- * because on most Unixen, the minimum select() delay is 10msec or
- * more, which is way too long.)
- *
- * We do not sleep if enableFsync is not turned on, nor if there
- * are fewer than CommitSiblings other backends with active
- * transactions.
+ * If the user has set synchronous_commit = off, and we're
+ * not doing cleanup of any rels nor committing any command
+ * that wanted to force sync commit, then we can defer fsync.
*/
- if (CommitDelay > 0 && enableFsync &&
- CountActiveBackends() >= CommitSiblings)
- pg_usleep(CommitDelay);
+ if (XactSyncCommit || forceSyncCommit || nrels > 0)
+ {
+ /*
+ * Synchronous commit case.
+ *
+ * Sleep before flush! So we can flush more than one commit
+ * record per single fsync. (The idea is some other backend
+ * may do the XLogFlush while we're sleeping. This needs work
+ * still, because on most Unixen, the minimum select() delay
+ * is 10msec or more, which is way too long.)
+ *
+ * We do not sleep if enableFsync is not turned on, nor if
+ * there are fewer than CommitSiblings other backends with
+ * active transactions.
+ */
+ if (CommitDelay > 0 && enableFsync &&
+ CountActiveBackends() >= CommitSiblings)
+ pg_usleep(CommitDelay);
- XLogFlush(recptr);
+ XLogFlush(recptr);
+ }
+ else
+ {
+ /*
+ * Asynchronous commit case.
+ */
+ isAsyncCommit = true;
+
+ /*
+ * Report the latest async commit LSN, so that
+ * the WAL writer knows to flush this commit.
+ */
+ XLogSetAsyncCommitLSN(recptr);
+ }
}
/*
@@ -835,12 +878,24 @@ RecordTransactionCommit(void)
* emitted an XLOG record for our commit, and so in the event of a
* crash the clog update might be lost. This is okay because no one
* else will ever care whether we committed.
+ *
+ * The recptr here refers to the last xlog entry by this transaction
+ * so is the correct value to use for setting the clog.
*/
if (madeTCentries || MyXactMadeTempRelUpdate)
{
- TransactionIdCommit(xid);
- /* to avoid race conditions, the parent must commit first */
- TransactionIdCommitTree(nchildren, children);
+ if (isAsyncCommit)
+ {
+ TransactionIdAsyncCommit(xid, recptr);
+ /* to avoid race conditions, the parent must commit first */
+ TransactionIdAsyncCommitTree(nchildren, children, recptr);
+ }
+ else
+ {
+ TransactionIdCommit(xid);
+ /* to avoid race conditions, the parent must commit first */
+ TransactionIdCommitTree(nchildren, children);
+ }
}
/* Checkpoint can proceed now */
@@ -1406,6 +1461,7 @@ StartTransaction(void)
FreeXactSnapshot();
XactIsoLevel = DefaultXactIsoLevel;
XactReadOnly = DefaultXactReadOnly;
+ forceSyncCommit = false;
/*
* reinitialize within-transaction counters
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 25789ddaa68..4c7024baa38 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.275 2007/07/24 04:54:08 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.276 2007/08/01 22:45:08 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -305,6 +305,7 @@ typedef struct XLogCtlData
XLogwrtResult LogwrtResult;
uint32 ckptXidEpoch; /* nextXID & epoch of latest checkpoint */
TransactionId ckptXid;
+ XLogRecPtr asyncCommitLSN; /* LSN of newest async commit */
/* Protected by WALWriteLock: */
XLogCtlWrite Write;
@@ -1644,6 +1645,22 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch)
}
/*
+ * Record the LSN for an asynchronous transaction commit.
+ * (This should not be called for aborts, nor for synchronous commits.)
+ */
+void
+XLogSetAsyncCommitLSN(XLogRecPtr asyncCommitLSN)
+{
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+
+ SpinLockAcquire(&xlogctl->info_lck);
+ if (XLByteLT(xlogctl->asyncCommitLSN, asyncCommitLSN))
+ xlogctl->asyncCommitLSN = asyncCommitLSN;
+ SpinLockRelease(&xlogctl->info_lck);
+}
+
+/*
* Ensure that all XLOG data through the given position is flushed to disk.
*
* NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
@@ -1797,19 +1814,17 @@ XLogBackgroundFlush(void)
/* back off to last completed page boundary */
WriteRqstPtr.xrecoff -= WriteRqstPtr.xrecoff % XLOG_BLCKSZ;
-#ifdef NOT_YET /* async commit patch is still to come */
/* if we have already flushed that far, consider async commit records */
if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
{
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = XLogCtl;
- SpinLockAcquire(&xlogctl->async_commit_lck);
+ SpinLockAcquire(&xlogctl->info_lck);
WriteRqstPtr = xlogctl->asyncCommitLSN;
- SpinLockRelease(&xlogctl->async_commit_lck);
+ SpinLockRelease(&xlogctl->info_lck);
flexible = false; /* ensure it all gets written */
}
-#endif
/* Done if already known flushed */
if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
@@ -1842,6 +1857,23 @@ XLogBackgroundFlush(void)
}
/*
+ * Flush any previous asynchronously-committed transactions' commit records.
+ */
+void
+XLogAsyncCommitFlush(void)
+{
+ XLogRecPtr WriteRqstPtr;
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+
+ SpinLockAcquire(&xlogctl->info_lck);
+ WriteRqstPtr = xlogctl->asyncCommitLSN;
+ SpinLockRelease(&xlogctl->info_lck);
+
+ XLogFlush(WriteRqstPtr);
+}
+
+/*
* Test whether XLOG data has been flushed up to (at least) the given position.
*
* Returns true if a flush is still needed. (It may be that someone else
@@ -5466,7 +5498,7 @@ ShutdownXLOG(int code, Datum arg)
(errmsg("database system is shut down")));
}
-/*
+/*
* Log start of a checkpoint.
*/
static void
@@ -5481,7 +5513,7 @@ LogCheckpointStart(int flags)
(flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
}
-/*
+/*
* Log end of a checkpoint.
*/
static void
@@ -5523,7 +5555,7 @@ LogCheckpointEnd(void)
* flags is a bitwise OR of the following:
* CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
* CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
- * ignoring checkpoint_completion_target parameter.
+ * ignoring checkpoint_completion_target parameter.
* CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occured
* since the last one (implied by CHECKPOINT_IS_SHUTDOWN).
*
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 4ceb962fb95..34b6da99df9 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -13,7 +13,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/commands/dbcommands.c,v 1.196 2007/06/28 00:02:38 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/commands/dbcommands.c,v 1.197 2007/08/01 22:45:08 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -515,7 +515,11 @@ createdb(const CreatedbStmt *stmt)
heap_close(pg_database_rel, NoLock);
/*
- * Set flag to update flat database file at commit.
+ * Set flag to update flat database file at commit. Note: this also
+ * forces synchronous commit, which minimizes the window between
+ * creation of the database files and committal of the transaction.
+ * If we crash before committing, we'll have a DB that's taking up
+ * disk space but is not in pg_database, which is not good.
*/
database_file_update_needed();
}
@@ -675,7 +679,11 @@ dropdb(const char *dbname, bool missing_ok)
heap_close(pgdbrel, NoLock);
/*
- * Set flag to update flat database file at commit.
+ * Set flag to update flat database file at commit. Note: this also
+ * forces synchronous commit, which minimizes the window between
+ * removal of the database files and committal of the transaction.
+ * If we crash before committing, we'll have a DB that's gone on disk
+ * but still there according to pg_database, which is not good.
*/
database_file_update_needed();
}
diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c
index d0dacf10782..f19e237315e 100644
--- a/src/backend/commands/tablespace.c
+++ b/src/backend/commands/tablespace.c
@@ -37,7 +37,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/commands/tablespace.c,v 1.48 2007/06/07 19:19:56 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/commands/tablespace.c,v 1.49 2007/08/01 22:45:08 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -354,6 +354,14 @@ CreateTableSpace(CreateTableSpaceStmt *stmt)
(void) XLogInsert(RM_TBLSPC_ID, XLOG_TBLSPC_CREATE, rdata);
}
+ /*
+ * Force synchronous commit, to minimize the window between creating
+ * the symlink on-disk and marking the transaction committed. It's
+ * not great that there is any window at all, but definitely we don't
+ * want to make it larger than necessary.
+ */
+ ForceSyncCommit();
+
pfree(linkloc);
pfree(location);
@@ -481,6 +489,14 @@ DropTableSpace(DropTableSpaceStmt *stmt)
*/
/*
+ * Force synchronous commit, to minimize the window between removing
+ * the files on-disk and marking the transaction committed. It's
+ * not great that there is any window at all, but definitely we don't
+ * want to make it larger than necessary.
+ */
+ ForceSyncCommit();
+
+ /*
* Allow TablespaceCreateDbspace again.
*/
LWLockRelease(TablespaceCreateLock);
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index 8fa17ab2350..41c3b867912 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -13,7 +13,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.353 2007/06/14 13:53:14 alvherre Exp $
+ * $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.354 2007/08/01 22:45:08 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -27,6 +27,7 @@
#include "access/heapam.h"
#include "access/transam.h"
#include "access/xact.h"
+#include "access/xlog.h"
#include "catalog/namespace.h"
#include "catalog/pg_database.h"
#include "commands/dbcommands.h"
@@ -1162,6 +1163,16 @@ full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt)
vacuum_set_xid_limits(vacstmt->freeze_min_age, onerel->rd_rel->relisshared,
&OldestXmin, &FreezeLimit);
+ /*
+ * VACUUM FULL assumes that all tuple states are well-known prior to
+ * moving tuples around --- see comment "known dead" in repair_frag(),
+ * as well as simplifications in tqual.c. So before we start we must
+ * ensure that any asynchronously-committed transactions with changes
+ * against this table have been flushed to disk. It's sufficient to do
+ * this once after we've acquired AccessExclusiveLock.
+ */
+ XLogAsyncCommitFlush();
+
/*
* Set up statistics-gathering machinery.
*/
@@ -2373,8 +2384,15 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
* exclusive access to the relation. However, that would require a
* lot of extra code to close and re-open the relation, indexes, etc.
* For now, a quick hack: record status of current transaction as
- * committed, and continue.
+ * committed, and continue. We force the commit to be synchronous
+ * so that it's down to disk before we truncate. (Note: tqual.c
+ * knows that VACUUM FULL always uses sync commit, too.)
+ *
+ * XXX This desperately needs to be revisited. Any failure after
+ * this point will result in a PANIC "cannot abort transaction nnn,
+ * it was already committed"!
*/
+ ForceSyncCommit();
RecordTransactionCommit();
}
diff --git a/src/backend/utils/init/flatfiles.c b/src/backend/utils/init/flatfiles.c
index 992fc70bb44..c9b1ac509c2 100644
--- a/src/backend/utils/init/flatfiles.c
+++ b/src/backend/utils/init/flatfiles.c
@@ -23,7 +23,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/backend/utils/init/flatfiles.c,v 1.26 2007/06/12 17:16:52 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/init/flatfiles.c,v 1.27 2007/08/01 22:45:08 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -855,6 +855,14 @@ AtEOXact_UpdateFlatFiles(bool isCommit)
* Signal the postmaster to reload its caches.
*/
SendPostmasterSignal(PMSIGNAL_PASSWORD_CHANGE);
+
+ /*
+ * Force synchronous commit, to minimize the window between changing
+ * the flat files on-disk and marking the transaction committed. It's
+ * not great that there is any window at all, but definitely we don't
+ * want to make it larger than necessary.
+ */
+ ForceSyncCommit();
}
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index b2d0ea9cae5..c30d8b50a05 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -10,7 +10,7 @@
* Written by Peter Eisentraut <peter_e@gmx.net>.
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.407 2007/07/24 04:54:09 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.408 2007/08/01 22:45:09 tgl Exp $
*
*--------------------------------------------------------------------
*/
@@ -554,6 +554,14 @@ static struct config_bool ConfigureNamesBool[] =
true, NULL, NULL
},
{
+ {"synchronous_commit", PGC_USERSET, WAL_SETTINGS,
+ gettext_noop("Sets immediate fsync at commit."),
+ NULL
+ },
+ &XactSyncCommit,
+ true, NULL, NULL
+ },
+ {
{"zero_damaged_pages", PGC_SUSET, DEVELOPER_OPTIONS,
gettext_noop("Continues processing past damaged page headers."),
gettext_noop("Detection of a damaged page header normally causes PostgreSQL to "
@@ -1521,7 +1529,7 @@ static struct config_int ConfigureNamesInt[] =
},
{
- {"commit_delay", PGC_USERSET, WAL_CHECKPOINTS,
+ {"commit_delay", PGC_USERSET, WAL_SETTINGS,
gettext_noop("Sets the delay in microseconds between transaction commit and "
"flushing WAL to disk."),
NULL
@@ -1531,7 +1539,7 @@ static struct config_int ConfigureNamesInt[] =
},
{
- {"commit_siblings", PGC_USERSET, WAL_CHECKPOINTS,
+ {"commit_siblings", PGC_USERSET, WAL_SETTINGS,
gettext_noop("Sets the minimum concurrent open transactions before performing "
"commit_delay."),
NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 8bfad997ff3..c87e4baf43d 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -149,6 +149,7 @@
# - Settings -
#fsync = on # turns forced synchronization on or off
+#synchronous_commit = on # immediate fsync at commit
#wal_sync_method = fsync # the default is the first option
# supported by the operating system:
# open_datasync
diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c
index 429005a843a..edbaa4d6b1a 100644
--- a/src/backend/utils/time/tqual.c
+++ b/src/backend/utils/time/tqual.c
@@ -31,7 +31,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/utils/time/tqual.c,v 1.102 2007/03/25 19:45:14 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/time/tqual.c,v 1.103 2007/08/01 22:45:09 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -81,6 +81,44 @@ static bool XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot);
/*
+ * HeapTupleSetHintBits()
+ *
+ * Set commit/abort hint bits on a tuple, if appropriate at this time.
+ *
+ * We cannot change the LSN of the page here because we may hold only a share
+ * lock on the buffer, so it is only safe to set a transaction-committed hint
+ * bit if we know the transaction's commit record has been flushed to disk.
+ *
+ * We can always set hint bits when marking a transaction aborted. Also,
+ * if we are cleaning up HEAP_MOVED_IN or HEAP_MOVED_OFF entries, then
+ * we can always set the hint bits, since VACUUM FULL always uses synchronous
+ * commits.
+ *
+ * Normal commits may be asynchronous, so for those we need to get the LSN
+ * of the transaction and then check whether this is flushed.
+ *
+ * The caller should pass xid as the XID of the transaction to check, or
+ * InvalidTransactionId if no check is needed.
+ */
+static inline void
+HeapTupleSetHintBits(HeapTupleHeader tuple, Buffer buffer,
+ uint16 infomask, TransactionId xid)
+{
+ if (TransactionIdIsValid(xid))
+ {
+ /* NB: xid must be known committed here! */
+ XLogRecPtr commitLSN = TransactionIdGetCommitLSN(xid);
+
+ if (XLogNeedsFlush(commitLSN))
+ return; /* not flushed yet, so don't set hint */
+ }
+
+ tuple->t_infomask |= infomask;
+ SetBufferCommitInfoNeedsSave(buffer);
+}
+
+
+/*
* HeapTupleSatisfiesSelf
* True iff heap tuple is valid "for itself".
*
@@ -122,12 +160,12 @@ HeapTupleSatisfiesSelf(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
{
if (TransactionIdDidCommit(xvac))
{
- tuple->t_infomask |= HEAP_XMIN_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+ InvalidTransactionId);
return false;
}
- tuple->t_infomask |= HEAP_XMIN_COMMITTED;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+ InvalidTransactionId);
}
}
else if (tuple->t_infomask & HEAP_MOVED_IN)
@@ -139,14 +177,12 @@ HeapTupleSatisfiesSelf(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
if (TransactionIdIsInProgress(xvac))
return false;
if (TransactionIdDidCommit(xvac))
- {
- tuple->t_infomask |= HEAP_XMIN_COMMITTED;
- SetBufferCommitInfoNeedsSave(buffer);
- }
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+ InvalidTransactionId);
else
{
- tuple->t_infomask |= HEAP_XMIN_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+ InvalidTransactionId);
return false;
}
}
@@ -164,8 +200,8 @@ HeapTupleSatisfiesSelf(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
/* deleting subtransaction aborted? */
if (TransactionIdDidAbort(HeapTupleHeaderGetXmax(tuple)))
{
- tuple->t_infomask |= HEAP_XMAX_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+ InvalidTransactionId);
return true;
}
@@ -176,15 +212,13 @@ HeapTupleSatisfiesSelf(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
else if (TransactionIdIsInProgress(HeapTupleHeaderGetXmin(tuple)))
return false;
else if (TransactionIdDidCommit(HeapTupleHeaderGetXmin(tuple)))
- {
- tuple->t_infomask |= HEAP_XMIN_COMMITTED;
- SetBufferCommitInfoNeedsSave(buffer);
- }
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+ HeapTupleHeaderGetXmin(tuple));
else
{
/* it must have aborted or crashed */
- tuple->t_infomask |= HEAP_XMIN_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+ InvalidTransactionId);
return false;
}
}
@@ -221,8 +255,8 @@ HeapTupleSatisfiesSelf(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
if (!TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple)))
{
/* it must have aborted or crashed */
- tuple->t_infomask |= HEAP_XMAX_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+ InvalidTransactionId);
return true;
}
@@ -230,13 +264,13 @@ HeapTupleSatisfiesSelf(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
if (tuple->t_infomask & HEAP_IS_LOCKED)
{
- tuple->t_infomask |= HEAP_XMAX_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+ InvalidTransactionId);
return true;
}
- tuple->t_infomask |= HEAP_XMAX_COMMITTED;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
+ HeapTupleHeaderGetXmax(tuple));
return false;
}
@@ -299,12 +333,12 @@ HeapTupleSatisfiesNow(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
{
if (TransactionIdDidCommit(xvac))
{
- tuple->t_infomask |= HEAP_XMIN_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+ InvalidTransactionId);
return false;
}
- tuple->t_infomask |= HEAP_XMIN_COMMITTED;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+ InvalidTransactionId);
}
}
else if (tuple->t_infomask & HEAP_MOVED_IN)
@@ -316,14 +350,12 @@ HeapTupleSatisfiesNow(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
if (TransactionIdIsInProgress(xvac))
return false;
if (TransactionIdDidCommit(xvac))
- {
- tuple->t_infomask |= HEAP_XMIN_COMMITTED;
- SetBufferCommitInfoNeedsSave(buffer);
- }
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+ InvalidTransactionId);
else
{
- tuple->t_infomask |= HEAP_XMIN_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+ InvalidTransactionId);
return false;
}
}
@@ -344,8 +376,8 @@ HeapTupleSatisfiesNow(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
/* deleting subtransaction aborted? */
if (TransactionIdDidAbort(HeapTupleHeaderGetXmax(tuple)))
{
- tuple->t_infomask |= HEAP_XMAX_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+ InvalidTransactionId);
return true;
}
@@ -359,15 +391,13 @@ HeapTupleSatisfiesNow(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
else if (TransactionIdIsInProgress(HeapTupleHeaderGetXmin(tuple)))
return false;
else if (TransactionIdDidCommit(HeapTupleHeaderGetXmin(tuple)))
- {
- tuple->t_infomask |= HEAP_XMIN_COMMITTED;
- SetBufferCommitInfoNeedsSave(buffer);
- }
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+ HeapTupleHeaderGetXmin(tuple));
else
{
/* it must have aborted or crashed */
- tuple->t_infomask |= HEAP_XMIN_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+ InvalidTransactionId);
return false;
}
}
@@ -407,8 +437,8 @@ HeapTupleSatisfiesNow(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
if (!TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple)))
{
/* it must have aborted or crashed */
- tuple->t_infomask |= HEAP_XMAX_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+ InvalidTransactionId);
return true;
}
@@ -416,13 +446,13 @@ HeapTupleSatisfiesNow(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer)
if (tuple->t_infomask & HEAP_IS_LOCKED)
{
- tuple->t_infomask |= HEAP_XMAX_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+ InvalidTransactionId);
return true;
}
- tuple->t_infomask |= HEAP_XMAX_COMMITTED;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
+ HeapTupleHeaderGetXmax(tuple));
return false;
}
@@ -469,12 +499,12 @@ HeapTupleSatisfiesToast(HeapTupleHeader tuple, Snapshot snapshot,
{
if (TransactionIdDidCommit(xvac))
{
- tuple->t_infomask |= HEAP_XMIN_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+ InvalidTransactionId);
return false;
}
- tuple->t_infomask |= HEAP_XMIN_COMMITTED;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+ InvalidTransactionId);
}
}
else if (tuple->t_infomask & HEAP_MOVED_IN)
@@ -486,14 +516,12 @@ HeapTupleSatisfiesToast(HeapTupleHeader tuple, Snapshot snapshot,
if (TransactionIdIsInProgress(xvac))
return false;
if (TransactionIdDidCommit(xvac))
- {
- tuple->t_infomask |= HEAP_XMIN_COMMITTED;
- SetBufferCommitInfoNeedsSave(buffer);
- }
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+ InvalidTransactionId);
else
{
- tuple->t_infomask |= HEAP_XMIN_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+ InvalidTransactionId);
return false;
}
}
@@ -550,12 +578,12 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid,
{
if (TransactionIdDidCommit(xvac))
{
- tuple->t_infomask |= HEAP_XMIN_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+ InvalidTransactionId);
return HeapTupleInvisible;
}
- tuple->t_infomask |= HEAP_XMIN_COMMITTED;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+ InvalidTransactionId);
}
}
else if (tuple->t_infomask & HEAP_MOVED_IN)
@@ -567,14 +595,12 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid,
if (TransactionIdIsInProgress(xvac))
return HeapTupleInvisible;
if (TransactionIdDidCommit(xvac))
- {
- tuple->t_infomask |= HEAP_XMIN_COMMITTED;
- SetBufferCommitInfoNeedsSave(buffer);
- }
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+ InvalidTransactionId);
else
{
- tuple->t_infomask |= HEAP_XMIN_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+ InvalidTransactionId);
return HeapTupleInvisible;
}
}
@@ -595,8 +621,8 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid,
/* deleting subtransaction aborted? */
if (TransactionIdDidAbort(HeapTupleHeaderGetXmax(tuple)))
{
- tuple->t_infomask |= HEAP_XMAX_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+ InvalidTransactionId);
return HeapTupleMayBeUpdated;
}
@@ -610,15 +636,13 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid,
else if (TransactionIdIsInProgress(HeapTupleHeaderGetXmin(tuple)))
return HeapTupleInvisible;
else if (TransactionIdDidCommit(HeapTupleHeaderGetXmin(tuple)))
- {
- tuple->t_infomask |= HEAP_XMIN_COMMITTED;
- SetBufferCommitInfoNeedsSave(buffer);
- }
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+ HeapTupleHeaderGetXmin(tuple));
else
{
/* it must have aborted or crashed */
- tuple->t_infomask |= HEAP_XMIN_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+ InvalidTransactionId);
return HeapTupleInvisible;
}
}
@@ -642,8 +666,8 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid,
if (MultiXactIdIsRunning(HeapTupleHeaderGetXmax(tuple)))
return HeapTupleBeingUpdated;
- tuple->t_infomask |= HEAP_XMAX_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+ InvalidTransactionId);
return HeapTupleMayBeUpdated;
}
@@ -663,8 +687,8 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid,
if (!TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple)))
{
/* it must have aborted or crashed */
- tuple->t_infomask |= HEAP_XMAX_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+ InvalidTransactionId);
return HeapTupleMayBeUpdated;
}
@@ -672,13 +696,13 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid,
if (tuple->t_infomask & HEAP_IS_LOCKED)
{
- tuple->t_infomask |= HEAP_XMAX_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+ InvalidTransactionId);
return HeapTupleMayBeUpdated;
}
- tuple->t_infomask |= HEAP_XMAX_COMMITTED;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
+ HeapTupleHeaderGetXmax(tuple));
return HeapTupleUpdated; /* updated by other */
}
@@ -723,12 +747,12 @@ HeapTupleSatisfiesDirty(HeapTupleHeader tuple, Snapshot snapshot,
{
if (TransactionIdDidCommit(xvac))
{
- tuple->t_infomask |= HEAP_XMIN_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+ InvalidTransactionId);
return false;
}
- tuple->t_infomask |= HEAP_XMIN_COMMITTED;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+ InvalidTransactionId);
}
}
else if (tuple->t_infomask & HEAP_MOVED_IN)
@@ -740,14 +764,12 @@ HeapTupleSatisfiesDirty(HeapTupleHeader tuple, Snapshot snapshot,
if (TransactionIdIsInProgress(xvac))
return false;
if (TransactionIdDidCommit(xvac))
- {
- tuple->t_infomask |= HEAP_XMIN_COMMITTED;
- SetBufferCommitInfoNeedsSave(buffer);
- }
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+ InvalidTransactionId);
else
{
- tuple->t_infomask |= HEAP_XMIN_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+ InvalidTransactionId);
return false;
}
}
@@ -765,8 +787,8 @@ HeapTupleSatisfiesDirty(HeapTupleHeader tuple, Snapshot snapshot,
/* deleting subtransaction aborted? */
if (TransactionIdDidAbort(HeapTupleHeaderGetXmax(tuple)))
{
- tuple->t_infomask |= HEAP_XMAX_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+ InvalidTransactionId);
return true;
}
@@ -781,15 +803,13 @@ HeapTupleSatisfiesDirty(HeapTupleHeader tuple, Snapshot snapshot,
return true; /* in insertion by other */
}
else if (TransactionIdDidCommit(HeapTupleHeaderGetXmin(tuple)))
- {
- tuple->t_infomask |= HEAP_XMIN_COMMITTED;
- SetBufferCommitInfoNeedsSave(buffer);
- }
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+ HeapTupleHeaderGetXmin(tuple));
else
{
/* it must have aborted or crashed */
- tuple->t_infomask |= HEAP_XMIN_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+ InvalidTransactionId);
return false;
}
}
@@ -829,8 +849,8 @@ HeapTupleSatisfiesDirty(HeapTupleHeader tuple, Snapshot snapshot,
if (!TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple)))
{
/* it must have aborted or crashed */
- tuple->t_infomask |= HEAP_XMAX_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+ InvalidTransactionId);
return true;
}
@@ -838,13 +858,13 @@ HeapTupleSatisfiesDirty(HeapTupleHeader tuple, Snapshot snapshot,
if (tuple->t_infomask & HEAP_IS_LOCKED)
{
- tuple->t_infomask |= HEAP_XMAX_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+ InvalidTransactionId);
return true;
}
- tuple->t_infomask |= HEAP_XMAX_COMMITTED;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
+ HeapTupleHeaderGetXmax(tuple));
return false; /* updated by other */
}
@@ -888,12 +908,12 @@ HeapTupleSatisfiesMVCC(HeapTupleHeader tuple, Snapshot snapshot,
{
if (TransactionIdDidCommit(xvac))
{
- tuple->t_infomask |= HEAP_XMIN_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+ InvalidTransactionId);
return false;
}
- tuple->t_infomask |= HEAP_XMIN_COMMITTED;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+ InvalidTransactionId);
}
}
else if (tuple->t_infomask & HEAP_MOVED_IN)
@@ -905,14 +925,12 @@ HeapTupleSatisfiesMVCC(HeapTupleHeader tuple, Snapshot snapshot,
if (TransactionIdIsInProgress(xvac))
return false;
if (TransactionIdDidCommit(xvac))
- {
- tuple->t_infomask |= HEAP_XMIN_COMMITTED;
- SetBufferCommitInfoNeedsSave(buffer);
- }
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+ InvalidTransactionId);
else
{
- tuple->t_infomask |= HEAP_XMIN_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+ InvalidTransactionId);
return false;
}
}
@@ -934,8 +952,8 @@ HeapTupleSatisfiesMVCC(HeapTupleHeader tuple, Snapshot snapshot,
/* FIXME -- is this correct w.r.t. the cmax of the tuple? */
if (TransactionIdDidAbort(HeapTupleHeaderGetXmax(tuple)))
{
- tuple->t_infomask |= HEAP_XMAX_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+ InvalidTransactionId);
return true;
}
@@ -949,15 +967,13 @@ HeapTupleSatisfiesMVCC(HeapTupleHeader tuple, Snapshot snapshot,
else if (TransactionIdIsInProgress(HeapTupleHeaderGetXmin(tuple)))
return false;
else if (TransactionIdDidCommit(HeapTupleHeaderGetXmin(tuple)))
- {
- tuple->t_infomask |= HEAP_XMIN_COMMITTED;
- SetBufferCommitInfoNeedsSave(buffer);
- }
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+ HeapTupleHeaderGetXmin(tuple));
else
{
/* it must have aborted or crashed */
- tuple->t_infomask |= HEAP_XMIN_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+ InvalidTransactionId);
return false;
}
}
@@ -998,14 +1014,14 @@ HeapTupleSatisfiesMVCC(HeapTupleHeader tuple, Snapshot snapshot,
if (!TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple)))
{
/* it must have aborted or crashed */
- tuple->t_infomask |= HEAP_XMAX_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+ InvalidTransactionId);
return true;
}
/* xmax transaction committed */
- tuple->t_infomask |= HEAP_XMAX_COMMITTED;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
+ HeapTupleHeaderGetXmax(tuple));
}
/*
@@ -1054,12 +1070,12 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin,
return HEAPTUPLE_DELETE_IN_PROGRESS;
if (TransactionIdDidCommit(xvac))
{
- tuple->t_infomask |= HEAP_XMIN_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+ InvalidTransactionId);
return HEAPTUPLE_DEAD;
}
- tuple->t_infomask |= HEAP_XMIN_COMMITTED;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+ InvalidTransactionId);
}
else if (tuple->t_infomask & HEAP_MOVED_IN)
{
@@ -1070,14 +1086,12 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin,
if (TransactionIdIsInProgress(xvac))
return HEAPTUPLE_INSERT_IN_PROGRESS;
if (TransactionIdDidCommit(xvac))
- {
- tuple->t_infomask |= HEAP_XMIN_COMMITTED;
- SetBufferCommitInfoNeedsSave(buffer);
- }
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+ InvalidTransactionId);
else
{
- tuple->t_infomask |= HEAP_XMIN_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+ InvalidTransactionId);
return HEAPTUPLE_DEAD;
}
}
@@ -1091,21 +1105,22 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin,
return HEAPTUPLE_DELETE_IN_PROGRESS;
}
else if (TransactionIdDidCommit(HeapTupleHeaderGetXmin(tuple)))
- {
- tuple->t_infomask |= HEAP_XMIN_COMMITTED;
- SetBufferCommitInfoNeedsSave(buffer);
- }
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
+ HeapTupleHeaderGetXmin(tuple));
else
{
/*
* Not in Progress, Not Committed, so either Aborted or crashed
*/
- tuple->t_infomask |= HEAP_XMIN_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
+ InvalidTransactionId);
return HEAPTUPLE_DEAD;
}
- /* Should only get here if we set XMIN_COMMITTED */
- Assert(tuple->t_infomask & HEAP_XMIN_COMMITTED);
+ /*
+ * At this point the xmin is known committed, but we might not have
+ * been able to set the hint bit yet; so we can no longer Assert
+ * that it's set.
+ */
}
/*
@@ -1143,8 +1158,8 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin,
* We know that xmax did lock the tuple, but it did not and will
* never actually update it.
*/
- tuple->t_infomask |= HEAP_XMAX_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+ InvalidTransactionId);
}
return HEAPTUPLE_LIVE;
}
@@ -1161,21 +1176,22 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin,
if (TransactionIdIsInProgress(HeapTupleHeaderGetXmax(tuple)))
return HEAPTUPLE_DELETE_IN_PROGRESS;
else if (TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple)))
- {
- tuple->t_infomask |= HEAP_XMAX_COMMITTED;
- SetBufferCommitInfoNeedsSave(buffer);
- }
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
+ HeapTupleHeaderGetXmax(tuple));
else
{
/*
* Not in Progress, Not Committed, so either Aborted or crashed
*/
- tuple->t_infomask |= HEAP_XMAX_INVALID;
- SetBufferCommitInfoNeedsSave(buffer);
+ HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
+ InvalidTransactionId);
return HEAPTUPLE_LIVE;
}
- /* Should only get here if we set XMAX_COMMITTED */
- Assert(tuple->t_infomask & HEAP_XMAX_COMMITTED);
+ /*
+ * At this point the xmax is known committed, but we might not have
+ * been able to set the hint bit yet; so we can no longer Assert
+ * that it's set.
+ */
}
/*
diff --git a/src/include/access/clog.h b/src/include/access/clog.h
index f67eb2c048c..5e6cabe194b 100644
--- a/src/include/access/clog.h
+++ b/src/include/access/clog.h
@@ -6,7 +6,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/access/clog.h,v 1.19 2007/01/05 22:19:50 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/access/clog.h,v 1.20 2007/08/01 22:45:09 tgl Exp $
*/
#ifndef CLOG_H
#define CLOG_H
@@ -32,8 +32,8 @@ typedef int XidStatus;
#define NUM_CLOG_BUFFERS 8
-extern void TransactionIdSetStatus(TransactionId xid, XidStatus status);
-extern XidStatus TransactionIdGetStatus(TransactionId xid);
+extern void TransactionIdSetStatus(TransactionId xid, XidStatus status, XLogRecPtr lsn);
+extern XidStatus TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn);
extern Size CLOGShmemSize(void);
extern void CLOGShmemInit(void);
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index 6cb2e5294e5..d7fb404f4f6 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/access/gist_private.h,v 1.26 2007/01/20 18:43:35 neilc Exp $
+ * $PostgreSQL: pgsql/src/include/access/gist_private.h,v 1.27 2007/08/01 22:45:09 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -200,8 +200,6 @@ typedef struct GistSplitVector
* distributed between left and right pages */
} GistSplitVector;
-#define XLogRecPtrIsInvalid( r ) ( (r).xlogid == 0 && (r).xrecoff == 0 )
-
typedef struct
{
Relation r;
diff --git a/src/include/access/slru.h b/src/include/access/slru.h
index 3cc30e76b70..9e18b9608be 100644
--- a/src/include/access/slru.h
+++ b/src/include/access/slru.h
@@ -6,13 +6,14 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/access/slru.h,v 1.20 2007/01/05 22:19:51 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/access/slru.h,v 1.21 2007/08/01 22:45:09 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef SLRU_H
#define SLRU_H
+#include "access/xlogdefs.h"
#include "storage/lwlock.h"
@@ -51,6 +52,17 @@ typedef struct SlruSharedData
int *page_lru_count;
LWLockId *buffer_locks;
+ /*
+ * Optional array of WAL flush LSNs associated with entries in the SLRU
+ * pages. If not zero/NULL, we must flush WAL before writing pages (true
+ * for pg_clog, false for multixact and pg_subtrans). group_lsn[] has
+ * lsn_groups_per_page entries per buffer slot, each containing the
+ * highest LSN known for a contiguous group of SLRU entries on that slot's
+ * page.
+ */
+ XLogRecPtr *group_lsn;
+ int lsn_groups_per_page;
+
/*----------
* We mark a page "most recently used" by setting
* page_lru_count[slotno] = ++cur_lru_count;
@@ -81,8 +93,8 @@ typedef struct SlruCtlData
SlruShared shared;
/*
- * This flag tells whether to fsync writes (true for pg_clog, false for
- * pg_subtrans).
+ * This flag tells whether to fsync writes (true for pg_clog and multixact,
+ * false for pg_subtrans).
*/
bool do_fsync;
@@ -106,11 +118,12 @@ typedef SlruCtlData *SlruCtl;
typedef struct SlruFlushData *SlruFlush;
-extern Size SimpleLruShmemSize(int nslots);
-extern void SimpleLruInit(SlruCtl ctl, const char *name, int nslots,
+extern Size SimpleLruShmemSize(int nslots, int nlsns);
+extern void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
LWLockId ctllock, const char *subdir);
extern int SimpleLruZeroPage(SlruCtl ctl, int pageno);
-extern int SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid);
+extern int SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok,
+ TransactionId xid);
extern int SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno,
TransactionId xid);
extern void SimpleLruWritePage(SlruCtl ctl, int slotno, SlruFlush fdata);
diff --git a/src/include/access/transam.h b/src/include/access/transam.h
index b6fadcd4362..98850cc0d3b 100644
--- a/src/include/access/transam.h
+++ b/src/include/access/transam.h
@@ -7,13 +7,15 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/access/transam.h,v 1.60 2007/01/05 22:19:51 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/access/transam.h,v 1.61 2007/08/01 22:45:09 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef TRANSAM_H
#define TRANSAM_H
+#include "access/xlogdefs.h"
+
/* ----------------
* Special transaction ID values
@@ -115,14 +117,17 @@ extern VariableCache ShmemVariableCache;
extern bool TransactionIdDidCommit(TransactionId transactionId);
extern bool TransactionIdDidAbort(TransactionId transactionId);
extern void TransactionIdCommit(TransactionId transactionId);
+extern void TransactionIdAsyncCommit(TransactionId transactionId, XLogRecPtr lsn);
extern void TransactionIdAbort(TransactionId transactionId);
extern void TransactionIdSubCommit(TransactionId transactionId);
extern void TransactionIdCommitTree(int nxids, TransactionId *xids);
+extern void TransactionIdAsyncCommitTree(int nxids, TransactionId *xids, XLogRecPtr lsn);
extern void TransactionIdAbortTree(int nxids, TransactionId *xids);
extern bool TransactionIdPrecedes(TransactionId id1, TransactionId id2);
extern bool TransactionIdPrecedesOrEquals(TransactionId id1, TransactionId id2);
extern bool TransactionIdFollows(TransactionId id1, TransactionId id2);
extern bool TransactionIdFollowsOrEquals(TransactionId id1, TransactionId id2);
+extern XLogRecPtr TransactionIdGetCommitLSN(TransactionId xid);
/* in transam/varsup.c */
extern TransactionId GetNewTransactionId(bool isSubXact);
diff --git a/src/include/access/xact.h b/src/include/access/xact.h
index 759eab1a3d9..e8e2b08de42 100644
--- a/src/include/access/xact.h
+++ b/src/include/access/xact.h
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/access/xact.h,v 1.87 2007/04/30 21:01:53 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/xact.h,v 1.88 2007/08/01 22:45:09 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -41,6 +41,9 @@ extern int XactIsoLevel;
extern bool DefaultXactReadOnly;
extern bool XactReadOnly;
+/* Asynchronous commits */
+extern bool XactSyncCommit;
+
/*
* start- and end-of-transaction callbacks for dynamically loaded modules
*/
@@ -147,6 +150,7 @@ extern void SetCurrentStatementStartTimestamp(void);
extern int GetCurrentTransactionNestLevel(void);
extern bool TransactionIdIsCurrentTransactionId(TransactionId xid);
extern void CommandCounterIncrement(void);
+extern void ForceSyncCommit(void);
extern void StartTransactionCommand(void);
extern void CommitTransactionCommand(void);
extern void AbortCurrentTransaction(void);
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index adc99a6eb06..2e1928dace0 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -6,7 +6,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.81 2007/07/24 04:54:09 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.82 2007/08/01 22:45:09 tgl Exp $
*/
#ifndef XLOG_H
#define XLOG_H
@@ -197,8 +197,11 @@ extern CheckpointStatsData CheckpointStats;
extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata);
extern void XLogFlush(XLogRecPtr RecPtr);
extern void XLogBackgroundFlush(void);
+extern void XLogAsyncCommitFlush(void);
extern bool XLogNeedsFlush(XLogRecPtr RecPtr);
+extern void XLogSetAsyncCommitLSN(XLogRecPtr record);
+
extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record);
extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec);
diff --git a/src/include/access/xlogdefs.h b/src/include/access/xlogdefs.h
index ceca7794325..843f078d656 100644
--- a/src/include/access/xlogdefs.h
+++ b/src/include/access/xlogdefs.h
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/access/xlogdefs.h,v 1.17 2007/02/14 05:00:40 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/access/xlogdefs.h,v 1.18 2007/08/01 22:45:09 tgl Exp $
*/
#ifndef XLOG_DEFS_H
#define XLOG_DEFS_H
@@ -33,6 +33,8 @@ typedef struct XLogRecPtr
uint32 xrecoff; /* byte offset of location in log file */
} XLogRecPtr;
+#define XLogRecPtrIsInvalid(r) ((r).xrecoff == 0)
+
/*
* Macros for comparing XLogRecPtrs