diff options
author | Alvaro Herrera <alvherre@alvh.no-ip.org> | 2008-10-20 19:18:18 +0000 |
---|---|---|
committer | Alvaro Herrera <alvherre@alvh.no-ip.org> | 2008-10-20 19:18:18 +0000 |
commit | 06da3c570f21394003fc392d80f54862f7dec19f (patch) | |
tree | 2e4c5dc1cb78d87e12fc1495b084bfaf5e69e737 /src/backend/access/transam/clog.c | |
parent | 3afffbc902f16f5b9abd2464a2bbc17d9bc63316 (diff) | |
download | postgresql-06da3c570f21394003fc392d80f54862f7dec19f.tar.gz postgresql-06da3c570f21394003fc392d80f54862f7dec19f.zip |
Rework subtransaction commit protocol for hot standby.
This patch eliminates the marking of subtransactions as SUBCOMMITTED in pg_clog
during their commit; instead they remain in-progress until main transaction
commit. At main transaction commit, the commit protocol is atomic-by-page
instead of one transaction at a time. To avoid a race condition with some
subtransactions appearing committed before others in the case where they span
more than one pg_clog page, we conserve the logic that marks them subcommitted
before marking the parent committed.
Simon Riggs with minor help from me
Diffstat (limited to 'src/backend/access/transam/clog.c')
-rw-r--r-- | src/backend/access/transam/clog.c | 229 |
1 files changed, 214 insertions, 15 deletions
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index f30ef3a2262..fcff3ea3cfd 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -26,7 +26,7 @@ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.47 2008/08/01 13:16:08 alvherre Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.48 2008/10/20 19:18:18 alvherre Exp $ * *------------------------------------------------------------------------- */ @@ -80,32 +80,182 @@ static int ZeroCLOGPage(int pageno, bool writeXlog); static bool CLOGPagePrecedes(int page1, int page2); static void WriteZeroPageXlogRec(int pageno); static void WriteTruncateXlogRec(int pageno); +static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids, + TransactionId *subxids, XidStatus status, + XLogRecPtr lsn, int pageno); +static void TransactionIdSetStatusBit(TransactionId xid, XidStatus status, + XLogRecPtr lsn, int slotno); +static void set_status_by_pages(int nsubxids, TransactionId *subxids, + XidStatus status, XLogRecPtr lsn); /* - * Record the final state of a transaction in the commit log. + * TransactionIdSetTreeStatus + * + * Record the final state of transaction entries in the commit log for + * a transaction and its subtransaction tree. Take care to ensure this is + * efficient, and as atomic as possible. + * + * xid is a single xid to set status for. This will typically be + * the top level transactionid for a top level commit or abort. It can + * also be a subtransaction when we record transaction aborts. + * + * subxids is an array of xids of length nsubxids, representing subtransactions + * in the tree of xid. In various cases nsubxids may be zero. * * lsn must be the WAL location of the commit record when recording an async * commit. For a synchronous commit it can be InvalidXLogRecPtr, since the * caller guarantees the commit record is already flushed in that case. It * should be InvalidXLogRecPtr for abort cases, too. * + * In the commit case, atomicity is limited by whether all the subxids are in + * the same CLOG page as xid. If they all are, then the lock will be grabbed + * only once, and the status will be set to committed directly. Otherwise + * we must + * 1. set sub-committed all subxids that are not on the same page as the + * main xid + * 2. atomically set committed the main xid and the subxids on the same page + * 3. go over the first bunch again and set them committed + * Note that as far as concurrent checkers are concerned, main transaction + * commit as a whole is still atomic. + * + * Example: + * TransactionId t commits and has subxids t1, t2, t3, t4 + * t is on page p1, t1 is also on p1, t2 and t3 are on p2, t4 is on p3 + * 1. update pages2-3: + * page2: set t2,t3 as sub-committed + * page3: set t4 as sub-committed + * 2. update page1: + * set t1 as sub-committed, + * then set t as committed, + then set t1 as committed + * 3. update pages2-3: + * page2: set t2,t3 as committed + * page3: set t4 as committed + * * NB: this is a low-level routine and is NOT the preferred entry point - * for most uses; TransactionLogUpdate() in transam.c is the intended caller. + * for most uses; functions in transam.c are the intended callers. + * + * XXX Think about issuing FADVISE_WILLNEED on pages that we will need, + * but aren't yet in cache, as well as hinting pages not to fall out of + * cache yet. */ void -TransactionIdSetStatus(TransactionId xid, XidStatus status, XLogRecPtr lsn) +TransactionIdSetTreeStatus(TransactionId xid, int nsubxids, + TransactionId *subxids, XidStatus status, XLogRecPtr lsn) +{ + int pageno = TransactionIdToPage(xid); /* get page of parent */ + int i; + + Assert(status == TRANSACTION_STATUS_COMMITTED || + status == TRANSACTION_STATUS_ABORTED); + + /* + * See how many subxids, if any, are on the same page as the parent, if any. + */ + for (i = 0; i < nsubxids; i++) + { + if (TransactionIdToPage(subxids[i]) != pageno) + break; + } + + /* + * Do all items fit on a single page? + */ + if (i == nsubxids) + { + /* + * Set the parent and all subtransactions in a single call + */ + TransactionIdSetPageStatus(xid, nsubxids, subxids, status, lsn, + pageno); + } + else + { + int nsubxids_on_first_page = i; + + /* + * If this is a commit then we care about doing this correctly (i.e. + * using the subcommitted intermediate status). By here, we know we're + * updating more than one page of clog, so we must mark entries that + * are *not* on the first page so that they show as subcommitted before + * we then return to update the status to fully committed. + * + * To avoid touching the first page twice, skip marking subcommitted + * for the subxids on that first page. + */ + if (status == TRANSACTION_STATUS_COMMITTED) + set_status_by_pages(nsubxids - nsubxids_on_first_page, + subxids + nsubxids_on_first_page, + TRANSACTION_STATUS_SUB_COMMITTED, lsn); + + /* + * Now set the parent and subtransactions on same page as the parent, + * if any + */ + pageno = TransactionIdToPage(xid); + TransactionIdSetPageStatus(xid, nsubxids_on_first_page, subxids, status, + lsn, pageno); + + /* + * Now work through the rest of the subxids one clog page at a time, + * starting from the second page onwards, like we did above. + */ + set_status_by_pages(nsubxids - nsubxids_on_first_page, + subxids + nsubxids_on_first_page, + status, lsn); + } +} + +/* + * Helper for TransactionIdSetTreeStatus: set the status for a bunch of + * transactions, chunking in the separate CLOG pages involved. We never + * pass the whole transaction tree to this function, only subtransactions + * that are on different pages to the top level transaction id. + */ +static void +set_status_by_pages(int nsubxids, TransactionId *subxids, + XidStatus status, XLogRecPtr lsn) +{ + int pageno = TransactionIdToPage(subxids[0]); + int offset = 0; + int i = 0; + + while (i < nsubxids) + { + int num_on_page = 0; + + while (TransactionIdToPage(subxids[i]) == pageno && i < nsubxids) + { + num_on_page++; + i++; + } + + TransactionIdSetPageStatus(InvalidTransactionId, + num_on_page, subxids + offset, + status, lsn, pageno); + offset = i; + pageno = TransactionIdToPage(subxids[offset]); + } +} + +/* + * Record the final state of transaction entries in the commit log for + * all entries on a single page. Atomic only on this page. + * + * Otherwise API is same as TransactionIdSetTreeStatus() + */ +static void +TransactionIdSetPageStatus(TransactionId xid, int nsubxids, + TransactionId *subxids, XidStatus status, + XLogRecPtr lsn, int pageno) { - int pageno = TransactionIdToPage(xid); - int byteno = TransactionIdToByte(xid); - int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT; int slotno; - char *byteptr; - char byteval; + int i; Assert(status == TRANSACTION_STATUS_COMMITTED || status == TRANSACTION_STATUS_ABORTED || - status == TRANSACTION_STATUS_SUB_COMMITTED); + (status == TRANSACTION_STATUS_SUB_COMMITTED && !TransactionIdIsValid(xid))); LWLockAcquire(CLogControlLock, LW_EXCLUSIVE); @@ -116,9 +266,62 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status, XLogRecPtr lsn) * mustn't let it reach disk until we've done the appropriate WAL flush. * But when lsn is invalid, it's OK to scribble on a page while it is * write-busy, since we don't care if the update reaches disk sooner than - * we think. Hence, pass write_ok = XLogRecPtrIsInvalid(lsn). + * we think. */ slotno = SimpleLruReadPage(ClogCtl, pageno, XLogRecPtrIsInvalid(lsn), xid); + + /* + * Set the main transaction id, if any. + * + * If we update more than one xid on this page while it is being written + * out, we might find that some of the bits go to disk and others don't. + * If we are updating commits on the page with the top-level xid that could + * break atomicity, so we subcommit the subxids first before we mark the + * top-level commit. + */ + if (TransactionIdIsValid(xid)) + { + /* Subtransactions first, if needed ... */ + if (status == TRANSACTION_STATUS_COMMITTED) + { + for (i = 0; i < nsubxids; i++) + { + Assert(ClogCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i])); + TransactionIdSetStatusBit(subxids[i], + TRANSACTION_STATUS_SUB_COMMITTED, + lsn, slotno); + } + } + + /* ... then the main transaction */ + TransactionIdSetStatusBit(xid, status, lsn, slotno); + } + + /* Set the subtransactions */ + for (i = 0; i < nsubxids; i++) + { + Assert(ClogCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i])); + TransactionIdSetStatusBit(subxids[i], status, lsn, slotno); + } + + ClogCtl->shared->page_dirty[slotno] = true; + + LWLockRelease(CLogControlLock); +} + +/* + * Sets the commit status of a single transaction. + * + * Must be called with CLogControlLock held + */ +static void +TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, int slotno) +{ + int byteno = TransactionIdToByte(xid); + int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT; + char *byteptr; + char byteval; + byteptr = ClogCtl->shared->page_buffer[slotno] + byteno; /* Current state should be 0, subcommitted or target state */ @@ -132,8 +335,6 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status, XLogRecPtr lsn) byteval |= (status << bshift); *byteptr = byteval; - ClogCtl->shared->page_dirty[slotno] = true; - /* * Update the group LSN if the transaction completion LSN is higher. * @@ -149,8 +350,6 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status, XLogRecPtr lsn) if (XLByteLT(ClogCtl->shared->group_lsn[lsnindex], lsn)) ClogCtl->shared->group_lsn[lsnindex] = lsn; } - - LWLockRelease(CLogControlLock); } /* |