diff options
author | Simon Riggs <simon@2ndQuadrant.com> | 2017-03-22 07:05:12 +0000 |
---|---|---|
committer | Simon Riggs <simon@2ndQuadrant.com> | 2017-03-22 07:05:12 +0000 |
commit | 1148e22a82edc96172fc78855da392b6f0015c88 (patch) | |
tree | ba60e49124bf0eb9f003ee11f2d2a275dffddb1b /src/backend/access/transam/xlogutils.c | |
parent | 9ca2dd578db4086ae8a6eb6fd82ac376b7b2804e (diff) | |
download | postgresql-1148e22a82edc96172fc78855da392b6f0015c88.tar.gz postgresql-1148e22a82edc96172fc78855da392b6f0015c88.zip |
Teach xlogreader to follow timeline switches
Uses page-based mechanism to ensure we’re using the correct timeline.
Tests are included to exercise the functionality using a cold disk-level copy
of the master that's started up as a replica with slots intact, but the
intended use of the functionality is with later features.
Craig Ringer, reviewed by Simon Riggs and Andres Freund
Diffstat (limited to 'src/backend/access/transam/xlogutils.c')
-rw-r--r-- | src/backend/access/transam/xlogutils.c | 215 |
1 files changed, 201 insertions, 14 deletions
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index b2b9fcbebb0..28c07d37c17 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -19,6 +19,7 @@ #include <unistd.h> +#include "access/timeline.h" #include "access/xlog.h" #include "access/xlog_internal.h" #include "access/xlogutils.h" @@ -662,6 +663,7 @@ XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count) /* state maintained across calls */ static int sendFile = -1; static XLogSegNo sendSegNo = 0; + static TimeLineID sendTLI = 0; static uint32 sendOff = 0; p = buf; @@ -677,7 +679,8 @@ XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count) startoff = recptr % XLogSegSize; /* Do we need to switch to a different xlog segment? */ - if (sendFile < 0 || !XLByteInSeg(recptr, sendSegNo)) + if (sendFile < 0 || !XLByteInSeg(recptr, sendSegNo) || + sendTLI != tli) { char path[MAXPGPATH]; @@ -704,6 +707,7 @@ XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count) path))); } sendOff = 0; + sendTLI = tli; } /* Need to seek in the file? */ @@ -754,6 +758,133 @@ XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count) } /* + * Determine which timeline to read an xlog page from and set the + * XLogReaderState's currTLI to that timeline ID. + * + * We care about timelines in xlogreader when we might be reading xlog + * generated prior to a promotion, either if we're currently a standby in + * recovery or if we're a promoted master reading xlogs generated by the old + * master before our promotion. + * + * wantPage must be set to the start address of the page to read and + * wantLength to the amount of the page that will be read, up to + * XLOG_BLCKSZ. If the amount to be read isn't known, pass XLOG_BLCKSZ. + * + * We switch to an xlog segment from the new timeline eagerly when on a + * historical timeline, as soon as we reach the start of the xlog segment + * containing the timeline switch. The server copied the segment to the new + * timeline so all the data up to the switch point is the same, but there's no + * guarantee the old segment will still exist. It may have been deleted or + * renamed with a .partial suffix so we can't necessarily keep reading from + * the old TLI even though tliSwitchPoint says it's OK. + * + * We can't just check the timeline when we read a page on a different segment + * to the last page. We could've received a timeline switch from a cascading + * upstream, so the current segment ends apruptly (possibly getting renamed to + * .partial) and we have to switch to a new one. Even in the middle of reading + * a page we could have to dump the cached page and switch to a new TLI. + * + * Because of this, callers MAY NOT assume that currTLI is the timeline that + * will be in a page's xlp_tli; the page may begin on an older timeline or we + * might be reading from historical timeline data on a segment that's been + * copied to a new timeline. + * + * The caller must also make sure it doesn't read past the current replay + * position (using GetWalRcvWriteRecPtr) if executing in recovery, so it + * doesn't fail to notice that the current timeline became historical. The + * caller must also update ThisTimeLineID with the result of + * GetWalRcvWriteRecPtr and must check RecoveryInProgress(). + */ +void +XLogReadDetermineTimeline(XLogReaderState *state, XLogRecPtr wantPage, uint32 wantLength) +{ + const XLogRecPtr lastReadPage = state->readSegNo * XLogSegSize + state->readOff; + + Assert(wantPage != InvalidXLogRecPtr && wantPage % XLOG_BLCKSZ == 0); + Assert(wantLength <= XLOG_BLCKSZ); + Assert(state->readLen == 0 || state->readLen <= XLOG_BLCKSZ); + + /* + * If the desired page is currently read in and valid, we have nothing to do. + * + * The caller should've ensured that it didn't previously advance readOff + * past the valid limit of this timeline, so it doesn't matter if the current + * TLI has since become historical. + */ + if (lastReadPage == wantPage && + state->readLen != 0 && + lastReadPage + state->readLen >= wantPage + Min(wantLength,XLOG_BLCKSZ-1)) + return; + + /* + * If we're reading from the current timeline, it hasn't become historical + * and the page we're reading is after the last page read, we can again + * just carry on. (Seeking backwards requires a check to make sure the older + * page isn't on a prior timeline). + * + * ThisTimeLineID might've become historical since we last looked, but the + * caller is required not to read past the flush limit it saw at the time + * it looked up the timeline. There's nothing we can do about it if + * StartupXLOG() renames it to .partial concurrently. + */ + if (state->currTLI == ThisTimeLineID && wantPage >= lastReadPage) + { + Assert(state->currTLIValidUntil == InvalidXLogRecPtr); + return; + } + + /* + * If we're just reading pages from a previously validated historical + * timeline and the timeline we're reading from is valid until the + * end of the current segment we can just keep reading. + */ + if (state->currTLIValidUntil != InvalidXLogRecPtr && + state->currTLI != ThisTimeLineID && + state->currTLI != 0 && + (wantPage + wantLength) / XLogSegSize < state->currTLIValidUntil / XLogSegSize) + return; + + /* + * If we reach this point we're either looking up a page for random access, + * the current timeline just became historical, or we're reading from a new + * segment containing a timeline switch. In all cases we need to determine + * the newest timeline on the segment. + * + * If it's the current timeline we can just keep reading from here unless + * we detect a timeline switch that makes the current timeline historical. + * If it's a historical timeline we can read all the segment on the newest + * timeline because it contains all the old timelines' data too. So only + * one switch check is required. + */ + { + /* + * We need to re-read the timeline history in case it's been changed + * by a promotion or replay from a cascaded replica. + */ + List *timelineHistory = readTimeLineHistory(ThisTimeLineID); + + XLogRecPtr endOfSegment = (((wantPage / XLogSegSize) + 1) * XLogSegSize) - 1; + + Assert(wantPage / XLogSegSize == endOfSegment / XLogSegSize); + + /* Find the timeline of the last LSN on the segment containing wantPage. */ + state->currTLI = tliOfPointInHistory(endOfSegment, timelineHistory); + state->currTLIValidUntil = tliSwitchPoint(state->currTLI, timelineHistory, + &state->nextTLI); + + Assert(state->currTLIValidUntil == InvalidXLogRecPtr || + wantPage + wantLength < state->currTLIValidUntil); + + list_free_deep(timelineHistory); + + elog(DEBUG3, "switched to timeline %u valid until %X/%X", + state->currTLI, + (uint32)(state->currTLIValidUntil >> 32), + (uint32)(state->currTLIValidUntil)); + } +} + +/* * read_page callback for reading local xlog files * * Public because it would likely be very helpful for someone writing another @@ -774,28 +905,84 @@ read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, int count; loc = targetPagePtr + reqLen; + + /* Loop waiting for xlog to be available if necessary */ while (1) { /* - * TODO: we're going to have to do something more intelligent about - * timelines on standbys. Use readTimeLineHistory() and - * tliOfPointInHistory() to get the proper LSN? For now we'll catch - * that case earlier, but the code and TODO is left in here for when - * that changes. + * Determine the limit of xlog we can currently read to, and what the + * most recent timeline is. + * + * RecoveryInProgress() will update ThisTimeLineID when it first + * notices recovery finishes, so we only have to maintain it for the + * local process until recovery ends. */ if (!RecoveryInProgress()) - { - *pageTLI = ThisTimeLineID; read_upto = GetFlushRecPtr(); - } else - read_upto = GetXLogReplayRecPtr(pageTLI); + read_upto = GetXLogReplayRecPtr(&ThisTimeLineID); - if (loc <= read_upto) - break; + *pageTLI = ThisTimeLineID; + + /* + * Check which timeline to get the record from. + * + * We have to do it each time through the loop because if we're in + * recovery as a cascading standby, the current timeline might've + * become historical. We can't rely on RecoveryInProgress() because + * in a standby configuration like + * + * A => B => C + * + * if we're a logical decoding session on C, and B gets promoted, our + * timeline will change while we remain in recovery. + * + * We can't just keep reading from the old timeline as the last WAL + * archive in the timeline will get renamed to .partial by StartupXLOG(). + * + * If that happens after our caller updated ThisTimeLineID but before + * we actually read the xlog page, we might still try to read from the + * old (now renamed) segment and fail. There's not much we can do about + * this, but it can only happen when we're a leaf of a cascading + * standby whose master gets promoted while we're decoding, so a + * one-off ERROR isn't too bad. + */ + XLogReadDetermineTimeline(state, targetPagePtr, reqLen); + + if (state->currTLI == ThisTimeLineID) + { - CHECK_FOR_INTERRUPTS(); - pg_usleep(1000L); + if (loc <= read_upto) + break; + + CHECK_FOR_INTERRUPTS(); + pg_usleep(1000L); + } + else + { + /* + * We're on a historical timeline, so limit reading to the switch + * point where we moved to the next timeline. + * + * We don't need to GetFlushRecPtr or GetXLogReplayRecPtr. We know + * about the new timeline, so we must've received past the end of + * it. + */ + read_upto = state->currTLIValidUntil; + + /* + * Setting pageTLI to our wanted record's TLI is slightly wrong; + * the page might begin on an older timeline if it contains a + * timeline switch, since its xlog segment will have been copied + * from the prior timeline. This is pretty harmless though, as + * nothing cares so long as the timeline doesn't go backwards. We + * should read the page header instead; FIXME someday. + */ + *pageTLI = state->currTLI; + + /* No need to wait on a historical timeline */ + break; + } } if (targetPagePtr + XLOG_BLCKSZ <= read_upto) |