aboutsummaryrefslogtreecommitdiff
path: root/src/backend/access/transam/xlogutils.c
diff options
context:
space:
mode:
authorSimon Riggs <simon@2ndQuadrant.com>2017-03-22 07:05:12 +0000
committerSimon Riggs <simon@2ndQuadrant.com>2017-03-22 07:05:12 +0000
commit1148e22a82edc96172fc78855da392b6f0015c88 (patch)
treeba60e49124bf0eb9f003ee11f2d2a275dffddb1b /src/backend/access/transam/xlogutils.c
parent9ca2dd578db4086ae8a6eb6fd82ac376b7b2804e (diff)
downloadpostgresql-1148e22a82edc96172fc78855da392b6f0015c88.tar.gz
postgresql-1148e22a82edc96172fc78855da392b6f0015c88.zip
Teach xlogreader to follow timeline switches
Uses a page-based mechanism to ensure we're using the correct timeline. Tests are included to exercise the functionality using a cold disk-level copy of the master that's started up as a replica with slots intact, but the intended use of the functionality is with later features. Author: Craig Ringer; reviewed by Simon Riggs and Andres Freund.
Diffstat (limited to 'src/backend/access/transam/xlogutils.c')
-rw-r--r--src/backend/access/transam/xlogutils.c215
1 files changed, 201 insertions, 14 deletions
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index b2b9fcbebb0..28c07d37c17 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -19,6 +19,7 @@
#include <unistd.h>
+#include "access/timeline.h"
#include "access/xlog.h"
#include "access/xlog_internal.h"
#include "access/xlogutils.h"
@@ -662,6 +663,7 @@ XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count)
/* state maintained across calls */
static int sendFile = -1;
static XLogSegNo sendSegNo = 0;
+ static TimeLineID sendTLI = 0;
static uint32 sendOff = 0;
p = buf;
@@ -677,7 +679,8 @@ XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count)
startoff = recptr % XLogSegSize;
/* Do we need to switch to a different xlog segment? */
- if (sendFile < 0 || !XLByteInSeg(recptr, sendSegNo))
+ if (sendFile < 0 || !XLByteInSeg(recptr, sendSegNo) ||
+ sendTLI != tli)
{
char path[MAXPGPATH];
@@ -704,6 +707,7 @@ XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count)
path)));
}
sendOff = 0;
+ sendTLI = tli;
}
/* Need to seek in the file? */
@@ -754,6 +758,133 @@ XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count)
}
/*
+ * Determine which timeline to read an xlog page from and set the
+ * XLogReaderState's currTLI to that timeline ID (currTLIValidUntil is refreshed at the same time).
+ *
+ * We care about timelines in xlogreader when we might be reading xlog
+ * generated prior to a promotion, either if we're currently a standby in
+ * recovery or if we're a promoted master reading xlogs generated by the old
+ * master before our promotion.
+ *
+ * wantPage must be set to the start address of the page to read and
+ * wantLength to the amount of the page that will be read, up to
+ * XLOG_BLCKSZ. If the amount to be read isn't known, pass XLOG_BLCKSZ.
+ *
+ * We switch to an xlog segment from the new timeline eagerly when on a
+ * historical timeline, as soon as we reach the start of the xlog segment
+ * containing the timeline switch. The server copied the segment to the new
+ * timeline so all the data up to the switch point is the same, but there's no
+ * guarantee the old segment will still exist. It may have been deleted or
+ * renamed with a .partial suffix so we can't necessarily keep reading from
+ * the old TLI even though tliSwitchPoint says it's OK.
+ *
+ * We can't just check the timeline when we read a page on a different segment
+ * to the last page. We could've received a timeline switch from a cascading
+ * upstream, so the current segment ends abruptly (possibly getting renamed to
+ * .partial) and we have to switch to a new one. Even in the middle of reading
+ * a page we could have to dump the cached page and switch to a new TLI.
+ *
+ * Because of this, callers MAY NOT assume that currTLI is the timeline that
+ * will be in a page's xlp_tli; the page may begin on an older timeline or we
+ * might be reading from historical timeline data on a segment that's been
+ * copied to a new timeline.
+ *
+ * The caller must also make sure it doesn't read past the current replay
+ * position (using GetXLogReplayRecPtr) if executing in recovery, so it
+ * doesn't fail to notice that the current timeline became historical. The
+ * caller must also update ThisTimeLineID with the result of
+ * GetXLogReplayRecPtr and must check RecoveryInProgress().
+ */
+void
+XLogReadDetermineTimeline(XLogReaderState *state, XLogRecPtr wantPage, uint32 wantLength)
+{
+ const XLogRecPtr lastReadPage = state->readSegNo * XLogSegSize + state->readOff;
+
+ Assert(wantPage != InvalidXLogRecPtr && wantPage % XLOG_BLCKSZ == 0);
+ Assert(wantLength <= XLOG_BLCKSZ);
+ Assert(state->readLen == 0 || state->readLen <= XLOG_BLCKSZ);
+
+ /*
+ * If the desired page is currently read in and valid, we have nothing to do.
+ *
+ * The caller should've ensured that it didn't previously advance readOff
+ * past the valid limit of this timeline, so it doesn't matter if the current
+ * TLI has since become historical.
+ */
+ if (lastReadPage == wantPage &&
+ state->readLen != 0 &&
+ lastReadPage + state->readLen >= wantPage + Min(wantLength,XLOG_BLCKSZ-1))
+ return;
+
+ /*
+ * If we're reading from the current timeline, it hasn't become historical
+ * and the page we're reading is at or after the last page read, we can again
+ * just carry on. (Seeking backwards requires a check to make sure the older
+ * page isn't on a prior timeline).
+ *
+ * ThisTimeLineID might've become historical since we last looked, but the
+ * caller is required not to read past the flush limit it saw at the time
+ * it looked up the timeline. There's nothing we can do about it if
+ * StartupXLOG() renames it to .partial concurrently.
+ */
+ if (state->currTLI == ThisTimeLineID && wantPage >= lastReadPage)
+ {
+ Assert(state->currTLIValidUntil == InvalidXLogRecPtr);
+ return;
+ }
+
+ /*
+ * If we're just reading pages from a previously validated historical
+ * timeline and the timeline we're reading from is valid until the
+ * end of the current segment we can just keep reading.
+ */
+ if (state->currTLIValidUntil != InvalidXLogRecPtr &&
+ state->currTLI != ThisTimeLineID &&
+ state->currTLI != 0 &&
+ (wantPage + wantLength) / XLogSegSize < state->currTLIValidUntil / XLogSegSize)
+ return;
+
+ /*
+ * If we reach this point we're either looking up a page for random access,
+ * the current timeline just became historical, or we're reading from a new
+ * segment containing a timeline switch. In all cases we need to determine
+ * the newest timeline on the segment.
+ *
+ * If it's the current timeline we can just keep reading from here unless
+ * we detect a timeline switch that makes the current timeline historical.
+ * If it's a historical timeline we can read all the segment on the newest
+ * timeline because it contains all the old timelines' data too. So only
+ * one switch check is required.
+ */
+ {
+ /*
+ * We need to re-read the timeline history in case it's been changed
+ * by a promotion or replay from a cascaded replica.
+ */
+ List *timelineHistory = readTimeLineHistory(ThisTimeLineID);
+
+ XLogRecPtr endOfSegment = (((wantPage / XLogSegSize) + 1) * XLogSegSize) - 1;
+
+ Assert(wantPage / XLogSegSize == endOfSegment / XLogSegSize);
+
+ /* Find the timeline of the last LSN on the segment containing wantPage. */
+ state->currTLI = tliOfPointInHistory(endOfSegment, timelineHistory);
+ state->currTLIValidUntil = tliSwitchPoint(state->currTLI, timelineHistory,
+ &state->nextTLI);
+
+ Assert(state->currTLIValidUntil == InvalidXLogRecPtr ||
+ wantPage + wantLength < state->currTLIValidUntil);
+
+ list_free_deep(timelineHistory);
+
+ elog(DEBUG3, "switched to timeline %u valid until %X/%X",
+ state->currTLI,
+ (uint32)(state->currTLIValidUntil >> 32),
+ (uint32)(state->currTLIValidUntil));
+ }
+}
+
+/*
* read_page callback for reading local xlog files
*
* Public because it would likely be very helpful for someone writing another
@@ -774,28 +905,84 @@ read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr,
int count;
loc = targetPagePtr + reqLen;
+
+ /* Loop waiting for xlog to be available if necessary */
while (1)
{
/*
- * TODO: we're going to have to do something more intelligent about
- * timelines on standbys. Use readTimeLineHistory() and
- * tliOfPointInHistory() to get the proper LSN? For now we'll catch
- * that case earlier, but the code and TODO is left in here for when
- * that changes.
+ * Determine the limit of xlog we can currently read to, and what the
+ * most recent timeline is.
+ *
+ * RecoveryInProgress() will update ThisTimeLineID when it first
+ * notices recovery finishes, so we only have to maintain it for the
+ * local process until recovery ends.
*/
if (!RecoveryInProgress())
- {
- *pageTLI = ThisTimeLineID;
read_upto = GetFlushRecPtr();
- }
else
- read_upto = GetXLogReplayRecPtr(pageTLI);
+ read_upto = GetXLogReplayRecPtr(&ThisTimeLineID);
- if (loc <= read_upto)
- break;
+ *pageTLI = ThisTimeLineID;
+
+ /*
+ * Check which timeline to get the record from.
+ *
+ * We have to do it each time through the loop because if we're in
+ * recovery as a cascading standby, the current timeline might've
+ * become historical. We can't rely on RecoveryInProgress() because
+ * in a standby configuration like
+ *
+ * A => B => C
+ *
+ * if we're a logical decoding session on C, and B gets promoted, our
+ * timeline will change while we remain in recovery.
+ *
+ * We can't just keep reading from the old timeline as the last WAL
+ * archive in the timeline will get renamed to .partial by StartupXLOG().
+ *
+ * If that happens after our caller updated ThisTimeLineID but before
+ * we actually read the xlog page, we might still try to read from the
+ * old (now renamed) segment and fail. There's not much we can do about
+ * this, but it can only happen when we're a leaf of a cascading
+ * standby whose master gets promoted while we're decoding, so a
+ * one-off ERROR isn't too bad.
+ */
+ XLogReadDetermineTimeline(state, targetPagePtr, reqLen);
+
+ if (state->currTLI == ThisTimeLineID)
+ {
- CHECK_FOR_INTERRUPTS();
- pg_usleep(1000L);
+ if (loc <= read_upto)
+ break;
+
+ CHECK_FOR_INTERRUPTS();
+ pg_usleep(1000L);
+ }
+ else
+ {
+ /*
+ * We're on a historical timeline, so limit reading to the switch
+ * point where we moved to the next timeline.
+ *
+ * We don't need to GetFlushRecPtr or GetXLogReplayRecPtr. We know
+ * about the new timeline, so we must've already received WAL past the
+ * end of the historical one.
+ */
+ read_upto = state->currTLIValidUntil;
+
+ /*
+ * Setting pageTLI to our wanted record's TLI is slightly wrong;
+ * the page might begin on an older timeline if it contains a
+ * timeline switch, since its xlog segment will have been copied
+ * from the prior timeline. This is pretty harmless though, as
+ * nothing cares so long as the timeline doesn't go backwards. We
+ * should read the page header instead; FIXME someday.
+ */
+ *pageTLI = state->currTLI;
+
+ /* No need to wait on a historical timeline */
+ break;
+ }
}
if (targetPagePtr + XLOG_BLCKSZ <= read_upto)