aboutsummaryrefslogtreecommitdiff
path: root/src/backend/access/transam/xlog.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/access/transam/xlog.c')
-rw-r--r--src/backend/access/transam/xlog.c154
1 files changed, 149 insertions, 5 deletions
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index e51a7a749da..1388afdfb02 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -200,6 +200,15 @@ static XLogRecPtr flushedUpto = 0;
static TimeLineID receiveTLI = 0;
/*
+ * abortedRecPtr is the start pointer of a broken record at end of WAL when
+ * recovery completes; missingContrecPtr is the location of the first
+ * contrecord that went missing. See CreateOverwriteContrecordRecord for
+ * details.
+ */
+static XLogRecPtr abortedRecPtr;
+static XLogRecPtr missingContrecPtr;
+
+/*
* During recovery, lastFullPageWrites keeps track of full_page_writes that
* the replayed WAL records indicate. It's initialized with full_page_writes
* that the recovery starting checkpoint record indicates, and then updated
@@ -892,8 +901,11 @@ static void CheckRequiredParameterValues(void);
static void XLogReportParameters(void);
static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
TimeLineID prevTLI);
+static void VerifyOverwriteContrecord(xl_overwrite_contrecord *xlrec,
+ XLogReaderState *state);
static void LocalSetXLogInsertAllowed(void);
static void CreateEndOfRecoveryRecord(void);
+static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn);
static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
@@ -2247,6 +2259,18 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
NewPage->xlp_info |= XLP_BKP_REMOVABLE;
/*
+ * If a record was found to be broken at the end of recovery, and
+ * we're going to write on the page where its first contrecord was
+ * lost, set the XLP_FIRST_IS_OVERWRITE_CONTRECORD flag on the page
+ * header. See CreateOverwriteContrecordRecord().
+ */
+ if (missingContrecPtr == NewPageBeginPtr)
+ {
+ NewPage->xlp_info |= XLP_FIRST_IS_OVERWRITE_CONTRECORD;
+ missingContrecPtr = InvalidXLogRecPtr;
+ }
+
+ /*
* If first page of an XLOG segment file, make it a long header.
*/
if ((XLogSegmentOffset(NewPage->xlp_pageaddr, wal_segment_size)) == 0)
@@ -4394,6 +4418,19 @@ ReadRecord(XLogReaderState *xlogreader, int emode,
EndRecPtr = xlogreader->EndRecPtr;
if (record == NULL)
{
+ /*
+ * When not in standby mode and we find that WAL ends in an incomplete
+ * record, keep track of that record. After recovery is done,
+ * we'll write a record to indicate to downstream WAL readers that
+ * that portion is to be ignored.
+ */
+ if (!StandbyMode &&
+ !XLogRecPtrIsInvalid(xlogreader->abortedRecPtr))
+ {
+ abortedRecPtr = xlogreader->abortedRecPtr;
+ missingContrecPtr = xlogreader->missingContrecPtr;
+ }
+
if (readFile >= 0)
{
close(readFile);
@@ -7069,6 +7106,12 @@ StartupXLOG(void)
InRecovery = true;
}
+ /*
+ * Start recovery assuming that the final record isn't lost.
+ */
+ abortedRecPtr = InvalidXLogRecPtr;
+ missingContrecPtr = InvalidXLogRecPtr;
+
/* REDO */
if (InRecovery)
{
@@ -7655,8 +7698,9 @@ StartupXLOG(void)
/*
* Kill WAL receiver, if it's still running, before we continue to write
- * the startup checkpoint record. It will trump over the checkpoint and
- * subsequent records if it's still alive when we start writing WAL.
+ * the startup checkpoint and aborted-contrecord records. It will trump
+ * over these records and subsequent ones if it's still alive when we
+ * start writing WAL.
*/
XLogShutdownWalRcv();
@@ -7689,8 +7733,12 @@ StartupXLOG(void)
StandbyMode = false;
/*
- * Re-fetch the last valid or last applied record, so we can identify the
- * exact endpoint of what we consider the valid portion of WAL.
+ * Determine where to start writing WAL next.
+ *
+ * When recovery ended in an incomplete record, write a WAL record about
+ * that and continue after it. In all other cases, re-fetch the last
+ * valid or last applied record, so we can identify the exact endpoint of
+ * what we consider the valid portion of WAL.
*/
XLogBeginRead(xlogreader, LastRec);
record = ReadRecord(xlogreader, PANIC, false);
@@ -7822,6 +7870,18 @@ StartupXLOG(void)
XLogCtl->PrevTimeLineID = PrevTimeLineID;
/*
+ * Actually, if WAL ended in an incomplete record, skip the parts that
+ * made it through and start writing after the portion that persisted.
+ * (It's critical to first write an OVERWRITE_CONTRECORD message, which
+ * we'll do as soon as we're open for writing new WAL.)
+ */
+ if (!XLogRecPtrIsInvalid(missingContrecPtr))
+ {
+ Assert(!XLogRecPtrIsInvalid(abortedRecPtr));
+ EndOfLog = missingContrecPtr;
+ }
+
+ /*
* Prepare to write WAL starting at EndOfLog location, and init xlog
* buffer cache using the block containing the last record from the
* previous incarnation.
@@ -7873,13 +7933,23 @@ StartupXLOG(void)
XLogCtl->LogwrtRqst.Write = EndOfLog;
XLogCtl->LogwrtRqst.Flush = EndOfLog;
+ LocalSetXLogInsertAllowed();
+
+ /* If necessary, write overwrite-contrecord before doing anything else */
+ if (!XLogRecPtrIsInvalid(abortedRecPtr))
+ {
+ Assert(!XLogRecPtrIsInvalid(missingContrecPtr));
+ CreateOverwriteContrecordRecord(abortedRecPtr);
+ abortedRecPtr = InvalidXLogRecPtr;
+ missingContrecPtr = InvalidXLogRecPtr;
+ }
+
/*
* Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
* record before resource manager writes cleanup WAL records or checkpoint
* record is written.
*/
Insert->fullPageWrites = lastFullPageWrites;
- LocalSetXLogInsertAllowed();
UpdateFullPageWrites();
LocalXLogInsertAllowed = -1;
@@ -9366,6 +9436,53 @@ CreateEndOfRecoveryRecord(void)
}
/*
+ * Write an OVERWRITE_CONTRECORD message.
+ *
+ * When on WAL replay we expect a continuation record at the start of a page
+ * that is not there, recovery ends and WAL writing resumes at that point.
+ * But it's wrong to resume writing new WAL back at the start of the record
+ * that was broken, because downstream consumers of that WAL (physical
+ * replicas) are not prepared to "rewind". So the first action after
+ * finishing replay of all valid WAL must be to write a record of this type
+ * at the point where the contrecord was missing; to support xlogreader
+ * detecting the special case, XLP_FIRST_IS_OVERWRITE_CONTRECORD is also added
+ * to the page header where the record occurs. xlogreader has an ad-hoc
+ * mechanism to report metadata about the broken record, which is what we
+ * use here.
+ *
+ * At replay time, XLP_FIRST_IS_OVERWRITE_CONTRECORD instructs xlogreader to
+ * skip the record it was reading, and pass back the LSN of the skipped
+ * record, so that its caller can verify (on "replay" of that record) that the
+ * XLOG_OVERWRITE_CONTRECORD matches what was effectively overwritten.
+ */
+static XLogRecPtr
+CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn)
+{
+ xl_overwrite_contrecord xlrec;
+ XLogRecPtr recptr;
+
+ /* sanity check: refuse to run unless recovery is still in progress */
+ if (!RecoveryInProgress())
+ elog(ERROR, "can only be used at end of recovery");
+
+ xlrec.overwritten_lsn = aborted_lsn; /* LSN passed in by the caller */
+ xlrec.overwrite_time = GetCurrentTimestamp(); /* reported at replay */
+
+ START_CRIT_SECTION();
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, sizeof(xl_overwrite_contrecord));
+
+ recptr = XLogInsert(RM_XLOG_ID, XLOG_OVERWRITE_CONTRECORD);
+
+ XLogFlush(recptr); /* flush before leaving the critical section */
+
+ END_CRIT_SECTION();
+
+ return recptr; /* the value XLogInsert returned for this record */
+}
+
+/*
* Flush all data in shared memory to disk, and fsync
*
* This is the common code shared between regular checkpoints and
@@ -10295,6 +10412,13 @@ xlog_redo(XLogReaderState *record)
RecoveryRestartPoint(&checkPoint);
}
+ else if (info == XLOG_OVERWRITE_CONTRECORD)
+ {
+ xl_overwrite_contrecord xlrec;
+
+ memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
+ VerifyOverwriteContrecord(&xlrec, record);
+ }
else if (info == XLOG_END_OF_RECOVERY)
{
xl_end_of_recovery xlrec;
@@ -10462,6 +10586,26 @@ xlog_redo(XLogReaderState *record)
}
}
+/*
+ * Verify an XLOG_OVERWRITE_CONTRECORD payload against the reader's state.
+ */
+static void
+VerifyOverwriteContrecord(xl_overwrite_contrecord *xlrec, XLogReaderState *state)
+{
+ if (xlrec->overwritten_lsn != state->overwrittenRecPtr)
+ elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X",
+ LSN_FORMAT_ARGS(xlrec->overwritten_lsn),
+ LSN_FORMAT_ARGS(state->overwrittenRecPtr));
+
+ ereport(LOG,
+ (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s",
+ LSN_FORMAT_ARGS(xlrec->overwritten_lsn),
+ timestamptz_to_str(xlrec->overwrite_time))));
+
+ /* Verifying the record should only happen once, so forget the LSN */
+ state->overwrittenRecPtr = InvalidXLogRecPtr;
+}
+
#ifdef WAL_DEBUG
static void