diff options
Diffstat (limited to 'src/backend/access/transam/xlog.c')
-rw-r--r-- | src/backend/access/transam/xlog.c | 154 |
1 files changed, 149 insertions, 5 deletions
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index e51a7a749da..1388afdfb02 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -200,6 +200,15 @@ static XLogRecPtr flushedUpto = 0; static TimeLineID receiveTLI = 0; /* + * abortedRecPtr is the start pointer of a broken record at end of WAL when + * recovery completes; missingContrecPtr is the location of the first + * contrecord that went missing. See CreateOverwriteContrecordRecord for + * details. + */ +static XLogRecPtr abortedRecPtr; +static XLogRecPtr missingContrecPtr; + +/* * During recovery, lastFullPageWrites keeps track of full_page_writes that * the replayed WAL records indicate. It's initialized with full_page_writes * that the recovery starting checkpoint record indicates, and then updated @@ -892,8 +901,11 @@ static void CheckRequiredParameterValues(void); static void XLogReportParameters(void); static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI); +static void VerifyOverwriteContrecord(xl_overwrite_contrecord *xlrec, + XLogReaderState *state); static void LocalSetXLogInsertAllowed(void); static void CreateEndOfRecoveryRecord(void); +static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn); static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags); static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo); static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void); @@ -2247,6 +2259,18 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic) NewPage->xlp_info |= XLP_BKP_REMOVABLE; /* + * If a record was found to be broken at the end of recovery, and + * we're going to write on the page where its first contrecord was + * lost, set the XLP_FIRST_IS_OVERWRITE_CONTRECORD flag on the page + * header. See CreateOverwriteContrecordRecord(). + */ + if (missingContrecPtr == NewPageBeginPtr) + { + NewPage->xlp_info |= XLP_FIRST_IS_OVERWRITE_CONTRECORD; + missingContrecPtr = InvalidXLogRecPtr; + } + + /* * If first page of an XLOG segment file, make it a long header. */ if ((XLogSegmentOffset(NewPage->xlp_pageaddr, wal_segment_size)) == 0) @@ -4394,6 +4418,19 @@ ReadRecord(XLogReaderState *xlogreader, int emode, EndRecPtr = xlogreader->EndRecPtr; if (record == NULL) { + /* + * When not in standby mode we find that WAL ends in an incomplete + * record, keep track of that record. After recovery is done, + * we'll write a record to indicate downstream WAL readers that + * that portion is to be ignored. + */ + if (!StandbyMode && + !XLogRecPtrIsInvalid(xlogreader->abortedRecPtr)) + { + abortedRecPtr = xlogreader->abortedRecPtr; + missingContrecPtr = xlogreader->missingContrecPtr; + } + if (readFile >= 0) { close(readFile); @@ -7069,6 +7106,12 @@ StartupXLOG(void) InRecovery = true; } + /* + * Start recovery assuming that the final record isn't lost. + */ + abortedRecPtr = InvalidXLogRecPtr; + missingContrecPtr = InvalidXLogRecPtr; + /* REDO */ if (InRecovery) { @@ -7655,8 +7698,9 @@ StartupXLOG(void) /* * Kill WAL receiver, if it's still running, before we continue to write - * the startup checkpoint record. It will trump over the checkpoint and - * subsequent records if it's still alive when we start writing WAL. + * the startup checkpoint and aborted-contrecord records. It will trump + * over these records and subsequent ones if it's still alive when we + * start writing WAL. */ XLogShutdownWalRcv(); @@ -7689,8 +7733,12 @@ StartupXLOG(void) StandbyMode = false; /* - * Re-fetch the last valid or last applied record, so we can identify the - * exact endpoint of what we consider the valid portion of WAL. + * Determine where to start writing WAL next. + * + * When recovery ended in an incomplete record, write a WAL record about + * that and continue after it. In all other cases, re-fetch the last + * valid or last applied record, so we can identify the exact endpoint of + * what we consider the valid portion of WAL. */ XLogBeginRead(xlogreader, LastRec); record = ReadRecord(xlogreader, PANIC, false); @@ -7822,6 +7870,18 @@ StartupXLOG(void) XLogCtl->PrevTimeLineID = PrevTimeLineID; /* + * Actually, if WAL ended in an incomplete record, skip the parts that + * made it through and start writing after the portion that persisted. + * (It's critical to first write an OVERWRITE_CONTRECORD message, which + * we'll do as soon as we're open for writing new WAL.) + */ + if (!XLogRecPtrIsInvalid(missingContrecPtr)) + { + Assert(!XLogRecPtrIsInvalid(abortedRecPtr)); + EndOfLog = missingContrecPtr; + } + + /* * Prepare to write WAL starting at EndOfLog location, and init xlog * buffer cache using the block containing the last record from the * previous incarnation. @@ -7873,13 +7933,23 @@ StartupXLOG(void) XLogCtl->LogwrtRqst.Write = EndOfLog; XLogCtl->LogwrtRqst.Flush = EndOfLog; + LocalSetXLogInsertAllowed(); + + /* If necessary, write overwrite-contrecord before doing anything else */ + if (!XLogRecPtrIsInvalid(abortedRecPtr)) + { + Assert(!XLogRecPtrIsInvalid(missingContrecPtr)); + CreateOverwriteContrecordRecord(abortedRecPtr); + abortedRecPtr = InvalidXLogRecPtr; + missingContrecPtr = InvalidXLogRecPtr; + } + /* * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE * record before resource manager writes cleanup WAL records or checkpoint * record is written. */ Insert->fullPageWrites = lastFullPageWrites; - LocalSetXLogInsertAllowed(); UpdateFullPageWrites(); LocalXLogInsertAllowed = -1; @@ -9366,6 +9436,53 @@ CreateEndOfRecoveryRecord(void) } /* + * Write an OVERWRITE_CONTRECORD message. + * + * When on WAL replay we expect a continuation record at the start of a page + * that is not there, recovery ends and WAL writing resumes at that point. + * But it's wrong to resume writing new WAL back at the start of the record + * that was broken, because downstream consumers of that WAL (physical + * replicas) are not prepared to "rewind". So the first action after + * finishing replay of all valid WAL must be to write a record of this type + * at the point where the contrecord was missing; to support xlogreader + * detecting the special case, XLP_FIRST_IS_OVERWRITE_CONTRECORD is also added + * to the page header where the record occurs. xlogreader has an ad-hoc + * mechanism to report metadata about the broken record, which is what we + * use here. + * + * At replay time, XLP_FIRST_IS_OVERWRITE_CONTRECORD instructs xlogreader to + * skip the record it was reading, and pass back the LSN of the skipped + * record, so that its caller can verify (on "replay" of that record) that the + * XLOG_OVERWRITE_CONTRECORD matches what was effectively overwritten. + */ +static XLogRecPtr +CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn) +{ + xl_overwrite_contrecord xlrec; + XLogRecPtr recptr; + + /* sanity check */ + if (!RecoveryInProgress()) + elog(ERROR, "can only be used at end of recovery"); + + xlrec.overwritten_lsn = aborted_lsn; + xlrec.overwrite_time = GetCurrentTimestamp(); + + START_CRIT_SECTION(); + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xl_overwrite_contrecord)); + + recptr = XLogInsert(RM_XLOG_ID, XLOG_OVERWRITE_CONTRECORD); + + XLogFlush(recptr); + + END_CRIT_SECTION(); + + return recptr; +} + +/* * Flush all data in shared memory to disk, and fsync * * This is the common code shared between regular checkpoints and @@ -10295,6 +10412,13 @@ xlog_redo(XLogReaderState *record) RecoveryRestartPoint(&checkPoint); } + else if (info == XLOG_OVERWRITE_CONTRECORD) + { + xl_overwrite_contrecord xlrec; + + memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord)); + VerifyOverwriteContrecord(&xlrec, record); + } else if (info == XLOG_END_OF_RECOVERY) { xl_end_of_recovery xlrec; @@ -10462,6 +10586,26 @@ xlog_redo(XLogReaderState *record) } } +/* + * Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. + */ +static void +VerifyOverwriteContrecord(xl_overwrite_contrecord *xlrec, XLogReaderState *state) +{ + if (xlrec->overwritten_lsn != state->overwrittenRecPtr) + elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X", + LSN_FORMAT_ARGS(xlrec->overwritten_lsn), + LSN_FORMAT_ARGS(state->overwrittenRecPtr)); + + ereport(LOG, + (errmsg("sucessfully skipped missing contrecord at %X/%X, overwritten at %s", + LSN_FORMAT_ARGS(xlrec->overwritten_lsn), + timestamptz_to_str(xlrec->overwrite_time)))); + + /* Verifying the record should only happen once */ + state->overwrittenRecPtr = InvalidXLogRecPtr; +} + #ifdef WAL_DEBUG static void |