diff options
author | Simon Riggs <simon@2ndQuadrant.com> | 2013-03-22 13:54:07 +0000 |
---|---|---|
committer | Simon Riggs <simon@2ndQuadrant.com> | 2013-03-22 13:54:07 +0000 |
commit | 96ef3b8ff1cf1950e897fd2f766d4bd9ef0d5d56 (patch) | |
tree | 65849014627f4e211c6be8a4e9905b67694ed4ae /src/backend/access/transam/xlog.c | |
parent | e4a05c7512b23c8f48c186e685f2ef186374a20a (diff) | |
download | postgresql-96ef3b8ff1cf1950e897fd2f766d4bd9ef0d5d56.tar.gz postgresql-96ef3b8ff1cf1950e897fd2f766d4bd9ef0d5d56.zip |
Allow I/O reliability checks using 16-bit checksums
Checksums are set immediately prior to flush out of shared buffers
and checked when pages are read in again. Hint bit setting will
require full page write when block is dirtied, which causes various
infrastructure changes. Extensive comments, docs and README.
A WARNING message is raised if a checksum fails on a page that is not all zeroes;
an ERROR is then thrown, but this can be disabled with ignore_checksum_failure = on.
Feature enabled by an initdb option, since transition from option off
to option on is long and complex and has not yet been implemented.
Default is not to use checksums.
Checksum used is WAL CRC-32 truncated to 16-bits.
Simon Riggs, Jeff Davis, Greg Smith
Wide input and assistance from many community members. Thank you.
Diffstat (limited to 'src/backend/access/transam/xlog.c')
-rw-r--r-- | src/backend/access/transam/xlog.c | 111 |
1 files changed, 107 insertions, 4 deletions
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 7f9edef435c..07c68adf0bc 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -60,6 +60,7 @@ #include "utils/timestamp.h" #include "pg_trace.h" +extern bool bootstrap_data_checksums; /* File path names (all relative to $PGDATA) */ #define RECOVERY_COMMAND_FILE "recovery.conf" @@ -730,6 +731,7 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata) bool updrqst; bool doPageWrites; bool isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH); + bool isHint = (rmid == RM_XLOG_ID && info == XLOG_HINT); uint8 info_orig = info; static XLogRecord *rechdr; @@ -1000,6 +1002,18 @@ begin:; } /* + * If this is a hint record and we don't need a backup block then + * we have no more work to do and can exit quickly without inserting + * a WAL record at all. In that case return InvalidXLogRecPtr. + */ + if (isHint && !(info & XLR_BKP_BLOCK_MASK)) + { + LWLockRelease(WALInsertLock); + END_CRIT_SECTION(); + return InvalidXLogRecPtr; + } + + /* * If the current page is completely full, the record goes to the next * page, right after the page header. */ @@ -1253,10 +1267,10 @@ XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites, * not. We don't need the buffer header lock for PageGetLSN because we * have exclusive lock on the page and/or the relation. */ - *lsn = PageGetLSN(page); + *lsn = BufferGetLSNAtomic(rdata->buffer); if (doPageWrites && - PageGetLSN(page) <= RedoRecPtr) + *lsn <= RedoRecPtr) { /* * The page needs to be backed up, so set up *bkpb @@ -3187,6 +3201,11 @@ RestoreBackupBlock(XLogRecPtr lsn, XLogRecord *record, int block_index, BLCKSZ - (bkpb.hole_offset + bkpb.hole_length)); } + /* + * Any checksum set on this page will be invalid. We don't need + * to reset it here since it will be set before being written. 
+ */ + PageSetLSN(page, lsn); MarkBufferDirty(buffer); @@ -3767,6 +3786,16 @@ GetSystemIdentifier(void) } /* + * Are checksums enabled for data pages? + */ +bool +DataChecksumsEnabled(void) +{ + Assert(ControlFile != NULL); + return ControlFile->data_checksums; +} + +/* * Returns a fake LSN for unlogged relations. * * Each call generates an LSN that is greater than any previous value @@ -4092,6 +4121,7 @@ BootStrapXLOG(void) ControlFile->max_prepared_xacts = max_prepared_xacts; ControlFile->max_locks_per_xact = max_locks_per_xact; ControlFile->wal_level = wal_level; + ControlFile->data_checksums = bootstrap_data_checksums; /* some additional ControlFile fields are set in WriteControlFile() */ @@ -7602,6 +7632,51 @@ XLogRestorePoint(const char *rpName) } /* + * Write a backup block if needed when we are setting a hint. Note that + * this may be called for a variety of page types, not just heaps. + * + * Deciding the "if needed" part is delicate and requires us to either + * grab WALInsertLock or check the info_lck spinlock. If we check the + * spinlock and it says Yes then we will need to get WALInsertLock as well, + * so the design choice here is to just go straight for the WALInsertLock + * and trust that calls to this function are minimised elsewhere. + * + * Callable while holding just share lock on the buffer content. + * + * Possible that multiple concurrent backends could attempt to write + * WAL records. In that case, more than one backup block may be recorded + * though that isn't important to the outcome and the backup blocks are + * likely to be identical anyway. 
+ */ +#define XLOG_HINT_WATERMARK 13579 +XLogRecPtr +XLogSaveBufferForHint(Buffer buffer) +{ + /* + * Make an XLOG entry reporting the hint + */ + XLogRecData rdata[2]; + int watermark = XLOG_HINT_WATERMARK; + + /* + * Not allowed to have zero-length records, so use a small watermark + */ + rdata[0].data = (char *) (&watermark); + rdata[0].len = sizeof(int); + rdata[0].buffer = InvalidBuffer; + rdata[0].buffer_std = false; + rdata[0].next = &(rdata[1]); + + rdata[1].data = NULL; + rdata[1].len = 0; + rdata[1].buffer = buffer; + rdata[1].buffer_std = true; + rdata[1].next = NULL; + + return XLogInsert(RM_XLOG_ID, XLOG_HINT, rdata); +} + +/* * Check if any of the GUC parameters that are critical for hot standby * have changed, and update the value in pg_control file if necessary. */ @@ -7767,8 +7842,8 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record) { uint8 info = record->xl_info & ~XLR_INFO_MASK; - /* Backup blocks are not used in xlog records */ - Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); + /* Backup blocks are not used in most xlog records */ + Assert(info == XLOG_HINT || !(record->xl_info & XLR_BKP_BLOCK_MASK)); if (info == XLOG_NEXTOID) { @@ -7961,6 +8036,34 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record) { /* nothing to do here */ } + else if (info == XLOG_HINT) + { +#ifdef USE_ASSERT_CHECKING + int *watermark = (int *) XLogRecGetData(record); +#endif + + /* Check the watermark is correct for the hint record */ + Assert(*watermark == XLOG_HINT_WATERMARK); + + /* Backup blocks must be present for smgr hint records */ + Assert(record->xl_info & XLR_BKP_BLOCK_MASK); + + /* + * Hint records have no information that needs to be replayed. + * The sole purpose of them is to ensure that a hint bit does + * not cause a checksum invalidation if a hint bit write should + * cause a torn page. So the body of the record is empty but + * there must be one backup block. 
+ * + * Since the only change in the backup block is a hint bit, + * there is no conflict with Hot Standby. + * + * This also means there is no corresponding API call for this, + * so an smgr implementation has no need to implement anything. + * Which means nothing is needed in md.c etc + */ + RestoreBackupBlock(lsn, record, 0, false, false); + } else if (info == XLOG_BACKUP_END) { XLogRecPtr startpoint; |