diff options
Diffstat (limited to 'src/backend/access/transam')
-rw-r--r-- | src/backend/access/transam/clog.c | 40 | ||||
-rw-r--r-- | src/backend/access/transam/commit_ts.c | 36 | ||||
-rw-r--r-- | src/backend/access/transam/multixact.c | 57 | ||||
-rw-r--r-- | src/backend/access/transam/slru.c | 154 | ||||
-rw-r--r-- | src/backend/access/transam/subtrans.c | 25 | ||||
-rw-r--r-- | src/backend/access/transam/xlog.c | 28 |
6 files changed, 194 insertions, 146 deletions
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index 9e352d26583..034349aa7b9 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -42,6 +42,7 @@ #include "pg_trace.h" #include "pgstat.h" #include "storage/proc.h" +#include "storage/sync.h" /* * Defines for CLOG page sizes. A page is the same BLCKSZ as is used @@ -691,7 +692,8 @@ CLOGShmemInit(void) { XactCtl->PagePrecedes = CLOGPagePrecedes; SimpleLruInit(XactCtl, "Xact", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE, - XactSLRULock, "pg_xact", LWTRANCHE_XACT_BUFFER); + XactSLRULock, "pg_xact", LWTRANCHE_XACT_BUFFER, + SYNC_HANDLER_CLOG); } /* @@ -809,33 +811,18 @@ TrimCLOG(void) } /* - * This must be called ONCE during postmaster or standalone-backend shutdown - */ -void -ShutdownCLOG(void) -{ - /* Flush dirty CLOG pages to disk */ - TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(false); - SimpleLruFlush(XactCtl, false); - - /* - * fsync pg_xact to ensure that any files flushed previously are durably - * on disk. - */ - fsync_fname("pg_xact", true); - - TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(false); -} - -/* * Perform a checkpoint --- either during shutdown, or on-the-fly */ void CheckPointCLOG(void) { - /* Flush dirty CLOG pages to disk */ + /* + * Write dirty CLOG pages to disk. This may result in sync requests + * queued for later handling by ProcessSyncRequests(), as part of the + * checkpoint. + */ TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(true); - SimpleLruFlush(XactCtl, true); + SimpleLruWriteAll(XactCtl, true); TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true); } @@ -1026,3 +1013,12 @@ clog_redo(XLogReaderState *record) else elog(PANIC, "clog_redo: unknown op code %u", info); } + +/* + * Entrypoint for sync.c to sync clog files. + */ +int +clogsyncfiletag(const FileTag *ftag, char *path) +{ + return SlruSyncFileTag(XactCtl, ftag, path); +} diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c index f6a7329ba3a..cb8a9688018 100644 --- a/src/backend/access/transam/commit_ts.c +++ b/src/backend/access/transam/commit_ts.c @@ -555,7 +555,8 @@ CommitTsShmemInit(void) CommitTsCtl->PagePrecedes = CommitTsPagePrecedes; SimpleLruInit(CommitTsCtl, "CommitTs", CommitTsShmemBuffers(), 0, CommitTsSLRULock, "pg_commit_ts", - LWTRANCHE_COMMITTS_BUFFER); + LWTRANCHE_COMMITTS_BUFFER, + SYNC_HANDLER_COMMIT_TS); commitTsShared = ShmemInitStruct("CommitTs shared", sizeof(CommitTimestampShared), @@ -799,29 +800,17 @@ DeactivateCommitTs(void) } /* - * This must be called ONCE during postmaster or standalone-backend shutdown - */ -void -ShutdownCommitTs(void) -{ - /* Flush dirty CommitTs pages to disk */ - SimpleLruFlush(CommitTsCtl, false); - - /* - * fsync pg_commit_ts to ensure that any files flushed previously are - * durably on disk. - */ - fsync_fname("pg_commit_ts", true); -} - -/* * Perform a checkpoint --- either during shutdown, or on-the-fly */ void CheckPointCommitTs(void) { - /* Flush dirty CommitTs pages to disk */ - SimpleLruFlush(CommitTsCtl, true); + /* + * Write dirty CommitTs pages to disk. This may result in sync requests + * queued for later handling by ProcessSyncRequests(), as part of the + * checkpoint. + */ + SimpleLruWriteAll(CommitTsCtl, true); } /* @@ -1077,3 +1066,12 @@ commit_ts_redo(XLogReaderState *record) else elog(PANIC, "commit_ts_redo: unknown op code %u", info); } + +/* + * Entrypoint for sync.c to sync commit_ts files. + */ +int +committssyncfiletag(const FileTag *ftag, char *path) +{ + return SlruSyncFileTag(CommitTsCtl, ftag, path); +} diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index b8bedca04a4..a2ce617c8ce 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -1831,11 +1831,13 @@ MultiXactShmemInit(void) SimpleLruInit(MultiXactOffsetCtl, "MultiXactOffset", NUM_MULTIXACTOFFSET_BUFFERS, 0, MultiXactOffsetSLRULock, "pg_multixact/offsets", - LWTRANCHE_MULTIXACTOFFSET_BUFFER); + LWTRANCHE_MULTIXACTOFFSET_BUFFER, + SYNC_HANDLER_MULTIXACT_OFFSET); SimpleLruInit(MultiXactMemberCtl, "MultiXactMember", NUM_MULTIXACTMEMBER_BUFFERS, 0, MultiXactMemberSLRULock, "pg_multixact/members", - LWTRANCHE_MULTIXACTMEMBER_BUFFER); + LWTRANCHE_MULTIXACTMEMBER_BUFFER, + SYNC_HANDLER_MULTIXACT_MEMBER); /* Initialize our shared state struct */ MultiXactState = ShmemInitStruct("Shared MultiXact State", @@ -2101,19 +2103,6 @@ TrimMultiXact(void) } /* - * This must be called ONCE during postmaster or standalone-backend shutdown - */ -void -ShutdownMultiXact(void) -{ - /* Flush dirty MultiXact pages to disk */ - TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(false); - SimpleLruFlush(MultiXactOffsetCtl, false); - SimpleLruFlush(MultiXactMemberCtl, false); - TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(false); -} - -/* * Get the MultiXact data to save in a checkpoint record */ void @@ -2143,9 +2132,13 @@ CheckPointMultiXact(void) { TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(true); - /* Flush dirty MultiXact pages to disk */ - SimpleLruFlush(MultiXactOffsetCtl, true); - SimpleLruFlush(MultiXactMemberCtl, true); + /* + * Write dirty MultiXact pages to disk. This may result in sync requests + * queued for later handling by ProcessSyncRequests(), as part of the + * checkpoint. + */ + SimpleLruWriteAll(MultiXactOffsetCtl, true); + SimpleLruWriteAll(MultiXactMemberCtl, true); TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true); } @@ -2728,14 +2721,10 @@ find_multixact_start(MultiXactId multi, MultiXactOffset *result) entryno = MultiXactIdToOffsetEntry(multi); /* - * Flush out dirty data, so PhysicalPageExists can work correctly. - * SimpleLruFlush() is a pretty big hammer for that. Alternatively we - * could add an in-memory version of page exists, but find_multixact_start - * is called infrequently, and it doesn't seem bad to flush buffers to - * disk before truncation. + * Write out dirty data, so PhysicalPageExists can work correctly. */ - SimpleLruFlush(MultiXactOffsetCtl, true); - SimpleLruFlush(MultiXactMemberCtl, true); + SimpleLruWriteAll(MultiXactOffsetCtl, true); + SimpleLruWriteAll(MultiXactMemberCtl, true); if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno)) return false; @@ -3386,3 +3375,21 @@ pg_get_multixact_members(PG_FUNCTION_ARGS) SRF_RETURN_DONE(funccxt); } + +/* + * Entrypoint for sync.c to sync offsets files. + */ +int +multixactoffsetssyncfiletag(const FileTag *ftag, char *path) +{ + return SlruSyncFileTag(MultiXactOffsetCtl, ftag, path); +} + +/* + * Entrypoint for sync.c to sync members files. + */ +int +multixactmemberssyncfiletag(const FileTag *ftag, char *path) +{ + return SlruSyncFileTag(MultiXactMemberCtl, ftag, path); +} diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index fe7d759a8c1..16a78986971 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -63,22 +63,33 @@ snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir, seg) /* - * During SimpleLruFlush(), we will usually not need to write/fsync more - * than one or two physical files, but we may need to write several pages - * per file. We can consolidate the I/O requests by leaving files open - * until control returns to SimpleLruFlush(). This data structure remembers - * which files are open. + * During SimpleLruWriteAll(), we will usually not need to write more than one + * or two physical files, but we may need to write several pages per file. We + * can consolidate the I/O requests by leaving files open until control returns + * to SimpleLruWriteAll(). This data structure remembers which files are open. */ -#define MAX_FLUSH_BUFFERS 16 +#define MAX_WRITEALL_BUFFERS 16 -typedef struct SlruFlushData +typedef struct SlruWriteAllData { int num_files; /* # files actually open */ - int fd[MAX_FLUSH_BUFFERS]; /* their FD's */ - int segno[MAX_FLUSH_BUFFERS]; /* their log seg#s */ -} SlruFlushData; + int fd[MAX_WRITEALL_BUFFERS]; /* their FD's */ + int segno[MAX_WRITEALL_BUFFERS]; /* their log seg#s */ +} SlruWriteAllData; -typedef struct SlruFlushData *SlruFlush; +typedef struct SlruWriteAllData *SlruWriteAll; + +/* + * Populate a file tag describing a segment file. We only use the segment + * number, since we can derive everything else we need by having separate + * sync handler functions for clog, multixact etc. + */ +#define INIT_SLRUFILETAG(a,xx_handler,xx_segno) \ +( \ + memset(&(a), 0, sizeof(FileTag)), \ + (a).handler = (xx_handler), \ + (a).segno = (xx_segno) \ +) /* * Macro to mark a buffer slot "most recently used". Note multiple evaluation @@ -125,10 +136,10 @@ static int slru_errno; static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno); static void SimpleLruWaitIO(SlruCtl ctl, int slotno); -static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruFlush fdata); +static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata); static bool SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno); static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, - SlruFlush fdata); + SlruWriteAll fdata); static void SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid); static int SlruSelectLRUPage(SlruCtl ctl, int pageno); @@ -173,7 +184,8 @@ SimpleLruShmemSize(int nslots, int nlsns) */ void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, - LWLock *ctllock, const char *subdir, int tranche_id) + LWLock *ctllock, const char *subdir, int tranche_id, + SyncRequestHandler sync_handler) { SlruShared shared; bool found; @@ -251,7 +263,7 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, * assume caller set PagePrecedes. */ ctl->shared = shared; - ctl->do_fsync = true; /* default behavior */ + ctl->sync_handler = sync_handler; strlcpy(ctl->Dir, subdir, sizeof(ctl->Dir)); } @@ -523,7 +535,7 @@ SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, TransactionId xid) * Control lock must be held at entry, and will be held at exit. */ static void -SlruInternalWritePage(SlruCtl ctl, int slotno, SlruFlush fdata) +SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata) { SlruShared shared = ctl->shared; int pageno = shared->page_number[slotno]; @@ -587,6 +599,10 @@ SlruInternalWritePage(SlruCtl ctl, int slotno, SlruFlush fdata) /* Now it's okay to ereport if we failed */ if (!ok) SlruReportIOError(ctl, pageno, InvalidTransactionId); + + /* If part of a checkpoint, count this as a buffer written. */ + if (fdata) + CheckpointStats.ckpt_bufs_written++; } /* @@ -730,13 +746,13 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) * * For now, assume it's not worth keeping a file pointer open across * independent read/write operations. We do batch operations during - * SimpleLruFlush, though. + * SimpleLruWriteAll, though. * * fdata is NULL for a standalone write, pointer to open-file info during - * SimpleLruFlush. + * SimpleLruWriteAll. */ static bool -SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata) +SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruWriteAll fdata) { SlruShared shared = ctl->shared; int segno = pageno / SLRU_PAGES_PER_SEGMENT; @@ -791,7 +807,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata) } /* - * During a Flush, we may already have the desired file open. + * During a WriteAll, we may already have the desired file open. */ if (fdata) { @@ -837,7 +853,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata) if (fdata) { - if (fdata->num_files < MAX_FLUSH_BUFFERS) + if (fdata->num_files < MAX_WRITEALL_BUFFERS) { fdata->fd[fdata->num_files] = fd; fdata->segno[fdata->num_files] = segno; @@ -870,23 +886,31 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata) } pgstat_report_wait_end(); - /* - * If not part of Flush, need to fsync now. We assume this happens - * infrequently enough that it's not a performance issue. - */ - if (!fdata) + /* Queue up a sync request for the checkpointer. */ + if (ctl->sync_handler != SYNC_HANDLER_NONE) { - pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC); - if (ctl->do_fsync && pg_fsync(fd) != 0) + FileTag tag; + + INIT_SLRUFILETAG(tag, ctl->sync_handler, segno); + if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false)) { + /* No space to enqueue sync request. Do it synchronously. */ + pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC); + if (pg_fsync(fd) != 0) + { + pgstat_report_wait_end(); + slru_errcause = SLRU_FSYNC_FAILED; + slru_errno = errno; + CloseTransientFile(fd); + return false; + } pgstat_report_wait_end(); - slru_errcause = SLRU_FSYNC_FAILED; - slru_errno = errno; - CloseTransientFile(fd); - return false; } - pgstat_report_wait_end(); + } + /* Close file, unless part of flush request. */ + if (!fdata) + { if (CloseTransientFile(fd) != 0) { slru_errcause = SLRU_CLOSE_FAILED; @@ -1122,13 +1146,16 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno) } /* - * Flush dirty pages to disk during checkpoint or database shutdown + * Write dirty pages to disk during checkpoint or database shutdown. Flushing + * is deferred until the next call to ProcessSyncRequests(), though we do fsync + * the containing directory here to make sure that newly created directory + * entries are on disk. */ void -SimpleLruFlush(SlruCtl ctl, bool allow_redirtied) +SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied) { SlruShared shared = ctl->shared; - SlruFlushData fdata; + SlruWriteAllData fdata; int slotno; int pageno = 0; int i; @@ -1162,21 +1189,11 @@ SimpleLruFlush(SlruCtl ctl, bool allow_redirtied) LWLockRelease(shared->ControlLock); /* - * Now fsync and close any files that were open + * Now close any files that were open */ ok = true; for (i = 0; i < fdata.num_files; i++) { - pgstat_report_wait_start(WAIT_EVENT_SLRU_FLUSH_SYNC); - if (ctl->do_fsync && pg_fsync(fdata.fd[i]) != 0) - { - slru_errcause = SLRU_FSYNC_FAILED; - slru_errno = errno; - pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT; - ok = false; - } - pgstat_report_wait_end(); - if (CloseTransientFile(fdata.fd[i]) != 0) { slru_errcause = SLRU_CLOSE_FAILED; @@ -1189,7 +1206,7 @@ SimpleLruFlush(SlruCtl ctl, bool allow_redirtied) SlruReportIOError(ctl, pageno, InvalidTransactionId); /* Ensure that directory entries for new files are on disk. */ - if (ctl->do_fsync) + if (ctl->sync_handler != SYNC_HANDLER_NONE) fsync_fname(ctl->Dir, true); } @@ -1350,6 +1367,19 @@ restart: snprintf(path, MAXPGPATH, "%s/%04X", ctl->Dir, segno); ereport(DEBUG2, (errmsg("removing file \"%s\"", path))); + + /* + * Tell the checkpointer to forget any sync requests, before we unlink the + * file. + */ + if (ctl->sync_handler != SYNC_HANDLER_NONE) + { + FileTag tag; + + INIT_SLRUFILETAG(tag, ctl->sync_handler, segno); + RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true); + } + unlink(path); LWLockRelease(shared->ControlLock); @@ -1448,3 +1478,31 @@ SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data) return retval; } + +/* + * Individual SLRUs (clog, ...) have to provide a sync.c handler function so + * that they can provide the correct "SlruCtl" (otherwise we don't know how to + * build the path), but they just forward to this common implementation that + * performs the fsync. + */ +int +SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path) +{ + int fd; + int save_errno; + int result; + + SlruFileName(ctl, path, ftag->segno); + + fd = OpenTransientFile(path, O_RDWR | PG_BINARY); + if (fd < 0) + return -1; + + result = pg_fsync(fd); + save_errno = errno; + + CloseTransientFile(fd); + + errno = save_errno; + return result; +} diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c index a50f60b99af..0111e867c79 100644 --- a/src/backend/access/transam/subtrans.c +++ b/src/backend/access/transam/subtrans.c @@ -193,9 +193,7 @@ SUBTRANSShmemInit(void) SubTransCtl->PagePrecedes = SubTransPagePrecedes; SimpleLruInit(SubTransCtl, "Subtrans", NUM_SUBTRANS_BUFFERS, 0, SubtransSLRULock, "pg_subtrans", - LWTRANCHE_SUBTRANS_BUFFER); - /* Override default assumption that writes should be fsync'd */ - SubTransCtl->do_fsync = false; + LWTRANCHE_SUBTRANS_BUFFER, SYNC_HANDLER_NONE); } /* @@ -279,37 +277,20 @@ StartupSUBTRANS(TransactionId oldestActiveXID) } /* - * This must be called ONCE during postmaster or standalone-backend shutdown - */ -void -ShutdownSUBTRANS(void) -{ - /* - * Flush dirty SUBTRANS pages to disk - * - * This is not actually necessary from a correctness point of view. We do - * it merely as a debugging aid. - */ - TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_START(false); - SimpleLruFlush(SubTransCtl, false); - TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_DONE(false); -} - -/* * Perform a checkpoint --- either during shutdown, or on-the-fly */ void CheckPointSUBTRANS(void) { /* - * Flush dirty SUBTRANS pages to disk + * Write dirty SUBTRANS pages to disk * * This is not actually necessary from a correctness point of view. We do * it merely to improve the odds that writing of dirty pages is done by * the checkpoint process and not by backends. */ TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_START(true); - SimpleLruFlush(SubTransCtl, true); + SimpleLruWriteAll(SubTransCtl, true); TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_DONE(true); } diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 61754312e26..79a77ebbfe2 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -8528,10 +8528,6 @@ ShutdownXLOG(int code, Datum arg) CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); } - ShutdownCLOG(); - ShutdownCommitTs(); - ShutdownSUBTRANS(); - ShutdownMultiXact(); } /* @@ -9176,17 +9172,29 @@ CreateEndOfRecoveryRecord(void) static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags) { - CheckPointCLOG(); - CheckPointCommitTs(); - CheckPointSUBTRANS(); - CheckPointMultiXact(); - CheckPointPredicate(); CheckPointRelationMap(); CheckPointReplicationSlots(); CheckPointSnapBuild(); CheckPointLogicalRewriteHeap(); - CheckPointBuffers(flags); /* performs all required fsyncs */ CheckPointReplicationOrigin(); + + /* Write out all dirty data in SLRUs and the main buffer pool */ + TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags); + CheckpointStats.ckpt_write_t = GetCurrentTimestamp(); + CheckPointCLOG(); + CheckPointCommitTs(); + CheckPointSUBTRANS(); + CheckPointMultiXact(); + CheckPointPredicate(); + CheckPointBuffers(flags); + + /* Perform all queued up fsyncs */ + TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START(); + CheckpointStats.ckpt_sync_t = GetCurrentTimestamp(); + ProcessSyncRequests(); + CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp(); + TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE(); + /* We deliberately delay 2PC checkpointing as long as possible */ CheckPointTwoPhase(checkPointRedo); } |