diff options
Diffstat (limited to 'src/backend/access/transam/xlog.c')
-rw-r--r-- | src/backend/access/transam/xlog.c | 104 |
1 files changed, 76 insertions, 28 deletions
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 8d480f7ce24..94b79ac49d5 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -42,6 +42,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "postmaster/bgwriter.h" +#include "postmaster/walwriter.h" #include "postmaster/startup.h" #include "replication/basebackup.h" #include "replication/logical.h" @@ -2729,28 +2730,37 @@ XLogFlush(XLogRecPtr record) } /* - * Flush xlog, but without specifying exactly where to flush to. + * Write & flush xlog, but without specifying exactly where to. * - * We normally flush only completed blocks; but if there is nothing to do on - * that basis, we check for unflushed async commits in the current incomplete - * block, and flush through the latest one of those. Thus, if async commits - * are not being used, we will flush complete blocks only. We can guarantee - * that async commits reach disk after at most three cycles; normally only - * one or two. (When flushing complete blocks, we allow XLogWrite to write - * "flexibly", meaning it can stop at the end of the buffer ring; this makes a - * difference only with very high load or long wal_writer_delay, but imposes - * one extra cycle for the worst case for async commits.) + * We normally write only completed blocks; but if there is nothing to do on + * that basis, we check for unwritten async commits in the current incomplete + * block, and write through the latest one of those. Thus, if async commits + * are not being used, we will write complete blocks only. + * + * If, based on the above, there's anything to write we do so immediately. But + * to avoid calling fsync, fdatasync et. al. at a rate that'd impact + * concurrent IO, we only flush WAL every wal_writer_delay ms, or if there's + * more than wal_writer_flush_after unflushed blocks. + * + * We can guarantee that async commits reach disk after at most three + * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite + * to write "flexibly", meaning it can stop at the end of the buffer ring; + * this makes a difference only with very high load or long wal_writer_delay, + * but imposes one extra cycle for the worst case for async commits.) * * This routine is invoked periodically by the background walwriter process. * - * Returns TRUE if we flushed anything. + * Returns TRUE if there was any work to do, even if we skipped flushing due + * to wal_writer_delay/wal_flush_after. */ bool XLogBackgroundFlush(void) { - XLogRecPtr WriteRqstPtr; + XLogwrtRqst WriteRqst; bool flexible = true; - bool wrote_something = false; + static TimestampTz lastflush; + TimestampTz now; + int flushbytes; /* XLOG doesn't need flushing during recovery */ if (RecoveryInProgress()) @@ -2759,17 +2769,17 @@ XLogBackgroundFlush(void) /* read LogwrtResult and update local state */ SpinLockAcquire(&XLogCtl->info_lck); LogwrtResult = XLogCtl->LogwrtResult; - WriteRqstPtr = XLogCtl->LogwrtRqst.Write; + WriteRqst = XLogCtl->LogwrtRqst; SpinLockRelease(&XLogCtl->info_lck); /* back off to last completed page boundary */ - WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ; + WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ; /* if we have already flushed that far, consider async commit records */ - if (WriteRqstPtr <= LogwrtResult.Flush) + if (WriteRqst.Write <= LogwrtResult.Flush) { SpinLockAcquire(&XLogCtl->info_lck); - WriteRqstPtr = XLogCtl->asyncXactLSN; + WriteRqst.Write = XLogCtl->asyncXactLSN; SpinLockRelease(&XLogCtl->info_lck); flexible = false; /* ensure it all gets written */ } @@ -2779,7 +2789,7 @@ XLogBackgroundFlush(void) * holding an open file handle to a logfile that's no longer in use, * preventing the file from being deleted. */ - if (WriteRqstPtr <= LogwrtResult.Flush) + if (WriteRqst.Write <= LogwrtResult.Flush) { if (openLogFile >= 0) { @@ -2791,10 +2801,47 @@ XLogBackgroundFlush(void) return false; } + /* + * Determine how far to flush WAL, based on the wal_writer_delay and + * wal_writer_flush_after GUCs. + */ + now = GetCurrentTimestamp(); + flushbytes = + WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ; + + if (WalWriterFlushAfter == 0 || lastflush == 0) + { + /* first call, or block based limits disabled */ + WriteRqst.Flush = WriteRqst.Write; + lastflush = now; + } + else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay)) + { + /* + * Flush the writes at least every WalWriteDelay ms. This is important + * to bound the amount of time it takes for an asynchronous commit to + * hit disk. + */ + WriteRqst.Flush = WriteRqst.Write; + lastflush = now; + } + else if (flushbytes >= WalWriterFlushAfter) + { + /* exceeded wal_writer_flush_after blocks, flush */ + WriteRqst.Flush = WriteRqst.Write; + lastflush = now; + } + else + { + /* no flushing, this time round */ + WriteRqst.Flush = 0; + } + #ifdef WAL_DEBUG if (XLOG_DEBUG) - elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X", - (uint32) (WriteRqstPtr >> 32), (uint32) WriteRqstPtr, + elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X", + (uint32) (WriteRqst.Write >> 32), (uint32) WriteRqst.Write, + (uint32) (WriteRqst.Flush >> 32), (uint32) WriteRqst.Flush, (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write, (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush); #endif @@ -2802,17 +2849,13 @@ XLogBackgroundFlush(void) START_CRIT_SECTION(); /* now wait for any in-progress insertions to finish and get write lock */ - WaitXLogInsertionsToFinish(WriteRqstPtr); + WaitXLogInsertionsToFinish(WriteRqst.Write); LWLockAcquire(WALWriteLock, LW_EXCLUSIVE); LogwrtResult = XLogCtl->LogwrtResult; - if (WriteRqstPtr > LogwrtResult.Flush) + if (WriteRqst.Write > LogwrtResult.Write || + WriteRqst.Flush > LogwrtResult.Flush) { - XLogwrtRqst WriteRqst; - - WriteRqst.Write = WriteRqstPtr; - WriteRqst.Flush = WriteRqstPtr; XLogWrite(WriteRqst, flexible); - wrote_something = true; } LWLockRelease(WALWriteLock); @@ -2827,7 +2870,12 @@ XLogBackgroundFlush(void) */ AdvanceXLInsertBuffer(InvalidXLogRecPtr, true); - return wrote_something; + /* + * If we determined that we need to write data, but somebody else + * wrote/flushed already, it should be considered as being active, to + * avoid hibernating too early. + */ + return true; } /* |