diff options
-rw-r--r-- | doc/src/sgml/config.sgml | 41 | ||||
-rw-r--r-- | src/backend/access/transam/README | 32 | ||||
-rw-r--r-- | src/backend/access/transam/xlog.c | 104 | ||||
-rw-r--r-- | src/backend/postmaster/walwriter.c | 1 | ||||
-rw-r--r-- | src/backend/utils/misc/guc.c | 13 | ||||
-rw-r--r-- | src/backend/utils/misc/postgresql.conf.sample | 1 | ||||
-rw-r--r-- | src/include/postmaster/walwriter.h | 1 |
7 files changed, 141 insertions, 52 deletions
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index de84b777300..a09ceb2fea7 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -2344,15 +2344,38 @@ include_dir 'conf.d' </indexterm> </term> <listitem> - <para> - Specifies the delay between activity rounds for the WAL writer. - In each round the writer will flush WAL to disk. It then sleeps for - <varname>wal_writer_delay</> milliseconds, and repeats. The default - value is 200 milliseconds (<literal>200ms</>). Note that on many - systems, the effective resolution of sleep delays is 10 milliseconds; - setting <varname>wal_writer_delay</> to a value that is not a multiple - of 10 might have the same results as setting it to the next higher - multiple of 10. This parameter can only be set in the + <para> + Specifies how often the WAL writer flushes WAL. After flushing WAL it + sleeps for <varname>wal_writer_delay</> milliseconds, unless woken up + by an asynchronously committing transaction. In case the last flush + happened less than <varname>wal_writer_delay</> milliseconds ago and + less than <varname>wal_writer_flush_after</> bytes of WAL have been + produced since, WAL is only written to the OS, not flushed to disk. + The default value is 200 milliseconds (<literal>200ms</>). Note that + on many systems, the effective resolution of sleep delays is 10 + milliseconds; setting <varname>wal_writer_delay</> to a value that is + not a multiple of 10 might have the same results as setting it to the + next higher multiple of 10. This parameter can only be set in the + <filename>postgresql.conf</> file or on the server command line. + </para> + </listitem> + </varlistentry> + + <varlistentry id="guc-wal-writer-flush-after" xreflabel="wal_writer_flush_after"> + <term><varname>wal_writer_flush_after</varname> (<type>integer</type>) + <indexterm> + <primary><varname>wal_writer_flush_after</> configuration parameter</primary> + </indexterm> + </term> + <listitem> + <para> + Specifies how often the WAL writer flushes WAL. In case the last flush + happened less than <varname>wal_writer_delay</> milliseconds ago and + less than <varname>wal_writer_flush_after</> bytes of WAL have been + produced since, WAL is only written to the OS, not flushed to disk. + If <varname>wal_writer_flush_after</> is set to <literal>0</> WAL is + flushed everytime the WAL writer has written WAL. The default is + <literal>1MB</literal>. This parameter can only be set in the <filename>postgresql.conf</> file or on the server command line. </para> </listitem> diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README index f6db580a0bf..2de048933e2 100644 --- a/src/backend/access/transam/README +++ b/src/backend/access/transam/README @@ -736,20 +736,24 @@ non-roll-backable side effects (such as filesystem changes) force sync commit to minimize the window in which the filesystem change has been made but the transaction isn't guaranteed committed. -Every wal_writer_delay milliseconds, the walwriter process performs an -XLogBackgroundFlush(). This checks the location of the last completely -filled WAL page. If that has moved forwards, then we write all the changed -buffers up to that point, so that under full load we write only whole -buffers. If there has been a break in activity and the current WAL page is -the same as before, then we find out the LSN of the most recent -asynchronous commit, and flush up to that point, if required (i.e., -if it's in the current WAL page). This arrangement in itself would -guarantee that an async commit record reaches disk during at worst the -second walwriter cycle after the transaction completes. However, we also -allow XLogFlush to flush full buffers "flexibly" (ie, not wrapping around -at the end of the circular WAL buffer area), so as to minimize the number -of writes issued under high load when multiple WAL pages are filled per -walwriter cycle. This makes the worst-case delay three walwriter cycles. +The walwriter regularly wakes up (via wal_writer_delay) or is woken up +(via its latch, which is set by backends committing asynchronously) and +performs an XLogBackgroundFlush(). This checks the location of the last +completely filled WAL page. If that has moved forwards, then we write all +the changed buffers up to that point, so that under full load we write +only whole buffers. If there has been a break in activity and the current +WAL page is the same as before, then we find out the LSN of the most +recent asynchronous commit, and write up to that point, if required (i.e. +if it's in the current WAL page). If more than wal_writer_delay has +passed, or more than wal_writer_flush_after blocks have been written, since +the last flush, WAL is also flushed up to the current location. This +arrangement in itself would guarantee that an async commit record reaches +disk after at most two times wal_writer_delay after the transaction +completes. However, we also allow XLogFlush to write/flush full buffers +"flexibly" (ie, not wrapping around at the end of the circular WAL buffer +area), so as to minimize the number of writes issued under high load when +multiple WAL pages are filled per walwriter cycle. This makes the worst-case +delay three wal_writer_delay cycles. There are some other subtle points to consider with asynchronous commits. First, for each page of CLOG we must remember the LSN of the latest commit diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 8d480f7ce24..94b79ac49d5 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -42,6 +42,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "postmaster/bgwriter.h" +#include "postmaster/walwriter.h" #include "postmaster/startup.h" #include "replication/basebackup.h" #include "replication/logical.h" @@ -2729,28 +2730,37 @@ XLogFlush(XLogRecPtr record) } /* - * Flush xlog, but without specifying exactly where to flush to. + * Write & flush xlog, but without specifying exactly where to. * - * We normally flush only completed blocks; but if there is nothing to do on - * that basis, we check for unflushed async commits in the current incomplete - * block, and flush through the latest one of those. Thus, if async commits - * are not being used, we will flush complete blocks only. We can guarantee - * that async commits reach disk after at most three cycles; normally only - * one or two. (When flushing complete blocks, we allow XLogWrite to write - * "flexibly", meaning it can stop at the end of the buffer ring; this makes a - * difference only with very high load or long wal_writer_delay, but imposes - * one extra cycle for the worst case for async commits.) + * We normally write only completed blocks; but if there is nothing to do on + * that basis, we check for unwritten async commits in the current incomplete + * block, and write through the latest one of those. Thus, if async commits + * are not being used, we will write complete blocks only. + * + * If, based on the above, there's anything to write we do so immediately. But + * to avoid calling fsync, fdatasync et. al. at a rate that'd impact + * concurrent IO, we only flush WAL every wal_writer_delay ms, or if there's + * more than wal_writer_flush_after unflushed blocks. + * + * We can guarantee that async commits reach disk after at most three + * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite + * to write "flexibly", meaning it can stop at the end of the buffer ring; + * this makes a difference only with very high load or long wal_writer_delay, + * but imposes one extra cycle for the worst case for async commits.) * * This routine is invoked periodically by the background walwriter process. * - * Returns TRUE if we flushed anything. + * Returns TRUE if there was any work to do, even if we skipped flushing due + * to wal_writer_delay/wal_flush_after. */ bool XLogBackgroundFlush(void) { - XLogRecPtr WriteRqstPtr; + XLogwrtRqst WriteRqst; bool flexible = true; - bool wrote_something = false; + static TimestampTz lastflush; + TimestampTz now; + int flushbytes; /* XLOG doesn't need flushing during recovery */ if (RecoveryInProgress()) @@ -2759,17 +2769,17 @@ XLogBackgroundFlush(void) /* read LogwrtResult and update local state */ SpinLockAcquire(&XLogCtl->info_lck); LogwrtResult = XLogCtl->LogwrtResult; - WriteRqstPtr = XLogCtl->LogwrtRqst.Write; + WriteRqst = XLogCtl->LogwrtRqst; SpinLockRelease(&XLogCtl->info_lck); /* back off to last completed page boundary */ - WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ; + WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ; /* if we have already flushed that far, consider async commit records */ - if (WriteRqstPtr <= LogwrtResult.Flush) + if (WriteRqst.Write <= LogwrtResult.Flush) { SpinLockAcquire(&XLogCtl->info_lck); - WriteRqstPtr = XLogCtl->asyncXactLSN; + WriteRqst.Write = XLogCtl->asyncXactLSN; SpinLockRelease(&XLogCtl->info_lck); flexible = false; /* ensure it all gets written */ } @@ -2779,7 +2789,7 @@ XLogBackgroundFlush(void) * holding an open file handle to a logfile that's no longer in use, * preventing the file from being deleted. */ - if (WriteRqstPtr <= LogwrtResult.Flush) + if (WriteRqst.Write <= LogwrtResult.Flush) { if (openLogFile >= 0) { @@ -2791,10 +2801,47 @@ XLogBackgroundFlush(void) return false; } + /* + * Determine how far to flush WAL, based on the wal_writer_delay and + * wal_writer_flush_after GUCs. + */ + now = GetCurrentTimestamp(); + flushbytes = + WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ; + + if (WalWriterFlushAfter == 0 || lastflush == 0) + { + /* first call, or block based limits disabled */ + WriteRqst.Flush = WriteRqst.Write; + lastflush = now; + } + else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay)) + { + /* + * Flush the writes at least every WalWriteDelay ms. This is important + * to bound the amount of time it takes for an asynchronous commit to + * hit disk. + */ + WriteRqst.Flush = WriteRqst.Write; + lastflush = now; + } + else if (flushbytes >= WalWriterFlushAfter) + { + /* exceeded wal_writer_flush_after blocks, flush */ + WriteRqst.Flush = WriteRqst.Write; + lastflush = now; + } + else + { + /* no flushing, this time round */ + WriteRqst.Flush = 0; + } + #ifdef WAL_DEBUG if (XLOG_DEBUG) - elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X", - (uint32) (WriteRqstPtr >> 32), (uint32) WriteRqstPtr, + elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X", + (uint32) (WriteRqst.Write >> 32), (uint32) WriteRqst.Write, + (uint32) (WriteRqst.Flush >> 32), (uint32) WriteRqst.Flush, (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write, (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush); #endif @@ -2802,17 +2849,13 @@ XLogBackgroundFlush(void) START_CRIT_SECTION(); /* now wait for any in-progress insertions to finish and get write lock */ - WaitXLogInsertionsToFinish(WriteRqstPtr); + WaitXLogInsertionsToFinish(WriteRqst.Write); LWLockAcquire(WALWriteLock, LW_EXCLUSIVE); LogwrtResult = XLogCtl->LogwrtResult; - if (WriteRqstPtr > LogwrtResult.Flush) + if (WriteRqst.Write > LogwrtResult.Write || + WriteRqst.Flush > LogwrtResult.Flush) { - XLogwrtRqst WriteRqst; - - WriteRqst.Write = WriteRqstPtr; - WriteRqst.Flush = WriteRqstPtr; XLogWrite(WriteRqst, flexible); - wrote_something = true; } LWLockRelease(WALWriteLock); @@ -2827,7 +2870,12 @@ XLogBackgroundFlush(void) */ AdvanceXLInsertBuffer(InvalidXLogRecPtr, true); - return wrote_something; + /* + * If we determined that we need to write data, but somebody else + * wrote/flushed already, it should be considered as being active, to + * avoid hibernating too early. + */ + return true; } /* diff --git a/src/backend/postmaster/walwriter.c b/src/backend/postmaster/walwriter.c index 243adb6dcd6..9852fed82d8 100644 --- a/src/backend/postmaster/walwriter.c +++ b/src/backend/postmaster/walwriter.c @@ -64,6 +64,7 @@ * GUC parameters */ int WalWriterDelay = 200; +int WalWriterFlushAfter = 128; /* * Number of do-nothing loops before lengthening the delay time, and the diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 31a69cac723..ea5a09ac1be 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2235,7 +2235,7 @@ static struct config_int ConfigureNamesInt[] = { {"wal_writer_delay", PGC_SIGHUP, WAL_SETTINGS, - gettext_noop("WAL writer sleep time between WAL flushes."), + gettext_noop("Time between WAL flushes performed in the WAL writer."), NULL, GUC_UNIT_MS }, @@ -2245,6 +2245,17 @@ static struct config_int ConfigureNamesInt[] = }, { + {"wal_writer_flush_after", PGC_SIGHUP, WAL_SETTINGS, + gettext_noop("Amount of WAL written out by WAL writer triggering a flush."), + NULL, + GUC_UNIT_XBLOCKS + }, + &WalWriterFlushAfter, + 128, 0, INT_MAX, + NULL, NULL, NULL + }, + + { /* see max_connections */ {"max_wal_senders", PGC_POSTMASTER, REPLICATION_SENDING, gettext_noop("Sets the maximum number of simultaneously running WAL sender processes."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 09b2003dbe2..ee3d37863ec 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -192,6 +192,7 @@ #wal_buffers = -1 # min 32kB, -1 sets based on shared_buffers # (change requires restart) #wal_writer_delay = 200ms # 1-10000 milliseconds +#wal_writer_flush_after = 1MB # 0 disables #commit_delay = 0 # range 0-100000, in microseconds #commit_siblings = 5 # range 1-1000 diff --git a/src/include/postmaster/walwriter.h b/src/include/postmaster/walwriter.h index d94cb97b262..49c5c1d016b 100644 --- a/src/include/postmaster/walwriter.h +++ b/src/include/postmaster/walwriter.h @@ -14,6 +14,7 @@ /* GUC options */ extern int WalWriterDelay; +extern int WalWriterFlushAfter; extern void WalWriterMain(void) pg_attribute_noreturn(); |