aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--doc/src/sgml/config.sgml41
-rw-r--r--src/backend/access/transam/README32
-rw-r--r--src/backend/access/transam/xlog.c104
-rw-r--r--src/backend/postmaster/walwriter.c1
-rw-r--r--src/backend/utils/misc/guc.c13
-rw-r--r--src/backend/utils/misc/postgresql.conf.sample1
-rw-r--r--src/include/postmaster/walwriter.h1
7 files changed, 141 insertions, 52 deletions
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index de84b777300..a09ceb2fea7 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -2344,15 +2344,38 @@ include_dir 'conf.d'
</indexterm>
</term>
<listitem>
- <para>
- Specifies the delay between activity rounds for the WAL writer.
- In each round the writer will flush WAL to disk. It then sleeps for
- <varname>wal_writer_delay</> milliseconds, and repeats. The default
- value is 200 milliseconds (<literal>200ms</>). Note that on many
- systems, the effective resolution of sleep delays is 10 milliseconds;
- setting <varname>wal_writer_delay</> to a value that is not a multiple
- of 10 might have the same results as setting it to the next higher
- multiple of 10. This parameter can only be set in the
+ <para>
+ Specifies how often the WAL writer flushes WAL. After flushing WAL it
+ sleeps for <varname>wal_writer_delay</> milliseconds, unless woken up
+ by an asynchronously committing transaction. In case the last flush
+ happened less than <varname>wal_writer_delay</> milliseconds ago and
+ less than <varname>wal_writer_flush_after</> bytes of WAL have been
+ produced since, WAL is only written to the OS, not flushed to disk.
+ The default value is 200 milliseconds (<literal>200ms</>). Note that
+ on many systems, the effective resolution of sleep delays is 10
+ milliseconds; setting <varname>wal_writer_delay</> to a value that is
+ not a multiple of 10 might have the same results as setting it to the
+ next higher multiple of 10. This parameter can only be set in the
+ <filename>postgresql.conf</> file or on the server command line.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry id="guc-wal-writer-flush-after" xreflabel="wal_writer_flush_after">
+ <term><varname>wal_writer_flush_after</varname> (<type>integer</type>)
+ <indexterm>
+ <primary><varname>wal_writer_flush_after</> configuration parameter</primary>
+ </indexterm>
+ </term>
+ <listitem>
+ <para>
+ Specifies how often the WAL writer flushes WAL. In case the last flush
+ happened less than <varname>wal_writer_delay</> milliseconds ago and
+ less than <varname>wal_writer_flush_after</> bytes of WAL have been
+ produced since, WAL is only written to the OS, not flushed to disk.
+ If <varname>wal_writer_flush_after</> is set to <literal>0</> WAL is
+ flushed everytime the WAL writer has written WAL. The default is
+ <literal>1MB</literal>. This parameter can only be set in the
<filename>postgresql.conf</> file or on the server command line.
</para>
</listitem>
diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README
index f6db580a0bf..2de048933e2 100644
--- a/src/backend/access/transam/README
+++ b/src/backend/access/transam/README
@@ -736,20 +736,24 @@ non-roll-backable side effects (such as filesystem changes) force sync
commit to minimize the window in which the filesystem change has been made
but the transaction isn't guaranteed committed.
-Every wal_writer_delay milliseconds, the walwriter process performs an
-XLogBackgroundFlush(). This checks the location of the last completely
-filled WAL page. If that has moved forwards, then we write all the changed
-buffers up to that point, so that under full load we write only whole
-buffers. If there has been a break in activity and the current WAL page is
-the same as before, then we find out the LSN of the most recent
-asynchronous commit, and flush up to that point, if required (i.e.,
-if it's in the current WAL page). This arrangement in itself would
-guarantee that an async commit record reaches disk during at worst the
-second walwriter cycle after the transaction completes. However, we also
-allow XLogFlush to flush full buffers "flexibly" (ie, not wrapping around
-at the end of the circular WAL buffer area), so as to minimize the number
-of writes issued under high load when multiple WAL pages are filled per
-walwriter cycle. This makes the worst-case delay three walwriter cycles.
+The walwriter regularly wakes up (via wal_writer_delay) or is woken up
+(via its latch, which is set by backends committing asynchronously) and
+performs an XLogBackgroundFlush(). This checks the location of the last
+completely filled WAL page. If that has moved forwards, then we write all
+the changed buffers up to that point, so that under full load we write
+only whole buffers. If there has been a break in activity and the current
+WAL page is the same as before, then we find out the LSN of the most
+recent asynchronous commit, and write up to that point, if required (i.e.
+if it's in the current WAL page). If more than wal_writer_delay has
+passed, or more than wal_writer_flush_after blocks have been written, since
+the last flush, WAL is also flushed up to the current location. This
+arrangement in itself would guarantee that an async commit record reaches
+disk after at most two times wal_writer_delay after the transaction
+completes. However, we also allow XLogFlush to write/flush full buffers
+"flexibly" (ie, not wrapping around at the end of the circular WAL buffer
+area), so as to minimize the number of writes issued under high load when
+multiple WAL pages are filled per walwriter cycle. This makes the worst-case
+delay three wal_writer_delay cycles.
There are some other subtle points to consider with asynchronous commits.
First, for each page of CLOG we must remember the LSN of the latest commit
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 8d480f7ce24..94b79ac49d5 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -42,6 +42,7 @@
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
+#include "postmaster/walwriter.h"
#include "postmaster/startup.h"
#include "replication/basebackup.h"
#include "replication/logical.h"
@@ -2729,28 +2730,37 @@ XLogFlush(XLogRecPtr record)
}
/*
- * Flush xlog, but without specifying exactly where to flush to.
+ * Write & flush xlog, but without specifying exactly where to.
*
- * We normally flush only completed blocks; but if there is nothing to do on
- * that basis, we check for unflushed async commits in the current incomplete
- * block, and flush through the latest one of those. Thus, if async commits
- * are not being used, we will flush complete blocks only. We can guarantee
- * that async commits reach disk after at most three cycles; normally only
- * one or two. (When flushing complete blocks, we allow XLogWrite to write
- * "flexibly", meaning it can stop at the end of the buffer ring; this makes a
- * difference only with very high load or long wal_writer_delay, but imposes
- * one extra cycle for the worst case for async commits.)
+ * We normally write only completed blocks; but if there is nothing to do on
+ * that basis, we check for unwritten async commits in the current incomplete
+ * block, and write through the latest one of those. Thus, if async commits
+ * are not being used, we will write complete blocks only.
+ *
+ * If, based on the above, there's anything to write we do so immediately. But
+ * to avoid calling fsync, fdatasync et. al. at a rate that'd impact
+ * concurrent IO, we only flush WAL every wal_writer_delay ms, or if there's
+ * more than wal_writer_flush_after unflushed blocks.
+ *
+ * We can guarantee that async commits reach disk after at most three
+ * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite
+ * to write "flexibly", meaning it can stop at the end of the buffer ring;
+ * this makes a difference only with very high load or long wal_writer_delay,
+ * but imposes one extra cycle for the worst case for async commits.)
*
* This routine is invoked periodically by the background walwriter process.
*
- * Returns TRUE if we flushed anything.
+ * Returns TRUE if there was any work to do, even if we skipped flushing due
+ * to wal_writer_delay/wal_flush_after.
*/
bool
XLogBackgroundFlush(void)
{
- XLogRecPtr WriteRqstPtr;
+ XLogwrtRqst WriteRqst;
bool flexible = true;
- bool wrote_something = false;
+ static TimestampTz lastflush;
+ TimestampTz now;
+ int flushbytes;
/* XLOG doesn't need flushing during recovery */
if (RecoveryInProgress())
@@ -2759,17 +2769,17 @@ XLogBackgroundFlush(void)
/* read LogwrtResult and update local state */
SpinLockAcquire(&XLogCtl->info_lck);
LogwrtResult = XLogCtl->LogwrtResult;
- WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
+ WriteRqst = XLogCtl->LogwrtRqst;
SpinLockRelease(&XLogCtl->info_lck);
/* back off to last completed page boundary */
- WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
+ WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ;
/* if we have already flushed that far, consider async commit records */
- if (WriteRqstPtr <= LogwrtResult.Flush)
+ if (WriteRqst.Write <= LogwrtResult.Flush)
{
SpinLockAcquire(&XLogCtl->info_lck);
- WriteRqstPtr = XLogCtl->asyncXactLSN;
+ WriteRqst.Write = XLogCtl->asyncXactLSN;
SpinLockRelease(&XLogCtl->info_lck);
flexible = false; /* ensure it all gets written */
}
@@ -2779,7 +2789,7 @@ XLogBackgroundFlush(void)
* holding an open file handle to a logfile that's no longer in use,
* preventing the file from being deleted.
*/
- if (WriteRqstPtr <= LogwrtResult.Flush)
+ if (WriteRqst.Write <= LogwrtResult.Flush)
{
if (openLogFile >= 0)
{
@@ -2791,10 +2801,47 @@ XLogBackgroundFlush(void)
return false;
}
+ /*
+ * Determine how far to flush WAL, based on the wal_writer_delay and
+ * wal_writer_flush_after GUCs.
+ */
+ now = GetCurrentTimestamp();
+ flushbytes =
+ WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;
+
+ if (WalWriterFlushAfter == 0 || lastflush == 0)
+ {
+ /* first call, or block based limits disabled */
+ WriteRqst.Flush = WriteRqst.Write;
+ lastflush = now;
+ }
+ else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay))
+ {
+ /*
+ * Flush the writes at least every WalWriteDelay ms. This is important
+ * to bound the amount of time it takes for an asynchronous commit to
+ * hit disk.
+ */
+ WriteRqst.Flush = WriteRqst.Write;
+ lastflush = now;
+ }
+ else if (flushbytes >= WalWriterFlushAfter)
+ {
+ /* exceeded wal_writer_flush_after blocks, flush */
+ WriteRqst.Flush = WriteRqst.Write;
+ lastflush = now;
+ }
+ else
+ {
+ /* no flushing, this time round */
+ WriteRqst.Flush = 0;
+ }
+
#ifdef WAL_DEBUG
if (XLOG_DEBUG)
- elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
- (uint32) (WriteRqstPtr >> 32), (uint32) WriteRqstPtr,
+ elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X",
+ (uint32) (WriteRqst.Write >> 32), (uint32) WriteRqst.Write,
+ (uint32) (WriteRqst.Flush >> 32), (uint32) WriteRqst.Flush,
(uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
(uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
#endif
@@ -2802,17 +2849,13 @@ XLogBackgroundFlush(void)
START_CRIT_SECTION();
/* now wait for any in-progress insertions to finish and get write lock */
- WaitXLogInsertionsToFinish(WriteRqstPtr);
+ WaitXLogInsertionsToFinish(WriteRqst.Write);
LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
LogwrtResult = XLogCtl->LogwrtResult;
- if (WriteRqstPtr > LogwrtResult.Flush)
+ if (WriteRqst.Write > LogwrtResult.Write ||
+ WriteRqst.Flush > LogwrtResult.Flush)
{
- XLogwrtRqst WriteRqst;
-
- WriteRqst.Write = WriteRqstPtr;
- WriteRqst.Flush = WriteRqstPtr;
XLogWrite(WriteRqst, flexible);
- wrote_something = true;
}
LWLockRelease(WALWriteLock);
@@ -2827,7 +2870,12 @@ XLogBackgroundFlush(void)
*/
AdvanceXLInsertBuffer(InvalidXLogRecPtr, true);
- return wrote_something;
+ /*
+ * If we determined that we need to write data, but somebody else
+ * wrote/flushed already, it should be considered as being active, to
+ * avoid hibernating too early.
+ */
+ return true;
}
/*
diff --git a/src/backend/postmaster/walwriter.c b/src/backend/postmaster/walwriter.c
index 243adb6dcd6..9852fed82d8 100644
--- a/src/backend/postmaster/walwriter.c
+++ b/src/backend/postmaster/walwriter.c
@@ -64,6 +64,7 @@
* GUC parameters
*/
int WalWriterDelay = 200;
+int WalWriterFlushAfter = 128;
/*
* Number of do-nothing loops before lengthening the delay time, and the
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 31a69cac723..ea5a09ac1be 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -2235,7 +2235,7 @@ static struct config_int ConfigureNamesInt[] =
{
{"wal_writer_delay", PGC_SIGHUP, WAL_SETTINGS,
- gettext_noop("WAL writer sleep time between WAL flushes."),
+ gettext_noop("Time between WAL flushes performed in the WAL writer."),
NULL,
GUC_UNIT_MS
},
@@ -2245,6 +2245,17 @@ static struct config_int ConfigureNamesInt[] =
},
{
+ {"wal_writer_flush_after", PGC_SIGHUP, WAL_SETTINGS,
+ gettext_noop("Amount of WAL written out by WAL writer triggering a flush."),
+ NULL,
+ GUC_UNIT_XBLOCKS
+ },
+ &WalWriterFlushAfter,
+ 128, 0, INT_MAX,
+ NULL, NULL, NULL
+ },
+
+ {
/* see max_connections */
{"max_wal_senders", PGC_POSTMASTER, REPLICATION_SENDING,
gettext_noop("Sets the maximum number of simultaneously running WAL sender processes."),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 09b2003dbe2..ee3d37863ec 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -192,6 +192,7 @@
#wal_buffers = -1 # min 32kB, -1 sets based on shared_buffers
# (change requires restart)
#wal_writer_delay = 200ms # 1-10000 milliseconds
+#wal_writer_flush_after = 1MB # 0 disables
#commit_delay = 0 # range 0-100000, in microseconds
#commit_siblings = 5 # range 1-1000
diff --git a/src/include/postmaster/walwriter.h b/src/include/postmaster/walwriter.h
index d94cb97b262..49c5c1d016b 100644
--- a/src/include/postmaster/walwriter.h
+++ b/src/include/postmaster/walwriter.h
@@ -14,6 +14,7 @@
/* GUC options */
extern int WalWriterDelay;
+extern int WalWriterFlushAfter;
extern void WalWriterMain(void) pg_attribute_noreturn();