aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndres Freund <andres@anarazel.de>2016-02-15 23:52:38 +0100
committerAndres Freund <andres@anarazel.de>2016-02-16 00:56:34 +0100
commit7975c5e0a992ae9a45e03d145e0d37e2b5a707f5 (patch)
treefa09f94f77fd54482b7fbd3ae411534ac1d7a85d
parent5df44d14ba9fd3f6149c3fa0919745c9e24bcffe (diff)
downloadpostgresql-7975c5e0a992ae9a45e03d145e0d37e2b5a707f5.tar.gz
postgresql-7975c5e0a992ae9a45e03d145e0d37e2b5a707f5.zip
Allow the WAL writer to flush WAL at a reduced rate.
Commit 4de82f7d7 increased the WAL flush rate, mainly to increase the likelihood that hint bits can be set quickly. More quickly set hint bits can reduce contention around the clog et al. But unfortunately the increased flush rate can have a significant negative performance impact, I have measured up to a factor of ~4. The reason for this slowdown is that if there are independent writes to the underlying devices, for example because shared buffers is a lot smaller than the hot data set, or because a checkpoint is ongoing, the fdatasync() calls force cache flushes to be emitted to the storage. This is achieved by flushing WAL only if the last flush was longer than wal_writer_delay ago, or if more than wal_writer_flush_after (new GUC) unflushed blocks are pending. Based on some tests the default for wal_writer_delay is 1MB, which seems to work well both on SSD and rotational media. To avoid negative performance impact due to 4de82f7d7 an earlier commit (db76b1e) made SetHintBits() more likely to succeed; preventing performance regressions in the pgbench tests I performed. Discussion: 20160118163908.GW10941@awork2.anarazel.de
-rw-r--r--doc/src/sgml/config.sgml41
-rw-r--r--src/backend/access/transam/README32
-rw-r--r--src/backend/access/transam/xlog.c104
-rw-r--r--src/backend/postmaster/walwriter.c1
-rw-r--r--src/backend/utils/misc/guc.c13
-rw-r--r--src/backend/utils/misc/postgresql.conf.sample1
-rw-r--r--src/include/postmaster/walwriter.h1
7 files changed, 141 insertions, 52 deletions
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index de84b777300..a09ceb2fea7 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -2344,15 +2344,38 @@ include_dir 'conf.d'
</indexterm>
</term>
<listitem>
- <para>
- Specifies the delay between activity rounds for the WAL writer.
- In each round the writer will flush WAL to disk. It then sleeps for
- <varname>wal_writer_delay</> milliseconds, and repeats. The default
- value is 200 milliseconds (<literal>200ms</>). Note that on many
- systems, the effective resolution of sleep delays is 10 milliseconds;
- setting <varname>wal_writer_delay</> to a value that is not a multiple
- of 10 might have the same results as setting it to the next higher
- multiple of 10. This parameter can only be set in the
+ <para>
+ Specifies how often the WAL writer flushes WAL. After flushing WAL it
+ sleeps for <varname>wal_writer_delay</> milliseconds, unless woken up
+ by an asynchronously committing transaction. In case the last flush
+ happened less than <varname>wal_writer_delay</> milliseconds ago and
+ less than <varname>wal_writer_flush_after</> bytes of WAL have been
+ produced since, WAL is only written to the OS, not flushed to disk.
+ The default value is 200 milliseconds (<literal>200ms</>). Note that
+ on many systems, the effective resolution of sleep delays is 10
+ milliseconds; setting <varname>wal_writer_delay</> to a value that is
+ not a multiple of 10 might have the same results as setting it to the
+ next higher multiple of 10. This parameter can only be set in the
+ <filename>postgresql.conf</> file or on the server command line.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry id="guc-wal-writer-flush-after" xreflabel="wal_writer_flush_after">
+ <term><varname>wal_writer_flush_after</varname> (<type>integer</type>)
+ <indexterm>
+ <primary><varname>wal_writer_flush_after</> configuration parameter</primary>
+ </indexterm>
+ </term>
+ <listitem>
+ <para>
+ Specifies how often the WAL writer flushes WAL. In case the last flush
+ happened less than <varname>wal_writer_delay</> milliseconds ago and
+ less than <varname>wal_writer_flush_after</> bytes of WAL have been
+ produced since, WAL is only written to the OS, not flushed to disk.
+ If <varname>wal_writer_flush_after</> is set to <literal>0</> WAL is
+ flushed everytime the WAL writer has written WAL. The default is
+ <literal>1MB</literal>. This parameter can only be set in the
<filename>postgresql.conf</> file or on the server command line.
</para>
</listitem>
diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README
index f6db580a0bf..2de048933e2 100644
--- a/src/backend/access/transam/README
+++ b/src/backend/access/transam/README
@@ -736,20 +736,24 @@ non-roll-backable side effects (such as filesystem changes) force sync
commit to minimize the window in which the filesystem change has been made
but the transaction isn't guaranteed committed.
-Every wal_writer_delay milliseconds, the walwriter process performs an
-XLogBackgroundFlush(). This checks the location of the last completely
-filled WAL page. If that has moved forwards, then we write all the changed
-buffers up to that point, so that under full load we write only whole
-buffers. If there has been a break in activity and the current WAL page is
-the same as before, then we find out the LSN of the most recent
-asynchronous commit, and flush up to that point, if required (i.e.,
-if it's in the current WAL page). This arrangement in itself would
-guarantee that an async commit record reaches disk during at worst the
-second walwriter cycle after the transaction completes. However, we also
-allow XLogFlush to flush full buffers "flexibly" (ie, not wrapping around
-at the end of the circular WAL buffer area), so as to minimize the number
-of writes issued under high load when multiple WAL pages are filled per
-walwriter cycle. This makes the worst-case delay three walwriter cycles.
+The walwriter regularly wakes up (via wal_writer_delay) or is woken up
+(via its latch, which is set by backends committing asynchronously) and
+performs an XLogBackgroundFlush(). This checks the location of the last
+completely filled WAL page. If that has moved forwards, then we write all
+the changed buffers up to that point, so that under full load we write
+only whole buffers. If there has been a break in activity and the current
+WAL page is the same as before, then we find out the LSN of the most
+recent asynchronous commit, and write up to that point, if required (i.e.
+if it's in the current WAL page). If more than wal_writer_delay has
+passed, or more than wal_writer_flush_after blocks have been written, since
+the last flush, WAL is also flushed up to the current location. This
+arrangement in itself would guarantee that an async commit record reaches
+disk after at most two times wal_writer_delay after the transaction
+completes. However, we also allow XLogFlush to write/flush full buffers
+"flexibly" (ie, not wrapping around at the end of the circular WAL buffer
+area), so as to minimize the number of writes issued under high load when
+multiple WAL pages are filled per walwriter cycle. This makes the worst-case
+delay three wal_writer_delay cycles.
There are some other subtle points to consider with asynchronous commits.
First, for each page of CLOG we must remember the LSN of the latest commit
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 8d480f7ce24..94b79ac49d5 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -42,6 +42,7 @@
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
+#include "postmaster/walwriter.h"
#include "postmaster/startup.h"
#include "replication/basebackup.h"
#include "replication/logical.h"
@@ -2729,28 +2730,37 @@ XLogFlush(XLogRecPtr record)
}
/*
- * Flush xlog, but without specifying exactly where to flush to.
+ * Write & flush xlog, but without specifying exactly where to.
*
- * We normally flush only completed blocks; but if there is nothing to do on
- * that basis, we check for unflushed async commits in the current incomplete
- * block, and flush through the latest one of those. Thus, if async commits
- * are not being used, we will flush complete blocks only. We can guarantee
- * that async commits reach disk after at most three cycles; normally only
- * one or two. (When flushing complete blocks, we allow XLogWrite to write
- * "flexibly", meaning it can stop at the end of the buffer ring; this makes a
- * difference only with very high load or long wal_writer_delay, but imposes
- * one extra cycle for the worst case for async commits.)
+ * We normally write only completed blocks; but if there is nothing to do on
+ * that basis, we check for unwritten async commits in the current incomplete
+ * block, and write through the latest one of those. Thus, if async commits
+ * are not being used, we will write complete blocks only.
+ *
+ * If, based on the above, there's anything to write we do so immediately. But
+ * to avoid calling fsync, fdatasync et. al. at a rate that'd impact
+ * concurrent IO, we only flush WAL every wal_writer_delay ms, or if there's
+ * more than wal_writer_flush_after unflushed blocks.
+ *
+ * We can guarantee that async commits reach disk after at most three
+ * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite
+ * to write "flexibly", meaning it can stop at the end of the buffer ring;
+ * this makes a difference only with very high load or long wal_writer_delay,
+ * but imposes one extra cycle for the worst case for async commits.)
*
* This routine is invoked periodically by the background walwriter process.
*
- * Returns TRUE if we flushed anything.
+ * Returns TRUE if there was any work to do, even if we skipped flushing due
+ * to wal_writer_delay/wal_flush_after.
*/
bool
XLogBackgroundFlush(void)
{
- XLogRecPtr WriteRqstPtr;
+ XLogwrtRqst WriteRqst;
bool flexible = true;
- bool wrote_something = false;
+ static TimestampTz lastflush;
+ TimestampTz now;
+ int flushbytes;
/* XLOG doesn't need flushing during recovery */
if (RecoveryInProgress())
@@ -2759,17 +2769,17 @@ XLogBackgroundFlush(void)
/* read LogwrtResult and update local state */
SpinLockAcquire(&XLogCtl->info_lck);
LogwrtResult = XLogCtl->LogwrtResult;
- WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
+ WriteRqst = XLogCtl->LogwrtRqst;
SpinLockRelease(&XLogCtl->info_lck);
/* back off to last completed page boundary */
- WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
+ WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ;
/* if we have already flushed that far, consider async commit records */
- if (WriteRqstPtr <= LogwrtResult.Flush)
+ if (WriteRqst.Write <= LogwrtResult.Flush)
{
SpinLockAcquire(&XLogCtl->info_lck);
- WriteRqstPtr = XLogCtl->asyncXactLSN;
+ WriteRqst.Write = XLogCtl->asyncXactLSN;
SpinLockRelease(&XLogCtl->info_lck);
flexible = false; /* ensure it all gets written */
}
@@ -2779,7 +2789,7 @@ XLogBackgroundFlush(void)
* holding an open file handle to a logfile that's no longer in use,
* preventing the file from being deleted.
*/
- if (WriteRqstPtr <= LogwrtResult.Flush)
+ if (WriteRqst.Write <= LogwrtResult.Flush)
{
if (openLogFile >= 0)
{
@@ -2791,10 +2801,47 @@ XLogBackgroundFlush(void)
return false;
}
+ /*
+ * Determine how far to flush WAL, based on the wal_writer_delay and
+ * wal_writer_flush_after GUCs.
+ */
+ now = GetCurrentTimestamp();
+ flushbytes =
+ WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;
+
+ if (WalWriterFlushAfter == 0 || lastflush == 0)
+ {
+ /* first call, or block based limits disabled */
+ WriteRqst.Flush = WriteRqst.Write;
+ lastflush = now;
+ }
+ else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay))
+ {
+ /*
+ * Flush the writes at least every WalWriteDelay ms. This is important
+ * to bound the amount of time it takes for an asynchronous commit to
+ * hit disk.
+ */
+ WriteRqst.Flush = WriteRqst.Write;
+ lastflush = now;
+ }
+ else if (flushbytes >= WalWriterFlushAfter)
+ {
+ /* exceeded wal_writer_flush_after blocks, flush */
+ WriteRqst.Flush = WriteRqst.Write;
+ lastflush = now;
+ }
+ else
+ {
+ /* no flushing, this time round */
+ WriteRqst.Flush = 0;
+ }
+
#ifdef WAL_DEBUG
if (XLOG_DEBUG)
- elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
- (uint32) (WriteRqstPtr >> 32), (uint32) WriteRqstPtr,
+ elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X",
+ (uint32) (WriteRqst.Write >> 32), (uint32) WriteRqst.Write,
+ (uint32) (WriteRqst.Flush >> 32), (uint32) WriteRqst.Flush,
(uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
(uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
#endif
@@ -2802,17 +2849,13 @@ XLogBackgroundFlush(void)
START_CRIT_SECTION();
/* now wait for any in-progress insertions to finish and get write lock */
- WaitXLogInsertionsToFinish(WriteRqstPtr);
+ WaitXLogInsertionsToFinish(WriteRqst.Write);
LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
LogwrtResult = XLogCtl->LogwrtResult;
- if (WriteRqstPtr > LogwrtResult.Flush)
+ if (WriteRqst.Write > LogwrtResult.Write ||
+ WriteRqst.Flush > LogwrtResult.Flush)
{
- XLogwrtRqst WriteRqst;
-
- WriteRqst.Write = WriteRqstPtr;
- WriteRqst.Flush = WriteRqstPtr;
XLogWrite(WriteRqst, flexible);
- wrote_something = true;
}
LWLockRelease(WALWriteLock);
@@ -2827,7 +2870,12 @@ XLogBackgroundFlush(void)
*/
AdvanceXLInsertBuffer(InvalidXLogRecPtr, true);
- return wrote_something;
+ /*
+ * If we determined that we need to write data, but somebody else
+ * wrote/flushed already, it should be considered as being active, to
+ * avoid hibernating too early.
+ */
+ return true;
}
/*
diff --git a/src/backend/postmaster/walwriter.c b/src/backend/postmaster/walwriter.c
index 243adb6dcd6..9852fed82d8 100644
--- a/src/backend/postmaster/walwriter.c
+++ b/src/backend/postmaster/walwriter.c
@@ -64,6 +64,7 @@
* GUC parameters
*/
int WalWriterDelay = 200;
+int WalWriterFlushAfter = 128;
/*
* Number of do-nothing loops before lengthening the delay time, and the
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 31a69cac723..ea5a09ac1be 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -2235,7 +2235,7 @@ static struct config_int ConfigureNamesInt[] =
{
{"wal_writer_delay", PGC_SIGHUP, WAL_SETTINGS,
- gettext_noop("WAL writer sleep time between WAL flushes."),
+ gettext_noop("Time between WAL flushes performed in the WAL writer."),
NULL,
GUC_UNIT_MS
},
@@ -2245,6 +2245,17 @@ static struct config_int ConfigureNamesInt[] =
},
{
+ {"wal_writer_flush_after", PGC_SIGHUP, WAL_SETTINGS,
+ gettext_noop("Amount of WAL written out by WAL writer triggering a flush."),
+ NULL,
+ GUC_UNIT_XBLOCKS
+ },
+ &WalWriterFlushAfter,
+ 128, 0, INT_MAX,
+ NULL, NULL, NULL
+ },
+
+ {
/* see max_connections */
{"max_wal_senders", PGC_POSTMASTER, REPLICATION_SENDING,
gettext_noop("Sets the maximum number of simultaneously running WAL sender processes."),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 09b2003dbe2..ee3d37863ec 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -192,6 +192,7 @@
#wal_buffers = -1 # min 32kB, -1 sets based on shared_buffers
# (change requires restart)
#wal_writer_delay = 200ms # 1-10000 milliseconds
+#wal_writer_flush_after = 1MB # 0 disables
#commit_delay = 0 # range 0-100000, in microseconds
#commit_siblings = 5 # range 1-1000
diff --git a/src/include/postmaster/walwriter.h b/src/include/postmaster/walwriter.h
index d94cb97b262..49c5c1d016b 100644
--- a/src/include/postmaster/walwriter.h
+++ b/src/include/postmaster/walwriter.h
@@ -14,6 +14,7 @@
/* GUC options */
extern int WalWriterDelay;
+extern int WalWriterFlushAfter;
extern void WalWriterMain(void) pg_attribute_noreturn();