author | Andres Freund <andres@anarazel.de> | 2016-02-19 12:13:05 -0800 |
---|---|---|
committer | Andres Freund <andres@anarazel.de> | 2016-03-10 17:04:34 -0800 |
commit | 428b1d6b29ca599c5700d4bc4f4ce4c5880369bf (patch) | |
tree | 55a957b82069476471bc9966ce81a9ffec8f4ba5 /src/backend/storage/buffer/bufmgr.c | |
parent | c82c92b111b7b636e80f8a432de10c62011b35b6 (diff) | |
Allow triggering kernel writeback after a configurable number of writes.
Currently, writes to the main data files of postgres all go through the
OS page cache. This means that some operating systems can end up
collecting a large number of dirty buffers in their respective page
caches. When these dirty buffers are flushed to storage rapidly, be it
because of fsync(), timeouts, or dirty ratios, latency for other reads
and writes can increase massively. This is the primary cause of the
regular, massive stalls observed in real-world scenarios and artificial
benchmarks; on rotating disks, stalls on the order of hundreds of
seconds have been observed.
On Linux it is possible to control this by significantly reducing the
global dirty limits, which mitigates the above problem. But global
configuration is rather problematic because it affects other
applications as well; and PostgreSQL itself doesn't always want this
behavior, e.g. for temporary files it's undesirable.
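For illustration only (not part of this commit), the "global dirty limits" referred to above are Linux's vm.dirty_* sysctls; lowering them machine-wide looks roughly like the sketch below, and affects every process on the host, which is exactly the side effect the paragraph above argues against. The byte values are arbitrary examples.

```c
/*
 * Illustration only, not part of the patch: lower Linux's global dirty
 * limits by writing to the vm.* sysctls under /proc.  This is a
 * machine-wide setting and usually requires root.
 */
#include <stdio.h>

static int
set_vm_sysctl(const char *name, long value)
{
	char		path[128];
	FILE	   *fp;

	snprintf(path, sizeof(path), "/proc/sys/vm/%s", name);
	fp = fopen(path, "w");
	if (fp == NULL)
		return -1;				/* typically needs root privileges */
	fprintf(fp, "%ld\n", value);
	return fclose(fp);
}

int
main(void)
{
	/* example values: start background writeback at 32MB, force at 256MB */
	set_vm_sysctl("dirty_background_bytes", 32L * 1024 * 1024);
	set_vm_sysctl("dirty_bytes", 256L * 1024 * 1024);
	return 0;
}
```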
Several operating systems allow some control over the kernel page
cache. Linux has sync_file_range(2); several POSIX systems have msync(2)
and posix_fadvise(2). sync_file_range(2) is preferable because it
requires no special setup, whereas msync() requires the to-be-flushed
range to be mmap'ed. For the purpose of flushing dirty data,
posix_fadvise(2) is the worst alternative, as flushing dirty data is
just a side effect of POSIX_FADV_DONTNEED, which also removes the pages
from the page cache. Thus the feature is enabled by default only on
Linux, but it can be enabled on all systems that have any of the above
APIs.
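As a rough sketch of how these primitives differ in practice (hypothetical helper, not code from this patch — in the patch such requests are routed through smgrwriteback(), as visible in the diff below):

```c
/*
 * Sketch only: hint the kernel to start writing back a byte range of an
 * already-open file, without waiting for completion.  On Linux this maps
 * to sync_file_range(2); the fallback shown here, posix_fadvise(2) with
 * POSIX_FADV_DONTNEED, also evicts the pages from the page cache, which
 * is why the commit message calls it the worst alternative.
 */
#define _GNU_SOURCE
#include <fcntl.h>

int
hint_writeback(int fd, off_t offset, off_t nbytes)
{
#if defined(__linux__)
	/* start writeback of the range, do not wait, keep the pages cached */
	return sync_file_range(fd, offset, nbytes, SYNC_FILE_RANGE_WRITE);
#elif defined(POSIX_FADV_DONTNEED)
	/* flushes dirty data as a side effect, but drops the pages too */
	return posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
#else
	(void) fd;
	(void) offset;
	(void) nbytes;
	return 0;					/* no writeback control available */
#endif
}
```

msync(2) is omitted from the sketch because, as noted above, it only works on ranges that are already mmap'ed.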
While desirable and likely possible, this patch does not contain an
implementation for Windows.
With the infrastructure added by this patch, writes made via the
checkpointer, the bgwriter and normal user backends can be flushed after
a configurable number of writes. Each of these sources of writes is
controlled by a separate GUC, checkpoint_flush_after,
bgwriter_flush_after and backend_flush_after respectively; they're
separate because the number of writes after which flushing pays off
differs between them, and because the performance considerations of
controlled flushing differ for each.
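A simplified sketch of the pattern these GUCs enable (hypothetical names, not the patch's code — the real mechanism is the WritebackContext machinery in the diff below): each written block is remembered in a small array and, once the configured limit is reached, writeback hints are issued for the whole batch. The limit is read through a pointer so that a changed GUC value takes effect without re-initialization, mirroring what WritebackContextInit() documents.

```c
/*
 * Simplified sketch, not the patch's code: accumulate written blocks and
 * issue writeback hints once a configurable number of writes has been
 * reached.  A limit <= 0 disables writeback control for this source.
 */
#define SKETCH_MAX_PENDING 256

typedef struct
{
	int			fd;
	long		blockno;
} SketchPendingWrite;

typedef struct
{
	int		   *flush_after;	/* points at the GUC variable */
	int			nr_pending;
	SketchPendingWrite pending[SKETCH_MAX_PENDING];
} SketchWritebackState;

/* stand-in for the real path, which ends in smgrwriteback() */
static void
sketch_issue_hints(SketchPendingWrite *writes, int n)
{
	(void) writes;
	(void) n;
}

static void
sketch_remember_write(SketchWritebackState *state, int fd, long blockno)
{
	if (*state->flush_after <= 0)
		return;					/* flushing disabled for this source */

	if (state->nr_pending < SKETCH_MAX_PENDING)
	{
		state->pending[state->nr_pending].fd = fd;
		state->pending[state->nr_pending].blockno = blockno;
		state->nr_pending++;
	}

	/* once the configured number of writes is reached, flush the batch */
	if (state->nr_pending >= *state->flush_after ||
		state->nr_pending >= SKETCH_MAX_PENDING)
	{
		sketch_issue_hints(state->pending, state->nr_pending);
		state->nr_pending = 0;
	}
}
```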
A later patch will add checkpoint sorting; after that, flushes from the
checkpointer will almost always be desirable. Bgwriter flushes will most
of the time be random writes, which are slow on a lot of storage
hardware. Flushing in backends works well if the storage and the
bgwriter can keep up, but if they cannot it can have negative
consequences. This patch is likely to have negative performance
consequences without checkpoint sorting, but unfortunately so does
sorting without flush control.
Discussion: alpine.DEB.2.10.1506011320000.28433@sto
Author: Fabien Coelho and Andres Freund
Diffstat (limited to 'src/backend/storage/buffer/bufmgr.c')
-rw-r--r-- | src/backend/storage/buffer/bufmgr.c | 193 |
1 file changed, 188 insertions(+), 5 deletions(-)
```diff
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index e8e0825eb0c..5b9192ed450 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -83,6 +83,14 @@ bool track_io_timing = false;
 int effective_io_concurrency = 0;
 
 /*
+ * GUC variables about triggering kernel writeback for buffers written; OS
+ * dependant defaults are set via the GUC mechanism.
+ */
+int checkpoint_flush_after = 0;
+int bgwriter_flush_after = 0;
+int backend_flush_after = 0;
+
+/*
  * How many buffers PrefetchBuffer callers should try to stay ahead of their
  * ReadBuffer calls by. This is maintained by the assign hook for
  * effective_io_concurrency. Zero means "never prefetch". This value is
@@ -399,7 +407,7 @@ static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
 static void PinBuffer_Locked(BufferDesc *buf);
 static void UnpinBuffer(BufferDesc *buf, bool fixOwner);
 static void BufferSync(int flags);
-static int SyncOneBuffer(int buf_id, bool skip_recently_used);
+static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *flush_context);
 static void WaitIO(BufferDesc *buf);
 static bool StartBufferIO(BufferDesc *buf, bool forInput);
 static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
@@ -416,6 +424,7 @@ static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
 static void AtProcExit_Buffers(int code, Datum arg);
 static void CheckForBufferLeaks(void);
 static int rnode_comparator(const void *p1, const void *p2);
+static int buffertag_comparator(const void *p1, const void *p2);
 
 
 /*
@@ -818,6 +827,13 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 			MemSet((char *) bufBlock, 0, BLCKSZ);
 			/* don't set checksum for all-zero page */
 			smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
+
+			/*
+			 * NB: we're *not* doing a ScheduleBufferTagForWriteback here;
+			 * although we're essentially performing a write. At least on linux
+			 * doing so defeats the 'delayed allocation' mechanism, leading to
+			 * increased file fragmentation.
+			 */
 		}
 		else
 		{
@@ -1084,6 +1100,9 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 			FlushBuffer(buf, NULL);
 			LWLockRelease(BufferDescriptorGetContentLock(buf));
 
+			ScheduleBufferTagForWriteback(&BackendWritebackContext,
+										  &buf->tag);
+
 			TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
 											   smgr->smgr_rnode.node.spcNode,
 												smgr->smgr_rnode.node.dbNode,
@@ -1642,6 +1661,7 @@ BufferSync(int flags)
 	int num_to_write;
 	int num_written;
 	int mask = BM_DIRTY;
+	WritebackContext wb_context;
 
 	/* Make sure we can handle the pin inside SyncOneBuffer */
 	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
@@ -1694,6 +1714,8 @@ BufferSync(int flags)
 	if (num_to_write == 0)
 		return; /* nothing to do */
 
+	WritebackContextInit(&wb_context, &checkpoint_flush_after);
+
 	TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_write);
 
 	/*
@@ -1725,7 +1747,7 @@ BufferSync(int flags)
 		 */
 		if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
 		{
-			if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN)
+			if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
 			{
 				TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
 				BgWriterStats.m_buf_written_checkpoints++;
@@ -1756,6 +1778,9 @@ BufferSync(int flags)
 			buf_id = 0;
 	}
 
+	/* issue all pending flushes */
+	IssuePendingWritebacks(&wb_context);
+
 	/*
 	 * Update checkpoint statistics. As noted above, this doesn't include
 	 * buffers written by other backends or bgwriter scan.
@@ -1777,7 +1802,7 @@
  * bgwriter_lru_maxpages to 0.)
  */
 bool
-BgBufferSync(void)
+BgBufferSync(WritebackContext *wb_context)
 {
 	/* info obtained from freelist.c */
 	int strategy_buf_id;
@@ -2002,7 +2027,8 @@ BgBufferSync(void)
 	/* Execute the LRU scan */
 	while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
 	{
-		int buffer_state = SyncOneBuffer(next_to_clean, true);
+		int buffer_state = SyncOneBuffer(next_to_clean, true,
+										 wb_context);
 
 		if (++next_to_clean >= NBuffers)
 		{
@@ -2079,10 +2105,11 @@ BgBufferSync(void)
  * Note: caller must have done ResourceOwnerEnlargeBuffers.
  */
 static int
-SyncOneBuffer(int buf_id, bool skip_recently_used)
+SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
 {
 	BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
 	int result = 0;
+	BufferTag tag;
 
 	ReservePrivateRefCountEntry();
 
@@ -2123,8 +2150,13 @@ SyncOneBuffer(int buf_id, bool skip_recently_used)
 	FlushBuffer(bufHdr, NULL);
 
 	LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
+
+	tag = bufHdr->tag;
+
 	UnpinBuffer(bufHdr, true);
 
+	ScheduleBufferTagForWriteback(wb_context, &tag);
+
 	return result | BUF_WRITTEN;
 }
 
@@ -3729,3 +3761,154 @@ rnode_comparator(const void *p1, const void *p2)
 	else
 		return 0;
 }
+
+/*
+ * BufferTag comparator.
+ */
+static int
+buffertag_comparator(const void *a, const void *b)
+{
+	const BufferTag *ba = (const BufferTag *) a;
+	const BufferTag *bb = (const BufferTag *) b;
+	int ret;
+
+	ret = rnode_comparator(&ba->rnode, &bb->rnode);
+
+	if (ret != 0)
+		return ret;
+
+	if (ba->forkNum < bb->forkNum)
+		return -1;
+	if (ba->forkNum > bb->forkNum)
+		return 1;
+
+	if (ba->blockNum < bb->blockNum)
+		return -1;
+	if (ba->blockNum > bb->blockNum)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Initialize a writeback context, discarding potential previous state.
+ *
+ * *max_coalesce is a pointer to a variable containing the current maximum
+ * number of writeback requests that will be coalesced into a bigger one. A
+ * value <= 0 means that no writeback control will be performed. max_pending
+ * is a pointer instead of an immediate value, so the coalesce limits can
+ * easily changed by the GUC mechanism, and so calling code does not have to
+ * check the current configuration.
+ */
+void
+WritebackContextInit(WritebackContext *context, int *max_pending)
+{
+	Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
+
+	context->max_pending = max_pending;
+	context->nr_pending = 0;
+}
+
+/*
+ * Add buffer to list of pending writeback requests.
+ */
+void
+ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag)
+{
+	PendingWriteback *pending;
+
+	/*
+	 * Add buffer to the pending writeback array, unless writeback control is
+	 * disabled.
+	 */
+	if (*context->max_pending > 0)
+	{
+		Assert(*context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
+
+		pending = &context->pending_writebacks[context->nr_pending++];
+
+		pending->tag = *tag;
+	}
+
+	/*
+	 * Perform pending flushes if the writeback limit is exceeded. This
+	 * includes the case where previously an item has been added, but control
+	 * is now disabled.
+	 */
+	if (context->nr_pending >= *context->max_pending)
+		IssuePendingWritebacks(context);
+}
+
+/*
+ * Issue all pending writeback requests, previously scheduled with
+ * ScheduleBufferTagForWriteback, to the OS.
+ *
+ * Because this is only used to improve the OSs IO scheduling we try to never
+ * error out - it's just a hint.
+ */
+void
+IssuePendingWritebacks(WritebackContext *context)
+{
+	int i;
+
+	if (context->nr_pending == 0)
+		return;
+
+	/*
+	 * Executing the writes in-order can make them a lot faster, and allows to
+	 * merge writeback requests to consecutive blocks into larger writebacks.
+	 */
+	qsort(&context->pending_writebacks, context->nr_pending,
+		  sizeof(PendingWriteback), buffertag_comparator);
+
+	/*
+	 * Coalesce neighbouring writes, but nothing else. For that we iterate
+	 * through the, now sorted, array of pending flushes, and look forward to
+	 * find all neighbouring (or identical) writes.
+	 */
+	for (i = 0; i < context->nr_pending; i++)
+	{
+		PendingWriteback *cur;
+		PendingWriteback *next;
+		SMgrRelation reln;
+		int ahead;
+		BufferTag tag;
+		Size nblocks = 1;
+
+		cur = &context->pending_writebacks[i];
+		tag = cur->tag;
+
+		/*
+		 * Peek ahead, into following writeback requests, to see if they can
+		 * be combined with the current one.
+		 */
+		for (ahead = 0; i + ahead + 1 < context->nr_pending; ahead++)
+		{
+			next = &context->pending_writebacks[i + ahead + 1];
+
+			/* different file, stop */
+			if (!RelFileNodeEquals(cur->tag.rnode, next->tag.rnode) ||
+				cur->tag.forkNum != next->tag.forkNum)
+				break;
+
+			/* ok, block queued twice, skip */
+			if (cur->tag.blockNum == next->tag.blockNum)
+				continue;
+
+			/* only merge consecutive writes */
+			if (cur->tag.blockNum + 1 != next->tag.blockNum)
+				break;
+
+			nblocks++;
+			cur = next;
+		}
+
+		i += ahead;
+
+		/* and finally tell the kernel to write the data to storage */
+		reln = smgropen(tag.rnode, InvalidBackendId);
+		smgrwriteback(reln, tag.forkNum, tag.blockNum, nblocks);
+	}
+
+	context->nr_pending = 0;
+}
```