author		Tom Lane <tgl@sss.pgh.pa.us>	2012-07-17 16:55:39 -0400
committer	Tom Lane <tgl@sss.pgh.pa.us>	2012-07-17 16:56:54 -0400
commit		73b796a52c50d6f44400c99eff1a01c89d08782f
tree		d7f4cfff90c2d4412f4db45adb15ba957a56255e	/src/backend/postmaster/checkpointer.c
parent		71f2dd23210f9607d1584fad89e0f8df9750e921
Improve coding around the fsync request queue.

In all branches back to 8.3, this patch fixes a questionable assumption in
CompactCheckpointerRequestQueue/CompactBgwriterRequestQueue that there are
no uninitialized pad bytes in the request queue structs. This would only
cause trouble if (a) there were such pad bytes, which could happen in 8.4
and up if the compiler makes enum ForkNumber narrower than 32 bits, but
otherwise would require not-currently-planned changes in the widths of
other typedefs; and (b) the kernel has not uniformly initialized the
contents of shared memory to zeroes. Still, it seems a tad risky, and we
can easily remove any risk by pre-zeroing the request array for ourselves.
In addition to that, we need to establish a coding rule that struct
RelFileNode can't contain any padding bytes, since such structs are copied
into the request array verbatim. (There are other places that are assuming
this anyway, it turns out.)
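
To make the padding hazard concrete, here is a minimal standalone C sketch (PaddedKey is an invented struct for illustration, not anything from the PostgreSQL tree). On typical ABIs a char followed by an int leaves three pad bytes whose contents are undefined, so two logically equal values can compare unequal bytewise; pre-zeroing the storage before assigning the fields, as this commit does for the shared request array, makes a memcmp- or hash-based duplicate check deterministic.

#include <stdio.h>
#include <string.h>

/* Illustrative struct only: char followed by int leaves pad bytes. */
typedef struct
{
	char	kind;
	int		id;
} PaddedKey;

int
main(void)
{
	PaddedKey	a;
	PaddedKey	b;

	/* Simulate stale memory, then assign only the named fields. */
	memset(&a, 0xAA, sizeof(a));
	memset(&b, 0x55, sizeof(b));
	a.kind = b.kind = 'r';
	a.id = b.id = 42;

	/* Logically equal, but a bytewise compare may disagree. */
	printf("dirty pad bytes: memcmp = %d\n", memcmp(&a, &b, sizeof(a)));

	/* The commit's remedy: zero the whole storage first, so pad bytes
	 * are deterministically zero in every copy of the struct. */
	memset(&a, 0, sizeof(a));
	memset(&b, 0, sizeof(b));
	a.kind = b.kind = 'r';
	a.id = b.id = 42;
	printf("pre-zeroed:      memcmp = %d\n", memcmp(&a, &b, sizeof(a)));

	return 0;
}
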
In 9.1 and up, the risk was a bit larger because we were also effectively
assuming that struct RelFileNodeBackend contained no pad bytes, and with
fields of different types in there, that would be much easier to break.
However, there is no good reason to ever transmit fsync or delete requests
for temp files to the bgwriter/checkpointer, so we can revert the request
structs to plain RelFileNode, getting rid of the padding risk and saving
some marginal number of bytes and cycles in fsync queue manipulation while
we are at it. The savings might be more than marginal during deletion of
a temp relation, because the old code transmitted an entirely useless but
nonetheless expensive-to-process ForgetRelationFsync request to the
background process, and also had the background process perform the file
deletion even though that can safely be done immediately.
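
As a hedged sketch of the resulting division of labor (every name below, FileTag, is_backend_local, handle_locally, forward_to_checkpointer, is an invented stand-in, not the actual md.c or checkpointer API): temp-relation requests are satisfied immediately in the owning backend, and only plain RelFileNode-style values ever enter the shared queue.

#include <stdbool.h>
#include <stdio.h>

/* Invented stand-ins for RelFileNode / RelFileNodeBackend. */
typedef struct { unsigned spcnode, dbnode, relnode; } FileTag;
typedef struct { FileTag node; int backend; } BackendFileTag;

#define INVALID_BACKEND_ID (-1)

static bool
is_backend_local(BackendFileTag tag)
{
	return tag.backend != INVALID_BACKEND_ID;
}

/* Stub standing in for an immediate local fsync or unlink. */
static void
handle_locally(FileTag node)
{
	printf("backend handles temp rel %u itself\n", node.relnode);
}

/* Stub standing in for queueing a plain-RelFileNode request. */
static void
forward_to_checkpointer(FileTag node)
{
	printf("forwarded rel %u to checkpointer\n", node.relnode);
}

static void
request_fsync(BackendFileTag tag)
{
	if (is_backend_local(tag))
		handle_locally(tag.node);	/* no round trip, no useless queue entry */
	else
		forward_to_checkpointer(tag.node);
}

int
main(void)
{
	BackendFileTag shared = {{1663, 1, 16384}, INVALID_BACKEND_ID};
	BackendFileTag temp = {{1663, 1, 16385}, 7};

	request_fsync(shared);
	request_fsync(temp);
	return 0;
}
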
In addition, make some cleanup of nearby comments and small improvements to
the code in CompactCheckpointerRequestQueue/CompactBgwriterRequestQueue.
Diffstat (limited to 'src/backend/postmaster/checkpointer.c')
-rw-r--r--	src/backend/postmaster/checkpointer.c	| 51
1 file changed, 36 insertions(+), 15 deletions(-)
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index 417e3bb0d1b..92fd4276cd1 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -105,7 +105,7 @@
  */
 typedef struct
 {
-	RelFileNodeBackend rnode;
+	RelFileNode rnode;
 	ForkNumber	forknum;
 	BlockNumber segno;			/* see md.c for special values */
 	/* might add a real request-type field later; not needed yet */
@@ -924,17 +924,22 @@ CheckpointerShmemSize(void)
 void
 CheckpointerShmemInit(void)
 {
+	Size		size = CheckpointerShmemSize();
 	bool		found;
 
 	CheckpointerShmem = (CheckpointerShmemStruct *)
 		ShmemInitStruct("Checkpointer Data",
-						CheckpointerShmemSize(),
+						size,
 						&found);
 
 	if (!found)
 	{
-		/* First time through, so initialize */
-		MemSet(CheckpointerShmem, 0, sizeof(CheckpointerShmemStruct));
+		/*
+		 * First time through, so initialize.  Note that we zero the whole
+		 * requests array; this is so that CompactCheckpointerRequestQueue
+		 * can assume that any pad bytes in the request structs are zeroes.
+		 */
+		MemSet(CheckpointerShmem, 0, size);
 		SpinLockInit(&CheckpointerShmem->ckpt_lck);
 		CheckpointerShmem->max_requests = NBuffers;
 	}
@@ -1091,11 +1096,15 @@ RequestCheckpoint(int flags)
  * Forward a file-fsync request from a backend to the checkpointer
  *
  * Whenever a backend is compelled to write directly to a relation
- * (which should be seldom, if the checkpointer is getting its job done),
+ * (which should be seldom, if the background writer is getting its job done),
  * the backend calls this routine to pass over knowledge that the relation
  * is dirty and must be fsync'd before next checkpoint.  We also use this
  * opportunity to count such writes for statistical purposes.
  *
+ * This functionality is only supported for regular (not backend-local)
+ * relations, so the rnode argument is intentionally RelFileNode not
+ * RelFileNodeBackend.
+ *
  * segno specifies which segment (not block!) of the relation needs to be
  * fsync'd.  (Since the valid range is much less than BlockNumber, we can
  * use high values for special flags; that's all internal to md.c, which
@@ -1112,8 +1121,7 @@ RequestCheckpoint(int flags)
  * let the backend know by returning false.
  */
 bool
-ForwardFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
-					BlockNumber segno)
+ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
 {
 	CheckpointerRequest *request;
 	bool		too_full;
@@ -1169,6 +1177,7 @@ ForwardFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
 /*
  * CompactCheckpointerRequestQueue
  *		Remove duplicates from the request queue to avoid backend fsyncs.
+ *		Returns "true" if any entries were removed.
  *
  * Although a full fsync request queue is not common, it can lead to severe
  * performance problems when it does happen.  So far, this situation has
@@ -1178,7 +1187,7 @@ ForwardFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
  * gets very expensive and can slow down the whole system.
  *
  * Trying to do this every time the queue is full could lose if there
- * aren't any removable entries.  But should be vanishingly rare in
+ * aren't any removable entries.  But that should be vanishingly rare in
  * practice: there's one queue entry per shared buffer.
  */
 static bool
@@ -1200,18 +1209,20 @@ CompactCheckpointerRequestQueue(void)
 	/* must hold CheckpointerCommLock in exclusive mode */
 	Assert(LWLockHeldByMe(CheckpointerCommLock));
 
+	/* Initialize skip_slot array */
+	skip_slot = palloc0(sizeof(bool) * CheckpointerShmem->num_requests);
+
 	/* Initialize temporary hash table */
 	MemSet(&ctl, 0, sizeof(ctl));
 	ctl.keysize = sizeof(CheckpointerRequest);
 	ctl.entrysize = sizeof(struct CheckpointerSlotMapping);
 	ctl.hash = tag_hash;
+	ctl.hcxt = CurrentMemoryContext;
+
 	htab = hash_create("CompactCheckpointerRequestQueue",
 					   CheckpointerShmem->num_requests,
 					   &ctl,
-					   HASH_ELEM | HASH_FUNCTION);
-
-	/* Initialize skip_slot array */
-	skip_slot = palloc0(sizeof(bool) * CheckpointerShmem->num_requests);
+					   HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
 
 	/*
 	 * The basic idea here is that a request can be skipped if it's followed
@@ -1226,19 +1237,28 @@ CompactCheckpointerRequestQueue(void)
 	 * anyhow), but it's not clear that the extra complexity would buy us
 	 * anything.
 	 */
-	for (n = 0; n < CheckpointerShmem->num_requests; ++n)
+	for (n = 0; n < CheckpointerShmem->num_requests; n++)
 	{
 		CheckpointerRequest *request;
 		struct CheckpointerSlotMapping *slotmap;
 		bool		found;
 
+		/*
+		 * We use the request struct directly as a hashtable key.  This
+		 * assumes that any padding bytes in the structs are consistently the
+		 * same, which should be okay because we zeroed them in
+		 * CheckpointerShmemInit.  Note also that RelFileNode had better
+		 * contain no pad bytes.
+		 */
 		request = &CheckpointerShmem->requests[n];
 		slotmap = hash_search(htab, request, HASH_ENTER, &found);
 		if (found)
 		{
+			/* Duplicate, so mark the previous occurrence as skippable */
 			skip_slot[slotmap->slot] = true;
-			++num_skipped;
+			num_skipped++;
 		}
+		/* Remember slot containing latest occurrence of this request value */
 		slotmap->slot = n;
 	}
 
@@ -1253,7 +1273,8 @@ CompactCheckpointerRequestQueue(void)
 	}
 
 	/* We found some duplicates; remove them. */
-	for (n = 0, preserve_count = 0; n < CheckpointerShmem->num_requests; ++n)
+	preserve_count = 0;
+	for (n = 0; n < CheckpointerShmem->num_requests; n++)
 	{
 		if (skip_slot[n])
 			continue;
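
To experiment with the compaction idea outside the server, here is a minimal standalone rendition. Request and its fields are stand-ins for the real structs, and a quadratic scan replaces the dynahash table the server uses; the keep-the-latest-occurrence logic and the skip_slot bookkeeping are the same.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Stand-in for CheckpointerRequest: four 4-byte fields, so there are no
 * pad bytes on common ABIs, mirroring the coding rule for RelFileNode. */
typedef struct
{
	unsigned int db;
	unsigned int rel;
	int			 forknum;
	unsigned int segno;
} Request;

int
main(void)
{
	Request requests[] = {
		{1, 100, 0, 0}, {1, 100, 0, 0}, {2, 200, 0, 3},
		{1, 100, 0, 0}, {2, 200, 0, 3}, {1, 101, 1, 0},
	};
	int		num = sizeof(requests) / sizeof(requests[0]);
	bool   *skip_slot = calloc(num, sizeof(bool));
	int		n, m, preserve_count;

	/*
	 * Mark every entry that is followed by a later identical entry, so the
	 * latest occurrence of each distinct request survives.  Comparing the
	 * structs bytewise is only safe because there are no pad bytes (or, in
	 * the server, because the request array was pre-zeroed).
	 */
	for (n = 0; n < num; n++)
		for (m = n + 1; m < num; m++)
			if (memcmp(&requests[n], &requests[m], sizeof(Request)) == 0)
			{
				skip_slot[n] = true;
				break;
			}

	/* Slide the surviving entries down, preserving their order. */
	preserve_count = 0;
	for (n = 0; n < num; n++)
	{
		if (skip_slot[n])
			continue;
		requests[preserve_count++] = requests[n];
	}

	printf("kept %d of %d requests\n", preserve_count, num);
	free(skip_slot);
	return 0;
}

Running it prints "kept 3 of 6 requests": the three earlier duplicates are dropped and the surviving entries keep their relative order, exactly the property the checkpointer relies on so that the final fsync for each segment still happens after the last write that queued it.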