author     Tom Lane <tgl@sss.pgh.pa.us>  2012-07-17 16:55:39 -0400
committer  Tom Lane <tgl@sss.pgh.pa.us>  2012-07-17 16:56:54 -0400
commit     73b796a52c50d6f44400c99eff1a01c89d08782f (patch)
tree       d7f4cfff90c2d4412f4db45adb15ba957a56255e /src/backend/postmaster/checkpointer.c
parent     71f2dd23210f9607d1584fad89e0f8df9750e921 (diff)
Improve coding around the fsync request queue.
In all branches back to 8.3, this patch fixes a questionable assumption in CompactCheckpointerRequestQueue/CompactBgwriterRequestQueue that there are no uninitialized pad bytes in the request queue structs. This would only cause trouble if (a) there were such pad bytes, which could happen in 8.4 and up if the compiler makes enum ForkNumber narrower than 32 bits, but otherwise would require not-currently-planned changes in the widths of other typedefs; and (b) the kernel has not uniformly initialized the contents of shared memory to zeroes. Still, it seems a tad risky, and we can easily remove any risk by pre-zeroing the request array for ourselves.

In addition to that, we need to establish a coding rule that struct RelFileNode can't contain any padding bytes, since such structs are copied into the request array verbatim. (There are other places that are assuming this anyway, it turns out.)

In 9.1 and up, the risk was a bit larger because we were also effectively assuming that struct RelFileNodeBackend contained no pad bytes, and with fields of different types in there, that would be much easier to break. However, there is no good reason to ever transmit fsync or delete requests for temp files to the bgwriter/checkpointer, so we can revert the request structs to plain RelFileNode, getting rid of the padding risk and saving some marginal number of bytes and cycles in fsync queue manipulation while we are at it. The savings might be more than marginal during deletion of a temp relation, because the old code transmitted an entirely useless but nonetheless expensive-to-process ForgetRelationFsync request to the background process, and also had the background process perform the file deletion even though that can safely be done immediately.

In addition, make some cleanup of nearby comments and small improvements to the code in CompactCheckpointerRequestQueue/CompactBgwriterRequestQueue.
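Aside (not part of the commit): a minimal standalone C sketch of the padding hazard the first paragraph describes. The struct and names here are hypothetical; the point is that raw-memory comparison of field-wise-equal structs can disagree when pad bytes are uninitialized, which is exactly why the patch pre-zeroes the request array before structs are hashed and compared as bytes.

    #include <stdio.h>
    #include <string.h>

    /*
     * Hypothetical illustration (not PostgreSQL code): a 1-byte field
     * followed by a 4-byte field typically leaves 3 pad bytes.  If such
     * structs are hashed or compared as raw memory (as a tag_hash-style
     * hashtable key is), garbage padding makes equal keys look distinct.
     */
    typedef struct
    {
        char tag;       /* 1 byte, then (typically) 3 pad bytes */
        int  blockno;   /* 4 bytes */
    } PaddedKey;

    int
    main(void)
    {
        PaddedKey a, b;

        /* Simulate uninitialized padding with two different fill bytes. */
        memset(&a, 0xAA, sizeof(a));
        memset(&b, 0x55, sizeof(b));
        a.tag = 'x'; a.blockno = 42;
        b.tag = 'x'; b.blockno = 42;

        /* Field-wise equal, but raw-memory comparison may disagree. */
        printf("before zeroing: %s\n",
               memcmp(&a, &b, sizeof(a)) == 0 ? "equal" : "DIFFERENT");

        /* The commit's fix: zero the whole struct before filling fields. */
        memset(&a, 0, sizeof(a));
        memset(&b, 0, sizeof(b));
        a.tag = 'x'; a.blockno = 42;
        b.tag = 'x'; b.blockno = 42;
        printf("after zeroing:  %s\n",
               memcmp(&a, &b, sizeof(a)) == 0 ? "equal" : "DIFFERENT");
        return 0;
    }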
Diffstat (limited to 'src/backend/postmaster/checkpointer.c')
-rw-r--r--  src/backend/postmaster/checkpointer.c  51
1 file changed, 36 insertions(+), 15 deletions(-)
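The message above also establishes a coding rule that struct RelFileNode must contain no padding bytes. A hedged illustration of how such a rule could be expressed as a C11 compile-time check follows; the field layout mirrors the real struct (three Oids), but the assertion itself is hypothetical and not part of this commit, which enforces the rule only by convention.

    /* Illustrative only: C11 compile-time guard for the "no padding in
     * RelFileNode" coding rule.  Field layout mirrors the real struct. */
    typedef unsigned int Oid;

    typedef struct RelFileNode
    {
        Oid spcNode;    /* tablespace */
        Oid dbNode;     /* database */
        Oid relNode;    /* relation */
    } RelFileNode;

    _Static_assert(sizeof(RelFileNode) == 3 * sizeof(Oid),
                   "RelFileNode must not contain padding bytes");

    int
    main(void)
    {
        return 0;       /* nothing to run; the check happens at compile time */
    }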
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index 417e3bb0d1b..92fd4276cd1 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -105,7 +105,7 @@
*/
typedef struct
{
- RelFileNodeBackend rnode;
+ RelFileNode rnode;
ForkNumber forknum;
BlockNumber segno; /* see md.c for special values */
/* might add a real request-type field later; not needed yet */
@@ -924,17 +924,22 @@ CheckpointerShmemSize(void)
void
CheckpointerShmemInit(void)
{
+ Size size = CheckpointerShmemSize();
bool found;
CheckpointerShmem = (CheckpointerShmemStruct *)
ShmemInitStruct("Checkpointer Data",
- CheckpointerShmemSize(),
+ size,
&found);
if (!found)
{
- /* First time through, so initialize */
- MemSet(CheckpointerShmem, 0, sizeof(CheckpointerShmemStruct));
+ /*
+ * First time through, so initialize. Note that we zero the whole
+ * requests array; this is so that CompactCheckpointerRequestQueue
+ * can assume that any pad bytes in the request structs are zeroes.
+ */
+ MemSet(CheckpointerShmem, 0, size);
SpinLockInit(&CheckpointerShmem->ckpt_lck);
CheckpointerShmem->max_requests = NBuffers;
}
@@ -1091,11 +1096,15 @@ RequestCheckpoint(int flags)
* Forward a file-fsync request from a backend to the checkpointer
*
* Whenever a backend is compelled to write directly to a relation
- * (which should be seldom, if the checkpointer is getting its job done),
+ * (which should be seldom, if the background writer is getting its job done),
* the backend calls this routine to pass over knowledge that the relation
* is dirty and must be fsync'd before next checkpoint. We also use this
* opportunity to count such writes for statistical purposes.
*
+ * This functionality is only supported for regular (not backend-local)
+ * relations, so the rnode argument is intentionally RelFileNode not
+ * RelFileNodeBackend.
+ *
* segno specifies which segment (not block!) of the relation needs to be
* fsync'd. (Since the valid range is much less than BlockNumber, we can
* use high values for special flags; that's all internal to md.c, which
@@ -1112,8 +1121,7 @@ RequestCheckpoint(int flags)
* let the backend know by returning false.
*/
bool
-ForwardFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
- BlockNumber segno)
+ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
{
CheckpointerRequest *request;
bool too_full;
@@ -1169,6 +1177,7 @@ ForwardFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
/*
* CompactCheckpointerRequestQueue
* Remove duplicates from the request queue to avoid backend fsyncs.
+ * Returns "true" if any entries were removed.
*
* Although a full fsync request queue is not common, it can lead to severe
* performance problems when it does happen. So far, this situation has
@@ -1178,7 +1187,7 @@ ForwardFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
* gets very expensive and can slow down the whole system.
*
* Trying to do this every time the queue is full could lose if there
- * aren't any removable entries. But should be vanishingly rare in
+ * aren't any removable entries. But that should be vanishingly rare in
* practice: there's one queue entry per shared buffer.
*/
static bool
@@ -1200,18 +1209,20 @@ CompactCheckpointerRequestQueue(void)
/* must hold CheckpointerCommLock in exclusive mode */
Assert(LWLockHeldByMe(CheckpointerCommLock));
+ /* Initialize skip_slot array */
+ skip_slot = palloc0(sizeof(bool) * CheckpointerShmem->num_requests);
+
/* Initialize temporary hash table */
MemSet(&ctl, 0, sizeof(ctl));
ctl.keysize = sizeof(CheckpointerRequest);
ctl.entrysize = sizeof(struct CheckpointerSlotMapping);
ctl.hash = tag_hash;
+ ctl.hcxt = CurrentMemoryContext;
+
htab = hash_create("CompactCheckpointerRequestQueue",
CheckpointerShmem->num_requests,
&ctl,
- HASH_ELEM | HASH_FUNCTION);
-
- /* Initialize skip_slot array */
- skip_slot = palloc0(sizeof(bool) * CheckpointerShmem->num_requests);
+ HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
/*
* The basic idea here is that a request can be skipped if it's followed
@@ -1226,19 +1237,28 @@ CompactCheckpointerRequestQueue(void)
* anyhow), but it's not clear that the extra complexity would buy us
* anything.
*/
- for (n = 0; n < CheckpointerShmem->num_requests; ++n)
+ for (n = 0; n < CheckpointerShmem->num_requests; n++)
{
CheckpointerRequest *request;
struct CheckpointerSlotMapping *slotmap;
bool found;
+ /*
+ * We use the request struct directly as a hashtable key. This
+ * assumes that any padding bytes in the structs are consistently the
+ * same, which should be okay because we zeroed them in
+ * CheckpointerShmemInit. Note also that RelFileNode had better
+ * contain no pad bytes.
+ */
request = &CheckpointerShmem->requests[n];
slotmap = hash_search(htab, request, HASH_ENTER, &found);
if (found)
{
+ /* Duplicate, so mark the previous occurrence as skippable */
skip_slot[slotmap->slot] = true;
- ++num_skipped;
+ num_skipped++;
}
+ /* Remember slot containing latest occurrence of this request value */
slotmap->slot = n;
}
@@ -1253,7 +1273,8 @@ CompactCheckpointerRequestQueue(void)
}
/* We found some duplicates; remove them. */
- for (n = 0, preserve_count = 0; n < CheckpointerShmem->num_requests; ++n)
+ preserve_count = 0;
+ for (n = 0; n < CheckpointerShmem->num_requests; n++)
{
if (skip_slot[n])
continue;
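For readers skimming the hunks above: the compaction pass marks every request that is duplicated later in the queue as skippable, keeping the latest occurrence, then slides the survivors down over the skipped slots. A minimal standalone sketch of that idea follows (hypothetical types and names; a quadratic scan stands in for the real code's dynahash table, and comparing raw bytes is safe here only because the sketch's struct has no padding).

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    typedef struct
    {
        int rnode;      /* stand-in for RelFileNode */
        int forknum;
        int segno;
    } Req;

    static int
    compact(Req *reqs, int n)
    {
        bool skip[64] = {false};    /* sketch assumes n <= 64 */
        int  preserved = 0;

        /*
         * A request is skippable if an identical one appears later in
         * the queue; the latest occurrence is kept, as in the real code.
         */
        for (int i = 0; i < n; i++)
            for (int j = i + 1; j < n; j++)
                if (memcmp(&reqs[i], &reqs[j], sizeof(Req)) == 0)
                {
                    skip[i] = true;
                    break;
                }

        /* Slide the preserved requests down over the skipped slots. */
        for (int i = 0; i < n; i++)
        {
            if (!skip[i])
                reqs[preserved++] = reqs[i];
        }
        return preserved;
    }

    int
    main(void)
    {
        Req queue[6] = {
            {1, 0, 10}, {2, 0, 11}, {1, 0, 10},
            {3, 1, 12}, {2, 0, 11}, {1, 0, 10},
        };
        int n = compact(queue, 6);

        printf("compacted 6 requests down to %d\n", n);     /* prints 3 */
        return 0;
    }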