diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2004-04-19 23:27:17 +0000 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2004-04-19 23:27:17 +0000 |
commit | 011c3e62e78b02a573f6f59ce78b1e7515d380c0 (patch) | |
tree | 19c3c8ab77264aa49421cd432025f0fa25edbf60 /src/backend/storage/buffer/bufmgr.c | |
parent | 8f73bbae854b71bea6a544941886f072829beb72 (diff) | |
download | postgresql-011c3e62e78b02a573f6f59ce78b1e7515d380c0.tar.gz postgresql-011c3e62e78b02a573f6f59ce78b1e7515d380c0.zip |
Code review for ARC patch. Eliminate static variables; improve handling
of VACUUM cases so that VACUUM requests don't affect the ARC state at all;
and avoid a corner case where BufferSync would uselessly rewrite a buffer
that no longer contains the page that was to be flushed. Make some other
minor cleanups in and around the bufmgr as well, such as moving PinBuffer
and UnpinBuffer into bufmgr.c where they really belong.
Diffstat (limited to 'src/backend/storage/buffer/bufmgr.c')
-rw-r--r-- | src/backend/storage/buffer/bufmgr.c | 266 |
1 file changed, 179 insertions, 87 deletions
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index d515a7a2590..a80435b7ec2 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.160 2004/02/12 20:07:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.161 2004/04/19 23:27:17 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -54,9 +54,9 @@ #include "storage/proc.h" #include "storage/smgr.h" #include "utils/relcache.h" - #include "pgstat.h" + #define BufferGetLSN(bufHdr) \ (*((XLogRecPtr*) MAKE_PTR((bufHdr)->data))) @@ -64,15 +64,17 @@ /* GUC variable */ bool zero_damaged_pages = false; +#ifdef NOT_USED +int ShowPinTrace = 0; +#endif + int BgWriterDelay = 200; int BgWriterPercent = 1; int BgWriterMaxpages = 100; -static void WaitIO(BufferDesc *buf); -static void StartBufferIO(BufferDesc *buf, bool forInput); -static void TerminateBufferIO(BufferDesc *buf); -static void ContinueBufferIO(BufferDesc *buf, bool forInput); -static void buffer_write_error_callback(void *arg); +long NDirectFileRead; /* some I/O's are direct file access. + * bypass bufmgr */ +long NDirectFileWrite; /* e.g., I/O in psort and hashjoin. 
*/ /* * Macro : BUFFER_IS_BROKEN @@ -80,18 +82,22 @@ static void buffer_write_error_callback(void *arg); */ #define BUFFER_IS_BROKEN(buf) ((buf->flags & BM_IO_ERROR) && !(buf->flags & BM_DIRTY)) + +static void PinBuffer(BufferDesc *buf); +static void UnpinBuffer(BufferDesc *buf); +static void WaitIO(BufferDesc *buf); +static void StartBufferIO(BufferDesc *buf, bool forInput); +static void TerminateBufferIO(BufferDesc *buf); +static void ContinueBufferIO(BufferDesc *buf, bool forInput); +static void buffer_write_error_callback(void *arg); static Buffer ReadBufferInternal(Relation reln, BlockNumber blockNum, bool bufferLockHeld); static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr); static void BufferReplace(BufferDesc *bufHdr); - -#ifdef NOT_USED -void PrintBufferDescs(void); -#endif - static void write_buffer(Buffer buffer, bool unpin); + /* * ReadBuffer -- returns a buffer containing the requested * block of the requested relation. If the blknum @@ -282,14 +288,15 @@ BufferAlloc(Relation reln, BufferDesc *buf, *buf2; BufferTag newTag; /* identity of requested block */ + int cdb_found_index, + cdb_replace_index; bool inProgress; /* buffer undergoing IO */ - /* create a new tag so we can lookup the buffer */ - /* assume that the relation is already open */ + /* create a tag so we can lookup the buffer */ INIT_BUFFERTAG(&newTag, reln, blockNum); /* see if the block is in the buffer pool already */ - buf = StrategyBufferLookup(&newTag, false); + buf = StrategyBufferLookup(&newTag, false, &cdb_found_index); if (buf != NULL) { /* @@ -332,6 +339,13 @@ BufferAlloc(Relation reln, } LWLockRelease(BufMgrLock); + + /* + * Do the cost accounting for vacuum + */ + if (VacuumCostActive) + VacuumCostBalance += VacuumCostPageHit; + return buf; } @@ -345,16 +359,16 @@ BufferAlloc(Relation reln, inProgress = FALSE; for (buf = NULL; buf == NULL;) { - buf = StrategyGetBuffer(); + buf = StrategyGetBuffer(&cdb_replace_index); - /* GetFreeBuffer will abort 
if it can't find a free buffer */ + /* StrategyGetBuffer will elog if it can't find a free buffer */ Assert(buf); /* * There should be exactly one pin on the buffer after it is * allocated -- ours. If it had a pin it wouldn't have been on * the free list. No one else could have pinned it between - * GetFreeBuffer and here because we have the BufMgrLock. + * StrategyGetBuffer and here because we have the BufMgrLock. */ Assert(buf->refcount == 0); buf->refcount = 1; @@ -438,7 +452,7 @@ BufferAlloc(Relation reln, * we haven't gotten around to insert the new tag into the * buffer table. So we need to check here. -ay 3/95 */ - buf2 = StrategyBufferLookup(&newTag, true); + buf2 = StrategyBufferLookup(&newTag, true, &cdb_found_index); if (buf2 != NULL) { /* @@ -471,6 +485,15 @@ BufferAlloc(Relation reln, } LWLockRelease(BufMgrLock); + + /* + * Do the cost accounting for vacuum. (XXX perhaps better + * to consider this a miss? We didn't have to do the read, + * but we did have to write ...) + */ + if (VacuumCostActive) + VacuumCostBalance += VacuumCostPageHit; + return buf2; } } @@ -485,8 +508,8 @@ BufferAlloc(Relation reln, * Tell the buffer replacement strategy that we are replacing the * buffer content. Then rename the buffer. */ - StrategyReplaceBuffer(buf, reln, blockNum); - INIT_BUFFERTAG(&(buf->tag), reln, blockNum); + StrategyReplaceBuffer(buf, &newTag, cdb_found_index, cdb_replace_index); + buf->tag = newTag; /* * Buffer contents are currently invalid. Have to mark IO IN PROGRESS @@ -501,6 +524,12 @@ BufferAlloc(Relation reln, LWLockRelease(BufMgrLock); + /* + * Do the cost accounting for vacuum + */ + if (VacuumCostActive) + VacuumCostBalance += VacuumCostPageMiss; + return buf; } @@ -624,69 +653,117 @@ ReleaseAndReadBuffer(Buffer buffer, } /* - * BufferSync -- Write all dirty buffers in the pool. + * PinBuffer -- make buffer unavailable for replacement. 
* - * This is called at checkpoint time and writes out all dirty shared buffers, + * This should be applied only to shared buffers, never local ones. + * Bufmgr lock must be held by caller. + */ +static void +PinBuffer(BufferDesc *buf) +{ + int b = BufferDescriptorGetBuffer(buf) - 1; + + if (PrivateRefCount[b] == 0) + buf->refcount++; + PrivateRefCount[b]++; + Assert(PrivateRefCount[b] > 0); +} + +/* + * UnpinBuffer -- make buffer available for replacement. + * + * This should be applied only to shared buffers, never local ones. + * Bufmgr lock must be held by caller. + */ +static void +UnpinBuffer(BufferDesc *buf) +{ + int b = BufferDescriptorGetBuffer(buf) - 1; + + Assert(buf->refcount > 0); + Assert(PrivateRefCount[b] > 0); + PrivateRefCount[b]--; + if (PrivateRefCount[b] == 0) + buf->refcount--; + + if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 && + buf->refcount == 1) + { + /* we just released the last pin other than the waiter's */ + buf->flags &= ~BM_PIN_COUNT_WAITER; + ProcSendSignal(buf->wait_backend_id); + } + else + { + /* do nothing */ + } +} + +/* + * BufferSync -- Write out dirty buffers in the pool. + * + * This is called at checkpoint time to write out all dirty shared buffers, * and by the background writer process to write out some of the dirty blocks. + * percent/maxpages should be zero in the former case, and nonzero limit + * values in the latter. */ int BufferSync(int percent, int maxpages) { + BufferDesc **dirty_buffers; + BufferTag *buftags; + int num_buffer_dirty; int i; - BufferDesc *bufHdr; ErrorContextCallback errcontext; - int num_buffer_dirty; - int *buffer_dirty; - - /* Setup error traceback support for ereport() */ - errcontext.callback = buffer_write_error_callback; - errcontext.arg = NULL; - errcontext.previous = error_context_stack; - error_context_stack = &errcontext; - /* * Get a list of all currently dirty buffers and how many there are. * We do not flush buffers that get dirtied after we started. 
They * have to wait until the next checkpoint. */ - buffer_dirty = (int *)palloc(NBuffers * sizeof(int)); + dirty_buffers = (BufferDesc **) palloc(NBuffers * sizeof(BufferDesc *)); + buftags = (BufferTag *) palloc(NBuffers * sizeof(BufferTag)); LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); - num_buffer_dirty = StrategyDirtyBufferList(buffer_dirty, NBuffers); - LWLockRelease(BufMgrLock); + num_buffer_dirty = StrategyDirtyBufferList(dirty_buffers, buftags, + NBuffers); /* * If called by the background writer, we are usually asked to - * only write out some percentage of dirty buffers now, to prevent + * only write out some portion of dirty buffers now, to prevent * the IO storm at checkpoint time. */ - if (percent > 0 && num_buffer_dirty > 10) + if (percent > 0) { Assert(percent <= 100); - num_buffer_dirty = (num_buffer_dirty * percent) / 100; - if (maxpages > 0 && num_buffer_dirty > maxpages) - num_buffer_dirty = maxpages; + num_buffer_dirty = (num_buffer_dirty * percent + 99) / 100; } + if (maxpages > 0 && num_buffer_dirty > maxpages) + num_buffer_dirty = maxpages; + + /* Setup error traceback support for ereport() */ + errcontext.callback = buffer_write_error_callback; + errcontext.arg = NULL; + errcontext.previous = error_context_stack; + error_context_stack = &errcontext; + /* + * Loop over buffers to be written. Note the BufMgrLock is held at + * loop top, but is released and reacquired intraloop, so we aren't + * holding it long. + */ for (i = 0; i < num_buffer_dirty; i++) { + BufferDesc *bufHdr = dirty_buffers[i]; Buffer buffer; XLogRecPtr recptr; SMgrRelation reln; - LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); - - bufHdr = &BufferDescriptors[buffer_dirty[i]]; errcontext.arg = bufHdr; - if (!(bufHdr->flags & BM_VALID)) - { - LWLockRelease(BufMgrLock); - continue; - } - /* + * Check it is still the same page and still needs writing. 
+ * * We can check bufHdr->cntxDirty here *without* holding any lock * on buffer context as long as we set this flag in access methods * *before* logging changes with XLogInsert(): if someone will set @@ -694,11 +771,12 @@ BufferSync(int percent, int maxpages) * checkpoint.redo points before log record for upcoming changes * and so we are not required to write such dirty buffer. */ + if (!(bufHdr->flags & BM_VALID)) + continue; + if (!BUFFERTAGS_EQUAL(&bufHdr->tag, &buftags[i])) + continue; if (!(bufHdr->flags & BM_DIRTY) && !(bufHdr->cntxDirty)) - { - LWLockRelease(BufMgrLock); continue; - } /* * IO synchronization. Note that we do it with unpinned buffer to @@ -707,12 +785,13 @@ BufferSync(int percent, int maxpages) if (bufHdr->flags & BM_IO_IN_PROGRESS) { WaitIO(bufHdr); - if (!(bufHdr->flags & BM_VALID) || - (!(bufHdr->flags & BM_DIRTY) && !(bufHdr->cntxDirty))) - { - LWLockRelease(BufMgrLock); + /* Still need writing? */ + if (!(bufHdr->flags & BM_VALID)) + continue; + if (!BUFFERTAGS_EQUAL(&bufHdr->tag, &buftags[i])) + continue; + if (!(bufHdr->flags & BM_DIRTY) && !(bufHdr->cntxDirty)) continue; - } } /* @@ -723,10 +802,11 @@ BufferSync(int percent, int maxpages) PinBuffer(bufHdr); StartBufferIO(bufHdr, false); /* output IO start */ - buffer = BufferDescriptorGetBuffer(bufHdr); - + /* Release BufMgrLock while doing xlog work */ LWLockRelease(BufMgrLock); + buffer = BufferDescriptorGetBuffer(bufHdr); + /* * Protect buffer content against concurrent update */ @@ -740,8 +820,12 @@ BufferSync(int percent, int maxpages) /* * Now it's safe to write buffer to disk. Note that no one else - * should not be able to write it while we were busy with locking - * and log flushing because of we setted IO flag. + * should have been able to write it while we were busy with + * locking and log flushing because we set the IO flag. + * + * Before we issue the actual write command, clear the just-dirtied + * flag. 
This lets us recognize concurrent changes (note that only + * hint-bit changes are possible since we hold the buffer shlock). */ LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); Assert(bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty); @@ -767,12 +851,12 @@ BufferSync(int percent, int maxpages) * Release the per-buffer readlock, reacquire BufMgrLock. */ LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - BufferFlushCount++; LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); bufHdr->flags &= ~BM_IO_IN_PROGRESS; /* mark IO finished */ TerminateBufferIO(bufHdr); /* Sync IO finished */ + BufferFlushCount++; /* * If this buffer was marked by someone as DIRTY while we were @@ -781,14 +865,16 @@ BufferSync(int percent, int maxpages) if (!(bufHdr->flags & BM_JUST_DIRTIED)) bufHdr->flags &= ~BM_DIRTY; UnpinBuffer(bufHdr); - LWLockRelease(BufMgrLock); } - pfree(buffer_dirty); + LWLockRelease(BufMgrLock); /* Pop the error context stack */ error_context_stack = errcontext.previous; + pfree(dirty_buffers); + pfree(buftags); + return num_buffer_dirty; } @@ -818,11 +904,6 @@ WaitIO(BufferDesc *buf) } -long NDirectFileRead; /* some I/O's are direct file access. - * bypass bufmgr */ -long NDirectFileWrite; /* e.g., I/O in psort and hashjoin. */ - - /* * Return a palloc'd string containing buffer usage statistics. */ @@ -892,9 +973,9 @@ AtEOXact_Buffers(bool isCommit) if (isCommit) elog(WARNING, - "buffer refcount leak: [%03d] (bufNext=%d, " - "rel=%u/%u, blockNum=%u, flags=0x%x, refcount=%d %ld)", - i, buf->bufNext, + "buffer refcount leak: [%03d] " + "(rel=%u/%u, blockNum=%u, flags=0x%x, refcount=%d %ld)", + i, buf->tag.rnode.tblNode, buf->tag.rnode.relNode, buf->tag.blockNum, buf->flags, buf->refcount, PrivateRefCount[i]); @@ -1022,6 +1103,26 @@ BufferGetBlockNumber(Buffer buffer) } /* + * BufferGetFileNode + * Returns the relation ID (RelFileNode) associated with a buffer. + * + * This should make the same checks as BufferGetBlockNumber, but since the + * two are generally called together, we don't bother. 
+ */ +RelFileNode +BufferGetFileNode(Buffer buffer) +{ + BufferDesc *bufHdr; + + if (BufferIsLocal(buffer)) + bufHdr = &(LocalBufferDescriptors[-buffer - 1]); + else + bufHdr = &BufferDescriptors[buffer - 1]; + + return (bufHdr->tag.rnode); +} + +/* * BufferReplace * * Write out the buffer corresponding to 'bufHdr'. @@ -1663,7 +1764,11 @@ refcount = %ld, file: %s, line: %d\n", * * This routine might get called many times on the same page, if we are making * the first scan after commit of an xact that added/deleted many tuples. - * So, be as quick as we can if the buffer is already dirty. + * So, be as quick as we can if the buffer is already dirty. We do this by + * not acquiring BufMgrLock if it looks like the status bits are already OK. + * (Note it is okay if someone else clears BM_JUST_DIRTIED immediately after + * we look, because the buffer content update is already done and will be + * reflected in the I/O.) */ void SetBufferCommitInfoNeedsSave(Buffer buffer) @@ -2008,19 +2113,6 @@ AbortBufferIO(void) } } -RelFileNode -BufferGetFileNode(Buffer buffer) -{ - BufferDesc *bufHdr; - - if (BufferIsLocal(buffer)) - bufHdr = &(LocalBufferDescriptors[-buffer - 1]); - else - bufHdr = &BufferDescriptors[buffer - 1]; - - return (bufHdr->tag.rnode); -} - /* * Error context callback for errors occurring during buffer writes. */ |