diff options
Diffstat (limited to 'src/backend/storage/buffer/bufmgr.c')
-rw-r--r-- | src/backend/storage/buffer/bufmgr.c | 266 |
1 files changed, 179 insertions, 87 deletions
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index d515a7a2590..a80435b7ec2 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.160 2004/02/12 20:07:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.161 2004/04/19 23:27:17 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -54,9 +54,9 @@ #include "storage/proc.h" #include "storage/smgr.h" #include "utils/relcache.h" - #include "pgstat.h" + #define BufferGetLSN(bufHdr) \ (*((XLogRecPtr*) MAKE_PTR((bufHdr)->data))) @@ -64,15 +64,17 @@ /* GUC variable */ bool zero_damaged_pages = false; +#ifdef NOT_USED +int ShowPinTrace = 0; +#endif + int BgWriterDelay = 200; int BgWriterPercent = 1; int BgWriterMaxpages = 100; -static void WaitIO(BufferDesc *buf); -static void StartBufferIO(BufferDesc *buf, bool forInput); -static void TerminateBufferIO(BufferDesc *buf); -static void ContinueBufferIO(BufferDesc *buf, bool forInput); -static void buffer_write_error_callback(void *arg); +long NDirectFileRead; /* some I/O's are direct file access. + * bypass bufmgr */ +long NDirectFileWrite; /* e.g., I/O in psort and hashjoin. */ /* * Macro : BUFFER_IS_BROKEN @@ -80,18 +82,22 @@ static void buffer_write_error_callback(void *arg); */ #define BUFFER_IS_BROKEN(buf) ((buf->flags & BM_IO_ERROR) && !(buf->flags & BM_DIRTY)) + +static void PinBuffer(BufferDesc *buf); +static void UnpinBuffer(BufferDesc *buf); +static void WaitIO(BufferDesc *buf); +static void StartBufferIO(BufferDesc *buf, bool forInput); +static void TerminateBufferIO(BufferDesc *buf); +static void ContinueBufferIO(BufferDesc *buf, bool forInput); +static void buffer_write_error_callback(void *arg); static Buffer ReadBufferInternal(Relation reln, BlockNumber blockNum, bool bufferLockHeld); static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr); static void BufferReplace(BufferDesc *bufHdr); - -#ifdef NOT_USED -void PrintBufferDescs(void); -#endif - static void write_buffer(Buffer buffer, bool unpin); + /* * ReadBuffer -- returns a buffer containing the requested * block of the requested relation. If the blknum @@ -282,14 +288,15 @@ BufferAlloc(Relation reln, BufferDesc *buf, *buf2; BufferTag newTag; /* identity of requested block */ + int cdb_found_index, + cdb_replace_index; bool inProgress; /* buffer undergoing IO */ - /* create a new tag so we can lookup the buffer */ - /* assume that the relation is already open */ + /* create a tag so we can lookup the buffer */ INIT_BUFFERTAG(&newTag, reln, blockNum); /* see if the block is in the buffer pool already */ - buf = StrategyBufferLookup(&newTag, false); + buf = StrategyBufferLookup(&newTag, false, &cdb_found_index); if (buf != NULL) { /* @@ -332,6 +339,13 @@ BufferAlloc(Relation reln, } LWLockRelease(BufMgrLock); + + /* + * Do the cost accounting for vacuum + */ + if (VacuumCostActive) + VacuumCostBalance += VacuumCostPageHit; + return buf; } @@ -345,16 +359,16 @@ BufferAlloc(Relation reln, inProgress = FALSE; for (buf = NULL; buf == NULL;) { - buf = StrategyGetBuffer(); + buf = StrategyGetBuffer(&cdb_replace_index); - /* GetFreeBuffer will abort if it can't find a free buffer */ + /* StrategyGetBuffer will elog if it can't find a free buffer */ Assert(buf); /* * There should be exactly one pin on the buffer after it is * allocated -- ours. If it had a pin it wouldn't have been on * the free list. No one else could have pinned it between - * GetFreeBuffer and here because we have the BufMgrLock. + * StrategyGetBuffer and here because we have the BufMgrLock. */ Assert(buf->refcount == 0); buf->refcount = 1; @@ -438,7 +452,7 @@ BufferAlloc(Relation reln, * we haven't gotten around to insert the new tag into the * buffer table. So we need to check here. -ay 3/95 */ - buf2 = StrategyBufferLookup(&newTag, true); + buf2 = StrategyBufferLookup(&newTag, true, &cdb_found_index); if (buf2 != NULL) { /* @@ -471,6 +485,15 @@ BufferAlloc(Relation reln, } LWLockRelease(BufMgrLock); + + /* + * Do the cost accounting for vacuum. (XXX perhaps better + * to consider this a miss? We didn't have to do the read, + * but we did have to write ...) + */ + if (VacuumCostActive) + VacuumCostBalance += VacuumCostPageHit; + return buf2; } } @@ -485,8 +508,8 @@ BufferAlloc(Relation reln, * Tell the buffer replacement strategy that we are replacing the * buffer content. Then rename the buffer. */ - StrategyReplaceBuffer(buf, reln, blockNum); - INIT_BUFFERTAG(&(buf->tag), reln, blockNum); + StrategyReplaceBuffer(buf, &newTag, cdb_found_index, cdb_replace_index); + buf->tag = newTag; /* * Buffer contents are currently invalid. Have to mark IO IN PROGRESS @@ -501,6 +524,12 @@ BufferAlloc(Relation reln, LWLockRelease(BufMgrLock); + /* + * Do the cost accounting for vacuum + */ + if (VacuumCostActive) + VacuumCostBalance += VacuumCostPageMiss; + return buf; } @@ -624,69 +653,117 @@ ReleaseAndReadBuffer(Buffer buffer, } /* - * BufferSync -- Write all dirty buffers in the pool. + * PinBuffer -- make buffer unavailable for replacement. * - * This is called at checkpoint time and writes out all dirty shared buffers, + * This should be applied only to shared buffers, never local ones. + * Bufmgr lock must be held by caller. + */ +static void +PinBuffer(BufferDesc *buf) +{ + int b = BufferDescriptorGetBuffer(buf) - 1; + + if (PrivateRefCount[b] == 0) + buf->refcount++; + PrivateRefCount[b]++; + Assert(PrivateRefCount[b] > 0); +} + +/* + * UnpinBuffer -- make buffer available for replacement. + * + * This should be applied only to shared buffers, never local ones. + * Bufmgr lock must be held by caller. + */ +static void +UnpinBuffer(BufferDesc *buf) +{ + int b = BufferDescriptorGetBuffer(buf) - 1; + + Assert(buf->refcount > 0); + Assert(PrivateRefCount[b] > 0); + PrivateRefCount[b]--; + if (PrivateRefCount[b] == 0) + buf->refcount--; + + if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 && + buf->refcount == 1) + { + /* we just released the last pin other than the waiter's */ + buf->flags &= ~BM_PIN_COUNT_WAITER; + ProcSendSignal(buf->wait_backend_id); + } + else + { + /* do nothing */ + } +} + +/* + * BufferSync -- Write out dirty buffers in the pool. + * + * This is called at checkpoint time to write out all dirty shared buffers, * and by the background writer process to write out some of the dirty blocks. + * percent/maxpages should be zero in the former case, and nonzero limit + * values in the latter. */ int BufferSync(int percent, int maxpages) { + BufferDesc **dirty_buffers; + BufferTag *buftags; + int num_buffer_dirty; int i; - BufferDesc *bufHdr; ErrorContextCallback errcontext; - int num_buffer_dirty; - int *buffer_dirty; - - /* Setup error traceback support for ereport() */ - errcontext.callback = buffer_write_error_callback; - errcontext.arg = NULL; - errcontext.previous = error_context_stack; - error_context_stack = &errcontext; - /* * Get a list of all currently dirty buffers and how many there are. * We do not flush buffers that get dirtied after we started. They * have to wait until the next checkpoint. */ - buffer_dirty = (int *)palloc(NBuffers * sizeof(int)); + dirty_buffers = (BufferDesc **) palloc(NBuffers * sizeof(BufferDesc *)); + buftags = (BufferTag *) palloc(NBuffers * sizeof(BufferTag)); LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); - num_buffer_dirty = StrategyDirtyBufferList(buffer_dirty, NBuffers); - LWLockRelease(BufMgrLock); + num_buffer_dirty = StrategyDirtyBufferList(dirty_buffers, buftags, + NBuffers); /* * If called by the background writer, we are usually asked to - * only write out some percentage of dirty buffers now, to prevent + * only write out some portion of dirty buffers now, to prevent * the IO storm at checkpoint time. */ - if (percent > 0 && num_buffer_dirty > 10) + if (percent > 0) { Assert(percent <= 100); - num_buffer_dirty = (num_buffer_dirty * percent) / 100; - if (maxpages > 0 && num_buffer_dirty > maxpages) - num_buffer_dirty = maxpages; + num_buffer_dirty = (num_buffer_dirty * percent + 99) / 100; } + if (maxpages > 0 && num_buffer_dirty > maxpages) + num_buffer_dirty = maxpages; + + /* Setup error traceback support for ereport() */ + errcontext.callback = buffer_write_error_callback; + errcontext.arg = NULL; + errcontext.previous = error_context_stack; + error_context_stack = &errcontext; + /* + * Loop over buffers to be written. Note the BufMgrLock is held at + * loop top, but is released and reacquired intraloop, so we aren't + * holding it long. + */ for (i = 0; i < num_buffer_dirty; i++) { + BufferDesc *bufHdr = dirty_buffers[i]; Buffer buffer; XLogRecPtr recptr; SMgrRelation reln; - LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); - - bufHdr = &BufferDescriptors[buffer_dirty[i]]; errcontext.arg = bufHdr; - if (!(bufHdr->flags & BM_VALID)) - { - LWLockRelease(BufMgrLock); - continue; - } - /* + * Check it is still the same page and still needs writing. + * * We can check bufHdr->cntxDirty here *without* holding any lock * on buffer context as long as we set this flag in access methods * *before* logging changes with XLogInsert(): if someone will set @@ -694,11 +771,12 @@ BufferSync(int percent, int maxpages) * checkpoint.redo points before log record for upcoming changes * and so we are not required to write such dirty buffer. */ + if (!(bufHdr->flags & BM_VALID)) + continue; + if (!BUFFERTAGS_EQUAL(&bufHdr->tag, &buftags[i])) + continue; if (!(bufHdr->flags & BM_DIRTY) && !(bufHdr->cntxDirty)) - { - LWLockRelease(BufMgrLock); continue; - } /* * IO synchronization. Note that we do it with unpinned buffer to @@ -707,12 +785,13 @@ BufferSync(int percent, int maxpages) if (bufHdr->flags & BM_IO_IN_PROGRESS) { WaitIO(bufHdr); - if (!(bufHdr->flags & BM_VALID) || - (!(bufHdr->flags & BM_DIRTY) && !(bufHdr->cntxDirty))) - { - LWLockRelease(BufMgrLock); + /* Still need writing? */ + if (!(bufHdr->flags & BM_VALID)) + continue; + if (!BUFFERTAGS_EQUAL(&bufHdr->tag, &buftags[i])) + continue; + if (!(bufHdr->flags & BM_DIRTY) && !(bufHdr->cntxDirty)) continue; - } } /* @@ -723,10 +802,11 @@ BufferSync(int percent, int maxpages) PinBuffer(bufHdr); StartBufferIO(bufHdr, false); /* output IO start */ - buffer = BufferDescriptorGetBuffer(bufHdr); - + /* Release BufMgrLock while doing xlog work */ LWLockRelease(BufMgrLock); + buffer = BufferDescriptorGetBuffer(bufHdr); + /* * Protect buffer content against concurrent update */ @@ -740,8 +820,12 @@ BufferSync(int percent, int maxpages) /* * Now it's safe to write buffer to disk. Note that no one else - * should not be able to write it while we were busy with locking - * and log flushing because of we setted IO flag. + * should have been able to write it while we were busy with + * locking and log flushing because we set the IO flag. + * + * Before we issue the actual write command, clear the just-dirtied + * flag. This lets us recognize concurrent changes (note that only + * hint-bit changes are possible since we hold the buffer shlock). */ LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); Assert(bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty); @@ -767,12 +851,12 @@ BufferSync(int percent, int maxpages) * Release the per-buffer readlock, reacquire BufMgrLock. */ LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - BufferFlushCount++; LWLockAcquire(BufMgrLock, LW_EXCLUSIVE); bufHdr->flags &= ~BM_IO_IN_PROGRESS; /* mark IO finished */ TerminateBufferIO(bufHdr); /* Sync IO finished */ + BufferFlushCount++; /* * If this buffer was marked by someone as DIRTY while we were @@ -781,14 +865,16 @@ BufferSync(int percent, int maxpages) if (!(bufHdr->flags & BM_JUST_DIRTIED)) bufHdr->flags &= ~BM_DIRTY; UnpinBuffer(bufHdr); - LWLockRelease(BufMgrLock); } - pfree(buffer_dirty); + LWLockRelease(BufMgrLock); /* Pop the error context stack */ error_context_stack = errcontext.previous; + pfree(dirty_buffers); + pfree(buftags); + return num_buffer_dirty; } @@ -818,11 +904,6 @@ WaitIO(BufferDesc *buf) } -long NDirectFileRead; /* some I/O's are direct file access. - * bypass bufmgr */ -long NDirectFileWrite; /* e.g., I/O in psort and hashjoin. */ - - /* * Return a palloc'd string containing buffer usage statistics. */ @@ -892,9 +973,9 @@ AtEOXact_Buffers(bool isCommit) if (isCommit) elog(WARNING, - "buffer refcount leak: [%03d] (bufNext=%d, " - "rel=%u/%u, blockNum=%u, flags=0x%x, refcount=%d %ld)", - i, buf->bufNext, + "buffer refcount leak: [%03d] " + "(rel=%u/%u, blockNum=%u, flags=0x%x, refcount=%d %ld)", + i, buf->tag.rnode.tblNode, buf->tag.rnode.relNode, buf->tag.blockNum, buf->flags, buf->refcount, PrivateRefCount[i]); @@ -1022,6 +1103,26 @@ BufferGetBlockNumber(Buffer buffer) } /* + * BufferGetFileNode + * Returns the relation ID (RelFileNode) associated with a buffer. + * + * This should make the same checks as BufferGetBlockNumber, but since the + * two are generally called together, we don't bother. + */ +RelFileNode +BufferGetFileNode(Buffer buffer) +{ + BufferDesc *bufHdr; + + if (BufferIsLocal(buffer)) + bufHdr = &(LocalBufferDescriptors[-buffer - 1]); + else + bufHdr = &BufferDescriptors[buffer - 1]; + + return (bufHdr->tag.rnode); +} + +/* * BufferReplace * * Write out the buffer corresponding to 'bufHdr'. @@ -1663,7 +1764,11 @@ refcount = %ld, file: %s, line: %d\n", * * This routine might get called many times on the same page, if we are making * the first scan after commit of an xact that added/deleted many tuples. - * So, be as quick as we can if the buffer is already dirty. + * So, be as quick as we can if the buffer is already dirty. We do this by + * not acquiring BufMgrLock if it looks like the status bits are already OK. + * (Note it is okay if someone else clears BM_JUST_DIRTIED immediately after + * we look, because the buffer content update is already done and will be + * reflected in the I/O.) */ void SetBufferCommitInfoNeedsSave(Buffer buffer) @@ -2008,19 +2113,6 @@ AbortBufferIO(void) } } -RelFileNode -BufferGetFileNode(Buffer buffer) -{ - BufferDesc *bufHdr; - - if (BufferIsLocal(buffer)) - bufHdr = &(LocalBufferDescriptors[-buffer - 1]); - else - bufHdr = &BufferDescriptors[buffer - 1]; - - return (bufHdr->tag.rnode); -} - /* * Error context callback for errors occurring during buffer writes. */ |