aboutsummaryrefslogtreecommitdiff
path: root/src/backend/storage/buffer/bufmgr.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/storage/buffer/bufmgr.c')
-rw-r--r--src/backend/storage/buffer/bufmgr.c266
1 files changed, 179 insertions, 87 deletions
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index d515a7a2590..a80435b7ec2 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.160 2004/02/12 20:07:26 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.161 2004/04/19 23:27:17 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -54,9 +54,9 @@
#include "storage/proc.h"
#include "storage/smgr.h"
#include "utils/relcache.h"
-
#include "pgstat.h"
+
#define BufferGetLSN(bufHdr) \
(*((XLogRecPtr*) MAKE_PTR((bufHdr)->data)))
@@ -64,15 +64,17 @@
/* GUC variable */
bool zero_damaged_pages = false;
+#ifdef NOT_USED
+int ShowPinTrace = 0;
+#endif
+
int BgWriterDelay = 200;
int BgWriterPercent = 1;
int BgWriterMaxpages = 100;
-static void WaitIO(BufferDesc *buf);
-static void StartBufferIO(BufferDesc *buf, bool forInput);
-static void TerminateBufferIO(BufferDesc *buf);
-static void ContinueBufferIO(BufferDesc *buf, bool forInput);
-static void buffer_write_error_callback(void *arg);
+long NDirectFileRead; /* some I/O's are direct file access.
+ * bypass bufmgr */
+long NDirectFileWrite; /* e.g., I/O in psort and hashjoin. */
/*
* Macro : BUFFER_IS_BROKEN
@@ -80,18 +82,22 @@ static void buffer_write_error_callback(void *arg);
*/
#define BUFFER_IS_BROKEN(buf) ((buf->flags & BM_IO_ERROR) && !(buf->flags & BM_DIRTY))
+
+static void PinBuffer(BufferDesc *buf);
+static void UnpinBuffer(BufferDesc *buf);
+static void WaitIO(BufferDesc *buf);
+static void StartBufferIO(BufferDesc *buf, bool forInput);
+static void TerminateBufferIO(BufferDesc *buf);
+static void ContinueBufferIO(BufferDesc *buf, bool forInput);
+static void buffer_write_error_callback(void *arg);
static Buffer ReadBufferInternal(Relation reln, BlockNumber blockNum,
bool bufferLockHeld);
static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum,
bool *foundPtr);
static void BufferReplace(BufferDesc *bufHdr);
-
-#ifdef NOT_USED
-void PrintBufferDescs(void);
-#endif
-
static void write_buffer(Buffer buffer, bool unpin);
+
/*
* ReadBuffer -- returns a buffer containing the requested
* block of the requested relation. If the blknum
@@ -282,14 +288,15 @@ BufferAlloc(Relation reln,
BufferDesc *buf,
*buf2;
BufferTag newTag; /* identity of requested block */
+ int cdb_found_index,
+ cdb_replace_index;
bool inProgress; /* buffer undergoing IO */
- /* create a new tag so we can lookup the buffer */
- /* assume that the relation is already open */
+ /* create a tag so we can lookup the buffer */
INIT_BUFFERTAG(&newTag, reln, blockNum);
/* see if the block is in the buffer pool already */
- buf = StrategyBufferLookup(&newTag, false);
+ buf = StrategyBufferLookup(&newTag, false, &cdb_found_index);
if (buf != NULL)
{
/*
@@ -332,6 +339,13 @@ BufferAlloc(Relation reln,
}
LWLockRelease(BufMgrLock);
+
+ /*
+ * Do the cost accounting for vacuum
+ */
+ if (VacuumCostActive)
+ VacuumCostBalance += VacuumCostPageHit;
+
return buf;
}
@@ -345,16 +359,16 @@ BufferAlloc(Relation reln,
inProgress = FALSE;
for (buf = NULL; buf == NULL;)
{
- buf = StrategyGetBuffer();
+ buf = StrategyGetBuffer(&cdb_replace_index);
- /* GetFreeBuffer will abort if it can't find a free buffer */
+ /* StrategyGetBuffer will elog if it can't find a free buffer */
Assert(buf);
/*
* There should be exactly one pin on the buffer after it is
* allocated -- ours. If it had a pin it wouldn't have been on
* the free list. No one else could have pinned it between
- * GetFreeBuffer and here because we have the BufMgrLock.
+ * StrategyGetBuffer and here because we have the BufMgrLock.
*/
Assert(buf->refcount == 0);
buf->refcount = 1;
@@ -438,7 +452,7 @@ BufferAlloc(Relation reln,
* we haven't gotten around to insert the new tag into the
* buffer table. So we need to check here. -ay 3/95
*/
- buf2 = StrategyBufferLookup(&newTag, true);
+ buf2 = StrategyBufferLookup(&newTag, true, &cdb_found_index);
if (buf2 != NULL)
{
/*
@@ -471,6 +485,15 @@ BufferAlloc(Relation reln,
}
LWLockRelease(BufMgrLock);
+
+ /*
+ * Do the cost accounting for vacuum. (XXX perhaps better
+ * to consider this a miss? We didn't have to do the read,
+ * but we did have to write ...)
+ */
+ if (VacuumCostActive)
+ VacuumCostBalance += VacuumCostPageHit;
+
return buf2;
}
}
@@ -485,8 +508,8 @@ BufferAlloc(Relation reln,
* Tell the buffer replacement strategy that we are replacing the
* buffer content. Then rename the buffer.
*/
- StrategyReplaceBuffer(buf, reln, blockNum);
- INIT_BUFFERTAG(&(buf->tag), reln, blockNum);
+ StrategyReplaceBuffer(buf, &newTag, cdb_found_index, cdb_replace_index);
+ buf->tag = newTag;
/*
* Buffer contents are currently invalid. Have to mark IO IN PROGRESS
@@ -501,6 +524,12 @@ BufferAlloc(Relation reln,
LWLockRelease(BufMgrLock);
+ /*
+ * Do the cost accounting for vacuum
+ */
+ if (VacuumCostActive)
+ VacuumCostBalance += VacuumCostPageMiss;
+
return buf;
}
@@ -624,69 +653,117 @@ ReleaseAndReadBuffer(Buffer buffer,
}
/*
- * BufferSync -- Write all dirty buffers in the pool.
+ * PinBuffer -- make buffer unavailable for replacement.
*
- * This is called at checkpoint time and writes out all dirty shared buffers,
+ * This should be applied only to shared buffers, never local ones.
+ * Bufmgr lock must be held by caller.
+ */
+static void
+PinBuffer(BufferDesc *buf)
+{
+ int b = BufferDescriptorGetBuffer(buf) - 1;
+
+ if (PrivateRefCount[b] == 0)
+ buf->refcount++;
+ PrivateRefCount[b]++;
+ Assert(PrivateRefCount[b] > 0);
+}
+
+/*
+ * UnpinBuffer -- make buffer available for replacement.
+ *
+ * This should be applied only to shared buffers, never local ones.
+ * Bufmgr lock must be held by caller.
+ */
+static void
+UnpinBuffer(BufferDesc *buf)
+{
+ int b = BufferDescriptorGetBuffer(buf) - 1;
+
+ Assert(buf->refcount > 0);
+ Assert(PrivateRefCount[b] > 0);
+ PrivateRefCount[b]--;
+ if (PrivateRefCount[b] == 0)
+ buf->refcount--;
+
+ if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
+ buf->refcount == 1)
+ {
+ /* we just released the last pin other than the waiter's */
+ buf->flags &= ~BM_PIN_COUNT_WAITER;
+ ProcSendSignal(buf->wait_backend_id);
+ }
+ else
+ {
+ /* do nothing */
+ }
+}
+
+/*
+ * BufferSync -- Write out dirty buffers in the pool.
+ *
+ * This is called at checkpoint time to write out all dirty shared buffers,
* and by the background writer process to write out some of the dirty blocks.
+ * percent/maxpages should be zero in the former case, and nonzero limit
+ * values in the latter.
*/
int
BufferSync(int percent, int maxpages)
{
+ BufferDesc **dirty_buffers;
+ BufferTag *buftags;
+ int num_buffer_dirty;
int i;
- BufferDesc *bufHdr;
ErrorContextCallback errcontext;
- int num_buffer_dirty;
- int *buffer_dirty;
-
- /* Setup error traceback support for ereport() */
- errcontext.callback = buffer_write_error_callback;
- errcontext.arg = NULL;
- errcontext.previous = error_context_stack;
- error_context_stack = &errcontext;
-
/*
* Get a list of all currently dirty buffers and how many there are.
* We do not flush buffers that get dirtied after we started. They
* have to wait until the next checkpoint.
*/
- buffer_dirty = (int *)palloc(NBuffers * sizeof(int));
+ dirty_buffers = (BufferDesc **) palloc(NBuffers * sizeof(BufferDesc *));
+ buftags = (BufferTag *) palloc(NBuffers * sizeof(BufferTag));
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
- num_buffer_dirty = StrategyDirtyBufferList(buffer_dirty, NBuffers);
- LWLockRelease(BufMgrLock);
+ num_buffer_dirty = StrategyDirtyBufferList(dirty_buffers, buftags,
+ NBuffers);
/*
* If called by the background writer, we are usually asked to
- * only write out some percentage of dirty buffers now, to prevent
+ * only write out some portion of dirty buffers now, to prevent
* the IO storm at checkpoint time.
*/
- if (percent > 0 && num_buffer_dirty > 10)
+ if (percent > 0)
{
Assert(percent <= 100);
- num_buffer_dirty = (num_buffer_dirty * percent) / 100;
- if (maxpages > 0 && num_buffer_dirty > maxpages)
- num_buffer_dirty = maxpages;
+ num_buffer_dirty = (num_buffer_dirty * percent + 99) / 100;
}
+ if (maxpages > 0 && num_buffer_dirty > maxpages)
+ num_buffer_dirty = maxpages;
+
+ /* Setup error traceback support for ereport() */
+ errcontext.callback = buffer_write_error_callback;
+ errcontext.arg = NULL;
+ errcontext.previous = error_context_stack;
+ error_context_stack = &errcontext;
+ /*
+ * Loop over buffers to be written. Note the BufMgrLock is held at
+ * loop top, but is released and reacquired intraloop, so we aren't
+ * holding it long.
+ */
for (i = 0; i < num_buffer_dirty; i++)
{
+ BufferDesc *bufHdr = dirty_buffers[i];
Buffer buffer;
XLogRecPtr recptr;
SMgrRelation reln;
- LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
-
- bufHdr = &BufferDescriptors[buffer_dirty[i]];
errcontext.arg = bufHdr;
- if (!(bufHdr->flags & BM_VALID))
- {
- LWLockRelease(BufMgrLock);
- continue;
- }
-
/*
+ * Check it is still the same page and still needs writing.
+ *
* We can check bufHdr->cntxDirty here *without* holding any lock
* on buffer context as long as we set this flag in access methods
* *before* logging changes with XLogInsert(): if someone will set
@@ -694,11 +771,12 @@ BufferSync(int percent, int maxpages)
* checkpoint.redo points before log record for upcoming changes
* and so we are not required to write such dirty buffer.
*/
+ if (!(bufHdr->flags & BM_VALID))
+ continue;
+ if (!BUFFERTAGS_EQUAL(&bufHdr->tag, &buftags[i]))
+ continue;
if (!(bufHdr->flags & BM_DIRTY) && !(bufHdr->cntxDirty))
- {
- LWLockRelease(BufMgrLock);
continue;
- }
/*
* IO synchronization. Note that we do it with unpinned buffer to
@@ -707,12 +785,13 @@ BufferSync(int percent, int maxpages)
if (bufHdr->flags & BM_IO_IN_PROGRESS)
{
WaitIO(bufHdr);
- if (!(bufHdr->flags & BM_VALID) ||
- (!(bufHdr->flags & BM_DIRTY) && !(bufHdr->cntxDirty)))
- {
- LWLockRelease(BufMgrLock);
+ /* Still need writing? */
+ if (!(bufHdr->flags & BM_VALID))
+ continue;
+ if (!BUFFERTAGS_EQUAL(&bufHdr->tag, &buftags[i]))
+ continue;
+ if (!(bufHdr->flags & BM_DIRTY) && !(bufHdr->cntxDirty))
continue;
- }
}
/*
@@ -723,10 +802,11 @@ BufferSync(int percent, int maxpages)
PinBuffer(bufHdr);
StartBufferIO(bufHdr, false); /* output IO start */
- buffer = BufferDescriptorGetBuffer(bufHdr);
-
+ /* Release BufMgrLock while doing xlog work */
LWLockRelease(BufMgrLock);
+ buffer = BufferDescriptorGetBuffer(bufHdr);
+
/*
* Protect buffer content against concurrent update
*/
@@ -740,8 +820,12 @@ BufferSync(int percent, int maxpages)
/*
* Now it's safe to write buffer to disk. Note that no one else
- * should not be able to write it while we were busy with locking
- * and log flushing because of we setted IO flag.
+ * should have been able to write it while we were busy with
+ * locking and log flushing because we set the IO flag.
+ *
+ * Before we issue the actual write command, clear the just-dirtied
+ * flag. This lets us recognize concurrent changes (note that only
+ * hint-bit changes are possible since we hold the buffer shlock).
*/
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
Assert(bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty);
@@ -767,12 +851,12 @@ BufferSync(int percent, int maxpages)
* Release the per-buffer readlock, reacquire BufMgrLock.
*/
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
- BufferFlushCount++;
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
bufHdr->flags &= ~BM_IO_IN_PROGRESS; /* mark IO finished */
TerminateBufferIO(bufHdr); /* Sync IO finished */
+ BufferFlushCount++;
/*
* If this buffer was marked by someone as DIRTY while we were
@@ -781,14 +865,16 @@ BufferSync(int percent, int maxpages)
if (!(bufHdr->flags & BM_JUST_DIRTIED))
bufHdr->flags &= ~BM_DIRTY;
UnpinBuffer(bufHdr);
- LWLockRelease(BufMgrLock);
}
- pfree(buffer_dirty);
+ LWLockRelease(BufMgrLock);
/* Pop the error context stack */
error_context_stack = errcontext.previous;
+ pfree(dirty_buffers);
+ pfree(buftags);
+
return num_buffer_dirty;
}
@@ -818,11 +904,6 @@ WaitIO(BufferDesc *buf)
}
-long NDirectFileRead; /* some I/O's are direct file access.
- * bypass bufmgr */
-long NDirectFileWrite; /* e.g., I/O in psort and hashjoin. */
-
-
/*
* Return a palloc'd string containing buffer usage statistics.
*/
@@ -892,9 +973,9 @@ AtEOXact_Buffers(bool isCommit)
if (isCommit)
elog(WARNING,
- "buffer refcount leak: [%03d] (bufNext=%d, "
- "rel=%u/%u, blockNum=%u, flags=0x%x, refcount=%d %ld)",
- i, buf->bufNext,
+ "buffer refcount leak: [%03d] "
+ "(rel=%u/%u, blockNum=%u, flags=0x%x, refcount=%d %ld)",
+ i,
buf->tag.rnode.tblNode, buf->tag.rnode.relNode,
buf->tag.blockNum, buf->flags,
buf->refcount, PrivateRefCount[i]);
@@ -1022,6 +1103,26 @@ BufferGetBlockNumber(Buffer buffer)
}
/*
+ * BufferGetFileNode
+ * Returns the relation ID (RelFileNode) associated with a buffer.
+ *
+ * This should make the same checks as BufferGetBlockNumber, but since the
+ * two are generally called together, we don't bother.
+ */
+RelFileNode
+BufferGetFileNode(Buffer buffer)
+{
+ BufferDesc *bufHdr;
+
+ if (BufferIsLocal(buffer))
+ bufHdr = &(LocalBufferDescriptors[-buffer - 1]);
+ else
+ bufHdr = &BufferDescriptors[buffer - 1];
+
+ return (bufHdr->tag.rnode);
+}
+
+/*
* BufferReplace
*
* Write out the buffer corresponding to 'bufHdr'.
@@ -1663,7 +1764,11 @@ refcount = %ld, file: %s, line: %d\n",
*
* This routine might get called many times on the same page, if we are making
* the first scan after commit of an xact that added/deleted many tuples.
- * So, be as quick as we can if the buffer is already dirty.
+ * So, be as quick as we can if the buffer is already dirty. We do this by
+ * not acquiring BufMgrLock if it looks like the status bits are already OK.
+ * (Note it is okay if someone else clears BM_JUST_DIRTIED immediately after
+ * we look, because the buffer content update is already done and will be
+ * reflected in the I/O.)
*/
void
SetBufferCommitInfoNeedsSave(Buffer buffer)
@@ -2008,19 +2113,6 @@ AbortBufferIO(void)
}
}
-RelFileNode
-BufferGetFileNode(Buffer buffer)
-{
- BufferDesc *bufHdr;
-
- if (BufferIsLocal(buffer))
- bufHdr = &(LocalBufferDescriptors[-buffer - 1]);
- else
- bufHdr = &BufferDescriptors[buffer - 1];
-
- return (bufHdr->tag.rnode);
-}
-
/*
* Error context callback for errors occurring during buffer writes.
*/