Diffstat (limited to 'src/backend/storage/buffer/bufmgr.c')
-rw-r--r-- | src/backend/storage/buffer/bufmgr.c | 1581
1 file changed, 1581 insertions, 0 deletions
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c new file mode 100644 index 00000000000..655f1f408e0 --- /dev/null +++ b/src/backend/storage/buffer/bufmgr.c @@ -0,0 +1,1581 @@ +/*------------------------------------------------------------------------- + * + * bufmgr.c-- + * buffer manager interface routines + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +/* + * + * BufferAlloc() -- lookup a buffer in the buffer table. If + * it isn't there add it, but do not read it into memory. + * This is used when we are about to reinitialize the + * buffer so don't care what the current disk contents are. + * BufferAlloc() pins the new buffer in memory. + * + * ReadBuffer() -- same as BufferAlloc() but reads the data + * on a buffer cache miss. + * + * ReleaseBuffer() -- unpin the buffer + * + * WriteNoReleaseBuffer() -- mark the buffer contents as "dirty" + * but don't unpin. The disk IO is delayed until buffer + * replacement if LateWrite flag is set. + * + * WriteBuffer() -- WriteNoReleaseBuffer() + ReleaseBuffer() + * + * DirtyBufferCopy() -- For a given dbid/relid/blockno, if the buffer is + * in the cache and is dirty, mark it clean and copy + * it to the requested location. This is a logical + * write, and has been installed to support the cache + * management code for write-once storage managers. + * + * FlushBuffer() -- as above but never delayed write. + * + * BufferSync() -- flush all dirty buffers in the buffer pool. + * + * InitBufferPool() -- Init the buffer module. + * + * See other files: + * freelist.c -- chooses victim for buffer replacement + * buf_table.c -- manages the buffer lookup table + */ +#include <sys/file.h> +#include <stdio.h> +#include <math.h> +#include <signal.h> + +/* declarations split between these three files */ +#include "storage/buf.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" + +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/shmem.h" +#include "storage/spin.h" +#include "storage/smgr.h" +#include "storage/lmgr.h" +#include "miscadmin.h" +#include "utils/builtins.h" +#include "utils/hsearch.h" +#include "utils/elog.h" +#include "utils/palloc.h" +#include "utils/memutils.h" +#include "executor/execdebug.h" /* for NDirectFileRead */ +#include "catalog/catalog.h" + +extern int LateWrite; +extern SPINLOCK BufMgrLock; +extern int ReadBufferCount; +extern int BufferHitCount; +extern int BufferFlushCount; + +static void WaitIO(BufferDesc *buf, SPINLOCK spinlock); +#ifndef HAS_TEST_AND_SET +static void SignalIO(BufferDesc *buf); +extern long *NWaitIOBackendP; /* defined in buf_init.c */ +#endif /* HAS_TEST_AND_SET */ + +static Buffer ReadBufferWithBufferLock(Relation relation, BlockNumber blockNum, + bool bufferLockHeld); +static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum, + bool *foundPtr, bool bufferLockHeld); +static int FlushBuffer(Buffer buffer); +static void BufferSync(void); +static int BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld); + +/* --------------------------------------------------- + * RelationGetBufferWithBuffer + * see if the given buffer is what we want + * if yes, we don't need to bother the buffer manager + * --------------------------------------------------- + */ +Buffer 
+RelationGetBufferWithBuffer(Relation relation, + BlockNumber blockNumber, + Buffer buffer) +{ + BufferDesc *bufHdr; + LRelId lrelId; + + if (BufferIsValid(buffer)) { + if (!BufferIsLocal(buffer)) { + bufHdr = &BufferDescriptors[buffer-1]; + lrelId = RelationGetLRelId(relation); + SpinAcquire(BufMgrLock); + if (bufHdr->tag.blockNum == blockNumber && + bufHdr->tag.relId.relId == lrelId.relId && + bufHdr->tag.relId.dbId == lrelId.dbId) { + SpinRelease(BufMgrLock); + return(buffer); + } + return(ReadBufferWithBufferLock(relation, blockNumber, true)); + } else { + bufHdr = &LocalBufferDescriptors[-buffer-1]; + if (bufHdr->tag.relId.relId == relation->rd_id && + bufHdr->tag.blockNum == blockNumber) { + return(buffer); + } + } + } + return(ReadBuffer(relation, blockNumber)); +} + +/* + * ReadBuffer -- returns a buffer containing the requested + * block of the requested relation. If the blknum + * requested is P_NEW, extend the relation file and + * allocate a new block. + * + * Returns: the buffer number for the buffer containing + * the block read or NULL on an error. + * + * Assume when this function is called, that reln has been + * opened already. + */ + +extern int ShowPinTrace; + + +#undef ReadBuffer /* conflicts with macro when BUFMGR_DEBUG defined */ + +/* + * ReadBuffer -- + * + */ +Buffer +ReadBuffer(Relation reln, BlockNumber blockNum) +{ + return ReadBufferWithBufferLock(reln, blockNum, false); +} + +/* + * is_userbuffer + * + * XXX caller must have already acquired BufMgrLock + */ +static bool +is_userbuffer(Buffer buffer) +{ + BufferDesc *buf = &BufferDescriptors[buffer-1]; + + if (IsSystemRelationName(buf->sb_relname)) + return false; + return true; +} + +Buffer +ReadBuffer_Debug(char *file, + int line, + Relation reln, + BlockNumber blockNum) +{ + Buffer buffer; + + buffer = ReadBufferWithBufferLock(reln, blockNum, false); + if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer)) { + BufferDesc *buf = &BufferDescriptors[buffer-1]; + + fprintf(stderr, "PIN(RD) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + buffer, buf->sb_relname, buf->tag.blockNum, + PrivateRefCount[buffer - 1], file, line); + } + return buffer; +} + +/* + * ReadBufferWithBufferLock -- does the work of + * ReadBuffer() but with the possibility that + * the buffer lock has already been held. this + * is yet another effort to reduce the number of + * semops in the system. + */ +static Buffer +ReadBufferWithBufferLock(Relation reln, + BlockNumber blockNum, + bool bufferLockHeld) +{ + BufferDesc *bufHdr; + int extend; /* extending the file by one block */ + int status; + bool found; + bool isLocalBuf; + + extend = (blockNum == P_NEW); + isLocalBuf = reln->rd_islocal; + + if (isLocalBuf) { + bufHdr = LocalBufferAlloc(reln, blockNum, &found); + } else { + ReadBufferCount++; + + /* lookup the buffer. IO_IN_PROGRESS is set if the requested + * block is not currently in memory. + */ + bufHdr = BufferAlloc(reln, blockNum, &found, bufferLockHeld); + if (found) BufferHitCount++; + } + + if (!bufHdr) { + return(InvalidBuffer); + } + + /* if its already in the buffer pool, we're done */ + if (found) { + /* + * This happens when a bogus buffer was returned previously and is + * floating around in the buffer pool. A routine calling this would + * want this extended. 
+ */ + if (extend) { + /* new buffers are zero-filled */ + memset((char *) MAKE_PTR(bufHdr->data), 0, BLCKSZ); + (void) smgrextend(bufHdr->bufsmgr, reln, + (char *) MAKE_PTR(bufHdr->data)); + } + return (BufferDescriptorGetBuffer(bufHdr)); + + } + + /* + * if we have gotten to this point, the reln pointer must be ok + * and the relation file must be open. + */ + if (extend) { + /* new buffers are zero-filled */ + (void) memset((char *) MAKE_PTR(bufHdr->data), 0, BLCKSZ); + status = smgrextend(bufHdr->bufsmgr, reln, + (char *) MAKE_PTR(bufHdr->data)); + } else { + status = smgrread(bufHdr->bufsmgr, reln, blockNum, + (char *) MAKE_PTR(bufHdr->data)); + } + + if (isLocalBuf) + return (BufferDescriptorGetBuffer(bufHdr)); + + /* lock buffer manager again to update IO IN PROGRESS */ + SpinAcquire(BufMgrLock); + + if (status == SM_FAIL) { + /* IO Failed. clean up the data structures and go home */ + + if (! BufTableDelete(bufHdr)) { + SpinRelease(BufMgrLock); + elog(FATAL,"BufRead: buffer table broken after IO error\n"); + } + /* remember that BufferAlloc() pinned the buffer */ + UnpinBuffer(bufHdr); + + /* + * Have to set the error flag so that anyone waiting for + * the buffer can tell that the contents are invalid. + */ + bufHdr->flags |= BM_IO_ERROR; + + } else { + /* IO Succeeded. clear the flags, finish buffer update */ + + bufHdr->flags &= ~(BM_IO_ERROR | BM_IO_IN_PROGRESS); + } + + /* If anyone was waiting for IO to complete, wake them up now */ +#ifdef HAS_TEST_AND_SET + S_UNLOCK(&(bufHdr->io_in_progress_lock)); +#else + if (bufHdr->refcount > 1) + SignalIO(bufHdr); +#endif + + SpinRelease(BufMgrLock); + + return(BufferDescriptorGetBuffer(bufHdr)); +} + +/* + * BufferAlloc -- Get a buffer from the buffer pool but don't + * read it. + * + * Returns: descriptor for buffer + * + * When this routine returns, the BufMgrLock is guaranteed NOT to be held. + */ +static BufferDesc * +BufferAlloc(Relation reln, + BlockNumber blockNum, + bool *foundPtr, + bool bufferLockHeld) +{ + BufferDesc *buf, *buf2; + BufferTag newTag; /* identity of requested block */ + bool inProgress; /* buffer undergoing IO */ + bool newblock = FALSE; + + /* create a new tag so we can look up the buffer */ + /* assume that the relation is already open */ + if (blockNum == P_NEW) { + newblock = TRUE; + blockNum = smgrnblocks(reln->rd_rel->relsmgr, reln); + } + + INIT_BUFFERTAG(&newTag,reln,blockNum); + + if (!bufferLockHeld) + SpinAcquire(BufMgrLock); + + /* see if the block is in the buffer pool already */ + buf = BufTableLookup(&newTag); + if (buf != NULL) { + /* Found it. Now, (a) pin the buffer so no + * one steals it from the buffer pool, + * (b) check IO_IN_PROGRESS, someone may be + * faulting the buffer into the buffer pool. + */ + + PinBuffer(buf); + inProgress = (buf->flags & BM_IO_IN_PROGRESS); + + *foundPtr = TRUE; + if (inProgress) { + WaitIO(buf, BufMgrLock); + if (buf->flags & BM_IO_ERROR) { + /* weird race condition: + * + * We were waiting for someone else to read the buffer. + * While we were waiting, the reader failed in some + * way, so the contents of the buffer are still + * invalid. By saying that we didn't find it, we can + * make the caller reinitialize the buffer. If two + * processes are waiting for this block, both will + * read the block. The second one to finish may overwrite + * any updates made by the first. (Assume higher level + * synchronization prevents this from happening). + * + * This is never going to happen, don't worry about it.
+ */ + *foundPtr = FALSE; + } + } +#ifdef BMTRACE + _bm_trace((reln->rd_rel->relisshared ? 0 : MyDatabaseId), reln->rd_id, blockNum, BufferDescriptorGetBuffer(buf), BMT_ALLOCFND); +#endif /* BMTRACE */ + + SpinRelease(BufMgrLock); + + return(buf); + } + + *foundPtr = FALSE; + + /* + * Didn't find it in the buffer pool. We'll have + * to initialize a new buffer. First, grab one from + * the free list. If it's dirty, flush it to disk. + * Remember to unlock BufMgr spinlock while doing the IOs. + */ + inProgress = FALSE; + for (buf = (BufferDesc *) NULL; buf == (BufferDesc *) NULL; ) { + + /* GetFreeBuffer will abort if it can't find a free buffer */ + buf = GetFreeBuffer(); + + /* + * There should be exactly one pin on the buffer after + * it is allocated -- ours. If it had a pin it wouldn't + * have been on the free list. No one else could have + * pinned it between GetFreeBuffer and here because we + * have the BufMgrLock. + */ + Assert(buf->refcount == 0); + buf->refcount = 1; + PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 1; + + if (buf->flags & BM_DIRTY) { + /* + * Set BM_IO_IN_PROGRESS to keep anyone from doing anything + * with the contents of the buffer while we write it out. + * We don't really care if they try to read it, but if they + * can complete a BufferAlloc on it they can then scribble + * into it, and we'd really like to avoid that while we are + * flushing the buffer. Setting this flag should block them + * in WaitIO until we're done. + */ + inProgress = TRUE; + buf->flags |= BM_IO_IN_PROGRESS; +#ifdef HAS_TEST_AND_SET + /* + * All code paths that acquire this lock pin the buffer + * first; since no one had it pinned (it just came off the + * free list), no one else can have this lock. + */ + Assert(S_LOCK_FREE(&(buf->io_in_progress_lock))); + S_LOCK(&(buf->io_in_progress_lock)); +#endif /* HAS_TEST_AND_SET */ + + /* + * Write the buffer out, being careful to release BufMgrLock + * before starting the I/O. + * + * This #ifndef is here because a few extra semops REALLY kill + * you on machines that don't have spinlocks. If you don't + * operate with much concurrency, well... + */ + (void) BufferReplace(buf, true); + BufferFlushCount++; +#ifndef OPTIMIZE_SINGLE + SpinAcquire(BufMgrLock); +#endif /* OPTIMIZE_SINGLE */ + + /* + * Somebody could have pinned the buffer while we were + * doing the I/O and had given up the BufMgrLock (though + * they would be waiting for us to clear the BM_IO_IN_PROGRESS + * flag). That's why this is a loop -- if so, we need to clear + * the I/O flags, remove our pin and start all over again. + * + * People may be making buffers free at any time, so there's + * no reason to think that we have an immediate disaster on + * our hands. + */ + if (buf->refcount > 1) { + inProgress = FALSE; + buf->flags &= ~BM_IO_IN_PROGRESS; +#ifdef HAS_TEST_AND_SET + S_UNLOCK(&(buf->io_in_progress_lock)); +#else /* !HAS_TEST_AND_SET */ + if (buf->refcount > 1) + SignalIO(buf); +#endif /* !HAS_TEST_AND_SET */ + PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0; + buf->refcount--; + buf = (BufferDesc *) NULL; + } + + /* + * Somebody could have allocated another buffer for the + * same block we are about to read in. (While we flush out + * the dirty buffer, we don't hold the lock and someone could + * have allocated another buffer for the same block. The problem + * is we haven't gotten around to insert the new tag into + * the buffer table. So we need to check here. -ay 3/95 + */ + buf2 = BufTableLookup(&newTag); + if (buf2 != NULL) { + /* Found it. 
Someone has already done what we're about + * to do. We'll just handle this as if it were found in + * the buffer pool in the first place. + */ + + PinBuffer(buf2); + inProgress = (buf2->flags & BM_IO_IN_PROGRESS); + + *foundPtr = TRUE; + if (inProgress) { + WaitIO(buf2, BufMgrLock); + if (buf2->flags & BM_IO_ERROR) { + *foundPtr = FALSE; + } + } + +#ifdef HAS_TEST_AND_SET + S_UNLOCK(&(buf->io_in_progress_lock)); +#else /* !HAS_TEST_AND_SET */ + if (buf->refcount > 1) + SignalIO(buf); +#endif /* !HAS_TEST_AND_SET */ + + /* give up the buffer since we don't need it any more */ + buf->refcount--; + PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0; + AddBufferToFreelist(buf); + buf->flags |= BM_FREE; + buf->flags &= ~BM_DIRTY; + buf->flags &= ~BM_IO_IN_PROGRESS; + + SpinRelease(BufMgrLock); + + return(buf2); + } + } + } + /* + * At this point we should have the sole pin on a non-dirty + * buffer and we may or may not already have the BM_IO_IN_PROGRESS + * flag set. + */ + + /* + * Change the name of the buffer in the lookup table: + * + * Need to update the lookup table before the read starts. + * If someone comes along looking for the buffer while + * we are reading it in, we don't want them to allocate + * a new buffer. For the same reason, we didn't want + * to erase the buf table entry for the buffer we were + * writing back until now, either. + */ + + if (! BufTableDelete(buf)) { + SpinRelease(BufMgrLock); + elog(FATAL,"buffer wasn't in the buffer table\n"); + + } + + if (buf->flags & BM_DIRTY) { + /* must clear flag first because of weird race + * condition described below. + */ + buf->flags &= ~BM_DIRTY; + } + + /* record the database name and relation name for this buffer */ + buf->sb_relname = pstrdup(reln->rd_rel->relname.data); + buf->sb_dbname = pstrdup(GetDatabaseName()); + + /* remember which storage manager is responsible for it */ + buf->bufsmgr = reln->rd_rel->relsmgr; + + INIT_BUFFERTAG(&(buf->tag),reln,blockNum); + if (! BufTableInsert(buf)) { + SpinRelease(BufMgrLock); + elog(FATAL,"Buffer in lookup table twice \n"); + } + + /* Buffer contents are currently invalid. Have + * to mark IO IN PROGRESS so no one fiddles with + * them until the read completes. If this routine + * has been called simply to allocate a buffer, no + * IO will be attempted, so the flag isn't set. + */ + if (!inProgress) { + buf->flags |= BM_IO_IN_PROGRESS; +#ifdef HAS_TEST_AND_SET + Assert(S_LOCK_FREE(&(buf->io_in_progress_lock))); + S_LOCK(&(buf->io_in_progress_lock)); +#endif /* HAS_TEST_AND_SET */ + } + +#ifdef BMTRACE + _bm_trace((reln->rd_rel->relisshared ? 0 : MyDatabaseId), reln->rd_id, blockNum, BufferDescriptorGetBuffer(buf), BMT_ALLOCNOTFND); +#endif /* BMTRACE */ + + SpinRelease(BufMgrLock); + + return (buf); +} + +/* + * WriteBuffer-- + * + * Pushes buffer contents to disk if LateWrite is + * not set. Otherwise, marks contents as dirty. + * + * Assume that buffer is pinned. Assume that reln is + * valid. + * + * Side Effects: + * Pin count is decremented. + */ + +#undef WriteBuffer + +int +WriteBuffer(Buffer buffer) +{ + BufferDesc *bufHdr; + + if (!
LateWrite) { + return(FlushBuffer(buffer)); + } else { + + if (BufferIsLocal(buffer)) + return WriteLocalBuffer(buffer, TRUE); + + if (BAD_BUFFER_ID(buffer)) + return(FALSE); + + bufHdr = &BufferDescriptors[buffer-1]; + + SpinAcquire(BufMgrLock); + Assert(bufHdr->refcount > 0); + bufHdr->flags |= BM_DIRTY; + UnpinBuffer(bufHdr); + SpinRelease(BufMgrLock); + } + return(TRUE); +} + +void +WriteBuffer_Debug(char *file, int line, Buffer buffer) +{ + WriteBuffer(buffer); + if (ShowPinTrace && BufferIsLocal(buffer) && is_userbuffer(buffer)) { + BufferDesc *buf; + buf = &BufferDescriptors[buffer-1]; + fprintf(stderr, "UNPIN(WR) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + buffer, buf->sb_relname, buf->tag.blockNum, + PrivateRefCount[buffer - 1], file, line); + } +} + +/* + * DirtyBufferCopy() -- Copy a given dirty buffer to the requested + * destination. + * + * We treat this as a write. If the requested buffer is in the pool + * and is dirty, we copy it to the location requested and mark it + * clean. This routine supports the Sony jukebox storage manager, + * which agrees to take responsibility for the data once we mark + * it clean. + * + * NOTE: used by sony jukebox code in postgres 4.2 - ay 2/95 + */ +void +DirtyBufferCopy(Oid dbid, Oid relid, BlockNumber blkno, char *dest) +{ + BufferDesc *buf; + BufferTag btag; + + btag.relId.relId = relid; + btag.relId.dbId = dbid; + btag.blockNum = blkno; + + SpinAcquire(BufMgrLock); + buf = BufTableLookup(&btag); + + if (buf == (BufferDesc *) NULL + || !(buf->flags & BM_DIRTY) + || !(buf->flags & BM_VALID)) { + SpinRelease(BufMgrLock); + return; + } + + /* hate to do this holding the lock, but release and reacquire is slower */ + memmove(dest, (char *) MAKE_PTR(buf->data), BLCKSZ); + + buf->flags &= ~BM_DIRTY; + + SpinRelease(BufMgrLock); +} + +/* + * FlushBuffer -- like WriteBuffer, but force the page to disk. + * + * 'buffer' is known to be dirty/pinned, so there should not be a + * problem reading the BufferDesc members without the BufMgrLock + * (nobody should be able to change tags, flags, etc. out from under + * us). + */ +static int +FlushBuffer(Buffer buffer) +{ + BufferDesc *bufHdr; + + if (BufferIsLocal(buffer)) + return FlushLocalBuffer(buffer); + + if (BAD_BUFFER_ID(buffer)) + return (STATUS_ERROR); + + bufHdr = &BufferDescriptors[buffer-1]; + + if (!BufferReplace(bufHdr, false)) { + elog(WARN, "FlushBuffer: cannot flush %d", bufHdr->tag.blockNum); + return (STATUS_ERROR); + } + + SpinAcquire(BufMgrLock); + bufHdr->flags &= ~BM_DIRTY; + UnpinBuffer(bufHdr); + SpinRelease(BufMgrLock); + + return(STATUS_OK); +} + +/* + * WriteNoReleaseBuffer -- like WriteBuffer, but do not unpin the buffer + * when the operation is complete. + * + * We know that the buffer is for a relation in our private cache, + * because this routine is called only to write out buffers that + * were changed by the executing backend. + */ +int +WriteNoReleaseBuffer(Buffer buffer) +{ + BufferDesc *bufHdr; + + if (! LateWrite) { + return(FlushBuffer(buffer)); + } else { + + if (BufferIsLocal(buffer)) + return WriteLocalBuffer(buffer, FALSE); + + if (BAD_BUFFER_ID(buffer)) + return (STATUS_ERROR); + + bufHdr = &BufferDescriptors[buffer-1]; + + SpinAcquire(BufMgrLock); + bufHdr->flags |= BM_DIRTY; + SpinRelease(BufMgrLock); + } + return(STATUS_OK); +} + + +#undef ReleaseAndReadBuffer +/* + * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer() + * so that only one semop needs to be called. 
+ * + */ +Buffer +ReleaseAndReadBuffer(Buffer buffer, + Relation relation, + BlockNumber blockNum) +{ + BufferDesc *bufHdr; + Buffer retbuf; + + if (BufferIsLocal(buffer)) { + Assert(LocalRefCount[-buffer - 1] > 0); + LocalRefCount[-buffer - 1]--; + } else { + if (BufferIsValid(buffer)) { + bufHdr = &BufferDescriptors[buffer-1]; + Assert(PrivateRefCount[buffer - 1] > 0); + PrivateRefCount[buffer - 1]--; + if (PrivateRefCount[buffer - 1] == 0 && + LastRefCount[buffer - 1] == 0) { + /* only release buffer if it is not pinned in previous ExecMain + level */ + SpinAcquire(BufMgrLock); + bufHdr->refcount--; + if (bufHdr->refcount == 0) { + AddBufferToFreelist(bufHdr); + bufHdr->flags |= BM_FREE; + } + retbuf = ReadBufferWithBufferLock(relation, blockNum, true); + return retbuf; + } + } + } + + return (ReadBuffer(relation, blockNum)); +} + +/* + * BufferSync -- Flush all dirty buffers in the pool. + * + * This is called at transaction commit time. It does the wrong thing, + * right now. We should flush only our own changes to stable storage, + * and we should obey the lock protocol on the buffer manager metadata + * as we do it. Also, we need to be sure that no other transaction is + * modifying the page as we flush it. This is only a problem for objects + * that use a non-two-phase locking protocol, like btree indices. For + * those objects, we would like to set a write lock for the duration of + * our IO. Another possibility is to code updates to btree pages + * carefully, so that writing them out out of order cannot cause + * any unrecoverable errors. + * + * I don't want to think hard about this right now, so I will try + * to come back to it later. + */ +static void +BufferSync() +{ + int i; + Oid bufdb; + Oid bufrel; + Relation reln; + BufferDesc *bufHdr; + int status; + + SpinAcquire(BufMgrLock); + for (i=0, bufHdr = BufferDescriptors; i < NBuffers; i++, bufHdr++) { + if ((bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY)) { + bufdb = bufHdr->tag.relId.dbId; + bufrel = bufHdr->tag.relId.relId; + if (bufdb == MyDatabaseId || bufdb == (Oid) 0) { + reln = RelationIdCacheGetRelation(bufrel); + + /* + * If we didn't have the reldesc in our local cache, flush this + * page out using the 'blind write' storage manager routine. If + * we did find it, use the standard interface. + */ + +#ifndef OPTIMIZE_SINGLE + SpinRelease(BufMgrLock); +#endif /* OPTIMIZE_SINGLE */ + if (reln == (Relation) NULL) { + status = smgrblindwrt(bufHdr->bufsmgr, bufHdr->sb_dbname, + bufHdr->sb_relname, bufdb, bufrel, + bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + } else { + status = smgrwrite(bufHdr->bufsmgr, reln, + bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + } +#ifndef OPTIMIZE_SINGLE + SpinAcquire(BufMgrLock); +#endif /* OPTIMIZE_SINGLE */ + + if (status == SM_FAIL) { + elog(WARN, "cannot write %d for %16s", + bufHdr->tag.blockNum, bufHdr->sb_relname); + } + + bufHdr->flags &= ~BM_DIRTY; + if (reln != (Relation)NULL) + RelationDecrementReferenceCount(reln); + } + } + } + SpinRelease(BufMgrLock); + + LocalBufferSync(); +} + + +/* + * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' + * is cleared. Because IO_IN_PROGRESS conflicts are + * expected to be rare, there is only one BufferIO + * lock in the entire system. All processes block + * on this semaphore when they try to use a buffer + * that someone else is faulting in. Whenever a + * process finishes an IO and someone is waiting for + * the buffer, BufferIO is signaled (SignalIO). 
All + * waiting processes then wake up and check to see + * if their buffer is now ready. This implementation + * is simple, but efficient enough if WaitIO is + * rarely called by multiple processes simultaneously. + * + * ProcSleep atomically releases the spinlock and goes to + * sleep. + * + * Note: there is an easy fix if the queue becomes long. + * save the id of the buffer we are waiting for in + * the queue structure. That way signal can figure + * out which proc to wake up. + */ +#ifdef HAS_TEST_AND_SET +static void +WaitIO(BufferDesc *buf, SPINLOCK spinlock) +{ + SpinRelease(spinlock); + S_LOCK(&(buf->io_in_progress_lock)); + S_UNLOCK(&(buf->io_in_progress_lock)); + SpinAcquire(spinlock); +} + +#else /* HAS_TEST_AND_SET */ +IpcSemaphoreId WaitIOSemId; + +static void +WaitIO(BufferDesc *buf, SPINLOCK spinlock) +{ + bool inProgress; + + for (;;) { + + /* wait until someone releases IO lock */ + (*NWaitIOBackendP)++; + SpinRelease(spinlock); + IpcSemaphoreLock(WaitIOSemId, 0, 1); + SpinAcquire(spinlock); + inProgress = (buf->flags & BM_IO_IN_PROGRESS); + if (!inProgress) break; + } +} + +/* + * SignalIO -- + */ +static void +SignalIO(BufferDesc *buf) +{ + /* somebody better be waiting. */ + Assert( buf->refcount > 1); + IpcSemaphoreUnlock(WaitIOSemId, 0, *NWaitIOBackendP); + *NWaitIOBackendP = 0; +} +#endif /* HAS_TEST_AND_SET */ + +long NDirectFileRead; /* some I/O's are direct file access. bypass bufmgr */ +long NDirectFileWrite; /* e.g., I/O in psort and hashjoin. */ + +void +PrintBufferUsage(FILE *statfp) +{ + float hitrate; + + if (ReadBufferCount==0) + hitrate = 0.0; + else + hitrate = (float)BufferHitCount * 100.0/ReadBufferCount; + + fprintf(statfp, "!\t%ld blocks read, %ld blocks written, buffer hit rate = %.2f%%\n", + ReadBufferCount - BufferHitCount + NDirectFileRead, + BufferFlushCount + NDirectFileWrite, + hitrate); +} + +void +ResetBufferUsage() +{ + BufferHitCount = 0; + ReadBufferCount = 0; + BufferFlushCount = 0; + NDirectFileRead = 0; + NDirectFileWrite = 0; +} + +/* ---------------------------------------------- + * ResetBufferPool + * + * this routine is supposed to be called when a transaction aborts. + * it will release all the buffer pins held by the transaction. + * + * ---------------------------------------------- + */ +void +ResetBufferPool() +{ + register int i; + for (i=1; i<=NBuffers; i++) { + if (BufferIsValid(i)) { + while(PrivateRefCount[i - 1] > 0) { + ReleaseBuffer(i); + } + } + LastRefCount[i - 1] = 0; + } + + ResetLocalBufferPool(); +} + +/* ----------------------------------------------- + * BufferPoolCheckLeak + * + * check if there is a buffer leak + * + * ----------------------------------------------- + */ +int +BufferPoolCheckLeak() +{ + register int i; + void PrintBufferDescs(); + + for (i = 1; i <= NBuffers; i++) { + if (BufferIsValid(i)) { + elog(NOTICE, "buffer leak detected in BufferPoolCheckLeak()"); + PrintBufferDescs(); + return(1); + } + } + return(0); +} + +/* ------------------------------------------------ + * FlushBufferPool + * + * flush all dirty blocks in buffer pool to disk + * + * ------------------------------------------------ + */ +void +FlushBufferPool(int StableMainMemoryFlag) +{ + if (!StableMainMemoryFlag) { + BufferSync(); + smgrcommit(); + } +} + +/* + * BufferIsValid -- + * True iff the refcnt of the local buffer is > 0 + * Note: + * BufferIsValid(InvalidBuffer) is False. + * BufferIsValid(UnknownBuffer) is False.
+ */ +bool +BufferIsValid(Buffer bufnum) +{ + if (BufferIsLocal(bufnum)) + return (bufnum >= -NLocBuffer && LocalRefCount[-bufnum - 1] > 0); + + if (BAD_BUFFER_ID(bufnum)) + return(false); + + return ((bool)(PrivateRefCount[bufnum - 1] > 0)); +} + +/* + * BufferGetBlockNumber -- + * Returns the block number associated with a buffer. + * + * Note: + * Assumes that the buffer is valid. + */ +BlockNumber +BufferGetBlockNumber(Buffer buffer) +{ + Assert(BufferIsValid(buffer)); + + /* XXX should be a critical section */ + if (BufferIsLocal(buffer)) + return (LocalBufferDescriptors[-buffer-1].tag.blockNum); + else + return (BufferDescriptors[buffer-1].tag.blockNum); +} + +/* + * BufferGetRelation -- + * Returns the relation descriptor associated with a buffer. + * + * Note: + * Assumes buffer is valid. + */ +Relation +BufferGetRelation(Buffer buffer) +{ + Relation relation; + Oid relid; + + Assert(BufferIsValid(buffer)); + Assert(!BufferIsLocal(buffer)); /* not supported for local buffers */ + + /* XXX should be a critical section */ + relid = LRelIdGetRelationId(BufferDescriptors[buffer-1].tag.relId); + relation = RelationIdGetRelation(relid); + + RelationDecrementReferenceCount(relation); + + if (RelationHasReferenceCountZero(relation)) { + /* + elog(NOTICE, "BufferGetRelation: 0->1"); + */ + + RelationIncrementReferenceCount(relation); + } + + return (relation); +} + +/* + * BufferReplace + * + * Flush the buffer corresponding to 'bufHdr' + * + * Acquires the BufMgrLock unless 'bufferLockHeld' says the caller + * already holds it; either way the lock is released before the I/O + * is issued. + */ +static int +BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld) +{ + Relation reln; + Oid bufdb, bufrel; + int status; + + if (!bufferLockHeld) + SpinAcquire(BufMgrLock); + + /* + * first try to find the reldesc in the cache, if no luck, + * don't bother to build the reldesc from scratch, just do + * a blind write. + */ + + bufdb = bufHdr->tag.relId.dbId; + bufrel = bufHdr->tag.relId.relId; + + if (bufdb == MyDatabaseId || bufdb == (Oid) NULL) + reln = RelationIdCacheGetRelation(bufrel); + else + reln = (Relation) NULL; + + SpinRelease(BufMgrLock); + + if (reln != (Relation) NULL) { + status = smgrflush(bufHdr->bufsmgr, reln, bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + } else { + + /* blind write always flushes */ + status = smgrblindwrt(bufHdr->bufsmgr, bufHdr->sb_dbname, + bufHdr->sb_relname, bufdb, bufrel, + bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + } + + if (status == SM_FAIL) + return (FALSE); + + return (TRUE); +} + +/* + * RelationGetNumberOfBlocks -- + * Returns the number of blocks in the relation. + * + * Note: + * XXX may fail for huge relations. + * XXX should be elsewhere. + * XXX maybe should be hidden + */ +BlockNumber +RelationGetNumberOfBlocks(Relation relation) +{ + return + ((relation->rd_islocal) ? relation->rd_nblocks : + smgrnblocks(relation->rd_rel->relsmgr, relation)); +} + +/* + * BufferGetBlock -- + * Returns a reference to a disk page image associated with a buffer. + * + * Note: + * Assumes buffer is valid.
+ */ +Block +BufferGetBlock(Buffer buffer) +{ + Assert(BufferIsValid(buffer)); + + if (BufferIsLocal(buffer)) + return((Block)MAKE_PTR(LocalBufferDescriptors[-buffer-1].data)); + else + return((Block)MAKE_PTR(BufferDescriptors[buffer-1].data)); +} + +/* --------------------------------------------------------------------- + * ReleaseTmpRelBuffers + * + * this function unmarks all the dirty pages of a temporary + * relation in the buffer pool so that at the end of transaction + * these pages will not be flushed. + * XXX currently it sequentially searches the buffer pool, should be + * changed to more clever ways of searching. + * -------------------------------------------------------------------- + */ +void +ReleaseTmpRelBuffers(Relation tempreldesc) +{ + register int i; + int holding = 0; + BufferDesc *buf; + + for (i=1; i<=NBuffers; i++) { + buf = &BufferDescriptors[i-1]; + if (!holding) { + SpinAcquire(BufMgrLock); + holding = 1; + } + if ((buf->flags & BM_DIRTY) && + (buf->tag.relId.dbId == MyDatabaseId) && + (buf->tag.relId.relId == tempreldesc->rd_id)) { + buf->flags &= ~BM_DIRTY; + if (!(buf->flags & BM_FREE)) { + SpinRelease(BufMgrLock); + holding = 0; + ReleaseBuffer(i); + } + } + } + if (holding) + SpinRelease(BufMgrLock); +} + +/* --------------------------------------------------------------------- + * DropBuffers + * + * This function marks all the buffers in the buffer cache for a + * particular database as clean. This is used when we destroy a + * database, to avoid trying to flush data to disk when the directory + * tree no longer exists. + * + * This is an exceedingly non-public interface. + * -------------------------------------------------------------------- + */ +void +DropBuffers(Oid dbid) +{ + register int i; + BufferDesc *buf; + + SpinAcquire(BufMgrLock); + for (i=1; i<=NBuffers; i++) { + buf = &BufferDescriptors[i-1]; + if ((buf->tag.relId.dbId == dbid) && (buf->flags & BM_DIRTY)) { + buf->flags &= ~BM_DIRTY; + } + } + SpinRelease(BufMgrLock); +} + +/* ----------------------------------------------------------------- + * PrintBufferDescs + * + * this function prints all the buffer descriptors, for debugging + * use only. 
+ * ----------------------------------------------------------------- + */ +void +PrintBufferDescs() +{ + int i; + BufferDesc *buf = BufferDescriptors; + + if (IsUnderPostmaster) { + SpinAcquire(BufMgrLock); + for (i = 0; i < NBuffers; ++i, ++buf) { + elog(NOTICE, "[%02d] (freeNext=%d, freePrev=%d, relname=%.*s, \ +blockNum=%d, flags=0x%x, refcount=%d %d)", + i, buf->freeNext, buf->freePrev, NAMEDATALEN, + &(buf->sb_relname), buf->tag.blockNum, buf->flags, + buf->refcount, PrivateRefCount[i]); + } + SpinRelease(BufMgrLock); + } else { + /* interactive backend */ + for (i = 0; i < NBuffers; ++i, ++buf) { + printf("[%-2d] (%s, %d) flags=0x%x, refcnt=%d %ld)\n", + i, buf->sb_relname, buf->tag.blockNum, + buf->flags, buf->refcount, PrivateRefCount[i]); + } + } +} + +void +PrintPinnedBufs() +{ + int i; + BufferDesc *buf = BufferDescriptors; + + SpinAcquire(BufMgrLock); + for (i = 0; i < NBuffers; ++i, ++buf) { + if (PrivateRefCount[i] > 0) + elog(NOTICE, "[%02d] (freeNext=%d, freePrev=%d, relname=%.*s, \ +blockNum=%d, flags=0x%x, refcount=%d %d)\n", + i, buf->freeNext, buf->freePrev, NAMEDATALEN, &(buf->sb_relname), + buf->tag.blockNum, buf->flags, + buf->refcount, PrivateRefCount[i]); + } + SpinRelease(BufMgrLock); +} + +/* + * BufferPoolBlowaway + * + * this routine is solely for the purpose of experiments -- sometimes + * you may want to blowaway whatever is left from the past in buffer + * pool and start measuring some performance with a clean empty buffer + * pool. + */ +void +BufferPoolBlowaway() +{ + register int i; + + BufferSync(); + for (i=1; i<=NBuffers; i++) { + if (BufferIsValid(i)) { + while(BufferIsValid(i)) + ReleaseBuffer(i); + } + BufTableDelete(&BufferDescriptors[i-1]); + } +} + +#undef IncrBufferRefCount +#undef ReleaseBuffer + +void +IncrBufferRefCount(Buffer buffer) +{ + if (BufferIsLocal(buffer)) { + Assert(LocalRefCount[-buffer - 1] >= 0); + LocalRefCount[-buffer - 1]++; + } else { + Assert(!BAD_BUFFER_ID(buffer)); + Assert(PrivateRefCount[buffer - 1] >= 0); + PrivateRefCount[buffer - 1]++; + } +} + +/* + * ReleaseBuffer -- remove the pin on a buffer without + * marking it dirty. 
+ * + */ +int +ReleaseBuffer(Buffer buffer) +{ + BufferDesc *bufHdr; + + if (BufferIsLocal(buffer)) { + Assert(LocalRefCount[-buffer - 1] > 0); + LocalRefCount[-buffer - 1]--; + return (STATUS_OK); + } + + if (BAD_BUFFER_ID(buffer)) + return(STATUS_ERROR); + + bufHdr = &BufferDescriptors[buffer-1]; + + Assert(PrivateRefCount[buffer - 1] > 0); + PrivateRefCount[buffer - 1]--; + if (PrivateRefCount[buffer - 1] == 0 && LastRefCount[buffer - 1] == 0) { + /* only release buffer if it is not pinned in previous ExecMain + levels */ + SpinAcquire(BufMgrLock); + bufHdr->refcount--; + if (bufHdr->refcount == 0) { + AddBufferToFreelist(bufHdr); + bufHdr->flags |= BM_FREE; + } + SpinRelease(BufMgrLock); + } + + return(STATUS_OK); +} + +void +IncrBufferRefCount_Debug(char *file, int line, Buffer buffer) +{ + IncrBufferRefCount(buffer); + if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer)) { + BufferDesc *buf = &BufferDescriptors[buffer-1]; + + fprintf(stderr, "PIN(Incr) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + buffer, buf->sb_relname, buf->tag.blockNum, + PrivateRefCount[buffer - 1], file, line); + } +} + +void +ReleaseBuffer_Debug(char *file, int line, Buffer buffer) +{ + ReleaseBuffer(buffer); + if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer)) { + BufferDesc *buf = &BufferDescriptors[buffer-1]; + + fprintf(stderr, "UNPIN(Rel) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + buffer, buf->sb_relname, buf->tag.blockNum, + PrivateRefCount[buffer - 1], file, line); + } +} + +int +ReleaseAndReadBuffer_Debug(char *file, + int line, + Buffer buffer, + Relation relation, + BlockNumber blockNum) +{ + bool bufferValid; + Buffer b; + + bufferValid = BufferIsValid(buffer); + b = ReleaseAndReadBuffer(buffer, relation, blockNum); + if (ShowPinTrace && bufferValid && BufferIsLocal(buffer) + && is_userbuffer(buffer)) { + BufferDesc *buf = &BufferDescriptors[buffer-1]; + + fprintf(stderr, "UNPIN(Rel&Rd) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + buffer, buf->sb_relname, buf->tag.blockNum, + PrivateRefCount[buffer - 1], file, line); + } + if (ShowPinTrace && BufferIsLocal(buffer) && is_userbuffer(buffer)) { + BufferDesc *buf = &BufferDescriptors[b-1]; + + fprintf(stderr, "PIN(Rel&Rd) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + b, buf->sb_relname, buf->tag.blockNum, + PrivateRefCount[b - 1], file, line); + } + return b; +} + +#ifdef BMTRACE + +/* + * trace allocations and deallocations in a circular buffer in + * shared memory. check the buffer before doing the allocation, + * and die if there's anything fishy. 
+ */ + +_bm_trace(Oid dbId, Oid relId, int blkNo, int bufNo, int allocType) +{ + static int mypid = 0; + long start, cur; + bmtrace *tb; + + if (mypid == 0) + mypid = getpid(); + + start = *CurTraceBuf; + + if (start > 0) + cur = start - 1; + else + cur = BMT_LIMIT - 1; + + for (;;) { + tb = &TraceBuf[cur]; + if (tb->bmt_op != BMT_NOTUSED) { + if (tb->bmt_buf == bufNo) { + if ((tb->bmt_op == BMT_DEALLOC) + || (tb->bmt_dbid == dbId && tb->bmt_relid == relId + && tb->bmt_blkno == blkNo)) + goto okay; + + /* die holding the buffer lock */ + _bm_die(dbId, relId, blkNo, bufNo, allocType, start, cur); + } + } + + if (cur == start) + goto okay; + + if (cur == 0) + cur = BMT_LIMIT - 1; + else + cur--; + } + + okay: + tb = &TraceBuf[start]; + tb->bmt_pid = mypid; + tb->bmt_buf = bufNo; + tb->bmt_dbid = dbId; + tb->bmt_relid = relId; + tb->bmt_blkno = blkNo; + tb->bmt_op = allocType; + + *CurTraceBuf = (start + 1) % BMT_LIMIT; +} + +_bm_die(Oid dbId, Oid relId, int blkNo, int bufNo, + int allocType, long start, long cur) +{ + FILE *fp; + bmtrace *tb; + int i; + + tb = &TraceBuf[cur]; + + if ((fp = fopen("/tmp/death_notice", "w")) == (FILE *) NULL) + elog(FATAL, "buffer alloc trace error and can't open log file"); + + fprintf(fp, "buffer alloc trace detected the following error:\n\n"); + fprintf(fp, " buffer %d being %s inconsistently with a previous %s\n\n", + bufNo, (allocType == BMT_DEALLOC ? "deallocated" : "allocated"), + (tb->bmt_op == BMT_DEALLOC ? "deallocation" : "allocation")); + + fprintf(fp, "the trace buffer contains:\n"); + + i = start; + for (;;) { + tb = &TraceBuf[i]; + if (tb->bmt_op != BMT_NOTUSED) { + fprintf(fp, " [%3d]%spid %d buf %2d for <%d,%d,%d> ", + i, (i == cur ? " ---> " : "\t"), + tb->bmt_pid, tb->bmt_buf, + tb->bmt_dbid, tb->bmt_relid, tb->bmt_blkno); + + switch (tb->bmt_op) { + case BMT_ALLOCFND: + fprintf(fp, "allocate (found)\n"); + break; + + case BMT_ALLOCNOTFND: + fprintf(fp, "allocate (not found)\n"); + break; + + case BMT_DEALLOC: + fprintf(fp, "deallocate\n"); + break; + + default: + fprintf(fp, "unknown op type %d\n", tb->bmt_op); + break; + } + } + + i = (i + 1) % BMT_LIMIT; + if (i == start) + break; + } + + fprintf(fp, "\noperation causing error:\n"); + fprintf(fp, "\tpid %d buf %d for <%d,%d,%d> ", + getpid(), bufNo, dbId, relId, blkNo); + + switch (allocType) { + case BMT_ALLOCFND: + fprintf(fp, "allocate (found)\n"); + break; + + case BMT_ALLOCNOTFND: + fprintf(fp, "allocate (not found)\n"); + break; + + case BMT_DEALLOC: + fprintf(fp, "deallocate\n"); + break; + + default: + fprintf(fp, "unknown op type %d\n", allocType); + break; + } + + (void) fclose(fp); + + kill(getpid(), SIGILL); +} + +#endif /* BMTRACE */ + +void +BufferRefCountReset(int *refcountsave) +{ + int i; + for (i=0; i<NBuffers; i++) { + refcountsave[i] = PrivateRefCount[i]; + LastRefCount[i] += PrivateRefCount[i]; + PrivateRefCount[i] = 0; + } +} + +void +BufferRefCountRestore(int *refcountsave) +{ + int i; + for (i=0; i<NBuffers; i++) { + PrivateRefCount[i] = refcountsave[i]; + LastRefCount[i] -= refcountsave[i]; + refcountsave[i] = 0; + } +} +
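For orientation, the header comment at the top of the file describes the caller-facing routines but never shows them used together. The sketch below is a hypothetical caller, not part of bufmgr.c: touch_first_byte() and its one-byte update are invented for illustration, and it assumes the declarations from storage/bufmgr.h that this file implements.

/*
 * Hypothetical caller sketch. ReadBuffer() pins the buffer;
 * WriteBuffer() marks it dirty and unpins it. With LateWrite set,
 * the disk write itself is deferred until buffer replacement.
 */
static int
touch_first_byte(Relation reln, BlockNumber blkno)
{
	Buffer buf;
	Block page;

	buf = ReadBuffer(reln, blkno);	/* pin; InvalidBuffer on failure */
	if (!BufferIsValid(buf))
		return STATUS_ERROR;

	page = BufferGetBlock(buf);	/* raw BLCKSZ-byte page image */
	((char *) page)[0] = 0;		/* stand-in for a real page update */

	return WriteBuffer(buf);	/* mark dirty and unpin */
}

Extending a relation has the same shape: passing P_NEW as the block number makes ReadBuffer() zero-fill a fresh block and extend the file through smgrextend() before returning it.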
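The Buffer numbering convention that recurs throughout the file (in BufferIsValid(), BufferGetBlockNumber(), BufferGetBlock(), and the debug routines) can be captured in one place. The helper below is hypothetical, not part of the original source; it simply restates the indexing rule those routines apply inline.

/*
 * Hypothetical helper restating the numbering rule: shared buffers
 * run 1..NBuffers and index BufferDescriptors[buffer - 1]; local
 * (per-backend) buffers are negative and index
 * LocalBufferDescriptors[-buffer - 1]; InvalidBuffer is zero.
 */
static BufferDesc *
buffer_to_desc(Buffer buffer)
{
	if (BufferIsLocal(buffer))
		return (&LocalBufferDescriptors[-buffer - 1]);
	Assert(!BAD_BUFFER_ID(buffer));
	return (&BufferDescriptors[buffer - 1]);
}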