diff options
Diffstat (limited to 'src/backend/storage/buffer/xlog_bufmgr.c')
-rw-r--r-- | src/backend/storage/buffer/xlog_bufmgr.c | 2202 |
1 files changed, 0 insertions, 2202 deletions
diff --git a/src/backend/storage/buffer/xlog_bufmgr.c b/src/backend/storage/buffer/xlog_bufmgr.c deleted file mode 100644 index fb02413f970..00000000000 --- a/src/backend/storage/buffer/xlog_bufmgr.c +++ /dev/null @@ -1,2202 +0,0 @@ -/*------------------------------------------------------------------------- - * - * xlog_bufmgr.c - * buffer manager interface routines - * - * Portions Copyright (c) 1996-2000, PostgreSQL, Inc - * Portions Copyright (c) 1994, Regents of the University of California - * - * - * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/storage/buffer/Attic/xlog_bufmgr.c,v 1.6 2000/11/30 01:39:07 tgl Exp $ - * - *------------------------------------------------------------------------- - */ -/* - * - * BufferAlloc() -- lookup a buffer in the buffer table. If - * it isn't there add it, but do not read data into memory. - * This is used when we are about to reinitialize the - * buffer so don't care what the current disk contents are. - * BufferAlloc() also pins the new buffer in memory. - * - * ReadBuffer() -- like BufferAlloc() but reads the data - * on a buffer cache miss. - * - * ReleaseBuffer() -- unpin the buffer - * - * WriteNoReleaseBuffer() -- mark the buffer contents as "dirty" - * but don't unpin. The disk IO is delayed until buffer - * replacement. - * - * WriteBuffer() -- WriteNoReleaseBuffer() + ReleaseBuffer() - * - * BufferSync() -- flush all dirty buffers in the buffer pool. - * - * InitBufferPool() -- Init the buffer module. - * - * See other files: - * freelist.c -- chooses victim for buffer replacement - * buf_table.c -- manages the buffer lookup table - */ -#include "postgres.h" - -#include <sys/types.h> -#include <sys/file.h> -#include <math.h> -#include <signal.h> - -#include "executor/execdebug.h" -#include "miscadmin.h" -#include "storage/buf_internals.h" -#include "storage/bufmgr.h" -#include "storage/s_lock.h" -#include "storage/smgr.h" -#include "utils/relcache.h" - -#ifdef XLOG -#include "catalog/pg_database.h" -#endif - -#define BufferGetLSN(bufHdr) \ - (*((XLogRecPtr*)MAKE_PTR((bufHdr)->data))) - - -extern SPINLOCK BufMgrLock; -extern long int ReadBufferCount; -extern long int ReadLocalBufferCount; -extern long int BufferHitCount; -extern long int LocalBufferHitCount; -extern long int BufferFlushCount; -extern long int LocalBufferFlushCount; - -/* - * It's used to avoid disk writes for read-only transactions - * (i.e. when no one shared buffer was changed by transaction). - * We set it to true in WriteBuffer/WriteNoReleaseBuffer when - * marking shared buffer as dirty. We set it to false in xact.c - * after transaction is committed/aborted. - */ -bool SharedBufferChanged = false; - -static void WaitIO(BufferDesc *buf, SPINLOCK spinlock); -static void StartBufferIO(BufferDesc *buf, bool forInput); -static void TerminateBufferIO(BufferDesc *buf); -static void ContinueBufferIO(BufferDesc *buf, bool forInput); -extern void AbortBufferIO(void); - -/* - * Macro : BUFFER_IS_BROKEN - * Note that write error doesn't mean the buffer broken -*/ -#define BUFFER_IS_BROKEN(buf) ((buf->flags & BM_IO_ERROR) && !(buf->flags & BM_DIRTY)) - -static Buffer ReadBufferWithBufferLock(Relation relation, BlockNumber blockNum, - bool bufferLockHeld); -static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum, - bool *foundPtr, bool bufferLockHeld); -static int BufferReplace(BufferDesc *bufHdr); -void PrintBufferDescs(void); - -/* --------------------------------------------------- - * RelationGetBufferWithBuffer - * see if the given buffer is what we want - * if yes, we don't need to bother the buffer manager - * --------------------------------------------------- - */ -Buffer -RelationGetBufferWithBuffer(Relation relation, - BlockNumber blockNumber, - Buffer buffer) -{ - BufferDesc *bufHdr; - - if (BufferIsValid(buffer)) - { - if (!BufferIsLocal(buffer)) - { - bufHdr = &BufferDescriptors[buffer - 1]; - SpinAcquire(BufMgrLock); - if (bufHdr->tag.blockNum == blockNumber && - RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node)) - { - SpinRelease(BufMgrLock); - return buffer; - } - return ReadBufferWithBufferLock(relation, blockNumber, true); - } - else - { - bufHdr = &LocalBufferDescriptors[-buffer - 1]; - if (bufHdr->tag.blockNum == blockNumber && - RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node)) - return buffer; - } - } - return ReadBuffer(relation, blockNumber); -} - -/* - * ReadBuffer -- returns a buffer containing the requested - * block of the requested relation. If the blknum - * requested is P_NEW, extend the relation file and - * allocate a new block. - * - * Returns: the buffer number for the buffer containing - * the block read or NULL on an error. - * - * Assume when this function is called, that reln has been - * opened already. - */ - -#undef ReadBuffer /* conflicts with macro when BUFMGR_DEBUG - * defined */ - -/* - * ReadBuffer - * - */ -Buffer -ReadBuffer(Relation reln, BlockNumber blockNum) -{ - return ReadBufferWithBufferLock(reln, blockNum, false); -} - -/* - * ReadBufferWithBufferLock -- does the work of - * ReadBuffer() but with the possibility that - * the buffer lock has already been held. this - * is yet another effort to reduce the number of - * semops in the system. - */ -static Buffer -ReadBufferWithBufferLock(Relation reln, - BlockNumber blockNum, - bool bufferLockHeld) -{ - BufferDesc *bufHdr; - int extend; /* extending the file by one block */ - int status; - bool found; - bool isLocalBuf; - - extend = (blockNum == P_NEW); - isLocalBuf = reln->rd_myxactonly; - - if (isLocalBuf) - { - ReadLocalBufferCount++; - bufHdr = LocalBufferAlloc(reln, blockNum, &found); - if (found) - LocalBufferHitCount++; - } - else - { - ReadBufferCount++; - - /* - * lookup the buffer. IO_IN_PROGRESS is set if the requested - * block is not currently in memory. - */ - bufHdr = BufferAlloc(reln, blockNum, &found, bufferLockHeld); - if (found) - BufferHitCount++; - } - - if (!bufHdr) - return InvalidBuffer; - - /* if it's already in the buffer pool, we're done */ - if (found) - { - - /* - * This happens when a bogus buffer was returned previously and is - * floating around in the buffer pool. A routine calling this - * would want this extended. - */ - if (extend) - { - /* new buffers are zero-filled */ - MemSet((char *) MAKE_PTR(bufHdr->data), 0, BLCKSZ); - smgrextend(DEFAULT_SMGR, reln, - (char *) MAKE_PTR(bufHdr->data)); - } - return BufferDescriptorGetBuffer(bufHdr); - - } - - /* - * if we have gotten to this point, the reln pointer must be ok and - * the relation file must be open. - */ - if (extend) - { - /* new buffers are zero-filled */ - MemSet((char *) MAKE_PTR(bufHdr->data), 0, BLCKSZ); - status = smgrextend(DEFAULT_SMGR, reln, - (char *) MAKE_PTR(bufHdr->data)); - } - else - { - status = smgrread(DEFAULT_SMGR, reln, blockNum, - (char *) MAKE_PTR(bufHdr->data)); - } - - if (isLocalBuf) - return BufferDescriptorGetBuffer(bufHdr); - - /* lock buffer manager again to update IO IN PROGRESS */ - SpinAcquire(BufMgrLock); - - if (status == SM_FAIL) - { - /* IO Failed. cleanup the data structures and go home */ - - if (!BufTableDelete(bufHdr)) - { - SpinRelease(BufMgrLock); - elog(FATAL, "BufRead: buffer table broken after IO error\n"); - } - /* remember that BufferAlloc() pinned the buffer */ - UnpinBuffer(bufHdr); - - /* - * Have to reset the flag so that anyone waiting for the buffer - * can tell that the contents are invalid. - */ - bufHdr->flags |= BM_IO_ERROR; - bufHdr->flags &= ~BM_IO_IN_PROGRESS; - } - else - { - /* IO Succeeded. clear the flags, finish buffer update */ - - bufHdr->flags &= ~(BM_IO_ERROR | BM_IO_IN_PROGRESS); - } - - /* If anyone was waiting for IO to complete, wake them up now */ - TerminateBufferIO(bufHdr); - - SpinRelease(BufMgrLock); - - if (status == SM_FAIL) - return InvalidBuffer; - - return BufferDescriptorGetBuffer(bufHdr); -} - -/* - * BufferAlloc -- Get a buffer from the buffer pool but dont - * read it. - * - * Returns: descriptor for buffer - * - * When this routine returns, the BufMgrLock is guaranteed NOT be held. - */ -static BufferDesc * -BufferAlloc(Relation reln, - BlockNumber blockNum, - bool *foundPtr, - bool bufferLockHeld) -{ - BufferDesc *buf, - *buf2; - BufferTag newTag; /* identity of requested block */ - bool inProgress; /* buffer undergoing IO */ - bool newblock = FALSE; - - /* create a new tag so we can lookup the buffer */ - /* assume that the relation is already open */ - if (blockNum == P_NEW) - { - newblock = TRUE; - blockNum = smgrnblocks(DEFAULT_SMGR, reln); - } - - INIT_BUFFERTAG(&newTag, reln, blockNum); - - if (!bufferLockHeld) - SpinAcquire(BufMgrLock); - - /* see if the block is in the buffer pool already */ - buf = BufTableLookup(&newTag); - if (buf != NULL) - { - - /* - * Found it. Now, (a) pin the buffer so no one steals it from the - * buffer pool, (b) check IO_IN_PROGRESS, someone may be faulting - * the buffer into the buffer pool. - */ - - PinBuffer(buf); - inProgress = (buf->flags & BM_IO_IN_PROGRESS); - - *foundPtr = TRUE; - if (inProgress) /* confirm end of IO */ - { - WaitIO(buf, BufMgrLock); - inProgress = (buf->flags & BM_IO_IN_PROGRESS); - } - if (BUFFER_IS_BROKEN(buf)) - { - - /* - * I couldn't understand the following old comment. If there's - * no IO for the buffer and the buffer is BROKEN,it should be - * read again. So start a new buffer IO here. - * - * wierd race condition: - * - * We were waiting for someone else to read the buffer. While we - * were waiting, the reader boof'd in some way, so the - * contents of the buffer are still invalid. By saying that - * we didn't find it, we can make the caller reinitialize the - * buffer. If two processes are waiting for this block, both - * will read the block. The second one to finish may - * overwrite any updates made by the first. (Assume higher - * level synchronization prevents this from happening). - * - * This is never going to happen, don't worry about it. - */ - *foundPtr = FALSE; - } -#ifdef BMTRACE - _bm_trace((reln->rd_rel->relisshared ? 0 : MyDatabaseId), RelationGetRelid(reln), blockNum, BufferDescriptorGetBuffer(buf), BMT_ALLOCFND); -#endif /* BMTRACE */ - - if (!(*foundPtr)) - StartBufferIO(buf, true); - SpinRelease(BufMgrLock); - - return buf; - } - - *foundPtr = FALSE; - - /* - * Didn't find it in the buffer pool. We'll have to initialize a new - * buffer. First, grab one from the free list. If it's dirty, flush - * it to disk. Remember to unlock BufMgr spinlock while doing the IOs. - */ - inProgress = FALSE; - for (buf = (BufferDesc *) NULL; buf == (BufferDesc *) NULL;) - { - buf = GetFreeBuffer(); - - /* GetFreeBuffer will abort if it can't find a free buffer */ - Assert(buf); - - /* - * There should be exactly one pin on the buffer after it is - * allocated -- ours. If it had a pin it wouldn't have been on - * the free list. No one else could have pinned it between - * GetFreeBuffer and here because we have the BufMgrLock. - */ - Assert(buf->refcount == 0); - buf->refcount = 1; - PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 1; - - if (buf->flags & BM_DIRTY || buf->cntxDirty) - { - bool smok; - - /* - * skip write error buffers - */ - if ((buf->flags & BM_IO_ERROR) != 0) - { - PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0; - buf->refcount--; - buf = (BufferDesc *) NULL; - continue; - } - /* - * Set BM_IO_IN_PROGRESS to keep anyone from doing anything - * with the contents of the buffer while we write it out. We - * don't really care if they try to read it, but if they can - * complete a BufferAlloc on it they can then scribble into - * it, and we'd really like to avoid that while we are - * flushing the buffer. Setting this flag should block them - * in WaitIO until we're done. - */ - inProgress = TRUE; - - /* - * All code paths that acquire this lock pin the buffer first; - * since no one had it pinned (it just came off the free - * list), no one else can have this lock. - */ - StartBufferIO(buf, false); - - /* - * Write the buffer out, being careful to release BufMgrLock - * before starting the I/O. - */ - smok = BufferReplace(buf); - - if (smok == FALSE) - { - elog(NOTICE, "BufferAlloc: cannot write block %u for %s/%s", - buf->tag.blockNum, buf->blind.dbname, buf->blind.relname); - inProgress = FALSE; - buf->flags |= BM_IO_ERROR; - buf->flags &= ~BM_IO_IN_PROGRESS; - TerminateBufferIO(buf); - PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0; - Assert(buf->refcount > 0); - buf->refcount--; - if (buf->refcount == 0) - { - AddBufferToFreelist(buf); - buf->flags |= BM_FREE; - } - buf = (BufferDesc *) NULL; - } - else - { - /* - * BM_JUST_DIRTIED cleared by BufferReplace and shouldn't - * be setted by anyone. - vadim 01/17/97 - */ - if (buf->flags & BM_JUST_DIRTIED) - { - elog(STOP, "BufferAlloc: content of block %u (%s) changed while flushing", - buf->tag.blockNum, buf->blind.relname); - } - else - buf->flags &= ~BM_DIRTY; - buf->cntxDirty = false; - } - - /* - * Somebody could have pinned the buffer while we were doing - * the I/O and had given up the BufMgrLock (though they would - * be waiting for us to clear the BM_IO_IN_PROGRESS flag). - * That's why this is a loop -- if so, we need to clear the - * I/O flags, remove our pin and start all over again. - * - * People may be making buffers free at any time, so there's no - * reason to think that we have an immediate disaster on our - * hands. - */ - if (buf && buf->refcount > 1) - { - inProgress = FALSE; - buf->flags &= ~BM_IO_IN_PROGRESS; - TerminateBufferIO(buf); - PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0; - buf->refcount--; - buf = (BufferDesc *) NULL; - } - - /* - * Somebody could have allocated another buffer for the same - * block we are about to read in. (While we flush out the - * dirty buffer, we don't hold the lock and someone could have - * allocated another buffer for the same block. The problem is - * we haven't gotten around to insert the new tag into the - * buffer table. So we need to check here. -ay 3/95 - */ - buf2 = BufTableLookup(&newTag); - if (buf2 != NULL) - { - - /* - * Found it. Someone has already done what we're about to - * do. We'll just handle this as if it were found in the - * buffer pool in the first place. - */ - if (buf != NULL) - { - buf->flags &= ~BM_IO_IN_PROGRESS; - TerminateBufferIO(buf); - /* give up the buffer since we don't need it any more */ - PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0; - Assert(buf->refcount > 0); - buf->refcount--; - if (buf->refcount == 0) - { - AddBufferToFreelist(buf); - buf->flags |= BM_FREE; - } - } - - PinBuffer(buf2); - inProgress = (buf2->flags & BM_IO_IN_PROGRESS); - - *foundPtr = TRUE; - if (inProgress) - { - WaitIO(buf2, BufMgrLock); - inProgress = (buf2->flags & BM_IO_IN_PROGRESS); - } - if (BUFFER_IS_BROKEN(buf2)) - *foundPtr = FALSE; - - if (!(*foundPtr)) - StartBufferIO(buf2, true); - SpinRelease(BufMgrLock); - - return buf2; - } - } - } - - /* - * At this point we should have the sole pin on a non-dirty buffer and - * we may or may not already have the BM_IO_IN_PROGRESS flag set. - */ - - /* - * Change the name of the buffer in the lookup table: - * - * Need to update the lookup table before the read starts. If someone - * comes along looking for the buffer while we are reading it in, we - * don't want them to allocate a new buffer. For the same reason, we - * didn't want to erase the buf table entry for the buffer we were - * writing back until now, either. - */ - - if (!BufTableDelete(buf)) - { - SpinRelease(BufMgrLock); - elog(FATAL, "buffer wasn't in the buffer table\n"); - } - - /* record the database name and relation name for this buffer */ - strcpy(buf->blind.dbname, (DatabaseName) ? DatabaseName : "Recovery"); - strcpy(buf->blind.relname, RelationGetPhysicalRelationName(reln)); - - INIT_BUFFERTAG(&(buf->tag), reln, blockNum); - if (!BufTableInsert(buf)) - { - SpinRelease(BufMgrLock); - elog(FATAL, "Buffer in lookup table twice \n"); - } - - /* - * Buffer contents are currently invalid. Have to mark IO IN PROGRESS - * so no one fiddles with them until the read completes. If this - * routine has been called simply to allocate a buffer, no io will be - * attempted, so the flag isnt set. - */ - if (!inProgress) - StartBufferIO(buf, true); - else - ContinueBufferIO(buf, true); - -#ifdef BMTRACE - _bm_trace((reln->rd_rel->relisshared ? 0 : MyDatabaseId), RelationGetRelid(reln), blockNum, BufferDescriptorGetBuffer(buf), BMT_ALLOCNOTFND); -#endif /* BMTRACE */ - - SpinRelease(BufMgrLock); - - return buf; -} - -/* - * WriteBuffer - * - * Marks buffer contents as dirty (actual write happens later). - * - * Assume that buffer is pinned. Assume that reln is - * valid. - * - * Side Effects: - * Pin count is decremented. - */ - -#undef WriteBuffer - -int -WriteBuffer(Buffer buffer) -{ - BufferDesc *bufHdr; - - if (BufferIsLocal(buffer)) - return WriteLocalBuffer(buffer, TRUE); - - if (BAD_BUFFER_ID(buffer)) - return FALSE; - - bufHdr = &BufferDescriptors[buffer - 1]; - - SharedBufferChanged = true; - - SpinAcquire(BufMgrLock); - Assert(bufHdr->refcount > 0); - - bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED); - - UnpinBuffer(bufHdr); - SpinRelease(BufMgrLock); - - return TRUE; -} - -/* - * WriteNoReleaseBuffer -- like WriteBuffer, but do not unpin the buffer - * when the operation is complete. - */ -int -WriteNoReleaseBuffer(Buffer buffer) -{ - BufferDesc *bufHdr; - - if (BufferIsLocal(buffer)) - return WriteLocalBuffer(buffer, FALSE); - - if (BAD_BUFFER_ID(buffer)) - return STATUS_ERROR; - - bufHdr = &BufferDescriptors[buffer - 1]; - - SharedBufferChanged = true; - - SpinAcquire(BufMgrLock); - Assert(bufHdr->refcount > 0); - - bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED); - - SpinRelease(BufMgrLock); - - return STATUS_OK; -} - - -#undef ReleaseAndReadBuffer -/* - * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer() - * so that only one semop needs to be called. - * - */ -Buffer -ReleaseAndReadBuffer(Buffer buffer, - Relation relation, - BlockNumber blockNum) -{ - BufferDesc *bufHdr; - Buffer retbuf; - - if (BufferIsLocal(buffer)) - { - Assert(LocalRefCount[-buffer - 1] > 0); - LocalRefCount[-buffer - 1]--; - } - else - { - if (BufferIsValid(buffer)) - { - bufHdr = &BufferDescriptors[buffer - 1]; - Assert(PrivateRefCount[buffer - 1] > 0); - PrivateRefCount[buffer - 1]--; - if (PrivateRefCount[buffer - 1] == 0) - { - SpinAcquire(BufMgrLock); - Assert(bufHdr->refcount > 0); - bufHdr->refcount--; - if (bufHdr->refcount == 0) - { - AddBufferToFreelist(bufHdr); - bufHdr->flags |= BM_FREE; - } - retbuf = ReadBufferWithBufferLock(relation, blockNum, true); - return retbuf; - } - } - } - - return ReadBuffer(relation, blockNum); -} - -/* - * BufferSync -- Write all dirty buffers in the pool. - * - * This is called at checkpoint time and write out all dirty buffers. - */ -void -BufferSync() -{ - int i; - BufferDesc *bufHdr; - Buffer buffer; - int status; - RelFileNode rnode; - XLogRecPtr recptr; - Relation reln = NULL; - - for (i = 0, bufHdr = BufferDescriptors; i < NBuffers; i++, bufHdr++) - { - - SpinAcquire(BufMgrLock); - - if (!(bufHdr->flags & BM_VALID)) - { - SpinRelease(BufMgrLock); - continue; - } - - /* - * Pin buffer and ensure that no one reads it from disk - */ - PinBuffer(bufHdr); - /* Synchronize with BufferAlloc */ - if (bufHdr->flags & BM_IO_IN_PROGRESS) - WaitIO(bufHdr, BufMgrLock); - - buffer = BufferDescriptorGetBuffer(bufHdr); - rnode = bufHdr->tag.rnode; - - SpinRelease(BufMgrLock); - - /* - * Try to find relation for buffer - */ - reln = RelationNodeCacheGetRelation(rnode); - - /* - * Protect buffer content against concurrent update - */ - LockBuffer(buffer, BUFFER_LOCK_SHARE); - - /* - * Force XLOG flush for buffer' LSN - */ - recptr = BufferGetLSN(bufHdr); - XLogFlush(recptr); - - /* - * Now it's safe to write buffer to disk - * (if needed at all -:)) - */ - - SpinAcquire(BufMgrLock); - if (bufHdr->flags & BM_IO_IN_PROGRESS) - WaitIO(bufHdr, BufMgrLock); - - if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty) - { - bufHdr->flags &= ~BM_JUST_DIRTIED; - StartBufferIO(bufHdr, false); /* output IO start */ - - SpinRelease(BufMgrLock); - - if (reln == (Relation) NULL) - { - status = smgrblindwrt(DEFAULT_SMGR, - bufHdr->tag.rnode, - bufHdr->tag.blockNum, - (char *) MAKE_PTR(bufHdr->data), - true); /* must fsync */ - } - else - { - status = smgrwrite(DEFAULT_SMGR, reln, - bufHdr->tag.blockNum, - (char *) MAKE_PTR(bufHdr->data)); - } - - if (status == SM_FAIL) /* disk failure ?! */ - elog(STOP, "BufferSync: cannot write %u for %s", - bufHdr->tag.blockNum, bufHdr->blind.relname); - - /* - * Note that it's safe to change cntxDirty here because of - * we protect it from upper writers by share lock and from - * other bufmgr routines by BM_IO_IN_PROGRESS - */ - bufHdr->cntxDirty = false; - - /* - * Release the per-buffer readlock, reacquire BufMgrLock. - */ - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - BufferFlushCount++; - - SpinAcquire(BufMgrLock); - - bufHdr->flags &= ~BM_IO_IN_PROGRESS; /* mark IO finished */ - TerminateBufferIO(bufHdr); /* Sync IO finished */ - - /* - * If this buffer was marked by someone as DIRTY while - * we were flushing it out we must not clear DIRTY - * flag - vadim 01/17/97 - */ - if (!(bufHdr->flags & BM_JUST_DIRTIED)) - bufHdr->flags &= ~BM_DIRTY; - } - else - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - - UnpinBuffer(bufHdr); - - SpinRelease(BufMgrLock); - - /* drop refcnt obtained by RelationNodeCacheGetRelation */ - if (reln != (Relation) NULL) - { - RelationDecrementReferenceCount(reln); - reln = NULL; - } - } - -} - -/* - * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared. - * - * Should be entered with buffer manager spinlock held; releases it before - * waiting and re-acquires it afterwards. - */ -static void -WaitIO(BufferDesc *buf, SPINLOCK spinlock) -{ - - /* - * Changed to wait until there's no IO - Inoue 01/13/2000 - */ - while ((buf->flags & BM_IO_IN_PROGRESS) != 0) - { - SpinRelease(spinlock); - S_LOCK(&(buf->io_in_progress_lock)); - S_UNLOCK(&(buf->io_in_progress_lock)); - SpinAcquire(spinlock); - } -} - - -long NDirectFileRead; /* some I/O's are direct file access. - * bypass bufmgr */ -long NDirectFileWrite; /* e.g., I/O in psort and hashjoin. */ - -void -PrintBufferUsage(FILE *statfp) -{ - float hitrate; - float localhitrate; - - if (ReadBufferCount == 0) - hitrate = 0.0; - else - hitrate = (float) BufferHitCount *100.0 / ReadBufferCount; - - if (ReadLocalBufferCount == 0) - localhitrate = 0.0; - else - localhitrate = (float) LocalBufferHitCount *100.0 / ReadLocalBufferCount; - - fprintf(statfp, "!\tShared blocks: %10ld read, %10ld written, buffer hit rate = %.2f%%\n", - ReadBufferCount - BufferHitCount, BufferFlushCount, hitrate); - fprintf(statfp, "!\tLocal blocks: %10ld read, %10ld written, buffer hit rate = %.2f%%\n", - ReadLocalBufferCount - LocalBufferHitCount, LocalBufferFlushCount, localhitrate); - fprintf(statfp, "!\tDirect blocks: %10ld read, %10ld written\n", - NDirectFileRead, NDirectFileWrite); -} - -void -ResetBufferUsage() -{ - BufferHitCount = 0; - ReadBufferCount = 0; - BufferFlushCount = 0; - LocalBufferHitCount = 0; - ReadLocalBufferCount = 0; - LocalBufferFlushCount = 0; - NDirectFileRead = 0; - NDirectFileWrite = 0; -} - -/* ---------------------------------------------- - * ResetBufferPool - * - * This routine is supposed to be called when a transaction aborts. - * it will release all the buffer pins held by the transaction. - * Currently, we also call it during commit if BufferPoolCheckLeak - * detected a problem --- in that case, isCommit is TRUE, and we - * only clean up buffer pin counts. - * - * During abort, we also forget any pending fsync requests. Dirtied buffers - * will still get written, eventually, but there will be no fsync for them. - * - * ---------------------------------------------- - */ -void -ResetBufferPool(bool isCommit) -{ - int i; - - for (i = 0; i < NBuffers; i++) - { - if (PrivateRefCount[i] != 0) - { - BufferDesc *buf = &BufferDescriptors[i]; - - SpinAcquire(BufMgrLock); - Assert(buf->refcount > 0); - buf->refcount--; - if (buf->refcount == 0) - { - AddBufferToFreelist(buf); - buf->flags |= BM_FREE; - } - SpinRelease(BufMgrLock); - } - PrivateRefCount[i] = 0; - } - - ResetLocalBufferPool(); - - if (!isCommit) - smgrabort(); -} - -/* ----------------------------------------------- - * BufferPoolCheckLeak - * - * check if there is buffer leak - * - * ----------------------------------------------- - */ -int -BufferPoolCheckLeak() -{ - int i; - int result = 0; - - for (i = 1; i <= NBuffers; i++) - { - if (PrivateRefCount[i - 1] != 0) - { - BufferDesc *buf = &(BufferDescriptors[i - 1]); - - elog(NOTICE, - "Buffer Leak: [%03d] (freeNext=%ld, freePrev=%ld, \ -relname=%s, blockNum=%d, flags=0x%x, refcount=%d %ld)", - i - 1, buf->freeNext, buf->freePrev, - buf->blind.relname, buf->tag.blockNum, buf->flags, - buf->refcount, PrivateRefCount[i - 1]); - result = 1; - } - } - return result; -} - -/* ------------------------------------------------ - * FlushBufferPool - * - * Flush all dirty blocks in buffer pool to disk - * at the checkpoint time - * ------------------------------------------------ - */ -void -FlushBufferPool(void) -{ - BufferSync(); - smgrsync(); -} - -/* - * At the commit time we have to flush local buffer pool only - */ -void -BufmgrCommit(void) -{ - LocalBufferSync(); - /* - * All files created in current transaction will be fsync-ed - */ - smgrcommit(); -} - -/* - * BufferGetBlockNumber - * Returns the block number associated with a buffer. - * - * Note: - * Assumes that the buffer is valid. - */ -BlockNumber -BufferGetBlockNumber(Buffer buffer) -{ - Assert(BufferIsValid(buffer)); - - /* XXX should be a critical section */ - if (BufferIsLocal(buffer)) - return LocalBufferDescriptors[-buffer - 1].tag.blockNum; - else - return BufferDescriptors[buffer - 1].tag.blockNum; -} - -/* - * BufferReplace - * - * Write out the buffer corresponding to 'bufHdr' - * - * BufMgrLock must be held at entry, and the buffer must be pinned. - */ -static int -BufferReplace(BufferDesc *bufHdr) -{ - Relation reln; - XLogRecPtr recptr; - int status; - - /* To check if block content changed while flushing. - vadim 01/17/97 */ - bufHdr->flags &= ~BM_JUST_DIRTIED; - - SpinRelease(BufMgrLock); - - /* - * No need to lock buffer context - no one should be able to - * end ReadBuffer - */ - recptr = BufferGetLSN(bufHdr); - XLogFlush(recptr); - - reln = RelationNodeCacheGetRelation(bufHdr->tag.rnode); - - if (reln != (Relation) NULL) - { - status = smgrwrite(DEFAULT_SMGR, reln, bufHdr->tag.blockNum, - (char *) MAKE_PTR(bufHdr->data)); - } - else - { - status = smgrblindwrt(DEFAULT_SMGR, bufHdr->tag.rnode, - bufHdr->tag.blockNum, - (char *) MAKE_PTR(bufHdr->data), - false); /* no fsync */ - } - - /* drop relcache refcnt incremented by RelationNodeCacheGetRelation */ - if (reln != (Relation) NULL) - RelationDecrementReferenceCount(reln); - - SpinAcquire(BufMgrLock); - - if (status == SM_FAIL) - return FALSE; - - BufferFlushCount++; - - return TRUE; -} - -/* - * RelationGetNumberOfBlocks - * Returns the buffer descriptor associated with a page in a relation. - * - * Note: - * XXX may fail for huge relations. - * XXX should be elsewhere. - * XXX maybe should be hidden - */ -BlockNumber -RelationGetNumberOfBlocks(Relation relation) -{ - return ((relation->rd_myxactonly) ? relation->rd_nblocks : - ((relation->rd_rel->relkind == RELKIND_VIEW) ? 0 : - smgrnblocks(DEFAULT_SMGR, relation))); -} - -/* --------------------------------------------------------------------- - * DropRelationBuffers - * - * This function removes all the buffered pages for a relation - * from the buffer pool. Dirty pages are simply dropped, without - * bothering to write them out first. This is NOT rollback-able, - * and so should be used only with extreme caution! - * - * We assume that the caller holds an exclusive lock on the relation, - * which should assure that no new buffers will be acquired for the rel - * meanwhile. - * - * XXX currently it sequentially searches the buffer pool, should be - * changed to more clever ways of searching. - * -------------------------------------------------------------------- - */ -void -DropRelationBuffers(Relation rel) -{ - int i; - BufferDesc *bufHdr; - - if (rel->rd_myxactonly) - { - for (i = 0; i < NLocBuffer; i++) - { - bufHdr = &LocalBufferDescriptors[i]; - if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node)) - { - bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); - bufHdr->cntxDirty = false; - LocalRefCount[i] = 0; - bufHdr->tag.rnode.relNode = InvalidOid; - } - } - return; - } - - SpinAcquire(BufMgrLock); - for (i = 1; i <= NBuffers; i++) - { - bufHdr = &BufferDescriptors[i - 1]; -recheck: - if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node)) - { - - /* - * If there is I/O in progress, better wait till it's done; - * don't want to delete the relation out from under someone - * who's just trying to flush the buffer! - */ - if (bufHdr->flags & BM_IO_IN_PROGRESS) - { - WaitIO(bufHdr, BufMgrLock); - - /* - * By now, the buffer very possibly belongs to some other - * rel, so check again before proceeding. - */ - goto recheck; - } - /* Now we can do what we came for */ - bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); - bufHdr->cntxDirty = false; - - /* - * Release any refcount we may have. - * - * This is very probably dead code, and if it isn't then it's - * probably wrong. I added the Assert to find out --- tgl - * 11/99. - */ - if (!(bufHdr->flags & BM_FREE)) - { - /* Assert checks that buffer will actually get freed! */ - Assert(PrivateRefCount[i - 1] == 1 && - bufHdr->refcount == 1); - /* ReleaseBuffer expects we do not hold the lock at entry */ - SpinRelease(BufMgrLock); - ReleaseBuffer(i); - SpinAcquire(BufMgrLock); - } - /* - * And mark the buffer as no longer occupied by this rel. - */ - BufTableDelete(bufHdr); - } - } - - SpinRelease(BufMgrLock); -} - -/* --------------------------------------------------------------------- - * DropRelFileNodeBuffers - * - * This is the same as DropRelationBuffers, except that the target - * relation is specified by RelFileNode. - * - * This is NOT rollback-able. One legitimate use is to clear the - * buffer cache of buffers for a relation that is being deleted - * during transaction abort. - * -------------------------------------------------------------------- - */ -void -DropRelFileNodeBuffers(RelFileNode rnode) -{ - int i; - BufferDesc *bufHdr; - - /* We have to search both local and shared buffers... */ - - for (i = 0; i < NLocBuffer; i++) - { - bufHdr = &LocalBufferDescriptors[i]; - if (RelFileNodeEquals(bufHdr->tag.rnode, rnode)) - { - bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); - bufHdr->cntxDirty = false; - LocalRefCount[i] = 0; - bufHdr->tag.rnode.relNode = InvalidOid; - } - } - - SpinAcquire(BufMgrLock); - for (i = 1; i <= NBuffers; i++) - { - bufHdr = &BufferDescriptors[i - 1]; -recheck: - if (RelFileNodeEquals(bufHdr->tag.rnode, rnode)) - { - - /* - * If there is I/O in progress, better wait till it's done; - * don't want to delete the relation out from under someone - * who's just trying to flush the buffer! - */ - if (bufHdr->flags & BM_IO_IN_PROGRESS) - { - WaitIO(bufHdr, BufMgrLock); - - /* - * By now, the buffer very possibly belongs to some other - * rel, so check again before proceeding. - */ - goto recheck; - } - /* Now we can do what we came for */ - bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); - bufHdr->cntxDirty = false; - - /* - * Release any refcount we may have. - * - * This is very probably dead code, and if it isn't then it's - * probably wrong. I added the Assert to find out --- tgl - * 11/99. - */ - if (!(bufHdr->flags & BM_FREE)) - { - /* Assert checks that buffer will actually get freed! */ - Assert(PrivateRefCount[i - 1] == 1 && - bufHdr->refcount == 1); - /* ReleaseBuffer expects we do not hold the lock at entry */ - SpinRelease(BufMgrLock); - ReleaseBuffer(i); - SpinAcquire(BufMgrLock); - } - /* - * And mark the buffer as no longer occupied by this rel. - */ - BufTableDelete(bufHdr); - } - } - - SpinRelease(BufMgrLock); -} - -/* --------------------------------------------------------------------- - * DropBuffers - * - * This function removes all the buffers in the buffer cache for a - * particular database. Dirty pages are simply dropped, without - * bothering to write them out first. This is used when we destroy a - * database, to avoid trying to flush data to disk when the directory - * tree no longer exists. Implementation is pretty similar to - * DropRelationBuffers() which is for destroying just one relation. - * -------------------------------------------------------------------- - */ -void -DropBuffers(Oid dbid) -{ - int i; - BufferDesc *bufHdr; - - SpinAcquire(BufMgrLock); - for (i = 1; i <= NBuffers; i++) - { - bufHdr = &BufferDescriptors[i - 1]; -recheck: - /* - * We know that currently database OID is tblNode but - * this probably will be changed in future and this - * func will be used to drop tablespace buffers. - */ - if (bufHdr->tag.rnode.tblNode == dbid) - { - - /* - * If there is I/O in progress, better wait till it's done; - * don't want to delete the database out from under someone - * who's just trying to flush the buffer! - */ - if (bufHdr->flags & BM_IO_IN_PROGRESS) - { - WaitIO(bufHdr, BufMgrLock); - - /* - * By now, the buffer very possibly belongs to some other - * DB, so check again before proceeding. - */ - goto recheck; - } - /* Now we can do what we came for */ - bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); - bufHdr->cntxDirty = false; - - /* - * The thing should be free, if caller has checked that no - * backends are running in that database. - */ - Assert(bufHdr->flags & BM_FREE); - /* - * And mark the buffer as no longer occupied by this page. - */ - BufTableDelete(bufHdr); - } - } - SpinRelease(BufMgrLock); -} - -/* ----------------------------------------------------------------- - * PrintBufferDescs - * - * this function prints all the buffer descriptors, for debugging - * use only. - * ----------------------------------------------------------------- - */ -void -PrintBufferDescs() -{ - int i; - BufferDesc *buf = BufferDescriptors; - - if (IsUnderPostmaster) - { - SpinAcquire(BufMgrLock); - for (i = 0; i < NBuffers; ++i, ++buf) - { - elog(DEBUG, "[%02d] (freeNext=%ld, freePrev=%ld, relname=%s, \ -blockNum=%d, flags=0x%x, refcount=%d %ld)", - i, buf->freeNext, buf->freePrev, - buf->blind.relname, buf->tag.blockNum, buf->flags, - buf->refcount, PrivateRefCount[i]); - } - SpinRelease(BufMgrLock); - } - else - { - /* interactive backend */ - for (i = 0; i < NBuffers; ++i, ++buf) - { - printf("[%-2d] (%s, %d) flags=0x%x, refcnt=%d %ld)\n", - i, buf->blind.relname, buf->tag.blockNum, - buf->flags, buf->refcount, PrivateRefCount[i]); - } - } -} - -void -PrintPinnedBufs() -{ - int i; - BufferDesc *buf = BufferDescriptors; - - SpinAcquire(BufMgrLock); - for (i = 0; i < NBuffers; ++i, ++buf) - { - if (PrivateRefCount[i] > 0) - elog(NOTICE, "[%02d] (freeNext=%ld, freePrev=%ld, relname=%s, \ -blockNum=%d, flags=0x%x, refcount=%d %ld)\n", - i, buf->freeNext, buf->freePrev, buf->blind.relname, - buf->tag.blockNum, buf->flags, - buf->refcount, PrivateRefCount[i]); - } - SpinRelease(BufMgrLock); -} - -/* - * BufferPoolBlowaway - * - * this routine is solely for the purpose of experiments -- sometimes - * you may want to blowaway whatever is left from the past in buffer - * pool and start measuring some performance with a clean empty buffer - * pool. - */ -#ifdef NOT_USED -void -BufferPoolBlowaway() -{ - int i; - - BufferSync(); - for (i = 1; i <= NBuffers; i++) - { - if (BufferIsValid(i)) - { - while (BufferIsValid(i)) - ReleaseBuffer(i); - } - BufTableDelete(&BufferDescriptors[i - 1]); - } -} - -#endif - -/* --------------------------------------------------------------------- - * FlushRelationBuffers - * - * This function writes all dirty pages of a relation out to disk. - * Furthermore, pages that have blocknumber >= firstDelBlock are - * actually removed from the buffer pool. An error code is returned - * if we fail to dump a dirty buffer or if we find one of - * the target pages is pinned into the cache. - * - * This is called by DROP TABLE to clear buffers for the relation - * from the buffer pool. Note that we must write dirty buffers, - * rather than just dropping the changes, because our transaction - * might abort later on; we want to roll back safely in that case. - * - * This is also called by VACUUM before truncating the relation to the - * given number of blocks. It might seem unnecessary for VACUUM to - * write dirty pages before firstDelBlock, since VACUUM should already - * have committed its changes. However, it is possible for there still - * to be dirty pages: if some page had unwritten on-row tuple status - * updates from a prior transaction, and VACUUM had no additional - * changes to make to that page, then VACUUM won't have written it. - * This is harmless in most cases but will break pg_upgrade, which - * relies on VACUUM to ensure that *all* tuples have correct on-row - * status. So, we check and flush all dirty pages of the rel - * regardless of block number. - * - * In all cases, the caller should be holding AccessExclusiveLock on - * the target relation to ensure that no other backend is busy reading - * more blocks of the relation (or might do so before we commit). - * - * Formerly, we considered it an error condition if we found dirty - * buffers here. However, since BufferSync no longer forces out all - * dirty buffers at every xact commit, it's possible for dirty buffers - * to still be present in the cache due to failure of an earlier - * transaction. So, must flush dirty buffers without complaint. - * - * Returns: 0 - Ok, -1 - FAILED TO WRITE DIRTY BUFFER, -2 - PINNED - * - * XXX currently it sequentially searches the buffer pool, should be - * changed to more clever ways of searching. - * -------------------------------------------------------------------- - */ -int -FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock) -{ - int i; - BufferDesc *bufHdr; - XLogRecPtr recptr; - int status; - - if (rel->rd_myxactonly) - { - for (i = 0; i < NLocBuffer; i++) - { - bufHdr = &LocalBufferDescriptors[i]; - if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node)) - { - if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty) - { - status = smgrwrite(DEFAULT_SMGR, rel, - bufHdr->tag.blockNum, - (char *) MAKE_PTR(bufHdr->data)); - if (status == SM_FAIL) - { - elog(NOTICE, "FlushRelationBuffers(%s (local), %u): block %u is dirty, could not flush it", - RelationGetRelationName(rel), firstDelBlock, - bufHdr->tag.blockNum); - return(-1); - } - bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); - bufHdr->cntxDirty = false; - } - if (LocalRefCount[i] > 0) - { - elog(NOTICE, "FlushRelationBuffers(%s (local), %u): block %u is referenced (%ld)", - RelationGetRelationName(rel), firstDelBlock, - bufHdr->tag.blockNum, LocalRefCount[i]); - return(-2); - } - if (bufHdr->tag.blockNum >= firstDelBlock) - { - bufHdr->tag.rnode.relNode = InvalidOid; - } - } - } - return 0; - } - - SpinAcquire(BufMgrLock); - for (i = 0; i < NBuffers; i++) - { - bufHdr = &BufferDescriptors[i]; - if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node)) - { - if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty) - { - PinBuffer(bufHdr); - if (bufHdr->flags & BM_IO_IN_PROGRESS) - WaitIO(bufHdr, BufMgrLock); - SpinRelease(BufMgrLock); - - /* - * Force XLOG flush for buffer' LSN - */ - recptr = BufferGetLSN(bufHdr); - XLogFlush(recptr); - - /* - * Now it's safe to write buffer to disk - */ - - SpinAcquire(BufMgrLock); - if (bufHdr->flags & BM_IO_IN_PROGRESS) - WaitIO(bufHdr, BufMgrLock); - - if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty) - { - bufHdr->flags &= ~BM_JUST_DIRTIED; - StartBufferIO(bufHdr, false); /* output IO start */ - - SpinRelease(BufMgrLock); - - status = smgrwrite(DEFAULT_SMGR, rel, - bufHdr->tag.blockNum, - (char *) MAKE_PTR(bufHdr->data)); - - if (status == SM_FAIL) /* disk failure ?! */ - elog(STOP, "FlushRelationBuffers: cannot write %u for %s", - bufHdr->tag.blockNum, bufHdr->blind.relname); - - BufferFlushCount++; - - SpinAcquire(BufMgrLock); - bufHdr->flags &= ~BM_IO_IN_PROGRESS; - TerminateBufferIO(bufHdr); - Assert(!(bufHdr->flags & BM_JUST_DIRTIED)); - bufHdr->flags &= ~BM_DIRTY; - /* - * Note that it's safe to change cntxDirty here because - * of we protect it from upper writers by - * AccessExclusiveLock and from other bufmgr routines - * by BM_IO_IN_PROGRESS - */ - bufHdr->cntxDirty = false; - } - UnpinBuffer(bufHdr); - } - if (!(bufHdr->flags & BM_FREE)) - { - SpinRelease(BufMgrLock); - elog(NOTICE, "FlushRelationBuffers(%s, %u): block %u is referenced (private %ld, global %d)", - RelationGetRelationName(rel), firstDelBlock, - bufHdr->tag.blockNum, - PrivateRefCount[i], bufHdr->refcount); - return -2; - } - if (bufHdr->tag.blockNum >= firstDelBlock) - { - BufTableDelete(bufHdr); - } - } - } - SpinRelease(BufMgrLock); - return 0; -} - -#undef ReleaseBuffer - -/* - * ReleaseBuffer -- remove the pin on a buffer without - * marking it dirty. - * - */ -int -ReleaseBuffer(Buffer buffer) -{ - BufferDesc *bufHdr; - - if (BufferIsLocal(buffer)) - { - Assert(LocalRefCount[-buffer - 1] > 0); - LocalRefCount[-buffer - 1]--; - return STATUS_OK; - } - - if (BAD_BUFFER_ID(buffer)) - return STATUS_ERROR; - - bufHdr = &BufferDescriptors[buffer - 1]; - - Assert(PrivateRefCount[buffer - 1] > 0); - PrivateRefCount[buffer - 1]--; - if (PrivateRefCount[buffer - 1] == 0) - { - SpinAcquire(BufMgrLock); - Assert(bufHdr->refcount > 0); - bufHdr->refcount--; - if (bufHdr->refcount == 0) - { - AddBufferToFreelist(bufHdr); - bufHdr->flags |= BM_FREE; - } - SpinRelease(BufMgrLock); - } - - return STATUS_OK; -} - -#ifdef NOT_USED -void -IncrBufferRefCount_Debug(char *file, int line, Buffer buffer) -{ - IncrBufferRefCount(buffer); - if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer)) - { - BufferDesc *buf = &BufferDescriptors[buffer - 1]; - - fprintf(stderr, "PIN(Incr) %ld relname = %s, blockNum = %d, \ -refcount = %ld, file: %s, line: %d\n", - buffer, buf->blind.relname, buf->tag.blockNum, - PrivateRefCount[buffer - 1], file, line); - } -} - -#endif - -#ifdef NOT_USED -void -ReleaseBuffer_Debug(char *file, int line, Buffer buffer) -{ - ReleaseBuffer(buffer); - if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer)) - { - BufferDesc *buf = &BufferDescriptors[buffer - 1]; - - fprintf(stderr, "UNPIN(Rel) %ld relname = %s, blockNum = %d, \ -refcount = %ld, file: %s, line: %d\n", - buffer, buf->blind.relname, buf->tag.blockNum, - PrivateRefCount[buffer - 1], file, line); - } -} - -#endif - -#ifdef NOT_USED -int -ReleaseAndReadBuffer_Debug(char *file, - int line, - Buffer buffer, - Relation relation, - BlockNumber blockNum) -{ - bool bufferValid; - Buffer b; - - bufferValid = BufferIsValid(buffer); - b = ReleaseAndReadBuffer(buffer, relation, blockNum); - if (ShowPinTrace && bufferValid && BufferIsLocal(buffer) - && is_userbuffer(buffer)) - { - BufferDesc *buf = &BufferDescriptors[buffer - 1]; - - fprintf(stderr, "UNPIN(Rel&Rd) %ld relname = %s, blockNum = %d, \ -refcount = %ld, file: %s, line: %d\n", - buffer, buf->blind.relname, buf->tag.blockNum, - PrivateRefCount[buffer - 1], file, line); - } - if (ShowPinTrace && BufferIsLocal(buffer) && is_userbuffer(buffer)) - { - BufferDesc *buf = &BufferDescriptors[b - 1]; - - fprintf(stderr, "PIN(Rel&Rd) %ld relname = %s, blockNum = %d, \ -refcount = %ld, file: %s, line: %d\n", - b, buf->blind.relname, buf->tag.blockNum, - PrivateRefCount[b - 1], file, line); - } - return b; -} - -#endif - -#ifdef BMTRACE - -/* - * trace allocations and deallocations in a circular buffer in - * shared memory. check the buffer before doing the allocation, - * and die if there's anything fishy. - */ - -_bm_trace(Oid dbId, Oid relId, int blkNo, int bufNo, int allocType) -{ - long start, - cur; - bmtrace *tb; - - start = *CurTraceBuf; - - if (start > 0) - cur = start - 1; - else - cur = BMT_LIMIT - 1; - - for (;;) - { - tb = &TraceBuf[cur]; - if (tb->bmt_op != BMT_NOTUSED) - { - if (tb->bmt_buf == bufNo) - { - if ((tb->bmt_op == BMT_DEALLOC) - || (tb->bmt_dbid == dbId && tb->bmt_relid == relId - && tb->bmt_blkno == blkNo)) - goto okay; - - /* die holding the buffer lock */ - _bm_die(dbId, relId, blkNo, bufNo, allocType, start, cur); - } - } - - if (cur == start) - goto okay; - - if (cur == 0) - cur = BMT_LIMIT - 1; - else - cur--; - } - -okay: - tb = &TraceBuf[start]; - tb->bmt_pid = MyProcPid; - tb->bmt_buf = bufNo; - tb->bmt_dbid = dbId; - tb->bmt_relid = relId; - tb->bmt_blkno = blkNo; - tb->bmt_op = allocType; - - *CurTraceBuf = (start + 1) % BMT_LIMIT; -} - -_bm_die(Oid dbId, Oid relId, int blkNo, int bufNo, - int allocType, long start, long cur) -{ - FILE *fp; - bmtrace *tb; - int i; - - tb = &TraceBuf[cur]; - - if ((fp = AllocateFile("/tmp/death_notice", "w")) == NULL) - elog(FATAL, "buffer alloc trace error and can't open log file"); - - fprintf(fp, "buffer alloc trace detected the following error:\n\n"); - fprintf(fp, " buffer %d being %s inconsistently with a previous %s\n\n", - bufNo, (allocType == BMT_DEALLOC ? "deallocated" : "allocated"), - (tb->bmt_op == BMT_DEALLOC ? "deallocation" : "allocation")); - - fprintf(fp, "the trace buffer contains:\n"); - - i = start; - for (;;) - { - tb = &TraceBuf[i]; - if (tb->bmt_op != BMT_NOTUSED) - { - fprintf(fp, " [%3d]%spid %d buf %2d for <%d,%u,%d> ", - i, (i == cur ? " ---> " : "\t"), - tb->bmt_pid, tb->bmt_buf, - tb->bmt_dbid, tb->bmt_relid, tb->bmt_blkno); - - switch (tb->bmt_op) - { - case BMT_ALLOCFND: - fprintf(fp, "allocate (found)\n"); - break; - - case BMT_ALLOCNOTFND: - fprintf(fp, "allocate (not found)\n"); - break; - - case BMT_DEALLOC: - fprintf(fp, "deallocate\n"); - break; - - default: - fprintf(fp, "unknown op type %d\n", tb->bmt_op); - break; - } - } - - i = (i + 1) % BMT_LIMIT; - if (i == start) - break; - } - - fprintf(fp, "\noperation causing error:\n"); - fprintf(fp, "\tpid %d buf %d for <%d,%u,%d> ", - getpid(), bufNo, dbId, relId, blkNo); - - switch (allocType) - { - case BMT_ALLOCFND: - fprintf(fp, "allocate (found)\n"); - break; - - case BMT_ALLOCNOTFND: - fprintf(fp, "allocate (not found)\n"); - break; - - case BMT_DEALLOC: - fprintf(fp, "deallocate\n"); - break; - - default: - fprintf(fp, "unknown op type %d\n", allocType); - break; - } - - FreeFile(fp); - - kill(getpid(), SIGILL); -} - -#endif /* BMTRACE */ - -/* - * SetBufferCommitInfoNeedsSave - * - * Mark a buffer dirty when we have updated tuple commit-status bits in it. - * - * This is similar to WriteNoReleaseBuffer, except that we do not set - * SharedBufferChanged or BufferDirtiedByMe, because we have not made a - * critical change that has to be flushed to disk before xact commit --- the - * status-bit update could be redone by someone else just as easily. The - * buffer will be marked dirty, but it will not be written to disk until - * there is another reason to write it. - * - * This routine might get called many times on the same page, if we are making - * the first scan after commit of an xact that added/deleted many tuples. - * So, be as quick as we can if the buffer is already dirty. - */ -void -SetBufferCommitInfoNeedsSave(Buffer buffer) -{ - BufferDesc *bufHdr; - - if (BufferIsLocal(buffer)) - return; - - if (BAD_BUFFER_ID(buffer)) - return; - - bufHdr = &BufferDescriptors[buffer - 1]; - - if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) != - (BM_DIRTY | BM_JUST_DIRTIED)) - { - SpinAcquire(BufMgrLock); - Assert(bufHdr->refcount > 0); - bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED); - SpinRelease(BufMgrLock); - } -} - -void -UnlockBuffers() -{ - BufferDesc *buf; - int i; - - for (i = 0; i < NBuffers; i++) - { - if (BufferLocks[i] == 0) - continue; - - Assert(BufferIsValid(i + 1)); - buf = &(BufferDescriptors[i]); - - S_LOCK(&(buf->cntx_lock)); - - if (BufferLocks[i] & BL_R_LOCK) - { - Assert(buf->r_locks > 0); - (buf->r_locks)--; - } - if (BufferLocks[i] & BL_RI_LOCK) - { - - /* - * Someone else could remove our RI lock when acquiring W - * lock. This is possible if we came here from elog(ERROR) - * from IpcSemaphore{Lock|Unlock}(WaitCLSemId). And so we - * don't do Assert(buf->ri_lock) here. - */ - buf->ri_lock = false; - } - if (BufferLocks[i] & BL_W_LOCK) - { - Assert(buf->w_lock); - buf->w_lock = false; - } - - S_UNLOCK(&(buf->cntx_lock)); - - BufferLocks[i] = 0; - } -} - -void -LockBuffer(Buffer buffer, int mode) -{ - BufferDesc *buf; - bits8 *buflock; - - Assert(BufferIsValid(buffer)); - if (BufferIsLocal(buffer)) - return; - - buf = &(BufferDescriptors[buffer - 1]); - buflock = &(BufferLocks[buffer - 1]); - - S_LOCK(&(buf->cntx_lock)); - - if (mode == BUFFER_LOCK_UNLOCK) - { - if (*buflock & BL_R_LOCK) - { - Assert(buf->r_locks > 0); - Assert(!(buf->w_lock)); - Assert(!(*buflock & (BL_W_LOCK | BL_RI_LOCK))); - (buf->r_locks)--; - *buflock &= ~BL_R_LOCK; - } - else if (*buflock & BL_W_LOCK) - { - Assert(buf->w_lock); - Assert(buf->r_locks == 0); - Assert(!(*buflock & (BL_R_LOCK | BL_RI_LOCK))); - buf->w_lock = false; - *buflock &= ~BL_W_LOCK; - } - else - elog(ERROR, "UNLockBuffer: buffer %lu is not locked", buffer); - } - else if (mode == BUFFER_LOCK_SHARE) - { - unsigned i = 0; - - Assert(!(*buflock & (BL_R_LOCK | BL_W_LOCK | BL_RI_LOCK))); - while (buf->ri_lock || buf->w_lock) - { - S_UNLOCK(&(buf->cntx_lock)); - s_lock_sleep(i++); - S_LOCK(&(buf->cntx_lock)); - } - (buf->r_locks)++; - *buflock |= BL_R_LOCK; - } - else if (mode == BUFFER_LOCK_EXCLUSIVE) - { - unsigned i = 0; - - Assert(!(*buflock & (BL_R_LOCK | BL_W_LOCK | BL_RI_LOCK))); - while (buf->r_locks > 0 || buf->w_lock) - { - if (buf->r_locks > 3 || (*buflock & BL_RI_LOCK)) - { - - /* - * Our RI lock might be removed by concurrent W lock - * acquiring (see what we do with RI locks below when our - * own W acquiring succeeded) and so we set RI lock again - * if we already did this. - */ - *buflock |= BL_RI_LOCK; - buf->ri_lock = true; - } - S_UNLOCK(&(buf->cntx_lock)); - s_lock_sleep(i++); - S_LOCK(&(buf->cntx_lock)); - } - buf->w_lock = true; - *buflock |= BL_W_LOCK; - - buf->cntxDirty = true; - - if (*buflock & BL_RI_LOCK) - { - - /* - * It's possible to remove RI locks acquired by another W - * lockers here, but they'll take care about it. - */ - buf->ri_lock = false; - *buflock &= ~BL_RI_LOCK; - } - } - else - elog(ERROR, "LockBuffer: unknown lock mode %d", mode); - - S_UNLOCK(&(buf->cntx_lock)); -} - -/* - * Functions for IO error handling - * - * Note : We assume that nested buffer IO never occur. - * i.e at most one io_in_progress spinlock is held - * per proc. -*/ -static BufferDesc *InProgressBuf = (BufferDesc *) NULL; -static bool IsForInput; - -/* - * Function:StartBufferIO - * (Assumptions) - * My process is executing no IO - * BufMgrLock is held - * BM_IO_IN_PROGRESS mask is not set for the buffer - * The buffer is Pinned - * -*/ -static void -StartBufferIO(BufferDesc *buf, bool forInput) -{ - Assert(!InProgressBuf); - Assert(!(buf->flags & BM_IO_IN_PROGRESS)); - buf->flags |= BM_IO_IN_PROGRESS; - - /* - * There used to be - * - * Assert(S_LOCK_FREE(&(buf->io_in_progress_lock))); - * - * here, but that's wrong because of the way WaitIO works: someone else - * waiting for the I/O to complete will succeed in grabbing the lock - * for a few instructions, and if we context-swap back to here the - * Assert could fail. Tiny window for failure, but I've seen it - * happen -- tgl - */ - S_LOCK(&(buf->io_in_progress_lock)); - - InProgressBuf = buf; - IsForInput = forInput; -} - -/* - * Function:TerminateBufferIO - * (Assumptions) - * My process is executing IO for the buffer - * BufMgrLock is held - * The buffer is Pinned - * -*/ -static void -TerminateBufferIO(BufferDesc *buf) -{ - Assert(buf == InProgressBuf); - S_UNLOCK(&(buf->io_in_progress_lock)); - InProgressBuf = (BufferDesc *) 0; -} - -/* - * Function:ContinueBufferIO - * (Assumptions) - * My process is executing IO for the buffer - * BufMgrLock is held - * The buffer is Pinned - * -*/ -static void -ContinueBufferIO(BufferDesc *buf, bool forInput) -{ - Assert(buf == InProgressBuf); - Assert(buf->flags & BM_IO_IN_PROGRESS); - IsForInput = forInput; -} - -#ifdef NOT_USED -void -InitBufferIO(void) -{ - InProgressBuf = (BufferDesc *) 0; -} -#endif - -/* - * This function is called from ProcReleaseSpins(). - * BufMgrLock isn't held when this function is called. - * BM_IO_ERROR is always set. If BM_IO_ERROR was already - * set in case of output,this routine would kill all - * backends and reset postmaster. - */ -void -AbortBufferIO(void) -{ - BufferDesc *buf = InProgressBuf; - - if (buf) - { - Assert(buf->flags & BM_IO_IN_PROGRESS); - SpinAcquire(BufMgrLock); - if (IsForInput) - Assert(!(buf->flags & BM_DIRTY) && !(buf->cntxDirty)); - else - { - Assert(buf->flags & BM_DIRTY || buf->cntxDirty); - if (buf->flags & BM_IO_ERROR) - { - elog(NOTICE, "write error may be permanent: cannot write block %u for %s/%s", - buf->tag.blockNum, buf->blind.dbname, buf->blind.relname); - } - buf->flags |= BM_DIRTY; - } - buf->flags |= BM_IO_ERROR; - buf->flags &= ~BM_IO_IN_PROGRESS; - TerminateBufferIO(buf); - SpinRelease(BufMgrLock); - } -} - -/* - * Cleanup buffer or mark it for cleanup. Buffer may be cleaned - * up if it's pinned only once. - * - * NOTE: buffer must be excl locked. - */ -void -MarkBufferForCleanup(Buffer buffer, void (*CleanupFunc)(Buffer)) -{ - BufferDesc *bufHdr = &BufferDescriptors[buffer - 1]; - - Assert(PrivateRefCount[buffer - 1] > 0); - - if (PrivateRefCount[buffer - 1] > 1) - { - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - PrivateRefCount[buffer - 1]--; - SpinAcquire(BufMgrLock); - Assert(bufHdr->refcount > 0); - bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED); - bufHdr->CleanupFunc = CleanupFunc; - SpinRelease(BufMgrLock); - return; - } - - SpinAcquire(BufMgrLock); - Assert(bufHdr->refcount > 0); - if (bufHdr->refcount == 1) - { - SpinRelease(BufMgrLock); - CleanupFunc(buffer); - CleanupFunc = NULL; - } - else - SpinRelease(BufMgrLock); - - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - PrivateRefCount[buffer - 1]--; - - SpinAcquire(BufMgrLock); - Assert(bufHdr->refcount > 0); - bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED); - bufHdr->CleanupFunc = CleanupFunc; - bufHdr->refcount--; - if (bufHdr->refcount == 0) - { - AddBufferToFreelist(bufHdr); - bufHdr->flags |= BM_FREE; - } - SpinRelease(BufMgrLock); - return; -} |