diff options
author | Marc G. Fournier <scrappy@hub.org> | 1996-07-09 06:22:35 +0000 |
---|---|---|
committer | Marc G. Fournier <scrappy@hub.org> | 1996-07-09 06:22:35 +0000 |
commit | d31084e9d1118b25fd16580d9d8c2924b5740dff (patch) | |
tree | 3179e66307d54df9c7b966543550e601eb55e668 /src/backend/storage/buffer | |
download | postgresql-PG95-1_01.tar.gz postgresql-PG95-1_01.zip |
Postgres95 1.01 Distribution - Virgin SourcesPG95-1_01
Diffstat (limited to 'src/backend/storage/buffer')
-rw-r--r-- | src/backend/storage/buffer/Makefile.inc | 16 | ||||
-rw-r--r-- | src/backend/storage/buffer/buf_init.c | 280 | ||||
-rw-r--r-- | src/backend/storage/buffer/buf_table.c | 162 | ||||
-rw-r--r-- | src/backend/storage/buffer/bufmgr.c | 1581 | ||||
-rw-r--r-- | src/backend/storage/buffer/freelist.c | 285 | ||||
-rw-r--r-- | src/backend/storage/buffer/localbuf.c | 284 |
6 files changed, 2608 insertions, 0 deletions
diff --git a/src/backend/storage/buffer/Makefile.inc b/src/backend/storage/buffer/Makefile.inc new file mode 100644 index 00000000000..1d507f9227b --- /dev/null +++ b/src/backend/storage/buffer/Makefile.inc @@ -0,0 +1,16 @@ +#------------------------------------------------------------------------- +# +# Makefile.inc-- +# Makefile for storage/buffer +# +# Copyright (c) 1994, Regents of the University of California +# +# +# IDENTIFICATION +# $Header: /cvsroot/pgsql/src/backend/storage/buffer/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ +# +#------------------------------------------------------------------------- + +SUBSRCS+= buf_table.c buf_init.c bufmgr.c freelist.c localbuf.c + +SRCS_SITEMGR+= buf_table.c buf_init.c freelist.c diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c new file mode 100644 index 00000000000..823bf41eecf --- /dev/null +++ b/src/backend/storage/buffer/buf_init.c @@ -0,0 +1,280 @@ +/*------------------------------------------------------------------------- + * + * buf_init.c-- + * buffer manager initialization routines + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/buffer/buf_init.c,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include <sys/file.h> +#include <stdio.h> +#include <math.h> +#include <signal.h> + +/* declarations split between these three files */ +#include "storage/buf.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" + +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/shmem.h" +#include "storage/spin.h" +#include "storage/smgr.h" +#include "storage/lmgr.h" +#include "miscadmin.h" +#include "utils/builtins.h" +#include "utils/hsearch.h" +#include "utils/elog.h" +#include "utils/memutils.h" +#include "executor/execdebug.h" /* for NDirectFileRead */ 
+#include "catalog/catalog.h" + +/* + * if BMTRACE is defined, we trace the last 200 buffer allocations and + * deallocations in a circular buffer in shared memory. + */ +#ifdef BMTRACE +bmtrace *TraceBuf; +long *CurTraceBuf; +#define BMT_LIMIT 200 +#endif /* BMTRACE */ +int ShowPinTrace = 0; + +int NBuffers = NDBUFS; /* NDBUFS defined in miscadmin.h */ +int Data_Descriptors; +int Free_List_Descriptor; +int Lookup_List_Descriptor; +int Num_Descriptors; + +BufferDesc *BufferDescriptors; +BufferBlock BufferBlocks; +#ifndef HAS_TEST_AND_SET +long *NWaitIOBackendP; +#endif + +extern IpcSemaphoreId WaitIOSemId; + +long *PrivateRefCount; /* also used in freelist.c */ +long *LastRefCount; /* refcounts of last ExecMain level */ + +/* + * Data Structures: + * buffers live in a freelist and a lookup data structure. + * + * + * Buffer Lookup: + * Two important notes. First, the buffer has to be + * available for lookup BEFORE an IO begins. Otherwise + * a second process trying to read the buffer will + * allocate its own copy and the buffer pool will + * become inconsistent. + * + * Buffer Replacement: + * see freelist.c. A buffer cannot be replaced while in + * use either by data manager or during IO. + * + * WriteBufferBack: + * currently, a buffer is only written back at the time + * it is selected for replacement. It should + * be done sooner if possible to reduce latency of + * BufferAlloc(). Maybe there should be a daemon process. + * + * Synchronization/Locking: + * + * BufMgrLock lock -- must be acquired before manipulating the + * buffer queues (lookup/freelist). Must be released + * before exit and before doing any IO. + * + * IO_IN_PROGRESS -- this is a flag in the buffer descriptor. + * It must be set when an IO is initiated and cleared at + * the end of the IO. It is there to make sure that one + * process doesn't start to use a buffer while another is + * faulting it in. see WaitIO/SignalIO in bufmgr.c. 
+ * + * refcount -- A buffer is pinned during IO and immediately + * after a BufferAlloc(). A buffer is always either pinned + * or on the freelist but never both. The buffer must be + * released, written, or flushed before the end of + * transaction. + * + * PrivateRefCount -- Each buffer also has a private refcount that keeps + * track of the number of times the buffer is pinned in the current + * process. This is used for two purposes: first, if we pin a + * buffer more than once, we only need to change the shared refcount + * once, thus only lock the buffer pool once; second, when a transaction + * aborts, it should only unpin the buffers exactly the number of times it + * has pinned them, so that it will not blow away buffers of another + * backend. + * + */ + +SPINLOCK BufMgrLock; + +/* delayed write: TRUE on, FALSE off */ +int LateWrite = TRUE; + +int ReadBufferCount; +int BufferHitCount; +int BufferFlushCount; + + +/* + * Initialize module: + * + * should calculate size of pool dynamically based on the + * amount of available memory. 
+ */ +void +InitBufferPool(IPCKey key) +{ + bool foundBufs,foundDescs; + int i; + + Data_Descriptors = NBuffers; + Free_List_Descriptor = Data_Descriptors; + Lookup_List_Descriptor = Data_Descriptors + 1; + Num_Descriptors = Data_Descriptors + 1; + + SpinAcquire(BufMgrLock); + +#ifdef BMTRACE + CurTraceBuf = (long *) ShmemInitStruct("Buffer trace", + (BMT_LIMIT * sizeof(bmtrace)) + sizeof(long), + &foundDescs); + if (!foundDescs) + memset(CurTraceBuf, 0, (BMT_LIMIT * sizeof(bmtrace)) + sizeof(long)); + + TraceBuf = (bmtrace *) &(CurTraceBuf[1]); +#endif + + BufferDescriptors = (BufferDesc *) + ShmemInitStruct("Buffer Descriptors", + Num_Descriptors*sizeof(BufferDesc),&foundDescs); + + BufferBlocks = (BufferBlock) + ShmemInitStruct("Buffer Blocks", + NBuffers*BLCKSZ,&foundBufs); + +#ifndef HAS_TEST_AND_SET + { + bool foundNWaitIO; + + NWaitIOBackendP = (long *)ShmemInitStruct("#Backends Waiting IO", + sizeof(long), + &foundNWaitIO); + if (!foundNWaitIO) + *NWaitIOBackendP = 0; + } +#endif + + if (foundDescs || foundBufs) { + + /* both should be present or neither */ + Assert(foundDescs && foundBufs); + + } else { + BufferDesc *buf; + unsigned long block; + + buf = BufferDescriptors; + block = (unsigned long) BufferBlocks; + + /* + * link the buffers into a circular, doubly-linked list to + * initialize free list. Still don't know anything about + * replacement strategy in this file. 
+ */ + for (i = 0; i < Data_Descriptors; block+=BLCKSZ,buf++,i++) { + Assert(ShmemIsValid((unsigned long)block)); + + buf->freeNext = i+1; + buf->freePrev = i-1; + + CLEAR_BUFFERTAG(&(buf->tag)); + buf->data = MAKE_OFFSET(block); + buf->flags = (BM_DELETED | BM_FREE | BM_VALID); + buf->refcount = 0; + buf->buf_id = i; +#ifdef HAS_TEST_AND_SET + S_INIT_LOCK(&(buf->io_in_progress_lock)); +#endif + } + + /* close the circular queue */ + BufferDescriptors[0].freePrev = Data_Descriptors-1; + BufferDescriptors[Data_Descriptors-1].freeNext = 0; + } + + /* Init the rest of the module */ + InitBufTable(); + InitFreeList(!foundDescs); + + SpinRelease(BufMgrLock); + +#ifndef HAS_TEST_AND_SET + { + int status; + WaitIOSemId = IpcSemaphoreCreate(IPCKeyGetWaitIOSemaphoreKey(key), + 1, IPCProtection, 0, 1, &status); + } +#endif + /* + * Per-backend pin counts are kept in this process's own heap, one + * zero-initialized slot per shared buffer. calloc can fail; stop + * here rather than dereference a NULL array on the first pin. + */ + PrivateRefCount = (long *) calloc(NBuffers, sizeof(long)); + LastRefCount = (long *) calloc(NBuffers, sizeof(long)); + if (PrivateRefCount == NULL || LastRefCount == NULL) + elog(FATAL, "InitBufferPool: out of memory for buffer refcounts"); +} + +/* ----------------------------------------------------- + * BufferShmemSize + * + * compute the size of shared memory for the buffer pool including + * data pages, buffer descriptors, hash tables, etc. 
+ * ---------------------------------------------------- + */ +int +BufferShmemSize() +{ + int size = 0; + int nbuckets; + int nsegs; + int tmp; + + nbuckets = 1 << (int)my_log2((NBuffers - 1) / DEF_FFACTOR + 1); + nsegs = 1 << (int)my_log2((nbuckets - 1) / DEF_SEGSIZE + 1); + + /* size of shmem binding table */ + size += MAXALIGN(my_log2(BTABLE_SIZE) * sizeof(void *)); /* HTAB->dir */ + size += MAXALIGN(sizeof(HHDR)); /* HTAB->hctl */ + size += MAXALIGN(DEF_SEGSIZE * sizeof(SEGMENT)); + size += BUCKET_ALLOC_INCR * + (MAXALIGN(sizeof(BUCKET_INDEX)) + + MAXALIGN(BTABLE_KEYSIZE) + + MAXALIGN(BTABLE_DATASIZE)); + + /* size of buffer descriptors */ + size += MAXALIGN((NBuffers + 1) * sizeof(BufferDesc)); + + /* size of data pages */ + size += NBuffers * MAXALIGN(BLCKSZ); + + /* size of buffer hash table */ + size += MAXALIGN(my_log2(NBuffers) * sizeof(void *)); /* HTAB->dir */ + size += MAXALIGN(sizeof(HHDR)); /* HTAB->hctl */ + size += nsegs * MAXALIGN(DEF_SEGSIZE * sizeof(SEGMENT)); + tmp = (int)ceil((double)NBuffers/BUCKET_ALLOC_INCR); + size += tmp * BUCKET_ALLOC_INCR * + (MAXALIGN(sizeof(BUCKET_INDEX)) + + MAXALIGN(sizeof(BufferTag)) + + MAXALIGN(sizeof(Buffer))); + +#ifdef BMTRACE + size += (BMT_LIMIT * sizeof(bmtrace)) + sizeof(long); +#endif + return size; +} + + diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c new file mode 100644 index 00000000000..502ded954ed --- /dev/null +++ b/src/backend/storage/buffer/buf_table.c @@ -0,0 +1,162 @@ +/*------------------------------------------------------------------------- + * + * buf_table.c-- + * routines for finding buffers in the buffer pool. 
+ * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/buffer/buf_table.c,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +/* + * OLD COMMENTS + * + * Data Structures: + * + * Buffers are identified by their BufferTag (buf.h). This + * file contains routines for allocating a shmem hash table to + * map buffer tags to buffer descriptors. + * + * Synchronization: + * + * All routines in this file assume buffer manager spinlock is + * held by their caller. + */ +#include "storage/bufmgr.h" +#include "storage/buf_internals.h" /* where the declarations go */ +#include "storage/shmem.h" +#include "storage/spin.h" +#include "utils/hsearch.h" +#include "utils/elog.h" + +static HTAB *SharedBufHash; + +extern HTAB *ShmemInitHash(); + +typedef struct lookup { + BufferTag key; + Buffer id; +} LookupEnt; + +/* + * Initialize shmem hash table for mapping buffers + */ +void +InitBufTable() +{ + HASHCTL info; + int hash_flags; + + /* assume lock is held */ + + /* BufferTag maps to Buffer */ + info.keysize = sizeof(BufferTag); + info.datasize = sizeof(Buffer); + info.hash = tag_hash; + + hash_flags = (HASH_ELEM | HASH_FUNCTION); + + + SharedBufHash = (HTAB *) ShmemInitHash("Shared Buf Lookup Table", + NBuffers,NBuffers, + &info,hash_flags); + + if (! SharedBufHash) { + elog(FATAL,"couldn't initialize shared buffer pool Hash Tbl"); + exit(1); + } + +} + +BufferDesc * +BufTableLookup(BufferTag *tagPtr) +{ + LookupEnt * result; + bool found; + + if (tagPtr->blockNum == P_NEW) + return(NULL); + + result = (LookupEnt *) + hash_search(SharedBufHash,(char *) tagPtr,HASH_FIND,&found); + + if (! result){ + elog(WARN,"BufTableLookup: BufferLookup table corrupted"); + return(NULL); + } + if (! 
found) { + return(NULL); + } + return(&(BufferDescriptors[result->id])); +} + +/* + * BufTableDelete + */ +bool +BufTableDelete(BufferDesc *buf) +{ + LookupEnt * result; + bool found; + + /* buffer not initialized or has been removed from + * table already. BM_DELETED keeps us from removing + * buffer twice. + */ + if (buf->flags & BM_DELETED) { + return(TRUE); + } + + buf->flags |= BM_DELETED; + + result = (LookupEnt *) + hash_search(SharedBufHash,(char *) &(buf->tag),HASH_REMOVE,&found); + + if (! (result && found)) { + elog(WARN,"BufTableDelete: BufferLookup table corrupted"); + return(FALSE); + } + + return(TRUE); +} + +bool +BufTableInsert(BufferDesc *buf) +{ + LookupEnt * result; + bool found; + + /* cannot insert it twice */ + Assert (buf->flags & BM_DELETED); + buf->flags &= ~(BM_DELETED); + + result = (LookupEnt *) + hash_search(SharedBufHash,(char *) &(buf->tag),HASH_ENTER,&found); + + if (! result) { + Assert(0); + elog(WARN,"BufTableInsert: BufferLookup table corrupted"); + return(FALSE); + } + /* found something else in the table ! 
*/ + if (found) { + Assert(0); + elog(WARN,"BufTableInsert: BufferLookup table corrupted"); + return(FALSE); + } + + result->id = buf->buf_id; + return(TRUE); +} + +/* prints out collision stats for the buf table */ +void +DBG_LookupListCheck(int nlookup) +{ + nlookup = 10; + + hash_stats("Shared",SharedBufHash); +} diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c new file mode 100644 index 00000000000..655f1f408e0 --- /dev/null +++ b/src/backend/storage/buffer/bufmgr.c @@ -0,0 +1,1581 @@ +/*------------------------------------------------------------------------- + * + * bufmgr.c-- + * buffer manager interface routines + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +/* + * + * BufferAlloc() -- lookup a buffer in the buffer table. If + * it isn't there add it, but do not read it into memory. + * This is used when we are about to reinitialize the + * buffer so don't care what the current disk contents are. + * BufferAlloc() pins the new buffer in memory. + * + * ReadBuffer() -- same as BufferAlloc() but reads the data + * on a buffer cache miss. + * + * ReleaseBuffer() -- unpin the buffer + * + * WriteNoReleaseBuffer() -- mark the buffer contents as "dirty" + * but don't unpin. The disk IO is delayed until buffer + * replacement if LateWrite flag is set. + * + * WriteBuffer() -- WriteNoReleaseBuffer() + ReleaseBuffer() + * + * DirtyBufferCopy() -- For a given dbid/relid/blockno, if the buffer is + * in the cache and is dirty, mark it clean and copy + * it to the requested location. This is a logical + * write, and has been installed to support the cache + * management code for write-once storage managers. + * + * FlushBuffer() -- as above but never delayed write. 
+ * + * BufferSync() -- flush all dirty buffers in the buffer pool. + * + * InitBufferPool() -- Init the buffer module. + * + * See other files: + * freelist.c -- chooses victim for buffer replacement + * buf_table.c -- manages the buffer lookup table + */ +#include <sys/file.h> +#include <stdio.h> +#include <math.h> +#include <signal.h> + +/* declarations split between these three files */ +#include "storage/buf.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" + +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/shmem.h" +#include "storage/spin.h" +#include "storage/smgr.h" +#include "storage/lmgr.h" +#include "miscadmin.h" +#include "utils/builtins.h" +#include "utils/hsearch.h" +#include "utils/elog.h" +#include "utils/palloc.h" +#include "utils/memutils.h" +#include "executor/execdebug.h" /* for NDirectFileRead */ +#include "catalog/catalog.h" + +extern int LateWrite; +extern SPINLOCK BufMgrLock; +extern int ReadBufferCount; +extern int BufferHitCount; +extern int BufferFlushCount; + +static void WaitIO(BufferDesc *buf, SPINLOCK spinlock); +#ifndef HAS_TEST_AND_SET +static void SignalIO(BufferDesc *buf); +extern long *NWaitIOBackendP; /* defined in buf_init.c */ +#endif /* HAS_TEST_AND_SET */ + +static Buffer ReadBufferWithBufferLock(Relation relation, BlockNumber blockNum, + bool bufferLockHeld); +static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum, + bool *foundPtr, bool bufferLockHeld); +static int FlushBuffer(Buffer buffer); +static void BufferSync(void); +static int BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld); + +/* --------------------------------------------------- + * RelationGetBufferWithBuffer + * see if the given buffer is what we want + * if yes, we don't need to bother the buffer manager + * --------------------------------------------------- + */ +Buffer +RelationGetBufferWithBuffer(Relation relation, + BlockNumber blockNumber, + Buffer buffer) +{ + BufferDesc *bufHdr; + LRelId 
lrelId; + + if (BufferIsValid(buffer)) { + if (!BufferIsLocal(buffer)) { + bufHdr = &BufferDescriptors[buffer-1]; + lrelId = RelationGetLRelId(relation); + SpinAcquire(BufMgrLock); + if (bufHdr->tag.blockNum == blockNumber && + bufHdr->tag.relId.relId == lrelId.relId && + bufHdr->tag.relId.dbId == lrelId.dbId) { + SpinRelease(BufMgrLock); + return(buffer); + } + return(ReadBufferWithBufferLock(relation, blockNumber, true)); + } else { + bufHdr = &LocalBufferDescriptors[-buffer-1]; + if (bufHdr->tag.relId.relId == relation->rd_id && + bufHdr->tag.blockNum == blockNumber) { + return(buffer); + } + } + } + return(ReadBuffer(relation, blockNumber)); +} + +/* + * ReadBuffer -- returns a buffer containing the requested + * block of the requested relation. If the blknum + * requested is P_NEW, extend the relation file and + * allocate a new block. + * + * Returns: the buffer number for the buffer containing + * the block read or NULL on an error. + * + * Assume when this function is called, that reln has been + * opened already. 
+ */ + +extern int ShowPinTrace; + + +#undef ReadBuffer /* conflicts with macro when BUFMGR_DEBUG defined */ + +/* + * ReadBuffer -- + * + */ +Buffer +ReadBuffer(Relation reln, BlockNumber blockNum) +{ + return ReadBufferWithBufferLock(reln, blockNum, false); +} + +/* + * is_userbuffer + * + * XXX caller must have already acquired BufMgrLock + */ +static bool +is_userbuffer(Buffer buffer) +{ + BufferDesc *buf = &BufferDescriptors[buffer-1]; + + if (IsSystemRelationName(buf->sb_relname)) + return false; + return true; +} + +Buffer +ReadBuffer_Debug(char *file, + int line, + Relation reln, + BlockNumber blockNum) +{ + Buffer buffer; + + buffer = ReadBufferWithBufferLock(reln, blockNum, false); + if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer)) { + BufferDesc *buf = &BufferDescriptors[buffer-1]; + + fprintf(stderr, "PIN(RD) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + buffer, buf->sb_relname, buf->tag.blockNum, + PrivateRefCount[buffer - 1], file, line); + } + return buffer; +} + +/* + * ReadBufferWithBufferLock -- does the work of + * ReadBuffer() but with the possibility that + * the buffer lock has already been held. this + * is yet another effort to reduce the number of + * semops in the system. + */ +static Buffer +ReadBufferWithBufferLock(Relation reln, + BlockNumber blockNum, + bool bufferLockHeld) +{ + BufferDesc *bufHdr; + int extend; /* extending the file by one block */ + int status; + bool found; + bool isLocalBuf; + + extend = (blockNum == P_NEW); + isLocalBuf = reln->rd_islocal; + + if (isLocalBuf) { + bufHdr = LocalBufferAlloc(reln, blockNum, &found); + } else { + ReadBufferCount++; + + /* lookup the buffer. IO_IN_PROGRESS is set if the requested + * block is not currently in memory. 
+ */ + bufHdr = BufferAlloc(reln, blockNum, &found, bufferLockHeld); + if (found) BufferHitCount++; + } + + if (!bufHdr) { + return(InvalidBuffer); + } + + /* if its already in the buffer pool, we're done */ + if (found) { + /* + * This happens when a bogus buffer was returned previously and is + * floating around in the buffer pool. A routine calling this would + * want this extended. + */ + if (extend) { + /* new buffers are zero-filled */ + memset((char *) MAKE_PTR(bufHdr->data), 0, BLCKSZ); + (void) smgrextend(bufHdr->bufsmgr, reln, + (char *) MAKE_PTR(bufHdr->data)); + } + return (BufferDescriptorGetBuffer(bufHdr)); + + } + + /* + * if we have gotten to this point, the reln pointer must be ok + * and the relation file must be open. + */ + if (extend) { + /* new buffers are zero-filled */ + (void) memset((char *) MAKE_PTR(bufHdr->data), 0, BLCKSZ); + status = smgrextend(bufHdr->bufsmgr, reln, + (char *) MAKE_PTR(bufHdr->data)); + } else { + status = smgrread(bufHdr->bufsmgr, reln, blockNum, + (char *) MAKE_PTR(bufHdr->data)); + } + + if (isLocalBuf) + return (BufferDescriptorGetBuffer(bufHdr)); + + /* lock buffer manager again to update IO IN PROGRESS */ + SpinAcquire(BufMgrLock); + + if (status == SM_FAIL) { + /* IO Failed. cleanup the data structures and go home */ + + if (! BufTableDelete(bufHdr)) { + SpinRelease(BufMgrLock); + elog(FATAL,"BufRead: buffer table broken after IO error\n"); + } + /* remember that BufferAlloc() pinned the buffer */ + UnpinBuffer(bufHdr); + + /* + * Have to reset the flag so that anyone waiting for + * the buffer can tell that the contents are invalid. + */ + bufHdr->flags |= BM_IO_ERROR; + + } else { + /* IO Succeeded. 
clear the flags, finish buffer update */ + + bufHdr->flags &= ~(BM_IO_ERROR | BM_IO_IN_PROGRESS); + } + + /* If anyone was waiting for IO to complete, wake them up now */ +#ifdef HAS_TEST_AND_SET + S_UNLOCK(&(bufHdr->io_in_progress_lock)); +#else + if (bufHdr->refcount > 1) + SignalIO(bufHdr); +#endif + + SpinRelease(BufMgrLock); + + return(BufferDescriptorGetBuffer(bufHdr)); +} + +/* + * BufferAlloc -- Get a buffer from the buffer pool but dont + * read it. + * + * Returns: descriptor for buffer + * + * When this routine returns, the BufMgrLock is guaranteed NOT be held. + */ +static BufferDesc * +BufferAlloc(Relation reln, + BlockNumber blockNum, + bool *foundPtr, + bool bufferLockHeld) +{ + BufferDesc *buf, *buf2; + BufferTag newTag; /* identity of requested block */ + bool inProgress; /* buffer undergoing IO */ + bool newblock = FALSE; + + /* create a new tag so we can lookup the buffer */ + /* assume that the relation is already open */ + if (blockNum == P_NEW) { + newblock = TRUE; + blockNum = smgrnblocks(reln->rd_rel->relsmgr, reln); + } + + INIT_BUFFERTAG(&newTag,reln,blockNum); + + if (!bufferLockHeld) + SpinAcquire(BufMgrLock); + + /* see if the block is in the buffer pool already */ + buf = BufTableLookup(&newTag); + if (buf != NULL) { + /* Found it. Now, (a) pin the buffer so no + * one steals it from the buffer pool, + * (b) check IO_IN_PROGRESS, someone may be + * faulting the buffer into the buffer pool. + */ + + PinBuffer(buf); + inProgress = (buf->flags & BM_IO_IN_PROGRESS); + + *foundPtr = TRUE; + if (inProgress) { + WaitIO(buf, BufMgrLock); + if (buf->flags & BM_IO_ERROR) { + /* wierd race condition: + * + * We were waiting for someone else to read the buffer. + * While we were waiting, the reader boof'd in some + * way, so the contents of the buffer are still + * invalid. By saying that we didn't find it, we can + * make the caller reinitialize the buffer. If two + * processes are waiting for this block, both will + * read the block. 
The second one to finish may overwrite + * any updates made by the first. (Assume higher level + * synchronization prevents this from happening). + * + * This is never going to happen, don't worry about it. + */ + *foundPtr = FALSE; + } + } +#ifdef BMTRACE + _bm_trace((reln->rd_rel->relisshared ? 0 : MyDatabaseId), reln->rd_id, blockNum, BufferDescriptorGetBuffer(buf), BMT_ALLOCFND); +#endif /* BMTRACE */ + + SpinRelease(BufMgrLock); + + return(buf); + } + + *foundPtr = FALSE; + + /* + * Didn't find it in the buffer pool. We'll have + * to initialize a new buffer. First, grab one from + * the free list. If it's dirty, flush it to disk. + * Remember to unlock BufMgr spinlock while doing the IOs. + */ + inProgress = FALSE; + for (buf = (BufferDesc *) NULL; buf == (BufferDesc *) NULL; ) { + + /* GetFreeBuffer will abort if it can't find a free buffer */ + buf = GetFreeBuffer(); + + /* + * There should be exactly one pin on the buffer after + * it is allocated -- ours. If it had a pin it wouldn't + * have been on the free list. No one else could have + * pinned it between GetFreeBuffer and here because we + * have the BufMgrLock. + */ + Assert(buf->refcount == 0); + buf->refcount = 1; + PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 1; + + if (buf->flags & BM_DIRTY) { + /* + * Set BM_IO_IN_PROGRESS to keep anyone from doing anything + * with the contents of the buffer while we write it out. + * We don't really care if they try to read it, but if they + * can complete a BufferAlloc on it they can then scribble + * into it, and we'd really like to avoid that while we are + * flushing the buffer. Setting this flag should block them + * in WaitIO until we're done. + */ + inProgress = TRUE; + buf->flags |= BM_IO_IN_PROGRESS; +#ifdef HAS_TEST_AND_SET + /* + * All code paths that acquire this lock pin the buffer + * first; since no one had it pinned (it just came off the + * free list), no one else can have this lock. 
+ */ + Assert(S_LOCK_FREE(&(buf->io_in_progress_lock))); + S_LOCK(&(buf->io_in_progress_lock)); +#endif /* HAS_TEST_AND_SET */ + + /* + * Write the buffer out, being careful to release BufMgrLock + * before starting the I/O. + * + * This #ifndef is here because a few extra semops REALLY kill + * you on machines that don't have spinlocks. If you don't + * operate with much concurrency, well... + */ + (void) BufferReplace(buf, true); + BufferFlushCount++; +#ifndef OPTIMIZE_SINGLE + SpinAcquire(BufMgrLock); +#endif /* OPTIMIZE_SINGLE */ + + /* + * Somebody could have pinned the buffer while we were + * doing the I/O and had given up the BufMgrLock (though + * they would be waiting for us to clear the BM_IO_IN_PROGRESS + * flag). That's why this is a loop -- if so, we need to clear + * the I/O flags, remove our pin and start all over again. + * + * People may be making buffers free at any time, so there's + * no reason to think that we have an immediate disaster on + * our hands. + */ + if (buf->refcount > 1) { + inProgress = FALSE; + buf->flags &= ~BM_IO_IN_PROGRESS; +#ifdef HAS_TEST_AND_SET + S_UNLOCK(&(buf->io_in_progress_lock)); +#else /* !HAS_TEST_AND_SET */ + if (buf->refcount > 1) + SignalIO(buf); +#endif /* !HAS_TEST_AND_SET */ + PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0; + buf->refcount--; + buf = (BufferDesc *) NULL; + } + + /* + * Somebody could have allocated another buffer for the + * same block we are about to read in. (While we flush out + * the dirty buffer, we don't hold the lock and someone could + * have allocated another buffer for the same block. The problem + * is we haven't gotten around to insert the new tag into + * the buffer table. So we need to check here. -ay 3/95 + */ + buf2 = BufTableLookup(&newTag); + if (buf2 != NULL) { + /* Found it. Someone has already done what we're about + * to do. We'll just handle this as if it were found in + * the buffer pool in the first place. 
+ */ + + PinBuffer(buf2); + inProgress = (buf2->flags & BM_IO_IN_PROGRESS); + + *foundPtr = TRUE; + if (inProgress) { + WaitIO(buf2, BufMgrLock); + if (buf2->flags & BM_IO_ERROR) { + *foundPtr = FALSE; + } + } + + /* + * 'buf' is NULL here if the victim buffer was abandoned + * above because someone else pinned it during the flush; + * its I/O lock, pin and IO flag were already released + * there, so only give the victim back if we still own it. + */ + if (buf != NULL) { +#ifdef HAS_TEST_AND_SET + S_UNLOCK(&(buf->io_in_progress_lock)); +#else /* !HAS_TEST_AND_SET */ + if (buf->refcount > 1) + SignalIO(buf); +#endif /* !HAS_TEST_AND_SET */ + + /* give up the buffer since we don't need it any more */ + buf->refcount--; + PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0; + AddBufferToFreelist(buf); + buf->flags |= BM_FREE; + buf->flags &= ~BM_DIRTY; + buf->flags &= ~BM_IO_IN_PROGRESS; + } + + SpinRelease(BufMgrLock); + + return(buf2); + } + } + } + /* + * At this point we should have the sole pin on a non-dirty + * buffer and we may or may not already have the BM_IO_IN_PROGRESS + * flag set. + */ + + /* + * Change the name of the buffer in the lookup table: + * + * Need to update the lookup table before the read starts. + * If someone comes along looking for the buffer while + * we are reading it in, we don't want them to allocate + * a new buffer. For the same reason, we didn't want + * to erase the buf table entry for the buffer we were + * writing back until now, either. + */ + + if (! BufTableDelete(buf)) { + SpinRelease(BufMgrLock); + elog(FATAL,"buffer wasn't in the buffer table\n"); + + } + + if (buf->flags & BM_DIRTY) { + /* must clear flag first because of weird race + * condition described below. + */ + buf->flags &= ~BM_DIRTY; + } + + /* record the database name and relation name for this buffer */ + buf->sb_relname = pstrdup(reln->rd_rel->relname.data); + buf->sb_dbname = pstrdup(GetDatabaseName()); + + /* remember which storage manager is responsible for it */ + buf->bufsmgr = reln->rd_rel->relsmgr; + + INIT_BUFFERTAG(&(buf->tag),reln,blockNum); + if (! BufTableInsert(buf)) { + SpinRelease(BufMgrLock); + elog(FATAL,"Buffer in lookup table twice \n"); + } + + /* Buffer contents are currently invalid. 
Have + * to mark IO IN PROGRESS so no one fiddles with + * them until the read completes. If this routine + * has been called simply to allocate a buffer, no + * io will be attempted, so the flag isn't set. + */ + if (!inProgress) { + buf->flags |= BM_IO_IN_PROGRESS; +#ifdef HAS_TEST_AND_SET + Assert(S_LOCK_FREE(&(buf->io_in_progress_lock))); + S_LOCK(&(buf->io_in_progress_lock)); +#endif /* HAS_TEST_AND_SET */ + } + +#ifdef BMTRACE + _bm_trace((reln->rd_rel->relisshared ? 0 : MyDatabaseId), reln->rd_id, blockNum, BufferDescriptorGetBuffer(buf), BMT_ALLOCNOTFND); +#endif /* BMTRACE */ + + SpinRelease(BufMgrLock); + + return (buf); +} + +/* + * WriteBuffer-- + * + * Pushes buffer contents to disk if LateWrite is + * not set. Otherwise, marks contents as dirty. + * + * Assume that buffer is pinned. Assume that reln is + * valid. + * + * Side Effects: + * Pin count is decremented. + * + * Returns: + * With LateWrite off, whatever FlushBuffer() returns. + * With LateWrite on: the result of WriteLocalBuffer() for a + * local buffer, FALSE for a bad buffer id, TRUE otherwise. + * NOTE(review): the two paths use different conventions + * (STATUS_* vs TRUE/FALSE) -- callers should not mix them up. + */ + +#undef WriteBuffer + +int +WriteBuffer(Buffer buffer) +{ + BufferDesc *bufHdr; + + if (! LateWrite) { + return(FlushBuffer(buffer)); + } else { + + if (BufferIsLocal(buffer)) + return WriteLocalBuffer(buffer, TRUE); + + if (BAD_BUFFER_ID(buffer)) + return(FALSE); + + bufHdr = &BufferDescriptors[buffer-1]; + + SpinAcquire(BufMgrLock); + Assert(bufHdr->refcount > 0); + bufHdr->flags |= BM_DIRTY; + UnpinBuffer(bufHdr); + SpinRelease(BufMgrLock); + } + return(TRUE); +} + +/* + * WriteBuffer_Debug -- WriteBuffer() plus an optional unpin trace. + * + * When ShowPinTrace is set, reports the unpin of a shared (non-local) + * user-relation buffer on stderr, tagged with the caller's file/line. + * Local buffers are skipped, as in ReadBuffer_Debug(): their ids are + * negative and must not be used to index BufferDescriptors or + * PrivateRefCount. + */ +void +WriteBuffer_Debug(char *file, int line, Buffer buffer) +{ + WriteBuffer(buffer); + if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer)) { + BufferDesc *buf; + buf = &BufferDescriptors[buffer-1]; + fprintf(stderr, "UNPIN(WR) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + buffer, buf->sb_relname, buf->tag.blockNum, + PrivateRefCount[buffer - 1], file, line); + } +} + +/* + * DirtyBufferCopy() -- Copy a given dirty buffer to the requested + * destination. + * + * We treat this as a write. 
If the requested buffer is in the pool + * and is dirty, we copy it to the location requested and mark it + * clean. This routine supports the Sony jukebox storage manager, + * which agrees to take responsibility for the data once we mark + * it clean. + * + * NOTE: used by sony jukebox code in postgres 4.2 - ay 2/95 + */ +void +DirtyBufferCopy(Oid dbid, Oid relid, BlockNumber blkno, char *dest) +{ + BufferDesc *buf; + BufferTag btag; + + btag.relId.relId = relid; + btag.relId.dbId = dbid; + btag.blockNum = blkno; + + SpinAcquire(BufMgrLock); + buf = BufTableLookup(&btag); + + if (buf == (BufferDesc *) NULL + || !(buf->flags & BM_DIRTY) + || !(buf->flags & BM_VALID)) { + SpinRelease(BufMgrLock); + return; + } + + /* hate to do this holding the lock, but release and reacquire is slower */ + memmove(dest, (char *) MAKE_PTR(buf->data), BLCKSZ); + + buf->flags &= ~BM_DIRTY; + + SpinRelease(BufMgrLock); +} + +/* + * FlushBuffer -- like WriteBuffer, but force the page to disk. + * + * 'buffer' is known to be dirty/pinned, so there should not be a + * problem reading the BufferDesc members without the BufMgrLock + * (nobody should be able to change tags, flags, etc. out from under + * us). + */ +static int +FlushBuffer(Buffer buffer) +{ + BufferDesc *bufHdr; + + if (BufferIsLocal(buffer)) + return FlushLocalBuffer(buffer); + + if (BAD_BUFFER_ID(buffer)) + return (STATUS_ERROR); + + bufHdr = &BufferDescriptors[buffer-1]; + + if (!BufferReplace(bufHdr, false)) { + elog(WARN, "FlushBuffer: cannot flush %d", bufHdr->tag.blockNum); + return (STATUS_ERROR); + } + + SpinAcquire(BufMgrLock); + bufHdr->flags &= ~BM_DIRTY; + UnpinBuffer(bufHdr); + SpinRelease(BufMgrLock); + + return(STATUS_OK); +} + +/* + * WriteNoReleaseBuffer -- like WriteBuffer, but do not unpin the buffer + * when the operation is complete. 
+ * + * We know that the buffer is for a relation in our private cache, + * because this routine is called only to write out buffers that + * were changed by the executing backend. + */ +int +WriteNoReleaseBuffer(Buffer buffer) +{ + BufferDesc *bufHdr; + + if (! LateWrite) { + return(FlushBuffer(buffer)); + } else { + + if (BufferIsLocal(buffer)) + return WriteLocalBuffer(buffer, FALSE); + + if (BAD_BUFFER_ID(buffer)) + return (STATUS_ERROR); + + bufHdr = &BufferDescriptors[buffer-1]; + + SpinAcquire(BufMgrLock); + bufHdr->flags |= BM_DIRTY; + SpinRelease(BufMgrLock); + } + return(STATUS_OK); +} + + +#undef ReleaseAndReadBuffer +/* + * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer() + * so that only one semop needs to be called. + * + */ +Buffer +ReleaseAndReadBuffer(Buffer buffer, + Relation relation, + BlockNumber blockNum) +{ + BufferDesc *bufHdr; + Buffer retbuf; + + if (BufferIsLocal(buffer)) { + Assert(LocalRefCount[-buffer - 1] > 0); + LocalRefCount[-buffer - 1]--; + } else { + if (BufferIsValid(buffer)) { + bufHdr = &BufferDescriptors[buffer-1]; + Assert(PrivateRefCount[buffer - 1] > 0); + PrivateRefCount[buffer - 1]--; + if (PrivateRefCount[buffer - 1] == 0 && + LastRefCount[buffer - 1] == 0) { + /* only release buffer if it is not pinned in previous ExecMain + level */ + SpinAcquire(BufMgrLock); + bufHdr->refcount--; + if (bufHdr->refcount == 0) { + AddBufferToFreelist(bufHdr); + bufHdr->flags |= BM_FREE; + } + retbuf = ReadBufferWithBufferLock(relation, blockNum, true); + return retbuf; + } + } + } + + return (ReadBuffer(relation, blockNum)); +} + +/* + * BufferSync -- Flush all dirty buffers in the pool. + * + * This is called at transaction commit time. It does the wrong thing, + * right now. We should flush only our own changes to stable storage, + * and we should obey the lock protocol on the buffer manager metadata + * as we do it. Also, we need to be sure that no other transaction is + * modifying the page as we flush it. 
This is only a problem for objects + * that use a non-two-phase locking protocol, like btree indices. For + * those objects, we would like to set a write lock for the duration of + * our IO. Another possibility is to code updates to btree pages + * carefully, so that writing them out out of order cannot cause + * any unrecoverable errors. + * + * I don't want to think hard about this right now, so I will try + * to come back to it later. + */ +static void +BufferSync() +{ + int i; + Oid bufdb; + Oid bufrel; + Relation reln; + BufferDesc *bufHdr; + int status; + + SpinAcquire(BufMgrLock); + for (i=0, bufHdr = BufferDescriptors; i < NBuffers; i++, bufHdr++) { + if ((bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY)) { + bufdb = bufHdr->tag.relId.dbId; + bufrel = bufHdr->tag.relId.relId; + if (bufdb == MyDatabaseId || bufdb == (Oid) 0) { + reln = RelationIdCacheGetRelation(bufrel); + + /* + * If we didn't have the reldesc in our local cache, flush this + * page out using the 'blind write' storage manager routine. If + * we did find it, use the standard interface. + */ + +#ifndef OPTIMIZE_SINGLE + SpinRelease(BufMgrLock); +#endif /* OPTIMIZE_SINGLE */ + if (reln == (Relation) NULL) { + status = smgrblindwrt(bufHdr->bufsmgr, bufHdr->sb_dbname, + bufHdr->sb_relname, bufdb, bufrel, + bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + } else { + status = smgrwrite(bufHdr->bufsmgr, reln, + bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + } +#ifndef OPTIMIZE_SINGLE + SpinAcquire(BufMgrLock); +#endif /* OPTIMIZE_SINGLE */ + + if (status == SM_FAIL) { + elog(WARN, "cannot write %d for %16s", + bufHdr->tag.blockNum, bufHdr->sb_relname); + } + + bufHdr->flags &= ~BM_DIRTY; + if (reln != (Relation)NULL) + RelationDecrementReferenceCount(reln); + } + } + } + SpinRelease(BufMgrLock); + + LocalBufferSync(); +} + + +/* + * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' + * is cleared. 
Because IO_IN_PROGRESS conflicts are + * expected to be rare, there is only one BufferIO + * lock in the entire system. All processes block + * on this semaphore when they try to use a buffer + * that someone else is faulting in. Whenever a + * process finishes an IO and someone is waiting for + * the buffer, BufferIO is signaled (SignalIO). All + * waiting processes then wake up and check to see + * if their buffer is now ready. This implementation + * is simple, but efficient enough if WaitIO is + * rarely called by multiple processes simultaneously. + * + * ProcSleep atomically releases the spinlock and goes to + * sleep. + * + * Note: there is an easy fix if the queue becomes long. + * save the id of the buffer we are waiting for in + * the queue structure. That way signal can figure + * out which proc to wake up. + */ +#ifdef HAS_TEST_AND_SET +static void +WaitIO(BufferDesc *buf, SPINLOCK spinlock) +{ + SpinRelease(spinlock); + S_LOCK(&(buf->io_in_progress_lock)); + S_UNLOCK(&(buf->io_in_progress_lock)); + SpinAcquire(spinlock); +} + +#else /* HAS_TEST_AND_SET */ +IpcSemaphoreId WaitIOSemId; + +static void +WaitIO(BufferDesc *buf, SPINLOCK spinlock) +{ + bool inProgress; + + for (;;) { + + /* wait until someone releases IO lock */ + (*NWaitIOBackendP)++; + SpinRelease(spinlock); + IpcSemaphoreLock(WaitIOSemId, 0, 1); + SpinAcquire(spinlock); + inProgress = (buf->flags & BM_IO_IN_PROGRESS); + if (!inProgress) break; + } +} + +/* + * SignalIO -- + */ +static void +SignalIO(BufferDesc *buf) +{ + /* somebody better be waiting. */ + Assert( buf->refcount > 1); + IpcSemaphoreUnlock(WaitIOSemId, 0, *NWaitIOBackendP); + *NWaitIOBackendP = 0; +} +#endif /* HAS_TEST_AND_SET */ + +long NDirectFileRead; /* some I/O's are direct file access. bypass bufmgr */ +long NDirectFileWrite; /* e.g., I/O in psort and hashjoin. 
*/ + +void +PrintBufferUsage(FILE *statfp) +{ + float hitrate; + + if (ReadBufferCount==0) + hitrate = 0.0; + else + hitrate = (float)BufferHitCount * 100.0/ReadBufferCount; + + fprintf(statfp, "!\t%ld blocks read, %ld blocks written, buffer hit rate = %.2f%%\n", + ReadBufferCount - BufferHitCount + NDirectFileRead, + BufferFlushCount + NDirectFileWrite, + hitrate); +} + +void +ResetBufferUsage() +{ + BufferHitCount = 0; + ReadBufferCount = 0; + BufferFlushCount = 0; + NDirectFileRead = 0; + NDirectFileWrite = 0; +} + +/* ---------------------------------------------- + * ResetBufferPool + * + * this routine is supposed to be called when a transaction aborts. + * it will release all the buffer pins held by the transaciton. + * + * ---------------------------------------------- + */ +void +ResetBufferPool() +{ + register int i; + for (i=1; i<=NBuffers; i++) { + if (BufferIsValid(i)) { + while(PrivateRefCount[i - 1] > 0) { + ReleaseBuffer(i); + } + } + LastRefCount[i - 1] = 0; + } + + ResetLocalBufferPool(); +} + +/* ----------------------------------------------- + * BufferPoolCheckLeak + * + * check if there is buffer leak + * + * ----------------------------------------------- + */ +int +BufferPoolCheckLeak() +{ + register int i; + void PrintBufferDescs(); + + for (i = 1; i <= NBuffers; i++) { + if (BufferIsValid(i)) { + elog(NOTICE, "buffer leak detected in BufferPoolCheckLeak()"); + PrintBufferDescs(); + return(1); + } + } + return(0); +} + +/* ------------------------------------------------ + * FlushBufferPool + * + * flush all dirty blocks in buffer pool to disk + * + * ------------------------------------------------ + */ +void +FlushBufferPool(int StableMainMemoryFlag) +{ + if (!StableMainMemoryFlag) { + BufferSync(); + smgrcommit(); + } +} + +/* + * BufferIsValid -- + * True iff the refcnt of the local buffer is > 0 + * Note: + * BufferIsValid(InvalidBuffer) is False. + * BufferIsValid(UnknownBuffer) is False. 
+ */ +bool +BufferIsValid(Buffer bufnum) +{ + if (BufferIsLocal(bufnum)) + return (bufnum >= -NLocBuffer && LocalRefCount[-bufnum - 1] > 0); + + if (BAD_BUFFER_ID(bufnum)) + return(false); + + return ((bool)(PrivateRefCount[bufnum - 1] > 0)); +} + +/* + * BufferGetBlockNumber -- + * Returns the block number associated with a buffer. + * + * Note: + * Assumes that the buffer is valid. + */ +BlockNumber +BufferGetBlockNumber(Buffer buffer) +{ + Assert(BufferIsValid(buffer)); + + /* XXX should be a critical section */ + if (BufferIsLocal(buffer)) + return (LocalBufferDescriptors[-buffer-1].tag.blockNum); + else + return (BufferDescriptors[buffer-1].tag.blockNum); +} + +/* + * BufferGetRelation -- + * Returns the relation desciptor associated with a buffer. + * + * Note: + * Assumes buffer is valid. + */ +Relation +BufferGetRelation(Buffer buffer) +{ + Relation relation; + Oid relid; + + Assert(BufferIsValid(buffer)); + Assert(!BufferIsLocal(buffer)); /* not supported for local buffers */ + + /* XXX should be a critical section */ + relid = LRelIdGetRelationId(BufferDescriptors[buffer-1].tag.relId); + relation = RelationIdGetRelation(relid); + + RelationDecrementReferenceCount(relation); + + if (RelationHasReferenceCountZero(relation)) { + /* + elog(NOTICE, "BufferGetRelation: 0->1"); + */ + + RelationIncrementReferenceCount(relation); + } + + return (relation); +} + +/* + * BufferReplace + * + * Flush the buffer corresponding to 'bufHdr' + * + * Assumes that the BufMgrLock has NOT been acquired. + */ +static int +BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld) +{ + Relation reln; + Oid bufdb, bufrel; + int status; + + if (!bufferLockHeld) + SpinAcquire(BufMgrLock); + + /* + * first try to find the reldesc in the cache, if no luck, + * don't bother to build the reldesc from scratch, just do + * a blind write. 
+ */ + + bufdb = bufHdr->tag.relId.dbId; + bufrel = bufHdr->tag.relId.relId; + + if (bufdb == MyDatabaseId || bufdb == (Oid) NULL) + reln = RelationIdCacheGetRelation(bufrel); + else + reln = (Relation) NULL; + + SpinRelease(BufMgrLock); + + if (reln != (Relation) NULL) { + status = smgrflush(bufHdr->bufsmgr, reln, bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + } else { + + /* blind write always flushes */ + status = smgrblindwrt(bufHdr->bufsmgr, bufHdr->sb_dbname, + bufHdr->sb_relname, bufdb, bufrel, + bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + } + + if (status == SM_FAIL) + return (FALSE); + + return (TRUE); +} + +/* + * RelationGetNumberOfBlocks -- + * Returns the buffer descriptor associated with a page in a relation. + * + * Note: + * XXX may fail for huge relations. + * XXX should be elsewhere. + * XXX maybe should be hidden + */ +BlockNumber +RelationGetNumberOfBlocks(Relation relation) +{ + return + ((relation->rd_islocal) ? relation->rd_nblocks : + smgrnblocks(relation->rd_rel->relsmgr, relation)); +} + +/* + * BufferGetBlock -- + * Returns a reference to a disk page image associated with a buffer. + * + * Note: + * Assumes buffer is valid. + */ +Block +BufferGetBlock(Buffer buffer) +{ + Assert(BufferIsValid(buffer)); + + if (BufferIsLocal(buffer)) + return((Block)MAKE_PTR(LocalBufferDescriptors[-buffer-1].data)); + else + return((Block)MAKE_PTR(BufferDescriptors[buffer-1].data)); +} + +/* --------------------------------------------------------------------- + * ReleaseTmpRelBuffers + * + * this function unmarks all the dirty pages of a temporary + * relation in the buffer pool so that at the end of transaction + * these pages will not be flushed. + * XXX currently it sequentially searches the buffer pool, should be + * changed to more clever ways of searching. 
+ * -------------------------------------------------------------------- + */ +void +ReleaseTmpRelBuffers(Relation tempreldesc) +{ + register int i; + int holding = 0; + BufferDesc *buf; + + for (i=1; i<=NBuffers; i++) { + buf = &BufferDescriptors[i-1]; + if (!holding) { + SpinAcquire(BufMgrLock); + holding = 1; + } + if ((buf->flags & BM_DIRTY) && + (buf->tag.relId.dbId == MyDatabaseId) && + (buf->tag.relId.relId == tempreldesc->rd_id)) { + buf->flags &= ~BM_DIRTY; + if (!(buf->flags & BM_FREE)) { + SpinRelease(BufMgrLock); + holding = 0; + ReleaseBuffer(i); + } + } + } + if (holding) + SpinRelease(BufMgrLock); +} + +/* --------------------------------------------------------------------- + * DropBuffers + * + * This function marks all the buffers in the buffer cache for a + * particular database as clean. This is used when we destroy a + * database, to avoid trying to flush data to disk when the directory + * tree no longer exists. + * + * This is an exceedingly non-public interface. + * -------------------------------------------------------------------- + */ +void +DropBuffers(Oid dbid) +{ + register int i; + BufferDesc *buf; + + SpinAcquire(BufMgrLock); + for (i=1; i<=NBuffers; i++) { + buf = &BufferDescriptors[i-1]; + if ((buf->tag.relId.dbId == dbid) && (buf->flags & BM_DIRTY)) { + buf->flags &= ~BM_DIRTY; + } + } + SpinRelease(BufMgrLock); +} + +/* ----------------------------------------------------------------- + * PrintBufferDescs + * + * this function prints all the buffer descriptors, for debugging + * use only. 
+ * ----------------------------------------------------------------- + */ +void +PrintBufferDescs() +{ + int i; + BufferDesc *buf = BufferDescriptors; + + if (IsUnderPostmaster) { + SpinAcquire(BufMgrLock); + for (i = 0; i < NBuffers; ++i, ++buf) { + elog(NOTICE, "[%02d] (freeNext=%d, freePrev=%d, relname=%.*s, \ +blockNum=%d, flags=0x%x, refcount=%d %d)", + i, buf->freeNext, buf->freePrev, NAMEDATALEN, + &(buf->sb_relname), buf->tag.blockNum, buf->flags, + buf->refcount, PrivateRefCount[i]); + } + SpinRelease(BufMgrLock); + } else { + /* interactive backend */ + for (i = 0; i < NBuffers; ++i, ++buf) { + printf("[%-2d] (%s, %d) flags=0x%x, refcnt=%d %ld)\n", + i, buf->sb_relname, buf->tag.blockNum, + buf->flags, buf->refcount, PrivateRefCount[i]); + } + } +} + +void +PrintPinnedBufs() +{ + int i; + BufferDesc *buf = BufferDescriptors; + + SpinAcquire(BufMgrLock); + for (i = 0; i < NBuffers; ++i, ++buf) { + if (PrivateRefCount[i] > 0) + elog(NOTICE, "[%02d] (freeNext=%d, freePrev=%d, relname=%.*s, \ +blockNum=%d, flags=0x%x, refcount=%d %d)\n", + i, buf->freeNext, buf->freePrev, NAMEDATALEN, &(buf->sb_relname), + buf->tag.blockNum, buf->flags, + buf->refcount, PrivateRefCount[i]); + } + SpinRelease(BufMgrLock); +} + +/* + * BufferPoolBlowaway + * + * this routine is solely for the purpose of experiments -- sometimes + * you may want to blowaway whatever is left from the past in buffer + * pool and start measuring some performance with a clean empty buffer + * pool. 
+ */ +void +BufferPoolBlowaway() +{ + register int i; + + BufferSync(); + for (i=1; i<=NBuffers; i++) { + if (BufferIsValid(i)) { + while(BufferIsValid(i)) + ReleaseBuffer(i); + } + BufTableDelete(&BufferDescriptors[i-1]); + } +} + +#undef IncrBufferRefCount +#undef ReleaseBuffer + +void +IncrBufferRefCount(Buffer buffer) +{ + if (BufferIsLocal(buffer)) { + Assert(LocalRefCount[-buffer - 1] >= 0); + LocalRefCount[-buffer - 1]++; + } else { + Assert(!BAD_BUFFER_ID(buffer)); + Assert(PrivateRefCount[buffer - 1] >= 0); + PrivateRefCount[buffer - 1]++; + } +} + +/* + * ReleaseBuffer -- remove the pin on a buffer without + * marking it dirty. + * + */ +int +ReleaseBuffer(Buffer buffer) +{ + BufferDesc *bufHdr; + + if (BufferIsLocal(buffer)) { + Assert(LocalRefCount[-buffer - 1] > 0); + LocalRefCount[-buffer - 1]--; + return (STATUS_OK); + } + + if (BAD_BUFFER_ID(buffer)) + return(STATUS_ERROR); + + bufHdr = &BufferDescriptors[buffer-1]; + + Assert(PrivateRefCount[buffer - 1] > 0); + PrivateRefCount[buffer - 1]--; + if (PrivateRefCount[buffer - 1] == 0 && LastRefCount[buffer - 1] == 0) { + /* only release buffer if it is not pinned in previous ExecMain + levels */ + SpinAcquire(BufMgrLock); + bufHdr->refcount--; + if (bufHdr->refcount == 0) { + AddBufferToFreelist(bufHdr); + bufHdr->flags |= BM_FREE; + } + SpinRelease(BufMgrLock); + } + + return(STATUS_OK); +} + +void +IncrBufferRefCount_Debug(char *file, int line, Buffer buffer) +{ + IncrBufferRefCount(buffer); + if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer)) { + BufferDesc *buf = &BufferDescriptors[buffer-1]; + + fprintf(stderr, "PIN(Incr) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + buffer, buf->sb_relname, buf->tag.blockNum, + PrivateRefCount[buffer - 1], file, line); + } +} + +void +ReleaseBuffer_Debug(char *file, int line, Buffer buffer) +{ + ReleaseBuffer(buffer); + if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer)) { + BufferDesc *buf = 
&BufferDescriptors[buffer-1]; + + fprintf(stderr, "UNPIN(Rel) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + buffer, buf->sb_relname, buf->tag.blockNum, + PrivateRefCount[buffer - 1], file, line); + } +} + +int +ReleaseAndReadBuffer_Debug(char *file, + int line, + Buffer buffer, + Relation relation, + BlockNumber blockNum) +{ + bool bufferValid; + Buffer b; + + bufferValid = BufferIsValid(buffer); + b = ReleaseAndReadBuffer(buffer, relation, blockNum); + if (ShowPinTrace && bufferValid && BufferIsLocal(buffer) + && is_userbuffer(buffer)) { + BufferDesc *buf = &BufferDescriptors[buffer-1]; + + fprintf(stderr, "UNPIN(Rel&Rd) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + buffer, buf->sb_relname, buf->tag.blockNum, + PrivateRefCount[buffer - 1], file, line); + } + if (ShowPinTrace && BufferIsLocal(buffer) && is_userbuffer(buffer)) { + BufferDesc *buf = &BufferDescriptors[b-1]; + + fprintf(stderr, "PIN(Rel&Rd) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + b, buf->sb_relname, buf->tag.blockNum, + PrivateRefCount[b - 1], file, line); + } + return b; +} + +#ifdef BMTRACE + +/* + * trace allocations and deallocations in a circular buffer in + * shared memory. check the buffer before doing the allocation, + * and die if there's anything fishy. 
+ */ + +_bm_trace(Oid dbId, Oid relId, int blkNo, int bufNo, int allocType) +{ + static int mypid = 0; + long start, cur; + bmtrace *tb; + + if (mypid == 0) + mypid = getpid(); + + start = *CurTraceBuf; + + if (start > 0) + cur = start - 1; + else + cur = BMT_LIMIT - 1; + + for (;;) { + tb = &TraceBuf[cur]; + if (tb->bmt_op != BMT_NOTUSED) { + if (tb->bmt_buf == bufNo) { + if ((tb->bmt_op == BMT_DEALLOC) + || (tb->bmt_dbid == dbId && tb->bmt_relid == relId + && tb->bmt_blkno == blkNo)) + goto okay; + + /* die holding the buffer lock */ + _bm_die(dbId, relId, blkNo, bufNo, allocType, start, cur); + } + } + + if (cur == start) + goto okay; + + if (cur == 0) + cur = BMT_LIMIT - 1; + else + cur--; + } + + okay: + tb = &TraceBuf[start]; + tb->bmt_pid = mypid; + tb->bmt_buf = bufNo; + tb->bmt_dbid = dbId; + tb->bmt_relid = relId; + tb->bmt_blkno = blkNo; + tb->bmt_op = allocType; + + *CurTraceBuf = (start + 1) % BMT_LIMIT; +} + +_bm_die(Oid dbId, Oid relId, int blkNo, int bufNo, + int allocType, long start, long cur) +{ + FILE *fp; + bmtrace *tb; + int i; + + tb = &TraceBuf[cur]; + + if ((fp = fopen("/tmp/death_notice", "w")) == (FILE *) NULL) + elog(FATAL, "buffer alloc trace error and can't open log file"); + + fprintf(fp, "buffer alloc trace detected the following error:\n\n"); + fprintf(fp, " buffer %d being %s inconsistently with a previous %s\n\n", + bufNo, (allocType == BMT_DEALLOC ? "deallocated" : "allocated"), + (tb->bmt_op == BMT_DEALLOC ? "deallocation" : "allocation")); + + fprintf(fp, "the trace buffer contains:\n"); + + i = start; + for (;;) { + tb = &TraceBuf[i]; + if (tb->bmt_op != BMT_NOTUSED) { + fprintf(fp, " [%3d]%spid %d buf %2d for <%d,%d,%d> ", + i, (i == cur ? 
" ---> " : "\t"), + tb->bmt_pid, tb->bmt_buf, + tb->bmt_dbid, tb->bmt_relid, tb->bmt_blkno); + + switch (tb->bmt_op) { + case BMT_ALLOCFND: + fprintf(fp, "allocate (found)\n"); + break; + + case BMT_ALLOCNOTFND: + fprintf(fp, "allocate (not found)\n"); + break; + + case BMT_DEALLOC: + fprintf(fp, "deallocate\n"); + break; + + default: + fprintf(fp, "unknown op type %d\n", tb->bmt_op); + break; + } + } + + i = (i + 1) % BMT_LIMIT; + if (i == start) + break; + } + + fprintf(fp, "\noperation causing error:\n"); + fprintf(fp, "\tpid %d buf %d for <%d,%d,%d> ", + getpid(), bufNo, dbId, relId, blkNo); + + switch (allocType) { + case BMT_ALLOCFND: + fprintf(fp, "allocate (found)\n"); + break; + + case BMT_ALLOCNOTFND: + fprintf(fp, "allocate (not found)\n"); + break; + + case BMT_DEALLOC: + fprintf(fp, "deallocate\n"); + break; + + default: + fprintf(fp, "unknown op type %d\n", allocType); + break; + } + + (void) fclose(fp); + + kill(getpid(), SIGILL); +} + +#endif /* BMTRACE */ + +void +BufferRefCountReset(int *refcountsave) +{ + int i; + for (i=0; i<NBuffers; i++) { + refcountsave[i] = PrivateRefCount[i]; + LastRefCount[i] += PrivateRefCount[i]; + PrivateRefCount[i] = 0; + } +} + +void +BufferRefCountRestore(int *refcountsave) +{ + int i; + for (i=0; i<NBuffers; i++) { + PrivateRefCount[i] = refcountsave[i]; + LastRefCount[i] -= refcountsave[i]; + refcountsave[i] = 0; + } +} + diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c new file mode 100644 index 00000000000..fabc3c29829 --- /dev/null +++ b/src/backend/storage/buffer/freelist.c @@ -0,0 +1,285 @@ +/*------------------------------------------------------------------------- + * + * freelist.c-- + * routines for manipulating the buffer pool's replacement strategy + * freelist. 
+ * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/buffer/freelist.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +/* + * OLD COMMENTS + * + * Data Structures: + * SharedFreeList is a circular queue. Notice that this + * is a shared memory queue so the next/prev "ptrs" are + * buffer ids, not addresses. + * + * Sync: all routines in this file assume that the buffer + * semaphore has been acquired by the caller. + */ +#include <stdio.h> +#include "storage/bufmgr.h" +#include "storage/buf_internals.h" /* where declarations go */ +#include "storage/spin.h" +#include "utils/elog.h" + + +static BufferDesc *SharedFreeList; + +/* only actually used in debugging. The lock + * should be acquired before calling the freelist manager. + */ +extern SPINLOCK BufMgrLock; + +#define IsInQueue(bf) \ + Assert((bf->freeNext != INVALID_DESCRIPTOR));\ + Assert((bf->freePrev != INVALID_DESCRIPTOR));\ + Assert((bf->flags & BM_FREE)) + +#define NotInQueue(bf) \ + Assert((bf->freeNext == INVALID_DESCRIPTOR));\ + Assert((bf->freePrev == INVALID_DESCRIPTOR));\ + Assert(! (bf->flags & BM_FREE)) + + +/* + * AddBufferToFreelist -- + * + * In theory, this is the only routine that needs to be changed + * if the buffer replacement strategy changes. Just change + * the manner in which buffers are added to the freelist queue. + * Currently, they are added on an LRU basis. 
+ */ +void +AddBufferToFreelist(BufferDesc *bf) +{ +#ifdef BMTRACE + _bm_trace(bf->tag.relId.dbId, bf->tag.relId.relId, bf->tag.blockNum, + BufferDescriptorGetBuffer(bf), BMT_DEALLOC); +#endif /* BMTRACE */ + NotInQueue(bf); + + /* change bf so it points to inFrontOfNew and its successor */ + bf->freePrev = SharedFreeList->freePrev; + bf->freeNext = Free_List_Descriptor; + + /* insert new into chain */ + BufferDescriptors[bf->freeNext].freePrev = bf->buf_id; + BufferDescriptors[bf->freePrev].freeNext = bf->buf_id; +} + +#undef PinBuffer + +/* + * PinBuffer -- make buffer unavailable for replacement. + */ +void +PinBuffer(BufferDesc *buf) +{ + long b; + + /* Assert (buf->refcount < 25); */ + + if (buf->refcount == 0) { + IsInQueue(buf); + + /* remove from freelist queue */ + BufferDescriptors[buf->freeNext].freePrev = buf->freePrev; + BufferDescriptors[buf->freePrev].freeNext = buf->freeNext; + buf->freeNext = buf->freePrev = INVALID_DESCRIPTOR; + + /* mark buffer as no longer free */ + buf->flags &= ~BM_FREE; + } else { + NotInQueue(buf); + } + + b = BufferDescriptorGetBuffer(buf) - 1; + Assert(PrivateRefCount[b] >= 0); + if (PrivateRefCount[b] == 0 && LastRefCount[b] == 0) + buf->refcount++; + PrivateRefCount[b]++; +} + +void +PinBuffer_Debug(char *file, int line, BufferDesc *buf) +{ + PinBuffer(buf); + if (ShowPinTrace) { + Buffer buffer = BufferDescriptorGetBuffer(buf); + + fprintf(stderr, "PIN(Pin) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + buffer, buf->sb_relname, buf->tag.blockNum, + PrivateRefCount[buffer - 1], file, line); + } +} + +#undef UnpinBuffer + +/* + * UnpinBuffer -- make buffer available for replacement. 
+ */ +void +UnpinBuffer(BufferDesc *buf) +{ + long b = BufferDescriptorGetBuffer(buf) - 1; + + Assert(buf->refcount); + Assert(PrivateRefCount[b] > 0); + PrivateRefCount[b]--; + if (PrivateRefCount[b] == 0 && LastRefCount[b] == 0) + buf->refcount--; + NotInQueue(buf); + + if (buf->refcount == 0) { + AddBufferToFreelist(buf); + buf->flags |= BM_FREE; + } else { + /* do nothing */ + } +} + +void +UnpinBuffer_Debug(char *file, int line, BufferDesc *buf) +{ + UnpinBuffer(buf); + if (ShowPinTrace) { + Buffer buffer = BufferDescriptorGetBuffer(buf); + + fprintf(stderr, "UNPIN(Unpin) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + buffer, buf->sb_relname, buf->tag.blockNum, + PrivateRefCount[buffer - 1], file, line); + } +} + +/* + * GetFreeBuffer() -- get the 'next' buffer from the freelist. + * + */ +BufferDesc * +GetFreeBuffer() +{ + BufferDesc *buf; + + if (Free_List_Descriptor == SharedFreeList->freeNext) { + + /* queue is empty. All buffers in the buffer pool are pinned. */ + elog(WARN,"out of free buffers: time to abort !\n"); + return(NULL); + } + buf = &(BufferDescriptors[SharedFreeList->freeNext]); + + /* remove from freelist queue */ + BufferDescriptors[buf->freeNext].freePrev = buf->freePrev; + BufferDescriptors[buf->freePrev].freeNext = buf->freeNext; + buf->freeNext = buf->freePrev = INVALID_DESCRIPTOR; + + buf->flags &= ~(BM_FREE); + + return(buf); +} + +/* + * InitFreeList -- initialize the dummy buffer descriptor used + * as a freelist head. + * + * Assume: All of the buffers are already linked in a circular + * queue. Only called by postmaster and only during + * initialization. 
+ */ +void +InitFreeList(bool init) +{ + SharedFreeList = &(BufferDescriptors[Free_List_Descriptor]); + + if (init) { + /* we only do this once, normally the postmaster */ + SharedFreeList->data = INVALID_OFFSET; + SharedFreeList->flags = 0; + SharedFreeList->flags &= ~(BM_VALID | BM_DELETED | BM_FREE); + SharedFreeList->buf_id = Free_List_Descriptor; + + /* insert it into a random spot in the circular queue */ + SharedFreeList->freeNext = BufferDescriptors[0].freeNext; + SharedFreeList->freePrev = 0; + BufferDescriptors[SharedFreeList->freeNext].freePrev = + BufferDescriptors[SharedFreeList->freePrev].freeNext = + Free_List_Descriptor; + } +} + + +/* + * print out the free list and check for breaks. + */ +void +DBG_FreeListCheck(int nfree) +{ + int i; + BufferDesc *buf; + + buf = &(BufferDescriptors[SharedFreeList->freeNext]); + for (i=0;i<nfree;i++,buf = &(BufferDescriptors[buf->freeNext])) { + + if (! (buf->flags & (BM_FREE))){ + if (buf != SharedFreeList) { + printf("\tfree list corrupted: %d flags %x\n", + buf->buf_id,buf->flags); + } else { + printf("\tfree list corrupted: too short -- %d not %d\n", + i,nfree); + + } + + + } + if ((BufferDescriptors[buf->freeNext].freePrev != buf->buf_id) || + (BufferDescriptors[buf->freePrev].freeNext != buf->buf_id)) { + printf("\tfree list links corrupted: %d %ld %ld\n", + buf->buf_id,buf->freePrev,buf->freeNext); + } + + } + if (buf != SharedFreeList) { + printf("\tfree list corrupted: %d-th buffer is %d\n", + nfree,buf->buf_id); + + } +} + +/* + * PrintBufferFreeList - + * prints the buffer free list, for debugging + */ +void +PrintBufferFreeList() +{ + BufferDesc *buf; + + if (SharedFreeList->freeNext == Free_List_Descriptor) { + printf("free list is empty.\n"); + return; + } + + buf = &(BufferDescriptors[SharedFreeList->freeNext]); + for (;;) { + int i = (buf - BufferDescriptors); + printf("[%-2d] (%s, %d) flags=0x%x, refcnt=%d %ld, nxt=%ld prv=%ld)\n", + i, buf->sb_relname, buf->tag.blockNum, + buf->flags, 
buf->refcount, PrivateRefCount[i], + buf->freeNext, buf->freePrev); + + if (buf->freeNext == Free_List_Descriptor) + break; + + buf = &(BufferDescriptors[buf->freeNext]); + } +} diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c new file mode 100644 index 00000000000..ec625940867 --- /dev/null +++ b/src/backend/storage/buffer/localbuf.c @@ -0,0 +1,284 @@ +/*------------------------------------------------------------------------- + * + * localbuf.c-- + * local buffer manager. Fast buffer manager for temporary tables + * or special cases when the operation is not visible to other backends. + * + * When a relation is being created, the descriptor will have rd_islocal + * set to indicate that the local buffer manager should be used. During + * the same transaction the relation is being created, any inserts or + * selects from the newly created relation will use the local buffer + * pool. rd_islocal is reset at the end of a transaction (commit/abort). + * This is useful for queries like SELECT INTO TABLE and create index. 
+ * + * Copyright (c) 1994-5, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/buffer/localbuf.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include <sys/file.h> +#include <stdio.h> +#include <math.h> +#include <signal.h> + +/* declarations split between these three files */ +#include "storage/buf.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" + +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/shmem.h" +#include "storage/spin.h" +#include "storage/smgr.h" +#include "storage/lmgr.h" +#include "miscadmin.h" +#include "utils/builtins.h" +#include "utils/hsearch.h" +#include "utils/elog.h" +#include "utils/memutils.h" +#include "executor/execdebug.h" /* for NDirectFileRead */ +#include "catalog/catalog.h" + +int NLocBuffer = 64; +BufferDesc *LocalBufferDescriptors = NULL; +long *LocalRefCount = NULL; + +static int nextFreeLocalBuf = 0; + +/*#define LBDEBUG*/ + +/* + * LocalBufferAlloc - + * allocate a local buffer. We do round robin allocation for now. 
+ */ +BufferDesc * +LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr) +{ + int i; + BufferDesc *bufHdr = (BufferDesc *) NULL; + + if (blockNum == P_NEW) { + blockNum = reln->rd_nblocks; + reln->rd_nblocks++; + } + + /* a low tech search for now -- not optimized for scans */ + for (i=0; i < NLocBuffer; i++) { + if (LocalBufferDescriptors[i].tag.relId.relId == reln->rd_id && + LocalBufferDescriptors[i].tag.blockNum == blockNum) { + +#ifdef LBDEBUG + fprintf(stderr, "LB ALLOC (%d,%d) %d\n", + reln->rd_id, blockNum, -i-1); +#endif + LocalRefCount[i]++; + *foundPtr = TRUE; + return &LocalBufferDescriptors[i]; + } + } + +#ifdef LBDEBUG + fprintf(stderr, "LB ALLOC (%d,%d) %d\n", + reln->rd_id, blockNum, -nextFreeLocalBuf-1); +#endif + + /* need to get a new buffer (round robin for now) */ + for(i=0; i < NLocBuffer; i++) { + int b = (nextFreeLocalBuf + i) % NLocBuffer; + + if (LocalRefCount[b]==0) { + bufHdr = &LocalBufferDescriptors[b]; + LocalRefCount[b]++; + nextFreeLocalBuf = (b + 1) % NLocBuffer; + break; + } + } + if (bufHdr==NULL) + elog(WARN, "no empty local buffer."); + + /* + * this buffer is not referenced but it might still be dirty (the + * last transaction to touch it doesn't need its contents but has + * not flushed it). if that's the case, write it out before + * reusing it! + */ + if (bufHdr->flags & BM_DIRTY) { + Relation bufrel = RelationIdCacheGetRelation(bufHdr->tag.relId.relId); + + Assert(bufrel != NULL); + + /* flush this page */ + smgrwrite(bufrel->rd_rel->relsmgr, bufrel, bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + } + + /* + * it's all ours now. + */ + bufHdr->tag.relId.relId = reln->rd_id; + bufHdr->tag.blockNum = blockNum; + bufHdr->flags &= ~BM_DIRTY; + + /* + * lazy memory allocation. (see MAKE_PTR for why we need to do + * MAKE_OFFSET.) 
+ */ + if (bufHdr->data == (SHMEM_OFFSET)0) { + char *data = (char *)malloc(BLCKSZ); + + bufHdr->data = MAKE_OFFSET(data); + } + + *foundPtr = FALSE; + return bufHdr; +} + +/* + * WriteLocalBuffer - + * writes out a local buffer + */ +int +WriteLocalBuffer(Buffer buffer, bool release) +{ + int bufid; + + Assert(BufferIsLocal(buffer)); + +#ifdef LBDEBUG + fprintf(stderr, "LB WRITE %d\n", buffer); +#endif + + bufid = - (buffer + 1); + LocalBufferDescriptors[bufid].flags |= BM_DIRTY; + + if (release) { + Assert(LocalRefCount[bufid] > 0); + LocalRefCount[bufid]--; + } + + return true; +} + +/* + * FlushLocalBuffer - + * flushes a local buffer + */ +int +FlushLocalBuffer(Buffer buffer) +{ + int bufid; + Relation bufrel; + BufferDesc *bufHdr; + + Assert(BufferIsLocal(buffer)); + +#ifdef LBDEBUG + fprintf(stderr, "LB FLUSH %d\n", buffer); +#endif + + bufid = - (buffer + 1); + bufHdr = &LocalBufferDescriptors[bufid]; + bufHdr->flags &= ~BM_DIRTY; + bufrel = RelationIdCacheGetRelation(bufHdr->tag.relId.relId); + + Assert(bufrel != NULL); + smgrflush(bufrel->rd_rel->relsmgr, bufrel, bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + + Assert(LocalRefCount[bufid] > 0); + LocalRefCount[bufid]--; + + return true; +} + +/* + * InitLocalBuffer - + * init the local buffer cache. Since most queries (esp. multi-user ones) + * don't involve local buffers, we delay allocating memory for actual the + * buffer until we need it. + */ +void +InitLocalBuffer() +{ + int i; + + /* + * these aren't going away. I'm not gonna use palloc. + */ + LocalBufferDescriptors = + (BufferDesc *)malloc(sizeof(BufferDesc) * NLocBuffer); + memset(LocalBufferDescriptors, 0, sizeof(BufferDesc) * NLocBuffer); + nextFreeLocalBuf = 0; + + for (i = 0; i < NLocBuffer; i++) { + BufferDesc *buf = &LocalBufferDescriptors[i]; + + /* + * negative to indicate local buffer. This is tricky: shared buffers + * start with 0. We have to start with -2. 
(Note that the routine + * BufferDescriptorGetBuffer adds 1 to buf_id so our first buffer id + * is -1.) + */ + buf->buf_id = - i - 2; + } + + LocalRefCount = + (long *)malloc(sizeof(long) * NLocBuffer); + memset(LocalRefCount, 0, sizeof(long) * NLocBuffer); +} + +/* + * LocalBufferSync - + * flush all dirty buffers in the local buffer cache. Since the buffer + * cache is only used for keeping relations visible during a transaction, + * we will not need these buffers again. + */ +void +LocalBufferSync() +{ + int i; + + for (i = 0; i < NLocBuffer; i++) { + BufferDesc *buf = &LocalBufferDescriptors[i]; + Relation bufrel; + + if (buf->flags & BM_DIRTY) { +#ifdef LBDEBUG + fprintf(stderr, "LB SYNC %d\n", -i-1); +#endif + bufrel = RelationIdCacheGetRelation(buf->tag.relId.relId); + + Assert(bufrel != NULL); + + smgrwrite(bufrel->rd_rel->relsmgr, bufrel, buf->tag.blockNum, + (char *) MAKE_PTR(buf->data)); + + buf->tag.relId.relId = InvalidOid; + buf->flags &= ~BM_DIRTY; + } + } + + memset(LocalRefCount, 0, sizeof(long) * NLocBuffer); +} + +void +ResetLocalBufferPool() +{ + int i; + + memset(LocalBufferDescriptors, 0, sizeof(BufferDesc) * NLocBuffer); + nextFreeLocalBuf = 0; + + for (i = 0; i < NLocBuffer; i++) { + BufferDesc *buf = &LocalBufferDescriptors[i]; + + /* just like InitLocalBuffer() */ + buf->buf_id = - i - 2; + } + + memset(LocalRefCount, 0, sizeof(long) * NLocBuffer); +} |