Diffstat (limited to 'src/backend/storage')
63 files changed, 16385 insertions, 0 deletions
diff --git a/src/backend/storage/Makefile.inc b/src/backend/storage/Makefile.inc new file mode 100644 index 00000000000..aef287ca71a --- /dev/null +++ b/src/backend/storage/Makefile.inc @@ -0,0 +1,31 @@ +#------------------------------------------------------------------------- +# +# Makefile.inc-- +# Makefile for the storage modules +# +# Copyright (c) 1994, Regents of the University of California +# +# +# IDENTIFICATION +# $Header: /cvsroot/pgsql/src/backend/storage/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $ +# +#------------------------------------------------------------------------- + +stordir= $(CURDIR)/storage +VPATH:= $(VPATH):$(stordir):$(stordir)/buffer:$(stordir)/file:$(stordir)/ipc:\ + $(stordir)/large_object:$(stordir)/lmgr:$(stordir)/page:$(stordir)/smgr + +SUBSRCS= +include $(stordir)/buffer/Makefile.inc +include $(stordir)/file/Makefile.inc +include $(stordir)/ipc/Makefile.inc +include $(stordir)/large_object/Makefile.inc +include $(stordir)/lmgr/Makefile.inc +include $(stordir)/page/Makefile.inc +include $(stordir)/smgr/Makefile.inc +SRCS_STORAGE:= $(SUBSRCS) + +HEADERS+= backendid.h block.h buf.h buf_internals.h bufmgr.h bufpage.h \ + fd.h ipc.h item.h itemid.h itempos.h \ + itemptr.h large_object.h lmgr.h lock.h multilev.h off.h page.h \ + pagenum.h pos.h proc.h shmem.h sinval.h sinvaladt.h smgr.h spin.h diff --git a/src/backend/storage/backendid.h b/src/backend/storage/backendid.h new file mode 100644 index 00000000000..eb874bbad79 --- /dev/null +++ b/src/backend/storage/backendid.h @@ -0,0 +1,32 @@ +/*------------------------------------------------------------------------- + * + * backendid.h-- + * POSTGRES backend id communication definitions + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: backendid.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef BACKENDID_H +#define BACKENDID_H + +/* ---------------- + * pulled out of sinval.h to temporarily reduce #include nesting. + * -cim 8/17/90 + * ---------------- + */ +typedef int16 BackendId; /* unique currently active backend identifier */ + +#define InvalidBackendId (-1) + +typedef int32 BackendTag; /* unique backend identifier */ + +#define InvalidBackendTag (-1) + +extern BackendId MyBackendId; /* backend id of this backend */ +extern BackendTag MyBackendTag; /* backend tag of this backend */ + +#endif /* BACKENDID_H */ diff --git a/src/backend/storage/block.h b/src/backend/storage/block.h new file mode 100644 index 00000000000..5c006aa9d90 --- /dev/null +++ b/src/backend/storage/block.h @@ -0,0 +1,114 @@ +/*------------------------------------------------------------------------- + * + * block.h-- + * POSTGRES disk block definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: block.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef BLOCK_H +#define BLOCK_H + +#include "c.h" + +/* + * BlockNumber: + * + * each data file (heap or index) is divided into postgres disk blocks + * (which may be thought of as the unit of i/o -- a postgres buffer + * contains exactly one disk block). the blocks are numbered + * sequentially, 0 to 0xFFFFFFFE. + * + * InvalidBlockNumber is the same thing as P_NEW in buf.h. 
+ * + * the access methods, the buffer manager and the storage manager are + * more or less the only pieces of code that should be accessing disk + * blocks directly. + */ +typedef uint32 BlockNumber; + +#define InvalidBlockNumber ((BlockNumber) 0xFFFFFFFF) + +/* + * BlockId: + * + * this is a storage type for BlockNumber. in other words, this type + * is used for on-disk structures (e.g., in HeapTupleData) whereas + * BlockNumber is the type on which calculations are performed (e.g., + * in access method code). + * + * there doesn't appear to be any reason to have separate types except + * for the fact that BlockIds can be SHORTALIGN'd (and therefore any + * structures that contains them, such as ItemPointerData, can also be + * SHORTALIGN'd). this is an important consideration for reducing the + * space requirements of the line pointer (ItemIdData) array on each + * page and the header of each heap or index tuple, so it doesn't seem + * wise to change this without good reason. + */ +typedef struct BlockIdData { + uint16 bi_hi; + uint16 bi_lo; +} BlockIdData; + +typedef BlockIdData *BlockId; /* block identifier */ + +/* ---------------- + * support macros + * ---------------- + */ + +/* + * BlockNumberIsValid -- + * True iff blockNumber is valid. + */ +#define BlockNumberIsValid(blockNumber) \ + ((bool) ((int32) (blockNumber) != InvalidBlockNumber)) + +/* + * BlockIdIsValid -- + * True iff the block identifier is valid. + */ +#define BlockIdIsValid(blockId) \ + ((bool) PointerIsValid(blockId)) + +/* + * BlockIdSet -- + * Sets a block identifier to the specified value. + */ +#define BlockIdSet(blockId, blockNumber) \ + Assert(PointerIsValid(blockId)); \ + (blockId)->bi_hi = (blockNumber) >> 16; \ + (blockId)->bi_lo = (blockNumber) & 0xffff + +/* + * BlockIdCopy -- + * Copy a block identifier. + */ +#define BlockIdCopy(toBlockId, fromBlockId) \ + Assert(PointerIsValid(toBlockId)); \ + Assert(PointerIsValid(fromBlockId)); \ + (toBlockId)->bi_hi = (fromBlockId)->bi_hi; \ + (toBlockId)->bi_lo = (fromBlockId)->bi_lo + +/* + * BlockIdEquals -- + * Check for block number equality. + */ +#define BlockIdEquals(blockId1, blockId2) \ + ((blockId1)->bi_hi == (blockId2)->bi_hi && \ + (blockId1)->bi_lo == (blockId2)->bi_lo) + +/* + * BlockIdGetBlockNumber -- + * Retrieve the block number from a block identifier. + */ +#define BlockIdGetBlockNumber(blockId) \ + (AssertMacro(BlockIdIsValid(blockId)) ? \ + (BlockNumber) (((blockId)->bi_hi << 16) | ((uint16) (blockId)->bi_lo)) : \ + (BlockNumber) InvalidBlockNumber) + +#endif /* BLOCK_H */ diff --git a/src/backend/storage/buf.h b/src/backend/storage/buf.h new file mode 100644 index 00000000000..73582e8a61c --- /dev/null +++ b/src/backend/storage/buf.h @@ -0,0 +1,47 @@ +/*------------------------------------------------------------------------- + * + * buf.h-- + * Basic buffer manager data types. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: buf.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef BUF_H +#define BUF_H + +#define InvalidBuffer (0) +#define UnknownBuffer (-99999) + +typedef long Buffer; + +/* + * BufferIsInvalid -- + * True iff the buffer is invalid. + */ +#define BufferIsInvalid(buffer) ((buffer) == InvalidBuffer) + +/* + * BufferIsUnknown -- + * True iff the buffer is unknown. 
+ */ +#define BufferIsUnknown(buffer) ((buffer) == UnknownBuffer) + +/* + * BufferIsLocal -- + * True iff the buffer is local (not visible to other servers). + */ +#define BufferIsLocal(buffer) ((buffer) < 0) + +/* + * If NO_BUFFERISVALID is defined, all error checking using BufferIsValid() + * are suppressed. Decision-making using BufferIsValid is not affected. + * This should be set only if one is sure there will be no errors. + * - plai 9/10/90 + */ +#undef NO_BUFFERISVALID + +#endif /* BUF_H */ diff --git a/src/backend/storage/buf_internals.h b/src/backend/storage/buf_internals.h new file mode 100644 index 00000000000..84583867faf --- /dev/null +++ b/src/backend/storage/buf_internals.h @@ -0,0 +1,220 @@ +/*------------------------------------------------------------------------- + * + * buf_internals.h-- + * Internal definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: buf_internals.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $ + * + * NOTE + * If BUFFERPAGE0 is defined, then 0 will be used as a + * valid buffer page number. + * + *------------------------------------------------------------------------- + */ +#ifndef BUFMGR_INTERNALS_H +#define BUFMGR_INTERNALS_H + +#include "postgres.h" +#include "storage/buf.h" +#include "storage/ipc.h" +#include "storage/shmem.h" +#include "miscadmin.h" +#include "storage/lmgr.h" +#include "utils/rel.h" +#include "utils/relcache.h" + +/* Buf Mgr constants */ +/* in bufmgr.c */ +extern int NBuffers; +extern int Data_Descriptors; +extern int Free_List_Descriptor; +extern int Lookup_List_Descriptor; +extern int Num_Descriptors; + +/* + * Flags for buffer descriptors + */ +#define BM_DIRTY (1 << 0) +#define BM_PRIVATE (1 << 1) +#define BM_VALID (1 << 2) +#define BM_DELETED (1 << 3) +#define BM_FREE (1 << 4) +#define BM_IO_IN_PROGRESS (1 << 5) +#define BM_IO_ERROR (1 << 6) + +typedef bits16 BufFlags; + +typedef struct sbufdesc BufferDesc; +typedef struct sbufdesc BufferHdr; +typedef struct buftag BufferTag; +/* long * so alignment will be correct */ +typedef long **BufferBlock; + +struct buftag{ + LRelId relId; + BlockNumber blockNum; /* blknum relative to begin of reln */ +}; + +#define CLEAR_BUFFERTAG(a)\ + (a)->relId.dbId = InvalidOid; \ + (a)->relId.relId = InvalidOid; \ + (a)->blockNum = InvalidBlockNumber + +#define INIT_BUFFERTAG(a,xx_reln,xx_blockNum) \ +{ \ + (a)->blockNum = xx_blockNum;\ + (a)->relId = RelationGetLRelId(xx_reln); \ +} + +#define COPY_BUFFERTAG(a,b)\ +{ \ + (a)->blockNum = (b)->blockNum;\ + LRelIdAssign(*(a),*(b));\ +} + +#define EQUAL_BUFFERTAG(a,b) \ + (((a)->blockNum == (b)->blockNum) &&\ + (OID_Equal((a)->relId.relId,(b)->relId.relId))) + + +#define BAD_BUFFER_ID(bid) ((bid<1) || (bid>(NBuffers))) +#define INVALID_DESCRIPTOR (-3) + +/* + * bletch hack -- anyplace that we declare space for relation or + * database names, we just use '16', not a symbolic constant, to + * specify their lengths. BM_NAMESIZE is the length of these names, + * and is used in the buffer manager code. somebody with lots of + * spare time should do this for all the other modules, too. + */ +#define BM_NAMESIZE 16 + +/* + * struct sbufdesc -- shared buffer cache metadata for a single + * shared buffer descriptor. + * + * We keep the name of the database and relation in which this + * buffer appears in order to avoid a catalog lookup on cache + * flush if we don't have the reldesc in the cache. 
It is also + * possible that the relation to which this buffer belongs is + * not visible to all backends at the time that it gets flushed. + * Dbname, relname, dbid, and relid are enough to determine where + * to put the buffer, for all storage managers. + */ + +struct sbufdesc { + Buffer freeNext; /* link for freelist chain */ + Buffer freePrev; + SHMEM_OFFSET data; /* pointer to data in buf pool */ + + /* tag and id must be together for table lookup to work */ + BufferTag tag; /* file/block identifier */ + int buf_id; /* maps global desc to local desc */ + + BufFlags flags; /* described below */ + int16 bufsmgr; /* storage manager id for buffer */ + unsigned refcount; /* # of times buffer is pinned */ + + char *sb_dbname; /* name of db in which buf belongs */ + char *sb_relname; /* name of reln */ +#ifdef HAS_TEST_AND_SET + /* can afford a dedicated lock if test-and-set locks are available */ + slock_t io_in_progress_lock; +#endif /* HAS_TEST_AND_SET */ + + /* + * I padded this structure to a power of 2 (128 bytes on a MIPS) because + * BufferDescriptorGetBuffer is called a billion times and it does an + * C pointer subtraction (i.e., "x - y" -> array index of x relative + * to y, which is calculated using division by struct size). Integer + * ".div" hits you for 35 cycles, as opposed to a 1-cycle "sra" ... + * this hack cut 10% off of the time to create the Wisconsin database! + * It eats up more shared memory, of course, but we're (allegedly) + * going to make some of these types bigger soon anyway... -pma 1/2/93 + */ +#if defined(PORTNAME_ultrix4) + char sb_pad[60]; /* no slock_t */ +#endif /* mips */ +#if defined(PORTNAME_sparc) || defined(PORTNAME_sparc_solaris) || defined(PORTNAME_irix5) + char sb_pad[56]; /* has slock_t */ +#endif /* sparc || irix5 */ +#if defined(PORTNAME_hpux) + char sb_pad[44]; /* has slock_t */ +#endif /* alpha */ +#if defined(PORTNAME_alpha) + char sb_pad[40]; /* has slock_t */ +#endif /* alpha */ +}; + +/* + * mao tracing buffer allocation + */ + +/*#define BMTRACE*/ +#ifdef BMTRACE + +typedef struct _bmtrace { + int bmt_pid; + long bmt_buf; + long bmt_dbid; + long bmt_relid; + int bmt_blkno; + int bmt_op; + +#define BMT_NOTUSED 0 +#define BMT_ALLOCFND 1 +#define BMT_ALLOCNOTFND 2 +#define BMT_DEALLOC 3 + +} bmtrace; + +#endif /* BMTRACE */ + + +/* + * Bufmgr Interface: + */ + +/* Internal routines: only called by buf.c */ + +/*freelist.c*/ +extern void AddBufferToFreelist(BufferDesc *bf); +extern void PinBuffer(BufferDesc *buf); +extern void PinBuffer_Debug(char *file, int line, BufferDesc *buf); +extern void UnpinBuffer(BufferDesc *buf); +extern void UnpinBuffer_Debug(char *file, int line, BufferDesc *buf); +extern BufferDesc *GetFreeBuffer(void); +extern void InitFreeList(bool init); +extern void DBG_FreeListCheck(int nfree); + +/* buf_table.c */ +extern void InitBufTable(void); +extern BufferDesc *BufTableLookup(BufferTag *tagPtr); +extern bool BufTableDelete(BufferDesc *buf); +extern bool BufTableInsert(BufferDesc *buf); +extern void DBG_LookupListCheck(int nlookup); + +/* bufmgr.c */ +extern BufferDesc *BufferDescriptors; +extern BufferBlock BufferBlocks; +extern long *PrivateRefCount; +extern long *LastRefCount; +extern SPINLOCK BufMgrLock; + +/* localbuf.c */ +extern long *LocalRefCount; +extern BufferDesc *LocalBufferDescriptors; +extern int NLocBuffer; + +extern BufferDesc *LocalBufferAlloc(Relation reln, BlockNumber blockNum, + bool *foundPtr); +extern int WriteLocalBuffer(Buffer buffer, bool release); +extern int FlushLocalBuffer(Buffer buffer); 
+extern void InitLocalBuffer(); +extern void LocalBufferSync(); +extern void ResetLocalBufferPool(); + +#endif /* BUFMGR_INTERNALS_H */ diff --git a/src/backend/storage/buffer/Makefile.inc b/src/backend/storage/buffer/Makefile.inc new file mode 100644 index 00000000000..1d507f9227b --- /dev/null +++ b/src/backend/storage/buffer/Makefile.inc @@ -0,0 +1,16 @@ +#------------------------------------------------------------------------- +# +# Makefile.inc-- +# Makefile for storage/buffer +# +# Copyright (c) 1994, Regents of the University of California +# +# +# IDENTIFICATION +# $Header: /cvsroot/pgsql/src/backend/storage/buffer/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ +# +#------------------------------------------------------------------------- + +SUBSRCS+= buf_table.c buf_init.c bufmgr.c freelist.c localbuf.c + +SRCS_SITEMGR+= buf_table.c buf_init.c freelist.c diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c new file mode 100644 index 00000000000..823bf41eecf --- /dev/null +++ b/src/backend/storage/buffer/buf_init.c @@ -0,0 +1,280 @@ +/*------------------------------------------------------------------------- + * + * buf_init.c-- + * buffer manager initialization routines + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/buffer/buf_init.c,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include <sys/file.h> +#include <stdio.h> +#include <math.h> +#include <signal.h> + +/* declarations split between these three files */ +#include "storage/buf.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" + +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/shmem.h" +#include "storage/spin.h" +#include "storage/smgr.h" +#include "storage/lmgr.h" +#include "miscadmin.h" +#include "utils/builtins.h" +#include "utils/hsearch.h" +#include "utils/elog.h" +#include "utils/memutils.h" +#include "executor/execdebug.h" /* for NDirectFileRead */ +#include "catalog/catalog.h" + +/* + * if BMTRACE is defined, we trace the last 200 buffer allocations and + * deallocations in a circular buffer in shared memory. + */ +#ifdef BMTRACE +bmtrace *TraceBuf; +long *CurTraceBuf; +#define BMT_LIMIT 200 +#endif /* BMTRACE */ +int ShowPinTrace = 0; + +int NBuffers = NDBUFS; /* NDBUFS defined in miscadmin.h */ +int Data_Descriptors; +int Free_List_Descriptor; +int Lookup_List_Descriptor; +int Num_Descriptors; + +BufferDesc *BufferDescriptors; +BufferBlock BufferBlocks; +#ifndef HAS_TEST_AND_SET +long *NWaitIOBackendP; +#endif + +extern IpcSemaphoreId WaitIOSemId; + +long *PrivateRefCount; /* also used in freelist.c */ +long *LastRefCount; /* refcounts of last ExecMain level */ + +/* + * Data Structures: + * buffers live in a freelist and a lookup data structure. + * + * + * Buffer Lookup: + * Two important notes. First, the buffer has to be + * available for lookup BEFORE an IO begins. Otherwise + * a second process trying to read the buffer will + * allocate its own copy and the buffeer pool will + * become inconsistent. + * + * Buffer Replacement: + * see freelist.c. A buffer cannot be replaced while in + * use either by data manager or during IO. + * + * WriteBufferBack: + * currently, a buffer is only written back at the time + * it is selected for replacement. It should + * be done sooner if possible to reduce latency of + * BufferAlloc(). 
Maybe there should be a daemon process. + * + * Synchronization/Locking: + * + * BufMgrLock lock -- must be acquired before manipulating the + * buffer queues (lookup/freelist). Must be released + * before exit and before doing any IO. + * + * IO_IN_PROGRESS -- this is a flag in the buffer descriptor. + * It must be set when an IO is initiated and cleared at + * the end of the IO. It is there to make sure that one + * process doesn't start to use a buffer while another is + * faulting it in. see IOWait/IOSignal. + * + * refcount -- A buffer is pinned during IO and immediately + * after a BufferAlloc(). A buffer is always either pinned + * or on the freelist but never both. The buffer must be + * released, written, or flushed before the end of + * transaction. + * + * PrivateRefCount -- Each buffer also has a private refcount the keeps + * track of the number of times the buffer is pinned in the current + * processes. This is used for two purposes, first, if we pin a + * a buffer more than once, we only need to change the shared refcount + * once, thus only lock the buffer pool once, second, when a transaction + * aborts, it should only unpin the buffers exactly the number of times it + * has pinned them, so that it will not blow away buffers of another + * backend. + * + */ + +SPINLOCK BufMgrLock; + +/* delayed write: TRUE on, FALSE off */ +int LateWrite = TRUE; + +int ReadBufferCount; +int BufferHitCount; +int BufferFlushCount; + + +/* + * Initialize module: + * + * should calculate size of pool dynamically based on the + * amount of available memory. + */ +void +InitBufferPool(IPCKey key) +{ + bool foundBufs,foundDescs; + int i; + + Data_Descriptors = NBuffers; + Free_List_Descriptor = Data_Descriptors; + Lookup_List_Descriptor = Data_Descriptors + 1; + Num_Descriptors = Data_Descriptors + 1; + + SpinAcquire(BufMgrLock); + +#ifdef BMTRACE + CurTraceBuf = (long *) ShmemInitStruct("Buffer trace", + (BMT_LIMIT * sizeof(bmtrace)) + sizeof(long), + &foundDescs); + if (!foundDescs) + memset(CurTraceBuf, 0, (BMT_LIMIT * sizeof(bmtrace)) + sizeof(long)); + + TraceBuf = (bmtrace *) &(CurTraceBuf[1]); +#endif + + BufferDescriptors = (BufferDesc *) + ShmemInitStruct("Buffer Descriptors", + Num_Descriptors*sizeof(BufferDesc),&foundDescs); + + BufferBlocks = (BufferBlock) + ShmemInitStruct("Buffer Blocks", + NBuffers*BLCKSZ,&foundBufs); + +#ifndef HAS_TEST_AND_SET + { + bool foundNWaitIO; + + NWaitIOBackendP = (long *)ShmemInitStruct("#Backends Waiting IO", + sizeof(long), + &foundNWaitIO); + if (!foundNWaitIO) + *NWaitIOBackendP = 0; + } +#endif + + if (foundDescs || foundBufs) { + + /* both should be present or neither */ + Assert(foundDescs && foundBufs); + + } else { + BufferDesc *buf; + unsigned long block; + + buf = BufferDescriptors; + block = (unsigned long) BufferBlocks; + + /* + * link the buffers into a circular, doubly-linked list to + * initialize free list. Still don't know anything about + * replacement strategy in this file. 
+ */ + for (i = 0; i < Data_Descriptors; block+=BLCKSZ,buf++,i++) { + Assert(ShmemIsValid((unsigned long)block)); + + buf->freeNext = i+1; + buf->freePrev = i-1; + + CLEAR_BUFFERTAG(&(buf->tag)); + buf->data = MAKE_OFFSET(block); + buf->flags = (BM_DELETED | BM_FREE | BM_VALID); + buf->refcount = 0; + buf->buf_id = i; +#ifdef HAS_TEST_AND_SET + S_INIT_LOCK(&(buf->io_in_progress_lock)); +#endif + } + + /* close the circular queue */ + BufferDescriptors[0].freePrev = Data_Descriptors-1; + BufferDescriptors[Data_Descriptors-1].freeNext = 0; + } + + /* Init the rest of the module */ + InitBufTable(); + InitFreeList(!foundDescs); + + SpinRelease(BufMgrLock); + +#ifndef HAS_TEST_AND_SET + { + int status; + WaitIOSemId = IpcSemaphoreCreate(IPCKeyGetWaitIOSemaphoreKey(key), + 1, IPCProtection, 0, 1, &status); + } +#endif + PrivateRefCount = (long *) calloc(NBuffers, sizeof(long)); + LastRefCount = (long *) calloc(NBuffers, sizeof(long)); +} + +/* ----------------------------------------------------- + * BufferShmemSize + * + * compute the size of shared memory for the buffer pool including + * data pages, buffer descriptors, hash tables, etc. + * ---------------------------------------------------- + */ +int +BufferShmemSize() +{ + int size = 0; + int nbuckets; + int nsegs; + int tmp; + + nbuckets = 1 << (int)my_log2((NBuffers - 1) / DEF_FFACTOR + 1); + nsegs = 1 << (int)my_log2((nbuckets - 1) / DEF_SEGSIZE + 1); + + /* size of shmem binding table */ + size += MAXALIGN(my_log2(BTABLE_SIZE) * sizeof(void *)); /* HTAB->dir */ + size += MAXALIGN(sizeof(HHDR)); /* HTAB->hctl */ + size += MAXALIGN(DEF_SEGSIZE * sizeof(SEGMENT)); + size += BUCKET_ALLOC_INCR * + (MAXALIGN(sizeof(BUCKET_INDEX)) + + MAXALIGN(BTABLE_KEYSIZE) + + MAXALIGN(BTABLE_DATASIZE)); + + /* size of buffer descriptors */ + size += MAXALIGN((NBuffers + 1) * sizeof(BufferDesc)); + + /* size of data pages */ + size += NBuffers * MAXALIGN(BLCKSZ); + + /* size of buffer hash table */ + size += MAXALIGN(my_log2(NBuffers) * sizeof(void *)); /* HTAB->dir */ + size += MAXALIGN(sizeof(HHDR)); /* HTAB->hctl */ + size += nsegs * MAXALIGN(DEF_SEGSIZE * sizeof(SEGMENT)); + tmp = (int)ceil((double)NBuffers/BUCKET_ALLOC_INCR); + size += tmp * BUCKET_ALLOC_INCR * + (MAXALIGN(sizeof(BUCKET_INDEX)) + + MAXALIGN(sizeof(BufferTag)) + + MAXALIGN(sizeof(Buffer))); + +#ifdef BMTRACE + size += (BMT_LIMIT * sizeof(bmtrace)) + sizeof(long); +#endif + return size; +} + + diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c new file mode 100644 index 00000000000..502ded954ed --- /dev/null +++ b/src/backend/storage/buffer/buf_table.c @@ -0,0 +1,162 @@ +/*------------------------------------------------------------------------- + * + * buf_table.c-- + * routines for finding buffers in the buffer pool. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/buffer/buf_table.c,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +/* + * OLD COMMENTS + * + * Data Structures: + * + * Buffers are identified by their BufferTag (buf.h). This + * file contains routines for allocating a shmem hash table to + * map buffer tags to buffer descriptors. + * + * Synchronization: + * + * All routines in this file assume buffer manager spinlock is + * held by their caller. 
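For reference, a minimal caller-side sketch of the locking convention just described, mirroring what BufferAlloc() does later in this diff (illustrative only, not part of buf_table.c; 'reln' and 'blockNum' are assumed to be in scope):

    /* Illustrative sketch: the caller, not buf_table.c, holds the
     * buffer-pool spinlock around the lookup. */
    BufferTag   tag;
    BufferDesc *bufHdr;

    INIT_BUFFERTAG(&tag, reln, blockNum);   /* reln/blockNum assumed in scope */

    SpinAcquire(BufMgrLock);                /* buf_table.c assumes this is held */
    bufHdr = BufTableLookup(&tag);
    if (bufHdr != NULL)
        PinBuffer(bufHdr);                  /* pin before giving up the lock */
    SpinRelease(BufMgrLock);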
+ */ +#include "storage/bufmgr.h" +#include "storage/buf_internals.h" /* where the declarations go */ +#include "storage/shmem.h" +#include "storage/spin.h" +#include "utils/hsearch.h" +#include "utils/elog.h" + +static HTAB *SharedBufHash; + +extern HTAB *ShmemInitHash(); + +typedef struct lookup { + BufferTag key; + Buffer id; +} LookupEnt; + +/* + * Initialize shmem hash table for mapping buffers + */ +void +InitBufTable() +{ + HASHCTL info; + int hash_flags; + + /* assume lock is held */ + + /* BufferTag maps to Buffer */ + info.keysize = sizeof(BufferTag); + info.datasize = sizeof(Buffer); + info.hash = tag_hash; + + hash_flags = (HASH_ELEM | HASH_FUNCTION); + + + SharedBufHash = (HTAB *) ShmemInitHash("Shared Buf Lookup Table", + NBuffers,NBuffers, + &info,hash_flags); + + if (! SharedBufHash) { + elog(FATAL,"couldn't initialize shared buffer pool Hash Tbl"); + exit(1); + } + +} + +BufferDesc * +BufTableLookup(BufferTag *tagPtr) +{ + LookupEnt * result; + bool found; + + if (tagPtr->blockNum == P_NEW) + return(NULL); + + result = (LookupEnt *) + hash_search(SharedBufHash,(char *) tagPtr,HASH_FIND,&found); + + if (! result){ + elog(WARN,"BufTableLookup: BufferLookup table corrupted"); + return(NULL); + } + if (! found) { + return(NULL); + } + return(&(BufferDescriptors[result->id])); +} + +/* + * BufTableDelete + */ +bool +BufTableDelete(BufferDesc *buf) +{ + LookupEnt * result; + bool found; + + /* buffer not initialized or has been removed from + * table already. BM_DELETED keeps us from removing + * buffer twice. + */ + if (buf->flags & BM_DELETED) { + return(TRUE); + } + + buf->flags |= BM_DELETED; + + result = (LookupEnt *) + hash_search(SharedBufHash,(char *) &(buf->tag),HASH_REMOVE,&found); + + if (! (result && found)) { + elog(WARN,"BufTableDelete: BufferLookup table corrupted"); + return(FALSE); + } + + return(TRUE); +} + +bool +BufTableInsert(BufferDesc *buf) +{ + LookupEnt * result; + bool found; + + /* cannot insert it twice */ + Assert (buf->flags & BM_DELETED); + buf->flags &= ~(BM_DELETED); + + result = (LookupEnt *) + hash_search(SharedBufHash,(char *) &(buf->tag),HASH_ENTER,&found); + + if (! result) { + Assert(0); + elog(WARN,"BufTableInsert: BufferLookup table corrupted"); + return(FALSE); + } + /* found something else in the table ! */ + if (found) { + Assert(0); + elog(WARN,"BufTableInsert: BufferLookup table corrupted"); + return(FALSE); + } + + result->id = buf->buf_id; + return(TRUE); +} + +/* prints out collision stats for the buf table */ +void +DBG_LookupListCheck(int nlookup) +{ + nlookup = 10; + + hash_stats("Shared",SharedBufHash); +} diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c new file mode 100644 index 00000000000..655f1f408e0 --- /dev/null +++ b/src/backend/storage/buffer/bufmgr.c @@ -0,0 +1,1581 @@ +/*------------------------------------------------------------------------- + * + * bufmgr.c-- + * buffer manager interface routines + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +/* + * + * BufferAlloc() -- lookup a buffer in the buffer table. If + * it isn't there add it, but do not read it into memory. + * This is used when we are about to reinitialize the + * buffer so don't care what the current disk contents are. + * BufferAlloc() pins the new buffer in memory. 
+ * + * ReadBuffer() -- same as BufferAlloc() but reads the data + * on a buffer cache miss. + * + * ReleaseBuffer() -- unpin the buffer + * + * WriteNoReleaseBuffer() -- mark the buffer contents as "dirty" + * but don't unpin. The disk IO is delayed until buffer + * replacement if LateWrite flag is set. + * + * WriteBuffer() -- WriteNoReleaseBuffer() + ReleaseBuffer() + * + * DirtyBufferCopy() -- For a given dbid/relid/blockno, if the buffer is + * in the cache and is dirty, mark it clean and copy + * it to the requested location. This is a logical + * write, and has been installed to support the cache + * management code for write-once storage managers. + * + * FlushBuffer() -- as above but never delayed write. + * + * BufferSync() -- flush all dirty buffers in the buffer pool. + * + * InitBufferPool() -- Init the buffer module. + * + * See other files: + * freelist.c -- chooses victim for buffer replacement + * buf_table.c -- manages the buffer lookup table + */ +#include <sys/file.h> +#include <stdio.h> +#include <math.h> +#include <signal.h> + +/* declarations split between these three files */ +#include "storage/buf.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" + +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/shmem.h" +#include "storage/spin.h" +#include "storage/smgr.h" +#include "storage/lmgr.h" +#include "miscadmin.h" +#include "utils/builtins.h" +#include "utils/hsearch.h" +#include "utils/elog.h" +#include "utils/palloc.h" +#include "utils/memutils.h" +#include "executor/execdebug.h" /* for NDirectFileRead */ +#include "catalog/catalog.h" + +extern int LateWrite; +extern SPINLOCK BufMgrLock; +extern int ReadBufferCount; +extern int BufferHitCount; +extern int BufferFlushCount; + +static void WaitIO(BufferDesc *buf, SPINLOCK spinlock); +#ifndef HAS_TEST_AND_SET +static void SignalIO(BufferDesc *buf); +extern long *NWaitIOBackendP; /* defined in buf_init.c */ +#endif /* HAS_TEST_AND_SET */ + +static Buffer ReadBufferWithBufferLock(Relation relation, BlockNumber blockNum, + bool bufferLockHeld); +static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum, + bool *foundPtr, bool bufferLockHeld); +static int FlushBuffer(Buffer buffer); +static void BufferSync(void); +static int BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld); + +/* --------------------------------------------------- + * RelationGetBufferWithBuffer + * see if the given buffer is what we want + * if yes, we don't need to bother the buffer manager + * --------------------------------------------------- + */ +Buffer +RelationGetBufferWithBuffer(Relation relation, + BlockNumber blockNumber, + Buffer buffer) +{ + BufferDesc *bufHdr; + LRelId lrelId; + + if (BufferIsValid(buffer)) { + if (!BufferIsLocal(buffer)) { + bufHdr = &BufferDescriptors[buffer-1]; + lrelId = RelationGetLRelId(relation); + SpinAcquire(BufMgrLock); + if (bufHdr->tag.blockNum == blockNumber && + bufHdr->tag.relId.relId == lrelId.relId && + bufHdr->tag.relId.dbId == lrelId.dbId) { + SpinRelease(BufMgrLock); + return(buffer); + } + return(ReadBufferWithBufferLock(relation, blockNumber, true)); + } else { + bufHdr = &LocalBufferDescriptors[-buffer-1]; + if (bufHdr->tag.relId.relId == relation->rd_id && + bufHdr->tag.blockNum == blockNumber) { + return(buffer); + } + } + } + return(ReadBuffer(relation, blockNumber)); +} + +/* + * ReadBuffer -- returns a buffer containing the requested + * block of the requested relation. 
If the blknum + * requested is P_NEW, extend the relation file and + * allocate a new block. + * + * Returns: the buffer number for the buffer containing + * the block read or NULL on an error. + * + * Assume when this function is called, that reln has been + * opened already. + */ + +extern int ShowPinTrace; + + +#undef ReadBuffer /* conflicts with macro when BUFMGR_DEBUG defined */ + +/* + * ReadBuffer -- + * + */ +Buffer +ReadBuffer(Relation reln, BlockNumber blockNum) +{ + return ReadBufferWithBufferLock(reln, blockNum, false); +} + +/* + * is_userbuffer + * + * XXX caller must have already acquired BufMgrLock + */ +static bool +is_userbuffer(Buffer buffer) +{ + BufferDesc *buf = &BufferDescriptors[buffer-1]; + + if (IsSystemRelationName(buf->sb_relname)) + return false; + return true; +} + +Buffer +ReadBuffer_Debug(char *file, + int line, + Relation reln, + BlockNumber blockNum) +{ + Buffer buffer; + + buffer = ReadBufferWithBufferLock(reln, blockNum, false); + if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer)) { + BufferDesc *buf = &BufferDescriptors[buffer-1]; + + fprintf(stderr, "PIN(RD) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + buffer, buf->sb_relname, buf->tag.blockNum, + PrivateRefCount[buffer - 1], file, line); + } + return buffer; +} + +/* + * ReadBufferWithBufferLock -- does the work of + * ReadBuffer() but with the possibility that + * the buffer lock has already been held. this + * is yet another effort to reduce the number of + * semops in the system. + */ +static Buffer +ReadBufferWithBufferLock(Relation reln, + BlockNumber blockNum, + bool bufferLockHeld) +{ + BufferDesc *bufHdr; + int extend; /* extending the file by one block */ + int status; + bool found; + bool isLocalBuf; + + extend = (blockNum == P_NEW); + isLocalBuf = reln->rd_islocal; + + if (isLocalBuf) { + bufHdr = LocalBufferAlloc(reln, blockNum, &found); + } else { + ReadBufferCount++; + + /* lookup the buffer. IO_IN_PROGRESS is set if the requested + * block is not currently in memory. + */ + bufHdr = BufferAlloc(reln, blockNum, &found, bufferLockHeld); + if (found) BufferHitCount++; + } + + if (!bufHdr) { + return(InvalidBuffer); + } + + /* if its already in the buffer pool, we're done */ + if (found) { + /* + * This happens when a bogus buffer was returned previously and is + * floating around in the buffer pool. A routine calling this would + * want this extended. + */ + if (extend) { + /* new buffers are zero-filled */ + memset((char *) MAKE_PTR(bufHdr->data), 0, BLCKSZ); + (void) smgrextend(bufHdr->bufsmgr, reln, + (char *) MAKE_PTR(bufHdr->data)); + } + return (BufferDescriptorGetBuffer(bufHdr)); + + } + + /* + * if we have gotten to this point, the reln pointer must be ok + * and the relation file must be open. + */ + if (extend) { + /* new buffers are zero-filled */ + (void) memset((char *) MAKE_PTR(bufHdr->data), 0, BLCKSZ); + status = smgrextend(bufHdr->bufsmgr, reln, + (char *) MAKE_PTR(bufHdr->data)); + } else { + status = smgrread(bufHdr->bufsmgr, reln, blockNum, + (char *) MAKE_PTR(bufHdr->data)); + } + + if (isLocalBuf) + return (BufferDescriptorGetBuffer(bufHdr)); + + /* lock buffer manager again to update IO IN PROGRESS */ + SpinAcquire(BufMgrLock); + + if (status == SM_FAIL) { + /* IO Failed. cleanup the data structures and go home */ + + if (! 
BufTableDelete(bufHdr)) { + SpinRelease(BufMgrLock); + elog(FATAL,"BufRead: buffer table broken after IO error\n"); + } + /* remember that BufferAlloc() pinned the buffer */ + UnpinBuffer(bufHdr); + + /* + * Have to reset the flag so that anyone waiting for + * the buffer can tell that the contents are invalid. + */ + bufHdr->flags |= BM_IO_ERROR; + + } else { + /* IO Succeeded. clear the flags, finish buffer update */ + + bufHdr->flags &= ~(BM_IO_ERROR | BM_IO_IN_PROGRESS); + } + + /* If anyone was waiting for IO to complete, wake them up now */ +#ifdef HAS_TEST_AND_SET + S_UNLOCK(&(bufHdr->io_in_progress_lock)); +#else + if (bufHdr->refcount > 1) + SignalIO(bufHdr); +#endif + + SpinRelease(BufMgrLock); + + return(BufferDescriptorGetBuffer(bufHdr)); +} + +/* + * BufferAlloc -- Get a buffer from the buffer pool but dont + * read it. + * + * Returns: descriptor for buffer + * + * When this routine returns, the BufMgrLock is guaranteed NOT be held. + */ +static BufferDesc * +BufferAlloc(Relation reln, + BlockNumber blockNum, + bool *foundPtr, + bool bufferLockHeld) +{ + BufferDesc *buf, *buf2; + BufferTag newTag; /* identity of requested block */ + bool inProgress; /* buffer undergoing IO */ + bool newblock = FALSE; + + /* create a new tag so we can lookup the buffer */ + /* assume that the relation is already open */ + if (blockNum == P_NEW) { + newblock = TRUE; + blockNum = smgrnblocks(reln->rd_rel->relsmgr, reln); + } + + INIT_BUFFERTAG(&newTag,reln,blockNum); + + if (!bufferLockHeld) + SpinAcquire(BufMgrLock); + + /* see if the block is in the buffer pool already */ + buf = BufTableLookup(&newTag); + if (buf != NULL) { + /* Found it. Now, (a) pin the buffer so no + * one steals it from the buffer pool, + * (b) check IO_IN_PROGRESS, someone may be + * faulting the buffer into the buffer pool. + */ + + PinBuffer(buf); + inProgress = (buf->flags & BM_IO_IN_PROGRESS); + + *foundPtr = TRUE; + if (inProgress) { + WaitIO(buf, BufMgrLock); + if (buf->flags & BM_IO_ERROR) { + /* wierd race condition: + * + * We were waiting for someone else to read the buffer. + * While we were waiting, the reader boof'd in some + * way, so the contents of the buffer are still + * invalid. By saying that we didn't find it, we can + * make the caller reinitialize the buffer. If two + * processes are waiting for this block, both will + * read the block. The second one to finish may overwrite + * any updates made by the first. (Assume higher level + * synchronization prevents this from happening). + * + * This is never going to happen, don't worry about it. + */ + *foundPtr = FALSE; + } + } +#ifdef BMTRACE + _bm_trace((reln->rd_rel->relisshared ? 0 : MyDatabaseId), reln->rd_id, blockNum, BufferDescriptorGetBuffer(buf), BMT_ALLOCFND); +#endif /* BMTRACE */ + + SpinRelease(BufMgrLock); + + return(buf); + } + + *foundPtr = FALSE; + + /* + * Didn't find it in the buffer pool. We'll have + * to initialize a new buffer. First, grab one from + * the free list. If it's dirty, flush it to disk. + * Remember to unlock BufMgr spinlock while doing the IOs. + */ + inProgress = FALSE; + for (buf = (BufferDesc *) NULL; buf == (BufferDesc *) NULL; ) { + + /* GetFreeBuffer will abort if it can't find a free buffer */ + buf = GetFreeBuffer(); + + /* + * There should be exactly one pin on the buffer after + * it is allocated -- ours. If it had a pin it wouldn't + * have been on the free list. No one else could have + * pinned it between GetFreeBuffer and here because we + * have the BufMgrLock. 
+ */ + Assert(buf->refcount == 0); + buf->refcount = 1; + PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 1; + + if (buf->flags & BM_DIRTY) { + /* + * Set BM_IO_IN_PROGRESS to keep anyone from doing anything + * with the contents of the buffer while we write it out. + * We don't really care if they try to read it, but if they + * can complete a BufferAlloc on it they can then scribble + * into it, and we'd really like to avoid that while we are + * flushing the buffer. Setting this flag should block them + * in WaitIO until we're done. + */ + inProgress = TRUE; + buf->flags |= BM_IO_IN_PROGRESS; +#ifdef HAS_TEST_AND_SET + /* + * All code paths that acquire this lock pin the buffer + * first; since no one had it pinned (it just came off the + * free list), no one else can have this lock. + */ + Assert(S_LOCK_FREE(&(buf->io_in_progress_lock))); + S_LOCK(&(buf->io_in_progress_lock)); +#endif /* HAS_TEST_AND_SET */ + + /* + * Write the buffer out, being careful to release BufMgrLock + * before starting the I/O. + * + * This #ifndef is here because a few extra semops REALLY kill + * you on machines that don't have spinlocks. If you don't + * operate with much concurrency, well... + */ + (void) BufferReplace(buf, true); + BufferFlushCount++; +#ifndef OPTIMIZE_SINGLE + SpinAcquire(BufMgrLock); +#endif /* OPTIMIZE_SINGLE */ + + /* + * Somebody could have pinned the buffer while we were + * doing the I/O and had given up the BufMgrLock (though + * they would be waiting for us to clear the BM_IO_IN_PROGRESS + * flag). That's why this is a loop -- if so, we need to clear + * the I/O flags, remove our pin and start all over again. + * + * People may be making buffers free at any time, so there's + * no reason to think that we have an immediate disaster on + * our hands. + */ + if (buf->refcount > 1) { + inProgress = FALSE; + buf->flags &= ~BM_IO_IN_PROGRESS; +#ifdef HAS_TEST_AND_SET + S_UNLOCK(&(buf->io_in_progress_lock)); +#else /* !HAS_TEST_AND_SET */ + if (buf->refcount > 1) + SignalIO(buf); +#endif /* !HAS_TEST_AND_SET */ + PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0; + buf->refcount--; + buf = (BufferDesc *) NULL; + } + + /* + * Somebody could have allocated another buffer for the + * same block we are about to read in. (While we flush out + * the dirty buffer, we don't hold the lock and someone could + * have allocated another buffer for the same block. The problem + * is we haven't gotten around to insert the new tag into + * the buffer table. So we need to check here. -ay 3/95 + */ + buf2 = BufTableLookup(&newTag); + if (buf2 != NULL) { + /* Found it. Someone has already done what we're about + * to do. We'll just handle this as if it were found in + * the buffer pool in the first place. 
+ */ + + PinBuffer(buf2); + inProgress = (buf2->flags & BM_IO_IN_PROGRESS); + + *foundPtr = TRUE; + if (inProgress) { + WaitIO(buf2, BufMgrLock); + if (buf2->flags & BM_IO_ERROR) { + *foundPtr = FALSE; + } + } + +#ifdef HAS_TEST_AND_SET + S_UNLOCK(&(buf->io_in_progress_lock)); +#else /* !HAS_TEST_AND_SET */ + if (buf->refcount > 1) + SignalIO(buf); +#endif /* !HAS_TEST_AND_SET */ + + /* give up the buffer since we don't need it any more */ + buf->refcount--; + PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0; + AddBufferToFreelist(buf); + buf->flags |= BM_FREE; + buf->flags &= ~BM_DIRTY; + buf->flags &= ~BM_IO_IN_PROGRESS; + + SpinRelease(BufMgrLock); + + return(buf2); + } + } + } + /* + * At this point we should have the sole pin on a non-dirty + * buffer and we may or may not already have the BM_IO_IN_PROGRESS + * flag set. + */ + + /* + * Change the name of the buffer in the lookup table: + * + * Need to update the lookup table before the read starts. + * If someone comes along looking for the buffer while + * we are reading it in, we don't want them to allocate + * a new buffer. For the same reason, we didn't want + * to erase the buf table entry for the buffer we were + * writing back until now, either. + */ + + if (! BufTableDelete(buf)) { + SpinRelease(BufMgrLock); + elog(FATAL,"buffer wasn't in the buffer table\n"); + + } + + if (buf->flags & BM_DIRTY) { + /* must clear flag first because of wierd race + * condition described below. + */ + buf->flags &= ~BM_DIRTY; + } + + /* record the database name and relation name for this buffer */ + buf->sb_relname = pstrdup(reln->rd_rel->relname.data); + buf->sb_dbname = pstrdup(GetDatabaseName()); + + /* remember which storage manager is responsible for it */ + buf->bufsmgr = reln->rd_rel->relsmgr; + + INIT_BUFFERTAG(&(buf->tag),reln,blockNum); + if (! BufTableInsert(buf)) { + SpinRelease(BufMgrLock); + elog(FATAL,"Buffer in lookup table twice \n"); + } + + /* Buffer contents are currently invalid. Have + * to mark IO IN PROGRESS so no one fiddles with + * them until the read completes. If this routine + * has been called simply to allocate a buffer, no + * io will be attempted, so the flag isnt set. + */ + if (!inProgress) { + buf->flags |= BM_IO_IN_PROGRESS; +#ifdef HAS_TEST_AND_SET + Assert(S_LOCK_FREE(&(buf->io_in_progress_lock))); + S_LOCK(&(buf->io_in_progress_lock)); +#endif /* HAS_TEST_AND_SET */ + } + +#ifdef BMTRACE + _bm_trace((reln->rd_rel->relisshared ? 0 : MyDatabaseId), reln->rd_id, blockNum, BufferDescriptorGetBuffer(buf), BMT_ALLOCNOTFND); +#endif /* BMTRACE */ + + SpinRelease(BufMgrLock); + + return (buf); +} + +/* + * WriteBuffer-- + * + * Pushes buffer contents to disk if LateWrite is + * not set. Otherwise, marks contents as dirty. + * + * Assume that buffer is pinned. Assume that reln is + * valid. + * + * Side Effects: + * Pin count is decremented. + */ + +#undef WriteBuffer + +int +WriteBuffer(Buffer buffer) +{ + BufferDesc *bufHdr; + + if (! 
LateWrite) { + return(FlushBuffer(buffer)); + } else { + + if (BufferIsLocal(buffer)) + return WriteLocalBuffer(buffer, TRUE); + + if (BAD_BUFFER_ID(buffer)) + return(FALSE); + + bufHdr = &BufferDescriptors[buffer-1]; + + SpinAcquire(BufMgrLock); + Assert(bufHdr->refcount > 0); + bufHdr->flags |= BM_DIRTY; + UnpinBuffer(bufHdr); + SpinRelease(BufMgrLock); + } + return(TRUE); +} + +void +WriteBuffer_Debug(char *file, int line, Buffer buffer) +{ + WriteBuffer(buffer); + if (ShowPinTrace && BufferIsLocal(buffer) && is_userbuffer(buffer)) { + BufferDesc *buf; + buf = &BufferDescriptors[buffer-1]; + fprintf(stderr, "UNPIN(WR) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + buffer, buf->sb_relname, buf->tag.blockNum, + PrivateRefCount[buffer - 1], file, line); + } +} + +/* + * DirtyBufferCopy() -- Copy a given dirty buffer to the requested + * destination. + * + * We treat this as a write. If the requested buffer is in the pool + * and is dirty, we copy it to the location requested and mark it + * clean. This routine supports the Sony jukebox storage manager, + * which agrees to take responsibility for the data once we mark + * it clean. + * + * NOTE: used by sony jukebox code in postgres 4.2 - ay 2/95 + */ +void +DirtyBufferCopy(Oid dbid, Oid relid, BlockNumber blkno, char *dest) +{ + BufferDesc *buf; + BufferTag btag; + + btag.relId.relId = relid; + btag.relId.dbId = dbid; + btag.blockNum = blkno; + + SpinAcquire(BufMgrLock); + buf = BufTableLookup(&btag); + + if (buf == (BufferDesc *) NULL + || !(buf->flags & BM_DIRTY) + || !(buf->flags & BM_VALID)) { + SpinRelease(BufMgrLock); + return; + } + + /* hate to do this holding the lock, but release and reacquire is slower */ + memmove(dest, (char *) MAKE_PTR(buf->data), BLCKSZ); + + buf->flags &= ~BM_DIRTY; + + SpinRelease(BufMgrLock); +} + +/* + * FlushBuffer -- like WriteBuffer, but force the page to disk. + * + * 'buffer' is known to be dirty/pinned, so there should not be a + * problem reading the BufferDesc members without the BufMgrLock + * (nobody should be able to change tags, flags, etc. out from under + * us). + */ +static int +FlushBuffer(Buffer buffer) +{ + BufferDesc *bufHdr; + + if (BufferIsLocal(buffer)) + return FlushLocalBuffer(buffer); + + if (BAD_BUFFER_ID(buffer)) + return (STATUS_ERROR); + + bufHdr = &BufferDescriptors[buffer-1]; + + if (!BufferReplace(bufHdr, false)) { + elog(WARN, "FlushBuffer: cannot flush %d", bufHdr->tag.blockNum); + return (STATUS_ERROR); + } + + SpinAcquire(BufMgrLock); + bufHdr->flags &= ~BM_DIRTY; + UnpinBuffer(bufHdr); + SpinRelease(BufMgrLock); + + return(STATUS_OK); +} + +/* + * WriteNoReleaseBuffer -- like WriteBuffer, but do not unpin the buffer + * when the operation is complete. + * + * We know that the buffer is for a relation in our private cache, + * because this routine is called only to write out buffers that + * were changed by the executing backend. + */ +int +WriteNoReleaseBuffer(Buffer buffer) +{ + BufferDesc *bufHdr; + + if (! LateWrite) { + return(FlushBuffer(buffer)); + } else { + + if (BufferIsLocal(buffer)) + return WriteLocalBuffer(buffer, FALSE); + + if (BAD_BUFFER_ID(buffer)) + return (STATUS_ERROR); + + bufHdr = &BufferDescriptors[buffer-1]; + + SpinAcquire(BufMgrLock); + bufHdr->flags |= BM_DIRTY; + SpinRelease(BufMgrLock); + } + return(STATUS_OK); +} + + +#undef ReleaseAndReadBuffer +/* + * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer() + * so that only one semop needs to be called. 
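A hedged sketch of the call pattern this routine is meant to support (illustrative only, not part of bufmgr.c): a caller scanning consecutive blocks moves its pin to the next block with one call instead of a separate ReleaseBuffer()/ReadBuffer() pair. 'reln' is assumed to be an open, non-empty Relation, and error handling is omitted.

    /* Illustrative only: walk the relation block by block. */
    BlockNumber nblocks = RelationGetNumberOfBlocks(reln);
    BlockNumber blkno;
    Buffer      buf = ReadBuffer(reln, 0);

    for (blkno = 1; blkno < nblocks; blkno++) {
        /* ... inspect BufferGetBlock(buf), i.e. block blkno - 1 ... */
        buf = ReleaseAndReadBuffer(buf, reln, blkno);   /* one semop, not two */
    }
    ReleaseBuffer(buf);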
+ * + */ +Buffer +ReleaseAndReadBuffer(Buffer buffer, + Relation relation, + BlockNumber blockNum) +{ + BufferDesc *bufHdr; + Buffer retbuf; + + if (BufferIsLocal(buffer)) { + Assert(LocalRefCount[-buffer - 1] > 0); + LocalRefCount[-buffer - 1]--; + } else { + if (BufferIsValid(buffer)) { + bufHdr = &BufferDescriptors[buffer-1]; + Assert(PrivateRefCount[buffer - 1] > 0); + PrivateRefCount[buffer - 1]--; + if (PrivateRefCount[buffer - 1] == 0 && + LastRefCount[buffer - 1] == 0) { + /* only release buffer if it is not pinned in previous ExecMain + level */ + SpinAcquire(BufMgrLock); + bufHdr->refcount--; + if (bufHdr->refcount == 0) { + AddBufferToFreelist(bufHdr); + bufHdr->flags |= BM_FREE; + } + retbuf = ReadBufferWithBufferLock(relation, blockNum, true); + return retbuf; + } + } + } + + return (ReadBuffer(relation, blockNum)); +} + +/* + * BufferSync -- Flush all dirty buffers in the pool. + * + * This is called at transaction commit time. It does the wrong thing, + * right now. We should flush only our own changes to stable storage, + * and we should obey the lock protocol on the buffer manager metadata + * as we do it. Also, we need to be sure that no other transaction is + * modifying the page as we flush it. This is only a problem for objects + * that use a non-two-phase locking protocol, like btree indices. For + * those objects, we would like to set a write lock for the duration of + * our IO. Another possibility is to code updates to btree pages + * carefully, so that writing them out out of order cannot cause + * any unrecoverable errors. + * + * I don't want to think hard about this right now, so I will try + * to come back to it later. + */ +static void +BufferSync() +{ + int i; + Oid bufdb; + Oid bufrel; + Relation reln; + BufferDesc *bufHdr; + int status; + + SpinAcquire(BufMgrLock); + for (i=0, bufHdr = BufferDescriptors; i < NBuffers; i++, bufHdr++) { + if ((bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY)) { + bufdb = bufHdr->tag.relId.dbId; + bufrel = bufHdr->tag.relId.relId; + if (bufdb == MyDatabaseId || bufdb == (Oid) 0) { + reln = RelationIdCacheGetRelation(bufrel); + + /* + * If we didn't have the reldesc in our local cache, flush this + * page out using the 'blind write' storage manager routine. If + * we did find it, use the standard interface. + */ + +#ifndef OPTIMIZE_SINGLE + SpinRelease(BufMgrLock); +#endif /* OPTIMIZE_SINGLE */ + if (reln == (Relation) NULL) { + status = smgrblindwrt(bufHdr->bufsmgr, bufHdr->sb_dbname, + bufHdr->sb_relname, bufdb, bufrel, + bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + } else { + status = smgrwrite(bufHdr->bufsmgr, reln, + bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + } +#ifndef OPTIMIZE_SINGLE + SpinAcquire(BufMgrLock); +#endif /* OPTIMIZE_SINGLE */ + + if (status == SM_FAIL) { + elog(WARN, "cannot write %d for %16s", + bufHdr->tag.blockNum, bufHdr->sb_relname); + } + + bufHdr->flags &= ~BM_DIRTY; + if (reln != (Relation)NULL) + RelationDecrementReferenceCount(reln); + } + } + } + SpinRelease(BufMgrLock); + + LocalBufferSync(); +} + + +/* + * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' + * is cleared. Because IO_IN_PROGRESS conflicts are + * expected to be rare, there is only one BufferIO + * lock in the entire system. All processes block + * on this semaphore when they try to use a buffer + * that someone else is faulting in. Whenever a + * process finishes an IO and someone is waiting for + * the buffer, BufferIO is signaled (SignalIO). 
All + * waiting processes then wake up and check to see + * if their buffer is now ready. This implementation + * is simple, but efficient enough if WaitIO is + * rarely called by multiple processes simultaneously. + * + * ProcSleep atomically releases the spinlock and goes to + * sleep. + * + * Note: there is an easy fix if the queue becomes long. + * save the id of the buffer we are waiting for in + * the queue structure. That way signal can figure + * out which proc to wake up. + */ +#ifdef HAS_TEST_AND_SET +static void +WaitIO(BufferDesc *buf, SPINLOCK spinlock) +{ + SpinRelease(spinlock); + S_LOCK(&(buf->io_in_progress_lock)); + S_UNLOCK(&(buf->io_in_progress_lock)); + SpinAcquire(spinlock); +} + +#else /* HAS_TEST_AND_SET */ +IpcSemaphoreId WaitIOSemId; + +static void +WaitIO(BufferDesc *buf, SPINLOCK spinlock) +{ + bool inProgress; + + for (;;) { + + /* wait until someone releases IO lock */ + (*NWaitIOBackendP)++; + SpinRelease(spinlock); + IpcSemaphoreLock(WaitIOSemId, 0, 1); + SpinAcquire(spinlock); + inProgress = (buf->flags & BM_IO_IN_PROGRESS); + if (!inProgress) break; + } +} + +/* + * SignalIO -- + */ +static void +SignalIO(BufferDesc *buf) +{ + /* somebody better be waiting. */ + Assert( buf->refcount > 1); + IpcSemaphoreUnlock(WaitIOSemId, 0, *NWaitIOBackendP); + *NWaitIOBackendP = 0; +} +#endif /* HAS_TEST_AND_SET */ + +long NDirectFileRead; /* some I/O's are direct file access. bypass bufmgr */ +long NDirectFileWrite; /* e.g., I/O in psort and hashjoin. */ + +void +PrintBufferUsage(FILE *statfp) +{ + float hitrate; + + if (ReadBufferCount==0) + hitrate = 0.0; + else + hitrate = (float)BufferHitCount * 100.0/ReadBufferCount; + + fprintf(statfp, "!\t%ld blocks read, %ld blocks written, buffer hit rate = %.2f%%\n", + ReadBufferCount - BufferHitCount + NDirectFileRead, + BufferFlushCount + NDirectFileWrite, + hitrate); +} + +void +ResetBufferUsage() +{ + BufferHitCount = 0; + ReadBufferCount = 0; + BufferFlushCount = 0; + NDirectFileRead = 0; + NDirectFileWrite = 0; +} + +/* ---------------------------------------------- + * ResetBufferPool + * + * this routine is supposed to be called when a transaction aborts. + * it will release all the buffer pins held by the transaciton. + * + * ---------------------------------------------- + */ +void +ResetBufferPool() +{ + register int i; + for (i=1; i<=NBuffers; i++) { + if (BufferIsValid(i)) { + while(PrivateRefCount[i - 1] > 0) { + ReleaseBuffer(i); + } + } + LastRefCount[i - 1] = 0; + } + + ResetLocalBufferPool(); +} + +/* ----------------------------------------------- + * BufferPoolCheckLeak + * + * check if there is buffer leak + * + * ----------------------------------------------- + */ +int +BufferPoolCheckLeak() +{ + register int i; + void PrintBufferDescs(); + + for (i = 1; i <= NBuffers; i++) { + if (BufferIsValid(i)) { + elog(NOTICE, "buffer leak detected in BufferPoolCheckLeak()"); + PrintBufferDescs(); + return(1); + } + } + return(0); +} + +/* ------------------------------------------------ + * FlushBufferPool + * + * flush all dirty blocks in buffer pool to disk + * + * ------------------------------------------------ + */ +void +FlushBufferPool(int StableMainMemoryFlag) +{ + if (!StableMainMemoryFlag) { + BufferSync(); + smgrcommit(); + } +} + +/* + * BufferIsValid -- + * True iff the refcnt of the local buffer is > 0 + * Note: + * BufferIsValid(InvalidBuffer) is False. + * BufferIsValid(UnknownBuffer) is False. 
+ */ +bool +BufferIsValid(Buffer bufnum) +{ + if (BufferIsLocal(bufnum)) + return (bufnum >= -NLocBuffer && LocalRefCount[-bufnum - 1] > 0); + + if (BAD_BUFFER_ID(bufnum)) + return(false); + + return ((bool)(PrivateRefCount[bufnum - 1] > 0)); +} + +/* + * BufferGetBlockNumber -- + * Returns the block number associated with a buffer. + * + * Note: + * Assumes that the buffer is valid. + */ +BlockNumber +BufferGetBlockNumber(Buffer buffer) +{ + Assert(BufferIsValid(buffer)); + + /* XXX should be a critical section */ + if (BufferIsLocal(buffer)) + return (LocalBufferDescriptors[-buffer-1].tag.blockNum); + else + return (BufferDescriptors[buffer-1].tag.blockNum); +} + +/* + * BufferGetRelation -- + * Returns the relation desciptor associated with a buffer. + * + * Note: + * Assumes buffer is valid. + */ +Relation +BufferGetRelation(Buffer buffer) +{ + Relation relation; + Oid relid; + + Assert(BufferIsValid(buffer)); + Assert(!BufferIsLocal(buffer)); /* not supported for local buffers */ + + /* XXX should be a critical section */ + relid = LRelIdGetRelationId(BufferDescriptors[buffer-1].tag.relId); + relation = RelationIdGetRelation(relid); + + RelationDecrementReferenceCount(relation); + + if (RelationHasReferenceCountZero(relation)) { + /* + elog(NOTICE, "BufferGetRelation: 0->1"); + */ + + RelationIncrementReferenceCount(relation); + } + + return (relation); +} + +/* + * BufferReplace + * + * Flush the buffer corresponding to 'bufHdr' + * + * Assumes that the BufMgrLock has NOT been acquired. + */ +static int +BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld) +{ + Relation reln; + Oid bufdb, bufrel; + int status; + + if (!bufferLockHeld) + SpinAcquire(BufMgrLock); + + /* + * first try to find the reldesc in the cache, if no luck, + * don't bother to build the reldesc from scratch, just do + * a blind write. + */ + + bufdb = bufHdr->tag.relId.dbId; + bufrel = bufHdr->tag.relId.relId; + + if (bufdb == MyDatabaseId || bufdb == (Oid) NULL) + reln = RelationIdCacheGetRelation(bufrel); + else + reln = (Relation) NULL; + + SpinRelease(BufMgrLock); + + if (reln != (Relation) NULL) { + status = smgrflush(bufHdr->bufsmgr, reln, bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + } else { + + /* blind write always flushes */ + status = smgrblindwrt(bufHdr->bufsmgr, bufHdr->sb_dbname, + bufHdr->sb_relname, bufdb, bufrel, + bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + } + + if (status == SM_FAIL) + return (FALSE); + + return (TRUE); +} + +/* + * RelationGetNumberOfBlocks -- + * Returns the buffer descriptor associated with a page in a relation. + * + * Note: + * XXX may fail for huge relations. + * XXX should be elsewhere. + * XXX maybe should be hidden + */ +BlockNumber +RelationGetNumberOfBlocks(Relation relation) +{ + return + ((relation->rd_islocal) ? relation->rd_nblocks : + smgrnblocks(relation->rd_rel->relsmgr, relation)); +} + +/* + * BufferGetBlock -- + * Returns a reference to a disk page image associated with a buffer. + * + * Note: + * Assumes buffer is valid. 
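A small illustrative sketch (not from the original file) of the typical pin/access/unpin cycle using the routines defined here; 'reln' and 'blockNum' are assumed to be in scope:

    /* Illustrative only. */
    Buffer buf = ReadBuffer(reln, blockNum);

    if (BufferIsValid(buf)) {
        Block page = BufferGetBlock(buf);   /* pointer into the buffer pool */

        /* ... modify the page image through 'page' ... */

        WriteBuffer(buf);   /* marks the buffer dirty and drops the pin;
                             * ReleaseBuffer() would unpin without dirtying */
    }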
+ */ +Block +BufferGetBlock(Buffer buffer) +{ + Assert(BufferIsValid(buffer)); + + if (BufferIsLocal(buffer)) + return((Block)MAKE_PTR(LocalBufferDescriptors[-buffer-1].data)); + else + return((Block)MAKE_PTR(BufferDescriptors[buffer-1].data)); +} + +/* --------------------------------------------------------------------- + * ReleaseTmpRelBuffers + * + * this function unmarks all the dirty pages of a temporary + * relation in the buffer pool so that at the end of transaction + * these pages will not be flushed. + * XXX currently it sequentially searches the buffer pool, should be + * changed to more clever ways of searching. + * -------------------------------------------------------------------- + */ +void +ReleaseTmpRelBuffers(Relation tempreldesc) +{ + register int i; + int holding = 0; + BufferDesc *buf; + + for (i=1; i<=NBuffers; i++) { + buf = &BufferDescriptors[i-1]; + if (!holding) { + SpinAcquire(BufMgrLock); + holding = 1; + } + if ((buf->flags & BM_DIRTY) && + (buf->tag.relId.dbId == MyDatabaseId) && + (buf->tag.relId.relId == tempreldesc->rd_id)) { + buf->flags &= ~BM_DIRTY; + if (!(buf->flags & BM_FREE)) { + SpinRelease(BufMgrLock); + holding = 0; + ReleaseBuffer(i); + } + } + } + if (holding) + SpinRelease(BufMgrLock); +} + +/* --------------------------------------------------------------------- + * DropBuffers + * + * This function marks all the buffers in the buffer cache for a + * particular database as clean. This is used when we destroy a + * database, to avoid trying to flush data to disk when the directory + * tree no longer exists. + * + * This is an exceedingly non-public interface. + * -------------------------------------------------------------------- + */ +void +DropBuffers(Oid dbid) +{ + register int i; + BufferDesc *buf; + + SpinAcquire(BufMgrLock); + for (i=1; i<=NBuffers; i++) { + buf = &BufferDescriptors[i-1]; + if ((buf->tag.relId.dbId == dbid) && (buf->flags & BM_DIRTY)) { + buf->flags &= ~BM_DIRTY; + } + } + SpinRelease(BufMgrLock); +} + +/* ----------------------------------------------------------------- + * PrintBufferDescs + * + * this function prints all the buffer descriptors, for debugging + * use only. 
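A note on the indexing convention these routines rely on: shared buffers are numbered 1..NBuffers and index BufferDescriptors[buffer-1], while local buffers are negative and index LocalBufferDescriptors[-buffer-1]. A small helper written out for illustration only (not part of the original source), following the same convention:

    static BufferDesc *
    buffer_to_descriptor(Buffer buffer)
    {
        if (BufferIsLocal(buffer))
            return &LocalBufferDescriptors[-buffer - 1];  /* buffer < 0        */
        return &BufferDescriptors[buffer - 1];            /* 1 .. NBuffers     */
    }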
+ * ----------------------------------------------------------------- + */ +void +PrintBufferDescs() +{ + int i; + BufferDesc *buf = BufferDescriptors; + + if (IsUnderPostmaster) { + SpinAcquire(BufMgrLock); + for (i = 0; i < NBuffers; ++i, ++buf) { + elog(NOTICE, "[%02d] (freeNext=%d, freePrev=%d, relname=%.*s, \ +blockNum=%d, flags=0x%x, refcount=%d %d)", + i, buf->freeNext, buf->freePrev, NAMEDATALEN, + &(buf->sb_relname), buf->tag.blockNum, buf->flags, + buf->refcount, PrivateRefCount[i]); + } + SpinRelease(BufMgrLock); + } else { + /* interactive backend */ + for (i = 0; i < NBuffers; ++i, ++buf) { + printf("[%-2d] (%s, %d) flags=0x%x, refcnt=%d %ld)\n", + i, buf->sb_relname, buf->tag.blockNum, + buf->flags, buf->refcount, PrivateRefCount[i]); + } + } +} + +void +PrintPinnedBufs() +{ + int i; + BufferDesc *buf = BufferDescriptors; + + SpinAcquire(BufMgrLock); + for (i = 0; i < NBuffers; ++i, ++buf) { + if (PrivateRefCount[i] > 0) + elog(NOTICE, "[%02d] (freeNext=%d, freePrev=%d, relname=%.*s, \ +blockNum=%d, flags=0x%x, refcount=%d %d)\n", + i, buf->freeNext, buf->freePrev, NAMEDATALEN, &(buf->sb_relname), + buf->tag.blockNum, buf->flags, + buf->refcount, PrivateRefCount[i]); + } + SpinRelease(BufMgrLock); +} + +/* + * BufferPoolBlowaway + * + * this routine is solely for the purpose of experiments -- sometimes + * you may want to blowaway whatever is left from the past in buffer + * pool and start measuring some performance with a clean empty buffer + * pool. + */ +void +BufferPoolBlowaway() +{ + register int i; + + BufferSync(); + for (i=1; i<=NBuffers; i++) { + if (BufferIsValid(i)) { + while(BufferIsValid(i)) + ReleaseBuffer(i); + } + BufTableDelete(&BufferDescriptors[i-1]); + } +} + +#undef IncrBufferRefCount +#undef ReleaseBuffer + +void +IncrBufferRefCount(Buffer buffer) +{ + if (BufferIsLocal(buffer)) { + Assert(LocalRefCount[-buffer - 1] >= 0); + LocalRefCount[-buffer - 1]++; + } else { + Assert(!BAD_BUFFER_ID(buffer)); + Assert(PrivateRefCount[buffer - 1] >= 0); + PrivateRefCount[buffer - 1]++; + } +} + +/* + * ReleaseBuffer -- remove the pin on a buffer without + * marking it dirty. 
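The pin bookkeeping is two-level: PrivateRefCount[] counts this backend's pins, while the shared refcount in the descriptor counts backends holding at least one pin. A hedged sketch of the net effect of a pin/release pair on those counters (LastRefCount and spinlocking omitted for clarity):

    IncrBufferRefCount(buffer);  /* PrivateRefCount[buffer-1]++ (backend-local) */

    ReleaseBuffer(buffer);       /* PrivateRefCount[buffer-1]--; when it reaches
                                  * zero the shared bufHdr->refcount is dropped,
                                  * and at shared refcount zero the buffer goes
                                  * back on the freelist with BM_FREE set.
                                  */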
+ * + */ +int +ReleaseBuffer(Buffer buffer) +{ + BufferDesc *bufHdr; + + if (BufferIsLocal(buffer)) { + Assert(LocalRefCount[-buffer - 1] > 0); + LocalRefCount[-buffer - 1]--; + return (STATUS_OK); + } + + if (BAD_BUFFER_ID(buffer)) + return(STATUS_ERROR); + + bufHdr = &BufferDescriptors[buffer-1]; + + Assert(PrivateRefCount[buffer - 1] > 0); + PrivateRefCount[buffer - 1]--; + if (PrivateRefCount[buffer - 1] == 0 && LastRefCount[buffer - 1] == 0) { + /* only release buffer if it is not pinned in previous ExecMain + levels */ + SpinAcquire(BufMgrLock); + bufHdr->refcount--; + if (bufHdr->refcount == 0) { + AddBufferToFreelist(bufHdr); + bufHdr->flags |= BM_FREE; + } + SpinRelease(BufMgrLock); + } + + return(STATUS_OK); +} + +void +IncrBufferRefCount_Debug(char *file, int line, Buffer buffer) +{ + IncrBufferRefCount(buffer); + if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer)) { + BufferDesc *buf = &BufferDescriptors[buffer-1]; + + fprintf(stderr, "PIN(Incr) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + buffer, buf->sb_relname, buf->tag.blockNum, + PrivateRefCount[buffer - 1], file, line); + } +} + +void +ReleaseBuffer_Debug(char *file, int line, Buffer buffer) +{ + ReleaseBuffer(buffer); + if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer)) { + BufferDesc *buf = &BufferDescriptors[buffer-1]; + + fprintf(stderr, "UNPIN(Rel) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + buffer, buf->sb_relname, buf->tag.blockNum, + PrivateRefCount[buffer - 1], file, line); + } +} + +int +ReleaseAndReadBuffer_Debug(char *file, + int line, + Buffer buffer, + Relation relation, + BlockNumber blockNum) +{ + bool bufferValid; + Buffer b; + + bufferValid = BufferIsValid(buffer); + b = ReleaseAndReadBuffer(buffer, relation, blockNum); + if (ShowPinTrace && bufferValid && BufferIsLocal(buffer) + && is_userbuffer(buffer)) { + BufferDesc *buf = &BufferDescriptors[buffer-1]; + + fprintf(stderr, "UNPIN(Rel&Rd) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + buffer, buf->sb_relname, buf->tag.blockNum, + PrivateRefCount[buffer - 1], file, line); + } + if (ShowPinTrace && BufferIsLocal(buffer) && is_userbuffer(buffer)) { + BufferDesc *buf = &BufferDescriptors[b-1]; + + fprintf(stderr, "PIN(Rel&Rd) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + b, buf->sb_relname, buf->tag.blockNum, + PrivateRefCount[b - 1], file, line); + } + return b; +} + +#ifdef BMTRACE + +/* + * trace allocations and deallocations in a circular buffer in + * shared memory. check the buffer before doing the allocation, + * and die if there's anything fishy. 
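The trace records live in a fixed-size circular array of BMT_LIMIT entries in shared memory, with *CurTraceBuf naming the next write slot. A short sketch of the wraparound arithmetic the tracing code uses (illustrative only):

    long start  = *CurTraceBuf;                 /* slot the next record will use */
    long newest = (start > 0) ? start - 1       /* slot written most recently    */
                              : BMT_LIMIT - 1;  /* wraps around at the bottom    */
    /* after a record is written: *CurTraceBuf = (start + 1) % BMT_LIMIT; */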
+ */ + +_bm_trace(Oid dbId, Oid relId, int blkNo, int bufNo, int allocType) +{ + static int mypid = 0; + long start, cur; + bmtrace *tb; + + if (mypid == 0) + mypid = getpid(); + + start = *CurTraceBuf; + + if (start > 0) + cur = start - 1; + else + cur = BMT_LIMIT - 1; + + for (;;) { + tb = &TraceBuf[cur]; + if (tb->bmt_op != BMT_NOTUSED) { + if (tb->bmt_buf == bufNo) { + if ((tb->bmt_op == BMT_DEALLOC) + || (tb->bmt_dbid == dbId && tb->bmt_relid == relId + && tb->bmt_blkno == blkNo)) + goto okay; + + /* die holding the buffer lock */ + _bm_die(dbId, relId, blkNo, bufNo, allocType, start, cur); + } + } + + if (cur == start) + goto okay; + + if (cur == 0) + cur = BMT_LIMIT - 1; + else + cur--; + } + + okay: + tb = &TraceBuf[start]; + tb->bmt_pid = mypid; + tb->bmt_buf = bufNo; + tb->bmt_dbid = dbId; + tb->bmt_relid = relId; + tb->bmt_blkno = blkNo; + tb->bmt_op = allocType; + + *CurTraceBuf = (start + 1) % BMT_LIMIT; +} + +_bm_die(Oid dbId, Oid relId, int blkNo, int bufNo, + int allocType, long start, long cur) +{ + FILE *fp; + bmtrace *tb; + int i; + + tb = &TraceBuf[cur]; + + if ((fp = fopen("/tmp/death_notice", "w")) == (FILE *) NULL) + elog(FATAL, "buffer alloc trace error and can't open log file"); + + fprintf(fp, "buffer alloc trace detected the following error:\n\n"); + fprintf(fp, " buffer %d being %s inconsistently with a previous %s\n\n", + bufNo, (allocType == BMT_DEALLOC ? "deallocated" : "allocated"), + (tb->bmt_op == BMT_DEALLOC ? "deallocation" : "allocation")); + + fprintf(fp, "the trace buffer contains:\n"); + + i = start; + for (;;) { + tb = &TraceBuf[i]; + if (tb->bmt_op != BMT_NOTUSED) { + fprintf(fp, " [%3d]%spid %d buf %2d for <%d,%d,%d> ", + i, (i == cur ? " ---> " : "\t"), + tb->bmt_pid, tb->bmt_buf, + tb->bmt_dbid, tb->bmt_relid, tb->bmt_blkno); + + switch (tb->bmt_op) { + case BMT_ALLOCFND: + fprintf(fp, "allocate (found)\n"); + break; + + case BMT_ALLOCNOTFND: + fprintf(fp, "allocate (not found)\n"); + break; + + case BMT_DEALLOC: + fprintf(fp, "deallocate\n"); + break; + + default: + fprintf(fp, "unknown op type %d\n", tb->bmt_op); + break; + } + } + + i = (i + 1) % BMT_LIMIT; + if (i == start) + break; + } + + fprintf(fp, "\noperation causing error:\n"); + fprintf(fp, "\tpid %d buf %d for <%d,%d,%d> ", + getpid(), bufNo, dbId, relId, blkNo); + + switch (allocType) { + case BMT_ALLOCFND: + fprintf(fp, "allocate (found)\n"); + break; + + case BMT_ALLOCNOTFND: + fprintf(fp, "allocate (not found)\n"); + break; + + case BMT_DEALLOC: + fprintf(fp, "deallocate\n"); + break; + + default: + fprintf(fp, "unknown op type %d\n", allocType); + break; + } + + (void) fclose(fp); + + kill(getpid(), SIGILL); +} + +#endif /* BMTRACE */ + +void +BufferRefCountReset(int *refcountsave) +{ + int i; + for (i=0; i<NBuffers; i++) { + refcountsave[i] = PrivateRefCount[i]; + LastRefCount[i] += PrivateRefCount[i]; + PrivateRefCount[i] = 0; + } +} + +void +BufferRefCountRestore(int *refcountsave) +{ + int i; + for (i=0; i<NBuffers; i++) { + PrivateRefCount[i] = refcountsave[i]; + LastRefCount[i] -= refcountsave[i]; + refcountsave[i] = 0; + } +} + diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c new file mode 100644 index 00000000000..fabc3c29829 --- /dev/null +++ b/src/backend/storage/buffer/freelist.c @@ -0,0 +1,285 @@ +/*------------------------------------------------------------------------- + * + * freelist.c-- + * routines for manipulating the buffer pool's replacement strategy + * freelist. 
+ * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/buffer/freelist.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +/* + * OLD COMMENTS + * + * Data Structures: + * SharedFreeList is a circular queue. Notice that this + * is a shared memory queue so the next/prev "ptrs" are + * buffer ids, not addresses. + * + * Sync: all routines in this file assume that the buffer + * semaphore has been acquired by the caller. + */ +#include <stdio.h> +#include "storage/bufmgr.h" +#include "storage/buf_internals.h" /* where declarations go */ +#include "storage/spin.h" +#include "utils/elog.h" + + +static BufferDesc *SharedFreeList; + +/* only actually used in debugging. The lock + * should be acquired before calling the freelist manager. + */ +extern SPINLOCK BufMgrLock; + +#define IsInQueue(bf) \ + Assert((bf->freeNext != INVALID_DESCRIPTOR));\ + Assert((bf->freePrev != INVALID_DESCRIPTOR));\ + Assert((bf->flags & BM_FREE)) + +#define NotInQueue(bf) \ + Assert((bf->freeNext == INVALID_DESCRIPTOR));\ + Assert((bf->freePrev == INVALID_DESCRIPTOR));\ + Assert(! (bf->flags & BM_FREE)) + + +/* + * AddBufferToFreelist -- + * + * In theory, this is the only routine that needs to be changed + * if the buffer replacement strategy changes. Just change + * the manner in which buffers are added to the freelist queue. + * Currently, they are added on an LRU basis. + */ +void +AddBufferToFreelist(BufferDesc *bf) +{ +#ifdef BMTRACE + _bm_trace(bf->tag.relId.dbId, bf->tag.relId.relId, bf->tag.blockNum, + BufferDescriptorGetBuffer(bf), BMT_DEALLOC); +#endif /* BMTRACE */ + NotInQueue(bf); + + /* change bf so it points to inFrontOfNew and its successor */ + bf->freePrev = SharedFreeList->freePrev; + bf->freeNext = Free_List_Descriptor; + + /* insert new into chain */ + BufferDescriptors[bf->freeNext].freePrev = bf->buf_id; + BufferDescriptors[bf->freePrev].freeNext = bf->buf_id; +} + +#undef PinBuffer + +/* + * PinBuffer -- make buffer unavailable for replacement. + */ +void +PinBuffer(BufferDesc *buf) +{ + long b; + + /* Assert (buf->refcount < 25); */ + + if (buf->refcount == 0) { + IsInQueue(buf); + + /* remove from freelist queue */ + BufferDescriptors[buf->freeNext].freePrev = buf->freePrev; + BufferDescriptors[buf->freePrev].freeNext = buf->freeNext; + buf->freeNext = buf->freePrev = INVALID_DESCRIPTOR; + + /* mark buffer as no longer free */ + buf->flags &= ~BM_FREE; + } else { + NotInQueue(buf); + } + + b = BufferDescriptorGetBuffer(buf) - 1; + Assert(PrivateRefCount[b] >= 0); + if (PrivateRefCount[b] == 0 && LastRefCount[b] == 0) + buf->refcount++; + PrivateRefCount[b]++; +} + +void +PinBuffer_Debug(char *file, int line, BufferDesc *buf) +{ + PinBuffer(buf); + if (ShowPinTrace) { + Buffer buffer = BufferDescriptorGetBuffer(buf); + + fprintf(stderr, "PIN(Pin) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + buffer, buf->sb_relname, buf->tag.blockNum, + PrivateRefCount[buffer - 1], file, line); + } +} + +#undef UnpinBuffer + +/* + * UnpinBuffer -- make buffer available for replacement. 
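Because the freelist lives in shared memory, its links are buffer ids (indices into BufferDescriptors), not pointers. A sketch of the unlink step, written out for a descriptor `bf` already known to be on the list (this mirrors the patching done in PinBuffer and GetFreeBuffer; illustrative only):

    BufferDescriptors[bf->freeNext].freePrev = bf->freePrev;  /* successor skips bf   */
    BufferDescriptors[bf->freePrev].freeNext = bf->freeNext;  /* predecessor skips bf */
    bf->freeNext = bf->freePrev = INVALID_DESCRIPTOR;         /* bf is now off-list   */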
+ */ +void +UnpinBuffer(BufferDesc *buf) +{ + long b = BufferDescriptorGetBuffer(buf) - 1; + + Assert(buf->refcount); + Assert(PrivateRefCount[b] > 0); + PrivateRefCount[b]--; + if (PrivateRefCount[b] == 0 && LastRefCount[b] == 0) + buf->refcount--; + NotInQueue(buf); + + if (buf->refcount == 0) { + AddBufferToFreelist(buf); + buf->flags |= BM_FREE; + } else { + /* do nothing */ + } +} + +void +UnpinBuffer_Debug(char *file, int line, BufferDesc *buf) +{ + UnpinBuffer(buf); + if (ShowPinTrace) { + Buffer buffer = BufferDescriptorGetBuffer(buf); + + fprintf(stderr, "UNPIN(Unpin) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + buffer, buf->sb_relname, buf->tag.blockNum, + PrivateRefCount[buffer - 1], file, line); + } +} + +/* + * GetFreeBuffer() -- get the 'next' buffer from the freelist. + * + */ +BufferDesc * +GetFreeBuffer() +{ + BufferDesc *buf; + + if (Free_List_Descriptor == SharedFreeList->freeNext) { + + /* queue is empty. All buffers in the buffer pool are pinned. */ + elog(WARN,"out of free buffers: time to abort !\n"); + return(NULL); + } + buf = &(BufferDescriptors[SharedFreeList->freeNext]); + + /* remove from freelist queue */ + BufferDescriptors[buf->freeNext].freePrev = buf->freePrev; + BufferDescriptors[buf->freePrev].freeNext = buf->freeNext; + buf->freeNext = buf->freePrev = INVALID_DESCRIPTOR; + + buf->flags &= ~(BM_FREE); + + return(buf); +} + +/* + * InitFreeList -- initialize the dummy buffer descriptor used + * as a freelist head. + * + * Assume: All of the buffers are already linked in a circular + * queue. Only called by postmaster and only during + * initialization. + */ +void +InitFreeList(bool init) +{ + SharedFreeList = &(BufferDescriptors[Free_List_Descriptor]); + + if (init) { + /* we only do this once, normally the postmaster */ + SharedFreeList->data = INVALID_OFFSET; + SharedFreeList->flags = 0; + SharedFreeList->flags &= ~(BM_VALID | BM_DELETED | BM_FREE); + SharedFreeList->buf_id = Free_List_Descriptor; + + /* insert it into a random spot in the circular queue */ + SharedFreeList->freeNext = BufferDescriptors[0].freeNext; + SharedFreeList->freePrev = 0; + BufferDescriptors[SharedFreeList->freeNext].freePrev = + BufferDescriptors[SharedFreeList->freePrev].freeNext = + Free_List_Descriptor; + } +} + + +/* + * print out the free list and check for breaks. + */ +void +DBG_FreeListCheck(int nfree) +{ + int i; + BufferDesc *buf; + + buf = &(BufferDescriptors[SharedFreeList->freeNext]); + for (i=0;i<nfree;i++,buf = &(BufferDescriptors[buf->freeNext])) { + + if (! 
(buf->flags & (BM_FREE))){ + if (buf != SharedFreeList) { + printf("\tfree list corrupted: %d flags %x\n", + buf->buf_id,buf->flags); + } else { + printf("\tfree list corrupted: too short -- %d not %d\n", + i,nfree); + + } + + + } + if ((BufferDescriptors[buf->freeNext].freePrev != buf->buf_id) || + (BufferDescriptors[buf->freePrev].freeNext != buf->buf_id)) { + printf("\tfree list links corrupted: %d %ld %ld\n", + buf->buf_id,buf->freePrev,buf->freeNext); + } + + } + if (buf != SharedFreeList) { + printf("\tfree list corrupted: %d-th buffer is %d\n", + nfree,buf->buf_id); + + } +} + +/* + * PrintBufferFreeList - + * prints the buffer free list, for debugging + */ +void +PrintBufferFreeList() +{ + BufferDesc *buf; + + if (SharedFreeList->freeNext == Free_List_Descriptor) { + printf("free list is empty.\n"); + return; + } + + buf = &(BufferDescriptors[SharedFreeList->freeNext]); + for (;;) { + int i = (buf - BufferDescriptors); + printf("[%-2d] (%s, %d) flags=0x%x, refcnt=%d %ld, nxt=%ld prv=%ld)\n", + i, buf->sb_relname, buf->tag.blockNum, + buf->flags, buf->refcount, PrivateRefCount[i], + buf->freeNext, buf->freePrev); + + if (buf->freeNext == Free_List_Descriptor) + break; + + buf = &(BufferDescriptors[buf->freeNext]); + } +} diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c new file mode 100644 index 00000000000..ec625940867 --- /dev/null +++ b/src/backend/storage/buffer/localbuf.c @@ -0,0 +1,284 @@ +/*------------------------------------------------------------------------- + * + * localbuf.c-- + * local buffer manager. Fast buffer manager for temporary tables + * or special cases when the operation is not visible to other backends. + * + * When a relation is being created, the descriptor will have rd_islocal + * set to indicate that the local buffer manager should be used. During + * the same transaction the relation is being created, any inserts or + * selects from the newly created relation will use the local buffer + * pool. rd_islocal is reset at the end of a transaction (commit/abort). + * This is useful for queries like SELECT INTO TABLE and create index. + * + * Copyright (c) 1994-5, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/buffer/localbuf.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include <sys/file.h> +#include <stdio.h> +#include <math.h> +#include <signal.h> + +/* declarations split between these three files */ +#include "storage/buf.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" + +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/shmem.h" +#include "storage/spin.h" +#include "storage/smgr.h" +#include "storage/lmgr.h" +#include "miscadmin.h" +#include "utils/builtins.h" +#include "utils/hsearch.h" +#include "utils/elog.h" +#include "utils/memutils.h" +#include "executor/execdebug.h" /* for NDirectFileRead */ +#include "catalog/catalog.h" + +int NLocBuffer = 64; +BufferDesc *LocalBufferDescriptors = NULL; +long *LocalRefCount = NULL; + +static int nextFreeLocalBuf = 0; + +/*#define LBDEBUG*/ + +/* + * LocalBufferAlloc - + * allocate a local buffer. We do round robin allocation for now. 
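As the file header explains, callers reach this allocator only when the relation descriptor says so. A hedged sketch of the dispatch; BufferAlloc is used as a stand-in name for the shared-pool allocator in bufmgr.c, which is not part of this hunk:

    /* Illustrative only: routing between the shared and local buffer pools. */
    BufferDesc *bufHdr;
    bool        found;

    if (reln->rd_islocal)
        bufHdr = LocalBufferAlloc(reln, blockNum, &found);   /* private pool */
    else
        bufHdr = BufferAlloc(reln, blockNum, &found);        /* shared pool (name assumed) */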
+ */ +BufferDesc * +LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr) +{ + int i; + BufferDesc *bufHdr = (BufferDesc *) NULL; + + if (blockNum == P_NEW) { + blockNum = reln->rd_nblocks; + reln->rd_nblocks++; + } + + /* a low tech search for now -- not optimized for scans */ + for (i=0; i < NLocBuffer; i++) { + if (LocalBufferDescriptors[i].tag.relId.relId == reln->rd_id && + LocalBufferDescriptors[i].tag.blockNum == blockNum) { + +#ifdef LBDEBUG + fprintf(stderr, "LB ALLOC (%d,%d) %d\n", + reln->rd_id, blockNum, -i-1); +#endif + LocalRefCount[i]++; + *foundPtr = TRUE; + return &LocalBufferDescriptors[i]; + } + } + +#ifdef LBDEBUG + fprintf(stderr, "LB ALLOC (%d,%d) %d\n", + reln->rd_id, blockNum, -nextFreeLocalBuf-1); +#endif + + /* need to get a new buffer (round robin for now) */ + for(i=0; i < NLocBuffer; i++) { + int b = (nextFreeLocalBuf + i) % NLocBuffer; + + if (LocalRefCount[b]==0) { + bufHdr = &LocalBufferDescriptors[b]; + LocalRefCount[b]++; + nextFreeLocalBuf = (b + 1) % NLocBuffer; + break; + } + } + if (bufHdr==NULL) + elog(WARN, "no empty local buffer."); + + /* + * this buffer is not referenced but it might still be dirty (the + * last transaction to touch it doesn't need its contents but has + * not flushed it). if that's the case, write it out before + * reusing it! + */ + if (bufHdr->flags & BM_DIRTY) { + Relation bufrel = RelationIdCacheGetRelation(bufHdr->tag.relId.relId); + + Assert(bufrel != NULL); + + /* flush this page */ + smgrwrite(bufrel->rd_rel->relsmgr, bufrel, bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + } + + /* + * it's all ours now. + */ + bufHdr->tag.relId.relId = reln->rd_id; + bufHdr->tag.blockNum = blockNum; + bufHdr->flags &= ~BM_DIRTY; + + /* + * lazy memory allocation. (see MAKE_PTR for why we need to do + * MAKE_OFFSET.) + */ + if (bufHdr->data == (SHMEM_OFFSET)0) { + char *data = (char *)malloc(BLCKSZ); + + bufHdr->data = MAKE_OFFSET(data); + } + + *foundPtr = FALSE; + return bufHdr; +} + +/* + * WriteLocalBuffer - + * writes out a local buffer + */ +int +WriteLocalBuffer(Buffer buffer, bool release) +{ + int bufid; + + Assert(BufferIsLocal(buffer)); + +#ifdef LBDEBUG + fprintf(stderr, "LB WRITE %d\n", buffer); +#endif + + bufid = - (buffer + 1); + LocalBufferDescriptors[bufid].flags |= BM_DIRTY; + + if (release) { + Assert(LocalRefCount[bufid] > 0); + LocalRefCount[bufid]--; + } + + return true; +} + +/* + * FlushLocalBuffer - + * flushes a local buffer + */ +int +FlushLocalBuffer(Buffer buffer) +{ + int bufid; + Relation bufrel; + BufferDesc *bufHdr; + + Assert(BufferIsLocal(buffer)); + +#ifdef LBDEBUG + fprintf(stderr, "LB FLUSH %d\n", buffer); +#endif + + bufid = - (buffer + 1); + bufHdr = &LocalBufferDescriptors[bufid]; + bufHdr->flags &= ~BM_DIRTY; + bufrel = RelationIdCacheGetRelation(bufHdr->tag.relId.relId); + + Assert(bufrel != NULL); + smgrflush(bufrel->rd_rel->relsmgr, bufrel, bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + + Assert(LocalRefCount[bufid] > 0); + LocalRefCount[bufid]--; + + return true; +} + +/* + * InitLocalBuffer - + * init the local buffer cache. Since most queries (esp. multi-user ones) + * don't involve local buffers, we delay allocating memory for actual the + * buffer until we need it. + */ +void +InitLocalBuffer() +{ + int i; + + /* + * these aren't going away. I'm not gonna use palloc. 
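WriteLocalBuffer and FlushLocalBuffer recover the descriptor slot from the (negative) Buffer value with bufid = -(buffer + 1). A short numeric sketch of that mapping (illustrative only):

    Buffer buffer = -4;             /* some local buffer handed to a caller      */
    int    bufid  = -(buffer + 1);  /* == 3: slot in LocalBufferDescriptors      */
    /* InitLocalBuffer sets buf_id = -bufid - 2 == -5, and
     * BufferDescriptorGetBuffer() adds 1, giving back the Buffer value -4.
     */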
+ */ + LocalBufferDescriptors = + (BufferDesc *)malloc(sizeof(BufferDesc) * NLocBuffer); + memset(LocalBufferDescriptors, 0, sizeof(BufferDesc) * NLocBuffer); + nextFreeLocalBuf = 0; + + for (i = 0; i < NLocBuffer; i++) { + BufferDesc *buf = &LocalBufferDescriptors[i]; + + /* + * negative to indicate local buffer. This is tricky: shared buffers + * start with 0. We have to start with -2. (Note that the routine + * BufferDescriptorGetBuffer adds 1 to buf_id so our first buffer id + * is -1.) + */ + buf->buf_id = - i - 2; + } + + LocalRefCount = + (long *)malloc(sizeof(long) * NLocBuffer); + memset(LocalRefCount, 0, sizeof(long) * NLocBuffer); +} + +/* + * LocalBufferSync - + * flush all dirty buffers in the local buffer cache. Since the buffer + * cache is only used for keeping relations visible during a transaction, + * we will not need these buffers again. + */ +void +LocalBufferSync() +{ + int i; + + for (i = 0; i < NLocBuffer; i++) { + BufferDesc *buf = &LocalBufferDescriptors[i]; + Relation bufrel; + + if (buf->flags & BM_DIRTY) { +#ifdef LBDEBUG + fprintf(stderr, "LB SYNC %d\n", -i-1); +#endif + bufrel = RelationIdCacheGetRelation(buf->tag.relId.relId); + + Assert(bufrel != NULL); + + smgrwrite(bufrel->rd_rel->relsmgr, bufrel, buf->tag.blockNum, + (char *) MAKE_PTR(buf->data)); + + buf->tag.relId.relId = InvalidOid; + buf->flags &= ~BM_DIRTY; + } + } + + memset(LocalRefCount, 0, sizeof(long) * NLocBuffer); +} + +void +ResetLocalBufferPool() +{ + int i; + + memset(LocalBufferDescriptors, 0, sizeof(BufferDesc) * NLocBuffer); + nextFreeLocalBuf = 0; + + for (i = 0; i < NLocBuffer; i++) { + BufferDesc *buf = &LocalBufferDescriptors[i]; + + /* just like InitLocalBuffer() */ + buf->buf_id = - i - 2; + } + + memset(LocalRefCount, 0, sizeof(long) * NLocBuffer); +} diff --git a/src/backend/storage/bufmgr.h b/src/backend/storage/bufmgr.h new file mode 100644 index 00000000000..581d3237cad --- /dev/null +++ b/src/backend/storage/bufmgr.h @@ -0,0 +1,112 @@ +/*------------------------------------------------------------------------- + * + * bufmgr.h-- + * POSTGRES buffer manager definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: bufmgr.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef BUFMGR_H +#define BUFMGR_H + +#include "c.h" + +#include "machine.h" /* for BLCKSZ */ +#include "utils/rel.h" + +#include "storage/buf_internals.h" /* UGLY! -- ay */ + +/* + * the maximum size of a disk block for any possible installation. + * + * in theory this could be anything, but in practice this is actually + * limited to 2^13 bytes because we have limited ItemIdData.lp_off and + * ItemIdData.lp_len to 13 bits (see itemid.h). + */ +#define MAXBLCKSZ 8192 + +typedef void *Block; + + +/* special pageno for bget */ +#define P_NEW InvalidBlockNumber /* grow the file to get a new page */ + +typedef bits16 BufferLock; + +/********************************************************************** + + the rest is function defns in the bufmgr that are externally callable + + **********************************************************************/ + +/* + * These routines are beaten on quite heavily, hence the macroization. + * See buf_internals.h for a related comment. 
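P_NEW (== InvalidBlockNumber) is a sentinel block number meaning "extend the relation": passing it to ReadBuffer allocates a fresh page at the end of the file (LocalBufferAlloc turns it into reln->rd_nblocks). A sketch of the typical use, illustrative only:

    Buffer buf  = ReadBuffer(reln, P_NEW);      /* pins a brand-new block        */
    Page   page = (Page) BufferGetBlock(buf);

    PageInit(page, BufferGetPageSize(buf), 0);  /* format it as a slotted page   */
    WriteBuffer(buf);                           /* mark dirty, release the pin   */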
+ */ +#define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1) + +/* + * BufferIsPinned -- + * True iff the buffer is pinned (and therefore valid) + * + * Note: + * Smenatics are identical to BufferIsValid + * XXX - need to remove either one eventually. + */ +#define BufferIsPinned BufferIsValid + + +extern int ShowPinTrace; + +/* + * prototypes for functions in bufmgr.c + */ +extern Buffer RelationGetBufferWithBuffer(Relation relation, + BlockNumber blockNumber, Buffer buffer); +extern Buffer ReadBuffer(Relation reln, BlockNumber blockNum); +extern Buffer ReadBuffer_Debug(char *file, int line, Relation reln, + BlockNumber blockNum); +extern int WriteBuffer(Buffer buffer); +extern void WriteBuffer_Debug(char *file, int line, Buffer buffer); +extern void DirtyBufferCopy(Oid dbid, Oid relid, BlockNumber blkno, + char *dest); +extern int WriteNoReleaseBuffer(Buffer buffer); +extern Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation, + BlockNumber blockNum); + +extern void InitBufferPool(IPCKey key); +extern void PrintBufferUsage(FILE *statfp); +extern void ResetBufferUsage(void); +extern void ResetBufferPool(void); +extern int BufferPoolCheckLeak(void); +extern void FlushBufferPool(int StableMainMemoryFlag); +extern bool BufferIsValid(Buffer bufnum); +extern BlockNumber BufferGetBlockNumber(Buffer buffer); +extern Relation BufferGetRelation(Buffer buffer); +extern BlockNumber RelationGetNumberOfBlocks(Relation relation); +extern Block BufferGetBlock(Buffer buffer); +extern void ReleaseTmpRelBuffers(Relation tempreldesc); +extern void DropBuffers(Oid dbid); +extern void PrintBufferDescs(void); +extern void PrintPinnedBufs(void); +extern int BufferShmemSize(void); +extern void BufferPoolBlowaway(void); +extern void IncrBufferRefCount(Buffer buffer); +extern int ReleaseBuffer(Buffer buffer); + +extern void IncrBufferRefCount_Debug(char *file, int line, Buffer buffer); +extern void ReleaseBuffer_Debug(char *file, int line, Buffer buffer); +extern int ReleaseAndReadBuffer_Debug(char *file, + int line, + Buffer buffer, + Relation relation, + BlockNumber blockNum); +extern void BufferRefCountReset(int *refcountsave); +extern void BufferRefCountRestore(int *refcountsave); + +#endif /* !defined(BufMgrIncluded) */ + diff --git a/src/backend/storage/bufpage.h b/src/backend/storage/bufpage.h new file mode 100644 index 00000000000..9fda973889d --- /dev/null +++ b/src/backend/storage/bufpage.h @@ -0,0 +1,256 @@ +/*------------------------------------------------------------------------- + * + * bufpage.h-- + * Standard POSTGRES buffer page definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: bufpage.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef BUFPAGE_H +#define BUFPAGE_H + +#include "c.h" +#include "machine.h" /* for BLCKSZ */ + +#include "storage/buf.h" +#include "storage/item.h" +#include "storage/itemid.h" +#include "storage/itemptr.h" + +/* + * a postgres disk page is an abstraction layered on top of a postgres + * disk block (which is simply a unit of i/o, see block.h). + * + * specifically, while a disk block can be unformatted, a postgres + * disk page is always a slotted page of the form: + * + * +----------------+---------------------------------+ + * | PageHeaderData | linp0 linp1 linp2 ... | + * +-----------+----+---------------------------------+ + * | ... 
linpN | | + * +-----------+--------------------------------------+ + * | ^ pd_lower | + * | | + * | v pd_upper | + * +-------------+------------------------------------+ + * | | tupleN ... | + * +-------------+------------------+-----------------+ + * | ... tuple2 tuple1 tuple0 | "special space" | + * +--------------------------------+-----------------+ + * ^ pd_special + * + * a page is full when nothing can be added between pd_lower and + * pd_upper. + * + * all blocks written out by an access method must be disk pages. + * + * EXCEPTIONS: + * + * obviously, a page is not formatted before it is initialized with by + * a call to PageInit. + * + * the contents of the special pg_variable/pg_time/pg_log tables are + * raw disk blocks with special formats. these are the only "access + * methods" that need not write disk pages. + * + * NOTES: + * + * linp0..N form an ItemId array. ItemPointers point into this array + * rather than pointing directly to a tuple. + * + * tuple0..N are added "backwards" on the page. because a tuple's + * ItemPointer points to its ItemId entry rather than its actual + * byte-offset position, tuples can be physically shuffled on a page + * whenever the need arises. + * + * AM-generic per-page information is kept in the pd_opaque field of + * the PageHeaderData. (this is currently only the page size.) + * AM-specific per-page data is kept in the area marked "special + * space"; each AM has an "opaque" structure defined somewhere that is + * stored as the page trailer. an access method should always + * initialize its pages with PageInit and then set its own opaque + * fields. + */ +typedef Pointer Page; + +/* + * PageIsValid -- + * True iff page is valid. + */ +#define PageIsValid(page) PointerIsValid(page) + + +/* + * location (byte offset) within a page. + * + * note that this is actually limited to 2^13 because we have limited + * ItemIdData.lp_off and ItemIdData.lp_len to 13 bits (see itemid.h). + */ +typedef uint16 LocationIndex; + + +/* + * space management information generic to any page + * + * od_pagesize - size in bytes. + * in reality, we need at least 64B to fit the + * page header, opaque space and a minimal tuple; + * on the high end, we can only support pages up + * to 8KB because lp_off/lp_len are 13 bits. + */ +typedef struct OpaqueData { + uint16 od_pagesize; +} OpaqueData; + +typedef OpaqueData *Opaque; + + +/* + * disk page organization + */ +typedef struct PageHeaderData { + LocationIndex pd_lower; /* offset to start of free space */ + LocationIndex pd_upper; /* offset to end of free space */ + LocationIndex pd_special; /* offset to start of special space */ + OpaqueData pd_opaque; /* AM-generic information */ + ItemIdData pd_linp[1]; /* line pointers */ +} PageHeaderData; + +typedef PageHeaderData *PageHeader; + +typedef enum { + ShufflePageManagerMode, + OverwritePageManagerMode +} PageManagerMode; + +/* ---------------- + * misc support macros + * ---------------- + */ + +/* + * XXX this is wrong -- ignores padding/alignment, variable page size, + * AM-specific opaque space at the end of the page (as in btrees), ... + * however, it at least serves as an upper bound for heap pages. + */ +#define MAXTUPLEN (BLCKSZ - sizeof (PageHeaderData)) + +/* ---------------------------------------------------------------- + * page support macros + * ---------------------------------------------------------------- + */ +/* + * PageIsValid -- This is defined in page.h. + */ + +/* + * PageIsUsed -- + * True iff the page size is used. 
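Free space on a slotted page is simply the gap between pd_lower and pd_upper: line pointers grow pd_lower upward while tuple data grows pd_upper downward. A rough sketch of the arithmetic (illustrative only; PageGetFreeSpace, declared at the end of this header, returns essentially this quantity):

    PageHeader phdr  = (PageHeader) page;
    Size       avail = (Size) (phdr->pd_upper - phdr->pd_lower);
    bool       full  = (phdr->pd_upper <= phdr->pd_lower);  /* nothing more fits */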
+ * + * Note: + * Assumes page is valid. + */ +#define PageIsUsed(page) \ + (AssertMacro(PageIsValid(page)) ? \ + ((bool) (((PageHeader) (page))->pd_lower != 0)) : false) + +/* + * PageIsEmpty -- + * returns true iff no itemid has been allocated on the page + */ +#define PageIsEmpty(page) \ + (((PageHeader) (page))->pd_lower == \ + (sizeof(PageHeaderData) - sizeof(ItemIdData)) ? true : false) + +/* + * PageGetItemId -- + * Returns an item identifier of a page. + */ +#define PageGetItemId(page, offsetNumber) \ + ((ItemId) (&((PageHeader) (page))->pd_linp[(-1) + (offsetNumber)])) + +/* ---------------- + * macros to access opaque space + * ---------------- + */ + +/* + * PageSizeIsValid -- + * True iff the page size is valid. + * + * XXX currently all page sizes are "valid" but we only actually + * use BLCKSZ. + */ +#define PageSizeIsValid(pageSize) 1 + +/* + * PageGetPageSize -- + * Returns the page size of a page. + * + * this can only be called on a formatted page (unlike + * BufferGetPageSize, which can be called on an unformatted page). + * however, it can be called on a page for which there is no buffer. + */ +#define PageGetPageSize(page) \ + ((Size) ((PageHeader) (page))->pd_opaque.od_pagesize) + +/* + * PageSetPageSize -- + * Sets the page size of a page. + */ +#define PageSetPageSize(page, size) \ + ((PageHeader) (page))->pd_opaque.od_pagesize = (size) + +/* ---------------- + * page special data macros + * ---------------- + */ +/* + * PageGetSpecialSize -- + * Returns size of special space on a page. + * + * Note: + * Assumes page is locked. + */ +#define PageGetSpecialSize(page) \ + ((uint16) (PageGetPageSize(page) - ((PageHeader)page)->pd_special)) + +/* + * PageGetSpecialPointer -- + * Returns pointer to special space on a page. + * + * Note: + * Assumes page is locked. + */ +#define PageGetSpecialPointer(page) \ + (AssertMacro(PageIsValid(page)) ? \ + (char *) ((char *) (page) + ((PageHeader) (page))->pd_special) \ + : (char *) 0) + +/* ---------------------------------------------------------------- + * extern declarations + * ---------------------------------------------------------------- + */ + +extern Size BufferGetPageSize(Buffer buffer); +extern Page BufferGetPage(Buffer buffer); +extern void PageInit(Page page, Size pageSize, Size specialSize); +extern Item PageGetItem(Page page, ItemId itemId); +extern OffsetNumber PageAddItem(Page page, Item item, Size size, + OffsetNumber offsetNumber, ItemIdFlags flags); +extern Page PageGetTempPage(Page page, Size specialSize); +extern void PageRestoreTempPage(Page tempPage, Page oldPage); +extern OffsetNumber PageGetMaxOffsetNumber(Page page); +extern void PageRepairFragmentation(Page page); +extern Size PageGetFreeSpace(Page page); +extern void PageManagerModeSet(PageManagerMode mode); +extern void PageIndexTupleDelete(Page page, OffsetNumber offset); +extern void PageIndexTupleDeleteAdjustLinePointers(PageHeader phdr, + char *location, Size size); + + +#endif /* BUFPAGE_H */ diff --git a/src/backend/storage/fd.h b/src/backend/storage/fd.h new file mode 100644 index 00000000000..da28b031bb8 --- /dev/null +++ b/src/backend/storage/fd.h @@ -0,0 +1,96 @@ +/*------------------------------------------------------------------------- + * + * fd.h-- + * Virtual file descriptor definitions. 
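For illustration, the line pointer macros above combine like this to walk every item on a formatted page (offset numbers are 1-based; OffsetNumber, ItemId and Item come from off.h, itemid.h and item.h):

    OffsetNumber off;
    OffsetNumber maxoff = PageGetMaxOffsetNumber(page);

    for (off = 1; off <= maxoff; off++) {
        ItemId lp   = PageGetItemId(page, off);  /* entry in the linp array */
        Item   item = PageGetItem(page, lp);     /* the actual tuple bytes  */
        /* ... examine item ... */
    }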
+ * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: fd.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +/* + * calls: + * + * File {Close, Read, Write, Seek, Tell, Sync} + * {File Name Open, Allocate, Free} File + * + * These are NOT JUST RENAMINGS OF THE UNIX ROUTINES. + * use them for all file activity... + * + * fd = FilePathOpenFile("foo", O_RDONLY); + * File fd; + * + * use AllocateFile if you need a file descriptor in some other context. + * it will make sure that there is a file descriptor free + * + * use FreeFile to let the virtual file descriptor package know that + * there is now a free fd (when you are done with it) + * + * AllocateFile(); + * FreeFile(); + */ +#ifndef FD_H +#define FD_H + +/* + * FileOpen uses the standard UNIX open(2) flags. + */ +#include <fcntl.h> /* for O_ on most */ +#ifndef O_RDONLY +#include <sys/file.h> /* for O_ on the rest */ +#endif /* O_RDONLY */ + +/* + * FileSeek uses the standard UNIX lseek(2) flags. + */ +#ifndef WIN32 +#include <unistd.h> /* for SEEK_ on most */ +#else +#ifndef SEEK_SET +#include <stdio.h> /* for SEEK_ on the rest */ +#endif /* SEEK_SET */ +#endif /* WIN32 */ + +#include "c.h" +#include "storage/block.h" + +typedef char *FileName; + +typedef int File; + +/* originally in libpq-fs.h */ +struct pgstat { /* just the fields we need from stat structure */ + int st_ino; + int st_mode; + unsigned int st_size; + unsigned int st_sizehigh; /* high order bits */ +/* 2^64 == 1.8 x 10^20 bytes */ + int st_uid; + int st_atime_s; /* just the seconds */ + int st_mtime_s; /* since SysV and the new BSD both have */ + int st_ctime_s; /* usec fields.. */ +}; + +/* + * prototypes for functions in fd.c + */ +extern void FileInvalidate(File file); +extern File FileNameOpenFile(FileName fileName, int fileFlags, int fileMode); +extern File PathNameOpenFile(FileName fileName, int fileFlags, int fileMode); +extern void FileClose(File file); +extern void FileUnlink(File file); +extern int FileRead(File file, char *buffer, int amount); +extern int FileWrite(File file, char *buffer, int amount); +extern long FileSeek(File file, long offset, int whence); +extern long FileTell(File file); +extern int FileTruncate(File file, int offset); +extern int FileSync(File file); +extern int FileNameUnlink(char *filename); +extern void AllocateFile(void); +extern void FreeFile(void); +extern void closeAllVfds(void); +extern void closeOneVfd(void); + +#endif /* FD_H */ diff --git a/src/backend/storage/file/Makefile.inc b/src/backend/storage/file/Makefile.inc new file mode 100644 index 00000000000..767cbecd38a --- /dev/null +++ b/src/backend/storage/file/Makefile.inc @@ -0,0 +1,14 @@ +#------------------------------------------------------------------------- +# +# Makefile.inc-- +# Makefile for storage/file +# +# Copyright (c) 1994, Regents of the University of California +# +# +# IDENTIFICATION +# $Header: /cvsroot/pgsql/src/backend/storage/file/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $ +# +#------------------------------------------------------------------------- + +SUBSRCS+= fd.c diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c new file mode 100644 index 00000000000..bb94c4c5dec --- /dev/null +++ b/src/backend/storage/file/fd.c @@ -0,0 +1,888 @@ +/*------------------------------------------------------------------------- + * + * fd.c-- + * Virtual file descriptor code. 
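A minimal round trip through the VFD interface declared in fd.h (illustrative only; the file name, flags and mode are made up):

    File vfd;
    char buf[BLCKSZ];

    vfd = FileNameOpenFile("pg_example", O_RDWR | O_CREAT, 0600);
    if (vfd < 0)
        elog(WARN, "could not open relation file");

    FileWrite(vfd, buf, BLCKSZ);     /* marks the VFD dirty            */
    FileSeek(vfd, 0L, SEEK_SET);
    FileRead(vfd, buf, BLCKSZ);
    FileSync(vfd);                   /* fsync; clears the dirty bit    */
    FileClose(vfd);                  /* also syncs if still dirty      */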
+ * + * Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * $Id: fd.c,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $ + * + * NOTES: + * + * This code manages a cache of 'virtual' file descriptors (VFDs). + * The server opens many file descriptors for a variety of reasons, + * including base tables, scratch files (e.g., sort and hash spool + * files), and random calls to C library routines like system(3); it + * is quite easy to exceed system limits on the number of open files a + * single process can have. (This is around 256 on many modern + * operating systems, but can be as low as 32 on others.) + * + * VFDs are managed as an LRU pool, with actual OS file descriptors + * being opened and closed as needed. Obviously, if a routine is + * opened using these interfaces, all subsequent operations must also + * be through these interfaces (the File type is not a real file + * descriptor). + * + * For this scheme to work, most (if not all) routines throughout the + * server should use these interfaces instead of calling the C library + * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we + * may find ourselves short of real file descriptors anyway. + * + * This file used to contain a bunch of stuff to support RAID levels 0 + * (jbod), 1 (duplex) and 5 (xor parity). That stuff is all gone + * because the parallel query processing code that called it is all + * gone. If you really need it you could get it from the original + * POSTGRES source. + *------------------------------------------------------------------------- + */ + +#include <stdio.h> +#include <sys/file.h> +#include <sys/param.h> +#include <errno.h> +#include <sys/stat.h> +#include <string.h> +#include <unistd.h> + +#include "c.h" +#include "miscadmin.h" /* for DataDir */ +#include "utils/palloc.h" + +#ifdef PORTNAME_sparc +/* + * the SunOS 4 NOFILE is a lie, because the default limit is *not* the + * maximum number of file descriptors you can have open. + * + * we have to either use this number (the default dtablesize) or + * explicitly call setrlimit(RLIMIT_NOFILE, NOFILE). + */ +#include <sys/user.h> +#undef NOFILE +#define NOFILE NOFILE_IN_U +#endif /* PORTNAME_sparc */ + +/* + * Problem: Postgres does a system(ld...) to do dynamic loading. This + * will open several extra files in addition to those used by + * Postgres. We need to do this hack to guarentee that there are file + * descriptors free for ld to use. + * + * The current solution is to limit the number of files descriptors + * that this code will allocated at one time. (it leaves + * RESERVE_FOR_LD free). + * + * (Even though most dynamic loaders now use dlopen(3) or the + * equivalent, the OS must still open several files to perform the + * dynamic loading. Keep this here.) + */ +#define RESERVE_FOR_LD 10 + +/* + * If we are using weird storage managers, we may need to keep real + * file descriptors open so that the jukebox server doesn't think we + * have gone away (and no longer care about a platter or file that + * we've been using). This might be an actual file descriptor for a + * local jukebox interface that uses paths, or a socket connection for + * a network jukebox server. Since we can't be opening and closing + * these descriptors at whim, we must make allowances for them. + */ +#ifdef HP_JUKEBOX +#define RESERVE_FOR_JB 25 +#define MAXFILES ((NOFILE - RESERVE_FOR_LD) - RESERVE_FOR_JB) +#else /* HP_JUKEBOX */ +#define MAXFILES (NOFILE - RESERVE_FOR_LD) +#endif /* HP_JUKEBOX */ + +/* Debugging.... 
*/ + +#ifdef FDDEBUG +# define DO_DB(A) A +#else +# define DO_DB(A) /* A */ +#endif + +#define VFD_CLOSED -1 + +#include "storage/fd.h" +#include "utils/elog.h" + +#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED) + +typedef struct vfd { + signed short fd; + unsigned short fdstate; + +#define FD_DIRTY (1 << 0) + + File nextFree; + File lruMoreRecently; + File lruLessRecently; + long seekPos; + char *fileName; + int fileFlags; + int fileMode; +} Vfd; + +/* + * Virtual File Descriptor array pointer and size. This grows as + * needed. + */ +static Vfd *VfdCache; +static Size SizeVfdCache = 0; + +/* + * Minimum number of file descriptors known to be free. + */ +static int FreeFd = 0; + +/* + * Number of file descriptors known to be open. + */ +static int nfile = 0; + +/* + * we use the name of the null device in various places, mostly so + * that we can open it and find out if we really have any descriptors + * available or not. + */ +#ifndef WIN32 +static char *Nulldev = "/dev/null"; +static char Sep_char = '/'; +#else +static char *Nulldev = "NUL"; +static char Sep_char = '\\'; +#endif /* WIN32 */ + +/* + * Private Routines + * + * Delete - delete a file from the Lru ring + * LruDelete - remove a file from the Lru ring and close + * Insert - put a file at the front of the Lru ring + * LruInsert - put a file at the front of the Lru ring and open + * AssertLruRoom - make sure that there is a free fd. + * + * the Last Recently Used ring is a doubly linked list that begins and + * ends on element zero. + * + * example: + * + * /--less----\ /---------\ + * v \ v \ + * #0 --more---> LeastRecentlyUsed --more-\ \ + * ^\ | | + * \\less--> MostRecentlyUsedFile <---/ | + * \more---/ \--less--/ + * + * AllocateVfd - grab a free (or new) file record (from VfdArray) + * FreeVfd - free a file record + * + */ +static void Delete(File file); +static void LruDelete(File file); +static void Insert(File file); +static int LruInsert (File file); +static void AssertLruRoom(void); +static File AllocateVfd(void); +static void FreeVfd(File file); + +static int FileAccess(File file); +static File fileNameOpenFile(FileName fileName, int fileFlags, int fileMode); +static char *filepath(char *filename); + +#if defined(FDDEBUG) +static void +_dump_lru() +{ + int mru = VfdCache[0].lruLessRecently; + Vfd *vfdP = &VfdCache[mru]; + + printf("MOST %d ", mru); + while (mru != 0) + { + mru = vfdP->lruLessRecently; + vfdP = &VfdCache[mru]; + printf("%d ", mru); + } + printf("LEAST\n"); +} +#endif /* FDDEBUG */ + +static void +Delete(File file) +{ + Vfd *fileP; + + DO_DB(printf("DEBUG: Delete %d (%s)\n", + file, VfdCache[file].fileName)); + DO_DB(_dump_lru()); + + Assert(file != 0); + + fileP = &VfdCache[file]; + + VfdCache[fileP->lruLessRecently].lruMoreRecently = + VfdCache[file].lruMoreRecently; + VfdCache[fileP->lruMoreRecently].lruLessRecently = + VfdCache[file].lruLessRecently; + + DO_DB(_dump_lru()); +} + +static void +LruDelete(File file) +{ + Vfd *fileP; + int returnValue; + + DO_DB(printf("DEBUG: LruDelete %d (%s)\n", + file, VfdCache[file].fileName)); + + Assert(file != 0); + + fileP = &VfdCache[file]; + + /* delete the vfd record from the LRU ring */ + Delete(file); + + /* save the seek position */ + fileP->seekPos = lseek(fileP->fd, 0L, SEEK_CUR); + Assert( fileP->seekPos != -1); + + /* if we have written to the file, sync it */ + if (fileP->fdstate & FD_DIRTY) { + returnValue = fsync(fileP->fd); + Assert(returnValue != -1); + fileP->fdstate &= ~FD_DIRTY; + } + + /* close the file */ + returnValue = 
close(fileP->fd); + Assert(returnValue != -1); + + --nfile; + fileP->fd = VFD_CLOSED; + + /* note that there is now one more free real file descriptor */ + FreeFd++; +} + +static void +Insert(File file) +{ + Vfd *vfdP; + + DO_DB(printf("DEBUG: Insert %d (%s)\n", + file, VfdCache[file].fileName)); + DO_DB(_dump_lru()); + + vfdP = &VfdCache[file]; + + vfdP->lruMoreRecently = 0; + vfdP->lruLessRecently = VfdCache[0].lruLessRecently; + VfdCache[0].lruLessRecently = file; + VfdCache[vfdP->lruLessRecently].lruMoreRecently = file; + + DO_DB(_dump_lru()); +} + +static int +LruInsert (File file) +{ + Vfd *vfdP; + int returnValue; + + DO_DB(printf("DEBUG: LruInsert %d (%s)\n", + file, VfdCache[file].fileName)); + + vfdP = &VfdCache[file]; + + if (FileIsNotOpen(file)) { + int tmpfd; + + /* + * Note, we check to see if there's a free file descriptor + * before attempting to open a file. One general way to do + * this is to try to open the null device which everybody + * should be able to open all the time. If this fails, we + * assume this is because there's no free file descriptors. + */ + tryAgain: + tmpfd = open(Nulldev, O_CREAT|O_RDWR, 0666); + if (tmpfd < 0) { + FreeFd = 0; + errno = 0; + AssertLruRoom(); + goto tryAgain; + } else { + close(tmpfd); + } + vfdP->fd = open(vfdP->fileName,vfdP->fileFlags,vfdP->fileMode); + + if (vfdP->fd < 0) { + DO_DB(printf("RE_OPEN FAILED: %d\n", + errno)); + return (vfdP->fd); + } else { + DO_DB(printf("RE_OPEN SUCCESS\n")); + ++nfile; + } + + /* seek to the right position */ + if (vfdP->seekPos != 0L) { + returnValue = + lseek(vfdP->fd, vfdP->seekPos, SEEK_SET); + Assert(returnValue != -1); + } + + /* init state on open */ + vfdP->fdstate = 0x0; + + /* note that a file descriptor has been used up */ + if (FreeFd > 0) + FreeFd--; + } + + /* + * put it at the head of the Lru ring + */ + + Insert(file); + + return (0); +} + +static void +AssertLruRoom() +{ + DO_DB(printf("DEBUG: AssertLruRoom (FreeFd = %d)\n", + FreeFd)); + + if (FreeFd <= 0 || nfile >= MAXFILES) { + LruDelete(VfdCache[0].lruMoreRecently); + } +} + +static File +AllocateVfd() +{ + Index i; + File file; + + DO_DB(printf("DEBUG: AllocateVfd\n")); + + if (SizeVfdCache == 0) { + + /* initialize */ + VfdCache = (Vfd *)malloc(sizeof(Vfd)); + + VfdCache->nextFree = 0; + VfdCache->lruMoreRecently = 0; + VfdCache->lruLessRecently = 0; + VfdCache->fd = VFD_CLOSED; + VfdCache->fdstate = 0x0; + + SizeVfdCache = 1; + } + + if (VfdCache[0].nextFree == 0) { + + /* + * The free list is empty so it is time to increase the + * size of the array + */ + + VfdCache =(Vfd *)realloc(VfdCache, sizeof(Vfd)*SizeVfdCache*2); + Assert(VfdCache != NULL); + + /* + * Set up the free list for the new entries + */ + + for (i = SizeVfdCache; i < 2*SizeVfdCache; i++) { + memset((char *) &(VfdCache[i]), 0, sizeof(VfdCache[0])); + VfdCache[i].nextFree = i+1; + VfdCache[i].fd = VFD_CLOSED; + } + + /* + * Element 0 is the first and last element of the free + * list + */ + + VfdCache[0].nextFree = SizeVfdCache; + VfdCache[2*SizeVfdCache-1].nextFree = 0; + + /* + * Record the new size + */ + + SizeVfdCache *= 2; + } + file = VfdCache[0].nextFree; + + VfdCache[0].nextFree = VfdCache[file].nextFree; + + return file; +} + +static void +FreeVfd(File file) +{ + DO_DB(printf("DB: FreeVfd: %d (%s)\n", + file, VfdCache[file].fileName)); + + VfdCache[file].nextFree = VfdCache[0].nextFree; + VfdCache[0].nextFree = file; +} + +static char * +filepath(char *filename) +{ + char *buf; + char basename[16]; + int len; + +#ifndef WIN32 + if (*filename != 
Sep_char) { +#else + if (!(filename[1] == ':' && filename[2] == Sep_char)) { +#endif /* WIN32 */ + + /* Either /base/ or \base\ */ + sprintf(basename, "%cbase%c", Sep_char, Sep_char); + + len = strlen(DataDir) + strlen(basename) + strlen(GetDatabaseName()) + + strlen(filename) + 2; + buf = (char*) palloc(len); + sprintf(buf, "%s%s%s%c%s", + DataDir, basename, GetDatabaseName(), Sep_char, filename); + } else { + buf = (char *) palloc(strlen(filename) + 1); + strcpy(buf, filename); + } + + return(buf); +} + +static int +FileAccess(File file) +{ + int returnValue; + + DO_DB(printf("DB: FileAccess %d (%s)\n", + file, VfdCache[file].fileName)); + + /* + * Is the file open? If not, close the least recently used, + * then open it and stick it at the head of the used ring + */ + + if (FileIsNotOpen(file)) { + + AssertLruRoom(); + + returnValue = LruInsert(file); + if (returnValue != 0) + return returnValue; + + } else { + + /* + * We now know that the file is open and that it is not the + * last one accessed, so we need to more it to the head of + * the Lru ring. + */ + + Delete(file); + Insert(file); + } + + return (0); +} + +/* + * Called when we get a shared invalidation message on some relation. + */ +void +FileInvalidate(File file) +{ + if (!FileIsNotOpen(file)) { + LruDelete(file); + } +} + +/* VARARGS2 */ +static File +fileNameOpenFile(FileName fileName, + int fileFlags, + int fileMode) +{ + static int osRanOut = 0; + File file; + Vfd *vfdP; + int tmpfd; + + DO_DB(printf("DEBUG: FileNameOpenFile: %s %x %o\n", + fileName, fileFlags, fileMode)); + + file = AllocateVfd(); + vfdP = &VfdCache[file]; + + if (nfile >= MAXFILES || (FreeFd == 0 && osRanOut)) { + AssertLruRoom(); + } + + tryAgain: + tmpfd = open(Nulldev, O_CREAT|O_RDWR, 0666); + if (tmpfd < 0) { + DO_DB(printf("DB: not enough descs, retry, er= %d\n", + errno)); + errno = 0; + FreeFd = 0; + osRanOut = 1; + AssertLruRoom(); + goto tryAgain; + } else { + close(tmpfd); + } + +#ifdef WIN32 + fileFlags |= _O_BINARY; +#endif /* WIN32 */ + vfdP->fd = open(fileName,fileFlags,fileMode); + vfdP->fdstate = 0x0; + + if (vfdP->fd < 0) { + FreeVfd(file); + return -1; + } + ++nfile; + DO_DB(printf("DB: FNOF success %d\n", + vfdP->fd)); + + (void)LruInsert(file); + + if (fileName==NULL) { + elog(WARN, "fileNameOpenFile: NULL fname"); + } + vfdP->fileName = malloc(strlen(fileName)+1); + strcpy(vfdP->fileName,fileName); + + vfdP->fileFlags = fileFlags & ~(O_TRUNC|O_EXCL); + vfdP->fileMode = fileMode; + vfdP->seekPos = 0; + + return file; +} + +/* + * open a file in the database directory ($PGDATA/base/...) 
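filepath() above expands a bare relation name into a full path under the data directory; a worked example with hypothetical values:

    /* Illustrative only: what filepath() builds for a relative name.
     *   DataDir           = "/usr/local/pgsql/data"
     *   GetDatabaseName() = "mydb"
     *   filename          = "pg_example"
     *   result            = "/usr/local/pgsql/data/base/mydb/pg_example"
     * An absolute name (one starting with Sep_char) is copied unchanged.
     */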
+ */ +File +FileNameOpenFile(FileName fileName, int fileFlags, int fileMode) +{ + File fd; + char *fname; + + fname = filepath(fileName); + fd = fileNameOpenFile(fname, fileFlags, fileMode); + pfree(fname); + return(fd); +} + +/* + * open a file in an arbitrary directory + */ +File +PathNameOpenFile(FileName fileName, int fileFlags, int fileMode) +{ + return(fileNameOpenFile(fileName, fileFlags, fileMode)); +} + +void +FileClose(File file) +{ + int returnValue; + + DO_DB(printf("DEBUG: FileClose: %d (%s)\n", + file, VfdCache[file].fileName)); + + if (!FileIsNotOpen(file)) { + + /* remove the file from the lru ring */ + Delete(file); + + /* record the new free operating system file descriptor */ + FreeFd++; + + /* if we did any writes, sync the file before closing */ + if (VfdCache[file].fdstate & FD_DIRTY) { + returnValue = fsync(VfdCache[file].fd); + Assert(returnValue != -1); + VfdCache[file].fdstate &= ~FD_DIRTY; + } + + /* close the file */ + returnValue = close(VfdCache[file].fd); + Assert(returnValue != -1); + + --nfile; + VfdCache[file].fd = VFD_CLOSED; + } + /* + * Add the Vfd slot to the free list + */ + FreeVfd(file); + /* + * Free the filename string + */ + free(VfdCache[file].fileName); +} + +void +FileUnlink(File file) +{ + int returnValue; + + DO_DB(printf("DB: FileClose: %d (%s)\n", + file, VfdCache[file].fileName)); + + if (!FileIsNotOpen(file)) { + + /* remove the file from the lru ring */ + Delete(file); + + /* record the new free operating system file descriptor */ + FreeFd++; + + /* if we did any writes, sync the file before closing */ + if (VfdCache[file].fdstate & FD_DIRTY) { + returnValue = fsync(VfdCache[file].fd); + Assert(returnValue != -1); + VfdCache[file].fdstate &= ~FD_DIRTY; + } + + /* close the file */ + returnValue = close(VfdCache[file].fd); + Assert(returnValue != -1); + + --nfile; + VfdCache[file].fd = VFD_CLOSED; + } + /* add the Vfd slot to the free list */ + FreeVfd(file); + + /* free the filename string */ + unlink(VfdCache[file].fileName); + free(VfdCache[file].fileName); +} + +int +FileRead(File file, char *buffer, int amount) +{ + int returnCode; + + DO_DB(printf("DEBUG: FileRead: %d (%s) %d 0x%x\n", + file, VfdCache[file].fileName, amount, buffer)); + + FileAccess(file); + returnCode = read(VfdCache[file].fd, buffer, amount); + if (returnCode > 0) { + VfdCache[file].seekPos += returnCode; + } + + return returnCode; +} + +int +FileWrite(File file, char *buffer, int amount) +{ + int returnCode; + + DO_DB(printf("DB: FileWrite: %d (%s) %d 0x%lx\n", + file, VfdCache[file].fileName, amount, buffer)); + + FileAccess(file); + returnCode = write(VfdCache[file].fd, buffer, amount); + if (returnCode > 0) { /* changed by Boris with Mao's advice */ + VfdCache[file].seekPos += returnCode; + } + + /* record the write */ + VfdCache[file].fdstate |= FD_DIRTY; + + return returnCode; +} + +long +FileSeek(File file, long offset, int whence) +{ + int returnCode; + + DO_DB(printf("DEBUG: FileSeek: %d (%s) %d %d\n", + file, VfdCache[file].fileName, offset, whence)); + + if (FileIsNotOpen(file)) { + switch(whence) { + case SEEK_SET: + VfdCache[file].seekPos = offset; + return offset; + case SEEK_CUR: + VfdCache[file].seekPos = VfdCache[file].seekPos +offset; + return VfdCache[file].seekPos; + case SEEK_END: + FileAccess(file); + returnCode = VfdCache[file].seekPos = + lseek(VfdCache[file].fd, offset, whence); + return returnCode; + default: + elog(WARN, "FileSeek: invalid whence: %d", whence); + break; + } + } else { + returnCode = VfdCache[file].seekPos = + 
lseek(VfdCache[file].fd, offset, whence); + return returnCode; + } + /*NOTREACHED*/ + return(-1L); +} + +/* + * XXX not actually used but here for completeness + */ +long +FileTell(File file) +{ + DO_DB(printf("DEBUG: FileTell %d (%s)\n", + file, VfdCache[file].fileName)); + return VfdCache[file].seekPos; +} + +int +FileTruncate(File file, int offset) +{ + int returnCode; + + DO_DB(printf("DEBUG: FileTruncate %d (%s)\n", + file, VfdCache[file].fileName)); + + (void) FileSync(file); + (void) FileAccess(file); + returnCode = ftruncate(VfdCache[file].fd, offset); + return(returnCode); +} + +int +FileSync(File file) +{ + int returnCode; + + /* + * If the file isn't open, then we don't need to sync it; we + * always sync files when we close them. Also, if we haven't + * done any writes that we haven't already synced, we can ignore + * the request. + */ + + if (VfdCache[file].fd < 0 || !(VfdCache[file].fdstate & FD_DIRTY)) { + returnCode = 0; + } else { + returnCode = fsync(VfdCache[file].fd); + VfdCache[file].fdstate &= ~FD_DIRTY; + } + + return returnCode; +} + +int +FileNameUnlink(char *filename) +{ + int retval; + char *fname; + + fname = filepath(filename); + retval = unlink(fname); + pfree(fname); + return(retval); +} + +/* + * if we want to be sure that we have a real file descriptor available + * (e.g., we want to know this in psort) we call AllocateFile to force + * availability. when we are done we call FreeFile to deallocate the + * descriptor. + * + * allocatedFiles keeps track of how many have been allocated so we + * can give a warning if there are too few left. + */ +static int allocatedFiles = 0; + +void +AllocateFile() +{ + int fd; + int fdleft; + + while ((fd = open(Nulldev,O_WRONLY,0)) < 0) { + if (errno == EMFILE) { + errno = 0; + FreeFd = 0; + AssertLruRoom(); + } else { + elog(WARN,"Open: %s in %s line %d\n", Nulldev, + __FILE__, __LINE__); + } + } + close(fd); + ++allocatedFiles; + fdleft = MAXFILES - allocatedFiles; + if (fdleft < 6) { + elog(DEBUG,"warning: few usable file descriptors left (%d)", fdleft); + } + + DO_DB(printf("DEBUG: AllocatedFile. FreeFd = %d\n", + FreeFd)); +} + +/* + * XXX What happens if FreeFile() is called without a previous + * AllocateFile()? + */ +void +FreeFile() +{ + DO_DB(printf("DEBUG: FreeFile. FreeFd now %d\n", + FreeFd)); + FreeFd++; + nfile++; /* dangerous */ + Assert(allocatedFiles > 0); + --allocatedFiles; +} + +void +closeAllVfds() +{ + int i; + for (i=0; i<SizeVfdCache; i++) { + if (!FileIsNotOpen(i)) + LruDelete(i); + } +} + +void +closeOneVfd() +{ + int tmpfd; + + tmpfd = open(Nulldev, O_CREAT | O_RDWR, 0666); + if (tmpfd < 0) { + FreeFd = 0; + AssertLruRoom(); + FreeFd = 0; + } + else + close(tmpfd); +} diff --git a/src/backend/storage/ipc.h b/src/backend/storage/ipc.h new file mode 100644 index 00000000000..0da041bc9c8 --- /dev/null +++ b/src/backend/storage/ipc.h @@ -0,0 +1,285 @@ +/*------------------------------------------------------------------------- + * + * ipc.h-- + * POSTGRES inter-process communication definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: ipc.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $ + * + * NOTES + * This file is very architecture-specific. This stuff should actually + * be factored into the port/ directories. 
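The routines above complete fd.c's virtual file descriptor (VFD) layer: a backend can keep many logical files "open" while the real operating system descriptors are recycled through the LRU ring. What follows is only a usage sketch, not part of the patch; it assumes it runs inside a backend with the fd.h and elog interfaces available, and the relation name and block size are made up.

    #include <fcntl.h>
    #include <unistd.h>
    #include "storage/fd.h"
    #include "utils/elog.h"

    #define EXAMPLE_BLCKSZ 8192             /* hypothetical block size */

    void
    vfd_usage_example(void)
    {
        File vfd;
        char block[EXAMPLE_BLCKSZ];

        /* name is resolved by filepath(), i.e. $PGDATA/base/<dbname>/myrel */
        vfd = FileNameOpenFile("myrel", O_RDWR | O_CREAT, 0600);
        if (vfd < 0)
            elog(WARN, "vfd_usage_example: cannot open relation file");

        /* every call may transparently reopen the real descriptor (FileAccess) */
        (void) FileRead(vfd, block, EXAMPLE_BLCKSZ);
        (void) FileSeek(vfd, 0L, SEEK_SET);             /* seekPos survives a virtual close */
        (void) FileWrite(vfd, block, EXAMPLE_BLCKSZ);   /* marks the Vfd FD_DIRTY */
        (void) FileSync(vfd);                           /* fsync only if FD_DIRTY */
        FileClose(vfd);
    }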
+ * + *------------------------------------------------------------------------- + */ +#ifndef IPC_H +#define IPC_H + +#include <sys/types.h> +#ifndef _IPC_ +#define _IPC_ +#include <sys/ipc.h> +#endif + +#include "c.h" + +/* + * Many architectures have support for user-level spinlocks (i.e., an + * atomic test-and-set instruction). However, we have only written + * spinlock code for the architectures listed. + */ +#if defined(PORTNAME_aix) || \ + defined(PORTNAME_alpha) || \ + defined(PORTNAME_hpux) || \ + defined(PORTNAME_irix5) || \ + defined(PORTNAME_next) || \ + defined(PORTNAME_sparc) || \ + defined(PORTNAME_sparc_solaris) || \ + (defined(__i386__) && defined(__GNUC__)) +#define HAS_TEST_AND_SET +#endif + +#if defined(HAS_TEST_AND_SET) + +#if defined(PORTNAME_next) +/* + * Use Mach mutex routines since these are, in effect, test-and-set + * spinlocks. + */ +#undef NEVER /* definition in cthreads.h conflicts with parse.h */ +#include <mach/cthreads.h> + +typedef struct mutex slock_t; +#else /* next */ +#if defined(PORTNAME_aix) +/* + * The AIX C library has the cs(3) builtin for compare-and-set that + * operates on ints. + */ +typedef unsigned int slock_t; +#else /* aix */ +#if defined(PORTNAME_alpha) +#include <sys/mman.h> +typedef msemaphore slock_t; +#else /* alpha */ +#if defined(PORTNAME_hpux) +/* + * The PA-RISC "semaphore" for the LDWCX instruction is 4 bytes aligned + * to a 16-byte boundary. + */ +typedef struct { int sem[4]; } slock_t; +#else /* hpux */ +#if defined(PORTNAME_irix5) +#include <abi_mutex.h> +typedef abilock_t slock_t; +#else /* irix5 */ +/* + * On all other architectures spinlocks are a single byte. + */ +typedef unsigned char slock_t; +#endif /* irix5 */ +#endif /* hpux */ +#endif /* alpha */ +#endif /* aix */ +#endif /* next */ + +extern void S_LOCK(slock_t *lock); +extern void S_UNLOCK(slock_t *lock); +extern void S_INIT_LOCK(slock_t *lock); + +#if defined(PORTNAME_hpux) || defined(PORTNAME_alpha) || defined(PORTNAME_irix5) || defined(PORTNAME_next) +extern int S_LOCK_FREE(slock_t *lock); +#else /* PORTNAME_hpux */ +#define S_LOCK_FREE(lock) ((*lock) == 0) +#endif /* PORTNAME_hpux */ + +#endif /* HAS_TEST_AND_SET */ + +/* + * On architectures for which we have not implemented spinlocks (or + * cannot do so), we use System V semaphores. We also use them for + * long locks. For some reason union semun is never defined in the + * System V header files so we must do it ourselves. 
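The note above about union semun comes up on nearly every System V platform, so here is a standalone illustration (not part of this patch) of the declaration in use: semctl() needs a union semun argument for SETVAL even though most vendor headers leave the union for the application to declare (a few systems do define it, which is why ipc.h guards its own definition with the portname list that follows).

    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/ipc.h>
    #include <sys/sem.h>

    union semun {                       /* declared by hand, as in ipc.h */
        int              val;
        struct semid_ds *buf;
        unsigned short  *array;
    };

    int
    main()
    {
        union semun arg;
        int semId = semget(IPC_PRIVATE, 1, IPC_CREAT | 0600);

        if (semId < 0) {
            perror("semget");
            return 1;
        }
        arg.val = 255;                          /* cf. IpcSemaphoreDefaultStartValue */
        semctl(semId, 0, SETVAL, arg);          /* set the initial value */
        printf("value = %d\n", semctl(semId, 0, GETVAL, arg));
        semctl(semId, 0, IPC_RMID, arg);        /* remove the set */
        return 0;
    }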
+ */ +#if defined(sequent) || \ + defined(PORTNAME_aix) || \ + defined(PORTNAME_alpha) || \ + defined(PORTNAME_hpux) || \ + defined(PORTNAME_sparc_solaris) || \ + defined(WIN32) || \ + defined(PORTNAME_ultrix4) +union semun { + int val; + struct semid_ds *buf; + unsigned short *array; +}; +#endif + +typedef uint16 SystemPortAddress; + +/* semaphore definitions */ + +#define IPCProtection (0600) /* access/modify by user only */ + +#define IPC_NMAXSEM 25 /* maximum number of semaphores */ +#define IpcSemaphoreDefaultStartValue 255 +#define IpcSharedLock (-1) +#define IpcExclusiveLock (-255) + +#define IpcUnknownStatus (-1) +#define IpcInvalidArgument (-2) +#define IpcSemIdExist (-3) +#define IpcSemIdNotExist (-4) + +typedef uint32 IpcSemaphoreKey; /* semaphore key */ +typedef int IpcSemaphoreId; + +/* shared memory definitions */ + +#define IpcMemCreationFailed (-1) +#define IpcMemIdGetFailed (-2) +#define IpcMemAttachFailed 0 + +typedef uint32 IPCKey; +#define PrivateIPCKey IPC_PRIVATE +#define DefaultIPCKey 17317 + +typedef uint32 IpcMemoryKey; /* shared memory key */ +typedef int IpcMemoryId; + + +/* ipc.c */ +extern void exitpg(int code); +extern void quasi_exitpg(void); +extern on_exitpg(void (*function)(), caddr_t arg); + +extern IpcSemaphoreId IpcSemaphoreCreate(IpcSemaphoreKey semKey, + int semNum, int permission, int semStartValue, + int removeOnExit, int *status); +extern void IpcSemaphoreSet(int semId, int semno, int value); +extern void IpcSemaphoreKill(IpcSemaphoreKey key); +extern void IpcSemaphoreLock(IpcSemaphoreId semId, int sem, int lock); +extern void IpcSemaphoreUnlock(IpcSemaphoreId semId, int sem, int lock); +extern int IpcSemaphoreGetCount(IpcSemaphoreId semId, int sem); +extern int IpcSemaphoreGetValue(IpcSemaphoreId semId, int sem); +extern IpcMemoryId IpcMemoryCreate(IpcMemoryKey memKey, uint32 size, + int permission); +extern IpcMemoryId IpcMemoryIdGet(IpcMemoryKey memKey, uint32 size); +extern void IpcMemoryDetach(int status, char *shmaddr); +extern char *IpcMemoryAttach(IpcMemoryId memId); +extern void IpcMemoryKill(IpcMemoryKey memKey); +extern void CreateAndInitSLockMemory(IPCKey key); +extern void AttachSLockMemory(IPCKey key); + + +#ifdef HAS_TEST_AND_SET + +#define NSLOCKS 2048 +#define NOLOCK 0 +#define SHAREDLOCK 1 +#define EXCLUSIVELOCK 2 + +typedef enum _LockId_ { + BUFMGRLOCKID, + LOCKLOCKID, + OIDGENLOCKID, + SHMEMLOCKID, + BINDINGLOCKID, + LOCKMGRLOCKID, + SINVALLOCKID, + +#ifdef MAIN_MEMORY + MMCACHELOCKID, +#endif /* MAIN_MEMORY */ + + PROCSTRUCTLOCKID, + FIRSTFREELOCKID +} _LockId_; + +#define MAX_SPINS FIRSTFREELOCKID + +typedef struct slock { + slock_t locklock; + unsigned char flag; + short nshlocks; + slock_t shlock; + slock_t exlock; + slock_t comlock; + struct slock *next; +} SLock; + +extern void ExclusiveLock(int lockid); +extern void ExclusiveUnlock(int lockid); +extern bool LockIsFree(int lockid); +#else /* HAS_TEST_AND_SET */ + +typedef enum _LockId_ { + SHMEMLOCKID, + BINDINGLOCKID, + BUFMGRLOCKID, + LOCKMGRLOCKID, + SINVALLOCKID, + +#ifdef MAIN_MEMORY + MMCACHELOCKID, +#endif /* MAIN_MEMORY */ + + PROCSTRUCTLOCKID, + OIDGENLOCKID, + FIRSTFREELOCKID +} _LockId_; + +#define MAX_SPINS FIRSTFREELOCKID + +#endif /* HAS_TEST_AND_SET */ + +/* + * the following are originally in ipci.h but the prototypes have circular + * dependencies and most files include both ipci.h and ipc.h anyway, hence + * combined. + * + */ + +/* + * Note: + * These must not hash to DefaultIPCKey or PrivateIPCKey. 
+ */ +#define SystemPortAddressGetIPCKey(address) \ + (28597 * (address) + 17491) + +/* + * these keys are originally numbered from 1 to 12 consecutively but not + * all are used. The unused ones are removed. - ay 4/95. + */ +#define IPCKeyGetBufferMemoryKey(key) \ + ((key == PrivateIPCKey) ? key : 1 + (key)) + +#define IPCKeyGetSIBufferMemoryBlock(key) \ + ((key == PrivateIPCKey) ? key : 7 + (key)) + +#define IPCKeyGetSLockSharedMemoryKey(key) \ + ((key == PrivateIPCKey) ? key : 10 + (key)) + +#define IPCKeyGetSpinLockSemaphoreKey(key) \ + ((key == PrivateIPCKey) ? key : 11 + (key)) +#define IPCKeyGetWaitIOSemaphoreKey(key) \ + ((key == PrivateIPCKey) ? key : 12 + (key)) + +/* -------------------------- + * NOTE: This macro must always give the highest numbered key as every backend + * process forked off by the postmaster will be trying to acquire a semaphore + * with a unique key value starting at key+14 and incrementing up. Each + * backend uses the current key value then increments it by one. + * -------------------------- + */ +#define IPCGetProcessSemaphoreInitKey(key) \ + ((key == PrivateIPCKey) ? key : 14 + (key)) + +/* ipci.c */ +extern IPCKey SystemPortAddressCreateIPCKey(SystemPortAddress address); +extern void CreateSharedMemoryAndSemaphores(IPCKey key); +extern void AttachSharedMemoryAndSemaphores(IPCKey key); + +#endif /* IPC_H */ diff --git a/src/backend/storage/ipc/Makefile.inc b/src/backend/storage/ipc/Makefile.inc new file mode 100644 index 00000000000..b426dba0ff0 --- /dev/null +++ b/src/backend/storage/ipc/Makefile.inc @@ -0,0 +1,15 @@ +#------------------------------------------------------------------------- +# +# Makefile.inc-- +# Makefile for storage/ipc +# +# Copyright (c) 1994, Regents of the University of California +# +# +# IDENTIFICATION +# $Header: /cvsroot/pgsql/src/backend/storage/ipc/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $ +# +#------------------------------------------------------------------------- + +SUBSRCS+= ipc.c ipci.c s_lock.c shmem.c shmqueue.c sinval.c \ + sinvaladt.c spin.c diff --git a/src/backend/storage/ipc/README b/src/backend/storage/ipc/README new file mode 100644 index 00000000000..02d66045f82 --- /dev/null +++ b/src/backend/storage/ipc/README @@ -0,0 +1,31 @@ +$Header: /cvsroot/pgsql/src/backend/storage/ipc/README,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $ +Mon Jul 18 11:09:22 PDT 1988 W.KLAS + +Cache invalidation synchronization routines: +=========================================== + +The cache synchronization is done using a message queue. Every +backend can register a message which then has to be read by +all backends. A message read by all backends is removed from the +queue automatically. If a message has been lost because the buffer +was full, all backends that haven't read this message will be +notified that they have to reset their cache state. This is done +at the time when they try to read the message queue. + +The message queue is implemented as a shared buffer segment. Actually, +the queue is circular, to allow fast insertion, reading (of invalidation +data) and maintenance of the buffer. + +Access to this shared message buffer is synchronized by the lock manager. +The lock manager treats the buffer as a regular relation and sets +relation level locks (with mode = LockWait) to block backends while +another backend is writing or reading the buffer. The identifiers used +for this special 'relation' are database id = 0 and relation id = 0.
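To make the overflow rule described above concrete, here is a deliberately simplified toy model (not the actual sinvaladt.c structures) of such a queue: a fixed-size ring of messages plus one read pointer per backend, where a reader that has fallen more than a full ring behind has lost messages and must reset its cache state instead of reading.

    #define RINGSIZE  64
    #define NBACKENDS  8

    typedef struct {
        int  msgs[RINGSIZE];
        long head;                      /* total number of messages ever registered */
        long readptr[NBACKENDS];        /* per backend: next message to read */
    } ToyInvalRing;

    static void
    toy_register(ToyInvalRing *q, int msg)
    {
        q->msgs[q->head % RINGSIZE] = msg;      /* may overwrite an unread slot */
        q->head++;
    }

    /* returns 1 if a message was read, 0 if caught up,
     * -1 if messages were lost and the backend must reset its cache */
    static int
    toy_read(ToyInvalRing *q, int backend, int *msg)
    {
        if (q->readptr[backend] == q->head)
            return 0;
        if (q->head - q->readptr[backend] > RINGSIZE) {
            q->readptr[backend] = q->head;      /* skip ahead, caller resets its cache */
            return -1;
        }
        *msg = q->msgs[q->readptr[backend] % RINGSIZE];
        q->readptr[backend]++;
        return 1;
    }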
+ +The current implementation prints regular (e)log information +when a message has been removed from the buffer because the buffer +is full, and a backend has to reset its cache state. The elog level +is NOTICE. This can be used to improve the behavior of backends +when invalidating or resetting their cache state. + + diff --git a/src/backend/storage/ipc/ipc.c b/src/backend/storage/ipc/ipc.c new file mode 100644 index 00000000000..306300b90c3 --- /dev/null +++ b/src/backend/storage/ipc/ipc.c @@ -0,0 +1,718 @@ +/*------------------------------------------------------------------------- + * + * ipc.c-- + * POSTGRES inter-process communication definitions. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/ipc/ipc.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $ + * + * NOTES + * + * Currently, semaphores are used (my understanding anyway) in two + * different ways: + * 1. as mutexes on machines that don't have test-and-set (eg. + * mips R3000). + * 2. for putting processes to sleep when waiting on a lock + * and waking them up when the lock is free. + * The number of semaphores in (1) is fixed and those are shared + * among all backends. In (2), there is 1 semaphore per process and those + * are not shared with anyone else. + * -ay 4/95 + * + *------------------------------------------------------------------------- + */ +#include <sys/types.h> +#include <sys/file.h> +#include <stdio.h> +#include <errno.h> + +/* XXX - the following dependency should be moved into the defaults.mk file */ +#ifndef _IPC_ +#define _IPC_ +#include <sys/ipc.h> +#include <sys/sem.h> +#include <sys/shm.h> +#endif + +#include "storage/ipc.h" +#include "utils/memutils.h" +#include "utils/elog.h" + +#if defined(PORTNAME_bsd44) +int UsePrivateMemory = 1; +#else +int UsePrivateMemory = 0; +#endif + +#if defined(PORTNAME_bsdi) +/* hacka, hacka, hacka (XXX) */ +union semun { + int val; /* value for SETVAL */ + struct semid_ds *buf; /* buffer for IPC_STAT & IPC_SET */ + ushort *array; /* array for GETALL & SETALL */ +}; +#endif + + +/* ---------------------------------------------------------------- + * exit() handling stuff + * ---------------------------------------------------------------- + */ + +#define MAX_ON_EXITS 20 + +static struct ONEXIT { + void (*function)(); + caddr_t arg; +} onexit_list[ MAX_ON_EXITS ]; + +static int onexit_index; + +typedef struct _PrivateMemStruct { + int id; + char *memptr; +} PrivateMem; + +PrivateMem IpcPrivateMem[16]; + +static int +PrivateMemoryCreate(IpcMemoryKey memKey, + uint32 size) +{ + static int memid = 0; + + UsePrivateMemory = 1; + + IpcPrivateMem[memid].id = memid; + IpcPrivateMem[memid].memptr = malloc(size); + if (IpcPrivateMem[memid].memptr == NULL) + elog(WARN, "PrivateMemoryCreate: not enough memory to malloc"); + memset(IpcPrivateMem[memid].memptr, 0, size); /* XXX PURIFY */ + + return (memid++); +} + +static char * +PrivateMemoryAttach(IpcMemoryId memid) +{ + return ( IpcPrivateMem[memid].memptr ); +} + + +/* ---------------------------------------------------------------- + * exitpg + * + * this function calls all the callbacks registered + * for it (to free resources) and then calls exit. + * This should be the only function to call exit().
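The exit-handling machinery just above is a small fixed table of (function, argument) pairs that exitpg() runs last-registered-first before finally calling exit(). A standalone sketch of the same pattern (every name here is made up; the real registration entry point is on_exitpg further down):

    #include <stdio.h>
    #include <stdlib.h>

    #define MAX_CALLBACKS 20

    static struct {
        void (*function)(int code, void *arg);
        void *arg;
    } callback_list[MAX_CALLBACKS];
    static int callback_index = 0;

    static int
    register_exit_callback(void (*function)(int, void *), void *arg)
    {
        if (callback_index >= MAX_CALLBACKS)
            return -1;                          /* table is full, caller must cope */
        callback_list[callback_index].function = function;
        callback_list[callback_index].arg = arg;
        callback_index++;
        return 0;
    }

    static void
    do_exit(int code)
    {
        int i;

        for (i = callback_index - 1; i >= 0; i--)       /* newest first */
            (*callback_list[i].function)(code, callback_list[i].arg);
        exit(code);
    }

    static void
    say_goodbye(int code, void *arg)
    {
        printf("exiting with %d: %s\n", code, (char *) arg);
    }

    int
    main()
    {
        register_exit_callback(say_goodbye, "releasing resources");
        do_exit(0);
        /* not reached */
    }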
+ * -cim 2/6/90 + * ---------------------------------------------------------------- + */ +static int exitpg_inprogress = 0; + +void +exitpg(int code) +{ + int i; + + /* ---------------- + * if exitpg_inprocess is true, then it means that we + * are being invoked from within an on_exit() handler + * and so we return immediately to avoid recursion. + * ---------------- + */ + if (exitpg_inprogress) + return; + + exitpg_inprogress = 1; + + /* ---------------- + * call all the callbacks registered before calling exit(). + * ---------------- + */ + for (i = onexit_index - 1; i >= 0; --i) + (*onexit_list[i].function)(code, onexit_list[i].arg); + + exit(code); +} + +/* ------------------ + * Run all of the on_exitpg routines but don't exit in the end. + * This is used by the postmaster to re-initialize shared memory and + * semaphores after a backend dies horribly + * ------------------ + */ +void +quasi_exitpg() +{ + int i; + + /* ---------------- + * if exitpg_inprocess is true, then it means that we + * are being invoked from within an on_exit() handler + * and so we return immediately to avoid recursion. + * ---------------- + */ + if (exitpg_inprogress) + return; + + exitpg_inprogress = 1; + + /* ---------------- + * call all the callbacks registered before calling exit(). + * ---------------- + */ + for (i = onexit_index - 1; i >= 0; --i) + (*onexit_list[i].function)(0, onexit_list[i].arg); + + onexit_index = 0; + exitpg_inprogress = 0; +} + +/* ---------------------------------------------------------------- + * on_exitpg + * + * this function adds a callback function to the list of + * functions invoked by exitpg(). -cim 2/6/90 + * ---------------------------------------------------------------- + */ +int +on_exitpg(void (*function)(), caddr_t arg) +{ + if (onexit_index >= MAX_ON_EXITS) + return(-1); + + onexit_list[ onexit_index ].function = function; + onexit_list[ onexit_index ].arg = arg; + + ++onexit_index; + + return(0); +} + +/****************************************************************************/ +/* IPCPrivateSemaphoreKill(status, semId) */ +/* */ +/****************************************************************************/ +static void +IPCPrivateSemaphoreKill(int status, + int semId) /* caddr_t */ +{ + union semun semun; + semctl(semId, 0, IPC_RMID, semun); +} + + +/****************************************************************************/ +/* IPCPrivateMemoryKill(status, shmId) */ +/* */ +/****************************************************************************/ +static void +IPCPrivateMemoryKill(int status, + int shmId) /* caddr_t */ +{ + if ( UsePrivateMemory ) { + /* free ( IpcPrivateMem[shmId].memptr ); */ + } else { + if (shmctl(shmId, IPC_RMID, (struct shmid_ds *) NULL) < 0) { + elog(NOTICE, "IPCPrivateMemoryKill: shmctl(%d, %d, 0) failed: %m", + shmId, IPC_RMID); + } + } +} + + +/****************************************************************************/ +/* IpcSemaphoreCreate(semKey, semNum, permission, semStartValue) */ +/* */ +/* - returns a semaphore identifier: */ +/* */ +/* if key doesn't exist: return a new id, status:= IpcSemIdNotExist */ +/* if key exists: return the old id, status:= IpcSemIdExist */ +/* if semNum > MAX : return # of argument, status:=IpcInvalidArgument */ +/* */ +/****************************************************************************/ + +/* + * Note: + * XXX This should be split into two different calls. One should + * XXX be used to create a semaphore set. The other to "attach" a + * XXX existing set. 
It should be an error for the semaphore set + * XXX to to already exist or for it not to, respectively. + * + * Currently, the semaphore sets are "attached" and an error + * is detected only when a later shared memory attach fails. + */ + +IpcSemaphoreId +IpcSemaphoreCreate(IpcSemaphoreKey semKey, + int semNum, + int permission, + int semStartValue, + int removeOnExit, + int *status) +{ + int i; + int errStatus; + int semId; + u_short array[IPC_NMAXSEM]; + union semun semun; + + /* get a semaphore if non-existent */ + /* check arguments */ + if (semNum > IPC_NMAXSEM || semNum <= 0) { + *status = IpcInvalidArgument; + return(2); /* returns the number of the invalid argument */ + } + + semId = semget(semKey, 0, 0); + + if (semId == -1) { + *status = IpcSemIdNotExist; /* there doesn't exist a semaphore */ +#ifdef DEBUG_IPC + fprintf(stderr,"calling semget with %d, %d , %d\n", + semKey, + semNum, + IPC_CREAT|permission ); +#endif + semId = semget(semKey, semNum, IPC_CREAT|permission); + + if (semId < 0) { + perror("semget"); + exitpg(3); + } + for (i = 0; i < semNum; i++) { + array[i] = semStartValue; + } + semun.array = array; + errStatus = semctl(semId, 0, SETALL, semun); + if (errStatus == -1) { + perror("semctl"); + } + + if (removeOnExit) + on_exitpg(IPCPrivateSemaphoreKill, (caddr_t)semId); + + } else { + /* there is a semaphore id for this key */ + *status = IpcSemIdExist; + } + +#ifdef DEBUG_IPC + fprintf(stderr,"\nIpcSemaphoreCreate, status %d, returns %d\n", + *status, + semId ); + fflush(stdout); + fflush(stderr); +#endif + return(semId); +} + + +/****************************************************************************/ +/* IpcSemaphoreSet() - sets the initial value of the semaphore */ +/* */ +/* note: the xxx_return variables are only used for debugging. */ +/****************************************************************************/ +static int IpcSemaphoreSet_return; + +void +IpcSemaphoreSet(int semId, int semno, int value) +{ + int errStatus; + union semun semun; + + semun.val = value; + errStatus = semctl(semId, semno, SETVAL, semun); + IpcSemaphoreSet_return = errStatus; + + if (errStatus == -1) + perror("semctl"); +} + +/****************************************************************************/ +/* IpcSemaphoreKill(key) - removes a semaphore */ +/* */ +/****************************************************************************/ +void +IpcSemaphoreKill(IpcSemaphoreKey key) +{ + int semId; + union semun semun; + + /* kill semaphore if existent */ + + semId = semget(key, 0, 0); + if (semId != -1) + semctl(semId, 0, IPC_RMID, semun); +} + +/****************************************************************************/ +/* IpcSemaphoreLock(semId, sem, lock) - locks a semaphore */ +/* */ +/* note: the xxx_return variables are only used for debugging. */ +/****************************************************************************/ +static int IpcSemaphoreLock_return; + +void +IpcSemaphoreLock(IpcSemaphoreId semId, int sem, int lock) +{ + extern int errno; + int errStatus; + struct sembuf sops; + + sops.sem_op = lock; + sops.sem_flg = 0; + sops.sem_num = sem; + + /* ---------------- + * Note: if errStatus is -1 and errno == EINTR then it means we + * returned from the operation prematurely because we were + * sent a signal. So we try and lock the semaphore again. 
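The retry loop described here, combined with the constants in ipc.h (a start value of 255, IpcSharedLock = -1, IpcExclusiveLock = -255), is how one counting semaphore provides both shared and exclusive locks: each sharer subtracts 1, an exclusive locker subtracts 255 and therefore blocks until every sharer has released. A standalone sketch of that arithmetic with the same EINTR handling (illustrative only, error checking trimmed):

    #include <errno.h>
    #include <sys/types.h>
    #include <sys/ipc.h>
    #include <sys/sem.h>

    #define SHARED_LOCK     (-1)        /* cf. IpcSharedLock */
    #define EXCLUSIVE_LOCK  (-255)      /* cf. IpcExclusiveLock, start value 255 */

    /* adjust semaphore 0 by 'amount', retrying if a signal interrupts semop() */
    static void
    sem_adjust(int semId, int amount)
    {
        struct sembuf op;
        int rc;

        op.sem_num = 0;
        op.sem_op = amount;
        op.sem_flg = 0;
        do {
            rc = semop(semId, &op, 1);
        } while (rc == -1 && errno == EINTR);
    }

    /* sem_adjust(semId, SHARED_LOCK) takes a shared lock (up to 255 sharers);
     * sem_adjust(semId, EXCLUSIVE_LOCK) blocks until the value is back at 255,
     * i.e. until no holder remains; unlocking passes the negated amount,
     * exactly as IpcSemaphoreUnlock does with sops.sem_op = -lock. */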
+ * I am not certain this is correct, but the semantics aren't + * clear it fixes problems with parallel abort synchronization, + * namely that after processing an abort signal, the semaphore + * call returns with -1 (and errno == EINTR) before it should. + * -cim 3/28/90 + * ---------------- + */ + do { + errStatus = semop(semId, &sops, 1); + } while (errStatus == -1 && errno == EINTR); + + IpcSemaphoreLock_return = errStatus; + + if (errStatus == -1) { + perror("semop"); + exitpg(255); + } +} + +/****************************************************************************/ +/* IpcSemaphoreUnlock(semId, sem, lock) - unlocks a semaphore */ +/* */ +/* note: the xxx_return variables are only used for debugging. */ +/****************************************************************************/ +static int IpcSemaphoreUnlock_return; + +void +IpcSemaphoreUnlock(IpcSemaphoreId semId, int sem, int lock) +{ + extern int errno; + int errStatus; + struct sembuf sops; + + sops.sem_op = -lock; + sops.sem_flg = 0; + sops.sem_num = sem; + + + /* ---------------- + * Note: if errStatus is -1 and errno == EINTR then it means we + * returned from the operation prematurely because we were + * sent a signal. So we try and lock the semaphore again. + * I am not certain this is correct, but the semantics aren't + * clear it fixes problems with parallel abort synchronization, + * namely that after processing an abort signal, the semaphore + * call returns with -1 (and errno == EINTR) before it should. + * -cim 3/28/90 + * ---------------- + */ + do { + errStatus = semop(semId, &sops, 1); + } while (errStatus == -1 && errno == EINTR); + + IpcSemaphoreUnlock_return = errStatus; + + if (errStatus == -1) { + perror("semop"); + exitpg(255); + } +} + +int +IpcSemaphoreGetCount(IpcSemaphoreId semId, int sem) +{ + int semncnt; + union semun dummy; /* for Solaris */ + + semncnt = semctl(semId, sem, GETNCNT, dummy); + return semncnt; +} + +int +IpcSemaphoreGetValue(IpcSemaphoreId semId, int sem) +{ + int semval; + union semun dummy; /* for Solaris */ + + semval = semctl(semId, sem, GETVAL, dummy); + return semval; +} + +/****************************************************************************/ +/* IpcMemoryCreate(memKey) */ +/* */ +/* - returns the memory identifier, if creation succeeds */ +/* returns IpcMemCreationFailed, if failure */ +/****************************************************************************/ + +IpcMemoryId +IpcMemoryCreate(IpcMemoryKey memKey, uint32 size, int permission) +{ + IpcMemoryId shmid; + + if (memKey == PrivateIPCKey) { + /* private */ + shmid = PrivateMemoryCreate(memKey, size); + }else { + shmid = shmget(memKey, size, IPC_CREAT|permission); + } + + if (shmid < 0) { + fprintf(stderr,"IpcMemoryCreate: memKey=%d , size=%d , permission=%d", + memKey, size , permission ); + perror("IpcMemoryCreate: shmget(..., create, ...) 
failed"); + return(IpcMemCreationFailed); + } + + /* if (memKey == PrivateIPCKey) */ + on_exitpg(IPCPrivateMemoryKill, (caddr_t)shmid); + + return(shmid); +} + +/****************************************************************************/ +/* IpcMemoryIdGet(memKey, size) returns the shared memory Id */ +/* or IpcMemIdGetFailed */ +/****************************************************************************/ +IpcMemoryId +IpcMemoryIdGet(IpcMemoryKey memKey, uint32 size) +{ + IpcMemoryId shmid; + + shmid = shmget(memKey, size, 0); + + if (shmid < 0) { + fprintf(stderr,"IpcMemoryIdGet: memKey=%d , size=%d , permission=%d", + memKey, size , 0 ); + perror("IpcMemoryIdGet: shmget() failed"); + return(IpcMemIdGetFailed); + } + + return(shmid); +} + +/****************************************************************************/ +/* IpcMemoryDetach(status, shmaddr) removes a shared memory segment */ +/* from a backend address space */ +/* (only called by backends running under the postmaster) */ +/****************************************************************************/ +void +IpcMemoryDetach(int status, char *shmaddr) +{ + if (shmdt(shmaddr) < 0) { + elog(NOTICE, "IpcMemoryDetach: shmdt(0x%x): %m", shmaddr); + } +} + +/****************************************************************************/ +/* IpcMemoryAttach(memId) returns the adress of shared memory */ +/* or IpcMemAttachFailed */ +/* */ +/* CALL IT: addr = (struct <MemoryStructure> *) IpcMemoryAttach(memId); */ +/* */ +/****************************************************************************/ +char * +IpcMemoryAttach(IpcMemoryId memId) +{ + char *memAddress; + + if (UsePrivateMemory) { + memAddress = (char *) PrivateMemoryAttach(memId); + } else { + memAddress = (char *) shmat(memId, 0, 0); + } + + /* if ( *memAddress == -1) { XXX ??? */ + if ( memAddress == (char *)-1) { + perror("IpcMemoryAttach: shmat() failed"); + return(IpcMemAttachFailed); + } + + if (!UsePrivateMemory) + on_exitpg(IpcMemoryDetach, (caddr_t) memAddress); + + return((char *) memAddress); +} + + +/****************************************************************************/ +/* IpcMemoryKill(memKey) removes a shared memory segment */ +/* (only called by the postmaster and standalone backends) */ +/****************************************************************************/ +void +IpcMemoryKill(IpcMemoryKey memKey) +{ + IpcMemoryId shmid; + + if (!UsePrivateMemory && (shmid = shmget(memKey, 0, 0)) >= 0) { + if (shmctl(shmid, IPC_RMID, (struct shmid_ds *) NULL) < 0) { + elog(NOTICE, "IpcMemoryKill: shmctl(%d, %d, 0) failed: %m", + shmid, IPC_RMID); + } + } +} + +#ifdef HAS_TEST_AND_SET +/* ------------------ + * use hardware locks to replace semaphores for sequent machines + * to avoid costs of swapping processes and to provide unlimited + * supply of locks. 
+ * ------------------ + */ +static SLock *SLockArray = NULL; +static SLock **FreeSLockPP; +static int *UnusedSLockIP; +static slock_t *SLockMemoryLock; +static IpcMemoryId SLockMemoryId = -1; + +struct ipcdummy { /* to get alignment/size right */ + SLock *free; + int unused; + slock_t memlock; + SLock slocks[NSLOCKS]; +}; +static int SLockMemorySize = sizeof(struct ipcdummy); + +void +CreateAndInitSLockMemory(IPCKey key) +{ + int id; + SLock *slckP; + + SLockMemoryId = IpcMemoryCreate(key, + SLockMemorySize, + 0700); + AttachSLockMemory(key); + *FreeSLockPP = NULL; + *UnusedSLockIP = (int)FIRSTFREELOCKID; + for (id=0; id<(int)FIRSTFREELOCKID; id++) { + slckP = &(SLockArray[id]); + S_INIT_LOCK(&(slckP->locklock)); + slckP->flag = NOLOCK; + slckP->nshlocks = 0; + S_INIT_LOCK(&(slckP->shlock)); + S_INIT_LOCK(&(slckP->exlock)); + S_INIT_LOCK(&(slckP->comlock)); + slckP->next = NULL; + } + return; +} + +void +AttachSLockMemory(IPCKey key) +{ + struct ipcdummy *slockM; + + if (SLockMemoryId == -1) + SLockMemoryId = IpcMemoryIdGet(key,SLockMemorySize); + if (SLockMemoryId == -1) + elog(FATAL, "SLockMemory not in shared memory"); + slockM = (struct ipcdummy *) IpcMemoryAttach(SLockMemoryId); + if (slockM == IpcMemAttachFailed) + elog(FATAL, "AttachSLockMemory: could not attach segment"); + FreeSLockPP = (SLock **) &(slockM->free); + UnusedSLockIP = (int *) &(slockM->unused); + SLockMemoryLock = (slock_t *) &(slockM->memlock); + S_INIT_LOCK(SLockMemoryLock); + SLockArray = (SLock *) &(slockM->slocks[0]); + return; +} + + +#ifdef LOCKDEBUG +#define PRINT_LOCK(LOCK) printf("(locklock = %d, flag = %d, nshlocks = %d, \ +shlock = %d, exlock =%d)\n", LOCK->locklock, \ + LOCK->flag, LOCK->nshlocks, LOCK->shlock, \ + LOCK->exlock) +#endif + +void +ExclusiveLock(int lockid) +{ + SLock *slckP; + slckP = &(SLockArray[lockid]); +#ifdef LOCKDEBUG + printf("ExclusiveLock(%d)\n", lockid); + printf("IN: "); + PRINT_LOCK(slckP); +#endif + ex_try_again: + S_LOCK(&(slckP->locklock)); + switch (slckP->flag) { + case NOLOCK: + slckP->flag = EXCLUSIVELOCK; + S_LOCK(&(slckP->exlock)); + S_LOCK(&(slckP->shlock)); + S_UNLOCK(&(slckP->locklock)); +#ifdef LOCKDEBUG + printf("OUT: "); + PRINT_LOCK(slckP); +#endif + return; + case SHAREDLOCK: + case EXCLUSIVELOCK: + S_UNLOCK(&(slckP->locklock)); + S_LOCK(&(slckP->exlock)); + S_UNLOCK(&(slckP->exlock)); + goto ex_try_again; + } +} + +void +ExclusiveUnlock(int lockid) +{ + SLock *slckP; + + slckP = &(SLockArray[lockid]); +#ifdef LOCKDEBUG + printf("ExclusiveUnlock(%d)\n", lockid); + printf("IN: "); + PRINT_LOCK(slckP); +#endif + S_LOCK(&(slckP->locklock)); + /* ------------- + * give favor to read processes + * ------------- + */ + slckP->flag = NOLOCK; + if (slckP->nshlocks > 0) { + while (slckP->nshlocks > 0) { + S_UNLOCK(&(slckP->shlock)); + S_LOCK(&(slckP->comlock)); + } + S_UNLOCK(&(slckP->shlock)); + } + else { + S_UNLOCK(&(slckP->shlock)); + } + S_UNLOCK(&(slckP->exlock)); + S_UNLOCK(&(slckP->locklock)); +#ifdef LOCKDEBUG + printf("OUT: "); + PRINT_LOCK(slckP); +#endif + return; +} + +bool +LockIsFree(int lockid) +{ + return(SLockArray[lockid].flag == NOLOCK); +} + +#endif /* HAS_TEST_AND_SET */ diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c new file mode 100644 index 00000000000..18d3cccd0ee --- /dev/null +++ b/src/backend/storage/ipc/ipci.c @@ -0,0 +1,149 @@ +/*------------------------------------------------------------------------- + * + * ipci.c-- + * POSTGRES inter-process communication initialization code. 
+ * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/ipc/ipci.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include "c.h" + +#include "storage/ipc.h" +#include "storage/multilev.h" +#include "utils/elog.h" +#include "storage/sinval.h" +#include "storage/bufmgr.h" +#include "storage/proc.h" +#include "storage/smgr.h" +#include "storage/lock.h" +#include "miscadmin.h" /* for DebugLvl */ + +/* + * SystemPortAddressCreateMemoryKey -- + * Returns a memory key given a port address. + */ +IPCKey +SystemPortAddressCreateIPCKey(SystemPortAddress address) +{ + Assert(address < 32768); /* XXX */ + + return (SystemPortAddressGetIPCKey(address)); +} + +/* + * CreateSharedMemoryAndSemaphores -- + * Creates and initializes shared memory and semaphores. + */ +/************************************************** + + CreateSharedMemoryAndSemaphores + is called exactly *ONCE* by the postmaster. + It is *NEVER* called by the postgres backend + + 0) destroy any existing semaphores for both buffer + and lock managers. + 1) create the appropriate *SHARED* memory segments + for the two resource managers. + + **************************************************/ + +void +CreateSharedMemoryAndSemaphores(IPCKey key) +{ + int size; + +#ifdef HAS_TEST_AND_SET + /* --------------- + * create shared memory for slocks + * -------------- + */ + CreateAndInitSLockMemory(IPCKeyGetSLockSharedMemoryKey(key)); +#endif + /* ---------------- + * kill and create the buffer manager buffer pool (and semaphore) + * ---------------- + */ + CreateSpinlocks(IPCKeyGetSpinLockSemaphoreKey(key)); + size = BufferShmemSize() + LockShmemSize(); + +#ifdef MAIN_MEMORY + size += MMShmemSize(); +#endif /* MAIN_MEMORY */ + + if (DebugLvl > 1) { + fprintf(stderr, "binding ShmemCreate(key=%x, size=%d)\n", + IPCKeyGetBufferMemoryKey(key), size); + } + ShmemCreate(IPCKeyGetBufferMemoryKey(key), size); + ShmemBindingTabReset(); + InitShmem(key, size); + InitBufferPool(key); + + /* ---------------- + * do the lock table stuff + * ---------------- + */ + InitLocks(); + InitMultiLevelLockm(); + if (InitMultiLevelLockm() == INVALID_TABLEID) + elog(FATAL, "Couldn't create the lock table"); + + /* ---------------- + * do process table stuff + * ---------------- + */ + InitProcGlobal(key); + on_exitpg(ProcFreeAllSemaphores, 0); + + CreateSharedInvalidationState(key); +} + + +/* + * AttachSharedMemoryAndSemaphores -- + * Attachs existant shared memory and semaphores. 
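CreateSharedMemoryAndSemaphores() above fans the single key it is handed out into per-subsystem keys with the IPCKeyGet* macros from ipc.h. A small hypothetical illustration of that fan-out (assuming storage/ipc.h; DefaultIPCKey is 17317):

    #include <stdio.h>
    #include "storage/ipc.h"

    int
    main()
    {
        IPCKey key = DefaultIPCKey;     /* 17317 */

        printf("slock segment key:        %u\n", IPCKeyGetSLockSharedMemoryKey(key)); /* 17327 */
        printf("spinlock semaphore key:   %u\n", IPCKeyGetSpinLockSemaphoreKey(key)); /* 17328 */
        printf("buffer/lock segment key:  %u\n", IPCKeyGetBufferMemoryKey(key));      /* 17318 */
        printf("per-backend sema base:    %u\n", IPCGetProcessSemaphoreInitKey(key)); /* 17331 */
        return 0;
    }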
+ */ +void +AttachSharedMemoryAndSemaphores(IPCKey key) +{ + int size; + + /* ---------------- + * create rather than attach if using private key + * ---------------- + */ + if (key == PrivateIPCKey) { + CreateSharedMemoryAndSemaphores(key); + return; + } + +#ifdef HAS_TEST_AND_SET + /* ---------------- + * attach the slock shared memory + * ---------------- + */ + AttachSLockMemory(IPCKeyGetSLockSharedMemoryKey(key)); +#endif + /* ---------------- + * attach the buffer manager buffer pool (and semaphore) + * ---------------- + */ + size = BufferShmemSize() + LockShmemSize(); + InitShmem(key, size); + InitBufferPool(key); + + /* ---------------- + * initialize lock table stuff + * ---------------- + */ + InitLocks(); + if (InitMultiLevelLockm() == INVALID_TABLEID) + elog(FATAL, "Couldn't attach to the lock table"); + + AttachSharedInvalidationState(key); +} diff --git a/src/backend/storage/ipc/s_lock.c b/src/backend/storage/ipc/s_lock.c new file mode 100644 index 00000000000..3cbe796fc59 --- /dev/null +++ b/src/backend/storage/ipc/s_lock.c @@ -0,0 +1,440 @@ +/*------------------------------------------------------------------------- + * + * s_lock.c-- + * This file contains the implementation (if any) for spinlocks. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/ipc/Attic/s_lock.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +/* + * DESCRIPTION + * The following code fragment should be written (in assembly + * language) on machines that have a native test-and-set instruction: + * + * void + * S_LOCK(char_address) + * char *char_address; + * { + * while (test_and_set(char_address)) + * ; + * } + * + * If this is not done, POSTGRES will default to using System V + * semaphores (and take a large performance hit -- around 40% of + * its time on a DS5000/240 is spent in semop(3)...). + * + * NOTES + * AIX has a test-and-set but the recommended interface is the cs(3) + * system call. This provides an 8-instruction (plus system call + * overhead) uninterruptible compare-and-set operation. True + * spinlocks might be faster but using cs(3) still speeds up the + * regression test suite by about 25%. I don't have an assembler + * manual for POWER in any case. + * + */ +#ifdef WIN32 +#include <windows.h> +#endif /* WIN32 */ +#include "storage/ipc.h" + + +#if defined(HAS_TEST_AND_SET) + +#if defined (PORTNAME_next) +/* + * NEXTSTEP (mach) + * slock_t is defined as a struct mutex. + */ +void +S_LOCK(slock_t *lock) +{ + mutex_lock(lock); +} +void +S_UNLOCK(slock_t *lock) +{ + mutex_unlock(lock); +} +void +S_INIT_LOCK(slock_t *lock) +{ + mutex_init(lock); +} + + /* S_LOCK_FREE should return 1 if lock is free; 0 if lock is locked */ +int + S_LOCK_FREE(slock_t *lock) +{ + /* For Mach, we have to delve inside the entrails of `struct +mutex'. Ick! */ + return (lock->lock == 0); +} + +#endif /* PORTNAME_next */ + + + +#if defined(PORTNAME_irix5) +/* + * SGI IRIX 5 + * slock_t is defined as a struct abilock_t, which has a single unsigned long + * member. + * + * This stuff may be supplemented in the future with Masato Kataoka's MIPS-II + * assembly from his NECEWS SVR4 port, but we probably ought to retain this + * for the R3000 chips out there. 
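Whatever the per-port implementation looks like, callers see only the three-call interface declared in ipc.h. A brief usage sketch, assuming a port with HAS_TEST_AND_SET and that in real use both the lock and the data it protects live in shared memory (the counter here is hypothetical):

    #include "storage/ipc.h"

    static slock_t counter_lock;        /* would be allocated in shared memory */
    static int     counter;

    void
    counter_init(void)
    {
        S_INIT_LOCK(&counter_lock);     /* lock starts out free */
        counter = 0;
    }

    void
    counter_bump(void)
    {
        S_LOCK(&counter_lock);          /* busy-waits until the lock is free */
        counter++;                      /* keep the critical section short */
        S_UNLOCK(&counter_lock);
    }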
+ */ +void +S_LOCK(slock_t *lock) +{ + /* spin_lock(lock); */ + while (!acquire_lock(lock)) + ; +} + +void +S_UNLOCK(slock_t *lock) +{ + (void)release_lock(lock); +} + +void +S_INIT_LOCK(slock_t *lock) +{ + (void)init_lock(lock); +} + +/* S_LOCK_FREE should return 1 if lock is free; 0 if lock is locked */ +int +S_LOCK_FREE(slock_t *lock) +{ + return(stat_lock(lock)==UNLOCKED); +} + +#endif /* PORTNAME_irix5 */ + + +/* + * OSF/1 (Alpha AXP) + * + * Note that slock_t on the Alpha AXP is msemaphore instead of char + * (see storage/ipc.h). + */ + +#if defined(PORTNAME_alpha) + +void +S_LOCK(slock_t *lock) +{ + while (msem_lock(lock, MSEM_IF_NOWAIT) < 0) + ; +} + +void +S_UNLOCK(slock_t *lock) +{ + (void) msem_unlock(lock, 0); +} + +void +S_INIT_LOCK(slock_t *lock) +{ + (void) msem_init(lock, MSEM_UNLOCKED); +} + +int +S_LOCK_FREE(slock_t *lock) +{ + return(lock->msem_state ? 0 : 1); +} + +#endif /* PORTNAME_alpha */ + +/* + * Solaris 2 + */ + +#if defined(PORTNAME_sparc_solaris) + +/* defined in port/.../tas.s */ +extern int tas(slock_t *lock); + +void +S_LOCK(slock_t *lock) +{ + while (tas(lock)) + ; +} + +void +S_UNLOCK(slock_t *lock) +{ + *lock = 0; +} + +void +S_INIT_LOCK(slock_t *lock) +{ + S_UNLOCK(lock); +} + +#endif /* PORTNAME_sparc_solaris */ + +/* + * AIX (POWER) + * + * Note that slock_t on POWER/POWER2/PowerPC is int instead of char + * (see storage/ipc.h). + */ + +#if defined(PORTNAME_aix) + +void +S_LOCK(slock_t *lock) +{ + while (cs((int *) lock, 0, 1)) + ; +} + +void +S_UNLOCK(slock_t *lock) +{ + *lock = 0; +} + +void +S_INIT_LOCK(slock_t *lock) +{ + S_UNLOCK(lock); +} + +#endif /* PORTNAME_aix */ + +/* + * HP-UX (PA-RISC) + * + * Note that slock_t on PA-RISC is a structure instead of char + * (see storage/ipc.h). + */ + +#if defined(PORTNAME_hpux) + +/* defined in port/.../tas.s */ +extern int tas(slock_t *lock); + +/* +* a "set" slock_t has a single word cleared. a "clear" slock_t has +* all words set to non-zero. +*/ +static slock_t clear_lock = { -1, -1, -1, -1 }; + +void +S_LOCK(slock_t *lock) +{ + while (tas(lock)) + ; +} + +void +S_UNLOCK(slock_t *lock) +{ + *lock = clear_lock; /* struct assignment */ +} + +void +S_INIT_LOCK(slock_t *lock) +{ + S_UNLOCK(lock); +} + +int +S_LOCK_FREE(slock_t *lock) +{ + register int *lock_word = (int *) (((long) lock + 15) & ~15); + + return(*lock_word != 0); +} + +#endif /* PORTNAME_hpux */ + +/* + * sun3 + */ + +#if (defined(sun) && ! defined(sparc)) + +void +S_LOCK(slock_t *lock) +{ + while (tas(lock)); +} + +void +S_UNLOCK(slock_t *lock) +{ + *lock = 0; +} + +void +S_INIT_LOCK(slock_t *lock) +{ + S_UNLOCK(lock); +} + +static int +tas_dummy() +{ + asm("LLA0:"); + asm(" .data"); + asm(" .text"); + asm("|#PROC# 04"); + asm(" .globl _tas"); + asm("_tas:"); + asm("|#PROLOGUE# 1"); + asm(" movel sp@(0x4),a0"); + asm(" tas a0@"); + asm(" beq LLA1"); + asm(" moveq #-128,d0"); + asm(" rts"); + asm("LLA1:"); + asm(" moveq #0,d0"); + asm(" rts"); + asm(" .data"); +} + +#endif + +/* + * SPARC (SunOS 4) + */ + +#if defined(PORTNAME_sparc) + +/* if we're using -ansi w/ gcc, use __asm__ instead of asm */ +#if defined(__STRICT_ANSI__) +#define asm(x) __asm__(x) +#endif + +static int +tas_dummy() +{ + asm(".seg \"data\""); + asm(".seg \"text\""); + asm(".global _tas"); + asm("_tas:"); + + /* + * Sparc atomic test and set (sparc calls it "atomic load-store") + */ + + asm("ldstub [%r8], %r8"); + + /* + * Did test and set actually do the set? + */ + + asm("tst %r8"); + + asm("be,a ReturnZero"); + + /* + * otherwise, just return. 
+ */ + + asm("clr %r8"); + asm("mov 0x1, %r8"); + asm("ReturnZero:"); + asm("retl"); + asm("nop"); +} + +void +S_LOCK(unsigned char *addr) +{ + while (tas(addr)); +} + + +/* + * addr should be as in the above S_LOCK routine + */ +void +S_UNLOCK(unsigned char *addr) +{ + *addr = 0; +} + +void +S_INIT_LOCK(unsigned char *addr) +{ + *addr = 0; +} + +#endif /* PORTNAME_sparc */ + +/* + * Linux and friends + */ + +#if defined(PORTNAME_linux) || defined(PORTNAME_BSD44_derived) + +int +tas(slock_t *m) +{ + slock_t res; + __asm__("xchgb %0,%1":"=q" (res),"=m" (*m):"0" (0x1)); + return(res); +} + +void +S_LOCK(slock_t *lock) +{ + while (tas(lock)) + ; +} + +void +S_UNLOCK(slock_t *lock) +{ + *lock = 0; +} + +void +S_INIT_LOCK(slock_t *lock) +{ + S_UNLOCK(lock); +} + +#endif /* PORTNAME_linux || PORTNAME_BSD44_derived */ + + +#endif /* HAS_TEST_AND_SET */ + + +#ifdef WIN32 +void +S_LOCK(HANDLE *lock) +{ + int x = 0; + x = x / x; +} + +void +S_UNLOCK(HANDLE *lock) +{ + int x = 0; + x = x / x; +} + +void +S_INIT_LOCK(HANDLE *lock) +{ + int x = 0; + x = x / x; +} +#endif /*WIN32*/ diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c new file mode 100644 index 00000000000..4eba3729ac8 --- /dev/null +++ b/src/backend/storage/ipc/shmem.c @@ -0,0 +1,561 @@ +/*------------------------------------------------------------------------- + * + * shmem.c-- + * create shared memory and initialize shared memory data structures. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/ipc/shmem.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +/* + * POSTGRES processes share one or more regions of shared memory. + * The shared memory is created by a postmaster and is "attached to" + * by each of the backends. The routines in this file are used for + * allocating and binding to shared memory data structures. + * + * NOTES: + * (a) There are three kinds of shared memory data structures + * available to POSTGRES: fixed-size structures, queues and hash + * tables. Fixed-size structures contain things like global variables + * for a module and should never be allocated after the process + * initialization phase. Hash tables have a fixed maximum size, but + * their actual size can vary dynamically. When entries are added + * to the table, more space is allocated. Queues link data structures + * that have been allocated either as fixed size structures or as hash + * buckets. Each shared data structure has a string name to identify + * it (assigned in the module that declares it). + * + * (b) During initialization, each module looks for its + * shared data structures in a hash table called the "Binding Table". + * If the data structure is not present, the caller can allocate + * a new one and initialize it. If the data structure is present, + * the caller "attaches" to the structure by initializing a pointer + * in the local address space. + * The binding table has two purposes: first, it gives us + * a simple model of how the world looks when a backend process + * initializes. If something is present in the binding table, + * it is initialized. If it is not, it is uninitialized. Second, + * the binding table allows us to allocate shared memory on demand + * instead of trying to preallocate structures and hard-wire the + * sizes and locations in header files. 
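From a module's point of view, the attach-or-create idiom described above looks roughly like the sketch below. The structure name and fields are hypothetical, and it assumes postgres.h, storage/shmem.h and elog() are available; ShmemInitStruct() itself appears later in this file.

    #include "postgres.h"
    #include "storage/shmem.h"
    #include "utils/elog.h"

    typedef struct {
        int nrequests;                  /* hypothetical module counters */
        int nhits;
    } MyModuleStats;

    static MyModuleStats *myStats;

    void
    MyModuleShmemInit(void)
    {
        bool found;

        /* look the structure up in the binding table; only allocate it
         * if no other backend has created it yet */
        myStats = (MyModuleStats *)
            ShmemInitStruct("My Module Stats", sizeof(MyModuleStats), &found);
        if (myStats == NULL)
            elog(FATAL, "MyModuleShmemInit: could not attach to shared memory");
        if (!found) {
            myStats->nrequests = 0;     /* first backend initializes the contents */
            myStats->nhits = 0;
        }
    }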
If you are using a lot + * of shared memory in a lot of different places (and changing + * things during development), this is important. + * + * (c) memory allocation model: shared memory can never be + * freed, once allocated. Each hash table has its own free list, + * so hash buckets can be reused when an item is deleted. However, + * if one hash table grows very large and then shrinks, its space + * cannot be redistributed to other tables. We could build a simple + * hash bucket garbage collector if need be. Right now, it seems + * unnecessary. + * + * See InitSem() in sem.c for an example of how to use the + * binding table. + * + */ +#include <stdio.h> +#include <string.h> +#include "postgres.h" +#include "storage/ipc.h" +#include "storage/shmem.h" +#include "storage/spin.h" +#include "utils/hsearch.h" +#include "utils/elog.h" + +/* shared memory global variables */ + +unsigned long ShmemBase = 0; /* start and end address of + * shared memory + */ +static unsigned long ShmemEnd = 0; +static unsigned long ShmemSize = 0; /* current size (and default) */ + +SPINLOCK ShmemLock; /* lock for shared memory allocation */ + +SPINLOCK BindingLock; /* lock for binding table access */ + +static unsigned long *ShmemFreeStart = NULL; /* pointer to the OFFSET of + * first free shared memory + */ +static unsigned long *ShmemBindingTabOffset = NULL; /* start of the binding + * table (for bootstrap) + */ +static int ShmemBootstrap = FALSE; /* flag becomes true when shared mem + * is created by POSTMASTER + */ + +static HTAB *BindingTable = NULL; + +/* --------------------- + * ShmemBindingTabReset() - Resets the binding table to NULL.... + * useful when the postmaster destroys existing shared memory + * and creates all new segments after a backend crash. + * ---------------------- + */ +void +ShmemBindingTabReset() +{ + BindingTable = (HTAB *)NULL; +} + +/* + * CreateSharedRegion() -- + * + * This routine is called once by the postmaster to + * initialize the shared buffer pool. Assume there is + * only one postmaster so no synchronization is necessary + * until after this routine completes successfully. + * + * key is a unique identifier for the shmem region. + * size is the size of the region. + */ +static IpcMemoryId ShmemId; + +void +ShmemCreate(unsigned int key, unsigned int size) +{ + if (size) + ShmemSize = size; + /* create shared mem region */ + if ((ShmemId=IpcMemoryCreate(key,ShmemSize,IPCProtection)) + ==IpcMemCreationFailed) { + elog(FATAL,"ShmemCreate: cannot create region"); + exit(1); + } + + /* ShmemBootstrap is true if shared memory has been + * created, but not yet initialized. Only the + * postmaster/creator-of-all-things should have + * this flag set. + */ + ShmemBootstrap = TRUE; +} + +/* + * InitShmem() -- map region into process address space + * and initialize shared data structures. 
+ * + */ +int +InitShmem(unsigned int key, unsigned int size) +{ + Pointer sharedRegion; + unsigned long currFreeSpace; + + HASHCTL info; + int hash_flags; + BindingEnt * result,item; + bool found; + IpcMemoryId shmid; + + /* if zero key, use default memory size */ + if (size) + ShmemSize = size; + + /* default key is 0 */ + + /* attach to shared memory region (SysV or BSD OS specific) */ + if (ShmemBootstrap && key == PrivateIPCKey) + /* if we are running backend alone */ + shmid = ShmemId; + else + shmid = IpcMemoryIdGet(IPCKeyGetBufferMemoryKey(key), ShmemSize); + sharedRegion = IpcMemoryAttach(shmid); + if (sharedRegion == NULL) { + elog(FATAL,"AttachSharedRegion: couldn't attach to shmem\n"); + return(FALSE); + } + + /* get pointers to the dimensions of shared memory */ + ShmemBase = (unsigned long) sharedRegion; + ShmemEnd = (unsigned long) sharedRegion + ShmemSize; + currFreeSpace = 0; + + /* First long in shared memory is the count of available space */ + ShmemFreeStart = (unsigned long *) ShmemBase; + /* next is a shmem pointer to the binding table */ + ShmemBindingTabOffset = ShmemFreeStart + 1; + + currFreeSpace += + sizeof(ShmemFreeStart) + sizeof(ShmemBindingTabOffset); + + /* bootstrap initialize spin locks so we can start to use the + * allocator and binding table. + */ + if (! InitSpinLocks(ShmemBootstrap, IPCKeyGetSpinLockSemaphoreKey(key))) { + return(FALSE); + } + + /* We have just allocated additional space for two spinlocks. + * Now setup the global free space count + */ + if (ShmemBootstrap) { + *ShmemFreeStart = currFreeSpace; + } + + /* if ShmemFreeStart is NULL, then the allocator won't work */ + Assert(*ShmemFreeStart); + + /* create OR attach to the shared memory binding table */ + info.keysize = BTABLE_KEYSIZE; + info.datasize = BTABLE_DATASIZE; + hash_flags = (HASH_ELEM); + + /* This will acquire the binding table lock, but not release it. */ + BindingTable = ShmemInitHash("BindingTable", + BTABLE_SIZE,BTABLE_SIZE, + &info,hash_flags); + + if (! BindingTable) { + elog(FATAL,"InitShmem: couldn't initialize Binding Table"); + return(FALSE); + } + + /* Now, check the binding table for an entry to the binding + * table. If there is an entry there, someone else created + * the table. Otherwise, we did and we have to initialize it. + */ + memset(item.key, 0, BTABLE_KEYSIZE); + strncpy(item.key,"BindingTable",BTABLE_KEYSIZE); + + result = (BindingEnt *) + hash_search(BindingTable,(char *) &item,HASH_ENTER, &found); + + + if (! result ) { + elog(FATAL,"InitShmem: corrupted binding table"); + return(FALSE); + } + + if (! found) { + /* bootstrapping shmem: we have to initialize the + * binding table now. + */ + + Assert(ShmemBootstrap); + result->location = MAKE_OFFSET(BindingTable->hctl); + *ShmemBindingTabOffset = result->location; + result->size = BTABLE_SIZE; + + ShmemBootstrap = FALSE; + + } else { + Assert(! ShmemBootstrap); + } + /* now release the lock acquired in ShmemHashInit */ + SpinRelease (BindingLock); + + Assert (result->location == MAKE_OFFSET(BindingTable->hctl)); + + return(TRUE); +} + +/* + * ShmemAlloc -- allocate word-aligned byte string from + * shared memory + * + * Assumes ShmemLock and ShmemFreeStart are initialized. + * Returns: real pointer to memory or NULL if we are out + * of space. Has to return a real pointer in order + * to be compatable with malloc(). + */ +long * +ShmemAlloc(unsigned long size) +{ + unsigned long tmpFree; + long *newSpace; + + /* + * ensure space is word aligned. + * + * Word-alignment is not good enough. 
We have to be more + * conservative: doubles need 8-byte alignment. (We probably only need + * this on RISC platforms but this is not a big waste of space.) + * - ay 12/94 + */ + if (size % sizeof(double)) + size += sizeof(double) - (size % sizeof(double)); + + Assert(*ShmemFreeStart); + + SpinAcquire(ShmemLock); + + tmpFree = *ShmemFreeStart + size; + if (tmpFree <= ShmemSize) { + newSpace = (long *)MAKE_PTR(*ShmemFreeStart); + *ShmemFreeStart += size; + } else { + newSpace = NULL; + } + + SpinRelease(ShmemLock); + + if (! newSpace) { + elog(NOTICE,"ShmemAlloc: out of memory "); + } + return(newSpace); +} + +/* + * ShmemIsValid -- test if an offset refers to valid shared memory + * + * Returns TRUE if the pointer is valid. + */ +int +ShmemIsValid(unsigned long addr) +{ + return ((addr<ShmemEnd) && (addr>=ShmemBase)); +} + +/* + * ShmemInitHash -- Create/Attach to and initialize + * shared memory hash table. + * + * Notes: + * + * assume caller is doing some kind of synchronization + * so that two people dont try to create/initialize the + * table at once. Use SpinAlloc() to create a spinlock + * for the structure before creating the structure itself. + */ +HTAB * +ShmemInitHash(char *name, /* table string name for binding */ + long init_size, /* initial size */ + long max_size, /* max size of the table */ + HASHCTL *infoP, /* info about key and bucket size */ + int hash_flags) /* info about infoP */ +{ + bool found; + long * location; + + /* shared memory hash tables have a fixed max size so that the + * control structures don't try to grow. The segbase is for + * calculating pointer values. The shared memory allocator + * must be specified. + */ + infoP->segbase = (long *) ShmemBase; + infoP->alloc = ShmemAlloc; + infoP->max_size = max_size; + hash_flags |= HASH_SHARED_MEM; + + /* look it up in the binding table */ + location = + ShmemInitStruct(name,my_log2(max_size) + sizeof(HHDR),&found); + + /* binding table is corrupted. Let someone else give the + * error message since they have more information + */ + if (location == NULL) { + return(0); + } + + /* it already exists, attach to it rather than allocate and + * initialize new space + */ + if (found) { + hash_flags |= HASH_ATTACH; + } + + /* these structures were allocated or bound in ShmemInitStruct */ + /* control information and parameters */ + infoP->hctl = (long *) location; + /* directory for hash lookup */ + infoP->dir = (long *) (location + sizeof(HHDR)); + + return(hash_create(init_size, infoP, hash_flags));; +} + +/* + * ShmemPIDLookup -- lookup process data structure using process id + * + * Returns: TRUE if no error. locationPtr is initialized if PID is + * found in the binding table. + * + * NOTES: + * only information about success or failure is the value of + * locationPtr. + */ +bool +ShmemPIDLookup(int pid, SHMEM_OFFSET* locationPtr) +{ + BindingEnt * result,item; + bool found; + + Assert (BindingTable); + memset(item.key, 0, BTABLE_KEYSIZE); + sprintf(item.key,"PID %d",pid); + + SpinAcquire(BindingLock); + result = (BindingEnt *) + hash_search(BindingTable,(char *) &item, HASH_ENTER, &found); + + if (! 
result) { + + SpinRelease(BindingLock); + elog(WARN,"ShmemInitPID: BindingTable corrupted"); + return(FALSE); + + } + + if (found) { + *locationPtr = result->location; + } else { + result->location = *locationPtr; + } + + SpinRelease(BindingLock); + return (TRUE); +} + +/* + * ShmemPIDDestroy -- destroy binding table entry for process + * using process id + * + * Returns: offset of the process struct in shared memory or + * INVALID_OFFSET if not found. + * + * Side Effect: removes the entry from the binding table + */ +SHMEM_OFFSET +ShmemPIDDestroy(int pid) +{ + BindingEnt * result,item; + bool found; + SHMEM_OFFSET location; + + Assert(BindingTable); + + memset(item.key, 0, BTABLE_KEYSIZE); + sprintf(item.key,"PID %d",pid); + + SpinAcquire(BindingLock); + result = (BindingEnt *) + hash_search(BindingTable,(char *) &item, HASH_REMOVE, &found); + + if (found) + location = result->location; + SpinRelease(BindingLock); + + if (! result) { + + elog(WARN,"ShmemPIDDestroy: PID table corrupted"); + return(INVALID_OFFSET); + + } + + if (found) + return (location); + else { + return(INVALID_OFFSET); + } +} + +/* + * ShmemInitStruct -- Create/attach to a structure in shared + * memory. + * + * This is called during initialization to find or allocate + * a data structure in shared memory. If no other processes + * have created the structure, this routine allocates space + * for it. If it exists already, a pointer to the existing + * table is returned. + * + * Returns: real pointer to the object. FoundPtr is TRUE if + * the object is already in the binding table (hence, already + * initialized). + */ +long * +ShmemInitStruct(char *name, unsigned long size, bool *foundPtr) +{ + BindingEnt * result,item; + long * structPtr; + + strncpy(item.key,name,BTABLE_KEYSIZE); + item.location = BAD_LOCATION; + + SpinAcquire(BindingLock); + + if (! BindingTable) { + /* Assert() is a macro now. substitutes inside quotes. */ + char *strname = "BindingTable"; + + /* If the binding table doesnt exist, we fake it. + * + * If we are creating the first binding table, then let + * shmemalloc() allocate the space for a new HTAB. Otherwise, + * find the old one and return that. Notice that the + * BindingLock is held until the binding table has been completely + * initialized. + */ + Assert (! strcmp(name,strname)) ; + if (ShmemBootstrap) { + /* in POSTMASTER/Single process */ + + *foundPtr = FALSE; + return((long *)ShmemAlloc(size)); + + } else { + Assert (ShmemBindingTabOffset); + + *foundPtr = TRUE; + return((long *)MAKE_PTR(*ShmemBindingTabOffset)); + } + + + } else { + /* look it up in the bindint table */ + result = (BindingEnt *) + hash_search(BindingTable,(char *) &item,HASH_ENTER, foundPtr); + } + + if (! result) { + + SpinRelease(BindingLock); + + elog(WARN,"ShmemInitStruct: Binding Table corrupted"); + return(NULL); + + } else if (*foundPtr) { + /* + * Structure is in the binding table so someone else has allocated + * it already. The size better be the same as the size we are + * trying to initialize to or there is a name conflict (or worse). + */ + if (result->size != size) { + SpinRelease(BindingLock); + + elog(NOTICE,"ShmemInitStruct: BindingTable entry size is wrong"); + /* let caller print its message too */ + return(NULL); + } + structPtr = (long *)MAKE_PTR(result->location); + } else { + + /* It isn't in the table yet. allocate and initialize it */ + structPtr = ShmemAlloc((long)size); + if (! 
structPtr) { + /* out of memory */ + Assert (BindingTable); + (void) hash_search(BindingTable,(char *) &item,HASH_REMOVE, foundPtr); + SpinRelease(BindingLock); + *foundPtr = FALSE; + + elog(NOTICE,"ShmemInitStruct: cannot allocate '%s'", + name); + return(NULL); + } + result->size = size; + result->location = MAKE_OFFSET(structPtr); + } + Assert (ShmemIsValid((unsigned long)structPtr)); + + SpinRelease(BindingLock); + return(structPtr); +} + + + diff --git a/src/backend/storage/ipc/shmqueue.c b/src/backend/storage/ipc/shmqueue.c new file mode 100644 index 00000000000..f08546742b5 --- /dev/null +++ b/src/backend/storage/ipc/shmqueue.c @@ -0,0 +1,251 @@ +/*------------------------------------------------------------------------- + * + * shmqueue.c-- + * shared memory linked lists + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/ipc/shmqueue.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $ + * + * NOTES + * + * Package for managing doubly-linked lists in shared memory. + * The only tricky thing is that SHM_QUEUE will usually be a field + * in a larger record. SHMQueueGetFirst has to return a pointer + * to the record itself instead of a pointer to the SHMQueue field + * of the record. It takes an extra pointer and does some extra + * pointer arithmetic to do this correctly. + * + * NOTE: These are set up so they can be turned into macros some day. + * + *------------------------------------------------------------------------- + */ +#include <stdio.h> /* for sprintf() */ +#include "postgres.h" +#include "storage/shmem.h" /* where the declarations go */ +#include "utils/elog.h" + +/*#define SHMQUEUE_DEBUG*/ +#ifdef SHMQUEUE_DEBUG +#define SHMQUEUE_DEBUG_DEL /* deletions */ +#define SHMQUEUE_DEBUG_HD /* head inserts */ +#define SHMQUEUE_DEBUG_TL /* tail inserts */ +#define SHMQUEUE_DEBUG_ELOG NOTICE +#endif /* SHMQUEUE_DEBUG */ + +/* + * ShmemQueueInit -- make the head of a new queue point + * to itself + */ +void +SHMQueueInit(SHM_QUEUE *queue) +{ + Assert(SHM_PTR_VALID(queue)); + (queue)->prev = (queue)->next = MAKE_OFFSET(queue); +} + +/* + * SHMQueueIsDetached -- TRUE if element is not currently + * in a queue. 
+ */ +bool +SHMQueueIsDetached(SHM_QUEUE *queue) +{ + Assert(SHM_PTR_VALID(queue)); + return ((queue)->prev == INVALID_OFFSET); +} + +/* + * SHMQueueElemInit -- clear an element's links + */ +void +SHMQueueElemInit(SHM_QUEUE *queue) +{ + Assert(SHM_PTR_VALID(queue)); + (queue)->prev = (queue)->next = INVALID_OFFSET; +} + +/* + * SHMQueueDelete -- remove an element from the queue and + * close the links + */ +void +SHMQueueDelete(SHM_QUEUE *queue) +{ + SHM_QUEUE *nextElem = (SHM_QUEUE *) MAKE_PTR((queue)->next); + SHM_QUEUE *prevElem = (SHM_QUEUE *) MAKE_PTR((queue)->prev); + + Assert(SHM_PTR_VALID(queue)); + Assert(SHM_PTR_VALID(nextElem)); + Assert(SHM_PTR_VALID(prevElem)); + +#ifdef SHMQUEUE_DEBUG_DEL + dumpQ(queue, "in SHMQueueDelete: begin"); +#endif /* SHMQUEUE_DEBUG_DEL */ + + prevElem->next = (queue)->next; + nextElem->prev = (queue)->prev; + +#ifdef SHMQUEUE_DEBUG_DEL + dumpQ((SHM_QUEUE *)MAKE_PTR(queue->prev), "in SHMQueueDelete: end"); +#endif /* SHMQUEUE_DEBUG_DEL */ +} + +#ifdef SHMQUEUE_DEBUG +void +dumpQ(SHM_QUEUE *q, char *s) +{ + char elem[16]; + char buf[1024]; + SHM_QUEUE *start = q; + int count = 0; + + sprintf(buf, "q prevs: %x", MAKE_OFFSET(q)); + q = (SHM_QUEUE *)MAKE_PTR(q->prev); + while (q != start) + { + sprintf(elem, "--->%x", MAKE_OFFSET(q)); + strcat(buf, elem); + q = (SHM_QUEUE *)MAKE_PTR(q->prev); + if (q->prev == MAKE_OFFSET(q)) + break; + if (count++ > 40) + { + strcat(buf, "BAD PREV QUEUE!!"); + break; + } + } + sprintf(elem, "--->%x", MAKE_OFFSET(q)); + strcat(buf, elem); + elog(SHMQUEUE_DEBUG_ELOG, "%s: %s", s, buf); + + sprintf(buf, "q nexts: %x", MAKE_OFFSET(q)); + count = 0; + q = (SHM_QUEUE *)MAKE_PTR(q->next); + while (q != start) + { + sprintf(elem, "--->%x", MAKE_OFFSET(q)); + strcat(buf, elem); + q = (SHM_QUEUE *)MAKE_PTR(q->next); + if (q->next == MAKE_OFFSET(q)) + break; + if (count++ > 10) + { + strcat(buf, "BAD NEXT QUEUE!!"); + break; + } + } + sprintf(elem, "--->%x", MAKE_OFFSET(q)); + strcat(buf, elem); + elog(SHMQUEUE_DEBUG_ELOG, "%s: %s", s, buf); +} +#endif /* SHMQUEUE_DEBUG */ + +/* + * SHMQueueInsertHD -- put elem in queue between the queue head + * and its "prev" element. + */ +void +SHMQueueInsertHD(SHM_QUEUE *queue, SHM_QUEUE *elem) +{ + SHM_QUEUE *prevPtr = (SHM_QUEUE *) MAKE_PTR((queue)->prev); + SHMEM_OFFSET elemOffset = MAKE_OFFSET(elem); + + Assert(SHM_PTR_VALID(queue)); + Assert(SHM_PTR_VALID(elem)); + +#ifdef SHMQUEUE_DEBUG_HD + dumpQ(queue, "in SHMQueueInsertHD: begin"); +#endif /* SHMQUEUE_DEBUG_HD */ + + (elem)->next = prevPtr->next; + (elem)->prev = queue->prev; + (queue)->prev = elemOffset; + prevPtr->next = elemOffset; + +#ifdef SHMQUEUE_DEBUG_HD + dumpQ(queue, "in SHMQueueInsertHD: end"); +#endif /* SHMQUEUE_DEBUG_HD */ +} + +void +SHMQueueInsertTL(SHM_QUEUE *queue, SHM_QUEUE *elem) +{ + SHM_QUEUE *nextPtr = (SHM_QUEUE *) MAKE_PTR((queue)->next); + SHMEM_OFFSET elemOffset = MAKE_OFFSET(elem); + + Assert(SHM_PTR_VALID(queue)); + Assert(SHM_PTR_VALID(elem)); + +#ifdef SHMQUEUE_DEBUG_TL + dumpQ(queue, "in SHMQueueInsertTL: begin"); +#endif /* SHMQUEUE_DEBUG_TL */ + + (elem)->prev = nextPtr->prev; + (elem)->next = queue->next; + (queue)->next = elemOffset; + nextPtr->prev = elemOffset; + +#ifdef SHMQUEUE_DEBUG_TL + dumpQ(queue, "in SHMQueueInsertTL: end"); +#endif /* SHMQUEUE_DEBUG_TL */ +} + +/* + * SHMQueueFirst -- Get the first element from a queue + * + * First element is queue->next. 
If SHMQueue is part of + * a larger structure, we want to return a pointer to the + * whole structure rather than a pointer to its SHMQueue field. + * I.E. struct { + * int stuff; + * SHMQueue elem; + * } ELEMType; + * when this element is in a queue (queue->next) is struct.elem. + * nextQueue allows us to calculate the offset of the SHMQueue + * field in the structure. + * + * call to SHMQueueFirst should take these parameters: + * + * &(queueHead),&firstElem,&(firstElem->next) + * + * Note that firstElem may well be uninitialized. if firstElem + * is initially K, &(firstElem->next) will be K+ the offset to + * next. + */ +void +SHMQueueFirst(SHM_QUEUE *queue, Pointer *nextPtrPtr, SHM_QUEUE *nextQueue) +{ + SHM_QUEUE *elemPtr = (SHM_QUEUE *) MAKE_PTR((queue)->next); + + Assert(SHM_PTR_VALID(queue)); + *nextPtrPtr = (Pointer) (((unsigned long) *nextPtrPtr) + + ((unsigned long) elemPtr) - ((unsigned long) nextQueue)); + + /* + nextPtrPtr a ptr to a structure linked in the queue + nextQueue is the SHMQueue field of the structure + *nextPtrPtr - nextQueue is 0 minus the offset of the queue + field n the record + elemPtr + (*nextPtrPtr - nexQueue) is the start of the + structure containing elemPtr. + */ +} + +/* + * SHMQueueEmpty -- TRUE if queue head is only element, FALSE otherwise + */ +bool +SHMQueueEmpty(SHM_QUEUE *queue) +{ + Assert(SHM_PTR_VALID(queue)); + + if (queue->prev == MAKE_OFFSET(queue)) + { + Assert(queue->next = MAKE_OFFSET(queue)); + return(TRUE); + } + return(FALSE); +} diff --git a/src/backend/storage/ipc/sinval.c b/src/backend/storage/ipc/sinval.c new file mode 100644 index 00000000000..9151ee77686 --- /dev/null +++ b/src/backend/storage/ipc/sinval.c @@ -0,0 +1,169 @@ +/*------------------------------------------------------------------------- + * + * sinval.c-- + * POSTGRES shared cache invalidation communication code. 
+ * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/ipc/sinval.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +/* #define INVALIDDEBUG 1 */ + +#include "postgres.h" + +#include "storage/sinval.h" +#include "storage/sinvaladt.h" +#include "storage/spin.h" +#include "utils/elog.h" + +extern SISeg *shmInvalBuffer;/* the shared buffer segment, set by*/ + /* SISegmentAttach() */ +extern BackendId MyBackendId; +extern BackendTag MyBackendTag; + +SPINLOCK SInvalLock = (SPINLOCK) NULL; + +/****************************************************************************/ +/* CreateSharedInvalidationState(key) Create a buffer segment */ +/* */ +/* should be called only by the POSTMASTER */ +/****************************************************************************/ +void +CreateSharedInvalidationState(IPCKey key) +{ + int status; + + /* REMOVED + SISyncKill(IPCKeyGetSIBufferMemorySemaphoreKey(key)); + SISyncInit(IPCKeyGetSIBufferMemorySemaphoreKey(key)); + */ + + /* SInvalLock gets set in spin.c, during spinlock init */ + status = SISegmentInit(true, IPCKeyGetSIBufferMemoryBlock(key)); + + if (status == -1) { + elog(FATAL, "CreateSharedInvalidationState: failed segment init"); + } +} +/****************************************************************************/ +/* AttachSharedInvalidationState(key) Attach a buffer segment */ +/* */ +/* should be called only by the POSTMASTER */ +/****************************************************************************/ +void +AttachSharedInvalidationState(IPCKey key) +{ + int status; + + if (key == PrivateIPCKey) { + CreateSharedInvalidationState(key); + return; + } + /* SInvalLock gets set in spin.c, during spinlock init */ + status = SISegmentInit(false, IPCKeyGetSIBufferMemoryBlock(key)); + + if (status == -1) { + elog(FATAL, "AttachSharedInvalidationState: failed segment init"); + } +} + +void +InitSharedInvalidationState() +{ + SpinAcquire(SInvalLock); + if (!SIBackendInit(shmInvalBuffer)) + { + SpinRelease(SInvalLock); + elog(FATAL, "Backend cache invalidation initialization failed"); + } + SpinRelease(SInvalLock); +} + +/* + * RegisterSharedInvalid -- + * Returns a new local cache invalidation state containing a new entry. + * + * Note: + * Assumes hash index is valid. + * Assumes item pointer is valid. + */ +/****************************************************************************/ +/* RegisterSharedInvalid(cacheId, hashIndex, pointer) */ +/* */ +/* register a message in the buffer */ +/* should be called by a backend */ +/****************************************************************************/ +void +RegisterSharedInvalid(int cacheId, /* XXX */ + Index hashIndex, + ItemPointer pointer) +{ + SharedInvalidData newInvalid; + + /* + * This code has been hacked to accept two types of messages. This might + * be treated more generally in the future. 
+ * + * (1) + * cacheId= system cache id + * hashIndex= system cache hash index for a (possibly) cached tuple + * pointer= pointer of (possibly) cached tuple + * + * (2) + * cacheId= special non-syscache id + * hashIndex= object id contained in (possibly) cached relation descriptor + * pointer= null + */ + + newInvalid.cacheId = cacheId; + newInvalid.hashIndex = hashIndex; + + if (ItemPointerIsValid(pointer)) { + ItemPointerCopy(pointer, &newInvalid.pointerData); + } else { + ItemPointerSetInvalid(&newInvalid.pointerData); + } + + SpinAcquire(SInvalLock); + if (!SISetDataEntry(shmInvalBuffer, &newInvalid)) { + /* buffer full */ + /* release a message, mark process cache states to be invalid */ + SISetProcStateInvalid(shmInvalBuffer); + + if (!SIDelDataEntry(shmInvalBuffer)) { + /* inconsistent buffer state -- shd never happen */ + SpinRelease(SInvalLock); + elog(FATAL, "RegisterSharedInvalid: inconsistent buffer state"); + } + + /* write again */ + (void) SISetDataEntry(shmInvalBuffer, &newInvalid); + } + SpinRelease(SInvalLock); +} + +/* + * InvalidateSharedInvalid -- + * Processes all entries in a shared cache invalidation state. + */ +/****************************************************************************/ +/* InvalidateSharedInvalid(invalFunction, resetFunction) */ +/* */ +/* invalidate a message in the buffer (read and clean up) */ +/* should be called by a backend */ +/****************************************************************************/ +void +InvalidateSharedInvalid(void (*invalFunction)(), + void (*resetFunction)()) +{ + SpinAcquire(SInvalLock); + SIReadEntryData(shmInvalBuffer, MyBackendId, + invalFunction, resetFunction); + + SIDelExpiredDataEntries(shmInvalBuffer); + SpinRelease(SInvalLock); +} diff --git a/src/backend/storage/ipc/sinvaladt.c b/src/backend/storage/ipc/sinvaladt.c new file mode 100644 index 00000000000..a30afdb6fed --- /dev/null +++ b/src/backend/storage/ipc/sinvaladt.c @@ -0,0 +1,797 @@ +/*------------------------------------------------------------------------- + * + * sinvaladt.c-- + * POSTGRES shared cache invalidation segment definitions. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/ipc/sinvaladt.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include "storage/ipc.h" +#include "storage/sinvaladt.h" +#include "storage/lmgr.h" +#include "utils/elog.h" +#include "utils/palloc.h" + +/* ---------------- + * global variable notes + * + * SharedInvalidationSemaphore + * + * shmInvalBuffer + * the shared buffer segment, set by SISegmentAttach() + * + * MyBackendId + * might be removed later, used only for + * debugging in debug routines (end of file) + * + * SIDbId + * identification of buffer (disappears) + * + * SIRelId \ + * SIDummyOid \ identification of buffer + * SIXidData / + * SIXid / + * + * XXX This file really needs to be cleaned up. We switched to using + * spinlocks to protect critical sections (as opposed to using fake + * relations and going through the lock manager) and some of the old + * cruft was 'ifdef'ed out, while other parts (now unused) are still + * compiled into the system. 
-mer 5/24/92 + * ---------------- + */ +#ifdef HAS_TEST_AND_SET +int SharedInvalidationLockId; +#else +IpcSemaphoreId SharedInvalidationSemaphore; +#endif + +SISeg *shmInvalBuffer; +extern BackendId MyBackendId; + +static void CleanupInvalidationState(int status, SISeg *segInOutP); +static BackendId SIAssignBackendId(SISeg *segInOutP, BackendTag backendTag); +static int SIGetNumEntries(SISeg *segP); + +/************************************************************************/ +/* SISetActiveProcess(segP, backendId) set the backend status active */ +/* should be called only by the postmaster when creating a backend */ +/************************************************************************/ +/* XXX I suspect that the segP parameter is extraneous. -hirohama */ +static void +SISetActiveProcess(SISeg *segInOutP, BackendId backendId) +{ + /* mark all messages as read */ + + /* Assert(segP->procState[backendId - 1].tag == MyBackendTag); */ + + segInOutP->procState[backendId - 1].resetState = false; + segInOutP->procState[backendId - 1].limit = SIGetNumEntries(segInOutP); +} + +/****************************************************************************/ +/* SIBackendInit() initializes a backend to operate on the buffer */ +/****************************************************************************/ +int +SIBackendInit(SISeg *segInOutP) +{ + LRelId LtCreateRelId(); + TransactionId LMITransactionIdCopy(); + + Assert(MyBackendTag > 0); + + MyBackendId = SIAssignBackendId(segInOutP, MyBackendTag); + if (MyBackendId == InvalidBackendTag) + return 0; + +#ifdef INVALIDDEBUG + elog(DEBUG, "SIBackendInit: backend tag %d; backend id %d.", + MyBackendTag, MyBackendId); +#endif /* INVALIDDEBUG */ + + SISetActiveProcess(segInOutP, MyBackendId); + on_exitpg(CleanupInvalidationState, (caddr_t)segInOutP); + return 1; +} + +/* ---------------- + * SIAssignBackendId + * ---------------- + */ +static BackendId +SIAssignBackendId(SISeg *segInOutP, BackendTag backendTag) +{ + Index index; + ProcState *stateP; + + stateP = NULL; + + for (index = 0; index < MaxBackendId; index += 1) { + if (segInOutP->procState[index].tag == InvalidBackendTag || + segInOutP->procState[index].tag == backendTag) + { + stateP = &segInOutP->procState[index]; + break; + } + + if (!PointerIsValid(stateP) || + (segInOutP->procState[index].resetState && + (!stateP->resetState || + stateP->tag < backendTag)) || + (!stateP->resetState && + (segInOutP->procState[index].limit < + stateP->limit || + stateP->tag < backendTag))) + { + stateP = &segInOutP->procState[index]; + } + } + + /* verify that all "procState" entries checked for matching tags */ + + for (index += 1; index < MaxBackendId; index += 1) { + if (segInOutP->procState[index].tag == backendTag) { + elog (FATAL, "SIAssignBackendId: tag %d found twice", + backendTag); + } + } + + if (stateP->tag != InvalidBackendTag) { + if (stateP->tag == backendTag) { + elog(NOTICE, "SIAssignBackendId: reusing tag %d", + backendTag); + } else { + elog(NOTICE, + "SIAssignBackendId: discarding tag %d", + stateP->tag); + return InvalidBackendTag; + } + } + + stateP->tag = backendTag; + + return (1 + stateP - &segInOutP->procState[0]); +} + + +/************************************************************************/ +/* The following function should be called only by the postmaster !! 
*/ +/************************************************************************/ + +/************************************************************************/ +/* SISetDeadProcess(segP, backendId) set the backend status DEAD */ +/* should be called only by the postmaster when a backend died */ +/************************************************************************/ +static void +SISetDeadProcess(SISeg *segP, int backendId) +{ + /* XXX call me.... */ + + segP->procState[backendId - 1].resetState = false; + segP->procState[backendId - 1].limit = -1; + segP->procState[backendId - 1].tag = InvalidBackendTag; +} + +/* + * CleanupInvalidationState -- + * Note: + * This is a temporary hack. ExitBackend should call this instead + * of exit (via on_exitpg). + */ +static void +CleanupInvalidationState(int status, /* XXX */ + SISeg *segInOutP) /* XXX style */ +{ + Assert(PointerIsValid(segInOutP)); + + SISetDeadProcess(segInOutP, MyBackendId); +} + + +/************************************************************************/ +/* SIComputeSize() - retuns the size of a buffer segment */ +/************************************************************************/ +static SISegOffsets * +SIComputeSize(int *segSize) +{ + int A, B, a, b, totalSize; + SISegOffsets *oP; + + A = 0; + a = SizeSISeg; /* offset to first data entry */ + b = SizeOfOneSISegEntry * MAXNUMMESSAGES; + B = A + a + b; + totalSize = B - A; + *segSize = totalSize; + + oP = (SISegOffsets *) palloc(sizeof(SISegOffsets)); + oP->startSegment = A; + oP->offsetToFirstEntry = a; /* relatiove to A */ + oP->offsetToEndOfSegemnt = totalSize; /* relative to A */ + return(oP); +} + + +/************************************************************************/ +/* SISetStartEntrySection(segP, offset) - sets the offset */ +/************************************************************************/ +static void +SISetStartEntrySection(SISeg *segP, Offset offset) +{ + segP->startEntrySection = offset; +} + +/************************************************************************/ +/* SIGetStartEntrySection(segP) - returnss the offset */ +/************************************************************************/ +static Offset +SIGetStartEntrySection(SISeg *segP) +{ + return(segP->startEntrySection); +} + + +/************************************************************************/ +/* SISetEndEntrySection(segP, offset) - sets the offset */ +/************************************************************************/ +static void +SISetEndEntrySection(SISeg *segP, Offset offset) +{ + segP->endEntrySection = offset; +} + +/************************************************************************/ +/* SISetEndEntryChain(segP, offset) - sets the offset */ +/************************************************************************/ +static void +SISetEndEntryChain(SISeg *segP, Offset offset) +{ + segP->endEntryChain = offset; +} + +/************************************************************************/ +/* SIGetEndEntryChain(segP) - returnss the offset */ +/************************************************************************/ +static Offset +SIGetEndEntryChain(SISeg *segP) +{ + return(segP->endEntryChain); +} + +/************************************************************************/ +/* SISetStartEntryChain(segP, offset) - sets the offset */ +/************************************************************************/ +static void +SISetStartEntryChain(SISeg *segP, Offset offset) +{ + segP->startEntryChain = offset; +} + 
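[Editor's sketch, not part of the original commit.] The sinvaladt.c routines around this point never store raw pointers in the shared segment: every link is an Offset relative to the start of the entry section, so a chain written by one backend remains valid in another backend that has mapped the segment at a different address. A minimal sketch of that idiom, using hypothetical stand-in types (toy_seg, toy_entry) rather than the real SISeg/SISegEntry:

#include <stddef.h>

typedef struct toy_entry {
	int	payload;
	int	next;			/* offset of the next entry, or -1 for end of chain */
} toy_entry;

typedef struct toy_seg {
	int	startEntrySection;	/* byte offset of the first entry from the segment base */
	/* entry section follows */
} toy_seg;

/* offset -> pointer, the same idiom as SIGetNextDataEntry() */
static toy_entry *
toy_entry_at(toy_seg *segP, int offset)
{
	if (offset < 0)
		return NULL;
	return (toy_entry *) ((char *) segP + segP->startEntrySection + offset);
}

/* pointer -> offset, the inverse mapping used by SIEntryOffset() */
static int
toy_entry_offset(toy_seg *segP, toy_entry *entryP)
{
	return (int) ((char *) entryP - (char *) segP - segP->startEntrySection);
}

Walking a chain is then just repeated toy_entry_at(segP, eP->next) until the offset runs out, which is what SIGetNthDataEntry() does with InvalidOffset as the terminator.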
+/************************************************************************/ +/* SIGetStartEntryChain(segP) - returns the offset */ +/************************************************************************/ +static Offset +SIGetStartEntryChain(SISeg *segP) +{ + return(segP->startEntryChain); +} + +/************************************************************************/ +/* SISetNumEntries(segP, num) sets the current nuber of entries */ +/************************************************************************/ +static bool +SISetNumEntries(SISeg *segP, int num) +{ + if ( num <= MAXNUMMESSAGES) { + segP->numEntries = num; + return(true); + } else { + return(false); /* table full */ + } +} + +/************************************************************************/ +/* SIGetNumEntries(segP) - returns the current nuber of entries */ +/************************************************************************/ +static int +SIGetNumEntries(SISeg *segP) +{ + return(segP->numEntries); +} + + +/************************************************************************/ +/* SISetMaxNumEntries(segP, num) sets the maximal number of entries */ +/************************************************************************/ +static bool +SISetMaxNumEntries(SISeg *segP, int num) +{ + if ( num <= MAXNUMMESSAGES) { + segP->maxNumEntries = num; + return(true); + } else { + return(false); /* wrong number */ + } +} + + +/************************************************************************/ +/* SIGetProcStateLimit(segP, i) returns the limit of read messages */ +/************************************************************************/ +static int +SIGetProcStateLimit(SISeg *segP, int i) +{ + return(segP->procState[i].limit); +} + +/************************************************************************/ +/* SIIncNumEntries(segP, num) increments the current nuber of entries */ +/************************************************************************/ +static bool +SIIncNumEntries(SISeg *segP, int num) +{ + if ((segP->numEntries + num) <= MAXNUMMESSAGES) { + segP->numEntries = segP->numEntries + num; + return(true); + } else { + return(false); /* table full */ + } +} + +/************************************************************************/ +/* SIDecNumEntries(segP, num) decrements the current nuber of entries */ +/************************************************************************/ +static bool +SIDecNumEntries(SISeg *segP, int num) +{ + if ((segP->numEntries - num) >= 0) { + segP->numEntries = segP->numEntries - num; + return(true); + } else { + return(false); /* not enough entries in table */ + } +} + +/************************************************************************/ +/* SISetStartFreeSpace(segP, offset) - sets the offset */ +/************************************************************************/ +static void +SISetStartFreeSpace(SISeg *segP, Offset offset) +{ + segP->startFreeSpace = offset; +} + +/************************************************************************/ +/* SIGetStartFreeSpace(segP) - returns the offset */ +/************************************************************************/ +static Offset +SIGetStartFreeSpace(SISeg *segP) +{ + return(segP->startFreeSpace); +} + + + +/************************************************************************/ +/* SIGetFirstDataEntry(segP) returns first data entry */ +/************************************************************************/ +static SISegEntry * +SIGetFirstDataEntry(SISeg *segP) +{ + SISegEntry *eP; + Offset 
startChain; + + startChain = SIGetStartEntryChain(segP); + + if (startChain == InvalidOffset) + return(NULL); + + eP = (SISegEntry *) ((Pointer) segP + + SIGetStartEntrySection(segP) + + startChain ); + return(eP); +} + + +/************************************************************************/ +/* SIGetLastDataEntry(segP) returns last data entry in the chain */ +/************************************************************************/ +static SISegEntry * +SIGetLastDataEntry(SISeg *segP) +{ + SISegEntry *eP; + Offset endChain; + + endChain = SIGetEndEntryChain(segP); + + if (endChain == InvalidOffset) + return(NULL); + + eP = (SISegEntry *) ((Pointer) segP + + SIGetStartEntrySection(segP) + + endChain ); + return(eP); +} + +/************************************************************************/ +/* SIGetNextDataEntry(segP, offset) returns next data entry */ +/************************************************************************/ +static SISegEntry * +SIGetNextDataEntry(SISeg *segP, Offset offset) +{ + SISegEntry *eP; + + if (offset == InvalidOffset) + return(NULL); + + eP = (SISegEntry *) ((Pointer) segP + + SIGetStartEntrySection(segP) + + offset); + return(eP); +} + + +/************************************************************************/ +/* SIGetNthDataEntry(segP, n) returns the n-th data entry in chain */ +/************************************************************************/ +static SISegEntry * +SIGetNthDataEntry(SISeg *segP, + int n) /* must range from 1 to MaxMessages */ +{ + SISegEntry *eP; + int i; + + if (n <= 0) return(NULL); + + eP = SIGetFirstDataEntry(segP); + for (i = 1; i < n; i++) { + /* skip one and get the next */ + eP = SIGetNextDataEntry(segP, eP->next); + } + + return(eP); +} + +/************************************************************************/ +/* SIEntryOffset(segP, entryP) returns the offset for an pointer */ +/************************************************************************/ +static Offset +SIEntryOffset(SISeg *segP, SISegEntry *entryP) +{ + /* relative to B !! 
*/ + return ((Offset) ((Pointer) entryP - + (Pointer) segP - + SIGetStartEntrySection(segP) )); +} + + +/************************************************************************/ +/* SISetDataEntry(segP, data) - sets a message in the segemnt */ +/************************************************************************/ +bool +SISetDataEntry(SISeg *segP, SharedInvalidData *data) +{ + Offset offsetToNewData; + SISegEntry *eP, *lastP; + bool SISegFull(); + Offset SIEntryOffset(); + Offset SIGetStartFreeSpace(); + SISegEntry *SIGetFirstDataEntry(); + SISegEntry *SIGetNextDataEntry(); + SISegEntry *SIGetLastDataEntry(); + + if (!SIIncNumEntries(segP, 1)) + return(false); /* no space */ + + /* get a free entry */ + offsetToNewData = SIGetStartFreeSpace(segP); + eP = SIGetNextDataEntry(segP, offsetToNewData); /* it's a free one */ + SISetStartFreeSpace(segP, eP->next); + /* fill it up */ + eP->entryData = *data; + eP->isfree = false; + eP->next = InvalidOffset; + + /* handle insertion point at the end of the chain !!*/ + lastP = SIGetLastDataEntry(segP); + if (lastP == NULL) { + /* there is no chain, insert the first entry */ + SISetStartEntryChain(segP, SIEntryOffset(segP, eP)); + } else { + /* there is a last entry in the chain */ + lastP->next = SIEntryOffset(segP, eP); + } + SISetEndEntryChain(segP, SIEntryOffset(segP, eP)); + return(true); +} + + +/************************************************************************/ +/* SIDecProcLimit(segP, num) decrements all process limits */ +/************************************************************************/ +static void +SIDecProcLimit(SISeg *segP, int num) +{ + int i; + for (i=0; i < MaxBackendId; i++) { + /* decrement only, if there is a limit > 0 */ + if (segP->procState[i].limit > 0) { + segP->procState[i].limit = segP->procState[i].limit - num; + if (segP->procState[i].limit < 0) { + /* limit was not high enough, reset to zero */ + /* negative means it's a dead backend */ + segP->procState[i].limit = 0; + } + } + } +} + + +/************************************************************************/ +/* SIDelDataEntry(segP) - free the FIRST entry */ +/************************************************************************/ +bool +SIDelDataEntry(SISeg *segP) +{ + SISegEntry *e1P; + SISegEntry *SIGetFirstDataEntry(); + + if (!SIDecNumEntries(segP, 1)) { + /* no entries in buffer */ + return(false); + } + + e1P = SIGetFirstDataEntry(segP); + SISetStartEntryChain(segP, e1P->next); + if (SIGetStartEntryChain(segP) == InvalidOffset) { + /* it was the last entry */ + SISetEndEntryChain(segP, InvalidOffset); + } + /* free the entry */ + e1P->isfree = true; + e1P->next = SIGetStartFreeSpace(segP); + SISetStartFreeSpace(segP, SIEntryOffset(segP, e1P)); + SIDecProcLimit(segP, 1); + return(true); +} + + + +/************************************************************************/ +/* SISetProcStateInvalid(segP) checks and marks a backends state as */ +/* invalid */ +/************************************************************************/ +void +SISetProcStateInvalid(SISeg *segP) +{ + int i; + + for (i=0; i < MaxBackendId; i++) { + if (segP->procState[i].limit == 0) { + /* backend i didn't read any message */ + segP->procState[i].resetState = true; + /*XXX signal backend that it has to reset its internal cache ? 
*/ + } + } +} + +/************************************************************************/ +/* SIReadEntryData(segP, backendId, function) */ +/* - marks messages to be read by id */ +/* and executes function */ +/************************************************************************/ +void +SIReadEntryData(SISeg *segP, + int backendId, + void (*invalFunction)(), + void (*resetFunction)()) +{ + int i = 0; + SISegEntry *data; + + Assert(segP->procState[backendId - 1].tag == MyBackendTag); + + if (!segP->procState[backendId - 1].resetState) { + /* invalidate data, but only those, you have not seen yet !!*/ + /* therefore skip read messages */ + data = SIGetNthDataEntry(segP, + SIGetProcStateLimit(segP, backendId - 1) + 1); + while (data != NULL) { + i++; + segP->procState[backendId - 1].limit++; /* one more message read */ + invalFunction(data->entryData.cacheId, + data->entryData.hashIndex, + &data->entryData.pointerData); + data = SIGetNextDataEntry(segP, data->next); + } + /* SIDelExpiredDataEntries(segP); */ + } else { + /*backend must not read messages, its own state has to be reset */ + elog(NOTICE, "SIMarkEntryData: cache state reset"); + resetFunction(); /* XXXX call it here, parameters? */ + + /* new valid state--mark all messages "read" */ + segP->procState[backendId - 1].resetState = false; + segP->procState[backendId - 1].limit = SIGetNumEntries(segP); + } + /* check whether we can remove dead messages */ + if (i > MAXNUMMESSAGES) { + elog(FATAL, "SIReadEntryData: Invalid segment state"); + } +} + +/************************************************************************/ +/* SIDelExpiredDataEntries (segP) - removes irrelevant messages */ +/************************************************************************/ +void +SIDelExpiredDataEntries(SISeg *segP) +{ + int min, i, h; + + min = 9999999; + for (i = 0; i < MaxBackendId; i++) { + h = SIGetProcStateLimit(segP, i); + if (h >= 0) { /* backend active */ + if (h < min ) min = h; + } + } + if (min != 9999999) { + /* we can remove min messages */ + for (i = 1; i <= min; i++) { + /* this adjusts also the state limits!*/ + if (!SIDelDataEntry(segP)) { + elog(FATAL, "SIDelExpiredDataEntries: Invalid segment state"); + } + } + } +} + + + +/************************************************************************/ +/* SISegInit(segP) - initializes the segment */ +/************************************************************************/ +static void +SISegInit(SISeg *segP) +{ + SISegOffsets *oP; + int segSize, i; + SISegEntry *eP; + + oP = SIComputeSize(&segSize); + /* set sempahore ids in the segment */ + /* XXX */ + SISetStartEntrySection(segP, oP->offsetToFirstEntry); + SISetEndEntrySection(segP, oP->offsetToEndOfSegemnt); + SISetStartFreeSpace(segP, 0); + SISetStartEntryChain(segP, InvalidOffset); + SISetEndEntryChain(segP, InvalidOffset); + (void) SISetNumEntries(segP, 0); + (void) SISetMaxNumEntries(segP, MAXNUMMESSAGES); + for (i = 0; i < MaxBackendId; i++) { + segP->procState[i].limit = -1; /* no backend active !!*/ + segP->procState[i].resetState = false; + segP->procState[i].tag = InvalidBackendTag; + } + /* construct a chain of free entries */ + for (i = 1; i < MAXNUMMESSAGES; i++) { + eP = (SISegEntry *) ((Pointer) segP + + SIGetStartEntrySection(segP) + + (i - 1) * sizeof(SISegEntry)); + eP->isfree = true; + eP->next = i * sizeof(SISegEntry); /* relative to B */ + } + /* handle the last free entry separate */ + eP = (SISegEntry *) ((Pointer) segP + + SIGetStartEntrySection(segP) + + (MAXNUMMESSAGES - 1) * 
sizeof(SISegEntry)); + eP->isfree = true; + eP->next = InvalidOffset; /* it's the end of the chain !! */ + /* + * Be tidy + */ + pfree(oP); + +} + + + +/************************************************************************/ +/* SISegmentKill(key) - kill any segment */ +/************************************************************************/ +static void +SISegmentKill(int key) /* the corresponding key for the segment */ +{ + IpcMemoryKill(key); +} + + +/************************************************************************/ +/* SISegmentGet(key, size) - get a shared segment of size <size> */ +/* returns a segment id */ +/************************************************************************/ +static IpcMemoryId +SISegmentGet(int key, /* the corresponding key for the segment */ + int size, /* size of segment in bytes */ + bool create) +{ + IpcMemoryId shmid; + + if (create) { + shmid = IpcMemoryCreate(key, size, IPCProtection); + } else { + shmid = IpcMemoryIdGet(key, size); + } + return(shmid); +} + +/************************************************************************/ +/* SISegmentAttach(shmid) - attach a shared segment with id shmid */ +/************************************************************************/ +static void +SISegmentAttach(IpcMemoryId shmid) +{ + shmInvalBuffer = (struct SISeg *) IpcMemoryAttach(shmid); + if (shmInvalBuffer == IpcMemAttachFailed) { + /* XXX use validity function */ + elog(NOTICE, "SISegmentAttach: Could not attach segment"); + elog(FATAL, "SISegmentAttach: %m"); + } +} + + +/************************************************************************/ +/* SISegmentInit(killExistingSegment, key) initialize segment */ +/************************************************************************/ +int +SISegmentInit(bool killExistingSegment, IPCKey key) +{ + SISegOffsets *oP; + int segSize; + IpcMemoryId shmId; + bool create; + + if (killExistingSegment) { + /* Kill existing segment */ + /* set semaphore */ + SISegmentKill(key); + + /* Get a shared segment */ + + oP = SIComputeSize(&segSize); + /* + * Be tidy + */ + pfree(oP); + + create = true; + shmId = SISegmentGet(key,segSize, create); + if (shmId < 0) { + perror("SISegmentGet: failed"); + return(-1); /* an error */ + } + + /* Attach the shared cache invalidation segment */ + /* sets the global variable shmInvalBuffer */ + SISegmentAttach(shmId); + + /* Init shared memory table */ + SISegInit(shmInvalBuffer); + } else { + /* use an existing segment */ + create = false; + shmId = SISegmentGet(key, 0, create); + if (shmId < 0) { + perror("SISegmentGet: getting an existent segment failed"); + return(-1); /* an error */ + } + /* Attach the shared cache invalidation segment */ + SISegmentAttach(shmId); + } + return(1); +} + diff --git a/src/backend/storage/ipc/spin.c b/src/backend/storage/ipc/spin.c new file mode 100644 index 00000000000..7ff2561f237 --- /dev/null +++ b/src/backend/storage/ipc/spin.c @@ -0,0 +1,247 @@ +/*------------------------------------------------------------------------- + * + * spin.c-- + * routines for managing spin locks + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/ipc/Attic/spin.c,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +/* + * POSTGRES has two kinds of locks: semaphores (which put the + * process to sleep) and spinlocks (which are supposed to be + * short term locks). 
Currently both are implemented as SysV + * semaphores, but presumably this can change if we move to + * a machine with a test-and-set (TAS) instruction. Its probably + * a good idea to think about (and allocate) short term and long + * term semaphores separately anyway. + * + * NOTE: These routines are not supposed to be widely used in Postgres. + * They are preserved solely for the purpose of porting Mark Sullivan's + * buffer manager to Postgres. + */ +#include <errno.h> +#include "postgres.h" +#include "storage/ipc.h" +#include "storage/shmem.h" +#include "storage/spin.h" +#include "storage/proc.h" +#include "utils/elog.h" + +/* globals used in this file */ +IpcSemaphoreId SpinLockId; + +#ifdef HAS_TEST_AND_SET +/* real spin lock implementations */ + +bool +CreateSpinlocks(IPCKey key) +{ + /* the spin lock shared memory must have been created by now */ + return(TRUE); +} + +bool +AttachSpinLocks(IPCKey key) +{ + /* the spin lock shared memory must have been attached by now */ + return(TRUE); +} + +bool +InitSpinLocks(int init, IPCKey key) +{ + extern SPINLOCK ShmemLock; + extern SPINLOCK BindingLock; + extern SPINLOCK BufMgrLock; + extern SPINLOCK LockMgrLock; + extern SPINLOCK ProcStructLock; + extern SPINLOCK SInvalLock; + extern SPINLOCK OidGenLockId; + +#ifdef MAIN_MEMORY + extern SPINLOCK MMCacheLock; +#endif /* SONY_JUKEBOX */ + + /* These six spinlocks have fixed location is shmem */ + ShmemLock = (SPINLOCK) SHMEMLOCKID; + BindingLock = (SPINLOCK) BINDINGLOCKID; + BufMgrLock = (SPINLOCK) BUFMGRLOCKID; + LockMgrLock = (SPINLOCK) LOCKMGRLOCKID; + ProcStructLock = (SPINLOCK) PROCSTRUCTLOCKID; + SInvalLock = (SPINLOCK) SINVALLOCKID; + OidGenLockId = (SPINLOCK) OIDGENLOCKID; + +#ifdef MAIN_MEMORY + MMCacheLock = (SPINLOCK) MMCACHELOCKID; +#endif /* MAIN_MEMORY */ + + return(TRUE); +} + +void +SpinAcquire(SPINLOCK lock) +{ + ExclusiveLock(lock); + PROC_INCR_SLOCK(lock); +} + +void +SpinRelease(SPINLOCK lock) +{ + PROC_DECR_SLOCK(lock); + ExclusiveUnlock(lock); +} + +bool +SpinIsLocked(SPINLOCK lock) +{ + return(!LockIsFree(lock)); +} + +#else /* HAS_TEST_AND_SET */ +/* Spinlocks are implemented using SysV semaphores */ + + +/* + * SpinAcquire -- try to grab a spinlock + * + * FAILS if the semaphore is corrupted. 
+ */ +void +SpinAcquire(SPINLOCK lock) +{ + IpcSemaphoreLock(SpinLockId, lock, IpcExclusiveLock); + PROC_INCR_SLOCK(lock); +} + +/* + * SpinRelease -- release a spin lock + * + * FAILS if the semaphore is corrupted + */ +void +SpinRelease(SPINLOCK lock) +{ + Assert(SpinIsLocked(lock)) + PROC_DECR_SLOCK(lock); + IpcSemaphoreUnlock(SpinLockId, lock, IpcExclusiveLock); +} + +bool +SpinIsLocked(SPINLOCK lock) +{ + int semval; + + semval = IpcSemaphoreGetValue(SpinLockId, lock); + return(semval < IpcSemaphoreDefaultStartValue); +} + +/* + * CreateSpinlocks -- Create a sysV semaphore array for + * the spinlocks + * + */ +bool +CreateSpinlocks(IPCKey key) +{ + + int status; + IpcSemaphoreId semid; + semid = IpcSemaphoreCreate(key, MAX_SPINS, IPCProtection, + IpcSemaphoreDefaultStartValue, 1, &status); + if (status == IpcSemIdExist) { + IpcSemaphoreKill(key); + elog(NOTICE,"Destroying old spinlock semaphore"); + semid = IpcSemaphoreCreate(key, MAX_SPINS, IPCProtection, + IpcSemaphoreDefaultStartValue, 1, &status); + } + + if (semid >= 0) { + SpinLockId = semid; + return(TRUE); + } + /* cannot create spinlocks */ + elog(FATAL,"CreateSpinlocks: cannot create spin locks"); + return(FALSE); +} + +/* + * Attach to existing spinlock set + */ +bool +AttachSpinLocks(IPCKey key) +{ + IpcSemaphoreId id; + + id = semget (key, MAX_SPINS, 0); + if (id < 0) { + if (errno == EEXIST) { + /* key is the name of someone else's semaphore */ + elog (FATAL,"AttachSpinlocks: SPIN_KEY belongs to someone else"); + } + /* cannot create spinlocks */ + elog(FATAL,"AttachSpinlocks: cannot create spin locks"); + return(FALSE); + } + SpinLockId = id; + return(TRUE); +} + +/* + * InitSpinLocks -- Spinlock bootstrapping + * + * We need several spinlocks for bootstrapping: + * BindingLock (for the shmem binding table) and + * ShmemLock (for the shmem allocator), BufMgrLock (for buffer + * pool exclusive access), LockMgrLock (for the lock table), and + * ProcStructLock (a spin lock for the shared process structure). + * If there's a Sony WORM drive attached, we also have a spinlock + * (SJCacheLock) for it. Same story for the main memory storage mgr. + * + */ +bool +InitSpinLocks(int init, IPCKey key) +{ + extern SPINLOCK ShmemLock; + extern SPINLOCK BindingLock; + extern SPINLOCK BufMgrLock; + extern SPINLOCK LockMgrLock; + extern SPINLOCK ProcStructLock; + extern SPINLOCK SInvalLock; + extern SPINLOCK OidGenLockId; + +#ifdef MAIN_MEMORY + extern SPINLOCK MMCacheLock; +#endif /* MAIN_MEMORY */ + + if (!init || key != IPC_PRIVATE) { + /* if bootstrap and key is IPC_PRIVATE, it means that we are running + * backend by itself. no need to attach spinlocks + */ + if (! 
AttachSpinLocks(key)) { + elog(FATAL,"InitSpinLocks: couldnt attach spin locks"); + return(FALSE); + } + } + + /* These five (or six) spinlocks have fixed location is shmem */ + ShmemLock = (SPINLOCK) SHMEMLOCKID; + BindingLock = (SPINLOCK) BINDINGLOCKID; + BufMgrLock = (SPINLOCK) BUFMGRLOCKID; + LockMgrLock = (SPINLOCK) LOCKMGRLOCKID; + ProcStructLock = (SPINLOCK) PROCSTRUCTLOCKID; + SInvalLock = (SPINLOCK) SINVALLOCKID; + OidGenLockId = (SPINLOCK) OIDGENLOCKID; + +#ifdef MAIN_MEMORY + MMCacheLock = (SPINLOCK) MMCACHELOCKID; +#endif /* MAIN_MEMORY */ + + return(TRUE); +} +#endif /* HAS_TEST_AND_SET */ diff --git a/src/backend/storage/item.h b/src/backend/storage/item.h new file mode 100644 index 00000000000..ca989fec654 --- /dev/null +++ b/src/backend/storage/item.h @@ -0,0 +1,20 @@ +/*------------------------------------------------------------------------- + * + * item.h-- + * POSTGRES disk item definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: item.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef ITEM_H +#define ITEM_H + +#include "c.h" + +typedef Pointer Item; + +#endif /* ITEM_H */ diff --git a/src/backend/storage/itemid.h b/src/backend/storage/itemid.h new file mode 100644 index 00000000000..f5cd0c62cc0 --- /dev/null +++ b/src/backend/storage/itemid.h @@ -0,0 +1,75 @@ +/*------------------------------------------------------------------------- + * + * itemid.h-- + * Standard POSTGRES buffer page item identifier definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: itemid.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef ITEMID_H +#define ITEMID_H + +typedef uint16 ItemOffset; +typedef uint16 ItemLength; + +typedef bits16 ItemIdFlags; + + + +typedef struct ItemIdData { /* line pointers */ + unsigned lp_off:13, /* offset to find tup */ + /* can be reduced by 2 if necc. */ + lp_flags:6, /* flags on tuple */ + lp_len:13; /* length of tuple */ +} ItemIdData; + +typedef struct ItemIdData *ItemId; + +#ifndef LP_USED +#define LP_USED 0x01 /* this line pointer is being used */ +#endif + +/* ---------------- + * support macros + * ---------------- + */ +/* + * ItemIdGetLength + */ +#define ItemIdGetLength(itemId) \ + ((itemId)->lp_len) + +/* + * ItemIdGetOffset + */ +#define ItemIdGetOffset(itemId) \ + ((itemId)->lp_off) + +/* + * ItemIdGetFlags + */ +#define ItemIdGetFlags(itemId) \ + ((itemId)->lp_flags) + +/* + * ItemIdIsValid -- + * True iff disk item identifier is valid. + */ +#define ItemIdIsValid(itemId) PointerIsValid(itemId) + +/* + * ItemIdIsUsed -- + * True iff disk item identifier is in use. + * + * Note: + * Assumes disk item identifier is valid. + */ +#define ItemIdIsUsed(itemId) \ + (AssertMacro(ItemIdIsValid(itemId)) ? \ + (bool) (((itemId)->lp_flags & LP_USED) != 0) : false) + +#endif /* ITEMID_H */ diff --git a/src/backend/storage/itempos.h b/src/backend/storage/itempos.h new file mode 100644 index 00000000000..c3b895ae075 --- /dev/null +++ b/src/backend/storage/itempos.h @@ -0,0 +1,44 @@ +/*------------------------------------------------------------------------- + * + * itempos.h-- + * Standard POSTGRES buffer page long item subposition definitions. 
+ * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: itempos.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef ITEMPOS_H +#define ITEMPOS_H + +#include "c.h" +#include "storage/buf.h" +#include "storage/itemid.h" + +typedef struct ItemSubpositionData { + Buffer op_db; + ItemId op_lpp; + char *op_cp; /* XXX */ + uint32 op_len; +} ItemSubpositionData; + +typedef ItemSubpositionData *ItemSubposition; + +/* + * PNOBREAK(OBJP, LEN) + * struct objpos *OBJP; + * unsigned LEN; + */ +#define PNOBREAK(OBJP, LEN) ((OBJP)->op_len >= LEN) + +/* + * PSKIP(OBJP, LEN) + * struct objpos *OBJP; + * unsigned LEN; + */ +#define PSKIP(OBJP, LEN)\ + { (OBJP)->op_cp += (LEN); (OBJP)->op_len -= (LEN); } + +#endif /* ITEMPOS_H */ diff --git a/src/backend/storage/itemptr.h b/src/backend/storage/itemptr.h new file mode 100644 index 00000000000..ba3c154ef14 --- /dev/null +++ b/src/backend/storage/itemptr.h @@ -0,0 +1,115 @@ +/*------------------------------------------------------------------------- + * + * itemptr.h-- + * POSTGRES disk item pointer definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: itemptr.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef ITEMPTR_H +#define ITEMPTR_H + +#include "c.h" +#include "storage/block.h" +#include "storage/off.h" +#include "storage/itemid.h" + +/* + * ItemPointer: + * + * this is a pointer to an item on another disk page in the same file. + * blkid tells us which block, posid tells us which entry in the linp + * (ItemIdData) array we want. + */ +typedef struct ItemPointerData { + BlockIdData ip_blkid; + OffsetNumber ip_posid; +} ItemPointerData; + +typedef ItemPointerData *ItemPointer; + +/* ---------------- + * support macros + * ---------------- + */ + +/* + * ItemPointerIsValid -- + * True iff the disk item pointer is not NULL. + */ +#define ItemPointerIsValid(pointer) \ + ((bool) (PointerIsValid(pointer) && ((pointer)->ip_posid != 0))) + +/* + * ItemPointerGetBlockNumber -- + * Returns the block number of a disk item pointer. + */ +#define ItemPointerGetBlockNumber(pointer) \ + (AssertMacro(ItemPointerIsValid(pointer)) ? \ + BlockIdGetBlockNumber(&(pointer)->ip_blkid) : (BlockNumber) 0) + +/* + * ItemPointerGetOffsetNumber -- + * Returns the offset number of a disk item pointer. + */ +#define ItemPointerGetOffsetNumber(pointer) \ + (AssertMacro(ItemPointerIsValid(pointer)) ? \ + (pointer)->ip_posid : \ + InvalidOffsetNumber) + +/* + * ItemPointerSet -- + * Sets a disk item pointer to the specified block and offset. + */ +#define ItemPointerSet(pointer, blockNumber, offNum) \ + Assert(PointerIsValid(pointer)); \ + BlockIdSet(&((pointer)->ip_blkid), blockNumber); \ + (pointer)->ip_posid = offNum + +/* + * ItemPointerSetBlockNumber -- + * Sets a disk item pointer to the specified block. + */ +#define ItemPointerSetBlockNumber(pointer, blockNumber) \ + Assert(PointerIsValid(pointer)); \ + BlockIdSet(&((pointer)->ip_blkid), blockNumber) + +/* + * ItemPointerSetOffsetNumber -- + * Sets a disk item pointer to the specified offset. + */ +#define ItemPointerSetOffsetNumber(pointer, offsetNumber) \ + AssertMacro(PointerIsValid(pointer)); \ + (pointer)->ip_posid = (offsetNumber) + +/* + * ItemPointerCopy -- + * Copies the contents of one disk item pointer to another. 
+ */ +#define ItemPointerCopy(fromPointer, toPointer) \ + Assert(PointerIsValid(toPointer)); \ + Assert(PointerIsValid(fromPointer)); \ + *(toPointer) = *(fromPointer) + +/* + * ItemPointerSetInvalid -- + * Sets a disk item pointer to be invalid. + */ +#define ItemPointerSetInvalid(pointer) \ + Assert(PointerIsValid(pointer)); \ + BlockIdSet(&((pointer)->ip_blkid), InvalidBlockNumber); \ + (pointer)->ip_posid = InvalidOffsetNumber + +/* ---------------- + * externs + * ---------------- + */ + +extern bool ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2); + +#endif /* ITEMPTR_H */ + diff --git a/src/backend/storage/large_object.h b/src/backend/storage/large_object.h new file mode 100644 index 00000000000..177d2c26e47 --- /dev/null +++ b/src/backend/storage/large_object.h @@ -0,0 +1,58 @@ +/*------------------------------------------------------------------------- + * + * large_object.h-- + * file of info for Postgres large objects. POSTGRES 4.2 supports + * zillions of large objects (internal, external, jaquith, inversion). + * Now we only support inversion. + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: large_object.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef LARGE_OBJECT_H +#define LARGE_OBJECT_H + +#include "c.h" +#include "utils/rel.h" +#include "access/relscan.h" + +/* + * This structure will eventually have lots more stuff associated with it. + */ +typedef struct LargeObjectDesc +{ + Relation heap_r; /* heap relation */ + Relation index_r; /* index relation on seqno attribute */ + IndexScanDesc iscan; /* index scan we're using */ + TupleDesc hdesc; /* heap relation tuple desc */ + TupleDesc idesc; /* index relation tuple desc */ + uint32 lowbyte; /* low byte on the current page */ + uint32 highbyte; /* high byte on the current page */ + uint32 offset; /* current seek pointer */ + ItemPointerData htid; /* tid of current heap tuple */ + +#define IFS_RDLOCK (1 << 0) +#define IFS_WRLOCK (1 << 1) +#define IFS_ATEOF (1 << 2) + + u_long flags; /* locking info, etc */ +} LargeObjectDesc; + +/* + * Function definitions... 
+ */ + +/* inversion stuff in inv_api.c */ +extern LargeObjectDesc *inv_create(int flags); +extern LargeObjectDesc *inv_open(Oid lobjId, int flags); +extern void inv_close(LargeObjectDesc *obj_desc); +extern int inv_destroy(Oid lobjId); +extern int inv_stat(LargeObjectDesc *obj_desc, struct pgstat *stbuf); +extern int inv_seek(LargeObjectDesc *obj_desc, int offset, int whence); +extern int inv_tell(LargeObjectDesc *obj_desc); +extern int inv_read(LargeObjectDesc *obj_desc, char *buf, int nbytes); +extern int inv_write(LargeObjectDesc *obj_desc, char *buf, int nbytes); + +#endif /* LARGE_OBJECT_H */ diff --git a/src/backend/storage/large_object/Makefile.inc b/src/backend/storage/large_object/Makefile.inc new file mode 100644 index 00000000000..fd27b46a49d --- /dev/null +++ b/src/backend/storage/large_object/Makefile.inc @@ -0,0 +1,14 @@ +#------------------------------------------------------------------------- +# +# Makefile.inc-- +# Makefile for storage/large_object +# +# Copyright (c) 1994, Regents of the University of California +# +# +# IDENTIFICATION +# $Header: /cvsroot/pgsql/src/backend/storage/large_object/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $ +# +#------------------------------------------------------------------------- + +SUBSRCS+= inv_api.c diff --git a/src/backend/storage/large_object/inv_api.c b/src/backend/storage/large_object/inv_api.c new file mode 100644 index 00000000000..ae57032f94a --- /dev/null +++ b/src/backend/storage/large_object/inv_api.c @@ -0,0 +1,1165 @@ +/*------------------------------------------------------------------------- + * + * inv_api.c-- + * routines for manipulating inversion fs large objects. This file + * contains the user-level large object application interface routines. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/large_object/inv_api.c,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include <stdio.h> /* for sprintf() */ +#include <sys/file.h> +#include "c.h" +#include "libpq/libpq-fs.h" +#include "access/genam.h" +#include "access/heapam.h" +#include "access/relscan.h" +#include "access/tupdesc.h" +#include "access/xact.h" +#include "access/nbtree.h" +#include "access/tupdesc.h" +#include "catalog/index.h" /* for index_create() */ +#include "catalog/catalog.h" /* for newoid() */ +#include "catalog/pg_am.h" /* for BTREE_AM_OID */ +#include "catalog/pg_opclass.h" /* for INT4_OPS_OID */ +#include "catalog/pg_proc.h" /* for INT4GE_PROC_OID */ +#include "storage/itemptr.h" +#include "storage/bufpage.h" +#include "storage/bufmgr.h" +#include "utils/rel.h" +#include "utils/palloc.h" +#include "storage/large_object.h" +#include "utils/elog.h" +#include "utils/syscache.h" +#include "utils/builtins.h" /* for namestrcpy() */ +#include "catalog/heap.h" +#include "nodes/pg_list.h" + +/* + * Warning, Will Robinson... In order to pack data into an inversion + * file as densely as possible, we violate the class abstraction here. + * When we're appending a new tuple to the end of the table, we check + * the last page to see how much data we can put on it. If it's more + * than IMINBLK, we write enough to fill the page. This limits external + * fragmentation. In no case can we write more than IMAXBLK, since + * the 8K postgres page size less overhead leaves only this much space + * for data. 
+ */ + +#define IFREESPC(p) (PageGetFreeSpace(p) - sizeof(HeapTupleData) - sizeof(struct varlena) - sizeof(int32)) +#define IMAXBLK 8092 +#define IMINBLK 512 + +/* non-export function prototypes */ +static HeapTuple inv_fetchtup(); +static HeapTuple inv_newtuple(); +static int inv_wrnew(LargeObjectDesc *obj_desc, char *buf, int nbytes); +static int inv_wrold(LargeObjectDesc *obj_desc, char *dbuf, int nbytes, + HeapTuple htup, Buffer buffer); +static void inv_indextup(LargeObjectDesc *obj_desc, HeapTuple htup); +static int _inv_getsize(Relation hreln, TupleDesc hdesc, Relation ireln); + +/* + * inv_create -- create a new large object. + * + * Arguments: + * flags -- storage manager to use, archive mode, etc. + * + * Returns: + * large object descriptor, appropriately filled in. + */ +LargeObjectDesc * +inv_create(int flags) +{ + int file_oid; + LargeObjectDesc *retval; + Relation r; + Relation indr; + int smgr; + char archchar; + TupleDesc tupdesc; + AttrNumber attNums[1]; + Oid classObjectId[1]; + char objname[NAMEDATALEN]; + char indname[NAMEDATALEN]; + + /* parse flags */ + smgr = flags & INV_SMGRMASK; + if (flags & INV_ARCHIVE) + archchar = 'h'; + else + archchar = 'n'; + + /* add one here since the pg_class tuple created + will have the next oid and we want to have the relation name + to correspond to the tuple OID */ + file_oid = newoid()+1; + + /* come up with some table names */ + sprintf(objname, "Xinv%d", file_oid); + sprintf(indname, "Xinx%d", file_oid); + + if (SearchSysCacheTuple(RELNAME, PointerGetDatum(objname), + 0,0,0) != NULL) { + elog(WARN, + "internal error: %s already exists -- cannot create large obj", + objname); + } + if (SearchSysCacheTuple(RELNAME, PointerGetDatum(indname), + 0,0,0) != NULL) { + elog(WARN, + "internal error: %s already exists -- cannot create large obj", + indname); + } + + /* this is pretty painful... want a tuple descriptor */ + tupdesc = CreateTemplateTupleDesc(2); + (void) TupleDescInitEntry(tupdesc, (AttrNumber) 1, + "olastbye", + "int4", + 0, false); + (void) TupleDescInitEntry(tupdesc, (AttrNumber) 2, + "odata", + "bytea", + 0, false); + /* + * First create the table to hold the inversion large object. It + * will be located on whatever storage manager the user requested. + */ + + (void) heap_create(objname, + objname, + (int) archchar, smgr, + tupdesc); + + /* make the relation visible in this transaction */ + CommandCounterIncrement(); + r = heap_openr(objname); + + if (!RelationIsValid(r)) { + elog(WARN, "cannot create large object on %s under inversion", + smgrout(smgr)); + } + + /* + * Now create a btree index on the relation's olastbyte attribute to + * make seeks go faster. The hardwired constants are embarassing + * to me, and are symptomatic of the pressure under which this code + * was written. 
+ * + * ok, mao, let's put in some symbolic constants - jolly + */ + + attNums[0] = 1; + classObjectId[0] = INT4_OPS_OID; + index_create(objname, indname, NULL, BTREE_AM_OID, + 1, &attNums[0], &classObjectId[0], + 0, (Datum) NULL, NULL); + + /* make the index visible in this transaction */ + CommandCounterIncrement(); + indr = index_openr(indname); + + if (!RelationIsValid(indr)) { + elog(WARN, "cannot create index for large obj on %s under inversion", + smgrout(smgr)); + } + + retval = (LargeObjectDesc *) palloc(sizeof(LargeObjectDesc)); + + retval->heap_r = r; + retval->index_r = indr; + retval->iscan = (IndexScanDesc) NULL; + retval->hdesc = RelationGetTupleDescriptor(r); + retval->idesc = RelationGetTupleDescriptor(indr); + retval->offset = retval->lowbyte = + retval->highbyte = 0; + ItemPointerSetInvalid(&(retval->htid)); + + if (flags & INV_WRITE) { + RelationSetLockForWrite(r); + retval->flags = IFS_WRLOCK|IFS_RDLOCK; + } else if (flags & INV_READ) { + RelationSetLockForRead(r); + retval->flags = IFS_RDLOCK; + } + retval->flags |= IFS_ATEOF; + + return(retval); +} + +LargeObjectDesc * +inv_open(Oid lobjId, int flags) +{ + LargeObjectDesc *retval; + Relation r; + char *indname; + Relation indrel; + + r = heap_open(lobjId); + + if (!RelationIsValid(r)) + return ((LargeObjectDesc *) NULL); + + indname = pstrdup((r->rd_rel->relname).data); + + /* + * hack hack hack... we know that the fourth character of the relation + * name is a 'v', and that the fourth character of the index name is an + * 'x', and that they're otherwise identical. + */ + indname[3] = 'x'; + indrel = index_openr(indname); + + if (!RelationIsValid(indrel)) + return ((LargeObjectDesc *) NULL); + + retval = (LargeObjectDesc *) palloc(sizeof(LargeObjectDesc)); + + retval->heap_r = r; + retval->index_r = indrel; + retval->iscan = (IndexScanDesc) NULL; + retval->hdesc = RelationGetTupleDescriptor(r); + retval->idesc = RelationGetTupleDescriptor(indrel); + retval->offset = retval->lowbyte = retval->highbyte = 0; + ItemPointerSetInvalid(&(retval->htid)); + + if (flags & INV_WRITE) { + RelationSetLockForWrite(r); + retval->flags = IFS_WRLOCK|IFS_RDLOCK; + } else if (flags & INV_READ) { + RelationSetLockForRead(r); + retval->flags = IFS_RDLOCK; + } + + return(retval); +} + +/* + * Closes an existing large object descriptor. + */ +void +inv_close(LargeObjectDesc *obj_desc) +{ + Assert(PointerIsValid(obj_desc)); + + if (obj_desc->iscan != (IndexScanDesc) NULL) + index_endscan(obj_desc->iscan); + + heap_close(obj_desc->heap_r); + index_close(obj_desc->index_r); + + pfree(obj_desc); +} + +/* + * Destroys an existing large object, and frees its associated pointers. + * + * returns -1 if failed + */ +int +inv_destroy(Oid lobjId) +{ + Relation r; + + r = (Relation) RelationIdGetRelation(lobjId); + if (!RelationIsValid(r) || r->rd_rel->relkind == RELKIND_INDEX) + return -1; + + heap_destroy(r->rd_rel->relname.data); + return 1; +} + +/* + * inv_stat() -- do a stat on an inversion file. + * + * For the time being, this is an insanely expensive operation. In + * order to find the size of the file, we seek to the last block in + * it and compute the size from that. We scan pg_class to determine + * the file's owner and create time. We don't maintain mod time or + * access time, yet. + * + * These fields aren't stored in a table anywhere because they're + * updated so frequently, and postgres only appends tuples at the + * end of relations. Once clustering works, we should fix this. 
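+ *
+ * Finding the size costs a backward probe of the olastbyte index plus
+ * a heap fetch (see _inv_getsize() below).  A caller uses it roughly
+ * like:
+ *
+ *	struct pgstat st;
+ *
+ *	if (inv_stat(obj_desc, &st) == 0)
+ *		... st.st_size is the object's length in bytes ...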
+ */ +int +inv_stat(LargeObjectDesc *obj_desc, struct pgstat *stbuf) +{ + Assert(PointerIsValid(obj_desc)); + Assert(stbuf != NULL); + + /* need read lock for stat */ + if (!(obj_desc->flags & IFS_RDLOCK)) { + RelationSetLockForRead(obj_desc->heap_r); + obj_desc->flags |= IFS_RDLOCK; + } + + stbuf->st_ino = obj_desc->heap_r->rd_id; +#if 1 + stbuf->st_mode = (S_IFREG | 0666); /* IFREG|rw-rw-rw- */ +#else + stbuf->st_mode = 100666; /* IFREG|rw-rw-rw- */ +#endif + stbuf->st_size = _inv_getsize(obj_desc->heap_r, + obj_desc->hdesc, + obj_desc->index_r); + + stbuf->st_uid = obj_desc->heap_r->rd_rel->relowner; + + /* we have no good way of computing access times right now */ + stbuf->st_atime_s = stbuf->st_mtime_s = stbuf->st_ctime_s = 0; + + return (0); +} + +int +inv_seek(LargeObjectDesc *obj_desc, int offset, int whence) +{ + int oldOffset; + Datum d; + ScanKeyData skey; + + Assert(PointerIsValid(obj_desc)); + + if (whence == SEEK_CUR) { + offset += obj_desc->offset; /* calculate absolute position */ + return (inv_seek(obj_desc, offset, SEEK_SET)); + } + + /* + * if you seek past the end (offset > 0) I have + * no clue what happens :-( B.L. 9/1/93 + */ + if (whence == SEEK_END) { + /* need read lock for getsize */ + if (!(obj_desc->flags & IFS_RDLOCK)) { + RelationSetLockForRead(obj_desc->heap_r); + obj_desc->flags |= IFS_RDLOCK; + } + offset += _inv_getsize(obj_desc->heap_r, + obj_desc->hdesc, + obj_desc->index_r ); + return (inv_seek(obj_desc, offset, SEEK_SET)); + } + + /* + * Whenever we do a seek, we turn off the EOF flag bit to force + * ourselves to check for real on the next read. + */ + + obj_desc->flags &= ~IFS_ATEOF; + oldOffset = obj_desc->offset; + obj_desc->offset = offset; + + /* try to avoid doing any work, if we can manage it */ + if (offset >= obj_desc->lowbyte + && offset <= obj_desc->highbyte + && oldOffset <= obj_desc->highbyte + && obj_desc->iscan != (IndexScanDesc) NULL) + return (offset); + + /* + * To do a seek on an inversion file, we start an index scan that + * will bring us to the right place. Each tuple in an inversion file + * stores the offset of the last byte that appears on it, and we have + * an index on this. 
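+ *
+ * For example, a seek to offset 100000 (re)positions the scan with the
+ * qual "olastbyte >= 100000" (hence INT4GE_PROC_OID below), so the
+ * next index_getnext() should land on the block whose byte range
+ * covers that offset, if one exists.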
+ */ + + + /* right now, just assume that the operation is SEEK_SET */ + if (obj_desc->iscan != (IndexScanDesc) NULL) { + d = Int32GetDatum(offset); + btmovescan(obj_desc->iscan, d); + } else { + + ScanKeyEntryInitialize(&skey, 0x0, 1, INT4GE_PROC_OID, + Int32GetDatum(offset)); + + obj_desc->iscan = index_beginscan(obj_desc->index_r, + (bool) 0, (uint16) 1, + &skey); + } + + return (offset); +} + +int +inv_tell(LargeObjectDesc *obj_desc) +{ + Assert(PointerIsValid(obj_desc)); + + return (obj_desc->offset); +} + +int +inv_read(LargeObjectDesc *obj_desc, char *buf, int nbytes) +{ + HeapTuple htup; + Buffer b; + int nread; + int off; + int ncopy; + Datum d; + struct varlena *fsblock; + bool isNull; + + Assert(PointerIsValid(obj_desc)); + Assert(buf != NULL); + + /* if we're already at EOF, we don't need to do any work here */ + if (obj_desc->flags & IFS_ATEOF) + return (0); + + /* make sure we obey two-phase locking */ + if (!(obj_desc->flags & IFS_RDLOCK)) { + RelationSetLockForRead(obj_desc->heap_r); + obj_desc->flags |= IFS_RDLOCK; + } + + nread = 0; + + /* fetch a block at a time */ + while (nread < nbytes) { + + /* fetch an inversion file system block */ + htup = inv_fetchtup(obj_desc, &b); + + if (!HeapTupleIsValid(htup)) { + obj_desc->flags |= IFS_ATEOF; + break; + } + + /* copy the data from this block into the buffer */ + d = (Datum) heap_getattr(htup, b, 2, obj_desc->hdesc, &isNull); + fsblock = (struct varlena *) DatumGetPointer(d); + + off = obj_desc->offset - obj_desc->lowbyte; + ncopy = obj_desc->highbyte - obj_desc->offset + 1; + if (ncopy > (nbytes - nread)) + ncopy = (nbytes - nread); + memmove(buf, &(fsblock->vl_dat[off]), ncopy); + + /* be a good citizen */ + ReleaseBuffer(b); + + /* move pointers past the amount we just read */ + buf += ncopy; + nread += ncopy; + obj_desc->offset += ncopy; + } + + /* that's it */ + return (nread); +} + +int +inv_write(LargeObjectDesc *obj_desc, char *buf, int nbytes) +{ + HeapTuple htup; + Buffer b; + int nwritten; + int tuplen; + + Assert(PointerIsValid(obj_desc)); + Assert(buf != NULL); + + /* + * Make sure we obey two-phase locking. A write lock entitles you + * to read the relation, as well. + */ + + if (!(obj_desc->flags & IFS_WRLOCK)) { + RelationSetLockForRead(obj_desc->heap_r); + obj_desc->flags |= (IFS_WRLOCK|IFS_RDLOCK); + } + + nwritten = 0; + + /* write a block at a time */ + while (nwritten < nbytes) { + + /* + * Fetch the current inversion file system block. If the + * class storing the inversion file is empty, we don't want + * to do an index lookup, since index lookups choke on empty + * files (should be fixed someday). + */ + + if ((obj_desc->flags & IFS_ATEOF) + || obj_desc->heap_r->rd_nblocks == 0) + htup = (HeapTuple) NULL; + else + htup = inv_fetchtup(obj_desc, &b); + + /* either append or replace a block, as required */ + if (!HeapTupleIsValid(htup)) { + tuplen = inv_wrnew(obj_desc, buf, nbytes - nwritten); + } else { + if (obj_desc->offset > obj_desc->highbyte) + tuplen = inv_wrnew(obj_desc, buf, nbytes - nwritten); + else + tuplen = inv_wrold(obj_desc, buf, nbytes - nwritten, htup, b); + } + + /* move pointers past the amount we just wrote */ + buf += tuplen; + nwritten += tuplen; + obj_desc->offset += tuplen; + } + + /* that's it */ + return (nwritten); +} + +/* + * inv_fetchtup -- Fetch an inversion file system block. + * + * This routine finds the file system block containing the offset + * recorded in the obj_desc structure. 
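+ * If the cached block bounds (obj_desc->lowbyte/highbyte) still cover
+ * the offset and we remember a heap tid, we simply refetch that tuple;
+ * otherwise we (re)position the index scan and follow it to the heap.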
Later, we need to think about + * the effects of non-functional updates (can you rewrite the same + * block twice in a single transaction?), but for now, we won't bother. + * + * Parameters: + * obj_desc -- the object descriptor. + * bufP -- pointer to a buffer in the buffer cache; caller + * must free this. + * + * Returns: + * A heap tuple containing the desired block, or NULL if no + * such tuple exists. + */ +static HeapTuple +inv_fetchtup(LargeObjectDesc *obj_desc, Buffer *bufP) +{ + HeapTuple htup; + RetrieveIndexResult res; + Datum d; + int firstbyte, lastbyte; + struct varlena *fsblock; + bool isNull; + + /* + * If we've exhausted the current block, we need to get the next one. + * When we support time travel and non-functional updates, we will + * need to loop over the blocks, rather than just have an 'if', in + * order to find the one we're really interested in. + */ + + if (obj_desc->offset > obj_desc->highbyte + || obj_desc->offset < obj_desc->lowbyte + || !ItemPointerIsValid(&(obj_desc->htid))) { + + /* initialize scan key if not done */ + if (obj_desc->iscan==(IndexScanDesc)NULL) { + ScanKeyData skey; + + ScanKeyEntryInitialize(&skey, 0x0, 1, INT4GE_PROC_OID, + Int32GetDatum(0)); + obj_desc->iscan = + index_beginscan(obj_desc->index_r, + (bool) 0, (uint16) 1, + &skey); + } + + do { + res = index_getnext(obj_desc->iscan, ForwardScanDirection); + + if (res == (RetrieveIndexResult) NULL) { + ItemPointerSetInvalid(&(obj_desc->htid)); + return ((HeapTuple) NULL); + } + + /* + * For time travel, we need to use the actual time qual here, + * rather that NowTimeQual. We currently have no way to pass + * a time qual in. + */ + + htup = heap_fetch(obj_desc->heap_r, NowTimeQual, + &(res->heap_iptr), bufP); + + } while (htup == (HeapTuple) NULL); + + /* remember this tid -- we may need it for later reads/writes */ + ItemPointerCopy(&(res->heap_iptr), &(obj_desc->htid)); + + } else { + htup = heap_fetch(obj_desc->heap_r, NowTimeQual, + &(obj_desc->htid), bufP); + } + + /* + * By here, we have the heap tuple we're interested in. We cache + * the upper and lower bounds for this block in the object descriptor + * and return the tuple. + */ + + d = (Datum)heap_getattr(htup, *bufP, 1, obj_desc->hdesc, &isNull); + lastbyte = (int32) DatumGetInt32(d); + d = (Datum)heap_getattr(htup, *bufP, 2, obj_desc->hdesc, &isNull); + fsblock = (struct varlena *) DatumGetPointer(d); + + /* order of + and - is important -- these are unsigned quantites near 0 */ + firstbyte = (lastbyte + 1 + sizeof(fsblock->vl_len)) - fsblock->vl_len; + + obj_desc->lowbyte = firstbyte; + obj_desc->highbyte = lastbyte; + + /* done */ + return (htup); +} + +/* + * inv_wrnew() -- append a new filesystem block tuple to the inversion + * file. + * + * In response to an inv_write, we append one or more file system + * blocks to the class containing the large object. We violate the + * class abstraction here in order to pack things as densely as we + * are able. We examine the last page in the relation, and write + * just enough to fill it, assuming that it has above a certain + * threshold of space available. If the space available is less than + * the threshold, we allocate a new page by writing a big tuple. + * + * By the time we get here, we know all the parameters passed in + * are valid, and that we hold the appropriate lock on the heap + * relation. + * + * Parameters: + * obj_desc: large object descriptor for which to append block. + * buf: buffer containing data to write. 
+ * nbytes: amount to write + * + * Returns: + * number of bytes actually written to the new tuple. + */ +static int +inv_wrnew(LargeObjectDesc *obj_desc, char *buf, int nbytes) +{ + Relation hr; + HeapTuple ntup; + Buffer buffer; + Page page; + int nblocks; + int nwritten; + + hr = obj_desc->heap_r; + + /* + * Get the last block in the relation. If there's no data in the + * relation at all, then we just get a new block. Otherwise, we + * check the last block to see whether it has room to accept some + * or all of the data that the user wants to write. If it doesn't, + * then we allocate a new block. + */ + + nblocks = RelationGetNumberOfBlocks(hr); + + if (nblocks > 0) + buffer = ReadBuffer(hr, nblocks - 1); + else + buffer = ReadBuffer(hr, P_NEW); + + page = BufferGetPage(buffer); + + /* + * If the last page is too small to hold all the data, and it's too + * small to hold IMINBLK, then we allocate a new page. If it will + * hold at least IMINBLK, but less than all the data requested, then + * we write IMINBLK here. The caller is responsible for noticing that + * less than the requested number of bytes were written, and calling + * this routine again. + */ + + nwritten = IFREESPC(page); + if (nwritten < nbytes) { + if (nwritten < IMINBLK) { + ReleaseBuffer(buffer); + buffer = ReadBuffer(hr, P_NEW); + page = BufferGetPage(buffer); + PageInit(page, BufferGetPageSize(buffer), 0); + if (nbytes > IMAXBLK) + nwritten = IMAXBLK; + else + nwritten = nbytes; + } + } else { + nwritten = nbytes; + } + + /* + * Insert a new file system block tuple, index it, and write it out. + */ + + ntup = inv_newtuple(obj_desc, buffer, page, buf, nwritten); + inv_indextup(obj_desc, ntup); + + /* new tuple is inserted */ + WriteBuffer(buffer); + + return (nwritten); +} + +static int +inv_wrold(LargeObjectDesc *obj_desc, + char *dbuf, + int nbytes, + HeapTuple htup, + Buffer buffer) +{ + Relation hr; + HeapTuple ntup; + Buffer newbuf; + Page page; + Page newpage; + int tupbytes; + Datum d; + struct varlena *fsblock; + int nwritten, nblocks, freespc; + bool isNull; + int keep_offset; + + /* + * Since we're using a no-overwrite storage manager, the way we + * overwrite blocks is to mark the old block invalid and append + * a new block. First mark the old block invalid. This violates + * the tuple abstraction. + */ + + TransactionIdStore(GetCurrentTransactionId(), &(htup->t_xmax)); + htup->t_cmax = GetCurrentCommandId(); + + /* + * If we're overwriting the entire block, we're lucky. All we need + * to do is to insert a new block. + */ + + if (obj_desc->offset == obj_desc->lowbyte + && obj_desc->lowbyte + nbytes >= obj_desc->highbyte) { + WriteBuffer(buffer); + return (inv_wrnew(obj_desc, dbuf, nbytes)); + } + + /* + * By here, we need to overwrite part of the data in the current + * tuple. In order to reduce the degree to which we fragment blocks, + * we guarantee that no block will be broken up due to an overwrite. + * This means that we need to allocate a tuple on a new page, if + * there's not room for the replacement on this one. + */ + + newbuf = buffer; + page = BufferGetPage(buffer); + newpage = BufferGetPage(newbuf); + hr = obj_desc->heap_r; + freespc = IFREESPC(page); + d = (Datum)heap_getattr(htup, buffer, 2, obj_desc->hdesc, &isNull); + fsblock = (struct varlena *) DatumGetPointer(d); + tupbytes = fsblock->vl_len - sizeof(fsblock->vl_len); + + if (freespc < tupbytes) { + + /* + * First see if there's enough space on the last page of the + * table to put this tuple. 
+ */ + + nblocks = RelationGetNumberOfBlocks(hr); + + if (nblocks > 0) + newbuf = ReadBuffer(hr, nblocks - 1); + else + newbuf = ReadBuffer(hr, P_NEW); + + newpage = BufferGetPage(newbuf); + freespc = IFREESPC(newpage); + + /* + * If there's no room on the last page, allocate a new last + * page for the table, and put it there. + */ + + if (freespc < tupbytes) { + ReleaseBuffer(newbuf); + newbuf = ReadBuffer(hr, P_NEW); + newpage = BufferGetPage(newbuf); + PageInit(newpage, BufferGetPageSize(newbuf), 0); + } + } + + nwritten = nbytes; + if (nwritten > obj_desc->highbyte - obj_desc->offset + 1) + nwritten = obj_desc->highbyte - obj_desc->offset + 1; + memmove(VARDATA(fsblock)+ (obj_desc->offset - obj_desc->lowbyte), + dbuf,nwritten); + /* we are rewriting the entire old block, therefore + we reset offset to the lowbyte of the original block + before jumping into inv_newtuple() */ + keep_offset = obj_desc->offset; + obj_desc->offset = obj_desc->lowbyte; + ntup = inv_newtuple(obj_desc, newbuf, newpage, VARDATA(fsblock), + tupbytes); + /* after we are done, we restore to the true offset */ + obj_desc->offset = keep_offset; + + /* + * By here, we have a page (newpage) that's guaranteed to have + * enough space on it to put the new tuple. Call inv_newtuple + * to do the work. Passing NULL as a buffer to inv_newtuple() + * keeps it from copying any data into the new tuple. When it + * returns, the tuple is ready to receive data from the old + * tuple and the user's data buffer. + */ +/* + ntup = inv_newtuple(obj_desc, newbuf, newpage, (char *) NULL, tupbytes); + dptr = ((char *) ntup) + ntup->t_hoff - sizeof(ntup->t_bits) + sizeof(int4) + + sizeof(fsblock->vl_len); + + if (obj_desc->offset > obj_desc->lowbyte) { + memmove(dptr, + &(fsblock->vl_dat[0]), + obj_desc->offset - obj_desc->lowbyte); + dptr += obj_desc->offset - obj_desc->lowbyte; + } + + + nwritten = nbytes; + if (nwritten > obj_desc->highbyte - obj_desc->offset + 1) + nwritten = obj_desc->highbyte - obj_desc->offset + 1; + + memmove(dptr, dbuf, nwritten); + dptr += nwritten; + + if (obj_desc->offset + nwritten < obj_desc->highbyte + 1) { +*/ +/* + loc = (obj_desc->highbyte - obj_desc->offset) + + nwritten; + sz = obj_desc->highbyte - (obj_desc->lowbyte + loc); + + what's going on here?? - jolly +*/ +/* + sz = (obj_desc->highbyte + 1) - (obj_desc->offset + nwritten); + memmove(&(fsblock->vl_dat[0]), dptr, sz); + } +*/ + + + /* index the new tuple */ + inv_indextup(obj_desc, ntup); + + /* move the scandesc forward so we don't reread the newly inserted + tuple on the next index scan */ + if (obj_desc->iscan) + index_getnext(obj_desc->iscan, ForwardScanDirection); + + /* + * Okay, by here, a tuple for the new block is correctly placed, + * indexed, and filled. Write the changed pages out. + */ + + WriteBuffer(buffer); + if (newbuf != buffer) + WriteBuffer(newbuf); + + /* done */ + return (nwritten); +} + +static HeapTuple +inv_newtuple(LargeObjectDesc *obj_desc, + Buffer buffer, + Page page, + char *dbuf, + int nwrite) +{ + HeapTuple ntup; + PageHeader ph; + int tupsize; + int hoff; + Offset lower; + Offset upper; + ItemId itemId; + OffsetNumber off; + OffsetNumber limit; + char *attptr; + + /* compute tuple size -- no nulls */ + hoff = sizeof(HeapTupleData) - sizeof(ntup->t_bits); + + /* add in olastbyte, varlena.vl_len, varlena.vl_dat */ + tupsize = hoff + (2 * sizeof(int32)) + nwrite; + tupsize = LONGALIGN(tupsize); + + /* + * Allocate the tuple on the page, violating the page abstraction. + * This code was swiped from PageAddItem(). 
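+ *
+ * The finished tuple is laid out as: heap tuple header (hoff bytes),
+ * the olastbyte int32, the varlena length word, and then the nwrite
+ * data bytes; tupsize is the LONGALIGN'd sum of those four pieces.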
+ */ + + ph = (PageHeader) page; + limit = OffsetNumberNext(PageGetMaxOffsetNumber(page)); + + /* look for "recyclable" (unused & deallocated) ItemId */ + for (off = FirstOffsetNumber; off < limit; off = OffsetNumberNext(off)) { + itemId = &ph->pd_linp[off - 1]; + if ((((*itemId).lp_flags & LP_USED) == 0) && + ((*itemId).lp_len == 0)) + break; + } + + if (off > limit) + lower = (Offset) (((char *) (&ph->pd_linp[off])) - ((char *) page)); + else if (off == limit) + lower = ph->pd_lower + sizeof (ItemIdData); + else + lower = ph->pd_lower; + + upper = ph->pd_upper - tupsize; + + itemId = &ph->pd_linp[off - 1]; + (*itemId).lp_off = upper; + (*itemId).lp_len = tupsize; + (*itemId).lp_flags = LP_USED; + ph->pd_lower = lower; + ph->pd_upper = upper; + + ntup = (HeapTuple) ((char *) page + upper); + + /* + * Tuple is now allocated on the page. Next, fill in the tuple + * header. This block of code violates the tuple abstraction. + */ + + ntup->t_len = tupsize; + ItemPointerSet(&(ntup->t_ctid), BufferGetBlockNumber(buffer), off); + ItemPointerSetInvalid(&(ntup->t_chain)); + LastOidProcessed = ntup->t_oid = newoid(); + TransactionIdStore(GetCurrentTransactionId(), &(ntup->t_xmin)); + ntup->t_cmin = GetCurrentCommandId(); + StoreInvalidTransactionId(&(ntup->t_xmax)); + ntup->t_cmax = 0; + ntup->t_tmin = INVALID_ABSTIME; + ntup->t_tmax = CURRENT_ABSTIME; + ntup->t_natts = 2; + ntup->t_hoff = hoff; + ntup->t_vtype = 0; + ntup->t_infomask = 0x0; + + /* if a NULL is passed in, avoid the calculations below */ + if (dbuf == NULL) + return ntup; + + /* + * Finally, copy the user's data buffer into the tuple. This violates + * the tuple and class abstractions. + */ + + attptr = ((char *) ntup) + hoff; + *((int32 *) attptr) = obj_desc->offset + nwrite - 1; + attptr += sizeof(int32); + + /* + ** mer fixed disk layout of varlenas to get rid of the need for this. + ** + ** *((int32 *) attptr) = nwrite + sizeof(int32); + ** attptr += sizeof(int32); + */ + + *((int32 *) attptr) = nwrite + sizeof(int32); + attptr += sizeof(int32); + + /* + * If a data buffer was passed in, then copy the data from the buffer + * to the tuple. Some callers (eg, inv_wrold()) may not pass in a + * buffer, since they have to copy part of the old tuple data and + * part of the user's new data into the new tuple. 
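+ *
+ * (As the code stands above, inv_wrold() patches the old block image
+ * in place and passes VARDATA(fsblock) here, so the NULL-buffer path
+ * is only taken by the commented-out variant.)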
+ */ + + if (dbuf != (char *) NULL) + memmove(attptr, dbuf, nwrite); + + /* keep track of boundary of current tuple */ + obj_desc->lowbyte = obj_desc->offset; + obj_desc->highbyte = obj_desc->offset + nwrite - 1; + + /* new tuple is filled -- return it */ + return (ntup); +} + +static void +inv_indextup(LargeObjectDesc *obj_desc, HeapTuple htup) +{ + IndexTuple itup; + InsertIndexResult res; + Datum v[1]; + char n[1]; + + n[0] = ' '; + v[0] = Int32GetDatum(obj_desc->highbyte); + itup = index_formtuple(obj_desc->idesc, &v[0], &n[0]); + memmove((char *)&(itup->t_tid), + (char *)&(htup->t_ctid), + sizeof(ItemPointerData)); + res = index_insert(obj_desc->index_r, itup); + + if (res) + pfree(res); + + pfree(itup); +} + +/* +static void +DumpPage(Page page, int blkno) +{ + ItemId lp; + HeapTuple tup; + int flags, i, nline; + ItemPointerData pointerData; + + printf("\t[subblock=%d]:lower=%d:upper=%d:special=%d\n", 0, + ((PageHeader)page)->pd_lower, ((PageHeader)page)->pd_upper, + ((PageHeader)page)->pd_special); + + printf("\t:MaxOffsetNumber=%d\n", + (int16) PageGetMaxOffsetNumber(page)); + + nline = (int16) PageGetMaxOffsetNumber(page); + +{ + int i; + char *cp; + + i = PageGetSpecialSize(page); + cp = PageGetSpecialPointer(page); + + printf("\t:SpecialData="); + + while (i > 0) { + printf(" 0x%02x", *cp); + cp += 1; + i -= 1; + } + printf("\n"); +} + for (i = 0; i < nline; i++) { + lp = ((PageHeader)page)->pd_linp + i; + flags = (*lp).lp_flags; + ItemPointerSet(&pointerData, blkno, 1 + i); + printf("%s:off=%d:flags=0x%x:len=%d", + ItemPointerFormExternal(&pointerData), (*lp).lp_off, + flags, (*lp).lp_len); + + if (flags & LP_USED) { + HeapTupleData htdata; + + printf(":USED"); + + memmove((char *) &htdata, + (char *) &((char *)page)[(*lp).lp_off], + sizeof(htdata)); + + tup = &htdata; + + printf("\n\t:ctid=%s:oid=%d", + ItemPointerFormExternal(&tup->t_ctid), + tup->t_oid); + printf(":natts=%d:thoff=%d:vtype=`%c' (0x%02x):", + tup->t_natts, + tup->t_hoff, tup->t_vtype, tup->t_vtype); + + printf("\n\t:tmin=%d:cmin=%u:", + tup->t_tmin, tup->t_cmin); + + printf("xmin=%u:", tup->t_xmin); + + printf("\n\t:tmax=%d:cmax=%u:", + tup->t_tmax, tup->t_cmax); + + printf("xmax=%u:", tup->t_xmax); + + printf("\n\t:chain=%s:\n", + ItemPointerFormExternal(&tup->t_chain)); + } else + putchar('\n'); + } +} + +static char* +ItemPointerFormExternal(ItemPointer pointer) +{ + static char itemPointerString[32]; + + if (!ItemPointerIsValid(pointer)) { + memmove(itemPointerString, "<-,-,->", sizeof "<-,-,->"); + } else { + sprintf(itemPointerString, "<%u,%u>", + ItemPointerGetBlockNumber(pointer), + ItemPointerGetOffsetNumber(pointer)); + } + + return (itemPointerString); +} +*/ + +static int +_inv_getsize(Relation hreln, TupleDesc hdesc, Relation ireln) +{ + IndexScanDesc iscan; + RetrieveIndexResult res; + Buffer buf; + HeapTuple htup; + Datum d; + long size; + bool isNull; + + /* scan backwards from end */ + iscan = index_beginscan(ireln, (bool) 1, 0, (ScanKey) NULL); + + buf = InvalidBuffer; + + do { + res = index_getnext(iscan, BackwardScanDirection); + + /* + * If there are no more index tuples, then the relation is empty, + * so the file's size is zero. + */ + + if (res == (RetrieveIndexResult) NULL) { + index_endscan(iscan); + return (0); + } + + /* + * For time travel, we need to use the actual time qual here, + * rather that NowTimeQual. We currently have no way to pass + * a time qual in. 
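+ *
+ * We loop here because the index entry found by the backward scan may
+ * point at a heap tuple that is not visible under NowTimeQual; in that
+ * case heap_fetch() returns NULL and we step back to the next entry.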
+ */ + + if (buf != InvalidBuffer) + (void) ReleaseBuffer(buf); + + htup = heap_fetch(hreln, NowTimeQual, &(res->heap_iptr), &buf); + + } while (!HeapTupleIsValid(htup)); + + /* don't need the index scan anymore */ + index_endscan(iscan); + + /* get olastbyte attribute */ + d = (Datum) heap_getattr(htup, buf, 1, hdesc, &isNull); + size = DatumGetInt32(d) + 1; + + /* wei hates it if you forget to do this */ + ReleaseBuffer(buf); + + return (size); +} diff --git a/src/backend/storage/lmgr.h b/src/backend/storage/lmgr.h new file mode 100644 index 00000000000..fe87eb05546 --- /dev/null +++ b/src/backend/storage/lmgr.h @@ -0,0 +1,84 @@ +/*------------------------------------------------------------------------- + * + * lmgr.h-- + * POSTGRES lock manager definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: lmgr.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef LMGR_H +#define LMGR_H + +#include "postgres.h" + +#include "storage/itemptr.h" +#include "storage/lock.h" +#include "utils/rel.h" + +/* + * This was moved from pladt.h for the new lock manager. Want to obsolete + * all of the old code. + */ +typedef struct LRelId { + Oid relId; /* a relation identifier */ + Oid dbId; /* a database identifier */ +} LRelId; + +typedef struct LockInfoData { + bool initialized; + LRelId lRelId; + TransactionId transactionIdData; + uint16 flags; +} LockInfoData; +typedef LockInfoData *LockInfo; + +#define LockInfoIsValid(linfo) \ + ((PointerIsValid(linfo)) && ((LockInfo) linfo)->initialized) + + +extern LRelId RelationGetLRelId(Relation relation); +extern Oid LRelIdGetDatabaseId(LRelId lRelId); +extern Oid LRelIdGetRelationId(LRelId lRelId); +extern bool DatabaseIdIsMyDatabaseId(Oid databaseId); +extern bool LRelIdContainsMyDatabaseId(LRelId lRelId); +extern void RelationInitLockInfo(Relation relation); +extern void RelationDiscardLockInfo(Relation relation); +extern void RelationSetLockForDescriptorOpen(Relation relation); +extern void RelationSetLockForRead(Relation relation); +extern void RelationUnsetLockForRead(Relation relation); +extern void RelationSetLockForWrite(Relation relation); +extern void RelationUnsetLockForWrite(Relation relation); +extern void RelationSetLockForTupleRead(Relation relation, + ItemPointer itemPointer); + +/* used in vaccum.c */ +extern void RelationSetLockForWritePage(Relation relation, + ItemPointer itemPointer); + +/* used in nbtpage.c, hashpage.c */ +extern void RelationSetSingleWLockPage(Relation relation, + ItemPointer itemPointer); +extern void RelationUnsetSingleWLockPage(Relation relation, + ItemPointer itemPointer); +extern void RelationSetSingleRLockPage(Relation relation, + ItemPointer itemPointer); +extern void RelationUnsetSingleRLockPage(Relation relation, + ItemPointer itemPointer); +extern void RelationSetRIntentLock(Relation relation); +extern void RelationUnsetRIntentLock(Relation relation); +extern void RelationSetWIntentLock(Relation relation); +extern void RelationUnsetWIntentLock(Relation relation); +extern void RelationSetLockForExtend(Relation relation); +extern void RelationUnsetLockForExtend(Relation relation); +extern void LRelIdAssign(LRelId *lRelId, Oid dbId, Oid relId); + +/* single.c */ +extern bool SingleLockReln(LockInfo linfo, LOCKT lockt, int action); +extern bool SingleLockPage(LockInfo linfo, ItemPointer tidPtr, + LOCKT lockt, int action); + +#endif /* LMGR_H */ diff --git 
a/src/backend/storage/lmgr/Makefile.inc b/src/backend/storage/lmgr/Makefile.inc new file mode 100644 index 00000000000..ac507558b57 --- /dev/null +++ b/src/backend/storage/lmgr/Makefile.inc @@ -0,0 +1,14 @@ +#------------------------------------------------------------------------- +# +# Makefile.inc-- +# Makefile for storage/lmgr +# +# Copyright (c) 1994, Regents of the University of California +# +# +# IDENTIFICATION +# $Header: /cvsroot/pgsql/src/backend/storage/lmgr/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $ +# +#------------------------------------------------------------------------- + +SUBSRCS+= lmgr.c lock.c multi.c proc.c single.c diff --git a/src/backend/storage/lmgr/README b/src/backend/storage/lmgr/README new file mode 100644 index 00000000000..e382003f2a4 --- /dev/null +++ b/src/backend/storage/lmgr/README @@ -0,0 +1,93 @@ +$Header: /cvsroot/pgsql/src/backend/storage/lmgr/README,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $ + +This file is an attempt to save me (and future code maintainers) some +time and a lot of headaches. The existing lock manager code at the time +of this writing (June 16 1992) can best be described as confusing. The +complexity seems inherent in lock manager functionality, but variable +names chosen in the current implementation really confuse me everytime +I have to track down a bug. Also, what gets done where and by whom isn't +always clear.... + +Starting with the data structures the lock manager relies upon... + +(NOTE - these will undoubtedly change over time and it is likely +that this file won't always be updated along with the structs.) + +The lock manager's LOCK: + +tag - + The key fields that are used for hashing locks in the shared memory + lock hash table. This is kept as a separate struct to ensure that we + always zero out the correct number of bytes. This is a problem as + part of the tag is an itempointer which is 6 bytes and causes 2 + additional bytes to be added as padding. + + tag.relId - + Uniquely identifies the relation that the lock corresponds to. + + tag.dbId - + Uniquely identifies the database in which the relation lives. If + this is a shared system relation (e.g. pg_user) the dbId should be + set to 0. + + tag.tupleId - + Uniquely identifies the block/page within the relation and the + tuple within the block. If we are setting a table level lock + both the blockId and tupleId (in an item pointer this is called + the position) are set to invalid, if it is a page level lock the + blockId is valid, while the tuleId is still invalid. Finally if + this is a tuple level lock (we currently never do this) then both + the blockId and tupleId are set to valid specifications. This is + how we get the appearance of a multi-level lock table while using + only a single table (see Gray's paper on 2 phase locking if + you are puzzled about how multi-level lock tables work). + +mask - + This field indicates what types of locks are currently held in the + given lock. It is used (against the lock table's conflict table) + to determine if the new lock request will conflict with existing + lock types held. Conficts are determined by bitwise AND operations + between the mask and the conflict table entry for the given lock type + to be set. The current representation is that each bit (1 through 5) + is set when that lock type (WRITE, READ, WRITE INTENT, READ INTENT, EXTEND) + has been acquired for the lock. 
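+
+    For example, if the conflict table entry for WRITE has both the READ
+    and WRITE bits set, then a new WRITE request conflicts exactly when
+    (mask & conflictTab[WRITE]) is non-zero, i.e. when some backend
+    already holds a READ or WRITE on this lock.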
+ +waitProcs - + This is a shared memory queue of all process structures corresponding to + a backend that is waiting (sleeping) until another backend releases this + lock. The process structure holds the information needed to determine + if it should be woken up when this lock is released. If, for example, + we are releasing a read lock and the process is sleeping trying to acquire + a read lock then there is no point in waking it since the lock being + released isn't what caused it to sleep in the first place. There will + be more on this below (when I get to releasing locks and waking sleeping + process routines). + +nHolding - + Keeps a count of how many times this lock has been attempted to be + acquired. The count includes attempts by processes which were put + to sleep due to conflicts. It also counts the same backend twice + if, for example, a backend process first acquires a read and then + acquires a write. + +holders - + Keeps a count of how many locks of each type have been attempted. Only + elements 1 through MAX_LOCK_TYPES are used as they correspond to the lock + type defined constants (WRITE through EXTEND). Summing the values of + holders should come out equal to nHolding. + +nActive - + Keeps a count of how many times this lock has been succesfully acquired. + This count does not include attempts that were rejected due to conflicts, + but can count the same backend twice (e.g. a read then a write -- since + its the same transaction this won't cause a conflict) + +activeHolders - + Keeps a count of how locks of each type are currently held. Once again + only elements 1 through MAX_LOCK_TYPES are used (0 is not). Also, like + holders, summing the values of activeHolders should total to the value + of nActive. + + +This is all I had the stomach for right now..... I will get back to this +someday. 
-mer 17 June 1992 12:00 am diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c new file mode 100644 index 00000000000..bfc2f5b2eec --- /dev/null +++ b/src/backend/storage/lmgr/lmgr.c @@ -0,0 +1,933 @@ +/*------------------------------------------------------------------------- + * + * lmgr.c-- + * POSTGRES lock manager code + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/lmgr/lmgr.c,v 1.1.1.1 1996/07/09 06:21:56 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +/* #define LOCKDEBUGALL 1 */ +/* #define LOCKDEBUG 1 */ + +#ifdef LOCKDEBUGALL +#define LOCKDEBUG 1 +#endif /* LOCKDEBUGALL */ + +#include "postgres.h" + +#include "access/heapam.h" +#include "access/htup.h" +#include "access/relscan.h" +#include "access/skey.h" +#include "utils/tqual.h" +#include "access/xact.h" + +#include "storage/block.h" +#include "storage/buf.h" +#include "storage/itemptr.h" +#include "storage/bufpage.h" +#include "storage/multilev.h" +#include "storage/lmgr.h" + +#include "utils/elog.h" +#include "utils/palloc.h" +#include "utils/rel.h" + +#include "catalog/catname.h" +#include "catalog/catalog.h" +#include "catalog/pg_class.h" + +#include "nodes/memnodes.h" +#include "storage/bufmgr.h" +#include "access/transam.h" /* for AmiTransactionId */ + +/* ---------------- + * + * ---------------- + */ +#define MaxRetries 4 /* XXX about 1/4 minute--a hack */ + +#define IntentReadRelationLock 0x0100 +#define ReadRelationLock 0x0200 +#define IntentWriteRelationLock 0x0400 +#define WriteRelationLock 0x0800 +#define IntentReadPageLock 0x1000 +#define ReadTupleLock 0x2000 + +#define TupleLevelLockCountMask 0x000f + +#define TupleLevelLockLimit 10 + +extern Oid MyDatabaseId; + +static LRelId VariableRelationLRelId = { + RelOid_pg_variable, + InvalidOid +}; + +/* ---------------- + * RelationGetLRelId + * ---------------- + */ +#ifdef LOCKDEBUG +#define LOCKDEBUG_10 \ +elog(NOTICE, "RelationGetLRelId(%s) invalid lockInfo", \ + RelationGetRelationName(relation)); +#else +#define LOCKDEBUG_10 +#endif /* LOCKDEBUG */ + +/* + * RelationGetLRelId -- + * Returns "lock" relation identifier for a relation. + */ +LRelId +RelationGetLRelId(Relation relation) +{ + LockInfo linfo; + + /* ---------------- + * sanity checks + * ---------------- + */ + Assert(RelationIsValid(relation)); + linfo = (LockInfo) relation->lockInfo; + + /* ---------------- + * initialize lock info if necessary + * ---------------- + */ + if (! LockInfoIsValid(linfo)) { + LOCKDEBUG_10; + RelationInitLockInfo(relation); + linfo = (LockInfo) relation->lockInfo; + } + + /* ---------------- + * XXX hack to prevent problems during + * VARIABLE relation initialization + * ---------------- + */ + if (strcmp(RelationGetRelationName(relation)->data, + VariableRelationName) == 0) { + return (VariableRelationLRelId); + } + + return (linfo->lRelId); +} + +/* + * LRelIdGetDatabaseId -- + * Returns database identifier for a "lock" relation identifier. + */ +/* ---------------- + * LRelIdGetDatabaseId + * + * Note: The argument may not be correct, if it is not used soon + * after it is created. + * ---------------- + */ +Oid +LRelIdGetDatabaseId(LRelId lRelId) +{ + return (lRelId.dbId); +} + + +/* + * LRelIdGetRelationId -- + * Returns relation identifier for a "lock" relation identifier. 
+ */ +Oid +LRelIdGetRelationId(LRelId lRelId) +{ + return (lRelId.relId); +} + +/* + * DatabaseIdIsMyDatabaseId -- + * True iff database object identifier is valid in my present database. + */ +bool +DatabaseIdIsMyDatabaseId(Oid databaseId) +{ + return (bool) + (!OidIsValid(databaseId) || databaseId == MyDatabaseId); +} + +/* + * LRelIdContainsMyDatabaseId -- + * True iff "lock" relation identifier is valid in my present database. + */ +bool +LRelIdContainsMyDatabaseId(LRelId lRelId) +{ + return (bool) + (!OidIsValid(lRelId.dbId) || lRelId.dbId == MyDatabaseId); +} + +/* + * RelationInitLockInfo -- + * Initializes the lock information in a relation descriptor. + */ +/* ---------------- + * RelationInitLockInfo + * + * XXX processingVariable is a hack to prevent problems during + * VARIABLE relation initialization. + * ---------------- + */ +void +RelationInitLockInfo(Relation relation) +{ + LockInfo info; + char *relname; + Oid relationid; + bool processingVariable; + extern Oid MyDatabaseId; /* XXX use include */ + extern GlobalMemory CacheCxt; + + /* ---------------- + * sanity checks + * ---------------- + */ + Assert(RelationIsValid(relation)); + Assert(OidIsValid(RelationGetRelationId(relation))); + + /* ---------------- + * get information from relation descriptor + * ---------------- + */ + info = (LockInfo) relation->lockInfo; + relname = (char *) RelationGetRelationName(relation); + relationid = RelationGetRelationId(relation); + processingVariable = (strcmp(relname, VariableRelationName) == 0); + + /* ---------------- + * create a new lockinfo if not already done + * ---------------- + */ + if (! PointerIsValid(info)) + { + MemoryContext oldcxt; + + oldcxt = MemoryContextSwitchTo((MemoryContext)CacheCxt); + info = (LockInfo)palloc(sizeof(LockInfoData)); + MemoryContextSwitchTo(oldcxt); + } + else if (processingVariable) { + if (IsTransactionState()) { + TransactionIdStore(GetCurrentTransactionId(), + &info->transactionIdData); + } + info->flags = 0x0; + return; /* prevent an infinite loop--still true? */ + } + else if (info->initialized) + { + /* ------------ + * If we've already initialized we're done. + * ------------ + */ + return; + } + + /* ---------------- + * initialize lockinfo.dbId and .relId appropriately + * ---------------- + */ + if (IsSharedSystemRelationName(relname)) + LRelIdAssign(&info->lRelId, InvalidOid, relationid); + else + LRelIdAssign(&info->lRelId, MyDatabaseId, relationid); + + /* ---------------- + * store the transaction id in the lockInfo field + * ---------------- + */ + if (processingVariable) + TransactionIdStore(AmiTransactionId, + &info->transactionIdData); + else if (IsTransactionState()) + TransactionIdStore(GetCurrentTransactionId(), + &info->transactionIdData); + else + StoreInvalidTransactionId(&(info->transactionIdData)); + + /* ---------------- + * initialize rest of lockinfo + * ---------------- + */ + info->flags = 0x0; + info->initialized = (bool)true; + relation->lockInfo = (Pointer) info; +} + +/* ---------------- + * RelationDiscardLockInfo + * ---------------- + */ +#ifdef LOCKDEBUG +#define LOCKDEBUG_20 \ +elog(DEBUG, "DiscardLockInfo: NULL relation->lockInfo") +#else +#define LOCKDEBUG_20 +#endif /* LOCKDEBUG */ + +/* + * RelationDiscardLockInfo -- + * Discards the lock information in a relation descriptor. + */ +void +RelationDiscardLockInfo(Relation relation) +{ + if (! 
LockInfoIsValid(relation->lockInfo)) { + LOCKDEBUG_20; + return; + } + + pfree(relation->lockInfo); + relation->lockInfo = NULL; +} + +/* + * RelationSetLockForDescriptorOpen -- + * Sets read locks for a relation descriptor. + */ +#ifdef LOCKDEBUGALL +#define LOCKDEBUGALL_30 \ +elog(DEBUG, "RelationSetLockForDescriptorOpen(%s[%d,%d]) called", \ + RelationGetRelationName(relation), lRelId.dbId, lRelId.relId) +#else +#define LOCKDEBUGALL_30 +#endif /* LOCKDEBUGALL*/ + +void +RelationSetLockForDescriptorOpen(Relation relation) +{ + /* ---------------- + * sanity checks + * ---------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) + return; + + LOCKDEBUGALL_30; + + /* ---------------- + * read lock catalog tuples which compose the relation descriptor + * XXX race condition? XXX For now, do nothing. + * ---------------- + */ +} + +/* ---------------- + * RelationSetLockForRead + * ---------------- + */ +#ifdef LOCKDEBUG +#define LOCKDEBUG_40 \ +elog(DEBUG, "RelationSetLockForRead(%s[%d,%d]) called", \ + RelationGetRelationName(relation), lRelId.dbId, lRelId.relId) +#else +#define LOCKDEBUG_40 +#endif /* LOCKDEBUG*/ + +/* + * RelationSetLockForRead -- + * Sets relation level read lock. + */ +void +RelationSetLockForRead(Relation relation) +{ + LockInfo linfo; + + /* ---------------- + * sanity checks + * ---------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) + return; + + LOCKDEBUG_40; + + /* ---------------- + * If we don't have lock info on the reln just go ahead and + * lock it without trying to short circuit the lock manager. + * ---------------- + */ + if (!LockInfoIsValid(relation->lockInfo)) + { + RelationInitLockInfo(relation); + linfo = (LockInfo) relation->lockInfo; + linfo->flags |= ReadRelationLock; + MultiLockReln(linfo, READ_LOCK); + return; + } + else + linfo = (LockInfo) relation->lockInfo; + + MultiLockReln(linfo, READ_LOCK); +} + +/* ---------------- + * RelationUnsetLockForRead + * ---------------- + */ +#ifdef LOCKDEBUG +#define LOCKDEBUG_50 \ +elog(DEBUG, "RelationUnsetLockForRead(%s[%d,%d]) called", \ + RelationGetRelationName(relation), lRelId.dbId, lRelId.relId) +#else +#define LOCKDEBUG_50 +#endif /* LOCKDEBUG*/ + +/* + * RelationUnsetLockForRead -- + * Unsets relation level read lock. + */ +void +RelationUnsetLockForRead(Relation relation) +{ + LockInfo linfo; + + /* ---------------- + * sanity check + * ---------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) + return; + + linfo = (LockInfo) relation->lockInfo; + + /* ---------------- + * If we don't have lock info on the reln just go ahead and + * release it. + * ---------------- + */ + if (!LockInfoIsValid(linfo)) + { + elog(WARN, + "Releasing a lock on %s with invalid lock information", + RelationGetRelationName(relation)); + } + + MultiReleaseReln(linfo, READ_LOCK); +} + +/* ---------------- + * RelationSetLockForWrite(relation) + * ---------------- + */ +#ifdef LOCKDEBUG +#define LOCKDEBUG_60 \ +elog(DEBUG, "RelationSetLockForWrite(%s[%d,%d]) called", \ + RelationGetRelationName(relation), lRelId.dbId, lRelId.relId) +#else +#define LOCKDEBUG_60 +#endif /* LOCKDEBUG*/ + +/* + * RelationSetLockForWrite -- + * Sets relation level write lock. 
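+ *
+ * As with RelationSetLockForRead() above, this initializes the
+ * relation's lock info if needed and then goes through the multi-level
+ * lock manager (MultiLockReln with WRITE_LOCK); the lock is released
+ * by RelationUnsetLockForWrite() or cleared at transaction end.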
+ */ +void +RelationSetLockForWrite(Relation relation) +{ + LockInfo linfo; + + /* ---------------- + * sanity checks + * ---------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) + return; + + LOCKDEBUG_60; + + /* ---------------- + * If we don't have lock info on the reln just go ahead and + * lock it without trying to short circuit the lock manager. + * ---------------- + */ + if (!LockInfoIsValid(relation->lockInfo)) + { + RelationInitLockInfo(relation); + linfo = (LockInfo) relation->lockInfo; + linfo->flags |= WriteRelationLock; + MultiLockReln(linfo, WRITE_LOCK); + return; + } + else + linfo = (LockInfo) relation->lockInfo; + + MultiLockReln(linfo, WRITE_LOCK); +} + +/* ---------------- + * RelationUnsetLockForWrite + * ---------------- + */ +#ifdef LOCKDEBUG +#define LOCKDEBUG_70 \ +elog(DEBUG, "RelationUnsetLockForWrite(%s[%d,%d]) called", \ + RelationGetRelationName(relation), lRelId.dbId, lRelId.relId); +#else +#define LOCKDEBUG_70 +#endif /* LOCKDEBUG */ + +/* + * RelationUnsetLockForWrite -- + * Unsets relation level write lock. + */ +void +RelationUnsetLockForWrite(Relation relation) +{ + LockInfo linfo; + + /* ---------------- + * sanity checks + * ---------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) { + return; + } + + linfo = (LockInfo) relation->lockInfo; + + if (!LockInfoIsValid(linfo)) + { + elog(WARN, + "Releasing a lock on %s with invalid lock information", + RelationGetRelationName(relation)); + } + + MultiReleaseReln(linfo, WRITE_LOCK); +} + +/* ---------------- + * RelationSetLockForTupleRead + * ---------------- + */ +#ifdef LOCKDEBUG +#define LOCKDEBUG_80 \ +elog(DEBUG, "RelationSetLockForTupleRead(%s[%d,%d], 0x%x) called", \ + RelationGetRelationName(relation), lRelId.dbId, lRelId.relId, \ + itemPointer) +#define LOCKDEBUG_81 \ + elog(DEBUG, "RelationSetLockForTupleRead() escalating"); +#else +#define LOCKDEBUG_80 +#define LOCKDEBUG_81 +#endif /* LOCKDEBUG */ + +/* + * RelationSetLockForTupleRead -- + * Sets tuple level read lock. + */ +void +RelationSetLockForTupleRead(Relation relation, ItemPointer itemPointer) +{ + LockInfo linfo; + TransactionId curXact; + + /* ---------------- + * sanity checks + * ---------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) + return; + + LOCKDEBUG_80; + + /* --------------------- + * If our lock info is invalid don't bother trying to short circuit + * the lock manager. 
+ * --------------------- + */ + if (!LockInfoIsValid(relation->lockInfo)) + { + RelationInitLockInfo(relation); + linfo = (LockInfo) relation->lockInfo; + linfo->flags |= + IntentReadRelationLock | + IntentReadPageLock | + ReadTupleLock; + MultiLockTuple(linfo, itemPointer, READ_LOCK); + return; + } + else + linfo = (LockInfo) relation->lockInfo; + + /* ---------------- + * no need to set a lower granularity lock + * ---------------- + */ + curXact = GetCurrentTransactionId(); + if ((linfo->flags & ReadRelationLock) && + TransactionIdEquals(curXact, linfo->transactionIdData)) + { + return; + } + + /* ---------------- + * If we don't already have a tuple lock this transaction + * ---------------- + */ + if (!( (linfo->flags & ReadTupleLock) && + TransactionIdEquals(curXact, linfo->transactionIdData) )) { + + linfo->flags |= + IntentReadRelationLock | + IntentReadPageLock | + ReadTupleLock; + + /* clear count */ + linfo->flags &= ~TupleLevelLockCountMask; + + } else { + if (TupleLevelLockLimit == (TupleLevelLockCountMask & + linfo->flags)) { + LOCKDEBUG_81; + + /* escalate */ + MultiLockReln(linfo, READ_LOCK); + + /* clear count */ + linfo->flags &= ~TupleLevelLockCountMask; + return; + } + + /* increment count */ + linfo->flags = + (linfo->flags & ~TupleLevelLockCountMask) | + (1 + (TupleLevelLockCountMask & linfo->flags)); + } + + TransactionIdStore(curXact, &linfo->transactionIdData); + + /* ---------------- + * Lock the tuple. + * ---------------- + */ + MultiLockTuple(linfo, itemPointer, READ_LOCK); +} + +/* ---------------- + * RelationSetLockForReadPage + * ---------------- + */ +#ifdef LOCKDEBUG +#define LOCKDEBUG_90 \ +elog(DEBUG, "RelationSetLockForReadPage(%s[%d,%d], @%d) called", \ + RelationGetRelationName(relation), lRelId.dbId, lRelId.relId, page); +#else +#define LOCKDEBUG_90 +#endif /* LOCKDEBUG*/ + +/* ---------------- + * RelationSetLockForWritePage + * ---------------- + */ +#ifdef LOCKDEBUG +#define LOCKDEBUG_100 \ +elog(DEBUG, "RelationSetLockForWritePage(%s[%d,%d], @%d) called", \ + RelationGetRelationName(relation), lRelId.dbId, lRelId.relId, page); +#else +#define LOCKDEBUG_100 +#endif /* LOCKDEBUG */ + +/* + * RelationSetLockForWritePage -- + * Sets write lock on a page. + */ +void +RelationSetLockForWritePage(Relation relation, + ItemPointer itemPointer) +{ + /* ---------------- + * sanity checks + * ---------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) + return; + + /* --------------- + * Make sure linfo is initialized + * --------------- + */ + if (!LockInfoIsValid(relation->lockInfo)) + RelationInitLockInfo(relation); + + /* ---------------- + * attempt to set lock + * ---------------- + */ + MultiLockPage((LockInfo) relation->lockInfo, itemPointer, WRITE_LOCK); +} + +/* ---------------- + * RelationUnsetLockForReadPage + * ---------------- + */ +#ifdef LOCKDEBUG +#define LOCKDEBUG_110 \ +elog(DEBUG, "RelationUnsetLockForReadPage(%s[%d,%d], @%d) called", \ + RelationGetRelationName(relation), lRelId.dbId, lRelId.relId, page) +#else +#define LOCKDEBUG_110 +#endif /* LOCKDEBUG */ + +/* ---------------- + * RelationUnsetLockForWritePage + * ---------------- + */ +#ifdef LOCKDEBUG +#define LOCKDEBUG_120 \ +elog(DEBUG, "RelationUnsetLockForWritePage(%s[%d,%d], @%d) called", \ + RelationGetRelationName(relation), lRelId.dbId, lRelId.relId, page) +#else +#define LOCKDEBUG_120 +#endif /* LOCKDEBUG */ + +/* + * Set a single level write page lock. Assumes that you already + * have a write intent lock on the relation. 
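+ *
+ * A typical sequence, sketched roughly after the nbtree/hash page code:
+ *
+ *	RelationSetWIntentLock(rel);
+ *	...
+ *	RelationSetSingleWLockPage(rel, itemPointer);
+ *	... modify the page ...
+ *	RelationUnsetSingleWLockPage(rel, itemPointer);
+ *	...
+ *	RelationUnsetWIntentLock(rel);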
+ */ +void +RelationSetSingleWLockPage(Relation relation, + ItemPointer itemPointer) +{ + + /* ---------------- + * sanity checks + * ---------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) + return; + + if (!LockInfoIsValid(relation->lockInfo)) + RelationInitLockInfo(relation); + + SingleLockPage((LockInfo)relation->lockInfo, itemPointer, WRITE_LOCK, !UNLOCK); +} + +/* + * Unset a single level write page lock + */ +void +RelationUnsetSingleWLockPage(Relation relation, + ItemPointer itemPointer) +{ + + /* ---------------- + * sanity checks + * ---------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) + return; + + if (!LockInfoIsValid(relation->lockInfo)) + elog(WARN, + "Releasing a lock on %s with invalid lock information", + RelationGetRelationName(relation)); + + SingleLockPage((LockInfo)relation->lockInfo, itemPointer, WRITE_LOCK, UNLOCK); +} + +/* + * Set a single level read page lock. Assumes you already have a read + * intent lock set on the relation. + */ +void +RelationSetSingleRLockPage(Relation relation, + ItemPointer itemPointer) +{ + + /* ---------------- + * sanity checks + * ---------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) + return; + + if (!LockInfoIsValid(relation->lockInfo)) + RelationInitLockInfo(relation); + + SingleLockPage((LockInfo)relation->lockInfo, itemPointer, READ_LOCK, !UNLOCK); +} + +/* + * Unset a single level read page lock. + */ +void +RelationUnsetSingleRLockPage(Relation relation, + ItemPointer itemPointer) +{ + + /* ---------------- + * sanity checks + * ---------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) + return; + + if (!LockInfoIsValid(relation->lockInfo)) + elog(WARN, + "Releasing a lock on %s with invalid lock information", + RelationGetRelationName(relation)); + + SingleLockPage((LockInfo)relation->lockInfo, itemPointer, READ_LOCK, UNLOCK); +} + +/* + * Set a read intent lock on a relation. + * + * Usually these are set in a multi-level table when you acquiring a + * page level lock. i.e. To acquire a lock on a page you first acquire + * an intent lock on the entire relation. Acquiring an intent lock along + * allows one to use the single level locking routines later. Good for + * index scans that do a lot of page level locking. + */ +void +RelationSetRIntentLock(Relation relation) +{ + /* ----------------- + * Sanity check + * ----------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) + return; + + if (!LockInfoIsValid(relation->lockInfo)) + RelationInitLockInfo(relation); + + SingleLockReln((LockInfo)relation->lockInfo, READ_LOCK+INTENT, !UNLOCK); +} + +/* + * Unset a read intent lock on a relation + */ +void +RelationUnsetRIntentLock(Relation relation) +{ + /* ----------------- + * Sanity check + * ----------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) + return; + + if (!LockInfoIsValid(relation->lockInfo)) + RelationInitLockInfo(relation); + + SingleLockReln((LockInfo)relation->lockInfo, READ_LOCK+INTENT, UNLOCK); +} + +/* + * Set a write intent lock on a relation. 
For a more complete explanation + * see RelationSetRIntentLock() + */ +void +RelationSetWIntentLock(Relation relation) +{ + /* ----------------- + * Sanity check + * ----------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) + return; + + if (!LockInfoIsValid(relation->lockInfo)) + RelationInitLockInfo(relation); + + SingleLockReln((LockInfo)relation->lockInfo, WRITE_LOCK+INTENT, !UNLOCK); +} + +/* + * Unset a write intent lock. + */ +void +RelationUnsetWIntentLock(Relation relation) +{ + /* ----------------- + * Sanity check + * ----------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) + return; + + if (!LockInfoIsValid(relation->lockInfo)) + RelationInitLockInfo(relation); + + SingleLockReln((LockInfo)relation->lockInfo, WRITE_LOCK+INTENT, UNLOCK); +} + +/* + * Extend locks are used primarily in tertiary storage devices such as + * a WORM disk jukebox. Sometimes need exclusive access to extend a + * file by a block. + */ +void +RelationSetLockForExtend(Relation relation) +{ + /* ----------------- + * Sanity check + * ----------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) + return; + + if (!LockInfoIsValid(relation->lockInfo)) + RelationInitLockInfo(relation); + + MultiLockReln((LockInfo) relation->lockInfo, EXTEND_LOCK); +} + +void +RelationUnsetLockForExtend(Relation relation) +{ + /* ----------------- + * Sanity check + * ----------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) + return; + + if (!LockInfoIsValid(relation->lockInfo)) + RelationInitLockInfo(relation); + + MultiReleaseReln((LockInfo) relation->lockInfo, EXTEND_LOCK); +} + +/* + * Create an LRelid --- Why not just pass in a pointer to the storage? + */ +void +LRelIdAssign(LRelId *lRelId, Oid dbId, Oid relId) +{ + lRelId->dbId = dbId; + lRelId->relId = relId; +} diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c new file mode 100644 index 00000000000..8df898a0068 --- /dev/null +++ b/src/backend/storage/lmgr/lock.c @@ -0,0 +1,1020 @@ +/*------------------------------------------------------------------------- + * + * lock.c-- + * simple lock acquisition + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/lmgr/lock.c,v 1.1.1.1 1996/07/09 06:21:56 scrappy Exp $ + * + * NOTES + * Outside modules can create a lock table and acquire/release + * locks. A lock table is a shared memory hash table. When + * a process tries to acquire a lock of a type that conflicts + * with existing locks, it is put to sleep using the routines + * in storage/lmgr/proc.c. + * + * Interface: + * + * LockAcquire(), LockRelease(), LockTabInit(). + * + * LockReplace() is called only within this module and by the + * lkchain module. It releases a lock without looking + * the lock up in the lock table. + * + * NOTE: This module is used to define new lock tables. The + * multi-level lock table (multi.c) used by the heap + * access methods calls these routines. See multi.c for + * examples showing how to use this interface. 
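+ *
+ *	In outline, a client of this module does roughly:
+ *
+ *	tableId = LockTabInit("my table", conflicts, prios, ntypes);
+ *	...
+ *	LockAcquire(tableId, &tag, lockt);	-- sleeps until granted
+ *	...
+ *	LockRelease(tableId, &tag, lockt);	-- or cleared at xact end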
+ * + *------------------------------------------------------------------------- + */ +#include <stdio.h> /* for sprintf() */ +#include "storage/shmem.h" +#include "storage/spin.h" +#include "storage/proc.h" +#include "storage/lock.h" +#include "utils/hsearch.h" +#include "utils/elog.h" +#include "utils/palloc.h" +#include "access/xact.h" + +/*#define LOCK_MGR_DEBUG*/ + +#ifndef LOCK_MGR_DEBUG + +#define LOCK_PRINT(where,tag,type) +#define LOCK_DUMP(where,lock,type) +#define XID_PRINT(where,xidentP) + +#else /* LOCK_MGR_DEBUG */ + +#define LOCK_PRINT(where,tag,type)\ + elog(NOTICE, "%s: rel (%d) dbid (%d) tid (%d,%d) type (%d)\n",where, \ + tag->relId, tag->dbId, \ + ( (tag->tupleId.ip_blkid.data[0] >= 0) ? \ + BlockIdGetBlockNumber(&tag->tupleId.ip_blkid) : -1 ), \ + tag->tupleId.ip_posid, \ + type); + +#define LOCK_DUMP(where,lock,type)\ + elog(NOTICE, "%s: rel (%d) dbid (%d) tid (%d,%d) nHolding (%d) holders (%d,%d,%d,%d,%d) type (%d)\n",where, \ + lock->tag.relId, lock->tag.dbId, \ + ((lock->tag.tupleId.ip_blkid.data[0] >= 0) ? \ + BlockIdGetBlockNumber(&lock->tag.tupleId.ip_blkid) : -1 ), \ + lock->tag.tupleId.ip_posid, \ + lock->nHolding,\ + lock->holders[1],\ + lock->holders[2],\ + lock->holders[3],\ + lock->holders[4],\ + lock->holders[5],\ + type); + +#define XID_PRINT(where,xidentP)\ + elog(NOTICE,\ + "%s:xid (%d) pid (%d) lock (%x) nHolding (%d) holders (%d,%d,%d,%d,%d)",\ + where,\ + xidentP->tag.xid,\ + xidentP->tag.pid,\ + xidentP->tag.lock,\ + xidentP->nHolding,\ + xidentP->holders[1],\ + xidentP->holders[2],\ + xidentP->holders[3],\ + xidentP->holders[4],\ + xidentP->holders[5]); + +#endif /* LOCK_MGR_DEBUG */ + +SPINLOCK LockMgrLock; /* in Shmem or created in CreateSpinlocks() */ + +/* This is to simplify/speed up some bit arithmetic */ + +static MASK BITS_OFF[MAX_LOCKTYPES]; +static MASK BITS_ON[MAX_LOCKTYPES]; + +/* ----------------- + * XXX Want to move this to this file + * ----------------- + */ +static bool LockingIsDisabled; + +/* ------------------ + * from storage/ipc/shmem.c + * ------------------ + */ +extern HTAB *ShmemInitHash(); + +/* ------------------- + * map from tableId to the lock table structure + * ------------------- + */ +static LOCKTAB *AllTables[MAX_TABLES]; + +/* ------------------- + * no zero-th table + * ------------------- + */ +static int NumTables = 1; + +/* ------------------- + * InitLocks -- Init the lock module. Create a private data + * structure for constructing conflict masks. + * ------------------- + */ +void +InitLocks() +{ + int i; + int bit; + + bit = 1; + /* ------------------- + * remember 0th locktype is invalid + * ------------------- + */ + for (i=0;i<MAX_LOCKTYPES;i++,bit <<= 1) + { + BITS_ON[i] = bit; + BITS_OFF[i] = ~bit; + } +} + +/* ------------------- + * LockDisable -- sets LockingIsDisabled flag to TRUE or FALSE. + * ------------------ + */ +void +LockDisable(int status) +{ + LockingIsDisabled = status; +} + + +/* + * LockTypeInit -- initialize the lock table's lock type + * structures + * + * Notes: just copying. Should only be called once. + */ +static void +LockTypeInit(LOCKTAB *ltable, + MASK *conflictsP, + int *prioP, + int ntypes) +{ + int i; + + ltable->ctl->nLockTypes = ntypes; + ntypes++; + for (i=0;i<ntypes;i++,prioP++,conflictsP++) + { + ltable->ctl->conflictTab[i] = *conflictsP; + ltable->ctl->prio[i] = *prioP; + } +} + +/* + * LockTabInit -- initialize a lock table structure + * + * Notes: + * (a) a lock table has four separate entries in the binding + * table. 
This is because every shared hash table and spinlock + * has its name stored in the binding table at its creation. It + * is wasteful, in this case, but not much space is involved. + * + */ +LockTableId +LockTabInit(char *tabName, + MASK *conflictsP, + int *prioP, + int ntypes) +{ + LOCKTAB *ltable; + char *shmemName; + HASHCTL info; + int hash_flags; + bool found; + int status = TRUE; + + if (ntypes > MAX_LOCKTYPES) + { + elog(NOTICE,"LockTabInit: too many lock types %d greater than %d", + ntypes,MAX_LOCKTYPES); + return(INVALID_TABLEID); + } + + if (NumTables > MAX_TABLES) + { + elog(NOTICE, + "LockTabInit: system limit of MAX_TABLES (%d) lock tables", + MAX_TABLES); + return(INVALID_TABLEID); + } + + /* allocate a string for the binding table lookup */ + shmemName = (char *) palloc((unsigned)(strlen(tabName)+32)); + if (! shmemName) + { + elog(NOTICE,"LockTabInit: couldn't malloc string %s \n",tabName); + return(INVALID_TABLEID); + } + + /* each lock table has a non-shared header */ + ltable = (LOCKTAB *) palloc((unsigned) sizeof(LOCKTAB)); + if (! ltable) + { + elog(NOTICE,"LockTabInit: couldn't malloc lock table %s\n",tabName); + (void) pfree (shmemName); + return(INVALID_TABLEID); + } + + /* ------------------------ + * find/acquire the spinlock for the table + * ------------------------ + */ + SpinAcquire(LockMgrLock); + + + /* ----------------------- + * allocate a control structure from shared memory or attach to it + * if it already exists. + * ----------------------- + */ + sprintf(shmemName,"%s (ctl)",tabName); + ltable->ctl = (LOCKCTL *) + ShmemInitStruct(shmemName,(unsigned)sizeof(LOCKCTL),&found); + + if (! ltable->ctl) + { + elog(FATAL,"LockTabInit: couldn't initialize %s",tabName); + status = FALSE; + } + + /* ---------------- + * we're first - initialize + * ---------------- + */ + if (! found) + { + memset(ltable->ctl, 0, sizeof(LOCKCTL)); + ltable->ctl->masterLock = LockMgrLock; + ltable->ctl->tableId = NumTables; + } + + /* -------------------- + * other modules refer to the lock table by a tableId + * -------------------- + */ + AllTables[NumTables] = ltable; + NumTables++; + Assert(NumTables <= MAX_TABLES); + + /* ---------------------- + * allocate a hash table for the lock tags. This is used + * to find the different locks. + * ---------------------- + */ + info.keysize = sizeof(LOCKTAG); + info.datasize = sizeof(LOCK); + info.hash = tag_hash; + hash_flags = (HASH_ELEM | HASH_FUNCTION); + + sprintf(shmemName,"%s (lock hash)",tabName); + ltable->lockHash = (HTAB *) ShmemInitHash(shmemName, + INIT_TABLE_SIZE,MAX_TABLE_SIZE, + &info,hash_flags); + + Assert( ltable->lockHash->hash == tag_hash); + if (! ltable->lockHash) + { + elog(FATAL,"LockTabInit: couldn't initialize %s",tabName); + status = FALSE; + } + + /* ------------------------- + * allocate an xid table. When different transactions hold + * the same lock, additional information must be saved (locks per tx). + * ------------------------- + */ + info.keysize = XID_TAGSIZE; + info.datasize = sizeof(XIDLookupEnt); + info.hash = tag_hash; + hash_flags = (HASH_ELEM | HASH_FUNCTION); + + sprintf(shmemName,"%s (xid hash)",tabName); + ltable->xidHash = (HTAB *) ShmemInitHash(shmemName, + INIT_TABLE_SIZE,MAX_TABLE_SIZE, + &info,hash_flags); + + if (! 
ltable->xidHash) + { + elog(FATAL,"LockTabInit: couldn't initialize %s",tabName); + status = FALSE; + } + + /* init ctl data structures */ + LockTypeInit(ltable, conflictsP, prioP, ntypes); + + SpinRelease(LockMgrLock); + + (void) pfree (shmemName); + + if (status) + return(ltable->ctl->tableId); + else + return(INVALID_TABLEID); +} + +/* + * LockTabRename -- allocate another tableId to the same + * lock table. + * + * NOTES: Both the lock module and the lock chain (lchain.c) + * module use table id's to distinguish between different + * kinds of locks. Short term and long term locks look + * the same to the lock table, but are handled differently + * by the lock chain manager. This function allows the + * client to use different tableIds when acquiring/releasing + * short term and long term locks. + */ +LockTableId +LockTabRename(LockTableId tableId) +{ + LockTableId newTableId; + + if (NumTables >= MAX_TABLES) + { + return(INVALID_TABLEID); + } + if (AllTables[tableId] == INVALID_TABLEID) + { + return(INVALID_TABLEID); + } + + /* other modules refer to the lock table by a tableId */ + newTableId = NumTables; + NumTables++; + + AllTables[newTableId] = AllTables[tableId]; + return(newTableId); +} + +/* + * LockAcquire -- Check for lock conflicts, sleep if conflict found, + * set lock if/when no conflicts. + * + * Returns: TRUE if parameters are correct, FALSE otherwise. + * + * Side Effects: The lock is always acquired. No way to abort + * a lock acquisition other than aborting the transaction. + * Lock is recorded in the lkchain. + */ +bool +LockAcquire(LockTableId tableId, LOCKTAG *lockName, LOCKT lockt) +{ + XIDLookupEnt *result,item; + HTAB *xidTable; + bool found; + LOCK *lock = NULL; + SPINLOCK masterLock; + LOCKTAB *ltable; + int status; + TransactionId myXid; + + Assert (tableId < NumTables); + ltable = AllTables[tableId]; + if (!ltable) + { + elog(NOTICE,"LockAcquire: bad lock table %d",tableId); + return (FALSE); + } + + if (LockingIsDisabled) + { + return(TRUE); + } + + LOCK_PRINT("Acquire",lockName,lockt); + masterLock = ltable->ctl->masterLock; + + SpinAcquire(masterLock); + + Assert( ltable->lockHash->hash == tag_hash); + lock = (LOCK *)hash_search(ltable->lockHash,(Pointer)lockName,HASH_ENTER,&found); + + if (! lock) + { + SpinRelease(masterLock); + elog(FATAL,"LockAcquire: lock table %d is corrupted",tableId); + return(FALSE); + } + + /* -------------------- + * if there was nothing else there, complete initialization + * -------------------- + */ + if (! found) + { + lock->mask = 0; + ProcQueueInit(&(lock->waitProcs)); + memset((char *)lock->holders, 0, sizeof(int)*MAX_LOCKTYPES); + memset((char *)lock->activeHolders, 0, sizeof(int)*MAX_LOCKTYPES); + lock->nHolding = 0; + lock->nActive = 0; + + Assert(BlockIdEquals(&(lock->tag.tupleId.ip_blkid), + &(lockName->tupleId.ip_blkid))); + + } + + /* ------------------ + * add an element to the lock queue so that we can clear the + * locks at end of transaction. + * ------------------ + */ + xidTable = ltable->xidHash; + myXid = GetCurrentTransactionId(); + + /* ------------------ + * Zero out all of the tag bytes (this clears the padding bytes for long + * word alignment and ensures hashing consistency). 
+ * ------------------ + */ + memset(&item, 0, XID_TAGSIZE); + TransactionIdStore(myXid, &item.tag.xid); + item.tag.lock = MAKE_OFFSET(lock); +#if 0 + item.tag.pid = MyPid; +#endif + + result = (XIDLookupEnt *)hash_search(xidTable, (Pointer)&item, HASH_ENTER, &found); + if (!result) + { + elog(NOTICE,"LockAcquire: xid table corrupted"); + return(STATUS_ERROR); + } + if (!found) + { + XID_PRINT("queueing XidEnt LockAcquire:", result); + ProcAddLock(&result->queue); + result->nHolding = 0; + memset((char *)result->holders, 0, sizeof(int)*MAX_LOCKTYPES); + } + + /* ---------------- + * lock->nholding tells us how many processes have _tried_ to + * acquire this lock, Regardless of whether they succeeded or + * failed in doing so. + * ---------------- + */ + lock->nHolding++; + lock->holders[lockt]++; + + /* -------------------- + * If I'm the only one holding a lock, then there + * cannot be a conflict. Need to subtract one from the + * lock's count since we just bumped the count up by 1 + * above. + * -------------------- + */ + if (result->nHolding == lock->nActive) + { + result->holders[lockt]++; + result->nHolding++; + GrantLock(lock, lockt); + SpinRelease(masterLock); + return(TRUE); + } + + Assert(result->nHolding <= lock->nActive); + + status = LockResolveConflicts(ltable, lock, lockt, myXid); + + if (status == STATUS_OK) + { + GrantLock(lock, lockt); + } + else if (status == STATUS_FOUND) + { + status = WaitOnLock(ltable, tableId, lock, lockt); + XID_PRINT("Someone granted me the lock", result); + } + + SpinRelease(masterLock); + + return(status == STATUS_OK); +} + +/* ---------------------------- + * LockResolveConflicts -- test for lock conflicts + * + * NOTES: + * Here's what makes this complicated: one transaction's + * locks don't conflict with one another. When many processes + * hold locks, each has to subtract off the other's locks when + * determining whether or not any new lock acquired conflicts with + * the old ones. + * + * For example, if I am already holding a WRITE_INTENT lock, + * there will not be a conflict with my own READ_LOCK. If I + * don't consider the intent lock when checking for conflicts, + * I find no conflict. + * ---------------------------- + */ +int +LockResolveConflicts(LOCKTAB *ltable, + LOCK *lock, + LOCKT lockt, + TransactionId xid) +{ + XIDLookupEnt *result,item; + int *myHolders; + int nLockTypes; + HTAB *xidTable; + bool found; + int bitmask; + int i,tmpMask; + + nLockTypes = ltable->ctl->nLockTypes; + xidTable = ltable->xidHash; + + /* --------------------- + * read my own statistics from the xid table. If there + * isn't an entry, then we'll just add one. + * + * Zero out the tag, this clears the padding bytes for long + * word alignment and ensures hashing consistency. + * ------------------ + */ + memset(&item, 0, XID_TAGSIZE); + TransactionIdStore(xid, &item.tag.xid); + item.tag.lock = MAKE_OFFSET(lock); +#if 0 + item.tag.pid = pid; +#endif + + if (! (result = (XIDLookupEnt *) + hash_search(xidTable, (Pointer)&item, HASH_ENTER, &found))) + { + elog(NOTICE,"LockResolveConflicts: xid table corrupted"); + return(STATUS_ERROR); + } + myHolders = result->holders; + + if (! found) + { + /* --------------- + * we're not holding any type of lock yet. Clear + * the lock stats. + * --------------- + */ + memset(result->holders, 0, nLockTypes * sizeof(*(lock->holders))); + result->nHolding = 0; + } + + /* ---------------------------- + * first check for global conflicts: If no locks conflict + * with mine, then I get the lock. 
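As a concrete walk-through of the two tests below (the counts are hypothetical; the lock type values are the ones multi.c uses, WRITE_LOCK = 1 and READ_LOCK = 2): suppose my transaction already holds a granted WRITE_LOCK on this object, another transaction holds a granted READ_LOCK, and I now request a READ_LOCK. lock->mask then has both the WRITE_LOCK and READ_LOCK bits set, and conflictTab[READ_LOCK] contains the WRITE_LOCK bit, so the global test reports a conflict. The per-type loop then rebuilds the mask from activeHolders[i] - myHolders[i]; my own write no longer contributes, only the other transaction's read survives, conflictTab[READ_LOCK] does not contain the READ_LOCK bit, and the lock is granted without sleeping.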
+ * + * Checking for conflict: lock->mask represents the types of + * currently held locks. conflictTable[lockt] has a bit + * set for each type of lock that conflicts with mine. Bitwise + * compare tells if there is a conflict. + * ---------------------------- + */ + if (! (ltable->ctl->conflictTab[lockt] & lock->mask)) + { + + result->holders[lockt]++; + result->nHolding++; + + XID_PRINT("Conflict Resolved: updated xid entry stats", result); + + return(STATUS_OK); + } + + /* ------------------------ + * Rats. Something conflicts. But it could still be my own + * lock. We have to construct a conflict mask + * that does not reflect our own locks. + * ------------------------ + */ + bitmask = 0; + tmpMask = 2; + for (i=1;i<=nLockTypes;i++, tmpMask <<= 1) + { + if (lock->activeHolders[i] - myHolders[i]) + { + bitmask |= tmpMask; + } + } + + /* ------------------------ + * now check again for conflicts. 'bitmask' describes the types + * of locks held by other processes. If one of these + * conflicts with the kind of lock that I want, there is a + * conflict and I have to sleep. + * ------------------------ + */ + if (! (ltable->ctl->conflictTab[lockt] & bitmask)) + { + + /* no conflict. Get the lock and go on */ + + result->holders[lockt]++; + result->nHolding++; + + XID_PRINT("Conflict Resolved: updated xid entry stats", result); + + return(STATUS_OK); + + } + + return(STATUS_FOUND); +} + +int +WaitOnLock(LOCKTAB *ltable, LockTableId tableId, LOCK *lock, LOCKT lockt) +{ + PROC_QUEUE *waitQueue = &(lock->waitProcs); + + int prio = ltable->ctl->prio[lockt]; + + /* the waitqueue is ordered by priority. I insert myself + * according to the priority of the lock I am acquiring. + * + * SYNC NOTE: I am assuming that the lock table spinlock + * is sufficient synchronization for this queue. That + * will not be true if/when people can be deleted from + * the queue by a SIGINT or something. + */ + LOCK_DUMP("WaitOnLock: sleeping on lock", lock, lockt); + if (ProcSleep(waitQueue, + ltable->ctl->masterLock, + lockt, + prio, + lock) != NO_ERROR) + { + /* ------------------- + * This could have happend as a result of a deadlock, see HandleDeadLock() + * Decrement the lock nHolding and holders fields as we are no longer + * waiting on this lock. + * ------------------- + */ + lock->nHolding--; + lock->holders[lockt]--; + LOCK_DUMP("WaitOnLock: aborting on lock", lock, lockt); + SpinRelease(ltable->ctl->masterLock); + elog(WARN,"WaitOnLock: error on wakeup - Aborting this transaction"); + } + + return(STATUS_OK); +} + +/* + * LockRelease -- look up 'lockName' in lock table 'tableId' and + * release it. + * + * Side Effects: if the lock no longer conflicts with the highest + * priority waiting process, that process is granted the lock + * and awoken. (We have to grant the lock here to avoid a + * race between the waking process and any new process to + * come along and request the lock). 
+ */ +bool +LockRelease(LockTableId tableId, LOCKTAG *lockName, LOCKT lockt) +{ + LOCK *lock = NULL; + SPINLOCK masterLock; + bool found; + LOCKTAB *ltable; + XIDLookupEnt *result,item; + HTAB *xidTable; + bool wakeupNeeded = true; + + Assert (tableId < NumTables); + ltable = AllTables[tableId]; + if (!ltable) { + elog(NOTICE, "ltable is null in LockRelease"); + return (FALSE); + } + + if (LockingIsDisabled) + { + return(TRUE); + } + + LOCK_PRINT("Release",lockName,lockt); + + masterLock = ltable->ctl->masterLock; + xidTable = ltable->xidHash; + + SpinAcquire(masterLock); + + Assert( ltable->lockHash->hash == tag_hash); + lock = (LOCK *) + hash_search(ltable->lockHash,(Pointer)lockName,HASH_FIND_SAVE,&found); + + /* let the caller print its own error message, too. + * Do not elog(WARN). + */ + if (! lock) + { + SpinRelease(masterLock); + elog(NOTICE,"LockRelease: locktable corrupted"); + return(FALSE); + } + + if (! found) + { + SpinRelease(masterLock); + elog(NOTICE,"LockRelease: locktable lookup failed, no lock"); + return(FALSE); + } + + Assert(lock->nHolding > 0); + + /* + * fix the general lock stats + */ + lock->nHolding--; + lock->holders[lockt]--; + lock->nActive--; + lock->activeHolders[lockt]--; + + Assert(lock->nActive >= 0); + + if (! lock->nHolding) + { + /* ------------------ + * if there's no one waiting in the queue, + * we just released the last lock. + * Delete it from the lock table. + * ------------------ + */ + Assert( ltable->lockHash->hash == tag_hash); + lock = (LOCK *) hash_search(ltable->lockHash, + (Pointer) &(lock->tag), + HASH_REMOVE_SAVED, + &found); + Assert(lock && found); + wakeupNeeded = false; + } + + /* ------------------ + * Zero out all of the tag bytes (this clears the padding bytes for long + * word alignment and ensures hashing consistency). + * ------------------ + */ + memset(&item, 0, XID_TAGSIZE); + + TransactionIdStore(GetCurrentTransactionId(), &item.tag.xid); + item.tag.lock = MAKE_OFFSET(lock); +#if 0 + item.tag.pid = MyPid; +#endif + + if (! ( result = (XIDLookupEnt *) hash_search(xidTable, + (Pointer)&item, + HASH_FIND_SAVE, + &found) ) + || !found) + { + SpinRelease(masterLock); + elog(NOTICE,"LockReplace: xid table corrupted"); + return(FALSE); + } + /* + * now check to see if I have any private locks. If I do, + * decrement the counts associated with them. + */ + result->holders[lockt]--; + result->nHolding--; + + XID_PRINT("LockRelease updated xid stats", result); + + /* + * If this was my last hold on this lock, delete my entry + * in the XID table. + */ + if (! result->nHolding) + { + if (result->queue.next != INVALID_OFFSET) + SHMQueueDelete(&result->queue); + if (! (result = (XIDLookupEnt *) + hash_search(xidTable, (Pointer)&item, HASH_REMOVE_SAVED, &found)) || + ! found) + { + SpinRelease(masterLock); + elog(NOTICE,"LockReplace: xid table corrupted"); + return(FALSE); + } + } + + /* -------------------------- + * If there are still active locks of the type I just released, no one + * should be woken up. Whoever is asleep will still conflict + * with the remaining locks. + * -------------------------- + */ + if (! (lock->activeHolders[lockt])) + { + /* change the conflict mask. No more of this lock type. */ + lock->mask &= BITS_OFF[lockt]; + } + + if (wakeupNeeded) + { + /* -------------------------- + * Wake the first waiting process and grant him the lock if it + * doesn't conflict. The woken process must record the lock + * himself. 
+ * -------------------------- + */ + (void) ProcLockWakeup(&(lock->waitProcs), (char *) ltable, (char *) lock); + } + + SpinRelease(masterLock); + return(TRUE); +} + +/* + * GrantLock -- udpate the lock data structure to show + * the new lock holder. + */ +void +GrantLock(LOCK *lock, LOCKT lockt) +{ + lock->nActive++; + lock->activeHolders[lockt]++; + lock->mask |= BITS_ON[lockt]; +} + +bool +LockReleaseAll(LockTableId tableId, SHM_QUEUE *lockQueue) +{ + PROC_QUEUE *waitQueue; + int done; + XIDLookupEnt *xidLook = NULL; + XIDLookupEnt *tmp = NULL; + SHMEM_OFFSET end = MAKE_OFFSET(lockQueue); + SPINLOCK masterLock; + LOCKTAB *ltable; + int i,nLockTypes; + LOCK *lock; + bool found; + + Assert (tableId < NumTables); + ltable = AllTables[tableId]; + if (!ltable) + return (FALSE); + + nLockTypes = ltable->ctl->nLockTypes; + masterLock = ltable->ctl->masterLock; + + if (SHMQueueEmpty(lockQueue)) + return TRUE; + + SHMQueueFirst(lockQueue,(Pointer*)&xidLook,&xidLook->queue); + + XID_PRINT("LockReleaseAll:", xidLook); + + SpinAcquire(masterLock); + for (;;) + { + /* --------------------------- + * XXX Here we assume the shared memory queue is circular and + * that we know its internal structure. Should have some sort of + * macros to allow one to walk it. mer 20 July 1991 + * --------------------------- + */ + done = (xidLook->queue.next == end); + lock = (LOCK *) MAKE_PTR(xidLook->tag.lock); + + LOCK_PRINT("ReleaseAll",(&lock->tag),0); + + /* ------------------ + * fix the general lock stats + * ------------------ + */ + if (lock->nHolding != xidLook->nHolding) + { + lock->nHolding -= xidLook->nHolding; + lock->nActive -= xidLook->nHolding; + Assert(lock->nActive >= 0); + for (i=1; i<=nLockTypes; i++) + { + lock->holders[i] -= xidLook->holders[i]; + lock->activeHolders[i] -= xidLook->holders[i]; + if (! lock->activeHolders[i]) + lock->mask &= BITS_OFF[i]; + } + } + else + { + /* -------------- + * set nHolding to zero so that we can garbage collect the lock + * down below... + * -------------- + */ + lock->nHolding = 0; + } + /* ---------------- + * always remove the xidLookup entry, we're done with it now + * ---------------- + */ + if ((! hash_search(ltable->xidHash, (Pointer)xidLook, HASH_REMOVE, &found)) + || !found) + { + SpinRelease(masterLock); + elog(NOTICE,"LockReplace: xid table corrupted"); + return(FALSE); + } + + if (! lock->nHolding) + { + /* -------------------- + * if there's no one waiting in the queue, we've just released + * the last lock. + * -------------------- + */ + + Assert( ltable->lockHash->hash == tag_hash); + lock = (LOCK *) + hash_search(ltable->lockHash,(Pointer)&(lock->tag),HASH_REMOVE, &found); + if ((! lock) || (!found)) + { + SpinRelease(masterLock); + elog(NOTICE,"LockReplace: cannot remove lock from HTAB"); + return(FALSE); + } + } + else + { + /* -------------------- + * Wake the first waiting process and grant him the lock if it + * doesn't conflict. The woken process must record the lock + * him/herself. 
+ * -------------------- + */ + waitQueue = &(lock->waitProcs); + (void) ProcLockWakeup(waitQueue, (char *) ltable, (char *) lock); + } + + if (done) + break; + SHMQueueFirst(&xidLook->queue,(Pointer*)&tmp,&tmp->queue); + xidLook = tmp; + } + SpinRelease(masterLock); + SHMQueueInit(lockQueue); + return TRUE; +} + +int +LockShmemSize() +{ + int size = 0; + int nLockBuckets, nLockSegs; + int nXidBuckets, nXidSegs; + + nLockBuckets = 1 << (int)my_log2((NLOCKENTS - 1) / DEF_FFACTOR + 1); + nLockSegs = 1 << (int)my_log2((nLockBuckets - 1) / DEF_SEGSIZE + 1); + + nXidBuckets = 1 << (int)my_log2((NLOCKS_PER_XACT-1) / DEF_FFACTOR + 1); + nXidSegs = 1 << (int)my_log2((nLockBuckets - 1) / DEF_SEGSIZE + 1); + + size += MAXALIGN(NBACKENDS * sizeof(PROC)); /* each MyProc */ + size += MAXALIGN(NBACKENDS * sizeof(LOCKCTL)); /* each ltable->ctl */ + size += MAXALIGN(sizeof(PROC_HDR)); /* ProcGlobal */ + + size += MAXALIGN(my_log2(NLOCKENTS) * sizeof(void *)); + size += MAXALIGN(sizeof(HHDR)); + size += nLockSegs * MAXALIGN(DEF_SEGSIZE * sizeof(SEGMENT)); + size += NLOCKENTS * /* XXX not multiple of BUCKET_ALLOC_INCR? */ + (MAXALIGN(sizeof(BUCKET_INDEX)) + + MAXALIGN(sizeof(LOCK))); /* contains hash key */ + + size += MAXALIGN(my_log2(NBACKENDS) * sizeof(void *)); + size += MAXALIGN(sizeof(HHDR)); + size += nXidSegs * MAXALIGN(DEF_SEGSIZE * sizeof(SEGMENT)); + size += NBACKENDS * /* XXX not multiple of BUCKET_ALLOC_INCR? */ + (MAXALIGN(sizeof(BUCKET_INDEX)) + + MAXALIGN(sizeof(XIDLookupEnt))); /* contains hash key */ + + return size; +} + +/* ----------------- + * Boolean function to determine current locking status + * ----------------- + */ +bool +LockingDisabled() +{ + return LockingIsDisabled; +} diff --git a/src/backend/storage/lmgr/multi.c b/src/backend/storage/lmgr/multi.c new file mode 100644 index 00000000000..c1702d18cb8 --- /dev/null +++ b/src/backend/storage/lmgr/multi.c @@ -0,0 +1,415 @@ +/*------------------------------------------------------------------------- + * + * multi.c-- + * multi level lock table manager + * + * Standard multi-level lock manager as per the Gray paper + * (at least, that is what it is supposed to be). We implement + * three levels -- RELN, PAGE, TUPLE. Tuple is actually TID + * a physical record pointer. It isn't an object id. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/lmgr/Attic/multi.c,v 1.1.1.1 1996/07/09 06:21:56 scrappy Exp $ + * + * NOTES: + * (1) The lock.c module assumes that the caller here is doing + * two phase locking. + * + *------------------------------------------------------------------------- + */ +#include <stdio.h> +#include <string.h> +#include "storage/lmgr.h" +#include "storage/multilev.h" + +#include "utils/rel.h" +#include "utils/elog.h" +#include "miscadmin.h" /* MyDatabaseId */ + + +/* + * INTENT indicates to higher level that a lower level lock has been + * set. For example, a write lock on a tuple conflicts with a write + * lock on a relation. This conflict is detected as a WRITE_INTENT/ + * WRITE conflict between the tuple's intent lock and the relation's + * write lock. 
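A concrete sketch of how that detection plays out (hypothetical backends A and B operating on the same relation; linfo is the callers' LockInfo and tid an ItemPointerData they already have in hand):

    MultiLockReln(linfo, WRITE_LOCK);          /* A: write-locks the whole relation */
    MultiLockTuple(linfo, &tid, WRITE_LOCK);   /* B: wants to update a single tuple */

B's request is expanded top-down by MultiAcquire(): it first asks for WRITE_LOCK+INTENT (i.e. WRITE_INTENT) at the relation level, and because MultiConflicts[WRITE_INTENT] below includes (1 << WRITE_LOCK), B blocks on A's relation-level write before any page- or tuple-level lock is taken.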
+ */ +static int MultiConflicts[] = { + (int)NULL, + /* All reads and writes at any level conflict with a write lock */ + (1 << WRITE_LOCK)|(1 << WRITE_INTENT)|(1 << READ_LOCK)|(1 << READ_INTENT), + /* read locks conflict with write locks at curr and lower levels */ + (1 << WRITE_LOCK)| (1 << WRITE_INTENT), + /* write intent locks */ + (1 << READ_LOCK) | (1 << WRITE_LOCK), + /* read intent locks*/ + (1 << WRITE_LOCK), + /* extend locks for archive storage manager conflict only w/extend locks */ + (1 << EXTEND_LOCK) +}; + +/* + * write locks have higher priority than read locks and extend locks. May + * want to treat INTENT locks differently. + */ +static int MultiPrios[] = { + (int)NULL, + 2, + 1, + 2, + 1, + 1 +}; + +/* + * Lock table identifier for this lock table. The multi-level + * lock table is ONE lock table, not three. + */ +LockTableId MultiTableId = (LockTableId)NULL; +LockTableId ShortTermTableId = (LockTableId)NULL; + +/* + * Create the lock table described by MultiConflicts and Multiprio. + */ +LockTableId +InitMultiLevelLockm() +{ + int tableId; + + /* ----------------------- + * If we're already initialized just return the table id. + * ----------------------- + */ + if (MultiTableId) + return MultiTableId; + + tableId = LockTabInit("LockTable", MultiConflicts, MultiPrios, 5); + MultiTableId = tableId; + if (! (MultiTableId)) { + elog(WARN,"InitMultiLockm: couldnt initialize lock table"); + } + /* ----------------------- + * No short term lock table for now. -Jeff 15 July 1991 + * + * ShortTermTableId = LockTabRename(tableId); + * if (! (ShortTermTableId)) { + * elog(WARN,"InitMultiLockm: couldnt rename lock table"); + * } + * ----------------------- + */ + return MultiTableId; +} + +/* + * MultiLockReln -- lock a relation + * + * Returns: TRUE if the lock can be set, FALSE otherwise. + */ +bool +MultiLockReln(LockInfo linfo, LOCKT lockt) +{ + LOCKTAG tag; + + /* LOCKTAG has two bytes of padding, unfortunately. The + * hash function will return miss if the padding bytes aren't + * zero'd. + */ + memset(&tag,0,sizeof(tag)); + tag.relId = linfo->lRelId.relId; + tag.dbId = linfo->lRelId.dbId; + return(MultiAcquire(MultiTableId, &tag, lockt, RELN_LEVEL)); +} + +/* + * MultiLockTuple -- Lock the TID associated with a tuple + * + * Returns: TRUE if lock is set, FALSE otherwise. + * + * Side Effects: causes intention level locks to be set + * at the page and relation level. + */ +bool +MultiLockTuple(LockInfo linfo, ItemPointer tidPtr, LOCKT lockt) +{ + LOCKTAG tag; + + /* LOCKTAG has two bytes of padding, unfortunately. The + * hash function will return miss if the padding bytes aren't + * zero'd. + */ + memset(&tag,0,sizeof(tag)); + + tag.relId = linfo->lRelId.relId; + tag.dbId = linfo->lRelId.dbId; + + /* not locking any valid Tuple, just the page */ + tag.tupleId = *tidPtr; + return(MultiAcquire(MultiTableId, &tag, lockt, TUPLE_LEVEL)); +} + +/* + * same as above at page level + */ +bool +MultiLockPage(LockInfo linfo, ItemPointer tidPtr, LOCKT lockt) +{ + LOCKTAG tag; + + /* LOCKTAG has two bytes of padding, unfortunately. The + * hash function will return miss if the padding bytes aren't + * zero'd. + */ + memset(&tag,0,sizeof(tag)); + + + /* ---------------------------- + * Now we want to set the page offset to be invalid + * and lock the block. There is some confusion here as to what + * a page is. In Postgres a page is an 8k block, however this + * block may be partitioned into many subpages which are sometimes + * also called pages. 
The term is overloaded, so don't be fooled + * when we say lock the page we mean the 8k block. -Jeff 16 July 1991 + * ---------------------------- + */ + tag.relId = linfo->lRelId.relId; + tag.dbId = linfo->lRelId.dbId; + BlockIdCopy(&(tag.tupleId.ip_blkid), &(tidPtr->ip_blkid)); + return(MultiAcquire(MultiTableId, &tag, lockt, PAGE_LEVEL)); +} + +/* + * MultiAcquire -- acquire multi level lock at requested level + * + * Returns: TRUE if lock is set, FALSE if not + * Side Effects: + */ +bool +MultiAcquire(LockTableId tableId, + LOCKTAG *tag, + LOCKT lockt, + LOCK_LEVEL level) +{ + LOCKT locks[N_LEVELS]; + int i,status; + LOCKTAG xxTag, *tmpTag = &xxTag; + int retStatus = TRUE; + + /* + * Three levels implemented. If we set a low level (e.g. Tuple) + * lock, we must set INTENT locks on the higher levels. The + * intent lock detects conflicts between the low level lock + * and an existing high level lock. For example, setting a + * write lock on a tuple in a relation is disallowed if there + * is an existing read lock on the entire relation. The + * write lock would set a WRITE + INTENT lock on the relation + * and that lock would conflict with the read. + */ + switch (level) { + case RELN_LEVEL: + locks[0] = lockt; + locks[1] = NO_LOCK; + locks[2] = NO_LOCK; + break; + case PAGE_LEVEL: + locks[0] = lockt + INTENT; + locks[1] = lockt; + locks[2] = NO_LOCK; + break; + case TUPLE_LEVEL: + locks[0] = lockt + INTENT; + locks[1] = lockt + INTENT; + locks[2] = lockt; + break; + default: + elog(WARN,"MultiAcquire: bad lock level"); + return(FALSE); + } + + /* + * construct a new tag as we go. Always loop through all levels, + * but if we arent' seting a low level lock, locks[i] is set to + * NO_LOCK for the lower levels. Always start from the highest + * level and go to the lowest level. + */ + memset(tmpTag,0,sizeof(*tmpTag)); + tmpTag->relId = tag->relId; + tmpTag->dbId = tag->dbId; + + for (i=0;i<N_LEVELS;i++) { + if (locks[i] != NO_LOCK) { + switch (i) { + case RELN_LEVEL: + /* ------------- + * Set the block # and offset to invalid + * ------------- + */ + BlockIdSet(&(tmpTag->tupleId.ip_blkid), InvalidBlockNumber); + tmpTag->tupleId.ip_posid = InvalidOffsetNumber; + break; + case PAGE_LEVEL: + /* ------------- + * Copy the block #, set the offset to invalid + * ------------- + */ + BlockIdCopy(&(tmpTag->tupleId.ip_blkid), + &(tag->tupleId.ip_blkid)); + tmpTag->tupleId.ip_posid = InvalidOffsetNumber; + break; + case TUPLE_LEVEL: + /* -------------- + * Copy the entire tuple id. + * -------------- + */ + ItemPointerCopy(&tmpTag->tupleId, &tag->tupleId); + break; + } + + status = LockAcquire(tableId, tmpTag, locks[i]); + if (! status) { + /* failed for some reason. Before returning we have + * to release all of the locks we just acquired. + * MultiRelease(xx,xx,xx, i) means release starting from + * the last level lock we successfully acquired + */ + retStatus = FALSE; + (void) MultiRelease(tableId, tag, lockt, i); + /* now leave the loop. Don't try for any more locks */ + break; + } + } + } + return(retStatus); +} + +/* ------------------ + * Release a page in the multi-level lock table + * ------------------ + */ +bool +MultiReleasePage(LockInfo linfo, ItemPointer tidPtr, LOCKT lockt) +{ + LOCKTAG tag; + + /* ------------------ + * LOCKTAG has two bytes of padding, unfortunately. The + * hash function will return miss if the padding bytes aren't + * zero'd. 
+ * ------------------ + */ + memset(&tag, 0,sizeof(LOCKTAG)); + + tag.relId = linfo->lRelId.relId; + tag.dbId = linfo->lRelId.dbId; + BlockIdCopy(&(tag.tupleId.ip_blkid), &(tidPtr->ip_blkid)); + + return (MultiRelease(MultiTableId, &tag, lockt, PAGE_LEVEL)); +} + +/* ------------------ + * Release a relation in the multi-level lock table + * ------------------ + */ +bool +MultiReleaseReln(LockInfo linfo, LOCKT lockt) +{ + LOCKTAG tag; + + /* ------------------ + * LOCKTAG has two bytes of padding, unfortunately. The + * hash function will return miss if the padding bytes aren't + * zero'd. + * ------------------ + */ + memset(&tag, 0, sizeof(LOCKTAG)); + tag.relId = linfo->lRelId.relId; + tag.dbId = linfo->lRelId.dbId; + + return (MultiRelease(MultiTableId, &tag, lockt, RELN_LEVEL)); +} + +/* + * MultiRelease -- release a multi-level lock + * + * Returns: TRUE if successful, FALSE otherwise. + */ +bool +MultiRelease(LockTableId tableId, + LOCKTAG *tag, + LOCKT lockt, + LOCK_LEVEL level) +{ + LOCKT locks[N_LEVELS]; + int i,status; + LOCKTAG xxTag, *tmpTag = &xxTag; + + /* + * same level scheme as MultiAcquire(). + */ + switch (level) { + case RELN_LEVEL: + locks[0] = lockt; + locks[1] = NO_LOCK; + locks[2] = NO_LOCK; + break; + case PAGE_LEVEL: + locks[0] = lockt + INTENT; + locks[1] = lockt; + locks[2] = NO_LOCK; + break; + case TUPLE_LEVEL: + locks[0] = lockt + INTENT; + locks[1] = lockt + INTENT; + locks[2] = lockt; + break; + default: + elog(WARN,"MultiRelease: bad lockt"); + } + + /* + * again, construct the tag on the fly. This time, however, + * we release the locks in the REVERSE order -- from lowest + * level to highest level. + * + * Must zero out the tag to set padding byes to zero and ensure + * hashing consistency. + */ + memset(tmpTag, 0, sizeof(*tmpTag)); + tmpTag->relId = tag->relId; + tmpTag->dbId = tag->dbId; + + for (i=(N_LEVELS-1); i>=0; i--) { + if (locks[i] != NO_LOCK) { + switch (i) { + case RELN_LEVEL: + /* ------------- + * Set the block # and offset to invalid + * ------------- + */ + BlockIdSet(&(tmpTag->tupleId.ip_blkid), InvalidBlockNumber); + tmpTag->tupleId.ip_posid = InvalidOffsetNumber; + break; + case PAGE_LEVEL: + /* ------------- + * Copy the block #, set the offset to invalid + * ------------- + */ + BlockIdCopy(&(tmpTag->tupleId.ip_blkid), + &(tag->tupleId.ip_blkid)); + tmpTag->tupleId.ip_posid = InvalidOffsetNumber; + break; + case TUPLE_LEVEL: + ItemPointerCopy(&tmpTag->tupleId, &tag->tupleId); + break; + } + status = LockRelease(tableId, tmpTag, locks[i]); + if (! status) { + elog(WARN,"MultiRelease: couldn't release after error"); + } + } + } + /* shouldn't reach here */ + return false; +} diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c new file mode 100644 index 00000000000..0955cdfc2f5 --- /dev/null +++ b/src/backend/storage/lmgr/proc.c @@ -0,0 +1,826 @@ +/*------------------------------------------------------------------------- + * + * proc.c-- + * routines to manage per-process shared memory data structure + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/lmgr/proc.c,v 1.1.1.1 1996/07/09 06:21:57 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +/* + * Each postgres backend gets one of these. We'll use it to + * clean up after the process should the process suddenly die. 
+ * + * + * Interface (a): + * ProcSleep(), ProcWakeup(), ProcWakeupNext(), + * ProcQueueAlloc() -- create a shm queue for sleeping processes + * ProcQueueInit() -- create a queue without allocing memory + * + * Locking and waiting for buffers can cause the backend to be + * put to sleep. Whoever releases the lock, etc. wakes the + * process up again (and gives it an error code so it knows + * whether it was awoken on an error condition). + * + * Interface (b): + * + * ProcReleaseLocks -- frees the locks associated with this process, + * ProcKill -- destroys the shared memory state (and locks) + * associated with the process. + * + * 5/15/91 -- removed the buffer pool based lock chain in favor + * of a shared memory lock chain. The write-protection is + * more expensive if the lock chain is in the buffer pool. + * The only reason I kept the lock chain in the buffer pool + * in the first place was to allow the lock table to grow larger + * than available shared memory and that isn't going to work + * without a lot of unimplemented support anyway. + * + * 4/7/95 -- instead of allocating a set of 1 semaphore per process, we + * allocate a semaphore from a set of PROC_NSEMS_PER_SET semaphores + * shared among backends (we keep a few sets of semaphores around). + * This is so that we can support more backends. (system-wide semaphore + * sets run out pretty fast.) -ay 4/95 + * + * $Header: /cvsroot/pgsql/src/backend/storage/lmgr/proc.c,v 1.1.1.1 1996/07/09 06:21:57 scrappy Exp $ + */ +#include <sys/time.h> +#ifndef WIN32 +#include <unistd.h> +#endif /* WIN32 */ +#include <string.h> +#include <sys/types.h> +#include "libpq/pqsignal.h" /* substitute for <signal.h> */ + +#if defined(PORTNAME_bsdi) +/* hacka, hacka, hacka (XXX) */ +union semun { + int val; /* value for SETVAL */ + struct semid_ds *buf; /* buffer for IPC_STAT & IPC_SET */ + ushort *array; /* array for GETALL & SETALL */ +}; +#endif + +#include "access/xact.h" +#include "utils/hsearch.h" +#include "utils/elog.h" + +#include "storage/buf.h" +#include "storage/lock.h" +#include "storage/shmem.h" +#include "storage/spin.h" +#include "storage/proc.h" + +/* + * timeout (in seconds) for resolving possible deadlock + */ +#ifndef DEADLOCK_TIMEOUT +#define DEADLOCK_TIMEOUT 60 +#endif + +/* -------------------- + * Spin lock for manipulating the shared process data structure: + * ProcGlobal.... Adding an extra spin lock seemed like the smallest + * hack to get around reading and updating this structure in shared + * memory. -mer 17 July 1991 + * -------------------- + */ +SPINLOCK ProcStructLock; + +/* + * For cleanup routines. Don't cleanup if the initialization + * has not happened. + */ +static bool ProcInitialized = FALSE; + +static PROC_HDR *ProcGlobal = NULL; + +PROC *MyProc = NULL; + +static void ProcKill(int exitStatus, int pid); +static void ProcGetNewSemKeyAndNum(IPCKey *key, int *semNum); +static void ProcFreeSem(IpcSemaphoreKey semKey, int semNum); +#if defined(PORTNAME_linux) +extern int HandleDeadLock(int); +#else +extern int HandleDeadLock(void); +#endif +/* + * InitProcGlobal - + * initializes the global process table. We put it here so that + * the postmaster can do this initialization. (ProcFreeAllSem needs + * to read this table on exiting the postmaster. If we have the first + * backend do this, starting up and killing the postmaster without + * starting any backends will be a problem.) 
+ */ +void +InitProcGlobal(IPCKey key) +{ + bool found = false; + + /* attach to the free list */ + ProcGlobal = (PROC_HDR *) + ShmemInitStruct("Proc Header",(unsigned)sizeof(PROC_HDR),&found); + + /* -------------------- + * We're the first - initialize. + * -------------------- + */ + if (! found) + { + int i; + + ProcGlobal->numProcs = 0; + ProcGlobal->freeProcs = INVALID_OFFSET; + ProcGlobal->currKey = IPCGetProcessSemaphoreInitKey(key); + for (i=0; i < MAX_PROC_SEMS/PROC_NSEMS_PER_SET; i++) + ProcGlobal->freeSemMap[i] = 0; + } +} + +/* ------------------------ + * InitProc -- create a per-process data structure for this process + * used by the lock manager on semaphore queues. + * ------------------------ + */ +void +InitProcess(IPCKey key) +{ + bool found = false; + int pid; + int semstat; + unsigned long location, myOffset; + + /* ------------------ + * Routine called if deadlock timer goes off. See ProcSleep() + * ------------------ + */ +#ifndef WIN32 + signal(SIGALRM, HandleDeadLock); +#endif /* WIN32 we'll have to figure out how to handle this later */ + + SpinAcquire(ProcStructLock); + + /* attach to the free list */ + ProcGlobal = (PROC_HDR *) + ShmemInitStruct("Proc Header",(unsigned)sizeof(PROC_HDR),&found); + if (!found) { + /* this should not happen. InitProcGlobal() is called before this. */ + elog(WARN, "InitProcess: Proc Header uninitialized"); + } + + if (MyProc != NULL) + { + SpinRelease(ProcStructLock); + elog(WARN,"ProcInit: you already exist"); + return; + } + + /* try to get a proc from the free list first */ + + myOffset = ProcGlobal->freeProcs; + + if (myOffset != INVALID_OFFSET) + { + MyProc = (PROC *) MAKE_PTR(myOffset); + ProcGlobal->freeProcs = MyProc->links.next; + } + else + { + /* have to allocate one. We can't use the normal binding + * table mechanism because the proc structure is stored + * by PID instead of by a global name (need to look it + * up by PID when we cleanup dead processes). + */ + + MyProc = (PROC *) ShmemAlloc((unsigned)sizeof(PROC)); + if (! MyProc) + { + SpinRelease(ProcStructLock); + elog (FATAL,"cannot create new proc: out of memory"); + } + + /* this cannot be initialized until after the buffer pool */ + SHMQueueInit(&(MyProc->lockQueue)); + MyProc->procId = ProcGlobal->numProcs; + ProcGlobal->numProcs++; + } + + /* + * zero out the spin lock counts and set the sLocks field for + * ProcStructLock to 1 as we have acquired this spinlock above but + * didn't record it since we didn't have MyProc until now. + */ + memset(MyProc->sLocks, 0, sizeof(MyProc->sLocks)); + MyProc->sLocks[ProcStructLock] = 1; + + + if (IsUnderPostmaster) { + IPCKey semKey; + int semNum; + int semId; + union semun semun; + + ProcGetNewSemKeyAndNum(&semKey, &semNum); + + semId = IpcSemaphoreCreate(semKey, + PROC_NSEMS_PER_SET, + IPCProtection, + IpcSemaphoreDefaultStartValue, + 0, + &semstat); + /* + * we might be reusing a semaphore that belongs to a dead + * backend. So be careful and reinitialize its value here. + */ + semun.val = IpcSemaphoreDefaultStartValue; + semctl(semId, semNum, SETVAL, semun); + + IpcSemaphoreLock(semId, semNum, IpcExclusiveLock); + MyProc->sem.semId = semId; + MyProc->sem.semNum = semNum; + MyProc->sem.semKey = semKey; + } else { + MyProc->sem.semId = -1; + } + + /* ---------------------- + * Release the lock. + * ---------------------- + */ + SpinRelease(ProcStructLock); + + MyProc->pid = 0; +#if 0 + MyProc->pid = MyPid; +#endif + + /* ---------------- + * Start keeping spin lock stats from here on. 
Any botch before + * this initialization is forever botched + * ---------------- + */ + memset(MyProc->sLocks, 0, MAX_SPINS*sizeof(*MyProc->sLocks)); + + /* ------------------------- + * Install ourselves in the binding table. The name to + * use is determined by the OS-assigned process id. That + * allows the cleanup process to find us after any untimely + * exit. + * ------------------------- + */ + pid = getpid(); + location = MAKE_OFFSET(MyProc); + if ((! ShmemPIDLookup(pid,&location)) || (location != MAKE_OFFSET(MyProc))) + { + elog(FATAL,"InitProc: ShmemPID table broken"); + } + + MyProc->errType = NO_ERROR; + SHMQueueElemInit(&(MyProc->links)); + + on_exitpg(ProcKill, (caddr_t)pid); + + ProcInitialized = TRUE; +} + +/* + * ProcReleaseLocks() -- release all locks associated with this process + * + */ +void +ProcReleaseLocks() +{ + if (!MyProc) + return; + LockReleaseAll(1,&MyProc->lockQueue); +} + +/* + * ProcRemove - + * used by the postmaster to clean up the global tables. This also frees + * up the semaphore used for the lmgr of the process. (We have to do + * this is the postmaster instead of doing a IpcSemaphoreKill on exiting + * the process because the semaphore set is shared among backends and + * we don't want to remove other's semaphores on exit.) + */ +bool +ProcRemove(int pid) +{ + SHMEM_OFFSET location; + PROC *proc; + + location = INVALID_OFFSET; + + location = ShmemPIDDestroy(pid); + if (location == INVALID_OFFSET) + return(FALSE); + proc = (PROC *) MAKE_PTR(location); + + SpinAcquire(ProcStructLock); + + ProcFreeSem(proc->sem.semKey, proc->sem.semNum); + + proc->links.next = ProcGlobal->freeProcs; + ProcGlobal->freeProcs = MAKE_OFFSET(proc); + + SpinRelease(ProcStructLock); + + return(TRUE); +} + +/* + * ProcKill() -- Destroy the per-proc data structure for + * this process. Release any of its held spin locks. + */ +static void +ProcKill(int exitStatus, int pid) +{ + PROC *proc; + SHMEM_OFFSET location; + + /* -------------------- + * If this is a FATAL exit the postmaster will have to kill all the + * existing backends and reinitialize shared memory. So all we don't + * need to do anything here. + * -------------------- + */ + if (exitStatus != 0) + return; + + if (! pid) + { + pid = getpid(); + } + + ShmemPIDLookup(pid,&location); + if (location == INVALID_OFFSET) + return; + + proc = (PROC *) MAKE_PTR(location); + + if (proc != MyProc) { + Assert( pid != getpid() ); + } else + MyProc = NULL; + + /* --------------- + * Assume one lock table. + * --------------- + */ + ProcReleaseSpins(proc); + LockReleaseAll(1,&proc->lockQueue); + + /* ---------------- + * get off the wait queue + * ---------------- + */ + LockLockTable(); + if (proc->links.next != INVALID_OFFSET) { + Assert(proc->waitLock->waitProcs.size > 0); + SHMQueueDelete(&(proc->links)); + --proc->waitLock->waitProcs.size; + } + SHMQueueElemInit(&(proc->links)); + UnlockLockTable(); + + return; +} + +/* + * ProcQueue package: routines for putting processes to sleep + * and waking them up + */ + +/* + * ProcQueueAlloc -- alloc/attach to a shared memory process queue + * + * Returns: a pointer to the queue or NULL + * Side Effects: Initializes the queue if we allocated one + */ +PROC_QUEUE * +ProcQueueAlloc(char *name) +{ + bool found; + PROC_QUEUE *queue = (PROC_QUEUE *) + ShmemInitStruct(name,(unsigned)sizeof(PROC_QUEUE),&found); + + if (! queue) + { + return(NULL); + } + if (! 
found) + { + ProcQueueInit(queue); + } + return(queue); +} + +/* + * ProcQueueInit -- initialize a shared memory process queue + */ +void +ProcQueueInit(PROC_QUEUE *queue) +{ + SHMQueueInit(&(queue->links)); + queue->size = 0; +} + + + +/* + * ProcSleep -- put a process to sleep + * + * P() on the semaphore should put us to sleep. The process + * semaphore is cleared by default, so the first time we try + * to acquire it, we sleep. + * + * ASSUME: that no one will fiddle with the queue until after + * we release the spin lock. + * + * NOTES: The process queue is now a priority queue for locking. + */ +int +ProcSleep(PROC_QUEUE *queue, + SPINLOCK spinlock, + int token, + int prio, + LOCK *lock) +{ + int i; + PROC *proc; +#ifndef WIN32 /* figure this out later */ + struct itimerval timeval, dummy; +#endif /* WIN32 */ + + proc = (PROC *) MAKE_PTR(queue->links.prev); + for (i=0;i<queue->size;i++) + { + if (proc->prio < prio) + proc = (PROC *) MAKE_PTR(proc->links.prev); + else + break; + } + + MyProc->token = token; + MyProc->waitLock = lock; + + /* ------------------- + * currently, we only need this for the ProcWakeup routines + * ------------------- + */ + TransactionIdStore((TransactionId) GetCurrentTransactionId(), &MyProc->xid); + + /* ------------------- + * assume that these two operations are atomic (because + * of the spinlock). + * ------------------- + */ + SHMQueueInsertTL(&(proc->links),&(MyProc->links)); + queue->size++; + + SpinRelease(spinlock); + + /* -------------- + * Postgres does not have any deadlock detection code and for this + * reason we must set a timer to wake up the process in the event of + * a deadlock. For now the timer is set for 1 minute and we assume that + * any process which sleeps for this amount of time is deadlocked and will + * receive a SIGALRM signal. The handler should release the processes + * semaphore and abort the current transaction. + * + * Need to zero out struct to set the interval and the micro seconds fields + * to 0. + * -------------- + */ +#ifndef WIN32 + memset(&timeval, 0, sizeof(struct itimerval)); + timeval.it_value.tv_sec = DEADLOCK_TIMEOUT; + + if (setitimer(ITIMER_REAL, &timeval, &dummy)) + elog(FATAL, "ProcSleep: Unable to set timer for process wakeup"); +#endif /* WIN32 */ + + /* -------------- + * if someone wakes us between SpinRelease and IpcSemaphoreLock, + * IpcSemaphoreLock will not block. The wakeup is "saved" by + * the semaphore implementation. + * -------------- + */ + IpcSemaphoreLock(MyProc->sem.semId, MyProc->sem.semNum, IpcExclusiveLock); + + /* --------------- + * We were awoken before a timeout - now disable the timer + * --------------- + */ +#ifndef WIN32 + timeval.it_value.tv_sec = 0; + + + if (setitimer(ITIMER_REAL, &timeval, &dummy)) + elog(FATAL, "ProcSleep: Unable to diable timer for process wakeup"); +#endif /* WIN32 */ + + /* ---------------- + * We were assumed to be in a critical section when we went + * to sleep. + * ---------------- + */ + SpinAcquire(spinlock); + + return(MyProc->errType); +} + + +/* + * ProcWakeup -- wake up a process by releasing its private semaphore. + * + * remove the process from the wait queue and set its links invalid. + * RETURN: the next process in the wait queue. 
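A rough sketch of the caller's side of this sleep/wakeup handshake, modeled on WaitOnLock() in lock.c (identifiers as in that routine; error handling elided into a comment):

    /* the lock table's masterLock spinlock is already held here */
    if (ProcSleep(&lock->waitProcs,          /* queue to sleep on            */
                  ltable->ctl->masterLock,   /* released while we sleep      */
                  lockt,                     /* token recorded in MyProc     */
                  ltable->ctl->prio[lockt],  /* where to insert in the queue */
                  lock) != NO_ERROR)
    {
        /* woken with an error, e.g. by the deadlock timeout: abort */
    }
    /* masterLock has been re-acquired by ProcSleep() before it returns */

The releasing side runs ProcLockWakeup(), which grants the lock to each compatible waiter and then calls ProcWakeup() to release that waiter's private semaphore.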
+ */ +PROC * +ProcWakeup(PROC *proc, int errType) +{ + PROC *retProc; + /* assume that spinlock has been acquired */ + + if (proc->links.prev == INVALID_OFFSET || + proc->links.next == INVALID_OFFSET) + return((PROC *) NULL); + + retProc = (PROC *) MAKE_PTR(proc->links.prev); + + /* you have to update waitLock->waitProcs.size yourself */ + SHMQueueDelete(&(proc->links)); + SHMQueueElemInit(&(proc->links)); + + proc->errType = errType; + + IpcSemaphoreUnlock(proc->sem.semId, proc->sem.semNum, IpcExclusiveLock); + + return retProc; +} + + +/* + * ProcGetId -- + */ +int +ProcGetId() +{ + return( MyProc->procId ); +} + +/* + * ProcLockWakeup -- routine for waking up processes when a lock is + * released. + */ +int +ProcLockWakeup(PROC_QUEUE *queue, char *ltable, char *lock) +{ + PROC *proc; + int count; + + if (! queue->size) + return(STATUS_NOT_FOUND); + + proc = (PROC *) MAKE_PTR(queue->links.prev); + count = 0; + while ((LockResolveConflicts ((LOCKTAB *) ltable, + (LOCK *) lock, + proc->token, + proc->xid) == STATUS_OK)) + { + /* there was a waiting process, grant it the lock before waking it + * up. This will prevent another process from seizing the lock + * between the time we release the lock master (spinlock) and + * the time that the awoken process begins executing again. + */ + GrantLock((LOCK *) lock, proc->token); + queue->size--; + + /* + * ProcWakeup removes proc from the lock waiting process queue and + * returns the next proc in chain. If a writer just dropped + * its lock and there are several waiting readers, wake them all up. + */ + proc = ProcWakeup(proc, NO_ERROR); + + count++; + if (!proc || queue->size == 0) + break; + } + + if (count) + return(STATUS_OK); + else + /* Something is still blocking us. May have deadlocked. */ + return(STATUS_NOT_FOUND); +} + +void +ProcAddLock(SHM_QUEUE *elem) +{ + SHMQueueInsertTL(&MyProc->lockQueue,elem); +} + +/* -------------------- + * We only get to this routine if we got SIGALRM after DEADLOCK_TIMEOUT + * while waiting for a lock to be released by some other process. After + * the one minute deadline we assume we have a deadlock and must abort + * this transaction. We must also indicate that I'm no longer waiting + * on a lock so that other processes don't try to wake me up and screw + * up my semaphore. + * -------------------- + */ +int +#if defined(PORTNAME_linux) +HandleDeadLock(int i) +#else +HandleDeadLock() +#endif +{ + LOCK *lock; + int size; + + LockLockTable(); + + /* --------------------- + * Check to see if we've been awoken by anyone in the interim. + * + * If we have we can return and resume our transaction -- happy day. + * Before we are awoken the process releasing the lock grants it to + * us so we know that we don't have to wait anymore. + * + * Damn these names are LONG! -mer + * --------------------- + */ + if (IpcSemaphoreGetCount(MyProc->sem.semId, MyProc->sem.semNum) == + IpcSemaphoreDefaultStartValue) { + UnlockLockTable(); + return 1; + } + + /* + * you would think this would be unnecessary, but... + * + * this also means we've been removed already. in some ports + * (e.g., sparc and aix) the semop(2) implementation is such that + * we can actually end up in this handler after someone has removed + * us from the queue and bopped the semaphore *but the test above + * fails to detect the semaphore update* (presumably something weird + * having to do with the order in which the semaphore wakeup signal + * and SIGALRM get handled). 
+ */ + if (MyProc->links.prev == INVALID_OFFSET || + MyProc->links.next == INVALID_OFFSET) { + UnlockLockTable(); + return(1); + } + + lock = MyProc->waitLock; + size = lock->waitProcs.size; /* so we can look at this in the core */ + + /* ------------------------ + * Get this process off the lock's wait queue + * ------------------------ + */ + Assert(lock->waitProcs.size > 0); + --lock->waitProcs.size; + SHMQueueDelete(&(MyProc->links)); + SHMQueueElemInit(&(MyProc->links)); + + /* ------------------ + * Unlock my semaphore so that the count is right for next time. + * I was awoken by a signal, not by someone unlocking my semaphore. + * ------------------ + */ + IpcSemaphoreUnlock(MyProc->sem.semId, MyProc->sem.semNum, IpcExclusiveLock); + + /* ------------- + * Set MyProc->errType to STATUS_ERROR so that we abort after + * returning from this handler. + * ------------- + */ + MyProc->errType = STATUS_ERROR; + + /* + * if this doesn't follow the IpcSemaphoreUnlock then we get lock + * table corruption ("LockReplace: xid table corrupted") due to + * race conditions. i don't claim to understand this... + */ + UnlockLockTable(); + + elog(NOTICE, "Timeout -- possible deadlock"); + return 0; +} + +void +ProcReleaseSpins(PROC *proc) +{ + int i; + + if (!proc) + proc = MyProc; + + if (!proc) + return; + for (i=0; i < (int)MAX_SPINS; i++) + { + if (proc->sLocks[i]) + { + Assert(proc->sLocks[i] == 1); + SpinRelease(i); + } + } +} + +/***************************************************************************** + * + *****************************************************************************/ + +/* + * ProcGetNewSemKeyAndNum - + * scan the free semaphore bitmap and allocate a single semaphore from + * a semaphore set. (If the semaphore set doesn't exist yet, + * IpcSemaphoreCreate will create it. Otherwise, we use the existing + * semaphore set.) + */ +static void +ProcGetNewSemKeyAndNum(IPCKey *key, int *semNum) +{ + int i; + int32 *freeSemMap = ProcGlobal->freeSemMap; + unsigned int fullmask; + + /* + * we hold ProcStructLock when entering this routine. We scan through + * the bitmap to look for a free semaphore. + */ + fullmask = ~0 >> (32 - PROC_NSEMS_PER_SET); + for(i=0; i < MAX_PROC_SEMS/PROC_NSEMS_PER_SET; i++) { + int mask = 1; + int j; + + if (freeSemMap[i] == fullmask) + continue; /* none free for this set */ + + for(j = 0; j < PROC_NSEMS_PER_SET; j++) { + if ((freeSemMap[i] & mask) == 0) { + /* + * a free semaphore found. Mark it as allocated. + */ + freeSemMap[i] |= mask; + + *key = ProcGlobal->currKey + i; + *semNum = j; + return; + } + mask <<= 1; + } + } + + /* if we reach here, all the semaphores are in use. */ + elog(WARN, "InitProc: cannot allocate a free semaphore"); +} + +/* + * ProcFreeSem - + * free up our semaphore in the semaphore set. If we're the last one + * in the set, also remove the semaphore set. + */ +static void +ProcFreeSem(IpcSemaphoreKey semKey, int semNum) +{ + int mask; + int i; + int32 *freeSemMap = ProcGlobal->freeSemMap; + + i = semKey - ProcGlobal->currKey; + mask = ~(1 << semNum); + freeSemMap[i] &= mask; + + if (freeSemMap[i]==0) + IpcSemaphoreKill(semKey); +} + +/* + * ProcFreeAllSemaphores - + * on exiting the postmaster, we free up all the semaphores allocated + * to the lmgrs of the backends. 
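A small worked example of the freeSemMap bookkeeping above (the values are hypothetical): each int32 in freeSemMap describes one semaphore set, one bit per semaphore, with a set bit meaning "allocated". If freeSemMap[0] == 0x0007, semaphores 0 through 2 of set 0 are in use; ProcGetNewSemKeyAndNum() finds bit 3 clear, sets it (the word becomes 0x000f), and returns key = ProcGlobal->currKey + 0 and semNum = 3. A later ProcFreeSem(key, 3) clears that bit again, and only when the whole word drops back to zero is the set itself removed with IpcSemaphoreKill().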
+ */ +void +ProcFreeAllSemaphores() +{ + int i; + int32 *freeSemMap = ProcGlobal->freeSemMap; + + for(i=0; i < MAX_PROC_SEMS/PROC_NSEMS_PER_SET; i++) { + if (freeSemMap[i]!=0) + IpcSemaphoreKill(ProcGlobal->currKey + i); + } +} diff --git a/src/backend/storage/lmgr/single.c b/src/backend/storage/lmgr/single.c new file mode 100644 index 00000000000..8d41ea38bb6 --- /dev/null +++ b/src/backend/storage/lmgr/single.c @@ -0,0 +1,86 @@ +/*------------------------------------------------------------------------- + * + * single.c-- + * set single locks in the multi-level lock hierarchy + * + * Sometimes we don't want to set all levels of the multi-level + * lock hierarchy at once. This allows us to set and release + * one level at a time. It's useful in index scans when + * you can set an intent lock at the beginning and thereafter + * only set page locks. Tends to speed things up. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/lmgr/Attic/single.c,v 1.1.1.1 1996/07/09 06:21:57 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include <string.h> +#include "storage/lmgr.h" /* where the declarations go */ +#include "storage/lock.h" +#include "storage/multilev.h" +#include "utils/rel.h" + +/* + * SingleLockReln -- lock a relation + * + * Returns: TRUE if the lock can be set, FALSE otherwise. + */ +bool +SingleLockReln(LockInfo linfo, LOCKT lockt, int action) +{ + LOCKTAG tag; + + /* + * LOCKTAG has two bytes of padding, unfortunately. The + * hash function will return miss if the padding bytes aren't + * zero'd. + */ + memset(&tag,0,sizeof(tag)); + tag.relId = linfo->lRelId.relId; + tag.dbId = linfo->lRelId.dbId; + BlockIdSet(&(tag.tupleId.ip_blkid), InvalidBlockNumber); + tag.tupleId.ip_posid = InvalidOffsetNumber; + + if (action == UNLOCK) + return(LockRelease(MultiTableId, &tag, lockt)); + else + return(LockAcquire(MultiTableId, &tag, lockt)); +} + +/* + * SingleLockPage -- use multi-level lock table, but lock + * only at the page level. + * + * Assumes that an INTENT lock has already been set in the + * multi-level lock table. + * + */ +bool +SingleLockPage(LockInfo linfo, + ItemPointer tidPtr, + LOCKT lockt, + int action) +{ + LOCKTAG tag; + + /* + * LOCKTAG has two bytes of padding, unfortunately. The + * hash function will return miss if the padding bytes aren't + * zero'd. 
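A rough sketch of the index-scan pattern described in this file's header comment (an illustration, not code from the original sources; it assumes the RelationSetRIntentLock()/RelationUnsetRIntentLock() entry points from lmgr.c, and indexrel/tid are whatever the scan already has in hand):

    /* once, at the start of the scan: relation-level read intent */
    RelationSetRIntentLock(indexrel);

    /* for each leaf page visited (tid points into that page) */
    SingleLockPage((LockInfo) indexrel->lockInfo, &tid, READ_LOCK, !UNLOCK);
    /* ... examine the tuples on the page ... */
    SingleLockPage((LockInfo) indexrel->lockInfo, &tid, READ_LOCK, UNLOCK);

    /* once, when the scan is done */
    RelationUnsetRIntentLock(indexrel);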
+ */ + memset(&tag,0,sizeof(tag)); + tag.relId = linfo->lRelId.relId; + tag.dbId = linfo->lRelId.dbId; + BlockIdCopy(&(tag.tupleId.ip_blkid), &(tidPtr->ip_blkid)); + tag.tupleId.ip_posid = InvalidOffsetNumber; + + + if (action == UNLOCK) + return(LockRelease(MultiTableId, &tag, lockt)); + else + return(LockAcquire(MultiTableId, &tag, lockt)); +} + diff --git a/src/backend/storage/lock.h b/src/backend/storage/lock.h new file mode 100644 index 00000000000..df490e76512 --- /dev/null +++ b/src/backend/storage/lock.h @@ -0,0 +1,218 @@ +/*------------------------------------------------------------------------- + * + * lock.h-- + * + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: lock.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef LOCK_H_ +#define LOCK_H_ + +#include "postgres.h" +#include "storage/itemptr.h" +#include "storage/shmem.h" +#include "storage/spin.h" +#include "storage/backendid.h" +#include "utils/hsearch.h" + +extern SPINLOCK LockMgrLock; +typedef int MASK; + +#define INIT_TABLE_SIZE 100 +#define MAX_TABLE_SIZE 1000 + + +/* ---------------------- + * The following defines are used to estimate how much shared + * memory the lock manager is going to require. + * + * NBACKENDS - The number of concurrently running backends + * NLOCKS_PER_XACT - The number of unique locks acquired in a transaction + * NLOCKENTS - The maximum number of lock entries in the lock table. + * ---------------------- + */ +#define NBACKENDS 50 +#define NLOCKS_PER_XACT 40 +#define NLOCKENTS NLOCKS_PER_XACT*NBACKENDS + +typedef int LOCK_TYPE; +typedef int LOCKT; +typedef int LockTableId; + +/* MAX_LOCKTYPES cannot be larger than the bits in MASK */ +#define MAX_LOCKTYPES 6 + +/* + * MAX_TABLES corresponds to the number of spin locks allocated in + * CreateSpinLocks() or the number of shared memory locations allocated + * for lock table spin locks in the case of machines with TAS instructions. + */ +#define MAX_TABLES 2 + +#define INVALID_TABLEID 0 + +/*typedef struct LOCK LOCK; */ + + +typedef struct ltag { + Oid relId; + Oid dbId; + ItemPointerData tupleId; +} LOCKTAG; + +#define TAGSIZE (sizeof(LOCKTAG)) + +/* This is the control structure for a lock table. It + * lives in shared memory: + * + * tableID -- the handle used by the lock table's clients to + * refer to the table. + * + * nLockTypes -- number of lock types (READ,WRITE,etc) that + * are defined on this lock table + * + * conflictTab -- this is an array of bitmasks showing lock + * type conflicts. conflictTab[i] is a mask with the j-th bit + * turned on if lock types i and j conflict. + * + * prio -- each locktype has a priority, so, for example, waiting + * writers can be given priority over readers (to avoid + * starvation). + * + * masterlock -- synchronizes access to the table + * + */ +typedef struct lockctl { + LockTableId tableId; + int nLockTypes; + int conflictTab[MAX_LOCKTYPES]; + int prio[MAX_LOCKTYPES]; + SPINLOCK masterLock; +} LOCKCTL; + +/* + * lockHash -- hash table on lock Ids, + * xidHash -- hash on xid and lockId in case + * multiple processes are holding the lock + * ctl - control structure described above. + */ +typedef struct ltable { + HTAB *lockHash; + HTAB *xidHash; + LOCKCTL *ctl; +} LOCKTAB; + +/* ----------------------- + * A transaction never conflicts with its own locks. 
Hence, if + * multiple transactions hold non-conflicting locks on the same + * data, private per-transaction information must be stored in the + * XID table. The tag is XID + shared memory lock address so that + * all locks can use the same XID table. The private information + * we store is the number of locks of each type (holders) and the + * total number of locks (nHolding) held by the transaction. + * + * NOTE: -- + * There were some problems with the fact that currently TransactionIdData + * is a 5 byte entity and compilers long word aligning of structure fields. + * If the 3 byte padding is put in front of the actual xid data then the + * hash function (which uses XID_TAGSIZE when deciding how many bytes of a + * struct to look at for the key) might only see the last two bytes of the xid. + * + * Clearly this is not good since its likely that these bytes will be the + * same for many transactions and hence they will share the same entry in + * hash table causing the entry to be corrupted. For this long-winded + * reason I have put the tag in a struct of its own to ensure that the + * XID_TAGSIZE is computed correctly. It used to be sizeof (SHMEM_OFFSET) + + * sizeof(TransactionIdData) which != sizeof(XIDTAG). + * + * Finally since the hash function will now look at all 12 bytes of the tag + * the padding bytes MUST be zero'd before use in hash_search() as they + * will have random values otherwise. Jeff 22 July 1991. + * ----------------------- + */ + +typedef struct XIDTAG { + SHMEM_OFFSET lock; + int pid; + TransactionId xid; +} XIDTAG; + +typedef struct XIDLookupEnt { + /* tag */ + XIDTAG tag; + + /* data */ + int holders[MAX_LOCKTYPES]; + int nHolding; + SHM_QUEUE queue; +} XIDLookupEnt; + +#define XID_TAGSIZE (sizeof(XIDTAG)) + +/* originally in procq.h */ +typedef struct procQueue { + SHM_QUEUE links; + int size; +} PROC_QUEUE; + + +/* + * lock information: + * + * tag -- uniquely identifies the object being locked + * mask -- union of the conflict masks of all lock types + * currently held on this object. + * waitProcs -- queue of processes waiting for this lock + * holders -- count of each lock type currently held on the + * lock. + * nHolding -- total locks of all types. 
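
The NOTE above about XID_TAGSIZE and padding is easy to gloss over; the following stand-alone sketch (a struct merely shaped like XIDTAG, with made-up field widths) shows why a byte-wise hash key must be zeroed before its fields are filled in:

#include <stdio.h>
#include <string.h>

/* Merely shaped like XIDTAG: a word-aligned field followed by smaller
 * ones, so the compiler is free to insert padding bytes. */
typedef struct {
    long lock;        /* stands in for SHMEM_OFFSET */
    int  pid;
    char xid[5];      /* stands in for the 5-byte TransactionIdData */
} DemoTag;

int
main()
{
    DemoTag tag;

    /* sizeof() counts the padding, and a hash that reads sizeof(DemoTag)
     * raw bytes sees it too -- hence the rule that the whole struct must
     * be zeroed before the fields are assigned. */
    memset(&tag, 0, sizeof(DemoTag));
    tag.lock = 1024;
    tag.pid = 42;
    memcpy(tag.xid, "00001", 5);

    printf("bytes in the fields: %d, bytes a raw hash would read: %d\n",
           (int) (sizeof(long) + sizeof(int) + 5), (int) sizeof(DemoTag));
    return 0;
}
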
+ */ +typedef struct Lock { + /* hash key */ + LOCKTAG tag; + + /* data */ + int mask; + PROC_QUEUE waitProcs; + int holders[MAX_LOCKTYPES]; + int nHolding; + int activeHolders[MAX_LOCKTYPES]; + int nActive; +} LOCK; + +#define LockGetLock_nHolders(l) l->nHolders + +#define LockDecrWaitHolders(lock, lockt) \ + lock->nHolding--; \ + lock->holders[lockt]-- + +#define LockLockTable() SpinAcquire(LockMgrLock); +#define UnlockLockTable() SpinRelease(LockMgrLock); + +extern SPINLOCK LockMgrLock; + +/* + * function prototypes + */ +extern void InitLocks(void); +extern void LockDisable(int status); +extern LockTableId LockTabInit(char *tabName, MASK *conflictsP, int *prioP, + int ntypes); +extern LockTableId LockTabRename(LockTableId tableId); +extern bool LockAcquire(LockTableId tableId, LOCKTAG *lockName, LOCKT lockt); +extern int LockResolveConflicts(LOCKTAB *ltable, LOCK *lock, LOCKT lockt, + TransactionId xid); +extern int WaitOnLock(LOCKTAB *ltable, LockTableId tableId, LOCK *lock, + LOCKT lockt); +extern bool LockRelease(LockTableId tableId, LOCKTAG *lockName, LOCKT lockt); +extern void GrantLock(LOCK *lock, LOCKT lockt); +extern bool LockReleaseAll(LockTableId tableId, SHM_QUEUE *lockQueue); +extern int LockShmemSize(void); +extern bool LockingDisabled(void); + +#endif /* LOCK_H */ diff --git a/src/backend/storage/multilev.h b/src/backend/storage/multilev.h new file mode 100644 index 00000000000..582c1cb6c37 --- /dev/null +++ b/src/backend/storage/multilev.h @@ -0,0 +1,64 @@ +/*------------------------------------------------------------------------- + * + * multilev.h-- + * multi level lock table consts/defs for single.c and multi.c and their + * clients + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: multilev.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef MULTILEV_H +#define MULTILEV_H + +#include "storage/lock.h" +#include "storage/lmgr.h" + +#define READ_LOCK 2 +#define WRITE_LOCK 1 + +/* any time a small granularity READ/WRITE lock is set. + * Higher granularity READ_INTENT/WRITE_INTENT locks must + * also be set. A read intent lock is has value READ+INTENT. + * in this implementation. 
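
lock.h above describes conflictTab[] as an array of bitmasks that is consulted when a lock request arrives (LockResolveConflicts is the consumer). A minimal stand-alone sketch of that idea; the lock-type numbering and bit assignment here are illustrative, not the values the lock manager actually uses:

#include <stdio.h>

#define READ   1      /* illustrative lock-type numbers, */
#define WRITE  2      /* not the values multilev.h uses  */
#define NTYPES 3

int
main()
{
    int conflictTab[NTYPES] = { 0 };
    int heldMask;

    /* a WRITE conflicts with READ and WRITE; a READ only with WRITE */
    conflictTab[READ]  = (1 << WRITE);
    conflictTab[WRITE] = (1 << READ) | (1 << WRITE);

    /* suppose some other transaction already holds a READ lock */
    heldMask = (1 << READ);

    /* the requester's row of the table is tested against the held mask */
    printf("READ request:  %s\n",
           (conflictTab[READ] & heldMask) ? "must wait" : "granted");
    printf("WRITE request: %s\n",
           (conflictTab[WRITE] & heldMask) ? "must wait" : "granted");
    return 0;
}
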
+ */ +#define NO_LOCK 0 +#define INTENT 2 +#define READ_INTENT (READ_LOCK+INTENT) +#define WRITE_INTENT (WRITE_LOCK+INTENT) + +#define EXTEND_LOCK 5 + +#define SHORT_TERM 1 +#define LONG_TERM 2 +#define UNLOCK 0 + +#define N_LEVELS 3 +#define RELN_LEVEL 0 +#define PAGE_LEVEL 1 +#define TUPLE_LEVEL 2 +typedef int LOCK_LEVEL; + +/* multi.c */ + +extern LockTableId MultiTableId; +extern LockTableId ShortTermTableId; + +/* + * function prototypes + */ +extern LockTableId InitMultiLevelLockm(void); +extern bool MultiLockReln(LockInfo linfo, LOCKT lockt); +extern bool MultiLockTuple(LockInfo linfo, ItemPointer tidPtr, LOCKT lockt); +extern bool MultiLockPage(LockInfo linfo, ItemPointer tidPtr, LOCKT lockt); +extern bool MultiAcquire(LockTableId tableId, LOCKTAG *tag, LOCKT lockt, + LOCK_LEVEL level); +extern bool MultiReleasePage(LockInfo linfo, ItemPointer tidPtr, LOCKT lockt); +extern bool MultiReleaseReln(LockInfo linfo, LOCKT lockt); +extern bool MultiRelease(LockTableId tableId, LOCKTAG *tag, LOCKT lockt, + LOCK_LEVEL level); + +#endif /* MULTILEV_H */ diff --git a/src/backend/storage/off.h b/src/backend/storage/off.h new file mode 100644 index 00000000000..e5f5cbf5482 --- /dev/null +++ b/src/backend/storage/off.h @@ -0,0 +1,60 @@ +/*------------------------------------------------------------------------- + * + * off.h-- + * POSTGRES disk "offset" definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: off.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef OFF_H +#define OFF_H + +#include "c.h" +#include "machine.h" /* for BLCKSZ */ +#include "storage/itemid.h" + +/* + * OffsetNumber: + * + * this is a 1-based index into the linp (ItemIdData) array in the + * header of each disk page. + */ +typedef uint16 OffsetNumber; + +#define InvalidOffsetNumber ((OffsetNumber) 0) +#define FirstOffsetNumber ((OffsetNumber) 1) +#define MaxOffsetNumber ((OffsetNumber) (BLCKSZ / sizeof(ItemIdData))) +#define OffsetNumberMask (0xffff) /* valid uint16 bits */ + +/* ---------------- + * support macros + * ---------------- + */ + +/* + * OffsetNumberIsValid -- + * True iff the offset number is valid. + */ +#define OffsetNumberIsValid(offsetNumber) \ + ((bool) ((offsetNumber != InvalidOffsetNumber) && \ + (offsetNumber <= MaxOffsetNumber))) + +/* + * OffsetNumberNext -- + * OffsetNumberPrev -- + * Increments/decrements the argument. These macros look pointless + * but they help us disambiguate the different manipulations on + * OffsetNumbers (e.g., sometimes we substract one from an + * OffsetNumber to move back, and sometimes we do so to form a + * real C array index). + */ +#define OffsetNumberNext(offsetNumber) \ + ((OffsetNumber) (1 + (offsetNumber))) +#define OffsetNumberPrev(offsetNumber) \ + ((OffsetNumber) (-1 + (offsetNumber))) + +#endif /* OFF_H */ diff --git a/src/backend/storage/page.h b/src/backend/storage/page.h new file mode 100644 index 00000000000..a012ea522c0 --- /dev/null +++ b/src/backend/storage/page.h @@ -0,0 +1,26 @@ +/*------------------------------------------------------------------------- + * + * page.h-- + * POSTGRES buffer page abstraction definitions. 
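
off.h above stresses that an OffsetNumber is a 1-based index into the linp array, so subtracting one sometimes forms a C array index rather than a step backwards. A small stand-alone illustration, with the line pointer array faked by plain strings:

#include <stdio.h>

typedef unsigned short OffsetNumber;           /* 1-based, as in off.h */
#define FirstOffsetNumber   ((OffsetNumber) 1)
#define OffsetNumberNext(o) ((OffsetNumber) (1 + (o)))

int
main()
{
    /* a fake line pointer array standing in for pd_linp[] */
    const char *linp[] = { "tuple A", "tuple B", "tuple C" };
    OffsetNumber off;

    for (off = FirstOffsetNumber; off <= 3; off = OffsetNumberNext(off)) {
        /* here "off - 1" forms a 0-based C array index, not a step back */
        printf("offset %d -> linp[%d] = %s\n",
               (int) off, (int) (off - 1), linp[off - 1]);
    }
    return 0;
}
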
+ * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: page.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef PAGE_H +#define PAGE_H + +#include "c.h" + +typedef Pointer Page; + +/* + * PageIsValid -- + * True iff page is valid. + */ +#define PageIsValid(page) PointerIsValid(page) + +#endif /* PAGE_H */ diff --git a/src/backend/storage/page/Makefile.inc b/src/backend/storage/page/Makefile.inc new file mode 100644 index 00000000000..2a7d8408512 --- /dev/null +++ b/src/backend/storage/page/Makefile.inc @@ -0,0 +1,16 @@ +#------------------------------------------------------------------------- +# +# Makefile.inc-- +# Makefile for storage/page +# +# Copyright (c) 1994, Regents of the University of California +# +# +# IDENTIFICATION +# $Header: /cvsroot/pgsql/src/backend/storage/page/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:58 scrappy Exp $ +# +#------------------------------------------------------------------------- + +SUBSRCS+= bufpage.c itemptr.c + + diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c new file mode 100644 index 00000000000..14b5ead85bc --- /dev/null +++ b/src/backend/storage/page/bufpage.c @@ -0,0 +1,519 @@ +/*------------------------------------------------------------------------- + * + * bufpage.c-- + * POSTGRES standard buffer page code. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/page/bufpage.c,v 1.1.1.1 1996/07/09 06:21:58 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include <sys/types.h> +#include <sys/file.h> + +#include "c.h" + +#include "storage/item.h" +#include "storage/buf.h" +#include "storage/bufmgr.h" +#include "utils/elog.h" +#include "utils/palloc.h" +#include "utils/memutils.h" +#include "storage/bufpage.h" + +#include "lib/qsort.h" + +static bool PageManagerShuffle = true; /* default is shuffle mode */ + +/* ---------------------------------------------------------------- + * Buffer support functions + * ---------------------------------------------------------------- + */ +/* + * BufferGetPageSize -- + * Returns the page size within a buffer. + * + * Notes: + * Assumes buffer is valid. + * + * The buffer can be a raw disk block and need not contain a valid + * (formatted) disk page. + */ +Size +BufferGetPageSize(Buffer buffer) +{ + Size pageSize; + + Assert(BufferIsValid(buffer)); + pageSize = BLCKSZ; /* XXX dig out of buffer descriptor */ + + Assert(PageSizeIsValid(pageSize)); + return (pageSize); +} + +/* + * BufferGetPage -- + * Returns the page associated with a buffer. + */ +Page +BufferGetPage(Buffer buffer) +{ + return (Page) BufferGetBlock(buffer); +} + + +/* ---------------------------------------------------------------- + * Page support functions + * ---------------------------------------------------------------- + */ + +/* + * PageInit -- + * Initializes the contents of a page. 
+ */ +void +PageInit(Page page, Size pageSize, Size specialSize) +{ + PageHeader p = (PageHeader) page; + + Assert(pageSize == BLCKSZ); + Assert(pageSize > + specialSize + sizeof(PageHeaderData) - sizeof(ItemIdData)); + + specialSize = DOUBLEALIGN(specialSize); + + p->pd_lower = sizeof(PageHeaderData) - sizeof(ItemIdData); + p->pd_upper = pageSize - specialSize; + p->pd_special = pageSize - specialSize; + PageSetPageSize(page, pageSize); +} + +/* + * PageGetItem -- + * Retrieves an item on the given page. + * + * Note: + * This does change the status of any of the resources passed. + * The semantics may change in the future. + */ +Item +PageGetItem(Page page, ItemId itemId) +{ + Item item; + + Assert(PageIsValid(page)); + Assert((*itemId).lp_flags & LP_USED); + + item = (Item)(((char *)page) + (*itemId).lp_off); + + return (item); +} + +/* + * PageAddItem -- + * Adds item to the given page. + * + * Note: + * This does not assume that the item resides on a single page. + * It is the responsiblity of the caller to act appropriately + * depending on this fact. The "pskip" routines provide a + * friendlier interface, in this case. + * + * This does change the status of any of the resources passed. + * The semantics may change in the future. + * + * This routine should probably be combined with others? + */ +/* ---------------- + * PageAddItem + * + * add an item to a page. + * + * Notes on interface: + * If offsetNumber is valid, shuffle ItemId's down to make room + * to use it, if PageManagerShuffle is true. If PageManagerShuffle is + * false, then overwrite the specified ItemId. (PageManagerShuffle is + * true by default, and is modified by calling PageManagerModeSet.) + * If offsetNumber is not valid, then assign one by finding the first + * one that is both unused and deallocated. + * + * NOTE: If offsetNumber is valid, and PageManagerShuffle is true, it + * is assumed that there is room on the page to shuffle the ItemId's + * down by one. + * ---------------- + */ +OffsetNumber +PageAddItem(Page page, + Item item, + Size size, + OffsetNumber offsetNumber, + ItemIdFlags flags) +{ + register i; + Size alignedSize; + Offset lower; + Offset upper; + ItemId itemId; + ItemId fromitemId, toitemId; + OffsetNumber limit; + + bool shuffled = false; + + /* + * Find first unallocated offsetNumber + */ + limit = OffsetNumberNext(PageGetMaxOffsetNumber(page)); + + /* was offsetNumber passed in? */ + if (OffsetNumberIsValid(offsetNumber)) { + if (PageManagerShuffle == true) { + /* shuffle ItemId's (Do the PageManager Shuffle...) 
*/ + for (i = (limit - 1); i >= offsetNumber; i--) { + fromitemId = &((PageHeader)page)->pd_linp[i - 1]; + toitemId = &((PageHeader)page)->pd_linp[i]; + *toitemId = *fromitemId; + } + shuffled = true; /* need to increase "lower" */ + } else { /* overwrite mode */ + itemId = &((PageHeader)page)->pd_linp[offsetNumber - 1]; + if (((*itemId).lp_flags & LP_USED) || + ((*itemId).lp_len != 0)) { + elog(WARN, "PageAddItem: tried overwrite of used ItemId"); + return (InvalidOffsetNumber); + } + } + } else { /* offsetNumber was not passed in, so find one */ + /* look for "recyclable" (unused & deallocated) ItemId */ + for (offsetNumber = 1; offsetNumber < limit; offsetNumber++) { + itemId = &((PageHeader)page)->pd_linp[offsetNumber - 1]; + if ((((*itemId).lp_flags & LP_USED) == 0) && + ((*itemId).lp_len == 0)) + break; + } + } + if (offsetNumber > limit) + lower = (Offset) (((char *) (&((PageHeader)page)->pd_linp[offsetNumber])) - ((char *) page)); + else if (offsetNumber == limit || shuffled == true) + lower = ((PageHeader)page)->pd_lower + sizeof (ItemIdData); + else + lower = ((PageHeader)page)->pd_lower; + + alignedSize = DOUBLEALIGN(size); + + upper = ((PageHeader)page)->pd_upper - alignedSize; + + if (lower > upper) { + return (InvalidOffsetNumber); + } + + itemId = &((PageHeader)page)->pd_linp[offsetNumber - 1]; + (*itemId).lp_off = upper; + (*itemId).lp_len = size; + (*itemId).lp_flags = flags; + memmove((char *)page + upper, item, size); + ((PageHeader)page)->pd_lower = lower; + ((PageHeader)page)->pd_upper = upper; + + return (offsetNumber); +} + +/* + * PageGetTempPage -- + * Get a temporary page in local memory for special processing + */ +Page +PageGetTempPage(Page page, Size specialSize) +{ + Size pageSize; + Size size; + Page temp; + PageHeader thdr; + + pageSize = PageGetPageSize(page); + + if ((temp = (Page) palloc(pageSize)) == (Page) NULL) + elog(FATAL, "Cannot allocate %d bytes for temp page.", pageSize); + thdr = (PageHeader) temp; + + /* copy old page in */ + memmove(temp, page, pageSize); + + /* clear out the middle */ + size = (pageSize - sizeof(PageHeaderData)) + sizeof(ItemIdData); + size -= DOUBLEALIGN(specialSize); + memset((char *) &(thdr->pd_linp[0]), 0, size); + + /* set high, low water marks */ + thdr->pd_lower = sizeof (PageHeaderData) - sizeof (ItemIdData); + thdr->pd_upper = pageSize - DOUBLEALIGN(specialSize); + + return (temp); +} + +/* + * PageRestoreTempPage -- + * Copy temporary page back to permanent page after special processing + * and release the temporary page. + */ +void +PageRestoreTempPage(Page tempPage, Page oldPage) +{ + Size pageSize; + + pageSize = PageGetPageSize(tempPage); + memmove((char *) oldPage, (char *) tempPage, pageSize); + + pfree(tempPage); +} + +/* + * PageGetMaxOffsetNumber -- + * Returns the maximum offset number used by the given page. + * + * NOTE: The offset is invalid if the page is non-empty. + * Test whether PageIsEmpty before calling this routine + * and/or using its return value. 
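
PageInit and PageAddItem above, together with PageGetFreeSpace further on, manage a page purely through the pd_lower/pd_upper watermarks. A stand-alone sketch of that arithmetic, using placeholder sizes for PageHeaderData and ItemIdData rather than the real platform-dependent values:

#include <stdio.h>

/* Placeholder sizes; the real numbers come from bufpage.h/itemid.h and are
 * platform dependent.  The watermark arithmetic is what matters. */
#define BLCKSZ         8192
#define PAGE_HDR_SZ      16          /* stand-in for sizeof(PageHeaderData) */
#define ITEMID_SZ         4          /* stand-in for sizeof(ItemIdData)     */
#define DOUBLEALIGN(x) (((x) + 7) & ~7)

int
main()
{
    /* after PageInit(page, BLCKSZ, 0): empty linp array, no special space */
    int pd_lower = PAGE_HDR_SZ - ITEMID_SZ;
    int pd_upper = BLCKSZ;

    /* PageAddItem of a 100-byte tuple: one more line pointer raises the
     * lower watermark, the aligned tuple body lowers the upper one */
    pd_lower += ITEMID_SZ;
    pd_upper -= DOUBLEALIGN(100);

    /* PageGetFreeSpace keeps one ItemIdData in reserve for the next item */
    printf("free space after one insertion: %d bytes\n",
           (pd_upper - pd_lower) - ITEMID_SZ);
    return 0;
}
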
+ */ +OffsetNumber +PageGetMaxOffsetNumber(Page page) +{ + LocationIndex low; + OffsetNumber i; + + low = ((PageHeader) page)->pd_lower; + i = (low - (sizeof(PageHeaderData) - sizeof(ItemIdData))) + / sizeof(ItemIdData); + + return(i); +} + +/* ---------------- + * itemid stuff for PageRepairFragmentation + * ---------------- + */ +struct itemIdSortData { + int offsetindex; /* linp array index */ + ItemIdData itemiddata; +}; + +static int +itemidcompare(struct itemIdSortData *itemidp1, struct itemIdSortData *itemidp2) +{ + if (itemidp1->itemiddata.lp_off == itemidp2->itemiddata.lp_off) + return(0); + else if (itemidp1->itemiddata.lp_off < itemidp2->itemiddata.lp_off) + return(1); + else + return(-1); +} + +/* + * PageRepairFragmentation -- + * Frees fragmented space on a page. + */ +void +PageRepairFragmentation(Page page) +{ + int i; + struct itemIdSortData *itemidbase, *itemidptr; + ItemId lp; + int nline, nused; + int itemidcompare(); + Offset upper; + Size alignedSize; + + nline = (int16) PageGetMaxOffsetNumber(page); + nused = 0; + for (i=0; i<nline; i++) { + lp = ((PageHeader)page)->pd_linp + i; + if ((*lp).lp_flags & LP_USED) + nused++; + } + + if (nused == 0) { + for (i=0; i<nline; i++) { + lp = ((PageHeader)page)->pd_linp + i; + if ((*lp).lp_len > 0) /* unused, but allocated */ + (*lp).lp_len = 0; /* indicate unused & deallocated */ + } + + ((PageHeader)page)->pd_upper = ((PageHeader)page)->pd_special; + } else { /* nused != 0 */ + itemidbase = (struct itemIdSortData *) + palloc(sizeof(struct itemIdSortData) * nused); + memset((char *) itemidbase, 0, sizeof(struct itemIdSortData) * nused); + itemidptr = itemidbase; + for (i=0; i<nline; i++) { + lp = ((PageHeader)page)->pd_linp + i; + if ((*lp).lp_flags & LP_USED) { + itemidptr->offsetindex = i; + itemidptr->itemiddata = *lp; + itemidptr++; + } else { + if ((*lp).lp_len > 0) /* unused, but allocated */ + (*lp).lp_len = 0; /* indicate unused & deallocated */ + } + } + + /* sort itemIdSortData array...*/ + pg_qsort((char *) itemidbase, nused, sizeof(struct itemIdSortData), + (void*) itemidcompare); + + /* compactify page */ + ((PageHeader)page)->pd_upper = ((PageHeader)page)->pd_special; + + for (i=0, itemidptr = itemidbase; i<nused; i++, itemidptr++) { + lp = ((PageHeader)page)->pd_linp + itemidptr->offsetindex; + alignedSize = DOUBLEALIGN((*lp).lp_len); + upper = ((PageHeader)page)->pd_upper - alignedSize; + memmove((char *) page + upper, + (char *)page + (*lp).lp_off, + (*lp).lp_len); + (*lp).lp_off = upper; + ((PageHeader)page)->pd_upper = upper; + } + + pfree(itemidbase); + } +} + +/* + * PageGetFreeSpace -- + * Returns the size of the free (allocatable) space on a page. + */ +Size +PageGetFreeSpace(Page page) +{ + Size space; + + + space = ((PageHeader)page)->pd_upper - ((PageHeader)page)->pd_lower; + + if (space < sizeof (ItemIdData)) { + return (0); + } + space -= sizeof (ItemIdData); /* XXX not always true */ + + return (space); +} + +/* + * PageManagerModeSet -- + * + * Sets mode to either: ShufflePageManagerMode (the default) or + * OverwritePageManagerMode. For use by access methods code + * for determining semantics of PageAddItem when the offsetNumber + * argument is passed in. 
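
The itemidcompare comparator above orders line pointers by decreasing lp_off, so PageRepairFragmentation re-packs tuples starting from the pd_special end of the page and working downward. The same ordering, shown stand-alone with plain qsort on bare offsets:

#include <stdio.h>
#include <stdlib.h>

/* Same ordering as itemidcompare(): larger lp_off sorts first, so tuples
 * are re-packed starting from the pd_special end of the page. */
static int
cmp_desc(const void *a, const void *b)
{
    unsigned x = *(const unsigned *) a;
    unsigned y = *(const unsigned *) b;

    return (x == y) ? 0 : ((x < y) ? 1 : -1);
}

int
main()
{
    unsigned lp_off[] = { 7600, 8000, 7200, 7800 };
    int i;

    qsort(lp_off, 4, sizeof(unsigned), cmp_desc);
    for (i = 0; i < 4; i++)
        printf("%u ", lp_off[i]);         /* 8000 7800 7600 7200 */
    printf("\n");
    return 0;
}
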
+ */ +void +PageManagerModeSet(PageManagerMode mode) +{ + if (mode == ShufflePageManagerMode) + PageManagerShuffle = true; + else if (mode == OverwritePageManagerMode) + PageManagerShuffle = false; +} + +/* + *---------------------------------------------------------------- + * PageIndexTupleDelete + *---------------------------------------------------------------- + * + * This routine does the work of removing a tuple from an index page. + */ +void +PageIndexTupleDelete(Page page, OffsetNumber offnum) +{ + PageHeader phdr; + char *addr; + ItemId tup; + Size size; + char *locn; + int nbytes; + int offidx; + + phdr = (PageHeader) page; + + /* change offset number to offset index */ + offidx = offnum - 1; + + tup = PageGetItemId(page, offnum); + size = ItemIdGetLength(tup); + size = DOUBLEALIGN(size); + + /* location of deleted tuple data */ + locn = (char *) (page + ItemIdGetOffset(tup)); + + /* + * First, we want to get rid of the pd_linp entry for the index + * tuple. We copy all subsequent linp's back one slot in the + * array. + */ + + nbytes = phdr->pd_lower - + ((char *)&phdr->pd_linp[offidx + 1] - (char *) phdr); + memmove((char *) &(phdr->pd_linp[offidx]), + (char *) &(phdr->pd_linp[offidx + 1]), + nbytes); + + /* + * Now move everything between the old upper bound (beginning of tuple + * space) and the beginning of the deleted tuple forward, so that + * space in the middle of the page is left free. If we've just deleted + * the tuple at the beginning of tuple space, then there's no need + * to do the copy (and bcopy on some architectures SEGV's if asked + * to move zero bytes). + */ + + /* beginning of tuple space */ + addr = (char *) (page + phdr->pd_upper); + + if (locn != addr) + memmove(addr + size, addr, (int) (locn - addr)); + + /* adjust free space boundary pointers */ + phdr->pd_upper += size; + phdr->pd_lower -= sizeof (ItemIdData); + + /* finally, we need to adjust the linp entries that remain */ + if (!PageIsEmpty(page)) + PageIndexTupleDeleteAdjustLinePointers(phdr, locn, size); +} + +/* + *---------------------------------------------------------------- + * PageIndexTupleDeleteAdjustLinePointers + *---------------------------------------------------------------- + * + * Once the line pointers and tuple data have been shifted around + * on the page, we need to go down the line pointer vector and + * adjust pointers to reflect new locations. Anything that used + * to be before the deleted tuple's data was moved forward by the + * size of the deleted tuple. + * + * This routine does the work of adjusting the line pointers. + * Location is where the tuple data used to lie; size is how + * much space it occupied. We assume that size has been aligned + * as required by the time we get here. + * + * This routine should never be called on an empty page. + */ +void +PageIndexTupleDeleteAdjustLinePointers(PageHeader phdr, + char *location, + Size size) +{ + int i; + + /* location is an index into the page... */ + location -= (int) phdr; + + for (i = PageGetMaxOffsetNumber((Page) phdr) - 1; i >= 0; i--) { + if (phdr->pd_linp[i].lp_off <= (unsigned) location) { + phdr->pd_linp[i].lp_off += size; + } + } +} diff --git a/src/backend/storage/page/itemptr.c b/src/backend/storage/page/itemptr.c new file mode 100644 index 00000000000..9d063374038 --- /dev/null +++ b/src/backend/storage/page/itemptr.c @@ -0,0 +1,40 @@ +/*------------------------------------------------------------------------- + * + * itemptr.c-- + * POSTGRES disk item pointer code. 
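
PageIndexTupleDeleteAdjustLinePointers above shifts every line pointer whose data sat at or below the deleted tuple's old location. A toy, self-contained illustration with two surviving line pointers (the offsets are made up):

#include <stdio.h>

int
main()
{
    /* two line pointers that survive the delete; the removed tuple sat at
     * offset 7800 and occupied 200 (aligned) bytes */
    unsigned lp_off[]   = { 8000, 7600 };
    unsigned deleted_at = 7800;
    unsigned deleted_sz = 200;
    int i;

    for (i = 0; i < 2; i++) {
        /* same test as the source: data at or below the hole moves up */
        if (lp_off[i] <= deleted_at)
            lp_off[i] += deleted_sz;
    }

    for (i = 0; i < 2; i++)
        printf("linp[%d].lp_off = %u\n", i, lp_off[i]);   /* 8000, 7800 */
    return 0;
}
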
+ * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/page/itemptr.c,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include "c.h" + +#include "storage/block.h" +#include "storage/off.h" +#include "storage/itemptr.h" +#include "storage/bufpage.h" + +/* + * ItemPointerEquals -- + * Returns true if both item pointers point to the same item, + * otherwise returns false. + * + * Note: + * Assumes that the disk item pointers are not NULL. + */ +bool +ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2) +{ + if (ItemPointerGetBlockNumber(pointer1) == + ItemPointerGetBlockNumber(pointer2) && + ItemPointerGetOffsetNumber(pointer1) == + ItemPointerGetOffsetNumber(pointer2)) + return(true); + else + return(false); +} + diff --git a/src/backend/storage/pagenum.h b/src/backend/storage/pagenum.h new file mode 100644 index 00000000000..f32624c226d --- /dev/null +++ b/src/backend/storage/pagenum.h @@ -0,0 +1,33 @@ +/*------------------------------------------------------------------------- + * + * pagenum.h-- + * POSTGRES page number definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: pagenum.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef PAGENUM_H +#define PAGENUM_H + +#include "c.h" +#include "storage/page.h" + +typedef uint16 PageNumber; + +typedef uint32 LogicalPageNumber; + +#define InvalidLogicalPageNumber 0 + +/* + * LogicalPageNumberIsValid -- + * True iff the logical page number is valid. + */ +#define LogicalPageNumberIsValid(pageNumber) \ + ((bool)((pageNumber) != InvalidLogicalPageNumber)) + + +#endif /* PAGENUM_H */ diff --git a/src/backend/storage/pos.h b/src/backend/storage/pos.h new file mode 100644 index 00000000000..9a7f603416b --- /dev/null +++ b/src/backend/storage/pos.h @@ -0,0 +1,64 @@ +/*------------------------------------------------------------------------- + * + * pos.h-- + * POSTGRES "position" definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: pos.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef POS_H +#define POS_H + +#include "c.h" + +/* + * a 'position' used to be <pagenumber, offset> in postgres. this has + * been changed to just <offset> as the notion of having multiple pages + * within a block has been removed. + * + * the 'offset' abstraction is somewhat confusing. it is NOT a byte + * offset within the page; instead, it is an offset into the line + * pointer array contained on every page that store (heap or index) + * tuples. + */ +typedef bits16 PositionIdData; +typedef PositionIdData *PositionId; + +/* ---------------- + * support macros + * ---------------- + */ + +/* + * PositionIdIsValid -- + * True iff the position identifier is valid. + */ +#define PositionIdIsValid(positionId) \ + PointerIsValid(positionId) + +/* + * PositionIdSetInvalid -- + * Make an invalid position. + */ +#define PositionIdSetInvalid(positionId) \ + *(positionId) = (bits16) 0 + +/* + * PositionIdSet -- + * Sets a position identifier to the specified value. 
+ */ +#define PositionIdSet(positionId, offsetNumber) \ + *(positionId) = (offsetNumber) + +/* + * PositionIdGetOffsetNumber -- + * Retrieve the offset number from a position identifier. + */ +#define PositionIdGetOffsetNumber(positionId) \ + ((OffsetNumber) *(positionId)) + +#endif /* POS_H */ diff --git a/src/backend/storage/proc.h b/src/backend/storage/proc.h new file mode 100644 index 00000000000..1ec89dedc2d --- /dev/null +++ b/src/backend/storage/proc.h @@ -0,0 +1,127 @@ +/*------------------------------------------------------------------------- + * + * proc.h-- + * + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: proc.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef _PROC_H_ +#define _PROC_H_ + +#include "storage/ipc.h" +#include "storage/lock.h" +#ifndef WIN32 +#include <sys/sem.h> +#else +/* This is because WIN32 already defines PROC */ +#define PROC PGL_PROC +#endif /* WIN32 */ +#include "storage/shmem.h" + + +typedef struct { + int sleeplock; + int semNum; + IpcSemaphoreId semId; + IpcSemaphoreKey semKey; +} SEMA; + +/* + * Each backend has: + */ +typedef struct proc { + + /* proc->links MUST BE THE FIRST ELEMENT OF STRUCT (see ProcWakeup()) */ + + SHM_QUEUE links; /* proc can be waiting for one event(lock) */ + SEMA sem; /* ONE semaphore to sleep on */ + int errType; /* error code tells why we woke up */ + + int procId; /* unique number for this structure + * NOT unique per backend, these things + * are reused after the backend dies. + */ + + int critSects; /* If critSects > 0, we are in sensitive + * routines that cannot be recovered when + * the process fails. + */ + + int prio; /* priority for sleep queue */ + + TransactionId xid; /* transaction currently being executed + * by this proc + */ + + LOCK * waitLock; /* Lock we're sleeping on */ + int token; /* info for proc wakeup routines */ + int pid; /* This procs process id */ + short sLocks[MAX_SPINS]; /* Spin lock stats */ + SHM_QUEUE lockQueue; /* locks associated with current transaction */ +} PROC; + + +/* + * MAX_PROC_SEMS is the maximum number of per-process semaphores (those used + * by the lock mgr) we can keep track of. PROC_NSEMS_PER_SET is the number + * of semaphores in each (sys-V) semaphore set allocated. (Be careful not + * to set it to greater 32. Otherwise, the bitmap will overflow.) 
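
The comment above ties MAX_PROC_SEMS and PROC_NSEMS_PER_SET to the freeSemMap bitmap declared in PROC_HDR just below (and walked by ProcFreeAllSemaphores earlier in this diff). A quick stand-alone check of that arithmetic:

#include <stdio.h>

#define MAX_PROC_SEMS      128       /* as defined just below in proc.h */
#define PROC_NSEMS_PER_SET  16

int
main()
{
    /* one int32 word of freeSemMap per semaphore set; each set uses
     * PROC_NSEMS_PER_SET bits of its word, which is why the comment warns
     * against values greater than 32 */
    int nwords = MAX_PROC_SEMS / PROC_NSEMS_PER_SET;
    int inuse  = (1 << PROC_NSEMS_PER_SET) - 1;   /* all sems of a set taken */

    printf("freeSemMap words: %d, bits per word in use: %d (mask 0x%x)\n",
           nwords, PROC_NSEMS_PER_SET, inuse);
    return 0;
}
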
+ */ +#define MAX_PROC_SEMS 128 +#define PROC_NSEMS_PER_SET 16 + +typedef struct procglobal { + SHMEM_OFFSET freeProcs; + int numProcs; + IPCKey currKey; + int32 freeSemMap[MAX_PROC_SEMS/PROC_NSEMS_PER_SET]; +} PROC_HDR; + +extern PROC *MyProc; + +#define PROC_INCR_SLOCK(lock) if (MyProc) (MyProc->sLocks[(lock)])++ +#define PROC_DECR_SLOCK(lock) if (MyProc) (MyProc->sLocks[(lock)])-- + +/* + * flags explaining why process woke up + */ +#define NO_ERROR 0 +#define ERR_TIMEOUT 1 +#define ERR_BUFFER_IO 2 + +#define MAX_PRIO 50 +#define MIN_PRIO (-1) + +extern SPINLOCK ProcStructLock; + +/* + * Function Prototypes + */ +extern void InitProcess(IPCKey key); +extern void ProcReleaseLocks(void); +extern bool ProcRemove(int pid); +/* extern bool ProcKill(int exitStatus, int pid); */ +/* make static in storage/lmgr/proc.c -- jolly */ + +extern PROC_QUEUE *ProcQueueAlloc(char *name); +extern void ProcQueueInit(PROC_QUEUE *queue); +extern int ProcSleep(PROC_QUEUE *queue, SPINLOCK spinlock, int token, + int prio, LOCK *lock); +extern PROC *ProcWakeup(PROC *proc, int errType); +extern int ProcGetId(void); +extern int ProcLockWakeup(PROC_QUEUE *queue, char * ltable, char * lock); +extern void ProcAddLock(SHM_QUEUE *elem); +#if defined(PORTNAME_linux) +extern int HandleDeadLock(int); +#else +extern int HandleDeadLock(void); +#endif +extern void ProcReleaseSpins(PROC *proc); +extern void ProcFreeAllSemaphores(void); + +#endif /* PROC_H */ diff --git a/src/backend/storage/shmem.h b/src/backend/storage/shmem.h new file mode 100644 index 00000000000..a00b33581a4 --- /dev/null +++ b/src/backend/storage/shmem.h @@ -0,0 +1,104 @@ +/*------------------------------------------------------------------------- + * + * shmem.h-- + * shared memory management structures + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: shmem.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef SHMEM_H +#define SHMEM_H + +#include "storage/spin.h" /* for SPINLOCK */ +#include "utils/hsearch.h" /* for HTAB */ + +/* The shared memory region can start at a different address + * in every process. Shared memory "pointers" are actually + * offsets relative to the start of the shared memory region(s). + */ +typedef unsigned long SHMEM_OFFSET; +#define INVALID_OFFSET (-1) +#define BAD_LOCATION (-1) + +/* start of the lowest shared memory region. 
For now, assume that + * there is only one shared memory region + */ +extern SHMEM_OFFSET ShmemBase; + + +/* coerce an offset into a pointer in this process's address space */ +#define MAKE_PTR(xx_offs)\ + (ShmemBase+((unsigned long)(xx_offs))) + +/* coerce a pointer into a shmem offset */ +#define MAKE_OFFSET(xx_ptr)\ + (SHMEM_OFFSET) (((unsigned long)(xx_ptr))-ShmemBase) + +#define SHM_PTR_VALID(xx_ptr)\ + (((unsigned long)xx_ptr) > ShmemBase) + +/* cannot have an offset to ShmemFreeStart (offset 0) */ +#define SHM_OFFSET_VALID(xx_offs)\ + ((xx_offs != 0) && (xx_offs != INVALID_OFFSET)) + + +extern SPINLOCK ShmemLock; +extern SPINLOCK BindingLock; + +/* shmemqueue.c */ +typedef struct SHM_QUEUE { + SHMEM_OFFSET prev; + SHMEM_OFFSET next; +} SHM_QUEUE; + +/* shmem.c */ +extern void ShmemBindingTabReset(); +extern void ShmemCreate(unsigned int key, unsigned int size); +extern int InitShmem(unsigned int key, unsigned int size); +extern long *ShmemAlloc(unsigned long size); +extern int ShmemIsValid(unsigned long addr); +extern HTAB *ShmemInitHash(char *name, long init_size, long max_size, + HASHCTL *infoP, int hash_flags); +extern bool ShmemPIDLookup(int pid, SHMEM_OFFSET* locationPtr); +extern SHMEM_OFFSET ShmemPIDDestroy(int pid); +extern long *ShmemInitStruct(char *name, unsigned long size, + bool *foundPtr); + + +typedef int TableID; + +/* size constants for the binding table */ + /* max size of data structure string name */ +#define BTABLE_KEYSIZE (50) + /* data in binding table hash bucket */ +#define BTABLE_DATASIZE (sizeof(BindingEnt) - BTABLE_KEYSIZE) + /* maximum size of the binding table */ +#define BTABLE_SIZE (100) + +/* this is a hash bucket in the binding table */ +typedef struct { + char key[BTABLE_KEYSIZE]; /* string name */ + unsigned long location; /* location in shared mem */ + unsigned long size; /* numbytes allocated for the + * structure + */ +} BindingEnt; + +/* + * prototypes for functions in shmqueue.c + */ +extern void SHMQueueInit(SHM_QUEUE *queue); +extern bool SHMQueueIsDetached(SHM_QUEUE *queue); +extern void SHMQueueElemInit(SHM_QUEUE *queue); +extern void SHMQueueDelete(SHM_QUEUE *queue); +extern void SHMQueueInsertHD(SHM_QUEUE *queue, SHM_QUEUE *elem); +extern void SHMQueueInsertTL(SHM_QUEUE *queue, SHM_QUEUE *elem); +extern void SHMQueueFirst(SHM_QUEUE *queue, Pointer *nextPtrPtr, + SHM_QUEUE *nextQueue); +extern bool SHMQueueEmpty(SHM_QUEUE *queue); + +#endif /* SHMEM_H */ diff --git a/src/backend/storage/sinval.h b/src/backend/storage/sinval.h new file mode 100644 index 00000000000..036597dbb7a --- /dev/null +++ b/src/backend/storage/sinval.h @@ -0,0 +1,33 @@ +/*------------------------------------------------------------------------- + * + * sinval.h-- + * POSTGRES shared cache invalidation communication definitions. 
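
shmem.h above stores shared-memory "pointers" as offsets from ShmemBase and converts them with MAKE_PTR/MAKE_OFFSET. A stand-alone sketch of the round trip, with a local buffer standing in for the attached shared segment:

#include <stdio.h>

typedef unsigned long SHMEM_OFFSET;

/* A local buffer stands in for the attached shared-memory region; the
 * macros mirror the shmem.h definitions above. */
static char region[1024];
#define ShmemBase        ((unsigned long) region)
#define MAKE_PTR(offs)   (ShmemBase + ((unsigned long) (offs)))
#define MAKE_OFFSET(ptr) ((SHMEM_OFFSET) (((unsigned long) (ptr)) - ShmemBase))

int
main()
{
    char        *p   = &region[128];    /* some structure in "shared memory" */
    SHMEM_OFFSET off = MAKE_OFFSET(p);  /* what a shared struct would store  */

    /* any process maps the stored offset back to its own address space */
    printf("offset %lu maps back to the same object: %s\n",
           off, ((char *) MAKE_PTR(off) == p) ? "yes" : "no");
    return 0;
}
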
+ * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: sinval.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef SINVAL_H +#define SINVAL_H + +#include "c.h" +#include "storage/spin.h" +#include "storage/ipc.h" +#include "storage/itemptr.h" +#include "storage/backendid.h" + +extern SPINLOCK SInvalLock; + +extern void CreateSharedInvalidationState(IPCKey key); +extern void AttachSharedInvalidationState(IPCKey key); +extern void InitSharedInvalidationState(); +extern void RegisterSharedInvalid(int cacheId, Index hashIndex, + ItemPointer pointer); +extern void InvalidateSharedInvalid(void (*invalFunction)(), + void (*resetFunction)()); + + +#endif /* SINVAL_H */ diff --git a/src/backend/storage/sinvaladt.h b/src/backend/storage/sinvaladt.h new file mode 100644 index 00000000000..06029978980 --- /dev/null +++ b/src/backend/storage/sinvaladt.h @@ -0,0 +1,126 @@ +/*------------------------------------------------------------------------- + * + * sinvaladt.h-- + * POSTGRES shared cache invalidation segment definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: sinvaladt.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef SINVALADT_H +#define SINVALADT_H + +#include "postgres.h" /* XXX */ + +#include "storage/ipc.h" +#include "storage/itemptr.h" +#include "storage/sinval.h" + +/* + * The structure of the shared cache invaidation segment + * + */ +/* +A------------- Header info -------------- + criticalSectionSemaphoreId + generalSemaphoreId + startEntrySection (offset a) + endEntrySection (offset a + b) + startFreeSpace (offset relative to B) + startEntryChain (offset relatiev to B) + endEntryChain (offset relative to B) + numEntries + maxNumEntries + procState[MaxBackendId] --> limit + resetState (bool) +a tag (POSTID) +B------------- Start entry section ------- + SISegEntry --> entryData --> ... (see SharedInvalidData!) + isfree (bool) + next (offset to next entry in chain ) +b .... 
(dynamically growing down) +C----------------End shared segment ------- + +*/ + +/* Parameters (configurable) *******************************************/ +#define MaxBackendId 32 /* maximum number of backends */ +#define MAXNUMMESSAGES 1000 /* maximum number of messages in seg*/ + + +#define InvalidOffset 1000000000 /* a invalid offset (End of chain) */ + +typedef struct ProcState { + int limit; /* the number of read messages */ + bool resetState; /* true, if backend has to reset its state */ + int tag; /* special tag, recieved from the postmaster */ +} ProcState; + + +typedef struct SISeg { + IpcSemaphoreId criticalSectionSemaphoreId; /* semaphore id */ + IpcSemaphoreId generalSemaphoreId; /* semaphore id */ + Offset startEntrySection; /* (offset a) */ + Offset endEntrySection; /* (offset a + b) */ + Offset startFreeSpace; /* (offset relative to B) */ + Offset startEntryChain; /* (offset relative to B) */ + Offset endEntryChain; /* (offset relative to B) */ + int numEntries; + int maxNumEntries; + ProcState procState[MaxBackendId]; /* reflects the invalidation state */ + /* here starts the entry section, controlled by offsets */ +} SISeg; +#define SizeSISeg sizeof(SISeg) + +typedef struct SharedInvalidData { + int cacheId; /* XXX */ + Index hashIndex; + ItemPointerData pointerData; +} SharedInvalidData; + +typedef SharedInvalidData *SharedInvalid; + + +typedef struct SISegEntry { + SharedInvalidData entryData; /* the message data */ + bool isfree; /* entry free? */ + Offset next; /* offset to next entry*/ +} SISegEntry; + +#define SizeOfOneSISegEntry sizeof(SISegEntry) + +typedef struct SISegOffsets { + Offset startSegment; /* always 0 (for now) */ + Offset offsetToFirstEntry; /* A + a = B */ + Offset offsetToEndOfSegemnt; /* A + a + b */ +} SISegOffsets; + + +/****************************************************************************/ +/* synchronization of the shared buffer access */ +/* access to the buffer is synchronized by the lock manager !! */ +/****************************************************************************/ + +#define SI_LockStartValue 255 +#define SI_SharedLock (-1) +#define SI_ExclusiveLock (-255) + +extern SISeg *shmInvalBuffer; + +/* + * prototypes for functions in sinvaladt.c + */ +extern int SIBackendInit(SISeg *segInOutP); +extern int SISegmentInit(bool killExistingSegment, IPCKey key); + +extern bool SISetDataEntry(SISeg *segP, SharedInvalidData *data); +extern void SISetProcStateInvalid(SISeg *segP); +extern bool SIDelDataEntry(SISeg *segP); +extern void SIReadEntryData(SISeg *segP, int backendId, + void (*invalFunction)(), void (*resetFunction)()); +extern void SIDelExpiredDataEntries(SISeg *segP); + +#endif /* SINVALADT_H */ diff --git a/src/backend/storage/smgr.h b/src/backend/storage/smgr.h new file mode 100644 index 00000000000..2e91938290a --- /dev/null +++ b/src/backend/storage/smgr.h @@ -0,0 +1,84 @@ +/*------------------------------------------------------------------------- + * + * smgr.h-- + * storage manager switch public interface declarations. 
+ * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: smgr.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef SMGR_H +#define SMGR_H + +#include "utils/rel.h" +#include "storage/spin.h" /* for SPINLOCK */ + +#define SM_FAIL 0 +#define SM_SUCCESS 1 + +#define DEFAULT_SMGR 0 + +extern int smgrinit(void); +extern void smgrshutdown(int dummy); +extern int smgrcreate(int16 which, Relation reln); +extern int smgrunlink(int16 which, Relation reln); +extern int smgrextend(int16 which, Relation reln, char *buffer); +extern int smgropen(int16 which, Relation reln); +extern int smgrclose(int16 which, Relation reln); +extern int smgrread(int16 which, Relation reln, BlockNumber blocknum, + char *buffer); +extern int smgrwrite(int16 which, Relation reln, BlockNumber blocknum, + char *buffer); +extern int smgrflush(int16 which, Relation reln, BlockNumber blocknum, + char *buffer); +extern int smgrblindwrt(int16 which, char *dbname, char *relname, Oid dbid, + Oid relid, BlockNumber blkno, char *buffer); +extern int smgrnblocks(int16 which, Relation reln); +extern int smgrcommit(void); +extern int smgrabort(void); +extern bool smgriswo(int16 smgrno); + + + +/* internals: move me elsewhere -- ay 7/94 */ + +/* in md.c */ +extern int mdinit(void); +extern int mdcreate(Relation reln); +extern int mdunlink(Relation reln); +extern int mdextend(Relation reln, char *buffer); +extern int mdopen(Relation reln); +extern int mdclose(Relation reln); +extern int mdread(Relation reln, BlockNumber blocknum, char *buffer); +extern int mdwrite(Relation reln, BlockNumber blocknum, char *buffer); +extern int mdflush(Relation reln, BlockNumber blocknum, char *buffer); +extern int mdblindwrt(char *dbstr, char *relstr, Oid dbid, Oid relid, + BlockNumber blkno, char *buffer); +extern int mdnblocks(Relation reln); +extern int mdcommit(void); +extern int mdabort(void); + +/* mm.c */ +extern SPINLOCK MMCacheLock; + +extern int mminit(void); +extern int mmshutdown(void); +extern int mmcreate(Relation reln); +extern int mmunlink(Relation reln); +extern int mmextend(Relation reln, char *buffer); +extern int mmopen(Relation reln); +extern int mmclose(Relation reln); +extern int mmread(Relation reln, BlockNumber blocknum, char *buffer); +extern int mmwrite(Relation reln, BlockNumber blocknum, char *buffer); +extern int mmflush(Relation reln, BlockNumber blocknum, char *buffer); +extern int mmblindwrt(char *dbstr, char *relstr, Oid dbid, Oid relid, + BlockNumber blkno, char *buffer); +extern int mmnblocks(Relation reln); +extern int mmcommit(void); +extern int mmabort(void); +extern int MMShmemSize(void); + +#endif /* SMGR_H */ diff --git a/src/backend/storage/smgr/Makefile.inc b/src/backend/storage/smgr/Makefile.inc new file mode 100644 index 00000000000..8ff067afbe8 --- /dev/null +++ b/src/backend/storage/smgr/Makefile.inc @@ -0,0 +1,14 @@ +#------------------------------------------------------------------------- +# +# Makefile.inc-- +# Makefile for storage/smgr +# +# Copyright (c) 1994, Regents of the University of California +# +# +# IDENTIFICATION +# $Header: /cvsroot/pgsql/src/backend/storage/smgr/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $ +# +#------------------------------------------------------------------------- + +SUBSRCS+= md.c mm.c smgr.c smgrtype.c diff --git a/src/backend/storage/smgr/README b/src/backend/storage/smgr/README new file mode 100644 index 00000000000..4dbb2dce708 --- 
/dev/null +++ b/src/backend/storage/smgr/README @@ -0,0 +1,40 @@ +# $Header: /cvsroot/pgsql/src/backend/storage/smgr/README,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $ + +This directory contains the code that supports the Postgres storage manager +switch and all of the installed storage managers. In released systems, +the only supported storage manager is the magnetic disk manager. At UC +Berkeley, the Sony WORM optical disk jukebox and persistent main memory are +also supported. + +As of Postgres Release 3.0, every relation in the system is tagged with the +storage manager on which it resides. The storage manager switch code turns +what used to by filesystem operations into operations on the correct store, +for any given relation. + +The files in this directory, and their contents, are + + smgrtype.c Storage manager type -- maps string names to storage manager + IDs and provides simple comparison operators. This is the + regproc support for type 'smgr' in the system catalogs. + + smgr.c The storage manager switch dispatch code. The routines in + this file call the appropriate storage manager to do hardware + accesses requested by the backend. + + md.c The magnetic disk storage manager. + + mm.c The persistent main memory storage manager (#undef'ed in + tmp/c.h for all distributed systems). + + sj.c The sony jukebox storage manager and cache management code + (#undef'ed in tmp/c.h for all distributed systems). The + routines in this file allocate extents, maintain block + maps, and guarantee the persistence and coherency of a cache + of jukebox blocks on magnetic disk. + + pgjb.c The postgres jukebox interface routines. The routines here + handle exclusion on the physical device and translate requests + from the storage manager code (sj.c) into jbaccess calls. + + jbaccess.c Access code for the physical Sony jukebox device. This code + was swiped from Andy McFadden's jblib.a code at UC Berkeley. diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c new file mode 100644 index 00000000000..31aa1336a86 --- /dev/null +++ b/src/backend/storage/smgr/md.c @@ -0,0 +1,697 @@ +/*------------------------------------------------------------------------- + * + * md.c-- + * This code manages relations that reside on magnetic disk. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include <stdio.h> /* for sprintf() */ +#include <sys/file.h> + +#include "postgres.h" +#include "miscadmin.h" /* for DataDir */ + +#include "machine.h" +#include "storage/smgr.h" /* where the declarations go */ +#include "storage/block.h" +#include "storage/fd.h" +#include "utils/mcxt.h" +#include "utils/rel.h" +#include "utils/elog.h" +#include "utils/palloc.h" +#include "catalog/catalog.h" + +#undef DIAGNOSTIC + +/* + * The magnetic disk storage manager keeps track of open file descriptors + * in its own descriptor pool. This happens for two reasons. First, at + * transaction boundaries, we walk the list of descriptors and flush + * anything that we've dirtied in the current transaction. Second, we + * have to support relations of > 4GBytes. In order to do this, we break + * relations up into chunks of < 2GBytes and store one chunk in each of + * several files that represent the relation. 
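
md.c above splits a relation into segment files of RELSEG_SIZE blocks each; mdread/mdwrite later compute a seek position within the segment the same way. A stand-alone check of that arithmetic (the block number is arbitrary):

#include <stdio.h>

#define BLCKSZ      8192
#define RELSEG_SIZE 262144            /* blocks per segment file, as in md.c */

int
main()
{
    long blkno = 300000;              /* an arbitrary block of a big relation */

    /* which segment file ("relname", "relname.1", ...) holds the block,
     * and where mdread/mdwrite must seek inside that file */
    long      segno   = blkno / RELSEG_SIZE;
    long long seekpos = (long long) BLCKSZ * (blkno % RELSEG_SIZE);

    printf("segment size: %lld bytes\n", (long long) BLCKSZ * RELSEG_SIZE);
    printf("block %ld -> segment %ld, byte offset %lld\n",
           blkno, segno, seekpos);
    return 0;
}
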
+ */ + +typedef struct _MdfdVec { + int mdfd_vfd; /* fd number in vfd pool */ + uint16 mdfd_flags; /* clean, dirty */ + int mdfd_lstbcnt; /* most recent block count */ + struct _MdfdVec *mdfd_chain; /* for large relations */ +} MdfdVec; + +static int Nfds = 100; +static MdfdVec *Md_fdvec = (MdfdVec *) NULL; +static int CurFd = 0; +static MemoryContext MdCxt; + +#define MDFD_DIRTY (uint16) 0x01 + +#define RELSEG_SIZE 262144 /* (2 ** 31) / 8192 -- 2GB file */ + +/* routines declared here */ +static MdfdVec *_mdfd_openseg(Relation reln, int segno, int oflags); +static MdfdVec *_mdfd_getseg(Relation reln, int blkno, int oflag); +static int _fdvec_ext(void); +static BlockNumber _mdnblocks(File file, Size blcksz); + +/* + * mdinit() -- Initialize private state for magnetic disk storage manager. + * + * We keep a private table of all file descriptors. Whenever we do + * a write to one, we mark it dirty in our table. Whenever we force + * changes to disk, we mark the file descriptor clean. At transaction + * commit, we force changes to disk for all dirty file descriptors. + * This routine allocates and initializes the table. + * + * Returns SM_SUCCESS or SM_FAIL with errno set as appropriate. + */ +int +mdinit() +{ + MemoryContext oldcxt; + + MdCxt = (MemoryContext) CreateGlobalMemory("MdSmgr"); + if (MdCxt == (MemoryContext) NULL) + return (SM_FAIL); + + oldcxt = MemoryContextSwitchTo(MdCxt); + Md_fdvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec)); + (void) MemoryContextSwitchTo(oldcxt); + + if (Md_fdvec == (MdfdVec *) NULL) + return (SM_FAIL); + + memset(Md_fdvec, 0, Nfds * sizeof(MdfdVec)); + + return (SM_SUCCESS); +} + +int +mdcreate(Relation reln) +{ + int fd, vfd; + int tmp; + char *path; + extern bool IsBootstrapProcessingMode(); + + path = relpath(&(reln->rd_rel->relname.data[0])); + fd = FileNameOpenFile(path, O_RDWR|O_CREAT|O_EXCL, 0600); + + /* + * If the file already exists and is empty, we pretend that the + * create succeeded. During bootstrap processing, we skip that check, + * because pg_time, pg_variable, and pg_log get created before their + * .bki file entries are processed. + */ + + if (fd < 0) { + if ((fd = FileNameOpenFile(path, O_RDWR, 0600)) >= 0) { + if (!IsBootstrapProcessingMode() && + FileRead(fd, (char *) &tmp, sizeof(tmp)) != 0) { + FileClose(fd); + return (-1); + } + } + } + + if (CurFd >= Nfds) { + if (_fdvec_ext() == SM_FAIL) + return (-1); + } + + Md_fdvec[CurFd].mdfd_vfd = fd; + Md_fdvec[CurFd].mdfd_flags = (uint16) 0; + Md_fdvec[CurFd].mdfd_chain = (MdfdVec *) NULL; + Md_fdvec[CurFd].mdfd_lstbcnt = 0; + + vfd = CurFd++; + + return (vfd); +} + +/* + * mdunlink() -- Unlink a relation. + */ +int +mdunlink(Relation reln) +{ + int fd; + int i; + MdfdVec *v, *ov; + MemoryContext oldcxt; + char fname[20]; /* XXX should have NAMESIZE defined */ + char tname[20]; + + /* On Windows NT you can't unlink a file if it is open so we have + ** to do this. 
+ */ +#ifdef WIN32 + (void) mdclose(reln); +#endif /* WIN32 */ + + + memset(fname,0,20); + strncpy(fname, RelationGetRelationName(reln)->data, 16); + + if (FileNameUnlink(fname) < 0) + return (SM_FAIL); + + /* unlink all the overflow files for large relations */ + for (i = 1; ; i++) { +#ifdef WIN32 + (void) mdclose(reln); +#endif /* WIN32 */ + sprintf(tname, "%s.%d", fname, i); + if (FileNameUnlink(tname) < 0) + break; + } + + /* finally, clean out the mdfd vector */ + fd = RelationGetFile(reln); + Md_fdvec[fd].mdfd_flags = (uint16) 0; + + oldcxt = MemoryContextSwitchTo(MdCxt); + for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL; ) { + ov = v; + v = v->mdfd_chain; + if (ov != &Md_fdvec[fd]) + pfree(ov); + } + Md_fdvec[fd].mdfd_chain = (MdfdVec *) NULL; + (void) MemoryContextSwitchTo(oldcxt); + + return (SM_SUCCESS); +} + +/* + * mdextend() -- Add a block to the specified relation. + * + * This routine returns SM_FAIL or SM_SUCCESS, with errno set as + * appropriate. + */ +int +mdextend(Relation reln, char *buffer) +{ + long pos; + int nblocks; + MdfdVec *v; + + nblocks = mdnblocks(reln); + v = _mdfd_getseg(reln, nblocks, O_CREAT); + + if ((pos = FileSeek(v->mdfd_vfd, 0L, SEEK_END)) < 0) + return (SM_FAIL); + + if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ) + return (SM_FAIL); + + /* remember that we did a write, so we can sync at xact commit */ + v->mdfd_flags |= MDFD_DIRTY; + + /* try to keep the last block count current, though it's just a hint */ + if ((v->mdfd_lstbcnt = (++nblocks % RELSEG_SIZE)) == 0) + v->mdfd_lstbcnt = RELSEG_SIZE; + +#ifdef DIAGNOSTIC + if (_mdnblocks(v->mdfd_vfd, BLCKSZ) > RELSEG_SIZE + || v->mdfd_lstbcnt > RELSEG_SIZE) + elog(FATAL, "segment too big!"); +#endif + + return (SM_SUCCESS); +} + +/* + * mdopen() -- Open the specified relation. + */ +int +mdopen(Relation reln) +{ + char *path; + int fd; + int vfd; + + if (CurFd >= Nfds) { + if (_fdvec_ext() == SM_FAIL) + return (-1); + } + + path = relpath(&(reln->rd_rel->relname.data[0])); + + fd = FileNameOpenFile(path, O_RDWR, 0600); + + /* this should only happen during bootstrap processing */ + if (fd < 0) + fd = FileNameOpenFile(path, O_RDWR|O_CREAT|O_EXCL, 0600); + + Md_fdvec[CurFd].mdfd_vfd = fd; + Md_fdvec[CurFd].mdfd_flags = (uint16) 0; + Md_fdvec[CurFd].mdfd_chain = (MdfdVec *) NULL; + Md_fdvec[CurFd].mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ); + +#ifdef DIAGNOSTIC + if (Md_fdvec[CurFd].mdfd_lstbcnt > RELSEG_SIZE) + elog(FATAL, "segment too big on relopen!"); +#endif + + vfd = CurFd++; + + return (vfd); +} + +/* + * mdclose() -- Close the specified relation. + * + * Returns SM_SUCCESS or SM_FAIL with errno set as appropriate. + */ +int +mdclose(Relation reln) +{ + int fd; + MdfdVec *v; + + fd = RelationGetFile(reln); + + for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL; v = v->mdfd_chain) { + + /* may be closed already */ + if (v->mdfd_vfd < 0) + continue; + + /* + * We sync the file descriptor so that we don't need to reopen it at + * transaction commit to force changes to disk. + */ + + FileSync(v->mdfd_vfd); + FileClose(v->mdfd_vfd); + + /* mark this file descriptor as clean in our private table */ + v->mdfd_flags &= ~MDFD_DIRTY; + } + + return (SM_SUCCESS); +} + +/* + * mdread() -- Read the specified block from a relation. + * + * Returns SM_SUCCESS or SM_FAIL. 
+ */ +int +mdread(Relation reln, BlockNumber blocknum, char *buffer) +{ + int status; + long seekpos; + int nbytes; + MdfdVec *v; + + v = _mdfd_getseg(reln, blocknum, 0); + + seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE)); + +#ifdef DIAGNOSTIC + if (seekpos >= BLCKSZ * RELSEG_SIZE) + elog(FATAL, "seekpos too big!"); +#endif + + if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) { + return (SM_FAIL); + } + + status = SM_SUCCESS; + if ((nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ) { + if (nbytes == 0) { + memset(buffer, 0, BLCKSZ); + } else { + status = SM_FAIL; + } + } + + return (status); +} + +/* + * mdwrite() -- Write the supplied block at the appropriate location. + * + * Returns SM_SUCCESS or SM_FAIL. + */ +int +mdwrite(Relation reln, BlockNumber blocknum, char *buffer) +{ + int status; + long seekpos; + MdfdVec *v; + + v = _mdfd_getseg(reln, blocknum, 0); + + seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE)); +#ifdef DIAGNOSTIC + if (seekpos >= BLCKSZ * RELSEG_SIZE) + elog(FATAL, "seekpos too big!"); +#endif + + if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) { + return (SM_FAIL); + } + + status = SM_SUCCESS; + if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ) + status = SM_FAIL; + + v->mdfd_flags |= MDFD_DIRTY; + + return (status); +} + +/* + * mdflush() -- Synchronously write a block to disk. + * + * This is exactly like mdwrite(), but doesn't return until the file + * system buffer cache has been flushed. + */ +int +mdflush(Relation reln, BlockNumber blocknum, char *buffer) +{ + int status; + long seekpos; + MdfdVec *v; + + v = _mdfd_getseg(reln, blocknum, 0); + + seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE)); +#ifdef DIAGNOSTIC + if (seekpos >= BLCKSZ * RELSEG_SIZE) + elog(FATAL, "seekpos too big!"); +#endif + + if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) { + return (SM_FAIL); + } + + /* write and sync the block */ + status = SM_SUCCESS; + if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ + || FileSync(v->mdfd_vfd) < 0) + status = SM_FAIL; + + /* + * By here, the block is written and changes have been forced to stable + * storage. Mark the descriptor as clean until the next write, so we + * don't sync it again unnecessarily at transaction commit. + */ + + v->mdfd_flags &= ~MDFD_DIRTY; + + return (status); +} + +/* + * mdblindwrt() -- Write a block to disk blind. + * + * We have to be able to do this using only the name and OID of + * the database and relation in which the block belongs. This + * is a synchronous write. 
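
mdblindwrt (just below) builds the file path from only the database and relation names, with an optional ".segno" suffix for overflow segments. A stand-alone sketch of those two path formats; the DataDir value and the 16-byte name limit are illustrative assumptions:

#include <stdio.h>

#define NAMEDATALEN 16                /* assumed historical name length */

int
main()
{
    const char *DataDir = "/usr/local/pgsql/data";   /* illustrative only */
    char path[256];

    /* a shared relation (dbid == 0), first segment */
    sprintf(path, "%s/%.*s", DataDir, NAMEDATALEN, "pg_log");
    printf("%s\n", path);

    /* an ordinary relation, third overflow segment */
    sprintf(path, "%s/base/%.*s/%.*s.%d", DataDir, NAMEDATALEN, "mydb",
            NAMEDATALEN, "bigtable", 3);
    printf("%s\n", path);
    return 0;
}
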
+ */ +int +mdblindwrt(char *dbstr, + char *relstr, + Oid dbid, + Oid relid, + BlockNumber blkno, + char *buffer) +{ + int fd; + int segno; + long seekpos; + int status; + char *path; + int nchars; + + /* be sure we have enough space for the '.segno', if any */ + segno = blkno / RELSEG_SIZE; + if (segno > 0) + nchars = 10; + else + nchars = 0; + + /* construct the path to the file and open it */ + if (dbid == (Oid) 0) { + path = (char *) palloc(strlen(DataDir) + sizeof(NameData) + 2 + nchars); + if (segno == 0) + sprintf(path, "%s/%.*s", DataDir, NAMEDATALEN, relstr); + else + sprintf(path, "%s/%.*s.%d", DataDir, NAMEDATALEN, relstr, segno); + } else { + path = (char *) palloc(strlen(DataDir) + strlen("/base/") + 2 * sizeof(NameData) + 2 + nchars); + if (segno == 0) + sprintf(path, "%s/base/%.*s/%.*s", DataDir, NAMEDATALEN, + dbstr, NAMEDATALEN, relstr); + else + sprintf(path, "%s/base/%.*s/%.*s.%d", DataDir, NAMEDATALEN, dbstr, + NAMEDATALEN, relstr, segno); + } + + if ((fd = open(path, O_RDWR, 0600)) < 0) + return (SM_FAIL); + + /* seek to the right spot */ + seekpos = (long) (BLCKSZ * (blkno % RELSEG_SIZE)); + if (lseek(fd, seekpos, SEEK_SET) != seekpos) { + (void) close(fd); + return (SM_FAIL); + } + + status = SM_SUCCESS; + + /* write and sync the block */ + if (write(fd, buffer, BLCKSZ) != BLCKSZ || fsync(fd) < 0) + status = SM_FAIL; + + if (close(fd) < 0) + status = SM_FAIL; + + pfree(path); + + return (status); +} + +/* + * mdnblocks() -- Get the number of blocks stored in a relation. + * + * Returns # of blocks or -1 on error. + */ +int +mdnblocks(Relation reln) +{ + int fd; + MdfdVec *v; + int nblocks; + int segno; + + fd = RelationGetFile(reln); + v = &Md_fdvec[fd]; + +#ifdef DIAGNOSTIC + if (_mdnblocks(v->mdfd_vfd, BLCKSZ) > RELSEG_SIZE) + elog(FATAL, "segment too big in getseg!"); +#endif + + segno = 0; + for (;;) { + if (v->mdfd_lstbcnt == RELSEG_SIZE + || (nblocks = _mdnblocks(v->mdfd_vfd, BLCKSZ)) == RELSEG_SIZE) { + + v->mdfd_lstbcnt = RELSEG_SIZE; + segno++; + + if (v->mdfd_chain == (MdfdVec *) NULL) { + v->mdfd_chain = _mdfd_openseg(reln, segno, O_CREAT); + if (v->mdfd_chain == (MdfdVec *) NULL) + elog(WARN, "cannot count blocks for %.16s -- open failed", + RelationGetRelationName(reln)); + } + + v = v->mdfd_chain; + } else { + return ((segno * RELSEG_SIZE) + nblocks); + } + } +} + +/* + * mdcommit() -- Commit a transaction. + * + * All changes to magnetic disk relations must be forced to stable + * storage. This routine makes a pass over the private table of + * file descriptors. Any descriptors to which we have done writes, + * but not synced, are synced here. + * + * Returns SM_SUCCESS or SM_FAIL with errno set as appropriate. + */ +int +mdcommit() +{ + int i; + MdfdVec *v; + + for (i = 0; i < CurFd; i++) { + for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain) { + if (v->mdfd_flags & MDFD_DIRTY) { + if (FileSync(v->mdfd_vfd) < 0) + return (SM_FAIL); + + v->mdfd_flags &= ~MDFD_DIRTY; + } + } + } + + return (SM_SUCCESS); +} + +/* + * mdabort() -- Abort a transaction. + * + * Changes need not be forced to disk at transaction abort. We mark + * all file descriptors as clean here. Always returns SM_SUCCESS. + */ +int +mdabort() +{ + int i; + MdfdVec *v; + + for (i = 0; i < CurFd; i++) { + for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain) { + v->mdfd_flags &= ~MDFD_DIRTY; + } + } + + return (SM_SUCCESS); +} + +/* + * _fdvec_ext() -- Extend the md file descriptor vector. 
+ * + * The file descriptor vector must be large enough to hold at least + * 'fd' entries. + */ +static +int _fdvec_ext() +{ + MdfdVec *nvec; + MemoryContext oldcxt; + + Nfds *= 2; + + oldcxt = MemoryContextSwitchTo(MdCxt); + + nvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec)); + memset(nvec, 0, Nfds * sizeof(MdfdVec)); + memmove(nvec, (char *) Md_fdvec, (Nfds / 2) * sizeof(MdfdVec)); + pfree(Md_fdvec); + + (void) MemoryContextSwitchTo(oldcxt); + + Md_fdvec = nvec; + + return (SM_SUCCESS); +} + +static MdfdVec * +_mdfd_openseg(Relation reln, int segno, int oflags) +{ + MemoryContext oldcxt; + MdfdVec *v; + int fd; + bool dofree; + char *path, *fullpath; + + /* be sure we have enough space for the '.segno', if any */ + path = relpath(RelationGetRelationName(reln)->data); + + dofree = false; + if (segno > 0) { + dofree = true; + fullpath = (char *) palloc(strlen(path) + 12); + sprintf(fullpath, "%s.%d", path, segno); + } else + fullpath = path; + + /* open the file */ + fd = PathNameOpenFile(fullpath, O_RDWR|oflags, 0600); + + if (dofree) + pfree(fullpath); + + if (fd < 0) + return ((MdfdVec *) NULL); + + /* allocate an mdfdvec entry for it */ + oldcxt = MemoryContextSwitchTo(MdCxt); + v = (MdfdVec *) palloc(sizeof(MdfdVec)); + (void) MemoryContextSwitchTo(oldcxt); + + /* fill the entry */ + v->mdfd_vfd = fd; + v->mdfd_flags = (uint16) 0; + v->mdfd_chain = (MdfdVec *) NULL; + v->mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ); + +#ifdef DIAGNOSTIC + if (v->mdfd_lstbcnt > RELSEG_SIZE) + elog(FATAL, "segment too big on open!"); +#endif + + /* all done */ + return (v); +} + +static MdfdVec * +_mdfd_getseg(Relation reln, int blkno, int oflag) +{ + MdfdVec *v; + int segno; + int fd; + int i; + + fd = RelationGetFile(reln); + if (fd < 0) { + if ((fd = mdopen(reln)) < 0) + elog(WARN, "cannot open relation %.16s", + RelationGetRelationName(reln)); + reln->rd_fd = fd; + } + + for (v = &Md_fdvec[fd], segno = blkno / RELSEG_SIZE, i = 1; + segno > 0; + i++, segno--) { + + if (v->mdfd_chain == (MdfdVec *) NULL) { + v->mdfd_chain = _mdfd_openseg(reln, i, oflag); + + if (v->mdfd_chain == (MdfdVec *) NULL) + elog(WARN, "cannot open segment %d of relation %.16s", + i, RelationGetRelationName(reln)); + } + v = v->mdfd_chain; + } + + return (v); +} + +static BlockNumber +_mdnblocks(File file, Size blcksz) +{ + long len; + + len = FileSeek(file, 0L, SEEK_END) - 1; + return((BlockNumber)((len < 0) ? 0 : 1 + len / blcksz)); +} diff --git a/src/backend/storage/smgr/mm.c b/src/backend/storage/smgr/mm.c new file mode 100644 index 00000000000..24a8d2472a6 --- /dev/null +++ b/src/backend/storage/smgr/mm.c @@ -0,0 +1,586 @@ +/*------------------------------------------------------------------------- + * + * mm.c-- + * main memory storage manager + * + * This code manages relations that reside in (presumably stable) + * main memory. 
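+ *
+ *    Blocks are kept in a fixed pool of MMNBUFFERS slots in shared
+ *    memory and located through a shared hash table keyed on
+ *    (database oid, relation oid, block number); a second shared hash
+ *    table records the block count of each of at most MMNRELATIONS
+ *    relations.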
+ * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/smgr/Attic/mm.c,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#ifdef MAIN_MEMORY + +#include <math.h> +#include "machine.h" +#include "storage/ipc.h" +#include "storage/smgr.h" /* where the declarations go */ +#include "storage/block.h" +#include "storage/shmem.h" +#include "storage/spin.h" + +#include "utils/hsearch.h" +#include "utils/rel.h" +#include "utils/elog.h" +#include "utils/memutils.h" + +/* + * MMCacheTag -- Unique triplet for blocks stored by the main memory + * storage manager. + */ + +typedef struct MMCacheTag { + Oid mmct_dbid; + Oid mmct_relid; + BlockNumber mmct_blkno; +} MMCacheTag; + +/* + * Shared-memory hash table for main memory relations contains + * entries of this form. + */ + +typedef struct MMHashEntry { + MMCacheTag mmhe_tag; + int mmhe_bufno; +} MMHashEntry; + +/* + * MMRelTag -- Unique identifier for each relation that is stored in the + * main-memory storage manager. + */ + +typedef struct MMRelTag { + Oid mmrt_dbid; + Oid mmrt_relid; +} MMRelTag; + +/* + * Shared-memory hash table for # blocks in main memory relations contains + * entries of this form. + */ + +typedef struct MMRelHashEntry { + MMRelTag mmrhe_tag; + int mmrhe_nblocks; +} MMRelHashEntry; + +#define MMNBUFFERS 10 +#define MMNRELATIONS 2 + +SPINLOCK MMCacheLock; +extern bool IsPostmaster; +extern Oid MyDatabaseId; + +static int *MMCurTop; +static int *MMCurRelno; +static MMCacheTag *MMBlockTags; +static char *MMBlockCache; +static HTAB *MMCacheHT; +static HTAB *MMRelCacheHT; + +int +mminit() +{ + char *mmcacheblk; + int mmsize = 0; + bool found; + HASHCTL info; + + SpinAcquire(MMCacheLock); + + mmsize += MAXALIGN(BLCKSZ * MMNBUFFERS); + mmsize += MAXALIGN(sizeof(*MMCurTop)); + mmsize += MAXALIGN(sizeof(*MMCurRelno)); + mmsize += MAXALIGN((MMNBUFFERS * sizeof(MMCacheTag))); + mmcacheblk = (char *) ShmemInitStruct("Main memory smgr", mmsize, &found); + + if (mmcacheblk == (char *) NULL) { + SpinRelease(MMCacheLock); + return (SM_FAIL); + } + + info.keysize = sizeof(MMCacheTag); + info.datasize = sizeof(int); + info.hash = tag_hash; + + MMCacheHT = (HTAB *) ShmemInitHash("Main memory store HT", + MMNBUFFERS, MMNBUFFERS, + &info, (HASH_ELEM|HASH_FUNCTION)); + + if (MMCacheHT == (HTAB *) NULL) { + SpinRelease(MMCacheLock); + return (SM_FAIL); + } + + info.keysize = sizeof(MMRelTag); + info.datasize = sizeof(int); + info.hash = tag_hash; + + MMRelCacheHT = (HTAB *) ShmemInitHash("Main memory rel HT", + MMNRELATIONS, MMNRELATIONS, + &info, (HASH_ELEM|HASH_FUNCTION)); + + if (MMRelCacheHT == (HTAB *) NULL) { + SpinRelease(MMCacheLock); + return (SM_FAIL); + } + + if (IsPostmaster) { + memset(mmcacheblk, 0, mmsize); + SpinRelease(MMCacheLock); + return (SM_SUCCESS); + } + + SpinRelease(MMCacheLock); + + MMCurTop = (int *) mmcacheblk; + mmcacheblk += sizeof(int); + MMCurRelno = (int *) mmcacheblk; + mmcacheblk += sizeof(int); + MMBlockTags = (MMCacheTag *) mmcacheblk; + mmcacheblk += (MMNBUFFERS * sizeof(MMCacheTag)); + MMBlockCache = mmcacheblk; + + return (SM_SUCCESS); +} + +int +mmshutdown() +{ + return (SM_SUCCESS); +} + +int +mmcreate(Relation reln) +{ + MMRelHashEntry *entry; + bool found; + MMRelTag tag; + + SpinAcquire(MMCacheLock); + + if (*MMCurRelno == MMNRELATIONS) { + SpinRelease(MMCacheLock); + return (SM_FAIL); + } + + (*MMCurRelno)++; + + 
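+    /*
+     * Shared relations are tagged with database id 0 so every backend
+     * keys them identically; ordinary relations are tagged with the
+     * current database's oid.
+     */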
tag.mmrt_relid = reln->rd_id; + if (reln->rd_rel->relisshared) + tag.mmrt_dbid = (Oid) 0; + else + tag.mmrt_dbid = MyDatabaseId; + + entry = (MMRelHashEntry *) hash_search(MMRelCacheHT, + (char *) &tag, HASH_ENTER, &found); + + if (entry == (MMRelHashEntry *) NULL) { + SpinRelease(MMCacheLock); + elog(FATAL, "main memory storage mgr rel cache hash table corrupt"); + } + + if (found) { + /* already exists */ + SpinRelease(MMCacheLock); + return (SM_FAIL); + } + + entry->mmrhe_nblocks = 0; + + SpinRelease(MMCacheLock); + + return (SM_SUCCESS); +} + +/* + * mmunlink() -- Unlink a relation. + */ +int +mmunlink(Relation reln) +{ + int i; + Oid reldbid; + MMHashEntry *entry; + MMRelHashEntry *rentry; + bool found; + MMRelTag rtag; + + if (reln->rd_rel->relisshared) + reldbid = (Oid) 0; + else + reldbid = MyDatabaseId; + + SpinAcquire(MMCacheLock); + + for (i = 0; i < MMNBUFFERS; i++) { + if (MMBlockTags[i].mmct_dbid == reldbid + && MMBlockTags[i].mmct_relid == reln->rd_id) { + entry = (MMHashEntry *) hash_search(MMCacheHT, + (char *) &MMBlockTags[i], + HASH_REMOVE, &found); + if (entry == (MMHashEntry *) NULL || !found) { + SpinRelease(MMCacheLock); + elog(FATAL, "mmunlink: cache hash table corrupted"); + } + MMBlockTags[i].mmct_dbid = (Oid) 0; + MMBlockTags[i].mmct_relid = (Oid) 0; + MMBlockTags[i].mmct_blkno = (BlockNumber) 0; + } + } + rtag.mmrt_dbid = reldbid; + rtag.mmrt_relid = reln->rd_id; + + rentry = (MMRelHashEntry *) hash_search(MMRelCacheHT, (char *) &rtag, + HASH_REMOVE, &found); + + if (rentry == (MMRelHashEntry *) NULL || !found) { + SpinRelease(MMCacheLock); + elog(FATAL, "mmunlink: rel cache hash table corrupted"); + } + + (*MMCurRelno)--; + + SpinRelease(MMCacheLock); + return 1; +} + +/* + * mmextend() -- Add a block to the specified relation. + * + * This routine returns SM_FAIL or SM_SUCCESS, with errno set as + * appropriate. + */ +int +mmextend(Relation reln, char *buffer) +{ + MMRelHashEntry *rentry; + MMHashEntry *entry; + int i; + Oid reldbid; + int offset; + bool found; + MMRelTag rtag; + MMCacheTag tag; + + if (reln->rd_rel->relisshared) + reldbid = (Oid) 0; + else + reldbid = MyDatabaseId; + + tag.mmct_dbid = rtag.mmrt_dbid = reldbid; + tag.mmct_relid = rtag.mmrt_relid = reln->rd_id; + + SpinAcquire(MMCacheLock); + + if (*MMCurTop == MMNBUFFERS) { + for (i = 0; i < MMNBUFFERS; i++) { + if (MMBlockTags[i].mmct_dbid == 0 && + MMBlockTags[i].mmct_relid == 0) + break; + } + if (i == MMNBUFFERS) { + SpinRelease(MMCacheLock); + return (SM_FAIL); + } + } else { + i = *MMCurTop; + (*MMCurTop)++; + } + + rentry = (MMRelHashEntry *) hash_search(MMRelCacheHT, (char *) &rtag, + HASH_FIND, &found); + if (rentry == (MMRelHashEntry *) NULL || !found) { + SpinRelease(MMCacheLock); + elog(FATAL, "mmextend: rel cache hash table corrupt"); + } + + tag.mmct_blkno = rentry->mmrhe_nblocks; + + entry = (MMHashEntry *) hash_search(MMCacheHT, (char *) &tag, + HASH_ENTER, &found); + if (entry == (MMHashEntry *) NULL || found) { + SpinRelease(MMCacheLock); + elog(FATAL, "mmextend: cache hash table corrupt"); + } + + entry->mmhe_bufno = i; + MMBlockTags[i].mmct_dbid = reldbid; + MMBlockTags[i].mmct_relid = reln->rd_id; + MMBlockTags[i].mmct_blkno = rentry->mmrhe_nblocks; + + /* page numbers are zero-based, so we increment this at the end */ + (rentry->mmrhe_nblocks)++; + + /* write the extended page */ + offset = (i * BLCKSZ); + memmove(&(MMBlockCache[offset]), buffer, BLCKSZ); + + SpinRelease(MMCacheLock); + + return (SM_SUCCESS); +} + +/* + * mmopen() -- Open the specified relation. 
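+ *
+ *      Main memory relations keep no per-backend open state, so this
+ *      always succeeds and returns 0.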
+ */
+int
+mmopen(Relation reln)
+{
+    /* automatically successful */
+    return (0);
+}
+
+/*
+ *  mmclose() -- Close the specified relation.
+ *
+ *      Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
+ */
+int
+mmclose(Relation reln)
+{
+    /* automatically successful */
+    return (SM_SUCCESS);
+}
+
+/*
+ *  mmread() -- Read the specified block from a relation.
+ *
+ *      Returns SM_SUCCESS or SM_FAIL.
+ */
+int
+mmread(Relation reln, BlockNumber blocknum, char *buffer)
+{
+    MMHashEntry *entry;
+    bool found;
+    int offset;
+    MMCacheTag tag;
+
+    if (reln->rd_rel->relisshared)
+	tag.mmct_dbid = (Oid) 0;
+    else
+	tag.mmct_dbid = MyDatabaseId;
+
+    tag.mmct_relid = reln->rd_id;
+    tag.mmct_blkno = blocknum;
+
+    SpinAcquire(MMCacheLock);
+    entry = (MMHashEntry *) hash_search(MMCacheHT, (char *) &tag,
+					HASH_FIND, &found);
+
+    if (entry == (MMHashEntry *) NULL) {
+	SpinRelease(MMCacheLock);
+	elog(FATAL, "mmread: hash table corrupt");
+    }
+
+    if (!found) {
+	/* reading nonexistent pages is defined to fill them with zeroes */
+	SpinRelease(MMCacheLock);
+	memset(buffer, 0, BLCKSZ);
+	return (SM_SUCCESS);
+    }
+
+    offset = (entry->mmhe_bufno * BLCKSZ);
+    memmove(buffer, &MMBlockCache[offset], BLCKSZ);
+
+    SpinRelease(MMCacheLock);
+
+    return (SM_SUCCESS);
+}
+
+/*
+ *  mmwrite() -- Write the supplied block at the appropriate location.
+ *
+ *      Returns SM_SUCCESS or SM_FAIL.
+ */
+int
+mmwrite(Relation reln, BlockNumber blocknum, char *buffer)
+{
+    MMHashEntry *entry;
+    bool found;
+    int offset;
+    MMCacheTag tag;
+
+    if (reln->rd_rel->relisshared)
+	tag.mmct_dbid = (Oid) 0;
+    else
+	tag.mmct_dbid = MyDatabaseId;
+
+    tag.mmct_relid = reln->rd_id;
+    tag.mmct_blkno = blocknum;
+
+    SpinAcquire(MMCacheLock);
+    entry = (MMHashEntry *) hash_search(MMCacheHT, (char *) &tag,
+					HASH_FIND, &found);
+
+    if (entry == (MMHashEntry *) NULL) {
+	SpinRelease(MMCacheLock);
+	elog(FATAL, "mmwrite: hash table corrupt");
+    }
+
+    if (!found) {
+	SpinRelease(MMCacheLock);
+	elog(FATAL, "mmwrite: hash table missing requested page");
+    }
+
+    offset = (entry->mmhe_bufno * BLCKSZ);
+    memmove(&MMBlockCache[offset], buffer, BLCKSZ);
+
+    SpinRelease(MMCacheLock);
+
+    return (SM_SUCCESS);
+}
+
+/*
+ *  mmflush() -- Synchronously write a block to stable storage.
+ *
+ *      For main-memory relations, this is exactly equivalent to mmwrite().
+ */
+int
+mmflush(Relation reln, BlockNumber blocknum, char *buffer)
+{
+    return (mmwrite(reln, blocknum, buffer));
+}
+
+/*
+ *  mmblindwrt() -- Write a block to stable storage blind.
+ *
+ *      We have to be able to do this using only the name and OID of
+ *      the database and relation in which the block belongs.
+ */
+int
+mmblindwrt(char *dbstr,
+	   char *relstr,
+	   Oid dbid,
+	   Oid relid,
+	   BlockNumber blkno,
+	   char *buffer)
+{
+    return (SM_FAIL);
+}
+
+/*
+ *  mmnblocks() -- Get the number of blocks stored in a relation.
+ *
+ *      Returns # of blocks or -1 on error.
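+ *
+ *      The count comes from the per-relation hash table entry that
+ *      mmcreate() initializes and mmextend() advances; the block cache
+ *      itself is never scanned.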
+ */ +int +mmnblocks(Relation reln) +{ + MMRelTag rtag; + MMRelHashEntry *rentry; + bool found; + int nblocks; + + if (reln->rd_rel->relisshared) + rtag.mmrt_dbid = (Oid) 0; + else + rtag.mmrt_dbid = MyDatabaseId; + + rtag.mmrt_relid = reln->rd_id; + + SpinAcquire(MMCacheLock); + + rentry = (MMRelHashEntry *) hash_search(MMRelCacheHT, (char *) &rtag, + HASH_FIND, &found); + + if (rentry == (MMRelHashEntry *) NULL) { + SpinRelease(MMCacheLock); + elog(FATAL, "mmnblocks: rel cache hash table corrupt"); + } + + if (found) + nblocks = rentry->mmrhe_nblocks; + else + nblocks = -1; + + SpinRelease(MMCacheLock); + + return (nblocks); +} + +/* + * mmcommit() -- Commit a transaction. + * + * Returns SM_SUCCESS or SM_FAIL with errno set as appropriate. + */ +int +mmcommit() +{ + return (SM_SUCCESS); +} + +/* + * mmabort() -- Abort a transaction. + */ + +int +mmabort() +{ + return (SM_SUCCESS); +} + +/* + * MMShmemSize() -- Declare amount of shared memory we require. + * + * The shared memory initialization code creates a block of shared + * memory exactly big enough to hold all the structures it needs to. + * This routine declares how much space the main memory storage + * manager will use. + */ +int +MMShmemSize() +{ + int size = 0; + int nbuckets; + int nsegs; + int tmp; + + /* + * first compute space occupied by the (dbid,relid,blkno) hash table + */ + + nbuckets = 1 << (int)my_log2((MMNBUFFERS - 1) / DEF_FFACTOR + 1); + nsegs = 1 << (int)my_log2((nbuckets - 1) / DEF_SEGSIZE + 1); + + size += MAXALIGN(my_log2(MMNBUFFERS) * sizeof(void *)); + size += MAXALIGN(sizeof(HHDR)); + size += nsegs * MAXALIGN(DEF_SEGSIZE * sizeof(SEGMENT)); + tmp = (int)ceil((double)MMNBUFFERS/BUCKET_ALLOC_INCR); + size += tmp * BUCKET_ALLOC_INCR * + (MAXALIGN(sizeof(BUCKET_INDEX)) + + MAXALIGN(sizeof(MMHashEntry))); /* contains hash key */ + + /* + * now do the same for the rel hash table + */ + + size += MAXALIGN(my_log2(MMNRELATIONS) * sizeof(void *)); + size += MAXALIGN(sizeof(HHDR)); + size += nsegs * MAXALIGN(DEF_SEGSIZE * sizeof(SEGMENT)); + tmp = (int)ceil((double)MMNRELATIONS/BUCKET_ALLOC_INCR); + size += tmp * BUCKET_ALLOC_INCR * + (MAXALIGN(sizeof(BUCKET_INDEX)) + + MAXALIGN(sizeof(MMRelHashEntry))); /* contains hash key */ + + /* + * finally, add in the memory block we use directly + */ + + size += MAXALIGN(BLCKSZ * MMNBUFFERS); + size += MAXALIGN(sizeof(*MMCurTop)); + size += MAXALIGN(sizeof(*MMCurRelno)); + size += MAXALIGN(MMNBUFFERS * sizeof(MMCacheTag)); + + return (size); +} + +#endif /* MAIN_MEMORY */ diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c new file mode 100644 index 00000000000..426c3d93480 --- /dev/null +++ b/src/backend/storage/smgr/smgr.c @@ -0,0 +1,371 @@ +/*------------------------------------------------------------------------- + * + * smgr.c-- + * public interface routines to storage manager switch. + * + * All file system operations in POSTGRES dispatch through these + * routines. 
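+ *
+ *    Each routine takes a storage manager number ("which") as its first
+ *    argument and dispatches through the smgrsw[] function-pointer
+ *    table below.  smgrin() in smgrtype.c maps a manager name such as
+ *    "magnetic disk" to that number (0 is the magnetic disk manager),
+ *    so a caller reads a block with, e.g.,
+ *
+ *        smgrread(which, reln, blocknum, buffer);
+ *
+ *    (illustrative call only; the real callers are in the buffer
+ *    manager, as noted at smgrread() below).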
+ * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include <string.h> +#include "postgres.h" + +#include "machine.h" +#include "storage/ipc.h" +#include "storage/smgr.h" +#include "storage/block.h" +#include "utils/rel.h" +#include "utils/elog.h" +#include "utils/palloc.h" + +typedef struct f_smgr { + int (*smgr_init)(); /* may be NULL */ + int (*smgr_shutdown)(); /* may be NULL */ + int (*smgr_create)(); + int (*smgr_unlink)(); + int (*smgr_extend)(); + int (*smgr_open)(); + int (*smgr_close)(); + int (*smgr_read)(); + int (*smgr_write)(); + int (*smgr_flush)(); + int (*smgr_blindwrt)(); + int (*smgr_nblocks)(); + int (*smgr_commit)(); /* may be NULL */ + int (*smgr_abort)(); /* may be NULL */ +} f_smgr; + +/* + * The weird placement of commas in this init block is to keep the compiler + * happy, regardless of what storage managers we have (or don't have). + */ + +static f_smgr smgrsw[] = { + + /* magnetic disk */ + { mdinit, NULL, mdcreate, mdunlink, mdextend, mdopen, mdclose, + mdread, mdwrite, mdflush, mdblindwrt, mdnblocks, mdcommit, mdabort }, + +#ifdef MAIN_MEMORY + /* main memory */ + { mminit, mmshutdown, mmcreate, mmunlink, mmextend, mmopen, mmclose, + mmread, mmwrite, mmflush, mmblindwrt, mmnblocks, mmcommit, mmabort }, + +#endif /* MAIN_MEMORY */ +}; + +/* + * This array records which storage managers are write-once, and which + * support overwrite. A 'true' entry means that the storage manager is + * write-once. In the best of all possible worlds, there would be no + * write-once storage managers. + */ + +static bool smgrwo[] = { + false, /* magnetic disk */ +#ifdef MAIN_MEMORY + false, /* main memory*/ +#endif /* MAIN_MEMORY */ +}; +static int NSmgr = lengthof(smgrsw); + +/* + * smgrinit(), smgrshutdown() -- Initialize or shut down all storage + * managers. + * + */ +int +smgrinit() +{ + int i; + extern char *smgrout(); + + for (i = 0; i < NSmgr; i++) { + if (smgrsw[i].smgr_init) { + if ((*(smgrsw[i].smgr_init))() == SM_FAIL) + elog(FATAL, "initialization failed on %s", smgrout(i)); + } + } + + /* register the shutdown proc */ + on_exitpg(smgrshutdown, 0); + + return (SM_SUCCESS); +} + +void +smgrshutdown(int dummy) +{ + int i; + extern char *smgrout(); + + for (i = 0; i < NSmgr; i++) { + if (smgrsw[i].smgr_shutdown) { + if ((*(smgrsw[i].smgr_shutdown))() == SM_FAIL) + elog(FATAL, "shutdown failed on %s", smgrout(i)); + } + } +} + +/* + * smgrcreate() -- Create a new relation. + * + * This routine takes a reldesc, creates the relation on the appropriate + * device, and returns a file descriptor for it. + */ +int +smgrcreate(int16 which, Relation reln) +{ + int fd; + + if ((fd = (*(smgrsw[which].smgr_create))(reln)) < 0) + elog(WARN, "cannot open %.*s", + NAMEDATALEN, &(reln->rd_rel->relname.data[0])); + + return (fd); +} + +/* + * smgrunlink() -- Unlink a relation. + * + * The relation is removed from the store. + */ +int +smgrunlink(int16 which, Relation reln) +{ + int status; + + if ((status = (*(smgrsw[which].smgr_unlink))(reln)) == SM_FAIL) + elog(WARN, "cannot unlink %.*s", + NAMEDATALEN, &(reln->rd_rel->relname.data[0])); + + return (status); +} + +/* + * smgrextend() -- Add a new block to a file. + * + * Returns SM_SUCCESS on success; aborts the current transaction on + * failure. 
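+ *
+ *      The buffer becomes the new last block of the relation; a caller
+ *      that needs the new block's number can take the prior count from
+ *      smgrnblocks() before extending.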
+ */ +int +smgrextend(int16 which, Relation reln, char *buffer) +{ + int status; + + status = (*(smgrsw[which].smgr_extend))(reln, buffer); + + if (status == SM_FAIL) + elog(WARN, "%.*s: cannot extend", + NAMEDATALEN, &(reln->rd_rel->relname.data[0])); + + return (status); +} + +/* + * smgropen() -- Open a relation using a particular storage manager. + * + * Returns the fd for the open relation on success, aborts the + * transaction on failure. + */ +int +smgropen(int16 which, Relation reln) +{ + int fd; + + if ((fd = (*(smgrsw[which].smgr_open))(reln)) < 0) + elog(WARN, "cannot open %.*s", + NAMEDATALEN, &(reln->rd_rel->relname.data[0])); + + return (fd); +} + +/* + * smgrclose() -- Close a relation. + * + * Returns SM_SUCCESS on success, aborts on failure. + */ +int +smgrclose(int16 which, Relation reln) +{ + if ((*(smgrsw[which].smgr_close))(reln) == SM_FAIL) + elog(WARN, "cannot close %.*s", + NAMEDATALEN, &(reln->rd_rel->relname.data[0])); + + return (SM_SUCCESS); +} + +/* + * smgrread() -- read a particular block from a relation into the supplied + * buffer. + * + * This routine is called from the buffer manager in order to + * instantiate pages in the shared buffer cache. All storage managers + * return pages in the format that POSTGRES expects. This routine + * dispatches the read. On success, it returns SM_SUCCESS. On failure, + * the current transaction is aborted. + */ +int +smgrread(int16 which, Relation reln, BlockNumber blocknum, char *buffer) +{ + int status; + + status = (*(smgrsw[which].smgr_read))(reln, blocknum, buffer); + + if (status == SM_FAIL) + elog(WARN, "cannot read block %d of %.*s", + blocknum, NAMEDATALEN, &(reln->rd_rel->relname.data[0])); + + return (status); +} + +/* + * smgrwrite() -- Write the supplied buffer out. + * + * This is not a synchronous write -- the interface for that is + * smgrflush(). The buffer is written out via the appropriate + * storage manager. This routine returns SM_SUCCESS or aborts + * the current transaction. + */ +int +smgrwrite(int16 which, Relation reln, BlockNumber blocknum, char *buffer) +{ + int status; + + status = (*(smgrsw[which].smgr_write))(reln, blocknum, buffer); + + if (status == SM_FAIL) + elog(WARN, "cannot write block %d of %.*s", + blocknum, NAMEDATALEN, &(reln->rd_rel->relname.data[0])); + + return (status); +} + +/* + * smgrflush() -- A synchronous smgrwrite(). + */ +int +smgrflush(int16 which, Relation reln, BlockNumber blocknum, char *buffer) +{ + int status; + + status = (*(smgrsw[which].smgr_flush))(reln, blocknum, buffer); + + if (status == SM_FAIL) + elog(WARN, "cannot flush block %d of %.*s to stable store", + blocknum, NAMEDATALEN, &(reln->rd_rel->relname.data[0])); + + return (status); +} + +/* + * smgrblindwrt() -- Write a page out blind. + * + * In some cases, we may find a page in the buffer cache that we + * can't make a reldesc for. This happens, for example, when we + * want to reuse a dirty page that was written by a transaction + * that has not yet committed, which created a new relation. In + * this case, the buffer manager will call smgrblindwrt() with + * the name and OID of the database and the relation to which the + * buffer belongs. Every storage manager must be able to force + * this page down to stable storage in this circumstance. 
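+ *
+ *      The database and relation names are pstrdup'd before the call.
+ *      In this release only the magnetic disk manager can honor a
+ *      blind write; mmblindwrt() simply returns SM_FAIL.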
+ */ +int +smgrblindwrt(int16 which, + char *dbname, + char *relname, + Oid dbid, + Oid relid, + BlockNumber blkno, + char *buffer) +{ + char *dbstr; + char *relstr; + int status; + + dbstr = pstrdup(dbname); + relstr = pstrdup(relname); + + status = (*(smgrsw[which].smgr_blindwrt))(dbstr, relstr, dbid, relid, + blkno, buffer); + + if (status == SM_FAIL) + elog(WARN, "cannot write block %d of %s [%s] blind", + blkno, relstr, dbstr); + + pfree(dbstr); + pfree(relstr); + + return (status); +} + +/* + * smgrnblocks() -- Calculate the number of POSTGRES blocks in the + * supplied relation. + * + * Returns the number of blocks on success, aborts the current + * transaction on failure. + */ +int +smgrnblocks(int16 which, Relation reln) +{ + int nblocks; + + if ((nblocks = (*(smgrsw[which].smgr_nblocks))(reln)) < 0) + elog(WARN, "cannot count blocks for %.*s", + NAMEDATALEN, &(reln->rd_rel->relname.data[0])); + + return (nblocks); +} + +/* + * smgrcommit(), smgrabort() -- Commit or abort changes made during the + * current transaction. + */ +int +smgrcommit() +{ + int i; + extern char *smgrout(); + + for (i = 0; i < NSmgr; i++) { + if (smgrsw[i].smgr_commit) { + if ((*(smgrsw[i].smgr_commit))() == SM_FAIL) + elog(FATAL, "transaction commit failed on %s", smgrout(i)); + } + } + + return (SM_SUCCESS); +} + +int +smgrabort() +{ + int i; + extern char *smgrout(); + + for (i = 0; i < NSmgr; i++) { + if (smgrsw[i].smgr_abort) { + if ((*(smgrsw[i].smgr_abort))() == SM_FAIL) + elog(FATAL, "transaction abort failed on %s", smgrout(i)); + } + } + + return (SM_SUCCESS); +} + +bool +smgriswo(int16 smgrno) +{ + if (smgrno < 0 || smgrno >= NSmgr) + elog(WARN, "illegal storage manager number %d", smgrno); + + return (smgrwo[smgrno]); +} diff --git a/src/backend/storage/smgr/smgrtype.c b/src/backend/storage/smgr/smgrtype.c new file mode 100644 index 00000000000..5c90d590914 --- /dev/null +++ b/src/backend/storage/smgr/smgrtype.c @@ -0,0 +1,82 @@ +/*------------------------------------------------------------------------- + * + * smgrtype.c-- + * storage manager type + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgrtype.c,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include <string.h> +#include "postgres.h" + +#include "utils/builtins.h" /* where the declarations go */ +#include "utils/elog.h" +#include "utils/palloc.h" +#include "storage/smgr.h" + +typedef struct smgrid { + char *smgr_name; +} smgrid; + +/* + * StorageManager[] -- List of defined storage managers. + * + * The weird comma placement is to keep compilers happy no matter + * which of these is (or is not) defined. 
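+ *
+ *      The order of entries must match smgrsw[] in smgr.c: the index
+ *      returned by smgrin() is the storage manager number used to
+ *      dispatch every smgr call.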
+ */ + +static smgrid StorageManager[] = { + {"magnetic disk"}, +#ifdef MAIN_MEMORY + {"main memory"} +#endif /* MAIN_MEMORY */ +}; + +static int NStorageManagers = lengthof(StorageManager); + +int2 +smgrin(char *s) +{ + int i; + + for (i = 0; i < NStorageManagers; i++) { + if (strcmp(s, StorageManager[i].smgr_name) == 0) + return((int2) i); + } + elog(WARN, "smgrin: illegal storage manager name %s", s); + return 0; +} + +char * +smgrout(int2 i) +{ + char *s; + + if (i >= NStorageManagers || i < 0) + elog(WARN, "Illegal storage manager id %d", i); + + s = (char *) palloc(strlen(StorageManager[i].smgr_name) + 1); + strcpy(s, StorageManager[i].smgr_name); + return (s); +} + +bool +smgreq(int2 a, int2 b) +{ + if (a == b) + return (true); + return (false); +} + +bool +smgrne(int2 a, int2 b) +{ + if (a == b) + return (false); + return (true); +} diff --git a/src/backend/storage/spin.h b/src/backend/storage/spin.h new file mode 100644 index 00000000000..32037684ec1 --- /dev/null +++ b/src/backend/storage/spin.h @@ -0,0 +1,38 @@ +/*------------------------------------------------------------------------- + * + * spin.h-- + * synchronization routines + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: spin.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef SPIN_H +#define SPIN_H + +#include "ipc.h" + +/* + * two implementations of spin locks + * + * sequent, sparc, sun3: real spin locks. uses a TAS instruction; see + * src/storage/ipc/s_lock.c for details. + * + * default: fake spin locks using semaphores. see spin.c + * + */ + +typedef int SPINLOCK; + +extern bool CreateSpinlocks(IPCKey key); +extern bool AttachSpinLocks(IPCKey key); +extern bool InitSpinLocks(int init, IPCKey key); + +extern void SpinAcquire(SPINLOCK lock); +extern void SpinRelease(SPINLOCK lock); +extern bool SpinIsLocked(SPINLOCK lock); + +#endif /* SPIN_H */ |