path: root/src/backend/storage
Diffstat (limited to 'src/backend/storage')
-rw-r--r--   src/backend/storage/Makefile.inc   31
-rw-r--r--   src/backend/storage/backendid.h   32
-rw-r--r--   src/backend/storage/block.h   114
-rw-r--r--   src/backend/storage/buf.h   47
-rw-r--r--   src/backend/storage/buf_internals.h   220
-rw-r--r--   src/backend/storage/buffer/Makefile.inc   16
-rw-r--r--   src/backend/storage/buffer/buf_init.c   280
-rw-r--r--   src/backend/storage/buffer/buf_table.c   162
-rw-r--r--   src/backend/storage/buffer/bufmgr.c   1581
-rw-r--r--   src/backend/storage/buffer/freelist.c   285
-rw-r--r--   src/backend/storage/buffer/localbuf.c   284
-rw-r--r--   src/backend/storage/bufmgr.h   112
-rw-r--r--   src/backend/storage/bufpage.h   256
-rw-r--r--   src/backend/storage/fd.h   96
-rw-r--r--   src/backend/storage/file/Makefile.inc   14
-rw-r--r--   src/backend/storage/file/fd.c   888
-rw-r--r--   src/backend/storage/ipc.h   285
-rw-r--r--   src/backend/storage/ipc/Makefile.inc   15
-rw-r--r--   src/backend/storage/ipc/README   31
-rw-r--r--   src/backend/storage/ipc/ipc.c   718
-rw-r--r--   src/backend/storage/ipc/ipci.c   149
-rw-r--r--   src/backend/storage/ipc/s_lock.c   440
-rw-r--r--   src/backend/storage/ipc/shmem.c   561
-rw-r--r--   src/backend/storage/ipc/shmqueue.c   251
-rw-r--r--   src/backend/storage/ipc/sinval.c   169
-rw-r--r--   src/backend/storage/ipc/sinvaladt.c   797
-rw-r--r--   src/backend/storage/ipc/spin.c   247
-rw-r--r--   src/backend/storage/item.h   20
-rw-r--r--   src/backend/storage/itemid.h   75
-rw-r--r--   src/backend/storage/itempos.h   44
-rw-r--r--   src/backend/storage/itemptr.h   115
-rw-r--r--   src/backend/storage/large_object.h   58
-rw-r--r--   src/backend/storage/large_object/Makefile.inc   14
-rw-r--r--   src/backend/storage/large_object/inv_api.c   1165
-rw-r--r--   src/backend/storage/lmgr.h   84
-rw-r--r--   src/backend/storage/lmgr/Makefile.inc   14
-rw-r--r--   src/backend/storage/lmgr/README   93
-rw-r--r--   src/backend/storage/lmgr/lmgr.c   933
-rw-r--r--   src/backend/storage/lmgr/lock.c   1020
-rw-r--r--   src/backend/storage/lmgr/multi.c   415
-rw-r--r--   src/backend/storage/lmgr/proc.c   826
-rw-r--r--   src/backend/storage/lmgr/single.c   86
-rw-r--r--   src/backend/storage/lock.h   218
-rw-r--r--   src/backend/storage/multilev.h   64
-rw-r--r--   src/backend/storage/off.h   60
-rw-r--r--   src/backend/storage/page.h   26
-rw-r--r--   src/backend/storage/page/Makefile.inc   16
-rw-r--r--   src/backend/storage/page/bufpage.c   519
-rw-r--r--   src/backend/storage/page/itemptr.c   40
-rw-r--r--   src/backend/storage/pagenum.h   33
-rw-r--r--   src/backend/storage/pos.h   64
-rw-r--r--   src/backend/storage/proc.h   127
-rw-r--r--   src/backend/storage/shmem.h   104
-rw-r--r--   src/backend/storage/sinval.h   33
-rw-r--r--   src/backend/storage/sinvaladt.h   126
-rw-r--r--   src/backend/storage/smgr.h   84
-rw-r--r--   src/backend/storage/smgr/Makefile.inc   14
-rw-r--r--   src/backend/storage/smgr/README   40
-rw-r--r--   src/backend/storage/smgr/md.c   697
-rw-r--r--   src/backend/storage/smgr/mm.c   586
-rw-r--r--   src/backend/storage/smgr/smgr.c   371
-rw-r--r--   src/backend/storage/smgr/smgrtype.c   82
-rw-r--r--   src/backend/storage/spin.h   38
63 files changed, 16385 insertions, 0 deletions
diff --git a/src/backend/storage/Makefile.inc b/src/backend/storage/Makefile.inc
new file mode 100644
index 00000000000..aef287ca71a
--- /dev/null
+++ b/src/backend/storage/Makefile.inc
@@ -0,0 +1,31 @@
+#-------------------------------------------------------------------------
+#
+# Makefile.inc--
+# Makefile for the storage modules
+#
+# Copyright (c) 1994, Regents of the University of California
+#
+#
+# IDENTIFICATION
+# $Header: /cvsroot/pgsql/src/backend/storage/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
+#
+#-------------------------------------------------------------------------
+
+stordir= $(CURDIR)/storage
+VPATH:= $(VPATH):$(stordir):$(stordir)/buffer:$(stordir)/file:$(stordir)/ipc:\
+ $(stordir)/large_object:$(stordir)/lmgr:$(stordir)/page:$(stordir)/smgr
+
+SUBSRCS=
+include $(stordir)/buffer/Makefile.inc
+include $(stordir)/file/Makefile.inc
+include $(stordir)/ipc/Makefile.inc
+include $(stordir)/large_object/Makefile.inc
+include $(stordir)/lmgr/Makefile.inc
+include $(stordir)/page/Makefile.inc
+include $(stordir)/smgr/Makefile.inc
+SRCS_STORAGE:= $(SUBSRCS)
+
+HEADERS+= backendid.h block.h buf.h buf_internals.h bufmgr.h bufpage.h \
+ fd.h ipc.h item.h itemid.h itempos.h \
+ itemptr.h large_object.h lmgr.h lock.h multilev.h off.h page.h \
+ pagenum.h pos.h proc.h shmem.h sinval.h sinvaladt.h smgr.h spin.h
diff --git a/src/backend/storage/backendid.h b/src/backend/storage/backendid.h
new file mode 100644
index 00000000000..eb874bbad79
--- /dev/null
+++ b/src/backend/storage/backendid.h
@@ -0,0 +1,32 @@
+/*-------------------------------------------------------------------------
+ *
+ * backendid.h--
+ * POSTGRES backend id communication definitions
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: backendid.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef BACKENDID_H
+#define BACKENDID_H
+
+/* ----------------
+ * pulled out of sinval.h to temporarily reduce #include nesting.
+ * -cim 8/17/90
+ * ----------------
+ */
+typedef int16 BackendId; /* unique currently active backend identifier */
+
+#define InvalidBackendId (-1)
+
+typedef int32 BackendTag; /* unique backend identifier */
+
+#define InvalidBackendTag (-1)
+
+extern BackendId MyBackendId; /* backend id of this backend */
+extern BackendTag MyBackendTag; /* backend tag of this backend */
+
+#endif /* BACKENDID_H */
diff --git a/src/backend/storage/block.h b/src/backend/storage/block.h
new file mode 100644
index 00000000000..5c006aa9d90
--- /dev/null
+++ b/src/backend/storage/block.h
@@ -0,0 +1,114 @@
+/*-------------------------------------------------------------------------
+ *
+ * block.h--
+ * POSTGRES disk block definitions.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: block.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef BLOCK_H
+#define BLOCK_H
+
+#include "c.h"
+
+/*
+ * BlockNumber:
+ *
+ * each data file (heap or index) is divided into postgres disk blocks
+ * (which may be thought of as the unit of i/o -- a postgres buffer
+ * contains exactly one disk block). the blocks are numbered
+ * sequentially, 0 to 0xFFFFFFFE.
+ *
+ * InvalidBlockNumber is the same thing as P_NEW in bufmgr.h.
+ *
+ * the access methods, the buffer manager and the storage manager are
+ * more or less the only pieces of code that should be accessing disk
+ * blocks directly.
+ */
+typedef uint32 BlockNumber;
+
+#define InvalidBlockNumber ((BlockNumber) 0xFFFFFFFF)
+
+/*
+ * BlockId:
+ *
+ * this is a storage type for BlockNumber. in other words, this type
+ * is used for on-disk structures (e.g., in HeapTupleData) whereas
+ * BlockNumber is the type on which calculations are performed (e.g.,
+ * in access method code).
+ *
+ * there doesn't appear to be any reason to have separate types except
+ * for the fact that BlockIds can be SHORTALIGN'd (and therefore any
+ * structure that contains them, such as ItemPointerData, can also be
+ * SHORTALIGN'd). this is an important consideration for reducing the
+ * space requirements of the line pointer (ItemIdData) array on each
+ * page and the header of each heap or index tuple, so it doesn't seem
+ * wise to change this without good reason.
+ */
+typedef struct BlockIdData {
+ uint16 bi_hi;
+ uint16 bi_lo;
+} BlockIdData;
+
+typedef BlockIdData *BlockId; /* block identifier */
+
+/* ----------------
+ * support macros
+ * ----------------
+ */
+
+/*
+ * BlockNumberIsValid --
+ * True iff blockNumber is valid.
+ */
+#define BlockNumberIsValid(blockNumber) \
+ ((bool) ((int32) (blockNumber) != InvalidBlockNumber))
+
+/*
+ * BlockIdIsValid --
+ * True iff the block identifier is valid.
+ */
+#define BlockIdIsValid(blockId) \
+ ((bool) PointerIsValid(blockId))
+
+/*
+ * BlockIdSet --
+ * Sets a block identifier to the specified value.
+ */
+#define BlockIdSet(blockId, blockNumber) \
+ Assert(PointerIsValid(blockId)); \
+ (blockId)->bi_hi = (blockNumber) >> 16; \
+ (blockId)->bi_lo = (blockNumber) & 0xffff
+
+/*
+ * BlockIdCopy --
+ * Copy a block identifier.
+ */
+#define BlockIdCopy(toBlockId, fromBlockId) \
+ Assert(PointerIsValid(toBlockId)); \
+ Assert(PointerIsValid(fromBlockId)); \
+ (toBlockId)->bi_hi = (fromBlockId)->bi_hi; \
+ (toBlockId)->bi_lo = (fromBlockId)->bi_lo
+
+/*
+ * BlockIdEquals --
+ * Check for block number equality.
+ */
+#define BlockIdEquals(blockId1, blockId2) \
+ ((blockId1)->bi_hi == (blockId2)->bi_hi && \
+ (blockId1)->bi_lo == (blockId2)->bi_lo)
+
+/*
+ * BlockIdGetBlockNumber --
+ * Retrieve the block number from a block identifier.
+ */
+#define BlockIdGetBlockNumber(blockId) \
+ (AssertMacro(BlockIdIsValid(blockId)) ? \
+ (BlockNumber) (((blockId)->bi_hi << 16) | ((uint16) (blockId)->bi_lo)) : \
+ (BlockNumber) InvalidBlockNumber)
+
+#endif /* BLOCK_H */
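
As the comments above note, BlockIdData exists only so that a 32-bit block number can be stored with 2-byte alignment. The following standalone sketch (hypothetical names, not part of block.h) shows the same pack/unpack arithmetic that BlockIdSet and BlockIdGetBlockNumber perform:

    #include <assert.h>
    #include <stdint.h>

    /* hypothetical stand-in for BlockIdData: two 16-bit halves, 2-byte alignment */
    typedef struct {
        uint16_t bi_hi;
        uint16_t bi_lo;
    } DemoBlockId;

    /* same arithmetic as BlockIdSet() */
    static void demo_block_id_set(DemoBlockId *id, uint32_t blockNum)
    {
        id->bi_hi = (uint16_t) (blockNum >> 16);
        id->bi_lo = (uint16_t) (blockNum & 0xffff);
    }

    /* same arithmetic as BlockIdGetBlockNumber() */
    static uint32_t demo_block_id_get(const DemoBlockId *id)
    {
        return ((uint32_t) id->bi_hi << 16) | id->bi_lo;
    }

    int main(void)
    {
        DemoBlockId id;
        demo_block_id_set(&id, 131070);           /* 0x0001fffe */
        assert(demo_block_id_get(&id) == 131070); /* round-trips exactly */
        return 0;
    }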
diff --git a/src/backend/storage/buf.h b/src/backend/storage/buf.h
new file mode 100644
index 00000000000..73582e8a61c
--- /dev/null
+++ b/src/backend/storage/buf.h
@@ -0,0 +1,47 @@
+/*-------------------------------------------------------------------------
+ *
+ * buf.h--
+ * Basic buffer manager data types.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: buf.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef BUF_H
+#define BUF_H
+
+#define InvalidBuffer (0)
+#define UnknownBuffer (-99999)
+
+typedef long Buffer;
+
+/*
+ * BufferIsInvalid --
+ * True iff the buffer is invalid.
+ */
+#define BufferIsInvalid(buffer) ((buffer) == InvalidBuffer)
+
+/*
+ * BufferIsUnknown --
+ * True iff the buffer is unknown.
+ */
+#define BufferIsUnknown(buffer) ((buffer) == UnknownBuffer)
+
+/*
+ * BufferIsLocal --
+ * True iff the buffer is local (not visible to other servers).
+ */
+#define BufferIsLocal(buffer) ((buffer) < 0)
+
+/*
+ * If NO_BUFFERISVALID is defined, all error checking using BufferIsValid()
+ * is suppressed. Decision-making using BufferIsValid is not affected.
+ * This should be set only if one is sure there will be no errors.
+ * - plai 9/10/90
+ */
+#undef NO_BUFFERISVALID
+
+#endif /* BUF_H */
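
A Buffer is just a signed index whose sign encodes which pool it lives in: zero is InvalidBuffer, positive values are 1-based slots in the shared descriptor array, and negative values are backend-local buffers (see BufferIsLocal above and the indexing in bufmgr.c below). A standalone sketch of that convention, with illustrative names only:

    #include <stdio.h>

    typedef long Buffer;    /* same representation as in buf.h */

    /* classify a Buffer and compute the array slot it refers to */
    static const char *demo_classify(Buffer buffer, long *slot)
    {
        if (buffer == 0) {          /* InvalidBuffer */
            *slot = -1;
            return "invalid";
        }
        if (buffer < 0) {           /* BufferIsLocal(): backend-local buffer */
            *slot = -buffer - 1;    /* index into the local descriptor array */
            return "local";
        }
        *slot = buffer - 1;         /* index into the shared descriptor array */
        return "shared";
    }

    int main(void)
    {
        Buffer examples[] = { 0, 1, 42, -1, -3 };
        int i;

        for (i = 0; i < 5; i++) {
            long slot;
            printf("buffer %ld -> %s, slot %ld\n",
                   examples[i], demo_classify(examples[i], &slot), slot);
        }
        return 0;
    }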
diff --git a/src/backend/storage/buf_internals.h b/src/backend/storage/buf_internals.h
new file mode 100644
index 00000000000..84583867faf
--- /dev/null
+++ b/src/backend/storage/buf_internals.h
@@ -0,0 +1,220 @@
+/*-------------------------------------------------------------------------
+ *
+ * buf_internals.h--
+ * Internal definitions.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: buf_internals.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
+ *
+ * NOTE
+ * If BUFFERPAGE0 is defined, then 0 will be used as a
+ * valid buffer page number.
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef BUFMGR_INTERNALS_H
+#define BUFMGR_INTERNALS_H
+
+#include "postgres.h"
+#include "storage/buf.h"
+#include "storage/ipc.h"
+#include "storage/shmem.h"
+#include "miscadmin.h"
+#include "storage/lmgr.h"
+#include "utils/rel.h"
+#include "utils/relcache.h"
+
+/* Buf Mgr constants */
+/* in bufmgr.c */
+extern int NBuffers;
+extern int Data_Descriptors;
+extern int Free_List_Descriptor;
+extern int Lookup_List_Descriptor;
+extern int Num_Descriptors;
+
+/*
+ * Flags for buffer descriptors
+ */
+#define BM_DIRTY (1 << 0)
+#define BM_PRIVATE (1 << 1)
+#define BM_VALID (1 << 2)
+#define BM_DELETED (1 << 3)
+#define BM_FREE (1 << 4)
+#define BM_IO_IN_PROGRESS (1 << 5)
+#define BM_IO_ERROR (1 << 6)
+
+typedef bits16 BufFlags;
+
+typedef struct sbufdesc BufferDesc;
+typedef struct sbufdesc BufferHdr;
+typedef struct buftag BufferTag;
+/* long * so alignment will be correct */
+typedef long **BufferBlock;
+
+struct buftag{
+ LRelId relId;
+ BlockNumber blockNum; /* blknum relative to begin of reln */
+};
+
+#define CLEAR_BUFFERTAG(a)\
+ (a)->relId.dbId = InvalidOid; \
+ (a)->relId.relId = InvalidOid; \
+ (a)->blockNum = InvalidBlockNumber
+
+#define INIT_BUFFERTAG(a,xx_reln,xx_blockNum) \
+{ \
+ (a)->blockNum = xx_blockNum;\
+ (a)->relId = RelationGetLRelId(xx_reln); \
+}
+
+#define COPY_BUFFERTAG(a,b)\
+{ \
+ (a)->blockNum = (b)->blockNum;\
+ LRelIdAssign(*(a),*(b));\
+}
+
+#define EQUAL_BUFFERTAG(a,b) \
+ (((a)->blockNum == (b)->blockNum) &&\
+ (OID_Equal((a)->relId.relId,(b)->relId.relId)))
+
+
+#define BAD_BUFFER_ID(bid) ((bid<1) || (bid>(NBuffers)))
+#define INVALID_DESCRIPTOR (-3)
+
+/*
+ * bletch hack -- anyplace that we declare space for relation or
+ * database names, we just use '16', not a symbolic constant, to
+ * specify their lengths. BM_NAMESIZE is the length of these names,
+ * and is used in the buffer manager code. somebody with lots of
+ * spare time should do this for all the other modules, too.
+ */
+#define BM_NAMESIZE 16
+
+/*
+ * struct sbufdesc -- shared buffer cache metadata for a single
+ * shared buffer descriptor.
+ *
+ * We keep the name of the database and relation in which this
+ * buffer appears in order to avoid a catalog lookup on cache
+ * flush if we don't have the reldesc in the cache. It is also
+ * possible that the relation to which this buffer belongs is
+ * not visible to all backends at the time that it gets flushed.
+ * Dbname, relname, dbid, and relid are enough to determine where
+ * to put the buffer, for all storage managers.
+ */
+
+struct sbufdesc {
+ Buffer freeNext; /* link for freelist chain */
+ Buffer freePrev;
+ SHMEM_OFFSET data; /* pointer to data in buf pool */
+
+ /* tag and id must be together for table lookup to work */
+ BufferTag tag; /* file/block identifier */
+ int buf_id; /* maps global desc to local desc */
+
+ BufFlags flags; /* described below */
+ int16 bufsmgr; /* storage manager id for buffer */
+ unsigned refcount; /* # of times buffer is pinned */
+
+ char *sb_dbname; /* name of db in which buf belongs */
+ char *sb_relname; /* name of reln */
+#ifdef HAS_TEST_AND_SET
+ /* can afford a dedicated lock if test-and-set locks are available */
+ slock_t io_in_progress_lock;
+#endif /* HAS_TEST_AND_SET */
+
+ /*
+ * I padded this structure to a power of 2 (128 bytes on a MIPS) because
+ * BufferDescriptorGetBuffer is called a billion times and it does an
+ * C pointer subtraction (i.e., "x - y" -> array index of x relative
+ * to y, which is calculated using division by struct size). Integer
+ * ".div" hits you for 35 cycles, as opposed to a 1-cycle "sra" ...
+ * this hack cut 10% off of the time to create the Wisconsin database!
+ * It eats up more shared memory, of course, but we're (allegedly)
+ * going to make some of these types bigger soon anyway... -pma 1/2/93
+ */
+#if defined(PORTNAME_ultrix4)
+ char sb_pad[60]; /* no slock_t */
+#endif /* mips */
+#if defined(PORTNAME_sparc) || defined(PORTNAME_sparc_solaris) || defined(PORTNAME_irix5)
+ char sb_pad[56]; /* has slock_t */
+#endif /* sparc || irix5 */
+#if defined(PORTNAME_hpux)
+ char sb_pad[44]; /* has slock_t */
+#endif /* hpux */
+#if defined(PORTNAME_alpha)
+ char sb_pad[40]; /* has slock_t */
+#endif /* alpha */
+};
+
+/*
+ * mao tracing buffer allocation
+ */
+
+/*#define BMTRACE*/
+#ifdef BMTRACE
+
+typedef struct _bmtrace {
+ int bmt_pid;
+ long bmt_buf;
+ long bmt_dbid;
+ long bmt_relid;
+ int bmt_blkno;
+ int bmt_op;
+
+#define BMT_NOTUSED 0
+#define BMT_ALLOCFND 1
+#define BMT_ALLOCNOTFND 2
+#define BMT_DEALLOC 3
+
+} bmtrace;
+
+#endif /* BMTRACE */
+
+
+/*
+ * Bufmgr Interface:
+ */
+
+/* Internal routines: only called by buf.c */
+
+/*freelist.c*/
+extern void AddBufferToFreelist(BufferDesc *bf);
+extern void PinBuffer(BufferDesc *buf);
+extern void PinBuffer_Debug(char *file, int line, BufferDesc *buf);
+extern void UnpinBuffer(BufferDesc *buf);
+extern void UnpinBuffer_Debug(char *file, int line, BufferDesc *buf);
+extern BufferDesc *GetFreeBuffer(void);
+extern void InitFreeList(bool init);
+extern void DBG_FreeListCheck(int nfree);
+
+/* buf_table.c */
+extern void InitBufTable(void);
+extern BufferDesc *BufTableLookup(BufferTag *tagPtr);
+extern bool BufTableDelete(BufferDesc *buf);
+extern bool BufTableInsert(BufferDesc *buf);
+extern void DBG_LookupListCheck(int nlookup);
+
+/* bufmgr.c */
+extern BufferDesc *BufferDescriptors;
+extern BufferBlock BufferBlocks;
+extern long *PrivateRefCount;
+extern long *LastRefCount;
+extern SPINLOCK BufMgrLock;
+
+/* localbuf.c */
+extern long *LocalRefCount;
+extern BufferDesc *LocalBufferDescriptors;
+extern int NLocBuffer;
+
+extern BufferDesc *LocalBufferAlloc(Relation reln, BlockNumber blockNum,
+ bool *foundPtr);
+extern int WriteLocalBuffer(Buffer buffer, bool release);
+extern int FlushLocalBuffer(Buffer buffer);
+extern void InitLocalBuffer();
+extern void LocalBufferSync();
+extern void ResetLocalBufferPool();
+
+#endif /* BUFMGR_INTERNALS_H */
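
The padding comment in struct sbufdesc is about the pointer subtraction hidden in BufferDescriptorGetBuffer(): turning a descriptor address back into a 1-based Buffer divides by sizeof(BufferDesc), which is cheap only when that size is a power of two. A standalone sketch of the arithmetic, assuming a 4-byte int and using stand-in names:

    #include <assert.h>

    /* stand-in for struct sbufdesc, padded so sizeof() is a power of two
     * (assumes a 4-byte int; the real padding is per-port, as above) */
    typedef struct {
        int  buf_id;
        char pad[124];
    } DemoBufferDesc;

    static DemoBufferDesc demo_descriptors[16];

    /* same idea as BufferDescriptorGetBuffer(): the pointer subtraction is a
     * division by sizeof(DemoBufferDesc), a shift when the size is a power of 2 */
    static long demo_descriptor_get_buffer(const DemoBufferDesc *bdp)
    {
        return (long) (bdp - demo_descriptors) + 1;   /* Buffers are 1-based */
    }

    int main(void)
    {
        assert(sizeof(DemoBufferDesc) == 128);
        assert(demo_descriptor_get_buffer(&demo_descriptors[0]) == 1);
        assert(demo_descriptor_get_buffer(&demo_descriptors[5]) == 6);
        return 0;
    }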
diff --git a/src/backend/storage/buffer/Makefile.inc b/src/backend/storage/buffer/Makefile.inc
new file mode 100644
index 00000000000..1d507f9227b
--- /dev/null
+++ b/src/backend/storage/buffer/Makefile.inc
@@ -0,0 +1,16 @@
+#-------------------------------------------------------------------------
+#
+# Makefile.inc--
+# Makefile for storage/buffer
+#
+# Copyright (c) 1994, Regents of the University of California
+#
+#
+# IDENTIFICATION
+# $Header: /cvsroot/pgsql/src/backend/storage/buffer/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
+#
+#-------------------------------------------------------------------------
+
+SUBSRCS+= buf_table.c buf_init.c bufmgr.c freelist.c localbuf.c
+
+SRCS_SITEMGR+= buf_table.c buf_init.c freelist.c
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
new file mode 100644
index 00000000000..823bf41eecf
--- /dev/null
+++ b/src/backend/storage/buffer/buf_init.c
@@ -0,0 +1,280 @@
+/*-------------------------------------------------------------------------
+ *
+ * buf_init.c--
+ * buffer manager initialization routines
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/storage/buffer/buf_init.c,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <sys/file.h>
+#include <stdio.h>
+#include <math.h>
+#include <signal.h>
+
+/* declarations split between these three files */
+#include "storage/buf.h"
+#include "storage/buf_internals.h"
+#include "storage/bufmgr.h"
+
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "storage/shmem.h"
+#include "storage/spin.h"
+#include "storage/smgr.h"
+#include "storage/lmgr.h"
+#include "miscadmin.h"
+#include "utils/builtins.h"
+#include "utils/hsearch.h"
+#include "utils/elog.h"
+#include "utils/memutils.h"
+#include "executor/execdebug.h" /* for NDirectFileRead */
+#include "catalog/catalog.h"
+
+/*
+ * if BMTRACE is defined, we trace the last 200 buffer allocations and
+ * deallocations in a circular buffer in shared memory.
+ */
+#ifdef BMTRACE
+bmtrace *TraceBuf;
+long *CurTraceBuf;
+#define BMT_LIMIT 200
+#endif /* BMTRACE */
+int ShowPinTrace = 0;
+
+int NBuffers = NDBUFS; /* NDBUFS defined in miscadmin.h */
+int Data_Descriptors;
+int Free_List_Descriptor;
+int Lookup_List_Descriptor;
+int Num_Descriptors;
+
+BufferDesc *BufferDescriptors;
+BufferBlock BufferBlocks;
+#ifndef HAS_TEST_AND_SET
+long *NWaitIOBackendP;
+#endif
+
+extern IpcSemaphoreId WaitIOSemId;
+
+long *PrivateRefCount; /* also used in freelist.c */
+long *LastRefCount; /* refcounts of last ExecMain level */
+
+/*
+ * Data Structures:
+ * buffers live in a freelist and a lookup data structure.
+ *
+ *
+ * Buffer Lookup:
+ * Two important notes. First, the buffer has to be
+ * available for lookup BEFORE an IO begins. Otherwise
+ * a second process trying to read the buffer will
+ * allocate its own copy and the buffer pool will
+ * become inconsistent.
+ *
+ * Buffer Replacement:
+ * see freelist.c. A buffer cannot be replaced while in
+ * use either by data manager or during IO.
+ *
+ * WriteBufferBack:
+ * currently, a buffer is only written back at the time
+ * it is selected for replacement. It should
+ * be done sooner if possible to reduce latency of
+ * BufferAlloc(). Maybe there should be a daemon process.
+ *
+ * Synchronization/Locking:
+ *
+ * BufMgrLock lock -- must be acquired before manipulating the
+ * buffer queues (lookup/freelist). Must be released
+ * before exit and before doing any IO.
+ *
+ * IO_IN_PROGRESS -- this is a flag in the buffer descriptor.
+ * It must be set when an IO is initiated and cleared at
+ * the end of the IO. It is there to make sure that one
+ * process doesn't start to use a buffer while another is
+ * faulting it in. see IOWait/IOSignal.
+ *
+ * refcount -- A buffer is pinned during IO and immediately
+ * after a BufferAlloc(). A buffer is always either pinned
+ * or on the freelist but never both. The buffer must be
+ * released, written, or flushed before the end of
+ * transaction.
+ *
+ * PrivateRefCount -- Each buffer also has a private refcount that keeps
+ * track of the number of times the buffer is pinned in the current
+ * process. This is used for two purposes: first, if we pin a
+ * buffer more than once, we only need to change the shared refcount
+ * once, and thus only lock the buffer pool once; second, when a
+ * transaction aborts, it should unpin the buffers exactly the number of
+ * times it has pinned them, so that it will not blow away buffers of
+ * another backend.
+ *
+ */
+
+SPINLOCK BufMgrLock;
+
+/* delayed write: TRUE on, FALSE off */
+int LateWrite = TRUE;
+
+int ReadBufferCount;
+int BufferHitCount;
+int BufferFlushCount;
+
+
+/*
+ * Initialize module:
+ *
+ * should calculate size of pool dynamically based on the
+ * amount of available memory.
+ */
+void
+InitBufferPool(IPCKey key)
+{
+ bool foundBufs,foundDescs;
+ int i;
+
+ Data_Descriptors = NBuffers;
+ Free_List_Descriptor = Data_Descriptors;
+ Lookup_List_Descriptor = Data_Descriptors + 1;
+ Num_Descriptors = Data_Descriptors + 1;
+
+ SpinAcquire(BufMgrLock);
+
+#ifdef BMTRACE
+ CurTraceBuf = (long *) ShmemInitStruct("Buffer trace",
+ (BMT_LIMIT * sizeof(bmtrace)) + sizeof(long),
+ &foundDescs);
+ if (!foundDescs)
+ memset(CurTraceBuf, 0, (BMT_LIMIT * sizeof(bmtrace)) + sizeof(long));
+
+ TraceBuf = (bmtrace *) &(CurTraceBuf[1]);
+#endif
+
+ BufferDescriptors = (BufferDesc *)
+ ShmemInitStruct("Buffer Descriptors",
+ Num_Descriptors*sizeof(BufferDesc),&foundDescs);
+
+ BufferBlocks = (BufferBlock)
+ ShmemInitStruct("Buffer Blocks",
+ NBuffers*BLCKSZ,&foundBufs);
+
+#ifndef HAS_TEST_AND_SET
+ {
+ bool foundNWaitIO;
+
+ NWaitIOBackendP = (long *)ShmemInitStruct("#Backends Waiting IO",
+ sizeof(long),
+ &foundNWaitIO);
+ if (!foundNWaitIO)
+ *NWaitIOBackendP = 0;
+ }
+#endif
+
+ if (foundDescs || foundBufs) {
+
+ /* both should be present or neither */
+ Assert(foundDescs && foundBufs);
+
+ } else {
+ BufferDesc *buf;
+ unsigned long block;
+
+ buf = BufferDescriptors;
+ block = (unsigned long) BufferBlocks;
+
+ /*
+ * link the buffers into a circular, doubly-linked list to
+ * initialize free list. Still don't know anything about
+ * replacement strategy in this file.
+ */
+ for (i = 0; i < Data_Descriptors; block+=BLCKSZ,buf++,i++) {
+ Assert(ShmemIsValid((unsigned long)block));
+
+ buf->freeNext = i+1;
+ buf->freePrev = i-1;
+
+ CLEAR_BUFFERTAG(&(buf->tag));
+ buf->data = MAKE_OFFSET(block);
+ buf->flags = (BM_DELETED | BM_FREE | BM_VALID);
+ buf->refcount = 0;
+ buf->buf_id = i;
+#ifdef HAS_TEST_AND_SET
+ S_INIT_LOCK(&(buf->io_in_progress_lock));
+#endif
+ }
+
+ /* close the circular queue */
+ BufferDescriptors[0].freePrev = Data_Descriptors-1;
+ BufferDescriptors[Data_Descriptors-1].freeNext = 0;
+ }
+
+ /* Init the rest of the module */
+ InitBufTable();
+ InitFreeList(!foundDescs);
+
+ SpinRelease(BufMgrLock);
+
+#ifndef HAS_TEST_AND_SET
+ {
+ int status;
+ WaitIOSemId = IpcSemaphoreCreate(IPCKeyGetWaitIOSemaphoreKey(key),
+ 1, IPCProtection, 0, 1, &status);
+ }
+#endif
+ PrivateRefCount = (long *) calloc(NBuffers, sizeof(long));
+ LastRefCount = (long *) calloc(NBuffers, sizeof(long));
+}
+
+/* -----------------------------------------------------
+ * BufferShmemSize
+ *
+ * compute the size of shared memory for the buffer pool including
+ * data pages, buffer descriptors, hash tables, etc.
+ * ----------------------------------------------------
+ */
+int
+BufferShmemSize()
+{
+ int size = 0;
+ int nbuckets;
+ int nsegs;
+ int tmp;
+
+ nbuckets = 1 << (int)my_log2((NBuffers - 1) / DEF_FFACTOR + 1);
+ nsegs = 1 << (int)my_log2((nbuckets - 1) / DEF_SEGSIZE + 1);
+
+ /* size of shmem binding table */
+ size += MAXALIGN(my_log2(BTABLE_SIZE) * sizeof(void *)); /* HTAB->dir */
+ size += MAXALIGN(sizeof(HHDR)); /* HTAB->hctl */
+ size += MAXALIGN(DEF_SEGSIZE * sizeof(SEGMENT));
+ size += BUCKET_ALLOC_INCR *
+ (MAXALIGN(sizeof(BUCKET_INDEX)) +
+ MAXALIGN(BTABLE_KEYSIZE) +
+ MAXALIGN(BTABLE_DATASIZE));
+
+ /* size of buffer descriptors */
+ size += MAXALIGN((NBuffers + 1) * sizeof(BufferDesc));
+
+ /* size of data pages */
+ size += NBuffers * MAXALIGN(BLCKSZ);
+
+ /* size of buffer hash table */
+ size += MAXALIGN(my_log2(NBuffers) * sizeof(void *)); /* HTAB->dir */
+ size += MAXALIGN(sizeof(HHDR)); /* HTAB->hctl */
+ size += nsegs * MAXALIGN(DEF_SEGSIZE * sizeof(SEGMENT));
+ tmp = (int)ceil((double)NBuffers/BUCKET_ALLOC_INCR);
+ size += tmp * BUCKET_ALLOC_INCR *
+ (MAXALIGN(sizeof(BUCKET_INDEX)) +
+ MAXALIGN(sizeof(BufferTag)) +
+ MAXALIGN(sizeof(Buffer)));
+
+#ifdef BMTRACE
+ size += (BMT_LIMIT * sizeof(bmtrace)) + sizeof(long);
+#endif
+ return size;
+}
+
+
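BufferShmemSize() above sizes the buffer hash table by rounding the bucket and segment counts up to powers of two. A standalone illustration of that rounding follows; DEMO_FFACTOR and DEMO_SEGSIZE are stand-in values (the real DEF_FFACTOR and DEF_SEGSIZE come from the dynahash headers), and demo_log2() is only one plausible reading of my_log2(), whose exact definition lives in the hash code:

    #include <stdio.h>

    /* stand-in values; the real DEF_FFACTOR / DEF_SEGSIZE live in utils/hsearch.h */
    #define DEMO_FFACTOR 1
    #define DEMO_SEGSIZE 256

    /* smallest p with (1 << p) >= n -- one plausible reading of my_log2() */
    static int demo_log2(long n)
    {
        int p = 0;
        while ((1L << p) < n)
            p++;
        return p;
    }

    int main(void)
    {
        int nbuffers = 64;        /* e.g. the NDBUFS default */
        int nbuckets = 1 << demo_log2((nbuffers - 1) / DEMO_FFACTOR + 1);
        int nsegs    = 1 << demo_log2((nbuckets - 1) / DEMO_SEGSIZE + 1);

        printf("buffers=%d -> hash buckets=%d, segments=%d\n",
               nbuffers, nbuckets, nsegs);
        return 0;
    }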
diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c
new file mode 100644
index 00000000000..502ded954ed
--- /dev/null
+++ b/src/backend/storage/buffer/buf_table.c
@@ -0,0 +1,162 @@
+/*-------------------------------------------------------------------------
+ *
+ * buf_table.c--
+ * routines for finding buffers in the buffer pool.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/storage/buffer/buf_table.c,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * OLD COMMENTS
+ *
+ * Data Structures:
+ *
+ * Buffers are identified by their BufferTag (buf.h). This
+ * file contains routines for allocating a shmem hash table to
+ * map buffer tags to buffer descriptors.
+ *
+ * Synchronization:
+ *
+ * All routines in this file assume buffer manager spinlock is
+ * held by their caller.
+ */
+#include "storage/bufmgr.h"
+#include "storage/buf_internals.h" /* where the declarations go */
+#include "storage/shmem.h"
+#include "storage/spin.h"
+#include "utils/hsearch.h"
+#include "utils/elog.h"
+
+static HTAB *SharedBufHash;
+
+extern HTAB *ShmemInitHash();
+
+typedef struct lookup {
+ BufferTag key;
+ Buffer id;
+} LookupEnt;
+
+/*
+ * Initialize shmem hash table for mapping buffers
+ */
+void
+InitBufTable()
+{
+ HASHCTL info;
+ int hash_flags;
+
+ /* assume lock is held */
+
+ /* BufferTag maps to Buffer */
+ info.keysize = sizeof(BufferTag);
+ info.datasize = sizeof(Buffer);
+ info.hash = tag_hash;
+
+ hash_flags = (HASH_ELEM | HASH_FUNCTION);
+
+
+ SharedBufHash = (HTAB *) ShmemInitHash("Shared Buf Lookup Table",
+ NBuffers,NBuffers,
+ &info,hash_flags);
+
+ if (! SharedBufHash) {
+ elog(FATAL,"couldn't initialize shared buffer pool Hash Tbl");
+ exit(1);
+ }
+
+}
+
+BufferDesc *
+BufTableLookup(BufferTag *tagPtr)
+{
+ LookupEnt * result;
+ bool found;
+
+ if (tagPtr->blockNum == P_NEW)
+ return(NULL);
+
+ result = (LookupEnt *)
+ hash_search(SharedBufHash,(char *) tagPtr,HASH_FIND,&found);
+
+ if (! result){
+ elog(WARN,"BufTableLookup: BufferLookup table corrupted");
+ return(NULL);
+ }
+ if (! found) {
+ return(NULL);
+ }
+ return(&(BufferDescriptors[result->id]));
+}
+
+/*
+ * BufTableDelete
+ */
+bool
+BufTableDelete(BufferDesc *buf)
+{
+ LookupEnt * result;
+ bool found;
+
+ /* buffer not initialized or has been removed from
+ * table already. BM_DELETED keeps us from removing
+ * buffer twice.
+ */
+ if (buf->flags & BM_DELETED) {
+ return(TRUE);
+ }
+
+ buf->flags |= BM_DELETED;
+
+ result = (LookupEnt *)
+ hash_search(SharedBufHash,(char *) &(buf->tag),HASH_REMOVE,&found);
+
+ if (! (result && found)) {
+ elog(WARN,"BufTableDelete: BufferLookup table corrupted");
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+bool
+BufTableInsert(BufferDesc *buf)
+{
+ LookupEnt * result;
+ bool found;
+
+ /* cannot insert it twice */
+ Assert (buf->flags & BM_DELETED);
+ buf->flags &= ~(BM_DELETED);
+
+ result = (LookupEnt *)
+ hash_search(SharedBufHash,(char *) &(buf->tag),HASH_ENTER,&found);
+
+ if (! result) {
+ Assert(0);
+ elog(WARN,"BufTableInsert: BufferLookup table corrupted");
+ return(FALSE);
+ }
+ /* found something else in the table ! */
+ if (found) {
+ Assert(0);
+ elog(WARN,"BufTableInsert: BufferLookup table corrupted");
+ return(FALSE);
+ }
+
+ result->id = buf->buf_id;
+ return(TRUE);
+}
+
+/* prints out collision stats for the buf table */
+void
+DBG_LookupListCheck(int nlookup)
+{
+ nlookup = 10;
+
+ hash_stats("Shared",SharedBufHash);
+}
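
The LookupEnt layout above relies on the shared hash table's convention that the key occupies the leading keysize bytes of each entry and the payload follows it. A standalone toy table (linear probing, illustrative names, nothing from the real dynahash code) showing that key-is-a-prefix convention:

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    /* toy stand-ins, not the postgres definitions */
    typedef struct { uint32_t dbId, relId, blockNum; } DemoTag;
    typedef struct { DemoTag key; long id; } DemoEnt;   /* key must come first */

    #define DEMO_SLOTS 8
    static DemoEnt demo_table[DEMO_SLOTS];
    static int     demo_used[DEMO_SLOTS];

    /* linear-probe search keyed on the leading DemoTag bytes of each entry --
     * the same "key is a prefix of the entry" convention LookupEnt relies on */
    static DemoEnt *demo_search(const DemoTag *tag, int enter)
    {
        unsigned h = (tag->dbId ^ tag->relId ^ tag->blockNum) % DEMO_SLOTS;
        int i;

        for (i = 0; i < DEMO_SLOTS; i++) {
            unsigned s = (h + i) % DEMO_SLOTS;

            if (!demo_used[s]) {
                if (!enter)
                    return NULL;                  /* lookup miss */
                demo_used[s] = 1;                 /* claim the slot */
                memcpy(&demo_table[s].key, tag, sizeof(DemoTag));
                return &demo_table[s];
            }
            if (memcmp(&demo_table[s].key, tag, sizeof(DemoTag)) == 0)
                return &demo_table[s];            /* hit */
        }
        return NULL;                              /* table full */
    }

    int main(void)
    {
        DemoTag t = { 1, 1259, 42 };

        demo_search(&t, 1)->id = 7;               /* insert, like BufTableInsert */
        printf("found id = %ld\n", demo_search(&t, 0)->id);  /* like BufTableLookup */
        return 0;
    }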
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
new file mode 100644
index 00000000000..655f1f408e0
--- /dev/null
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -0,0 +1,1581 @@
+/*-------------------------------------------------------------------------
+ *
+ * bufmgr.c--
+ * buffer manager interface routines
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ *
+ * BufferAlloc() -- lookup a buffer in the buffer table. If
+ * it isn't there add it, but do not read it into memory.
+ * This is used when we are about to reinitialize the
+ * buffer so we don't care what the current disk contents are.
+ * BufferAlloc() pins the new buffer in memory.
+ *
+ * ReadBuffer() -- same as BufferAlloc() but reads the data
+ * on a buffer cache miss.
+ *
+ * ReleaseBuffer() -- unpin the buffer
+ *
+ * WriteNoReleaseBuffer() -- mark the buffer contents as "dirty"
+ * but don't unpin. The disk IO is delayed until buffer
+ * replacement if LateWrite flag is set.
+ *
+ * WriteBuffer() -- WriteNoReleaseBuffer() + ReleaseBuffer()
+ *
+ * DirtyBufferCopy() -- For a given dbid/relid/blockno, if the buffer is
+ * in the cache and is dirty, mark it clean and copy
+ * it to the requested location. This is a logical
+ * write, and has been installed to support the cache
+ * management code for write-once storage managers.
+ *
+ * FlushBuffer() -- as above but never delayed write.
+ *
+ * BufferSync() -- flush all dirty buffers in the buffer pool.
+ *
+ * InitBufferPool() -- Init the buffer module.
+ *
+ * See other files:
+ * freelist.c -- chooses victim for buffer replacement
+ * buf_table.c -- manages the buffer lookup table
+ */
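
A minimal caller-side sketch of the interface summarized above, assuming the declarations in storage/bufmgr.h; the page modification itself is only indicated by a comment, and error handling is reduced to elog():

    /* hypothetical helper, not part of this file */
    static void
    demo_touch_block(Relation reln, BlockNumber blkno)
    {
        Buffer buf = ReadBuffer(reln, blkno);     /* pins the buffer */

        if (!BufferIsValid(buf))
            elog(WARN, "demo_touch_block: ReadBuffer failed");

        /* ... modify the page via BufferGetBlock(buf) here ... */

        WriteBuffer(buf);   /* mark dirty (LateWrite) or flush, and unpin;
                             * a read-only caller would use ReleaseBuffer(buf) */
    }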
+#include <sys/file.h>
+#include <stdio.h>
+#include <math.h>
+#include <signal.h>
+
+/* declarations split between these three files */
+#include "storage/buf.h"
+#include "storage/buf_internals.h"
+#include "storage/bufmgr.h"
+
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "storage/shmem.h"
+#include "storage/spin.h"
+#include "storage/smgr.h"
+#include "storage/lmgr.h"
+#include "miscadmin.h"
+#include "utils/builtins.h"
+#include "utils/hsearch.h"
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/memutils.h"
+#include "executor/execdebug.h" /* for NDirectFileRead */
+#include "catalog/catalog.h"
+
+extern int LateWrite;
+extern SPINLOCK BufMgrLock;
+extern int ReadBufferCount;
+extern int BufferHitCount;
+extern int BufferFlushCount;
+
+static void WaitIO(BufferDesc *buf, SPINLOCK spinlock);
+#ifndef HAS_TEST_AND_SET
+static void SignalIO(BufferDesc *buf);
+extern long *NWaitIOBackendP; /* defined in buf_init.c */
+#endif /* HAS_TEST_AND_SET */
+
+static Buffer ReadBufferWithBufferLock(Relation relation, BlockNumber blockNum,
+ bool bufferLockHeld);
+static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum,
+ bool *foundPtr, bool bufferLockHeld);
+static int FlushBuffer(Buffer buffer);
+static void BufferSync(void);
+static int BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld);
+
+/* ---------------------------------------------------
+ * RelationGetBufferWithBuffer
+ * see if the given buffer is what we want
+ * if yes, we don't need to bother the buffer manager
+ * ---------------------------------------------------
+ */
+Buffer
+RelationGetBufferWithBuffer(Relation relation,
+ BlockNumber blockNumber,
+ Buffer buffer)
+{
+ BufferDesc *bufHdr;
+ LRelId lrelId;
+
+ if (BufferIsValid(buffer)) {
+ if (!BufferIsLocal(buffer)) {
+ bufHdr = &BufferDescriptors[buffer-1];
+ lrelId = RelationGetLRelId(relation);
+ SpinAcquire(BufMgrLock);
+ if (bufHdr->tag.blockNum == blockNumber &&
+ bufHdr->tag.relId.relId == lrelId.relId &&
+ bufHdr->tag.relId.dbId == lrelId.dbId) {
+ SpinRelease(BufMgrLock);
+ return(buffer);
+ }
+ return(ReadBufferWithBufferLock(relation, blockNumber, true));
+ } else {
+ bufHdr = &LocalBufferDescriptors[-buffer-1];
+ if (bufHdr->tag.relId.relId == relation->rd_id &&
+ bufHdr->tag.blockNum == blockNumber) {
+ return(buffer);
+ }
+ }
+ }
+ return(ReadBuffer(relation, blockNumber));
+}
+
+/*
+ * ReadBuffer -- returns a buffer containing the requested
+ * block of the requested relation. If the blknum
+ * requested is P_NEW, extend the relation file and
+ * allocate a new block.
+ *
+ * Returns: the buffer number for the buffer containing
+ * the block read or NULL on an error.
+ *
+ * Assume when this function is called, that reln has been
+ * opened already.
+ */
+
+extern int ShowPinTrace;
+
+
+#undef ReadBuffer /* conflicts with macro when BUFMGR_DEBUG defined */
+
+/*
+ * ReadBuffer --
+ *
+ */
+Buffer
+ReadBuffer(Relation reln, BlockNumber blockNum)
+{
+ return ReadBufferWithBufferLock(reln, blockNum, false);
+}
+
+/*
+ * is_userbuffer
+ *
+ * XXX caller must have already acquired BufMgrLock
+ */
+static bool
+is_userbuffer(Buffer buffer)
+{
+ BufferDesc *buf = &BufferDescriptors[buffer-1];
+
+ if (IsSystemRelationName(buf->sb_relname))
+ return false;
+ return true;
+}
+
+Buffer
+ReadBuffer_Debug(char *file,
+ int line,
+ Relation reln,
+ BlockNumber blockNum)
+{
+ Buffer buffer;
+
+ buffer = ReadBufferWithBufferLock(reln, blockNum, false);
+ if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer)) {
+ BufferDesc *buf = &BufferDescriptors[buffer-1];
+
+ fprintf(stderr, "PIN(RD) %ld relname = %s, blockNum = %d, \
+refcount = %ld, file: %s, line: %d\n",
+ buffer, buf->sb_relname, buf->tag.blockNum,
+ PrivateRefCount[buffer - 1], file, line);
+ }
+ return buffer;
+}
+
+/*
+ * ReadBufferWithBufferLock -- does the work of
+ * ReadBuffer() but with the possibility that
+ * the buffer lock has already been held. this
+ * is yet another effort to reduce the number of
+ * semops in the system.
+ */
+static Buffer
+ReadBufferWithBufferLock(Relation reln,
+ BlockNumber blockNum,
+ bool bufferLockHeld)
+{
+ BufferDesc *bufHdr;
+ int extend; /* extending the file by one block */
+ int status;
+ bool found;
+ bool isLocalBuf;
+
+ extend = (blockNum == P_NEW);
+ isLocalBuf = reln->rd_islocal;
+
+ if (isLocalBuf) {
+ bufHdr = LocalBufferAlloc(reln, blockNum, &found);
+ } else {
+ ReadBufferCount++;
+
+ /* lookup the buffer. IO_IN_PROGRESS is set if the requested
+ * block is not currently in memory.
+ */
+ bufHdr = BufferAlloc(reln, blockNum, &found, bufferLockHeld);
+ if (found) BufferHitCount++;
+ }
+
+ if (!bufHdr) {
+ return(InvalidBuffer);
+ }
+
+ /* if its already in the buffer pool, we're done */
+ if (found) {
+ /*
+ * This happens when a bogus buffer was returned previously and is
+ * floating around in the buffer pool. A routine calling this would
+ * want this extended.
+ */
+ if (extend) {
+ /* new buffers are zero-filled */
+ memset((char *) MAKE_PTR(bufHdr->data), 0, BLCKSZ);
+ (void) smgrextend(bufHdr->bufsmgr, reln,
+ (char *) MAKE_PTR(bufHdr->data));
+ }
+ return (BufferDescriptorGetBuffer(bufHdr));
+
+ }
+
+ /*
+ * if we have gotten to this point, the reln pointer must be ok
+ * and the relation file must be open.
+ */
+ if (extend) {
+ /* new buffers are zero-filled */
+ (void) memset((char *) MAKE_PTR(bufHdr->data), 0, BLCKSZ);
+ status = smgrextend(bufHdr->bufsmgr, reln,
+ (char *) MAKE_PTR(bufHdr->data));
+ } else {
+ status = smgrread(bufHdr->bufsmgr, reln, blockNum,
+ (char *) MAKE_PTR(bufHdr->data));
+ }
+
+ if (isLocalBuf)
+ return (BufferDescriptorGetBuffer(bufHdr));
+
+ /* lock buffer manager again to update IO IN PROGRESS */
+ SpinAcquire(BufMgrLock);
+
+ if (status == SM_FAIL) {
+ /* IO Failed. cleanup the data structures and go home */
+
+ if (! BufTableDelete(bufHdr)) {
+ SpinRelease(BufMgrLock);
+ elog(FATAL,"BufRead: buffer table broken after IO error\n");
+ }
+ /* remember that BufferAlloc() pinned the buffer */
+ UnpinBuffer(bufHdr);
+
+ /*
+ * Have to reset the flag so that anyone waiting for
+ * the buffer can tell that the contents are invalid.
+ */
+ bufHdr->flags |= BM_IO_ERROR;
+
+ } else {
+ /* IO Succeeded. clear the flags, finish buffer update */
+
+ bufHdr->flags &= ~(BM_IO_ERROR | BM_IO_IN_PROGRESS);
+ }
+
+ /* If anyone was waiting for IO to complete, wake them up now */
+#ifdef HAS_TEST_AND_SET
+ S_UNLOCK(&(bufHdr->io_in_progress_lock));
+#else
+ if (bufHdr->refcount > 1)
+ SignalIO(bufHdr);
+#endif
+
+ SpinRelease(BufMgrLock);
+
+ return(BufferDescriptorGetBuffer(bufHdr));
+}
+
+/*
+ * BufferAlloc -- Get a buffer from the buffer pool but dont
+ * read it.
+ *
+ * Returns: descriptor for buffer
+ *
+ * When this routine returns, the BufMgrLock is guaranteed NOT be held.
+ */
+static BufferDesc *
+BufferAlloc(Relation reln,
+ BlockNumber blockNum,
+ bool *foundPtr,
+ bool bufferLockHeld)
+{
+ BufferDesc *buf, *buf2;
+ BufferTag newTag; /* identity of requested block */
+ bool inProgress; /* buffer undergoing IO */
+ bool newblock = FALSE;
+
+ /* create a new tag so we can lookup the buffer */
+ /* assume that the relation is already open */
+ if (blockNum == P_NEW) {
+ newblock = TRUE;
+ blockNum = smgrnblocks(reln->rd_rel->relsmgr, reln);
+ }
+
+ INIT_BUFFERTAG(&newTag,reln,blockNum);
+
+ if (!bufferLockHeld)
+ SpinAcquire(BufMgrLock);
+
+ /* see if the block is in the buffer pool already */
+ buf = BufTableLookup(&newTag);
+ if (buf != NULL) {
+ /* Found it. Now, (a) pin the buffer so no
+ * one steals it from the buffer pool,
+ * (b) check IO_IN_PROGRESS, someone may be
+ * faulting the buffer into the buffer pool.
+ */
+
+ PinBuffer(buf);
+ inProgress = (buf->flags & BM_IO_IN_PROGRESS);
+
+ *foundPtr = TRUE;
+ if (inProgress) {
+ WaitIO(buf, BufMgrLock);
+ if (buf->flags & BM_IO_ERROR) {
+ /* weird race condition:
+ *
+ * We were waiting for someone else to read the buffer.
+ * While we were waiting, the reader boof'd in some
+ * way, so the contents of the buffer are still
+ * invalid. By saying that we didn't find it, we can
+ * make the caller reinitialize the buffer. If two
+ * processes are waiting for this block, both will
+ * read the block. The second one to finish may overwrite
+ * any updates made by the first. (Assume higher level
+ * synchronization prevents this from happening).
+ *
+ * This is never going to happen, don't worry about it.
+ */
+ *foundPtr = FALSE;
+ }
+ }
+#ifdef BMTRACE
+ _bm_trace((reln->rd_rel->relisshared ? 0 : MyDatabaseId), reln->rd_id, blockNum, BufferDescriptorGetBuffer(buf), BMT_ALLOCFND);
+#endif /* BMTRACE */
+
+ SpinRelease(BufMgrLock);
+
+ return(buf);
+ }
+
+ *foundPtr = FALSE;
+
+ /*
+ * Didn't find it in the buffer pool. We'll have
+ * to initialize a new buffer. First, grab one from
+ * the free list. If it's dirty, flush it to disk.
+ * Remember to unlock BufMgr spinlock while doing the IOs.
+ */
+ inProgress = FALSE;
+ for (buf = (BufferDesc *) NULL; buf == (BufferDesc *) NULL; ) {
+
+ /* GetFreeBuffer will abort if it can't find a free buffer */
+ buf = GetFreeBuffer();
+
+ /*
+ * There should be exactly one pin on the buffer after
+ * it is allocated -- ours. If it had a pin it wouldn't
+ * have been on the free list. No one else could have
+ * pinned it between GetFreeBuffer and here because we
+ * have the BufMgrLock.
+ */
+ Assert(buf->refcount == 0);
+ buf->refcount = 1;
+ PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 1;
+
+ if (buf->flags & BM_DIRTY) {
+ /*
+ * Set BM_IO_IN_PROGRESS to keep anyone from doing anything
+ * with the contents of the buffer while we write it out.
+ * We don't really care if they try to read it, but if they
+ * can complete a BufferAlloc on it they can then scribble
+ * into it, and we'd really like to avoid that while we are
+ * flushing the buffer. Setting this flag should block them
+ * in WaitIO until we're done.
+ */
+ inProgress = TRUE;
+ buf->flags |= BM_IO_IN_PROGRESS;
+#ifdef HAS_TEST_AND_SET
+ /*
+ * All code paths that acquire this lock pin the buffer
+ * first; since no one had it pinned (it just came off the
+ * free list), no one else can have this lock.
+ */
+ Assert(S_LOCK_FREE(&(buf->io_in_progress_lock)));
+ S_LOCK(&(buf->io_in_progress_lock));
+#endif /* HAS_TEST_AND_SET */
+
+ /*
+ * Write the buffer out, being careful to release BufMgrLock
+ * before starting the I/O.
+ *
+ * This #ifndef is here because a few extra semops REALLY kill
+ * you on machines that don't have spinlocks. If you don't
+ * operate with much concurrency, well...
+ */
+ (void) BufferReplace(buf, true);
+ BufferFlushCount++;
+#ifndef OPTIMIZE_SINGLE
+ SpinAcquire(BufMgrLock);
+#endif /* OPTIMIZE_SINGLE */
+
+ /*
+ * Somebody could have pinned the buffer while we were
+ * doing the I/O and had given up the BufMgrLock (though
+ * they would be waiting for us to clear the BM_IO_IN_PROGRESS
+ * flag). That's why this is a loop -- if so, we need to clear
+ * the I/O flags, remove our pin and start all over again.
+ *
+ * People may be making buffers free at any time, so there's
+ * no reason to think that we have an immediate disaster on
+ * our hands.
+ */
+ if (buf->refcount > 1) {
+ inProgress = FALSE;
+ buf->flags &= ~BM_IO_IN_PROGRESS;
+#ifdef HAS_TEST_AND_SET
+ S_UNLOCK(&(buf->io_in_progress_lock));
+#else /* !HAS_TEST_AND_SET */
+ if (buf->refcount > 1)
+ SignalIO(buf);
+#endif /* !HAS_TEST_AND_SET */
+ PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0;
+ buf->refcount--;
+ buf = (BufferDesc *) NULL;
+ }
+
+ /*
+ * Somebody could have allocated another buffer for the
+ * same block we are about to read in. (While we flush out
+ * the dirty buffer, we don't hold the lock and someone could
+ * have allocated another buffer for the same block.) The problem
+ * is that we haven't gotten around to inserting the new tag into
+ * the buffer table. So we need to check here. -ay 3/95
+ */
+ buf2 = BufTableLookup(&newTag);
+ if (buf2 != NULL) {
+ /* Found it. Someone has already done what we're about
+ * to do. We'll just handle this as if it were found in
+ * the buffer pool in the first place.
+ */
+
+ PinBuffer(buf2);
+ inProgress = (buf2->flags & BM_IO_IN_PROGRESS);
+
+ *foundPtr = TRUE;
+ if (inProgress) {
+ WaitIO(buf2, BufMgrLock);
+ if (buf2->flags & BM_IO_ERROR) {
+ *foundPtr = FALSE;
+ }
+ }
+
+#ifdef HAS_TEST_AND_SET
+ S_UNLOCK(&(buf->io_in_progress_lock));
+#else /* !HAS_TEST_AND_SET */
+ if (buf->refcount > 1)
+ SignalIO(buf);
+#endif /* !HAS_TEST_AND_SET */
+
+ /* give up the buffer since we don't need it any more */
+ buf->refcount--;
+ PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0;
+ AddBufferToFreelist(buf);
+ buf->flags |= BM_FREE;
+ buf->flags &= ~BM_DIRTY;
+ buf->flags &= ~BM_IO_IN_PROGRESS;
+
+ SpinRelease(BufMgrLock);
+
+ return(buf2);
+ }
+ }
+ }
+ /*
+ * At this point we should have the sole pin on a non-dirty
+ * buffer and we may or may not already have the BM_IO_IN_PROGRESS
+ * flag set.
+ */
+
+ /*
+ * Change the name of the buffer in the lookup table:
+ *
+ * Need to update the lookup table before the read starts.
+ * If someone comes along looking for the buffer while
+ * we are reading it in, we don't want them to allocate
+ * a new buffer. For the same reason, we didn't want
+ * to erase the buf table entry for the buffer we were
+ * writing back until now, either.
+ */
+
+ if (! BufTableDelete(buf)) {
+ SpinRelease(BufMgrLock);
+ elog(FATAL,"buffer wasn't in the buffer table\n");
+
+ }
+
+ if (buf->flags & BM_DIRTY) {
+ /* must clear flag first because of weird race
+ * condition described below.
+ */
+ buf->flags &= ~BM_DIRTY;
+ }
+
+ /* record the database name and relation name for this buffer */
+ buf->sb_relname = pstrdup(reln->rd_rel->relname.data);
+ buf->sb_dbname = pstrdup(GetDatabaseName());
+
+ /* remember which storage manager is responsible for it */
+ buf->bufsmgr = reln->rd_rel->relsmgr;
+
+ INIT_BUFFERTAG(&(buf->tag),reln,blockNum);
+ if (! BufTableInsert(buf)) {
+ SpinRelease(BufMgrLock);
+ elog(FATAL,"Buffer in lookup table twice \n");
+ }
+
+ /* Buffer contents are currently invalid. Have
+ * to mark IO IN PROGRESS so no one fiddles with
+ * them until the read completes. If this routine
+ * has been called simply to allocate a buffer, no
+ * io will be attempted, so the flag isn't set.
+ */
+ if (!inProgress) {
+ buf->flags |= BM_IO_IN_PROGRESS;
+#ifdef HAS_TEST_AND_SET
+ Assert(S_LOCK_FREE(&(buf->io_in_progress_lock)));
+ S_LOCK(&(buf->io_in_progress_lock));
+#endif /* HAS_TEST_AND_SET */
+ }
+
+#ifdef BMTRACE
+ _bm_trace((reln->rd_rel->relisshared ? 0 : MyDatabaseId), reln->rd_id, blockNum, BufferDescriptorGetBuffer(buf), BMT_ALLOCNOTFND);
+#endif /* BMTRACE */
+
+ SpinRelease(BufMgrLock);
+
+ return (buf);
+}
+
+/*
+ * WriteBuffer--
+ *
+ * Pushes buffer contents to disk if LateWrite is
+ * not set. Otherwise, marks contents as dirty.
+ *
+ * Assume that buffer is pinned. Assume that reln is
+ * valid.
+ *
+ * Side Effects:
+ * Pin count is decremented.
+ */
+
+#undef WriteBuffer
+
+int
+WriteBuffer(Buffer buffer)
+{
+ BufferDesc *bufHdr;
+
+ if (! LateWrite) {
+ return(FlushBuffer(buffer));
+ } else {
+
+ if (BufferIsLocal(buffer))
+ return WriteLocalBuffer(buffer, TRUE);
+
+ if (BAD_BUFFER_ID(buffer))
+ return(FALSE);
+
+ bufHdr = &BufferDescriptors[buffer-1];
+
+ SpinAcquire(BufMgrLock);
+ Assert(bufHdr->refcount > 0);
+ bufHdr->flags |= BM_DIRTY;
+ UnpinBuffer(bufHdr);
+ SpinRelease(BufMgrLock);
+ }
+ return(TRUE);
+}
+
+void
+WriteBuffer_Debug(char *file, int line, Buffer buffer)
+{
+ WriteBuffer(buffer);
+ if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer)) {
+ BufferDesc *buf;
+ buf = &BufferDescriptors[buffer-1];
+ fprintf(stderr, "UNPIN(WR) %ld relname = %s, blockNum = %d, \
+refcount = %ld, file: %s, line: %d\n",
+ buffer, buf->sb_relname, buf->tag.blockNum,
+ PrivateRefCount[buffer - 1], file, line);
+ }
+}
+
+/*
+ * DirtyBufferCopy() -- Copy a given dirty buffer to the requested
+ * destination.
+ *
+ * We treat this as a write. If the requested buffer is in the pool
+ * and is dirty, we copy it to the location requested and mark it
+ * clean. This routine supports the Sony jukebox storage manager,
+ * which agrees to take responsibility for the data once we mark
+ * it clean.
+ *
+ * NOTE: used by sony jukebox code in postgres 4.2 - ay 2/95
+ */
+void
+DirtyBufferCopy(Oid dbid, Oid relid, BlockNumber blkno, char *dest)
+{
+ BufferDesc *buf;
+ BufferTag btag;
+
+ btag.relId.relId = relid;
+ btag.relId.dbId = dbid;
+ btag.blockNum = blkno;
+
+ SpinAcquire(BufMgrLock);
+ buf = BufTableLookup(&btag);
+
+ if (buf == (BufferDesc *) NULL
+ || !(buf->flags & BM_DIRTY)
+ || !(buf->flags & BM_VALID)) {
+ SpinRelease(BufMgrLock);
+ return;
+ }
+
+ /* hate to do this holding the lock, but release and reacquire is slower */
+ memmove(dest, (char *) MAKE_PTR(buf->data), BLCKSZ);
+
+ buf->flags &= ~BM_DIRTY;
+
+ SpinRelease(BufMgrLock);
+}
+
+/*
+ * FlushBuffer -- like WriteBuffer, but force the page to disk.
+ *
+ * 'buffer' is known to be dirty/pinned, so there should not be a
+ * problem reading the BufferDesc members without the BufMgrLock
+ * (nobody should be able to change tags, flags, etc. out from under
+ * us).
+ */
+static int
+FlushBuffer(Buffer buffer)
+{
+ BufferDesc *bufHdr;
+
+ if (BufferIsLocal(buffer))
+ return FlushLocalBuffer(buffer);
+
+ if (BAD_BUFFER_ID(buffer))
+ return (STATUS_ERROR);
+
+ bufHdr = &BufferDescriptors[buffer-1];
+
+ if (!BufferReplace(bufHdr, false)) {
+ elog(WARN, "FlushBuffer: cannot flush %d", bufHdr->tag.blockNum);
+ return (STATUS_ERROR);
+ }
+
+ SpinAcquire(BufMgrLock);
+ bufHdr->flags &= ~BM_DIRTY;
+ UnpinBuffer(bufHdr);
+ SpinRelease(BufMgrLock);
+
+ return(STATUS_OK);
+}
+
+/*
+ * WriteNoReleaseBuffer -- like WriteBuffer, but do not unpin the buffer
+ * when the operation is complete.
+ *
+ * We know that the buffer is for a relation in our private cache,
+ * because this routine is called only to write out buffers that
+ * were changed by the executing backend.
+ */
+int
+WriteNoReleaseBuffer(Buffer buffer)
+{
+ BufferDesc *bufHdr;
+
+ if (! LateWrite) {
+ return(FlushBuffer(buffer));
+ } else {
+
+ if (BufferIsLocal(buffer))
+ return WriteLocalBuffer(buffer, FALSE);
+
+ if (BAD_BUFFER_ID(buffer))
+ return (STATUS_ERROR);
+
+ bufHdr = &BufferDescriptors[buffer-1];
+
+ SpinAcquire(BufMgrLock);
+ bufHdr->flags |= BM_DIRTY;
+ SpinRelease(BufMgrLock);
+ }
+ return(STATUS_OK);
+}
+
+
+#undef ReleaseAndReadBuffer
+/*
+ * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
+ * so that only one semop needs to be called.
+ *
+ */
+Buffer
+ReleaseAndReadBuffer(Buffer buffer,
+ Relation relation,
+ BlockNumber blockNum)
+{
+ BufferDesc *bufHdr;
+ Buffer retbuf;
+
+ if (BufferIsLocal(buffer)) {
+ Assert(LocalRefCount[-buffer - 1] > 0);
+ LocalRefCount[-buffer - 1]--;
+ } else {
+ if (BufferIsValid(buffer)) {
+ bufHdr = &BufferDescriptors[buffer-1];
+ Assert(PrivateRefCount[buffer - 1] > 0);
+ PrivateRefCount[buffer - 1]--;
+ if (PrivateRefCount[buffer - 1] == 0 &&
+ LastRefCount[buffer - 1] == 0) {
+ /* only release buffer if it is not pinned in previous ExecMain
+ level */
+ SpinAcquire(BufMgrLock);
+ bufHdr->refcount--;
+ if (bufHdr->refcount == 0) {
+ AddBufferToFreelist(bufHdr);
+ bufHdr->flags |= BM_FREE;
+ }
+ retbuf = ReadBufferWithBufferLock(relation, blockNum, true);
+ return retbuf;
+ }
+ }
+ }
+
+ return (ReadBuffer(relation, blockNum));
+}
+
+/*
+ * BufferSync -- Flush all dirty buffers in the pool.
+ *
+ * This is called at transaction commit time. It does the wrong thing,
+ * right now. We should flush only our own changes to stable storage,
+ * and we should obey the lock protocol on the buffer manager metadata
+ * as we do it. Also, we need to be sure that no other transaction is
+ * modifying the page as we flush it. This is only a problem for objects
+ * that use a non-two-phase locking protocol, like btree indices. For
+ * those objects, we would like to set a write lock for the duration of
+ * our IO. Another possibility is to code updates to btree pages
+ * carefully, so that writing them out of order cannot cause
+ * any unrecoverable errors.
+ *
+ * I don't want to think hard about this right now, so I will try
+ * to come back to it later.
+ */
+static void
+BufferSync()
+{
+ int i;
+ Oid bufdb;
+ Oid bufrel;
+ Relation reln;
+ BufferDesc *bufHdr;
+ int status;
+
+ SpinAcquire(BufMgrLock);
+ for (i=0, bufHdr = BufferDescriptors; i < NBuffers; i++, bufHdr++) {
+ if ((bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY)) {
+ bufdb = bufHdr->tag.relId.dbId;
+ bufrel = bufHdr->tag.relId.relId;
+ if (bufdb == MyDatabaseId || bufdb == (Oid) 0) {
+ reln = RelationIdCacheGetRelation(bufrel);
+
+ /*
+ * If we didn't have the reldesc in our local cache, flush this
+ * page out using the 'blind write' storage manager routine. If
+ * we did find it, use the standard interface.
+ */
+
+#ifndef OPTIMIZE_SINGLE
+ SpinRelease(BufMgrLock);
+#endif /* OPTIMIZE_SINGLE */
+ if (reln == (Relation) NULL) {
+ status = smgrblindwrt(bufHdr->bufsmgr, bufHdr->sb_dbname,
+ bufHdr->sb_relname, bufdb, bufrel,
+ bufHdr->tag.blockNum,
+ (char *) MAKE_PTR(bufHdr->data));
+ } else {
+ status = smgrwrite(bufHdr->bufsmgr, reln,
+ bufHdr->tag.blockNum,
+ (char *) MAKE_PTR(bufHdr->data));
+ }
+#ifndef OPTIMIZE_SINGLE
+ SpinAcquire(BufMgrLock);
+#endif /* OPTIMIZE_SINGLE */
+
+ if (status == SM_FAIL) {
+ elog(WARN, "cannot write %d for %16s",
+ bufHdr->tag.blockNum, bufHdr->sb_relname);
+ }
+
+ bufHdr->flags &= ~BM_DIRTY;
+ if (reln != (Relation)NULL)
+ RelationDecrementReferenceCount(reln);
+ }
+ }
+ }
+ SpinRelease(BufMgrLock);
+
+ LocalBufferSync();
+}
+
+
+/*
+ * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf'
+ * is cleared. Because IO_IN_PROGRESS conflicts are
+ * expected to be rare, there is only one BufferIO
+ * lock in the entire system. All processes block
+ * on this semaphore when they try to use a buffer
+ * that someone else is faulting in. Whenever a
+ * process finishes an IO and someone is waiting for
+ * the buffer, BufferIO is signaled (SignalIO). All
+ * waiting processes then wake up and check to see
+ * if their buffer is now ready. This implementation
+ * is simple, but efficient enough if WaitIO is
+ * rarely called by multiple processes simultaneously.
+ *
+ * ProcSleep atomically releases the spinlock and goes to
+ * sleep.
+ *
+ * Note: there is an easy fix if the queue becomes long.
+ * save the id of the buffer we are waiting for in
+ * the queue structure. That way signal can figure
+ * out which proc to wake up.
+ */
+#ifdef HAS_TEST_AND_SET
+static void
+WaitIO(BufferDesc *buf, SPINLOCK spinlock)
+{
+ SpinRelease(spinlock);
+ S_LOCK(&(buf->io_in_progress_lock));
+ S_UNLOCK(&(buf->io_in_progress_lock));
+ SpinAcquire(spinlock);
+}
+
+#else /* HAS_TEST_AND_SET */
+IpcSemaphoreId WaitIOSemId;
+
+static void
+WaitIO(BufferDesc *buf, SPINLOCK spinlock)
+{
+ bool inProgress;
+
+ for (;;) {
+
+ /* wait until someone releases IO lock */
+ (*NWaitIOBackendP)++;
+ SpinRelease(spinlock);
+ IpcSemaphoreLock(WaitIOSemId, 0, 1);
+ SpinAcquire(spinlock);
+ inProgress = (buf->flags & BM_IO_IN_PROGRESS);
+ if (!inProgress) break;
+ }
+}
+
+/*
+ * SignalIO --
+ */
+static void
+SignalIO(BufferDesc *buf)
+{
+ /* somebody better be waiting. */
+ Assert( buf->refcount > 1);
+ IpcSemaphoreUnlock(WaitIOSemId, 0, *NWaitIOBackendP);
+ *NWaitIOBackendP = 0;
+}
+#endif /* HAS_TEST_AND_SET */
+
+long NDirectFileRead; /* some I/O's are direct file access. bypass bufmgr */
+long NDirectFileWrite; /* e.g., I/O in psort and hashjoin. */
+
+void
+PrintBufferUsage(FILE *statfp)
+{
+ float hitrate;
+
+ if (ReadBufferCount==0)
+ hitrate = 0.0;
+ else
+ hitrate = (float)BufferHitCount * 100.0/ReadBufferCount;
+
+ fprintf(statfp, "!\t%ld blocks read, %ld blocks written, buffer hit rate = %.2f%%\n",
+ ReadBufferCount - BufferHitCount + NDirectFileRead,
+ BufferFlushCount + NDirectFileWrite,
+ hitrate);
+}
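
The report format above combines the bufmgr counters with the direct-file counters. A standalone example with sample counter values, showing the same arithmetic:

    #include <stdio.h>

    int main(void)
    {
        /* sample counter values standing in for the real globals */
        long ReadBufferCount = 120, BufferHitCount = 90, BufferFlushCount = 15;
        long NDirectFileRead = 4, NDirectFileWrite = 2;
        float hitrate;

        if (ReadBufferCount == 0)
            hitrate = 0.0;
        else
            hitrate = (float) BufferHitCount * 100.0 / ReadBufferCount;

        /* same line PrintBufferUsage() emits:
         * "!  34 blocks read, 17 blocks written, buffer hit rate = 75.00%" */
        printf("!\t%ld blocks read, %ld blocks written, buffer hit rate = %.2f%%\n",
               ReadBufferCount - BufferHitCount + NDirectFileRead,
               BufferFlushCount + NDirectFileWrite,
               hitrate);
        return 0;
    }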
+
+void
+ResetBufferUsage()
+{
+ BufferHitCount = 0;
+ ReadBufferCount = 0;
+ BufferFlushCount = 0;
+ NDirectFileRead = 0;
+ NDirectFileWrite = 0;
+}
+
+/* ----------------------------------------------
+ * ResetBufferPool
+ *
+ * this routine is supposed to be called when a transaction aborts.
+ * it will release all the buffer pins held by the transaction.
+ *
+ * ----------------------------------------------
+ */
+void
+ResetBufferPool()
+{
+ register int i;
+ for (i=1; i<=NBuffers; i++) {
+ if (BufferIsValid(i)) {
+ while(PrivateRefCount[i - 1] > 0) {
+ ReleaseBuffer(i);
+ }
+ }
+ LastRefCount[i - 1] = 0;
+ }
+
+ ResetLocalBufferPool();
+}
+
+/* -----------------------------------------------
+ * BufferPoolCheckLeak
+ *
+ * check if there is a buffer leak
+ *
+ * -----------------------------------------------
+ */
+int
+BufferPoolCheckLeak()
+{
+ register int i;
+ void PrintBufferDescs();
+
+ for (i = 1; i <= NBuffers; i++) {
+ if (BufferIsValid(i)) {
+ elog(NOTICE, "buffer leak detected in BufferPoolCheckLeak()");
+ PrintBufferDescs();
+ return(1);
+ }
+ }
+ return(0);
+}
+
+/* ------------------------------------------------
+ * FlushBufferPool
+ *
+ * flush all dirty blocks in buffer pool to disk
+ *
+ * ------------------------------------------------
+ */
+void
+FlushBufferPool(int StableMainMemoryFlag)
+{
+ if (!StableMainMemoryFlag) {
+ BufferSync();
+ smgrcommit();
+ }
+}
+
+/*
+ * BufferIsValid --
+ * True iff the refcnt of the local buffer is > 0
+ * Note:
+ * BufferIsValid(InvalidBuffer) is False.
+ * BufferIsValid(UnknownBuffer) is False.
+ */
+bool
+BufferIsValid(Buffer bufnum)
+{
+ if (BufferIsLocal(bufnum))
+ return (bufnum >= -NLocBuffer && LocalRefCount[-bufnum - 1] > 0);
+
+ if (BAD_BUFFER_ID(bufnum))
+ return(false);
+
+ return ((bool)(PrivateRefCount[bufnum - 1] > 0));
+}
+
+/*
+ * BufferGetBlockNumber --
+ * Returns the block number associated with a buffer.
+ *
+ * Note:
+ * Assumes that the buffer is valid.
+ */
+BlockNumber
+BufferGetBlockNumber(Buffer buffer)
+{
+ Assert(BufferIsValid(buffer));
+
+ /* XXX should be a critical section */
+ if (BufferIsLocal(buffer))
+ return (LocalBufferDescriptors[-buffer-1].tag.blockNum);
+ else
+ return (BufferDescriptors[buffer-1].tag.blockNum);
+}
+
+/*
+ * BufferGetRelation --
+ *	Returns the relation descriptor associated with a buffer.
+ *
+ * Note:
+ * Assumes buffer is valid.
+ */
+Relation
+BufferGetRelation(Buffer buffer)
+{
+ Relation relation;
+ Oid relid;
+
+ Assert(BufferIsValid(buffer));
+ Assert(!BufferIsLocal(buffer)); /* not supported for local buffers */
+
+ /* XXX should be a critical section */
+ relid = LRelIdGetRelationId(BufferDescriptors[buffer-1].tag.relId);
+ relation = RelationIdGetRelation(relid);
+
+ RelationDecrementReferenceCount(relation);
+
+ if (RelationHasReferenceCountZero(relation)) {
+ /*
+ elog(NOTICE, "BufferGetRelation: 0->1");
+ */
+
+ RelationIncrementReferenceCount(relation);
+ }
+
+ return (relation);
+}
+
+/*
+ * BufferReplace
+ *
+ * Flush the buffer corresponding to 'bufHdr'
+ *
+ * The BufMgrLock must already be held iff 'bufferLockHeld' is true;
+ * in either case it is released before the page is written out.
+ */
+static int
+BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld)
+{
+ Relation reln;
+ Oid bufdb, bufrel;
+ int status;
+
+ if (!bufferLockHeld)
+ SpinAcquire(BufMgrLock);
+
+ /*
+ * first try to find the reldesc in the cache, if no luck,
+ * don't bother to build the reldesc from scratch, just do
+ * a blind write.
+ */
+
+ bufdb = bufHdr->tag.relId.dbId;
+ bufrel = bufHdr->tag.relId.relId;
+
+ if (bufdb == MyDatabaseId || bufdb == (Oid) NULL)
+ reln = RelationIdCacheGetRelation(bufrel);
+ else
+ reln = (Relation) NULL;
+
+ SpinRelease(BufMgrLock);
+
+ if (reln != (Relation) NULL) {
+ status = smgrflush(bufHdr->bufsmgr, reln, bufHdr->tag.blockNum,
+ (char *) MAKE_PTR(bufHdr->data));
+ } else {
+
+ /* blind write always flushes */
+ status = smgrblindwrt(bufHdr->bufsmgr, bufHdr->sb_dbname,
+ bufHdr->sb_relname, bufdb, bufrel,
+ bufHdr->tag.blockNum,
+ (char *) MAKE_PTR(bufHdr->data));
+ }
+
+ if (status == SM_FAIL)
+ return (FALSE);
+
+ return (TRUE);
+}
+
+/*
+ * RelationGetNumberOfBlocks --
+ *	Returns the number of blocks in the relation.
+ *
+ * Note:
+ * XXX may fail for huge relations.
+ * XXX should be elsewhere.
+ * XXX maybe should be hidden
+ */
+BlockNumber
+RelationGetNumberOfBlocks(Relation relation)
+{
+ return
+ ((relation->rd_islocal) ? relation->rd_nblocks :
+ smgrnblocks(relation->rd_rel->relsmgr, relation));
+}
+
+/*
+ * BufferGetBlock --
+ * Returns a reference to a disk page image associated with a buffer.
+ *
+ * Note:
+ * Assumes buffer is valid.
+ */
+Block
+BufferGetBlock(Buffer buffer)
+{
+ Assert(BufferIsValid(buffer));
+
+ if (BufferIsLocal(buffer))
+ return((Block)MAKE_PTR(LocalBufferDescriptors[-buffer-1].data));
+ else
+ return((Block)MAKE_PTR(BufferDescriptors[buffer-1].data));
+}
+
+/* ---------------------------------------------------------------------
+ * ReleaseTmpRelBuffers
+ *
+ *	this function unmarks all the dirty pages of a temporary
+ *	relation in the buffer pool so that at the end of the transaction
+ *	these pages will not be flushed.
+ *	XXX currently it sequentially searches the buffer pool; this should be
+ *	changed to a smarter way of searching.
+ * --------------------------------------------------------------------
+ */
+void
+ReleaseTmpRelBuffers(Relation tempreldesc)
+{
+ register int i;
+ int holding = 0;
+ BufferDesc *buf;
+
+ for (i=1; i<=NBuffers; i++) {
+ buf = &BufferDescriptors[i-1];
+ if (!holding) {
+ SpinAcquire(BufMgrLock);
+ holding = 1;
+ }
+ if ((buf->flags & BM_DIRTY) &&
+ (buf->tag.relId.dbId == MyDatabaseId) &&
+ (buf->tag.relId.relId == tempreldesc->rd_id)) {
+ buf->flags &= ~BM_DIRTY;
+ if (!(buf->flags & BM_FREE)) {
+ SpinRelease(BufMgrLock);
+ holding = 0;
+ ReleaseBuffer(i);
+ }
+ }
+ }
+ if (holding)
+ SpinRelease(BufMgrLock);
+}
+
+/* ---------------------------------------------------------------------
+ * DropBuffers
+ *
+ * This function marks all the buffers in the buffer cache for a
+ * particular database as clean. This is used when we destroy a
+ * database, to avoid trying to flush data to disk when the directory
+ * tree no longer exists.
+ *
+ * This is an exceedingly non-public interface.
+ * --------------------------------------------------------------------
+ */
+void
+DropBuffers(Oid dbid)
+{
+ register int i;
+ BufferDesc *buf;
+
+ SpinAcquire(BufMgrLock);
+ for (i=1; i<=NBuffers; i++) {
+ buf = &BufferDescriptors[i-1];
+ if ((buf->tag.relId.dbId == dbid) && (buf->flags & BM_DIRTY)) {
+ buf->flags &= ~BM_DIRTY;
+ }
+ }
+ SpinRelease(BufMgrLock);
+}
+
+/* -----------------------------------------------------------------
+ * PrintBufferDescs
+ *
+ * this function prints all the buffer descriptors, for debugging
+ * use only.
+ * -----------------------------------------------------------------
+ */
+void
+PrintBufferDescs()
+{
+ int i;
+ BufferDesc *buf = BufferDescriptors;
+
+ if (IsUnderPostmaster) {
+ SpinAcquire(BufMgrLock);
+ for (i = 0; i < NBuffers; ++i, ++buf) {
+ elog(NOTICE, "[%02d] (freeNext=%d, freePrev=%d, relname=%.*s, \
+blockNum=%d, flags=0x%x, refcount=%d %d)",
+ i, buf->freeNext, buf->freePrev, NAMEDATALEN,
+ &(buf->sb_relname), buf->tag.blockNum, buf->flags,
+ buf->refcount, PrivateRefCount[i]);
+ }
+ SpinRelease(BufMgrLock);
+ } else {
+ /* interactive backend */
+ for (i = 0; i < NBuffers; ++i, ++buf) {
+ printf("[%-2d] (%s, %d) flags=0x%x, refcnt=%d %ld)\n",
+ i, buf->sb_relname, buf->tag.blockNum,
+ buf->flags, buf->refcount, PrivateRefCount[i]);
+ }
+ }
+}
+
+void
+PrintPinnedBufs()
+{
+ int i;
+ BufferDesc *buf = BufferDescriptors;
+
+ SpinAcquire(BufMgrLock);
+ for (i = 0; i < NBuffers; ++i, ++buf) {
+ if (PrivateRefCount[i] > 0)
+ elog(NOTICE, "[%02d] (freeNext=%d, freePrev=%d, relname=%.*s, \
+blockNum=%d, flags=0x%x, refcount=%d %d)\n",
+ i, buf->freeNext, buf->freePrev, NAMEDATALEN, &(buf->sb_relname),
+ buf->tag.blockNum, buf->flags,
+ buf->refcount, PrivateRefCount[i]);
+ }
+ SpinRelease(BufMgrLock);
+}
+
+/*
+ * BufferPoolBlowaway
+ *
+ * this routine is solely for the purpose of experiments -- sometimes
+ * you may want to blow away whatever is left from the past in the buffer
+ * pool and start measuring some performance with a clean, empty buffer
+ * pool.
+ */
+void
+BufferPoolBlowaway()
+{
+ register int i;
+
+ BufferSync();
+ for (i=1; i<=NBuffers; i++) {
+ if (BufferIsValid(i)) {
+ while(BufferIsValid(i))
+ ReleaseBuffer(i);
+ }
+ BufTableDelete(&BufferDescriptors[i-1]);
+ }
+}
+
+#undef IncrBufferRefCount
+#undef ReleaseBuffer
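+/*
+ * IncrBufferRefCount and ReleaseBuffer are presumably also defined as
+ * macros (hence the #undefs above); undefining them lets us define the
+ * real out-of-line versions here, which the _Debug wrappers below call.
+ */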
+
+void
+IncrBufferRefCount(Buffer buffer)
+{
+ if (BufferIsLocal(buffer)) {
+ Assert(LocalRefCount[-buffer - 1] >= 0);
+ LocalRefCount[-buffer - 1]++;
+ } else {
+ Assert(!BAD_BUFFER_ID(buffer));
+ Assert(PrivateRefCount[buffer - 1] >= 0);
+ PrivateRefCount[buffer - 1]++;
+ }
+}
+
+/*
+ * ReleaseBuffer -- remove the pin on a buffer without
+ * marking it dirty.
+ *
+ */
+int
+ReleaseBuffer(Buffer buffer)
+{
+ BufferDesc *bufHdr;
+
+ if (BufferIsLocal(buffer)) {
+ Assert(LocalRefCount[-buffer - 1] > 0);
+ LocalRefCount[-buffer - 1]--;
+ return (STATUS_OK);
+ }
+
+ if (BAD_BUFFER_ID(buffer))
+ return(STATUS_ERROR);
+
+ bufHdr = &BufferDescriptors[buffer-1];
+
+ Assert(PrivateRefCount[buffer - 1] > 0);
+ PrivateRefCount[buffer - 1]--;
+ if (PrivateRefCount[buffer - 1] == 0 && LastRefCount[buffer - 1] == 0) {
+ /* only release buffer if it is not pinned in previous ExecMain
+ levels */
+ SpinAcquire(BufMgrLock);
+ bufHdr->refcount--;
+ if (bufHdr->refcount == 0) {
+ AddBufferToFreelist(bufHdr);
+ bufHdr->flags |= BM_FREE;
+ }
+ SpinRelease(BufMgrLock);
+ }
+
+ return(STATUS_OK);
+}
+
+void
+IncrBufferRefCount_Debug(char *file, int line, Buffer buffer)
+{
+ IncrBufferRefCount(buffer);
+ if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer)) {
+ BufferDesc *buf = &BufferDescriptors[buffer-1];
+
+ fprintf(stderr, "PIN(Incr) %ld relname = %s, blockNum = %d, \
+refcount = %ld, file: %s, line: %d\n",
+ buffer, buf->sb_relname, buf->tag.blockNum,
+ PrivateRefCount[buffer - 1], file, line);
+ }
+}
+
+void
+ReleaseBuffer_Debug(char *file, int line, Buffer buffer)
+{
+ ReleaseBuffer(buffer);
+ if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer)) {
+ BufferDesc *buf = &BufferDescriptors[buffer-1];
+
+ fprintf(stderr, "UNPIN(Rel) %ld relname = %s, blockNum = %d, \
+refcount = %ld, file: %s, line: %d\n",
+ buffer, buf->sb_relname, buf->tag.blockNum,
+ PrivateRefCount[buffer - 1], file, line);
+ }
+}
+
+int
+ReleaseAndReadBuffer_Debug(char *file,
+ int line,
+ Buffer buffer,
+ Relation relation,
+ BlockNumber blockNum)
+{
+ bool bufferValid;
+ Buffer b;
+
+ bufferValid = BufferIsValid(buffer);
+ b = ReleaseAndReadBuffer(buffer, relation, blockNum);
+	if (ShowPinTrace && bufferValid && !BufferIsLocal(buffer)
+	    && is_userbuffer(buffer)) {
+ BufferDesc *buf = &BufferDescriptors[buffer-1];
+
+ fprintf(stderr, "UNPIN(Rel&Rd) %ld relname = %s, blockNum = %d, \
+refcount = %ld, file: %s, line: %d\n",
+ buffer, buf->sb_relname, buf->tag.blockNum,
+ PrivateRefCount[buffer - 1], file, line);
+ }
+	if (ShowPinTrace && !BufferIsLocal(b) && is_userbuffer(b)) {
+ BufferDesc *buf = &BufferDescriptors[b-1];
+
+ fprintf(stderr, "PIN(Rel&Rd) %ld relname = %s, blockNum = %d, \
+refcount = %ld, file: %s, line: %d\n",
+ b, buf->sb_relname, buf->tag.blockNum,
+ PrivateRefCount[b - 1], file, line);
+ }
+ return b;
+}
+
+#ifdef BMTRACE
+
+/*
+ * trace allocations and deallocations in a circular buffer in
+ * shared memory. check the buffer before doing the allocation,
+ * and die if there's anything fishy.
+ */
+
+_bm_trace(Oid dbId, Oid relId, int blkNo, int bufNo, int allocType)
+{
+ static int mypid = 0;
+ long start, cur;
+ bmtrace *tb;
+
+ if (mypid == 0)
+ mypid = getpid();
+
+ start = *CurTraceBuf;
+
+ if (start > 0)
+ cur = start - 1;
+ else
+ cur = BMT_LIMIT - 1;
+
+ for (;;) {
+ tb = &TraceBuf[cur];
+ if (tb->bmt_op != BMT_NOTUSED) {
+ if (tb->bmt_buf == bufNo) {
+ if ((tb->bmt_op == BMT_DEALLOC)
+ || (tb->bmt_dbid == dbId && tb->bmt_relid == relId
+ && tb->bmt_blkno == blkNo))
+ goto okay;
+
+ /* die holding the buffer lock */
+ _bm_die(dbId, relId, blkNo, bufNo, allocType, start, cur);
+ }
+ }
+
+ if (cur == start)
+ goto okay;
+
+ if (cur == 0)
+ cur = BMT_LIMIT - 1;
+ else
+ cur--;
+ }
+
+ okay:
+ tb = &TraceBuf[start];
+ tb->bmt_pid = mypid;
+ tb->bmt_buf = bufNo;
+ tb->bmt_dbid = dbId;
+ tb->bmt_relid = relId;
+ tb->bmt_blkno = blkNo;
+ tb->bmt_op = allocType;
+
+ *CurTraceBuf = (start + 1) % BMT_LIMIT;
+}
+
+_bm_die(Oid dbId, Oid relId, int blkNo, int bufNo,
+ int allocType, long start, long cur)
+{
+ FILE *fp;
+ bmtrace *tb;
+ int i;
+
+ tb = &TraceBuf[cur];
+
+ if ((fp = fopen("/tmp/death_notice", "w")) == (FILE *) NULL)
+ elog(FATAL, "buffer alloc trace error and can't open log file");
+
+ fprintf(fp, "buffer alloc trace detected the following error:\n\n");
+ fprintf(fp, " buffer %d being %s inconsistently with a previous %s\n\n",
+ bufNo, (allocType == BMT_DEALLOC ? "deallocated" : "allocated"),
+ (tb->bmt_op == BMT_DEALLOC ? "deallocation" : "allocation"));
+
+ fprintf(fp, "the trace buffer contains:\n");
+
+ i = start;
+ for (;;) {
+ tb = &TraceBuf[i];
+ if (tb->bmt_op != BMT_NOTUSED) {
+ fprintf(fp, " [%3d]%spid %d buf %2d for <%d,%d,%d> ",
+ i, (i == cur ? " ---> " : "\t"),
+ tb->bmt_pid, tb->bmt_buf,
+ tb->bmt_dbid, tb->bmt_relid, tb->bmt_blkno);
+
+ switch (tb->bmt_op) {
+ case BMT_ALLOCFND:
+ fprintf(fp, "allocate (found)\n");
+ break;
+
+ case BMT_ALLOCNOTFND:
+ fprintf(fp, "allocate (not found)\n");
+ break;
+
+ case BMT_DEALLOC:
+ fprintf(fp, "deallocate\n");
+ break;
+
+ default:
+ fprintf(fp, "unknown op type %d\n", tb->bmt_op);
+ break;
+ }
+ }
+
+ i = (i + 1) % BMT_LIMIT;
+ if (i == start)
+ break;
+ }
+
+ fprintf(fp, "\noperation causing error:\n");
+ fprintf(fp, "\tpid %d buf %d for <%d,%d,%d> ",
+ getpid(), bufNo, dbId, relId, blkNo);
+
+ switch (allocType) {
+ case BMT_ALLOCFND:
+ fprintf(fp, "allocate (found)\n");
+ break;
+
+ case BMT_ALLOCNOTFND:
+ fprintf(fp, "allocate (not found)\n");
+ break;
+
+ case BMT_DEALLOC:
+ fprintf(fp, "deallocate\n");
+ break;
+
+ default:
+ fprintf(fp, "unknown op type %d\n", allocType);
+ break;
+ }
+
+ (void) fclose(fp);
+
+ kill(getpid(), SIGILL);
+}
+
+#endif /* BMTRACE */
+
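+/*
+ * BufferRefCountReset/BufferRefCountRestore --
+ *	save and restore this backend's private pin counts, presumably around
+ *	a nested executor invocation: the outer level's pins are folded into
+ *	LastRefCount so that ReleaseBuffer will not unpin them, and are moved
+ *	back when the inner level finishes.
+ */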
+void
+BufferRefCountReset(int *refcountsave)
+{
+ int i;
+ for (i=0; i<NBuffers; i++) {
+ refcountsave[i] = PrivateRefCount[i];
+ LastRefCount[i] += PrivateRefCount[i];
+ PrivateRefCount[i] = 0;
+ }
+}
+
+void
+BufferRefCountRestore(int *refcountsave)
+{
+ int i;
+ for (i=0; i<NBuffers; i++) {
+ PrivateRefCount[i] = refcountsave[i];
+ LastRefCount[i] -= refcountsave[i];
+ refcountsave[i] = 0;
+ }
+}
+
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
new file mode 100644
index 00000000000..fabc3c29829
--- /dev/null
+++ b/src/backend/storage/buffer/freelist.c
@@ -0,0 +1,285 @@
+/*-------------------------------------------------------------------------
+ *
+ * freelist.c--
+ * routines for manipulating the buffer pool's replacement strategy
+ * freelist.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/storage/buffer/freelist.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * OLD COMMENTS
+ *
+ * Data Structures:
+ * SharedFreeList is a circular queue. Notice that this
+ * is a shared memory queue so the next/prev "ptrs" are
+ * buffer ids, not addresses.
+ *
+ * Sync: all routines in this file assume that the buffer
+ * semaphore has been acquired by the caller.
+ */
+#include <stdio.h>
+#include "storage/bufmgr.h"
+#include "storage/buf_internals.h" /* where declarations go */
+#include "storage/spin.h"
+#include "utils/elog.h"
+
+
+static BufferDesc *SharedFreeList;
+
+/* BufMgrLock is referenced here only for debugging; callers must already
+ * hold it before calling the freelist manager.
+ */
+extern SPINLOCK BufMgrLock;
+
+#define IsInQueue(bf) \
+	do { \
+		Assert((bf)->freeNext != INVALID_DESCRIPTOR); \
+		Assert((bf)->freePrev != INVALID_DESCRIPTOR); \
+		Assert((bf)->flags & BM_FREE); \
+	} while (0)
+
+#define NotInQueue(bf) \
+	do { \
+		Assert((bf)->freeNext == INVALID_DESCRIPTOR); \
+		Assert((bf)->freePrev == INVALID_DESCRIPTOR); \
+		Assert(!((bf)->flags & BM_FREE)); \
+	} while (0)
+
+
+/*
+ * AddBufferToFreelist --
+ *
+ * In theory, this is the only routine that needs to be changed
+ * if the buffer replacement strategy changes. Just change
+ * the manner in which buffers are added to the freelist queue.
+ * Currently, they are added on an LRU basis.
+ */
+void
+AddBufferToFreelist(BufferDesc *bf)
+{
+#ifdef BMTRACE
+ _bm_trace(bf->tag.relId.dbId, bf->tag.relId.relId, bf->tag.blockNum,
+ BufferDescriptorGetBuffer(bf), BMT_DEALLOC);
+#endif /* BMTRACE */
+ NotInQueue(bf);
+
+	/* link bf in at the tail of the free list, just before the dummy head */
+ bf->freePrev = SharedFreeList->freePrev;
+ bf->freeNext = Free_List_Descriptor;
+
+ /* insert new into chain */
+ BufferDescriptors[bf->freeNext].freePrev = bf->buf_id;
+ BufferDescriptors[bf->freePrev].freeNext = bf->buf_id;
+}
+
+#undef PinBuffer
+
+/*
+ * PinBuffer -- make buffer unavailable for replacement.
+ */
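+/*
+ * Note the two-level reference counting here: the shared refcount in the
+ * buffer descriptor counts backends holding at least one pin (it is bumped
+ * only when this backend takes its first pin), while PrivateRefCount[b]
+ * counts this backend's own pins and LastRefCount[b] holds pins carried
+ * over from outer executor levels.
+ */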
+void
+PinBuffer(BufferDesc *buf)
+{
+ long b;
+
+ /* Assert (buf->refcount < 25); */
+
+ if (buf->refcount == 0) {
+ IsInQueue(buf);
+
+ /* remove from freelist queue */
+ BufferDescriptors[buf->freeNext].freePrev = buf->freePrev;
+ BufferDescriptors[buf->freePrev].freeNext = buf->freeNext;
+ buf->freeNext = buf->freePrev = INVALID_DESCRIPTOR;
+
+ /* mark buffer as no longer free */
+ buf->flags &= ~BM_FREE;
+ } else {
+ NotInQueue(buf);
+ }
+
+ b = BufferDescriptorGetBuffer(buf) - 1;
+ Assert(PrivateRefCount[b] >= 0);
+ if (PrivateRefCount[b] == 0 && LastRefCount[b] == 0)
+ buf->refcount++;
+ PrivateRefCount[b]++;
+}
+
+void
+PinBuffer_Debug(char *file, int line, BufferDesc *buf)
+{
+ PinBuffer(buf);
+ if (ShowPinTrace) {
+ Buffer buffer = BufferDescriptorGetBuffer(buf);
+
+ fprintf(stderr, "PIN(Pin) %ld relname = %s, blockNum = %d, \
+refcount = %ld, file: %s, line: %d\n",
+ buffer, buf->sb_relname, buf->tag.blockNum,
+ PrivateRefCount[buffer - 1], file, line);
+ }
+}
+
+#undef UnpinBuffer
+
+/*
+ * UnpinBuffer -- make buffer available for replacement.
+ */
+void
+UnpinBuffer(BufferDesc *buf)
+{
+ long b = BufferDescriptorGetBuffer(buf) - 1;
+
+ Assert(buf->refcount);
+ Assert(PrivateRefCount[b] > 0);
+ PrivateRefCount[b]--;
+ if (PrivateRefCount[b] == 0 && LastRefCount[b] == 0)
+ buf->refcount--;
+ NotInQueue(buf);
+
+ if (buf->refcount == 0) {
+ AddBufferToFreelist(buf);
+ buf->flags |= BM_FREE;
+ } else {
+ /* do nothing */
+ }
+}
+
+void
+UnpinBuffer_Debug(char *file, int line, BufferDesc *buf)
+{
+ UnpinBuffer(buf);
+ if (ShowPinTrace) {
+ Buffer buffer = BufferDescriptorGetBuffer(buf);
+
+ fprintf(stderr, "UNPIN(Unpin) %ld relname = %s, blockNum = %d, \
+refcount = %ld, file: %s, line: %d\n",
+ buffer, buf->sb_relname, buf->tag.blockNum,
+ PrivateRefCount[buffer - 1], file, line);
+ }
+}
+
+/*
+ * GetFreeBuffer() -- get the 'next' buffer from the freelist.
+ *
+ */
+BufferDesc *
+GetFreeBuffer()
+{
+ BufferDesc *buf;
+
+ if (Free_List_Descriptor == SharedFreeList->freeNext) {
+
+ /* queue is empty. All buffers in the buffer pool are pinned. */
+ elog(WARN,"out of free buffers: time to abort !\n");
+ return(NULL);
+ }
+ buf = &(BufferDescriptors[SharedFreeList->freeNext]);
+
+ /* remove from freelist queue */
+ BufferDescriptors[buf->freeNext].freePrev = buf->freePrev;
+ BufferDescriptors[buf->freePrev].freeNext = buf->freeNext;
+ buf->freeNext = buf->freePrev = INVALID_DESCRIPTOR;
+
+ buf->flags &= ~(BM_FREE);
+
+ return(buf);
+}
+
+/*
+ * InitFreeList -- initialize the dummy buffer descriptor used
+ * as a freelist head.
+ *
+ * Assume: All of the buffers are already linked in a circular
+ * queue. Only called by postmaster and only during
+ * initialization.
+ */
+void
+InitFreeList(bool init)
+{
+ SharedFreeList = &(BufferDescriptors[Free_List_Descriptor]);
+
+ if (init) {
+ /* we only do this once, normally the postmaster */
+ SharedFreeList->data = INVALID_OFFSET;
+ SharedFreeList->flags = 0;
+ SharedFreeList->flags &= ~(BM_VALID | BM_DELETED | BM_FREE);
+ SharedFreeList->buf_id = Free_List_Descriptor;
+
+	/* insert it at an arbitrary spot in the circular queue (after descriptor 0) */
+ SharedFreeList->freeNext = BufferDescriptors[0].freeNext;
+ SharedFreeList->freePrev = 0;
+ BufferDescriptors[SharedFreeList->freeNext].freePrev =
+ BufferDescriptors[SharedFreeList->freePrev].freeNext =
+ Free_List_Descriptor;
+ }
+}
+
+
+/*
+ * print out the free list and check for breaks.
+ */
+void
+DBG_FreeListCheck(int nfree)
+{
+ int i;
+ BufferDesc *buf;
+
+ buf = &(BufferDescriptors[SharedFreeList->freeNext]);
+ for (i=0;i<nfree;i++,buf = &(BufferDescriptors[buf->freeNext])) {
+
+		if (!(buf->flags & BM_FREE)) {
+			if (buf != SharedFreeList) {
+				printf("\tfree list corrupted: %d flags %x\n",
+					   buf->buf_id, buf->flags);
+			} else {
+				printf("\tfree list corrupted: too short -- %d not %d\n",
+					   i, nfree);
+			}
+		}
+ if ((BufferDescriptors[buf->freeNext].freePrev != buf->buf_id) ||
+ (BufferDescriptors[buf->freePrev].freeNext != buf->buf_id)) {
+ printf("\tfree list links corrupted: %d %ld %ld\n",
+ buf->buf_id,buf->freePrev,buf->freeNext);
+ }
+	}
+	if (buf != SharedFreeList) {
+		printf("\tfree list corrupted: %d-th buffer is %d\n",
+			   nfree, buf->buf_id);
+	}
+}
+
+/*
+ * PrintBufferFreeList -
+ * prints the buffer free list, for debugging
+ */
+void
+PrintBufferFreeList()
+{
+ BufferDesc *buf;
+
+ if (SharedFreeList->freeNext == Free_List_Descriptor) {
+ printf("free list is empty.\n");
+ return;
+ }
+
+ buf = &(BufferDescriptors[SharedFreeList->freeNext]);
+ for (;;) {
+ int i = (buf - BufferDescriptors);
+ printf("[%-2d] (%s, %d) flags=0x%x, refcnt=%d %ld, nxt=%ld prv=%ld)\n",
+ i, buf->sb_relname, buf->tag.blockNum,
+ buf->flags, buf->refcount, PrivateRefCount[i],
+ buf->freeNext, buf->freePrev);
+
+ if (buf->freeNext == Free_List_Descriptor)
+ break;
+
+ buf = &(BufferDescriptors[buf->freeNext]);
+ }
+}
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
new file mode 100644
index 00000000000..ec625940867
--- /dev/null
+++ b/src/backend/storage/buffer/localbuf.c
@@ -0,0 +1,284 @@
+/*-------------------------------------------------------------------------
+ *
+ * localbuf.c--
+ * local buffer manager. Fast buffer manager for temporary tables
+ * or special cases when the operation is not visible to other backends.
+ *
+ * When a relation is being created, the descriptor will have rd_islocal
+ * set to indicate that the local buffer manager should be used. During
+ * the same transaction in which the relation is created, any inserts
+ * into or selects from the newly created relation will use the local
+ * buffer pool. rd_islocal is reset at the end of the transaction
+ * (commit/abort). This is useful for queries like SELECT INTO TABLE
+ * and CREATE INDEX.
+ *
+ * Copyright (c) 1994-5, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/storage/buffer/localbuf.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <sys/file.h>
+#include <stdio.h>
+#include <math.h>
+#include <signal.h>
+
+/* declarations split between these three files */
+#include "storage/buf.h"
+#include "storage/buf_internals.h"
+#include "storage/bufmgr.h"
+
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "storage/shmem.h"
+#include "storage/spin.h"
+#include "storage/smgr.h"
+#include "storage/lmgr.h"
+#include "miscadmin.h"
+#include "utils/builtins.h"
+#include "utils/hsearch.h"
+#include "utils/elog.h"
+#include "utils/memutils.h"
+#include "executor/execdebug.h" /* for NDirectFileRead */
+#include "catalog/catalog.h"
+
+int NLocBuffer = 64;
+BufferDesc *LocalBufferDescriptors = NULL;
+long *LocalRefCount = NULL;
+
+static int nextFreeLocalBuf = 0;
+
+/*#define LBDEBUG*/
+
+/*
+ * LocalBufferAlloc -
+ * allocate a local buffer. We do round robin allocation for now.
+ */
+BufferDesc *
+LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
+{
+ int i;
+ BufferDesc *bufHdr = (BufferDesc *) NULL;
+
+ if (blockNum == P_NEW) {
+ blockNum = reln->rd_nblocks;
+ reln->rd_nblocks++;
+ }
+
+ /* a low tech search for now -- not optimized for scans */
+ for (i=0; i < NLocBuffer; i++) {
+ if (LocalBufferDescriptors[i].tag.relId.relId == reln->rd_id &&
+ LocalBufferDescriptors[i].tag.blockNum == blockNum) {
+
+#ifdef LBDEBUG
+ fprintf(stderr, "LB ALLOC (%d,%d) %d\n",
+ reln->rd_id, blockNum, -i-1);
+#endif
+ LocalRefCount[i]++;
+ *foundPtr = TRUE;
+ return &LocalBufferDescriptors[i];
+ }
+ }
+
+#ifdef LBDEBUG
+ fprintf(stderr, "LB ALLOC (%d,%d) %d\n",
+ reln->rd_id, blockNum, -nextFreeLocalBuf-1);
+#endif
+
+ /* need to get a new buffer (round robin for now) */
+ for(i=0; i < NLocBuffer; i++) {
+ int b = (nextFreeLocalBuf + i) % NLocBuffer;
+
+ if (LocalRefCount[b]==0) {
+ bufHdr = &LocalBufferDescriptors[b];
+ LocalRefCount[b]++;
+ nextFreeLocalBuf = (b + 1) % NLocBuffer;
+ break;
+ }
+ }
+ if (bufHdr==NULL)
+ elog(WARN, "no empty local buffer.");
+
+ /*
+ * this buffer is not referenced but it might still be dirty (the
+ * last transaction to touch it doesn't need its contents but has
+ * not flushed it). if that's the case, write it out before
+ * reusing it!
+ */
+ if (bufHdr->flags & BM_DIRTY) {
+ Relation bufrel = RelationIdCacheGetRelation(bufHdr->tag.relId.relId);
+
+ Assert(bufrel != NULL);
+
+ /* flush this page */
+ smgrwrite(bufrel->rd_rel->relsmgr, bufrel, bufHdr->tag.blockNum,
+ (char *) MAKE_PTR(bufHdr->data));
+ }
+
+ /*
+ * it's all ours now.
+ */
+ bufHdr->tag.relId.relId = reln->rd_id;
+ bufHdr->tag.blockNum = blockNum;
+ bufHdr->flags &= ~BM_DIRTY;
+
+ /*
+ * lazy memory allocation. (see MAKE_PTR for why we need to do
+ * MAKE_OFFSET.)
+ */
+ if (bufHdr->data == (SHMEM_OFFSET)0) {
+ char *data = (char *)malloc(BLCKSZ);
+
+ bufHdr->data = MAKE_OFFSET(data);
+ }
+
+ *foundPtr = FALSE;
+ return bufHdr;
+}
+
+/*
+ * WriteLocalBuffer -
+ * writes out a local buffer
+ */
+int
+WriteLocalBuffer(Buffer buffer, bool release)
+{
+ int bufid;
+
+ Assert(BufferIsLocal(buffer));
+
+#ifdef LBDEBUG
+ fprintf(stderr, "LB WRITE %d\n", buffer);
+#endif
+
+ bufid = - (buffer + 1);
+ LocalBufferDescriptors[bufid].flags |= BM_DIRTY;
+
+ if (release) {
+ Assert(LocalRefCount[bufid] > 0);
+ LocalRefCount[bufid]--;
+ }
+
+ return true;
+}
+
+/*
+ * FlushLocalBuffer -
+ * flushes a local buffer
+ */
+int
+FlushLocalBuffer(Buffer buffer)
+{
+ int bufid;
+ Relation bufrel;
+ BufferDesc *bufHdr;
+
+ Assert(BufferIsLocal(buffer));
+
+#ifdef LBDEBUG
+ fprintf(stderr, "LB FLUSH %d\n", buffer);
+#endif
+
+ bufid = - (buffer + 1);
+ bufHdr = &LocalBufferDescriptors[bufid];
+ bufHdr->flags &= ~BM_DIRTY;
+ bufrel = RelationIdCacheGetRelation(bufHdr->tag.relId.relId);
+
+ Assert(bufrel != NULL);
+ smgrflush(bufrel->rd_rel->relsmgr, bufrel, bufHdr->tag.blockNum,
+ (char *) MAKE_PTR(bufHdr->data));
+
+ Assert(LocalRefCount[bufid] > 0);
+ LocalRefCount[bufid]--;
+
+ return true;
+}
+
+/*
+ * InitLocalBuffer -
+ * init the local buffer cache. Since most queries (esp. multi-user ones)
+ *	  don't involve local buffers, we delay allocating memory for the
+ *	  actual buffers until we need them.
+ */
+void
+InitLocalBuffer()
+{
+ int i;
+
+ /*
+ * these aren't going away. I'm not gonna use palloc.
+ */
+ LocalBufferDescriptors =
+ (BufferDesc *)malloc(sizeof(BufferDesc) * NLocBuffer);
+ memset(LocalBufferDescriptors, 0, sizeof(BufferDesc) * NLocBuffer);
+ nextFreeLocalBuf = 0;
+
+ for (i = 0; i < NLocBuffer; i++) {
+ BufferDesc *buf = &LocalBufferDescriptors[i];
+
+ /*
+ * negative to indicate local buffer. This is tricky: shared buffers
+ * start with 0. We have to start with -2. (Note that the routine
+ * BufferDescriptorGetBuffer adds 1 to buf_id so our first buffer id
+ * is -1.)
+ */
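+		/*
+		 * For example, local slot i = 0 gets buf_id -2, so
+		 * BufferDescriptorGetBuffer returns Buffer -1, and callers
+		 * recover the slot index as (-buffer - 1) = 0 (as in
+		 * WriteLocalBuffer above).
+		 */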
+ buf->buf_id = - i - 2;
+ }
+
+ LocalRefCount =
+ (long *)malloc(sizeof(long) * NLocBuffer);
+ memset(LocalRefCount, 0, sizeof(long) * NLocBuffer);
+}
+
+/*
+ * LocalBufferSync -
+ * flush all dirty buffers in the local buffer cache. Since the buffer
+ * cache is only used for keeping relations visible during a transaction,
+ * we will not need these buffers again.
+ */
+void
+LocalBufferSync()
+{
+ int i;
+
+ for (i = 0; i < NLocBuffer; i++) {
+ BufferDesc *buf = &LocalBufferDescriptors[i];
+ Relation bufrel;
+
+ if (buf->flags & BM_DIRTY) {
+#ifdef LBDEBUG
+ fprintf(stderr, "LB SYNC %d\n", -i-1);
+#endif
+ bufrel = RelationIdCacheGetRelation(buf->tag.relId.relId);
+
+ Assert(bufrel != NULL);
+
+ smgrwrite(bufrel->rd_rel->relsmgr, bufrel, buf->tag.blockNum,
+ (char *) MAKE_PTR(buf->data));
+
+ buf->tag.relId.relId = InvalidOid;
+ buf->flags &= ~BM_DIRTY;
+ }
+ }
+
+ memset(LocalRefCount, 0, sizeof(long) * NLocBuffer);
+}
+
+void
+ResetLocalBufferPool()
+{
+ int i;
+
+ memset(LocalBufferDescriptors, 0, sizeof(BufferDesc) * NLocBuffer);
+ nextFreeLocalBuf = 0;
+
+ for (i = 0; i < NLocBuffer; i++) {
+ BufferDesc *buf = &LocalBufferDescriptors[i];
+
+ /* just like InitLocalBuffer() */
+ buf->buf_id = - i - 2;
+ }
+
+ memset(LocalRefCount, 0, sizeof(long) * NLocBuffer);
+}
diff --git a/src/backend/storage/bufmgr.h b/src/backend/storage/bufmgr.h
new file mode 100644
index 00000000000..581d3237cad
--- /dev/null
+++ b/src/backend/storage/bufmgr.h
@@ -0,0 +1,112 @@
+/*-------------------------------------------------------------------------
+ *
+ * bufmgr.h--
+ * POSTGRES buffer manager definitions.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: bufmgr.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef BUFMGR_H
+#define BUFMGR_H
+
+#include "c.h"
+
+#include "machine.h" /* for BLCKSZ */
+#include "utils/rel.h"
+
+#include "storage/buf_internals.h" /* UGLY! -- ay */
+
+/*
+ * the maximum size of a disk block for any possible installation.
+ *
+ * in theory this could be anything, but in practice this is actually
+ * limited to 2^13 bytes because we have limited ItemIdData.lp_off and
+ * ItemIdData.lp_len to 13 bits (see itemid.h).
+ */
+#define MAXBLCKSZ 8192
+
+typedef void *Block;
+
+
+/* special pageno for bget */
+#define P_NEW InvalidBlockNumber /* grow the file to get a new page */
+
+typedef bits16 BufferLock;
+
+/**********************************************************************
+
+ the rest is function defns in the bufmgr that are externally callable
+
+ **********************************************************************/
+
+/*
+ * These routines are beaten on quite heavily, hence the macroization.
+ * See buf_internals.h for a related comment.
+ */
+#define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)
+
+/*
+ * BufferIsPinned --
+ * True iff the buffer is pinned (and therefore valid)
+ *
+ * Note:
+ *	Semantics are identical to BufferIsValid
+ * XXX - need to remove either one eventually.
+ */
+#define BufferIsPinned BufferIsValid
+
+
+extern int ShowPinTrace;
+
+/*
+ * prototypes for functions in bufmgr.c
+ */
+extern Buffer RelationGetBufferWithBuffer(Relation relation,
+ BlockNumber blockNumber, Buffer buffer);
+extern Buffer ReadBuffer(Relation reln, BlockNumber blockNum);
+extern Buffer ReadBuffer_Debug(char *file, int line, Relation reln,
+ BlockNumber blockNum);
+extern int WriteBuffer(Buffer buffer);
+extern void WriteBuffer_Debug(char *file, int line, Buffer buffer);
+extern void DirtyBufferCopy(Oid dbid, Oid relid, BlockNumber blkno,
+ char *dest);
+extern int WriteNoReleaseBuffer(Buffer buffer);
+extern Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation,
+ BlockNumber blockNum);
+
+extern void InitBufferPool(IPCKey key);
+extern void PrintBufferUsage(FILE *statfp);
+extern void ResetBufferUsage(void);
+extern void ResetBufferPool(void);
+extern int BufferPoolCheckLeak(void);
+extern void FlushBufferPool(int StableMainMemoryFlag);
+extern bool BufferIsValid(Buffer bufnum);
+extern BlockNumber BufferGetBlockNumber(Buffer buffer);
+extern Relation BufferGetRelation(Buffer buffer);
+extern BlockNumber RelationGetNumberOfBlocks(Relation relation);
+extern Block BufferGetBlock(Buffer buffer);
+extern void ReleaseTmpRelBuffers(Relation tempreldesc);
+extern void DropBuffers(Oid dbid);
+extern void PrintBufferDescs(void);
+extern void PrintPinnedBufs(void);
+extern int BufferShmemSize(void);
+extern void BufferPoolBlowaway(void);
+extern void IncrBufferRefCount(Buffer buffer);
+extern int ReleaseBuffer(Buffer buffer);
+
+extern void IncrBufferRefCount_Debug(char *file, int line, Buffer buffer);
+extern void ReleaseBuffer_Debug(char *file, int line, Buffer buffer);
+extern int ReleaseAndReadBuffer_Debug(char *file,
+ int line,
+ Buffer buffer,
+ Relation relation,
+ BlockNumber blockNum);
+extern void BufferRefCountReset(int *refcountsave);
+extern void BufferRefCountRestore(int *refcountsave);
+
+#endif	/* BUFMGR_H */
+
diff --git a/src/backend/storage/bufpage.h b/src/backend/storage/bufpage.h
new file mode 100644
index 00000000000..9fda973889d
--- /dev/null
+++ b/src/backend/storage/bufpage.h
@@ -0,0 +1,256 @@
+/*-------------------------------------------------------------------------
+ *
+ * bufpage.h--
+ * Standard POSTGRES buffer page definitions.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: bufpage.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef BUFPAGE_H
+#define BUFPAGE_H
+
+#include "c.h"
+#include "machine.h" /* for BLCKSZ */
+
+#include "storage/buf.h"
+#include "storage/item.h"
+#include "storage/itemid.h"
+#include "storage/itemptr.h"
+
+/*
+ * a postgres disk page is an abstraction layered on top of a postgres
+ * disk block (which is simply a unit of i/o, see block.h).
+ *
+ * specifically, while a disk block can be unformatted, a postgres
+ * disk page is always a slotted page of the form:
+ *
+ * +----------------+---------------------------------+
+ * | PageHeaderData | linp0 linp1 linp2 ... |
+ * +-----------+----+---------------------------------+
+ * | ... linpN | |
+ * +-----------+--------------------------------------+
+ * | ^ pd_lower |
+ * | |
+ * | v pd_upper |
+ * +-------------+------------------------------------+
+ * | | tupleN ... |
+ * +-------------+------------------+-----------------+
+ * | ... tuple2 tuple1 tuple0 | "special space" |
+ * +--------------------------------+-----------------+
+ * ^ pd_special
+ *
+ * a page is full when nothing can be added between pd_lower and
+ * pd_upper.
+ *
+ * all blocks written out by an access method must be disk pages.
+ *
+ * EXCEPTIONS:
+ *
+ * obviously, a page is not formatted before it is initialized by
+ * a call to PageInit.
+ *
+ * the contents of the special pg_variable/pg_time/pg_log tables are
+ * raw disk blocks with special formats. these are the only "access
+ * methods" that need not write disk pages.
+ *
+ * NOTES:
+ *
+ * linp0..N form an ItemId array. ItemPointers point into this array
+ * rather than pointing directly to a tuple.
+ *
+ * tuple0..N are added "backwards" on the page. because a tuple's
+ * ItemPointer points to its ItemId entry rather than its actual
+ * byte-offset position, tuples can be physically shuffled on a page
+ * whenever the need arises.
+ *
+ * AM-generic per-page information is kept in the pd_opaque field of
+ * the PageHeaderData. (this is currently only the page size.)
+ * AM-specific per-page data is kept in the area marked "special
+ * space"; each AM has an "opaque" structure defined somewhere that is
+ * stored as the page trailer. an access method should always
+ * initialize its pages with PageInit and then set its own opaque
+ * fields.
+ */
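+/*
+ * Illustrative arithmetic (a sketch, not a definition): on a freshly
+ * initialized page with no special space, pd_lower sits just past the
+ * page header (no line pointers yet, cf. PageIsEmpty below) and
+ * pd_upper == pd_special == the page size.  Each item added consumes one
+ * ItemIdData at pd_lower and the item's bytes at pd_upper, so the
+ * remaining free space is the gap (pd_upper - pd_lower).
+ */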
+typedef Pointer Page;
+
+/*
+ * PageIsValid --
+ * True iff page is valid.
+ */
+#define PageIsValid(page) PointerIsValid(page)
+
+
+/*
+ * location (byte offset) within a page.
+ *
+ * note that this is actually limited to 2^13 because we have limited
+ * ItemIdData.lp_off and ItemIdData.lp_len to 13 bits (see itemid.h).
+ */
+typedef uint16 LocationIndex;
+
+
+/*
+ * space management information generic to any page
+ *
+ * od_pagesize - size in bytes.
+ * in reality, we need at least 64B to fit the
+ * page header, opaque space and a minimal tuple;
+ * on the high end, we can only support pages up
+ * to 8KB because lp_off/lp_len are 13 bits.
+ */
+typedef struct OpaqueData {
+ uint16 od_pagesize;
+} OpaqueData;
+
+typedef OpaqueData *Opaque;
+
+
+/*
+ * disk page organization
+ */
+typedef struct PageHeaderData {
+ LocationIndex pd_lower; /* offset to start of free space */
+ LocationIndex pd_upper; /* offset to end of free space */
+ LocationIndex pd_special; /* offset to start of special space */
+ OpaqueData pd_opaque; /* AM-generic information */
+ ItemIdData pd_linp[1]; /* line pointers */
+} PageHeaderData;
+
+typedef PageHeaderData *PageHeader;
+
+typedef enum {
+ ShufflePageManagerMode,
+ OverwritePageManagerMode
+} PageManagerMode;
+
+/* ----------------
+ * misc support macros
+ * ----------------
+ */
+
+/*
+ * XXX this is wrong -- ignores padding/alignment, variable page size,
+ * AM-specific opaque space at the end of the page (as in btrees), ...
+ * however, it at least serves as an upper bound for heap pages.
+ */
+#define MAXTUPLEN (BLCKSZ - sizeof (PageHeaderData))
+
+/* ----------------------------------------------------------------
+ * page support macros
+ * ----------------------------------------------------------------
+ */
+/*
+ * PageIsValid -- This is defined in page.h.
+ */
+
+/*
+ * PageIsUsed --
+ * True iff the page size is used.
+ *
+ * Note:
+ * Assumes page is valid.
+ */
+#define PageIsUsed(page) \
+ (AssertMacro(PageIsValid(page)) ? \
+ ((bool) (((PageHeader) (page))->pd_lower != 0)) : false)
+
+/*
+ * PageIsEmpty --
+ * returns true iff no itemid has been allocated on the page
+ */
+#define PageIsEmpty(page) \
+ (((PageHeader) (page))->pd_lower == \
+ (sizeof(PageHeaderData) - sizeof(ItemIdData)) ? true : false)
+
+/*
+ * PageGetItemId --
+ * Returns an item identifier of a page.
+ */
+#define PageGetItemId(page, offsetNumber) \
+ ((ItemId) (&((PageHeader) (page))->pd_linp[(-1) + (offsetNumber)]))
+
+/* ----------------
+ * macros to access opaque space
+ * ----------------
+ */
+
+/*
+ * PageSizeIsValid --
+ * True iff the page size is valid.
+ *
+ * XXX currently all page sizes are "valid" but we only actually
+ * use BLCKSZ.
+ */
+#define PageSizeIsValid(pageSize) 1
+
+/*
+ * PageGetPageSize --
+ * Returns the page size of a page.
+ *
+ * this can only be called on a formatted page (unlike
+ * BufferGetPageSize, which can be called on an unformatted page).
+ * however, it can be called on a page for which there is no buffer.
+ */
+#define PageGetPageSize(page) \
+ ((Size) ((PageHeader) (page))->pd_opaque.od_pagesize)
+
+/*
+ * PageSetPageSize --
+ * Sets the page size of a page.
+ */
+#define PageSetPageSize(page, size) \
+ ((PageHeader) (page))->pd_opaque.od_pagesize = (size)
+
+/* ----------------
+ * page special data macros
+ * ----------------
+ */
+/*
+ * PageGetSpecialSize --
+ * Returns size of special space on a page.
+ *
+ * Note:
+ * Assumes page is locked.
+ */
+#define PageGetSpecialSize(page) \
+ ((uint16) (PageGetPageSize(page) - ((PageHeader)page)->pd_special))
+
+/*
+ * PageGetSpecialPointer --
+ * Returns pointer to special space on a page.
+ *
+ * Note:
+ * Assumes page is locked.
+ */
+#define PageGetSpecialPointer(page) \
+ (AssertMacro(PageIsValid(page)) ? \
+ (char *) ((char *) (page) + ((PageHeader) (page))->pd_special) \
+ : (char *) 0)
+
+/* ----------------------------------------------------------------
+ * extern declarations
+ * ----------------------------------------------------------------
+ */
+
+extern Size BufferGetPageSize(Buffer buffer);
+extern Page BufferGetPage(Buffer buffer);
+extern void PageInit(Page page, Size pageSize, Size specialSize);
+extern Item PageGetItem(Page page, ItemId itemId);
+extern OffsetNumber PageAddItem(Page page, Item item, Size size,
+ OffsetNumber offsetNumber, ItemIdFlags flags);
+extern Page PageGetTempPage(Page page, Size specialSize);
+extern void PageRestoreTempPage(Page tempPage, Page oldPage);
+extern OffsetNumber PageGetMaxOffsetNumber(Page page);
+extern void PageRepairFragmentation(Page page);
+extern Size PageGetFreeSpace(Page page);
+extern void PageManagerModeSet(PageManagerMode mode);
+extern void PageIndexTupleDelete(Page page, OffsetNumber offset);
+extern void PageIndexTupleDeleteAdjustLinePointers(PageHeader phdr,
+ char *location, Size size);
+
+
+#endif /* BUFPAGE_H */
diff --git a/src/backend/storage/fd.h b/src/backend/storage/fd.h
new file mode 100644
index 00000000000..da28b031bb8
--- /dev/null
+++ b/src/backend/storage/fd.h
@@ -0,0 +1,96 @@
+/*-------------------------------------------------------------------------
+ *
+ * fd.h--
+ * Virtual file descriptor definitions.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: fd.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * calls:
+ *
+ * File {Close, Read, Write, Seek, Tell, Sync}
+ * {File Name Open, Allocate, Free} File
+ *
+ * These are NOT JUST RENAMINGS OF THE UNIX ROUTINES.
+ * use them for all file activity...
+ *
+ *	File fd;
+ *	fd = FileNameOpenFile("foo", O_RDONLY, 0600);
+ *
+ * use AllocateFile if you need a file descriptor in some other context.
+ * it will make sure that there is a file descriptor free
+ *
+ * use FreeFile to let the virtual file descriptor package know that
+ * there is now a free fd (when you are done with it)
+ *
+ * AllocateFile();
+ * FreeFile();
+ */
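+/*
+ * A minimal usage sketch (hypothetical file name, flags and mode):
+ *
+ *	File fd = FileNameOpenFile("myrel", O_RDWR | O_CREAT, 0600);
+ *	char buf[8192];
+ *
+ *	FileSeek(fd, 0L, SEEK_SET);
+ *	FileRead(fd, buf, sizeof(buf));
+ *	FileWrite(fd, buf, sizeof(buf));
+ *	FileSync(fd);
+ *	FileClose(fd);
+ */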
+#ifndef FD_H
+#define FD_H
+
+/*
+ * FileOpen uses the standard UNIX open(2) flags.
+ */
+#include <fcntl.h> /* for O_ on most */
+#ifndef O_RDONLY
+#include <sys/file.h> /* for O_ on the rest */
+#endif /* O_RDONLY */
+
+/*
+ * FileSeek uses the standard UNIX lseek(2) flags.
+ */
+#ifndef WIN32
+#include <unistd.h> /* for SEEK_ on most */
+#else
+#ifndef SEEK_SET
+#include <stdio.h> /* for SEEK_ on the rest */
+#endif /* SEEK_SET */
+#endif /* WIN32 */
+
+#include "c.h"
+#include "storage/block.h"
+
+typedef char *FileName;
+
+typedef int File;
+
+/* originally in libpq-fs.h */
+struct pgstat { /* just the fields we need from stat structure */
+ int st_ino;
+ int st_mode;
+ unsigned int st_size;
+ unsigned int st_sizehigh; /* high order bits */
+/* 2^64 == 1.8 x 10^20 bytes */
+ int st_uid;
+ int st_atime_s; /* just the seconds */
+ int st_mtime_s; /* since SysV and the new BSD both have */
+ int st_ctime_s; /* usec fields.. */
+};
+
+/*
+ * prototypes for functions in fd.c
+ */
+extern void FileInvalidate(File file);
+extern File FileNameOpenFile(FileName fileName, int fileFlags, int fileMode);
+extern File PathNameOpenFile(FileName fileName, int fileFlags, int fileMode);
+extern void FileClose(File file);
+extern void FileUnlink(File file);
+extern int FileRead(File file, char *buffer, int amount);
+extern int FileWrite(File file, char *buffer, int amount);
+extern long FileSeek(File file, long offset, int whence);
+extern long FileTell(File file);
+extern int FileTruncate(File file, int offset);
+extern int FileSync(File file);
+extern int FileNameUnlink(char *filename);
+extern void AllocateFile(void);
+extern void FreeFile(void);
+extern void closeAllVfds(void);
+extern void closeOneVfd(void);
+
+#endif /* FD_H */
diff --git a/src/backend/storage/file/Makefile.inc b/src/backend/storage/file/Makefile.inc
new file mode 100644
index 00000000000..767cbecd38a
--- /dev/null
+++ b/src/backend/storage/file/Makefile.inc
@@ -0,0 +1,14 @@
+#-------------------------------------------------------------------------
+#
+# Makefile.inc--
+# Makefile for storage/file
+#
+# Copyright (c) 1994, Regents of the University of California
+#
+#
+# IDENTIFICATION
+# $Header: /cvsroot/pgsql/src/backend/storage/file/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $
+#
+#-------------------------------------------------------------------------
+
+SUBSRCS+= fd.c
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
new file mode 100644
index 00000000000..bb94c4c5dec
--- /dev/null
+++ b/src/backend/storage/file/fd.c
@@ -0,0 +1,888 @@
+/*-------------------------------------------------------------------------
+ *
+ * fd.c--
+ * Virtual file descriptor code.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * $Id: fd.c,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $
+ *
+ * NOTES:
+ *
+ * This code manages a cache of 'virtual' file descriptors (VFDs).
+ * The server opens many file descriptors for a variety of reasons,
+ * including base tables, scratch files (e.g., sort and hash spool
+ * files), and random calls to C library routines like system(3); it
+ * is quite easy to exceed system limits on the number of open files a
+ * single process can have. (This is around 256 on many modern
+ * operating systems, but can be as low as 32 on others.)
+ *
+ * VFDs are managed as an LRU pool, with actual OS file descriptors
+ * being opened and closed as needed.  Obviously, if a file is
+ * opened using these interfaces, all subsequent operations must also
+ * be through these interfaces (the File type is not a real file
+ * descriptor).
+ *
+ * For this scheme to work, most (if not all) routines throughout the
+ * server should use these interfaces instead of calling the C library
+ * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
+ * may find ourselves short of real file descriptors anyway.
+ *
+ * This file used to contain a bunch of stuff to support RAID levels 0
+ * (jbod), 1 (duplex) and 5 (xor parity). That stuff is all gone
+ * because the parallel query processing code that called it is all
+ * gone. If you really need it you could get it from the original
+ * POSTGRES source.
+ *-------------------------------------------------------------------------
+ */
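+/*
+ * Rough lifecycle of a VFD (a sketch of the code below): FileNameOpenFile
+ * allocates a Vfd slot and opens the real descriptor; when too many real
+ * descriptors are in use, the least recently used one is closed (its seek
+ * position is saved and the file is fsync'd if dirty); a later
+ * FileRead/FileWrite on that VFD transparently re-opens the file via
+ * FileAccess/LruInsert and restores the seek position.
+ */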
+
+#include <stdio.h>
+#include <sys/file.h>
+#include <sys/param.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "c.h"
+#include "miscadmin.h" /* for DataDir */
+#include "utils/palloc.h"
+
+#ifdef PORTNAME_sparc
+/*
+ * the SunOS 4 NOFILE is a lie, because the default limit is *not* the
+ * maximum number of file descriptors you can have open.
+ *
+ * we have to either use this number (the default dtablesize) or
+ * explicitly call setrlimit(RLIMIT_NOFILE, NOFILE).
+ */
+#include <sys/user.h>
+#undef NOFILE
+#define NOFILE NOFILE_IN_U
+#endif /* PORTNAME_sparc */
+
+/*
+ * Problem: Postgres does a system(ld...) to do dynamic loading. This
+ * will open several extra files in addition to those used by
+ * Postgres.  We need to do this hack to guarantee that there are file
+ * descriptors free for ld to use.
+ *
+ * The current solution is to limit the number of file descriptors
+ * that this code will allocate at one time.  (It leaves
+ * RESERVE_FOR_LD free.)
+ *
+ * (Even though most dynamic loaders now use dlopen(3) or the
+ * equivalent, the OS must still open several files to perform the
+ * dynamic loading. Keep this here.)
+ */
+#define RESERVE_FOR_LD 10
+
+/*
+ * If we are using weird storage managers, we may need to keep real
+ * file descriptors open so that the jukebox server doesn't think we
+ * have gone away (and no longer care about a platter or file that
+ * we've been using). This might be an actual file descriptor for a
+ * local jukebox interface that uses paths, or a socket connection for
+ * a network jukebox server. Since we can't be opening and closing
+ * these descriptors at whim, we must make allowances for them.
+ */
+#ifdef HP_JUKEBOX
+#define RESERVE_FOR_JB 25
+#define MAXFILES ((NOFILE - RESERVE_FOR_LD) - RESERVE_FOR_JB)
+#else /* HP_JUKEBOX */
+#define MAXFILES (NOFILE - RESERVE_FOR_LD)
+#endif /* HP_JUKEBOX */
+
+/* Debugging.... */
+
+#ifdef FDDEBUG
+# define DO_DB(A) A
+#else
+# define DO_DB(A) /* A */
+#endif
+
+#define VFD_CLOSED -1
+
+#include "storage/fd.h"
+#include "utils/elog.h"
+
+#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
+
+typedef struct vfd {
+ signed short fd;
+ unsigned short fdstate;
+
+#define FD_DIRTY (1 << 0)
+
+ File nextFree;
+ File lruMoreRecently;
+ File lruLessRecently;
+ long seekPos;
+ char *fileName;
+ int fileFlags;
+ int fileMode;
+} Vfd;
+
+/*
+ * Virtual File Descriptor array pointer and size. This grows as
+ * needed.
+ */
+static Vfd *VfdCache;
+static Size SizeVfdCache = 0;
+
+/*
+ * Minimum number of file descriptors known to be free.
+ */
+static int FreeFd = 0;
+
+/*
+ * Number of file descriptors known to be open.
+ */
+static int nfile = 0;
+
+/*
+ * we use the name of the null device in various places, mostly so
+ * that we can open it and find out if we really have any descriptors
+ * available or not.
+ */
+#ifndef WIN32
+static char *Nulldev = "/dev/null";
+static char Sep_char = '/';
+#else
+static char *Nulldev = "NUL";
+static char Sep_char = '\\';
+#endif /* WIN32 */
+
+/*
+ * Private Routines
+ *
+ * Delete - delete a file from the Lru ring
+ * LruDelete - remove a file from the Lru ring and close
+ * Insert - put a file at the front of the Lru ring
+ * LruInsert - put a file at the front of the Lru ring and open
+ * AssertLruRoom - make sure that there is a free fd.
+ *
+ * the Least Recently Used ring is a doubly linked list that begins and
+ * ends on element zero.
+ *
+ * example:
+ *
+ * /--less----\ /---------\
+ * v \ v \
+ * #0 --more---> LeastRecentlyUsed --more-\ \
+ * ^\ | |
+ * \\less--> MostRecentlyUsedFile <---/ |
+ * \more---/ \--less--/
+ *
+ * AllocateVfd - grab a free (or new) file record (from VfdCache)
+ * FreeVfd - free a file record
+ *
+ */
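+/*
+ * Note that element 0 of VfdCache is a dummy entry serving as both the
+ * head of the LRU ring and the head of the free list; it never holds a
+ * real file, which is why Delete and LruDelete assert file != 0.
+ */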
+static void Delete(File file);
+static void LruDelete(File file);
+static void Insert(File file);
+static int LruInsert (File file);
+static void AssertLruRoom(void);
+static File AllocateVfd(void);
+static void FreeVfd(File file);
+
+static int FileAccess(File file);
+static File fileNameOpenFile(FileName fileName, int fileFlags, int fileMode);
+static char *filepath(char *filename);
+
+#if defined(FDDEBUG)
+static void
+_dump_lru()
+{
+ int mru = VfdCache[0].lruLessRecently;
+ Vfd *vfdP = &VfdCache[mru];
+
+ printf("MOST %d ", mru);
+ while (mru != 0)
+ {
+ mru = vfdP->lruLessRecently;
+ vfdP = &VfdCache[mru];
+ printf("%d ", mru);
+ }
+ printf("LEAST\n");
+}
+#endif /* FDDEBUG */
+
+static void
+Delete(File file)
+{
+ Vfd *fileP;
+
+ DO_DB(printf("DEBUG: Delete %d (%s)\n",
+ file, VfdCache[file].fileName));
+ DO_DB(_dump_lru());
+
+ Assert(file != 0);
+
+ fileP = &VfdCache[file];
+
+ VfdCache[fileP->lruLessRecently].lruMoreRecently =
+ VfdCache[file].lruMoreRecently;
+ VfdCache[fileP->lruMoreRecently].lruLessRecently =
+ VfdCache[file].lruLessRecently;
+
+ DO_DB(_dump_lru());
+}
+
+static void
+LruDelete(File file)
+{
+ Vfd *fileP;
+ int returnValue;
+
+ DO_DB(printf("DEBUG: LruDelete %d (%s)\n",
+ file, VfdCache[file].fileName));
+
+ Assert(file != 0);
+
+ fileP = &VfdCache[file];
+
+ /* delete the vfd record from the LRU ring */
+ Delete(file);
+
+ /* save the seek position */
+ fileP->seekPos = lseek(fileP->fd, 0L, SEEK_CUR);
+ Assert( fileP->seekPos != -1);
+
+ /* if we have written to the file, sync it */
+ if (fileP->fdstate & FD_DIRTY) {
+ returnValue = fsync(fileP->fd);
+ Assert(returnValue != -1);
+ fileP->fdstate &= ~FD_DIRTY;
+ }
+
+ /* close the file */
+ returnValue = close(fileP->fd);
+ Assert(returnValue != -1);
+
+ --nfile;
+ fileP->fd = VFD_CLOSED;
+
+ /* note that there is now one more free real file descriptor */
+ FreeFd++;
+}
+
+static void
+Insert(File file)
+{
+ Vfd *vfdP;
+
+ DO_DB(printf("DEBUG: Insert %d (%s)\n",
+ file, VfdCache[file].fileName));
+ DO_DB(_dump_lru());
+
+ vfdP = &VfdCache[file];
+
+ vfdP->lruMoreRecently = 0;
+ vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
+ VfdCache[0].lruLessRecently = file;
+ VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
+
+ DO_DB(_dump_lru());
+}
+
+static int
+LruInsert (File file)
+{
+ Vfd *vfdP;
+ int returnValue;
+
+ DO_DB(printf("DEBUG: LruInsert %d (%s)\n",
+ file, VfdCache[file].fileName));
+
+ vfdP = &VfdCache[file];
+
+ if (FileIsNotOpen(file)) {
+ int tmpfd;
+
+ /*
+ * Note, we check to see if there's a free file descriptor
+ * before attempting to open a file. One general way to do
+ * this is to try to open the null device which everybody
+ * should be able to open all the time. If this fails, we
+		 * assume this is because there are no free file descriptors.
+ */
+ tryAgain:
+ tmpfd = open(Nulldev, O_CREAT|O_RDWR, 0666);
+ if (tmpfd < 0) {
+ FreeFd = 0;
+ errno = 0;
+ AssertLruRoom();
+ goto tryAgain;
+ } else {
+ close(tmpfd);
+ }
+ vfdP->fd = open(vfdP->fileName,vfdP->fileFlags,vfdP->fileMode);
+
+ if (vfdP->fd < 0) {
+ DO_DB(printf("RE_OPEN FAILED: %d\n",
+ errno));
+ return (vfdP->fd);
+ } else {
+ DO_DB(printf("RE_OPEN SUCCESS\n"));
+ ++nfile;
+ }
+
+ /* seek to the right position */
+ if (vfdP->seekPos != 0L) {
+ returnValue =
+ lseek(vfdP->fd, vfdP->seekPos, SEEK_SET);
+ Assert(returnValue != -1);
+ }
+
+ /* init state on open */
+ vfdP->fdstate = 0x0;
+
+ /* note that a file descriptor has been used up */
+ if (FreeFd > 0)
+ FreeFd--;
+ }
+
+ /*
+ * put it at the head of the Lru ring
+ */
+
+ Insert(file);
+
+ return (0);
+}
+
+static void
+AssertLruRoom()
+{
+ DO_DB(printf("DEBUG: AssertLruRoom (FreeFd = %d)\n",
+ FreeFd));
+
+ if (FreeFd <= 0 || nfile >= MAXFILES) {
+ LruDelete(VfdCache[0].lruMoreRecently);
+ }
+}
+
+static File
+AllocateVfd()
+{
+ Index i;
+ File file;
+
+ DO_DB(printf("DEBUG: AllocateVfd\n"));
+
+ if (SizeVfdCache == 0) {
+
+ /* initialize */
+ VfdCache = (Vfd *)malloc(sizeof(Vfd));
+
+ VfdCache->nextFree = 0;
+ VfdCache->lruMoreRecently = 0;
+ VfdCache->lruLessRecently = 0;
+ VfdCache->fd = VFD_CLOSED;
+ VfdCache->fdstate = 0x0;
+
+ SizeVfdCache = 1;
+ }
+
+ if (VfdCache[0].nextFree == 0) {
+
+ /*
+ * The free list is empty so it is time to increase the
+ * size of the array
+ */
+
+ VfdCache =(Vfd *)realloc(VfdCache, sizeof(Vfd)*SizeVfdCache*2);
+ Assert(VfdCache != NULL);
+
+ /*
+ * Set up the free list for the new entries
+ */
+
+ for (i = SizeVfdCache; i < 2*SizeVfdCache; i++) {
+ memset((char *) &(VfdCache[i]), 0, sizeof(VfdCache[0]));
+ VfdCache[i].nextFree = i+1;
+ VfdCache[i].fd = VFD_CLOSED;
+ }
+
+ /*
+ * Element 0 is the first and last element of the free
+ * list
+ */
+
+ VfdCache[0].nextFree = SizeVfdCache;
+ VfdCache[2*SizeVfdCache-1].nextFree = 0;
+
+ /*
+ * Record the new size
+ */
+
+ SizeVfdCache *= 2;
+ }
+ file = VfdCache[0].nextFree;
+
+ VfdCache[0].nextFree = VfdCache[file].nextFree;
+
+ return file;
+}
+
+static void
+FreeVfd(File file)
+{
+ DO_DB(printf("DB: FreeVfd: %d (%s)\n",
+ file, VfdCache[file].fileName));
+
+ VfdCache[file].nextFree = VfdCache[0].nextFree;
+ VfdCache[0].nextFree = file;
+}
+
+static char *
+filepath(char *filename)
+{
+ char *buf;
+ char basename[16];
+ int len;
+
+#ifndef WIN32
+ if (*filename != Sep_char) {
+#else
+ if (!(filename[1] == ':' && filename[2] == Sep_char)) {
+#endif /* WIN32 */
+
+ /* Either /base/ or \base\ */
+ sprintf(basename, "%cbase%c", Sep_char, Sep_char);
+
+ len = strlen(DataDir) + strlen(basename) + strlen(GetDatabaseName())
+ + strlen(filename) + 2;
+ buf = (char*) palloc(len);
+ sprintf(buf, "%s%s%s%c%s",
+ DataDir, basename, GetDatabaseName(), Sep_char, filename);
+ } else {
+ buf = (char *) palloc(strlen(filename) + 1);
+ strcpy(buf, filename);
+ }
+
+ return(buf);
+}
+
+static int
+FileAccess(File file)
+{
+ int returnValue;
+
+ DO_DB(printf("DB: FileAccess %d (%s)\n",
+ file, VfdCache[file].fileName));
+
+ /*
+ * Is the file open? If not, close the least recently used,
+ * then open it and stick it at the head of the used ring
+ */
+
+ if (FileIsNotOpen(file)) {
+
+ AssertLruRoom();
+
+ returnValue = LruInsert(file);
+ if (returnValue != 0)
+ return returnValue;
+
+ } else {
+
+ /*
+ * We now know that the file is open and that it is not the
+		 * last one accessed, so we need to move it to the head of
+ * the Lru ring.
+ */
+
+ Delete(file);
+ Insert(file);
+ }
+
+ return (0);
+}
+
+/*
+ * Called when we get a shared invalidation message on some relation.
+ */
+void
+FileInvalidate(File file)
+{
+ if (!FileIsNotOpen(file)) {
+ LruDelete(file);
+ }
+}
+
+/* VARARGS2 */
+static File
+fileNameOpenFile(FileName fileName,
+ int fileFlags,
+ int fileMode)
+{
+ static int osRanOut = 0;
+ File file;
+ Vfd *vfdP;
+ int tmpfd;
+
+ DO_DB(printf("DEBUG: FileNameOpenFile: %s %x %o\n",
+ fileName, fileFlags, fileMode));
+
+ file = AllocateVfd();
+ vfdP = &VfdCache[file];
+
+ if (nfile >= MAXFILES || (FreeFd == 0 && osRanOut)) {
+ AssertLruRoom();
+ }
+
+ tryAgain:
+ tmpfd = open(Nulldev, O_CREAT|O_RDWR, 0666);
+ if (tmpfd < 0) {
+ DO_DB(printf("DB: not enough descs, retry, er= %d\n",
+ errno));
+ errno = 0;
+ FreeFd = 0;
+ osRanOut = 1;
+ AssertLruRoom();
+ goto tryAgain;
+ } else {
+ close(tmpfd);
+ }
+
+#ifdef WIN32
+ fileFlags |= _O_BINARY;
+#endif /* WIN32 */
+ vfdP->fd = open(fileName,fileFlags,fileMode);
+ vfdP->fdstate = 0x0;
+
+ if (vfdP->fd < 0) {
+ FreeVfd(file);
+ return -1;
+ }
+ ++nfile;
+ DO_DB(printf("DB: FNOF success %d\n",
+ vfdP->fd));
+
+ (void)LruInsert(file);
+
+ if (fileName==NULL) {
+ elog(WARN, "fileNameOpenFile: NULL fname");
+ }
+ vfdP->fileName = malloc(strlen(fileName)+1);
+ strcpy(vfdP->fileName,fileName);
+
+ vfdP->fileFlags = fileFlags & ~(O_TRUNC|O_EXCL);
+ vfdP->fileMode = fileMode;
+ vfdP->seekPos = 0;
+
+ return file;
+}
+
+/*
+ * open a file in the database directory ($PGDATA/base/...)
+ */
+File
+FileNameOpenFile(FileName fileName, int fileFlags, int fileMode)
+{
+ File fd;
+ char *fname;
+
+ fname = filepath(fileName);
+ fd = fileNameOpenFile(fname, fileFlags, fileMode);
+ pfree(fname);
+ return(fd);
+}
+
+/*
+ * open a file in an arbitrary directory
+ */
+File
+PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
+{
+ return(fileNameOpenFile(fileName, fileFlags, fileMode));
+}
+
+void
+FileClose(File file)
+{
+ int returnValue;
+
+ DO_DB(printf("DEBUG: FileClose: %d (%s)\n",
+ file, VfdCache[file].fileName));
+
+ if (!FileIsNotOpen(file)) {
+
+ /* remove the file from the lru ring */
+ Delete(file);
+
+ /* record the new free operating system file descriptor */
+ FreeFd++;
+
+ /* if we did any writes, sync the file before closing */
+ if (VfdCache[file].fdstate & FD_DIRTY) {
+ returnValue = fsync(VfdCache[file].fd);
+ Assert(returnValue != -1);
+ VfdCache[file].fdstate &= ~FD_DIRTY;
+ }
+
+ /* close the file */
+ returnValue = close(VfdCache[file].fd);
+ Assert(returnValue != -1);
+
+ --nfile;
+ VfdCache[file].fd = VFD_CLOSED;
+ }
+ /*
+ * Add the Vfd slot to the free list
+ */
+ FreeVfd(file);
+ /*
+ * Free the filename string
+ */
+ free(VfdCache[file].fileName);
+}
+
+void
+FileUnlink(File file)
+{
+ int returnValue;
+
+ DO_DB(printf("DB: FileUnlink: %d (%s)\n",
+ file, VfdCache[file].fileName));
+
+ if (!FileIsNotOpen(file)) {
+
+ /* remove the file from the lru ring */
+ Delete(file);
+
+ /* record the new free operating system file descriptor */
+ FreeFd++;
+
+ /* if we did any writes, sync the file before closing */
+ if (VfdCache[file].fdstate & FD_DIRTY) {
+ returnValue = fsync(VfdCache[file].fd);
+ Assert(returnValue != -1);
+ VfdCache[file].fdstate &= ~FD_DIRTY;
+ }
+
+ /* close the file */
+ returnValue = close(VfdCache[file].fd);
+ Assert(returnValue != -1);
+
+ --nfile;
+ VfdCache[file].fd = VFD_CLOSED;
+ }
+ /* add the Vfd slot to the free list */
+ FreeVfd(file);
+
+ /* unlink the physical file and free the filename string */
+ unlink(VfdCache[file].fileName);
+ free(VfdCache[file].fileName);
+}
+
+int
+FileRead(File file, char *buffer, int amount)
+{
+ int returnCode;
+
+ DO_DB(printf("DEBUG: FileRead: %d (%s) %d 0x%x\n",
+ file, VfdCache[file].fileName, amount, buffer));
+
+ FileAccess(file);
+ returnCode = read(VfdCache[file].fd, buffer, amount);
+ if (returnCode > 0) {
+ VfdCache[file].seekPos += returnCode;
+ }
+
+ return returnCode;
+}
+
+int
+FileWrite(File file, char *buffer, int amount)
+{
+ int returnCode;
+
+ DO_DB(printf("DB: FileWrite: %d (%s) %d 0x%lx\n",
+ file, VfdCache[file].fileName, amount, buffer));
+
+ FileAccess(file);
+ returnCode = write(VfdCache[file].fd, buffer, amount);
+ if (returnCode > 0) { /* changed by Boris with Mao's advice */
+ VfdCache[file].seekPos += returnCode;
+ }
+
+ /* record the write */
+ VfdCache[file].fdstate |= FD_DIRTY;
+
+ return returnCode;
+}
+
+long
+FileSeek(File file, long offset, int whence)
+{
+ int returnCode;
+
+ DO_DB(printf("DEBUG: FileSeek: %d (%s) %d %d\n",
+ file, VfdCache[file].fileName, offset, whence));
+
+ if (FileIsNotOpen(file)) {
+ switch(whence) {
+ case SEEK_SET:
+ VfdCache[file].seekPos = offset;
+ return offset;
+ case SEEK_CUR:
+ VfdCache[file].seekPos = VfdCache[file].seekPos +offset;
+ return VfdCache[file].seekPos;
+ case SEEK_END:
+ FileAccess(file);
+ returnCode = VfdCache[file].seekPos =
+ lseek(VfdCache[file].fd, offset, whence);
+ return returnCode;
+ default:
+ elog(WARN, "FileSeek: invalid whence: %d", whence);
+ break;
+ }
+ } else {
+ returnCode = VfdCache[file].seekPos =
+ lseek(VfdCache[file].fd, offset, whence);
+ return returnCode;
+ }
+ /*NOTREACHED*/
+ return(-1L);
+}
+
+/*
+ * XXX not actually used but here for completeness
+ */
+long
+FileTell(File file)
+{
+ DO_DB(printf("DEBUG: FileTell %d (%s)\n",
+ file, VfdCache[file].fileName));
+ return VfdCache[file].seekPos;
+}
+
+int
+FileTruncate(File file, int offset)
+{
+ int returnCode;
+
+ DO_DB(printf("DEBUG: FileTruncate %d (%s)\n",
+ file, VfdCache[file].fileName));
+
+ (void) FileSync(file);
+ (void) FileAccess(file);
+ returnCode = ftruncate(VfdCache[file].fd, offset);
+ return(returnCode);
+}
+
+int
+FileSync(File file)
+{
+ int returnCode;
+
+ /*
+ * If the file isn't open, then we don't need to sync it; we
+ * always sync files when we close them. Also, if we haven't
+ * done any writes that we haven't already synced, we can ignore
+ * the request.
+ */
+
+ if (VfdCache[file].fd < 0 || !(VfdCache[file].fdstate & FD_DIRTY)) {
+ returnCode = 0;
+ } else {
+ returnCode = fsync(VfdCache[file].fd);
+ VfdCache[file].fdstate &= ~FD_DIRTY;
+ }
+
+ return returnCode;
+}
+
+int
+FileNameUnlink(char *filename)
+{
+ int retval;
+ char *fname;
+
+ fname = filepath(filename);
+ retval = unlink(fname);
+ pfree(fname);
+ return(retval);
+}
+
+/*
+ * If we want to be sure that we have a real file descriptor available
+ * (e.g., we want to know this in psort), we call AllocateFile to force
+ * availability. When we are done we call FreeFile to deallocate the
+ * descriptor.
+ *
+ * allocatedFiles keeps track of how many have been allocated so we
+ * can give a warning if there are too few left.
+ */
+static int allocatedFiles = 0;
+
+void
+AllocateFile()
+{
+ int fd;
+ int fdleft;
+
+ while ((fd = open(Nulldev,O_WRONLY,0)) < 0) {
+ if (errno == EMFILE) {
+ errno = 0;
+ FreeFd = 0;
+ AssertLruRoom();
+ } else {
+ elog(WARN,"Open: %s in %s line %d\n", Nulldev,
+ __FILE__, __LINE__);
+ }
+ }
+ close(fd);
+ ++allocatedFiles;
+ fdleft = MAXFILES - allocatedFiles;
+ if (fdleft < 6) {
+ elog(DEBUG,"warning: few usable file descriptors left (%d)", fdleft);
+ }
+
+ DO_DB(printf("DEBUG: AllocatedFile. FreeFd = %d\n",
+ FreeFd));
+}
+
+/*
+ * XXX What happens if FreeFile() is called without a previous
+ * AllocateFile()?
+ */
+void
+FreeFile()
+{
+ DO_DB(printf("DEBUG: FreeFile. FreeFd now %d\n",
+ FreeFd));
+ FreeFd++;
+ nfile++; /* dangerous */
+ Assert(allocatedFiles > 0);
+ --allocatedFiles;
+}
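A hedged sketch of the intended pairing, for code (such as the sort routines) that needs a real descriptor outside the vfd cache; the scratch file name below is hypothetical:

    FILE *fp;

    AllocateFile();                        /* make room: may close LRU vfds */
    fp = fopen("/tmp/pg_psort_scratch", "w+");
    if (fp == (FILE *) NULL)
        elog(WARN, "could not open scratch file");
    /* use fp with ordinary stdio calls */
    fclose(fp);
    FreeFile();                            /* give the descriptor back */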
+
+void
+closeAllVfds()
+{
+ int i;
+ for (i=0; i<SizeVfdCache; i++) {
+ if (!FileIsNotOpen(i))
+ LruDelete(i);
+ }
+}
+
+void
+closeOneVfd()
+{
+ int tmpfd;
+
+ tmpfd = open(Nulldev, O_CREAT | O_RDWR, 0666);
+ if (tmpfd < 0) {
+ FreeFd = 0;
+ AssertLruRoom();
+ FreeFd = 0;
+ }
+ else
+ close(tmpfd);
+}
diff --git a/src/backend/storage/ipc.h b/src/backend/storage/ipc.h
new file mode 100644
index 00000000000..0da041bc9c8
--- /dev/null
+++ b/src/backend/storage/ipc.h
@@ -0,0 +1,285 @@
+/*-------------------------------------------------------------------------
+ *
+ * ipc.h--
+ * POSTGRES inter-process communication definitions.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: ipc.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
+ *
+ * NOTES
+ * This file is very architecture-specific. This stuff should actually
+ * be factored into the port/ directories.
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef IPC_H
+#define IPC_H
+
+#include <sys/types.h>
+#ifndef _IPC_
+#define _IPC_
+#include <sys/ipc.h>
+#endif
+
+#include "c.h"
+
+/*
+ * Many architectures have support for user-level spinlocks (i.e., an
+ * atomic test-and-set instruction). However, we have only written
+ * spinlock code for the architectures listed.
+ */
+#if defined(PORTNAME_aix) || \
+ defined(PORTNAME_alpha) || \
+ defined(PORTNAME_hpux) || \
+ defined(PORTNAME_irix5) || \
+ defined(PORTNAME_next) || \
+ defined(PORTNAME_sparc) || \
+ defined(PORTNAME_sparc_solaris) || \
+ (defined(__i386__) && defined(__GNUC__))
+#define HAS_TEST_AND_SET
+#endif
+
+#if defined(HAS_TEST_AND_SET)
+
+#if defined(PORTNAME_next)
+/*
+ * Use Mach mutex routines since these are, in effect, test-and-set
+ * spinlocks.
+ */
+#undef NEVER /* definition in cthreads.h conflicts with parse.h */
+#include <mach/cthreads.h>
+
+typedef struct mutex slock_t;
+#else /* next */
+#if defined(PORTNAME_aix)
+/*
+ * The AIX C library has the cs(3) builtin for compare-and-set that
+ * operates on ints.
+ */
+typedef unsigned int slock_t;
+#else /* aix */
+#if defined(PORTNAME_alpha)
+#include <sys/mman.h>
+typedef msemaphore slock_t;
+#else /* alpha */
+#if defined(PORTNAME_hpux)
+/*
+ * The PA-RISC "semaphore" for the LDCWX instruction is 4 bytes, aligned
+ * to a 16-byte boundary.
+ */
+typedef struct { int sem[4]; } slock_t;
+#else /* hpux */
+#if defined(PORTNAME_irix5)
+#include <abi_mutex.h>
+typedef abilock_t slock_t;
+#else /* irix5 */
+/*
+ * On all other architectures spinlocks are a single byte.
+ */
+typedef unsigned char slock_t;
+#endif /* irix5 */
+#endif /* hpux */
+#endif /* alpha */
+#endif /* aix */
+#endif /* next */
+
+extern void S_LOCK(slock_t *lock);
+extern void S_UNLOCK(slock_t *lock);
+extern void S_INIT_LOCK(slock_t *lock);
+
+#if defined(PORTNAME_hpux) || defined(PORTNAME_alpha) || defined(PORTNAME_irix5) || defined(PORTNAME_next)
+extern int S_LOCK_FREE(slock_t *lock);
+#else /* PORTNAME_hpux */
+#define S_LOCK_FREE(lock) ((*lock) == 0)
+#endif /* PORTNAME_hpux */
+
+#endif /* HAS_TEST_AND_SET */
+
+/*
+ * On architectures for which we have not implemented spinlocks (or
+ * cannot do so), we use System V semaphores. We also use them for
+ * long locks. For some reason union semun is never defined in the
+ * System V header files so we must do it ourselves.
+ */
+#if defined(sequent) || \
+ defined(PORTNAME_aix) || \
+ defined(PORTNAME_alpha) || \
+ defined(PORTNAME_hpux) || \
+ defined(PORTNAME_sparc_solaris) || \
+ defined(WIN32) || \
+ defined(PORTNAME_ultrix4)
+union semun {
+ int val;
+ struct semid_ds *buf;
+ unsigned short *array;
+};
+#endif
+
+typedef uint16 SystemPortAddress;
+
+/* semaphore definitions */
+
+#define IPCProtection (0600) /* access/modify by user only */
+
+#define IPC_NMAXSEM 25 /* maximum number of semaphores */
+#define IpcSemaphoreDefaultStartValue 255
+#define IpcSharedLock (-1)
+#define IpcExclusiveLock (-255)
+
+#define IpcUnknownStatus (-1)
+#define IpcInvalidArgument (-2)
+#define IpcSemIdExist (-3)
+#define IpcSemIdNotExist (-4)
+
+typedef uint32 IpcSemaphoreKey; /* semaphore key */
+typedef int IpcSemaphoreId;
+
+/* shared memory definitions */
+
+#define IpcMemCreationFailed (-1)
+#define IpcMemIdGetFailed (-2)
+#define IpcMemAttachFailed 0
+
+typedef uint32 IPCKey;
+#define PrivateIPCKey IPC_PRIVATE
+#define DefaultIPCKey 17317
+
+typedef uint32 IpcMemoryKey; /* shared memory key */
+typedef int IpcMemoryId;
+
+
+/* ipc.c */
+extern void exitpg(int code);
+extern void quasi_exitpg(void);
+extern int on_exitpg(void (*function)(), caddr_t arg);
+
+extern IpcSemaphoreId IpcSemaphoreCreate(IpcSemaphoreKey semKey,
+ int semNum, int permission, int semStartValue,
+ int removeOnExit, int *status);
+extern void IpcSemaphoreSet(int semId, int semno, int value);
+extern void IpcSemaphoreKill(IpcSemaphoreKey key);
+extern void IpcSemaphoreLock(IpcSemaphoreId semId, int sem, int lock);
+extern void IpcSemaphoreUnlock(IpcSemaphoreId semId, int sem, int lock);
+extern int IpcSemaphoreGetCount(IpcSemaphoreId semId, int sem);
+extern int IpcSemaphoreGetValue(IpcSemaphoreId semId, int sem);
+extern IpcMemoryId IpcMemoryCreate(IpcMemoryKey memKey, uint32 size,
+ int permission);
+extern IpcMemoryId IpcMemoryIdGet(IpcMemoryKey memKey, uint32 size);
+extern void IpcMemoryDetach(int status, char *shmaddr);
+extern char *IpcMemoryAttach(IpcMemoryId memId);
+extern void IpcMemoryKill(IpcMemoryKey memKey);
+extern void CreateAndInitSLockMemory(IPCKey key);
+extern void AttachSLockMemory(IPCKey key);
+
+
+#ifdef HAS_TEST_AND_SET
+
+#define NSLOCKS 2048
+#define NOLOCK 0
+#define SHAREDLOCK 1
+#define EXCLUSIVELOCK 2
+
+typedef enum _LockId_ {
+ BUFMGRLOCKID,
+ LOCKLOCKID,
+ OIDGENLOCKID,
+ SHMEMLOCKID,
+ BINDINGLOCKID,
+ LOCKMGRLOCKID,
+ SINVALLOCKID,
+
+#ifdef MAIN_MEMORY
+ MMCACHELOCKID,
+#endif /* MAIN_MEMORY */
+
+ PROCSTRUCTLOCKID,
+ FIRSTFREELOCKID
+} _LockId_;
+
+#define MAX_SPINS FIRSTFREELOCKID
+
+typedef struct slock {
+ slock_t locklock;
+ unsigned char flag;
+ short nshlocks;
+ slock_t shlock;
+ slock_t exlock;
+ slock_t comlock;
+ struct slock *next;
+} SLock;
+
+extern void ExclusiveLock(int lockid);
+extern void ExclusiveUnlock(int lockid);
+extern bool LockIsFree(int lockid);
+#else /* HAS_TEST_AND_SET */
+
+typedef enum _LockId_ {
+ SHMEMLOCKID,
+ BINDINGLOCKID,
+ BUFMGRLOCKID,
+ LOCKMGRLOCKID,
+ SINVALLOCKID,
+
+#ifdef MAIN_MEMORY
+ MMCACHELOCKID,
+#endif /* MAIN_MEMORY */
+
+ PROCSTRUCTLOCKID,
+ OIDGENLOCKID,
+ FIRSTFREELOCKID
+} _LockId_;
+
+#define MAX_SPINS FIRSTFREELOCKID
+
+#endif /* HAS_TEST_AND_SET */
+
+/*
+ * the following are originally in ipci.h but the prototypes have circular
+ * dependencies and most files include both ipci.h and ipc.h anyway, hence
+ * combined.
+ *
+ */
+
+/*
+ * Note:
+ * These must not hash to DefaultIPCKey or PrivateIPCKey.
+ */
+#define SystemPortAddressGetIPCKey(address) \
+ (28597 * (address) + 17491)
+
+/*
+ * These keys were originally numbered from 1 to 12 consecutively, but not
+ * all are used; the unused ones have been removed. - ay 4/95.
+ */
+#define IPCKeyGetBufferMemoryKey(key) \
+ ((key == PrivateIPCKey) ? key : 1 + (key))
+
+#define IPCKeyGetSIBufferMemoryBlock(key) \
+ ((key == PrivateIPCKey) ? key : 7 + (key))
+
+#define IPCKeyGetSLockSharedMemoryKey(key) \
+ ((key == PrivateIPCKey) ? key : 10 + (key))
+
+#define IPCKeyGetSpinLockSemaphoreKey(key) \
+ ((key == PrivateIPCKey) ? key : 11 + (key))
+#define IPCKeyGetWaitIOSemaphoreKey(key) \
+ ((key == PrivateIPCKey) ? key : 12 + (key))
+
+/* --------------------------
+ * NOTE: This macro must always give the highest numbered key as every backend
+ * process forked off by the postmaster will be trying to acquire a semaphore
+ * with a unique key value starting at key+14 and incrementing up. Each
+ * backend uses the current key value then increments it by one.
+ * --------------------------
+ */
+#define IPCGetProcessSemaphoreInitKey(key) \
+ ((key == PrivateIPCKey) ? key : 14 + (key))
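To make the offsets concrete, the following sketch shows what the macros yield for the default key; the numbers follow directly from the definitions above, and the Assert calls are purely illustrative:

    IPCKey key = DefaultIPCKey;                            /* 17317 */

    Assert(IPCKeyGetBufferMemoryKey(key)      == 17318);   /* key + 1  */
    Assert(IPCKeyGetSLockSharedMemoryKey(key) == 17327);   /* key + 10 */
    Assert(IPCKeyGetSpinLockSemaphoreKey(key) == 17328);   /* key + 11 */
    Assert(IPCGetProcessSemaphoreInitKey(key) == 17331);   /* key + 14 */
    /* with key == PrivateIPCKey, every macro returns PrivateIPCKey itself */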
+
+/* ipci.c */
+extern IPCKey SystemPortAddressCreateIPCKey(SystemPortAddress address);
+extern void CreateSharedMemoryAndSemaphores(IPCKey key);
+extern void AttachSharedMemoryAndSemaphores(IPCKey key);
+
+#endif /* IPC_H */
diff --git a/src/backend/storage/ipc/Makefile.inc b/src/backend/storage/ipc/Makefile.inc
new file mode 100644
index 00000000000..b426dba0ff0
--- /dev/null
+++ b/src/backend/storage/ipc/Makefile.inc
@@ -0,0 +1,15 @@
+#-------------------------------------------------------------------------
+#
+# Makefile.inc--
+# Makefile for storage/ipc
+#
+# Copyright (c) 1994, Regents of the University of California
+#
+#
+# IDENTIFICATION
+# $Header: /cvsroot/pgsql/src/backend/storage/ipc/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
+#
+#-------------------------------------------------------------------------
+
+SUBSRCS+= ipc.c ipci.c s_lock.c shmem.c shmqueue.c sinval.c \
+ sinvaladt.c spin.c
diff --git a/src/backend/storage/ipc/README b/src/backend/storage/ipc/README
new file mode 100644
index 00000000000..02d66045f82
--- /dev/null
+++ b/src/backend/storage/ipc/README
@@ -0,0 +1,31 @@
+$Header: /cvsroot/pgsql/src/backend/storage/ipc/README,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
+Mon Jul 18 11:09:22 PDT 1988 W.KLAS
+
+Cache invalidation synchronization routines:
+===========================================
+
+The cache synchronization is done using a message queue. Every
+backend can register a message which then has to be read by
+all backends. A message read by all backends is removed from the
+queue automatically. If a message has been lost because the buffer
+was full, all backends that haven't read this message will be
+notified that they have to reset their cache state. This happens
+the next time they try to read the message queue.
+
+The message queue is implemented as a shared buffer segment. Actually,
+the queue is a circular buffer, which allows fast insertion, reading
+(invalidating data), and maintenance of the buffer.
+
+Access to this shared message buffer is synchronized by the lock manager.
+The lock manager treats the buffer as a regular relation and sets
+relation level locks (with mode = LockWait) to block backends while
+another backend is writing or reading the buffer. The identifiers used
+for this special 'relation' are database id = 0 and relation id = 0.
+
+The current implementation prints regular (e)log information
+when a message has been removed from the buffer because the buffer
+is full, and a backend has to reset its cache state. The elog level
+is NOTICE. This can be used to improve the behavior of backends
+when invalidating or resetting their cache state.
+
+
diff --git a/src/backend/storage/ipc/ipc.c b/src/backend/storage/ipc/ipc.c
new file mode 100644
index 00000000000..306300b90c3
--- /dev/null
+++ b/src/backend/storage/ipc/ipc.c
@@ -0,0 +1,718 @@
+/*-------------------------------------------------------------------------
+ *
+ * ipc.c--
+ * POSTGRES inter-process communication definitions.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/storage/ipc/ipc.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
+ *
+ * NOTES
+ *
+ * Currently, semaphores are used (my understanding anyway) in two
+ * different ways:
+ * 1. as mutexes on machines that don't have test-and-set (eg.
+ * mips R3000).
+ * 2. for putting processes to sleep when waiting on a lock
+ * and waking them up when the lock is free.
+ * The number of semaphores in (1) is fixed and those are shared
+ * among all backends. In (2), there is 1 semaphore per process and those
+ * are not shared with anyone else.
+ * -ay 4/95
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <sys/types.h>
+#include <sys/file.h>
+#include <stdio.h>
+#include <errno.h>
+
+/* XXX - the following dependency should be moved into the defaults.mk file */
+#ifndef _IPC_
+#define _IPC_
+#include <sys/ipc.h>
+#include <sys/sem.h>
+#include <sys/shm.h>
+#endif
+
+#include "storage/ipc.h"
+#include "utils/memutils.h"
+#include "utils/elog.h"
+
+#if defined(PORTNAME_bsd44)
+int UsePrivateMemory = 1;
+#else
+int UsePrivateMemory = 0;
+#endif
+
+#if defined(PORTNAME_bsdi)
+/* hacka, hacka, hacka (XXX) */
+union semun {
+ int val; /* value for SETVAL */
+ struct semid_ds *buf; /* buffer for IPC_STAT & IPC_SET */
+ ushort *array; /* array for GETALL & SETALL */
+};
+#endif
+
+
+/* ----------------------------------------------------------------
+ * exit() handling stuff
+ * ----------------------------------------------------------------
+ */
+
+#define MAX_ON_EXITS 20
+
+static struct ONEXIT {
+ void (*function)();
+ caddr_t arg;
+} onexit_list[ MAX_ON_EXITS ];
+
+static int onexit_index;
+
+typedef struct _PrivateMemStruct {
+ int id;
+ char *memptr;
+} PrivateMem;
+
+PrivateMem IpcPrivateMem[16];
+
+static int
+PrivateMemoryCreate(IpcMemoryKey memKey,
+ uint32 size)
+{
+ static int memid = 0;
+
+ UsePrivateMemory = 1;
+
+ IpcPrivateMem[memid].id = memid;
+ IpcPrivateMem[memid].memptr = malloc(size);
+ if (IpcPrivateMem[memid].memptr == NULL)
+ elog(WARN, "PrivateMemoryCreate: not enough memory to malloc");
+ memset(IpcPrivateMem[memid].memptr, 0, size); /* XXX PURIFY */
+
+ return (memid++);
+}
+
+static char *
+PrivateMemoryAttach(IpcMemoryId memid)
+{
+ return ( IpcPrivateMem[memid].memptr );
+}
+
+
+/* ----------------------------------------------------------------
+ * exitpg
+ *
+ * this function calls all the callbacks registered
+ * for it (to free resources) and then calls exit.
+ * This should be the only function to call exit().
+ * -cim 2/6/90
+ * ----------------------------------------------------------------
+ */
+static int exitpg_inprogress = 0;
+
+void
+exitpg(int code)
+{
+ int i;
+
+ /* ----------------
+ * if exitpg_inprogress is true, then it means that we
+ * are being invoked from within an on_exit() handler
+ * and so we return immediately to avoid recursion.
+ * ----------------
+ */
+ if (exitpg_inprogress)
+ return;
+
+ exitpg_inprogress = 1;
+
+ /* ----------------
+ * call all the callbacks registered before calling exit().
+ * ----------------
+ */
+ for (i = onexit_index - 1; i >= 0; --i)
+ (*onexit_list[i].function)(code, onexit_list[i].arg);
+
+ exit(code);
+}
+
+/* ------------------
+ * Run all of the on_exitpg routines but don't exit in the end.
+ * This is used by the postmaster to re-initialize shared memory and
+ * semaphores after a backend dies horribly
+ * ------------------
+ */
+void
+quasi_exitpg()
+{
+ int i;
+
+ /* ----------------
+ * if exitpg_inprogress is true, then it means that we
+ * are being invoked from within an on_exit() handler
+ * and so we return immediately to avoid recursion.
+ * ----------------
+ */
+ if (exitpg_inprogress)
+ return;
+
+ exitpg_inprogress = 1;
+
+ /* ----------------
+ * call all the callbacks registered before calling exit().
+ * ----------------
+ */
+ for (i = onexit_index - 1; i >= 0; --i)
+ (*onexit_list[i].function)(0, onexit_list[i].arg);
+
+ onexit_index = 0;
+ exitpg_inprogress = 0;
+}
+
+/* ----------------------------------------------------------------
+ * on_exitpg
+ *
+ * this function adds a callback function to the list of
+ * functions invoked by exitpg(). -cim 2/6/90
+ * ----------------------------------------------------------------
+ */
+int
+on_exitpg(void (*function)(), caddr_t arg)
+{
+ if (onexit_index >= MAX_ON_EXITS)
+ return(-1);
+
+ onexit_list[ onexit_index ].function = function;
+ onexit_list[ onexit_index ].arg = arg;
+
+ ++onexit_index;
+
+ return(0);
+}
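A hedged usage sketch: callbacks registered with on_exitpg() are run in reverse order of registration by exitpg() (or quasi_exitpg()), each receiving the exit code and the registered argument. The handler and file name below are hypothetical:

    static void
    cleanup_scratch(int code, caddr_t arg)
    {
        unlink((char *) arg);              /* remove a hypothetical temp file */
    }

    /* somewhere during backend startup: */
    if (on_exitpg(cleanup_scratch, (caddr_t) "/tmp/pg_scratch") < 0)
        elog(NOTICE, "on_exitpg: too many exit callbacks registered");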
+
+/****************************************************************************/
+/* IPCPrivateSemaphoreKill(status, semId) */
+/* */
+/****************************************************************************/
+static void
+IPCPrivateSemaphoreKill(int status,
+ int semId) /* caddr_t */
+{
+ union semun semun;
+ semctl(semId, 0, IPC_RMID, semun);
+}
+
+
+/****************************************************************************/
+/* IPCPrivateMemoryKill(status, shmId) */
+/* */
+/****************************************************************************/
+static void
+IPCPrivateMemoryKill(int status,
+ int shmId) /* caddr_t */
+{
+ if ( UsePrivateMemory ) {
+ /* free ( IpcPrivateMem[shmId].memptr ); */
+ } else {
+ if (shmctl(shmId, IPC_RMID, (struct shmid_ds *) NULL) < 0) {
+ elog(NOTICE, "IPCPrivateMemoryKill: shmctl(%d, %d, 0) failed: %m",
+ shmId, IPC_RMID);
+ }
+ }
+}
+
+
+/****************************************************************************/
+/* IpcSemaphoreCreate(semKey, semNum, permission, semStartValue) */
+/* */
+/* - returns a semaphore identifier: */
+/* */
+/* if key doesn't exist: return a new id, status:= IpcSemIdNotExist */
+/* if key exists: return the old id, status:= IpcSemIdExist */
+/* if semNum > MAX : return # of argument, status:=IpcInvalidArgument */
+/* */
+/****************************************************************************/
+
+/*
+ * Note:
+ * XXX This should be split into two different calls. One should
+ * XXX be used to create a semaphore set. The other to "attach" an
+ * XXX existing set. It should be an error for the semaphore set
+ * XXX to already exist or for it not to, respectively.
+ *
+ * Currently, the semaphore sets are "attached" and an error
+ * is detected only when a later shared memory attach fails.
+ */
+
+IpcSemaphoreId
+IpcSemaphoreCreate(IpcSemaphoreKey semKey,
+ int semNum,
+ int permission,
+ int semStartValue,
+ int removeOnExit,
+ int *status)
+{
+ int i;
+ int errStatus;
+ int semId;
+ u_short array[IPC_NMAXSEM];
+ union semun semun;
+
+ /* get a semaphore if non-existent */
+ /* check arguments */
+ if (semNum > IPC_NMAXSEM || semNum <= 0) {
+ *status = IpcInvalidArgument;
+ return(2); /* returns the number of the invalid argument */
+ }
+
+ semId = semget(semKey, 0, 0);
+
+ if (semId == -1) {
+ *status = IpcSemIdNotExist; /* there doesn't exist a semaphore */
+#ifdef DEBUG_IPC
+ fprintf(stderr,"calling semget with %d, %d , %d\n",
+ semKey,
+ semNum,
+ IPC_CREAT|permission );
+#endif
+ semId = semget(semKey, semNum, IPC_CREAT|permission);
+
+ if (semId < 0) {
+ perror("semget");
+ exitpg(3);
+ }
+ for (i = 0; i < semNum; i++) {
+ array[i] = semStartValue;
+ }
+ semun.array = array;
+ errStatus = semctl(semId, 0, SETALL, semun);
+ if (errStatus == -1) {
+ perror("semctl");
+ }
+
+ if (removeOnExit)
+ on_exitpg(IPCPrivateSemaphoreKill, (caddr_t)semId);
+
+ } else {
+ /* there is a semaphore id for this key */
+ *status = IpcSemIdExist;
+ }
+
+#ifdef DEBUG_IPC
+ fprintf(stderr,"\nIpcSemaphoreCreate, status %d, returns %d\n",
+ *status,
+ semId );
+ fflush(stdout);
+ fflush(stderr);
+#endif
+ return(semId);
+}
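A hedged sketch of a typical call, creating (or attaching to) a one-semaphore set and checking the status output; `key` stands for whatever IPCKey the caller already has:

    int status;
    IpcSemaphoreId semId;

    semId = IpcSemaphoreCreate(IPCKeyGetSpinLockSemaphoreKey(key),
                               1,                              /* one semaphore */
                               IPCProtection,                  /* 0600 */
                               IpcSemaphoreDefaultStartValue,  /* 255 */
                               1,                              /* remove on exit */
                               &status);
    if (status == IpcSemIdExist)
        elog(NOTICE, "semaphore set already existed; attached to it");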
+
+
+/****************************************************************************/
+/* IpcSemaphoreSet() - sets the initial value of the semaphore */
+/* */
+/* note: the xxx_return variables are only used for debugging. */
+/****************************************************************************/
+static int IpcSemaphoreSet_return;
+
+void
+IpcSemaphoreSet(int semId, int semno, int value)
+{
+ int errStatus;
+ union semun semun;
+
+ semun.val = value;
+ errStatus = semctl(semId, semno, SETVAL, semun);
+ IpcSemaphoreSet_return = errStatus;
+
+ if (errStatus == -1)
+ perror("semctl");
+}
+
+/****************************************************************************/
+/* IpcSemaphoreKill(key) - removes a semaphore */
+/* */
+/****************************************************************************/
+void
+IpcSemaphoreKill(IpcSemaphoreKey key)
+{
+ int semId;
+ union semun semun;
+
+ /* kill semaphore if existent */
+
+ semId = semget(key, 0, 0);
+ if (semId != -1)
+ semctl(semId, 0, IPC_RMID, semun);
+}
+
+/****************************************************************************/
+/* IpcSemaphoreLock(semId, sem, lock) - locks a semaphore */
+/* */
+/* note: the xxx_return variables are only used for debugging. */
+/****************************************************************************/
+static int IpcSemaphoreLock_return;
+
+void
+IpcSemaphoreLock(IpcSemaphoreId semId, int sem, int lock)
+{
+ extern int errno;
+ int errStatus;
+ struct sembuf sops;
+
+ sops.sem_op = lock;
+ sops.sem_flg = 0;
+ sops.sem_num = sem;
+
+ /* ----------------
+ * Note: if errStatus is -1 and errno == EINTR then it means we
+ * returned from the operation prematurely because we were
+ * sent a signal. So we try and lock the semaphore again.
+ * I am not certain this is correct, but the semantics aren't
+ * clear; it does fix problems with parallel abort synchronization,
+ * namely that after processing an abort signal, the semaphore
+ * call returns with -1 (and errno == EINTR) before it should.
+ * -cim 3/28/90
+ * ----------------
+ */
+ do {
+ errStatus = semop(semId, &sops, 1);
+ } while (errStatus == -1 && errno == EINTR);
+
+ IpcSemaphoreLock_return = errStatus;
+
+ if (errStatus == -1) {
+ perror("semop");
+ exitpg(255);
+ }
+}
+
+/****************************************************************************/
+/* IpcSemaphoreUnlock(semId, sem, lock) - unlocks a semaphore */
+/* */
+/* note: the xxx_return variables are only used for debugging. */
+/****************************************************************************/
+static int IpcSemaphoreUnlock_return;
+
+void
+IpcSemaphoreUnlock(IpcSemaphoreId semId, int sem, int lock)
+{
+ extern int errno;
+ int errStatus;
+ struct sembuf sops;
+
+ sops.sem_op = -lock;
+ sops.sem_flg = 0;
+ sops.sem_num = sem;
+
+
+ /* ----------------
+ * Note: if errStatus is -1 and errno == EINTR then it means we
+ * returned from the operation prematurely because we were
+ * sent a signal. So we try and lock the semaphore again.
+ * I am not certain this is correct, but the semantics aren't
+ * clear; it does fix problems with parallel abort synchronization,
+ * namely that after processing an abort signal, the semaphore
+ * call returns with -1 (and errno == EINTR) before it should.
+ * -cim 3/28/90
+ * ----------------
+ */
+ do {
+ errStatus = semop(semId, &sops, 1);
+ } while (errStatus == -1 && errno == EINTR);
+
+ IpcSemaphoreUnlock_return = errStatus;
+
+ if (errStatus == -1) {
+ perror("semop");
+ exitpg(255);
+ }
+}
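The lock argument is the (negative) amount by which the semaphore is decremented, so IpcExclusiveLock (-255) consumes the whole default start value of 255 while IpcSharedLock (-1) admits up to 255 concurrent holders. A minimal sketch, continuing the semId from the example above:

    IpcSemaphoreLock(semId, 0, IpcExclusiveLock);
    /* critical section: no shared or exclusive holders can get in here */
    IpcSemaphoreUnlock(semId, 0, IpcExclusiveLock);

    IpcSemaphoreLock(semId, 0, IpcSharedLock);
    /* shared section: other IpcSharedLock holders may run concurrently */
    IpcSemaphoreUnlock(semId, 0, IpcSharedLock);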
+
+int
+IpcSemaphoreGetCount(IpcSemaphoreId semId, int sem)
+{
+ int semncnt;
+ union semun dummy; /* for Solaris */
+
+ semncnt = semctl(semId, sem, GETNCNT, dummy);
+ return semncnt;
+}
+
+int
+IpcSemaphoreGetValue(IpcSemaphoreId semId, int sem)
+{
+ int semval;
+ union semun dummy; /* for Solaris */
+
+ semval = semctl(semId, sem, GETVAL, dummy);
+ return semval;
+}
+
+/****************************************************************************/
+/* IpcMemoryCreate(memKey) */
+/* */
+/* - returns the memory identifier, if creation succeeds */
+/* returns IpcMemCreationFailed, if failure */
+/****************************************************************************/
+
+IpcMemoryId
+IpcMemoryCreate(IpcMemoryKey memKey, uint32 size, int permission)
+{
+ IpcMemoryId shmid;
+
+ if (memKey == PrivateIPCKey) {
+ /* private */
+ shmid = PrivateMemoryCreate(memKey, size);
+ }else {
+ shmid = shmget(memKey, size, IPC_CREAT|permission);
+ }
+
+ if (shmid < 0) {
+ fprintf(stderr,"IpcMemoryCreate: memKey=%d , size=%d , permission=%d",
+ memKey, size , permission );
+ perror("IpcMemoryCreate: shmget(..., create, ...) failed");
+ return(IpcMemCreationFailed);
+ }
+
+ /* if (memKey == PrivateIPCKey) */
+ on_exitpg(IPCPrivateMemoryKill, (caddr_t)shmid);
+
+ return(shmid);
+}
+
+/****************************************************************************/
+/* IpcMemoryIdGet(memKey, size) returns the shared memory Id */
+/* or IpcMemIdGetFailed */
+/****************************************************************************/
+IpcMemoryId
+IpcMemoryIdGet(IpcMemoryKey memKey, uint32 size)
+{
+ IpcMemoryId shmid;
+
+ shmid = shmget(memKey, size, 0);
+
+ if (shmid < 0) {
+ fprintf(stderr,"IpcMemoryIdGet: memKey=%d , size=%d , permission=%d",
+ memKey, size , 0 );
+ perror("IpcMemoryIdGet: shmget() failed");
+ return(IpcMemIdGetFailed);
+ }
+
+ return(shmid);
+}
+
+/****************************************************************************/
+/* IpcMemoryDetach(status, shmaddr) removes a shared memory segment */
+/* from a backend address space */
+/* (only called by backends running under the postmaster) */
+/****************************************************************************/
+void
+IpcMemoryDetach(int status, char *shmaddr)
+{
+ if (shmdt(shmaddr) < 0) {
+ elog(NOTICE, "IpcMemoryDetach: shmdt(0x%x): %m", shmaddr);
+ }
+}
+
+/****************************************************************************/
+/* IpcMemoryAttach(memId) returns the address of shared memory */
+/* or IpcMemAttachFailed */
+/* */
+/* CALL IT: addr = (struct <MemoryStructure> *) IpcMemoryAttach(memId); */
+/* */
+/****************************************************************************/
+char *
+IpcMemoryAttach(IpcMemoryId memId)
+{
+ char *memAddress;
+
+ if (UsePrivateMemory) {
+ memAddress = (char *) PrivateMemoryAttach(memId);
+ } else {
+ memAddress = (char *) shmat(memId, 0, 0);
+ }
+
+ /* if ( *memAddress == -1) { XXX ??? */
+ if ( memAddress == (char *)-1) {
+ perror("IpcMemoryAttach: shmat() failed");
+ return(IpcMemAttachFailed);
+ }
+
+ if (!UsePrivateMemory)
+ on_exitpg(IpcMemoryDetach, (caddr_t) memAddress);
+
+ return((char *) memAddress);
+}
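A hedged sketch of the create/attach cycle; note that failures are reported through the sentinel values defined in ipc.h rather than -1, and the segment size here is made up for illustration:

    IpcMemoryId shmid;
    char *base;

    shmid = IpcMemoryCreate(IPCKeyGetBufferMemoryKey(key),
                            1024 * 1024,            /* hypothetical size */
                            IPCProtection);
    if (shmid == IpcMemCreationFailed)
        elog(FATAL, "could not create shared memory segment");

    base = IpcMemoryAttach(shmid);
    if (base == IpcMemAttachFailed)                  /* defined as 0 */
        elog(FATAL, "could not attach shared memory segment");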
+
+
+/****************************************************************************/
+/* IpcMemoryKill(memKey) removes a shared memory segment */
+/* (only called by the postmaster and standalone backends) */
+/****************************************************************************/
+void
+IpcMemoryKill(IpcMemoryKey memKey)
+{
+ IpcMemoryId shmid;
+
+ if (!UsePrivateMemory && (shmid = shmget(memKey, 0, 0)) >= 0) {
+ if (shmctl(shmid, IPC_RMID, (struct shmid_ds *) NULL) < 0) {
+ elog(NOTICE, "IpcMemoryKill: shmctl(%d, %d, 0) failed: %m",
+ shmid, IPC_RMID);
+ }
+ }
+}
+
+#ifdef HAS_TEST_AND_SET
+/* ------------------
+ * use hardware locks to replace semaphores on sequent machines,
+ * to avoid the cost of swapping processes and to provide an
+ * unlimited supply of locks.
+ * ------------------
+ */
+static SLock *SLockArray = NULL;
+static SLock **FreeSLockPP;
+static int *UnusedSLockIP;
+static slock_t *SLockMemoryLock;
+static IpcMemoryId SLockMemoryId = -1;
+
+struct ipcdummy { /* to get alignment/size right */
+ SLock *free;
+ int unused;
+ slock_t memlock;
+ SLock slocks[NSLOCKS];
+};
+static int SLockMemorySize = sizeof(struct ipcdummy);
+
+void
+CreateAndInitSLockMemory(IPCKey key)
+{
+ int id;
+ SLock *slckP;
+
+ SLockMemoryId = IpcMemoryCreate(key,
+ SLockMemorySize,
+ 0700);
+ AttachSLockMemory(key);
+ *FreeSLockPP = NULL;
+ *UnusedSLockIP = (int)FIRSTFREELOCKID;
+ for (id=0; id<(int)FIRSTFREELOCKID; id++) {
+ slckP = &(SLockArray[id]);
+ S_INIT_LOCK(&(slckP->locklock));
+ slckP->flag = NOLOCK;
+ slckP->nshlocks = 0;
+ S_INIT_LOCK(&(slckP->shlock));
+ S_INIT_LOCK(&(slckP->exlock));
+ S_INIT_LOCK(&(slckP->comlock));
+ slckP->next = NULL;
+ }
+ return;
+}
+
+void
+AttachSLockMemory(IPCKey key)
+{
+ struct ipcdummy *slockM;
+
+ if (SLockMemoryId == -1)
+ SLockMemoryId = IpcMemoryIdGet(key,SLockMemorySize);
+ if (SLockMemoryId == -1)
+ elog(FATAL, "SLockMemory not in shared memory");
+ slockM = (struct ipcdummy *) IpcMemoryAttach(SLockMemoryId);
+ if (slockM == IpcMemAttachFailed)
+ elog(FATAL, "AttachSLockMemory: could not attach segment");
+ FreeSLockPP = (SLock **) &(slockM->free);
+ UnusedSLockIP = (int *) &(slockM->unused);
+ SLockMemoryLock = (slock_t *) &(slockM->memlock);
+ S_INIT_LOCK(SLockMemoryLock);
+ SLockArray = (SLock *) &(slockM->slocks[0]);
+ return;
+}
+
+
+#ifdef LOCKDEBUG
+#define PRINT_LOCK(LOCK) printf("(locklock = %d, flag = %d, nshlocks = %d, \
+shlock = %d, exlock =%d)\n", LOCK->locklock, \
+ LOCK->flag, LOCK->nshlocks, LOCK->shlock, \
+ LOCK->exlock)
+#endif
+
+void
+ExclusiveLock(int lockid)
+{
+ SLock *slckP;
+ slckP = &(SLockArray[lockid]);
+#ifdef LOCKDEBUG
+ printf("ExclusiveLock(%d)\n", lockid);
+ printf("IN: ");
+ PRINT_LOCK(slckP);
+#endif
+ ex_try_again:
+ S_LOCK(&(slckP->locklock));
+ switch (slckP->flag) {
+ case NOLOCK:
+ slckP->flag = EXCLUSIVELOCK;
+ S_LOCK(&(slckP->exlock));
+ S_LOCK(&(slckP->shlock));
+ S_UNLOCK(&(slckP->locklock));
+#ifdef LOCKDEBUG
+ printf("OUT: ");
+ PRINT_LOCK(slckP);
+#endif
+ return;
+ case SHAREDLOCK:
+ case EXCLUSIVELOCK:
+ S_UNLOCK(&(slckP->locklock));
+ S_LOCK(&(slckP->exlock));
+ S_UNLOCK(&(slckP->exlock));
+ goto ex_try_again;
+ }
+}
+
+void
+ExclusiveUnlock(int lockid)
+{
+ SLock *slckP;
+
+ slckP = &(SLockArray[lockid]);
+#ifdef LOCKDEBUG
+ printf("ExclusiveUnlock(%d)\n", lockid);
+ printf("IN: ");
+ PRINT_LOCK(slckP);
+#endif
+ S_LOCK(&(slckP->locklock));
+ /* -------------
+ * give favor to read processes
+ * -------------
+ */
+ slckP->flag = NOLOCK;
+ if (slckP->nshlocks > 0) {
+ while (slckP->nshlocks > 0) {
+ S_UNLOCK(&(slckP->shlock));
+ S_LOCK(&(slckP->comlock));
+ }
+ S_UNLOCK(&(slckP->shlock));
+ }
+ else {
+ S_UNLOCK(&(slckP->shlock));
+ }
+ S_UNLOCK(&(slckP->exlock));
+ S_UNLOCK(&(slckP->locklock));
+#ifdef LOCKDEBUG
+ printf("OUT: ");
+ PRINT_LOCK(slckP);
+#endif
+ return;
+}
+
+bool
+LockIsFree(int lockid)
+{
+ return(SLockArray[lockid].flag == NOLOCK);
+}
+
+#endif /* HAS_TEST_AND_SET */
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
new file mode 100644
index 00000000000..18d3cccd0ee
--- /dev/null
+++ b/src/backend/storage/ipc/ipci.c
@@ -0,0 +1,149 @@
+/*-------------------------------------------------------------------------
+ *
+ * ipci.c--
+ * POSTGRES inter-process communication initialization code.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/storage/ipc/ipci.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#include "storage/ipc.h"
+#include "storage/multilev.h"
+#include "utils/elog.h"
+#include "storage/sinval.h"
+#include "storage/bufmgr.h"
+#include "storage/proc.h"
+#include "storage/smgr.h"
+#include "storage/lock.h"
+#include "miscadmin.h" /* for DebugLvl */
+
+/*
+ * SystemPortAddressCreateIPCKey --
+ * Returns an IPC key given a port address.
+ */
+IPCKey
+SystemPortAddressCreateIPCKey(SystemPortAddress address)
+{
+ Assert(address < 32768); /* XXX */
+
+ return (SystemPortAddressGetIPCKey(address));
+}
+
+/*
+ * CreateSharedMemoryAndSemaphores --
+ * Creates and initializes shared memory and semaphores.
+ */
+/**************************************************
+
+ CreateSharedMemoryAndSemaphores
+ is called exactly *ONCE* by the postmaster.
+ It is *NEVER* called by the postgres backend
+
+ 0) destroy any existing semaphores for both buffer
+ and lock managers.
+ 1) create the appropriate *SHARED* memory segments
+ for the two resource managers.
+
+ **************************************************/
+
+void
+CreateSharedMemoryAndSemaphores(IPCKey key)
+{
+ int size;
+
+#ifdef HAS_TEST_AND_SET
+ /* ---------------
+ * create shared memory for slocks
+ * --------------
+ */
+ CreateAndInitSLockMemory(IPCKeyGetSLockSharedMemoryKey(key));
+#endif
+ /* ----------------
+ * kill and create the buffer manager buffer pool (and semaphore)
+ * ----------------
+ */
+ CreateSpinlocks(IPCKeyGetSpinLockSemaphoreKey(key));
+ size = BufferShmemSize() + LockShmemSize();
+
+#ifdef MAIN_MEMORY
+ size += MMShmemSize();
+#endif /* MAIN_MEMORY */
+
+ if (DebugLvl > 1) {
+ fprintf(stderr, "binding ShmemCreate(key=%x, size=%d)\n",
+ IPCKeyGetBufferMemoryKey(key), size);
+ }
+ ShmemCreate(IPCKeyGetBufferMemoryKey(key), size);
+ ShmemBindingTabReset();
+ InitShmem(key, size);
+ InitBufferPool(key);
+
+ /* ----------------
+ * do the lock table stuff
+ * ----------------
+ */
+ InitLocks();
+ if (InitMultiLevelLockm() == INVALID_TABLEID)
+ elog(FATAL, "Couldn't create the lock table");
+
+ /* ----------------
+ * do process table stuff
+ * ----------------
+ */
+ InitProcGlobal(key);
+ on_exitpg(ProcFreeAllSemaphores, 0);
+
+ CreateSharedInvalidationState(key);
+}
+
+
+/*
+ * AttachSharedMemoryAndSemaphores --
+ * Attaches existing shared memory and semaphores.
+ */
+void
+AttachSharedMemoryAndSemaphores(IPCKey key)
+{
+ int size;
+
+ /* ----------------
+ * create rather than attach if using private key
+ * ----------------
+ */
+ if (key == PrivateIPCKey) {
+ CreateSharedMemoryAndSemaphores(key);
+ return;
+ }
+
+#ifdef HAS_TEST_AND_SET
+ /* ----------------
+ * attach the slock shared memory
+ * ----------------
+ */
+ AttachSLockMemory(IPCKeyGetSLockSharedMemoryKey(key));
+#endif
+ /* ----------------
+ * attach the buffer manager buffer pool (and semaphore)
+ * ----------------
+ */
+ size = BufferShmemSize() + LockShmemSize();
+ InitShmem(key, size);
+ InitBufferPool(key);
+
+ /* ----------------
+ * initialize lock table stuff
+ * ----------------
+ */
+ InitLocks();
+ if (InitMultiLevelLockm() == INVALID_TABLEID)
+ elog(FATAL, "Couldn't attach to the lock table");
+
+ AttachSharedInvalidationState(key);
+}
diff --git a/src/backend/storage/ipc/s_lock.c b/src/backend/storage/ipc/s_lock.c
new file mode 100644
index 00000000000..3cbe796fc59
--- /dev/null
+++ b/src/backend/storage/ipc/s_lock.c
@@ -0,0 +1,440 @@
+/*-------------------------------------------------------------------------
+ *
+ * s_lock.c--
+ * This file contains the implementation (if any) for spinlocks.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/storage/ipc/Attic/s_lock.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * DESCRIPTION
+ * The following code fragment should be written (in assembly
+ * language) on machines that have a native test-and-set instruction:
+ *
+ * void
+ * S_LOCK(char_address)
+ * char *char_address;
+ * {
+ * while (test_and_set(char_address))
+ * ;
+ * }
+ *
+ * If this is not done, POSTGRES will default to using System V
+ * semaphores (and take a large performance hit -- around 40% of
+ * its time on a DS5000/240 is spent in semop(3)...).
+ *
+ * NOTES
+ * AIX has a test-and-set but the recommended interface is the cs(3)
+ * system call. This provides an 8-instruction (plus system call
+ * overhead) uninterruptible compare-and-set operation. True
+ * spinlocks might be faster but using cs(3) still speeds up the
+ * regression test suite by about 25%. I don't have an assembler
+ * manual for POWER in any case.
+ *
+ */
+#ifdef WIN32
+#include <windows.h>
+#endif /* WIN32 */
+#include "storage/ipc.h"
+
+
+#if defined(HAS_TEST_AND_SET)
+
+#if defined (PORTNAME_next)
+/*
+ * NEXTSTEP (mach)
+ * slock_t is defined as a struct mutex.
+ */
+void
+S_LOCK(slock_t *lock)
+{
+ mutex_lock(lock);
+}
+void
+S_UNLOCK(slock_t *lock)
+{
+ mutex_unlock(lock);
+}
+void
+S_INIT_LOCK(slock_t *lock)
+{
+ mutex_init(lock);
+}
+
+/* S_LOCK_FREE should return 1 if lock is free; 0 if lock is locked */
+int
+S_LOCK_FREE(slock_t *lock)
+{
+ /* For Mach, we have to delve inside the entrails of `struct mutex'. Ick! */
+ return (lock->lock == 0);
+}
+
+#endif /* PORTNAME_next */
+
+
+
+#if defined(PORTNAME_irix5)
+/*
+ * SGI IRIX 5
+ * slock_t is defined as a struct abilock_t, which has a single unsigned long
+ * member.
+ *
+ * This stuff may be supplemented in the future with Masato Kataoka's MIPS-II
+ * assembly from his NECEWS SVR4 port, but we probably ought to retain this
+ * for the R3000 chips out there.
+ */
+void
+S_LOCK(slock_t *lock)
+{
+ /* spin_lock(lock); */
+ while (!acquire_lock(lock))
+ ;
+}
+
+void
+S_UNLOCK(slock_t *lock)
+{
+ (void)release_lock(lock);
+}
+
+void
+S_INIT_LOCK(slock_t *lock)
+{
+ (void)init_lock(lock);
+}
+
+/* S_LOCK_FREE should return 1 if lock is free; 0 if lock is locked */
+int
+S_LOCK_FREE(slock_t *lock)
+{
+ return(stat_lock(lock)==UNLOCKED);
+}
+
+#endif /* PORTNAME_irix5 */
+
+
+/*
+ * OSF/1 (Alpha AXP)
+ *
+ * Note that slock_t on the Alpha AXP is msemaphore instead of char
+ * (see storage/ipc.h).
+ */
+
+#if defined(PORTNAME_alpha)
+
+void
+S_LOCK(slock_t *lock)
+{
+ while (msem_lock(lock, MSEM_IF_NOWAIT) < 0)
+ ;
+}
+
+void
+S_UNLOCK(slock_t *lock)
+{
+ (void) msem_unlock(lock, 0);
+}
+
+void
+S_INIT_LOCK(slock_t *lock)
+{
+ (void) msem_init(lock, MSEM_UNLOCKED);
+}
+
+int
+S_LOCK_FREE(slock_t *lock)
+{
+ return(lock->msem_state ? 0 : 1);
+}
+
+#endif /* PORTNAME_alpha */
+
+/*
+ * Solaris 2
+ */
+
+#if defined(PORTNAME_sparc_solaris)
+
+/* defined in port/.../tas.s */
+extern int tas(slock_t *lock);
+
+void
+S_LOCK(slock_t *lock)
+{
+ while (tas(lock))
+ ;
+}
+
+void
+S_UNLOCK(slock_t *lock)
+{
+ *lock = 0;
+}
+
+void
+S_INIT_LOCK(slock_t *lock)
+{
+ S_UNLOCK(lock);
+}
+
+#endif /* PORTNAME_sparc_solaris */
+
+/*
+ * AIX (POWER)
+ *
+ * Note that slock_t on POWER/POWER2/PowerPC is int instead of char
+ * (see storage/ipc.h).
+ */
+
+#if defined(PORTNAME_aix)
+
+void
+S_LOCK(slock_t *lock)
+{
+ while (cs((int *) lock, 0, 1))
+ ;
+}
+
+void
+S_UNLOCK(slock_t *lock)
+{
+ *lock = 0;
+}
+
+void
+S_INIT_LOCK(slock_t *lock)
+{
+ S_UNLOCK(lock);
+}
+
+#endif /* PORTNAME_aix */
+
+/*
+ * HP-UX (PA-RISC)
+ *
+ * Note that slock_t on PA-RISC is a structure instead of char
+ * (see storage/ipc.h).
+ */
+
+#if defined(PORTNAME_hpux)
+
+/* defined in port/.../tas.s */
+extern int tas(slock_t *lock);
+
+/*
+* a "set" slock_t has a single word cleared. a "clear" slock_t has
+* all words set to non-zero.
+*/
+static slock_t clear_lock = { -1, -1, -1, -1 };
+
+void
+S_LOCK(slock_t *lock)
+{
+ while (tas(lock))
+ ;
+}
+
+void
+S_UNLOCK(slock_t *lock)
+{
+ *lock = clear_lock; /* struct assignment */
+}
+
+void
+S_INIT_LOCK(slock_t *lock)
+{
+ S_UNLOCK(lock);
+}
+
+int
+S_LOCK_FREE(slock_t *lock)
+{
+ register int *lock_word = (int *) (((long) lock + 15) & ~15);
+
+ return(*lock_word != 0);
+}
+
+#endif /* PORTNAME_hpux */
+
+/*
+ * sun3
+ */
+
+#if (defined(sun) && ! defined(sparc))
+
+void
+S_LOCK(slock_t *lock)
+{
+ while (tas(lock));
+}
+
+void
+S_UNLOCK(slock_t *lock)
+{
+ *lock = 0;
+}
+
+void
+S_INIT_LOCK(slock_t *lock)
+{
+ S_UNLOCK(lock);
+}
+
+static int
+tas_dummy()
+{
+ asm("LLA0:");
+ asm(" .data");
+ asm(" .text");
+ asm("|#PROC# 04");
+ asm(" .globl _tas");
+ asm("_tas:");
+ asm("|#PROLOGUE# 1");
+ asm(" movel sp@(0x4),a0");
+ asm(" tas a0@");
+ asm(" beq LLA1");
+ asm(" moveq #-128,d0");
+ asm(" rts");
+ asm("LLA1:");
+ asm(" moveq #0,d0");
+ asm(" rts");
+ asm(" .data");
+}
+
+#endif
+
+/*
+ * SPARC (SunOS 4)
+ */
+
+#if defined(PORTNAME_sparc)
+
+/* if we're using -ansi w/ gcc, use __asm__ instead of asm */
+#if defined(__STRICT_ANSI__)
+#define asm(x) __asm__(x)
+#endif
+
+static int
+tas_dummy()
+{
+ asm(".seg \"data\"");
+ asm(".seg \"text\"");
+ asm(".global _tas");
+ asm("_tas:");
+
+ /*
+ * Sparc atomic test and set (sparc calls it "atomic load-store")
+ */
+
+ asm("ldstub [%r8], %r8");
+
+ /*
+ * Did test and set actually do the set?
+ */
+
+ asm("tst %r8");
+
+ asm("be,a ReturnZero");
+
+ /*
+ * otherwise, just return.
+ */
+
+ asm("clr %r8");
+ asm("mov 0x1, %r8");
+ asm("ReturnZero:");
+ asm("retl");
+ asm("nop");
+}
+
+void
+S_LOCK(unsigned char *addr)
+{
+ while (tas(addr));
+}
+
+
+/*
+ * addr should be as in the above S_LOCK routine
+ */
+void
+S_UNLOCK(unsigned char *addr)
+{
+ *addr = 0;
+}
+
+void
+S_INIT_LOCK(unsigned char *addr)
+{
+ *addr = 0;
+}
+
+#endif /* PORTNAME_sparc */
+
+/*
+ * Linux and friends
+ */
+
+#if defined(PORTNAME_linux) || defined(PORTNAME_BSD44_derived)
+
+int
+tas(slock_t *m)
+{
+ slock_t res;
+ __asm__("xchgb %0,%1":"=q" (res),"=m" (*m):"0" (0x1));
+ return(res);
+}
+
+void
+S_LOCK(slock_t *lock)
+{
+ while (tas(lock))
+ ;
+}
+
+void
+S_UNLOCK(slock_t *lock)
+{
+ *lock = 0;
+}
+
+void
+S_INIT_LOCK(slock_t *lock)
+{
+ S_UNLOCK(lock);
+}
+
+#endif /* PORTNAME_linux || PORTNAME_BSD44_derived */
+
+
+#endif /* HAS_TEST_AND_SET */
+
+
+#ifdef WIN32
+void
+S_LOCK(HANDLE *lock)
+{
+ int x = 0;
+ x = x / x;
+}
+
+void
+S_UNLOCK(HANDLE *lock)
+{
+ int x = 0;
+ x = x / x;
+}
+
+void
+S_INIT_LOCK(HANDLE *lock)
+{
+ int x = 0;
+ x = x / x;
+}
+#endif /*WIN32*/
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
new file mode 100644
index 00000000000..4eba3729ac8
--- /dev/null
+++ b/src/backend/storage/ipc/shmem.c
@@ -0,0 +1,561 @@
+/*-------------------------------------------------------------------------
+ *
+ * shmem.c--
+ * create shared memory and initialize shared memory data structures.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/storage/ipc/shmem.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * POSTGRES processes share one or more regions of shared memory.
+ * The shared memory is created by a postmaster and is "attached to"
+ * by each of the backends. The routines in this file are used for
+ * allocating and binding to shared memory data structures.
+ *
+ * NOTES:
+ * (a) There are three kinds of shared memory data structures
+ * available to POSTGRES: fixed-size structures, queues and hash
+ * tables. Fixed-size structures contain things like global variables
+ * for a module and should never be allocated after the process
+ * initialization phase. Hash tables have a fixed maximum size, but
+ * their actual size can vary dynamically. When entries are added
+ * to the table, more space is allocated. Queues link data structures
+ * that have been allocated either as fixed size structures or as hash
+ * buckets. Each shared data structure has a string name to identify
+ * it (assigned in the module that declares it).
+ *
+ * (b) During initialization, each module looks for its
+ * shared data structures in a hash table called the "Binding Table".
+ * If the data structure is not present, the caller can allocate
+ * a new one and initialize it. If the data structure is present,
+ * the caller "attaches" to the structure by initializing a pointer
+ * in the local address space.
+ * The binding table has two purposes: first, it gives us
+ * a simple model of how the world looks when a backend process
+ * initializes. If something is present in the binding table,
+ * it is initialized. If it is not, it is uninitialized. Second,
+ * the binding table allows us to allocate shared memory on demand
+ * instead of trying to preallocate structures and hard-wire the
+ * sizes and locations in header files. If you are using a lot
+ * of shared memory in a lot of different places (and changing
+ * things during development), this is important.
+ *
+ * (c) memory allocation model: shared memory can never be
+ * freed, once allocated. Each hash table has its own free list,
+ * so hash buckets can be reused when an item is deleted. However,
+ * if one hash table grows very large and then shrinks, its space
+ * cannot be redistributed to other tables. We could build a simple
+ * hash bucket garbage collector if need be. Right now, it seems
+ * unnecessary.
+ *
+ * See InitSem() in sem.c for an example of how to use the
+ * binding table.
+ *
+ */
+#include <stdio.h>
+#include <string.h>
+#include "postgres.h"
+#include "storage/ipc.h"
+#include "storage/shmem.h"
+#include "storage/spin.h"
+#include "utils/hsearch.h"
+#include "utils/elog.h"
+
+/* shared memory global variables */
+
+unsigned long ShmemBase = 0; /* start and end address of
+ * shared memory
+ */
+static unsigned long ShmemEnd = 0;
+static unsigned long ShmemSize = 0; /* current size (and default) */
+
+SPINLOCK ShmemLock; /* lock for shared memory allocation */
+
+SPINLOCK BindingLock; /* lock for binding table access */
+
+static unsigned long *ShmemFreeStart = NULL; /* pointer to the OFFSET of
+ * first free shared memory
+ */
+static unsigned long *ShmemBindingTabOffset = NULL; /* start of the binding
+ * table (for bootstrap)
+ */
+static int ShmemBootstrap = FALSE; /* flag becomes true when shared mem
+ * is created by POSTMASTER
+ */
+
+static HTAB *BindingTable = NULL;
+
+/* ---------------------
+ * ShmemBindingTabReset() - Resets the binding table to NULL....
+ * useful when the postmaster destroys existing shared memory
+ * and creates all new segments after a backend crash.
+ * ----------------------
+ */
+void
+ShmemBindingTabReset()
+{
+ BindingTable = (HTAB *)NULL;
+}
+
+/*
+ * CreateSharedRegion() --
+ *
+ * This routine is called once by the postmaster to
+ * initialize the shared buffer pool. Assume there is
+ * only one postmaster so no synchronization is necessary
+ * until after this routine completes successfully.
+ *
+ * key is a unique identifier for the shmem region.
+ * size is the size of the region.
+ */
+static IpcMemoryId ShmemId;
+
+void
+ShmemCreate(unsigned int key, unsigned int size)
+{
+ if (size)
+ ShmemSize = size;
+ /* create shared mem region */
+ if ((ShmemId=IpcMemoryCreate(key,ShmemSize,IPCProtection))
+ ==IpcMemCreationFailed) {
+ elog(FATAL,"ShmemCreate: cannot create region");
+ exit(1);
+ }
+
+ /* ShmemBootstrap is true if shared memory has been
+ * created, but not yet initialized. Only the
+ * postmaster/creator-of-all-things should have
+ * this flag set.
+ */
+ ShmemBootstrap = TRUE;
+}
+
+/*
+ * InitShmem() -- map region into process address space
+ * and initialize shared data structures.
+ *
+ */
+int
+InitShmem(unsigned int key, unsigned int size)
+{
+ Pointer sharedRegion;
+ unsigned long currFreeSpace;
+
+ HASHCTL info;
+ int hash_flags;
+ BindingEnt * result,item;
+ bool found;
+ IpcMemoryId shmid;
+
+ /* if zero key, use default memory size */
+ if (size)
+ ShmemSize = size;
+
+ /* default key is 0 */
+
+ /* attach to shared memory region (SysV or BSD OS specific) */
+ if (ShmemBootstrap && key == PrivateIPCKey)
+ /* if we are running backend alone */
+ shmid = ShmemId;
+ else
+ shmid = IpcMemoryIdGet(IPCKeyGetBufferMemoryKey(key), ShmemSize);
+ sharedRegion = IpcMemoryAttach(shmid);
+ if (sharedRegion == NULL) {
+ elog(FATAL,"AttachSharedRegion: couldn't attach to shmem\n");
+ return(FALSE);
+ }
+
+ /* get pointers to the dimensions of shared memory */
+ ShmemBase = (unsigned long) sharedRegion;
+ ShmemEnd = (unsigned long) sharedRegion + ShmemSize;
+ currFreeSpace = 0;
+
+ /* First long in shared memory is the count of available space */
+ ShmemFreeStart = (unsigned long *) ShmemBase;
+ /* next is a shmem pointer to the binding table */
+ ShmemBindingTabOffset = ShmemFreeStart + 1;
+
+ currFreeSpace +=
+ sizeof(ShmemFreeStart) + sizeof(ShmemBindingTabOffset);
+
+ /* bootstrap initialize spin locks so we can start to use the
+ * allocator and binding table.
+ */
+ if (! InitSpinLocks(ShmemBootstrap, IPCKeyGetSpinLockSemaphoreKey(key))) {
+ return(FALSE);
+ }
+
+ /* We have just allocated additional space for two spinlocks.
+ * Now setup the global free space count
+ */
+ if (ShmemBootstrap) {
+ *ShmemFreeStart = currFreeSpace;
+ }
+
+ /* if ShmemFreeStart is NULL, then the allocator won't work */
+ Assert(*ShmemFreeStart);
+
+ /* create OR attach to the shared memory binding table */
+ info.keysize = BTABLE_KEYSIZE;
+ info.datasize = BTABLE_DATASIZE;
+ hash_flags = (HASH_ELEM);
+
+ /* This will acquire the binding table lock, but not release it. */
+ BindingTable = ShmemInitHash("BindingTable",
+ BTABLE_SIZE,BTABLE_SIZE,
+ &info,hash_flags);
+
+ if (! BindingTable) {
+ elog(FATAL,"InitShmem: couldn't initialize Binding Table");
+ return(FALSE);
+ }
+
+ /* Now, check the binding table for an entry to the binding
+ * table. If there is an entry there, someone else created
+ * the table. Otherwise, we did and we have to initialize it.
+ */
+ memset(item.key, 0, BTABLE_KEYSIZE);
+ strncpy(item.key,"BindingTable",BTABLE_KEYSIZE);
+
+ result = (BindingEnt *)
+ hash_search(BindingTable,(char *) &item,HASH_ENTER, &found);
+
+
+ if (! result ) {
+ elog(FATAL,"InitShmem: corrupted binding table");
+ return(FALSE);
+ }
+
+ if (! found) {
+ /* bootstrapping shmem: we have to initialize the
+ * binding table now.
+ */
+
+ Assert(ShmemBootstrap);
+ result->location = MAKE_OFFSET(BindingTable->hctl);
+ *ShmemBindingTabOffset = result->location;
+ result->size = BTABLE_SIZE;
+
+ ShmemBootstrap = FALSE;
+
+ } else {
+ Assert(! ShmemBootstrap);
+ }
+ /* now release the lock acquired in ShmemHashInit */
+ SpinRelease (BindingLock);
+
+ Assert (result->location == MAKE_OFFSET(BindingTable->hctl));
+
+ return(TRUE);
+}
+
+/*
+ * ShmemAlloc -- allocate word-aligned byte string from
+ * shared memory
+ *
+ * Assumes ShmemLock and ShmemFreeStart are initialized.
+ * Returns: real pointer to memory or NULL if we are out
+ * of space. Has to return a real pointer in order
+ * to be compatible with malloc().
+ */
+long *
+ShmemAlloc(unsigned long size)
+{
+ unsigned long tmpFree;
+ long *newSpace;
+
+ /*
+ * ensure space is word aligned.
+ *
+ * Word-alignment is not good enough. We have to be more
+ * conservative: doubles need 8-byte alignment. (We probably only need
+ * this on RISC platforms but this is not a big waste of space.)
+ * - ay 12/94
+ */
+ if (size % sizeof(double))
+ size += sizeof(double) - (size % sizeof(double));
+
+ Assert(*ShmemFreeStart);
+
+ SpinAcquire(ShmemLock);
+
+ tmpFree = *ShmemFreeStart + size;
+ if (tmpFree <= ShmemSize) {
+ newSpace = (long *)MAKE_PTR(*ShmemFreeStart);
+ *ShmemFreeStart += size;
+ } else {
+ newSpace = NULL;
+ }
+
+ SpinRelease(ShmemLock);
+
+ if (! newSpace) {
+ elog(NOTICE,"ShmemAlloc: out of memory ");
+ }
+ return(newSpace);
+}
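To make the rounding concrete: a request is bumped up to the next multiple of sizeof(double), so on a machine with 8-byte doubles a 52-byte request consumes 56 bytes of the arena (52 % 8 == 4, hence 52 + (8 - 4) == 56), while a request that is already a multiple of 8 is unchanged. A minimal, hypothetical call:

    long *p;

    p = ShmemAlloc(52);        /* actually advances *ShmemFreeStart by 56 */
    if (p == NULL)
        elog(WARN, "out of shared memory");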
+
+/*
+ * ShmemIsValid -- test if an offset refers to valid shared memory
+ *
+ * Returns TRUE if the pointer is valid.
+ */
+int
+ShmemIsValid(unsigned long addr)
+{
+ return ((addr<ShmemEnd) && (addr>=ShmemBase));
+}
+
+/*
+ * ShmemInitHash -- Create/Attach to and initialize
+ * shared memory hash table.
+ *
+ * Notes:
+ *
+ * assume caller is doing some kind of synchronization
+ * so that two people don't try to create/initialize the
+ * table at once. Use SpinAlloc() to create a spinlock
+ * for the structure before creating the structure itself.
+ */
+HTAB *
+ShmemInitHash(char *name, /* table string name for binding */
+ long init_size, /* initial size */
+ long max_size, /* max size of the table */
+ HASHCTL *infoP, /* info about key and bucket size */
+ int hash_flags) /* info about infoP */
+{
+ bool found;
+ long * location;
+
+ /* shared memory hash tables have a fixed max size so that the
+ * control structures don't try to grow. The segbase is for
+ * calculating pointer values. The shared memory allocator
+ * must be specified.
+ */
+ infoP->segbase = (long *) ShmemBase;
+ infoP->alloc = ShmemAlloc;
+ infoP->max_size = max_size;
+ hash_flags |= HASH_SHARED_MEM;
+
+ /* look it up in the binding table */
+ location =
+ ShmemInitStruct(name,my_log2(max_size) + sizeof(HHDR),&found);
+
+ /* binding table is corrupted. Let someone else give the
+ * error message since they have more information
+ */
+ if (location == NULL) {
+ return(0);
+ }
+
+ /* it already exists, attach to it rather than allocate and
+ * initialize new space
+ */
+ if (found) {
+ hash_flags |= HASH_ATTACH;
+ }
+
+ /* these structures were allocated or bound in ShmemInitStruct */
+ /* control information and parameters */
+ infoP->hctl = (long *) location;
+ /* directory for hash lookup */
+ infoP->dir = (long *) (location + sizeof(HHDR));
+
+ return(hash_create(init_size, infoP, hash_flags));
+}
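+
+/*
+ * Typical calling pattern (the key/entry types and table name are
+ * illustrative; keysize, datasize and HASH_ELEM come from utils/hsearch.h):
+ *
+ *	HASHCTL		info;
+ *	HTAB	   *htab;
+ *
+ *	info.keysize = sizeof(MyKeyType);
+ *	info.datasize = sizeof(MyEntryType) - sizeof(MyKeyType);
+ *	htab = ShmemInitHash("My Shared Table", INIT_SIZE, MAX_SIZE,
+ *			     &info, HASH_ELEM);
+ *	if (!htab)
+ *		elog(FATAL, "could not initialize My Shared Table");
+ */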
+
+/*
+ * ShmemPIDLookup -- lookup process data structure using process id
+ *
+ * Returns: TRUE if no error. locationPtr is initialized if PID is
+ * found in the binding table.
+ *
+ * NOTES:
+ * only information about success or failure is the value of
+ * locationPtr.
+ */
+bool
+ShmemPIDLookup(int pid, SHMEM_OFFSET* locationPtr)
+{
+ BindingEnt * result,item;
+ bool found;
+
+ Assert (BindingTable);
+ memset(item.key, 0, BTABLE_KEYSIZE);
+ sprintf(item.key,"PID %d",pid);
+
+ SpinAcquire(BindingLock);
+ result = (BindingEnt *)
+ hash_search(BindingTable,(char *) &item, HASH_ENTER, &found);
+
+ if (! result) {
+
+ SpinRelease(BindingLock);
+ elog(WARN,"ShmemInitPID: BindingTable corrupted");
+ return(FALSE);
+
+ }
+
+ if (found) {
+ *locationPtr = result->location;
+ } else {
+ result->location = *locationPtr;
+ }
+
+ SpinRelease(BindingLock);
+ return (TRUE);
+}
+
+/*
+ * ShmemPIDDestroy -- destroy binding table entry for process
+ * using process id
+ *
+ * Returns: offset of the process struct in shared memory or
+ * INVALID_OFFSET if not found.
+ *
+ * Side Effect: removes the entry from the binding table
+ */
+SHMEM_OFFSET
+ShmemPIDDestroy(int pid)
+{
+ BindingEnt * result,item;
+ bool found;
+ SHMEM_OFFSET location;
+
+ Assert(BindingTable);
+
+ memset(item.key, 0, BTABLE_KEYSIZE);
+ sprintf(item.key,"PID %d",pid);
+
+ SpinAcquire(BindingLock);
+ result = (BindingEnt *)
+ hash_search(BindingTable,(char *) &item, HASH_REMOVE, &found);
+
+ if (found)
+ location = result->location;
+ SpinRelease(BindingLock);
+
+ if (! result) {
+
+ elog(WARN,"ShmemPIDDestroy: PID table corrupted");
+ return(INVALID_OFFSET);
+
+ }
+
+ if (found)
+ return (location);
+ else {
+ return(INVALID_OFFSET);
+ }
+}
+
+/*
+ * ShmemInitStruct -- Create/attach to a structure in shared
+ * memory.
+ *
+ * This is called during initialization to find or allocate
+ * a data structure in shared memory. If no other processes
+ * have created the structure, this routine allocates space
+ * for it. If it exists already, a pointer to the existing
+ * table is returned.
+ *
+ * Returns: real pointer to the object. FoundPtr is TRUE if
+ * the object is already in the binding table (hence, already
+ * initialized).
+ */
+long *
+ShmemInitStruct(char *name, unsigned long size, bool *foundPtr)
+{
+ BindingEnt * result,item;
+ long * structPtr;
+
+ strncpy(item.key,name,BTABLE_KEYSIZE);
+ item.location = BAD_LOCATION;
+
+ SpinAcquire(BindingLock);
+
+ if (! BindingTable) {
+ /* Assert() is a macro now. substitutes inside quotes. */
+ char *strname = "BindingTable";
+
+ /* If the binding table doesn't exist, we fake it.
+ *
+ * If we are creating the first binding table, then let
+ * ShmemAlloc() allocate the space for a new HTAB. Otherwise,
+ * find the old one and return that. Notice that the
+ * BindingLock is held until the binding table has been completely
+ * initialized.
+ */
+ Assert (! strcmp(name,strname)) ;
+ if (ShmemBootstrap) {
+ /* in POSTMASTER/Single process */
+
+ *foundPtr = FALSE;
+ return((long *)ShmemAlloc(size));
+
+ } else {
+ Assert (ShmemBindingTabOffset);
+
+ *foundPtr = TRUE;
+ return((long *)MAKE_PTR(*ShmemBindingTabOffset));
+ }
+
+
+ } else {
+ /* look it up in the binding table */
+ result = (BindingEnt *)
+ hash_search(BindingTable,(char *) &item,HASH_ENTER, foundPtr);
+ }
+
+ if (! result) {
+
+ SpinRelease(BindingLock);
+
+ elog(WARN,"ShmemInitStruct: Binding Table corrupted");
+ return(NULL);
+
+ } else if (*foundPtr) {
+ /*
+ * Structure is in the binding table so someone else has allocated
+ * it already. The size better be the same as the size we are
+ * trying to initialize to or there is a name conflict (or worse).
+ */
+ if (result->size != size) {
+ SpinRelease(BindingLock);
+
+ elog(NOTICE,"ShmemInitStruct: BindingTable entry size is wrong");
+ /* let caller print its message too */
+ return(NULL);
+ }
+ structPtr = (long *)MAKE_PTR(result->location);
+ } else {
+
+ /* It isn't in the table yet. allocate and initialize it */
+ structPtr = ShmemAlloc((long)size);
+ if (! structPtr) {
+ /* out of memory */
+ Assert (BindingTable);
+ (void) hash_search(BindingTable,(char *) &item,HASH_REMOVE, foundPtr);
+ SpinRelease(BindingLock);
+ *foundPtr = FALSE;
+
+ elog(NOTICE,"ShmemInitStruct: cannot allocate '%s'",
+ name);
+ return(NULL);
+ }
+ result->size = size;
+ result->location = MAKE_OFFSET(structPtr);
+ }
+ Assert (ShmemIsValid((unsigned long)structPtr));
+
+ SpinRelease(BindingLock);
+ return(structPtr);
+}
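+
+/*
+ * The usual calling pattern for ShmemInitStruct ("MyCtlData" is an
+ * illustrative structure type):
+ *
+ *	bool		found;
+ *	MyCtlData  *myCtl;
+ *
+ *	myCtl = (MyCtlData *) ShmemInitStruct("My Control",
+ *					      sizeof(MyCtlData), &found);
+ *	if (!myCtl)
+ *		elog(FATAL, "could not initialize My Control");
+ *	if (!found)
+ *		... we created it: initialize its fields ...
+ *	else
+ *		... another process already initialized it: just use it ...
+ */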
+
+
+
diff --git a/src/backend/storage/ipc/shmqueue.c b/src/backend/storage/ipc/shmqueue.c
new file mode 100644
index 00000000000..f08546742b5
--- /dev/null
+++ b/src/backend/storage/ipc/shmqueue.c
@@ -0,0 +1,251 @@
+/*-------------------------------------------------------------------------
+ *
+ * shmqueue.c--
+ * shared memory linked lists
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/storage/ipc/shmqueue.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
+ *
+ * NOTES
+ *
+ * Package for managing doubly-linked lists in shared memory.
+ * The only tricky thing is that SHM_QUEUE will usually be a field
+ * in a larger record. SHMQueueGetFirst has to return a pointer
+ * to the record itself instead of a pointer to the SHMQueue field
+ * of the record. It takes an extra pointer and does some extra
+ * pointer arithmetic to do this correctly.
+ *
+ * NOTE: These are set up so they can be turned into macros some day.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <stdio.h> /* for sprintf() */
+#include "postgres.h"
+#include "storage/shmem.h" /* where the declarations go */
+#include "utils/elog.h"
+
+/*#define SHMQUEUE_DEBUG*/
+#ifdef SHMQUEUE_DEBUG
+#define SHMQUEUE_DEBUG_DEL /* deletions */
+#define SHMQUEUE_DEBUG_HD /* head inserts */
+#define SHMQUEUE_DEBUG_TL /* tail inserts */
+#define SHMQUEUE_DEBUG_ELOG NOTICE
+#endif /* SHMQUEUE_DEBUG */
+
+/*
+ * ShmemQueueInit -- make the head of a new queue point
+ * to itself
+ */
+void
+SHMQueueInit(SHM_QUEUE *queue)
+{
+ Assert(SHM_PTR_VALID(queue));
+ (queue)->prev = (queue)->next = MAKE_OFFSET(queue);
+}
+
+/*
+ * SHMQueueIsDetached -- TRUE if element is not currently
+ * in a queue.
+ */
+bool
+SHMQueueIsDetached(SHM_QUEUE *queue)
+{
+ Assert(SHM_PTR_VALID(queue));
+ return ((queue)->prev == INVALID_OFFSET);
+}
+
+/*
+ * SHMQueueElemInit -- clear an element's links
+ */
+void
+SHMQueueElemInit(SHM_QUEUE *queue)
+{
+ Assert(SHM_PTR_VALID(queue));
+ (queue)->prev = (queue)->next = INVALID_OFFSET;
+}
+
+/*
+ * SHMQueueDelete -- remove an element from the queue and
+ * close the links
+ */
+void
+SHMQueueDelete(SHM_QUEUE *queue)
+{
+ SHM_QUEUE *nextElem = (SHM_QUEUE *) MAKE_PTR((queue)->next);
+ SHM_QUEUE *prevElem = (SHM_QUEUE *) MAKE_PTR((queue)->prev);
+
+ Assert(SHM_PTR_VALID(queue));
+ Assert(SHM_PTR_VALID(nextElem));
+ Assert(SHM_PTR_VALID(prevElem));
+
+#ifdef SHMQUEUE_DEBUG_DEL
+ dumpQ(queue, "in SHMQueueDelete: begin");
+#endif /* SHMQUEUE_DEBUG_DEL */
+
+ prevElem->next = (queue)->next;
+ nextElem->prev = (queue)->prev;
+
+#ifdef SHMQUEUE_DEBUG_DEL
+ dumpQ((SHM_QUEUE *)MAKE_PTR(queue->prev), "in SHMQueueDelete: end");
+#endif /* SHMQUEUE_DEBUG_DEL */
+}
+
+#ifdef SHMQUEUE_DEBUG
+void
+dumpQ(SHM_QUEUE *q, char *s)
+{
+ char elem[16];
+ char buf[1024];
+ SHM_QUEUE *start = q;
+ int count = 0;
+
+ sprintf(buf, "q prevs: %x", MAKE_OFFSET(q));
+ q = (SHM_QUEUE *)MAKE_PTR(q->prev);
+ while (q != start)
+ {
+ sprintf(elem, "--->%x", MAKE_OFFSET(q));
+ strcat(buf, elem);
+ q = (SHM_QUEUE *)MAKE_PTR(q->prev);
+ if (q->prev == MAKE_OFFSET(q))
+ break;
+ if (count++ > 40)
+ {
+ strcat(buf, "BAD PREV QUEUE!!");
+ break;
+ }
+ }
+ sprintf(elem, "--->%x", MAKE_OFFSET(q));
+ strcat(buf, elem);
+ elog(SHMQUEUE_DEBUG_ELOG, "%s: %s", s, buf);
+
+ sprintf(buf, "q nexts: %x", MAKE_OFFSET(q));
+ count = 0;
+ q = (SHM_QUEUE *)MAKE_PTR(q->next);
+ while (q != start)
+ {
+ sprintf(elem, "--->%x", MAKE_OFFSET(q));
+ strcat(buf, elem);
+ q = (SHM_QUEUE *)MAKE_PTR(q->next);
+ if (q->next == MAKE_OFFSET(q))
+ break;
+ if (count++ > 10)
+ {
+ strcat(buf, "BAD NEXT QUEUE!!");
+ break;
+ }
+ }
+ sprintf(elem, "--->%x", MAKE_OFFSET(q));
+ strcat(buf, elem);
+ elog(SHMQUEUE_DEBUG_ELOG, "%s: %s", s, buf);
+}
+#endif /* SHMQUEUE_DEBUG */
+
+/*
+ * SHMQueueInsertHD -- put elem in queue between the queue head
+ * and its "prev" element.
+ */
+void
+SHMQueueInsertHD(SHM_QUEUE *queue, SHM_QUEUE *elem)
+{
+ SHM_QUEUE *prevPtr = (SHM_QUEUE *) MAKE_PTR((queue)->prev);
+ SHMEM_OFFSET elemOffset = MAKE_OFFSET(elem);
+
+ Assert(SHM_PTR_VALID(queue));
+ Assert(SHM_PTR_VALID(elem));
+
+#ifdef SHMQUEUE_DEBUG_HD
+ dumpQ(queue, "in SHMQueueInsertHD: begin");
+#endif /* SHMQUEUE_DEBUG_HD */
+
+ (elem)->next = prevPtr->next;
+ (elem)->prev = queue->prev;
+ (queue)->prev = elemOffset;
+ prevPtr->next = elemOffset;
+
+#ifdef SHMQUEUE_DEBUG_HD
+ dumpQ(queue, "in SHMQueueInsertHD: end");
+#endif /* SHMQUEUE_DEBUG_HD */
+}
+
+void
+SHMQueueInsertTL(SHM_QUEUE *queue, SHM_QUEUE *elem)
+{
+ SHM_QUEUE *nextPtr = (SHM_QUEUE *) MAKE_PTR((queue)->next);
+ SHMEM_OFFSET elemOffset = MAKE_OFFSET(elem);
+
+ Assert(SHM_PTR_VALID(queue));
+ Assert(SHM_PTR_VALID(elem));
+
+#ifdef SHMQUEUE_DEBUG_TL
+ dumpQ(queue, "in SHMQueueInsertTL: begin");
+#endif /* SHMQUEUE_DEBUG_TL */
+
+ (elem)->prev = nextPtr->prev;
+ (elem)->next = queue->next;
+ (queue)->next = elemOffset;
+ nextPtr->prev = elemOffset;
+
+#ifdef SHMQUEUE_DEBUG_TL
+ dumpQ(queue, "in SHMQueueInsertTL: end");
+#endif /* SHMQUEUE_DEBUG_TL */
+}
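+
+/*
+ * Typical use of the insertion routines: the SHM_QUEUE is embedded in a
+ * larger shared structure, e.g.
+ *
+ *	typedef struct {
+ *		int		stuff;
+ *		SHM_QUEUE	elem;
+ *	} ELEMType;
+ *
+ *	SHMQueueInit(&queueHead);			once, for the list head
+ *	SHMQueueElemInit(&(item->elem));		for each new element
+ *	SHMQueueInsertTL(&queueHead, &(item->elem));	link it in
+ *
+ * (queueHead and item are illustrative names.)
+ */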
+
+/*
+ * SHMQueueFirst -- Get the first element from a queue
+ *
+ * First element is queue->next. If SHMQueue is part of
+ * a larger structure, we want to return a pointer to the
+ * whole structure rather than a pointer to its SHMQueue field.
+ * I.E. struct {
+ * int stuff;
+ * SHMQueue elem;
+ * } ELEMType;
+ * when this element is in a queue, (queue->next) points to struct.elem.
+ * nextQueue allows us to calculate the offset of the SHMQueue
+ * field in the structure.
+ *
+ * call to SHMQueueFirst should take these parameters:
+ *
+ * &(queueHead),&firstElem,&(firstElem->next)
+ *
+ * Note that firstElem may well be uninitialized. If firstElem
+ * is initially K, &(firstElem->next) will be K + the offset to
+ * next.
+ */
+void
+SHMQueueFirst(SHM_QUEUE *queue, Pointer *nextPtrPtr, SHM_QUEUE *nextQueue)
+{
+ SHM_QUEUE *elemPtr = (SHM_QUEUE *) MAKE_PTR((queue)->next);
+
+ Assert(SHM_PTR_VALID(queue));
+ *nextPtrPtr = (Pointer) (((unsigned long) *nextPtrPtr) +
+ ((unsigned long) elemPtr) - ((unsigned long) nextQueue));
+
+ /*
+ nextPtrPtr is a ptr to a structure linked in the queue
+ nextQueue is the SHMQueue field of that structure
+ *nextPtrPtr - nextQueue is 0 minus the offset of the queue
+ field in the record
+ elemPtr + (*nextPtrPtr - nextQueue) is the start of the
+ structure containing elemPtr.
+ */
+}
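+
+/*
+ * Worked example of the calling convention described above (ELEMType and
+ * item are illustrative, as in the comment):
+ *
+ *	ELEMType   *item;
+ *
+ *	SHMQueueFirst(&queueHead, (Pointer *) &item, &(item->elem));
+ *
+ * On return, item points at the ELEMType record holding the first queue
+ * element; the &(item->elem) argument only supplies the offset of the
+ * SHM_QUEUE field within the record, so item need not point anywhere
+ * valid beforehand.
+ */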
+
+/*
+ * SHMQueueEmpty -- TRUE if queue head is only element, FALSE otherwise
+ */
+bool
+SHMQueueEmpty(SHM_QUEUE *queue)
+{
+ Assert(SHM_PTR_VALID(queue));
+
+ if (queue->prev == MAKE_OFFSET(queue))
+ {
+ Assert(queue->next == MAKE_OFFSET(queue));
+ return(TRUE);
+ }
+ return(FALSE);
+}
diff --git a/src/backend/storage/ipc/sinval.c b/src/backend/storage/ipc/sinval.c
new file mode 100644
index 00000000000..9151ee77686
--- /dev/null
+++ b/src/backend/storage/ipc/sinval.c
@@ -0,0 +1,169 @@
+/*-------------------------------------------------------------------------
+ *
+ * sinval.c--
+ * POSTGRES shared cache invalidation communication code.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/storage/ipc/sinval.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+/* #define INVALIDDEBUG 1 */
+
+#include "postgres.h"
+
+#include "storage/sinval.h"
+#include "storage/sinvaladt.h"
+#include "storage/spin.h"
+#include "utils/elog.h"
+
+extern SISeg *shmInvalBuffer;/* the shared buffer segment, set by*/
+ /* SISegmentAttach() */
+extern BackendId MyBackendId;
+extern BackendTag MyBackendTag;
+
+SPINLOCK SInvalLock = (SPINLOCK) NULL;
+
+/****************************************************************************/
+/* CreateSharedInvalidationState(key) Create a buffer segment */
+/* */
+/* should be called only by the POSTMASTER */
+/****************************************************************************/
+void
+CreateSharedInvalidationState(IPCKey key)
+{
+ int status;
+
+ /* REMOVED
+ SISyncKill(IPCKeyGetSIBufferMemorySemaphoreKey(key));
+ SISyncInit(IPCKeyGetSIBufferMemorySemaphoreKey(key));
+ */
+
+ /* SInvalLock gets set in spin.c, during spinlock init */
+ status = SISegmentInit(true, IPCKeyGetSIBufferMemoryBlock(key));
+
+ if (status == -1) {
+ elog(FATAL, "CreateSharedInvalidationState: failed segment init");
+ }
+}
+/****************************************************************************/
+/* AttachSharedInvalidationState(key) Attach a buffer segment */
+/* */
+/* should be called only by the POSTMASTER */
+/****************************************************************************/
+void
+AttachSharedInvalidationState(IPCKey key)
+{
+ int status;
+
+ if (key == PrivateIPCKey) {
+ CreateSharedInvalidationState(key);
+ return;
+ }
+ /* SInvalLock gets set in spin.c, during spinlock init */
+ status = SISegmentInit(false, IPCKeyGetSIBufferMemoryBlock(key));
+
+ if (status == -1) {
+ elog(FATAL, "AttachSharedInvalidationState: failed segment init");
+ }
+}
+
+void
+InitSharedInvalidationState()
+{
+ SpinAcquire(SInvalLock);
+ if (!SIBackendInit(shmInvalBuffer))
+ {
+ SpinRelease(SInvalLock);
+ elog(FATAL, "Backend cache invalidation initialization failed");
+ }
+ SpinRelease(SInvalLock);
+}
+
+/*
+ * RegisterSharedInvalid --
+ * Returns a new local cache invalidation state containing a new entry.
+ *
+ * Note:
+ * Assumes hash index is valid.
+ * Assumes item pointer is valid.
+ */
+/****************************************************************************/
+/* RegisterSharedInvalid(cacheId, hashIndex, pointer) */
+/* */
+/* register a message in the buffer */
+/* should be called by a backend */
+/****************************************************************************/
+void
+RegisterSharedInvalid(int cacheId, /* XXX */
+ Index hashIndex,
+ ItemPointer pointer)
+{
+ SharedInvalidData newInvalid;
+
+ /*
+ * This code has been hacked to accept two types of messages. This might
+ * be treated more generally in the future.
+ *
+ * (1)
+ * cacheId= system cache id
+ * hashIndex= system cache hash index for a (possibly) cached tuple
+ * pointer= pointer of (possibly) cached tuple
+ *
+ * (2)
+ * cacheId= special non-syscache id
+ * hashIndex= object id contained in (possibly) cached relation descriptor
+ * pointer= null
+ */
+
+ newInvalid.cacheId = cacheId;
+ newInvalid.hashIndex = hashIndex;
+
+ if (ItemPointerIsValid(pointer)) {
+ ItemPointerCopy(pointer, &newInvalid.pointerData);
+ } else {
+ ItemPointerSetInvalid(&newInvalid.pointerData);
+ }
+
+ SpinAcquire(SInvalLock);
+ if (!SISetDataEntry(shmInvalBuffer, &newInvalid)) {
+ /* buffer full */
+ /* release a message, mark process cache states to be invalid */
+ SISetProcStateInvalid(shmInvalBuffer);
+
+ if (!SIDelDataEntry(shmInvalBuffer)) {
+ /* inconsistent buffer state -- should never happen */
+ SpinRelease(SInvalLock);
+ elog(FATAL, "RegisterSharedInvalid: inconsistent buffer state");
+ }
+
+ /* write again */
+ (void) SISetDataEntry(shmInvalBuffer, &newInvalid);
+ }
+ SpinRelease(SInvalLock);
+}
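+
+/*
+ * The two message forms described above correspond to calls like these
+ * (argument names are illustrative):
+ *
+ *	(1) possibly-cached system cache tuple:
+ *		RegisterSharedInvalid(cacheId, hashIndex, &tuple->t_ctid);
+ *
+ *	(2) possibly-cached relation descriptor, identified by OID:
+ *		RegisterSharedInvalid(relCacheId, (Index) relationOid,
+ *				      (ItemPointer) NULL);
+ */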
+
+/*
+ * InvalidateSharedInvalid --
+ * Processes all entries in a shared cache invalidation state.
+ */
+/****************************************************************************/
+/* InvalidateSharedInvalid(invalFunction, resetFunction) */
+/* */
+/* invalidate a message in the buffer (read and clean up) */
+/* should be called by a backend */
+/****************************************************************************/
+void
+InvalidateSharedInvalid(void (*invalFunction)(),
+ void (*resetFunction)())
+{
+ SpinAcquire(SInvalLock);
+ SIReadEntryData(shmInvalBuffer, MyBackendId,
+ invalFunction, resetFunction);
+
+ SIDelExpiredDataEntries(shmInvalBuffer);
+ SpinRelease(SInvalLock);
+}
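+
+/*
+ * The function pointers passed to InvalidateSharedInvalid are expected to
+ * look like this (see SIReadEntryData in sinvaladt.c for how they are
+ * invoked):
+ *
+ *	void invalFunction(int cacheId, Index hashIndex, ItemPointer pointer);
+ *	void resetFunction(void);
+ */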
diff --git a/src/backend/storage/ipc/sinvaladt.c b/src/backend/storage/ipc/sinvaladt.c
new file mode 100644
index 00000000000..a30afdb6fed
--- /dev/null
+++ b/src/backend/storage/ipc/sinvaladt.c
@@ -0,0 +1,797 @@
+/*-------------------------------------------------------------------------
+ *
+ * sinvaladt.c--
+ * POSTGRES shared cache invalidation segment definitions.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/storage/ipc/sinvaladt.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "storage/ipc.h"
+#include "storage/sinvaladt.h"
+#include "storage/lmgr.h"
+#include "utils/elog.h"
+#include "utils/palloc.h"
+
+/* ----------------
+ * global variable notes
+ *
+ * SharedInvalidationSemaphore
+ *
+ * shmInvalBuffer
+ * the shared buffer segment, set by SISegmentAttach()
+ *
+ * MyBackendId
+ * might be removed later, used only for
+ * debugging in debug routines (end of file)
+ *
+ * SIDbId
+ * identification of buffer (disappears)
+ *
+ * SIRelId \
+ * SIDummyOid \ identification of buffer
+ * SIXidData /
+ * SIXid /
+ *
+ * XXX This file really needs to be cleaned up. We switched to using
+ * spinlocks to protect critical sections (as opposed to using fake
+ * relations and going through the lock manager) and some of the old
+ * cruft was 'ifdef'ed out, while other parts (now unused) are still
+ * compiled into the system. -mer 5/24/92
+ * ----------------
+ */
+#ifdef HAS_TEST_AND_SET
+int SharedInvalidationLockId;
+#else
+IpcSemaphoreId SharedInvalidationSemaphore;
+#endif
+
+SISeg *shmInvalBuffer;
+extern BackendId MyBackendId;
+
+static void CleanupInvalidationState(int status, SISeg *segInOutP);
+static BackendId SIAssignBackendId(SISeg *segInOutP, BackendTag backendTag);
+static int SIGetNumEntries(SISeg *segP);
+
+/************************************************************************/
+/* SISetActiveProcess(segP, backendId) set the backend status active */
+/* should be called only by the postmaster when creating a backend */
+/************************************************************************/
+/* XXX I suspect that the segP parameter is extraneous. -hirohama */
+static void
+SISetActiveProcess(SISeg *segInOutP, BackendId backendId)
+{
+ /* mark all messages as read */
+
+ /* Assert(segP->procState[backendId - 1].tag == MyBackendTag); */
+
+ segInOutP->procState[backendId - 1].resetState = false;
+ segInOutP->procState[backendId - 1].limit = SIGetNumEntries(segInOutP);
+}
+
+/****************************************************************************/
+/* SIBackendInit() initializes a backend to operate on the buffer */
+/****************************************************************************/
+int
+SIBackendInit(SISeg *segInOutP)
+{
+ LRelId LtCreateRelId();
+ TransactionId LMITransactionIdCopy();
+
+ Assert(MyBackendTag > 0);
+
+ MyBackendId = SIAssignBackendId(segInOutP, MyBackendTag);
+ if (MyBackendId == InvalidBackendTag)
+ return 0;
+
+#ifdef INVALIDDEBUG
+ elog(DEBUG, "SIBackendInit: backend tag %d; backend id %d.",
+ MyBackendTag, MyBackendId);
+#endif /* INVALIDDEBUG */
+
+ SISetActiveProcess(segInOutP, MyBackendId);
+ on_exitpg(CleanupInvalidationState, (caddr_t)segInOutP);
+ return 1;
+}
+
+/* ----------------
+ * SIAssignBackendId
+ * ----------------
+ */
+static BackendId
+SIAssignBackendId(SISeg *segInOutP, BackendTag backendTag)
+{
+ Index index;
+ ProcState *stateP;
+
+ stateP = NULL;
+
+ for (index = 0; index < MaxBackendId; index += 1) {
+ if (segInOutP->procState[index].tag == InvalidBackendTag ||
+ segInOutP->procState[index].tag == backendTag)
+ {
+ stateP = &segInOutP->procState[index];
+ break;
+ }
+
+ if (!PointerIsValid(stateP) ||
+ (segInOutP->procState[index].resetState &&
+ (!stateP->resetState ||
+ stateP->tag < backendTag)) ||
+ (!stateP->resetState &&
+ (segInOutP->procState[index].limit <
+ stateP->limit ||
+ stateP->tag < backendTag)))
+ {
+ stateP = &segInOutP->procState[index];
+ }
+ }
+
+ /* verify that all "procState" entries are checked for matching tags */
+
+ for (index += 1; index < MaxBackendId; index += 1) {
+ if (segInOutP->procState[index].tag == backendTag) {
+ elog (FATAL, "SIAssignBackendId: tag %d found twice",
+ backendTag);
+ }
+ }
+
+ if (stateP->tag != InvalidBackendTag) {
+ if (stateP->tag == backendTag) {
+ elog(NOTICE, "SIAssignBackendId: reusing tag %d",
+ backendTag);
+ } else {
+ elog(NOTICE,
+ "SIAssignBackendId: discarding tag %d",
+ stateP->tag);
+ return InvalidBackendTag;
+ }
+ }
+
+ stateP->tag = backendTag;
+
+ return (1 + stateP - &segInOutP->procState[0]);
+}
+
+
+/************************************************************************/
+/* The following function should be called only by the postmaster !! */
+/************************************************************************/
+
+/************************************************************************/
+/* SISetDeadProcess(segP, backendId) set the backend status DEAD */
+/* should be called only by the postmaster when a backend died */
+/************************************************************************/
+static void
+SISetDeadProcess(SISeg *segP, int backendId)
+{
+ /* XXX call me.... */
+
+ segP->procState[backendId - 1].resetState = false;
+ segP->procState[backendId - 1].limit = -1;
+ segP->procState[backendId - 1].tag = InvalidBackendTag;
+}
+
+/*
+ * CleanupInvalidationState --
+ * Note:
+ * This is a temporary hack. ExitBackend should call this instead
+ * of exit (via on_exitpg).
+ */
+static void
+CleanupInvalidationState(int status, /* XXX */
+ SISeg *segInOutP) /* XXX style */
+{
+ Assert(PointerIsValid(segInOutP));
+
+ SISetDeadProcess(segInOutP, MyBackendId);
+}
+
+
+/************************************************************************/
+/* SIComputeSize() - returns the size of a buffer segment */
+/************************************************************************/
+static SISegOffsets *
+SIComputeSize(int *segSize)
+{
+ int A, B, a, b, totalSize;
+ SISegOffsets *oP;
+
+ A = 0;
+ a = SizeSISeg; /* offset to first data entry */
+ b = SizeOfOneSISegEntry * MAXNUMMESSAGES;
+ B = A + a + b;
+ totalSize = B - A;
+ *segSize = totalSize;
+
+ oP = (SISegOffsets *) palloc(sizeof(SISegOffsets));
+ oP->startSegment = A;
+ oP->offsetToFirstEntry = a; /* relative to A */
+ oP->offsetToEndOfSegemnt = totalSize; /* relative to A */
+ return(oP);
+}
+
+
+/************************************************************************/
+/* SISetStartEntrySection(segP, offset) - sets the offset */
+/************************************************************************/
+static void
+SISetStartEntrySection(SISeg *segP, Offset offset)
+{
+ segP->startEntrySection = offset;
+}
+
+/************************************************************************/
+/* SIGetStartEntrySection(segP) - returns the offset */
+/************************************************************************/
+static Offset
+SIGetStartEntrySection(SISeg *segP)
+{
+ return(segP->startEntrySection);
+}
+
+
+/************************************************************************/
+/* SISetEndEntrySection(segP, offset) - sets the offset */
+/************************************************************************/
+static void
+SISetEndEntrySection(SISeg *segP, Offset offset)
+{
+ segP->endEntrySection = offset;
+}
+
+/************************************************************************/
+/* SISetEndEntryChain(segP, offset) - sets the offset */
+/************************************************************************/
+static void
+SISetEndEntryChain(SISeg *segP, Offset offset)
+{
+ segP->endEntryChain = offset;
+}
+
+/************************************************************************/
+/* SIGetEndEntryChain(segP) - returns the offset */
+/************************************************************************/
+static Offset
+SIGetEndEntryChain(SISeg *segP)
+{
+ return(segP->endEntryChain);
+}
+
+/************************************************************************/
+/* SISetStartEntryChain(segP, offset) - sets the offset */
+/************************************************************************/
+static void
+SISetStartEntryChain(SISeg *segP, Offset offset)
+{
+ segP->startEntryChain = offset;
+}
+
+/************************************************************************/
+/* SIGetStartEntryChain(segP) - returns the offset */
+/************************************************************************/
+static Offset
+SIGetStartEntryChain(SISeg *segP)
+{
+ return(segP->startEntryChain);
+}
+
+/************************************************************************/
+/* SISetNumEntries(segP, num) sets the current number of entries */
+/************************************************************************/
+static bool
+SISetNumEntries(SISeg *segP, int num)
+{
+ if ( num <= MAXNUMMESSAGES) {
+ segP->numEntries = num;
+ return(true);
+ } else {
+ return(false); /* table full */
+ }
+}
+
+/************************************************************************/
+/* SIGetNumEntries(segP) - returns the current number of entries */
+/************************************************************************/
+static int
+SIGetNumEntries(SISeg *segP)
+{
+ return(segP->numEntries);
+}
+
+
+/************************************************************************/
+/* SISetMaxNumEntries(segP, num) sets the maximal number of entries */
+/************************************************************************/
+static bool
+SISetMaxNumEntries(SISeg *segP, int num)
+{
+ if ( num <= MAXNUMMESSAGES) {
+ segP->maxNumEntries = num;
+ return(true);
+ } else {
+ return(false); /* wrong number */
+ }
+}
+
+
+/************************************************************************/
+/* SIGetProcStateLimit(segP, i) returns the limit of read messages */
+/************************************************************************/
+static int
+SIGetProcStateLimit(SISeg *segP, int i)
+{
+ return(segP->procState[i].limit);
+}
+
+/************************************************************************/
+/* SIIncNumEntries(segP, num) increments the current number of entries */
+/************************************************************************/
+static bool
+SIIncNumEntries(SISeg *segP, int num)
+{
+ if ((segP->numEntries + num) <= MAXNUMMESSAGES) {
+ segP->numEntries = segP->numEntries + num;
+ return(true);
+ } else {
+ return(false); /* table full */
+ }
+}
+
+/************************************************************************/
+/* SIDecNumEntries(segP, num) decrements the current number of entries */
+/************************************************************************/
+static bool
+SIDecNumEntries(SISeg *segP, int num)
+{
+ if ((segP->numEntries - num) >= 0) {
+ segP->numEntries = segP->numEntries - num;
+ return(true);
+ } else {
+ return(false); /* not enough entries in table */
+ }
+}
+
+/************************************************************************/
+/* SISetStartFreeSpace(segP, offset) - sets the offset */
+/************************************************************************/
+static void
+SISetStartFreeSpace(SISeg *segP, Offset offset)
+{
+ segP->startFreeSpace = offset;
+}
+
+/************************************************************************/
+/* SIGetStartFreeSpace(segP) - returns the offset */
+/************************************************************************/
+static Offset
+SIGetStartFreeSpace(SISeg *segP)
+{
+ return(segP->startFreeSpace);
+}
+
+
+
+/************************************************************************/
+/* SIGetFirstDataEntry(segP) returns first data entry */
+/************************************************************************/
+static SISegEntry *
+SIGetFirstDataEntry(SISeg *segP)
+{
+ SISegEntry *eP;
+ Offset startChain;
+
+ startChain = SIGetStartEntryChain(segP);
+
+ if (startChain == InvalidOffset)
+ return(NULL);
+
+ eP = (SISegEntry *) ((Pointer) segP +
+ SIGetStartEntrySection(segP) +
+ startChain );
+ return(eP);
+}
+
+
+/************************************************************************/
+/* SIGetLastDataEntry(segP) returns last data entry in the chain */
+/************************************************************************/
+static SISegEntry *
+SIGetLastDataEntry(SISeg *segP)
+{
+ SISegEntry *eP;
+ Offset endChain;
+
+ endChain = SIGetEndEntryChain(segP);
+
+ if (endChain == InvalidOffset)
+ return(NULL);
+
+ eP = (SISegEntry *) ((Pointer) segP +
+ SIGetStartEntrySection(segP) +
+ endChain );
+ return(eP);
+}
+
+/************************************************************************/
+/* SIGetNextDataEntry(segP, offset) returns next data entry */
+/************************************************************************/
+static SISegEntry *
+SIGetNextDataEntry(SISeg *segP, Offset offset)
+{
+ SISegEntry *eP;
+
+ if (offset == InvalidOffset)
+ return(NULL);
+
+ eP = (SISegEntry *) ((Pointer) segP +
+ SIGetStartEntrySection(segP) +
+ offset);
+ return(eP);
+}
+
+
+/************************************************************************/
+/* SIGetNthDataEntry(segP, n) returns the n-th data entry in chain */
+/************************************************************************/
+static SISegEntry *
+SIGetNthDataEntry(SISeg *segP,
+ int n) /* must range from 1 to MaxMessages */
+{
+ SISegEntry *eP;
+ int i;
+
+ if (n <= 0) return(NULL);
+
+ eP = SIGetFirstDataEntry(segP);
+ for (i = 1; i < n; i++) {
+ /* skip one and get the next */
+ eP = SIGetNextDataEntry(segP, eP->next);
+ }
+
+ return(eP);
+}
+
+/************************************************************************/
+/* SIEntryOffset(segP, entryP) returns the offset for an entry pointer */
+/************************************************************************/
+static Offset
+SIEntryOffset(SISeg *segP, SISegEntry *entryP)
+{
+ /* relative to B !! */
+ return ((Offset) ((Pointer) entryP -
+ (Pointer) segP -
+ SIGetStartEntrySection(segP) ));
+}
+
+
+/************************************************************************/
+/* SISetDataEntry(segP, data) - sets a message in the segment */
+/************************************************************************/
+bool
+SISetDataEntry(SISeg *segP, SharedInvalidData *data)
+{
+ Offset offsetToNewData;
+ SISegEntry *eP, *lastP;
+ bool SISegFull();
+ Offset SIEntryOffset();
+ Offset SIGetStartFreeSpace();
+ SISegEntry *SIGetFirstDataEntry();
+ SISegEntry *SIGetNextDataEntry();
+ SISegEntry *SIGetLastDataEntry();
+
+ if (!SIIncNumEntries(segP, 1))
+ return(false); /* no space */
+
+ /* get a free entry */
+ offsetToNewData = SIGetStartFreeSpace(segP);
+ eP = SIGetNextDataEntry(segP, offsetToNewData); /* it's a free one */
+ SISetStartFreeSpace(segP, eP->next);
+ /* fill it up */
+ eP->entryData = *data;
+ eP->isfree = false;
+ eP->next = InvalidOffset;
+
+ /* handle insertion point at the end of the chain !!*/
+ lastP = SIGetLastDataEntry(segP);
+ if (lastP == NULL) {
+ /* there is no chain, insert the first entry */
+ SISetStartEntryChain(segP, SIEntryOffset(segP, eP));
+ } else {
+ /* there is a last entry in the chain */
+ lastP->next = SIEntryOffset(segP, eP);
+ }
+ SISetEndEntryChain(segP, SIEntryOffset(segP, eP));
+ return(true);
+}
+
+
+/************************************************************************/
+/* SIDecProcLimit(segP, num) decrements all process limits */
+/************************************************************************/
+static void
+SIDecProcLimit(SISeg *segP, int num)
+{
+ int i;
+ for (i=0; i < MaxBackendId; i++) {
+ /* decrement only, if there is a limit > 0 */
+ if (segP->procState[i].limit > 0) {
+ segP->procState[i].limit = segP->procState[i].limit - num;
+ if (segP->procState[i].limit < 0) {
+ /* limit was not high enough, reset to zero */
+ /* negative means it's a dead backend */
+ segP->procState[i].limit = 0;
+ }
+ }
+ }
+}
+
+
+/************************************************************************/
+/* SIDelDataEntry(segP) - free the FIRST entry */
+/************************************************************************/
+bool
+SIDelDataEntry(SISeg *segP)
+{
+ SISegEntry *e1P;
+ SISegEntry *SIGetFirstDataEntry();
+
+ if (!SIDecNumEntries(segP, 1)) {
+ /* no entries in buffer */
+ return(false);
+ }
+
+ e1P = SIGetFirstDataEntry(segP);
+ SISetStartEntryChain(segP, e1P->next);
+ if (SIGetStartEntryChain(segP) == InvalidOffset) {
+ /* it was the last entry */
+ SISetEndEntryChain(segP, InvalidOffset);
+ }
+ /* free the entry */
+ e1P->isfree = true;
+ e1P->next = SIGetStartFreeSpace(segP);
+ SISetStartFreeSpace(segP, SIEntryOffset(segP, e1P));
+ SIDecProcLimit(segP, 1);
+ return(true);
+}
+
+
+
+/************************************************************************/
+/* SISetProcStateInvalid(segP) checks and marks a backends state as */
+/* invalid */
+/************************************************************************/
+void
+SISetProcStateInvalid(SISeg *segP)
+{
+ int i;
+
+ for (i=0; i < MaxBackendId; i++) {
+ if (segP->procState[i].limit == 0) {
+ /* backend i didn't read any message */
+ segP->procState[i].resetState = true;
+ /*XXX signal backend that it has to reset its internal cache ? */
+ }
+ }
+}
+
+/************************************************************************/
+/* SIReadEntryData(segP, backendId, function) */
+/* - marks messages to be read by id */
+/* and executes function */
+/************************************************************************/
+void
+SIReadEntryData(SISeg *segP,
+ int backendId,
+ void (*invalFunction)(),
+ void (*resetFunction)())
+{
+ int i = 0;
+ SISegEntry *data;
+
+ Assert(segP->procState[backendId - 1].tag == MyBackendTag);
+
+ if (!segP->procState[backendId - 1].resetState) {
+ /* invalidate data, but only entries we have not seen yet !! */
+ /* therefore skip messages that were already read */
+ data = SIGetNthDataEntry(segP,
+ SIGetProcStateLimit(segP, backendId - 1) + 1);
+ while (data != NULL) {
+ i++;
+ segP->procState[backendId - 1].limit++; /* one more message read */
+ invalFunction(data->entryData.cacheId,
+ data->entryData.hashIndex,
+ &data->entryData.pointerData);
+ data = SIGetNextDataEntry(segP, data->next);
+ }
+ /* SIDelExpiredDataEntries(segP); */
+ } else {
+ /*backend must not read messages, its own state has to be reset */
+ elog(NOTICE, "SIMarkEntryData: cache state reset");
+ resetFunction(); /* XXXX call it here, parameters? */
+
+ /* new valid state--mark all messages "read" */
+ segP->procState[backendId - 1].resetState = false;
+ segP->procState[backendId - 1].limit = SIGetNumEntries(segP);
+ }
+ /* sanity check: we cannot have read more messages than the buffer holds */
+ if (i > MAXNUMMESSAGES) {
+ elog(FATAL, "SIReadEntryData: Invalid segment state");
+ }
+}
+
+/************************************************************************/
+/* SIDelExpiredDataEntries (segP) - removes irrelevant messages */
+/************************************************************************/
+void
+SIDelExpiredDataEntries(SISeg *segP)
+{
+ int min, i, h;
+
+ min = 9999999;
+ for (i = 0; i < MaxBackendId; i++) {
+ h = SIGetProcStateLimit(segP, i);
+ if (h >= 0) { /* backend active */
+ if (h < min ) min = h;
+ }
+ }
+ if (min != 9999999) {
+ /* we can remove min messages */
+ for (i = 1; i <= min; i++) {
+ /* this adjusts also the state limits!*/
+ if (!SIDelDataEntry(segP)) {
+ elog(FATAL, "SIDelExpiredDataEntries: Invalid segment state");
+ }
+ }
+ }
+}
+
+
+
+/************************************************************************/
+/* SISegInit(segP) - initializes the segment */
+/************************************************************************/
+static void
+SISegInit(SISeg *segP)
+{
+ SISegOffsets *oP;
+ int segSize, i;
+ SISegEntry *eP;
+
+ oP = SIComputeSize(&segSize);
+ /* set semaphore ids in the segment */
+ /* XXX */
+ SISetStartEntrySection(segP, oP->offsetToFirstEntry);
+ SISetEndEntrySection(segP, oP->offsetToEndOfSegemnt);
+ SISetStartFreeSpace(segP, 0);
+ SISetStartEntryChain(segP, InvalidOffset);
+ SISetEndEntryChain(segP, InvalidOffset);
+ (void) SISetNumEntries(segP, 0);
+ (void) SISetMaxNumEntries(segP, MAXNUMMESSAGES);
+ for (i = 0; i < MaxBackendId; i++) {
+ segP->procState[i].limit = -1; /* no backend active !!*/
+ segP->procState[i].resetState = false;
+ segP->procState[i].tag = InvalidBackendTag;
+ }
+ /* construct a chain of free entries */
+ for (i = 1; i < MAXNUMMESSAGES; i++) {
+ eP = (SISegEntry *) ((Pointer) segP +
+ SIGetStartEntrySection(segP) +
+ (i - 1) * sizeof(SISegEntry));
+ eP->isfree = true;
+ eP->next = i * sizeof(SISegEntry); /* relative to B */
+ }
+ /* handle the last free entry separately */
+ eP = (SISegEntry *) ((Pointer) segP +
+ SIGetStartEntrySection(segP) +
+ (MAXNUMMESSAGES - 1) * sizeof(SISegEntry));
+ eP->isfree = true;
+ eP->next = InvalidOffset; /* it's the end of the chain !! */
+ /*
+ * Be tidy
+ */
+ pfree(oP);
+
+}
+
+
+
+/************************************************************************/
+/* SISegmentKill(key) - kill any segment */
+/************************************************************************/
+static void
+SISegmentKill(int key) /* the corresponding key for the segment */
+{
+ IpcMemoryKill(key);
+}
+
+
+/************************************************************************/
+/* SISegmentGet(key, size) - get a shared segment of size <size> */
+/* returns a segment id */
+/************************************************************************/
+static IpcMemoryId
+SISegmentGet(int key, /* the corresponding key for the segment */
+ int size, /* size of segment in bytes */
+ bool create)
+{
+ IpcMemoryId shmid;
+
+ if (create) {
+ shmid = IpcMemoryCreate(key, size, IPCProtection);
+ } else {
+ shmid = IpcMemoryIdGet(key, size);
+ }
+ return(shmid);
+}
+
+/************************************************************************/
+/* SISegmentAttach(shmid) - attach a shared segment with id shmid */
+/************************************************************************/
+static void
+SISegmentAttach(IpcMemoryId shmid)
+{
+ shmInvalBuffer = (struct SISeg *) IpcMemoryAttach(shmid);
+ if (shmInvalBuffer == IpcMemAttachFailed) {
+ /* XXX use validity function */
+ elog(NOTICE, "SISegmentAttach: Could not attach segment");
+ elog(FATAL, "SISegmentAttach: %m");
+ }
+}
+
+
+/************************************************************************/
+/* SISegmentInit(killExistingSegment, key) initialize segment */
+/************************************************************************/
+int
+SISegmentInit(bool killExistingSegment, IPCKey key)
+{
+ SISegOffsets *oP;
+ int segSize;
+ IpcMemoryId shmId;
+ bool create;
+
+ if (killExistingSegment) {
+ /* Kill existing segment */
+ /* set semaphore */
+ SISegmentKill(key);
+
+ /* Get a shared segment */
+
+ oP = SIComputeSize(&segSize);
+ /*
+ * Be tidy
+ */
+ pfree(oP);
+
+ create = true;
+ shmId = SISegmentGet(key,segSize, create);
+ if (shmId < 0) {
+ perror("SISegmentGet: failed");
+ return(-1); /* an error */
+ }
+
+ /* Attach the shared cache invalidation segment */
+ /* sets the global variable shmInvalBuffer */
+ SISegmentAttach(shmId);
+
+ /* Init shared memory table */
+ SISegInit(shmInvalBuffer);
+ } else {
+ /* use an existing segment */
+ create = false;
+ shmId = SISegmentGet(key, 0, create);
+ if (shmId < 0) {
+ perror("SISegmentGet: getting an existent segment failed");
+ return(-1); /* an error */
+ }
+ /* Attach the shared cache invalidation segment */
+ SISegmentAttach(shmId);
+ }
+ return(1);
+}
+
diff --git a/src/backend/storage/ipc/spin.c b/src/backend/storage/ipc/spin.c
new file mode 100644
index 00000000000..7ff2561f237
--- /dev/null
+++ b/src/backend/storage/ipc/spin.c
@@ -0,0 +1,247 @@
+/*-------------------------------------------------------------------------
+ *
+ * spin.c--
+ * routines for managing spin locks
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/storage/ipc/Attic/spin.c,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * POSTGRES has two kinds of locks: semaphores (which put the
+ * process to sleep) and spinlocks (which are supposed to be
+ * short term locks). Currently both are implemented as SysV
+ * semaphores, but presumably this can change if we move to
+ * a machine with a test-and-set (TAS) instruction. It's probably
+ * a good idea to think about (and allocate) short term and long
+ * term semaphores separately anyway.
+ *
+ * NOTE: These routines are not supposed to be widely used in Postgres.
+ * They are preserved solely for the purpose of porting Mark Sullivan's
+ * buffer manager to Postgres.
+ */
+#include <errno.h>
+#include "postgres.h"
+#include "storage/ipc.h"
+#include "storage/shmem.h"
+#include "storage/spin.h"
+#include "storage/proc.h"
+#include "utils/elog.h"
+
+/* globals used in this file */
+IpcSemaphoreId SpinLockId;
+
+#ifdef HAS_TEST_AND_SET
+/* real spin lock implementations */
+
+bool
+CreateSpinlocks(IPCKey key)
+{
+ /* the spin lock shared memory must have been created by now */
+ return(TRUE);
+}
+
+bool
+AttachSpinLocks(IPCKey key)
+{
+ /* the spin lock shared memory must have been attached by now */
+ return(TRUE);
+}
+
+bool
+InitSpinLocks(int init, IPCKey key)
+{
+ extern SPINLOCK ShmemLock;
+ extern SPINLOCK BindingLock;
+ extern SPINLOCK BufMgrLock;
+ extern SPINLOCK LockMgrLock;
+ extern SPINLOCK ProcStructLock;
+ extern SPINLOCK SInvalLock;
+ extern SPINLOCK OidGenLockId;
+
+#ifdef MAIN_MEMORY
+ extern SPINLOCK MMCacheLock;
+#endif /* MAIN_MEMORY */
+
+ /* These spinlocks have fixed locations in shmem */
+ ShmemLock = (SPINLOCK) SHMEMLOCKID;
+ BindingLock = (SPINLOCK) BINDINGLOCKID;
+ BufMgrLock = (SPINLOCK) BUFMGRLOCKID;
+ LockMgrLock = (SPINLOCK) LOCKMGRLOCKID;
+ ProcStructLock = (SPINLOCK) PROCSTRUCTLOCKID;
+ SInvalLock = (SPINLOCK) SINVALLOCKID;
+ OidGenLockId = (SPINLOCK) OIDGENLOCKID;
+
+#ifdef MAIN_MEMORY
+ MMCacheLock = (SPINLOCK) MMCACHELOCKID;
+#endif /* MAIN_MEMORY */
+
+ return(TRUE);
+}
+
+void
+SpinAcquire(SPINLOCK lock)
+{
+ ExclusiveLock(lock);
+ PROC_INCR_SLOCK(lock);
+}
+
+void
+SpinRelease(SPINLOCK lock)
+{
+ PROC_DECR_SLOCK(lock);
+ ExclusiveUnlock(lock);
+}
+
+bool
+SpinIsLocked(SPINLOCK lock)
+{
+ return(!LockIsFree(lock));
+}
+
+#else /* HAS_TEST_AND_SET */
+/* Spinlocks are implemented using SysV semaphores */
+
+
+/*
+ * SpinAcquire -- try to grab a spinlock
+ *
+ * FAILS if the semaphore is corrupted.
+ */
+void
+SpinAcquire(SPINLOCK lock)
+{
+ IpcSemaphoreLock(SpinLockId, lock, IpcExclusiveLock);
+ PROC_INCR_SLOCK(lock);
+}
+
+/*
+ * SpinRelease -- release a spin lock
+ *
+ * FAILS if the semaphore is corrupted
+ */
+void
+SpinRelease(SPINLOCK lock)
+{
+ Assert(SpinIsLocked(lock));
+ PROC_DECR_SLOCK(lock);
+ IpcSemaphoreUnlock(SpinLockId, lock, IpcExclusiveLock);
+}
+
+bool
+SpinIsLocked(SPINLOCK lock)
+{
+ int semval;
+
+ semval = IpcSemaphoreGetValue(SpinLockId, lock);
+ return(semval < IpcSemaphoreDefaultStartValue);
+}
+
+/*
+ * CreateSpinlocks -- Create a sysV semaphore array for
+ * the spinlocks
+ *
+ */
+bool
+CreateSpinlocks(IPCKey key)
+{
+
+ int status;
+ IpcSemaphoreId semid;
+ semid = IpcSemaphoreCreate(key, MAX_SPINS, IPCProtection,
+ IpcSemaphoreDefaultStartValue, 1, &status);
+ if (status == IpcSemIdExist) {
+ IpcSemaphoreKill(key);
+ elog(NOTICE,"Destroying old spinlock semaphore");
+ semid = IpcSemaphoreCreate(key, MAX_SPINS, IPCProtection,
+ IpcSemaphoreDefaultStartValue, 1, &status);
+ }
+
+ if (semid >= 0) {
+ SpinLockId = semid;
+ return(TRUE);
+ }
+ /* cannot create spinlocks */
+ elog(FATAL,"CreateSpinlocks: cannot create spin locks");
+ return(FALSE);
+}
+
+/*
+ * Attach to existing spinlock set
+ */
+bool
+AttachSpinLocks(IPCKey key)
+{
+ IpcSemaphoreId id;
+
+ id = semget (key, MAX_SPINS, 0);
+ if (id < 0) {
+ if (errno == EEXIST) {
+ /* key is the name of someone else's semaphore */
+ elog (FATAL,"AttachSpinlocks: SPIN_KEY belongs to someone else");
+ }
+ /* cannot create spinlocks */
+ elog(FATAL,"AttachSpinlocks: cannot create spin locks");
+ return(FALSE);
+ }
+ SpinLockId = id;
+ return(TRUE);
+}
+
+/*
+ * InitSpinLocks -- Spinlock bootstrapping
+ *
+ * We need several spinlocks for bootstrapping:
+ * BindingLock (for the shmem binding table),
+ * ShmemLock (for the shmem allocator), BufMgrLock (for buffer
+ * pool exclusive access), LockMgrLock (for the lock table), and
+ * ProcStructLock (a spin lock for the shared process structure).
+ * If there's a Sony WORM drive attached, we also have a spinlock
+ * (SJCacheLock) for it. Same story for the main memory storage mgr.
+ *
+ */
+bool
+InitSpinLocks(int init, IPCKey key)
+{
+ extern SPINLOCK ShmemLock;
+ extern SPINLOCK BindingLock;
+ extern SPINLOCK BufMgrLock;
+ extern SPINLOCK LockMgrLock;
+ extern SPINLOCK ProcStructLock;
+ extern SPINLOCK SInvalLock;
+ extern SPINLOCK OidGenLockId;
+
+#ifdef MAIN_MEMORY
+ extern SPINLOCK MMCacheLock;
+#endif /* MAIN_MEMORY */
+
+ if (!init || key != IPC_PRIVATE) {
+ /* if bootstrap and key is IPC_PRIVATE, we are running a backend
+ * by itself; there is no need to attach to an existing spinlock
+ * set. Otherwise, attach to it here.
+ */
+ if (! AttachSpinLocks(key)) {
+ elog(FATAL,"InitSpinLocks: couldnt attach spin locks");
+ return(FALSE);
+ }
+ }
+
+ /* These spinlocks have fixed locations in shmem */
+ ShmemLock = (SPINLOCK) SHMEMLOCKID;
+ BindingLock = (SPINLOCK) BINDINGLOCKID;
+ BufMgrLock = (SPINLOCK) BUFMGRLOCKID;
+ LockMgrLock = (SPINLOCK) LOCKMGRLOCKID;
+ ProcStructLock = (SPINLOCK) PROCSTRUCTLOCKID;
+ SInvalLock = (SPINLOCK) SINVALLOCKID;
+ OidGenLockId = (SPINLOCK) OIDGENLOCKID;
+
+#ifdef MAIN_MEMORY
+ MMCacheLock = (SPINLOCK) MMCACHELOCKID;
+#endif /* MAIN_MEMORY */
+
+ return(TRUE);
+}
+#endif /* HAS_TEST_AND_SET */
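+
+/*
+ * Whichever implementation is compiled in, callers follow the same
+ * critical-section pattern (ShmemLock is one of the fixed spinlocks
+ * assigned above):
+ *
+ *	SpinAcquire(ShmemLock);
+ *	... examine or update the shared data structure ...
+ *	SpinRelease(ShmemLock);
+ *
+ * Spinlocks are intended for short-term mutual exclusion only; anything
+ * that may wait a long time should go through the lock manager instead.
+ */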
diff --git a/src/backend/storage/item.h b/src/backend/storage/item.h
new file mode 100644
index 00000000000..ca989fec654
--- /dev/null
+++ b/src/backend/storage/item.h
@@ -0,0 +1,20 @@
+/*-------------------------------------------------------------------------
+ *
+ * item.h--
+ * POSTGRES disk item definitions.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: item.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef ITEM_H
+#define ITEM_H
+
+#include "c.h"
+
+typedef Pointer Item;
+
+#endif /* ITEM_H */
diff --git a/src/backend/storage/itemid.h b/src/backend/storage/itemid.h
new file mode 100644
index 00000000000..f5cd0c62cc0
--- /dev/null
+++ b/src/backend/storage/itemid.h
@@ -0,0 +1,75 @@
+/*-------------------------------------------------------------------------
+ *
+ * itemid.h--
+ * Standard POSTGRES buffer page item identifier definitions.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: itemid.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef ITEMID_H
+#define ITEMID_H
+
+typedef uint16 ItemOffset;
+typedef uint16 ItemLength;
+
+typedef bits16 ItemIdFlags;
+
+
+
+typedef struct ItemIdData { /* line pointers */
+ unsigned lp_off:13, /* offset to find tup */
+ /* can be reduced by 2 if necessary */
+ lp_flags:6, /* flags on tuple */
+ lp_len:13; /* length of tuple */
+} ItemIdData;
+
+typedef struct ItemIdData *ItemId;
+
+#ifndef LP_USED
+#define LP_USED 0x01 /* this line pointer is being used */
+#endif
+
+/* ----------------
+ * support macros
+ * ----------------
+ */
+/*
+ * ItemIdGetLength
+ */
+#define ItemIdGetLength(itemId) \
+ ((itemId)->lp_len)
+
+/*
+ * ItemIdGetOffset
+ */
+#define ItemIdGetOffset(itemId) \
+ ((itemId)->lp_off)
+
+/*
+ * ItemIdGetFlags
+ */
+#define ItemIdGetFlags(itemId) \
+ ((itemId)->lp_flags)
+
+/*
+ * ItemIdIsValid --
+ * True iff disk item identifier is valid.
+ */
+#define ItemIdIsValid(itemId) PointerIsValid(itemId)
+
+/*
+ * ItemIdIsUsed --
+ * True iff disk item identifier is in use.
+ *
+ * Note:
+ * Assumes disk item identifier is valid.
+ */
+#define ItemIdIsUsed(itemId) \
+ (AssertMacro(ItemIdIsValid(itemId)) ? \
+ (bool) (((itemId)->lp_flags & LP_USED) != 0) : false)
+
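+/*
+ * Example use of the macros above ("lp" would normally be obtained from
+ * a buffer page, e.g. via PageGetItemId() in bufpage.h):
+ *
+ *	ItemId		lp;
+ *	ItemLength	length;
+ *	ItemOffset	offset;
+ *
+ *	if (ItemIdIsUsed(lp)) {
+ *		length = ItemIdGetLength(lp);
+ *		offset = ItemIdGetOffset(lp);
+ *	}
+ */
+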
+#endif /* ITEMID_H */
diff --git a/src/backend/storage/itempos.h b/src/backend/storage/itempos.h
new file mode 100644
index 00000000000..c3b895ae075
--- /dev/null
+++ b/src/backend/storage/itempos.h
@@ -0,0 +1,44 @@
+/*-------------------------------------------------------------------------
+ *
+ * itempos.h--
+ * Standard POSTGRES buffer page long item subposition definitions.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: itempos.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef ITEMPOS_H
+#define ITEMPOS_H
+
+#include "c.h"
+#include "storage/buf.h"
+#include "storage/itemid.h"
+
+typedef struct ItemSubpositionData {
+ Buffer op_db;
+ ItemId op_lpp;
+ char *op_cp; /* XXX */
+ uint32 op_len;
+} ItemSubpositionData;
+
+typedef ItemSubpositionData *ItemSubposition;
+
+/*
+ * PNOBREAK(OBJP, LEN)
+ * struct objpos *OBJP;
+ * unsigned LEN;
+ */
+#define PNOBREAK(OBJP, LEN) ((OBJP)->op_len >= LEN)
+
+/*
+ * PSKIP(OBJP, LEN)
+ * struct objpos *OBJP;
+ * unsigned LEN;
+ */
+#define PSKIP(OBJP, LEN)\
+ { (OBJP)->op_cp += (LEN); (OBJP)->op_len -= (LEN); }
+
+#endif /* ITEMPOS_H */
diff --git a/src/backend/storage/itemptr.h b/src/backend/storage/itemptr.h
new file mode 100644
index 00000000000..ba3c154ef14
--- /dev/null
+++ b/src/backend/storage/itemptr.h
@@ -0,0 +1,115 @@
+/*-------------------------------------------------------------------------
+ *
+ * itemptr.h--
+ * POSTGRES disk item pointer definitions.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: itemptr.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef ITEMPTR_H
+#define ITEMPTR_H
+
+#include "c.h"
+#include "storage/block.h"
+#include "storage/off.h"
+#include "storage/itemid.h"
+
+/*
+ * ItemPointer:
+ *
+ * this is a pointer to an item on another disk page in the same file.
+ * blkid tells us which block, posid tells us which entry in the linp
+ * (ItemIdData) array we want.
+ */
+typedef struct ItemPointerData {
+ BlockIdData ip_blkid;
+ OffsetNumber ip_posid;
+} ItemPointerData;
+
+typedef ItemPointerData *ItemPointer;
+
+/* ----------------
+ * support macros
+ * ----------------
+ */
+
+/*
+ * ItemPointerIsValid --
+ * True iff the disk item pointer is not NULL.
+ */
+#define ItemPointerIsValid(pointer) \
+ ((bool) (PointerIsValid(pointer) && ((pointer)->ip_posid != 0)))
+
+/*
+ * ItemPointerGetBlockNumber --
+ * Returns the block number of a disk item pointer.
+ */
+#define ItemPointerGetBlockNumber(pointer) \
+ (AssertMacro(ItemPointerIsValid(pointer)) ? \
+ BlockIdGetBlockNumber(&(pointer)->ip_blkid) : (BlockNumber) 0)
+
+/*
+ * ItemPointerGetOffsetNumber --
+ * Returns the offset number of a disk item pointer.
+ */
+#define ItemPointerGetOffsetNumber(pointer) \
+ (AssertMacro(ItemPointerIsValid(pointer)) ? \
+ (pointer)->ip_posid : \
+ InvalidOffsetNumber)
+
+/*
+ * ItemPointerSet --
+ * Sets a disk item pointer to the specified block and offset.
+ */
+#define ItemPointerSet(pointer, blockNumber, offNum) \
+ Assert(PointerIsValid(pointer)); \
+ BlockIdSet(&((pointer)->ip_blkid), blockNumber); \
+ (pointer)->ip_posid = offNum
+
+/*
+ * ItemPointerSetBlockNumber --
+ * Sets a disk item pointer to the specified block.
+ */
+#define ItemPointerSetBlockNumber(pointer, blockNumber) \
+ Assert(PointerIsValid(pointer)); \
+ BlockIdSet(&((pointer)->ip_blkid), blockNumber)
+
+/*
+ * ItemPointerSetOffsetNumber --
+ * Sets a disk item pointer to the specified offset.
+ */
+#define ItemPointerSetOffsetNumber(pointer, offsetNumber) \
+ AssertMacro(PointerIsValid(pointer)); \
+ (pointer)->ip_posid = (offsetNumber)
+
+/*
+ * ItemPointerCopy --
+ * Copies the contents of one disk item pointer to another.
+ */
+#define ItemPointerCopy(fromPointer, toPointer) \
+ Assert(PointerIsValid(toPointer)); \
+ Assert(PointerIsValid(fromPointer)); \
+ *(toPointer) = *(fromPointer)
+
+/*
+ * ItemPointerSetInvalid --
+ * Sets a disk item pointer to be invalid.
+ */
+#define ItemPointerSetInvalid(pointer) \
+ Assert(PointerIsValid(pointer)); \
+ BlockIdSet(&((pointer)->ip_blkid), InvalidBlockNumber); \
+ (pointer)->ip_posid = InvalidOffsetNumber
+
+/* ----------------
+ * externs
+ * ----------------
+ */
+
+extern bool ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2);
+
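+/*
+ * Example use of the macros above (blockNumber and offsetNumber are
+ * illustrative values supplied by the caller):
+ *
+ *	ItemPointerData tid;
+ *
+ *	ItemPointerSet(&tid, blockNumber, offsetNumber);
+ *	if (ItemPointerIsValid(&tid)) {
+ *		blockNumber = ItemPointerGetBlockNumber(&tid);
+ *		offsetNumber = ItemPointerGetOffsetNumber(&tid);
+ *	}
+ */
+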
+#endif /* ITEMPTR_H */
+
diff --git a/src/backend/storage/large_object.h b/src/backend/storage/large_object.h
new file mode 100644
index 00000000000..177d2c26e47
--- /dev/null
+++ b/src/backend/storage/large_object.h
@@ -0,0 +1,58 @@
+/*-------------------------------------------------------------------------
+ *
+ * large_object.h--
+ * file of info for Postgres large objects. POSTGRES 4.2 supports
+ * zillions of large objects (internal, external, jaquith, inversion).
+ * Now we only support inversion.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: large_object.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef LARGE_OBJECT_H
+#define LARGE_OBJECT_H
+
+#include "c.h"
+#include "utils/rel.h"
+#include "access/relscan.h"
+
+/*
+ * This structure will eventually have lots more stuff associated with it.
+ */
+typedef struct LargeObjectDesc
+{
+ Relation heap_r; /* heap relation */
+ Relation index_r; /* index relation on seqno attribute */
+ IndexScanDesc iscan; /* index scan we're using */
+ TupleDesc hdesc; /* heap relation tuple desc */
+ TupleDesc idesc; /* index relation tuple desc */
+ uint32 lowbyte; /* low byte on the current page */
+ uint32 highbyte; /* high byte on the current page */
+ uint32 offset; /* current seek pointer */
+ ItemPointerData htid; /* tid of current heap tuple */
+
+#define IFS_RDLOCK (1 << 0)
+#define IFS_WRLOCK (1 << 1)
+#define IFS_ATEOF (1 << 2)
+
+ u_long flags; /* locking info, etc */
+} LargeObjectDesc;
+
+/*
+ * Function definitions...
+ */
+
+/* inversion stuff in inv_api.c */
+extern LargeObjectDesc *inv_create(int flags);
+extern LargeObjectDesc *inv_open(Oid lobjId, int flags);
+extern void inv_close(LargeObjectDesc *obj_desc);
+extern int inv_destroy(Oid lobjId);
+extern int inv_stat(LargeObjectDesc *obj_desc, struct pgstat *stbuf);
+extern int inv_seek(LargeObjectDesc *obj_desc, int offset, int whence);
+extern int inv_tell(LargeObjectDesc *obj_desc);
+extern int inv_read(LargeObjectDesc *obj_desc, char *buf, int nbytes);
+extern int inv_write(LargeObjectDesc *obj_desc, char *buf, int nbytes);
+
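+/*
+ * Sketch of the typical calling sequence for the inversion routines
+ * (error checking omitted; the flags argument is illustrative):
+ *
+ *	LargeObjectDesc *obj;
+ *
+ *	obj = inv_create(flags);
+ *	nwritten = inv_write(obj, buf, nbytes);
+ *	(void) inv_seek(obj, 0, 0);		rewind to byte 0
+ *	nread = inv_read(obj, buf, nbytes);
+ *	inv_close(obj);
+ */
+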
+#endif /* LARGE_OBJECT_H */
diff --git a/src/backend/storage/large_object/Makefile.inc b/src/backend/storage/large_object/Makefile.inc
new file mode 100644
index 00000000000..fd27b46a49d
--- /dev/null
+++ b/src/backend/storage/large_object/Makefile.inc
@@ -0,0 +1,14 @@
+#-------------------------------------------------------------------------
+#
+# Makefile.inc--
+# Makefile for storage/large_object
+#
+# Copyright (c) 1994, Regents of the University of California
+#
+#
+# IDENTIFICATION
+# $Header: /cvsroot/pgsql/src/backend/storage/large_object/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $
+#
+#-------------------------------------------------------------------------
+
+SUBSRCS+= inv_api.c
diff --git a/src/backend/storage/large_object/inv_api.c b/src/backend/storage/large_object/inv_api.c
new file mode 100644
index 00000000000..ae57032f94a
--- /dev/null
+++ b/src/backend/storage/large_object/inv_api.c
@@ -0,0 +1,1165 @@
+/*-------------------------------------------------------------------------
+ *
+ * inv_api.c--
+ * routines for manipulating inversion fs large objects. This file
+ * contains the user-level large object application interface routines.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/storage/large_object/inv_api.c,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <stdio.h> /* for sprintf() */
+#include <sys/file.h>
+#include "c.h"
+#include "libpq/libpq-fs.h"
+#include "access/genam.h"
+#include "access/heapam.h"
+#include "access/relscan.h"
+#include "access/tupdesc.h"
+#include "access/xact.h"
+#include "access/nbtree.h"
+#include "access/tupdesc.h"
+#include "catalog/index.h" /* for index_create() */
+#include "catalog/catalog.h" /* for newoid() */
+#include "catalog/pg_am.h" /* for BTREE_AM_OID */
+#include "catalog/pg_opclass.h" /* for INT4_OPS_OID */
+#include "catalog/pg_proc.h" /* for INT4GE_PROC_OID */
+#include "storage/itemptr.h"
+#include "storage/bufpage.h"
+#include "storage/bufmgr.h"
+#include "utils/rel.h"
+#include "utils/palloc.h"
+#include "storage/large_object.h"
+#include "utils/elog.h"
+#include "utils/syscache.h"
+#include "utils/builtins.h" /* for namestrcpy() */
+#include "catalog/heap.h"
+#include "nodes/pg_list.h"
+
+/*
+ * Warning, Will Robinson... In order to pack data into an inversion
+ * file as densely as possible, we violate the class abstraction here.
+ * When we're appending a new tuple to the end of the table, we check
+ * the last page to see how much data we can put on it. If it's more
+ * than IMINBLK, we write enough to fill the page. This limits external
+ * fragmentation. In no case can we write more than IMAXBLK, since
+ * the 8K postgres page size less overhead leaves only this much space
+ * for data.
+ */
+
+#define IFREESPC(p) (PageGetFreeSpace(p) - sizeof(HeapTupleData) - sizeof(struct varlena) - sizeof(int32))
+#define IMAXBLK 8092
+#define IMINBLK 512
+
+/* non-export function prototypes */
+static HeapTuple inv_fetchtup(LargeObjectDesc *obj_desc, Buffer *bufP);
+static HeapTuple inv_newtuple(LargeObjectDesc *obj_desc, Buffer buffer,
+ Page page, char *dbuf, int nwrite);
+static int inv_wrnew(LargeObjectDesc *obj_desc, char *buf, int nbytes);
+static int inv_wrold(LargeObjectDesc *obj_desc, char *dbuf, int nbytes,
+ HeapTuple htup, Buffer buffer);
+static void inv_indextup(LargeObjectDesc *obj_desc, HeapTuple htup);
+static int _inv_getsize(Relation hreln, TupleDesc hdesc, Relation ireln);
+
+/*
+ * inv_create -- create a new large object.
+ *
+ * Arguments:
+ * flags -- storage manager to use, archive mode, etc.
+ *
+ * Returns:
+ * large object descriptor, appropriately filled in.
+ */
+LargeObjectDesc *
+inv_create(int flags)
+{
+ int file_oid;
+ LargeObjectDesc *retval;
+ Relation r;
+ Relation indr;
+ int smgr;
+ char archchar;
+ TupleDesc tupdesc;
+ AttrNumber attNums[1];
+ Oid classObjectId[1];
+ char objname[NAMEDATALEN];
+ char indname[NAMEDATALEN];
+
+ /* parse flags */
+ smgr = flags & INV_SMGRMASK;
+ if (flags & INV_ARCHIVE)
+ archchar = 'h';
+ else
+ archchar = 'n';
+
+ /* add one here since the pg_class tuple created
+ will have the next oid and we want to have the relation name
+ to correspond to the tuple OID */
+ file_oid = newoid()+1;
+
+ /* come up with some table names */
+ sprintf(objname, "Xinv%d", file_oid);
+ sprintf(indname, "Xinx%d", file_oid);
+
+ if (SearchSysCacheTuple(RELNAME, PointerGetDatum(objname),
+ 0,0,0) != NULL) {
+ elog(WARN,
+ "internal error: %s already exists -- cannot create large obj",
+ objname);
+ }
+ if (SearchSysCacheTuple(RELNAME, PointerGetDatum(indname),
+ 0,0,0) != NULL) {
+ elog(WARN,
+ "internal error: %s already exists -- cannot create large obj",
+ indname);
+ }
+
+ /* this is pretty painful... want a tuple descriptor */
+ tupdesc = CreateTemplateTupleDesc(2);
+ (void) TupleDescInitEntry(tupdesc, (AttrNumber) 1,
+ "olastbye",
+ "int4",
+ 0, false);
+ (void) TupleDescInitEntry(tupdesc, (AttrNumber) 2,
+ "odata",
+ "bytea",
+ 0, false);
+ /*
+ * First create the table to hold the inversion large object. It
+ * will be located on whatever storage manager the user requested.
+ */
+
+ (void) heap_create(objname,
+ objname,
+ (int) archchar, smgr,
+ tupdesc);
+
+ /* make the relation visible in this transaction */
+ CommandCounterIncrement();
+ r = heap_openr(objname);
+
+ if (!RelationIsValid(r)) {
+ elog(WARN, "cannot create large object on %s under inversion",
+ smgrout(smgr));
+ }
+
+ /*
+ * Now create a btree index on the relation's olastbyte attribute to
+ * make seeks go faster. The hardwired constants are embarrassing
+ * to me, and are symptomatic of the pressure under which this code
+ * was written.
+ *
+ * ok, mao, let's put in some symbolic constants - jolly
+ */
+
+ attNums[0] = 1;
+ classObjectId[0] = INT4_OPS_OID;
+ index_create(objname, indname, NULL, BTREE_AM_OID,
+ 1, &attNums[0], &classObjectId[0],
+ 0, (Datum) NULL, NULL);
+
+ /* make the index visible in this transaction */
+ CommandCounterIncrement();
+ indr = index_openr(indname);
+
+ if (!RelationIsValid(indr)) {
+ elog(WARN, "cannot create index for large obj on %s under inversion",
+ smgrout(smgr));
+ }
+
+ retval = (LargeObjectDesc *) palloc(sizeof(LargeObjectDesc));
+
+ retval->heap_r = r;
+ retval->index_r = indr;
+ retval->iscan = (IndexScanDesc) NULL;
+ retval->hdesc = RelationGetTupleDescriptor(r);
+ retval->idesc = RelationGetTupleDescriptor(indr);
+ retval->offset = retval->lowbyte =
+ retval->highbyte = 0;
+ ItemPointerSetInvalid(&(retval->htid));
+
+ if (flags & INV_WRITE) {
+ RelationSetLockForWrite(r);
+ retval->flags = IFS_WRLOCK|IFS_RDLOCK;
+ } else if (flags & INV_READ) {
+ RelationSetLockForRead(r);
+ retval->flags = IFS_RDLOCK;
+ }
+ retval->flags |= IFS_ATEOF;
+
+ return(retval);
+}
+
+LargeObjectDesc *
+inv_open(Oid lobjId, int flags)
+{
+ LargeObjectDesc *retval;
+ Relation r;
+ char *indname;
+ Relation indrel;
+
+ r = heap_open(lobjId);
+
+ if (!RelationIsValid(r))
+ return ((LargeObjectDesc *) NULL);
+
+ indname = pstrdup((r->rd_rel->relname).data);
+
+ /*
+ * hack hack hack... we know that the fourth character of the relation
+ * name is a 'v', and that the fourth character of the index name is an
+ * 'x', and that they're otherwise identical.
+ */
+ indname[3] = 'x';
+ indrel = index_openr(indname);
+
+ if (!RelationIsValid(indrel))
+ return ((LargeObjectDesc *) NULL);
+
+ retval = (LargeObjectDesc *) palloc(sizeof(LargeObjectDesc));
+
+ retval->heap_r = r;
+ retval->index_r = indrel;
+ retval->iscan = (IndexScanDesc) NULL;
+ retval->hdesc = RelationGetTupleDescriptor(r);
+ retval->idesc = RelationGetTupleDescriptor(indrel);
+ retval->offset = retval->lowbyte = retval->highbyte = 0;
+ ItemPointerSetInvalid(&(retval->htid));
+
+ if (flags & INV_WRITE) {
+ RelationSetLockForWrite(r);
+ retval->flags = IFS_WRLOCK|IFS_RDLOCK;
+ } else if (flags & INV_READ) {
+ RelationSetLockForRead(r);
+ retval->flags = IFS_RDLOCK;
+ }
+
+ return(retval);
+}
+
+/*
+ * Closes an existing large object descriptor.
+ */
+void
+inv_close(LargeObjectDesc *obj_desc)
+{
+ Assert(PointerIsValid(obj_desc));
+
+ if (obj_desc->iscan != (IndexScanDesc) NULL)
+ index_endscan(obj_desc->iscan);
+
+ heap_close(obj_desc->heap_r);
+ index_close(obj_desc->index_r);
+
+ pfree(obj_desc);
+}
+
+/*
+ * Destroys an existing large object, and frees its associated pointers.
+ *
+ * returns -1 if failed
+ */
+int
+inv_destroy(Oid lobjId)
+{
+ Relation r;
+
+ r = (Relation) RelationIdGetRelation(lobjId);
+ if (!RelationIsValid(r) || r->rd_rel->relkind == RELKIND_INDEX)
+ return -1;
+
+ heap_destroy(r->rd_rel->relname.data);
+ return 1;
+}
+
+/*
+ * inv_stat() -- do a stat on an inversion file.
+ *
+ * For the time being, this is an insanely expensive operation. In
+ * order to find the size of the file, we seek to the last block in
+ * it and compute the size from that. We scan pg_class to determine
+ * the file's owner and create time. We don't maintain mod time or
+ * access time, yet.
+ *
+ * These fields aren't stored in a table anywhere because they're
+ * updated so frequently, and postgres only appends tuples at the
+ * end of relations. Once clustering works, we should fix this.
+ */
+int
+inv_stat(LargeObjectDesc *obj_desc, struct pgstat *stbuf)
+{
+ Assert(PointerIsValid(obj_desc));
+ Assert(stbuf != NULL);
+
+ /* need read lock for stat */
+ if (!(obj_desc->flags & IFS_RDLOCK)) {
+ RelationSetLockForRead(obj_desc->heap_r);
+ obj_desc->flags |= IFS_RDLOCK;
+ }
+
+ stbuf->st_ino = obj_desc->heap_r->rd_id;
+#if 1
+ stbuf->st_mode = (S_IFREG | 0666); /* IFREG|rw-rw-rw- */
+#else
+ stbuf->st_mode = 100666; /* IFREG|rw-rw-rw- */
+#endif
+ stbuf->st_size = _inv_getsize(obj_desc->heap_r,
+ obj_desc->hdesc,
+ obj_desc->index_r);
+
+ stbuf->st_uid = obj_desc->heap_r->rd_rel->relowner;
+
+ /* we have no good way of computing access times right now */
+ stbuf->st_atime_s = stbuf->st_mtime_s = stbuf->st_ctime_s = 0;
+
+ return (0);
+}
+
+int
+inv_seek(LargeObjectDesc *obj_desc, int offset, int whence)
+{
+ int oldOffset;
+ Datum d;
+ ScanKeyData skey;
+
+ Assert(PointerIsValid(obj_desc));
+
+ if (whence == SEEK_CUR) {
+ offset += obj_desc->offset; /* calculate absolute position */
+ return (inv_seek(obj_desc, offset, SEEK_SET));
+ }
+
+ /*
+ * if you seek past the end (offset > 0) I have
+ * no clue what happens :-( B.L. 9/1/93
+ */
+ if (whence == SEEK_END) {
+ /* need read lock for getsize */
+ if (!(obj_desc->flags & IFS_RDLOCK)) {
+ RelationSetLockForRead(obj_desc->heap_r);
+ obj_desc->flags |= IFS_RDLOCK;
+ }
+ offset += _inv_getsize(obj_desc->heap_r,
+ obj_desc->hdesc,
+ obj_desc->index_r );
+ return (inv_seek(obj_desc, offset, SEEK_SET));
+ }
+
+ /*
+ * Whenever we do a seek, we turn off the EOF flag bit to force
+ * ourselves to check for real on the next read.
+ */
+
+ obj_desc->flags &= ~IFS_ATEOF;
+ oldOffset = obj_desc->offset;
+ obj_desc->offset = offset;
+
+ /* try to avoid doing any work, if we can manage it */
+ if (offset >= obj_desc->lowbyte
+ && offset <= obj_desc->highbyte
+ && oldOffset <= obj_desc->highbyte
+ && obj_desc->iscan != (IndexScanDesc) NULL)
+ return (offset);
+
+ /*
+ * To do a seek on an inversion file, we start an index scan that
+ * will bring us to the right place. Each tuple in an inversion file
+ * stores the offset of the last byte that appears on it, and we have
+ * an index on this.
+ */
+
+
+ /* right now, just assume that the operation is SEEK_SET */
+ if (obj_desc->iscan != (IndexScanDesc) NULL) {
+ d = Int32GetDatum(offset);
+ btmovescan(obj_desc->iscan, d);
+ } else {
+
+ ScanKeyEntryInitialize(&skey, 0x0, 1, INT4GE_PROC_OID,
+ Int32GetDatum(offset));
+
+ obj_desc->iscan = index_beginscan(obj_desc->index_r,
+ (bool) 0, (uint16) 1,
+ &skey);
+ }
+
+ return (offset);
+}
+
+int
+inv_tell(LargeObjectDesc *obj_desc)
+{
+ Assert(PointerIsValid(obj_desc));
+
+ return (obj_desc->offset);
+}
+
+int
+inv_read(LargeObjectDesc *obj_desc, char *buf, int nbytes)
+{
+ HeapTuple htup;
+ Buffer b;
+ int nread;
+ int off;
+ int ncopy;
+ Datum d;
+ struct varlena *fsblock;
+ bool isNull;
+
+ Assert(PointerIsValid(obj_desc));
+ Assert(buf != NULL);
+
+ /* if we're already at EOF, we don't need to do any work here */
+ if (obj_desc->flags & IFS_ATEOF)
+ return (0);
+
+ /* make sure we obey two-phase locking */
+ if (!(obj_desc->flags & IFS_RDLOCK)) {
+ RelationSetLockForRead(obj_desc->heap_r);
+ obj_desc->flags |= IFS_RDLOCK;
+ }
+
+ nread = 0;
+
+ /* fetch a block at a time */
+ while (nread < nbytes) {
+
+ /* fetch an inversion file system block */
+ htup = inv_fetchtup(obj_desc, &b);
+
+ if (!HeapTupleIsValid(htup)) {
+ obj_desc->flags |= IFS_ATEOF;
+ break;
+ }
+
+ /* copy the data from this block into the buffer */
+ d = (Datum) heap_getattr(htup, b, 2, obj_desc->hdesc, &isNull);
+ fsblock = (struct varlena *) DatumGetPointer(d);
+
+ off = obj_desc->offset - obj_desc->lowbyte;
+ ncopy = obj_desc->highbyte - obj_desc->offset + 1;
+ if (ncopy > (nbytes - nread))
+ ncopy = (nbytes - nread);
+ memmove(buf, &(fsblock->vl_dat[off]), ncopy);
+
+ /* be a good citizen */
+ ReleaseBuffer(b);
+
+ /* move pointers past the amount we just read */
+ buf += ncopy;
+ nread += ncopy;
+ obj_desc->offset += ncopy;
+ }
+
+ /* that's it */
+ return (nread);
+}
+
+int
+inv_write(LargeObjectDesc *obj_desc, char *buf, int nbytes)
+{
+ HeapTuple htup;
+ Buffer b;
+ int nwritten;
+ int tuplen;
+
+ Assert(PointerIsValid(obj_desc));
+ Assert(buf != NULL);
+
+ /*
+ * Make sure we obey two-phase locking. A write lock entitles you
+ * to read the relation, as well.
+ */
+
+ if (!(obj_desc->flags & IFS_WRLOCK)) {
+ RelationSetLockForRead(obj_desc->heap_r);
+ obj_desc->flags |= (IFS_WRLOCK|IFS_RDLOCK);
+ }
+
+ nwritten = 0;
+
+ /* write a block at a time */
+ while (nwritten < nbytes) {
+
+ /*
+ * Fetch the current inversion file system block. If the
+ * class storing the inversion file is empty, we don't want
+ * to do an index lookup, since index lookups choke on empty
+ * files (should be fixed someday).
+ */
+
+ if ((obj_desc->flags & IFS_ATEOF)
+ || obj_desc->heap_r->rd_nblocks == 0)
+ htup = (HeapTuple) NULL;
+ else
+ htup = inv_fetchtup(obj_desc, &b);
+
+ /* either append or replace a block, as required */
+ if (!HeapTupleIsValid(htup)) {
+ tuplen = inv_wrnew(obj_desc, buf, nbytes - nwritten);
+ } else {
+ if (obj_desc->offset > obj_desc->highbyte)
+ tuplen = inv_wrnew(obj_desc, buf, nbytes - nwritten);
+ else
+ tuplen = inv_wrold(obj_desc, buf, nbytes - nwritten, htup, b);
+ }
+
+ /* move pointers past the amount we just wrote */
+ buf += tuplen;
+ nwritten += tuplen;
+ obj_desc->offset += tuplen;
+ }
+
+ /* that's it */
+ return (nwritten);
+}
+
+/*
+ * inv_fetchtup -- Fetch an inversion file system block.
+ *
+ * This routine finds the file system block containing the offset
+ * recorded in the obj_desc structure. Later, we need to think about
+ * the effects of non-functional updates (can you rewrite the same
+ * block twice in a single transaction?), but for now, we won't bother.
+ *
+ * Parameters:
+ * obj_desc -- the object descriptor.
+ * bufP -- pointer to a buffer in the buffer cache; caller
+ * must free this.
+ *
+ * Returns:
+ * A heap tuple containing the desired block, or NULL if no
+ * such tuple exists.
+ */
+static HeapTuple
+inv_fetchtup(LargeObjectDesc *obj_desc, Buffer *bufP)
+{
+ HeapTuple htup;
+ RetrieveIndexResult res;
+ Datum d;
+ int firstbyte, lastbyte;
+ struct varlena *fsblock;
+ bool isNull;
+
+ /*
+ * If we've exhausted the current block, we need to get the next one.
+ * When we support time travel and non-functional updates, we will
+ * need to loop over the blocks, rather than just have an 'if', in
+ * order to find the one we're really interested in.
+ */
+
+ if (obj_desc->offset > obj_desc->highbyte
+ || obj_desc->offset < obj_desc->lowbyte
+ || !ItemPointerIsValid(&(obj_desc->htid))) {
+
+ /* initialize scan key if not done */
+ if (obj_desc->iscan==(IndexScanDesc)NULL) {
+ ScanKeyData skey;
+
+ ScanKeyEntryInitialize(&skey, 0x0, 1, INT4GE_PROC_OID,
+ Int32GetDatum(0));
+ obj_desc->iscan =
+ index_beginscan(obj_desc->index_r,
+ (bool) 0, (uint16) 1,
+ &skey);
+ }
+
+ do {
+ res = index_getnext(obj_desc->iscan, ForwardScanDirection);
+
+ if (res == (RetrieveIndexResult) NULL) {
+ ItemPointerSetInvalid(&(obj_desc->htid));
+ return ((HeapTuple) NULL);
+ }
+
+ /*
+ * For time travel, we need to use the actual time qual here,
+ * rather than NowTimeQual. We currently have no way to pass
+ * a time qual in.
+ */
+
+ htup = heap_fetch(obj_desc->heap_r, NowTimeQual,
+ &(res->heap_iptr), bufP);
+
+ } while (htup == (HeapTuple) NULL);
+
+ /* remember this tid -- we may need it for later reads/writes */
+ ItemPointerCopy(&(res->heap_iptr), &(obj_desc->htid));
+
+ } else {
+ htup = heap_fetch(obj_desc->heap_r, NowTimeQual,
+ &(obj_desc->htid), bufP);
+ }
+
+ /*
+ * By here, we have the heap tuple we're interested in. We cache
+ * the upper and lower bounds for this block in the object descriptor
+ * and return the tuple.
+ */
+
+ d = (Datum)heap_getattr(htup, *bufP, 1, obj_desc->hdesc, &isNull);
+ lastbyte = (int32) DatumGetInt32(d);
+ d = (Datum)heap_getattr(htup, *bufP, 2, obj_desc->hdesc, &isNull);
+ fsblock = (struct varlena *) DatumGetPointer(d);
+
+ /* order of + and - is important -- these are unsigned quantities near 0 */
+ firstbyte = (lastbyte + 1 + sizeof(fsblock->vl_len)) - fsblock->vl_len;
+
+ obj_desc->lowbyte = firstbyte;
+ obj_desc->highbyte = lastbyte;
+
+ /* done */
+ return (htup);
+}
+
+/*
+ * inv_wrnew() -- append a new filesystem block tuple to the inversion
+ * file.
+ *
+ * In response to an inv_write, we append one or more file system
+ * blocks to the class containing the large object. We violate the
+ * class abstraction here in order to pack things as densely as we
+ * are able. We examine the last page in the relation, and write
+ * just enough to fill it, assuming that it has above a certain
+ * threshold of space available. If the space available is less than
+ * the threshold, we allocate a new page by writing a big tuple.
+ *
+ * By the time we get here, we know all the parameters passed in
+ * are valid, and that we hold the appropriate lock on the heap
+ * relation.
+ *
+ * Parameters:
+ * obj_desc: large object descriptor for which to append block.
+ * buf: buffer containing data to write.
+ * nbytes: amount to write
+ *
+ * Returns:
+ * number of bytes actually written to the new tuple.
+ */
+static int
+inv_wrnew(LargeObjectDesc *obj_desc, char *buf, int nbytes)
+{
+ Relation hr;
+ HeapTuple ntup;
+ Buffer buffer;
+ Page page;
+ int nblocks;
+ int nwritten;
+
+ hr = obj_desc->heap_r;
+
+ /*
+ * Get the last block in the relation. If there's no data in the
+ * relation at all, then we just get a new block. Otherwise, we
+ * check the last block to see whether it has room to accept some
+ * or all of the data that the user wants to write. If it doesn't,
+ * then we allocate a new block.
+ */
+
+ nblocks = RelationGetNumberOfBlocks(hr);
+
+ if (nblocks > 0)
+ buffer = ReadBuffer(hr, nblocks - 1);
+ else
+ buffer = ReadBuffer(hr, P_NEW);
+
+ page = BufferGetPage(buffer);
+
+ /*
+ * If the last page is too small to hold all the data, and it's too
+ * small to hold IMINBLK, then we allocate a new page. If it will
+ * hold at least IMINBLK, but less than all the data requested, then
+ * we write just what fits here. The caller is responsible for noticing that
+ * less than the requested number of bytes were written, and calling
+ * this routine again.
+ */
+
+ nwritten = IFREESPC(page);
+ if (nwritten < nbytes) {
+ if (nwritten < IMINBLK) {
+ ReleaseBuffer(buffer);
+ buffer = ReadBuffer(hr, P_NEW);
+ page = BufferGetPage(buffer);
+ PageInit(page, BufferGetPageSize(buffer), 0);
+ if (nbytes > IMAXBLK)
+ nwritten = IMAXBLK;
+ else
+ nwritten = nbytes;
+ }
+ } else {
+ nwritten = nbytes;
+ }
+
+ /*
+ * Insert a new file system block tuple, index it, and write it out.
+ */
+
+ ntup = inv_newtuple(obj_desc, buffer, page, buf, nwritten);
+ inv_indextup(obj_desc, ntup);
+
+ /* new tuple is inserted */
+ WriteBuffer(buffer);
+
+ return (nwritten);
+}
+
+static int
+inv_wrold(LargeObjectDesc *obj_desc,
+ char *dbuf,
+ int nbytes,
+ HeapTuple htup,
+ Buffer buffer)
+{
+ Relation hr;
+ HeapTuple ntup;
+ Buffer newbuf;
+ Page page;
+ Page newpage;
+ int tupbytes;
+ Datum d;
+ struct varlena *fsblock;
+ int nwritten, nblocks, freespc;
+ bool isNull;
+ int keep_offset;
+
+ /*
+ * Since we're using a no-overwrite storage manager, the way we
+ * overwrite blocks is to mark the old block invalid and append
+ * a new block. First mark the old block invalid. This violates
+ * the tuple abstraction.
+ */
+
+ TransactionIdStore(GetCurrentTransactionId(), &(htup->t_xmax));
+ htup->t_cmax = GetCurrentCommandId();
+
+ /*
+ * If we're overwriting the entire block, we're lucky. All we need
+ * to do is to insert a new block.
+ */
+
+ if (obj_desc->offset == obj_desc->lowbyte
+ && obj_desc->lowbyte + nbytes >= obj_desc->highbyte) {
+ WriteBuffer(buffer);
+ return (inv_wrnew(obj_desc, dbuf, nbytes));
+ }
+
+ /*
+ * By here, we need to overwrite part of the data in the current
+ * tuple. In order to reduce the degree to which we fragment blocks,
+ * we guarantee that no block will be broken up due to an overwrite.
+ * This means that we need to allocate a tuple on a new page, if
+ * there's not room for the replacement on this one.
+ */
+
+ newbuf = buffer;
+ page = BufferGetPage(buffer);
+ newpage = BufferGetPage(newbuf);
+ hr = obj_desc->heap_r;
+ freespc = IFREESPC(page);
+ d = (Datum)heap_getattr(htup, buffer, 2, obj_desc->hdesc, &isNull);
+ fsblock = (struct varlena *) DatumGetPointer(d);
+ tupbytes = fsblock->vl_len - sizeof(fsblock->vl_len);
+
+ if (freespc < tupbytes) {
+
+ /*
+ * First see if there's enough space on the last page of the
+ * table to put this tuple.
+ */
+
+ nblocks = RelationGetNumberOfBlocks(hr);
+
+ if (nblocks > 0)
+ newbuf = ReadBuffer(hr, nblocks - 1);
+ else
+ newbuf = ReadBuffer(hr, P_NEW);
+
+ newpage = BufferGetPage(newbuf);
+ freespc = IFREESPC(newpage);
+
+ /*
+ * If there's no room on the last page, allocate a new last
+ * page for the table, and put it there.
+ */
+
+ if (freespc < tupbytes) {
+ ReleaseBuffer(newbuf);
+ newbuf = ReadBuffer(hr, P_NEW);
+ newpage = BufferGetPage(newbuf);
+ PageInit(newpage, BufferGetPageSize(newbuf), 0);
+ }
+ }
+
+ nwritten = nbytes;
+ if (nwritten > obj_desc->highbyte - obj_desc->offset + 1)
+ nwritten = obj_desc->highbyte - obj_desc->offset + 1;
+ memmove(VARDATA(fsblock)+ (obj_desc->offset - obj_desc->lowbyte),
+ dbuf,nwritten);
+ /* we are rewriting the entire old block, therefore
+ we reset offset to the lowbyte of the original block
+ before jumping into inv_newtuple() */
+ keep_offset = obj_desc->offset;
+ obj_desc->offset = obj_desc->lowbyte;
+ ntup = inv_newtuple(obj_desc, newbuf, newpage, VARDATA(fsblock),
+ tupbytes);
+ /* after we are done, we restore to the true offset */
+ obj_desc->offset = keep_offset;
+
+ /*
+ * By here, we have a page (newpage) that's guaranteed to have
+ * enough space on it to put the new tuple. Call inv_newtuple
+ * to do the work. Passing NULL as a buffer to inv_newtuple()
+ * keeps it from copying any data into the new tuple. When it
+ * returns, the tuple is ready to receive data from the old
+ * tuple and the user's data buffer.
+ */
+/*
+ ntup = inv_newtuple(obj_desc, newbuf, newpage, (char *) NULL, tupbytes);
+ dptr = ((char *) ntup) + ntup->t_hoff - sizeof(ntup->t_bits) + sizeof(int4)
+ + sizeof(fsblock->vl_len);
+
+ if (obj_desc->offset > obj_desc->lowbyte) {
+ memmove(dptr,
+ &(fsblock->vl_dat[0]),
+ obj_desc->offset - obj_desc->lowbyte);
+ dptr += obj_desc->offset - obj_desc->lowbyte;
+ }
+
+
+ nwritten = nbytes;
+ if (nwritten > obj_desc->highbyte - obj_desc->offset + 1)
+ nwritten = obj_desc->highbyte - obj_desc->offset + 1;
+
+ memmove(dptr, dbuf, nwritten);
+ dptr += nwritten;
+
+ if (obj_desc->offset + nwritten < obj_desc->highbyte + 1) {
+*/
+/*
+ loc = (obj_desc->highbyte - obj_desc->offset)
+ + nwritten;
+ sz = obj_desc->highbyte - (obj_desc->lowbyte + loc);
+
+ what's going on here?? - jolly
+*/
+/*
+ sz = (obj_desc->highbyte + 1) - (obj_desc->offset + nwritten);
+ memmove(&(fsblock->vl_dat[0]), dptr, sz);
+ }
+*/
+
+
+ /* index the new tuple */
+ inv_indextup(obj_desc, ntup);
+
+ /* move the scandesc forward so we don't reread the newly inserted
+ tuple on the next index scan */
+ if (obj_desc->iscan)
+ index_getnext(obj_desc->iscan, ForwardScanDirection);
+
+ /*
+ * Okay, by here, a tuple for the new block is correctly placed,
+ * indexed, and filled. Write the changed pages out.
+ */
+
+ WriteBuffer(buffer);
+ if (newbuf != buffer)
+ WriteBuffer(newbuf);
+
+ /* done */
+ return (nwritten);
+}
+
+static HeapTuple
+inv_newtuple(LargeObjectDesc *obj_desc,
+ Buffer buffer,
+ Page page,
+ char *dbuf,
+ int nwrite)
+{
+ HeapTuple ntup;
+ PageHeader ph;
+ int tupsize;
+ int hoff;
+ Offset lower;
+ Offset upper;
+ ItemId itemId;
+ OffsetNumber off;
+ OffsetNumber limit;
+ char *attptr;
+
+ /* compute tuple size -- no nulls */
+ hoff = sizeof(HeapTupleData) - sizeof(ntup->t_bits);
+
+ /* add in olastbyte, varlena.vl_len, varlena.vl_dat */
+ tupsize = hoff + (2 * sizeof(int32)) + nwrite;
+ tupsize = LONGALIGN(tupsize);
+
+ /*
+ * Allocate the tuple on the page, violating the page abstraction.
+ * This code was swiped from PageAddItem().
+ */
+
+ ph = (PageHeader) page;
+ limit = OffsetNumberNext(PageGetMaxOffsetNumber(page));
+
+ /* look for "recyclable" (unused & deallocated) ItemId */
+ for (off = FirstOffsetNumber; off < limit; off = OffsetNumberNext(off)) {
+ itemId = &ph->pd_linp[off - 1];
+ if ((((*itemId).lp_flags & LP_USED) == 0) &&
+ ((*itemId).lp_len == 0))
+ break;
+ }
+
+ if (off > limit)
+ lower = (Offset) (((char *) (&ph->pd_linp[off])) - ((char *) page));
+ else if (off == limit)
+ lower = ph->pd_lower + sizeof (ItemIdData);
+ else
+ lower = ph->pd_lower;
+
+ upper = ph->pd_upper - tupsize;
+
+ itemId = &ph->pd_linp[off - 1];
+ (*itemId).lp_off = upper;
+ (*itemId).lp_len = tupsize;
+ (*itemId).lp_flags = LP_USED;
+ ph->pd_lower = lower;
+ ph->pd_upper = upper;
+
+ ntup = (HeapTuple) ((char *) page + upper);
+
+ /*
+ * Tuple is now allocated on the page. Next, fill in the tuple
+ * header. This block of code violates the tuple abstraction.
+ */
+
+ ntup->t_len = tupsize;
+ ItemPointerSet(&(ntup->t_ctid), BufferGetBlockNumber(buffer), off);
+ ItemPointerSetInvalid(&(ntup->t_chain));
+ LastOidProcessed = ntup->t_oid = newoid();
+ TransactionIdStore(GetCurrentTransactionId(), &(ntup->t_xmin));
+ ntup->t_cmin = GetCurrentCommandId();
+ StoreInvalidTransactionId(&(ntup->t_xmax));
+ ntup->t_cmax = 0;
+ ntup->t_tmin = INVALID_ABSTIME;
+ ntup->t_tmax = CURRENT_ABSTIME;
+ ntup->t_natts = 2;
+ ntup->t_hoff = hoff;
+ ntup->t_vtype = 0;
+ ntup->t_infomask = 0x0;
+
+ /* if a NULL is passed in, avoid the calculations below */
+ if (dbuf == NULL)
+ return ntup;
+
+ /*
+ * Finally, copy the user's data buffer into the tuple. This violates
+ * the tuple and class abstractions.
+ */
+
+ attptr = ((char *) ntup) + hoff;
+ *((int32 *) attptr) = obj_desc->offset + nwrite - 1;
+ attptr += sizeof(int32);
+
+ /*
+ ** mer fixed disk layout of varlenas to get rid of the need for this.
+ **
+ ** *((int32 *) attptr) = nwrite + sizeof(int32);
+ ** attptr += sizeof(int32);
+ */
+
+ *((int32 *) attptr) = nwrite + sizeof(int32);
+ attptr += sizeof(int32);
+
+ /*
+ * If a data buffer was passed in, then copy the data from the buffer
+ * to the tuple. Some callers (eg, inv_wrold()) may not pass in a
+ * buffer, since they have to copy part of the old tuple data and
+ * part of the user's new data into the new tuple.
+ */
+
+ if (dbuf != (char *) NULL)
+ memmove(attptr, dbuf, nwrite);
+
+ /* keep track of boundary of current tuple */
+ obj_desc->lowbyte = obj_desc->offset;
+ obj_desc->highbyte = obj_desc->offset + nwrite - 1;
+
+ /* new tuple is filled -- return it */
+ return (ntup);
+}
+
+static void
+inv_indextup(LargeObjectDesc *obj_desc, HeapTuple htup)
+{
+ IndexTuple itup;
+ InsertIndexResult res;
+ Datum v[1];
+ char n[1];
+
+ n[0] = ' ';
+ v[0] = Int32GetDatum(obj_desc->highbyte);
+ itup = index_formtuple(obj_desc->idesc, &v[0], &n[0]);
+ memmove((char *)&(itup->t_tid),
+ (char *)&(htup->t_ctid),
+ sizeof(ItemPointerData));
+ res = index_insert(obj_desc->index_r, itup);
+
+ if (res)
+ pfree(res);
+
+ pfree(itup);
+}
+
+/*
+static void
+DumpPage(Page page, int blkno)
+{
+ ItemId lp;
+ HeapTuple tup;
+ int flags, i, nline;
+ ItemPointerData pointerData;
+
+ printf("\t[subblock=%d]:lower=%d:upper=%d:special=%d\n", 0,
+ ((PageHeader)page)->pd_lower, ((PageHeader)page)->pd_upper,
+ ((PageHeader)page)->pd_special);
+
+ printf("\t:MaxOffsetNumber=%d\n",
+ (int16) PageGetMaxOffsetNumber(page));
+
+ nline = (int16) PageGetMaxOffsetNumber(page);
+
+{
+ int i;
+ char *cp;
+
+ i = PageGetSpecialSize(page);
+ cp = PageGetSpecialPointer(page);
+
+ printf("\t:SpecialData=");
+
+ while (i > 0) {
+ printf(" 0x%02x", *cp);
+ cp += 1;
+ i -= 1;
+ }
+ printf("\n");
+}
+ for (i = 0; i < nline; i++) {
+ lp = ((PageHeader)page)->pd_linp + i;
+ flags = (*lp).lp_flags;
+ ItemPointerSet(&pointerData, blkno, 1 + i);
+ printf("%s:off=%d:flags=0x%x:len=%d",
+ ItemPointerFormExternal(&pointerData), (*lp).lp_off,
+ flags, (*lp).lp_len);
+
+ if (flags & LP_USED) {
+ HeapTupleData htdata;
+
+ printf(":USED");
+
+ memmove((char *) &htdata,
+ (char *) &((char *)page)[(*lp).lp_off],
+ sizeof(htdata));
+
+ tup = &htdata;
+
+ printf("\n\t:ctid=%s:oid=%d",
+ ItemPointerFormExternal(&tup->t_ctid),
+ tup->t_oid);
+ printf(":natts=%d:thoff=%d:vtype=`%c' (0x%02x):",
+ tup->t_natts,
+ tup->t_hoff, tup->t_vtype, tup->t_vtype);
+
+ printf("\n\t:tmin=%d:cmin=%u:",
+ tup->t_tmin, tup->t_cmin);
+
+ printf("xmin=%u:", tup->t_xmin);
+
+ printf("\n\t:tmax=%d:cmax=%u:",
+ tup->t_tmax, tup->t_cmax);
+
+ printf("xmax=%u:", tup->t_xmax);
+
+ printf("\n\t:chain=%s:\n",
+ ItemPointerFormExternal(&tup->t_chain));
+ } else
+ putchar('\n');
+ }
+}
+
+static char*
+ItemPointerFormExternal(ItemPointer pointer)
+{
+ static char itemPointerString[32];
+
+ if (!ItemPointerIsValid(pointer)) {
+ memmove(itemPointerString, "<-,-,->", sizeof "<-,-,->");
+ } else {
+ sprintf(itemPointerString, "<%u,%u>",
+ ItemPointerGetBlockNumber(pointer),
+ ItemPointerGetOffsetNumber(pointer));
+ }
+
+ return (itemPointerString);
+}
+*/
+
+static int
+_inv_getsize(Relation hreln, TupleDesc hdesc, Relation ireln)
+{
+ IndexScanDesc iscan;
+ RetrieveIndexResult res;
+ Buffer buf;
+ HeapTuple htup;
+ Datum d;
+ long size;
+ bool isNull;
+
+ /* scan backwards from end */
+ iscan = index_beginscan(ireln, (bool) 1, 0, (ScanKey) NULL);
+
+ buf = InvalidBuffer;
+
+ do {
+ res = index_getnext(iscan, BackwardScanDirection);
+
+ /*
+ * If there are no more index tuples, then the relation is empty,
+ * so the file's size is zero.
+ */
+
+ if (res == (RetrieveIndexResult) NULL) {
+ index_endscan(iscan);
+ return (0);
+ }
+
+ /*
+ * For time travel, we need to use the actual time qual here,
+ * rather than NowTimeQual. We currently have no way to pass
+ * a time qual in.
+ */
+
+ if (buf != InvalidBuffer)
+ (void) ReleaseBuffer(buf);
+
+ htup = heap_fetch(hreln, NowTimeQual, &(res->heap_iptr), &buf);
+
+ } while (!HeapTupleIsValid(htup));
+
+ /* don't need the index scan anymore */
+ index_endscan(iscan);
+
+ /* get olastbyte attribute */
+ d = (Datum) heap_getattr(htup, buf, 1, hdesc, &isNull);
+ size = DatumGetInt32(d) + 1;
+
+ /* wei hates it if you forget to do this */
+ ReleaseBuffer(buf);
+
+ return (size);
+}
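
That is the whole of inv_api.c. The piece that is easiest to lose track of is the byte-range bookkeeping: each heap tuple stores the offset of its last byte (olastbyte), the btree index on that attribute is what inv_seek() scans with a >= key, and inv_fetchtup() and _inv_getsize() recover everything else by arithmetic. A stripped-down restatement of that arithmetic follows, purely for illustration; block_first_byte and file_size_from_last_block are names invented here, not functions in the patch.

    static int
    block_first_byte(int lastbyte, int vl_len, int vl_len_size)
    {
        /* same expression as inv_fetchtup(); the "+ 1" is applied before the
         * subtraction because these are unsigned quantities near zero */
        return (lastbyte + 1 + vl_len_size) - vl_len;
    }

    static long
    file_size_from_last_block(int last_olastbyte)
    {
        /* as in _inv_getsize(): byte offsets are zero-based, so the file
         * size is the last block's olastbyte plus one */
        return last_olastbyte + 1;
    }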
diff --git a/src/backend/storage/lmgr.h b/src/backend/storage/lmgr.h
new file mode 100644
index 00000000000..fe87eb05546
--- /dev/null
+++ b/src/backend/storage/lmgr.h
@@ -0,0 +1,84 @@
+/*-------------------------------------------------------------------------
+ *
+ * lmgr.h--
+ * POSTGRES lock manager definitions.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: lmgr.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef LMGR_H
+#define LMGR_H
+
+#include "postgres.h"
+
+#include "storage/itemptr.h"
+#include "storage/lock.h"
+#include "utils/rel.h"
+
+/*
+ * This was moved from pladt.h for the new lock manager. Want to obsolete
+ * all of the old code.
+ */
+typedef struct LRelId {
+ Oid relId; /* a relation identifier */
+ Oid dbId; /* a database identifier */
+} LRelId;
+
+typedef struct LockInfoData {
+ bool initialized;
+ LRelId lRelId;
+ TransactionId transactionIdData;
+ uint16 flags;
+} LockInfoData;
+typedef LockInfoData *LockInfo;
+
+#define LockInfoIsValid(linfo) \
+ ((PointerIsValid(linfo)) && ((LockInfo) linfo)->initialized)
+
+
+extern LRelId RelationGetLRelId(Relation relation);
+extern Oid LRelIdGetDatabaseId(LRelId lRelId);
+extern Oid LRelIdGetRelationId(LRelId lRelId);
+extern bool DatabaseIdIsMyDatabaseId(Oid databaseId);
+extern bool LRelIdContainsMyDatabaseId(LRelId lRelId);
+extern void RelationInitLockInfo(Relation relation);
+extern void RelationDiscardLockInfo(Relation relation);
+extern void RelationSetLockForDescriptorOpen(Relation relation);
+extern void RelationSetLockForRead(Relation relation);
+extern void RelationUnsetLockForRead(Relation relation);
+extern void RelationSetLockForWrite(Relation relation);
+extern void RelationUnsetLockForWrite(Relation relation);
+extern void RelationSetLockForTupleRead(Relation relation,
+ ItemPointer itemPointer);
+
+/* used in vacuum.c */
+extern void RelationSetLockForWritePage(Relation relation,
+ ItemPointer itemPointer);
+
+/* used in nbtpage.c, hashpage.c */
+extern void RelationSetSingleWLockPage(Relation relation,
+ ItemPointer itemPointer);
+extern void RelationUnsetSingleWLockPage(Relation relation,
+ ItemPointer itemPointer);
+extern void RelationSetSingleRLockPage(Relation relation,
+ ItemPointer itemPointer);
+extern void RelationUnsetSingleRLockPage(Relation relation,
+ ItemPointer itemPointer);
+extern void RelationSetRIntentLock(Relation relation);
+extern void RelationUnsetRIntentLock(Relation relation);
+extern void RelationSetWIntentLock(Relation relation);
+extern void RelationUnsetWIntentLock(Relation relation);
+extern void RelationSetLockForExtend(Relation relation);
+extern void RelationUnsetLockForExtend(Relation relation);
+extern void LRelIdAssign(LRelId *lRelId, Oid dbId, Oid relId);
+
+/* single.c */
+extern bool SingleLockReln(LockInfo linfo, LOCKT lockt, int action);
+extern bool SingleLockPage(LockInfo linfo, ItemPointer tidPtr,
+ LOCKT lockt, int action);
+
+#endif /* LMGR_H */
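
The prototypes above are the relation-level locking entry points that the rest of this patch (inv_api.c in particular) calls. A minimal sketch of the usual calling pattern follows; it is not taken from the patch, the function name is invented, and the relation is assumed to already be open.

    void
    read_with_relation_lock(Relation rel)
    {
        /* two-phase locking: take the relation-level read lock up front */
        RelationSetLockForRead(rel);

        /* ... perform the reads against rel here ... */

        /* release it explicitly when finished with the relation */
        RelationUnsetLockForRead(rel);
    }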
diff --git a/src/backend/storage/lmgr/Makefile.inc b/src/backend/storage/lmgr/Makefile.inc
new file mode 100644
index 00000000000..ac507558b57
--- /dev/null
+++ b/src/backend/storage/lmgr/Makefile.inc
@@ -0,0 +1,14 @@
+#-------------------------------------------------------------------------
+#
+# Makefile.inc--
+# Makefile for storage/lmgr
+#
+# Copyright (c) 1994, Regents of the University of California
+#
+#
+# IDENTIFICATION
+# $Header: /cvsroot/pgsql/src/backend/storage/lmgr/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $
+#
+#-------------------------------------------------------------------------
+
+SUBSRCS+= lmgr.c lock.c multi.c proc.c single.c
diff --git a/src/backend/storage/lmgr/README b/src/backend/storage/lmgr/README
new file mode 100644
index 00000000000..e382003f2a4
--- /dev/null
+++ b/src/backend/storage/lmgr/README
@@ -0,0 +1,93 @@
+$Header: /cvsroot/pgsql/src/backend/storage/lmgr/README,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $
+
+This file is an attempt to save me (and future code maintainers) some
+time and a lot of headaches. The existing lock manager code at the time
+of this writing (June 16 1992) can best be described as confusing. The
+complexity seems inherent in lock manager functionality, but variable
+names chosen in the current implementation really confuse me every time
+I have to track down a bug. Also, what gets done where and by whom isn't
+always clear....
+
+Starting with the data structures the lock manager relies upon...
+
+(NOTE - these will undoubtedly change over time and it is likely
+that this file won't always be updated along with the structs.)
+
+The lock manager's LOCK:
+
+tag -
+ The key fields that are used for hashing locks in the shared memory
+ lock hash table. This is kept as a separate struct to ensure that we
+ always zero out the correct number of bytes. This is a problem because
+ part of the tag is an itempointer, which is 6 bytes and causes 2
+ additional bytes to be added as padding.
+
+ tag.relId -
+ Uniquely identifies the relation that the lock corresponds to.
+
+ tag.dbId -
+ Uniquely identifies the database in which the relation lives. If
+ this is a shared system relation (e.g. pg_user) the dbId should be
+ set to 0.
+
+ tag.tupleId -
+ Uniquely identifies the block/page within the relation and the
+ tuple within the block. If we are setting a table level lock
+ both the blockId and tupleId (in an item pointer this is called
+ the position) are set to invalid; if it is a page level lock the
+ blockId is valid, while the tupleId is still invalid. Finally, if
+ this is a tuple level lock (we currently never do this) then both
+ the blockId and tupleId are set to valid specifications. This is
+ how we get the appearance of a multi-level lock table while using
+ only a single table (see Gray's paper on 2 phase locking if
+ you are puzzled about how multi-level lock tables work).
+
+mask -
+ This field indicates what types of locks are currently held in the
+ given lock. It is used (against the lock table's conflict table)
+ to determine if the new lock request will conflict with existing
+ lock types held. Conflicts are determined by bitwise AND operations
+ between the mask and the conflict table entry for the given lock type
+ to be set. The current representation is that each bit (1 through 5)
+ is set when that lock type (WRITE, READ, WRITE INTENT, READ INTENT, EXTEND)
+ has been acquired for the lock.
+
+waitProcs -
+ This is a shared memory queue of all process structures corresponding to
+ backends that are waiting (sleeping) until another backend releases this
+ lock. The process structure holds the information needed to determine
+ if it should be woken up when this lock is released. If, for example,
+ we are releasing a read lock and the process is sleeping trying to acquire
+ a read lock then there is no point in waking it since the lock being
+ released isn't what caused it to sleep in the first place. There will
+ be more on this below (when I get to releasing locks and waking sleeping
+ process routines).
+
+nHolding -
+ Keeps a count of how many times acquisition of this lock has been
+ attempted. The count includes attempts by processes which were put
+ to sleep due to conflicts. It also counts the same backend twice
+ if, for example, a backend process first acquires a read and then
+ acquires a write.
+
+holders -
+ Keeps a count of how many locks of each type have been attempted. Only
+ elements 1 through MAX_LOCK_TYPES are used as they correspond to the lock
+ type defined constants (WRITE through EXTEND). Summing the values of
+ holders should come out equal to nHolding.
+
+nActive -
+ Keeps a count of how many times this lock has been successfully acquired.
+ This count does not include attempts that were rejected due to conflicts,
+ but can count the same backend twice (e.g. a read then a write -- since
+ it's the same transaction this won't cause a conflict).
+
+activeHolders -
+ Keeps a count of how many locks of each type are currently held. Once again
+ only elements 1 through MAX_LOCK_TYPES are used (0 is not). Also, like
+ holders, summing the values of activeHolders should total to the value
+ of nActive.
+
+
+This is all I had the stomach for right now..... I will get back to this
+someday. -mer 17 June 1992 12:00 am
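
One piece of the README worth restating concretely is the mask/conflict-table test: each of bits 1 through 5 of a lock's mask records that a WRITE, READ, WRITE INTENT, READ INTENT, or EXTEND lock is held, and a new request conflicts exactly when that mask shares a bit with the conflict-table entry for the requested type. The sketch below illustrates the test; the bit positions and names are illustrative stand-ins, not the actual constants from lock.h.

    #define WRITE_BIT         (1 << 1)   /* illustrative bit assignments */
    #define READ_BIT          (1 << 2)
    #define WRITE_INTENT_BIT  (1 << 3)
    #define READ_INTENT_BIT   (1 << 4)
    #define EXTEND_BIT        (1 << 5)

    static int
    lock_request_conflicts(int held_mask, int conflict_entry)
    {
        /* nonzero iff some currently held lock type conflicts with the
         * lock type being requested */
        return (held_mask & conflict_entry) != 0;
    }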
diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c
new file mode 100644
index 00000000000..bfc2f5b2eec
--- /dev/null
+++ b/src/backend/storage/lmgr/lmgr.c
@@ -0,0 +1,933 @@
+/*-------------------------------------------------------------------------
+ *
+ * lmgr.c--
+ * POSTGRES lock manager code
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/storage/lmgr/lmgr.c,v 1.1.1.1 1996/07/09 06:21:56 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+/* #define LOCKDEBUGALL 1 */
+/* #define LOCKDEBUG 1 */
+
+#ifdef LOCKDEBUGALL
+#define LOCKDEBUG 1
+#endif /* LOCKDEBUGALL */
+
+#include "postgres.h"
+
+#include "access/heapam.h"
+#include "access/htup.h"
+#include "access/relscan.h"
+#include "access/skey.h"
+#include "utils/tqual.h"
+#include "access/xact.h"
+
+#include "storage/block.h"
+#include "storage/buf.h"
+#include "storage/itemptr.h"
+#include "storage/bufpage.h"
+#include "storage/multilev.h"
+#include "storage/lmgr.h"
+
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+
+#include "catalog/catname.h"
+#include "catalog/catalog.h"
+#include "catalog/pg_class.h"
+
+#include "nodes/memnodes.h"
+#include "storage/bufmgr.h"
+#include "access/transam.h" /* for AmiTransactionId */
+
+/* ----------------
+ *
+ * ----------------
+ */
+#define MaxRetries 4 /* XXX about 1/4 minute--a hack */
+
+#define IntentReadRelationLock 0x0100
+#define ReadRelationLock 0x0200
+#define IntentWriteRelationLock 0x0400
+#define WriteRelationLock 0x0800
+#define IntentReadPageLock 0x1000
+#define ReadTupleLock 0x2000
+
+#define TupleLevelLockCountMask 0x000f
+
+#define TupleLevelLockLimit 10
+
+extern Oid MyDatabaseId;
+
+static LRelId VariableRelationLRelId = {
+ RelOid_pg_variable,
+ InvalidOid
+};
+
+/* ----------------
+ * RelationGetLRelId
+ * ----------------
+ */
+#ifdef LOCKDEBUG
+#define LOCKDEBUG_10 \
+elog(NOTICE, "RelationGetLRelId(%s) invalid lockInfo", \
+ RelationGetRelationName(relation));
+#else
+#define LOCKDEBUG_10
+#endif /* LOCKDEBUG */
+
+/*
+ * RelationGetLRelId --
+ * Returns "lock" relation identifier for a relation.
+ */
+LRelId
+RelationGetLRelId(Relation relation)
+{
+ LockInfo linfo;
+
+ /* ----------------
+ * sanity checks
+ * ----------------
+ */
+ Assert(RelationIsValid(relation));
+ linfo = (LockInfo) relation->lockInfo;
+
+ /* ----------------
+ * initialize lock info if necessary
+ * ----------------
+ */
+ if (! LockInfoIsValid(linfo)) {
+ LOCKDEBUG_10;
+ RelationInitLockInfo(relation);
+ linfo = (LockInfo) relation->lockInfo;
+ }
+
+ /* ----------------
+ * XXX hack to prevent problems during
+ * VARIABLE relation initialization
+ * ----------------
+ */
+ if (strcmp(RelationGetRelationName(relation)->data,
+ VariableRelationName) == 0) {
+ return (VariableRelationLRelId);
+ }
+
+ return (linfo->lRelId);
+}
+
+/*
+ * LRelIdGetDatabaseId --
+ * Returns database identifier for a "lock" relation identifier.
+ */
+/* ----------------
+ * LRelIdGetDatabaseId
+ *
+ * Note: The argument may not be correct, if it is not used soon
+ * after it is created.
+ * ----------------
+ */
+Oid
+LRelIdGetDatabaseId(LRelId lRelId)
+{
+ return (lRelId.dbId);
+}
+
+
+/*
+ * LRelIdGetRelationId --
+ * Returns relation identifier for a "lock" relation identifier.
+ */
+Oid
+LRelIdGetRelationId(LRelId lRelId)
+{
+ return (lRelId.relId);
+}
+
+/*
+ * DatabaseIdIsMyDatabaseId --
+ * True iff database object identifier is valid in my present database.
+ */
+bool
+DatabaseIdIsMyDatabaseId(Oid databaseId)
+{
+ return (bool)
+ (!OidIsValid(databaseId) || databaseId == MyDatabaseId);
+}
+
+/*
+ * LRelIdContainsMyDatabaseId --
+ * True iff "lock" relation identifier is valid in my present database.
+ */
+bool
+LRelIdContainsMyDatabaseId(LRelId lRelId)
+{
+ return (bool)
+ (!OidIsValid(lRelId.dbId) || lRelId.dbId == MyDatabaseId);
+}
+
+/*
+ * RelationInitLockInfo --
+ * Initializes the lock information in a relation descriptor.
+ */
+/* ----------------
+ * RelationInitLockInfo
+ *
+ * XXX processingVariable is a hack to prevent problems during
+ * VARIABLE relation initialization.
+ * ----------------
+ */
+void
+RelationInitLockInfo(Relation relation)
+{
+ LockInfo info;
+ char *relname;
+ Oid relationid;
+ bool processingVariable;
+ extern Oid MyDatabaseId; /* XXX use include */
+ extern GlobalMemory CacheCxt;
+
+ /* ----------------
+ * sanity checks
+ * ----------------
+ */
+ Assert(RelationIsValid(relation));
+ Assert(OidIsValid(RelationGetRelationId(relation)));
+
+ /* ----------------
+ * get information from relation descriptor
+ * ----------------
+ */
+ info = (LockInfo) relation->lockInfo;
+ relname = (char *) RelationGetRelationName(relation);
+ relationid = RelationGetRelationId(relation);
+ processingVariable = (strcmp(relname, VariableRelationName) == 0);
+
+ /* ----------------
+ * create a new lockinfo if not already done
+ * ----------------
+ */
+ if (! PointerIsValid(info))
+ {
+ MemoryContext oldcxt;
+
+ oldcxt = MemoryContextSwitchTo((MemoryContext)CacheCxt);
+ info = (LockInfo)palloc(sizeof(LockInfoData));
+ MemoryContextSwitchTo(oldcxt);
+ }
+ else if (processingVariable) {
+ if (IsTransactionState()) {
+ TransactionIdStore(GetCurrentTransactionId(),
+ &info->transactionIdData);
+ }
+ info->flags = 0x0;
+ return; /* prevent an infinite loop--still true? */
+ }
+ else if (info->initialized)
+ {
+ /* ------------
+ * If we've already initialized we're done.
+ * ------------
+ */
+ return;
+ }
+
+ /* ----------------
+ * initialize lockinfo.dbId and .relId appropriately
+ * ----------------
+ */
+ if (IsSharedSystemRelationName(relname))
+ LRelIdAssign(&info->lRelId, InvalidOid, relationid);
+ else
+ LRelIdAssign(&info->lRelId, MyDatabaseId, relationid);
+
+ /* ----------------
+ * store the transaction id in the lockInfo field
+ * ----------------
+ */
+ if (processingVariable)
+ TransactionIdStore(AmiTransactionId,
+ &info->transactionIdData);
+ else if (IsTransactionState())
+ TransactionIdStore(GetCurrentTransactionId(),
+ &info->transactionIdData);
+ else
+ StoreInvalidTransactionId(&(info->transactionIdData));
+
+ /* ----------------
+ * initialize rest of lockinfo
+ * ----------------
+ */
+ info->flags = 0x0;
+ info->initialized = (bool)true;
+ relation->lockInfo = (Pointer) info;
+}
+
+/* ----------------
+ * RelationDiscardLockInfo
+ * ----------------
+ */
+#ifdef LOCKDEBUG
+#define LOCKDEBUG_20 \
+elog(DEBUG, "DiscardLockInfo: NULL relation->lockInfo")
+#else
+#define LOCKDEBUG_20
+#endif /* LOCKDEBUG */
+
+/*
+ * RelationDiscardLockInfo --
+ * Discards the lock information in a relation descriptor.
+ */
+void
+RelationDiscardLockInfo(Relation relation)
+{
+ if (! LockInfoIsValid(relation->lockInfo)) {
+ LOCKDEBUG_20;
+ return;
+ }
+
+ pfree(relation->lockInfo);
+ relation->lockInfo = NULL;
+}
+
+/*
+ * RelationSetLockForDescriptorOpen --
+ * Sets read locks for a relation descriptor.
+ */
+#ifdef LOCKDEBUGALL
+#define LOCKDEBUGALL_30 \
+elog(DEBUG, "RelationSetLockForDescriptorOpen(%s[%d,%d]) called", \
+ RelationGetRelationName(relation), lRelId.dbId, lRelId.relId)
+#else
+#define LOCKDEBUGALL_30
+#endif /* LOCKDEBUGALL*/
+
+void
+RelationSetLockForDescriptorOpen(Relation relation)
+{
+ /* ----------------
+ * sanity checks
+ * ----------------
+ */
+ Assert(RelationIsValid(relation));
+ if (LockingDisabled())
+ return;
+
+ LOCKDEBUGALL_30;
+
+ /* ----------------
+ * read lock catalog tuples which compose the relation descriptor
+ * XXX race condition? XXX For now, do nothing.
+ * ----------------
+ */
+}
+
+/* ----------------
+ * RelationSetLockForRead
+ * ----------------
+ */
+#ifdef LOCKDEBUG
+#define LOCKDEBUG_40 \
+elog(DEBUG, "RelationSetLockForRead(%s[%d,%d]) called", \
+ RelationGetRelationName(relation), lRelId.dbId, lRelId.relId)
+#else
+#define LOCKDEBUG_40
+#endif /* LOCKDEBUG*/
+
+/*
+ * RelationSetLockForRead --
+ * Sets relation level read lock.
+ */
+void
+RelationSetLockForRead(Relation relation)
+{
+ LockInfo linfo;
+
+ /* ----------------
+ * sanity checks
+ * ----------------
+ */
+ Assert(RelationIsValid(relation));
+ if (LockingDisabled())
+ return;
+
+ LOCKDEBUG_40;
+
+ /* ----------------
+ * If we don't have lock info on the reln just go ahead and
+ * lock it without trying to short circuit the lock manager.
+ * ----------------
+ */
+ if (!LockInfoIsValid(relation->lockInfo))
+ {
+ RelationInitLockInfo(relation);
+ linfo = (LockInfo) relation->lockInfo;
+ linfo->flags |= ReadRelationLock;
+ MultiLockReln(linfo, READ_LOCK);
+ return;
+ }
+ else
+ linfo = (LockInfo) relation->lockInfo;
+
+ MultiLockReln(linfo, READ_LOCK);
+}
+
+/* ----------------
+ * RelationUnsetLockForRead
+ * ----------------
+ */
+#ifdef LOCKDEBUG
+#define LOCKDEBUG_50 \
+elog(DEBUG, "RelationUnsetLockForRead(%s[%d,%d]) called", \
+ RelationGetRelationName(relation), lRelId.dbId, lRelId.relId)
+#else
+#define LOCKDEBUG_50
+#endif /* LOCKDEBUG*/
+
+/*
+ * RelationUnsetLockForRead --
+ * Unsets relation level read lock.
+ */
+void
+RelationUnsetLockForRead(Relation relation)
+{
+ LockInfo linfo;
+
+ /* ----------------
+ * sanity check
+ * ----------------
+ */
+ Assert(RelationIsValid(relation));
+ if (LockingDisabled())
+ return;
+
+ linfo = (LockInfo) relation->lockInfo;
+
+ /* ----------------
+ * If we don't have lock info on the reln just go ahead and
+ * release it.
+ * ----------------
+ */
+ if (!LockInfoIsValid(linfo))
+ {
+ elog(WARN,
+ "Releasing a lock on %s with invalid lock information",
+ RelationGetRelationName(relation));
+ }
+
+ MultiReleaseReln(linfo, READ_LOCK);
+}
+
+/* ----------------
+ * RelationSetLockForWrite(relation)
+ * ----------------
+ */
+#ifdef LOCKDEBUG
+#define LOCKDEBUG_60 \
+elog(DEBUG, "RelationSetLockForWrite(%s[%d,%d]) called", \
+ RelationGetRelationName(relation), lRelId.dbId, lRelId.relId)
+#else
+#define LOCKDEBUG_60
+#endif /* LOCKDEBUG*/
+
+/*
+ * RelationSetLockForWrite --
+ * Sets relation level write lock.
+ */
+void
+RelationSetLockForWrite(Relation relation)
+{
+ LockInfo linfo;
+
+ /* ----------------
+ * sanity checks
+ * ----------------
+ */
+ Assert(RelationIsValid(relation));
+ if (LockingDisabled())
+ return;
+
+ LOCKDEBUG_60;
+
+ /* ----------------
+ * If we don't have lock info on the reln just go ahead and
+ * lock it without trying to short circuit the lock manager.
+ * ----------------
+ */
+ if (!LockInfoIsValid(relation->lockInfo))
+ {
+ RelationInitLockInfo(relation);
+ linfo = (LockInfo) relation->lockInfo;
+ linfo->flags |= WriteRelationLock;
+ MultiLockReln(linfo, WRITE_LOCK);
+ return;
+ }
+ else
+ linfo = (LockInfo) relation->lockInfo;
+
+ MultiLockReln(linfo, WRITE_LOCK);
+}
+
+/* ----------------
+ * RelationUnsetLockForWrite
+ * ----------------
+ */
+#ifdef LOCKDEBUG
+#define LOCKDEBUG_70 \
+elog(DEBUG, "RelationUnsetLockForWrite(%s[%d,%d]) called", \
+ RelationGetRelationName(relation), lRelId.dbId, lRelId.relId);
+#else
+#define LOCKDEBUG_70
+#endif /* LOCKDEBUG */
+
+/*
+ * RelationUnsetLockForWrite --
+ * Unsets relation level write lock.
+ */
+void
+RelationUnsetLockForWrite(Relation relation)
+{
+ LockInfo linfo;
+
+ /* ----------------
+ * sanity checks
+ * ----------------
+ */
+ Assert(RelationIsValid(relation));
+ if (LockingDisabled()) {
+ return;
+ }
+
+ linfo = (LockInfo) relation->lockInfo;
+
+ if (!LockInfoIsValid(linfo))
+ {
+ elog(WARN,
+ "Releasing a lock on %s with invalid lock information",
+ RelationGetRelationName(relation));
+ }
+
+ MultiReleaseReln(linfo, WRITE_LOCK);
+}
+
+/* ----------------
+ * RelationSetLockForTupleRead
+ * ----------------
+ */
+#ifdef LOCKDEBUG
+#define LOCKDEBUG_80 \
+elog(DEBUG, "RelationSetLockForTupleRead(%s[%d,%d], 0x%x) called", \
+ RelationGetRelationName(relation), lRelId.dbId, lRelId.relId, \
+ itemPointer)
+#define LOCKDEBUG_81 \
+ elog(DEBUG, "RelationSetLockForTupleRead() escalating");
+#else
+#define LOCKDEBUG_80
+#define LOCKDEBUG_81
+#endif /* LOCKDEBUG */
+
+/*
+ * RelationSetLockForTupleRead --
+ * Sets tuple level read lock.
+ */
+void
+RelationSetLockForTupleRead(Relation relation, ItemPointer itemPointer)
+{
+ LockInfo linfo;
+ TransactionId curXact;
+
+ /* ----------------
+ * sanity checks
+ * ----------------
+ */
+ Assert(RelationIsValid(relation));
+ if (LockingDisabled())
+ return;
+
+ LOCKDEBUG_80;
+
+ /* ---------------------
+ * If our lock info is invalid don't bother trying to short circuit
+ * the lock manager.
+ * ---------------------
+ */
+ if (!LockInfoIsValid(relation->lockInfo))
+ {
+ RelationInitLockInfo(relation);
+ linfo = (LockInfo) relation->lockInfo;
+ linfo->flags |=
+ IntentReadRelationLock |
+ IntentReadPageLock |
+ ReadTupleLock;
+ MultiLockTuple(linfo, itemPointer, READ_LOCK);
+ return;
+ }
+ else
+ linfo = (LockInfo) relation->lockInfo;
+
+ /* ----------------
+ * no need to set a lower granularity lock
+ * ----------------
+ */
+ curXact = GetCurrentTransactionId();
+ if ((linfo->flags & ReadRelationLock) &&
+ TransactionIdEquals(curXact, linfo->transactionIdData))
+ {
+ return;
+ }
+
+ /* ----------------
+ * If we don't already have a tuple lock this transaction
+ * ----------------
+ */
+ if (!( (linfo->flags & ReadTupleLock) &&
+ TransactionIdEquals(curXact, linfo->transactionIdData) )) {
+
+ linfo->flags |=
+ IntentReadRelationLock |
+ IntentReadPageLock |
+ ReadTupleLock;
+
+ /* clear count */
+ linfo->flags &= ~TupleLevelLockCountMask;
+
+ } else {
+ if (TupleLevelLockLimit == (TupleLevelLockCountMask &
+ linfo->flags)) {
+ LOCKDEBUG_81;
+
+ /* escalate */
+ MultiLockReln(linfo, READ_LOCK);
+
+ /* clear count */
+ linfo->flags &= ~TupleLevelLockCountMask;
+ return;
+ }
+
+ /* increment count */
+ linfo->flags =
+ (linfo->flags & ~TupleLevelLockCountMask) |
+ (1 + (TupleLevelLockCountMask & linfo->flags));
+ }
+
+ TransactionIdStore(curXact, &linfo->transactionIdData);
+
+ /* ----------------
+ * Lock the tuple.
+ * ----------------
+ */
+ MultiLockTuple(linfo, itemPointer, READ_LOCK);
+}
+
+/* ----------------
+ * RelationSetLockForReadPage
+ * ----------------
+ */
+#ifdef LOCKDEBUG
+#define LOCKDEBUG_90 \
+elog(DEBUG, "RelationSetLockForReadPage(%s[%d,%d], @%d) called", \
+ RelationGetRelationName(relation), lRelId.dbId, lRelId.relId, page);
+#else
+#define LOCKDEBUG_90
+#endif /* LOCKDEBUG*/
+
+/* ----------------
+ * RelationSetLockForWritePage
+ * ----------------
+ */
+#ifdef LOCKDEBUG
+#define LOCKDEBUG_100 \
+elog(DEBUG, "RelationSetLockForWritePage(%s[%d,%d], @%d) called", \
+ RelationGetRelationName(relation), lRelId.dbId, lRelId.relId, page);
+#else
+#define LOCKDEBUG_100
+#endif /* LOCKDEBUG */
+
+/*
+ * RelationSetLockForWritePage --
+ * Sets write lock on a page.
+ */
+void
+RelationSetLockForWritePage(Relation relation,
+ ItemPointer itemPointer)
+{
+ /* ----------------
+ * sanity checks
+ * ----------------
+ */
+ Assert(RelationIsValid(relation));
+ if (LockingDisabled())
+ return;
+
+ /* ---------------
+ * Make sure linfo is initialized
+ * ---------------
+ */
+ if (!LockInfoIsValid(relation->lockInfo))
+ RelationInitLockInfo(relation);
+
+ /* ----------------
+ * attempt to set lock
+ * ----------------
+ */
+ MultiLockPage((LockInfo) relation->lockInfo, itemPointer, WRITE_LOCK);
+}
+
+/* ----------------
+ * RelationUnsetLockForReadPage
+ * ----------------
+ */
+#ifdef LOCKDEBUG
+#define LOCKDEBUG_110 \
+elog(DEBUG, "RelationUnsetLockForReadPage(%s[%d,%d], @%d) called", \
+ RelationGetRelationName(relation), lRelId.dbId, lRelId.relId, page)
+#else
+#define LOCKDEBUG_110
+#endif /* LOCKDEBUG */
+
+/* ----------------
+ * RelationUnsetLockForWritePage
+ * ----------------
+ */
+#ifdef LOCKDEBUG
+#define LOCKDEBUG_120 \
+elog(DEBUG, "RelationUnsetLockForWritePage(%s[%d,%d], @%d) called", \
+ RelationGetRelationName(relation), lRelId.dbId, lRelId.relId, page)
+#else
+#define LOCKDEBUG_120
+#endif /* LOCKDEBUG */
+
+/*
+ * Set a single level write page lock. Assumes that you already
+ * have a write intent lock on the relation.
+ */
+void
+RelationSetSingleWLockPage(Relation relation,
+ ItemPointer itemPointer)
+{
+
+ /* ----------------
+ * sanity checks
+ * ----------------
+ */
+ Assert(RelationIsValid(relation));
+ if (LockingDisabled())
+ return;
+
+ if (!LockInfoIsValid(relation->lockInfo))
+ RelationInitLockInfo(relation);
+
+ SingleLockPage((LockInfo)relation->lockInfo, itemPointer, WRITE_LOCK, !UNLOCK);
+}
+
+/*
+ * Unset a single level write page lock
+ */
+void
+RelationUnsetSingleWLockPage(Relation relation,
+ ItemPointer itemPointer)
+{
+
+ /* ----------------
+ * sanity checks
+ * ----------------
+ */
+ Assert(RelationIsValid(relation));
+ if (LockingDisabled())
+ return;
+
+ if (!LockInfoIsValid(relation->lockInfo))
+ elog(WARN,
+ "Releasing a lock on %s with invalid lock information",
+ RelationGetRelationName(relation));
+
+ SingleLockPage((LockInfo)relation->lockInfo, itemPointer, WRITE_LOCK, UNLOCK);
+}
+
+/*
+ * Set a single level read page lock. Assumes you already have a read
+ * intent lock set on the relation.
+ */
+void
+RelationSetSingleRLockPage(Relation relation,
+ ItemPointer itemPointer)
+{
+
+ /* ----------------
+ * sanity checks
+ * ----------------
+ */
+ Assert(RelationIsValid(relation));
+ if (LockingDisabled())
+ return;
+
+ if (!LockInfoIsValid(relation->lockInfo))
+ RelationInitLockInfo(relation);
+
+ SingleLockPage((LockInfo)relation->lockInfo, itemPointer, READ_LOCK, !UNLOCK);
+}
+
+/*
+ * Unset a single level read page lock.
+ */
+void
+RelationUnsetSingleRLockPage(Relation relation,
+ ItemPointer itemPointer)
+{
+
+ /* ----------------
+ * sanity checks
+ * ----------------
+ */
+ Assert(RelationIsValid(relation));
+ if (LockingDisabled())
+ return;
+
+ if (!LockInfoIsValid(relation->lockInfo))
+ elog(WARN,
+ "Releasing a lock on %s with invalid lock information",
+ RelationGetRelationName(relation));
+
+ SingleLockPage((LockInfo)relation->lockInfo, itemPointer, READ_LOCK, UNLOCK);
+}
+
+/*
+ * Set a read intent lock on a relation.
+ *
+ * Usually these are set in a multi-level table when you are acquiring a
+ * page level lock, i.e., to acquire a lock on a page you first acquire
+ * an intent lock on the entire relation. Acquiring an intent lock alone
+ * allows one to use the single level locking routines later. Good for
+ * index scans that do a lot of page level locking.
+ */
+void
+RelationSetRIntentLock(Relation relation)
+{
+ /* -----------------
+ * Sanity check
+ * -----------------
+ */
+ Assert(RelationIsValid(relation));
+ if (LockingDisabled())
+ return;
+
+ if (!LockInfoIsValid(relation->lockInfo))
+ RelationInitLockInfo(relation);
+
+ SingleLockReln((LockInfo)relation->lockInfo, READ_LOCK+INTENT, !UNLOCK);
+}
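+
+/*
+ * Illustrative sketch (not part of the original sources): a typical
+ * caller is expected to take the relation-level intent lock first and
+ * then use the single-level page routines, as in the hypothetical
+ * helper below.  Wrapped in #if 0 because it is an example only.
+ */
+#if 0
+static void
+example_read_scan_page(Relation rel, ItemPointer tid)
+{
+    RelationSetRIntentLock(rel);           /* intent lock on the whole relation */
+    RelationSetSingleRLockPage(rel, tid);  /* read lock the page holding tid */
+    /* ... examine tuples on the page ... */
+    RelationUnsetSingleRLockPage(rel, tid);
+    RelationUnsetRIntentLock(rel);
+}
+#endif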
+
+/*
+ * Unset a read intent lock on a relation
+ */
+void
+RelationUnsetRIntentLock(Relation relation)
+{
+ /* -----------------
+ * Sanity check
+ * -----------------
+ */
+ Assert(RelationIsValid(relation));
+ if (LockingDisabled())
+ return;
+
+ if (!LockInfoIsValid(relation->lockInfo))
+ RelationInitLockInfo(relation);
+
+ SingleLockReln((LockInfo)relation->lockInfo, READ_LOCK+INTENT, UNLOCK);
+}
+
+/*
+ * Set a write intent lock on a relation. For a more complete explanation
+ * see RelationSetRIntentLock()
+ */
+void
+RelationSetWIntentLock(Relation relation)
+{
+ /* -----------------
+ * Sanity check
+ * -----------------
+ */
+ Assert(RelationIsValid(relation));
+ if (LockingDisabled())
+ return;
+
+ if (!LockInfoIsValid(relation->lockInfo))
+ RelationInitLockInfo(relation);
+
+ SingleLockReln((LockInfo)relation->lockInfo, WRITE_LOCK+INTENT, !UNLOCK);
+}
+
+/*
+ * Unset a write intent lock.
+ */
+void
+RelationUnsetWIntentLock(Relation relation)
+{
+ /* -----------------
+ * Sanity check
+ * -----------------
+ */
+ Assert(RelationIsValid(relation));
+ if (LockingDisabled())
+ return;
+
+ if (!LockInfoIsValid(relation->lockInfo))
+ RelationInitLockInfo(relation);
+
+ SingleLockReln((LockInfo)relation->lockInfo, WRITE_LOCK+INTENT, UNLOCK);
+}
+
+/*
+ * Extend locks are used primarily in tertiary storage devices such as
+ * a WORM disk jukebox. We sometimes need exclusive access to extend a
+ * file by a block.
+ */
+void
+RelationSetLockForExtend(Relation relation)
+{
+ /* -----------------
+ * Sanity check
+ * -----------------
+ */
+ Assert(RelationIsValid(relation));
+ if (LockingDisabled())
+ return;
+
+ if (!LockInfoIsValid(relation->lockInfo))
+ RelationInitLockInfo(relation);
+
+ MultiLockReln((LockInfo) relation->lockInfo, EXTEND_LOCK);
+}
+
+void
+RelationUnsetLockForExtend(Relation relation)
+{
+ /* -----------------
+ * Sanity check
+ * -----------------
+ */
+ Assert(RelationIsValid(relation));
+ if (LockingDisabled())
+ return;
+
+ if (!LockInfoIsValid(relation->lockInfo))
+ RelationInitLockInfo(relation);
+
+ MultiReleaseReln((LockInfo) relation->lockInfo, EXTEND_LOCK);
+}
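+
+/*
+ * Illustrative sketch (not part of the original sources): a storage
+ * manager would hold the extend lock only around the block-allocation
+ * step.  The helper name and its body are hypothetical.
+ */
+#if 0
+static void
+example_extend_relation(Relation relation)
+{
+    RelationSetLockForExtend(relation);    /* exclusive right to extend */
+    /* ... append one new block to the relation's file here ... */
+    RelationUnsetLockForExtend(relation);
+}
+#endif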
+
+/*
+ * Create an LRelId --- Why not just pass in a pointer to the storage?
+ */
+void
+LRelIdAssign(LRelId *lRelId, Oid dbId, Oid relId)
+{
+ lRelId->dbId = dbId;
+ lRelId->relId = relId;
+}
diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c
new file mode 100644
index 00000000000..8df898a0068
--- /dev/null
+++ b/src/backend/storage/lmgr/lock.c
@@ -0,0 +1,1020 @@
+/*-------------------------------------------------------------------------
+ *
+ * lock.c--
+ * simple lock acquisition
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/storage/lmgr/lock.c,v 1.1.1.1 1996/07/09 06:21:56 scrappy Exp $
+ *
+ * NOTES
+ * Outside modules can create a lock table and acquire/release
+ * locks. A lock table is a shared memory hash table. When
+ * a process tries to acquire a lock of a type that conflicts
+ * with existing locks, it is put to sleep using the routines
+ * in storage/lmgr/proc.c.
+ *
+ * Interface:
+ *
+ * LockAcquire(), LockRelease(), LockTabInit().
+ *
+ * LockReplace() is called only within this module and by the
+ * lkchain module. It releases a lock without looking
+ * the lock up in the lock table.
+ *
+ * NOTE: This module is used to define new lock tables. The
+ * multi-level lock table (multi.c) used by the heap
+ * access methods calls these routines. See multi.c for
+ * examples showing how to use this interface.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <stdio.h> /* for sprintf() */
+#include "storage/shmem.h"
+#include "storage/spin.h"
+#include "storage/proc.h"
+#include "storage/lock.h"
+#include "utils/hsearch.h"
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "access/xact.h"
+
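+/*
+ * Illustrative sketch (not part of the original sources): how an outside
+ * module is expected to use this interface.  The conflict/priority tables,
+ * the lock type number, and the tag values below are hypothetical; see
+ * multi.c for the real client of this module.
+ */
+#if 0
+static void
+example_lock_table_usage(void)
+{
+    /* one lock type (type 1) that conflicts only with itself */
+    static MASK conflicts[] = { 0, (1 << 1) };
+    static int  prios[]     = { 0, 1 };
+    LockTableId tabId;
+    LOCKTAG     tag;
+
+    tabId = LockTabInit("example lock table", conflicts, prios, 1);
+
+    memset(&tag, 0, sizeof(tag));   /* zero the padding bytes */
+    tag.relId = 1234;               /* hypothetical relation oid */
+    tag.dbId = 1;                   /* hypothetical database oid */
+
+    if (LockAcquire(tabId, &tag, (LOCKT) 1))    /* may sleep on conflict */
+        (void) LockRelease(tabId, &tag, (LOCKT) 1);
+}
+#endif
+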
+/*#define LOCK_MGR_DEBUG*/
+
+#ifndef LOCK_MGR_DEBUG
+
+#define LOCK_PRINT(where,tag,type)
+#define LOCK_DUMP(where,lock,type)
+#define XID_PRINT(where,xidentP)
+
+#else /* LOCK_MGR_DEBUG */
+
+#define LOCK_PRINT(where,tag,type)\
+ elog(NOTICE, "%s: rel (%d) dbid (%d) tid (%d,%d) type (%d)\n",where, \
+ tag->relId, tag->dbId, \
+ ( (tag->tupleId.ip_blkid.data[0] >= 0) ? \
+ BlockIdGetBlockNumber(&tag->tupleId.ip_blkid) : -1 ), \
+ tag->tupleId.ip_posid, \
+ type);
+
+#define LOCK_DUMP(where,lock,type)\
+ elog(NOTICE, "%s: rel (%d) dbid (%d) tid (%d,%d) nHolding (%d) holders (%d,%d,%d,%d,%d) type (%d)\n",where, \
+ lock->tag.relId, lock->tag.dbId, \
+ ((lock->tag.tupleId.ip_blkid.data[0] >= 0) ? \
+ BlockIdGetBlockNumber(&lock->tag.tupleId.ip_blkid) : -1 ), \
+ lock->tag.tupleId.ip_posid, \
+ lock->nHolding,\
+ lock->holders[1],\
+ lock->holders[2],\
+ lock->holders[3],\
+ lock->holders[4],\
+ lock->holders[5],\
+ type);
+
+#define XID_PRINT(where,xidentP)\
+ elog(NOTICE,\
+ "%s:xid (%d) pid (%d) lock (%x) nHolding (%d) holders (%d,%d,%d,%d,%d)",\
+ where,\
+ xidentP->tag.xid,\
+ xidentP->tag.pid,\
+ xidentP->tag.lock,\
+ xidentP->nHolding,\
+ xidentP->holders[1],\
+ xidentP->holders[2],\
+ xidentP->holders[3],\
+ xidentP->holders[4],\
+ xidentP->holders[5]);
+
+#endif /* LOCK_MGR_DEBUG */
+
+SPINLOCK LockMgrLock; /* in Shmem or created in CreateSpinlocks() */
+
+/* This is to simplify/speed up some bit arithmetic */
+
+static MASK BITS_OFF[MAX_LOCKTYPES];
+static MASK BITS_ON[MAX_LOCKTYPES];
+
+/* -----------------
+ * XXX Want to move this to this file
+ * -----------------
+ */
+static bool LockingIsDisabled;
+
+/* ------------------
+ * from storage/ipc/shmem.c
+ * ------------------
+ */
+extern HTAB *ShmemInitHash();
+
+/* -------------------
+ * map from tableId to the lock table structure
+ * -------------------
+ */
+static LOCKTAB *AllTables[MAX_TABLES];
+
+/* -------------------
+ * no zero-th table
+ * -------------------
+ */
+static int NumTables = 1;
+
+/* -------------------
+ * InitLocks -- Init the lock module. Create a private data
+ * structure for constructing conflict masks.
+ * -------------------
+ */
+void
+InitLocks()
+{
+ int i;
+ int bit;
+
+ bit = 1;
+ /* -------------------
+ * remember 0th locktype is invalid
+ * -------------------
+ */
+ for (i=0;i<MAX_LOCKTYPES;i++,bit <<= 1)
+ {
+ BITS_ON[i] = bit;
+ BITS_OFF[i] = ~bit;
+ }
+}
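+
+/*
+ * For illustration (not part of the original sources): after InitLocks()
+ * each BITS_ON[i] holds the single bit for lock type i and BITS_OFF[i]
+ * is its complement, e.g.
+ *
+ *	BITS_ON[1] = 0x02	BITS_OFF[1] = ~0x02
+ *	BITS_ON[2] = 0x04	BITS_OFF[2] = ~0x04
+ *
+ * so "lock->mask |= BITS_ON[lockt]" records that a lock of that type is
+ * held (see GrantLock) and "lock->mask &= BITS_OFF[lockt]" clears it
+ * again (see LockRelease).
+ */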
+
+/* -------------------
+ * LockDisable -- sets LockingIsDisabled flag to TRUE or FALSE.
+ * ------------------
+ */
+void
+LockDisable(int status)
+{
+ LockingIsDisabled = status;
+}
+
+
+/*
+ * LockTypeInit -- initialize the lock table's lock type
+ * structures
+ *
+ * Notes: just copying. Should only be called once.
+ */
+static void
+LockTypeInit(LOCKTAB *ltable,
+ MASK *conflictsP,
+ int *prioP,
+ int ntypes)
+{
+ int i;
+
+ ltable->ctl->nLockTypes = ntypes;
+ ntypes++;
+ for (i=0;i<ntypes;i++,prioP++,conflictsP++)
+ {
+ ltable->ctl->conflictTab[i] = *conflictsP;
+ ltable->ctl->prio[i] = *prioP;
+ }
+}
+
+/*
+ * LockTabInit -- initialize a lock table structure
+ *
+ * Notes:
+ * (a) a lock table has four separate entries in the binding
+ * table. This is because every shared hash table and spinlock
+ * has its name stored in the binding table at its creation. It
+ * is wasteful, in this case, but not much space is involved.
+ *
+ */
+LockTableId
+LockTabInit(char *tabName,
+ MASK *conflictsP,
+ int *prioP,
+ int ntypes)
+{
+ LOCKTAB *ltable;
+ char *shmemName;
+ HASHCTL info;
+ int hash_flags;
+ bool found;
+ int status = TRUE;
+
+ if (ntypes > MAX_LOCKTYPES)
+ {
+ elog(NOTICE,"LockTabInit: too many lock types %d greater than %d",
+ ntypes,MAX_LOCKTYPES);
+ return(INVALID_TABLEID);
+ }
+
+ if (NumTables > MAX_TABLES)
+ {
+ elog(NOTICE,
+ "LockTabInit: system limit of MAX_TABLES (%d) lock tables",
+ MAX_TABLES);
+ return(INVALID_TABLEID);
+ }
+
+ /* allocate a string for the binding table lookup */
+ shmemName = (char *) palloc((unsigned)(strlen(tabName)+32));
+ if (! shmemName)
+ {
+ elog(NOTICE,"LockTabInit: couldn't malloc string %s \n",tabName);
+ return(INVALID_TABLEID);
+ }
+
+ /* each lock table has a non-shared header */
+ ltable = (LOCKTAB *) palloc((unsigned) sizeof(LOCKTAB));
+ if (! ltable)
+ {
+ elog(NOTICE,"LockTabInit: couldn't malloc lock table %s\n",tabName);
+ (void) pfree (shmemName);
+ return(INVALID_TABLEID);
+ }
+
+ /* ------------------------
+ * find/acquire the spinlock for the table
+ * ------------------------
+ */
+ SpinAcquire(LockMgrLock);
+
+
+ /* -----------------------
+ * allocate a control structure from shared memory or attach to it
+ * if it already exists.
+ * -----------------------
+ */
+ sprintf(shmemName,"%s (ctl)",tabName);
+ ltable->ctl = (LOCKCTL *)
+ ShmemInitStruct(shmemName,(unsigned)sizeof(LOCKCTL),&found);
+
+ if (! ltable->ctl)
+ {
+ elog(FATAL,"LockTabInit: couldn't initialize %s",tabName);
+ status = FALSE;
+ }
+
+ /* ----------------
+ * we're first - initialize
+ * ----------------
+ */
+ if (! found)
+ {
+ memset(ltable->ctl, 0, sizeof(LOCKCTL));
+ ltable->ctl->masterLock = LockMgrLock;
+ ltable->ctl->tableId = NumTables;
+ }
+
+ /* --------------------
+ * other modules refer to the lock table by a tableId
+ * --------------------
+ */
+ AllTables[NumTables] = ltable;
+ NumTables++;
+ Assert(NumTables <= MAX_TABLES);
+
+ /* ----------------------
+ * allocate a hash table for the lock tags. This is used
+ * to find the different locks.
+ * ----------------------
+ */
+ info.keysize = sizeof(LOCKTAG);
+ info.datasize = sizeof(LOCK);
+ info.hash = tag_hash;
+ hash_flags = (HASH_ELEM | HASH_FUNCTION);
+
+ sprintf(shmemName,"%s (lock hash)",tabName);
+ ltable->lockHash = (HTAB *) ShmemInitHash(shmemName,
+ INIT_TABLE_SIZE,MAX_TABLE_SIZE,
+ &info,hash_flags);
+
+ Assert( ltable->lockHash->hash == tag_hash);
+ if (! ltable->lockHash)
+ {
+ elog(FATAL,"LockTabInit: couldn't initialize %s",tabName);
+ status = FALSE;
+ }
+
+ /* -------------------------
+ * allocate an xid table. When different transactions hold
+ * the same lock, additional information must be saved (locks per tx).
+ * -------------------------
+ */
+ info.keysize = XID_TAGSIZE;
+ info.datasize = sizeof(XIDLookupEnt);
+ info.hash = tag_hash;
+ hash_flags = (HASH_ELEM | HASH_FUNCTION);
+
+ sprintf(shmemName,"%s (xid hash)",tabName);
+ ltable->xidHash = (HTAB *) ShmemInitHash(shmemName,
+ INIT_TABLE_SIZE,MAX_TABLE_SIZE,
+ &info,hash_flags);
+
+ if (! ltable->xidHash)
+ {
+ elog(FATAL,"LockTabInit: couldn't initialize %s",tabName);
+ status = FALSE;
+ }
+
+ /* init ctl data structures */
+ LockTypeInit(ltable, conflictsP, prioP, ntypes);
+
+ SpinRelease(LockMgrLock);
+
+ (void) pfree (shmemName);
+
+ if (status)
+ return(ltable->ctl->tableId);
+ else
+ return(INVALID_TABLEID);
+}
+
+/*
+ * LockTabRename -- allocate another tableId to the same
+ * lock table.
+ *
+ * NOTES: Both the lock module and the lock chain (lchain.c)
+ * module use table id's to distinguish between different
+ * kinds of locks. Short term and long term locks look
+ * the same to the lock table, but are handled differently
+ * by the lock chain manager. This function allows the
+ * client to use different tableIds when acquiring/releasing
+ * short term and long term locks.
+ */
+LockTableId
+LockTabRename(LockTableId tableId)
+{
+ LockTableId newTableId;
+
+ if (NumTables >= MAX_TABLES)
+ {
+ return(INVALID_TABLEID);
+ }
+ if (AllTables[tableId] == INVALID_TABLEID)
+ {
+ return(INVALID_TABLEID);
+ }
+
+ /* other modules refer to the lock table by a tableId */
+ newTableId = NumTables;
+ NumTables++;
+
+ AllTables[newTableId] = AllTables[tableId];
+ return(newTableId);
+}
+
+/*
+ * LockAcquire -- Check for lock conflicts, sleep if conflict found,
+ * set lock if/when no conflicts.
+ *
+ * Returns: TRUE if parameters are correct, FALSE otherwise.
+ *
+ * Side Effects: The lock is always acquired. No way to abort
+ * a lock acquisition other than aborting the transaction.
+ * Lock is recorded in the lkchain.
+ */
+bool
+LockAcquire(LockTableId tableId, LOCKTAG *lockName, LOCKT lockt)
+{
+ XIDLookupEnt *result,item;
+ HTAB *xidTable;
+ bool found;
+ LOCK *lock = NULL;
+ SPINLOCK masterLock;
+ LOCKTAB *ltable;
+ int status;
+ TransactionId myXid;
+
+ Assert (tableId < NumTables);
+ ltable = AllTables[tableId];
+ if (!ltable)
+ {
+ elog(NOTICE,"LockAcquire: bad lock table %d",tableId);
+ return (FALSE);
+ }
+
+ if (LockingIsDisabled)
+ {
+ return(TRUE);
+ }
+
+ LOCK_PRINT("Acquire",lockName,lockt);
+ masterLock = ltable->ctl->masterLock;
+
+ SpinAcquire(masterLock);
+
+ Assert( ltable->lockHash->hash == tag_hash);
+ lock = (LOCK *)hash_search(ltable->lockHash,(Pointer)lockName,HASH_ENTER,&found);
+
+ if (! lock)
+ {
+ SpinRelease(masterLock);
+ elog(FATAL,"LockAcquire: lock table %d is corrupted",tableId);
+ return(FALSE);
+ }
+
+ /* --------------------
+ * if there was nothing else there, complete initialization
+ * --------------------
+ */
+ if (! found)
+ {
+ lock->mask = 0;
+ ProcQueueInit(&(lock->waitProcs));
+ memset((char *)lock->holders, 0, sizeof(int)*MAX_LOCKTYPES);
+ memset((char *)lock->activeHolders, 0, sizeof(int)*MAX_LOCKTYPES);
+ lock->nHolding = 0;
+ lock->nActive = 0;
+
+ Assert(BlockIdEquals(&(lock->tag.tupleId.ip_blkid),
+ &(lockName->tupleId.ip_blkid)));
+
+ }
+
+ /* ------------------
+ * add an element to the lock queue so that we can clear the
+ * locks at end of transaction.
+ * ------------------
+ */
+ xidTable = ltable->xidHash;
+ myXid = GetCurrentTransactionId();
+
+ /* ------------------
+ * Zero out all of the tag bytes (this clears the padding bytes for long
+ * word alignment and ensures hashing consistency).
+ * ------------------
+ */
+ memset(&item, 0, XID_TAGSIZE);
+ TransactionIdStore(myXid, &item.tag.xid);
+ item.tag.lock = MAKE_OFFSET(lock);
+#if 0
+ item.tag.pid = MyPid;
+#endif
+
+ result = (XIDLookupEnt *)hash_search(xidTable, (Pointer)&item, HASH_ENTER, &found);
+ if (!result)
+ {
+ elog(NOTICE,"LockAcquire: xid table corrupted");
+ return(STATUS_ERROR);
+ }
+ if (!found)
+ {
+ XID_PRINT("queueing XidEnt LockAcquire:", result);
+ ProcAddLock(&result->queue);
+ result->nHolding = 0;
+ memset((char *)result->holders, 0, sizeof(int)*MAX_LOCKTYPES);
+ }
+
+ /* ----------------
+ * lock->nHolding tells us how many processes have _tried_ to
+ * acquire this lock, regardless of whether they succeeded or
+ * failed in doing so.
+ * ----------------
+ */
+ lock->nHolding++;
+ lock->holders[lockt]++;
+
+ /* --------------------
+ * If I'm the only one holding a lock, then there
+ * cannot be a conflict. Need to subtract one from the
+ * lock's count since we just bumped the count up by 1
+ * above.
+ * --------------------
+ */
+ if (result->nHolding == lock->nActive)
+ {
+ result->holders[lockt]++;
+ result->nHolding++;
+ GrantLock(lock, lockt);
+ SpinRelease(masterLock);
+ return(TRUE);
+ }
+
+ Assert(result->nHolding <= lock->nActive);
+
+ status = LockResolveConflicts(ltable, lock, lockt, myXid);
+
+ if (status == STATUS_OK)
+ {
+ GrantLock(lock, lockt);
+ }
+ else if (status == STATUS_FOUND)
+ {
+ status = WaitOnLock(ltable, tableId, lock, lockt);
+ XID_PRINT("Someone granted me the lock", result);
+ }
+
+ SpinRelease(masterLock);
+
+ return(status == STATUS_OK);
+}
+
+/* ----------------------------
+ * LockResolveConflicts -- test for lock conflicts
+ *
+ * NOTES:
+ * Here's what makes this complicated: one transaction's
+ * locks don't conflict with one another. When many processes
+ * hold locks, each has to subtract off its own locks when
+ * determining whether or not a newly requested lock conflicts with
+ * the locks already held by others.
+ *
+ * For example, if I am already holding a WRITE_INTENT lock,
+ * there will not be a conflict with my own READ_LOCK. If I
+ * don't consider the intent lock when checking for conflicts,
+ * I find no conflict.
+ * ----------------------------
+ */
+int
+LockResolveConflicts(LOCKTAB *ltable,
+ LOCK *lock,
+ LOCKT lockt,
+ TransactionId xid)
+{
+ XIDLookupEnt *result,item;
+ int *myHolders;
+ int nLockTypes;
+ HTAB *xidTable;
+ bool found;
+ int bitmask;
+ int i,tmpMask;
+
+ nLockTypes = ltable->ctl->nLockTypes;
+ xidTable = ltable->xidHash;
+
+ /* ---------------------
+ * read my own statistics from the xid table. If there
+ * isn't an entry, then we'll just add one.
+ *
+ * Zero out the tag, this clears the padding bytes for long
+ * word alignment and ensures hashing consistency.
+ * ------------------
+ */
+ memset(&item, 0, XID_TAGSIZE);
+ TransactionIdStore(xid, &item.tag.xid);
+ item.tag.lock = MAKE_OFFSET(lock);
+#if 0
+ item.tag.pid = pid;
+#endif
+
+ if (! (result = (XIDLookupEnt *)
+ hash_search(xidTable, (Pointer)&item, HASH_ENTER, &found)))
+ {
+ elog(NOTICE,"LockResolveConflicts: xid table corrupted");
+ return(STATUS_ERROR);
+ }
+ myHolders = result->holders;
+
+ if (! found)
+ {
+ /* ---------------
+ * we're not holding any type of lock yet. Clear
+ * the lock stats.
+ * ---------------
+ */
+ memset(result->holders, 0, nLockTypes * sizeof(*(lock->holders)));
+ result->nHolding = 0;
+ }
+
+ /* ----------------------------
+ * first check for global conflicts: If no locks conflict
+ * with mine, then I get the lock.
+ *
+ * Checking for conflict: lock->mask represents the types of
+ * currently held locks. conflictTable[lockt] has a bit
+ * set for each type of lock that conflicts with mine. Bitwise
+ * compare tells if there is a conflict.
+ * ----------------------------
+ */
+ if (! (ltable->ctl->conflictTab[lockt] & lock->mask))
+ {
+
+ result->holders[lockt]++;
+ result->nHolding++;
+
+ XID_PRINT("Conflict Resolved: updated xid entry stats", result);
+
+ return(STATUS_OK);
+ }
+
+ /* ------------------------
+ * Rats. Something conflicts. But it could still be my own
+ * lock. We have to construct a conflict mask
+ * that does not reflect our own locks.
+ * ------------------------
+ */
+ bitmask = 0;
+ tmpMask = 2;
+ for (i=1;i<=nLockTypes;i++, tmpMask <<= 1)
+ {
+ if (lock->activeHolders[i] - myHolders[i])
+ {
+ bitmask |= tmpMask;
+ }
+ }
+
+ /* ------------------------
+ * now check again for conflicts. 'bitmask' describes the types
+ * of locks held by other processes. If one of these
+ * conflicts with the kind of lock that I want, there is a
+ * conflict and I have to sleep.
+ * ------------------------
+ */
+ if (! (ltable->ctl->conflictTab[lockt] & bitmask))
+ {
+
+ /* no conflict. Get the lock and go on */
+
+ result->holders[lockt]++;
+ result->nHolding++;
+
+ XID_PRINT("Conflict Resolved: updated xid entry stats", result);
+
+ return(STATUS_OK);
+
+ }
+
+ return(STATUS_FOUND);
+}
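+
+/*
+ * Worked example (illustrative, not part of the original sources),
+ * assuming the conflict table defined in multi.c: suppose this
+ * transaction already holds a WRITE_INTENT lock on the object and now
+ * requests READ_LOCK.  lock->mask has the WRITE_INTENT bit set and
+ * conflictTab[READ_LOCK] includes that bit, so the first (global) test
+ * above reports a conflict.  The per-type loop then finds
+ *
+ *	activeHolders[WRITE_INTENT] - myHolders[WRITE_INTENT] == 0
+ *
+ * so that bit is left out of 'bitmask', the second test finds no
+ * conflict, and the lock is granted without sleeping.
+ */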
+
+int
+WaitOnLock(LOCKTAB *ltable, LockTableId tableId, LOCK *lock, LOCKT lockt)
+{
+ PROC_QUEUE *waitQueue = &(lock->waitProcs);
+
+ int prio = ltable->ctl->prio[lockt];
+
+ /* the waitqueue is ordered by priority. I insert myself
+ * according to the priority of the lock I am acquiring.
+ *
+ * SYNC NOTE: I am assuming that the lock table spinlock
+ * is sufficient synchronization for this queue. That
+ * will not be true if/when people can be deleted from
+ * the queue by a SIGINT or something.
+ */
+ LOCK_DUMP("WaitOnLock: sleeping on lock", lock, lockt);
+ if (ProcSleep(waitQueue,
+ ltable->ctl->masterLock,
+ lockt,
+ prio,
+ lock) != NO_ERROR)
+ {
+ /* -------------------
+ * This could have happened as a result of a deadlock; see HandleDeadLock().
+ * Decrement the lock's nHolding and holders fields as we are no longer
+ * waiting on this lock.
+ * -------------------
+ */
+ lock->nHolding--;
+ lock->holders[lockt]--;
+ LOCK_DUMP("WaitOnLock: aborting on lock", lock, lockt);
+ SpinRelease(ltable->ctl->masterLock);
+ elog(WARN,"WaitOnLock: error on wakeup - Aborting this transaction");
+ }
+
+ return(STATUS_OK);
+}
+
+/*
+ * LockRelease -- look up 'lockName' in lock table 'tableId' and
+ * release it.
+ *
+ * Side Effects: if the lock no longer conflicts with the highest
+ * priority waiting process, that process is granted the lock
+ * and awoken. (We have to grant the lock here to avoid a
+ * race between the waking process and any new process that
+ * comes along and requests the lock.)
+ */
+bool
+LockRelease(LockTableId tableId, LOCKTAG *lockName, LOCKT lockt)
+{
+ LOCK *lock = NULL;
+ SPINLOCK masterLock;
+ bool found;
+ LOCKTAB *ltable;
+ XIDLookupEnt *result,item;
+ HTAB *xidTable;
+ bool wakeupNeeded = true;
+
+ Assert (tableId < NumTables);
+ ltable = AllTables[tableId];
+ if (!ltable) {
+ elog(NOTICE, "ltable is null in LockRelease");
+ return (FALSE);
+ }
+
+ if (LockingIsDisabled)
+ {
+ return(TRUE);
+ }
+
+ LOCK_PRINT("Release",lockName,lockt);
+
+ masterLock = ltable->ctl->masterLock;
+ xidTable = ltable->xidHash;
+
+ SpinAcquire(masterLock);
+
+ Assert( ltable->lockHash->hash == tag_hash);
+ lock = (LOCK *)
+ hash_search(ltable->lockHash,(Pointer)lockName,HASH_FIND_SAVE,&found);
+
+ /* let the caller print its own error message, too.
+ * Do not elog(WARN).
+ */
+ if (! lock)
+ {
+ SpinRelease(masterLock);
+ elog(NOTICE,"LockRelease: locktable corrupted");
+ return(FALSE);
+ }
+
+ if (! found)
+ {
+ SpinRelease(masterLock);
+ elog(NOTICE,"LockRelease: locktable lookup failed, no lock");
+ return(FALSE);
+ }
+
+ Assert(lock->nHolding > 0);
+
+ /*
+ * fix the general lock stats
+ */
+ lock->nHolding--;
+ lock->holders[lockt]--;
+ lock->nActive--;
+ lock->activeHolders[lockt]--;
+
+ Assert(lock->nActive >= 0);
+
+ if (! lock->nHolding)
+ {
+ /* ------------------
+ * if there's no one waiting in the queue,
+ * we just released the last lock.
+ * Delete it from the lock table.
+ * ------------------
+ */
+ Assert( ltable->lockHash->hash == tag_hash);
+ lock = (LOCK *) hash_search(ltable->lockHash,
+ (Pointer) &(lock->tag),
+ HASH_REMOVE_SAVED,
+ &found);
+ Assert(lock && found);
+ wakeupNeeded = false;
+ }
+
+ /* ------------------
+ * Zero out all of the tag bytes (this clears the padding bytes for long
+ * word alignment and ensures hashing consistency).
+ * ------------------
+ */
+ memset(&item, 0, XID_TAGSIZE);
+
+ TransactionIdStore(GetCurrentTransactionId(), &item.tag.xid);
+ item.tag.lock = MAKE_OFFSET(lock);
+#if 0
+ item.tag.pid = MyPid;
+#endif
+
+ if (! ( result = (XIDLookupEnt *) hash_search(xidTable,
+ (Pointer)&item,
+ HASH_FIND_SAVE,
+ &found) )
+ || !found)
+ {
+ SpinRelease(masterLock);
+ elog(NOTICE,"LockReplace: xid table corrupted");
+ return(FALSE);
+ }
+ /*
+ * now check to see if I have any private locks. If I do,
+ * decrement the counts associated with them.
+ */
+ result->holders[lockt]--;
+ result->nHolding--;
+
+ XID_PRINT("LockRelease updated xid stats", result);
+
+ /*
+ * If this was my last hold on this lock, delete my entry
+ * in the XID table.
+ */
+ if (! result->nHolding)
+ {
+ if (result->queue.next != INVALID_OFFSET)
+ SHMQueueDelete(&result->queue);
+ if (! (result = (XIDLookupEnt *)
+ hash_search(xidTable, (Pointer)&item, HASH_REMOVE_SAVED, &found)) ||
+ ! found)
+ {
+ SpinRelease(masterLock);
+ elog(NOTICE,"LockReplace: xid table corrupted");
+ return(FALSE);
+ }
+ }
+
+ /* --------------------------
+ * If there are still active locks of the type I just released, no one
+ * should be woken up. Whoever is asleep will still conflict
+ * with the remaining locks.
+ * --------------------------
+ */
+ if (! (lock->activeHolders[lockt]))
+ {
+ /* change the conflict mask. No more of this lock type. */
+ lock->mask &= BITS_OFF[lockt];
+ }
+
+ if (wakeupNeeded)
+ {
+ /* --------------------------
+ * Wake the first waiting process and grant him the lock if it
+ * doesn't conflict. The woken process must record the lock
+ * himself.
+ * --------------------------
+ */
+ (void) ProcLockWakeup(&(lock->waitProcs), (char *) ltable, (char *) lock);
+ }
+
+ SpinRelease(masterLock);
+ return(TRUE);
+}
+
+/*
+ * GrantLock -- update the lock data structure to show
+ * the new lock holder.
+ */
+void
+GrantLock(LOCK *lock, LOCKT lockt)
+{
+ lock->nActive++;
+ lock->activeHolders[lockt]++;
+ lock->mask |= BITS_ON[lockt];
+}
+
+bool
+LockReleaseAll(LockTableId tableId, SHM_QUEUE *lockQueue)
+{
+ PROC_QUEUE *waitQueue;
+ int done;
+ XIDLookupEnt *xidLook = NULL;
+ XIDLookupEnt *tmp = NULL;
+ SHMEM_OFFSET end = MAKE_OFFSET(lockQueue);
+ SPINLOCK masterLock;
+ LOCKTAB *ltable;
+ int i,nLockTypes;
+ LOCK *lock;
+ bool found;
+
+ Assert (tableId < NumTables);
+ ltable = AllTables[tableId];
+ if (!ltable)
+ return (FALSE);
+
+ nLockTypes = ltable->ctl->nLockTypes;
+ masterLock = ltable->ctl->masterLock;
+
+ if (SHMQueueEmpty(lockQueue))
+ return TRUE;
+
+ SHMQueueFirst(lockQueue,(Pointer*)&xidLook,&xidLook->queue);
+
+ XID_PRINT("LockReleaseAll:", xidLook);
+
+ SpinAcquire(masterLock);
+ for (;;)
+ {
+ /* ---------------------------
+ * XXX Here we assume the shared memory queue is circular and
+ * that we know its internal structure. Should have some sort of
+ * macros to allow one to walk it. mer 20 July 1991
+ * ---------------------------
+ */
+ done = (xidLook->queue.next == end);
+ lock = (LOCK *) MAKE_PTR(xidLook->tag.lock);
+
+ LOCK_PRINT("ReleaseAll",(&lock->tag),0);
+
+ /* ------------------
+ * fix the general lock stats
+ * ------------------
+ */
+ if (lock->nHolding != xidLook->nHolding)
+ {
+ lock->nHolding -= xidLook->nHolding;
+ lock->nActive -= xidLook->nHolding;
+ Assert(lock->nActive >= 0);
+ for (i=1; i<=nLockTypes; i++)
+ {
+ lock->holders[i] -= xidLook->holders[i];
+ lock->activeHolders[i] -= xidLook->holders[i];
+ if (! lock->activeHolders[i])
+ lock->mask &= BITS_OFF[i];
+ }
+ }
+ else
+ {
+ /* --------------
+ * set nHolding to zero so that we can garbage collect the lock
+ * down below...
+ * --------------
+ */
+ lock->nHolding = 0;
+ }
+ /* ----------------
+ * always remove the xidLookup entry, we're done with it now
+ * ----------------
+ */
+ if ((! hash_search(ltable->xidHash, (Pointer)xidLook, HASH_REMOVE, &found))
+ || !found)
+ {
+ SpinRelease(masterLock);
+ elog(NOTICE,"LockReplace: xid table corrupted");
+ return(FALSE);
+ }
+
+ if (! lock->nHolding)
+ {
+ /* --------------------
+ * if there's no one waiting in the queue, we've just released
+ * the last lock.
+ * --------------------
+ */
+
+ Assert( ltable->lockHash->hash == tag_hash);
+ lock = (LOCK *)
+ hash_search(ltable->lockHash,(Pointer)&(lock->tag),HASH_REMOVE, &found);
+ if ((! lock) || (!found))
+ {
+ SpinRelease(masterLock);
+ elog(NOTICE,"LockReplace: cannot remove lock from HTAB");
+ return(FALSE);
+ }
+ }
+ else
+ {
+ /* --------------------
+ * Wake the first waiting process and grant him the lock if it
+ * doesn't conflict. The woken process must record the lock
+ * him/herself.
+ * --------------------
+ */
+ waitQueue = &(lock->waitProcs);
+ (void) ProcLockWakeup(waitQueue, (char *) ltable, (char *) lock);
+ }
+
+ if (done)
+ break;
+ SHMQueueFirst(&xidLook->queue,(Pointer*)&tmp,&tmp->queue);
+ xidLook = tmp;
+ }
+ SpinRelease(masterLock);
+ SHMQueueInit(lockQueue);
+ return TRUE;
+}
+
+int
+LockShmemSize()
+{
+ int size = 0;
+ int nLockBuckets, nLockSegs;
+ int nXidBuckets, nXidSegs;
+
+ nLockBuckets = 1 << (int)my_log2((NLOCKENTS - 1) / DEF_FFACTOR + 1);
+ nLockSegs = 1 << (int)my_log2((nLockBuckets - 1) / DEF_SEGSIZE + 1);
+
+ nXidBuckets = 1 << (int)my_log2((NLOCKS_PER_XACT-1) / DEF_FFACTOR + 1);
+ nXidSegs = 1 << (int)my_log2((nXidBuckets - 1) / DEF_SEGSIZE + 1);
+
+ size += MAXALIGN(NBACKENDS * sizeof(PROC)); /* each MyProc */
+ size += MAXALIGN(NBACKENDS * sizeof(LOCKCTL)); /* each ltable->ctl */
+ size += MAXALIGN(sizeof(PROC_HDR)); /* ProcGlobal */
+
+ size += MAXALIGN(my_log2(NLOCKENTS) * sizeof(void *));
+ size += MAXALIGN(sizeof(HHDR));
+ size += nLockSegs * MAXALIGN(DEF_SEGSIZE * sizeof(SEGMENT));
+ size += NLOCKENTS * /* XXX not multiple of BUCKET_ALLOC_INCR? */
+ (MAXALIGN(sizeof(BUCKET_INDEX)) +
+ MAXALIGN(sizeof(LOCK))); /* contains hash key */
+
+ size += MAXALIGN(my_log2(NBACKENDS) * sizeof(void *));
+ size += MAXALIGN(sizeof(HHDR));
+ size += nXidSegs * MAXALIGN(DEF_SEGSIZE * sizeof(SEGMENT));
+ size += NBACKENDS * /* XXX not multiple of BUCKET_ALLOC_INCR? */
+ (MAXALIGN(sizeof(BUCKET_INDEX)) +
+ MAXALIGN(sizeof(XIDLookupEnt))); /* contains hash key */
+
+ return size;
+}
+
+/* -----------------
+ * Boolean function to determine current locking status
+ * -----------------
+ */
+bool
+LockingDisabled()
+{
+ return LockingIsDisabled;
+}
diff --git a/src/backend/storage/lmgr/multi.c b/src/backend/storage/lmgr/multi.c
new file mode 100644
index 00000000000..c1702d18cb8
--- /dev/null
+++ b/src/backend/storage/lmgr/multi.c
@@ -0,0 +1,415 @@
+/*-------------------------------------------------------------------------
+ *
+ * multi.c--
+ * multi level lock table manager
+ *
+ * Standard multi-level lock manager as per the Gray paper
+ * (at least, that is what it is supposed to be). We implement
+ * three levels -- RELN, PAGE, TUPLE. Tuple is actually a TID,
+ * a physical record pointer; it isn't an object id.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/storage/lmgr/Attic/multi.c,v 1.1.1.1 1996/07/09 06:21:56 scrappy Exp $
+ *
+ * NOTES:
+ * (1) The lock.c module assumes that the caller here is doing
+ * two phase locking.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <stdio.h>
+#include <string.h>
+#include "storage/lmgr.h"
+#include "storage/multilev.h"
+
+#include "utils/rel.h"
+#include "utils/elog.h"
+#include "miscadmin.h" /* MyDatabaseId */
+
+
+/*
+ * INTENT indicates to higher level that a lower level lock has been
+ * set. For example, a write lock on a tuple conflicts with a write
+ * lock on a relation. This conflict is detected as a WRITE_INTENT/
+ * WRITE conflict between the tuple's intent lock and the relation's
+ * write lock.
+ */
+static int MultiConflicts[] = {
+ (int)NULL,
+ /* All reads and writes at any level conflict with a write lock */
+ (1 << WRITE_LOCK)|(1 << WRITE_INTENT)|(1 << READ_LOCK)|(1 << READ_INTENT),
+ /* read locks conflict with write locks at curr and lower levels */
+ (1 << WRITE_LOCK)| (1 << WRITE_INTENT),
+ /* write intent locks */
+ (1 << READ_LOCK) | (1 << WRITE_LOCK),
+ /* read intent locks*/
+ (1 << WRITE_LOCK),
+ /* extend locks for archive storage manager conflict only w/extend locks */
+ (1 << EXTEND_LOCK)
+};
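+
+/*
+ * Worked example (illustrative, not part of the original sources): a
+ * tuple-level write leaves a WRITE_LOCK + INTENT (i.e. WRITE_INTENT)
+ * lock on the relation.  A later request for a relation-level WRITE_LOCK
+ * checks MultiConflicts[WRITE_LOCK], which includes (1 << WRITE_INTENT),
+ * so the relation-level writer is queued until the tuple writer releases
+ * its locks.
+ */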
+
+/*
+ * write locks have higher priority than read locks and extend locks. May
+ * want to treat INTENT locks differently.
+ */
+static int MultiPrios[] = {
+ (int)NULL,
+ 2,
+ 1,
+ 2,
+ 1,
+ 1
+};
+
+/*
+ * Lock table identifier for this lock table. The multi-level
+ * lock table is ONE lock table, not three.
+ */
+LockTableId MultiTableId = (LockTableId)NULL;
+LockTableId ShortTermTableId = (LockTableId)NULL;
+
+/*
+ * Create the lock table described by MultiConflicts and MultiPrios.
+ */
+LockTableId
+InitMultiLevelLockm()
+{
+ int tableId;
+
+ /* -----------------------
+ * If we're already initialized just return the table id.
+ * -----------------------
+ */
+ if (MultiTableId)
+ return MultiTableId;
+
+ tableId = LockTabInit("LockTable", MultiConflicts, MultiPrios, 5);
+ MultiTableId = tableId;
+ if (! (MultiTableId)) {
+ elog(WARN,"InitMultiLockm: couldnt initialize lock table");
+ }
+ /* -----------------------
+ * No short term lock table for now. -Jeff 15 July 1991
+ *
+ * ShortTermTableId = LockTabRename(tableId);
+ * if (! (ShortTermTableId)) {
+ * elog(WARN,"InitMultiLockm: couldnt rename lock table");
+ * }
+ * -----------------------
+ */
+ return MultiTableId;
+}
+
+/*
+ * MultiLockReln -- lock a relation
+ *
+ * Returns: TRUE if the lock can be set, FALSE otherwise.
+ */
+bool
+MultiLockReln(LockInfo linfo, LOCKT lockt)
+{
+ LOCKTAG tag;
+
+ /* LOCKTAG has two bytes of padding, unfortunately. The
+ * hash function will return miss if the padding bytes aren't
+ * zero'd.
+ */
+ memset(&tag,0,sizeof(tag));
+ tag.relId = linfo->lRelId.relId;
+ tag.dbId = linfo->lRelId.dbId;
+ return(MultiAcquire(MultiTableId, &tag, lockt, RELN_LEVEL));
+}
+
+/*
+ * MultiLockTuple -- Lock the TID associated with a tuple
+ *
+ * Returns: TRUE if lock is set, FALSE otherwise.
+ *
+ * Side Effects: causes intention level locks to be set
+ * at the page and relation level.
+ */
+bool
+MultiLockTuple(LockInfo linfo, ItemPointer tidPtr, LOCKT lockt)
+{
+ LOCKTAG tag;
+
+ /* LOCKTAG has two bytes of padding, unfortunately. The
+ * hash function will return miss if the padding bytes aren't
+ * zero'd.
+ */
+ memset(&tag,0,sizeof(tag));
+
+ tag.relId = linfo->lRelId.relId;
+ tag.dbId = linfo->lRelId.dbId;
+
+ /* not locking any valid Tuple, just the page */
+ tag.tupleId = *tidPtr;
+ return(MultiAcquire(MultiTableId, &tag, lockt, TUPLE_LEVEL));
+}
+
+/*
+ * same as above at page level
+ */
+bool
+MultiLockPage(LockInfo linfo, ItemPointer tidPtr, LOCKT lockt)
+{
+ LOCKTAG tag;
+
+ /* LOCKTAG has two bytes of padding, unfortunately. The
+ * hash function will return miss if the padding bytes aren't
+ * zero'd.
+ */
+ memset(&tag,0,sizeof(tag));
+
+
+ /* ----------------------------
+ * Now we want to set the page offset to be invalid
+ * and lock the block. There is some confusion here as to what
+ * a page is. In Postgres a page is an 8k block; however, this
+ * block may be partitioned into many subpages, which are sometimes
+ * also called pages. The term is overloaded, so don't be fooled
+ * when we say lock the page we mean the 8k block. -Jeff 16 July 1991
+ * ----------------------------
+ */
+ tag.relId = linfo->lRelId.relId;
+ tag.dbId = linfo->lRelId.dbId;
+ BlockIdCopy(&(tag.tupleId.ip_blkid), &(tidPtr->ip_blkid));
+ return(MultiAcquire(MultiTableId, &tag, lockt, PAGE_LEVEL));
+}
+
+/*
+ * MultiAcquire -- acquire multi level lock at requested level
+ *
+ * Returns: TRUE if lock is set, FALSE if not
+ * Side Effects:
+ */
+bool
+MultiAcquire(LockTableId tableId,
+ LOCKTAG *tag,
+ LOCKT lockt,
+ LOCK_LEVEL level)
+{
+ LOCKT locks[N_LEVELS];
+ int i,status;
+ LOCKTAG xxTag, *tmpTag = &xxTag;
+ int retStatus = TRUE;
+
+ /*
+ * Three levels implemented. If we set a low level (e.g. Tuple)
+ * lock, we must set INTENT locks on the higher levels. The
+ * intent lock detects conflicts between the low level lock
+ * and an existing high level lock. For example, setting a
+ * write lock on a tuple in a relation is disallowed if there
+ * is an existing read lock on the entire relation. The
+ * write lock would set a WRITE + INTENT lock on the relation
+ * and that lock would conflict with the read.
+ */
+ switch (level) {
+ case RELN_LEVEL:
+ locks[0] = lockt;
+ locks[1] = NO_LOCK;
+ locks[2] = NO_LOCK;
+ break;
+ case PAGE_LEVEL:
+ locks[0] = lockt + INTENT;
+ locks[1] = lockt;
+ locks[2] = NO_LOCK;
+ break;
+ case TUPLE_LEVEL:
+ locks[0] = lockt + INTENT;
+ locks[1] = lockt + INTENT;
+ locks[2] = lockt;
+ break;
+ default:
+ elog(WARN,"MultiAcquire: bad lock level");
+ return(FALSE);
+ }
+
+ /*
+ * construct a new tag as we go. Always loop through all levels,
+ * but if we aren't setting a low level lock, locks[i] is set to
+ * NO_LOCK for the lower levels. Always start from the highest
+ * level and go to the lowest level.
+ */
+ memset(tmpTag,0,sizeof(*tmpTag));
+ tmpTag->relId = tag->relId;
+ tmpTag->dbId = tag->dbId;
+
+ for (i=0;i<N_LEVELS;i++) {
+ if (locks[i] != NO_LOCK) {
+ switch (i) {
+ case RELN_LEVEL:
+ /* -------------
+ * Set the block # and offset to invalid
+ * -------------
+ */
+ BlockIdSet(&(tmpTag->tupleId.ip_blkid), InvalidBlockNumber);
+ tmpTag->tupleId.ip_posid = InvalidOffsetNumber;
+ break;
+ case PAGE_LEVEL:
+ /* -------------
+ * Copy the block #, set the offset to invalid
+ * -------------
+ */
+ BlockIdCopy(&(tmpTag->tupleId.ip_blkid),
+ &(tag->tupleId.ip_blkid));
+ tmpTag->tupleId.ip_posid = InvalidOffsetNumber;
+ break;
+ case TUPLE_LEVEL:
+ /* --------------
+ * Copy the entire tuple id.
+ * --------------
+ */
+ ItemPointerCopy(&tmpTag->tupleId, &tag->tupleId);
+ break;
+ }
+
+ status = LockAcquire(tableId, tmpTag, locks[i]);
+ if (! status) {
+ /* failed for some reason. Before returning we have
+ * to release all of the locks we just acquired.
+ * MultiRelease(xx,xx,xx, i) means release starting from
+ * the last level lock we successfully acquired
+ */
+ retStatus = FALSE;
+ (void) MultiRelease(tableId, tag, lockt, i);
+ /* now leave the loop. Don't try for any more locks */
+ break;
+ }
+ }
+ }
+ return(retStatus);
+}
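+
+/*
+ * Illustrative summary (not part of the original sources): for a
+ * TUPLE_LEVEL write request, the loop above takes three locks top-down,
+ * all tagged with the same relId/dbId but with progressively more of the
+ * tuple id filled in:
+ *
+ *	relation tag (block and offset invalid)	WRITE_LOCK + INTENT
+ *	page tag     (offset invalid)		WRITE_LOCK + INTENT
+ *	tuple tag    (full tid)			WRITE_LOCK
+ *
+ * If any LockAcquire() fails, MultiRelease() backs out the locks that
+ * were already acquired.
+ */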
+
+/* ------------------
+ * Release a page in the multi-level lock table
+ * ------------------
+ */
+bool
+MultiReleasePage(LockInfo linfo, ItemPointer tidPtr, LOCKT lockt)
+{
+ LOCKTAG tag;
+
+ /* ------------------
+ * LOCKTAG has two bytes of padding, unfortunately. The
+ * hash function will return miss if the padding bytes aren't
+ * zero'd.
+ * ------------------
+ */
+ memset(&tag, 0,sizeof(LOCKTAG));
+
+ tag.relId = linfo->lRelId.relId;
+ tag.dbId = linfo->lRelId.dbId;
+ BlockIdCopy(&(tag.tupleId.ip_blkid), &(tidPtr->ip_blkid));
+
+ return (MultiRelease(MultiTableId, &tag, lockt, PAGE_LEVEL));
+}
+
+/* ------------------
+ * Release a relation in the multi-level lock table
+ * ------------------
+ */
+bool
+MultiReleaseReln(LockInfo linfo, LOCKT lockt)
+{
+ LOCKTAG tag;
+
+ /* ------------------
+ * LOCKTAG has two bytes of padding, unfortunately. The
+ * hash function will return miss if the padding bytes aren't
+ * zero'd.
+ * ------------------
+ */
+ memset(&tag, 0, sizeof(LOCKTAG));
+ tag.relId = linfo->lRelId.relId;
+ tag.dbId = linfo->lRelId.dbId;
+
+ return (MultiRelease(MultiTableId, &tag, lockt, RELN_LEVEL));
+}
+
+/*
+ * MultiRelease -- release a multi-level lock
+ *
+ * Returns: TRUE if successful, FALSE otherwise.
+ */
+bool
+MultiRelease(LockTableId tableId,
+ LOCKTAG *tag,
+ LOCKT lockt,
+ LOCK_LEVEL level)
+{
+ LOCKT locks[N_LEVELS];
+ int i,status;
+ LOCKTAG xxTag, *tmpTag = &xxTag;
+
+ /*
+ * same level scheme as MultiAcquire().
+ */
+ switch (level) {
+ case RELN_LEVEL:
+ locks[0] = lockt;
+ locks[1] = NO_LOCK;
+ locks[2] = NO_LOCK;
+ break;
+ case PAGE_LEVEL:
+ locks[0] = lockt + INTENT;
+ locks[1] = lockt;
+ locks[2] = NO_LOCK;
+ break;
+ case TUPLE_LEVEL:
+ locks[0] = lockt + INTENT;
+ locks[1] = lockt + INTENT;
+ locks[2] = lockt;
+ break;
+ default:
+ elog(WARN,"MultiRelease: bad lockt");
+ }
+
+ /*
+ * again, construct the tag on the fly. This time, however,
+ * we release the locks in the REVERSE order -- from lowest
+ * level to highest level.
+ *
+ * Must zero out the tag to set padding bytes to zero and ensure
+ * hashing consistency.
+ */
+ memset(tmpTag, 0, sizeof(*tmpTag));
+ tmpTag->relId = tag->relId;
+ tmpTag->dbId = tag->dbId;
+
+ for (i=(N_LEVELS-1); i>=0; i--) {
+ if (locks[i] != NO_LOCK) {
+ switch (i) {
+ case RELN_LEVEL:
+ /* -------------
+ * Set the block # and offset to invalid
+ * -------------
+ */
+ BlockIdSet(&(tmpTag->tupleId.ip_blkid), InvalidBlockNumber);
+ tmpTag->tupleId.ip_posid = InvalidOffsetNumber;
+ break;
+ case PAGE_LEVEL:
+ /* -------------
+ * Copy the block #, set the offset to invalid
+ * -------------
+ */
+ BlockIdCopy(&(tmpTag->tupleId.ip_blkid),
+ &(tag->tupleId.ip_blkid));
+ tmpTag->tupleId.ip_posid = InvalidOffsetNumber;
+ break;
+ case TUPLE_LEVEL:
+ ItemPointerCopy(&tmpTag->tupleId, &tag->tupleId);
+ break;
+ }
+ status = LockRelease(tableId, tmpTag, locks[i]);
+ if (! status) {
+ elog(WARN,"MultiRelease: couldn't release after error");
+ }
+ }
+ }
+ /* all levels released successfully */
+ return true;
+}
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
new file mode 100644
index 00000000000..0955cdfc2f5
--- /dev/null
+++ b/src/backend/storage/lmgr/proc.c
@@ -0,0 +1,826 @@
+/*-------------------------------------------------------------------------
+ *
+ * proc.c--
+ * routines to manage per-process shared memory data structure
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/storage/lmgr/proc.c,v 1.1.1.1 1996/07/09 06:21:57 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * Each postgres backend gets one of these. We'll use it to
+ * clean up after the process should the process suddenly die.
+ *
+ *
+ * Interface (a):
+ * ProcSleep(), ProcWakeup(), ProcWakeupNext(),
+ * ProcQueueAlloc() -- create a shm queue for sleeping processes
+ * ProcQueueInit() -- create a queue without allocing memory
+ *
+ * Locking and waiting for buffers can cause the backend to be
+ * put to sleep. Whoever releases the lock, etc. wakes the
+ * process up again (and gives it an error code so it knows
+ * whether it was awoken on an error condition).
+ *
+ * Interface (b):
+ *
+ * ProcReleaseLocks -- frees the locks associated with this process,
+ * ProcKill -- destroys the shared memory state (and locks)
+ * associated with the process.
+ *
+ * 5/15/91 -- removed the buffer pool based lock chain in favor
+ * of a shared memory lock chain. The write-protection is
+ * more expensive if the lock chain is in the buffer pool.
+ * The only reason I kept the lock chain in the buffer pool
+ * in the first place was to allow the lock table to grow larger
+ * than available shared memory and that isn't going to work
+ * without a lot of unimplemented support anyway.
+ *
+ * 4/7/95 -- instead of allocating a set of 1 semaphore per process, we
+ * allocate a semaphore from a set of PROC_NSEMS_PER_SET semaphores
+ * shared among backends (we keep a few sets of semaphores around).
+ * This is so that we can support more backends. (system-wide semaphore
+ * sets run out pretty fast.) -ay 4/95
+ *
+ * $Header: /cvsroot/pgsql/src/backend/storage/lmgr/proc.c,v 1.1.1.1 1996/07/09 06:21:57 scrappy Exp $
+ */
+#include <sys/time.h>
+#ifndef WIN32
+#include <unistd.h>
+#endif /* WIN32 */
+#include <string.h>
+#include <sys/types.h>
+#include "libpq/pqsignal.h" /* substitute for <signal.h> */
+
+#if defined(PORTNAME_bsdi)
+/* hacka, hacka, hacka (XXX) */
+union semun {
+ int val; /* value for SETVAL */
+ struct semid_ds *buf; /* buffer for IPC_STAT & IPC_SET */
+ ushort *array; /* array for GETALL & SETALL */
+};
+#endif
+
+#include "access/xact.h"
+#include "utils/hsearch.h"
+#include "utils/elog.h"
+
+#include "storage/buf.h"
+#include "storage/lock.h"
+#include "storage/shmem.h"
+#include "storage/spin.h"
+#include "storage/proc.h"
+
+/*
+ * timeout (in seconds) for resolving possible deadlock
+ */
+#ifndef DEADLOCK_TIMEOUT
+#define DEADLOCK_TIMEOUT 60
+#endif
+
+/* --------------------
+ * Spin lock for manipulating the shared process data structure:
+ * ProcGlobal.... Adding an extra spin lock seemed like the smallest
+ * hack to get around reading and updating this structure in shared
+ * memory. -mer 17 July 1991
+ * --------------------
+ */
+SPINLOCK ProcStructLock;
+
+/*
+ * For cleanup routines. Don't cleanup if the initialization
+ * has not happened.
+ */
+static bool ProcInitialized = FALSE;
+
+static PROC_HDR *ProcGlobal = NULL;
+
+PROC *MyProc = NULL;
+
+static void ProcKill(int exitStatus, int pid);
+static void ProcGetNewSemKeyAndNum(IPCKey *key, int *semNum);
+static void ProcFreeSem(IpcSemaphoreKey semKey, int semNum);
+#if defined(PORTNAME_linux)
+extern int HandleDeadLock(int);
+#else
+extern int HandleDeadLock(void);
+#endif
+/*
+ * InitProcGlobal -
+ * initializes the global process table. We put it here so that
+ * the postmaster can do this initialization. (ProcFreeAllSem needs
+ * to read this table on exiting the postmaster. If we have the first
+ * backend do this, starting up and killing the postmaster without
+ * starting any backends will be a problem.)
+ */
+void
+InitProcGlobal(IPCKey key)
+{
+ bool found = false;
+
+ /* attach to the free list */
+ ProcGlobal = (PROC_HDR *)
+ ShmemInitStruct("Proc Header",(unsigned)sizeof(PROC_HDR),&found);
+
+ /* --------------------
+ * We're the first - initialize.
+ * --------------------
+ */
+ if (! found)
+ {
+ int i;
+
+ ProcGlobal->numProcs = 0;
+ ProcGlobal->freeProcs = INVALID_OFFSET;
+ ProcGlobal->currKey = IPCGetProcessSemaphoreInitKey(key);
+ for (i=0; i < MAX_PROC_SEMS/PROC_NSEMS_PER_SET; i++)
+ ProcGlobal->freeSemMap[i] = 0;
+ }
+}
+
+/* ------------------------
+ * InitProcess -- create a per-process data structure for this process
+ * used by the lock manager on semaphore queues.
+ * ------------------------
+ */
+void
+InitProcess(IPCKey key)
+{
+ bool found = false;
+ int pid;
+ int semstat;
+ unsigned long location, myOffset;
+
+ /* ------------------
+ * Routine called if deadlock timer goes off. See ProcSleep()
+ * ------------------
+ */
+#ifndef WIN32
+ signal(SIGALRM, HandleDeadLock);
+#endif /* WIN32 we'll have to figure out how to handle this later */
+
+ SpinAcquire(ProcStructLock);
+
+ /* attach to the free list */
+ ProcGlobal = (PROC_HDR *)
+ ShmemInitStruct("Proc Header",(unsigned)sizeof(PROC_HDR),&found);
+ if (!found) {
+ /* this should not happen. InitProcGlobal() is called before this. */
+ elog(WARN, "InitProcess: Proc Header uninitialized");
+ }
+
+ if (MyProc != NULL)
+ {
+ SpinRelease(ProcStructLock);
+ elog(WARN,"ProcInit: you already exist");
+ return;
+ }
+
+ /* try to get a proc from the free list first */
+
+ myOffset = ProcGlobal->freeProcs;
+
+ if (myOffset != INVALID_OFFSET)
+ {
+ MyProc = (PROC *) MAKE_PTR(myOffset);
+ ProcGlobal->freeProcs = MyProc->links.next;
+ }
+ else
+ {
+ /* have to allocate one. We can't use the normal binding
+ * table mechanism because the proc structure is stored
+ * by PID instead of by a global name (need to look it
+ * up by PID when we clean up dead processes).
+ */
+
+ MyProc = (PROC *) ShmemAlloc((unsigned)sizeof(PROC));
+ if (! MyProc)
+ {
+ SpinRelease(ProcStructLock);
+ elog (FATAL,"cannot create new proc: out of memory");
+ }
+
+ /* this cannot be initialized until after the buffer pool */
+ SHMQueueInit(&(MyProc->lockQueue));
+ MyProc->procId = ProcGlobal->numProcs;
+ ProcGlobal->numProcs++;
+ }
+
+ /*
+ * zero out the spin lock counts and set the sLocks field for
+ * ProcStructLock to 1 as we have acquired this spinlock above but
+ * didn't record it since we didn't have MyProc until now.
+ */
+ memset(MyProc->sLocks, 0, sizeof(MyProc->sLocks));
+ MyProc->sLocks[ProcStructLock] = 1;
+
+
+ if (IsUnderPostmaster) {
+ IPCKey semKey;
+ int semNum;
+ int semId;
+ union semun semun;
+
+ ProcGetNewSemKeyAndNum(&semKey, &semNum);
+
+ semId = IpcSemaphoreCreate(semKey,
+ PROC_NSEMS_PER_SET,
+ IPCProtection,
+ IpcSemaphoreDefaultStartValue,
+ 0,
+ &semstat);
+ /*
+ * we might be reusing a semaphore that belongs to a dead
+ * backend. So be careful and reinitialize its value here.
+ */
+ semun.val = IpcSemaphoreDefaultStartValue;
+ semctl(semId, semNum, SETVAL, semun);
+
+ IpcSemaphoreLock(semId, semNum, IpcExclusiveLock);
+ MyProc->sem.semId = semId;
+ MyProc->sem.semNum = semNum;
+ MyProc->sem.semKey = semKey;
+ } else {
+ MyProc->sem.semId = -1;
+ }
+
+ /* ----------------------
+ * Release the lock.
+ * ----------------------
+ */
+ SpinRelease(ProcStructLock);
+
+ MyProc->pid = 0;
+#if 0
+ MyProc->pid = MyPid;
+#endif
+
+ /* ----------------
+ * Start keeping spin lock stats from here on. Any botch before
+ * this initialization is forever botched
+ * ----------------
+ */
+ memset(MyProc->sLocks, 0, MAX_SPINS*sizeof(*MyProc->sLocks));
+
+ /* -------------------------
+ * Install ourselves in the binding table. The name to
+ * use is determined by the OS-assigned process id. That
+ * allows the cleanup process to find us after any untimely
+ * exit.
+ * -------------------------
+ */
+ pid = getpid();
+ location = MAKE_OFFSET(MyProc);
+ if ((! ShmemPIDLookup(pid,&location)) || (location != MAKE_OFFSET(MyProc)))
+ {
+ elog(FATAL,"InitProc: ShmemPID table broken");
+ }
+
+ MyProc->errType = NO_ERROR;
+ SHMQueueElemInit(&(MyProc->links));
+
+ on_exitpg(ProcKill, (caddr_t)pid);
+
+ ProcInitialized = TRUE;
+}
+
+/*
+ * ProcReleaseLocks() -- release all locks associated with this process
+ *
+ */
+void
+ProcReleaseLocks()
+{
+ if (!MyProc)
+ return;
+ LockReleaseAll(1,&MyProc->lockQueue);
+}
+
+/*
+ * ProcRemove -
+ * used by the postmaster to clean up the global tables. This also frees
+ * up the semaphore used for the lmgr of the process. (We have to do
+ * this in the postmaster instead of doing an IpcSemaphoreKill on exiting
+ * the process because the semaphore set is shared among backends and
+ * we don't want to remove others' semaphores on exit.)
+ */
+bool
+ProcRemove(int pid)
+{
+ SHMEM_OFFSET location;
+ PROC *proc;
+
+ location = INVALID_OFFSET;
+
+ location = ShmemPIDDestroy(pid);
+ if (location == INVALID_OFFSET)
+ return(FALSE);
+ proc = (PROC *) MAKE_PTR(location);
+
+ SpinAcquire(ProcStructLock);
+
+ ProcFreeSem(proc->sem.semKey, proc->sem.semNum);
+
+ proc->links.next = ProcGlobal->freeProcs;
+ ProcGlobal->freeProcs = MAKE_OFFSET(proc);
+
+ SpinRelease(ProcStructLock);
+
+ return(TRUE);
+}
+
+/*
+ * ProcKill() -- Destroy the per-proc data structure for
+ * this process. Release any of its held spin locks.
+ */
+static void
+ProcKill(int exitStatus, int pid)
+{
+ PROC *proc;
+ SHMEM_OFFSET location;
+
+ /* --------------------
+ * If this is a FATAL exit the postmaster will have to kill all the
+ * existing backends and reinitialize shared memory. So we don't
+ * need to do anything here.
+ * --------------------
+ */
+ if (exitStatus != 0)
+ return;
+
+ if (! pid)
+ {
+ pid = getpid();
+ }
+
+ ShmemPIDLookup(pid,&location);
+ if (location == INVALID_OFFSET)
+ return;
+
+ proc = (PROC *) MAKE_PTR(location);
+
+ if (proc != MyProc) {
+ Assert( pid != getpid() );
+ } else
+ MyProc = NULL;
+
+ /* ---------------
+ * Assume one lock table.
+ * ---------------
+ */
+ ProcReleaseSpins(proc);
+ LockReleaseAll(1,&proc->lockQueue);
+
+ /* ----------------
+ * get off the wait queue
+ * ----------------
+ */
+ LockLockTable();
+ if (proc->links.next != INVALID_OFFSET) {
+ Assert(proc->waitLock->waitProcs.size > 0);
+ SHMQueueDelete(&(proc->links));
+ --proc->waitLock->waitProcs.size;
+ }
+ SHMQueueElemInit(&(proc->links));
+ UnlockLockTable();
+
+ return;
+}
+
+/*
+ * ProcQueue package: routines for putting processes to sleep
+ * and waking them up
+ */
+
+/*
+ * ProcQueueAlloc -- alloc/attach to a shared memory process queue
+ *
+ * Returns: a pointer to the queue or NULL
+ * Side Effects: Initializes the queue if we allocated one
+ */
+PROC_QUEUE *
+ProcQueueAlloc(char *name)
+{
+ bool found;
+ PROC_QUEUE *queue = (PROC_QUEUE *)
+ ShmemInitStruct(name,(unsigned)sizeof(PROC_QUEUE),&found);
+
+ if (! queue)
+ {
+ return(NULL);
+ }
+ if (! found)
+ {
+ ProcQueueInit(queue);
+ }
+ return(queue);
+}
+
+/*
+ * ProcQueueInit -- initialize a shared memory process queue
+ */
+void
+ProcQueueInit(PROC_QUEUE *queue)
+{
+ SHMQueueInit(&(queue->links));
+ queue->size = 0;
+}
+
+
+
+/*
+ * ProcSleep -- put a process to sleep
+ *
+ * P() on the semaphore should put us to sleep. The process
+ * semaphore is cleared by default, so the first time we try
+ * to acquire it, we sleep.
+ *
+ * ASSUME: that no one will fiddle with the queue until after
+ * we release the spin lock.
+ *
+ * NOTES: The process queue is now a priority queue for locking.
+ */
+int
+ProcSleep(PROC_QUEUE *queue,
+ SPINLOCK spinlock,
+ int token,
+ int prio,
+ LOCK *lock)
+{
+ int i;
+ PROC *proc;
+#ifndef WIN32 /* figure this out later */
+ struct itimerval timeval, dummy;
+#endif /* WIN32 */
+
+ proc = (PROC *) MAKE_PTR(queue->links.prev);
+ for (i=0;i<queue->size;i++)
+ {
+ if (proc->prio < prio)
+ proc = (PROC *) MAKE_PTR(proc->links.prev);
+ else
+ break;
+ }
+
+ MyProc->token = token;
+ MyProc->waitLock = lock;
+
+ /* -------------------
+ * currently, we only need this for the ProcWakeup routines
+ * -------------------
+ */
+ TransactionIdStore((TransactionId) GetCurrentTransactionId(), &MyProc->xid);
+
+ /* -------------------
+ * assume that these two operations are atomic (because
+ * of the spinlock).
+ * -------------------
+ */
+ SHMQueueInsertTL(&(proc->links),&(MyProc->links));
+ queue->size++;
+
+ SpinRelease(spinlock);
+
+ /* --------------
+ * Postgres does not have any deadlock detection code and for this
+ * reason we must set a timer to wake up the process in the event of
+ * a deadlock. For now the timer is set for 1 minute and we assume that
+ * any process which sleeps for this amount of time is deadlocked and will
+ * receive a SIGALRM signal. The handler should release the process's
+ * semaphore and abort the current transaction.
+ *
+ * We need to zero out the struct to set the interval and the microsecond
+ * fields to 0.
+ * --------------
+ */
+#ifndef WIN32
+ memset(&timeval, 0, sizeof(struct itimerval));
+ timeval.it_value.tv_sec = DEADLOCK_TIMEOUT;
+
+ if (setitimer(ITIMER_REAL, &timeval, &dummy))
+ elog(FATAL, "ProcSleep: Unable to set timer for process wakeup");
+#endif /* WIN32 */
+
+ /* --------------
+ * if someone wakes us between SpinRelease and IpcSemaphoreLock,
+ * IpcSemaphoreLock will not block. The wakeup is "saved" by
+ * the semaphore implementation.
+ * --------------
+ */
+ IpcSemaphoreLock(MyProc->sem.semId, MyProc->sem.semNum, IpcExclusiveLock);
+
+ /* ---------------
+ * We were awoken before a timeout - now disable the timer
+ * ---------------
+ */
+#ifndef WIN32
+ timeval.it_value.tv_sec = 0;
+
+
+ if (setitimer(ITIMER_REAL, &timeval, &dummy))
+ elog(FATAL, "ProcSleep: Unable to diable timer for process wakeup");
+#endif /* WIN32 */
+
+ /* ----------------
+ * We were assumed to be in a critical section when we went
+ * to sleep.
+ * ----------------
+ */
+ SpinAcquire(spinlock);
+
+ return(MyProc->errType);
+}
+
+
+/*
+ * ProcWakeup -- wake up a process by releasing its private semaphore.
+ *
+ * remove the process from the wait queue and set its links invalid.
+ * RETURN: the next process in the wait queue.
+ */
+PROC *
+ProcWakeup(PROC *proc, int errType)
+{
+ PROC *retProc;
+ /* assume that spinlock has been acquired */
+
+ if (proc->links.prev == INVALID_OFFSET ||
+ proc->links.next == INVALID_OFFSET)
+ return((PROC *) NULL);
+
+ retProc = (PROC *) MAKE_PTR(proc->links.prev);
+
+ /* you have to update waitLock->waitProcs.size yourself */
+ SHMQueueDelete(&(proc->links));
+ SHMQueueElemInit(&(proc->links));
+
+ proc->errType = errType;
+
+ IpcSemaphoreUnlock(proc->sem.semId, proc->sem.semNum, IpcExclusiveLock);
+
+ return retProc;
+}
+
+
+/*
+ * ProcGetId --
+ */
+int
+ProcGetId()
+{
+ return( MyProc->procId );
+}
+
+/*
+ * ProcLockWakeup -- routine for waking up processes when a lock is
+ * released.
+ */
+int
+ProcLockWakeup(PROC_QUEUE *queue, char *ltable, char *lock)
+{
+ PROC *proc;
+ int count;
+
+ if (! queue->size)
+ return(STATUS_NOT_FOUND);
+
+ proc = (PROC *) MAKE_PTR(queue->links.prev);
+ count = 0;
+ while ((LockResolveConflicts ((LOCKTAB *) ltable,
+ (LOCK *) lock,
+ proc->token,
+ proc->xid) == STATUS_OK))
+ {
+ /* there was a waiting process, grant it the lock before waking it
+ * up. This will prevent another process from seizing the lock
+ * between the time we release the lock master (spinlock) and
+ * the time that the awoken process begins executing again.
+ */
+ GrantLock((LOCK *) lock, proc->token);
+ queue->size--;
+
+ /*
+ * ProcWakeup removes proc from the lock waiting process queue and
+ * returns the next proc in chain. If a writer just dropped
+ * its lock and there are several waiting readers, wake them all up.
+ */
+ proc = ProcWakeup(proc, NO_ERROR);
+
+ count++;
+ if (!proc || queue->size == 0)
+ break;
+ }
+
+ if (count)
+ return(STATUS_OK);
+ else
+ /* Something is still blocking us. May have deadlocked. */
+ return(STATUS_NOT_FOUND);
+}
+
+void
+ProcAddLock(SHM_QUEUE *elem)
+{
+ SHMQueueInsertTL(&MyProc->lockQueue,elem);
+}
+
+/* --------------------
+ * We only get to this routine if we got SIGALRM after DEADLOCK_TIMEOUT
+ * while waiting for a lock to be released by some other process. After
+ * the one minute deadline we assume we have a deadlock and must abort
+ * this transaction. We must also indicate that I'm no longer waiting
+ * on a lock so that other processes don't try to wake me up and screw
+ * up my semaphore.
+ * --------------------
+ */
+int
+#if defined(PORTNAME_linux)
+HandleDeadLock(int i)
+#else
+HandleDeadLock()
+#endif
+{
+ LOCK *lock;
+ int size;
+
+ LockLockTable();
+
+ /* ---------------------
+ * Check to see if we've been awoken by anyone in the interim.
+ *
+ * If we have, we can return and resume our transaction -- happy day.
+ * Before we are awoken the process releasing the lock grants it to
+ * us so we know that we don't have to wait anymore.
+ *
+ * Damn these names are LONG! -mer
+ * ---------------------
+ */
+ if (IpcSemaphoreGetCount(MyProc->sem.semId, MyProc->sem.semNum) ==
+ IpcSemaphoreDefaultStartValue) {
+ UnlockLockTable();
+ return 1;
+ }
+
+ /*
+ * you would think this would be unnecessary, but...
+ *
+ * this also means we've been removed already. in some ports
+ * (e.g., sparc and aix) the semop(2) implementation is such that
+ * we can actually end up in this handler after someone has removed
+ * us from the queue and bopped the semaphore *but the test above
+ * fails to detect the semaphore update* (presumably something weird
+ * having to do with the order in which the semaphore wakeup signal
+ * and SIGALRM get handled).
+ */
+ if (MyProc->links.prev == INVALID_OFFSET ||
+ MyProc->links.next == INVALID_OFFSET) {
+ UnlockLockTable();
+ return(1);
+ }
+
+ lock = MyProc->waitLock;
+ size = lock->waitProcs.size; /* so we can look at this in the core */
+
+ /* ------------------------
+ * Get this process off the lock's wait queue
+ * ------------------------
+ */
+ Assert(lock->waitProcs.size > 0);
+ --lock->waitProcs.size;
+ SHMQueueDelete(&(MyProc->links));
+ SHMQueueElemInit(&(MyProc->links));
+
+ /* ------------------
+ * Unlock my semaphore so that the count is right for next time.
+ * I was awoken by a signal, not by someone unlocking my semaphore.
+ * ------------------
+ */
+ IpcSemaphoreUnlock(MyProc->sem.semId, MyProc->sem.semNum, IpcExclusiveLock);
+
+ /* -------------
+ * Set MyProc->errType to STATUS_ERROR so that we abort after
+ * returning from this handler.
+ * -------------
+ */
+ MyProc->errType = STATUS_ERROR;
+
+ /*
+ * if this doesn't follow the IpcSemaphoreUnlock then we get lock
+ * table corruption ("LockReplace: xid table corrupted") due to
+ * race conditions. i don't claim to understand this...
+ */
+ UnlockLockTable();
+
+ elog(NOTICE, "Timeout -- possible deadlock");
+ return 0;
+}
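+
+/* --------------------
+ * An illustrative sketch (the exact calls are assumptions, not verbatim
+ * code from this file) of how the deadlock timer and HandleDeadLock fit
+ * together around ProcSleep:
+ *
+ *	signal(SIGALRM, HandleDeadLock);	    -- install the handler
+ *	timeval.it_value.tv_sec = DEADLOCK_TIMEOUT;
+ *	setitimer(ITIMER_REAL, &timeval, &dummy);   -- arm before sleeping
+ *	IpcSemaphoreLock(...);			    -- sleep on our semaphore
+ *	timeval.it_value.tv_sec = 0;
+ *	setitimer(ITIMER_REAL, &timeval, &dummy);   -- disarm on normal wakeup
+ *
+ * See ProcSleep above and InitProcess for the actual code; the names here
+ * are only meant to show the shape of the protocol.
+ * --------------------
+ */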
+
+void
+ProcReleaseSpins(PROC *proc)
+{
+ int i;
+
+ if (!proc)
+ proc = MyProc;
+
+ if (!proc)
+ return;
+ for (i=0; i < (int)MAX_SPINS; i++)
+ {
+ if (proc->sLocks[i])
+ {
+ Assert(proc->sLocks[i] == 1);
+ SpinRelease(i);
+ }
+ }
+}
+
+/*****************************************************************************
+ *
+ *****************************************************************************/
+
+/*
+ * ProcGetNewSemKeyAndNum -
+ * scan the free semaphore bitmap and allocate a single semaphore from
+ * a semaphore set. (If the semaphore set doesn't exist yet,
+ * IpcSemaphoreCreate will create it. Otherwise, we use the existing
+ * semaphore set.)
+ */
+static void
+ProcGetNewSemKeyAndNum(IPCKey *key, int *semNum)
+{
+ int i;
+ int32 *freeSemMap = ProcGlobal->freeSemMap;
+ unsigned int fullmask;
+
+ /*
+ * we hold ProcStructLock when entering this routine. We scan through
+ * the bitmap to look for a free semaphore.
+ */
+ fullmask = (~(unsigned int) 0) >> (32 - PROC_NSEMS_PER_SET);
+ for(i=0; i < MAX_PROC_SEMS/PROC_NSEMS_PER_SET; i++) {
+ int mask = 1;
+ int j;
+
+ if (freeSemMap[i] == fullmask)
+ continue; /* none free for this set */
+
+ for(j = 0; j < PROC_NSEMS_PER_SET; j++) {
+ if ((freeSemMap[i] & mask) == 0) {
+ /*
+ * a free semaphore found. Mark it as allocated.
+ */
+ freeSemMap[i] |= mask;
+
+ *key = ProcGlobal->currKey + i;
+ *semNum = j;
+ return;
+ }
+ mask <<= 1;
+ }
+ }
+
+ /* if we reach here, all the semaphores are in use. */
+ elog(WARN, "InitProc: cannot allocate a free semaphore");
+}
+
+/*
+ * ProcFreeSem -
+ * free up our semaphore in the semaphore set. If we're the last one
+ * in the set, also remove the semaphore set.
+ */
+static void
+ProcFreeSem(IpcSemaphoreKey semKey, int semNum)
+{
+ int mask;
+ int i;
+ int32 *freeSemMap = ProcGlobal->freeSemMap;
+
+ i = semKey - ProcGlobal->currKey;
+ mask = ~(1 << semNum);
+ freeSemMap[i] &= mask;
+
+ if (freeSemMap[i]==0)
+ IpcSemaphoreKill(semKey);
+}
+
+/*
+ * ProcFreeAllSemaphores -
+ * on exiting the postmaster, we free up all the semaphores allocated
+ * to the lmgrs of the backends.
+ */
+void
+ProcFreeAllSemaphores()
+{
+ int i;
+ int32 *freeSemMap = ProcGlobal->freeSemMap;
+
+ for(i=0; i < MAX_PROC_SEMS/PROC_NSEMS_PER_SET; i++) {
+ if (freeSemMap[i]!=0)
+ IpcSemaphoreKill(ProcGlobal->currKey + i);
+ }
+}
diff --git a/src/backend/storage/lmgr/single.c b/src/backend/storage/lmgr/single.c
new file mode 100644
index 00000000000..8d41ea38bb6
--- /dev/null
+++ b/src/backend/storage/lmgr/single.c
@@ -0,0 +1,86 @@
+/*-------------------------------------------------------------------------
+ *
+ * single.c--
+ * set single locks in the multi-level lock hierarchy
+ *
+ * Sometimes we don't want to set all levels of the multi-level
+ * lock hierarchy at once. This allows us to set and release
+ * one level at a time. It's useful in index scans when
+ * you can set an intent lock at the beginning and thereafter
+ * only set page locks. Tends to speed things up.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/storage/lmgr/Attic/single.c,v 1.1.1.1 1996/07/09 06:21:57 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <string.h>
+#include "storage/lmgr.h" /* where the declarations go */
+#include "storage/lock.h"
+#include "storage/multilev.h"
+#include "utils/rel.h"
+
+/*
+ * SingleLockReln -- lock a relation
+ *
+ * Returns: TRUE if the lock can be set, FALSE otherwise.
+ */
+bool
+SingleLockReln(LockInfo linfo, LOCKT lockt, int action)
+{
+ LOCKTAG tag;
+
+ /*
+ * LOCKTAG has two bytes of padding, unfortunately. The
+ * hash function will miss if the padding bytes aren't
+ * zeroed.
+ */
+ memset(&tag,0,sizeof(tag));
+ tag.relId = linfo->lRelId.relId;
+ tag.dbId = linfo->lRelId.dbId;
+ BlockIdSet(&(tag.tupleId.ip_blkid), InvalidBlockNumber);
+ tag.tupleId.ip_posid = InvalidOffsetNumber;
+
+ if (action == UNLOCK)
+ return(LockRelease(MultiTableId, &tag, lockt));
+ else
+ return(LockAcquire(MultiTableId, &tag, lockt));
+}
+
+/*
+ * SingleLockPage -- use multi-level lock table, but lock
+ * only at the page level.
+ *
+ * Assumes that an INTENT lock has already been set in the
+ * multi-level lock table.
+ *
+ */
+bool
+SingleLockPage(LockInfo linfo,
+ ItemPointer tidPtr,
+ LOCKT lockt,
+ int action)
+{
+ LOCKTAG tag;
+
+ /*
+ * LOCKTAG has two bytes of padding, unfortunately. The
+ * hash function will miss if the padding bytes aren't
+ * zeroed.
+ */
+ memset(&tag,0,sizeof(tag));
+ tag.relId = linfo->lRelId.relId;
+ tag.dbId = linfo->lRelId.dbId;
+ BlockIdCopy(&(tag.tupleId.ip_blkid), &(tidPtr->ip_blkid));
+ tag.tupleId.ip_posid = InvalidOffsetNumber;
+
+
+ if (action == UNLOCK)
+ return(LockRelease(MultiTableId, &tag, lockt));
+ else
+ return(LockAcquire(MultiTableId, &tag, lockt));
+}
+
diff --git a/src/backend/storage/lock.h b/src/backend/storage/lock.h
new file mode 100644
index 00000000000..df490e76512
--- /dev/null
+++ b/src/backend/storage/lock.h
@@ -0,0 +1,218 @@
+/*-------------------------------------------------------------------------
+ *
+ * lock.h--
+ *
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: lock.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef LOCK_H_
+#define LOCK_H_
+
+#include "postgres.h"
+#include "storage/itemptr.h"
+#include "storage/shmem.h"
+#include "storage/spin.h"
+#include "storage/backendid.h"
+#include "utils/hsearch.h"
+
+extern SPINLOCK LockMgrLock;
+typedef int MASK;
+
+#define INIT_TABLE_SIZE 100
+#define MAX_TABLE_SIZE 1000
+
+
+/* ----------------------
+ * The following defines are used to estimate how much shared
+ * memory the lock manager is going to require.
+ *
+ * NBACKENDS - The number of concurrently running backends
+ * NLOCKS_PER_XACT - The number of unique locks acquired in a transaction
+ * NLOCKENTS - The maximum number of lock entries in the lock table.
+ * ----------------------
+ */
+#define NBACKENDS 50
+#define NLOCKS_PER_XACT 40
+#define NLOCKENTS (NLOCKS_PER_XACT*NBACKENDS)
+
+typedef int LOCK_TYPE;
+typedef int LOCKT;
+typedef int LockTableId;
+
+/* MAX_LOCKTYPES cannot be larger than the number of bits in MASK */
+#define MAX_LOCKTYPES 6
+
+/*
+ * MAX_TABLES corresponds to the number of spin locks allocated in
+ * CreateSpinLocks() or the number of shared memory locations allocated
+ * for lock table spin locks in the case of machines with TAS instructions.
+ */
+#define MAX_TABLES 2
+
+#define INVALID_TABLEID 0
+
+/*typedef struct LOCK LOCK; */
+
+
+typedef struct ltag {
+ Oid relId;
+ Oid dbId;
+ ItemPointerData tupleId;
+} LOCKTAG;
+
+#define TAGSIZE (sizeof(LOCKTAG))
+
+/* This is the control structure for a lock table. It
+ * lives in shared memory:
+ *
+ * tableID -- the handle used by the lock table's clients to
+ * refer to the table.
+ *
+ * nLockTypes -- number of lock types (READ,WRITE,etc) that
+ * are defined on this lock table
+ *
+ * conflictTab -- this is an array of bitmasks showing lock
+ * type conflicts. conflictTab[i] is a mask with the j-th bit
+ * turned on if lock types i and j conflict.
+ *
+ * prio -- each locktype has a priority, so, for example, waiting
+ * writers can be given priority over readers (to avoid
+ * starvation).
+ *
+ * masterlock -- synchronizes access to the table
+ *
+ */
+typedef struct lockctl {
+ LockTableId tableId;
+ int nLockTypes;
+ int conflictTab[MAX_LOCKTYPES];
+ int prio[MAX_LOCKTYPES];
+ SPINLOCK masterLock;
+} LOCKCTL;
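+
+/*
+ * Purely as an illustration (the real tables are filled in by the
+ * multi-level lock code; see multilev.h), a table with two lock types
+ * READ and WRITE might be set up so that readers conflict only with
+ * writers while writers conflict with everybody:
+ *
+ *	ctl->conflictTab[READ]  = (1 << WRITE);
+ *	ctl->conflictTab[WRITE] = (1 << READ) | (1 << WRITE);
+ *	ctl->prio[READ]  = 1;
+ *	ctl->prio[WRITE] = 2;		-- prefer waking waiting writers
+ *
+ * conflictTab[i] & (1 << j) being nonzero means lock types i and j conflict.
+ */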
+
+/*
+ * lockHash -- hash table on lock Ids,
+ * xidHash -- hash on xid and lockId in case
+ * multiple processes are holding the lock
+ * ctl - control structure described above.
+ */
+typedef struct ltable {
+ HTAB *lockHash;
+ HTAB *xidHash;
+ LOCKCTL *ctl;
+} LOCKTAB;
+
+/* -----------------------
+ * A transaction never conflicts with its own locks. Hence, if
+ * multiple transactions hold non-conflicting locks on the same
+ * data, private per-transaction information must be stored in the
+ * XID table. The tag is XID + shared memory lock address so that
+ * all locks can use the same XID table. The private information
+ * we store is the number of locks of each type (holders) and the
+ * total number of locks (nHolding) held by the transaction.
+ *
+ * NOTE: --
+ * There were some problems with the fact that currently TransactionIdData
+ * is a 5-byte entity and compilers long-word align structure fields.
+ * If the 3 byte padding is put in front of the actual xid data then the
+ * hash function (which uses XID_TAGSIZE when deciding how many bytes of a
+ * struct to look at for the key) might only see the last two bytes of the xid.
+ *
+ * Clearly this is not good since it's likely that these bytes will be the
+ * same for many transactions and hence they will share the same entry in
+ * the hash table, causing the entry to be corrupted. For this long-winded
+ * reason I have put the tag in a struct of its own to ensure that the
+ * XID_TAGSIZE is computed correctly. It used to be sizeof (SHMEM_OFFSET) +
+ * sizeof(TransactionIdData) which != sizeof(XIDTAG).
+ *
+ * Finally since the hash function will now look at all 12 bytes of the tag
+ * the padding bytes MUST be zero'd before use in hash_search() as they
+ * will have random values otherwise. Jeff 22 July 1991.
+ * -----------------------
+ */
+
+typedef struct XIDTAG {
+ SHMEM_OFFSET lock;
+ int pid;
+ TransactionId xid;
+} XIDTAG;
+
+typedef struct XIDLookupEnt {
+ /* tag */
+ XIDTAG tag;
+
+ /* data */
+ int holders[MAX_LOCKTYPES];
+ int nHolding;
+ SHM_QUEUE queue;
+} XIDLookupEnt;
+
+#define XID_TAGSIZE (sizeof(XIDTAG))
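+
+/*
+ * An illustrative sketch of why the note above matters (the variable
+ * names and the exact hash_search() call are assumptions; see lock.c for
+ * the real lookups): a caller probing xidHash zeroes the whole entry
+ * first, so the padding bytes inside XIDTAG hash the same way every time:
+ *
+ *	XIDLookupEnt item, *result;
+ *	bool found;
+ *
+ *	memset(&item, 0, XID_TAGSIZE);
+ *	item.tag.lock = MAKE_OFFSET(lock);
+ *	item.tag.pid  = pid;
+ *	item.tag.xid  = xid;
+ *	result = (XIDLookupEnt *)
+ *	    hash_search(ltable->xidHash, (char *) &item, HASH_FIND, &found);
+ */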
+
+/* originally in procq.h */
+typedef struct procQueue {
+ SHM_QUEUE links;
+ int size;
+} PROC_QUEUE;
+
+
+/*
+ * lock information:
+ *
+ * tag -- uniquely identifies the object being locked
+ * mask -- union of the conflict masks of all lock types
+ * currently held on this object.
+ * waitProcs -- queue of processes waiting for this lock
+ * holders -- count of each lock type currently held on the
+ * lock.
+ * nHolding -- total locks of all types.
+ */
+typedef struct Lock {
+ /* hash key */
+ LOCKTAG tag;
+
+ /* data */
+ int mask;
+ PROC_QUEUE waitProcs;
+ int holders[MAX_LOCKTYPES];
+ int nHolding;
+ int activeHolders[MAX_LOCKTYPES];
+ int nActive;
+} LOCK;
+
+#define LockGetLock_nHolders(l) ((l)->nHolding)
+
+#define LockDecrWaitHolders(lock, lockt) \
+ lock->nHolding--; \
+ lock->holders[lockt]--
+
+#define LockLockTable() SpinAcquire(LockMgrLock);
+#define UnlockLockTable() SpinRelease(LockMgrLock);
+
+extern SPINLOCK LockMgrLock;
+
+/*
+ * function prototypes
+ */
+extern void InitLocks(void);
+extern void LockDisable(int status);
+extern LockTableId LockTabInit(char *tabName, MASK *conflictsP, int *prioP,
+ int ntypes);
+extern LockTableId LockTabRename(LockTableId tableId);
+extern bool LockAcquire(LockTableId tableId, LOCKTAG *lockName, LOCKT lockt);
+extern int LockResolveConflicts(LOCKTAB *ltable, LOCK *lock, LOCKT lockt,
+ TransactionId xid);
+extern int WaitOnLock(LOCKTAB *ltable, LockTableId tableId, LOCK *lock,
+ LOCKT lockt);
+extern bool LockRelease(LockTableId tableId, LOCKTAG *lockName, LOCKT lockt);
+extern void GrantLock(LOCK *lock, LOCKT lockt);
+extern bool LockReleaseAll(LockTableId tableId, SHM_QUEUE *lockQueue);
+extern int LockShmemSize(void);
+extern bool LockingDisabled(void);
+
+#endif /* LOCK_H_ */
diff --git a/src/backend/storage/multilev.h b/src/backend/storage/multilev.h
new file mode 100644
index 00000000000..582c1cb6c37
--- /dev/null
+++ b/src/backend/storage/multilev.h
@@ -0,0 +1,64 @@
+/*-------------------------------------------------------------------------
+ *
+ * multilev.h--
+ * multi level lock table consts/defs for single.c and multi.c and their
+ * clients
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: multilev.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef MULTILEV_H
+#define MULTILEV_H
+
+#include "storage/lock.h"
+#include "storage/lmgr.h"
+
+#define READ_LOCK 2
+#define WRITE_LOCK 1
+
+/* Any time a small-granularity READ/WRITE lock is set, the
+ * higher-granularity READ_INTENT/WRITE_INTENT locks must
+ * also be set. A read intent lock has the value READ+INTENT
+ * in this implementation.
+ */
+#define NO_LOCK 0
+#define INTENT 2
+#define READ_INTENT (READ_LOCK+INTENT)
+#define WRITE_INTENT (WRITE_LOCK+INTENT)
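+
+/* With the values above this works out to, e.g.,
+ *	READ_INTENT  = READ_LOCK  + INTENT = 2 + 2 = 4
+ *	WRITE_INTENT = WRITE_LOCK + INTENT = 1 + 2 = 3
+ * so the intent modes get their own slots in the conflict/priority tables.
+ */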
+
+#define EXTEND_LOCK 5
+
+#define SHORT_TERM 1
+#define LONG_TERM 2
+#define UNLOCK 0
+
+#define N_LEVELS 3
+#define RELN_LEVEL 0
+#define PAGE_LEVEL 1
+#define TUPLE_LEVEL 2
+typedef int LOCK_LEVEL;
+
+/* multi.c */
+
+extern LockTableId MultiTableId;
+extern LockTableId ShortTermTableId;
+
+/*
+ * function prototypes
+ */
+extern LockTableId InitMultiLevelLockm(void);
+extern bool MultiLockReln(LockInfo linfo, LOCKT lockt);
+extern bool MultiLockTuple(LockInfo linfo, ItemPointer tidPtr, LOCKT lockt);
+extern bool MultiLockPage(LockInfo linfo, ItemPointer tidPtr, LOCKT lockt);
+extern bool MultiAcquire(LockTableId tableId, LOCKTAG *tag, LOCKT lockt,
+ LOCK_LEVEL level);
+extern bool MultiReleasePage(LockInfo linfo, ItemPointer tidPtr, LOCKT lockt);
+extern bool MultiReleaseReln(LockInfo linfo, LOCKT lockt);
+extern bool MultiRelease(LockTableId tableId, LOCKTAG *tag, LOCKT lockt,
+ LOCK_LEVEL level);
+
+#endif /* MULTILEV_H */
diff --git a/src/backend/storage/off.h b/src/backend/storage/off.h
new file mode 100644
index 00000000000..e5f5cbf5482
--- /dev/null
+++ b/src/backend/storage/off.h
@@ -0,0 +1,60 @@
+/*-------------------------------------------------------------------------
+ *
+ * off.h--
+ * POSTGRES disk "offset" definitions.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: off.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef OFF_H
+#define OFF_H
+
+#include "c.h"
+#include "machine.h" /* for BLCKSZ */
+#include "storage/itemid.h"
+
+/*
+ * OffsetNumber:
+ *
+ * this is a 1-based index into the linp (ItemIdData) array in the
+ * header of each disk page.
+ */
+typedef uint16 OffsetNumber;
+
+#define InvalidOffsetNumber ((OffsetNumber) 0)
+#define FirstOffsetNumber ((OffsetNumber) 1)
+#define MaxOffsetNumber ((OffsetNumber) (BLCKSZ / sizeof(ItemIdData)))
+#define OffsetNumberMask (0xffff) /* valid uint16 bits */
+
+/* ----------------
+ * support macros
+ * ----------------
+ */
+
+/*
+ * OffsetNumberIsValid --
+ * True iff the offset number is valid.
+ */
+#define OffsetNumberIsValid(offsetNumber) \
+ ((bool) ((offsetNumber != InvalidOffsetNumber) && \
+ (offsetNumber <= MaxOffsetNumber)))
+
+/*
+ * OffsetNumberNext --
+ * OffsetNumberPrev --
+ * Increments/decrements the argument. These macros look pointless
+ * but they help us disambiguate the different manipulations on
+ * OffsetNumbers (e.g., sometimes we subtract one from an
+ * OffsetNumber to move back, and sometimes we do so to form a
+ * real C array index).
+ */
+#define OffsetNumberNext(offsetNumber) \
+ ((OffsetNumber) (1 + (offsetNumber)))
+#define OffsetNumberPrev(offsetNumber) \
+ ((OffsetNumber) (-1 + (offsetNumber)))
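+
+/*
+ * A typical (illustrative) scan over the line pointers of a page, assuming
+ * the bufpage primitives declared elsewhere in this tree:
+ *
+ *	OffsetNumber offnum, maxoff;
+ *
+ *	maxoff = PageGetMaxOffsetNumber(page);
+ *	for (offnum = FirstOffsetNumber;
+ *	     offnum <= maxoff;
+ *	     offnum = OffsetNumberNext(offnum))
+ *		... PageGetItem(page, PageGetItemId(page, offnum)) ...
+ */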
+
+#endif /* OFF_H */
diff --git a/src/backend/storage/page.h b/src/backend/storage/page.h
new file mode 100644
index 00000000000..a012ea522c0
--- /dev/null
+++ b/src/backend/storage/page.h
@@ -0,0 +1,26 @@
+/*-------------------------------------------------------------------------
+ *
+ * page.h--
+ * POSTGRES buffer page abstraction definitions.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: page.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PAGE_H
+#define PAGE_H
+
+#include "c.h"
+
+typedef Pointer Page;
+
+/*
+ * PageIsValid --
+ * True iff page is valid.
+ */
+#define PageIsValid(page) PointerIsValid(page)
+
+#endif /* PAGE_H */
diff --git a/src/backend/storage/page/Makefile.inc b/src/backend/storage/page/Makefile.inc
new file mode 100644
index 00000000000..2a7d8408512
--- /dev/null
+++ b/src/backend/storage/page/Makefile.inc
@@ -0,0 +1,16 @@
+#-------------------------------------------------------------------------
+#
+# Makefile.inc--
+# Makefile for storage/page
+#
+# Copyright (c) 1994, Regents of the University of California
+#
+#
+# IDENTIFICATION
+# $Header: /cvsroot/pgsql/src/backend/storage/page/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:58 scrappy Exp $
+#
+#-------------------------------------------------------------------------
+
+SUBSRCS+= bufpage.c itemptr.c
+
+
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
new file mode 100644
index 00000000000..14b5ead85bc
--- /dev/null
+++ b/src/backend/storage/page/bufpage.c
@@ -0,0 +1,519 @@
+/*-------------------------------------------------------------------------
+ *
+ * bufpage.c--
+ * POSTGRES standard buffer page code.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/storage/page/bufpage.c,v 1.1.1.1 1996/07/09 06:21:58 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <sys/types.h>
+#include <sys/file.h>
+
+#include "c.h"
+
+#include "storage/item.h"
+#include "storage/buf.h"
+#include "storage/bufmgr.h"
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/memutils.h"
+#include "storage/bufpage.h"
+
+#include "lib/qsort.h"
+
+static bool PageManagerShuffle = true; /* default is shuffle mode */
+
+/* ----------------------------------------------------------------
+ * Buffer support functions
+ * ----------------------------------------------------------------
+ */
+/*
+ * BufferGetPageSize --
+ * Returns the page size within a buffer.
+ *
+ * Notes:
+ * Assumes buffer is valid.
+ *
+ * The buffer can be a raw disk block and need not contain a valid
+ * (formatted) disk page.
+ */
+Size
+BufferGetPageSize(Buffer buffer)
+{
+ Size pageSize;
+
+ Assert(BufferIsValid(buffer));
+ pageSize = BLCKSZ; /* XXX dig out of buffer descriptor */
+
+ Assert(PageSizeIsValid(pageSize));
+ return (pageSize);
+}
+
+/*
+ * BufferGetPage --
+ * Returns the page associated with a buffer.
+ */
+Page
+BufferGetPage(Buffer buffer)
+{
+ return (Page) BufferGetBlock(buffer);
+}
+
+
+/* ----------------------------------------------------------------
+ * Page support functions
+ * ----------------------------------------------------------------
+ */
+
+/*
+ * PageInit --
+ * Initializes the contents of a page.
+ */
+void
+PageInit(Page page, Size pageSize, Size specialSize)
+{
+ PageHeader p = (PageHeader) page;
+
+ Assert(pageSize == BLCKSZ);
+ Assert(pageSize >
+ specialSize + sizeof(PageHeaderData) - sizeof(ItemIdData));
+
+ specialSize = DOUBLEALIGN(specialSize);
+
+ p->pd_lower = sizeof(PageHeaderData) - sizeof(ItemIdData);
+ p->pd_upper = pageSize - specialSize;
+ p->pd_special = pageSize - specialSize;
+ PageSetPageSize(page, pageSize);
+}
+
+/*
+ * PageGetItem --
+ * Retrieves an item on the given page.
+ *
+ * Note:
+ * This does not change the status of any of the resources passed.
+ * The semantics may change in the future.
+ */
+Item
+PageGetItem(Page page, ItemId itemId)
+{
+ Item item;
+
+ Assert(PageIsValid(page));
+ Assert((*itemId).lp_flags & LP_USED);
+
+ item = (Item)(((char *)page) + (*itemId).lp_off);
+
+ return (item);
+}
+
+/*
+ * PageAddItem --
+ * Adds item to the given page.
+ *
+ * Note:
+ * This does not assume that the item resides on a single page.
+ * It is the responsibility of the caller to act appropriately
+ * depending on this fact. The "pskip" routines provide a
+ * friendlier interface, in this case.
+ *
+ * This does change the status of some of the resources passed.
+ * The semantics may change in the future.
+ *
+ * This routine should probably be combined with others?
+ */
+/* ----------------
+ * PageAddItem
+ *
+ * add an item to a page.
+ *
+ * Notes on interface:
+ * If offsetNumber is valid, shuffle ItemId's down to make room
+ * to use it, if PageManagerShuffle is true. If PageManagerShuffle is
+ * false, then overwrite the specified ItemId. (PageManagerShuffle is
+ * true by default, and is modified by calling PageManagerModeSet.)
+ * If offsetNumber is not valid, then assign one by finding the first
+ * one that is both unused and deallocated.
+ *
+ * NOTE: If offsetNumber is valid, and PageManagerShuffle is true, it
+ * is assumed that there is room on the page to shuffle the ItemId's
+ * down by one.
+ * ----------------
+ */
+OffsetNumber
+PageAddItem(Page page,
+ Item item,
+ Size size,
+ OffsetNumber offsetNumber,
+ ItemIdFlags flags)
+{
+ register int i;
+ Size alignedSize;
+ Offset lower;
+ Offset upper;
+ ItemId itemId;
+ ItemId fromitemId, toitemId;
+ OffsetNumber limit;
+
+ bool shuffled = false;
+
+ /*
+ * Find first unallocated offsetNumber
+ */
+ limit = OffsetNumberNext(PageGetMaxOffsetNumber(page));
+
+ /* was offsetNumber passed in? */
+ if (OffsetNumberIsValid(offsetNumber)) {
+ if (PageManagerShuffle == true) {
+ /* shuffle ItemId's (Do the PageManager Shuffle...) */
+ for (i = (limit - 1); i >= offsetNumber; i--) {
+ fromitemId = &((PageHeader)page)->pd_linp[i - 1];
+ toitemId = &((PageHeader)page)->pd_linp[i];
+ *toitemId = *fromitemId;
+ }
+ shuffled = true; /* need to increase "lower" */
+ } else { /* overwrite mode */
+ itemId = &((PageHeader)page)->pd_linp[offsetNumber - 1];
+ if (((*itemId).lp_flags & LP_USED) ||
+ ((*itemId).lp_len != 0)) {
+ elog(WARN, "PageAddItem: tried overwrite of used ItemId");
+ return (InvalidOffsetNumber);
+ }
+ }
+ } else { /* offsetNumber was not passed in, so find one */
+ /* look for "recyclable" (unused & deallocated) ItemId */
+ for (offsetNumber = 1; offsetNumber < limit; offsetNumber++) {
+ itemId = &((PageHeader)page)->pd_linp[offsetNumber - 1];
+ if ((((*itemId).lp_flags & LP_USED) == 0) &&
+ ((*itemId).lp_len == 0))
+ break;
+ }
+ }
+ if (offsetNumber > limit)
+ lower = (Offset) (((char *) (&((PageHeader)page)->pd_linp[offsetNumber])) - ((char *) page));
+ else if (offsetNumber == limit || shuffled == true)
+ lower = ((PageHeader)page)->pd_lower + sizeof (ItemIdData);
+ else
+ lower = ((PageHeader)page)->pd_lower;
+
+ alignedSize = DOUBLEALIGN(size);
+
+ upper = ((PageHeader)page)->pd_upper - alignedSize;
+
+ if (lower > upper) {
+ return (InvalidOffsetNumber);
+ }
+
+ itemId = &((PageHeader)page)->pd_linp[offsetNumber - 1];
+ (*itemId).lp_off = upper;
+ (*itemId).lp_len = size;
+ (*itemId).lp_flags = flags;
+ memmove((char *)page + upper, item, size);
+ ((PageHeader)page)->pd_lower = lower;
+ ((PageHeader)page)->pd_upper = upper;
+
+ return (offsetNumber);
+}
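+
+/*
+ * A hedged usage sketch (variable names here are illustrative): callers
+ * usually let PageAddItem pick the slot and only check for failure:
+ *
+ *	offnum = PageAddItem(page, (Item) tuple, tupleSize,
+ *			     InvalidOffsetNumber, LP_USED);
+ *	if (offnum == InvalidOffsetNumber)
+ *		elog(WARN, "PageAddItem: no room on page");
+ *
+ * Passing a valid offsetNumber instead requests that exact slot, subject
+ * to the shuffle/overwrite mode notes above.
+ */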
+
+/*
+ * PageGetTempPage --
+ * Get a temporary page in local memory for special processing
+ */
+Page
+PageGetTempPage(Page page, Size specialSize)
+{
+ Size pageSize;
+ Size size;
+ Page temp;
+ PageHeader thdr;
+
+ pageSize = PageGetPageSize(page);
+
+ if ((temp = (Page) palloc(pageSize)) == (Page) NULL)
+ elog(FATAL, "Cannot allocate %d bytes for temp page.", pageSize);
+ thdr = (PageHeader) temp;
+
+ /* copy old page in */
+ memmove(temp, page, pageSize);
+
+ /* clear out the middle */
+ size = (pageSize - sizeof(PageHeaderData)) + sizeof(ItemIdData);
+ size -= DOUBLEALIGN(specialSize);
+ memset((char *) &(thdr->pd_linp[0]), 0, size);
+
+ /* set high, low water marks */
+ thdr->pd_lower = sizeof (PageHeaderData) - sizeof (ItemIdData);
+ thdr->pd_upper = pageSize - DOUBLEALIGN(specialSize);
+
+ return (temp);
+}
+
+/*
+ * PageRestoreTempPage --
+ * Copy temporary page back to permanent page after special processing
+ * and release the temporary page.
+ */
+void
+PageRestoreTempPage(Page tempPage, Page oldPage)
+{
+ Size pageSize;
+
+ pageSize = PageGetPageSize(tempPage);
+ memmove((char *) oldPage, (char *) tempPage, pageSize);
+
+ pfree(tempPage);
+}
+
+/*
+ * PageGetMaxOffsetNumber --
+ * Returns the maximum offset number used by the given page.
+ *
+ * NOTE: The offset is invalid if the page is empty.
+ * Test whether PageIsEmpty before calling this routine
+ * and/or using its return value.
+ */
+OffsetNumber
+PageGetMaxOffsetNumber(Page page)
+{
+ LocationIndex low;
+ OffsetNumber i;
+
+ low = ((PageHeader) page)->pd_lower;
+ i = (low - (sizeof(PageHeaderData) - sizeof(ItemIdData)))
+ / sizeof(ItemIdData);
+
+ return(i);
+}
+
+/* ----------------
+ * itemid stuff for PageRepairFragmentation
+ * ----------------
+ */
+struct itemIdSortData {
+ int offsetindex; /* linp array index */
+ ItemIdData itemiddata;
+};
+
+static int
+itemidcompare(struct itemIdSortData *itemidp1, struct itemIdSortData *itemidp2)
+{
+ if (itemidp1->itemiddata.lp_off == itemidp2->itemiddata.lp_off)
+ return(0);
+ else if (itemidp1->itemiddata.lp_off < itemidp2->itemiddata.lp_off)
+ return(1);
+ else
+ return(-1);
+}
+
+/*
+ * PageRepairFragmentation --
+ * Frees fragmented space on a page.
+ */
+void
+PageRepairFragmentation(Page page)
+{
+ int i;
+ struct itemIdSortData *itemidbase, *itemidptr;
+ ItemId lp;
+ int nline, nused;
+ int itemidcompare();
+ Offset upper;
+ Size alignedSize;
+
+ nline = (int16) PageGetMaxOffsetNumber(page);
+ nused = 0;
+ for (i=0; i<nline; i++) {
+ lp = ((PageHeader)page)->pd_linp + i;
+ if ((*lp).lp_flags & LP_USED)
+ nused++;
+ }
+
+ if (nused == 0) {
+ for (i=0; i<nline; i++) {
+ lp = ((PageHeader)page)->pd_linp + i;
+ if ((*lp).lp_len > 0) /* unused, but allocated */
+ (*lp).lp_len = 0; /* indicate unused & deallocated */
+ }
+
+ ((PageHeader)page)->pd_upper = ((PageHeader)page)->pd_special;
+ } else { /* nused != 0 */
+ itemidbase = (struct itemIdSortData *)
+ palloc(sizeof(struct itemIdSortData) * nused);
+ memset((char *) itemidbase, 0, sizeof(struct itemIdSortData) * nused);
+ itemidptr = itemidbase;
+ for (i=0; i<nline; i++) {
+ lp = ((PageHeader)page)->pd_linp + i;
+ if ((*lp).lp_flags & LP_USED) {
+ itemidptr->offsetindex = i;
+ itemidptr->itemiddata = *lp;
+ itemidptr++;
+ } else {
+ if ((*lp).lp_len > 0) /* unused, but allocated */
+ (*lp).lp_len = 0; /* indicate unused & deallocated */
+ }
+ }
+
+ /* sort itemIdSortData array...*/
+ pg_qsort((char *) itemidbase, nused, sizeof(struct itemIdSortData),
+ (void*) itemidcompare);
+
+ /* compactify page */
+ ((PageHeader)page)->pd_upper = ((PageHeader)page)->pd_special;
+
+ for (i=0, itemidptr = itemidbase; i<nused; i++, itemidptr++) {
+ lp = ((PageHeader)page)->pd_linp + itemidptr->offsetindex;
+ alignedSize = DOUBLEALIGN((*lp).lp_len);
+ upper = ((PageHeader)page)->pd_upper - alignedSize;
+ memmove((char *) page + upper,
+ (char *)page + (*lp).lp_off,
+ (*lp).lp_len);
+ (*lp).lp_off = upper;
+ ((PageHeader)page)->pd_upper = upper;
+ }
+
+ pfree(itemidbase);
+ }
+}
+
+/*
+ * PageGetFreeSpace --
+ * Returns the size of the free (allocatable) space on a page.
+ */
+Size
+PageGetFreeSpace(Page page)
+{
+ Size space;
+
+
+ space = ((PageHeader)page)->pd_upper - ((PageHeader)page)->pd_lower;
+
+ if (space < sizeof (ItemIdData)) {
+ return (0);
+ }
+ space -= sizeof (ItemIdData); /* XXX not always true */
+
+ return (space);
+}
+
+/*
+ * PageManagerModeSet --
+ *
+ * Sets mode to either: ShufflePageManagerMode (the default) or
+ * OverwritePageManagerMode. For use by access methods code
+ * for determining semantics of PageAddItem when the offsetNumber
+ * argument is passed in.
+ */
+void
+PageManagerModeSet(PageManagerMode mode)
+{
+ if (mode == ShufflePageManagerMode)
+ PageManagerShuffle = true;
+ else if (mode == OverwritePageManagerMode)
+ PageManagerShuffle = false;
+}
+
+/*
+ *----------------------------------------------------------------
+ * PageIndexTupleDelete
+ *----------------------------------------------------------------
+ *
+ * This routine does the work of removing a tuple from an index page.
+ */
+void
+PageIndexTupleDelete(Page page, OffsetNumber offnum)
+{
+ PageHeader phdr;
+ char *addr;
+ ItemId tup;
+ Size size;
+ char *locn;
+ int nbytes;
+ int offidx;
+
+ phdr = (PageHeader) page;
+
+ /* change offset number to offset index */
+ offidx = offnum - 1;
+
+ tup = PageGetItemId(page, offnum);
+ size = ItemIdGetLength(tup);
+ size = DOUBLEALIGN(size);
+
+ /* location of deleted tuple data */
+ locn = (char *) (page + ItemIdGetOffset(tup));
+
+ /*
+ * First, we want to get rid of the pd_linp entry for the index
+ * tuple. We copy all subsequent linp's back one slot in the
+ * array.
+ */
+
+ nbytes = phdr->pd_lower -
+ ((char *)&phdr->pd_linp[offidx + 1] - (char *) phdr);
+ memmove((char *) &(phdr->pd_linp[offidx]),
+ (char *) &(phdr->pd_linp[offidx + 1]),
+ nbytes);
+
+ /*
+ * Now move everything between the old upper bound (beginning of tuple
+ * space) and the beginning of the deleted tuple forward, so that
+ * space in the middle of the page is left free. If we've just deleted
+ * the tuple at the beginning of tuple space, then there's no need
+ * to do the copy (and bcopy on some architectures SEGV's if asked
+ * to move zero bytes).
+ */
+
+ /* beginning of tuple space */
+ addr = (char *) (page + phdr->pd_upper);
+
+ if (locn != addr)
+ memmove(addr + size, addr, (int) (locn - addr));
+
+ /* adjust free space boundary pointers */
+ phdr->pd_upper += size;
+ phdr->pd_lower -= sizeof (ItemIdData);
+
+ /* finally, we need to adjust the linp entries that remain */
+ if (!PageIsEmpty(page))
+ PageIndexTupleDeleteAdjustLinePointers(phdr, locn, size);
+}
+
+/*
+ *----------------------------------------------------------------
+ * PageIndexTupleDeleteAdjustLinePointers
+ *----------------------------------------------------------------
+ *
+ * Once the line pointers and tuple data have been shifted around
+ * on the page, we need to go down the line pointer vector and
+ * adjust pointers to reflect new locations. Anything that used
+ * to be before the deleted tuple's data was moved forward by the
+ * size of the deleted tuple.
+ *
+ * This routine does the work of adjusting the line pointers.
+ * Location is where the tuple data used to lie; size is how
+ * much space it occupied. We assume that size has been aligned
+ * as required by the time we get here.
+ *
+ * This routine should never be called on an empty page.
+ */
+void
+PageIndexTupleDeleteAdjustLinePointers(PageHeader phdr,
+ char *location,
+ Size size)
+{
+ int i;
+
+ /* location is an index into the page... */
+ location -= (int) phdr;
+
+ for (i = PageGetMaxOffsetNumber((Page) phdr) - 1; i >= 0; i--) {
+ if (phdr->pd_linp[i].lp_off <= (unsigned) location) {
+ phdr->pd_linp[i].lp_off += size;
+ }
+ }
+}
diff --git a/src/backend/storage/page/itemptr.c b/src/backend/storage/page/itemptr.c
new file mode 100644
index 00000000000..9d063374038
--- /dev/null
+++ b/src/backend/storage/page/itemptr.c
@@ -0,0 +1,40 @@
+/*-------------------------------------------------------------------------
+ *
+ * itemptr.c--
+ * POSTGRES disk item pointer code.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/storage/page/itemptr.c,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#include "storage/block.h"
+#include "storage/off.h"
+#include "storage/itemptr.h"
+#include "storage/bufpage.h"
+
+/*
+ * ItemPointerEquals --
+ * Returns true if both item pointers point to the same item,
+ * otherwise returns false.
+ *
+ * Note:
+ * Assumes that the disk item pointers are not NULL.
+ */
+bool
+ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2)
+{
+ if (ItemPointerGetBlockNumber(pointer1) ==
+ ItemPointerGetBlockNumber(pointer2) &&
+ ItemPointerGetOffsetNumber(pointer1) ==
+ ItemPointerGetOffsetNumber(pointer2))
+ return(true);
+ else
+ return(false);
+}
+
diff --git a/src/backend/storage/pagenum.h b/src/backend/storage/pagenum.h
new file mode 100644
index 00000000000..f32624c226d
--- /dev/null
+++ b/src/backend/storage/pagenum.h
@@ -0,0 +1,33 @@
+/*-------------------------------------------------------------------------
+ *
+ * pagenum.h--
+ * POSTGRES page number definitions.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: pagenum.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PAGENUM_H
+#define PAGENUM_H
+
+#include "c.h"
+#include "storage/page.h"
+
+typedef uint16 PageNumber;
+
+typedef uint32 LogicalPageNumber;
+
+#define InvalidLogicalPageNumber 0
+
+/*
+ * LogicalPageNumberIsValid --
+ * True iff the logical page number is valid.
+ */
+#define LogicalPageNumberIsValid(pageNumber) \
+ ((bool)((pageNumber) != InvalidLogicalPageNumber))
+
+
+#endif /* PAGENUM_H */
diff --git a/src/backend/storage/pos.h b/src/backend/storage/pos.h
new file mode 100644
index 00000000000..9a7f603416b
--- /dev/null
+++ b/src/backend/storage/pos.h
@@ -0,0 +1,64 @@
+/*-------------------------------------------------------------------------
+ *
+ * pos.h--
+ * POSTGRES "position" definitions.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: pos.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef POS_H
+#define POS_H
+
+#include "c.h"
+
+/*
+ * a 'position' used to be <pagenumber, offset> in postgres. this has
+ * been changed to just <offset> as the notion of having multiple pages
+ * within a block has been removed.
+ *
+ * the 'offset' abstraction is somewhat confusing. it is NOT a byte
+ * offset within the page; instead, it is an offset into the line
+ * pointer array contained on every page that stores (heap or index)
+ * tuples.
+ */
+typedef bits16 PositionIdData;
+typedef PositionIdData *PositionId;
+
+/* ----------------
+ * support macros
+ * ----------------
+ */
+
+/*
+ * PositionIdIsValid --
+ * True iff the position identifier is valid.
+ */
+#define PositionIdIsValid(positionId) \
+ PointerIsValid(positionId)
+
+/*
+ * PositionIdSetInvalid --
+ * Make an invalid position.
+ */
+#define PositionIdSetInvalid(positionId) \
+ *(positionId) = (bits16) 0
+
+/*
+ * PositionIdSet --
+ * Sets a position identifier to the specified value.
+ */
+#define PositionIdSet(positionId, offsetNumber) \
+ *(positionId) = (offsetNumber)
+
+/*
+ * PositionIdGetOffsetNumber --
+ * Retrieve the offset number from a position identifier.
+ */
+#define PositionIdGetOffsetNumber(positionId) \
+ ((OffsetNumber) *(positionId))
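+
+/* For illustration, setting and reading back a position is simply:
+ *
+ *	PositionIdData pos;
+ *
+ *	PositionIdSet(&pos, offnum);
+ *	... PositionIdGetOffsetNumber(&pos) now yields offnum ...
+ */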
+
+#endif /* POS_H */
diff --git a/src/backend/storage/proc.h b/src/backend/storage/proc.h
new file mode 100644
index 00000000000..1ec89dedc2d
--- /dev/null
+++ b/src/backend/storage/proc.h
@@ -0,0 +1,127 @@
+/*-------------------------------------------------------------------------
+ *
+ * proc.h--
+ *
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: proc.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef _PROC_H_
+#define _PROC_H_
+
+#include "storage/ipc.h"
+#include "storage/lock.h"
+#ifndef WIN32
+#include <sys/sem.h>
+#else
+/* This is because WIN32 already defines PROC */
+#define PROC PGL_PROC
+#endif /* WIN32 */
+#include "storage/shmem.h"
+
+
+typedef struct {
+ int sleeplock;
+ int semNum;
+ IpcSemaphoreId semId;
+ IpcSemaphoreKey semKey;
+} SEMA;
+
+/*
+ * Each backend has:
+ */
+typedef struct proc {
+
+ /* proc->links MUST BE THE FIRST ELEMENT OF STRUCT (see ProcWakeup()) */
+
+ SHM_QUEUE links; /* proc can be waiting for one event(lock) */
+ SEMA sem; /* ONE semaphore to sleep on */
+ int errType; /* error code tells why we woke up */
+
+ int procId; /* unique number for this structure
+ * NOT unique per backend, these things
+ * are reused after the backend dies.
+ */
+
+ int critSects; /* If critSects > 0, we are in sensitive
+ * routines that cannot be recovered when
+ * the process fails.
+ */
+
+ int prio; /* priority for sleep queue */
+
+ TransactionId xid; /* transaction currently being executed
+ * by this proc
+ */
+
+ LOCK * waitLock; /* Lock we're sleeping on */
+ int token; /* info for proc wakeup routines */
+ int pid; /* This procs process id */
+ short sLocks[MAX_SPINS]; /* Spin lock stats */
+ SHM_QUEUE lockQueue; /* locks associated with current transaction */
+} PROC;
+
+
+/*
+ * MAX_PROC_SEMS is the maximum number of per-process semaphores (those used
+ * by the lock mgr) we can keep track of. PROC_NSEMS_PER_SET is the number
+ * of semaphores in each (sys-V) semaphore set allocated. (Be careful not
+ * to set it greater than 32. Otherwise, the bitmap will overflow.)
+ */
+#define MAX_PROC_SEMS 128
+#define PROC_NSEMS_PER_SET 16
+
+typedef struct procglobal {
+ SHMEM_OFFSET freeProcs;
+ int numProcs;
+ IPCKey currKey;
+ int32 freeSemMap[MAX_PROC_SEMS/PROC_NSEMS_PER_SET];
+} PROC_HDR;
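+
+/*
+ * With the defaults above, freeSemMap is MAX_PROC_SEMS/PROC_NSEMS_PER_SET
+ * = 128/16 = 8 words, each of which uses its low PROC_NSEMS_PER_SET bits
+ * to track the semaphores of one (sys-V) semaphore set.
+ */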
+
+extern PROC *MyProc;
+
+#define PROC_INCR_SLOCK(lock) if (MyProc) (MyProc->sLocks[(lock)])++
+#define PROC_DECR_SLOCK(lock) if (MyProc) (MyProc->sLocks[(lock)])--
+
+/*
+ * flags explaining why process woke up
+ */
+#define NO_ERROR 0
+#define ERR_TIMEOUT 1
+#define ERR_BUFFER_IO 2
+
+#define MAX_PRIO 50
+#define MIN_PRIO (-1)
+
+extern SPINLOCK ProcStructLock;
+
+/*
+ * Function Prototypes
+ */
+extern void InitProcess(IPCKey key);
+extern void ProcReleaseLocks(void);
+extern bool ProcRemove(int pid);
+/* extern bool ProcKill(int exitStatus, int pid); */
+/* make static in storage/lmgr/proc.c -- jolly */
+
+extern PROC_QUEUE *ProcQueueAlloc(char *name);
+extern void ProcQueueInit(PROC_QUEUE *queue);
+extern int ProcSleep(PROC_QUEUE *queue, SPINLOCK spinlock, int token,
+ int prio, LOCK *lock);
+extern PROC *ProcWakeup(PROC *proc, int errType);
+extern int ProcGetId(void);
+extern int ProcLockWakeup(PROC_QUEUE *queue, char * ltable, char * lock);
+extern void ProcAddLock(SHM_QUEUE *elem);
+#if defined(PORTNAME_linux)
+extern int HandleDeadLock(int);
+#else
+extern int HandleDeadLock(void);
+#endif
+extern void ProcReleaseSpins(PROC *proc);
+extern void ProcFreeAllSemaphores(void);
+
+#endif /* _PROC_H_ */
diff --git a/src/backend/storage/shmem.h b/src/backend/storage/shmem.h
new file mode 100644
index 00000000000..a00b33581a4
--- /dev/null
+++ b/src/backend/storage/shmem.h
@@ -0,0 +1,104 @@
+/*-------------------------------------------------------------------------
+ *
+ * shmem.h--
+ * shared memory management structures
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: shmem.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SHMEM_H
+#define SHMEM_H
+
+#include "storage/spin.h" /* for SPINLOCK */
+#include "utils/hsearch.h" /* for HTAB */
+
+/* The shared memory region can start at a different address
+ * in every process. Shared memory "pointers" are actually
+ * offsets relative to the start of the shared memory region(s).
+ */
+typedef unsigned long SHMEM_OFFSET;
+#define INVALID_OFFSET (-1)
+#define BAD_LOCATION (-1)
+
+/* start of the lowest shared memory region. For now, assume that
+ * there is only one shared memory region
+ */
+extern SHMEM_OFFSET ShmemBase;
+
+
+/* coerce an offset into a pointer in this process's address space */
+#define MAKE_PTR(xx_offs)\
+ (ShmemBase+((unsigned long)(xx_offs)))
+
+/* coerce a pointer into a shmem offset */
+#define MAKE_OFFSET(xx_ptr)\
+ (SHMEM_OFFSET) (((unsigned long)(xx_ptr))-ShmemBase)
+
+#define SHM_PTR_VALID(xx_ptr)\
+ (((unsigned long)xx_ptr) > ShmemBase)
+
+/* cannot have an offset to ShmemFreeStart (offset 0) */
+#define SHM_OFFSET_VALID(xx_offs)\
+ ((xx_offs != 0) && (xx_offs != INVALID_OFFSET))
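+
+/*
+ * Illustrative use (see proc.c and shmqueue.c for the real callers):
+ * shared structures store offsets and convert at the point of use,
+ * roughly:
+ *
+ *	PROC *proc = (PROC *) MAKE_PTR(queue->links.prev);
+ *	elem->next = MAKE_OFFSET(nextElem);
+ */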
+
+
+extern SPINLOCK ShmemLock;
+extern SPINLOCK BindingLock;
+
+/* shmemqueue.c */
+typedef struct SHM_QUEUE {
+ SHMEM_OFFSET prev;
+ SHMEM_OFFSET next;
+} SHM_QUEUE;
+
+/* shmem.c */
+extern void ShmemBindingTabReset();
+extern void ShmemCreate(unsigned int key, unsigned int size);
+extern int InitShmem(unsigned int key, unsigned int size);
+extern long *ShmemAlloc(unsigned long size);
+extern int ShmemIsValid(unsigned long addr);
+extern HTAB *ShmemInitHash(char *name, long init_size, long max_size,
+ HASHCTL *infoP, int hash_flags);
+extern bool ShmemPIDLookup(int pid, SHMEM_OFFSET* locationPtr);
+extern SHMEM_OFFSET ShmemPIDDestroy(int pid);
+extern long *ShmemInitStruct(char *name, unsigned long size,
+ bool *foundPtr);
+
+
+typedef int TableID;
+
+/* size constants for the binding table */
+ /* max size of data structure string name */
+#define BTABLE_KEYSIZE (50)
+ /* data in binding table hash bucket */
+#define BTABLE_DATASIZE (sizeof(BindingEnt) - BTABLE_KEYSIZE)
+ /* maximum size of the binding table */
+#define BTABLE_SIZE (100)
+
+/* this is a hash bucket in the binding table */
+typedef struct {
+ char key[BTABLE_KEYSIZE]; /* string name */
+ unsigned long location; /* location in shared mem */
+ unsigned long size; /* numbytes allocated for the
+ * structure
+ */
+} BindingEnt;
+
+/*
+ * prototypes for functions in shmqueue.c
+ */
+extern void SHMQueueInit(SHM_QUEUE *queue);
+extern bool SHMQueueIsDetached(SHM_QUEUE *queue);
+extern void SHMQueueElemInit(SHM_QUEUE *queue);
+extern void SHMQueueDelete(SHM_QUEUE *queue);
+extern void SHMQueueInsertHD(SHM_QUEUE *queue, SHM_QUEUE *elem);
+extern void SHMQueueInsertTL(SHM_QUEUE *queue, SHM_QUEUE *elem);
+extern void SHMQueueFirst(SHM_QUEUE *queue, Pointer *nextPtrPtr,
+ SHM_QUEUE *nextQueue);
+extern bool SHMQueueEmpty(SHM_QUEUE *queue);
+
+#endif /* SHMEM_H */
diff --git a/src/backend/storage/sinval.h b/src/backend/storage/sinval.h
new file mode 100644
index 00000000000..036597dbb7a
--- /dev/null
+++ b/src/backend/storage/sinval.h
@@ -0,0 +1,33 @@
+/*-------------------------------------------------------------------------
+ *
+ * sinval.h--
+ * POSTGRES shared cache invalidation communication definitions.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: sinval.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SINVAL_H
+#define SINVAL_H
+
+#include "c.h"
+#include "storage/spin.h"
+#include "storage/ipc.h"
+#include "storage/itemptr.h"
+#include "storage/backendid.h"
+
+extern SPINLOCK SInvalLock;
+
+extern void CreateSharedInvalidationState(IPCKey key);
+extern void AttachSharedInvalidationState(IPCKey key);
+extern void InitSharedInvalidationState();
+extern void RegisterSharedInvalid(int cacheId, Index hashIndex,
+ ItemPointer pointer);
+extern void InvalidateSharedInvalid(void (*invalFunction)(),
+ void (*resetFunction)());
+
+
+#endif /* SINVAL_H */
diff --git a/src/backend/storage/sinvaladt.h b/src/backend/storage/sinvaladt.h
new file mode 100644
index 00000000000..06029978980
--- /dev/null
+++ b/src/backend/storage/sinvaladt.h
@@ -0,0 +1,126 @@
+/*-------------------------------------------------------------------------
+ *
+ * sinvaladt.h--
+ * POSTGRES shared cache invalidation segment definitions.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: sinvaladt.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SINVALADT_H
+#define SINVALADT_H
+
+#include "postgres.h" /* XXX */
+
+#include "storage/ipc.h"
+#include "storage/itemptr.h"
+#include "storage/sinval.h"
+
+/*
+ * The structure of the shared cache invalidation segment
+ *
+ */
+/*
+A------------- Header info --------------
+ criticalSectionSemaphoreId
+ generalSemaphoreId
+ startEntrySection (offset a)
+ endEntrySection (offset a + b)
+ startFreeSpace (offset relative to B)
+ startEntryChain (offset relative to B)
+ endEntryChain (offset relative to B)
+ numEntries
+ maxNumEntries
+ procState[MaxBackendId] --> limit
+ resetState (bool)
+a tag (POSTID)
+B------------- Start entry section -------
+ SISegEntry --> entryData --> ... (see SharedInvalidData!)
+ isfree (bool)
+ next (offset to next entry in chain )
+b .... (dynamically growing down)
+C----------------End shared segment -------
+
+*/
+
+/* Parameters (configurable) *******************************************/
+#define MaxBackendId 32 /* maximum number of backends */
+#define MAXNUMMESSAGES 1000 /* maximum number of messages in seg*/
+
+
+#define InvalidOffset 1000000000 /* an invalid offset (end of chain) */
+
+typedef struct ProcState {
+ int limit; /* the number of read messages */
+ bool resetState; /* true, if backend has to reset its state */
+ int tag; /* special tag, received from the postmaster */
+} ProcState;
+
+
+typedef struct SISeg {
+ IpcSemaphoreId criticalSectionSemaphoreId; /* semaphore id */
+ IpcSemaphoreId generalSemaphoreId; /* semaphore id */
+ Offset startEntrySection; /* (offset a) */
+ Offset endEntrySection; /* (offset a + b) */
+ Offset startFreeSpace; /* (offset relative to B) */
+ Offset startEntryChain; /* (offset relative to B) */
+ Offset endEntryChain; /* (offset relative to B) */
+ int numEntries;
+ int maxNumEntries;
+ ProcState procState[MaxBackendId]; /* reflects the invalidation state */
+ /* here starts the entry section, controlled by offsets */
+} SISeg;
+#define SizeSISeg sizeof(SISeg)
+
+typedef struct SharedInvalidData {
+ int cacheId; /* XXX */
+ Index hashIndex;
+ ItemPointerData pointerData;
+} SharedInvalidData;
+
+typedef SharedInvalidData *SharedInvalid;
+
+
+typedef struct SISegEntry {
+ SharedInvalidData entryData; /* the message data */
+ bool isfree; /* entry free? */
+ Offset next; /* offset to next entry*/
+} SISegEntry;
+
+#define SizeOfOneSISegEntry sizeof(SISegEntry)
+
+typedef struct SISegOffsets {
+ Offset startSegment; /* always 0 (for now) */
+ Offset offsetToFirstEntry; /* A + a = B */
+ Offset offsetToEndOfSegemnt; /* A + a + b */
+} SISegOffsets;
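+
+/*
+ * Roughly speaking (the exact layout is computed in sinvaladt.c), the
+ * whole segment is the fixed header plus room for the message entries:
+ *
+ *	segment size ~= SizeSISeg + MAXNUMMESSAGES * SizeOfOneSISegEntry
+ */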
+
+
+/****************************************************************************/
+/* synchronization of the shared buffer access */
+/* access to the buffer is synchronized by the lock manager !! */
+/****************************************************************************/
+
+#define SI_LockStartValue 255
+#define SI_SharedLock (-1)
+#define SI_ExclusiveLock (-255)
+
+extern SISeg *shmInvalBuffer;
+
+/*
+ * prototypes for functions in sinvaladt.c
+ */
+extern int SIBackendInit(SISeg *segInOutP);
+extern int SISegmentInit(bool killExistingSegment, IPCKey key);
+
+extern bool SISetDataEntry(SISeg *segP, SharedInvalidData *data);
+extern void SISetProcStateInvalid(SISeg *segP);
+extern bool SIDelDataEntry(SISeg *segP);
+extern void SIReadEntryData(SISeg *segP, int backendId,
+ void (*invalFunction)(), void (*resetFunction)());
+extern void SIDelExpiredDataEntries(SISeg *segP);
+
+#endif /* SINVALADT_H */
diff --git a/src/backend/storage/smgr.h b/src/backend/storage/smgr.h
new file mode 100644
index 00000000000..2e91938290a
--- /dev/null
+++ b/src/backend/storage/smgr.h
@@ -0,0 +1,84 @@
+/*-------------------------------------------------------------------------
+ *
+ * smgr.h--
+ * storage manager switch public interface declarations.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: smgr.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SMGR_H
+#define SMGR_H
+
+#include "utils/rel.h"
+#include "storage/spin.h" /* for SPINLOCK */
+
+#define SM_FAIL 0
+#define SM_SUCCESS 1
+
+#define DEFAULT_SMGR 0
+
+extern int smgrinit(void);
+extern void smgrshutdown(int dummy);
+extern int smgrcreate(int16 which, Relation reln);
+extern int smgrunlink(int16 which, Relation reln);
+extern int smgrextend(int16 which, Relation reln, char *buffer);
+extern int smgropen(int16 which, Relation reln);
+extern int smgrclose(int16 which, Relation reln);
+extern int smgrread(int16 which, Relation reln, BlockNumber blocknum,
+ char *buffer);
+extern int smgrwrite(int16 which, Relation reln, BlockNumber blocknum,
+ char *buffer);
+extern int smgrflush(int16 which, Relation reln, BlockNumber blocknum,
+ char *buffer);
+extern int smgrblindwrt(int16 which, char *dbname, char *relname, Oid dbid,
+ Oid relid, BlockNumber blkno, char *buffer);
+extern int smgrnblocks(int16 which, Relation reln);
+extern int smgrcommit(void);
+extern int smgrabort(void);
+extern bool smgriswo(int16 smgrno);
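+
+/*
+ * Callers go through the switch rather than a specific manager; an
+ * illustrative (not verbatim) read through the dispatcher looks like:
+ *
+ *	if (smgrread(DEFAULT_SMGR, reln, blocknum, buffer) == SM_FAIL)
+ *		elog(WARN, "cannot read block %d of %s",
+ *		     blocknum, &(reln->rd_rel->relname.data[0]));
+ *
+ * In the backend proper the first argument normally comes from the
+ * relation's storage-manager tag (see the README in this directory)
+ * rather than DEFAULT_SMGR.
+ */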
+
+
+
+/* internals: move me elsewhere -- ay 7/94 */
+
+/* in md.c */
+extern int mdinit(void);
+extern int mdcreate(Relation reln);
+extern int mdunlink(Relation reln);
+extern int mdextend(Relation reln, char *buffer);
+extern int mdopen(Relation reln);
+extern int mdclose(Relation reln);
+extern int mdread(Relation reln, BlockNumber blocknum, char *buffer);
+extern int mdwrite(Relation reln, BlockNumber blocknum, char *buffer);
+extern int mdflush(Relation reln, BlockNumber blocknum, char *buffer);
+extern int mdblindwrt(char *dbstr, char *relstr, Oid dbid, Oid relid,
+ BlockNumber blkno, char *buffer);
+extern int mdnblocks(Relation reln);
+extern int mdcommit(void);
+extern int mdabort(void);
+
+/* mm.c */
+extern SPINLOCK MMCacheLock;
+
+extern int mminit(void);
+extern int mmshutdown(void);
+extern int mmcreate(Relation reln);
+extern int mmunlink(Relation reln);
+extern int mmextend(Relation reln, char *buffer);
+extern int mmopen(Relation reln);
+extern int mmclose(Relation reln);
+extern int mmread(Relation reln, BlockNumber blocknum, char *buffer);
+extern int mmwrite(Relation reln, BlockNumber blocknum, char *buffer);
+extern int mmflush(Relation reln, BlockNumber blocknum, char *buffer);
+extern int mmblindwrt(char *dbstr, char *relstr, Oid dbid, Oid relid,
+ BlockNumber blkno, char *buffer);
+extern int mmnblocks(Relation reln);
+extern int mmcommit(void);
+extern int mmabort(void);
+extern int MMShmemSize(void);
+
+#endif /* SMGR_H */
diff --git a/src/backend/storage/smgr/Makefile.inc b/src/backend/storage/smgr/Makefile.inc
new file mode 100644
index 00000000000..8ff067afbe8
--- /dev/null
+++ b/src/backend/storage/smgr/Makefile.inc
@@ -0,0 +1,14 @@
+#-------------------------------------------------------------------------
+#
+# Makefile.inc--
+# Makefile for storage/smgr
+#
+# Copyright (c) 1994, Regents of the University of California
+#
+#
+# IDENTIFICATION
+# $Header: /cvsroot/pgsql/src/backend/storage/smgr/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $
+#
+#-------------------------------------------------------------------------
+
+SUBSRCS+= md.c mm.c smgr.c smgrtype.c
diff --git a/src/backend/storage/smgr/README b/src/backend/storage/smgr/README
new file mode 100644
index 00000000000..4dbb2dce708
--- /dev/null
+++ b/src/backend/storage/smgr/README
@@ -0,0 +1,40 @@
+# $Header: /cvsroot/pgsql/src/backend/storage/smgr/README,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $
+
+This directory contains the code that supports the Postgres storage manager
+switch and all of the installed storage managers. In released systems,
+the only supported storage manager is the magnetic disk manager. At UC
+Berkeley, the Sony WORM optical disk jukebox and persistent main memory are
+also supported.
+
+As of Postgres Release 3.0, every relation in the system is tagged with the
+storage manager on which it resides. The storage manager switch code turns
+what used to be filesystem operations into operations on the correct store,
+for any given relation.
+
+The files in this directory, and their contents, are
+
+ smgrtype.c Storage manager type -- maps string names to storage manager
+ IDs and provides simple comparison operators. This is the
+ regproc support for type 'smgr' in the system catalogs.
+
+ smgr.c The storage manager switch dispatch code. The routines in
+ this file call the appropriate storage manager to do hardware
+ accesses requested by the backend.
+
+ md.c The magnetic disk storage manager.
+
+ mm.c The persistent main memory storage manager (#undef'ed in
+ tmp/c.h for all distributed systems).
+
+ sj.c The Sony jukebox storage manager and cache management code
+ (#undef'ed in tmp/c.h for all distributed systems). The
+ routines in this file allocate extents, maintain block
+ maps, and guarantee the persistence and coherency of a cache
+ of jukebox blocks on magnetic disk.
+
+ pgjb.c The postgres jukebox interface routines. The routines here
+ handle exclusion on the physical device and translate requests
+ from the storage manager code (sj.c) into jbaccess calls.
+
+ jbaccess.c Access code for the physical Sony jukebox device. This code
+ was swiped from Andy McFadden's jblib.a code at UC Berkeley.
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
new file mode 100644
index 00000000000..31aa1336a86
--- /dev/null
+++ b/src/backend/storage/smgr/md.c
@@ -0,0 +1,697 @@
+/*-------------------------------------------------------------------------
+ *
+ * md.c--
+ * This code manages relations that reside on magnetic disk.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <stdio.h> /* for sprintf() */
+#include <sys/file.h>
+
+#include "postgres.h"
+#include "miscadmin.h" /* for DataDir */
+
+#include "machine.h"
+#include "storage/smgr.h" /* where the declarations go */
+#include "storage/block.h"
+#include "storage/fd.h"
+#include "utils/mcxt.h"
+#include "utils/rel.h"
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "catalog/catalog.h"
+
+#undef DIAGNOSTIC
+
+/*
+ * The magnetic disk storage manager keeps track of open file descriptors
+ * in its own descriptor pool. This happens for two reasons. First, at
+ * transaction boundaries, we walk the list of descriptors and flush
+ * anything that we've dirtied in the current transaction. Second, we
+ * have to support relations of > 4GBytes. In order to do this, we break
+ * relations up into chunks of < 2GBytes and store one chunk in each of
+ * several files that represent the relation.
+ */
+
+typedef struct _MdfdVec {
+ int mdfd_vfd; /* fd number in vfd pool */
+ uint16 mdfd_flags; /* clean, dirty */
+ int mdfd_lstbcnt; /* most recent block count */
+ struct _MdfdVec *mdfd_chain; /* for large relations */
+} MdfdVec;
+
+static int Nfds = 100;
+static MdfdVec *Md_fdvec = (MdfdVec *) NULL;
+static int CurFd = 0;
+static MemoryContext MdCxt;
+
+#define MDFD_DIRTY (uint16) 0x01
+
+#define RELSEG_SIZE 262144 /* (2 ** 31) / 8192 -- 2GB file */
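+
+/*
+ * Illustrative segment arithmetic (the real work happens in _mdfd_getseg,
+ * below): block N of a relation lives in segment file N / RELSEG_SIZE at
+ * block N % RELSEG_SIZE within that file, so segment 0 is the bare
+ * relation file and segments 1, 2, ... carry a ".1", ".2", ... suffix
+ * (see mdunlink below).
+ */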
+
+/* routines declared here */
+static MdfdVec *_mdfd_openseg(Relation reln, int segno, int oflags);
+static MdfdVec *_mdfd_getseg(Relation reln, int blkno, int oflag);
+static int _fdvec_ext(void);
+static BlockNumber _mdnblocks(File file, Size blcksz);
+
+/*
+ * mdinit() -- Initialize private state for magnetic disk storage manager.
+ *
+ * We keep a private table of all file descriptors. Whenever we do
+ * a write to one, we mark it dirty in our table. Whenever we force
+ * changes to disk, we mark the file descriptor clean. At transaction
+ * commit, we force changes to disk for all dirty file descriptors.
+ * This routine allocates and initializes the table.
+ *
+ * Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
+ */
+int
+mdinit()
+{
+ MemoryContext oldcxt;
+
+ MdCxt = (MemoryContext) CreateGlobalMemory("MdSmgr");
+ if (MdCxt == (MemoryContext) NULL)
+ return (SM_FAIL);
+
+ oldcxt = MemoryContextSwitchTo(MdCxt);
+ Md_fdvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec));
+ (void) MemoryContextSwitchTo(oldcxt);
+
+ if (Md_fdvec == (MdfdVec *) NULL)
+ return (SM_FAIL);
+
+ memset(Md_fdvec, 0, Nfds * sizeof(MdfdVec));
+
+ return (SM_SUCCESS);
+}
+
+int
+mdcreate(Relation reln)
+{
+ int fd, vfd;
+ int tmp;
+ char *path;
+ extern bool IsBootstrapProcessingMode();
+
+ path = relpath(&(reln->rd_rel->relname.data[0]));
+ fd = FileNameOpenFile(path, O_RDWR|O_CREAT|O_EXCL, 0600);
+
+ /*
+ * If the file already exists and is empty, we pretend that the
+ * create succeeded. During bootstrap processing, we skip that check,
+ * because pg_time, pg_variable, and pg_log get created before their
+ * .bki file entries are processed.
+ */
+
+    if (fd < 0) {
+	if ((fd = FileNameOpenFile(path, O_RDWR, 0600)) < 0)
+	    return (-1);
+
+	if (!IsBootstrapProcessingMode() &&
+	    FileRead(fd, (char *) &tmp, sizeof(tmp)) != 0) {
+	    FileClose(fd);
+	    return (-1);
+	}
+    }
+
+ if (CurFd >= Nfds) {
+ if (_fdvec_ext() == SM_FAIL)
+ return (-1);
+ }
+
+ Md_fdvec[CurFd].mdfd_vfd = fd;
+ Md_fdvec[CurFd].mdfd_flags = (uint16) 0;
+ Md_fdvec[CurFd].mdfd_chain = (MdfdVec *) NULL;
+ Md_fdvec[CurFd].mdfd_lstbcnt = 0;
+
+ vfd = CurFd++;
+
+ return (vfd);
+}
+
+/*
+ * mdunlink() -- Unlink a relation.
+ */
+int
+mdunlink(Relation reln)
+{
+ int fd;
+ int i;
+ MdfdVec *v, *ov;
+ MemoryContext oldcxt;
+ char fname[20]; /* XXX should have NAMESIZE defined */
+ char tname[20];
+
+    /*
+     * On Windows NT you can't unlink a file while it is open, so we
+     * have to close it first.
+     */
+#ifdef WIN32
+ (void) mdclose(reln);
+#endif /* WIN32 */
+
+
+ memset(fname,0,20);
+ strncpy(fname, RelationGetRelationName(reln)->data, 16);
+
+ if (FileNameUnlink(fname) < 0)
+ return (SM_FAIL);
+
+ /* unlink all the overflow files for large relations */
+ for (i = 1; ; i++) {
+#ifdef WIN32
+ (void) mdclose(reln);
+#endif /* WIN32 */
+ sprintf(tname, "%s.%d", fname, i);
+ if (FileNameUnlink(tname) < 0)
+ break;
+ }
+
+ /* finally, clean out the mdfd vector */
+ fd = RelationGetFile(reln);
+ Md_fdvec[fd].mdfd_flags = (uint16) 0;
+
+ oldcxt = MemoryContextSwitchTo(MdCxt);
+ for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL; ) {
+ ov = v;
+ v = v->mdfd_chain;
+ if (ov != &Md_fdvec[fd])
+ pfree(ov);
+ }
+ Md_fdvec[fd].mdfd_chain = (MdfdVec *) NULL;
+ (void) MemoryContextSwitchTo(oldcxt);
+
+ return (SM_SUCCESS);
+}
+
+/*
+ * mdextend() -- Add a block to the specified relation.
+ *
+ * This routine returns SM_FAIL or SM_SUCCESS, with errno set as
+ * appropriate.
+ */
+int
+mdextend(Relation reln, char *buffer)
+{
+ long pos;
+ int nblocks;
+ MdfdVec *v;
+
+ nblocks = mdnblocks(reln);
+ v = _mdfd_getseg(reln, nblocks, O_CREAT);
+
+ if ((pos = FileSeek(v->mdfd_vfd, 0L, SEEK_END)) < 0)
+ return (SM_FAIL);
+
+ if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
+ return (SM_FAIL);
+
+ /* remember that we did a write, so we can sync at xact commit */
+ v->mdfd_flags |= MDFD_DIRTY;
+
+ /* try to keep the last block count current, though it's just a hint */
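+    /* (a remainder of zero means the last segment just became exactly full) */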
+ if ((v->mdfd_lstbcnt = (++nblocks % RELSEG_SIZE)) == 0)
+ v->mdfd_lstbcnt = RELSEG_SIZE;
+
+#ifdef DIAGNOSTIC
+ if (_mdnblocks(v->mdfd_vfd, BLCKSZ) > RELSEG_SIZE
+ || v->mdfd_lstbcnt > RELSEG_SIZE)
+ elog(FATAL, "segment too big!");
+#endif
+
+ return (SM_SUCCESS);
+}
+
+/*
+ * mdopen() -- Open the specified relation.
+ */
+int
+mdopen(Relation reln)
+{
+ char *path;
+ int fd;
+ int vfd;
+
+ if (CurFd >= Nfds) {
+ if (_fdvec_ext() == SM_FAIL)
+ return (-1);
+ }
+
+ path = relpath(&(reln->rd_rel->relname.data[0]));
+
+ fd = FileNameOpenFile(path, O_RDWR, 0600);
+
+ /* this should only happen during bootstrap processing */
+ if (fd < 0)
+ fd = FileNameOpenFile(path, O_RDWR|O_CREAT|O_EXCL, 0600);
+
+ Md_fdvec[CurFd].mdfd_vfd = fd;
+ Md_fdvec[CurFd].mdfd_flags = (uint16) 0;
+ Md_fdvec[CurFd].mdfd_chain = (MdfdVec *) NULL;
+ Md_fdvec[CurFd].mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);
+
+#ifdef DIAGNOSTIC
+ if (Md_fdvec[CurFd].mdfd_lstbcnt > RELSEG_SIZE)
+ elog(FATAL, "segment too big on relopen!");
+#endif
+
+ vfd = CurFd++;
+
+ return (vfd);
+}
+
+/*
+ * mdclose() -- Close the specified relation.
+ *
+ * Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
+ */
+int
+mdclose(Relation reln)
+{
+ int fd;
+ MdfdVec *v;
+
+ fd = RelationGetFile(reln);
+
+ for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL; v = v->mdfd_chain) {
+
+ /* may be closed already */
+ if (v->mdfd_vfd < 0)
+ continue;
+
+ /*
+ * We sync the file descriptor so that we don't need to reopen it at
+ * transaction commit to force changes to disk.
+ */
+
+ FileSync(v->mdfd_vfd);
+ FileClose(v->mdfd_vfd);
+
+ /* mark this file descriptor as clean in our private table */
+ v->mdfd_flags &= ~MDFD_DIRTY;
+ }
+
+ return (SM_SUCCESS);
+}
+
+/*
+ * mdread() -- Read the specified block from a relation.
+ *
+ * Returns SM_SUCCESS or SM_FAIL.
+ */
+int
+mdread(Relation reln, BlockNumber blocknum, char *buffer)
+{
+ int status;
+ long seekpos;
+ int nbytes;
+ MdfdVec *v;
+
+ v = _mdfd_getseg(reln, blocknum, 0);
+
+ seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
+
+#ifdef DIAGNOSTIC
+ if (seekpos >= BLCKSZ * RELSEG_SIZE)
+ elog(FATAL, "seekpos too big!");
+#endif
+
+ if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) {
+ return (SM_FAIL);
+ }
+
+ status = SM_SUCCESS;
+ if ((nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ) {
+ if (nbytes == 0) {
+ memset(buffer, 0, BLCKSZ);
+ } else {
+ status = SM_FAIL;
+ }
+ }
+
+ return (status);
+}
+
+/*
+ * mdwrite() -- Write the supplied block at the appropriate location.
+ *
+ * Returns SM_SUCCESS or SM_FAIL.
+ */
+int
+mdwrite(Relation reln, BlockNumber blocknum, char *buffer)
+{
+ int status;
+ long seekpos;
+ MdfdVec *v;
+
+ v = _mdfd_getseg(reln, blocknum, 0);
+
+ seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
+#ifdef DIAGNOSTIC
+ if (seekpos >= BLCKSZ * RELSEG_SIZE)
+ elog(FATAL, "seekpos too big!");
+#endif
+
+ if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) {
+ return (SM_FAIL);
+ }
+
+ status = SM_SUCCESS;
+ if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
+ status = SM_FAIL;
+
+ v->mdfd_flags |= MDFD_DIRTY;
+
+ return (status);
+}
+
+/*
+ * mdflush() -- Synchronously write a block to disk.
+ *
+ * This is exactly like mdwrite(), but doesn't return until the file
+ * system buffer cache has been flushed.
+ */
+int
+mdflush(Relation reln, BlockNumber blocknum, char *buffer)
+{
+ int status;
+ long seekpos;
+ MdfdVec *v;
+
+ v = _mdfd_getseg(reln, blocknum, 0);
+
+ seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
+#ifdef DIAGNOSTIC
+ if (seekpos >= BLCKSZ * RELSEG_SIZE)
+ elog(FATAL, "seekpos too big!");
+#endif
+
+ if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) {
+ return (SM_FAIL);
+ }
+
+ /* write and sync the block */
+ status = SM_SUCCESS;
+ if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ
+ || FileSync(v->mdfd_vfd) < 0)
+ status = SM_FAIL;
+
+ /*
+ * By here, the block is written and changes have been forced to stable
+ * storage. Mark the descriptor as clean until the next write, so we
+ * don't sync it again unnecessarily at transaction commit.
+ */
+
+ v->mdfd_flags &= ~MDFD_DIRTY;
+
+ return (status);
+}
+
+/*
+ * mdblindwrt() -- Write a block to disk blind.
+ *
+ * We have to be able to do this using only the name and OID of
+ * the database and relation in which the block belongs. This
+ * is a synchronous write.
+ */
+int
+mdblindwrt(char *dbstr,
+ char *relstr,
+ Oid dbid,
+ Oid relid,
+ BlockNumber blkno,
+ char *buffer)
+{
+ int fd;
+ int segno;
+ long seekpos;
+ int status;
+ char *path;
+ int nchars;
+
+ /* be sure we have enough space for the '.segno', if any */
+ segno = blkno / RELSEG_SIZE;
+ if (segno > 0)
+ nchars = 10;
+ else
+ nchars = 0;
+
+ /* construct the path to the file and open it */
+ if (dbid == (Oid) 0) {
+ path = (char *) palloc(strlen(DataDir) + sizeof(NameData) + 2 + nchars);
+ if (segno == 0)
+ sprintf(path, "%s/%.*s", DataDir, NAMEDATALEN, relstr);
+ else
+ sprintf(path, "%s/%.*s.%d", DataDir, NAMEDATALEN, relstr, segno);
+ } else {
+ path = (char *) palloc(strlen(DataDir) + strlen("/base/") + 2 * sizeof(NameData) + 2 + nchars);
+ if (segno == 0)
+ sprintf(path, "%s/base/%.*s/%.*s", DataDir, NAMEDATALEN,
+ dbstr, NAMEDATALEN, relstr);
+ else
+ sprintf(path, "%s/base/%.*s/%.*s.%d", DataDir, NAMEDATALEN, dbstr,
+ NAMEDATALEN, relstr, segno);
+ }
+
+ if ((fd = open(path, O_RDWR, 0600)) < 0)
+ return (SM_FAIL);
+
+ /* seek to the right spot */
+ seekpos = (long) (BLCKSZ * (blkno % RELSEG_SIZE));
+ if (lseek(fd, seekpos, SEEK_SET) != seekpos) {
+ (void) close(fd);
+ return (SM_FAIL);
+ }
+
+ status = SM_SUCCESS;
+
+ /* write and sync the block */
+ if (write(fd, buffer, BLCKSZ) != BLCKSZ || fsync(fd) < 0)
+ status = SM_FAIL;
+
+ if (close(fd) < 0)
+ status = SM_FAIL;
+
+ pfree(path);
+
+ return (status);
+}
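+
+/*
+ * For illustration, the paths built above take one of these forms, the
+ * '.segno' suffix appearing only for segments after the first:
+ *
+ *	dbid == 0:	<DataDir>/<relname>[.segno]
+ *	otherwise:	<DataDir>/base/<dbname>/<relname>[.segno]
+ */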
+
+/*
+ * mdnblocks() -- Get the number of blocks stored in a relation.
+ *
+ * Returns # of blocks or -1 on error.
+ */
+int
+mdnblocks(Relation reln)
+{
+ int fd;
+ MdfdVec *v;
+ int nblocks;
+ int segno;
+
+ fd = RelationGetFile(reln);
+ v = &Md_fdvec[fd];
+
+#ifdef DIAGNOSTIC
+ if (_mdnblocks(v->mdfd_vfd, BLCKSZ) > RELSEG_SIZE)
+ elog(FATAL, "segment too big in getseg!");
+#endif
+
+ segno = 0;
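+    /*
+     * Walk the segment chain.  A segment holding exactly RELSEG_SIZE blocks
+     * may be followed by another segment, so keep going (opening or creating
+     * the next segment as needed) until we reach one that is only partly
+     * full.
+     */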
+ for (;;) {
+ if (v->mdfd_lstbcnt == RELSEG_SIZE
+ || (nblocks = _mdnblocks(v->mdfd_vfd, BLCKSZ)) == RELSEG_SIZE) {
+
+ v->mdfd_lstbcnt = RELSEG_SIZE;
+ segno++;
+
+ if (v->mdfd_chain == (MdfdVec *) NULL) {
+ v->mdfd_chain = _mdfd_openseg(reln, segno, O_CREAT);
+ if (v->mdfd_chain == (MdfdVec *) NULL)
+ elog(WARN, "cannot count blocks for %.16s -- open failed",
+ RelationGetRelationName(reln));
+ }
+
+ v = v->mdfd_chain;
+ } else {
+ return ((segno * RELSEG_SIZE) + nblocks);
+ }
+ }
+}
+
+/*
+ * mdcommit() -- Commit a transaction.
+ *
+ * All changes to magnetic disk relations must be forced to stable
+ * storage. This routine makes a pass over the private table of
+ * file descriptors. Any descriptors to which we have done writes,
+ * but not synced, are synced here.
+ *
+ * Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
+ */
+int
+mdcommit()
+{
+ int i;
+ MdfdVec *v;
+
+ for (i = 0; i < CurFd; i++) {
+ for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain) {
+ if (v->mdfd_flags & MDFD_DIRTY) {
+ if (FileSync(v->mdfd_vfd) < 0)
+ return (SM_FAIL);
+
+ v->mdfd_flags &= ~MDFD_DIRTY;
+ }
+ }
+ }
+
+ return (SM_SUCCESS);
+}
+
+/*
+ * mdabort() -- Abort a transaction.
+ *
+ * Changes need not be forced to disk at transaction abort. We mark
+ * all file descriptors as clean here. Always returns SM_SUCCESS.
+ */
+int
+mdabort()
+{
+ int i;
+ MdfdVec *v;
+
+ for (i = 0; i < CurFd; i++) {
+ for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain) {
+ v->mdfd_flags &= ~MDFD_DIRTY;
+ }
+ }
+
+ return (SM_SUCCESS);
+}
+
+/*
+ * _fdvec_ext() -- Extend the md file descriptor vector.
+ *
+ * The vector is doubled in size and the existing entries are copied
+ * into the new space, so that it can hold more open relations.
+ */
+static int
+_fdvec_ext()
+{
+ MdfdVec *nvec;
+ MemoryContext oldcxt;
+
+ Nfds *= 2;
+
+ oldcxt = MemoryContextSwitchTo(MdCxt);
+
+ nvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec));
+ memset(nvec, 0, Nfds * sizeof(MdfdVec));
+ memmove(nvec, (char *) Md_fdvec, (Nfds / 2) * sizeof(MdfdVec));
+ pfree(Md_fdvec);
+
+ (void) MemoryContextSwitchTo(oldcxt);
+
+ Md_fdvec = nvec;
+
+ return (SM_SUCCESS);
+}
+
+static MdfdVec *
+_mdfd_openseg(Relation reln, int segno, int oflags)
+{
+ MemoryContext oldcxt;
+ MdfdVec *v;
+ int fd;
+ bool dofree;
+ char *path, *fullpath;
+
+ /* be sure we have enough space for the '.segno', if any */
+ path = relpath(RelationGetRelationName(reln)->data);
+
+ dofree = false;
+ if (segno > 0) {
+ dofree = true;
+ fullpath = (char *) palloc(strlen(path) + 12);
+ sprintf(fullpath, "%s.%d", path, segno);
+ } else
+ fullpath = path;
+
+ /* open the file */
+ fd = PathNameOpenFile(fullpath, O_RDWR|oflags, 0600);
+
+ if (dofree)
+ pfree(fullpath);
+
+ if (fd < 0)
+ return ((MdfdVec *) NULL);
+
+ /* allocate an mdfdvec entry for it */
+ oldcxt = MemoryContextSwitchTo(MdCxt);
+ v = (MdfdVec *) palloc(sizeof(MdfdVec));
+ (void) MemoryContextSwitchTo(oldcxt);
+
+ /* fill the entry */
+ v->mdfd_vfd = fd;
+ v->mdfd_flags = (uint16) 0;
+ v->mdfd_chain = (MdfdVec *) NULL;
+ v->mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);
+
+#ifdef DIAGNOSTIC
+ if (v->mdfd_lstbcnt > RELSEG_SIZE)
+ elog(FATAL, "segment too big on open!");
+#endif
+
+ /* all done */
+ return (v);
+}
+
+static MdfdVec *
+_mdfd_getseg(Relation reln, int blkno, int oflag)
+{
+ MdfdVec *v;
+ int segno;
+ int fd;
+ int i;
+
+ fd = RelationGetFile(reln);
+ if (fd < 0) {
+ if ((fd = mdopen(reln)) < 0)
+ elog(WARN, "cannot open relation %.16s",
+ RelationGetRelationName(reln));
+ reln->rd_fd = fd;
+ }
+
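+    /*
+     * Walk the chain of segment descriptors, opening (or, when O_CREAT is
+     * included in oflag, creating) further segment files as needed, until
+     * we reach the segment that contains blkno.
+     */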
+ for (v = &Md_fdvec[fd], segno = blkno / RELSEG_SIZE, i = 1;
+ segno > 0;
+ i++, segno--) {
+
+ if (v->mdfd_chain == (MdfdVec *) NULL) {
+ v->mdfd_chain = _mdfd_openseg(reln, i, oflag);
+
+ if (v->mdfd_chain == (MdfdVec *) NULL)
+ elog(WARN, "cannot open segment %d of relation %.16s",
+ i, RelationGetRelationName(reln));
+ }
+ v = v->mdfd_chain;
+ }
+
+ return (v);
+}
+
+static BlockNumber
+_mdnblocks(File file, Size blcksz)
+{
+ long len;
+
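+    /*
+     * Seeking to the end gives the file length in bytes; an empty file
+     * yields len == -1 and hence zero blocks, otherwise
+     * 1 + (length - 1) / blcksz rounds the length up to whole blocks.
+     */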
+ len = FileSeek(file, 0L, SEEK_END) - 1;
+ return((BlockNumber)((len < 0) ? 0 : 1 + len / blcksz));
+}
diff --git a/src/backend/storage/smgr/mm.c b/src/backend/storage/smgr/mm.c
new file mode 100644
index 00000000000..24a8d2472a6
--- /dev/null
+++ b/src/backend/storage/smgr/mm.c
@@ -0,0 +1,586 @@
+/*-------------------------------------------------------------------------
+ *
+ * mm.c--
+ * main memory storage manager
+ *
+ * This code manages relations that reside in (presumably stable)
+ * main memory.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/storage/smgr/Attic/mm.c,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#ifdef MAIN_MEMORY
+
+#include <math.h>
+#include "machine.h"
+#include "storage/ipc.h"
+#include "storage/smgr.h" /* where the declarations go */
+#include "storage/block.h"
+#include "storage/shmem.h"
+#include "storage/spin.h"
+
+#include "utils/hsearch.h"
+#include "utils/rel.h"
+#include "utils/elog.h"
+#include "utils/memutils.h"
+
+/*
+ * MMCacheTag -- Unique triplet for blocks stored by the main memory
+ * storage manager.
+ */
+
+typedef struct MMCacheTag {
+ Oid mmct_dbid;
+ Oid mmct_relid;
+ BlockNumber mmct_blkno;
+} MMCacheTag;
+
+/*
+ * Shared-memory hash table for main memory relations contains
+ * entries of this form.
+ */
+
+typedef struct MMHashEntry {
+ MMCacheTag mmhe_tag;
+ int mmhe_bufno;
+} MMHashEntry;
+
+/*
+ * MMRelTag -- Unique identifier for each relation that is stored in the
+ * main-memory storage manager.
+ */
+
+typedef struct MMRelTag {
+ Oid mmrt_dbid;
+ Oid mmrt_relid;
+} MMRelTag;
+
+/*
+ * Shared-memory hash table for # blocks in main memory relations contains
+ * entries of this form.
+ */
+
+typedef struct MMRelHashEntry {
+ MMRelTag mmrhe_tag;
+ int mmrhe_nblocks;
+} MMRelHashEntry;
+
+#define MMNBUFFERS 10
+#define MMNRELATIONS 2
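+
+/*
+ * For illustration: a block of a main-memory relation is found by hashing
+ * its (dbid, relid, blkno) tag in MMCacheHT, which yields a buffer number;
+ * the block's contents then live at
+ *
+ *	MMBlockCache + (mmhe_bufno * BLCKSZ)
+ *
+ * which is how mmread() and mmwrite() below locate their data.
+ */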
+
+SPINLOCK MMCacheLock;
+extern bool IsPostmaster;
+extern Oid MyDatabaseId;
+
+static int *MMCurTop;
+static int *MMCurRelno;
+static MMCacheTag *MMBlockTags;
+static char *MMBlockCache;
+static HTAB *MMCacheHT;
+static HTAB *MMRelCacheHT;
+
+int
+mminit()
+{
+ char *mmcacheblk;
+ int mmsize = 0;
+ bool found;
+ HASHCTL info;
+
+ SpinAcquire(MMCacheLock);
+
+ mmsize += MAXALIGN(BLCKSZ * MMNBUFFERS);
+ mmsize += MAXALIGN(sizeof(*MMCurTop));
+ mmsize += MAXALIGN(sizeof(*MMCurRelno));
+ mmsize += MAXALIGN((MMNBUFFERS * sizeof(MMCacheTag)));
+ mmcacheblk = (char *) ShmemInitStruct("Main memory smgr", mmsize, &found);
+
+ if (mmcacheblk == (char *) NULL) {
+ SpinRelease(MMCacheLock);
+ return (SM_FAIL);
+ }
+
+ info.keysize = sizeof(MMCacheTag);
+ info.datasize = sizeof(int);
+ info.hash = tag_hash;
+
+ MMCacheHT = (HTAB *) ShmemInitHash("Main memory store HT",
+ MMNBUFFERS, MMNBUFFERS,
+ &info, (HASH_ELEM|HASH_FUNCTION));
+
+ if (MMCacheHT == (HTAB *) NULL) {
+ SpinRelease(MMCacheLock);
+ return (SM_FAIL);
+ }
+
+ info.keysize = sizeof(MMRelTag);
+ info.datasize = sizeof(int);
+ info.hash = tag_hash;
+
+ MMRelCacheHT = (HTAB *) ShmemInitHash("Main memory rel HT",
+ MMNRELATIONS, MMNRELATIONS,
+ &info, (HASH_ELEM|HASH_FUNCTION));
+
+ if (MMRelCacheHT == (HTAB *) NULL) {
+ SpinRelease(MMCacheLock);
+ return (SM_FAIL);
+ }
+
+ if (IsPostmaster) {
+ memset(mmcacheblk, 0, mmsize);
+ SpinRelease(MMCacheLock);
+ return (SM_SUCCESS);
+ }
+
+ SpinRelease(MMCacheLock);
+
+ MMCurTop = (int *) mmcacheblk;
+ mmcacheblk += sizeof(int);
+ MMCurRelno = (int *) mmcacheblk;
+ mmcacheblk += sizeof(int);
+ MMBlockTags = (MMCacheTag *) mmcacheblk;
+ mmcacheblk += (MMNBUFFERS * sizeof(MMCacheTag));
+ MMBlockCache = mmcacheblk;
+
+ return (SM_SUCCESS);
+}
+
+int
+mmshutdown()
+{
+ return (SM_SUCCESS);
+}
+
+int
+mmcreate(Relation reln)
+{
+ MMRelHashEntry *entry;
+ bool found;
+ MMRelTag tag;
+
+ SpinAcquire(MMCacheLock);
+
+ if (*MMCurRelno == MMNRELATIONS) {
+ SpinRelease(MMCacheLock);
+ return (SM_FAIL);
+ }
+
+ (*MMCurRelno)++;
+
+ tag.mmrt_relid = reln->rd_id;
+ if (reln->rd_rel->relisshared)
+ tag.mmrt_dbid = (Oid) 0;
+ else
+ tag.mmrt_dbid = MyDatabaseId;
+
+ entry = (MMRelHashEntry *) hash_search(MMRelCacheHT,
+ (char *) &tag, HASH_ENTER, &found);
+
+ if (entry == (MMRelHashEntry *) NULL) {
+ SpinRelease(MMCacheLock);
+ elog(FATAL, "main memory storage mgr rel cache hash table corrupt");
+ }
+
+ if (found) {
+ /* already exists */
+ SpinRelease(MMCacheLock);
+ return (SM_FAIL);
+ }
+
+ entry->mmrhe_nblocks = 0;
+
+ SpinRelease(MMCacheLock);
+
+ return (SM_SUCCESS);
+}
+
+/*
+ * mmunlink() -- Unlink a relation.
+ */
+int
+mmunlink(Relation reln)
+{
+ int i;
+ Oid reldbid;
+ MMHashEntry *entry;
+ MMRelHashEntry *rentry;
+ bool found;
+ MMRelTag rtag;
+
+ if (reln->rd_rel->relisshared)
+ reldbid = (Oid) 0;
+ else
+ reldbid = MyDatabaseId;
+
+ SpinAcquire(MMCacheLock);
+
+ for (i = 0; i < MMNBUFFERS; i++) {
+ if (MMBlockTags[i].mmct_dbid == reldbid
+ && MMBlockTags[i].mmct_relid == reln->rd_id) {
+ entry = (MMHashEntry *) hash_search(MMCacheHT,
+ (char *) &MMBlockTags[i],
+ HASH_REMOVE, &found);
+ if (entry == (MMHashEntry *) NULL || !found) {
+ SpinRelease(MMCacheLock);
+ elog(FATAL, "mmunlink: cache hash table corrupted");
+ }
+ MMBlockTags[i].mmct_dbid = (Oid) 0;
+ MMBlockTags[i].mmct_relid = (Oid) 0;
+ MMBlockTags[i].mmct_blkno = (BlockNumber) 0;
+ }
+ }
+ rtag.mmrt_dbid = reldbid;
+ rtag.mmrt_relid = reln->rd_id;
+
+ rentry = (MMRelHashEntry *) hash_search(MMRelCacheHT, (char *) &rtag,
+ HASH_REMOVE, &found);
+
+ if (rentry == (MMRelHashEntry *) NULL || !found) {
+ SpinRelease(MMCacheLock);
+ elog(FATAL, "mmunlink: rel cache hash table corrupted");
+ }
+
+ (*MMCurRelno)--;
+
+ SpinRelease(MMCacheLock);
+ return 1;
+}
+
+/*
+ * mmextend() -- Add a block to the specified relation.
+ *
+ * This routine returns SM_FAIL or SM_SUCCESS, with errno set as
+ * appropriate.
+ */
+int
+mmextend(Relation reln, char *buffer)
+{
+ MMRelHashEntry *rentry;
+ MMHashEntry *entry;
+ int i;
+ Oid reldbid;
+ int offset;
+ bool found;
+ MMRelTag rtag;
+ MMCacheTag tag;
+
+ if (reln->rd_rel->relisshared)
+ reldbid = (Oid) 0;
+ else
+ reldbid = MyDatabaseId;
+
+ tag.mmct_dbid = rtag.mmrt_dbid = reldbid;
+ tag.mmct_relid = rtag.mmrt_relid = reln->rd_id;
+
+ SpinAcquire(MMCacheLock);
+
+ if (*MMCurTop == MMNBUFFERS) {
+ for (i = 0; i < MMNBUFFERS; i++) {
+ if (MMBlockTags[i].mmct_dbid == 0 &&
+ MMBlockTags[i].mmct_relid == 0)
+ break;
+ }
+ if (i == MMNBUFFERS) {
+ SpinRelease(MMCacheLock);
+ return (SM_FAIL);
+ }
+ } else {
+ i = *MMCurTop;
+ (*MMCurTop)++;
+ }
+
+ rentry = (MMRelHashEntry *) hash_search(MMRelCacheHT, (char *) &rtag,
+ HASH_FIND, &found);
+ if (rentry == (MMRelHashEntry *) NULL || !found) {
+ SpinRelease(MMCacheLock);
+ elog(FATAL, "mmextend: rel cache hash table corrupt");
+ }
+
+ tag.mmct_blkno = rentry->mmrhe_nblocks;
+
+ entry = (MMHashEntry *) hash_search(MMCacheHT, (char *) &tag,
+ HASH_ENTER, &found);
+ if (entry == (MMHashEntry *) NULL || found) {
+ SpinRelease(MMCacheLock);
+ elog(FATAL, "mmextend: cache hash table corrupt");
+ }
+
+ entry->mmhe_bufno = i;
+ MMBlockTags[i].mmct_dbid = reldbid;
+ MMBlockTags[i].mmct_relid = reln->rd_id;
+ MMBlockTags[i].mmct_blkno = rentry->mmrhe_nblocks;
+
+ /* page numbers are zero-based, so we increment this at the end */
+ (rentry->mmrhe_nblocks)++;
+
+ /* write the extended page */
+ offset = (i * BLCKSZ);
+ memmove(&(MMBlockCache[offset]), buffer, BLCKSZ);
+
+ SpinRelease(MMCacheLock);
+
+ return (SM_SUCCESS);
+}
+
+/*
+ * mmopen() -- Open the specified relation.
+ */
+int
+mmopen(Relation reln)
+{
+ /* automatically successful */
+ return (0);
+}
+
+/*
+ * mmclose() -- Close the specified relation.
+ *
+ * Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
+ */
+int
+mmclose(Relation reln)
+{
+ /* automatically successful */
+ return (SM_SUCCESS);
+}
+
+/*
+ * mmread() -- Read the specified block from a relation.
+ *
+ * Returns SM_SUCCESS or SM_FAIL.
+ */
+int
+mmread(Relation reln, BlockNumber blocknum, char *buffer)
+{
+ MMHashEntry *entry;
+ bool found;
+ int offset;
+ MMCacheTag tag;
+
+ if (reln->rd_rel->relisshared)
+ tag.mmct_dbid = (Oid) 0;
+ else
+ tag.mmct_dbid = MyDatabaseId;
+
+ tag.mmct_relid = reln->rd_id;
+ tag.mmct_blkno = blocknum;
+
+ SpinAcquire(MMCacheLock);
+ entry = (MMHashEntry *) hash_search(MMCacheHT, (char *) &tag,
+ HASH_FIND, &found);
+
+ if (entry == (MMHashEntry *) NULL) {
+ SpinRelease(MMCacheLock);
+ elog(FATAL, "mmread: hash table corrupt");
+ }
+
+ if (!found) {
+ /* reading nonexistent pages is defined to fill them with zeroes */
+ SpinRelease(MMCacheLock);
+ memset(buffer, 0, BLCKSZ);
+ return (SM_SUCCESS);
+ }
+
+ offset = (entry->mmhe_bufno * BLCKSZ);
+ memmove(buffer, &MMBlockCache[offset], BLCKSZ);
+
+ SpinRelease(MMCacheLock);
+
+ return (SM_SUCCESS);
+}
+
+/*
+ * mmwrite() -- Write the supplied block at the appropriate location.
+ *
+ * Returns SM_SUCCESS or SM_FAIL.
+ */
+int
+mmwrite(Relation reln, BlockNumber blocknum, char *buffer)
+{
+ MMHashEntry *entry;
+ bool found;
+ int offset;
+ MMCacheTag tag;
+
+ if (reln->rd_rel->relisshared)
+ tag.mmct_dbid = (Oid) 0;
+ else
+ tag.mmct_dbid = MyDatabaseId;
+
+ tag.mmct_relid = reln->rd_id;
+ tag.mmct_blkno = blocknum;
+
+ SpinAcquire(MMCacheLock);
+ entry = (MMHashEntry *) hash_search(MMCacheHT, (char *) &tag,
+ HASH_FIND, &found);
+
+ if (entry == (MMHashEntry *) NULL) {
+ SpinRelease(MMCacheLock);
+	elog(FATAL, "mmwrite: hash table corrupt");
+ }
+
+ if (!found) {
+ SpinRelease(MMCacheLock);
+ elog(FATAL, "mmwrite: hash table missing requested page");
+ }
+
+ offset = (entry->mmhe_bufno * BLCKSZ);
+ memmove(&MMBlockCache[offset], buffer, BLCKSZ);
+
+ SpinRelease(MMCacheLock);
+
+ return (SM_SUCCESS);
+}
+
+/*
+ * mmflush() -- Synchronously write a block to stable storage.
+ *
+ * For main-memory relations, this is exactly equivalent to mmwrite().
+ */
+int
+mmflush(Relation reln, BlockNumber blocknum, char *buffer)
+{
+ return (mmwrite(reln, blocknum, buffer));
+}
+
+/*
+ * mmblindwrt() -- Write a block to stable storage blind.
+ *
+ * We have to be able to do this using only the name and OID of
+ * the database and relation in which the block belongs.
+ */
+int
+mmblindwrt(char *dbstr,
+ char *relstr,
+ Oid dbid,
+ Oid relid,
+ BlockNumber blkno,
+ char *buffer)
+{
+ return (SM_FAIL);
+}
+
+/*
+ * mmnblocks() -- Get the number of blocks stored in a relation.
+ *
+ * Returns # of blocks or -1 on error.
+ */
+int
+mmnblocks(Relation reln)
+{
+ MMRelTag rtag;
+ MMRelHashEntry *rentry;
+ bool found;
+ int nblocks;
+
+ if (reln->rd_rel->relisshared)
+ rtag.mmrt_dbid = (Oid) 0;
+ else
+ rtag.mmrt_dbid = MyDatabaseId;
+
+ rtag.mmrt_relid = reln->rd_id;
+
+ SpinAcquire(MMCacheLock);
+
+ rentry = (MMRelHashEntry *) hash_search(MMRelCacheHT, (char *) &rtag,
+ HASH_FIND, &found);
+
+ if (rentry == (MMRelHashEntry *) NULL) {
+ SpinRelease(MMCacheLock);
+ elog(FATAL, "mmnblocks: rel cache hash table corrupt");
+ }
+
+ if (found)
+ nblocks = rentry->mmrhe_nblocks;
+ else
+ nblocks = -1;
+
+ SpinRelease(MMCacheLock);
+
+ return (nblocks);
+}
+
+/*
+ * mmcommit() -- Commit a transaction.
+ *
+ * Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
+ */
+int
+mmcommit()
+{
+ return (SM_SUCCESS);
+}
+
+/*
+ * mmabort() -- Abort a transaction.
+ */
+
+int
+mmabort()
+{
+ return (SM_SUCCESS);
+}
+
+/*
+ * MMShmemSize() -- Declare amount of shared memory we require.
+ *
+ * The shared memory initialization code creates a block of shared
+ * memory exactly big enough to hold all the structures it needs to.
+ * This routine declares how much space the main memory storage
+ * manager will use.
+ */
+int
+MMShmemSize()
+{
+ int size = 0;
+ int nbuckets;
+ int nsegs;
+ int tmp;
+
+ /*
+ * first compute space occupied by the (dbid,relid,blkno) hash table
+ */
+
+ nbuckets = 1 << (int)my_log2((MMNBUFFERS - 1) / DEF_FFACTOR + 1);
+ nsegs = 1 << (int)my_log2((nbuckets - 1) / DEF_SEGSIZE + 1);
+
+ size += MAXALIGN(my_log2(MMNBUFFERS) * sizeof(void *));
+ size += MAXALIGN(sizeof(HHDR));
+ size += nsegs * MAXALIGN(DEF_SEGSIZE * sizeof(SEGMENT));
+ tmp = (int)ceil((double)MMNBUFFERS/BUCKET_ALLOC_INCR);
+ size += tmp * BUCKET_ALLOC_INCR *
+ (MAXALIGN(sizeof(BUCKET_INDEX)) +
+ MAXALIGN(sizeof(MMHashEntry))); /* contains hash key */
+
+ /*
+ * now do the same for the rel hash table
+ */
+
+ size += MAXALIGN(my_log2(MMNRELATIONS) * sizeof(void *));
+ size += MAXALIGN(sizeof(HHDR));
+ size += nsegs * MAXALIGN(DEF_SEGSIZE * sizeof(SEGMENT));
+ tmp = (int)ceil((double)MMNRELATIONS/BUCKET_ALLOC_INCR);
+ size += tmp * BUCKET_ALLOC_INCR *
+ (MAXALIGN(sizeof(BUCKET_INDEX)) +
+ MAXALIGN(sizeof(MMRelHashEntry))); /* contains hash key */
+
+ /*
+ * finally, add in the memory block we use directly
+ */
+
+ size += MAXALIGN(BLCKSZ * MMNBUFFERS);
+ size += MAXALIGN(sizeof(*MMCurTop));
+ size += MAXALIGN(sizeof(*MMCurRelno));
+ size += MAXALIGN(MMNBUFFERS * sizeof(MMCacheTag));
+
+ return (size);
+}
+
+#endif /* MAIN_MEMORY */
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
new file mode 100644
index 00000000000..426c3d93480
--- /dev/null
+++ b/src/backend/storage/smgr/smgr.c
@@ -0,0 +1,371 @@
+/*-------------------------------------------------------------------------
+ *
+ * smgr.c--
+ * public interface routines to storage manager switch.
+ *
+ * All file system operations in POSTGRES dispatch through these
+ * routines.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <string.h>
+#include "postgres.h"
+
+#include "machine.h"
+#include "storage/ipc.h"
+#include "storage/smgr.h"
+#include "storage/block.h"
+#include "utils/rel.h"
+#include "utils/elog.h"
+#include "utils/palloc.h"
+
+typedef struct f_smgr {
+ int (*smgr_init)(); /* may be NULL */
+ int (*smgr_shutdown)(); /* may be NULL */
+ int (*smgr_create)();
+ int (*smgr_unlink)();
+ int (*smgr_extend)();
+ int (*smgr_open)();
+ int (*smgr_close)();
+ int (*smgr_read)();
+ int (*smgr_write)();
+ int (*smgr_flush)();
+ int (*smgr_blindwrt)();
+ int (*smgr_nblocks)();
+ int (*smgr_commit)(); /* may be NULL */
+ int (*smgr_abort)(); /* may be NULL */
+} f_smgr;
+
+/*
+ * The weird placement of commas in this init block is to keep the compiler
+ * happy, regardless of what storage managers we have (or don't have).
+ */
+
+static f_smgr smgrsw[] = {
+
+ /* magnetic disk */
+ { mdinit, NULL, mdcreate, mdunlink, mdextend, mdopen, mdclose,
+ mdread, mdwrite, mdflush, mdblindwrt, mdnblocks, mdcommit, mdabort },
+
+#ifdef MAIN_MEMORY
+ /* main memory */
+ { mminit, mmshutdown, mmcreate, mmunlink, mmextend, mmopen, mmclose,
+ mmread, mmwrite, mmflush, mmblindwrt, mmnblocks, mmcommit, mmabort },
+
+#endif /* MAIN_MEMORY */
+};
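+
+/*
+ * A dispatch through the switch is simply an indexed call through this
+ * table, e.g. (*(smgrsw[which].smgr_read))(reln, blocknum, buffer) as in
+ * smgrread() below.
+ */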
+
+/*
+ * This array records which storage managers are write-once, and which
+ * support overwrite. A 'true' entry means that the storage manager is
+ * write-once. In the best of all possible worlds, there would be no
+ * write-once storage managers.
+ */
+
+static bool smgrwo[] = {
+ false, /* magnetic disk */
+#ifdef MAIN_MEMORY
+ false, /* main memory*/
+#endif /* MAIN_MEMORY */
+};
+static int NSmgr = lengthof(smgrsw);
+
+/*
+ * smgrinit(), smgrshutdown() -- Initialize or shut down all storage
+ * managers.
+ *
+ */
+int
+smgrinit()
+{
+ int i;
+ extern char *smgrout();
+
+ for (i = 0; i < NSmgr; i++) {
+ if (smgrsw[i].smgr_init) {
+ if ((*(smgrsw[i].smgr_init))() == SM_FAIL)
+ elog(FATAL, "initialization failed on %s", smgrout(i));
+ }
+ }
+
+ /* register the shutdown proc */
+ on_exitpg(smgrshutdown, 0);
+
+ return (SM_SUCCESS);
+}
+
+void
+smgrshutdown(int dummy)
+{
+ int i;
+ extern char *smgrout();
+
+ for (i = 0; i < NSmgr; i++) {
+ if (smgrsw[i].smgr_shutdown) {
+ if ((*(smgrsw[i].smgr_shutdown))() == SM_FAIL)
+ elog(FATAL, "shutdown failed on %s", smgrout(i));
+ }
+ }
+}
+
+/*
+ * smgrcreate() -- Create a new relation.
+ *
+ * This routine takes a reldesc, creates the relation on the appropriate
+ * device, and returns a file descriptor for it.
+ */
+int
+smgrcreate(int16 which, Relation reln)
+{
+ int fd;
+
+ if ((fd = (*(smgrsw[which].smgr_create))(reln)) < 0)
+	elog(WARN, "cannot create %.*s",
+ NAMEDATALEN, &(reln->rd_rel->relname.data[0]));
+
+ return (fd);
+}
+
+/*
+ * smgrunlink() -- Unlink a relation.
+ *
+ * The relation is removed from the store.
+ */
+int
+smgrunlink(int16 which, Relation reln)
+{
+ int status;
+
+ if ((status = (*(smgrsw[which].smgr_unlink))(reln)) == SM_FAIL)
+ elog(WARN, "cannot unlink %.*s",
+ NAMEDATALEN, &(reln->rd_rel->relname.data[0]));
+
+ return (status);
+}
+
+/*
+ * smgrextend() -- Add a new block to a file.
+ *
+ * Returns SM_SUCCESS on success; aborts the current transaction on
+ * failure.
+ */
+int
+smgrextend(int16 which, Relation reln, char *buffer)
+{
+ int status;
+
+ status = (*(smgrsw[which].smgr_extend))(reln, buffer);
+
+ if (status == SM_FAIL)
+ elog(WARN, "%.*s: cannot extend",
+ NAMEDATALEN, &(reln->rd_rel->relname.data[0]));
+
+ return (status);
+}
+
+/*
+ * smgropen() -- Open a relation using a particular storage manager.
+ *
+ * Returns the fd for the open relation on success, aborts the
+ * transaction on failure.
+ */
+int
+smgropen(int16 which, Relation reln)
+{
+ int fd;
+
+ if ((fd = (*(smgrsw[which].smgr_open))(reln)) < 0)
+ elog(WARN, "cannot open %.*s",
+ NAMEDATALEN, &(reln->rd_rel->relname.data[0]));
+
+ return (fd);
+}
+
+/*
+ * smgrclose() -- Close a relation.
+ *
+ * Returns SM_SUCCESS on success, aborts on failure.
+ */
+int
+smgrclose(int16 which, Relation reln)
+{
+ if ((*(smgrsw[which].smgr_close))(reln) == SM_FAIL)
+ elog(WARN, "cannot close %.*s",
+ NAMEDATALEN, &(reln->rd_rel->relname.data[0]));
+
+ return (SM_SUCCESS);
+}
+
+/*
+ * smgrread() -- read a particular block from a relation into the supplied
+ * buffer.
+ *
+ * This routine is called from the buffer manager in order to
+ * instantiate pages in the shared buffer cache. All storage managers
+ * return pages in the format that POSTGRES expects. This routine
+ * dispatches the read. On success, it returns SM_SUCCESS. On failure,
+ * the current transaction is aborted.
+ */
+int
+smgrread(int16 which, Relation reln, BlockNumber blocknum, char *buffer)
+{
+ int status;
+
+ status = (*(smgrsw[which].smgr_read))(reln, blocknum, buffer);
+
+ if (status == SM_FAIL)
+ elog(WARN, "cannot read block %d of %.*s",
+ blocknum, NAMEDATALEN, &(reln->rd_rel->relname.data[0]));
+
+ return (status);
+}
+
+/*
+ * smgrwrite() -- Write the supplied buffer out.
+ *
+ * This is not a synchronous write -- the interface for that is
+ * smgrflush(). The buffer is written out via the appropriate
+ * storage manager. This routine returns SM_SUCCESS or aborts
+ * the current transaction.
+ */
+int
+smgrwrite(int16 which, Relation reln, BlockNumber blocknum, char *buffer)
+{
+ int status;
+
+ status = (*(smgrsw[which].smgr_write))(reln, blocknum, buffer);
+
+ if (status == SM_FAIL)
+ elog(WARN, "cannot write block %d of %.*s",
+ blocknum, NAMEDATALEN, &(reln->rd_rel->relname.data[0]));
+
+ return (status);
+}
+
+/*
+ * smgrflush() -- A synchronous smgrwrite().
+ */
+int
+smgrflush(int16 which, Relation reln, BlockNumber blocknum, char *buffer)
+{
+ int status;
+
+ status = (*(smgrsw[which].smgr_flush))(reln, blocknum, buffer);
+
+ if (status == SM_FAIL)
+ elog(WARN, "cannot flush block %d of %.*s to stable store",
+ blocknum, NAMEDATALEN, &(reln->rd_rel->relname.data[0]));
+
+ return (status);
+}
+
+/*
+ * smgrblindwrt() -- Write a page out blind.
+ *
+ * In some cases, we may find a page in the buffer cache that we
+ * can't make a reldesc for. This happens, for example, when we
+ * want to reuse a dirty page that was written by a transaction
+ * that has not yet committed, which created a new relation. In
+ * this case, the buffer manager will call smgrblindwrt() with
+ * the name and OID of the database and the relation to which the
+ * buffer belongs. Every storage manager must be able to force
+ * this page down to stable storage in this circumstance.
+ */
+int
+smgrblindwrt(int16 which,
+ char *dbname,
+ char *relname,
+ Oid dbid,
+ Oid relid,
+ BlockNumber blkno,
+ char *buffer)
+{
+ char *dbstr;
+ char *relstr;
+ int status;
+
+ dbstr = pstrdup(dbname);
+ relstr = pstrdup(relname);
+
+ status = (*(smgrsw[which].smgr_blindwrt))(dbstr, relstr, dbid, relid,
+ blkno, buffer);
+
+ if (status == SM_FAIL)
+ elog(WARN, "cannot write block %d of %s [%s] blind",
+ blkno, relstr, dbstr);
+
+ pfree(dbstr);
+ pfree(relstr);
+
+ return (status);
+}
+
+/*
+ * smgrnblocks() -- Calculate the number of POSTGRES blocks in the
+ * supplied relation.
+ *
+ * Returns the number of blocks on success, aborts the current
+ * transaction on failure.
+ */
+int
+smgrnblocks(int16 which, Relation reln)
+{
+ int nblocks;
+
+ if ((nblocks = (*(smgrsw[which].smgr_nblocks))(reln)) < 0)
+ elog(WARN, "cannot count blocks for %.*s",
+ NAMEDATALEN, &(reln->rd_rel->relname.data[0]));
+
+ return (nblocks);
+}
+
+/*
+ * smgrcommit(), smgrabort() -- Commit or abort changes made during the
+ * current transaction.
+ */
+int
+smgrcommit()
+{
+ int i;
+ extern char *smgrout();
+
+ for (i = 0; i < NSmgr; i++) {
+ if (smgrsw[i].smgr_commit) {
+ if ((*(smgrsw[i].smgr_commit))() == SM_FAIL)
+ elog(FATAL, "transaction commit failed on %s", smgrout(i));
+ }
+ }
+
+ return (SM_SUCCESS);
+}
+
+int
+smgrabort()
+{
+ int i;
+ extern char *smgrout();
+
+ for (i = 0; i < NSmgr; i++) {
+ if (smgrsw[i].smgr_abort) {
+ if ((*(smgrsw[i].smgr_abort))() == SM_FAIL)
+ elog(FATAL, "transaction abort failed on %s", smgrout(i));
+ }
+ }
+
+ return (SM_SUCCESS);
+}
+
+bool
+smgriswo(int16 smgrno)
+{
+ if (smgrno < 0 || smgrno >= NSmgr)
+ elog(WARN, "illegal storage manager number %d", smgrno);
+
+ return (smgrwo[smgrno]);
+}
diff --git a/src/backend/storage/smgr/smgrtype.c b/src/backend/storage/smgr/smgrtype.c
new file mode 100644
index 00000000000..5c90d590914
--- /dev/null
+++ b/src/backend/storage/smgr/smgrtype.c
@@ -0,0 +1,82 @@
+/*-------------------------------------------------------------------------
+ *
+ * smgrtype.c--
+ * storage manager type
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgrtype.c,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <string.h>
+#include "postgres.h"
+
+#include "utils/builtins.h" /* where the declarations go */
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "storage/smgr.h"
+
+typedef struct smgrid {
+ char *smgr_name;
+} smgrid;
+
+/*
+ * StorageManager[] -- List of defined storage managers.
+ *
+ * The weird comma placement is to keep compilers happy no matter
+ * which of these is (or is not) defined.
+ */
+
+static smgrid StorageManager[] = {
+ {"magnetic disk"},
+#ifdef MAIN_MEMORY
+ {"main memory"}
+#endif /* MAIN_MEMORY */
+};
+
+static int NStorageManagers = lengthof(StorageManager);
+
+int2
+smgrin(char *s)
+{
+ int i;
+
+ for (i = 0; i < NStorageManagers; i++) {
+ if (strcmp(s, StorageManager[i].smgr_name) == 0)
+ return((int2) i);
+ }
+ elog(WARN, "smgrin: illegal storage manager name %s", s);
+ return 0;
+}
+
+char *
+smgrout(int2 i)
+{
+ char *s;
+
+ if (i >= NStorageManagers || i < 0)
+ elog(WARN, "Illegal storage manager id %d", i);
+
+ s = (char *) palloc(strlen(StorageManager[i].smgr_name) + 1);
+ strcpy(s, StorageManager[i].smgr_name);
+ return (s);
+}
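+
+/*
+ * For illustration: smgrin("magnetic disk") returns 0, and smgrout(0)
+ * returns a palloc'd copy of the string "magnetic disk"; together they are
+ * the input/output support for the 'smgr' type in the system catalogs.
+ */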
+
+bool
+smgreq(int2 a, int2 b)
+{
+ if (a == b)
+ return (true);
+ return (false);
+}
+
+bool
+smgrne(int2 a, int2 b)
+{
+ if (a == b)
+ return (false);
+ return (true);
+}
diff --git a/src/backend/storage/spin.h b/src/backend/storage/spin.h
new file mode 100644
index 00000000000..32037684ec1
--- /dev/null
+++ b/src/backend/storage/spin.h
@@ -0,0 +1,38 @@
+/*-------------------------------------------------------------------------
+ *
+ * spin.h--
+ * synchronization routines
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: spin.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SPIN_H
+#define SPIN_H
+
+#include "ipc.h"
+
+/*
+ * two implementations of spin locks
+ *
+ * sequent, sparc, sun3: real spin locks, using a hardware test-and-set
+ * (TAS) instruction; see src/storage/ipc/s_lock.c for details.
+ *
+ * default: fake spin locks implemented with semaphores; see spin.c.
+ *
+ */
+
+typedef int SPINLOCK;
+
+extern bool CreateSpinlocks(IPCKey key);
+extern bool AttachSpinLocks(IPCKey key);
+extern bool InitSpinLocks(int init, IPCKey key);
+
+extern void SpinAcquire(SPINLOCK lock);
+extern void SpinRelease(SPINLOCK lock);
+extern bool SpinIsLocked(SPINLOCK lock);
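+
+/*
+ * Typical usage (as in mm.c in this change): bracket a shared-memory
+ * critical section with
+ *
+ *	SpinAcquire(MMCacheLock);
+ *	... examine or update shared structures ...
+ *	SpinRelease(MMCacheLock);
+ *
+ * where MMCacheLock is one of the SPINLOCK variables declared by the
+ * storage code.
+ */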
+
+#endif /* SPIN_H */