Diffstat (limited to 'src/backend/storage')
63 files changed, 16385 insertions, 0 deletions
diff --git a/src/backend/storage/Makefile.inc b/src/backend/storage/Makefile.inc new file mode 100644 index 00000000000..aef287ca71a --- /dev/null +++ b/src/backend/storage/Makefile.inc @@ -0,0 +1,31 @@ +#------------------------------------------------------------------------- +# +# Makefile.inc-- +# Makefile for the storage modules +# +# Copyright (c) 1994, Regents of the University of California +# +# +# IDENTIFICATION +# $Header: /cvsroot/pgsql/src/backend/storage/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $ +# +#------------------------------------------------------------------------- + +stordir= $(CURDIR)/storage +VPATH:= $(VPATH):$(stordir):$(stordir)/buffer:$(stordir)/file:$(stordir)/ipc:\ + $(stordir)/large_object:$(stordir)/lmgr:$(stordir)/page:$(stordir)/smgr + +SUBSRCS= +include $(stordir)/buffer/Makefile.inc +include $(stordir)/file/Makefile.inc +include $(stordir)/ipc/Makefile.inc +include $(stordir)/large_object/Makefile.inc +include $(stordir)/lmgr/Makefile.inc +include $(stordir)/page/Makefile.inc +include $(stordir)/smgr/Makefile.inc +SRCS_STORAGE:= $(SUBSRCS) + +HEADERS+= backendid.h block.h buf.h buf_internals.h bufmgr.h bufpage.h \ + fd.h ipc.h item.h itemid.h itempos.h \ + itemptr.h large_object.h lmgr.h lock.h multilev.h off.h page.h \ + pagenum.h pos.h proc.h shmem.h sinval.h sinvaladt.h smgr.h spin.h diff --git a/src/backend/storage/backendid.h b/src/backend/storage/backendid.h new file mode 100644 index 00000000000..eb874bbad79 --- /dev/null +++ b/src/backend/storage/backendid.h @@ -0,0 +1,32 @@ +/*------------------------------------------------------------------------- + * + * backendid.h-- + * POSTGRES backend id communication definitions + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: backendid.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef BACKENDID_H +#define BACKENDID_H + +/* ---------------- + * pulled out of sinval.h to temporarily reduce #include nesting. + * -cim 8/17/90 + * ---------------- + */ +typedef int16 BackendId; /* unique currently active backend identifier */ + +#define InvalidBackendId (-1) + +typedef int32 BackendTag; /* unique backend identifier */ + +#define InvalidBackendTag (-1) + +extern BackendId MyBackendId; /* backend id of this backend */ +extern BackendTag MyBackendTag; /* backend tag of this backend */ + +#endif /* BACKENDID_H */ diff --git a/src/backend/storage/block.h b/src/backend/storage/block.h new file mode 100644 index 00000000000..5c006aa9d90 --- /dev/null +++ b/src/backend/storage/block.h @@ -0,0 +1,114 @@ +/*------------------------------------------------------------------------- + * + * block.h-- + * POSTGRES disk block definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: block.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef BLOCK_H +#define BLOCK_H + +#include "c.h" + +/* + * BlockNumber: + * + * each data file (heap or index) is divided into postgres disk blocks + * (which may be thought of as the unit of i/o -- a postgres buffer + * contains exactly one disk block). the blocks are numbered + * sequentially, 0 to 0xFFFFFFFE. + * + * InvalidBlockNumber is the same thing as P_NEW in buf.h. 
+ * + * the access methods, the buffer manager and the storage manager are + * more or less the only pieces of code that should be accessing disk + * blocks directly. + */ +typedef uint32 BlockNumber; + +#define InvalidBlockNumber ((BlockNumber) 0xFFFFFFFF) + +/* + * BlockId: + * + * this is a storage type for BlockNumber. in other words, this type + * is used for on-disk structures (e.g., in HeapTupleData) whereas + * BlockNumber is the type on which calculations are performed (e.g., + * in access method code). + * + * there doesn't appear to be any reason to have separate types except + * for the fact that BlockIds can be SHORTALIGN'd (and therefore any + * structures that contains them, such as ItemPointerData, can also be + * SHORTALIGN'd). this is an important consideration for reducing the + * space requirements of the line pointer (ItemIdData) array on each + * page and the header of each heap or index tuple, so it doesn't seem + * wise to change this without good reason. + */ +typedef struct BlockIdData { + uint16 bi_hi; + uint16 bi_lo; +} BlockIdData; + +typedef BlockIdData *BlockId; /* block identifier */ + +/* ---------------- + * support macros + * ---------------- + */ + +/* + * BlockNumberIsValid -- + * True iff blockNumber is valid. + */ +#define BlockNumberIsValid(blockNumber) \ + ((bool) ((int32) (blockNumber) != InvalidBlockNumber)) + +/* + * BlockIdIsValid -- + * True iff the block identifier is valid. + */ +#define BlockIdIsValid(blockId) \ + ((bool) PointerIsValid(blockId)) + +/* + * BlockIdSet -- + * Sets a block identifier to the specified value. + */ +#define BlockIdSet(blockId, blockNumber) \ + Assert(PointerIsValid(blockId)); \ + (blockId)->bi_hi = (blockNumber) >> 16; \ + (blockId)->bi_lo = (blockNumber) & 0xffff + +/* + * BlockIdCopy -- + * Copy a block identifier. + */ +#define BlockIdCopy(toBlockId, fromBlockId) \ + Assert(PointerIsValid(toBlockId)); \ + Assert(PointerIsValid(fromBlockId)); \ + (toBlockId)->bi_hi = (fromBlockId)->bi_hi; \ + (toBlockId)->bi_lo = (fromBlockId)->bi_lo + +/* + * BlockIdEquals -- + * Check for block number equality. + */ +#define BlockIdEquals(blockId1, blockId2) \ + ((blockId1)->bi_hi == (blockId2)->bi_hi && \ + (blockId1)->bi_lo == (blockId2)->bi_lo) + +/* + * BlockIdGetBlockNumber -- + * Retrieve the block number from a block identifier. + */ +#define BlockIdGetBlockNumber(blockId) \ + (AssertMacro(BlockIdIsValid(blockId)) ? \ + (BlockNumber) (((blockId)->bi_hi << 16) | ((uint16) (blockId)->bi_lo)) : \ + (BlockNumber) InvalidBlockNumber) + +#endif /* BLOCK_H */ diff --git a/src/backend/storage/buf.h b/src/backend/storage/buf.h new file mode 100644 index 00000000000..73582e8a61c --- /dev/null +++ b/src/backend/storage/buf.h @@ -0,0 +1,47 @@ +/*------------------------------------------------------------------------- + * + * buf.h-- + * Basic buffer manager data types. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: buf.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef BUF_H +#define BUF_H + +#define InvalidBuffer (0) +#define UnknownBuffer (-99999) + +typedef long Buffer; + +/* + * BufferIsInvalid -- + * True iff the buffer is invalid. + */ +#define BufferIsInvalid(buffer) ((buffer) == InvalidBuffer) + +/* + * BufferIsUnknown -- + * True iff the buffer is unknown. 
+ */ +#define BufferIsUnknown(buffer) ((buffer) == UnknownBuffer) + +/* + * BufferIsLocal -- + * True iff the buffer is local (not visible to other servers). + */ +#define BufferIsLocal(buffer) ((buffer) < 0) + +/* + * If NO_BUFFERISVALID is defined, all error checking using BufferIsValid() + * are suppressed. Decision-making using BufferIsValid is not affected. + * This should be set only if one is sure there will be no errors. + * - plai 9/10/90 + */ +#undef NO_BUFFERISVALID + +#endif /* BUF_H */ diff --git a/src/backend/storage/buf_internals.h b/src/backend/storage/buf_internals.h new file mode 100644 index 00000000000..84583867faf --- /dev/null +++ b/src/backend/storage/buf_internals.h @@ -0,0 +1,220 @@ +/*------------------------------------------------------------------------- + * + * buf_internals.h-- + * Internal definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: buf_internals.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $ + * + * NOTE + * If BUFFERPAGE0 is defined, then 0 will be used as a + * valid buffer page number. + * + *------------------------------------------------------------------------- + */ +#ifndef BUFMGR_INTERNALS_H +#define BUFMGR_INTERNALS_H + +#include "postgres.h" +#include "storage/buf.h" +#include "storage/ipc.h" +#include "storage/shmem.h" +#include "miscadmin.h" +#include "storage/lmgr.h" +#include "utils/rel.h" +#include "utils/relcache.h" + +/* Buf Mgr constants */ +/* in bufmgr.c */ +extern int NBuffers; +extern int Data_Descriptors; +extern int Free_List_Descriptor; +extern int Lookup_List_Descriptor; +extern int Num_Descriptors; + +/* + * Flags for buffer descriptors + */ +#define BM_DIRTY (1 << 0) +#define BM_PRIVATE (1 << 1) +#define BM_VALID (1 << 2) +#define BM_DELETED (1 << 3) +#define BM_FREE (1 << 4) +#define BM_IO_IN_PROGRESS (1 << 5) +#define BM_IO_ERROR (1 << 6) + +typedef bits16 BufFlags; + +typedef struct sbufdesc BufferDesc; +typedef struct sbufdesc BufferHdr; +typedef struct buftag BufferTag; +/* long * so alignment will be correct */ +typedef long **BufferBlock; + +struct buftag{ + LRelId relId; + BlockNumber blockNum; /* blknum relative to begin of reln */ +}; + +#define CLEAR_BUFFERTAG(a)\ + (a)->relId.dbId = InvalidOid; \ + (a)->relId.relId = InvalidOid; \ + (a)->blockNum = InvalidBlockNumber + +#define INIT_BUFFERTAG(a,xx_reln,xx_blockNum) \ +{ \ + (a)->blockNum = xx_blockNum;\ + (a)->relId = RelationGetLRelId(xx_reln); \ +} + +#define COPY_BUFFERTAG(a,b)\ +{ \ + (a)->blockNum = (b)->blockNum;\ + LRelIdAssign(*(a),*(b));\ +} + +#define EQUAL_BUFFERTAG(a,b) \ + (((a)->blockNum == (b)->blockNum) &&\ + (OID_Equal((a)->relId.relId,(b)->relId.relId))) + + +#define BAD_BUFFER_ID(bid) ((bid<1) || (bid>(NBuffers))) +#define INVALID_DESCRIPTOR (-3) + +/* + * bletch hack -- anyplace that we declare space for relation or + * database names, we just use '16', not a symbolic constant, to + * specify their lengths. BM_NAMESIZE is the length of these names, + * and is used in the buffer manager code. somebody with lots of + * spare time should do this for all the other modules, too. + */ +#define BM_NAMESIZE 16 + +/* + * struct sbufdesc -- shared buffer cache metadata for a single + * shared buffer descriptor. + * + * We keep the name of the database and relation in which this + * buffer appears in order to avoid a catalog lookup on cache + * flush if we don't have the reldesc in the cache. 
It is also + * possible that the relation to which this buffer belongs is + * not visible to all backends at the time that it gets flushed. + * Dbname, relname, dbid, and relid are enough to determine where + * to put the buffer, for all storage managers. + */ + +struct sbufdesc { + Buffer freeNext; /* link for freelist chain */ + Buffer freePrev; + SHMEM_OFFSET data; /* pointer to data in buf pool */ + + /* tag and id must be together for table lookup to work */ + BufferTag tag; /* file/block identifier */ + int buf_id; /* maps global desc to local desc */ + + BufFlags flags; /* described below */ + int16 bufsmgr; /* storage manager id for buffer */ + unsigned refcount; /* # of times buffer is pinned */ + + char *sb_dbname; /* name of db in which buf belongs */ + char *sb_relname; /* name of reln */ +#ifdef HAS_TEST_AND_SET + /* can afford a dedicated lock if test-and-set locks are available */ + slock_t io_in_progress_lock; +#endif /* HAS_TEST_AND_SET */ + + /* + * I padded this structure to a power of 2 (128 bytes on a MIPS) because + * BufferDescriptorGetBuffer is called a billion times and it does an + * C pointer subtraction (i.e., "x - y" -> array index of x relative + * to y, which is calculated using division by struct size). Integer + * ".div" hits you for 35 cycles, as opposed to a 1-cycle "sra" ... + * this hack cut 10% off of the time to create the Wisconsin database! + * It eats up more shared memory, of course, but we're (allegedly) + * going to make some of these types bigger soon anyway... -pma 1/2/93 + */ +#if defined(PORTNAME_ultrix4) + char sb_pad[60]; /* no slock_t */ +#endif /* mips */ +#if defined(PORTNAME_sparc) || defined(PORTNAME_sparc_solaris) || defined(PORTNAME_irix5) + char sb_pad[56]; /* has slock_t */ +#endif /* sparc || irix5 */ +#if defined(PORTNAME_hpux) + char sb_pad[44]; /* has slock_t */ +#endif /* alpha */ +#if defined(PORTNAME_alpha) + char sb_pad[40]; /* has slock_t */ +#endif /* alpha */ +}; + +/* + * mao tracing buffer allocation + */ + +/*#define BMTRACE*/ +#ifdef BMTRACE + +typedef struct _bmtrace { + int bmt_pid; + long bmt_buf; + long bmt_dbid; + long bmt_relid; + int bmt_blkno; + int bmt_op; + +#define BMT_NOTUSED 0 +#define BMT_ALLOCFND 1 +#define BMT_ALLOCNOTFND 2 +#define BMT_DEALLOC 3 + +} bmtrace; + +#endif /* BMTRACE */ + + +/* + * Bufmgr Interface: + */ + +/* Internal routines: only called by buf.c */ + +/*freelist.c*/ +extern void AddBufferToFreelist(BufferDesc *bf); +extern void PinBuffer(BufferDesc *buf); +extern void PinBuffer_Debug(char *file, int line, BufferDesc *buf); +extern void UnpinBuffer(BufferDesc *buf); +extern void UnpinBuffer_Debug(char *file, int line, BufferDesc *buf); +extern BufferDesc *GetFreeBuffer(void); +extern void InitFreeList(bool init); +extern void DBG_FreeListCheck(int nfree); + +/* buf_table.c */ +extern void InitBufTable(void); +extern BufferDesc *BufTableLookup(BufferTag *tagPtr); +extern bool BufTableDelete(BufferDesc *buf); +extern bool BufTableInsert(BufferDesc *buf); +extern void DBG_LookupListCheck(int nlookup); + +/* bufmgr.c */ +extern BufferDesc *BufferDescriptors; +extern BufferBlock BufferBlocks; +extern long *PrivateRefCount; +extern long *LastRefCount; +extern SPINLOCK BufMgrLock; + +/* localbuf.c */ +extern long *LocalRefCount; +extern BufferDesc *LocalBufferDescriptors; +extern int NLocBuffer; + +extern BufferDesc *LocalBufferAlloc(Relation reln, BlockNumber blockNum, + bool *foundPtr); +extern int WriteLocalBuffer(Buffer buffer, bool release); +extern int FlushLocalBuffer(Buffer buffer); 
+extern void InitLocalBuffer(); +extern void LocalBufferSync(); +extern void ResetLocalBufferPool(); + +#endif /* BUFMGR_INTERNALS_H */ diff --git a/src/backend/storage/buffer/Makefile.inc b/src/backend/storage/buffer/Makefile.inc new file mode 100644 index 00000000000..1d507f9227b --- /dev/null +++ b/src/backend/storage/buffer/Makefile.inc @@ -0,0 +1,16 @@ +#------------------------------------------------------------------------- +# +# Makefile.inc-- +# Makefile for storage/buffer +# +# Copyright (c) 1994, Regents of the University of California +# +# +# IDENTIFICATION +# $Header: /cvsroot/pgsql/src/backend/storage/buffer/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ +# +#------------------------------------------------------------------------- + +SUBSRCS+= buf_table.c buf_init.c bufmgr.c freelist.c localbuf.c + +SRCS_SITEMGR+= buf_table.c buf_init.c freelist.c diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c new file mode 100644 index 00000000000..823bf41eecf --- /dev/null +++ b/src/backend/storage/buffer/buf_init.c @@ -0,0 +1,280 @@ +/*------------------------------------------------------------------------- + * + * buf_init.c-- + * buffer manager initialization routines + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/buffer/buf_init.c,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include <sys/file.h> +#include <stdio.h> +#include <math.h> +#include <signal.h> + +/* declarations split between these three files */ +#include "storage/buf.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" + +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/shmem.h" +#include "storage/spin.h" +#include "storage/smgr.h" +#include "storage/lmgr.h" +#include "miscadmin.h" +#include "utils/builtins.h" +#include "utils/hsearch.h" +#include "utils/elog.h" +#include "utils/memutils.h" +#include "executor/execdebug.h" /* for NDirectFileRead */ +#include "catalog/catalog.h" + +/* + * if BMTRACE is defined, we trace the last 200 buffer allocations and + * deallocations in a circular buffer in shared memory. + */ +#ifdef BMTRACE +bmtrace *TraceBuf; +long *CurTraceBuf; +#define BMT_LIMIT 200 +#endif /* BMTRACE */ +int ShowPinTrace = 0; + +int NBuffers = NDBUFS; /* NDBUFS defined in miscadmin.h */ +int Data_Descriptors; +int Free_List_Descriptor; +int Lookup_List_Descriptor; +int Num_Descriptors; + +BufferDesc *BufferDescriptors; +BufferBlock BufferBlocks; +#ifndef HAS_TEST_AND_SET +long *NWaitIOBackendP; +#endif + +extern IpcSemaphoreId WaitIOSemId; + +long *PrivateRefCount; /* also used in freelist.c */ +long *LastRefCount; /* refcounts of last ExecMain level */ + +/* + * Data Structures: + * buffers live in a freelist and a lookup data structure. + * + * + * Buffer Lookup: + * Two important notes. First, the buffer has to be + * available for lookup BEFORE an IO begins. Otherwise + * a second process trying to read the buffer will + * allocate its own copy and the buffeer pool will + * become inconsistent. + * + * Buffer Replacement: + * see freelist.c. A buffer cannot be replaced while in + * use either by data manager or during IO. + * + * WriteBufferBack: + * currently, a buffer is only written back at the time + * it is selected for replacement. It should + * be done sooner if possible to reduce latency of + * BufferAlloc(). 
Maybe there should be a daemon process. + * + * Synchronization/Locking: + * + * BufMgrLock lock -- must be acquired before manipulating the + * buffer queues (lookup/freelist). Must be released + * before exit and before doing any IO. + * + * IO_IN_PROGRESS -- this is a flag in the buffer descriptor. + * It must be set when an IO is initiated and cleared at + * the end of the IO. It is there to make sure that one + * process doesn't start to use a buffer while another is + * faulting it in. see IOWait/IOSignal. + * + * refcount -- A buffer is pinned during IO and immediately + * after a BufferAlloc(). A buffer is always either pinned + * or on the freelist but never both. The buffer must be + * released, written, or flushed before the end of + * transaction. + * + * PrivateRefCount -- Each buffer also has a private refcount the keeps + * track of the number of times the buffer is pinned in the current + * processes. This is used for two purposes, first, if we pin a + * a buffer more than once, we only need to change the shared refcount + * once, thus only lock the buffer pool once, second, when a transaction + * aborts, it should only unpin the buffers exactly the number of times it + * has pinned them, so that it will not blow away buffers of another + * backend. + * + */ + +SPINLOCK BufMgrLock; + +/* delayed write: TRUE on, FALSE off */ +int LateWrite = TRUE; + +int ReadBufferCount; +int BufferHitCount; +int BufferFlushCount; + + +/* + * Initialize module: + * + * should calculate size of pool dynamically based on the + * amount of available memory. + */ +void +InitBufferPool(IPCKey key) +{ + bool foundBufs,foundDescs; + int i; + + Data_Descriptors = NBuffers; + Free_List_Descriptor = Data_Descriptors; + Lookup_List_Descriptor = Data_Descriptors + 1; + Num_Descriptors = Data_Descriptors + 1; + + SpinAcquire(BufMgrLock); + +#ifdef BMTRACE + CurTraceBuf = (long *) ShmemInitStruct("Buffer trace", + (BMT_LIMIT * sizeof(bmtrace)) + sizeof(long), + &foundDescs); + if (!foundDescs) + memset(CurTraceBuf, 0, (BMT_LIMIT * sizeof(bmtrace)) + sizeof(long)); + + TraceBuf = (bmtrace *) &(CurTraceBuf[1]); +#endif + + BufferDescriptors = (BufferDesc *) + ShmemInitStruct("Buffer Descriptors", + Num_Descriptors*sizeof(BufferDesc),&foundDescs); + + BufferBlocks = (BufferBlock) + ShmemInitStruct("Buffer Blocks", + NBuffers*BLCKSZ,&foundBufs); + +#ifndef HAS_TEST_AND_SET + { + bool foundNWaitIO; + + NWaitIOBackendP = (long *)ShmemInitStruct("#Backends Waiting IO", + sizeof(long), + &foundNWaitIO); + if (!foundNWaitIO) + *NWaitIOBackendP = 0; + } +#endif + + if (foundDescs || foundBufs) { + + /* both should be present or neither */ + Assert(foundDescs && foundBufs); + + } else { + BufferDesc *buf; + unsigned long block; + + buf = BufferDescriptors; + block = (unsigned long) BufferBlocks; + + /* + * link the buffers into a circular, doubly-linked list to + * initialize free list. Still don't know anything about + * replacement strategy in this file. 
+ */ + for (i = 0; i < Data_Descriptors; block+=BLCKSZ,buf++,i++) { + Assert(ShmemIsValid((unsigned long)block)); + + buf->freeNext = i+1; + buf->freePrev = i-1; + + CLEAR_BUFFERTAG(&(buf->tag)); + buf->data = MAKE_OFFSET(block); + buf->flags = (BM_DELETED | BM_FREE | BM_VALID); + buf->refcount = 0; + buf->buf_id = i; +#ifdef HAS_TEST_AND_SET + S_INIT_LOCK(&(buf->io_in_progress_lock)); +#endif + } + + /* close the circular queue */ + BufferDescriptors[0].freePrev = Data_Descriptors-1; + BufferDescriptors[Data_Descriptors-1].freeNext = 0; + } + + /* Init the rest of the module */ + InitBufTable(); + InitFreeList(!foundDescs); + + SpinRelease(BufMgrLock); + +#ifndef HAS_TEST_AND_SET + { + int status; + WaitIOSemId = IpcSemaphoreCreate(IPCKeyGetWaitIOSemaphoreKey(key), + 1, IPCProtection, 0, 1, &status); + } +#endif + PrivateRefCount = (long *) calloc(NBuffers, sizeof(long)); + LastRefCount = (long *) calloc(NBuffers, sizeof(long)); +} + +/* ----------------------------------------------------- + * BufferShmemSize + * + * compute the size of shared memory for the buffer pool including + * data pages, buffer descriptors, hash tables, etc. + * ---------------------------------------------------- + */ +int +BufferShmemSize() +{ + int size = 0; + int nbuckets; + int nsegs; + int tmp; + + nbuckets = 1 << (int)my_log2((NBuffers - 1) / DEF_FFACTOR + 1); + nsegs = 1 << (int)my_log2((nbuckets - 1) / DEF_SEGSIZE + 1); + + /* size of shmem binding table */ + size += MAXALIGN(my_log2(BTABLE_SIZE) * sizeof(void *)); /* HTAB->dir */ + size += MAXALIGN(sizeof(HHDR)); /* HTAB->hctl */ + size += MAXALIGN(DEF_SEGSIZE * sizeof(SEGMENT)); + size += BUCKET_ALLOC_INCR * + (MAXALIGN(sizeof(BUCKET_INDEX)) + + MAXALIGN(BTABLE_KEYSIZE) + + MAXALIGN(BTABLE_DATASIZE)); + + /* size of buffer descriptors */ + size += MAXALIGN((NBuffers + 1) * sizeof(BufferDesc)); + + /* size of data pages */ + size += NBuffers * MAXALIGN(BLCKSZ); + + /* size of buffer hash table */ + size += MAXALIGN(my_log2(NBuffers) * sizeof(void *)); /* HTAB->dir */ + size += MAXALIGN(sizeof(HHDR)); /* HTAB->hctl */ + size += nsegs * MAXALIGN(DEF_SEGSIZE * sizeof(SEGMENT)); + tmp = (int)ceil((double)NBuffers/BUCKET_ALLOC_INCR); + size += tmp * BUCKET_ALLOC_INCR * + (MAXALIGN(sizeof(BUCKET_INDEX)) + + MAXALIGN(sizeof(BufferTag)) + + MAXALIGN(sizeof(Buffer))); + +#ifdef BMTRACE + size += (BMT_LIMIT * sizeof(bmtrace)) + sizeof(long); +#endif + return size; +} + + diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c new file mode 100644 index 00000000000..502ded954ed --- /dev/null +++ b/src/backend/storage/buffer/buf_table.c @@ -0,0 +1,162 @@ +/*------------------------------------------------------------------------- + * + * buf_table.c-- + * routines for finding buffers in the buffer pool. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/buffer/buf_table.c,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +/* + * OLD COMMENTS + * + * Data Structures: + * + * Buffers are identified by their BufferTag (buf.h). This + * file contains routines for allocating a shmem hash table to + * map buffer tags to buffer descriptors. + * + * Synchronization: + * + * All routines in this file assume buffer manager spinlock is + * held by their caller. 
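For reference, a minimal caller-side sketch of the locking convention just described, mirroring what BufferAlloc() does later in this diff (illustrative only, not part of buf_table.c; 'reln' and 'blockNum' are assumed to be in scope):

    /* Illustrative sketch: the caller, not buf_table.c, holds the
     * buffer-pool spinlock around the lookup. */
    BufferTag   tag;
    BufferDesc *bufHdr;

    INIT_BUFFERTAG(&tag, reln, blockNum);   /* reln/blockNum assumed in scope */

    SpinAcquire(BufMgrLock);                /* buf_table.c assumes this is held */
    bufHdr = BufTableLookup(&tag);
    if (bufHdr != NULL)
        PinBuffer(bufHdr);                  /* pin before giving up the lock */
    SpinRelease(BufMgrLock);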
+ */ +#include "storage/bufmgr.h" +#include "storage/buf_internals.h" /* where the declarations go */ +#include "storage/shmem.h" +#include "storage/spin.h" +#include "utils/hsearch.h" +#include "utils/elog.h" + +static HTAB *SharedBufHash; + +extern HTAB *ShmemInitHash(); + +typedef struct lookup { + BufferTag key; + Buffer id; +} LookupEnt; + +/* + * Initialize shmem hash table for mapping buffers + */ +void +InitBufTable() +{ + HASHCTL info; + int hash_flags; + + /* assume lock is held */ + + /* BufferTag maps to Buffer */ + info.keysize = sizeof(BufferTag); + info.datasize = sizeof(Buffer); + info.hash = tag_hash; + + hash_flags = (HASH_ELEM | HASH_FUNCTION); + + + SharedBufHash = (HTAB *) ShmemInitHash("Shared Buf Lookup Table", + NBuffers,NBuffers, + &info,hash_flags); + + if (! SharedBufHash) { + elog(FATAL,"couldn't initialize shared buffer pool Hash Tbl"); + exit(1); + } + +} + +BufferDesc * +BufTableLookup(BufferTag *tagPtr) +{ + LookupEnt * result; + bool found; + + if (tagPtr->blockNum == P_NEW) + return(NULL); + + result = (LookupEnt *) + hash_search(SharedBufHash,(char *) tagPtr,HASH_FIND,&found); + + if (! result){ + elog(WARN,"BufTableLookup: BufferLookup table corrupted"); + return(NULL); + } + if (! found) { + return(NULL); + } + return(&(BufferDescriptors[result->id])); +} + +/* + * BufTableDelete + */ +bool +BufTableDelete(BufferDesc *buf) +{ + LookupEnt * result; + bool found; + + /* buffer not initialized or has been removed from + * table already. BM_DELETED keeps us from removing + * buffer twice. + */ + if (buf->flags & BM_DELETED) { + return(TRUE); + } + + buf->flags |= BM_DELETED; + + result = (LookupEnt *) + hash_search(SharedBufHash,(char *) &(buf->tag),HASH_REMOVE,&found); + + if (! (result && found)) { + elog(WARN,"BufTableDelete: BufferLookup table corrupted"); + return(FALSE); + } + + return(TRUE); +} + +bool +BufTableInsert(BufferDesc *buf) +{ + LookupEnt * result; + bool found; + + /* cannot insert it twice */ + Assert (buf->flags & BM_DELETED); + buf->flags &= ~(BM_DELETED); + + result = (LookupEnt *) + hash_search(SharedBufHash,(char *) &(buf->tag),HASH_ENTER,&found); + + if (! result) { + Assert(0); + elog(WARN,"BufTableInsert: BufferLookup table corrupted"); + return(FALSE); + } + /* found something else in the table ! */ + if (found) { + Assert(0); + elog(WARN,"BufTableInsert: BufferLookup table corrupted"); + return(FALSE); + } + + result->id = buf->buf_id; + return(TRUE); +} + +/* prints out collision stats for the buf table */ +void +DBG_LookupListCheck(int nlookup) +{ + nlookup = 10; + + hash_stats("Shared",SharedBufHash); +} diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c new file mode 100644 index 00000000000..655f1f408e0 --- /dev/null +++ b/src/backend/storage/buffer/bufmgr.c @@ -0,0 +1,1581 @@ +/*------------------------------------------------------------------------- + * + * bufmgr.c-- + * buffer manager interface routines + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +/* + * + * BufferAlloc() -- lookup a buffer in the buffer table. If + * it isn't there add it, but do not read it into memory. + * This is used when we are about to reinitialize the + * buffer so don't care what the current disk contents are. + * BufferAlloc() pins the new buffer in memory. 
+ * + * ReadBuffer() -- same as BufferAlloc() but reads the data + * on a buffer cache miss. + * + * ReleaseBuffer() -- unpin the buffer + * + * WriteNoReleaseBuffer() -- mark the buffer contents as "dirty" + * but don't unpin. The disk IO is delayed until buffer + * replacement if LateWrite flag is set. + * + * WriteBuffer() -- WriteNoReleaseBuffer() + ReleaseBuffer() + * + * DirtyBufferCopy() -- For a given dbid/relid/blockno, if the buffer is + * in the cache and is dirty, mark it clean and copy + * it to the requested location. This is a logical + * write, and has been installed to support the cache + * management code for write-once storage managers. + * + * FlushBuffer() -- as above but never delayed write. + * + * BufferSync() -- flush all dirty buffers in the buffer pool. + * + * InitBufferPool() -- Init the buffer module. + * + * See other files: + * freelist.c -- chooses victim for buffer replacement + * buf_table.c -- manages the buffer lookup table + */ +#include <sys/file.h> +#include <stdio.h> +#include <math.h> +#include <signal.h> + +/* declarations split between these three files */ +#include "storage/buf.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" + +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/shmem.h" +#include "storage/spin.h" +#include "storage/smgr.h" +#include "storage/lmgr.h" +#include "miscadmin.h" +#include "utils/builtins.h" +#include "utils/hsearch.h" +#include "utils/elog.h" +#include "utils/palloc.h" +#include "utils/memutils.h" +#include "executor/execdebug.h" /* for NDirectFileRead */ +#include "catalog/catalog.h" + +extern int LateWrite; +extern SPINLOCK BufMgrLock; +extern int ReadBufferCount; +extern int BufferHitCount; +extern int BufferFlushCount; + +static void WaitIO(BufferDesc *buf, SPINLOCK spinlock); +#ifndef HAS_TEST_AND_SET +static void SignalIO(BufferDesc *buf); +extern long *NWaitIOBackendP; /* defined in buf_init.c */ +#endif /* HAS_TEST_AND_SET */ + +static Buffer ReadBufferWithBufferLock(Relation relation, BlockNumber blockNum, + bool bufferLockHeld); +static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum, + bool *foundPtr, bool bufferLockHeld); +static int FlushBuffer(Buffer buffer); +static void BufferSync(void); +static int BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld); + +/* --------------------------------------------------- + * RelationGetBufferWithBuffer + * see if the given buffer is what we want + * if yes, we don't need to bother the buffer manager + * --------------------------------------------------- + */ +Buffer +RelationGetBufferWithBuffer(Relation relation, + BlockNumber blockNumber, + Buffer buffer) +{ + BufferDesc *bufHdr; + LRelId lrelId; + + if (BufferIsValid(buffer)) { + if (!BufferIsLocal(buffer)) { + bufHdr = &BufferDescriptors[buffer-1]; + lrelId = RelationGetLRelId(relation); + SpinAcquire(BufMgrLock); + if (bufHdr->tag.blockNum == blockNumber && + bufHdr->tag.relId.relId == lrelId.relId && + bufHdr->tag.relId.dbId == lrelId.dbId) { + SpinRelease(BufMgrLock); + return(buffer); + } + return(ReadBufferWithBufferLock(relation, blockNumber, true)); + } else { + bufHdr = &LocalBufferDescriptors[-buffer-1]; + if (bufHdr->tag.relId.relId == relation->rd_id && + bufHdr->tag.blockNum == blockNumber) { + return(buffer); + } + } + } + return(ReadBuffer(relation, blockNumber)); +} + +/* + * ReadBuffer -- returns a buffer containing the requested + * block of the requested relation. 
If the blknum + * requested is P_NEW, extend the relation file and + * allocate a new block. + * + * Returns: the buffer number for the buffer containing + * the block read or NULL on an error. + * + * Assume when this function is called, that reln has been + * opened already. + */ + +extern int ShowPinTrace; + + +#undef ReadBuffer /* conflicts with macro when BUFMGR_DEBUG defined */ + +/* + * ReadBuffer -- + * + */ +Buffer +ReadBuffer(Relation reln, BlockNumber blockNum) +{ + return ReadBufferWithBufferLock(reln, blockNum, false); +} + +/* + * is_userbuffer + * + * XXX caller must have already acquired BufMgrLock + */ +static bool +is_userbuffer(Buffer buffer) +{ + BufferDesc *buf = &BufferDescriptors[buffer-1]; + + if (IsSystemRelationName(buf->sb_relname)) + return false; + return true; +} + +Buffer +ReadBuffer_Debug(char *file, + int line, + Relation reln, + BlockNumber blockNum) +{ + Buffer buffer; + + buffer = ReadBufferWithBufferLock(reln, blockNum, false); + if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer)) { + BufferDesc *buf = &BufferDescriptors[buffer-1]; + + fprintf(stderr, "PIN(RD) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + buffer, buf->sb_relname, buf->tag.blockNum, + PrivateRefCount[buffer - 1], file, line); + } + return buffer; +} + +/* + * ReadBufferWithBufferLock -- does the work of + * ReadBuffer() but with the possibility that + * the buffer lock has already been held. this + * is yet another effort to reduce the number of + * semops in the system. + */ +static Buffer +ReadBufferWithBufferLock(Relation reln, + BlockNumber blockNum, + bool bufferLockHeld) +{ + BufferDesc *bufHdr; + int extend; /* extending the file by one block */ + int status; + bool found; + bool isLocalBuf; + + extend = (blockNum == P_NEW); + isLocalBuf = reln->rd_islocal; + + if (isLocalBuf) { + bufHdr = LocalBufferAlloc(reln, blockNum, &found); + } else { + ReadBufferCount++; + + /* lookup the buffer. IO_IN_PROGRESS is set if the requested + * block is not currently in memory. + */ + bufHdr = BufferAlloc(reln, blockNum, &found, bufferLockHeld); + if (found) BufferHitCount++; + } + + if (!bufHdr) { + return(InvalidBuffer); + } + + /* if its already in the buffer pool, we're done */ + if (found) { + /* + * This happens when a bogus buffer was returned previously and is + * floating around in the buffer pool. A routine calling this would + * want this extended. + */ + if (extend) { + /* new buffers are zero-filled */ + memset((char *) MAKE_PTR(bufHdr->data), 0, BLCKSZ); + (void) smgrextend(bufHdr->bufsmgr, reln, + (char *) MAKE_PTR(bufHdr->data)); + } + return (BufferDescriptorGetBuffer(bufHdr)); + + } + + /* + * if we have gotten to this point, the reln pointer must be ok + * and the relation file must be open. + */ + if (extend) { + /* new buffers are zero-filled */ + (void) memset((char *) MAKE_PTR(bufHdr->data), 0, BLCKSZ); + status = smgrextend(bufHdr->bufsmgr, reln, + (char *) MAKE_PTR(bufHdr->data)); + } else { + status = smgrread(bufHdr->bufsmgr, reln, blockNum, + (char *) MAKE_PTR(bufHdr->data)); + } + + if (isLocalBuf) + return (BufferDescriptorGetBuffer(bufHdr)); + + /* lock buffer manager again to update IO IN PROGRESS */ + SpinAcquire(BufMgrLock); + + if (status == SM_FAIL) { + /* IO Failed. cleanup the data structures and go home */ + + if (! 
BufTableDelete(bufHdr)) { + SpinRelease(BufMgrLock); + elog(FATAL,"BufRead: buffer table broken after IO error\n"); + } + /* remember that BufferAlloc() pinned the buffer */ + UnpinBuffer(bufHdr); + + /* + * Have to reset the flag so that anyone waiting for + * the buffer can tell that the contents are invalid. + */ + bufHdr->flags |= BM_IO_ERROR; + + } else { + /* IO Succeeded. clear the flags, finish buffer update */ + + bufHdr->flags &= ~(BM_IO_ERROR | BM_IO_IN_PROGRESS); + } + + /* If anyone was waiting for IO to complete, wake them up now */ +#ifdef HAS_TEST_AND_SET + S_UNLOCK(&(bufHdr->io_in_progress_lock)); +#else + if (bufHdr->refcount > 1) + SignalIO(bufHdr); +#endif + + SpinRelease(BufMgrLock); + + return(BufferDescriptorGetBuffer(bufHdr)); +} + +/* + * BufferAlloc -- Get a buffer from the buffer pool but dont + * read it. + * + * Returns: descriptor for buffer + * + * When this routine returns, the BufMgrLock is guaranteed NOT be held. + */ +static BufferDesc * +BufferAlloc(Relation reln, + BlockNumber blockNum, + bool *foundPtr, + bool bufferLockHeld) +{ + BufferDesc *buf, *buf2; + BufferTag newTag; /* identity of requested block */ + bool inProgress; /* buffer undergoing IO */ + bool newblock = FALSE; + + /* create a new tag so we can lookup the buffer */ + /* assume that the relation is already open */ + if (blockNum == P_NEW) { + newblock = TRUE; + blockNum = smgrnblocks(reln->rd_rel->relsmgr, reln); + } + + INIT_BUFFERTAG(&newTag,reln,blockNum); + + if (!bufferLockHeld) + SpinAcquire(BufMgrLock); + + /* see if the block is in the buffer pool already */ + buf = BufTableLookup(&newTag); + if (buf != NULL) { + /* Found it. Now, (a) pin the buffer so no + * one steals it from the buffer pool, + * (b) check IO_IN_PROGRESS, someone may be + * faulting the buffer into the buffer pool. + */ + + PinBuffer(buf); + inProgress = (buf->flags & BM_IO_IN_PROGRESS); + + *foundPtr = TRUE; + if (inProgress) { + WaitIO(buf, BufMgrLock); + if (buf->flags & BM_IO_ERROR) { + /* wierd race condition: + * + * We were waiting for someone else to read the buffer. + * While we were waiting, the reader boof'd in some + * way, so the contents of the buffer are still + * invalid. By saying that we didn't find it, we can + * make the caller reinitialize the buffer. If two + * processes are waiting for this block, both will + * read the block. The second one to finish may overwrite + * any updates made by the first. (Assume higher level + * synchronization prevents this from happening). + * + * This is never going to happen, don't worry about it. + */ + *foundPtr = FALSE; + } + } +#ifdef BMTRACE + _bm_trace((reln->rd_rel->relisshared ? 0 : MyDatabaseId), reln->rd_id, blockNum, BufferDescriptorGetBuffer(buf), BMT_ALLOCFND); +#endif /* BMTRACE */ + + SpinRelease(BufMgrLock); + + return(buf); + } + + *foundPtr = FALSE; + + /* + * Didn't find it in the buffer pool. We'll have + * to initialize a new buffer. First, grab one from + * the free list. If it's dirty, flush it to disk. + * Remember to unlock BufMgr spinlock while doing the IOs. + */ + inProgress = FALSE; + for (buf = (BufferDesc *) NULL; buf == (BufferDesc *) NULL; ) { + + /* GetFreeBuffer will abort if it can't find a free buffer */ + buf = GetFreeBuffer(); + + /* + * There should be exactly one pin on the buffer after + * it is allocated -- ours. If it had a pin it wouldn't + * have been on the free list. No one else could have + * pinned it between GetFreeBuffer and here because we + * have the BufMgrLock. 
+ */ + Assert(buf->refcount == 0); + buf->refcount = 1; + PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 1; + + if (buf->flags & BM_DIRTY) { + /* + * Set BM_IO_IN_PROGRESS to keep anyone from doing anything + * with the contents of the buffer while we write it out. + * We don't really care if they try to read it, but if they + * can complete a BufferAlloc on it they can then scribble + * into it, and we'd really like to avoid that while we are + * flushing the buffer. Setting this flag should block them + * in WaitIO until we're done. + */ + inProgress = TRUE; + buf->flags |= BM_IO_IN_PROGRESS; +#ifdef HAS_TEST_AND_SET + /* + * All code paths that acquire this lock pin the buffer + * first; since no one had it pinned (it just came off the + * free list), no one else can have this lock. + */ + Assert(S_LOCK_FREE(&(buf->io_in_progress_lock))); + S_LOCK(&(buf->io_in_progress_lock)); +#endif /* HAS_TEST_AND_SET */ + + /* + * Write the buffer out, being careful to release BufMgrLock + * before starting the I/O. + * + * This #ifndef is here because a few extra semops REALLY kill + * you on machines that don't have spinlocks. If you don't + * operate with much concurrency, well... + */ + (void) BufferReplace(buf, true); + BufferFlushCount++; +#ifndef OPTIMIZE_SINGLE + SpinAcquire(BufMgrLock); +#endif /* OPTIMIZE_SINGLE */ + + /* + * Somebody could have pinned the buffer while we were + * doing the I/O and had given up the BufMgrLock (though + * they would be waiting for us to clear the BM_IO_IN_PROGRESS + * flag). That's why this is a loop -- if so, we need to clear + * the I/O flags, remove our pin and start all over again. + * + * People may be making buffers free at any time, so there's + * no reason to think that we have an immediate disaster on + * our hands. + */ + if (buf->refcount > 1) { + inProgress = FALSE; + buf->flags &= ~BM_IO_IN_PROGRESS; +#ifdef HAS_TEST_AND_SET + S_UNLOCK(&(buf->io_in_progress_lock)); +#else /* !HAS_TEST_AND_SET */ + if (buf->refcount > 1) + SignalIO(buf); +#endif /* !HAS_TEST_AND_SET */ + PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0; + buf->refcount--; + buf = (BufferDesc *) NULL; + } + + /* + * Somebody could have allocated another buffer for the + * same block we are about to read in. (While we flush out + * the dirty buffer, we don't hold the lock and someone could + * have allocated another buffer for the same block. The problem + * is we haven't gotten around to insert the new tag into + * the buffer table. So we need to check here. -ay 3/95 + */ + buf2 = BufTableLookup(&newTag); + if (buf2 != NULL) { + /* Found it. Someone has already done what we're about + * to do. We'll just handle this as if it were found in + * the buffer pool in the first place. 
+ */ + + PinBuffer(buf2); + inProgress = (buf2->flags & BM_IO_IN_PROGRESS); + + *foundPtr = TRUE; + if (inProgress) { + WaitIO(buf2, BufMgrLock); + if (buf2->flags & BM_IO_ERROR) { + *foundPtr = FALSE; + } + } + +#ifdef HAS_TEST_AND_SET + S_UNLOCK(&(buf->io_in_progress_lock)); +#else /* !HAS_TEST_AND_SET */ + if (buf->refcount > 1) + SignalIO(buf); +#endif /* !HAS_TEST_AND_SET */ + + /* give up the buffer since we don't need it any more */ + buf->refcount--; + PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0; + AddBufferToFreelist(buf); + buf->flags |= BM_FREE; + buf->flags &= ~BM_DIRTY; + buf->flags &= ~BM_IO_IN_PROGRESS; + + SpinRelease(BufMgrLock); + + return(buf2); + } + } + } + /* + * At this point we should have the sole pin on a non-dirty + * buffer and we may or may not already have the BM_IO_IN_PROGRESS + * flag set. + */ + + /* + * Change the name of the buffer in the lookup table: + * + * Need to update the lookup table before the read starts. + * If someone comes along looking for the buffer while + * we are reading it in, we don't want them to allocate + * a new buffer. For the same reason, we didn't want + * to erase the buf table entry for the buffer we were + * writing back until now, either. + */ + + if (! BufTableDelete(buf)) { + SpinRelease(BufMgrLock); + elog(FATAL,"buffer wasn't in the buffer table\n"); + + } + + if (buf->flags & BM_DIRTY) { + /* must clear flag first because of wierd race + * condition described below. + */ + buf->flags &= ~BM_DIRTY; + } + + /* record the database name and relation name for this buffer */ + buf->sb_relname = pstrdup(reln->rd_rel->relname.data); + buf->sb_dbname = pstrdup(GetDatabaseName()); + + /* remember which storage manager is responsible for it */ + buf->bufsmgr = reln->rd_rel->relsmgr; + + INIT_BUFFERTAG(&(buf->tag),reln,blockNum); + if (! BufTableInsert(buf)) { + SpinRelease(BufMgrLock); + elog(FATAL,"Buffer in lookup table twice \n"); + } + + /* Buffer contents are currently invalid. Have + * to mark IO IN PROGRESS so no one fiddles with + * them until the read completes. If this routine + * has been called simply to allocate a buffer, no + * io will be attempted, so the flag isnt set. + */ + if (!inProgress) { + buf->flags |= BM_IO_IN_PROGRESS; +#ifdef HAS_TEST_AND_SET + Assert(S_LOCK_FREE(&(buf->io_in_progress_lock))); + S_LOCK(&(buf->io_in_progress_lock)); +#endif /* HAS_TEST_AND_SET */ + } + +#ifdef BMTRACE + _bm_trace((reln->rd_rel->relisshared ? 0 : MyDatabaseId), reln->rd_id, blockNum, BufferDescriptorGetBuffer(buf), BMT_ALLOCNOTFND); +#endif /* BMTRACE */ + + SpinRelease(BufMgrLock); + + return (buf); +} + +/* + * WriteBuffer-- + * + * Pushes buffer contents to disk if LateWrite is + * not set. Otherwise, marks contents as dirty. + * + * Assume that buffer is pinned. Assume that reln is + * valid. + * + * Side Effects: + * Pin count is decremented. + */ + +#undef WriteBuffer + +int +WriteBuffer(Buffer buffer) +{ + BufferDesc *bufHdr; + + if (! 
LateWrite) { + return(FlushBuffer(buffer)); + } else { + + if (BufferIsLocal(buffer)) + return WriteLocalBuffer(buffer, TRUE); + + if (BAD_BUFFER_ID(buffer)) + return(FALSE); + + bufHdr = &BufferDescriptors[buffer-1]; + + SpinAcquire(BufMgrLock); + Assert(bufHdr->refcount > 0); + bufHdr->flags |= BM_DIRTY; + UnpinBuffer(bufHdr); + SpinRelease(BufMgrLock); + } + return(TRUE); +} + +void +WriteBuffer_Debug(char *file, int line, Buffer buffer) +{ + WriteBuffer(buffer); + if (ShowPinTrace && BufferIsLocal(buffer) && is_userbuffer(buffer)) { + BufferDesc *buf; + buf = &BufferDescriptors[buffer-1]; + fprintf(stderr, "UNPIN(WR) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + buffer, buf->sb_relname, buf->tag.blockNum, + PrivateRefCount[buffer - 1], file, line); + } +} + +/* + * DirtyBufferCopy() -- Copy a given dirty buffer to the requested + * destination. + * + * We treat this as a write. If the requested buffer is in the pool + * and is dirty, we copy it to the location requested and mark it + * clean. This routine supports the Sony jukebox storage manager, + * which agrees to take responsibility for the data once we mark + * it clean. + * + * NOTE: used by sony jukebox code in postgres 4.2 - ay 2/95 + */ +void +DirtyBufferCopy(Oid dbid, Oid relid, BlockNumber blkno, char *dest) +{ + BufferDesc *buf; + BufferTag btag; + + btag.relId.relId = relid; + btag.relId.dbId = dbid; + btag.blockNum = blkno; + + SpinAcquire(BufMgrLock); + buf = BufTableLookup(&btag); + + if (buf == (BufferDesc *) NULL + || !(buf->flags & BM_DIRTY) + || !(buf->flags & BM_VALID)) { + SpinRelease(BufMgrLock); + return; + } + + /* hate to do this holding the lock, but release and reacquire is slower */ + memmove(dest, (char *) MAKE_PTR(buf->data), BLCKSZ); + + buf->flags &= ~BM_DIRTY; + + SpinRelease(BufMgrLock); +} + +/* + * FlushBuffer -- like WriteBuffer, but force the page to disk. + * + * 'buffer' is known to be dirty/pinned, so there should not be a + * problem reading the BufferDesc members without the BufMgrLock + * (nobody should be able to change tags, flags, etc. out from under + * us). + */ +static int +FlushBuffer(Buffer buffer) +{ + BufferDesc *bufHdr; + + if (BufferIsLocal(buffer)) + return FlushLocalBuffer(buffer); + + if (BAD_BUFFER_ID(buffer)) + return (STATUS_ERROR); + + bufHdr = &BufferDescriptors[buffer-1]; + + if (!BufferReplace(bufHdr, false)) { + elog(WARN, "FlushBuffer: cannot flush %d", bufHdr->tag.blockNum); + return (STATUS_ERROR); + } + + SpinAcquire(BufMgrLock); + bufHdr->flags &= ~BM_DIRTY; + UnpinBuffer(bufHdr); + SpinRelease(BufMgrLock); + + return(STATUS_OK); +} + +/* + * WriteNoReleaseBuffer -- like WriteBuffer, but do not unpin the buffer + * when the operation is complete. + * + * We know that the buffer is for a relation in our private cache, + * because this routine is called only to write out buffers that + * were changed by the executing backend. + */ +int +WriteNoReleaseBuffer(Buffer buffer) +{ + BufferDesc *bufHdr; + + if (! LateWrite) { + return(FlushBuffer(buffer)); + } else { + + if (BufferIsLocal(buffer)) + return WriteLocalBuffer(buffer, FALSE); + + if (BAD_BUFFER_ID(buffer)) + return (STATUS_ERROR); + + bufHdr = &BufferDescriptors[buffer-1]; + + SpinAcquire(BufMgrLock); + bufHdr->flags |= BM_DIRTY; + SpinRelease(BufMgrLock); + } + return(STATUS_OK); +} + + +#undef ReleaseAndReadBuffer +/* + * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer() + * so that only one semop needs to be called. 
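A hedged sketch of the call pattern this routine is meant to support (illustrative only, not part of bufmgr.c): a caller scanning consecutive blocks moves its pin to the next block with one call instead of a separate ReleaseBuffer()/ReadBuffer() pair. 'reln' is assumed to be an open, non-empty Relation, and error handling is omitted.

    /* Illustrative only: walk the relation block by block. */
    BlockNumber nblocks = RelationGetNumberOfBlocks(reln);
    BlockNumber blkno;
    Buffer      buf = ReadBuffer(reln, 0);

    for (blkno = 1; blkno < nblocks; blkno++) {
        /* ... inspect BufferGetBlock(buf), i.e. block blkno - 1 ... */
        buf = ReleaseAndReadBuffer(buf, reln, blkno);   /* one semop, not two */
    }
    ReleaseBuffer(buf);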
+ * + */ +Buffer +ReleaseAndReadBuffer(Buffer buffer, + Relation relation, + BlockNumber blockNum) +{ + BufferDesc *bufHdr; + Buffer retbuf; + + if (BufferIsLocal(buffer)) { + Assert(LocalRefCount[-buffer - 1] > 0); + LocalRefCount[-buffer - 1]--; + } else { + if (BufferIsValid(buffer)) { + bufHdr = &BufferDescriptors[buffer-1]; + Assert(PrivateRefCount[buffer - 1] > 0); + PrivateRefCount[buffer - 1]--; + if (PrivateRefCount[buffer - 1] == 0 && + LastRefCount[buffer - 1] == 0) { + /* only release buffer if it is not pinned in previous ExecMain + level */ + SpinAcquire(BufMgrLock); + bufHdr->refcount--; + if (bufHdr->refcount == 0) { + AddBufferToFreelist(bufHdr); + bufHdr->flags |= BM_FREE; + } + retbuf = ReadBufferWithBufferLock(relation, blockNum, true); + return retbuf; + } + } + } + + return (ReadBuffer(relation, blockNum)); +} + +/* + * BufferSync -- Flush all dirty buffers in the pool. + * + * This is called at transaction commit time. It does the wrong thing, + * right now. We should flush only our own changes to stable storage, + * and we should obey the lock protocol on the buffer manager metadata + * as we do it. Also, we need to be sure that no other transaction is + * modifying the page as we flush it. This is only a problem for objects + * that use a non-two-phase locking protocol, like btree indices. For + * those objects, we would like to set a write lock for the duration of + * our IO. Another possibility is to code updates to btree pages + * carefully, so that writing them out out of order cannot cause + * any unrecoverable errors. + * + * I don't want to think hard about this right now, so I will try + * to come back to it later. + */ +static void +BufferSync() +{ + int i; + Oid bufdb; + Oid bufrel; + Relation reln; + BufferDesc *bufHdr; + int status; + + SpinAcquire(BufMgrLock); + for (i=0, bufHdr = BufferDescriptors; i < NBuffers; i++, bufHdr++) { + if ((bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY)) { + bufdb = bufHdr->tag.relId.dbId; + bufrel = bufHdr->tag.relId.relId; + if (bufdb == MyDatabaseId || bufdb == (Oid) 0) { + reln = RelationIdCacheGetRelation(bufrel); + + /* + * If we didn't have the reldesc in our local cache, flush this + * page out using the 'blind write' storage manager routine. If + * we did find it, use the standard interface. + */ + +#ifndef OPTIMIZE_SINGLE + SpinRelease(BufMgrLock); +#endif /* OPTIMIZE_SINGLE */ + if (reln == (Relation) NULL) { + status = smgrblindwrt(bufHdr->bufsmgr, bufHdr->sb_dbname, + bufHdr->sb_relname, bufdb, bufrel, + bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + } else { + status = smgrwrite(bufHdr->bufsmgr, reln, + bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + } +#ifndef OPTIMIZE_SINGLE + SpinAcquire(BufMgrLock); +#endif /* OPTIMIZE_SINGLE */ + + if (status == SM_FAIL) { + elog(WARN, "cannot write %d for %16s", + bufHdr->tag.blockNum, bufHdr->sb_relname); + } + + bufHdr->flags &= ~BM_DIRTY; + if (reln != (Relation)NULL) + RelationDecrementReferenceCount(reln); + } + } + } + SpinRelease(BufMgrLock); + + LocalBufferSync(); +} + + +/* + * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' + * is cleared. Because IO_IN_PROGRESS conflicts are + * expected to be rare, there is only one BufferIO + * lock in the entire system. All processes block + * on this semaphore when they try to use a buffer + * that someone else is faulting in. Whenever a + * process finishes an IO and someone is waiting for + * the buffer, BufferIO is signaled (SignalIO). 
All + * waiting processes then wake up and check to see + * if their buffer is now ready. This implementation + * is simple, but efficient enough if WaitIO is + * rarely called by multiple processes simultaneously. + * + * ProcSleep atomically releases the spinlock and goes to + * sleep. + * + * Note: there is an easy fix if the queue becomes long. + * save the id of the buffer we are waiting for in + * the queue structure. That way signal can figure + * out which proc to wake up. + */ +#ifdef HAS_TEST_AND_SET +static void +WaitIO(BufferDesc *buf, SPINLOCK spinlock) +{ + SpinRelease(spinlock); + S_LOCK(&(buf->io_in_progress_lock)); + S_UNLOCK(&(buf->io_in_progress_lock)); + SpinAcquire(spinlock); +} + +#else /* HAS_TEST_AND_SET */ +IpcSemaphoreId WaitIOSemId; + +static void +WaitIO(BufferDesc *buf, SPINLOCK spinlock) +{ + bool inProgress; + + for (;;) { + + /* wait until someone releases IO lock */ + (*NWaitIOBackendP)++; + SpinRelease(spinlock); + IpcSemaphoreLock(WaitIOSemId, 0, 1); + SpinAcquire(spinlock); + inProgress = (buf->flags & BM_IO_IN_PROGRESS); + if (!inProgress) break; + } +} + +/* + * SignalIO -- + */ +static void +SignalIO(BufferDesc *buf) +{ + /* somebody better be waiting. */ + Assert( buf->refcount > 1); + IpcSemaphoreUnlock(WaitIOSemId, 0, *NWaitIOBackendP); + *NWaitIOBackendP = 0; +} +#endif /* HAS_TEST_AND_SET */ + +long NDirectFileRead; /* some I/O's are direct file access. bypass bufmgr */ +long NDirectFileWrite; /* e.g., I/O in psort and hashjoin. */ + +void +PrintBufferUsage(FILE *statfp) +{ + float hitrate; + + if (ReadBufferCount==0) + hitrate = 0.0; + else + hitrate = (float)BufferHitCount * 100.0/ReadBufferCount; + + fprintf(statfp, "!\t%ld blocks read, %ld blocks written, buffer hit rate = %.2f%%\n", + ReadBufferCount - BufferHitCount + NDirectFileRead, + BufferFlushCount + NDirectFileWrite, + hitrate); +} + +void +ResetBufferUsage() +{ + BufferHitCount = 0; + ReadBufferCount = 0; + BufferFlushCount = 0; + NDirectFileRead = 0; + NDirectFileWrite = 0; +} + +/* ---------------------------------------------- + * ResetBufferPool + * + * this routine is supposed to be called when a transaction aborts. + * it will release all the buffer pins held by the transaciton. + * + * ---------------------------------------------- + */ +void +ResetBufferPool() +{ + register int i; + for (i=1; i<=NBuffers; i++) { + if (BufferIsValid(i)) { + while(PrivateRefCount[i - 1] > 0) { + ReleaseBuffer(i); + } + } + LastRefCount[i - 1] = 0; + } + + ResetLocalBufferPool(); +} + +/* ----------------------------------------------- + * BufferPoolCheckLeak + * + * check if there is buffer leak + * + * ----------------------------------------------- + */ +int +BufferPoolCheckLeak() +{ + register int i; + void PrintBufferDescs(); + + for (i = 1; i <= NBuffers; i++) { + if (BufferIsValid(i)) { + elog(NOTICE, "buffer leak detected in BufferPoolCheckLeak()"); + PrintBufferDescs(); + return(1); + } + } + return(0); +} + +/* ------------------------------------------------ + * FlushBufferPool + * + * flush all dirty blocks in buffer pool to disk + * + * ------------------------------------------------ + */ +void +FlushBufferPool(int StableMainMemoryFlag) +{ + if (!StableMainMemoryFlag) { + BufferSync(); + smgrcommit(); + } +} + +/* + * BufferIsValid -- + * True iff the refcnt of the local buffer is > 0 + * Note: + * BufferIsValid(InvalidBuffer) is False. + * BufferIsValid(UnknownBuffer) is False. 
+ */ +bool +BufferIsValid(Buffer bufnum) +{ + if (BufferIsLocal(bufnum)) + return (bufnum >= -NLocBuffer && LocalRefCount[-bufnum - 1] > 0); + + if (BAD_BUFFER_ID(bufnum)) + return(false); + + return ((bool)(PrivateRefCount[bufnum - 1] > 0)); +} + +/* + * BufferGetBlockNumber -- + * Returns the block number associated with a buffer. + * + * Note: + * Assumes that the buffer is valid. + */ +BlockNumber +BufferGetBlockNumber(Buffer buffer) +{ + Assert(BufferIsValid(buffer)); + + /* XXX should be a critical section */ + if (BufferIsLocal(buffer)) + return (LocalBufferDescriptors[-buffer-1].tag.blockNum); + else + return (BufferDescriptors[buffer-1].tag.blockNum); +} + +/* + * BufferGetRelation -- + * Returns the relation desciptor associated with a buffer. + * + * Note: + * Assumes buffer is valid. + */ +Relation +BufferGetRelation(Buffer buffer) +{ + Relation relation; + Oid relid; + + Assert(BufferIsValid(buffer)); + Assert(!BufferIsLocal(buffer)); /* not supported for local buffers */ + + /* XXX should be a critical section */ + relid = LRelIdGetRelationId(BufferDescriptors[buffer-1].tag.relId); + relation = RelationIdGetRelation(relid); + + RelationDecrementReferenceCount(relation); + + if (RelationHasReferenceCountZero(relation)) { + /* + elog(NOTICE, "BufferGetRelation: 0->1"); + */ + + RelationIncrementReferenceCount(relation); + } + + return (relation); +} + +/* + * BufferReplace + * + * Flush the buffer corresponding to 'bufHdr' + * + * Assumes that the BufMgrLock has NOT been acquired. + */ +static int +BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld) +{ + Relation reln; + Oid bufdb, bufrel; + int status; + + if (!bufferLockHeld) + SpinAcquire(BufMgrLock); + + /* + * first try to find the reldesc in the cache, if no luck, + * don't bother to build the reldesc from scratch, just do + * a blind write. + */ + + bufdb = bufHdr->tag.relId.dbId; + bufrel = bufHdr->tag.relId.relId; + + if (bufdb == MyDatabaseId || bufdb == (Oid) NULL) + reln = RelationIdCacheGetRelation(bufrel); + else + reln = (Relation) NULL; + + SpinRelease(BufMgrLock); + + if (reln != (Relation) NULL) { + status = smgrflush(bufHdr->bufsmgr, reln, bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + } else { + + /* blind write always flushes */ + status = smgrblindwrt(bufHdr->bufsmgr, bufHdr->sb_dbname, + bufHdr->sb_relname, bufdb, bufrel, + bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + } + + if (status == SM_FAIL) + return (FALSE); + + return (TRUE); +} + +/* + * RelationGetNumberOfBlocks -- + * Returns the buffer descriptor associated with a page in a relation. + * + * Note: + * XXX may fail for huge relations. + * XXX should be elsewhere. + * XXX maybe should be hidden + */ +BlockNumber +RelationGetNumberOfBlocks(Relation relation) +{ + return + ((relation->rd_islocal) ? relation->rd_nblocks : + smgrnblocks(relation->rd_rel->relsmgr, relation)); +} + +/* + * BufferGetBlock -- + * Returns a reference to a disk page image associated with a buffer. + * + * Note: + * Assumes buffer is valid. 
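A small illustrative sketch (not from the original file) of the typical pin/access/unpin cycle using the routines defined here; 'reln' and 'blockNum' are assumed to be in scope:

    /* Illustrative only. */
    Buffer buf = ReadBuffer(reln, blockNum);

    if (BufferIsValid(buf)) {
        Block page = BufferGetBlock(buf);   /* pointer into the buffer pool */

        /* ... modify the page image through 'page' ... */

        WriteBuffer(buf);   /* marks the buffer dirty and drops the pin;
                             * ReleaseBuffer() would unpin without dirtying */
    }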
+ */ +Block +BufferGetBlock(Buffer buffer) +{ + Assert(BufferIsValid(buffer)); + + if (BufferIsLocal(buffer)) + return((Block)MAKE_PTR(LocalBufferDescriptors[-buffer-1].data)); + else + return((Block)MAKE_PTR(BufferDescriptors[buffer-1].data)); +} + +/* --------------------------------------------------------------------- + * ReleaseTmpRelBuffers + * + * this function unmarks all the dirty pages of a temporary + * relation in the buffer pool so that at the end of transaction + * these pages will not be flushed. + * XXX currently it sequentially searches the buffer pool, should be + * changed to more clever ways of searching. + * -------------------------------------------------------------------- + */ +void +ReleaseTmpRelBuffers(Relation tempreldesc) +{ + register int i; + int holding = 0; + BufferDesc *buf; + + for (i=1; i<=NBuffers; i++) { + buf = &BufferDescriptors[i-1]; + if (!holding) { + SpinAcquire(BufMgrLock); + holding = 1; + } + if ((buf->flags & BM_DIRTY) && + (buf->tag.relId.dbId == MyDatabaseId) && + (buf->tag.relId.relId == tempreldesc->rd_id)) { + buf->flags &= ~BM_DIRTY; + if (!(buf->flags & BM_FREE)) { + SpinRelease(BufMgrLock); + holding = 0; + ReleaseBuffer(i); + } + } + } + if (holding) + SpinRelease(BufMgrLock); +} + +/* --------------------------------------------------------------------- + * DropBuffers + * + * This function marks all the buffers in the buffer cache for a + * particular database as clean. This is used when we destroy a + * database, to avoid trying to flush data to disk when the directory + * tree no longer exists. + * + * This is an exceedingly non-public interface. + * -------------------------------------------------------------------- + */ +void +DropBuffers(Oid dbid) +{ + register int i; + BufferDesc *buf; + + SpinAcquire(BufMgrLock); + for (i=1; i<=NBuffers; i++) { + buf = &BufferDescriptors[i-1]; + if ((buf->tag.relId.dbId == dbid) && (buf->flags & BM_DIRTY)) { + buf->flags &= ~BM_DIRTY; + } + } + SpinRelease(BufMgrLock); +} + +/* ----------------------------------------------------------------- + * PrintBufferDescs + * + * this function prints all the buffer descriptors, for debugging + * use only. 
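A note on the indexing convention these routines rely on: shared buffers are numbered 1..NBuffers and index BufferDescriptors[buffer-1], while local buffers are negative and index LocalBufferDescriptors[-buffer-1]. A small helper written out for illustration only (not part of the original source), following the same convention:

    static BufferDesc *
    buffer_to_descriptor(Buffer buffer)
    {
        if (BufferIsLocal(buffer))
            return &LocalBufferDescriptors[-buffer - 1];  /* buffer < 0        */
        return &BufferDescriptors[buffer - 1];            /* 1 .. NBuffers     */
    }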
+ * ----------------------------------------------------------------- + */ +void +PrintBufferDescs() +{ + int i; + BufferDesc *buf = BufferDescriptors; + + if (IsUnderPostmaster) { + SpinAcquire(BufMgrLock); + for (i = 0; i < NBuffers; ++i, ++buf) { + elog(NOTICE, "[%02d] (freeNext=%d, freePrev=%d, relname=%.*s, \ +blockNum=%d, flags=0x%x, refcount=%d %d)", + i, buf->freeNext, buf->freePrev, NAMEDATALEN, + &(buf->sb_relname), buf->tag.blockNum, buf->flags, + buf->refcount, PrivateRefCount[i]); + } + SpinRelease(BufMgrLock); + } else { + /* interactive backend */ + for (i = 0; i < NBuffers; ++i, ++buf) { + printf("[%-2d] (%s, %d) flags=0x%x, refcnt=%d %ld)\n", + i, buf->sb_relname, buf->tag.blockNum, + buf->flags, buf->refcount, PrivateRefCount[i]); + } + } +} + +void +PrintPinnedBufs() +{ + int i; + BufferDesc *buf = BufferDescriptors; + + SpinAcquire(BufMgrLock); + for (i = 0; i < NBuffers; ++i, ++buf) { + if (PrivateRefCount[i] > 0) + elog(NOTICE, "[%02d] (freeNext=%d, freePrev=%d, relname=%.*s, \ +blockNum=%d, flags=0x%x, refcount=%d %d)\n", + i, buf->freeNext, buf->freePrev, NAMEDATALEN, &(buf->sb_relname), + buf->tag.blockNum, buf->flags, + buf->refcount, PrivateRefCount[i]); + } + SpinRelease(BufMgrLock); +} + +/* + * BufferPoolBlowaway + * + * this routine is solely for the purpose of experiments -- sometimes + * you may want to blowaway whatever is left from the past in buffer + * pool and start measuring some performance with a clean empty buffer + * pool. + */ +void +BufferPoolBlowaway() +{ + register int i; + + BufferSync(); + for (i=1; i<=NBuffers; i++) { + if (BufferIsValid(i)) { + while(BufferIsValid(i)) + ReleaseBuffer(i); + } + BufTableDelete(&BufferDescriptors[i-1]); + } +} + +#undef IncrBufferRefCount +#undef ReleaseBuffer + +void +IncrBufferRefCount(Buffer buffer) +{ + if (BufferIsLocal(buffer)) { + Assert(LocalRefCount[-buffer - 1] >= 0); + LocalRefCount[-buffer - 1]++; + } else { + Assert(!BAD_BUFFER_ID(buffer)); + Assert(PrivateRefCount[buffer - 1] >= 0); + PrivateRefCount[buffer - 1]++; + } +} + +/* + * ReleaseBuffer -- remove the pin on a buffer without + * marking it dirty. 
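The pin bookkeeping is two-level: PrivateRefCount[] counts this backend's pins, while the shared refcount in the descriptor counts backends holding at least one pin. A hedged sketch of the net effect of a pin/release pair on those counters (LastRefCount and spinlocking omitted for clarity):

    IncrBufferRefCount(buffer);  /* PrivateRefCount[buffer-1]++ (backend-local) */

    ReleaseBuffer(buffer);       /* PrivateRefCount[buffer-1]--; when it reaches
                                  * zero the shared bufHdr->refcount is dropped,
                                  * and at shared refcount zero the buffer goes
                                  * back on the freelist with BM_FREE set.
                                  */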
+ * + */ +int +ReleaseBuffer(Buffer buffer) +{ + BufferDesc *bufHdr; + + if (BufferIsLocal(buffer)) { + Assert(LocalRefCount[-buffer - 1] > 0); + LocalRefCount[-buffer - 1]--; + return (STATUS_OK); + } + + if (BAD_BUFFER_ID(buffer)) + return(STATUS_ERROR); + + bufHdr = &BufferDescriptors[buffer-1]; + + Assert(PrivateRefCount[buffer - 1] > 0); + PrivateRefCount[buffer - 1]--; + if (PrivateRefCount[buffer - 1] == 0 && LastRefCount[buffer - 1] == 0) { + /* only release buffer if it is not pinned in previous ExecMain + levels */ + SpinAcquire(BufMgrLock); + bufHdr->refcount--; + if (bufHdr->refcount == 0) { + AddBufferToFreelist(bufHdr); + bufHdr->flags |= BM_FREE; + } + SpinRelease(BufMgrLock); + } + + return(STATUS_OK); +} + +void +IncrBufferRefCount_Debug(char *file, int line, Buffer buffer) +{ + IncrBufferRefCount(buffer); + if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer)) { + BufferDesc *buf = &BufferDescriptors[buffer-1]; + + fprintf(stderr, "PIN(Incr) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + buffer, buf->sb_relname, buf->tag.blockNum, + PrivateRefCount[buffer - 1], file, line); + } +} + +void +ReleaseBuffer_Debug(char *file, int line, Buffer buffer) +{ + ReleaseBuffer(buffer); + if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer)) { + BufferDesc *buf = &BufferDescriptors[buffer-1]; + + fprintf(stderr, "UNPIN(Rel) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + buffer, buf->sb_relname, buf->tag.blockNum, + PrivateRefCount[buffer - 1], file, line); + } +} + +int +ReleaseAndReadBuffer_Debug(char *file, + int line, + Buffer buffer, + Relation relation, + BlockNumber blockNum) +{ + bool bufferValid; + Buffer b; + + bufferValid = BufferIsValid(buffer); + b = ReleaseAndReadBuffer(buffer, relation, blockNum); + if (ShowPinTrace && bufferValid && BufferIsLocal(buffer) + && is_userbuffer(buffer)) { + BufferDesc *buf = &BufferDescriptors[buffer-1]; + + fprintf(stderr, "UNPIN(Rel&Rd) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + buffer, buf->sb_relname, buf->tag.blockNum, + PrivateRefCount[buffer - 1], file, line); + } + if (ShowPinTrace && BufferIsLocal(buffer) && is_userbuffer(buffer)) { + BufferDesc *buf = &BufferDescriptors[b-1]; + + fprintf(stderr, "PIN(Rel&Rd) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + b, buf->sb_relname, buf->tag.blockNum, + PrivateRefCount[b - 1], file, line); + } + return b; +} + +#ifdef BMTRACE + +/* + * trace allocations and deallocations in a circular buffer in + * shared memory. check the buffer before doing the allocation, + * and die if there's anything fishy. 
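The trace records live in a fixed-size circular array of BMT_LIMIT entries in shared memory, with *CurTraceBuf naming the next write slot. A short sketch of the wraparound arithmetic the tracing code uses (illustrative only):

    long start  = *CurTraceBuf;                 /* slot the next record will use */
    long newest = (start > 0) ? start - 1       /* slot written most recently    */
                              : BMT_LIMIT - 1;  /* wraps around at the bottom    */
    /* after a record is written: *CurTraceBuf = (start + 1) % BMT_LIMIT; */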
+ */ + +_bm_trace(Oid dbId, Oid relId, int blkNo, int bufNo, int allocType) +{ + static int mypid = 0; + long start, cur; + bmtrace *tb; + + if (mypid == 0) + mypid = getpid(); + + start = *CurTraceBuf; + + if (start > 0) + cur = start - 1; + else + cur = BMT_LIMIT - 1; + + for (;;) { + tb = &TraceBuf[cur]; + if (tb->bmt_op != BMT_NOTUSED) { + if (tb->bmt_buf == bufNo) { + if ((tb->bmt_op == BMT_DEALLOC) + || (tb->bmt_dbid == dbId && tb->bmt_relid == relId + && tb->bmt_blkno == blkNo)) + goto okay; + + /* die holding the buffer lock */ + _bm_die(dbId, relId, blkNo, bufNo, allocType, start, cur); + } + } + + if (cur == start) + goto okay; + + if (cur == 0) + cur = BMT_LIMIT - 1; + else + cur--; + } + + okay: + tb = &TraceBuf[start]; + tb->bmt_pid = mypid; + tb->bmt_buf = bufNo; + tb->bmt_dbid = dbId; + tb->bmt_relid = relId; + tb->bmt_blkno = blkNo; + tb->bmt_op = allocType; + + *CurTraceBuf = (start + 1) % BMT_LIMIT; +} + +_bm_die(Oid dbId, Oid relId, int blkNo, int bufNo, + int allocType, long start, long cur) +{ + FILE *fp; + bmtrace *tb; + int i; + + tb = &TraceBuf[cur]; + + if ((fp = fopen("/tmp/death_notice", "w")) == (FILE *) NULL) + elog(FATAL, "buffer alloc trace error and can't open log file"); + + fprintf(fp, "buffer alloc trace detected the following error:\n\n"); + fprintf(fp, " buffer %d being %s inconsistently with a previous %s\n\n", + bufNo, (allocType == BMT_DEALLOC ? "deallocated" : "allocated"), + (tb->bmt_op == BMT_DEALLOC ? "deallocation" : "allocation")); + + fprintf(fp, "the trace buffer contains:\n"); + + i = start; + for (;;) { + tb = &TraceBuf[i]; + if (tb->bmt_op != BMT_NOTUSED) { + fprintf(fp, " [%3d]%spid %d buf %2d for <%d,%d,%d> ", + i, (i == cur ? " ---> " : "\t"), + tb->bmt_pid, tb->bmt_buf, + tb->bmt_dbid, tb->bmt_relid, tb->bmt_blkno); + + switch (tb->bmt_op) { + case BMT_ALLOCFND: + fprintf(fp, "allocate (found)\n"); + break; + + case BMT_ALLOCNOTFND: + fprintf(fp, "allocate (not found)\n"); + break; + + case BMT_DEALLOC: + fprintf(fp, "deallocate\n"); + break; + + default: + fprintf(fp, "unknown op type %d\n", tb->bmt_op); + break; + } + } + + i = (i + 1) % BMT_LIMIT; + if (i == start) + break; + } + + fprintf(fp, "\noperation causing error:\n"); + fprintf(fp, "\tpid %d buf %d for <%d,%d,%d> ", + getpid(), bufNo, dbId, relId, blkNo); + + switch (allocType) { + case BMT_ALLOCFND: + fprintf(fp, "allocate (found)\n"); + break; + + case BMT_ALLOCNOTFND: + fprintf(fp, "allocate (not found)\n"); + break; + + case BMT_DEALLOC: + fprintf(fp, "deallocate\n"); + break; + + default: + fprintf(fp, "unknown op type %d\n", allocType); + break; + } + + (void) fclose(fp); + + kill(getpid(), SIGILL); +} + +#endif /* BMTRACE */ + +void +BufferRefCountReset(int *refcountsave) +{ + int i; + for (i=0; i<NBuffers; i++) { + refcountsave[i] = PrivateRefCount[i]; + LastRefCount[i] += PrivateRefCount[i]; + PrivateRefCount[i] = 0; + } +} + +void +BufferRefCountRestore(int *refcountsave) +{ + int i; + for (i=0; i<NBuffers; i++) { + PrivateRefCount[i] = refcountsave[i]; + LastRefCount[i] -= refcountsave[i]; + refcountsave[i] = 0; + } +} + diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c new file mode 100644 index 00000000000..fabc3c29829 --- /dev/null +++ b/src/backend/storage/buffer/freelist.c @@ -0,0 +1,285 @@ +/*------------------------------------------------------------------------- + * + * freelist.c-- + * routines for manipulating the buffer pool's replacement strategy + * freelist. 
+ * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/buffer/freelist.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +/* + * OLD COMMENTS + * + * Data Structures: + * SharedFreeList is a circular queue. Notice that this + * is a shared memory queue so the next/prev "ptrs" are + * buffer ids, not addresses. + * + * Sync: all routines in this file assume that the buffer + * semaphore has been acquired by the caller. + */ +#include <stdio.h> +#include "storage/bufmgr.h" +#include "storage/buf_internals.h" /* where declarations go */ +#include "storage/spin.h" +#include "utils/elog.h" + + +static BufferDesc *SharedFreeList; + +/* only actually used in debugging. The lock + * should be acquired before calling the freelist manager. + */ +extern SPINLOCK BufMgrLock; + +#define IsInQueue(bf) \ + Assert((bf->freeNext != INVALID_DESCRIPTOR));\ + Assert((bf->freePrev != INVALID_DESCRIPTOR));\ + Assert((bf->flags & BM_FREE)) + +#define NotInQueue(bf) \ + Assert((bf->freeNext == INVALID_DESCRIPTOR));\ + Assert((bf->freePrev == INVALID_DESCRIPTOR));\ + Assert(! (bf->flags & BM_FREE)) + + +/* + * AddBufferToFreelist -- + * + * In theory, this is the only routine that needs to be changed + * if the buffer replacement strategy changes. Just change + * the manner in which buffers are added to the freelist queue. + * Currently, they are added on an LRU basis. + */ +void +AddBufferToFreelist(BufferDesc *bf) +{ +#ifdef BMTRACE + _bm_trace(bf->tag.relId.dbId, bf->tag.relId.relId, bf->tag.blockNum, + BufferDescriptorGetBuffer(bf), BMT_DEALLOC); +#endif /* BMTRACE */ + NotInQueue(bf); + + /* change bf so it points to inFrontOfNew and its successor */ + bf->freePrev = SharedFreeList->freePrev; + bf->freeNext = Free_List_Descriptor; + + /* insert new into chain */ + BufferDescriptors[bf->freeNext].freePrev = bf->buf_id; + BufferDescriptors[bf->freePrev].freeNext = bf->buf_id; +} + +#undef PinBuffer + +/* + * PinBuffer -- make buffer unavailable for replacement. + */ +void +PinBuffer(BufferDesc *buf) +{ + long b; + + /* Assert (buf->refcount < 25); */ + + if (buf->refcount == 0) { + IsInQueue(buf); + + /* remove from freelist queue */ + BufferDescriptors[buf->freeNext].freePrev = buf->freePrev; + BufferDescriptors[buf->freePrev].freeNext = buf->freeNext; + buf->freeNext = buf->freePrev = INVALID_DESCRIPTOR; + + /* mark buffer as no longer free */ + buf->flags &= ~BM_FREE; + } else { + NotInQueue(buf); + } + + b = BufferDescriptorGetBuffer(buf) - 1; + Assert(PrivateRefCount[b] >= 0); + if (PrivateRefCount[b] == 0 && LastRefCount[b] == 0) + buf->refcount++; + PrivateRefCount[b]++; +} + +void +PinBuffer_Debug(char *file, int line, BufferDesc *buf) +{ + PinBuffer(buf); + if (ShowPinTrace) { + Buffer buffer = BufferDescriptorGetBuffer(buf); + + fprintf(stderr, "PIN(Pin) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + buffer, buf->sb_relname, buf->tag.blockNum, + PrivateRefCount[buffer - 1], file, line); + } +} + +#undef UnpinBuffer + +/* + * UnpinBuffer -- make buffer available for replacement. 
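Because the freelist lives in shared memory, its links are buffer ids (indices into BufferDescriptors), not pointers. A sketch of the unlink step, written out for a descriptor `bf` already known to be on the list (this mirrors the patching done in PinBuffer and GetFreeBuffer; illustrative only):

    BufferDescriptors[bf->freeNext].freePrev = bf->freePrev;  /* successor skips bf   */
    BufferDescriptors[bf->freePrev].freeNext = bf->freeNext;  /* predecessor skips bf */
    bf->freeNext = bf->freePrev = INVALID_DESCRIPTOR;         /* bf is now off-list   */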
+ */ +void +UnpinBuffer(BufferDesc *buf) +{ + long b = BufferDescriptorGetBuffer(buf) - 1; + + Assert(buf->refcount); + Assert(PrivateRefCount[b] > 0); + PrivateRefCount[b]--; + if (PrivateRefCount[b] == 0 && LastRefCount[b] == 0) + buf->refcount--; + NotInQueue(buf); + + if (buf->refcount == 0) { + AddBufferToFreelist(buf); + buf->flags |= BM_FREE; + } else { + /* do nothing */ + } +} + +void +UnpinBuffer_Debug(char *file, int line, BufferDesc *buf) +{ + UnpinBuffer(buf); + if (ShowPinTrace) { + Buffer buffer = BufferDescriptorGetBuffer(buf); + + fprintf(stderr, "UNPIN(Unpin) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + buffer, buf->sb_relname, buf->tag.blockNum, + PrivateRefCount[buffer - 1], file, line); + } +} + +/* + * GetFreeBuffer() -- get the 'next' buffer from the freelist. + * + */ +BufferDesc * +GetFreeBuffer() +{ + BufferDesc *buf; + + if (Free_List_Descriptor == SharedFreeList->freeNext) { + + /* queue is empty. All buffers in the buffer pool are pinned. */ + elog(WARN,"out of free buffers: time to abort !\n"); + return(NULL); + } + buf = &(BufferDescriptors[SharedFreeList->freeNext]); + + /* remove from freelist queue */ + BufferDescriptors[buf->freeNext].freePrev = buf->freePrev; + BufferDescriptors[buf->freePrev].freeNext = buf->freeNext; + buf->freeNext = buf->freePrev = INVALID_DESCRIPTOR; + + buf->flags &= ~(BM_FREE); + + return(buf); +} + +/* + * InitFreeList -- initialize the dummy buffer descriptor used + * as a freelist head. + * + * Assume: All of the buffers are already linked in a circular + * queue. Only called by postmaster and only during + * initialization. + */ +void +InitFreeList(bool init) +{ + SharedFreeList = &(BufferDescriptors[Free_List_Descriptor]); + + if (init) { + /* we only do this once, normally the postmaster */ + SharedFreeList->data = INVALID_OFFSET; + SharedFreeList->flags = 0; + SharedFreeList->flags &= ~(BM_VALID | BM_DELETED | BM_FREE); + SharedFreeList->buf_id = Free_List_Descriptor; + + /* insert it into a random spot in the circular queue */ + SharedFreeList->freeNext = BufferDescriptors[0].freeNext; + SharedFreeList->freePrev = 0; + BufferDescriptors[SharedFreeList->freeNext].freePrev = + BufferDescriptors[SharedFreeList->freePrev].freeNext = + Free_List_Descriptor; + } +} + + +/* + * print out the free list and check for breaks. + */ +void +DBG_FreeListCheck(int nfree) +{ + int i; + BufferDesc *buf; + + buf = &(BufferDescriptors[SharedFreeList->freeNext]); + for (i=0;i<nfree;i++,buf = &(BufferDescriptors[buf->freeNext])) { + + if (! 
(buf->flags & (BM_FREE))){ + if (buf != SharedFreeList) { + printf("\tfree list corrupted: %d flags %x\n", + buf->buf_id,buf->flags); + } else { + printf("\tfree list corrupted: too short -- %d not %d\n", + i,nfree); + + } + + + } + if ((BufferDescriptors[buf->freeNext].freePrev != buf->buf_id) || + (BufferDescriptors[buf->freePrev].freeNext != buf->buf_id)) { + printf("\tfree list links corrupted: %d %ld %ld\n", + buf->buf_id,buf->freePrev,buf->freeNext); + } + + } + if (buf != SharedFreeList) { + printf("\tfree list corrupted: %d-th buffer is %d\n", + nfree,buf->buf_id); + + } +} + +/* + * PrintBufferFreeList - + * prints the buffer free list, for debugging + */ +void +PrintBufferFreeList() +{ + BufferDesc *buf; + + if (SharedFreeList->freeNext == Free_List_Descriptor) { + printf("free list is empty.\n"); + return; + } + + buf = &(BufferDescriptors[SharedFreeList->freeNext]); + for (;;) { + int i = (buf - BufferDescriptors); + printf("[%-2d] (%s, %d) flags=0x%x, refcnt=%d %ld, nxt=%ld prv=%ld)\n", + i, buf->sb_relname, buf->tag.blockNum, + buf->flags, buf->refcount, PrivateRefCount[i], + buf->freeNext, buf->freePrev); + + if (buf->freeNext == Free_List_Descriptor) + break; + + buf = &(BufferDescriptors[buf->freeNext]); + } +} diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c new file mode 100644 index 00000000000..ec625940867 --- /dev/null +++ b/src/backend/storage/buffer/localbuf.c @@ -0,0 +1,284 @@ +/*------------------------------------------------------------------------- + * + * localbuf.c-- + * local buffer manager. Fast buffer manager for temporary tables + * or special cases when the operation is not visible to other backends. + * + * When a relation is being created, the descriptor will have rd_islocal + * set to indicate that the local buffer manager should be used. During + * the same transaction the relation is being created, any inserts or + * selects from the newly created relation will use the local buffer + * pool. rd_islocal is reset at the end of a transaction (commit/abort). + * This is useful for queries like SELECT INTO TABLE and create index. + * + * Copyright (c) 1994-5, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/buffer/localbuf.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include <sys/file.h> +#include <stdio.h> +#include <math.h> +#include <signal.h> + +/* declarations split between these three files */ +#include "storage/buf.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" + +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/shmem.h" +#include "storage/spin.h" +#include "storage/smgr.h" +#include "storage/lmgr.h" +#include "miscadmin.h" +#include "utils/builtins.h" +#include "utils/hsearch.h" +#include "utils/elog.h" +#include "utils/memutils.h" +#include "executor/execdebug.h" /* for NDirectFileRead */ +#include "catalog/catalog.h" + +int NLocBuffer = 64; +BufferDesc *LocalBufferDescriptors = NULL; +long *LocalRefCount = NULL; + +static int nextFreeLocalBuf = 0; + +/*#define LBDEBUG*/ + +/* + * LocalBufferAlloc - + * allocate a local buffer. We do round robin allocation for now. 
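As the file header explains, callers reach this allocator only when the relation descriptor says so. A hedged sketch of the dispatch; BufferAlloc is used as a stand-in name for the shared-pool allocator in bufmgr.c, which is not part of this hunk:

    /* Illustrative only: routing between the shared and local buffer pools. */
    BufferDesc *bufHdr;
    bool        found;

    if (reln->rd_islocal)
        bufHdr = LocalBufferAlloc(reln, blockNum, &found);   /* private pool */
    else
        bufHdr = BufferAlloc(reln, blockNum, &found);        /* shared pool (name assumed) */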
+ */ +BufferDesc * +LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr) +{ + int i; + BufferDesc *bufHdr = (BufferDesc *) NULL; + + if (blockNum == P_NEW) { + blockNum = reln->rd_nblocks; + reln->rd_nblocks++; + } + + /* a low tech search for now -- not optimized for scans */ + for (i=0; i < NLocBuffer; i++) { + if (LocalBufferDescriptors[i].tag.relId.relId == reln->rd_id && + LocalBufferDescriptors[i].tag.blockNum == blockNum) { + +#ifdef LBDEBUG + fprintf(stderr, "LB ALLOC (%d,%d) %d\n", + reln->rd_id, blockNum, -i-1); +#endif + LocalRefCount[i]++; + *foundPtr = TRUE; + return &LocalBufferDescriptors[i]; + } + } + +#ifdef LBDEBUG + fprintf(stderr, "LB ALLOC (%d,%d) %d\n", + reln->rd_id, blockNum, -nextFreeLocalBuf-1); +#endif + + /* need to get a new buffer (round robin for now) */ + for(i=0; i < NLocBuffer; i++) { + int b = (nextFreeLocalBuf + i) % NLocBuffer; + + if (LocalRefCount[b]==0) { + bufHdr = &LocalBufferDescriptors[b]; + LocalRefCount[b]++; + nextFreeLocalBuf = (b + 1) % NLocBuffer; + break; + } + } + if (bufHdr==NULL) + elog(WARN, "no empty local buffer."); + + /* + * this buffer is not referenced but it might still be dirty (the + * last transaction to touch it doesn't need its contents but has + * not flushed it). if that's the case, write it out before + * reusing it! + */ + if (bufHdr->flags & BM_DIRTY) { + Relation bufrel = RelationIdCacheGetRelation(bufHdr->tag.relId.relId); + + Assert(bufrel != NULL); + + /* flush this page */ + smgrwrite(bufrel->rd_rel->relsmgr, bufrel, bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + } + + /* + * it's all ours now. + */ + bufHdr->tag.relId.relId = reln->rd_id; + bufHdr->tag.blockNum = blockNum; + bufHdr->flags &= ~BM_DIRTY; + + /* + * lazy memory allocation. (see MAKE_PTR for why we need to do + * MAKE_OFFSET.) + */ + if (bufHdr->data == (SHMEM_OFFSET)0) { + char *data = (char *)malloc(BLCKSZ); + + bufHdr->data = MAKE_OFFSET(data); + } + + *foundPtr = FALSE; + return bufHdr; +} + +/* + * WriteLocalBuffer - + * writes out a local buffer + */ +int +WriteLocalBuffer(Buffer buffer, bool release) +{ + int bufid; + + Assert(BufferIsLocal(buffer)); + +#ifdef LBDEBUG + fprintf(stderr, "LB WRITE %d\n", buffer); +#endif + + bufid = - (buffer + 1); + LocalBufferDescriptors[bufid].flags |= BM_DIRTY; + + if (release) { + Assert(LocalRefCount[bufid] > 0); + LocalRefCount[bufid]--; + } + + return true; +} + +/* + * FlushLocalBuffer - + * flushes a local buffer + */ +int +FlushLocalBuffer(Buffer buffer) +{ + int bufid; + Relation bufrel; + BufferDesc *bufHdr; + + Assert(BufferIsLocal(buffer)); + +#ifdef LBDEBUG + fprintf(stderr, "LB FLUSH %d\n", buffer); +#endif + + bufid = - (buffer + 1); + bufHdr = &LocalBufferDescriptors[bufid]; + bufHdr->flags &= ~BM_DIRTY; + bufrel = RelationIdCacheGetRelation(bufHdr->tag.relId.relId); + + Assert(bufrel != NULL); + smgrflush(bufrel->rd_rel->relsmgr, bufrel, bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + + Assert(LocalRefCount[bufid] > 0); + LocalRefCount[bufid]--; + + return true; +} + +/* + * InitLocalBuffer - + * init the local buffer cache. Since most queries (esp. multi-user ones) + * don't involve local buffers, we delay allocating memory for actual the + * buffer until we need it. + */ +void +InitLocalBuffer() +{ + int i; + + /* + * these aren't going away. I'm not gonna use palloc. 
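WriteLocalBuffer and FlushLocalBuffer recover the descriptor slot from the (negative) Buffer value with bufid = -(buffer + 1). A short numeric sketch of that mapping (illustrative only):

    Buffer buffer = -4;             /* some local buffer handed to a caller      */
    int    bufid  = -(buffer + 1);  /* == 3: slot in LocalBufferDescriptors      */
    /* InitLocalBuffer sets buf_id = -bufid - 2 == -5, and
     * BufferDescriptorGetBuffer() adds 1, giving back the Buffer value -4.
     */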
+ */ + LocalBufferDescriptors = + (BufferDesc *)malloc(sizeof(BufferDesc) * NLocBuffer); + memset(LocalBufferDescriptors, 0, sizeof(BufferDesc) * NLocBuffer); + nextFreeLocalBuf = 0; + + for (i = 0; i < NLocBuffer; i++) { + BufferDesc *buf = &LocalBufferDescriptors[i]; + + /* + * negative to indicate local buffer. This is tricky: shared buffers + * start with 0. We have to start with -2. (Note that the routine + * BufferDescriptorGetBuffer adds 1 to buf_id so our first buffer id + * is -1.) + */ + buf->buf_id = - i - 2; + } + + LocalRefCount = + (long *)malloc(sizeof(long) * NLocBuffer); + memset(LocalRefCount, 0, sizeof(long) * NLocBuffer); +} + +/* + * LocalBufferSync - + * flush all dirty buffers in the local buffer cache. Since the buffer + * cache is only used for keeping relations visible during a transaction, + * we will not need these buffers again. + */ +void +LocalBufferSync() +{ + int i; + + for (i = 0; i < NLocBuffer; i++) { + BufferDesc *buf = &LocalBufferDescriptors[i]; + Relation bufrel; + + if (buf->flags & BM_DIRTY) { +#ifdef LBDEBUG + fprintf(stderr, "LB SYNC %d\n", -i-1); +#endif + bufrel = RelationIdCacheGetRelation(buf->tag.relId.relId); + + Assert(bufrel != NULL); + + smgrwrite(bufrel->rd_rel->relsmgr, bufrel, buf->tag.blockNum, + (char *) MAKE_PTR(buf->data)); + + buf->tag.relId.relId = InvalidOid; + buf->flags &= ~BM_DIRTY; + } + } + + memset(LocalRefCount, 0, sizeof(long) * NLocBuffer); +} + +void +ResetLocalBufferPool() +{ + int i; + + memset(LocalBufferDescriptors, 0, sizeof(BufferDesc) * NLocBuffer); + nextFreeLocalBuf = 0; + + for (i = 0; i < NLocBuffer; i++) { + BufferDesc *buf = &LocalBufferDescriptors[i]; + + /* just like InitLocalBuffer() */ + buf->buf_id = - i - 2; + } + + memset(LocalRefCount, 0, sizeof(long) * NLocBuffer); +} diff --git a/src/backend/storage/bufmgr.h b/src/backend/storage/bufmgr.h new file mode 100644 index 00000000000..581d3237cad --- /dev/null +++ b/src/backend/storage/bufmgr.h @@ -0,0 +1,112 @@ +/*------------------------------------------------------------------------- + * + * bufmgr.h-- + * POSTGRES buffer manager definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: bufmgr.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef BUFMGR_H +#define BUFMGR_H + +#include "c.h" + +#include "machine.h" /* for BLCKSZ */ +#include "utils/rel.h" + +#include "storage/buf_internals.h" /* UGLY! -- ay */ + +/* + * the maximum size of a disk block for any possible installation. + * + * in theory this could be anything, but in practice this is actually + * limited to 2^13 bytes because we have limited ItemIdData.lp_off and + * ItemIdData.lp_len to 13 bits (see itemid.h). + */ +#define MAXBLCKSZ 8192 + +typedef void *Block; + + +/* special pageno for bget */ +#define P_NEW InvalidBlockNumber /* grow the file to get a new page */ + +typedef bits16 BufferLock; + +/********************************************************************** + + the rest is function defns in the bufmgr that are externally callable + + **********************************************************************/ + +/* + * These routines are beaten on quite heavily, hence the macroization. + * See buf_internals.h for a related comment. 
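P_NEW (== InvalidBlockNumber) is a sentinel block number meaning "extend the relation": passing it to ReadBuffer allocates a fresh page at the end of the file (LocalBufferAlloc turns it into reln->rd_nblocks). A sketch of the typical use, illustrative only:

    Buffer buf  = ReadBuffer(reln, P_NEW);      /* pins a brand-new block        */
    Page   page = (Page) BufferGetBlock(buf);

    PageInit(page, BufferGetPageSize(buf), 0);  /* format it as a slotted page   */
    WriteBuffer(buf);                           /* mark dirty, release the pin   */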
+ */ +#define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1) + +/* + * BufferIsPinned -- + * True iff the buffer is pinned (and therefore valid) + * + * Note: + * Smenatics are identical to BufferIsValid + * XXX - need to remove either one eventually. + */ +#define BufferIsPinned BufferIsValid + + +extern int ShowPinTrace; + +/* + * prototypes for functions in bufmgr.c + */ +extern Buffer RelationGetBufferWithBuffer(Relation relation, + BlockNumber blockNumber, Buffer buffer); +extern Buffer ReadBuffer(Relation reln, BlockNumber blockNum); +extern Buffer ReadBuffer_Debug(char *file, int line, Relation reln, + BlockNumber blockNum); +extern int WriteBuffer(Buffer buffer); +extern void WriteBuffer_Debug(char *file, int line, Buffer buffer); +extern void DirtyBufferCopy(Oid dbid, Oid relid, BlockNumber blkno, + char *dest); +extern int WriteNoReleaseBuffer(Buffer buffer); +extern Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation, + BlockNumber blockNum); + +extern void InitBufferPool(IPCKey key); +extern void PrintBufferUsage(FILE *statfp); +extern void ResetBufferUsage(void); +extern void ResetBufferPool(void); +extern int BufferPoolCheckLeak(void); +extern void FlushBufferPool(int StableMainMemoryFlag); +extern bool BufferIsValid(Buffer bufnum); +extern BlockNumber BufferGetBlockNumber(Buffer buffer); +extern Relation BufferGetRelation(Buffer buffer); +extern BlockNumber RelationGetNumberOfBlocks(Relation relation); +extern Block BufferGetBlock(Buffer buffer); +extern void ReleaseTmpRelBuffers(Relation tempreldesc); +extern void DropBuffers(Oid dbid); +extern void PrintBufferDescs(void); +extern void PrintPinnedBufs(void); +extern int BufferShmemSize(void); +extern void BufferPoolBlowaway(void); +extern void IncrBufferRefCount(Buffer buffer); +extern int ReleaseBuffer(Buffer buffer); + +extern void IncrBufferRefCount_Debug(char *file, int line, Buffer buffer); +extern void ReleaseBuffer_Debug(char *file, int line, Buffer buffer); +extern int ReleaseAndReadBuffer_Debug(char *file, + int line, + Buffer buffer, + Relation relation, + BlockNumber blockNum); +extern void BufferRefCountReset(int *refcountsave); +extern void BufferRefCountRestore(int *refcountsave); + +#endif /* !defined(BufMgrIncluded) */ + diff --git a/src/backend/storage/bufpage.h b/src/backend/storage/bufpage.h new file mode 100644 index 00000000000..9fda973889d --- /dev/null +++ b/src/backend/storage/bufpage.h @@ -0,0 +1,256 @@ +/*------------------------------------------------------------------------- + * + * bufpage.h-- + * Standard POSTGRES buffer page definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: bufpage.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef BUFPAGE_H +#define BUFPAGE_H + +#include "c.h" +#include "machine.h" /* for BLCKSZ */ + +#include "storage/buf.h" +#include "storage/item.h" +#include "storage/itemid.h" +#include "storage/itemptr.h" + +/* + * a postgres disk page is an abstraction layered on top of a postgres + * disk block (which is simply a unit of i/o, see block.h). + * + * specifically, while a disk block can be unformatted, a postgres + * disk page is always a slotted page of the form: + * + * +----------------+---------------------------------+ + * | PageHeaderData | linp0 linp1 linp2 ... | + * +-----------+----+---------------------------------+ + * | ... 
linpN | | + * +-----------+--------------------------------------+ + * | ^ pd_lower | + * | | + * | v pd_upper | + * +-------------+------------------------------------+ + * | | tupleN ... | + * +-------------+------------------+-----------------+ + * | ... tuple2 tuple1 tuple0 | "special space" | + * +--------------------------------+-----------------+ + * ^ pd_special + * + * a page is full when nothing can be added between pd_lower and + * pd_upper. + * + * all blocks written out by an access method must be disk pages. + * + * EXCEPTIONS: + * + * obviously, a page is not formatted before it is initialized with by + * a call to PageInit. + * + * the contents of the special pg_variable/pg_time/pg_log tables are + * raw disk blocks with special formats. these are the only "access + * methods" that need not write disk pages. + * + * NOTES: + * + * linp0..N form an ItemId array. ItemPointers point into this array + * rather than pointing directly to a tuple. + * + * tuple0..N are added "backwards" on the page. because a tuple's + * ItemPointer points to its ItemId entry rather than its actual + * byte-offset position, tuples can be physically shuffled on a page + * whenever the need arises. + * + * AM-generic per-page information is kept in the pd_opaque field of + * the PageHeaderData. (this is currently only the page size.) + * AM-specific per-page data is kept in the area marked "special + * space"; each AM has an "opaque" structure defined somewhere that is + * stored as the page trailer. an access method should always + * initialize its pages with PageInit and then set its own opaque + * fields. + */ +typedef Pointer Page; + +/* + * PageIsValid -- + * True iff page is valid. + */ +#define PageIsValid(page) PointerIsValid(page) + + +/* + * location (byte offset) within a page. + * + * note that this is actually limited to 2^13 because we have limited + * ItemIdData.lp_off and ItemIdData.lp_len to 13 bits (see itemid.h). + */ +typedef uint16 LocationIndex; + + +/* + * space management information generic to any page + * + * od_pagesize - size in bytes. + * in reality, we need at least 64B to fit the + * page header, opaque space and a minimal tuple; + * on the high end, we can only support pages up + * to 8KB because lp_off/lp_len are 13 bits. + */ +typedef struct OpaqueData { + uint16 od_pagesize; +} OpaqueData; + +typedef OpaqueData *Opaque; + + +/* + * disk page organization + */ +typedef struct PageHeaderData { + LocationIndex pd_lower; /* offset to start of free space */ + LocationIndex pd_upper; /* offset to end of free space */ + LocationIndex pd_special; /* offset to start of special space */ + OpaqueData pd_opaque; /* AM-generic information */ + ItemIdData pd_linp[1]; /* line pointers */ +} PageHeaderData; + +typedef PageHeaderData *PageHeader; + +typedef enum { + ShufflePageManagerMode, + OverwritePageManagerMode +} PageManagerMode; + +/* ---------------- + * misc support macros + * ---------------- + */ + +/* + * XXX this is wrong -- ignores padding/alignment, variable page size, + * AM-specific opaque space at the end of the page (as in btrees), ... + * however, it at least serves as an upper bound for heap pages. + */ +#define MAXTUPLEN (BLCKSZ - sizeof (PageHeaderData)) + +/* ---------------------------------------------------------------- + * page support macros + * ---------------------------------------------------------------- + */ +/* + * PageIsValid -- This is defined in page.h. + */ + +/* + * PageIsUsed -- + * True iff the page size is used. 
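Free space on a slotted page is simply the gap between pd_lower and pd_upper: line pointers grow pd_lower upward while tuple data grows pd_upper downward. A rough sketch of the arithmetic (illustrative only; PageGetFreeSpace, declared at the end of this header, returns essentially this quantity):

    PageHeader phdr  = (PageHeader) page;
    Size       avail = (Size) (phdr->pd_upper - phdr->pd_lower);
    bool       full  = (phdr->pd_upper <= phdr->pd_lower);  /* nothing more fits */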
+ * + * Note: + * Assumes page is valid. + */ +#define PageIsUsed(page) \ + (AssertMacro(PageIsValid(page)) ? \ + ((bool) (((PageHeader) (page))->pd_lower != 0)) : false) + +/* + * PageIsEmpty -- + * returns true iff no itemid has been allocated on the page + */ +#define PageIsEmpty(page) \ + (((PageHeader) (page))->pd_lower == \ + (sizeof(PageHeaderData) - sizeof(ItemIdData)) ? true : false) + +/* + * PageGetItemId -- + * Returns an item identifier of a page. + */ +#define PageGetItemId(page, offsetNumber) \ + ((ItemId) (&((PageHeader) (page))->pd_linp[(-1) + (offsetNumber)])) + +/* ---------------- + * macros to access opaque space + * ---------------- + */ + +/* + * PageSizeIsValid -- + * True iff the page size is valid. + * + * XXX currently all page sizes are "valid" but we only actually + * use BLCKSZ. + */ +#define PageSizeIsValid(pageSize) 1 + +/* + * PageGetPageSize -- + * Returns the page size of a page. + * + * this can only be called on a formatted page (unlike + * BufferGetPageSize, which can be called on an unformatted page). + * however, it can be called on a page for which there is no buffer. + */ +#define PageGetPageSize(page) \ + ((Size) ((PageHeader) (page))->pd_opaque.od_pagesize) + +/* + * PageSetPageSize -- + * Sets the page size of a page. + */ +#define PageSetPageSize(page, size) \ + ((PageHeader) (page))->pd_opaque.od_pagesize = (size) + +/* ---------------- + * page special data macros + * ---------------- + */ +/* + * PageGetSpecialSize -- + * Returns size of special space on a page. + * + * Note: + * Assumes page is locked. + */ +#define PageGetSpecialSize(page) \ + ((uint16) (PageGetPageSize(page) - ((PageHeader)page)->pd_special)) + +/* + * PageGetSpecialPointer -- + * Returns pointer to special space on a page. + * + * Note: + * Assumes page is locked. + */ +#define PageGetSpecialPointer(page) \ + (AssertMacro(PageIsValid(page)) ? \ + (char *) ((char *) (page) + ((PageHeader) (page))->pd_special) \ + : (char *) 0) + +/* ---------------------------------------------------------------- + * extern declarations + * ---------------------------------------------------------------- + */ + +extern Size BufferGetPageSize(Buffer buffer); +extern Page BufferGetPage(Buffer buffer); +extern void PageInit(Page page, Size pageSize, Size specialSize); +extern Item PageGetItem(Page page, ItemId itemId); +extern OffsetNumber PageAddItem(Page page, Item item, Size size, + OffsetNumber offsetNumber, ItemIdFlags flags); +extern Page PageGetTempPage(Page page, Size specialSize); +extern void PageRestoreTempPage(Page tempPage, Page oldPage); +extern OffsetNumber PageGetMaxOffsetNumber(Page page); +extern void PageRepairFragmentation(Page page); +extern Size PageGetFreeSpace(Page page); +extern void PageManagerModeSet(PageManagerMode mode); +extern void PageIndexTupleDelete(Page page, OffsetNumber offset); +extern void PageIndexTupleDeleteAdjustLinePointers(PageHeader phdr, + char *location, Size size); + + +#endif /* BUFPAGE_H */ diff --git a/src/backend/storage/fd.h b/src/backend/storage/fd.h new file mode 100644 index 00000000000..da28b031bb8 --- /dev/null +++ b/src/backend/storage/fd.h @@ -0,0 +1,96 @@ +/*------------------------------------------------------------------------- + * + * fd.h-- + * Virtual file descriptor definitions. 
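For illustration, the line pointer macros above combine like this to walk every item on a formatted page (offset numbers are 1-based; OffsetNumber, ItemId and Item come from off.h, itemid.h and item.h):

    OffsetNumber off;
    OffsetNumber maxoff = PageGetMaxOffsetNumber(page);

    for (off = 1; off <= maxoff; off++) {
        ItemId lp   = PageGetItemId(page, off);  /* entry in the linp array */
        Item   item = PageGetItem(page, lp);     /* the actual tuple bytes  */
        /* ... examine item ... */
    }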
+ * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: fd.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +/* + * calls: + * + * File {Close, Read, Write, Seek, Tell, Sync} + * {File Name Open, Allocate, Free} File + * + * These are NOT JUST RENAMINGS OF THE UNIX ROUTINES. + * use them for all file activity... + * + * fd = FilePathOpenFile("foo", O_RDONLY); + * File fd; + * + * use AllocateFile if you need a file descriptor in some other context. + * it will make sure that there is a file descriptor free + * + * use FreeFile to let the virtual file descriptor package know that + * there is now a free fd (when you are done with it) + * + * AllocateFile(); + * FreeFile(); + */ +#ifndef FD_H +#define FD_H + +/* + * FileOpen uses the standard UNIX open(2) flags. + */ +#include <fcntl.h> /* for O_ on most */ +#ifndef O_RDONLY +#include <sys/file.h> /* for O_ on the rest */ +#endif /* O_RDONLY */ + +/* + * FileSeek uses the standard UNIX lseek(2) flags. + */ +#ifndef WIN32 +#include <unistd.h> /* for SEEK_ on most */ +#else +#ifndef SEEK_SET +#include <stdio.h> /* for SEEK_ on the rest */ +#endif /* SEEK_SET */ +#endif /* WIN32 */ + +#include "c.h" +#include "storage/block.h" + +typedef char *FileName; + +typedef int File; + +/* originally in libpq-fs.h */ +struct pgstat { /* just the fields we need from stat structure */ + int st_ino; + int st_mode; + unsigned int st_size; + unsigned int st_sizehigh; /* high order bits */ +/* 2^64 == 1.8 x 10^20 bytes */ + int st_uid; + int st_atime_s; /* just the seconds */ + int st_mtime_s; /* since SysV and the new BSD both have */ + int st_ctime_s; /* usec fields.. */ +}; + +/* + * prototypes for functions in fd.c + */ +extern void FileInvalidate(File file); +extern File FileNameOpenFile(FileName fileName, int fileFlags, int fileMode); +extern File PathNameOpenFile(FileName fileName, int fileFlags, int fileMode); +extern void FileClose(File file); +extern void FileUnlink(File file); +extern int FileRead(File file, char *buffer, int amount); +extern int FileWrite(File file, char *buffer, int amount); +extern long FileSeek(File file, long offset, int whence); +extern long FileTell(File file); +extern int FileTruncate(File file, int offset); +extern int FileSync(File file); +extern int FileNameUnlink(char *filename); +extern void AllocateFile(void); +extern void FreeFile(void); +extern void closeAllVfds(void); +extern void closeOneVfd(void); + +#endif /* FD_H */ diff --git a/src/backend/storage/file/Makefile.inc b/src/backend/storage/file/Makefile.inc new file mode 100644 index 00000000000..767cbecd38a --- /dev/null +++ b/src/backend/storage/file/Makefile.inc @@ -0,0 +1,14 @@ +#------------------------------------------------------------------------- +# +# Makefile.inc-- +# Makefile for storage/file +# +# Copyright (c) 1994, Regents of the University of California +# +# +# IDENTIFICATION +# $Header: /cvsroot/pgsql/src/backend/storage/file/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $ +# +#------------------------------------------------------------------------- + +SUBSRCS+= fd.c diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c new file mode 100644 index 00000000000..bb94c4c5dec --- /dev/null +++ b/src/backend/storage/file/fd.c @@ -0,0 +1,888 @@ +/*------------------------------------------------------------------------- + * + * fd.c-- + * Virtual file descriptor code. 
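A minimal round trip through the VFD interface declared in fd.h (illustrative only; the file name, flags and mode are made up):

    File vfd;
    char buf[BLCKSZ];

    vfd = FileNameOpenFile("pg_example", O_RDWR | O_CREAT, 0600);
    if (vfd < 0)
        elog(WARN, "could not open relation file");

    FileWrite(vfd, buf, BLCKSZ);     /* marks the VFD dirty            */
    FileSeek(vfd, 0L, SEEK_SET);
    FileRead(vfd, buf, BLCKSZ);
    FileSync(vfd);                   /* fsync; clears the dirty bit    */
    FileClose(vfd);                  /* also syncs if still dirty      */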
+ * + * Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * $Id: fd.c,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $ + * + * NOTES: + * + * This code manages a cache of 'virtual' file descriptors (VFDs). + * The server opens many file descriptors for a variety of reasons, + * including base tables, scratch files (e.g., sort and hash spool + * files), and random calls to C library routines like system(3); it + * is quite easy to exceed system limits on the number of open files a + * single process can have. (This is around 256 on many modern + * operating systems, but can be as low as 32 on others.) + * + * VFDs are managed as an LRU pool, with actual OS file descriptors + * being opened and closed as needed. Obviously, if a routine is + * opened using these interfaces, all subsequent operations must also + * be through these interfaces (the File type is not a real file + * descriptor). + * + * For this scheme to work, most (if not all) routines throughout the + * server should use these interfaces instead of calling the C library + * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we + * may find ourselves short of real file descriptors anyway. + * + * This file used to contain a bunch of stuff to support RAID levels 0 + * (jbod), 1 (duplex) and 5 (xor parity). That stuff is all gone + * because the parallel query processing code that called it is all + * gone. If you really need it you could get it from the original + * POSTGRES source. + *------------------------------------------------------------------------- + */ + +#include <stdio.h> +#include <sys/file.h> +#include <sys/param.h> +#include <errno.h> +#include <sys/stat.h> +#include <string.h> +#include <unistd.h> + +#include "c.h" +#include "miscadmin.h" /* for DataDir */ +#include "utils/palloc.h" + +#ifdef PORTNAME_sparc +/* + * the SunOS 4 NOFILE is a lie, because the default limit is *not* the + * maximum number of file descriptors you can have open. + * + * we have to either use this number (the default dtablesize) or + * explicitly call setrlimit(RLIMIT_NOFILE, NOFILE). + */ +#include <sys/user.h> +#undef NOFILE +#define NOFILE NOFILE_IN_U +#endif /* PORTNAME_sparc */ + +/* + * Problem: Postgres does a system(ld...) to do dynamic loading. This + * will open several extra files in addition to those used by + * Postgres. We need to do this hack to guarentee that there are file + * descriptors free for ld to use. + * + * The current solution is to limit the number of files descriptors + * that this code will allocated at one time. (it leaves + * RESERVE_FOR_LD free). + * + * (Even though most dynamic loaders now use dlopen(3) or the + * equivalent, the OS must still open several files to perform the + * dynamic loading. Keep this here.) + */ +#define RESERVE_FOR_LD 10 + +/* + * If we are using weird storage managers, we may need to keep real + * file descriptors open so that the jukebox server doesn't think we + * have gone away (and no longer care about a platter or file that + * we've been using). This might be an actual file descriptor for a + * local jukebox interface that uses paths, or a socket connection for + * a network jukebox server. Since we can't be opening and closing + * these descriptors at whim, we must make allowances for them. + */ +#ifdef HP_JUKEBOX +#define RESERVE_FOR_JB 25 +#define MAXFILES ((NOFILE - RESERVE_FOR_LD) - RESERVE_FOR_JB) +#else /* HP_JUKEBOX */ +#define MAXFILES (NOFILE - RESERVE_FOR_LD) +#endif /* HP_JUKEBOX */ + +/* Debugging.... 
*/ + +#ifdef FDDEBUG +# define DO_DB(A) A +#else +# define DO_DB(A) /* A */ +#endif + +#define VFD_CLOSED -1 + +#include "storage/fd.h" +#include "utils/elog.h" + +#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED) + +typedef struct vfd { + signed short fd; + unsigned short fdstate; + +#define FD_DIRTY (1 << 0) + + File nextFree; + File lruMoreRecently; + File lruLessRecently; + long seekPos; + char *fileName; + int fileFlags; + int fileMode; +} Vfd; + +/* + * Virtual File Descriptor array pointer and size. This grows as + * needed. + */ +static Vfd *VfdCache; +static Size SizeVfdCache = 0; + +/* + * Minimum number of file descriptors known to be free. + */ +static int FreeFd = 0; + +/* + * Number of file descriptors known to be open. + */ +static int nfile = 0; + +/* + * we use the name of the null device in various places, mostly so + * that we can open it and find out if we really have any descriptors + * available or not. + */ +#ifndef WIN32 +static char *Nulldev = "/dev/null"; +static char Sep_char = '/'; +#else +static char *Nulldev = "NUL"; +static char Sep_char = '\\'; +#endif /* WIN32 */ + +/* + * Private Routines + * + * Delete - delete a file from the Lru ring + * LruDelete - remove a file from the Lru ring and close + * Insert - put a file at the front of the Lru ring + * LruInsert - put a file at the front of the Lru ring and open + * AssertLruRoom - make sure that there is a free fd. + * + * the Last Recently Used ring is a doubly linked list that begins and + * ends on element zero. + * + * example: + * + * /--less----\ /---------\ + * v \ v \ + * #0 --more---> LeastRecentlyUsed --more-\ \ + * ^\ | | + * \\less--> MostRecentlyUsedFile <---/ | + * \more---/ \--less--/ + * + * AllocateVfd - grab a free (or new) file record (from VfdArray) + * FreeVfd - free a file record + * + */ +static void Delete(File file); +static void LruDelete(File file); +static void Insert(File file); +static int LruInsert (File file); +static void AssertLruRoom(void); +static File AllocateVfd(void); +static void FreeVfd(File file); + +static int FileAccess(File file); +static File fileNameOpenFile(FileName fileName, int fileFlags, int fileMode); +static char *filepath(char *filename); + +#if defined(FDDEBUG) +static void +_dump_lru() +{ + int mru = VfdCache[0].lruLessRecently; + Vfd *vfdP = &VfdCache[mru]; + + printf("MOST %d ", mru); + while (mru != 0) + { + mru = vfdP->lruLessRecently; + vfdP = &VfdCache[mru]; + printf("%d ", mru); + } + printf("LEAST\n"); +} +#endif /* FDDEBUG */ + +static void +Delete(File file) +{ + Vfd *fileP; + + DO_DB(printf("DEBUG: Delete %d (%s)\n", + file, VfdCache[file].fileName)); + DO_DB(_dump_lru()); + + Assert(file != 0); + + fileP = &VfdCache[file]; + + VfdCache[fileP->lruLessRecently].lruMoreRecently = + VfdCache[file].lruMoreRecently; + VfdCache[fileP->lruMoreRecently].lruLessRecently = + VfdCache[file].lruLessRecently; + + DO_DB(_dump_lru()); +} + +static void +LruDelete(File file) +{ + Vfd *fileP; + int returnValue; + + DO_DB(printf("DEBUG: LruDelete %d (%s)\n", + file, VfdCache[file].fileName)); + + Assert(file != 0); + + fileP = &VfdCache[file]; + + /* delete the vfd record from the LRU ring */ + Delete(file); + + /* save the seek position */ + fileP->seekPos = lseek(fileP->fd, 0L, SEEK_CUR); + Assert( fileP->seekPos != -1); + + /* if we have written to the file, sync it */ + if (fileP->fdstate & FD_DIRTY) { + returnValue = fsync(fileP->fd); + Assert(returnValue != -1); + fileP->fdstate &= ~FD_DIRTY; + } + + /* close the file */ + returnValue = 
close(fileP->fd); + Assert(returnValue != -1); + + --nfile; + fileP->fd = VFD_CLOSED; + + /* note that there is now one more free real file descriptor */ + FreeFd++; +} + +static void +Insert(File file) +{ + Vfd *vfdP; + + DO_DB(printf("DEBUG: Insert %d (%s)\n", + file, VfdCache[file].fileName)); + DO_DB(_dump_lru()); + + vfdP = &VfdCache[file]; + + vfdP->lruMoreRecently = 0; + vfdP->lruLessRecently = VfdCache[0].lruLessRecently; + VfdCache[0].lruLessRecently = file; + VfdCache[vfdP->lruLessRecently].lruMoreRecently = file; + + DO_DB(_dump_lru()); +} + +static int +LruInsert (File file) +{ + Vfd *vfdP; + int returnValue; + + DO_DB(printf("DEBUG: LruInsert %d (%s)\n", + file, VfdCache[file].fileName)); + + vfdP = &VfdCache[file]; + + if (FileIsNotOpen(file)) { + int tmpfd; + + /* + * Note, we check to see if there's a free file descriptor + * before attempting to open a file. One general way to do + * this is to try to open the null device which everybody + * should be able to open all the time. If this fails, we + * assume this is because there's no free file descriptors. + */ + tryAgain: + tmpfd = open(Nulldev, O_CREAT|O_RDWR, 0666); + if (tmpfd < 0) { + FreeFd = 0; + errno = 0; + AssertLruRoom(); + goto tryAgain; + } else { + close(tmpfd); + } + vfdP->fd = open(vfdP->fileName,vfdP->fileFlags,vfdP->fileMode); + + if (vfdP->fd < 0) { + DO_DB(printf("RE_OPEN FAILED: %d\n", + errno)); + return (vfdP->fd); + } else { + DO_DB(printf("RE_OPEN SUCCESS\n")); + ++nfile; + } + + /* seek to the right position */ + if (vfdP->seekPos != 0L) { + returnValue = + lseek(vfdP->fd, vfdP->seekPos, SEEK_SET); + Assert(returnValue != -1); + } + + /* init state on open */ + vfdP->fdstate = 0x0; + + /* note that a file descriptor has been used up */ + if (FreeFd > 0) + FreeFd--; + } + + /* + * put it at the head of the Lru ring + */ + + Insert(file); + + return (0); +} + +static void +AssertLruRoom() +{ + DO_DB(printf("DEBUG: AssertLruRoom (FreeFd = %d)\n", + FreeFd)); + + if (FreeFd <= 0 || nfile >= MAXFILES) { + LruDelete(VfdCache[0].lruMoreRecently); + } +} + +static File +AllocateVfd() +{ + Index i; + File file; + + DO_DB(printf("DEBUG: AllocateVfd\n")); + + if (SizeVfdCache == 0) { + + /* initialize */ + VfdCache = (Vfd *)malloc(sizeof(Vfd)); + + VfdCache->nextFree = 0; + VfdCache->lruMoreRecently = 0; + VfdCache->lruLessRecently = 0; + VfdCache->fd = VFD_CLOSED; + VfdCache->fdstate = 0x0; + + SizeVfdCache = 1; + } + + if (VfdCache[0].nextFree == 0) { + + /* + * The free list is empty so it is time to increase the + * size of the array + */ + + VfdCache =(Vfd *)realloc(VfdCache, sizeof(Vfd)*SizeVfdCache*2); + Assert(VfdCache != NULL); + + /* + * Set up the free list for the new entries + */ + + for (i = SizeVfdCache; i < 2*SizeVfdCache; i++) { + memset((char *) &(VfdCache[i]), 0, sizeof(VfdCache[0])); + VfdCache[i].nextFree = i+1; + VfdCache[i].fd = VFD_CLOSED; + } + + /* + * Element 0 is the first and last element of the free + * list + */ + + VfdCache[0].nextFree = SizeVfdCache; + VfdCache[2*SizeVfdCache-1].nextFree = 0; + + /* + * Record the new size + */ + + SizeVfdCache *= 2; + } + file = VfdCache[0].nextFree; + + VfdCache[0].nextFree = VfdCache[file].nextFree; + + return file; +} + +static void +FreeVfd(File file) +{ + DO_DB(printf("DB: FreeVfd: %d (%s)\n", + file, VfdCache[file].fileName)); + + VfdCache[file].nextFree = VfdCache[0].nextFree; + VfdCache[0].nextFree = file; +} + +static char * +filepath(char *filename) +{ + char *buf; + char basename[16]; + int len; + +#ifndef WIN32 + if (*filename != 
Sep_char) { +#else + if (!(filename[1] == ':' && filename[2] == Sep_char)) { +#endif /* WIN32 */ + + /* Either /base/ or \base\ */ + sprintf(basename, "%cbase%c", Sep_char, Sep_char); + + len = strlen(DataDir) + strlen(basename) + strlen(GetDatabaseName()) + + strlen(filename) + 2; + buf = (char*) palloc(len); + sprintf(buf, "%s%s%s%c%s", + DataDir, basename, GetDatabaseName(), Sep_char, filename); + } else { + buf = (char *) palloc(strlen(filename) + 1); + strcpy(buf, filename); + } + + return(buf); +} + +static int +FileAccess(File file) +{ + int returnValue; + + DO_DB(printf("DB: FileAccess %d (%s)\n", + file, VfdCache[file].fileName)); + + /* + * Is the file open? If not, close the least recently used, + * then open it and stick it at the head of the used ring + */ + + if (FileIsNotOpen(file)) { + + AssertLruRoom(); + + returnValue = LruInsert(file); + if (returnValue != 0) + return returnValue; + + } else { + + /* + * We now know that the file is open and that it is not the + * last one accessed, so we need to more it to the head of + * the Lru ring. + */ + + Delete(file); + Insert(file); + } + + return (0); +} + +/* + * Called when we get a shared invalidation message on some relation. + */ +void +FileInvalidate(File file) +{ + if (!FileIsNotOpen(file)) { + LruDelete(file); + } +} + +/* VARARGS2 */ +static File +fileNameOpenFile(FileName fileName, + int fileFlags, + int fileMode) +{ + static int osRanOut = 0; + File file; + Vfd *vfdP; + int tmpfd; + + DO_DB(printf("DEBUG: FileNameOpenFile: %s %x %o\n", + fileName, fileFlags, fileMode)); + + file = AllocateVfd(); + vfdP = &VfdCache[file]; + + if (nfile >= MAXFILES || (FreeFd == 0 && osRanOut)) { + AssertLruRoom(); + } + + tryAgain: + tmpfd = open(Nulldev, O_CREAT|O_RDWR, 0666); + if (tmpfd < 0) { + DO_DB(printf("DB: not enough descs, retry, er= %d\n", + errno)); + errno = 0; + FreeFd = 0; + osRanOut = 1; + AssertLruRoom(); + goto tryAgain; + } else { + close(tmpfd); + } + +#ifdef WIN32 + fileFlags |= _O_BINARY; +#endif /* WIN32 */ + vfdP->fd = open(fileName,fileFlags,fileMode); + vfdP->fdstate = 0x0; + + if (vfdP->fd < 0) { + FreeVfd(file); + return -1; + } + ++nfile; + DO_DB(printf("DB: FNOF success %d\n", + vfdP->fd)); + + (void)LruInsert(file); + + if (fileName==NULL) { + elog(WARN, "fileNameOpenFile: NULL fname"); + } + vfdP->fileName = malloc(strlen(fileName)+1); + strcpy(vfdP->fileName,fileName); + + vfdP->fileFlags = fileFlags & ~(O_TRUNC|O_EXCL); + vfdP->fileMode = fileMode; + vfdP->seekPos = 0; + + return file; +} + +/* + * open a file in the database directory ($PGDATA/base/...) 
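filepath() above expands a bare relation name into a full path under the data directory; a worked example with hypothetical values:

    /* Illustrative only: what filepath() builds for a relative name.
     *   DataDir           = "/usr/local/pgsql/data"
     *   GetDatabaseName() = "mydb"
     *   filename          = "pg_example"
     *   result            = "/usr/local/pgsql/data/base/mydb/pg_example"
     * An absolute name (one starting with Sep_char) is copied unchanged.
     */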
+ */ +File +FileNameOpenFile(FileName fileName, int fileFlags, int fileMode) +{ + File fd; + char *fname; + + fname = filepath(fileName); + fd = fileNameOpenFile(fname, fileFlags, fileMode); + pfree(fname); + return(fd); +} + +/* + * open a file in an arbitrary directory + */ +File +PathNameOpenFile(FileName fileName, int fileFlags, int fileMode) +{ + return(fileNameOpenFile(fileName, fileFlags, fileMode)); +} + +void +FileClose(File file) +{ + int returnValue; + + DO_DB(printf("DEBUG: FileClose: %d (%s)\n", + file, VfdCache[file].fileName)); + + if (!FileIsNotOpen(file)) { + + /* remove the file from the lru ring */ + Delete(file); + + /* record the new free operating system file descriptor */ + FreeFd++; + + /* if we did any writes, sync the file before closing */ + if (VfdCache[file].fdstate & FD_DIRTY) { + returnValue = fsync(VfdCache[file].fd); + Assert(returnValue != -1); + VfdCache[file].fdstate &= ~FD_DIRTY; + } + + /* close the file */ + returnValue = close(VfdCache[file].fd); + Assert(returnValue != -1); + + --nfile; + VfdCache[file].fd = VFD_CLOSED; + } + /* + * Add the Vfd slot to the free list + */ + FreeVfd(file); + /* + * Free the filename string + */ + free(VfdCache[file].fileName); +} + +void +FileUnlink(File file) +{ + int returnValue; + + DO_DB(printf("DB: FileClose: %d (%s)\n", + file, VfdCache[file].fileName)); + + if (!FileIsNotOpen(file)) { + + /* remove the file from the lru ring */ + Delete(file); + + /* record the new free operating system file descriptor */ + FreeFd++; + + /* if we did any writes, sync the file before closing */ + if (VfdCache[file].fdstate & FD_DIRTY) { + returnValue = fsync(VfdCache[file].fd); + Assert(returnValue != -1); + VfdCache[file].fdstate &= ~FD_DIRTY; + } + + /* close the file */ + returnValue = close(VfdCache[file].fd); + Assert(returnValue != -1); + + --nfile; + VfdCache[file].fd = VFD_CLOSED; + } + /* add the Vfd slot to the free list */ + FreeVfd(file); + + /* free the filename string */ + unlink(VfdCache[file].fileName); + free(VfdCache[file].fileName); +} + +int +FileRead(File file, char *buffer, int amount) +{ + int returnCode; + + DO_DB(printf("DEBUG: FileRead: %d (%s) %d 0x%x\n", + file, VfdCache[file].fileName, amount, buffer)); + + FileAccess(file); + returnCode = read(VfdCache[file].fd, buffer, amount); + if (returnCode > 0) { + VfdCache[file].seekPos += returnCode; + } + + return returnCode; +} + +int +FileWrite(File file, char *buffer, int amount) +{ + int returnCode; + + DO_DB(printf("DB: FileWrite: %d (%s) %d 0x%lx\n", + file, VfdCache[file].fileName, amount, buffer)); + + FileAccess(file); + returnCode = write(VfdCache[file].fd, buffer, amount); + if (returnCode > 0) { /* changed by Boris with Mao's advice */ + VfdCache[file].seekPos += returnCode; + } + + /* record the write */ + VfdCache[file].fdstate |= FD_DIRTY; + + return returnCode; +} + +long +FileSeek(File file, long offset, int whence) +{ + int returnCode; + + DO_DB(printf("DEBUG: FileSeek: %d (%s) %d %d\n", + file, VfdCache[file].fileName, offset, whence)); + + if (FileIsNotOpen(file)) { + switch(whence) { + case SEEK_SET: + VfdCache[file].seekPos = offset; + return offset; + case SEEK_CUR: + VfdCache[file].seekPos = VfdCache[file].seekPos +offset; + return VfdCache[file].seekPos; + case SEEK_END: + FileAccess(file); + returnCode = VfdCache[file].seekPos = + lseek(VfdCache[file].fd, offset, whence); + return returnCode; + default: + elog(WARN, "FileSeek: invalid whence: %d", whence); + break; + } + } else { + returnCode = VfdCache[file].seekPos = + 
lseek(VfdCache[file].fd, offset, whence); + return returnCode; + } + /*NOTREACHED*/ + return(-1L); +} + +/* + * XXX not actually used but here for completeness + */ +long +FileTell(File file) +{ + DO_DB(printf("DEBUG: FileTell %d (%s)\n", + file, VfdCache[file].fileName)); + return VfdCache[file].seekPos; +} + +int +FileTruncate(File file, int offset) +{ + int returnCode; + + DO_DB(printf("DEBUG: FileTruncate %d (%s)\n", + file, VfdCache[file].fileName)); + + (void) FileSync(file); + (void) FileAccess(file); + returnCode = ftruncate(VfdCache[file].fd, offset); + return(returnCode); +} + +int +FileSync(File file) +{ + int returnCode; + + /* + * If the file isn't open, then we don't need to sync it; we + * always sync files when we close them. Also, if we haven't + * done any writes that we haven't already synced, we can ignore + * the request. + */ + + if (VfdCache[file].fd < 0 || !(VfdCache[file].fdstate & FD_DIRTY)) { + returnCode = 0; + } else { + returnCode = fsync(VfdCache[file].fd); + VfdCache[file].fdstate &= ~FD_DIRTY; + } + + return returnCode; +} + +int +FileNameUnlink(char *filename) +{ + int retval; + char *fname; + + fname = filepath(filename); + retval = unlink(fname); + pfree(fname); + return(retval); +} + +/* + * if we want to be sure that we have a real file descriptor available + * (e.g., we want to know this in psort) we call AllocateFile to force + * availability. when we are done we call FreeFile to deallocate the + * descriptor. + * + * allocatedFiles keeps track of how many have been allocated so we + * can give a warning if there are too few left. + */ +static int allocatedFiles = 0; + +void +AllocateFile() +{ + int fd; + int fdleft; + + while ((fd = open(Nulldev,O_WRONLY,0)) < 0) { + if (errno == EMFILE) { + errno = 0; + FreeFd = 0; + AssertLruRoom(); + } else { + elog(WARN,"Open: %s in %s line %d\n", Nulldev, + __FILE__, __LINE__); + } + } + close(fd); + ++allocatedFiles; + fdleft = MAXFILES - allocatedFiles; + if (fdleft < 6) { + elog(DEBUG,"warning: few usable file descriptors left (%d)", fdleft); + } + + DO_DB(printf("DEBUG: AllocatedFile. FreeFd = %d\n", + FreeFd)); +} + +/* + * XXX What happens if FreeFile() is called without a previous + * AllocateFile()? + */ +void +FreeFile() +{ + DO_DB(printf("DEBUG: FreeFile. FreeFd now %d\n", + FreeFd)); + FreeFd++; + nfile++; /* dangerous */ + Assert(allocatedFiles > 0); + --allocatedFiles; +} + +void +closeAllVfds() +{ + int i; + for (i=0; i<SizeVfdCache; i++) { + if (!FileIsNotOpen(i)) + LruDelete(i); + } +} + +void +closeOneVfd() +{ + int tmpfd; + + tmpfd = open(Nulldev, O_CREAT | O_RDWR, 0666); + if (tmpfd < 0) { + FreeFd = 0; + AssertLruRoom(); + FreeFd = 0; + } + else + close(tmpfd); +} diff --git a/src/backend/storage/ipc.h b/src/backend/storage/ipc.h new file mode 100644 index 00000000000..0da041bc9c8 --- /dev/null +++ b/src/backend/storage/ipc.h @@ -0,0 +1,285 @@ +/*------------------------------------------------------------------------- + * + * ipc.h-- + * POSTGRES inter-process communication definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: ipc.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $ + * + * NOTES + * This file is very architecture-specific. This stuff should actually + * be factored into the port/ directories. 
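The routines above complete fd.c's virtual file descriptor (VFD) layer: a backend can keep many logical files "open" while the real operating system descriptors are recycled through the LRU ring. What follows is only a usage sketch, not part of the patch; it assumes it runs inside a backend with the fd.h and elog interfaces available, and the relation name and block size are made up.

    #include <fcntl.h>
    #include <unistd.h>
    #include "storage/fd.h"
    #include "utils/elog.h"

    #define EXAMPLE_BLCKSZ 8192             /* hypothetical block size */

    void
    vfd_usage_example(void)
    {
        File vfd;
        char block[EXAMPLE_BLCKSZ];

        /* name is resolved by filepath(), i.e. $PGDATA/base/<dbname>/myrel */
        vfd = FileNameOpenFile("myrel", O_RDWR | O_CREAT, 0600);
        if (vfd < 0)
            elog(WARN, "vfd_usage_example: cannot open relation file");

        /* every call may transparently reopen the real descriptor (FileAccess) */
        (void) FileRead(vfd, block, EXAMPLE_BLCKSZ);
        (void) FileSeek(vfd, 0L, SEEK_SET);             /* seekPos survives a virtual close */
        (void) FileWrite(vfd, block, EXAMPLE_BLCKSZ);   /* marks the Vfd FD_DIRTY */
        (void) FileSync(vfd);                           /* fsync only if FD_DIRTY */
        FileClose(vfd);
    }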
+ * + *------------------------------------------------------------------------- + */ +#ifndef IPC_H +#define IPC_H + +#include <sys/types.h> +#ifndef _IPC_ +#define _IPC_ +#include <sys/ipc.h> +#endif + +#include "c.h" + +/* + * Many architectures have support for user-level spinlocks (i.e., an + * atomic test-and-set instruction). However, we have only written + * spinlock code for the architectures listed. + */ +#if defined(PORTNAME_aix) || \ + defined(PORTNAME_alpha) || \ + defined(PORTNAME_hpux) || \ + defined(PORTNAME_irix5) || \ + defined(PORTNAME_next) || \ + defined(PORTNAME_sparc) || \ + defined(PORTNAME_sparc_solaris) || \ + (defined(__i386__) && defined(__GNUC__)) +#define HAS_TEST_AND_SET +#endif + +#if defined(HAS_TEST_AND_SET) + +#if defined(PORTNAME_next) +/* + * Use Mach mutex routines since these are, in effect, test-and-set + * spinlocks. + */ +#undef NEVER /* definition in cthreads.h conflicts with parse.h */ +#include <mach/cthreads.h> + +typedef struct mutex slock_t; +#else /* next */ +#if defined(PORTNAME_aix) +/* + * The AIX C library has the cs(3) builtin for compare-and-set that + * operates on ints. + */ +typedef unsigned int slock_t; +#else /* aix */ +#if defined(PORTNAME_alpha) +#include <sys/mman.h> +typedef msemaphore slock_t; +#else /* alpha */ +#if defined(PORTNAME_hpux) +/* + * The PA-RISC "semaphore" for the LDWCX instruction is 4 bytes aligned + * to a 16-byte boundary. + */ +typedef struct { int sem[4]; } slock_t; +#else /* hpux */ +#if defined(PORTNAME_irix5) +#include <abi_mutex.h> +typedef abilock_t slock_t; +#else /* irix5 */ +/* + * On all other architectures spinlocks are a single byte. + */ +typedef unsigned char slock_t; +#endif /* irix5 */ +#endif /* hpux */ +#endif /* alpha */ +#endif /* aix */ +#endif /* next */ + +extern void S_LOCK(slock_t *lock); +extern void S_UNLOCK(slock_t *lock); +extern void S_INIT_LOCK(slock_t *lock); + +#if defined(PORTNAME_hpux) || defined(PORTNAME_alpha) || defined(PORTNAME_irix5) || defined(PORTNAME_next) +extern int S_LOCK_FREE(slock_t *lock); +#else /* PORTNAME_hpux */ +#define S_LOCK_FREE(lock) ((*lock) == 0) +#endif /* PORTNAME_hpux */ + +#endif /* HAS_TEST_AND_SET */ + +/* + * On architectures for which we have not implemented spinlocks (or + * cannot do so), we use System V semaphores. We also use them for + * long locks. For some reason union semun is never defined in the + * System V header files so we must do it ourselves. 
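The note above about union semun comes up on nearly every System V platform, so here is a standalone illustration (not part of this patch) of the declaration in use: semctl() needs a union semun argument for SETVAL even though most vendor headers leave the union for the application to declare (a few systems do define it, which is why ipc.h guards its own definition with the portname list that follows).

    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/ipc.h>
    #include <sys/sem.h>

    union semun {                       /* declared by hand, as in ipc.h */
        int              val;
        struct semid_ds *buf;
        unsigned short  *array;
    };

    int
    main()
    {
        union semun arg;
        int semId = semget(IPC_PRIVATE, 1, IPC_CREAT | 0600);

        if (semId < 0) {
            perror("semget");
            return 1;
        }
        arg.val = 255;                          /* cf. IpcSemaphoreDefaultStartValue */
        semctl(semId, 0, SETVAL, arg);          /* set the initial value */
        printf("value = %d\n", semctl(semId, 0, GETVAL, arg));
        semctl(semId, 0, IPC_RMID, arg);        /* remove the set */
        return 0;
    }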
+ */ +#if defined(sequent) || \ + defined(PORTNAME_aix) || \ + defined(PORTNAME_alpha) || \ + defined(PORTNAME_hpux) || \ + defined(PORTNAME_sparc_solaris) || \ + defined(WIN32) || \ + defined(PORTNAME_ultrix4) +union semun { + int val; + struct semid_ds *buf; + unsigned short *array; +}; +#endif + +typedef uint16 SystemPortAddress; + +/* semaphore definitions */ + +#define IPCProtection (0600) /* access/modify by user only */ + +#define IPC_NMAXSEM 25 /* maximum number of semaphores */ +#define IpcSemaphoreDefaultStartValue 255 +#define IpcSharedLock (-1) +#define IpcExclusiveLock (-255) + +#define IpcUnknownStatus (-1) +#define IpcInvalidArgument (-2) +#define IpcSemIdExist (-3) +#define IpcSemIdNotExist (-4) + +typedef uint32 IpcSemaphoreKey; /* semaphore key */ +typedef int IpcSemaphoreId; + +/* shared memory definitions */ + +#define IpcMemCreationFailed (-1) +#define IpcMemIdGetFailed (-2) +#define IpcMemAttachFailed 0 + +typedef uint32 IPCKey; +#define PrivateIPCKey IPC_PRIVATE +#define DefaultIPCKey 17317 + +typedef uint32 IpcMemoryKey; /* shared memory key */ +typedef int IpcMemoryId; + + +/* ipc.c */ +extern void exitpg(int code); +extern void quasi_exitpg(void); +extern on_exitpg(void (*function)(), caddr_t arg); + +extern IpcSemaphoreId IpcSemaphoreCreate(IpcSemaphoreKey semKey, + int semNum, int permission, int semStartValue, + int removeOnExit, int *status); +extern void IpcSemaphoreSet(int semId, int semno, int value); +extern void IpcSemaphoreKill(IpcSemaphoreKey key); +extern void IpcSemaphoreLock(IpcSemaphoreId semId, int sem, int lock); +extern void IpcSemaphoreUnlock(IpcSemaphoreId semId, int sem, int lock); +extern int IpcSemaphoreGetCount(IpcSemaphoreId semId, int sem); +extern int IpcSemaphoreGetValue(IpcSemaphoreId semId, int sem); +extern IpcMemoryId IpcMemoryCreate(IpcMemoryKey memKey, uint32 size, + int permission); +extern IpcMemoryId IpcMemoryIdGet(IpcMemoryKey memKey, uint32 size); +extern void IpcMemoryDetach(int status, char *shmaddr); +extern char *IpcMemoryAttach(IpcMemoryId memId); +extern void IpcMemoryKill(IpcMemoryKey memKey); +extern void CreateAndInitSLockMemory(IPCKey key); +extern void AttachSLockMemory(IPCKey key); + + +#ifdef HAS_TEST_AND_SET + +#define NSLOCKS 2048 +#define NOLOCK 0 +#define SHAREDLOCK 1 +#define EXCLUSIVELOCK 2 + +typedef enum _LockId_ { + BUFMGRLOCKID, + LOCKLOCKID, + OIDGENLOCKID, + SHMEMLOCKID, + BINDINGLOCKID, + LOCKMGRLOCKID, + SINVALLOCKID, + +#ifdef MAIN_MEMORY + MMCACHELOCKID, +#endif /* MAIN_MEMORY */ + + PROCSTRUCTLOCKID, + FIRSTFREELOCKID +} _LockId_; + +#define MAX_SPINS FIRSTFREELOCKID + +typedef struct slock { + slock_t locklock; + unsigned char flag; + short nshlocks; + slock_t shlock; + slock_t exlock; + slock_t comlock; + struct slock *next; +} SLock; + +extern void ExclusiveLock(int lockid); +extern void ExclusiveUnlock(int lockid); +extern bool LockIsFree(int lockid); +#else /* HAS_TEST_AND_SET */ + +typedef enum _LockId_ { + SHMEMLOCKID, + BINDINGLOCKID, + BUFMGRLOCKID, + LOCKMGRLOCKID, + SINVALLOCKID, + +#ifdef MAIN_MEMORY + MMCACHELOCKID, +#endif /* MAIN_MEMORY */ + + PROCSTRUCTLOCKID, + OIDGENLOCKID, + FIRSTFREELOCKID +} _LockId_; + +#define MAX_SPINS FIRSTFREELOCKID + +#endif /* HAS_TEST_AND_SET */ + +/* + * the following are originally in ipci.h but the prototypes have circular + * dependencies and most files include both ipci.h and ipc.h anyway, hence + * combined. + * + */ + +/* + * Note: + * These must not hash to DefaultIPCKey or PrivateIPCKey. 
+ */ +#define SystemPortAddressGetIPCKey(address) \ + (28597 * (address) + 17491) + +/* + * these keys are originally numbered from 1 to 12 consecutively but not + * all are used. The unused ones are removed. - ay 4/95. + */ +#define IPCKeyGetBufferMemoryKey(key) \ + ((key == PrivateIPCKey) ? key : 1 + (key)) + +#define IPCKeyGetSIBufferMemoryBlock(key) \ + ((key == PrivateIPCKey) ? key : 7 + (key)) + +#define IPCKeyGetSLockSharedMemoryKey(key) \ + ((key == PrivateIPCKey) ? key : 10 + (key)) + +#define IPCKeyGetSpinLockSemaphoreKey(key) \ + ((key == PrivateIPCKey) ? key : 11 + (key)) +#define IPCKeyGetWaitIOSemaphoreKey(key) \ + ((key == PrivateIPCKey) ? key : 12 + (key)) + +/* -------------------------- + * NOTE: This macro must always give the highest numbered key as every backend + * process forked off by the postmaster will be trying to acquire a semaphore + * with a unique key value starting at key+14 and incrementing up. Each + * backend uses the current key value then increments it by one. + * -------------------------- + */ +#define IPCGetProcessSemaphoreInitKey(key) \ + ((key == PrivateIPCKey) ? key : 14 + (key)) + +/* ipci.c */ +extern IPCKey SystemPortAddressCreateIPCKey(SystemPortAddress address); +extern void CreateSharedMemoryAndSemaphores(IPCKey key); +extern void AttachSharedMemoryAndSemaphores(IPCKey key); + +#endif /* IPC_H */ diff --git a/src/backend/storage/ipc/Makefile.inc b/src/backend/storage/ipc/Makefile.inc new file mode 100644 index 00000000000..b426dba0ff0 --- /dev/null +++ b/src/backend/storage/ipc/Makefile.inc @@ -0,0 +1,15 @@ +#------------------------------------------------------------------------- +# +# Makefile.inc-- +# Makefile for storage/ipc +# +# Copyright (c) 1994, Regents of the University of California +# +# +# IDENTIFICATION +# $Header: /cvsroot/pgsql/src/backend/storage/ipc/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $ +# +#------------------------------------------------------------------------- + +SUBSRCS+= ipc.c ipci.c s_lock.c shmem.c shmqueue.c sinval.c \ + sinvaladt.c spin.c diff --git a/src/backend/storage/ipc/README b/src/backend/storage/ipc/README new file mode 100644 index 00000000000..02d66045f82 --- /dev/null +++ b/src/backend/storage/ipc/README @@ -0,0 +1,31 @@ +$Header: /cvsroot/pgsql/src/backend/storage/ipc/README,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $ +Mon Jul 18 11:09:22 PDT 1988 W.KLAS + +Cache invalidation synchronization routines: +=========================================== + +The cache synchronization is done using a message queue. Every +backend can register a message which then has to be read by +all backends. A message read by all backends is removed from the +queue automatically. If a message has been lost because the buffer +was full, all backends that haven't read this message will be +notified that they have to reset their cache state. This is done +at the time when they try to read the message queue. + +The message queue is implemented as a shared buffer segment. Actually, +the queue is circular, to allow fast insertion, reading (of invalidation +data) and maintenance of the buffer. + +Access to this shared message buffer is synchronized by the lock manager. +The lock manager treats the buffer as a regular relation and sets +relation level locks (with mode = LockWait) to block backends while +another backend is writing or reading the buffer. The identifiers used +for this special 'relation' are database id = 0 and relation id = 0.
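To make the overflow rule described above concrete, here is a deliberately simplified toy model (not the actual sinvaladt.c structures) of such a queue: a fixed-size ring of messages plus one read pointer per backend, where a reader that has fallen more than a full ring behind has lost messages and must reset its cache state instead of reading.

    #define RINGSIZE  64
    #define NBACKENDS  8

    typedef struct {
        int  msgs[RINGSIZE];
        long head;                      /* total number of messages ever registered */
        long readptr[NBACKENDS];        /* per backend: next message to read */
    } ToyInvalRing;

    static void
    toy_register(ToyInvalRing *q, int msg)
    {
        q->msgs[q->head % RINGSIZE] = msg;      /* may overwrite an unread slot */
        q->head++;
    }

    /* returns 1 if a message was read, 0 if caught up,
     * -1 if messages were lost and the backend must reset its cache */
    static int
    toy_read(ToyInvalRing *q, int backend, int *msg)
    {
        if (q->readptr[backend] == q->head)
            return 0;
        if (q->head - q->readptr[backend] > RINGSIZE) {
            q->readptr[backend] = q->head;      /* skip ahead, caller resets its cache */
            return -1;
        }
        *msg = q->msgs[q->readptr[backend] % RINGSIZE];
        q->readptr[backend]++;
        return 1;
    }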
+ +The current implementation prints regular (e)log information +when a message has been removed from the buffer because the buffer +is full, and a backend has to reset its cache state. The elog level +is NOTICE. This can be used to improve the behavior of backends +when invalidating or resetting their cache state. + + diff --git a/src/backend/storage/ipc/ipc.c b/src/backend/storage/ipc/ipc.c new file mode 100644 index 00000000000..306300b90c3 --- /dev/null +++ b/src/backend/storage/ipc/ipc.c @@ -0,0 +1,718 @@ +/*------------------------------------------------------------------------- + * + * ipc.c-- + * POSTGRES inter-process communication definitions. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/ipc/ipc.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $ + * + * NOTES + * + * Currently, semaphores are used (my understanding anyway) in two + * different ways: + * 1. as mutexes on machines that don't have test-and-set (eg. + * mips R3000). + * 2. for putting processes to sleep when waiting on a lock + * and waking them up when the lock is free. + * The number of semaphores in (1) is fixed and those are shared + * among all backends. In (2), there is 1 semaphore per process and those + * are not shared with anyone else. + * -ay 4/95 + * + *------------------------------------------------------------------------- + */ +#include <sys/types.h> +#include <sys/file.h> +#include <stdio.h> +#include <errno.h> + +/* XXX - the following dependency should be moved into the defaults.mk file */ +#ifndef _IPC_ +#define _IPC_ +#include <sys/ipc.h> +#include <sys/sem.h> +#include <sys/shm.h> +#endif + +#include "storage/ipc.h" +#include "utils/memutils.h" +#include "utils/elog.h" + +#if defined(PORTNAME_bsd44) +int UsePrivateMemory = 1; +#else +int UsePrivateMemory = 0; +#endif + +#if defined(PORTNAME_bsdi) +/* hacka, hacka, hacka (XXX) */ +union semun { + int val; /* value for SETVAL */ + struct semid_ds *buf; /* buffer for IPC_STAT & IPC_SET */ + ushort *array; /* array for GETALL & SETALL */ +}; +#endif + + +/* ---------------------------------------------------------------- + * exit() handling stuff + * ---------------------------------------------------------------- + */ + +#define MAX_ON_EXITS 20 + +static struct ONEXIT { + void (*function)(); + caddr_t arg; +} onexit_list[ MAX_ON_EXITS ]; + +static int onexit_index; + +typedef struct _PrivateMemStruct { + int id; + char *memptr; +} PrivateMem; + +PrivateMem IpcPrivateMem[16]; + +static int +PrivateMemoryCreate(IpcMemoryKey memKey, + uint32 size) +{ + static int memid = 0; + + UsePrivateMemory = 1; + + IpcPrivateMem[memid].id = memid; + IpcPrivateMem[memid].memptr = malloc(size); + if (IpcPrivateMem[memid].memptr == NULL) + elog(WARN, "PrivateMemoryCreate: not enough memory to malloc"); + memset(IpcPrivateMem[memid].memptr, 0, size); /* XXX PURIFY */ + + return (memid++); +} + +static char * +PrivateMemoryAttach(IpcMemoryId memid) +{ + return ( IpcPrivateMem[memid].memptr ); +} + + +/* ---------------------------------------------------------------- + * exitpg + * + * this function calls all the callbacks registered + * for it (to free resources) and then calls exit. + * This should be the only function to call exit().
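The exit-handling machinery just above is a small fixed table of (function, argument) pairs that exitpg() runs last-registered-first before finally calling exit(). A standalone sketch of the same pattern (every name here is made up; the real registration entry point is on_exitpg further down):

    #include <stdio.h>
    #include <stdlib.h>

    #define MAX_CALLBACKS 20

    static struct {
        void (*function)(int code, void *arg);
        void *arg;
    } callback_list[MAX_CALLBACKS];
    static int callback_index = 0;

    static int
    register_exit_callback(void (*function)(int, void *), void *arg)
    {
        if (callback_index >= MAX_CALLBACKS)
            return -1;                          /* table is full, caller must cope */
        callback_list[callback_index].function = function;
        callback_list[callback_index].arg = arg;
        callback_index++;
        return 0;
    }

    static void
    do_exit(int code)
    {
        int i;

        for (i = callback_index - 1; i >= 0; i--)       /* newest first */
            (*callback_list[i].function)(code, callback_list[i].arg);
        exit(code);
    }

    static void
    say_goodbye(int code, void *arg)
    {
        printf("exiting with %d: %s\n", code, (char *) arg);
    }

    int
    main()
    {
        register_exit_callback(say_goodbye, "releasing resources");
        do_exit(0);
        /* not reached */
    }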
+ * -cim 2/6/90 + * ---------------------------------------------------------------- + */ +static int exitpg_inprogress = 0; + +void +exitpg(int code) +{ + int i; + + /* ---------------- + * if exitpg_inprocess is true, then it means that we + * are being invoked from within an on_exit() handler + * and so we return immediately to avoid recursion. + * ---------------- + */ + if (exitpg_inprogress) + return; + + exitpg_inprogress = 1; + + /* ---------------- + * call all the callbacks registered before calling exit(). + * ---------------- + */ + for (i = onexit_index - 1; i >= 0; --i) + (*onexit_list[i].function)(code, onexit_list[i].arg); + + exit(code); +} + +/* ------------------ + * Run all of the on_exitpg routines but don't exit in the end. + * This is used by the postmaster to re-initialize shared memory and + * semaphores after a backend dies horribly + * ------------------ + */ +void +quasi_exitpg() +{ + int i; + + /* ---------------- + * if exitpg_inprocess is true, then it means that we + * are being invoked from within an on_exit() handler + * and so we return immediately to avoid recursion. + * ---------------- + */ + if (exitpg_inprogress) + return; + + exitpg_inprogress = 1; + + /* ---------------- + * call all the callbacks registered before calling exit(). + * ---------------- + */ + for (i = onexit_index - 1; i >= 0; --i) + (*onexit_list[i].function)(0, onexit_list[i].arg); + + onexit_index = 0; + exitpg_inprogress = 0; +} + +/* ---------------------------------------------------------------- + * on_exitpg + * + * this function adds a callback function to the list of + * functions invoked by exitpg(). -cim 2/6/90 + * ---------------------------------------------------------------- + */ +int +on_exitpg(void (*function)(), caddr_t arg) +{ + if (onexit_index >= MAX_ON_EXITS) + return(-1); + + onexit_list[ onexit_index ].function = function; + onexit_list[ onexit_index ].arg = arg; + + ++onexit_index; + + return(0); +} + +/****************************************************************************/ +/* IPCPrivateSemaphoreKill(status, semId) */ +/* */ +/****************************************************************************/ +static void +IPCPrivateSemaphoreKill(int status, + int semId) /* caddr_t */ +{ + union semun semun; + semctl(semId, 0, IPC_RMID, semun); +} + + +/****************************************************************************/ +/* IPCPrivateMemoryKill(status, shmId) */ +/* */ +/****************************************************************************/ +static void +IPCPrivateMemoryKill(int status, + int shmId) /* caddr_t */ +{ + if ( UsePrivateMemory ) { + /* free ( IpcPrivateMem[shmId].memptr ); */ + } else { + if (shmctl(shmId, IPC_RMID, (struct shmid_ds *) NULL) < 0) { + elog(NOTICE, "IPCPrivateMemoryKill: shmctl(%d, %d, 0) failed: %m", + shmId, IPC_RMID); + } + } +} + + +/****************************************************************************/ +/* IpcSemaphoreCreate(semKey, semNum, permission, semStartValue) */ +/* */ +/* - returns a semaphore identifier: */ +/* */ +/* if key doesn't exist: return a new id, status:= IpcSemIdNotExist */ +/* if key exists: return the old id, status:= IpcSemIdExist */ +/* if semNum > MAX : return # of argument, status:=IpcInvalidArgument */ +/* */ +/****************************************************************************/ + +/* + * Note: + * XXX This should be split into two different calls. One should + * XXX be used to create a semaphore set. The other to "attach" a + * XXX existing set. 
It should be an error for the semaphore set + * XXX to to already exist or for it not to, respectively. + * + * Currently, the semaphore sets are "attached" and an error + * is detected only when a later shared memory attach fails. + */ + +IpcSemaphoreId +IpcSemaphoreCreate(IpcSemaphoreKey semKey, + int semNum, + int permission, + int semStartValue, + int removeOnExit, + int *status) +{ + int i; + int errStatus; + int semId; + u_short array[IPC_NMAXSEM]; + union semun semun; + + /* get a semaphore if non-existent */ + /* check arguments */ + if (semNum > IPC_NMAXSEM || semNum <= 0) { + *status = IpcInvalidArgument; + return(2); /* returns the number of the invalid argument */ + } + + semId = semget(semKey, 0, 0); + + if (semId == -1) { + *status = IpcSemIdNotExist; /* there doesn't exist a semaphore */ +#ifdef DEBUG_IPC + fprintf(stderr,"calling semget with %d, %d , %d\n", + semKey, + semNum, + IPC_CREAT|permission ); +#endif + semId = semget(semKey, semNum, IPC_CREAT|permission); + + if (semId < 0) { + perror("semget"); + exitpg(3); + } + for (i = 0; i < semNum; i++) { + array[i] = semStartValue; + } + semun.array = array; + errStatus = semctl(semId, 0, SETALL, semun); + if (errStatus == -1) { + perror("semctl"); + } + + if (removeOnExit) + on_exitpg(IPCPrivateSemaphoreKill, (caddr_t)semId); + + } else { + /* there is a semaphore id for this key */ + *status = IpcSemIdExist; + } + +#ifdef DEBUG_IPC + fprintf(stderr,"\nIpcSemaphoreCreate, status %d, returns %d\n", + *status, + semId ); + fflush(stdout); + fflush(stderr); +#endif + return(semId); +} + + +/****************************************************************************/ +/* IpcSemaphoreSet() - sets the initial value of the semaphore */ +/* */ +/* note: the xxx_return variables are only used for debugging. */ +/****************************************************************************/ +static int IpcSemaphoreSet_return; + +void +IpcSemaphoreSet(int semId, int semno, int value) +{ + int errStatus; + union semun semun; + + semun.val = value; + errStatus = semctl(semId, semno, SETVAL, semun); + IpcSemaphoreSet_return = errStatus; + + if (errStatus == -1) + perror("semctl"); +} + +/****************************************************************************/ +/* IpcSemaphoreKill(key) - removes a semaphore */ +/* */ +/****************************************************************************/ +void +IpcSemaphoreKill(IpcSemaphoreKey key) +{ + int semId; + union semun semun; + + /* kill semaphore if existent */ + + semId = semget(key, 0, 0); + if (semId != -1) + semctl(semId, 0, IPC_RMID, semun); +} + +/****************************************************************************/ +/* IpcSemaphoreLock(semId, sem, lock) - locks a semaphore */ +/* */ +/* note: the xxx_return variables are only used for debugging. */ +/****************************************************************************/ +static int IpcSemaphoreLock_return; + +void +IpcSemaphoreLock(IpcSemaphoreId semId, int sem, int lock) +{ + extern int errno; + int errStatus; + struct sembuf sops; + + sops.sem_op = lock; + sops.sem_flg = 0; + sops.sem_num = sem; + + /* ---------------- + * Note: if errStatus is -1 and errno == EINTR then it means we + * returned from the operation prematurely because we were + * sent a signal. So we try and lock the semaphore again. 
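The retry loop described here, combined with the constants in ipc.h (a start value of 255, IpcSharedLock = -1, IpcExclusiveLock = -255), is how one counting semaphore provides both shared and exclusive locks: each sharer subtracts 1, an exclusive locker subtracts 255 and therefore blocks until every sharer has released. A standalone sketch of that arithmetic with the same EINTR handling (illustrative only, error checking trimmed):

    #include <errno.h>
    #include <sys/types.h>
    #include <sys/ipc.h>
    #include <sys/sem.h>

    #define SHARED_LOCK     (-1)        /* cf. IpcSharedLock */
    #define EXCLUSIVE_LOCK  (-255)      /* cf. IpcExclusiveLock, start value 255 */

    /* adjust semaphore 0 by 'amount', retrying if a signal interrupts semop() */
    static void
    sem_adjust(int semId, int amount)
    {
        struct sembuf op;
        int rc;

        op.sem_num = 0;
        op.sem_op = amount;
        op.sem_flg = 0;
        do {
            rc = semop(semId, &op, 1);
        } while (rc == -1 && errno == EINTR);
    }

    /* sem_adjust(semId, SHARED_LOCK) takes a shared lock (up to 255 sharers);
     * sem_adjust(semId, EXCLUSIVE_LOCK) blocks until the value is back at 255,
     * i.e. until no holder remains; unlocking passes the negated amount,
     * exactly as IpcSemaphoreUnlock does with sops.sem_op = -lock. */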
+ * I am not certain this is correct, but the semantics aren't + * clear it fixes problems with parallel abort synchronization, + * namely that after processing an abort signal, the semaphore + * call returns with -1 (and errno == EINTR) before it should. + * -cim 3/28/90 + * ---------------- + */ + do { + errStatus = semop(semId, &sops, 1); + } while (errStatus == -1 && errno == EINTR); + + IpcSemaphoreLock_return = errStatus; + + if (errStatus == -1) { + perror("semop"); + exitpg(255); + } +} + +/****************************************************************************/ +/* IpcSemaphoreUnlock(semId, sem, lock) - unlocks a semaphore */ +/* */ +/* note: the xxx_return variables are only used for debugging. */ +/****************************************************************************/ +static int IpcSemaphoreUnlock_return; + +void +IpcSemaphoreUnlock(IpcSemaphoreId semId, int sem, int lock) +{ + extern int errno; + int errStatus; + struct sembuf sops; + + sops.sem_op = -lock; + sops.sem_flg = 0; + sops.sem_num = sem; + + + /* ---------------- + * Note: if errStatus is -1 and errno == EINTR then it means we + * returned from the operation prematurely because we were + * sent a signal. So we try and lock the semaphore again. + * I am not certain this is correct, but the semantics aren't + * clear it fixes problems with parallel abort synchronization, + * namely that after processing an abort signal, the semaphore + * call returns with -1 (and errno == EINTR) before it should. + * -cim 3/28/90 + * ---------------- + */ + do { + errStatus = semop(semId, &sops, 1); + } while (errStatus == -1 && errno == EINTR); + + IpcSemaphoreUnlock_return = errStatus; + + if (errStatus == -1) { + perror("semop"); + exitpg(255); + } +} + +int +IpcSemaphoreGetCount(IpcSemaphoreId semId, int sem) +{ + int semncnt; + union semun dummy; /* for Solaris */ + + semncnt = semctl(semId, sem, GETNCNT, dummy); + return semncnt; +} + +int +IpcSemaphoreGetValue(IpcSemaphoreId semId, int sem) +{ + int semval; + union semun dummy; /* for Solaris */ + + semval = semctl(semId, sem, GETVAL, dummy); + return semval; +} + +/****************************************************************************/ +/* IpcMemoryCreate(memKey) */ +/* */ +/* - returns the memory identifier, if creation succeeds */ +/* returns IpcMemCreationFailed, if failure */ +/****************************************************************************/ + +IpcMemoryId +IpcMemoryCreate(IpcMemoryKey memKey, uint32 size, int permission) +{ + IpcMemoryId shmid; + + if (memKey == PrivateIPCKey) { + /* private */ + shmid = PrivateMemoryCreate(memKey, size); + }else { + shmid = shmget(memKey, size, IPC_CREAT|permission); + } + + if (shmid < 0) { + fprintf(stderr,"IpcMemoryCreate: memKey=%d , size=%d , permission=%d", + memKey, size , permission ); + perror("IpcMemoryCreate: shmget(..., create, ...) 
failed"); + return(IpcMemCreationFailed); + } + + /* if (memKey == PrivateIPCKey) */ + on_exitpg(IPCPrivateMemoryKill, (caddr_t)shmid); + + return(shmid); +} + +/****************************************************************************/ +/* IpcMemoryIdGet(memKey, size) returns the shared memory Id */ +/* or IpcMemIdGetFailed */ +/****************************************************************************/ +IpcMemoryId +IpcMemoryIdGet(IpcMemoryKey memKey, uint32 size) +{ + IpcMemoryId shmid; + + shmid = shmget(memKey, size, 0); + + if (shmid < 0) { + fprintf(stderr,"IpcMemoryIdGet: memKey=%d , size=%d , permission=%d", + memKey, size , 0 ); + perror("IpcMemoryIdGet: shmget() failed"); + return(IpcMemIdGetFailed); + } + + return(shmid); +} + +/****************************************************************************/ +/* IpcMemoryDetach(status, shmaddr) removes a shared memory segment */ +/* from a backend address space */ +/* (only called by backends running under the postmaster) */ +/****************************************************************************/ +void +IpcMemoryDetach(int status, char *shmaddr) +{ + if (shmdt(shmaddr) < 0) { + elog(NOTICE, "IpcMemoryDetach: shmdt(0x%x): %m", shmaddr); + } +} + +/****************************************************************************/ +/* IpcMemoryAttach(memId) returns the adress of shared memory */ +/* or IpcMemAttachFailed */ +/* */ +/* CALL IT: addr = (struct <MemoryStructure> *) IpcMemoryAttach(memId); */ +/* */ +/****************************************************************************/ +char * +IpcMemoryAttach(IpcMemoryId memId) +{ + char *memAddress; + + if (UsePrivateMemory) { + memAddress = (char *) PrivateMemoryAttach(memId); + } else { + memAddress = (char *) shmat(memId, 0, 0); + } + + /* if ( *memAddress == -1) { XXX ??? */ + if ( memAddress == (char *)-1) { + perror("IpcMemoryAttach: shmat() failed"); + return(IpcMemAttachFailed); + } + + if (!UsePrivateMemory) + on_exitpg(IpcMemoryDetach, (caddr_t) memAddress); + + return((char *) memAddress); +} + + +/****************************************************************************/ +/* IpcMemoryKill(memKey) removes a shared memory segment */ +/* (only called by the postmaster and standalone backends) */ +/****************************************************************************/ +void +IpcMemoryKill(IpcMemoryKey memKey) +{ + IpcMemoryId shmid; + + if (!UsePrivateMemory && (shmid = shmget(memKey, 0, 0)) >= 0) { + if (shmctl(shmid, IPC_RMID, (struct shmid_ds *) NULL) < 0) { + elog(NOTICE, "IpcMemoryKill: shmctl(%d, %d, 0) failed: %m", + shmid, IPC_RMID); + } + } +} + +#ifdef HAS_TEST_AND_SET +/* ------------------ + * use hardware locks to replace semaphores for sequent machines + * to avoid costs of swapping processes and to provide unlimited + * supply of locks. 
+ * ------------------ + */ +static SLock *SLockArray = NULL; +static SLock **FreeSLockPP; +static int *UnusedSLockIP; +static slock_t *SLockMemoryLock; +static IpcMemoryId SLockMemoryId = -1; + +struct ipcdummy { /* to get alignment/size right */ + SLock *free; + int unused; + slock_t memlock; + SLock slocks[NSLOCKS]; +}; +static int SLockMemorySize = sizeof(struct ipcdummy); + +void +CreateAndInitSLockMemory(IPCKey key) +{ + int id; + SLock *slckP; + + SLockMemoryId = IpcMemoryCreate(key, + SLockMemorySize, + 0700); + AttachSLockMemory(key); + *FreeSLockPP = NULL; + *UnusedSLockIP = (int)FIRSTFREELOCKID; + for (id=0; id<(int)FIRSTFREELOCKID; id++) { + slckP = &(SLockArray[id]); + S_INIT_LOCK(&(slckP->locklock)); + slckP->flag = NOLOCK; + slckP->nshlocks = 0; + S_INIT_LOCK(&(slckP->shlock)); + S_INIT_LOCK(&(slckP->exlock)); + S_INIT_LOCK(&(slckP->comlock)); + slckP->next = NULL; + } + return; +} + +void +AttachSLockMemory(IPCKey key) +{ + struct ipcdummy *slockM; + + if (SLockMemoryId == -1) + SLockMemoryId = IpcMemoryIdGet(key,SLockMemorySize); + if (SLockMemoryId == -1) + elog(FATAL, "SLockMemory not in shared memory"); + slockM = (struct ipcdummy *) IpcMemoryAttach(SLockMemoryId); + if (slockM == IpcMemAttachFailed) + elog(FATAL, "AttachSLockMemory: could not attach segment"); + FreeSLockPP = (SLock **) &(slockM->free); + UnusedSLockIP = (int *) &(slockM->unused); + SLockMemoryLock = (slock_t *) &(slockM->memlock); + S_INIT_LOCK(SLockMemoryLock); + SLockArray = (SLock *) &(slockM->slocks[0]); + return; +} + + +#ifdef LOCKDEBUG +#define PRINT_LOCK(LOCK) printf("(locklock = %d, flag = %d, nshlocks = %d, \ +shlock = %d, exlock =%d)\n", LOCK->locklock, \ + LOCK->flag, LOCK->nshlocks, LOCK->shlock, \ + LOCK->exlock) +#endif + +void +ExclusiveLock(int lockid) +{ + SLock *slckP; + slckP = &(SLockArray[lockid]); +#ifdef LOCKDEBUG + printf("ExclusiveLock(%d)\n", lockid); + printf("IN: "); + PRINT_LOCK(slckP); +#endif + ex_try_again: + S_LOCK(&(slckP->locklock)); + switch (slckP->flag) { + case NOLOCK: + slckP->flag = EXCLUSIVELOCK; + S_LOCK(&(slckP->exlock)); + S_LOCK(&(slckP->shlock)); + S_UNLOCK(&(slckP->locklock)); +#ifdef LOCKDEBUG + printf("OUT: "); + PRINT_LOCK(slckP); +#endif + return; + case SHAREDLOCK: + case EXCLUSIVELOCK: + S_UNLOCK(&(slckP->locklock)); + S_LOCK(&(slckP->exlock)); + S_UNLOCK(&(slckP->exlock)); + goto ex_try_again; + } +} + +void +ExclusiveUnlock(int lockid) +{ + SLock *slckP; + + slckP = &(SLockArray[lockid]); +#ifdef LOCKDEBUG + printf("ExclusiveUnlock(%d)\n", lockid); + printf("IN: "); + PRINT_LOCK(slckP); +#endif + S_LOCK(&(slckP->locklock)); + /* ------------- + * give favor to read processes + * ------------- + */ + slckP->flag = NOLOCK; + if (slckP->nshlocks > 0) { + while (slckP->nshlocks > 0) { + S_UNLOCK(&(slckP->shlock)); + S_LOCK(&(slckP->comlock)); + } + S_UNLOCK(&(slckP->shlock)); + } + else { + S_UNLOCK(&(slckP->shlock)); + } + S_UNLOCK(&(slckP->exlock)); + S_UNLOCK(&(slckP->locklock)); +#ifdef LOCKDEBUG + printf("OUT: "); + PRINT_LOCK(slckP); +#endif + return; +} + +bool +LockIsFree(int lockid) +{ + return(SLockArray[lockid].flag == NOLOCK); +} + +#endif /* HAS_TEST_AND_SET */ diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c new file mode 100644 index 00000000000..18d3cccd0ee --- /dev/null +++ b/src/backend/storage/ipc/ipci.c @@ -0,0 +1,149 @@ +/*------------------------------------------------------------------------- + * + * ipci.c-- + * POSTGRES inter-process communication initialization code. 
+ * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/ipc/ipci.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include "c.h" + +#include "storage/ipc.h" +#include "storage/multilev.h" +#include "utils/elog.h" +#include "storage/sinval.h" +#include "storage/bufmgr.h" +#include "storage/proc.h" +#include "storage/smgr.h" +#include "storage/lock.h" +#include "miscadmin.h" /* for DebugLvl */ + +/* + * SystemPortAddressCreateMemoryKey -- + * Returns a memory key given a port address. + */ +IPCKey +SystemPortAddressCreateIPCKey(SystemPortAddress address) +{ + Assert(address < 32768); /* XXX */ + + return (SystemPortAddressGetIPCKey(address)); +} + +/* + * CreateSharedMemoryAndSemaphores -- + * Creates and initializes shared memory and semaphores. + */ +/************************************************** + + CreateSharedMemoryAndSemaphores + is called exactly *ONCE* by the postmaster. + It is *NEVER* called by the postgres backend + + 0) destroy any existing semaphores for both buffer + and lock managers. + 1) create the appropriate *SHARED* memory segments + for the two resource managers. + + **************************************************/ + +void +CreateSharedMemoryAndSemaphores(IPCKey key) +{ + int size; + +#ifdef HAS_TEST_AND_SET + /* --------------- + * create shared memory for slocks + * -------------- + */ + CreateAndInitSLockMemory(IPCKeyGetSLockSharedMemoryKey(key)); +#endif + /* ---------------- + * kill and create the buffer manager buffer pool (and semaphore) + * ---------------- + */ + CreateSpinlocks(IPCKeyGetSpinLockSemaphoreKey(key)); + size = BufferShmemSize() + LockShmemSize(); + +#ifdef MAIN_MEMORY + size += MMShmemSize(); +#endif /* MAIN_MEMORY */ + + if (DebugLvl > 1) { + fprintf(stderr, "binding ShmemCreate(key=%x, size=%d)\n", + IPCKeyGetBufferMemoryKey(key), size); + } + ShmemCreate(IPCKeyGetBufferMemoryKey(key), size); + ShmemBindingTabReset(); + InitShmem(key, size); + InitBufferPool(key); + + /* ---------------- + * do the lock table stuff + * ---------------- + */ + InitLocks(); + InitMultiLevelLockm(); + if (InitMultiLevelLockm() == INVALID_TABLEID) + elog(FATAL, "Couldn't create the lock table"); + + /* ---------------- + * do process table stuff + * ---------------- + */ + InitProcGlobal(key); + on_exitpg(ProcFreeAllSemaphores, 0); + + CreateSharedInvalidationState(key); +} + + +/* + * AttachSharedMemoryAndSemaphores -- + * Attachs existant shared memory and semaphores. 
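CreateSharedMemoryAndSemaphores() above fans the single key it is handed out into per-subsystem keys with the IPCKeyGet* macros from ipc.h. A small hypothetical illustration of that fan-out (assuming storage/ipc.h; DefaultIPCKey is 17317):

    #include <stdio.h>
    #include "storage/ipc.h"

    int
    main()
    {
        IPCKey key = DefaultIPCKey;     /* 17317 */

        printf("slock segment key:        %u\n", IPCKeyGetSLockSharedMemoryKey(key)); /* 17327 */
        printf("spinlock semaphore key:   %u\n", IPCKeyGetSpinLockSemaphoreKey(key)); /* 17328 */
        printf("buffer/lock segment key:  %u\n", IPCKeyGetBufferMemoryKey(key));      /* 17318 */
        printf("per-backend sema base:    %u\n", IPCGetProcessSemaphoreInitKey(key)); /* 17331 */
        return 0;
    }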
+ */ +void +AttachSharedMemoryAndSemaphores(IPCKey key) +{ + int size; + + /* ---------------- + * create rather than attach if using private key + * ---------------- + */ + if (key == PrivateIPCKey) { + CreateSharedMemoryAndSemaphores(key); + return; + } + +#ifdef HAS_TEST_AND_SET + /* ---------------- + * attach the slock shared memory + * ---------------- + */ + AttachSLockMemory(IPCKeyGetSLockSharedMemoryKey(key)); +#endif + /* ---------------- + * attach the buffer manager buffer pool (and semaphore) + * ---------------- + */ + size = BufferShmemSize() + LockShmemSize(); + InitShmem(key, size); + InitBufferPool(key); + + /* ---------------- + * initialize lock table stuff + * ---------------- + */ + InitLocks(); + if (InitMultiLevelLockm() == INVALID_TABLEID) + elog(FATAL, "Couldn't attach to the lock table"); + + AttachSharedInvalidationState(key); +} diff --git a/src/backend/storage/ipc/s_lock.c b/src/backend/storage/ipc/s_lock.c new file mode 100644 index 00000000000..3cbe796fc59 --- /dev/null +++ b/src/backend/storage/ipc/s_lock.c @@ -0,0 +1,440 @@ +/*------------------------------------------------------------------------- + * + * s_lock.c-- + * This file contains the implementation (if any) for spinlocks. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/ipc/Attic/s_lock.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +/* + * DESCRIPTION + * The following code fragment should be written (in assembly + * language) on machines that have a native test-and-set instruction: + * + * void + * S_LOCK(char_address) + * char *char_address; + * { + * while (test_and_set(char_address)) + * ; + * } + * + * If this is not done, POSTGRES will default to using System V + * semaphores (and take a large performance hit -- around 40% of + * its time on a DS5000/240 is spent in semop(3)...). + * + * NOTES + * AIX has a test-and-set but the recommended interface is the cs(3) + * system call. This provides an 8-instruction (plus system call + * overhead) uninterruptible compare-and-set operation. True + * spinlocks might be faster but using cs(3) still speeds up the + * regression test suite by about 25%. I don't have an assembler + * manual for POWER in any case. + * + */ +#ifdef WIN32 +#include <windows.h> +#endif /* WIN32 */ +#include "storage/ipc.h" + + +#if defined(HAS_TEST_AND_SET) + +#if defined (PORTNAME_next) +/* + * NEXTSTEP (mach) + * slock_t is defined as a struct mutex. + */ +void +S_LOCK(slock_t *lock) +{ + mutex_lock(lock); +} +void +S_UNLOCK(slock_t *lock) +{ + mutex_unlock(lock); +} +void +S_INIT_LOCK(slock_t *lock) +{ + mutex_init(lock); +} + + /* S_LOCK_FREE should return 1 if lock is free; 0 if lock is locked */ +int + S_LOCK_FREE(slock_t *lock) +{ + /* For Mach, we have to delve inside the entrails of `struct +mutex'. Ick! */ + return (lock->lock == 0); +} + +#endif /* PORTNAME_next */ + + + +#if defined(PORTNAME_irix5) +/* + * SGI IRIX 5 + * slock_t is defined as a struct abilock_t, which has a single unsigned long + * member. + * + * This stuff may be supplemented in the future with Masato Kataoka's MIPS-II + * assembly from his NECEWS SVR4 port, but we probably ought to retain this + * for the R3000 chips out there. 
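Whatever the per-port implementation looks like, callers see only the three-call interface declared in ipc.h. A brief usage sketch, assuming a port with HAS_TEST_AND_SET and that in real use both the lock and the data it protects live in shared memory (the counter here is hypothetical):

    #include "storage/ipc.h"

    static slock_t counter_lock;        /* would be allocated in shared memory */
    static int     counter;

    void
    counter_init(void)
    {
        S_INIT_LOCK(&counter_lock);     /* lock starts out free */
        counter = 0;
    }

    void
    counter_bump(void)
    {
        S_LOCK(&counter_lock);          /* busy-waits until the lock is free */
        counter++;                      /* keep the critical section short */
        S_UNLOCK(&counter_lock);
    }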
+ */ +void +S_LOCK(slock_t *lock) +{ + /* spin_lock(lock); */ + while (!acquire_lock(lock)) + ; +} + +void +S_UNLOCK(slock_t *lock) +{ + (void)release_lock(lock); +} + +void +S_INIT_LOCK(slock_t *lock) +{ + (void)init_lock(lock); +} + +/* S_LOCK_FREE should return 1 if lock is free; 0 if lock is locked */ +int +S_LOCK_FREE(slock_t *lock) +{ + return(stat_lock(lock)==UNLOCKED); +} + +#endif /* PORTNAME_irix5 */ + + +/* + * OSF/1 (Alpha AXP) + * + * Note that slock_t on the Alpha AXP is msemaphore instead of char + * (see storage/ipc.h). + */ + +#if defined(PORTNAME_alpha) + +void +S_LOCK(slock_t *lock) +{ + while (msem_lock(lock, MSEM_IF_NOWAIT) < 0) + ; +} + +void +S_UNLOCK(slock_t *lock) +{ + (void) msem_unlock(lock, 0); +} + +void +S_INIT_LOCK(slock_t *lock) +{ + (void) msem_init(lock, MSEM_UNLOCKED); +} + +int +S_LOCK_FREE(slock_t *lock) +{ + return(lock->msem_state ? 0 : 1); +} + +#endif /* PORTNAME_alpha */ + +/* + * Solaris 2 + */ + +#if defined(PORTNAME_sparc_solaris) + +/* defined in port/.../tas.s */ +extern int tas(slock_t *lock); + +void +S_LOCK(slock_t *lock) +{ + while (tas(lock)) + ; +} + +void +S_UNLOCK(slock_t *lock) +{ + *lock = 0; +} + +void +S_INIT_LOCK(slock_t *lock) +{ + S_UNLOCK(lock); +} + +#endif /* PORTNAME_sparc_solaris */ + +/* + * AIX (POWER) + * + * Note that slock_t on POWER/POWER2/PowerPC is int instead of char + * (see storage/ipc.h). + */ + +#if defined(PORTNAME_aix) + +void +S_LOCK(slock_t *lock) +{ + while (cs((int *) lock, 0, 1)) + ; +} + +void +S_UNLOCK(slock_t *lock) +{ + *lock = 0; +} + +void +S_INIT_LOCK(slock_t *lock) +{ + S_UNLOCK(lock); +} + +#endif /* PORTNAME_aix */ + +/* + * HP-UX (PA-RISC) + * + * Note that slock_t on PA-RISC is a structure instead of char + * (see storage/ipc.h). + */ + +#if defined(PORTNAME_hpux) + +/* defined in port/.../tas.s */ +extern int tas(slock_t *lock); + +/* +* a "set" slock_t has a single word cleared. a "clear" slock_t has +* all words set to non-zero. +*/ +static slock_t clear_lock = { -1, -1, -1, -1 }; + +void +S_LOCK(slock_t *lock) +{ + while (tas(lock)) + ; +} + +void +S_UNLOCK(slock_t *lock) +{ + *lock = clear_lock; /* struct assignment */ +} + +void +S_INIT_LOCK(slock_t *lock) +{ + S_UNLOCK(lock); +} + +int +S_LOCK_FREE(slock_t *lock) +{ + register int *lock_word = (int *) (((long) lock + 15) & ~15); + + return(*lock_word != 0); +} + +#endif /* PORTNAME_hpux */ + +/* + * sun3 + */ + +#if (defined(sun) && ! defined(sparc)) + +void +S_LOCK(slock_t *lock) +{ + while (tas(lock)); +} + +void +S_UNLOCK(slock_t *lock) +{ + *lock = 0; +} + +void +S_INIT_LOCK(slock_t *lock) +{ + S_UNLOCK(lock); +} + +static int +tas_dummy() +{ + asm("LLA0:"); + asm(" .data"); + asm(" .text"); + asm("|#PROC# 04"); + asm(" .globl _tas"); + asm("_tas:"); + asm("|#PROLOGUE# 1"); + asm(" movel sp@(0x4),a0"); + asm(" tas a0@"); + asm(" beq LLA1"); + asm(" moveq #-128,d0"); + asm(" rts"); + asm("LLA1:"); + asm(" moveq #0,d0"); + asm(" rts"); + asm(" .data"); +} + +#endif + +/* + * SPARC (SunOS 4) + */ + +#if defined(PORTNAME_sparc) + +/* if we're using -ansi w/ gcc, use __asm__ instead of asm */ +#if defined(__STRICT_ANSI__) +#define asm(x) __asm__(x) +#endif + +static int +tas_dummy() +{ + asm(".seg \"data\""); + asm(".seg \"text\""); + asm(".global _tas"); + asm("_tas:"); + + /* + * Sparc atomic test and set (sparc calls it "atomic load-store") + */ + + asm("ldstub [%r8], %r8"); + + /* + * Did test and set actually do the set? + */ + + asm("tst %r8"); + + asm("be,a ReturnZero"); + + /* + * otherwise, just return. 
+ */ + + asm("clr %r8"); + asm("mov 0x1, %r8"); + asm("ReturnZero:"); + asm("retl"); + asm("nop"); +} + +void +S_LOCK(unsigned char *addr) +{ + while (tas(addr)); +} + + +/* + * addr should be as in the above S_LOCK routine + */ +void +S_UNLOCK(unsigned char *addr) +{ + *addr = 0; +} + +void +S_INIT_LOCK(unsigned char *addr) +{ + *addr = 0; +} + +#endif /* PORTNAME_sparc */ + +/* + * Linux and friends + */ + +#if defined(PORTNAME_linux) || defined(PORTNAME_BSD44_derived) + +int +tas(slock_t *m) +{ + slock_t res; + __asm__("xchgb %0,%1":"=q" (res),"=m" (*m):"0" (0x1)); + return(res); +} + +void +S_LOCK(slock_t *lock) +{ + while (tas(lock)) + ; +} + +void +S_UNLOCK(slock_t *lock) +{ + *lock = 0; +} + +void +S_INIT_LOCK(slock_t *lock) +{ + S_UNLOCK(lock); +} + +#endif /* PORTNAME_linux || PORTNAME_BSD44_derived */ + + +#endif /* HAS_TEST_AND_SET */ + + +#ifdef WIN32 +void +S_LOCK(HANDLE *lock) +{ + int x = 0; + x = x / x; +} + +void +S_UNLOCK(HANDLE *lock) +{ + int x = 0; + x = x / x; +} + +void +S_INIT_LOCK(HANDLE *lock) +{ + int x = 0; + x = x / x; +} +#endif /*WIN32*/ diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c new file mode 100644 index 00000000000..4eba3729ac8 --- /dev/null +++ b/src/backend/storage/ipc/shmem.c @@ -0,0 +1,561 @@ +/*------------------------------------------------------------------------- + * + * shmem.c-- + * create shared memory and initialize shared memory data structures. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/ipc/shmem.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +/* + * POSTGRES processes share one or more regions of shared memory. + * The shared memory is created by a postmaster and is "attached to" + * by each of the backends. The routines in this file are used for + * allocating and binding to shared memory data structures. + * + * NOTES: + * (a) There are three kinds of shared memory data structures + * available to POSTGRES: fixed-size structures, queues and hash + * tables. Fixed-size structures contain things like global variables + * for a module and should never be allocated after the process + * initialization phase. Hash tables have a fixed maximum size, but + * their actual size can vary dynamically. When entries are added + * to the table, more space is allocated. Queues link data structures + * that have been allocated either as fixed size structures or as hash + * buckets. Each shared data structure has a string name to identify + * it (assigned in the module that declares it). + * + * (b) During initialization, each module looks for its + * shared data structures in a hash table called the "Binding Table". + * If the data structure is not present, the caller can allocate + * a new one and initialize it. If the data structure is present, + * the caller "attaches" to the structure by initializing a pointer + * in the local address space. + * The binding table has two purposes: first, it gives us + * a simple model of how the world looks when a backend process + * initializes. If something is present in the binding table, + * it is initialized. If it is not, it is uninitialized. Second, + * the binding table allows us to allocate shared memory on demand + * instead of trying to preallocate structures and hard-wire the + * sizes and locations in header files. 
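From a module's point of view, the attach-or-create idiom described above looks roughly like the sketch below. The structure name and fields are hypothetical, and it assumes postgres.h, storage/shmem.h and elog() are available; ShmemInitStruct() itself appears later in this file.

    #include "postgres.h"
    #include "storage/shmem.h"
    #include "utils/elog.h"

    typedef struct {
        int nrequests;                  /* hypothetical module counters */
        int nhits;
    } MyModuleStats;

    static MyModuleStats *myStats;

    void
    MyModuleShmemInit(void)
    {
        bool found;

        /* look the structure up in the binding table; only allocate it
         * if no other backend has created it yet */
        myStats = (MyModuleStats *)
            ShmemInitStruct("My Module Stats", sizeof(MyModuleStats), &found);
        if (myStats == NULL)
            elog(FATAL, "MyModuleShmemInit: could not attach to shared memory");
        if (!found) {
            myStats->nrequests = 0;     /* first backend initializes the contents */
            myStats->nhits = 0;
        }
    }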
If you are using a lot + * of shared memory in a lot of different places (and changing + * things during development), this is important. + * + * (c) memory allocation model: shared memory can never be + * freed, once allocated. Each hash table has its own free list, + * so hash buckets can be reused when an item is deleted. However, + * if one hash table grows very large and then shrinks, its space + * cannot be redistributed to other tables. We could build a simple + * hash bucket garbage collector if need be. Right now, it seems + * unnecessary. + * + * See InitSem() in sem.c for an example of how to use the + * binding table. + * + */ +#include <stdio.h> +#include <string.h> +#include "postgres.h" +#include "storage/ipc.h" +#include "storage/shmem.h" +#include "storage/spin.h" +#include "utils/hsearch.h" +#include "utils/elog.h" + +/* shared memory global variables */ + +unsigned long ShmemBase = 0; /* start and end address of + * shared memory + */ +static unsigned long ShmemEnd = 0; +static unsigned long ShmemSize = 0; /* current size (and default) */ + +SPINLOCK ShmemLock; /* lock for shared memory allocation */ + +SPINLOCK BindingLock; /* lock for binding table access */ + +static unsigned long *ShmemFreeStart = NULL; /* pointer to the OFFSET of + * first free shared memory + */ +static unsigned long *ShmemBindingTabOffset = NULL; /* start of the binding + * table (for bootstrap) + */ +static int ShmemBootstrap = FALSE; /* flag becomes true when shared mem + * is created by POSTMASTER + */ + +static HTAB *BindingTable = NULL; + +/* --------------------- + * ShmemBindingTabReset() - Resets the binding table to NULL.... + * useful when the postmaster destroys existing shared memory + * and creates all new segments after a backend crash. + * ---------------------- + */ +void +ShmemBindingTabReset() +{ + BindingTable = (HTAB *)NULL; +} + +/* + * CreateSharedRegion() -- + * + * This routine is called once by the postmaster to + * initialize the shared buffer pool. Assume there is + * only one postmaster so no synchronization is necessary + * until after this routine completes successfully. + * + * key is a unique identifier for the shmem region. + * size is the size of the region. + */ +static IpcMemoryId ShmemId; + +void +ShmemCreate(unsigned int key, unsigned int size) +{ + if (size) + ShmemSize = size; + /* create shared mem region */ + if ((ShmemId=IpcMemoryCreate(key,ShmemSize,IPCProtection)) + ==IpcMemCreationFailed) { + elog(FATAL,"ShmemCreate: cannot create region"); + exit(1); + } + + /* ShmemBootstrap is true if shared memory has been + * created, but not yet initialized. Only the + * postmaster/creator-of-all-things should have + * this flag set. + */ + ShmemBootstrap = TRUE; +} + +/* + * InitShmem() -- map region into process address space + * and initialize shared data structures. 
+ * + */ +int +InitShmem(unsigned int key, unsigned int size) +{ + Pointer sharedRegion; + unsigned long currFreeSpace; + + HASHCTL info; + int hash_flags; + BindingEnt * result,item; + bool found; + IpcMemoryId shmid; + + /* if zero key, use default memory size */ + if (size) + ShmemSize = size; + + /* default key is 0 */ + + /* attach to shared memory region (SysV or BSD OS specific) */ + if (ShmemBootstrap && key == PrivateIPCKey) + /* if we are running backend alone */ + shmid = ShmemId; + else + shmid = IpcMemoryIdGet(IPCKeyGetBufferMemoryKey(key), ShmemSize); + sharedRegion = IpcMemoryAttach(shmid); + if (sharedRegion == NULL) { + elog(FATAL,"AttachSharedRegion: couldn't attach to shmem\n"); + return(FALSE); + } + + /* get pointers to the dimensions of shared memory */ + ShmemBase = (unsigned long) sharedRegion; + ShmemEnd = (unsigned long) sharedRegion + ShmemSize; + currFreeSpace = 0; + + /* First long in shared memory is the count of available space */ + ShmemFreeStart = (unsigned long *) ShmemBase; + /* next is a shmem pointer to the binding table */ + ShmemBindingTabOffset = ShmemFreeStart + 1; + + currFreeSpace += + sizeof(ShmemFreeStart) + sizeof(ShmemBindingTabOffset); + + /* bootstrap initialize spin locks so we can start to use the + * allocator and binding table. + */ + if (! InitSpinLocks(ShmemBootstrap, IPCKeyGetSpinLockSemaphoreKey(key))) { + return(FALSE); + } + + /* We have just allocated additional space for two spinlocks. + * Now setup the global free space count + */ + if (ShmemBootstrap) { + *ShmemFreeStart = currFreeSpace; + } + + /* if ShmemFreeStart is NULL, then the allocator won't work */ + Assert(*ShmemFreeStart); + + /* create OR attach to the shared memory binding table */ + info.keysize = BTABLE_KEYSIZE; + info.datasize = BTABLE_DATASIZE; + hash_flags = (HASH_ELEM); + + /* This will acquire the binding table lock, but not release it. */ + BindingTable = ShmemInitHash("BindingTable", + BTABLE_SIZE,BTABLE_SIZE, + &info,hash_flags); + + if (! BindingTable) { + elog(FATAL,"InitShmem: couldn't initialize Binding Table"); + return(FALSE); + } + + /* Now, check the binding table for an entry to the binding + * table. If there is an entry there, someone else created + * the table. Otherwise, we did and we have to initialize it. + */ + memset(item.key, 0, BTABLE_KEYSIZE); + strncpy(item.key,"BindingTable",BTABLE_KEYSIZE); + + result = (BindingEnt *) + hash_search(BindingTable,(char *) &item,HASH_ENTER, &found); + + + if (! result ) { + elog(FATAL,"InitShmem: corrupted binding table"); + return(FALSE); + } + + if (! found) { + /* bootstrapping shmem: we have to initialize the + * binding table now. + */ + + Assert(ShmemBootstrap); + result->location = MAKE_OFFSET(BindingTable->hctl); + *ShmemBindingTabOffset = result->location; + result->size = BTABLE_SIZE; + + ShmemBootstrap = FALSE; + + } else { + Assert(! ShmemBootstrap); + } + /* now release the lock acquired in ShmemHashInit */ + SpinRelease (BindingLock); + + Assert (result->location == MAKE_OFFSET(BindingTable->hctl)); + + return(TRUE); +} + +/* + * ShmemAlloc -- allocate word-aligned byte string from + * shared memory + * + * Assumes ShmemLock and ShmemFreeStart are initialized. + * Returns: real pointer to memory or NULL if we are out + * of space. Has to return a real pointer in order + * to be compatable with malloc(). + */ +long * +ShmemAlloc(unsigned long size) +{ + unsigned long tmpFree; + long *newSpace; + + /* + * ensure space is word aligned. + * + * Word-alignment is not good enough. 
We have to be more + * conservative: doubles need 8-byte alignment. (We probably only need + * this on RISC platforms but this is not a big waste of space.) + * - ay 12/94 + */ + if (size % sizeof(double)) + size += sizeof(double) - (size % sizeof(double)); + + Assert(*ShmemFreeStart); + + SpinAcquire(ShmemLock); + + tmpFree = *ShmemFreeStart + size; + if (tmpFree <= ShmemSize) { + newSpace = (long *)MAKE_PTR(*ShmemFreeStart); + *ShmemFreeStart += size; + } else { + newSpace = NULL; + } + + SpinRelease(ShmemLock); + + if (! newSpace) { + elog(NOTICE,"ShmemAlloc: out of memory "); + } + return(newSpace); +} + +/* + * ShmemIsValid -- test if an offset refers to valid shared memory + * + * Returns TRUE if the pointer is valid. + */ +int +ShmemIsValid(unsigned long addr) +{ + return ((addr<ShmemEnd) && (addr>=ShmemBase)); +} + +/* + * ShmemInitHash -- Create/Attach to and initialize + * shared memory hash table. + * + * Notes: + * + * assume caller is doing some kind of synchronization + * so that two people dont try to create/initialize the + * table at once. Use SpinAlloc() to create a spinlock + * for the structure before creating the structure itself. + */ +HTAB * +ShmemInitHash(char *name, /* table string name for binding */ + long init_size, /* initial size */ + long max_size, /* max size of the table */ + HASHCTL *infoP, /* info about key and bucket size */ + int hash_flags) /* info about infoP */ +{ + bool found; + long * location; + + /* shared memory hash tables have a fixed max size so that the + * control structures don't try to grow. The segbase is for + * calculating pointer values. The shared memory allocator + * must be specified. + */ + infoP->segbase = (long *) ShmemBase; + infoP->alloc = ShmemAlloc; + infoP->max_size = max_size; + hash_flags |= HASH_SHARED_MEM; + + /* look it up in the binding table */ + location = + ShmemInitStruct(name,my_log2(max_size) + sizeof(HHDR),&found); + + /* binding table is corrupted. Let someone else give the + * error message since they have more information + */ + if (location == NULL) { + return(0); + } + + /* it already exists, attach to it rather than allocate and + * initialize new space + */ + if (found) { + hash_flags |= HASH_ATTACH; + } + + /* these structures were allocated or bound in ShmemInitStruct */ + /* control information and parameters */ + infoP->hctl = (long *) location; + /* directory for hash lookup */ + infoP->dir = (long *) (location + sizeof(HHDR)); + + return(hash_create(init_size, infoP, hash_flags));; +} + +/* + * ShmemPIDLookup -- lookup process data structure using process id + * + * Returns: TRUE if no error. locationPtr is initialized if PID is + * found in the binding table. + * + * NOTES: + * only information about success or failure is the value of + * locationPtr. + */ +bool +ShmemPIDLookup(int pid, SHMEM_OFFSET* locationPtr) +{ + BindingEnt * result,item; + bool found; + + Assert (BindingTable); + memset(item.key, 0, BTABLE_KEYSIZE); + sprintf(item.key,"PID %d",pid); + + SpinAcquire(BindingLock); + result = (BindingEnt *) + hash_search(BindingTable,(char *) &item, HASH_ENTER, &found); + + if (! 
result) { + + SpinRelease(BindingLock); + elog(WARN,"ShmemInitPID: BindingTable corrupted"); + return(FALSE); + + } + + if (found) { + *locationPtr = result->location; + } else { + result->location = *locationPtr; + } + + SpinRelease(BindingLock); + return (TRUE); +} + +/* + * ShmemPIDDestroy -- destroy binding table entry for process + * using process id + * + * Returns: offset of the process struct in shared memory or + * INVALID_OFFSET if not found. + * + * Side Effect: removes the entry from the binding table + */ +SHMEM_OFFSET +ShmemPIDDestroy(int pid) +{ + BindingEnt * result,item; + bool found; + SHMEM_OFFSET location; + + Assert(BindingTable); + + memset(item.key, 0, BTABLE_KEYSIZE); + sprintf(item.key,"PID %d",pid); + + SpinAcquire(BindingLock); + result = (BindingEnt *) + hash_search(BindingTable,(char *) &item, HASH_REMOVE, &found); + + if (found) + location = result->location; + SpinRelease(BindingLock); + + if (! result) { + + elog(WARN,"ShmemPIDDestroy: PID table corrupted"); + return(INVALID_OFFSET); + + } + + if (found) + return (location); + else { + return(INVALID_OFFSET); + } +} + +/* + * ShmemInitStruct -- Create/attach to a structure in shared + * memory. + * + * This is called during initialization to find or allocate + * a data structure in shared memory. If no other processes + * have created the structure, this routine allocates space + * for it. If it exists already, a pointer to the existing + * table is returned. + * + * Returns: real pointer to the object. FoundPtr is TRUE if + * the object is already in the binding table (hence, already + * initialized). + */ +long * +ShmemInitStruct(char *name, unsigned long size, bool *foundPtr) +{ + BindingEnt * result,item; + long * structPtr; + + strncpy(item.key,name,BTABLE_KEYSIZE); + item.location = BAD_LOCATION; + + SpinAcquire(BindingLock); + + if (! BindingTable) { + /* Assert() is a macro now. substitutes inside quotes. */ + char *strname = "BindingTable"; + + /* If the binding table doesnt exist, we fake it. + * + * If we are creating the first binding table, then let + * shmemalloc() allocate the space for a new HTAB. Otherwise, + * find the old one and return that. Notice that the + * BindingLock is held until the binding table has been completely + * initialized. + */ + Assert (! strcmp(name,strname)) ; + if (ShmemBootstrap) { + /* in POSTMASTER/Single process */ + + *foundPtr = FALSE; + return((long *)ShmemAlloc(size)); + + } else { + Assert (ShmemBindingTabOffset); + + *foundPtr = TRUE; + return((long *)MAKE_PTR(*ShmemBindingTabOffset)); + } + + + } else { + /* look it up in the bindint table */ + result = (BindingEnt *) + hash_search(BindingTable,(char *) &item,HASH_ENTER, foundPtr); + } + + if (! result) { + + SpinRelease(BindingLock); + + elog(WARN,"ShmemInitStruct: Binding Table corrupted"); + return(NULL); + + } else if (*foundPtr) { + /* + * Structure is in the binding table so someone else has allocated + * it already. The size better be the same as the size we are + * trying to initialize to or there is a name conflict (or worse). + */ + if (result->size != size) { + SpinRelease(BindingLock); + + elog(NOTICE,"ShmemInitStruct: BindingTable entry size is wrong"); + /* let caller print its message too */ + return(NULL); + } + structPtr = (long *)MAKE_PTR(result->location); + } else { + + /* It isn't in the table yet. allocate and initialize it */ + structPtr = ShmemAlloc((long)size); + if (! 
structPtr) { + /* out of memory */ + Assert (BindingTable); + (void) hash_search(BindingTable,(char *) &item,HASH_REMOVE, foundPtr); + SpinRelease(BindingLock); + *foundPtr = FALSE; + + elog(NOTICE,"ShmemInitStruct: cannot allocate '%s'", + name); + return(NULL); + } + result->size = size; + result->location = MAKE_OFFSET(structPtr); + } + Assert (ShmemIsValid((unsigned long)structPtr)); + + SpinRelease(BindingLock); + return(structPtr); +} + + + diff --git a/src/backend/storage/ipc/shmqueue.c b/src/backend/storage/ipc/shmqueue.c new file mode 100644 index 00000000000..f08546742b5 --- /dev/null +++ b/src/backend/storage/ipc/shmqueue.c @@ -0,0 +1,251 @@ +/*------------------------------------------------------------------------- + * + * shmqueue.c-- + * shared memory linked lists + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/ipc/shmqueue.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $ + * + * NOTES + * + * Package for managing doubly-linked lists in shared memory. + * The only tricky thing is that SHM_QUEUE will usually be a field + * in a larger record. SHMQueueGetFirst has to return a pointer + * to the record itself instead of a pointer to the SHMQueue field + * of the record. It takes an extra pointer and does some extra + * pointer arithmetic to do this correctly. + * + * NOTE: These are set up so they can be turned into macros some day. + * + *------------------------------------------------------------------------- + */ +#include <stdio.h> /* for sprintf() */ +#include "postgres.h" +#include "storage/shmem.h" /* where the declarations go */ +#include "utils/elog.h" + +/*#define SHMQUEUE_DEBUG*/ +#ifdef SHMQUEUE_DEBUG +#define SHMQUEUE_DEBUG_DEL /* deletions */ +#define SHMQUEUE_DEBUG_HD /* head inserts */ +#define SHMQUEUE_DEBUG_TL /* tail inserts */ +#define SHMQUEUE_DEBUG_ELOG NOTICE +#endif /* SHMQUEUE_DEBUG */ + +/* + * ShmemQueueInit -- make the head of a new queue point + * to itself + */ +void +SHMQueueInit(SHM_QUEUE *queue) +{ + Assert(SHM_PTR_VALID(queue)); + (queue)->prev = (queue)->next = MAKE_OFFSET(queue); +} + +/* + * SHMQueueIsDetached -- TRUE if element is not currently + * in a queue. 
+ */ +bool +SHMQueueIsDetached(SHM_QUEUE *queue) +{ + Assert(SHM_PTR_VALID(queue)); + return ((queue)->prev == INVALID_OFFSET); +} + +/* + * SHMQueueElemInit -- clear an element's links + */ +void +SHMQueueElemInit(SHM_QUEUE *queue) +{ + Assert(SHM_PTR_VALID(queue)); + (queue)->prev = (queue)->next = INVALID_OFFSET; +} + +/* + * SHMQueueDelete -- remove an element from the queue and + * close the links + */ +void +SHMQueueDelete(SHM_QUEUE *queue) +{ + SHM_QUEUE *nextElem = (SHM_QUEUE *) MAKE_PTR((queue)->next); + SHM_QUEUE *prevElem = (SHM_QUEUE *) MAKE_PTR((queue)->prev); + + Assert(SHM_PTR_VALID(queue)); + Assert(SHM_PTR_VALID(nextElem)); + Assert(SHM_PTR_VALID(prevElem)); + +#ifdef SHMQUEUE_DEBUG_DEL + dumpQ(queue, "in SHMQueueDelete: begin"); +#endif /* SHMQUEUE_DEBUG_DEL */ + + prevElem->next = (queue)->next; + nextElem->prev = (queue)->prev; + +#ifdef SHMQUEUE_DEBUG_DEL + dumpQ((SHM_QUEUE *)MAKE_PTR(queue->prev), "in SHMQueueDelete: end"); +#endif /* SHMQUEUE_DEBUG_DEL */ +} + +#ifdef SHMQUEUE_DEBUG +void +dumpQ(SHM_QUEUE *q, char *s) +{ + char elem[16]; + char buf[1024]; + SHM_QUEUE *start = q; + int count = 0; + + sprintf(buf, "q prevs: %x", MAKE_OFFSET(q)); + q = (SHM_QUEUE *)MAKE_PTR(q->prev); + while (q != start) + { + sprintf(elem, "--->%x", MAKE_OFFSET(q)); + strcat(buf, elem); + q = (SHM_QUEUE *)MAKE_PTR(q->prev); + if (q->prev == MAKE_OFFSET(q)) + break; + if (count++ > 40) + { + strcat(buf, "BAD PREV QUEUE!!"); + break; + } + } + sprintf(elem, "--->%x", MAKE_OFFSET(q)); + strcat(buf, elem); + elog(SHMQUEUE_DEBUG_ELOG, "%s: %s", s, buf); + + sprintf(buf, "q nexts: %x", MAKE_OFFSET(q)); + count = 0; + q = (SHM_QUEUE *)MAKE_PTR(q->next); + while (q != start) + { + sprintf(elem, "--->%x", MAKE_OFFSET(q)); + strcat(buf, elem); + q = (SHM_QUEUE *)MAKE_PTR(q->next); + if (q->next == MAKE_OFFSET(q)) + break; + if (count++ > 10) + { + strcat(buf, "BAD NEXT QUEUE!!"); + break; + } + } + sprintf(elem, "--->%x", MAKE_OFFSET(q)); + strcat(buf, elem); + elog(SHMQUEUE_DEBUG_ELOG, "%s: %s", s, buf); +} +#endif /* SHMQUEUE_DEBUG */ + +/* + * SHMQueueInsertHD -- put elem in queue between the queue head + * and its "prev" element. + */ +void +SHMQueueInsertHD(SHM_QUEUE *queue, SHM_QUEUE *elem) +{ + SHM_QUEUE *prevPtr = (SHM_QUEUE *) MAKE_PTR((queue)->prev); + SHMEM_OFFSET elemOffset = MAKE_OFFSET(elem); + + Assert(SHM_PTR_VALID(queue)); + Assert(SHM_PTR_VALID(elem)); + +#ifdef SHMQUEUE_DEBUG_HD + dumpQ(queue, "in SHMQueueInsertHD: begin"); +#endif /* SHMQUEUE_DEBUG_HD */ + + (elem)->next = prevPtr->next; + (elem)->prev = queue->prev; + (queue)->prev = elemOffset; + prevPtr->next = elemOffset; + +#ifdef SHMQUEUE_DEBUG_HD + dumpQ(queue, "in SHMQueueInsertHD: end"); +#endif /* SHMQUEUE_DEBUG_HD */ +} + +void +SHMQueueInsertTL(SHM_QUEUE *queue, SHM_QUEUE *elem) +{ + SHM_QUEUE *nextPtr = (SHM_QUEUE *) MAKE_PTR((queue)->next); + SHMEM_OFFSET elemOffset = MAKE_OFFSET(elem); + + Assert(SHM_PTR_VALID(queue)); + Assert(SHM_PTR_VALID(elem)); + +#ifdef SHMQUEUE_DEBUG_TL + dumpQ(queue, "in SHMQueueInsertTL: begin"); +#endif /* SHMQUEUE_DEBUG_TL */ + + (elem)->prev = nextPtr->prev; + (elem)->next = queue->next; + (queue)->next = elemOffset; + nextPtr->prev = elemOffset; + +#ifdef SHMQUEUE_DEBUG_TL + dumpQ(queue, "in SHMQueueInsertTL: end"); +#endif /* SHMQUEUE_DEBUG_TL */ +} + +/* + * SHMQueueFirst -- Get the first element from a queue + * + * First element is queue->next. 
If SHMQueue is part of + * a larger structure, we want to return a pointer to the + * whole structure rather than a pointer to its SHMQueue field. + * I.E. struct { + * int stuff; + * SHMQueue elem; + * } ELEMType; + * when this element is in a queue (queue->next) is struct.elem. + * nextQueue allows us to calculate the offset of the SHMQueue + * field in the structure. + * + * call to SHMQueueFirst should take these parameters: + * + * &(queueHead),&firstElem,&(firstElem->next) + * + * Note that firstElem may well be uninitialized. if firstElem + * is initially K, &(firstElem->next) will be K+ the offset to + * next. + */ +void +SHMQueueFirst(SHM_QUEUE *queue, Pointer *nextPtrPtr, SHM_QUEUE *nextQueue) +{ + SHM_QUEUE *elemPtr = (SHM_QUEUE *) MAKE_PTR((queue)->next); + + Assert(SHM_PTR_VALID(queue)); + *nextPtrPtr = (Pointer) (((unsigned long) *nextPtrPtr) + + ((unsigned long) elemPtr) - ((unsigned long) nextQueue)); + + /* + nextPtrPtr a ptr to a structure linked in the queue + nextQueue is the SHMQueue field of the structure + *nextPtrPtr - nextQueue is 0 minus the offset of the queue + field n the record + elemPtr + (*nextPtrPtr - nexQueue) is the start of the + structure containing elemPtr. + */ +} + +/* + * SHMQueueEmpty -- TRUE if queue head is only element, FALSE otherwise + */ +bool +SHMQueueEmpty(SHM_QUEUE *queue) +{ + Assert(SHM_PTR_VALID(queue)); + + if (queue->prev == MAKE_OFFSET(queue)) + { + Assert(queue->next = MAKE_OFFSET(queue)); + return(TRUE); + } + return(FALSE); +} diff --git a/src/backend/storage/ipc/sinval.c b/src/backend/storage/ipc/sinval.c new file mode 100644 index 00000000000..9151ee77686 --- /dev/null +++ b/src/backend/storage/ipc/sinval.c @@ -0,0 +1,169 @@ +/*------------------------------------------------------------------------- + * + * sinval.c-- + * POSTGRES shared cache invalidation communication code. 
+ * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/ipc/sinval.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +/* #define INVALIDDEBUG 1 */ + +#include "postgres.h" + +#include "storage/sinval.h" +#include "storage/sinvaladt.h" +#include "storage/spin.h" +#include "utils/elog.h" + +extern SISeg *shmInvalBuffer;/* the shared buffer segment, set by*/ + /* SISegmentAttach() */ +extern BackendId MyBackendId; +extern BackendTag MyBackendTag; + +SPINLOCK SInvalLock = (SPINLOCK) NULL; + +/****************************************************************************/ +/* CreateSharedInvalidationState(key) Create a buffer segment */ +/* */ +/* should be called only by the POSTMASTER */ +/****************************************************************************/ +void +CreateSharedInvalidationState(IPCKey key) +{ + int status; + + /* REMOVED + SISyncKill(IPCKeyGetSIBufferMemorySemaphoreKey(key)); + SISyncInit(IPCKeyGetSIBufferMemorySemaphoreKey(key)); + */ + + /* SInvalLock gets set in spin.c, during spinlock init */ + status = SISegmentInit(true, IPCKeyGetSIBufferMemoryBlock(key)); + + if (status == -1) { + elog(FATAL, "CreateSharedInvalidationState: failed segment init"); + } +} +/****************************************************************************/ +/* AttachSharedInvalidationState(key) Attach a buffer segment */ +/* */ +/* should be called only by the POSTMASTER */ +/****************************************************************************/ +void +AttachSharedInvalidationState(IPCKey key) +{ + int status; + + if (key == PrivateIPCKey) { + CreateSharedInvalidationState(key); + return; + } + /* SInvalLock gets set in spin.c, during spinlock init */ + status = SISegmentInit(false, IPCKeyGetSIBufferMemoryBlock(key)); + + if (status == -1) { + elog(FATAL, "AttachSharedInvalidationState: failed segment init"); + } +} + +void +InitSharedInvalidationState() +{ + SpinAcquire(SInvalLock); + if (!SIBackendInit(shmInvalBuffer)) + { + SpinRelease(SInvalLock); + elog(FATAL, "Backend cache invalidation initialization failed"); + } + SpinRelease(SInvalLock); +} + +/* + * RegisterSharedInvalid -- + * Returns a new local cache invalidation state containing a new entry. + * + * Note: + * Assumes hash index is valid. + * Assumes item pointer is valid. + */ +/****************************************************************************/ +/* RegisterSharedInvalid(cacheId, hashIndex, pointer) */ +/* */ +/* register a message in the buffer */ +/* should be called by a backend */ +/****************************************************************************/ +void +RegisterSharedInvalid(int cacheId, /* XXX */ + Index hashIndex, + ItemPointer pointer) +{ + SharedInvalidData newInvalid; + + /* + * This code has been hacked to accept two types of messages. This might + * be treated more generally in the future. 
+ * + * (1) + * cacheId= system cache id + * hashIndex= system cache hash index for a (possibly) cached tuple + * pointer= pointer of (possibly) cached tuple + * + * (2) + * cacheId= special non-syscache id + * hashIndex= object id contained in (possibly) cached relation descriptor + * pointer= null + */ + + newInvalid.cacheId = cacheId; + newInvalid.hashIndex = hashIndex; + + if (ItemPointerIsValid(pointer)) { + ItemPointerCopy(pointer, &newInvalid.pointerData); + } else { + ItemPointerSetInvalid(&newInvalid.pointerData); + } + + SpinAcquire(SInvalLock); + if (!SISetDataEntry(shmInvalBuffer, &newInvalid)) { + /* buffer full */ + /* release a message, mark process cache states to be invalid */ + SISetProcStateInvalid(shmInvalBuffer); + + if (!SIDelDataEntry(shmInvalBuffer)) { + /* inconsistent buffer state -- shd never happen */ + SpinRelease(SInvalLock); + elog(FATAL, "RegisterSharedInvalid: inconsistent buffer state"); + } + + /* write again */ + (void) SISetDataEntry(shmInvalBuffer, &newInvalid); + } + SpinRelease(SInvalLock); +} + +/* + * InvalidateSharedInvalid -- + * Processes all entries in a shared cache invalidation state. + */ +/****************************************************************************/ +/* InvalidateSharedInvalid(invalFunction, resetFunction) */ +/* */ +/* invalidate a message in the buffer (read and clean up) */ +/* should be called by a backend */ +/****************************************************************************/ +void +InvalidateSharedInvalid(void (*invalFunction)(), + void (*resetFunction)()) +{ + SpinAcquire(SInvalLock); + SIReadEntryData(shmInvalBuffer, MyBackendId, + invalFunction, resetFunction); + + SIDelExpiredDataEntries(shmInvalBuffer); + SpinRelease(SInvalLock); +} diff --git a/src/backend/storage/ipc/sinvaladt.c b/src/backend/storage/ipc/sinvaladt.c new file mode 100644 index 00000000000..a30afdb6fed --- /dev/null +++ b/src/backend/storage/ipc/sinvaladt.c @@ -0,0 +1,797 @@ +/*------------------------------------------------------------------------- + * + * sinvaladt.c-- + * POSTGRES shared cache invalidation segment definitions. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/ipc/sinvaladt.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include "storage/ipc.h" +#include "storage/sinvaladt.h" +#include "storage/lmgr.h" +#include "utils/elog.h" +#include "utils/palloc.h" + +/* ---------------- + * global variable notes + * + * SharedInvalidationSemaphore + * + * shmInvalBuffer + * the shared buffer segment, set by SISegmentAttach() + * + * MyBackendId + * might be removed later, used only for + * debugging in debug routines (end of file) + * + * SIDbId + * identification of buffer (disappears) + * + * SIRelId \ + * SIDummyOid \ identification of buffer + * SIXidData / + * SIXid / + * + * XXX This file really needs to be cleaned up. We switched to using + * spinlocks to protect critical sections (as opposed to using fake + * relations and going through the lock manager) and some of the old + * cruft was 'ifdef'ed out, while other parts (now unused) are still + * compiled into the system. 
-mer 5/24/92 + * ---------------- + */ +#ifdef HAS_TEST_AND_SET +int SharedInvalidationLockId; +#else +IpcSemaphoreId SharedInvalidationSemaphore; +#endif + +SISeg *shmInvalBuffer; +extern BackendId MyBackendId; + +static void CleanupInvalidationState(int status, SISeg *segInOutP); +static BackendId SIAssignBackendId(SISeg *segInOutP, BackendTag backendTag); +static int SIGetNumEntries(SISeg *segP); + +/************************************************************************/ +/* SISetActiveProcess(segP, backendId) set the backend status active */ +/* should be called only by the postmaster when creating a backend */ +/************************************************************************/ +/* XXX I suspect that the segP parameter is extraneous. -hirohama */ +static void +SISetActiveProcess(SISeg *segInOutP, BackendId backendId) +{ + /* mark all messages as read */ + + /* Assert(segP->procState[backendId - 1].tag == MyBackendTag); */ + + segInOutP->procState[backendId - 1].resetState = false; + segInOutP->procState[backendId - 1].limit = SIGetNumEntries(segInOutP); +} + +/****************************************************************************/ +/* SIBackendInit() initializes a backend to operate on the buffer */ +/****************************************************************************/ +int +SIBackendInit(SISeg *segInOutP) +{ + LRelId LtCreateRelId(); + TransactionId LMITransactionIdCopy(); + + Assert(MyBackendTag > 0); + + MyBackendId = SIAssignBackendId(segInOutP, MyBackendTag); + if (MyBackendId == InvalidBackendTag) + return 0; + +#ifdef INVALIDDEBUG + elog(DEBUG, "SIBackendInit: backend tag %d; backend id %d.", + MyBackendTag, MyBackendId); +#endif /* INVALIDDEBUG */ + + SISetActiveProcess(segInOutP, MyBackendId); + on_exitpg(CleanupInvalidationState, (caddr_t)segInOutP); + return 1; +} + +/* ---------------- + * SIAssignBackendId + * ---------------- + */ +static BackendId +SIAssignBackendId(SISeg *segInOutP, BackendTag backendTag) +{ + Index index; + ProcState *stateP; + + stateP = NULL; + + for (index = 0; index < MaxBackendId; index += 1) { + if (segInOutP->procState[index].tag == InvalidBackendTag || + segInOutP->procState[index].tag == backendTag) + { + stateP = &segInOutP->procState[index]; + break; + } + + if (!PointerIsValid(stateP) || + (segInOutP->procState[index].resetState && + (!stateP->resetState || + stateP->tag < backendTag)) || + (!stateP->resetState && + (segInOutP->procState[index].limit < + stateP->limit || + stateP->tag < backendTag))) + { + stateP = &segInOutP->procState[index]; + } + } + + /* verify that all "procState" entries checked for matching tags */ + + for (index += 1; index < MaxBackendId; index += 1) { + if (segInOutP->procState[index].tag == backendTag) { + elog (FATAL, "SIAssignBackendId: tag %d found twice", + backendTag); + } + } + + if (stateP->tag != InvalidBackendTag) { + if (stateP->tag == backendTag) { + elog(NOTICE, "SIAssignBackendId: reusing tag %d", + backendTag); + } else { + elog(NOTICE, + "SIAssignBackendId: discarding tag %d", + stateP->tag); + return InvalidBackendTag; + } + } + + stateP->tag = backendTag; + + return (1 + stateP - &segInOutP->procState[0]); +} + + +/************************************************************************/ +/* The following function should be called only by the postmaster !! 
*/ +/************************************************************************/ + +/************************************************************************/ +/* SISetDeadProcess(segP, backendId) set the backend status DEAD */ +/* should be called only by the postmaster when a backend died */ +/************************************************************************/ +static void +SISetDeadProcess(SISeg *segP, int backendId) +{ + /* XXX call me.... */ + + segP->procState[backendId - 1].resetState = false; + segP->procState[backendId - 1].limit = -1; + segP->procState[backendId - 1].tag = InvalidBackendTag; +} + +/* + * CleanupInvalidationState -- + * Note: + * This is a temporary hack. ExitBackend should call this instead + * of exit (via on_exitpg). + */ +static void +CleanupInvalidationState(int status, /* XXX */ + SISeg *segInOutP) /* XXX style */ +{ + Assert(PointerIsValid(segInOutP)); + + SISetDeadProcess(segInOutP, MyBackendId); +} + + +/************************************************************************/ +/* SIComputeSize() - retuns the size of a buffer segment */ +/************************************************************************/ +static SISegOffsets * +SIComputeSize(int *segSize) +{ + int A, B, a, b, totalSize; + SISegOffsets *oP; + + A = 0; + a = SizeSISeg; /* offset to first data entry */ + b = SizeOfOneSISegEntry * MAXNUMMESSAGES; + B = A + a + b; + totalSize = B - A; + *segSize = totalSize; + + oP = (SISegOffsets *) palloc(sizeof(SISegOffsets)); + oP->startSegment = A; + oP->offsetToFirstEntry = a; /* relatiove to A */ + oP->offsetToEndOfSegemnt = totalSize; /* relative to A */ + return(oP); +} + + +/************************************************************************/ +/* SISetStartEntrySection(segP, offset) - sets the offset */ +/************************************************************************/ +static void +SISetStartEntrySection(SISeg *segP, Offset offset) +{ + segP->startEntrySection = offset; +} + +/************************************************************************/ +/* SIGetStartEntrySection(segP) - returnss the offset */ +/************************************************************************/ +static Offset +SIGetStartEntrySection(SISeg *segP) +{ + return(segP->startEntrySection); +} + + +/************************************************************************/ +/* SISetEndEntrySection(segP, offset) - sets the offset */ +/************************************************************************/ +static void +SISetEndEntrySection(SISeg *segP, Offset offset) +{ + segP->endEntrySection = offset; +} + +/************************************************************************/ +/* SISetEndEntryChain(segP, offset) - sets the offset */ +/************************************************************************/ +static void +SISetEndEntryChain(SISeg *segP, Offset offset) +{ + segP->endEntryChain = offset; +} + +/************************************************************************/ +/* SIGetEndEntryChain(segP) - returnss the offset */ +/************************************************************************/ +static Offset +SIGetEndEntryChain(SISeg *segP) +{ + return(segP->endEntryChain); +} + +/************************************************************************/ +/* SISetStartEntryChain(segP, offset) - sets the offset */ +/************************************************************************/ +static void +SISetStartEntryChain(SISeg *segP, Offset offset) +{ + segP->startEntryChain = offset; +} + 
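[Editor's sketch, not part of the original commit.] The sinvaladt.c routines around this point never store raw pointers in the shared segment: every link is an Offset relative to the start of the entry section, so a chain written by one backend remains valid in another backend that has mapped the segment at a different address. A minimal sketch of that idiom, using hypothetical stand-in types (toy_seg, toy_entry) rather than the real SISeg/SISegEntry:

#include <stddef.h>

typedef struct toy_entry {
	int	payload;
	int	next;			/* offset of the next entry, or -1 for end of chain */
} toy_entry;

typedef struct toy_seg {
	int	startEntrySection;	/* byte offset of the first entry from the segment base */
	/* entry section follows */
} toy_seg;

/* offset -> pointer, the same idiom as SIGetNextDataEntry() */
static toy_entry *
toy_entry_at(toy_seg *segP, int offset)
{
	if (offset < 0)
		return NULL;
	return (toy_entry *) ((char *) segP + segP->startEntrySection + offset);
}

/* pointer -> offset, the inverse mapping used by SIEntryOffset() */
static int
toy_entry_offset(toy_seg *segP, toy_entry *entryP)
{
	return (int) ((char *) entryP - (char *) segP - segP->startEntrySection);
}

Walking a chain is then just repeated toy_entry_at(segP, eP->next) until the offset runs out, which is what SIGetNthDataEntry() does with InvalidOffset as the terminator.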
+/************************************************************************/ +/* SIGetStartEntryChain(segP) - returns the offset */ +/************************************************************************/ +static Offset +SIGetStartEntryChain(SISeg *segP) +{ + return(segP->startEntryChain); +} + +/************************************************************************/ +/* SISetNumEntries(segP, num) sets the current nuber of entries */ +/************************************************************************/ +static bool +SISetNumEntries(SISeg *segP, int num) +{ + if ( num <= MAXNUMMESSAGES) { + segP->numEntries = num; + return(true); + } else { + return(false); /* table full */ + } +} + +/************************************************************************/ +/* SIGetNumEntries(segP) - returns the current nuber of entries */ +/************************************************************************/ +static int +SIGetNumEntries(SISeg *segP) +{ + return(segP->numEntries); +} + + +/************************************************************************/ +/* SISetMaxNumEntries(segP, num) sets the maximal number of entries */ +/************************************************************************/ +static bool +SISetMaxNumEntries(SISeg *segP, int num) +{ + if ( num <= MAXNUMMESSAGES) { + segP->maxNumEntries = num; + return(true); + } else { + return(false); /* wrong number */ + } +} + + +/************************************************************************/ +/* SIGetProcStateLimit(segP, i) returns the limit of read messages */ +/************************************************************************/ +static int +SIGetProcStateLimit(SISeg *segP, int i) +{ + return(segP->procState[i].limit); +} + +/************************************************************************/ +/* SIIncNumEntries(segP, num) increments the current nuber of entries */ +/************************************************************************/ +static bool +SIIncNumEntries(SISeg *segP, int num) +{ + if ((segP->numEntries + num) <= MAXNUMMESSAGES) { + segP->numEntries = segP->numEntries + num; + return(true); + } else { + return(false); /* table full */ + } +} + +/************************************************************************/ +/* SIDecNumEntries(segP, num) decrements the current nuber of entries */ +/************************************************************************/ +static bool +SIDecNumEntries(SISeg *segP, int num) +{ + if ((segP->numEntries - num) >= 0) { + segP->numEntries = segP->numEntries - num; + return(true); + } else { + return(false); /* not enough entries in table */ + } +} + +/************************************************************************/ +/* SISetStartFreeSpace(segP, offset) - sets the offset */ +/************************************************************************/ +static void +SISetStartFreeSpace(SISeg *segP, Offset offset) +{ + segP->startFreeSpace = offset; +} + +/************************************************************************/ +/* SIGetStartFreeSpace(segP) - returns the offset */ +/************************************************************************/ +static Offset +SIGetStartFreeSpace(SISeg *segP) +{ + return(segP->startFreeSpace); +} + + + +/************************************************************************/ +/* SIGetFirstDataEntry(segP) returns first data entry */ +/************************************************************************/ +static SISegEntry * +SIGetFirstDataEntry(SISeg *segP) +{ + SISegEntry *eP; + Offset 
startChain; + + startChain = SIGetStartEntryChain(segP); + + if (startChain == InvalidOffset) + return(NULL); + + eP = (SISegEntry *) ((Pointer) segP + + SIGetStartEntrySection(segP) + + startChain ); + return(eP); +} + + +/************************************************************************/ +/* SIGetLastDataEntry(segP) returns last data entry in the chain */ +/************************************************************************/ +static SISegEntry * +SIGetLastDataEntry(SISeg *segP) +{ + SISegEntry *eP; + Offset endChain; + + endChain = SIGetEndEntryChain(segP); + + if (endChain == InvalidOffset) + return(NULL); + + eP = (SISegEntry *) ((Pointer) segP + + SIGetStartEntrySection(segP) + + endChain ); + return(eP); +} + +/************************************************************************/ +/* SIGetNextDataEntry(segP, offset) returns next data entry */ +/************************************************************************/ +static SISegEntry * +SIGetNextDataEntry(SISeg *segP, Offset offset) +{ + SISegEntry *eP; + + if (offset == InvalidOffset) + return(NULL); + + eP = (SISegEntry *) ((Pointer) segP + + SIGetStartEntrySection(segP) + + offset); + return(eP); +} + + +/************************************************************************/ +/* SIGetNthDataEntry(segP, n) returns the n-th data entry in chain */ +/************************************************************************/ +static SISegEntry * +SIGetNthDataEntry(SISeg *segP, + int n) /* must range from 1 to MaxMessages */ +{ + SISegEntry *eP; + int i; + + if (n <= 0) return(NULL); + + eP = SIGetFirstDataEntry(segP); + for (i = 1; i < n; i++) { + /* skip one and get the next */ + eP = SIGetNextDataEntry(segP, eP->next); + } + + return(eP); +} + +/************************************************************************/ +/* SIEntryOffset(segP, entryP) returns the offset for an pointer */ +/************************************************************************/ +static Offset +SIEntryOffset(SISeg *segP, SISegEntry *entryP) +{ + /* relative to B !! 
*/ + return ((Offset) ((Pointer) entryP - + (Pointer) segP - + SIGetStartEntrySection(segP) )); +} + + +/************************************************************************/ +/* SISetDataEntry(segP, data) - sets a message in the segemnt */ +/************************************************************************/ +bool +SISetDataEntry(SISeg *segP, SharedInvalidData *data) +{ + Offset offsetToNewData; + SISegEntry *eP, *lastP; + bool SISegFull(); + Offset SIEntryOffset(); + Offset SIGetStartFreeSpace(); + SISegEntry *SIGetFirstDataEntry(); + SISegEntry *SIGetNextDataEntry(); + SISegEntry *SIGetLastDataEntry(); + + if (!SIIncNumEntries(segP, 1)) + return(false); /* no space */ + + /* get a free entry */ + offsetToNewData = SIGetStartFreeSpace(segP); + eP = SIGetNextDataEntry(segP, offsetToNewData); /* it's a free one */ + SISetStartFreeSpace(segP, eP->next); + /* fill it up */ + eP->entryData = *data; + eP->isfree = false; + eP->next = InvalidOffset; + + /* handle insertion point at the end of the chain !!*/ + lastP = SIGetLastDataEntry(segP); + if (lastP == NULL) { + /* there is no chain, insert the first entry */ + SISetStartEntryChain(segP, SIEntryOffset(segP, eP)); + } else { + /* there is a last entry in the chain */ + lastP->next = SIEntryOffset(segP, eP); + } + SISetEndEntryChain(segP, SIEntryOffset(segP, eP)); + return(true); +} + + +/************************************************************************/ +/* SIDecProcLimit(segP, num) decrements all process limits */ +/************************************************************************/ +static void +SIDecProcLimit(SISeg *segP, int num) +{ + int i; + for (i=0; i < MaxBackendId; i++) { + /* decrement only, if there is a limit > 0 */ + if (segP->procState[i].limit > 0) { + segP->procState[i].limit = segP->procState[i].limit - num; + if (segP->procState[i].limit < 0) { + /* limit was not high enough, reset to zero */ + /* negative means it's a dead backend */ + segP->procState[i].limit = 0; + } + } + } +} + + +/************************************************************************/ +/* SIDelDataEntry(segP) - free the FIRST entry */ +/************************************************************************/ +bool +SIDelDataEntry(SISeg *segP) +{ + SISegEntry *e1P; + SISegEntry *SIGetFirstDataEntry(); + + if (!SIDecNumEntries(segP, 1)) { + /* no entries in buffer */ + return(false); + } + + e1P = SIGetFirstDataEntry(segP); + SISetStartEntryChain(segP, e1P->next); + if (SIGetStartEntryChain(segP) == InvalidOffset) { + /* it was the last entry */ + SISetEndEntryChain(segP, InvalidOffset); + } + /* free the entry */ + e1P->isfree = true; + e1P->next = SIGetStartFreeSpace(segP); + SISetStartFreeSpace(segP, SIEntryOffset(segP, e1P)); + SIDecProcLimit(segP, 1); + return(true); +} + + + +/************************************************************************/ +/* SISetProcStateInvalid(segP) checks and marks a backends state as */ +/* invalid */ +/************************************************************************/ +void +SISetProcStateInvalid(SISeg *segP) +{ + int i; + + for (i=0; i < MaxBackendId; i++) { + if (segP->procState[i].limit == 0) { + /* backend i didn't read any message */ + segP->procState[i].resetState = true; + /*XXX signal backend that it has to reset its internal cache ? 
*/ + } + } +} + +/************************************************************************/ +/* SIReadEntryData(segP, backendId, function) */ +/* - marks messages to be read by id */ +/* and executes function */ +/************************************************************************/ +void +SIReadEntryData(SISeg *segP, + int backendId, + void (*invalFunction)(), + void (*resetFunction)()) +{ + int i = 0; + SISegEntry *data; + + Assert(segP->procState[backendId - 1].tag == MyBackendTag); + + if (!segP->procState[backendId - 1].resetState) { + /* invalidate data, but only those, you have not seen yet !!*/ + /* therefore skip read messages */ + data = SIGetNthDataEntry(segP, + SIGetProcStateLimit(segP, backendId - 1) + 1); + while (data != NULL) { + i++; + segP->procState[backendId - 1].limit++; /* one more message read */ + invalFunction(data->entryData.cacheId, + data->entryData.hashIndex, + &data->entryData.pointerData); + data = SIGetNextDataEntry(segP, data->next); + } + /* SIDelExpiredDataEntries(segP); */ + } else { + /*backend must not read messages, its own state has to be reset */ + elog(NOTICE, "SIMarkEntryData: cache state reset"); + resetFunction(); /* XXXX call it here, parameters? */ + + /* new valid state--mark all messages "read" */ + segP->procState[backendId - 1].resetState = false; + segP->procState[backendId - 1].limit = SIGetNumEntries(segP); + } + /* check whether we can remove dead messages */ + if (i > MAXNUMMESSAGES) { + elog(FATAL, "SIReadEntryData: Invalid segment state"); + } +} + +/************************************************************************/ +/* SIDelExpiredDataEntries (segP) - removes irrelevant messages */ +/************************************************************************/ +void +SIDelExpiredDataEntries(SISeg *segP) +{ + int min, i, h; + + min = 9999999; + for (i = 0; i < MaxBackendId; i++) { + h = SIGetProcStateLimit(segP, i); + if (h >= 0) { /* backend active */ + if (h < min ) min = h; + } + } + if (min != 9999999) { + /* we can remove min messages */ + for (i = 1; i <= min; i++) { + /* this adjusts also the state limits!*/ + if (!SIDelDataEntry(segP)) { + elog(FATAL, "SIDelExpiredDataEntries: Invalid segment state"); + } + } + } +} + + + +/************************************************************************/ +/* SISegInit(segP) - initializes the segment */ +/************************************************************************/ +static void +SISegInit(SISeg *segP) +{ + SISegOffsets *oP; + int segSize, i; + SISegEntry *eP; + + oP = SIComputeSize(&segSize); + /* set sempahore ids in the segment */ + /* XXX */ + SISetStartEntrySection(segP, oP->offsetToFirstEntry); + SISetEndEntrySection(segP, oP->offsetToEndOfSegemnt); + SISetStartFreeSpace(segP, 0); + SISetStartEntryChain(segP, InvalidOffset); + SISetEndEntryChain(segP, InvalidOffset); + (void) SISetNumEntries(segP, 0); + (void) SISetMaxNumEntries(segP, MAXNUMMESSAGES); + for (i = 0; i < MaxBackendId; i++) { + segP->procState[i].limit = -1; /* no backend active !!*/ + segP->procState[i].resetState = false; + segP->procState[i].tag = InvalidBackendTag; + } + /* construct a chain of free entries */ + for (i = 1; i < MAXNUMMESSAGES; i++) { + eP = (SISegEntry *) ((Pointer) segP + + SIGetStartEntrySection(segP) + + (i - 1) * sizeof(SISegEntry)); + eP->isfree = true; + eP->next = i * sizeof(SISegEntry); /* relative to B */ + } + /* handle the last free entry separate */ + eP = (SISegEntry *) ((Pointer) segP + + SIGetStartEntrySection(segP) + + (MAXNUMMESSAGES - 1) * 
sizeof(SISegEntry)); + eP->isfree = true; + eP->next = InvalidOffset; /* it's the end of the chain !! */ + /* + * Be tidy + */ + pfree(oP); + +} + + + +/************************************************************************/ +/* SISegmentKill(key) - kill any segment */ +/************************************************************************/ +static void +SISegmentKill(int key) /* the corresponding key for the segment */ +{ + IpcMemoryKill(key); +} + + +/************************************************************************/ +/* SISegmentGet(key, size) - get a shared segment of size <size> */ +/* returns a segment id */ +/************************************************************************/ +static IpcMemoryId +SISegmentGet(int key, /* the corresponding key for the segment */ + int size, /* size of segment in bytes */ + bool create) +{ + IpcMemoryId shmid; + + if (create) { + shmid = IpcMemoryCreate(key, size, IPCProtection); + } else { + shmid = IpcMemoryIdGet(key, size); + } + return(shmid); +} + +/************************************************************************/ +/* SISegmentAttach(shmid) - attach a shared segment with id shmid */ +/************************************************************************/ +static void +SISegmentAttach(IpcMemoryId shmid) +{ + shmInvalBuffer = (struct SISeg *) IpcMemoryAttach(shmid); + if (shmInvalBuffer == IpcMemAttachFailed) { + /* XXX use validity function */ + elog(NOTICE, "SISegmentAttach: Could not attach segment"); + elog(FATAL, "SISegmentAttach: %m"); + } +} + + +/************************************************************************/ +/* SISegmentInit(killExistingSegment, key) initialize segment */ +/************************************************************************/ +int +SISegmentInit(bool killExistingSegment, IPCKey key) +{ + SISegOffsets *oP; + int segSize; + IpcMemoryId shmId; + bool create; + + if (killExistingSegment) { + /* Kill existing segment */ + /* set semaphore */ + SISegmentKill(key); + + /* Get a shared segment */ + + oP = SIComputeSize(&segSize); + /* + * Be tidy + */ + pfree(oP); + + create = true; + shmId = SISegmentGet(key,segSize, create); + if (shmId < 0) { + perror("SISegmentGet: failed"); + return(-1); /* an error */ + } + + /* Attach the shared cache invalidation segment */ + /* sets the global variable shmInvalBuffer */ + SISegmentAttach(shmId); + + /* Init shared memory table */ + SISegInit(shmInvalBuffer); + } else { + /* use an existing segment */ + create = false; + shmId = SISegmentGet(key, 0, create); + if (shmId < 0) { + perror("SISegmentGet: getting an existent segment failed"); + return(-1); /* an error */ + } + /* Attach the shared cache invalidation segment */ + SISegmentAttach(shmId); + } + return(1); +} + diff --git a/src/backend/storage/ipc/spin.c b/src/backend/storage/ipc/spin.c new file mode 100644 index 00000000000..7ff2561f237 --- /dev/null +++ b/src/backend/storage/ipc/spin.c @@ -0,0 +1,247 @@ +/*------------------------------------------------------------------------- + * + * spin.c-- + * routines for managing spin locks + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/ipc/Attic/spin.c,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +/* + * POSTGRES has two kinds of locks: semaphores (which put the + * process to sleep) and spinlocks (which are supposed to be + * short term locks). 
Currently both are implemented as SysV + * semaphores, but presumably this can change if we move to + * a machine with a test-and-set (TAS) instruction. Its probably + * a good idea to think about (and allocate) short term and long + * term semaphores separately anyway. + * + * NOTE: These routines are not supposed to be widely used in Postgres. + * They are preserved solely for the purpose of porting Mark Sullivan's + * buffer manager to Postgres. + */ +#include <errno.h> +#include "postgres.h" +#include "storage/ipc.h" +#include "storage/shmem.h" +#include "storage/spin.h" +#include "storage/proc.h" +#include "utils/elog.h" + +/* globals used in this file */ +IpcSemaphoreId SpinLockId; + +#ifdef HAS_TEST_AND_SET +/* real spin lock implementations */ + +bool +CreateSpinlocks(IPCKey key) +{ + /* the spin lock shared memory must have been created by now */ + return(TRUE); +} + +bool +AttachSpinLocks(IPCKey key) +{ + /* the spin lock shared memory must have been attached by now */ + return(TRUE); +} + +bool +InitSpinLocks(int init, IPCKey key) +{ + extern SPINLOCK ShmemLock; + extern SPINLOCK BindingLock; + extern SPINLOCK BufMgrLock; + extern SPINLOCK LockMgrLock; + extern SPINLOCK ProcStructLock; + extern SPINLOCK SInvalLock; + extern SPINLOCK OidGenLockId; + +#ifdef MAIN_MEMORY + extern SPINLOCK MMCacheLock; +#endif /* SONY_JUKEBOX */ + + /* These six spinlocks have fixed location is shmem */ + ShmemLock = (SPINLOCK) SHMEMLOCKID; + BindingLock = (SPINLOCK) BINDINGLOCKID; + BufMgrLock = (SPINLOCK) BUFMGRLOCKID; + LockMgrLock = (SPINLOCK) LOCKMGRLOCKID; + ProcStructLock = (SPINLOCK) PROCSTRUCTLOCKID; + SInvalLock = (SPINLOCK) SINVALLOCKID; + OidGenLockId = (SPINLOCK) OIDGENLOCKID; + +#ifdef MAIN_MEMORY + MMCacheLock = (SPINLOCK) MMCACHELOCKID; +#endif /* MAIN_MEMORY */ + + return(TRUE); +} + +void +SpinAcquire(SPINLOCK lock) +{ + ExclusiveLock(lock); + PROC_INCR_SLOCK(lock); +} + +void +SpinRelease(SPINLOCK lock) +{ + PROC_DECR_SLOCK(lock); + ExclusiveUnlock(lock); +} + +bool +SpinIsLocked(SPINLOCK lock) +{ + return(!LockIsFree(lock)); +} + +#else /* HAS_TEST_AND_SET */ +/* Spinlocks are implemented using SysV semaphores */ + + +/* + * SpinAcquire -- try to grab a spinlock + * + * FAILS if the semaphore is corrupted. 
+ */ +void +SpinAcquire(SPINLOCK lock) +{ + IpcSemaphoreLock(SpinLockId, lock, IpcExclusiveLock); + PROC_INCR_SLOCK(lock); +} + +/* + * SpinRelease -- release a spin lock + * + * FAILS if the semaphore is corrupted + */ +void +SpinRelease(SPINLOCK lock) +{ + Assert(SpinIsLocked(lock)) + PROC_DECR_SLOCK(lock); + IpcSemaphoreUnlock(SpinLockId, lock, IpcExclusiveLock); +} + +bool +SpinIsLocked(SPINLOCK lock) +{ + int semval; + + semval = IpcSemaphoreGetValue(SpinLockId, lock); + return(semval < IpcSemaphoreDefaultStartValue); +} + +/* + * CreateSpinlocks -- Create a sysV semaphore array for + * the spinlocks + * + */ +bool +CreateSpinlocks(IPCKey key) +{ + + int status; + IpcSemaphoreId semid; + semid = IpcSemaphoreCreate(key, MAX_SPINS, IPCProtection, + IpcSemaphoreDefaultStartValue, 1, &status); + if (status == IpcSemIdExist) { + IpcSemaphoreKill(key); + elog(NOTICE,"Destroying old spinlock semaphore"); + semid = IpcSemaphoreCreate(key, MAX_SPINS, IPCProtection, + IpcSemaphoreDefaultStartValue, 1, &status); + } + + if (semid >= 0) { + SpinLockId = semid; + return(TRUE); + } + /* cannot create spinlocks */ + elog(FATAL,"CreateSpinlocks: cannot create spin locks"); + return(FALSE); +} + +/* + * Attach to existing spinlock set + */ +bool +AttachSpinLocks(IPCKey key) +{ + IpcSemaphoreId id; + + id = semget (key, MAX_SPINS, 0); + if (id < 0) { + if (errno == EEXIST) { + /* key is the name of someone else's semaphore */ + elog (FATAL,"AttachSpinlocks: SPIN_KEY belongs to someone else"); + } + /* cannot create spinlocks */ + elog(FATAL,"AttachSpinlocks: cannot create spin locks"); + return(FALSE); + } + SpinLockId = id; + return(TRUE); +} + +/* + * InitSpinLocks -- Spinlock bootstrapping + * + * We need several spinlocks for bootstrapping: + * BindingLock (for the shmem binding table) and + * ShmemLock (for the shmem allocator), BufMgrLock (for buffer + * pool exclusive access), LockMgrLock (for the lock table), and + * ProcStructLock (a spin lock for the shared process structure). + * If there's a Sony WORM drive attached, we also have a spinlock + * (SJCacheLock) for it. Same story for the main memory storage mgr. + * + */ +bool +InitSpinLocks(int init, IPCKey key) +{ + extern SPINLOCK ShmemLock; + extern SPINLOCK BindingLock; + extern SPINLOCK BufMgrLock; + extern SPINLOCK LockMgrLock; + extern SPINLOCK ProcStructLock; + extern SPINLOCK SInvalLock; + extern SPINLOCK OidGenLockId; + +#ifdef MAIN_MEMORY + extern SPINLOCK MMCacheLock; +#endif /* MAIN_MEMORY */ + + if (!init || key != IPC_PRIVATE) { + /* if bootstrap and key is IPC_PRIVATE, it means that we are running + * backend by itself. no need to attach spinlocks + */ + if (! 
AttachSpinLocks(key)) { + elog(FATAL,"InitSpinLocks: couldnt attach spin locks"); + return(FALSE); + } + } + + /* These five (or six) spinlocks have fixed location is shmem */ + ShmemLock = (SPINLOCK) SHMEMLOCKID; + BindingLock = (SPINLOCK) BINDINGLOCKID; + BufMgrLock = (SPINLOCK) BUFMGRLOCKID; + LockMgrLock = (SPINLOCK) LOCKMGRLOCKID; + ProcStructLock = (SPINLOCK) PROCSTRUCTLOCKID; + SInvalLock = (SPINLOCK) SINVALLOCKID; + OidGenLockId = (SPINLOCK) OIDGENLOCKID; + +#ifdef MAIN_MEMORY + MMCacheLock = (SPINLOCK) MMCACHELOCKID; +#endif /* MAIN_MEMORY */ + + return(TRUE); +} +#endif /* HAS_TEST_AND_SET */ diff --git a/src/backend/storage/item.h b/src/backend/storage/item.h new file mode 100644 index 00000000000..ca989fec654 --- /dev/null +++ b/src/backend/storage/item.h @@ -0,0 +1,20 @@ +/*------------------------------------------------------------------------- + * + * item.h-- + * POSTGRES disk item definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: item.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef ITEM_H +#define ITEM_H + +#include "c.h" + +typedef Pointer Item; + +#endif /* ITEM_H */ diff --git a/src/backend/storage/itemid.h b/src/backend/storage/itemid.h new file mode 100644 index 00000000000..f5cd0c62cc0 --- /dev/null +++ b/src/backend/storage/itemid.h @@ -0,0 +1,75 @@ +/*------------------------------------------------------------------------- + * + * itemid.h-- + * Standard POSTGRES buffer page item identifier definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: itemid.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef ITEMID_H +#define ITEMID_H + +typedef uint16 ItemOffset; +typedef uint16 ItemLength; + +typedef bits16 ItemIdFlags; + + + +typedef struct ItemIdData { /* line pointers */ + unsigned lp_off:13, /* offset to find tup */ + /* can be reduced by 2 if necc. */ + lp_flags:6, /* flags on tuple */ + lp_len:13; /* length of tuple */ +} ItemIdData; + +typedef struct ItemIdData *ItemId; + +#ifndef LP_USED +#define LP_USED 0x01 /* this line pointer is being used */ +#endif + +/* ---------------- + * support macros + * ---------------- + */ +/* + * ItemIdGetLength + */ +#define ItemIdGetLength(itemId) \ + ((itemId)->lp_len) + +/* + * ItemIdGetOffset + */ +#define ItemIdGetOffset(itemId) \ + ((itemId)->lp_off) + +/* + * ItemIdGetFlags + */ +#define ItemIdGetFlags(itemId) \ + ((itemId)->lp_flags) + +/* + * ItemIdIsValid -- + * True iff disk item identifier is valid. + */ +#define ItemIdIsValid(itemId) PointerIsValid(itemId) + +/* + * ItemIdIsUsed -- + * True iff disk item identifier is in use. + * + * Note: + * Assumes disk item identifier is valid. + */ +#define ItemIdIsUsed(itemId) \ + (AssertMacro(ItemIdIsValid(itemId)) ? \ + (bool) (((itemId)->lp_flags & LP_USED) != 0) : false) + +#endif /* ITEMID_H */ diff --git a/src/backend/storage/itempos.h b/src/backend/storage/itempos.h new file mode 100644 index 00000000000..c3b895ae075 --- /dev/null +++ b/src/backend/storage/itempos.h @@ -0,0 +1,44 @@ +/*------------------------------------------------------------------------- + * + * itempos.h-- + * Standard POSTGRES buffer page long item subposition definitions. 
+ * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: itempos.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef ITEMPOS_H +#define ITEMPOS_H + +#include "c.h" +#include "storage/buf.h" +#include "storage/itemid.h" + +typedef struct ItemSubpositionData { + Buffer op_db; + ItemId op_lpp; + char *op_cp; /* XXX */ + uint32 op_len; +} ItemSubpositionData; + +typedef ItemSubpositionData *ItemSubposition; + +/* + * PNOBREAK(OBJP, LEN) + * struct objpos *OBJP; + * unsigned LEN; + */ +#define PNOBREAK(OBJP, LEN) ((OBJP)->op_len >= LEN) + +/* + * PSKIP(OBJP, LEN) + * struct objpos *OBJP; + * unsigned LEN; + */ +#define PSKIP(OBJP, LEN)\ + { (OBJP)->op_cp += (LEN); (OBJP)->op_len -= (LEN); } + +#endif /* ITEMPOS_H */ diff --git a/src/backend/storage/itemptr.h b/src/backend/storage/itemptr.h new file mode 100644 index 00000000000..ba3c154ef14 --- /dev/null +++ b/src/backend/storage/itemptr.h @@ -0,0 +1,115 @@ +/*------------------------------------------------------------------------- + * + * itemptr.h-- + * POSTGRES disk item pointer definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: itemptr.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef ITEMPTR_H +#define ITEMPTR_H + +#include "c.h" +#include "storage/block.h" +#include "storage/off.h" +#include "storage/itemid.h" + +/* + * ItemPointer: + * + * this is a pointer to an item on another disk page in the same file. + * blkid tells us which block, posid tells us which entry in the linp + * (ItemIdData) array we want. + */ +typedef struct ItemPointerData { + BlockIdData ip_blkid; + OffsetNumber ip_posid; +} ItemPointerData; + +typedef ItemPointerData *ItemPointer; + +/* ---------------- + * support macros + * ---------------- + */ + +/* + * ItemPointerIsValid -- + * True iff the disk item pointer is not NULL. + */ +#define ItemPointerIsValid(pointer) \ + ((bool) (PointerIsValid(pointer) && ((pointer)->ip_posid != 0))) + +/* + * ItemPointerGetBlockNumber -- + * Returns the block number of a disk item pointer. + */ +#define ItemPointerGetBlockNumber(pointer) \ + (AssertMacro(ItemPointerIsValid(pointer)) ? \ + BlockIdGetBlockNumber(&(pointer)->ip_blkid) : (BlockNumber) 0) + +/* + * ItemPointerGetOffsetNumber -- + * Returns the offset number of a disk item pointer. + */ +#define ItemPointerGetOffsetNumber(pointer) \ + (AssertMacro(ItemPointerIsValid(pointer)) ? \ + (pointer)->ip_posid : \ + InvalidOffsetNumber) + +/* + * ItemPointerSet -- + * Sets a disk item pointer to the specified block and offset. + */ +#define ItemPointerSet(pointer, blockNumber, offNum) \ + Assert(PointerIsValid(pointer)); \ + BlockIdSet(&((pointer)->ip_blkid), blockNumber); \ + (pointer)->ip_posid = offNum + +/* + * ItemPointerSetBlockNumber -- + * Sets a disk item pointer to the specified block. + */ +#define ItemPointerSetBlockNumber(pointer, blockNumber) \ + Assert(PointerIsValid(pointer)); \ + BlockIdSet(&((pointer)->ip_blkid), blockNumber) + +/* + * ItemPointerSetOffsetNumber -- + * Sets a disk item pointer to the specified offset. + */ +#define ItemPointerSetOffsetNumber(pointer, offsetNumber) \ + AssertMacro(PointerIsValid(pointer)); \ + (pointer)->ip_posid = (offsetNumber) + +/* + * ItemPointerCopy -- + * Copies the contents of one disk item pointer to another. 
+ */ +#define ItemPointerCopy(fromPointer, toPointer) \ + Assert(PointerIsValid(toPointer)); \ + Assert(PointerIsValid(fromPointer)); \ + *(toPointer) = *(fromPointer) + +/* + * ItemPointerSetInvalid -- + * Sets a disk item pointer to be invalid. + */ +#define ItemPointerSetInvalid(pointer) \ + Assert(PointerIsValid(pointer)); \ + BlockIdSet(&((pointer)->ip_blkid), InvalidBlockNumber); \ + (pointer)->ip_posid = InvalidOffsetNumber + +/* ---------------- + * externs + * ---------------- + */ + +extern bool ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2); + +#endif /* ITEMPTR_H */ + diff --git a/src/backend/storage/large_object.h b/src/backend/storage/large_object.h new file mode 100644 index 00000000000..177d2c26e47 --- /dev/null +++ b/src/backend/storage/large_object.h @@ -0,0 +1,58 @@ +/*------------------------------------------------------------------------- + * + * large_object.h-- + * file of info for Postgres large objects. POSTGRES 4.2 supports + * zillions of large objects (internal, external, jaquith, inversion). + * Now we only support inversion. + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: large_object.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef LARGE_OBJECT_H +#define LARGE_OBJECT_H + +#include "c.h" +#include "utils/rel.h" +#include "access/relscan.h" + +/* + * This structure will eventually have lots more stuff associated with it. + */ +typedef struct LargeObjectDesc +{ + Relation heap_r; /* heap relation */ + Relation index_r; /* index relation on seqno attribute */ + IndexScanDesc iscan; /* index scan we're using */ + TupleDesc hdesc; /* heap relation tuple desc */ + TupleDesc idesc; /* index relation tuple desc */ + uint32 lowbyte; /* low byte on the current page */ + uint32 highbyte; /* high byte on the current page */ + uint32 offset; /* current seek pointer */ + ItemPointerData htid; /* tid of current heap tuple */ + +#define IFS_RDLOCK (1 << 0) +#define IFS_WRLOCK (1 << 1) +#define IFS_ATEOF (1 << 2) + + u_long flags; /* locking info, etc */ +} LargeObjectDesc; + +/* + * Function definitions... 
+ */ + +/* inversion stuff in inv_api.c */ +extern LargeObjectDesc *inv_create(int flags); +extern LargeObjectDesc *inv_open(Oid lobjId, int flags); +extern void inv_close(LargeObjectDesc *obj_desc); +extern int inv_destroy(Oid lobjId); +extern int inv_stat(LargeObjectDesc *obj_desc, struct pgstat *stbuf); +extern int inv_seek(LargeObjectDesc *obj_desc, int offset, int whence); +extern int inv_tell(LargeObjectDesc *obj_desc); +extern int inv_read(LargeObjectDesc *obj_desc, char *buf, int nbytes); +extern int inv_write(LargeObjectDesc *obj_desc, char *buf, int nbytes); + +#endif /* LARGE_OBJECT_H */ diff --git a/src/backend/storage/large_object/Makefile.inc b/src/backend/storage/large_object/Makefile.inc new file mode 100644 index 00000000000..fd27b46a49d --- /dev/null +++ b/src/backend/storage/large_object/Makefile.inc @@ -0,0 +1,14 @@ +#------------------------------------------------------------------------- +# +# Makefile.inc-- +# Makefile for storage/large_object +# +# Copyright (c) 1994, Regents of the University of California +# +# +# IDENTIFICATION +# $Header: /cvsroot/pgsql/src/backend/storage/large_object/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $ +# +#------------------------------------------------------------------------- + +SUBSRCS+= inv_api.c diff --git a/src/backend/storage/large_object/inv_api.c b/src/backend/storage/large_object/inv_api.c new file mode 100644 index 00000000000..ae57032f94a --- /dev/null +++ b/src/backend/storage/large_object/inv_api.c @@ -0,0 +1,1165 @@ +/*------------------------------------------------------------------------- + * + * inv_api.c-- + * routines for manipulating inversion fs large objects. This file + * contains the user-level large object application interface routines. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/large_object/inv_api.c,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include <stdio.h> /* for sprintf() */ +#include <sys/file.h> +#include "c.h" +#include "libpq/libpq-fs.h" +#include "access/genam.h" +#include "access/heapam.h" +#include "access/relscan.h" +#include "access/tupdesc.h" +#include "access/xact.h" +#include "access/nbtree.h" +#include "access/tupdesc.h" +#include "catalog/index.h" /* for index_create() */ +#include "catalog/catalog.h" /* for newoid() */ +#include "catalog/pg_am.h" /* for BTREE_AM_OID */ +#include "catalog/pg_opclass.h" /* for INT4_OPS_OID */ +#include "catalog/pg_proc.h" /* for INT4GE_PROC_OID */ +#include "storage/itemptr.h" +#include "storage/bufpage.h" +#include "storage/bufmgr.h" +#include "utils/rel.h" +#include "utils/palloc.h" +#include "storage/large_object.h" +#include "utils/elog.h" +#include "utils/syscache.h" +#include "utils/builtins.h" /* for namestrcpy() */ +#include "catalog/heap.h" +#include "nodes/pg_list.h" + +/* + * Warning, Will Robinson... In order to pack data into an inversion + * file as densely as possible, we violate the class abstraction here. + * When we're appending a new tuple to the end of the table, we check + * the last page to see how much data we can put on it. If it's more + * than IMINBLK, we write enough to fill the page. This limits external + * fragmentation. In no case can we write more than IMAXBLK, since + * the 8K postgres page size less overhead leaves only this much space + * for data. 
+ */ + +#define IFREESPC(p) (PageGetFreeSpace(p) - sizeof(HeapTupleData) - sizeof(struct varlena) - sizeof(int32)) +#define IMAXBLK 8092 +#define IMINBLK 512 + +/* non-export function prototypes */ +static HeapTuple inv_fetchtup(); +static HeapTuple inv_newtuple(); +static int inv_wrnew(LargeObjectDesc *obj_desc, char *buf, int nbytes); +static int inv_wrold(LargeObjectDesc *obj_desc, char *dbuf, int nbytes, + HeapTuple htup, Buffer buffer); +static void inv_indextup(LargeObjectDesc *obj_desc, HeapTuple htup); +static int _inv_getsize(Relation hreln, TupleDesc hdesc, Relation ireln); + +/* + * inv_create -- create a new large object. + * + * Arguments: + * flags -- storage manager to use, archive mode, etc. + * + * Returns: + * large object descriptor, appropriately filled in. + */ +LargeObjectDesc * +inv_create(int flags) +{ + int file_oid; + LargeObjectDesc *retval; + Relation r; + Relation indr; + int smgr; + char archchar; + TupleDesc tupdesc; + AttrNumber attNums[1]; + Oid classObjectId[1]; + char objname[NAMEDATALEN]; + char indname[NAMEDATALEN]; + + /* parse flags */ + smgr = flags & INV_SMGRMASK; + if (flags & INV_ARCHIVE) + archchar = 'h'; + else + archchar = 'n'; + + /* add one here since the pg_class tuple created + will have the next oid and we want to have the relation name + to correspond to the tuple OID */ + file_oid = newoid()+1; + + /* come up with some table names */ + sprintf(objname, "Xinv%d", file_oid); + sprintf(indname, "Xinx%d", file_oid); + + if (SearchSysCacheTuple(RELNAME, PointerGetDatum(objname), + 0,0,0) != NULL) { + elog(WARN, + "internal error: %s already exists -- cannot create large obj", + objname); + } + if (SearchSysCacheTuple(RELNAME, PointerGetDatum(indname), + 0,0,0) != NULL) { + elog(WARN, + "internal error: %s already exists -- cannot create large obj", + indname); + } + + /* this is pretty painful... want a tuple descriptor */ + tupdesc = CreateTemplateTupleDesc(2); + (void) TupleDescInitEntry(tupdesc, (AttrNumber) 1, + "olastbye", + "int4", + 0, false); + (void) TupleDescInitEntry(tupdesc, (AttrNumber) 2, + "odata", + "bytea", + 0, false); + /* + * First create the table to hold the inversion large object. It + * will be located on whatever storage manager the user requested. + */ + + (void) heap_create(objname, + objname, + (int) archchar, smgr, + tupdesc); + + /* make the relation visible in this transaction */ + CommandCounterIncrement(); + r = heap_openr(objname); + + if (!RelationIsValid(r)) { + elog(WARN, "cannot create large object on %s under inversion", + smgrout(smgr)); + } + + /* + * Now create a btree index on the relation's olastbyte attribute to + * make seeks go faster. The hardwired constants are embarassing + * to me, and are symptomatic of the pressure under which this code + * was written. 
+ * + * ok, mao, let's put in some symbolic constants - jolly + */ + + attNums[0] = 1; + classObjectId[0] = INT4_OPS_OID; + index_create(objname, indname, NULL, BTREE_AM_OID, + 1, &attNums[0], &classObjectId[0], + 0, (Datum) NULL, NULL); + + /* make the index visible in this transaction */ + CommandCounterIncrement(); + indr = index_openr(indname); + + if (!RelationIsValid(indr)) { + elog(WARN, "cannot create index for large obj on %s under inversion", + smgrout(smgr)); + } + + retval = (LargeObjectDesc *) palloc(sizeof(LargeObjectDesc)); + + retval->heap_r = r; + retval->index_r = indr; + retval->iscan = (IndexScanDesc) NULL; + retval->hdesc = RelationGetTupleDescriptor(r); + retval->idesc = RelationGetTupleDescriptor(indr); + retval->offset = retval->lowbyte = + retval->highbyte = 0; + ItemPointerSetInvalid(&(retval->htid)); + + if (flags & INV_WRITE) { + RelationSetLockForWrite(r); + retval->flags = IFS_WRLOCK|IFS_RDLOCK; + } else if (flags & INV_READ) { + RelationSetLockForRead(r); + retval->flags = IFS_RDLOCK; + } + retval->flags |= IFS_ATEOF; + + return(retval); +} + +LargeObjectDesc * +inv_open(Oid lobjId, int flags) +{ + LargeObjectDesc *retval; + Relation r; + char *indname; + Relation indrel; + + r = heap_open(lobjId); + + if (!RelationIsValid(r)) + return ((LargeObjectDesc *) NULL); + + indname = pstrdup((r->rd_rel->relname).data); + + /* + * hack hack hack... we know that the fourth character of the relation + * name is a 'v', and that the fourth character of the index name is an + * 'x', and that they're otherwise identical. + */ + indname[3] = 'x'; + indrel = index_openr(indname); + + if (!RelationIsValid(indrel)) + return ((LargeObjectDesc *) NULL); + + retval = (LargeObjectDesc *) palloc(sizeof(LargeObjectDesc)); + + retval->heap_r = r; + retval->index_r = indrel; + retval->iscan = (IndexScanDesc) NULL; + retval->hdesc = RelationGetTupleDescriptor(r); + retval->idesc = RelationGetTupleDescriptor(indrel); + retval->offset = retval->lowbyte = retval->highbyte = 0; + ItemPointerSetInvalid(&(retval->htid)); + + if (flags & INV_WRITE) { + RelationSetLockForWrite(r); + retval->flags = IFS_WRLOCK|IFS_RDLOCK; + } else if (flags & INV_READ) { + RelationSetLockForRead(r); + retval->flags = IFS_RDLOCK; + } + + return(retval); +} + +/* + * Closes an existing large object descriptor. + */ +void +inv_close(LargeObjectDesc *obj_desc) +{ + Assert(PointerIsValid(obj_desc)); + + if (obj_desc->iscan != (IndexScanDesc) NULL) + index_endscan(obj_desc->iscan); + + heap_close(obj_desc->heap_r); + index_close(obj_desc->index_r); + + pfree(obj_desc); +} + +/* + * Destroys an existing large object, and frees its associated pointers. + * + * returns -1 if failed + */ +int +inv_destroy(Oid lobjId) +{ + Relation r; + + r = (Relation) RelationIdGetRelation(lobjId); + if (!RelationIsValid(r) || r->rd_rel->relkind == RELKIND_INDEX) + return -1; + + heap_destroy(r->rd_rel->relname.data); + return 1; +} + +/* + * inv_stat() -- do a stat on an inversion file. + * + * For the time being, this is an insanely expensive operation. In + * order to find the size of the file, we seek to the last block in + * it and compute the size from that. We scan pg_class to determine + * the file's owner and create time. We don't maintain mod time or + * access time, yet. + * + * These fields aren't stored in a table anywhere because they're + * updated so frequently, and postgres only appends tuples at the + * end of relations. Once clustering works, we should fix this. 
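+ *
+ * Finding the size costs a backward probe of the olastbyte index plus
+ * a heap fetch (see _inv_getsize() below).  A caller uses it roughly
+ * like:
+ *
+ *	struct pgstat st;
+ *
+ *	if (inv_stat(obj_desc, &st) == 0)
+ *		... st.st_size is the object's length in bytes ...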
+ */ +int +inv_stat(LargeObjectDesc *obj_desc, struct pgstat *stbuf) +{ + Assert(PointerIsValid(obj_desc)); + Assert(stbuf != NULL); + + /* need read lock for stat */ + if (!(obj_desc->flags & IFS_RDLOCK)) { + RelationSetLockForRead(obj_desc->heap_r); + obj_desc->flags |= IFS_RDLOCK; + } + + stbuf->st_ino = obj_desc->heap_r->rd_id; +#if 1 + stbuf->st_mode = (S_IFREG | 0666); /* IFREG|rw-rw-rw- */ +#else + stbuf->st_mode = 100666; /* IFREG|rw-rw-rw- */ +#endif + stbuf->st_size = _inv_getsize(obj_desc->heap_r, + obj_desc->hdesc, + obj_desc->index_r); + + stbuf->st_uid = obj_desc->heap_r->rd_rel->relowner; + + /* we have no good way of computing access times right now */ + stbuf->st_atime_s = stbuf->st_mtime_s = stbuf->st_ctime_s = 0; + + return (0); +} + +int +inv_seek(LargeObjectDesc *obj_desc, int offset, int whence) +{ + int oldOffset; + Datum d; + ScanKeyData skey; + + Assert(PointerIsValid(obj_desc)); + + if (whence == SEEK_CUR) { + offset += obj_desc->offset; /* calculate absolute position */ + return (inv_seek(obj_desc, offset, SEEK_SET)); + } + + /* + * if you seek past the end (offset > 0) I have + * no clue what happens :-( B.L. 9/1/93 + */ + if (whence == SEEK_END) { + /* need read lock for getsize */ + if (!(obj_desc->flags & IFS_RDLOCK)) { + RelationSetLockForRead(obj_desc->heap_r); + obj_desc->flags |= IFS_RDLOCK; + } + offset += _inv_getsize(obj_desc->heap_r, + obj_desc->hdesc, + obj_desc->index_r ); + return (inv_seek(obj_desc, offset, SEEK_SET)); + } + + /* + * Whenever we do a seek, we turn off the EOF flag bit to force + * ourselves to check for real on the next read. + */ + + obj_desc->flags &= ~IFS_ATEOF; + oldOffset = obj_desc->offset; + obj_desc->offset = offset; + + /* try to avoid doing any work, if we can manage it */ + if (offset >= obj_desc->lowbyte + && offset <= obj_desc->highbyte + && oldOffset <= obj_desc->highbyte + && obj_desc->iscan != (IndexScanDesc) NULL) + return (offset); + + /* + * To do a seek on an inversion file, we start an index scan that + * will bring us to the right place. Each tuple in an inversion file + * stores the offset of the last byte that appears on it, and we have + * an index on this. 
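+ *
+ * For example, a seek to offset 100000 (re)positions the scan with the
+ * qual "olastbyte >= 100000" (hence INT4GE_PROC_OID below), so the
+ * next index_getnext() should land on the block whose byte range
+ * covers that offset, if one exists.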
+ */ + + + /* right now, just assume that the operation is SEEK_SET */ + if (obj_desc->iscan != (IndexScanDesc) NULL) { + d = Int32GetDatum(offset); + btmovescan(obj_desc->iscan, d); + } else { + + ScanKeyEntryInitialize(&skey, 0x0, 1, INT4GE_PROC_OID, + Int32GetDatum(offset)); + + obj_desc->iscan = index_beginscan(obj_desc->index_r, + (bool) 0, (uint16) 1, + &skey); + } + + return (offset); +} + +int +inv_tell(LargeObjectDesc *obj_desc) +{ + Assert(PointerIsValid(obj_desc)); + + return (obj_desc->offset); +} + +int +inv_read(LargeObjectDesc *obj_desc, char *buf, int nbytes) +{ + HeapTuple htup; + Buffer b; + int nread; + int off; + int ncopy; + Datum d; + struct varlena *fsblock; + bool isNull; + + Assert(PointerIsValid(obj_desc)); + Assert(buf != NULL); + + /* if we're already at EOF, we don't need to do any work here */ + if (obj_desc->flags & IFS_ATEOF) + return (0); + + /* make sure we obey two-phase locking */ + if (!(obj_desc->flags & IFS_RDLOCK)) { + RelationSetLockForRead(obj_desc->heap_r); + obj_desc->flags |= IFS_RDLOCK; + } + + nread = 0; + + /* fetch a block at a time */ + while (nread < nbytes) { + + /* fetch an inversion file system block */ + htup = inv_fetchtup(obj_desc, &b); + + if (!HeapTupleIsValid(htup)) { + obj_desc->flags |= IFS_ATEOF; + break; + } + + /* copy the data from this block into the buffer */ + d = (Datum) heap_getattr(htup, b, 2, obj_desc->hdesc, &isNull); + fsblock = (struct varlena *) DatumGetPointer(d); + + off = obj_desc->offset - obj_desc->lowbyte; + ncopy = obj_desc->highbyte - obj_desc->offset + 1; + if (ncopy > (nbytes - nread)) + ncopy = (nbytes - nread); + memmove(buf, &(fsblock->vl_dat[off]), ncopy); + + /* be a good citizen */ + ReleaseBuffer(b); + + /* move pointers past the amount we just read */ + buf += ncopy; + nread += ncopy; + obj_desc->offset += ncopy; + } + + /* that's it */ + return (nread); +} + +int +inv_write(LargeObjectDesc *obj_desc, char *buf, int nbytes) +{ + HeapTuple htup; + Buffer b; + int nwritten; + int tuplen; + + Assert(PointerIsValid(obj_desc)); + Assert(buf != NULL); + + /* + * Make sure we obey two-phase locking. A write lock entitles you + * to read the relation, as well. + */ + + if (!(obj_desc->flags & IFS_WRLOCK)) { + RelationSetLockForRead(obj_desc->heap_r); + obj_desc->flags |= (IFS_WRLOCK|IFS_RDLOCK); + } + + nwritten = 0; + + /* write a block at a time */ + while (nwritten < nbytes) { + + /* + * Fetch the current inversion file system block. If the + * class storing the inversion file is empty, we don't want + * to do an index lookup, since index lookups choke on empty + * files (should be fixed someday). + */ + + if ((obj_desc->flags & IFS_ATEOF) + || obj_desc->heap_r->rd_nblocks == 0) + htup = (HeapTuple) NULL; + else + htup = inv_fetchtup(obj_desc, &b); + + /* either append or replace a block, as required */ + if (!HeapTupleIsValid(htup)) { + tuplen = inv_wrnew(obj_desc, buf, nbytes - nwritten); + } else { + if (obj_desc->offset > obj_desc->highbyte) + tuplen = inv_wrnew(obj_desc, buf, nbytes - nwritten); + else + tuplen = inv_wrold(obj_desc, buf, nbytes - nwritten, htup, b); + } + + /* move pointers past the amount we just wrote */ + buf += tuplen; + nwritten += tuplen; + obj_desc->offset += tuplen; + } + + /* that's it */ + return (nwritten); +} + +/* + * inv_fetchtup -- Fetch an inversion file system block. + * + * This routine finds the file system block containing the offset + * recorded in the obj_desc structure. 
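+ * If the cached block bounds (obj_desc->lowbyte/highbyte) still cover
+ * the offset and we remember a heap tid, we simply refetch that tuple;
+ * otherwise we (re)position the index scan and follow it to the heap.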
Later, we need to think about + * the effects of non-functional updates (can you rewrite the same + * block twice in a single transaction?), but for now, we won't bother. + * + * Parameters: + * obj_desc -- the object descriptor. + * bufP -- pointer to a buffer in the buffer cache; caller + * must free this. + * + * Returns: + * A heap tuple containing the desired block, or NULL if no + * such tuple exists. + */ +static HeapTuple +inv_fetchtup(LargeObjectDesc *obj_desc, Buffer *bufP) +{ + HeapTuple htup; + RetrieveIndexResult res; + Datum d; + int firstbyte, lastbyte; + struct varlena *fsblock; + bool isNull; + + /* + * If we've exhausted the current block, we need to get the next one. + * When we support time travel and non-functional updates, we will + * need to loop over the blocks, rather than just have an 'if', in + * order to find the one we're really interested in. + */ + + if (obj_desc->offset > obj_desc->highbyte + || obj_desc->offset < obj_desc->lowbyte + || !ItemPointerIsValid(&(obj_desc->htid))) { + + /* initialize scan key if not done */ + if (obj_desc->iscan==(IndexScanDesc)NULL) { + ScanKeyData skey; + + ScanKeyEntryInitialize(&skey, 0x0, 1, INT4GE_PROC_OID, + Int32GetDatum(0)); + obj_desc->iscan = + index_beginscan(obj_desc->index_r, + (bool) 0, (uint16) 1, + &skey); + } + + do { + res = index_getnext(obj_desc->iscan, ForwardScanDirection); + + if (res == (RetrieveIndexResult) NULL) { + ItemPointerSetInvalid(&(obj_desc->htid)); + return ((HeapTuple) NULL); + } + + /* + * For time travel, we need to use the actual time qual here, + * rather that NowTimeQual. We currently have no way to pass + * a time qual in. + */ + + htup = heap_fetch(obj_desc->heap_r, NowTimeQual, + &(res->heap_iptr), bufP); + + } while (htup == (HeapTuple) NULL); + + /* remember this tid -- we may need it for later reads/writes */ + ItemPointerCopy(&(res->heap_iptr), &(obj_desc->htid)); + + } else { + htup = heap_fetch(obj_desc->heap_r, NowTimeQual, + &(obj_desc->htid), bufP); + } + + /* + * By here, we have the heap tuple we're interested in. We cache + * the upper and lower bounds for this block in the object descriptor + * and return the tuple. + */ + + d = (Datum)heap_getattr(htup, *bufP, 1, obj_desc->hdesc, &isNull); + lastbyte = (int32) DatumGetInt32(d); + d = (Datum)heap_getattr(htup, *bufP, 2, obj_desc->hdesc, &isNull); + fsblock = (struct varlena *) DatumGetPointer(d); + + /* order of + and - is important -- these are unsigned quantites near 0 */ + firstbyte = (lastbyte + 1 + sizeof(fsblock->vl_len)) - fsblock->vl_len; + + obj_desc->lowbyte = firstbyte; + obj_desc->highbyte = lastbyte; + + /* done */ + return (htup); +} + +/* + * inv_wrnew() -- append a new filesystem block tuple to the inversion + * file. + * + * In response to an inv_write, we append one or more file system + * blocks to the class containing the large object. We violate the + * class abstraction here in order to pack things as densely as we + * are able. We examine the last page in the relation, and write + * just enough to fill it, assuming that it has above a certain + * threshold of space available. If the space available is less than + * the threshold, we allocate a new page by writing a big tuple. + * + * By the time we get here, we know all the parameters passed in + * are valid, and that we hold the appropriate lock on the heap + * relation. + * + * Parameters: + * obj_desc: large object descriptor for which to append block. + * buf: buffer containing data to write. 
+ * nbytes: amount to write + * + * Returns: + * number of bytes actually written to the new tuple. + */ +static int +inv_wrnew(LargeObjectDesc *obj_desc, char *buf, int nbytes) +{ + Relation hr; + HeapTuple ntup; + Buffer buffer; + Page page; + int nblocks; + int nwritten; + + hr = obj_desc->heap_r; + + /* + * Get the last block in the relation. If there's no data in the + * relation at all, then we just get a new block. Otherwise, we + * check the last block to see whether it has room to accept some + * or all of the data that the user wants to write. If it doesn't, + * then we allocate a new block. + */ + + nblocks = RelationGetNumberOfBlocks(hr); + + if (nblocks > 0) + buffer = ReadBuffer(hr, nblocks - 1); + else + buffer = ReadBuffer(hr, P_NEW); + + page = BufferGetPage(buffer); + + /* + * If the last page is too small to hold all the data, and it's too + * small to hold IMINBLK, then we allocate a new page. If it will + * hold at least IMINBLK, but less than all the data requested, then + * we write IMINBLK here. The caller is responsible for noticing that + * less than the requested number of bytes were written, and calling + * this routine again. + */ + + nwritten = IFREESPC(page); + if (nwritten < nbytes) { + if (nwritten < IMINBLK) { + ReleaseBuffer(buffer); + buffer = ReadBuffer(hr, P_NEW); + page = BufferGetPage(buffer); + PageInit(page, BufferGetPageSize(buffer), 0); + if (nbytes > IMAXBLK) + nwritten = IMAXBLK; + else + nwritten = nbytes; + } + } else { + nwritten = nbytes; + } + + /* + * Insert a new file system block tuple, index it, and write it out. + */ + + ntup = inv_newtuple(obj_desc, buffer, page, buf, nwritten); + inv_indextup(obj_desc, ntup); + + /* new tuple is inserted */ + WriteBuffer(buffer); + + return (nwritten); +} + +static int +inv_wrold(LargeObjectDesc *obj_desc, + char *dbuf, + int nbytes, + HeapTuple htup, + Buffer buffer) +{ + Relation hr; + HeapTuple ntup; + Buffer newbuf; + Page page; + Page newpage; + int tupbytes; + Datum d; + struct varlena *fsblock; + int nwritten, nblocks, freespc; + bool isNull; + int keep_offset; + + /* + * Since we're using a no-overwrite storage manager, the way we + * overwrite blocks is to mark the old block invalid and append + * a new block. First mark the old block invalid. This violates + * the tuple abstraction. + */ + + TransactionIdStore(GetCurrentTransactionId(), &(htup->t_xmax)); + htup->t_cmax = GetCurrentCommandId(); + + /* + * If we're overwriting the entire block, we're lucky. All we need + * to do is to insert a new block. + */ + + if (obj_desc->offset == obj_desc->lowbyte + && obj_desc->lowbyte + nbytes >= obj_desc->highbyte) { + WriteBuffer(buffer); + return (inv_wrnew(obj_desc, dbuf, nbytes)); + } + + /* + * By here, we need to overwrite part of the data in the current + * tuple. In order to reduce the degree to which we fragment blocks, + * we guarantee that no block will be broken up due to an overwrite. + * This means that we need to allocate a tuple on a new page, if + * there's not room for the replacement on this one. + */ + + newbuf = buffer; + page = BufferGetPage(buffer); + newpage = BufferGetPage(newbuf); + hr = obj_desc->heap_r; + freespc = IFREESPC(page); + d = (Datum)heap_getattr(htup, buffer, 2, obj_desc->hdesc, &isNull); + fsblock = (struct varlena *) DatumGetPointer(d); + tupbytes = fsblock->vl_len - sizeof(fsblock->vl_len); + + if (freespc < tupbytes) { + + /* + * First see if there's enough space on the last page of the + * table to put this tuple. 
+ */ + + nblocks = RelationGetNumberOfBlocks(hr); + + if (nblocks > 0) + newbuf = ReadBuffer(hr, nblocks - 1); + else + newbuf = ReadBuffer(hr, P_NEW); + + newpage = BufferGetPage(newbuf); + freespc = IFREESPC(newpage); + + /* + * If there's no room on the last page, allocate a new last + * page for the table, and put it there. + */ + + if (freespc < tupbytes) { + ReleaseBuffer(newbuf); + newbuf = ReadBuffer(hr, P_NEW); + newpage = BufferGetPage(newbuf); + PageInit(newpage, BufferGetPageSize(newbuf), 0); + } + } + + nwritten = nbytes; + if (nwritten > obj_desc->highbyte - obj_desc->offset + 1) + nwritten = obj_desc->highbyte - obj_desc->offset + 1; + memmove(VARDATA(fsblock)+ (obj_desc->offset - obj_desc->lowbyte), + dbuf,nwritten); + /* we are rewriting the entire old block, therefore + we reset offset to the lowbyte of the original block + before jumping into inv_newtuple() */ + keep_offset = obj_desc->offset; + obj_desc->offset = obj_desc->lowbyte; + ntup = inv_newtuple(obj_desc, newbuf, newpage, VARDATA(fsblock), + tupbytes); + /* after we are done, we restore to the true offset */ + obj_desc->offset = keep_offset; + + /* + * By here, we have a page (newpage) that's guaranteed to have + * enough space on it to put the new tuple. Call inv_newtuple + * to do the work. Passing NULL as a buffer to inv_newtuple() + * keeps it from copying any data into the new tuple. When it + * returns, the tuple is ready to receive data from the old + * tuple and the user's data buffer. + */ +/* + ntup = inv_newtuple(obj_desc, newbuf, newpage, (char *) NULL, tupbytes); + dptr = ((char *) ntup) + ntup->t_hoff - sizeof(ntup->t_bits) + sizeof(int4) + + sizeof(fsblock->vl_len); + + if (obj_desc->offset > obj_desc->lowbyte) { + memmove(dptr, + &(fsblock->vl_dat[0]), + obj_desc->offset - obj_desc->lowbyte); + dptr += obj_desc->offset - obj_desc->lowbyte; + } + + + nwritten = nbytes; + if (nwritten > obj_desc->highbyte - obj_desc->offset + 1) + nwritten = obj_desc->highbyte - obj_desc->offset + 1; + + memmove(dptr, dbuf, nwritten); + dptr += nwritten; + + if (obj_desc->offset + nwritten < obj_desc->highbyte + 1) { +*/ +/* + loc = (obj_desc->highbyte - obj_desc->offset) + + nwritten; + sz = obj_desc->highbyte - (obj_desc->lowbyte + loc); + + what's going on here?? - jolly +*/ +/* + sz = (obj_desc->highbyte + 1) - (obj_desc->offset + nwritten); + memmove(&(fsblock->vl_dat[0]), dptr, sz); + } +*/ + + + /* index the new tuple */ + inv_indextup(obj_desc, ntup); + + /* move the scandesc forward so we don't reread the newly inserted + tuple on the next index scan */ + if (obj_desc->iscan) + index_getnext(obj_desc->iscan, ForwardScanDirection); + + /* + * Okay, by here, a tuple for the new block is correctly placed, + * indexed, and filled. Write the changed pages out. + */ + + WriteBuffer(buffer); + if (newbuf != buffer) + WriteBuffer(newbuf); + + /* done */ + return (nwritten); +} + +static HeapTuple +inv_newtuple(LargeObjectDesc *obj_desc, + Buffer buffer, + Page page, + char *dbuf, + int nwrite) +{ + HeapTuple ntup; + PageHeader ph; + int tupsize; + int hoff; + Offset lower; + Offset upper; + ItemId itemId; + OffsetNumber off; + OffsetNumber limit; + char *attptr; + + /* compute tuple size -- no nulls */ + hoff = sizeof(HeapTupleData) - sizeof(ntup->t_bits); + + /* add in olastbyte, varlena.vl_len, varlena.vl_dat */ + tupsize = hoff + (2 * sizeof(int32)) + nwrite; + tupsize = LONGALIGN(tupsize); + + /* + * Allocate the tuple on the page, violating the page abstraction. + * This code was swiped from PageAddItem(). 
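+ *
+ * The finished tuple is laid out as: heap tuple header (hoff bytes),
+ * the olastbyte int32, the varlena length word, and then the nwrite
+ * data bytes; tupsize is the LONGALIGN'd sum of those four pieces.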
+ */ + + ph = (PageHeader) page; + limit = OffsetNumberNext(PageGetMaxOffsetNumber(page)); + + /* look for "recyclable" (unused & deallocated) ItemId */ + for (off = FirstOffsetNumber; off < limit; off = OffsetNumberNext(off)) { + itemId = &ph->pd_linp[off - 1]; + if ((((*itemId).lp_flags & LP_USED) == 0) && + ((*itemId).lp_len == 0)) + break; + } + + if (off > limit) + lower = (Offset) (((char *) (&ph->pd_linp[off])) - ((char *) page)); + else if (off == limit) + lower = ph->pd_lower + sizeof (ItemIdData); + else + lower = ph->pd_lower; + + upper = ph->pd_upper - tupsize; + + itemId = &ph->pd_linp[off - 1]; + (*itemId).lp_off = upper; + (*itemId).lp_len = tupsize; + (*itemId).lp_flags = LP_USED; + ph->pd_lower = lower; + ph->pd_upper = upper; + + ntup = (HeapTuple) ((char *) page + upper); + + /* + * Tuple is now allocated on the page. Next, fill in the tuple + * header. This block of code violates the tuple abstraction. + */ + + ntup->t_len = tupsize; + ItemPointerSet(&(ntup->t_ctid), BufferGetBlockNumber(buffer), off); + ItemPointerSetInvalid(&(ntup->t_chain)); + LastOidProcessed = ntup->t_oid = newoid(); + TransactionIdStore(GetCurrentTransactionId(), &(ntup->t_xmin)); + ntup->t_cmin = GetCurrentCommandId(); + StoreInvalidTransactionId(&(ntup->t_xmax)); + ntup->t_cmax = 0; + ntup->t_tmin = INVALID_ABSTIME; + ntup->t_tmax = CURRENT_ABSTIME; + ntup->t_natts = 2; + ntup->t_hoff = hoff; + ntup->t_vtype = 0; + ntup->t_infomask = 0x0; + + /* if a NULL is passed in, avoid the calculations below */ + if (dbuf == NULL) + return ntup; + + /* + * Finally, copy the user's data buffer into the tuple. This violates + * the tuple and class abstractions. + */ + + attptr = ((char *) ntup) + hoff; + *((int32 *) attptr) = obj_desc->offset + nwrite - 1; + attptr += sizeof(int32); + + /* + ** mer fixed disk layout of varlenas to get rid of the need for this. + ** + ** *((int32 *) attptr) = nwrite + sizeof(int32); + ** attptr += sizeof(int32); + */ + + *((int32 *) attptr) = nwrite + sizeof(int32); + attptr += sizeof(int32); + + /* + * If a data buffer was passed in, then copy the data from the buffer + * to the tuple. Some callers (eg, inv_wrold()) may not pass in a + * buffer, since they have to copy part of the old tuple data and + * part of the user's new data into the new tuple. 
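+ *
+ * (As the code stands above, inv_wrold() patches the old block image
+ * in place and passes VARDATA(fsblock) here, so the NULL-buffer path
+ * is only taken by the commented-out variant.)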
+ */ + + if (dbuf != (char *) NULL) + memmove(attptr, dbuf, nwrite); + + /* keep track of boundary of current tuple */ + obj_desc->lowbyte = obj_desc->offset; + obj_desc->highbyte = obj_desc->offset + nwrite - 1; + + /* new tuple is filled -- return it */ + return (ntup); +} + +static void +inv_indextup(LargeObjectDesc *obj_desc, HeapTuple htup) +{ + IndexTuple itup; + InsertIndexResult res; + Datum v[1]; + char n[1]; + + n[0] = ' '; + v[0] = Int32GetDatum(obj_desc->highbyte); + itup = index_formtuple(obj_desc->idesc, &v[0], &n[0]); + memmove((char *)&(itup->t_tid), + (char *)&(htup->t_ctid), + sizeof(ItemPointerData)); + res = index_insert(obj_desc->index_r, itup); + + if (res) + pfree(res); + + pfree(itup); +} + +/* +static void +DumpPage(Page page, int blkno) +{ + ItemId lp; + HeapTuple tup; + int flags, i, nline; + ItemPointerData pointerData; + + printf("\t[subblock=%d]:lower=%d:upper=%d:special=%d\n", 0, + ((PageHeader)page)->pd_lower, ((PageHeader)page)->pd_upper, + ((PageHeader)page)->pd_special); + + printf("\t:MaxOffsetNumber=%d\n", + (int16) PageGetMaxOffsetNumber(page)); + + nline = (int16) PageGetMaxOffsetNumber(page); + +{ + int i; + char *cp; + + i = PageGetSpecialSize(page); + cp = PageGetSpecialPointer(page); + + printf("\t:SpecialData="); + + while (i > 0) { + printf(" 0x%02x", *cp); + cp += 1; + i -= 1; + } + printf("\n"); +} + for (i = 0; i < nline; i++) { + lp = ((PageHeader)page)->pd_linp + i; + flags = (*lp).lp_flags; + ItemPointerSet(&pointerData, blkno, 1 + i); + printf("%s:off=%d:flags=0x%x:len=%d", + ItemPointerFormExternal(&pointerData), (*lp).lp_off, + flags, (*lp).lp_len); + + if (flags & LP_USED) { + HeapTupleData htdata; + + printf(":USED"); + + memmove((char *) &htdata, + (char *) &((char *)page)[(*lp).lp_off], + sizeof(htdata)); + + tup = &htdata; + + printf("\n\t:ctid=%s:oid=%d", + ItemPointerFormExternal(&tup->t_ctid), + tup->t_oid); + printf(":natts=%d:thoff=%d:vtype=`%c' (0x%02x):", + tup->t_natts, + tup->t_hoff, tup->t_vtype, tup->t_vtype); + + printf("\n\t:tmin=%d:cmin=%u:", + tup->t_tmin, tup->t_cmin); + + printf("xmin=%u:", tup->t_xmin); + + printf("\n\t:tmax=%d:cmax=%u:", + tup->t_tmax, tup->t_cmax); + + printf("xmax=%u:", tup->t_xmax); + + printf("\n\t:chain=%s:\n", + ItemPointerFormExternal(&tup->t_chain)); + } else + putchar('\n'); + } +} + +static char* +ItemPointerFormExternal(ItemPointer pointer) +{ + static char itemPointerString[32]; + + if (!ItemPointerIsValid(pointer)) { + memmove(itemPointerString, "<-,-,->", sizeof "<-,-,->"); + } else { + sprintf(itemPointerString, "<%u,%u>", + ItemPointerGetBlockNumber(pointer), + ItemPointerGetOffsetNumber(pointer)); + } + + return (itemPointerString); +} +*/ + +static int +_inv_getsize(Relation hreln, TupleDesc hdesc, Relation ireln) +{ + IndexScanDesc iscan; + RetrieveIndexResult res; + Buffer buf; + HeapTuple htup; + Datum d; + long size; + bool isNull; + + /* scan backwards from end */ + iscan = index_beginscan(ireln, (bool) 1, 0, (ScanKey) NULL); + + buf = InvalidBuffer; + + do { + res = index_getnext(iscan, BackwardScanDirection); + + /* + * If there are no more index tuples, then the relation is empty, + * so the file's size is zero. + */ + + if (res == (RetrieveIndexResult) NULL) { + index_endscan(iscan); + return (0); + } + + /* + * For time travel, we need to use the actual time qual here, + * rather that NowTimeQual. We currently have no way to pass + * a time qual in. 
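+ *
+ * We loop here because the index entry found by the backward scan may
+ * point at a heap tuple that is not visible under NowTimeQual; in that
+ * case heap_fetch() returns NULL and we step back to the next entry.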
+ */ + + if (buf != InvalidBuffer) + (void) ReleaseBuffer(buf); + + htup = heap_fetch(hreln, NowTimeQual, &(res->heap_iptr), &buf); + + } while (!HeapTupleIsValid(htup)); + + /* don't need the index scan anymore */ + index_endscan(iscan); + + /* get olastbyte attribute */ + d = (Datum) heap_getattr(htup, buf, 1, hdesc, &isNull); + size = DatumGetInt32(d) + 1; + + /* wei hates it if you forget to do this */ + ReleaseBuffer(buf); + + return (size); +} diff --git a/src/backend/storage/lmgr.h b/src/backend/storage/lmgr.h new file mode 100644 index 00000000000..fe87eb05546 --- /dev/null +++ b/src/backend/storage/lmgr.h @@ -0,0 +1,84 @@ +/*------------------------------------------------------------------------- + * + * lmgr.h-- + * POSTGRES lock manager definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: lmgr.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef LMGR_H +#define LMGR_H + +#include "postgres.h" + +#include "storage/itemptr.h" +#include "storage/lock.h" +#include "utils/rel.h" + +/* + * This was moved from pladt.h for the new lock manager. Want to obsolete + * all of the old code. + */ +typedef struct LRelId { + Oid relId; /* a relation identifier */ + Oid dbId; /* a database identifier */ +} LRelId; + +typedef struct LockInfoData { + bool initialized; + LRelId lRelId; + TransactionId transactionIdData; + uint16 flags; +} LockInfoData; +typedef LockInfoData *LockInfo; + +#define LockInfoIsValid(linfo) \ + ((PointerIsValid(linfo)) && ((LockInfo) linfo)->initialized) + + +extern LRelId RelationGetLRelId(Relation relation); +extern Oid LRelIdGetDatabaseId(LRelId lRelId); +extern Oid LRelIdGetRelationId(LRelId lRelId); +extern bool DatabaseIdIsMyDatabaseId(Oid databaseId); +extern bool LRelIdContainsMyDatabaseId(LRelId lRelId); +extern void RelationInitLockInfo(Relation relation); +extern void RelationDiscardLockInfo(Relation relation); +extern void RelationSetLockForDescriptorOpen(Relation relation); +extern void RelationSetLockForRead(Relation relation); +extern void RelationUnsetLockForRead(Relation relation); +extern void RelationSetLockForWrite(Relation relation); +extern void RelationUnsetLockForWrite(Relation relation); +extern void RelationSetLockForTupleRead(Relation relation, + ItemPointer itemPointer); + +/* used in vaccum.c */ +extern void RelationSetLockForWritePage(Relation relation, + ItemPointer itemPointer); + +/* used in nbtpage.c, hashpage.c */ +extern void RelationSetSingleWLockPage(Relation relation, + ItemPointer itemPointer); +extern void RelationUnsetSingleWLockPage(Relation relation, + ItemPointer itemPointer); +extern void RelationSetSingleRLockPage(Relation relation, + ItemPointer itemPointer); +extern void RelationUnsetSingleRLockPage(Relation relation, + ItemPointer itemPointer); +extern void RelationSetRIntentLock(Relation relation); +extern void RelationUnsetRIntentLock(Relation relation); +extern void RelationSetWIntentLock(Relation relation); +extern void RelationUnsetWIntentLock(Relation relation); +extern void RelationSetLockForExtend(Relation relation); +extern void RelationUnsetLockForExtend(Relation relation); +extern void LRelIdAssign(LRelId *lRelId, Oid dbId, Oid relId); + +/* single.c */ +extern bool SingleLockReln(LockInfo linfo, LOCKT lockt, int action); +extern bool SingleLockPage(LockInfo linfo, ItemPointer tidPtr, + LOCKT lockt, int action); + +#endif /* LMGR_H */ diff --git 
a/src/backend/storage/lmgr/Makefile.inc b/src/backend/storage/lmgr/Makefile.inc new file mode 100644 index 00000000000..ac507558b57 --- /dev/null +++ b/src/backend/storage/lmgr/Makefile.inc @@ -0,0 +1,14 @@ +#------------------------------------------------------------------------- +# +# Makefile.inc-- +# Makefile for storage/lmgr +# +# Copyright (c) 1994, Regents of the University of California +# +# +# IDENTIFICATION +# $Header: /cvsroot/pgsql/src/backend/storage/lmgr/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $ +# +#------------------------------------------------------------------------- + +SUBSRCS+= lmgr.c lock.c multi.c proc.c single.c diff --git a/src/backend/storage/lmgr/README b/src/backend/storage/lmgr/README new file mode 100644 index 00000000000..e382003f2a4 --- /dev/null +++ b/src/backend/storage/lmgr/README @@ -0,0 +1,93 @@ +$Header: /cvsroot/pgsql/src/backend/storage/lmgr/README,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $ + +This file is an attempt to save me (and future code maintainers) some +time and a lot of headaches. The existing lock manager code at the time +of this writing (June 16 1992) can best be described as confusing. The +complexity seems inherent in lock manager functionality, but variable +names chosen in the current implementation really confuse me everytime +I have to track down a bug. Also, what gets done where and by whom isn't +always clear.... + +Starting with the data structures the lock manager relies upon... + +(NOTE - these will undoubtedly change over time and it is likely +that this file won't always be updated along with the structs.) + +The lock manager's LOCK: + +tag - + The key fields that are used for hashing locks in the shared memory + lock hash table. This is kept as a separate struct to ensure that we + always zero out the correct number of bytes. This is a problem as + part of the tag is an itempointer which is 6 bytes and causes 2 + additional bytes to be added as padding. + + tag.relId - + Uniquely identifies the relation that the lock corresponds to. + + tag.dbId - + Uniquely identifies the database in which the relation lives. If + this is a shared system relation (e.g. pg_user) the dbId should be + set to 0. + + tag.tupleId - + Uniquely identifies the block/page within the relation and the + tuple within the block. If we are setting a table level lock + both the blockId and tupleId (in an item pointer this is called + the position) are set to invalid, if it is a page level lock the + blockId is valid, while the tuleId is still invalid. Finally if + this is a tuple level lock (we currently never do this) then both + the blockId and tupleId are set to valid specifications. This is + how we get the appearance of a multi-level lock table while using + only a single table (see Gray's paper on 2 phase locking if + you are puzzled about how multi-level lock tables work). + +mask - + This field indicates what types of locks are currently held in the + given lock. It is used (against the lock table's conflict table) + to determine if the new lock request will conflict with existing + lock types held. Conficts are determined by bitwise AND operations + between the mask and the conflict table entry for the given lock type + to be set. The current representation is that each bit (1 through 5) + is set when that lock type (WRITE, READ, WRITE INTENT, READ INTENT, EXTEND) + has been acquired for the lock. 
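+
+    For example, if the conflict table entry for WRITE has both the READ
+    and WRITE bits set, then a new WRITE request conflicts exactly when
+    (mask & conflictTab[WRITE]) is non-zero, i.e. when some backend
+    already holds a READ or WRITE on this lock.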
+ +waitProcs - + This is a shared memory queue of all process structures corresponding to + a backend that is waiting (sleeping) until another backend releases this + lock. The process structure holds the information needed to determine + if it should be woken up when this lock is released. If, for example, + we are releasing a read lock and the process is sleeping trying to acquire + a read lock then there is no point in waking it since the lock being + released isn't what caused it to sleep in the first place. There will + be more on this below (when I get to releasing locks and waking sleeping + process routines). + +nHolding - + Keeps a count of how many times this lock has been attempted to be + acquired. The count includes attempts by processes which were put + to sleep due to conflicts. It also counts the same backend twice + if, for example, a backend process first acquires a read and then + acquires a write. + +holders - + Keeps a count of how many locks of each type have been attempted. Only + elements 1 through MAX_LOCK_TYPES are used as they correspond to the lock + type defined constants (WRITE through EXTEND). Summing the values of + holders should come out equal to nHolding. + +nActive - + Keeps a count of how many times this lock has been succesfully acquired. + This count does not include attempts that were rejected due to conflicts, + but can count the same backend twice (e.g. a read then a write -- since + its the same transaction this won't cause a conflict) + +activeHolders - + Keeps a count of how locks of each type are currently held. Once again + only elements 1 through MAX_LOCK_TYPES are used (0 is not). Also, like + holders, summing the values of activeHolders should total to the value + of nActive. + + +This is all I had the stomach for right now..... I will get back to this +someday. 
-mer 17 June 1992 12:00 am diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c new file mode 100644 index 00000000000..bfc2f5b2eec --- /dev/null +++ b/src/backend/storage/lmgr/lmgr.c @@ -0,0 +1,933 @@ +/*------------------------------------------------------------------------- + * + * lmgr.c-- + * POSTGRES lock manager code + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/lmgr/lmgr.c,v 1.1.1.1 1996/07/09 06:21:56 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +/* #define LOCKDEBUGALL 1 */ +/* #define LOCKDEBUG 1 */ + +#ifdef LOCKDEBUGALL +#define LOCKDEBUG 1 +#endif /* LOCKDEBUGALL */ + +#include "postgres.h" + +#include "access/heapam.h" +#include "access/htup.h" +#include "access/relscan.h" +#include "access/skey.h" +#include "utils/tqual.h" +#include "access/xact.h" + +#include "storage/block.h" +#include "storage/buf.h" +#include "storage/itemptr.h" +#include "storage/bufpage.h" +#include "storage/multilev.h" +#include "storage/lmgr.h" + +#include "utils/elog.h" +#include "utils/palloc.h" +#include "utils/rel.h" + +#include "catalog/catname.h" +#include "catalog/catalog.h" +#include "catalog/pg_class.h" + +#include "nodes/memnodes.h" +#include "storage/bufmgr.h" +#include "access/transam.h" /* for AmiTransactionId */ + +/* ---------------- + * + * ---------------- + */ +#define MaxRetries 4 /* XXX about 1/4 minute--a hack */ + +#define IntentReadRelationLock 0x0100 +#define ReadRelationLock 0x0200 +#define IntentWriteRelationLock 0x0400 +#define WriteRelationLock 0x0800 +#define IntentReadPageLock 0x1000 +#define ReadTupleLock 0x2000 + +#define TupleLevelLockCountMask 0x000f + +#define TupleLevelLockLimit 10 + +extern Oid MyDatabaseId; + +static LRelId VariableRelationLRelId = { + RelOid_pg_variable, + InvalidOid +}; + +/* ---------------- + * RelationGetLRelId + * ---------------- + */ +#ifdef LOCKDEBUG +#define LOCKDEBUG_10 \ +elog(NOTICE, "RelationGetLRelId(%s) invalid lockInfo", \ + RelationGetRelationName(relation)); +#else +#define LOCKDEBUG_10 +#endif /* LOCKDEBUG */ + +/* + * RelationGetLRelId -- + * Returns "lock" relation identifier for a relation. + */ +LRelId +RelationGetLRelId(Relation relation) +{ + LockInfo linfo; + + /* ---------------- + * sanity checks + * ---------------- + */ + Assert(RelationIsValid(relation)); + linfo = (LockInfo) relation->lockInfo; + + /* ---------------- + * initialize lock info if necessary + * ---------------- + */ + if (! LockInfoIsValid(linfo)) { + LOCKDEBUG_10; + RelationInitLockInfo(relation); + linfo = (LockInfo) relation->lockInfo; + } + + /* ---------------- + * XXX hack to prevent problems during + * VARIABLE relation initialization + * ---------------- + */ + if (strcmp(RelationGetRelationName(relation)->data, + VariableRelationName) == 0) { + return (VariableRelationLRelId); + } + + return (linfo->lRelId); +} + +/* + * LRelIdGetDatabaseId -- + * Returns database identifier for a "lock" relation identifier. + */ +/* ---------------- + * LRelIdGetDatabaseId + * + * Note: The argument may not be correct, if it is not used soon + * after it is created. + * ---------------- + */ +Oid +LRelIdGetDatabaseId(LRelId lRelId) +{ + return (lRelId.dbId); +} + + +/* + * LRelIdGetRelationId -- + * Returns relation identifier for a "lock" relation identifier. 
+ */ +Oid +LRelIdGetRelationId(LRelId lRelId) +{ + return (lRelId.relId); +} + +/* + * DatabaseIdIsMyDatabaseId -- + * True iff database object identifier is valid in my present database. + */ +bool +DatabaseIdIsMyDatabaseId(Oid databaseId) +{ + return (bool) + (!OidIsValid(databaseId) || databaseId == MyDatabaseId); +} + +/* + * LRelIdContainsMyDatabaseId -- + * True iff "lock" relation identifier is valid in my present database. + */ +bool +LRelIdContainsMyDatabaseId(LRelId lRelId) +{ + return (bool) + (!OidIsValid(lRelId.dbId) || lRelId.dbId == MyDatabaseId); +} + +/* + * RelationInitLockInfo -- + * Initializes the lock information in a relation descriptor. + */ +/* ---------------- + * RelationInitLockInfo + * + * XXX processingVariable is a hack to prevent problems during + * VARIABLE relation initialization. + * ---------------- + */ +void +RelationInitLockInfo(Relation relation) +{ + LockInfo info; + char *relname; + Oid relationid; + bool processingVariable; + extern Oid MyDatabaseId; /* XXX use include */ + extern GlobalMemory CacheCxt; + + /* ---------------- + * sanity checks + * ---------------- + */ + Assert(RelationIsValid(relation)); + Assert(OidIsValid(RelationGetRelationId(relation))); + + /* ---------------- + * get information from relation descriptor + * ---------------- + */ + info = (LockInfo) relation->lockInfo; + relname = (char *) RelationGetRelationName(relation); + relationid = RelationGetRelationId(relation); + processingVariable = (strcmp(relname, VariableRelationName) == 0); + + /* ---------------- + * create a new lockinfo if not already done + * ---------------- + */ + if (! PointerIsValid(info)) + { + MemoryContext oldcxt; + + oldcxt = MemoryContextSwitchTo((MemoryContext)CacheCxt); + info = (LockInfo)palloc(sizeof(LockInfoData)); + MemoryContextSwitchTo(oldcxt); + } + else if (processingVariable) { + if (IsTransactionState()) { + TransactionIdStore(GetCurrentTransactionId(), + &info->transactionIdData); + } + info->flags = 0x0; + return; /* prevent an infinite loop--still true? */ + } + else if (info->initialized) + { + /* ------------ + * If we've already initialized we're done. + * ------------ + */ + return; + } + + /* ---------------- + * initialize lockinfo.dbId and .relId appropriately + * ---------------- + */ + if (IsSharedSystemRelationName(relname)) + LRelIdAssign(&info->lRelId, InvalidOid, relationid); + else + LRelIdAssign(&info->lRelId, MyDatabaseId, relationid); + + /* ---------------- + * store the transaction id in the lockInfo field + * ---------------- + */ + if (processingVariable) + TransactionIdStore(AmiTransactionId, + &info->transactionIdData); + else if (IsTransactionState()) + TransactionIdStore(GetCurrentTransactionId(), + &info->transactionIdData); + else + StoreInvalidTransactionId(&(info->transactionIdData)); + + /* ---------------- + * initialize rest of lockinfo + * ---------------- + */ + info->flags = 0x0; + info->initialized = (bool)true; + relation->lockInfo = (Pointer) info; +} + +/* ---------------- + * RelationDiscardLockInfo + * ---------------- + */ +#ifdef LOCKDEBUG +#define LOCKDEBUG_20 \ +elog(DEBUG, "DiscardLockInfo: NULL relation->lockInfo") +#else +#define LOCKDEBUG_20 +#endif /* LOCKDEBUG */ + +/* + * RelationDiscardLockInfo -- + * Discards the lock information in a relation descriptor. + */ +void +RelationDiscardLockInfo(Relation relation) +{ + if (! 
LockInfoIsValid(relation->lockInfo)) { + LOCKDEBUG_20; + return; + } + + pfree(relation->lockInfo); + relation->lockInfo = NULL; +} + +/* + * RelationSetLockForDescriptorOpen -- + * Sets read locks for a relation descriptor. + */ +#ifdef LOCKDEBUGALL +#define LOCKDEBUGALL_30 \ +elog(DEBUG, "RelationSetLockForDescriptorOpen(%s[%d,%d]) called", \ + RelationGetRelationName(relation), lRelId.dbId, lRelId.relId) +#else +#define LOCKDEBUGALL_30 +#endif /* LOCKDEBUGALL*/ + +void +RelationSetLockForDescriptorOpen(Relation relation) +{ + /* ---------------- + * sanity checks + * ---------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) + return; + + LOCKDEBUGALL_30; + + /* ---------------- + * read lock catalog tuples which compose the relation descriptor + * XXX race condition? XXX For now, do nothing. + * ---------------- + */ +} + +/* ---------------- + * RelationSetLockForRead + * ---------------- + */ +#ifdef LOCKDEBUG +#define LOCKDEBUG_40 \ +elog(DEBUG, "RelationSetLockForRead(%s[%d,%d]) called", \ + RelationGetRelationName(relation), lRelId.dbId, lRelId.relId) +#else +#define LOCKDEBUG_40 +#endif /* LOCKDEBUG*/ + +/* + * RelationSetLockForRead -- + * Sets relation level read lock. + */ +void +RelationSetLockForRead(Relation relation) +{ + LockInfo linfo; + + /* ---------------- + * sanity checks + * ---------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) + return; + + LOCKDEBUG_40; + + /* ---------------- + * If we don't have lock info on the reln just go ahead and + * lock it without trying to short circuit the lock manager. + * ---------------- + */ + if (!LockInfoIsValid(relation->lockInfo)) + { + RelationInitLockInfo(relation); + linfo = (LockInfo) relation->lockInfo; + linfo->flags |= ReadRelationLock; + MultiLockReln(linfo, READ_LOCK); + return; + } + else + linfo = (LockInfo) relation->lockInfo; + + MultiLockReln(linfo, READ_LOCK); +} + +/* ---------------- + * RelationUnsetLockForRead + * ---------------- + */ +#ifdef LOCKDEBUG +#define LOCKDEBUG_50 \ +elog(DEBUG, "RelationUnsetLockForRead(%s[%d,%d]) called", \ + RelationGetRelationName(relation), lRelId.dbId, lRelId.relId) +#else +#define LOCKDEBUG_50 +#endif /* LOCKDEBUG*/ + +/* + * RelationUnsetLockForRead -- + * Unsets relation level read lock. + */ +void +RelationUnsetLockForRead(Relation relation) +{ + LockInfo linfo; + + /* ---------------- + * sanity check + * ---------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) + return; + + linfo = (LockInfo) relation->lockInfo; + + /* ---------------- + * If we don't have lock info on the reln just go ahead and + * release it. + * ---------------- + */ + if (!LockInfoIsValid(linfo)) + { + elog(WARN, + "Releasing a lock on %s with invalid lock information", + RelationGetRelationName(relation)); + } + + MultiReleaseReln(linfo, READ_LOCK); +} + +/* ---------------- + * RelationSetLockForWrite(relation) + * ---------------- + */ +#ifdef LOCKDEBUG +#define LOCKDEBUG_60 \ +elog(DEBUG, "RelationSetLockForWrite(%s[%d,%d]) called", \ + RelationGetRelationName(relation), lRelId.dbId, lRelId.relId) +#else +#define LOCKDEBUG_60 +#endif /* LOCKDEBUG*/ + +/* + * RelationSetLockForWrite -- + * Sets relation level write lock. 
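+ *
+ * As with RelationSetLockForRead() above, this initializes the
+ * relation's lock info if needed and then goes through the multi-level
+ * lock manager (MultiLockReln with WRITE_LOCK); the lock is released
+ * by RelationUnsetLockForWrite() or cleared at transaction end.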
+ */ +void +RelationSetLockForWrite(Relation relation) +{ + LockInfo linfo; + + /* ---------------- + * sanity checks + * ---------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) + return; + + LOCKDEBUG_60; + + /* ---------------- + * If we don't have lock info on the reln just go ahead and + * lock it without trying to short circuit the lock manager. + * ---------------- + */ + if (!LockInfoIsValid(relation->lockInfo)) + { + RelationInitLockInfo(relation); + linfo = (LockInfo) relation->lockInfo; + linfo->flags |= WriteRelationLock; + MultiLockReln(linfo, WRITE_LOCK); + return; + } + else + linfo = (LockInfo) relation->lockInfo; + + MultiLockReln(linfo, WRITE_LOCK); +} + +/* ---------------- + * RelationUnsetLockForWrite + * ---------------- + */ +#ifdef LOCKDEBUG +#define LOCKDEBUG_70 \ +elog(DEBUG, "RelationUnsetLockForWrite(%s[%d,%d]) called", \ + RelationGetRelationName(relation), lRelId.dbId, lRelId.relId); +#else +#define LOCKDEBUG_70 +#endif /* LOCKDEBUG */ + +/* + * RelationUnsetLockForWrite -- + * Unsets relation level write lock. + */ +void +RelationUnsetLockForWrite(Relation relation) +{ + LockInfo linfo; + + /* ---------------- + * sanity checks + * ---------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) { + return; + } + + linfo = (LockInfo) relation->lockInfo; + + if (!LockInfoIsValid(linfo)) + { + elog(WARN, + "Releasing a lock on %s with invalid lock information", + RelationGetRelationName(relation)); + } + + MultiReleaseReln(linfo, WRITE_LOCK); +} + +/* ---------------- + * RelationSetLockForTupleRead + * ---------------- + */ +#ifdef LOCKDEBUG +#define LOCKDEBUG_80 \ +elog(DEBUG, "RelationSetLockForTupleRead(%s[%d,%d], 0x%x) called", \ + RelationGetRelationName(relation), lRelId.dbId, lRelId.relId, \ + itemPointer) +#define LOCKDEBUG_81 \ + elog(DEBUG, "RelationSetLockForTupleRead() escalating"); +#else +#define LOCKDEBUG_80 +#define LOCKDEBUG_81 +#endif /* LOCKDEBUG */ + +/* + * RelationSetLockForTupleRead -- + * Sets tuple level read lock. + */ +void +RelationSetLockForTupleRead(Relation relation, ItemPointer itemPointer) +{ + LockInfo linfo; + TransactionId curXact; + + /* ---------------- + * sanity checks + * ---------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) + return; + + LOCKDEBUG_80; + + /* --------------------- + * If our lock info is invalid don't bother trying to short circuit + * the lock manager. 
+ * --------------------- + */ + if (!LockInfoIsValid(relation->lockInfo)) + { + RelationInitLockInfo(relation); + linfo = (LockInfo) relation->lockInfo; + linfo->flags |= + IntentReadRelationLock | + IntentReadPageLock | + ReadTupleLock; + MultiLockTuple(linfo, itemPointer, READ_LOCK); + return; + } + else + linfo = (LockInfo) relation->lockInfo; + + /* ---------------- + * no need to set a lower granularity lock + * ---------------- + */ + curXact = GetCurrentTransactionId(); + if ((linfo->flags & ReadRelationLock) && + TransactionIdEquals(curXact, linfo->transactionIdData)) + { + return; + } + + /* ---------------- + * If we don't already have a tuple lock this transaction + * ---------------- + */ + if (!( (linfo->flags & ReadTupleLock) && + TransactionIdEquals(curXact, linfo->transactionIdData) )) { + + linfo->flags |= + IntentReadRelationLock | + IntentReadPageLock | + ReadTupleLock; + + /* clear count */ + linfo->flags &= ~TupleLevelLockCountMask; + + } else { + if (TupleLevelLockLimit == (TupleLevelLockCountMask & + linfo->flags)) { + LOCKDEBUG_81; + + /* escalate */ + MultiLockReln(linfo, READ_LOCK); + + /* clear count */ + linfo->flags &= ~TupleLevelLockCountMask; + return; + } + + /* increment count */ + linfo->flags = + (linfo->flags & ~TupleLevelLockCountMask) | + (1 + (TupleLevelLockCountMask & linfo->flags)); + } + + TransactionIdStore(curXact, &linfo->transactionIdData); + + /* ---------------- + * Lock the tuple. + * ---------------- + */ + MultiLockTuple(linfo, itemPointer, READ_LOCK); +} + +/* ---------------- + * RelationSetLockForReadPage + * ---------------- + */ +#ifdef LOCKDEBUG +#define LOCKDEBUG_90 \ +elog(DEBUG, "RelationSetLockForReadPage(%s[%d,%d], @%d) called", \ + RelationGetRelationName(relation), lRelId.dbId, lRelId.relId, page); +#else +#define LOCKDEBUG_90 +#endif /* LOCKDEBUG*/ + +/* ---------------- + * RelationSetLockForWritePage + * ---------------- + */ +#ifdef LOCKDEBUG +#define LOCKDEBUG_100 \ +elog(DEBUG, "RelationSetLockForWritePage(%s[%d,%d], @%d) called", \ + RelationGetRelationName(relation), lRelId.dbId, lRelId.relId, page); +#else +#define LOCKDEBUG_100 +#endif /* LOCKDEBUG */ + +/* + * RelationSetLockForWritePage -- + * Sets write lock on a page. + */ +void +RelationSetLockForWritePage(Relation relation, + ItemPointer itemPointer) +{ + /* ---------------- + * sanity checks + * ---------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) + return; + + /* --------------- + * Make sure linfo is initialized + * --------------- + */ + if (!LockInfoIsValid(relation->lockInfo)) + RelationInitLockInfo(relation); + + /* ---------------- + * attempt to set lock + * ---------------- + */ + MultiLockPage((LockInfo) relation->lockInfo, itemPointer, WRITE_LOCK); +} + +/* ---------------- + * RelationUnsetLockForReadPage + * ---------------- + */ +#ifdef LOCKDEBUG +#define LOCKDEBUG_110 \ +elog(DEBUG, "RelationUnsetLockForReadPage(%s[%d,%d], @%d) called", \ + RelationGetRelationName(relation), lRelId.dbId, lRelId.relId, page) +#else +#define LOCKDEBUG_110 +#endif /* LOCKDEBUG */ + +/* ---------------- + * RelationUnsetLockForWritePage + * ---------------- + */ +#ifdef LOCKDEBUG +#define LOCKDEBUG_120 \ +elog(DEBUG, "RelationUnsetLockForWritePage(%s[%d,%d], @%d) called", \ + RelationGetRelationName(relation), lRelId.dbId, lRelId.relId, page) +#else +#define LOCKDEBUG_120 +#endif /* LOCKDEBUG */ + +/* + * Set a single level write page lock. Assumes that you already + * have a write intent lock on the relation. 
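+ *
+ * A typical sequence, sketched roughly after the nbtree/hash page code:
+ *
+ *	RelationSetWIntentLock(rel);
+ *	...
+ *	RelationSetSingleWLockPage(rel, itemPointer);
+ *	... modify the page ...
+ *	RelationUnsetSingleWLockPage(rel, itemPointer);
+ *	...
+ *	RelationUnsetWIntentLock(rel);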
+ */ +void +RelationSetSingleWLockPage(Relation relation, + ItemPointer itemPointer) +{ + + /* ---------------- + * sanity checks + * ---------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) + return; + + if (!LockInfoIsValid(relation->lockInfo)) + RelationInitLockInfo(relation); + + SingleLockPage((LockInfo)relation->lockInfo, itemPointer, WRITE_LOCK, !UNLOCK); +} + +/* + * Unset a single level write page lock + */ +void +RelationUnsetSingleWLockPage(Relation relation, + ItemPointer itemPointer) +{ + + /* ---------------- + * sanity checks + * ---------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) + return; + + if (!LockInfoIsValid(relation->lockInfo)) + elog(WARN, + "Releasing a lock on %s with invalid lock information", + RelationGetRelationName(relation)); + + SingleLockPage((LockInfo)relation->lockInfo, itemPointer, WRITE_LOCK, UNLOCK); +} + +/* + * Set a single level read page lock. Assumes you already have a read + * intent lock set on the relation. + */ +void +RelationSetSingleRLockPage(Relation relation, + ItemPointer itemPointer) +{ + + /* ---------------- + * sanity checks + * ---------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) + return; + + if (!LockInfoIsValid(relation->lockInfo)) + RelationInitLockInfo(relation); + + SingleLockPage((LockInfo)relation->lockInfo, itemPointer, READ_LOCK, !UNLOCK); +} + +/* + * Unset a single level read page lock. + */ +void +RelationUnsetSingleRLockPage(Relation relation, + ItemPointer itemPointer) +{ + + /* ---------------- + * sanity checks + * ---------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) + return; + + if (!LockInfoIsValid(relation->lockInfo)) + elog(WARN, + "Releasing a lock on %s with invalid lock information", + RelationGetRelationName(relation)); + + SingleLockPage((LockInfo)relation->lockInfo, itemPointer, READ_LOCK, UNLOCK); +} + +/* + * Set a read intent lock on a relation. + * + * Usually these are set in a multi-level table when you acquiring a + * page level lock. i.e. To acquire a lock on a page you first acquire + * an intent lock on the entire relation. Acquiring an intent lock along + * allows one to use the single level locking routines later. Good for + * index scans that do a lot of page level locking. + */ +void +RelationSetRIntentLock(Relation relation) +{ + /* ----------------- + * Sanity check + * ----------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) + return; + + if (!LockInfoIsValid(relation->lockInfo)) + RelationInitLockInfo(relation); + + SingleLockReln((LockInfo)relation->lockInfo, READ_LOCK+INTENT, !UNLOCK); +} + +/* + * Unset a read intent lock on a relation + */ +void +RelationUnsetRIntentLock(Relation relation) +{ + /* ----------------- + * Sanity check + * ----------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) + return; + + if (!LockInfoIsValid(relation->lockInfo)) + RelationInitLockInfo(relation); + + SingleLockReln((LockInfo)relation->lockInfo, READ_LOCK+INTENT, UNLOCK); +} + +/* + * Set a write intent lock on a relation. 
For a more complete explanation + * see RelationSetRIntentLock() + */ +void +RelationSetWIntentLock(Relation relation) +{ + /* ----------------- + * Sanity check + * ----------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) + return; + + if (!LockInfoIsValid(relation->lockInfo)) + RelationInitLockInfo(relation); + + SingleLockReln((LockInfo)relation->lockInfo, WRITE_LOCK+INTENT, !UNLOCK); +} + +/* + * Unset a write intent lock. + */ +void +RelationUnsetWIntentLock(Relation relation) +{ + /* ----------------- + * Sanity check + * ----------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) + return; + + if (!LockInfoIsValid(relation->lockInfo)) + RelationInitLockInfo(relation); + + SingleLockReln((LockInfo)relation->lockInfo, WRITE_LOCK+INTENT, UNLOCK); +} + +/* + * Extend locks are used primarily in tertiary storage devices such as + * a WORM disk jukebox. Sometimes need exclusive access to extend a + * file by a block. + */ +void +RelationSetLockForExtend(Relation relation) +{ + /* ----------------- + * Sanity check + * ----------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) + return; + + if (!LockInfoIsValid(relation->lockInfo)) + RelationInitLockInfo(relation); + + MultiLockReln((LockInfo) relation->lockInfo, EXTEND_LOCK); +} + +void +RelationUnsetLockForExtend(Relation relation) +{ + /* ----------------- + * Sanity check + * ----------------- + */ + Assert(RelationIsValid(relation)); + if (LockingDisabled()) + return; + + if (!LockInfoIsValid(relation->lockInfo)) + RelationInitLockInfo(relation); + + MultiReleaseReln((LockInfo) relation->lockInfo, EXTEND_LOCK); +} + +/* + * Create an LRelid --- Why not just pass in a pointer to the storage? + */ +void +LRelIdAssign(LRelId *lRelId, Oid dbId, Oid relId) +{ + lRelId->dbId = dbId; + lRelId->relId = relId; +} diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c new file mode 100644 index 00000000000..8df898a0068 --- /dev/null +++ b/src/backend/storage/lmgr/lock.c @@ -0,0 +1,1020 @@ +/*------------------------------------------------------------------------- + * + * lock.c-- + * simple lock acquisition + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/lmgr/lock.c,v 1.1.1.1 1996/07/09 06:21:56 scrappy Exp $ + * + * NOTES + * Outside modules can create a lock table and acquire/release + * locks. A lock table is a shared memory hash table. When + * a process tries to acquire a lock of a type that conflicts + * with existing locks, it is put to sleep using the routines + * in storage/lmgr/proc.c. + * + * Interface: + * + * LockAcquire(), LockRelease(), LockTabInit(). + * + * LockReplace() is called only within this module and by the + * lkchain module. It releases a lock without looking + * the lock up in the lock table. + * + * NOTE: This module is used to define new lock tables. The + * multi-level lock table (multi.c) used by the heap + * access methods calls these routines. See multi.c for + * examples showing how to use this interface. 
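+ *
+ *	In outline, a client of this module does roughly:
+ *
+ *	tableId = LockTabInit("my table", conflicts, prios, ntypes);
+ *	...
+ *	LockAcquire(tableId, &tag, lockt);	-- sleeps until granted
+ *	...
+ *	LockRelease(tableId, &tag, lockt);	-- or cleared at xact end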
+ * + *------------------------------------------------------------------------- + */ +#include <stdio.h> /* for sprintf() */ +#include "storage/shmem.h" +#include "storage/spin.h" +#include "storage/proc.h" +#include "storage/lock.h" +#include "utils/hsearch.h" +#include "utils/elog.h" +#include "utils/palloc.h" +#include "access/xact.h" + +/*#define LOCK_MGR_DEBUG*/ + +#ifndef LOCK_MGR_DEBUG + +#define LOCK_PRINT(where,tag,type) +#define LOCK_DUMP(where,lock,type) +#define XID_PRINT(where,xidentP) + +#else /* LOCK_MGR_DEBUG */ + +#define LOCK_PRINT(where,tag,type)\ + elog(NOTICE, "%s: rel (%d) dbid (%d) tid (%d,%d) type (%d)\n",where, \ + tag->relId, tag->dbId, \ + ( (tag->tupleId.ip_blkid.data[0] >= 0) ? \ + BlockIdGetBlockNumber(&tag->tupleId.ip_blkid) : -1 ), \ + tag->tupleId.ip_posid, \ + type); + +#define LOCK_DUMP(where,lock,type)\ + elog(NOTICE, "%s: rel (%d) dbid (%d) tid (%d,%d) nHolding (%d) holders (%d,%d,%d,%d,%d) type (%d)\n",where, \ + lock->tag.relId, lock->tag.dbId, \ + ((lock->tag.tupleId.ip_blkid.data[0] >= 0) ? \ + BlockIdGetBlockNumber(&lock->tag.tupleId.ip_blkid) : -1 ), \ + lock->tag.tupleId.ip_posid, \ + lock->nHolding,\ + lock->holders[1],\ + lock->holders[2],\ + lock->holders[3],\ + lock->holders[4],\ + lock->holders[5],\ + type); + +#define XID_PRINT(where,xidentP)\ + elog(NOTICE,\ + "%s:xid (%d) pid (%d) lock (%x) nHolding (%d) holders (%d,%d,%d,%d,%d)",\ + where,\ + xidentP->tag.xid,\ + xidentP->tag.pid,\ + xidentP->tag.lock,\ + xidentP->nHolding,\ + xidentP->holders[1],\ + xidentP->holders[2],\ + xidentP->holders[3],\ + xidentP->holders[4],\ + xidentP->holders[5]); + +#endif /* LOCK_MGR_DEBUG */ + +SPINLOCK LockMgrLock; /* in Shmem or created in CreateSpinlocks() */ + +/* This is to simplify/speed up some bit arithmetic */ + +static MASK BITS_OFF[MAX_LOCKTYPES]; +static MASK BITS_ON[MAX_LOCKTYPES]; + +/* ----------------- + * XXX Want to move this to this file + * ----------------- + */ +static bool LockingIsDisabled; + +/* ------------------ + * from storage/ipc/shmem.c + * ------------------ + */ +extern HTAB *ShmemInitHash(); + +/* ------------------- + * map from tableId to the lock table structure + * ------------------- + */ +static LOCKTAB *AllTables[MAX_TABLES]; + +/* ------------------- + * no zero-th table + * ------------------- + */ +static int NumTables = 1; + +/* ------------------- + * InitLocks -- Init the lock module. Create a private data + * structure for constructing conflict masks. + * ------------------- + */ +void +InitLocks() +{ + int i; + int bit; + + bit = 1; + /* ------------------- + * remember 0th locktype is invalid + * ------------------- + */ + for (i=0;i<MAX_LOCKTYPES;i++,bit <<= 1) + { + BITS_ON[i] = bit; + BITS_OFF[i] = ~bit; + } +} + +/* ------------------- + * LockDisable -- sets LockingIsDisabled flag to TRUE or FALSE. + * ------------------ + */ +void +LockDisable(int status) +{ + LockingIsDisabled = status; +} + + +/* + * LockTypeInit -- initialize the lock table's lock type + * structures + * + * Notes: just copying. Should only be called once. + */ +static void +LockTypeInit(LOCKTAB *ltable, + MASK *conflictsP, + int *prioP, + int ntypes) +{ + int i; + + ltable->ctl->nLockTypes = ntypes; + ntypes++; + for (i=0;i<ntypes;i++,prioP++,conflictsP++) + { + ltable->ctl->conflictTab[i] = *conflictsP; + ltable->ctl->prio[i] = *prioP; + } +} + +/* + * LockTabInit -- initialize a lock table structure + * + * Notes: + * (a) a lock table has four separate entries in the binding + * table. 
This is because every shared hash table and spinlock + * has its name stored in the binding table at its creation. It + * is wasteful, in this case, but not much space is involved. + * + */ +LockTableId +LockTabInit(char *tabName, + MASK *conflictsP, + int *prioP, + int ntypes) +{ + LOCKTAB *ltable; + char *shmemName; + HASHCTL info; + int hash_flags; + bool found; + int status = TRUE; + + if (ntypes > MAX_LOCKTYPES) + { + elog(NOTICE,"LockTabInit: too many lock types %d greater than %d", + ntypes,MAX_LOCKTYPES); + return(INVALID_TABLEID); + } + + if (NumTables > MAX_TABLES) + { + elog(NOTICE, + "LockTabInit: system limit of MAX_TABLES (%d) lock tables", + MAX_TABLES); + return(INVALID_TABLEID); + } + + /* allocate a string for the binding table lookup */ + shmemName = (char *) palloc((unsigned)(strlen(tabName)+32)); + if (! shmemName) + { + elog(NOTICE,"LockTabInit: couldn't malloc string %s \n",tabName); + return(INVALID_TABLEID); + } + + /* each lock table has a non-shared header */ + ltable = (LOCKTAB *) palloc((unsigned) sizeof(LOCKTAB)); + if (! ltable) + { + elog(NOTICE,"LockTabInit: couldn't malloc lock table %s\n",tabName); + (void) pfree (shmemName); + return(INVALID_TABLEID); + } + + /* ------------------------ + * find/acquire the spinlock for the table + * ------------------------ + */ + SpinAcquire(LockMgrLock); + + + /* ----------------------- + * allocate a control structure from shared memory or attach to it + * if it already exists. + * ----------------------- + */ + sprintf(shmemName,"%s (ctl)",tabName); + ltable->ctl = (LOCKCTL *) + ShmemInitStruct(shmemName,(unsigned)sizeof(LOCKCTL),&found); + + if (! ltable->ctl) + { + elog(FATAL,"LockTabInit: couldn't initialize %s",tabName); + status = FALSE; + } + + /* ---------------- + * we're first - initialize + * ---------------- + */ + if (! found) + { + memset(ltable->ctl, 0, sizeof(LOCKCTL)); + ltable->ctl->masterLock = LockMgrLock; + ltable->ctl->tableId = NumTables; + } + + /* -------------------- + * other modules refer to the lock table by a tableId + * -------------------- + */ + AllTables[NumTables] = ltable; + NumTables++; + Assert(NumTables <= MAX_TABLES); + + /* ---------------------- + * allocate a hash table for the lock tags. This is used + * to find the different locks. + * ---------------------- + */ + info.keysize = sizeof(LOCKTAG); + info.datasize = sizeof(LOCK); + info.hash = tag_hash; + hash_flags = (HASH_ELEM | HASH_FUNCTION); + + sprintf(shmemName,"%s (lock hash)",tabName); + ltable->lockHash = (HTAB *) ShmemInitHash(shmemName, + INIT_TABLE_SIZE,MAX_TABLE_SIZE, + &info,hash_flags); + + Assert( ltable->lockHash->hash == tag_hash); + if (! ltable->lockHash) + { + elog(FATAL,"LockTabInit: couldn't initialize %s",tabName); + status = FALSE; + } + + /* ------------------------- + * allocate an xid table. When different transactions hold + * the same lock, additional information must be saved (locks per tx). + * ------------------------- + */ + info.keysize = XID_TAGSIZE; + info.datasize = sizeof(XIDLookupEnt); + info.hash = tag_hash; + hash_flags = (HASH_ELEM | HASH_FUNCTION); + + sprintf(shmemName,"%s (xid hash)",tabName); + ltable->xidHash = (HTAB *) ShmemInitHash(shmemName, + INIT_TABLE_SIZE,MAX_TABLE_SIZE, + &info,hash_flags); + + if (! 
ltable->xidHash) + { + elog(FATAL,"LockTabInit: couldn't initialize %s",tabName); + status = FALSE; + } + + /* init ctl data structures */ + LockTypeInit(ltable, conflictsP, prioP, ntypes); + + SpinRelease(LockMgrLock); + + (void) pfree (shmemName); + + if (status) + return(ltable->ctl->tableId); + else + return(INVALID_TABLEID); +} + +/* + * LockTabRename -- allocate another tableId to the same + * lock table. + * + * NOTES: Both the lock module and the lock chain (lchain.c) + * module use table id's to distinguish between different + * kinds of locks. Short term and long term locks look + * the same to the lock table, but are handled differently + * by the lock chain manager. This function allows the + * client to use different tableIds when acquiring/releasing + * short term and long term locks. + */ +LockTableId +LockTabRename(LockTableId tableId) +{ + LockTableId newTableId; + + if (NumTables >= MAX_TABLES) + { + return(INVALID_TABLEID); + } + if (AllTables[tableId] == INVALID_TABLEID) + { + return(INVALID_TABLEID); + } + + /* other modules refer to the lock table by a tableId */ + newTableId = NumTables; + NumTables++; + + AllTables[newTableId] = AllTables[tableId]; + return(newTableId); +} + +/* + * LockAcquire -- Check for lock conflicts, sleep if conflict found, + * set lock if/when no conflicts. + * + * Returns: TRUE if parameters are correct, FALSE otherwise. + * + * Side Effects: The lock is always acquired. No way to abort + * a lock acquisition other than aborting the transaction. + * Lock is recorded in the lkchain. + */ +bool +LockAcquire(LockTableId tableId, LOCKTAG *lockName, LOCKT lockt) +{ + XIDLookupEnt *result,item; + HTAB *xidTable; + bool found; + LOCK *lock = NULL; + SPINLOCK masterLock; + LOCKTAB *ltable; + int status; + TransactionId myXid; + + Assert (tableId < NumTables); + ltable = AllTables[tableId]; + if (!ltable) + { + elog(NOTICE,"LockAcquire: bad lock table %d",tableId); + return (FALSE); + } + + if (LockingIsDisabled) + { + return(TRUE); + } + + LOCK_PRINT("Acquire",lockName,lockt); + masterLock = ltable->ctl->masterLock; + + SpinAcquire(masterLock); + + Assert( ltable->lockHash->hash == tag_hash); + lock = (LOCK *)hash_search(ltable->lockHash,(Pointer)lockName,HASH_ENTER,&found); + + if (! lock) + { + SpinRelease(masterLock); + elog(FATAL,"LockAcquire: lock table %d is corrupted",tableId); + return(FALSE); + } + + /* -------------------- + * if there was nothing else there, complete initialization + * -------------------- + */ + if (! found) + { + lock->mask = 0; + ProcQueueInit(&(lock->waitProcs)); + memset((char *)lock->holders, 0, sizeof(int)*MAX_LOCKTYPES); + memset((char *)lock->activeHolders, 0, sizeof(int)*MAX_LOCKTYPES); + lock->nHolding = 0; + lock->nActive = 0; + + Assert(BlockIdEquals(&(lock->tag.tupleId.ip_blkid), + &(lockName->tupleId.ip_blkid))); + + } + + /* ------------------ + * add an element to the lock queue so that we can clear the + * locks at end of transaction. + * ------------------ + */ + xidTable = ltable->xidHash; + myXid = GetCurrentTransactionId(); + + /* ------------------ + * Zero out all of the tag bytes (this clears the padding bytes for long + * word alignment and ensures hashing consistency). 
+ * ------------------ + */ + memset(&item, 0, XID_TAGSIZE); + TransactionIdStore(myXid, &item.tag.xid); + item.tag.lock = MAKE_OFFSET(lock); +#if 0 + item.tag.pid = MyPid; +#endif + + result = (XIDLookupEnt *)hash_search(xidTable, (Pointer)&item, HASH_ENTER, &found); + if (!result) + { + elog(NOTICE,"LockAcquire: xid table corrupted"); + return(STATUS_ERROR); + } + if (!found) + { + XID_PRINT("queueing XidEnt LockAcquire:", result); + ProcAddLock(&result->queue); + result->nHolding = 0; + memset((char *)result->holders, 0, sizeof(int)*MAX_LOCKTYPES); + } + + /* ---------------- + * lock->nholding tells us how many processes have _tried_ to + * acquire this lock, Regardless of whether they succeeded or + * failed in doing so. + * ---------------- + */ + lock->nHolding++; + lock->holders[lockt]++; + + /* -------------------- + * If I'm the only one holding a lock, then there + * cannot be a conflict. Need to subtract one from the + * lock's count since we just bumped the count up by 1 + * above. + * -------------------- + */ + if (result->nHolding == lock->nActive) + { + result->holders[lockt]++; + result->nHolding++; + GrantLock(lock, lockt); + SpinRelease(masterLock); + return(TRUE); + } + + Assert(result->nHolding <= lock->nActive); + + status = LockResolveConflicts(ltable, lock, lockt, myXid); + + if (status == STATUS_OK) + { + GrantLock(lock, lockt); + } + else if (status == STATUS_FOUND) + { + status = WaitOnLock(ltable, tableId, lock, lockt); + XID_PRINT("Someone granted me the lock", result); + } + + SpinRelease(masterLock); + + return(status == STATUS_OK); +} + +/* ---------------------------- + * LockResolveConflicts -- test for lock conflicts + * + * NOTES: + * Here's what makes this complicated: one transaction's + * locks don't conflict with one another. When many processes + * hold locks, each has to subtract off the other's locks when + * determining whether or not any new lock acquired conflicts with + * the old ones. + * + * For example, if I am already holding a WRITE_INTENT lock, + * there will not be a conflict with my own READ_LOCK. If I + * don't consider the intent lock when checking for conflicts, + * I find no conflict. + * ---------------------------- + */ +int +LockResolveConflicts(LOCKTAB *ltable, + LOCK *lock, + LOCKT lockt, + TransactionId xid) +{ + XIDLookupEnt *result,item; + int *myHolders; + int nLockTypes; + HTAB *xidTable; + bool found; + int bitmask; + int i,tmpMask; + + nLockTypes = ltable->ctl->nLockTypes; + xidTable = ltable->xidHash; + + /* --------------------- + * read my own statistics from the xid table. If there + * isn't an entry, then we'll just add one. + * + * Zero out the tag, this clears the padding bytes for long + * word alignment and ensures hashing consistency. + * ------------------ + */ + memset(&item, 0, XID_TAGSIZE); + TransactionIdStore(xid, &item.tag.xid); + item.tag.lock = MAKE_OFFSET(lock); +#if 0 + item.tag.pid = pid; +#endif + + if (! (result = (XIDLookupEnt *) + hash_search(xidTable, (Pointer)&item, HASH_ENTER, &found))) + { + elog(NOTICE,"LockResolveConflicts: xid table corrupted"); + return(STATUS_ERROR); + } + myHolders = result->holders; + + if (! found) + { + /* --------------- + * we're not holding any type of lock yet. Clear + * the lock stats. + * --------------- + */ + memset(result->holders, 0, nLockTypes * sizeof(*(lock->holders))); + result->nHolding = 0; + } + + /* ---------------------------- + * first check for global conflicts: If no locks conflict + * with mine, then I get the lock. 
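As a concrete walk-through of the two tests below (the counts are hypothetical; the lock type values are the ones multi.c uses, WRITE_LOCK = 1 and READ_LOCK = 2): suppose my transaction already holds a granted WRITE_LOCK on this object, another transaction holds a granted READ_LOCK, and I now request a READ_LOCK. lock->mask then has both the WRITE_LOCK and READ_LOCK bits set, and conflictTab[READ_LOCK] contains the WRITE_LOCK bit, so the global test reports a conflict. The per-type loop then rebuilds the mask from activeHolders[i] - myHolders[i]; my own write no longer contributes, only the other transaction's read survives, conflictTab[READ_LOCK] does not contain the READ_LOCK bit, and the lock is granted without sleeping.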
+ * + * Checking for conflict: lock->mask represents the types of + * currently held locks. conflictTable[lockt] has a bit + * set for each type of lock that conflicts with mine. Bitwise + * compare tells if there is a conflict. + * ---------------------------- + */ + if (! (ltable->ctl->conflictTab[lockt] & lock->mask)) + { + + result->holders[lockt]++; + result->nHolding++; + + XID_PRINT("Conflict Resolved: updated xid entry stats", result); + + return(STATUS_OK); + } + + /* ------------------------ + * Rats. Something conflicts. But it could still be my own + * lock. We have to construct a conflict mask + * that does not reflect our own locks. + * ------------------------ + */ + bitmask = 0; + tmpMask = 2; + for (i=1;i<=nLockTypes;i++, tmpMask <<= 1) + { + if (lock->activeHolders[i] - myHolders[i]) + { + bitmask |= tmpMask; + } + } + + /* ------------------------ + * now check again for conflicts. 'bitmask' describes the types + * of locks held by other processes. If one of these + * conflicts with the kind of lock that I want, there is a + * conflict and I have to sleep. + * ------------------------ + */ + if (! (ltable->ctl->conflictTab[lockt] & bitmask)) + { + + /* no conflict. Get the lock and go on */ + + result->holders[lockt]++; + result->nHolding++; + + XID_PRINT("Conflict Resolved: updated xid entry stats", result); + + return(STATUS_OK); + + } + + return(STATUS_FOUND); +} + +int +WaitOnLock(LOCKTAB *ltable, LockTableId tableId, LOCK *lock, LOCKT lockt) +{ + PROC_QUEUE *waitQueue = &(lock->waitProcs); + + int prio = ltable->ctl->prio[lockt]; + + /* the waitqueue is ordered by priority. I insert myself + * according to the priority of the lock I am acquiring. + * + * SYNC NOTE: I am assuming that the lock table spinlock + * is sufficient synchronization for this queue. That + * will not be true if/when people can be deleted from + * the queue by a SIGINT or something. + */ + LOCK_DUMP("WaitOnLock: sleeping on lock", lock, lockt); + if (ProcSleep(waitQueue, + ltable->ctl->masterLock, + lockt, + prio, + lock) != NO_ERROR) + { + /* ------------------- + * This could have happend as a result of a deadlock, see HandleDeadLock() + * Decrement the lock nHolding and holders fields as we are no longer + * waiting on this lock. + * ------------------- + */ + lock->nHolding--; + lock->holders[lockt]--; + LOCK_DUMP("WaitOnLock: aborting on lock", lock, lockt); + SpinRelease(ltable->ctl->masterLock); + elog(WARN,"WaitOnLock: error on wakeup - Aborting this transaction"); + } + + return(STATUS_OK); +} + +/* + * LockRelease -- look up 'lockName' in lock table 'tableId' and + * release it. + * + * Side Effects: if the lock no longer conflicts with the highest + * priority waiting process, that process is granted the lock + * and awoken. (We have to grant the lock here to avoid a + * race between the waking process and any new process to + * come along and request the lock). 
+ */ +bool +LockRelease(LockTableId tableId, LOCKTAG *lockName, LOCKT lockt) +{ + LOCK *lock = NULL; + SPINLOCK masterLock; + bool found; + LOCKTAB *ltable; + XIDLookupEnt *result,item; + HTAB *xidTable; + bool wakeupNeeded = true; + + Assert (tableId < NumTables); + ltable = AllTables[tableId]; + if (!ltable) { + elog(NOTICE, "ltable is null in LockRelease"); + return (FALSE); + } + + if (LockingIsDisabled) + { + return(TRUE); + } + + LOCK_PRINT("Release",lockName,lockt); + + masterLock = ltable->ctl->masterLock; + xidTable = ltable->xidHash; + + SpinAcquire(masterLock); + + Assert( ltable->lockHash->hash == tag_hash); + lock = (LOCK *) + hash_search(ltable->lockHash,(Pointer)lockName,HASH_FIND_SAVE,&found); + + /* let the caller print its own error message, too. + * Do not elog(WARN). + */ + if (! lock) + { + SpinRelease(masterLock); + elog(NOTICE,"LockRelease: locktable corrupted"); + return(FALSE); + } + + if (! found) + { + SpinRelease(masterLock); + elog(NOTICE,"LockRelease: locktable lookup failed, no lock"); + return(FALSE); + } + + Assert(lock->nHolding > 0); + + /* + * fix the general lock stats + */ + lock->nHolding--; + lock->holders[lockt]--; + lock->nActive--; + lock->activeHolders[lockt]--; + + Assert(lock->nActive >= 0); + + if (! lock->nHolding) + { + /* ------------------ + * if there's no one waiting in the queue, + * we just released the last lock. + * Delete it from the lock table. + * ------------------ + */ + Assert( ltable->lockHash->hash == tag_hash); + lock = (LOCK *) hash_search(ltable->lockHash, + (Pointer) &(lock->tag), + HASH_REMOVE_SAVED, + &found); + Assert(lock && found); + wakeupNeeded = false; + } + + /* ------------------ + * Zero out all of the tag bytes (this clears the padding bytes for long + * word alignment and ensures hashing consistency). + * ------------------ + */ + memset(&item, 0, XID_TAGSIZE); + + TransactionIdStore(GetCurrentTransactionId(), &item.tag.xid); + item.tag.lock = MAKE_OFFSET(lock); +#if 0 + item.tag.pid = MyPid; +#endif + + if (! ( result = (XIDLookupEnt *) hash_search(xidTable, + (Pointer)&item, + HASH_FIND_SAVE, + &found) ) + || !found) + { + SpinRelease(masterLock); + elog(NOTICE,"LockReplace: xid table corrupted"); + return(FALSE); + } + /* + * now check to see if I have any private locks. If I do, + * decrement the counts associated with them. + */ + result->holders[lockt]--; + result->nHolding--; + + XID_PRINT("LockRelease updated xid stats", result); + + /* + * If this was my last hold on this lock, delete my entry + * in the XID table. + */ + if (! result->nHolding) + { + if (result->queue.next != INVALID_OFFSET) + SHMQueueDelete(&result->queue); + if (! (result = (XIDLookupEnt *) + hash_search(xidTable, (Pointer)&item, HASH_REMOVE_SAVED, &found)) || + ! found) + { + SpinRelease(masterLock); + elog(NOTICE,"LockReplace: xid table corrupted"); + return(FALSE); + } + } + + /* -------------------------- + * If there are still active locks of the type I just released, no one + * should be woken up. Whoever is asleep will still conflict + * with the remaining locks. + * -------------------------- + */ + if (! (lock->activeHolders[lockt])) + { + /* change the conflict mask. No more of this lock type. */ + lock->mask &= BITS_OFF[lockt]; + } + + if (wakeupNeeded) + { + /* -------------------------- + * Wake the first waiting process and grant him the lock if it + * doesn't conflict. The woken process must record the lock + * himself. 
+ * -------------------------- + */ + (void) ProcLockWakeup(&(lock->waitProcs), (char *) ltable, (char *) lock); + } + + SpinRelease(masterLock); + return(TRUE); +} + +/* + * GrantLock -- udpate the lock data structure to show + * the new lock holder. + */ +void +GrantLock(LOCK *lock, LOCKT lockt) +{ + lock->nActive++; + lock->activeHolders[lockt]++; + lock->mask |= BITS_ON[lockt]; +} + +bool +LockReleaseAll(LockTableId tableId, SHM_QUEUE *lockQueue) +{ + PROC_QUEUE *waitQueue; + int done; + XIDLookupEnt *xidLook = NULL; + XIDLookupEnt *tmp = NULL; + SHMEM_OFFSET end = MAKE_OFFSET(lockQueue); + SPINLOCK masterLock; + LOCKTAB *ltable; + int i,nLockTypes; + LOCK *lock; + bool found; + + Assert (tableId < NumTables); + ltable = AllTables[tableId]; + if (!ltable) + return (FALSE); + + nLockTypes = ltable->ctl->nLockTypes; + masterLock = ltable->ctl->masterLock; + + if (SHMQueueEmpty(lockQueue)) + return TRUE; + + SHMQueueFirst(lockQueue,(Pointer*)&xidLook,&xidLook->queue); + + XID_PRINT("LockReleaseAll:", xidLook); + + SpinAcquire(masterLock); + for (;;) + { + /* --------------------------- + * XXX Here we assume the shared memory queue is circular and + * that we know its internal structure. Should have some sort of + * macros to allow one to walk it. mer 20 July 1991 + * --------------------------- + */ + done = (xidLook->queue.next == end); + lock = (LOCK *) MAKE_PTR(xidLook->tag.lock); + + LOCK_PRINT("ReleaseAll",(&lock->tag),0); + + /* ------------------ + * fix the general lock stats + * ------------------ + */ + if (lock->nHolding != xidLook->nHolding) + { + lock->nHolding -= xidLook->nHolding; + lock->nActive -= xidLook->nHolding; + Assert(lock->nActive >= 0); + for (i=1; i<=nLockTypes; i++) + { + lock->holders[i] -= xidLook->holders[i]; + lock->activeHolders[i] -= xidLook->holders[i]; + if (! lock->activeHolders[i]) + lock->mask &= BITS_OFF[i]; + } + } + else + { + /* -------------- + * set nHolding to zero so that we can garbage collect the lock + * down below... + * -------------- + */ + lock->nHolding = 0; + } + /* ---------------- + * always remove the xidLookup entry, we're done with it now + * ---------------- + */ + if ((! hash_search(ltable->xidHash, (Pointer)xidLook, HASH_REMOVE, &found)) + || !found) + { + SpinRelease(masterLock); + elog(NOTICE,"LockReplace: xid table corrupted"); + return(FALSE); + } + + if (! lock->nHolding) + { + /* -------------------- + * if there's no one waiting in the queue, we've just released + * the last lock. + * -------------------- + */ + + Assert( ltable->lockHash->hash == tag_hash); + lock = (LOCK *) + hash_search(ltable->lockHash,(Pointer)&(lock->tag),HASH_REMOVE, &found); + if ((! lock) || (!found)) + { + SpinRelease(masterLock); + elog(NOTICE,"LockReplace: cannot remove lock from HTAB"); + return(FALSE); + } + } + else + { + /* -------------------- + * Wake the first waiting process and grant him the lock if it + * doesn't conflict. The woken process must record the lock + * him/herself. 
+ * -------------------- + */ + waitQueue = &(lock->waitProcs); + (void) ProcLockWakeup(waitQueue, (char *) ltable, (char *) lock); + } + + if (done) + break; + SHMQueueFirst(&xidLook->queue,(Pointer*)&tmp,&tmp->queue); + xidLook = tmp; + } + SpinRelease(masterLock); + SHMQueueInit(lockQueue); + return TRUE; +} + +int +LockShmemSize() +{ + int size = 0; + int nLockBuckets, nLockSegs; + int nXidBuckets, nXidSegs; + + nLockBuckets = 1 << (int)my_log2((NLOCKENTS - 1) / DEF_FFACTOR + 1); + nLockSegs = 1 << (int)my_log2((nLockBuckets - 1) / DEF_SEGSIZE + 1); + + nXidBuckets = 1 << (int)my_log2((NLOCKS_PER_XACT-1) / DEF_FFACTOR + 1); + nXidSegs = 1 << (int)my_log2((nLockBuckets - 1) / DEF_SEGSIZE + 1); + + size += MAXALIGN(NBACKENDS * sizeof(PROC)); /* each MyProc */ + size += MAXALIGN(NBACKENDS * sizeof(LOCKCTL)); /* each ltable->ctl */ + size += MAXALIGN(sizeof(PROC_HDR)); /* ProcGlobal */ + + size += MAXALIGN(my_log2(NLOCKENTS) * sizeof(void *)); + size += MAXALIGN(sizeof(HHDR)); + size += nLockSegs * MAXALIGN(DEF_SEGSIZE * sizeof(SEGMENT)); + size += NLOCKENTS * /* XXX not multiple of BUCKET_ALLOC_INCR? */ + (MAXALIGN(sizeof(BUCKET_INDEX)) + + MAXALIGN(sizeof(LOCK))); /* contains hash key */ + + size += MAXALIGN(my_log2(NBACKENDS) * sizeof(void *)); + size += MAXALIGN(sizeof(HHDR)); + size += nXidSegs * MAXALIGN(DEF_SEGSIZE * sizeof(SEGMENT)); + size += NBACKENDS * /* XXX not multiple of BUCKET_ALLOC_INCR? */ + (MAXALIGN(sizeof(BUCKET_INDEX)) + + MAXALIGN(sizeof(XIDLookupEnt))); /* contains hash key */ + + return size; +} + +/* ----------------- + * Boolean function to determine current locking status + * ----------------- + */ +bool +LockingDisabled() +{ + return LockingIsDisabled; +} diff --git a/src/backend/storage/lmgr/multi.c b/src/backend/storage/lmgr/multi.c new file mode 100644 index 00000000000..c1702d18cb8 --- /dev/null +++ b/src/backend/storage/lmgr/multi.c @@ -0,0 +1,415 @@ +/*------------------------------------------------------------------------- + * + * multi.c-- + * multi level lock table manager + * + * Standard multi-level lock manager as per the Gray paper + * (at least, that is what it is supposed to be). We implement + * three levels -- RELN, PAGE, TUPLE. Tuple is actually TID + * a physical record pointer. It isn't an object id. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/lmgr/Attic/multi.c,v 1.1.1.1 1996/07/09 06:21:56 scrappy Exp $ + * + * NOTES: + * (1) The lock.c module assumes that the caller here is doing + * two phase locking. + * + *------------------------------------------------------------------------- + */ +#include <stdio.h> +#include <string.h> +#include "storage/lmgr.h" +#include "storage/multilev.h" + +#include "utils/rel.h" +#include "utils/elog.h" +#include "miscadmin.h" /* MyDatabaseId */ + + +/* + * INTENT indicates to higher level that a lower level lock has been + * set. For example, a write lock on a tuple conflicts with a write + * lock on a relation. This conflict is detected as a WRITE_INTENT/ + * WRITE conflict between the tuple's intent lock and the relation's + * write lock. 
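A concrete sketch of how that detection plays out (hypothetical backends A and B operating on the same relation; linfo is the callers' LockInfo and tid an ItemPointerData they already have in hand):

    MultiLockReln(linfo, WRITE_LOCK);          /* A: write-locks the whole relation */
    MultiLockTuple(linfo, &tid, WRITE_LOCK);   /* B: wants to update a single tuple */

B's request is expanded top-down by MultiAcquire(): it first asks for WRITE_LOCK+INTENT (i.e. WRITE_INTENT) at the relation level, and because MultiConflicts[WRITE_INTENT] below includes (1 << WRITE_LOCK), B blocks on A's relation-level write before any page- or tuple-level lock is taken.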
+ */ +static int MultiConflicts[] = { + (int)NULL, + /* All reads and writes at any level conflict with a write lock */ + (1 << WRITE_LOCK)|(1 << WRITE_INTENT)|(1 << READ_LOCK)|(1 << READ_INTENT), + /* read locks conflict with write locks at curr and lower levels */ + (1 << WRITE_LOCK)| (1 << WRITE_INTENT), + /* write intent locks */ + (1 << READ_LOCK) | (1 << WRITE_LOCK), + /* read intent locks*/ + (1 << WRITE_LOCK), + /* extend locks for archive storage manager conflict only w/extend locks */ + (1 << EXTEND_LOCK) +}; + +/* + * write locks have higher priority than read locks and extend locks. May + * want to treat INTENT locks differently. + */ +static int MultiPrios[] = { + (int)NULL, + 2, + 1, + 2, + 1, + 1 +}; + +/* + * Lock table identifier for this lock table. The multi-level + * lock table is ONE lock table, not three. + */ +LockTableId MultiTableId = (LockTableId)NULL; +LockTableId ShortTermTableId = (LockTableId)NULL; + +/* + * Create the lock table described by MultiConflicts and Multiprio. + */ +LockTableId +InitMultiLevelLockm() +{ + int tableId; + + /* ----------------------- + * If we're already initialized just return the table id. + * ----------------------- + */ + if (MultiTableId) + return MultiTableId; + + tableId = LockTabInit("LockTable", MultiConflicts, MultiPrios, 5); + MultiTableId = tableId; + if (! (MultiTableId)) { + elog(WARN,"InitMultiLockm: couldnt initialize lock table"); + } + /* ----------------------- + * No short term lock table for now. -Jeff 15 July 1991 + * + * ShortTermTableId = LockTabRename(tableId); + * if (! (ShortTermTableId)) { + * elog(WARN,"InitMultiLockm: couldnt rename lock table"); + * } + * ----------------------- + */ + return MultiTableId; +} + +/* + * MultiLockReln -- lock a relation + * + * Returns: TRUE if the lock can be set, FALSE otherwise. + */ +bool +MultiLockReln(LockInfo linfo, LOCKT lockt) +{ + LOCKTAG tag; + + /* LOCKTAG has two bytes of padding, unfortunately. The + * hash function will return miss if the padding bytes aren't + * zero'd. + */ + memset(&tag,0,sizeof(tag)); + tag.relId = linfo->lRelId.relId; + tag.dbId = linfo->lRelId.dbId; + return(MultiAcquire(MultiTableId, &tag, lockt, RELN_LEVEL)); +} + +/* + * MultiLockTuple -- Lock the TID associated with a tuple + * + * Returns: TRUE if lock is set, FALSE otherwise. + * + * Side Effects: causes intention level locks to be set + * at the page and relation level. + */ +bool +MultiLockTuple(LockInfo linfo, ItemPointer tidPtr, LOCKT lockt) +{ + LOCKTAG tag; + + /* LOCKTAG has two bytes of padding, unfortunately. The + * hash function will return miss if the padding bytes aren't + * zero'd. + */ + memset(&tag,0,sizeof(tag)); + + tag.relId = linfo->lRelId.relId; + tag.dbId = linfo->lRelId.dbId; + + /* not locking any valid Tuple, just the page */ + tag.tupleId = *tidPtr; + return(MultiAcquire(MultiTableId, &tag, lockt, TUPLE_LEVEL)); +} + +/* + * same as above at page level + */ +bool +MultiLockPage(LockInfo linfo, ItemPointer tidPtr, LOCKT lockt) +{ + LOCKTAG tag; + + /* LOCKTAG has two bytes of padding, unfortunately. The + * hash function will return miss if the padding bytes aren't + * zero'd. + */ + memset(&tag,0,sizeof(tag)); + + + /* ---------------------------- + * Now we want to set the page offset to be invalid + * and lock the block. There is some confusion here as to what + * a page is. In Postgres a page is an 8k block, however this + * block may be partitioned into many subpages which are sometimes + * also called pages. 
The term is overloaded, so don't be fooled + * when we say lock the page we mean the 8k block. -Jeff 16 July 1991 + * ---------------------------- + */ + tag.relId = linfo->lRelId.relId; + tag.dbId = linfo->lRelId.dbId; + BlockIdCopy(&(tag.tupleId.ip_blkid), &(tidPtr->ip_blkid)); + return(MultiAcquire(MultiTableId, &tag, lockt, PAGE_LEVEL)); +} + +/* + * MultiAcquire -- acquire multi level lock at requested level + * + * Returns: TRUE if lock is set, FALSE if not + * Side Effects: + */ +bool +MultiAcquire(LockTableId tableId, + LOCKTAG *tag, + LOCKT lockt, + LOCK_LEVEL level) +{ + LOCKT locks[N_LEVELS]; + int i,status; + LOCKTAG xxTag, *tmpTag = &xxTag; + int retStatus = TRUE; + + /* + * Three levels implemented. If we set a low level (e.g. Tuple) + * lock, we must set INTENT locks on the higher levels. The + * intent lock detects conflicts between the low level lock + * and an existing high level lock. For example, setting a + * write lock on a tuple in a relation is disallowed if there + * is an existing read lock on the entire relation. The + * write lock would set a WRITE + INTENT lock on the relation + * and that lock would conflict with the read. + */ + switch (level) { + case RELN_LEVEL: + locks[0] = lockt; + locks[1] = NO_LOCK; + locks[2] = NO_LOCK; + break; + case PAGE_LEVEL: + locks[0] = lockt + INTENT; + locks[1] = lockt; + locks[2] = NO_LOCK; + break; + case TUPLE_LEVEL: + locks[0] = lockt + INTENT; + locks[1] = lockt + INTENT; + locks[2] = lockt; + break; + default: + elog(WARN,"MultiAcquire: bad lock level"); + return(FALSE); + } + + /* + * construct a new tag as we go. Always loop through all levels, + * but if we arent' seting a low level lock, locks[i] is set to + * NO_LOCK for the lower levels. Always start from the highest + * level and go to the lowest level. + */ + memset(tmpTag,0,sizeof(*tmpTag)); + tmpTag->relId = tag->relId; + tmpTag->dbId = tag->dbId; + + for (i=0;i<N_LEVELS;i++) { + if (locks[i] != NO_LOCK) { + switch (i) { + case RELN_LEVEL: + /* ------------- + * Set the block # and offset to invalid + * ------------- + */ + BlockIdSet(&(tmpTag->tupleId.ip_blkid), InvalidBlockNumber); + tmpTag->tupleId.ip_posid = InvalidOffsetNumber; + break; + case PAGE_LEVEL: + /* ------------- + * Copy the block #, set the offset to invalid + * ------------- + */ + BlockIdCopy(&(tmpTag->tupleId.ip_blkid), + &(tag->tupleId.ip_blkid)); + tmpTag->tupleId.ip_posid = InvalidOffsetNumber; + break; + case TUPLE_LEVEL: + /* -------------- + * Copy the entire tuple id. + * -------------- + */ + ItemPointerCopy(&tmpTag->tupleId, &tag->tupleId); + break; + } + + status = LockAcquire(tableId, tmpTag, locks[i]); + if (! status) { + /* failed for some reason. Before returning we have + * to release all of the locks we just acquired. + * MultiRelease(xx,xx,xx, i) means release starting from + * the last level lock we successfully acquired + */ + retStatus = FALSE; + (void) MultiRelease(tableId, tag, lockt, i); + /* now leave the loop. Don't try for any more locks */ + break; + } + } + } + return(retStatus); +} + +/* ------------------ + * Release a page in the multi-level lock table + * ------------------ + */ +bool +MultiReleasePage(LockInfo linfo, ItemPointer tidPtr, LOCKT lockt) +{ + LOCKTAG tag; + + /* ------------------ + * LOCKTAG has two bytes of padding, unfortunately. The + * hash function will return miss if the padding bytes aren't + * zero'd. 
+ * ------------------ + */ + memset(&tag, 0,sizeof(LOCKTAG)); + + tag.relId = linfo->lRelId.relId; + tag.dbId = linfo->lRelId.dbId; + BlockIdCopy(&(tag.tupleId.ip_blkid), &(tidPtr->ip_blkid)); + + return (MultiRelease(MultiTableId, &tag, lockt, PAGE_LEVEL)); +} + +/* ------------------ + * Release a relation in the multi-level lock table + * ------------------ + */ +bool +MultiReleaseReln(LockInfo linfo, LOCKT lockt) +{ + LOCKTAG tag; + + /* ------------------ + * LOCKTAG has two bytes of padding, unfortunately. The + * hash function will return miss if the padding bytes aren't + * zero'd. + * ------------------ + */ + memset(&tag, 0, sizeof(LOCKTAG)); + tag.relId = linfo->lRelId.relId; + tag.dbId = linfo->lRelId.dbId; + + return (MultiRelease(MultiTableId, &tag, lockt, RELN_LEVEL)); +} + +/* + * MultiRelease -- release a multi-level lock + * + * Returns: TRUE if successful, FALSE otherwise. + */ +bool +MultiRelease(LockTableId tableId, + LOCKTAG *tag, + LOCKT lockt, + LOCK_LEVEL level) +{ + LOCKT locks[N_LEVELS]; + int i,status; + LOCKTAG xxTag, *tmpTag = &xxTag; + + /* + * same level scheme as MultiAcquire(). + */ + switch (level) { + case RELN_LEVEL: + locks[0] = lockt; + locks[1] = NO_LOCK; + locks[2] = NO_LOCK; + break; + case PAGE_LEVEL: + locks[0] = lockt + INTENT; + locks[1] = lockt; + locks[2] = NO_LOCK; + break; + case TUPLE_LEVEL: + locks[0] = lockt + INTENT; + locks[1] = lockt + INTENT; + locks[2] = lockt; + break; + default: + elog(WARN,"MultiRelease: bad lockt"); + } + + /* + * again, construct the tag on the fly. This time, however, + * we release the locks in the REVERSE order -- from lowest + * level to highest level. + * + * Must zero out the tag to set padding byes to zero and ensure + * hashing consistency. + */ + memset(tmpTag, 0, sizeof(*tmpTag)); + tmpTag->relId = tag->relId; + tmpTag->dbId = tag->dbId; + + for (i=(N_LEVELS-1); i>=0; i--) { + if (locks[i] != NO_LOCK) { + switch (i) { + case RELN_LEVEL: + /* ------------- + * Set the block # and offset to invalid + * ------------- + */ + BlockIdSet(&(tmpTag->tupleId.ip_blkid), InvalidBlockNumber); + tmpTag->tupleId.ip_posid = InvalidOffsetNumber; + break; + case PAGE_LEVEL: + /* ------------- + * Copy the block #, set the offset to invalid + * ------------- + */ + BlockIdCopy(&(tmpTag->tupleId.ip_blkid), + &(tag->tupleId.ip_blkid)); + tmpTag->tupleId.ip_posid = InvalidOffsetNumber; + break; + case TUPLE_LEVEL: + ItemPointerCopy(&tmpTag->tupleId, &tag->tupleId); + break; + } + status = LockRelease(tableId, tmpTag, locks[i]); + if (! status) { + elog(WARN,"MultiRelease: couldn't release after error"); + } + } + } + /* shouldn't reach here */ + return false; +} diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c new file mode 100644 index 00000000000..0955cdfc2f5 --- /dev/null +++ b/src/backend/storage/lmgr/proc.c @@ -0,0 +1,826 @@ +/*------------------------------------------------------------------------- + * + * proc.c-- + * routines to manage per-process shared memory data structure + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/lmgr/proc.c,v 1.1.1.1 1996/07/09 06:21:57 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +/* + * Each postgres backend gets one of these. We'll use it to + * clean up after the process should the process suddenly die. 
+ * + * + * Interface (a): + * ProcSleep(), ProcWakeup(), ProcWakeupNext(), + * ProcQueueAlloc() -- create a shm queue for sleeping processes + * ProcQueueInit() -- create a queue without allocing memory + * + * Locking and waiting for buffers can cause the backend to be + * put to sleep. Whoever releases the lock, etc. wakes the + * process up again (and gives it an error code so it knows + * whether it was awoken on an error condition). + * + * Interface (b): + * + * ProcReleaseLocks -- frees the locks associated with this process, + * ProcKill -- destroys the shared memory state (and locks) + * associated with the process. + * + * 5/15/91 -- removed the buffer pool based lock chain in favor + * of a shared memory lock chain. The write-protection is + * more expensive if the lock chain is in the buffer pool. + * The only reason I kept the lock chain in the buffer pool + * in the first place was to allow the lock table to grow larger + * than available shared memory and that isn't going to work + * without a lot of unimplemented support anyway. + * + * 4/7/95 -- instead of allocating a set of 1 semaphore per process, we + * allocate a semaphore from a set of PROC_NSEMS_PER_SET semaphores + * shared among backends (we keep a few sets of semaphores around). + * This is so that we can support more backends. (system-wide semaphore + * sets run out pretty fast.) -ay 4/95 + * + * $Header: /cvsroot/pgsql/src/backend/storage/lmgr/proc.c,v 1.1.1.1 1996/07/09 06:21:57 scrappy Exp $ + */ +#include <sys/time.h> +#ifndef WIN32 +#include <unistd.h> +#endif /* WIN32 */ +#include <string.h> +#include <sys/types.h> +#include "libpq/pqsignal.h" /* substitute for <signal.h> */ + +#if defined(PORTNAME_bsdi) +/* hacka, hacka, hacka (XXX) */ +union semun { + int val; /* value for SETVAL */ + struct semid_ds *buf; /* buffer for IPC_STAT & IPC_SET */ + ushort *array; /* array for GETALL & SETALL */ +}; +#endif + +#include "access/xact.h" +#include "utils/hsearch.h" +#include "utils/elog.h" + +#include "storage/buf.h" +#include "storage/lock.h" +#include "storage/shmem.h" +#include "storage/spin.h" +#include "storage/proc.h" + +/* + * timeout (in seconds) for resolving possible deadlock + */ +#ifndef DEADLOCK_TIMEOUT +#define DEADLOCK_TIMEOUT 60 +#endif + +/* -------------------- + * Spin lock for manipulating the shared process data structure: + * ProcGlobal.... Adding an extra spin lock seemed like the smallest + * hack to get around reading and updating this structure in shared + * memory. -mer 17 July 1991 + * -------------------- + */ +SPINLOCK ProcStructLock; + +/* + * For cleanup routines. Don't cleanup if the initialization + * has not happened. + */ +static bool ProcInitialized = FALSE; + +static PROC_HDR *ProcGlobal = NULL; + +PROC *MyProc = NULL; + +static void ProcKill(int exitStatus, int pid); +static void ProcGetNewSemKeyAndNum(IPCKey *key, int *semNum); +static void ProcFreeSem(IpcSemaphoreKey semKey, int semNum); +#if defined(PORTNAME_linux) +extern int HandleDeadLock(int); +#else +extern int HandleDeadLock(void); +#endif +/* + * InitProcGlobal - + * initializes the global process table. We put it here so that + * the postmaster can do this initialization. (ProcFreeAllSem needs + * to read this table on exiting the postmaster. If we have the first + * backend do this, starting up and killing the postmaster without + * starting any backends will be a problem.) 
+ */ +void +InitProcGlobal(IPCKey key) +{ + bool found = false; + + /* attach to the free list */ + ProcGlobal = (PROC_HDR *) + ShmemInitStruct("Proc Header",(unsigned)sizeof(PROC_HDR),&found); + + /* -------------------- + * We're the first - initialize. + * -------------------- + */ + if (! found) + { + int i; + + ProcGlobal->numProcs = 0; + ProcGlobal->freeProcs = INVALID_OFFSET; + ProcGlobal->currKey = IPCGetProcessSemaphoreInitKey(key); + for (i=0; i < MAX_PROC_SEMS/PROC_NSEMS_PER_SET; i++) + ProcGlobal->freeSemMap[i] = 0; + } +} + +/* ------------------------ + * InitProc -- create a per-process data structure for this process + * used by the lock manager on semaphore queues. + * ------------------------ + */ +void +InitProcess(IPCKey key) +{ + bool found = false; + int pid; + int semstat; + unsigned long location, myOffset; + + /* ------------------ + * Routine called if deadlock timer goes off. See ProcSleep() + * ------------------ + */ +#ifndef WIN32 + signal(SIGALRM, HandleDeadLock); +#endif /* WIN32 we'll have to figure out how to handle this later */ + + SpinAcquire(ProcStructLock); + + /* attach to the free list */ + ProcGlobal = (PROC_HDR *) + ShmemInitStruct("Proc Header",(unsigned)sizeof(PROC_HDR),&found); + if (!found) { + /* this should not happen. InitProcGlobal() is called before this. */ + elog(WARN, "InitProcess: Proc Header uninitialized"); + } + + if (MyProc != NULL) + { + SpinRelease(ProcStructLock); + elog(WARN,"ProcInit: you already exist"); + return; + } + + /* try to get a proc from the free list first */ + + myOffset = ProcGlobal->freeProcs; + + if (myOffset != INVALID_OFFSET) + { + MyProc = (PROC *) MAKE_PTR(myOffset); + ProcGlobal->freeProcs = MyProc->links.next; + } + else + { + /* have to allocate one. We can't use the normal binding + * table mechanism because the proc structure is stored + * by PID instead of by a global name (need to look it + * up by PID when we cleanup dead processes). + */ + + MyProc = (PROC *) ShmemAlloc((unsigned)sizeof(PROC)); + if (! MyProc) + { + SpinRelease(ProcStructLock); + elog (FATAL,"cannot create new proc: out of memory"); + } + + /* this cannot be initialized until after the buffer pool */ + SHMQueueInit(&(MyProc->lockQueue)); + MyProc->procId = ProcGlobal->numProcs; + ProcGlobal->numProcs++; + } + + /* + * zero out the spin lock counts and set the sLocks field for + * ProcStructLock to 1 as we have acquired this spinlock above but + * didn't record it since we didn't have MyProc until now. + */ + memset(MyProc->sLocks, 0, sizeof(MyProc->sLocks)); + MyProc->sLocks[ProcStructLock] = 1; + + + if (IsUnderPostmaster) { + IPCKey semKey; + int semNum; + int semId; + union semun semun; + + ProcGetNewSemKeyAndNum(&semKey, &semNum); + + semId = IpcSemaphoreCreate(semKey, + PROC_NSEMS_PER_SET, + IPCProtection, + IpcSemaphoreDefaultStartValue, + 0, + &semstat); + /* + * we might be reusing a semaphore that belongs to a dead + * backend. So be careful and reinitialize its value here. + */ + semun.val = IpcSemaphoreDefaultStartValue; + semctl(semId, semNum, SETVAL, semun); + + IpcSemaphoreLock(semId, semNum, IpcExclusiveLock); + MyProc->sem.semId = semId; + MyProc->sem.semNum = semNum; + MyProc->sem.semKey = semKey; + } else { + MyProc->sem.semId = -1; + } + + /* ---------------------- + * Release the lock. + * ---------------------- + */ + SpinRelease(ProcStructLock); + + MyProc->pid = 0; +#if 0 + MyProc->pid = MyPid; +#endif + + /* ---------------- + * Start keeping spin lock stats from here on. 
Any botch before + * this initialization is forever botched + * ---------------- + */ + memset(MyProc->sLocks, 0, MAX_SPINS*sizeof(*MyProc->sLocks)); + + /* ------------------------- + * Install ourselves in the binding table. The name to + * use is determined by the OS-assigned process id. That + * allows the cleanup process to find us after any untimely + * exit. + * ------------------------- + */ + pid = getpid(); + location = MAKE_OFFSET(MyProc); + if ((! ShmemPIDLookup(pid,&location)) || (location != MAKE_OFFSET(MyProc))) + { + elog(FATAL,"InitProc: ShmemPID table broken"); + } + + MyProc->errType = NO_ERROR; + SHMQueueElemInit(&(MyProc->links)); + + on_exitpg(ProcKill, (caddr_t)pid); + + ProcInitialized = TRUE; +} + +/* + * ProcReleaseLocks() -- release all locks associated with this process + * + */ +void +ProcReleaseLocks() +{ + if (!MyProc) + return; + LockReleaseAll(1,&MyProc->lockQueue); +} + +/* + * ProcRemove - + * used by the postmaster to clean up the global tables. This also frees + * up the semaphore used for the lmgr of the process. (We have to do + * this is the postmaster instead of doing a IpcSemaphoreKill on exiting + * the process because the semaphore set is shared among backends and + * we don't want to remove other's semaphores on exit.) + */ +bool +ProcRemove(int pid) +{ + SHMEM_OFFSET location; + PROC *proc; + + location = INVALID_OFFSET; + + location = ShmemPIDDestroy(pid); + if (location == INVALID_OFFSET) + return(FALSE); + proc = (PROC *) MAKE_PTR(location); + + SpinAcquire(ProcStructLock); + + ProcFreeSem(proc->sem.semKey, proc->sem.semNum); + + proc->links.next = ProcGlobal->freeProcs; + ProcGlobal->freeProcs = MAKE_OFFSET(proc); + + SpinRelease(ProcStructLock); + + return(TRUE); +} + +/* + * ProcKill() -- Destroy the per-proc data structure for + * this process. Release any of its held spin locks. + */ +static void +ProcKill(int exitStatus, int pid) +{ + PROC *proc; + SHMEM_OFFSET location; + + /* -------------------- + * If this is a FATAL exit the postmaster will have to kill all the + * existing backends and reinitialize shared memory. So all we don't + * need to do anything here. + * -------------------- + */ + if (exitStatus != 0) + return; + + if (! pid) + { + pid = getpid(); + } + + ShmemPIDLookup(pid,&location); + if (location == INVALID_OFFSET) + return; + + proc = (PROC *) MAKE_PTR(location); + + if (proc != MyProc) { + Assert( pid != getpid() ); + } else + MyProc = NULL; + + /* --------------- + * Assume one lock table. + * --------------- + */ + ProcReleaseSpins(proc); + LockReleaseAll(1,&proc->lockQueue); + + /* ---------------- + * get off the wait queue + * ---------------- + */ + LockLockTable(); + if (proc->links.next != INVALID_OFFSET) { + Assert(proc->waitLock->waitProcs.size > 0); + SHMQueueDelete(&(proc->links)); + --proc->waitLock->waitProcs.size; + } + SHMQueueElemInit(&(proc->links)); + UnlockLockTable(); + + return; +} + +/* + * ProcQueue package: routines for putting processes to sleep + * and waking them up + */ + +/* + * ProcQueueAlloc -- alloc/attach to a shared memory process queue + * + * Returns: a pointer to the queue or NULL + * Side Effects: Initializes the queue if we allocated one + */ +PROC_QUEUE * +ProcQueueAlloc(char *name) +{ + bool found; + PROC_QUEUE *queue = (PROC_QUEUE *) + ShmemInitStruct(name,(unsigned)sizeof(PROC_QUEUE),&found); + + if (! queue) + { + return(NULL); + } + if (! 
found) + { + ProcQueueInit(queue); + } + return(queue); +} + +/* + * ProcQueueInit -- initialize a shared memory process queue + */ +void +ProcQueueInit(PROC_QUEUE *queue) +{ + SHMQueueInit(&(queue->links)); + queue->size = 0; +} + + + +/* + * ProcSleep -- put a process to sleep + * + * P() on the semaphore should put us to sleep. The process + * semaphore is cleared by default, so the first time we try + * to acquire it, we sleep. + * + * ASSUME: that no one will fiddle with the queue until after + * we release the spin lock. + * + * NOTES: The process queue is now a priority queue for locking. + */ +int +ProcSleep(PROC_QUEUE *queue, + SPINLOCK spinlock, + int token, + int prio, + LOCK *lock) +{ + int i; + PROC *proc; +#ifndef WIN32 /* figure this out later */ + struct itimerval timeval, dummy; +#endif /* WIN32 */ + + proc = (PROC *) MAKE_PTR(queue->links.prev); + for (i=0;i<queue->size;i++) + { + if (proc->prio < prio) + proc = (PROC *) MAKE_PTR(proc->links.prev); + else + break; + } + + MyProc->token = token; + MyProc->waitLock = lock; + + /* ------------------- + * currently, we only need this for the ProcWakeup routines + * ------------------- + */ + TransactionIdStore((TransactionId) GetCurrentTransactionId(), &MyProc->xid); + + /* ------------------- + * assume that these two operations are atomic (because + * of the spinlock). + * ------------------- + */ + SHMQueueInsertTL(&(proc->links),&(MyProc->links)); + queue->size++; + + SpinRelease(spinlock); + + /* -------------- + * Postgres does not have any deadlock detection code and for this + * reason we must set a timer to wake up the process in the event of + * a deadlock. For now the timer is set for 1 minute and we assume that + * any process which sleeps for this amount of time is deadlocked and will + * receive a SIGALRM signal. The handler should release the processes + * semaphore and abort the current transaction. + * + * Need to zero out struct to set the interval and the micro seconds fields + * to 0. + * -------------- + */ +#ifndef WIN32 + memset(&timeval, 0, sizeof(struct itimerval)); + timeval.it_value.tv_sec = DEADLOCK_TIMEOUT; + + if (setitimer(ITIMER_REAL, &timeval, &dummy)) + elog(FATAL, "ProcSleep: Unable to set timer for process wakeup"); +#endif /* WIN32 */ + + /* -------------- + * if someone wakes us between SpinRelease and IpcSemaphoreLock, + * IpcSemaphoreLock will not block. The wakeup is "saved" by + * the semaphore implementation. + * -------------- + */ + IpcSemaphoreLock(MyProc->sem.semId, MyProc->sem.semNum, IpcExclusiveLock); + + /* --------------- + * We were awoken before a timeout - now disable the timer + * --------------- + */ +#ifndef WIN32 + timeval.it_value.tv_sec = 0; + + + if (setitimer(ITIMER_REAL, &timeval, &dummy)) + elog(FATAL, "ProcSleep: Unable to diable timer for process wakeup"); +#endif /* WIN32 */ + + /* ---------------- + * We were assumed to be in a critical section when we went + * to sleep. + * ---------------- + */ + SpinAcquire(spinlock); + + return(MyProc->errType); +} + + +/* + * ProcWakeup -- wake up a process by releasing its private semaphore. + * + * remove the process from the wait queue and set its links invalid. + * RETURN: the next process in the wait queue. 
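A rough sketch of the caller's side of this sleep/wakeup handshake, modeled on WaitOnLock() in lock.c (identifiers as in that routine; error handling elided into a comment):

    /* the lock table's masterLock spinlock is already held here */
    if (ProcSleep(&lock->waitProcs,          /* queue to sleep on            */
                  ltable->ctl->masterLock,   /* released while we sleep      */
                  lockt,                     /* token recorded in MyProc     */
                  ltable->ctl->prio[lockt],  /* where to insert in the queue */
                  lock) != NO_ERROR)
    {
        /* woken with an error, e.g. by the deadlock timeout: abort */
    }
    /* masterLock has been re-acquired by ProcSleep() before it returns */

The releasing side runs ProcLockWakeup(), which grants the lock to each compatible waiter and then calls ProcWakeup() to release that waiter's private semaphore.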
+ */ +PROC * +ProcWakeup(PROC *proc, int errType) +{ + PROC *retProc; + /* assume that spinlock has been acquired */ + + if (proc->links.prev == INVALID_OFFSET || + proc->links.next == INVALID_OFFSET) + return((PROC *) NULL); + + retProc = (PROC *) MAKE_PTR(proc->links.prev); + + /* you have to update waitLock->waitProcs.size yourself */ + SHMQueueDelete(&(proc->links)); + SHMQueueElemInit(&(proc->links)); + + proc->errType = errType; + + IpcSemaphoreUnlock(proc->sem.semId, proc->sem.semNum, IpcExclusiveLock); + + return retProc; +} + + +/* + * ProcGetId -- + */ +int +ProcGetId() +{ + return( MyProc->procId ); +} + +/* + * ProcLockWakeup -- routine for waking up processes when a lock is + * released. + */ +int +ProcLockWakeup(PROC_QUEUE *queue, char *ltable, char *lock) +{ + PROC *proc; + int count; + + if (! queue->size) + return(STATUS_NOT_FOUND); + + proc = (PROC *) MAKE_PTR(queue->links.prev); + count = 0; + while ((LockResolveConflicts ((LOCKTAB *) ltable, + (LOCK *) lock, + proc->token, + proc->xid) == STATUS_OK)) + { + /* there was a waiting process, grant it the lock before waking it + * up. This will prevent another process from seizing the lock + * between the time we release the lock master (spinlock) and + * the time that the awoken process begins executing again. + */ + GrantLock((LOCK *) lock, proc->token); + queue->size--; + + /* + * ProcWakeup removes proc from the lock waiting process queue and + * returns the next proc in chain. If a writer just dropped + * its lock and there are several waiting readers, wake them all up. + */ + proc = ProcWakeup(proc, NO_ERROR); + + count++; + if (!proc || queue->size == 0) + break; + } + + if (count) + return(STATUS_OK); + else + /* Something is still blocking us. May have deadlocked. */ + return(STATUS_NOT_FOUND); +} + +void +ProcAddLock(SHM_QUEUE *elem) +{ + SHMQueueInsertTL(&MyProc->lockQueue,elem); +} + +/* -------------------- + * We only get to this routine if we got SIGALRM after DEADLOCK_TIMEOUT + * while waiting for a lock to be released by some other process. After + * the one minute deadline we assume we have a deadlock and must abort + * this transaction. We must also indicate that I'm no longer waiting + * on a lock so that other processes don't try to wake me up and screw + * up my semaphore. + * -------------------- + */ +int +#if defined(PORTNAME_linux) +HandleDeadLock(int i) +#else +HandleDeadLock() +#endif +{ + LOCK *lock; + int size; + + LockLockTable(); + + /* --------------------- + * Check to see if we've been awoken by anyone in the interim. + * + * If we have we can return and resume our transaction -- happy day. + * Before we are awoken the process releasing the lock grants it to + * us so we know that we don't have to wait anymore. + * + * Damn these names are LONG! -mer + * --------------------- + */ + if (IpcSemaphoreGetCount(MyProc->sem.semId, MyProc->sem.semNum) == + IpcSemaphoreDefaultStartValue) { + UnlockLockTable(); + return 1; + } + + /* + * you would think this would be unnecessary, but... + * + * this also means we've been removed already. in some ports + * (e.g., sparc and aix) the semop(2) implementation is such that + * we can actually end up in this handler after someone has removed + * us from the queue and bopped the semaphore *but the test above + * fails to detect the semaphore update* (presumably something weird + * having to do with the order in which the semaphore wakeup signal + * and SIGALRM get handled). 
+ */ + if (MyProc->links.prev == INVALID_OFFSET || + MyProc->links.next == INVALID_OFFSET) { + UnlockLockTable(); + return(1); + } + + lock = MyProc->waitLock; + size = lock->waitProcs.size; /* so we can look at this in the core */ + + /* ------------------------ + * Get this process off the lock's wait queue + * ------------------------ + */ + Assert(lock->waitProcs.size > 0); + --lock->waitProcs.size; + SHMQueueDelete(&(MyProc->links)); + SHMQueueElemInit(&(MyProc->links)); + + /* ------------------ + * Unlock my semaphore so that the count is right for next time. + * I was awoken by a signal, not by someone unlocking my semaphore. + * ------------------ + */ + IpcSemaphoreUnlock(MyProc->sem.semId, MyProc->sem.semNum, IpcExclusiveLock); + + /* ------------- + * Set MyProc->errType to STATUS_ERROR so that we abort after + * returning from this handler. + * ------------- + */ + MyProc->errType = STATUS_ERROR; + + /* + * if this doesn't follow the IpcSemaphoreUnlock then we get lock + * table corruption ("LockReplace: xid table corrupted") due to + * race conditions. i don't claim to understand this... + */ + UnlockLockTable(); + + elog(NOTICE, "Timeout -- possible deadlock"); + return 0; +} + +void +ProcReleaseSpins(PROC *proc) +{ + int i; + + if (!proc) + proc = MyProc; + + if (!proc) + return; + for (i=0; i < (int)MAX_SPINS; i++) + { + if (proc->sLocks[i]) + { + Assert(proc->sLocks[i] == 1); + SpinRelease(i); + } + } +} + +/***************************************************************************** + * + *****************************************************************************/ + +/* + * ProcGetNewSemKeyAndNum - + * scan the free semaphore bitmap and allocate a single semaphore from + * a semaphore set. (If the semaphore set doesn't exist yet, + * IpcSemaphoreCreate will create it. Otherwise, we use the existing + * semaphore set.) + */ +static void +ProcGetNewSemKeyAndNum(IPCKey *key, int *semNum) +{ + int i; + int32 *freeSemMap = ProcGlobal->freeSemMap; + unsigned int fullmask; + + /* + * we hold ProcStructLock when entering this routine. We scan through + * the bitmap to look for a free semaphore. + */ + fullmask = ~0 >> (32 - PROC_NSEMS_PER_SET); + for(i=0; i < MAX_PROC_SEMS/PROC_NSEMS_PER_SET; i++) { + int mask = 1; + int j; + + if (freeSemMap[i] == fullmask) + continue; /* none free for this set */ + + for(j = 0; j < PROC_NSEMS_PER_SET; j++) { + if ((freeSemMap[i] & mask) == 0) { + /* + * a free semaphore found. Mark it as allocated. + */ + freeSemMap[i] |= mask; + + *key = ProcGlobal->currKey + i; + *semNum = j; + return; + } + mask <<= 1; + } + } + + /* if we reach here, all the semaphores are in use. */ + elog(WARN, "InitProc: cannot allocate a free semaphore"); +} + +/* + * ProcFreeSem - + * free up our semaphore in the semaphore set. If we're the last one + * in the set, also remove the semaphore set. + */ +static void +ProcFreeSem(IpcSemaphoreKey semKey, int semNum) +{ + int mask; + int i; + int32 *freeSemMap = ProcGlobal->freeSemMap; + + i = semKey - ProcGlobal->currKey; + mask = ~(1 << semNum); + freeSemMap[i] &= mask; + + if (freeSemMap[i]==0) + IpcSemaphoreKill(semKey); +} + +/* + * ProcFreeAllSemaphores - + * on exiting the postmaster, we free up all the semaphores allocated + * to the lmgrs of the backends. 
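A small worked example of the freeSemMap bookkeeping above (the values are hypothetical): each int32 in freeSemMap describes one semaphore set, one bit per semaphore, with a set bit meaning "allocated". If freeSemMap[0] == 0x0007, semaphores 0 through 2 of set 0 are in use; ProcGetNewSemKeyAndNum() finds bit 3 clear, sets it (the word becomes 0x000f), and returns key = ProcGlobal->currKey + 0 and semNum = 3. A later ProcFreeSem(key, 3) clears that bit again, and only when the whole word drops back to zero is the set itself removed with IpcSemaphoreKill().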
+ */ +void +ProcFreeAllSemaphores() +{ + int i; + int32 *freeSemMap = ProcGlobal->freeSemMap; + + for(i=0; i < MAX_PROC_SEMS/PROC_NSEMS_PER_SET; i++) { + if (freeSemMap[i]!=0) + IpcSemaphoreKill(ProcGlobal->currKey + i); + } +} diff --git a/src/backend/storage/lmgr/single.c b/src/backend/storage/lmgr/single.c new file mode 100644 index 00000000000..8d41ea38bb6 --- /dev/null +++ b/src/backend/storage/lmgr/single.c @@ -0,0 +1,86 @@ +/*------------------------------------------------------------------------- + * + * single.c-- + * set single locks in the multi-level lock hierarchy + * + * Sometimes we don't want to set all levels of the multi-level + * lock hierarchy at once. This allows us to set and release + * one level at a time. It's useful in index scans when + * you can set an intent lock at the beginning and thereafter + * only set page locks. Tends to speed things up. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/lmgr/Attic/single.c,v 1.1.1.1 1996/07/09 06:21:57 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include <string.h> +#include "storage/lmgr.h" /* where the declarations go */ +#include "storage/lock.h" +#include "storage/multilev.h" +#include "utils/rel.h" + +/* + * SingleLockReln -- lock a relation + * + * Returns: TRUE if the lock can be set, FALSE otherwise. + */ +bool +SingleLockReln(LockInfo linfo, LOCKT lockt, int action) +{ + LOCKTAG tag; + + /* + * LOCKTAG has two bytes of padding, unfortunately. The + * hash function will return miss if the padding bytes aren't + * zero'd. + */ + memset(&tag,0,sizeof(tag)); + tag.relId = linfo->lRelId.relId; + tag.dbId = linfo->lRelId.dbId; + BlockIdSet(&(tag.tupleId.ip_blkid), InvalidBlockNumber); + tag.tupleId.ip_posid = InvalidOffsetNumber; + + if (action == UNLOCK) + return(LockRelease(MultiTableId, &tag, lockt)); + else + return(LockAcquire(MultiTableId, &tag, lockt)); +} + +/* + * SingleLockPage -- use multi-level lock table, but lock + * only at the page level. + * + * Assumes that an INTENT lock has already been set in the + * multi-level lock table. + * + */ +bool +SingleLockPage(LockInfo linfo, + ItemPointer tidPtr, + LOCKT lockt, + int action) +{ + LOCKTAG tag; + + /* + * LOCKTAG has two bytes of padding, unfortunately. The + * hash function will return miss if the padding bytes aren't + * zero'd. 
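A rough sketch of the index-scan pattern described in this file's header comment (an illustration, not code from the original sources; it assumes the RelationSetRIntentLock()/RelationUnsetRIntentLock() entry points from lmgr.c, and indexrel/tid are whatever the scan already has in hand):

    /* once, at the start of the scan: relation-level read intent */
    RelationSetRIntentLock(indexrel);

    /* for each leaf page visited (tid points into that page) */
    SingleLockPage((LockInfo) indexrel->lockInfo, &tid, READ_LOCK, !UNLOCK);
    /* ... examine the tuples on the page ... */
    SingleLockPage((LockInfo) indexrel->lockInfo, &tid, READ_LOCK, UNLOCK);

    /* once, when the scan is done */
    RelationUnsetRIntentLock(indexrel);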
+ */ + memset(&tag,0,sizeof(tag)); + tag.relId = linfo->lRelId.relId; + tag.dbId = linfo->lRelId.dbId; + BlockIdCopy(&(tag.tupleId.ip_blkid), &(tidPtr->ip_blkid)); + tag.tupleId.ip_posid = InvalidOffsetNumber; + + + if (action == UNLOCK) + return(LockRelease(MultiTableId, &tag, lockt)); + else + return(LockAcquire(MultiTableId, &tag, lockt)); +} + diff --git a/src/backend/storage/lock.h b/src/backend/storage/lock.h new file mode 100644 index 00000000000..df490e76512 --- /dev/null +++ b/src/backend/storage/lock.h @@ -0,0 +1,218 @@ +/*------------------------------------------------------------------------- + * + * lock.h-- + * + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: lock.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef LOCK_H_ +#define LOCK_H_ + +#include "postgres.h" +#include "storage/itemptr.h" +#include "storage/shmem.h" +#include "storage/spin.h" +#include "storage/backendid.h" +#include "utils/hsearch.h" + +extern SPINLOCK LockMgrLock; +typedef int MASK; + +#define INIT_TABLE_SIZE 100 +#define MAX_TABLE_SIZE 1000 + + +/* ---------------------- + * The following defines are used to estimate how much shared + * memory the lock manager is going to require. + * + * NBACKENDS - The number of concurrently running backends + * NLOCKS_PER_XACT - The number of unique locks acquired in a transaction + * NLOCKENTS - The maximum number of lock entries in the lock table. + * ---------------------- + */ +#define NBACKENDS 50 +#define NLOCKS_PER_XACT 40 +#define NLOCKENTS NLOCKS_PER_XACT*NBACKENDS + +typedef int LOCK_TYPE; +typedef int LOCKT; +typedef int LockTableId; + +/* MAX_LOCKTYPES cannot be larger than the bits in MASK */ +#define MAX_LOCKTYPES 6 + +/* + * MAX_TABLES corresponds to the number of spin locks allocated in + * CreateSpinLocks() or the number of shared memory locations allocated + * for lock table spin locks in the case of machines with TAS instructions. + */ +#define MAX_TABLES 2 + +#define INVALID_TABLEID 0 + +/*typedef struct LOCK LOCK; */ + + +typedef struct ltag { + Oid relId; + Oid dbId; + ItemPointerData tupleId; +} LOCKTAG; + +#define TAGSIZE (sizeof(LOCKTAG)) + +/* This is the control structure for a lock table. It + * lives in shared memory: + * + * tableID -- the handle used by the lock table's clients to + * refer to the table. + * + * nLockTypes -- number of lock types (READ,WRITE,etc) that + * are defined on this lock table + * + * conflictTab -- this is an array of bitmasks showing lock + * type conflicts. conflictTab[i] is a mask with the j-th bit + * turned on if lock types i and j conflict. + * + * prio -- each locktype has a priority, so, for example, waiting + * writers can be given priority over readers (to avoid + * starvation). + * + * masterlock -- synchronizes access to the table + * + */ +typedef struct lockctl { + LockTableId tableId; + int nLockTypes; + int conflictTab[MAX_LOCKTYPES]; + int prio[MAX_LOCKTYPES]; + SPINLOCK masterLock; +} LOCKCTL; + +/* + * lockHash -- hash table on lock Ids, + * xidHash -- hash on xid and lockId in case + * multiple processes are holding the lock + * ctl - control structure described above. + */ +typedef struct ltable { + HTAB *lockHash; + HTAB *xidHash; + LOCKCTL *ctl; +} LOCKTAB; + +/* ----------------------- + * A transaction never conflicts with its own locks. 
Hence, if + * multiple transactions hold non-conflicting locks on the same + * data, private per-transaction information must be stored in the + * XID table. The tag is XID + shared memory lock address so that + * all locks can use the same XID table. The private information + * we store is the number of locks of each type (holders) and the + * total number of locks (nHolding) held by the transaction. + * + * NOTE: -- + * There were some problems with the fact that currently TransactionIdData + * is a 5 byte entity and compilers long word aligning of structure fields. + * If the 3 byte padding is put in front of the actual xid data then the + * hash function (which uses XID_TAGSIZE when deciding how many bytes of a + * struct to look at for the key) might only see the last two bytes of the xid. + * + * Clearly this is not good since its likely that these bytes will be the + * same for many transactions and hence they will share the same entry in + * hash table causing the entry to be corrupted. For this long-winded + * reason I have put the tag in a struct of its own to ensure that the + * XID_TAGSIZE is computed correctly. It used to be sizeof (SHMEM_OFFSET) + + * sizeof(TransactionIdData) which != sizeof(XIDTAG). + * + * Finally since the hash function will now look at all 12 bytes of the tag + * the padding bytes MUST be zero'd before use in hash_search() as they + * will have random values otherwise. Jeff 22 July 1991. + * ----------------------- + */ + +typedef struct XIDTAG { + SHMEM_OFFSET lock; + int pid; + TransactionId xid; +} XIDTAG; + +typedef struct XIDLookupEnt { + /* tag */ + XIDTAG tag; + + /* data */ + int holders[MAX_LOCKTYPES]; + int nHolding; + SHM_QUEUE queue; +} XIDLookupEnt; + +#define XID_TAGSIZE (sizeof(XIDTAG)) + +/* originally in procq.h */ +typedef struct procQueue { + SHM_QUEUE links; + int size; +} PROC_QUEUE; + + +/* + * lock information: + * + * tag -- uniquely identifies the object being locked + * mask -- union of the conflict masks of all lock types + * currently held on this object. + * waitProcs -- queue of processes waiting for this lock + * holders -- count of each lock type currently held on the + * lock. + * nHolding -- total locks of all types. 
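
The NOTE above about XID_TAGSIZE and padding is easy to gloss over; the following stand-alone sketch (a struct merely shaped like XIDTAG, with made-up field widths) shows why a byte-wise hash key must be zeroed before its fields are filled in:

#include <stdio.h>
#include <string.h>

/* Merely shaped like XIDTAG: a word-aligned field followed by smaller
 * ones, so the compiler is free to insert padding bytes. */
typedef struct {
    long lock;        /* stands in for SHMEM_OFFSET */
    int  pid;
    char xid[5];      /* stands in for the 5-byte TransactionIdData */
} DemoTag;

int
main()
{
    DemoTag tag;

    /* sizeof() counts the padding, and a hash that reads sizeof(DemoTag)
     * raw bytes sees it too -- hence the rule that the whole struct must
     * be zeroed before the fields are assigned. */
    memset(&tag, 0, sizeof(DemoTag));
    tag.lock = 1024;
    tag.pid = 42;
    memcpy(tag.xid, "00001", 5);

    printf("bytes in the fields: %d, bytes a raw hash would read: %d\n",
           (int) (sizeof(long) + sizeof(int) + 5), (int) sizeof(DemoTag));
    return 0;
}
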
+ */ +typedef struct Lock { + /* hash key */ + LOCKTAG tag; + + /* data */ + int mask; + PROC_QUEUE waitProcs; + int holders[MAX_LOCKTYPES]; + int nHolding; + int activeHolders[MAX_LOCKTYPES]; + int nActive; +} LOCK; + +#define LockGetLock_nHolders(l) l->nHolders + +#define LockDecrWaitHolders(lock, lockt) \ + lock->nHolding--; \ + lock->holders[lockt]-- + +#define LockLockTable() SpinAcquire(LockMgrLock); +#define UnlockLockTable() SpinRelease(LockMgrLock); + +extern SPINLOCK LockMgrLock; + +/* + * function prototypes + */ +extern void InitLocks(void); +extern void LockDisable(int status); +extern LockTableId LockTabInit(char *tabName, MASK *conflictsP, int *prioP, + int ntypes); +extern LockTableId LockTabRename(LockTableId tableId); +extern bool LockAcquire(LockTableId tableId, LOCKTAG *lockName, LOCKT lockt); +extern int LockResolveConflicts(LOCKTAB *ltable, LOCK *lock, LOCKT lockt, + TransactionId xid); +extern int WaitOnLock(LOCKTAB *ltable, LockTableId tableId, LOCK *lock, + LOCKT lockt); +extern bool LockRelease(LockTableId tableId, LOCKTAG *lockName, LOCKT lockt); +extern void GrantLock(LOCK *lock, LOCKT lockt); +extern bool LockReleaseAll(LockTableId tableId, SHM_QUEUE *lockQueue); +extern int LockShmemSize(void); +extern bool LockingDisabled(void); + +#endif /* LOCK_H */ diff --git a/src/backend/storage/multilev.h b/src/backend/storage/multilev.h new file mode 100644 index 00000000000..582c1cb6c37 --- /dev/null +++ b/src/backend/storage/multilev.h @@ -0,0 +1,64 @@ +/*------------------------------------------------------------------------- + * + * multilev.h-- + * multi level lock table consts/defs for single.c and multi.c and their + * clients + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: multilev.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef MULTILEV_H +#define MULTILEV_H + +#include "storage/lock.h" +#include "storage/lmgr.h" + +#define READ_LOCK 2 +#define WRITE_LOCK 1 + +/* any time a small granularity READ/WRITE lock is set. + * Higher granularity READ_INTENT/WRITE_INTENT locks must + * also be set. A read intent lock is has value READ+INTENT. + * in this implementation. 
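
lock.h above describes conflictTab[] as an array of bitmasks that is consulted when a lock request arrives (LockResolveConflicts is the consumer). A minimal stand-alone sketch of that idea; the lock-type numbering and bit assignment here are illustrative, not the values the lock manager actually uses:

#include <stdio.h>

#define READ   1      /* illustrative lock-type numbers, */
#define WRITE  2      /* not the values multilev.h uses  */
#define NTYPES 3

int
main()
{
    int conflictTab[NTYPES] = { 0 };
    int heldMask;

    /* a WRITE conflicts with READ and WRITE; a READ only with WRITE */
    conflictTab[READ]  = (1 << WRITE);
    conflictTab[WRITE] = (1 << READ) | (1 << WRITE);

    /* suppose some other transaction already holds a READ lock */
    heldMask = (1 << READ);

    /* the requester's row of the table is tested against the held mask */
    printf("READ request:  %s\n",
           (conflictTab[READ] & heldMask) ? "must wait" : "granted");
    printf("WRITE request: %s\n",
           (conflictTab[WRITE] & heldMask) ? "must wait" : "granted");
    return 0;
}
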
+ */ +#define NO_LOCK 0 +#define INTENT 2 +#define READ_INTENT (READ_LOCK+INTENT) +#define WRITE_INTENT (WRITE_LOCK+INTENT) + +#define EXTEND_LOCK 5 + +#define SHORT_TERM 1 +#define LONG_TERM 2 +#define UNLOCK 0 + +#define N_LEVELS 3 +#define RELN_LEVEL 0 +#define PAGE_LEVEL 1 +#define TUPLE_LEVEL 2 +typedef int LOCK_LEVEL; + +/* multi.c */ + +extern LockTableId MultiTableId; +extern LockTableId ShortTermTableId; + +/* + * function prototypes + */ +extern LockTableId InitMultiLevelLockm(void); +extern bool MultiLockReln(LockInfo linfo, LOCKT lockt); +extern bool MultiLockTuple(LockInfo linfo, ItemPointer tidPtr, LOCKT lockt); +extern bool MultiLockPage(LockInfo linfo, ItemPointer tidPtr, LOCKT lockt); +extern bool MultiAcquire(LockTableId tableId, LOCKTAG *tag, LOCKT lockt, + LOCK_LEVEL level); +extern bool MultiReleasePage(LockInfo linfo, ItemPointer tidPtr, LOCKT lockt); +extern bool MultiReleaseReln(LockInfo linfo, LOCKT lockt); +extern bool MultiRelease(LockTableId tableId, LOCKTAG *tag, LOCKT lockt, + LOCK_LEVEL level); + +#endif /* MULTILEV_H */ diff --git a/src/backend/storage/off.h b/src/backend/storage/off.h new file mode 100644 index 00000000000..e5f5cbf5482 --- /dev/null +++ b/src/backend/storage/off.h @@ -0,0 +1,60 @@ +/*------------------------------------------------------------------------- + * + * off.h-- + * POSTGRES disk "offset" definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: off.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef OFF_H +#define OFF_H + +#include "c.h" +#include "machine.h" /* for BLCKSZ */ +#include "storage/itemid.h" + +/* + * OffsetNumber: + * + * this is a 1-based index into the linp (ItemIdData) array in the + * header of each disk page. + */ +typedef uint16 OffsetNumber; + +#define InvalidOffsetNumber ((OffsetNumber) 0) +#define FirstOffsetNumber ((OffsetNumber) 1) +#define MaxOffsetNumber ((OffsetNumber) (BLCKSZ / sizeof(ItemIdData))) +#define OffsetNumberMask (0xffff) /* valid uint16 bits */ + +/* ---------------- + * support macros + * ---------------- + */ + +/* + * OffsetNumberIsValid -- + * True iff the offset number is valid. + */ +#define OffsetNumberIsValid(offsetNumber) \ + ((bool) ((offsetNumber != InvalidOffsetNumber) && \ + (offsetNumber <= MaxOffsetNumber))) + +/* + * OffsetNumberNext -- + * OffsetNumberPrev -- + * Increments/decrements the argument. These macros look pointless + * but they help us disambiguate the different manipulations on + * OffsetNumbers (e.g., sometimes we substract one from an + * OffsetNumber to move back, and sometimes we do so to form a + * real C array index). + */ +#define OffsetNumberNext(offsetNumber) \ + ((OffsetNumber) (1 + (offsetNumber))) +#define OffsetNumberPrev(offsetNumber) \ + ((OffsetNumber) (-1 + (offsetNumber))) + +#endif /* OFF_H */ diff --git a/src/backend/storage/page.h b/src/backend/storage/page.h new file mode 100644 index 00000000000..a012ea522c0 --- /dev/null +++ b/src/backend/storage/page.h @@ -0,0 +1,26 @@ +/*------------------------------------------------------------------------- + * + * page.h-- + * POSTGRES buffer page abstraction definitions. 
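
off.h above stresses that an OffsetNumber is a 1-based index into the linp array, so subtracting one sometimes forms a C array index rather than a step backwards. A small stand-alone illustration, with the line pointer array faked by plain strings:

#include <stdio.h>

typedef unsigned short OffsetNumber;           /* 1-based, as in off.h */
#define FirstOffsetNumber   ((OffsetNumber) 1)
#define OffsetNumberNext(o) ((OffsetNumber) (1 + (o)))

int
main()
{
    /* a fake line pointer array standing in for pd_linp[] */
    const char *linp[] = { "tuple A", "tuple B", "tuple C" };
    OffsetNumber off;

    for (off = FirstOffsetNumber; off <= 3; off = OffsetNumberNext(off)) {
        /* here "off - 1" forms a 0-based C array index, not a step back */
        printf("offset %d -> linp[%d] = %s\n",
               (int) off, (int) (off - 1), linp[off - 1]);
    }
    return 0;
}
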
+ * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: page.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef PAGE_H +#define PAGE_H + +#include "c.h" + +typedef Pointer Page; + +/* + * PageIsValid -- + * True iff page is valid. + */ +#define PageIsValid(page) PointerIsValid(page) + +#endif /* PAGE_H */ diff --git a/src/backend/storage/page/Makefile.inc b/src/backend/storage/page/Makefile.inc new file mode 100644 index 00000000000..2a7d8408512 --- /dev/null +++ b/src/backend/storage/page/Makefile.inc @@ -0,0 +1,16 @@ +#------------------------------------------------------------------------- +# +# Makefile.inc-- +# Makefile for storage/page +# +# Copyright (c) 1994, Regents of the University of California +# +# +# IDENTIFICATION +# $Header: /cvsroot/pgsql/src/backend/storage/page/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:58 scrappy Exp $ +# +#------------------------------------------------------------------------- + +SUBSRCS+= bufpage.c itemptr.c + + diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c new file mode 100644 index 00000000000..14b5ead85bc --- /dev/null +++ b/src/backend/storage/page/bufpage.c @@ -0,0 +1,519 @@ +/*------------------------------------------------------------------------- + * + * bufpage.c-- + * POSTGRES standard buffer page code. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/page/bufpage.c,v 1.1.1.1 1996/07/09 06:21:58 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include <sys/types.h> +#include <sys/file.h> + +#include "c.h" + +#include "storage/item.h" +#include "storage/buf.h" +#include "storage/bufmgr.h" +#include "utils/elog.h" +#include "utils/palloc.h" +#include "utils/memutils.h" +#include "storage/bufpage.h" + +#include "lib/qsort.h" + +static bool PageManagerShuffle = true; /* default is shuffle mode */ + +/* ---------------------------------------------------------------- + * Buffer support functions + * ---------------------------------------------------------------- + */ +/* + * BufferGetPageSize -- + * Returns the page size within a buffer. + * + * Notes: + * Assumes buffer is valid. + * + * The buffer can be a raw disk block and need not contain a valid + * (formatted) disk page. + */ +Size +BufferGetPageSize(Buffer buffer) +{ + Size pageSize; + + Assert(BufferIsValid(buffer)); + pageSize = BLCKSZ; /* XXX dig out of buffer descriptor */ + + Assert(PageSizeIsValid(pageSize)); + return (pageSize); +} + +/* + * BufferGetPage -- + * Returns the page associated with a buffer. + */ +Page +BufferGetPage(Buffer buffer) +{ + return (Page) BufferGetBlock(buffer); +} + + +/* ---------------------------------------------------------------- + * Page support functions + * ---------------------------------------------------------------- + */ + +/* + * PageInit -- + * Initializes the contents of a page. 
+ */ +void +PageInit(Page page, Size pageSize, Size specialSize) +{ + PageHeader p = (PageHeader) page; + + Assert(pageSize == BLCKSZ); + Assert(pageSize > + specialSize + sizeof(PageHeaderData) - sizeof(ItemIdData)); + + specialSize = DOUBLEALIGN(specialSize); + + p->pd_lower = sizeof(PageHeaderData) - sizeof(ItemIdData); + p->pd_upper = pageSize - specialSize; + p->pd_special = pageSize - specialSize; + PageSetPageSize(page, pageSize); +} + +/* + * PageGetItem -- + * Retrieves an item on the given page. + * + * Note: + * This does change the status of any of the resources passed. + * The semantics may change in the future. + */ +Item +PageGetItem(Page page, ItemId itemId) +{ + Item item; + + Assert(PageIsValid(page)); + Assert((*itemId).lp_flags & LP_USED); + + item = (Item)(((char *)page) + (*itemId).lp_off); + + return (item); +} + +/* + * PageAddItem -- + * Adds item to the given page. + * + * Note: + * This does not assume that the item resides on a single page. + * It is the responsiblity of the caller to act appropriately + * depending on this fact. The "pskip" routines provide a + * friendlier interface, in this case. + * + * This does change the status of any of the resources passed. + * The semantics may change in the future. + * + * This routine should probably be combined with others? + */ +/* ---------------- + * PageAddItem + * + * add an item to a page. + * + * Notes on interface: + * If offsetNumber is valid, shuffle ItemId's down to make room + * to use it, if PageManagerShuffle is true. If PageManagerShuffle is + * false, then overwrite the specified ItemId. (PageManagerShuffle is + * true by default, and is modified by calling PageManagerModeSet.) + * If offsetNumber is not valid, then assign one by finding the first + * one that is both unused and deallocated. + * + * NOTE: If offsetNumber is valid, and PageManagerShuffle is true, it + * is assumed that there is room on the page to shuffle the ItemId's + * down by one. + * ---------------- + */ +OffsetNumber +PageAddItem(Page page, + Item item, + Size size, + OffsetNumber offsetNumber, + ItemIdFlags flags) +{ + register i; + Size alignedSize; + Offset lower; + Offset upper; + ItemId itemId; + ItemId fromitemId, toitemId; + OffsetNumber limit; + + bool shuffled = false; + + /* + * Find first unallocated offsetNumber + */ + limit = OffsetNumberNext(PageGetMaxOffsetNumber(page)); + + /* was offsetNumber passed in? */ + if (OffsetNumberIsValid(offsetNumber)) { + if (PageManagerShuffle == true) { + /* shuffle ItemId's (Do the PageManager Shuffle...) 
*/ + for (i = (limit - 1); i >= offsetNumber; i--) { + fromitemId = &((PageHeader)page)->pd_linp[i - 1]; + toitemId = &((PageHeader)page)->pd_linp[i]; + *toitemId = *fromitemId; + } + shuffled = true; /* need to increase "lower" */ + } else { /* overwrite mode */ + itemId = &((PageHeader)page)->pd_linp[offsetNumber - 1]; + if (((*itemId).lp_flags & LP_USED) || + ((*itemId).lp_len != 0)) { + elog(WARN, "PageAddItem: tried overwrite of used ItemId"); + return (InvalidOffsetNumber); + } + } + } else { /* offsetNumber was not passed in, so find one */ + /* look for "recyclable" (unused & deallocated) ItemId */ + for (offsetNumber = 1; offsetNumber < limit; offsetNumber++) { + itemId = &((PageHeader)page)->pd_linp[offsetNumber - 1]; + if ((((*itemId).lp_flags & LP_USED) == 0) && + ((*itemId).lp_len == 0)) + break; + } + } + if (offsetNumber > limit) + lower = (Offset) (((char *) (&((PageHeader)page)->pd_linp[offsetNumber])) - ((char *) page)); + else if (offsetNumber == limit || shuffled == true) + lower = ((PageHeader)page)->pd_lower + sizeof (ItemIdData); + else + lower = ((PageHeader)page)->pd_lower; + + alignedSize = DOUBLEALIGN(size); + + upper = ((PageHeader)page)->pd_upper - alignedSize; + + if (lower > upper) { + return (InvalidOffsetNumber); + } + + itemId = &((PageHeader)page)->pd_linp[offsetNumber - 1]; + (*itemId).lp_off = upper; + (*itemId).lp_len = size; + (*itemId).lp_flags = flags; + memmove((char *)page + upper, item, size); + ((PageHeader)page)->pd_lower = lower; + ((PageHeader)page)->pd_upper = upper; + + return (offsetNumber); +} + +/* + * PageGetTempPage -- + * Get a temporary page in local memory for special processing + */ +Page +PageGetTempPage(Page page, Size specialSize) +{ + Size pageSize; + Size size; + Page temp; + PageHeader thdr; + + pageSize = PageGetPageSize(page); + + if ((temp = (Page) palloc(pageSize)) == (Page) NULL) + elog(FATAL, "Cannot allocate %d bytes for temp page.", pageSize); + thdr = (PageHeader) temp; + + /* copy old page in */ + memmove(temp, page, pageSize); + + /* clear out the middle */ + size = (pageSize - sizeof(PageHeaderData)) + sizeof(ItemIdData); + size -= DOUBLEALIGN(specialSize); + memset((char *) &(thdr->pd_linp[0]), 0, size); + + /* set high, low water marks */ + thdr->pd_lower = sizeof (PageHeaderData) - sizeof (ItemIdData); + thdr->pd_upper = pageSize - DOUBLEALIGN(specialSize); + + return (temp); +} + +/* + * PageRestoreTempPage -- + * Copy temporary page back to permanent page after special processing + * and release the temporary page. + */ +void +PageRestoreTempPage(Page tempPage, Page oldPage) +{ + Size pageSize; + + pageSize = PageGetPageSize(tempPage); + memmove((char *) oldPage, (char *) tempPage, pageSize); + + pfree(tempPage); +} + +/* + * PageGetMaxOffsetNumber -- + * Returns the maximum offset number used by the given page. + * + * NOTE: The offset is invalid if the page is non-empty. + * Test whether PageIsEmpty before calling this routine + * and/or using its return value. 
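
PageInit and PageAddItem above, together with PageGetFreeSpace further on, manage a page purely through the pd_lower/pd_upper watermarks. A stand-alone sketch of that arithmetic, using placeholder sizes for PageHeaderData and ItemIdData rather than the real platform-dependent values:

#include <stdio.h>

/* Placeholder sizes; the real numbers come from bufpage.h/itemid.h and are
 * platform dependent.  The watermark arithmetic is what matters. */
#define BLCKSZ         8192
#define PAGE_HDR_SZ      16          /* stand-in for sizeof(PageHeaderData) */
#define ITEMID_SZ         4          /* stand-in for sizeof(ItemIdData)     */
#define DOUBLEALIGN(x) (((x) + 7) & ~7)

int
main()
{
    /* after PageInit(page, BLCKSZ, 0): empty linp array, no special space */
    int pd_lower = PAGE_HDR_SZ - ITEMID_SZ;
    int pd_upper = BLCKSZ;

    /* PageAddItem of a 100-byte tuple: one more line pointer raises the
     * lower watermark, the aligned tuple body lowers the upper one */
    pd_lower += ITEMID_SZ;
    pd_upper -= DOUBLEALIGN(100);

    /* PageGetFreeSpace keeps one ItemIdData in reserve for the next item */
    printf("free space after one insertion: %d bytes\n",
           (pd_upper - pd_lower) - ITEMID_SZ);
    return 0;
}
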
+ */ +OffsetNumber +PageGetMaxOffsetNumber(Page page) +{ + LocationIndex low; + OffsetNumber i; + + low = ((PageHeader) page)->pd_lower; + i = (low - (sizeof(PageHeaderData) - sizeof(ItemIdData))) + / sizeof(ItemIdData); + + return(i); +} + +/* ---------------- + * itemid stuff for PageRepairFragmentation + * ---------------- + */ +struct itemIdSortData { + int offsetindex; /* linp array index */ + ItemIdData itemiddata; +}; + +static int +itemidcompare(struct itemIdSortData *itemidp1, struct itemIdSortData *itemidp2) +{ + if (itemidp1->itemiddata.lp_off == itemidp2->itemiddata.lp_off) + return(0); + else if (itemidp1->itemiddata.lp_off < itemidp2->itemiddata.lp_off) + return(1); + else + return(-1); +} + +/* + * PageRepairFragmentation -- + * Frees fragmented space on a page. + */ +void +PageRepairFragmentation(Page page) +{ + int i; + struct itemIdSortData *itemidbase, *itemidptr; + ItemId lp; + int nline, nused; + int itemidcompare(); + Offset upper; + Size alignedSize; + + nline = (int16) PageGetMaxOffsetNumber(page); + nused = 0; + for (i=0; i<nline; i++) { + lp = ((PageHeader)page)->pd_linp + i; + if ((*lp).lp_flags & LP_USED) + nused++; + } + + if (nused == 0) { + for (i=0; i<nline; i++) { + lp = ((PageHeader)page)->pd_linp + i; + if ((*lp).lp_len > 0) /* unused, but allocated */ + (*lp).lp_len = 0; /* indicate unused & deallocated */ + } + + ((PageHeader)page)->pd_upper = ((PageHeader)page)->pd_special; + } else { /* nused != 0 */ + itemidbase = (struct itemIdSortData *) + palloc(sizeof(struct itemIdSortData) * nused); + memset((char *) itemidbase, 0, sizeof(struct itemIdSortData) * nused); + itemidptr = itemidbase; + for (i=0; i<nline; i++) { + lp = ((PageHeader)page)->pd_linp + i; + if ((*lp).lp_flags & LP_USED) { + itemidptr->offsetindex = i; + itemidptr->itemiddata = *lp; + itemidptr++; + } else { + if ((*lp).lp_len > 0) /* unused, but allocated */ + (*lp).lp_len = 0; /* indicate unused & deallocated */ + } + } + + /* sort itemIdSortData array...*/ + pg_qsort((char *) itemidbase, nused, sizeof(struct itemIdSortData), + (void*) itemidcompare); + + /* compactify page */ + ((PageHeader)page)->pd_upper = ((PageHeader)page)->pd_special; + + for (i=0, itemidptr = itemidbase; i<nused; i++, itemidptr++) { + lp = ((PageHeader)page)->pd_linp + itemidptr->offsetindex; + alignedSize = DOUBLEALIGN((*lp).lp_len); + upper = ((PageHeader)page)->pd_upper - alignedSize; + memmove((char *) page + upper, + (char *)page + (*lp).lp_off, + (*lp).lp_len); + (*lp).lp_off = upper; + ((PageHeader)page)->pd_upper = upper; + } + + pfree(itemidbase); + } +} + +/* + * PageGetFreeSpace -- + * Returns the size of the free (allocatable) space on a page. + */ +Size +PageGetFreeSpace(Page page) +{ + Size space; + + + space = ((PageHeader)page)->pd_upper - ((PageHeader)page)->pd_lower; + + if (space < sizeof (ItemIdData)) { + return (0); + } + space -= sizeof (ItemIdData); /* XXX not always true */ + + return (space); +} + +/* + * PageManagerModeSet -- + * + * Sets mode to either: ShufflePageManagerMode (the default) or + * OverwritePageManagerMode. For use by access methods code + * for determining semantics of PageAddItem when the offsetNumber + * argument is passed in. 
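
The itemidcompare comparator above orders line pointers by decreasing lp_off, so PageRepairFragmentation re-packs tuples starting from the pd_special end of the page and working downward. The same ordering, shown stand-alone with plain qsort on bare offsets:

#include <stdio.h>
#include <stdlib.h>

/* Same ordering as itemidcompare(): larger lp_off sorts first, so tuples
 * are re-packed starting from the pd_special end of the page. */
static int
cmp_desc(const void *a, const void *b)
{
    unsigned x = *(const unsigned *) a;
    unsigned y = *(const unsigned *) b;

    return (x == y) ? 0 : ((x < y) ? 1 : -1);
}

int
main()
{
    unsigned lp_off[] = { 7600, 8000, 7200, 7800 };
    int i;

    qsort(lp_off, 4, sizeof(unsigned), cmp_desc);
    for (i = 0; i < 4; i++)
        printf("%u ", lp_off[i]);         /* 8000 7800 7600 7200 */
    printf("\n");
    return 0;
}
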
+ */ +void +PageManagerModeSet(PageManagerMode mode) +{ + if (mode == ShufflePageManagerMode) + PageManagerShuffle = true; + else if (mode == OverwritePageManagerMode) + PageManagerShuffle = false; +} + +/* + *---------------------------------------------------------------- + * PageIndexTupleDelete + *---------------------------------------------------------------- + * + * This routine does the work of removing a tuple from an index page. + */ +void +PageIndexTupleDelete(Page page, OffsetNumber offnum) +{ + PageHeader phdr; + char *addr; + ItemId tup; + Size size; + char *locn; + int nbytes; + int offidx; + + phdr = (PageHeader) page; + + /* change offset number to offset index */ + offidx = offnum - 1; + + tup = PageGetItemId(page, offnum); + size = ItemIdGetLength(tup); + size = DOUBLEALIGN(size); + + /* location of deleted tuple data */ + locn = (char *) (page + ItemIdGetOffset(tup)); + + /* + * First, we want to get rid of the pd_linp entry for the index + * tuple. We copy all subsequent linp's back one slot in the + * array. + */ + + nbytes = phdr->pd_lower - + ((char *)&phdr->pd_linp[offidx + 1] - (char *) phdr); + memmove((char *) &(phdr->pd_linp[offidx]), + (char *) &(phdr->pd_linp[offidx + 1]), + nbytes); + + /* + * Now move everything between the old upper bound (beginning of tuple + * space) and the beginning of the deleted tuple forward, so that + * space in the middle of the page is left free. If we've just deleted + * the tuple at the beginning of tuple space, then there's no need + * to do the copy (and bcopy on some architectures SEGV's if asked + * to move zero bytes). + */ + + /* beginning of tuple space */ + addr = (char *) (page + phdr->pd_upper); + + if (locn != addr) + memmove(addr + size, addr, (int) (locn - addr)); + + /* adjust free space boundary pointers */ + phdr->pd_upper += size; + phdr->pd_lower -= sizeof (ItemIdData); + + /* finally, we need to adjust the linp entries that remain */ + if (!PageIsEmpty(page)) + PageIndexTupleDeleteAdjustLinePointers(phdr, locn, size); +} + +/* + *---------------------------------------------------------------- + * PageIndexTupleDeleteAdjustLinePointers + *---------------------------------------------------------------- + * + * Once the line pointers and tuple data have been shifted around + * on the page, we need to go down the line pointer vector and + * adjust pointers to reflect new locations. Anything that used + * to be before the deleted tuple's data was moved forward by the + * size of the deleted tuple. + * + * This routine does the work of adjusting the line pointers. + * Location is where the tuple data used to lie; size is how + * much space it occupied. We assume that size has been aligned + * as required by the time we get here. + * + * This routine should never be called on an empty page. + */ +void +PageIndexTupleDeleteAdjustLinePointers(PageHeader phdr, + char *location, + Size size) +{ + int i; + + /* location is an index into the page... */ + location -= (int) phdr; + + for (i = PageGetMaxOffsetNumber((Page) phdr) - 1; i >= 0; i--) { + if (phdr->pd_linp[i].lp_off <= (unsigned) location) { + phdr->pd_linp[i].lp_off += size; + } + } +} diff --git a/src/backend/storage/page/itemptr.c b/src/backend/storage/page/itemptr.c new file mode 100644 index 00000000000..9d063374038 --- /dev/null +++ b/src/backend/storage/page/itemptr.c @@ -0,0 +1,40 @@ +/*------------------------------------------------------------------------- + * + * itemptr.c-- + * POSTGRES disk item pointer code. 
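
PageIndexTupleDeleteAdjustLinePointers above shifts every line pointer whose data sat at or below the deleted tuple's old location. A toy, self-contained illustration with two surviving line pointers (the offsets are made up):

#include <stdio.h>

int
main()
{
    /* two line pointers that survive the delete; the removed tuple sat at
     * offset 7800 and occupied 200 (aligned) bytes */
    unsigned lp_off[]   = { 8000, 7600 };
    unsigned deleted_at = 7800;
    unsigned deleted_sz = 200;
    int i;

    for (i = 0; i < 2; i++) {
        /* same test as the source: data at or below the hole moves up */
        if (lp_off[i] <= deleted_at)
            lp_off[i] += deleted_sz;
    }

    for (i = 0; i < 2; i++)
        printf("linp[%d].lp_off = %u\n", i, lp_off[i]);   /* 8000, 7800 */
    return 0;
}
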
+ * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/page/itemptr.c,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include "c.h" + +#include "storage/block.h" +#include "storage/off.h" +#include "storage/itemptr.h" +#include "storage/bufpage.h" + +/* + * ItemPointerEquals -- + * Returns true if both item pointers point to the same item, + * otherwise returns false. + * + * Note: + * Assumes that the disk item pointers are not NULL. + */ +bool +ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2) +{ + if (ItemPointerGetBlockNumber(pointer1) == + ItemPointerGetBlockNumber(pointer2) && + ItemPointerGetOffsetNumber(pointer1) == + ItemPointerGetOffsetNumber(pointer2)) + return(true); + else + return(false); +} + diff --git a/src/backend/storage/pagenum.h b/src/backend/storage/pagenum.h new file mode 100644 index 00000000000..f32624c226d --- /dev/null +++ b/src/backend/storage/pagenum.h @@ -0,0 +1,33 @@ +/*------------------------------------------------------------------------- + * + * pagenum.h-- + * POSTGRES page number definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: pagenum.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef PAGENUM_H +#define PAGENUM_H + +#include "c.h" +#include "storage/page.h" + +typedef uint16 PageNumber; + +typedef uint32 LogicalPageNumber; + +#define InvalidLogicalPageNumber 0 + +/* + * LogicalPageNumberIsValid -- + * True iff the logical page number is valid. + */ +#define LogicalPageNumberIsValid(pageNumber) \ + ((bool)((pageNumber) != InvalidLogicalPageNumber)) + + +#endif /* PAGENUM_H */ diff --git a/src/backend/storage/pos.h b/src/backend/storage/pos.h new file mode 100644 index 00000000000..9a7f603416b --- /dev/null +++ b/src/backend/storage/pos.h @@ -0,0 +1,64 @@ +/*------------------------------------------------------------------------- + * + * pos.h-- + * POSTGRES "position" definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: pos.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef POS_H +#define POS_H + +#include "c.h" + +/* + * a 'position' used to be <pagenumber, offset> in postgres. this has + * been changed to just <offset> as the notion of having multiple pages + * within a block has been removed. + * + * the 'offset' abstraction is somewhat confusing. it is NOT a byte + * offset within the page; instead, it is an offset into the line + * pointer array contained on every page that store (heap or index) + * tuples. + */ +typedef bits16 PositionIdData; +typedef PositionIdData *PositionId; + +/* ---------------- + * support macros + * ---------------- + */ + +/* + * PositionIdIsValid -- + * True iff the position identifier is valid. + */ +#define PositionIdIsValid(positionId) \ + PointerIsValid(positionId) + +/* + * PositionIdSetInvalid -- + * Make an invalid position. + */ +#define PositionIdSetInvalid(positionId) \ + *(positionId) = (bits16) 0 + +/* + * PositionIdSet -- + * Sets a position identifier to the specified value. 
+ */ +#define PositionIdSet(positionId, offsetNumber) \ + *(positionId) = (offsetNumber) + +/* + * PositionIdGetOffsetNumber -- + * Retrieve the offset number from a position identifier. + */ +#define PositionIdGetOffsetNumber(positionId) \ + ((OffsetNumber) *(positionId)) + +#endif /* POS_H */ diff --git a/src/backend/storage/proc.h b/src/backend/storage/proc.h new file mode 100644 index 00000000000..1ec89dedc2d --- /dev/null +++ b/src/backend/storage/proc.h @@ -0,0 +1,127 @@ +/*------------------------------------------------------------------------- + * + * proc.h-- + * + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: proc.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef _PROC_H_ +#define _PROC_H_ + +#include "storage/ipc.h" +#include "storage/lock.h" +#ifndef WIN32 +#include <sys/sem.h> +#else +/* This is because WIN32 already defines PROC */ +#define PROC PGL_PROC +#endif /* WIN32 */ +#include "storage/shmem.h" + + +typedef struct { + int sleeplock; + int semNum; + IpcSemaphoreId semId; + IpcSemaphoreKey semKey; +} SEMA; + +/* + * Each backend has: + */ +typedef struct proc { + + /* proc->links MUST BE THE FIRST ELEMENT OF STRUCT (see ProcWakeup()) */ + + SHM_QUEUE links; /* proc can be waiting for one event(lock) */ + SEMA sem; /* ONE semaphore to sleep on */ + int errType; /* error code tells why we woke up */ + + int procId; /* unique number for this structure + * NOT unique per backend, these things + * are reused after the backend dies. + */ + + int critSects; /* If critSects > 0, we are in sensitive + * routines that cannot be recovered when + * the process fails. + */ + + int prio; /* priority for sleep queue */ + + TransactionId xid; /* transaction currently being executed + * by this proc + */ + + LOCK * waitLock; /* Lock we're sleeping on */ + int token; /* info for proc wakeup routines */ + int pid; /* This procs process id */ + short sLocks[MAX_SPINS]; /* Spin lock stats */ + SHM_QUEUE lockQueue; /* locks associated with current transaction */ +} PROC; + + +/* + * MAX_PROC_SEMS is the maximum number of per-process semaphores (those used + * by the lock mgr) we can keep track of. PROC_NSEMS_PER_SET is the number + * of semaphores in each (sys-V) semaphore set allocated. (Be careful not + * to set it to greater 32. Otherwise, the bitmap will overflow.) 
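
The comment above ties MAX_PROC_SEMS and PROC_NSEMS_PER_SET to the freeSemMap bitmap declared in PROC_HDR just below (and walked by ProcFreeAllSemaphores earlier in this diff). A quick stand-alone check of that arithmetic:

#include <stdio.h>

#define MAX_PROC_SEMS      128       /* as defined just below in proc.h */
#define PROC_NSEMS_PER_SET  16

int
main()
{
    /* one int32 word of freeSemMap per semaphore set; each set uses
     * PROC_NSEMS_PER_SET bits of its word, which is why the comment warns
     * against values greater than 32 */
    int nwords = MAX_PROC_SEMS / PROC_NSEMS_PER_SET;
    int inuse  = (1 << PROC_NSEMS_PER_SET) - 1;   /* all sems of a set taken */

    printf("freeSemMap words: %d, bits per word in use: %d (mask 0x%x)\n",
           nwords, PROC_NSEMS_PER_SET, inuse);
    return 0;
}
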
+ */ +#define MAX_PROC_SEMS 128 +#define PROC_NSEMS_PER_SET 16 + +typedef struct procglobal { + SHMEM_OFFSET freeProcs; + int numProcs; + IPCKey currKey; + int32 freeSemMap[MAX_PROC_SEMS/PROC_NSEMS_PER_SET]; +} PROC_HDR; + +extern PROC *MyProc; + +#define PROC_INCR_SLOCK(lock) if (MyProc) (MyProc->sLocks[(lock)])++ +#define PROC_DECR_SLOCK(lock) if (MyProc) (MyProc->sLocks[(lock)])-- + +/* + * flags explaining why process woke up + */ +#define NO_ERROR 0 +#define ERR_TIMEOUT 1 +#define ERR_BUFFER_IO 2 + +#define MAX_PRIO 50 +#define MIN_PRIO (-1) + +extern SPINLOCK ProcStructLock; + +/* + * Function Prototypes + */ +extern void InitProcess(IPCKey key); +extern void ProcReleaseLocks(void); +extern bool ProcRemove(int pid); +/* extern bool ProcKill(int exitStatus, int pid); */ +/* make static in storage/lmgr/proc.c -- jolly */ + +extern PROC_QUEUE *ProcQueueAlloc(char *name); +extern void ProcQueueInit(PROC_QUEUE *queue); +extern int ProcSleep(PROC_QUEUE *queue, SPINLOCK spinlock, int token, + int prio, LOCK *lock); +extern PROC *ProcWakeup(PROC *proc, int errType); +extern int ProcGetId(void); +extern int ProcLockWakeup(PROC_QUEUE *queue, char * ltable, char * lock); +extern void ProcAddLock(SHM_QUEUE *elem); +#if defined(PORTNAME_linux) +extern int HandleDeadLock(int); +#else +extern int HandleDeadLock(void); +#endif +extern void ProcReleaseSpins(PROC *proc); +extern void ProcFreeAllSemaphores(void); + +#endif /* PROC_H */ diff --git a/src/backend/storage/shmem.h b/src/backend/storage/shmem.h new file mode 100644 index 00000000000..a00b33581a4 --- /dev/null +++ b/src/backend/storage/shmem.h @@ -0,0 +1,104 @@ +/*------------------------------------------------------------------------- + * + * shmem.h-- + * shared memory management structures + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: shmem.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef SHMEM_H +#define SHMEM_H + +#include "storage/spin.h" /* for SPINLOCK */ +#include "utils/hsearch.h" /* for HTAB */ + +/* The shared memory region can start at a different address + * in every process. Shared memory "pointers" are actually + * offsets relative to the start of the shared memory region(s). + */ +typedef unsigned long SHMEM_OFFSET; +#define INVALID_OFFSET (-1) +#define BAD_LOCATION (-1) + +/* start of the lowest shared memory region. 
For now, assume that + * there is only one shared memory region + */ +extern SHMEM_OFFSET ShmemBase; + + +/* coerce an offset into a pointer in this process's address space */ +#define MAKE_PTR(xx_offs)\ + (ShmemBase+((unsigned long)(xx_offs))) + +/* coerce a pointer into a shmem offset */ +#define MAKE_OFFSET(xx_ptr)\ + (SHMEM_OFFSET) (((unsigned long)(xx_ptr))-ShmemBase) + +#define SHM_PTR_VALID(xx_ptr)\ + (((unsigned long)xx_ptr) > ShmemBase) + +/* cannot have an offset to ShmemFreeStart (offset 0) */ +#define SHM_OFFSET_VALID(xx_offs)\ + ((xx_offs != 0) && (xx_offs != INVALID_OFFSET)) + + +extern SPINLOCK ShmemLock; +extern SPINLOCK BindingLock; + +/* shmemqueue.c */ +typedef struct SHM_QUEUE { + SHMEM_OFFSET prev; + SHMEM_OFFSET next; +} SHM_QUEUE; + +/* shmem.c */ +extern void ShmemBindingTabReset(); +extern void ShmemCreate(unsigned int key, unsigned int size); +extern int InitShmem(unsigned int key, unsigned int size); +extern long *ShmemAlloc(unsigned long size); +extern int ShmemIsValid(unsigned long addr); +extern HTAB *ShmemInitHash(char *name, long init_size, long max_size, + HASHCTL *infoP, int hash_flags); +extern bool ShmemPIDLookup(int pid, SHMEM_OFFSET* locationPtr); +extern SHMEM_OFFSET ShmemPIDDestroy(int pid); +extern long *ShmemInitStruct(char *name, unsigned long size, + bool *foundPtr); + + +typedef int TableID; + +/* size constants for the binding table */ + /* max size of data structure string name */ +#define BTABLE_KEYSIZE (50) + /* data in binding table hash bucket */ +#define BTABLE_DATASIZE (sizeof(BindingEnt) - BTABLE_KEYSIZE) + /* maximum size of the binding table */ +#define BTABLE_SIZE (100) + +/* this is a hash bucket in the binding table */ +typedef struct { + char key[BTABLE_KEYSIZE]; /* string name */ + unsigned long location; /* location in shared mem */ + unsigned long size; /* numbytes allocated for the + * structure + */ +} BindingEnt; + +/* + * prototypes for functions in shmqueue.c + */ +extern void SHMQueueInit(SHM_QUEUE *queue); +extern bool SHMQueueIsDetached(SHM_QUEUE *queue); +extern void SHMQueueElemInit(SHM_QUEUE *queue); +extern void SHMQueueDelete(SHM_QUEUE *queue); +extern void SHMQueueInsertHD(SHM_QUEUE *queue, SHM_QUEUE *elem); +extern void SHMQueueInsertTL(SHM_QUEUE *queue, SHM_QUEUE *elem); +extern void SHMQueueFirst(SHM_QUEUE *queue, Pointer *nextPtrPtr, + SHM_QUEUE *nextQueue); +extern bool SHMQueueEmpty(SHM_QUEUE *queue); + +#endif /* SHMEM_H */ diff --git a/src/backend/storage/sinval.h b/src/backend/storage/sinval.h new file mode 100644 index 00000000000..036597dbb7a --- /dev/null +++ b/src/backend/storage/sinval.h @@ -0,0 +1,33 @@ +/*------------------------------------------------------------------------- + * + * sinval.h-- + * POSTGRES shared cache invalidation communication definitions. 
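
shmem.h above stores shared-memory "pointers" as offsets from ShmemBase and converts them with MAKE_PTR/MAKE_OFFSET. A stand-alone sketch of the round trip, with a local buffer standing in for the attached shared segment:

#include <stdio.h>

typedef unsigned long SHMEM_OFFSET;

/* A local buffer stands in for the attached shared-memory region; the
 * macros mirror the shmem.h definitions above. */
static char region[1024];
#define ShmemBase        ((unsigned long) region)
#define MAKE_PTR(offs)   (ShmemBase + ((unsigned long) (offs)))
#define MAKE_OFFSET(ptr) ((SHMEM_OFFSET) (((unsigned long) (ptr)) - ShmemBase))

int
main()
{
    char        *p   = &region[128];    /* some structure in "shared memory" */
    SHMEM_OFFSET off = MAKE_OFFSET(p);  /* what a shared struct would store  */

    /* any process maps the stored offset back to its own address space */
    printf("offset %lu maps back to the same object: %s\n",
           off, ((char *) MAKE_PTR(off) == p) ? "yes" : "no");
    return 0;
}
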
+ * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: sinval.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef SINVAL_H +#define SINVAL_H + +#include "c.h" +#include "storage/spin.h" +#include "storage/ipc.h" +#include "storage/itemptr.h" +#include "storage/backendid.h" + +extern SPINLOCK SInvalLock; + +extern void CreateSharedInvalidationState(IPCKey key); +extern void AttachSharedInvalidationState(IPCKey key); +extern void InitSharedInvalidationState(); +extern void RegisterSharedInvalid(int cacheId, Index hashIndex, + ItemPointer pointer); +extern void InvalidateSharedInvalid(void (*invalFunction)(), + void (*resetFunction)()); + + +#endif /* SINVAL_H */ diff --git a/src/backend/storage/sinvaladt.h b/src/backend/storage/sinvaladt.h new file mode 100644 index 00000000000..06029978980 --- /dev/null +++ b/src/backend/storage/sinvaladt.h @@ -0,0 +1,126 @@ +/*------------------------------------------------------------------------- + * + * sinvaladt.h-- + * POSTGRES shared cache invalidation segment definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: sinvaladt.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef SINVALADT_H +#define SINVALADT_H + +#include "postgres.h" /* XXX */ + +#include "storage/ipc.h" +#include "storage/itemptr.h" +#include "storage/sinval.h" + +/* + * The structure of the shared cache invaidation segment + * + */ +/* +A------------- Header info -------------- + criticalSectionSemaphoreId + generalSemaphoreId + startEntrySection (offset a) + endEntrySection (offset a + b) + startFreeSpace (offset relative to B) + startEntryChain (offset relatiev to B) + endEntryChain (offset relative to B) + numEntries + maxNumEntries + procState[MaxBackendId] --> limit + resetState (bool) +a tag (POSTID) +B------------- Start entry section ------- + SISegEntry --> entryData --> ... (see SharedInvalidData!) + isfree (bool) + next (offset to next entry in chain ) +b .... 
(dynamically growing down) +C----------------End shared segment ------- + +*/ + +/* Parameters (configurable) *******************************************/ +#define MaxBackendId 32 /* maximum number of backends */ +#define MAXNUMMESSAGES 1000 /* maximum number of messages in seg*/ + + +#define InvalidOffset 1000000000 /* a invalid offset (End of chain) */ + +typedef struct ProcState { + int limit; /* the number of read messages */ + bool resetState; /* true, if backend has to reset its state */ + int tag; /* special tag, recieved from the postmaster */ +} ProcState; + + +typedef struct SISeg { + IpcSemaphoreId criticalSectionSemaphoreId; /* semaphore id */ + IpcSemaphoreId generalSemaphoreId; /* semaphore id */ + Offset startEntrySection; /* (offset a) */ + Offset endEntrySection; /* (offset a + b) */ + Offset startFreeSpace; /* (offset relative to B) */ + Offset startEntryChain; /* (offset relative to B) */ + Offset endEntryChain; /* (offset relative to B) */ + int numEntries; + int maxNumEntries; + ProcState procState[MaxBackendId]; /* reflects the invalidation state */ + /* here starts the entry section, controlled by offsets */ +} SISeg; +#define SizeSISeg sizeof(SISeg) + +typedef struct SharedInvalidData { + int cacheId; /* XXX */ + Index hashIndex; + ItemPointerData pointerData; +} SharedInvalidData; + +typedef SharedInvalidData *SharedInvalid; + + +typedef struct SISegEntry { + SharedInvalidData entryData; /* the message data */ + bool isfree; /* entry free? */ + Offset next; /* offset to next entry*/ +} SISegEntry; + +#define SizeOfOneSISegEntry sizeof(SISegEntry) + +typedef struct SISegOffsets { + Offset startSegment; /* always 0 (for now) */ + Offset offsetToFirstEntry; /* A + a = B */ + Offset offsetToEndOfSegemnt; /* A + a + b */ +} SISegOffsets; + + +/****************************************************************************/ +/* synchronization of the shared buffer access */ +/* access to the buffer is synchronized by the lock manager !! */ +/****************************************************************************/ + +#define SI_LockStartValue 255 +#define SI_SharedLock (-1) +#define SI_ExclusiveLock (-255) + +extern SISeg *shmInvalBuffer; + +/* + * prototypes for functions in sinvaladt.c + */ +extern int SIBackendInit(SISeg *segInOutP); +extern int SISegmentInit(bool killExistingSegment, IPCKey key); + +extern bool SISetDataEntry(SISeg *segP, SharedInvalidData *data); +extern void SISetProcStateInvalid(SISeg *segP); +extern bool SIDelDataEntry(SISeg *segP); +extern void SIReadEntryData(SISeg *segP, int backendId, + void (*invalFunction)(), void (*resetFunction)()); +extern void SIDelExpiredDataEntries(SISeg *segP); + +#endif /* SINVALADT_H */ diff --git a/src/backend/storage/smgr.h b/src/backend/storage/smgr.h new file mode 100644 index 00000000000..2e91938290a --- /dev/null +++ b/src/backend/storage/smgr.h @@ -0,0 +1,84 @@ +/*------------------------------------------------------------------------- + * + * smgr.h-- + * storage manager switch public interface declarations. 
+ * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: smgr.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef SMGR_H +#define SMGR_H + +#include "utils/rel.h" +#include "storage/spin.h" /* for SPINLOCK */ + +#define SM_FAIL 0 +#define SM_SUCCESS 1 + +#define DEFAULT_SMGR 0 + +extern int smgrinit(void); +extern void smgrshutdown(int dummy); +extern int smgrcreate(int16 which, Relation reln); +extern int smgrunlink(int16 which, Relation reln); +extern int smgrextend(int16 which, Relation reln, char *buffer); +extern int smgropen(int16 which, Relation reln); +extern int smgrclose(int16 which, Relation reln); +extern int smgrread(int16 which, Relation reln, BlockNumber blocknum, + char *buffer); +extern int smgrwrite(int16 which, Relation reln, BlockNumber blocknum, + char *buffer); +extern int smgrflush(int16 which, Relation reln, BlockNumber blocknum, + char *buffer); +extern int smgrblindwrt(int16 which, char *dbname, char *relname, Oid dbid, + Oid relid, BlockNumber blkno, char *buffer); +extern int smgrnblocks(int16 which, Relation reln); +extern int smgrcommit(void); +extern int smgrabort(void); +extern bool smgriswo(int16 smgrno); + + + +/* internals: move me elsewhere -- ay 7/94 */ + +/* in md.c */ +extern int mdinit(void); +extern int mdcreate(Relation reln); +extern int mdunlink(Relation reln); +extern int mdextend(Relation reln, char *buffer); +extern int mdopen(Relation reln); +extern int mdclose(Relation reln); +extern int mdread(Relation reln, BlockNumber blocknum, char *buffer); +extern int mdwrite(Relation reln, BlockNumber blocknum, char *buffer); +extern int mdflush(Relation reln, BlockNumber blocknum, char *buffer); +extern int mdblindwrt(char *dbstr, char *relstr, Oid dbid, Oid relid, + BlockNumber blkno, char *buffer); +extern int mdnblocks(Relation reln); +extern int mdcommit(void); +extern int mdabort(void); + +/* mm.c */ +extern SPINLOCK MMCacheLock; + +extern int mminit(void); +extern int mmshutdown(void); +extern int mmcreate(Relation reln); +extern int mmunlink(Relation reln); +extern int mmextend(Relation reln, char *buffer); +extern int mmopen(Relation reln); +extern int mmclose(Relation reln); +extern int mmread(Relation reln, BlockNumber blocknum, char *buffer); +extern int mmwrite(Relation reln, BlockNumber blocknum, char *buffer); +extern int mmflush(Relation reln, BlockNumber blocknum, char *buffer); +extern int mmblindwrt(char *dbstr, char *relstr, Oid dbid, Oid relid, + BlockNumber blkno, char *buffer); +extern int mmnblocks(Relation reln); +extern int mmcommit(void); +extern int mmabort(void); +extern int MMShmemSize(void); + +#endif /* SMGR_H */ diff --git a/src/backend/storage/smgr/Makefile.inc b/src/backend/storage/smgr/Makefile.inc new file mode 100644 index 00000000000..8ff067afbe8 --- /dev/null +++ b/src/backend/storage/smgr/Makefile.inc @@ -0,0 +1,14 @@ +#------------------------------------------------------------------------- +# +# Makefile.inc-- +# Makefile for storage/smgr +# +# Copyright (c) 1994, Regents of the University of California +# +# +# IDENTIFICATION +# $Header: /cvsroot/pgsql/src/backend/storage/smgr/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $ +# +#------------------------------------------------------------------------- + +SUBSRCS+= md.c mm.c smgr.c smgrtype.c diff --git a/src/backend/storage/smgr/README b/src/backend/storage/smgr/README new file mode 100644 index 00000000000..4dbb2dce708 --- 
/dev/null +++ b/src/backend/storage/smgr/README @@ -0,0 +1,40 @@ +# $Header: /cvsroot/pgsql/src/backend/storage/smgr/README,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $ + +This directory contains the code that supports the Postgres storage manager +switch and all of the installed storage managers. In released systems, +the only supported storage manager is the magnetic disk manager. At UC +Berkeley, the Sony WORM optical disk jukebox and persistent main memory are +also supported. + +As of Postgres Release 3.0, every relation in the system is tagged with the +storage manager on which it resides. The storage manager switch code turns +what used to by filesystem operations into operations on the correct store, +for any given relation. + +The files in this directory, and their contents, are + + smgrtype.c Storage manager type -- maps string names to storage manager + IDs and provides simple comparison operators. This is the + regproc support for type 'smgr' in the system catalogs. + + smgr.c The storage manager switch dispatch code. The routines in + this file call the appropriate storage manager to do hardware + accesses requested by the backend. + + md.c The magnetic disk storage manager. + + mm.c The persistent main memory storage manager (#undef'ed in + tmp/c.h for all distributed systems). + + sj.c The sony jukebox storage manager and cache management code + (#undef'ed in tmp/c.h for all distributed systems). The + routines in this file allocate extents, maintain block + maps, and guarantee the persistence and coherency of a cache + of jukebox blocks on magnetic disk. + + pgjb.c The postgres jukebox interface routines. The routines here + handle exclusion on the physical device and translate requests + from the storage manager code (sj.c) into jbaccess calls. + + jbaccess.c Access code for the physical Sony jukebox device. This code + was swiped from Andy McFadden's jblib.a code at UC Berkeley. diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c new file mode 100644 index 00000000000..31aa1336a86 --- /dev/null +++ b/src/backend/storage/smgr/md.c @@ -0,0 +1,697 @@ +/*------------------------------------------------------------------------- + * + * md.c-- + * This code manages relations that reside on magnetic disk. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include <stdio.h> /* for sprintf() */ +#include <sys/file.h> + +#include "postgres.h" +#include "miscadmin.h" /* for DataDir */ + +#include "machine.h" +#include "storage/smgr.h" /* where the declarations go */ +#include "storage/block.h" +#include "storage/fd.h" +#include "utils/mcxt.h" +#include "utils/rel.h" +#include "utils/elog.h" +#include "utils/palloc.h" +#include "catalog/catalog.h" + +#undef DIAGNOSTIC + +/* + * The magnetic disk storage manager keeps track of open file descriptors + * in its own descriptor pool. This happens for two reasons. First, at + * transaction boundaries, we walk the list of descriptors and flush + * anything that we've dirtied in the current transaction. Second, we + * have to support relations of > 4GBytes. In order to do this, we break + * relations up into chunks of < 2GBytes and store one chunk in each of + * several files that represent the relation. 
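
md.c above splits a relation into segment files of RELSEG_SIZE blocks each; mdread/mdwrite later compute a seek position within the segment the same way. A stand-alone check of that arithmetic (the block number is arbitrary):

#include <stdio.h>

#define BLCKSZ      8192
#define RELSEG_SIZE 262144            /* blocks per segment file, as in md.c */

int
main()
{
    long blkno = 300000;              /* an arbitrary block of a big relation */

    /* which segment file ("relname", "relname.1", ...) holds the block,
     * and where mdread/mdwrite must seek inside that file */
    long      segno   = blkno / RELSEG_SIZE;
    long long seekpos = (long long) BLCKSZ * (blkno % RELSEG_SIZE);

    printf("segment size: %lld bytes\n", (long long) BLCKSZ * RELSEG_SIZE);
    printf("block %ld -> segment %ld, byte offset %lld\n",
           blkno, segno, seekpos);
    return 0;
}
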
+ */ + +typedef struct _MdfdVec { + int mdfd_vfd; /* fd number in vfd pool */ + uint16 mdfd_flags; /* clean, dirty */ + int mdfd_lstbcnt; /* most recent block count */ + struct _MdfdVec *mdfd_chain; /* for large relations */ +} MdfdVec; + +static int Nfds = 100; +static MdfdVec *Md_fdvec = (MdfdVec *) NULL; +static int CurFd = 0; +static MemoryContext MdCxt; + +#define MDFD_DIRTY (uint16) 0x01 + +#define RELSEG_SIZE 262144 /* (2 ** 31) / 8192 -- 2GB file */ + +/* routines declared here */ +static MdfdVec *_mdfd_openseg(Relation reln, int segno, int oflags); +static MdfdVec *_mdfd_getseg(Relation reln, int blkno, int oflag); +static int _fdvec_ext(void); +static BlockNumber _mdnblocks(File file, Size blcksz); + +/* + * mdinit() -- Initialize private state for magnetic disk storage manager. + * + * We keep a private table of all file descriptors. Whenever we do + * a write to one, we mark it dirty in our table. Whenever we force + * changes to disk, we mark the file descriptor clean. At transaction + * commit, we force changes to disk for all dirty file descriptors. + * This routine allocates and initializes the table. + * + * Returns SM_SUCCESS or SM_FAIL with errno set as appropriate. + */ +int +mdinit() +{ + MemoryContext oldcxt; + + MdCxt = (MemoryContext) CreateGlobalMemory("MdSmgr"); + if (MdCxt == (MemoryContext) NULL) + return (SM_FAIL); + + oldcxt = MemoryContextSwitchTo(MdCxt); + Md_fdvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec)); + (void) MemoryContextSwitchTo(oldcxt); + + if (Md_fdvec == (MdfdVec *) NULL) + return (SM_FAIL); + + memset(Md_fdvec, 0, Nfds * sizeof(MdfdVec)); + + return (SM_SUCCESS); +} + +int +mdcreate(Relation reln) +{ + int fd, vfd; + int tmp; + char *path; + extern bool IsBootstrapProcessingMode(); + + path = relpath(&(reln->rd_rel->relname.data[0])); + fd = FileNameOpenFile(path, O_RDWR|O_CREAT|O_EXCL, 0600); + + /* + * If the file already exists and is empty, we pretend that the + * create succeeded. During bootstrap processing, we skip that check, + * because pg_time, pg_variable, and pg_log get created before their + * .bki file entries are processed. + */ + + if (fd < 0) { + if ((fd = FileNameOpenFile(path, O_RDWR, 0600)) >= 0) { + if (!IsBootstrapProcessingMode() && + FileRead(fd, (char *) &tmp, sizeof(tmp)) != 0) { + FileClose(fd); + return (-1); + } + } + } + + if (CurFd >= Nfds) { + if (_fdvec_ext() == SM_FAIL) + return (-1); + } + + Md_fdvec[CurFd].mdfd_vfd = fd; + Md_fdvec[CurFd].mdfd_flags = (uint16) 0; + Md_fdvec[CurFd].mdfd_chain = (MdfdVec *) NULL; + Md_fdvec[CurFd].mdfd_lstbcnt = 0; + + vfd = CurFd++; + + return (vfd); +} + +/* + * mdunlink() -- Unlink a relation. + */ +int +mdunlink(Relation reln) +{ + int fd; + int i; + MdfdVec *v, *ov; + MemoryContext oldcxt; + char fname[20]; /* XXX should have NAMESIZE defined */ + char tname[20]; + + /* On Windows NT you can't unlink a file if it is open so we have + ** to do this. 
+ */ +#ifdef WIN32 + (void) mdclose(reln); +#endif /* WIN32 */ + + + memset(fname,0,20); + strncpy(fname, RelationGetRelationName(reln)->data, 16); + + if (FileNameUnlink(fname) < 0) + return (SM_FAIL); + + /* unlink all the overflow files for large relations */ + for (i = 1; ; i++) { +#ifdef WIN32 + (void) mdclose(reln); +#endif /* WIN32 */ + sprintf(tname, "%s.%d", fname, i); + if (FileNameUnlink(tname) < 0) + break; + } + + /* finally, clean out the mdfd vector */ + fd = RelationGetFile(reln); + Md_fdvec[fd].mdfd_flags = (uint16) 0; + + oldcxt = MemoryContextSwitchTo(MdCxt); + for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL; ) { + ov = v; + v = v->mdfd_chain; + if (ov != &Md_fdvec[fd]) + pfree(ov); + } + Md_fdvec[fd].mdfd_chain = (MdfdVec *) NULL; + (void) MemoryContextSwitchTo(oldcxt); + + return (SM_SUCCESS); +} + +/* + * mdextend() -- Add a block to the specified relation. + * + * This routine returns SM_FAIL or SM_SUCCESS, with errno set as + * appropriate. + */ +int +mdextend(Relation reln, char *buffer) +{ + long pos; + int nblocks; + MdfdVec *v; + + nblocks = mdnblocks(reln); + v = _mdfd_getseg(reln, nblocks, O_CREAT); + + if ((pos = FileSeek(v->mdfd_vfd, 0L, SEEK_END)) < 0) + return (SM_FAIL); + + if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ) + return (SM_FAIL); + + /* remember that we did a write, so we can sync at xact commit */ + v->mdfd_flags |= MDFD_DIRTY; + + /* try to keep the last block count current, though it's just a hint */ + if ((v->mdfd_lstbcnt = (++nblocks % RELSEG_SIZE)) == 0) + v->mdfd_lstbcnt = RELSEG_SIZE; + +#ifdef DIAGNOSTIC + if (_mdnblocks(v->mdfd_vfd, BLCKSZ) > RELSEG_SIZE + || v->mdfd_lstbcnt > RELSEG_SIZE) + elog(FATAL, "segment too big!"); +#endif + + return (SM_SUCCESS); +} + +/* + * mdopen() -- Open the specified relation. + */ +int +mdopen(Relation reln) +{ + char *path; + int fd; + int vfd; + + if (CurFd >= Nfds) { + if (_fdvec_ext() == SM_FAIL) + return (-1); + } + + path = relpath(&(reln->rd_rel->relname.data[0])); + + fd = FileNameOpenFile(path, O_RDWR, 0600); + + /* this should only happen during bootstrap processing */ + if (fd < 0) + fd = FileNameOpenFile(path, O_RDWR|O_CREAT|O_EXCL, 0600); + + Md_fdvec[CurFd].mdfd_vfd = fd; + Md_fdvec[CurFd].mdfd_flags = (uint16) 0; + Md_fdvec[CurFd].mdfd_chain = (MdfdVec *) NULL; + Md_fdvec[CurFd].mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ); + +#ifdef DIAGNOSTIC + if (Md_fdvec[CurFd].mdfd_lstbcnt > RELSEG_SIZE) + elog(FATAL, "segment too big on relopen!"); +#endif + + vfd = CurFd++; + + return (vfd); +} + +/* + * mdclose() -- Close the specified relation. + * + * Returns SM_SUCCESS or SM_FAIL with errno set as appropriate. + */ +int +mdclose(Relation reln) +{ + int fd; + MdfdVec *v; + + fd = RelationGetFile(reln); + + for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL; v = v->mdfd_chain) { + + /* may be closed already */ + if (v->mdfd_vfd < 0) + continue; + + /* + * We sync the file descriptor so that we don't need to reopen it at + * transaction commit to force changes to disk. + */ + + FileSync(v->mdfd_vfd); + FileClose(v->mdfd_vfd); + + /* mark this file descriptor as clean in our private table */ + v->mdfd_flags &= ~MDFD_DIRTY; + } + + return (SM_SUCCESS); +} + +/* + * mdread() -- Read the specified block from a relation. + * + * Returns SM_SUCCESS or SM_FAIL. 
+ */ +int +mdread(Relation reln, BlockNumber blocknum, char *buffer) +{ + int status; + long seekpos; + int nbytes; + MdfdVec *v; + + v = _mdfd_getseg(reln, blocknum, 0); + + seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE)); + +#ifdef DIAGNOSTIC + if (seekpos >= BLCKSZ * RELSEG_SIZE) + elog(FATAL, "seekpos too big!"); +#endif + + if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) { + return (SM_FAIL); + } + + status = SM_SUCCESS; + if ((nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ) { + if (nbytes == 0) { + memset(buffer, 0, BLCKSZ); + } else { + status = SM_FAIL; + } + } + + return (status); +} + +/* + * mdwrite() -- Write the supplied block at the appropriate location. + * + * Returns SM_SUCCESS or SM_FAIL. + */ +int +mdwrite(Relation reln, BlockNumber blocknum, char *buffer) +{ + int status; + long seekpos; + MdfdVec *v; + + v = _mdfd_getseg(reln, blocknum, 0); + + seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE)); +#ifdef DIAGNOSTIC + if (seekpos >= BLCKSZ * RELSEG_SIZE) + elog(FATAL, "seekpos too big!"); +#endif + + if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) { + return (SM_FAIL); + } + + status = SM_SUCCESS; + if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ) + status = SM_FAIL; + + v->mdfd_flags |= MDFD_DIRTY; + + return (status); +} + +/* + * mdflush() -- Synchronously write a block to disk. + * + * This is exactly like mdwrite(), but doesn't return until the file + * system buffer cache has been flushed. + */ +int +mdflush(Relation reln, BlockNumber blocknum, char *buffer) +{ + int status; + long seekpos; + MdfdVec *v; + + v = _mdfd_getseg(reln, blocknum, 0); + + seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE)); +#ifdef DIAGNOSTIC + if (seekpos >= BLCKSZ * RELSEG_SIZE) + elog(FATAL, "seekpos too big!"); +#endif + + if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) { + return (SM_FAIL); + } + + /* write and sync the block */ + status = SM_SUCCESS; + if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ + || FileSync(v->mdfd_vfd) < 0) + status = SM_FAIL; + + /* + * By here, the block is written and changes have been forced to stable + * storage. Mark the descriptor as clean until the next write, so we + * don't sync it again unnecessarily at transaction commit. + */ + + v->mdfd_flags &= ~MDFD_DIRTY; + + return (status); +} + +/* + * mdblindwrt() -- Write a block to disk blind. + * + * We have to be able to do this using only the name and OID of + * the database and relation in which the block belongs. This + * is a synchronous write. 
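
mdblindwrt (just below) builds the file path from only the database and relation names, with an optional ".segno" suffix for overflow segments. A stand-alone sketch of those two path formats; the DataDir value and the 16-byte name limit are illustrative assumptions:

#include <stdio.h>

#define NAMEDATALEN 16                /* assumed historical name length */

int
main()
{
    const char *DataDir = "/usr/local/pgsql/data";   /* illustrative only */
    char path[256];

    /* a shared relation (dbid == 0), first segment */
    sprintf(path, "%s/%.*s", DataDir, NAMEDATALEN, "pg_log");
    printf("%s\n", path);

    /* an ordinary relation, third overflow segment */
    sprintf(path, "%s/base/%.*s/%.*s.%d", DataDir, NAMEDATALEN, "mydb",
            NAMEDATALEN, "bigtable", 3);
    printf("%s\n", path);
    return 0;
}
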
+ */ +int +mdblindwrt(char *dbstr, + char *relstr, + Oid dbid, + Oid relid, + BlockNumber blkno, + char *buffer) +{ + int fd; + int segno; + long seekpos; + int status; + char *path; + int nchars; + + /* be sure we have enough space for the '.segno', if any */ + segno = blkno / RELSEG_SIZE; + if (segno > 0) + nchars = 10; + else + nchars = 0; + + /* construct the path to the file and open it */ + if (dbid == (Oid) 0) { + path = (char *) palloc(strlen(DataDir) + sizeof(NameData) + 2 + nchars); + if (segno == 0) + sprintf(path, "%s/%.*s", DataDir, NAMEDATALEN, relstr); + else + sprintf(path, "%s/%.*s.%d", DataDir, NAMEDATALEN, relstr, segno); + } else { + path = (char *) palloc(strlen(DataDir) + strlen("/base/") + 2 * sizeof(NameData) + 2 + nchars); + if (segno == 0) + sprintf(path, "%s/base/%.*s/%.*s", DataDir, NAMEDATALEN, + dbstr, NAMEDATALEN, relstr); + else + sprintf(path, "%s/base/%.*s/%.*s.%d", DataDir, NAMEDATALEN, dbstr, + NAMEDATALEN, relstr, segno); + } + + if ((fd = open(path, O_RDWR, 0600)) < 0) + return (SM_FAIL); + + /* seek to the right spot */ + seekpos = (long) (BLCKSZ * (blkno % RELSEG_SIZE)); + if (lseek(fd, seekpos, SEEK_SET) != seekpos) { + (void) close(fd); + return (SM_FAIL); + } + + status = SM_SUCCESS; + + /* write and sync the block */ + if (write(fd, buffer, BLCKSZ) != BLCKSZ || fsync(fd) < 0) + status = SM_FAIL; + + if (close(fd) < 0) + status = SM_FAIL; + + pfree(path); + + return (status); +} + +/* + * mdnblocks() -- Get the number of blocks stored in a relation. + * + * Returns # of blocks or -1 on error. + */ +int +mdnblocks(Relation reln) +{ + int fd; + MdfdVec *v; + int nblocks; + int segno; + + fd = RelationGetFile(reln); + v = &Md_fdvec[fd]; + +#ifdef DIAGNOSTIC + if (_mdnblocks(v->mdfd_vfd, BLCKSZ) > RELSEG_SIZE) + elog(FATAL, "segment too big in getseg!"); +#endif + + segno = 0; + for (;;) { + if (v->mdfd_lstbcnt == RELSEG_SIZE + || (nblocks = _mdnblocks(v->mdfd_vfd, BLCKSZ)) == RELSEG_SIZE) { + + v->mdfd_lstbcnt = RELSEG_SIZE; + segno++; + + if (v->mdfd_chain == (MdfdVec *) NULL) { + v->mdfd_chain = _mdfd_openseg(reln, segno, O_CREAT); + if (v->mdfd_chain == (MdfdVec *) NULL) + elog(WARN, "cannot count blocks for %.16s -- open failed", + RelationGetRelationName(reln)); + } + + v = v->mdfd_chain; + } else { + return ((segno * RELSEG_SIZE) + nblocks); + } + } +} + +/* + * mdcommit() -- Commit a transaction. + * + * All changes to magnetic disk relations must be forced to stable + * storage. This routine makes a pass over the private table of + * file descriptors. Any descriptors to which we have done writes, + * but not synced, are synced here. + * + * Returns SM_SUCCESS or SM_FAIL with errno set as appropriate. + */ +int +mdcommit() +{ + int i; + MdfdVec *v; + + for (i = 0; i < CurFd; i++) { + for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain) { + if (v->mdfd_flags & MDFD_DIRTY) { + if (FileSync(v->mdfd_vfd) < 0) + return (SM_FAIL); + + v->mdfd_flags &= ~MDFD_DIRTY; + } + } + } + + return (SM_SUCCESS); +} + +/* + * mdabort() -- Abort a transaction. + * + * Changes need not be forced to disk at transaction abort. We mark + * all file descriptors as clean here. Always returns SM_SUCCESS. + */ +int +mdabort() +{ + int i; + MdfdVec *v; + + for (i = 0; i < CurFd; i++) { + for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain) { + v->mdfd_flags &= ~MDFD_DIRTY; + } + } + + return (SM_SUCCESS); +} + +/* + * _fdvec_ext() -- Extend the md file descriptor vector. 
+ * + * The file descriptor vector must be large enough to hold at least + * 'fd' entries. + */ +static +int _fdvec_ext() +{ + MdfdVec *nvec; + MemoryContext oldcxt; + + Nfds *= 2; + + oldcxt = MemoryContextSwitchTo(MdCxt); + + nvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec)); + memset(nvec, 0, Nfds * sizeof(MdfdVec)); + memmove(nvec, (char *) Md_fdvec, (Nfds / 2) * sizeof(MdfdVec)); + pfree(Md_fdvec); + + (void) MemoryContextSwitchTo(oldcxt); + + Md_fdvec = nvec; + + return (SM_SUCCESS); +} + +static MdfdVec * +_mdfd_openseg(Relation reln, int segno, int oflags) +{ + MemoryContext oldcxt; + MdfdVec *v; + int fd; + bool dofree; + char *path, *fullpath; + + /* be sure we have enough space for the '.segno', if any */ + path = relpath(RelationGetRelationName(reln)->data); + + dofree = false; + if (segno > 0) { + dofree = true; + fullpath = (char *) palloc(strlen(path) + 12); + sprintf(fullpath, "%s.%d", path, segno); + } else + fullpath = path; + + /* open the file */ + fd = PathNameOpenFile(fullpath, O_RDWR|oflags, 0600); + + if (dofree) + pfree(fullpath); + + if (fd < 0) + return ((MdfdVec *) NULL); + + /* allocate an mdfdvec entry for it */ + oldcxt = MemoryContextSwitchTo(MdCxt); + v = (MdfdVec *) palloc(sizeof(MdfdVec)); + (void) MemoryContextSwitchTo(oldcxt); + + /* fill the entry */ + v->mdfd_vfd = fd; + v->mdfd_flags = (uint16) 0; + v->mdfd_chain = (MdfdVec *) NULL; + v->mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ); + +#ifdef DIAGNOSTIC + if (v->mdfd_lstbcnt > RELSEG_SIZE) + elog(FATAL, "segment too big on open!"); +#endif + + /* all done */ + return (v); +} + +static MdfdVec * +_mdfd_getseg(Relation reln, int blkno, int oflag) +{ + MdfdVec *v; + int segno; + int fd; + int i; + + fd = RelationGetFile(reln); + if (fd < 0) { + if ((fd = mdopen(reln)) < 0) + elog(WARN, "cannot open relation %.16s", + RelationGetRelationName(reln)); + reln->rd_fd = fd; + } + + for (v = &Md_fdvec[fd], segno = blkno / RELSEG_SIZE, i = 1; + segno > 0; + i++, segno--) { + + if (v->mdfd_chain == (MdfdVec *) NULL) { + v->mdfd_chain = _mdfd_openseg(reln, i, oflag); + + if (v->mdfd_chain == (MdfdVec *) NULL) + elog(WARN, "cannot open segment %d of relation %.16s", + i, RelationGetRelationName(reln)); + } + v = v->mdfd_chain; + } + + return (v); +} + +static BlockNumber +_mdnblocks(File file, Size blcksz) +{ + long len; + + len = FileSeek(file, 0L, SEEK_END) - 1; + return((BlockNumber)((len < 0) ? 0 : 1 + len / blcksz)); +} diff --git a/src/backend/storage/smgr/mm.c b/src/backend/storage/smgr/mm.c new file mode 100644 index 00000000000..24a8d2472a6 --- /dev/null +++ b/src/backend/storage/smgr/mm.c @@ -0,0 +1,586 @@ +/*------------------------------------------------------------------------- + * + * mm.c-- + * main memory storage manager + * + * This code manages relations that reside in (presumably stable) + * main memory. 
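+ *
+ *    Blocks are kept in a fixed pool of MMNBUFFERS slots in shared
+ *    memory and located through a shared hash table keyed on
+ *    (database oid, relation oid, block number); a second shared hash
+ *    table records the block count of each of at most MMNRELATIONS
+ *    relations.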
+ * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/smgr/Attic/mm.c,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#ifdef MAIN_MEMORY + +#include <math.h> +#include "machine.h" +#include "storage/ipc.h" +#include "storage/smgr.h" /* where the declarations go */ +#include "storage/block.h" +#include "storage/shmem.h" +#include "storage/spin.h" + +#include "utils/hsearch.h" +#include "utils/rel.h" +#include "utils/elog.h" +#include "utils/memutils.h" + +/* + * MMCacheTag -- Unique triplet for blocks stored by the main memory + * storage manager. + */ + +typedef struct MMCacheTag { + Oid mmct_dbid; + Oid mmct_relid; + BlockNumber mmct_blkno; +} MMCacheTag; + +/* + * Shared-memory hash table for main memory relations contains + * entries of this form. + */ + +typedef struct MMHashEntry { + MMCacheTag mmhe_tag; + int mmhe_bufno; +} MMHashEntry; + +/* + * MMRelTag -- Unique identifier for each relation that is stored in the + * main-memory storage manager. + */ + +typedef struct MMRelTag { + Oid mmrt_dbid; + Oid mmrt_relid; +} MMRelTag; + +/* + * Shared-memory hash table for # blocks in main memory relations contains + * entries of this form. + */ + +typedef struct MMRelHashEntry { + MMRelTag mmrhe_tag; + int mmrhe_nblocks; +} MMRelHashEntry; + +#define MMNBUFFERS 10 +#define MMNRELATIONS 2 + +SPINLOCK MMCacheLock; +extern bool IsPostmaster; +extern Oid MyDatabaseId; + +static int *MMCurTop; +static int *MMCurRelno; +static MMCacheTag *MMBlockTags; +static char *MMBlockCache; +static HTAB *MMCacheHT; +static HTAB *MMRelCacheHT; + +int +mminit() +{ + char *mmcacheblk; + int mmsize = 0; + bool found; + HASHCTL info; + + SpinAcquire(MMCacheLock); + + mmsize += MAXALIGN(BLCKSZ * MMNBUFFERS); + mmsize += MAXALIGN(sizeof(*MMCurTop)); + mmsize += MAXALIGN(sizeof(*MMCurRelno)); + mmsize += MAXALIGN((MMNBUFFERS * sizeof(MMCacheTag))); + mmcacheblk = (char *) ShmemInitStruct("Main memory smgr", mmsize, &found); + + if (mmcacheblk == (char *) NULL) { + SpinRelease(MMCacheLock); + return (SM_FAIL); + } + + info.keysize = sizeof(MMCacheTag); + info.datasize = sizeof(int); + info.hash = tag_hash; + + MMCacheHT = (HTAB *) ShmemInitHash("Main memory store HT", + MMNBUFFERS, MMNBUFFERS, + &info, (HASH_ELEM|HASH_FUNCTION)); + + if (MMCacheHT == (HTAB *) NULL) { + SpinRelease(MMCacheLock); + return (SM_FAIL); + } + + info.keysize = sizeof(MMRelTag); + info.datasize = sizeof(int); + info.hash = tag_hash; + + MMRelCacheHT = (HTAB *) ShmemInitHash("Main memory rel HT", + MMNRELATIONS, MMNRELATIONS, + &info, (HASH_ELEM|HASH_FUNCTION)); + + if (MMRelCacheHT == (HTAB *) NULL) { + SpinRelease(MMCacheLock); + return (SM_FAIL); + } + + if (IsPostmaster) { + memset(mmcacheblk, 0, mmsize); + SpinRelease(MMCacheLock); + return (SM_SUCCESS); + } + + SpinRelease(MMCacheLock); + + MMCurTop = (int *) mmcacheblk; + mmcacheblk += sizeof(int); + MMCurRelno = (int *) mmcacheblk; + mmcacheblk += sizeof(int); + MMBlockTags = (MMCacheTag *) mmcacheblk; + mmcacheblk += (MMNBUFFERS * sizeof(MMCacheTag)); + MMBlockCache = mmcacheblk; + + return (SM_SUCCESS); +} + +int +mmshutdown() +{ + return (SM_SUCCESS); +} + +int +mmcreate(Relation reln) +{ + MMRelHashEntry *entry; + bool found; + MMRelTag tag; + + SpinAcquire(MMCacheLock); + + if (*MMCurRelno == MMNRELATIONS) { + SpinRelease(MMCacheLock); + return (SM_FAIL); + } + + (*MMCurRelno)++; + + 
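+    /*
+     * Shared relations are tagged with database id 0 so every backend
+     * keys them identically; ordinary relations are tagged with the
+     * current database's oid.
+     */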
tag.mmrt_relid = reln->rd_id; + if (reln->rd_rel->relisshared) + tag.mmrt_dbid = (Oid) 0; + else + tag.mmrt_dbid = MyDatabaseId; + + entry = (MMRelHashEntry *) hash_search(MMRelCacheHT, + (char *) &tag, HASH_ENTER, &found); + + if (entry == (MMRelHashEntry *) NULL) { + SpinRelease(MMCacheLock); + elog(FATAL, "main memory storage mgr rel cache hash table corrupt"); + } + + if (found) { + /* already exists */ + SpinRelease(MMCacheLock); + return (SM_FAIL); + } + + entry->mmrhe_nblocks = 0; + + SpinRelease(MMCacheLock); + + return (SM_SUCCESS); +} + +/* + * mmunlink() -- Unlink a relation. + */ +int +mmunlink(Relation reln) +{ + int i; + Oid reldbid; + MMHashEntry *entry; + MMRelHashEntry *rentry; + bool found; + MMRelTag rtag; + + if (reln->rd_rel->relisshared) + reldbid = (Oid) 0; + else + reldbid = MyDatabaseId; + + SpinAcquire(MMCacheLock); + + for (i = 0; i < MMNBUFFERS; i++) { + if (MMBlockTags[i].mmct_dbid == reldbid + && MMBlockTags[i].mmct_relid == reln->rd_id) { + entry = (MMHashEntry *) hash_search(MMCacheHT, + (char *) &MMBlockTags[i], + HASH_REMOVE, &found); + if (entry == (MMHashEntry *) NULL || !found) { + SpinRelease(MMCacheLock); + elog(FATAL, "mmunlink: cache hash table corrupted"); + } + MMBlockTags[i].mmct_dbid = (Oid) 0; + MMBlockTags[i].mmct_relid = (Oid) 0; + MMBlockTags[i].mmct_blkno = (BlockNumber) 0; + } + } + rtag.mmrt_dbid = reldbid; + rtag.mmrt_relid = reln->rd_id; + + rentry = (MMRelHashEntry *) hash_search(MMRelCacheHT, (char *) &rtag, + HASH_REMOVE, &found); + + if (rentry == (MMRelHashEntry *) NULL || !found) { + SpinRelease(MMCacheLock); + elog(FATAL, "mmunlink: rel cache hash table corrupted"); + } + + (*MMCurRelno)--; + + SpinRelease(MMCacheLock); + return 1; +} + +/* + * mmextend() -- Add a block to the specified relation. + * + * This routine returns SM_FAIL or SM_SUCCESS, with errno set as + * appropriate. + */ +int +mmextend(Relation reln, char *buffer) +{ + MMRelHashEntry *rentry; + MMHashEntry *entry; + int i; + Oid reldbid; + int offset; + bool found; + MMRelTag rtag; + MMCacheTag tag; + + if (reln->rd_rel->relisshared) + reldbid = (Oid) 0; + else + reldbid = MyDatabaseId; + + tag.mmct_dbid = rtag.mmrt_dbid = reldbid; + tag.mmct_relid = rtag.mmrt_relid = reln->rd_id; + + SpinAcquire(MMCacheLock); + + if (*MMCurTop == MMNBUFFERS) { + for (i = 0; i < MMNBUFFERS; i++) { + if (MMBlockTags[i].mmct_dbid == 0 && + MMBlockTags[i].mmct_relid == 0) + break; + } + if (i == MMNBUFFERS) { + SpinRelease(MMCacheLock); + return (SM_FAIL); + } + } else { + i = *MMCurTop; + (*MMCurTop)++; + } + + rentry = (MMRelHashEntry *) hash_search(MMRelCacheHT, (char *) &rtag, + HASH_FIND, &found); + if (rentry == (MMRelHashEntry *) NULL || !found) { + SpinRelease(MMCacheLock); + elog(FATAL, "mmextend: rel cache hash table corrupt"); + } + + tag.mmct_blkno = rentry->mmrhe_nblocks; + + entry = (MMHashEntry *) hash_search(MMCacheHT, (char *) &tag, + HASH_ENTER, &found); + if (entry == (MMHashEntry *) NULL || found) { + SpinRelease(MMCacheLock); + elog(FATAL, "mmextend: cache hash table corrupt"); + } + + entry->mmhe_bufno = i; + MMBlockTags[i].mmct_dbid = reldbid; + MMBlockTags[i].mmct_relid = reln->rd_id; + MMBlockTags[i].mmct_blkno = rentry->mmrhe_nblocks; + + /* page numbers are zero-based, so we increment this at the end */ + (rentry->mmrhe_nblocks)++; + + /* write the extended page */ + offset = (i * BLCKSZ); + memmove(&(MMBlockCache[offset]), buffer, BLCKSZ); + + SpinRelease(MMCacheLock); + + return (SM_SUCCESS); +} + +/* + * mmopen() -- Open the specified relation. 
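+ *
+ *      Main memory relations keep no per-backend open state, so this
+ *      always succeeds and returns 0.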
+ */
+int
+mmopen(Relation reln)
+{
+    /* automatically successful */
+    return (0);
+}
+
+/*
+ *  mmclose() -- Close the specified relation.
+ *
+ *      Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
+ */
+int
+mmclose(Relation reln)
+{
+    /* automatically successful */
+    return (SM_SUCCESS);
+}
+
+/*
+ *  mmread() -- Read the specified block from a relation.
+ *
+ *      Returns SM_SUCCESS or SM_FAIL.
+ */
+int
+mmread(Relation reln, BlockNumber blocknum, char *buffer)
+{
+    MMHashEntry *entry;
+    bool found;
+    int offset;
+    MMCacheTag tag;
+
+    if (reln->rd_rel->relisshared)
+	tag.mmct_dbid = (Oid) 0;
+    else
+	tag.mmct_dbid = MyDatabaseId;
+
+    tag.mmct_relid = reln->rd_id;
+    tag.mmct_blkno = blocknum;
+
+    SpinAcquire(MMCacheLock);
+    entry = (MMHashEntry *) hash_search(MMCacheHT, (char *) &tag,
+					HASH_FIND, &found);
+
+    if (entry == (MMHashEntry *) NULL) {
+	SpinRelease(MMCacheLock);
+	elog(FATAL, "mmread: hash table corrupt");
+    }
+
+    if (!found) {
+	/* reading nonexistent pages is defined to fill them with zeroes */
+	SpinRelease(MMCacheLock);
+	memset(buffer, 0, BLCKSZ);
+	return (SM_SUCCESS);
+    }
+
+    offset = (entry->mmhe_bufno * BLCKSZ);
+    memmove(buffer, &MMBlockCache[offset], BLCKSZ);
+
+    SpinRelease(MMCacheLock);
+
+    return (SM_SUCCESS);
+}
+
+/*
+ *  mmwrite() -- Write the supplied block at the appropriate location.
+ *
+ *      Returns SM_SUCCESS or SM_FAIL.
+ */
+int
+mmwrite(Relation reln, BlockNumber blocknum, char *buffer)
+{
+    MMHashEntry *entry;
+    bool found;
+    int offset;
+    MMCacheTag tag;
+
+    if (reln->rd_rel->relisshared)
+	tag.mmct_dbid = (Oid) 0;
+    else
+	tag.mmct_dbid = MyDatabaseId;
+
+    tag.mmct_relid = reln->rd_id;
+    tag.mmct_blkno = blocknum;
+
+    SpinAcquire(MMCacheLock);
+    entry = (MMHashEntry *) hash_search(MMCacheHT, (char *) &tag,
+					HASH_FIND, &found);
+
+    if (entry == (MMHashEntry *) NULL) {
+	SpinRelease(MMCacheLock);
+	elog(FATAL, "mmwrite: hash table corrupt");
+    }
+
+    if (!found) {
+	SpinRelease(MMCacheLock);
+	elog(FATAL, "mmwrite: hash table missing requested page");
+    }
+
+    offset = (entry->mmhe_bufno * BLCKSZ);
+    memmove(&MMBlockCache[offset], buffer, BLCKSZ);
+
+    SpinRelease(MMCacheLock);
+
+    return (SM_SUCCESS);
+}
+
+/*
+ *  mmflush() -- Synchronously write a block to stable storage.
+ *
+ *      For main-memory relations, this is exactly equivalent to mmwrite().
+ */
+int
+mmflush(Relation reln, BlockNumber blocknum, char *buffer)
+{
+    return (mmwrite(reln, blocknum, buffer));
+}
+
+/*
+ *  mmblindwrt() -- Write a block to stable storage blind.
+ *
+ *      We have to be able to do this using only the name and OID of
+ *      the database and relation in which the block belongs.
+ */
+int
+mmblindwrt(char *dbstr,
+	   char *relstr,
+	   Oid dbid,
+	   Oid relid,
+	   BlockNumber blkno,
+	   char *buffer)
+{
+    return (SM_FAIL);
+}
+
+/*
+ *  mmnblocks() -- Get the number of blocks stored in a relation.
+ *
+ *      Returns # of blocks or -1 on error.
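+ *
+ *      The count comes from the per-relation hash table entry that
+ *      mmcreate() initializes and mmextend() advances; the block cache
+ *      itself is never scanned.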
+ */ +int +mmnblocks(Relation reln) +{ + MMRelTag rtag; + MMRelHashEntry *rentry; + bool found; + int nblocks; + + if (reln->rd_rel->relisshared) + rtag.mmrt_dbid = (Oid) 0; + else + rtag.mmrt_dbid = MyDatabaseId; + + rtag.mmrt_relid = reln->rd_id; + + SpinAcquire(MMCacheLock); + + rentry = (MMRelHashEntry *) hash_search(MMRelCacheHT, (char *) &rtag, + HASH_FIND, &found); + + if (rentry == (MMRelHashEntry *) NULL) { + SpinRelease(MMCacheLock); + elog(FATAL, "mmnblocks: rel cache hash table corrupt"); + } + + if (found) + nblocks = rentry->mmrhe_nblocks; + else + nblocks = -1; + + SpinRelease(MMCacheLock); + + return (nblocks); +} + +/* + * mmcommit() -- Commit a transaction. + * + * Returns SM_SUCCESS or SM_FAIL with errno set as appropriate. + */ +int +mmcommit() +{ + return (SM_SUCCESS); +} + +/* + * mmabort() -- Abort a transaction. + */ + +int +mmabort() +{ + return (SM_SUCCESS); +} + +/* + * MMShmemSize() -- Declare amount of shared memory we require. + * + * The shared memory initialization code creates a block of shared + * memory exactly big enough to hold all the structures it needs to. + * This routine declares how much space the main memory storage + * manager will use. + */ +int +MMShmemSize() +{ + int size = 0; + int nbuckets; + int nsegs; + int tmp; + + /* + * first compute space occupied by the (dbid,relid,blkno) hash table + */ + + nbuckets = 1 << (int)my_log2((MMNBUFFERS - 1) / DEF_FFACTOR + 1); + nsegs = 1 << (int)my_log2((nbuckets - 1) / DEF_SEGSIZE + 1); + + size += MAXALIGN(my_log2(MMNBUFFERS) * sizeof(void *)); + size += MAXALIGN(sizeof(HHDR)); + size += nsegs * MAXALIGN(DEF_SEGSIZE * sizeof(SEGMENT)); + tmp = (int)ceil((double)MMNBUFFERS/BUCKET_ALLOC_INCR); + size += tmp * BUCKET_ALLOC_INCR * + (MAXALIGN(sizeof(BUCKET_INDEX)) + + MAXALIGN(sizeof(MMHashEntry))); /* contains hash key */ + + /* + * now do the same for the rel hash table + */ + + size += MAXALIGN(my_log2(MMNRELATIONS) * sizeof(void *)); + size += MAXALIGN(sizeof(HHDR)); + size += nsegs * MAXALIGN(DEF_SEGSIZE * sizeof(SEGMENT)); + tmp = (int)ceil((double)MMNRELATIONS/BUCKET_ALLOC_INCR); + size += tmp * BUCKET_ALLOC_INCR * + (MAXALIGN(sizeof(BUCKET_INDEX)) + + MAXALIGN(sizeof(MMRelHashEntry))); /* contains hash key */ + + /* + * finally, add in the memory block we use directly + */ + + size += MAXALIGN(BLCKSZ * MMNBUFFERS); + size += MAXALIGN(sizeof(*MMCurTop)); + size += MAXALIGN(sizeof(*MMCurRelno)); + size += MAXALIGN(MMNBUFFERS * sizeof(MMCacheTag)); + + return (size); +} + +#endif /* MAIN_MEMORY */ diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c new file mode 100644 index 00000000000..426c3d93480 --- /dev/null +++ b/src/backend/storage/smgr/smgr.c @@ -0,0 +1,371 @@ +/*------------------------------------------------------------------------- + * + * smgr.c-- + * public interface routines to storage manager switch. + * + * All file system operations in POSTGRES dispatch through these + * routines. 
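+ *
+ *    Each routine takes a storage manager number ("which") as its first
+ *    argument and dispatches through the smgrsw[] function-pointer
+ *    table below.  smgrin() in smgrtype.c maps a manager name such as
+ *    "magnetic disk" to that number (0 is the magnetic disk manager),
+ *    so a caller reads a block with, e.g.,
+ *
+ *        smgrread(which, reln, blocknum, buffer);
+ *
+ *    (illustrative call only; the real callers are in the buffer
+ *    manager, as noted at smgrread() below).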
+ * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include <string.h> +#include "postgres.h" + +#include "machine.h" +#include "storage/ipc.h" +#include "storage/smgr.h" +#include "storage/block.h" +#include "utils/rel.h" +#include "utils/elog.h" +#include "utils/palloc.h" + +typedef struct f_smgr { + int (*smgr_init)(); /* may be NULL */ + int (*smgr_shutdown)(); /* may be NULL */ + int (*smgr_create)(); + int (*smgr_unlink)(); + int (*smgr_extend)(); + int (*smgr_open)(); + int (*smgr_close)(); + int (*smgr_read)(); + int (*smgr_write)(); + int (*smgr_flush)(); + int (*smgr_blindwrt)(); + int (*smgr_nblocks)(); + int (*smgr_commit)(); /* may be NULL */ + int (*smgr_abort)(); /* may be NULL */ +} f_smgr; + +/* + * The weird placement of commas in this init block is to keep the compiler + * happy, regardless of what storage managers we have (or don't have). + */ + +static f_smgr smgrsw[] = { + + /* magnetic disk */ + { mdinit, NULL, mdcreate, mdunlink, mdextend, mdopen, mdclose, + mdread, mdwrite, mdflush, mdblindwrt, mdnblocks, mdcommit, mdabort }, + +#ifdef MAIN_MEMORY + /* main memory */ + { mminit, mmshutdown, mmcreate, mmunlink, mmextend, mmopen, mmclose, + mmread, mmwrite, mmflush, mmblindwrt, mmnblocks, mmcommit, mmabort }, + +#endif /* MAIN_MEMORY */ +}; + +/* + * This array records which storage managers are write-once, and which + * support overwrite. A 'true' entry means that the storage manager is + * write-once. In the best of all possible worlds, there would be no + * write-once storage managers. + */ + +static bool smgrwo[] = { + false, /* magnetic disk */ +#ifdef MAIN_MEMORY + false, /* main memory*/ +#endif /* MAIN_MEMORY */ +}; +static int NSmgr = lengthof(smgrsw); + +/* + * smgrinit(), smgrshutdown() -- Initialize or shut down all storage + * managers. + * + */ +int +smgrinit() +{ + int i; + extern char *smgrout(); + + for (i = 0; i < NSmgr; i++) { + if (smgrsw[i].smgr_init) { + if ((*(smgrsw[i].smgr_init))() == SM_FAIL) + elog(FATAL, "initialization failed on %s", smgrout(i)); + } + } + + /* register the shutdown proc */ + on_exitpg(smgrshutdown, 0); + + return (SM_SUCCESS); +} + +void +smgrshutdown(int dummy) +{ + int i; + extern char *smgrout(); + + for (i = 0; i < NSmgr; i++) { + if (smgrsw[i].smgr_shutdown) { + if ((*(smgrsw[i].smgr_shutdown))() == SM_FAIL) + elog(FATAL, "shutdown failed on %s", smgrout(i)); + } + } +} + +/* + * smgrcreate() -- Create a new relation. + * + * This routine takes a reldesc, creates the relation on the appropriate + * device, and returns a file descriptor for it. + */ +int +smgrcreate(int16 which, Relation reln) +{ + int fd; + + if ((fd = (*(smgrsw[which].smgr_create))(reln)) < 0) + elog(WARN, "cannot open %.*s", + NAMEDATALEN, &(reln->rd_rel->relname.data[0])); + + return (fd); +} + +/* + * smgrunlink() -- Unlink a relation. + * + * The relation is removed from the store. + */ +int +smgrunlink(int16 which, Relation reln) +{ + int status; + + if ((status = (*(smgrsw[which].smgr_unlink))(reln)) == SM_FAIL) + elog(WARN, "cannot unlink %.*s", + NAMEDATALEN, &(reln->rd_rel->relname.data[0])); + + return (status); +} + +/* + * smgrextend() -- Add a new block to a file. + * + * Returns SM_SUCCESS on success; aborts the current transaction on + * failure. 
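+ *
+ *      The buffer becomes the new last block of the relation; a caller
+ *      that needs the new block's number can take the prior count from
+ *      smgrnblocks() before extending.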
+ */ +int +smgrextend(int16 which, Relation reln, char *buffer) +{ + int status; + + status = (*(smgrsw[which].smgr_extend))(reln, buffer); + + if (status == SM_FAIL) + elog(WARN, "%.*s: cannot extend", + NAMEDATALEN, &(reln->rd_rel->relname.data[0])); + + return (status); +} + +/* + * smgropen() -- Open a relation using a particular storage manager. + * + * Returns the fd for the open relation on success, aborts the + * transaction on failure. + */ +int +smgropen(int16 which, Relation reln) +{ + int fd; + + if ((fd = (*(smgrsw[which].smgr_open))(reln)) < 0) + elog(WARN, "cannot open %.*s", + NAMEDATALEN, &(reln->rd_rel->relname.data[0])); + + return (fd); +} + +/* + * smgrclose() -- Close a relation. + * + * Returns SM_SUCCESS on success, aborts on failure. + */ +int +smgrclose(int16 which, Relation reln) +{ + if ((*(smgrsw[which].smgr_close))(reln) == SM_FAIL) + elog(WARN, "cannot close %.*s", + NAMEDATALEN, &(reln->rd_rel->relname.data[0])); + + return (SM_SUCCESS); +} + +/* + * smgrread() -- read a particular block from a relation into the supplied + * buffer. + * + * This routine is called from the buffer manager in order to + * instantiate pages in the shared buffer cache. All storage managers + * return pages in the format that POSTGRES expects. This routine + * dispatches the read. On success, it returns SM_SUCCESS. On failure, + * the current transaction is aborted. + */ +int +smgrread(int16 which, Relation reln, BlockNumber blocknum, char *buffer) +{ + int status; + + status = (*(smgrsw[which].smgr_read))(reln, blocknum, buffer); + + if (status == SM_FAIL) + elog(WARN, "cannot read block %d of %.*s", + blocknum, NAMEDATALEN, &(reln->rd_rel->relname.data[0])); + + return (status); +} + +/* + * smgrwrite() -- Write the supplied buffer out. + * + * This is not a synchronous write -- the interface for that is + * smgrflush(). The buffer is written out via the appropriate + * storage manager. This routine returns SM_SUCCESS or aborts + * the current transaction. + */ +int +smgrwrite(int16 which, Relation reln, BlockNumber blocknum, char *buffer) +{ + int status; + + status = (*(smgrsw[which].smgr_write))(reln, blocknum, buffer); + + if (status == SM_FAIL) + elog(WARN, "cannot write block %d of %.*s", + blocknum, NAMEDATALEN, &(reln->rd_rel->relname.data[0])); + + return (status); +} + +/* + * smgrflush() -- A synchronous smgrwrite(). + */ +int +smgrflush(int16 which, Relation reln, BlockNumber blocknum, char *buffer) +{ + int status; + + status = (*(smgrsw[which].smgr_flush))(reln, blocknum, buffer); + + if (status == SM_FAIL) + elog(WARN, "cannot flush block %d of %.*s to stable store", + blocknum, NAMEDATALEN, &(reln->rd_rel->relname.data[0])); + + return (status); +} + +/* + * smgrblindwrt() -- Write a page out blind. + * + * In some cases, we may find a page in the buffer cache that we + * can't make a reldesc for. This happens, for example, when we + * want to reuse a dirty page that was written by a transaction + * that has not yet committed, which created a new relation. In + * this case, the buffer manager will call smgrblindwrt() with + * the name and OID of the database and the relation to which the + * buffer belongs. Every storage manager must be able to force + * this page down to stable storage in this circumstance. 
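+ *
+ *      The database and relation names are pstrdup'd before the call.
+ *      In this release only the magnetic disk manager can honor a
+ *      blind write; mmblindwrt() simply returns SM_FAIL.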
+ */ +int +smgrblindwrt(int16 which, + char *dbname, + char *relname, + Oid dbid, + Oid relid, + BlockNumber blkno, + char *buffer) +{ + char *dbstr; + char *relstr; + int status; + + dbstr = pstrdup(dbname); + relstr = pstrdup(relname); + + status = (*(smgrsw[which].smgr_blindwrt))(dbstr, relstr, dbid, relid, + blkno, buffer); + + if (status == SM_FAIL) + elog(WARN, "cannot write block %d of %s [%s] blind", + blkno, relstr, dbstr); + + pfree(dbstr); + pfree(relstr); + + return (status); +} + +/* + * smgrnblocks() -- Calculate the number of POSTGRES blocks in the + * supplied relation. + * + * Returns the number of blocks on success, aborts the current + * transaction on failure. + */ +int +smgrnblocks(int16 which, Relation reln) +{ + int nblocks; + + if ((nblocks = (*(smgrsw[which].smgr_nblocks))(reln)) < 0) + elog(WARN, "cannot count blocks for %.*s", + NAMEDATALEN, &(reln->rd_rel->relname.data[0])); + + return (nblocks); +} + +/* + * smgrcommit(), smgrabort() -- Commit or abort changes made during the + * current transaction. + */ +int +smgrcommit() +{ + int i; + extern char *smgrout(); + + for (i = 0; i < NSmgr; i++) { + if (smgrsw[i].smgr_commit) { + if ((*(smgrsw[i].smgr_commit))() == SM_FAIL) + elog(FATAL, "transaction commit failed on %s", smgrout(i)); + } + } + + return (SM_SUCCESS); +} + +int +smgrabort() +{ + int i; + extern char *smgrout(); + + for (i = 0; i < NSmgr; i++) { + if (smgrsw[i].smgr_abort) { + if ((*(smgrsw[i].smgr_abort))() == SM_FAIL) + elog(FATAL, "transaction abort failed on %s", smgrout(i)); + } + } + + return (SM_SUCCESS); +} + +bool +smgriswo(int16 smgrno) +{ + if (smgrno < 0 || smgrno >= NSmgr) + elog(WARN, "illegal storage manager number %d", smgrno); + + return (smgrwo[smgrno]); +} diff --git a/src/backend/storage/smgr/smgrtype.c b/src/backend/storage/smgr/smgrtype.c new file mode 100644 index 00000000000..5c90d590914 --- /dev/null +++ b/src/backend/storage/smgr/smgrtype.c @@ -0,0 +1,82 @@ +/*------------------------------------------------------------------------- + * + * smgrtype.c-- + * storage manager type + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgrtype.c,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include <string.h> +#include "postgres.h" + +#include "utils/builtins.h" /* where the declarations go */ +#include "utils/elog.h" +#include "utils/palloc.h" +#include "storage/smgr.h" + +typedef struct smgrid { + char *smgr_name; +} smgrid; + +/* + * StorageManager[] -- List of defined storage managers. + * + * The weird comma placement is to keep compilers happy no matter + * which of these is (or is not) defined. 
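+ *
+ *      The order of entries must match smgrsw[] in smgr.c: the index
+ *      returned by smgrin() is the storage manager number used to
+ *      dispatch every smgr call.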
+ */ + +static smgrid StorageManager[] = { + {"magnetic disk"}, +#ifdef MAIN_MEMORY + {"main memory"} +#endif /* MAIN_MEMORY */ +}; + +static int NStorageManagers = lengthof(StorageManager); + +int2 +smgrin(char *s) +{ + int i; + + for (i = 0; i < NStorageManagers; i++) { + if (strcmp(s, StorageManager[i].smgr_name) == 0) + return((int2) i); + } + elog(WARN, "smgrin: illegal storage manager name %s", s); + return 0; +} + +char * +smgrout(int2 i) +{ + char *s; + + if (i >= NStorageManagers || i < 0) + elog(WARN, "Illegal storage manager id %d", i); + + s = (char *) palloc(strlen(StorageManager[i].smgr_name) + 1); + strcpy(s, StorageManager[i].smgr_name); + return (s); +} + +bool +smgreq(int2 a, int2 b) +{ + if (a == b) + return (true); + return (false); +} + +bool +smgrne(int2 a, int2 b) +{ + if (a == b) + return (false); + return (true); +} diff --git a/src/backend/storage/spin.h b/src/backend/storage/spin.h new file mode 100644 index 00000000000..32037684ec1 --- /dev/null +++ b/src/backend/storage/spin.h @@ -0,0 +1,38 @@ +/*------------------------------------------------------------------------- + * + * spin.h-- + * synchronization routines + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: spin.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef SPIN_H +#define SPIN_H + +#include "ipc.h" + +/* + * two implementations of spin locks + * + * sequent, sparc, sun3: real spin locks. uses a TAS instruction; see + * src/storage/ipc/s_lock.c for details. + * + * default: fake spin locks using semaphores. see spin.c + * + */ + +typedef int SPINLOCK; + +extern bool CreateSpinlocks(IPCKey key); +extern bool AttachSpinLocks(IPCKey key); +extern bool InitSpinLocks(int init, IPCKey key); + +extern void SpinAcquire(SPINLOCK lock); +extern void SpinRelease(SPINLOCK lock); +extern bool SpinIsLocked(SPINLOCK lock); + +#endif /* SPIN_H */ |