Diffstat (limited to 'src/backend/access')
-rw-r--r--  src/backend/access/Makefile.inc  35
-rw-r--r--  src/backend/access/attnum.h  61
-rw-r--r--  src/backend/access/common/Makefile.inc  16
-rw-r--r--  src/backend/access/common/heaptuple.c  1011
-rw-r--r--  src/backend/access/common/heapvalid.c  134
-rw-r--r--  src/backend/access/common/indextuple.c  427
-rw-r--r--  src/backend/access/common/indexvalid.c  84
-rw-r--r--  src/backend/access/common/printtup.c  306
-rw-r--r--  src/backend/access/common/scankey.c  68
-rw-r--r--  src/backend/access/common/tupdesc.c  398
-rw-r--r--  src/backend/access/funcindex.h  43
-rw-r--r--  src/backend/access/genam.h  60
-rw-r--r--  src/backend/access/hash.h  336
-rw-r--r--  src/backend/access/hash/Makefile.inc  18
-rw-r--r--  src/backend/access/hash/hash.c  467
-rw-r--r--  src/backend/access/hash/hashfunc.c  276
-rw-r--r--  src/backend/access/hash/hashinsert.c  239
-rw-r--r--  src/backend/access/hash/hashovfl.c  614
-rw-r--r--  src/backend/access/hash/hashpage.c  669
-rw-r--r--  src/backend/access/hash/hashscan.c  172
-rw-r--r--  src/backend/access/hash/hashsearch.c  425
-rw-r--r--  src/backend/access/hash/hashstrat.c  104
-rw-r--r--  src/backend/access/hash/hashutil.c  147
-rw-r--r--  src/backend/access/heap/Makefile.inc  14
-rw-r--r--  src/backend/access/heap/heapam.c  1507
-rw-r--r--  src/backend/access/heap/hio.c  195
-rw-r--r--  src/backend/access/heap/stats.c  329
-rw-r--r--  src/backend/access/heapam.h  149
-rw-r--r--  src/backend/access/hio.h  26
-rw-r--r--  src/backend/access/htup.h  115
-rw-r--r--  src/backend/access/ibit.h  34
-rw-r--r--  src/backend/access/index/Makefile.inc  14
-rw-r--r--  src/backend/access/index/genam.c  275
-rw-r--r--  src/backend/access/index/indexam.c  411
-rw-r--r--  src/backend/access/index/istrat.c  679
-rw-r--r--  src/backend/access/iqual.h  32
-rw-r--r--  src/backend/access/istrat.h  80
-rw-r--r--  src/backend/access/itup.h  104
-rw-r--r--  src/backend/access/nbtree.h  264
-rw-r--r--  src/backend/access/nbtree/Makefile.inc  15
-rw-r--r--  src/backend/access/nbtree/README  68
-rw-r--r--  src/backend/access/nbtree/nbtcompare.c  173
-rw-r--r--  src/backend/access/nbtree/nbtinsert.c  831
-rw-r--r--  src/backend/access/nbtree/nbtpage.c  523
-rw-r--r--  src/backend/access/nbtree/nbtree.c  516
-rw-r--r--  src/backend/access/nbtree/nbtscan.c  164
-rw-r--r--  src/backend/access/nbtree/nbtsearch.c  1133
-rw-r--r--  src/backend/access/nbtree/nbtsort.c  1196
-rw-r--r--  src/backend/access/nbtree/nbtstrat.c  134
-rw-r--r--  src/backend/access/nbtree/nbtutils.c  239
-rw-r--r--  src/backend/access/printtup.h  26
-rw-r--r--  src/backend/access/relscan.h  87
-rw-r--r--  src/backend/access/rtree.h  98
-rw-r--r--  src/backend/access/rtree/Makefile.inc  14
-rw-r--r--  src/backend/access/rtree/rtget.c  320
-rw-r--r--  src/backend/access/rtree/rtproc.c  150
-rw-r--r--  src/backend/access/rtree/rtree.c  955
-rw-r--r--  src/backend/access/rtree/rtscan.c  392
-rw-r--r--  src/backend/access/rtree/rtstrat.c  239
-rw-r--r--  src/backend/access/rtscan.h  17
-rw-r--r--  src/backend/access/rtstrat.h  18
-rw-r--r--  src/backend/access/sdir.h  57
-rw-r--r--  src/backend/access/skey.h  52
-rw-r--r--  src/backend/access/strat.h  86
-rw-r--r--  src/backend/access/transam.h  213
-rw-r--r--  src/backend/access/transam/Makefile.inc  14
-rw-r--r--  src/backend/access/transam/transam.c  675
-rw-r--r--  src/backend/access/transam/transsup.c  663
-rw-r--r--  src/backend/access/transam/varsup.c  606
-rw-r--r--  src/backend/access/transam/xact.c  1314
-rw-r--r--  src/backend/access/transam/xid.c  156
-rw-r--r--  src/backend/access/tupdesc.h  53
-rw-r--r--  src/backend/access/tupmacs.h  43
-rw-r--r--  src/backend/access/valid.h  37
-rw-r--r--  src/backend/access/xact.h  115
75 files changed, 21730 insertions, 0 deletions
diff --git a/src/backend/access/Makefile.inc b/src/backend/access/Makefile.inc
new file mode 100644
index 00000000000..6adc2c692b5
--- /dev/null
+++ b/src/backend/access/Makefile.inc
@@ -0,0 +1,35 @@
+#-------------------------------------------------------------------------
+#
+# Makefile.inc--
+# Makefile for the access methods module
+#
+# Copyright (c) 1994, Regents of the University of California
+#
+#
+# IDENTIFICATION
+# $Header: /cvsroot/pgsql/src/backend/access/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:08 scrappy Exp $
+#
+#-------------------------------------------------------------------------
+
+accdir=$(CURDIR)/access
+VPATH:=$(VPATH):$(accdir):\
+ $(accdir)/common:$(accdir)/hash:$(accdir)/heap:$(accdir)/index:\
+ $(accdir)/rtree:$(accdir)/nbtree:$(accdir)/transam
+
+
+SUBSRCS=
+include $(accdir)/common/Makefile.inc
+include $(accdir)/hash/Makefile.inc
+include $(accdir)/heap/Makefile.inc
+include $(accdir)/index/Makefile.inc
+include $(accdir)/rtree/Makefile.inc
+include $(accdir)/nbtree/Makefile.inc
+include $(accdir)/transam/Makefile.inc
+SRCS_ACCESS:= $(SUBSRCS)
+
+HEADERS+= attnum.h funcindex.h genam.h hash.h \
+ heapam.h hio.h htup.h ibit.h iqual.h istrat.h \
+ itup.h nbtree.h printtup.h relscan.h rtree.h \
+ sdir.h skey.h strat.h transam.h tupdesc.h tupmacs.h \
+ valid.h xact.h
+
diff --git a/src/backend/access/attnum.h b/src/backend/access/attnum.h
new file mode 100644
index 00000000000..7c999e58e9d
--- /dev/null
+++ b/src/backend/access/attnum.h
@@ -0,0 +1,61 @@
+/*-------------------------------------------------------------------------
+ *
+ * attnum.h--
+ * POSTGRES attribute number definitions.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: attnum.h,v 1.1.1.1 1996/07/09 06:21:08 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef ATTNUM_H
+#define ATTNUM_H
+
+#include "c.h"
+
+/*
+ * user defined attribute numbers start at 1. -ay 2/95
+ */
+typedef int16 AttrNumber;
+
+#define InvalidAttrNumber 0
+
+/* ----------------
+ * support macros
+ * ----------------
+ */
+/*
+ * AttributeNumberIsValid --
+ * True iff the attribute number is valid.
+ */
+#define AttributeNumberIsValid(attributeNumber) \
+ ((bool) ((attributeNumber) != InvalidAttrNumber))
+
+/*
+ * AttrNumberIsForUserDefinedAttr --
+ * True iff the attribute number corresponds to a user-defined attribute.
+ */
+#define AttrNumberIsForUserDefinedAttr(attributeNumber) \
+ ((bool) ((attributeNumber) > 0))
+
+/*
+ * AttrNumberGetAttrOffset --
+ * Returns the attribute offset for an attribute number.
+ *
+ * Note:
+ * Assumes the attribute number is for a user-defined attribute.
+ */
+#define AttrNumberGetAttrOffset(attNum) \
+ (AssertMacro(AttrNumberIsForUserDefinedAttr(attNum)) ? \
+ ((attNum - 1)) : 0)
+
+/*
+ * AttrOffsetGetAttrNumber --
+ * Returns the attribute number for an attribute offset.
+ */
+#define AttrOffsetGetAttrNumber(attributeOffset) \
+ ((AttrNumber) (1 + attributeOffset))
+
+#endif /* ATTNUM_H */
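
The two offset macros above are exact inverses for valid user attribute
numbers. A minimal round-trip sketch, using only the definitions in this
header:

    AttrNumber attnum = 3;                              /* 1-based */
    int offset = AttrNumberGetAttrOffset(attnum);       /* yields 2 */
    AttrNumber back = AttrOffsetGetAttrNumber(offset);  /* yields 3 */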
diff --git a/src/backend/access/common/Makefile.inc b/src/backend/access/common/Makefile.inc
new file mode 100644
index 00000000000..5d5dd476274
--- /dev/null
+++ b/src/backend/access/common/Makefile.inc
@@ -0,0 +1,16 @@
+#-------------------------------------------------------------------------
+#
+# Makefile.inc--
+# Makefile for access/common
+#
+# Copyright (c) 1994, Regents of the University of California
+#
+#
+# IDENTIFICATION
+# $Header: /cvsroot/pgsql/src/backend/access/common/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:09 scrappy Exp $
+#
+#-------------------------------------------------------------------------
+
+SUBSRCS+= heaptuple.c heapvalid.c indextuple.c indexvalid.c printtup.c \
+ scankey.c tupdesc.c
+
diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c
new file mode 100644
index 00000000000..c3e72fb97e8
--- /dev/null
+++ b/src/backend/access/common/heaptuple.c
@@ -0,0 +1,1011 @@
+/*-------------------------------------------------------------------------
+ *
+ * heaptuple.c--
+ * This file contains heap tuple accessor and mutator routines, as well
+ * as assorted tuple utilities.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/common/heaptuple.c,v 1.1.1.1 1996/07/09 06:21:09 scrappy Exp $
+ *
+ * NOTES
+ * The old interface functions have been converted to macros
+ * and moved to heapam.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <string.h>
+
+#include "postgres.h"
+
+#include "access/htup.h"
+#include "access/itup.h"
+#include "access/tupmacs.h"
+#include "access/skey.h"
+#include "storage/ipc.h"
+#include "storage/buf.h"
+#include "storage/bufmgr.h"
+#include "access/transam.h"
+#include "storage/bufpage.h" /* for MAXTUPLEN */
+#include "storage/itemptr.h"
+#include "utils/memutils.h"
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+#include "utils/nabstime.h"
+
+/* this is so the sparcstation debugger works */
+
+#ifndef NO_ASSERT_CHECKING
+#ifdef sparc
+#define register
+#endif /* sparc */
+#endif /* NO_ASSERT_CHECKING */
+
+/* ----------------------------------------------------------------
+ * misc support routines
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------
+ * ComputeDataSize
+ * ----------------
+ */
+Size
+ComputeDataSize(TupleDesc tupleDesc,
+ Datum value[],
+ char nulls[])
+{
+ uint32 length;
+ int i;
+ int numberOfAttributes = tupleDesc->natts;
+ AttributeTupleForm *att = tupleDesc->attrs;
+
+ for (length = 0, i = 0; i < numberOfAttributes; i++) {
+ if (nulls[i] != ' ') continue;
+
+ switch (att[i]->attlen) {
+ case -1:
+ /*
+ * This is the size of the disk representation and so
+ * must include the additional sizeof long.
+ */
+ if (att[i]->attalign == 'd') {
+ length = DOUBLEALIGN(length)
+ + VARSIZE(DatumGetPointer(value[i]));
+ } else {
+ length = INTALIGN(length)
+ + VARSIZE(DatumGetPointer(value[i]));
+ }
+ break;
+ case sizeof(char):
+ length++;
+ break;
+ case sizeof(short):
+ length = SHORTALIGN(length + sizeof(short));
+ break;
+ case sizeof(int32):
+ length = INTALIGN(length + sizeof(int32));
+ break;
+ default:
+ if (att[i]->attlen < sizeof(int32))
+ elog(WARN, "ComputeDataSize: attribute %d has len %d",
+ i, att[i]->attlen);
+ if (att[i]->attalign == 'd')
+ length = DOUBLEALIGN(length) + att[i]->attlen;
+ else
+ length = LONGALIGN(length) + att[i]->attlen;
+ break;
+ }
+ }
+
+ return length;
+}
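
ComputeDataSize leans on the SHORTALIGN/INTALIGN/LONGALIGN/DOUBLEALIGN
macros, which are defined elsewhere (in c.h). As a sketch of the
convention they follow, not this file's own code, each rounds a length
up to the next boundary of the named type:

    /* assumed shape: round LEN up to a multiple of SZ (a power of two) */
    #define EXAMPLE_ALIGN(LEN, SZ) \
        (((long)(LEN) + ((SZ) - 1)) & ~((long)((SZ) - 1)))

    /* EXAMPLE_ALIGN(5, sizeof(int32)) == 8: an int32 written after 5
       bytes of preceding data starts at offset 8 */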
+
+/* ----------------
+ * DataFill
+ * ----------------
+ */
+void
+DataFill(char *data,
+ TupleDesc tupleDesc,
+ Datum value[],
+ char nulls[],
+ char *infomask,
+ bits8 bit[])
+{
+ bits8 *bitP;
+ int bitmask;
+ uint32 length;
+ int i;
+ int numberOfAttributes = tupleDesc->natts;
+ AttributeTupleForm* att = tupleDesc->attrs;
+
+ if (bit != NULL) {
+ bitP = &bit[-1];
+ bitmask = CSIGNBIT;
+ }
+
+ *infomask = 0;
+
+ for (i = 0; i < numberOfAttributes; i++) {
+ if (bit != NULL) {
+ if (bitmask != CSIGNBIT) {
+ bitmask <<= 1;
+ } else {
+ bitP += 1;
+ *bitP = 0x0;
+ bitmask = 1;
+ }
+
+ if (nulls[i] == 'n') {
+ *infomask |= HEAP_HASNULL;
+ continue;
+ }
+
+ *bitP |= bitmask;
+ }
+
+ switch (att[i]->attlen) {
+ case -1:
+ *infomask |= HEAP_HASVARLENA;
+ if (att[i]->attalign=='d') {
+ data = (char *) DOUBLEALIGN(data);
+ } else {
+ data = (char *) INTALIGN(data);
+ }
+ length = VARSIZE(DatumGetPointer(value[i]));
+ memmove(data, DatumGetPointer(value[i]),length);
+ data += length;
+ break;
+ case sizeof(char):
+ *data = att[i]->attbyval ?
+ DatumGetChar(value[i]) : *((char *) value[i]);
+ data += sizeof(char);
+ break;
+ case sizeof(int16):
+ data = (char *) SHORTALIGN(data);
+ * (short *) data = (att[i]->attbyval ?
+ DatumGetInt16(value[i]) :
+ *((short *) value[i]));
+ data += sizeof(short);
+ break;
+ case sizeof(int32):
+ data = (char *) INTALIGN(data);
+ * (int32 *) data = (att[i]->attbyval ?
+ DatumGetInt32(value[i]) :
+ *((int32 *) value[i]));
+ data += sizeof(int32);
+ break;
+ default:
+ if (att[i]->attlen < sizeof(int32))
+ elog(WARN, "DataFill: attribute %d has len %d",
+ i, att[i]->attlen);
+ if (att[i]->attalign == 'd') {
+ data = (char *) DOUBLEALIGN(data);
+ memmove(data, DatumGetPointer(value[i]),
+ att[i]->attlen);
+ data += att[i]->attlen;
+ } else {
+ data = (char *) LONGALIGN(data);
+ memmove(data, DatumGetPointer(value[i]),
+ att[i]->attlen);
+ data += att[i]->attlen;
+ }
+
+ }
+ }
+}
+
+/* ----------------------------------------------------------------
+ * heap tuple interface
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------
+ * heap_attisnull - returns 1 iff tuple attribute is not present
+ * ----------------
+ */
+int
+heap_attisnull(HeapTuple tup, int attnum)
+{
+ if (attnum > (int)tup->t_natts)
+ return (1);
+
+ if (HeapTupleNoNulls(tup)) return(0);
+
+ if (attnum > 0) {
+ return(att_isnull(attnum - 1, tup->t_bits));
+ } else
+ switch (attnum) {
+ case SelfItemPointerAttributeNumber:
+ case ObjectIdAttributeNumber:
+ case MinTransactionIdAttributeNumber:
+ case MinCommandIdAttributeNumber:
+ case MaxTransactionIdAttributeNumber:
+ case MaxCommandIdAttributeNumber:
+ case ChainItemPointerAttributeNumber:
+ case AnchorItemPointerAttributeNumber:
+ case MinAbsoluteTimeAttributeNumber:
+ case MaxAbsoluteTimeAttributeNumber:
+ case VersionTypeAttributeNumber:
+ break;
+
+ case 0:
+ elog(WARN, "heap_attisnull: zero attnum disallowed");
+
+ default:
+ elog(WARN, "heap_attisnull: undefined negative attnum");
+ }
+
+ return (0);
+}
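
heap_attisnull defers to the att_isnull() bit test from htup.h. DataFill
above sets a bit in the null bitmap when an attribute is present, so a
set bit means non-null; a sketch of the test under that convention (an
assumption, since the macro itself is not part of this diff):

    /* bit i set => attribute i present; clear => attribute i is null */
    #define ATT_ISNULL(i, bits) (!((bits)[(i) >> 3] & (1 << ((i) & 0x07))))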
+
+/* ----------------------------------------------------------------
+ * system attribute heap tuple support
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------
+ * heap_sysattrlen
+ *
+ * This routine returns the length of a system attribute.
+ * ----------------
+ */
+int
+heap_sysattrlen(AttrNumber attno)
+{
+ HeapTupleData *f = NULL;
+ int len;
+
+ switch (attno) {
+ case SelfItemPointerAttributeNumber:
+ len = sizeof f->t_ctid;
+ break;
+ case ObjectIdAttributeNumber:
+ len = sizeof f->t_oid;
+ break;
+ case MinTransactionIdAttributeNumber:
+ len = sizeof f->t_xmin;
+ break;
+ case MinCommandIdAttributeNumber:
+ len = sizeof f->t_cmin;
+ break;
+ case MaxTransactionIdAttributeNumber:
+ len = sizeof f->t_xmax;
+ break;
+ case MaxCommandIdAttributeNumber:
+ len = sizeof f->t_cmax;
+ break;
+ case ChainItemPointerAttributeNumber:
+ len = sizeof f->t_chain;
+ break;
+ case AnchorItemPointerAttributeNumber:
+ elog(WARN, "heap_sysattrlen: field t_anchor does not exist!");
+ break;
+ case MinAbsoluteTimeAttributeNumber:
+ len = sizeof f->t_tmin;
+ break;
+ case MaxAbsoluteTimeAttributeNumber:
+ len = sizeof f->t_tmax;
+ break;
+ case VersionTypeAttributeNumber:
+ len = sizeof f->t_vtype;
+ break;
+ default:
+ elog(WARN, "sysattrlen: System attribute number %d unknown.",
+ attno);
+ len = 0;
+ break;
+ }
+ return (len);
+}
+
+/* ----------------
+ * heap_sysattrbyval
+ *
+ * This routine returns the "by-value" property of a system attribute.
+ * ----------------
+ */
+bool
+heap_sysattrbyval(AttrNumber attno)
+{
+ bool byval;
+
+ switch (attno) {
+ case SelfItemPointerAttributeNumber:
+ byval = false;
+ break;
+ case ObjectIdAttributeNumber:
+ byval = true;
+ break;
+ case MinTransactionIdAttributeNumber:
+ byval = true;
+ break;
+ case MinCommandIdAttributeNumber:
+ byval = true;
+ break;
+ case MaxTransactionIdAttributeNumber:
+ byval = true;
+ break;
+ case MaxCommandIdAttributeNumber:
+ byval = true;
+ break;
+ case ChainItemPointerAttributeNumber:
+ byval = false;
+ break;
+ case AnchorItemPointerAttributeNumber:
+ byval = false;
+ break;
+ case MinAbsoluteTimeAttributeNumber:
+ byval = true;
+ break;
+ case MaxAbsoluteTimeAttributeNumber:
+ byval = true;
+ break;
+ case VersionTypeAttributeNumber:
+ byval = true;
+ break;
+ default:
+ byval = true;
+ elog(WARN, "sysattrbyval: System attribute number %d unknown.",
+ attno);
+ break;
+ }
+
+ return byval;
+}
+
+/* ----------------
+ * heap_getsysattr
+ * ----------------
+ */
+char *
+heap_getsysattr(HeapTuple tup, Buffer b, int attnum)
+{
+ switch (attnum) {
+ case SelfItemPointerAttributeNumber:
+ return ((char *)&tup->t_ctid);
+ case ObjectIdAttributeNumber:
+ return ((char *) (long) tup->t_oid);
+ case MinTransactionIdAttributeNumber:
+ return ((char *) (long) tup->t_xmin);
+ case MinCommandIdAttributeNumber:
+ return ((char *) (long) tup->t_cmin);
+ case MaxTransactionIdAttributeNumber:
+ return ((char *) (long) tup->t_xmax);
+ case MaxCommandIdAttributeNumber:
+ return ((char *) (long) tup->t_cmax);
+ case ChainItemPointerAttributeNumber:
+ return ((char *) &tup->t_chain);
+ case AnchorItemPointerAttributeNumber:
+ elog(WARN, "heap_getsysattr: t_anchor does not exist!");
+ break;
+
+ /*
+ * For tmin and tmax, we need to do some extra work. These don't
+ * get filled in until the vacuum cleaner runs (or we manage to flush
+ * a page after setting the value correctly below). If the vacuum
+ * cleaner hasn't run yet, then the times stored in the tuple are
+ * wrong, and we need to look up the commit time of the transaction.
+ * We cache this value in the tuple to avoid doing the work more than
+ * once.
+ */
+
+ case MinAbsoluteTimeAttributeNumber:
+ if (!AbsoluteTimeIsBackwardCompatiblyValid(tup->t_tmin) &&
+ TransactionIdDidCommit(tup->t_xmin))
+ tup->t_tmin = TransactionIdGetCommitTime(tup->t_xmin);
+ return ((char *) (long) tup->t_tmin);
+ case MaxAbsoluteTimeAttributeNumber:
+ if (!AbsoluteTimeIsBackwardCompatiblyReal(tup->t_tmax)) {
+ if (TransactionIdDidCommit(tup->t_xmax))
+ tup->t_tmax = TransactionIdGetCommitTime(tup->t_xmax);
+ else
+ tup->t_tmax = CURRENT_ABSTIME;
+ }
+ return ((char *) (long) tup->t_tmax);
+ case VersionTypeAttributeNumber:
+ return ((char *) (long) tup->t_vtype);
+ default:
+ elog(WARN, "heap_getsysattr: undefined attnum %d", attnum);
+ }
+ return(NULL);
+}
+
+/* ----------------
+ * fastgetattr
+ *
+ * This is a newer version of fastgetattr which attempts to be
+ * faster by caching attribute offsets in the attribute descriptor.
+ *
+ * an alternate way to speed things up would be to cache offsets
+ * with the tuple, but that seems more difficult unless you take
+ * the storage hit of actually putting those offsets into the
+ * tuple you send to disk. Yuck.
+ *
+ * This scheme will be slightly slower than that, but should
+ * perform well for queries which hit large numbers of tuples. After
+ * you cache the offsets once, examining all the other tuples using
+ * the same attribute descriptor will go much quicker. -cim 5/4/91
+ * ----------------
+ */
+char *
+fastgetattr(HeapTuple tup,
+ int attnum,
+ TupleDesc tupleDesc,
+ bool *isnull)
+{
+ char *tp; /* ptr to att in tuple */
+ bits8 *bp; /* ptr to att in tuple */
+ int slow; /* do we have to walk nulls? */
+ AttributeTupleForm *att = tupleDesc->attrs;
+
+ /* ----------------
+ * sanity checks
+ * ----------------
+ */
+
+ Assert(PointerIsValid(isnull));
+ Assert(attnum > 0);
+
+ /* ----------------
+ * Three cases:
+ *
+ * 1: No nulls and no variable length attributes.
+ * 2: Has a null or a varlena AFTER att.
+ * 3: Has nulls or varlenas BEFORE att.
+ * ----------------
+ */
+
+ *isnull = false;
+
+ if (HeapTupleNoNulls(tup)) {
+ attnum--;
+ if (att[attnum]->attcacheoff > 0) {
+ return (char *)
+ fetchatt( &(att[attnum]),
+ (char *)tup + tup->t_hoff + att[attnum]->attcacheoff);
+ } else if (attnum == 0) {
+ /*
+ * first attribute is always at position zero
+ */
+ return((char *) fetchatt(&(att[0]), (char *) tup + tup->t_hoff));
+ }
+
+ tp = (char *) tup + tup->t_hoff;
+
+ slow = 0;
+ } else {
+ /*
+ * there's a null somewhere in the tuple
+ */
+
+ bp = tup->t_bits;
+ tp = (char *) tup + tup->t_hoff;
+ slow = 0;
+ attnum--;
+
+ /* ----------------
+ * check to see if desired att is null
+ * ----------------
+ */
+
+ if (att_isnull(attnum, bp)) {
+ *isnull = true;
+ return NULL;
+ }
+
+ /* ----------------
+ * Now check to see if any preceding bits are null...
+ * ----------------
+ */
+
+ {
+ register int i = 0; /* current offset in bp */
+
+ for (i = 0; i < attnum && !slow; i++) {
+ if (att_isnull(i, bp)) slow = 1;
+ }
+ }
+ }
+
+ /*
+ * now check for any non-fixed length attrs before our attribute
+ */
+ if (!slow) {
+ if (att[attnum]->attcacheoff > 0) {
+ return (char *)
+ fetchatt(&(att[attnum]),
+ tp + att[attnum]->attcacheoff);
+ } else if (attnum == 0) {
+ return (char *)
+ fetchatt(&(att[0]), (char *) tup + tup->t_hoff);
+ } else if (!HeapTupleAllFixed(tup)) {
+ register int j = 0;
+
+ for (j = 0; j < attnum && !slow; j++)
+ if (att[j]->attlen < 1) slow = 1;
+ }
+ }
+
+ /*
+ * if slow is zero, and we got here, we know that we have a tuple with
+ * no nulls. We also have to initialize the remainder of
+ * the attribute cached offset values.
+ */
+ if (!slow) {
+ register int j = 1;
+ register long off;
+
+ /*
+ * need to set cache for some atts
+ */
+
+ att[0]->attcacheoff = 0;
+
+ while (att[j]->attcacheoff > 0) j++;
+
+ off = att[j-1]->attcacheoff + att[j-1]->attlen;
+
+ for (; j < attnum + 1; j++) {
+ switch(att[j]->attlen) {
+ case -1:
+ off = (att[j]->attalign=='d') ?
+ DOUBLEALIGN(off) : INTALIGN(off);
+ break;
+ case sizeof(char):
+ break;
+ case sizeof(short):
+ off = SHORTALIGN(off);
+ break;
+ case sizeof(int32):
+ off = INTALIGN(off);
+ break;
+ default:
+ if (att[j]->attlen < sizeof(int32)) {
+ elog(WARN,
+ "fastgetattr: attribute %d has len %d",
+ j, att[j]->attlen);
+ }
+ if (att[j]->attalign == 'd')
+ off = DOUBLEALIGN(off);
+ else
+ off = LONGALIGN(off);
+ break;
+ }
+
+ att[j]->attcacheoff = off;
+ off += att[j]->attlen;
+ }
+
+ return
+ (char *)fetchatt(&(att[attnum]), tp + att[attnum]->attcacheoff);
+ } else {
+ register bool usecache = true;
+ register int off = 0;
+ register int i;
+
+ /*
+ * Now we know that we have to walk the tuple CAREFULLY.
+ *
+ * Note - This loop is a little tricky. On iteration i we
+ * first set the offset for attribute i and figure out how much
+ * the offset should be incremented. Finally, we need to align the
+ * offset based on the size of attribute i+1 (for which the offset
+ * has been computed). -mer 12 Dec 1991
+ */
+
+ for (i = 0; i < attnum; i++) {
+ if (!HeapTupleNoNulls(tup)) {
+ if (att_isnull(i, bp)) {
+ usecache = false;
+ continue;
+ }
+ }
+ switch (att[i]->attlen) {
+ case -1:
+ off = (att[i]->attalign=='d') ?
+ DOUBLEALIGN(off) : INTALIGN(off);
+ break;
+ case sizeof(char):
+ break;
+ case sizeof(short):
+ off = SHORTALIGN(off);
+ break;
+ case sizeof(int32):
+ off = INTALIGN(off);
+ break;
+ default:
+ if (att[i]->attlen < sizeof(int32))
+ elog(WARN,
+ "fastgetattr2: attribute %d has len %d",
+ i, att[i]->attlen);
+ if (att[i]->attalign == 'd')
+ off = DOUBLEALIGN(off);
+ else
+ off = LONGALIGN(off);
+ break;
+ }
+ if (usecache && att[i]->attcacheoff > 0) {
+ off = att[i]->attcacheoff;
+ if (att[i]->attlen == -1) {
+ usecache = false;
+ }
+ } else {
+ if (usecache) att[i]->attcacheoff = off;
+ }
+
+ switch(att[i]->attlen) {
+ case sizeof(char):
+ off++;
+ break;
+ case sizeof(int16):
+ off += sizeof(int16);
+ break;
+ case sizeof(int32):
+ off += sizeof(int32);
+ break;
+ case -1:
+ usecache = false;
+ off += VARSIZE(tp + off);
+ break;
+ default:
+ off += att[i]->attlen;
+ break;
+ }
+ }
+ switch (att[attnum]->attlen) {
+ case -1:
+ off = (att[attnum]->attalign=='d')?
+ DOUBLEALIGN(off) : INTALIGN(off);
+ break;
+ case sizeof(char):
+ break;
+ case sizeof(short):
+ off = SHORTALIGN(off);
+ break;
+ case sizeof(int32):
+ off = INTALIGN(off);
+ break;
+ default:
+ if (att[attnum]->attlen < sizeof(int32))
+ elog(WARN, "fastgetattr3: attribute %d has len %d",
+ attnum, att[attnum]->attlen);
+ if (att[attnum]->attalign == 'd')
+ off = DOUBLEALIGN(off);
+ else
+ off = LONGALIGN(off);
+ break;
+ }
+ return((char *) fetchatt(&(att[attnum]), tp + off));
+ }
+}
+
+/* ----------------
+ * heap_getattr
+ *
+ * returns an attribute from a heap tuple. Uses fastgetattr for
+ * user attributes and heap_getsysattr for system attributes.
+ * ----------------
+ */
+char *
+heap_getattr(HeapTuple tup,
+ Buffer b,
+ int attnum,
+ TupleDesc tupleDesc,
+ bool *isnull)
+{
+ bool localIsNull;
+
+ /* ----------------
+ * sanity checks
+ * ----------------
+ */
+ Assert(tup != NULL);
+
+ if (! PointerIsValid(isnull))
+ isnull = &localIsNull;
+
+ if (attnum > (int) tup->t_natts) {
+ *isnull = true;
+ return ((char *) NULL);
+ }
+
+ /* ----------------
+ * take care of user defined attributes
+ * ----------------
+ */
+ if (attnum > 0) {
+ char *datum;
+ datum = fastgetattr(tup, attnum, tupleDesc, isnull);
+
+ return (datum);
+ }
+
+ /* ----------------
+ * take care of system attributes
+ * ----------------
+ */
+ *isnull = false;
+ return
+ heap_getsysattr(tup, b, attnum);
+}
+
+/* ----------------
+ * heap_copytuple
+ *
+ * returns a copy of an entire tuple
+ * ----------------
+ */
+HeapTuple
+heap_copytuple(HeapTuple tuple)
+{
+ HeapTuple newTuple;
+
+ if (! HeapTupleIsValid(tuple))
+ return (NULL);
+
+ /* XXX For now, just prevent an undetectable executor related error */
+ if (tuple->t_len > MAXTUPLEN) {
+ elog(WARN, "palloctup: cannot handle length %d tuples",
+ tuple->t_len);
+ }
+
+ newTuple = (HeapTuple) palloc(tuple->t_len);
+ memmove((char *) newTuple, (char *) tuple, (int) tuple->t_len);
+ return(newTuple);
+}
+
+/* ----------------
+ * heap_deformtuple
+ *
+ * the inverse of heap_formtuple (see below)
+ * ----------------
+ */
+void
+heap_deformtuple(HeapTuple tuple,
+ TupleDesc tdesc,
+ Datum values[],
+ char nulls[])
+{
+ int i;
+ int natts;
+
+ Assert(HeapTupleIsValid(tuple));
+
+ natts = tuple->t_natts;
+ for (i = 0; i<natts; i++) {
+ bool isnull;
+
+ values[i] = (Datum)heap_getattr(tuple,
+ InvalidBuffer,
+ i+1,
+ tdesc,
+ &isnull);
+ if (isnull)
+ nulls[i] = 'n';
+ else
+ nulls[i] = ' ';
+ }
+}
+
+/* ----------------
+ * heap_formtuple
+ *
+ * constructs a tuple from the given value[] and null[] arrays
+ *
+ * old comments
+ * Handles alignment by aligning 2 byte attributes on short boundaries
+ * and 3 or 4 byte attributes on long word boundaries on a vax; and
+ * aligning non-byte attributes on short boundaries on a sun. Does
+ * not properly align fixed length arrays of 1 or 2 byte types (yet).
+ *
+ * Null attributes are indicated by a 'n' in the appropriate byte
+ * of the null[]. Non-null attributes are indicated by a ' ' (space).
+ *
+ * Fix me. (Figure that must keep context if debug--allow give oid.)
+ * Assumes in order.
+ * ----------------
+ */
+HeapTuple
+heap_formtuple(TupleDesc tupleDescriptor,
+ Datum value[],
+ char nulls[])
+{
+ char *tp; /* tuple pointer */
+ HeapTuple tuple; /* return tuple */
+ int bitmaplen;
+ long len;
+ int hoff;
+ bool hasnull = false;
+ int i;
+ int numberOfAttributes = tupleDescriptor->natts;
+
+ len = sizeof *tuple - sizeof tuple->t_bits;
+
+ for (i = 0; i < numberOfAttributes && !hasnull; i++) {
+ if (nulls[i] != ' ') hasnull = true;
+ }
+
+ if (numberOfAttributes > MaxHeapAttributeNumber)
+ elog(WARN, "heap_formtuple: numberOfAttributes of %d > %d",
+ numberOfAttributes, MaxHeapAttributeNumber);
+
+ if (hasnull) {
+ bitmaplen = BITMAPLEN(numberOfAttributes);
+ len += bitmaplen;
+ }
+
+ hoff = len = DOUBLEALIGN(len); /* be conservative here */
+
+ len += ComputeDataSize(tupleDescriptor, value, nulls);
+
+ tp = (char *) palloc(len);
+ tuple = (HeapTuple) tp;
+
+ memset(tp, 0, (int)len);
+
+ tuple->t_len = len;
+ tuple->t_natts = numberOfAttributes;
+ tuple->t_hoff = hoff;
+ tuple->t_tmin = INVALID_ABSTIME;
+ tuple->t_tmax = CURRENT_ABSTIME;
+
+ DataFill((char *)tuple + tuple->t_hoff,
+ tupleDescriptor,
+ value,
+ nulls,
+ &tuple->t_infomask,
+ (hasnull ? tuple->t_bits : NULL));
+
+ return (tuple);
+}
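
A usage sketch for heap_formtuple, following the ' '/'n' null convention
documented above; the descriptor td and the attribute values here are
hypothetical:

    Datum     values[2];
    char      nulls[2];
    HeapTuple tup;

    values[0] = Int32GetDatum(42);      /* present */
    nulls[0]  = ' ';
    values[1] = (Datum) 0;              /* null: value is ignored */
    nulls[1]  = 'n';

    tup = heap_formtuple(td, values, nulls);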
+
+/* ----------------
+ * heap_modifytuple
+ *
+ * forms a new tuple from an old tuple and a set of replacement values.
+ * ----------------
+ */
+HeapTuple
+heap_modifytuple(HeapTuple tuple,
+ Buffer buffer,
+ Relation relation,
+ Datum replValue[],
+ char replNull[],
+ char repl[])
+{
+ int attoff;
+ int numberOfAttributes;
+ Datum *value;
+ char *nulls;
+ bool isNull;
+ HeapTuple newTuple;
+ int madecopy;
+ uint8 infomask;
+
+ /* ----------------
+ * sanity checks
+ * ----------------
+ */
+ Assert(HeapTupleIsValid(tuple));
+ Assert(BufferIsValid(buffer) || RelationIsValid(relation));
+ Assert(HeapTupleIsValid(tuple));
+ Assert(PointerIsValid(replValue));
+ Assert(PointerIsValid(replNull));
+ Assert(PointerIsValid(repl));
+
+ /* ----------------
+ * if we're pointing to a disk page, then first
+ * make a copy of our tuple so that all the attributes
+ * are available. XXX this is inefficient -cim
+ * ----------------
+ */
+ madecopy = 0;
+ if (BufferIsValid(buffer) == true) {
+ relation = (Relation) BufferGetRelation(buffer);
+ tuple = heap_copytuple(tuple);
+ madecopy = 1;
+ }
+
+ numberOfAttributes = RelationGetRelationTupleForm(relation)->relnatts;
+
+ /* ----------------
+ * allocate and fill value[] and nulls[] arrays from either
+ * the tuple or the repl information, as appropriate.
+ * ----------------
+ */
+ value = (Datum *) palloc(numberOfAttributes * sizeof *value);
+ nulls = (char *) palloc(numberOfAttributes * sizeof *nulls);
+
+ for (attoff = 0;
+ attoff < numberOfAttributes;
+ attoff += 1) {
+
+ if (repl[attoff] == ' ') {
+ char *attr;
+
+ attr =
+ heap_getattr(tuple,
+ InvalidBuffer,
+ AttrOffsetGetAttrNumber(attoff),
+ RelationGetTupleDescriptor(relation),
+ &isNull) ;
+ value[attoff] = PointerGetDatum(attr);
+ nulls[attoff] = (isNull) ? 'n' : ' ';
+
+ } else if (repl[attoff] != 'r') {
+ elog(WARN, "heap_modifytuple: repl is \\%3d", repl[attoff]);
+
+ } else { /* == 'r' */
+ value[attoff] = replValue[attoff];
+ nulls[attoff] = replNull[attoff];
+ }
+ }
+
+ /* ----------------
+ * create a new tuple from the values[] and nulls[] arrays
+ * ----------------
+ */
+ newTuple = heap_formtuple(RelationGetTupleDescriptor(relation),
+ value,
+ nulls);
+
+ /* ----------------
+ * copy the header except for t_len, t_natts, t_hoff, t_bits, t_infomask
+ * ----------------
+ */
+ infomask = newTuple->t_infomask;
+ memmove((char *) &newTuple->t_ctid, /*XXX*/
+ (char *) &tuple->t_ctid,
+ ((char *) &tuple->t_hoff - (char *) &tuple->t_ctid)); /*XXX*/
+ newTuple->t_infomask = infomask;
+ newTuple->t_natts = numberOfAttributes; /* fix t_natts just in case */
+
+ /* ----------------
+ * if we made a copy of the tuple, then free it.
+ * ----------------
+ */
+ if (madecopy)
+ pfree(tuple);
+
+ return
+ newTuple;
+}
+
+/* ----------------------------------------------------------------
+ * other misc functions
+ * ----------------------------------------------------------------
+ */
+
+HeapTuple
+heap_addheader(uint32 natts, /* max domain index */
+ int structlen, /* its length */
+ char *structure) /* pointer to the struct */
+{
+ register char *tp; /* tuple data pointer */
+ HeapTuple tup;
+ long len;
+ int hoff;
+
+ AssertArg(natts > 0);
+
+ len = sizeof (HeapTupleData) - sizeof (tup->t_bits);
+
+ hoff = len = DOUBLEALIGN(len); /* be conservative */
+ len += structlen;
+ tp = (char *) palloc(len);
+ tup = (HeapTuple) tp;
+ memset((char*)tup, 0, len);
+
+ tup->t_len = (short) len; /* XXX */
+ tp += tup->t_hoff = hoff;
+ tup->t_natts = natts;
+ tup->t_infomask = 0;
+
+ memmove(tp, structure, structlen);
+
+ return (tup);
+}
diff --git a/src/backend/access/common/heapvalid.c b/src/backend/access/common/heapvalid.c
new file mode 100644
index 00000000000..b80c5dd9eb0
--- /dev/null
+++ b/src/backend/access/common/heapvalid.c
@@ -0,0 +1,134 @@
+/*-------------------------------------------------------------------------
+ *
+ * heapvalid.c--
+ * heap tuple qualification validity checking code
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/common/Attic/heapvalid.c,v 1.1.1.1 1996/07/09 06:21:09 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#include "access/htup.h"
+#include "access/skey.h"
+#include "access/heapam.h"
+#include "utils/tqual.h"
+#include "access/valid.h" /* where the declarations go */
+#include "access/xact.h"
+
+#include "storage/buf.h"
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+#include "storage/itemid.h"
+#include "fmgr.h"
+#include "utils/elog.h"
+#include "utils/rel.h"
+
+/* ----------------
+ * heap_keytest
+ *
+ * Test a heap tuple with respect to a scan key.
+ * ----------------
+ */
+bool
+heap_keytest(HeapTuple t,
+ TupleDesc tupdesc,
+ int nkeys,
+ ScanKey keys)
+{
+ bool isnull;
+ Datum atp;
+ int test;
+
+ for (; nkeys--; keys++) {
+ atp = (Datum)heap_getattr(t, InvalidBuffer,
+ keys->sk_attno,
+ tupdesc,
+ &isnull);
+
+ if (isnull)
+ /* XXX eventually should check if SK_ISNULL */
+ return false;
+
+ if (keys->sk_flags & SK_COMMUTE)
+ test = (long) FMGR_PTR2(keys->sk_func, keys->sk_procedure,
+ keys->sk_argument, atp);
+ else
+ test = (long) FMGR_PTR2(keys->sk_func, keys->sk_procedure,
+ atp, keys->sk_argument);
+
+ if (!test == !(keys->sk_flags & SK_NEGATE))
+ return false;
+ }
+
+ return true;
+}
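
The !test == !(keys->sk_flags & SK_NEGATE) return test is terse:
normalizing both sides with ! reduces it to a boolean equality. The four
cases, written out:

    /*
     *  test   SK_NEGATE   !test == !negate   outcome
     *  !=0      clear        0 == 1  false   key passes, loop continues
     *   0       clear        1 == 1  true    return false
     *  !=0      set          0 == 0  true    return false (negated op matched)
     *   0       set          1 == 0  false   key passes
     */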
+
+/* ----------------
+ * heap_tuple_satisfies
+ *
+ * Returns a valid HeapTuple if it satisfies the timequal and keytest.
+ * Returns NULL otherwise. Used to be heap_satisifies (sic) which
+ * returned a boolean. It now returns a tuple so that we can avoid doing two
+ * PageGetItem's per tuple.
+ *
+ * Complete check of validity including LP_CTUP and keytest.
+ * This should perhaps be combined with valid somehow in the
+ * future. (Also, additional rule tests/time range tests.)
+ *
+ * on 8/21/92 mao says: i rearranged the tests here to do keytest before
+ * SatisfiesTimeQual. profiling indicated that even for vacuumed relations,
+ * time qual checking was more expensive than key testing. time qual is
+ * least likely to fail, too. we should really add the time qual test to
+ * the restriction and optimize it in the normal way. this has interactions
+ * with joey's expensive function work.
+ * ----------------
+ */
+HeapTuple
+heap_tuple_satisfies(ItemId itemId,
+ Relation relation,
+ PageHeader disk_page,
+ TimeQual qual,
+ int nKeys,
+ ScanKey key)
+{
+ HeapTuple tuple;
+ bool res;
+
+ if (! ItemIdIsUsed(itemId))
+ return NULL;
+
+ tuple = (HeapTuple) PageGetItem((Page) disk_page, itemId);
+
+ if (key != NULL)
+ res = heap_keytest(tuple, RelationGetTupleDescriptor(relation),
+ nKeys, key);
+ else
+ res = TRUE;
+
+ if (res && (relation->rd_rel->relkind == RELKIND_UNCATALOGED
+ || HeapTupleSatisfiesTimeQual(tuple,qual)))
+ return tuple;
+
+ return (HeapTuple) NULL;
+}
+
+/*
+ * TupleUpdatedByCurXactAndCmd() -- Returns true if this tuple has
+ * already been updated once by the current transaction/command
+ * pair.
+ */
+bool
+TupleUpdatedByCurXactAndCmd(HeapTuple t)
+{
+ if (TransactionIdEquals(t->t_xmax,
+ GetCurrentTransactionId()) &&
+ t->t_cmax == GetCurrentCommandId())
+ return true;
+
+ return false;
+}
diff --git a/src/backend/access/common/indextuple.c b/src/backend/access/common/indextuple.c
new file mode 100644
index 00000000000..be5d2ccbd96
--- /dev/null
+++ b/src/backend/access/common/indextuple.c
@@ -0,0 +1,427 @@
+/*-------------------------------------------------------------------------
+ *
+ * indextuple.c--
+ * This file contains index tuple accessor and mutator routines,
+ * as well as assorted tuple utilities.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/common/indextuple.c,v 1.1.1.1 1996/07/09 06:21:09 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <string.h>
+
+#include "c.h"
+#include "access/ibit.h"
+#include "access/itup.h" /* where the declarations go */
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/tupdesc.h"
+#include "access/tupmacs.h"
+
+#include "storage/itemptr.h"
+#include "utils/elog.h"
+#include "utils/palloc.h"
+
+static Size IndexInfoFindDataOffset(unsigned short t_info);
+
+/* ----------------------------------------------------------------
+ * index_ tuple interface routines
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------
+ * index_formtuple
+ * ----------------
+ */
+IndexTuple
+index_formtuple(TupleDesc tupleDescriptor,
+ Datum value[],
+ char null[])
+{
+ register char *tp; /* tuple pointer */
+ IndexTuple tuple; /* return tuple */
+ Size size, hoff;
+ int i;
+ unsigned short infomask = 0;
+ bool hasnull = false;
+ char tupmask = 0;
+ int numberOfAttributes = tupleDescriptor->natts;
+
+ if (numberOfAttributes > MaxIndexAttributeNumber)
+ elog(WARN, "index_formtuple: numberOfAttributes of %d > %d",
+ numberOfAttributes, MaxIndexAttributeNumber);
+
+
+ for (i = 0; i < numberOfAttributes && !hasnull; i++) {
+ if (null[i] != ' ') hasnull = true;
+ }
+
+ if (hasnull) infomask |= INDEX_NULL_MASK;
+
+ hoff = IndexInfoFindDataOffset(infomask);
+ size = hoff
+ + ComputeDataSize(tupleDescriptor,
+ value, null);
+ size = DOUBLEALIGN(size); /* be conservative */
+
+ tp = (char *) palloc(size);
+ tuple = (IndexTuple) tp;
+ memset(tp,0,(int)size);
+
+ DataFill((char *)tp + hoff,
+ tupleDescriptor,
+ value,
+ null,
+ &tupmask,
+ (hasnull ? (bits8*)tp + sizeof(*tuple) : NULL));
+
+ /*
+ * We do this because DataFill wants to initialize a "tupmask" which
+ * is used for HeapTuples, but we want an indextuple infomask. The only
+ * "relevent" info is the "has variable attributes" field, which is in
+ * mask position 0x02. We have already set the null mask above.
+ */
+
+ if (tupmask & 0x02) infomask |= INDEX_VAR_MASK;
+
+ /*
+ * Here we make sure that we can actually hold the size. We also want
+ * to make sure that size is not aligned oddly. This actually is a
+ * rather odd way to make sure the size is not too large overall.
+ */
+
+ if (size & 0xE000)
+ elog(WARN, "index_formtuple: data takes %d bytes: too big", size);
+
+
+ infomask |= size;
+
+ /* ----------------
+ * initialize metadata
+ * ----------------
+ */
+ tuple->t_info = infomask;
+ return (tuple);
+}
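
Because of the final infomask |= size, an index tuple's total size and
its flag bits share the single 16-bit t_info word, which is why the
0xE000 overflow check rejects any size needing the top three bits. A
sketch of the implied layout (the size-mask value is an assumption, not
taken from this diff):

    /*
     * implied t_info layout:
     *   bits 15..13   flags (null bitmap present, has varlenas, ...)
     *   bits 12..0    total tuple size in bytes
     */
    Size sz = (Size) (tuple->t_info & 0x1FFF);  /* assumed size mask */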
+
+/* ----------------
+ * fastgetiattr
+ *
+ * This is a newer version of fastgetiattr which attempts to be
+ * faster by caching attribute offsets in the attribute descriptor.
+ *
+ * an alternate way to speed things up would be to cache offsets
+ * with the tuple, but that seems more difficult unless you take
+ * the storage hit of actually putting those offsets into the
+ * tuple you send to disk. Yuck.
+ *
+ * This scheme will be slightly slower than that, but should
+ * perform well for queries which hit large numbers of tuples. After
+ * you cache the offsets once, examining all the other tuples using
+ * the same attribute descriptor will go much quicker. -cim 5/4/91
+ * ----------------
+ */
+char *
+fastgetiattr(IndexTuple tup,
+ int attnum,
+ TupleDesc tupleDesc,
+ bool *isnull)
+{
+ register char *tp; /* ptr to att in tuple */
+ register char *bp; /* ptr to att in tuple */
+ int slow; /* do we have to walk nulls? */
+ register int data_off; /* tuple data offset */
+
+ /* ----------------
+ * sanity checks
+ * ----------------
+ */
+
+ Assert(PointerIsValid(isnull));
+ Assert(attnum > 0);
+
+ /* ----------------
+ * Three cases:
+ *
+ * 1: No nulls and no variable length attributes.
+ * 2: Has a null or a varlena AFTER att.
+ * 3: Has nulls or varlenas BEFORE att.
+ * ----------------
+ */
+
+ *isnull = false;
+ data_off = IndexTupleHasMinHeader(tup) ? sizeof *tup :
+ IndexInfoFindDataOffset(tup->t_info);
+
+ if (IndexTupleNoNulls(tup)) {
+
+ /* first attribute is always at position zero */
+
+ if (attnum == 1) {
+ return(fetchatt(&(tupleDesc->attrs[0]), (char *) tup + data_off));
+ }
+ attnum--;
+
+ if (tupleDesc->attrs[attnum]->attcacheoff > 0) {
+ return(fetchatt(&(tupleDesc->attrs[attnum]),
+ (char *) tup + data_off +
+ tupleDesc->attrs[attnum]->attcacheoff));
+ }
+
+ tp = (char *) tup + data_off;
+
+ slow = 0;
+ }else { /* there's a null somewhere in the tuple */
+
+ bp = (char *) tup + sizeof(*tup); /* "knows" t_bits are here! */
+ slow = 0;
+ /* ----------------
+ * check to see if desired att is null
+ * ----------------
+ */
+
+ attnum--;
+ {
+ if (att_isnull(attnum, bp)) {
+ *isnull = true;
+ return NULL;
+ }
+ }
+ /* ----------------
+ * Now check to see if any preceding bits are null...
+ * ----------------
+ */
+ {
+ register int i = 0; /* current offset in bp */
+ register int mask; /* bit in byte we're looking at */
+ register char n; /* current byte in bp */
+ register int byte, finalbit;
+
+ byte = attnum >> 3;
+ finalbit = attnum & 0x07;
+
+ for (; i <= byte; i++) {
+ n = bp[i];
+ if (i < byte) {
+ /* check for nulls in any "earlier" bytes */
+ if ((~n) != 0) {
+ slow++;
+ break;
+ }
+ } else {
+ /* check for nulls "before" final bit of last byte*/
+ mask = (1 << finalbit) - 1;
+ if ((~n) & mask)
+ slow++;
+ }
+ }
+ }
+ tp = (char *) tup + data_off;
+ }
+
+ /* now check for any non-fixed length attrs before our attribute */
+
+ if (!slow) {
+ if (tupleDesc->attrs[attnum]->attcacheoff > 0) {
+ return(fetchatt(&(tupleDesc->attrs[attnum]),
+ tp + tupleDesc->attrs[attnum]->attcacheoff));
+ }else if (!IndexTupleAllFixed(tup)) {
+ register int j = 0;
+
+ for (j = 0; j < attnum && !slow; j++)
+ if (tupleDesc->attrs[j]->attlen < 1) slow = 1;
+ }
+ }
+
+ /*
+ * if slow is zero, and we got here, we know that we have a tuple with
+ * no nulls. We also know that we have to initialize the remainder of
+ * the attribute cached offset values.
+ */
+
+ if (!slow) {
+ register int j = 1;
+ register long off;
+
+ /*
+ * need to set cache for some atts
+ */
+
+ tupleDesc->attrs[0]->attcacheoff = 0;
+
+ while (tupleDesc->attrs[j]->attcacheoff > 0) j++;
+
+ off = tupleDesc->attrs[j-1]->attcacheoff +
+ tupleDesc->attrs[j-1]->attlen;
+
+ for (; j < attnum + 1; j++) {
+ /*
+ * Fix me when going to a machine with more than a four-byte
+ * word!
+ */
+
+ switch(tupleDesc->attrs[j]->attlen)
+ {
+ case -1:
+ off = (tupleDesc->attrs[j]->attalign=='d')?
+ DOUBLEALIGN(off):INTALIGN(off);
+ break;
+ case sizeof(char):
+ break;
+ case sizeof(short):
+ off = SHORTALIGN(off);
+ break;
+ case sizeof(int32):
+ off = INTALIGN(off);
+ break;
+ default:
+ if (tupleDesc->attrs[j]->attlen > sizeof(int32))
+ off = (tupleDesc->attrs[j]->attalign=='d')?
+ DOUBLEALIGN(off) : LONGALIGN(off);
+ else
+ elog(WARN, "fastgetiattr: attribute %d has len %d",
+ j, tupleDesc->attrs[j]->attlen);
+ break;
+
+ }
+
+ tupleDesc->attrs[j]->attcacheoff = off;
+ off += tupleDesc->attrs[j]->attlen;
+ }
+
+ return(fetchatt( &(tupleDesc->attrs[attnum]),
+ tp + tupleDesc->attrs[attnum]->attcacheoff));
+ }else {
+ register bool usecache = true;
+ register int off = 0;
+ register int i;
+
+ /*
+ * Now we know that we have to walk the tuple CAREFULLY.
+ */
+
+ for (i = 0; i < attnum; i++) {
+ if (!IndexTupleNoNulls(tup)) {
+ if (att_isnull(i, bp)) {
+ usecache = false;
+ continue;
+ }
+ }
+
+ if (usecache && tupleDesc->attrs[i]->attcacheoff > 0) {
+ off = tupleDesc->attrs[i]->attcacheoff;
+ if (tupleDesc->attrs[i]->attlen == -1)
+ usecache = false;
+ else
+ continue;
+ }
+
+ if (usecache) tupleDesc->attrs[i]->attcacheoff = off;
+ switch(tupleDesc->attrs[i]->attlen)
+ {
+ case sizeof(char):
+ off++;
+ break;
+ case sizeof(short):
+ off = SHORTALIGN(off) + sizeof(short);
+ break;
+ case -1:
+ usecache = false;
+ off = (tupleDesc->attrs[i]->attalign=='d')?
+ DOUBLEALIGN(off):INTALIGN(off);
+ off += VARSIZE(tp + off);
+ break;
+ default:
+ if (tupleDesc->attrs[i]->attlen > sizeof(int32))
+ off = (tupleDesc->attrs[i]->attalign=='d') ?
+ DOUBLEALIGN(off) + tupleDesc->attrs[i]->attlen :
+ LONGALIGN(off) + tupleDesc->attrs[i]->attlen;
+ else
+ elog(WARN, "fastgetiattr2: attribute %d has len %d",
+ i, tupleDesc->attrs[i]->attlen);
+
+ break;
+ }
+ }
+
+ return(fetchatt(&tupleDesc->attrs[attnum], tp + off));
+ }
+}
+
+/* ----------------
+ * index_getattr
+ * ----------------
+ */
+Datum
+index_getattr(IndexTuple tuple,
+ AttrNumber attNum,
+ TupleDesc tupDesc,
+ bool *isNullOutP)
+{
+ Assert (attNum > 0);
+
+ return (Datum)
+ fastgetiattr(tuple, attNum, tupDesc, isNullOutP);
+}
+
+RetrieveIndexResult
+FormRetrieveIndexResult(ItemPointer indexItemPointer,
+ ItemPointer heapItemPointer)
+{
+ RetrieveIndexResult result;
+
+ Assert(ItemPointerIsValid(indexItemPointer));
+ Assert(ItemPointerIsValid(heapItemPointer));
+
+ result = (RetrieveIndexResult) palloc(sizeof *result);
+
+ result->index_iptr = *indexItemPointer;
+ result->heap_iptr = *heapItemPointer;
+
+ return (result);
+}
+
+/*
+ * Takes an infomask as argument (primarily because this needs to be usable
+ * at index_formtuple time so enough space is allocated).
+ *
+ * Change me if adding an attribute to IndexTuples!!!!!!!!!!!
+ */
+static Size
+IndexInfoFindDataOffset(unsigned short t_info)
+{
+ if (!(t_info & INDEX_NULL_MASK))
+ return((Size) sizeof(IndexTupleData));
+ else {
+ Size size = sizeof(IndexTupleData);
+
+ if (t_info & INDEX_NULL_MASK) {
+ size += sizeof(IndexAttributeBitMapData);
+ }
+ return DOUBLEALIGN(size); /* be conservative */
+ }
+}
+
+/*
+ * Copies source into target. If *target == NULL, we palloc space; otherwise
+ * we assume we have space that is already palloc'ed.
+ */
+void
+CopyIndexTuple(IndexTuple source, IndexTuple *target)
+{
+ Size size;
+ IndexTuple ret;
+
+ size = IndexTupleSize(source);
+ if (*target == NULL) {
+ *target = (IndexTuple) palloc(size);
+ }
+
+ ret = *target;
+ memmove((char*)ret, (char*)source, size);
+}
+
diff --git a/src/backend/access/common/indexvalid.c b/src/backend/access/common/indexvalid.c
new file mode 100644
index 00000000000..b437718cecc
--- /dev/null
+++ b/src/backend/access/common/indexvalid.c
@@ -0,0 +1,84 @@
+/*-------------------------------------------------------------------------
+ *
+ * indexvalid.c--
+ * index tuple qualification validity checking code
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/common/Attic/indexvalid.c,v 1.1.1.1 1996/07/09 06:21:09 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "executor/execdebug.h"
+#include "access/genam.h"
+#include "access/iqual.h" /* where the declarations go */
+#include "access/itup.h"
+#include "access/skey.h"
+
+#include "storage/buf.h"
+#include "storage/bufpage.h"
+#include "storage/itemid.h"
+#include "utils/rel.h"
+
+/* ----------------------------------------------------------------
+ * index scan key qualification code
+ * ----------------------------------------------------------------
+ */
+int NIndexTupleProcessed;
+
+/* ----------------
+ * index_keytest
+ *
+ * old comments
+ * May eventually combine with other tests (like timeranges)?
+ * Should have Buffer buffer; as an argument and pass it to amgetattr.
+ * ----------------
+ */
+bool
+index_keytest(IndexTuple tuple,
+ TupleDesc tupdesc,
+ int scanKeySize,
+ ScanKey key)
+{
+ bool isNull;
+ Datum datum;
+ int test;
+
+ IncrIndexProcessed();
+
+ while (scanKeySize > 0) {
+ datum = index_getattr(tuple,
+ 1,
+ tupdesc,
+ &isNull);
+
+ if (isNull) {
+ /* XXX eventually should check if SK_ISNULL */
+ return (false);
+ }
+
+ if (key[0].sk_flags & SK_COMMUTE) {
+ test = (int) (*(key[0].sk_func))
+ (DatumGetPointer(key[0].sk_argument),
+ datum);
+ } else {
+ test = (int) (*(key[0].sk_func))
+ (datum,
+ DatumGetPointer(key[0].sk_argument));
+ }
+
+ if (!test == !(key[0].sk_flags & SK_NEGATE)) {
+ return (false);
+ }
+
+ scanKeySize -= 1;
+ key++;
+ }
+
+ return (true);
+}
+
diff --git a/src/backend/access/common/printtup.c b/src/backend/access/common/printtup.c
new file mode 100644
index 00000000000..556b73b9dfd
--- /dev/null
+++ b/src/backend/access/common/printtup.c
@@ -0,0 +1,306 @@
+/*-------------------------------------------------------------------------
+ *
+ * printtup.c--
+ * Routines to print out tuples to the destination (binary or non-binary
+ * portals, frontend/interactive backend, etc.).
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/common/printtup.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <sys/file.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "postgres.h"
+
+#include "access/heapam.h"
+#include "access/htup.h"
+#include "access/skey.h"
+#include "access/printtup.h"
+#include "access/tupdesc.h"
+#include "storage/buf.h"
+#include "utils/memutils.h"
+#include "utils/palloc.h"
+#include "fmgr.h"
+#include "utils/elog.h"
+
+#include "utils/syscache.h"
+#include "catalog/pg_type.h"
+
+#include "libpq/libpq.h"
+
+/* ----------------------------------------------------------------
+ * printtup / debugtup support
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------
+ * typtoout - used by printtup and debugtup
+ * ----------------
+ */
+Oid
+typtoout(Oid type)
+{
+ HeapTuple typeTuple;
+
+ typeTuple = SearchSysCacheTuple(TYPOID,
+ ObjectIdGetDatum(type),
+ 0, 0, 0);
+
+ if (HeapTupleIsValid(typeTuple))
+ return((Oid)
+ ((TypeTupleForm) GETSTRUCT(typeTuple))->typoutput);
+
+ elog(WARN, "typtoout: Cache lookup of type %d failed", type);
+ return(InvalidOid);
+}
+
+Oid
+gettypelem(Oid type)
+{
+ HeapTuple typeTuple;
+
+ typeTuple = SearchSysCacheTuple(TYPOID,
+ ObjectIdGetDatum(type),
+ 0,0,0);
+
+ if (HeapTupleIsValid(typeTuple))
+ return((Oid)
+ ((TypeTupleForm) GETSTRUCT(typeTuple))->typelem);
+
+ elog(WARN, "typtoout: Cache lookup of type %d failed", type);
+ return(InvalidOid);
+}
+
+/* ----------------
+ * printtup
+ * ----------------
+ */
+void
+printtup(HeapTuple tuple, TupleDesc typeinfo)
+{
+ int i, j, k;
+ char *outputstr, *attr;
+ bool isnull;
+ Oid typoutput;
+
+ /* ----------------
+ * tell the frontend to expect new tuple data
+ * ----------------
+ */
+ pq_putnchar("D", 1);
+
+ /* ----------------
+ * send a bitmap of which attributes are null
+ * ----------------
+ */
+ j = 0;
+ k = 1 << 7;
+ for (i = 0; i < tuple->t_natts; ) {
+ attr = heap_getattr(tuple, InvalidBuffer, ++i, typeinfo, &isnull);
+ if (!isnull)
+ j |= k;
+ k >>= 1;
+ if (!(i & 7)) {
+ pq_putint(j, 1);
+ j = 0;
+ k = 1 << 7;
+ }
+ }
+ if (i & 7)
+ pq_putint(j, 1);
+
+ /* ----------------
+ * send the attributes of this tuple
+ * ----------------
+ */
+ for (i = 0; i < tuple->t_natts; ++i) {
+ attr = heap_getattr(tuple, InvalidBuffer, i+1, typeinfo, &isnull);
+ typoutput = typtoout((Oid) typeinfo->attrs[i]->atttypid);
+
+ if (!isnull && OidIsValid(typoutput)) {
+ outputstr = fmgr(typoutput, attr,
+ gettypelem(typeinfo->attrs[i]->atttypid));
+ pq_putint(strlen(outputstr)+4, 4);
+ pq_putnchar(outputstr, strlen(outputstr));
+ pfree(outputstr);
+ }
+ }
+}
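
The bitmap loop above emits one byte per eight attributes, most
significant bit first, with a set bit meaning non-null; the final
partial byte is zero-padded. A worked example:

    /*
     * 10 attributes, with attributes 3 and 9 (0-based) null:
     *   bits (MSB first):  1 1 1 0 1 1 1 1    1 0 0 0 0 0 0 0
     *   bytes sent:        0xEF               0x80
     */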
+
+/* ----------------
+ * printatt
+ * ----------------
+ */
+static void
+printatt(unsigned attributeId,
+ AttributeTupleForm attributeP,
+ char *value)
+{
+ printf("\t%2d: %.*s%s%s%s\t(typeid = %u, len = %d, byval = %c)\n",
+ attributeId,
+ NAMEDATALEN, /* attname is a char16 */
+ attributeP->attname.data,
+ value != NULL ? " = \"" : "",
+ value != NULL ? value : "",
+ value != NULL ? "\"" : "",
+ (unsigned int) (attributeP->atttypid),
+ attributeP->attlen,
+ attributeP->attbyval ? 't' : 'f');
+}
+
+/* ----------------
+ * showatts
+ * ----------------
+ */
+void
+showatts(char *name, TupleDesc tupleDesc)
+{
+ int i;
+ int natts = tupleDesc->natts;
+ AttributeTupleForm *attinfo = tupleDesc->attrs;
+
+ puts(name);
+ for (i = 0; i < natts; ++i)
+ printatt((unsigned) i+1, attinfo[i], (char *) NULL);
+ printf("\t----\n");
+}
+
+/* ----------------
+ * debugtup
+ * ----------------
+ */
+void
+debugtup(HeapTuple tuple, TupleDesc typeinfo)
+{
+ register int i;
+ char *attr, *value;
+ bool isnull;
+ Oid typoutput;
+
+ for (i = 0; i < tuple->t_natts; ++i) {
+ attr = heap_getattr(tuple, InvalidBuffer, i+1, typeinfo, &isnull);
+ typoutput = typtoout((Oid) typeinfo->attrs[i]->atttypid);
+
+ if (!isnull && OidIsValid(typoutput)) {
+ value = fmgr(typoutput, attr,
+ gettypelem(typeinfo->attrs[i]->atttypid));
+ printatt((unsigned) i+1, typeinfo->attrs[i], value);
+ pfree(value);
+ }
+ }
+ printf("\t----\n");
+}
+
+/*#define IPORTAL_DEBUG*/
+
+/* ----------------
+ * printtup_internal
+ * Protocol expects either T, D, C, E, or N.
+ * We use a different data prefix, e.g. 'B' instead of 'D' to
+ * indicate a tuple in internal (binary) form.
+ *
+ * This is the same as printtup, except we don't use the type output func.
+ * ----------------
+ */
+void
+printtup_internal(HeapTuple tuple, TupleDesc typeinfo)
+{
+ int i, j, k;
+ char *attr;
+ bool isnull;
+
+ /* ----------------
+ * tell the frontend to expect new tuple data
+ * ----------------
+ */
+ pq_putnchar("B", 1);
+
+ /* ----------------
+ * send a bitmap of which attributes are null
+ * ----------------
+ */
+ j = 0;
+ k = 1 << 7;
+ for (i = 0; i < tuple->t_natts; ) {
+ attr = heap_getattr(tuple, InvalidBuffer, ++i, typeinfo, &isnull);
+ if (!isnull)
+ j |= k;
+ k >>= 1;
+ if (!(i & 7)) {
+ pq_putint(j, 1);
+ j = 0;
+ k = 1 << 7;
+ }
+ }
+ if (i & 7)
+ pq_putint(j, 1);
+
+ /* ----------------
+ * send the attributes of this tuple
+ * ----------------
+ */
+#ifdef IPORTAL_DEBUG
+ fprintf(stderr, "sending tuple with %d atts\n", tuple->t_natts);
+#endif
+ for (i = 0; i < tuple->t_natts; ++i) {
+ int32 len = typeinfo->attrs[i]->attlen;
+
+ attr = heap_getattr(tuple, InvalidBuffer, i+1, typeinfo, &isnull);
+ if (!isnull) {
+ /* # of bytes, and opaque data */
+ if (len == -1) {
+ /* variable length, assume a varlena structure */
+ len = VARSIZE(attr) - VARHDRSZ;
+
+ pq_putint(len, sizeof(int32));
+ pq_putnchar(VARDATA(attr), len);
+#ifdef IPORTAL_DEBUG
+ {
+ char *d = VARDATA(attr);
+
+ fprintf(stderr, "length %d data %x%x%x%x\n",
+ len, *d, *(d+1), *(d+2), *(d+3));
+ }
+#endif
+ } else {
+ /* fixed size */
+ if (typeinfo->attrs[i]->attbyval) {
+ int8 i8;
+ int16 i16;
+ int32 i32;
+
+ pq_putint(len, sizeof(int32));
+ switch (len) {
+ case sizeof(int8):
+ i8 = DatumGetChar(attr);
+ pq_putnchar((char *) &i8, len);
+ break;
+ case sizeof(int16):
+ i16 = DatumGetInt16(attr);
+ pq_putnchar((char *) &i16, len);
+ break;
+ case sizeof(int32):
+ i32 = DatumGetInt32(attr);
+ pq_putnchar((char *) &i32, len);
+ break;
+ }
+#ifdef IPORTAL_DEBUG
+ fprintf(stderr, "byval length %d data %d\n", len, attr);
+#endif
+ } else {
+ pq_putint(len, sizeof(int32));
+ pq_putnchar(attr, len);
+#ifdef IPORTAL_DEBUG
+ fprintf(stderr, "byref length %d data %x\n", len, attr);
+#endif
+ }
+ }
+ }
+ }
+}
diff --git a/src/backend/access/common/scankey.c b/src/backend/access/common/scankey.c
new file mode 100644
index 00000000000..7a47219a73c
--- /dev/null
+++ b/src/backend/access/common/scankey.c
@@ -0,0 +1,68 @@
+/*-------------------------------------------------------------------------
+ *
+ * scankey.c--
+ * scan direction and key code
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/common/scankey.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+#include "access/sdir.h"
+#include "access/attnum.h"
+#include "access/skey.h"
+
+#include "fmgr.h"
+
+/*
+ * ScanKeyEntryIsLegal --
+ * True iff the scan key entry is legal.
+ */
+#define ScanKeyEntryIsLegal(entry) \
+ ((bool) (AssertMacro(PointerIsValid(entry)) && \
+ AttributeNumberIsValid(entry->sk_attno)))
+
+/*
+ * ScanKeyEntrySetIllegal --
+ * Marks a scan key entry as illegal.
+ */
+void
+ScanKeyEntrySetIllegal(ScanKey entry)
+{
+
+ Assert(PointerIsValid(entry));
+
+ entry->sk_flags = 0; /* just in case... */
+ entry->sk_attno = InvalidAttrNumber;
+ entry->sk_procedure = 0; /* should be InvalidRegProcedure */
+}
+
+/*
+ * ScanKeyEntryInitialize --
+ * Initializes a scan key entry.
+ *
+ * Note:
+ * Assumes the scan key entry is valid.
+ * Assumes the initialized scan key entry will be legal.
+ */
+void
+ScanKeyEntryInitialize(ScanKey entry,
+ bits16 flags,
+ AttrNumber attributeNumber,
+ RegProcedure procedure,
+ Datum argument)
+{
+ Assert(PointerIsValid(entry));
+
+ entry->sk_flags = flags;
+ entry->sk_attno = attributeNumber;
+ entry->sk_procedure = procedure;
+ entry->sk_argument = argument;
+ fmgr_info(procedure, &entry->sk_func, &entry->sk_nargs);
+
+ Assert(ScanKeyEntryIsLegal(entry));
+}
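
A usage sketch for ScanKeyEntryInitialize; the attribute number, the
comparison procedure OID (spelled INT4EQ_PROC here), and the constant
are all hypothetical:

    ScanKeyData skey;

    /* hypothetical key: "attribute 1 = 42" via an int4 equality proc */
    ScanKeyEntryInitialize(&skey,
                           (bits16) 0,                /* no flags */
                           (AttrNumber) 1,
                           (RegProcedure) INT4EQ_PROC,
                           Int32GetDatum(42));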
diff --git a/src/backend/access/common/tupdesc.c b/src/backend/access/common/tupdesc.c
new file mode 100644
index 00000000000..527eb5113df
--- /dev/null
+++ b/src/backend/access/common/tupdesc.c
@@ -0,0 +1,398 @@
+/*-------------------------------------------------------------------------
+ *
+ * tupdesc.c--
+ * POSTGRES tuple descriptor support code
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/common/tupdesc.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ * NOTES
+ * some of the executor utility code such as "ExecTypeFromTL" should be
+ * moved here.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <stdio.h> /* for sprintf() */
+#include <ctype.h>
+#include <string.h>
+
+#include "postgres.h"
+
+#include "nodes/pg_list.h"
+#include "nodes/parsenodes.h"
+
+#include "access/attnum.h"
+#include "access/htup.h"
+#include "access/tupdesc.h"
+
+#include "utils/builtins.h"
+#include "utils/elog.h" /* XXX generate exceptions instead */
+#include "utils/palloc.h"
+
+#include "utils/syscache.h"
+#include "catalog/pg_type.h"
+
+#include "nodes/primnodes.h"
+
+#include "parser/catalog_utils.h"
+
+/* ----------------------------------------------------------------
+ * CreateTemplateTupleDesc
+ *
+ * This function allocates and zeros a tuple descriptor structure.
+ * ----------------------------------------------------------------
+ */
+TupleDesc
+CreateTemplateTupleDesc(int natts)
+{
+ uint32 size;
+ TupleDesc desc;
+
+ /* ----------------
+ * sanity checks
+ * ----------------
+ */
+ AssertArg(natts >= 1);
+
+ /* ----------------
+ * allocate enough memory for the tuple descriptor and
+ * zero it as TupleDescInitEntry assumes that the descriptor
+ * is filled with NULL pointers.
+ * ----------------
+ */
+ size = natts * sizeof (AttributeTupleForm);
+ desc = (TupleDesc) palloc(sizeof(struct tupleDesc));
+ desc->attrs = (AttributeTupleForm*) palloc(size);
+ memset(desc->attrs, 0, size);
+
+ desc->natts = natts;
+
+ return (desc);
+}
+
+/* ----------------------------------------------------------------
+ * CreateTupleDesc
+ *
+ *	This function allocates a new TupleDesc from an AttributeTupleForm array
+ * ----------------------------------------------------------------
+ */
+TupleDesc
+CreateTupleDesc(int natts, AttributeTupleForm* attrs)
+{
+ TupleDesc desc;
+
+ /* ----------------
+ * sanity checks
+ * ----------------
+ */
+ AssertArg(natts >= 1);
+
+ desc = (TupleDesc) palloc(sizeof(struct tupleDesc));
+ desc->attrs = attrs;
+ desc->natts = natts;
+
+
+ return (desc);
+}
+
+/* ----------------------------------------------------------------
+ * CreateTupleDescCopy
+ *
+ * This function creates a new TupleDesc by copying from an existing
+ * TupleDesc
+ *
+ * ----------------------------------------------------------------
+ */
+TupleDesc
+CreateTupleDescCopy(TupleDesc tupdesc)
+{
+ TupleDesc desc;
+ int i, size;
+
+ desc = (TupleDesc) palloc(sizeof(struct tupleDesc));
+ desc->natts = tupdesc->natts;
+ size = desc->natts * sizeof (AttributeTupleForm);
+ desc->attrs = (AttributeTupleForm*) palloc(size);
+ for (i=0;i<desc->natts;i++) {
+ desc->attrs[i] =
+ (AttributeTupleForm)palloc(ATTRIBUTE_TUPLE_SIZE);
+ memmove(desc->attrs[i],
+ tupdesc->attrs[i],
+ ATTRIBUTE_TUPLE_SIZE);
+ }
+ return desc;
+}
+
+/* ----------------------------------------------------------------
+ * TupleDescInitEntry
+ *
+ * This function initializes a single attribute structure in
+ * a preallocated tuple descriptor.
+ * ----------------------------------------------------------------
+ */
+bool
+TupleDescInitEntry(TupleDesc desc,
+ AttrNumber attributeNumber,
+ char *attributeName,
+ char *typeName,
+ int attdim,
+ bool attisset)
+{
+ HeapTuple tuple;
+ TypeTupleForm typeForm;
+ AttributeTupleForm att;
+
+ /* ----------------
+ * sanity checks
+ * ----------------
+ */
+ AssertArg(PointerIsValid(desc));
+ AssertArg(attributeNumber >= 1);
+	/* attributeNames are sometimes NULL,
+	   from resdoms.  I don't know why that is, though -- Jolly */
+/* AssertArg(NameIsValid(attributeName));*/
+/* AssertArg(NameIsValid(typeName));*/
+
+ AssertArg(!PointerIsValid(desc->attrs[attributeNumber - 1]));
+
+
+ /* ----------------
+ * allocate storage for this attribute
+ * ----------------
+ */
+
+ att = (AttributeTupleForm) palloc(ATTRIBUTE_TUPLE_SIZE);
+ desc->attrs[attributeNumber - 1] = att;
+
+ /* ----------------
+ * initialize some of the attribute fields
+ * ----------------
+ */
+ att->attrelid = 0; /* dummy value */
+
+ if (attributeName != NULL)
+ namestrcpy(&(att->attname), attributeName);
+ else
+ memset(att->attname.data,0,NAMEDATALEN);
+
+
+ att->attdefrel = 0; /* dummy value */
+ att->attnvals = 0; /* dummy value */
+ att->atttyparg = 0; /* dummy value */
+ att->attbound = 0; /* dummy value */
+ att->attcanindex = 0; /* dummy value */
+ att->attproc = 0; /* dummy value */
+ att->attcacheoff = -1;
+
+ att->attnum = attributeNumber;
+ att->attnelems = attdim;
+ att->attisset = attisset;
+
+ /* ----------------
+ * search the system cache for the type tuple of the attribute
+ * we are creating so that we can get the typeid and some other
+ * stuff.
+ *
+ * Note: in the special case of
+ *
+ * create EMP (name = char16, manager = EMP)
+ *
+ * RelationNameCreateHeapRelation() calls BuildDesc() which
+ * calls this routine and since EMP does not exist yet, the
+ * system cache lookup below fails. That's fine, but rather
+	 *	than doing an elog(WARN) we just leave that information
+ * uninitialized, return false, then fix things up later.
+ * -cim 6/14/90
+ * ----------------
+ */
+ tuple = SearchSysCacheTuple(TYPNAME, PointerGetDatum(typeName),
+ 0,0,0);
+ if (! HeapTupleIsValid(tuple)) {
+ /* ----------------
+ * here type info does not exist yet so we just fill
+ * the attribute with dummy information and return false.
+ * ----------------
+ */
+ att->atttypid = InvalidOid;
+ att->attlen = (int16) 0;
+ att->attbyval = (bool) 0;
+ att->attalign = 'i';
+ return false;
+ }
+
+ /* ----------------
+ * type info exists so we initialize our attribute
+ * information from the type tuple we found..
+ * ----------------
+ */
+ typeForm = (TypeTupleForm) GETSTRUCT(tuple);
+
+ att->atttypid = tuple->t_oid;
+ att->attalign = typeForm->typalign;
+
+ /* ------------------------
+ If this attribute is a set, what is really stored in the
+ attribute is the OID of a tuple in the pg_proc catalog.
+ The pg_proc tuple contains the query string which defines
+ this set - i.e., the query to run to get the set.
+ So the atttypid (just assigned above) refers to the type returned
+ by this query, but the actual length of this attribute is the
+ length (size) of an OID.
+
+ Why not just make the atttypid point to the OID type, instead
+ of the type the query returns? Because the executor uses the atttypid
+ to tell the front end what type will be returned (in BeginCommand),
+ and in the end the type returned will be the result of the query, not
+ an OID.
+
+ Why not wait until the return type of the set is known (i.e., the
+ recursive call to the executor to execute the set has returned)
+ before telling the front end what the return type will be? Because
+ the executor is a delicate thing, and making sure that the correct
+ order of front-end commands is maintained is messy, especially
+ considering that target lists may change as inherited attributes
+ are considered, etc. Ugh.
+ -----------------------------------------
+ */
+ if (attisset) {
+ Type t = type("oid");
+ att->attlen = tlen(t);
+ att->attbyval = tbyval(t);
+ } else {
+ att->attlen = typeForm->typlen;
+ att->attbyval = typeForm->typbyval;
+ }
+
+
+ return true;
+}
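+
+/*
+ * Illustrative usage (a sketch, not compiled here): building a
+ * two-attribute descriptor by hand, assuming the named types
+ * already exist in the system catalogs:
+ *
+ *	TupleDesc td = CreateTemplateTupleDesc(2);
+ *
+ *	(void) TupleDescInitEntry(td, 1, "id", "int4", 0, false);
+ *	(void) TupleDescInitEntry(td, 2, "name", "char16", 0, false);
+ */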
+
+
+/* ----------------------------------------------------------------
+ * TupleDescMakeSelfReference
+ *
+ * This function initializes a "self-referential" attribute like
+ * manager in "create EMP (name=text, manager = EMP)".
+ * It calls TypeShellMake() which inserts a "shell" type
+ * tuple into pg_type. A self-reference is one kind of set, so
+ * its size and byval are the same as for a set. See the comments
+ * above in TupleDescInitEntry.
+ * ----------------------------------------------------------------
+ */
+static void
+TupleDescMakeSelfReference(TupleDesc desc,
+ AttrNumber attnum,
+ char *relname)
+{
+ AttributeTupleForm att;
+ Type t = type("oid");
+
+ att = desc->attrs[attnum-1];
+ att->atttypid = TypeShellMake(relname);
+ att->attlen = tlen(t);
+ att->attbyval = tbyval(t);
+ att->attnelems = 0;
+}
+
+/* ----------------------------------------------------------------
+ * BuildDescForRelation
+ *
+ * This is a general purpose function identical to BuildDesc
+ * but is used by the DefineRelation() code to catch the
+ * special case where you
+ *
+ * create FOO ( ..., x = FOO )
+ *
+ * here, the initial type lookup for "x = FOO" will fail
+ * because FOO isn't in the catalogs yet. But since we
+ * are creating FOO, instead of doing an elog() we add
+ * a shell type tuple to pg_type and fix things later
+ * in amcreate().
+ * ----------------------------------------------------------------
+ */
+TupleDesc
+BuildDescForRelation(List *schema, char *relname)
+{
+ int natts;
+ AttrNumber attnum;
+ List *p;
+ TupleDesc desc;
+ char *attname;
+ char *typename;
+ int attdim;
+ bool attisset;
+
+ /* ----------------
+ * allocate a new tuple descriptor
+ * ----------------
+ */
+ natts = length(schema);
+ desc = CreateTemplateTupleDesc(natts);
+
+ attnum = 0;
+
+ typename = palloc(NAMEDATALEN+1);
+
+ foreach(p, schema) {
+ ColumnDef *entry;
+ List *arry;
+
+ /* ----------------
+ * for each entry in the list, get the name and type
+ * information from the list and have TupleDescInitEntry
+ * fill in the attribute information we need.
+ * ----------------
+ */
+ attnum++;
+
+ entry = lfirst(p);
+ attname = entry->colname;
+ arry = entry->typename->arrayBounds;
+ attisset = entry->typename->setof;
+
+ if (arry != NIL) {
+	    char buf[NAMEDATALEN+2];	/* "_" + name + terminator */
+
+ attdim = length(arry);
+
+ /* array of XXX is _XXX (inherited from release 3) */
+ sprintf(buf, "_%.*s", NAMEDATALEN, entry->typename->name);
+ strcpy(typename, buf);
+ } else {
+ strcpy(typename, entry->typename->name);
+ attdim = 0;
+ }
+
+ if (! TupleDescInitEntry(desc, attnum, attname,
+ typename, attdim, attisset)) {
+ /* ----------------
+ * if TupleDescInitEntry() fails, it means there is
+ * no type in the system catalogs. So now we check if
+ * the type name equals the relation name. If so we
+ * have a self reference, otherwise it's an error.
+ * ----------------
+ */
+ if (!strcmp(typename, relname)) {
+ TupleDescMakeSelfReference(desc, attnum, relname);
+ } else
+ elog(WARN, "DefineRelation: no such type %.*s",
+ NAMEDATALEN, typename);
+ }
+
+ /*
+ * this is for char() and varchar(). When an entry is of type
+ * char() or varchar(), typlen is set to the appropriate length,
+ * which we'll use here instead. (The catalog lookup only returns
+ * the length of bpchar and varchar which is not what we want!)
+ * - ay 6/95
+ */
+ if (entry->typename->typlen > 0) {
+ desc->attrs[attnum - 1]->attlen = entry->typename->typlen;
+ }
+ }
+ return desc;
+}
+
diff --git a/src/backend/access/funcindex.h b/src/backend/access/funcindex.h
new file mode 100644
index 00000000000..4689df19c04
--- /dev/null
+++ b/src/backend/access/funcindex.h
@@ -0,0 +1,43 @@
+/*-------------------------------------------------------------------------
+ *
+ * funcindex.h--
+ *
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: funcindex.h,v 1.1.1.1 1996/07/09 06:21:08 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef _FUNC_INDEX_INCLUDED_
+#define _FUNC_INDEX_INCLUDED_
+
+#include "postgres.h"
+
+typedef struct {
+ int nargs;
+ Oid arglist[8];
+ Oid procOid;
+ NameData funcName;
+} FuncIndexInfo;
+
+typedef FuncIndexInfo *FuncIndexInfoPtr;
+
+/*
+ * some marginally useful macro definitions
+ */
+/* #define FIgetname(FINFO) (&((FINFO)->funcName.data[0]))*/
+#define FIgetname(FINFO) (FINFO)->funcName.data
+#define FIgetnArgs(FINFO) (FINFO)->nargs
+#define FIgetProcOid(FINFO) (FINFO)->procOid
+#define FIgetArg(FINFO, argnum) (FINFO)->arglist[argnum]
+#define FIgetArglist(FINFO) (FINFO)->arglist
+
+#define FIsetnArgs(FINFO, numargs) ((FINFO)->nargs = numargs)
+#define FIsetProcOid(FINFO, id) ((FINFO)->procOid = id)
+#define FIsetArg(FINFO, argnum, argtype) ((FINFO)->arglist[argnum] = argtype)
+
+#define FIisFunctionalIndex(FINFO) (FINFO->procOid != InvalidOid)
+
+#endif /* _FUNC_INDEX_INCLUDED_ */
diff --git a/src/backend/access/genam.h b/src/backend/access/genam.h
new file mode 100644
index 00000000000..b2544650de8
--- /dev/null
+++ b/src/backend/access/genam.h
@@ -0,0 +1,60 @@
+/*-------------------------------------------------------------------------
+ *
+ * genam.h--
+ * POSTGRES general access method definitions.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: genam.h,v 1.1.1.1 1996/07/09 06:21:08 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef GENAM_H
+#define GENAM_H
+
+#include "postgres.h"
+
+#include "access/attnum.h"
+#include "access/htup.h"
+#include "access/istrat.h"
+#include "access/itup.h"
+#include "access/relscan.h"
+#include "access/skey.h"
+#include "access/sdir.h"
+#include "access/funcindex.h"
+
+/* ----------------
+ * generalized index_ interface routines
+ * ----------------
+ */
+extern Relation index_open(Oid relationId);
+extern Relation index_openr(char *relationName);
+extern void index_close(Relation relation);
+extern InsertIndexResult index_insert(Relation relation,
+ IndexTuple indexTuple);
+extern void index_delete(Relation relation, ItemPointer indexItem);
+extern IndexScanDesc index_beginscan(Relation relation, bool scanFromEnd,
+ uint16 numberOfKeys, ScanKey key);
+extern void index_rescan(IndexScanDesc scan, bool scanFromEnd, ScanKey key);
+extern void index_endscan(IndexScanDesc scan);
+extern void index_markpos(IndexScanDesc scan);
+extern void index_restrpos(IndexScanDesc scan);
+extern RetrieveIndexResult index_getnext(IndexScanDesc scan,
+ ScanDirection direction);
+extern RegProcedure index_getprocid(Relation irel, AttrNumber attnum,
+ uint16 procnum);
+extern Datum GetIndexValue(HeapTuple tuple, TupleDesc hTupDesc,
+ int attOff, AttrNumber attrNums[], FuncIndexInfo *fInfo,
+ bool *attNull, Buffer buffer);
+
+/* in genam.c */
+extern IndexScanDesc RelationGetIndexScan(Relation relation, bool scanFromEnd,
+ uint16 numberOfKeys, ScanKey key);
+extern void IndexScanRestart(IndexScanDesc scan, bool scanFromEnd,
+ ScanKey key);
+extern void IndexScanEnd(IndexScanDesc scan);
+extern void IndexScanMarkPosition(IndexScanDesc scan);
+extern void IndexScanRestorePosition(IndexScanDesc scan);
+
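+/*
+ * Illustrative usage (a sketch, not compiled here) of the generalized
+ * scan interface above, assuming "irel" is an open index relation,
+ * "key" is an initialized ScanKeyData, and ForwardScanDirection comes
+ * from access/sdir.h:
+ *
+ *	IndexScanDesc scan;
+ *	RetrieveIndexResult res;
+ *
+ *	scan = index_beginscan(irel, false, 1, &key);
+ *	while ((res = index_getnext(scan, ForwardScanDirection)) != NULL)
+ *		... fetch the heap tuple that res points at ...
+ *	index_endscan(scan);
+ */
+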
+#endif /* GENAM_H */
diff --git a/src/backend/access/hash.h b/src/backend/access/hash.h
new file mode 100644
index 00000000000..21407696b44
--- /dev/null
+++ b/src/backend/access/hash.h
@@ -0,0 +1,336 @@
+/*-------------------------------------------------------------------------
+ *
+ * hash.h--
+ * header file for postgres hash access method implementation
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: hash.h,v 1.1.1.1 1996/07/09 06:21:08 scrappy Exp $
+ *
+ * NOTES
+ * modeled after Margo Seltzer's hash implementation for unix.
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef HASH_H
+#define HASH_H
+
+#include "access/itup.h"
+
+/*
+ * An overflow page is a spare page allocated for storing data whose
+ * bucket doesn't have room to store it. We use overflow pages rather
+ * than just splitting the bucket because there is a linear order in
+ * the way we split buckets. In other words, if there isn't enough space
+ * in the bucket itself, put it in an overflow page.
+ *
+ * Overflow page addresses are stored in the form (Splitnumber, Page offset).
+ *
+ * A splitnumber is the number of the generation in which the table doubles
+ * in size.  The page offset is the ovflpage's offset within that
+ * splitnumber; offsets start at 1.
+ *
+ * We convert the stored bitmap address into a page address with the
+ * macro OADDR_OF(S, O) where S is the splitnumber and O is the page
+ * offset.
+ */
+typedef uint32 Bucket;
+typedef bits16 OverflowPageAddress;
+typedef uint32 SplitNumber;
+typedef uint32 PageOffset;
+
+/* A valid overflow address will always have a page offset >= 1 */
+#define InvalidOvflAddress 0
+
+#define SPLITSHIFT 11
+#define SPLITMASK 0x7FF
+#define SPLITNUM(N) ((SplitNumber)(((uint32)(N)) >> SPLITSHIFT))
+#define OPAGENUM(N) ((PageOffset)((N) & SPLITMASK))
+#define OADDR_OF(S,O) ((OverflowPageAddress)((uint32)((uint32)(S) << SPLITSHIFT) + (O)))
+
+#define BUCKET_TO_BLKNO(B) \
+ ((Bucket) ((B) + ((B) ? metap->SPARES[_hash_log2((B)+1)-1] : 0)) + 1)
+#define OADDR_TO_BLKNO(B) \
+ ((BlockNumber) \
+	 (BUCKET_TO_BLKNO ( (1 << SPLITNUM((B))) -1 ) + OPAGENUM((B))))
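+
+/*
+ * Example (illustrative only): OADDR_OF(3, 5) = (3 << 11) + 5 = 6149;
+ * SPLITNUM(6149) = 3 and OPAGENUM(6149) = 5 recover the two halves.
+ * With no overflow pages allocated yet (all spares[] zero),
+ * BUCKET_TO_BLKNO(B) reduces to B + 1, since block 0 is the metapage.
+ */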
+
+/*
+ * hasho_flag tells us which type of page we're looking at. For
+ * example, telling overflow pages apart from bucket pages is necessary
+ * when you're deleting tuples from a page.  If all the
+ * tuples are deleted from an overflow page, the overflow is made
+ * available to other buckets by calling _hash_freeovflpage(). If all
+ * the tuples are deleted from a bucket page, no additional action is
+ * necessary.
+ */
+
+#define LH_UNUSED_PAGE (0)
+#define LH_OVERFLOW_PAGE (1 << 0)
+#define LH_BUCKET_PAGE (1 << 1)
+#define LH_BITMAP_PAGE (1 << 2)
+#define LH_META_PAGE (1 << 3)
+
+typedef struct HashPageOpaqueData {
+ bits16 hasho_flag; /* is this page a bucket or ovfl */
+ Bucket hasho_bucket; /* bucket number this pg belongs to */
+ OverflowPageAddress hasho_oaddr; /* ovfl address of this ovfl pg */
+ BlockNumber hasho_nextblkno; /* next ovfl blkno */
+ BlockNumber hasho_prevblkno; /* previous ovfl (or bucket) blkno */
+} HashPageOpaqueData;
+
+typedef HashPageOpaqueData *HashPageOpaque;
+
+/*
+ * ScanOpaqueData is used to remember which buffers we're currently
+ * examining in the scan. We keep these buffers locked and pinned and
+ * recorded in the opaque entry of the scan in order to avoid doing a
+ * ReadBuffer() for every tuple in the index. This avoids semop() calls,
+ * which are expensive.
+ */
+
+typedef struct HashScanOpaqueData {
+ Buffer hashso_curbuf;
+ Buffer hashso_mrkbuf;
+} HashScanOpaqueData;
+
+typedef HashScanOpaqueData *HashScanOpaque;
+
+/*
+ * Definitions for metapage.
+ */
+
+#define HASH_METAPAGE 0 /* metapage is always block 0 */
+
+#define HASH_MAGIC 0x6440640
+#define HASH_VERSION 0
+
+/*
+ * NCACHED is used to set the array sizes of spares[] & bitmaps[].
+ *
+ * Spares[] is used to hold the number of overflow pages currently
+ * allocated at a certain splitpoint. For example, if spares[3] = 7
+ * then there are a maximum of 7 ovflpages available at splitpoint 3.
+ * The value in spares[] will change as ovflpages are added within
+ * a splitpoint.
+ *
+ * Within a splitpoint, one can find which ovflpages are available and
+ * which are used by looking at the bitmaps that are stored on the ovfl
+ * pages themselves. There is at least one bitmap for every splitpoint's
+ * ovflpages. Bitmaps[] contains the ovflpage addresses of the ovflpages
+ * that hold the ovflpage bitmaps.
+ *
+ * The reason that the size is restricted to NCACHED (32) is that the
+ * stored overflow page addresses are 16 bits: the upper 5 bits represent
+ * the splitpoint, the lower 11 indicate the page number within the
+ * splitpoint.  Since there are only 5 bits to store the splitpoint,
+ * there can only be 32 splitpoints.  Both spares[] and bitmaps[]
+ * use splitpoints as their indices, so there
+ * can only be 32 of them.
+ */
+
+#define NCACHED 32
+
+
+typedef struct HashMetaPageData {
+ PageHeaderData hashm_phdr; /* pad for page header
+ (do not use) */
+ uint32 hashm_magic; /* magic no. for hash tables */
+ uint32 hashm_version; /* version ID */
+ uint32 hashm_nkeys; /* number of keys stored in
+ the table */
+ uint16 hashm_ffactor; /* fill factor */
+ uint16 hashm_bsize; /* bucket size (bytes) -
+ must be a power of 2 */
+ uint16 hashm_bshift; /* bucket shift */
+ uint16 hashm_bmsize; /* bitmap array size (bytes) -
+ must be a power of 2 */
+ uint32 hashm_maxbucket; /* ID of maximum bucket
+ in use */
+ uint32 hashm_highmask; /* mask to modulo into
+ entire table */
+ uint32 hashm_lowmask; /* mask to modulo into lower
+ half of table */
+ uint32 hashm_ovflpoint; /* pageno. from which ovflpgs
+ being allocated */
+ uint32 hashm_lastfreed; /* last ovflpage freed */
+ uint32 hashm_nmaps; /* Initial number of bitmaps */
+ uint32 hashm_spares[NCACHED]; /* spare pages available at
+ splitpoints */
+ BlockNumber hashm_mapp[NCACHED]; /* blknumbers of ovfl page
+ maps */
+ RegProcedure hashm_procid; /* hash procedure id from
+ pg_proc */
+} HashMetaPageData;
+
+typedef HashMetaPageData *HashMetaPage;
+
+/* Short hands for accessing structure */
+#define BSHIFT hashm_bshift
+#define OVFL_POINT hashm_ovflpoint
+#define LAST_FREED hashm_lastfreed
+#define MAX_BUCKET hashm_maxbucket
+#define FFACTOR hashm_ffactor
+#define HIGH_MASK hashm_highmask
+#define LOW_MASK hashm_lowmask
+#define NKEYS hashm_nkeys
+#define SPARES hashm_spares
+
+extern bool BuildingHash;
+
+typedef struct HashItemData {
+ IndexTupleData hash_itup;
+} HashItemData;
+
+typedef HashItemData *HashItem;
+
+/*
+ * Constants
+ */
+#define DEFAULT_FFACTOR 300
+#define SPLITMAX 8
+#define BYTE_TO_BIT 3 /* 2^3 bits/byte */
+#define INT_TO_BYTE 2 /* 2^2 bytes/int */
+#define INT_TO_BIT 5 /* 2^5 bits/int */
+#define ALL_SET ((uint32) ~0)
+
+/*
+ * bitmap pages do not contain tuples. they do contain the standard
+ * page headers and trailers; however, everything in between is a
+ * giant bit array. the number of bits that fit on a page obviously
+ * depends on the page size and the header/trailer overhead.
+ */
+#define BMPGSZ_BYTE(metap) ((metap)->hashm_bmsize)
+#define BMPGSZ_BIT(metap) ((metap)->hashm_bmsize << BYTE_TO_BIT)
+#define HashPageGetBitmap(pg) \
+ ((uint32 *) (((char *) (pg)) + DOUBLEALIGN(sizeof(PageHeaderData))))
+
+/*
+ * The number of bits in an ovflpage bitmap which
+ * tells which ovflpages are empty versus in use (NOT the number of
+ * bits in an overflow page *address* bitmap).
+ */
+#define BITS_PER_MAP 32 /* Number of bits in ovflpage bitmap */
+
+/* Given the address of the beginning of a big map, clear/set the nth bit */
+#define CLRBIT(A, N) ((A)[(N)/BITS_PER_MAP] &= ~(1<<((N)%BITS_PER_MAP)))
+#define SETBIT(A, N) ((A)[(N)/BITS_PER_MAP] |= (1<<((N)%BITS_PER_MAP)))
+#define ISSET(A, N) ((A)[(N)/BITS_PER_MAP] & (1<<((N)%BITS_PER_MAP)))
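+
+/*
+ * Example (illustrative only): with BITS_PER_MAP = 32, SETBIT(A, 37)
+ * sets bit 5 of A[1] (37/32 = 1, 37%32 = 5); ISSET(A, 37) then tests
+ * that same bit.
+ */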
+
+/*
+ * page locking modes
+ */
+#define HASH_READ 0
+#define HASH_WRITE 1
+
+/*
+ * In general, the hash code tries to localize its knowledge about page
+ * layout to a couple of routines. However, we need a special value to
+ * indicate "no page number" in those places where we expect page numbers.
+ */
+
+#define P_NONE 0
+
+/*
+ * Strategy number. There's only one valid strategy for hashing: equality.
+ */
+
+#define HTEqualStrategyNumber 1
+#define HTMaxStrategyNumber 1
+
+/*
+ * When a new operator class is declared, we require that the user supply
+ * us with an amproc procedure for hashing a key of the new type.
+ * Since we only have one such proc in amproc, it's number 1.
+ */
+
+#define HASHPROC 1
+
+/* public routines */
+
+extern void hashbuild(Relation heap, Relation index, int natts,
+ AttrNumber *attnum, IndexStrategy istrat, uint16 pcount,
+ Datum *params, FuncIndexInfo *finfo, PredInfo *predInfo);
+extern InsertIndexResult hashinsert(Relation rel, IndexTuple itup);
+extern char *hashgettuple(IndexScanDesc scan, ScanDirection dir);
+extern char *hashbeginscan(Relation rel, bool fromEnd, uint16 keysz,
+ ScanKey scankey);
+extern void hashrescan(IndexScanDesc scan, bool fromEnd, ScanKey scankey);
+extern void hashendscan(IndexScanDesc scan);
+extern void hashmarkpos(IndexScanDesc scan);
+extern void hashrestrpos(IndexScanDesc scan);
+extern void hashdelete(Relation rel, ItemPointer tid);
+
+/* hashfunc.c */
+extern uint32 hashint2(int16 key);
+extern uint32 hashint4(uint32 key);
+extern uint32 hashfloat4(float32 keyp);
+extern uint32 hashfloat8(float64 keyp);
+extern uint32 hashoid(Oid key);
+extern uint32 hashchar(char key);
+extern uint32 hashchar2(uint16 intkey);
+extern uint32 hashchar4(uint32 intkey);
+extern uint32 hashchar8(char *key);
+extern uint32 hashchar16(char *key);
+extern uint32 hashtext(struct varlena *key);
+
+/* private routines */
+
+/* hashinsert.c */
+extern InsertIndexResult _hash_doinsert(Relation rel, HashItem hitem);
+
+
+/* hashovfl.c */
+extern Buffer _hash_addovflpage(Relation rel, Buffer *metabufp, Buffer buf);
+extern Buffer _hash_freeovflpage(Relation rel, Buffer ovflbuf);
+extern int32 _hash_initbitmap(Relation rel, HashMetaPage metap, int32 pnum,
+ int32 nbits, int32 ndx);
+extern void _hash_squeezebucket(Relation rel, HashMetaPage metap,
+ Bucket bucket);
+
+
+/* hashpage.c */
+extern void _hash_metapinit(Relation rel);
+extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno, int access);
+extern void _hash_relbuf(Relation rel, Buffer buf, int access);
+extern void _hash_wrtbuf(Relation rel, Buffer buf);
+extern void _hash_wrtnorelbuf(Relation rel, Buffer buf);
+extern Page _hash_chgbufaccess(Relation rel, Buffer *bufp, int from_access,
+ int to_access);
+extern void _hash_pageinit(Page page, Size size);
+extern void _hash_pagedel(Relation rel, ItemPointer tid);
+extern void _hash_expandtable(Relation rel, Buffer metabuf);
+
+
+/* hashscan.c */
+extern void _hash_regscan(IndexScanDesc scan);
+extern void _hash_dropscan(IndexScanDesc scan);
+extern void _hash_adjscans(Relation rel, ItemPointer tid);
+
+
+/* hashsearch.c */
+extern void _hash_search(Relation rel, int keysz, ScanKey scankey,
+ Buffer *bufP, HashMetaPage metap);
+extern RetrieveIndexResult _hash_next(IndexScanDesc scan, ScanDirection dir);
+extern RetrieveIndexResult _hash_first(IndexScanDesc scan, ScanDirection dir);
+extern bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir,
+ Buffer metabuf);
+
+
+/* hashstrat.c */
+extern StrategyNumber _hash_getstrat(Relation rel, AttrNumber attno,
+ RegProcedure proc);
+extern bool _hash_invokestrat(Relation rel, AttrNumber attno,
+ StrategyNumber strat, Datum left, Datum right);
+
+
+/* hashutil.c */
+extern ScanKey _hash_mkscankey(Relation rel, IndexTuple itup,
+ HashMetaPage metap);
+extern void _hash_freeskey(ScanKey skey);
+extern bool _hash_checkqual(IndexScanDesc scan, IndexTuple itup);
+extern HashItem _hash_formitem(IndexTuple itup);
+extern Bucket _hash_call(Relation rel, HashMetaPage metap, Datum key);
+extern uint32 _hash_log2(uint32 num);
+extern void _hash_checkpage(Page page, int flags);
+
+#endif /* HASH_H */
diff --git a/src/backend/access/hash/Makefile.inc b/src/backend/access/hash/Makefile.inc
new file mode 100644
index 00000000000..8ea221bc264
--- /dev/null
+++ b/src/backend/access/hash/Makefile.inc
@@ -0,0 +1,18 @@
+#-------------------------------------------------------------------------
+#
+# Makefile.inc--
+# Makefile for access/hash (hash access method)
+#
+# Copyright (c) 1994, Regents of the University of California
+#
+#
+# IDENTIFICATION
+# $Header: /cvsroot/pgsql/src/backend/access/hash/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+#
+#-------------------------------------------------------------------------
+
+SUBSRCS+= hash.c hashfunc.c hashinsert.c hashovfl.c hashpage.c hashscan.c \
+ hashsearch.c hashstrat.c hashutil.c
+
+
+
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
new file mode 100644
index 00000000000..a4a4e16e599
--- /dev/null
+++ b/src/backend/access/hash/hash.c
@@ -0,0 +1,467 @@
+/*-------------------------------------------------------------------------
+ *
+ * hash.c--
+ * Implementation of Margo Seltzer's Hashing package for postgres.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ * NOTES
+ * This file contains only the public interface routines.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/sdir.h"
+#include "access/hash.h"
+#include "access/funcindex.h"
+#include "nodes/execnodes.h"
+#include "nodes/plannodes.h"
+#include "executor/executor.h"
+#include "executor/tuptable.h"
+#include "catalog/index.h"
+
+
+bool BuildingHash = false;
+
+/*
+ * hashbuild() -- build a new hash index.
+ *
+ * We use a global variable to record the fact that we're creating
+ * a new index. This is used to avoid high-concurrency locking,
+ * since the index won't be visible until this transaction commits
+ * and since building is guaranteed to be single-threaded.
+ */
+void
+hashbuild(Relation heap,
+ Relation index,
+ int natts,
+ AttrNumber *attnum,
+ IndexStrategy istrat,
+ uint16 pcount,
+ Datum *params,
+ FuncIndexInfo *finfo,
+ PredInfo *predInfo)
+{
+ HeapScanDesc hscan;
+ Buffer buffer;
+ HeapTuple htup;
+ IndexTuple itup;
+ TupleDesc htupdesc, itupdesc;
+ Datum *attdata;
+ bool *nulls;
+ InsertIndexResult res;
+ int nhtups, nitups;
+ int i;
+ HashItem hitem;
+ ExprContext *econtext;
+ TupleTable tupleTable;
+ TupleTableSlot *slot;
+ Oid hrelid, irelid;
+ Node *pred, *oldPred;
+
+	/* note that this is a new hash index */
+ BuildingHash = true;
+
+ pred = predInfo->pred;
+ oldPred = predInfo->oldPred;
+
+ /* initialize the hash index metadata page (if this is a new index) */
+ if (oldPred == NULL)
+ _hash_metapinit(index);
+
+ /* get tuple descriptors for heap and index relations */
+ htupdesc = RelationGetTupleDescriptor(heap);
+ itupdesc = RelationGetTupleDescriptor(index);
+
+ /* get space for data items that'll appear in the index tuple */
+ attdata = (Datum *) palloc(natts * sizeof(Datum));
+ nulls = (bool *) palloc(natts * sizeof(bool));
+
+ /*
+ * If this is a predicate (partial) index, we will need to evaluate the
+ * predicate using ExecQual, which requires the current tuple to be in a
+ * slot of a TupleTable. In addition, ExecQual must have an ExprContext
+ * referring to that slot. Here, we initialize dummy TupleTable and
+ * ExprContext objects for this purpose. --Nels, Feb '92
+ */
+#ifndef OMIT_PARTIAL_INDEX
+ if (pred != NULL || oldPred != NULL) {
+ tupleTable = ExecCreateTupleTable(1);
+ slot = ExecAllocTableSlot(tupleTable);
+ econtext = makeNode(ExprContext);
+ FillDummyExprContext(econtext, slot, htupdesc, buffer);
+ }
+#endif /* OMIT_PARTIAL_INDEX */
+
+ /* start a heap scan */
+ hscan = heap_beginscan(heap, 0, NowTimeQual, 0, (ScanKey) NULL);
+ htup = heap_getnext(hscan, 0, &buffer);
+
+ /* build the index */
+ nhtups = nitups = 0;
+
+ for (; HeapTupleIsValid(htup); htup = heap_getnext(hscan, 0, &buffer)) {
+
+ nhtups++;
+
+ /*
+ * If oldPred != NULL, this is an EXTEND INDEX command, so skip
+ * this tuple if it was already in the existing partial index
+ */
+ if (oldPred != NULL) {
+ /*SetSlotContents(slot, htup); */
+#ifndef OMIT_PARTIAL_INDEX
+ slot->val = htup;
+ if (ExecQual((List*)oldPred, econtext) == true) {
+ nitups++;
+ continue;
+ }
+#endif /* OMIT_PARTIAL_INDEX */
+ }
+
+ /* Skip this tuple if it doesn't satisfy the partial-index predicate */
+ if (pred != NULL) {
+#ifndef OMIT_PARTIAL_INDEX
+ /*SetSlotContents(slot, htup); */
+ slot->val = htup;
+ if (ExecQual((List*)pred, econtext) == false)
+ continue;
+#endif /* OMIT_PARTIAL_INDEX */
+	}
+
+ nitups++;
+
+ /*
+ * For the current heap tuple, extract all the attributes
+ * we use in this index, and note which are null.
+ */
+ for (i = 1; i <= natts; i++) {
+ int attoff;
+ bool attnull;
+
+ /*
+ * Offsets are from the start of the tuple, and are
+ * zero-based; indices are one-based. The next call
+ * returns i - 1. That's data hiding for you.
+ */
+
+ /* attoff = i - 1 */
+ attoff = AttrNumberGetAttrOffset(i);
+
+	    /* below, attdata[attoff] is set to the attribute's datum, and
+	     * attnull is set to indicate whether or not the attribute
+ * is null for this tuple
+ */
+ attdata[attoff] = GetIndexValue(htup,
+ htupdesc,
+ attoff,
+ attnum,
+ finfo,
+ &attnull,
+ buffer);
+ nulls[attoff] = (attnull ? 'n' : ' ');
+ }
+
+ /* form an index tuple and point it at the heap tuple */
+ itup = index_formtuple(itupdesc, attdata, nulls);
+
+ /*
+ * If the single index key is null, we don't insert it into
+ * the index. Hash tables support scans on '='.
+ * Relational algebra says that A = B
+ * returns null if either A or B is null. This
+ * means that no qualification used in an index scan could ever
+ * return true on a null attribute. It also means that indices
+ * can't be used by ISNULL or NOTNULL scans, but that's an
+ * artifact of the strategy map architecture chosen in 1986, not
+ * of the way nulls are handled here.
+ */
+
+ if (itup->t_info & INDEX_NULL_MASK) {
+ pfree(itup);
+ continue;
+ }
+
+ itup->t_tid = htup->t_ctid;
+ hitem = _hash_formitem(itup);
+ res = _hash_doinsert(index, hitem);
+ pfree(hitem);
+ pfree(itup);
+ pfree(res);
+ }
+
+ /* okay, all heap tuples are indexed */
+ heap_endscan(hscan);
+
+ if (pred != NULL || oldPred != NULL) {
+#ifndef OMIT_PARTIAL_INDEX
+ ExecDestroyTupleTable(tupleTable, true);
+ pfree(econtext);
+#endif /* OMIT_PARTIAL_INDEX */
+ }
+
+ /*
+ * Since we just counted the tuples in the heap, we update its
+ * stats in pg_class to guarantee that the planner takes advantage
+ * of the index we just created. Finally, only update statistics
+ * during normal index definitions, not for indices on system catalogs
+ * created during bootstrap processing. We must close the relations
+	 * before updating statistics to guarantee that the relcache entries
+ * are flushed when we increment the command counter in UpdateStats().
+ */
+ if (IsNormalProcessingMode())
+ {
+ hrelid = heap->rd_id;
+ irelid = index->rd_id;
+ heap_close(heap);
+ index_close(index);
+ UpdateStats(hrelid, nhtups, true);
+ UpdateStats(irelid, nitups, false);
+ if (oldPred != NULL) {
+ if (nitups == nhtups) pred = NULL;
+ UpdateIndexPredicate(irelid, oldPred, pred);
+ }
+ }
+
+ /* be tidy */
+ pfree(nulls);
+ pfree(attdata);
+
+ /* all done */
+ BuildingHash = false;
+}
+
+/*
+ * hashinsert() -- insert an index tuple into a hash table.
+ *
+ * Hash on the index tuple's key, find the appropriate location
+ * for the new tuple, put it there, and return an InsertIndexResult
+ * to the caller.
+ */
+InsertIndexResult
+hashinsert(Relation rel, IndexTuple itup)
+{
+ HashItem hitem;
+ InsertIndexResult res;
+
+ if (itup->t_info & INDEX_NULL_MASK)
+ return ((InsertIndexResult) NULL);
+
+ hitem = _hash_formitem(itup);
+
+ res = _hash_doinsert(rel, hitem);
+
+ pfree(hitem);
+
+ return (res);
+}
+
+
+/*
+ * hashgettuple() -- Get the next tuple in the scan.
+ */
+char *
+hashgettuple(IndexScanDesc scan, ScanDirection dir)
+{
+ RetrieveIndexResult res;
+
+ /*
+ * If we've already initialized this scan, we can just advance it
+ * in the appropriate direction. If we haven't done so yet, we
+ * call a routine to get the first item in the scan.
+ */
+
+ if (ItemPointerIsValid(&(scan->currentItemData)))
+ res = _hash_next(scan, dir);
+ else
+ res = _hash_first(scan, dir);
+
+ return ((char *) res);
+}
+
+
+/*
+ * hashbeginscan() -- start a scan on a hash index
+ */
+char *
+hashbeginscan(Relation rel,
+ bool fromEnd,
+ uint16 keysz,
+ ScanKey scankey)
+{
+ IndexScanDesc scan;
+ HashScanOpaque so;
+
+ scan = RelationGetIndexScan(rel, fromEnd, keysz, scankey);
+ so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
+ so->hashso_curbuf = so->hashso_mrkbuf = InvalidBuffer;
+ scan->opaque = so;
+ scan->flags = 0x0;
+
+ /* register scan in case we change pages it's using */
+ _hash_regscan(scan);
+
+ return ((char *) scan);
+}
+
+/*
+ * hashrescan() -- rescan an index relation
+ */
+void
+hashrescan(IndexScanDesc scan, bool fromEnd, ScanKey scankey)
+{
+ ItemPointer iptr;
+ HashScanOpaque so;
+
+ so = (HashScanOpaque) scan->opaque;
+
+ /* we hold a read lock on the current page in the scan */
+ if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
+ _hash_relbuf(scan->relation, so->hashso_curbuf, HASH_READ);
+ so->hashso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+ if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) {
+ _hash_relbuf(scan->relation, so->hashso_mrkbuf, HASH_READ);
+ so->hashso_mrkbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+
+ /* reset the scan key */
+ if (scan->numberOfKeys > 0) {
+ memmove(scan->keyData,
+ scankey,
+ scan->numberOfKeys * sizeof(ScanKeyData));
+ }
+}
+
+/*
+ * hashendscan() -- close down a scan
+ */
+void
+hashendscan(IndexScanDesc scan)
+{
+
+ ItemPointer iptr;
+ HashScanOpaque so;
+
+ so = (HashScanOpaque) scan->opaque;
+
+ /* release any locks we still hold */
+ if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
+ _hash_relbuf(scan->relation, so->hashso_curbuf, HASH_READ);
+ so->hashso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+
+ if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) {
+ if (BufferIsValid(so->hashso_mrkbuf))
+ _hash_relbuf(scan->relation, so->hashso_mrkbuf, HASH_READ);
+ so->hashso_mrkbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+
+ /* don't need scan registered anymore */
+ _hash_dropscan(scan);
+
+ /* be tidy */
+#ifdef PERFECT_MMGR
+ pfree (scan->opaque);
+#endif /* PERFECT_MMGR */
+}
+
+/*
+ * hashmarkpos() -- save current scan position
+ *
+ */
+void
+hashmarkpos(IndexScanDesc scan)
+{
+ ItemPointer iptr;
+ HashScanOpaque so;
+
+    /* see if we ever call this code.  if we do, then so_mrkbuf is a
+ * useful element in the scan->opaque structure. if this procedure
+ * is never called, so_mrkbuf should be removed from the scan->opaque
+ * structure.
+ */
+ elog(NOTICE, "Hashmarkpos() called.");
+
+ so = (HashScanOpaque) scan->opaque;
+
+ /* release lock on old marked data, if any */
+ if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) {
+ _hash_relbuf(scan->relation, so->hashso_mrkbuf, HASH_READ);
+ so->hashso_mrkbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+
+ /* bump lock on currentItemData and copy to currentMarkData */
+ if (ItemPointerIsValid(&(scan->currentItemData))) {
+ so->hashso_mrkbuf = _hash_getbuf(scan->relation,
+ BufferGetBlockNumber(so->hashso_curbuf),
+ HASH_READ);
+ scan->currentMarkData = scan->currentItemData;
+ }
+}
+
+/*
+ * hashrestrpos() -- restore scan to last saved position
+ */
+void
+hashrestrpos(IndexScanDesc scan)
+{
+ ItemPointer iptr;
+ HashScanOpaque so;
+
+    /* see if we ever call this code.  if we do, then so_mrkbuf is a
+ * useful element in the scan->opaque structure. if this procedure
+ * is never called, so_mrkbuf should be removed from the scan->opaque
+ * structure.
+ */
+ elog(NOTICE, "Hashrestrpos() called.");
+
+ so = (HashScanOpaque) scan->opaque;
+
+ /* release lock on current data, if any */
+ if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
+ _hash_relbuf(scan->relation, so->hashso_curbuf, HASH_READ);
+ so->hashso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+
+ /* bump lock on currentMarkData and copy to currentItemData */
+ if (ItemPointerIsValid(&(scan->currentMarkData))) {
+ so->hashso_curbuf =
+ _hash_getbuf(scan->relation,
+ BufferGetBlockNumber(so->hashso_mrkbuf),
+ HASH_READ);
+
+ scan->currentItemData = scan->currentMarkData;
+ }
+}
+
+/* stubs */
+void
+hashdelete(Relation rel, ItemPointer tid)
+{
+ /* adjust any active scans that will be affected by this deletion */
+ _hash_adjscans(rel, tid);
+
+ /* delete the data from the page */
+ _hash_pagedel(rel, tid);
+}
+
diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c
new file mode 100644
index 00000000000..6b37de29911
--- /dev/null
+++ b/src/backend/access/hash/hashfunc.c
@@ -0,0 +1,276 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashfunc.c--
+ * Comparison functions for hash access method.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/hash/hashfunc.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ * NOTES
+ * These functions are stored in pg_amproc. For each operator class
+ * defined on hash tables, they compute the hash value of the argument.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+#include "utils/nabstime.h"
+
+uint32 hashint2(int16 key)
+{
+ return ((uint32) ~key);
+}
+
+uint32 hashint4(uint32 key)
+{
+ return (~key);
+}
+
+/* Hash function from Chris Torek. */
+uint32 hashfloat4(float32 keyp)
+{
+ int len;
+ int loop;
+ uint32 h;
+ char *kp = (char *) keyp;
+
+ len = sizeof(float32data);
+
+#define HASH4a h = (h << 5) - h + *kp++;
+#define HASH4b h = (h << 5) + h + *kp++;
+#define HASH4 HASH4b
+
+
+ h = 0;
+ if (len > 0) {
+ loop = (len + 8 - 1) >> 3;
+
+ switch (len & (8 - 1)) {
+ case 0:
+ do { /* All fall throughs */
+ HASH4;
+ case 7:
+ HASH4;
+ case 6:
+ HASH4;
+ case 5:
+ HASH4;
+ case 4:
+ HASH4;
+ case 3:
+ HASH4;
+ case 2:
+ HASH4;
+ case 1:
+ HASH4;
+ } while (--loop);
+ }
+ }
+ return (h);
+}
+
+
+uint32 hashfloat8(float64 keyp)
+{
+ int len;
+ int loop;
+ uint32 h;
+ char *kp = (char *) keyp;
+
+ len = sizeof(float64data);
+
+#define HASH4a h = (h << 5) - h + *kp++;
+#define HASH4b h = (h << 5) + h + *kp++;
+#define HASH4 HASH4b
+
+
+ h = 0;
+ if (len > 0) {
+ loop = (len + 8 - 1) >> 3;
+
+ switch (len & (8 - 1)) {
+ case 0:
+ do { /* All fall throughs */
+ HASH4;
+ case 7:
+ HASH4;
+ case 6:
+ HASH4;
+ case 5:
+ HASH4;
+ case 4:
+ HASH4;
+ case 3:
+ HASH4;
+ case 2:
+ HASH4;
+ case 1:
+ HASH4;
+ } while (--loop);
+ }
+ }
+ return (h);
+}
+
+
+uint32 hashoid(Oid key)
+{
+ return ((uint32) ~key);
+}
+
+
+uint32 hashchar(char key)
+{
+ int len;
+ uint32 h;
+
+ len = sizeof(char);
+
+#define PRIME1 37
+#define PRIME2 1048583
+
+ h = 0;
+ /* Convert char to integer */
+ h = h * PRIME1 ^ (key - ' ');
+ h %= PRIME2;
+
+ return (h);
+}
+
+uint32 hashchar2(uint16 intkey)
+{
+ uint32 h;
+ int len;
+ char *key = (char *) &intkey;
+
+ h = 0;
+ len = sizeof(uint16);
+ /* Convert string to integer */
+ while (len--)
+ h = h * PRIME1 ^ (*key++ - ' ');
+ h %= PRIME2;
+
+ return (h);
+}
+
+uint32 hashchar4(uint32 intkey)
+{
+ uint32 h;
+ int len;
+ char *key = (char *) &intkey;
+
+ h = 0;
+ len = sizeof(uint32);
+ /* Convert string to integer */
+ while (len--)
+ h = h * PRIME1 ^ (*key++ - ' ');
+ h %= PRIME2;
+
+ return (h);
+}
+
+uint32 hashchar8(char *key)
+{
+ uint32 h;
+ int len;
+
+ h = 0;
+ len = sizeof(char8);
+ /* Convert string to integer */
+ while (len--)
+ h = h * PRIME1 ^ (*key++ - ' ');
+ h %= PRIME2;
+
+ return (h);
+}
+
+uint32 hashname(NameData *n)
+{
+ uint32 h;
+ int len;
+ char *key;
+
+ key = n->data;
+
+ h = 0;
+ len = NAMEDATALEN;
+ /* Convert string to integer */
+ while (len--)
+ h = h * PRIME1 ^ (*key++ - ' ');
+ h %= PRIME2;
+
+ return (h);
+}
+
+
+uint32 hashchar16(char *key)
+{
+ uint32 h;
+ int len;
+
+ h = 0;
+ len = sizeof(char16);
+ /* Convert string to integer */
+ while (len--)
+ h = h * PRIME1 ^ (*key++ - ' ');
+ h %= PRIME2;
+
+ return (h);
+}
+
+
+/*
+ * (Comment from the original db3 hashing code: )
+ *
+ * "This is INCREDIBLY ugly, but fast. We break the string up into 8 byte
+ * units. On the first time through the loop we get the 'leftover bytes'
+ * (strlen % 8). On every other iteration, we perform 8 HASHC's so we handle
+ * all 8 bytes. Essentially, this saves us 7 cmp & branch instructions. If
+ * this routine is heavily used enough, it's worth the ugly coding.
+ *
+ * "OZ's original sdbm hash"
+ */
+uint32 hashtext(struct varlena *key)
+{
+ int keylen;
+ char *keydata;
+ uint32 n;
+ int loop;
+
+ keydata = VARDATA(key);
+ keylen = VARSIZE(key);
+
+ /* keylen includes the four bytes in which string keylength is stored */
+ keylen -= sizeof(VARSIZE(key));
+
+#define HASHC n = *keydata++ + 65599 * n
+
+ n = 0;
+ if (keylen > 0) {
+ loop = (keylen + 8 - 1) >> 3;
+
+ switch (keylen & (8 - 1)) {
+ case 0:
+ do { /* All fall throughs */
+ HASHC;
+ case 7:
+ HASHC;
+ case 6:
+ HASHC;
+ case 5:
+ HASHC;
+ case 4:
+ HASHC;
+ case 3:
+ HASHC;
+ case 2:
+ HASHC;
+ case 1:
+ HASHC;
+ } while (--loop);
+ }
+ }
+ return (n);
+}
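+
+/*
+ * For reference (an equivalent sketch, not the compiled code): the
+ * unrolled loop above performs exactly keylen iterations of HASHC,
+ * so it computes the same value as
+ *
+ *	n = 0;
+ *	while (keylen--)
+ *		n = *keydata++ + 65599 * n;
+ */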
diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c
new file mode 100644
index 00000000000..c514cc614d8
--- /dev/null
+++ b/src/backend/access/hash/hashinsert.c
@@ -0,0 +1,239 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashinsert.c--
+ * Item insertion in hash tables for Postgres.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/hash/hashinsert.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/hash.h"
+
+static InsertIndexResult _hash_insertonpg(Relation rel, Buffer buf, int keysz, ScanKey scankey, HashItem hitem, Buffer metabuf);
+static OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf, int keysz, ScanKey itup_scankey, Size itemsize, HashItem hitem);
+
+/*
+ * _hash_doinsert() -- Handle insertion of a single HashItem in the table.
+ *
+ * This routine is called by the public interface routines, hashbuild
+ * and hashinsert. By here, hashitem is filled in, and has a unique
+ * (xid, seqno) pair. The datum to be used as a "key" is in the
+ * hashitem.
+ */
+InsertIndexResult
+_hash_doinsert(Relation rel, HashItem hitem)
+{
+ Buffer buf;
+ Buffer metabuf;
+ BlockNumber blkno;
+ HashMetaPage metap;
+ IndexTuple itup;
+ InsertIndexResult res;
+ ScanKey itup_scankey;
+ int natts;
+ Page page;
+
+ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
+ metap = (HashMetaPage) BufferGetPage(metabuf);
+ _hash_checkpage((Page) metap, LH_META_PAGE);
+
+ /* we need a scan key to do our search, so build one */
+ itup = &(hitem->hash_itup);
+ if ((natts = rel->rd_rel->relnatts) != 1)
+	elog(WARN, "Hash indices are valid for only one index key.");
+ itup_scankey = _hash_mkscankey(rel, itup, metap);
+
+ /*
+ * find the first page in the bucket chain containing this key and
+ * place it in buf. _hash_search obtains a read lock for us.
+ */
+ _hash_search(rel, natts, itup_scankey, &buf, metap);
+ page = BufferGetPage(buf);
+ _hash_checkpage(page, LH_BUCKET_PAGE);
+
+ /*
+ * trade in our read lock for a write lock so that we can do the
+ * insertion.
+ */
+ blkno = BufferGetBlockNumber(buf);
+ _hash_relbuf(rel, buf, HASH_READ);
+ buf = _hash_getbuf(rel, blkno, HASH_WRITE);
+
+
+ /*
+ * XXX btree comment (haven't decided what to do in hash): don't
+ * think the bucket can be split while we're reading the metapage.
+ *
+ * If the page was split between the time that we surrendered our
+ * read lock and acquired our write lock, then this page may no
+ * longer be the right place for the key we want to insert.
+ */
+
+ /* do the insertion */
+ res = _hash_insertonpg(rel, buf, natts, itup_scankey,
+ hitem, metabuf);
+
+ /* be tidy */
+ _hash_freeskey(itup_scankey);
+
+ return (res);
+}
+
+/*
+ * _hash_insertonpg() -- Insert a tuple on a particular page in the table.
+ *
+ * This recursive procedure does the following things:
+ *
+ * + if necessary, splits the target page.
+ * + inserts the tuple.
+ *
+ * On entry, we must have the right buffer on which to do the
+ * insertion, and the buffer must be pinned and locked. On return,
+ * we will have dropped both the pin and the write lock on the buffer.
+ *
+ */
+static InsertIndexResult
+_hash_insertonpg(Relation rel,
+ Buffer buf,
+ int keysz,
+ ScanKey scankey,
+ HashItem hitem,
+ Buffer metabuf)
+{
+ InsertIndexResult res;
+ Page page;
+ BlockNumber itup_blkno;
+ OffsetNumber itup_off;
+ int itemsz;
+ HashPageOpaque pageopaque;
+ bool do_expand = false;
+ Buffer ovflbuf;
+ HashMetaPage metap;
+ Bucket bucket;
+
+ metap = (HashMetaPage) BufferGetPage(metabuf);
+ _hash_checkpage((Page) metap, LH_META_PAGE);
+
+ page = BufferGetPage(buf);
+ _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+ pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ bucket = pageopaque->hasho_bucket;
+
+ itemsz = IndexTupleDSize(hitem->hash_itup)
+ + (sizeof(HashItemData) - sizeof(IndexTupleData));
+ itemsz = DOUBLEALIGN(itemsz);
+
+ while (PageGetFreeSpace(page) < itemsz) {
+ /*
+ * no space on this page; check for an overflow page
+ */
+ if (BlockNumberIsValid(pageopaque->hasho_nextblkno)) {
+ /*
+ * ovfl page exists; go get it. if it doesn't have room,
+ * we'll find out next pass through the loop test above.
+ */
+ ovflbuf = _hash_getbuf(rel, pageopaque->hasho_nextblkno,
+ HASH_WRITE);
+ _hash_relbuf(rel, buf, HASH_WRITE);
+ buf = ovflbuf;
+ page = BufferGetPage(buf);
+ } else {
+ /*
+ * we're at the end of the bucket chain and we haven't
+ * found a page with enough room. allocate a new overflow
+ * page.
+ */
+ do_expand = true;
+ ovflbuf = _hash_addovflpage(rel, &metabuf, buf);
+ _hash_relbuf(rel, buf, HASH_WRITE);
+ buf = ovflbuf;
+ page = BufferGetPage(buf);
+
+ if (PageGetFreeSpace(page) < itemsz) {
+ /* it doesn't fit on an empty page -- give up */
+ elog(WARN, "hash item too large");
+ }
+ }
+ _hash_checkpage(page, LH_OVERFLOW_PAGE);
+ pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ Assert(pageopaque->hasho_bucket == bucket);
+ }
+
+ itup_off = _hash_pgaddtup(rel, buf, keysz, scankey, itemsz, hitem);
+ itup_blkno = BufferGetBlockNumber(buf);
+
+ /* by here, the new tuple is inserted */
+ res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));
+
+ ItemPointerSet(&(res->pointerData), itup_blkno, itup_off);
+
+ if (res != NULL) {
+ /*
+ * Increment the number of keys in the table.
+ * We switch lock access type just for a moment
+ * to allow greater accessibility to the metapage.
+ */
+ metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf,
+ HASH_READ, HASH_WRITE);
+ metap->hashm_nkeys += 1;
+ metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf,
+ HASH_WRITE, HASH_READ);
+
+ }
+
+ _hash_wrtbuf(rel, buf);
+
+ if (do_expand ||
+ (metap->hashm_nkeys / (metap->hashm_maxbucket + 1))
+ > metap->hashm_ffactor) {
+ _hash_expandtable(rel, metabuf);
+ }
+ _hash_relbuf(rel, metabuf, HASH_READ);
+ return (res);
+}
+
+/*
+ * _hash_pgaddtup() -- add a tuple to a particular page in the index.
+ *
+ * This routine adds the tuple to the page as requested, and keeps the
+ * write lock and reference associated with the page's buffer. It is
+ * an error to call pgaddtup() without a write lock and reference.
+ */
+static OffsetNumber
+_hash_pgaddtup(Relation rel,
+ Buffer buf,
+ int keysz,
+ ScanKey itup_scankey,
+ Size itemsize,
+ HashItem hitem)
+{
+ OffsetNumber itup_off;
+ Page page;
+
+ page = BufferGetPage(buf);
+ _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+
+ itup_off = OffsetNumberNext(PageGetMaxOffsetNumber(page));
+ (void) PageAddItem(page, (Item) hitem, itemsize, itup_off, LP_USED);
+
+ /* write the buffer, but hold our lock */
+ _hash_wrtnorelbuf(rel, buf);
+
+ return (itup_off);
+}
diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c
new file mode 100644
index 00000000000..55ee9e9ce79
--- /dev/null
+++ b/src/backend/access/hash/hashovfl.c
@@ -0,0 +1,614 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashovfl.c--
+ * Overflow page management code for the Postgres hash access method
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/hash/hashovfl.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ * NOTES
+ * Overflow pages look like ordinary relation pages.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/genam.h"
+#include "access/hash.h"
+
+static OverflowPageAddress _hash_getovfladdr(Relation rel, Buffer *metabufp);
+static uint32 _hash_firstfreebit(uint32 map);
+
+/*
+ * _hash_addovflpage
+ *
+ * Add an overflow page to the page currently pointed to by the buffer
+ * argument 'buf'.
+ *
+ * *Metabufp has a read lock upon entering the function; buf has a
+ * write lock.
+ *
+ */
+Buffer
+_hash_addovflpage(Relation rel, Buffer *metabufp, Buffer buf)
+{
+
+ OverflowPageAddress oaddr;
+ BlockNumber ovflblkno;
+ Buffer ovflbuf;
+ HashMetaPage metap;
+ HashPageOpaque ovflopaque;
+ HashPageOpaque pageopaque;
+ Page page;
+ Page ovflpage;
+
+ /* this had better be the last page in a bucket chain */
+ page = BufferGetPage(buf);
+ _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+ pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ Assert(!BlockNumberIsValid(pageopaque->hasho_nextblkno));
+
+ metap = (HashMetaPage) BufferGetPage(*metabufp);
+ _hash_checkpage((Page) metap, LH_META_PAGE);
+
+ /* allocate an empty overflow page */
+ oaddr = _hash_getovfladdr(rel, metabufp);
+ if (oaddr == InvalidOvflAddress) {
+ elog(WARN, "_hash_addovflpage: problem with _hash_getovfladdr.");
+ }
+ ovflblkno = OADDR_TO_BLKNO(OADDR_OF(SPLITNUM(oaddr), OPAGENUM(oaddr)));
+ Assert(BlockNumberIsValid(ovflblkno));
+ ovflbuf = _hash_getbuf(rel, ovflblkno, HASH_WRITE);
+ Assert(BufferIsValid(ovflbuf));
+ ovflpage = BufferGetPage(ovflbuf);
+
+ /* initialize the new overflow page */
+ _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf));
+ ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
+ ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
+ ovflopaque->hasho_nextblkno = InvalidBlockNumber;
+ ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
+ ovflopaque->hasho_oaddr = oaddr;
+ ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
+ _hash_wrtnorelbuf(rel, ovflbuf);
+
+ /* logically chain overflow page to previous page */
+ pageopaque->hasho_nextblkno = ovflblkno;
+ _hash_wrtnorelbuf(rel, buf);
+ return (ovflbuf);
+}
+
+/*
+ * _hash_getovfladdr()
+ *
+ * Find an available overflow page and return its address.
+ *
+ * When we enter this function, we have a read lock on *metabufp which
+ * we change to a write lock immediately. Before exiting, the write lock
+ * is exchanged for a read lock.
+ *
+ */
+static OverflowPageAddress
+_hash_getovfladdr(Relation rel, Buffer *metabufp)
+{
+ HashMetaPage metap;
+ Buffer mapbuf;
+ BlockNumber blkno;
+ PageOffset offset;
+ OverflowPageAddress oaddr;
+ SplitNumber splitnum;
+ uint32 *freep;
+ uint32 max_free;
+ uint32 bit;
+ uint32 first_page;
+ uint32 free_bit;
+ uint32 free_page;
+ uint32 in_use_bits;
+ uint32 i, j;
+
+ metap = (HashMetaPage) _hash_chgbufaccess(rel, metabufp, HASH_READ, HASH_WRITE);
+
+ splitnum = metap->OVFL_POINT;
+ max_free = metap->SPARES[splitnum];
+
+ free_page = (max_free - 1) >> (metap->BSHIFT + BYTE_TO_BIT);
+ free_bit = (max_free - 1) & (BMPGSZ_BIT(metap) - 1);
+
+ /* Look through all the free maps to find the first free block */
+ first_page = metap->LAST_FREED >> (metap->BSHIFT + BYTE_TO_BIT);
+ for ( i = first_page; i <= free_page; i++ ) {
+ Page mappage;
+
+ blkno = metap->hashm_mapp[i];
+ mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE);
+ mappage = BufferGetPage(mapbuf);
+ _hash_checkpage(mappage, LH_BITMAP_PAGE);
+ freep = HashPageGetBitmap(mappage);
+ Assert(freep);
+
+ if (i == free_page)
+ in_use_bits = free_bit;
+ else
+ in_use_bits = BMPGSZ_BIT(metap) - 1;
+
+ if (i == first_page) {
+ bit = metap->LAST_FREED & (BMPGSZ_BIT(metap) - 1);
+ j = bit / BITS_PER_MAP;
+ bit = bit & ~(BITS_PER_MAP - 1);
+ } else {
+ bit = 0;
+ j = 0;
+ }
+ for (; bit <= in_use_bits; j++, bit += BITS_PER_MAP)
+ if (freep[j] != ALL_SET)
+ goto found;
+ }
+
+ /* No Free Page Found - have to allocate a new page */
+ metap->LAST_FREED = metap->SPARES[splitnum];
+ metap->SPARES[splitnum]++;
+ offset = metap->SPARES[splitnum] -
+ (splitnum ? metap->SPARES[splitnum - 1] : 0);
+
+#define OVMSG "HASH: Out of overflow pages. Out of luck.\n"
+
+ if (offset > SPLITMASK) {
+ if (++splitnum >= NCACHED) {
+ elog(WARN, OVMSG);
+ }
+ metap->OVFL_POINT = splitnum;
+ metap->SPARES[splitnum] = metap->SPARES[splitnum-1];
+ metap->SPARES[splitnum-1]--;
+ offset = 0;
+ }
+
+ /* Check if we need to allocate a new bitmap page */
+ if (free_bit == BMPGSZ_BIT(metap) - 1) {
+ /* won't be needing old map page */
+
+ _hash_relbuf(rel, mapbuf, HASH_WRITE);
+
+ free_page++;
+ if (free_page >= NCACHED) {
+ elog(WARN, OVMSG);
+ }
+
+ /*
+ * This is tricky. The 1 indicates that you want the new page
+ * allocated with 1 clear bit. Actually, you are going to
+ * allocate 2 pages from this map. The first is going to be
+ * the map page, the second is the overflow page we were
+	 * looking for.  The init_bitmap routine automatically sets
+ * the first bit of itself to indicate that the bitmap itself
+ * is in use. We would explicitly set the second bit, but
+ * don't have to if we tell init_bitmap not to leave it clear
+ * in the first place.
+ */
+ if (_hash_initbitmap(rel, metap, OADDR_OF(splitnum, offset),
+ 1, free_page)) {
+ elog(WARN, "overflow_page: problem with _hash_initbitmap.");
+ }
+ metap->SPARES[splitnum]++;
+ offset++;
+ if (offset > SPLITMASK) {
+ if (++splitnum >= NCACHED) {
+ elog(WARN, OVMSG);
+ }
+ metap->OVFL_POINT = splitnum;
+ metap->SPARES[splitnum] = metap->SPARES[splitnum-1];
+ metap->SPARES[splitnum-1]--;
+ offset = 0;
+ }
+ } else {
+
+ /*
+ * Free_bit addresses the last used bit. Bump it to address
+ * the first available bit.
+ */
+ free_bit++;
+ SETBIT(freep, free_bit);
+ _hash_wrtbuf(rel, mapbuf);
+ }
+
+ /* Calculate address of the new overflow page */
+ oaddr = OADDR_OF(splitnum, offset);
+ _hash_chgbufaccess(rel, metabufp, HASH_WRITE, HASH_READ);
+ return (oaddr);
+
+ found:
+ bit = bit + _hash_firstfreebit(freep[j]);
+ SETBIT(freep, bit);
+ _hash_wrtbuf(rel, mapbuf);
+
+ /*
+ * Bits are addressed starting with 0, but overflow pages are addressed
+ * beginning at 1. 'bit' is a bit address, so we need to increment
+ * it to convert it to a page number.
+ */
+
+ bit = 1 + bit + (i * BMPGSZ_BIT(metap));
+ if (bit >= metap->LAST_FREED) {
+ metap->LAST_FREED = bit - 1;
+ }
+
+ /* Calculate the split number for this page */
+ for (i = 0; (i < splitnum) && (bit > metap->SPARES[i]); i++)
+ ;
+ offset = (i ? bit - metap->SPARES[i - 1] : bit);
+ if (offset >= SPLITMASK) {
+ elog(WARN, OVMSG);
+ }
+
+ /* initialize this page */
+ oaddr = OADDR_OF(i, offset);
+ _hash_chgbufaccess(rel, metabufp, HASH_WRITE, HASH_READ);
+ return (oaddr);
+}
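+
+/*
+ * To make the address encoding concrete (an illustrative sketch, not
+ * original code; it assumes OADDR_OF(S, O) packs the split number above
+ * SPLITSHIFT bits with the offset in the low bits, which is how
+ * _hash_freeovflpage decodes an OverflowPageAddress below):
+ *
+ *	oaddr = OADDR_OF(splitnum, offset);
+ *	Assert((oaddr >> SPLITSHIFT) == splitnum);
+ *	Assert((oaddr & SPLITMASK) == offset);
+ */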
+
+/*
+ * _hash_firstfreebit()
+ *
+ * Return the first bit that is not set in the argument 'map'. This
+ * function is used to find an available overflow page within a
+ * splitnumber.
+ *
+ */
+static uint32
+_hash_firstfreebit(uint32 map)
+{
+ uint32 i, mask;
+
+ mask = 0x1;
+ for (i = 0; i < BITS_PER_MAP; i++) {
+ if (!(mask & map))
+ return (i);
+ mask = mask << 1;
+ }
+ return (i);
+}
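+
+/*
+ * For example (illustrative only, assuming BITS_PER_MAP is 32 as the
+ * uint32 map words suggest):
+ *
+ *	_hash_firstfreebit(0x0) == 0	(all bits clear)
+ *	_hash_firstfreebit(0xb) == 2	(binary ...1011: bit 2 is clear)
+ *	_hash_firstfreebit(ALL_SET) == BITS_PER_MAP	(no clear bit)
+ */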
+
+/*
+ * _hash_freeovflpage() -
+ *
+ * Mark this overflow page as free and return a buffer for
+ * the page that follows it (which may be InvalidBuffer if
+ * no page follows).
+ *
+ */
+Buffer
+_hash_freeovflpage(Relation rel, Buffer ovflbuf)
+{
+ HashMetaPage metap;
+ Buffer metabuf;
+ Buffer mapbuf;
+ BlockNumber prevblkno;
+ BlockNumber blkno;
+ BlockNumber nextblkno;
+ HashPageOpaque ovflopaque;
+ Page ovflpage;
+ Page mappage;
+ OverflowPageAddress addr;
+ SplitNumber splitnum;
+ uint32 *freep;
+ uint32 ovflpgno;
+ int32 bitmappage, bitmapbit;
+ Bucket bucket;
+
+ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
+ metap = (HashMetaPage) BufferGetPage(metabuf);
+ _hash_checkpage((Page) metap, LH_META_PAGE);
+
+ ovflpage = BufferGetPage(ovflbuf);
+ _hash_checkpage(ovflpage, LH_OVERFLOW_PAGE);
+ ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
+ addr = ovflopaque->hasho_oaddr;
+ nextblkno = ovflopaque->hasho_nextblkno;
+ prevblkno = ovflopaque->hasho_prevblkno;
+ bucket = ovflopaque->hasho_bucket;
+ (void) memset(ovflpage, 0, BufferGetPageSize(ovflbuf));
+ _hash_wrtbuf(rel, ovflbuf);
+
+ /*
+ * fix up the bucket chain. this is a doubly-linked list, so we
+ * must fix up the bucket chain members behind and ahead of the
+ * overflow page being deleted.
+ *
+ * XXX this should look like:
+ * - lock prev/next
+ * - modify/write prev/next (how to do write ordering with a
+ * doubly-linked list???)
+ * - unlock prev/next
+ */
+ if (BlockNumberIsValid(prevblkno)) {
+ Buffer prevbuf = _hash_getbuf(rel, prevblkno, HASH_WRITE);
+ Page prevpage = BufferGetPage(prevbuf);
+ HashPageOpaque prevopaque =
+ (HashPageOpaque) PageGetSpecialPointer(prevpage);
+
+ _hash_checkpage(prevpage, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+ Assert(prevopaque->hasho_bucket == bucket);
+ prevopaque->hasho_nextblkno = nextblkno;
+ _hash_wrtbuf(rel, prevbuf);
+ }
+ if (BlockNumberIsValid(nextblkno)) {
+ Buffer nextbuf = _hash_getbuf(rel, nextblkno, HASH_WRITE);
+ Page nextpage = BufferGetPage(nextbuf);
+ HashPageOpaque nextopaque =
+ (HashPageOpaque) PageGetSpecialPointer(nextpage);
+
+ _hash_checkpage(nextpage, LH_OVERFLOW_PAGE);
+ Assert(nextopaque->hasho_bucket == bucket);
+ nextopaque->hasho_prevblkno = prevblkno;
+ _hash_wrtbuf(rel, nextbuf);
+ }
+
+ /*
+ * Fix up the overflow page bitmap that tracks this particular
+ * overflow page. The bitmap can be found in the MetaPageData
+ * array element hashm_mapp[bitmappage].
+ */
+ splitnum = (addr >> SPLITSHIFT);
+ ovflpgno =
+ (splitnum ? metap->SPARES[splitnum - 1] : 0) + (addr & SPLITMASK) - 1;
+
+ if (ovflpgno < metap->LAST_FREED) {
+ metap->LAST_FREED = ovflpgno;
+ }
+
+ bitmappage = (ovflpgno >> (metap->BSHIFT + BYTE_TO_BIT));
+ bitmapbit = ovflpgno & (BMPGSZ_BIT(metap) - 1);
+
+ blkno = metap->hashm_mapp[bitmappage];
+ mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE);
+ mappage = BufferGetPage(mapbuf);
+ _hash_checkpage(mappage, LH_BITMAP_PAGE);
+ freep = HashPageGetBitmap(mappage);
+ CLRBIT(freep, bitmapbit);
+ _hash_wrtbuf(rel, mapbuf);
+
+ _hash_relbuf(rel, metabuf, HASH_WRITE);
+
+ /*
+ * now instantiate the page that replaced this one,
+ * if it exists, and return that buffer with a write lock.
+ */
+ if (BlockNumberIsValid(nextblkno)) {
+ return (_hash_getbuf(rel, nextblkno, HASH_WRITE));
+ } else {
+ return (InvalidBuffer);
+ }
+}
+
+
+/*
+ * _hash_initbitmap()
+ *
+ * Initialize a new bitmap page. The metapage has a write-lock upon
+ * entering the function.
+ *
+ * 'pnum' is the OverflowPageAddress of the new bitmap page.
+ * 'nbits' is how many bits to clear (i.e., make available) in the new
+ * bitmap page. the remainder of the bits (as well as the first bit,
+ * representing the bitmap page itself) will be set.
+ * 'ndx' is the 0-based offset of the new bitmap page within the
+ * metapage's array of bitmap page OverflowPageAddresses.
+ */
+
+#define INT_MASK ((1 << INT_TO_BIT) -1)
+
+int32
+_hash_initbitmap(Relation rel,
+ HashMetaPage metap,
+ int32 pnum,
+ int32 nbits,
+ int32 ndx)
+{
+ Buffer buf;
+ BlockNumber blkno;
+ Page pg;
+ HashPageOpaque op;
+ uint32 *freep;
+ int clearbytes, clearints;
+
+ blkno = OADDR_TO_BLKNO(pnum);
+ buf = _hash_getbuf(rel, blkno, HASH_WRITE);
+ pg = BufferGetPage(buf);
+ _hash_pageinit(pg, BufferGetPageSize(buf));
+ op = (HashPageOpaque) PageGetSpecialPointer(pg);
+ op->hasho_oaddr = InvalidOvflAddress;
+ op->hasho_prevblkno = InvalidBlockNumber;
+ op->hasho_nextblkno = InvalidBlockNumber;
+ op->hasho_flag = LH_BITMAP_PAGE;
+ op->hasho_bucket = -1;
+
+ freep = HashPageGetBitmap(pg);
+
+ /* set all of the bits above 'nbits' to 1 */
+ clearints = ((nbits - 1) >> INT_TO_BIT) + 1;
+ clearbytes = clearints << INT_TO_BYTE;
+ (void) memset((char *) freep, 0, clearbytes);
+ (void) memset(((char *) freep) + clearbytes, 0xFF,
+ BMPGSZ_BYTE(metap) - clearbytes);
+ freep[clearints - 1] = ALL_SET << (nbits & INT_MASK);
+
+ /* bit 0 represents the new bitmap page */
+ SETBIT(freep, 0);
+
+ /* metapage already has a write lock */
+ metap->hashm_nmaps++;
+ metap->hashm_mapp[ndx] = blkno;
+
+ /* write out the new bitmap page (releasing its locks) */
+ _hash_wrtbuf(rel, buf);
+
+ return (0);
+}
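+
+/*
+ * A sketch of the resulting bitmap word for a small 'nbits'
+ * (illustrative only, assuming ALL_SET is 0xffffffff): with nbits = 2,
+ * as in the call from _hash_metapinit, the code above computes
+ * clearints = 1 and then
+ *
+ *	freep[0] = ALL_SET << 2;	0xfffffffc: bits 0 and 1 clear
+ *	SETBIT(freep, 0);		0xfffffffd: only bit 1 still free
+ *
+ * so the bitmap page itself (bit 0) is marked in use and exactly one
+ * overflow page remains available from this map.
+ */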
+
+
+/*
+ * _hash_squeezebucket(rel, bucket)
+ *
+ * Try to squeeze the tuples onto pages occurring earlier in the
+ * bucket chain in an attempt to free overflow pages. When we start
+ * the "squeezing", the page from which we start taking tuples (the
+ * "read" page) is the last bucket in the bucket chain and the page
+ * onto which we start squeezing tuples (the "write" page) is the
+ * first page in the bucket chain. The read page works backward and
+ * the write page works forward; the procedure terminates when the
+ * read page and write page are the same page.
+ */
+void
+_hash_squeezebucket(Relation rel,
+ HashMetaPage metap,
+ Bucket bucket)
+{
+ Buffer wbuf;
+ Buffer rbuf;
+ BlockNumber wblkno;
+ BlockNumber rblkno;
+ Page wpage;
+ Page rpage;
+ HashPageOpaque wopaque;
+ HashPageOpaque ropaque;
+ OffsetNumber woffnum;
+ OffsetNumber roffnum;
+ HashItem hitem;
+ int itemsz;
+
+/* elog(DEBUG, "_hash_squeezebucket: squeezing bucket %d", bucket); */
+
+ /*
+ * start squeezing into the base bucket page.
+ */
+ wblkno = BUCKET_TO_BLKNO(bucket);
+ wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE);
+ wpage = BufferGetPage(wbuf);
+ _hash_checkpage(wpage, LH_BUCKET_PAGE);
+ wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
+
+ /*
+ * if there aren't any overflow pages, there's nothing to squeeze.
+ */
+ if (!BlockNumberIsValid(wopaque->hasho_nextblkno)) {
+ _hash_relbuf(rel, wbuf, HASH_WRITE);
+ return;
+ }
+
+ /*
+ * find the last page in the bucket chain by starting at the base
+ * bucket page and working forward.
+ *
+ * XXX if chains tend to be long, we should probably move forward
+ * using HASH_READ and then _hash_chgbufaccess to HASH_WRITE when
+ * we reach the end. if they are short we probably don't care
+ * very much. if the hash function is working at all, they had
+ * better be short..
+ */
+ ropaque = wopaque;
+ do {
+ rblkno = ropaque->hasho_nextblkno;
+ if (ropaque != wopaque) {
+ _hash_relbuf(rel, rbuf, HASH_WRITE);
+ }
+ rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE);
+ rpage = BufferGetPage(rbuf);
+ _hash_checkpage(rpage, LH_OVERFLOW_PAGE);
+ Assert(!PageIsEmpty(rpage));
+ ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
+ Assert(ropaque->hasho_bucket == bucket);
+ } while (BlockNumberIsValid(ropaque->hasho_nextblkno));
+
+ /*
+ * squeeze the tuples.
+ */
+ roffnum = FirstOffsetNumber;
+ for(;;) {
+ hitem = (HashItem) PageGetItem(rpage, PageGetItemId(rpage, roffnum));
+ itemsz = IndexTupleDSize(hitem->hash_itup)
+ + (sizeof(HashItemData) - sizeof(IndexTupleData));
+ itemsz = DOUBLEALIGN(itemsz);
+
+ /*
+ * walk up the bucket chain, looking for a page big enough for
+ * this item.
+ */
+ while (PageGetFreeSpace(wpage) < itemsz) {
+ wblkno = wopaque->hasho_nextblkno;
+
+ _hash_wrtbuf(rel, wbuf);
+
+ if (!BlockNumberIsValid(wblkno) || (rblkno == wblkno)) {
+ _hash_wrtbuf(rel, rbuf);
+ /* wbuf is already released */
+ return;
+ }
+
+ wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE);
+ wpage = BufferGetPage(wbuf);
+ _hash_checkpage(wpage, LH_OVERFLOW_PAGE);
+ Assert(!PageIsEmpty(wpage));
+ wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
+ Assert(wopaque->hasho_bucket == bucket);
+ }
+
+ /*
+ * if we're here, we have found room so insert on the "write"
+ * page.
+ */
+ woffnum = OffsetNumberNext(PageGetMaxOffsetNumber(wpage));
+ (void) PageAddItem(wpage, (Item) hitem, itemsz, woffnum, LP_USED);
+
+ /*
+ * delete the tuple from the "read" page.
+ * PageIndexTupleDelete repacks the ItemId array, so 'roffnum'
+ * will be "advanced" to the "next" ItemId.
+ */
+ PageIndexTupleDelete(rpage, roffnum);
+ _hash_wrtnorelbuf(rel, rbuf);
+
+ /*
+ * if the "read" page is now empty because of the deletion,
+ * free it.
+ */
+ if (PageIsEmpty(rpage) && (ropaque->hasho_flag & LH_OVERFLOW_PAGE)) {
+ rblkno = ropaque->hasho_prevblkno;
+ Assert(BlockNumberIsValid(rblkno));
+
+ /*
+ * free this overflow page. the extra _hash_relbuf is
+ * because _hash_freeovflpage gratuitously returns the
+ * next page (we want the previous page and will get it
+ * ourselves later).
+ */
+ rbuf = _hash_freeovflpage(rel, rbuf);
+ if (BufferIsValid(rbuf)) {
+ _hash_relbuf(rel, rbuf, HASH_WRITE);
+ }
+
+ if (rblkno == wblkno) {
+ /* rbuf is already released */
+ _hash_wrtbuf(rel, wbuf);
+ return;
+ }
+
+ rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE);
+ rpage = BufferGetPage(rbuf);
+ _hash_checkpage(rpage, LH_OVERFLOW_PAGE);
+ Assert(!PageIsEmpty(rpage));
+ ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
+ Assert(ropaque->hasho_bucket == bucket);
+
+ roffnum = FirstOffsetNumber;
+ }
+ }
+}
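+
+/*
+ * A short walk-through (illustrative only): for a bucket chain
+ * [base, ovfl1, ovfl2], the write page starts at 'base' and the read
+ * page starts at 'ovfl2'. Tuples are copied from ovfl2 onto base (then
+ * onto ovfl1, as each write page fills) until ovfl2 empties and is
+ * freed, at which point the read page backs up to ovfl1. The loop
+ * terminates when the read and write pages meet.
+ */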
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
new file mode 100644
index 00000000000..2c6ebed8350
--- /dev/null
+++ b/src/backend/access/hash/hashpage.c
@@ -0,0 +1,669 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashpage.c--
+ * Hash table page management code for the Postgres hash access method
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ * NOTES
+ * Postgres hash pages look like ordinary relation pages. The opaque
+ * data at high addresses includes information about the page including
+ * whether a page is an overflow page or a true bucket, the block
+ * numbers of the preceding and following pages, and the overflow
+ * address of the page if it is an overflow page.
+ *
+ * The first page in a hash relation, page zero, is special -- it stores
+ * information describing the hash table; it is referred to as teh
+ * "meta page." Pages one and higher store the actual data.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/genam.h"
+#include "access/hash.h"
+
+static void _hash_setpagelock(Relation rel, BlockNumber blkno, int access);
+static void _hash_unsetpagelock(Relation rel, BlockNumber blkno, int access);
+static void _hash_splitpage(Relation rel, Buffer metabuf, Bucket obucket, Bucket nbucket);
+
+/*
+ * We use high-concurrency locking on hash indices. There are two cases in
+ * which we don't do locking. One is when we're building the index.
+ * Since the creating transaction has not committed, no one can see
+ * the index, and there's no reason to share locks. The second case
+ * is when we're just starting up the database system. We use some
+ * special-purpose initialization code in the relation cache manager
+ * (see utils/cache/relcache.c) to allow us to do indexed scans on
+ * the system catalogs before we'd normally be able to. This happens
+ * before the lock table is fully initialized, so we can't use it.
+ * Strictly speaking, this violates 2pl, but we don't do 2pl on the
+ * system catalogs anyway.
+ */
+
+
+#define USELOCKING (!BuildingHash && !IsInitProcessingMode())
+
+
+/*
+ * _hash_metapinit() -- Initialize the metadata page of a hash index,
+ * the two buckets that we begin with and the initial
+ * bitmap page.
+ */
+void
+_hash_metapinit(Relation rel)
+{
+ HashMetaPage metap;
+ HashPageOpaque pageopaque;
+ Buffer metabuf;
+ Buffer buf;
+ Page pg;
+ int nbuckets;
+ uint32 nelem; /* number of elements */
+ uint32 lg2nelem; /* _hash_log2(nelem) */
+ uint32 nblocks;
+ uint16 i;
+
+ /* can't be sharing this with anyone, now... */
+ if (USELOCKING)
+ RelationSetLockForWrite(rel);
+
+ if ((nblocks = RelationGetNumberOfBlocks(rel)) != 0) {
+ elog(WARN, "Cannot initialize non-empty hash table %s",
+ RelationGetRelationName(rel));
+ }
+
+ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
+ pg = BufferGetPage(metabuf);
+ metap = (HashMetaPage) pg;
+ _hash_pageinit(pg, BufferGetPageSize(metabuf));
+
+ metap->hashm_magic = HASH_MAGIC;
+ metap->hashm_version = HASH_VERSION;
+ metap->hashm_nkeys = 0;
+ metap->hashm_nmaps = 0;
+ metap->hashm_ffactor = DEFAULT_FFACTOR;
+ metap->hashm_bsize = BufferGetPageSize(metabuf);
+ metap->hashm_bshift = _hash_log2(metap->hashm_bsize);
+ for (i = metap->hashm_bshift; i > 0; --i) {
+ if ((1 << i) < (metap->hashm_bsize -
+ (DOUBLEALIGN(sizeof(PageHeaderData)) +
+ DOUBLEALIGN(sizeof(HashPageOpaqueData))))) {
+ break;
+ }
+ }
+ Assert(i);
+ metap->hashm_bmsize = 1 << i;
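+
+ /*
+ * For example (illustrative, assuming the usual 8192-byte pages):
+ * the loop above picks the largest power of two that still fits
+ * on a page after the page header and opaque data, so
+ * hashm_bmsize comes out as 4096 bytes of bitmap space.
+ */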
+ metap->hashm_procid = index_getprocid(rel, 1, HASHPROC);
+
+ /*
+ * Make nelem = 2 rather than 0 so that we end up allocating space
+ * for the next greater power of two number of buckets.
+ */
+ nelem = 2;
+ lg2nelem = 1; /*_hash_log2(MAX(nelem, 2)) */
+ nbuckets = 2; /*1 << lg2nelem */
+
+ memset((char *) metap->hashm_spares, 0, sizeof(metap->hashm_spares));
+ memset((char *) metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
+
+ metap->hashm_spares[lg2nelem] = 2; /* lg2nelem + 1 */
+ metap->hashm_spares[lg2nelem + 1] = 2; /* lg2nelem + 1 */
+ metap->hashm_ovflpoint = 1; /* lg2nelem */
+ metap->hashm_lastfreed = 2;
+
+ metap->hashm_maxbucket = metap->hashm_lowmask = 1; /* nbuckets - 1 */
+ metap->hashm_highmask = 3; /* (nbuckets << 1) - 1 */
+
+ pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
+ pageopaque->hasho_oaddr = InvalidOvflAddress;
+ pageopaque->hasho_prevblkno = InvalidBlockNumber;
+ pageopaque->hasho_nextblkno = InvalidBlockNumber;
+ pageopaque->hasho_flag = LH_META_PAGE;
+ pageopaque->hasho_bucket = -1;
+
+ /*
+ * The first bitmap page is at splitpoint lg2nelem, page offset 1,
+ * which turns out to be page 3. We couldn't initialize page 3 until
+ * we had created the first two buckets above.
+ */
+ if (_hash_initbitmap(rel, metap, OADDR_OF(lg2nelem, 1), lg2nelem + 1, 0))
+ elog(WARN, "Problem with _hash_initbitmap.");
+
+ /* all done */
+ _hash_wrtnorelbuf(rel, metabuf);
+
+ /*
+ * initialize the first two buckets
+ */
+ for (i = 0; i <= 1; i++) {
+ buf = _hash_getbuf(rel, BUCKET_TO_BLKNO(i), HASH_WRITE);
+ pg = BufferGetPage(buf);
+ _hash_pageinit(pg, BufferGetPageSize(buf));
+ pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
+ pageopaque->hasho_oaddr = InvalidOvflAddress;
+ pageopaque->hasho_prevblkno = InvalidBlockNumber;
+ pageopaque->hasho_nextblkno = InvalidBlockNumber;
+ pageopaque->hasho_flag = LH_BUCKET_PAGE;
+ pageopaque->hasho_bucket = i;
+ _hash_wrtbuf(rel, buf);
+ }
+
+ _hash_relbuf(rel, metabuf, HASH_WRITE);
+
+ if (USELOCKING)
+ RelationUnsetLockForWrite(rel);
+}
+
+/*
+ * _hash_getbuf() -- Get a buffer by block number for read or write.
+ *
+ * When this routine returns, the appropriate lock is set on the
+ * requested buffer and its reference count is correct.
+ *
+ * XXX P_NEW is not used because, unlike the tree structures, we
+ * need the bucket blocks to be at certain block numbers. we must
+ * depend on the caller to call _hash_pageinit on the block if it
+ * knows that this is a new block.
+ */
+Buffer
+_hash_getbuf(Relation rel, BlockNumber blkno, int access)
+{
+ Buffer buf;
+
+ if (blkno == P_NEW) {
+ elog(WARN, "_hash_getbuf: internal error: hash AM does not use P_NEW");
+ }
+ switch (access) {
+ case HASH_WRITE:
+ case HASH_READ:
+ _hash_setpagelock(rel, blkno, access);
+ break;
+ default:
+ elog(WARN, "_hash_getbuf: invalid access (%d) on new blk: %.*s",
+ access, NAMEDATALEN, RelationGetRelationName(rel));
+ break;
+ }
+ buf = ReadBuffer(rel, blkno);
+
+ /* ref count and lock type are correct */
+ return (buf);
+}
+
+/*
+ * _hash_relbuf() -- release a locked buffer.
+ */
+void
+_hash_relbuf(Relation rel, Buffer buf, int access)
+{
+ BlockNumber blkno;
+
+ blkno = BufferGetBlockNumber(buf);
+
+ switch (access) {
+ case HASH_WRITE:
+ case HASH_READ:
+ _hash_unsetpagelock(rel, blkno, access);
+ break;
+ default:
+ elog(WARN, "_hash_relbuf: invalid access (%d) on blk %x: %.*s",
+ access, blkno, NAMEDATALEN, RelationGetRelationName(rel));
+ }
+
+ ReleaseBuffer(buf);
+}
+
+/*
+ * _hash_wrtbuf() -- write a hash page to disk.
+ *
+ * This routine releases the lock held on the buffer and our reference
+ * to it. It is an error to call _hash_wrtbuf() without a write lock
+ * or a reference to the buffer.
+ */
+void
+_hash_wrtbuf(Relation rel, Buffer buf)
+{
+ BlockNumber blkno;
+
+ blkno = BufferGetBlockNumber(buf);
+ WriteBuffer(buf);
+ _hash_unsetpagelock(rel, blkno, HASH_WRITE);
+}
+
+/*
+ * _hash_wrtnorelbuf() -- write a hash page to disk, but do not release
+ * our reference or lock.
+ *
+ * It is an error to call _hash_wrtnorelbuf() without a write lock
+ * or a reference to the buffer.
+ */
+void
+_hash_wrtnorelbuf(Relation rel, Buffer buf)
+{
+ BlockNumber blkno;
+
+ blkno = BufferGetBlockNumber(buf);
+ WriteNoReleaseBuffer(buf);
+}
+
+Page
+_hash_chgbufaccess(Relation rel,
+ Buffer *bufp,
+ int from_access,
+ int to_access)
+{
+ BlockNumber blkno;
+
+ blkno = BufferGetBlockNumber(*bufp);
+
+ switch (from_access) {
+ case HASH_WRITE:
+ _hash_wrtbuf(rel, *bufp);
+ break;
+ case HASH_READ:
+ _hash_relbuf(rel, *bufp, from_access);
+ break;
+ default:
+ elog(WARN, "_hash_chgbufaccess: invalid access (%d) on blk %x: %.*s",
+ from_access, blkno, NAMEDATALEN, RelationGetRelationName(rel));
+ break;
+ }
+ *bufp = _hash_getbuf(rel, blkno, to_access);
+ return (BufferGetPage(*bufp));
+}
+
+/*
+ * _hash_pageinit() -- Initialize a new page.
+ */
+void
+_hash_pageinit(Page page, Size size)
+{
+ Assert(((PageHeader) page)->pd_lower == 0);
+ Assert(((PageHeader) page)->pd_upper == 0);
+ Assert(((PageHeader) page)->pd_special == 0);
+
+ /*
+ * Cargo-cult programming -- don't really need this to be zero, but
+ * creating new pages is an infrequent occurrence and it makes me feel
+ * good when I know they're empty.
+ */
+ memset(page, 0, size);
+
+ PageInit(page, size, sizeof(HashPageOpaqueData));
+}
+
+static void
+_hash_setpagelock(Relation rel,
+ BlockNumber blkno,
+ int access)
+{
+ ItemPointerData iptr;
+
+ if (USELOCKING) {
+ ItemPointerSet(&iptr, blkno, 1);
+
+ switch (access) {
+ case HASH_WRITE:
+ RelationSetSingleWLockPage(rel, &iptr);
+ break;
+ case HASH_READ:
+ RelationSetSingleRLockPage(rel, &iptr);
+ break;
+ default:
+ elog(WARN, "_hash_setpagelock: invalid access (%d) on blk %x: %.*s",
+ access, blkno, NAMEDATALEN, RelationGetRelationName(rel));
+ break;
+ }
+ }
+}
+
+static void
+_hash_unsetpagelock(Relation rel,
+ BlockNumber blkno,
+ int access)
+{
+ ItemPointerData iptr;
+
+ if (USELOCKING) {
+ ItemPointerSet(&iptr, blkno, 1);
+
+ switch (access) {
+ case HASH_WRITE:
+ RelationUnsetSingleWLockPage(rel, &iptr);
+ break;
+ case HASH_READ:
+ RelationUnsetSingleRLockPage(rel, &iptr);
+ break;
+ default:
+ elog(WARN, "_hash_unsetpagelock: invalid access (%d) on blk %x: %.*s",
+ access, blkno, NAMEDATALEN, RelationGetRelationName(rel));
+ break;
+ }
+ }
+}
+
+void
+_hash_pagedel(Relation rel, ItemPointer tid)
+{
+ Buffer buf;
+ Buffer metabuf;
+ Page page;
+ BlockNumber blkno;
+ OffsetNumber offno;
+ HashMetaPage metap;
+ HashPageOpaque opaque;
+
+ blkno = ItemPointerGetBlockNumber(tid);
+ offno = ItemPointerGetOffsetNumber(tid);
+
+ buf = _hash_getbuf(rel, blkno, HASH_WRITE);
+ page = BufferGetPage(buf);
+ _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+ opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+
+ PageIndexTupleDelete(page, offno);
+ _hash_wrtnorelbuf(rel, buf);
+
+ if (PageIsEmpty(page) && (opaque->hasho_flag & LH_OVERFLOW_PAGE)) {
+ buf = _hash_freeovflpage(rel, buf);
+ if (BufferIsValid(buf)) {
+ _hash_relbuf(rel, buf, HASH_WRITE);
+ }
+ } else {
+ _hash_relbuf(rel, buf, HASH_WRITE);
+ }
+
+ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
+ metap = (HashMetaPage) BufferGetPage(metabuf);
+ _hash_checkpage((Page) metap, LH_META_PAGE);
+ ++metap->hashm_nkeys;
+ _hash_wrtbuf(rel, metabuf);
+}
+
+void
+_hash_expandtable(Relation rel, Buffer metabuf)
+{
+ HashMetaPage metap;
+ Bucket old_bucket;
+ Bucket new_bucket;
+ uint32 spare_ndx;
+
+/* elog(DEBUG, "_hash_expandtable: expanding..."); */
+
+ metap = (HashMetaPage) BufferGetPage(metabuf);
+ _hash_checkpage((Page) metap, LH_META_PAGE);
+
+ metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_READ, HASH_WRITE);
+ new_bucket = ++metap->MAX_BUCKET;
+ metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_WRITE, HASH_READ);
+ old_bucket = (metap->MAX_BUCKET & metap->LOW_MASK);
+
+ /*
+ * If the split point is increasing (MAX_BUCKET's log base 2
+ * increases), we need to copy the current contents of the spare
+ * split bucket to the next bucket.
+ */
+ spare_ndx = _hash_log2(metap->MAX_BUCKET + 1);
+ if (spare_ndx > metap->OVFL_POINT) {
+
+ metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_READ, HASH_WRITE);
+ metap->SPARES[spare_ndx] = metap->SPARES[metap->OVFL_POINT];
+ metap->OVFL_POINT = spare_ndx;
+ metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_WRITE, HASH_READ);
+ }
+
+ if (new_bucket > metap->HIGH_MASK) {
+
+ /* Starting a new doubling */
+ metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_READ, HASH_WRITE);
+ metap->LOW_MASK = metap->HIGH_MASK;
+ metap->HIGH_MASK = new_bucket | metap->LOW_MASK;
+ metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_WRITE, HASH_READ);
+
+ }
+ /* Relocate records to the new bucket */
+ _hash_splitpage(rel, metabuf, old_bucket, new_bucket);
+}
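+
+/*
+ * An example of the mask bookkeeping above (values from _hash_metapinit,
+ * illustrative only): starting with two buckets, MAX_BUCKET = 1,
+ * LOW_MASK = 1 and HIGH_MASK = 3. The first two expansions create
+ * buckets 2 and 3, splitting buckets 0 (2 & 1) and 1 (3 & 1). The next
+ * expansion creates bucket 4; since 4 > HIGH_MASK, a new doubling
+ * starts with LOW_MASK = 3 and HIGH_MASK = 7, and bucket 4 splits
+ * bucket 0.
+ */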
+
+
+/*
+ * _hash_splitpage -- split 'obucket' into 'obucket' and 'nbucket'
+ *
+ * this routine is actually misnamed -- we are splitting a bucket that
+ * consists of a base bucket page and zero or more overflow (bucket
+ * chain) pages.
+ */
+static void
+_hash_splitpage(Relation rel,
+ Buffer metabuf,
+ Bucket obucket,
+ Bucket nbucket)
+{
+ Bucket bucket;
+ Buffer obuf;
+ Buffer nbuf;
+ Buffer ovflbuf;
+ BlockNumber oblkno;
+ BlockNumber nblkno;
+ bool null;
+ Datum datum;
+ HashItem hitem;
+ HashPageOpaque oopaque;
+ HashPageOpaque nopaque;
+ HashMetaPage metap;
+ IndexTuple itup;
+ int itemsz;
+ OffsetNumber ooffnum;
+ OffsetNumber noffnum;
+ OffsetNumber omaxoffnum;
+ Page opage;
+ Page npage;
+ TupleDesc itupdesc;
+
+/* elog(DEBUG, "_hash_splitpage: splitting %d into %d,%d",
+ obucket, obucket, nbucket);
+*/
+ metap = (HashMetaPage) BufferGetPage(metabuf);
+ _hash_checkpage((Page) metap, LH_META_PAGE);
+
+ /* get the buffers & pages */
+ oblkno = BUCKET_TO_BLKNO(obucket);
+ nblkno = BUCKET_TO_BLKNO(nbucket);
+ obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
+ nbuf = _hash_getbuf(rel, nblkno, HASH_WRITE);
+ opage = BufferGetPage(obuf);
+ npage = BufferGetPage(nbuf);
+
+ /* initialize the new bucket */
+ _hash_pageinit(npage, BufferGetPageSize(nbuf));
+ nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
+ nopaque->hasho_prevblkno = InvalidBlockNumber;
+ nopaque->hasho_nextblkno = InvalidBlockNumber;
+ nopaque->hasho_flag = LH_BUCKET_PAGE;
+ nopaque->hasho_oaddr = InvalidOvflAddress;
+ nopaque->hasho_bucket = nbucket;
+ _hash_wrtnorelbuf(rel, nbuf);
+
+ /*
+ * make sure the old bucket isn't empty. advance 'opage' and
+ * friends through the overflow bucket chain until we find a
+ * non-empty page.
+ *
+ * XXX we should only need this once, if we are careful to
+ * preserve the invariant that overflow pages are never empty.
+ */
+ _hash_checkpage(opage, LH_BUCKET_PAGE);
+ oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
+ if (PageIsEmpty(opage)) {
+ oblkno = oopaque->hasho_nextblkno;
+ _hash_relbuf(rel, obuf, HASH_WRITE);
+ if (!BlockNumberIsValid(oblkno)) {
+ /*
+ * the old bucket is completely empty; of course, the new
+ * bucket will be as well, but since it's a base bucket
+ * page we don't care.
+ */
+ _hash_relbuf(rel, nbuf, HASH_WRITE);
+ return;
+ }
+ obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
+ opage = BufferGetPage(obuf);
+ _hash_checkpage(opage, LH_OVERFLOW_PAGE);
+ if (PageIsEmpty(opage)) {
+ elog(WARN, "_hash_splitpage: empty overflow page %d", oblkno);
+ }
+ oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
+ }
+
+ /*
+ * we are now guaranteed that 'opage' is not empty. partition the
+ * tuples in the old bucket between the old bucket and the new
+ * bucket, advancing along their respective overflow bucket chains
+ * and adding overflow pages as needed.
+ */
+ ooffnum = FirstOffsetNumber;
+ omaxoffnum = PageGetMaxOffsetNumber(opage);
+ for (;;) {
+ /*
+ * at each iteration through this loop, each of these variables
+ * should be up-to-date: obuf opage oopaque ooffnum omaxoffnum
+ */
+
+ /* check if we're at the end of the page */
+ if (ooffnum > omaxoffnum) {
+ /* at end of page, but check for overflow page */
+ oblkno = oopaque->hasho_nextblkno;
+ if (BlockNumberIsValid(oblkno)) {
+ /*
+ * we ran out of tuples on this particular page, but
+ * we have more overflow pages; re-init values.
+ */
+ _hash_wrtbuf(rel, obuf);
+ obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
+ opage = BufferGetPage(obuf);
+ _hash_checkpage(opage, LH_OVERFLOW_PAGE);
+ oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
+
+ /* we're guaranteed that an ovfl page has at least 1 tuple */
+ if (PageIsEmpty(opage)) {
+ elog(WARN, "_hash_splitpage: empty ovfl page %d!",
+ oblkno);
+ }
+ ooffnum = FirstOffsetNumber;
+ omaxoffnum = PageGetMaxOffsetNumber(opage);
+ } else {
+ /*
+ * we're at the end of the bucket chain, so now we're
+ * really done with everything. before quitting, call
+ * _hash_squeezebucket to ensure the tuples in the
+ * bucket (including the overflow pages) are packed as
+ * tightly as possible.
+ */
+ _hash_wrtbuf(rel, obuf);
+ _hash_wrtbuf(rel, nbuf);
+ _hash_squeezebucket(rel, metap, obucket);
+ return;
+ }
+ }
+
+ /* hash on the tuple */
+ hitem = (HashItem) PageGetItem(opage, PageGetItemId(opage, ooffnum));
+ itup = &(hitem->hash_itup);
+ itupdesc = RelationGetTupleDescriptor(rel);
+ datum = index_getattr(itup, 1, itupdesc, &null);
+ bucket = _hash_call(rel, metap, datum);
+
+ if (bucket == nbucket) {
+ /*
+ * insert the tuple into the new bucket. if it doesn't
+ * fit on the current page in the new bucket, we must
+ * allocate a new overflow page and place the tuple on
+ * that page instead.
+ */
+ itemsz = IndexTupleDSize(hitem->hash_itup)
+ + (sizeof(HashItemData) - sizeof(IndexTupleData));
+
+ itemsz = DOUBLEALIGN(itemsz);
+
+ if (PageGetFreeSpace(npage) < itemsz) {
+ ovflbuf = _hash_addovflpage(rel, &metabuf, nbuf);
+ _hash_wrtbuf(rel, nbuf);
+ nbuf = ovflbuf;
+ npage = BufferGetPage(nbuf);
+ _hash_checkpage(npage, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+ }
+
+ noffnum = OffsetNumberNext(PageGetMaxOffsetNumber(npage));
+ (void) PageAddItem(npage, (Item) hitem, itemsz, noffnum, LP_USED);
+ _hash_wrtnorelbuf(rel, nbuf);
+
+ /*
+ * now delete the tuple from the old bucket. after this
+ * section of code, 'ooffnum' will actually point to the
+ * ItemId to which we would point if we had advanced it
+ * before the deletion (PageIndexTupleDelete repacks the
+ * ItemId array). this also means that 'omaxoffnum' is
+ * exactly one less than it used to be, so we really can
+ * just decrement it instead of calling
+ * PageGetMaxOffsetNumber.
+ */
+ PageIndexTupleDelete(opage, ooffnum);
+ _hash_wrtnorelbuf(rel, obuf);
+ omaxoffnum = OffsetNumberPrev(omaxoffnum);
+
+ /*
+ * tidy up. if the old page was an overflow page and it
+ * is now empty, we must free it (we want to preserve the
+ * invariant that overflow pages cannot be empty).
+ */
+ if (PageIsEmpty(opage) &&
+ (oopaque->hasho_flag & LH_OVERFLOW_PAGE)) {
+ obuf = _hash_freeovflpage(rel, obuf);
+
+ /* check that we're not through the bucket chain */
+ if (BufferIsInvalid(obuf)) {
+ _hash_wrtbuf(rel, nbuf);
+ _hash_squeezebucket(rel, metap, obucket);
+ return;
+ }
+
+ /*
+ * re-init. again, we're guaranteed that an ovfl page
+ * has at least one tuple.
+ */
+ opage = BufferGetPage(obuf);
+ _hash_checkpage(opage, LH_OVERFLOW_PAGE);
+ oblkno = BufferGetBlockNumber(obuf);
+ oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
+ if (PageIsEmpty(opage)) {
+ elog(WARN, "_hash_splitpage: empty overflow page %d",
+ oblkno);
+ }
+ ooffnum = FirstOffsetNumber;
+ omaxoffnum = PageGetMaxOffsetNumber(opage);
+ }
+ } else {
+ /*
+ * the tuple stays on this page. we didn't move anything,
+ * so we didn't delete anything and therefore we don't
+ * have to change 'omaxoffnum'.
+ *
+ * XXX any hash value from [0, nbucket-1] will map to this
+ * bucket, which doesn't make sense to me.
+ */
+ ooffnum = OffsetNumberNext(ooffnum);
+ }
+ }
+ /*NOTREACHED*/
+}
diff --git a/src/backend/access/hash/hashscan.c b/src/backend/access/hash/hashscan.c
new file mode 100644
index 00000000000..c4cce0e70d9
--- /dev/null
+++ b/src/backend/access/hash/hashscan.c
@@ -0,0 +1,172 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashscan.c--
+ * manage scans on hash tables
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/hash/hashscan.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ * NOTES
+ * Because we can be doing an index scan on a relation while we
+ * update it, we need to avoid missing data that moves around in
+ * the index. The routines and global variables in this file
+ * guarantee that all scans in the local address space stay
+ * correctly positioned. This is all we need to worry about, since
+ * write locking guarantees that no one else will be on the same
+ * page at the same time as we are.
+ *
+ * The scheme is to manage a list of active scans in the current
+ * backend. Whenever we add or remove records from an index, we
+ * check the list of active scans to see if any has been affected.
+ * A scan is affected only if it is on the same relation, and the
+ * same page, as the update.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/sdir.h"
+#include "access/hash.h"
+
+static void _hash_scandel(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno);
+static bool _hash_scantouched(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno);
+
+typedef struct HashScanListData {
+ IndexScanDesc hashsl_scan;
+ struct HashScanListData *hashsl_next;
+} HashScanListData;
+
+typedef HashScanListData *HashScanList;
+
+static HashScanList HashScans = (HashScanList) NULL;
+
+/*
+ * _hash_regscan() -- register a new scan.
+ */
+void
+_hash_regscan(IndexScanDesc scan)
+{
+ HashScanList new_el;
+
+ new_el = (HashScanList) palloc(sizeof(HashScanListData));
+ new_el->hashsl_scan = scan;
+ new_el->hashsl_next = HashScans;
+ HashScans = new_el;
+}
+
+/*
+ * _hash_dropscan() -- drop a scan from the scan list
+ */
+void
+_hash_dropscan(IndexScanDesc scan)
+{
+ HashScanList chk, last;
+
+ last = (HashScanList) NULL;
+ for (chk = HashScans;
+ chk != (HashScanList) NULL && chk->hashsl_scan != scan;
+ chk = chk->hashsl_next) {
+ last = chk;
+ }
+
+ if (chk == (HashScanList) NULL)
+ elog(WARN, "hash scan list trashed; can't find 0x%lx", scan);
+
+ if (last == (HashScanList) NULL)
+ HashScans = chk->hashsl_next;
+ else
+ last->hashsl_next = chk->hashsl_next;
+
+#ifdef PERFECT_MEM
+ pfree (chk);
+#endif /* PERFECT_MEM */
+}
+
+void
+_hash_adjscans(Relation rel, ItemPointer tid)
+{
+ HashScanList l;
+ Oid relid;
+
+ relid = rel->rd_id;
+ for (l = HashScans; l != (HashScanList) NULL; l = l->hashsl_next) {
+ if (relid == l->hashsl_scan->relation->rd_id)
+ _hash_scandel(l->hashsl_scan, ItemPointerGetBlockNumber(tid),
+ ItemPointerGetOffsetNumber(tid));
+ }
+}
+
+static void
+_hash_scandel(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno)
+{
+ ItemPointer current;
+ Buffer buf;
+ Buffer metabuf;
+ HashScanOpaque so;
+
+ if (!_hash_scantouched(scan, blkno, offno))
+ return;
+
+ metabuf = _hash_getbuf(scan->relation, HASH_METAPAGE, HASH_READ);
+
+ so = (HashScanOpaque) scan->opaque;
+ buf = so->hashso_curbuf;
+
+ current = &(scan->currentItemData);
+ if (ItemPointerIsValid(current)
+ && ItemPointerGetBlockNumber(current) == blkno
+ && ItemPointerGetOffsetNumber(current) >= offno) {
+ _hash_step(scan, &buf, BackwardScanDirection, metabuf);
+ so->hashso_curbuf = buf;
+ }
+
+ current = &(scan->currentMarkData);
+ if (ItemPointerIsValid(current)
+ && ItemPointerGetBlockNumber(current) == blkno
+ && ItemPointerGetOffsetNumber(current) >= offno) {
+ ItemPointerData tmp;
+ tmp = *current;
+ *current = scan->currentItemData;
+ scan->currentItemData = tmp;
+ _hash_step(scan, &buf, BackwardScanDirection, metabuf);
+ so->hashso_mrkbuf = buf;
+ tmp = *current;
+ *current = scan->currentItemData;
+ scan->currentItemData = tmp;
+ }
+}
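+
+/*
+ * For example (an illustration, not original code): suppose a scan's
+ * currentItemData is (blkno 7, offnum 5) and the tuple at (7, 3) is
+ * deleted. PageIndexTupleDelete repacks the line pointers, so the
+ * already-returned tuple now lives at offnum 4; stepping the scan back
+ * one position leaves currentItemData pointing at that same tuple.
+ */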
+
+static bool
+_hash_scantouched(IndexScanDesc scan,
+ BlockNumber blkno,
+ OffsetNumber offno)
+{
+ ItemPointer current;
+
+ current = &(scan->currentItemData);
+ if (ItemPointerIsValid(current)
+ && ItemPointerGetBlockNumber(current) == blkno
+ && ItemPointerGetOffsetNumber(current) >= offno)
+ return (true);
+
+ current = &(scan->currentMarkData);
+ if (ItemPointerIsValid(current)
+ && ItemPointerGetBlockNumber(current) == blkno
+ && ItemPointerGetOffsetNumber(current) >= offno)
+ return (true);
+
+ return (false);
+}
diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c
new file mode 100644
index 00000000000..056235dec85
--- /dev/null
+++ b/src/backend/access/hash/hashsearch.c
@@ -0,0 +1,425 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashsearch.c--
+ * search code for postgres hash tables
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/hash/hashsearch.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "fmgr.h"
+
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/skey.h"
+#include "access/sdir.h"
+#include "access/hash.h"
+
+/*
+ * _hash_search() -- Finds the page/bucket that contains the
+ * scankey and loads it into *bufP. The buffer has a read lock.
+ */
+void
+_hash_search(Relation rel,
+ int keysz,
+ ScanKey scankey,
+ Buffer *bufP,
+ HashMetaPage metap)
+{
+ BlockNumber blkno;
+ Datum keyDatum;
+ Bucket bucket;
+
+ if (scankey == (ScanKey) NULL ||
+ (keyDatum = scankey[0].sk_argument) == (Datum) NULL) {
+ /*
+ * If the scankey argument is NULL, all tuples will satisfy
+ * the scan so we start the scan at the first bucket (bucket
+ * 0).
+ */
+ bucket = 0;
+ } else {
+ bucket = _hash_call(rel, metap, keyDatum);
+ }
+
+ blkno = BUCKET_TO_BLKNO(bucket);
+
+ *bufP = _hash_getbuf(rel, blkno, HASH_READ);
+}
+
+/*
+ * _hash_next() -- Get the next item in a scan.
+ *
+ * On entry, we have a valid currentItemData in the scan, and a
+ * read lock on the page that contains that item. We do not have
+ * the page pinned. We return the next item in the scan. On
+ * exit, we have the page containing the next item locked but not
+ * pinned.
+ */
+RetrieveIndexResult
+_hash_next(IndexScanDesc scan, ScanDirection dir)
+{
+ Relation rel;
+ Buffer buf;
+ Buffer metabuf;
+ Page page;
+ OffsetNumber offnum;
+ RetrieveIndexResult res;
+ ItemPointer current;
+ ItemPointer iptr;
+ HashItem hitem;
+ IndexTuple itup;
+ HashScanOpaque so;
+
+ rel = scan->relation;
+ so = (HashScanOpaque) scan->opaque;
+ current = &(scan->currentItemData);
+
+ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
+
+ /*
+ * XXX 10 may 91: somewhere there's a bug in our management of the
+ * cached buffer for this scan. wei discovered it. the following
+ * is a workaround so he can work until i figure out what's going on.
+ */
+
+ if (!BufferIsValid(so->hashso_curbuf)) {
+ so->hashso_curbuf = _hash_getbuf(rel,
+ ItemPointerGetBlockNumber(current),
+ HASH_READ);
+ }
+
+ /* we still have the buffer pinned and locked */
+ buf = so->hashso_curbuf;
+
+ /*
+ * step to next valid tuple. note that _hash_step releases our
+ * lock on 'metabuf'; if we switch to a new 'buf' while looking
+ * for the next tuple, we come back with a lock on that buffer.
+ */
+ if (!_hash_step(scan, &buf, dir, metabuf)) {
+ return ((RetrieveIndexResult) NULL);
+ }
+
+ /* if we're here, _hash_step found a valid tuple */
+ current = &(scan->currentItemData);
+ offnum = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(buf);
+ _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+ hitem = (HashItem) PageGetItem(page, PageGetItemId(page, offnum));
+ itup = &hitem->hash_itup;
+ iptr = (ItemPointer) palloc(sizeof(ItemPointerData));
+ memmove((char *) iptr, (char *) &(itup->t_tid), sizeof(ItemPointerData));
+ res = FormRetrieveIndexResult(current, iptr);
+
+ return (res);
+}
+
+static void
+_hash_readnext(Relation rel,
+ Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
+{
+ BlockNumber blkno;
+
+ blkno = (*opaquep)->hasho_nextblkno;
+ _hash_relbuf(rel, *bufp, HASH_READ);
+ *bufp = InvalidBuffer;
+ if (BlockNumberIsValid(blkno)) {
+ *bufp = _hash_getbuf(rel, blkno, HASH_READ);
+ *pagep = BufferGetPage(*bufp);
+ _hash_checkpage(*pagep, LH_OVERFLOW_PAGE);
+ *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
+ Assert(!PageIsEmpty(*pagep));
+ }
+}
+
+static void
+_hash_readprev(Relation rel,
+ Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
+{
+ BlockNumber blkno;
+
+ blkno = (*opaquep)->hasho_prevblkno;
+ _hash_relbuf(rel, *bufp, HASH_READ);
+ *bufp = InvalidBuffer;
+ if (BlockNumberIsValid(blkno)) {
+ *bufp = _hash_getbuf(rel, blkno, HASH_READ);
+ *pagep = BufferGetPage(*bufp);
+ _hash_checkpage(*pagep, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+ *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
+ if (PageIsEmpty(*pagep)) {
+ Assert((*opaquep)->hasho_flag & LH_BUCKET_PAGE);
+ _hash_relbuf(rel, *bufp, HASH_READ);
+ *bufp = InvalidBuffer;
+ }
+ }
+}
+
+/*
+ * _hash_first() -- Find the first item in a scan.
+ *
+ * Return the RetrieveIndexResult of the first item in the tree that
+ * satisfies the qualification associated with the scan descriptor. On
+ * exit, the page containing the current index tuple is read locked
+ * and pinned, and the scan's opaque data entry is updated to
+ * include the buffer.
+ */
+RetrieveIndexResult
+_hash_first(IndexScanDesc scan, ScanDirection dir)
+{
+ Relation rel;
+ Buffer buf;
+ Buffer metabuf;
+ Page page;
+ HashPageOpaque opaque;
+ HashMetaPage metap;
+ HashItem hitem;
+ IndexTuple itup;
+ ItemPointer current;
+ ItemPointer iptr;
+ OffsetNumber offnum;
+ RetrieveIndexResult res;
+ HashScanOpaque so;
+
+ rel = scan->relation;
+ so = (HashScanOpaque) scan->opaque;
+ current = &(scan->currentItemData);
+
+ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
+ metap = (HashMetaPage) BufferGetPage(metabuf);
+ _hash_checkpage((Page) metap, LH_META_PAGE);
+
+ /*
+ * XXX -- The attribute number stored in the scan key is the attno
+ * in the heap relation. We need to transmogrify this into
+ * the index relation attno here. For the moment, we have
+ * hardwired attno == 1.
+ */
+
+ /* find the correct bucket page and load it into buf */
+ _hash_search(rel, 1, scan->keyData, &buf, metap);
+ page = BufferGetPage(buf);
+ _hash_checkpage(page, LH_BUCKET_PAGE);
+ opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+
+ /*
+ * if we are scanning forward, we need to find the first non-empty
+ * page (if any) in the bucket chain. since overflow pages are
+ * never empty, this had better be either the bucket page or the
+ * first overflow page.
+ *
+ * if we are scanning backward, we always go all the way to the
+ * end of the bucket chain.
+ */
+ if (PageIsEmpty(page)) {
+ if (BlockNumberIsValid(opaque->hasho_nextblkno)) {
+ _hash_readnext(rel, &buf, &page, &opaque);
+ } else {
+ ItemPointerSetInvalid(current);
+ so->hashso_curbuf = InvalidBuffer;
+ return ((RetrieveIndexResult) NULL);
+ }
+ }
+ if (ScanDirectionIsBackward(dir)) {
+ while (BlockNumberIsValid(opaque->hasho_nextblkno)) {
+ _hash_readnext(rel, &buf, &page, &opaque);
+ }
+ }
+
+ if (!_hash_step(scan, &buf, dir, metabuf)) {
+ return ((RetrieveIndexResult) NULL);
+ }
+
+ /* if we're here, _hash_step found a valid tuple */
+ current = &(scan->currentItemData);
+ offnum = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(buf);
+ _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+ hitem = (HashItem) PageGetItem(page, PageGetItemId(page, offnum));
+ itup = &hitem->hash_itup;
+ iptr = (ItemPointer) palloc(sizeof(ItemPointerData));
+ memmove((char *) iptr, (char *) &(itup->t_tid), sizeof(ItemPointerData));
+ res = FormRetrieveIndexResult(current, iptr);
+
+ return (res);
+}
+
+/*
+ * _hash_step() -- step to the next valid item in a scan in the bucket.
+ *
+ * If no valid record exists in the requested direction, return
+ * false. Else, return true and set the CurrentItemData for the
+ * scan to the right thing.
+ *
+ * 'bufP' points to the buffer which contains the current page
+ * that we'll step through.
+ *
+ * 'metabuf' is released when this returns.
+ */
+bool
+_hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir, Buffer metabuf)
+{
+ Relation rel;
+ ItemPointer current;
+ HashScanOpaque so;
+ int allbuckets;
+ HashMetaPage metap;
+ Buffer buf;
+ Page page;
+ HashPageOpaque opaque;
+ OffsetNumber maxoff;
+ OffsetNumber offnum;
+ Bucket bucket;
+ BlockNumber blkno;
+ HashItem hitem;
+ IndexTuple itup;
+
+ rel = scan->relation;
+ current = &(scan->currentItemData);
+ so = (HashScanOpaque) scan->opaque;
+ allbuckets = (scan->numberOfKeys < 1);
+
+ metap = (HashMetaPage) BufferGetPage(metabuf);
+ _hash_checkpage((Page) metap, LH_META_PAGE);
+
+ buf = *bufP;
+ page = BufferGetPage(buf);
+ _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+ opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+
+ /*
+ * If _hash_step is called from _hash_first, current will not be
+ * valid, so we can't dereference it. However, in that case, we
+ * presumably want to start at the beginning/end of the page...
+ */
+ maxoff = PageGetMaxOffsetNumber(page);
+ if (ItemPointerIsValid(current)) {
+ offnum = ItemPointerGetOffsetNumber(current);
+ } else {
+ offnum = InvalidOffsetNumber;
+ }
+
+ /*
+ * 'offnum' now points to the last tuple we have seen (if any).
+ *
+ * continue to step through tuples until:
+ * 1) we get to the end of the bucket chain or
+ * 2) we find a valid tuple.
+ */
+ do {
+ bucket = opaque->hasho_bucket;
+
+ switch (dir) {
+ case ForwardScanDirection:
+ if (offnum != InvalidOffsetNumber) {
+ offnum = OffsetNumberNext(offnum); /* move forward */
+ } else {
+ offnum = FirstOffsetNumber; /* new page */
+ }
+ while (offnum > maxoff) {
+ /*
+ * either this page is empty (maxoff ==
+ * InvalidOffsetNumber) or we ran off the end.
+ */
+ _hash_readnext(rel, &buf, &page, &opaque);
+ if (BufferIsInvalid(buf)) { /* end of chain */
+ if (allbuckets && bucket < metap->hashm_maxbucket) {
+ ++bucket;
+ blkno = BUCKET_TO_BLKNO(bucket);
+ buf = _hash_getbuf(rel, blkno, HASH_READ);
+ page = BufferGetPage(buf);
+ _hash_checkpage(page, LH_BUCKET_PAGE);
+ opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ Assert(opaque->hasho_bucket == bucket);
+ while (PageIsEmpty(page) &&
+ BlockNumberIsValid(opaque->hasho_nextblkno)) {
+ _hash_readnext(rel, &buf, &page, &opaque);
+ }
+ maxoff = PageGetMaxOffsetNumber(page);
+ offnum = FirstOffsetNumber;
+ } else {
+ maxoff = offnum = InvalidOffsetNumber;
+ break; /* while */
+ }
+ } else {
+ /* _hash_readnext never returns an empty page */
+ maxoff = PageGetMaxOffsetNumber(page);
+ offnum = FirstOffsetNumber;
+ }
+ }
+ break;
+ case BackwardScanDirection:
+ if (offnum != InvalidOffsetNumber) {
+ offnum = OffsetNumberPrev(offnum); /* move back */
+ } else {
+ offnum = maxoff; /* new page */
+ }
+ while (offnum < FirstOffsetNumber) {
+ /*
+ * either this page is empty (offnum ==
+ * InvalidOffsetNumber) or we ran off the end.
+ */
+ _hash_readprev(rel, &buf, &page, &opaque);
+ if (BufferIsInvalid(buf)) { /* end of chain */
+ if (allbuckets && bucket > 0) {
+ --bucket;
+ blkno = BUCKET_TO_BLKNO(bucket);
+ buf = _hash_getbuf(rel, blkno, HASH_READ);
+ page = BufferGetPage(buf);
+ _hash_checkpage(page, LH_BUCKET_PAGE);
+ opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ Assert(opaque->hasho_bucket == bucket);
+ while (BlockNumberIsValid(opaque->hasho_nextblkno)) {
+ _hash_readnext(rel, &buf, &page, &opaque);
+ }
+ maxoff = offnum = PageGetMaxOffsetNumber(page);
+ } else {
+ maxoff = offnum = InvalidOffsetNumber;
+ break; /* while */
+ }
+ } else {
+ /* _hash_readprev never returns an empty page */
+ maxoff = offnum = PageGetMaxOffsetNumber(page);
+ }
+ }
+ break;
+ default:
+ /* NoMovementScanDirection */
+ /* this should not be reached */
+ break;
+ }
+
+ /* we ran off the end of the world without finding a match */
+ if (offnum == InvalidOffsetNumber) {
+ _hash_relbuf(rel, metabuf, HASH_READ);
+ *bufP = so->hashso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(current);
+ return(false);
+ }
+
+ /* get ready to check this tuple */
+ hitem = (HashItem) PageGetItem(page, PageGetItemId(page, offnum));
+ itup = &hitem->hash_itup;
+ } while (!_hash_checkqual(scan, itup));
+
+ /* if we made it to here, we've found a valid tuple */
+ _hash_relbuf(rel, metabuf, HASH_READ);
+ blkno = BufferGetBlockNumber(buf);
+ *bufP = so->hashso_curbuf = buf;
+ ItemPointerSet(current, blkno, offnum);
+ return(true);
+}
diff --git a/src/backend/access/hash/hashstrat.c b/src/backend/access/hash/hashstrat.c
new file mode 100644
index 00000000000..cac2a58690e
--- /dev/null
+++ b/src/backend/access/hash/hashstrat.c
@@ -0,0 +1,104 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashstrat.c--
+ * Strategy map entries for the hash indexed access method
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/hash/Attic/hashstrat.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/genam.h"
+#include "access/hash.h"
+
+/*
+ * only one valid strategy for hash tables: equality.
+ */
+
+static StrategyNumber HTNegate[1] = {
+ InvalidStrategy
+};
+
+static StrategyNumber HTCommute[1] = {
+ HTEqualStrategyNumber
+};
+
+static StrategyNumber HTNegateCommute[1] = {
+ InvalidStrategy
+};
+
+static StrategyEvaluationData HTEvaluationData = {
+ /* XXX static for simplicity */
+
+ HTMaxStrategyNumber,
+ (StrategyTransformMap)HTNegate,
+ (StrategyTransformMap)HTCommute,
+ (StrategyTransformMap)HTNegateCommute,
+ {NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL}
+};
+
+/* ----------------------------------------------------------------
+ * RelationGetHashStrategy
+ * ----------------------------------------------------------------
+ */
+
+StrategyNumber
+_hash_getstrat(Relation rel,
+ AttrNumber attno,
+ RegProcedure proc)
+{
+ StrategyNumber strat;
+
+ strat = RelationGetStrategy(rel, attno, &HTEvaluationData, proc);
+
+ Assert(StrategyNumberIsValid(strat));
+
+ return (strat);
+}
+
+bool
+_hash_invokestrat(Relation rel,
+ AttrNumber attno,
+ StrategyNumber strat,
+ Datum left,
+ Datum right)
+{
+ return (RelationInvokeStrategy(rel, &HTEvaluationData, attno, strat,
+ left, right));
+}
+
diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c
new file mode 100644
index 00000000000..f8f49fe7983
--- /dev/null
+++ b/src/backend/access/hash/hashutil.c
@@ -0,0 +1,147 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashutil.c--
+ * Utility code for the Postgres hash implementation.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/hash/hashutil.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "fmgr.h"
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/iqual.h"
+#include "access/hash.h"
+
+ScanKey
+_hash_mkscankey(Relation rel, IndexTuple itup, HashMetaPage metap)
+{
+ ScanKey skey;
+ TupleDesc itupdesc;
+ int natts;
+ AttrNumber i;
+ Datum arg;
+ RegProcedure proc;
+ bool null;
+
+ natts = rel->rd_rel->relnatts;
+ itupdesc = RelationGetTupleDescriptor(rel);
+
+ skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));
+
+ for (i = 0; i < natts; i++) {
+ arg = index_getattr(itup, i + 1, itupdesc, &null);
+ proc = metap->hashm_procid;
+ ScanKeyEntryInitialize(&skey[i],
+ 0x0, (AttrNumber) (i + 1), proc, arg);
+ }
+
+ return (skey);
+}
+
+void
+_hash_freeskey(ScanKey skey)
+{
+ pfree(skey);
+}
+
+
+bool
+_hash_checkqual(IndexScanDesc scan, IndexTuple itup)
+{
+ if (scan->numberOfKeys > 0)
+ return (index_keytest(itup,
+ RelationGetTupleDescriptor(scan->relation),
+ scan->numberOfKeys, scan->keyData));
+ else
+ return (true);
+}
+
+HashItem
+_hash_formitem(IndexTuple itup)
+{
+ int nbytes_hitem;
+ HashItem hitem;
+ Size tuplen;
+
+ /* disallow nulls in hash keys */
+ if (itup->t_info & INDEX_NULL_MASK)
+ elog(WARN, "hash indices cannot include null keys");
+
+ /* make a copy of the index tuple with room for the sequence number */
+ tuplen = IndexTupleSize(itup);
+ nbytes_hitem = tuplen +
+ (sizeof(HashItemData) - sizeof(IndexTupleData));
+
+ hitem = (HashItem) palloc(nbytes_hitem);
+ memmove((char *) &(hitem->hash_itup), (char *) itup, tuplen);
+
+ return (hitem);
+}
+
+Bucket
+_hash_call(Relation rel, HashMetaPage metap, Datum key)
+{
+ uint32 n;
+ Bucket bucket;
+ RegProcedure proc;
+
+ proc = metap->hashm_procid;
+ n = (uint32) fmgr(proc, key);
+ bucket = n & metap->hashm_highmask;
+ if (bucket > metap->hashm_maxbucket)
+ bucket = bucket & metap->hashm_lowmask;
+ return (bucket);
+}
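+
+/*
+ * For example (illustrative values): with hashm_maxbucket = 2,
+ * hashm_highmask = 3 and hashm_lowmask = 1, a hash value of 6 gives
+ * 6 & 3 = 2, a valid bucket. A hash value of 7 gives 7 & 3 = 3, which
+ * exceeds maxbucket (bucket 3 does not exist yet), so it is masked
+ * down with the low mask: 7 & 1 = 1.
+ */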
+
+/*
+ * _hash_log2 -- returns ceil(lg2(num))
+ */
+uint32
+_hash_log2(uint32 num)
+{
+ uint32 i, limit;
+
+ limit = 1;
+ for (i = 0; limit < num; limit = limit << 1, i++)
+ ;
+ return (i);
+}
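+
+/*
+ * For example: _hash_log2(1) == 0, _hash_log2(2) == 1,
+ * _hash_log2(3) == 2 and _hash_log2(8) == 3 -- the number of
+ * doublings needed for 'limit' to reach at least 'num'.
+ */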
+
+/*
+ * _hash_checkpage -- sanity checks on the format of all hash pages
+ */
+void
+_hash_checkpage(Page page, int flags)
+{
+ PageHeader ph = (PageHeader) page;
+ HashPageOpaque opaque;
+
+ Assert(page);
+ Assert(ph->pd_lower >= (sizeof(PageHeaderData) - sizeof(ItemIdData)));
+#if 1
+ Assert(ph->pd_upper <=
+ (BLCKSZ - DOUBLEALIGN(sizeof(HashPageOpaqueData))));
+ Assert(ph->pd_special ==
+ (BLCKSZ - DOUBLEALIGN(sizeof(HashPageOpaqueData))));
+ Assert(ph->pd_opaque.od_pagesize == BLCKSZ);
+#endif
+ if (flags) {
+ opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ Assert(opaque->hasho_flag & flags);
+ }
+}
diff --git a/src/backend/access/heap/Makefile.inc b/src/backend/access/heap/Makefile.inc
new file mode 100644
index 00000000000..f4f4bbb7031
--- /dev/null
+++ b/src/backend/access/heap/Makefile.inc
@@ -0,0 +1,14 @@
+#-------------------------------------------------------------------------
+#
+# Makefile.inc--
+# Makefile for access/heap
+#
+# Copyright (c) 1994, Regents of the University of California
+#
+#
+# IDENTIFICATION
+# $Header: /cvsroot/pgsql/src/backend/access/heap/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:11 scrappy Exp $
+#
+#-------------------------------------------------------------------------
+
+SUBSRCS+= heapam.c hio.c stats.c
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
new file mode 100644
index 00000000000..4bf31efd832
--- /dev/null
+++ b/src/backend/access/heap/heapam.c
@@ -0,0 +1,1507 @@
+/*-------------------------------------------------------------------------
+ *
+ * heapam.c--
+ * heap access method code
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/heap/heapam.c,v 1.1.1.1 1996/07/09 06:21:11 scrappy Exp $
+ *
+ *
+ * INTERFACE ROUTINES
+ * heapgettup - fetch next heap tuple from a scan
+ * heap_open - open a heap relation by relationId
+ * heap_openr - open a heap relation by name
+ * heap_close - close a heap relation
+ * heap_beginscan - begin relation scan
+ * heap_rescan - restart a relation scan
+ * heap_endscan - end relation scan
+ * heap_getnext - retrieve next tuple in scan
+ * heap_fetch - retrieve tuple with tid
+ * heap_insert - insert tuple into a relation
+ * heap_delete - delete a tuple from a relation
+ * heap_replace - replace a tuple in a relation with another tuple
+ * heap_markpos - mark scan position
+ * heap_restrpos - restore position to marked location
+ *
+ * NOTES
+ * This file contains the heap_ routines which implement
+ * the POSTGRES heap access method used for all POSTGRES
+ * relations.
+ *
+ * OLD COMMENTS
+ * struct relscan hints: (struct should be made AM independent?)
+ *
+ * rs_ctid is the tid of the last tuple returned by getnext.
+ * rs_ptid and rs_ntid are the tids of the previous and next tuples
+ * returned by getnext, respectively. NULL indicates an end of
+ * scan (either direction); NON indicates an unknown value.
+ *
+ * possible combinations:
+ * rs_p rs_c rs_n interpretation
+ * NULL NULL NULL empty scan
+ * NULL NULL NON at beginning of scan
+ * NULL NULL t1 at beginning of scan (with cached tid)
+ * NON NULL NULL at end of scan
+ * t1 NULL NULL at end of scan (with cached tid)
+ * NULL t1 NULL just returned only tuple
+ * NULL t1 NON just returned first tuple
+ * NULL t1 t2 returned first tuple (with cached tid)
+ * NON t1 NULL just returned last tuple
+ * t2 t1 NULL returned last tuple (with cached tid)
+ * t1 t2 NON in the middle of a forward scan
+ * NON t2 t1 in the middle of a reverse scan
+ * ti tj tk in the middle of a scan (w cached tid)
+ *
+ * Here NULL is ...tup == NULL && ...buf == InvalidBuffer,
+ * and NON is ...tup == NULL && ...buf == UnknownBuffer.
+ *
+ * Currently, the NONTID values are not cached with their actual
+ * values by getnext. Values may be cached by markpos since it stores
+ * all three tids.
+ *
+ * NOTE: the calls to elog() must stop. Should decide on an interface
+ * between the general and specific AM calls.
+ *
+ * XXX probably do not need a free tuple routine for heaps.
+ * Huh? Free tuple is not necessary for tuples returned by scans, but
+ * is necessary for tuples which are returned by
+ * RelationGetTupleByItemPointer. -hirohama
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <sys/file.h>
+#include <string.h>
+
+#include "postgres.h"
+
+#include "access/attnum.h"
+#include "access/heapam.h"
+#include "access/hio.h"
+#include "access/htup.h"
+#include "access/relscan.h"
+#include "access/skey.h"
+
+#include "utils/tqual.h"
+#include "access/valid.h"
+#include "access/xact.h"
+
+#include "catalog/catalog.h"
+#include "catalog/catname.h"
+#include "storage/buf.h"
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+#include "storage/itemid.h"
+#include "storage/itemptr.h"
+#include "storage/lmgr.h"
+
+#include "tcop/tcopdebug.h"
+#include "miscadmin.h"
+
+#include "utils/memutils.h"
+#include "utils/palloc.h"
+#include "fmgr.h"
+#include "utils/inval.h"
+#include "utils/elog.h"
+#include "utils/mcxt.h"
+#include "utils/rel.h"
+#include "utils/relcache.h"
+
+static bool ImmediateInvalidation;
+
+/* ----------------------------------------------------------------
+ * heap support routines
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------
+ * initsdesc - sdesc code common to heap_beginscan and heap_rescan
+ * ----------------
+ */
+static void
+initsdesc(HeapScanDesc sdesc,
+ Relation relation,
+ int atend,
+ unsigned nkeys,
+ ScanKey key)
+{
+ if (!RelationGetNumberOfBlocks(relation)) {
+ /* ----------------
+ * relation is empty
+ * ----------------
+ */
+ sdesc->rs_ntup = sdesc->rs_ctup = sdesc->rs_ptup = NULL;
+ sdesc->rs_nbuf = sdesc->rs_cbuf = sdesc->rs_pbuf = InvalidBuffer;
+ } else if (atend) {
+ /* ----------------
+ * reverse scan
+ * ----------------
+ */
+ sdesc->rs_ntup = sdesc->rs_ctup = NULL;
+ sdesc->rs_nbuf = sdesc->rs_cbuf = InvalidBuffer;
+ sdesc->rs_ptup = NULL;
+ sdesc->rs_pbuf = UnknownBuffer;
+ } else {
+ /* ----------------
+ * forward scan
+ * ----------------
+ */
+ sdesc->rs_ctup = sdesc->rs_ptup = NULL;
+ sdesc->rs_cbuf = sdesc->rs_pbuf = InvalidBuffer;
+ sdesc->rs_ntup = NULL;
+ sdesc->rs_nbuf = UnknownBuffer;
+ } /* invalid too */
+
+ /* we don't have a marked position... */
+ ItemPointerSetInvalid(&(sdesc->rs_mptid));
+ ItemPointerSetInvalid(&(sdesc->rs_mctid));
+ ItemPointerSetInvalid(&(sdesc->rs_mntid));
+ ItemPointerSetInvalid(&(sdesc->rs_mcd));
+
+ /* ----------------
+ * copy the scan key, if appropriate
+ * ----------------
+ */
+ if (key != NULL)
+ memmove(sdesc->rs_key, key, nkeys * sizeof(ScanKeyData));
+}
+
+/* ----------------
+ * unpinsdesc - code common to heap_rescan and heap_endscan
+ * ----------------
+ */
+static void
+unpinsdesc(HeapScanDesc sdesc)
+{
+ if (BufferIsValid(sdesc->rs_pbuf)) {
+ ReleaseBuffer(sdesc->rs_pbuf);
+ }
+
+ /* ------------------------------------
+ * The scan pins a buffer once for each non-NULL tuple pointer
+ * (ptup, ctup, ntup), so a buffer may have to be unpinned
+ * multiple times.
+ * ------------------------------------
+ */
+ if (BufferIsValid(sdesc->rs_cbuf)) {
+ ReleaseBuffer(sdesc->rs_cbuf);
+ }
+
+ if (BufferIsValid(sdesc->rs_nbuf)) {
+ ReleaseBuffer(sdesc->rs_nbuf);
+ }
+}
+
+/* ------------------------------------------
+ * nextpage
+ *
+ * figure out the next page to scan after the current page,
+ * taking into account possible adjustments to the degree of
+ * parallelism
+ * ------------------------------------------
+ */
+static int
+nextpage(int page, int dir)
+{
+ return((dir<0)?page-1:page+1);
+}
+
+/* ----------------
+ * heapgettup - fetch next heap tuple
+ *
+ * routine used by heap_getnext() which does most of the
+ * real work in scanning tuples.
+ * ----------------
+ */
+static HeapTuple
+heapgettup(Relation relation,
+ ItemPointer tid,
+ int dir,
+ Buffer *b,
+ TimeQual timeQual,
+ int nkeys,
+ ScanKey key)
+{
+ ItemId lpp;
+ Page dp;
+ int page;
+ int pages;
+ int lines;
+ HeapTuple rtup;
+ OffsetNumber lineoff;
+ int linesleft;
+
+ /* ----------------
+ * increment access statistics
+ * ----------------
+ */
+ IncrHeapAccessStat(local_heapgettup);
+ IncrHeapAccessStat(global_heapgettup);
+
+ /* ----------------
+ * debugging stuff
+ *
+ * check validity of arguments, here and for other functions too
+ * Note: no locking manipulations needed--this is a local function
+ * ----------------
+ */
+#ifdef HEAPDEBUGALL
+ if (ItemPointerIsValid(tid)) {
+ elog(DEBUG, "heapgettup(%.16s, tid=0x%x[%d,%d], dir=%d, ...)",
+ RelationGetRelationName(relation), tid, tid->ip_blkid,
+ tid->ip_posid, dir);
+ } else {
+ elog(DEBUG, "heapgettup(%.16s, tid=0x%x, dir=%d, ...)",
+ RelationGetRelationName(relation), tid, dir);
+ }
+ elog(DEBUG, "heapgettup(..., b=0x%x, timeQ=0x%x, nkeys=%d, key=0x%x",
+ b, timeQual, nkeys, key);
+ if (timeQual == SelfTimeQual) {
+ elog(DEBUG, "heapgettup: relation(%c)=`%.16s', SelfTimeQual",
+ relation->rd_rel->relkind, &relation->rd_rel->relname);
+ } else {
+ elog(DEBUG, "heapgettup: relation(%c)=`%.16s', timeQual=%d",
+ relation->rd_rel->relkind, &relation->rd_rel->relname,
+ timeQual);
+ }
+#endif /* !defined(HEAPDEBUGALL) */
+
+ if (!ItemPointerIsValid(tid)) {
+ Assert(!PointerIsValid(tid));
+ }
+
+ /* ----------------
+ * return null immediately if relation is empty
+ * ----------------
+ */
+ if (!(pages = relation->rd_nblocks))
+ return (NULL);
+
+ /* ----------------
+ * calculate next starting lineoff, given scan direction
+ * ----------------
+ */
+ if (!dir) {
+ /* ----------------
+ * ``no movement'' scan direction
+ * ----------------
+ */
+ /* assume it is a valid TID XXX */
+ if (ItemPointerIsValid(tid) == false) {
+ *b = InvalidBuffer;
+ return (NULL);
+ }
+ *b = RelationGetBufferWithBuffer(relation,
+ ItemPointerGetBlockNumber(tid),
+ *b);
+
+#ifndef NO_BUFFERISVALID
+ if (!BufferIsValid(*b)) {
+ elog(WARN, "heapgettup: failed ReadBuffer");
+ }
+#endif
+
+ dp = (Page) BufferGetPage(*b);
+ lineoff = ItemPointerGetOffsetNumber(tid);
+ lpp = PageGetItemId(dp, lineoff);
+
+ rtup = (HeapTuple)PageGetItem((Page) dp, lpp);
+ return (rtup);
+
+ } else if (dir < 0) {
+ /* ----------------
+ * reverse scan direction
+ * ----------------
+ */
+ if (ItemPointerIsValid(tid) == false) {
+ tid = NULL;
+ }
+ if (tid == NULL) {
+ page = pages - 1; /* final page */
+ } else {
+ page = ItemPointerGetBlockNumber(tid); /* current page */
+ }
+ if (page < 0) {
+ *b = InvalidBuffer;
+ return (NULL);
+ }
+
+ *b = RelationGetBufferWithBuffer(relation, page, *b);
+#ifndef NO_BUFFERISVALID
+ if (!BufferIsValid(*b)) {
+ elog(WARN, "heapgettup: failed ReadBuffer");
+ }
+#endif
+
+ dp = (Page) BufferGetPage(*b);
+ lines = PageGetMaxOffsetNumber(dp);
+ if (tid == NULL) {
+ lineoff = lines; /* final offnum */
+ } else {
+ lineoff = /* previous offnum */
+ OffsetNumberPrev(ItemPointerGetOffsetNumber(tid));
+ }
+ /* page and lineoff now reference the physically previous tid */
+
+ } else {
+ /* ----------------
+ * forward scan direction
+ * ----------------
+ */
+ if (ItemPointerIsValid(tid) == false) {
+ page = 0; /* first page */
+ lineoff = FirstOffsetNumber; /* first offnum */
+ } else {
+ page = ItemPointerGetBlockNumber(tid); /* current page */
+ lineoff = /* next offnum */
+ OffsetNumberNext(ItemPointerGetOffsetNumber(tid));
+ }
+
+ if (page >= pages) {
+ *b = InvalidBuffer;
+ return (NULL);
+ }
+ /* page and lineoff now reference the physically next tid */
+
+ *b = RelationGetBufferWithBuffer(relation, page, *b);
+#ifndef NO_BUFFERISVALID
+ if (!BufferIsValid(*b)) {
+ elog(WARN, "heapgettup: failed ReadBuffer");
+ }
+#endif
+
+ dp = (Page) BufferGetPage(*b);
+ lines = PageGetMaxOffsetNumber(dp);
+ }
+
+ /* 'dir' is now non-zero */
+
+ /* ----------------
+ * calculate line pointer and number of remaining items
+ * to check on this page.
+ * ----------------
+ */
+ lpp = PageGetItemId(dp, lineoff);
+ if (dir < 0) {
+ linesleft = lineoff - 1;
+ } else {
+ linesleft = lines - lineoff;
+ }
+
+ /* ----------------
+ * advance the scan until we find a qualifying tuple or
+ * run out of stuff to scan
+ * ----------------
+ */
+ for (;;) {
+ while (linesleft >= 0) {
+ /* ----------------
+ * if current tuple qualifies, return it.
+ * ----------------
+ */
+ if ((rtup = heap_tuple_satisfies(lpp, relation, (PageHeader) dp,
+ timeQual, nkeys, key)) != NULL) {
+ ItemPointer iptr = &(rtup->t_ctid);
+ if (ItemPointerGetBlockNumber(iptr) != page) {
+ /*
+ * set block id to the correct page number
+ * --- this is a hack to support the virtual fragment
+ * concept
+ */
+ ItemPointerSetBlockNumber(iptr, page);
+ }
+ return (rtup);
+ }
+
+ /* ----------------
+ * otherwise move to the next item on the page
+ * ----------------
+ */
+ --linesleft;
+ if (dir < 0) {
+ --lpp; /* move back in this page's ItemId array */
+ } else {
+ ++lpp; /* move forward in this page's ItemId array */
+ }
+ }
+
+ /* ----------------
+ * if we get here, it means we've exhausted the items on
+ * this page and it's time to move to the next..
+ * ----------------
+ */
+ page = nextpage(page, dir);
+
+ /* ----------------
+ * return NULL if we've exhausted all the pages..
+ * ----------------
+ */
+ if (page < 0 || page >= pages) {
+ if (BufferIsValid(*b))
+ ReleaseBuffer(*b);
+ *b = InvalidBuffer;
+ return (NULL);
+ }
+
+ *b = ReleaseAndReadBuffer(*b, relation, page);
+
+#ifndef NO_BUFFERISVALID
+ if (!BufferIsValid(*b)) {
+ elog(WARN, "heapgettup: failed ReadBuffer");
+ }
+#endif
+ dp = (Page) BufferGetPage(*b);
+ lines = lineoff = PageGetMaxOffsetNumber((Page) dp);
+ linesleft = lines - 1;
+ if (dir < 0) {
+ lpp = PageGetItemId(dp, lineoff);
+ } else {
+ lpp = PageGetItemId(dp, FirstOffsetNumber);
+ }
+ }
+}
+
+void
+doinsert(Relation relation, HeapTuple tup)
+{
+ RelationPutHeapTupleAtEnd(relation, tup);
+ return;
+}
+
+/*
+ * HeapScanIsValid is now a macro in relscan.h -cim 4/27/91
+ */
+
+/* ----------------
+ * SetHeapAccessMethodImmediateInvalidation
+ * ----------------
+ */
+void
+SetHeapAccessMethodImmediateInvalidation(bool on)
+{
+ ImmediateInvalidation = on;
+}
+
+/* ----------------------------------------------------------------
+ * heap access method interface
+ * ----------------------------------------------------------------
+ */
+/* ----------------
+ * heap_open - open a heap relation by relationId
+ *
+ * presently the relcache routines do all the work we need
+ * to open/close heap relations.
+ * ----------------
+ */
+Relation
+heap_open(Oid relationId)
+{
+ Relation r;
+
+ /* ----------------
+ * increment access statistics
+ * ----------------
+ */
+ IncrHeapAccessStat(local_open);
+ IncrHeapAccessStat(global_open);
+
+ r = (Relation) RelationIdGetRelation(relationId);
+
+ if (RelationIsValid(r) && r->rd_rel->relkind == RELKIND_INDEX) {
+ elog(WARN, "%s is an index relation", r->rd_rel->relname.data);
+ }
+
+ return (r);
+}
+
+/* ----------------
+ * heap_openr - open a heap relation by name
+ *
+ * presently the relcache routines do all the work we need
+ * to open/close heap relations.
+ * ----------------
+ */
+Relation
+heap_openr(char *relationName)
+{
+ Relation r;
+
+ /* ----------------
+ * increment access statistics
+ * ----------------
+ */
+ IncrHeapAccessStat(local_openr);
+ IncrHeapAccessStat(global_openr);
+
+ r = RelationNameGetRelation(relationName);
+
+ if (RelationIsValid(r) && r->rd_rel->relkind == RELKIND_INDEX) {
+ elog(WARN, "%s is an index relation", r->rd_rel->relname.data);
+ }
+
+ return (r);
+}
+
+/* ----------------
+ * heap_close - close a heap relation
+ *
+ * presently the relcache routines do all the work we need
+ * to open/close heap relations.
+ * ----------------
+ */
+void
+heap_close(Relation relation)
+{
+ /* ----------------
+ * increment access statistics
+ * ----------------
+ */
+ IncrHeapAccessStat(local_close);
+ IncrHeapAccessStat(global_close);
+
+ (void) RelationClose(relation);
+}
+
+
+/* ----------------
+ * heap_beginscan - begin relation scan
+ * ----------------
+ */
+HeapScanDesc
+heap_beginscan(Relation relation,
+ int atend,
+ TimeQual timeQual,
+ unsigned nkeys,
+ ScanKey key)
+{
+ HeapScanDesc sdesc;
+
+ /* ----------------
+ * increment access statistics
+ * ----------------
+ */
+ IncrHeapAccessStat(local_beginscan);
+ IncrHeapAccessStat(global_beginscan);
+
+ /* ----------------
+ * sanity checks
+ * ----------------
+ */
+ if (RelationIsValid(relation) == false)
+ elog(WARN, "heap_beginscan: !RelationIsValid(relation)");
+
+ /* ----------------
+ * set relation level read lock
+ * ----------------
+ */
+ RelationSetLockForRead(relation);
+
+ /* XXX someday assert SelfTimeQual if relkind == RELKIND_UNCATALOGED */
+ if (relation->rd_rel->relkind == RELKIND_UNCATALOGED) {
+ timeQual = SelfTimeQual;
+ }
+
+ /* ----------------
+ * increment relation ref count while scanning relation
+ * ----------------
+ */
+ RelationIncrementReferenceCount(relation);
+
+ /* ----------------
+ * allocate and initialize scan descriptor
+ * ----------------
+ */
+ sdesc = (HeapScanDesc) palloc(sizeof(HeapScanDescData));
+
+ relation->rd_nblocks = smgrnblocks(relation->rd_rel->relsmgr, relation);
+ sdesc->rs_rd = relation;
+
+ if (nkeys) {
+ /*
+ * we do this here instead of in initsdesc() because heap_rescan also
+ * calls initsdesc() and we don't want to allocate memory again
+ */
+ sdesc->rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
+ } else {
+ sdesc->rs_key = NULL;
+ }
+
+ initsdesc(sdesc, relation, atend, nkeys, key);
+
+ sdesc->rs_atend = atend;
+ sdesc->rs_tr = timeQual;
+ sdesc->rs_nkeys = (short)nkeys;
+
+ return (sdesc);
+}
+
+/* ----------------
+ * heap_rescan - restart a relation scan
+ * ----------------
+ */
+void
+heap_rescan(HeapScanDesc sdesc,
+ bool scanFromEnd,
+ ScanKey key)
+{
+ /* ----------------
+ * increment access statistics
+ * ----------------
+ */
+ IncrHeapAccessStat(local_rescan);
+ IncrHeapAccessStat(global_rescan);
+
+ /* Note: set relation level read lock is still set */
+
+ /* ----------------
+ * unpin scan buffers
+ * ----------------
+ */
+ unpinsdesc(sdesc);
+
+ /* ----------------
+ * reinitialize scan descriptor
+ * ----------------
+ */
+ initsdesc(sdesc, sdesc->rs_rd, scanFromEnd, sdesc->rs_nkeys, key);
+ sdesc->rs_atend = (bool) scanFromEnd;
+}
+
+/* ----------------
+ * heap_endscan - end relation scan
+ *
+ * See how to integrate with index scans.
+ * Check handling of reldesc caching.
+ * ----------------
+ */
+void
+heap_endscan(HeapScanDesc sdesc)
+{
+ /* ----------------
+ * increment access statistics
+ * ----------------
+ */
+ IncrHeapAccessStat(local_endscan);
+ IncrHeapAccessStat(global_endscan);
+
+ /* Note: no locking manipulations needed */
+
+ /* ----------------
+ * unpin scan buffers
+ * ----------------
+ */
+ unpinsdesc(sdesc);
+
+ /* ----------------
+ * decrement relation reference count and free scan descriptor storage
+ * ----------------
+ */
+ RelationDecrementReferenceCount(sdesc->rs_rd);
+
+ /* ----------------
+ * release non-2-phase read locks on catalog relations
+ * ----------------
+ */
+ if ( IsSystemRelationName(RelationGetRelationName(sdesc->rs_rd)->data) )
+ RelationUnsetLockForRead(sdesc->rs_rd);
+
+ pfree(sdesc); /* XXX */
+}
+
+/* ----------------
+ * heap_getnext - retrieve next tuple in scan
+ *
+ * Fix to work with index relations.
+ * ----------------
+ */
+
+#ifdef HEAPDEBUGALL
+#define HEAPDEBUG_1 \
+elog(DEBUG, "heap_getnext([%s,nkeys=%d],backw=%d,0x%x) called", \
+ sdesc->rs_rd->rd_rel->relname.data, sdesc->rs_nkeys, backw, b)
+
+#define HEAPDEBUG_2 \
+ elog(DEBUG, "heap_getnext called with backw (no tracing yet)")
+
+#define HEAPDEBUG_3 \
+ elog(DEBUG, "heap_getnext returns NULL at end")
+
+#define HEAPDEBUG_4 \
+ elog(DEBUG, "heap_getnext valid buffer UNPIN'd")
+
+#define HEAPDEBUG_5 \
+ elog(DEBUG, "heap_getnext next tuple was cached")
+
+#define HEAPDEBUG_6 \
+ elog(DEBUG, "heap_getnext returning EOS")
+
+#define HEAPDEBUG_7 \
+ elog(DEBUG, "heap_getnext returning tuple");
+#else
+#define HEAPDEBUG_1
+#define HEAPDEBUG_2
+#define HEAPDEBUG_3
+#define HEAPDEBUG_4
+#define HEAPDEBUG_5
+#define HEAPDEBUG_6
+#define HEAPDEBUG_7
+#endif /* !defined(HEAPDEBUGALL) */
+
+
+HeapTuple
+heap_getnext(HeapScanDesc scandesc,
+ int backw,
+ Buffer *b)
+{
+ register HeapScanDesc sdesc = scandesc;
+ Buffer localb;
+
+ /* ----------------
+ * increment access statistics
+ * ----------------
+ */
+ IncrHeapAccessStat(local_getnext);
+ IncrHeapAccessStat(global_getnext);
+
+ /* Note: no locking manipulations needed */
+
+ /* ----------------
+ * argument checks
+ * ----------------
+ */
+ if (sdesc == NULL)
+ elog(WARN, "heap_getnext: NULL relscan");
+
+ /* ----------------
+ * initialize return buffer to InvalidBuffer
+ * ----------------
+ */
+ if (! PointerIsValid(b)) b = &localb;
+ (*b) = InvalidBuffer;
+
+ HEAPDEBUG_1; /* heap_getnext( info ) */
+
+ if (backw) {
+ /* ----------------
+ * handle reverse scan
+ * ----------------
+ */
+ HEAPDEBUG_2; /* heap_getnext called with backw */
+
+ if (sdesc->rs_ptup == sdesc->rs_ctup &&
+ BufferIsInvalid(sdesc->rs_pbuf))
+ {
+ if (BufferIsValid(sdesc->rs_nbuf))
+ ReleaseBuffer(sdesc->rs_nbuf);
+ return (NULL);
+ }
+
+ /*
+ * Copy the "current" tuple/buffer
+ * to "next". Pin/unpin the buffers
+ * accordingly
+ */
+ if (sdesc->rs_nbuf != sdesc->rs_cbuf) {
+ if (BufferIsValid(sdesc->rs_nbuf))
+ ReleaseBuffer(sdesc->rs_nbuf);
+ if (BufferIsValid(sdesc->rs_cbuf))
+ IncrBufferRefCount(sdesc->rs_cbuf);
+ }
+ sdesc->rs_ntup = sdesc->rs_ctup;
+ sdesc->rs_nbuf = sdesc->rs_cbuf;
+
+ if (sdesc->rs_ptup != NULL) {
+ if (sdesc->rs_cbuf != sdesc->rs_pbuf) {
+ if (BufferIsValid(sdesc->rs_cbuf))
+ ReleaseBuffer(sdesc->rs_cbuf);
+ if (BufferIsValid(sdesc->rs_pbuf))
+ IncrBufferRefCount(sdesc->rs_pbuf);
+ }
+ sdesc->rs_ctup = sdesc->rs_ptup;
+ sdesc->rs_cbuf = sdesc->rs_pbuf;
+ } else { /* NONTUP */
+ ItemPointer iptr;
+
+ iptr = (sdesc->rs_ctup != NULL) ?
+ &(sdesc->rs_ctup->t_ctid) : (ItemPointer) NULL;
+
+ /* Don't release sdesc->rs_cbuf at this point, because
+ heapgettup doesn't increase PrivateRefCount if it
+ is already set. On a backward scan, both rs_ctup and rs_ntup
+ usually point to the same buffer page, so
+ PrivateRefCount[rs_cbuf] should be 2 (or more, if for instance
+ ctup is stored in a TupleTableSlot). - 01/09/94 */
+
+ sdesc->rs_ctup = (HeapTuple)
+ heapgettup(sdesc->rs_rd,
+ iptr,
+ -1,
+ &(sdesc->rs_cbuf),
+ sdesc->rs_tr,
+ sdesc->rs_nkeys,
+ sdesc->rs_key);
+ }
+
+ if (sdesc->rs_ctup == NULL && !BufferIsValid(sdesc->rs_cbuf))
+ {
+ if (BufferIsValid(sdesc->rs_pbuf))
+ ReleaseBuffer(sdesc->rs_pbuf);
+ sdesc->rs_ptup = NULL;
+ sdesc->rs_pbuf = InvalidBuffer;
+ if (BufferIsValid(sdesc->rs_nbuf))
+ ReleaseBuffer(sdesc->rs_nbuf);
+ sdesc->rs_ntup = NULL;
+ sdesc->rs_nbuf = InvalidBuffer;
+ return (NULL);
+ }
+
+ if (BufferIsValid(sdesc->rs_pbuf))
+ ReleaseBuffer(sdesc->rs_pbuf);
+ sdesc->rs_ptup = NULL;
+ sdesc->rs_pbuf = UnknownBuffer;
+
+ } else {
+ /* ----------------
+ * handle forward scan
+ * ----------------
+ */
+ if (sdesc->rs_ctup == sdesc->rs_ntup &&
+ BufferIsInvalid(sdesc->rs_nbuf)) {
+ if (BufferIsValid(sdesc->rs_pbuf))
+ ReleaseBuffer(sdesc->rs_pbuf);
+ HEAPDEBUG_3; /* heap_getnext returns NULL at end */
+ return (NULL);
+ }
+
+ /*
+ * Copy the "current" tuple/buffer
+ * to "previous". Pin/unpin the buffers
+ * accordingly
+ */
+ if (sdesc->rs_pbuf != sdesc->rs_cbuf) {
+ if (BufferIsValid(sdesc->rs_pbuf))
+ ReleaseBuffer(sdesc->rs_pbuf);
+ if (BufferIsValid(sdesc->rs_cbuf))
+ IncrBufferRefCount(sdesc->rs_cbuf);
+ }
+ sdesc->rs_ptup = sdesc->rs_ctup;
+ sdesc->rs_pbuf = sdesc->rs_cbuf;
+
+ if (sdesc->rs_ntup != NULL) {
+ if (sdesc->rs_cbuf != sdesc->rs_nbuf) {
+ if (BufferIsValid(sdesc->rs_cbuf))
+ ReleaseBuffer(sdesc->rs_cbuf);
+ if (BufferIsValid(sdesc->rs_nbuf))
+ IncrBufferRefCount(sdesc->rs_nbuf);
+ }
+ sdesc->rs_ctup = sdesc->rs_ntup;
+ sdesc->rs_cbuf = sdesc->rs_nbuf;
+ HEAPDEBUG_5; /* heap_getnext next tuple was cached */
+ } else { /* NONTUP */
+ ItemPointer iptr;
+
+ iptr = (sdesc->rs_ctup != NULL) ?
+ &sdesc->rs_ctup->t_ctid : (ItemPointer) NULL;
+
+ /* Don't release sdesc->rs_cbuf at this point, because
+ heapgettup doesn't increase PrivateRefCount if it
+ is already set. On a forward scan, both rs_ctup and rs_ptup
+ usually point to the same buffer page, so
+ PrivateRefCount[rs_cbuf] should be 2 (or more, if for instance
+ ctup is stored in a TupleTableSlot). - 01/09/93 */
+
+ sdesc->rs_ctup = (HeapTuple)
+ heapgettup(sdesc->rs_rd,
+ iptr,
+ 1,
+ &sdesc->rs_cbuf,
+ sdesc->rs_tr,
+ sdesc->rs_nkeys,
+ sdesc->rs_key);
+ }
+
+ if (sdesc->rs_ctup == NULL && !BufferIsValid(sdesc->rs_cbuf)) {
+ if (BufferIsValid(sdesc->rs_nbuf))
+ ReleaseBuffer(sdesc->rs_nbuf);
+ sdesc->rs_ntup = NULL;
+ sdesc->rs_nbuf = InvalidBuffer;
+ if (BufferIsValid(sdesc->rs_pbuf))
+ ReleaseBuffer(sdesc->rs_pbuf);
+ sdesc->rs_ptup = NULL;
+ sdesc->rs_pbuf = InvalidBuffer;
+ HEAPDEBUG_6; /* heap_getnext returning EOS */
+ return (NULL);
+ }
+
+ if (BufferIsValid(sdesc->rs_nbuf))
+ ReleaseBuffer(sdesc->rs_nbuf);
+ sdesc->rs_ntup = NULL;
+ sdesc->rs_nbuf = UnknownBuffer;
+ }
+
+ /* ----------------
+ * if we get here it means we have a new current scan tuple, so
+ * point to the proper return buffer and return the tuple.
+ * ----------------
+ */
+ (*b) = sdesc->rs_cbuf;
+
+ HEAPDEBUG_7; /* heap_getnext returning tuple */
+
+ return (sdesc->rs_ctup);
+}
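+
+/*
+ * Illustrative sketch (not part of the original file): a minimal
+ * forward sequential scan built from the routines above. Transaction
+ * setup and error handling are omitted, and "myrel" is a hypothetical
+ * relation name.
+ *
+ *	Relation rel = heap_openr("myrel");
+ *	HeapScanDesc sdesc;
+ *	HeapTuple tup;
+ *	Buffer buf;
+ *
+ *	sdesc = heap_beginscan(rel, 0, NowTimeQual, 0, (ScanKey) NULL);
+ *	while ((tup = heap_getnext(sdesc, 0, &buf)) != NULL) {
+ *		... examine tup; its buffer stays pinned by the scan ...
+ *	}
+ *	heap_endscan(sdesc);
+ *	heap_close(rel);
+ */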
+
+/* ----------------
+ * heap_fetch - retrieve tuple with tid
+ *
+ * Currently ignores LP_IVALID during processing!
+ * ----------------
+ */
+HeapTuple
+heap_fetch(Relation relation,
+ TimeQual timeQual,
+ ItemPointer tid,
+ Buffer *b)
+{
+ ItemId lp;
+ Buffer buffer;
+ PageHeader dp;
+ HeapTuple tuple;
+ OffsetNumber offnum;
+
+ /* ----------------
+ * increment access statistics
+ * ----------------
+ */
+ IncrHeapAccessStat(local_fetch);
+ IncrHeapAccessStat(global_fetch);
+
+ /*
+ * Note: This is colossally expensive - does two system calls per
+ * indexscan tuple fetch. Not good, and since we should be doing
+ * page level locking by the scanner anyway, it is commented out.
+ */
+
+ /* RelationSetLockForTupleRead(relation, tid); */
+
+ /* ----------------
+ * get the buffer from the relation descriptor
+ * Note that this does a buffer pin.
+ * ----------------
+ */
+
+ buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
+
+#ifndef NO_BUFFERISVALID
+ if (!BufferIsValid(buffer)) {
+ elog(WARN, "heap_fetch: %s relation: ReadBuffer(%lx) failed",
+ &relation->rd_rel->relname, (long)tid);
+ }
+#endif
+
+ /* ----------------
+ * get the item line pointer corresponding to the requested tid
+ * ----------------
+ */
+ dp = (PageHeader) BufferGetPage(buffer);
+ offnum = ItemPointerGetOffsetNumber(tid);
+ lp = PageGetItemId(dp, offnum);
+
+ /* ----------------
+ * more sanity checks
+ * ----------------
+ */
+
+ Assert(ItemIdIsUsed(lp));
+
+ /* ----------------
+ * check time qualification of tid
+ * ----------------
+ */
+
+ tuple = heap_tuple_satisfies(lp, relation, dp,
+ timeQual, 0,(ScanKey)NULL);
+
+ if (tuple == NULL)
+ {
+ ReleaseBuffer(buffer);
+ return (NULL);
+ }
+
+ /* ----------------
+ * all checks passed, now either return a copy of the tuple
+ * or pin the buffer page and return a pointer, depending on
+ * whether caller gave us a valid b.
+ * ----------------
+ */
+
+ if (PointerIsValid(b)) {
+ *b = buffer;
+ } else {
+ tuple = heap_copytuple(tuple);
+ ReleaseBuffer(buffer);
+ }
+ return (tuple);
+}
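+
+/*
+ * Illustrative note (not in the original source): the Buffer argument
+ * selects between two return conventions. Assuming rel and tid are
+ * already set up:
+ *
+ *	Buffer buf;
+ *	HeapTuple tup = heap_fetch(rel, NowTimeQual, &tid, &buf);
+ *	if (tup != NULL) {
+ *		... use tup, then ReleaseBuffer(buf) ...
+ *	}
+ *
+ * Passing a NULL Buffer pointer instead returns a palloc'd copy of
+ * the tuple and holds no pin.
+ */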
+
+/* ----------------
+ * heap_insert - insert tuple
+ *
+ * The assignment of t_min (and thus the others) should be
+ * removed eventually.
+ *
+ * Currently places the tuple onto the last page. If there is no room,
+ * it is placed on new pages. (Heap relations)
+ * Note that concurrent inserts during a scan will probably have
+ * unexpected results, though this will be fixed eventually.
+ *
+ * Fix to work with indexes.
+ * ----------------
+ */
+Oid
+heap_insert(Relation relation, HeapTuple tup)
+{
+ /* ----------------
+ * increment access statistics
+ * ----------------
+ */
+ IncrHeapAccessStat(local_insert);
+ IncrHeapAccessStat(global_insert);
+
+ /* ----------------
+ * set relation level write lock. If this is a "local" relation (not
+ * visible to others), we don't need to set a write lock.
+ * ----------------
+ */
+ if (!relation->rd_islocal)
+ RelationSetLockForWrite(relation);
+
+ /* ----------------
+ * If the object id of this tuple has already been assigned, trust
+ * the caller. There are a couple of ways this can happen. At initial
+ * db creation, the backend program sets oids for tuples. When we
+ * define an index, we set the oid. Finally, in the future, we may
+ * allow users to set their own object ids in order to support a
+ * persistent object store (objects need to contain pointers to one
+ * another).
+ * ----------------
+ */
+ if (!OidIsValid(tup->t_oid)) {
+ tup->t_oid = newoid();
+ LastOidProcessed = tup->t_oid;
+ }
+
+ TransactionIdStore(GetCurrentTransactionId(), &(tup->t_xmin));
+ tup->t_cmin = GetCurrentCommandId();
+ StoreInvalidTransactionId(&(tup->t_xmax));
+ tup->t_tmin = INVALID_ABSTIME;
+ tup->t_tmax = CURRENT_ABSTIME;
+
+ doinsert(relation, tup);
+
+ if ( IsSystemRelationName(RelationGetRelationName(relation)->data)) {
+ RelationUnsetLockForWrite(relation);
+
+ /* ----------------
+ * invalidate caches (only works for system relations)
+ * ----------------
+ */
+ SetRefreshWhenInvalidate(ImmediateInvalidation);
+ RelationInvalidateHeapTuple(relation, tup);
+ SetRefreshWhenInvalidate((bool)!ImmediateInvalidation);
+ }
+
+ return(tup->t_oid);
+}
+
+/* ----------------
+ * heap_delete - delete a tuple
+ *
+ * Must decide how to handle errors.
+ * ----------------
+ */
+void
+heap_delete(Relation relation, ItemPointer tid)
+{
+ ItemId lp;
+ HeapTuple tp;
+ PageHeader dp;
+ Buffer b;
+
+ /* ----------------
+ * increment access statistics
+ * ----------------
+ */
+ IncrHeapAccessStat(local_delete);
+ IncrHeapAccessStat(global_delete);
+
+ /* ----------------
+ * sanity check
+ * ----------------
+ */
+ Assert(ItemPointerIsValid(tid));
+
+ /* ----------------
+ * set relation level write lock
+ * ----------------
+ */
+ RelationSetLockForWrite(relation);
+
+ b = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
+
+#ifndef NO_BUFFERISVALID
+ if (!BufferIsValid(b)) { /* XXX L_SH better ??? */
+ elog(WARN, "heap_delete: failed ReadBuffer");
+ }
+#endif /* NO_BUFFERISVALID */
+
+ dp = (PageHeader) BufferGetPage(b);
+ lp = PageGetItemId(dp, ItemPointerGetOffsetNumber(tid));
+
+ /* ----------------
+ * check that we're deleting a valid item
+ * ----------------
+ */
+ if (!(tp = heap_tuple_satisfies(lp, relation, dp,
+ NowTimeQual, 0, (ScanKey) NULL))) {
+
+ /* XXX call something else */
+ ReleaseBuffer(b);
+
+ elog(WARN, "heap_delete: (am)invalid tid");
+ }
+
+ /* ----------------
+ * get the tuple and lock it; tell the buffer manager we want
+ * exclusive access to the page
+ * ----------------
+ */
+
+ /* ----------------
+ * store transaction information of xact deleting the tuple
+ * ----------------
+ */
+ TransactionIdStore(GetCurrentTransactionId(), &(tp->t_xmax));
+ tp->t_cmax = GetCurrentCommandId();
+ ItemPointerSetInvalid(&tp->t_chain);
+
+ /* ----------------
+ * invalidate caches
+ * ----------------
+ */
+ SetRefreshWhenInvalidate(ImmediateInvalidation);
+ RelationInvalidateHeapTuple(relation, tp);
+ SetRefreshWhenInvalidate((bool)!ImmediateInvalidation);
+
+ WriteBuffer(b);
+ if ( IsSystemRelationName(RelationGetRelationName(relation)->data) )
+ RelationUnsetLockForWrite(relation);
+}
+
+/* ----------------
+ * heap_replace - replace a tuple
+ *
+ * Must decide how to handle errors.
+ *
+ * Fix arguments, work with indexes.
+ *
+ * 12/30/93 - modified the return value to be 1 when
+ * a non-functional update is detected. This
+ * prevents the calling routine from updating
+ * indices unnecessarily. -kw
+ *
+ * ----------------
+ */
+int
+heap_replace(Relation relation, ItemPointer otid, HeapTuple tup)
+{
+ ItemId lp;
+ HeapTuple tp;
+ Page dp;
+ Buffer buffer;
+
+ /* ----------------
+ * increment access statistics
+ * ----------------
+ */
+ IncrHeapAccessStat(local_replace);
+ IncrHeapAccessStat(global_replace);
+
+ /* ----------------
+ * sanity checks
+ * ----------------
+ */
+ Assert(ItemPointerIsValid(otid));
+
+ /* ----------------
+ * set relation level write lock
+ * ----------------
+ */
+ if (!relation->rd_islocal)
+ RelationSetLockForWrite(relation);
+
+ buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(otid));
+#ifndef NO_BUFFERISVALID
+ if (!BufferIsValid(buffer)) {
+ /* XXX L_SH better ??? */
+ elog(WARN, "amreplace: failed ReadBuffer");
+ }
+#endif /* NO_BUFFERISVALID */
+
+ dp = (Page) BufferGetPage(buffer);
+ lp = PageGetItemId(dp, ItemPointerGetOffsetNumber(otid));
+
+ /* ----------------
+ * logically delete old item
+ * ----------------
+ */
+
+ tp = (HeapTuple) PageGetItem(dp, lp);
+ Assert(HeapTupleIsValid(tp));
+
+ /* -----------------
+ * the following test should be able to catch all non-functional
+ * update attempts and shut out all ghost tuples.
+ * XXX In the future, Spyros may need to update the rule lock on a tuple
+ * more than once within the same command and same transaction.
+ * He will have to introduce a new flag to override the following check.
+ * -- Wei
+ *
+ * -----------------
+ */
+
+ if (TupleUpdatedByCurXactAndCmd(tp)) {
+ elog(NOTICE, "Non-functional update, only first update is performed");
+ if ( IsSystemRelationName(RelationGetRelationName(relation)->data) )
+ RelationUnsetLockForWrite(relation);
+ ReleaseBuffer(buffer);
+ return(1);
+ }
+
+ /* ----------------
+ * check that we're replacing a valid item -
+ *
+ * NOTE that this check must follow the non-functional update test
+ * above as it can happen that we try to 'replace' the same tuple
+ * twice in a single transaction. The second time around the
+ * tuple will fail the NowTimeQual. We don't want to abort the
+ * xact, we only want to flag the 'non-functional' NOTICE. -mer
+ * ----------------
+ */
+ if (!heap_tuple_satisfies(lp,
+ relation,
+ (PageHeader)dp,
+ NowTimeQual,
+ 0,
+ (ScanKey)NULL))
+ {
+ ReleaseBuffer(buffer);
+ elog(WARN, "heap_replace: (am)invalid otid");
+ }
+
+ /* XXX order problems if not atomic assignment ??? */
+ tup->t_oid = tp->t_oid;
+ TransactionIdStore(GetCurrentTransactionId(), &(tup->t_xmin));
+ tup->t_cmin = GetCurrentCommandId();
+ StoreInvalidTransactionId(&(tup->t_xmax));
+ tup->t_tmin = INVALID_ABSTIME;
+ tup->t_tmax = CURRENT_ABSTIME;
+ ItemPointerSetInvalid(&tup->t_chain);
+
+ /* ----------------
+ * insert new item
+ * ----------------
+ */
+ if ((unsigned)DOUBLEALIGN(tup->t_len) <= PageGetFreeSpace((Page) dp)) {
+ RelationPutHeapTuple(relation, BufferGetBlockNumber(buffer), tup);
+ } else {
+ /* ----------------
+ * new item won't fit on same page as old item, have to look
+ * for a new place to put it.
+ * ----------------
+ */
+ doinsert(relation, tup);
+ }
+
+ /* ----------------
+ * new item in place, now record transaction information
+ * ----------------
+ */
+ TransactionIdStore(GetCurrentTransactionId(), &(tp->t_xmax));
+ tp->t_cmax = GetCurrentCommandId();
+ tp->t_chain = tup->t_ctid;
+
+ /* ----------------
+ * invalidate caches
+ * ----------------
+ */
+ SetRefreshWhenInvalidate(ImmediateInvalidation);
+ RelationInvalidateHeapTuple(relation, tp);
+ SetRefreshWhenInvalidate((bool)!ImmediateInvalidation);
+
+ WriteBuffer(buffer);
+
+ if ( IsSystemRelationName(RelationGetRelationName(relation)->data) )
+ RelationUnsetLockForWrite(relation);
+
+ return(0);
+}
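+
+/*
+ * Illustrative note (not in the original source): per the comment
+ * above, a caller can use the return value to skip unnecessary index
+ * maintenance:
+ *
+ *	if (heap_replace(rel, &otid, newtup) == 0)
+ *		... insert index entries for newtup ...
+ */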
+
+/* ----------------
+ * heap_markpos - mark scan position
+ *
+ * Note:
+ * Only one mark should be maintained per scan at a time.
+ * Check if this can be done generally--say calls to get the
+ * next/previous tuple and NEVER pass struct scandesc to the
+ * user AM's. Now, the mark is sent to the executor for safekeeping.
+ * Probably can store this info into a GENERAL scan structure.
+ *
+ * May be best to change this call to store the marked position
+ * (up to 2?) in the scan structure itself.
+ * Fix to use the proper caching structure.
+ * ----------------
+ */
+void
+heap_markpos(HeapScanDesc sdesc)
+{
+
+ /* ----------------
+ * increment access statistics
+ * ----------------
+ */
+ IncrHeapAccessStat(local_markpos);
+ IncrHeapAccessStat(global_markpos);
+
+ /* Note: no locking manipulations needed */
+
+ if (sdesc->rs_ptup == NULL &&
+ BufferIsUnknown(sdesc->rs_pbuf)) { /* == NONTUP */
+ sdesc->rs_ptup = (HeapTuple)
+ heapgettup(sdesc->rs_rd,
+ (sdesc->rs_ctup == NULL) ?
+ (ItemPointer)NULL : &sdesc->rs_ctup->t_ctid,
+ -1,
+ &sdesc->rs_pbuf,
+ sdesc->rs_tr,
+ sdesc->rs_nkeys,
+ sdesc->rs_key);
+
+ } else if (sdesc->rs_ntup == NULL &&
+ BufferIsUnknown(sdesc->rs_nbuf)) { /* == NONTUP */
+ sdesc->rs_ntup = (HeapTuple)
+ heapgettup(sdesc->rs_rd,
+ (sdesc->rs_ctup == NULL) ?
+ (ItemPointer)NULL : &sdesc->rs_ctup->t_ctid,
+ 1,
+ &sdesc->rs_nbuf,
+ sdesc->rs_tr,
+ sdesc->rs_nkeys,
+ sdesc->rs_key);
+ }
+
+ /* ----------------
+ * Should not unpin the buffer pages. They may still be in use.
+ * ----------------
+ */
+ if (sdesc->rs_ptup != NULL) {
+ sdesc->rs_mptid = sdesc->rs_ptup->t_ctid;
+ } else {
+ ItemPointerSetInvalid(&sdesc->rs_mptid);
+ }
+ if (sdesc->rs_ctup != NULL) {
+ sdesc->rs_mctid = sdesc->rs_ctup->t_ctid;
+ } else {
+ ItemPointerSetInvalid(&sdesc->rs_mctid);
+ }
+ if (sdesc->rs_ntup != NULL) {
+ sdesc->rs_mntid = sdesc->rs_ntup->t_ctid;
+ } else {
+ ItemPointerSetInvalid(&sdesc->rs_mntid);
+ }
+}
+
+/* ----------------
+ * heap_restrpos - restore position to marked location
+ *
+ * Note: there are bad side effects here. If we were past the end
+ * of a relation when heap_markpos is called, and the relation is
+ * then extended via insert, the next call to heap_restrpos will
+ * cause the added tuples to be visible when the scan continues.
+ * Problems also arise if the TID's are rearranged!!!
+ *
+ * Now pins buffer once for each valid tuple pointer (rs_ptup,
+ * rs_ctup, rs_ntup) referencing it.
+ * - 01/13/94
+ *
+ * XXX might be better to do direct access instead of
+ * using the generality of heapgettup().
+ *
+ * XXX It is very possible that when a scan is restored, a tuple
+ * XXX which previously qualified may fail for time range purposes, unless
+ * XXX some form of locking exists (i.e., portals currently can act funny).
+ * ----------------
+ */
+void
+heap_restrpos(HeapScanDesc sdesc)
+{
+ /* ----------------
+ * increment access statistics
+ * ----------------
+ */
+ IncrHeapAccessStat(local_restrpos);
+ IncrHeapAccessStat(global_restrpos);
+
+ /* XXX no amrestrpos checking that ammarkpos called */
+
+ /* Note: no locking manipulations needed */
+
+ unpinsdesc(sdesc);
+
+ /* force heapgettup to pin buffer for each loaded tuple */
+ sdesc->rs_pbuf = InvalidBuffer;
+ sdesc->rs_cbuf = InvalidBuffer;
+ sdesc->rs_nbuf = InvalidBuffer;
+
+ if (!ItemPointerIsValid(&sdesc->rs_mptid)) {
+ sdesc->rs_ptup = NULL;
+ } else {
+ sdesc->rs_ptup = (HeapTuple)
+ heapgettup(sdesc->rs_rd,
+ &sdesc->rs_mptid,
+ 0,
+ &sdesc->rs_pbuf,
+ NowTimeQual,
+ 0,
+ (ScanKey) NULL);
+ }
+
+ if (!ItemPointerIsValid(&sdesc->rs_mctid)) {
+ sdesc->rs_ctup = NULL;
+ } else {
+ sdesc->rs_ctup = (HeapTuple)
+ heapgettup(sdesc->rs_rd,
+ &sdesc->rs_mctid,
+ 0,
+ &sdesc->rs_cbuf,
+ NowTimeQual,
+ 0,
+ (ScanKey) NULL);
+ }
+
+ if (!ItemPointerIsValid(&sdesc->rs_mntid)) {
+ sdesc->rs_ntup = NULL;
+ } else {
+ sdesc->rs_ntup = (HeapTuple)
+ heapgettup(sdesc->rs_rd,
+ &sdesc->rs_mntid,
+ 0,
+ &sdesc->rs_nbuf,
+ NowTimeQual,
+ 0,
+ (ScanKey) NULL);
+ }
+}
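+
+/*
+ * Illustrative sketch (not part of the original file): the
+ * mark/restore protocol as a caller might use it.
+ *
+ *	tup = heap_getnext(sdesc, 0, &buf);
+ *	heap_markpos(sdesc);		-- remember this position
+ *	... continue scanning with heap_getnext() ...
+ *	heap_restrpos(sdesc);		-- back up to the mark
+ *	... the scan now resumes from the marked position ...
+ */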
diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c
new file mode 100644
index 00000000000..457e1174a30
--- /dev/null
+++ b/src/backend/access/heap/hio.c
@@ -0,0 +1,195 @@
+/*-------------------------------------------------------------------------
+ *
+ * hio.c--
+ * POSTGRES heap access method input/output code.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Id: hio.c,v 1.1.1.1 1996/07/09 06:21:11 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <string.h>
+
+#include "c.h"
+
+#include "access/heapam.h"
+#include "access/hio.h"
+#include "access/htup.h"
+
+#include "storage/block.h"
+#include "storage/buf.h"
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+#include "storage/itemid.h"
+#include "storage/itemptr.h"
+#include "storage/off.h"
+
+#include "utils/memutils.h"
+#include "utils/elog.h"
+#include "utils/rel.h"
+
+/*
+ * amputunique - place tuple at tid
+ * Currently on errors, calls elog. Perhaps should return -1?
+ * Possible errors include the addition of a tuple to the page
+ * between the time the linep is chosen and the page is L_UP'd.
+ *
+ * This should be coordinated with the B-tree code.
+ * Probably needs to have an amdelunique to allow for
+ * internal index records to be deleted and reordered as needed.
+ * For the heap AM, this should never be needed.
+ */
+void
+RelationPutHeapTuple(Relation relation,
+ BlockNumber blockIndex,
+ HeapTuple tuple)
+{
+ Buffer buffer;
+ Page pageHeader;
+ BlockNumber numberOfBlocks;
+ OffsetNumber offnum;
+ unsigned int len;
+ ItemId itemId;
+ Item item;
+
+ /* ----------------
+ * increment access statistics
+ * ----------------
+ */
+ IncrHeapAccessStat(local_RelationPutHeapTuple);
+ IncrHeapAccessStat(global_RelationPutHeapTuple);
+
+ Assert(RelationIsValid(relation));
+ Assert(HeapTupleIsValid(tuple));
+
+ numberOfBlocks = RelationGetNumberOfBlocks(relation);
+ Assert(blockIndex < numberOfBlocks);
+
+ buffer = ReadBuffer(relation, blockIndex);
+#ifndef NO_BUFFERISVALID
+ if (!BufferIsValid(buffer)) {
+ elog(WARN, "RelationPutHeapTuple: no buffer for %ld in %s",
+ blockIndex, &relation->rd_rel->relname);
+ }
+#endif
+
+ pageHeader = (Page)BufferGetPage(buffer);
+ len = (unsigned)DOUBLEALIGN(tuple->t_len); /* be conservative */
+ Assert((int)len <= PageGetFreeSpace(pageHeader));
+
+ offnum = PageAddItem((Page)pageHeader, (Item)tuple,
+ tuple->t_len, InvalidOffsetNumber, LP_USED);
+
+ itemId = PageGetItemId((Page)pageHeader, offnum);
+ item = PageGetItem((Page)pageHeader, itemId);
+
+ ItemPointerSet(&((HeapTuple)item)->t_ctid, blockIndex, offnum);
+
+ WriteBuffer(buffer);
+ /* return an accurate tuple */
+ ItemPointerSet(&tuple->t_ctid, blockIndex, offnum);
+}
+
+/*
+ * The heap_insert routines "know" that a buffer page is initialized to
+ * zero when a BlockExtend operation is performed.
+ */
+
+#define PageIsNew(page) ((page)->pd_upper == 0)
+
+/*
+ * This routine is another in the series of attempts to reduce the number
+ * of I/O's and system calls executed in the various benchmarks. In
+ * particular, this routine is used to append data to the end of a relation
+ * file without excessive lseeks. This code should do no more than 2 semops
+ * in the ideal case.
+ *
+ * Eventually, we should cache the number of blocks in a relation somewhere.
+ * Until that time, this code will have to do an lseek to determine the number
+ * of blocks in a relation.
+ *
+ * This code should ideally do at most 4 semops, 1 lseek, and possibly 1 write
+ * to do an append; it's possible to eliminate 2 of the semops if we do direct
+ * buffer stuff (!); the lseek and the write can go if we get
+ * RelationGetNumberOfBlocks to be useful.
+ *
+ * NOTE: This code presumes that we have a write lock on the relation.
+ *
+ * Also note that this routine probably shouldn't have to exist, and does
+ * screw up the call graph rather badly, but we are wasting so much time and
+ * system resources being massively general that we are losing badly in our
+ * performance benchmarks.
+ */
+void
+RelationPutHeapTupleAtEnd(Relation relation, HeapTuple tuple)
+{
+ Buffer buffer;
+ Page pageHeader;
+ BlockNumber lastblock;
+ OffsetNumber offnum;
+ unsigned int len;
+ ItemId itemId;
+ Item item;
+
+ Assert(RelationIsValid(relation));
+ Assert(HeapTupleIsValid(tuple));
+
+ /*
+ * XXX This does an lseek - VERY expensive - but at the moment it
+ * is the only way to accurately determine how many blocks are in
+ * a relation. A good optimization would be to get this to actually
+ * work properly.
+ */
+
+ lastblock = RelationGetNumberOfBlocks(relation);
+
+ if (lastblock == 0)
+ {
+ buffer = ReadBuffer(relation, lastblock);
+ pageHeader = (Page)BufferGetPage(buffer);
+ if (PageIsNew((PageHeader) pageHeader))
+ {
+ buffer = ReleaseAndReadBuffer(buffer, relation, P_NEW);
+ pageHeader = (Page)BufferGetPage(buffer);
+ PageInit(pageHeader, BufferGetPageSize(buffer), 0);
+ }
+ }
+ else
+ buffer = ReadBuffer(relation, lastblock - 1);
+
+ pageHeader = (Page)BufferGetPage(buffer);
+ len = (unsigned)DOUBLEALIGN(tuple->t_len); /* be conservative */
+
+ /*
+ * Note that this is true if the above returned a bogus page, which
+ * it will do for a completely empty relation.
+ */
+
+ if (len > PageGetFreeSpace(pageHeader))
+ {
+ buffer = ReleaseAndReadBuffer(buffer, relation, P_NEW);
+ pageHeader = (Page)BufferGetPage(buffer);
+ PageInit(pageHeader, BufferGetPageSize(buffer), 0);
+
+ if (len > PageGetFreeSpace(pageHeader))
+ elog(WARN, "Tuple is too big: size %d", len);
+ }
+
+ offnum = PageAddItem((Page)pageHeader, (Item)tuple,
+ tuple->t_len, InvalidOffsetNumber, LP_USED);
+
+ itemId = PageGetItemId((Page)pageHeader, offnum);
+ item = PageGetItem((Page)pageHeader, itemId);
+
+ lastblock = BufferGetBlockNumber(buffer);
+
+ ItemPointerSet(&((HeapTuple)item)->t_ctid, lastblock, offnum);
+
+ /* return an accurate tuple */
+ ItemPointerSet(&tuple->t_ctid, lastblock, offnum);
+
+ WriteBuffer(buffer);
+}
diff --git a/src/backend/access/heap/stats.c b/src/backend/access/heap/stats.c
new file mode 100644
index 00000000000..d41d01ac1ba
--- /dev/null
+++ b/src/backend/access/heap/stats.c
@@ -0,0 +1,329 @@
+/*-------------------------------------------------------------------------
+ *
+ * stats.c--
+ * heap access method debugging statistic collection routines
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/heap/Attic/stats.c,v 1.1.1.1 1996/07/09 06:21:11 scrappy Exp $
+ *
+ * NOTES
+ * initam should be moved someplace else.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/heapam.h"
+
+#include "utils/memutils.h"
+#include "utils/palloc.h"
+#include "utils/elog.h"
+#include "utils/mcxt.h"
+
+/* ----------------
+ * InitHeapAccessStatistics
+ * ----------------
+ */
+HeapAccessStatistics heap_access_stats = (HeapAccessStatistics) NULL;
+
+void
+InitHeapAccessStatistics()
+{
+ MemoryContext oldContext;
+ HeapAccessStatistics stats;
+
+ /* ----------------
+ * make sure we don't initialize things twice
+ * ----------------
+ */
+ if (heap_access_stats != NULL)
+ return;
+
+ /* ----------------
+ * allocate statistics structure from the top memory context
+ * ----------------
+ */
+ oldContext = MemoryContextSwitchTo(TopMemoryContext);
+
+ stats = (HeapAccessStatistics)
+ palloc(sizeof(HeapAccessStatisticsData));
+
+ /* ----------------
+ * initialize fields to default values
+ * ----------------
+ */
+ stats->global_open = 0;
+ stats->global_openr = 0;
+ stats->global_close = 0;
+ stats->global_beginscan = 0;
+ stats->global_rescan = 0;
+ stats->global_endscan = 0;
+ stats->global_getnext = 0;
+ stats->global_fetch = 0;
+ stats->global_insert = 0;
+ stats->global_delete = 0;
+ stats->global_replace = 0;
+ stats->global_markpos = 0;
+ stats->global_restrpos = 0;
+ stats->global_BufferGetRelation = 0;
+ stats->global_RelationIdGetRelation = 0;
+ stats->global_RelationIdGetRelation_Buf = 0;
+ stats->global_getreldesc = 0;
+ stats->global_heapgettup = 0;
+ stats->global_RelationPutHeapTuple = 0;
+ stats->global_RelationPutLongHeapTuple = 0;
+
+ stats->local_open = 0;
+ stats->local_openr = 0;
+ stats->local_close = 0;
+ stats->local_beginscan = 0;
+ stats->local_rescan = 0;
+ stats->local_endscan = 0;
+ stats->local_getnext = 0;
+ stats->local_fetch = 0;
+ stats->local_insert = 0;
+ stats->local_delete = 0;
+ stats->local_replace = 0;
+ stats->local_markpos = 0;
+ stats->local_restrpos = 0;
+ stats->local_BufferGetRelation = 0;
+ stats->local_RelationIdGetRelation = 0;
+ stats->local_RelationIdGetRelation_Buf = 0;
+ stats->local_getreldesc = 0;
+ stats->local_heapgettup = 0;
+ stats->local_RelationPutHeapTuple = 0;
+ stats->local_RelationPutLongHeapTuple = 0;
+ stats->local_RelationNameGetRelation = 0;
+ stats->global_RelationNameGetRelation = 0;
+
+ /* ----------------
+ * record init times
+ * ----------------
+ */
+ time(&stats->init_global_timestamp);
+ time(&stats->local_reset_timestamp);
+ time(&stats->last_request_timestamp);
+
+ /* ----------------
+ * return to old memory context
+ * ----------------
+ */
+ (void) MemoryContextSwitchTo(oldContext);
+
+ heap_access_stats = stats;
+}
+
+/* ----------------
+ * ResetHeapAccessStatistics
+ * ----------------
+ */
+void
+ResetHeapAccessStatistics()
+{
+ HeapAccessStatistics stats;
+
+ /* ----------------
+ * do nothing if stats aren't initialized
+ * ----------------
+ */
+ if (heap_access_stats == NULL)
+ return;
+
+ stats = heap_access_stats;
+
+ /* ----------------
+ * reset local counts
+ * ----------------
+ */
+ stats->local_open = 0;
+ stats->local_openr = 0;
+ stats->local_close = 0;
+ stats->local_beginscan = 0;
+ stats->local_rescan = 0;
+ stats->local_endscan = 0;
+ stats->local_getnext = 0;
+ stats->local_fetch = 0;
+ stats->local_insert = 0;
+ stats->local_delete = 0;
+ stats->local_replace = 0;
+ stats->local_markpos = 0;
+ stats->local_restrpos = 0;
+ stats->local_BufferGetRelation = 0;
+ stats->local_RelationIdGetRelation = 0;
+ stats->local_RelationIdGetRelation_Buf = 0;
+ stats->local_getreldesc = 0;
+ stats->local_heapgettup = 0;
+ stats->local_RelationPutHeapTuple = 0;
+ stats->local_RelationPutLongHeapTuple = 0;
+
+ /* ----------------
+ * reset local timestamps
+ * ----------------
+ */
+ time(&stats->local_reset_timestamp);
+ time(&stats->last_request_timestamp);
+}
+
+/* ----------------
+ * GetHeapAccessStatistics
+ * ----------------
+ */
+HeapAccessStatistics
+GetHeapAccessStatistics()
+{
+ HeapAccessStatistics stats;
+
+ /* ----------------
+ * return nothing if stats aren't initialized
+ * ----------------
+ */
+ if (heap_access_stats == NULL)
+ return NULL;
+
+ /* ----------------
+ * record the current request time
+ * ----------------
+ */
+ time(&heap_access_stats->last_request_timestamp);
+
+ /* ----------------
+ * allocate a copy of the stats and return it to the caller.
+ * ----------------
+ */
+ stats = (HeapAccessStatistics)
+ palloc(sizeof(HeapAccessStatisticsData));
+
+ memmove(stats,
+ heap_access_stats,
+ sizeof(HeapAccessStatisticsData));
+
+ return stats;
+}
+
+/* ----------------
+ * PrintHeapAccessStatistics
+ * ----------------
+ */
+void
+PrintHeapAccessStatistics(HeapAccessStatistics stats)
+{
+ /* ----------------
+ * return nothing if stats aren't valid
+ * ----------------
+ */
+ if (stats == NULL)
+ return;
+
+ printf("======== heap am statistics ========\n");
+ printf("init_global_timestamp: %s",
+ ctime(&(stats->init_global_timestamp)));
+
+ printf("local_reset_timestamp: %s",
+ ctime(&(stats->local_reset_timestamp)));
+
+ printf("last_request_timestamp: %s",
+ ctime(&(stats->last_request_timestamp)));
+
+ printf("local/global_open: %6d/%6d\n",
+ stats->local_open, stats->global_open);
+
+ printf("local/global_openr: %6d/%6d\n",
+ stats->local_openr, stats->global_openr);
+
+ printf("local/global_close: %6d/%6d\n",
+ stats->local_close, stats->global_close);
+
+ printf("local/global_beginscan: %6d/%6d\n",
+ stats->local_beginscan, stats->global_beginscan);
+
+ printf("local/global_rescan: %6d/%6d\n",
+ stats->local_rescan, stats->global_rescan);
+
+ printf("local/global_endscan: %6d/%6d\n",
+ stats->local_endscan, stats->global_endscan);
+
+ printf("local/global_getnext: %6d/%6d\n",
+ stats->local_getnext, stats->global_getnext);
+
+ printf("local/global_fetch: %6d/%6d\n",
+ stats->local_fetch, stats->global_fetch);
+
+ printf("local/global_insert: %6d/%6d\n",
+ stats->local_insert, stats->global_insert);
+
+ printf("local/global_delete: %6d/%6d\n",
+ stats->local_delete, stats->global_delete);
+
+ printf("local/global_replace: %6d/%6d\n",
+ stats->local_replace, stats->global_replace);
+
+ printf("local/global_markpos: %6d/%6d\n",
+ stats->local_markpos, stats->global_markpos);
+
+ printf("local/global_restrpos: %6d/%6d\n",
+ stats->local_restrpos, stats->global_restrpos);
+
+ printf("================\n");
+
+ printf("local/global_BufferGetRelation: %6d/%6d\n",
+ stats->local_BufferGetRelation,
+ stats->global_BufferGetRelation);
+
+ printf("local/global_RelationIdGetRelation: %6d/%6d\n",
+ stats->local_RelationIdGetRelation,
+ stats->global_RelationIdGetRelation);
+
+ printf("local/global_RelationIdGetRelation_Buf: %6d/%6d\n",
+ stats->local_RelationIdGetRelation_Buf,
+ stats->global_RelationIdGetRelation_Buf);
+
+ printf("local/global_getreldesc: %6d/%6d\n",
+ stats->local_getreldesc, stats->global_getreldesc);
+
+ printf("local/global_heapgettup: %6d/%6d\n",
+ stats->local_heapgettup, stats->global_heapgettup);
+
+ printf("local/global_RelationPutHeapTuple: %6d/%6d\n",
+ stats->local_RelationPutHeapTuple,
+ stats->global_RelationPutHeapTuple);
+
+ printf("local/global_RelationPutLongHeapTuple: %6d/%6d\n",
+ stats->local_RelationPutLongHeapTuple,
+ stats->global_RelationPutLongHeapTuple);
+
+ printf("===================================\n");
+
+ printf("\n");
+}
+
+/* ----------------
+ * PrintAndFreeHeapAccessStatistics
+ * ----------------
+ */
+void
+PrintAndFreeHeapAccessStatistics(HeapAccessStatistics stats)
+{
+ PrintHeapAccessStatistics(stats);
+ if (stats != NULL)
+ pfree(stats);
+}
+
+/* ----------------------------------------------------------------
+ * access method initialization
+ * ----------------------------------------------------------------
+ */
+/* ----------------
+ * initam should someday be moved someplace else.
+ * ----------------
+ */
+void
+initam()
+{
+ /* ----------------
+ * initialize heap statistics.
+ * ----------------
+ */
+ InitHeapAccessStatistics();
+}
diff --git a/src/backend/access/heapam.h b/src/backend/access/heapam.h
new file mode 100644
index 00000000000..9938dbeea77
--- /dev/null
+++ b/src/backend/access/heapam.h
@@ -0,0 +1,149 @@
+/*-------------------------------------------------------------------------
+ *
+ * heapam.h--
+ * POSTGRES heap access method definitions.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: heapam.h,v 1.1.1.1 1996/07/09 06:21:08 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef HEAPAM_H
+#define HEAPAM_H
+
+#include <sys/types.h>
+
+#include "postgres.h"
+
+#include "access/attnum.h"
+#include "access/htup.h"
+#include "access/relscan.h"
+#include "access/skey.h"
+#include "utils/tqual.h"
+#include "access/tupdesc.h"
+#include "storage/smgr.h"
+#include "utils/rel.h"
+
+/* ----------------------------------------------------------------
+ * heap access method statistics
+ * ----------------------------------------------------------------
+ */
+
+typedef struct HeapAccessStatisticsData {
+ time_t init_global_timestamp; /* time global statistics started */
+ time_t local_reset_timestamp; /* last time local reset was done */
+ time_t last_request_timestamp; /* last time stats were requested */
+
+ int global_open;
+ int global_openr;
+ int global_close;
+ int global_beginscan;
+ int global_rescan;
+ int global_endscan;
+ int global_getnext;
+ int global_fetch;
+ int global_insert;
+ int global_delete;
+ int global_replace;
+ int global_markpos;
+ int global_restrpos;
+ int global_BufferGetRelation;
+ int global_RelationIdGetRelation;
+ int global_RelationIdGetRelation_Buf;
+ int global_RelationNameGetRelation;
+ int global_getreldesc;
+ int global_heapgettup;
+ int global_RelationPutHeapTuple;
+ int global_RelationPutLongHeapTuple;
+
+ int local_open;
+ int local_openr;
+ int local_close;
+ int local_beginscan;
+ int local_rescan;
+ int local_endscan;
+ int local_getnext;
+ int local_fetch;
+ int local_insert;
+ int local_delete;
+ int local_replace;
+ int local_markpos;
+ int local_restrpos;
+ int local_BufferGetRelation;
+ int local_RelationIdGetRelation;
+ int local_RelationIdGetRelation_Buf;
+ int local_RelationNameGetRelation;
+ int local_getreldesc;
+ int local_heapgettup;
+ int local_RelationPutHeapTuple;
+ int local_RelationPutLongHeapTuple;
+} HeapAccessStatisticsData;
+
+typedef HeapAccessStatisticsData *HeapAccessStatistics;
+
+#define IncrHeapAccessStat(x) \
+ (heap_access_stats == NULL ? 0 : (heap_access_stats->x)++)
+
+extern HeapAccessStatistics heap_access_stats; /* in stats.c */
+
+/* ----------------
+ * function prototypes for heap access method
+ * ----------------
+ */
+/* heap_create, heap_creatr, and heap_destroy are declared in catalog/heap.h */
+#include "catalog/heap.h"
+
+/* heapam.c */
+extern void doinsert(Relation relation, HeapTuple tup);
+extern void SetHeapAccessMethodImmediateInvalidation(bool on);
+
+extern Relation heap_open(Oid relationId);
+extern Relation heap_openr(char *relationName);
+extern void heap_close(Relation relation);
+extern HeapScanDesc heap_beginscan(Relation relation, int atend,
+ TimeQual timeQual, unsigned nkeys, ScanKey key);
+extern void heap_rescan(HeapScanDesc sdesc, bool scanFromEnd, ScanKey key);
+extern void heap_endscan(HeapScanDesc sdesc);
+extern HeapTuple heap_getnext(HeapScanDesc scandesc, int backw, Buffer *b);
+extern HeapTuple heap_fetch(Relation relation, TimeQual timeQual,
+ ItemPointer tid, Buffer *b);
+extern Oid heap_insert(Relation relation, HeapTuple tup);
+extern void heap_delete(Relation relation, ItemPointer tid);
+extern int heap_replace(Relation relation, ItemPointer otid,
+ HeapTuple tup);
+extern void heap_markpos(HeapScanDesc sdesc);
+extern void heap_restrpos(HeapScanDesc sdesc);
+
+/* in common/heaptuple.c */
+extern Size ComputeDataSize(TupleDesc tupleDesc, Datum value[], char nulls[]);
+extern void DataFill(char *data, TupleDesc tupleDesc,
+ Datum value[], char nulls[], char *infomask,
+ bits8 bit[]);
+extern int heap_attisnull(HeapTuple tup, int attnum);
+extern int heap_sysattrlen(AttrNumber attno);
+extern bool heap_sysattrbyval(AttrNumber attno);
+extern char *heap_getsysattr(HeapTuple tup, Buffer b, int attnum);
+extern char *fastgetattr(HeapTuple tup, unsigned attnum,
+ TupleDesc att, bool *isnull);
+extern char *heap_getattr(HeapTuple tup, Buffer b, int attnum,
+ TupleDesc att, bool *isnull);
+extern HeapTuple heap_copytuple(HeapTuple tuple);
+extern void heap_deformtuple(HeapTuple tuple, TupleDesc tdesc,
+ Datum values[], char nulls[]);
+extern HeapTuple heap_formtuple(TupleDesc tupleDescriptor,
+ Datum value[], char nulls[]);
+extern HeapTuple heap_modifytuple(HeapTuple tuple, Buffer buffer,
+ Relation relation, Datum replValue[], char replNull[], char repl[]);
+extern HeapTuple heap_addheader(uint32 natts, int structlen, char *structure);
+
+/* in common/heap/stats.c */
+extern void InitHeapAccessStatistics(void);
+extern void ResetHeapAccessStatistics(void);
+extern HeapAccessStatistics GetHeapAccessStatistics(void);
+extern void PrintHeapAccessStatistics(HeapAccessStatistics stats);
+extern void PrintAndFreeHeapAccessStatistics(HeapAccessStatistics stats);
+extern void initam(void);
+
+#endif /* HEAPAM_H */
diff --git a/src/backend/access/hio.h b/src/backend/access/hio.h
new file mode 100644
index 00000000000..4a699ffcd98
--- /dev/null
+++ b/src/backend/access/hio.h
@@ -0,0 +1,26 @@
+/*-------------------------------------------------------------------------
+ *
+ * hio.h--
+ * POSTGRES heap access method input/output definitions.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: hio.h,v 1.1.1.1 1996/07/09 06:21:08 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef HIO_H
+#define HIO_H
+
+#include "c.h"
+
+#include "storage/block.h"
+#include "access/htup.h"
+#include "utils/rel.h"
+
+extern void RelationPutHeapTuple(Relation relation, BlockNumber blockIndex,
+ HeapTuple tuple);
+extern void RelationPutHeapTupleAtEnd(Relation relation, HeapTuple tuple);
+
+#endif /* HIO_H */
diff --git a/src/backend/access/htup.h b/src/backend/access/htup.h
new file mode 100644
index 00000000000..7cf1ecf1762
--- /dev/null
+++ b/src/backend/access/htup.h
@@ -0,0 +1,115 @@
+/*-------------------------------------------------------------------------
+ *
+ * htup.h--
+ * POSTGRES heap tuple definitions.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: htup.h,v 1.1.1.1 1996/07/09 06:21:08 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef HTUP_H
+#define HTUP_H
+
+#include "access/attnum.h"
+#include "storage/bufpage.h" /* just to reduce levels of #include */
+#include "storage/itemptr.h"
+#include "utils/nabstime.h"
+
+#define MinHeapTupleBitmapSize 32 /* 8 * 4 */
+
+/* check these, they are likely to be more severely limited by t_hoff */
+
+#define MaxHeapAttributeNumber 1600 /* 8 * 200 */
+
+/*
+ * to avoid wasting space, the attributes should be laid out in such a
+ * way as to reduce structure padding.
+ */
+typedef struct HeapTupleData {
+
+ unsigned int t_len; /* length of entire tuple */
+
+ ItemPointerData t_ctid; /* current TID of this tuple */
+
+ ItemPointerData t_chain; /* replaced tuple TID */
+
+ Oid t_oid; /* OID of this tuple -- 4 bytes */
+
+ CommandId t_cmin; /* insert CID stamp -- 2 bytes each */
+ CommandId t_cmax; /* delete CommandId stamp */
+
+ TransactionId t_xmin; /* insert XID stamp -- 4 bytes each */
+ TransactionId t_xmax; /* delete XID stamp */
+
+ AbsoluteTime t_tmin; /* time stamps -- 4 bytes each */
+ AbsoluteTime t_tmax;
+
+ int16 t_natts; /* number of attributes */
+ char t_vtype; /* not used - padding */
+
+ char t_infomask; /* whether tuple has null or variable-
+ * length attributes
+ */
+
+ uint8 t_hoff; /* sizeof tuple header */
+
+ bits8 t_bits[MinHeapTupleBitmapSize / 8];
+ /* bit map of domains */
+
+ /* MORE DATA FOLLOWS AT END OF STRUCT */
+} HeapTupleData;
+
+typedef HeapTupleData *HeapTuple;
+
+
+#define SelfItemPointerAttributeNumber (-1)
+#define ObjectIdAttributeNumber (-2)
+#define MinTransactionIdAttributeNumber (-3)
+#define MinCommandIdAttributeNumber (-4)
+#define MaxTransactionIdAttributeNumber (-5)
+#define MaxCommandIdAttributeNumber (-6)
+#define ChainItemPointerAttributeNumber (-7)
+#define AnchorItemPointerAttributeNumber (-8)
+#define MinAbsoluteTimeAttributeNumber (-9)
+#define MaxAbsoluteTimeAttributeNumber (-10)
+#define VersionTypeAttributeNumber (-11)
+#define FirstLowInvalidHeapAttributeNumber (-12)
+
+
+/* ----------------
+ * support macros
+ * ----------------
+ */
+#define GETSTRUCT(TUP) (((char *)(TUP)) + ((HeapTuple)(TUP))->t_hoff)
+
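+/*
+ * GETSTRUCT is how code gets at the data portion that follows the tuple
+ * header; the result is normally cast to a catalog form. For example
+ * (mirroring the usage in istrat.c, as an illustration):
+ *
+ *	Form_pg_amproc form = (Form_pg_amproc) GETSTRUCT(tuple);
+ */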
+
+/*
+ * BITMAPLEN(NATTS) -
+ * Computes minimum size of bitmap given number of domains.
+ */
+#define BITMAPLEN(NATTS) \
+ ((((((int)(NATTS) - 1) >> 3) + 4 - (MinHeapTupleBitmapSize >> 3)) \
+ & ~03) + (MinHeapTupleBitmapSize >> 3))
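+
+/*
+ * Worked examples (MinHeapTupleBitmapSize is 32 bits, so the minimum
+ * map is 4 bytes and the size grows in 4-byte steps):
+ *
+ *	BITMAPLEN(1)  == 4	(1..32 attributes fit in the minimum map)
+ *	BITMAPLEN(33) == 8
+ *	BITMAPLEN(65) == 12
+ */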
+
+/*
+ * HeapTupleIsValid
+ * True iff the heap tuple is valid.
+ */
+#define HeapTupleIsValid(tuple) PointerIsValid(tuple)
+
+/*
+ * information stored in t_infomask:
+ */
+#define HEAP_HASNULL 0x01 /* has null attribute(s) */
+#define HEAP_HASVARLENA 0x02 /* has variable length attribute(s) */
+
+#define HeapTupleNoNulls(tuple) \
+ (!(((HeapTuple) (tuple))->t_infomask & HEAP_HASNULL))
+
+#define HeapTupleAllFixed(tuple) \
+ (!(((HeapTuple) (tuple))->t_infomask & HEAP_HASVARLENA))
+
+#endif /* HTUP_H */
diff --git a/src/backend/access/ibit.h b/src/backend/access/ibit.h
new file mode 100644
index 00000000000..990c23ab4dd
--- /dev/null
+++ b/src/backend/access/ibit.h
@@ -0,0 +1,34 @@
+/*-------------------------------------------------------------------------
+ *
+ * ibit.h--
+ * POSTGRES index valid attribute bit map definitions.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: ibit.h,v 1.1.1.1 1996/07/09 06:21:08 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef IBIT_H
+#define IBIT_H
+
+#include "c.h"
+#include "utils/memutils.h"
+
+typedef struct IndexAttributeBitMapData {
+ char bits[(MaxIndexAttributeNumber + MaxBitsPerByte - 1)
+ / MaxBitsPerByte];
+} IndexAttributeBitMapData;
+
+typedef IndexAttributeBitMapData *IndexAttributeBitMap;
+
+#define IndexAttributeBitMapSize sizeof(IndexAttributeBitMapData)
+
+/*
+ * IndexAttributeBitMapIsValid --
+ * True iff attribute bit map is valid.
+ */
+#define IndexAttributeBitMapIsValid(bits) PointerIsValid(bits)
+
+#endif /* IBIT_H */
diff --git a/src/backend/access/index/Makefile.inc b/src/backend/access/index/Makefile.inc
new file mode 100644
index 00000000000..0bc58830c8f
--- /dev/null
+++ b/src/backend/access/index/Makefile.inc
@@ -0,0 +1,14 @@
+#-------------------------------------------------------------------------
+#
+# Makefile.inc--
+# Makefile for access/index
+#
+# Copyright (c) 1994, Regents of the University of California
+#
+#
+# IDENTIFICATION
+# $Header: /cvsroot/pgsql/src/backend/access/index/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:11 scrappy Exp $
+#
+#-------------------------------------------------------------------------
+
+SUBSRCS+= genam.c indexam.c istrat.c
diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c
new file mode 100644
index 00000000000..3d02ba57009
--- /dev/null
+++ b/src/backend/access/index/genam.c
@@ -0,0 +1,275 @@
+/*-------------------------------------------------------------------------
+ *
+ * genam.c--
+ * general index access method routines
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/index/genam.c,v 1.1.1.1 1996/07/09 06:21:11 scrappy Exp $
+ *
+ * NOTES
+ * many of the old access method routines have been turned into
+ * macros and moved to genam.h -cim 4/30/91
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * OLD COMMENTS
+ * Scans are implemented as follows:
+ *
+ * `0' represents an invalid item pointer.
+ * `-' represents an unknown item pointer.
+ * `X' represents a known item pointer.
+ * `+' represents a known or invalid item pointer.
+ * `*' represents any item pointer.
+ *
+ * State is represented by a triple of these symbols in the order of
+ * previous, current, next. Note that the case of reverse scans works
+ * identically.
+ *
+ * State Result
+ * (1) + + - + 0 0 (if the next item pointer is invalid)
+ * (2) + X - (otherwise)
+ * (3) * 0 0 * 0 0 (no change)
+ * (4) + X 0 X 0 0 (shift)
+ * (5) * + X + X - (shift, add unknown)
+ *
+ * All other states cannot occur.
+ *
+ * Note:
+ * It would be possible to cache the status of the previous and
+ * next item pointer using the flags.
+ * ----------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/attnum.h"
+#include "access/genam.h"
+#include "access/heapam.h"
+#include "access/itup.h"
+#include "access/relscan.h"
+#include "access/sdir.h"
+#include "access/skey.h"
+
+#include "storage/bufmgr.h"
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+
+#include "catalog/catname.h"
+#include "catalog/pg_attribute.h"
+#include "catalog/pg_index.h"
+#include "catalog/pg_proc.h"
+
+#include "catalog/index.h"
+
+/* ----------------------------------------------------------------
+ * general access method routines
+ *
+ * All indexed access methods use an identical scan structure.
+ * We don't know how the various AMs do locking, however, so we don't
+ * do anything about that here.
+ *
+ * The intent is that an AM implementor will define a front-end routine
+ * that calls this one, to fill in the scan, and then does whatever kind
+ * of locking he wants.
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------
+ * RelationGetIndexScan -- Create and fill an IndexScanDesc.
+ *
+ * This routine creates an index scan structure and sets its contents
+ * up correctly. This routine calls AMrescan to set up the scan with
+ * the passed key.
+ *
+ * Parameters:
+ * relation -- index relation for scan.
+ * scanFromEnd -- if true, begin scan at one of the index's
+ * endpoints.
+ * numberOfKeys -- count of scan keys (more than one won't
+ * necessarily do anything useful, yet).
+ * key -- the ScanKey for the starting position of the scan.
+ *
+ * Returns:
+ * An initialized IndexScanDesc.
+ *
+ * Side Effects:
+ * Bumps the ref count on the relation to keep it in the cache.
+ *
+ * ----------------
+ */
+IndexScanDesc
+RelationGetIndexScan(Relation relation,
+ bool scanFromEnd,
+ uint16 numberOfKeys,
+ ScanKey key)
+{
+ IndexScanDesc scan;
+
+ if (! RelationIsValid(relation))
+ elog(WARN, "RelationGetIndexScan: relation invalid");
+
+ scan = (IndexScanDesc) palloc(sizeof(IndexScanDescData));
+
+ scan->relation = relation;
+ scan->opaque = NULL;
+ scan->numberOfKeys = numberOfKeys;
+
+ ItemPointerSetInvalid(&scan->previousItemData);
+ ItemPointerSetInvalid(&scan->currentItemData);
+ ItemPointerSetInvalid(&scan->nextItemData);
+ ItemPointerSetInvalid(&scan->previousMarkData);
+ ItemPointerSetInvalid(&scan->currentMarkData);
+ ItemPointerSetInvalid(&scan->nextMarkData);
+
+ if (numberOfKeys > 0) {
+ scan->keyData = (ScanKey) palloc(sizeof(ScanKeyData) * numberOfKeys);
+ } else {
+ scan->keyData = NULL;
+ }
+
+ index_rescan(scan, scanFromEnd, key);
+
+ return (scan);
+}
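+
+/*
+ * A minimal sketch of the kind of front-end an AM implementor would
+ * provide (hypothetical; compare btbeginscan in nbtree.c, which also
+ * stashes AM-specific state in scan->opaque):
+ *
+ *	char *
+ *	mybeginscan(Relation rel, bool fromEnd, uint16 nkeys, ScanKey key)
+ *	{
+ *		IndexScanDesc scan;
+ *
+ *		... acquire whatever lock the AM wants here ...
+ *		scan = RelationGetIndexScan(rel, fromEnd, nkeys, key);
+ *		return (char *) scan;
+ *	}
+ */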
+
+/* ----------------
+ * IndexScanRestart -- Restart an index scan.
+ *
+ * This routine isn't used by any existing access method. It's
+ * appropriate if relation level locks are what you want.
+ *
+ * Returns:
+ * None.
+ *
+ * Side Effects:
+ * None.
+ * ----------------
+ */
+void
+IndexScanRestart(IndexScanDesc scan,
+ bool scanFromEnd,
+ ScanKey key)
+{
+ if (! IndexScanIsValid(scan))
+ elog(WARN, "IndexScanRestart: invalid scan");
+
+ ItemPointerSetInvalid(&scan->previousItemData);
+ ItemPointerSetInvalid(&scan->currentItemData);
+ ItemPointerSetInvalid(&scan->nextItemData);
+
+ if (RelationGetNumberOfBlocks(scan->relation) == 0)
+ scan->flags = ScanUnmarked;
+ else if (scanFromEnd)
+ scan->flags = ScanUnmarked | ScanUncheckedPrevious;
+ else
+ scan->flags = ScanUnmarked | ScanUncheckedNext;
+
+ scan->scanFromEnd = (bool) scanFromEnd;
+
+ if (scan->numberOfKeys > 0)
+ memmove(scan->keyData,
+ key,
+ scan->numberOfKeys * sizeof(ScanKeyData));
+}
+
+/* ----------------
+ * IndexScanEnd -- End an index scan.
+ *
+ * This routine is not used by any existing access method, but is
+ * suitable for use if you don't want to do sophisticated locking.
+ *
+ * Returns:
+ * None.
+ *
+ * Side Effects:
+ * None.
+ * ----------------
+ */
+void
+IndexScanEnd(IndexScanDesc scan)
+{
+ if (! IndexScanIsValid(scan))
+ elog(WARN, "IndexScanEnd: invalid scan");
+
+ pfree(scan);
+}
+
+/* ----------------
+ * IndexScanMarkPosition -- Mark current position in a scan.
+ *
+ * This routine isn't used by any existing access method, but is the
+ * one that AM implementors should use, if they don't want to do any
+ * special locking. If relation-level locking is sufficient, this is
+ * the routine for you.
+ *
+ * Returns:
+ * None.
+ *
+ * Side Effects:
+ * None.
+ * ----------------
+ */
+void
+IndexScanMarkPosition(IndexScanDesc scan)
+{
+ RetrieveIndexResult result;
+
+ if (scan->flags & ScanUncheckedPrevious) {
+ result =
+ index_getnext(scan, BackwardScanDirection);
+
+ if (result != NULL) {
+ scan->previousItemData = result->index_iptr;
+ } else {
+ ItemPointerSetInvalid(&scan->previousItemData);
+ }
+
+ } else if (scan->flags & ScanUncheckedNext) {
+ result = (RetrieveIndexResult)
+ index_getnext(scan, ForwardScanDirection);
+
+ if (result != NULL) {
+ scan->nextItemData = result->index_iptr;
+ } else {
+ ItemPointerSetInvalid(&scan->nextItemData);
+ }
+ }
+
+ scan->previousMarkData = scan->previousItemData;
+ scan->currentMarkData = scan->currentItemData;
+ scan->nextMarkData = scan->nextItemData;
+
+ scan->flags = 0x0; /* XXX should have a symbolic name */
+}
+
+/* ----------------
+ * IndexScanRestorePosition -- Restore position on a marked scan.
+ *
+ * This routine isn't used by any existing access method, but is the
+ * one that AM implementors should use if they don't want to do any
+ * special locking. If relation-level locking is sufficient, then
+ * this is the one you want.
+ *
+ * Returns:
+ * None.
+ *
+ * Side Effects:
+ * None.
+ * ----------------
+ */
+void
+IndexScanRestorePosition(IndexScanDesc scan)
+{
+ if (scan->flags & ScanUnmarked)
+ elog(WARN, "IndexScanRestorePosition: no mark to restore");
+
+ scan->previousItemData = scan->previousMarkData;
+ scan->currentItemData = scan->currentMarkData;
+ scan->nextItemData = scan->nextMarkData;
+
+ scan->flags = 0x0; /* XXX should have a symbolic name */
+}
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
new file mode 100644
index 00000000000..bffe3a41f3a
--- /dev/null
+++ b/src/backend/access/index/indexam.c
@@ -0,0 +1,411 @@
+/*-------------------------------------------------------------------------
+ *
+ * indexam.c--
+ * general index access method routines
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/index/indexam.c,v 1.1.1.1 1996/07/09 06:21:11 scrappy Exp $
+ *
+ * INTERFACE ROUTINES
+ * index_open - open an index relation by relationId
+ * index_openr - open an index relation by name
+ * index_close - close an index relation
+ * index_beginscan - start a scan of an index
+ * index_rescan - restart a scan of an index
+ * index_endscan - end a scan
+ * index_insert - insert an index tuple into a relation
+ * index_delete - delete an item from an index relation
+ * index_markpos - mark a scan position
+ * index_restrpos - restore a scan position
+ * index_getnext - get the next tuple from a scan
+ * ** index_fetch - retrieve tuple with tid
+ * ** index_replace - replace a tuple
+ * ** index_getattr - get an attribute from an index tuple
+ * index_getprocid - get a support procedure id from the rel tuple
+ *
+ * IndexScanIsValid - check index scan
+ *
+ * NOTES
+ * This file contains the index_ routines which used
+ * to be a scattered collection of stuff in access/genam.
+ *
+ * The ** routines: index_fetch, index_replace, and index_getattr
+ * have not yet been implemented. They may not be needed.
+ *
+ * old comments
+ * Scans are implemented as follows:
+ *
+ * `0' represents an invalid item pointer.
+ * `-' represents an unknown item pointer.
+ * `X' represents a known item pointer.
+ * `+' represents a known or invalid item pointer.
+ * `*' represents any item pointer.
+ *
+ * State is represented by a triple of these symbols in the order of
+ * previous, current, next. Note that the case of reverse scans works
+ * identically.
+ *
+ * State Result
+ * (1) + + - + 0 0 (if the next item pointer is invalid)
+ * (2) + X - (otherwise)
+ * (3) * 0 0 * 0 0 (no change)
+ * (4) + X 0 X 0 0 (shift)
+ * (5) * + X + X - (shift, add unknown)
+ *
+ * All other states cannot occur.
+ *
+ * Note: It would be possible to cache the status of the previous and
+ * next item pointer using the flags.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/attnum.h"
+#include "access/genam.h"
+#include "access/heapam.h"
+#include "access/itup.h"
+#include "access/relscan.h"
+#include "access/sdir.h"
+#include "access/skey.h"
+#include "access/funcindex.h"
+
+#include "storage/lmgr.h"
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+#include "utils/relcache.h"
+
+#include "catalog/catname.h"
+#include "catalog/pg_attribute.h"
+#include "catalog/pg_index.h"
+#include "catalog/pg_proc.h"
+
+#include "catalog/index.h"
+
+#include "fmgr.h"
+
+/* ----------------
+ * undefine macros we aren't going to use that would otherwise
+ * get in our way.. delete is defined in c.h and the am's are
+ * defined in heapam.h
+ * ----------------
+ */
+#undef delete
+#undef aminsert
+#undef amdelete
+#undef ambeginscan
+#undef amrescan
+#undef amendscan
+#undef ammarkpos
+#undef amrestrpos
+#undef amgettuple
+
+/* ----------------------------------------------------------------
+ * macros used in index_ routines
+ * ----------------------------------------------------------------
+ */
+#define RELATION_CHECKS \
+Assert(RelationIsValid(relation)); \
+ Assert(PointerIsValid(relation->rd_am))
+
+#define SCAN_CHECKS \
+ Assert(IndexScanIsValid(scan)); \
+ Assert(RelationIsValid(scan->relation)); \
+ Assert(PointerIsValid(scan->relation->rd_am))
+
+#define GET_REL_PROCEDURE(x,y) \
+ CppConcat(procedure = relation->rd_am->,y); \
+ if (! RegProcedureIsValid(procedure)) \
+ elog(WARN, "index_%s: invalid %s regproc", \
+ CppAsString(x), CppAsString(y))
+
+#define GET_SCAN_PROCEDURE(x,y) \
+ CppConcat(procedure = scan->relation->rd_am->,y); \
+ if (! RegProcedureIsValid(procedure)) \
+ elog(WARN, "index_%s: invalid %s regproc", \
+ CppAsString(x), CppAsString(y))
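+
+/*
+ * As an example of what these generate, GET_REL_PROCEDURE(insert,aminsert)
+ * expands to roughly:
+ *
+ *	procedure = relation->rd_am->aminsert;
+ *	if (! RegProcedureIsValid(procedure))
+ *		elog(WARN, "index_insert: invalid aminsert regproc");
+ *
+ * i.e. each index_ routine fetches the AM-specific procedure OID from
+ * the pg_am data cached in the relation descriptor before calling fmgr.
+ */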
+
+
+/* ----------------------------------------------------------------
+ * index_ interface functions
+ * ----------------------------------------------------------------
+ */
+/* ----------------
+ * index_open - open an index relation by relationId
+ *
+ * presently the relcache routines do all the work we need
+ * to open/close index relations.
+ * ----------------
+ */
+Relation
+index_open(Oid relationId)
+{
+ return RelationIdGetRelation(relationId);
+}
+
+/* ----------------
+ * index_openr - open an index relation by name
+ *
+ * presently the relcache routines do all the work we need
+ * to open/close index relations.
+ * ----------------
+ */
+Relation
+index_openr(char *relationName)
+{
+ return RelationNameGetRelation(relationName);
+}
+
+/* ----------------
+ * index_close - close an index relation
+ *
+ * presently the relcache routines do all the work we need
+ * to open/close index relations.
+ * ----------------
+ */
+void
+index_close(Relation relation)
+{
+ (void) RelationClose(relation);
+}
+
+/* ----------------
+ * index_insert - insert an index tuple into a relation
+ * ----------------
+ */
+InsertIndexResult
+index_insert(Relation relation,
+ IndexTuple indexTuple)
+{
+ RegProcedure procedure;
+ InsertIndexResult specificResult;
+
+ RELATION_CHECKS;
+ GET_REL_PROCEDURE(insert,aminsert);
+
+ /* ----------------
+ * have the am's insert proc do all the work.
+ * ----------------
+ */
+ specificResult = (InsertIndexResult)
+ fmgr(procedure, relation, indexTuple, NULL);
+
+ /* ----------------
+ * the insert proc is supposed to return a "specific result" and
+ * this routine has to return a "general result" so after we get
+ * something back from the insert proc, we allocate a
+ * "general result" and copy some crap between the two.
+ *
+ * As far as I'm concerned all this result shit is needlessly
+ * complicated and should be eliminated. -cim 1/19/91
+ *
+ * mao concurs. regardless of how we feel here, however, it is
+ * important to free memory we don't intend to return to anyone.
+ * 2/28/91
+ *
+ * this "general result" crap is now gone. -ay 3/6/95
+ * ----------------
+ */
+
+ return (specificResult);
+}
+
+/* ----------------
+ * index_delete - delete an item from an index relation
+ * ----------------
+ */
+void
+index_delete(Relation relation, ItemPointer indexItem)
+{
+ RegProcedure procedure;
+
+ RELATION_CHECKS;
+ GET_REL_PROCEDURE(delete,amdelete);
+
+ (void) fmgr(procedure, relation, indexItem);
+}
+
+/* ----------------
+ * index_beginscan - start a scan of an index
+ * ----------------
+ */
+IndexScanDesc
+index_beginscan(Relation relation,
+ bool scanFromEnd,
+ uint16 numberOfKeys,
+ ScanKey key)
+{
+ IndexScanDesc scandesc;
+ RegProcedure procedure;
+
+ RELATION_CHECKS;
+ GET_REL_PROCEDURE(beginscan,ambeginscan);
+
+ RelationSetRIntentLock(relation);
+
+ scandesc = (IndexScanDesc)
+ fmgr(procedure, relation, scanFromEnd, numberOfKeys, key);
+
+ return scandesc;
+}
+
+/* ----------------
+ * index_rescan - restart a scan of an index
+ * ----------------
+ */
+void
+index_rescan(IndexScanDesc scan, bool scanFromEnd, ScanKey key)
+{
+ RegProcedure procedure;
+
+ SCAN_CHECKS;
+ GET_SCAN_PROCEDURE(rescan,amrescan);
+
+ (void) fmgr(procedure, scan, scanFromEnd, key);
+}
+
+/* ----------------
+ * index_endscan - end a scan
+ * ----------------
+ */
+void
+index_endscan(IndexScanDesc scan)
+{
+ RegProcedure procedure;
+
+ SCAN_CHECKS;
+ GET_SCAN_PROCEDURE(endscan,amendscan);
+
+ (void) fmgr(procedure, scan);
+
+ RelationUnsetRIntentLock(scan->relation);
+}
+
+/* ----------------
+ * index_markpos - mark a scan position
+ * ----------------
+ */
+void
+index_markpos(IndexScanDesc scan)
+{
+ RegProcedure procedure;
+
+ SCAN_CHECKS;
+ GET_SCAN_PROCEDURE(markpos,ammarkpos);
+
+ (void) fmgr(procedure, scan);
+}
+
+/* ----------------
+ * index_restrpos - restore a scan position
+ * ----------------
+ */
+void
+index_restrpos(IndexScanDesc scan)
+{
+ RegProcedure procedure;
+
+ SCAN_CHECKS;
+ GET_SCAN_PROCEDURE(restrpos,amrestrpos);
+
+ (void) fmgr(procedure, scan);
+}
+
+/* ----------------
+ * index_getnext - get the next tuple from a scan
+ *
+ * A RetrieveIndexResult is an index tuple/heap tuple pair
+ * ----------------
+ */
+RetrieveIndexResult
+index_getnext(IndexScanDesc scan,
+ ScanDirection direction)
+{
+ RegProcedure procedure;
+ RetrieveIndexResult result;
+
+ SCAN_CHECKS;
+ GET_SCAN_PROCEDURE(getnext,amgettuple);
+
+ /* ----------------
+ * have the am's gettuple proc do all the work.
+ * ----------------
+ */
+ result = (RetrieveIndexResult)
+ fmgr(procedure, scan, direction);
+
+ return result;
+}
+
+/* ----------------
+ * index_getprocid
+ *
+ * Some indexed access methods may require support routines that are
+ * not in the operator class/operator model imposed by pg_am. These
+ * access methods may store the OIDs of registered procedures they
+ * need in pg_amproc. These registered procedure OIDs are ordered in
+ * a way that makes sense to the access method, and used only by the
+ * access method. The general index code doesn't know anything about
+ * the routines involved; it just builds an ordered list of them for
+ * each attribute on which an index is defined.
+ *
+ * This routine returns the requested procedure OID for a particular
+ * indexed attribute.
+ * ----------------
+ */
+RegProcedure
+index_getprocid(Relation irel,
+ AttrNumber attnum,
+ uint16 procnum)
+{
+ RegProcedure *loc;
+ int natts;
+
+ natts = irel->rd_rel->relnatts;
+
+ loc = irel->rd_support;
+
+ Assert(loc != NULL);
+
+ return (loc[(natts * (procnum - 1)) + (attnum - 1)]);
+}
+
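+/*
+ * For illustration, this is how the btree code would fetch its single
+ * ordering procedure (procnum 1, BTORDER_PROC in nbtree.h) for an
+ * attribute (a hypothetical call site):
+ *
+ *	RegProcedure proc = index_getprocid(irel, attno, BTORDER_PROC);
+ *
+ * With the indexing used above, rd_support is grouped by procedure
+ * number: all natts entries for procnum 1 come first, then those for
+ * procnum 2, and so on.
+ */
+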
+Datum
+GetIndexValue(HeapTuple tuple,
+ TupleDesc hTupDesc,
+ int attOff,
+ AttrNumber attrNums[],
+ FuncIndexInfo *fInfo,
+ bool *attNull,
+ Buffer buffer)
+{
+ Datum returnVal;
+ bool isNull;
+
+ if (PointerIsValid(fInfo) && FIgetProcOid(fInfo) != InvalidOid) {
+ int i;
+ Datum *attData = (Datum *)palloc(FIgetnArgs(fInfo)*sizeof(Datum));
+
+ for (i = 0; i < FIgetnArgs(fInfo); i++) {
+ attData[i] = (Datum) heap_getattr(tuple,
+ buffer,
+ attrNums[i],
+ hTupDesc,
+ attNull);
+ }
+ returnVal = (Datum)fmgr_array_args(FIgetProcOid(fInfo),
+ FIgetnArgs(fInfo),
+ (char **) attData,
+ &isNull);
+ pfree(attData);
+ *attNull = FALSE;
+ }else {
+ returnVal = (Datum) heap_getattr(tuple, buffer, attrNums[attOff],
+ hTupDesc, attNull);
+ }
+ return returnVal;
+}
diff --git a/src/backend/access/index/istrat.c b/src/backend/access/index/istrat.c
new file mode 100644
index 00000000000..602d2bd9e94
--- /dev/null
+++ b/src/backend/access/index/istrat.c
@@ -0,0 +1,679 @@
+/*-------------------------------------------------------------------------
+ *
+ * istrat.c--
+ * index scan strategy manipulation code and index strategy manipulation
+ * operator code.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/index/Attic/istrat.c,v 1.1.1.1 1996/07/09 06:21:11 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/attnum.h"
+#include "access/heapam.h"
+#include "access/istrat.h"
+#include "access/itup.h" /* for MaxIndexAttributeNumber */
+#include "access/skey.h"
+#include "utils/tqual.h" /* for NowTimeQual */
+
+#include "fmgr.h"
+#include "utils/elog.h"
+#include "utils/rel.h"
+
+#include "catalog/catname.h"
+#include "catalog/pg_amop.h"
+#include "catalog/pg_amproc.h"
+#include "catalog/pg_index.h"
+#include "catalog/pg_proc.h"
+
+/* ----------------------------------------------------------------
+ * misc strategy support routines
+ * ----------------------------------------------------------------
+ */
+
+/*
+ * StrategyNumberIsValid
+ * StrategyNumberIsInBounds
+ * StrategyMapIsValid
+ * StrategyTransformMapIsValid
+ * IndexStrategyIsValid
+ *
+ * ... are now macros in istrat.h -cim 4/27/91
+ */
+
+/*
+ * StrategyMapGetScanKeyEntry --
+ * Returns a scan key entry of an index strategy mapping member.
+ *
+ * Note:
+ * Assumes that the index strategy mapping is valid.
+ * Assumes that the index strategy number is valid.
+ * Bounds checking should be done outside this routine.
+ */
+ScanKey
+StrategyMapGetScanKeyEntry(StrategyMap map,
+ StrategyNumber strategyNumber)
+{
+ Assert(StrategyMapIsValid(map));
+ Assert(StrategyNumberIsValid(strategyNumber));
+ return (&map->entry[strategyNumber - 1]);
+}
+
+/*
+ * IndexStrategyGetStrategyMap --
+ * Returns an index strategy mapping of an index strategy.
+ *
+ * Note:
+ * Assumes that the index strategy is valid.
+ * Assumes that the number of index strategies is valid.
+ * Bounds checking should be done outside this routine.
+ */
+StrategyMap
+IndexStrategyGetStrategyMap(IndexStrategy indexStrategy,
+ StrategyNumber maxStrategyNum,
+ AttrNumber attrNum)
+{
+ Assert(IndexStrategyIsValid(indexStrategy));
+ Assert(StrategyNumberIsValid(maxStrategyNum));
+ Assert(AttributeNumberIsValid(attrNum));
+
+ maxStrategyNum = AMStrategies(maxStrategyNum); /* XXX */
+ return
+ &indexStrategy->strategyMapData[maxStrategyNum * (attrNum - 1)];
+}
+
+/*
+ * AttributeNumberGetIndexStrategySize --
+ * Computes the size of an index strategy.
+ */
+Size
+AttributeNumberGetIndexStrategySize(AttrNumber maxAttributeNumber,
+ StrategyNumber maxStrategyNumber)
+{
+ maxStrategyNumber = AMStrategies(maxStrategyNumber); /* XXX */
+ return
+ maxAttributeNumber * maxStrategyNumber * sizeof (ScanKeyData);
+}
+
+/*
+ * StrategyTransformMapIsValid is now a macro in istrat.h -cim 4/27/91
+ */
+
+/* ----------------
+ * StrategyOperatorIsValid
+ * ----------------
+ */
+bool
+StrategyOperatorIsValid(StrategyOperator operator,
+ StrategyNumber maxStrategy)
+{
+ return (bool)
+ (PointerIsValid(operator) &&
+ StrategyNumberIsInBounds(operator->strategy, maxStrategy) &&
+ !(operator->flags & ~(SK_NEGATE | SK_COMMUTE)));
+}
+
+/* ----------------
+ * StrategyTermIsValid
+ * ----------------
+ */
+bool
+StrategyTermIsValid(StrategyTerm term,
+ StrategyNumber maxStrategy)
+{
+ Index index;
+
+ if (! PointerIsValid(term) || term->degree == 0)
+ return false;
+
+ for (index = 0; index < term->degree; index += 1) {
+ if (! StrategyOperatorIsValid(&term->operatorData[index],
+ maxStrategy)) {
+
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/* ----------------
+ * StrategyExpressionIsValid
+ * ----------------
+ */
+bool
+StrategyExpressionIsValid(StrategyExpression expression,
+ StrategyNumber maxStrategy)
+{
+ StrategyTerm *termP;
+
+ if (!PointerIsValid(expression))
+ return true;
+
+ if (!StrategyTermIsValid(expression->term[0], maxStrategy))
+ return false;
+
+ termP = &expression->term[1];
+ while (StrategyTermIsValid(*termP, maxStrategy))
+ termP += 1;
+
+ return (bool)
+ (! PointerIsValid(*termP));
+}
+
+/* ----------------
+ * StrategyEvaluationIsValid
+ * ----------------
+ */
+bool
+StrategyEvaluationIsValid(StrategyEvaluation evaluation)
+{
+ Index index;
+
+ if (! PointerIsValid(evaluation) ||
+ ! StrategyNumberIsValid(evaluation->maxStrategy) ||
+ ! StrategyTransformMapIsValid(evaluation->negateTransform) ||
+ ! StrategyTransformMapIsValid(evaluation->commuteTransform) ||
+ ! StrategyTransformMapIsValid(evaluation->negateCommuteTransform)) {
+
+ return false;
+ }
+
+ for (index = 0; index < evaluation->maxStrategy; index += 1) {
+ if (! StrategyExpressionIsValid(evaluation->expression[index],
+ evaluation->maxStrategy)) {
+
+ return false;
+ }
+ }
+ return true;
+}
+
+/* ----------------
+ * StrategyTermEvaluate
+ * ----------------
+ */
+static bool
+StrategyTermEvaluate(StrategyTerm term,
+ StrategyMap map,
+ Datum left,
+ Datum right)
+{
+ Index index;
+ long tmpres;
+ bool result;
+ StrategyOperator operator;
+ ScanKey entry;
+
+ for (index = 0, operator = &term->operatorData[0];
+ index < term->degree; index += 1, operator += 1) {
+
+ entry = &map->entry[operator->strategy - 1];
+
+ Assert(RegProcedureIsValid(entry->sk_procedure));
+
+ switch (operator->flags ^ entry->sk_flags) {
+ case 0x0:
+ tmpres = (long) FMGR_PTR2(entry->sk_func, entry->sk_procedure,
+ left, right);
+ break;
+
+ case SK_NEGATE:
+ tmpres = (long) !FMGR_PTR2(entry->sk_func, entry->sk_procedure,
+ left, right);
+ break;
+
+ case SK_COMMUTE:
+ tmpres = (long) FMGR_PTR2(entry->sk_func, entry->sk_procedure,
+ right, left);
+ break;
+
+ case SK_NEGATE | SK_COMMUTE:
+ tmpres = (long) !FMGR_PTR2(entry->sk_func, entry->sk_procedure,
+ right, left);
+ break;
+
+ default:
+ elog(FATAL, "StrategyTermEvaluate: impossible case %d",
+ operator->flags ^ entry->sk_flags);
+ }
+
+ result = (bool) tmpres;
+ if (!result)
+ return result;
+ }
+
+ return result;
+}
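+
+/*
+ * The XOR in the switch above cancels out any flags that were already
+ * folded into the cached scan key. For example, if the term asks for
+ * the commuted operator (flags = SK_COMMUTE) and the entry was itself
+ * stored commuted (sk_flags = SK_COMMUTE), the XOR is 0x0 and the
+ * procedure is simply applied to (left, right) unchanged.
+ */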
+
+
+/* ----------------
+ * RelationGetStrategy
+ * ----------------
+ */
+StrategyNumber
+RelationGetStrategy(Relation relation,
+ AttrNumber attributeNumber,
+ StrategyEvaluation evaluation,
+ RegProcedure procedure)
+{
+ StrategyNumber strategy;
+ StrategyMap strategyMap;
+ ScanKey entry;
+ Index index;
+ int numattrs;
+
+ Assert(RelationIsValid(relation));
+ numattrs = RelationGetNumberOfAttributes(relation);
+
+ Assert(relation->rd_rel->relkind == RELKIND_INDEX); /* XXX use accessor */
+ Assert(AttributeNumberIsValid(attributeNumber));
+ Assert( (attributeNumber >= 1) && (attributeNumber < 1 + numattrs));
+
+ Assert(StrategyEvaluationIsValid(evaluation));
+ Assert(RegProcedureIsValid(procedure));
+
+ strategyMap =
+ IndexStrategyGetStrategyMap(RelationGetIndexStrategy(relation),
+ evaluation->maxStrategy,
+ attributeNumber);
+
+ /* get a strategy number for the procedure ignoring flags for now */
+ for (index = 0; index < evaluation->maxStrategy; index += 1) {
+ if (strategyMap->entry[index].sk_procedure == procedure) {
+ break;
+ }
+ }
+
+ if (index == evaluation->maxStrategy)
+ return InvalidStrategy;
+
+ strategy = 1 + index;
+ entry = StrategyMapGetScanKeyEntry(strategyMap, strategy);
+
+ Assert(!(entry->sk_flags & ~(SK_NEGATE | SK_COMMUTE)));
+
+ switch (entry->sk_flags & (SK_NEGATE | SK_COMMUTE)) {
+ case 0x0:
+ return strategy;
+
+ case SK_NEGATE:
+ strategy = evaluation->negateTransform->strategy[strategy - 1];
+ break;
+
+ case SK_COMMUTE:
+ strategy = evaluation->commuteTransform->strategy[strategy - 1];
+ break;
+
+ case SK_NEGATE | SK_COMMUTE:
+ strategy = evaluation->negateCommuteTransform->strategy[strategy - 1];
+ break;
+
+ default:
+ elog(FATAL, "RelationGetStrategy: impossible case %d", entry->sk_flags);
+ }
+
+
+ if (! StrategyNumberIsInBounds(strategy, evaluation->maxStrategy)) {
+ if (! StrategyNumberIsValid(strategy)) {
+ elog(WARN, "RelationGetStrategy: corrupted evaluation");
+ }
+ }
+
+ return strategy;
+}
+
+/* ----------------
+ * RelationInvokeStrategy
+ * ----------------
+ */
+bool /* XXX someday, this may return Datum */
+RelationInvokeStrategy(Relation relation,
+ StrategyEvaluation evaluation,
+ AttrNumber attributeNumber,
+ StrategyNumber strategy,
+ Datum left,
+ Datum right)
+{
+ StrategyNumber newStrategy;
+ StrategyMap strategyMap;
+ ScanKey entry;
+ StrategyTermData termData;
+ int numattrs;
+
+ Assert(RelationIsValid(relation));
+ Assert(relation->rd_rel->relkind == RELKIND_INDEX); /* XXX use accessor */
+ numattrs = RelationGetNumberOfAttributes(relation);
+
+ Assert(StrategyEvaluationIsValid(evaluation));
+ Assert(AttributeNumberIsValid(attributeNumber));
+ Assert( (attributeNumber >= 1) && (attributeNumber < 1 + numattrs));
+
+ Assert(StrategyNumberIsInBounds(strategy, evaluation->maxStrategy));
+
+ termData.degree = 1;
+
+ strategyMap =
+ IndexStrategyGetStrategyMap(RelationGetIndexStrategy(relation),
+ evaluation->maxStrategy,
+ attributeNumber);
+
+ entry = StrategyMapGetScanKeyEntry(strategyMap, strategy);
+
+ if (RegProcedureIsValid(entry->sk_procedure)) {
+ termData.operatorData[0].strategy = strategy;
+ termData.operatorData[0].flags = 0x0;
+
+ return
+ StrategyTermEvaluate(&termData, strategyMap, left, right);
+ }
+
+
+ newStrategy = evaluation->negateTransform->strategy[strategy - 1];
+ if (newStrategy != strategy && StrategyNumberIsValid(newStrategy)) {
+
+ entry = StrategyMapGetScanKeyEntry(strategyMap, newStrategy);
+
+ if (RegProcedureIsValid(entry->sk_procedure)) {
+ termData.operatorData[0].strategy = newStrategy;
+ termData.operatorData[0].flags = SK_NEGATE;
+
+ return
+ StrategyTermEvaluate(&termData, strategyMap, left, right);
+ }
+ }
+
+ newStrategy = evaluation->commuteTransform->strategy[strategy - 1];
+ if (newStrategy != strategy && StrategyNumberIsValid(newStrategy)) {
+
+ entry = StrategyMapGetScanKeyEntry(strategyMap, newStrategy);
+
+ if (RegProcedureIsValid(entry->sk_procedure)) {
+ termData.operatorData[0].strategy = newStrategy;
+ termData.operatorData[0].flags = SK_COMMUTE;
+
+ return
+ StrategyTermEvaluate(&termData, strategyMap, left, right);
+ }
+ }
+
+ newStrategy = evaluation->negateCommuteTransform->strategy[strategy - 1];
+ if (newStrategy != strategy && StrategyNumberIsValid(newStrategy)) {
+
+ entry = StrategyMapGetScanKeyEntry(strategyMap, newStrategy);
+
+ if (RegProcedureIsValid(entry->sk_procedure)) {
+ termData.operatorData[0].strategy = newStrategy;
+ termData.operatorData[0].flags = SK_NEGATE | SK_COMMUTE;
+
+ return
+ StrategyTermEvaluate(&termData, strategyMap, left, right);
+ }
+ }
+
+ if (PointerIsValid(evaluation->expression[strategy - 1])) {
+ StrategyTerm *termP;
+
+ termP = &evaluation->expression[strategy - 1]->term[0];
+ while (PointerIsValid(*termP)) {
+ Index index;
+
+ for (index = 0; index < (*termP)->degree; index += 1) {
+ entry = StrategyMapGetScanKeyEntry(strategyMap,
+ (*termP)->operatorData[index].strategy);
+
+ if (! RegProcedureIsValid(entry->sk_procedure)) {
+ break;
+ }
+ }
+
+ if (index == (*termP)->degree) {
+ return
+ StrategyTermEvaluate(*termP, strategyMap, left, right);
+ }
+
+ termP += 1;
+ }
+ }
+
+ elog(WARN, "RelationInvokeStrategy: cannot evaluate strategy %d",
+ strategy);
+
+ /* not reached, just to make compiler happy */
+ return FALSE;
+
+
+}
+
+/* ----------------
+ * OperatorRelationFillScanKeyEntry
+ * ----------------
+ */
+static void
+OperatorRelationFillScanKeyEntry(Relation operatorRelation,
+ Oid operatorObjectId,
+ ScanKey entry)
+{
+ HeapScanDesc scan;
+ ScanKeyData scanKeyData;
+ HeapTuple tuple;
+
+ ScanKeyEntryInitialize(&scanKeyData, 0,
+ ObjectIdAttributeNumber,
+ ObjectIdEqualRegProcedure,
+ ObjectIdGetDatum(operatorObjectId));
+
+ scan = heap_beginscan(operatorRelation, false, NowTimeQual,
+ 1, &scanKeyData);
+
+ tuple = heap_getnext(scan, false, (Buffer *)NULL);
+ if (! HeapTupleIsValid(tuple)) {
+ elog(WARN, "OperatorObjectIdFillScanKeyEntry: unknown operator %lu",
+ (uint32) operatorObjectId);
+ }
+
+ entry->sk_flags = 0;
+ entry->sk_procedure =
+ ((OperatorTupleForm) GETSTRUCT(tuple))->oprcode;
+ fmgr_info(entry->sk_procedure, &entry->sk_func, &entry->sk_nargs);
+
+ if (! RegProcedureIsValid(entry->sk_procedure)) {
+ elog(WARN,
+ "OperatorObjectIdFillScanKeyEntry: no procedure for operator %lu",
+ (uint32) operatorObjectId);
+ }
+
+ heap_endscan(scan);
+}
+
+
+/*
+ * IndexSupportInitialize --
+ * Initializes an index strategy and associated support procedures.
+ */
+void
+IndexSupportInitialize(IndexStrategy indexStrategy,
+ RegProcedure *indexSupport,
+ Oid indexObjectId,
+ Oid accessMethodObjectId,
+ StrategyNumber maxStrategyNumber,
+ StrategyNumber maxSupportNumber,
+ AttrNumber maxAttributeNumber)
+{
+ Relation relation;
+ Relation operatorRelation;
+ HeapScanDesc scan;
+ HeapTuple tuple;
+ ScanKeyData entry[2];
+ StrategyMap map;
+ AttrNumber attributeNumber;
+ int attributeIndex;
+ Oid operatorClassObjectId[ MaxIndexAttributeNumber ];
+
+ maxStrategyNumber = AMStrategies(maxStrategyNumber);
+
+ ScanKeyEntryInitialize(&entry[0], 0, Anum_pg_index_indexrelid,
+ ObjectIdEqualRegProcedure,
+ ObjectIdGetDatum(indexObjectId));
+
+ relation = heap_openr(IndexRelationName);
+ scan = heap_beginscan(relation, false, NowTimeQual, 1, entry);
+ tuple = heap_getnext(scan, 0, (Buffer *)NULL);
+ if (! HeapTupleIsValid(tuple))
+ elog(WARN, "IndexSupportInitialize: corrupted catalogs");
+
+ /*
+ * XXX note that the following assumes the INDEX tuple is well formed and
+ * that the key[] and class[] are 0 terminated.
+ */
+ for (attributeIndex=0; attributeIndex<maxAttributeNumber; attributeIndex++)
+ {
+ IndexTupleForm iform;
+
+ iform = (IndexTupleForm) GETSTRUCT(tuple);
+
+ if (!OidIsValid(iform->indkey[attributeIndex])) {
+ if (attributeIndex == 0) {
+ elog(WARN, "IndexSupportInitialize: no pg_index tuple");
+ }
+ break;
+ }
+
+ operatorClassObjectId[attributeIndex]
+ = iform->indclass[attributeIndex];
+ }
+
+ heap_endscan(scan);
+ heap_close(relation);
+
+ /* if support routines exist for this access method, load them */
+ if (maxSupportNumber > 0) {
+
+ ScanKeyEntryInitialize(&entry[0], 0, Anum_pg_amproc_amid,
+ ObjectIdEqualRegProcedure,
+ ObjectIdGetDatum(accessMethodObjectId));
+
+ ScanKeyEntryInitialize(&entry[1], 0, Anum_pg_amproc_amopclaid,
+ ObjectIdEqualRegProcedure, 0);
+
+/* relation = heap_openr(Name_pg_amproc); */
+ relation = heap_openr(AccessMethodProcedureRelationName);
+
+
+ for (attributeNumber = maxAttributeNumber; attributeNumber > 0;
+ attributeNumber--) {
+
+ int16 support;
+ Form_pg_amproc form;
+ RegProcedure *loc;
+
+ loc = &indexSupport[((attributeNumber - 1) * maxSupportNumber)];
+
+ for (support = maxSupportNumber; --support >= 0; ) {
+ loc[support] = InvalidOid;
+ }
+
+ entry[1].sk_argument =
+ ObjectIdGetDatum(operatorClassObjectId[attributeNumber - 1]);
+
+ scan = heap_beginscan(relation, false, NowTimeQual, 2, entry);
+
+ while (tuple = heap_getnext(scan, 0, (Buffer *)NULL),
+ HeapTupleIsValid(tuple)) {
+
+ form = (Form_pg_amproc) GETSTRUCT(tuple);
+ loc[(form->amprocnum - 1)] = form->amproc;
+ }
+
+ heap_endscan(scan);
+ }
+ heap_close(relation);
+ }
+
+ ScanKeyEntryInitialize(&entry[0], 0,
+ Anum_pg_amop_amopid,
+ ObjectIdEqualRegProcedure,
+ ObjectIdGetDatum(accessMethodObjectId));
+
+ ScanKeyEntryInitialize(&entry[1], 0,
+ Anum_pg_amop_amopclaid,
+ ObjectIdEqualRegProcedure, 0);
+
+ relation = heap_openr(AccessMethodOperatorRelationName);
+ operatorRelation = heap_openr(OperatorRelationName);
+
+ for (attributeNumber = maxAttributeNumber; attributeNumber > 0;
+ attributeNumber--) {
+
+ StrategyNumber strategy;
+
+ entry[1].sk_argument =
+ ObjectIdGetDatum(operatorClassObjectId[attributeNumber - 1]);
+
+ map = IndexStrategyGetStrategyMap(indexStrategy,
+ maxStrategyNumber,
+ attributeNumber);
+
+ for (strategy = 1; strategy <= maxStrategyNumber; strategy++)
+ ScanKeyEntrySetIllegal(StrategyMapGetScanKeyEntry(map, strategy));
+
+ scan = heap_beginscan(relation, false, NowTimeQual, 2, entry);
+
+ while (tuple = heap_getnext(scan, 0, (Buffer *)NULL),
+ HeapTupleIsValid(tuple)) {
+ Form_pg_amop form;
+
+ form = (Form_pg_amop) GETSTRUCT(tuple);
+
+ OperatorRelationFillScanKeyEntry(operatorRelation,
+ form->amopopr,
+ StrategyMapGetScanKeyEntry(map, form->amopstrategy));
+ }
+
+ heap_endscan(scan);
+ }
+
+ heap_close(operatorRelation);
+ heap_close(relation);
+}
+
+/* ----------------
+ * IndexStrategyDisplay
+ * ----------------
+ */
+#ifdef ISTRATDEBUG
+int
+IndexStrategyDisplay(IndexStrategy indexStrategy,
+ StrategyNumber numberOfStrategies,
+ int numberOfAttributes)
+{
+ StrategyMap strategyMap;
+ AttrNumber attributeNumber;
+ StrategyNumber strategyNumber;
+
+ for (attributeNumber = 1; attributeNumber <= numberOfAttributes;
+ attributeNumber += 1) {
+
+ strategyMap = IndexStrategyGetStrategyMap(indexStrategy,
+ numberOfStrategies,
+ attributeNumber);
+
+ for (strategyNumber = 1;
+ strategyNumber <= AMStrategies(numberOfStrategies);
+ strategyNumber += 1) {
+
+ printf(":att %d\t:str %d\t:opr 0x%x(%d)\n",
+ attributeNumber, strategyNumber,
+ strategyMap->entry[strategyNumber - 1].sk_procedure,
+ strategyMap->entry[strategyNumber - 1].sk_procedure);
+ }
+ }
+}
+#endif /* defined(ISTRATDEBUG) */
+
+
diff --git a/src/backend/access/iqual.h b/src/backend/access/iqual.h
new file mode 100644
index 00000000000..5fab98a15bd
--- /dev/null
+++ b/src/backend/access/iqual.h
@@ -0,0 +1,32 @@
+/*-------------------------------------------------------------------------
+ *
+ * iqual.h--
+ * Index scan key qualification definitions.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: iqual.h,v 1.1.1.1 1996/07/09 06:21:08 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef IQUAL_H
+#define IQUAL_H
+
+#include "c.h"
+
+#include "storage/itemid.h"
+#include "utils/rel.h"
+#include "access/skey.h"
+
+/* ----------------
+ * index tuple qualification support
+ * ----------------
+ */
+
+extern int NIndexTupleProcessed;
+
+extern bool index_keytest(IndexTuple tuple, TupleDesc tupdesc,
+ int scanKeySize, ScanKey key);
+
+#endif /* IQUAL_H */
diff --git a/src/backend/access/istrat.h b/src/backend/access/istrat.h
new file mode 100644
index 00000000000..201e70e6602
--- /dev/null
+++ b/src/backend/access/istrat.h
@@ -0,0 +1,80 @@
+/*-------------------------------------------------------------------------
+ *
+ * istrat.h--
+ * POSTGRES index strategy definitions.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: istrat.h,v 1.1.1.1 1996/07/09 06:21:08 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef ISTRAT_H
+#define ISTRAT_H
+
+#include "postgres.h"
+#include "access/attnum.h"
+#include "access/skey.h"
+#include "access/strat.h"
+#include "utils/rel.h" /* for Relation */
+
+/*
+ * StrategyNumberIsValid --
+ * True iff the strategy number is valid.
+ */
+#define StrategyNumberIsValid(strategyNumber) \
+ ((bool) ((strategyNumber) != InvalidStrategy))
+
+/*
+ * StrategyNumberIsInBounds --
+ * True iff strategy number is within given bounds.
+ *
+ * Note:
+ * Assumes StrategyNumber is an unsigned type.
+ * Assumes the bounded interval to be (0,max].
+ */
+#define StrategyNumberIsInBounds(strategyNumber, maxStrategyNumber) \
+ ((bool)(InvalidStrategy < (strategyNumber) && \
+ (strategyNumber) <= (maxStrategyNumber)))
+
+/*
+ * StrategyMapIsValid --
+ * True iff the index strategy mapping is valid.
+ */
+#define StrategyMapIsValid(map) PointerIsValid(map)
+
+/*
+ * IndexStrategyIsValid --
+ * True iff the index strategy is valid.
+ */
+#define IndexStrategyIsValid(s) PointerIsValid(s)
+
+extern ScanKey StrategyMapGetScanKeyEntry(StrategyMap map,
+ StrategyNumber strategyNumber);
+extern StrategyMap IndexStrategyGetStrategyMap(IndexStrategy indexStrategy,
+ StrategyNumber maxStrategyNum, AttrNumber attrNum);
+
+extern Size
+AttributeNumberGetIndexStrategySize(AttrNumber maxAttributeNumber,
+ StrategyNumber maxStrategyNumber);
+extern bool StrategyOperatorIsValid(StrategyOperator operator,
+ StrategyNumber maxStrategy);
+extern bool StrategyTermIsValid(StrategyTerm term,
+ StrategyNumber maxStrategy);
+extern bool StrategyExpressionIsValid(StrategyExpression expression,
+ StrategyNumber maxStrategy);
+extern bool StrategyEvaluationIsValid(StrategyEvaluation evaluation);
+extern StrategyNumber RelationGetStrategy(Relation relation,
+ AttrNumber attributeNumber, StrategyEvaluation evaluation,
+ RegProcedure procedure);
+extern bool RelationInvokeStrategy(Relation relation,
+ StrategyEvaluation evaluation, AttrNumber attributeNumber,
+ StrategyNumber strategy, Datum left, Datum right);
+extern void IndexSupportInitialize(IndexStrategy indexStrategy,
+ RegProcedure *indexSupport, Oid indexObjectId,
+ Oid accessMethodObjectId, StrategyNumber maxStrategyNumber,
+ StrategyNumber maxSupportNumber, AttrNumber maxAttributeNumber);
+
+
+#endif /* ISTRAT_H */
diff --git a/src/backend/access/itup.h b/src/backend/access/itup.h
new file mode 100644
index 00000000000..028bf430b0d
--- /dev/null
+++ b/src/backend/access/itup.h
@@ -0,0 +1,104 @@
+/*-------------------------------------------------------------------------
+ *
+ * itup.h--
+ * POSTGRES index tuple definitions.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: itup.h,v 1.1.1.1 1996/07/09 06:21:08 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef ITUP_H
+#define ITUP_H
+
+#include "c.h"
+#include "access/ibit.h"
+#include "access/tupdesc.h" /* for TupleDesc */
+#include "storage/itemptr.h"
+
+#define MaxIndexAttributeNumber 7
+
+typedef struct IndexTupleData {
+ ItemPointerData t_tid; /* reference TID to base tuple */
+
+ /*
+ * t_info is layed out in the following fashion:
+ *
+ * 15th (leftmost) bit: "has nulls" bit
+ * 14th bit: "has varlenas" bit
+ * 13th bit: "has rules" bit - (removed ay 11/94)
+ * bits 12-0 bit: size of tuple.
+ */
+
+ unsigned short t_info; /* various info about tuple */
+
+ /*
+ * please make sure sizeof(IndexTupleData) is MAXALIGN'ed.
+ * See IndexInfoFindDataOffset() for the reason.
+ */
+
+} IndexTupleData; /* MORE DATA FOLLOWS AT END OF STRUCT */
+
+typedef IndexTupleData *IndexTuple;
+
+
+typedef struct InsertIndexResultData {
+ ItemPointerData pointerData;
+} InsertIndexResultData;
+
+typedef InsertIndexResultData *InsertIndexResult;
+
+
+typedef struct RetrieveIndexResultData {
+ ItemPointerData index_iptr;
+ ItemPointerData heap_iptr;
+} RetrieveIndexResultData;
+
+typedef RetrieveIndexResultData *RetrieveIndexResult;
+
+
+/*-----------------
+ * PredInfo -
+ * used for partial indices
+ *-----------------
+ */
+typedef struct PredInfo {
+ Node *pred;
+ Node *oldPred;
+} PredInfo;
+
+
+/* ----------------
+ * externs
+ * ----------------
+ */
+
+#define INDEX_SIZE_MASK 0x1FFF
+#define INDEX_NULL_MASK 0x8000
+#define INDEX_VAR_MASK 0x4000
+
+#define IndexTupleSize(itup) (((IndexTuple) (itup))->t_info & 0x1FFF)
+#define IndexTupleDSize(itup) ((itup).t_info & 0x1FFF)
+#define IndexTupleNoNulls(itup) (!(((IndexTuple) (itup))->t_info & 0x8000))
+#define IndexTupleAllFixed(itup) (!(((IndexTuple) (itup))->t_info & 0x4000))
+
+#define IndexTupleHasMinHeader(itup) (IndexTupleNoNulls(itup))
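+
+/*
+ * Example (hypothetical values): an index tuple with no nulls, no
+ * varlenas, and a total size of 24 bytes has
+ *
+ *	itup->t_info == 24	(flag bits 0x8000 and 0x4000 clear)
+ *	IndexTupleSize(itup) == 24
+ *	IndexTupleNoNulls(itup) and IndexTupleAllFixed(itup) both true
+ */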
+
+
+/* indextuple.h */
+extern IndexTuple index_formtuple(TupleDesc tupleDescriptor,
+ Datum value[], char null[]);
+extern char *fastgetiattr(IndexTuple tup, int attnum,
+ TupleDesc att, bool *isnull);
+extern Datum index_getattr(IndexTuple tuple, AttrNumber attNum,
+ TupleDesc tupDesc, bool *isNullOutP);
+extern RetrieveIndexResult
+FormRetrieveIndexResult(ItemPointer indexItemPointer,
+ ItemPointer heapItemPointer);
+extern void CopyIndexTuple(IndexTuple source, IndexTuple *target);
+
+
+#endif /* ITUP_H */
+
diff --git a/src/backend/access/nbtree.h b/src/backend/access/nbtree.h
new file mode 100644
index 00000000000..d5c37a23950
--- /dev/null
+++ b/src/backend/access/nbtree.h
@@ -0,0 +1,264 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtree.h--
+ * header file for postgres btree access method implementation.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: nbtree.h,v 1.1.1.1 1996/07/09 06:21:08 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef NBTREE_H
+#define NBTREE_H
+
+#include "access/attnum.h"
+#include "access/itup.h"
+#include "access/htup.h"
+#include "access/tupdesc.h"
+
+#include "access/istrat.h"
+#include "access/funcindex.h"
+#include "access/relscan.h"
+#include "access/sdir.h"
+#include "nodes/pg_list.h"
+
+/*
+ * BTPageOpaqueData -- At the end of every page, we store a pointer
+ * to both siblings in the tree. See Lehman and Yao's paper for more
+ * info. In addition, we need to know what sort of page this is
+ * (leaf or internal), and whether the page is available for reuse.
+ *
+ * Lehman and Yao's algorithm requires a ``high key'' on every page.
+ * The high key on a page is guaranteed to be greater than or equal
+ * to any key that appears on this page. Our insertion algorithm
+ * guarantees that we can use the initial least key on our right
+ * sibling as the high key. We allocate space for the line pointer
+ * to the high key in the opaque data at the end of the page.
+ *
+ * Rightmost pages in the tree have no high key.
+ */
+
+typedef struct BTPageOpaqueData {
+ BlockNumber btpo_prev;
+ BlockNumber btpo_next;
+ uint16 btpo_flags;
+
+#define BTP_LEAF (1 << 0)
+#define BTP_ROOT (1 << 1)
+#define BTP_FREE (1 << 2)
+#define BTP_META (1 << 3)
+
+} BTPageOpaqueData;
+
+typedef BTPageOpaqueData *BTPageOpaque;
+
+/*
+ * ScanOpaqueData is used to remember which buffers we're currently
+ * examining in the scan. We keep these buffers locked and pinned
+ * and recorded in the opaque entry of the scan in order to avoid
+ * doing a ReadBuffer() for every tuple in the index. This avoids
+ * semop() calls, which are expensive.
+ */
+
+typedef struct BTScanOpaqueData {
+ Buffer btso_curbuf;
+ Buffer btso_mrkbuf;
+} BTScanOpaqueData;
+
+typedef BTScanOpaqueData *BTScanOpaque;
+
+/*
+ * BTItems are what we store in the btree. Each item has an index
+ * tuple, including key and pointer values. In addition, we must
+ * guarantee that all tuples in the index are unique, in order to
+ * satisfy some assumptions in Lehman and Yao. The way that we do
+ * this is by generating a new OID for every insertion that we do in
+ * the tree. This adds eight bytes to the size of btree index
+ * tuples. Note that we do not use the OID as part of a composite
+ * key; the OID only serves as a unique identifier for a given index
+ * tuple (logical position within a page).
+ */
+
+typedef struct BTItemData {
+ Oid bti_oid;
+ int32 bti_dummy; /* padding to make bti_itup
+ * align at 8-byte boundary
+ */
+ IndexTupleData bti_itup;
+} BTItemData;
+
+typedef BTItemData *BTItem;
+
+/*
+ * BTStackData -- As we descend a tree, we push the (key, pointer)
+ * pairs from internal nodes onto a private stack. If we split a
+ * leaf, we use this stack to walk back up the tree and insert data
+ * into parent nodes (and possibly to split them, too). Lehman and
+ * Yao's update algorithm guarantees that under no circumstances can
+ * our private stack give us an irredeemably bad picture up the tree.
+ * Again, see the paper for details.
+ */
+
+typedef struct BTStackData {
+ BlockNumber bts_blkno;
+ OffsetNumber bts_offset;
+ BTItem bts_btitem;
+ struct BTStackData *bts_parent;
+} BTStackData;
+
+typedef BTStackData *BTStack;
+
+/*
+ * We need to be able to tell the difference between read and write
+ * requests for pages, in order to do locking correctly.
+ */
+
+#define BT_READ 0
+#define BT_WRITE 1
+
+/*
+ * Similarly, the difference between insertion and non-insertion binary
+ * searches on a given page makes a difference when we're descending the
+ * tree.
+ */
+
+#define BT_INSERTION 0
+#define BT_DESCENT 1
+
+/*
+ * In general, the btree code tries to localize its knowledge about
+ * page layout to a couple of routines. However, we need a special
+ * value to indicate "no page number" in those places where we expect
+ * page numbers.
+ */
+
+#define P_NONE 0
+#define P_LEFTMOST(opaque) ((opaque)->btpo_prev == P_NONE)
+#define P_RIGHTMOST(opaque) ((opaque)->btpo_next == P_NONE)
+
+#define P_HIKEY ((OffsetNumber) 1)
+#define P_FIRSTKEY ((OffsetNumber) 2)
+
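+/*
+ * For example (a hypothetical fragment), code scanning a page starts at
+ * the first data item, which follows the high key except on a rightmost
+ * page, where there is no high key:
+ *
+ *	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ *	start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+ */
+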
+/*
+ * Strategy numbers -- ordering of these is <, <=, =, >=, >
+ */
+
+#define BTLessStrategyNumber 1
+#define BTLessEqualStrategyNumber 2
+#define BTEqualStrategyNumber 3
+#define BTGreaterEqualStrategyNumber 4
+#define BTGreaterStrategyNumber 5
+#define BTMaxStrategyNumber 5
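+
+/*
+ * So, for example, the pg_amop entries for an int4 operator class map
+ * the operators <, <=, =, >=, > to strategies 1 through 5 respectively
+ * (illustrative; the actual assignments live in the system catalogs).
+ */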
+
+/*
+ * When a new operator class is declared, we require that the user
+ * supply us with an amproc procedure for determining whether, for
+ * two keys a and b, a < b, a = b, or a > b. This routine must
+ * return < 0, 0, > 0, respectively, in these three cases. Since we
+ * only have one such proc in amproc, it's number 1.
+ */
+
+#define BTORDER_PROC 1
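+
+/*
+ * A minimal sketch of such a support procedure for int4 keys (the real
+ * comparison procedures live in nbtcompare.c; this only illustrates the
+ * return-value contract):
+ *
+ *	int32
+ *	btint4cmp(int32 a, int32 b)
+ *	{
+ *		return (a - b);
+ *	}
+ */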
+
+
+/*
+ * prototypes for functions in nbtinsert.c
+ */
+extern InsertIndexResult _bt_doinsert(Relation rel, BTItem btitem);
+extern bool _bt_itemcmp(Relation rel, Size keysz, BTItem item1, BTItem item2,
+ StrategyNumber strat);
+
+/*
+ * prototypes for functions in nbtpage.c
+ */
+extern void _bt_metapinit(Relation rel);
+extern void _bt_checkmeta(Relation rel);
+extern Buffer _bt_getroot(Relation rel, int access);
+extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
+extern void _bt_relbuf(Relation rel, Buffer buf, int access);
+extern void _bt_wrtbuf(Relation rel, Buffer buf);
+extern void _bt_wrtnorelbuf(Relation rel, Buffer buf);
+extern void _bt_pageinit(Page page, Size size);
+extern void _bt_metaproot(Relation rel, BlockNumber rootbknum);
+extern Buffer _bt_getstackbuf(Relation rel, BTStack stack, int access);
+extern void _bt_setpagelock(Relation rel, BlockNumber blkno, int access);
+extern void _bt_unsetpagelock(Relation rel, BlockNumber blkno, int access);
+extern void _bt_pagedel(Relation rel, ItemPointer tid);
+
+/*
+ * prototypes for functions in nbtree.c
+ */
+extern bool BuildingBtree; /* in nbtree.c */
+
+extern void btbuild(Relation heap, Relation index, int natts,
+ AttrNumber *attnum, IndexStrategy istrat, uint16 pcount,
+ Datum *params, FuncIndexInfo *finfo, PredInfo *predInfo);
+extern InsertIndexResult btinsert(Relation rel, IndexTuple itup);
+extern char *btgettuple(IndexScanDesc scan, ScanDirection dir);
+extern char *btbeginscan(Relation rel, bool fromEnd, uint16 keysz,
+ ScanKey scankey);
+
+extern void btrescan(IndexScanDesc scan, bool fromEnd, ScanKey scankey);
+extern void btmovescan(IndexScanDesc scan, Datum v);
+extern void btendscan(IndexScanDesc scan);
+extern void btmarkpos(IndexScanDesc scan);
+extern void btrestrpos(IndexScanDesc scan);
+extern void btdelete(Relation rel, ItemPointer tid);
+
+/*
+ * prototypes for functions in nbtscan.c
+ */
+extern void _bt_regscan(IndexScanDesc scan);
+extern void _bt_dropscan(IndexScanDesc scan);
+extern void _bt_adjscans(Relation rel, ItemPointer tid);
+extern void _bt_scandel(IndexScanDesc scan, BlockNumber blkno,
+ OffsetNumber offno);
+extern bool _bt_scantouched(IndexScanDesc scan, BlockNumber blkno,
+ OffsetNumber offno);
+
+/*
+ * prototypes for functions in nbtsearch.c
+ */
+extern BTStack _bt_search(Relation rel, int keysz, ScanKey scankey,
+ Buffer *bufP);
+extern Buffer _bt_moveright(Relation rel, Buffer buf, int keysz,
+ ScanKey scankey, int access);
+extern bool _bt_skeycmp(Relation rel, Size keysz, ScanKey scankey,
+ Page page, ItemId itemid, StrategyNumber strat);
+extern OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz,
+ ScanKey scankey, int srchtype);
+extern RetrieveIndexResult _bt_next(IndexScanDesc scan, ScanDirection dir);
+extern RetrieveIndexResult _bt_first(IndexScanDesc scan, ScanDirection dir);
+extern bool _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir);
+
+/*
+ * prototypes for functions in nbtstrat.c
+ */
+extern StrategyNumber _bt_getstrat(Relation rel, AttrNumber attno,
+ RegProcedure proc);
+extern bool _bt_invokestrat(Relation rel, AttrNumber attno,
+ StrategyNumber strat, Datum left, Datum right);
+
+/*
+ * prototypes for functions in nbtutils.c
+ */
+extern ScanKey _bt_mkscankey(Relation rel, IndexTuple itup);
+extern void _bt_freeskey(ScanKey skey);
+extern void _bt_freestack(BTStack stack);
+extern void _bt_orderkeys(Relation relation, uint16 *numberOfKeys,
+ ScanKey key);
+extern bool _bt_checkqual(IndexScanDesc scan, IndexTuple itup);
+extern BTItem _bt_formitem(IndexTuple itup);
+
+/*
+ * prototypes for functions in nbtsort.c
+ */
+extern void *_bt_spoolinit(Relation index, int ntapes);
+extern void _bt_spooldestroy(void *spool);
+extern void _bt_spool(Relation index, BTItem btitem, void *spool);
+extern void _bt_upperbuild(Relation index, BlockNumber blk, int level);
+extern void _bt_leafbuild(Relation index, void *spool);
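+
+/*
+ * The spool routines implement the bottom-up (bulk) build path.
+ * The expected call sequence, as exercised by btbuild() in nbtree.c,
+ * is roughly:
+ *
+ *	spool = _bt_spoolinit(index, ntapes);
+ *	(for each tuple)  _bt_spool(index, btitem, spool);
+ *	_bt_spool(index, (BTItem) NULL, spool);	   (flush the spool)
+ *	_bt_leafbuild(index, spool);
+ *	_bt_spooldestroy(spool);
+ */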
+
+#endif /* NBTREE_H */
diff --git a/src/backend/access/nbtree/Makefile.inc b/src/backend/access/nbtree/Makefile.inc
new file mode 100644
index 00000000000..50854008c01
--- /dev/null
+++ b/src/backend/access/nbtree/Makefile.inc
@@ -0,0 +1,15 @@
+#-------------------------------------------------------------------------
+#
+# Makefile.inc--
+# Makefile for access/nbtree (btree access method)
+#
+# Copyright (c) 1994, Regents of the University of California
+#
+#
+# IDENTIFICATION
+# $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:11 scrappy Exp $
+#
+#-------------------------------------------------------------------------
+
+SUBSRCS+= nbtcompare.c nbtinsert.c nbtpage.c nbtree.c nbtscan.c nbtsearch.c \
+ nbtstrat.c nbtutils.c nbtsort.c
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
new file mode 100644
index 00000000000..a204ad4af08
--- /dev/null
+++ b/src/backend/access/nbtree/README
@@ -0,0 +1,68 @@
+$Header: /cvsroot/pgsql/src/backend/access/nbtree/README,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
+
+This directory contains a correct implementation of Lehman and Yao's
+btree management algorithm that supports concurrent access for Postgres.
+We have made the following changes in order to incorporate their algorithm
+into Postgres:
+
+ + The requirement that all btree keys be unique is too onerous,
+ but the algorithm won't work correctly without it. As a result,
+ this implementation adds an OID (guaranteed to be unique) to
+ every key in the index. This guarantees uniqueness within a set
+ of duplicates. Space overhead is four bytes.
+
+ For this reason, when the common access method code passes us an
+ index tuple to store, we allocate a larger item and copy the
+ supplied tuple into it. No Postgres code outside the btree access
+ method knows about this extra OID; a sketch of the item layout
+ appears just after this list.
+
+ + Lehman and Yao don't require read locks, but assume that in-
+ memory copies of tree nodes are unshared. Postgres shares
+ in-memory buffers among backends. As a result, we do page-
+ level read locking on btree nodes in order to guarantee that
+ no record is modified while we are examining it. This reduces
+ concurrency but guarantees correct behavior.
+
+ + Read locks on a page are held for as long as a scan has a pointer
+ to the page. However, locks are always surrendered before the
+ sibling page lock is acquired (for readers), so we remain deadlock-
+ free. I will do a formal proof if I get bored anytime soon.
+
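+ As an illustration, the items we actually store look roughly like
+ this (a sketch only; the real declaration is BTItemData in
+ access/nbtree.h, whose exact layout is assumed here):
+
+     typedef struct BTItemData {
+         Oid            bti_oid;   /* disambiguates duplicates */
+         IndexTupleData bti_itup;  /* the index tuple proper */
+     } BTItemData;
+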
+In addition, the following things are handy to know:
+
+ + Page zero of every btree is a meta-data page. This page stores
+ the location of the root page, a pointer to a list of free
+ pages, and other stuff that's handy to know.
+
+ + This algorithm doesn't really work, since it requires ordered
+ writes, and UNIX doesn't support ordered writes.
+
+ + There's one other case where we may screw up in this
+ implementation. When we start a scan, we descend the tree
+ to the key nearest the one in the qual, and once we get there,
+ position ourselves correctly for the qual type (eg, <, >=, etc).
+ If we happen to step off a page, decide we want to get back to
+ it, and fetch the page again, and if some bad person has split
+ the page and moved the last tuple we saw off of it, then the
+ code complains about botched concurrency in an elog(WARN, ...)
+ and gives up the ghost. This is the ONLY violation of Lehman
+ and Yao's guarantee of correct behavior that I am aware of in
+ this code.
+
+Notes to operator class implementors:
+
+ With this implementation, we require the user to supply us with
+ a procedure for pg_amproc. This procedure should take two keys
+ A and B and return < 0, 0, or > 0 if A < B, A = B, or A > B,
+ respectively. See the pg_amproc entries for the btree access
+ method for some samples.
+
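+ For illustration, a comparison procedure for 4-byte integers could
+ look like this (a minimal sketch; the functions actually registered
+ for the standard operator classes are in nbtcompare.c):
+
+     int32
+     btint4cmp(int32 a, int32 b)
+     {
+         /* three-way compare; (a - b) could overflow */
+         return ((a > b) ? 1 : ((a == b) ? 0 : -1));
+     }
+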
+Notes to mao for implementation document:
+
+ On deletions, we need to adjust the position of active scans on
+ the index. The code in nbtscan.c handles this. We don't need to
+ do this for splits because of the way splits are handled; if they
+ happen behind us, we'll automatically go to the next page, and if
+ they happen in front of us, we're not affected by them. For
+ insertions, if we inserted a tuple behind the current scan location
+ on the current scan page, we move one space ahead.
diff --git a/src/backend/access/nbtree/nbtcompare.c b/src/backend/access/nbtree/nbtcompare.c
new file mode 100644
index 00000000000..e567b3c44cb
--- /dev/null
+++ b/src/backend/access/nbtree/nbtcompare.c
@@ -0,0 +1,173 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtcompare.c--
+ * Comparison functions for btree access method.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtcompare.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
+ *
+ * NOTES
+ * These functions are stored in pg_amproc. For each operator class
+ * defined on btrees, they compute
+ *
+ * compare(a, b):
+ * < 0 if a < b,
+ * = 0 if a == b,
+ * > 0 if a > b.
+ *-------------------------------------------------------------------------
+ */
+#include <string.h>
+#include "postgres.h"
+#include "utils/nabstime.h"
+
+int32
+btint2cmp(int16 a, int16 b)
+{
+ return ((int32) (a - b));
+}
+
+int32
+btint4cmp(int32 a, int32 b)
+{
+    /* three-way compare; (a - b) could overflow */
+    return ((a > b) ? 1 : ((a == b) ? 0 : -1));
+}
+
+int32
+btint24cmp(int16 a, int32 b)
+{
+    /* three-way compare; the subtraction could overflow */
+    return ((((int32) a) > b) ? 1 : ((((int32) a) == b) ? 0 : -1));
+}
+
+int32
+btint42cmp(int32 a, int16 b)
+{
+    /* three-way compare; the subtraction could overflow */
+    return ((a > ((int32) b)) ? 1 : ((a == ((int32) b)) ? 0 : -1));
+}
+
+int32
+btfloat4cmp(float32 a, float32 b)
+{
+ if (*a > *b)
+ return (1);
+ else if (*a == *b)
+ return (0);
+ else
+ return (-1);
+}
+
+int32
+btfloat8cmp(float64 a, float64 b)
+{
+ if (*a > *b)
+ return (1);
+ else if (*a == *b)
+ return (0);
+ else
+ return (-1);
+}
+
+int32
+btoidcmp(Oid a, Oid b)
+{
+ if (a > b)
+ return (1);
+ else if (a == b)
+ return (0);
+ else
+ return (-1);
+}
+
+int32
+btabstimecmp(AbsoluteTime a, AbsoluteTime b)
+{
+ if (AbsoluteTimeIsBefore(a, b))
+ return (1);
+ else if (AbsoluteTimeIsBefore(b, a))
+ return (-1);
+ else
+ return (0);
+}
+
+int32
+btcharcmp(char a, char b)
+{
+ return ((int32) (a - b));
+}
+
+int32
+btchar2cmp(uint16 a, uint16 b)
+{
+ return (strncmp((char *) &a, (char *) &b, 2));
+}
+
+int32
+btchar4cmp(uint32 a, uint32 b)
+{
+ return (strncmp((char *) &a, (char *) &b, 4));
+}
+
+int32
+btchar8cmp(char *a, char *b)
+{
+ return (strncmp(a, b, 8));
+}
+
+int32
+btchar16cmp(char *a, char *b)
+{
+ return (strncmp(a, b, 16));
+}
+
+int32
+btnamecmp(NameData *a, NameData *b)
+{
+ return (strncmp(a->data, b->data, NAMEDATALEN));
+}
+
+int32
+bttextcmp(struct varlena *a, struct varlena *b)
+{
+ char *ap, *bp;
+ int len;
+ int res;
+
+ ap = VARDATA(a);
+ bp = VARDATA(b);
+
+ /* len is the length of the shorter of the two strings */
+ if ((len = VARSIZE(a)) > VARSIZE(b))
+ len = VARSIZE(b);
+
+ /* len includes the four bytes in which string length is stored */
+ len -= sizeof(VARSIZE(a));
+
+ /*
+ * If the two strings differ in the first len bytes, or if they're
+ * the same in the first len bytes and they're both len bytes long,
+ * we're done.
+ */
+
+ res = 0;
+ if (len > 0) {
+ do {
+ res = (int) (*ap++ - *bp++);
+ len--;
+ } while (res == 0 && len != 0);
+ }
+
+ if (res != 0 || VARSIZE(a) == VARSIZE(b))
+ return (res);
+
+ /*
+ * The two strings are the same in the first len bytes, and they
+ * are of different lengths.
+ */
+
+ if (VARSIZE(a) < VARSIZE(b))
+ return (-1);
+ else
+ return (1);
+}
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
new file mode 100644
index 00000000000..536c0aa385d
--- /dev/null
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -0,0 +1,831 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtinsert.c--
+ * Item insertion in Lehman and Yao btrees for Postgres.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/nbtree.h"
+
+static InsertIndexResult _bt_insertonpg(Relation rel, Buffer buf, BTStack stack, int keysz, ScanKey scankey, BTItem btitem, BTItem afteritem);
+static Buffer _bt_split(Relation rel, Buffer buf);
+static OffsetNumber _bt_findsplitloc(Relation rel, Page page, OffsetNumber start, OffsetNumber maxoff, Size llimit);
+static void _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
+static OffsetNumber _bt_pgaddtup(Relation rel, Buffer buf, int keysz, ScanKey itup_scankey, Size itemsize, BTItem btitem, BTItem afteritem);
+static bool _bt_goesonpg(Relation rel, Buffer buf, Size keysz, ScanKey scankey, BTItem afteritem);
+static void _bt_updateitem(Relation rel, Size keysz, Buffer buf, Oid bti_oid, BTItem newItem);
+
+/*
+ * _bt_doinsert() -- Handle insertion of a single btitem in the tree.
+ *
+ * This routine is called by the public interface routines, btbuild
+ * and btinsert. By here, btitem is filled in, and has a unique
+ * (xid, seqno) pair.
+ */
+InsertIndexResult
+_bt_doinsert(Relation rel, BTItem btitem)
+{
+ ScanKey itup_scankey;
+ IndexTuple itup;
+ BTStack stack;
+ Buffer buf;
+ BlockNumber blkno;
+ int natts;
+ InsertIndexResult res;
+
+ itup = &(btitem->bti_itup);
+
+ /* we need a scan key to do our search, so build one */
+ itup_scankey = _bt_mkscankey(rel, itup);
+ natts = rel->rd_rel->relnatts;
+
+ /* find the page containing this key */
+ stack = _bt_search(rel, natts, itup_scankey, &buf);
+ blkno = BufferGetBlockNumber(buf);
+
+ /* trade in our read lock for a write lock */
+ _bt_relbuf(rel, buf, BT_READ);
+ buf = _bt_getbuf(rel, blkno, BT_WRITE);
+
+ /*
+ * If the page was split between the time that we surrendered our
+ * read lock and acquired our write lock, then this page may no
+ * longer be the right place for the key we want to insert. In this
+ * case, we need to move right in the tree. See Lehman and Yao for
+ * an excruciatingly precise description.
+ */
+
+ buf = _bt_moveright(rel, buf, natts, itup_scankey, BT_WRITE);
+
+ /* do the insertion */
+ res = _bt_insertonpg(rel, buf, stack, natts, itup_scankey,
+ btitem, (BTItem) NULL);
+
+ /* be tidy */
+ _bt_freestack(stack);
+ _bt_freeskey(itup_scankey);
+
+ return (res);
+}
+
+/*
+ * _bt_insertonpg() -- Insert a tuple on a particular page in the index.
+ *
+ * This recursive procedure does the following things:
+ *
+ * + if necessary, splits the target page.
+ * + finds the right place to insert the tuple (taking into
+ * account any changes induced by a split).
+ * + inserts the tuple.
+ * + if the page was split, pops the parent stack, and finds the
+ * right place to insert the new child pointer (by walking
+ * right using information stored in the parent stack).
+ * + invokes itself with the appropriate tuple for the right
+ * child page on the parent.
+ *
+ * On entry, we must have the right buffer on which to do the
+ * insertion, and the buffer must be pinned and locked. On return,
+ * we will have dropped both the pin and the write lock on the buffer.
+ *
+ * The locking interactions in this code are critical. You should
+ * grok Lehman and Yao's paper before making any changes. In addition,
+ * you need to understand how we disambiguate duplicate keys in this
+ * implementation, in order to be able to find our location using
+ * L&Y "move right" operations. Since we may insert duplicate user
+ * keys, and since these dups may propagate up the tree, we use the
+ * 'afteritem' parameter to position ourselves correctly for the
+ * insertion on internal pages.
+ */
+static InsertIndexResult
+_bt_insertonpg(Relation rel,
+ Buffer buf,
+ BTStack stack,
+ int keysz,
+ ScanKey scankey,
+ BTItem btitem,
+ BTItem afteritem)
+{
+ InsertIndexResult res;
+ Page page;
+ Buffer rbuf;
+ Buffer pbuf;
+ Page rpage;
+ ScanKey newskey;
+ BTItem ritem;
+ BTPageOpaque rpageop;
+ BlockNumber rbknum, itup_blkno;
+ OffsetNumber itup_off;
+ int itemsz;
+ InsertIndexResult newres;
+ BTItem new_item = (BTItem) NULL;
+ BTItem lowLeftItem;
+
+ page = BufferGetPage(buf);
+ itemsz = IndexTupleDSize(btitem->bti_itup)
+ + (sizeof(BTItemData) - sizeof(IndexTupleData));
+
+ itemsz = DOUBLEALIGN(itemsz); /* be safe, PageAddItem will do this
+ but we need to be consistent */
+
+ if (PageGetFreeSpace(page) < itemsz) {
+
+ /* split the buffer into left and right halves */
+ rbuf = _bt_split(rel, buf);
+
+ /* which new page (left half or right half) gets the tuple? */
+ if (_bt_goesonpg(rel, buf, keysz, scankey, afteritem)) {
+ /* left page */
+ itup_off = _bt_pgaddtup(rel, buf, keysz, scankey,
+ itemsz, btitem, afteritem);
+ itup_blkno = BufferGetBlockNumber(buf);
+ } else {
+ /* right page */
+ itup_off = _bt_pgaddtup(rel, rbuf, keysz, scankey,
+ itemsz, btitem, afteritem);
+ itup_blkno = BufferGetBlockNumber(rbuf);
+ }
+
+ /*
+ * By here,
+ *
+ * + our target page has been split;
+ * + the original tuple has been inserted;
+ * + we have write locks on both the old (left half) and new
+ * (right half) buffers, after the split; and
+ * + we have the key we want to insert into the parent.
+ *
+ * Do the parent insertion. We need to hold onto the locks for
+ * the child pages until we locate the parent, but we can release
+ * them before doing the actual insertion (see Lehman and Yao for
+ * the reasoning).
+ */
+
+ if (stack == (BTStack) NULL) {
+
+ /* create a new root node and release the split buffers */
+ _bt_newroot(rel, buf, rbuf);
+ _bt_relbuf(rel, buf, BT_WRITE);
+ _bt_relbuf(rel, rbuf, BT_WRITE);
+
+ } else {
+
+ /* form an index tuple that points at the new right page */
+ rbknum = BufferGetBlockNumber(rbuf);
+ rpage = BufferGetPage(rbuf);
+ rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
+
+ /*
+ * By convention, the first entry (0) on every
+ * non-rightmost page is the high key for that page. In
+ * order to get the lowest key on the new right page, we
+ * actually look at its second (1) entry.
+ */
+
+ if (! P_RIGHTMOST(rpageop)) {
+ ritem = (BTItem) PageGetItem(rpage,
+ PageGetItemId(rpage, P_FIRSTKEY));
+ } else {
+ ritem = (BTItem) PageGetItem(rpage,
+ PageGetItemId(rpage, P_HIKEY));
+ }
+
+ /* get a unique btitem for this key */
+ new_item = _bt_formitem(&(ritem->bti_itup));
+
+ ItemPointerSet(&(new_item->bti_itup.t_tid), rbknum, P_HIKEY);
+
+ /* find the parent buffer */
+ pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
+
+ /*
+ * If the key of new_item is < than the key of the item
+ * in the parent page pointing to the left page
+ * (stack->bts_btitem), we have to update the latter key;
+ * otherwise the keys on the parent page wouldn't be
+ * monotonically increasing after we inserted the new
+ * pointer to the right page (new_item). This only
+ * happens if our left page is the leftmost page and a
+ * new minimum key had been inserted before, which is not
+ * reflected in the parent page but didn't matter so
+ * far. If there are duplicate keys and this new minimum
+ * key spills over to our new right page, we get an
+ * inconsistency if we don't update the left key in the
+ * parent page.
+ */
+
+ if (_bt_itemcmp(rel, keysz, stack->bts_btitem, new_item,
+ BTGreaterStrategyNumber)) {
+ lowLeftItem =
+ (BTItem) PageGetItem(page,
+ PageGetItemId(page, P_FIRSTKEY));
+ /* page must have right pointer after split */
+ _bt_updateitem(rel, keysz, pbuf, stack->bts_btitem->bti_oid,
+ lowLeftItem);
+ }
+
+ /* don't need the children anymore */
+ _bt_relbuf(rel, buf, BT_WRITE);
+ _bt_relbuf(rel, rbuf, BT_WRITE);
+
+ newskey = _bt_mkscankey(rel, &(new_item->bti_itup));
+ newres = _bt_insertonpg(rel, pbuf, stack->bts_parent,
+ keysz, newskey, new_item,
+ stack->bts_btitem);
+
+ /* be tidy */
+ pfree(newres);
+ pfree(newskey);
+ pfree(new_item);
+ }
+ } else {
+ itup_off = _bt_pgaddtup(rel, buf, keysz, scankey,
+ itemsz, btitem, afteritem);
+ itup_blkno = BufferGetBlockNumber(buf);
+
+ _bt_relbuf(rel, buf, BT_WRITE);
+ }
+
+ /* by here, the new tuple is inserted */
+ res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));
+ ItemPointerSet(&(res->pointerData), itup_blkno, itup_off);
+
+ return (res);
+}
+
+/*
+ * _bt_split() -- split a page in the btree.
+ *
+ * On entry, buf is the page to split, and is write-locked and pinned.
+ * Returns the new right sibling of buf, pinned and write-locked. The
+ * pin and lock on buf are maintained.
+ */
+static Buffer
+_bt_split(Relation rel, Buffer buf)
+{
+ Buffer rbuf;
+ Page origpage;
+ Page leftpage, rightpage;
+ BTPageOpaque ropaque, lopaque, oopaque;
+ Buffer sbuf;
+ Page spage;
+ BTPageOpaque sopaque;
+ Size itemsz;
+ ItemId itemid;
+ BTItem item;
+ OffsetNumber leftoff, rightoff;
+ OffsetNumber start;
+ OffsetNumber maxoff;
+ OffsetNumber firstright;
+ OffsetNumber i;
+ Size llimit;
+
+ rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
+ origpage = BufferGetPage(buf);
+ leftpage = PageGetTempPage(origpage, sizeof(BTPageOpaqueData));
+ rightpage = BufferGetPage(rbuf);
+
+ _bt_pageinit(rightpage, BufferGetPageSize(rbuf));
+ _bt_pageinit(leftpage, BufferGetPageSize(buf));
+
+ /* init btree private data */
+ oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
+ lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
+ ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
+
+ /* if we're splitting this page, it won't be the root when we're done */
+ oopaque->btpo_flags &= ~BTP_ROOT;
+ lopaque->btpo_flags = ropaque->btpo_flags = oopaque->btpo_flags;
+ lopaque->btpo_prev = oopaque->btpo_prev;
+ ropaque->btpo_prev = BufferGetBlockNumber(buf);
+ lopaque->btpo_next = BufferGetBlockNumber(rbuf);
+ ropaque->btpo_next = oopaque->btpo_next;
+
+ /*
+ * If the page we're splitting is not the rightmost page at its
+ * level in the tree, then the first (0) entry on the page is the
+ * high key for the page. We need to copy that to the right
+ * half. Otherwise (meaning the rightmost page case), we should
+ * treat the line pointers beginning at zero as user data.
+ *
+ * We leave a blank space at the start of the line table for the
+ * left page. We'll come back later and fill it in with the high
+ * key item we get from the right page.
+ */
+
+ leftoff = P_FIRSTKEY;
+ if (! P_RIGHTMOST(oopaque)) {
+ /* splitting a non-rightmost page, start at the first data item */
+ start = P_FIRSTKEY;
+
+ /* copy the original high key to the new page */
+ itemid = PageGetItemId(origpage, P_HIKEY);
+ itemsz = ItemIdGetLength(itemid);
+ item = (BTItem) PageGetItem(origpage, itemid);
+ (void) PageAddItem(rightpage, (Item) item, itemsz, P_HIKEY, LP_USED);
+ rightoff = P_FIRSTKEY;
+ } else {
+ /* splitting a rightmost page, "high key" is the first data item */
+ start = P_HIKEY;
+
+ /* the new rightmost page will not have a high key */
+ rightoff = P_HIKEY;
+ }
+ maxoff = PageGetMaxOffsetNumber(origpage);
+ llimit = PageGetFreeSpace(leftpage) / 2;
+ firstright = _bt_findsplitloc(rel, origpage, start, maxoff, llimit);
+
+ for (i = start; i <= maxoff; i = OffsetNumberNext(i)) {
+ itemid = PageGetItemId(origpage, i);
+ itemsz = ItemIdGetLength(itemid);
+ item = (BTItem) PageGetItem(origpage, itemid);
+
+ /* decide which page to put it on */
+ if (i < firstright) {
+ (void) PageAddItem(leftpage, (Item) item, itemsz, leftoff,
+ LP_USED);
+ leftoff = OffsetNumberNext(leftoff);
+ } else {
+ (void) PageAddItem(rightpage, (Item) item, itemsz, rightoff,
+ LP_USED);
+ rightoff = OffsetNumberNext(rightoff);
+ }
+ }
+
+ /*
+ * Okay, page has been split, high key on right page is correct. Now
+ * set the high key on the left page to be the min key on the right
+ * page.
+ */
+
+ if (P_RIGHTMOST(ropaque)) {
+ itemid = PageGetItemId(rightpage, P_HIKEY);
+ } else {
+ itemid = PageGetItemId(rightpage, P_FIRSTKEY);
+ }
+ itemsz = ItemIdGetLength(itemid);
+ item = (BTItem) PageGetItem(rightpage, itemid);
+
+ /*
+ * We left a hole for the high key on the left page; fill it. The
+ * modal crap is to tell the page manager to put the new item on the
+ * page and not screw around with anything else. Whoever designed
+ * this interface has presumably crawled back into the dung heap they
+ * came from. No one here will admit to it.
+ */
+
+ PageManagerModeSet(OverwritePageManagerMode);
+ (void) PageAddItem(leftpage, (Item) item, itemsz, P_HIKEY, LP_USED);
+ PageManagerModeSet(ShufflePageManagerMode);
+
+ /*
+ * By here, the original data page has been split into two new halves,
+ * and these are correct. The algorithm requires that the left page
+ * never move during a split, so we copy the new left page back on top
+ * of the original. Note that this is not a waste of time, since we
+ * also require (in the page management code) that the center of a
+ * page always be clean, and the most efficient way to guarantee this
+ * is just to compact the data by reinserting it into a new left page.
+ */
+
+ PageRestoreTempPage(leftpage, origpage);
+
+ /* write these guys out */
+ _bt_wrtnorelbuf(rel, rbuf);
+ _bt_wrtnorelbuf(rel, buf);
+
+ /*
+ * Finally, we need to grab the right sibling (if any) and fix the
+ * prev pointer there. We are guaranteed that this is deadlock-free
+ * since no other writer will be holding a lock on that page
+ * and trying to move left, and all readers release locks on a page
+ * before trying to fetch its neighbors.
+ */
+
+ if (! P_RIGHTMOST(ropaque)) {
+ sbuf = _bt_getbuf(rel, ropaque->btpo_next, BT_WRITE);
+ spage = BufferGetPage(sbuf);
+ sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
+ sopaque->btpo_prev = BufferGetBlockNumber(rbuf);
+
+ /* write and release the old right sibling */
+ _bt_wrtbuf(rel, sbuf);
+ }
+
+ /* split's done */
+ return (rbuf);
+}
+
+/*
+ * _bt_findsplitloc() -- find a safe place to split a page.
+ *
+ * In order to guarantee the proper handling of searches for duplicate
+ * keys, the first duplicate in the chain must either be the first
+ * item on the page after the split, or the entire chain must be on
+ * one of the two pages. That is,
+ * [1 2 2 2 3 4 5]
+ * must become
+ * [1] [2 2 2 3 4 5]
+ * or
+ * [1 2 2 2] [3 4 5]
+ * but not
+ * [1 2 2] [2 3 4 5].
+ * However,
+ * [2 2 2 2 2 3 4]
+ * may be split as
+ * [2 2 2 2] [2 3 4].
+ */
+static OffsetNumber
+_bt_findsplitloc(Relation rel,
+ Page page,
+ OffsetNumber start,
+ OffsetNumber maxoff,
+ Size llimit)
+{
+ OffsetNumber i;
+ OffsetNumber saferight;
+ ItemId nxtitemid, safeitemid;
+ BTItem safeitem, nxtitem;
+ IndexTuple safetup, nxttup;
+ Size nbytes;
+ TupleDesc itupdesc;
+ int natts;
+ int attno;
+ Datum attsafe;
+ Datum attnext;
+ bool null;
+
+ itupdesc = RelationGetTupleDescriptor(rel);
+ natts = rel->rd_rel->relnatts;
+
+ saferight = start;
+ safeitemid = PageGetItemId(page, saferight);
+ nbytes = ItemIdGetLength(safeitemid) + sizeof(ItemIdData);
+ safeitem = (BTItem) PageGetItem(page, safeitemid);
+ safetup = &(safeitem->bti_itup);
+
+ i = OffsetNumberNext(start);
+
+ while (nbytes < llimit) {
+
+ /* check the next item on the page */
+ nxtitemid = PageGetItemId(page, i);
+ nbytes += (ItemIdGetLength(nxtitemid) + sizeof(ItemIdData));
+ nxtitem = (BTItem) PageGetItem(page, nxtitemid);
+ nxttup = &(nxtitem->bti_itup);
+
+ /* test against last known safe item */
+ for (attno = 1; attno <= natts; attno++) {
+ attsafe = index_getattr(safetup, attno, itupdesc, &null);
+ attnext = index_getattr(nxttup, attno, itupdesc, &null);
+
+ /*
+ * If the tuple we're looking at isn't equal to the last safe one
+ * we saw, then it's our new safe tuple.
+ */
+
+ if (!_bt_invokestrat(rel, attno, BTEqualStrategyNumber,
+ attsafe, attnext)) {
+ safetup = nxttup;
+ saferight = i;
+
+ /* break is for the attno for loop */
+ break;
+ }
+ }
+ i = OffsetNumberNext(i);
+ }
+
+ /*
+ * If the chain of dups starts at the beginning of the page and extends
+ * past the halfway mark, we can split it in the middle.
+ */
+
+ if (saferight == start)
+ saferight = i;
+
+ return (saferight);
+}
+
+/*
+ * _bt_newroot() -- Create a new root page for the index.
+ *
+ * We've just split the old root page and need to create a new one.
+ * In order to do this, we add a new root page to the file, then lock
+ * the metadata page and update it. This is guaranteed to be deadlock-
+ * free, because all readers release their locks on the metadata page
+ * before trying to lock the root, and all writers lock the root before
+ * trying to lock the metadata page. We have a write lock on the old
+ * root page, so we have not introduced any cycles into the waits-for
+ * graph.
+ *
+ * On entry, lbuf (the old root) and rbuf (its new peer) are write-
+ * locked. We don't drop the locks in this routine; that's done by
+ * the caller. On exit, a new root page exists with entries for the
+ * two new children. The new root page is neither pinned nor locked.
+ */
+static void
+_bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
+{
+ Buffer rootbuf;
+ Page lpage, rpage, rootpage;
+ BlockNumber lbkno, rbkno;
+ BlockNumber rootbknum;
+ BTPageOpaque rootopaque;
+ ItemId itemid;
+ BTItem item;
+ Size itemsz;
+ BTItem new_item;
+
+ /* get a new root page */
+ rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
+ rootpage = BufferGetPage(rootbuf);
+ _bt_pageinit(rootpage, BufferGetPageSize(rootbuf));
+
+ /* set btree special data */
+ rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
+ rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
+ rootopaque->btpo_flags |= BTP_ROOT;
+
+ /*
+ * Insert the internal tuple pointers.
+ */
+
+ lbkno = BufferGetBlockNumber(lbuf);
+ rbkno = BufferGetBlockNumber(rbuf);
+ lpage = BufferGetPage(lbuf);
+ rpage = BufferGetPage(rbuf);
+
+ /*
+ * step over the high key on the left page while building the
+ * left page pointer.
+ */
+ itemid = PageGetItemId(lpage, P_FIRSTKEY);
+ itemsz = ItemIdGetLength(itemid);
+ item = (BTItem) PageGetItem(lpage, itemid);
+ new_item = _bt_formitem(&(item->bti_itup));
+ ItemPointerSet(&(new_item->bti_itup.t_tid), lbkno, P_FIRSTKEY);
+
+ /*
+ * insert the left page pointer into the new root page. the root
+ * page is the rightmost page on its level so the "high key" item
+ * is the first data item.
+ */
+ (void) PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY, LP_USED);
+ pfree(new_item);
+
+ /*
+ * the right page is the rightmost page on the second level, so
+ * the "high key" item is the first data item on that page as well.
+ */
+ itemid = PageGetItemId(rpage, P_HIKEY);
+ itemsz = ItemIdGetLength(itemid);
+ item = (BTItem) PageGetItem(rpage, itemid);
+ new_item = _bt_formitem(&(item->bti_itup));
+ ItemPointerSet(&(new_item->bti_itup.t_tid), rbkno, P_HIKEY);
+
+ /*
+ * insert the right page pointer into the new root page.
+ */
+ (void) PageAddItem(rootpage, (Item) new_item, itemsz, P_FIRSTKEY, LP_USED);
+ pfree(new_item);
+
+ /* write and let go of the root buffer */
+ rootbknum = BufferGetBlockNumber(rootbuf);
+ _bt_wrtbuf(rel, rootbuf);
+
+ /* update metadata page with new root block number */
+ _bt_metaproot(rel, rootbknum);
+}
+
+/*
+ * _bt_pgaddtup() -- add a tuple to a particular page in the index.
+ *
+ * This routine adds the tuple to the page as requested, and keeps the
+ * write lock and reference associated with the page's buffer. It is
+ * an error to call pgaddtup() without a write lock and reference. If
+ * afteritem is non-null, it's the item that we expect our new item
+ * to follow. Otherwise, we do a binary search for the correct place
+ * and insert the new item there.
+ */
+static OffsetNumber
+_bt_pgaddtup(Relation rel,
+ Buffer buf,
+ int keysz,
+ ScanKey itup_scankey,
+ Size itemsize,
+ BTItem btitem,
+ BTItem afteritem)
+{
+ OffsetNumber itup_off;
+ OffsetNumber first;
+ Page page;
+ BTPageOpaque opaque;
+ BTItem chkitem;
+ Oid afteroid;
+
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ first = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+ if (afteritem == (BTItem) NULL) {
+ itup_off = _bt_binsrch(rel, buf, keysz, itup_scankey, BT_INSERTION);
+ } else {
+ afteroid = afteritem->bti_oid;
+ itup_off = first;
+
+ do {
+ chkitem =
+ (BTItem) PageGetItem(page, PageGetItemId(page, itup_off));
+ itup_off = OffsetNumberNext(itup_off);
+ } while (chkitem->bti_oid != afteroid);
+ }
+
+ (void) PageAddItem(page, (Item) btitem, itemsize, itup_off, LP_USED);
+
+ /* write the buffer, but hold our lock */
+ _bt_wrtnorelbuf(rel, buf);
+
+ return (itup_off);
+}
+
+/*
+ * _bt_goesonpg() -- Does a new tuple belong on this page?
+ *
+ * This is part of the complexity introduced by allowing duplicate
+ * keys into the index. The tuple belongs on this page if:
+ *
+ * + there is no page to the right of this one; or
+ * + it is less than the high key on the page; or
+ * + the item it is to follow ("afteritem") appears on this
+ * page.
+ */
+static bool
+_bt_goesonpg(Relation rel,
+ Buffer buf,
+ Size keysz,
+ ScanKey scankey,
+ BTItem afteritem)
+{
+ Page page;
+ ItemId hikey;
+ BTPageOpaque opaque;
+ BTItem chkitem;
+ OffsetNumber offnum, maxoff;
+ Oid afteroid;
+ bool found;
+
+ page = BufferGetPage(buf);
+
+ /* no right neighbor? */
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (P_RIGHTMOST(opaque))
+ return (true);
+
+ /*
+ * this is a non-rightmost page, so it must have a high key item.
+ *
+ * If the scan key is < the high key (the min key on the next page),
+ * then it for sure belongs here.
+ */
+ hikey = PageGetItemId(page, P_HIKEY);
+ if (_bt_skeycmp(rel, keysz, scankey, page, hikey, BTLessStrategyNumber))
+ return (true);
+
+ /*
+ * If the scan key is > the high key, then it for sure doesn't belong
+ * here.
+ */
+
+ if (_bt_skeycmp(rel, keysz, scankey, page, hikey, BTGreaterStrategyNumber))
+ return (false);
+
+ /*
+ * If we have no adjacency information, and the item is equal to the
+ * high key on the page (by here it is), then the item does not belong
+ * on this page.
+ */
+
+ if (afteritem == (BTItem) NULL)
+ return (false);
+
+ /* damn, have to work for it. i hate that. */
+ afteroid = afteritem->bti_oid;
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ /*
+ * Search the entire page for the afteroid. We need to do this, rather
+ * than doing a binary search and starting from there, because if the
+ * key we're searching for is the leftmost key in the tree at this
+ * level, then a binary search will do the wrong thing. Splits are
+ * pretty infrequent, so the cost isn't as bad as it could be.
+ */
+
+ found = false;
+ for (offnum = P_FIRSTKEY;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum)) {
+ chkitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
+ if (chkitem->bti_oid == afteroid) {
+ found = true;
+ break;
+ }
+ }
+
+ return (found);
+}
+
+/*
+ * _bt_itemcmp() -- compare item1 to item2 using a requested
+ * strategy (<, <=, =, >=, >)
+ *
+ */
+bool
+_bt_itemcmp(Relation rel,
+ Size keysz,
+ BTItem item1,
+ BTItem item2,
+ StrategyNumber strat)
+{
+ TupleDesc tupDes;
+ IndexTuple indexTuple1, indexTuple2;
+ Datum attrDatum1, attrDatum2;
+ int i;
+ bool isNull;
+ bool compare;
+
+ tupDes = RelationGetTupleDescriptor(rel);
+ indexTuple1 = &(item1->bti_itup);
+ indexTuple2 = &(item2->bti_itup);
+
+ for (i = 1; i <= keysz; i++) {
+ attrDatum1 = index_getattr(indexTuple1, i, tupDes, &isNull);
+ attrDatum2 = index_getattr(indexTuple2, i, tupDes, &isNull);
+ compare = _bt_invokestrat(rel, i, strat, attrDatum1, attrDatum2);
+ if (!compare) {
+ return (false);
+ }
+ }
+ return (true);
+}
+
+/*
+ * _bt_updateitem() -- updates the key of the item identified by the
+ * oid with the key of newItem (done in place)
+ *
+ */
+static void
+_bt_updateitem(Relation rel,
+ Size keysz,
+ Buffer buf,
+ Oid bti_oid,
+ BTItem newItem)
+{
+ Page page;
+ OffsetNumber maxoff;
+ OffsetNumber i;
+ ItemPointerData itemPtrData;
+ BTItem item;
+ IndexTuple oldIndexTuple, newIndexTuple;
+
+ page = BufferGetPage(buf);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ /* locate item on the page */
+ i = P_HIKEY;
+ do {
+ item = (BTItem) PageGetItem(page, PageGetItemId(page, i));
+ i = OffsetNumberNext(i);
+ } while (i <= maxoff && item->bti_oid != bti_oid);
+
+ /* this should never happen (in theory) */
+ if (item->bti_oid != bti_oid) {
+ elog(FATAL, "_bt_getstackbuf was lying!!");
+ }
+
+ oldIndexTuple = &(item->bti_itup);
+ newIndexTuple = &(newItem->bti_itup);
+
+ /* keep the original item pointer */
+ ItemPointerCopy(&(oldIndexTuple->t_tid), &itemPtrData);
+ CopyIndexTuple(newIndexTuple, &oldIndexTuple);
+ ItemPointerCopy(&itemPtrData, &(oldIndexTuple->t_tid));
+}
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
new file mode 100644
index 00000000000..ce411a80d11
--- /dev/null
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -0,0 +1,523 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtpage.c--
+ * BTree-specific page management code for the Postgres btree access
+ * method.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
+ *
+ * NOTES
+ * Postgres btree pages look like ordinary relation pages. The opaque
+ * data at high addresses includes pointers to left and right siblings
+ * and flag data describing page state. The first page in a btree, page
+ * zero, is special -- it stores meta-information describing the tree.
+ * Pages one and higher store the actual tree data.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/genam.h"
+#include "access/nbtree.h"
+
+#define BTREE_METAPAGE 0
+#define BTREE_MAGIC 0x053162
+#define BTREE_VERSION 0
+
+typedef struct BTMetaPageData {
+ uint32 btm_magic;
+ uint32 btm_version;
+ BlockNumber btm_root;
+} BTMetaPageData;
+
+#define BTPageGetMeta(p) \
+ ((BTMetaPageData *) &((PageHeader) p)->pd_linp[0])
+
+extern bool BuildingBtree;
+
+/*
+ * We use high-concurrency locking on btrees. There are two cases in
+ * which we don't do locking. One is when we're building the btree.
+ * Since the creating transaction has not committed, no one can see
+ * the index, and there's no reason to share locks. The second case
+ * is when we're just starting up the database system. We use some
+ * special-purpose initialization code in the relation cache manager
+ * (see utils/cache/relcache.c) to allow us to do indexed scans on
+ * the system catalogs before we'd normally be able to. This happens
+ * before the lock table is fully initialized, so we can't use it.
+ * Strictly speaking, this violates 2pl, but we don't do 2pl on the
+ * system catalogs anyway, so I declare this to be okay.
+ */
+
+#define USELOCKING (!BuildingBtree && !IsInitProcessingMode())
+
+/*
+ * _bt_metapinit() -- Initialize the metadata page of a btree.
+ */
+void
+_bt_metapinit(Relation rel)
+{
+ Buffer buf;
+ Page pg;
+ int nblocks;
+ BTMetaPageData metad;
+ BTPageOpaque op;
+
+ /* can't be sharing this with anyone, now... */
+ if (USELOCKING)
+ RelationSetLockForWrite(rel);
+
+ if ((nblocks = RelationGetNumberOfBlocks(rel)) != 0) {
+ elog(WARN, "Cannot initialize non-empty btree %s",
+ RelationGetRelationName(rel));
+ }
+
+ buf = ReadBuffer(rel, P_NEW);
+ pg = BufferGetPage(buf);
+ _bt_pageinit(pg, BufferGetPageSize(buf));
+
+ metad.btm_magic = BTREE_MAGIC;
+ metad.btm_version = BTREE_VERSION;
+ metad.btm_root = P_NONE;
+ memmove((char *) BTPageGetMeta(pg), (char *) &metad, sizeof(metad));
+
+ op = (BTPageOpaque) PageGetSpecialPointer(pg);
+ op->btpo_flags = BTP_META;
+
+ WriteBuffer(buf);
+
+ /* all done */
+ if (USELOCKING)
+ RelationUnsetLockForWrite(rel);
+}
+
+/*
+ * _bt_checkmeta() -- Verify that the metadata stored in a btree are
+ * reasonable.
+ */
+void
+_bt_checkmeta(Relation rel)
+{
+ Buffer metabuf;
+ Page metap;
+ BTMetaPageData *metad;
+ BTPageOpaque op;
+ int nblocks;
+
+ /* if the relation is empty, this is init time; don't complain */
+ if ((nblocks = RelationGetNumberOfBlocks(rel)) == 0)
+ return;
+
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
+ metap = BufferGetPage(metabuf);
+ op = (BTPageOpaque) PageGetSpecialPointer(metap);
+ if (!(op->btpo_flags & BTP_META)) {
+ elog(WARN, "Invalid metapage for index %s",
+ RelationGetRelationName(rel));
+ }
+ metad = BTPageGetMeta(metap);
+
+ if (metad->btm_magic != BTREE_MAGIC) {
+ elog(WARN, "Index %s is not a btree",
+ RelationGetRelationName(rel));
+ }
+
+ if (metad->btm_version != BTREE_VERSION) {
+ elog(WARN, "Version mismatch on %s: version %d file, version %d code",
+ RelationGetRelationName(rel),
+ metad->btm_version, BTREE_VERSION);
+ }
+
+ _bt_relbuf(rel, metabuf, BT_READ);
+}
+
+/*
+ * _bt_getroot() -- Get the root page of the btree.
+ *
+ * Since the root page can move around the btree file, we have to read
+ * its location from the metadata page, and then read the root page
+ * itself. If no root page exists yet, we have to create one. The
+ * standard class of race conditions exists here; I think I covered
+ * them all in the Hopi Indian rain dance of lock requests below.
+ *
+ * We pass in the access type (BT_READ or BT_WRITE), and return the
+ * root page's buffer with the appropriate lock type set. Reference
+ * count on the root page gets bumped by ReadBuffer. The metadata
+ * page is unlocked and unreferenced by this process when this routine
+ * returns.
+ */
+Buffer
+_bt_getroot(Relation rel, int access)
+{
+ Buffer metabuf;
+ Page metapg;
+ BTPageOpaque metaopaque;
+ Buffer rootbuf;
+ Page rootpg;
+ BTPageOpaque rootopaque;
+ BlockNumber rootblkno;
+ BTMetaPageData *metad;
+
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
+ metapg = BufferGetPage(metabuf);
+ metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
+ Assert(metaopaque->btpo_flags & BTP_META);
+ metad = BTPageGetMeta(metapg);
+
+ /* if no root page initialized yet, do it */
+ if (metad->btm_root == P_NONE) {
+
+ /* turn our read lock in for a write lock */
+ _bt_relbuf(rel, metabuf, BT_READ);
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
+ metapg = BufferGetPage(metabuf);
+ metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
+ Assert(metaopaque->btpo_flags & BTP_META);
+ metad = BTPageGetMeta(metapg);
+
+ /*
+ * Race condition: if someone else initialized the metadata between
+ * the time we released the read lock and acquired the write lock,
+ * above, we want to avoid doing it again.
+ */
+
+ if (metad->btm_root == P_NONE) {
+
+ /*
+ * Get, initialize, write, and leave a lock of the appropriate
+ * type on the new root page. Since this is the first page in
+ * the tree, it's a leaf.
+ */
+
+ rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
+ rootblkno = BufferGetBlockNumber(rootbuf);
+ rootpg = BufferGetPage(rootbuf);
+ metad->btm_root = rootblkno;
+ _bt_pageinit(rootpg, BufferGetPageSize(rootbuf));
+ rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpg);
+ rootopaque->btpo_flags |= (BTP_LEAF | BTP_ROOT);
+ _bt_wrtnorelbuf(rel, rootbuf);
+
+ /* swap write lock for read lock, if appropriate */
+ if (access != BT_WRITE) {
+ _bt_setpagelock(rel, rootblkno, BT_READ);
+ _bt_unsetpagelock(rel, rootblkno, BT_WRITE);
+ }
+
+ /* okay, metadata is correct */
+ _bt_wrtbuf(rel, metabuf);
+ } else {
+
+ /*
+ * Metadata initialized by someone else. In order to guarantee
+ * no deadlocks, we have to release the metadata page and start
+ * all over again.
+ */
+
+ _bt_relbuf(rel, metabuf, BT_WRITE);
+ return (_bt_getroot(rel, access));
+ }
+ } else {
+ rootbuf = _bt_getbuf(rel, metad->btm_root, access);
+
+ /* done with the meta page */
+ _bt_relbuf(rel, metabuf, BT_READ);
+ }
+
+ /*
+ * Race condition: If the root page split between the time we looked
+ * at the metadata page and got the root buffer, then we got the wrong
+ * buffer.
+ */
+
+ rootpg = BufferGetPage(rootbuf);
+ rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpg);
+ if (!(rootopaque->btpo_flags & BTP_ROOT)) {
+
+ /* it happened, try again */
+ _bt_relbuf(rel, rootbuf, access);
+ return (_bt_getroot(rel, access));
+ }
+
+ /*
+ * By here, we have a correct lock on the root block, its reference
+ * count is correct, and we have no lock set on the metadata page.
+ * Return the root block.
+ */
+
+ return (rootbuf);
+}
+
+/*
+ * _bt_getbuf() -- Get a buffer by block number for read or write.
+ *
+ * When this routine returns, the appropriate lock is set on the
+ * requested buffer and its reference count is correct.
+ */
+Buffer
+_bt_getbuf(Relation rel, BlockNumber blkno, int access)
+{
+ Buffer buf;
+ Page page;
+
+ /*
+ * If we want a new block, we can't set a lock of the appropriate type
+ * until we've instantiated the buffer.
+ */
+
+ if (blkno != P_NEW) {
+ if (access == BT_WRITE)
+ _bt_setpagelock(rel, blkno, BT_WRITE);
+ else
+ _bt_setpagelock(rel, blkno, BT_READ);
+
+ buf = ReadBuffer(rel, blkno);
+ } else {
+ buf = ReadBuffer(rel, blkno);
+ blkno = BufferGetBlockNumber(buf);
+ page = BufferGetPage(buf);
+ _bt_pageinit(page, BufferGetPageSize(buf));
+
+ if (access == BT_WRITE)
+ _bt_setpagelock(rel, blkno, BT_WRITE);
+ else
+ _bt_setpagelock(rel, blkno, BT_READ);
+ }
+
+ /* ref count and lock type are correct */
+ return (buf);
+}
+
+/*
+ * _bt_relbuf() -- release a locked buffer.
+ */
+void
+_bt_relbuf(Relation rel, Buffer buf, int access)
+{
+ BlockNumber blkno;
+
+ blkno = BufferGetBlockNumber(buf);
+
+ /* access had better be one of read or write */
+ if (access == BT_WRITE)
+ _bt_unsetpagelock(rel, blkno, BT_WRITE);
+ else
+ _bt_unsetpagelock(rel, blkno, BT_READ);
+
+ ReleaseBuffer(buf);
+}
+
+/*
+ * _bt_wrtbuf() -- write a btree page to disk.
+ *
+ * This routine releases the lock held on the buffer and our reference
+ * to it. It is an error to call _bt_wrtbuf() without a write lock
+ * or a reference to the buffer.
+ */
+void
+_bt_wrtbuf(Relation rel, Buffer buf)
+{
+ BlockNumber blkno;
+
+ blkno = BufferGetBlockNumber(buf);
+ WriteBuffer(buf);
+ _bt_unsetpagelock(rel, blkno, BT_WRITE);
+}
+
+/*
+ * _bt_wrtnorelbuf() -- write a btree page to disk, but do not release
+ * our reference or lock.
+ *
+ * It is an error to call _bt_wrtnorelbuf() without a write lock
+ * or a reference to the buffer.
+ */
+void
+_bt_wrtnorelbuf(Relation rel, Buffer buf)
+{
+ BlockNumber blkno;
+
+ blkno = BufferGetBlockNumber(buf);
+ WriteNoReleaseBuffer(buf);
+}
+
+/*
+ * _bt_pageinit() -- Initialize a new page.
+ */
+void
+_bt_pageinit(Page page, Size size)
+{
+ /*
+ * Cargo-cult programming -- don't really need this to be zero, but
+ * creating new pages is an infrequent occurrence and it makes me feel
+ * good when I know they're empty.
+ */
+
+ memset(page, 0, size);
+
+ PageInit(page, size, sizeof(BTPageOpaqueData));
+}
+
+/*
+ * _bt_metaproot() -- Change the root page of the btree.
+ *
+ * Lehman and Yao require that the root page move around in order to
+ * guarantee deadlock-free short-term, fine-granularity locking. When
+ * we split the root page, we record the new parent in the metadata page
+ * for the relation. This routine does the work.
+ *
+ * No direct preconditions, but if you don't have a write lock on
+ * at least the old root page when you call this, you're making a big
+ * mistake. On exit, metapage data is correct and we no longer have
+ * a reference to or lock on the metapage.
+ */
+void
+_bt_metaproot(Relation rel, BlockNumber rootbknum)
+{
+ Buffer metabuf;
+ Page metap;
+ BTPageOpaque metaopaque;
+ BTMetaPageData *metad;
+
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
+ metap = BufferGetPage(metabuf);
+ metaopaque = (BTPageOpaque) PageGetSpecialPointer(metap);
+ Assert(metaopaque->btpo_flags & BTP_META);
+ metad = BTPageGetMeta(metap);
+ metad->btm_root = rootbknum;
+ _bt_wrtbuf(rel, metabuf);
+}
+
+/*
+ * _bt_getstackbuf() -- Walk back up the tree one step, and find the item
+ * we last looked at in the parent.
+ *
+ * This is possible because we save a bit image of the last item
+ * we looked at in the parent, and the update algorithm guarantees
+ * that if items above us in the tree move, they only move right.
+ */
+Buffer
+_bt_getstackbuf(Relation rel, BTStack stack, int access)
+{
+ Buffer buf;
+ BlockNumber blkno;
+ OffsetNumber start, offnum, maxoff;
+ OffsetNumber i;
+ Page page;
+ ItemId itemid;
+ BTItem item;
+ BTPageOpaque opaque;
+
+ blkno = stack->bts_blkno;
+ buf = _bt_getbuf(rel, blkno, access);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ if (maxoff >= stack->bts_offset) {
+ itemid = PageGetItemId(page, stack->bts_offset);
+ item = (BTItem) PageGetItem(page, itemid);
+
+ /* if the item is where we left it, we're done */
+ if (item->bti_oid == stack->bts_btitem->bti_oid)
+ return (buf);
+
+ /* if the item has just moved right on this page, we're done */
+ for (i = OffsetNumberNext(stack->bts_offset);
+ i <= maxoff;
+ i = OffsetNumberNext(i)) {
+ itemid = PageGetItemId(page, i);
+ item = (BTItem) PageGetItem(page, itemid);
+
+ /* if the item is where we left it, we're done */
+ if (item->bti_oid == stack->bts_btitem->bti_oid)
+ return (buf);
+ }
+ }
+
+ /* by here, the item we're looking for moved right at least one page */
+ for (;;) {
+ blkno = opaque->btpo_next;
+ if (P_RIGHTMOST(opaque))
+ elog(FATAL, "my bits moved right off the end of the world!");
+
+ _bt_relbuf(rel, buf, access);
+ buf = _bt_getbuf(rel, blkno, access);
+ page = BufferGetPage(buf);
+ maxoff = PageGetMaxOffsetNumber(page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /* if we have a right sibling, step over the high key */
+ start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+ /* see if it's on this page */
+ for (offnum = start;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum)) {
+ itemid = PageGetItemId(page, offnum);
+ item = (BTItem) PageGetItem(page, itemid);
+ if (item->bti_oid == stack->bts_btitem->bti_oid)
+ return (buf);
+ }
+ }
+}
+
+void
+_bt_setpagelock(Relation rel, BlockNumber blkno, int access)
+{
+ ItemPointerData iptr;
+
+ if (USELOCKING) {
+ ItemPointerSet(&iptr, blkno, P_HIKEY);
+
+ if (access == BT_WRITE)
+ RelationSetSingleWLockPage(rel, &iptr);
+ else
+ RelationSetSingleRLockPage(rel, &iptr);
+ }
+}
+
+void
+_bt_unsetpagelock(Relation rel, BlockNumber blkno, int access)
+{
+ ItemPointerData iptr;
+
+ if (USELOCKING) {
+ ItemPointerSet(&iptr, blkno, P_HIKEY);
+
+ if (access == BT_WRITE)
+ RelationUnsetSingleWLockPage(rel, &iptr);
+ else
+ RelationUnsetSingleRLockPage(rel, &iptr);
+ }
+}
+
+void
+_bt_pagedel(Relation rel, ItemPointer tid)
+{
+ Buffer buf;
+ Page page;
+ BlockNumber blkno;
+ OffsetNumber offno;
+
+ blkno = ItemPointerGetBlockNumber(tid);
+ offno = ItemPointerGetOffsetNumber(tid);
+
+ buf = _bt_getbuf(rel, blkno, BT_WRITE);
+ page = BufferGetPage(buf);
+
+ PageIndexTupleDelete(page, offno);
+
+ /* write the buffer and release the lock */
+ _bt_wrtbuf(rel, buf);
+}
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
new file mode 100644
index 00000000000..06016119964
--- /dev/null
+++ b/src/backend/access/nbtree/nbtree.c
@@ -0,0 +1,516 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtree.c--
+ * Implementation of Lehman and Yao's btree management algorithm for
+ * Postgres.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
+ *
+ * NOTES
+ * This file contains only the public interface routines.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/sdir.h"
+#include "access/nbtree.h"
+#include "access/funcindex.h"
+
+#include "nodes/execnodes.h"
+#include "nodes/plannodes.h"
+
+#include "executor/executor.h"
+#include "executor/tuptable.h"
+
+#include "catalog/index.h"
+
+bool BuildingBtree = false;
+bool FastBuild = false; /* turn this on to make bulk builds work */
+
+/*
+ * btbuild() -- build a new btree index.
+ *
+ * We use a global variable to record the fact that we're creating
+ * a new index. This is used to avoid high-concurrency locking,
+ * since the index won't be visible until this transaction commits
+ * and since building is guaranteed to be single-threaded.
+ */
+void
+btbuild(Relation heap,
+ Relation index,
+ int natts,
+ AttrNumber *attnum,
+ IndexStrategy istrat,
+ uint16 pcount,
+ Datum *params,
+ FuncIndexInfo *finfo,
+ PredInfo *predInfo)
+{
+ HeapScanDesc hscan;
+ Buffer buffer;
+ HeapTuple htup;
+ IndexTuple itup;
+ TupleDesc htupdesc, itupdesc;
+ Datum *attdata;
+ bool *nulls;
+ InsertIndexResult res;
+ int nhtups, nitups;
+ int i;
+ BTItem btitem;
+ ExprContext *econtext;
+ TupleTable tupleTable;
+ TupleTableSlot *slot;
+ Oid hrelid, irelid;
+ Node *pred, *oldPred;
+ void *spool;
+
+ /* note that this is a new btree */
+ BuildingBtree = true;
+
+ pred = predInfo->pred;
+ oldPred = predInfo->oldPred;
+
+ /* initialize the btree index metadata page (if this is a new index) */
+ if (oldPred == NULL)
+ _bt_metapinit(index);
+
+ /* get tuple descriptors for heap and index relations */
+ htupdesc = RelationGetTupleDescriptor(heap);
+ itupdesc = RelationGetTupleDescriptor(index);
+
+ /* get space for data items that'll appear in the index tuple */
+ attdata = (Datum *) palloc(natts * sizeof(Datum));
+ nulls = (bool *) palloc(natts * sizeof(bool));
+
+ /*
+ * If this is a predicate (partial) index, we will need to evaluate the
+ * predicate using ExecQual, which requires the current tuple to be in a
+ * slot of a TupleTable. In addition, ExecQual must have an ExprContext
+ * referring to that slot. Here, we initialize dummy TupleTable and
+ * ExprContext objects for this purpose. --Nels, Feb '92
+ */
+#ifndef OMIT_PARTIAL_INDEX
+ if (pred != NULL || oldPred != NULL) {
+ tupleTable = ExecCreateTupleTable(1);
+ slot = ExecAllocTableSlot(tupleTable);
+ econtext = makeNode(ExprContext);
+ FillDummyExprContext(econtext, slot, htupdesc, InvalidBuffer);
+ }
+#endif /* OMIT_PARTIAL_INDEX */
+
+ /* start a heap scan */
+ hscan = heap_beginscan(heap, 0, NowTimeQual, 0, (ScanKey) NULL);
+ htup = heap_getnext(hscan, 0, &buffer);
+
+ /* build the index */
+ nhtups = nitups = 0;
+
+ if (FastBuild) {
+ spool = _bt_spoolinit(index, 7);
+ res = (InsertIndexResult) NULL;
+ }
+
+ for (; HeapTupleIsValid(htup); htup = heap_getnext(hscan, 0, &buffer)) {
+
+ nhtups++;
+
+ /*
+ * If oldPred != NULL, this is an EXTEND INDEX command, so skip
+ * this tuple if it was already in the existing partial index
+ */
+ if (oldPred != NULL) {
+#ifndef OMIT_PARTIAL_INDEX
+
+ /*SetSlotContents(slot, htup);*/
+ slot->val = htup;
+ if (ExecQual((List*)oldPred, econtext) == true) {
+ nitups++;
+ continue;
+ }
+#endif /* OMIT_PARTIAL_INDEX */
+ }
+
+ /* Skip this tuple if it doesn't satisfy the partial-index predicate */
+ if (pred != NULL) {
+#ifndef OMIT_PARTIAL_INDEX
+ /* SetSlotContents(slot, htup); */
+ slot->val = htup;
+ if (ExecQual((List*)pred, econtext) == false)
+ continue;
+#endif /* OMIT_PARTIAL_INDEX */
+ }
+
+ nitups++;
+
+ /*
+ * For the current heap tuple, extract all the attributes
+ * we use in this index, and note which are null.
+ */
+
+ for (i = 1; i <= natts; i++) {
+ int attoff;
+ bool attnull;
+
+ /*
+ * Offsets are from the start of the tuple, and are
+ * zero-based; indices are one-based. The next call
+ * returns i - 1. That's data hiding for you.
+ */
+
+ attoff = AttrNumberGetAttrOffset(i);
+ attdata[attoff] = GetIndexValue(htup,
+ htupdesc,
+ attoff,
+ attnum,
+ finfo,
+ &attnull,
+ buffer);
+ nulls[attoff] = (attnull ? 'n' : ' ');
+ }
+
+ /* form an index tuple and point it at the heap tuple */
+ itup = index_formtuple(itupdesc, attdata, nulls);
+
+ /*
+ * If the single index key is null, we don't insert it into
+ * the index. Btrees support scans on <, <=, =, >=, and >.
+ * Relational algebra says that A op B (where op is one of the
+ * operators above) returns null if either A or B is null. This
+ * means that no qualification used in an index scan could ever
+ * return true on a null attribute. It also means that indices
+ * can't be used by ISNULL or NOTNULL scans, but that's an
+ * artifact of the strategy map architecture chosen in 1986, not
+ * of the way nulls are handled here.
+ */
+
+ if (itup->t_info & INDEX_NULL_MASK) {
+ pfree(itup);
+ continue;
+ }
+
+ itup->t_tid = htup->t_ctid;
+ btitem = _bt_formitem(itup);
+
+ /*
+ * if we are doing bottom-up btree build, we insert the index
+ * into a spool page for subsequent processing. otherwise, we
+ * insert into the btree.
+ */
+ if (FastBuild) {
+ _bt_spool(index, btitem, spool);
+ } else {
+ res = _bt_doinsert(index, btitem);
+ }
+
+ pfree(btitem);
+ pfree(itup);
+ if (res) {
+ pfree(res);
+ }
+ }
+
+ /* okay, all heap tuples are indexed */
+ heap_endscan(hscan);
+
+ if (pred != NULL || oldPred != NULL) {
+#ifndef OMIT_PARTIAL_INDEX
+ ExecDestroyTupleTable(tupleTable, true);
+ pfree(econtext);
+#endif /* OMIT_PARTIAL_INDEX */
+ }
+
+ /*
+ * if we are doing bottom-up btree build, we now have a bunch of
+ * sorted runs in the spool pages. finish the build by (1)
+ * merging the runs, (2) inserting the sorted tuples into btree
+ * pages and (3) building the upper levels.
+ */
+ if (FastBuild) {
+ _bt_spool(index, (BTItem) NULL, spool); /* flush spool */
+ _bt_leafbuild(index, spool);
+ _bt_spooldestroy(spool);
+ }
+
+ /*
+ * Since we just counted the tuples in the heap, we update its
+ * stats in pg_class to guarantee that the planner takes advantage
+ * of the index we just created. Note that we only update statistics
+ * during normal index definitions, not for indices on system catalogs
+ * created during bootstrap processing. We must close the relations
+ * before updating statistics to guarantee that the relcache entries
+ * are flushed when we increment the command counter in UpdateStats().
+ */
+ if (IsNormalProcessingMode())
+ {
+ hrelid = heap->rd_id;
+ irelid = index->rd_id;
+ heap_close(heap);
+ index_close(index);
+ UpdateStats(hrelid, nhtups, true);
+ UpdateStats(irelid, nitups, false);
+ if (oldPred != NULL) {
+ if (nitups == nhtups) pred = NULL;
+ UpdateIndexPredicate(irelid, oldPred, pred);
+ }
+ }
+
+ /* be tidy */
+ pfree(nulls);
+ pfree(attdata);
+
+ /* all done */
+ BuildingBtree = false;
+}
+
+/*
+ * btinsert() -- insert an index tuple into a btree.
+ *
+ * Descend the tree recursively, find the appropriate location for our
+ * new tuple, put it there, set its unique OID as appropriate, and
+ * return an InsertIndexResult to the caller.
+ */
+InsertIndexResult
+btinsert(Relation rel, IndexTuple itup)
+{
+ BTItem btitem;
+ InsertIndexResult res;
+
+ if (itup->t_info & INDEX_NULL_MASK)
+ return ((InsertIndexResult) NULL);
+
+ btitem = _bt_formitem(itup);
+
+ res = _bt_doinsert(rel, btitem);
+ pfree(btitem);
+
+ return (res);
+}
+
+/*
+ * btgettuple() -- Get the next tuple in the scan.
+ */
+char *
+btgettuple(IndexScanDesc scan, ScanDirection dir)
+{
+ RetrieveIndexResult res;
+
+ /*
+ * If we've already initialized this scan, we can just advance it
+ * in the appropriate direction. If we haven't done so yet, we
+ * call a routine to get the first item in the scan.
+ */
+
+ if (ItemPointerIsValid(&(scan->currentItemData)))
+ res = _bt_next(scan, dir);
+ else
+ res = _bt_first(scan, dir);
+
+ return ((char *) res);
+}
+
+/*
+ * btbeginscan() -- start a scan on a btree index
+ */
+char *
+btbeginscan(Relation rel, bool fromEnd, uint16 keysz, ScanKey scankey)
+{
+ IndexScanDesc scan;
+ StrategyNumber strat;
+ BTScanOpaque so;
+
+ /* first order the keys in the qualification */
+ if (keysz > 1)
+ _bt_orderkeys(rel, &keysz, scankey);
+
+ /* now get the scan */
+ scan = RelationGetIndexScan(rel, fromEnd, keysz, scankey);
+ so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
+ so->btso_curbuf = so->btso_mrkbuf = InvalidBuffer;
+ scan->opaque = so;
+
+ /* finally, be sure that the scan exploits the tree order */
+ scan->scanFromEnd = false;
+ scan->flags = 0x0;
+ if (keysz > 0) {
+ strat = _bt_getstrat(scan->relation, 1 /* XXX */,
+ scankey[0].sk_procedure);
+
+ if (strat == BTLessStrategyNumber
+ || strat == BTLessEqualStrategyNumber)
+ scan->scanFromEnd = true;
+ } else {
+ scan->scanFromEnd = true;
+ }
+
+ /* register scan in case we change pages it's using */
+ _bt_regscan(scan);
+
+ return ((char *) scan);
+}
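+
+/*
+ * a sketch of how these scan entry points fit together (illustrative
+ * only -- real callers go through the access method interface in
+ * access/index/indexam.c rather than calling these directly):
+ *
+ *	scan = (IndexScanDesc) btbeginscan(rel, false, 1, key);
+ *	while ((res = (RetrieveIndexResult) btgettuple(scan, dir)) != NULL) {
+ *		... use res, then pfree(res) ...
+ *	}
+ *	btendscan(scan);
+ */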
+
+/*
+ * btrescan() -- rescan an index relation
+ */
+void
+btrescan(IndexScanDesc scan, bool fromEnd, ScanKey scankey)
+{
+ ItemPointer iptr;
+ BTScanOpaque so;
+
+ so = (BTScanOpaque) scan->opaque;
+
+ /* we hold a read lock on the current page in the scan */
+ if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
+ _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ);
+ so->btso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+
+ /* and we hold a read lock on the last marked item in the scan */
+ if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) {
+ _bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ);
+ so->btso_mrkbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+
+ /* reset the scan key */
+ if (scan->numberOfKeys > 0) {
+ memmove(scan->keyData,
+ scankey,
+ scan->numberOfKeys * sizeof(ScanKeyData));
+ }
+}
+
+void
+btmovescan(IndexScanDesc scan, Datum v)
+{
+ ItemPointer iptr;
+ BTScanOpaque so;
+
+ so = (BTScanOpaque) scan->opaque;
+
+ /* release any locks we still hold */
+ if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
+ _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ);
+ so->btso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+
+ scan->keyData[0].sk_argument = v;
+}
+
+/*
+ * btendscan() -- close down a scan
+ */
+void
+btendscan(IndexScanDesc scan)
+{
+ ItemPointer iptr;
+ BTScanOpaque so;
+
+ so = (BTScanOpaque) scan->opaque;
+
+ /* release any locks we still hold */
+ if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
+ if (BufferIsValid(so->btso_curbuf))
+ _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ);
+ so->btso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+
+ if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) {
+ if (BufferIsValid(so->btso_mrkbuf))
+ _bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ);
+ so->btso_mrkbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+
+ /* don't need scan registered anymore */
+ _bt_dropscan(scan);
+
+ /* be tidy */
+#ifdef PERFECT_MMGR
+ pfree (scan->opaque);
+#endif /* PERFECT_MMGR */
+}
+
+/*
+ * btmarkpos() -- save current scan position
+ */
+void
+btmarkpos(IndexScanDesc scan)
+{
+ ItemPointer iptr;
+ BTScanOpaque so;
+
+ so = (BTScanOpaque) scan->opaque;
+
+ /* release lock on old marked data, if any */
+ if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) {
+ _bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ);
+ so->btso_mrkbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+
+ /* bump lock on currentItemData and copy to currentMarkData */
+ if (ItemPointerIsValid(&(scan->currentItemData))) {
+ so->btso_mrkbuf = _bt_getbuf(scan->relation,
+ BufferGetBlockNumber(so->btso_curbuf),
+ BT_READ);
+ scan->currentMarkData = scan->currentItemData;
+ }
+}
+
+/*
+ * btrestrpos() -- restore scan to last saved position
+ */
+void
+btrestrpos(IndexScanDesc scan)
+{
+ ItemPointer iptr;
+ BTScanOpaque so;
+
+ so = (BTScanOpaque) scan->opaque;
+
+ /* release lock on current data, if any */
+ if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
+ _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ);
+ so->btso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+
+ /* bump lock on currentMarkData and copy to currentItemData */
+ if (ItemPointerIsValid(&(scan->currentMarkData))) {
+ so->btso_curbuf = _bt_getbuf(scan->relation,
+ BufferGetBlockNumber(so->btso_mrkbuf),
+ BT_READ);
+
+ scan->currentItemData = scan->currentMarkData;
+ }
+}
+
+/* stubs */
+void
+btdelete(Relation rel, ItemPointer tid)
+{
+ /* adjust any active scans that will be affected by this deletion */
+ _bt_adjscans(rel, tid);
+
+ /* delete the data from the page */
+ _bt_pagedel(rel, tid);
+}
diff --git a/src/backend/access/nbtree/nbtscan.c b/src/backend/access/nbtree/nbtscan.c
new file mode 100644
index 00000000000..62a029bc06f
--- /dev/null
+++ b/src/backend/access/nbtree/nbtscan.c
@@ -0,0 +1,164 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtscan.c--
+ * manage scans on btrees.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/nbtscan.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
+ *
+ *
+ * NOTES
+ * Because we can be doing an index scan on a relation while we update
+ * it, we need to avoid missing data that moves around in the index.
+ * The routines and global variables in this file guarantee that all
+ * scans in the local address space stay correctly positioned. This
+ * is all we need to worry about, since write locking guarantees that
+ * no one else will be on the same page at the same time as we are.
+ *
+ * The scheme is to manage a list of active scans in the current backend.
+ * Whenever we add or remove records from an index, or whenever we
+ * split a leaf page, we check the list of active scans to see if any
+ * has been affected. A scan is affected only if it is on the same
+ * relation, and the same page, as the update.
+ *
+ *-------------------------------------------------------------------------
+ */
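+
+/*
+ * for example (a sketch of the flow): when btdelete() in nbtree.c
+ * removes the item at 'tid', it calls
+ *
+ *	_bt_adjscans(rel, tid);
+ *
+ * which walks the list maintained here and, via _bt_scandel(), backs
+ * any scan positioned at or beyond the deleted item on that page up
+ * one slot with _bt_step(scan, &buf, BackwardScanDirection), so the
+ * scan neither skips nor revisits a tuple.
+ */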
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/sdir.h"
+#include "access/nbtree.h"
+
+typedef struct BTScanListData {
+ IndexScanDesc btsl_scan;
+ struct BTScanListData *btsl_next;
+} BTScanListData;
+
+typedef BTScanListData *BTScanList;
+
+static BTScanList BTScans = (BTScanList) NULL;
+
+/*
+ * _bt_regscan() -- register a new scan.
+ */
+void
+_bt_regscan(IndexScanDesc scan)
+{
+ BTScanList new_el;
+
+ new_el = (BTScanList) palloc(sizeof(BTScanListData));
+ new_el->btsl_scan = scan;
+ new_el->btsl_next = BTScans;
+ BTScans = new_el;
+}
+
+/*
+ * _bt_dropscan() -- drop a scan from the scan list
+ */
+void
+_bt_dropscan(IndexScanDesc scan)
+{
+ BTScanList chk, last;
+
+ last = (BTScanList) NULL;
+ for (chk = BTScans;
+ chk != (BTScanList) NULL && chk->btsl_scan != scan;
+ chk = chk->btsl_next) {
+ last = chk;
+ }
+
+ if (chk == (BTScanList) NULL)
+	elog(WARN, "btree scan list trashed; can't find 0x%lx", (unsigned long) scan);
+
+ if (last == (BTScanList) NULL)
+ BTScans = chk->btsl_next;
+ else
+ last->btsl_next = chk->btsl_next;
+
+#ifdef PERFECT_MEM
+ pfree (chk);
+#endif /* PERFECT_MEM */
+}
+
+void
+_bt_adjscans(Relation rel, ItemPointer tid)
+{
+ BTScanList l;
+ Oid relid;
+
+ relid = rel->rd_id;
+ for (l = BTScans; l != (BTScanList) NULL; l = l->btsl_next) {
+ if (relid == l->btsl_scan->relation->rd_id)
+ _bt_scandel(l->btsl_scan, ItemPointerGetBlockNumber(tid),
+ ItemPointerGetOffsetNumber(tid));
+ }
+}
+
+void
+_bt_scandel(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno)
+{
+ ItemPointer current;
+ Buffer buf;
+ BTScanOpaque so;
+
+ if (!_bt_scantouched(scan, blkno, offno))
+ return;
+
+ so = (BTScanOpaque) scan->opaque;
+ buf = so->btso_curbuf;
+
+ current = &(scan->currentItemData);
+ if (ItemPointerIsValid(current)
+ && ItemPointerGetBlockNumber(current) == blkno
+ && ItemPointerGetOffsetNumber(current) >= offno) {
+ _bt_step(scan, &buf, BackwardScanDirection);
+ so->btso_curbuf = buf;
+ }
+
+ current = &(scan->currentMarkData);
+ if (ItemPointerIsValid(current)
+ && ItemPointerGetBlockNumber(current) == blkno
+ && ItemPointerGetOffsetNumber(current) >= offno) {
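+	/*
+	 * the mark, not the current position, must move here; swap the
+	 * two so that _bt_step() repositions the mark, then swap back.
+	 */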
+ ItemPointerData tmp;
+ tmp = *current;
+ *current = scan->currentItemData;
+ scan->currentItemData = tmp;
+ _bt_step(scan, &buf, BackwardScanDirection);
+ so->btso_mrkbuf = buf;
+ tmp = *current;
+ *current = scan->currentItemData;
+ scan->currentItemData = tmp;
+ }
+}
+
+bool
+_bt_scantouched(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno)
+{
+ ItemPointer current;
+
+ current = &(scan->currentItemData);
+ if (ItemPointerIsValid(current)
+ && ItemPointerGetBlockNumber(current) == blkno
+ && ItemPointerGetOffsetNumber(current) >= offno)
+ return (true);
+
+ current = &(scan->currentMarkData);
+ if (ItemPointerIsValid(current)
+ && ItemPointerGetBlockNumber(current) == blkno
+ && ItemPointerGetOffsetNumber(current) >= offno)
+ return (true);
+
+ return (false);
+}
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
new file mode 100644
index 00000000000..d7a7fc7d62e
--- /dev/null
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -0,0 +1,1133 @@
+/*-------------------------------------------------------------------------
+ *
+ * nbtsearch.c--
+ * search code for postgres btrees.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "fmgr.h"
+
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/skey.h"
+#include "access/sdir.h"
+#include "access/nbtree.h"
+
+static BTStack _bt_searchr(Relation rel, int keysz, ScanKey scankey, Buffer *bufP, BTStack stack_in);
+static OffsetNumber _bt_firsteq(Relation rel, TupleDesc itupdesc, Page page, Size keysz, ScanKey scankey, OffsetNumber offnum);
+static int _bt_compare(Relation rel, TupleDesc itupdesc, Page page, int keysz, ScanKey scankey, OffsetNumber offnum);
+static bool _bt_twostep(IndexScanDesc scan, Buffer *bufP, ScanDirection dir);
+static RetrieveIndexResult _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
+
+/*
+ * _bt_search() -- Search for a scan key in the index.
+ *
+ * This routine is actually just a helper that sets things up and
+ * calls a recursive-descent search routine on the tree.
+ */
+BTStack
+_bt_search(Relation rel, int keysz, ScanKey scankey, Buffer *bufP)
+{
+ *bufP = _bt_getroot(rel, BT_READ);
+ return (_bt_searchr(rel, keysz, scankey, bufP, (BTStack) NULL));
+}
+
+/*
+ * _bt_searchr() -- Search the tree recursively for a particular scankey.
+ */
+static BTStack
+_bt_searchr(Relation rel,
+ int keysz,
+ ScanKey scankey,
+ Buffer *bufP,
+ BTStack stack_in)
+{
+ BTStack stack;
+ OffsetNumber offnum;
+ Page page;
+ BTPageOpaque opaque;
+ BlockNumber par_blkno;
+ BlockNumber blkno;
+ ItemId itemid;
+ BTItem btitem;
+ BTItem item_save;
+ int item_nbytes;
+ IndexTuple itup;
+
+ /* if this is a leaf page, we're done */
+ page = BufferGetPage(*bufP);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (opaque->btpo_flags & BTP_LEAF)
+ return (stack_in);
+
+ /*
+ * Find the appropriate item on the internal page, and get the child
+ * page that it points to.
+ */
+
+ par_blkno = BufferGetBlockNumber(*bufP);
+ offnum = _bt_binsrch(rel, *bufP, keysz, scankey, BT_DESCENT);
+ itemid = PageGetItemId(page, offnum);
+ btitem = (BTItem) PageGetItem(page, itemid);
+ itup = &(btitem->bti_itup);
+ blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
+
+ /*
+ * We need to save the bit image of the index entry we chose in the
+ * parent page on a stack. In case we split the tree, we'll use this
+ * bit image to figure out what our real parent page is, in case the
+ * parent splits while we're working lower in the tree. See the paper
+ * by Lehman and Yao for how this is detected and handled. (We use
+ * unique OIDs to disambiguate duplicate keys in the index -- Lehman
+ * and Yao disallow duplicate keys).
+ */
+
+ item_nbytes = ItemIdGetLength(itemid);
+ item_save = (BTItem) palloc(item_nbytes);
+ memmove((char *) item_save, (char *) btitem, item_nbytes);
+ stack = (BTStack) palloc(sizeof(BTStackData));
+ stack->bts_blkno = par_blkno;
+ stack->bts_offset = offnum;
+ stack->bts_btitem = item_save;
+ stack->bts_parent = stack_in;
+
+ /* drop the read lock on the parent page and acquire one on the child */
+ _bt_relbuf(rel, *bufP, BT_READ);
+ *bufP = _bt_getbuf(rel, blkno, BT_READ);
+
+ /*
+ * Race -- the page we just grabbed may have split since we read its
+ * pointer in the parent. If it has, we may need to move right to its
+ * new sibling. Do that.
+ */
+
+ *bufP = _bt_moveright(rel, *bufP, keysz, scankey, BT_READ);
+
+ /* okay, all set to move down a level */
+ return (_bt_searchr(rel, keysz, scankey, bufP, stack));
+}
+
+/*
+ * _bt_moveright() -- move right in the btree if necessary.
+ *
+ * When we drop and reacquire a pointer to a page, it is possible that
+ * the page has changed in the meanwhile. If this happens, we're
+ * guaranteed that the page has "split right" -- that is, that any
+ * data that appeared on the page originally is either on the page
+ * or strictly to the right of it.
+ *
+ * This routine decides whether or not we need to move right in the
+ * tree by examining the high key entry on the page. If that entry
+ * is strictly less than one we expect to be on the page, then our
+ * picture of the page is incorrect and we need to move right.
+ *
+ * On entry, we have the buffer pinned and a lock of the proper type.
+ * If we move right, we release the buffer and lock and acquire the
+ * same on the right sibling.
+ */
+Buffer
+_bt_moveright(Relation rel,
+ Buffer buf,
+ int keysz,
+ ScanKey scankey,
+ int access)
+{
+ Page page;
+ BTPageOpaque opaque;
+ ItemId hikey;
+ ItemId itemid;
+ BlockNumber rblkno;
+
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /* if we're on a rightmost page, we don't need to move right */
+ if (P_RIGHTMOST(opaque))
+ return (buf);
+
+ /* by convention, item 0 on non-rightmost pages is the high key */
+ hikey = PageGetItemId(page, P_HIKEY);
+
+ /*
+ * If the scan key that brought us to this page is >= the high key
+ * stored on the page, then the page has split and we need to move
+ * right.
+ */
+
+ if (_bt_skeycmp(rel, keysz, scankey, page, hikey,
+ BTGreaterEqualStrategyNumber)) {
+
+ /* move right as long as we need to */
+ do {
+ /*
+ * If this page consists of all duplicate keys (hikey and first
+ * key on the page have the same value), then we don't need to
+ * step right.
+ */
+ if (PageGetMaxOffsetNumber(page) > P_HIKEY) {
+ itemid = PageGetItemId(page, P_FIRSTKEY);
+ if (_bt_skeycmp(rel, keysz, scankey, page, itemid,
+ BTEqualStrategyNumber)) {
+ /* break is for the "move right" while loop */
+ break;
+ }
+ }
+
+ /* step right one page */
+ rblkno = opaque->btpo_next;
+ _bt_relbuf(rel, buf, access);
+ buf = _bt_getbuf(rel, rblkno, access);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ hikey = PageGetItemId(page, P_HIKEY);
+
+ } while (! P_RIGHTMOST(opaque)
+ && _bt_skeycmp(rel, keysz, scankey, page, hikey,
+ BTGreaterEqualStrategyNumber));
+ }
+ return (buf);
+}
+
+/*
+ * _bt_skeycmp() -- compare a scan key to a particular item on a page using
+ * a requested strategy (<, <=, =, >=, >).
+ *
+ * We ignore the unique OIDs stored in the btree item here. Those
+ * numbers are intended for use internally only, in repositioning a
+ * scan after a page split. They do not impose any meaningful ordering.
+ *
+ * The comparison is A <op> B, where A is the scan key and B is the
+ * tuple pointed at by itemid on page.
+ */
+bool
+_bt_skeycmp(Relation rel,
+ Size keysz,
+ ScanKey scankey,
+ Page page,
+ ItemId itemid,
+ StrategyNumber strat)
+{
+ BTItem item;
+ IndexTuple indexTuple;
+ TupleDesc tupDes;
+ ScanKey entry;
+ int i;
+ Datum attrDatum;
+ Datum keyDatum;
+ bool compare;
+ bool isNull;
+
+ item = (BTItem) PageGetItem(page, itemid);
+ indexTuple = &(item->bti_itup);
+
+ tupDes = RelationGetTupleDescriptor(rel);
+
+ /* see if the comparison is true for all of the key attributes */
+ for (i=1; i <= keysz; i++) {
+
+ entry = &scankey[i-1];
+ attrDatum = index_getattr(indexTuple,
+ entry->sk_attno,
+ tupDes,
+ &isNull);
+ keyDatum = entry->sk_argument;
+
+ compare = _bt_invokestrat(rel, i, strat, keyDatum, attrDatum);
+ if (!compare)
+ return (false);
+ }
+
+ return (true);
+}
+
+/*
+ * _bt_binsrch() -- Do a binary search for a key on a particular page.
+ *
+ * The scankey we get has the compare function stored in the procedure
+ * entry of each data struct. We invoke this regproc to do the
+ * comparison for every key in the scankey. _bt_binsrch() returns
+ * the OffsetNumber of the first matching key on the page, or the
+ * OffsetNumber at which the matching key would appear if it were
+ * on this page.
+ *
+ * By the time this procedure is called, we're sure we're looking
+ * at the right page -- don't need to walk right. _bt_binsrch() has
+ * no lock or refcount side effects on the buffer.
+ */
+OffsetNumber
+_bt_binsrch(Relation rel,
+ Buffer buf,
+ int keysz,
+ ScanKey scankey,
+ int srchtype)
+{
+ TupleDesc itupdesc;
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber low, mid, high;
+ bool match;
+ int result;
+
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /* by convention, item 0 on any non-rightmost page is the high key */
+ low = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+ high = PageGetMaxOffsetNumber(page);
+
+ /*
+ * Since for non-rightmost pages, the zeroeth item on the page is the
+ * high key, there are two notions of emptiness. One is if nothing
+ * appears on the page. The other is if nothing but the high key does.
+ * The reason we test high <= low, rather than high == low, is that
+ * after vacuuming there may be nothing *but* the high key on a page.
+ * In that case, given the scheme above, low = 1 and high = 0.
+ */
+
+ if (PageIsEmpty(page) || (! P_RIGHTMOST(opaque) && high <= low))
+ return (low);
+
+ itupdesc = RelationGetTupleDescriptor(rel);
+ match = false;
+
+ while ((high - low) > 1) {
+ mid = low + ((high - low) / 2);
+ result = _bt_compare(rel, itupdesc, page, keysz, scankey, mid);
+
+ if (result > 0)
+ low = mid;
+ else if (result < 0)
+ high = mid - 1;
+ else {
+ match = true;
+ break;
+ }
+ }
+
+ /* if we found a match, we want to find the first one on the page */
+ if (match) {
+ return (_bt_firsteq(rel, itupdesc, page, keysz, scankey, mid));
+ } else {
+
+ /*
+ * We terminated because the endpoints got too close together. There
+ * are two cases to take care of.
+ *
+ * For non-insertion searches on internal pages, we want to point at
+ * the last key <, or first key =, the scankey on the page. This
+ * guarantees that we'll descend the tree correctly.
+ *
+ * For all other cases, we want to point at the first key >=
+ * the scankey on the page. This guarantees that scans and
+ * insertions will happen correctly.
+ */
+
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (!(opaque->btpo_flags & BTP_LEAF) && srchtype == BT_DESCENT) {
+
+ /*
+ * We want the last key <, or first key ==, the scan key.
+ */
+
+ result = _bt_compare(rel, itupdesc, page, keysz, scankey, high);
+
+ if (result == 0) {
+ return (_bt_firsteq(rel, itupdesc, page, keysz, scankey, high));
+ } else if (result > 0) {
+ return (high);
+ } else {
+ return (low);
+ }
+ } else {
+
+ /* we want the first key >= the scan key */
+ result = _bt_compare(rel, itupdesc, page, keysz, scankey, low);
+ if (result <= 0) {
+ return (low);
+ } else {
+ if (low == high)
+ return (OffsetNumberNext(low));
+
+ result = _bt_compare(rel, itupdesc, page, keysz, scankey, high);
+ if (result <= 0)
+ return (high);
+ else
+ return (OffsetNumberNext(high));
+ }
+ }
+ }
+}
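+
+/*
+ * a worked example of the cases above (illustrative values): suppose
+ * keysz = 1 and a page holds the keys (5, 7, 7, 9).
+ *
+ *	scankey = 7: the binary search hits a 7, and _bt_firsteq()
+ *	  backs up to return the offset of the first 7.
+ *	scankey = 6, leaf page or insertion search: returns the offset
+ *	  of the first key >= 6, i.e., the first 7.
+ *	scankey = 6, BT_DESCENT on an internal page: returns the offset
+ *	  of 5, the last key < 6, so the descent follows the correct
+ *	  downlink.
+ */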
+
+static OffsetNumber
+_bt_firsteq(Relation rel,
+ TupleDesc itupdesc,
+ Page page,
+ Size keysz,
+ ScanKey scankey,
+ OffsetNumber offnum)
+{
+ BTPageOpaque opaque;
+ OffsetNumber limit;
+
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /* skip the high key, if any */
+ limit = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+ /* walk backwards looking for the first key in the chain of duplicates */
+ while (offnum > limit
+ && _bt_compare(rel, itupdesc, page,
+ keysz, scankey, OffsetNumberPrev(offnum)) == 0) {
+ offnum = OffsetNumberPrev(offnum);
+ }
+
+ return (offnum);
+}
+
+/*
+ * _bt_compare() -- Compare scankey to a particular tuple on the page.
+ *
+ * This routine returns:
+ * -1 if scankey < tuple at offnum;
+ * 0 if scankey == tuple at offnum;
+ * +1 if scankey > tuple at offnum.
+ *
+ * In order to avoid having to propagate changes up the tree any time
+ * a new minimal key is inserted, the leftmost entry on the leftmost
+ * page is less than all possible keys, by definition.
+ */
+static int
+_bt_compare(Relation rel,
+ TupleDesc itupdesc,
+ Page page,
+ int keysz,
+ ScanKey scankey,
+ OffsetNumber offnum)
+{
+ Datum datum;
+ BTItem btitem;
+ ItemId itemid;
+ IndexTuple itup;
+ BTPageOpaque opaque;
+ ScanKey entry;
+ AttrNumber attno;
+ int result;
+ int i;
+ bool null;
+
+ /*
+ * If this is a leftmost internal page, and if our comparison is
+ * with the first key on the page, then the item at that position is
+ * by definition less than the scan key.
+ */
+
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (!(opaque->btpo_flags & BTP_LEAF)
+ && P_LEFTMOST(opaque)
+ && offnum == P_HIKEY) {
+ itemid = PageGetItemId(page, offnum);
+
+ /*
+ * we just have to believe that this will only be called with
+ * offnum == P_HIKEY when P_HIKEY is the OffsetNumber of the
+ * first actual data key (i.e., this is also a rightmost
+ * page). there doesn't seem to be any code that implies
+ * that the leftmost page is normally missing a high key as
+ * well as the rightmost page. but that implies that this
+ * code path only applies to the root -- which seems
+ * unlikely..
+ */
+ if (! P_RIGHTMOST(opaque)) {
+ elog(WARN, "_bt_compare: invalid comparison to high key");
+ }
+
+ /*
+ * If the item on the page is equal to the scankey, that's
+ * okay to admit. We just can't claim that the first key on
+ * the page is greater than anything.
+ */
+
+ if (_bt_skeycmp(rel, keysz, scankey, page, itemid,
+ BTEqualStrategyNumber)) {
+ return (0);
+ }
+ return (1);
+ }
+
+ btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
+ itup = &(btitem->bti_itup);
+
+ /*
+ * The scan key is set up with the attribute number associated with each
+ * term in the key. It is important that, if the index is multi-key,
+ * the scan contain the first k key attributes, and that they be in
+ * order. If you think about how multi-key ordering works, you'll
+ * understand why this is.
+ *
+ * We don't test for violation of this condition here.
+ */
+
+ for (i = 1; i <= keysz; i++) {
+ long tmpres;
+
+ entry = &scankey[i - 1];
+ attno = entry->sk_attno;
+ datum = index_getattr(itup, attno, itupdesc, &null);
+ tmpres = (long) FMGR_PTR2(entry->sk_func, entry->sk_procedure,
+ entry->sk_argument, datum);
+ result = tmpres;
+
+ /* if the keys are unequal, return the difference */
+ if (result != 0)
+ return (result);
+ }
+
+ /* by here, the keys are equal */
+ return (0);
+}
+
+/*
+ * _bt_next() -- Get the next item in a scan.
+ *
+ * On entry, we have a valid currentItemData in the scan, and a
+ * read lock on the page that contains that item. We do not have
+ * the page pinned. We return the next item in the scan. On
+ * exit, we have the page containing the next item locked but not
+ * pinned.
+ */
+RetrieveIndexResult
+_bt_next(IndexScanDesc scan, ScanDirection dir)
+{
+ Relation rel;
+ Buffer buf;
+ Page page;
+ OffsetNumber offnum;
+ RetrieveIndexResult res;
+ BlockNumber blkno;
+ ItemPointer current;
+ ItemPointer iptr;
+ BTItem btitem;
+ IndexTuple itup;
+ BTScanOpaque so;
+
+ rel = scan->relation;
+ so = (BTScanOpaque) scan->opaque;
+ current = &(scan->currentItemData);
+
+ /*
+ * XXX 10 may 91: somewhere there's a bug in our management of the
+ * cached buffer for this scan. wei discovered it. the following
+ * is a workaround so he can work until i figure out what's going on.
+ */
+
+ if (!BufferIsValid(so->btso_curbuf))
+ so->btso_curbuf = _bt_getbuf(rel, ItemPointerGetBlockNumber(current),
+ BT_READ);
+
+ /* we still have the buffer pinned and locked */
+ buf = so->btso_curbuf;
+ blkno = BufferGetBlockNumber(buf);
+
+ /* step one tuple in the appropriate direction */
+ if (!_bt_step(scan, &buf, dir))
+ return ((RetrieveIndexResult) NULL);
+
+ /* by here, current is the tuple we want to return */
+ offnum = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(buf);
+ btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
+ itup = &btitem->bti_itup;
+
+ if (_bt_checkqual(scan, itup)) {
+ iptr = (ItemPointer) palloc(sizeof(ItemPointerData));
+ memmove((char *) iptr, (char *) &(itup->t_tid),
+ sizeof(ItemPointerData));
+ res = FormRetrieveIndexResult(current, iptr);
+
+ /* remember which buffer we have pinned and locked */
+ so->btso_curbuf = buf;
+ } else {
+ ItemPointerSetInvalid(current);
+ so->btso_curbuf = InvalidBuffer;
+ _bt_relbuf(rel, buf, BT_READ);
+ res = (RetrieveIndexResult) NULL;
+ }
+
+ return (res);
+}
+
+/*
+ * _bt_first() -- Find the first item in a scan.
+ *
+ * We need to be clever about the type of scan, the operation it's
+ * performing, and the tree ordering. We return the RetrieveIndexResult
+ * of the first item in the tree that satisfies the qualification
+ * associated with the scan descriptor. On exit, the page containing
+ * the current index tuple is read locked and pinned, and the scan's
+ * opaque data entry is updated to include the buffer.
+ */
+RetrieveIndexResult
+_bt_first(IndexScanDesc scan, ScanDirection dir)
+{
+ Relation rel;
+ TupleDesc itupdesc;
+ Buffer buf;
+ Page page;
+ BTStack stack;
+ OffsetNumber offnum, maxoff;
+ BTItem btitem;
+ IndexTuple itup;
+ ItemPointer current;
+ ItemPointer iptr;
+ BlockNumber blkno;
+ StrategyNumber strat;
+ RetrieveIndexResult res;
+ RegProcedure proc;
+ int result;
+ BTScanOpaque so;
+ ScanKeyData skdata;
+
+ /* if we just need to walk down one edge of the tree, do that */
+ if (scan->scanFromEnd)
+ return (_bt_endpoint(scan, dir));
+
+ rel = scan->relation;
+ itupdesc = RelationGetTupleDescriptor(scan->relation);
+ current = &(scan->currentItemData);
+ so = (BTScanOpaque) scan->opaque;
+
+ /*
+ * Okay, we want something more complicated. What we'll do is use
+ * the first item in the scan key passed in (which has been correctly
+ * ordered to take advantage of index ordering) to position ourselves
+ * at the right place in the scan.
+ */
+
+ /*
+ * XXX -- The attribute number stored in the scan key is the attno
+ * in the heap relation. We need to transmogrify this into
+ * the index relation attno here. For the moment, we have
+ * hardwired attno == 1.
+ */
+ proc = index_getprocid(rel, 1, BTORDER_PROC);
+ ScanKeyEntryInitialize(&skdata, 0x0, 1, proc,
+ scan->keyData[0].sk_argument);
+
+ stack = _bt_search(rel, 1, &skdata, &buf);
+ _bt_freestack(stack);
+
+ /* find the nearest match to the manufactured scan key on the page */
+ offnum = _bt_binsrch(rel, buf, 1, &skdata, BT_DESCENT);
+ page = BufferGetPage(buf);
+
+ /*
+ * This will happen if the tree we're searching is entirely empty,
+ * or if we're doing a search for a key that would appear on an
+ * entirely empty internal page. In either case, there are no
+ * matching tuples in the index.
+ */
+
+ if (PageIsEmpty(page)) {
+ ItemPointerSetInvalid(current);
+ so->btso_curbuf = InvalidBuffer;
+ _bt_relbuf(rel, buf, BT_READ);
+ return ((RetrieveIndexResult) NULL);
+ }
+
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ if (offnum > maxoff)
+ offnum = maxoff;
+
+ blkno = BufferGetBlockNumber(buf);
+ ItemPointerSet(current, blkno, offnum);
+
+ /*
+ * Now find the right place to start the scan. Result is the
+ * value we're looking for minus the value we're looking at
+ * in the index.
+ */
+
+ result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum);
+ strat = _bt_getstrat(rel, 1, scan->keyData[0].sk_procedure);
+
+ switch (strat) {
+ case BTLessStrategyNumber:
+ if (result <= 0) {
+ do {
+ if (!_bt_twostep(scan, &buf, BackwardScanDirection))
+ break;
+
+ offnum = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(buf);
+ result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum);
+ } while (result <= 0);
+
+ /* if this is true, the key we just looked at is gone */
+ if (result > 0)
+ (void) _bt_twostep(scan, &buf, ForwardScanDirection);
+ }
+ break;
+
+ case BTLessEqualStrategyNumber:
+ if (result >= 0) {
+ do {
+ if (!_bt_twostep(scan, &buf, ForwardScanDirection))
+ break;
+
+ offnum = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(buf);
+ result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum);
+ } while (result >= 0);
+
+ if (result < 0)
+ (void) _bt_twostep(scan, &buf, BackwardScanDirection);
+ }
+ break;
+
+ case BTEqualStrategyNumber:
+ if (result != 0) {
+ _bt_relbuf(scan->relation, buf, BT_READ);
+ so->btso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(&(scan->currentItemData));
+ return ((RetrieveIndexResult) NULL);
+ }
+ break;
+
+ case BTGreaterEqualStrategyNumber:
+ if (result < 0) {
+ do {
+ if (!_bt_twostep(scan, &buf, BackwardScanDirection))
+ break;
+
+ page = BufferGetPage(buf);
+ offnum = ItemPointerGetOffsetNumber(current);
+ result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum);
+ } while (result < 0);
+
+ if (result > 0)
+ (void) _bt_twostep(scan, &buf, ForwardScanDirection);
+ }
+ break;
+
+ case BTGreaterStrategyNumber:
+ if (result >= 0) {
+ do {
+ if (!_bt_twostep(scan, &buf, ForwardScanDirection))
+ break;
+
+ offnum = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(buf);
+ result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum);
+ } while (result >= 0);
+ }
+ break;
+ }
+
+ /* okay, current item pointer for the scan is right */
+ offnum = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(buf);
+ btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
+ itup = &btitem->bti_itup;
+
+ if (_bt_checkqual(scan, itup)) {
+ iptr = (ItemPointer) palloc(sizeof(ItemPointerData));
+ memmove((char *) iptr, (char *) &(itup->t_tid),
+ sizeof(ItemPointerData));
+ res = FormRetrieveIndexResult(current, iptr);
+ pfree(iptr);
+
+ /* remember which buffer we have pinned */
+ so->btso_curbuf = buf;
+ } else {
+ ItemPointerSetInvalid(current);
+ so->btso_curbuf = InvalidBuffer;
+ _bt_relbuf(rel, buf, BT_READ);
+ res = (RetrieveIndexResult) NULL;
+ }
+
+ return (res);
+}
+
+/*
+ * _bt_step() -- Step one item in the requested direction in a scan on
+ * the tree.
+ *
+ * If no adjacent record exists in the requested direction, return
+ * false. Else, return true and set the currentItemData for the
+ * scan to the right thing.
+ */
+bool
+_bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
+{
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber offnum, maxoff;
+ OffsetNumber start;
+ BlockNumber blkno;
+ BlockNumber obknum;
+ BTScanOpaque so;
+ ItemPointer current;
+ Relation rel;
+
+ rel = scan->relation;
+ current = &(scan->currentItemData);
+ offnum = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(*bufP);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ so = (BTScanOpaque) scan->opaque;
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ /* get the next tuple */
+ if (ScanDirectionIsForward(dir)) {
+ if (!PageIsEmpty(page) && offnum < maxoff) {
+ offnum = OffsetNumberNext(offnum);
+ } else {
+
+ /* if we're at end of scan, release the buffer and return */
+ blkno = opaque->btpo_next;
+ if (P_RIGHTMOST(opaque)) {
+ _bt_relbuf(rel, *bufP, BT_READ);
+ ItemPointerSetInvalid(current);
+ *bufP = so->btso_curbuf = InvalidBuffer;
+ return (false);
+ } else {
+
+ /* walk right to the next page with data */
+ _bt_relbuf(rel, *bufP, BT_READ);
+ for (;;) {
+ *bufP = _bt_getbuf(rel, blkno, BT_READ);
+ page = BufferGetPage(*bufP);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ maxoff = PageGetMaxOffsetNumber(page);
+ start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+ if (!PageIsEmpty(page) && start <= maxoff) {
+ break;
+ } else {
+ blkno = opaque->btpo_next;
+ _bt_relbuf(rel, *bufP, BT_READ);
+ if (blkno == P_NONE) {
+ *bufP = so->btso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(current);
+ return (false);
+ }
+ }
+ }
+ offnum = start;
+ }
+ }
+ } else if (ScanDirectionIsBackward(dir)) {
+
+ /* remember that high key is item zero on non-rightmost pages */
+ start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+ if (offnum > start) {
+ offnum = OffsetNumberPrev(offnum);
+ } else {
+
+ /* if we're at end of scan, release the buffer and return */
+ blkno = opaque->btpo_prev;
+ if (P_LEFTMOST(opaque)) {
+ _bt_relbuf(rel, *bufP, BT_READ);
+ *bufP = so->btso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(current);
+ return (false);
+ } else {
+
+ obknum = BufferGetBlockNumber(*bufP);
+
+	    /* walk left to the next page with data */
+ _bt_relbuf(rel, *bufP, BT_READ);
+ for (;;) {
+ *bufP = _bt_getbuf(rel, blkno, BT_READ);
+ page = BufferGetPage(*bufP);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ /*
+ * If the adjacent page just split, then we may have the
+ * wrong block. Handle this case. Because pages only
+ * split right, we don't have to worry about this failing
+ * to terminate.
+ */
+
+ while (opaque->btpo_next != obknum) {
+ blkno = opaque->btpo_next;
+ _bt_relbuf(rel, *bufP, BT_READ);
+ *bufP = _bt_getbuf(rel, blkno, BT_READ);
+ page = BufferGetPage(*bufP);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ maxoff = PageGetMaxOffsetNumber(page);
+ }
+
+ /* don't consider the high key */
+ start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+ /* anything to look at here? */
+ if (!PageIsEmpty(page) && maxoff >= start) {
+ break;
+ } else {
+ blkno = opaque->btpo_prev;
+ obknum = BufferGetBlockNumber(*bufP);
+ _bt_relbuf(rel, *bufP, BT_READ);
+ if (blkno == P_NONE) {
+ *bufP = so->btso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(current);
+ return (false);
+ }
+ }
+ }
+ offnum = maxoff; /* XXX PageIsEmpty? */
+ }
+ }
+ }
+ blkno = BufferGetBlockNumber(*bufP);
+ so->btso_curbuf = *bufP;
+ ItemPointerSet(current, blkno, offnum);
+
+ return (true);
+}
+
+/*
+ * _bt_twostep() -- Move to an adjacent record in a scan on the tree,
+ * if an adjacent record exists.
+ *
+ * This is like _bt_step, except that if no adjacent record exists
+ * it restores us to where we were before trying the step. This is
+ * only hairy when you cross page boundaries, since the page you cross
+ * from could have records inserted or deleted, or could even split.
+ * This is unlikely, but we try to handle it correctly here anyway.
+ *
+ *	This routine contains the only case in which our changes to Lehman
+ *	and Yao's algorithm show through to the scan logic: because a scan
+ *	may run in either direction, we sometimes have to refind our old
+ *	position after crossing a page boundary.
+ *
+ * Like step, this routine leaves the scan's currentItemData in the
+ * proper state and acquires a lock and pin on *bufP. If the twostep
+ * succeeded, we return true; otherwise, we return false.
+ */
+static bool
+_bt_twostep(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
+{
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber offnum, maxoff;
+ OffsetNumber start;
+ ItemPointer current;
+ ItemId itemid;
+ int itemsz;
+ BTItem btitem;
+ BTItem svitem;
+ BlockNumber blkno;
+
+ blkno = BufferGetBlockNumber(*bufP);
+ page = BufferGetPage(*bufP);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ maxoff = PageGetMaxOffsetNumber(page);
+ current = &(scan->currentItemData);
+ offnum = ItemPointerGetOffsetNumber(current);
+
+ start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+ /* if we're safe, just do it */
+ if (ScanDirectionIsForward(dir) && offnum < maxoff) { /* XXX PageIsEmpty? */
+ ItemPointerSet(current, blkno, OffsetNumberNext(offnum));
+ return (true);
+ } else if (ScanDirectionIsBackward(dir) && offnum > start) {
+ ItemPointerSet(current, blkno, OffsetNumberPrev(offnum));
+ return (true);
+ }
+
+ /* if we've hit end of scan we don't have to do any work */
+ if (ScanDirectionIsForward(dir) && P_RIGHTMOST(opaque)) {
+ return (false);
+ } else if (ScanDirectionIsBackward(dir) && P_LEFTMOST(opaque)) {
+ return (false);
+ }
+
+ /*
+ * Okay, it's off the page; let _bt_step() do the hard work, and we'll
+ * try to remember where we were. This is not guaranteed to work; this
+ * is the only place in the code where concurrency can screw us up,
+ * and it's because we want to be able to move in two directions in
+ * the scan.
+ */
+
+ itemid = PageGetItemId(page, offnum);
+ itemsz = ItemIdGetLength(itemid);
+ btitem = (BTItem) PageGetItem(page, itemid);
+ svitem = (BTItem) palloc(itemsz);
+ memmove((char *) svitem, (char *) btitem, itemsz);
+
+ if (_bt_step(scan, bufP, dir)) {
+ pfree(svitem);
+ return (true);
+ }
+
+ /* try to find our place again */
+ *bufP = _bt_getbuf(scan->relation, blkno, BT_READ);
+ page = BufferGetPage(*bufP);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+    while (offnum <= maxoff) {
+	itemid = PageGetItemId(page, offnum);
+	btitem = (BTItem) PageGetItem(page, itemid);
+	if (btitem->bti_oid == svitem->bti_oid) {
+	    pfree(svitem);
+	    ItemPointerSet(current, blkno, offnum);
+	    return (false);
+	}
+	/* advance, or this loop will never terminate */
+	offnum = OffsetNumberNext(offnum);
+    }
+
+ /*
+ * XXX crash and burn -- can't find our place. We can be a little
+ * smarter -- walk to the next page to the right, for example, since
+ * that's the only direction that splits happen in. Deletions screw
+ * us up less often since they're only done by the vacuum daemon.
+ */
+
+ elog(WARN, "btree synchronization error: concurrent update botched scan");
+
+ return (false);
+}
+
+/*
+ * _bt_endpoint() -- Find the first or last key in the index.
+ */
+static RetrieveIndexResult
+_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
+{
+ Relation rel;
+ Buffer buf;
+ Page page;
+ BTPageOpaque opaque;
+ ItemPointer current;
+ ItemPointer iptr;
+ OffsetNumber offnum, maxoff;
+ OffsetNumber start;
+ BlockNumber blkno;
+ BTItem btitem;
+ IndexTuple itup;
+ BTScanOpaque so;
+ RetrieveIndexResult res;
+
+ rel = scan->relation;
+ current = &(scan->currentItemData);
+
+ buf = _bt_getroot(rel, BT_READ);
+ blkno = BufferGetBlockNumber(buf);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ for (;;) {
+ if (opaque->btpo_flags & BTP_LEAF)
+ break;
+
+ if (ScanDirectionIsForward(dir)) {
+ offnum = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+ } else {
+ offnum = PageGetMaxOffsetNumber(page);
+ }
+
+ btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
+ itup = &(btitem->bti_itup);
+
+ blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
+
+ _bt_relbuf(rel, buf, BT_READ);
+ buf = _bt_getbuf(rel, blkno, BT_READ);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /*
+ * Race condition: If the child page we just stepped onto is
+ * in the process of being split, we need to make sure we're
+ * all the way at the right edge of the tree. See the paper
+ * by Lehman and Yao.
+ */
+
+ if (ScanDirectionIsBackward(dir) && ! P_RIGHTMOST(opaque)) {
+ do {
+ blkno = opaque->btpo_next;
+ _bt_relbuf(rel, buf, BT_READ);
+ buf = _bt_getbuf(rel, blkno, BT_READ);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ } while (! P_RIGHTMOST(opaque));
+ }
+ }
+
+ /* okay, we've got the {left,right}-most page in the tree */
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ if (ScanDirectionIsForward(dir)) {
+ if (PageIsEmpty(page)) {
+ maxoff = FirstOffsetNumber;
+ } else {
+ maxoff = PageGetMaxOffsetNumber(page);
+ }
+ start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+ if (PageIsEmpty(page) || start > maxoff) {
+ ItemPointerSet(current, blkno, maxoff);
+ if (!_bt_step(scan, &buf, BackwardScanDirection))
+ return ((RetrieveIndexResult) NULL);
+
+ start = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(buf);
+ } else {
+ ItemPointerSet(current, blkno, start);
+ }
+ } else if (ScanDirectionIsBackward(dir)) {
+ if (PageIsEmpty(page)) {
+ ItemPointerSet(current, blkno, FirstOffsetNumber);
+ if (!_bt_step(scan, &buf, ForwardScanDirection))
+ return ((RetrieveIndexResult) NULL);
+
+ start = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(buf);
+ } else {
+ start = PageGetMaxOffsetNumber(page);
+ ItemPointerSet(current, blkno, start);
+ }
+ } else {
+ elog(WARN, "Illegal scan direction %d", dir);
+ }
+
+ btitem = (BTItem) PageGetItem(page, PageGetItemId(page, start));
+ itup = &(btitem->bti_itup);
+
+ /* see if we picked a winner */
+ if (_bt_checkqual(scan, itup)) {
+ iptr = (ItemPointer) palloc(sizeof(ItemPointerData));
+ memmove((char *) iptr, (char *) &(itup->t_tid),
+ sizeof(ItemPointerData));
+ res = FormRetrieveIndexResult(current, iptr);
+
+ /* remember which buffer we have pinned */
+ so = (BTScanOpaque) scan->opaque;
+ so->btso_curbuf = buf;
+ } else {
+ _bt_relbuf(rel, buf, BT_READ);
+ res = (RetrieveIndexResult) NULL;
+ }
+
+ return (res);
+}
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
new file mode 100644
index 00000000000..3d2676324a0
--- /dev/null
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -0,0 +1,1196 @@
+/*-------------------------------------------------------------------------
+ * nbtsort.c--
+ *	build btrees from sorted input.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Id: nbtsort.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
+ *
+ * NOTES
+ *
+ * what we do is:
+ * - generate a set of initial one-block runs, distributed round-robin
+ * between the output tapes.
+ * - for each pass,
+ * - swap input and output tape sets, rewinding both and truncating
+ * the output tapes.
+ * - merge the current run in each input tape to the current output
+ * tape.
+ * - when each input run has been exhausted, switch to another output
+ * tape and start processing another run.
+ * - when we have fewer runs than tapes, we know we are ready to start
+ * merging into the btree leaf pages.
+ * - every time we complete a level of the btree, we can construct the
+ * next level up. when we have only one page on a level, it can be
+ * attached to the btree metapage and we are done.
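+ *	  (a sketch of one merge pass follows these notes.)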
+ *
+ * conventions:
+ * - external interface routines take in and return "void *" for their
+ * opaque handles. this is for modularity reasons (i prefer not to
+ * export these structures without good reason).
+ *
+ * this code is moderately slow (~10% slower) compared to the regular
+ * btree (insertion) build code on sorted or well-clustered data. on
+ * random data, however, the insertion build code is unusable -- the
+ * difference on a 60MB heap is a factor of 15 because the random
+ * probes into the btree thrash the buffer pool.
+ *
+ * this code currently packs the pages to 100% of capacity. this is
+ * not wise, since *any* insertion will cause splitting. filling to
+ * something like the standard 70% steady-state load factor for btrees
+ * would probably be better.
+ *
+ * somebody desperately needs to figure out how to do a better job of
+ * balancing the merge passes -- the fan-in on the final merges can be
+ * pretty poor, which is bad for performance.
+ *-------------------------------------------------------------------------
+ */
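+
+/*
+ * a sketch of one merge pass, restating the NOTES above (pseudocode
+ * only; the real code appears later in this file):
+ *
+ *	_bt_spoolswap(btspool);		-- swap input and output tape sets
+ *	for (each set of current runs on the input tapes) {
+ *		prime a priority queue with the head item from each
+ *		input tape's current run;
+ *		repeatedly pull the smallest item from the queue, copy
+ *		it to the current output tape, and replace it with the
+ *		next item from the same input tape;
+ *		when every input run is exhausted, mark End-Of-Run on
+ *		the output tape and switch to the next output tape
+ *		(round-robin);
+ *	}
+ *	-- repeat, pass by pass, until there are fewer runs than tapes;
+ *	-- then merge directly into btree leaf pages, constructing each
+ *	-- upper level as the level below it is completed.
+ */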
+
+#include <stdio.h>
+#include <stdlib.h>		/* for qsort() */
+#include <string.h>		/* for memcpy(), memset(), strcpy() */
+#include <fcntl.h>		/* for the O_* open flags used below */
+
+#include "c.h"
+
+#include "access/nbtree.h"
+
+#include "storage/bufmgr.h"
+#include "storage/fd.h"
+#include "utils/rel.h"
+#include "utils/palloc.h"
+#include "utils/elog.h"
+
+/*#define FASTBUILD_DEBUG*/ /* turn on debugging output */
+
+#define FASTBUILD
+
+#ifdef FASTBUILD
+
+#define MAXTAPES (7)
+#define TAPEBLCKSZ (BLCKSZ << 2)
+#define TAPETEMP "pg_btsortXXXXXX"
+
+
+/*-------------------------------------------------------------------------
+ * sorting comparison routine - returns {-1,0,1} depending on whether
+ * the key in the left BTItem is {<,=,>} the key in the right BTItem.
+ *
+ * we want to use _bt_isortcmp as a comparison function for qsort(3),
+ * but it needs extra arguments, so we "pass them in" as global
+ * variables. ick. fortunately, they are the same throughout the
+ * build, so we need do this only once. this is why you must call
+ * _bt_isortcmpinit before the call to qsort(3).
+ *
+ * a NULL BTItem is always assumed to be greater than any actual
+ * value; our heap routines (see below) assume that the smallest
+ * element in the heap is returned. that way, NULL values from the
+ * exhausted tapes can sift down to the bottom of the heap. in point
+ * of fact we just don't replace the elements of exhausted tapes, but
+ * what the heck.
+ *-------------------------------------------------------------------------
+ */
+static Relation _bt_sortrel;
+
+static void
+_bt_isortcmpinit(Relation index)
+{
+ _bt_sortrel = index;
+}
+
+static int
+_bt_isortcmp(BTItem *bti1p, BTItem *bti2p)
+{
+ BTItem bti1 = *bti1p;
+ BTItem bti2 = *bti2p;
+
+ if (bti1 == (BTItem) NULL) {
+ if (bti2 == (BTItem) NULL) {
+ return(0); /* 1 = 2 */
+ }
+ return(1); /* 1 > 2 */
+ } else if (bti2 == (BTItem) NULL) {
+ return(-1); /* 1 < 2 */
+ } else if (_bt_itemcmp(_bt_sortrel, 1, bti1, bti2,
+ BTGreaterStrategyNumber)) {
+ return(1); /* 1 > 2 */
+ } else if (_bt_itemcmp(_bt_sortrel, 1, bti2, bti1,
+ BTGreaterStrategyNumber)) {
+ return(-1); /* 1 < 2 */
+ }
+ return(0); /* 1 = 2 */
+}
+
+/*-------------------------------------------------------------------------
+ * priority queue methods
+ *
+ * these were more-or-less lifted from the heap section of the 1984
+ * edition of gonnet's book on algorithms and data structures. they
+ * are coded so that the smallest element in the heap is returned (we
+ * use them for merging sorted runs).
+ *
+ * XXX these probably ought to be generic library functions.
+ *-------------------------------------------------------------------------
+ */
+
+typedef struct {
+ int btpqe_tape; /* tape identifier */
+ BTItem btpqe_item; /* pointer to BTItem in tape buffer */
+} BTPriQueueElem;
+
+#define MAXELEM MAXTAPES
+typedef struct {
+ int btpq_nelem;
+ BTPriQueueElem btpq_queue[MAXELEM];
+ Relation btpq_rel;
+} BTPriQueue;
+
+/* be sure to call _bt_isortcmpinit first */
+#define GREATER(a, b) \
+ (_bt_isortcmp(&((a)->btpqe_item), &((b)->btpqe_item)) > 0)
+
+static void
+_bt_pqsift(BTPriQueue *q, int parent)
+{
+ int child;
+ BTPriQueueElem e;
+
+ for (child = parent * 2 + 1;
+ child < q->btpq_nelem;
+ child = parent * 2 + 1) {
+ if (child < q->btpq_nelem - 1) {
+ if (GREATER(&(q->btpq_queue[child]), &(q->btpq_queue[child+1]))) {
+ ++child;
+ }
+ }
+ if (GREATER(&(q->btpq_queue[parent]), &(q->btpq_queue[child]))) {
+ e = q->btpq_queue[child]; /* struct = */
+ q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */
+ q->btpq_queue[parent] = e; /* struct = */
+ parent = child;
+ } else {
+ parent = child + 1;
+ }
+ }
+}
+
+static int
+_bt_pqnext(BTPriQueue *q, BTPriQueueElem *e)
+{
+ if (q->btpq_nelem < 1) { /* already empty */
+ return(-1);
+ }
+ *e = q->btpq_queue[0]; /* struct = */
+
+ if (--q->btpq_nelem < 1) { /* now empty, don't sift */
+ return(0);
+ }
+ q->btpq_queue[0] = q->btpq_queue[q->btpq_nelem]; /* struct = */
+ _bt_pqsift(q, 0);
+ return(0);
+}
+
+static void
+_bt_pqadd(BTPriQueue *q, BTPriQueueElem *e)
+{
+ int child, parent;
+
+ if (q->btpq_nelem >= MAXELEM) {
+ elog(WARN, "_bt_pqadd: queue overflow");
+ }
+
+ child = q->btpq_nelem++;
+ while (child > 0) {
+ parent = child / 2;
+ if (GREATER(e, &(q->btpq_queue[parent]))) {
+ break;
+ } else {
+ q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */
+ child = parent;
+ }
+ }
+
+ q->btpq_queue[child] = *e; /* struct = */
+}
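+
+/*
+ * usage sketch for the queue (illustrative; the merge code later in
+ * this file is the real consumer): prime the queue with the head item
+ * from each input tape, then repeatedly take the smallest and replace
+ * it with the next item from the same tape.
+ *
+ *	BTPriQueue q;  BTPriQueueElem e;  int i;
+ *
+ *	q.btpq_nelem = 0;
+ *	for (i = 0; i < ntapes; ++i) {
+ *		e.btpqe_tape = i;
+ *		e.btpqe_item = (head item on tape i);
+ *		_bt_pqadd(&q, &e);
+ *	}
+ *	while (_bt_pqnext(&q, &e) >= 0) {
+ *		(emit e.btpqe_item);
+ *		e.btpqe_item = (next item from tape e.btpqe_tape);
+ *		if (e.btpqe_item != (BTItem) NULL)
+ *			_bt_pqadd(&q, &e);
+ *	}
+ */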
+
+/*-------------------------------------------------------------------------
+ * tape methods
+ *-------------------------------------------------------------------------
+ */
+
+#define BTITEMSZ(btitem) \
+ ((btitem) ? \
+ (IndexTupleDSize((btitem)->bti_itup) + \
+ (sizeof(BTItemData) - sizeof(IndexTupleData))) : \
+ 0)
+#define SPCLEFT(tape) \
+ (sizeof((tape)->bttb_data) - (tape)->bttb_top)
+#define EMPTYTAPE(tape) \
+ ((tape)->bttb_ntup <= 0)
+#define BTTAPEMAGIC 0x19660226
+
+/*
+ * this is what we use to shovel BTItems in and out of memory. it's
+ * bigger than a standard block because we are doing a lot of strictly
+ * sequential i/o. this is obviously something of a tradeoff since we
+ * are potentially reading a bunch of zeroes off of disk in many
+ * cases.
+ *
+ * BTItems are packed in and DOUBLEALIGN'd.
+ *
+ * the fd should not be going out to disk, strictly speaking, but it's
+ * the only thing like that so i'm not going to worry about wasting a
+ * few bytes.
+ */
+typedef struct {
+ int bttb_magic; /* magic number */
+ int bttb_fd; /* file descriptor */
+ int bttb_top; /* top of free space within bttb_data */
+ short bttb_ntup; /* number of tuples in this block */
+ short bttb_eor; /* End-Of-Run marker */
+ char bttb_data[TAPEBLCKSZ - 2 * sizeof(double)];
+} BTTapeBlock;
+
+
+/*
+ * reset the tape header for its next use without doing anything to
+ * the physical tape file. (setting bttb_top to 0 makes the block
+ * empty.)
+ */
+static void
+_bt_tapereset(BTTapeBlock *tape)
+{
+ tape->bttb_eor = 0;
+ tape->bttb_top = 0;
+ tape->bttb_ntup = 0;
+}
+
+/*
+ * rewind the physical tape file.
+ */
+static void
+_bt_taperewind(BTTapeBlock *tape)
+{
+ (void) FileSeek(tape->bttb_fd, 0, SEEK_SET);
+}
+
+/*
+ * destroy the contents of the physical tape file without destroying
+ * the tape data structure or removing the physical tape file.
+ *
+ * we use the VFD version of ftruncate(2) to do this rather than
+ * unlinking and recreating the file. you still have to wait while
+ * the OS frees up all of the file system blocks and stuff, but at
+ * least you don't have to delete and reinsert the directory entries.
+ */
+static void
+_bt_tapeclear(BTTapeBlock *tape)
+{
+ /* blow away the contents of the old file */
+ _bt_taperewind(tape);
+#if 0
+ FileSync(tape->bttb_fd);
+#endif
+ FileTruncate(tape->bttb_fd, 0);
+
+ /* reset the buffer */
+ _bt_tapereset(tape);
+}
+
+/*
+ * create a new BTTapeBlock, allocating memory for the data structure
+ * as well as opening a physical tape file.
+ */
+static BTTapeBlock *
+_bt_tapecreate(char *fname)
+{
+ BTTapeBlock *tape = (BTTapeBlock *) palloc(sizeof(BTTapeBlock));
+
+ if (tape == (BTTapeBlock *) NULL) {
+ elog(WARN, "_bt_tapecreate: out of memory");
+ }
+
+ tape->bttb_magic = BTTAPEMAGIC;
+
+ tape->bttb_fd = FileNameOpenFile(fname, O_RDWR|O_CREAT|O_TRUNC, 0600);
+ Assert(tape->bttb_fd >= 0);
+
+ /* initialize the buffer */
+ _bt_tapereset(tape);
+
+ return(tape);
+}
+
+/*
+ * destroy the BTTapeBlock structure and its physical tape file.
+ */
+static void
+_bt_tapedestroy(BTTapeBlock *tape)
+{
+ FileUnlink(tape->bttb_fd);
+ pfree((void *) tape);
+}
+
+/*
+ * flush the tape block to the file, marking End-Of-Run if requested.
+ */
+static void
+_bt_tapewrite(BTTapeBlock *tape, int eor)
+{
+ tape->bttb_eor = eor;
+ FileWrite(tape->bttb_fd, (char*)tape, TAPEBLCKSZ);
+ _bt_tapereset(tape);
+}
+
+/*
+ * read a tape block from the file, overwriting the current contents
+ * of the buffer.
+ *
+ * returns:
+ * - 0 if there are no more blocks in the tape or in this run (call
+ * _bt_tapereset to clear the End-Of-Run marker)
+ * - 1 if a valid block was read
+ */
+static int
+_bt_taperead(BTTapeBlock *tape)
+{
+ int fd;
+ int nread;
+
+ if (tape->bttb_eor) {
+ return(0); /* we are at End-Of-Run */
+ }
+
+ /*
+ * we're clobbering the old tape block, but we do need to save the
+ * VFD (the one in the block we're reading is bogus).
+ */
+ fd = tape->bttb_fd;
+ nread = FileRead(fd, (char*) tape, TAPEBLCKSZ);
+ tape->bttb_fd = fd;
+
+ if (nread != TAPEBLCKSZ) {
+ Assert(nread == 0); /* we are at EOF */
+ return(0);
+ }
+ Assert(tape->bttb_magic == BTTAPEMAGIC);
+ return(1);
+}
+
+/*
+ * get the next BTItem from a tape block.
+ *
+ * returns:
+ * - NULL if we have run out of BTItems
+ * - a pointer to the BTItemData in the block otherwise
+ *
+ * side effects:
+ * - sets 'pos' to the current position within the block.
+ */
+static BTItem
+_bt_tapenext(BTTapeBlock *tape, char **pos)
+{
+ Size itemsz;
+ BTItem bti;
+
+ if (*pos >= tape->bttb_data + tape->bttb_top) {
+ return((BTItem) NULL);
+ }
+ bti = (BTItem) *pos;
+ itemsz = BTITEMSZ(bti);
+ *pos += DOUBLEALIGN(itemsz);
+ return(bti);
+}
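+
+/*
+ * the canonical loop for consuming one run, combining _bt_taperead()
+ * and _bt_tapenext() (a sketch; assumes 'tape' is positioned at the
+ * start of the run):
+ *
+ *	while (_bt_taperead(tape)) {
+ *		char *pos = tape->bttb_data;
+ *		BTItem bti;
+ *
+ *		while ((bti = _bt_tapenext(tape, &pos)) != (BTItem) NULL)
+ *			(process bti);
+ *	}
+ *	_bt_tapereset(tape);		-- clear the End-Of-Run marker
+ */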
+
+/*
+ * copy a BTItem into a tape block.
+ *
+ * assumes that we have already checked to see if the block has enough
+ * space for the item.
+ *
+ * side effects:
+ *
+ * - advances the 'top' pointer in the tape block header to point to
+ * the beginning of free space.
+ */
+static void
+_bt_tapeadd(BTTapeBlock *tape, BTItem item, int itemsz)
+{
+ (void) memcpy(tape->bttb_data + tape->bttb_top, item, itemsz);
+ ++tape->bttb_ntup;
+ tape->bttb_top += DOUBLEALIGN(itemsz);
+}
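+
+/*
+ * callers are expected to pair _bt_tapeadd() with a SPCLEFT() check,
+ * along these lines (a sketch; see _bt_spool() below for the real
+ * thing):
+ *
+ *	itemsz = DOUBLEALIGN(BTITEMSZ(bti));
+ *	if (SPCLEFT(tape) < itemsz)
+ *		(sort and flush the full block first);
+ *	_bt_tapeadd(tape, bti, itemsz);
+ */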
+
+/*-------------------------------------------------------------------------
+ * spool methods
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * this structure holds the bookkeeping for a simple balanced multiway
+ * merge. (polyphase merging is hairier than i want to get into right
+ * now, and i don't see why i have to care how many "tapes" i use
+ * right now. though if psort was in a condition that i could hack it
+ * to do this, you bet i would.)
+ */
+typedef struct {
+ int bts_ntapes;
+ int bts_tape;
+ BTTapeBlock **bts_itape; /* input tape blocks */
+ BTTapeBlock **bts_otape; /* output tape blocks */
+} BTSpool;
+
+/*
+ * create and initialize a spool structure, including the underlying
+ * files.
+ */
+void *
+_bt_spoolinit(Relation index, int ntapes)
+{
+ char *mktemp();
+
+ BTSpool *btspool = (BTSpool *) palloc(sizeof(BTSpool));
+ int i;
+ char *fname = (char *) palloc(sizeof(TAPETEMP) + 1);
+
+ if (btspool == (BTSpool *) NULL || fname == (char *) NULL) {
+ elog(WARN, "_bt_spoolinit: out of memory");
+ }
+ (void) memset((char *) btspool, 0, sizeof(BTSpool));
+ btspool->bts_ntapes = ntapes;
+ btspool->bts_tape = 0;
+
+ btspool->bts_itape =
+ (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes);
+ btspool->bts_otape =
+ (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes);
+ if (btspool->bts_itape == (BTTapeBlock **) NULL ||
+ btspool->bts_otape == (BTTapeBlock **) NULL) {
+ elog(WARN, "_bt_spoolinit: out of memory");
+ }
+
+ for (i = 0; i < ntapes; ++i) {
+ btspool->bts_itape[i] =
+ _bt_tapecreate(mktemp(strcpy(fname, TAPETEMP)));
+ btspool->bts_otape[i] =
+ _bt_tapecreate(mktemp(strcpy(fname, TAPETEMP)));
+ }
+ pfree((void *) fname);
+
+ _bt_isortcmpinit(index);
+
+ return((void *) btspool);
+}
+
+/*
+ * clean up a spool structure and its substructures.
+ */
+void
+_bt_spooldestroy(void *spool)
+{
+ BTSpool *btspool = (BTSpool *) spool;
+ int i;
+
+ for (i = 0; i < btspool->bts_ntapes; ++i) {
+ _bt_tapedestroy(btspool->bts_otape[i]);
+ _bt_tapedestroy(btspool->bts_itape[i]);
+ }
+ pfree((void *) btspool);
+}
+
+/*
+ * flush out any dirty output tape blocks
+ */
+static void
+_bt_spoolflush(BTSpool *btspool)
+{
+ int i;
+
+ for (i = 0; i < btspool->bts_ntapes; ++i) {
+ if (!EMPTYTAPE(btspool->bts_otape[i])) {
+ _bt_tapewrite(btspool->bts_otape[i], 1);
+ }
+ }
+}
+
+/*
+ * swap input tapes and output tapes by swapping their file
+ * descriptors. additional preparation for the next merge pass
+ * includes rewinding the new input tapes and clearing out the new
+ * output tapes.
+ */
+static void
+_bt_spoolswap(BTSpool *btspool)
+{
+ File tmpfd;
+ BTTapeBlock *itape;
+ BTTapeBlock *otape;
+ int i;
+
+ for (i = 0; i < btspool->bts_ntapes; ++i) {
+ itape = btspool->bts_itape[i];
+ otape = btspool->bts_otape[i];
+
+ /*
+ * swap the input and output VFDs.
+ */
+ tmpfd = itape->bttb_fd;
+ itape->bttb_fd = otape->bttb_fd;
+ otape->bttb_fd = tmpfd;
+
+ /*
+ * rewind the new input tape.
+ */
+ _bt_taperewind(itape);
+ _bt_tapereset(itape);
+
+ /*
+ * clear the new output tape -- it's ok to throw away the old
+ * inputs.
+ */
+ _bt_tapeclear(otape);
+ }
+}
+
+/*-------------------------------------------------------------------------
+ * sorting routines
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * spool 'btitem' into an initial run. as tape blocks are filled, the
+ * block BTItems are qsorted and written into some output tape (it
+ * doesn't matter which; we go round-robin for simplicity). the
+ * initial runs are therefore always just one block.
+ */
+void
+_bt_spool(Relation index, BTItem btitem, void *spool)
+{
+ BTSpool *btspool = (BTSpool *) spool;
+ BTTapeBlock *itape;
+ Size itemsz;
+
+ itape = btspool->bts_itape[btspool->bts_tape];
+ itemsz = BTITEMSZ(btitem);
+ itemsz = DOUBLEALIGN(itemsz);
+
+ /*
+ * if this buffer is too full for this BTItemData, or if we have
+ * run out of BTItems, we need to sort the buffer and write it
+ * out. in this case, the BTItemData will go into the next tape's
+ * buffer.
+ */
+ if (btitem == (BTItem) NULL || SPCLEFT(itape) < itemsz) {
+ BTItem *parray;
+ BTTapeBlock *otape;
+ BTItem bti;
+ char *pos;
+ int btisz;
+ int i;
+
+ /*
+ * build an array of pointers to the BTItemDatas on the input
+ * block.
+ */
+ parray = (BTItem *) palloc(itape->bttb_ntup * sizeof(BTItem));
+ if (parray == (BTItem *) NULL) {
+ elog(WARN, "_bt_spool: out of memory");
+ }
+ pos = itape->bttb_data;
+ for (i = 0; i < itape->bttb_ntup; ++i) {
+ parray[i] = _bt_tapenext(itape, &pos);
+ }
+
+ /*
+ * qsort the pointer array.
+ */
+ _bt_isortcmpinit(index);
+ qsort((void *) parray, itape->bttb_ntup, sizeof(BTItem), _bt_isortcmp);
+
+ /*
+ * write the spooled run into the output tape. we copy the
+ * BTItemDatas in the order dictated by the sorted array of
+ * BTItems, not the original order.
+ *
+ * (since everything was DOUBLEALIGN'd and is all on a single
+ * page, everything had *better* still fit on one page..)
+ */
+ otape = btspool->bts_otape[btspool->bts_tape];
+ for (i = 0; i < itape->bttb_ntup; ++i) {
+ bti = parray[i];
+ btisz = BTITEMSZ(bti);
+ btisz = DOUBLEALIGN(btisz);
+ _bt_tapeadd(otape, bti, btisz);
+#ifdef FASTBUILD_DEBUG
+ {
+ bool isnull;
+ Datum d = index_getattr(&(bti->bti_itup), 1,
+ RelationGetTupleDescriptor(index),
+ &isnull);
+ printf("_bt_spool: inserted <%x> into output tape %d\n",
+ d, btspool->bts_tape);
+ }
+#endif /* FASTBUILD_DEBUG */
+ }
+
+ /*
+ * the initial runs are always single tape blocks. flush the
+ * output block, marking End-Of-Run.
+ */
+ _bt_tapewrite(otape, 1);
+
+ /*
+ * reset the input buffer for the next run. we don't have to
+ * write it out or anything -- we only use it to hold the
+ * unsorted BTItemDatas, the output tape contains all the
+ * sorted stuff.
+ *
+ * changing bts_tape changes the output tape and input tape;
+ * we change itape for the code below.
+ */
+ _bt_tapereset(itape);
+ btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes;
+ itape = btspool->bts_itape[btspool->bts_tape];
+
+ /*
+ * destroy the pointer array.
+ */
+ pfree((void *) parray);
+ }
+
+ /* insert this item into the current buffer */
+ if (btitem != (BTItem) NULL) {
+ _bt_tapeadd(itape, btitem, itemsz);
+ }
+}
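+
+/*
+ * for example (illustrative): with bts_ntapes = 3, the first full
+ * input block is sorted and written to output tape 0, the second to
+ * tape 1, the third to tape 2, the fourth to tape 0 again, and so
+ * on.  after spooling, each tape holds roughly one third of the
+ * initial single-block runs, which is the distribution _bt_merge
+ * expects.
+ */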
+
+/*
+ * allocate a new, clean btree page, not linked to any siblings.
+ */
+static void
+_bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags)
+{
+ BTPageOpaque opaque;
+
+ *buf = _bt_getbuf(index, P_NEW, BT_WRITE);
+ *page = BufferGetPage(*buf);
+ _bt_pageinit(*page, BufferGetPageSize(*buf));
+ opaque = (BTPageOpaque) PageGetSpecialPointer(*page);
+ opaque->btpo_prev = opaque->btpo_next = P_NONE;
+ opaque->btpo_flags = flags;
+}
+
+/*
+ * slide an array of ItemIds back one slot (from P_FIRSTKEY to
+ * P_HIKEY). we need to do this when we discover that we have built
+ * an ItemId array in what has turned out to be a P_RIGHTMOST page.
+ */
+static void
+_bt_slideleft(Relation index, Buffer buf, Page page)
+{
+ OffsetNumber off;
+ OffsetNumber maxoff;
+ ItemId previi;
+ ItemId thisii;
+
+ maxoff = PageGetMaxOffsetNumber(page);
+ previi = PageGetItemId(page, P_HIKEY);
+ for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off)) {
+ thisii = PageGetItemId(page, off);
+ *previi = *thisii;
+ previi = thisii;
+ }
+ ((PageHeader) page)->pd_lower -= sizeof(ItemIdData);
+}
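+
+/*
+ * for example (illustrative): a rightmost page whose ItemId array
+ * was built as
+ *
+ *	linp0 (unused placeholder)  linp1 -> item1  linp2 -> item2
+ *
+ * is slid to
+ *
+ *	linp0 -> item1  linp1 -> item2
+ *
+ * and pd_lower shrinks by one ItemIdData, since rightmost pages
+ * carry no high key.
+ */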
+
+typedef struct {
+ Buffer btps_buf;
+ Page btps_page;
+ BTItem btps_lastbti;
+ OffsetNumber btps_lastoff;
+ OffsetNumber btps_firstoff;
+} BTPageState;
+
+/*
+ * add an item to a disk page from a merge tape block.
+ *
+ * we must be careful to observe the following restrictions, placed
+ * upon us by the conventions in nbtsearch.c:
+ * - rightmost pages start data items at P_HIKEY instead of at
+ * P_FIRSTKEY.
+ * - duplicates cannot be split among pages unless the chain of
+ * duplicates starts at the first data item.
+ *
+ * a leaf page being built looks like:
+ *
+ * +----------------+---------------------------------+
+ * | PageHeaderData | linp0 linp1 linp2 ... |
+ * +-----------+----+---------------------------------+
+ * | ... linpN | ^ first |
+ * +-----------+--------------------------------------+
+ * | ^ last |
+ * | |
+ * | v last |
+ * +-------------+------------------------------------+
+ * | | itemN ... |
+ * +-------------+------------------+-----------------+
+ * | ... item3 item2 item1 | "special space" |
+ * +--------------------------------+-----------------+
+ * ^ first
+ *
+ * contrast this with the diagram in bufpage.h; note the mismatch
+ * between linps and items. this is because we reserve linp0 as a
+ * placeholder for the pointer to the "high key" item; when we have
+ * filled up the page, we will set linp0 to point to itemN and clear
+ * linpN.
+ *
+ * 'last' pointers indicate the last offset/item added to the page.
+ * 'first' pointers indicate the first offset/item that is part of a
+ * chain of duplicates extending from 'first' to 'last'.
+ *
+ * if all keys are unique, 'first' will always be the same as 'last'.
+ */
+static void
+_bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags)
+{
+ Buffer nbuf;
+ Page npage;
+ BTItem last_bti;
+ OffsetNumber first_off;
+ OffsetNumber last_off;
+ OffsetNumber off;
+ Size pgspc;
+ Size btisz;
+
+ nbuf = state->btps_buf;
+ npage = state->btps_page;
+ first_off = state->btps_firstoff;
+ last_off = state->btps_lastoff;
+ last_bti = state->btps_lastbti;
+
+ pgspc = PageGetFreeSpace(npage);
+ btisz = BTITEMSZ(bti);
+ btisz = DOUBLEALIGN(btisz);
+ if (pgspc < btisz) {
+ Buffer obuf = nbuf;
+ Page opage = npage;
+ OffsetNumber o, n;
+ ItemId ii;
+ ItemId hii;
+
+ _bt_blnewpage(index, &nbuf, &npage, flags);
+
+ /*
+ * if 'last' is part of a chain of duplicates that does not
+ * start at the beginning of the old page, the entire chain is
+ * copied to the new page; we delete all of the duplicates
+ * from the old page except the first, which becomes the high
+ * key item of the old page.
+ *
+ * if the chain starts at the beginning of the page or there
+ * is no chain ('first' == 'last'), we need only copy 'last'
+ * to the new page. again, 'first' (== 'last') becomes the
+ * high key of the old page.
+ *
+ * note that in either case, we copy at least one item to the
+ * new page, so 'last_bti' will always be valid. 'bti' will
+ * never be the first data item on the new page.
+ */
+ if (first_off == P_FIRSTKEY) {
+ Assert(last_off != P_FIRSTKEY);
+ first_off = last_off;
+ }
+ for (o = first_off, n = P_FIRSTKEY;
+ o <= last_off;
+ o = OffsetNumberNext(o), n = OffsetNumberNext(n)) {
+ ii = PageGetItemId(opage, o);
+ (void) PageAddItem(npage, PageGetItem(opage, ii),
+ ii->lp_len, n, LP_USED);
+#ifdef FASTBUILD_DEBUG
+ {
+ bool isnull;
+ BTItem tmpbti =
+ (BTItem) PageGetItem(npage, PageGetItemId(npage, n));
+ Datum d = index_getattr(&(tmpbti->bti_itup), 1,
+ RelationGetTupleDescriptor(index),
+ &isnull);
+ printf("_bt_buildadd: moved <%x> to offset %d\n",
+ d, n);
+ }
+#endif /* FASTBUILD_DEBUG */
+ }
+ for (o = last_off; o > first_off; o = OffsetNumberPrev(o)) {
+ PageIndexTupleDelete(opage, o);
+ }
+ hii = PageGetItemId(opage, P_HIKEY);
+ ii = PageGetItemId(opage, first_off);
+ *hii = *ii;
+ ii->lp_flags &= ~LP_USED;
+ ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
+
+ first_off = P_FIRSTKEY;
+ last_off = PageGetMaxOffsetNumber(npage);
+ last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, last_off));
+
+ /*
+ * set the page (side link) pointers.
+ */
+ {
+ BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
+ BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage);
+
+ oopaque->btpo_next = BufferGetBlockNumber(nbuf);
+ nopaque->btpo_prev = BufferGetBlockNumber(obuf);
+ nopaque->btpo_next = P_NONE;
+ }
+
+ /*
+ * write out the old stuff. we never want to see it again, so
+ * we can give up our lock (if we had one; BuildingBtree is
+ * set, so we aren't locking).
+ */
+ _bt_wrtbuf(index, obuf);
+ }
+
+ /*
+ * if this item is different from the last item added, we start a
+ * new chain of duplicates.
+ */
+ off = OffsetNumberNext(last_off);
+ (void) PageAddItem(npage, (Item) bti, btisz, off, LP_USED);
+#ifdef FASTBUILD_DEBUG
+ {
+ bool isnull;
+ Datum d = index_getattr(&(bti->bti_itup), 1,
+ RelationGetTupleDescriptor(index),
+ &isnull);
+ printf("_bt_buildadd: inserted <%x> at offset %d\n",
+ d, off);
+ }
+#endif /* FASTBUILD_DEBUG */
+ if (last_bti == (BTItem) NULL) {
+ first_off = P_FIRSTKEY;
+ } else if (!_bt_itemcmp(index, 1, bti, last_bti, BTEqualStrategyNumber)) {
+ first_off = off;
+ }
+ last_off = off;
+ last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, off));
+
+ state->btps_buf = nbuf;
+ state->btps_page = npage;
+ state->btps_lastbti = last_bti;
+ state->btps_lastoff = last_off;
+ state->btps_firstoff = first_off;
+}
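+
+/*
+ * for example (illustrative): suppose the page fills while offsets
+ * 'first' through 'last' hold a chain of equal keys that does not
+ * start at P_FIRSTKEY.  the whole chain is copied to the new page,
+ * every chain member except the first is deleted from the old page,
+ * and the first member's ItemId is moved into the linp0 slot to
+ * become the old page's high key -- so a run of duplicates is never
+ * split mid-chain across a page boundary.
+ */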
+
+/*
+ * take the input tapes stored by 'btspool' and perform successive
+ * merging passes until at most one run is left in each tape. at that
+ * point, merge the final tape runs into a set of btree leaves.
+ *
+ * XXX three nested loops? gross. cut me up into smaller routines.
+ */
+static BlockNumber
+_bt_merge(Relation index, BTSpool *btspool)
+{
+ BTPageState state;
+ BlockNumber firstblk;
+ BTPriQueue q;
+ BTPriQueueElem e;
+ BTItem bti;
+ BTTapeBlock *itape;
+ BTTapeBlock *otape;
+ char *tapepos[MAXTAPES];
+ int tapedone[MAXTAPES];
+ int t;
+ int goodtapes;
+ int nruns;
+ Size btisz;
+ bool doleaf = false;
+
+ /*
+ * initialize state needed for the merge into the btree leaf pages.
+ */
+ (void) memset((char *) &state, 0, sizeof(BTPageState));
+ _bt_blnewpage(index, &(state.btps_buf), &(state.btps_page), BTP_LEAF);
+ state.btps_lastoff = P_HIKEY;
+ state.btps_lastbti = (BTItem) NULL;
+ firstblk = BufferGetBlockNumber(state.btps_buf);
+
+ do { /* pass */
+ /*
+ * each pass starts by flushing the previous outputs and
+ * swapping inputs and outputs. this process also clears the
+ * new output tapes and rewinds the new input tapes.
+ */
+ btspool->bts_tape = btspool->bts_ntapes - 1;
+ _bt_spoolflush(btspool);
+ _bt_spoolswap(btspool);
+
+ nruns = 0;
+
+ for (;;) { /* run */
+ /*
+ * each run starts by selecting a new output tape. the
+ * merged results of a given run are always sent to this
+ * one tape.
+ */
+ btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes;
+ otape = btspool->bts_otape[btspool->bts_tape];
+
+ /*
+ * initialize the priority queue by loading it with the
+ * first element of the given run in each tape. since we
+ * are starting a new run, we reset the tape (clearing the
+ * End-Of-Run marker) before reading it. this means that
+ * _bt_taperead will return 0 only if the tape is actually
+ * at EOF.
+ */
+ (void) memset((char *) &q, 0, sizeof(BTPriQueue));
+ goodtapes = 0;
+ for (t = 0; t < btspool->bts_ntapes; ++t) {
+ itape = btspool->bts_itape[t];
+ tapepos[t] = itape->bttb_data;
+ _bt_tapereset(itape);
+ if (_bt_taperead(itape) == 0) {
+ tapedone[t] = 1;
+ } else {
+ ++goodtapes;
+ tapedone[t] = 0;
+ e.btpqe_tape = t;
+ e.btpqe_item = _bt_tapenext(itape, &tapepos[t]);
+ if (e.btpqe_item != (BTItem) NULL) {
+ _bt_pqadd(&q, &e);
+ }
+ }
+ }
+ /*
+ * if we don't have any tapes with any input (i.e., they
+ * are all at EOF), we must be done with this pass.
+ */
+ if (goodtapes == 0) {
+ break; /* for */
+ }
+ ++nruns;
+
+ /*
+ * output the smallest element from the queue until there are no
+ * more.
+ */
+ while (_bt_pqnext(&q, &e) >= 0) { /* item */
+ /*
+ * replace the element taken from priority queue,
+ * fetching a new block if needed. a tape can run out
+ * if it hits either End-Of-Run or EOF.
+ */
+ t = e.btpqe_tape;
+ bti = e.btpqe_item;
+ if (bti != (BTItem) NULL) {
+ btisz = BTITEMSZ(bti);
+ btisz = DOUBLEALIGN(btisz);
+ if (doleaf) {
+ _bt_buildadd(index, &state, bti, BTP_LEAF);
+#ifdef FASTBUILD_DEBUG
+ {
+ bool isnull;
+ Datum d = index_getattr(&(bti->bti_itup), 1,
+ RelationGetTupleDescriptor(index),
+ &isnull);
+ printf("_bt_merge: inserted <%x> into block %d\n",
+ d, BufferGetBlockNumber(state.btps_buf));
+ }
+#endif /* FASTBUILD_DEBUG */
+ } else {
+ if (SPCLEFT(otape) < btisz) {
+ /*
+ * if it's full, write it out and add the
+ * item to the next block. (since we know
+ * there will be at least one more block,
+ * we know we do *not* want to set
+ * End-Of-Run here!)
+ */
+ _bt_tapewrite(otape, 0);
+ }
+ _bt_tapeadd(otape, bti, btisz);
+#ifdef FASTBUILD_DEBUG
+ {
+ bool isnull;
+ Datum d = index_getattr(&(bti->bti_itup), 1,
+ RelationGetTupleDescriptor(index), &isnull);
+ printf("_bt_merge: inserted <%x> into tape %d\n",
+ d, btspool->bts_tape);
+ }
+#endif /* FASTBUILD_DEBUG */
+ }
+ }
+#ifdef FASTBUILD_DEBUG
+ {
+ bool isnull;
+ Datum d = index_getattr(&(bti->bti_itup), 1,
+ RelationGetTupleDescriptor(index),
+ &isnull);
+ printf("_bt_merge: got <%x> from tape %d\n", d, t);
+ }
+#endif /* FASTBUILD_DEBUG */
+
+ itape = btspool->bts_itape[t];
+ if (!tapedone[t]) {
+ BTItem newbti = _bt_tapenext(itape, &tapepos[t]);
+
+ if (newbti == (BTItem) NULL) {
+ if (_bt_taperead(itape) == 0) {
+ tapedone[t] = 1;
+ } else {
+ tapepos[t] = itape->bttb_data;
+ newbti = _bt_tapenext(itape, &tapepos[t]);
+ }
+ }
+ if (newbti != (BTItem) NULL) {
+ BTPriQueueElem nexte;
+
+ nexte.btpqe_tape = t;
+ nexte.btpqe_item = newbti;
+ _bt_pqadd(&q, &nexte);
+ }
+ }
+ } /* item */
+ } /* run */
+
+ /*
+ * we are here because we ran out of input on all of the input
+ * tapes.
+ *
+ * if this pass did not generate more actual output runs than
+ * we have tapes, we know we have at most one run in each
+ * tape. this means that we are ready to merge into the final
+ * btree leaf pages instead of merging into a tape file.
+ */
+ if (nruns <= btspool->bts_ntapes) {
+ doleaf = true;
+ }
+ } while (nruns > 0); /* pass */
+
+ /*
+ * this is the rightmost page, so the ItemId array needs to be
+ * slid back one slot.
+ */
+ _bt_slideleft(index, state.btps_buf, state.btps_page);
+ _bt_wrtbuf(index, state.btps_buf);
+
+ return(firstblk);
+}
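+
+/*
+ * a rough cost sketch (illustrative): each pass merges the runs on
+ * the input tapes ntapes at a time, so r initial single-block runs
+ * shrink to about r/ntapes runs per pass.  after roughly
+ * log_ntapes(r) passes, nruns <= ntapes holds, and the final merge
+ * writes btree leaf pages directly instead of another tape file.
+ */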
+
+
+/*
+ * given the block number 'blk' of the first page of a set of linked
+ * siblings (i.e., the start of an entire level of the btree),
+ * construct the corresponding next level of the btree. we do this by
+ * placing minimum keys from each page into this page. the format of
+ * the internal pages is otherwise the same as for leaf pages.
+ */
+void
+_bt_upperbuild(Relation index, BlockNumber blk, int level)
+{
+ Buffer rbuf;
+ Page rpage;
+ BTPageOpaque ropaque;
+ BTPageState state;
+ BlockNumber firstblk;
+ BTItem bti;
+ BTItem nbti;
+ OffsetNumber off;
+
+ rbuf = _bt_getbuf(index, blk, BT_WRITE);
+ rpage = BufferGetPage(rbuf);
+ ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage);
+
+ /*
+ * if we only have one page on a level, we can just make it the
+ * root.
+ */
+ if (P_RIGHTMOST(ropaque)) {
+ ropaque->btpo_flags |= BTP_ROOT;
+ _bt_wrtbuf(index, rbuf);
+ _bt_metaproot(index, blk);
+ return;
+ }
+ _bt_relbuf(index, rbuf, BT_WRITE);
+
+ (void) memset((char *) &state, 0, sizeof(BTPageState));
+ _bt_blnewpage(index, &(state.btps_buf), &(state.btps_page), 0);
+ state.btps_lastoff = P_HIKEY;
+ state.btps_lastbti = (BTItem) NULL;
+ firstblk = BufferGetBlockNumber(state.btps_buf);
+
+ /* for each page... */
+ do {
+ rbuf = _bt_getbuf(index, blk, BT_READ);
+ rpage = BufferGetPage(rbuf);
+ ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage);
+
+ /* for each item... */
+ if (!PageIsEmpty(rpage)) {
+ /*
+ * form a new index tuple corresponding to the minimum key
+ * of the lower page and insert it into a page at this
+ * level.
+ */
+ off = P_RIGHTMOST(ropaque) ? P_HIKEY : P_FIRSTKEY;
+ bti = (BTItem) PageGetItem(rpage, PageGetItemId(rpage, off));
+ nbti = _bt_formitem(&(bti->bti_itup));
+ ItemPointerSet(&(nbti->bti_itup.t_tid), blk, P_HIKEY);
+#ifdef FASTBUILD_DEBUG
+ {
+ bool isnull;
+ Datum d = index_getattr(&(nbti->bti_itup), 1,
+ RelationGetTupleDescriptor(index),
+ &isnull);
+ printf("_bt_upperbuild: inserting <%x> at %d\n",
+ d, level);
+ }
+#endif /* FASTBUILD_DEBUG */
+ _bt_buildadd(index, &state, nbti, 0);
+ pfree((void *) nbti);
+ }
+ blk = ropaque->btpo_next;
+ _bt_relbuf(index, rbuf, BT_READ);
+ } while (blk != P_NONE);
+
+ /*
+ * this is the rightmost page, so the ItemId array needs to be
+ * slid back one slot.
+ */
+ _bt_slideleft(index, state.btps_buf, state.btps_page);
+ _bt_wrtbuf(index, state.btps_buf);
+
+ _bt_upperbuild(index, firstblk, level + 1);
+}
+
+/*
+ * given a spool loading by successive calls to _bt_spool, create an
+ * entire btree.
+ */
+void
+_bt_leafbuild(Relation index, void *spool)
+{
+ BTSpool *btspool = (BTSpool *) spool;
+ BlockNumber firstblk;
+
+ /*
+ * merge the runs into btree leaf pages.
+ */
+ firstblk = _bt_merge(index, btspool);
+
+ /*
+ * build the upper levels of the btree.
+ */
+ _bt_upperbuild(index, firstblk, 0);
+}
+
+#else /* !FASTBUILD */
+
+void *_bt_spoolinit(Relation index, int ntapes) { return((void *) NULL); }
+void _bt_spooldestroy(void *spool) { }
+void _bt_spool(Relation index, BTItem btitem, void *spool) { }
+void _bt_upperbuild(Relation index, BlockNumber blk, int level) { }
+void _bt_leafbuild(Relation index, void *spool) { }
+
+#endif /* !FASTBUILD */
diff --git a/src/backend/access/nbtree/nbtstrat.c b/src/backend/access/nbtree/nbtstrat.c
new file mode 100644
index 00000000000..2214c60950d
--- /dev/null
+++ b/src/backend/access/nbtree/nbtstrat.c
@@ -0,0 +1,134 @@
+/*-------------------------------------------------------------------------
+ *
+ * btstrat.c--
+ *	Strategy map entries for the btree indexed access method
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/nbtstrat.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/genam.h"
+#include "access/nbtree.h"
+
+/*
+ * Note:
+ * StrategyNegate, StrategyCommute, and StrategyNegateCommute
+ * assume <, <=, ==, >=, > ordering.
+ */
+static StrategyNumber BTNegate[5] = {
+ BTGreaterEqualStrategyNumber,
+ BTGreaterStrategyNumber,
+ InvalidStrategy,
+ BTLessStrategyNumber,
+ BTLessEqualStrategyNumber
+};
+
+static StrategyNumber BTCommute[5] = {
+ BTGreaterStrategyNumber,
+ BTGreaterEqualStrategyNumber,
+ InvalidStrategy,
+ BTLessEqualStrategyNumber,
+ BTLessStrategyNumber
+};
+
+static StrategyNumber BTNegateCommute[5] = {
+ BTLessEqualStrategyNumber,
+ BTLessStrategyNumber,
+ InvalidStrategy,
+ BTGreaterStrategyNumber,
+ BTGreaterEqualStrategyNumber
+};
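+
+/*
+ * For example (illustrative): strategy 1 is "<", so
+ *	BTNegate[0]        == BTGreaterEqualStrategyNumber (NOT (a < b) is a >= b)
+ *	BTCommute[0]       == BTGreaterStrategyNumber ((a < b) commuted is b > a)
+ *	BTNegateCommute[0] == BTLessEqualStrategyNumber (negate of the commuted form)
+ */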
+
+static uint16 BTLessTermData[] = { /* XXX type clash */
+ 2,
+ BTLessStrategyNumber,
+ SK_NEGATE,
+ BTLessStrategyNumber,
+ SK_NEGATE | SK_COMMUTE
+};
+
+static uint16 BTLessEqualTermData[] = { /* XXX type clash */
+ 2,
+ BTLessEqualStrategyNumber,
+ 0x0,
+ BTLessEqualStrategyNumber,
+ SK_COMMUTE
+};
+
+static uint16 BTGreaterEqualTermData[] = { /* XXX type clash */
+ 2,
+ BTGreaterEqualStrategyNumber,
+ 0x0,
+ BTGreaterEqualStrategyNumber,
+ SK_COMMUTE
+};
+
+static uint16 BTGreaterTermData[] = { /* XXX type clash */
+ 2,
+ BTGreaterStrategyNumber,
+ SK_NEGATE,
+ BTGreaterStrategyNumber,
+ SK_NEGATE | SK_COMMUTE
+};
+
+static StrategyTerm BTEqualExpressionData[] = {
+ (StrategyTerm)BTLessTermData, /* XXX */
+ (StrategyTerm)BTLessEqualTermData, /* XXX */
+ (StrategyTerm)BTGreaterEqualTermData, /* XXX */
+ (StrategyTerm)BTGreaterTermData, /* XXX */
+ NULL
+};
+
+static StrategyEvaluationData BTEvaluationData = {
+ /* XXX static for simplicity */
+
+ BTMaxStrategyNumber,
+ (StrategyTransformMap)BTNegate, /* XXX */
+ (StrategyTransformMap)BTCommute, /* XXX */
+ (StrategyTransformMap)BTNegateCommute, /* XXX */
+
+ { NULL, NULL, (StrategyExpression)BTEqualExpressionData, NULL, NULL,
+ NULL,NULL,NULL,NULL,NULL,NULL,NULL}
+};
+
+/* ----------------------------------------------------------------
+ * RelationGetBTStrategy
+ * ----------------------------------------------------------------
+ */
+
+StrategyNumber
+_bt_getstrat(Relation rel,
+ AttrNumber attno,
+ RegProcedure proc)
+{
+ StrategyNumber strat;
+
+ strat = RelationGetStrategy(rel, attno, &BTEvaluationData, proc);
+
+ Assert(StrategyNumberIsValid(strat));
+
+ return (strat);
+}
+
+bool
+_bt_invokestrat(Relation rel,
+ AttrNumber attno,
+ StrategyNumber strat,
+ Datum left,
+ Datum right)
+{
+ return (RelationInvokeStrategy(rel, &BTEvaluationData, attno, strat,
+ left, right));
+}
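+
+/*
+ * Illustrative call (hypothetical Datums d1, d2 of the indexed
+ * attribute's type):
+ *
+ *	_bt_invokestrat(rel, 1, BTLessStrategyNumber, d1, d2)
+ *
+ * returns true iff d1 sorts before d2 under the ordering defined by
+ * the operator class of the first index attribute.
+ */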
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
new file mode 100644
index 00000000000..695a2b637c8
--- /dev/null
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -0,0 +1,239 @@
+/*-------------------------------------------------------------------------
+ *
+ * btutils.c--
+ * Utility code for Postgres btree implementation.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtutils.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <stdio.h>
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "fmgr.h"
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+#include "utils/datum.h"
+
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/iqual.h"
+#include "access/nbtree.h"
+
+ScanKey
+_bt_mkscankey(Relation rel, IndexTuple itup)
+{
+ ScanKey skey;
+ TupleDesc itupdesc;
+ int natts;
+ int i;
+ Datum arg;
+ RegProcedure proc;
+ bool null;
+
+ natts = rel->rd_rel->relnatts;
+ itupdesc = RelationGetTupleDescriptor(rel);
+
+ skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));
+
+ for (i = 0; i < natts; i++) {
+ arg = index_getattr(itup, i + 1, itupdesc, &null);
+ proc = index_getprocid(rel, i + 1, BTORDER_PROC);
+ ScanKeyEntryInitialize(&skey[i],
+ 0x0, (AttrNumber) (i + 1), proc, arg);
+ }
+
+ return (skey);
+}
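+
+/*
+ * Illustrative use (a hypothetical caller such as the insertion
+ * code): build a scan key describing an index tuple, descend the
+ * tree with it, then free it:
+ *
+ *	ScanKey skey = _bt_mkscankey(rel, itup);
+ *	... locate the insertion point using skey ...
+ *	_bt_freeskey(skey);
+ */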
+
+void
+_bt_freeskey(ScanKey skey)
+{
+ pfree(skey);
+}
+
+void
+_bt_freestack(BTStack stack)
+{
+ BTStack ostack;
+
+ while (stack != (BTStack) NULL) {
+ ostack = stack;
+ stack = stack->bts_parent;
+ pfree(ostack->bts_btitem);
+ pfree(ostack);
+ }
+}
+
+/*
+ * _bt_orderkeys() -- Put keys in a sensible order for conjunctive quals.
+ *
+ * The order of the keys in the qual is made to match the ordering
+ * imposed by the index. This routine only needs to be called if
+ * more than one qual clause uses this index.
+ */
+void
+_bt_orderkeys(Relation relation, uint16 *numberOfKeys, ScanKey key)
+{
+ ScanKey xform;
+ ScanKeyData *cur;
+ StrategyMap map;
+ int nbytes;
+ long test;
+ int i, j;
+ int init[BTMaxStrategyNumber+1];
+
+ /* haven't looked at any strategies yet */
+ for (i = 0; i <= BTMaxStrategyNumber; i++)
+ init[i] = 0;
+
+ /* get space for the modified array of keys */
+ nbytes = BTMaxStrategyNumber * sizeof(ScanKeyData);
+ xform = (ScanKey) palloc(nbytes);
+ memset(xform, 0, nbytes);
+
+
+ /* get the strategy map for this index/attribute pair */
+ /*
+ * XXX
+ * When we support multiple keys in a single index, this is what
+ * we'll want to do. At present, the planner is hosed, so we
+ * hard-wire the attribute number below. Postgres only does single-
+ * key indices...
+ * map = IndexStrategyGetStrategyMap(RelationGetIndexStrategy(relation),
+ * BTMaxStrategyNumber,
+ * key->data[0].attributeNumber);
+ */
+ map = IndexStrategyGetStrategyMap(RelationGetIndexStrategy(relation),
+ BTMaxStrategyNumber,
+ 1 /* XXX */ );
+
+ /* check each key passed in */
+ for (i = *numberOfKeys; --i >= 0; ) {
+ cur = &key[i];
+ for (j = BTMaxStrategyNumber; --j >= 0; ) {
+ if (cur->sk_procedure == map->entry[j].sk_procedure)
+ break;
+ }
+
+ /* have we seen one of these before? */
+ if (init[j]) {
+ /* yup, use the appropriate value */
+ test =
+ (long) FMGR_PTR2(cur->sk_func, cur->sk_procedure,
+ cur->sk_argument, xform[j].sk_argument);
+ if (test)
+ xform[j].sk_argument = cur->sk_argument;
+ } else {
+ /* nope, use this value */
+ memmove(&xform[j], cur, sizeof(*cur));
+
+ init[j] = 1;
+ }
+ }
+
+ /* if = has been specified, no other key will be used */
+ if (init[BTEqualStrategyNumber - 1]) {
+ init[BTLessStrategyNumber - 1] = 0;
+ init[BTLessEqualStrategyNumber - 1] = 0;
+ init[BTGreaterEqualStrategyNumber - 1] = 0;
+ init[BTGreaterStrategyNumber - 1] = 0;
+ }
+
+ /* only one of <, <= */
+ if (init[BTLessStrategyNumber - 1]
+ && init[BTLessEqualStrategyNumber - 1]) {
+
+ ScanKeyData *lt, *le;
+
+ lt = &xform[BTLessStrategyNumber - 1];
+ le = &xform[BTLessEqualStrategyNumber - 1];
+
+ /*
+ * DO NOT use the cached function stuff here -- this is key
+ * ordering, happens only when the user expresses a hokey
+ * qualification, and gets executed only once, anyway. The
+ * transform maps are hard-coded, and can't be initialized
+ * in the correct way.
+ */
+
+ test = (long) fmgr(le->sk_procedure, le->sk_argument, lt->sk_argument);
+
+ if (test)
+ init[BTLessEqualStrategyNumber - 1] = 0;
+ else
+ init[BTLessStrategyNumber - 1] = 0;
+ }
+
+ /* only one of >, >= */
+ if (init[BTGreaterStrategyNumber - 1]
+ && init[BTGreaterEqualStrategyNumber - 1]) {
+
+ ScanKeyData *gt, *ge;
+
+ gt = &xform[BTGreaterStrategyNumber - 1];
+ ge = &xform[BTGreaterEqualStrategyNumber - 1];
+
+ /* see note above on function cache */
+	test = (long) fmgr(ge->sk_procedure, ge->sk_argument, gt->sk_argument);
+
+ if (test)
+ init[BTGreaterStrategyNumber - 1] = 0;
+ else
+ init[BTGreaterEqualStrategyNumber - 1] = 0;
+ }
+
+ /* okay, reorder and count */
+ j = 0;
+
+ for (i = BTMaxStrategyNumber; --i >= 0; )
+ if (init[i])
+ key[j++] = xform[i];
+
+ *numberOfKeys = j;
+
+ pfree(xform);
+}
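+
+/*
+ * Worked example (illustrative): the conjunctive qual
+ *
+ *	a < 5 AND a < 3 AND a >= 1
+ *
+ * first collapses the two "<" keys to the tighter a < 3 via the
+ * cached comparison function, leaving at most one key per strategy;
+ * had an "=" key been present, all of the inequality keys would have
+ * been dropped instead.  The surviving keys are then emitted in a
+ * fixed strategy order.
+ */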
+
+bool
+_bt_checkqual(IndexScanDesc scan, IndexTuple itup)
+{
+ if (scan->numberOfKeys > 0)
+ return (index_keytest(itup, RelationGetTupleDescriptor(scan->relation),
+ scan->numberOfKeys, scan->keyData));
+ else
+ return (true);
+}
+
+BTItem
+_bt_formitem(IndexTuple itup)
+{
+ int nbytes_btitem;
+ BTItem btitem;
+ Size tuplen;
+ extern Oid newoid();
+
+ /* disallow nulls in btree keys */
+ if (itup->t_info & INDEX_NULL_MASK)
+ elog(WARN, "btree indices cannot include null keys");
+
+ /* make a copy of the index tuple with room for the sequence number */
+ tuplen = IndexTupleSize(itup);
+ nbytes_btitem = tuplen +
+ (sizeof(BTItemData) - sizeof(IndexTupleData));
+
+ btitem = (BTItem) palloc(nbytes_btitem);
+ memmove((char *) &(btitem->bti_itup), (char *) itup, tuplen);
+
+ btitem->bti_oid = newoid();
+ return (btitem);
+}
diff --git a/src/backend/access/printtup.h b/src/backend/access/printtup.h
new file mode 100644
index 00000000000..b5843daf7e0
--- /dev/null
+++ b/src/backend/access/printtup.h
@@ -0,0 +1,26 @@
+/*-------------------------------------------------------------------------
+ *
+ * printtup.h--
+ *
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: printtup.h,v 1.1.1.1 1996/07/09 06:21:08 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PRINTTUP_H
+#define PRINTTUP_H
+
+#include "access/htup.h"
+#include "access/tupdesc.h"
+
+extern Oid typtoout(Oid type);
+extern void printtup(HeapTuple tuple, TupleDesc typeinfo);
+extern void showatts(char *name, TupleDesc attinfo);
+extern void debugtup(HeapTuple tuple, TupleDesc typeinfo);
+extern void printtup_internal(HeapTuple tuple, TupleDesc typeinfo);
+extern Oid gettypelem(Oid type);
+
+#endif /* PRINTTUP_H */
diff --git a/src/backend/access/relscan.h b/src/backend/access/relscan.h
new file mode 100644
index 00000000000..7899e9d945f
--- /dev/null
+++ b/src/backend/access/relscan.h
@@ -0,0 +1,87 @@
+/*-------------------------------------------------------------------------
+ *
+ * relscan.h--
+ * POSTGRES internal relation scan descriptor definitions.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: relscan.h,v 1.1.1.1 1996/07/09 06:21:08 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef RELSCAN_H
+#define RELSCAN_H
+
+#include "c.h"
+
+#include "access/skey.h"
+#include "storage/buf.h"
+#include "access/htup.h"
+#include "storage/itemptr.h"
+
+#include "utils/tqual.h"
+#include "utils/rel.h"
+
+
+typedef ItemPointerData MarkData;
+
+typedef struct HeapScanDescData {
+ Relation rs_rd; /* pointer to relation descriptor */
+ HeapTuple rs_ptup; /* previous tuple in scan */
+ HeapTuple rs_ctup; /* current tuple in scan */
+ HeapTuple rs_ntup; /* next tuple in scan */
+ Buffer rs_pbuf; /* previous buffer in scan */
+ Buffer rs_cbuf; /* current buffer in scan */
+ Buffer rs_nbuf; /* next buffer in scan */
+ ItemPointerData rs_mptid; /* marked previous tid */
+ ItemPointerData rs_mctid; /* marked current tid */
+ ItemPointerData rs_mntid; /* marked next tid */
+ ItemPointerData rs_mcd; /* marked current delta XXX ??? */
+ bool rs_atend; /* restart scan at end? */
+ TimeQual rs_tr; /* time qualification */
+ uint16 rs_cdelta; /* current delta in chain */
+ uint16 rs_nkeys; /* number of attributes in keys */
+ ScanKey rs_key; /* key descriptors */
+} HeapScanDescData;
+
+typedef HeapScanDescData *HeapScanDesc;
+
+typedef struct IndexScanDescData {
+ Relation relation; /* relation descriptor */
+ void *opaque; /* am-specific slot */
+ ItemPointerData previousItemData; /* previous index pointer */
+ ItemPointerData currentItemData; /* current index pointer */
+ ItemPointerData nextItemData; /* next index pointer */
+ MarkData previousMarkData; /* marked previous pointer */
+ MarkData currentMarkData; /* marked current pointer */
+ MarkData nextMarkData; /* marked next pointer */
+ uint8 flags; /* scan position flags */
+ bool scanFromEnd; /* restart scan at end? */
+ uint16 numberOfKeys; /* number of key attributes */
+ ScanKey keyData; /* key descriptor */
+} IndexScanDescData;
+
+typedef IndexScanDescData *IndexScanDesc;
+
+/* ----------------
+ * IndexScanDescPtr is used in the executor where we have to
+ * keep track of several index scans when using several indices
+ * - cim 9/10/89
+ * ----------------
+ */
+typedef IndexScanDesc *IndexScanDescPtr;
+
+/*
+ * HeapScanIsValid --
+ * True iff the heap scan is valid.
+ */
+#define HeapScanIsValid(scan) PointerIsValid(scan)
+
+/*
+ * IndexScanIsValid --
+ * True iff the index scan is valid.
+ */
+#define IndexScanIsValid(scan) PointerIsValid(scan)
+
+#endif /* RELSCAN_H */
diff --git a/src/backend/access/rtree.h b/src/backend/access/rtree.h
new file mode 100644
index 00000000000..79f1622e48b
--- /dev/null
+++ b/src/backend/access/rtree.h
@@ -0,0 +1,98 @@
+/*-------------------------------------------------------------------------
+ *
+ * rtree.h--
+ * common declarations for the rtree access method code.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: rtree.h,v 1.1.1.1 1996/07/09 06:21:08 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef RTREE_H
+#define RTREE_H
+
+/* see rtstrat.c for what all this is about */
+#define RTNStrategies 8
+#define RTLeftStrategyNumber 1
+#define RTOverLeftStrategyNumber 2
+#define RTOverlapStrategyNumber 3
+#define RTOverRightStrategyNumber 4
+#define RTRightStrategyNumber 5
+#define RTSameStrategyNumber 6
+#define RTContainsStrategyNumber 7
+#define RTContainedByStrategyNumber 8
+
+#define RTNProcs 3
+#define RT_UNION_PROC 1
+#define RT_INTER_PROC 2
+#define RT_SIZE_PROC 3
+
+#define F_LEAF (1 << 0)
+
+typedef struct RTreePageOpaqueData {
+ uint32 flags;
+} RTreePageOpaqueData;
+
+typedef RTreePageOpaqueData *RTreePageOpaque;
+
+/*
+ * When we descend a tree, we keep a stack of parent pointers.
+ */
+
+typedef struct RTSTACK {
+ struct RTSTACK *rts_parent;
+ OffsetNumber rts_child;
+ BlockNumber rts_blk;
+} RTSTACK;
+
+/*
+ * When we're doing a scan, we need to keep track of the parent stack
+ * for the marked and current items. Also, rtrees have the following
+ * property: if you're looking for the box (1,1,2,2), on the internal
+ * nodes you have to search for all boxes that *contain* (1,1,2,2), and
+ * not the ones that match it. We have a private scan key for internal
+ * nodes in the opaque structure for rtrees for this reason. See
+ * access/index-rtree/rtscan.c and rtstrat.c for how it gets initialized.
+ */
+
+typedef struct RTreeScanOpaqueData {
+ struct RTSTACK *s_stack;
+ struct RTSTACK *s_markstk;
+ uint16 s_flags;
+ uint16 s_internalNKey;
+ ScanKey s_internalKey;
+} RTreeScanOpaqueData;
+
+typedef RTreeScanOpaqueData *RTreeScanOpaque;
+
+/*
+ * When we're doing a scan and updating a tree at the same time, the
+ * updates may affect the scan. We use the flags entry of the scan's
+ * opaque space to record our actual position in response to updates
+ * that we can't handle simply by adjusting pointers.
+ */
+
+#define RTS_CURBEFORE ((uint16) (1 << 0))
+#define RTS_MRKBEFORE ((uint16) (1 << 1))
+
+/* root page of an rtree */
+#define P_ROOT 0
+
+/*
+ * When we update a relation on which we're doing a scan, we need to
+ * check the scan and fix it if the update affected any of the pages it
+ * touches. Otherwise, we can miss records that we should see. The only
+ * times we need to do this are for deletions and splits. See the code in
+ * rtscan.c for how the scan is fixed. These two constants tell us what sort
+ * of operation changed the index.
+ */
+
+#define RTOP_DEL 0
+#define RTOP_SPLIT 1
+
+/* defined in rtree.c */
+extern void freestack(RTSTACK *s);
+
+#endif /* RTREE_H */
diff --git a/src/backend/access/rtree/Makefile.inc b/src/backend/access/rtree/Makefile.inc
new file mode 100644
index 00000000000..a93a5e53290
--- /dev/null
+++ b/src/backend/access/rtree/Makefile.inc
@@ -0,0 +1,14 @@
+#-------------------------------------------------------------------------
+#
+# Makefile.inc--
+# Makefile for access/rtree (R-Tree access method)
+#
+# Copyright (c) 1994, Regents of the University of California
+#
+#
+# IDENTIFICATION
+# $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
+#
+#-------------------------------------------------------------------------
+
+SUBSRCS+= rtget.c rtproc.c rtree.c rtscan.c rtstrat.c
diff --git a/src/backend/access/rtree/rtget.c b/src/backend/access/rtree/rtget.c
new file mode 100644
index 00000000000..fb2e169297d
--- /dev/null
+++ b/src/backend/access/rtree/rtget.c
@@ -0,0 +1,320 @@
+/*-------------------------------------------------------------------------
+ *
+ * rtget.c--
+ * fetch tuples from an rtree scan.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtget.c,v 1.1.1.1 1996/07/09 06:21:13 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/iqual.h"
+#include "access/rtree.h"
+#include "access/sdir.h"
+
+static OffsetNumber findnext(IndexScanDesc s, Page p, OffsetNumber n,
+ ScanDirection dir);
+static RetrieveIndexResult rtscancache(IndexScanDesc s, ScanDirection dir);
+static RetrieveIndexResult rtfirst(IndexScanDesc s, ScanDirection dir);
+static RetrieveIndexResult rtnext(IndexScanDesc s, ScanDirection dir);
+static ItemPointer rtheapptr(Relation r, ItemPointer itemp);
+
+
+RetrieveIndexResult
+rtgettuple(IndexScanDesc s, ScanDirection dir)
+{
+ RetrieveIndexResult res;
+
+ /* if we have it cached in the scan desc, just return the value */
+ if ((res = rtscancache(s, dir)) != (RetrieveIndexResult) NULL)
+ return (res);
+
+ /* not cached, so we'll have to do some work */
+ if (ItemPointerIsValid(&(s->currentItemData))) {
+ res = rtnext(s, dir);
+ } else {
+ res = rtfirst(s, dir);
+ }
+ return (res);
+}
+
+static RetrieveIndexResult
+rtfirst(IndexScanDesc s, ScanDirection dir)
+{
+ Buffer b;
+ Page p;
+ OffsetNumber n;
+ OffsetNumber maxoff;
+ RetrieveIndexResult res;
+ RTreePageOpaque po;
+ RTreeScanOpaque so;
+ RTSTACK *stk;
+ BlockNumber blk;
+ IndexTuple it;
+ ItemPointer ip;
+
+ b = ReadBuffer(s->relation, P_ROOT);
+ p = BufferGetPage(b);
+ po = (RTreePageOpaque) PageGetSpecialPointer(p);
+ so = (RTreeScanOpaque) s->opaque;
+
+ for (;;) {
+ maxoff = PageGetMaxOffsetNumber(p);
+ if (ScanDirectionIsBackward(dir))
+ n = findnext(s, p, maxoff, dir);
+ else
+ n = findnext(s, p, FirstOffsetNumber, dir);
+
+ while (n < FirstOffsetNumber || n > maxoff) {
+
+ ReleaseBuffer(b);
+ if (so->s_stack == (RTSTACK *) NULL)
+ return ((RetrieveIndexResult) NULL);
+
+ stk = so->s_stack;
+ b = ReadBuffer(s->relation, stk->rts_blk);
+ p = BufferGetPage(b);
+ po = (RTreePageOpaque) PageGetSpecialPointer(p);
+ maxoff = PageGetMaxOffsetNumber(p);
+
+ if (ScanDirectionIsBackward(dir)) {
+ n = OffsetNumberPrev(stk->rts_child);
+ } else {
+ n = OffsetNumberNext(stk->rts_child);
+ }
+ so->s_stack = stk->rts_parent;
+ pfree(stk);
+
+ n = findnext(s, p, n, dir);
+ }
+ if (po->flags & F_LEAF) {
+ ItemPointerSet(&(s->currentItemData), BufferGetBlockNumber(b), n);
+
+ it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
+ ip = (ItemPointer) palloc(sizeof(ItemPointerData));
+ memmove((char *) ip, (char *) &(it->t_tid),
+ sizeof(ItemPointerData));
+ ReleaseBuffer(b);
+
+ res = FormRetrieveIndexResult(&(s->currentItemData), ip);
+
+ return (res);
+ } else {
+ stk = (RTSTACK *) palloc(sizeof(RTSTACK));
+ stk->rts_child = n;
+ stk->rts_blk = BufferGetBlockNumber(b);
+ stk->rts_parent = so->s_stack;
+ so->s_stack = stk;
+
+ it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
+ blk = ItemPointerGetBlockNumber(&(it->t_tid));
+
+ ReleaseBuffer(b);
+ b = ReadBuffer(s->relation, blk);
+ p = BufferGetPage(b);
+ po = (RTreePageOpaque) PageGetSpecialPointer(p);
+ }
+ }
+}
+
+static RetrieveIndexResult
+rtnext(IndexScanDesc s, ScanDirection dir)
+{
+ Buffer b;
+ Page p;
+ OffsetNumber n;
+ OffsetNumber maxoff;
+ RetrieveIndexResult res;
+ RTreePageOpaque po;
+ RTreeScanOpaque so;
+ RTSTACK *stk;
+ BlockNumber blk;
+ IndexTuple it;
+ ItemPointer ip;
+
+ blk = ItemPointerGetBlockNumber(&(s->currentItemData));
+ n = ItemPointerGetOffsetNumber(&(s->currentItemData));
+
+ if (ScanDirectionIsForward(dir)) {
+ n = OffsetNumberNext(n);
+ } else {
+ n = OffsetNumberPrev(n);
+ }
+
+ b = ReadBuffer(s->relation, blk);
+ p = BufferGetPage(b);
+ po = (RTreePageOpaque) PageGetSpecialPointer(p);
+ so = (RTreeScanOpaque) s->opaque;
+
+ for (;;) {
+ maxoff = PageGetMaxOffsetNumber(p);
+ n = findnext(s, p, n, dir);
+
+ while (n < FirstOffsetNumber || n > maxoff) {
+
+ ReleaseBuffer(b);
+ if (so->s_stack == (RTSTACK *) NULL)
+ return ((RetrieveIndexResult) NULL);
+
+ stk = so->s_stack;
+ b = ReadBuffer(s->relation, stk->rts_blk);
+ p = BufferGetPage(b);
+ maxoff = PageGetMaxOffsetNumber(p);
+ po = (RTreePageOpaque) PageGetSpecialPointer(p);
+
+ if (ScanDirectionIsBackward(dir)) {
+ n = OffsetNumberPrev(stk->rts_child);
+ } else {
+ n = OffsetNumberNext(stk->rts_child);
+ }
+ so->s_stack = stk->rts_parent;
+ pfree(stk);
+
+ n = findnext(s, p, n, dir);
+ }
+ if (po->flags & F_LEAF) {
+ ItemPointerSet(&(s->currentItemData), BufferGetBlockNumber(b), n);
+
+ it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
+ ip = (ItemPointer) palloc(sizeof(ItemPointerData));
+ memmove((char *) ip, (char *) &(it->t_tid),
+ sizeof(ItemPointerData));
+ ReleaseBuffer(b);
+
+ res = FormRetrieveIndexResult(&(s->currentItemData), ip);
+
+ return (res);
+ } else {
+ stk = (RTSTACK *) palloc(sizeof(RTSTACK));
+ stk->rts_child = n;
+ stk->rts_blk = BufferGetBlockNumber(b);
+ stk->rts_parent = so->s_stack;
+ so->s_stack = stk;
+
+ it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
+ blk = ItemPointerGetBlockNumber(&(it->t_tid));
+
+ ReleaseBuffer(b);
+ b = ReadBuffer(s->relation, blk);
+ p = BufferGetPage(b);
+ po = (RTreePageOpaque) PageGetSpecialPointer(p);
+
+ if (ScanDirectionIsBackward(dir)) {
+ n = PageGetMaxOffsetNumber(p);
+ } else {
+ n = FirstOffsetNumber;
+ }
+ }
+ }
+}
+
+static OffsetNumber
+findnext(IndexScanDesc s, Page p, OffsetNumber n, ScanDirection dir)
+{
+ OffsetNumber maxoff;
+ IndexTuple it;
+ RTreePageOpaque po;
+ RTreeScanOpaque so;
+
+ maxoff = PageGetMaxOffsetNumber(p);
+ po = (RTreePageOpaque) PageGetSpecialPointer(p);
+ so = (RTreeScanOpaque) s->opaque;
+
+ /*
+ * If we modified the index during the scan, we may have a pointer to
+ * a ghost tuple just before the current scan position. If this is
+ * the case, back up one.
+ */
+
+ if (so->s_flags & RTS_CURBEFORE) {
+ so->s_flags &= ~RTS_CURBEFORE;
+ n = OffsetNumberPrev(n);
+ }
+
+ while (n >= FirstOffsetNumber && n <= maxoff) {
+ it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
+ if (po->flags & F_LEAF) {
+ if (index_keytest(it,
+ RelationGetTupleDescriptor(s->relation),
+ s->numberOfKeys, s->keyData))
+ break;
+ } else {
+ if (index_keytest(it,
+ RelationGetTupleDescriptor(s->relation),
+ so->s_internalNKey, so->s_internalKey))
+ break;
+ }
+
+ if (ScanDirectionIsBackward(dir)) {
+ n = OffsetNumberPrev(n);
+ } else {
+ n = OffsetNumberNext(n);
+ }
+ }
+
+ return (n);
+}
+
+static RetrieveIndexResult
+rtscancache(IndexScanDesc s, ScanDirection dir)
+{
+ RetrieveIndexResult res;
+ ItemPointer ip;
+
+ if (!(ScanDirectionIsNoMovement(dir)
+ && ItemPointerIsValid(&(s->currentItemData)))) {
+
+ return ((RetrieveIndexResult) NULL);
+ }
+
+ ip = rtheapptr(s->relation, &(s->currentItemData));
+
+ if (ItemPointerIsValid(ip))
+ res = FormRetrieveIndexResult(&(s->currentItemData), ip);
+ else
+ res = (RetrieveIndexResult) NULL;
+
+ return (res);
+}
+
+/*
+ * rtheapptr returns the item pointer to the tuple in the heap relation
+ * for which itemp is the index relation item pointer.
+ */
+static ItemPointer
+rtheapptr(Relation r, ItemPointer itemp)
+{
+ Buffer b;
+ Page p;
+ IndexTuple it;
+ ItemPointer ip;
+ OffsetNumber n;
+
+ ip = (ItemPointer) palloc(sizeof(ItemPointerData));
+ if (ItemPointerIsValid(itemp)) {
+ b = ReadBuffer(r, ItemPointerGetBlockNumber(itemp));
+ p = BufferGetPage(b);
+ n = ItemPointerGetOffsetNumber(itemp);
+ it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
+ memmove((char *) ip, (char *) &(it->t_tid),
+ sizeof(ItemPointerData));
+ ReleaseBuffer(b);
+ } else {
+ ItemPointerSetInvalid(ip);
+ }
+
+ return (ip);
+}
diff --git a/src/backend/access/rtree/rtproc.c b/src/backend/access/rtree/rtproc.c
new file mode 100644
index 00000000000..a2f7bef46b4
--- /dev/null
+++ b/src/backend/access/rtree/rtproc.c
@@ -0,0 +1,150 @@
+/*-------------------------------------------------------------------------
+ *
+ * rtproc.c--
+ * pg_amproc entries for rtrees.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtproc.c,v 1.1.1.1 1996/07/09 06:21:13 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <math.h>
+#include <string.h>
+
+#include "postgres.h"
+
+#include "utils/elog.h"
+#include "utils/geo-decls.h"
+#include "utils/palloc.h"
+
+BOX *
+rt_box_union(BOX *a, BOX *b)
+{
+ BOX *n;
+
+ if ((n = (BOX *) palloc(sizeof (*n))) == (BOX *) NULL)
+ elog(WARN, "Cannot allocate box for union");
+
+ n->xh = Max(a->xh, b->xh);
+ n->yh = Max(a->yh, b->yh);
+ n->xl = Min(a->xl, b->xl);
+ n->yl = Min(a->yl, b->yl);
+
+ return (n);
+}
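+
+/*
+ * For example (illustrative): the union of the boxes (xl,yl,xh,yh) =
+ * (0,0,1,1) and (2,2,3,3) is their bounding box (0,0,3,3).
+ */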
+
+BOX *
+rt_box_inter(BOX *a, BOX *b)
+{
+ BOX *n;
+
+ if ((n = (BOX *) palloc(sizeof (*n))) == (BOX *) NULL)
+		elog(WARN, "Cannot allocate box for intersection");
+
+ n->xh = Min(a->xh, b->xh);
+ n->yh = Min(a->yh, b->yh);
+ n->xl = Max(a->xl, b->xl);
+ n->yl = Max(a->yl, b->yl);
+
+ if (n->xh < n->xl || n->yh < n->yl) {
+ pfree(n);
+ return ((BOX *) NULL);
+ }
+
+ return (n);
+}
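+
+/*
+ * For example (illustrative): intersecting (0,0,2,2) with (1,1,3,3)
+ * yields (1,1,2,2); intersecting the disjoint boxes (0,0,1,1) and
+ * (2,2,3,3) yields no overlap, so NULL is returned.
+ */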
+
+void
+rt_box_size(BOX *a, float *size)
+{
+ if (a == (BOX *) NULL || a->xh <= a->xl || a->yh <= a->yl)
+ *size = 0.0;
+ else
+ *size = (float) ((a->xh - a->xl) * (a->yh - a->yl));
+
+ return;
+}
+
+/*
+ * rt_bigbox_size() -- Compute a size for big boxes.
+ *
+ * In an earlier release of the system, this routine did something
+ * different from rt_box_size. We now use floats, rather than ints,
+ * as the return type for the size routine, so we no longer need to
+ * have a special return type for big boxes.
+ */
+void
+rt_bigbox_size(BOX *a, float *size)
+{
+ rt_box_size(a, size);
+}
+
+POLYGON *
+rt_poly_union(POLYGON *a, POLYGON *b)
+{
+ POLYGON *p;
+
+ p = (POLYGON *)PALLOCTYPE(POLYGON);
+
+ if (!PointerIsValid(p))
+ elog(WARN, "Cannot allocate polygon for union");
+
+ memset((char *) p, 0, sizeof(POLYGON)); /* zero any holes */
+ p->size = sizeof(POLYGON);
+ p->npts = 0;
+ p->boundbox.xh = Max(a->boundbox.xh, b->boundbox.xh);
+ p->boundbox.yh = Max(a->boundbox.yh, b->boundbox.yh);
+ p->boundbox.xl = Min(a->boundbox.xl, b->boundbox.xl);
+ p->boundbox.yl = Min(a->boundbox.yl, b->boundbox.yl);
+ return p;
+}
+
+void
+rt_poly_size(POLYGON *a, float *size)
+{
+ double xdim, ydim;
+
+ if (a == (POLYGON *) NULL ||
+ a->boundbox.xh <= a->boundbox.xl ||
+ a->boundbox.yh <= a->boundbox.yl)
+ *size = 0.0;
+ else {
+ xdim = (a->boundbox.xh - a->boundbox.xl);
+ ydim = (a->boundbox.yh - a->boundbox.yl);
+
+ *size = (float) (xdim * ydim);
+ }
+
+ return;
+}
+
+POLYGON *
+rt_poly_inter(POLYGON *a, POLYGON *b)
+{
+ POLYGON *p;
+
+ p = (POLYGON *) PALLOCTYPE(POLYGON);
+
+ if (!PointerIsValid(p))
+ elog(WARN, "Cannot allocate polygon for intersection");
+
+ memset((char *) p, 0, sizeof(POLYGON)); /* zero any holes */
+ p->size = sizeof(POLYGON);
+ p->npts = 0;
+ p->boundbox.xh = Min(a->boundbox.xh, b->boundbox.xh);
+ p->boundbox.yh = Min(a->boundbox.yh, b->boundbox.yh);
+ p->boundbox.xl = Max(a->boundbox.xl, b->boundbox.xl);
+ p->boundbox.yl = Max(a->boundbox.yl, b->boundbox.yl);
+
+ if (p->boundbox.xh < p->boundbox.xl || p->boundbox.yh < p->boundbox.yl)
+ {
+ pfree(p);
+ return ((POLYGON *) NULL);
+ }
+
+ return (p);
+}
diff --git a/src/backend/access/rtree/rtree.c b/src/backend/access/rtree/rtree.c
new file mode 100644
index 00000000000..96efc3bc90b
--- /dev/null
+++ b/src/backend/access/rtree/rtree.c
@@ -0,0 +1,955 @@
+/*-------------------------------------------------------------------------
+ *
+ * rtree.c--
+ * interface routines for the postgres rtree indexed access method.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtree.c,v 1.1.1.1 1996/07/09 06:21:13 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/rtree.h"
+#include "access/rtscan.h"
+#include "access/funcindex.h"
+#include "access/tupdesc.h"
+
+#include "nodes/execnodes.h"
+#include "nodes/plannodes.h"
+
+#include "executor/executor.h"
+#include "executor/tuptable.h"
+
+#include "catalog/index.h"
+
+typedef struct SPLITVEC {
+ OffsetNumber *spl_left;
+ int spl_nleft;
+ char *spl_ldatum;
+ OffsetNumber *spl_right;
+ int spl_nright;
+ char *spl_rdatum;
+} SPLITVEC;
+
+typedef struct RTSTATE {
+ func_ptr unionFn; /* union function */
+ func_ptr sizeFn; /* size function */
+ func_ptr interFn; /* intersection function */
+} RTSTATE;
+
+/* non-export function prototypes */
+static InsertIndexResult rtdoinsert(Relation r, IndexTuple itup,
+ RTSTATE *rtstate);
+static void rttighten(Relation r, RTSTACK *stk, char *datum, int att_size,
+ RTSTATE *rtstate);
+static InsertIndexResult dosplit(Relation r, Buffer buffer, RTSTACK *stack,
+ IndexTuple itup, RTSTATE *rtstate);
+static void rtintinsert(Relation r, RTSTACK *stk, IndexTuple ltup,
+ IndexTuple rtup, RTSTATE *rtstate);
+static void rtnewroot(Relation r, IndexTuple lt, IndexTuple rt);
+static void picksplit(Relation r, Page page, SPLITVEC *v, IndexTuple itup,
+ RTSTATE *rtstate);
+static void RTInitBuffer(Buffer b, uint32 f);
+static OffsetNumber choose(Relation r, Page p, IndexTuple it,
+ RTSTATE *rtstate);
+static int nospace(Page p, IndexTuple it);
+static void initRtstate(RTSTATE *rtstate, Relation index);
+
+
+void
+rtbuild(Relation heap,
+ Relation index,
+ int natts,
+ AttrNumber *attnum,
+ IndexStrategy istrat,
+ uint16 pcount,
+ Datum *params,
+ FuncIndexInfo *finfo,
+ PredInfo *predInfo)
+{
+ HeapScanDesc scan;
+ Buffer buffer;
+ AttrNumber i;
+ HeapTuple htup;
+ IndexTuple itup;
+ TupleDesc hd, id;
+ InsertIndexResult res;
+ Datum *d;
+ bool *nulls;
+ int nb, nh, ni;
+ ExprContext *econtext;
+ TupleTable tupleTable;
+ TupleTableSlot *slot;
+ Oid hrelid, irelid;
+ Node *pred, *oldPred;
+ RTSTATE rtState;
+
+ initRtstate(&rtState, index);
+
+ /* rtrees only know how to do stupid locking now */
+ RelationSetLockForWrite(index);
+
+ pred = predInfo->pred;
+ oldPred = predInfo->oldPred;
+
+ /*
+ * We expect to be called exactly once for any index relation.
+ * If that's not the case, big trouble's what we have.
+ */
+
+ if (oldPred == NULL && (nb = RelationGetNumberOfBlocks(index)) != 0)
+ elog(WARN, "%s already contains data", index->rd_rel->relname.data);
+
+ /* initialize the root page (if this is a new index) */
+ if (oldPred == NULL) {
+ buffer = ReadBuffer(index, P_NEW);
+ RTInitBuffer(buffer, F_LEAF);
+ WriteBuffer(buffer);
+ }
+
+ /* init the tuple descriptors and get set for a heap scan */
+ hd = RelationGetTupleDescriptor(heap);
+ id = RelationGetTupleDescriptor(index);
+ d = (Datum *)palloc(natts * sizeof (*d));
+ nulls = (bool *)palloc(natts * sizeof (*nulls));
+
+ /*
+ * If this is a predicate (partial) index, we will need to evaluate the
+ * predicate using ExecQual, which requires the current tuple to be in a
+ * slot of a TupleTable. In addition, ExecQual must have an ExprContext
+ * referring to that slot. Here, we initialize dummy TupleTable and
+ * ExprContext objects for this purpose. --Nels, Feb '92
+ */
+#ifndef OMIT_PARTIAL_INDEX
+ if (pred != NULL || oldPred != NULL) {
+ tupleTable = ExecCreateTupleTable(1);
+ slot = ExecAllocTableSlot(tupleTable);
+ econtext = makeNode(ExprContext);
+ FillDummyExprContext(econtext, slot, hd, buffer);
+ }
+#endif /* OMIT_PARTIAL_INDEX */
+ scan = heap_beginscan(heap, 0, NowTimeQual, 0, (ScanKey) NULL);
+ htup = heap_getnext(scan, 0, &buffer);
+
+ /* count the tuples as we insert them */
+ nh = ni = 0;
+
+ for (; HeapTupleIsValid(htup); htup = heap_getnext(scan, 0, &buffer)) {
+
+ nh++;
+
+ /*
+ * If oldPred != NULL, this is an EXTEND INDEX command, so skip
+ * this tuple if it was already in the existing partial index
+ */
+ if (oldPred != NULL) {
+#ifndef OMIT_PARTIAL_INDEX
+ /*SetSlotContents(slot, htup); */
+ slot->val = htup;
+ if (ExecQual((List*)oldPred, econtext) == true) {
+ ni++;
+ continue;
+ }
+#endif /* OMIT_PARTIAL_INDEX */
+ }
+
+ /* Skip this tuple if it doesn't satisfy the partial-index predicate */
+ if (pred != NULL) {
+#ifndef OMIT_PARTIAL_INDEX
+ /*SetSlotContents(slot, htup); */
+ slot->val = htup;
+ if (ExecQual((List*)pred, econtext) == false)
+ continue;
+#endif /* OMIT_PARTIAL_INDEX */
+ }
+
+ ni++;
+
+ /*
+ * For the current heap tuple, extract all the attributes
+ * we use in this index, and note which are null.
+ */
+
+ for (i = 1; i <= natts; i++) {
+ int attoff;
+ bool attnull;
+
+ /*
+ * Offsets are from the start of the tuple, and are
+ * zero-based; indices are one-based. The next call
+ * returns i - 1. That's data hiding for you.
+ */
+
+ attoff = AttrNumberGetAttrOffset(i);
+ /*
+ d[attoff] = HeapTupleGetAttributeValue(htup, buffer,
+ */
+ d[attoff] = GetIndexValue(htup,
+ hd,
+ attoff,
+ attnum,
+ finfo,
+ &attnull,
+ buffer);
+ nulls[attoff] = (attnull ? 'n' : ' ');
+ }
+
+ /* form an index tuple and point it at the heap tuple */
+ itup = index_formtuple(id, &d[0], nulls);
+ itup->t_tid = htup->t_ctid;
+
+ /*
+ * Since we already have the index relation locked, we
+ * call rtdoinsert directly. Normal access method calls
+ * dispatch through rtinsert, which locks the relation
+ * for write. This is the right thing to do if you're
+ * inserting single tups, but not when you're initializing
+ * the whole index at once.
+ */
+
+ res = rtdoinsert(index, itup, &rtState);
+ pfree(itup);
+ pfree(res);
+ }
+
+ /* okay, all heap tuples are indexed */
+ heap_endscan(scan);
+ RelationUnsetLockForWrite(index);
+
+ if (pred != NULL || oldPred != NULL) {
+#ifndef OMIT_PARTIAL_INDEX
+ ExecDestroyTupleTable(tupleTable, true);
+ pfree(econtext);
+#endif /* OMIT_PARTIAL_INDEX */
+ }
+
+ /*
+ * Since we just counted the tuples in the heap, we update its
+ * stats in pg_relation to guarantee that the planner takes
+ * advantage of the index we just created. UpdateStats() does a
+ * CommandCounterIncrement(), which flushes changed entries from
+ * the system relcache. The act of constructing an index changes
+ * these heap and index tuples in the system catalogs, so they
+ * need to be flushed. We close them to guarantee that they
+ * will be.
+ */
+
+ hrelid = heap->rd_id;
+ irelid = index->rd_id;
+ heap_close(heap);
+ index_close(index);
+
+ UpdateStats(hrelid, nh, true);
+ UpdateStats(irelid, ni, false);
+
+ if (oldPred != NULL) {
+ if (ni == nh) pred = NULL;
+ UpdateIndexPredicate(irelid, oldPred, pred);
+ }
+
+ /* be tidy */
+ pfree(nulls);
+ pfree(d);
+}
+
+/*
+ * rtinsert -- wrapper for rtree tuple insertion.
+ *
+ * This is the public interface routine for tuple insertion in rtrees.
+ * It doesn't do any work; just locks the relation and passes the buck.
+ */
+InsertIndexResult
+rtinsert(Relation r, IndexTuple itup)
+{
+ InsertIndexResult res;
+ RTSTATE rtState;
+
+ initRtstate(&rtState, r);
+
+ RelationSetLockForWrite(r);
+ res = rtdoinsert(r, itup, &rtState);
+
+ /* XXX two-phase locking -- don't unlock the relation until EOT */
+ return (res);
+}
+
+static InsertIndexResult
+rtdoinsert(Relation r, IndexTuple itup, RTSTATE *rtstate)
+{
+ Page page;
+ Buffer buffer;
+ BlockNumber blk;
+ IndexTuple which;
+ OffsetNumber l;
+ RTSTACK *stack;
+ InsertIndexResult res;
+ RTreePageOpaque opaque;
+ char *datum;
+
+ blk = P_ROOT;
+ buffer = InvalidBuffer;
+ stack = (RTSTACK *) NULL;
+
+ do {
+ /* let go of current buffer before getting next */
+ if (buffer != InvalidBuffer)
+ ReleaseBuffer(buffer);
+
+ /* get next buffer */
+ buffer = ReadBuffer(r, blk);
+ page = (Page) BufferGetPage(buffer);
+
+ opaque = (RTreePageOpaque) PageGetSpecialPointer(page);
+ if (!(opaque->flags & F_LEAF)) {
+ RTSTACK *n;
+ ItemId iid;
+
+ n = (RTSTACK *) palloc(sizeof(RTSTACK));
+ n->rts_parent = stack;
+ n->rts_blk = blk;
+ n->rts_child = choose(r, page, itup, rtstate);
+ stack = n;
+
+ iid = PageGetItemId(page, n->rts_child);
+ which = (IndexTuple) PageGetItem(page, iid);
+ blk = ItemPointerGetBlockNumber(&(which->t_tid));
+ }
+ } while (!(opaque->flags & F_LEAF));
+
+ if (nospace(page, itup)) {
+ /* need to do a split */
+ res = dosplit(r, buffer, stack, itup, rtstate);
+ freestack(stack);
+ WriteBuffer(buffer); /* don't forget to release buffer! */
+ return (res);
+ }
+
+ /* add the item and write the buffer */
+ if (PageIsEmpty(page)) {
+ l = PageAddItem(page, (Item) itup, IndexTupleSize(itup),
+ FirstOffsetNumber,
+ LP_USED);
+ } else {
+ l = PageAddItem(page, (Item) itup, IndexTupleSize(itup),
+ OffsetNumberNext(PageGetMaxOffsetNumber(page)),
+ LP_USED);
+ }
+
+ WriteBuffer(buffer);
+
+ datum = (((char *) itup) + sizeof(IndexTupleData));
+
+ /* now expand the page boundary in the parent to include the new child */
+ rttighten(r, stack, datum,
+ (IndexTupleSize(itup) - sizeof(IndexTupleData)), rtstate);
+ freestack(stack);
+
+ /* build and return an InsertIndexResult for this insertion */
+ res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));
+ ItemPointerSet(&(res->pointerData), blk, l);
+
+ return (res);
+}
+
+static void
+rttighten(Relation r,
+ RTSTACK *stk,
+ char *datum,
+ int att_size,
+ RTSTATE *rtstate)
+{
+ char *oldud;
+ char *tdatum;
+ Page p;
+ float old_size, newd_size;
+ Buffer b;
+
+ if (stk == (RTSTACK *) NULL)
+ return;
+
+ b = ReadBuffer(r, stk->rts_blk);
+ p = BufferGetPage(b);
+
+ oldud = (char *) PageGetItem(p, PageGetItemId(p, stk->rts_child));
+ oldud += sizeof(IndexTupleData);
+
+ (*rtstate->sizeFn)(oldud, &old_size);
+ datum = (char *) (*rtstate->unionFn)(oldud, datum);
+
+ (*rtstate->sizeFn)(datum, &newd_size);
+
+ if (newd_size != old_size) {
+ TupleDesc td = RelationGetTupleDescriptor(r);
+
+ if (td->attrs[0]->attlen < 0) {
+ /*
+ * This is an internal page, so 'oldud' had better be a
+ * union (constant-length) key, too. (See comment below.)
+ */
+ Assert(VARSIZE(datum) == VARSIZE(oldud));
+ memmove(oldud, datum, VARSIZE(datum));
+ } else {
+ memmove(oldud, datum, att_size);
+ }
+ WriteBuffer(b);
+
+ /*
+ * The user may be defining an index on variable-sized data (like
+ * polygons). If so, we need to get a constant-sized datum for
+ * insertion on the internal page. We do this by calling the union
+ * proc, which is guaranteed to return a rectangle.
+ */
+
+ tdatum = (char *) (*rtstate->unionFn)(datum, datum);
+ rttighten(r, stk->rts_parent, tdatum, att_size, rtstate);
+ pfree(tdatum);
+ } else {
+ ReleaseBuffer(b);
+ }
+ pfree(datum);
+}
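+
+/*
+ * Worked example (illustrative only): suppose an insertion grows a
+ * child's bounding box from (0,0,10,10) to (0,0,12,10). The parent
+ * entry pointing at that child is overwritten with the union of the
+ * two boxes, and the same adjustment propagates up the stack until
+ * some ancestor's box already covers the new entry (newd_size equals
+ * old_size), at which point the recursion stops.
+ */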
+
+/*
+ * dosplit -- split a page in the tree.
+ *
+ * This is the quadratic-cost split algorithm Guttman describes in
+ * his paper. The reason we chose it is that you can implement this
+ * with less information about the data types on which you're operating.
+ */
+static InsertIndexResult
+dosplit(Relation r,
+ Buffer buffer,
+ RTSTACK *stack,
+ IndexTuple itup,
+ RTSTATE *rtstate)
+{
+ Page p;
+ Buffer leftbuf, rightbuf;
+ Page left, right;
+ ItemId itemid;
+ IndexTuple item;
+ IndexTuple ltup, rtup;
+ OffsetNumber maxoff;
+ OffsetNumber i;
+ OffsetNumber leftoff, rightoff;
+ BlockNumber lbknum, rbknum;
+ BlockNumber bufblock;
+ RTreePageOpaque opaque;
+ int blank;
+ InsertIndexResult res;
+ char *isnull;
+ SPLITVEC v;
+ TupleDesc tupDesc;
+
+ isnull = (char *) palloc(r->rd_rel->relnatts);
+ for (blank = 0; blank < r->rd_rel->relnatts; blank++)
+ isnull[blank] = ' ';
+ p = (Page) BufferGetPage(buffer);
+ opaque = (RTreePageOpaque) PageGetSpecialPointer(p);
+
+ /*
+ * The root of the tree is the first block in the relation. If
+ * we're about to split the root, we need to do some hocus-pocus
+ * to enforce this guarantee.
+ */
+
+ if (BufferGetBlockNumber(buffer) == P_ROOT) {
+ leftbuf = ReadBuffer(r, P_NEW);
+ RTInitBuffer(leftbuf, opaque->flags);
+ lbknum = BufferGetBlockNumber(leftbuf);
+ left = (Page) BufferGetPage(leftbuf);
+ } else {
+ leftbuf = buffer;
+ IncrBufferRefCount(buffer);
+ lbknum = BufferGetBlockNumber(buffer);
+ left = (Page) PageGetTempPage(p, sizeof(RTreePageOpaqueData));
+ }
+
+ rightbuf = ReadBuffer(r, P_NEW);
+ RTInitBuffer(rightbuf, opaque->flags);
+ rbknum = BufferGetBlockNumber(rightbuf);
+ right = (Page) BufferGetPage(rightbuf);
+
+ picksplit(r, p, &v, itup, rtstate);
+
+ leftoff = rightoff = FirstOffsetNumber;
+ maxoff = PageGetMaxOffsetNumber(p);
+ for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) {
+ itemid = PageGetItemId(p, i);
+ item = (IndexTuple) PageGetItem(p, itemid);
+
+ if (i == *(v.spl_left)) {
+ (void) PageAddItem(left, (Item) item, IndexTupleSize(item),
+ leftoff, LP_USED);
+ leftoff = OffsetNumberNext(leftoff);
+ v.spl_left++; /* advance in left split vector */
+ } else {
+ (void) PageAddItem(right, (Item) item, IndexTupleSize(item),
+ rightoff, LP_USED);
+ rightoff = OffsetNumberNext(rightoff);
+ v.spl_right++; /* advance in right split vector */
+ }
+ }
+
+ /* build an InsertIndexResult for this insertion */
+ res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));
+
+ /* now insert the new index tuple */
+ if (*(v.spl_left) != FirstOffsetNumber) {
+ (void) PageAddItem(left, (Item) itup, IndexTupleSize(itup),
+ leftoff, LP_USED);
+ ItemPointerSet(&(res->pointerData), lbknum, leftoff);
+ leftoff = OffsetNumberNext(leftoff);
+ } else {
+ (void) PageAddItem(right, (Item) itup, IndexTupleSize(itup),
+ rightoff, LP_USED);
+ ItemPointerSet(&(res->pointerData), rbknum, rightoff);
+ rightoff = OffsetNumberNext(rightoff);
+ }
+
+ if ((bufblock = BufferGetBlockNumber(buffer)) != P_ROOT) {
+ PageRestoreTempPage(left, p);
+ }
+ WriteBuffer(leftbuf);
+ WriteBuffer(rightbuf);
+
+ /*
+ * Okay, the page is split. We have three things left to do:
+ *
+ * 1) Adjust any active scans on this index to cope with changes
+ * we introduced in its structure by splitting this page.
+ *
+ * 2) "Tighten" the bounding box of the pointer to the left
+ * page in the parent node in the tree, if any. Since we
+ * moved a bunch of stuff off the left page, we expect it
+ * to get smaller. This happens in the internal insertion
+ * routine.
+ *
+ * 3) Insert a pointer to the right page in the parent. This
+ * may cause the parent to split. If it does, we need to
+ * repeat steps one and two for each split node in the tree.
+ */
+
+ /* adjust active scans */
+ rtadjscans(r, RTOP_SPLIT, bufblock, FirstOffsetNumber);
+
+ tupDesc = r->rd_att;
+ ltup = (IndexTuple) index_formtuple(tupDesc,
+ (Datum *) &(v.spl_ldatum), isnull);
+ rtup = (IndexTuple) index_formtuple(tupDesc,
+ (Datum *) &(v.spl_rdatum), isnull);
+ pfree(isnull);
+
+ /* set pointers to new child pages in the internal index tuples */
+ ItemPointerSet(&(ltup->t_tid), lbknum, 1);
+ ItemPointerSet(&(rtup->t_tid), rbknum, 1);
+
+ rtintinsert(r, stack, ltup, rtup, rtstate);
+
+ pfree(ltup);
+ pfree(rtup);
+
+ return (res);
+}
+
+static void
+rtintinsert(Relation r,
+ RTSTACK *stk,
+ IndexTuple ltup,
+ IndexTuple rtup,
+ RTSTATE *rtstate)
+{
+ IndexTuple old;
+ Buffer b;
+ Page p;
+ char *ldatum, *rdatum, *newdatum;
+ InsertIndexResult res;
+
+ if (stk == (RTSTACK *) NULL) {
+ rtnewroot(r, ltup, rtup);
+ return;
+ }
+
+ b = ReadBuffer(r, stk->rts_blk);
+ p = BufferGetPage(b);
+ old = (IndexTuple) PageGetItem(p, PageGetItemId(p, stk->rts_child));
+
+ /*
+ * This is a hack. Right now, we force rtree keys to be constant size.
+ * To fix this, we need to delete the old key and add both the left
+ * and right keys for the two new pages. The insertion of the left
+ * key may force a split if the new left key is bigger than the old key.
+ */
+
+ if (IndexTupleSize(old) != IndexTupleSize(ltup))
+ elog(WARN, "Variable-length rtree keys are not supported.");
+
+ /* install pointer to left child */
+ memmove(old, ltup, IndexTupleSize(ltup));
+
+ if (nospace(p, rtup)) {
+ newdatum = (((char *) ltup) + sizeof(IndexTupleData));
+ rttighten(r, stk->rts_parent, newdatum,
+ (IndexTupleSize(ltup) - sizeof(IndexTupleData)), rtstate);
+ res = dosplit(r, b, stk->rts_parent, rtup, rtstate);
+ WriteBuffer(b); /* don't forget to release buffer! - 01/31/94 */
+ pfree(res);
+ } else {
+ (void) PageAddItem(p, (Item) rtup, IndexTupleSize(rtup),
+ OffsetNumberNext(PageGetMaxOffsetNumber(p)), LP_USED);
+ WriteBuffer(b);
+ ldatum = (((char *) ltup) + sizeof(IndexTupleData));
+ rdatum = (((char *) rtup) + sizeof(IndexTupleData));
+ newdatum = (char *) (*rtstate->unionFn)(ldatum, rdatum);
+
+ rttighten(r, stk->rts_parent, newdatum,
+ (IndexTupleSize(rtup) - sizeof(IndexTupleData)), rtstate);
+
+ pfree(newdatum);
+ }
+}
+
+static void
+rtnewroot(Relation r, IndexTuple lt, IndexTuple rt)
+{
+ Buffer b;
+ Page p;
+
+ b = ReadBuffer(r, P_ROOT);
+ RTInitBuffer(b, 0);
+ p = BufferGetPage(b);
+ (void) PageAddItem(p, (Item) lt, IndexTupleSize(lt),
+ FirstOffsetNumber, LP_USED);
+ (void) PageAddItem(p, (Item) rt, IndexTupleSize(rt),
+ OffsetNumberNext(FirstOffsetNumber), LP_USED);
+ WriteBuffer(b);
+}
+
+static void
+picksplit(Relation r,
+ Page page,
+ SPLITVEC *v,
+ IndexTuple itup,
+ RTSTATE *rtstate)
+{
+ OffsetNumber maxoff;
+ OffsetNumber i, j;
+ IndexTuple item_1, item_2;
+ char *datum_alpha, *datum_beta;
+ char *datum_l, *datum_r;
+ char *union_d, *union_dl, *union_dr;
+ char *inter_d;
+ bool firsttime;
+ float size_alpha, size_beta, size_union, size_inter;
+ float size_waste, waste;
+ float size_l, size_r;
+ int nbytes;
+ OffsetNumber seed_1 = 0, seed_2 = 0;
+ OffsetNumber *left, *right;
+
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ nbytes = (maxoff + 2) * sizeof(OffsetNumber);
+ v->spl_left = (OffsetNumber *) palloc(nbytes);
+ v->spl_right = (OffsetNumber *) palloc(nbytes);
+
+ firsttime = true;
+ waste = 0.0;
+
+ for (i = FirstOffsetNumber; i < maxoff; i = OffsetNumberNext(i)) {
+ item_1 = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
+ datum_alpha = ((char *) item_1) + sizeof(IndexTupleData);
+ for (j = OffsetNumberNext(i); j <= maxoff; j = OffsetNumberNext(j)) {
+ item_2 = (IndexTuple) PageGetItem(page, PageGetItemId(page, j));
+ datum_beta = ((char *) item_2) + sizeof(IndexTupleData);
+
+ /* compute the wasted space by unioning these guys */
+ union_d = (char *)(rtstate->unionFn)(datum_alpha, datum_beta);
+ (rtstate->sizeFn)(union_d, &size_union);
+ inter_d = (char *)(rtstate->interFn)(datum_alpha, datum_beta);
+ (rtstate->sizeFn)(inter_d, &size_inter);
+ size_waste = size_union - size_inter;
+
+ pfree(union_d);
+
+ if (inter_d != (char *) NULL)
+ pfree(inter_d);
+
+ /*
+ * are these a more promising split than what we've
+ * already seen?
+ */
+
+ if (size_waste > waste || firsttime) {
+ waste = size_waste;
+ seed_1 = i;
+ seed_2 = j;
+ firsttime = false;
+ }
+ }
+ }
+
+ left = v->spl_left;
+ v->spl_nleft = 0;
+ right = v->spl_right;
+ v->spl_nright = 0;
+
+ item_1 = (IndexTuple) PageGetItem(page, PageGetItemId(page, seed_1));
+ datum_alpha = ((char *) item_1) + sizeof(IndexTupleData);
+ datum_l = (char *)(*rtstate->unionFn)(datum_alpha, datum_alpha);
+ (*rtstate->sizeFn)(datum_l, &size_l);
+ item_2 = (IndexTuple) PageGetItem(page, PageGetItemId(page, seed_2));
+ datum_beta = ((char *) item_2) + sizeof(IndexTupleData);
+ datum_r = (char *)(*rtstate->unionFn)(datum_beta, datum_beta);
+ (*rtstate->sizeFn)(datum_r, &size_r);
+
+ /*
+ * Now split up the regions between the two seeds. An important
+ * property of this split algorithm is that the split vector v
+ * has the indices of items to be split in order in its left and
+ * right vectors. We exploit this property by doing a merge in
+ * the code that actually splits the page.
+ *
+ * For efficiency, we also place the new index tuple in this loop.
+ * This is handled at the very end, when we have placed all the
+ * existing tuples and i == maxoff + 1.
+ */
+
+ maxoff = OffsetNumberNext(maxoff);
+ for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) {
+
+ /*
+ * If we've already decided where to place this item, just
+ * put it on the right list. Otherwise, we need to figure
+ * out which page needs the least enlargement in order to
+ * store the item.
+ */
+
+ if (i == seed_1) {
+ *left++ = i;
+ v->spl_nleft++;
+ continue;
+ } else if (i == seed_2) {
+ *right++ = i;
+ v->spl_nright++;
+ continue;
+ }
+
+ /* okay, which page needs least enlargement? */
+ if (i == maxoff) {
+ item_1 = itup;
+ } else {
+ item_1 = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
+ }
+
+ datum_alpha = ((char *) item_1) + sizeof(IndexTupleData);
+ union_dl = (char *)(*rtstate->unionFn)(datum_l, datum_alpha);
+ union_dr = (char *)(*rtstate->unionFn)(datum_r, datum_alpha);
+ (*rtstate->sizeFn)(union_dl, &size_alpha);
+ (*rtstate->sizeFn)(union_dr, &size_beta);
+
+ /* pick which page to add it to */
+ if (size_alpha - size_l < size_beta - size_r) {
+ pfree(datum_l);
+ pfree(union_dr);
+ datum_l = union_dl;
+ size_l = size_alpha;
+ *left++ = i;
+ v->spl_nleft++;
+ } else {
+ pfree(datum_r);
+ pfree(union_dl);
+ datum_r = union_dr;
+ size_r = size_beta;
+ *right++ = i;
+ v->spl_nright++;
+ }
+ }
+ *left = *right = FirstOffsetNumber; /* sentinel value, see dosplit() */
+
+ v->spl_ldatum = datum_l;
+ v->spl_rdatum = datum_r;
+}
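+
+/*
+ * Worked example of seed selection (illustrative only), assuming box
+ * keys: for entries a = (0,0,1,1) and b = (5,5,6,6), the union box is
+ * (0,0,6,6) with area 36 and the intersection is empty (area 0), so
+ * grouping a and b together would waste 36 - 0 = 36 units of area.
+ * The pair with the largest such waste becomes the two seeds; every
+ * remaining entry then joins whichever seed's bounding box it would
+ * enlarge least.
+ */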
+
+static void
+RTInitBuffer(Buffer b, uint32 f)
+{
+ RTreePageOpaque opaque;
+ Page page;
+ Size pageSize;
+
+ pageSize = BufferGetPageSize(b);
+
+ page = BufferGetPage(b);
+ memset(page, 0, (int) pageSize);
+ PageInit(page, pageSize, sizeof(RTreePageOpaqueData));
+
+ opaque = (RTreePageOpaque) PageGetSpecialPointer(page);
+ opaque->flags = f;
+}
+
+static OffsetNumber
+choose(Relation r, Page p, IndexTuple it, RTSTATE *rtstate)
+{
+ OffsetNumber maxoff;
+ OffsetNumber i;
+ char *ud, *id;
+ char *datum;
+ float usize, dsize;
+ OffsetNumber which;
+ float which_grow;
+
+ id = ((char *) it) + sizeof(IndexTupleData);
+ maxoff = PageGetMaxOffsetNumber(p);
+ which_grow = -1.0;
+ which = -1;
+
+ for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) {
+ datum = (char *) PageGetItem(p, PageGetItemId(p, i));
+ datum += sizeof(IndexTupleData);
+ (*rtstate->sizeFn)(datum, &dsize);
+ ud = (char *) (*rtstate->unionFn)(datum, id);
+ (*rtstate->sizeFn)(ud, &usize);
+ pfree(ud);
+ if (which_grow < 0 || usize - dsize < which_grow) {
+ which = i;
+ which_grow = usize - dsize;
+ if (which_grow == 0)
+ break;
+ }
+ }
+
+ return (which);
+}
+
+static int
+nospace(Page p, IndexTuple it)
+{
+ return (PageGetFreeSpace(p) < IndexTupleSize(it));
+}
+
+void
+freestack(RTSTACK *s)
+{
+ RTSTACK *p;
+
+ while (s != (RTSTACK *) NULL) {
+ p = s->rts_parent;
+ pfree(s);
+ s = p;
+ }
+}
+
+char *
+rtdelete(Relation r, ItemPointer tid)
+{
+ BlockNumber blkno;
+ OffsetNumber offnum;
+ Buffer buf;
+ Page page;
+
+ /* must write-lock on delete */
+ RelationSetLockForWrite(r);
+
+ blkno = ItemPointerGetBlockNumber(tid);
+ offnum = ItemPointerGetOffsetNumber(tid);
+
+ /* adjust any scans that will be affected by this deletion */
+ rtadjscans(r, RTOP_DEL, blkno, offnum);
+
+ /* delete the index tuple */
+ buf = ReadBuffer(r, blkno);
+ page = BufferGetPage(buf);
+
+ PageIndexTupleDelete(page, offnum);
+
+ WriteBuffer(buf);
+
+ /* XXX -- two-phase locking, don't release the write lock */
+ return ((char *) NULL);
+}
+
+static void initRtstate(RTSTATE *rtstate, Relation index)
+{
+ RegProcedure union_proc, size_proc, inter_proc;
+ func_ptr user_fn;
+ int pronargs;
+
+ union_proc = index_getprocid(index, 1, RT_UNION_PROC);
+ size_proc = index_getprocid(index, 1, RT_SIZE_PROC);
+ inter_proc = index_getprocid(index, 1, RT_INTER_PROC);
+ fmgr_info(union_proc, &user_fn, &pronargs);
+ rtstate->unionFn = user_fn;
+ fmgr_info(size_proc, &user_fn, &pronargs);
+ rtstate->sizeFn = user_fn;
+ fmgr_info(inter_proc, &user_fn, &pronargs);
+ rtstate->interFn = user_fn;
+ return;
+}
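+
+/*
+ * A sketch of what the lookup above assumes: the rtree opclass
+ * registers three support procedures -- union, size, and intersect --
+ * in the system catalogs, and index_getprocid() fetches them by the
+ * RT_*_PROC numbers from access/rtree.h. The rest of this file calls
+ * them only through the cached pointers, e.g.
+ *
+ * (*rtstate->sizeFn)(datum, &size);
+ */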
+
+#define RTDEBUG
+#ifdef RTDEBUG
+#include "utils/geo-decls.h"
+
+void
+_rtdump(Relation r)
+{
+ Buffer buf;
+ Page page;
+ OffsetNumber offnum, maxoff;
+ BlockNumber blkno;
+ BlockNumber nblocks;
+ RTreePageOpaque po;
+ IndexTuple itup;
+ BlockNumber itblkno;
+ OffsetNumber itoffno;
+ char *datum;
+ char *itkey;
+
+ nblocks = RelationGetNumberOfBlocks(r);
+ for (blkno = 0; blkno < nblocks; blkno++) {
+ buf = ReadBuffer(r, blkno);
+ page = BufferGetPage(buf);
+ po = (RTreePageOpaque) PageGetSpecialPointer(page);
+ maxoff = PageGetMaxOffsetNumber(page);
+ printf("Page %d maxoff %d <%s>\n", blkno, maxoff,
+ (po->flags & F_LEAF ? "LEAF" : "INTERNAL"));
+
+ if (PageIsEmpty(page)) {
+ ReleaseBuffer(buf);
+ continue;
+ }
+
+ for (offnum = FirstOffsetNumber;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum)) {
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+ itblkno = ItemPointerGetBlockNumber(&(itup->t_tid));
+ itoffno = ItemPointerGetOffsetNumber(&(itup->t_tid));
+ datum = ((char *) itup);
+ datum += sizeof(IndexTupleData);
+ itkey = (char *) box_out((BOX *) datum);
+ printf("\t[%d] size %d heap <%d,%d> key:%s\n",
+ offnum, IndexTupleSize(itup), itblkno, itoffno, itkey);
+ pfree(itkey);
+ }
+
+ ReleaseBuffer(buf);
+ }
+}
+#endif /* defined RTDEBUG */
+
diff --git a/src/backend/access/rtree/rtscan.c b/src/backend/access/rtree/rtscan.c
new file mode 100644
index 00000000000..aa68f0db70b
--- /dev/null
+++ b/src/backend/access/rtree/rtscan.c
@@ -0,0 +1,392 @@
+/*-------------------------------------------------------------------------
+ *
+ * rtscan.c--
+ * routines to manage scans on index relations
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtscan.c,v 1.1.1.1 1996/07/09 06:21:13 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/rtree.h"
+#include "access/rtstrat.h"
+
+/* routines defined and used here */
+static void rtregscan(IndexScanDesc s);
+static void rtdropscan(IndexScanDesc s);
+static void rtadjone(IndexScanDesc s, int op, BlockNumber blkno,
+ OffsetNumber offnum);
+static void adjuststack(RTSTACK *stk, BlockNumber blkno,
+ OffsetNumber offnum);
+static void adjustiptr(IndexScanDesc s, ItemPointer iptr,
+ int op, BlockNumber blkno, OffsetNumber offnum);
+
+/*
+ * Whenever we start an rtree scan in a backend, we register it in private
+ * space. Then if the rtree index gets updated, we check all registered
+ * scans and adjust them if the tuple they point at got moved by the
+ * update. We only need to do this in private space, because when we update
+ * an rtree we have a write lock on the tree, so no other process can have
+ * any locks at all on it. A single transaction can have write and read
+ * locks on the same object, so that's why we need to handle this case.
+ */
+
+typedef struct RTScanListData {
+ IndexScanDesc rtsl_scan;
+ struct RTScanListData *rtsl_next;
+} RTScanListData;
+
+typedef RTScanListData *RTScanList;
+
+/* pointer to list of local scans on rtrees */
+static RTScanList RTScans = (RTScanList) NULL;
+
+IndexScanDesc
+rtbeginscan(Relation r,
+ bool fromEnd,
+ uint16 nkeys,
+ ScanKey key)
+{
+ IndexScanDesc s;
+
+ RelationSetLockForRead(r);
+ s = RelationGetIndexScan(r, fromEnd, nkeys, key);
+ rtregscan(s);
+
+ return (s);
+}
+
+void
+rtrescan(IndexScanDesc s, bool fromEnd, ScanKey key)
+{
+ RTreeScanOpaque p;
+ RegProcedure internal_proc;
+ int i;
+
+ if (!IndexScanIsValid(s)) {
+ elog(WARN, "rtrescan: invalid scan.");
+ return;
+ }
+
+ /*
+ * Clear all the pointers.
+ */
+
+ ItemPointerSetInvalid(&s->previousItemData);
+ ItemPointerSetInvalid(&s->currentItemData);
+ ItemPointerSetInvalid(&s->nextItemData);
+ ItemPointerSetInvalid(&s->previousMarkData);
+ ItemPointerSetInvalid(&s->currentMarkData);
+ ItemPointerSetInvalid(&s->nextMarkData);
+
+ /*
+ * Set flags.
+ */
+ if (RelationGetNumberOfBlocks(s->relation) == 0) {
+ s->flags = ScanUnmarked;
+ } else if (fromEnd) {
+ s->flags = ScanUnmarked | ScanUncheckedPrevious;
+ } else {
+ s->flags = ScanUnmarked | ScanUncheckedNext;
+ }
+
+ s->scanFromEnd = fromEnd;
+
+ if (s->numberOfKeys > 0) {
+ memmove(s->keyData,
+ key,
+ s->numberOfKeys * sizeof(ScanKeyData));
+ }
+
+ p = (RTreeScanOpaque) s->opaque;
+ if (p != (RTreeScanOpaque) NULL) {
+ freestack(p->s_stack);
+ freestack(p->s_markstk);
+ p->s_stack = p->s_markstk = (RTSTACK *) NULL;
+ p->s_flags = 0x0;
+ } else {
+ /* initialize opaque data */
+ p = (RTreeScanOpaque) palloc(sizeof(RTreeScanOpaqueData));
+ p->s_internalKey =
+ (ScanKey) palloc(sizeof(ScanKeyData) * s->numberOfKeys);
+ p->s_stack = p->s_markstk = (RTSTACK *) NULL;
+ p->s_internalNKey = s->numberOfKeys;
+ p->s_flags = 0x0;
+ for (i = 0; i < s->numberOfKeys; i++)
+ p->s_internalKey[i].sk_argument = s->keyData[i].sk_argument;
+ s->opaque = p;
+ if (s->numberOfKeys > 0) {
+
+ /*
+ * Scans on internal pages use different operators than they
+ * do on leaf pages. For example, if the user wants all boxes
+ * that exactly match (x1,y1,x2,y2), then on internal pages
+ * we need to find all boxes that contain (x1,y1,x2,y2).
+ */
+
+ for (i = 0; i < s->numberOfKeys; i++) {
+ internal_proc = RTMapOperator(s->relation,
+ s->keyData[i].sk_attno,
+ s->keyData[i].sk_procedure);
+ ScanKeyEntryInitialize(&(p->s_internalKey[i]),
+ s->keyData[i].sk_flags,
+ s->keyData[i].sk_attno,
+ internal_proc,
+ s->keyData[i].sk_argument);
+ }
+ }
+ }
+}
+
+void
+rtmarkpos(IndexScanDesc s)
+{
+ RTreeScanOpaque p;
+ RTSTACK *o, *n, *tmp;
+
+ s->currentMarkData = s->currentItemData;
+ p = (RTreeScanOpaque) s->opaque;
+ if (p->s_flags & RTS_CURBEFORE)
+ p->s_flags |= RTS_MRKBEFORE;
+ else
+ p->s_flags &= ~RTS_MRKBEFORE;
+
+ o = (RTSTACK *) NULL;
+ n = p->s_stack;
+
+ /* copy the parent stack from the current item data */
+ while (n != (RTSTACK *) NULL) {
+ tmp = (RTSTACK *) palloc(sizeof(RTSTACK));
+ tmp->rts_child = n->rts_child;
+ tmp->rts_blk = n->rts_blk;
+ tmp->rts_parent = o;
+ o = tmp;
+ n = n->rts_parent;
+ }
+
+ freestack(p->s_markstk);
+ p->s_markstk = o;
+}
+
+void
+rtrestrpos(IndexScanDesc s)
+{
+ RTreeScanOpaque p;
+ RTSTACK *o, *n, *tmp;
+
+ s->currentItemData = s->currentMarkData;
+ p = (RTreeScanOpaque) s->opaque;
+ if (p->s_flags & RTS_MRKBEFORE)
+ p->s_flags |= RTS_CURBEFORE;
+ else
+ p->s_flags &= ~RTS_CURBEFORE;
+
+ o = (RTSTACK *) NULL;
+ n = p->s_markstk;
+
+ /* copy the parent stack from the marked position */
+ while (n != (RTSTACK *) NULL) {
+ tmp = (RTSTACK *) palloc(sizeof(RTSTACK));
+ tmp->rts_child = n->rts_child;
+ tmp->rts_blk = n->rts_blk;
+ tmp->rts_parent = o;
+ o = tmp;
+ n = n->rts_parent;
+ }
+
+ freestack(p->s_stack);
+ p->s_stack = o;
+}
+
+void
+rtendscan(IndexScanDesc s)
+{
+ RTreeScanOpaque p;
+
+ p = (RTreeScanOpaque) s->opaque;
+
+ if (p != (RTreeScanOpaque) NULL) {
+ freestack(p->s_stack);
+ freestack(p->s_markstk);
+ }
+
+ rtdropscan(s);
+ /* XXX don't unset read lock -- two-phase locking */
+}
+
+static void
+rtregscan(IndexScanDesc s)
+{
+ RTScanList l;
+
+ l = (RTScanList) palloc(sizeof(RTScanListData));
+ l->rtsl_scan = s;
+ l->rtsl_next = RTScans;
+ RTScans = l;
+}
+
+static void
+rtdropscan(IndexScanDesc s)
+{
+ RTScanList l;
+ RTScanList prev;
+
+ prev = (RTScanList) NULL;
+
+ for (l = RTScans;
+ l != (RTScanList) NULL && l->rtsl_scan != s;
+ l = l->rtsl_next) {
+ prev = l;
+ }
+
+ if (l == (RTScanList) NULL)
+ elog(WARN, "rtree scan list corrupted -- cannot find 0x%lx", s);
+
+ if (prev == (RTScanList) NULL)
+ RTScans = l->rtsl_next;
+ else
+ prev->rtsl_next = l->rtsl_next;
+
+ pfree(l);
+}
+
+void
+rtadjscans(Relation r, int op, BlockNumber blkno, OffsetNumber offnum)
+{
+ RTScanList l;
+ Oid relid;
+
+ relid = r->rd_id;
+ for (l = RTScans; l != (RTScanList) NULL; l = l->rtsl_next) {
+ if (l->rtsl_scan->relation->rd_id == relid)
+ rtadjone(l->rtsl_scan, op, blkno, offnum);
+ }
+}
+
+/*
+ * rtadjone() -- adjust one scan for update.
+ *
+ * By here, the scan passed in is on a modified relation. Op tells
+ * us what the modification is, and blkno and offnum tell us what
+ * block and offset index were affected. This routine checks the
+ * current and marked positions, and the current and marked stacks,
+ * to see if any stored location needs to be changed because of the
+ * update. If so, we make the change here.
+ */
+static void
+rtadjone(IndexScanDesc s,
+ int op,
+ BlockNumber blkno,
+ OffsetNumber offnum)
+{
+ RTreeScanOpaque so;
+
+ adjustiptr(s, &(s->currentItemData), op, blkno, offnum);
+ adjustiptr(s, &(s->currentMarkData), op, blkno, offnum);
+
+ so = (RTreeScanOpaque) s->opaque;
+
+ if (op == RTOP_SPLIT) {
+ adjuststack(so->s_stack, blkno, offnum);
+ adjuststack(so->s_markstk, blkno, offnum);
+ }
+}
+
+/*
+ * adjustiptr() -- adjust current and marked item pointers in the scan
+ *
+ * Depending on the type of update and the place it happened, we
+ * need to do nothing, to back up one record, or to start over on
+ * the same page.
+ */
+static void
+adjustiptr(IndexScanDesc s,
+ ItemPointer iptr,
+ int op,
+ BlockNumber blkno,
+ OffsetNumber offnum)
+{
+ OffsetNumber curoff;
+ RTreeScanOpaque so;
+
+ if (ItemPointerIsValid(iptr)) {
+ if (ItemPointerGetBlockNumber(iptr) == blkno) {
+ curoff = ItemPointerGetOffsetNumber(iptr);
+ so = (RTreeScanOpaque) s->opaque;
+
+ switch (op) {
+ case RTOP_DEL:
+ /* back up one if we need to */
+ if (curoff >= offnum) {
+
+ if (curoff > FirstOffsetNumber) {
+ /* just adjust the item pointer */
+ ItemPointerSet(iptr, blkno, OffsetNumberPrev(curoff));
+ } else {
+ /* remember that we're before the current tuple */
+ ItemPointerSet(iptr, blkno, FirstOffsetNumber);
+ if (iptr == &(s->currentItemData))
+ so->s_flags |= RTS_CURBEFORE;
+ else
+ so->s_flags |= RTS_MRKBEFORE;
+ }
+ }
+ break;
+
+ case RTOP_SPLIT:
+ /* back to start of page on split */
+ ItemPointerSet(iptr, blkno, FirstOffsetNumber);
+ if (iptr == &(s->currentItemData))
+ so->s_flags &= ~RTS_CURBEFORE;
+ else
+ so->s_flags &= ~RTS_MRKBEFORE;
+ break;
+
+ default:
+ elog(WARN, "Bad operation in rtree scan adjust: %d", op);
+ }
+ }
+ }
+}
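+
+/*
+ * Worked example (illustrative only): suppose a scan's currentItemData
+ * points at <blkno, 5> and the tuple at <blkno, 3> is deleted. Every
+ * tuple at or beyond offset 3 shifts down by one, so we back the
+ * pointer up to <blkno, 4>. If the scan had been at FirstOffsetNumber
+ * and that tuple itself was deleted, there is nothing to back up to;
+ * we leave the pointer in place and set RTS_CURBEFORE (or
+ * RTS_MRKBEFORE) to mean "before the first tuple on this page."
+ */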
+
+/*
+ * adjuststack() -- adjust the supplied stack for a split on a page in
+ * the index we're scanning.
+ *
+ * If a page on our parent stack has split, we need to back up to the
+ * beginning of the page and rescan it. The reason for this is that
+ * the split algorithm for rtrees doesn't order tuples in any useful
+ * way on a single page. This means that on a split, we may wind up
+ * looking at some heap tuples more than once. This is handled in the
+ * access method update code for heaps; if we've modified the tuple we
+ * are looking at already in this transaction, we ignore the update
+ * request.
+ */
+/*ARGSUSED*/
+static void
+adjuststack(RTSTACK *stk,
+ BlockNumber blkno,
+ OffsetNumber offnum)
+{
+ while (stk != (RTSTACK *) NULL) {
+ if (stk->rts_blk == blkno)
+ stk->rts_child = FirstOffsetNumber;
+
+ stk = stk->rts_parent;
+ }
+}
diff --git a/src/backend/access/rtree/rtstrat.c b/src/backend/access/rtree/rtstrat.c
new file mode 100644
index 00000000000..c5d934a22a2
--- /dev/null
+++ b/src/backend/access/rtree/rtstrat.c
@@ -0,0 +1,239 @@
+/*-------------------------------------------------------------------------
+ *
+ * rtstrat.c--
+ * strategy map data for rtrees.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtstrat.c,v 1.1.1.1 1996/07/09 06:21:13 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#include "utils/rel.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "access/istrat.h"
+#include "access/rtree.h"
+
+/*
+ * Note: negate, commute, and negatecommute all assume that operators are
+ * ordered as follows in the strategy map:
+ *
+ * left, left-or-overlap, overlap, right-or-overlap, right, same,
+ * contains, contained-by
+ *
+ * The negate, commute, and negatecommute arrays are used by the planner
+ * to plan indexed scans over data that appears in the qualification in
+ * a boolean negation, or whose operands appear in the wrong order. For
+ * example, if the operator "<%" means "contains", and the user says
+ *
+ * where not rel.box <% "(10,10,20,20)"::box
+ *
+ * the planner can plan an index scan by noting that rtree indices have
+ * an operator in their operator class for negating <%.
+ *
+ * Similarly, if the user says something like
+ *
+ * where "(10,10,20,20)"::box <% rel.box
+ *
+ * the planner can see that the rtree index on rel.box has an operator in
+ * its opclass for commuting <%, and plan the scan using that operator.
+ * This added complexity in the access methods makes the planner a lot easier
+ * to write.
+ */
+
+/* if a op b, what operator tells us if (not a op b)? */
+static StrategyNumber RTNegate[RTNStrategies] = {
+ InvalidStrategy,
+ InvalidStrategy,
+ InvalidStrategy,
+ InvalidStrategy,
+ InvalidStrategy,
+ InvalidStrategy,
+ InvalidStrategy,
+ InvalidStrategy
+ };
+
+/* if a op_1 b, what is the operator op_2 such that b op_2 a? */
+static StrategyNumber RTCommute[RTNStrategies] = {
+ InvalidStrategy,
+ InvalidStrategy,
+ InvalidStrategy,
+ InvalidStrategy,
+ InvalidStrategy,
+ InvalidStrategy,
+ InvalidStrategy,
+ InvalidStrategy
+ };
+
+/* if a op_1 b, what is the operator op_2 such that (b !op_2 a)? */
+static StrategyNumber RTNegateCommute[RTNStrategies] = {
+ InvalidStrategy,
+ InvalidStrategy,
+ InvalidStrategy,
+ InvalidStrategy,
+ InvalidStrategy,
+ InvalidStrategy,
+ InvalidStrategy,
+ InvalidStrategy
+ };
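+
+/*
+ * If these maps were filled in, the entries would follow the operator
+ * ordering described above; e.g., the commutator of "left" (strategy 1)
+ * would be "right" (strategy 5), and left-or-overlap would commute
+ * with right-or-overlap. As shipped, every entry is InvalidStrategy,
+ * so the planner cannot yet rewrite negated or reversed rtree
+ * qualifications.
+ */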
+
+/*
+ * Now do the TermData arrays. These exist in case the user doesn't give
+ * us a full set of operators for a particular operator class. The idea
+ * is that by making multiple comparisons using any one of the supplied
+ * operators, we can decide whether two n-dimensional polygons are equal.
+ * For example, if a contains b and b contains a, we may conclude that
+ * a and b are equal.
+ *
+ * The presence of the TermData arrays in all this is a historical accident.
+ * Early in the development of the POSTGRES access methods, it was believed
+ * that writing functions was harder than writing arrays. This is wrong;
+ * TermData is hard to understand and hard to get right. In general, when
+ * someone populates a new operator class, the populate it completely. If
+ * Mike Hirohama had forced Cimarron Taylor to populate the strategy map
+ * for btree int2_ops completely in 1988, you wouldn't have to deal with
+ * all this now. Too bad for you.
+ *
+ * Since you can't necessarily do this in all cases (for example, you can't
+ * do it given only "intersects" or "disjoint"), TermData arrays for some
+ * operators don't appear below.
+ *
+ * Note that if you DO supply all the operators required in a given opclass
+ * by inserting them into the pg_opclass system catalog, you can get away
+ * without doing all this TermData stuff. Since the rtree code is intended
+ * to be a reference for access method implementors, I'm doing TermData
+ * correctly here.
+ *
+ * Note on style: these are all actually of type StrategyTermData, but
+ * since those have variable-length data at the end of the struct we can't
+ * properly initialize them if we declare them to be what they are.
+ */
+
+/* if you only have "contained-by", how do you determine equality? */
+static uint16 RTContainedByTermData[] = {
+ 2, /* make two comparisons */
+ RTContainedByStrategyNumber, /* use "a contained-by b" */
+ 0x0, /* without any magic */
+ RTContainedByStrategyNumber, /* then use contained-by, */
+ SK_COMMUTE /* swapping a and b */
+ };
+
+/* if you only have "contains", how do you determine equality? */
+static uint16 RTContainsTermData[] = {
+ 2, /* make two comparisons */
+ RTContainsStrategyNumber, /* use "a contains b" */
+ 0x0, /* without any magic */
+ RTContainsStrategyNumber, /* then use contains again, */
+ SK_COMMUTE /* swapping a and b */
+ };
+
+/* now put all that together in one place for the planner */
+static StrategyTerm RTEqualExpressionData[] = {
+ (StrategyTerm) RTContainedByTermData,
+ (StrategyTerm) RTContainsTermData,
+ NULL
+ };
+
+/*
+ * If you were sufficiently attentive to detail, you would go through
+ * the ExpressionData pain above for every one of the eight strategies
+ * we defined. I am not. Now we declare the StrategyEvaluationData
+ * structure that gets shipped around to help the planner and the access
+ * method decide what sort of scan it should do, based on (a) what the
+ * user asked for, (b) what operators are defined for a particular opclass,
+ * and (c) the reams of information we supplied above.
+ *
+ * The idea of all of this initialized data is to make life easier on the
+ * user when he defines a new operator class to use this access method.
+ * By filling in all the data, we let him get away with leaving holes in his
+ * operator class, and still let him use the index. The added complexity
+ * in the access methods just isn't worth the trouble, though.
+ */
+
+static StrategyEvaluationData RTEvaluationData = {
+ RTNStrategies, /* # of strategies */
+ (StrategyTransformMap) RTNegate, /* how to do (not qual) */
+ (StrategyTransformMap) RTCommute, /* how to swap operands */
+ (StrategyTransformMap) RTNegateCommute, /* how to do both */
+ {
+ NULL, /* express left */
+ NULL, /* express overleft */
+ NULL, /* express over */
+ NULL, /* express overright */
+ NULL, /* express right */
+ (StrategyExpression) RTEqualExpressionData, /* express same */
+ NULL, /* express contains */
+ NULL, /* express contained-by */
+ NULL,
+ NULL,
+ NULL
+ }
+};
+
+/*
+ * Okay, now something peculiar to rtrees that doesn't apply to most other
+ * indexing structures: When we're searching a tree for a given value, we
+ * can't do the same sorts of comparisons on internal node entries as we
+ * do at leaves. The reason is that if we're looking for (say) all boxes
+ * that are the same as (0,0,10,10), then we need to find all leaf pages
+ * that overlap that region. So internally we search for overlap, and at
+ * the leaf we search for equality.
+ *
+ * This array maps leaf search operators to the internal search operators.
+ * We assume the normal ordering on operators:
+ *
+ * left, left-or-overlap, overlap, right-or-overlap, right, same,
+ * contains, contained-by
+ */
+static StrategyNumber RTOperMap[RTNStrategies] = {
+ RTOverLeftStrategyNumber,
+ RTOverLeftStrategyNumber,
+ RTOverlapStrategyNumber,
+ RTOverRightStrategyNumber,
+ RTOverRightStrategyNumber,
+ RTContainsStrategyNumber,
+ RTContainsStrategyNumber,
+ RTOverlapStrategyNumber
+ };
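+
+/*
+ * For example: a leaf-level qualification using "same" (strategy 6)
+ * is mapped to RTContainsStrategyNumber for the internal levels,
+ * since an internal entry can lead to an exactly-equal leaf box only
+ * if its region contains the query box.
+ */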
+
+StrategyNumber
+RelationGetRTStrategy(Relation r,
+ AttrNumber attnum,
+ RegProcedure proc)
+{
+ return (RelationGetStrategy(r, attnum, &RTEvaluationData, proc));
+}
+
+bool
+RelationInvokeRTStrategy(Relation r,
+ AttrNumber attnum,
+ StrategyNumber s,
+ Datum left,
+ Datum right)
+{
+ return (RelationInvokeStrategy(r, &RTEvaluationData, attnum, s,
+ left, right));
+}
+
+RegProcedure
+RTMapOperator(Relation r,
+ AttrNumber attnum,
+ RegProcedure proc)
+{
+ StrategyNumber procstrat;
+ StrategyMap strategyMap;
+
+ procstrat = RelationGetRTStrategy(r, attnum, proc);
+ strategyMap = IndexStrategyGetStrategyMap(RelationGetIndexStrategy(r),
+ RTNStrategies,
+ attnum);
+
+ return (strategyMap->entry[RTOperMap[procstrat - 1] - 1].sk_procedure);
+}
diff --git a/src/backend/access/rtscan.h b/src/backend/access/rtscan.h
new file mode 100644
index 00000000000..a928303f3f3
--- /dev/null
+++ b/src/backend/access/rtscan.h
@@ -0,0 +1,17 @@
+/*-------------------------------------------------------------------------
+ *
+ * rtscan.h--
+ * routines defined in access/rtree/rtscan.c
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: rtscan.h,v 1.1.1.1 1996/07/09 06:21:09 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef RTSCAN_H
+#define RTSCAN_H
+
+void rtadjscans(Relation r, int op, BlockNumber blkno, OffsetNumber offnum);
+
+#endif /* RTSCAN_H */
diff --git a/src/backend/access/rtstrat.h b/src/backend/access/rtstrat.h
new file mode 100644
index 00000000000..5b439e7b338
--- /dev/null
+++ b/src/backend/access/rtstrat.h
@@ -0,0 +1,18 @@
+/*-------------------------------------------------------------------------
+ *
+ * rtstrat.h--
+ * routines defined in access/rtree/rtstrat.c
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: rtstrat.h,v 1.1.1.1 1996/07/09 06:21:09 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef RTSTRAT_H
+#define RTSTRAT_H
+
+extern RegProcedure RTMapOperator(Relation r, AttrNumber attnum,
+ RegProcedure proc);
+
+#endif /* RTSTRAT_H */
diff --git a/src/backend/access/sdir.h b/src/backend/access/sdir.h
new file mode 100644
index 00000000000..030007d39c9
--- /dev/null
+++ b/src/backend/access/sdir.h
@@ -0,0 +1,57 @@
+/*-------------------------------------------------------------------------
+ *
+ * sdir.h--
+ * POSTGRES scan direction definitions.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: sdir.h,v 1.1.1.1 1996/07/09 06:21:09 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SDIR_H
+#define SDIR_H
+
+#include "c.h"
+
+/*
+ * ScanDirection was an int8 for no apparent reason. I kept the original
+ * values because I'm not sure if I'll break anything otherwise. -ay 2/95
+ */
+typedef enum ScanDirection {
+ BackwardScanDirection = -1,
+ NoMovementScanDirection = 0,
+ ForwardScanDirection = 1
+} ScanDirection;
+
+/*
+ * ScanDirectionIsValid --
+ * True iff scan direction is valid.
+ */
+#define ScanDirectionIsValid(direction) \
+ ((bool) (BackwardScanDirection <= direction && \
+ direction <= ForwardScanDirection))
+
+/*
+ * ScanDirectionIsBackward --
+ * True iff scan direction is backward.
+ */
+#define ScanDirectionIsBackward(direction) \
+ ((bool) (direction == BackwardScanDirection))
+
+/*
+ * ScanDirectionIsNoMovement --
+ * True iff scan direction indicates no movement.
+ */
+#define ScanDirectionIsNoMovement(direction) \
+ ((bool) (direction == NoMovementScanDirection))
+
+/*
+ * ScanDirectionIsForward --
+ * True iff scan direction is forward.
+ */
+#define ScanDirectionIsForward(direction) \
+ ((bool) (direction == ForwardScanDirection))
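+
+/*
+ * Usage sketch (illustrative only): scan code typically dispatches on
+ * these macros, e.g.
+ *
+ * if (ScanDirectionIsBackward(dir))
+ * offnum = OffsetNumberPrev(offnum);
+ * else if (ScanDirectionIsForward(dir))
+ * offnum = OffsetNumberNext(offnum);
+ *
+ * with no movement leaving the scan position untouched.
+ */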
+
+#endif /* SDIR_H */
diff --git a/src/backend/access/skey.h b/src/backend/access/skey.h
new file mode 100644
index 00000000000..3cadf348f42
--- /dev/null
+++ b/src/backend/access/skey.h
@@ -0,0 +1,52 @@
+/*-------------------------------------------------------------------------
+ *
+ * skey.h--
+ * POSTGRES scan key definitions.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: skey.h,v 1.1.1.1 1996/07/09 06:21:09 scrappy Exp $
+ *
+ *
+ * Note:
+ * Needs more accessor/assignment routines.
+ *-------------------------------------------------------------------------
+ */
+#ifndef SKEY_H
+#define SKEY_H
+
+#include "postgres.h"
+#include "access/attnum.h"
+
+
+typedef struct ScanKeyData {
+ bits16 sk_flags; /* flags */
+ AttrNumber sk_attno; /* domain number */
+ RegProcedure sk_procedure; /* procedure OID */
+ func_ptr sk_func; /* cached function to call */
+ int32 sk_nargs; /* number of arguments to sk_func */
+ Datum sk_argument; /* data to compare */
+} ScanKeyData;
+
+typedef ScanKeyData *ScanKey;
+
+
+#define SK_ISNULL 0x1
+#define SK_UNARY 0x2
+#define SK_NEGATE 0x4
+#define SK_COMMUTE 0x8
+
+#define ScanUnmarked 0x01
+#define ScanUncheckedPrevious 0x02
+#define ScanUncheckedNext 0x04
+
+
+/*
+ * prototypes for functions in access/common/scankey.c
+ */
+extern void ScanKeyEntrySetIllegal(ScanKey entry);
+extern void ScanKeyEntryInitialize(ScanKey entry, bits16 flags,
+ AttrNumber attributeNumber, RegProcedure procedure, Datum argument);
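+
+/*
+ * Minimal usage sketch (hypothetical names; assumes the operator's
+ * procedure OID has already been looked up, and query_box is a BOX
+ * to compare against):
+ *
+ * ScanKeyData skey;
+ *
+ * ScanKeyEntryInitialize(&skey, (bits16) 0x0, (AttrNumber) 1,
+ * proc_oid, PointerGetDatum(&query_box));
+ *
+ * The initialization routine is expected to fill in sk_func and
+ * sk_nargs from the procedure OID; callers supply only the above.
+ */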
+
+#endif /* SKEY_H */
diff --git a/src/backend/access/strat.h b/src/backend/access/strat.h
new file mode 100644
index 00000000000..4ddb2190d88
--- /dev/null
+++ b/src/backend/access/strat.h
@@ -0,0 +1,86 @@
+/*-------------------------------------------------------------------------
+ *
+ * strat.h--
+ * index strategy type definitions
+ * (separated out from original istrat.h to avoid circular refs)
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: strat.h,v 1.1.1.1 1996/07/09 06:21:09 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef STRAT_H
+#define STRAT_H
+
+#include "postgres.h"
+#include "access/attnum.h"
+#include "access/skey.h"
+
+typedef uint16 StrategyNumber;
+
+#define InvalidStrategy 0
+
+typedef struct StrategyTransformMapData {
+ StrategyNumber strategy[1]; /* VARIABLE LENGTH ARRAY */
+} StrategyTransformMapData; /* VARIABLE LENGTH STRUCTURE */
+
+typedef StrategyTransformMapData *StrategyTransformMap;
+
+typedef struct StrategyOperatorData {
+ StrategyNumber strategy;
+ bits16 flags; /* scan qualification flags (see skey.h) */
+} StrategyOperatorData;
+
+typedef StrategyOperatorData *StrategyOperator;
+
+typedef struct StrategyTermData { /* conjunctive term */
+ uint16 degree;
+ StrategyOperatorData operatorData[1]; /* VARIABLE LENGTH */
+} StrategyTermData; /* VARIABLE LENGTH STRUCTURE */
+
+typedef StrategyTermData *StrategyTerm;
+
+typedef struct StrategyExpressionData { /* disjunctive normal form */
+ StrategyTerm term[1]; /* VARIABLE LENGTH ARRAY */
+} StrategyExpressionData; /* VARIABLE LENGTH STRUCTURE */
+
+typedef StrategyExpressionData *StrategyExpression;
+
+typedef struct StrategyEvaluationData {
+ StrategyNumber maxStrategy;
+ StrategyTransformMap negateTransform;
+ StrategyTransformMap commuteTransform;
+ StrategyTransformMap negateCommuteTransform;
+ StrategyExpression expression[12]; /* XXX VARIABLE LENGTH */
+} StrategyEvaluationData; /* VARIABLE LENGTH STRUCTURE */
+
+typedef StrategyEvaluationData *StrategyEvaluation;
+
+/*
+ * StrategyTransformMapIsValid --
+ * Returns true iff strategy transformation map is valid.
+ */
+#define StrategyTransformMapIsValid(transform) PointerIsValid(transform)
+
+
+#ifndef CorrectStrategies /* XXX this should be removable */
+#define AMStrategies(foo) 12
+#else /* !defined(CorrectStrategies) */
+#define AMStrategies(foo) (foo)
+#endif /* !defined(CorrectStrategies) */
+
+typedef struct StrategyMapData {
+ ScanKeyData entry[1]; /* VARIABLE LENGTH ARRAY */
+} StrategyMapData; /* VARIABLE LENGTH STRUCTURE */
+
+typedef StrategyMapData *StrategyMap;
+
+typedef struct IndexStrategyData {
+ StrategyMapData strategyMapData[1]; /* VARIABLE LENGTH ARRAY */
+} IndexStrategyData; /* VARIABLE LENGTH STRUCTURE */
+
+typedef IndexStrategyData *IndexStrategy;
+
+#endif /* STRAT_H */
diff --git a/src/backend/access/transam.h b/src/backend/access/transam.h
new file mode 100644
index 00000000000..0f5a9724dc0
--- /dev/null
+++ b/src/backend/access/transam.h
@@ -0,0 +1,213 @@
+/*-------------------------------------------------------------------------
+ *
+ * transam.h--
+ * postgres transaction access method support code header
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: transam.h,v 1.1.1.1 1996/07/09 06:21:09 scrappy Exp $
+ *
+ * NOTES
+ * Transaction System Version 101 now supports proper oid
+ * generation and recording in the variable relation.
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef TRANSAM_H
+#define TRANSAM_H
+
+/* ----------------
+ * transaction system version id
+ *
+ * this is stored in the first 4 bytes of the first page of the
+ * log, time and variable relations. This is so that if we improve
+ * the format of the transaction log after postgres version 2, then
+ * people won't have to rebuild their databases.
+ *
+ * TRANS_SYSTEM_VERSION 100 means major version 1 minor version 0.
+ * Two databases with the same major version should be compatible,
+ * even if their minor versions differ.
+ * ----------------
+ */
+#define TRANS_SYSTEM_VERSION 101
+
+/* ----------------
+ * transaction id status values
+ *
+ * someday we will use "11" = 3 = XID_INVALID to mean the
+ * start of run-length encoded log data.
+ * ----------------
+ */
+#define XID_COMMIT 2 /* transaction committed */
+#define XID_ABORT 1 /* transaction aborted */
+#define XID_INPROGRESS 0 /* transaction in progress */
+#define XID_INVALID 3 /* other */
+
+typedef unsigned char XidStatus; /* (2 bits) */
+
+/* ----------------
+ * BitIndexOf computes the index of the Nth xid on a given block
+ * ----------------
+ */
+#define BitIndexOf(N) ((N) * 2)
+
+/* ----------------
+ * transaction page definitions
+ * ----------------
+ */
+#define TP_DataSize BLCKSZ
+#define TP_NumXidStatusPerBlock (TP_DataSize * 4)
+#define TP_NumTimePerBlock (TP_DataSize / 4)
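+
+/* ----------------
+ * worked example, assuming BLCKSZ = 8192: each xid status is 2 bits,
+ * so a byte holds 4 statuses and one log block records
+ * 8192 * 4 = 32768 transaction statuses; commit times are 4-byte
+ * absolute times, so one time block records 8192 / 4 = 2048 of them.
+ * ----------------
+ */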
+
+/* ----------------
+ * LogRelationContents structure
+ *
+ * This structure describes the storage of the data in the
+ * first 128 bytes of the log relation. This storage is never
+ * used for transaction status because transaction id's begin
+ * their numbering at 512.
+ *
+ * The first 4 bytes of this relation store the version
+ * number of the transaction system.
+ * ----------------
+ */
+typedef struct LogRelationContentsData {
+ int TransSystemVersion;
+} LogRelationContentsData;
+
+typedef LogRelationContentsData *LogRelationContents;
+
+/* ----------------
+ * TimeRelationContents structure
+ *
+ * This structure describes the storage of the data in the
+ * first 2048 bytes of the time relation. This storage is never
+ * used for transaction commit times because transaction id's begin
+ * their numbering at 512.
+ *
+ * The first 4 bytes of this relation store the version
+ * number of the transaction system.
+ * ----------------
+ */
+typedef struct TimeRelationContentsData {
+ int TransSystemVersion;
+} TimeRelationContentsData;
+
+typedef TimeRelationContentsData *TimeRelationContents;
+
+/* ----------------
+ * VariableRelationContents structure
+ *
+ * The variable relation is a special "relation" which
+ * is used to store various system "variables" persistently.
+ * Unlike other relations in the system, this relation
+ * is updated in place whenever the variables change.
+ *
+ * The first 4 bytes of this relation store the version
+ * number of the transaction system.
+ *
+ * Currently, the relation has only one page and the next
+ * available xid, the last committed xid and the next
+ * available oid are stored there.
+ * ----------------
+ */
+typedef struct VariableRelationContentsData {
+ int TransSystemVersion;
+ TransactionId nextXidData;
+ TransactionId lastXidData;
+ Oid nextOid;
+} VariableRelationContentsData;
+
+typedef VariableRelationContentsData *VariableRelationContents;
+
+/* ----------------
+ * extern declarations
+ * ----------------
+ */
+
+/*
+ * prototypes for functions in transam/transam.c
+ */
+extern int RecoveryCheckingEnabled();
+extern void SetRecoveryCheckingEnabled(bool state);
+extern bool TransactionLogTest(TransactionId transactionId, XidStatus status);
+extern void TransactionLogUpdate(TransactionId transactionId,
+ XidStatus status);
+extern AbsoluteTime TransactionIdGetCommitTime(TransactionId transactionId);
+extern void TransRecover(Relation logRelation);
+extern void InitializeTransactionLog();
+extern bool TransactionIdDidCommit(TransactionId transactionId);
+extern bool TransactionIdDidAbort(TransactionId transactionId);
+extern bool TransactionIdIsInProgress(TransactionId transactionId);
+extern void TransactionIdCommit(TransactionId transactionId);
+extern void TransactionIdAbort(TransactionId transactionId);
+extern void TransactionIdSetInProgress(TransactionId transactionId);
+
+/* in transam/transsup.c */
+extern void AmiTransactionOverride(bool flag);
+extern void TransComputeBlockNumber(Relation relation,
+ TransactionId transactionId, BlockNumber *blockNumberOutP);
+extern XidStatus TransBlockGetLastTransactionIdStatus(Block tblock,
+ TransactionId baseXid, TransactionId *returnXidP);
+extern XidStatus TransBlockGetXidStatus(Block tblock,
+ TransactionId transactionId);
+extern void TransBlockSetXidStatus(Block tblock,
+ TransactionId transactionId, XidStatus xstatus);
+extern AbsoluteTime TransBlockGetCommitTime(Block tblock,
+ TransactionId transactionId);
+extern void TransBlockSetCommitTime(Block tblock,
+ TransactionId transactionId, AbsoluteTime commitTime);
+extern XidStatus TransBlockNumberGetXidStatus(Relation relation,
+ BlockNumber blockNumber, TransactionId xid, bool *failP);
+extern void TransBlockNumberSetXidStatus(Relation relation,
+ BlockNumber blockNumber, TransactionId xid, XidStatus xstatus,
+ bool *failP);
+extern AbsoluteTime TransBlockNumberGetCommitTime(Relation relation,
+ BlockNumber blockNumber, TransactionId xid, bool *failP);
+extern void TransBlockNumberSetCommitTime(Relation relation,
+ BlockNumber blockNumber, TransactionId xid, AbsoluteTime xtime,
+ bool *failP);
+extern void TransGetLastRecordedTransaction(Relation relation,
+ TransactionId xid, bool *failP);
+
+/* in transam/varsup.c */
+extern void VariableRelationGetNextXid(TransactionId *xidP);
+extern void VariableRelationGetLastXid(TransactionId *xidP);
+extern void VariableRelationPutNextXid(TransactionId xid);
+extern void VariableRelationPutLastXid(TransactionId xid);
+extern void VariableRelationGetNextOid(Oid *oid_return);
+extern void VariableRelationPutNextOid(Oid *oidP);
+extern void GetNewTransactionId(TransactionId *xid);
+extern void UpdateLastCommittedXid(TransactionId xid);
+extern void GetNewObjectIdBlock(Oid *oid_return, int oid_block_size);
+extern void GetNewObjectId(Oid *oid_return);
+
+/* ----------------
+ * global variable extern declarations
+ * ----------------
+ */
+
+/* in transam.c */
+extern Relation LogRelation;
+extern Relation TimeRelation;
+extern Relation VariableRelation;
+
+extern TransactionId cachedGetCommitTimeXid;
+extern AbsoluteTime cachedGetCommitTime;
+extern TransactionId cachedTestXid;
+extern XidStatus cachedTestXidStatus;
+
+extern TransactionId NullTransactionId;
+extern TransactionId AmiTransactionId;
+extern TransactionId FirstTransactionId;
+
+extern int RecoveryCheckingEnableState;
+
+/* in transsup.c */
+extern bool AMI_OVERRIDE;
+
+/* in varsup.c */
+extern int OidGenLockId;
+
+#endif /* TRANSAM_H */
diff --git a/src/backend/access/transam/Makefile.inc b/src/backend/access/transam/Makefile.inc
new file mode 100644
index 00000000000..c4f5b95a0ae
--- /dev/null
+++ b/src/backend/access/transam/Makefile.inc
@@ -0,0 +1,14 @@
+#-------------------------------------------------------------------------
+#
+# Makefile.inc--
+# Makefile for access/transam
+#
+# Copyright (c) 1994, Regents of the University of California
+#
+#
+# IDENTIFICATION
+# $Header: /cvsroot/pgsql/src/backend/access/transam/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:13 scrappy Exp $
+#
+#-------------------------------------------------------------------------
+
+SUBSRCS+= transam.c transsup.c varsup.c xact.c xid.c
diff --git a/src/backend/access/transam/transam.c b/src/backend/access/transam/transam.c
new file mode 100644
index 00000000000..b3789a8c2c5
--- /dev/null
+++ b/src/backend/access/transam/transam.c
@@ -0,0 +1,675 @@
+/*-------------------------------------------------------------------------
+ *
+ * transam.c--
+ * postgres transaction log/time interface routines
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/transam/transam.c,v 1.1.1.1 1996/07/09 06:21:13 scrappy Exp $
+ *
+ * NOTES
+ * This file contains the high level access-method interface to the
+ * transaction system.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "machine.h" /* in port/ directory (needed for BLCKSZ) */
+
+#include "access/heapam.h"
+#include "storage/buf.h"
+#include "storage/bufmgr.h"
+
+#include "utils/memutils.h"
+#include "utils/mcxt.h"
+#include "utils/rel.h"
+#include "utils/elog.h"
+
+#include "utils/nabstime.h"
+#include "catalog/catname.h"
+
+#include "access/transam.h"
+#include "access/xact.h"
+#include "commands/vacuum.h" /* for VacuumRunning */
+
+/* ----------------
+ * global variables holding pointers to relations used
+ * by the transaction system. These are initialized by
+ * InitializeTransactionLog().
+ * ----------------
+ */
+
+Relation LogRelation = (Relation) NULL;
+Relation TimeRelation = (Relation) NULL;
+Relation VariableRelation = (Relation) NULL;
+
+/* ----------------
+ * global variables holding cached transaction id's and statuses.
+ * ----------------
+ */
+TransactionId cachedGetCommitTimeXid;
+AbsoluteTime cachedGetCommitTime;
+TransactionId cachedTestXid;
+XidStatus cachedTestXidStatus;
+
+/* ----------------------------------------------------------------
+ * transaction system constants
+ *
+ * read the comments for GetNewTransactionId in order to
+ * understand the initial values for AmiTransactionId and
+ * FirstTransactionId. -cim 3/23/90
+ * ----------------------------------------------------------------
+ */
+TransactionId NullTransactionId = (TransactionId) 0;
+
+TransactionId AmiTransactionId = (TransactionId) 512;
+
+TransactionId FirstTransactionId = (TransactionId) 514;
+
+/* ----------------
+ * transaction recovery state variables
+ *
+ * When the transaction system is initialized, we may
+ * need to do recovery checking. This decision is made by
+ * the postmaster or by the user, who supplies the backend
+ * with a special flag. In general, we want to do recovery
+ * checking whenever we are running without a postmaster
+ * or when the number of backends running under the postmaster
+ * goes from zero to one. -cim 3/21/90
+ * ----------------
+ */
+int RecoveryCheckingEnableState = 0;
+
+/* ------------------
+ * spinlock for oid generation
+ * -----------------
+ */
+extern int OidGenLockId;
+
+/* ----------------
+ * globals that must be reset at abort
+ * ----------------
+ */
+extern bool BuildingBtree;
+
+
+/* ----------------
+ * recovery checking accessors
+ * ----------------
+ */
+int
+RecoveryCheckingEnabled()
+{
+ return RecoveryCheckingEnableState;
+}
+
+void
+SetRecoveryCheckingEnabled(bool state)
+{
+ RecoveryCheckingEnableState = (state == true);
+}
+
+/* ----------------------------------------------------------------
+ * postgres log/time access method interface
+ *
+ * TransactionLogTest
+ * TransactionLogUpdate
+ * ========
+ * these functions do work for the interface
+ * functions - they search/retrieve and append/update
+ * information in the log and time relations.
+ * ----------------------------------------------------------------
+ */
+
+/* --------------------------------
+ * TransactionLogTest
+ * --------------------------------
+ */
+
+bool /* true/false: does transaction id have specified status? */
+TransactionLogTest(TransactionId transactionId, /* transaction id to test */
+ XidStatus status) /* transaction status */
+{
+ BlockNumber blockNumber;
+ XidStatus xidstatus; /* recorded status of xid */
+ bool fail = false; /* success/failure */
+
+ /* ----------------
+ * during initialization consider all transactions
+ * as having been committed
+ * ----------------
+ */
+ if (! RelationIsValid(LogRelation))
+ return (bool) (status == XID_COMMIT);
+
+ /* ----------------
+ * before going to the buffer manager, check our single
+ * item cache to see if we didn't just check the transaction
+ * status a moment ago.
+ * ----------------
+ */
+ if (TransactionIdEquals(transactionId, cachedTestXid))
+ return (bool)
+ (status == cachedTestXidStatus);
+
+ /* ----------------
+ * compute the item pointer corresponding to the
+ * page containing our transaction id. We save the item in
+ * our cache to speed up things if we happen to ask for the
+ * same xid's status more than once.
+ * ----------------
+ */
+ TransComputeBlockNumber(LogRelation, transactionId, &blockNumber);
+ xidstatus = TransBlockNumberGetXidStatus(LogRelation,
+ blockNumber,
+ transactionId,
+ &fail);
+
+ if (! fail) {
+ TransactionIdStore(transactionId, &cachedTestXid);
+ cachedTestXidStatus = xidstatus;
+ return (bool)
+ (status == xidstatus);
+ }
+
+ /* ----------------
+ * here the block didn't contain the information we wanted
+ * ----------------
+ */
+ elog(WARN, "TransactionLogTest: failed to get xidstatus");
+
+ /*
+ * so lint is happy...
+ */
+ return(false);
+}
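+
+/*
+ * Note: the interface-level routines later in this file are thin
+ * wrappers around this test; e.g. TransactionIdDidCommit(xid)
+ * essentially reduces to TransactionLogTest(xid, XID_COMMIT).
+ */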
+
+/* --------------------------------
+ * TransactionLogUpdate
+ * --------------------------------
+ */
+void
+TransactionLogUpdate(TransactionId transactionId, /* trans id to update */
+ XidStatus status) /* new trans status */
+{
+ BlockNumber blockNumber;
+ bool fail = false; /* success/failure */
+ AbsoluteTime currentTime; /* time of this transaction */
+
+ /* ----------------
+ * during initialization we don't record any updates.
+ * ----------------
+ */
+ if (! RelationIsValid(LogRelation))
+ return;
+
+ /* ----------------
+ * get the transaction commit time
+ * ----------------
+ */
+ currentTime = getSystemTime();
+
+ /* ----------------
+ * update the log relation
+ * ----------------
+ */
+ TransComputeBlockNumber(LogRelation, transactionId, &blockNumber);
+ TransBlockNumberSetXidStatus(LogRelation,
+ blockNumber,
+ transactionId,
+ status,
+ &fail);
+
+ /* ----------------
+ * update (invalidate) our single item TransactionLogTest cache.
+ * ----------------
+ */
+ TransactionIdStore(transactionId, &cachedTestXid);
+ cachedTestXidStatus = status;
+
+ /* ----------------
+ * now we update the time relation, if necessary
+ * (we only record commit times)
+ * ----------------
+ */
+ if (RelationIsValid(TimeRelation) && status == XID_COMMIT) {
+ TransComputeBlockNumber(TimeRelation, transactionId, &blockNumber);
+ TransBlockNumberSetCommitTime(TimeRelation,
+ blockNumber,
+ transactionId,
+ currentTime,
+ &fail);
+ /* ----------------
+ * update (invalidate) our single item GetCommitTime cache.
+ * ----------------
+ */
+ TransactionIdStore(transactionId, &cachedGetCommitTimeXid);
+ cachedGetCommitTime = currentTime;
+ }
+
+ /* ----------------
+ * now we update the "last committed transaction" field
+ * in the variable relation if we are recording a commit.
+ * ----------------
+ */
+ if (RelationIsValid(VariableRelation) && status == XID_COMMIT)
+ UpdateLastCommittedXid(transactionId);
+}
+
+/* --------------------------------
+ * TransactionIdGetCommitTime
+ * --------------------------------
+ */
+
+AbsoluteTime /* commit time of transaction id */
+TransactionIdGetCommitTime(TransactionId transactionId) /* transaction id to test */
+{
+ BlockNumber blockNumber;
+ AbsoluteTime commitTime; /* commit time */
+ bool fail = false; /* success/failure */
+
+ /* ----------------
+ * return invalid if we aren't running yet...
+ * ----------------
+ */
+ if (! RelationIsValid(TimeRelation))
+ return INVALID_ABSTIME;
+
+ /* ----------------
+ * before going to the buffer manager, check our single
+ * item cache to see whether we just got the commit time
+ * a moment ago.
+ * ----------------
+ */
+ if (TransactionIdEquals(transactionId, cachedGetCommitTimeXid))
+ return cachedGetCommitTime;
+
+ /* ----------------
+ * compute the item pointer corresponding to the
+ * page containing our transaction commit time
+ * ----------------
+ */
+ TransComputeBlockNumber(TimeRelation, transactionId, &blockNumber);
+ commitTime = TransBlockNumberGetCommitTime(TimeRelation,
+ blockNumber,
+ transactionId,
+ &fail);
+
+ /* ----------------
+ * update our cache and return the transaction commit time
+ * ----------------
+ */
+ if (! fail) {
+ TransactionIdStore(transactionId, &cachedGetCommitTimeXid);
+ cachedGetCommitTime = commitTime;
+ return commitTime;
+ } else
+ return INVALID_ABSTIME;
+}
+
+/* ----------------------------------------------------------------
+ * transaction recovery code
+ * ----------------------------------------------------------------
+ */
+
+/* --------------------------------
+ * TransRecover
+ *
+ * perform transaction recovery checking.
+ *
+ * Note: this should only be performed if no other backends
+ * are running. This is known by the postmaster and
+ * conveyed by the postmaster passing a "do recovery checking"
+ * flag to the backend.
+ *
+ * here we get the last recorded transaction from the log,
+ * get the "last" and "next" transactions from the variable relation
+ * and then perform some integrity tests:
+ *
+ * 1) No transaction may exist higher than the "next" available
+ *    transaction recorded in the variable relation. If this is the
+ *    case then it means either the log or the variable relation
+ *    has become corrupted.
+ *
+ * 2) The last committed transaction may not be higher than the
+ *    next available transaction, for the same reason.
+ *
+ * 3) The last recorded transaction may not be lower than the
+ *    last committed transaction. (the reverse is ok - it means
+ * that some transactions have aborted since the last commit)
+ *
+ * Here is what the proper situation looks like. The line
+ * represents the data stored in the log. 'c' indicates the
+ * transaction was recorded as committed, 'a' indicates an
+ * aborted transaction and '.' represents information not
+ * recorded. These may correspond to in progress transactions.
+ *
+ * c c a c . . a . . . . . . . . . .
+ * | |
+ * last next
+ *
+ * Since "next" is only incremented by GetNewTransactionId() which
+ * is called when transactions are started. Hence if there
+ * are commits or aborts after "next", then it means we committed
+ * or aborted BEFORE we started the transaction. This is the
+ * rational behind constraint (1).
+ *
+ * Likewise, "last" should never greater then "next" for essentially
+ * the same reason - it would imply we committed before we started.
+ * This is the reasoning for (2).
+ *
+ * (3) implies we may never have a situation such as:
+ *
+ * c c a c . . a c . . . . . . . . .
+ * | |
+ * last next
+ *
+ * where there is a 'c' greater than "last".
+ *
+ * Recovery checking is more difficult in the case where
+ * several backends are executing concurrently because the
+ * transactions may be executing in the other backends.
+ * So, we only do recovery stuff when the backend is explicitly
+ * passed a flag on the command line.
+ * --------------------------------
+ */
+void
+TransRecover(Relation logRelation)
+{
+#if 0
+ /* ----------------
+ * first get the last recorded transaction in the log.
+ * ----------------
+ */
+ TransGetLastRecordedTransaction(logRelation, &logLastXid, &fail);
+ if (fail == true)
+ elog(WARN, "TransRecover: failed TransGetLastRecordedTransaction");
+
+ /* ----------------
+ * next get the "last" and "next" variables
+ * ----------------
+ */
+ VariableRelationGetLastXid(&varLastXid);
+ VariableRelationGetNextXid(&varNextXid);
+
+ /* ----------------
+ * integrity test (1)
+ * ----------------
+ */
+ if (TransactionIdIsLessThan(varNextXid, logLastXid))
+ elog(WARN, "TransRecover: varNextXid < logLastXid");
+
+ /* ----------------
+ * integrity test (2)
+ * ----------------
+ */
+
+ /* ----------------
+ * integrity test (3)
+ * ----------------
+ */
+
+ /* ----------------
+ * here we have a valid "
+ *
+ * **** RESUME HERE ****
+ * ----------------
+ */
+ varNextXid = TransactionIdDup(varLastXid);
+ TransactionIdIncrement(&varNextXid);
+
+ VarPut(var, VAR_PUT_LASTXID, varLastXid);
+ VarPut(var, VAR_PUT_NEXTXID, varNextXid);
+#endif
+}
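+
+/* --------------------------------
+ * A sketch of the unwritten integrity tests (2) and (3) above,
+ * under the same assumptions as the disabled code (varLastXid,
+ * varNextXid and logLastXid already fetched):
+ *
+ * if (TransactionIdIsLessThan(varNextXid, varLastXid))
+ * elog(WARN, "TransRecover: varLastXid > varNextXid");
+ * if (TransactionIdIsLessThan(logLastXid, varLastXid))
+ * elog(WARN, "TransRecover: logLastXid < varLastXid");
+ * --------------------------------
+ */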
+
+/* ----------------------------------------------------------------
+ * Interface functions
+ *
+ * InitializeTransactionLog
+ * ========
+ * this function (called near cinit) initializes
+ * the transaction log, time and variable relations.
+ *
+ * TransactionId DidCommit
+ * TransactionId DidAbort
+ * TransactionId IsInProgress
+ * ========
+ * these functions test the transaction status of
+ * a specified transaction id.
+ *
+ * TransactionId Commit
+ * TransactionId Abort
+ * TransactionId SetInProgress
+ * ========
+ * these functions set the transaction status
+ * of the specified xid. TransactionIdCommit() also
+ * records the current time in the time relation
+ * and updates the variable relation counter.
+ *
+ * ----------------------------------------------------------------
+ */
+
+/*
+ * InitializeTransactionLog --
+ * Initializes transaction logging.
+ */
+void
+InitializeTransactionLog()
+{
+ Relation logRelation;
+ Relation timeRelation;
+ MemoryContext oldContext;
+
+ /* ----------------
+ * don't do anything during bootstrapping
+ * ----------------
+ */
+ if (AMI_OVERRIDE)
+ return;
+
+ /* ----------------
+ * disable the transaction system so the access methods
+ * don't interfere during initialization.
+ * ----------------
+ */
+ OverrideTransactionSystem(true);
+
+ /* ----------------
+ * make sure allocations occur within the top memory context
+ * so that our log management structures are protected from
+ * garbage collection at the end of every transaction.
+ * ----------------
+ */
+ oldContext = MemoryContextSwitchTo(TopMemoryContext);
+
+ /* ----------------
+ * first open the log and time relations
+ * (these are created by amiint so they are guaranteed to exist)
+ * ----------------
+ */
+ logRelation = heap_openr(LogRelationName);
+ timeRelation = heap_openr(TimeRelationName);
+ VariableRelation = heap_openr(VariableRelationName);
+ /* ----------------
+ * XXX TransactionLogUpdate requires that LogRelation
+ * and TimeRelation are valid so we temporarily set
+ * them so we can initialize things properly.
+ * This could be done more cleanly.
+ * ----------------
+ */
+ LogRelation = logRelation;
+ TimeRelation = timeRelation;
+
+ /* ----------------
+ * if we have a virgin database, we initialize the log and time
+ * relation by committing the AmiTransactionId (id 512) and we
+ * initialize the variable relation by setting the next available
+ * transaction id to FirstTransactionId (id 514). OID initialization
+ * happens as a side effect of bootstrapping in varsup.c.
+ * ----------------
+ */
+ SpinAcquire(OidGenLockId);
+ if (!TransactionIdDidCommit(AmiTransactionId)) {
+
+ /* ----------------
+ * SOMEDAY initialize the information stored in
+ * the headers of the log/time/variable relations.
+ * ----------------
+ */
+ TransactionLogUpdate(AmiTransactionId, XID_COMMIT);
+ VariableRelationPutNextXid(FirstTransactionId);
+
+ } else if (RecoveryCheckingEnabled()) {
+ /* ----------------
+ * if we have a pre-initialized database and if the
+ * perform recovery checking flag was passed then we
+ * do our database integrity checking.
+ * ----------------
+ */
+ TransRecover(logRelation);
+ }
+ LogRelation = (Relation) NULL;
+ TimeRelation = (Relation) NULL;
+ SpinRelease(OidGenLockId);
+
+ /* ----------------
+ * now re-enable the transaction system
+ * ----------------
+ */
+ OverrideTransactionSystem(false);
+
+ /* ----------------
+ * instantiate the global variables
+ * ----------------
+ */
+ LogRelation = logRelation;
+ TimeRelation = timeRelation;
+
+ /* ----------------
+ * restore the memory context to the previous context
+ * before we return from initialization.
+ * ----------------
+ */
+ MemoryContextSwitchTo(oldContext);
+}
+
+/* --------------------------------
+ * TransactionId DidCommit
+ * TransactionId DidAbort
+ * TransactionId IsInProgress
+ * --------------------------------
+ */
+
+/*
+ * TransactionIdDidCommit --
+ * True iff transaction associated with the identifier did commit.
+ *
+ * Note:
+ * Assumes transaction identifier is valid.
+ */
+bool /* true if given transaction committed */
+TransactionIdDidCommit(TransactionId transactionId)
+{
+ if (AMI_OVERRIDE)
+ return true;
+
+ return
+ TransactionLogTest(transactionId, XID_COMMIT);
+}
+
+/*
+ * TransactionIdDidAbort --
+ * True iff transaction associated with the identifier did abort.
+ *
+ * Note:
+ * Assumes transaction identifier is valid.
+ * XXX Is this unneeded?
+ */
+bool /* true if given transaction aborted */
+TransactionIdDidAbort(TransactionId transactionId)
+{
+ if (AMI_OVERRIDE)
+ return false;
+
+ return
+ TransactionLogTest(transactionId, XID_ABORT);
+}
+
+bool /* true if given transaction neither committed nor aborted */
+TransactionIdIsInProgress(TransactionId transactionId)
+{
+ if (AMI_OVERRIDE)
+ return false;
+
+ return
+ TransactionLogTest(transactionId, XID_INPROGRESS);
+}
+
+/* --------------------------------
+ * TransactionId Commit
+ * TransactionId Abort
+ * TransactionId SetInProgress
+ * --------------------------------
+ */
+
+/*
+ * TransactionIdCommit --
+ * Commits the transaction associated with the identifier.
+ *
+ * Note:
+ * Assumes transaction identifier is valid.
+ */
+void
+TransactionIdCommit(TransactionId transactionId)
+{
+ if (AMI_OVERRIDE)
+ return;
+
+ /*
+ * Within TransactionLogUpdate we call UpdateLastCommittedXid()
+ * which assumes we have exclusive access to pg_variable.
+ * Therefore we need to get exclusive access before calling
+ * TransactionLogUpdate. -mer 18 Aug 1992
+ */
+ SpinAcquire(OidGenLockId);
+ TransactionLogUpdate(transactionId, XID_COMMIT);
+ SpinRelease(OidGenLockId);
+}
+
+/*
+ * TransactionIdAbort --
+ * Aborts the transaction associated with the identifier.
+ *
+ * Note:
+ * Assumes transaction identifier is valid.
+ */
+void
+TransactionIdAbort(TransactionId transactionId)
+{
+ BuildingBtree = false;
+
+ if (VacuumRunning)
+ vc_abort();
+
+ if (AMI_OVERRIDE)
+ return;
+
+ TransactionLogUpdate(transactionId, XID_ABORT);
+}
+
+void
+TransactionIdSetInProgress(TransactionId transactionId)
+{
+ if (AMI_OVERRIDE)
+ return;
+
+ TransactionLogUpdate(transactionId, XID_INPROGRESS);
+}
diff --git a/src/backend/access/transam/transsup.c b/src/backend/access/transam/transsup.c
new file mode 100644
index 00000000000..a1e5b17ec13
--- /dev/null
+++ b/src/backend/access/transam/transsup.c
@@ -0,0 +1,663 @@
+/*-------------------------------------------------------------------------
+ *
+ * transsup.c--
+ * postgres transaction access method support code
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/transam/Attic/transsup.c,v 1.1.1.1 1996/07/09 06:21:13 scrappy Exp $
+ *
+ * NOTES
+ * This file contains support functions for the high
+ * level access method interface routines found in transam.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "machine.h" /* in port/ directory (needed for BLCKSZ) */
+
+#include "storage/buf.h"
+#include "storage/bufmgr.h"
+
+#include "utils/rel.h"
+#include "utils/elog.h"
+#include "utils/memutils.h"
+#include "utils/nabstime.h"
+
+#include "catalog/heap.h"
+#include "access/transam.h" /* where the declarations go */
+#include "access/xact.h" /* where the declarations go */
+
+#include "storage/smgr.h"
+
+/* ----------------------------------------------------------------
+ * general support routines
+ * ----------------------------------------------------------------
+ */
+
+/* --------------------------------
+ * AmiTransactionOverride
+ *
+ * This function is used to manipulate the bootstrap flag.
+ * --------------------------------
+ */
+void
+AmiTransactionOverride(bool flag)
+{
+ AMI_OVERRIDE = flag;
+}
+
+/* --------------------------------
+ * TransComputeBlockNumber
+ * --------------------------------
+ */
+void
+TransComputeBlockNumber(Relation relation, /* relation to test */
+ TransactionId transactionId, /* transaction id to test */
+ BlockNumber *blockNumberOutP)
+{
+ long itemsPerBlock;
+
+ /* ----------------
+ * we calculate the block number of our transaction
+ * by dividing the transaction id by the number of
+ * transaction things per block.
+ * ----------------
+ */
+ if (relation == LogRelation)
+ itemsPerBlock = TP_NumXidStatusPerBlock;
+ else if (relation == TimeRelation)
+ itemsPerBlock = TP_NumTimePerBlock;
+ else
+ elog(WARN, "TransComputeBlockNumber: unknown relation");
+
+ /* ----------------
+ * warning! if the transaction id's get too large
+ * then a BlockNumber may not be large enough to hold the results
+ * of our division.
+ *
+ * XXX this will all vanish soon when we implement an improved
+ * transaction id schema -cim 3/23/90
+ *
+ * This has vanished now that xid's are 4 bytes (no longer 5).
+ * -mer 5/24/92
+ * ----------------
+ */
+ (*blockNumberOutP) = transactionId / itemsPerBlock;
+}
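+
+/* --------------------------------
+ * Worked example (BLCKSZ = 8192 is assumed here purely for
+ * illustration; the real constants live in the transam headers):
+ * with two status bits per xid, TP_NumXidStatusPerBlock would be
+ * 8192 * 4 = 32768, so transaction id 70000 falls on log block
+ * 70000 / 32768 = 2, at index 70000 % 32768 = 4464 within that block.
+ * --------------------------------
+ */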
+
+
+/* ----------------------------------------------------------------
+ * trans block support routines
+ * ----------------------------------------------------------------
+ */
+
+/* --------------------------------
+ * TransBlockGetLastTransactionIdStatus
+ *
+ * This returns the status and transaction id of the last
+ * transaction information recorded on the given TransBlock.
+ * --------------------------------
+ */
+
+XidStatus
+TransBlockGetLastTransactionIdStatus(Block tblock,
+ TransactionId baseXid,
+ TransactionId *returnXidP)
+{
+ Index index;
+ Index maxIndex;
+ bits8 bit1;
+ bits8 bit2;
+ BitIndex offset;
+ XidStatus xstatus;
+
+ /* ----------------
+ * sanity check
+ * ----------------
+ */
+ Assert((tblock != NULL));
+
+ /* ----------------
+ * search downward from the top of the block data, looking
+ * for the first non-in-progress transaction status. Since we
+ * are scanning backward, this will be the last recorded
+ * transaction status on the block.
+ * ----------------
+ */
+ xstatus = XID_INPROGRESS; /* default if no recorded status is found */
+ maxIndex = TP_NumXidStatusPerBlock;
+ for (index = maxIndex; index-- > 0; ) {
+ offset = BitIndexOf(index);
+ bit1 = ((bits8) BitArrayBitIsSet((BitArray) tblock, offset++)) << 1;
+ bit2 = (bits8) BitArrayBitIsSet((BitArray) tblock, offset);
+
+ xstatus = (bit1 | bit2);
+
+ /* ----------------
+ * here we have the status of some transaction, so test
+ * whether the status is recorded as "in progress". If it
+ * is not, this is the last recorded transaction and we save
+ * its id in the place specified by the caller.
+ * ----------------
+ */
+ if (xstatus != XID_INPROGRESS) {
+ if (returnXidP != NULL) {
+ TransactionIdStore(baseXid, returnXidP);
+ TransactionIdAdd(returnXidP, index);
+ }
+ break;
+ }
+ }
+
+ /* ----------------
+ * if xstatus is still XID_INPROGRESS here, it means we couldn't
+ * find a non-in-progress transaction on the block. For now we just
+ * return this info to the user. They can check if the returned
+ * status is "in progress" to know this condition has arisen.
+ * ----------------
+ */
+ if (xstatus == XID_INPROGRESS) {
+ if (returnXidP != NULL)
+ TransactionIdStore(baseXid, returnXidP);
+ }
+
+ /* ----------------
+ * return the status to the user
+ * ----------------
+ */
+ return xstatus;
+}
+
+/* --------------------------------
+ * TransBlockGetXidStatus
+ *
+ * This returns the status of the desired transaction
+ * --------------------------------
+ */
+
+XidStatus
+TransBlockGetXidStatus(Block tblock,
+ TransactionId transactionId)
+{
+ Index index;
+ bits8 bit1;
+ bits8 bit2;
+ BitIndex offset;
+
+ /* ----------------
+ * sanity check
+ * ----------------
+ */
+ if (tblock == NULL) {
+ return XID_INVALID;
+ }
+
+ /* ----------------
+ * calculate the index into the transaction data where
+ * our transaction status is located
+ *
+ * XXX this will be replaced soon when we move to the
+ * new transaction id scheme -cim 3/23/90
+ *
+ * The old system has now been replaced. -mer 5/24/92
+ * ----------------
+ */
+ index = transactionId % TP_NumXidStatusPerBlock;
+
+ /* ----------------
+ * get the data at the specified index
+ * ----------------
+ */
+ offset = BitIndexOf(index);
+ bit1 = ((bits8) BitArrayBitIsSet((BitArray) tblock, offset++)) << 1;
+ bit2 = (bits8) BitArrayBitIsSet((BitArray) tblock, offset);
+
+ /* ----------------
+ * return the transaction status to the caller
+ * ----------------
+ */
+ return (XidStatus)
+ (bit1 | bit2);
+}
+
+/* --------------------------------
+ * TransBlockSetXidStatus
+ *
+ * This sets the status of the desired transaction
+ * --------------------------------
+ */
+void
+TransBlockSetXidStatus(Block tblock,
+ TransactionId transactionId,
+ XidStatus xstatus)
+{
+ Index index;
+ BitIndex offset;
+
+ /* ----------------
+ * sanity check
+ * ----------------
+ */
+ if (tblock == NULL)
+ return;
+
+ /* ----------------
+ * calculate the index into the transaction data where
+ * we should store our transaction status.
+ *
+ * XXX this will be replaced soon when we move to the
+ * new transaction id scheme -cim 3/23/90
+ *
+ * The new scheme is here -mer 5/24/92
+ * ----------------
+ */
+ index = transactionId % TP_NumXidStatusPerBlock;
+
+ offset = BitIndexOf(index);
+
+ /* ----------------
+ * store the transaction value at the specified offset
+ * ----------------
+ */
+ switch(xstatus) {
+ case XID_COMMIT: /* set 10 */
+ BitArraySetBit((BitArray) tblock, offset);
+ BitArrayClearBit((BitArray) tblock, offset + 1);
+ break;
+ case XID_ABORT: /* set 01 */
+ BitArrayClearBit((BitArray) tblock, offset);
+ BitArraySetBit((BitArray) tblock, offset + 1);
+ break;
+ case XID_INPROGRESS: /* set 00 */
+ BitArrayClearBit((BitArray) tblock, offset);
+ BitArrayClearBit((BitArray) tblock, offset + 1);
+ break;
+ default:
+ elog(NOTICE,
+ "TransBlockSetXidStatus: invalid status: %d (ignored)",
+ xstatus);
+ break;
+ }
+}
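+
+/* --------------------------------
+ * Summary of the two-bit encoding used above, with the first bit
+ * at "offset" and the second at "offset + 1":
+ *
+ * XID_COMMIT 1 0
+ * XID_ABORT 0 1
+ * XID_INPROGRESS 0 0
+ *
+ * TransBlockGetXidStatus() reconstructs the status as
+ * (bit1 << 1) | bit2, the inverse of this switch.
+ * --------------------------------
+ */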
+
+/* --------------------------------
+ * TransBlockGetCommitTime
+ *
+ * This returns the transaction commit time for the
+ * specified transaction id in the trans block.
+ * --------------------------------
+ */
+AbsoluteTime
+TransBlockGetCommitTime(Block tblock,
+ TransactionId transactionId)
+{
+ Index index;
+ AbsoluteTime *timeArray;
+
+ /* ----------------
+ * sanity check
+ * ----------------
+ */
+ if (tblock == NULL)
+ return INVALID_ABSTIME;
+
+ /* ----------------
+ * calculate the index into the transaction data where
+ * our transaction commit time is located
+ *
+ * XXX this will be replaced soon when we move to the
+ * new transaction id scheme -cim 3/23/90
+ *
+ * The new scheme is here. -mer 5/24/92
+ * ----------------
+ */
+ index = transactionId % TP_NumTimePerBlock;
+
+ /* ----------------
+ * return the commit time to the caller
+ * ----------------
+ */
+ timeArray = (AbsoluteTime *) tblock;
+ return (AbsoluteTime)
+ timeArray[ index ];
+}
+
+/* --------------------------------
+ * TransBlockSetCommitTime
+ *
+ * This sets the commit time of the specified transaction
+ * --------------------------------
+ */
+void
+TransBlockSetCommitTime(Block tblock,
+ TransactionId transactionId,
+ AbsoluteTime commitTime)
+{
+ Index index;
+ AbsoluteTime *timeArray;
+
+ /* ----------------
+ * sanity check
+ * ----------------
+ */
+ if (tblock == NULL)
+ return;
+
+
+ /* ----------------
+ * calculate the index into the transaction data where
+ * we should store our transaction status.
+ *
+ * XXX this will be replaced soon when we move to the
+ * new transaction id scheme -cim 3/23/90
+ *
+ * The new scheme is here. -mer 5/24/92
+ * ----------------
+ */
+ index = transactionId % TP_NumTimePerBlock;
+
+ /* ----------------
+ * store the transaction commit time at the specified index
+ * ----------------
+ */
+ timeArray = (AbsoluteTime *) tblock;
+ timeArray[ index ] = commitTime;
+}
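+
+/* --------------------------------
+ * Layout sketch: a time block is just an array of AbsoluteTime.
+ * Assuming (for illustration only) BLCKSZ = 8192 and 4-byte times,
+ * TP_NumTimePerBlock would be 2048, so xid 5000's commit time
+ * would live at timeArray[5000 % 2048] = timeArray[904] of time
+ * block 5000 / 2048 = 2.
+ * --------------------------------
+ */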
+
+/* ----------------------------------------------------------------
+ * transam i/o support routines
+ * ----------------------------------------------------------------
+ */
+
+/* --------------------------------
+ * TransBlockNumberGetXidStatus
+ * --------------------------------
+ */
+XidStatus
+TransBlockNumberGetXidStatus(Relation relation,
+ BlockNumber blockNumber,
+ TransactionId xid,
+ bool *failP)
+{
+ Buffer buffer; /* buffer associated with block */
+ Block block; /* block containing xstatus */
+ XidStatus xstatus; /* recorded status of xid */
+ bool localfail; /* bool used if failP = NULL */
+
+ /* ----------------
+ * SOMEDAY place a read lock on the log relation
+ * That someday is today 5 Aug 1991 -mer
+ * ----------------
+ */
+ RelationSetLockForRead(relation);
+
+ /* ----------------
+ * get the page containing the transaction information
+ * ----------------
+ */
+ buffer = ReadBuffer(relation, blockNumber);
+ block = BufferGetBlock(buffer);
+
+ /* ----------------
+ * get the status from the block. note, for now we always
+ * return false in failP.
+ * ----------------
+ */
+ if (failP == NULL)
+ failP = &localfail;
+ (*failP) = false;
+
+ xstatus = TransBlockGetXidStatus(block, xid);
+
+ /* ----------------
+ * release the buffer and return the status
+ * ----------------
+ */
+ ReleaseBuffer(buffer);
+
+ /* ----------------
+ * SOMEDAY release our lock on the log relation
+ * ----------------
+ */
+ RelationUnsetLockForRead(relation);
+
+ return
+ xstatus;
+}
+
+/* --------------------------------
+ * TransBlockNumberSetXidStatus
+ * --------------------------------
+ */
+void
+TransBlockNumberSetXidStatus(Relation relation,
+ BlockNumber blockNumber,
+ TransactionId xid,
+ XidStatus xstatus,
+ bool *failP)
+{
+ Buffer buffer; /* buffer associated with block */
+ Block block; /* block containing xstatus */
+ bool localfail; /* bool used if failP = NULL */
+
+ /* ----------------
+ * SOMEDAY gain exclusive access to the log relation
+ *
+ * That someday is today 5 Aug 1991 -mer
+ * ----------------
+ */
+ RelationSetLockForWrite(relation);
+
+ /* ----------------
+ * get the block containing the transaction status
+ * ----------------
+ */
+ buffer = ReadBuffer(relation, blockNumber);
+ block = BufferGetBlock(buffer);
+
+ /* ----------------
+ * attempt to update the status of the transaction on the block.
+ * if we are successful, write the block. otherwise release the buffer.
+ * note, for now we always return false in failP.
+ * ----------------
+ */
+ if (failP == NULL)
+ failP = &localfail;
+ (*failP) = false;
+
+ TransBlockSetXidStatus(block, xid, xstatus);
+
+ if ((*failP) == false)
+ WriteBuffer(buffer);
+ else
+ ReleaseBuffer(buffer);
+
+ /* ----------------
+ * SOMEDAY release our lock on the log relation
+ * ----------------
+ */
+ RelationUnsetLockForWrite(relation);
+}
+
+/* --------------------------------
+ * TransBlockNumberGetCommitTime
+ * --------------------------------
+ */
+AbsoluteTime
+TransBlockNumberGetCommitTime(Relation relation,
+ BlockNumber blockNumber,
+ TransactionId xid,
+ bool *failP)
+{
+ Buffer buffer; /* buffer associated with block */
+ Block block; /* block containing commit time */
+ bool localfail; /* bool used if failP = NULL */
+ AbsoluteTime xtime; /* commit time */
+
+ /* ----------------
+ * SOMEDAY place a read lock on the time relation
+ *
+ * That someday is today 5 Aug. 1991 -mer
+ * ----------------
+ */
+ RelationSetLockForRead(relation);
+
+ /* ----------------
+ * get the block containing the transaction information
+ * ----------------
+ */
+ buffer = ReadBuffer(relation, blockNumber);
+ block = BufferGetBlock(buffer);
+
+ /* ----------------
+ * get the commit time from the block
+ * note, for now we always return false in failP.
+ * ----------------
+ */
+ if (failP == NULL)
+ failP = &localfail;
+ (*failP) = false;
+
+ xtime = TransBlockGetCommitTime(block, xid);
+
+ /* ----------------
+ * release the buffer and return the commit time
+ * ----------------
+ */
+ ReleaseBuffer(buffer);
+
+ /* ----------------
+ * SOMEDAY release our lock on the time relation
+ * ----------------
+ */
+ RelationUnsetLockForRead(relation);
+
+ if ((*failP) == false)
+ return xtime;
+ else
+ return INVALID_ABSTIME;
+
+}
+
+/* --------------------------------
+ * TransBlockNumberSetCommitTime
+ * --------------------------------
+ */
+void
+TransBlockNumberSetCommitTime(Relation relation,
+ BlockNumber blockNumber,
+ TransactionId xid,
+ AbsoluteTime xtime,
+ bool *failP)
+{
+ Buffer buffer; /* buffer associated with block */
+ Block block; /* block containing commit time */
+ bool localfail; /* bool used if failP = NULL */
+
+ /* ----------------
+ * SOMEDAY gain exclusive access to the time relation
+ *
+ * That someday is today 5 Aug. 1991 -mer
+ * ----------------
+ */
+ RelationSetLockForWrite(relation);
+
+ /* ----------------
+ * get the block containing our commit time
+ * ----------------
+ */
+ buffer = ReadBuffer(relation, blockNumber);
+ block = BufferGetBlock(buffer);
+
+ /* ----------------
+ * attempt to update the commit time of the transaction on the block.
+ * if we are successful, write the block. otherwise release the buffer.
+ * note, for now we always return false in failP.
+ * ----------------
+ */
+ if (failP == NULL)
+ failP = &localfail;
+ (*failP) = false;
+
+ TransBlockSetCommitTime(block, xid, xtime);
+
+ if ((*failP) == false)
+ WriteBuffer(buffer);
+ else
+ ReleaseBuffer(buffer);
+
+ /* ----------------
+ * SOMEDAY release our lock on the time relation
+ * ----------------
+ */
+ RelationUnsetLockForWrite(relation);
+
+}
+
+/* --------------------------------
+ * TransGetLastRecordedTransaction
+ * --------------------------------
+ */
+void
+TransGetLastRecordedTransaction(Relation relation,
+ TransactionId *xidP, /* return: transaction id */
+ bool *failP)
+{
+ BlockNumber blockNumber; /* block number */
+ Buffer buffer; /* buffer associated with block */
+ Block block; /* block containing xid status */
+ BlockNumber n; /* number of blocks in the relation */
+ TransactionId baseXid;
+
+ (*failP) = false;
+
+ /* ----------------
+ * SOMEDAY gain exclusive access to the log relation
+ *
+ * That someday is today 5 Aug. 1991 -mer
+ * It looks to me like we only need to set a read lock here, despite
+ * the above comment about exclusive access. The block is never
+ * actually written into, we only check status bits.
+ * ----------------
+ */
+ RelationSetLockForRead(relation);
+
+ /* ----------------
+ * we assume the last block of the log contains the last
+ * recorded transaction. If the relation is empty we return
+ * failure to the user.
+ * ----------------
+ */
+ n = RelationGetNumberOfBlocks(relation);
+ if (n == 0) {
+ (*failP) = true;
+ RelationUnsetLockForRead(relation);
+ return;
+ }
+
+ /* ----------------
+ * get the block containing the transaction information
+ * ----------------
+ */
+ blockNumber = n-1;
+ buffer = ReadBuffer(relation, blockNumber);
+ block = BufferGetBlock(buffer);
+
+ /* ----------------
+ * get the last xid on the block
+ * ----------------
+ */
+ baseXid = blockNumber * TP_NumXidStatusPerBlock;
+
+ (void) TransBlockGetLastTransactionIdStatus(block, baseXid, xidP);
+
+ ReleaseBuffer(buffer);
+
+ /* ----------------
+ * SOMEDAY release our lock on the log relation
+ * ----------------
+ */
+ RelationUnsetLockForRead(relation);
+}
diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c
new file mode 100644
index 00000000000..a53cc7d35b1
--- /dev/null
+++ b/src/backend/access/transam/varsup.c
@@ -0,0 +1,606 @@
+/*-------------------------------------------------------------------------
+ *
+ * varsup.c--
+ * postgres variable relation support routines
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/transam/varsup.c,v 1.1.1.1 1996/07/09 06:21:13 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <math.h>
+#include "postgres.h"
+
+#include "machine.h" /* in port/ directory (needed for BLCKSZ) */
+#include "storage/buf.h"
+#include "storage/bufmgr.h"
+#include "storage/ipc.h" /* for OIDGENLOCKID */
+
+#include "utils/rel.h"
+#include "utils/elog.h"
+
+#include "access/heapam.h"
+#include "access/transam.h" /* where the declarations go */
+#include "access/xact.h" /* where the declarations go */
+
+#include "catalog/catname.h"
+
+/* ----------
+ * note: we reserve the first 16384 object ids for internal use.
+ * oid's less than this appear in the .bki files. the choice of
+ * 16384 is completely arbitrary.
+ * ----------
+ */
+#define BootstrapObjectIdData 16384
+
+/* ---------------------
+ * spin lock for oid generation
+ * ---------------------
+ */
+int OidGenLockId;
+
+/* ----------------------------------------------------------------
+ * variable relation query/update routines
+ * ----------------------------------------------------------------
+ */
+
+/* --------------------------------
+ * VariableRelationGetNextXid
+ * --------------------------------
+ */
+void
+VariableRelationGetNextXid(TransactionId *xidP)
+{
+ Buffer buf;
+ VariableRelationContents var;
+
+ /* ----------------
+ * We assume that a spinlock has been acquired to guarantee
+ * exclusive access to the variable relation.
+ * ----------------
+ */
+
+ /* ----------------
+ * do nothing before things are initialized
+ * ----------------
+ */
+ if (! RelationIsValid(VariableRelation))
+ return;
+
+ /* ----------------
+ * read the variable page, get the nextXid field and
+ * release the buffer
+ * ----------------
+ */
+ buf = ReadBuffer(VariableRelation, 0);
+
+ if (! BufferIsValid(buf))
+ {
+ SpinRelease(OidGenLockId);
+ elog(WARN, "VariableRelationGetNextXid: ReadBuffer failed");
+ }
+
+ var = (VariableRelationContents) BufferGetBlock(buf);
+
+ TransactionIdStore(var->nextXidData, xidP);
+ ReleaseBuffer(buf);
+}
+
+/* --------------------------------
+ * VariableRelationGetLastXid
+ * --------------------------------
+ */
+void
+VariableRelationGetLastXid(TransactionId *xidP)
+{
+ Buffer buf;
+ VariableRelationContents var;
+
+ /* ----------------
+ * We assume that a spinlock has been acquired to guarantee
+ * exclusive access to the variable relation.
+ * ----------------
+ */
+
+ /* ----------------
+ * do nothing before things are initialized
+ * ----------------
+ */
+ if (! RelationIsValid(VariableRelation))
+ return;
+
+ /* ----------------
+ * read the variable page, get the lastXid field and
+ * release the buffer
+ * ----------------
+ */
+ buf = ReadBuffer(VariableRelation, 0);
+
+ if (! BufferIsValid(buf))
+ {
+ SpinRelease(OidGenLockId);
+ elog(WARN, "VariableRelationGetNextXid: ReadBuffer failed");
+ }
+
+ var = (VariableRelationContents) BufferGetBlock(buf);
+
+ TransactionIdStore(var->lastXidData, xidP);
+
+ ReleaseBuffer(buf);
+}
+
+/* --------------------------------
+ * VariableRelationPutNextXid
+ * --------------------------------
+ */
+void
+VariableRelationPutNextXid(TransactionId xid)
+{
+ Buffer buf;
+ VariableRelationContents var;
+
+ /* ----------------
+ * We assume that a spinlock has been acquired to guarantee
+ * exclusive access to the variable relation.
+ * ----------------
+ */
+
+ /* ----------------
+ * do nothing before things are initialized
+ * ----------------
+ */
+ if (! RelationIsValid(VariableRelation))
+ return;
+
+ /* ----------------
+ * read the variable page, update the nextXid field and
+ * write the page back out to disk.
+ * ----------------
+ */
+ buf = ReadBuffer(VariableRelation, 0);
+
+ if (! BufferIsValid(buf))
+ {
+ SpinRelease(OidGenLockId);
+ elog(WARN, "VariableRelationPutNextXid: ReadBuffer failed");
+ }
+
+ var = (VariableRelationContents) BufferGetBlock(buf);
+
+ TransactionIdStore(xid, &(var->nextXidData));
+
+ WriteBuffer(buf);
+}
+
+/* --------------------------------
+ * VariableRelationPutLastXid
+ * --------------------------------
+ */
+void
+VariableRelationPutLastXid(TransactionId xid)
+{
+ Buffer buf;
+ VariableRelationContents var;
+
+ /* ----------------
+ * We assume that a spinlock has been acquired to guarantee
+ * exclusive access to the variable relation.
+ * ----------------
+ */
+
+ /* ----------------
+ * do nothing before things are initialized
+ * ----------------
+ */
+ if (! RelationIsValid(VariableRelation))
+ return;
+
+ /* ----------------
+ * read the variable page, update the lastXid field and
+ * force the page back out to disk.
+ * ----------------
+ */
+ buf = ReadBuffer(VariableRelation, 0);
+
+ if (! BufferIsValid(buf))
+ {
+ SpinRelease(OidGenLockId);
+ elog(WARN, "VariableRelationPutLastXid: ReadBuffer failed");
+ }
+
+ var = (VariableRelationContents) BufferGetBlock(buf);
+
+ TransactionIdStore(xid, &(var->lastXidData));
+
+ WriteBuffer(buf);
+}
+
+/* --------------------------------
+ * VariableRelationGetNextOid
+ * --------------------------------
+ */
+void
+VariableRelationGetNextOid(Oid *oid_return)
+{
+ Buffer buf;
+ VariableRelationContents var;
+
+ /* ----------------
+ * We assume that a spinlock has been acquired to guarantee
+ * exclusive access to the variable relation.
+ * ----------------
+ */
+
+ /* ----------------
+ * if the variable relation is not initialized, then we
+ * assume we are running at bootstrap time and so we return
+ * an invalid object id -- during this time GetNextBootstrapObjectId
+ * should be called instead..
+ * ----------------
+ */
+ if (! RelationIsValid(VariableRelation)) {
+ if (PointerIsValid(oid_return))
+ (*oid_return) = InvalidOid;
+ return;
+ }
+
+ /* ----------------
+ * read the variable page, get the nextOid field and
+ * release the buffer
+ * ----------------
+ */
+ buf = ReadBuffer(VariableRelation, 0);
+
+ if (! BufferIsValid(buf))
+ {
+ SpinRelease(OidGenLockId);
+ elog(WARN, "VariableRelationGetNextXid: ReadBuffer failed");
+ }
+
+ var = (VariableRelationContents) BufferGetBlock(buf);
+
+ if (PointerIsValid(oid_return)) {
+
+ /* ----------------
+ * nothing up my sleeve... what's going on here is that this code
+ * is guaranteed never to be called until all files in data/base/
+ * are created, and the template database exists. at that point,
+ * we want to append a pg_database tuple. the first time we do
+ * this, the oid stored in pg_variable will be bogus, so we use
+ * a bootstrap value defined at the top of this file.
+ *
+ * this comment no longer holds true. This code is called before
+ * all of the files in data/base are created and you can't rely
+ * on system oid's to be less than BootstrapObjectIdData. mer 9/18/91
+ * ----------------
+ */
+ if (OidIsValid(var->nextOid))
+ (*oid_return) = var->nextOid;
+ else
+ (*oid_return) = BootstrapObjectIdData;
+ }
+
+ ReleaseBuffer(buf);
+}
+
+/* --------------------------------
+ * VariableRelationPutNextOid
+ * --------------------------------
+ */
+void
+VariableRelationPutNextOid(Oid *oidP)
+{
+ Buffer buf;
+ VariableRelationContents var;
+
+ /* ----------------
+ * We assume that a spinlock has been acquired to guarantee
+ * exclusive access to the variable relation.
+ * ----------------
+ */
+
+ /* ----------------
+ * do nothing before things are initialized
+ * ----------------
+ */
+ if (! RelationIsValid(VariableRelation))
+ return;
+
+ /* ----------------
+ * sanity check
+ * ----------------
+ */
+ if (! PointerIsValid(oidP))
+ {
+ SpinRelease(OidGenLockId);
+ elog(WARN, "VariableRelationPutNextOid: invalid oid pointer");
+ }
+
+ /* ----------------
+ * read the variable page, update the nextOid field and
+ * write the page back out to disk.
+ * ----------------
+ */
+ buf = ReadBuffer(VariableRelation, 0);
+
+ if (! BufferIsValid(buf))
+ {
+ SpinRelease(OidGenLockId);
+ elog(WARN, "VariableRelationPutNextXid: ReadBuffer failed");
+ }
+
+ var = (VariableRelationContents) BufferGetBlock(buf);
+
+ var->nextOid = (*oidP);
+
+ WriteBuffer(buf);
+}
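+
+/* --------------------------------
+ * All of the routines above address block 0 of pg_variable
+ * through VariableRelationContents, which (presumably declared in
+ * access/transam.h, included above) bundles the nextXidData,
+ * lastXidData and nextOid fields read and written here.
+ * --------------------------------
+ */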
+
+/* ----------------------------------------------------------------
+ * transaction id generation support
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------
+ * GetNewTransactionId
+ *
+ * In the version 2 transaction system, transaction id's are
+ * restricted in several ways.
+ *
+ * First, all transaction id's are even numbers (4, 88, 121342, etc).
+ * This means the binary representation of the number will never
+ * have the least significant bit set.  This bit is reserved to
+ * indicate that the transaction id does not in fact hold an XID,
+ * but rather a commit time. This makes it possible for the
+ * vacuum daemon to discard information from the log and time
+ * relations for committed tuples. This is important when archiving
+ * tuples to an optical disk because tuples with commit times
+ * stored in their xid fields will not need to consult the log
+ * and time relations.
+ *
+ * Second, since we may someday perform compression of the data
+ * in the log and time relations, we cause the numbering of the
+ * transaction ids to begin at 512. This means that some space
+ * on the page of the log and time relations corresponding to
+ * transaction id's 0 - 510 will never be used. This space is
+ * in fact used to store the version number of the postgres
+ * transaction log and will someday store compression information
+ * about the log.
+ *
+ * Lastly, rather than access the variable relation each time
+ * a backend requests a new transaction id, we "prefetch" 32
+ * transaction id's by incrementing the nextXid stored in the
+ * var relation by 32 and then returning these id's one at a
+ * time until they are exhausted. (Xid's are no longer restricted
+ * to even values; see the note in GetNewTransactionId() below.)
+ * This means we reduce the number of accesses to the variable
+ * relation by 32 for each backend.
+ *
+ * Note: 32 has no special significance. We don't want the
+ * number to be too large because when the backend
+ * terminates, we lose the xid's we cached.
+ *
+ * ----------------
+ */
+
+#define VAR_XID_PREFETCH 32
+
+static int prefetched_xid_count = 0;
+static TransactionId next_prefetched_xid;
+
+void
+GetNewTransactionId(TransactionId *xid)
+{
+ TransactionId nextid;
+
+ /* ----------------
+ * during bootstrap initialization, we return the special
+ * bootstrap transaction id.
+ * ----------------
+ */
+ if (AMI_OVERRIDE) {
+ TransactionIdStore(AmiTransactionId, xid);
+ return;
+ }
+
+ /* ----------------
+ * if we run out of prefetched xids, then we get some
+ * more before handing them out to the caller.
+ * ----------------
+ */
+
+ if (prefetched_xid_count == 0) {
+ /* ----------------
+ * obtain exclusive access to the variable relation page
+ *
+ * get the "next" xid from the variable relation
+ * and save it in the prefetched id.
+ * ----------------
+ */
+ SpinAcquire(OidGenLockId);
+ VariableRelationGetNextXid(&nextid);
+ TransactionIdStore(nextid, &next_prefetched_xid);
+
+ /* ----------------
+ * now increment the variable relation's next xid
+ * and reset the prefetched_xid_count. We advance nextid
+ * by the full prefetch count, since xid's are handed out
+ * consecutively (both even and odd values are used).
+ * ----------------
+ */
+ prefetched_xid_count = VAR_XID_PREFETCH;
+ TransactionIdAdd(&nextid, prefetched_xid_count);
+ VariableRelationPutNextXid(nextid);
+ SpinRelease(OidGenLockId);
+ }
+
+ /* ----------------
+ * return the next prefetched xid in the pointer passed by
+ * the user and decrement the prefetch count. We add one
+ * to the id we return the next time this is called.
+ *
+ * XXX Transaction Ids used to be even as the low order bit was
+ * used to determine commit status. This is no longer true so
+ * we now use both even and odd transaction ids. -mer 5/26/92
+ * ----------------
+ */
+ TransactionIdStore(next_prefetched_xid, xid);
+ TransactionIdAdd(&next_prefetched_xid, 1);
+ prefetched_xid_count--;
+}
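+
+/* ----------------
+ * Usage sketch (hypothetical caller):
+ *
+ * TransactionId xid;
+ *
+ * GetNewTransactionId(&xid);
+ *
+ * The first call pays one variable relation access under the
+ * OidGenLockId spinlock; the following 31 calls are served from
+ * the prefetched range.
+ * ----------------
+ */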
+
+/* ----------------
+ * UpdateLastCommittedXid
+ * ----------------
+ */
+
+void
+UpdateLastCommittedXid(TransactionId xid)
+{
+ TransactionId lastid;
+
+
+ /* we assume that spinlock OidGenLockId has been acquired
+ * prior to entering this function
+ */
+
+ /* ----------------
+ * get the "last committed" transaction id from
+ * the variable relation page.
+ * ----------------
+ */
+ VariableRelationGetLastXid(&lastid);
+
+ /* ----------------
+ * if the transaction id is greater than the last committed
+ * transaction then we update the last committed transaction
+ * in the variable relation.
+ * ----------------
+ */
+ if (TransactionIdIsLessThan(lastid, xid))
+ VariableRelationPutLastXid(xid);
+
+}
+
+/* ----------------------------------------------------------------
+ * object id generation support
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------
+ * GetNewObjectIdBlock
+ *
+ * This support function is used to allocate a block of object ids
+ * of the given size. Applications wishing to do their own object
+ * id assignments should use this.
+ * ----------------
+ */
+void
+GetNewObjectIdBlock(Oid *oid_return, /* place to return the new object id */
+ int oid_block_size) /* number of oids desired */
+{
+ Oid nextoid;
+
+ /* ----------------
+ * SOMEDAY obtain exclusive access to the variable relation page
+ * That someday is today -mer 6 Aug 1992
+ * ----------------
+ */
+ SpinAcquire(OidGenLockId);
+
+ /* ----------------
+ * get the "next" oid from the variable relation
+ * and give it to the caller.
+ * ----------------
+ */
+ VariableRelationGetNextOid(&nextoid);
+ if (PointerIsValid(oid_return))
+ (*oid_return) = nextoid;
+
+ /* ----------------
+ * now increment the variable relation's next oid
+ * field by the size of the oid block requested.
+ * ----------------
+ */
+ nextoid += oid_block_size;
+ VariableRelationPutNextOid(&nextoid);
+
+ /* ----------------
+ * SOMEDAY relinquish our lock on the variable relation page
+ * That someday is today -mer 6 Apr 1992
+ * ----------------
+ */
+ SpinRelease(OidGenLockId);
+}
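+
+/* ----------------
+ * Usage sketch (hypothetical caller reserving its own oid range):
+ *
+ * Oid base;
+ *
+ * GetNewObjectIdBlock(&base, 128);
+ *
+ * afterwards oids base .. base + 127 belong to the caller and will
+ * not be handed out again from the variable relation.
+ * ----------------
+ */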
+
+/* ----------------
+ * GetNewObjectId
+ *
+ * This function allocates and parses out object ids. Like
+ * GetNewTransactionId(), it "prefetches" 32 object ids by
+ * incrementing the nextOid stored in the var relation by 32 and then
+ * returning these id's one at a time until they are exhausted.
+ * This means we reduce the number of accesses to the variable
+ * relation by 32 for each backend.
+ *
+ * Note: 32 has no special significance. We don't want the
+ * number to be too large because when the backend
+ * terminates, we lose the oids we cached.
+ *
+ * ----------------
+ */
+
+#define VAR_OID_PREFETCH 32
+
+static int prefetched_oid_count = 0;
+static Oid next_prefetched_oid;
+
+void
+GetNewObjectId(Oid *oid_return) /* place to return the new object id */
+{
+ /* ----------------
+ * if we run out of prefetched oids, then we get some
+ * more before handing them out to the caller.
+ * ----------------
+ */
+
+ if (prefetched_oid_count == 0) {
+ int oid_block_size = VAR_OID_PREFETCH;
+
+ /* ----------------
+ * during bootstrap time, we want to allocate oids
+ * one at a time. Otherwise there might be some
+ * bootstrap oid's left in the block we prefetch which
+ * would be passed out after the variable relation was
+ * initialized. This would be bad.
+ * ----------------
+ */
+ if (! RelationIsValid(VariableRelation))
+ VariableRelation = heap_openr(VariableRelationName);
+
+ /* ----------------
+ * get a new block of prefetched object ids.
+ * ----------------
+ */
+ GetNewObjectIdBlock(&next_prefetched_oid, oid_block_size);
+
+ /* ----------------
+ * now reset the prefetched_oid_count.
+ * ----------------
+ */
+ prefetched_oid_count = oid_block_size;
+ }
+
+ /* ----------------
+ * return the next prefetched oid in the pointer passed by
+ * the user and decrement the prefetch count.
+ * ----------------
+ */
+ if (PointerIsValid(oid_return))
+ (*oid_return) = next_prefetched_oid;
+
+ next_prefetched_oid++;
+ prefetched_oid_count--;
+}
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
new file mode 100644
index 00000000000..1798d09d054
--- /dev/null
+++ b/src/backend/access/transam/xact.c
@@ -0,0 +1,1314 @@
+/*-------------------------------------------------------------------------
+ *
+ * xact.c--
+ * top level transaction system support routines
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/transam/xact.c,v 1.1.1.1 1996/07/09 06:21:13 scrappy Exp $
+ *
+ * NOTES
+ * Transaction aborts can now occur two ways:
+ *
+ * 1) system dies from some internal cause (Assert, etc..)
+ * 2) user types abort
+ *
+ * These two cases used to be treated identically, but now
+ * we need to distinguish them. Why? Consider the following
+ * two situations:
+ *
+ * case 1 case 2
+ * ------ ------
+ * 1) user types BEGIN 1) user types BEGIN
+ * 2) user does something 2) user does something
+ * 3) user does not like what 3) system aborts for some reason
+ * she sees and types ABORT
+ *
+ * In case 1, we want to abort the transaction and return to the
+ * default state. In case 2, there may be more commands coming
+ * our way which are part of the same transaction block and we have
+ * to ignore these commands until we see an END transaction.
+ *
+ * Internal aborts are now handled by AbortTransactionBlock(), just as
+ * they always have been, and user aborts are now handled by
+ * UserAbortTransactionBlock(). Both of them rely on AbortTransaction()
+ * to do all the real work. The only difference is what state we
+ * enter after AbortTransaction() does its work:
+ *
+ * * AbortTransactionBlock() leaves us in TBLOCK_ABORT and
+ * * UserAbortTransactionBlock() leaves us in TBLOCK_ENDABORT
+ *
+ * NOTES
+ * This file is an attempt at a redesign of the upper layer
+ * of the V1 transaction system which was too poorly thought
+ * out to describe. This new system hopes to be both simpler
+ * in design and simpler to extend, and to contain added
+ * functionality to solve problems beyond the scope of the V1
+ * system. (In particular, communication of transaction
+ * information between parallel backends has to be supported)
+ *
+ * The essential aspects of the transaction system are:
+ *
+ * o transaction id generation
+ * o transaction log updating
+ * o memory cleanup
+ * o cache invalidation
+ * o lock cleanup
+ *
+ * Hence, the functional division of the transaction code is
+ * based on what of the above things need to be done during
+ * a start/commit/abort transaction. For instance, the
+ * routine AtCommit_Memory() takes care of all the memory
+ * cleanup stuff done at commit time.
+ *
+ * The code is layered as follows:
+ *
+ * StartTransaction
+ * CommitTransaction
+ * AbortTransaction
+ * UserAbortTransaction
+ *
+ * are provided to do the lower level work like recording
+ * the transaction status in the log and doing memory cleanup.
+ * above these routines are another set of functions:
+ *
+ * StartTransactionCommand
+ * CommitTransactionCommand
+ * AbortCurrentTransaction
+ *
+ * These are the routines used in the postgres main processing
+ * loop. They are sensitive to the current transaction block state
+ * and make calls to the lower level routines appropriately.
+ *
+ * Support for transaction blocks is provided via the functions:
+ *
+ * StartTransactionBlock
+ * CommitTransactionBlock
+ * AbortTransactionBlock
+ *
+ * These are invoked only in response to a user "BEGIN", "END",
+ * or "ABORT" command. The tricky part about these functions
+ * is that they are called within the postgres main loop, in between
+ * the StartTransactionCommand() and CommitTransactionCommand().
+ *
+ * For example, consider the following sequence of user commands:
+ *
+ * 1) begin
+ * 2) retrieve (foo.all)
+ * 3) append foo (bar = baz)
+ * 4) end
+ *
+ * in the main processing loop, this results in the following
+ * transaction sequence:
+ *
+ * / StartTransactionCommand();
+ * 1) / ProcessUtility(); << begin
+ * \ StartTransactionBlock();
+ * \ CommitTransactionCommand();
+ *
+ * / StartTransactionCommand();
+ * 2) < ProcessQuery(); << retrieve (foo.all)
+ * \ CommitTransactionCommand();
+ *
+ * / StartTransactionCommand();
+ * 3) < ProcessQuery(); << append foo (bar = baz)
+ * \ CommitTransactionCommand();
+ *
+ * / StartTransactionCommand();
+ * 4) / ProcessUtility(); << end
+ * \ CommitTransactionBlock();
+ * \ CommitTransactionCommand();
+ *
+ * The point of this example is to demonstrate the need for
+ * StartTransactionCommand() and CommitTransactionCommand() to
+ * be state smart -- they should do nothing in between the calls
+ * to StartTransactionBlock() and EndTransactionBlock() and
+ * outside these calls they need to do normal start/commit
+ * processing.
+ *
+ * Furthermore, suppose the "retrieve (foo.all)" caused an abort
+ * condition. We would then want to abort the transaction and
+ * ignore all subsequent commands up to the "end".
+ * -cim 3/23/90
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+#include "access/xact.h"
+#include "commands/async.h"
+#include "storage/bufmgr.h"
+#include "storage/block.h"
+#include "storage/proc.h"
+#include "utils/inval.h"
+#include "utils/relcache.h"
+#include "access/transam.h"
+#include "catalog/heap.h"
+
+/* ----------------
+ * global variables holding the current transaction state.
+ *
+ * Note: when we are running several slave processes, the
+ * current transaction state data is copied into shared memory
+ * and the CurrentTransactionState pointer changed to
+ * point to the shared copy. All this occurs in slaves.c
+ * ----------------
+ */
+TransactionStateData CurrentTransactionStateData = {
+ 0, /* transaction id */
+ FirstCommandId, /* command id */
+ 0x0, /* start time */
+ TRANS_DEFAULT, /* transaction state */
+ TBLOCK_DEFAULT /* transaction block state */
+ };
+
+TransactionState CurrentTransactionState =
+ &CurrentTransactionStateData;
+
+/* ----------------
+ * info returned when the system is disabled
+ *
+ * Note: I have no idea what the significance of the
+ * 1073741823 in DisabledStartTime is. I just carried
+ * this over when converting things from the old
+ * V1 transaction system. -cim 3/18/90
+ * ----------------
+ */
+TransactionId DisabledTransactionId = (TransactionId)-1;
+
+CommandId DisabledCommandId = (CommandId) -1;
+
+AbsoluteTime DisabledStartTime = (AbsoluteTime) 1073741823;
+
+/* ----------------
+ * overflow flag
+ * ----------------
+ */
+bool CommandIdCounterOverflowFlag;
+
+/* ----------------
+ * catalog creation transaction bootstrapping flag.
+ * This should be eliminated and added to the transaction
+ * state stuff. -cim 3/19/90
+ * ----------------
+ */
+bool AMI_OVERRIDE = false;
+
+/* ----------------------------------------------------------------
+ * transaction state accessors
+ * ----------------------------------------------------------------
+ */
+
+/* --------------------------------
+ * TransactionFlushEnabled()
+ * SetTransactionFlushEnabled()
+ *
+ * These are used to test and set the "TransactionFlushState"
+ * variable. If this variable is true (the default), then
+ * the system will flush all dirty buffers to disk at the end
+ * of each transaction. If false then we are assuming the
+ * buffer pool resides in stable main memory, in which case we
+ * only do writes as necessary.
+ * --------------------------------
+ */
+static int TransactionFlushState = 1;
+
+int
+TransactionFlushEnabled()
+{
+ return TransactionFlushState;
+}
+
+void
+SetTransactionFlushEnabled(bool state)
+{
+ TransactionFlushState = (state == true);
+}
+
+/* --------------------------------
+ * IsTransactionState
+ *
+ * This returns true if we are currently running a query
+ * within an executing transaction.
+ * --------------------------------
+ */
+bool
+IsTransactionState()
+{
+ TransactionState s = CurrentTransactionState;
+
+ switch (s->state) {
+ case TRANS_DEFAULT: return false;
+ case TRANS_START: return true;
+ case TRANS_INPROGRESS: return true;
+ case TRANS_COMMIT: return true;
+ case TRANS_ABORT: return true;
+ case TRANS_DISABLED: return false;
+ }
+ /*
+ * Shouldn't get here, but lint is not happy with this...
+ */
+ return(false);
+}
+
+/* --------------------------------
+ * IsAbortedTransactionBlockState
+ *
+ * This returns true if we are currently running a query
+ * within an aborted transaction block.
+ * --------------------------------
+ */
+bool
+IsAbortedTransactionBlockState()
+{
+ TransactionState s = CurrentTransactionState;
+
+ if (s->blockState == TBLOCK_ABORT)
+ return true;
+
+ return false;
+}
+
+/* --------------------------------
+ * OverrideTransactionSystem
+ *
+ * This is used to temporarily disable the transaction
+ * processing system in order to do initialization of
+ * the transaction system data structures and relations
+ * themselves.
+ * --------------------------------
+ */
+int SavedTransactionState;
+
+void
+OverrideTransactionSystem(bool flag)
+{
+ TransactionState s = CurrentTransactionState;
+
+ if (flag == true) {
+ if (s->state == TRANS_DISABLED)
+ return;
+
+ SavedTransactionState = s->state;
+ s->state = TRANS_DISABLED;
+ } else {
+ if (s->state != TRANS_DISABLED)
+ return;
+
+ s->state = SavedTransactionState;
+ }
+}
+
+/* --------------------------------
+ * GetCurrentTransactionId
+ *
+ * This returns the id of the current transaction, or
+ * the id of the "disabled" transaction.
+ * --------------------------------
+ */
+TransactionId
+GetCurrentTransactionId()
+{
+ TransactionState s = CurrentTransactionState;
+
+ /* ----------------
+ * if the transaction system is disabled, we return
+ * the special "disabled" transaction id.
+ * ----------------
+ */
+ if (s->state == TRANS_DISABLED)
+ return (TransactionId) DisabledTransactionId;
+
+ /* ----------------
+ * otherwise return the current transaction id.
+ * ----------------
+ */
+ return (TransactionId) s->transactionIdData;
+}
+
+
+/* --------------------------------
+ * GetCurrentCommandId
+ * --------------------------------
+ */
+CommandId
+GetCurrentCommandId()
+{
+ TransactionState s = CurrentTransactionState;
+
+ /* ----------------
+ * if the transaction system is disabled, we return
+ * the special "disabled" command id.
+ * ----------------
+ */
+ if (s->state == TRANS_DISABLED)
+ return (CommandId) DisabledCommandId;
+
+ return s->commandId;
+}
+
+
+/* --------------------------------
+ * GetCurrentTransactionStartTime
+ * --------------------------------
+ */
+AbsoluteTime
+GetCurrentTransactionStartTime()
+{
+ TransactionState s = CurrentTransactionState;
+
+ /* ----------------
+ * if the transaction system is disabled, we return
+ * the special "disabled" starting time.
+ * ----------------
+ */
+ if (s->state == TRANS_DISABLED)
+ return (AbsoluteTime) DisabledStartTime;
+
+ return s->startTime;
+}
+
+
+/* --------------------------------
+ * TransactionIdIsCurrentTransactionId
+ * --------------------------------
+ */
+bool
+TransactionIdIsCurrentTransactionId(TransactionId xid)
+{
+ TransactionState s = CurrentTransactionState;
+
+ if (AMI_OVERRIDE)
+ return false;
+
+ return (bool)
+ TransactionIdEquals(xid, s->transactionIdData);
+}
+
+
+/* --------------------------------
+ * CommandIdIsCurrentCommandId
+ * --------------------------------
+ */
+bool
+CommandIdIsCurrentCommandId(CommandId cid)
+{
+ TransactionState s = CurrentTransactionState;
+
+ if (AMI_OVERRIDE)
+ return false;
+
+ return
+ (cid == s->commandId) ? true : false;
+}
+
+
+/* --------------------------------
+ * ClearCommandIdCounterOverflowFlag
+ * --------------------------------
+ */
+void
+ClearCommandIdCounterOverflowFlag()
+{
+ CommandIdCounterOverflowFlag = false;
+}
+
+
+/* --------------------------------
+ * CommandCounterIncrement
+ * --------------------------------
+ */
+void
+CommandCounterIncrement()
+{
+ CurrentTransactionStateData.commandId += 1;
+ if (CurrentTransactionStateData.commandId == FirstCommandId) {
+ CommandIdCounterOverflowFlag = true;
+ elog(WARN, "You may only have 65535 commands per transaction");
+ }
+
+ /* make cache changes visible to me */
+ AtCommit_Cache();
+ AtStart_Cache();
+}
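+
+/* --------------------------------
+ * Overflow sketch: assuming CommandId is a 16 bit counter that
+ * starts at FirstCommandId, incrementing it 65535 times wraps it
+ * back to FirstCommandId, which is exactly the condition the
+ * elog(WARN) above detects.
+ * --------------------------------
+ */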
+
+/* ----------------------------------------------------------------
+ * initialization stuff
+ * ----------------------------------------------------------------
+ */
+void
+InitializeTransactionSystem()
+{
+ InitializeTransactionLog();
+}
+
+/* ----------------------------------------------------------------
+ * StartTransaction stuff
+ * ----------------------------------------------------------------
+ */
+
+/* --------------------------------
+ * AtStart_Cache
+ * --------------------------------
+ */
+void
+AtStart_Cache()
+{
+ DiscardInvalid();
+}
+
+/* --------------------------------
+ * AtStart_Locks
+ * --------------------------------
+ */
+void
+AtStart_Locks()
+{
+ /*
+ * at present, it is unknown to me what belongs here -cim 3/18/90
+ *
+ * There isn't anything to do at the start of a xact for locks.
+ * -mer 5/24/92
+ */
+}
+
+/* --------------------------------
+ * AtStart_Memory
+ * --------------------------------
+ */
+void
+AtStart_Memory()
+{
+ Portal portal;
+ MemoryContext portalContext;
+
+ /* ----------------
+ * get the blank portal and its memory context
+ * ----------------
+ */
+ portal = GetPortalByName(NULL);
+ portalContext = (MemoryContext) PortalGetHeapMemory(portal);
+
+ /* ----------------
+ * tell system to allocate in the blank portal context
+ * ----------------
+ */
+ (void) MemoryContextSwitchTo(portalContext);
+ StartPortalAllocMode(DefaultAllocMode, 0);
+}
+
+
+/* ----------------------------------------------------------------
+ * CommitTransaction stuff
+ * ----------------------------------------------------------------
+ */
+
+/* --------------------------------
+ * RecordTransactionCommit
+ *
+ * Note: the two calls to BufferManagerFlush() exist to ensure
+ * that data pages are written before log pages. These
+ * explicit calls should be replaced by a more efficient
+ * ordered page write scheme in the buffer manager
+ * -cim 3/18/90
+ * --------------------------------
+ */
+void
+RecordTransactionCommit()
+{
+ TransactionId xid;
+ int leak;
+
+ /* ----------------
+ * get the current transaction id
+ * ----------------
+ */
+ xid = GetCurrentTransactionId();
+
+ /* ----------------
+ * flush the buffer manager pages. Note: if we have stable
+ * main memory, dirty shared buffers are not flushed
+ * plai 8/7/90
+ * ----------------
+ */
+ leak = BufferPoolCheckLeak();
+ FlushBufferPool(!TransactionFlushEnabled());
+ if (leak) ResetBufferPool();
+
+ /* ----------------
+ * have the transaction access methods record the status
+ * of this transaction id in the pg_log / pg_time relations.
+ * ----------------
+ */
+ TransactionIdCommit(xid);
+
+ /* ----------------
+ * Now write the log/time info to the disk too.
+ * ----------------
+ */
+ leak = BufferPoolCheckLeak();
+ FlushBufferPool(!TransactionFlushEnabled());
+ if (leak) ResetBufferPool();
+}
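+
+/* ----------------
+ *	Illustrative restatement of the ordering argument above (not code
+ *	from this file): if the pg_log status reached disk before the data
+ *	pages did, a crash in between would leave a transaction marked
+ *	committed whose updates were lost.  Hence the fixed sequence:
+ *
+ *		FlushBufferPool(...);		-- 1. data pages out first
+ *		TransactionIdCommit(xid);	-- 2. mark committed in pg_log
+ *		FlushBufferPool(...);		-- 3. then the log/time pages
+ * ----------------
+ */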
+
+
+/* --------------------------------
+ * AtCommit_Cache
+ * --------------------------------
+ */
+void
+AtCommit_Cache()
+{
+ /* ----------------
+ * Make catalog changes visible to me for the next command.
+ * Other backends will not process my invalidation messages until
+ * after I commit and free my locks--though they will do
+ * unnecessary work if I abort.
+ * ----------------
+ */
+ RegisterInvalid(true);
+}
+
+/* --------------------------------
+ * AtCommit_Locks
+ * --------------------------------
+ */
+void
+AtCommit_Locks()
+{
+ /* ----------------
+ * XXX What if ProcReleaseLocks fails? (race condition?)
+ *
+ * Then you're up a creek! -mer 5/24/92
+ * ----------------
+ */
+ ProcReleaseLocks();
+}
+
+/* --------------------------------
+ * AtCommit_Memory
+ * --------------------------------
+ */
+void
+AtCommit_Memory()
+{
+ /* ----------------
+ * now that we're "out" of a transaction, have the
+ * system allocate things in the top memory context instead
+ * of the blank portal memory context.
+ * ----------------
+ */
+ EndPortalAllocMode();
+ (void) MemoryContextSwitchTo(TopMemoryContext);
+}
+
+/* ----------------------------------------------------------------
+ * AbortTransaction stuff
+ * ----------------------------------------------------------------
+ */
+
+/* --------------------------------
+ * RecordTransactionAbort
+ * --------------------------------
+ */
+void
+RecordTransactionAbort()
+{
+ TransactionId xid;
+
+ /* ----------------
+ * get the current transaction id
+ * ----------------
+ */
+ xid = GetCurrentTransactionId();
+
+ /* ----------------
+ * have the transaction access methods record the status
+ * of this transaction id in the pg_log / pg_time relations.
+ * ----------------
+ */
+ TransactionIdAbort(xid);
+
+ /* ----------------
+ * flush the buffer manager pages. Note: if we have stable
+ * main memory, dirty shared buffers are not flushed
+ * plai 8/7/90
+ * ----------------
+ */
+ ResetBufferPool();
+}
+
+/* --------------------------------
+ * AtAbort_Cache
+ * --------------------------------
+ */
+void
+AtAbort_Cache()
+{
+ RegisterInvalid(false);
+}
+
+/* --------------------------------
+ * AtAbort_Locks
+ * --------------------------------
+ */
+void
+AtAbort_Locks()
+{
+ /* ----------------
+ * XXX What if ProcReleaseLocks() fails? (race condition?)
+ *
+ * Then you're up a creek without a paddle! -mer
+ * ----------------
+ */
+ ProcReleaseLocks();
+}
+
+
+/* --------------------------------
+ * AtAbort_Memory
+ * --------------------------------
+ */
+void
+AtAbort_Memory()
+{
+ /* ----------------
+ * after doing an abort transaction, make certain the
+	 * system uses the top memory context rather than the
+ * portal memory context (until the next transaction).
+ * ----------------
+ */
+ (void) MemoryContextSwitchTo(TopMemoryContext);
+}
+
+/* ----------------------------------------------------------------
+ * interface routines
+ * ----------------------------------------------------------------
+ */
+
+/* --------------------------------
+ * StartTransaction
+ *
+ * --------------------------------
+ */
+void
+StartTransaction()
+{
+ TransactionState s = CurrentTransactionState;
+
+ /* ----------------
+ * Check the current transaction state. If the transaction system
+ * is switched off, or if we're already in a transaction, do nothing.
+ * We're already in a transaction when the monitor sends a null
+ * command to the backend to flush the comm channel. This is a
+ * hacky fix to a communications problem, and we keep having to
+ * deal with it here. We should fix the comm channel code. mao 080891
+ * ----------------
+ */
+ if (s->state == TRANS_DISABLED || s->state == TRANS_INPROGRESS)
+ return;
+
+ /* ----------------
+ * set the current transaction state information
+ * appropriately during start processing
+ * ----------------
+ */
+ s->state = TRANS_START;
+
+ /* ----------------
+ * generate a new transaction id
+ * ----------------
+ */
+ GetNewTransactionId(&(s->transactionIdData));
+
+ /* ----------------
+ * initialize current transaction state fields
+ * ----------------
+ */
+ s->commandId = FirstCommandId;
+ s->startTime = GetCurrentAbsoluteTime();
+
+ /* ----------------
+ * initialize the various transaction subsystems
+ * ----------------
+ */
+ AtStart_Cache();
+ AtStart_Locks();
+ AtStart_Memory();
+
+	/* --------------
+	 * initialize the temporary relations list.
+	 * The tempRelList is a list of the temporary relations that
+	 * are created in the course of the transaction; they need
+	 * to be destroyed properly at the end of the transaction.
+	 * --------------
+	 */
+ InitTempRelList();
+
+ /* ----------------
+ * done with start processing, set current transaction
+ * state to "in progress"
+ * ----------------
+ */
+ s->state = TRANS_INPROGRESS;
+}
+
+/* ---------------
+ * Tell me if we are currently in progress
+ * ---------------
+ */
+bool
+CurrentXactInProgress()
+{
+ return (CurrentTransactionState->state == TRANS_INPROGRESS);
+}
+
+/* --------------------------------
+ * CommitTransaction
+ *
+ * --------------------------------
+ */
+void
+CommitTransaction()
+{
+ TransactionState s = CurrentTransactionState;
+
+ /* ----------------
+ * check the current transaction state
+ * ----------------
+ */
+ if (s->state == TRANS_DISABLED)
+ return;
+
+ if (s->state != TRANS_INPROGRESS)
+		elog(NOTICE, "CommitTransaction and not in in-progress state");
+
+ /* ----------------
+ * set the current transaction state information
+	 * appropriately during commit processing
+ * ----------------
+ */
+ s->state = TRANS_COMMIT;
+
+ /* ----------------
+ * do commit processing
+ * ----------------
+ */
+ DestroyTempRels();
+ AtEOXact_portals();
+ RecordTransactionCommit();
+ RelationPurgeLocalRelation(true);
+ AtCommit_Cache();
+ AtCommit_Locks();
+ AtCommit_Memory();
+
+ /* ----------------
+ * done with commit processing, set current transaction
+ * state back to default
+ * ----------------
+ */
+ s->state = TRANS_DEFAULT;
+ { /* want this after commit */
+ if (IsNormalProcessingMode())
+ Async_NotifyAtCommit();
+ }
+}
+
+/* --------------------------------
+ * AbortTransaction
+ *
+ * --------------------------------
+ */
+void
+AbortTransaction()
+{
+ TransactionState s = CurrentTransactionState;
+
+ /* ----------------
+ * check the current transaction state
+ * ----------------
+ */
+ if (s->state == TRANS_DISABLED)
+ return;
+
+ if (s->state != TRANS_INPROGRESS)
+		elog(NOTICE, "AbortTransaction and not in in-progress state");
+
+ /* ----------------
+ * set the current transaction state information
+ * appropriately during the abort processing
+ * ----------------
+ */
+ s->state = TRANS_ABORT;
+
+ /* ----------------
+ * do abort processing
+ * ----------------
+ */
+ AtEOXact_portals();
+ RecordTransactionAbort();
+ RelationPurgeLocalRelation(false);
+ DestroyTempRels();
+ AtAbort_Cache();
+ AtAbort_Locks();
+ AtAbort_Memory();
+
+ /* ----------------
+ * done with abort processing, set current transaction
+ * state back to default
+ * ----------------
+ */
+ s->state = TRANS_DEFAULT;
+ {
+ /* We need to do this in case another process notified us while
+ we are in the middle of an aborted transaction. We need to
+ notify our frontend after we finish the current transaction.
+ -- jw, 1/3/94
+ */
+ if (IsNormalProcessingMode())
+ Async_NotifyAtAbort();
+ }
+}
+
+/* --------------------------------
+ * StartTransactionCommand
+ * --------------------------------
+ */
+void
+StartTransactionCommand()
+{
+ TransactionState s = CurrentTransactionState;
+
+ switch(s->blockState) {
+ /* ----------------
+ * if we aren't in a transaction block, we
+ * just do our usual start transaction.
+ * ----------------
+ */
+ case TBLOCK_DEFAULT:
+ StartTransaction();
+ break;
+
+ /* ----------------
+ * We should never experience this -- if we do it
+ * means the BEGIN state was not changed in the previous
+ * CommitTransactionCommand(). If we get it, we print
+ * a warning and change to the in-progress state.
+ * ----------------
+ */
+ case TBLOCK_BEGIN:
+ elog(NOTICE, "StartTransactionCommand: unexpected TBLOCK_BEGIN");
+ s->blockState = TBLOCK_INPROGRESS;
+ break;
+
+ /* ----------------
+	 * This is the case when we are somewhere in a transaction
+ * block and about to start a new command. For now we
+ * do nothing but someday we may do command-local resource
+ * initialization.
+ * ----------------
+ */
+ case TBLOCK_INPROGRESS:
+ break;
+
+ /* ----------------
+ * As with BEGIN, we should never experience this --
+ * if we do it means the END state was not changed in the
+ * previous CommitTransactionCommand(). If we get it, we
+ * print a warning, commit the transaction, start a new
+ * transaction and change to the default state.
+ * ----------------
+ */
+ case TBLOCK_END:
+ elog(NOTICE, "StartTransactionCommand: unexpected TBLOCK_END");
+ s->blockState = TBLOCK_DEFAULT;
+ CommitTransaction();
+ StartTransaction();
+ break;
+
+ /* ----------------
+ * Here we are in the middle of a transaction block but
+ * one of the commands caused an abort so we do nothing
+ * but remain in the abort state. Eventually we will get
+ * to the "END TRANSACTION" which will set things straight.
+ * ----------------
+ */
+ case TBLOCK_ABORT:
+ break;
+
+ /* ----------------
+ * This means we somehow aborted and the last call to
+ * CommitTransactionCommand() didn't clear the state so
+	 * we remain in the ENDABORT state and maybe next time
+ * we get to CommitTransactionCommand() the state will
+ * get reset to default.
+ * ----------------
+ */
+ case TBLOCK_ENDABORT:
+ elog(NOTICE, "StartTransactionCommand: unexpected TBLOCK_ENDABORT");
+ break;
+ }
+}
+
+/* --------------------------------
+ * CommitTransactionCommand
+ * --------------------------------
+ */
+void
+CommitTransactionCommand()
+{
+ TransactionState s = CurrentTransactionState;
+
+ switch(s->blockState) {
+ /* ----------------
+ * if we aren't in a transaction block, we
+ * just do our usual transaction commit
+ * ----------------
+ */
+ case TBLOCK_DEFAULT:
+ CommitTransaction();
+ break;
+
+ /* ----------------
+ * This is the case right after we get a "BEGIN TRANSACTION"
+ * command, but the user hasn't done anything else yet, so
+ * we change to the "transaction block in progress" state
+ * and return.
+ * ----------------
+ */
+ case TBLOCK_BEGIN:
+ s->blockState = TBLOCK_INPROGRESS;
+ break;
+
+ /* ----------------
+ * This is the case when we have finished executing a command
+ * someplace within a transaction block. We increment the
+ * command counter and return. Someday we may free resources
+ * local to the command.
+ * ----------------
+ */
+ case TBLOCK_INPROGRESS:
+ CommandCounterIncrement();
+ break;
+
+ /* ----------------
+ * This is the case when we just got the "END TRANSACTION"
+ * statement, so we go back to the default state and
+ * commit the transaction.
+ * ----------------
+ */
+ case TBLOCK_END:
+ s->blockState = TBLOCK_DEFAULT;
+ CommitTransaction();
+ break;
+
+ /* ----------------
+ * Here we are in the middle of a transaction block but
+ * one of the commands caused an abort so we do nothing
+ * but remain in the abort state. Eventually we will get
+ * to the "END TRANSACTION" which will set things straight.
+ * ----------------
+ */
+ case TBLOCK_ABORT:
+ break;
+
+ /* ----------------
+ * Here we were in an aborted transaction block which
+ * just processed the "END TRANSACTION" command from the
+	 * user, so now we return to the default state.
+ * ----------------
+ */
+ case TBLOCK_ENDABORT:
+ s->blockState = TBLOCK_DEFAULT;
+ break;
+ }
+}
+
+/* --------------------------------
+ * AbortCurrentTransaction
+ * --------------------------------
+ */
+void
+AbortCurrentTransaction()
+{
+ TransactionState s = CurrentTransactionState;
+
+ switch(s->blockState) {
+ /* ----------------
+ * if we aren't in a transaction block, we
+ * just do our usual abort transaction.
+ * ----------------
+ */
+ case TBLOCK_DEFAULT:
+ AbortTransaction();
+ break;
+
+ /* ----------------
+ * If we are in the TBLOCK_BEGIN it means something
+ * screwed up right after reading "BEGIN TRANSACTION"
+ * so we enter the abort state. Eventually an "END
+ * TRANSACTION" will fix things.
+ * ----------------
+ */
+ case TBLOCK_BEGIN:
+ s->blockState = TBLOCK_ABORT;
+ AbortTransaction();
+ break;
+
+ /* ----------------
+	 * This is the case when we are somewhere in a transaction
+ * block which aborted so we abort the transaction and
+ * set the ABORT state. Eventually an "END TRANSACTION"
+ * will fix things and restore us to a normal state.
+ * ----------------
+ */
+ case TBLOCK_INPROGRESS:
+ s->blockState = TBLOCK_ABORT;
+ AbortTransaction();
+ break;
+
+ /* ----------------
+ * Here, the system was fouled up just after the
+ * user wanted to end the transaction block so we
+ * abort the transaction and put us back into the
+ * default state.
+ * ----------------
+ */
+ case TBLOCK_END:
+ s->blockState = TBLOCK_DEFAULT;
+ AbortTransaction();
+ break;
+
+ /* ----------------
+ * Here, we are already in an aborted transaction
+ * state and are waiting for an "END TRANSACTION" to
+ * come along and lo and behold, we abort again!
+ * So we just remain in the abort state.
+ * ----------------
+ */
+ case TBLOCK_ABORT:
+ break;
+
+ /* ----------------
+ * Here we were in an aborted transaction block which
+ * just processed the "END TRANSACTION" command but somehow
+	 * aborted again.  Since we must have done the abort
+ * processing, we return to the default state.
+ * ----------------
+ */
+ case TBLOCK_ENDABORT:
+ s->blockState = TBLOCK_DEFAULT;
+ break;
+ }
+}
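+
+/* ----------------
+ *	Illustrative sketch (hypothetical driver loop, not in this file):
+ *	the three interface routines above are meant to bracket every
+ *	query, with AbortCurrentTransaction() called from the elog(WARN)
+ *	recovery path:
+ *
+ *		for (;;) {
+ *			-- read the next query
+ *			StartTransactionCommand();
+ *			-- execute the query
+ *			CommitTransactionCommand();
+ *		}
+ *		-- on error recovery: AbortCurrentTransaction();
+ * ----------------
+ */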
+
+/* ----------------------------------------------------------------
+ * transaction block support
+ * ----------------------------------------------------------------
+ */
+/* --------------------------------
+ * BeginTransactionBlock
+ * --------------------------------
+ */
+void
+BeginTransactionBlock()
+{
+ TransactionState s = CurrentTransactionState;
+
+ /* ----------------
+ * check the current transaction state
+ * ----------------
+ */
+ if (s->state == TRANS_DISABLED)
+ return;
+
+ if (s->blockState != TBLOCK_DEFAULT)
+		elog(NOTICE, "BeginTransactionBlock and not in default state");
+
+ /* ----------------
+ * set the current transaction block state information
+ * appropriately during begin processing
+ * ----------------
+ */
+ s->blockState = TBLOCK_BEGIN;
+
+ /* ----------------
+ * do begin processing
+ * ----------------
+ */
+
+ /* ----------------
+ * done with begin processing, set block state to inprogress
+ * ----------------
+ */
+ s->blockState = TBLOCK_INPROGRESS;
+}
+
+/* --------------------------------
+ * EndTransactionBlock
+ * --------------------------------
+ */
+void
+EndTransactionBlock()
+{
+ TransactionState s = CurrentTransactionState;
+
+ /* ----------------
+ * check the current transaction state
+ * ----------------
+ */
+ if (s->state == TRANS_DISABLED)
+ return;
+
+ if (s->blockState == TBLOCK_INPROGRESS) {
+ /* ----------------
+ * here we are in a transaction block which should commit
+ * when we get to the upcoming CommitTransactionCommand()
+ * so we set the state to "END". CommitTransactionCommand()
+ * will recognize this and commit the transaction and return
+ * us to the default state
+ * ----------------
+ */
+ s->blockState = TBLOCK_END;
+ return;
+ }
+
+ if (s->blockState == TBLOCK_ABORT) {
+ /* ----------------
+ * here, we are in a transaction block which aborted
+ * and since the AbortTransaction() was already done,
+ * we do whatever is needed and change to the special
+ * "END ABORT" state. The upcoming CommitTransactionCommand()
+	 * will recognize this and then put us back in the default
+ * state.
+ * ----------------
+ */
+ s->blockState = TBLOCK_ENDABORT;
+ return;
+ }
+
+ /* ----------------
+ * We should not get here, but if we do, we go to the ENDABORT
+ * state after printing a warning. The upcoming call to
+ * CommitTransactionCommand() will then put us back into the
+ * default state.
+ * ----------------
+ */
+	elog(NOTICE, "EndTransactionBlock and not inprogress/abort state");
+ s->blockState = TBLOCK_ENDABORT;
+}
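+
+/* ----------------
+ *	Illustrative walkthrough (derived from the cases above, not new
+ *	code): a successful user-level transaction block moves through
+ *	the block states as follows:
+ *
+ *		"BEGIN"	 BeginTransactionBlock()	DEFAULT -> INPROGRESS
+ *						(via a transient TBLOCK_BEGIN)
+ *		queries	 CommitTransactionCommand()	stays INPROGRESS
+ *		"END"	 EndTransactionBlock()		INPROGRESS -> END
+ *			 CommitTransactionCommand()	END -> DEFAULT, commits
+ * ----------------
+ */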
+
+/* --------------------------------
+ * AbortTransactionBlock
+ * --------------------------------
+ */
+void
+AbortTransactionBlock()
+{
+ TransactionState s = CurrentTransactionState;
+
+ /* ----------------
+ * check the current transaction state
+ * ----------------
+ */
+ if (s->state == TRANS_DISABLED)
+ return;
+
+ if (s->blockState == TBLOCK_INPROGRESS) {
+ /* ----------------
+	 * here we were inside a transaction block and something
+ * screwed up inside the system so we enter the abort state,
+ * do the abort processing and then return.
+ * We remain in the abort state until we see the upcoming
+ * END TRANSACTION command.
+ * ----------------
+ */
+ s->blockState = TBLOCK_ABORT;
+
+ /* ----------------
+ * do abort processing and return
+ * ----------------
+ */
+ AbortTransaction();
+ return;
+ }
+
+ /* ----------------
+ * this case should not be possible, because it would mean
+ * the user entered an "abort" from outside a transaction block.
+ * So we print an error message, abort the transaction and
+ * enter the "ENDABORT" state so we will end up in the default
+ * state after the upcoming CommitTransactionCommand().
+ * ----------------
+ */
+ elog(NOTICE, "AbortTransactionBlock and not inprogress state");
+ AbortTransaction();
+ s->blockState = TBLOCK_ENDABORT;
+}
+
+/* --------------------------------
+ * UserAbortTransactionBlock
+ * --------------------------------
+ */
+void
+UserAbortTransactionBlock()
+{
+ TransactionState s = CurrentTransactionState;
+
+ /* ----------------
+ * check the current transaction state
+ * ----------------
+ */
+ if (s->state == TRANS_DISABLED)
+ return;
+
+ if (s->blockState == TBLOCK_INPROGRESS) {
+ /* ----------------
+ * here we were inside a transaction block and we
+ * got an abort command from the user, so we move to
+ * the abort state, do the abort processing and
+ * then change to the ENDABORT state so we will end up
+ * in the default state after the upcoming
+ * CommitTransactionCommand().
+ * ----------------
+ */
+ s->blockState = TBLOCK_ABORT;
+
+ /* ----------------
+ * do abort processing
+ * ----------------
+ */
+ AbortTransaction();
+
+ /* ----------------
+ * change to the end abort state and return
+ * ----------------
+ */
+ s->blockState = TBLOCK_ENDABORT;
+ return;
+ }
+
+ /* ----------------
+ * this case should not be possible, because it would mean
+ * the user entered an "abort" from outside a transaction block.
+ * So we print an error message, abort the transaction and
+ * enter the "ENDABORT" state so we will end up in the default
+ * state after the upcoming CommitTransactionCommand().
+ * ----------------
+ */
+ elog(NOTICE, "UserAbortTransactionBlock and not inprogress state");
+ AbortTransaction();
+ s->blockState = TBLOCK_ENDABORT;
+}
+
+bool
+IsTransactionBlock()
+{
+ TransactionState s = CurrentTransactionState;
+
+ if (s->blockState == TBLOCK_INPROGRESS
+ || s->blockState == TBLOCK_ENDABORT) {
+ return (true);
+ }
+
+ return (false);
+}
diff --git a/src/backend/access/transam/xid.c b/src/backend/access/transam/xid.c
new file mode 100644
index 00000000000..faeeb623d58
--- /dev/null
+++ b/src/backend/access/transam/xid.c
@@ -0,0 +1,156 @@
+/*-------------------------------------------------------------------------
+ *
+ * xid.c--
+ * POSTGRES transaction identifier code.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/transam/Attic/xid.c,v 1.1.1.1 1996/07/09 06:21:14 scrappy Exp $
+ *
+ * OLD COMMENTS
+ * XXX WARNING
+ * Much of this file will change when we change our representation
+ * of transaction ids -cim 3/23/90
+ *
+ * It is time to make the switch from 5 byte to 4 byte transaction ids
+ * This file was totally reworked. -mer 5/22/92
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <stdio.h>
+#include "postgres.h"
+#include "utils/palloc.h"
+#include "utils/elog.h"
+#include "utils/memutils.h"
+#include "utils/nabstime.h"
+
+extern TransactionId NullTransactionId;
+extern TransactionId DisabledTransactionId;
+extern TransactionId AmiTransactionId;
+extern TransactionId FirstTransactionId;
+
+/* ----------------------------------------------------------------
+ * TransactionIdIsValid
+ *
+ * Macro-ize me.
+ * ----------------------------------------------------------------
+ */
+bool
+TransactionIdIsValid(TransactionId transactionId)
+{
+ return ((bool) (transactionId != NullTransactionId) );
+}
+
+/* XXX char16 name for catalogs */
+TransactionId
+xidin(char *representation)
+{
+ return (atol(representation));
+}
+
+/* XXX char16 name for catalogs */
+char*
+xidout(TransactionId transactionId)
+{
+/* return(TransactionIdFormString(transactionId)); */
+ char *representation;
+
+	/* a 32 bit unsigned integer takes at most 10 chars, plus 1 for the null */
+ representation = palloc(11);
+
+ (void)sprintf(representation, "%u", transactionId);
+
+ return (representation);
+
+}
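+
+/* ----------------
+ *	Illustrative sketch (hypothetical caller, not in this file): the
+ *	returned string is palloc'd in the current memory context, so a
+ *	caller that wants to reclaim it before end of transaction would:
+ *
+ *		char *repr = xidout(xid);
+ *		-- ... use repr ...
+ *		pfree(repr);
+ * ----------------
+ */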
+
+/* ----------------------------------------------------------------
+ * StoreInvalidTransactionId
+ *
+ * Maybe do away with Pointer types in these routines.
+ * Macro-ize this one.
+ * ----------------------------------------------------------------
+ */
+void
+StoreInvalidTransactionId(TransactionId *destination)
+{
+ *destination = NullTransactionId;
+}
+
+/* ----------------------------------------------------------------
+ * TransactionIdStore
+ *
+ * Macro-ize this one.
+ * ----------------------------------------------------------------
+ */
+void
+TransactionIdStore(TransactionId transactionId,
+ TransactionId *destination)
+{
+ *destination = transactionId;
+}
+
+/* ----------------------------------------------------------------
+ * TransactionIdEquals
+ * ----------------------------------------------------------------
+ */
+bool
+TransactionIdEquals(TransactionId id1, TransactionId id2)
+{
+ return ((bool) (id1 == id2));
+}
+
+/* ----------------------------------------------------------------
+ * TransactionIdIsLessThan
+ * ----------------------------------------------------------------
+ */
+bool
+TransactionIdIsLessThan(TransactionId id1, TransactionId id2)
+{
+ return ((bool)(id1 < id2));
+}
+
+/* ----------------------------------------------------------------
+ * xideq
+ * ----------------------------------------------------------------
+ */
+
+/*
+ *	xideq		- returns true iff xid1 == xid2,
+ *			  false otherwise
+ */
+bool
+xideq(TransactionId xid1, TransactionId xid2)
+{
+ return( (bool) (xid1 == xid2) );
+}
+
+
+/* ----------------------------------------------------------------
+ * TransactionIdIncrement
+ * ----------------------------------------------------------------
+ */
+void
+TransactionIdIncrement(TransactionId *transactionId)
+{
+	(*transactionId)++;
+ if (*transactionId == DisabledTransactionId)
+ elog(FATAL, "TransactionIdIncrement: exhausted XID's");
+ return;
+}
+
+/* ----------------------------------------------------------------
+ * TransactionIdAdd
+ * ----------------------------------------------------------------
+ */
+void
+TransactionIdAdd(TransactionId *xid, int value)
+{
+ *xid += value;
+ return;
+}
+
diff --git a/src/backend/access/tupdesc.h b/src/backend/access/tupdesc.h
new file mode 100644
index 00000000000..a26bbc704da
--- /dev/null
+++ b/src/backend/access/tupdesc.h
@@ -0,0 +1,53 @@
+/*-------------------------------------------------------------------------
+ *
+ * tupdesc.h--
+ * POSTGRES tuple descriptor definitions.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: tupdesc.h,v 1.1.1.1 1996/07/09 06:21:09 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef TUPDESC_H
+#define TUPDESC_H
+
+#include "postgres.h"
+#include "access/attnum.h"
+#include "nodes/pg_list.h" /* for List */
+#include "catalog/pg_attribute.h"
+
+/*
+ * a TupleDesc is an array of AttributeTupleForms, each of which is a
+ * pointer to an AttributeTupleForm
+ */
+/* typedef AttributeTupleForm *TupleDesc; */
+
+/*
+ * a TupleDesc is a pointer to a structure which includes an array of
+ * AttributeTupleForms (i.e., pg_attribute information) and the size of
+ * the array (i.e., the number of attributes).  In short, a TupleDesc
+ * completely captures the attribute information for a tuple.
+ */
+
+typedef struct tupleDesc {
+ int natts;
+ AttributeTupleForm *attrs;
+} *TupleDesc;
+
+extern TupleDesc CreateTemplateTupleDesc(int natts);
+
+extern TupleDesc CreateTupleDesc(int natts, AttributeTupleForm *attrs);
+
+extern TupleDesc CreateTupleDescCopy(TupleDesc tupdesc);
+
+extern bool TupleDescInitEntry(TupleDesc desc,
+ AttrNumber attributeNumber,
+ char *attributeName,
+ char *typeName,
+ int attdim,
+ bool attisset);
+
+extern TupleDesc BuildDescForRelation(List *schema, char *relname);
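+
+/*
+ * Illustrative sketch (hypothetical caller; the type names are only
+ * assumptions for the example): a two-attribute descriptor would
+ * typically be built from a template, one entry at a time:
+ *
+ *	TupleDesc td = CreateTemplateTupleDesc(2);
+ *	TupleDescInitEntry(td, (AttrNumber) 1, "id",   "int4",   0, false);
+ *	TupleDescInitEntry(td, (AttrNumber) 2, "name", "char16", 0, false);
+ */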
+
+#endif /* TUPDESC_H */
diff --git a/src/backend/access/tupmacs.h b/src/backend/access/tupmacs.h
new file mode 100644
index 00000000000..9a9bcce3b41
--- /dev/null
+++ b/src/backend/access/tupmacs.h
@@ -0,0 +1,43 @@
+/*-------------------------------------------------------------------------
+ *
+ * tupmacs.h--
+ * Tuple macros used by both index tuples and heap tuples.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: tupmacs.h,v 1.1.1.1 1996/07/09 06:21:09 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef TUPMACS_H
+#define TUPMACS_H
+
+/*
+ * check to see if the ATT'th bit of an array of 8-bit bytes is set.
+ */
+#define att_isnull(ATT, BITS) (!((BITS)[(ATT) >> 3] & (1 << ((ATT) & 0x07))))
+
+/*
+ * given a AttributeTupleForm and a pointer into a tuple's data
+ * area, return the correct value or pointer.
+ *
+ * note that T must already be properly LONGALIGN/SHORTALIGN'd for
+ * this to work correctly.
+ *
+ * the double-cast is to stop gcc from (correctly) complaining about
+ * casting integer types with size < sizeof(char *) to (char *).
+ * sign-extension may get weird if you use an integer type that
+ * isn't the same size as (char *) for the first cast. (on the other
+ * hand, it's safe to use another type for the (foo *)(T).)
+ */
+#define fetchatt(A, T) \
+ ((*(A))->attbyval \
+ ? ((*(A))->attlen > sizeof(int16) \
+ ? (char *) (long) *((int32 *)(T)) \
+ : ((*(A))->attlen < sizeof(int16) \
+ ? (char *) (long) *((char *)(T)) \
+ : (char *) (long) *((int16 *)(T)))) \
+ : (char *) (T))
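+
+/*
+ * Illustrative sketch (hypothetical caller, not in this header): for
+ * attribute i of a tuple descriptor and a properly aligned pointer
+ * into the tuple's data area,
+ *
+ *	char *value = fetchatt(&tupdesc->attrs[i], dataptr);
+ *
+ * yields the datum itself (cast into the pointer) for a by-value
+ * attribute, and a pointer to the attribute in place otherwise.
+ */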
+
+#endif
diff --git a/src/backend/access/valid.h b/src/backend/access/valid.h
new file mode 100644
index 00000000000..1c5cf8cdeb3
--- /dev/null
+++ b/src/backend/access/valid.h
@@ -0,0 +1,37 @@
+/*-------------------------------------------------------------------------
+ *
+ * valid.h--
+ * POSTGRES tuple qualification validity definitions.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: valid.h,v 1.1.1.1 1996/07/09 06:21:09 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef VALID_H
+#define VALID_H
+
+#include "c.h"
+#include "access/skey.h"
+#include "storage/buf.h"
+#include "utils/tqual.h"
+#include "access/tupdesc.h"
+#include "utils/rel.h"
+#include "storage/bufpage.h"
+
+/* ----------------
+ * extern decl's
+ * ----------------
+ */
+
+extern bool heap_keytest(HeapTuple t, TupleDesc tupdesc,
+ int nkeys, ScanKey keys);
+
+extern HeapTuple heap_tuple_satisfies(ItemId itemId, Relation relation,
+ PageHeader disk_page, TimeQual qual, int nKeys, ScanKey key);
+
+extern bool TupleUpdatedByCurXactAndCmd(HeapTuple t);
+
+#endif /* VALID_H */
diff --git a/src/backend/access/xact.h b/src/backend/access/xact.h
new file mode 100644
index 00000000000..15f376ec5ed
--- /dev/null
+++ b/src/backend/access/xact.h
@@ -0,0 +1,115 @@
+/*-------------------------------------------------------------------------
+ *
+ * xact.h--
+ * postgres transaction system header
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: xact.h,v 1.1.1.1 1996/07/09 06:21:09 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef XACT_H
+#define XACT_H
+
+#include <signal.h>
+
+#include "storage/ipc.h"
+#include "miscadmin.h"
+#include "utils/portal.h"
+#include "utils/elog.h"
+#include "utils/mcxt.h"
+#include "utils/nabstime.h"
+
+/* ----------------
+ * transaction state structure
+ * ----------------
+ */
+typedef struct TransactionStateData {
+ TransactionId transactionIdData;
+ CommandId commandId;
+ AbsoluteTime startTime;
+ int state;
+ int blockState;
+} TransactionStateData;
+
+/* ----------------
+ * transaction states
+ * ----------------
+ */
+#define TRANS_DEFAULT 0
+#define TRANS_START 1
+#define TRANS_INPROGRESS 2
+#define TRANS_COMMIT 3
+#define TRANS_ABORT 4
+#define TRANS_DISABLED 5
+
+/* ----------------
+ * transaction block states
+ * ----------------
+ */
+#define TBLOCK_DEFAULT 0
+#define TBLOCK_BEGIN 1
+#define TBLOCK_INPROGRESS 2
+#define TBLOCK_END 3
+#define TBLOCK_ABORT 4
+#define TBLOCK_ENDABORT 5
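+
+/* ----------------
+ *	Illustrative summary (derived from xact.c, not a new definition):
+ *	`state' tracks one transaction through
+ *
+ *		TRANS_DEFAULT -> TRANS_START -> TRANS_INPROGRESS
+ *			      -> TRANS_COMMIT or TRANS_ABORT -> TRANS_DEFAULT
+ *
+ *	while `blockState' tracks the user-level BEGIN/END block across
+ *	the commands of that transaction.
+ * ----------------
+ */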
+
+typedef TransactionStateData *TransactionState;
+
+/* ----------------
+ * extern definitions
+ * ----------------
+ */
+extern int TransactionFlushEnabled(void);
+extern void SetTransactionFlushEnabled(bool state);
+
+extern bool IsTransactionState(void);
+extern bool IsAbortedTransactionBlockState(void);
+extern void OverrideTransactionSystem(bool flag);
+extern TransactionId GetCurrentTransactionId(void);
+extern CommandId GetCurrentCommandId(void);
+extern AbsoluteTime GetCurrentTransactionStartTime(void);
+extern bool TransactionIdIsCurrentTransactionId(TransactionId xid);
+extern bool CommandIdIsCurrentCommandId(CommandId cid);
+extern void ClearCommandIdCounterOverflowFlag(void);
+extern void CommandCounterIncrement(void);
+extern void InitializeTransactionSystem(void);
+extern void AtStart_Cache(void);
+extern void AtStart_Locks(void);
+extern void AtStart_Memory(void);
+extern void RecordTransactionCommit(void);
+extern void AtCommit_Cache(void);
+extern void AtCommit_Locks(void);
+extern void AtCommit_Memory(void);
+extern void RecordTransactionAbort(void);
+extern void AtAbort_Cache(void);
+extern void AtAbort_Locks(void);
+extern void AtAbort_Memory(void);
+extern void StartTransaction(void);
+extern bool CurrentXactInProgress(void);
+extern void CommitTransaction(void);
+extern void AbortTransaction(void);
+extern void StartTransactionCommand(void);
+extern void CommitTransactionCommand(void);
+extern void AbortCurrentTransaction(void);
+extern void BeginTransactionBlock(void);
+extern void EndTransactionBlock(void);
+extern void AbortTransactionBlock(void);
+extern bool IsTransactionBlock(void);
+extern void UserAbortTransactionBlock(void);
+
+extern TransactionId DisabledTransactionId;
+
+/* defined in xid.c */
+extern bool TransactionIdIsValid(TransactionId transactionId);
+extern void StoreInvalidTransactionId(TransactionId *destination);
+extern void TransactionIdStore(TransactionId transactionId,
+ TransactionId *destination);
+extern bool TransactionIdEquals(TransactionId id1, TransactionId id2);
+extern bool TransactionIdIsLessThan(TransactionId id1, TransactionId id2);
+extern void TransactionIdIncrement(TransactionId *transactionId);
+extern void TransactionIdAdd(TransactionId *xid, int value);
+
+#endif /* XACT_H */