Diffstat (limited to 'src/backend/access')
75 files changed, 21730 insertions(+), 0 deletions(-)
diff --git a/src/backend/access/Makefile.inc b/src/backend/access/Makefile.inc new file mode 100644 index 00000000000..6adc2c692b5 --- /dev/null +++ b/src/backend/access/Makefile.inc @@ -0,0 +1,35 @@ +#------------------------------------------------------------------------- +# +# Makefile.inc-- +# Makefile for the access methods module +# +# Copyright (c) 1994, Regents of the University of California +# +# +# IDENTIFICATION +# $Header: /cvsroot/pgsql/src/backend/access/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:08 scrappy Exp $ +# +#------------------------------------------------------------------------- + +accdir=$(CURDIR)/access +VPATH:=$(VPATH):$(accdir):\ + $(accdir)/common:$(accdir)/hash:$(accdir)/heap:$(accdir)/index:\ + $(accdir)/rtree:$(accdir)/nbtree:$(accdir)/transam + + +SUBSRCS= +include $(accdir)/common/Makefile.inc +include $(accdir)/hash/Makefile.inc +include $(accdir)/heap/Makefile.inc +include $(accdir)/index/Makefile.inc +include $(accdir)/rtree/Makefile.inc +include $(accdir)/nbtree/Makefile.inc +include $(accdir)/transam/Makefile.inc +SRCS_ACCESS:= $(SUBSRCS) + +HEADERS+= attnum.h funcindex.h genam.h hash.h \ + heapam.h hio.h htup.h ibit.h iqual.h istrat.h \ + itup.h nbtree.h printtup.h relscan.h rtree.h \ + sdir.h skey.h strat.h transam.h tupdesc.h tupmacs.h \ + valid.h xact.h + diff --git a/src/backend/access/attnum.h b/src/backend/access/attnum.h new file mode 100644 index 00000000000..7c999e58e9d --- /dev/null +++ b/src/backend/access/attnum.h @@ -0,0 +1,61 @@ +/*------------------------------------------------------------------------- + * + * attnum.h-- + * POSTGRES attribute number definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: attnum.h,v 1.1.1.1 1996/07/09 06:21:08 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef ATTNUM_H +#define ATTNUM_H + +#include "c.h" + +/* + * user defined attribute numbers start at 1. -ay 2/95 + */ +typedef int16 AttrNumber; + +#define InvalidAttrNumber 0 + +/* ---------------- + * support macros + * ---------------- + */ +/* + * AttributeNumberIsValid -- + * True iff the attribute number is valid. + */ +#define AttributeNumberIsValid(attributeNumber) \ + ((bool) ((attributeNumber) != InvalidAttrNumber)) + +/* + * AttrNumberIsForUserDefinedAttr -- + * True iff the attribute number corresponds to an user defined attribute. + */ +#define AttrNumberIsForUserDefinedAttr(attributeNumber) \ + ((bool) ((attributeNumber) > 0)) + +/* + * AttrNumberGetAttrOffset -- + * Returns the attribute offset for an attribute number. + * + * Note: + * Assumes the attribute number is for an user defined attribute. + */ +#define AttrNumberGetAttrOffset(attNum) \ + (AssertMacro(AttrNumberIsForUserDefinedAttr(attNum)) ? \ + ((attNum - 1)) : 0) + +/* + * AttributeOffsetGetAttributeNumber -- + * Returns the attribute number for an attribute offset. 
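+ *
+ * (Editorial note, not part of the original commit: the macro
+ * defined below is actually named AttrOffsetGetAttrNumber,
+ * despite the longer name used in this comment. It is the
+ * inverse of AttrNumberGetAttrOffset, e.g.
+ * AttrOffsetGetAttrNumber(0) == 1 and
+ * AttrNumberGetAttrOffset(1) == 0.)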
+ */ +#define AttrOffsetGetAttrNumber(attributeOffset) \ + ((AttrNumber) (1 + attributeOffset)) + +#endif /* ATTNUM_H */ diff --git a/src/backend/access/common/Makefile.inc b/src/backend/access/common/Makefile.inc new file mode 100644 index 00000000000..5d5dd476274 --- /dev/null +++ b/src/backend/access/common/Makefile.inc @@ -0,0 +1,16 @@ +#------------------------------------------------------------------------- +# +# Makefile.inc-- +# Makefile for access/common +# +# Copyright (c) 1994, Regents of the University of California +# +# +# IDENTIFICATION +# $Header: /cvsroot/pgsql/src/backend/access/common/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:09 scrappy Exp $ +# +#------------------------------------------------------------------------- + +SUBSRCS+= heaptuple.c heapvalid.c indextuple.c indexvalid.c printtup.c \ + scankey.c tupdesc.c + diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c new file mode 100644 index 00000000000..c3e72fb97e8 --- /dev/null +++ b/src/backend/access/common/heaptuple.c @@ -0,0 +1,1011 @@ +/*------------------------------------------------------------------------- + * + * heaptuple.c-- + * This file contains heap tuple accessor and mutator routines, as well + * as a few various tuple utilities. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/common/heaptuple.c,v 1.1.1.1 1996/07/09 06:21:09 scrappy Exp $ + * + * NOTES + * The old interface functions have been converted to macros + * and moved to heapam.h + * + *------------------------------------------------------------------------- + */ +#include <string.h> + +#include "postgres.h" + +#include "access/htup.h" +#include "access/itup.h" +#include "access/tupmacs.h" +#include "access/skey.h" +#include "storage/ipc.h" +#include "storage/buf.h" +#include "storage/bufmgr.h" +#include "access/transam.h" +#include "storage/bufpage.h" /* for MAXTUPLEN */ +#include "storage/itemptr.h" +#include "utils/memutils.h" +#include "utils/elog.h" +#include "utils/palloc.h" +#include "utils/rel.h" +#include "utils/nabstime.h" + +/* this is so the sparcstation debugger works */ + +#ifndef NO_ASSERT_CHECKING +#ifdef sparc +#define register +#endif /* sparc */ +#endif /* NO_ASSERT_CHECKING */ + +/* ---------------------------------------------------------------- + * misc support routines + * ---------------------------------------------------------------- + */ + +/* ---------------- + * ComputeDataSize + * ---------------- + */ +Size +ComputeDataSize(TupleDesc tupleDesc, + Datum value[], + char nulls[]) +{ + uint32 length; + int i; + int numberOfAttributes = tupleDesc->natts; + AttributeTupleForm *att = tupleDesc->attrs; + + for (length = 0, i = 0; i < numberOfAttributes; i++) { + if (nulls[i] != ' ') continue; + + switch (att[i]->attlen) { + case -1: + /* + * This is the size of the disk representation and so + * must include the additional sizeof long. 
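+ *
+ * (Editorial note, not part of the original commit: VARSIZE()
+ * reads the varlena length word, which already counts its own
+ * header -- see printtup.c below, where VARHDRSZ is subtracted
+ * back out -- so no separate addition appears here.)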
+ */ + if (att[i]->attalign == 'd') { + length = DOUBLEALIGN(length) + + VARSIZE(DatumGetPointer(value[i])); + } else { + length = INTALIGN(length) + + VARSIZE(DatumGetPointer(value[i])); + } + break; + case sizeof(char): + length++; + break; + case sizeof(short): + length = SHORTALIGN(length + sizeof(short)); + break; + case sizeof(int32): + length = INTALIGN(length + sizeof(int32)); + break; + default: + if (att[i]->attlen < sizeof(int32)) + elog(WARN, "ComputeDataSize: attribute %d has len %d", + i, att[i]->attlen); + if (att[i]->attalign == 'd') + length = DOUBLEALIGN(length) + att[i]->attlen; + else + length = LONGALIGN(length) + att[i]->attlen; + break; + } + } + + return length; +} + +/* ---------------- + * DataFill + * ---------------- + */ +void +DataFill(char *data, + TupleDesc tupleDesc, + Datum value[], + char nulls[], + char *infomask, + bits8 bit[]) +{ + bits8 *bitP; + int bitmask; + uint32 length; + int i; + int numberOfAttributes = tupleDesc->natts; + AttributeTupleForm* att = tupleDesc->attrs; + + if (bit != NULL) { + bitP = &bit[-1]; + bitmask = CSIGNBIT; + } + + *infomask = 0; + + for (i = 0; i < numberOfAttributes; i++) { + if (bit != NULL) { + if (bitmask != CSIGNBIT) { + bitmask <<= 1; + } else { + bitP += 1; + *bitP = 0x0; + bitmask = 1; + } + + if (nulls[i] == 'n') { + *infomask |= HEAP_HASNULL; + continue; + } + + *bitP |= bitmask; + } + + switch (att[i]->attlen) { + case -1: + *infomask |= HEAP_HASVARLENA; + if (att[i]->attalign=='d') { + data = (char *) DOUBLEALIGN(data); + } else { + data = (char *) INTALIGN(data); + } + length = VARSIZE(DatumGetPointer(value[i])); + memmove(data, DatumGetPointer(value[i]),length); + data += length; + break; + case sizeof(char): + *data = att[i]->attbyval ? + DatumGetChar(value[i]) : *((char *) value[i]); + data += sizeof(char); + break; + case sizeof(int16): + data = (char *) SHORTALIGN(data); + * (short *) data = (att[i]->attbyval ? + DatumGetInt16(value[i]) : + *((short *) value[i])); + data += sizeof(short); + break; + case sizeof(int32): + data = (char *) INTALIGN(data); + * (int32 *) data = (att[i]->attbyval ? 
+ DatumGetInt32(value[i]) : + *((int32 *) value[i])); + data += sizeof(int32); + break; + default: + if (att[i]->attlen < sizeof(int32)) + elog(WARN, "DataFill: attribute %d has len %d", + i, att[i]->attlen); + if (att[i]->attalign == 'd') { + data = (char *) DOUBLEALIGN(data); + memmove(data, DatumGetPointer(value[i]), + att[i]->attlen); + data += att[i]->attlen; + } else { + data = (char *) LONGALIGN(data); + memmove(data, DatumGetPointer(value[i]), + att[i]->attlen); + data += att[i]->attlen; + } + + } + } +} + +/* ---------------------------------------------------------------- + * heap tuple interface + * ---------------------------------------------------------------- + */ + +/* ---------------- + * heap_attisnull - returns 1 iff tuple attribute is not present + * ---------------- + */ +int +heap_attisnull(HeapTuple tup, int attnum) +{ + if (attnum > (int)tup->t_natts) + return (1); + + if (HeapTupleNoNulls(tup)) return(0); + + if (attnum > 0) { + return(att_isnull(attnum - 1, tup->t_bits)); + } else + switch (attnum) { + case SelfItemPointerAttributeNumber: + case ObjectIdAttributeNumber: + case MinTransactionIdAttributeNumber: + case MinCommandIdAttributeNumber: + case MaxTransactionIdAttributeNumber: + case MaxCommandIdAttributeNumber: + case ChainItemPointerAttributeNumber: + case AnchorItemPointerAttributeNumber: + case MinAbsoluteTimeAttributeNumber: + case MaxAbsoluteTimeAttributeNumber: + case VersionTypeAttributeNumber: + break; + + case 0: + elog(WARN, "heap_attisnull: zero attnum disallowed"); + + default: + elog(WARN, "heap_attisnull: undefined negative attnum"); + } + + return (0); +} + +/* ---------------------------------------------------------------- + * system attribute heap tuple support + * ---------------------------------------------------------------- + */ + +/* ---------------- + * heap_sysattrlen + * + * This routine returns the length of a system attribute. + * ---------------- + */ +int +heap_sysattrlen(AttrNumber attno) +{ + HeapTupleData *f = NULL; + int len; + + switch (attno) { + case SelfItemPointerAttributeNumber: + len = sizeof f->t_ctid; + break; + case ObjectIdAttributeNumber: + len = sizeof f->t_oid; + break; + case MinTransactionIdAttributeNumber: + len = sizeof f->t_xmin; + break; + case MinCommandIdAttributeNumber: + len = sizeof f->t_cmin; + break; + case MaxTransactionIdAttributeNumber: + len = sizeof f->t_xmax; + break; + case MaxCommandIdAttributeNumber: + len = sizeof f->t_cmax; + break; + case ChainItemPointerAttributeNumber: + len = sizeof f->t_chain; + break; + case AnchorItemPointerAttributeNumber: + elog(WARN, "heap_sysattrlen: field t_anchor does not exist!"); + break; + case MinAbsoluteTimeAttributeNumber: + len = sizeof f->t_tmin; + break; + case MaxAbsoluteTimeAttributeNumber: + len = sizeof f->t_tmax; + break; + case VersionTypeAttributeNumber: + len = sizeof f->t_vtype; + break; + default: + elog(WARN, "sysattrlen: System attribute number %d unknown.", + attno); + len = 0; + break; + } + return (len); +} + +/* ---------------- + * heap_sysattrbyval + * + * This routine returns the "by-value" property of a system attribute. 
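+ *
+ * (Editorial note, not part of the original commit: in the
+ * switch below only the three ItemPointer attributes -- the
+ * self, chain, and anchor pointers -- are pass-by-reference;
+ * every other system attribute is returned by value.)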
+ * ---------------- + */ +bool +heap_sysattrbyval(AttrNumber attno) +{ + bool byval; + + switch (attno) { + case SelfItemPointerAttributeNumber: + byval = false; + break; + case ObjectIdAttributeNumber: + byval = true; + break; + case MinTransactionIdAttributeNumber: + byval = true; + break; + case MinCommandIdAttributeNumber: + byval = true; + break; + case MaxTransactionIdAttributeNumber: + byval = true; + break; + case MaxCommandIdAttributeNumber: + byval = true; + break; + case ChainItemPointerAttributeNumber: + byval = false; + break; + case AnchorItemPointerAttributeNumber: + byval = false; + break; + case MinAbsoluteTimeAttributeNumber: + byval = true; + break; + case MaxAbsoluteTimeAttributeNumber: + byval = true; + break; + case VersionTypeAttributeNumber: + byval = true; + break; + default: + byval = true; + elog(WARN, "sysattrbyval: System attribute number %d unknown.", + attno); + break; + } + + return byval; +} + +/* ---------------- + * heap_getsysattr + * ---------------- + */ +char * +heap_getsysattr(HeapTuple tup, Buffer b, int attnum) +{ + switch (attnum) { + case SelfItemPointerAttributeNumber: + return ((char *)&tup->t_ctid); + case ObjectIdAttributeNumber: + return ((char *) (long) tup->t_oid); + case MinTransactionIdAttributeNumber: + return ((char *) (long) tup->t_xmin); + case MinCommandIdAttributeNumber: + return ((char *) (long) tup->t_cmin); + case MaxTransactionIdAttributeNumber: + return ((char *) (long) tup->t_xmax); + case MaxCommandIdAttributeNumber: + return ((char *) (long) tup->t_cmax); + case ChainItemPointerAttributeNumber: + return ((char *) &tup->t_chain); + case AnchorItemPointerAttributeNumber: + elog(WARN, "heap_getsysattr: t_anchor does not exist!"); + break; + + /* + * For tmin and tmax, we need to do some extra work. These don't + * get filled in until the vacuum cleaner runs (or we manage to flush + * a page after setting the value correctly below). If the vacuum + * cleaner hasn't run yet, then the times stored in the tuple are + * wrong, and we need to look up the commit time of the transaction. + * We cache this value in the tuple to avoid doing the work more than + * once. + */ + + case MinAbsoluteTimeAttributeNumber: + if (!AbsoluteTimeIsBackwardCompatiblyValid(tup->t_tmin) && + TransactionIdDidCommit(tup->t_xmin)) + tup->t_tmin = TransactionIdGetCommitTime(tup->t_xmin); + return ((char *) (long) tup->t_tmin); + case MaxAbsoluteTimeAttributeNumber: + if (!AbsoluteTimeIsBackwardCompatiblyReal(tup->t_tmax)) { + if (TransactionIdDidCommit(tup->t_xmax)) + tup->t_tmax = TransactionIdGetCommitTime(tup->t_xmax); + else + tup->t_tmax = CURRENT_ABSTIME; + } + return ((char *) (long) tup->t_tmax); + case VersionTypeAttributeNumber: + return ((char *) (long) tup->t_vtype); + default: + elog(WARN, "heap_getsysattr: undefined attnum %d", attnum); + } + return(NULL); +} + +/* ---------------- + * fastgetattr + * + * This is a newer version of fastgetattr which attempts to be + * faster by caching attribute offsets in the attribute descriptor. + * + * an alternate way to speed things up would be to cache offsets + * with the tuple, but that seems more difficult unless you take + * the storage hit of actually putting those offsets into the + * tuple you send to disk. Yuck. + * + * This scheme will be slightly slower than that, but should + * preform well for queries which hit large #'s of tuples. After + * you cache the offsets once, examining all the other tuples using + * the same attribute descriptor will go much quicker. 
-cim 5/4/91 + * ---------------- + */ +char * +fastgetattr(HeapTuple tup, + int attnum, + TupleDesc tupleDesc, + bool *isnull) +{ + char *tp; /* ptr to att in tuple */ + bits8 *bp; /* ptr to att in tuple */ + int slow; /* do we have to walk nulls? */ + AttributeTupleForm *att = tupleDesc->attrs; + + /* ---------------- + * sanity checks + * ---------------- + */ + + Assert(PointerIsValid(isnull)); + Assert(attnum > 0); + + /* ---------------- + * Three cases: + * + * 1: No nulls and no variable length attributes. + * 2: Has a null or a varlena AFTER att. + * 3: Has nulls or varlenas BEFORE att. + * ---------------- + */ + + *isnull = false; + + if (HeapTupleNoNulls(tup)) { + attnum--; + if (att[attnum]->attcacheoff > 0) { + return (char *) + fetchatt( &(att[attnum]), + (char *)tup + tup->t_hoff + att[attnum]->attcacheoff); + } else if (attnum == 0) { + /* + * first attribute is always at position zero + */ + return((char *) fetchatt(&(att[0]), (char *) tup + tup->t_hoff)); + } + + tp = (char *) tup + tup->t_hoff; + + slow = 0; + } else { + /* + * there's a null somewhere in the tuple + */ + + bp = tup->t_bits; + tp = (char *) tup + tup->t_hoff; + slow = 0; + attnum--; + + /* ---------------- + * check to see if desired att is null + * ---------------- + */ + + if (att_isnull(attnum, bp)) { + *isnull = true; + return NULL; + } + + /* ---------------- + * Now check to see if any preceeding bits are null... + * ---------------- + */ + + { + register int i = 0; /* current offset in bp */ + + for (i = 0; i < attnum && !slow; i++) { + if (att_isnull(i, bp)) slow = 1; + } + } + } + + /* + * now check for any non-fixed length attrs before our attribute + */ + if (!slow) { + if (att[attnum]->attcacheoff > 0) { + return (char *) + fetchatt(&(att[attnum]), + tp + att[attnum]->attcacheoff); + } else if (attnum == 0) { + return (char *) + fetchatt(&(att[0]), (char *) tup + tup->t_hoff); + } else if (!HeapTupleAllFixed(tup)) { + register int j = 0; + + for (j = 0; j < attnum && !slow; j++) + if (att[j]->attlen < 1) slow = 1; + } + } + + /* + * if slow is zero, and we got here, we know that we have a tuple with + * no nulls. We also have to initialize the remainder of + * the attribute cached offset values. + */ + if (!slow) { + register int j = 1; + register long off; + + /* + * need to set cache for some atts + */ + + att[0]->attcacheoff = 0; + + while (att[j]->attcacheoff > 0) j++; + + off = att[j-1]->attcacheoff + att[j-1]->attlen; + + for (; j < attnum + 1; j++) { + switch(att[j]->attlen) { + case -1: + off = (att[j]->attalign=='d') ? + DOUBLEALIGN(off) : INTALIGN(off); + break; + case sizeof(char): + break; + case sizeof(short): + off = SHORTALIGN(off); + break; + case sizeof(int32): + off = INTALIGN(off); + break; + default: + if (att[j]->attlen < sizeof(int32)) { + elog(WARN, + "fastgetattr: attribute %d has len %d", + j, att[j]->attlen); + } + if (att[j]->attalign == 'd') + off = DOUBLEALIGN(off); + else + off = LONGALIGN(off); + break; + } + + att[j]->attcacheoff = off; + off += att[j]->attlen; + } + + return + (char *)fetchatt(&(att[attnum]), tp + att[attnum]->attcacheoff); + } else { + register bool usecache = true; + register int off = 0; + register int i; + + /* + * Now we know that we have to walk the tuple CAREFULLY. + * + * Note - This loop is a little tricky. On iteration i we + * first set the offset for attribute i and figure out how much + * the offset should be incremented. 
Finally, we need to align the + * offset based on the size of attribute i+1 (for which the offset + * has been computed). -mer 12 Dec 1991 + */ + + for (i = 0; i < attnum; i++) { + if (!HeapTupleNoNulls(tup)) { + if (att_isnull(i, bp)) { + usecache = false; + continue; + } + } + switch (att[i]->attlen) { + case -1: + off = (att[i]->attalign=='d') ? + DOUBLEALIGN(off) : INTALIGN(off); + break; + case sizeof(char): + break; + case sizeof(short): + off = SHORTALIGN(off); + break; + case sizeof(int32): + off = INTALIGN(off); + break; + default: + if (att[i]->attlen < sizeof(int32)) + elog(WARN, + "fastgetattr2: attribute %d has len %d", + i, att[i]->attlen); + if (att[i]->attalign == 'd') + off = DOUBLEALIGN(off); + else + off = LONGALIGN(off); + break; + } + if (usecache && att[i]->attcacheoff > 0) { + off = att[i]->attcacheoff; + if (att[i]->attlen == -1) { + usecache = false; + } + } else { + if (usecache) att[i]->attcacheoff = off; + } + + switch(att[i]->attlen) { + case sizeof(char): + off++; + break; + case sizeof(int16): + off += sizeof(int16); + break; + case sizeof(int32): + off += sizeof(int32); + break; + case -1: + usecache = false; + off += VARSIZE(tp + off); + break; + default: + off += att[i]->attlen; + break; + } + } + switch (att[attnum]->attlen) { + case -1: + off = (att[attnum]->attalign=='d')? + DOUBLEALIGN(off) : INTALIGN(off); + break; + case sizeof(char): + break; + case sizeof(short): + off = SHORTALIGN(off); + break; + case sizeof(int32): + off = INTALIGN(off); + break; + default: + if (att[attnum]->attlen < sizeof(int32)) + elog(WARN, "fastgetattr3: attribute %d has len %d", + attnum, att[attnum]->attlen); + if (att[attnum]->attalign == 'd') + off = DOUBLEALIGN(off); + else + off = LONGALIGN(off); + break; + } + return((char *) fetchatt(&(att[attnum]), tp + off)); + } +} + +/* ---------------- + * heap_getattr + * + * returns an attribute from a heap tuple. uses + * ---------------- + */ +char * +heap_getattr(HeapTuple tup, + Buffer b, + int attnum, + TupleDesc tupleDesc, + bool *isnull) +{ + bool localIsNull; + + /* ---------------- + * sanity checks + * ---------------- + */ + Assert(tup != NULL); + + if (! PointerIsValid(isnull)) + isnull = &localIsNull; + + if (attnum > (int) tup->t_natts) { + *isnull = true; + return ((char *) NULL); + } + + /* ---------------- + * take care of user defined attributes + * ---------------- + */ + if (attnum > 0) { + char *datum; + datum = fastgetattr(tup, attnum, tupleDesc, isnull); + + return (datum); + } + + /* ---------------- + * take care of system attributes + * ---------------- + */ + *isnull = false; + return + heap_getsysattr(tup, b, attnum); +} + +/* ---------------- + * heap_copytuple + * + * returns a copy of an entire tuple + * ---------------- + */ +HeapTuple +heap_copytuple(HeapTuple tuple) +{ + HeapTuple newTuple; + + if (! 
HeapTupleIsValid(tuple)) + return (NULL); + + /* XXX For now, just prevent an undetectable executor related error */ + if (tuple->t_len > MAXTUPLEN) { + elog(WARN, "palloctup: cannot handle length %d tuples", + tuple->t_len); + } + + newTuple = (HeapTuple) palloc(tuple->t_len); + memmove((char *) newTuple, (char *) tuple, (int) tuple->t_len); + return(newTuple); +} + +/* ---------------- + * heap_deformtuple + * + * the inverse of heap_formtuple (see below) + * ---------------- + */ +void +heap_deformtuple(HeapTuple tuple, + TupleDesc tdesc, + Datum values[], + char nulls[]) +{ + int i; + int natts; + + Assert(HeapTupleIsValid(tuple)); + + natts = tuple->t_natts; + for (i = 0; i<natts; i++) { + bool isnull; + + values[i] = (Datum)heap_getattr(tuple, + InvalidBuffer, + i+1, + tdesc, + &isnull); + if (isnull) + nulls[i] = 'n'; + else + nulls[i] = ' '; + } +} + +/* ---------------- + * heap_formtuple + * + * constructs a tuple from the given value[] and null[] arrays + * + * old comments + * Handles alignment by aligning 2 byte attributes on short boundries + * and 3 or 4 byte attributes on long word boundries on a vax; and + * aligning non-byte attributes on short boundries on a sun. Does + * not properly align fixed length arrays of 1 or 2 byte types (yet). + * + * Null attributes are indicated by a 'n' in the appropriate byte + * of the null[]. Non-null attributes are indicated by a ' ' (space). + * + * Fix me. (Figure that must keep context if debug--allow give oid.) + * Assumes in order. + * ---------------- + */ +HeapTuple +heap_formtuple(TupleDesc tupleDescriptor, + Datum value[], + char nulls[]) +{ + char *tp; /* tuple pointer */ + HeapTuple tuple; /* return tuple */ + int bitmaplen; + long len; + int hoff; + bool hasnull = false; + int i; + int numberOfAttributes = tupleDescriptor->natts; + + len = sizeof *tuple - sizeof tuple->t_bits; + + for (i = 0; i < numberOfAttributes && !hasnull; i++) { + if (nulls[i] != ' ') hasnull = true; + } + + if (numberOfAttributes > MaxHeapAttributeNumber) + elog(WARN, "heap_formtuple: numberOfAttributes of %d > %d", + numberOfAttributes, MaxHeapAttributeNumber); + + if (hasnull) { + bitmaplen = BITMAPLEN(numberOfAttributes); + len += bitmaplen; + } + + hoff = len = DOUBLEALIGN(len); /* be conservative here */ + + len += ComputeDataSize(tupleDescriptor, value, nulls); + + tp = (char *) palloc(len); + tuple = (HeapTuple) tp; + + memset(tp, 0, (int)len); + + tuple->t_len = len; + tuple->t_natts = numberOfAttributes; + tuple->t_hoff = hoff; + tuple->t_tmin = INVALID_ABSTIME; + tuple->t_tmax = CURRENT_ABSTIME; + + DataFill((char *)tuple + tuple->t_hoff, + tupleDescriptor, + value, + nulls, + &tuple->t_infomask, + (hasnull ? tuple->t_bits : NULL)); + + return (tuple); +} + +/* ---------------- + * heap_modifytuple + * + * forms a new tuple from an old tuple and a set of replacement values. 
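+ *
+ * (Editorial note, not part of the original commit: per the code
+ * below, repl[] holds ' ' for attributes copied from the old
+ * tuple and 'r' for attributes taken from replValue[] and
+ * replNull[]; any other byte raises an error.)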
+ * ---------------- + */ +HeapTuple +heap_modifytuple(HeapTuple tuple, + Buffer buffer, + Relation relation, + Datum replValue[], + char replNull[], + char repl[]) +{ + int attoff; + int numberOfAttributes; + Datum *value; + char *nulls; + bool isNull; + HeapTuple newTuple; + int madecopy; + uint8 infomask; + + /* ---------------- + * sanity checks + * ---------------- + */ + Assert(HeapTupleIsValid(tuple)); + Assert(BufferIsValid(buffer) || RelationIsValid(relation)); + Assert(HeapTupleIsValid(tuple)); + Assert(PointerIsValid(replValue)); + Assert(PointerIsValid(replNull)); + Assert(PointerIsValid(repl)); + + /* ---------------- + * if we're pointing to a disk page, then first + * make a copy of our tuple so that all the attributes + * are available. XXX this is inefficient -cim + * ---------------- + */ + madecopy = 0; + if (BufferIsValid(buffer) == true) { + relation = (Relation) BufferGetRelation(buffer); + tuple = heap_copytuple(tuple); + madecopy = 1; + } + + numberOfAttributes = RelationGetRelationTupleForm(relation)->relnatts; + + /* ---------------- + * allocate and fill value[] and nulls[] arrays from either + * the tuple or the repl information, as appropriate. + * ---------------- + */ + value = (Datum *) palloc(numberOfAttributes * sizeof *value); + nulls = (char *) palloc(numberOfAttributes * sizeof *nulls); + + for (attoff = 0; + attoff < numberOfAttributes; + attoff += 1) { + + if (repl[attoff] == ' ') { + char *attr; + + attr = + heap_getattr(tuple, + InvalidBuffer, + AttrOffsetGetAttrNumber(attoff), + RelationGetTupleDescriptor(relation), + &isNull) ; + value[attoff] = PointerGetDatum(attr); + nulls[attoff] = (isNull) ? 'n' : ' '; + + } else if (repl[attoff] != 'r') { + elog(WARN, "heap_modifytuple: repl is \\%3d", repl[attoff]); + + } else { /* == 'r' */ + value[attoff] = replValue[attoff]; + nulls[attoff] = replNull[attoff]; + } + } + + /* ---------------- + * create a new tuple from the values[] and nulls[] arrays + * ---------------- + */ + newTuple = heap_formtuple(RelationGetTupleDescriptor(relation), + value, + nulls); + + /* ---------------- + * copy the header except for t_len, t_natts, t_hoff, t_bits, t_infomask + * ---------------- + */ + infomask = newTuple->t_infomask; + memmove((char *) &newTuple->t_ctid, /*XXX*/ + (char *) &tuple->t_ctid, + ((char *) &tuple->t_hoff - (char *) &tuple->t_ctid)); /*XXX*/ + newTuple->t_infomask = infomask; + newTuple->t_natts = numberOfAttributes; /* fix t_natts just in case */ + + /* ---------------- + * if we made a copy of the tuple, then free it. 
+ * ---------------- + */ + if (madecopy) + pfree(tuple); + + return + newTuple; +} + +/* ---------------------------------------------------------------- + * other misc functions + * ---------------------------------------------------------------- + */ + +HeapTuple +heap_addheader(uint32 natts, /* max domain index */ + int structlen, /* its length */ + char *structure) /* pointer to the struct */ +{ + register char *tp; /* tuple data pointer */ + HeapTuple tup; + long len; + int hoff; + + AssertArg(natts > 0); + + len = sizeof (HeapTupleData) - sizeof (tup->t_bits); + + hoff = len = DOUBLEALIGN(len); /* be conservative */ + len += structlen; + tp = (char *) palloc(len); + tup = (HeapTuple) tp; + memset((char*)tup, 0, len); + + tup->t_len = (short) len; /* XXX */ + tp += tup->t_hoff = hoff; + tup->t_natts = natts; + tup->t_infomask = 0; + + memmove(tp, structure, structlen); + + return (tup); +} diff --git a/src/backend/access/common/heapvalid.c b/src/backend/access/common/heapvalid.c new file mode 100644 index 00000000000..b80c5dd9eb0 --- /dev/null +++ b/src/backend/access/common/heapvalid.c @@ -0,0 +1,134 @@ +/*------------------------------------------------------------------------- + * + * heapvalid.c-- + * heap tuple qualification validity checking code + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/common/Attic/heapvalid.c,v 1.1.1.1 1996/07/09 06:21:09 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include "c.h" + +#include "access/htup.h" +#include "access/skey.h" +#include "access/heapam.h" +#include "utils/tqual.h" +#include "access/valid.h" /* where the declarations go */ +#include "access/xact.h" + +#include "storage/buf.h" +#include "storage/bufmgr.h" +#include "storage/bufpage.h" +#include "storage/itemid.h" +#include "fmgr.h" +#include "utils/elog.h" +#include "utils/rel.h" + +/* ---------------- + * heap_keytest + * + * Test a heap tuple with respect to a scan key. + * ---------------- + */ +bool +heap_keytest(HeapTuple t, + TupleDesc tupdesc, + int nkeys, + ScanKey keys) +{ + bool isnull; + Datum atp; + int test; + + for (; nkeys--; keys++) { + atp = (Datum)heap_getattr(t, InvalidBuffer, + keys->sk_attno, + tupdesc, + &isnull); + + if (isnull) + /* XXX eventually should check if SK_ISNULL */ + return false; + + if (keys->sk_flags & SK_COMMUTE) + test = (long) FMGR_PTR2(keys->sk_func, keys->sk_procedure, + keys->sk_argument, atp); + else + test = (long) FMGR_PTR2(keys->sk_func, keys->sk_procedure, + atp, keys->sk_argument); + + if (!test == !(keys->sk_flags & SK_NEGATE)) + return false; + } + + return true; +} + +/* ---------------- + * heap_tuple_satisfies + * + * Returns a valid HeapTuple if it satisfies the timequal and keytest. + * Returns NULL otherwise. Used to be heap_satisifies (sic) which + * returned a boolean. It now returns a tuple so that we can avoid doing two + * PageGetItem's per tuple. + * + * Complete check of validity including LP_CTUP and keytest. + * This should perhaps be combined with valid somehow in the + * future. (Also, additional rule tests/time range tests.) + * + * on 8/21/92 mao says: i rearranged the tests here to do keytest before + * SatisfiesTimeQual. profiling indicated that even for vacuumed relations, + * time qual checking was more expensive than key testing. time qual is + * least likely to fail, too. 
we should really add the time qual test to + * the restriction and optimize it in the normal way. this has interactions + * with joey's expensive function work. + * ---------------- + */ +HeapTuple +heap_tuple_satisfies(ItemId itemId, + Relation relation, + PageHeader disk_page, + TimeQual qual, + int nKeys, + ScanKey key) +{ + HeapTuple tuple; + bool res; + + if (! ItemIdIsUsed(itemId)) + return NULL; + + tuple = (HeapTuple) PageGetItem((Page) disk_page, itemId); + + if (key != NULL) + res = heap_keytest(tuple, RelationGetTupleDescriptor(relation), + nKeys, key); + else + res = TRUE; + + if (res && (relation->rd_rel->relkind == RELKIND_UNCATALOGED + || HeapTupleSatisfiesTimeQual(tuple,qual))) + return tuple; + + return (HeapTuple) NULL; +} + +/* + * TupleUpdatedByCurXactAndCmd() -- Returns true if this tuple has + * already been updated once by the current transaction/command + * pair. + */ +bool +TupleUpdatedByCurXactAndCmd(HeapTuple t) +{ + if (TransactionIdEquals(t->t_xmax, + GetCurrentTransactionId()) && + t->t_cmax == GetCurrentCommandId()) + return true; + + return false; +} diff --git a/src/backend/access/common/indextuple.c b/src/backend/access/common/indextuple.c new file mode 100644 index 00000000000..be5d2ccbd96 --- /dev/null +++ b/src/backend/access/common/indextuple.c @@ -0,0 +1,427 @@ +/*------------------------------------------------------------------------- + * + * indextuple.c-- + * This file contains index tuple accessor and mutator routines, + * as well as a few various tuple utilities. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/common/indextuple.c,v 1.1.1.1 1996/07/09 06:21:09 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include <string.h> + +#include "c.h" +#include "access/ibit.h" +#include "access/itup.h" /* where the declarations go */ +#include "access/heapam.h" +#include "access/genam.h" +#include "access/tupdesc.h" +#include "access/tupmacs.h" + +#include "storage/itemptr.h" +#include "utils/elog.h" +#include "utils/palloc.h" + +static Size IndexInfoFindDataOffset(unsigned short t_info); + +/* ---------------------------------------------------------------- + * index_ tuple interface routines + * ---------------------------------------------------------------- + */ + +/* ---------------- + * index_formtuple + * ---------------- + */ +IndexTuple +index_formtuple(TupleDesc tupleDescriptor, + Datum value[], + char null[]) +{ + register char *tp; /* tuple pointer */ + IndexTuple tuple; /* return tuple */ + Size size, hoff; + int i; + unsigned short infomask = 0; + bool hasnull = false; + char tupmask = 0; + int numberOfAttributes = tupleDescriptor->natts; + + if (numberOfAttributes > MaxIndexAttributeNumber) + elog(WARN, "index_formtuple: numberOfAttributes of %d > %d", + numberOfAttributes, MaxIndexAttributeNumber); + + + for (i = 0; i < numberOfAttributes && !hasnull; i++) { + if (null[i] != ' ') hasnull = true; + } + + if (hasnull) infomask |= INDEX_NULL_MASK; + + hoff = IndexInfoFindDataOffset(infomask); + size = hoff + + ComputeDataSize(tupleDescriptor, + value, null); + size = DOUBLEALIGN(size); /* be conservative */ + + tp = (char *) palloc(size); + tuple = (IndexTuple) tp; + memset(tp,0,(int)size); + + DataFill((char *)tp + hoff, + tupleDescriptor, + value, + null, + &tupmask, + (hasnull ? 
(bits8*)tp + sizeof(*tuple) : NULL)); + + /* + * We do this because DataFill wants to initialize a "tupmask" which + * is used for HeapTuples, but we want an indextuple infomask. The only + * "relevent" info is the "has variable attributes" field, which is in + * mask position 0x02. We have already set the null mask above. + */ + + if (tupmask & 0x02) infomask |= INDEX_VAR_MASK; + + /* + * Here we make sure that we can actually hold the size. We also want + * to make sure that size is not aligned oddly. This actually is a + * rather odd way to make sure the size is not too large overall. + */ + + if (size & 0xE000) + elog(WARN, "index_formtuple: data takes %d bytes: too big", size); + + + infomask |= size; + + /* ---------------- + * initialize metadata + * ---------------- + */ + tuple->t_info = infomask; + return (tuple); +} + +/* ---------------- + * fastgetiattr + * + * This is a newer version of fastgetiattr which attempts to be + * faster by caching attribute offsets in the attribute descriptor. + * + * an alternate way to speed things up would be to cache offsets + * with the tuple, but that seems more difficult unless you take + * the storage hit of actually putting those offsets into the + * tuple you send to disk. Yuck. + * + * This scheme will be slightly slower than that, but should + * preform well for queries which hit large #'s of tuples. After + * you cache the offsets once, examining all the other tuples using + * the same attribute descriptor will go much quicker. -cim 5/4/91 + * ---------------- + */ +char * +fastgetiattr(IndexTuple tup, + int attnum, + TupleDesc tupleDesc, + bool *isnull) +{ + register char *tp; /* ptr to att in tuple */ + register char *bp; /* ptr to att in tuple */ + int slow; /* do we have to walk nulls? */ + register int data_off; /* tuple data offset */ + + /* ---------------- + * sanity checks + * ---------------- + */ + + Assert(PointerIsValid(isnull)); + Assert(attnum > 0); + + /* ---------------- + * Three cases: + * + * 1: No nulls and no variable length attributes. + * 2: Has a null or a varlena AFTER att. + * 3: Has nulls or varlenas BEFORE att. + * ---------------- + */ + + *isnull = false; + data_off = IndexTupleHasMinHeader(tup) ? sizeof *tup : + IndexInfoFindDataOffset(tup->t_info); + + if (IndexTupleNoNulls(tup)) { + + /* first attribute is always at position zero */ + + if (attnum == 1) { + return(fetchatt(&(tupleDesc->attrs[0]), (char *) tup + data_off)); + } + attnum--; + + if (tupleDesc->attrs[attnum]->attcacheoff > 0) { + return(fetchatt(&(tupleDesc->attrs[attnum]), + (char *) tup + data_off + + tupleDesc->attrs[attnum]->attcacheoff)); + } + + tp = (char *) tup + data_off; + + slow = 0; + }else { /* there's a null somewhere in the tuple */ + + bp = (char *) tup + sizeof(*tup); /* "knows" t_bits are here! */ + slow = 0; + /* ---------------- + * check to see if desired att is null + * ---------------- + */ + + attnum--; + { + if (att_isnull(attnum, bp)) { + *isnull = true; + return NULL; + } + } + /* ---------------- + * Now check to see if any preceeding bits are null... 
+ * ---------------- + */ + { + register int i = 0; /* current offset in bp */ + register int mask; /* bit in byte we're looking at */ + register char n; /* current byte in bp */ + register int byte, finalbit; + + byte = attnum >> 3; + finalbit = attnum & 0x07; + + for (; i <= byte; i++) { + n = bp[i]; + if (i < byte) { + /* check for nulls in any "earlier" bytes */ + if ((~n) != 0) { + slow++; + break; + } + } else { + /* check for nulls "before" final bit of last byte*/ + mask = (finalbit << 1) - 1; + if ((~n) & mask) + slow++; + } + } + } + tp = (char *) tup + data_off; + } + + /* now check for any non-fixed length attrs before our attribute */ + + if (!slow) { + if (tupleDesc->attrs[attnum]->attcacheoff > 0) { + return(fetchatt(&(tupleDesc->attrs[attnum]), + tp + tupleDesc->attrs[attnum]->attcacheoff)); + }else if (!IndexTupleAllFixed(tup)) { + register int j = 0; + + for (j = 0; j < attnum && !slow; j++) + if (tupleDesc->attrs[j]->attlen < 1) slow = 1; + } + } + + /* + * if slow is zero, and we got here, we know that we have a tuple with + * no nulls. We also know that we have to initialize the remainder of + * the attribute cached offset values. + */ + + if (!slow) { + register int j = 1; + register long off; + + /* + * need to set cache for some atts + */ + + tupleDesc->attrs[0]->attcacheoff = 0; + + while (tupleDesc->attrs[j]->attcacheoff > 0) j++; + + off = tupleDesc->attrs[j-1]->attcacheoff + + tupleDesc->attrs[j-1]->attlen; + + for (; j < attnum + 1; j++) { + /* + * Fix me when going to a machine with more than a four-byte + * word! + */ + + switch(tupleDesc->attrs[j]->attlen) + { + case -1: + off = (tupleDesc->attrs[j]->attalign=='d')? + DOUBLEALIGN(off):INTALIGN(off); + break; + case sizeof(char): + break; + case sizeof(short): + off = SHORTALIGN(off); + break; + case sizeof(int32): + off = INTALIGN(off); + break; + default: + if (tupleDesc->attrs[j]->attlen > sizeof(int32)) + off = (tupleDesc->attrs[j]->attalign=='d')? + DOUBLEALIGN(off) : LONGALIGN(off); + else + elog(WARN, "fastgetiattr: attribute %d has len %d", + j, tupleDesc->attrs[j]->attlen); + break; + + } + + tupleDesc->attrs[j]->attcacheoff = off; + off += tupleDesc->attrs[j]->attlen; + } + + return(fetchatt( &(tupleDesc->attrs[attnum]), + tp + tupleDesc->attrs[attnum]->attcacheoff)); + }else { + register bool usecache = true; + register int off = 0; + register int i; + + /* + * Now we know that we have to walk the tuple CAREFULLY. + */ + + for (i = 0; i < attnum; i++) { + if (!IndexTupleNoNulls(tup)) { + if (att_isnull(i, bp)) { + usecache = false; + continue; + } + } + + if (usecache && tupleDesc->attrs[i]->attcacheoff > 0) { + off = tupleDesc->attrs[i]->attcacheoff; + if (tupleDesc->attrs[i]->attlen == -1) + usecache = false; + else + continue; + } + + if (usecache) tupleDesc->attrs[i]->attcacheoff = off; + switch(tupleDesc->attrs[i]->attlen) + { + case sizeof(char): + off++; + break; + case sizeof(short): + off = SHORTALIGN(off) + sizeof(short); + break; + case -1: + usecache = false; + off = (tupleDesc->attrs[i]->attalign=='d')? + DOUBLEALIGN(off):INTALIGN(off); + off += VARSIZE(tp + off); + break; + default: + if (tupleDesc->attrs[i]->attlen > sizeof(int32)) + off = (tupleDesc->attrs[i]->attalign=='d') ? 
+ DOUBLEALIGN(off) + tupleDesc->attrs[i]->attlen : + LONGALIGN(off) + tupleDesc->attrs[i]->attlen; + else + elog(WARN, "fastgetiattr2: attribute %d has len %d", + i, tupleDesc->attrs[i]->attlen); + + break; + } + } + + return(fetchatt(&tupleDesc->attrs[attnum], tp + off)); + } +} + +/* ---------------- + * index_getattr + * ---------------- + */ +Datum +index_getattr(IndexTuple tuple, + AttrNumber attNum, + TupleDesc tupDesc, + bool *isNullOutP) +{ + Assert (attNum > 0); + + return (Datum) + fastgetiattr(tuple, attNum, tupDesc, isNullOutP); +} + +RetrieveIndexResult +FormRetrieveIndexResult(ItemPointer indexItemPointer, + ItemPointer heapItemPointer) +{ + RetrieveIndexResult result; + + Assert(ItemPointerIsValid(indexItemPointer)); + Assert(ItemPointerIsValid(heapItemPointer)); + + result = (RetrieveIndexResult) palloc(sizeof *result); + + result->index_iptr = *indexItemPointer; + result->heap_iptr = *heapItemPointer; + + return (result); +} + +/* + * Takes an infomask as argument (primarily because this needs to be usable + * at index_formtuple time so enough space is allocated). + * + * Change me if adding an attribute to IndexTuples!!!!!!!!!!! + */ +static Size +IndexInfoFindDataOffset(unsigned short t_info) +{ + if (!(t_info & INDEX_NULL_MASK)) + return((Size) sizeof(IndexTupleData)); + else { + Size size = sizeof(IndexTupleData); + + if (t_info & INDEX_NULL_MASK) { + size += sizeof(IndexAttributeBitMapData); + } + return DOUBLEALIGN(size); /* be conservative */ + } +} + +/* + * Copies source into target. If *target == NULL, we palloc space; otherwise + * we assume we have space that is already palloc'ed. + */ +void +CopyIndexTuple(IndexTuple source, IndexTuple *target) +{ + Size size; + IndexTuple ret; + + size = IndexTupleSize(source); + if (*target == NULL) { + *target = (IndexTuple) palloc(size); + } + + ret = *target; + memmove((char*)ret, (char*)source, size); +} + diff --git a/src/backend/access/common/indexvalid.c b/src/backend/access/common/indexvalid.c new file mode 100644 index 00000000000..b437718cecc --- /dev/null +++ b/src/backend/access/common/indexvalid.c @@ -0,0 +1,84 @@ +/*------------------------------------------------------------------------- + * + * indexvalid.c-- + * index tuple qualification validity checking code + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/common/Attic/indexvalid.c,v 1.1.1.1 1996/07/09 06:21:09 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "executor/execdebug.h" +#include "access/genam.h" +#include "access/iqual.h" /* where the declarations go */ +#include "access/itup.h" +#include "access/skey.h" + +#include "storage/buf.h" +#include "storage/bufpage.h" +#include "storage/itemid.h" +#include "utils/rel.h" + +/* ---------------------------------------------------------------- + * index scan key qualification code + * ---------------------------------------------------------------- + */ +int NIndexTupleProcessed; + +/* ---------------- + * index_keytest + * + * old comments + * May eventually combine with other tests (like timeranges)? + * Should have Buffer buffer; as an argument and pass it to amgetattr. 
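+ *
+ * (Editorial note, not part of the original commit: the loop
+ * below always fetches attribute 1 rather than the key's
+ * sk_attno, which appears to assume the single-attribute index
+ * keys of this era.)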
+ * ---------------- + */ +bool +index_keytest(IndexTuple tuple, + TupleDesc tupdesc, + int scanKeySize, + ScanKey key) +{ + bool isNull; + Datum datum; + int test; + + IncrIndexProcessed(); + + while (scanKeySize > 0) { + datum = index_getattr(tuple, + 1, + tupdesc, + &isNull); + + if (isNull) { + /* XXX eventually should check if SK_ISNULL */ + return (false); + } + + if (key[0].sk_flags & SK_COMMUTE) { + test = (int) (*(key[0].sk_func)) + (DatumGetPointer(key[0].sk_argument), + datum); + } else { + test = (int) (*(key[0].sk_func)) + (datum, + DatumGetPointer(key[0].sk_argument)); + } + + if (!test == !(key[0].sk_flags & SK_NEGATE)) { + return (false); + } + + scanKeySize -= 1; + key++; + } + + return (true); +} + diff --git a/src/backend/access/common/printtup.c b/src/backend/access/common/printtup.c new file mode 100644 index 00000000000..556b73b9dfd --- /dev/null +++ b/src/backend/access/common/printtup.c @@ -0,0 +1,306 @@ +/*------------------------------------------------------------------------- + * + * printtup.c-- + * Routines to print out tuples to the destination (binary or non-binary + * portals, frontend/interactive backend, etc.). + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/common/printtup.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include <sys/file.h> +#include <stdio.h> +#include <string.h> + +#include "postgres.h" + +#include "access/heapam.h" +#include "access/htup.h" +#include "access/skey.h" +#include "access/printtup.h" +#include "access/tupdesc.h" +#include "storage/buf.h" +#include "utils/memutils.h" +#include "utils/palloc.h" +#include "fmgr.h" +#include "utils/elog.h" + +#include "utils/syscache.h" +#include "catalog/pg_type.h" + +#include "libpq/libpq.h" + +/* ---------------------------------------------------------------- + * printtup / debugtup support + * ---------------------------------------------------------------- + */ + +/* ---------------- + * typtoout - used by printtup and debugtup + * ---------------- + */ +Oid +typtoout(Oid type) +{ + HeapTuple typeTuple; + + typeTuple = SearchSysCacheTuple(TYPOID, + ObjectIdGetDatum(type), + 0, 0, 0); + + if (HeapTupleIsValid(typeTuple)) + return((Oid) + ((TypeTupleForm) GETSTRUCT(typeTuple))->typoutput); + + elog(WARN, "typtoout: Cache lookup of type %d failed", type); + return(InvalidOid); +} + +Oid +gettypelem(Oid type) +{ + HeapTuple typeTuple; + + typeTuple = SearchSysCacheTuple(TYPOID, + ObjectIdGetDatum(type), + 0,0,0); + + if (HeapTupleIsValid(typeTuple)) + return((Oid) + ((TypeTupleForm) GETSTRUCT(typeTuple))->typelem); + + elog(WARN, "typtoout: Cache lookup of type %d failed", type); + return(InvalidOid); +} + +/* ---------------- + * printtup + * ---------------- + */ +void +printtup(HeapTuple tuple, TupleDesc typeinfo) +{ + int i, j, k; + char *outputstr, *attr; + bool isnull; + Oid typoutput; + + /* ---------------- + * tell the frontend to expect new tuple data + * ---------------- + */ + pq_putnchar("D", 1); + + /* ---------------- + * send a bitmap of which attributes are null + * ---------------- + */ + j = 0; + k = 1 << 7; + for (i = 0; i < tuple->t_natts; ) { + attr = heap_getattr(tuple, InvalidBuffer, ++i, typeinfo, &isnull); + if (!isnull) + j |= k; + k >>= 1; + if (!(i & 7)) { + pq_putint(j, 1); + j = 0; + k = 1 << 7; + } + } + if (i & 7) + pq_putint(j, 1); + + /* ---------------- + * send the attributes of 
this tuple + * ---------------- + */ + for (i = 0; i < tuple->t_natts; ++i) { + attr = heap_getattr(tuple, InvalidBuffer, i+1, typeinfo, &isnull); + typoutput = typtoout((Oid) typeinfo->attrs[i]->atttypid); + + if (!isnull && OidIsValid(typoutput)) { + outputstr = fmgr(typoutput, attr, + gettypelem(typeinfo->attrs[i]->atttypid)); + pq_putint(strlen(outputstr)+4, 4); + pq_putnchar(outputstr, strlen(outputstr)); + pfree(outputstr); + } + } +} + +/* ---------------- + * printatt + * ---------------- + */ +static void +printatt(unsigned attributeId, + AttributeTupleForm attributeP, + char *value) +{ + printf("\t%2d: %.*s%s%s%s\t(typeid = %u, len = %d, byval = %c)\n", + attributeId, + NAMEDATALEN, /* attname is a char16 */ + attributeP->attname.data, + value != NULL ? " = \"" : "", + value != NULL ? value : "", + value != NULL ? "\"" : "", + (unsigned int) (attributeP->atttypid), + attributeP->attlen, + attributeP->attbyval ? 't' : 'f'); +} + +/* ---------------- + * showatts + * ---------------- + */ +void +showatts(char *name, TupleDesc tupleDesc) +{ + int i; + int natts = tupleDesc->natts; + AttributeTupleForm *attinfo = tupleDesc->attrs; + + puts(name); + for (i = 0; i < natts; ++i) + printatt((unsigned) i+1, attinfo[i], (char *) NULL); + printf("\t----\n"); +} + +/* ---------------- + * debugtup + * ---------------- + */ +void +debugtup(HeapTuple tuple, TupleDesc typeinfo) +{ + register int i; + char *attr, *value; + bool isnull; + Oid typoutput; + + for (i = 0; i < tuple->t_natts; ++i) { + attr = heap_getattr(tuple, InvalidBuffer, i+1, typeinfo, &isnull); + typoutput = typtoout((Oid) typeinfo->attrs[i]->atttypid); + + if (!isnull && OidIsValid(typoutput)) { + value = fmgr(typoutput, attr, + gettypelem(typeinfo->attrs[i]->atttypid)); + printatt((unsigned) i+1, typeinfo->attrs[i], value); + pfree(value); + } + } + printf("\t----\n"); +} + +/*#define IPORTAL_DEBUG*/ + +/* ---------------- + * printtup_internal + * Protocol expects either T, D, C, E, or N. + * We use a different data prefix, e.g. 'B' instead of 'D' to + * indicate a tuple in internal (binary) form. + * + * This is same as printtup, except we don't use the typout func. 
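+ *
+ * (Editorial note, not part of the original commit: as the code
+ * below shows, each non-null attribute is sent as an int32 byte
+ * count followed by the raw bytes; varlena values are sent
+ * without their length header, and fixed-size by-value data is
+ * first copied into a local of the matching width.)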
+ * ---------------- + */ +void +printtup_internal(HeapTuple tuple, TupleDesc typeinfo) +{ + int i, j, k; + char *attr; + bool isnull; + + /* ---------------- + * tell the frontend to expect new tuple data + * ---------------- + */ + pq_putnchar("B", 1); + + /* ---------------- + * send a bitmap of which attributes are null + * ---------------- + */ + j = 0; + k = 1 << 7; + for (i = 0; i < tuple->t_natts; ) { + attr = heap_getattr(tuple, InvalidBuffer, ++i, typeinfo, &isnull); + if (!isnull) + j |= k; + k >>= 1; + if (!(i & 7)) { + pq_putint(j, 1); + j = 0; + k = 1 << 7; + } + } + if (i & 7) + pq_putint(j, 1); + + /* ---------------- + * send the attributes of this tuple + * ---------------- + */ +#ifdef IPORTAL_DEBUG + fprintf(stderr, "sending tuple with %d atts\n", tuple->t_natts); +#endif + for (i = 0; i < tuple->t_natts; ++i) { + int32 len = typeinfo->attrs[i]->attlen; + + attr = heap_getattr(tuple, InvalidBuffer, i+1, typeinfo, &isnull); + if (!isnull) { + /* # of bytes, and opaque data */ + if (len == -1) { + /* variable length, assume a varlena structure */ + len = VARSIZE(attr) - VARHDRSZ; + + pq_putint(len, sizeof(int32)); + pq_putnchar(VARDATA(attr), len); +#ifdef IPORTAL_DEBUG + { + char *d = VARDATA(attr); + + fprintf(stderr, "length %d data %x%x%x%x\n", + len, *d, *(d+1), *(d+2), *(d+3)); + } +#endif + } else { + /* fixed size */ + if (typeinfo->attrs[i]->attbyval) { + int8 i8; + int16 i16; + int32 i32; + + pq_putint(len, sizeof(int32)); + switch (len) { + case sizeof(int8): + i8 = DatumGetChar(attr); + pq_putnchar((char *) &i8, len); + break; + case sizeof(int16): + i16 = DatumGetInt16(attr); + pq_putnchar((char *) &i16, len); + break; + case sizeof(int32): + i32 = DatumGetInt32(attr); + pq_putnchar((char *) &i32, len); + break; + } +#ifdef IPORTAL_DEBUG + fprintf(stderr, "byval length %d data %d\n", len, attr); +#endif + } else { + pq_putint(len, sizeof(int32)); + pq_putnchar(attr, len); +#ifdef IPORTAL_DEBUG + fprintf(stderr, "byref length %d data %x\n", len, attr); +#endif + } + } + } + } +} diff --git a/src/backend/access/common/scankey.c b/src/backend/access/common/scankey.c new file mode 100644 index 00000000000..7a47219a73c --- /dev/null +++ b/src/backend/access/common/scankey.c @@ -0,0 +1,68 @@ +/*------------------------------------------------------------------------- + * + * scan.c-- + * scan direction and key code + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/common/scankey.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include "c.h" +#include "access/sdir.h" +#include "access/attnum.h" +#include "access/skey.h" + +#include "fmgr.h" + +/* + * ScanKeyEntryIsLegal -- + * True iff the scan key entry is legal. + */ +#define ScanKeyEntryIsLegal(entry) \ + ((bool) (AssertMacro(PointerIsValid(entry)) && \ + AttributeNumberIsValid(entry->sk_attno))) + +/* + * ScanKeyEntrySetIllegal -- + * Marks a scan key entry as illegal. + */ +void +ScanKeyEntrySetIllegal(ScanKey entry) +{ + + Assert(PointerIsValid(entry)); + + entry->sk_flags = 0; /* just in case... */ + entry->sk_attno = InvalidAttrNumber; + entry->sk_procedure = 0; /* should be InvalidRegProcedure */ +} + +/* + * ScanKeyEntryInitialize -- + * Initializes an scan key entry. + * + * Note: + * Assumes the scan key entry is valid. + * Assumes the intialized scan key entry will be legal. 
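+ *
+ * Usage sketch (editorial, not part of the original commit;
+ * "eq_proc" is a placeholder for whatever RegProcedure
+ * implements the comparison, and "key" points at
+ * caller-allocated scan key storage):
+ *
+ *     ScanKeyEntryInitialize(key, (bits16) 0, (AttrNumber) 1,
+ *                            eq_proc, Int32GetDatum(42));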
+ */ +void +ScanKeyEntryInitialize(ScanKey entry, + bits16 flags, + AttrNumber attributeNumber, + RegProcedure procedure, + Datum argument) +{ + Assert(PointerIsValid(entry)); + + entry->sk_flags = flags; + entry->sk_attno = attributeNumber; + entry->sk_procedure = procedure; + entry->sk_argument = argument; + fmgr_info(procedure, &entry->sk_func, &entry->sk_nargs); + + Assert(ScanKeyEntryIsLegal(entry)); +} diff --git a/src/backend/access/common/tupdesc.c b/src/backend/access/common/tupdesc.c new file mode 100644 index 00000000000..527eb5113df --- /dev/null +++ b/src/backend/access/common/tupdesc.c @@ -0,0 +1,398 @@ +/*------------------------------------------------------------------------- + * + * tupdesc.c-- + * POSTGRES tuple descriptor support code + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/common/tupdesc.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $ + * + * NOTES + * some of the executor utility code such as "ExecTypeFromTL" should be + * moved here. + * + *------------------------------------------------------------------------- + */ +#include <stdio.h> /* for sprintf() */ +#include <ctype.h> +#include <string.h> + +#include "postgres.h" + +#include "nodes/pg_list.h" +#include "nodes/parsenodes.h" + +#include "access/attnum.h" +#include "access/htup.h" +#include "access/tupdesc.h" + +#include "utils/builtins.h" +#include "utils/elog.h" /* XXX generate exceptions instead */ +#include "utils/palloc.h" + +#include "utils/syscache.h" +#include "catalog/pg_type.h" + +#include "nodes/primnodes.h" + +#include "parser/catalog_utils.h" + +/* ---------------------------------------------------------------- + * CreateTemplateTupleDesc + * + * This function allocates and zeros a tuple descriptor structure. + * ---------------------------------------------------------------- + */ +TupleDesc +CreateTemplateTupleDesc(int natts) +{ + uint32 size; + TupleDesc desc; + + /* ---------------- + * sanity checks + * ---------------- + */ + AssertArg(natts >= 1); + + /* ---------------- + * allocate enough memory for the tuple descriptor and + * zero it as TupleDescInitEntry assumes that the descriptor + * is filled with NULL pointers. 
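+ *
+ * (Editorial note, not part of the original commit: only the
+ * attrs[] pointer array is zeroed here; the per-attribute
+ * structs themselves are palloc'd later, one at a time, by
+ * TupleDescInitEntry.)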
+ * ---------------- + */ + size = natts * sizeof (AttributeTupleForm); + desc = (TupleDesc) palloc(sizeof(struct tupleDesc)); + desc->attrs = (AttributeTupleForm*) palloc(size); + memset(desc->attrs, 0, size); + + desc->natts = natts; + + return (desc); +} + +/* ---------------------------------------------------------------- + * CreateTupleDesc + * + * This function allocates a new TupleDesc from AttributeTupleForm array + * ---------------------------------------------------------------- + */ +TupleDesc +CreateTupleDesc(int natts, AttributeTupleForm* attrs) +{ + TupleDesc desc; + + /* ---------------- + * sanity checks + * ---------------- + */ + AssertArg(natts >= 1); + + desc = (TupleDesc) palloc(sizeof(struct tupleDesc)); + desc->attrs = attrs; + desc->natts = natts; + + + return (desc); +} + +/* ---------------------------------------------------------------- + * CreateTupleDescCopy + * + * This function creates a new TupleDesc by copying from an existing + * TupleDesc + * + * ---------------------------------------------------------------- + */ +TupleDesc +CreateTupleDescCopy(TupleDesc tupdesc) +{ + TupleDesc desc; + int i, size; + + desc = (TupleDesc) palloc(sizeof(struct tupleDesc)); + desc->natts = tupdesc->natts; + size = desc->natts * sizeof (AttributeTupleForm); + desc->attrs = (AttributeTupleForm*) palloc(size); + for (i=0;i<desc->natts;i++) { + desc->attrs[i] = + (AttributeTupleForm)palloc(ATTRIBUTE_TUPLE_SIZE); + memmove(desc->attrs[i], + tupdesc->attrs[i], + ATTRIBUTE_TUPLE_SIZE); + } + return desc; +} + +/* ---------------------------------------------------------------- + * TupleDescInitEntry + * + * This function initializes a single attribute structure in + * a preallocated tuple descriptor. + * ---------------------------------------------------------------- + */ +bool +TupleDescInitEntry(TupleDesc desc, + AttrNumber attributeNumber, + char *attributeName, + char *typeName, + int attdim, + bool attisset) +{ + HeapTuple tuple; + TypeTupleForm typeForm; + AttributeTupleForm att; + + /* ---------------- + * sanity checks + * ---------------- + */ + AssertArg(PointerIsValid(desc)); + AssertArg(attributeNumber >= 1); + /* attributeName's are sometimes NULL, + from resdom's. I don't know why that is, though -- Jolly */ +/* AssertArg(NameIsValid(attributeName));*/ +/* AssertArg(NameIsValid(typeName));*/ + + AssertArg(!PointerIsValid(desc->attrs[attributeNumber - 1])); + + + /* ---------------- + * allocate storage for this attribute + * ---------------- + */ + + att = (AttributeTupleForm) palloc(ATTRIBUTE_TUPLE_SIZE); + desc->attrs[attributeNumber - 1] = att; + + /* ---------------- + * initialize some of the attribute fields + * ---------------- + */ + att->attrelid = 0; /* dummy value */ + + if (attributeName != NULL) + namestrcpy(&(att->attname), attributeName); + else + memset(att->attname.data,0,NAMEDATALEN); + + + att->attdefrel = 0; /* dummy value */ + att->attnvals = 0; /* dummy value */ + att->atttyparg = 0; /* dummy value */ + att->attbound = 0; /* dummy value */ + att->attcanindex = 0; /* dummy value */ + att->attproc = 0; /* dummy value */ + att->attcacheoff = -1; + + att->attnum = attributeNumber; + att->attnelems = attdim; + att->attisset = attisset; + + /* ---------------- + * search the system cache for the type tuple of the attribute + * we are creating so that we can get the typeid and some other + * stuff. 
+ * + * Note: in the special case of + * + * create EMP (name = char16, manager = EMP) + * + * RelationNameCreateHeapRelation() calls BuildDesc() which + * calls this routine and since EMP does not exist yet, the + * system cache lookup below fails. That's fine, but rather + * then doing a elog(WARN) we just leave that information + * uninitialized, return false, then fix things up later. + * -cim 6/14/90 + * ---------------- + */ + tuple = SearchSysCacheTuple(TYPNAME, PointerGetDatum(typeName), + 0,0,0); + if (! HeapTupleIsValid(tuple)) { + /* ---------------- + * here type info does not exist yet so we just fill + * the attribute with dummy information and return false. + * ---------------- + */ + att->atttypid = InvalidOid; + att->attlen = (int16) 0; + att->attbyval = (bool) 0; + att->attalign = 'i'; + return false; + } + + /* ---------------- + * type info exists so we initialize our attribute + * information from the type tuple we found.. + * ---------------- + */ + typeForm = (TypeTupleForm) GETSTRUCT(tuple); + + att->atttypid = tuple->t_oid; + att->attalign = typeForm->typalign; + + /* ------------------------ + If this attribute is a set, what is really stored in the + attribute is the OID of a tuple in the pg_proc catalog. + The pg_proc tuple contains the query string which defines + this set - i.e., the query to run to get the set. + So the atttypid (just assigned above) refers to the type returned + by this query, but the actual length of this attribute is the + length (size) of an OID. + + Why not just make the atttypid point to the OID type, instead + of the type the query returns? Because the executor uses the atttypid + to tell the front end what type will be returned (in BeginCommand), + and in the end the type returned will be the result of the query, not + an OID. + + Why not wait until the return type of the set is known (i.e., the + recursive call to the executor to execute the set has returned) + before telling the front end what the return type will be? Because + the executor is a delicate thing, and making sure that the correct + order of front-end commands is maintained is messy, especially + considering that target lists may change as inherited attributes + are considered, etc. Ugh. + ----------------------------------------- + */ + if (attisset) { + Type t = type("oid"); + att->attlen = tlen(t); + att->attbyval = tbyval(t); + } else { + att->attlen = typeForm->typlen; + att->attbyval = typeForm->typbyval; + } + + + return true; +} + + +/* ---------------------------------------------------------------- + * TupleDescMakeSelfReference + * + * This function initializes a "self-referential" attribute like + * manager in "create EMP (name=text, manager = EMP)". + * It calls TypeShellMake() which inserts a "shell" type + * tuple into pg_type. A self-reference is one kind of set, so + * its size and byval are the same as for a set. See the comments + * above in TupleDescInitEntry. 
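+ *
+ * Concretely, for "create EMP (name=text, manager = EMP)" the
+ * manager attribute comes out with atttypid set to the OID of the
+ * shell tuple that TypeShellMake() inserted, while attlen and
+ * attbyval describe an oid, since an oid is what actually gets
+ * stored in the attribute.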
+ * ---------------------------------------------------------------- + */ +static void +TupleDescMakeSelfReference(TupleDesc desc, + AttrNumber attnum, + char *relname) +{ + AttributeTupleForm att; + Type t = type("oid"); + + att = desc->attrs[attnum-1]; + att->atttypid = TypeShellMake(relname); + att->attlen = tlen(t); + att->attbyval = tbyval(t); + att->attnelems = 0; +} + +/* ---------------------------------------------------------------- + * BuildDescForRelation + * + * This is a general purpose function identical to BuildDesc + * but is used by the DefineRelation() code to catch the + * special case where you + * + * create FOO ( ..., x = FOO ) + * + * here, the initial type lookup for "x = FOO" will fail + * because FOO isn't in the catalogs yet. But since we + * are creating FOO, instead of doing an elog() we add + * a shell type tuple to pg_type and fix things later + * in amcreate(). + * ---------------------------------------------------------------- + */ +TupleDesc +BuildDescForRelation(List *schema, char *relname) +{ + int natts; + AttrNumber attnum; + List *p; + TupleDesc desc; + char *attname; + char *typename; + int attdim; + bool attisset; + + /* ---------------- + * allocate a new tuple descriptor + * ---------------- + */ + natts = length(schema); + desc = CreateTemplateTupleDesc(natts); + + attnum = 0; + + typename = palloc(NAMEDATALEN+1); + + foreach(p, schema) { + ColumnDef *entry; + List *arry; + + /* ---------------- + * for each entry in the list, get the name and type + * information from the list and have TupleDescInitEntry + * fill in the attribute information we need. + * ---------------- + */ + attnum++; + + entry = lfirst(p); + attname = entry->colname; + arry = entry->typename->arrayBounds; + attisset = entry->typename->setof; + + if (arry != NIL) { + char buf[20]; + + attdim = length(arry); + + /* array of XXX is _XXX (inherited from release 3) */ + sprintf(buf, "_%.*s", NAMEDATALEN, entry->typename->name); + strcpy(typename, buf); + } else { + strcpy(typename, entry->typename->name); + attdim = 0; + } + + if (! TupleDescInitEntry(desc, attnum, attname, + typename, attdim, attisset)) { + /* ---------------- + * if TupleDescInitEntry() fails, it means there is + * no type in the system catalogs. So now we check if + * the type name equals the relation name. If so we + * have a self reference, otherwise it's an error. + * ---------------- + */ + if (!strcmp(typename, relname)) { + TupleDescMakeSelfReference(desc, attnum, relname); + } else + elog(WARN, "DefineRelation: no such type %.*s", + NAMEDATALEN, typename); + } + + /* + * this is for char() and varchar(). When an entry is of type + * char() or varchar(), typlen is set to the appropriate length, + * which we'll use here instead. (The catalog lookup only returns + * the length of bpchar and varchar which is not what we want!) 
+ * - ay 6/95 + */ + if (entry->typename->typlen > 0) { + desc->attrs[attnum - 1]->attlen = entry->typename->typlen; + } + } + return desc; +} + diff --git a/src/backend/access/funcindex.h b/src/backend/access/funcindex.h new file mode 100644 index 00000000000..4689df19c04 --- /dev/null +++ b/src/backend/access/funcindex.h @@ -0,0 +1,43 @@ +/*------------------------------------------------------------------------- + * + * funcindex.h-- + * + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: funcindex.h,v 1.1.1.1 1996/07/09 06:21:08 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef _FUNC_INDEX_INCLUDED_ +#define _FUNC_INDEX_INCLUDED_ + +#include "postgres.h" + +typedef struct { + int nargs; + Oid arglist[8]; + Oid procOid; + NameData funcName; +} FuncIndexInfo; + +typedef FuncIndexInfo *FuncIndexInfoPtr; + +/* + * some marginally useful macro definitions + */ +/* #define FIgetname(FINFO) (&((FINFO)->funcName.data[0]))*/ +#define FIgetname(FINFO) (FINFO)->funcName.data +#define FIgetnArgs(FINFO) (FINFO)->nargs +#define FIgetProcOid(FINFO) (FINFO)->procOid +#define FIgetArg(FINFO, argnum) (FINFO)->arglist[argnum] +#define FIgetArglist(FINFO) (FINFO)->arglist + +#define FIsetnArgs(FINFO, numargs) ((FINFO)->nargs = numargs) +#define FIsetProcOid(FINFO, id) ((FINFO)->procOid = id) +#define FIsetArg(FINFO, argnum, argtype) ((FINFO)->arglist[argnum] = argtype) + +#define FIisFunctionalIndex(FINFO) (FINFO->procOid != InvalidOid) + +#endif /* FUNCINDEX_H */ diff --git a/src/backend/access/genam.h b/src/backend/access/genam.h new file mode 100644 index 00000000000..b2544650de8 --- /dev/null +++ b/src/backend/access/genam.h @@ -0,0 +1,60 @@ +/*------------------------------------------------------------------------- + * + * genam.h-- + * POSTGRES general access method definitions. 
+ * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: genam.h,v 1.1.1.1 1996/07/09 06:21:08 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef GENAM_H +#define GENAM_H + +#include "postgres.h" + +#include "access/attnum.h" +#include "access/htup.h" +#include "access/istrat.h" +#include "access/itup.h" +#include "access/relscan.h" +#include "access/skey.h" +#include "access/sdir.h" +#include "access/funcindex.h" + +/* ---------------- + * generalized index_ interface routines + * ---------------- + */ +extern Relation index_open(Oid relationId); +extern Relation index_openr(char *relationName); +extern void index_close(Relation relation); +extern InsertIndexResult index_insert(Relation relation, + IndexTuple indexTuple); +extern void index_delete(Relation relation, ItemPointer indexItem); +extern IndexScanDesc index_beginscan(Relation relation, bool scanFromEnd, + uint16 numberOfKeys, ScanKey key); +extern void index_rescan(IndexScanDesc scan, bool scanFromEnd, ScanKey key); +extern void index_endscan(IndexScanDesc scan); +extern void index_markpos(IndexScanDesc scan); +extern void index_restrpos(IndexScanDesc scan); +extern RetrieveIndexResult index_getnext(IndexScanDesc scan, + ScanDirection direction); +extern RegProcedure index_getprocid(Relation irel, AttrNumber attnum, + uint16 procnum); +extern Datum GetIndexValue(HeapTuple tuple, TupleDesc hTupDesc, + int attOff, AttrNumber attrNums[], FuncIndexInfo *fInfo, + bool *attNull, Buffer buffer); + +/* in genam.c */ +extern IndexScanDesc RelationGetIndexScan(Relation relation, bool scanFromEnd, + uint16 numberOfKeys, ScanKey key); +extern void IndexScanRestart(IndexScanDesc scan, bool scanFromEnd, + ScanKey key); +extern void IndexScanEnd(IndexScanDesc scan); +extern void IndexScanMarkPosition(IndexScanDesc scan); +extern void IndexScanRestorePosition(IndexScanDesc scan); + +#endif /* GENAM_H */ diff --git a/src/backend/access/hash.h b/src/backend/access/hash.h new file mode 100644 index 00000000000..21407696b44 --- /dev/null +++ b/src/backend/access/hash.h @@ -0,0 +1,336 @@ +/*------------------------------------------------------------------------- + * + * hash.h-- + * header file for postgres hash access method implementation + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: hash.h,v 1.1.1.1 1996/07/09 06:21:08 scrappy Exp $ + * + * NOTES + * modeled after Margo Seltzer's hash implementation for unix. + * + *------------------------------------------------------------------------- + */ +#ifndef HASH_H +#define HASH_H + +#include "access/itup.h" + +/* + * An overflow page is a spare page allocated for storing data whose + * bucket doesn't have room to store it. We use overflow pages rather + * than just splitting the bucket because there is a linear order in + * the way we split buckets. In other words, if there isn't enough space + * in the bucket itself, put it in an overflow page. + * + * Overflow page addresses are stored in form: (Splitnumber, Page offset). + * + * A splitnumber is the number of the generation where the table doubles + * in size. The ovflpage's offset within the splitnumber; offsets start + * at 1. + * + * We convert the stored bitmap address into a page address with the + * macro OADDR_OF(S, O) where S is the splitnumber and O is the page + * offset. 
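+ *
+ * For example, with the SPLITSHIFT of 11 defined below,
+ * OADDR_OF(3, 5) is (3 << 11) + 5 = 6149; SPLITNUM(6149)
+ * recovers 3 and OPAGENUM(6149) recovers 5.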
+ */
+typedef uint32 Bucket;
+typedef bits16 OverflowPageAddress;
+typedef uint32 SplitNumber;
+typedef uint32 PageOffset;
+
+/* A valid overflow address will always have a page offset >= 1 */
+#define InvalidOvflAddress 0
+
+#define SPLITSHIFT 11
+#define SPLITMASK 0x7FF
+#define SPLITNUM(N) ((SplitNumber)(((uint32)(N)) >> SPLITSHIFT))
+#define OPAGENUM(N) ((PageOffset)((N) & SPLITMASK))
+#define OADDR_OF(S,O) ((OverflowPageAddress)((uint32)((uint32)(S) << SPLITSHIFT) + (O)))
+
+#define BUCKET_TO_BLKNO(B) \
+ ((Bucket) ((B) + ((B) ? metap->SPARES[_hash_log2((B)+1)-1] : 0)) + 1)
+#define OADDR_TO_BLKNO(B) \
+ ((BlockNumber) \
+ (BUCKET_TO_BLKNO ( (1 << SPLITNUM((B))) -1 ) + OPAGENUM((B))));
+
+/*
+ * hasho_flag tells us which type of page we're looking at. For
+ * example, knowing overflow pages from bucket pages is necessary
+ * information when you're deleting tuples from a page. If all the
+ * tuples are deleted from an overflow page, the overflow is made
+ * available to other buckets by calling _hash_freeovflpage(). If all
+ * the tuples are deleted from a bucket page, no additional action is
+ * necessary.
+ */
+
+#define LH_UNUSED_PAGE (0)
+#define LH_OVERFLOW_PAGE (1 << 0)
+#define LH_BUCKET_PAGE (1 << 1)
+#define LH_BITMAP_PAGE (1 << 2)
+#define LH_META_PAGE (1 << 3)
+
+typedef struct HashPageOpaqueData {
+ bits16 hasho_flag; /* is this page a bucket or ovfl */
+ Bucket hasho_bucket; /* bucket number this pg belongs to */
+ OverflowPageAddress hasho_oaddr; /* ovfl address of this ovfl pg */
+ BlockNumber hasho_nextblkno; /* next ovfl blkno */
+ BlockNumber hasho_prevblkno; /* previous ovfl (or bucket) blkno */
+} HashPageOpaqueData;
+
+typedef HashPageOpaqueData *HashPageOpaque;
+
+/*
+ * ScanOpaqueData is used to remember which buffers we're currently
+ * examining in the scan. We keep these buffers locked and pinned and
+ * recorded in the opaque entry of the scan in order to avoid doing a
+ * ReadBuffer() for every tuple in the index. This avoids semop() calls,
+ * which are expensive.
+ */
+
+typedef struct HashScanOpaqueData {
+ Buffer hashso_curbuf;
+ Buffer hashso_mrkbuf;
+} HashScanOpaqueData;
+
+typedef HashScanOpaqueData *HashScanOpaque;
+
+/*
+ * Definitions for metapage.
+ */
+
+#define HASH_METAPAGE 0 /* metapage is always block 0 */
+
+#define HASH_MAGIC 0x6440640
+#define HASH_VERSION 0
+
+/*
+ * NCACHED is used to set the array size of spares[] & bitmaps[].
+ *
+ * Spares[] is used to hold the number of overflow pages currently
+ * allocated at a certain splitpoint. For example, if spares[3] = 7
+ * then there are a maximum of 7 ovflpages available at splitpoint 3.
+ * The value in spares[] will change as ovflpages are added within
+ * a splitpoint.
+ *
+ * Within a splitpoint, one can find which ovflpages are available and
+ * which are used by looking at the bitmaps that are stored on the ovfl
+ * pages themselves. There is at least one bitmap for every splitpoint's
+ * ovflpages. Bitmaps[] contains the ovflpage addresses of the ovflpages
+ * that hold the ovflpage bitmaps.
+ *
+ * The reason that the size is restricted to NCACHED (32) is that the
+ * bitmap addresses are 16 bits: the upper 5 represent the splitpoint
+ * and the lower 11 indicate the page number within the splitpoint.
+ * Since there are only 5 bits to store the splitpoint, there can only
+ * be 32 splitpoints. Both spares[] and bitmaps[] use splitpoints as
+ * their indices, so there can only be 32 of them.
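+ *
+ * For example, if spares[2] = 5 and spares[3] = 7, two of the seven
+ * overflow pages allocated so far belong to splitpoint 3; this is
+ * why _hash_getovfladdr() computes page offsets within a splitpoint
+ * as spares[i] - spares[i-1].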
+ */ + +#define NCACHED 32 + + +typedef struct HashMetaPageData { + PageHeaderData hashm_phdr; /* pad for page header + (do not use) */ + uint32 hashm_magic; /* magic no. for hash tables */ + uint32 hashm_version; /* version ID */ + uint32 hashm_nkeys; /* number of keys stored in + the table */ + uint16 hashm_ffactor; /* fill factor */ + uint16 hashm_bsize; /* bucket size (bytes) - + must be a power of 2 */ + uint16 hashm_bshift; /* bucket shift */ + uint16 hashm_bmsize; /* bitmap array size (bytes) - + must be a power of 2 */ + uint32 hashm_maxbucket; /* ID of maximum bucket + in use */ + uint32 hashm_highmask; /* mask to modulo into + entire table */ + uint32 hashm_lowmask; /* mask to modulo into lower + half of table */ + uint32 hashm_ovflpoint; /* pageno. from which ovflpgs + being allocated */ + uint32 hashm_lastfreed; /* last ovflpage freed */ + uint32 hashm_nmaps; /* Initial number of bitmaps */ + uint32 hashm_spares[NCACHED]; /* spare pages available at + splitpoints */ + BlockNumber hashm_mapp[NCACHED]; /* blknumbers of ovfl page + maps */ + RegProcedure hashm_procid; /* hash procedure id from + pg_proc */ +} HashMetaPageData; + +typedef HashMetaPageData *HashMetaPage; + +/* Short hands for accessing structure */ +#define BSHIFT hashm_bshift +#define OVFL_POINT hashm_ovflpoint +#define LAST_FREED hashm_lastfreed +#define MAX_BUCKET hashm_maxbucket +#define FFACTOR hashm_ffactor +#define HIGH_MASK hashm_highmask +#define LOW_MASK hashm_lowmask +#define NKEYS hashm_nkeys +#define SPARES hashm_spares + +extern bool BuildingHash; + +typedef struct HashItemData { + IndexTupleData hash_itup; +} HashItemData; + +typedef HashItemData *HashItem; + +/* + * Constants + */ +#define DEFAULT_FFACTOR 300 +#define SPLITMAX 8 +#define BYTE_TO_BIT 3 /* 2^3 bits/byte */ +#define INT_TO_BYTE 2 /* 2^2 bytes/int */ +#define INT_TO_BIT 5 /* 2^5 bits/int */ +#define ALL_SET ((uint32) ~0) + +/* + * bitmap pages do not contain tuples. they do contain the standard + * page headers and trailers; however, everything in between is a + * giant bit array. the number of bits that fit on a page obviously + * depends on the page size and the header/trailer overhead. + */ +#define BMPGSZ_BYTE(metap) ((metap)->hashm_bmsize) +#define BMPGSZ_BIT(metap) ((metap)->hashm_bmsize << BYTE_TO_BIT) +#define HashPageGetBitmap(pg) \ + ((uint32 *) (((char *) (pg)) + DOUBLEALIGN(sizeof(PageHeaderData)))) + +/* + * The number of bits in an ovflpage bitmap which + * tells which ovflpages are empty versus in use (NOT the number of + * bits in an overflow page *address* bitmap). + */ +#define BITS_PER_MAP 32 /* Number of bits in ovflpage bitmap */ + +/* Given the address of the beginning of a big map, clear/set the nth bit */ +#define CLRBIT(A, N) ((A)[(N)/BITS_PER_MAP] &= ~(1<<((N)%BITS_PER_MAP))) +#define SETBIT(A, N) ((A)[(N)/BITS_PER_MAP] |= (1<<((N)%BITS_PER_MAP))) +#define ISSET(A, N) ((A)[(N)/BITS_PER_MAP] & (1<<((N)%BITS_PER_MAP))) + +/* + * page locking modes + */ +#define HASH_READ 0 +#define HASH_WRITE 1 + +/* + * In general, the hash code tries to localize its knowledge about page + * layout to a couple of routines. However, we need a special value to + * indicate "no page number" in those places where we expect page numbers. + */ + +#define P_NONE 0 + +/* + * Strategy number. There's only one valid strategy for hashing: equality. 
+ */
+
+#define HTEqualStrategyNumber 1
+#define HTMaxStrategyNumber 1
+
+/*
+ * When a new operator class is declared, we require that the user supply
+ * us with an amproc procedure for hashing a key of the new type.
+ * Since we only have one such proc in amproc, it's number 1.
+ */
+
+#define HASHPROC 1
+
+/* public routines */
+
+extern void hashbuild(Relation heap, Relation index, int natts,
+ AttrNumber *attnum, IndexStrategy istrat, uint16 pcount,
+ Datum *params, FuncIndexInfo *finfo, PredInfo *predInfo);
+extern InsertIndexResult hashinsert(Relation rel, IndexTuple itup);
+extern char *hashgettuple(IndexScanDesc scan, ScanDirection dir);
+extern char *hashbeginscan(Relation rel, bool fromEnd, uint16 keysz,
+ ScanKey scankey);
+extern void hashrescan(IndexScanDesc scan, bool fromEnd, ScanKey scankey);
+extern void hashendscan(IndexScanDesc scan);
+extern void hashmarkpos(IndexScanDesc scan);
+extern void hashrestrpos(IndexScanDesc scan);
+extern void hashdelete(Relation rel, ItemPointer tid);
+
+/* hashfunc.c */
+extern uint32 hashint2(int16 key);
+extern uint32 hashint4(uint32 key);
+extern uint32 hashfloat4(float32 keyp);
+extern uint32 hashfloat8(float64 keyp);
+extern uint32 hashoid(Oid key);
+extern uint32 hashchar(char key);
+extern uint32 hashchar2(uint16 intkey);
+extern uint32 hashchar4(uint32 intkey);
+extern uint32 hashchar8(char *key);
+extern uint32 hashchar16(char *key);
+extern uint32 hashtext(struct varlena *key);
+
+/* private routines */
+
+/* hashinsert.c */
+extern InsertIndexResult _hash_doinsert(Relation rel, HashItem hitem);
+
+
+/* hashovfl.c */
+extern Buffer _hash_addovflpage(Relation rel, Buffer *metabufp, Buffer buf);
+extern Buffer _hash_freeovflpage(Relation rel, Buffer ovflbuf);
+extern int32 _hash_initbitmap(Relation rel, HashMetaPage metap, int32 pnum,
+ int32 nbits, int32 ndx);
+extern void _hash_squeezebucket(Relation rel, HashMetaPage metap,
+ Bucket bucket);
+
+
+/* hashpage.c */
+extern void _hash_metapinit(Relation rel);
+extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno, int access);
+extern void _hash_relbuf(Relation rel, Buffer buf, int access);
+extern void _hash_wrtbuf(Relation rel, Buffer buf);
+extern void _hash_wrtnorelbuf(Relation rel, Buffer buf);
+extern Page _hash_chgbufaccess(Relation rel, Buffer *bufp, int from_access,
+ int to_access);
+extern void _hash_pageinit(Page page, Size size);
+extern void _hash_pagedel(Relation rel, ItemPointer tid);
+extern void _hash_expandtable(Relation rel, Buffer metabuf);
+
+
+/* hashscan.c */
+extern void _hash_regscan(IndexScanDesc scan);
+extern void _hash_dropscan(IndexScanDesc scan);
+extern void _hash_adjscans(Relation rel, ItemPointer tid);
+
+
+/* hashsearch.c */
+extern void _hash_search(Relation rel, int keysz, ScanKey scankey,
+ Buffer *bufP, HashMetaPage metap);
+extern RetrieveIndexResult _hash_next(IndexScanDesc scan, ScanDirection dir);
+extern RetrieveIndexResult _hash_first(IndexScanDesc scan, ScanDirection dir);
+extern bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir,
+ Buffer metabuf);
+
+
+/* hashstrat.c */
+extern StrategyNumber _hash_getstrat(Relation rel, AttrNumber attno,
+ RegProcedure proc);
+extern bool _hash_invokestrat(Relation rel, AttrNumber attno,
+ StrategyNumber strat, Datum left, Datum right);
+
+
+/* hashutil.c */
+extern ScanKey _hash_mkscankey(Relation rel, IndexTuple itup,
+ HashMetaPage metap);
+extern void _hash_freeskey(ScanKey skey);
+extern bool _hash_checkqual(IndexScanDesc scan, IndexTuple itup);
+extern HashItem
_hash_formitem(IndexTuple itup); +extern Bucket _hash_call(Relation rel, HashMetaPage metap, Datum key); +extern uint32 _hash_log2(uint32 num); +extern void _hash_checkpage(Page page, int flags); + +#endif /* HASH_H */ diff --git a/src/backend/access/hash/Makefile.inc b/src/backend/access/hash/Makefile.inc new file mode 100644 index 00000000000..8ea221bc264 --- /dev/null +++ b/src/backend/access/hash/Makefile.inc @@ -0,0 +1,18 @@ +#------------------------------------------------------------------------- +# +# Makefile.inc-- +# Makefile for access/hash (hash access method) +# +# Copyright (c) 1994, Regents of the University of California +# +# +# IDENTIFICATION +# $Header: /cvsroot/pgsql/src/backend/access/hash/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $ +# +#------------------------------------------------------------------------- + +SUBSRCS+= hash.c hashfunc.c hashinsert.c hashovfl.c hashpage.c hashscan.c \ + hashsearch.c hashstrat.c hashutil.c + + + diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c new file mode 100644 index 00000000000..a4a4e16e599 --- /dev/null +++ b/src/backend/access/hash/hash.c @@ -0,0 +1,467 @@ +/*------------------------------------------------------------------------- + * + * hash.c-- + * Implementation of Margo Seltzer's Hashing package for postgres. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $ + * + * NOTES + * This file contains only the public interface routines. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/bufmgr.h" +#include "storage/bufpage.h" +#include "utils/elog.h" +#include "utils/palloc.h" +#include "utils/rel.h" +#include "utils/excid.h" +#include "access/heapam.h" +#include "access/genam.h" +#include "access/sdir.h" +#include "access/hash.h" +#include "access/funcindex.h" +#include "nodes/execnodes.h" +#include "nodes/plannodes.h" +#include "executor/executor.h" +#include "executor/tuptable.h" +#include "catalog/index.h" + + +bool BuildingHash = false; + +/* + * hashbuild() -- build a new hash index. + * + * We use a global variable to record the fact that we're creating + * a new index. This is used to avoid high-concurrency locking, + * since the index won't be visible until this transaction commits + * and since building is guaranteed to be single-threaded. 
+ */ +void +hashbuild(Relation heap, + Relation index, + int natts, + AttrNumber *attnum, + IndexStrategy istrat, + uint16 pcount, + Datum *params, + FuncIndexInfo *finfo, + PredInfo *predInfo) +{ + HeapScanDesc hscan; + Buffer buffer; + HeapTuple htup; + IndexTuple itup; + TupleDesc htupdesc, itupdesc; + Datum *attdata; + bool *nulls; + InsertIndexResult res; + int nhtups, nitups; + int i; + HashItem hitem; + ExprContext *econtext; + TupleTable tupleTable; + TupleTableSlot *slot; + Oid hrelid, irelid; + Node *pred, *oldPred; + + /* note that this is a new btree */ + BuildingHash = true; + + pred = predInfo->pred; + oldPred = predInfo->oldPred; + + /* initialize the hash index metadata page (if this is a new index) */ + if (oldPred == NULL) + _hash_metapinit(index); + + /* get tuple descriptors for heap and index relations */ + htupdesc = RelationGetTupleDescriptor(heap); + itupdesc = RelationGetTupleDescriptor(index); + + /* get space for data items that'll appear in the index tuple */ + attdata = (Datum *) palloc(natts * sizeof(Datum)); + nulls = (bool *) palloc(natts * sizeof(bool)); + + /* + * If this is a predicate (partial) index, we will need to evaluate the + * predicate using ExecQual, which requires the current tuple to be in a + * slot of a TupleTable. In addition, ExecQual must have an ExprContext + * referring to that slot. Here, we initialize dummy TupleTable and + * ExprContext objects for this purpose. --Nels, Feb '92 + */ +#ifndef OMIT_PARTIAL_INDEX + if (pred != NULL || oldPred != NULL) { + tupleTable = ExecCreateTupleTable(1); + slot = ExecAllocTableSlot(tupleTable); + econtext = makeNode(ExprContext); + FillDummyExprContext(econtext, slot, htupdesc, buffer); + } +#endif /* OMIT_PARTIAL_INDEX */ + + /* start a heap scan */ + hscan = heap_beginscan(heap, 0, NowTimeQual, 0, (ScanKey) NULL); + htup = heap_getnext(hscan, 0, &buffer); + + /* build the index */ + nhtups = nitups = 0; + + for (; HeapTupleIsValid(htup); htup = heap_getnext(hscan, 0, &buffer)) { + + nhtups++; + + /* + * If oldPred != NULL, this is an EXTEND INDEX command, so skip + * this tuple if it was already in the existing partial index + */ + if (oldPred != NULL) { + /*SetSlotContents(slot, htup); */ +#ifndef OMIT_PARTIAL_INDEX + slot->val = htup; + if (ExecQual((List*)oldPred, econtext) == true) { + nitups++; + continue; + } +#endif /* OMIT_PARTIAL_INDEX */ + } + + /* Skip this tuple if it doesn't satisfy the partial-index predicate */ + if (pred != NULL) { +#ifndef OMIT_PARTIAL_INDEX + /*SetSlotContents(slot, htup); */ + slot->val = htup; + if (ExecQual((List*)pred, econtext) == false) + continue; +#endif /* OMIT_PARTIAL_INDEX */ +} + + nitups++; + + /* + * For the current heap tuple, extract all the attributes + * we use in this index, and note which are null. + */ + for (i = 1; i <= natts; i++) { + int attoff; + bool attnull; + + /* + * Offsets are from the start of the tuple, and are + * zero-based; indices are one-based. The next call + * returns i - 1. That's data hiding for you. + */ + + /* attoff = i - 1 */ + attoff = AttrNumberGetAttrOffset(i); + + /* below, attdata[attoff] set to equal some datum & + * attnull is changed to indicate whether or not the attribute + * is null for this tuple + */ + attdata[attoff] = GetIndexValue(htup, + htupdesc, + attoff, + attnum, + finfo, + &attnull, + buffer); + nulls[attoff] = (attnull ? 
'n' : ' '); + } + + /* form an index tuple and point it at the heap tuple */ + itup = index_formtuple(itupdesc, attdata, nulls); + + /* + * If the single index key is null, we don't insert it into + * the index. Hash tables support scans on '='. + * Relational algebra says that A = B + * returns null if either A or B is null. This + * means that no qualification used in an index scan could ever + * return true on a null attribute. It also means that indices + * can't be used by ISNULL or NOTNULL scans, but that's an + * artifact of the strategy map architecture chosen in 1986, not + * of the way nulls are handled here. + */ + + if (itup->t_info & INDEX_NULL_MASK) { + pfree(itup); + continue; + } + + itup->t_tid = htup->t_ctid; + hitem = _hash_formitem(itup); + res = _hash_doinsert(index, hitem); + pfree(hitem); + pfree(itup); + pfree(res); + } + + /* okay, all heap tuples are indexed */ + heap_endscan(hscan); + + if (pred != NULL || oldPred != NULL) { +#ifndef OMIT_PARTIAL_INDEX + ExecDestroyTupleTable(tupleTable, true); + pfree(econtext); +#endif /* OMIT_PARTIAL_INDEX */ + } + + /* + * Since we just counted the tuples in the heap, we update its + * stats in pg_class to guarantee that the planner takes advantage + * of the index we just created. Finally, only update statistics + * during normal index definitions, not for indices on system catalogs + * created during bootstrap processing. We must close the relations + * before updatings statistics to guarantee that the relcache entries + * are flushed when we increment the command counter in UpdateStats(). + */ + if (IsNormalProcessingMode()) + { + hrelid = heap->rd_id; + irelid = index->rd_id; + heap_close(heap); + index_close(index); + UpdateStats(hrelid, nhtups, true); + UpdateStats(irelid, nitups, false); + if (oldPred != NULL) { + if (nitups == nhtups) pred = NULL; + UpdateIndexPredicate(irelid, oldPred, pred); + } + } + + /* be tidy */ + pfree(nulls); + pfree(attdata); + + /* all done */ + BuildingHash = false; +} + +/* + * hashinsert() -- insert an index tuple into a hash table. + * + * Hash on the index tuple's key, find the appropriate location + * for the new tuple, put it there, and return an InsertIndexResult + * to the caller. + */ +InsertIndexResult +hashinsert(Relation rel, IndexTuple itup) +{ + HashItem hitem; + InsertIndexResult res; + + if (itup->t_info & INDEX_NULL_MASK) + return ((InsertIndexResult) NULL); + + hitem = _hash_formitem(itup); + + res = _hash_doinsert(rel, hitem); + + pfree(hitem); + + return (res); +} + + +/* + * hashgettuple() -- Get the next tuple in the scan. + */ +char * +hashgettuple(IndexScanDesc scan, ScanDirection dir) +{ + RetrieveIndexResult res; + + /* + * If we've already initialized this scan, we can just advance it + * in the appropriate direction. If we haven't done so yet, we + * call a routine to get the first item in the scan. 
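+ *
+ * A caller drives the scan with a loop along these lines (a
+ * sketch only, not lifted from any actual caller):
+ *
+ *	res = (RetrieveIndexResult) hashgettuple(scan, dir);
+ *	while (res != (RetrieveIndexResult) NULL) {
+ *		... look up the heap tuple that res points at ...
+ *		res = (RetrieveIndexResult) hashgettuple(scan, dir);
+ *	}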
+ */ + + if (ItemPointerIsValid(&(scan->currentItemData))) + res = _hash_next(scan, dir); + else + res = _hash_first(scan, dir); + + return ((char *) res); +} + + +/* + * hashbeginscan() -- start a scan on a hash index + */ +char * +hashbeginscan(Relation rel, + bool fromEnd, + uint16 keysz, + ScanKey scankey) +{ + IndexScanDesc scan; + HashScanOpaque so; + + scan = RelationGetIndexScan(rel, fromEnd, keysz, scankey); + so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData)); + so->hashso_curbuf = so->hashso_mrkbuf = InvalidBuffer; + scan->opaque = so; + scan->flags = 0x0; + + /* register scan in case we change pages it's using */ + _hash_regscan(scan); + + return ((char *) scan); +} + +/* + * hashrescan() -- rescan an index relation + */ +void +hashrescan(IndexScanDesc scan, bool fromEnd, ScanKey scankey) +{ + ItemPointer iptr; + HashScanOpaque so; + + so = (HashScanOpaque) scan->opaque; + + /* we hold a read lock on the current page in the scan */ + if (ItemPointerIsValid(iptr = &(scan->currentItemData))) { + _hash_relbuf(scan->relation, so->hashso_curbuf, HASH_READ); + so->hashso_curbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) { + _hash_relbuf(scan->relation, so->hashso_mrkbuf, HASH_READ); + so->hashso_mrkbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + + /* reset the scan key */ + if (scan->numberOfKeys > 0) { + memmove(scan->keyData, + scankey, + scan->numberOfKeys * sizeof(ScanKeyData)); + } +} + +/* + * hashendscan() -- close down a scan + */ +void +hashendscan(IndexScanDesc scan) +{ + + ItemPointer iptr; + HashScanOpaque so; + + so = (HashScanOpaque) scan->opaque; + + /* release any locks we still hold */ + if (ItemPointerIsValid(iptr = &(scan->currentItemData))) { + _hash_relbuf(scan->relation, so->hashso_curbuf, HASH_READ); + so->hashso_curbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + + if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) { + if (BufferIsValid(so->hashso_mrkbuf)) + _hash_relbuf(scan->relation, so->hashso_mrkbuf, HASH_READ); + so->hashso_mrkbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + + /* don't need scan registered anymore */ + _hash_dropscan(scan); + + /* be tidy */ +#ifdef PERFECT_MMGR + pfree (scan->opaque); +#endif /* PERFECT_MMGR */ +} + +/* + * hashmarkpos() -- save current scan position + * + */ +void +hashmarkpos(IndexScanDesc scan) +{ + ItemPointer iptr; + HashScanOpaque so; + + /* see if we ever call this code. if we do, then so_mrkbuf a + * useful element in the scan->opaque structure. if this procedure + * is never called, so_mrkbuf should be removed from the scan->opaque + * structure. + */ + elog(NOTICE, "Hashmarkpos() called."); + + so = (HashScanOpaque) scan->opaque; + + /* release lock on old marked data, if any */ + if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) { + _hash_relbuf(scan->relation, so->hashso_mrkbuf, HASH_READ); + so->hashso_mrkbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + + /* bump lock on currentItemData and copy to currentMarkData */ + if (ItemPointerIsValid(&(scan->currentItemData))) { + so->hashso_mrkbuf = _hash_getbuf(scan->relation, + BufferGetBlockNumber(so->hashso_curbuf), + HASH_READ); + scan->currentMarkData = scan->currentItemData; + } +} + +/* + * hashrestrpos() -- restore scan to last saved position + */ +void +hashrestrpos(IndexScanDesc scan) +{ + ItemPointer iptr; + HashScanOpaque so; + + /* see if we ever call this code. 
if we do, then so_mrkbuf a + * useful element in the scan->opaque structure. if this procedure + * is never called, so_mrkbuf should be removed from the scan->opaque + * structure. + */ + elog(NOTICE, "Hashrestrpos() called."); + + so = (HashScanOpaque) scan->opaque; + + /* release lock on current data, if any */ + if (ItemPointerIsValid(iptr = &(scan->currentItemData))) { + _hash_relbuf(scan->relation, so->hashso_curbuf, HASH_READ); + so->hashso_curbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + + /* bump lock on currentMarkData and copy to currentItemData */ + if (ItemPointerIsValid(&(scan->currentMarkData))) { + so->hashso_curbuf = + _hash_getbuf(scan->relation, + BufferGetBlockNumber(so->hashso_mrkbuf), + HASH_READ); + + scan->currentItemData = scan->currentMarkData; + } +} + +/* stubs */ +void +hashdelete(Relation rel, ItemPointer tid) +{ + /* adjust any active scans that will be affected by this deletion */ + _hash_adjscans(rel, tid); + + /* delete the data from the page */ + _hash_pagedel(rel, tid); +} + diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c new file mode 100644 index 00000000000..6b37de29911 --- /dev/null +++ b/src/backend/access/hash/hashfunc.c @@ -0,0 +1,276 @@ +/*------------------------------------------------------------------------- + * + * hashfunc.c-- + * Comparison functions for hash access method. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/hash/hashfunc.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $ + * + * NOTES + * These functions are stored in pg_amproc. For each operator class + * defined on hash tables, they compute the hash value of the argument. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" +#include "utils/nabstime.h" + +uint32 hashint2(int16 key) +{ + return ((uint32) ~key); +} + +uint32 hashint4(uint32 key) +{ + return (~key); +} + +/* Hash function from Chris Torek. 
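+ * HASH4a computes h = 31 * h + *kp, since (h << 5) - h is 31 * h;
+ * HASH4b computes h = 33 * h + *kp. The unrolled switch below is a
+ * Duff's-device-style loop over the key, eight bytes per iteration.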
*/ +uint32 hashfloat4(float32 keyp) +{ + int len; + int loop; + uint32 h; + char *kp = (char *) keyp; + + len = sizeof(float32data); + +#define HASH4a h = (h << 5) - h + *kp++; +#define HASH4b h = (h << 5) + h + *kp++; +#define HASH4 HASH4b + + + h = 0; + if (len > 0) { + loop = (len + 8 - 1) >> 3; + + switch (len & (8 - 1)) { + case 0: + do { /* All fall throughs */ + HASH4; + case 7: + HASH4; + case 6: + HASH4; + case 5: + HASH4; + case 4: + HASH4; + case 3: + HASH4; + case 2: + HASH4; + case 1: + HASH4; + } while (--loop); + } + } + return (h); +} + + +uint32 hashfloat8(float64 keyp) +{ + int len; + int loop; + uint32 h; + char *kp = (char *) keyp; + + len = sizeof(float64data); + +#define HASH4a h = (h << 5) - h + *kp++; +#define HASH4b h = (h << 5) + h + *kp++; +#define HASH4 HASH4b + + + h = 0; + if (len > 0) { + loop = (len + 8 - 1) >> 3; + + switch (len & (8 - 1)) { + case 0: + do { /* All fall throughs */ + HASH4; + case 7: + HASH4; + case 6: + HASH4; + case 5: + HASH4; + case 4: + HASH4; + case 3: + HASH4; + case 2: + HASH4; + case 1: + HASH4; + } while (--loop); + } + } + return (h); +} + + +uint32 hashoid(Oid key) +{ + return ((uint32) ~key); +} + + +uint32 hashchar(char key) +{ + int len; + uint32 h; + + len = sizeof(char); + +#define PRIME1 37 +#define PRIME2 1048583 + + h = 0; + /* Convert char to integer */ + h = h * PRIME1 ^ (key - ' '); + h %= PRIME2; + + return (h); +} + +uint32 hashchar2(uint16 intkey) +{ + uint32 h; + int len; + char *key = (char *) &intkey; + + h = 0; + len = sizeof(uint16); + /* Convert string to integer */ + while (len--) + h = h * PRIME1 ^ (*key++ - ' '); + h %= PRIME2; + + return (h); +} + +uint32 hashchar4(uint32 intkey) +{ + uint32 h; + int len; + char *key = (char *) &intkey; + + h = 0; + len = sizeof(uint32); + /* Convert string to integer */ + while (len--) + h = h * PRIME1 ^ (*key++ - ' '); + h %= PRIME2; + + return (h); +} + +uint32 hashchar8(char *key) +{ + uint32 h; + int len; + + h = 0; + len = sizeof(char8); + /* Convert string to integer */ + while (len--) + h = h * PRIME1 ^ (*key++ - ' '); + h %= PRIME2; + + return (h); +} + +uint32 hashname(NameData *n) +{ + uint32 h; + int len; + char *key; + + key = n->data; + + h = 0; + len = NAMEDATALEN; + /* Convert string to integer */ + while (len--) + h = h * PRIME1 ^ (*key++ - ' '); + h %= PRIME2; + + return (h); +} + + +uint32 hashchar16(char *key) +{ + uint32 h; + int len; + + h = 0; + len = sizeof(char16); + /* Convert string to integer */ + while (len--) + h = h * PRIME1 ^ (*key++ - ' '); + h %= PRIME2; + + return (h); +} + + +/* + * (Comment from the original db3 hashing code: ) + * + * "This is INCREDIBLY ugly, but fast. We break the string up into 8 byte + * units. On the first time through the loop we get the 'leftover bytes' + * (strlen % 8). On every other iteration, we perform 8 HASHC's so we handle + * all 8 bytes. Essentially, this saves us 7 cmp & branch instructions. If + * this routine is heavily used enough, it's worth the ugly coding. 
+ * + * "OZ's original sdbm hash" + */ +uint32 hashtext(struct varlena *key) +{ + int keylen; + char *keydata; + uint32 n; + int loop; + + keydata = VARDATA(key); + keylen = VARSIZE(key); + + /* keylen includes the four bytes in which string keylength is stored */ + keylen -= sizeof(VARSIZE(key)); + +#define HASHC n = *keydata++ + 65599 * n + + n = 0; + if (keylen > 0) { + loop = (keylen + 8 - 1) >> 3; + + switch (keylen & (8 - 1)) { + case 0: + do { /* All fall throughs */ + HASHC; + case 7: + HASHC; + case 6: + HASHC; + case 5: + HASHC; + case 4: + HASHC; + case 3: + HASHC; + case 2: + HASHC; + case 1: + HASHC; + } while (--loop); + } + } + return (n); +} diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c new file mode 100644 index 00000000000..c514cc614d8 --- /dev/null +++ b/src/backend/access/hash/hashinsert.c @@ -0,0 +1,239 @@ +/*------------------------------------------------------------------------- + * + * hashinsert.c-- + * Item insertion in hash tables for Postgres. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/hash/hashinsert.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/bufmgr.h" +#include "storage/bufpage.h" + +#include "utils/elog.h" +#include "utils/palloc.h" +#include "utils/rel.h" +#include "utils/excid.h" + +#include "access/heapam.h" +#include "access/genam.h" +#include "access/hash.h" + +static InsertIndexResult _hash_insertonpg(Relation rel, Buffer buf, int keysz, ScanKey scankey, HashItem hitem, Buffer metabuf); +static OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf, int keysz, ScanKey itup_scankey, Size itemsize, HashItem hitem); + +/* + * _hash_doinsert() -- Handle insertion of a single HashItem in the table. + * + * This routine is called by the public interface routines, hashbuild + * and hashinsert. By here, hashitem is filled in, and has a unique + * (xid, seqno) pair. The datum to be used as a "key" is in the + * hashitem. + */ +InsertIndexResult +_hash_doinsert(Relation rel, HashItem hitem) +{ + Buffer buf; + Buffer metabuf; + BlockNumber blkno; + HashMetaPage metap; + IndexTuple itup; + InsertIndexResult res; + ScanKey itup_scankey; + int natts; + Page page; + + metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ); + metap = (HashMetaPage) BufferGetPage(metabuf); + _hash_checkpage((Page) metap, LH_META_PAGE); + + /* we need a scan key to do our search, so build one */ + itup = &(hitem->hash_itup); + if ((natts = rel->rd_rel->relnatts) != 1) + elog(WARN, "Hash indices valid for only one index key."); + itup_scankey = _hash_mkscankey(rel, itup, metap); + + /* + * find the first page in the bucket chain containing this key and + * place it in buf. _hash_search obtains a read lock for us. + */ + _hash_search(rel, natts, itup_scankey, &buf, metap); + page = BufferGetPage(buf); + _hash_checkpage(page, LH_BUCKET_PAGE); + + /* + * trade in our read lock for a write lock so that we can do the + * insertion. + */ + blkno = BufferGetBlockNumber(buf); + _hash_relbuf(rel, buf, HASH_READ); + buf = _hash_getbuf(rel, blkno, HASH_WRITE); + + + /* + * XXX btree comment (haven't decided what to do in hash): don't + * think the bucket can be split while we're reading the metapage. 
+ * + * If the page was split between the time that we surrendered our + * read lock and acquired our write lock, then this page may no + * longer be the right place for the key we want to insert. + */ + + /* do the insertion */ + res = _hash_insertonpg(rel, buf, natts, itup_scankey, + hitem, metabuf); + + /* be tidy */ + _hash_freeskey(itup_scankey); + + return (res); +} + +/* + * _hash_insertonpg() -- Insert a tuple on a particular page in the table. + * + * This recursive procedure does the following things: + * + * + if necessary, splits the target page. + * + inserts the tuple. + * + * On entry, we must have the right buffer on which to do the + * insertion, and the buffer must be pinned and locked. On return, + * we will have dropped both the pin and the write lock on the buffer. + * + */ +static InsertIndexResult +_hash_insertonpg(Relation rel, + Buffer buf, + int keysz, + ScanKey scankey, + HashItem hitem, + Buffer metabuf) +{ + InsertIndexResult res; + Page page; + BlockNumber itup_blkno; + OffsetNumber itup_off; + int itemsz; + HashPageOpaque pageopaque; + bool do_expand = false; + Buffer ovflbuf; + HashMetaPage metap; + Bucket bucket; + + metap = (HashMetaPage) BufferGetPage(metabuf); + _hash_checkpage((Page) metap, LH_META_PAGE); + + page = BufferGetPage(buf); + _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE); + pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); + bucket = pageopaque->hasho_bucket; + + itemsz = IndexTupleDSize(hitem->hash_itup) + + (sizeof(HashItemData) - sizeof(IndexTupleData)); + itemsz = DOUBLEALIGN(itemsz); + + while (PageGetFreeSpace(page) < itemsz) { + /* + * no space on this page; check for an overflow page + */ + if (BlockNumberIsValid(pageopaque->hasho_nextblkno)) { + /* + * ovfl page exists; go get it. if it doesn't have room, + * we'll find out next pass through the loop test above. + */ + ovflbuf = _hash_getbuf(rel, pageopaque->hasho_nextblkno, + HASH_WRITE); + _hash_relbuf(rel, buf, HASH_WRITE); + buf = ovflbuf; + page = BufferGetPage(buf); + } else { + /* + * we're at the end of the bucket chain and we haven't + * found a page with enough room. allocate a new overflow + * page. + */ + do_expand = true; + ovflbuf = _hash_addovflpage(rel, &metabuf, buf); + _hash_relbuf(rel, buf, HASH_WRITE); + buf = ovflbuf; + page = BufferGetPage(buf); + + if (PageGetFreeSpace(page) < itemsz) { + /* it doesn't fit on an empty page -- give up */ + elog(WARN, "hash item too large"); + } + } + _hash_checkpage(page, LH_OVERFLOW_PAGE); + pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); + Assert(pageopaque->hasho_bucket == bucket); + } + + itup_off = _hash_pgaddtup(rel, buf, keysz, scankey, itemsz, hitem); + itup_blkno = BufferGetBlockNumber(buf); + + /* by here, the new tuple is inserted */ + res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData)); + + ItemPointerSet(&(res->pointerData), itup_blkno, itup_off); + + if (res != NULL) { + /* + * Increment the number of keys in the table. + * We switch lock access type just for a moment + * to allow greater accessibility to the metapage. 
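+ *
+ * (_hash_chgbufaccess() trades one lock type for the other and
+ * hands back the page pointer afresh; the paired calls below
+ * bracket the metapage update with a short-lived write lock.)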
+ */ + metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, + HASH_READ, HASH_WRITE); + metap->hashm_nkeys += 1; + metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, + HASH_WRITE, HASH_READ); + + } + + _hash_wrtbuf(rel, buf); + + if (do_expand || + (metap->hashm_nkeys / (metap->hashm_maxbucket + 1)) + > metap->hashm_ffactor) { + _hash_expandtable(rel, metabuf); + } + _hash_relbuf(rel, metabuf, HASH_READ); + return (res); +} + +/* + * _hash_pgaddtup() -- add a tuple to a particular page in the index. + * + * This routine adds the tuple to the page as requested, and keeps the + * write lock and reference associated with the page's buffer. It is + * an error to call pgaddtup() without a write lock and reference. + */ +static OffsetNumber +_hash_pgaddtup(Relation rel, + Buffer buf, + int keysz, + ScanKey itup_scankey, + Size itemsize, + HashItem hitem) +{ + OffsetNumber itup_off; + Page page; + + page = BufferGetPage(buf); + _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE); + + itup_off = OffsetNumberNext(PageGetMaxOffsetNumber(page)); + (void) PageAddItem(page, (Item) hitem, itemsize, itup_off, LP_USED); + + /* write the buffer, but hold our lock */ + _hash_wrtnorelbuf(rel, buf); + + return (itup_off); +} diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c new file mode 100644 index 00000000000..55ee9e9ce79 --- /dev/null +++ b/src/backend/access/hash/hashovfl.c @@ -0,0 +1,614 @@ +/*------------------------------------------------------------------------- + * + * hashovfl.c-- + * Overflow page management code for the Postgres hash access method + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/hash/hashovfl.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $ + * + * NOTES + * Overflow pages look like ordinary relation pages. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/bufmgr.h" +#include "storage/bufpage.h" + +#include "utils/elog.h" +#include "utils/rel.h" +#include "utils/excid.h" + +#include "access/genam.h" +#include "access/hash.h" + +static OverflowPageAddress _hash_getovfladdr(Relation rel, Buffer *metabufp); +static uint32 _hash_firstfreebit(uint32 map); + +/* + * _hash_addovflpage + * + * Add an overflow page to the page currently pointed to by the buffer + * argument 'buf'. + * + * *Metabufp has a read lock upon entering the function; buf has a + * write lock. 
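+ *
+ * The steps, in order: ask _hash_getovfladdr() for a free overflow
+ * address, read and initialize the page that address names, point
+ * the new page's hasho_prevblkno back at 'buf', and finally chain
+ * the new page in as 'buf's hasho_nextblkno.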
+ * + */ +Buffer +_hash_addovflpage(Relation rel, Buffer *metabufp, Buffer buf) +{ + + OverflowPageAddress oaddr; + BlockNumber ovflblkno; + Buffer ovflbuf; + HashMetaPage metap; + HashPageOpaque ovflopaque; + HashPageOpaque pageopaque; + Page page; + Page ovflpage; + + /* this had better be the last page in a bucket chain */ + page = BufferGetPage(buf); + _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE); + pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); + Assert(!BlockNumberIsValid(pageopaque->hasho_nextblkno)); + + metap = (HashMetaPage) BufferGetPage(*metabufp); + _hash_checkpage((Page) metap, LH_META_PAGE); + + /* allocate an empty overflow page */ + oaddr = _hash_getovfladdr(rel, metabufp); + if (oaddr == InvalidOvflAddress) { + elog(WARN, "_hash_addovflpage: problem with _hash_getovfladdr."); + } + ovflblkno = OADDR_TO_BLKNO(OADDR_OF(SPLITNUM(oaddr), OPAGENUM(oaddr))); + Assert(BlockNumberIsValid(ovflblkno)); + ovflbuf = _hash_getbuf(rel, ovflblkno, HASH_WRITE); + Assert(BufferIsValid(ovflbuf)); + ovflpage = BufferGetPage(ovflbuf); + + /* initialize the new overflow page */ + _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf)); + ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); + ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf); + ovflopaque->hasho_nextblkno = InvalidBlockNumber; + ovflopaque->hasho_flag = LH_OVERFLOW_PAGE; + ovflopaque->hasho_oaddr = oaddr; + ovflopaque->hasho_bucket = pageopaque->hasho_bucket; + _hash_wrtnorelbuf(rel, ovflbuf); + + /* logically chain overflow page to previous page */ + pageopaque->hasho_nextblkno = ovflblkno; + _hash_wrtnorelbuf(rel, buf); + return (ovflbuf); +} + +/* + * _hash_getovfladdr() + * + * Find an available overflow page and return its address. + * + * When we enter this function, we have a read lock on *metabufp which + * we change to a write lock immediately. Before exiting, the write lock + * is exchanged for a read lock. 
+ * + */ +static OverflowPageAddress +_hash_getovfladdr(Relation rel, Buffer *metabufp) +{ + HashMetaPage metap; + Buffer mapbuf; + BlockNumber blkno; + PageOffset offset; + OverflowPageAddress oaddr; + SplitNumber splitnum; + uint32 *freep; + uint32 max_free; + uint32 bit; + uint32 first_page; + uint32 free_bit; + uint32 free_page; + uint32 in_use_bits; + uint32 i, j; + + metap = (HashMetaPage) _hash_chgbufaccess(rel, metabufp, HASH_READ, HASH_WRITE); + + splitnum = metap->OVFL_POINT; + max_free = metap->SPARES[splitnum]; + + free_page = (max_free - 1) >> (metap->BSHIFT + BYTE_TO_BIT); + free_bit = (max_free - 1) & (BMPGSZ_BIT(metap) - 1); + + /* Look through all the free maps to find the first free block */ + first_page = metap->LAST_FREED >> (metap->BSHIFT + BYTE_TO_BIT); + for ( i = first_page; i <= free_page; i++ ) { + Page mappage; + + blkno = metap->hashm_mapp[i]; + mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE); + mappage = BufferGetPage(mapbuf); + _hash_checkpage(mappage, LH_BITMAP_PAGE); + freep = HashPageGetBitmap(mappage); + Assert(freep); + + if (i == free_page) + in_use_bits = free_bit; + else + in_use_bits = BMPGSZ_BIT(metap) - 1; + + if (i == first_page) { + bit = metap->LAST_FREED & (BMPGSZ_BIT(metap) - 1); + j = bit / BITS_PER_MAP; + bit = bit & ~(BITS_PER_MAP - 1); + } else { + bit = 0; + j = 0; + } + for (; bit <= in_use_bits; j++, bit += BITS_PER_MAP) + if (freep[j] != ALL_SET) + goto found; + } + + /* No Free Page Found - have to allocate a new page */ + metap->LAST_FREED = metap->SPARES[splitnum]; + metap->SPARES[splitnum]++; + offset = metap->SPARES[splitnum] - + (splitnum ? metap->SPARES[splitnum - 1] : 0); + +#define OVMSG "HASH: Out of overflow pages. Out of luck.\n" + + if (offset > SPLITMASK) { + if (++splitnum >= NCACHED) { + elog(WARN, OVMSG); + } + metap->OVFL_POINT = splitnum; + metap->SPARES[splitnum] = metap->SPARES[splitnum-1]; + metap->SPARES[splitnum-1]--; + offset = 0; + } + + /* Check if we need to allocate a new bitmap page */ + if (free_bit == BMPGSZ_BIT(metap) - 1) { + /* won't be needing old map page */ + + _hash_relbuf(rel, mapbuf, HASH_WRITE); + + free_page++; + if (free_page >= NCACHED) { + elog(WARN, OVMSG); + } + + /* + * This is tricky. The 1 indicates that you want the new page + * allocated with 1 clear bit. Actually, you are going to + * allocate 2 pages from this map. The first is going to be + * the map page, the second is the overflow page we were + * looking for. The init_bitmap routine automatically, sets + * the first bit of itself to indicate that the bitmap itself + * is in use. We would explicitly set the second bit, but + * don't have to if we tell init_bitmap not to leave it clear + * in the first place. + */ + if (_hash_initbitmap(rel, metap, OADDR_OF(splitnum, offset), + 1, free_page)) { + elog(WARN, "overflow_page: problem with _hash_initbitmap."); + } + metap->SPARES[splitnum]++; + offset++; + if (offset > SPLITMASK) { + if (++splitnum >= NCACHED) { + elog(WARN, OVMSG); + } + metap->OVFL_POINT = splitnum; + metap->SPARES[splitnum] = metap->SPARES[splitnum-1]; + metap->SPARES[splitnum-1]--; + offset = 0; + } + } else { + + /* + * Free_bit addresses the last used bit. Bump it to address + * the first available bit. 
+ */ + free_bit++; + SETBIT(freep, free_bit); + _hash_wrtbuf(rel, mapbuf); + } + + /* Calculate address of the new overflow page */ + oaddr = OADDR_OF(splitnum, offset); + _hash_chgbufaccess(rel, metabufp, HASH_WRITE, HASH_READ); + return (oaddr); + + found: + bit = bit + _hash_firstfreebit(freep[j]); + SETBIT(freep, bit); + _hash_wrtbuf(rel, mapbuf); + + /* + * Bits are addressed starting with 0, but overflow pages are addressed + * beginning at 1. Bit is a bit addressnumber, so we need to increment + * it to convert it to a page number. + */ + + bit = 1 + bit + (i * BMPGSZ_BIT(metap)); + if (bit >= metap->LAST_FREED) { + metap->LAST_FREED = bit - 1; + } + + /* Calculate the split number for this page */ + for (i = 0; (i < splitnum) && (bit > metap->SPARES[i]); i++) + ; + offset = (i ? bit - metap->SPARES[i - 1] : bit); + if (offset >= SPLITMASK) { + elog(WARN, OVMSG); + } + + /* initialize this page */ + oaddr = OADDR_OF(i, offset); + _hash_chgbufaccess(rel, metabufp, HASH_WRITE, HASH_READ); + return (oaddr); +} + +/* + * _hash_firstfreebit() + * + * Return the first bit that is not set in the argument 'map'. This + * function is used to find an available overflow page within a + * splitnumber. + * + */ +static uint32 +_hash_firstfreebit(uint32 map) +{ + uint32 i, mask; + + mask = 0x1; + for (i = 0; i < BITS_PER_MAP; i++) { + if (!(mask & map)) + return (i); + mask = mask << 1; + } + return (i); +} + +/* + * _hash_freeovflpage() - + * + * Mark this overflow page as free and return a buffer with + * the page that follows it (which may be defined as + * InvalidBuffer). + * + */ +Buffer +_hash_freeovflpage(Relation rel, Buffer ovflbuf) +{ + HashMetaPage metap; + Buffer metabuf; + Buffer mapbuf; + BlockNumber prevblkno; + BlockNumber blkno; + BlockNumber nextblkno; + HashPageOpaque ovflopaque; + Page ovflpage; + Page mappage; + OverflowPageAddress addr; + SplitNumber splitnum; + uint32 *freep; + uint32 ovflpgno; + int32 bitmappage, bitmapbit; + Bucket bucket; + + metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE); + metap = (HashMetaPage) BufferGetPage(metabuf); + _hash_checkpage((Page) metap, LH_META_PAGE); + + ovflpage = BufferGetPage(ovflbuf); + _hash_checkpage(ovflpage, LH_OVERFLOW_PAGE); + ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); + addr = ovflopaque->hasho_oaddr; + nextblkno = ovflopaque->hasho_nextblkno; + prevblkno = ovflopaque->hasho_prevblkno; + bucket = ovflopaque->hasho_bucket; + (void) memset(ovflpage, 0, BufferGetPageSize(ovflbuf)); + _hash_wrtbuf(rel, ovflbuf); + + /* + * fix up the bucket chain. this is a doubly-linked list, so we + * must fix up the bucket chain members behind and ahead of the + * overflow page being deleted. + * + * XXX this should look like: + * - lock prev/next + * - modify/write prev/next (how to do write ordering with a + * doubly-linked list???) 
+ * - unlock prev/next + */ + if (BlockNumberIsValid(prevblkno)) { + Buffer prevbuf = _hash_getbuf(rel, prevblkno, HASH_WRITE); + Page prevpage = BufferGetPage(prevbuf); + HashPageOpaque prevopaque = + (HashPageOpaque) PageGetSpecialPointer(prevpage); + + _hash_checkpage(prevpage, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE); + Assert(prevopaque->hasho_bucket == bucket); + prevopaque->hasho_nextblkno = nextblkno; + _hash_wrtbuf(rel, prevbuf); + } + if (BlockNumberIsValid(nextblkno)) { + Buffer nextbuf = _hash_getbuf(rel, nextblkno, HASH_WRITE); + Page nextpage = BufferGetPage(nextbuf); + HashPageOpaque nextopaque = + (HashPageOpaque) PageGetSpecialPointer(nextpage); + + _hash_checkpage(nextpage, LH_OVERFLOW_PAGE); + Assert(nextopaque->hasho_bucket == bucket); + nextopaque->hasho_prevblkno = prevblkno; + _hash_wrtbuf(rel, nextbuf); + } + + /* + * Fix up the overflow page bitmap that tracks this particular + * overflow page. The bitmap can be found in the MetaPageData + * array element hashm_mapp[bitmappage]. + */ + splitnum = (addr >> SPLITSHIFT); + ovflpgno = + (splitnum ? metap->SPARES[splitnum - 1] : 0) + (addr & SPLITMASK) - 1; + + if (ovflpgno < metap->LAST_FREED) { + metap->LAST_FREED = ovflpgno; + } + + bitmappage = (ovflpgno >> (metap->BSHIFT + BYTE_TO_BIT)); + bitmapbit = ovflpgno & (BMPGSZ_BIT(metap) - 1); + + blkno = metap->hashm_mapp[bitmappage]; + mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE); + mappage = BufferGetPage(mapbuf); + _hash_checkpage(mappage, LH_BITMAP_PAGE); + freep = HashPageGetBitmap(mappage); + CLRBIT(freep, bitmapbit); + _hash_wrtbuf(rel, mapbuf); + + _hash_relbuf(rel, metabuf, HASH_WRITE); + + /* + * now instantiate the page that replaced this one, + * if it exists, and return that buffer with a write lock. + */ + if (BlockNumberIsValid(nextblkno)) { + return (_hash_getbuf(rel, nextblkno, HASH_WRITE)); + } else { + return (InvalidBuffer); + } +} + + +/* + * _hash_initbitmap() + * + * Initialize a new bitmap page. The metapage has a write-lock upon + * entering the function. + * + * 'pnum' is the OverflowPageAddress of the new bitmap page. + * 'nbits' is how many bits to clear (i.e., make available) in the new + * bitmap page. the remainder of the bits (as well as the first bit, + * representing the bitmap page itself) will be set. + * 'ndx' is the 0-based offset of the new bitmap page within the + * metapage's array of bitmap page OverflowPageAddresses. 
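+ *
+ * Worked example: with nbits = 1 (the only value used by the caller
+ * here), clearints = ((1 - 1) >> INT_TO_BIT) + 1 = 1 and clearbytes
+ * = 4, so one word is zeroed; freep[0] = ALL_SET << 1 then leaves
+ * only bit 0 clear, and SETBIT(freep, 0) marks that bit (the bitmap
+ * page itself) in use. Bit 1, the overflow page allocated alongside
+ * this bitmap, therefore comes out already marked used.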
 */
+
+#define INT_MASK ((1 << INT_TO_BIT) -1)
+
+int32
+_hash_initbitmap(Relation rel,
+ HashMetaPage metap,
+ int32 pnum,
+ int32 nbits,
+ int32 ndx)
+{
+ Buffer buf;
+ BlockNumber blkno;
+ Page pg;
+ HashPageOpaque op;
+ uint32 *freep;
+ int clearbytes, clearints;
+
+ blkno = OADDR_TO_BLKNO(pnum);
+ buf = _hash_getbuf(rel, blkno, HASH_WRITE);
+ pg = BufferGetPage(buf);
+ _hash_pageinit(pg, BufferGetPageSize(buf));
+ op = (HashPageOpaque) PageGetSpecialPointer(pg);
+ op->hasho_oaddr = InvalidOvflAddress;
+ op->hasho_prevblkno = InvalidBlockNumber;
+ op->hasho_nextblkno = InvalidBlockNumber;
+ op->hasho_flag = LH_BITMAP_PAGE;
+ op->hasho_bucket = -1;
+
+ freep = HashPageGetBitmap(pg);
+
+ /* set all of the bits above 'nbits' to 1 */
+ clearints = ((nbits - 1) >> INT_TO_BIT) + 1;
+ clearbytes = clearints << INT_TO_BYTE;
+ (void) memset((char *) freep, 0, clearbytes);
+ (void) memset(((char *) freep) + clearbytes, 0xFF,
+ BMPGSZ_BYTE(metap) - clearbytes);
+ freep[clearints - 1] = ALL_SET << (nbits & INT_MASK);
+
+ /* bit 0 represents the new bitmap page */
+ SETBIT(freep, 0);
+
+ /* metapage already has a write lock */
+ metap->hashm_nmaps++;
+ metap->hashm_mapp[ndx] = blkno;
+
+ /* write out the new bitmap page (releasing its locks) */
+ _hash_wrtbuf(rel, buf);
+
+ return (0);
+}
+
+
+/*
+ * _hash_squeezebucket(rel, bucket)
+ *
+ * Try to squeeze the tuples onto pages occurring earlier in the
+ * bucket chain in an attempt to free overflow pages. When we start
+ * the "squeezing", the page from which we start taking tuples (the
+ * "read" page) is the last bucket in the bucket chain and the page
+ * onto which we start squeezing tuples (the "write" page) is the
+ * first page in the bucket chain. The read page works backward and
+ * the write page works forward; the procedure terminates when the
+ * read page and write page are the same page.
+ */
+void
+_hash_squeezebucket(Relation rel,
+ HashMetaPage metap,
+ Bucket bucket)
+{
+ Buffer wbuf;
+ Buffer rbuf;
+ BlockNumber wblkno;
+ BlockNumber rblkno;
+ Page wpage;
+ Page rpage;
+ HashPageOpaque wopaque;
+ HashPageOpaque ropaque;
+ OffsetNumber woffnum;
+ OffsetNumber roffnum;
+ HashItem hitem;
+ int itemsz;
+
+/* elog(DEBUG, "_hash_squeezebucket: squeezing bucket %d", bucket); */
+
+ /*
+ * start squeezing into the base bucket page.
+ */
+ wblkno = BUCKET_TO_BLKNO(bucket);
+ wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE);
+ wpage = BufferGetPage(wbuf);
+ _hash_checkpage(wpage, LH_BUCKET_PAGE);
+ wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
+
+ /*
+ * if there aren't any overflow pages, there's nothing to squeeze.
+ */
+ if (!BlockNumberIsValid(wopaque->hasho_nextblkno)) {
+ _hash_relbuf(rel, wbuf, HASH_WRITE);
+ return;
+ }
+
+ /*
+ * find the last page in the bucket chain by starting at the base
+ * bucket page and working forward.
+ *
+ * XXX if chains tend to be long, we should probably move forward
+ * using HASH_READ and then _hash_chgbufaccess to HASH_WRITE when
+ * we reach the end. if they are short we probably don't care
+ * very much. if the hash function is working at all, they had
+ * better be short..
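+ *
+ * (note that as written, the loop below takes HASH_WRITE on each
+ * chain page in turn while walking forward, so it already pays the
+ * write-lock cost that the above suggestion would avoid.)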
+ */ + ropaque = wopaque; + do { + rblkno = ropaque->hasho_nextblkno; + if (ropaque != wopaque) { + _hash_relbuf(rel, rbuf, HASH_WRITE); + } + rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE); + rpage = BufferGetPage(rbuf); + _hash_checkpage(rpage, LH_OVERFLOW_PAGE); + Assert(!PageIsEmpty(rpage)); + ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage); + Assert(ropaque->hasho_bucket == bucket); + } while (BlockNumberIsValid(ropaque->hasho_nextblkno)); + + /* + * squeeze the tuples. + */ + roffnum = FirstOffsetNumber; + for(;;) { + hitem = (HashItem) PageGetItem(rpage, PageGetItemId(rpage, roffnum)); + itemsz = IndexTupleDSize(hitem->hash_itup) + + (sizeof(HashItemData) - sizeof(IndexTupleData)); + itemsz = DOUBLEALIGN(itemsz); + + /* + * walk up the bucket chain, looking for a page big enough for + * this item. + */ + while (PageGetFreeSpace(wpage) < itemsz) { + wblkno = wopaque->hasho_nextblkno; + + _hash_wrtbuf(rel, wbuf); + + if (!BlockNumberIsValid(wblkno) || (rblkno == wblkno)) { + _hash_wrtbuf(rel, rbuf); + /* wbuf is already released */ + return; + } + + wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE); + wpage = BufferGetPage(wbuf); + _hash_checkpage(wpage, LH_OVERFLOW_PAGE); + Assert(!PageIsEmpty(wpage)); + wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage); + Assert(wopaque->hasho_bucket == bucket); + } + + /* + * if we're here, we have found room so insert on the "write" + * page. + */ + woffnum = OffsetNumberNext(PageGetMaxOffsetNumber(wpage)); + (void) PageAddItem(wpage, (Item) hitem, itemsz, woffnum, LP_USED); + + /* + * delete the tuple from the "read" page. + * PageIndexTupleDelete repacks the ItemId array, so 'roffnum' + * will be "advanced" to the "next" ItemId. + */ + PageIndexTupleDelete(rpage, roffnum); + _hash_wrtnorelbuf(rel, rbuf); + + /* + * if the "read" page is now empty because of the deletion, + * free it. + */ + if (PageIsEmpty(rpage) && (ropaque->hasho_flag & LH_OVERFLOW_PAGE)) { + rblkno = ropaque->hasho_prevblkno; + Assert(BlockNumberIsValid(rblkno)); + + /* + * free this overflow page. the extra _hash_relbuf is + * because _hash_freeovflpage gratuitously returns the + * next page (we want the previous page and will get it + * ourselves later). + */ + rbuf = _hash_freeovflpage(rel, rbuf); + if (BufferIsValid(rbuf)) { + _hash_relbuf(rel, rbuf, HASH_WRITE); + } + + if (rblkno == wblkno) { + /* rbuf is already released */ + _hash_wrtbuf(rel, wbuf); + return; + } + + rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE); + rpage = BufferGetPage(rbuf); + _hash_checkpage(rpage, LH_OVERFLOW_PAGE); + Assert(!PageIsEmpty(rpage)); + ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage); + Assert(ropaque->hasho_bucket == bucket); + + roffnum = FirstOffsetNumber; + } + } +} diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c new file mode 100644 index 00000000000..2c6ebed8350 --- /dev/null +++ b/src/backend/access/hash/hashpage.c @@ -0,0 +1,669 @@ +/*------------------------------------------------------------------------- + * + * hashpage.c-- + * Hash table page management code for the Postgres hash access method + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $ + * + * NOTES + * Postgres hash pages look like ordinary relation pages. 
The opaque
+ * data at high addresses includes information about the page including
+ * whether a page is an overflow page or a true bucket, the block
+ * numbers of the preceding and following pages, and the overflow
+ * address of the page if it is an overflow page.
+ *
+ * The first page in a hash relation, page zero, is special -- it stores
+ * information describing the hash table; it is referred to as the
+ * "meta page." Pages one and higher store the actual data.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/genam.h"
+#include "access/hash.h"
+
+static void _hash_setpagelock(Relation rel, BlockNumber blkno, int access);
+static void _hash_unsetpagelock(Relation rel, BlockNumber blkno, int access);
+static void _hash_splitpage(Relation rel, Buffer metabuf, Bucket obucket, Bucket nbucket);
+
+/*
+ * We use high-concurrency locking on hash indices. There are two cases in
+ * which we don't do locking. One is when we're building the index.
+ * Since the creating transaction has not committed, no one can see
+ * the index, and there's no reason to share locks. The second case
+ * is when we're just starting up the database system. We use some
+ * special-purpose initialization code in the relation cache manager
+ * (see utils/cache/relcache.c) to allow us to do indexed scans on
+ * the system catalogs before we'd normally be able to. This happens
+ * before the lock table is fully initialized, so we can't use it.
+ * Strictly speaking, this violates 2pl, but we don't do 2pl on the
+ * system catalogs anyway.
+ */
+
+
+#define USELOCKING (!BuildingHash && !IsInitProcessingMode())
+
+
+/*
+ * _hash_metapinit() -- Initialize the metadata page of a hash index,
+ * the two buckets that we begin with and the initial
+ * bitmap page.
+ */
+void
+_hash_metapinit(Relation rel)
+{
+ HashMetaPage metap;
+ HashPageOpaque pageopaque;
+ Buffer metabuf;
+ Buffer buf;
+ Page pg;
+ int nbuckets;
+ uint32 nelem; /* number of elements */
+ uint32 lg2nelem; /* _hash_log2(nelem) */
+ uint32 nblocks;
+ uint16 i;
+
+ /* can't be sharing this with anyone, now... */
+ if (USELOCKING)
+ RelationSetLockForWrite(rel);
+
+ if ((nblocks = RelationGetNumberOfBlocks(rel)) != 0) {
+ elog(WARN, "Cannot initialize non-empty hash table %s",
+ RelationGetRelationName(rel));
+ }
+
+ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
+ pg = BufferGetPage(metabuf);
+ metap = (HashMetaPage) pg;
+ _hash_pageinit(pg, BufferGetPageSize(metabuf));
+
+ metap->hashm_magic = HASH_MAGIC;
+ metap->hashm_version = HASH_VERSION;
+ metap->hashm_nkeys = 0;
+ metap->hashm_nmaps = 0;
+ metap->hashm_ffactor = DEFAULT_FFACTOR;
+ metap->hashm_bsize = BufferGetPageSize(metabuf);
+ metap->hashm_bshift = _hash_log2(metap->hashm_bsize);
+ for (i = metap->hashm_bshift; i > 0; --i) {
+ if ((1 << i) < (metap->hashm_bsize -
+ (DOUBLEALIGN(sizeof(PageHeaderData)) +
+ DOUBLEALIGN(sizeof(HashPageOpaqueData))))) {
+ break;
+ }
+ }
+ Assert(i);
+ metap->hashm_bmsize = 1 << i;
+ metap->hashm_procid = index_getprocid(rel, 1, HASHPROC);
+
+ /*
+ * Make nelem = 2 rather than 0 so that we end up allocating space
+ * for the next greater power of two number of buckets.
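+ *
+ * (concretely: nelem = 2 gives lg2nelem = 1 and nbuckets = 2, so
+ * spares[1] and spares[2] start at 2 and the first bitmap page lands
+ * at splitpoint 1, offset 1 -- block 3, as noted below.)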
 */
+ nelem = 2;
+ lg2nelem = 1; /*_hash_log2(MAX(nelem, 2)) */
+ nbuckets = 2; /*1 << lg2nelem */
+
+ memset((char *) metap->hashm_spares, 0, sizeof(metap->hashm_spares));
+ memset((char *) metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
+
+ metap->hashm_spares[lg2nelem] = 2; /* lg2nelem + 1 */
+ metap->hashm_spares[lg2nelem + 1] = 2; /* lg2nelem + 1 */
+ metap->hashm_ovflpoint = 1; /* lg2nelem */
+ metap->hashm_lastfreed = 2;
+
+ metap->hashm_maxbucket = metap->hashm_lowmask = 1; /* nbuckets - 1 */
+ metap->hashm_highmask = 3; /* (nbuckets << 1) - 1 */
+
+ pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
+ pageopaque->hasho_oaddr = InvalidOvflAddress;
+ pageopaque->hasho_prevblkno = InvalidBlockNumber;
+ pageopaque->hasho_nextblkno = InvalidBlockNumber;
+ pageopaque->hasho_flag = LH_META_PAGE;
+ pageopaque->hasho_bucket = -1;
+
+ /*
+ * First bitmap page is at: splitpoint lg2nelem page offset 1 which
+ * turns out to be page 3. Couldn't initialize page 3 until we created
+ * the first two buckets above.
+ */
+ if (_hash_initbitmap(rel, metap, OADDR_OF(lg2nelem, 1), lg2nelem + 1, 0))
+ elog(WARN, "Problem with _hash_initbitmap.");
+
+ /* all done */
+ _hash_wrtnorelbuf(rel, metabuf);
+
+ /*
+ * initialize the first two buckets
+ */
+ for (i = 0; i <= 1; i++) {
+ buf = _hash_getbuf(rel, BUCKET_TO_BLKNO(i), HASH_WRITE);
+ pg = BufferGetPage(buf);
+ _hash_pageinit(pg, BufferGetPageSize(buf));
+ pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
+ pageopaque->hasho_oaddr = InvalidOvflAddress;
+ pageopaque->hasho_prevblkno = InvalidBlockNumber;
+ pageopaque->hasho_nextblkno = InvalidBlockNumber;
+ pageopaque->hasho_flag = LH_BUCKET_PAGE;
+ pageopaque->hasho_bucket = i;
+ _hash_wrtbuf(rel, buf);
+ }
+
+ _hash_relbuf(rel, metabuf, HASH_WRITE);
+
+ if (USELOCKING)
+ RelationUnsetLockForWrite(rel);
+}
+
+/*
+ * _hash_getbuf() -- Get a buffer by block number for read or write.
+ *
+ * When this routine returns, the appropriate lock is set on the
+ * requested buffer and its reference count is correct.
+ *
+ * XXX P_NEW is not used because, unlike the tree structures, we
+ * need the bucket blocks to be at certain block numbers. we must
+ * depend on the caller to call _hash_pageinit on the block if it
+ * knows that this is a new block.
+ */
+Buffer
+_hash_getbuf(Relation rel, BlockNumber blkno, int access)
+{
+ Buffer buf;
+
+ if (blkno == P_NEW) {
+ elog(WARN, "_hash_getbuf: internal error: hash AM does not use P_NEW");
+ }
+ switch (access) {
+ case HASH_WRITE:
+ case HASH_READ:
+ _hash_setpagelock(rel, blkno, access);
+ break;
+ default:
+ elog(WARN, "_hash_getbuf: invalid access (%d) on new blk: %.*s",
+ access, NAMEDATALEN, RelationGetRelationName(rel));
+ break;
+ }
+ buf = ReadBuffer(rel, blkno);
+
+ /* ref count and lock type are correct */
+ return (buf);
+}
+
+/*
+ * _hash_relbuf() -- release a locked buffer.
+ */
+void
+_hash_relbuf(Relation rel, Buffer buf, int access)
+{
+ BlockNumber blkno;
+
+ blkno = BufferGetBlockNumber(buf);
+
+ switch (access) {
+ case HASH_WRITE:
+ case HASH_READ:
+ _hash_unsetpagelock(rel, blkno, access);
+ break;
+ default:
+ elog(WARN, "_hash_relbuf: invalid access (%d) on blk %x: %.*s",
+ access, blkno, NAMEDATALEN, RelationGetRelationName(rel));
+ }
+
+ ReleaseBuffer(buf);
+}
+
+/*
+ * _hash_wrtbuf() -- write a hash page to disk.
+ *
+ * This routine releases the lock held on the buffer and our reference
+ * to it. It is an error to call _hash_wrtbuf() without a write lock
+ * or a reference to the buffer.
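+ *
+ * (for contrast: _hash_wrtnorelbuf writes the page but keeps both
+ * the lock and the pin, while _hash_relbuf drops the lock and the
+ * pin without writing anything.)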
+ */ +void +_hash_wrtbuf(Relation rel, Buffer buf) +{ + BlockNumber blkno; + + blkno = BufferGetBlockNumber(buf); + WriteBuffer(buf); + _hash_unsetpagelock(rel, blkno, HASH_WRITE); +} + +/* + * _hash_wrtnorelbuf() -- write a hash page to disk, but do not release + * our reference or lock. + * + * It is an error to call _hash_wrtnorelbuf() without a write lock + * or a reference to the buffer. + */ +void +_hash_wrtnorelbuf(Relation rel, Buffer buf) +{ + BlockNumber blkno; + + blkno = BufferGetBlockNumber(buf); + WriteNoReleaseBuffer(buf); +} + +Page +_hash_chgbufaccess(Relation rel, + Buffer *bufp, + int from_access, + int to_access) +{ + BlockNumber blkno; + + blkno = BufferGetBlockNumber(*bufp); + + switch (from_access) { + case HASH_WRITE: + _hash_wrtbuf(rel, *bufp); + break; + case HASH_READ: + _hash_relbuf(rel, *bufp, from_access); + break; + default: + elog(WARN, "_hash_chgbufaccess: invalid access (%d) on blk %x: %.*s", + from_access, blkno, NAMEDATALEN, RelationGetRelationName(rel)); + break; + } + *bufp = _hash_getbuf(rel, blkno, to_access); + return (BufferGetPage(*bufp)); +} + +/* + * _hash_pageinit() -- Initialize a new page. + */ +void +_hash_pageinit(Page page, Size size) +{ + Assert(((PageHeader) page)->pd_lower == 0); + Assert(((PageHeader) page)->pd_upper == 0); + Assert(((PageHeader) page)->pd_special == 0); + + /* + * Cargo-cult programming -- don't really need this to be zero, but + * creating new pages is an infrequent occurrence and it makes me feel + * good when I know they're empty. + */ + memset(page, 0, size); + + PageInit(page, size, sizeof(HashPageOpaqueData)); +} + +static void +_hash_setpagelock(Relation rel, + BlockNumber blkno, + int access) +{ + ItemPointerData iptr; + + if (USELOCKING) { + ItemPointerSet(&iptr, blkno, 1); + + switch (access) { + case HASH_WRITE: + RelationSetSingleWLockPage(rel, &iptr); + break; + case HASH_READ: + RelationSetSingleRLockPage(rel, &iptr); + break; + default: + elog(WARN, "_hash_setpagelock: invalid access (%d) on blk %x: %.*s", + access, blkno, NAMEDATALEN, RelationGetRelationName(rel)); + break; + } + } +} + +static void +_hash_unsetpagelock(Relation rel, + BlockNumber blkno, + int access) +{ + ItemPointerData iptr; + + if (USELOCKING) { + ItemPointerSet(&iptr, blkno, 1); + + switch (access) { + case HASH_WRITE: + RelationUnsetSingleWLockPage(rel, &iptr); + break; + case HASH_READ: + RelationUnsetSingleRLockPage(rel, &iptr); + break; + default: + elog(WARN, "_hash_unsetpagelock: invalid access (%d) on blk %x: %.*s", + access, blkno, NAMEDATALEN, RelationGetRelationName(rel)); + break; + } + } +} + +void +_hash_pagedel(Relation rel, ItemPointer tid) +{ + Buffer buf; + Buffer metabuf; + Page page; + BlockNumber blkno; + OffsetNumber offno; + HashMetaPage metap; + HashPageOpaque opaque; + + blkno = ItemPointerGetBlockNumber(tid); + offno = ItemPointerGetOffsetNumber(tid); + + buf = _hash_getbuf(rel, blkno, HASH_WRITE); + page = BufferGetPage(buf); + _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE); + opaque = (HashPageOpaque) PageGetSpecialPointer(page); + + PageIndexTupleDelete(page, offno); + _hash_wrtnorelbuf(rel, buf); + + if (PageIsEmpty(page) && (opaque->hasho_flag & LH_OVERFLOW_PAGE)) { + buf = _hash_freeovflpage(rel, buf); + if (BufferIsValid(buf)) { + _hash_relbuf(rel, buf, HASH_WRITE); + } + } else { + _hash_relbuf(rel, buf, HASH_WRITE); + } + + metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE); + metap = (HashMetaPage) BufferGetPage(metabuf); + _hash_checkpage((Page) metap, LH_META_PAGE); + 
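+ /*
+ * XXX the increment just below looks inverted: a key was just
+ * deleted, so a decrement of hashm_nkeys appears intended.
+ */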
++metap->hashm_nkeys; + _hash_wrtbuf(rel, metabuf); +} + +void +_hash_expandtable(Relation rel, Buffer metabuf) +{ + HashMetaPage metap; + Bucket old_bucket; + Bucket new_bucket; + uint32 spare_ndx; + +/* elog(DEBUG, "_hash_expandtable: expanding..."); */ + + metap = (HashMetaPage) BufferGetPage(metabuf); + _hash_checkpage((Page) metap, LH_META_PAGE); + + metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_READ, HASH_WRITE); + new_bucket = ++metap->MAX_BUCKET; + metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_WRITE, HASH_READ); + old_bucket = (metap->MAX_BUCKET & metap->LOW_MASK); + + /* + * If the split point is increasing (MAX_BUCKET's log base 2 + * * increases), we need to copy the current contents of the spare + * split bucket to the next bucket. + */ + spare_ndx = _hash_log2(metap->MAX_BUCKET + 1); + if (spare_ndx > metap->OVFL_POINT) { + + metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_READ, HASH_WRITE); + metap->SPARES[spare_ndx] = metap->SPARES[metap->OVFL_POINT]; + metap->OVFL_POINT = spare_ndx; + metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_WRITE, HASH_READ); + } + + if (new_bucket > metap->HIGH_MASK) { + + /* Starting a new doubling */ + metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_READ, HASH_WRITE); + metap->LOW_MASK = metap->HIGH_MASK; + metap->HIGH_MASK = new_bucket | metap->LOW_MASK; + metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_WRITE, HASH_READ); + + } + /* Relocate records to the new bucket */ + _hash_splitpage(rel, metabuf, old_bucket, new_bucket); +} + + +/* + * _hash_splitpage -- split 'obucket' into 'obucket' and 'nbucket' + * + * this routine is actually misnamed -- we are splitting a bucket that + * consists of a base bucket page and zero or more overflow (bucket + * chain) pages. + */ +static void +_hash_splitpage(Relation rel, + Buffer metabuf, + Bucket obucket, + Bucket nbucket) +{ + Bucket bucket; + Buffer obuf; + Buffer nbuf; + Buffer ovflbuf; + BlockNumber oblkno; + BlockNumber nblkno; + bool null; + Datum datum; + HashItem hitem; + HashPageOpaque oopaque; + HashPageOpaque nopaque; + HashMetaPage metap; + IndexTuple itup; + int itemsz; + OffsetNumber ooffnum; + OffsetNumber noffnum; + OffsetNumber omaxoffnum; + Page opage; + Page npage; + TupleDesc itupdesc; + +/* elog(DEBUG, "_hash_splitpage: splitting %d into %d,%d", + obucket, obucket, nbucket); +*/ + metap = (HashMetaPage) BufferGetPage(metabuf); + _hash_checkpage((Page) metap, LH_META_PAGE); + + /* get the buffers & pages */ + oblkno = BUCKET_TO_BLKNO(obucket); + nblkno = BUCKET_TO_BLKNO(nbucket); + obuf = _hash_getbuf(rel, oblkno, HASH_WRITE); + nbuf = _hash_getbuf(rel, nblkno, HASH_WRITE); + opage = BufferGetPage(obuf); + npage = BufferGetPage(nbuf); + + /* initialize the new bucket */ + _hash_pageinit(npage, BufferGetPageSize(nbuf)); + nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); + nopaque->hasho_prevblkno = InvalidBlockNumber; + nopaque->hasho_nextblkno = InvalidBlockNumber; + nopaque->hasho_flag = LH_BUCKET_PAGE; + nopaque->hasho_oaddr = InvalidOvflAddress; + nopaque->hasho_bucket = nbucket; + _hash_wrtnorelbuf(rel, nbuf); + + /* + * make sure the old bucket isn't empty. advance 'opage' and + * friends through the overflow bucket chain until we find a + * non-empty page. + * + * XXX we should only need this once, if we are careful to + * preserve the invariant that overflow pages are never empty. 
+ */ + _hash_checkpage(opage, LH_BUCKET_PAGE); + oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); + if (PageIsEmpty(opage)) { + oblkno = oopaque->hasho_nextblkno; + _hash_relbuf(rel, obuf, HASH_WRITE); + if (!BlockNumberIsValid(oblkno)) { + /* + * the old bucket is completely empty; of course, the new + * bucket will be as well, but since it's a base bucket + * page we don't care. + */ + _hash_relbuf(rel, nbuf, HASH_WRITE); + return; + } + obuf = _hash_getbuf(rel, oblkno, HASH_WRITE); + opage = BufferGetPage(obuf); + _hash_checkpage(opage, LH_OVERFLOW_PAGE); + if (PageIsEmpty(opage)) { + elog(WARN, "_hash_splitpage: empty overflow page %d", oblkno); + } + oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); + } + + /* + * we are now guaranteed that 'opage' is not empty. partition the + * tuples in the old bucket between the old bucket and the new + * bucket, advancing along their respective overflow bucket chains + * and adding overflow pages as needed. + */ + ooffnum = FirstOffsetNumber; + omaxoffnum = PageGetMaxOffsetNumber(opage); + for (;;) { + /* + * at each iteration through this loop, each of these variables + * should be up-to-date: obuf opage oopaque ooffnum omaxoffnum + */ + + /* check if we're at the end of the page */ + if (ooffnum > omaxoffnum) { + /* at end of page, but check for overflow page */ + oblkno = oopaque->hasho_nextblkno; + if (BlockNumberIsValid(oblkno)) { + /* + * we ran out of tuples on this particular page, but + * we have more overflow pages; re-init values. + */ + _hash_wrtbuf(rel, obuf); + obuf = _hash_getbuf(rel, oblkno, HASH_WRITE); + opage = BufferGetPage(obuf); + _hash_checkpage(opage, LH_OVERFLOW_PAGE); + oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); + + /* we're guaranteed that an ovfl page has at least 1 tuple */ + if (PageIsEmpty(opage)) { + elog(WARN, "_hash_splitpage: empty ovfl page %d!", + oblkno); + } + ooffnum = FirstOffsetNumber; + omaxoffnum = PageGetMaxOffsetNumber(opage); + } else { + /* + * we're at the end of the bucket chain, so now we're + * really done with everything. before quitting, call + * _hash_squeezebucket to ensure the tuples in the + * bucket (including the overflow pages) are packed as + * tightly as possible. + */ + _hash_wrtbuf(rel, obuf); + _hash_wrtbuf(rel, nbuf); + _hash_squeezebucket(rel, metap, obucket); + return; + } + } + + /* hash on the tuple */ + hitem = (HashItem) PageGetItem(opage, PageGetItemId(opage, ooffnum)); + itup = &(hitem->hash_itup); + itupdesc = RelationGetTupleDescriptor(rel); + datum = index_getattr(itup, 1, itupdesc, &null); + bucket = _hash_call(rel, metap, datum); + + if (bucket == nbucket) { + /* + * insert the tuple into the new bucket. if it doesn't + * fit on the current page in the new bucket, we must + * allocate a new overflow page and place the tuple on + * that page instead. + */ + itemsz = IndexTupleDSize(hitem->hash_itup) + + (sizeof(HashItemData) - sizeof(IndexTupleData)); + + itemsz = DOUBLEALIGN(itemsz); + + if (PageGetFreeSpace(npage) < itemsz) { + ovflbuf = _hash_addovflpage(rel, &metabuf, nbuf); + _hash_wrtbuf(rel, nbuf); + nbuf = ovflbuf; + npage = BufferGetPage(nbuf); + _hash_checkpage(npage, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE); + } + + noffnum = OffsetNumberNext(PageGetMaxOffsetNumber(npage)); + (void) PageAddItem(npage, (Item) hitem, itemsz, noffnum, LP_USED); + _hash_wrtnorelbuf(rel, nbuf); + + /* + * now delete the tuple from the old bucket. 
after this + * section of code, 'ooffnum' will actually point to the + * ItemId to which we would point if we had advanced it + * before the deletion (PageIndexTupleDelete repacks the + * ItemId array). this also means that 'omaxoffnum' is + * exactly one less than it used to be, so we really can + * just decrement it instead of calling + * PageGetMaxOffsetNumber. + */ + PageIndexTupleDelete(opage, ooffnum); + _hash_wrtnorelbuf(rel, obuf); + omaxoffnum = OffsetNumberPrev(omaxoffnum); + + /* + * tidy up. if the old page was an overflow page and it + * is now empty, we must free it (we want to preserve the + * invariant that overflow pages cannot be empty). + */ + if (PageIsEmpty(opage) && + (oopaque->hasho_flag & LH_OVERFLOW_PAGE)) { + obuf = _hash_freeovflpage(rel, obuf); + + /* check that we're not through the bucket chain */ + if (BufferIsInvalid(obuf)) { + _hash_wrtbuf(rel, nbuf); + _hash_squeezebucket(rel, metap, obucket); + return; + } + + /* + * re-init. again, we're guaranteed that an ovfl page + * has at least one tuple. + */ + opage = BufferGetPage(obuf); + _hash_checkpage(opage, LH_OVERFLOW_PAGE); + oblkno = BufferGetBlockNumber(obuf); + oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); + if (PageIsEmpty(opage)) { + elog(WARN, "_hash_splitpage: empty overflow page %d", + oblkno); + } + ooffnum = FirstOffsetNumber; + omaxoffnum = PageGetMaxOffsetNumber(opage); + } + } else { + /* + * the tuple stays on this page. we didn't move anything, + * so we didn't delete anything and therefore we don't + * have to change 'omaxoffnum'. + * + * XXX any hash value from [0, nbucket-1] will map to this + * bucket, which doesn't make sense to me. + */ + ooffnum = OffsetNumberNext(ooffnum); + } + } + /*NOTREACHED*/ +} diff --git a/src/backend/access/hash/hashscan.c b/src/backend/access/hash/hashscan.c new file mode 100644 index 00000000000..c4cce0e70d9 --- /dev/null +++ b/src/backend/access/hash/hashscan.c @@ -0,0 +1,172 @@ +/*------------------------------------------------------------------------- + * + * hashscan.c-- + * manage scans on hash tables + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/hash/hashscan.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $ + * + * NOTES + * Because we can be doing an index scan on a relation while we + * update it, we need to avoid missing data that moves around in + * the index. The routines and global variables in this file + * guarantee that all scans in the local address space stay + * correctly positioned. This is all we need to worry about, since + * write locking guarantees that no one else will be on the same + * page at the same time as we are. + * + * The scheme is to manage a list of active scans in the current + * backend. Whenever we add or remove records from an index, we + * check the list of active scans to see if any has been affected. + * A scan is affected only if it is on the same relation, and the + * same page, as the update. 
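+ * (and only if its current or marked position sits at or beyond the
+ * updated offset -- see _hash_scantouched below.)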
+ * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/bufmgr.h" +#include "storage/bufpage.h" + +#include "utils/elog.h" +#include "utils/palloc.h" +#include "utils/rel.h" +#include "utils/excid.h" + +#include "access/heapam.h" +#include "access/genam.h" +#include "access/sdir.h" +#include "access/hash.h" + +static void _hash_scandel(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno); +static bool _hash_scantouched(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno); + +typedef struct HashScanListData { + IndexScanDesc hashsl_scan; + struct HashScanListData *hashsl_next; +} HashScanListData; + +typedef HashScanListData *HashScanList; + +static HashScanList HashScans = (HashScanList) NULL; + +/* + * _Hash_regscan() -- register a new scan. + */ +void +_hash_regscan(IndexScanDesc scan) +{ + HashScanList new_el; + + new_el = (HashScanList) palloc(sizeof(HashScanListData)); + new_el->hashsl_scan = scan; + new_el->hashsl_next = HashScans; + HashScans = new_el; +} + +/* + * _hash_dropscan() -- drop a scan from the scan list + */ +void +_hash_dropscan(IndexScanDesc scan) +{ + HashScanList chk, last; + + last = (HashScanList) NULL; + for (chk = HashScans; + chk != (HashScanList) NULL && chk->hashsl_scan != scan; + chk = chk->hashsl_next) { + last = chk; + } + + if (chk == (HashScanList) NULL) + elog(WARN, "hash scan list trashed; can't find 0x%lx", scan); + + if (last == (HashScanList) NULL) + HashScans = chk->hashsl_next; + else + last->hashsl_next = chk->hashsl_next; + +#ifdef PERFECT_MEM + pfree (chk); +#endif /* PERFECT_MEM */ +} + +void +_hash_adjscans(Relation rel, ItemPointer tid) +{ + HashScanList l; + Oid relid; + + relid = rel->rd_id; + for (l = HashScans; l != (HashScanList) NULL; l = l->hashsl_next) { + if (relid == l->hashsl_scan->relation->rd_id) + _hash_scandel(l->hashsl_scan, ItemPointerGetBlockNumber(tid), + ItemPointerGetOffsetNumber(tid)); + } +} + +static void +_hash_scandel(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno) +{ + ItemPointer current; + Buffer buf; + Buffer metabuf; + HashScanOpaque so; + + if (!_hash_scantouched(scan, blkno, offno)) + return; + + metabuf = _hash_getbuf(scan->relation, HASH_METAPAGE, HASH_READ); + + so = (HashScanOpaque) scan->opaque; + buf = so->hashso_curbuf; + + current = &(scan->currentItemData); + if (ItemPointerIsValid(current) + && ItemPointerGetBlockNumber(current) == blkno + && ItemPointerGetOffsetNumber(current) >= offno) { + _hash_step(scan, &buf, BackwardScanDirection, metabuf); + so->hashso_curbuf = buf; + } + + current = &(scan->currentMarkData); + if (ItemPointerIsValid(current) + && ItemPointerGetBlockNumber(current) == blkno + && ItemPointerGetOffsetNumber(current) >= offno) { + ItemPointerData tmp; + tmp = *current; + *current = scan->currentItemData; + scan->currentItemData = tmp; + _hash_step(scan, &buf, BackwardScanDirection, metabuf); + so->hashso_mrkbuf = buf; + tmp = *current; + *current = scan->currentItemData; + scan->currentItemData = tmp; + } +} + +static bool +_hash_scantouched(IndexScanDesc scan, + BlockNumber blkno, + OffsetNumber offno) +{ + ItemPointer current; + + current = &(scan->currentItemData); + if (ItemPointerIsValid(current) + && ItemPointerGetBlockNumber(current) == blkno + && ItemPointerGetOffsetNumber(current) >= offno) + return (true); + + current = &(scan->currentMarkData); + if (ItemPointerIsValid(current) + && ItemPointerGetBlockNumber(current) == blkno + && ItemPointerGetOffsetNumber(current) >= 
offno)
+ return (true);
+
+ return (false);
+}
diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c
new file mode 100644
index 00000000000..056235dec85
--- /dev/null
+++ b/src/backend/access/hash/hashsearch.c
@@ -0,0 +1,425 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashsearch.c--
+ * search code for postgres hash tables
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/hash/hashsearch.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "fmgr.h"
+
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/skey.h"
+#include "access/sdir.h"
+#include "access/hash.h"
+
+/*
+ * _hash_search() -- Finds the page/bucket that contains the
+ * scankey and loads it into *bufP. the buffer has a read lock.
+ */
+void
+_hash_search(Relation rel,
+ int keysz,
+ ScanKey scankey,
+ Buffer *bufP,
+ HashMetaPage metap)
+{
+ BlockNumber blkno;
+ Datum keyDatum;
+ Bucket bucket;
+
+ if (scankey == (ScanKey) NULL ||
+ (keyDatum = scankey[0].sk_argument) == (Datum) NULL) {
+ /*
+ * If the scankey argument is NULL, all tuples will satisfy
+ * the scan so we start the scan at the first bucket (bucket
+ * 0).
+ */
+ bucket = 0;
+ } else {
+ bucket = _hash_call(rel, metap, keyDatum);
+ }
+
+ blkno = BUCKET_TO_BLKNO(bucket);
+
+ *bufP = _hash_getbuf(rel, blkno, HASH_READ);
+}
+
+/*
+ * _hash_next() -- Get the next item in a scan.
+ *
+ * On entry, we have a valid currentItemData in the scan, and a
+ * read lock on the page that contains that item. We do not have
+ * the page pinned. We return the next item in the scan. On
+ * exit, we have the page containing the next item locked but not
+ * pinned.
+ */
+RetrieveIndexResult
+_hash_next(IndexScanDesc scan, ScanDirection dir)
+{
+ Relation rel;
+ Buffer buf;
+ Buffer metabuf;
+ Page page;
+ OffsetNumber offnum;
+ RetrieveIndexResult res;
+ ItemPointer current;
+ ItemPointer iptr;
+ HashItem hitem;
+ IndexTuple itup;
+ HashScanOpaque so;
+
+ rel = scan->relation;
+ so = (HashScanOpaque) scan->opaque;
+ current = &(scan->currentItemData);
+
+ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
+
+ /*
+ * XXX 10 may 91: somewhere there's a bug in our management of the
+ * cached buffer for this scan. wei discovered it. the following
+ * is a workaround so he can work until i figure out what's going on.
+ */
+
+ if (!BufferIsValid(so->hashso_curbuf)) {
+ so->hashso_curbuf = _hash_getbuf(rel,
+ ItemPointerGetBlockNumber(current),
+ HASH_READ);
+ }
+
+ /* we still have the buffer pinned and locked */
+ buf = so->hashso_curbuf;
+
+ /*
+ * step to next valid tuple. note that _hash_step releases our
+ * lock on 'metabuf'; if we switch to a new 'buf' while looking
+ * for the next tuple, we come back with a lock on that buffer.
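+ * (on success, _hash_step also sets scan->currentItemData and caches
+ * the locked buffer in so->hashso_curbuf.)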
 */
+ if (!_hash_step(scan, &buf, dir, metabuf)) {
+ return ((RetrieveIndexResult) NULL);
+ }
+
+ /* if we're here, _hash_step found a valid tuple */
+ current = &(scan->currentItemData);
+ offnum = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(buf);
+ _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+ hitem = (HashItem) PageGetItem(page, PageGetItemId(page, offnum));
+ itup = &hitem->hash_itup;
+ iptr = (ItemPointer) palloc(sizeof(ItemPointerData));
+ memmove((char *) iptr, (char *) &(itup->t_tid), sizeof(ItemPointerData));
+ res = FormRetrieveIndexResult(current, iptr);
+
+ return (res);
+}
+
+static void
+_hash_readnext(Relation rel,
+ Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
+{
+ BlockNumber blkno;
+
+ blkno = (*opaquep)->hasho_nextblkno;
+ _hash_relbuf(rel, *bufp, HASH_READ);
+ *bufp = InvalidBuffer;
+ if (BlockNumberIsValid(blkno)) {
+ *bufp = _hash_getbuf(rel, blkno, HASH_READ);
+ *pagep = BufferGetPage(*bufp);
+ _hash_checkpage(*pagep, LH_OVERFLOW_PAGE);
+ *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
+ Assert(!PageIsEmpty(*pagep));
+ }
+}
+
+static void
+_hash_readprev(Relation rel,
+ Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
+{
+ BlockNumber blkno;
+
+ blkno = (*opaquep)->hasho_prevblkno;
+ _hash_relbuf(rel, *bufp, HASH_READ);
+ *bufp = InvalidBuffer;
+ if (BlockNumberIsValid(blkno)) {
+ *bufp = _hash_getbuf(rel, blkno, HASH_READ);
+ *pagep = BufferGetPage(*bufp);
+ _hash_checkpage(*pagep, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+ *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
+ if (PageIsEmpty(*pagep)) {
+ Assert((*opaquep)->hasho_flag & LH_BUCKET_PAGE);
+ _hash_relbuf(rel, *bufp, HASH_READ);
+ *bufp = InvalidBuffer;
+ }
+ }
+}
+
+/*
+ * _hash_first() -- Find the first item in a scan.
+ *
+ * Return the RetrieveIndexResult of the first item in the tree that
+ * satisfies the qualification associated with the scan descriptor. On
+ * exit, the page containing the current index tuple is read locked
+ * and pinned, and the scan's opaque data entry is updated to
+ * include the buffer.
+ */
+RetrieveIndexResult
+_hash_first(IndexScanDesc scan, ScanDirection dir)
+{
+ Relation rel;
+ Buffer buf;
+ Buffer metabuf;
+ Page page;
+ HashPageOpaque opaque;
+ HashMetaPage metap;
+ HashItem hitem;
+ IndexTuple itup;
+ ItemPointer current;
+ ItemPointer iptr;
+ OffsetNumber offnum;
+ RetrieveIndexResult res;
+ HashScanOpaque so;
+
+ rel = scan->relation;
+ so = (HashScanOpaque) scan->opaque;
+ current = &(scan->currentItemData);
+
+ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
+ metap = (HashMetaPage) BufferGetPage(metabuf);
+ _hash_checkpage((Page) metap, LH_META_PAGE);
+
+ /*
+ * XXX -- The attribute number stored in the scan key is the attno
+ * in the heap relation. We need to transmogrify this into
+ * the index relation attno here. For the moment, we have
+ * hardwired attno == 1.
+ */
+
+ /* find the correct bucket page and load it into buf */
+ _hash_search(rel, 1, scan->keyData, &buf, metap);
+ page = BufferGetPage(buf);
+ _hash_checkpage(page, LH_BUCKET_PAGE);
+ opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+
+ /*
+ * if we are scanning forward, we need to find the first non-empty
+ * page (if any) in the bucket chain. since overflow pages are
+ * never empty, this had better be either the bucket page or the
+ * first overflow page.
+ *
+ * if we are scanning backward, we always go all the way to the
+ * end of the bucket chain.
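+ *
+ * (the backward case can get there by chasing hasho_nextblkno links,
+ * since _hash_step will then walk back toward the bucket page.)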
+ */ + if (PageIsEmpty(page)) { + if (BlockNumberIsValid(opaque->hasho_nextblkno)) { + _hash_readnext(rel, &buf, &page, &opaque); + } else { + ItemPointerSetInvalid(current); + so->hashso_curbuf = InvalidBuffer; + return ((RetrieveIndexResult) NULL); + } + } + if (ScanDirectionIsBackward(dir)) { + while (BlockNumberIsValid(opaque->hasho_nextblkno)) { + _hash_readnext(rel, &buf, &page, &opaque); + } + } + + if (!_hash_step(scan, &buf, dir, metabuf)) { + return ((RetrieveIndexResult) NULL); + } + + /* if we're here, _hash_step found a valid tuple */ + current = &(scan->currentItemData); + offnum = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE); + hitem = (HashItem) PageGetItem(page, PageGetItemId(page, offnum)); + itup = &hitem->hash_itup; + iptr = (ItemPointer) palloc(sizeof(ItemPointerData)); + memmove((char *) iptr, (char *) &(itup->t_tid), sizeof(ItemPointerData)); + res = FormRetrieveIndexResult(current, iptr); + + return (res); +} + +/* + * _hash_step() -- step to the next valid item in a scan in the bucket. + * + * If no valid record exists in the requested direction, return + * false. Else, return true and set the CurrentItemData for the + * scan to the right thing. + * + * 'bufP' points to the buffer which contains the current page + * that we'll step through. + * + * 'metabuf' is released when this returns. + */ +bool +_hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir, Buffer metabuf) +{ + Relation rel; + ItemPointer current; + HashScanOpaque so; + int allbuckets; + HashMetaPage metap; + Buffer buf; + Page page; + HashPageOpaque opaque; + OffsetNumber maxoff; + OffsetNumber offnum; + Bucket bucket; + BlockNumber blkno; + HashItem hitem; + IndexTuple itup; + + rel = scan->relation; + current = &(scan->currentItemData); + so = (HashScanOpaque) scan->opaque; + allbuckets = (scan->numberOfKeys < 1); + + metap = (HashMetaPage) BufferGetPage(metabuf); + _hash_checkpage((Page) metap, LH_META_PAGE); + + buf = *bufP; + page = BufferGetPage(buf); + _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE); + opaque = (HashPageOpaque) PageGetSpecialPointer(page); + + /* + * If _hash_step is called from _hash_first, current will not be + * valid, so we can't dereference it. However, in that case, we + * presumably want to start at the beginning/end of the page... + */ + maxoff = PageGetMaxOffsetNumber(page); + if (ItemPointerIsValid(current)) { + offnum = ItemPointerGetOffsetNumber(current); + } else { + offnum = InvalidOffsetNumber; + } + + /* + * 'offnum' now points to the last tuple we have seen (if any). + * + * continue to step through tuples until: + * 1) we get to the end of the bucket chain or + * 2) we find a valid tuple. + */ + do { + bucket = opaque->hasho_bucket; + + switch (dir) { + case ForwardScanDirection: + if (offnum != InvalidOffsetNumber) { + offnum = OffsetNumberNext(offnum); /* move forward */ + } else { + offnum = FirstOffsetNumber; /* new page */ + } + while (offnum > maxoff) { + /* + * either this page is empty (maxoff == + * InvalidOffsetNumber) or we ran off the end. 
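+ * when the whole chain is exhausted and 'allbuckets' is set, we
+ * move on to the first page of the next bucket's chain instead.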
 */
+ _hash_readnext(rel, &buf, &page, &opaque);
+ if (BufferIsInvalid(buf)) { /* end of chain */
+ if (allbuckets && bucket < metap->hashm_maxbucket) {
+ ++bucket;
+ blkno = BUCKET_TO_BLKNO(bucket);
+ buf = _hash_getbuf(rel, blkno, HASH_READ);
+ page = BufferGetPage(buf);
+ _hash_checkpage(page, LH_BUCKET_PAGE);
+ opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ Assert(opaque->hasho_bucket == bucket);
+ while (PageIsEmpty(page) &&
+ BlockNumberIsValid(opaque->hasho_nextblkno)) {
+ _hash_readnext(rel, &buf, &page, &opaque);
+ }
+ maxoff = PageGetMaxOffsetNumber(page);
+ offnum = FirstOffsetNumber;
+ } else {
+ maxoff = offnum = InvalidOffsetNumber;
+ break; /* while */
+ }
+ } else {
+ /* _hash_readnext never returns an empty page */
+ maxoff = PageGetMaxOffsetNumber(page);
+ offnum = FirstOffsetNumber;
+ }
+ }
+ break;
+ case BackwardScanDirection:
+ if (offnum != InvalidOffsetNumber) {
+ offnum = OffsetNumberPrev(offnum); /* move back */
+ } else {
+ offnum = maxoff; /* new page */
+ }
+ while (offnum < FirstOffsetNumber) {
+ /*
+ * either this page is empty (offnum ==
+ * InvalidOffsetNumber) or we ran off the end.
+ */
+ _hash_readprev(rel, &buf, &page, &opaque);
+ if (BufferIsInvalid(buf)) { /* end of chain */
+ if (allbuckets && bucket > 0) {
+ --bucket;
+ blkno = BUCKET_TO_BLKNO(bucket);
+ buf = _hash_getbuf(rel, blkno, HASH_READ);
+ page = BufferGetPage(buf);
+ _hash_checkpage(page, LH_BUCKET_PAGE);
+ opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ Assert(opaque->hasho_bucket == bucket);
+ while (BlockNumberIsValid(opaque->hasho_nextblkno)) {
+ _hash_readnext(rel, &buf, &page, &opaque);
+ }
+ maxoff = offnum = PageGetMaxOffsetNumber(page);
+ } else {
+ maxoff = offnum = InvalidOffsetNumber;
+ break; /* while */
+ }
+ } else {
+ /* _hash_readprev never returns an empty page */
+ maxoff = offnum = PageGetMaxOffsetNumber(page);
+ }
+ }
+ break;
+ default:
+ /* NoMovementScanDirection */
+ /* this should not be reached */
+ break;
+ }
+
+ /* we ran off the end of the world without finding a match */
+ if (offnum == InvalidOffsetNumber) {
+ _hash_relbuf(rel, metabuf, HASH_READ);
+ *bufP = so->hashso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(current);
+ return(false);
+ }
+
+ /* get ready to check this tuple */
+ hitem = (HashItem) PageGetItem(page, PageGetItemId(page, offnum));
+ itup = &hitem->hash_itup;
+ } while (!_hash_checkqual(scan, itup));
+
+ /* if we made it to here, we've found a valid tuple */
+ _hash_relbuf(rel, metabuf, HASH_READ);
+ blkno = BufferGetBlockNumber(buf);
+ *bufP = so->hashso_curbuf = buf;
+ ItemPointerSet(current, blkno, offnum);
+ return(true);
+}
diff --git a/src/backend/access/hash/hashstrat.c b/src/backend/access/hash/hashstrat.c
new file mode 100644
index 00000000000..cac2a58690e
--- /dev/null
+++ b/src/backend/access/hash/hashstrat.c
@@ -0,0 +1,104 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashstrat.c--
+ * Strategy map entries for the hash indexed access method
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/hash/Attic/hashstrat.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/genam.h"
+#include "access/hash.h"
+
+/*
+ * only one valid strategy for hash tables:
equality.
+ */
+
+static StrategyNumber HTNegate[1] = {
+ InvalidStrategy
+};
+
+static StrategyNumber HTCommute[1] = {
+ HTEqualStrategyNumber
+};
+
+static StrategyNumber HTNegateCommute[1] = {
+ InvalidStrategy
+};
+
+static StrategyEvaluationData HTEvaluationData = {
+ /* XXX static for simplicity */
+
+ HTMaxStrategyNumber,
+ (StrategyTransformMap)HTNegate,
+ (StrategyTransformMap)HTCommute,
+ (StrategyTransformMap)HTNegateCommute,
+ {NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL}
+};
+
+/* ----------------------------------------------------------------
+ * RelationGetHashStrategy
+ * ----------------------------------------------------------------
+ */
+
+StrategyNumber
+_hash_getstrat(Relation rel,
+ AttrNumber attno,
+ RegProcedure proc)
+{
+ StrategyNumber strat;
+
+ strat = RelationGetStrategy(rel, attno, &HTEvaluationData, proc);
+
+ Assert(StrategyNumberIsValid(strat));
+
+ return (strat);
+}
+
+bool
+_hash_invokestrat(Relation rel,
+ AttrNumber attno,
+ StrategyNumber strat,
+ Datum left,
+ Datum right)
+{
+ return (RelationInvokeStrategy(rel, &HTEvaluationData, attno, strat,
+ left, right));
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c
new file mode 100644
index 00000000000..f8f49fe7983
--- /dev/null
+++ b/src/backend/access/hash/hashutil.c
@@ -0,0 +1,147 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashutil.c--
+ * Utility code for the Postgres hash implementation.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/hash/hashutil.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "fmgr.h"
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/iqual.h"
+#include "access/hash.h"
+
+ScanKey
+_hash_mkscankey(Relation rel, IndexTuple itup, HashMetaPage metap)
+{
+ ScanKey skey;
+ TupleDesc itupdesc;
+ int natts;
+ AttrNumber i;
+ Datum arg;
+ RegProcedure proc;
+ bool null;
+
+ natts = rel->rd_rel->relnatts;
+ itupdesc = RelationGetTupleDescriptor(rel);
+
+ skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));
+
+ for (i = 0; i < natts; i++) {
+ arg = index_getattr(itup, i + 1, itupdesc, &null);
+ proc = metap->hashm_procid;
+ ScanKeyEntryInitialize(&skey[i],
+ 0x0, (AttrNumber) (i + 1), proc, arg);
+ }
+
+ return (skey);
+}
+
+void
+_hash_freeskey(ScanKey skey)
+{
+ pfree(skey);
+}
+
+
+bool
+_hash_checkqual(IndexScanDesc scan, IndexTuple itup)
+{
+ if (scan->numberOfKeys > 0)
+ return (index_keytest(itup,
+ RelationGetTupleDescriptor(scan->relation),
+ scan->numberOfKeys, scan->keyData));
+ else
+ return (true);
+}
+
+HashItem
+_hash_formitem(IndexTuple itup)
+{
+ int nbytes_hitem;
+ HashItem hitem;
+ Size tuplen;
+
+ /* disallow nulls in hash keys */
+ if (itup->t_info & INDEX_NULL_MASK)
+ elog(WARN, "hash indices cannot include null keys");
+
+ /* make a copy of the index tuple with room for the sequence number */
+ tuplen = IndexTupleSize(itup);
+ nbytes_hitem = tuplen +
+ (sizeof(HashItemData) - sizeof(IndexTupleData));
+
+ hitem = (HashItem) palloc(nbytes_hitem);
+ memmove((char *) &(hitem->hash_itup), (char *) itup, tuplen);
+
+ return (hitem);
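+
+ /*
+ * XXX worked example for _hash_call() below: with maxbucket = 2,
+ * lowmask = 1 (binary 01) and highmask = 3 (binary 11), a hash
+ * value of 7 gives 7 & 3 = 3; that exceeds maxbucket, so it is
+ * masked down to 7 & 1 = 1, the bucket from which bucket 3 will
+ * eventually be split.
+ */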
+}
+
+Bucket
+_hash_call(Relation rel, HashMetaPage metap, Datum key)
+{
+ uint32 n;
+ Bucket bucket;
+ RegProcedure proc;
+
+ proc = metap->hashm_procid;
+ n = (uint32) fmgr(proc, key);
+ bucket = n & metap->hashm_highmask;
+ if (bucket > metap->hashm_maxbucket)
+ bucket = bucket & metap->hashm_lowmask;
+ return (bucket);
+}
+
+/*
+ * _hash_log2 -- returns ceil(lg2(num))
+ */
+uint32
+_hash_log2(uint32 num)
+{
+ uint32 i, limit;
+
+ limit = 1;
+ for (i = 0; limit < num; limit = limit << 1, i++)
+ ;
+ return (i);
+}
+
+/*
+ * _hash_checkpage -- sanity checks on the format of all hash pages
+ */
+void
+_hash_checkpage(Page page, int flags)
+{
+ PageHeader ph = (PageHeader) page;
+ HashPageOpaque opaque;
+
+ Assert(page);
+ Assert(ph->pd_lower >= (sizeof(PageHeaderData) - sizeof(ItemIdData)));
+#if 1
+ Assert(ph->pd_upper <=
+ (BLCKSZ - DOUBLEALIGN(sizeof(HashPageOpaqueData))));
+ Assert(ph->pd_special ==
+ (BLCKSZ - DOUBLEALIGN(sizeof(HashPageOpaqueData))));
+ Assert(ph->pd_opaque.od_pagesize == BLCKSZ);
+#endif
+ if (flags) {
+ opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ Assert(opaque->hasho_flag & flags);
+ }
+}
diff --git a/src/backend/access/heap/Makefile.inc b/src/backend/access/heap/Makefile.inc
new file mode 100644
index 00000000000..f4f4bbb7031
--- /dev/null
+++ b/src/backend/access/heap/Makefile.inc
@@ -0,0 +1,14 @@
+#-------------------------------------------------------------------------
+#
+# Makefile.inc--
+# Makefile for access/heap
+#
+# Copyright (c) 1994, Regents of the University of California
+#
+#
+# IDENTIFICATION
+# $Header: /cvsroot/pgsql/src/backend/access/heap/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:11 scrappy Exp $
+#
+#-------------------------------------------------------------------------
+
+SUBSRCS+= heapam.c hio.c stats.c
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
new file mode 100644
index 00000000000..4bf31efd832
--- /dev/null
+++ b/src/backend/access/heap/heapam.c
@@ -0,0 +1,1507 @@
+/*-------------------------------------------------------------------------
+ *
+ * heapam.c--
+ * heap access method code
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/heap/heapam.c,v 1.1.1.1 1996/07/09 06:21:11 scrappy Exp $
+ *
+ *
+ * INTERFACE ROUTINES
+ * heapgettup - fetch next heap tuple from a scan
+ * heap_open - open a heap relation by relationId
+ * heap_openr - open a heap relation by name
+ * heap_close - close a heap relation
+ * heap_beginscan - begin relation scan
+ * heap_rescan - restart a relation scan
+ * heap_endscan - end relation scan
+ * heap_getnext - retrieve next tuple in scan
+ * heap_fetch - retrieve tuple with tid
+ * heap_insert - insert tuple into a relation
+ * heap_delete - delete a tuple from a relation
+ * heap_replace - replace a tuple in a relation with another tuple
+ * heap_markpos - mark scan position
+ * heap_restrpos - restore position to marked location
+ *
+ * NOTES
+ * This file contains the heap_ routines which implement
+ * the POSTGRES heap access method used for all POSTGRES
+ * relations.
+ *
+ * OLD COMMENTS
+ * struct relscan hints: (struct should be made AM independent?)
+ *
+ * rs_ctid is the tid of the last tuple returned by getnext.
+ * rs_ptid and rs_ntid are the tids of the previous and next tuples
+ * returned by getnext, respectively. NULL indicates an end of
+ * scan (either direction); NON indicates an unknown value.
+ *
+ * possible combinations:
+ * rs_p rs_c rs_n interpretation
+ * NULL NULL NULL empty scan
+ * NULL NULL NON at beginning of scan
+ * NULL NULL t1 at beginning of scan (with cached tid)
+ * NON NULL NULL at end of scan
+ * t1 NULL NULL at end of scan (with cached tid)
+ * NULL t1 NULL just returned only tuple
+ * NULL t1 NON just returned first tuple
+ * NULL t1 t2 returned first tuple (with cached tid)
+ * NON t1 NULL just returned last tuple
+ * t2 t1 NULL returned last tuple (with cached tid)
+ * t1 t2 NON in the middle of a forward scan
+ * NON t2 t1 in the middle of a reverse scan
+ * ti tj tk in the middle of a scan (w cached tid)
+ *
+ * Here NULL is ...tup == NULL && ...buf == InvalidBuffer,
+ * and NON is ...tup == NULL && ...buf == UnknownBuffer.
+ *
+ * Currently, the NONTID values are not cached with their actual
+ * values by getnext. Values may be cached by markpos since it stores
+ * all three tids.
+ *
+ * NOTE: the calls to elog() must stop. Should decide on an interface
+ * between the general and specific AM calls.
+ *
+ * XXX probably do not need a free tuple routine for heaps.
+ * Huh? Free tuple is not necessary for tuples returned by scans, but
+ * is necessary for tuples which are returned by
+ * RelationGetTupleByItemPointer. -hirohama
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <sys/file.h>
+#include <string.h>
+
+#include "postgres.h"
+
+#include "access/attnum.h"
+#include "access/heapam.h"
+#include "access/hio.h"
+#include "access/htup.h"
+#include "access/relscan.h"
+#include "access/skey.h"
+
+#include "utils/tqual.h"
+#include "access/valid.h"
+#include "access/xact.h"
+
+#include "catalog/catalog.h"
+#include "catalog/catname.h"
+#include "storage/buf.h"
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+#include "storage/itemid.h"
+#include "storage/itemptr.h"
+#include "storage/lmgr.h"
+
+#include "tcop/tcopdebug.h"
+#include "miscadmin.h"
+
+#include "utils/memutils.h"
+#include "utils/palloc.h"
+#include "fmgr.h"
+#include "utils/inval.h"
+#include "utils/elog.h"
+#include "utils/mcxt.h"
+#include "utils/rel.h"
+#include "utils/relcache.h"
+
+static bool ImmediateInvalidation;
+
+/* ----------------------------------------------------------------
+ * heap support routines
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------
+ * initsdesc - sdesc code common to heap_beginscan and heap_rescan
+ * ----------------
+ */
+static void
+initsdesc(HeapScanDesc sdesc,
+ Relation relation,
+ int atend,
+ unsigned nkeys,
+ ScanKey key)
+{
+ if (!RelationGetNumberOfBlocks(relation)) {
+ /* ----------------
+ * relation is empty
+ * ----------------
+ */
+ sdesc->rs_ntup = sdesc->rs_ctup = sdesc->rs_ptup = NULL;
+ sdesc->rs_nbuf = sdesc->rs_cbuf = sdesc->rs_pbuf = InvalidBuffer;
+ } else if (atend) {
+ /* ----------------
+ * reverse scan
+ * ----------------
+ */
+ sdesc->rs_ntup = sdesc->rs_ctup = NULL;
+ sdesc->rs_nbuf = sdesc->rs_cbuf = InvalidBuffer;
+ sdesc->rs_ptup = NULL;
+ sdesc->rs_pbuf = UnknownBuffer;
+ } else {
+ /* ----------------
+ * forward scan
+ * ----------------
+ */
+ sdesc->rs_ctup = sdesc->rs_ptup = NULL;
+ sdesc->rs_cbuf = sdesc->rs_pbuf = InvalidBuffer;
+ sdesc->rs_ntup = NULL;
+ sdesc->rs_nbuf = UnknownBuffer;
+ } /* invalid too */
+
+ /* we don't have a marked position...
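+ * (set by heap_markpos, used by heap_restrpos)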
*/ + ItemPointerSetInvalid(&(sdesc->rs_mptid)); + ItemPointerSetInvalid(&(sdesc->rs_mctid)); + ItemPointerSetInvalid(&(sdesc->rs_mntid)); + ItemPointerSetInvalid(&(sdesc->rs_mcd)); + + /* ---------------- + * copy the scan key, if appropriate + * ---------------- + */ + if (key != NULL) + memmove(sdesc->rs_key, key, nkeys * sizeof(ScanKeyData)); +} + +/* ---------------- + * unpinsdesc - code common to heap_rescan and heap_endscan + * ---------------- + */ +static void +unpinsdesc(HeapScanDesc sdesc) +{ + if (BufferIsValid(sdesc->rs_pbuf)) { + ReleaseBuffer(sdesc->rs_pbuf); + } + + /* ------------------------------------ + * Scan will pin buffer one for each non-NULL tuple pointer + * (ptup, ctup, ntup), so they have to be unpinned multiple + * times. + * ------------------------------------ + */ + if (BufferIsValid(sdesc->rs_cbuf)) { + ReleaseBuffer(sdesc->rs_cbuf); + } + + if (BufferIsValid(sdesc->rs_nbuf)) { + ReleaseBuffer(sdesc->rs_nbuf); + } +} + +/* ------------------------------------------ + * nextpage + * + * figure out the next page to scan after the current page + * taking into account of possible adjustment of degrees of + * parallelism + * ------------------------------------------ + */ +static int +nextpage(int page, int dir) +{ + return((dir<0)?page-1:page+1); +} + +/* ---------------- + * heapgettup - fetch next heap tuple + * + * routine used by heap_getnext() which does most of the + * real work in scanning tuples. + * ---------------- + */ +static HeapTuple +heapgettup(Relation relation, + ItemPointer tid, + int dir, + Buffer *b, + TimeQual timeQual, + int nkeys, + ScanKey key) +{ + ItemId lpp; + Page dp; + int page; + int pages; + int lines; + HeapTuple rtup; + OffsetNumber lineoff; + int linesleft; + + /* ---------------- + * increment access statistics + * ---------------- + */ + IncrHeapAccessStat(local_heapgettup); + IncrHeapAccessStat(global_heapgettup); + + /* ---------------- + * debugging stuff + * + * check validity of arguments, here and for other functions too + * Note: no locking manipulations needed--this is a local function + * ---------------- + */ +#ifdef HEAPDEBUGALL + if (ItemPointerIsValid(tid)) { + elog(DEBUG, "heapgettup(%.16s, tid=0x%x[%d,%d], dir=%d, ...)", + RelationGetRelationName(relation), tid, tid->ip_blkid, + tid->ip_posid, dir); + } else { + elog(DEBUG, "heapgettup(%.16s, tid=0x%x, dir=%d, ...)", + RelationGetRelationName(relation), tid, dir); + } + elog(DEBUG, "heapgettup(..., b=0x%x, timeQ=0x%x, nkeys=%d, key=0x%x", + b, timeQual, nkeys, key); + if (timeQual == SelfTimeQual) { + elog(DEBUG, "heapgettup: relation(%c)=`%.16s', SelfTimeQual", + relation->rd_rel->relkind, &relation->rd_rel->relname); + } else { + elog(DEBUG, "heapgettup: relation(%c)=`%.16s', timeQual=%d", + relation->rd_rel->relkind, &relation->rd_rel->relname, + timeQual); + } +#endif /* !defined(HEAPDEBUGALL) */ + + if (!ItemPointerIsValid(tid)) { + Assert(!PointerIsValid(tid)); + } + + /* ---------------- + * return null immediately if relation is empty + * ---------------- + */ + if (!(pages = relation->rd_nblocks)) + return (NULL); + + /* ---------------- + * calculate next starting lineoff, given scan direction + * ---------------- + */ + if (!dir) { + /* ---------------- + * ``no movement'' scan direction + * ---------------- + */ + /* assume it is a valid TID XXX */ + if (ItemPointerIsValid(tid) == false) { + *b = InvalidBuffer; + return (NULL); + } + *b = RelationGetBufferWithBuffer(relation, + ItemPointerGetBlockNumber(tid), + *b); + +#ifndef NO_BUFFERISVALID + if 
(!BufferIsValid(*b)) { + elog(WARN, "heapgettup: failed ReadBuffer"); + } +#endif + + dp = (Page) BufferGetPage(*b); + lineoff = ItemPointerGetOffsetNumber(tid); + lpp = PageGetItemId(dp, lineoff); + + rtup = (HeapTuple)PageGetItem((Page) dp, lpp); + return (rtup); + + } else if (dir < 0) { + /* ---------------- + * reverse scan direction + * ---------------- + */ + if (ItemPointerIsValid(tid) == false) { + tid = NULL; + } + if (tid == NULL) { + page = pages - 1; /* final page */ + } else { + page = ItemPointerGetBlockNumber(tid); /* current page */ + } + if (page < 0) { + *b = InvalidBuffer; + return (NULL); + } + + *b = RelationGetBufferWithBuffer(relation, page, *b); +#ifndef NO_BUFFERISVALID + if (!BufferIsValid(*b)) { + elog(WARN, "heapgettup: failed ReadBuffer"); + } +#endif + + dp = (Page) BufferGetPage(*b); + lines = PageGetMaxOffsetNumber(dp); + if (tid == NULL) { + lineoff = lines; /* final offnum */ + } else { + lineoff = /* previous offnum */ + OffsetNumberPrev(ItemPointerGetOffsetNumber(tid)); + } + /* page and lineoff now reference the physically previous tid */ + + } else { + /* ---------------- + * forward scan direction + * ---------------- + */ + if (ItemPointerIsValid(tid) == false) { + page = 0; /* first page */ + lineoff = FirstOffsetNumber; /* first offnum */ + } else { + page = ItemPointerGetBlockNumber(tid); /* current page */ + lineoff = /* next offnum */ + OffsetNumberNext(ItemPointerGetOffsetNumber(tid)); + } + + if (page >= pages) { + *b = InvalidBuffer; + return (NULL); + } + /* page and lineoff now reference the physically next tid */ + + *b = RelationGetBufferWithBuffer(relation, page, *b); +#ifndef NO_BUFFERISVALID + if (!BufferIsValid(*b)) { + elog(WARN, "heapgettup: failed ReadBuffer"); + } +#endif + + dp = (Page) BufferGetPage(*b); + lines = PageGetMaxOffsetNumber(dp); + } + + /* 'dir' is now non-zero */ + + /* ---------------- + * calculate line pointer and number of remaining items + * to check on this page. + * ---------------- + */ + lpp = PageGetItemId(dp, lineoff); + if (dir < 0) { + linesleft = lineoff - 1; + } else { + linesleft = lines - lineoff; + } + + /* ---------------- + * advance the scan until we find a qualifying tuple or + * run out of stuff to scan + * ---------------- + */ + for (;;) { + while (linesleft >= 0) { + /* ---------------- + * if current tuple qualifies, return it. + * ---------------- + */ + if ((rtup = heap_tuple_satisfies(lpp, relation, (PageHeader) dp, + timeQual, nkeys, key)) != NULL) { + ItemPointer iptr = &(rtup->t_ctid); + if (ItemPointerGetBlockNumber(iptr) != page) { + /* + * set block id to the correct page number + * --- this is a hack to support the virtual fragment + * concept + */ + ItemPointerSetBlockNumber(iptr, page); + } + return (rtup); + } + + /* ---------------- + * otherwise move to the next item on the page + * ---------------- + */ + --linesleft; + if (dir < 0) { + --lpp; /* move back in this page's ItemId array */ + } else { + ++lpp; /* move forward in this page's ItemId array */ + } + } + + /* ---------------- + * if we get here, it means we've exhausted the items on + * this page and it's time to move to the next.. + * ---------------- + */ + page = nextpage(page, dir); + + /* ---------------- + * return NULL if we've exhausted all the pages.. 
+ * ---------------- + */ + if (page < 0 || page >= pages) { + if (BufferIsValid(*b)) + ReleaseBuffer(*b); + *b = InvalidBuffer; + return (NULL); + } + + *b = ReleaseAndReadBuffer(*b, relation, page); + +#ifndef NO_BUFFERISVALID + if (!BufferIsValid(*b)) { + elog(WARN, "heapgettup: failed ReadBuffer"); + } +#endif + dp = (Page) BufferGetPage(*b); + lines = lineoff = PageGetMaxOffsetNumber((Page) dp); + linesleft = lines - 1; + if (dir < 0) { + lpp = PageGetItemId(dp, lineoff); + } else { + lpp = PageGetItemId(dp, FirstOffsetNumber); + } + } +} + +void +doinsert(Relation relation, HeapTuple tup) +{ + RelationPutHeapTupleAtEnd(relation, tup); + return; +} + +/* + * HeapScanIsValid is now a macro in relscan.h -cim 4/27/91 + */ + +/* ---------------- + * SetHeapAccessMethodImmediateInvalidation + * ---------------- + */ +void +SetHeapAccessMethodImmediateInvalidation(bool on) +{ + ImmediateInvalidation = on; +} + +/* ---------------------------------------------------------------- + * heap access method interface + * ---------------------------------------------------------------- + */ +/* ---------------- + * heap_open - open a heap relation by relationId + * + * presently the relcache routines do all the work we need + * to open/close heap relations. + * ---------------- + */ +Relation +heap_open(Oid relationId) +{ + Relation r; + + /* ---------------- + * increment access statistics + * ---------------- + */ + IncrHeapAccessStat(local_open); + IncrHeapAccessStat(global_open); + + r = (Relation) RelationIdGetRelation(relationId); + + if (RelationIsValid(r) && r->rd_rel->relkind == RELKIND_INDEX) { + elog(WARN, "%s is an index relation", r->rd_rel->relname.data); + } + + return (r); +} + +/* ---------------- + * heap_openr - open a heap relation by name + * + * presently the relcache routines do all the work we need + * to open/close heap relations. + * ---------------- + */ +Relation +heap_openr(char *relationName) +{ + Relation r; + + /* ---------------- + * increment access statistics + * ---------------- + */ + IncrHeapAccessStat(local_openr); + IncrHeapAccessStat(global_openr); + + r = RelationNameGetRelation(relationName); + + if (RelationIsValid(r) && r->rd_rel->relkind == RELKIND_INDEX) { + elog(WARN, "%s is an index relation", r->rd_rel->relname.data); + } + + return (r); +} + +/* ---------------- + * heap_close - close a heap relation + * + * presently the relcache routines do all the work we need + * to open/close heap relations. 
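For illustration, a minimal caller of the two open routines might look like the sketch below; the function name example_open_close and the relation name "myrel" are hypothetical, and both routines already elog when handed an index relation.

#ifdef NOT_USED
/* illustrative sketch only -- not part of the original source */
static void
example_open_close()
{
    Relation r;

    r = heap_openr("myrel");        /* by name; heap_open() takes an Oid */
    if (!RelationIsValid(r))
        elog(WARN, "example_open_close: \"myrel\" not found");

    /* ... scan or modify r here ... */

    heap_close(r);                  /* lets the relcache entry go */
}
#endif /* NOT_USED */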
+ * ---------------- + */ +void +heap_close(Relation relation) +{ + /* ---------------- + * increment access statistics + * ---------------- + */ + IncrHeapAccessStat(local_close); + IncrHeapAccessStat(global_close); + + (void) RelationClose(relation); +} + + +/* ---------------- + * heap_beginscan - begin relation scan + * ---------------- + */ +HeapScanDesc +heap_beginscan(Relation relation, + int atend, + TimeQual timeQual, + unsigned nkeys, + ScanKey key) +{ + HeapScanDesc sdesc; + + /* ---------------- + * increment access statistics + * ---------------- + */ + IncrHeapAccessStat(local_beginscan); + IncrHeapAccessStat(global_beginscan); + + /* ---------------- + * sanity checks + * ---------------- + */ + if (RelationIsValid(relation) == false) + elog(WARN, "heap_beginscan: !RelationIsValid(relation)"); + + /* ---------------- + * set relation level read lock + * ---------------- + */ + RelationSetLockForRead(relation); + + /* XXX someday assert SelfTimeQual if relkind == RELKIND_UNCATALOGED */ + if (relation->rd_rel->relkind == RELKIND_UNCATALOGED) { + timeQual = SelfTimeQual; + } + + /* ---------------- + * increment relation ref count while scanning relation + * ---------------- + */ + RelationIncrementReferenceCount(relation); + + /* ---------------- + * allocate and initialize scan descriptor + * ---------------- + */ + sdesc = (HeapScanDesc) palloc(sizeof(HeapScanDescData)); + + relation->rd_nblocks = smgrnblocks(relation->rd_rel->relsmgr, relation); + sdesc->rs_rd = relation; + + if (nkeys) { + /* + * we do this here instead of in initsdesc() because heap_rescan also + * calls initsdesc() and we don't want to allocate memory again + */ + sdesc->rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys); + } else { + sdesc->rs_key = NULL; + } + + initsdesc(sdesc, relation, atend, nkeys, key); + + sdesc->rs_atend = atend; + sdesc->rs_tr = timeQual; + sdesc->rs_nkeys = (short)nkeys; + + return (sdesc); +} + +/* ---------------- + * heap_rescan - restart a relation scan + * ---------------- + */ +void +heap_rescan(HeapScanDesc sdesc, + bool scanFromEnd, + ScanKey key) +{ + /* ---------------- + * increment access statistics + * ---------------- + */ + IncrHeapAccessStat(local_rescan); + IncrHeapAccessStat(global_rescan); + + /* Note: set relation level read lock is still set */ + + /* ---------------- + * unpin scan buffers + * ---------------- + */ + unpinsdesc(sdesc); + + /* ---------------- + * reinitialize scan descriptor + * ---------------- + */ + initsdesc(sdesc, sdesc->rs_rd, scanFromEnd, sdesc->rs_nkeys, key); + sdesc->rs_atend = (bool) scanFromEnd; +} + +/* ---------------- + * heap_endscan - end relation scan + * + * See how to integrate with index scans. + * Check handling if reldesc caching. 
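A sketch of the whole scan protocol these routines support, assuming a relation r and its tuple descriptor td are already at hand (example_seqscan is a hypothetical name; no scan keys, current-time qualification):

#ifdef NOT_USED
/* illustrative sketch only -- not part of the original source */
static void
example_seqscan(Relation r, TupleDesc td)
{
    HeapScanDesc sdesc;
    HeapTuple tuple;
    Buffer buffer;
    bool isnull;
    char *value;

    /* atend = 0: scan forward from the start of the relation */
    sdesc = heap_beginscan(r, 0, NowTimeQual, 0, (ScanKey) NULL);

    while ((tuple = heap_getnext(sdesc, 0, &buffer)) != NULL) {
        value = heap_getattr(tuple, buffer, 1, td, &isnull);
        /* tuple and value are only good until the next heap_getnext();
         * the scan descriptor owns every buffer pin */
    }

    heap_endscan(sdesc);            /* unpins buffers, drops the refcount */
}
#endif /* NOT_USED */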
+ * ---------------- + */ +void +heap_endscan(HeapScanDesc sdesc) +{ + /* ---------------- + * increment access statistics + * ---------------- + */ + IncrHeapAccessStat(local_endscan); + IncrHeapAccessStat(global_endscan); + + /* Note: no locking manipulations needed */ + + /* ---------------- + * unpin scan buffers + * ---------------- + */ + unpinsdesc(sdesc); + + /* ---------------- + * decrement relation reference count and free scan descriptor storage + * ---------------- + */ + RelationDecrementReferenceCount(sdesc->rs_rd); + + /* ---------------- + * Non 2-phase read locks on catalog relations + * ---------------- + */ + if ( IsSystemRelationName(RelationGetRelationName(sdesc->rs_rd)->data) ) + + RelationUnsetLockForRead(sdesc->rs_rd); + + pfree(sdesc); /* XXX */ +} + +/* ---------------- + * heap_getnext - retrieve next tuple in scan + * + * Fix to work with index relations. + * ---------------- + */ + +#ifdef HEAPDEBUGALL +#define HEAPDEBUG_1 \ +elog(DEBUG, "heap_getnext([%s,nkeys=%d],backw=%d,0x%x) called", \ + sdesc->rs_rd->rd_rel->relname.data, sdesc->rs_nkeys, backw, b) + +#define HEAPDEBUG_2 \ + elog(DEBUG, "heap_getnext called with backw (no tracing yet)") + +#define HEAPDEBUG_3 \ + elog(DEBUG, "heap_getnext returns NULL at end") + +#define HEAPDEBUG_4 \ + elog(DEBUG, "heap_getnext valid buffer UNPIN'd") + +#define HEAPDEBUG_5 \ + elog(DEBUG, "heap_getnext next tuple was cached") + +#define HEAPDEBUG_6 \ + elog(DEBUG, "heap_getnext returning EOS") + +#define HEAPDEBUG_7 \ + elog(DEBUG, "heap_getnext returning tuple"); +#else +#define HEAPDEBUG_1 +#define HEAPDEBUG_2 +#define HEAPDEBUG_3 +#define HEAPDEBUG_4 +#define HEAPDEBUG_5 +#define HEAPDEBUG_6 +#define HEAPDEBUG_7 +#endif /* !defined(HEAPDEBUGALL) */ + + +HeapTuple +heap_getnext(HeapScanDesc scandesc, + int backw, + Buffer *b) +{ + register HeapScanDesc sdesc = scandesc; + Buffer localb; + + /* ---------------- + * increment access statistics + * ---------------- + */ + IncrHeapAccessStat(local_getnext); + IncrHeapAccessStat(global_getnext); + + /* Note: no locking manipulations needed */ + + /* ---------------- + * argument checks + * ---------------- + */ + if (sdesc == NULL) + elog(WARN, "heap_getnext: NULL relscan"); + + /* ---------------- + * initialize return buffer to InvalidBuffer + * ---------------- + */ + if (! PointerIsValid(b)) b = &localb; + (*b) = InvalidBuffer; + + HEAPDEBUG_1; /* heap_getnext( info ) */ + + if (backw) { + /* ---------------- + * handle reverse scan + * ---------------- + */ + HEAPDEBUG_2; /* heap_getnext called with backw */ + + if (sdesc->rs_ptup == sdesc->rs_ctup && + BufferIsInvalid(sdesc->rs_pbuf)) + { + if (BufferIsValid(sdesc->rs_nbuf)) + ReleaseBuffer(sdesc->rs_nbuf); + return (NULL); + } + + /* + * Copy the "current" tuple/buffer + * to "next". Pin/unpin the buffers + * accordingly + */ + if (sdesc->rs_nbuf != sdesc->rs_cbuf) { + if (BufferIsValid(sdesc->rs_nbuf)) + ReleaseBuffer(sdesc->rs_nbuf); + if (BufferIsValid(sdesc->rs_cbuf)) + IncrBufferRefCount(sdesc->rs_cbuf); + } + sdesc->rs_ntup = sdesc->rs_ctup; + sdesc->rs_nbuf = sdesc->rs_cbuf; + + if (sdesc->rs_ptup != NULL) { + if (sdesc->rs_cbuf != sdesc->rs_pbuf) { + if (BufferIsValid(sdesc->rs_cbuf)) + ReleaseBuffer(sdesc->rs_cbuf); + if (BufferIsValid(sdesc->rs_pbuf)) + IncrBufferRefCount(sdesc->rs_pbuf); + } + sdesc->rs_ctup = sdesc->rs_ptup; + sdesc->rs_cbuf = sdesc->rs_pbuf; + } else { /* NONTUP */ + ItemPointer iptr; + + iptr = (sdesc->rs_ctup != NULL) ? 
+ &(sdesc->rs_ctup->t_ctid) : (ItemPointer) NULL; + + /* Don't release sdesc->rs_cbuf at this point, because + heapgettup doesn't increase PrivateRefCount if it + is already set. On a backward scan, both rs_ctup and rs_ntup + usually point to the same buffer page, so + PrivateRefCount[rs_cbuf] should be 2 (or more, if for instance + ctup is stored in a TupleTableSlot). - 01/09/94 */ + + sdesc->rs_ctup = (HeapTuple) + heapgettup(sdesc->rs_rd, + iptr, + -1, + &(sdesc->rs_cbuf), + sdesc->rs_tr, + sdesc->rs_nkeys, + sdesc->rs_key); + } + + if (sdesc->rs_ctup == NULL && !BufferIsValid(sdesc->rs_cbuf)) + { + if (BufferIsValid(sdesc->rs_pbuf)) + ReleaseBuffer(sdesc->rs_pbuf); + sdesc->rs_ptup = NULL; + sdesc->rs_pbuf = InvalidBuffer; + if (BufferIsValid(sdesc->rs_nbuf)) + ReleaseBuffer(sdesc->rs_nbuf); + sdesc->rs_ntup = NULL; + sdesc->rs_nbuf = InvalidBuffer; + return (NULL); + } + + if (BufferIsValid(sdesc->rs_pbuf)) + ReleaseBuffer(sdesc->rs_pbuf); + sdesc->rs_ptup = NULL; + sdesc->rs_pbuf = UnknownBuffer; + + } else { + /* ---------------- + * handle forward scan + * ---------------- + */ + if (sdesc->rs_ctup == sdesc->rs_ntup && + BufferIsInvalid(sdesc->rs_nbuf)) { + if (BufferIsValid(sdesc->rs_pbuf)) + ReleaseBuffer(sdesc->rs_pbuf); + HEAPDEBUG_3; /* heap_getnext returns NULL at end */ + return (NULL); + } + + /* + * Copy the "current" tuple/buffer + * to "previous". Pin/unpin the buffers + * accordingly + */ + if (sdesc->rs_pbuf != sdesc->rs_cbuf) { + if (BufferIsValid(sdesc->rs_pbuf)) + ReleaseBuffer(sdesc->rs_pbuf); + if (BufferIsValid(sdesc->rs_cbuf)) + IncrBufferRefCount(sdesc->rs_cbuf); + } + sdesc->rs_ptup = sdesc->rs_ctup; + sdesc->rs_pbuf = sdesc->rs_cbuf; + + if (sdesc->rs_ntup != NULL) { + if (sdesc->rs_cbuf != sdesc->rs_nbuf) { + if (BufferIsValid(sdesc->rs_cbuf)) + ReleaseBuffer(sdesc->rs_cbuf); + if (BufferIsValid(sdesc->rs_nbuf)) + IncrBufferRefCount(sdesc->rs_nbuf); + } + sdesc->rs_ctup = sdesc->rs_ntup; + sdesc->rs_cbuf = sdesc->rs_nbuf; + HEAPDEBUG_5; /* heap_getnext next tuple was cached */ + } else { /* NONTUP */ + ItemPointer iptr; + + iptr = (sdesc->rs_ctup != NULL) ? + &sdesc->rs_ctup->t_ctid : (ItemPointer) NULL; + + /* Don't release sdesc->rs_cbuf at this point, because + heapgettup doesn't increase PrivateRefCount if it + is already set. On a forward scan, both rs_ctup and rs_ptup + usually point to the same buffer page, so + PrivateRefCount[rs_cbuf] should be 2 (or more, if for instance + ctup is stored in a TupleTableSlot). - 01/09/93 */ + + sdesc->rs_ctup = (HeapTuple) + heapgettup(sdesc->rs_rd, + iptr, + 1, + &sdesc->rs_cbuf, + sdesc->rs_tr, + sdesc->rs_nkeys, + sdesc->rs_key); + } + + if (sdesc->rs_ctup == NULL && !BufferIsValid(sdesc->rs_cbuf)) { + if (BufferIsValid(sdesc->rs_nbuf)) + ReleaseBuffer(sdesc->rs_nbuf); + sdesc->rs_ntup = NULL; + sdesc->rs_nbuf = InvalidBuffer; + if (BufferIsValid(sdesc->rs_pbuf)) + ReleaseBuffer(sdesc->rs_pbuf); + sdesc->rs_ptup = NULL; + sdesc->rs_pbuf = InvalidBuffer; + HEAPDEBUG_6; /* heap_getnext returning EOS */ + return (NULL); + } + + if (BufferIsValid(sdesc->rs_nbuf)) + ReleaseBuffer(sdesc->rs_nbuf); + sdesc->rs_ntup = NULL; + sdesc->rs_nbuf = UnknownBuffer; + } + + /* ---------------- + * if we get here it means we have a new current scan tuple, so + * point to the proper return buffer and return the tuple. 
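The prev/current/next bookkeeping above is invisible to callers; its observable effect is that reversing direction in mid-scan hands back the cached neighboring tuple without re-fetching anything. A sketch (example_turn_around is hypothetical):

#ifdef NOT_USED
/* illustrative sketch only -- not part of the original source */
static void
example_turn_around(HeapScanDesc sdesc)
{
    Buffer buffer;
    HeapTuple t1, t2, t3;

    t1 = heap_getnext(sdesc, 0, &buffer);   /* forward */
    t2 = heap_getnext(sdesc, 0, &buffer);   /* forward */
    t3 = heap_getnext(sdesc, 1, &buffer);   /* backward: t3 == t1, served
                                             * from rs_ptup/rs_pbuf */
}
#endif /* NOT_USED */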
+ * ----------------
+ */
+    (*b) = sdesc->rs_cbuf;
+
+    HEAPDEBUG_7; /* heap_getnext returning tuple */
+
+    return (sdesc->rs_ctup);
+}
+
+/* ----------------
+ *	heap_fetch	- retrieve tuple with tid
+ *
+ *	Currently ignores LP_IVALID during processing!
+ * ----------------
+ */
+HeapTuple
+heap_fetch(Relation relation,
+           TimeQual timeQual,
+           ItemPointer tid,
+           Buffer *b)
+{
+    ItemId lp;
+    Buffer buffer;
+    PageHeader dp;
+    HeapTuple tuple;
+    OffsetNumber offnum;
+
+    /* ----------------
+     * increment access statistics
+     * ----------------
+     */
+    IncrHeapAccessStat(local_fetch);
+    IncrHeapAccessStat(global_fetch);
+
+    /*
+     * Note: This is colossally expensive - does two system calls per
+     * indexscan tuple fetch. Not good, and since we should be doing
+     * page level locking by the scanner anyway, it is commented out.
+     */
+
+    /* RelationSetLockForTupleRead(relation, tid); */
+
+    /* ----------------
+     * get the buffer from the relation descriptor
+     * Note that this does a buffer pin.
+     * ----------------
+     */
+
+    buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
+
+#ifndef NO_BUFFERISVALID
+    if (!BufferIsValid(buffer)) {
+        elog(WARN, "heap_fetch: %s relation: ReadBuffer(%lx) failed",
+             &relation->rd_rel->relname, (long)tid);
+    }
+#endif
+
+    /* ----------------
+     * get the item line pointer corresponding to the requested tid
+     * ----------------
+     */
+    dp = (PageHeader) BufferGetPage(buffer);
+    offnum = ItemPointerGetOffsetNumber(tid);
+    lp = PageGetItemId(dp, offnum);
+
+    /* ----------------
+     * more sanity checks
+     * ----------------
+     */
+
+    Assert(ItemIdIsUsed(lp));
+
+    /* ----------------
+     * check time qualification of tid
+     * ----------------
+     */
+
+    tuple = heap_tuple_satisfies(lp, relation, dp,
+                                 timeQual, 0, (ScanKey) NULL);
+
+    if (tuple == NULL)
+    {
+        ReleaseBuffer(buffer);
+        return (NULL);
+    }
+
+    /* ----------------
+     * all checks passed, now either return a copy of the tuple
+     * or pin the buffer page and return a pointer, depending on
+     * whether caller gave us a valid b.
+     * ----------------
+     */
+
+    if (PointerIsValid(b)) {
+        *b = buffer;
+    } else {
+        tuple = heap_copytuple(tuple);
+        ReleaseBuffer(buffer);
+    }
+    return (tuple);
+}
+
+/* ----------------
+ *	heap_insert	- insert tuple
+ *
+ *	The assignment of t_min (and thus the others) should be
+ *	removed eventually.
+ *
+ *	Currently places the tuple onto the last page. If there is no room,
+ *	it is placed on new pages. (Heap relations)
+ *	Note that concurrent inserts during a scan will probably have
+ *	unexpected results, though this will be fixed eventually.
+ *
+ *	Fix to work with indexes.
+ * ----------------
+ */
+Oid
+heap_insert(Relation relation, HeapTuple tup)
+{
+    /* ----------------
+     * increment access statistics
+     * ----------------
+     */
+    IncrHeapAccessStat(local_insert);
+    IncrHeapAccessStat(global_insert);
+
+    /* ----------------
+     * set relation level write lock. If this is a "local" relation (not
+     * visible to others), we don't need to set a write lock.
+     * ----------------
+     */
+    if (!relation->rd_islocal)
+        RelationSetLockForWrite(relation);
+
+    /* ----------------
+     * If the object id of this tuple has already been assigned, trust
+     * the caller. There are a couple of ways this can happen. At initial
+     * db creation, the backend program sets oids for tuples. When we
+     * define an index, we set the oid. Finally, in the future, we may
+     * allow users to set their own object ids in order to support a
+     * persistent object store (objects need to contain pointers to one
+     * another).
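heap_fetch's two buffer conventions, restated as a sketch (example_fetch is hypothetical and assumes tid names a currently visible tuple):

#ifdef NOT_USED
/* illustrative sketch only -- not part of the original source */
static void
example_fetch(Relation r, ItemPointer tid)
{
    HeapTuple tuple;
    Buffer buffer;

    /* valid b: the tuple points into a pinned page; the caller unpins */
    tuple = heap_fetch(r, NowTimeQual, tid, &buffer);
    if (tuple != NULL)
        ReleaseBuffer(buffer);

    /* NULL b: a palloc'd copy comes back; the caller frees it */
    tuple = heap_fetch(r, NowTimeQual, tid, (Buffer *) NULL);
    if (tuple != NULL)
        pfree(tuple);
}
#endif /* NOT_USED */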
+ * ----------------
+ */
+    if (!OidIsValid(tup->t_oid)) {
+        tup->t_oid = newoid();
+        LastOidProcessed = tup->t_oid;
+    }
+
+    TransactionIdStore(GetCurrentTransactionId(), &(tup->t_xmin));
+    tup->t_cmin = GetCurrentCommandId();
+    StoreInvalidTransactionId(&(tup->t_xmax));
+    tup->t_tmin = INVALID_ABSTIME;
+    tup->t_tmax = CURRENT_ABSTIME;
+
+    doinsert(relation, tup);
+
+    if ( IsSystemRelationName(RelationGetRelationName(relation)->data)) {
+        RelationUnsetLockForWrite(relation);
+
+        /* ----------------
+         * invalidate caches (only works for system relations)
+         * ----------------
+         */
+        SetRefreshWhenInvalidate(ImmediateInvalidation);
+        RelationInvalidateHeapTuple(relation, tup);
+        SetRefreshWhenInvalidate((bool)!ImmediateInvalidation);
+    }
+
+    return(tup->t_oid);
+}
+
+/* ----------------
+ *	heap_delete	- delete a tuple
+ *
+ *	Must decide how to handle errors.
+ * ----------------
+ */
+void
+heap_delete(Relation relation, ItemPointer tid)
+{
+    ItemId lp;
+    HeapTuple tp;
+    PageHeader dp;
+    Buffer b;
+
+    /* ----------------
+     * increment access statistics
+     * ----------------
+     */
+    IncrHeapAccessStat(local_delete);
+    IncrHeapAccessStat(global_delete);
+
+    /* ----------------
+     * sanity check
+     * ----------------
+     */
+    Assert(ItemPointerIsValid(tid));
+
+    /* ----------------
+     * set relation level write lock
+     * ----------------
+     */
+    RelationSetLockForWrite(relation);
+
+    b = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
+
+#ifndef NO_BUFFERISVALID
+    if (!BufferIsValid(b)) { /* XXX L_SH better ??? */
+        elog(WARN, "heap_delete: failed ReadBuffer");
+    }
+#endif /* NO_BUFFERISVALID */
+
+    dp = (PageHeader) BufferGetPage(b);
+    lp = PageGetItemId(dp, ItemPointerGetOffsetNumber(tid));
+
+    /* ----------------
+     * check that we're deleting a valid item
+     * ----------------
+     */
+    if (!(tp = heap_tuple_satisfies(lp, relation, dp,
+                                    NowTimeQual, 0, (ScanKey) NULL))) {
+
+        /* XXX call something else */
+        ReleaseBuffer(b);
+
+        elog(WARN, "heap_delete: (am)invalid tid");
+    }
+
+    /* ----------------
+     * get the tuple and lock it; tell the buffer manager we want
+     * exclusive access to the page
+     * ----------------
+     */
+
+    /* ----------------
+     * store transaction information of xact deleting the tuple
+     * ----------------
+     */
+    TransactionIdStore(GetCurrentTransactionId(), &(tp->t_xmax));
+    tp->t_cmax = GetCurrentCommandId();
+    ItemPointerSetInvalid(&tp->t_chain);
+
+    /* ----------------
+     * invalidate caches
+     * ----------------
+     */
+    SetRefreshWhenInvalidate(ImmediateInvalidation);
+    RelationInvalidateHeapTuple(relation, tp);
+    SetRefreshWhenInvalidate((bool)!ImmediateInvalidation);
+
+    WriteBuffer(b);
+    if ( IsSystemRelationName(RelationGetRelationName(relation)->data) )
+        RelationUnsetLockForWrite(relation);
+}
+
+/* ----------------
+ *	heap_replace	- replace a tuple
+ *
+ *	Must decide how to handle errors.
+ *
+ *	Fix arguments, work with indexes.
+ *
+ *	12/30/93 - modified the return value to be 1 when
+ *		a non-functional update is detected. This
+ *		prevents the calling routine from updating
+ *		indices unnecessarily.
+ *	-kw
+ *
+ * ----------------
+ */
+int
+heap_replace(Relation relation, ItemPointer otid, HeapTuple tup)
+{
+    ItemId lp;
+    HeapTuple tp;
+    Page dp;
+    Buffer buffer;
+
+    /* ----------------
+     * increment access statistics
+     * ----------------
+     */
+    IncrHeapAccessStat(local_replace);
+    IncrHeapAccessStat(global_replace);
+
+    /* ----------------
+     * sanity checks
+     * ----------------
+     */
+    Assert(ItemPointerIsValid(otid));
+
+    /* ----------------
+     * set relation level write lock
+     * ----------------
+     */
+    if (!relation->rd_islocal)
+        RelationSetLockForWrite(relation);
+
+    buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(otid));
+#ifndef NO_BUFFERISVALID
+    if (!BufferIsValid(buffer)) {
+        /* XXX L_SH better ??? */
+        elog(WARN, "amreplace: failed ReadBuffer");
+    }
+#endif /* NO_BUFFERISVALID */
+
+    dp = (Page) BufferGetPage(buffer);
+    lp = PageGetItemId(dp, ItemPointerGetOffsetNumber(otid));
+
+    /* ----------------
+     * logically delete old item
+     * ----------------
+     */
+
+    tp = (HeapTuple) PageGetItem(dp, lp);
+    Assert(HeapTupleIsValid(tp));
+
+    /* -----------------
+     * the following test should be able to catch all non-functional
+     * update attempts and shut out all ghost tuples.
+     * XXX In the future, Spyros may need to update the rule lock on a tuple
+     * more than once within the same command and same transaction.
+     * He will have to introduce a new flag to override the following check.
+     * -- Wei
+     *
+     * -----------------
+     */
+
+    if (TupleUpdatedByCurXactAndCmd(tp)) {
+        elog(NOTICE, "Non-functional update, only first update is performed");
+        if ( IsSystemRelationName(RelationGetRelationName(relation)->data) )
+            RelationUnsetLockForWrite(relation);
+        ReleaseBuffer(buffer);
+        return(1);
+    }
+
+    /* ----------------
+     * check that we're replacing a valid item -
+     *
+     * NOTE that this check must follow the non-functional update test
+     * above as it can happen that we try to 'replace' the same tuple
+     * twice in a single transaction. The second time around the
+     * tuple will fail the NowTimeQual. We don't want to abort the
+     * xact, we only want to flag the 'non-functional' NOTICE. -mer
+     * ----------------
+     */
+    if (!heap_tuple_satisfies(lp,
+                              relation,
+                              (PageHeader) dp,
+                              NowTimeQual,
+                              0,
+                              (ScanKey) NULL))
+    {
+        ReleaseBuffer(buffer);
+        elog(WARN, "heap_replace: (am)invalid otid");
+    }
+
+    /* XXX order problems if not atomic assignment ??? */
+    tup->t_oid = tp->t_oid;
+    TransactionIdStore(GetCurrentTransactionId(), &(tup->t_xmin));
+    tup->t_cmin = GetCurrentCommandId();
+    StoreInvalidTransactionId(&(tup->t_xmax));
+    tup->t_tmin = INVALID_ABSTIME;
+    tup->t_tmax = CURRENT_ABSTIME;
+    ItemPointerSetInvalid(&tup->t_chain);
+
+    /* ----------------
+     * insert new item
+     * ----------------
+     */
+    if ((unsigned)DOUBLEALIGN(tup->t_len) <= PageGetFreeSpace((Page) dp)) {
+        RelationPutHeapTuple(relation, BufferGetBlockNumber(buffer), tup);
+    } else {
+        /* ----------------
+         * new item won't fit on same page as old item, have to look
+         * for a new place to put it.
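Taken together with heap_insert and heap_delete above, a round trip looks like this sketch (example_insert_delete and the single-int4 descriptor td are hypothetical; ' ' in the nulls array means "not null", and the CommandCounterIncrement call assumes the usual transam command-id visibility rules):

#ifdef NOT_USED
/* illustrative sketch only -- not part of the original source */
static void
example_insert_delete(Relation r, TupleDesc td)
{
    Datum values[1];
    char nulls[1];
    HeapTuple tuple;

    values[0] = Int32GetDatum(42);
    nulls[0] = ' ';

    tuple = heap_formtuple(td, values, nulls);
    (void) heap_insert(r, tuple);   /* assigns and returns the Oid */

    CommandCounterIncrement();      /* make the insert visible to NowTimeQual */

    /* the insert routines store the tuple's new location in t_ctid */
    heap_delete(r, &tuple->t_ctid);
    pfree(tuple);
}
#endif /* NOT_USED */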
+ * ----------------
+ */
+        doinsert(relation, tup);
+    }
+
+    /* ----------------
+     * new item in place, now record transaction information
+     * ----------------
+     */
+    TransactionIdStore(GetCurrentTransactionId(), &(tp->t_xmax));
+    tp->t_cmax = GetCurrentCommandId();
+    tp->t_chain = tup->t_ctid;
+
+    /* ----------------
+     * invalidate caches
+     * ----------------
+     */
+    SetRefreshWhenInvalidate(ImmediateInvalidation);
+    RelationInvalidateHeapTuple(relation, tp);
+    SetRefreshWhenInvalidate((bool)!ImmediateInvalidation);
+
+    WriteBuffer(buffer);
+
+    if ( IsSystemRelationName(RelationGetRelationName(relation)->data) )
+        RelationUnsetLockForWrite(relation);
+
+    return(0);
+}
+
+/* ----------------
+ *	heap_markpos	- mark scan position
+ *
+ *	Note:
+ *	Only one mark should be maintained per scan at any one time.
+ *	Check if this can be done generally--say calls to get the
+ *	next/previous tuple and NEVER pass struct scandesc to the
+ *	user AM's. Now, the mark is sent to the executor for safekeeping.
+ *	Probably can store this info into a GENERAL scan structure.
+ *
+ *	May be best to change this call to store the marked position
+ *	(up to 2?) in the scan structure itself.
+ *	Fix to use the proper caching structure.
+ * ----------------
+ */
+void
+heap_markpos(HeapScanDesc sdesc)
+{
+
+    /* ----------------
+     * increment access statistics
+     * ----------------
+     */
+    IncrHeapAccessStat(local_markpos);
+    IncrHeapAccessStat(global_markpos);
+
+    /* Note: no locking manipulations needed */
+
+    if (sdesc->rs_ptup == NULL &&
+        BufferIsUnknown(sdesc->rs_pbuf)) { /* == NONTUP */
+        sdesc->rs_ptup = (HeapTuple)
+            heapgettup(sdesc->rs_rd,
+                       (sdesc->rs_ctup == NULL) ?
+                       (ItemPointer)NULL : &sdesc->rs_ctup->t_ctid,
+                       -1,
+                       &sdesc->rs_pbuf,
+                       sdesc->rs_tr,
+                       sdesc->rs_nkeys,
+                       sdesc->rs_key);
+
+    } else if (sdesc->rs_ntup == NULL &&
+               BufferIsUnknown(sdesc->rs_nbuf)) { /* == NONTUP */
+        sdesc->rs_ntup = (HeapTuple)
+            heapgettup(sdesc->rs_rd,
+                       (sdesc->rs_ctup == NULL) ?
+                       (ItemPointer)NULL : &sdesc->rs_ctup->t_ctid,
+                       1,
+                       &sdesc->rs_nbuf,
+                       sdesc->rs_tr,
+                       sdesc->rs_nkeys,
+                       sdesc->rs_key);
+    }
+
+    /* ----------------
+     * Should not unpin the buffer pages. They may still be in use.
+     * ----------------
+     */
+    if (sdesc->rs_ptup != NULL) {
+        sdesc->rs_mptid = sdesc->rs_ptup->t_ctid;
+    } else {
+        ItemPointerSetInvalid(&sdesc->rs_mptid);
+    }
+    if (sdesc->rs_ctup != NULL) {
+        sdesc->rs_mctid = sdesc->rs_ctup->t_ctid;
+    } else {
+        ItemPointerSetInvalid(&sdesc->rs_mctid);
+    }
+    if (sdesc->rs_ntup != NULL) {
+        sdesc->rs_mntid = sdesc->rs_ntup->t_ctid;
+    } else {
+        ItemPointerSetInvalid(&sdesc->rs_mntid);
+    }
+}
+
+/* ----------------
+ *	heap_restrpos	- restore position to marked location
+ *
+ *	Note: there are bad side effects here. If we were past the end
+ *	of a relation when heap_markpos was called, and the relation is
+ *	then extended via insert, the next call to heap_restrpos will
+ *	cause the added tuples to be visible when the scan continues.
+ *	Problems also arise if the TID's are rearranged!!!
+ *
+ *	Now pins buffer once for each valid tuple pointer (rs_ptup,
+ *	rs_ctup, rs_ntup) referencing it.
+ *	- 01/13/94
+ *
+ *	XXX might be better to do direct access instead of
+ *	using the generality of heapgettup().
+ *
+ *	XXX It is very possible that when a scan is restored, a tuple
+ *	XXX which previously qualified may fail for time range purposes, unless
+ *	XXX some form of locking exists (i.e., portals currently can act funny).
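In use, mark and restore bracket a look-ahead, as in this sketch (example_peek is hypothetical):

#ifdef NOT_USED
/* illustrative sketch only -- not part of the original source */
static void
example_peek(HeapScanDesc sdesc)
{
    Buffer buffer;

    heap_markpos(sdesc);                    /* remember prev/current/next */

    (void) heap_getnext(sdesc, 0, &buffer); /* read ahead... */

    heap_restrpos(sdesc);                   /* ...and back up to the mark */
}
#endif /* NOT_USED */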
+ * ---------------- + */ +void +heap_restrpos(HeapScanDesc sdesc) +{ + /* ---------------- + * increment access statistics + * ---------------- + */ + IncrHeapAccessStat(local_restrpos); + IncrHeapAccessStat(global_restrpos); + + /* XXX no amrestrpos checking that ammarkpos called */ + + /* Note: no locking manipulations needed */ + + unpinsdesc(sdesc); + + /* force heapgettup to pin buffer for each loaded tuple */ + sdesc->rs_pbuf = InvalidBuffer; + sdesc->rs_cbuf = InvalidBuffer; + sdesc->rs_nbuf = InvalidBuffer; + + if (!ItemPointerIsValid(&sdesc->rs_mptid)) { + sdesc->rs_ptup = NULL; + } else { + sdesc->rs_ptup = (HeapTuple) + heapgettup(sdesc->rs_rd, + &sdesc->rs_mptid, + 0, + &sdesc->rs_pbuf, + NowTimeQual, + 0, + (ScanKey) NULL); + } + + if (!ItemPointerIsValid(&sdesc->rs_mctid)) { + sdesc->rs_ctup = NULL; + } else { + sdesc->rs_ctup = (HeapTuple) + heapgettup(sdesc->rs_rd, + &sdesc->rs_mctid, + 0, + &sdesc->rs_cbuf, + NowTimeQual, + 0, + (ScanKey) NULL); + } + + if (!ItemPointerIsValid(&sdesc->rs_mntid)) { + sdesc->rs_ntup = NULL; + } else { + sdesc->rs_ntup = (HeapTuple) + heapgettup(sdesc->rs_rd, + &sdesc->rs_mntid, + 0, + &sdesc->rs_nbuf, + NowTimeQual, + 0, + (ScanKey) NULL); + } +} diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c new file mode 100644 index 00000000000..457e1174a30 --- /dev/null +++ b/src/backend/access/heap/hio.c @@ -0,0 +1,195 @@ +/*------------------------------------------------------------------------- + * + * hio.c-- + * POSTGRES heap access method input/output code. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Id: hio.c,v 1.1.1.1 1996/07/09 06:21:11 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include <string.h> + +#include "c.h" + +#include "access/heapam.h" +#include "access/hio.h" +#include "access/htup.h" + +#include "storage/block.h" +#include "storage/buf.h" +#include "storage/bufmgr.h" +#include "storage/bufpage.h" +#include "storage/itemid.h" +#include "storage/itemptr.h" +#include "storage/off.h" + +#include "utils/memutils.h" +#include "utils/elog.h" +#include "utils/rel.h" + +/* + * amputunique - place tuple at tid + * Currently on errors, calls elog. Perhaps should return -1? + * Possible errors include the addition of a tuple to the page + * between the time the linep is chosen and the page is L_UP'd. + * + * This should be coordinated with the B-tree code. + * Probably needs to have an amdelunique to allow for + * internal index records to be deleted and reordered as needed. + * For the heap AM, this should never be needed. 
+ */ +void +RelationPutHeapTuple(Relation relation, + BlockNumber blockIndex, + HeapTuple tuple) +{ + Buffer buffer; + Page pageHeader; + BlockNumber numberOfBlocks; + OffsetNumber offnum; + unsigned int len; + ItemId itemId; + Item item; + + /* ---------------- + * increment access statistics + * ---------------- + */ + IncrHeapAccessStat(local_RelationPutHeapTuple); + IncrHeapAccessStat(global_RelationPutHeapTuple); + + Assert(RelationIsValid(relation)); + Assert(HeapTupleIsValid(tuple)); + + numberOfBlocks = RelationGetNumberOfBlocks(relation); + Assert(blockIndex < numberOfBlocks); + + buffer = ReadBuffer(relation, blockIndex); +#ifndef NO_BUFFERISVALID + if (!BufferIsValid(buffer)) { + elog(WARN, "RelationPutHeapTuple: no buffer for %ld in %s", + blockIndex, &relation->rd_rel->relname); + } +#endif + + pageHeader = (Page)BufferGetPage(buffer); + len = (unsigned)DOUBLEALIGN(tuple->t_len); /* be conservative */ + Assert((int)len <= PageGetFreeSpace(pageHeader)); + + offnum = PageAddItem((Page)pageHeader, (Item)tuple, + tuple->t_len, InvalidOffsetNumber, LP_USED); + + itemId = PageGetItemId((Page)pageHeader, offnum); + item = PageGetItem((Page)pageHeader, itemId); + + ItemPointerSet(&((HeapTuple)item)->t_ctid, blockIndex, offnum); + + WriteBuffer(buffer); + /* return an accurate tuple */ + ItemPointerSet(&tuple->t_ctid, blockIndex, offnum); +} + +/* + * The heap_insert routines "know" that a buffer page is initialized to + * zero when a BlockExtend operation is performed. + */ + +#define PageIsNew(page) ((page)->pd_upper == 0) + +/* + * This routine is another in the series of attempts to reduce the number + * of I/O's and system calls executed in the various benchmarks. In + * particular, this routine is used to append data to the end of a relation + * file without excessive lseeks. This code should do no more than 2 semops + * in the ideal case. + * + * Eventually, we should cache the number of blocks in a relation somewhere. + * Until that time, this code will have to do an lseek to determine the number + * of blocks in a relation. + * + * This code should ideally do at most 4 semops, 1 lseek, and possibly 1 write + * to do an append; it's possible to eliminate 2 of the semops if we do direct + * buffer stuff (!); the lseek and the write can go if we get + * RelationGetNumberOfBlocks to be useful. + * + * NOTE: This code presumes that we have a write lock on the relation. + * + * Also note that this routine probably shouldn't have to exist, and does + * screw up the call graph rather badly, but we are wasting so much time and + * system resources being massively general that we are losing badly in our + * performance benchmarks. + */ +void +RelationPutHeapTupleAtEnd(Relation relation, HeapTuple tuple) +{ + Buffer buffer; + Page pageHeader; + BlockNumber lastblock; + OffsetNumber offnum; + unsigned int len; + ItemId itemId; + Item item; + + Assert(RelationIsValid(relation)); + Assert(HeapTupleIsValid(tuple)); + + /* + * XXX This does an lseek - VERY expensive - but at the moment it + * is the only way to accurately determine how many blocks are in + * a relation. A good optimization would be to get this to actually + * work properly. 
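Both placement routines apply the same conservative fit test: the double-aligned tuple length measured against the page's free space. Restated as a sketch (example_tuple_fits is a hypothetical name):

#ifdef NOT_USED
/* illustrative sketch only -- not part of the original source */
static bool
example_tuple_fits(Page pageHeader, HeapTuple tuple)
{
    unsigned int len;

    len = (unsigned) DOUBLEALIGN(tuple->t_len); /* be conservative */
    return ((int) len <= PageGetFreeSpace(pageHeader));
}
#endif /* NOT_USED */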
+ */ + + lastblock = RelationGetNumberOfBlocks(relation); + + if (lastblock == 0) + { + buffer = ReadBuffer(relation, lastblock); + pageHeader = (Page)BufferGetPage(buffer); + if (PageIsNew((PageHeader) pageHeader)) + { + buffer = ReleaseAndReadBuffer(buffer, relation, P_NEW); + pageHeader = (Page)BufferGetPage(buffer); + PageInit(pageHeader, BufferGetPageSize(buffer), 0); + } + } + else + buffer = ReadBuffer(relation, lastblock - 1); + + pageHeader = (Page)BufferGetPage(buffer); + len = (unsigned)DOUBLEALIGN(tuple->t_len); /* be conservative */ + + /* + * Note that this is true if the above returned a bogus page, which + * it will do for a completely empty relation. + */ + + if (len > PageGetFreeSpace(pageHeader)) + { + buffer = ReleaseAndReadBuffer(buffer, relation, P_NEW); + pageHeader = (Page)BufferGetPage(buffer); + PageInit(pageHeader, BufferGetPageSize(buffer), 0); + + if (len > PageGetFreeSpace(pageHeader)) + elog(WARN, "Tuple is too big: size %d", len); + } + + offnum = PageAddItem((Page)pageHeader, (Item)tuple, + tuple->t_len, InvalidOffsetNumber, LP_USED); + + itemId = PageGetItemId((Page)pageHeader, offnum); + item = PageGetItem((Page)pageHeader, itemId); + + lastblock = BufferGetBlockNumber(buffer); + + ItemPointerSet(&((HeapTuple)item)->t_ctid, lastblock, offnum); + + /* return an accurate tuple */ + ItemPointerSet(&tuple->t_ctid, lastblock, offnum); + + WriteBuffer(buffer); +} diff --git a/src/backend/access/heap/stats.c b/src/backend/access/heap/stats.c new file mode 100644 index 00000000000..d41d01ac1ba --- /dev/null +++ b/src/backend/access/heap/stats.c @@ -0,0 +1,329 @@ +/*------------------------------------------------------------------------- + * + * stats.c-- + * heap access method debugging statistic collection routines + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/heap/Attic/stats.c,v 1.1.1.1 1996/07/09 06:21:11 scrappy Exp $ + * + * NOTES + * initam should be moved someplace else. 
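The statistics machinery defined below is driven like this sketch (example_stats is hypothetical; initam() normally performs the initialization):

#ifdef NOT_USED
/* illustrative sketch only -- not part of the original source */
static void
example_stats()
{
    InitHeapAccessStatistics();         /* no-op if already initialized */

    IncrHeapAccessStat(local_open);     /* what the am routines do */

    /* snapshot the counters, print the copy, and free it */
    PrintAndFreeHeapAccessStatistics(GetHeapAccessStatistics());

    ResetHeapAccessStatistics();        /* zero the local_* counters */
}
#endif /* NOT_USED */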
+ * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/heapam.h" + +#include "utils/memutils.h" +#include "utils/palloc.h" +#include "utils/elog.h" +#include "utils/mcxt.h" + +/* ---------------- + * InitHeapAccessStatistics + * ---------------- + */ +HeapAccessStatistics heap_access_stats = (HeapAccessStatistics) NULL; + +void +InitHeapAccessStatistics() +{ + MemoryContext oldContext; + HeapAccessStatistics stats; + + /* ---------------- + * make sure we don't initialize things twice + * ---------------- + */ + if (heap_access_stats != NULL) + return; + + /* ---------------- + * allocate statistics structure from the top memory context + * ---------------- + */ + oldContext = MemoryContextSwitchTo(TopMemoryContext); + + stats = (HeapAccessStatistics) + palloc(sizeof(HeapAccessStatisticsData)); + + /* ---------------- + * initialize fields to default values + * ---------------- + */ + stats->global_open = 0; + stats->global_openr = 0; + stats->global_close = 0; + stats->global_beginscan = 0; + stats->global_rescan = 0; + stats->global_endscan = 0; + stats->global_getnext = 0; + stats->global_fetch = 0; + stats->global_insert = 0; + stats->global_delete = 0; + stats->global_replace = 0; + stats->global_markpos = 0; + stats->global_restrpos = 0; + stats->global_BufferGetRelation = 0; + stats->global_RelationIdGetRelation = 0; + stats->global_RelationIdGetRelation_Buf = 0; + stats->global_getreldesc = 0; + stats->global_heapgettup = 0; + stats->global_RelationPutHeapTuple = 0; + stats->global_RelationPutLongHeapTuple = 0; + + stats->local_open = 0; + stats->local_openr = 0; + stats->local_close = 0; + stats->local_beginscan = 0; + stats->local_rescan = 0; + stats->local_endscan = 0; + stats->local_getnext = 0; + stats->local_fetch = 0; + stats->local_insert = 0; + stats->local_delete = 0; + stats->local_replace = 0; + stats->local_markpos = 0; + stats->local_restrpos = 0; + stats->local_BufferGetRelation = 0; + stats->local_RelationIdGetRelation = 0; + stats->local_RelationIdGetRelation_Buf = 0; + stats->local_getreldesc = 0; + stats->local_heapgettup = 0; + stats->local_RelationPutHeapTuple = 0; + stats->local_RelationPutLongHeapTuple = 0; + stats->local_RelationNameGetRelation = 0; + stats->global_RelationNameGetRelation = 0; + + /* ---------------- + * record init times + * ---------------- + */ + time(&stats->init_global_timestamp); + time(&stats->local_reset_timestamp); + time(&stats->last_request_timestamp); + + /* ---------------- + * return to old memory context + * ---------------- + */ + (void) MemoryContextSwitchTo(oldContext); + + heap_access_stats = stats; +} + +/* ---------------- + * ResetHeapAccessStatistics + * ---------------- + */ +void +ResetHeapAccessStatistics() +{ + HeapAccessStatistics stats; + + /* ---------------- + * do nothing if stats aren't initialized + * ---------------- + */ + if (heap_access_stats == NULL) + return; + + stats = heap_access_stats; + + /* ---------------- + * reset local counts + * ---------------- + */ + stats->local_open = 0; + stats->local_openr = 0; + stats->local_close = 0; + stats->local_beginscan = 0; + stats->local_rescan = 0; + stats->local_endscan = 0; + stats->local_getnext = 0; + stats->local_fetch = 0; + stats->local_insert = 0; + stats->local_delete = 0; + stats->local_replace = 0; + stats->local_markpos = 0; + stats->local_restrpos = 0; + stats->local_BufferGetRelation = 0; + stats->local_RelationIdGetRelation = 0; + stats->local_RelationIdGetRelation_Buf = 
0; + stats->local_getreldesc = 0; + stats->local_heapgettup = 0; + stats->local_RelationPutHeapTuple = 0; + stats->local_RelationPutLongHeapTuple = 0; + + /* ---------------- + * reset local timestamps + * ---------------- + */ + time(&stats->local_reset_timestamp); + time(&stats->last_request_timestamp); +} + +/* ---------------- + * GetHeapAccessStatistics + * ---------------- + */ +HeapAccessStatistics GetHeapAccessStatistics() +{ + HeapAccessStatistics stats; + + /* ---------------- + * return nothing if stats aren't initialized + * ---------------- + */ + if (heap_access_stats == NULL) + return NULL; + + /* ---------------- + * record the current request time + * ---------------- + */ + time(&heap_access_stats->last_request_timestamp); + + /* ---------------- + * allocate a copy of the stats and return it to the caller. + * ---------------- + */ + stats = (HeapAccessStatistics) + palloc(sizeof(HeapAccessStatisticsData)); + + memmove(stats, + heap_access_stats, + sizeof(HeapAccessStatisticsData)); + + return stats; +} + +/* ---------------- + * PrintHeapAccessStatistics + * ---------------- + */ +void +PrintHeapAccessStatistics(HeapAccessStatistics stats) +{ + /* ---------------- + * return nothing if stats aren't valid + * ---------------- + */ + if (stats == NULL) + return; + + printf("======== heap am statistics ========\n"); + printf("init_global_timestamp: %s", + ctime(&(stats->init_global_timestamp))); + + printf("local_reset_timestamp: %s", + ctime(&(stats->local_reset_timestamp))); + + printf("last_request_timestamp: %s", + ctime(&(stats->last_request_timestamp))); + + printf("local/global_open: %6d/%6d\n", + stats->local_open, stats->global_open); + + printf("local/global_openr: %6d/%6d\n", + stats->local_openr, stats->global_openr); + + printf("local/global_close: %6d/%6d\n", + stats->local_close, stats->global_close); + + printf("local/global_beginscan: %6d/%6d\n", + stats->local_beginscan, stats->global_beginscan); + + printf("local/global_rescan: %6d/%6d\n", + stats->local_rescan, stats->global_rescan); + + printf("local/global_endscan: %6d/%6d\n", + stats->local_endscan, stats->global_endscan); + + printf("local/global_getnext: %6d/%6d\n", + stats->local_getnext, stats->global_getnext); + + printf("local/global_fetch: %6d/%6d\n", + stats->local_fetch, stats->global_fetch); + + printf("local/global_insert: %6d/%6d\n", + stats->local_insert, stats->global_insert); + + printf("local/global_delete: %6d/%6d\n", + stats->local_delete, stats->global_delete); + + printf("local/global_replace: %6d/%6d\n", + stats->local_replace, stats->global_replace); + + printf("local/global_markpos: %6d/%6d\n", + stats->local_markpos, stats->global_markpos); + + printf("local/global_restrpos: %6d/%6d\n", + stats->local_restrpos, stats->global_restrpos); + + printf("================\n"); + + printf("local/global_BufferGetRelation: %6d/%6d\n", + stats->local_BufferGetRelation, + stats->global_BufferGetRelation); + + printf("local/global_RelationIdGetRelation: %6d/%6d\n", + stats->local_RelationIdGetRelation, + stats->global_RelationIdGetRelation); + + printf("local/global_RelationIdGetRelation_Buf: %6d/%6d\n", + stats->local_RelationIdGetRelation_Buf, + stats->global_RelationIdGetRelation_Buf); + + printf("local/global_getreldesc: %6d/%6d\n", + stats->local_getreldesc, stats->global_getreldesc); + + printf("local/global_heapgettup: %6d/%6d\n", + stats->local_heapgettup, stats->global_heapgettup); + + printf("local/global_RelationPutHeapTuple: %6d/%6d\n", + stats->local_RelationPutHeapTuple, + 
stats->global_RelationPutHeapTuple); + + printf("local/global_RelationPutLongHeapTuple: %6d/%6d\n", + stats->local_RelationPutLongHeapTuple, + stats->global_RelationPutLongHeapTuple); + + printf("===================================\n"); + + printf("\n"); +} + +/* ---------------- + * PrintAndFreeHeapAccessStatistics + * ---------------- + */ +void +PrintAndFreeHeapAccessStatistics(HeapAccessStatistics stats) +{ + PrintHeapAccessStatistics(stats); + if (stats != NULL) + pfree(stats); +} + +/* ---------------------------------------------------------------- + * access method initialization + * ---------------------------------------------------------------- + */ +/* ---------------- + * initam should someday be moved someplace else. + * ---------------- + */ +void +initam() +{ + /* ---------------- + * initialize heap statistics. + * ---------------- + */ + InitHeapAccessStatistics(); +} diff --git a/src/backend/access/heapam.h b/src/backend/access/heapam.h new file mode 100644 index 00000000000..9938dbeea77 --- /dev/null +++ b/src/backend/access/heapam.h @@ -0,0 +1,149 @@ +/*------------------------------------------------------------------------- + * + * heapam.h-- + * POSTGRES heap access method definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: heapam.h,v 1.1.1.1 1996/07/09 06:21:08 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef HEAPAM_H +#define HEAPAM_H + +#include <sys/types.h> + +#include "postgres.h" + +#include "access/attnum.h" +#include "access/htup.h" +#include "access/relscan.h" +#include "access/skey.h" +#include "utils/tqual.h" +#include "access/tupdesc.h" +#include "storage/smgr.h" +#include "utils/rel.h" + +/* ---------------------------------------------------------------- + * heap access method statistics + * ---------------------------------------------------------------- + */ + +typedef struct HeapAccessStatisticsData { + time_t init_global_timestamp; /* time global statistics started */ + time_t local_reset_timestamp; /* last time local reset was done */ + time_t last_request_timestamp; /* last time stats were requested */ + + int global_open; + int global_openr; + int global_close; + int global_beginscan; + int global_rescan; + int global_endscan; + int global_getnext; + int global_fetch; + int global_insert; + int global_delete; + int global_replace; + int global_markpos; + int global_restrpos; + int global_BufferGetRelation; + int global_RelationIdGetRelation; + int global_RelationIdGetRelation_Buf; + int global_RelationNameGetRelation; + int global_getreldesc; + int global_heapgettup; + int global_RelationPutHeapTuple; + int global_RelationPutLongHeapTuple; + + int local_open; + int local_openr; + int local_close; + int local_beginscan; + int local_rescan; + int local_endscan; + int local_getnext; + int local_fetch; + int local_insert; + int local_delete; + int local_replace; + int local_markpos; + int local_restrpos; + int local_BufferGetRelation; + int local_RelationIdGetRelation; + int local_RelationIdGetRelation_Buf; + int local_RelationNameGetRelation; + int local_getreldesc; + int local_heapgettup; + int local_RelationPutHeapTuple; + int local_RelationPutLongHeapTuple; +} HeapAccessStatisticsData; + +typedef HeapAccessStatisticsData *HeapAccessStatistics; + +#define IncrHeapAccessStat(x) \ + (heap_access_stats == NULL ? 
0 : (heap_access_stats->x)++) + +extern HeapAccessStatistics heap_access_stats; /* in stats.c */ + +/* ---------------- + * function prototypes for heap access method + * ---------------- + */ +/* heap_create, heap_creatr, and heap_destroy are declared in catalog/heap.h */ +#include "catalog/heap.h" + +/* heapam.c */ +extern void doinsert(Relation relation, HeapTuple tup); +extern void SetHeapAccessMethodImmediateInvalidation(bool on); + +extern Relation heap_open(Oid relationId); +extern Relation heap_openr(char *relationName); +extern void heap_close(Relation relation); +extern HeapScanDesc heap_beginscan(Relation relation, int atend, + TimeQual timeQual, unsigned nkeys, ScanKey key); +extern void heap_rescan(HeapScanDesc sdesc, bool scanFromEnd, ScanKey key); +extern void heap_endscan(HeapScanDesc sdesc); +extern HeapTuple heap_getnext(HeapScanDesc scandesc, int backw, Buffer *b); +extern HeapTuple heap_fetch(Relation relation, TimeQual timeQual, + ItemPointer tid, Buffer *b); +extern Oid heap_insert(Relation relation, HeapTuple tup); +extern void heap_delete(Relation relation, ItemPointer tid); +extern int heap_replace(Relation relation, ItemPointer otid, + HeapTuple tup); +extern void heap_markpos(HeapScanDesc sdesc); +extern void heap_restrpos(HeapScanDesc sdesc); + +/* in common/heaptuple.c */ +extern Size ComputeDataSize(TupleDesc tupleDesc, Datum value[], char nulls[]); +extern void DataFill(char *data, TupleDesc tupleDesc, + Datum value[], char nulls[], char *infomask, + bits8 bit[]); +extern int heap_attisnull(HeapTuple tup, int attnum); +extern int heap_sysattrlen(AttrNumber attno); +extern bool heap_sysattrbyval(AttrNumber attno); +extern char *heap_getsysattr(HeapTuple tup, Buffer b, int attnum); +extern char *fastgetattr(HeapTuple tup, unsigned attnum, + TupleDesc att, bool *isnull); +extern char *heap_getattr(HeapTuple tup, Buffer b, int attnum, + TupleDesc att, bool *isnull); +extern HeapTuple heap_copytuple(HeapTuple tuple); +extern void heap_deformtuple(HeapTuple tuple, TupleDesc tdesc, + Datum values[], char nulls[]); +extern HeapTuple heap_formtuple(TupleDesc tupleDescriptor, + Datum value[], char nulls[]); +extern HeapTuple heap_modifytuple(HeapTuple tuple, Buffer buffer, + Relation relation, Datum replValue[], char replNull[], char repl[]); +HeapTuple heap_addheader(uint32 natts, int structlen, char *structure); + +/* in common/heap/stats.c */ +extern void InitHeapAccessStatistics(void); +extern void ResetHeapAccessStatistics(void); +extern HeapAccessStatistics GetHeapAccessStatistics(void); +extern void PrintHeapAccessStatistics(HeapAccessStatistics stats); +extern void PrintAndFreeHeapAccessStatistics(HeapAccessStatistics stats); +extern void initam(void); + +#endif /* HEAPAM_H */ diff --git a/src/backend/access/hio.h b/src/backend/access/hio.h new file mode 100644 index 00000000000..4a699ffcd98 --- /dev/null +++ b/src/backend/access/hio.h @@ -0,0 +1,26 @@ +/*------------------------------------------------------------------------- + * + * hio.h-- + * POSTGRES heap access method input/output definitions. 
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: hio.h,v 1.1.1.1 1996/07/09 06:21:08 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef HIO_H
+#define HIO_H
+
+#include "c.h"
+
+#include "storage/block.h"
+#include "access/htup.h"
+#include "utils/rel.h"
+
+extern void RelationPutHeapTuple(Relation relation, BlockNumber blockIndex,
+                                 HeapTuple tuple);
+extern void RelationPutHeapTupleAtEnd(Relation relation, HeapTuple tuple);
+
+#endif /* HIO_H */
diff --git a/src/backend/access/htup.h b/src/backend/access/htup.h
new file mode 100644
index 00000000000..7cf1ecf1762
--- /dev/null
+++ b/src/backend/access/htup.h
@@ -0,0 +1,115 @@
+/*-------------------------------------------------------------------------
+ *
+ * htup.h--
+ *	POSTGRES heap tuple definitions.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: htup.h,v 1.1.1.1 1996/07/09 06:21:08 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef HTUP_H
+#define HTUP_H
+
+#include "access/attnum.h"
+#include "storage/bufpage.h"	/* just to reduce levels of #include */
+#include "storage/itemptr.h"
+#include "utils/nabstime.h"
+
+#define MinHeapTupleBitmapSize	32	/* 8 * 4 */
+
+/* check these, they are likely to be more severely limited by t_hoff */
+
+#define MaxHeapAttributeNumber	1600	/* 8 * 200 */
+
+/*
+ * to avoid wasting space, the attributes should be laid out in such a
+ * way as to reduce structure padding.
+ */
+typedef struct HeapTupleData {
+
+    unsigned int t_len;		/* length of entire tuple */
+
+    ItemPointerData t_ctid;	/* current TID of this tuple */
+
+    ItemPointerData t_chain;	/* replaced tuple TID */
+
+    Oid t_oid;			/* OID of this tuple -- 4 bytes */
+
+    CommandId t_cmin;		/* insert CID stamp -- 2 bytes each */
+    CommandId t_cmax;		/* delete CommandId stamp */
+
+    TransactionId t_xmin;	/* insert XID stamp -- 4 bytes each */
+    TransactionId t_xmax;	/* delete XID stamp */
+
+    AbsoluteTime t_tmin;	/* time stamps -- 4 bytes each */
+    AbsoluteTime t_tmax;
+
+    int16 t_natts;		/* number of attributes */
+    char t_vtype;		/* not used - padding */
+
+    char t_infomask;		/* whether tuple has null or variable
+				 * length attributes
+				 */
+
+    uint8 t_hoff;		/* sizeof tuple header */
+
+    bits8 t_bits[MinHeapTupleBitmapSize / 8];
+				/* bit map of domains */
+
+    /* MORE DATA FOLLOWS AT END OF STRUCT */
+} HeapTupleData;
+
+typedef HeapTupleData *HeapTuple;
+
+
+#define SelfItemPointerAttributeNumber		(-1)
+#define ObjectIdAttributeNumber			(-2)
+#define MinTransactionIdAttributeNumber		(-3)
+#define MinCommandIdAttributeNumber		(-4)
+#define MaxTransactionIdAttributeNumber		(-5)
+#define MaxCommandIdAttributeNumber		(-6)
+#define ChainItemPointerAttributeNumber		(-7)
+#define AnchorItemPointerAttributeNumber	(-8)
+#define MinAbsoluteTimeAttributeNumber		(-9)
+#define MaxAbsoluteTimeAttributeNumber		(-10)
+#define VersionTypeAttributeNumber		(-11)
+#define FirstLowInvalidHeapAttributeNumber	(-12)
+
+
+/* ----------------
+ *	support macros
+ * ----------------
+ */
+#define GETSTRUCT(TUP) (((char *)(TUP)) + ((HeapTuple)(TUP))->t_hoff)
+
+
+/*
+ * BITMAPLEN(NATTS) -
+ *	Computes minimum size of bitmap given number of domains.
+ */
+#define BITMAPLEN(NATTS) \
+	((((((int)(NATTS) - 1) >> 3) + 4 - (MinHeapTupleBitmapSize >> 3)) \
+	  & ~03) + (MinHeapTupleBitmapSize >> 3))
+
+/*
+ * HeapTupleIsValid
+ *	True iff the heap tuple is valid.
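Together with the t_infomask tests defined just below, GETSTRUCT supports direct access to fixed-width, null-free tuples, as in this sketch (FormData_example and example_getstruct are hypothetical):

#ifdef NOT_USED
/* illustrative sketch only -- not part of the original source */
typedef struct FormData_example {	/* matches a two-int4 relation */
    int32 a;
    int32 b;
} FormData_example;

static void
example_getstruct(HeapTuple tuple)
{
    FormData_example *d;

    if (HeapTupleNoNulls(tuple) && HeapTupleAllFixed(tuple)) {
        d = (FormData_example *) GETSTRUCT(tuple);
        /* d->a and d->b are attributes 1 and 2 */
    }
}
#endif /* NOT_USED */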
+ */ +#define HeapTupleIsValid(tuple) PointerIsValid(tuple) + +/* + * information stored in t_infomask: + */ +#define HEAP_HASNULL 0x01 /* has null attribute(s) */ +#define HEAP_HASVARLENA 0x02 /* has variable length attribute(s) */ + +#define HeapTupleNoNulls(tuple) \ + (!(((HeapTuple) (tuple))->t_infomask & HEAP_HASNULL)) + +#define HeapTupleAllFixed(tuple) \ + (!(((HeapTuple) (tuple))->t_infomask & HEAP_HASVARLENA)) + +#endif /* HTUP_H */ diff --git a/src/backend/access/ibit.h b/src/backend/access/ibit.h new file mode 100644 index 00000000000..990c23ab4dd --- /dev/null +++ b/src/backend/access/ibit.h @@ -0,0 +1,34 @@ +/*------------------------------------------------------------------------- + * + * ibit.h-- + * POSTGRES index valid attribute bit map definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: ibit.h,v 1.1.1.1 1996/07/09 06:21:08 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef IBIT_H +#define IBIT_H + +#include "c.h" +#include "utils/memutils.h" + +typedef struct IndexAttributeBitMapData { + char bits[(MaxIndexAttributeNumber + MaxBitsPerByte - 1) + / MaxBitsPerByte]; +} IndexAttributeBitMapData; + +typedef IndexAttributeBitMapData *IndexAttributeBitMap; + +#define IndexAttributeBitMapSize sizeof(IndexAttributeBitMapData) + +/* + * IndexAttributeBitMapIsValid -- + * True iff attribute bit map is valid. + */ +#define IndexAttributeBitMapIsValid(bits) PointerIsValid(bits) + +#endif /* IBIT_H */ diff --git a/src/backend/access/index/Makefile.inc b/src/backend/access/index/Makefile.inc new file mode 100644 index 00000000000..0bc58830c8f --- /dev/null +++ b/src/backend/access/index/Makefile.inc @@ -0,0 +1,14 @@ +#------------------------------------------------------------------------- +# +# Makefile.inc-- +# Makefile for access/index +# +# Copyright (c) 1994, Regents of the University of California +# +# +# IDENTIFICATION +# $Header: /cvsroot/pgsql/src/backend/access/index/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:11 scrappy Exp $ +# +#------------------------------------------------------------------------- + +SUBSRCS+= genam.c indexam.c istrat.c diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c new file mode 100644 index 00000000000..3d02ba57009 --- /dev/null +++ b/src/backend/access/index/genam.c @@ -0,0 +1,275 @@ +/*------------------------------------------------------------------------- + * + * genam.c-- + * general index access method routines + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/index/genam.c,v 1.1.1.1 1996/07/09 06:21:11 scrappy Exp $ + * + * NOTES + * many of the old access method routines have been turned into + * macros and moved to genam.h -cim 4/30/91 + * + *------------------------------------------------------------------------- + */ +/* + * OLD COMMENTS + * Scans are implemented as follows: + * + * `0' represents an invalid item pointer. + * `-' represents an unknown item pointer. + * `X' represents a known item pointers. + * `+' represents known or invalid item pointers. + * `*' represents any item pointers. + * + * State is represented by a triple of these symbols in the order of + * previous, current, next. Note that the case of reverse scans works + * identically. 
+ * + * State Result + * (1) + + - + 0 0 (if the next item pointer is invalid) + * (2) + X - (otherwise) + * (3) * 0 0 * 0 0 (no change) + * (4) + X 0 X 0 0 (shift) + * (5) * + X + X - (shift, add unknown) + * + * All other states cannot occur. + * + * Note: + *It would be possible to cache the status of the previous and + * next item pointer using the flags. + * ---------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/attnum.h" +#include "access/genam.h" +#include "access/heapam.h" +#include "access/itup.h" +#include "access/relscan.h" +#include "access/sdir.h" +#include "access/skey.h" + +#include "storage/bufmgr.h" +#include "utils/elog.h" +#include "utils/palloc.h" +#include "utils/rel.h" + +#include "catalog/catname.h" +#include "catalog/pg_attribute.h" +#include "catalog/pg_index.h" +#include "catalog/pg_proc.h" + +#include "catalog/index.h" + +/* ---------------------------------------------------------------- + * general access method routines + * + * All indexed access methods use an identical scan structure. + * We don't know how the various AMs do locking, however, so we don't + * do anything about that here. + * + * The intent is that an AM implementor will define a front-end routine + * that calls this one, to fill in the scan, and then does whatever kind + * of locking he wants. + * ---------------------------------------------------------------- + */ + +/* ---------------- + * RelationGetIndexScan -- Create and fill an IndexScanDesc. + * + * This routine creates an index scan structure and sets its contents + * up correctly. This routine calls AMrescan to set up the scan with + * the passed key. + * + * Parameters: + * relation -- index relation for scan. + * scanFromEnd -- if true, begin scan at one of the index's + * endpoints. + * numberOfKeys -- count of scan keys (more than one won't + * necessarily do anything useful, yet). + * key -- the ScanKey for the starting position of the scan. + * + * Returns: + * An initialized IndexScanDesc. + * + * Side Effects: + * Bumps the ref count on the relation to keep it in the cache. + * + * ---------------- + */ +IndexScanDesc +RelationGetIndexScan(Relation relation, + bool scanFromEnd, + uint16 numberOfKeys, + ScanKey key) +{ + IndexScanDesc scan; + + if (! RelationIsValid(relation)) + elog(WARN, "RelationGetIndexScan: relation invalid"); + + scan = (IndexScanDesc) palloc(sizeof(IndexScanDescData)); + + scan->relation = relation; + scan->opaque = NULL; + scan->numberOfKeys = numberOfKeys; + + ItemPointerSetInvalid(&scan->previousItemData); + ItemPointerSetInvalid(&scan->currentItemData); + ItemPointerSetInvalid(&scan->nextItemData); + ItemPointerSetInvalid(&scan->previousMarkData); + ItemPointerSetInvalid(&scan->currentMarkData); + ItemPointerSetInvalid(&scan->nextMarkData); + + if (numberOfKeys > 0) { + scan->keyData = (ScanKey) palloc(sizeof(ScanKeyData) * numberOfKeys); + } else { + scan->keyData = NULL; + } + + index_rescan(scan, scanFromEnd, key); + + return (scan); +} + +/* ---------------- + * IndexScanRestart -- Restart an index scan. + * + * This routine isn't used by any existing access method. It's + * appropriate if relation level locks are what you want. + * + * Returns: + * None. + * + * Side Effects: + * None. + * ---------------- + */ +void +IndexScanRestart(IndexScanDesc scan, + bool scanFromEnd, + ScanKey key) +{ + if (! 
IndexScanIsValid(scan))
+	elog(WARN, "IndexScanRestart: invalid scan");
+
+    ItemPointerSetInvalid(&scan->previousItemData);
+    ItemPointerSetInvalid(&scan->currentItemData);
+    ItemPointerSetInvalid(&scan->nextItemData);
+
+    if (RelationGetNumberOfBlocks(scan->relation) == 0)
+	scan->flags = ScanUnmarked;
+    else if (scanFromEnd)
+	scan->flags = ScanUnmarked | ScanUncheckedPrevious;
+    else
+	scan->flags = ScanUnmarked | ScanUncheckedNext;
+
+    scan->scanFromEnd = (bool) scanFromEnd;
+
+    if (scan->numberOfKeys > 0)
+	memmove(scan->keyData,
+		key,
+		scan->numberOfKeys * sizeof(ScanKeyData));
+}
+
+/* ----------------
+ *	IndexScanEnd -- End an index scan.
+ *
+ *	This routine is not used by any existing access method, but is
+ *	suitable for use if you don't want to do sophisticated locking.
+ *
+ *	Returns:
+ *		None.
+ *
+ *	Side Effects:
+ *		None.
+ * ----------------
+ */
+void
+IndexScanEnd(IndexScanDesc scan)
+{
+    if (! IndexScanIsValid(scan))
+	elog(WARN, "IndexScanEnd: invalid scan");
+
+    pfree(scan);
+}
+
+/* ----------------
+ *	IndexScanMarkPosition -- Mark current position in a scan.
+ *
+ *	This routine isn't used by any existing access method, but is the
+ *	one that AM implementors should use, if they don't want to do any
+ *	special locking.  If relation-level locking is sufficient, this is
+ *	the routine for you.
+ *
+ *	Returns:
+ *		None.
+ *
+ *	Side Effects:
+ *		None.
+ * ----------------
+ */
+void
+IndexScanMarkPosition(IndexScanDesc scan)
+{
+    RetrieveIndexResult result;
+
+    if (scan->flags & ScanUncheckedPrevious) {
+	result =
+	    index_getnext(scan, BackwardScanDirection);
+
+	if (result != NULL) {
+	    scan->previousItemData = result->index_iptr;
+	} else {
+	    ItemPointerSetInvalid(&scan->previousItemData);
+	}
+
+    } else if (scan->flags & ScanUncheckedNext) {
+	result = (RetrieveIndexResult)
+	    index_getnext(scan, ForwardScanDirection);
+
+	if (result != NULL) {
+	    scan->nextItemData = result->index_iptr;
+	} else {
+	    ItemPointerSetInvalid(&scan->nextItemData);
+	}
+    }
+
+    scan->previousMarkData = scan->previousItemData;
+    scan->currentMarkData = scan->currentItemData;
+    scan->nextMarkData = scan->nextItemData;
+
+    scan->flags = 0x0;	/* XXX should have a symbolic name */
+}
+
+/* ----------------
+ *	IndexScanRestorePosition -- Restore position on a marked scan.
+ *
+ *	This routine isn't used by any existing access method, but is the
+ *	one that AM implementors should use if they don't want to do any
+ *	special locking.  If relation-level locking is sufficient, then
+ *	this is the one you want.
+ *
+ *	Returns:
+ *		None.
+ *
+ *	Side Effects:
+ *		None.
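+ *
+ *	A sketch of the intended mark/restore pairing (hypothetical
+ *	caller):
+ *
+ *		IndexScanMarkPosition(scan);
+ *		... advance the scan with index_getnext() ...
+ *		IndexScanRestorePosition(scan);
+ *
+ *	after which the scan stands exactly where it did at the mark.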
+ * ---------------- + */ +void +IndexScanRestorePosition(IndexScanDesc scan) +{ + if (scan->flags & ScanUnmarked) + elog(WARN, "IndexScanRestorePosition: no mark to restore"); + + scan->previousItemData = scan->previousMarkData; + scan->currentItemData = scan->currentMarkData; + scan->nextItemData = scan->nextMarkData; + + scan->flags = 0x0; /* XXX should have a symbolic name */ +} diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c new file mode 100644 index 00000000000..bffe3a41f3a --- /dev/null +++ b/src/backend/access/index/indexam.c @@ -0,0 +1,411 @@ +/*------------------------------------------------------------------------- + * + * indexam.c-- + * general index access method routines + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/index/indexam.c,v 1.1.1.1 1996/07/09 06:21:11 scrappy Exp $ + * + * INTERFACE ROUTINES + * index_open - open an index relation by relationId + * index_openr - open a index relation by name + * index_close - close a index relation + * index_beginscan - start a scan of an index + * index_rescan - restart a scan of an index + * index_endscan - end a scan + * index_insert - insert an index tuple into a relation + * index_delete - delete an item from an index relation + * index_markpos - mark a scan position + * index_restrpos - restore a scan position + * index_getnext - get the next tuple from a scan + * ** index_fetch - retrieve tuple with tid + * ** index_replace - replace a tuple + * ** index_getattr - get an attribute from an index tuple + * index_getprocid - get a support procedure id from the rel tuple + * + * IndexScanIsValid - check index scan + * + * NOTES + * This file contains the index_ routines which used + * to be a scattered collection of stuff in access/genam. + * + * The ** routines: index_fetch, index_replace, and index_getattr + * have not yet been implemented. They may not be needed. + * + * old comments + * Scans are implemented as follows: + * + * `0' represents an invalid item pointer. + * `-' represents an unknown item pointer. + * `X' represents a known item pointers. + * `+' represents known or invalid item pointers. + * `*' represents any item pointers. + * + * State is represented by a triple of these symbols in the order of + * previous, current, next. Note that the case of reverse scans works + * identically. + * + * State Result + * (1) + + - + 0 0 (if the next item pointer is invalid) + * (2) + X - (otherwise) + * (3) * 0 0 * 0 0 (no change) + * (4) + X 0 X 0 0 (shift) + * (5) * + X + X - (shift, add unknown) + * + * All other states cannot occur. + * + * Note: It would be possible to cache the status of the previous and + * next item pointer using the flags. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/attnum.h" +#include "access/genam.h" +#include "access/heapam.h" +#include "access/itup.h" +#include "access/relscan.h" +#include "access/sdir.h" +#include "access/skey.h" +#include "access/funcindex.h" + +#include "storage/lmgr.h" +#include "utils/elog.h" +#include "utils/palloc.h" +#include "utils/rel.h" +#include "utils/relcache.h" + +#include "catalog/catname.h" +#include "catalog/pg_attribute.h" +#include "catalog/pg_index.h" +#include "catalog/pg_proc.h" + +#include "catalog/index.h" + +#include "fmgr.h" + +/* ---------------- + * undefine macros we aren't going to use that would otherwise + * get in our way.. 
delete is defined in c.h and the am's are
+ *	defined in heapam.h
+ * ----------------
+ */
+#undef delete
+#undef aminsert
+#undef amdelete
+#undef ambeginscan
+#undef amrescan
+#undef amendscan
+#undef ammarkpos
+#undef amrestrpos
+#undef amgettuple
+
+/* ----------------------------------------------------------------
+ *	macros used in index_ routines
+ * ----------------------------------------------------------------
+ */
+#define RELATION_CHECKS \
+Assert(RelationIsValid(relation)); \
+    Assert(PointerIsValid(relation->rd_am))
+
+#define SCAN_CHECKS \
+    Assert(IndexScanIsValid(scan)); \
+    Assert(RelationIsValid(scan->relation)); \
+    Assert(PointerIsValid(scan->relation->rd_am))
+
+#define GET_REL_PROCEDURE(x,y) \
+    CppConcat(procedure = relation->rd_am->,y); \
+    if (! RegProcedureIsValid(procedure)) \
+	elog(WARN, "index_%s: invalid %s regproc", \
+	     CppAsString(x), CppAsString(y))
+
+#define GET_SCAN_PROCEDURE(x,y) \
+    CppConcat(procedure = scan->relation->rd_am->,y); \
+    if (! RegProcedureIsValid(procedure)) \
+	elog(WARN, "index_%s: invalid %s regproc", \
+	     CppAsString(x), CppAsString(y))
+
+
+/* ----------------------------------------------------------------
+ *		index_ interface functions
+ * ----------------------------------------------------------------
+ */
+/* ----------------
+ *	index_open - open an index relation by relationId
+ *
+ *	presently the relcache routines do all the work we need
+ *	to open/close index relations.
+ * ----------------
+ */
+Relation
+index_open(Oid relationId)
+{
+    return RelationIdGetRelation(relationId);
+}
+
+/* ----------------
+ *	index_openr - open an index relation by name
+ *
+ *	presently the relcache routines do all the work we need
+ *	to open/close index relations.
+ * ----------------
+ */
+Relation
+index_openr(char *relationName)
+{
+    return RelationNameGetRelation(relationName);
+}
+
+/* ----------------
+ *	index_close - close an index relation
+ *
+ *	presently the relcache routines do all the work we need
+ *	to open/close index relations.
+ * ----------------
+ */
+void
+index_close(Relation relation)
+{
+    (void) RelationClose(relation);
+}
+
+/* ----------------
+ *	index_insert - insert an index tuple into a relation
+ * ----------------
+ */
+InsertIndexResult
+index_insert(Relation relation,
+	     IndexTuple indexTuple)
+{
+    RegProcedure	procedure;
+    InsertIndexResult	specificResult;
+
+    RELATION_CHECKS;
+    GET_REL_PROCEDURE(insert,aminsert);
+
+    /* ----------------
+     *	have the am's insert proc do all the work.
+     * ----------------
+     */
+    specificResult = (InsertIndexResult)
+	fmgr(procedure, relation, indexTuple, NULL);
+
+    /* ----------------
+     *	the insert proc is supposed to return a "specific result" and
+     *	this routine has to return a "general result" so after we get
+     *	something back from the insert proc, we allocate a
+     *	"general result" and copy some crap between the two.
+     *
+     *	As far as I'm concerned all this result shit is needlessly
+     *	complicated and should be eliminated.  -cim 1/19/91
+     *
+     *	mao concurs.  regardless of how we feel here, however, it is
+     *	important to free memory we don't intend to return to anyone.
+     *	2/28/91
+     *
+     *	this "general result" crap is now gone. 
-ay 3/6/95 + * ---------------- + */ + + return (specificResult); +} + +/* ---------------- + * index_delete - delete an item from an index relation + * ---------------- + */ +void +index_delete(Relation relation, ItemPointer indexItem) +{ + RegProcedure procedure; + + RELATION_CHECKS; + GET_REL_PROCEDURE(delete,amdelete); + + (void) fmgr(procedure, relation, indexItem); +} + +/* ---------------- + * index_beginscan - start a scan of an index + * ---------------- + */ +IndexScanDesc +index_beginscan(Relation relation, + bool scanFromEnd, + uint16 numberOfKeys, + ScanKey key) +{ + IndexScanDesc scandesc; + RegProcedure procedure; + + RELATION_CHECKS; + GET_REL_PROCEDURE(beginscan,ambeginscan); + + RelationSetRIntentLock(relation); + + scandesc = (IndexScanDesc) + fmgr(procedure, relation, scanFromEnd, numberOfKeys, key); + + return scandesc; +} + +/* ---------------- + * index_rescan - restart a scan of an index + * ---------------- + */ +void +index_rescan(IndexScanDesc scan, bool scanFromEnd, ScanKey key) +{ + RegProcedure procedure; + + SCAN_CHECKS; + GET_SCAN_PROCEDURE(rescan,amrescan); + + (void) fmgr(procedure, scan, scanFromEnd, key); +} + +/* ---------------- + * index_endscan - end a scan + * ---------------- + */ +void +index_endscan(IndexScanDesc scan) +{ + RegProcedure procedure; + + SCAN_CHECKS; + GET_SCAN_PROCEDURE(endscan,amendscan); + + (void) fmgr(procedure, scan); + + RelationUnsetRIntentLock(scan->relation); +} + +/* ---------------- + * index_markpos - mark a scan position + * ---------------- + */ +void +index_markpos(IndexScanDesc scan) +{ + RegProcedure procedure; + + SCAN_CHECKS; + GET_SCAN_PROCEDURE(markpos,ammarkpos); + + (void) fmgr(procedure, scan); +} + +/* ---------------- + * index_restrpos - restore a scan position + * ---------------- + */ +void +index_restrpos(IndexScanDesc scan) +{ + RegProcedure procedure; + + SCAN_CHECKS; + GET_SCAN_PROCEDURE(restrpos,amrestrpos); + + (void) fmgr(procedure, scan); +} + +/* ---------------- + * index_getnext - get the next tuple from a scan + * + * A RetrieveIndexResult is a index tuple/heap tuple pair + * ---------------- + */ +RetrieveIndexResult +index_getnext(IndexScanDesc scan, + ScanDirection direction) +{ + RegProcedure procedure; + RetrieveIndexResult result; + + SCAN_CHECKS; + GET_SCAN_PROCEDURE(getnext,amgettuple); + + /* ---------------- + * have the am's gettuple proc do all the work. + * ---------------- + */ + result = (RetrieveIndexResult) + fmgr(procedure, scan, direction); + + return result; +} + +/* ---------------- + * index_getprocid + * + * Some indexed access methods may require support routines that are + * not in the operator class/operator model imposed by pg_am. These + * access methods may store the OIDs of registered procedures they + * need in pg_amproc. These registered procedure OIDs are ordered in + * a way that makes sense to the access method, and used only by the + * access method. The general index code doesn't know anything about + * the routines involved; it just builds an ordered list of them for + * each attribute on which an index is defined. + * + * This routine returns the requested procedure OID for a particular + * indexed attribute. 
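+ *
+ *	The OIDs are stored procnum-major in rd_support.  For example
+ *	(a sketch, assuming one support proc per attribute, as btree
+ *	uses), the ordering proc for attribute 2 of a two-attribute
+ *	btree index lives at
+ *
+ *		loc[(natts * (1 - 1)) + (2 - 1)] == loc[1]
+ *
+ *	and is fetched with
+ *
+ *		index_getprocid(irel, (AttrNumber) 2, BTORDER_PROC);
+ *
+ *	where BTORDER_PROC (defined as 1 in access/nbtree.h) names
+ *	btree's single comparison proc.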
+ * ---------------- + */ +RegProcedure +index_getprocid(Relation irel, + AttrNumber attnum, + uint16 procnum) +{ + RegProcedure *loc; + int natts; + + natts = irel->rd_rel->relnatts; + + loc = irel->rd_support; + + Assert(loc != NULL); + + return (loc[(natts * (procnum - 1)) + (attnum - 1)]); +} + +Datum +GetIndexValue(HeapTuple tuple, + TupleDesc hTupDesc, + int attOff, + AttrNumber attrNums[], + FuncIndexInfo *fInfo, + bool *attNull, + Buffer buffer) +{ + Datum returnVal; + bool isNull; + + if (PointerIsValid(fInfo) && FIgetProcOid(fInfo) != InvalidOid) { + int i; + Datum *attData = (Datum *)palloc(FIgetnArgs(fInfo)*sizeof(Datum)); + + for (i = 0; i < FIgetnArgs(fInfo); i++) { + attData[i] = (Datum) heap_getattr(tuple, + buffer, + attrNums[i], + hTupDesc, + attNull); + } + returnVal = (Datum)fmgr_array_args(FIgetProcOid(fInfo), + FIgetnArgs(fInfo), + (char **) attData, + &isNull); + pfree(attData); + *attNull = FALSE; + }else { + returnVal = (Datum) heap_getattr(tuple, buffer, attrNums[attOff], + hTupDesc, attNull); + } + return returnVal; +} diff --git a/src/backend/access/index/istrat.c b/src/backend/access/index/istrat.c new file mode 100644 index 00000000000..602d2bd9e94 --- /dev/null +++ b/src/backend/access/index/istrat.c @@ -0,0 +1,679 @@ +/*------------------------------------------------------------------------- + * + * istrat.c-- + * index scan strategy manipulation code and index strategy manipulation + * operator code. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/index/Attic/istrat.c,v 1.1.1.1 1996/07/09 06:21:11 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/attnum.h" +#include "access/heapam.h" +#include "access/istrat.h" +#include "access/itup.h" /* for MaxIndexAttributeNumber */ +#include "access/skey.h" +#include "utils/tqual.h" /* for NowTimeQual */ + +#include "fmgr.h" +#include "utils/elog.h" +#include "utils/rel.h" + +#include "catalog/catname.h" +#include "catalog/pg_amop.h" +#include "catalog/pg_amproc.h" +#include "catalog/pg_index.h" +#include "catalog/pg_proc.h" + +/* ---------------------------------------------------------------- + * misc strategy support routines + * ---------------------------------------------------------------- + */ + +/* + * StrategyNumberIsValid + * StrategyNumberIsInBounds + * StrategyMapIsValid + * StrategyTransformMapIsValid + * IndexStrategyIsValid + * + * ... are now macros in istrat.h -cim 4/27/91 + */ + +/* + * StrategyMapGetScanKeyEntry -- + * Returns a scan key entry of a index strategy mapping member. + * + * Note: + * Assumes that the index strategy mapping is valid. + * Assumes that the index strategy number is valid. + * Bounds checking should be done outside this routine. + */ +ScanKey +StrategyMapGetScanKeyEntry(StrategyMap map, + StrategyNumber strategyNumber) +{ + Assert(StrategyMapIsValid(map)); + Assert(StrategyNumberIsValid(strategyNumber)); + return (&map->entry[strategyNumber - 1]); +} + +/* + * IndexStrategyGetStrategyMap -- + * Returns an index strategy mapping of an index strategy. + * + * Note: + * Assumes that the index strategy is valid. + * Assumes that the number of index strategies is valid. + * Bounds checking should be done outside this routine. 
+ */ +StrategyMap +IndexStrategyGetStrategyMap(IndexStrategy indexStrategy, + StrategyNumber maxStrategyNum, + AttrNumber attrNum) +{ + Assert(IndexStrategyIsValid(indexStrategy)); + Assert(StrategyNumberIsValid(maxStrategyNum)); + Assert(AttributeNumberIsValid(attrNum)); + + maxStrategyNum = AMStrategies(maxStrategyNum); /* XXX */ + return + &indexStrategy->strategyMapData[maxStrategyNum * (attrNum - 1)]; +} + +/* + * AttributeNumberGetIndexStrategySize -- + * Computes the size of an index strategy. + */ +Size +AttributeNumberGetIndexStrategySize(AttrNumber maxAttributeNumber, + StrategyNumber maxStrategyNumber) +{ + maxStrategyNumber = AMStrategies(maxStrategyNumber); /* XXX */ + return + maxAttributeNumber * maxStrategyNumber * sizeof (ScanKeyData); +} + +/* + * StrategyTransformMapIsValid is now a macro in istrat.h -cim 4/27/91 + */ + +/* ---------------- + * StrategyOperatorIsValid + * ---------------- + */ +bool +StrategyOperatorIsValid(StrategyOperator operator, + StrategyNumber maxStrategy) +{ + return (bool) + (PointerIsValid(operator) && + StrategyNumberIsInBounds(operator->strategy, maxStrategy) && + !(operator->flags & ~(SK_NEGATE | SK_COMMUTE))); +} + +/* ---------------- + * StrategyTermIsValid + * ---------------- + */ +bool +StrategyTermIsValid(StrategyTerm term, + StrategyNumber maxStrategy) +{ + Index index; + + if (! PointerIsValid(term) || term->degree == 0) + return false; + + for (index = 0; index < term->degree; index += 1) { + if (! StrategyOperatorIsValid(&term->operatorData[index], + maxStrategy)) { + + return false; + } + } + + return true; +} + +/* ---------------- + * StrategyExpressionIsValid + * ---------------- + */ +bool +StrategyExpressionIsValid(StrategyExpression expression, + StrategyNumber maxStrategy) +{ + StrategyTerm *termP; + + if (!PointerIsValid(expression)) + return true; + + if (!StrategyTermIsValid(expression->term[0], maxStrategy)) + return false; + + termP = &expression->term[1]; + while (StrategyTermIsValid(*termP, maxStrategy)) + termP += 1; + + return (bool) + (! PointerIsValid(*termP)); +} + +/* ---------------- + * StrategyEvaluationIsValid + * ---------------- + */ +bool +StrategyEvaluationIsValid(StrategyEvaluation evaluation) +{ + Index index; + + if (! PointerIsValid(evaluation) || + ! StrategyNumberIsValid(evaluation->maxStrategy) || + ! StrategyTransformMapIsValid(evaluation->negateTransform) || + ! StrategyTransformMapIsValid(evaluation->commuteTransform) || + ! StrategyTransformMapIsValid(evaluation->negateCommuteTransform)) { + + return false; + } + + for (index = 0; index < evaluation->maxStrategy; index += 1) { + if (! 
StrategyExpressionIsValid(evaluation->expression[index], + evaluation->maxStrategy)) { + + return false; + } + } + return true; +} + +/* ---------------- + * StrategyTermEvaluate + * ---------------- + */ +static bool +StrategyTermEvaluate(StrategyTerm term, + StrategyMap map, + Datum left, + Datum right) +{ + Index index; + long tmpres; + bool result; + StrategyOperator operator; + ScanKey entry; + + for (index = 0, operator = &term->operatorData[0]; + index < term->degree; index += 1, operator += 1) { + + entry = &map->entry[operator->strategy - 1]; + + Assert(RegProcedureIsValid(entry->sk_procedure)); + + switch (operator->flags ^ entry->sk_flags) { + case 0x0: + tmpres = (long) FMGR_PTR2(entry->sk_func, entry->sk_procedure, + left, right); + break; + + case SK_NEGATE: + tmpres = (long) !FMGR_PTR2(entry->sk_func, entry->sk_procedure, + left, right); + break; + + case SK_COMMUTE: + tmpres = (long) FMGR_PTR2(entry->sk_func, entry->sk_procedure, + right, left); + break; + + case SK_NEGATE | SK_COMMUTE: + tmpres = (long) !FMGR_PTR2(entry->sk_func, entry->sk_procedure, + right, left); + break; + + default: + elog(FATAL, "StrategyTermEvaluate: impossible case %d", + operator->flags ^ entry->sk_flags); + } + + result = (bool) tmpres; + if (!result) + return result; + } + + return result; +} + + +/* ---------------- + * RelationGetStrategy + * ---------------- + */ +StrategyNumber +RelationGetStrategy(Relation relation, + AttrNumber attributeNumber, + StrategyEvaluation evaluation, + RegProcedure procedure) +{ + StrategyNumber strategy; + StrategyMap strategyMap; + ScanKey entry; + Index index; + int numattrs; + + Assert(RelationIsValid(relation)); + numattrs = RelationGetNumberOfAttributes(relation); + + Assert(relation->rd_rel->relkind == RELKIND_INDEX); /* XXX use accessor */ + Assert(AttributeNumberIsValid(attributeNumber)); + Assert( (attributeNumber >= 1) && (attributeNumber < 1 + numattrs)); + + Assert(StrategyEvaluationIsValid(evaluation)); + Assert(RegProcedureIsValid(procedure)); + + strategyMap = + IndexStrategyGetStrategyMap(RelationGetIndexStrategy(relation), + evaluation->maxStrategy, + attributeNumber); + + /* get a strategy number for the procedure ignoring flags for now */ + for (index = 0; index < evaluation->maxStrategy; index += 1) { + if (strategyMap->entry[index].sk_procedure == procedure) { + break; + } + } + + if (index == evaluation->maxStrategy) + return InvalidStrategy; + + strategy = 1 + index; + entry = StrategyMapGetScanKeyEntry(strategyMap, strategy); + + Assert(!(entry->sk_flags & ~(SK_NEGATE | SK_COMMUTE))); + + switch (entry->sk_flags & (SK_NEGATE | SK_COMMUTE)) { + case 0x0: + return strategy; + + case SK_NEGATE: + strategy = evaluation->negateTransform->strategy[strategy - 1]; + break; + + case SK_COMMUTE: + strategy = evaluation->commuteTransform->strategy[strategy - 1]; + break; + + case SK_NEGATE | SK_COMMUTE: + strategy = evaluation->negateCommuteTransform->strategy[strategy - 1]; + break; + + default: + elog(FATAL, "RelationGetStrategy: impossible case %d", entry->sk_flags); + } + + + if (! StrategyNumberIsInBounds(strategy, evaluation->maxStrategy)) { + if (! 
StrategyNumberIsValid(strategy)) { + elog(WARN, "RelationGetStrategy: corrupted evaluation"); + } + } + + return strategy; +} + +/* ---------------- + * RelationInvokeStrategy + * ---------------- + */ +bool /* XXX someday, this may return Datum */ +RelationInvokeStrategy(Relation relation, + StrategyEvaluation evaluation, + AttrNumber attributeNumber, + StrategyNumber strategy, + Datum left, + Datum right) +{ + StrategyNumber newStrategy; + StrategyMap strategyMap; + ScanKey entry; + StrategyTermData termData; + int numattrs; + + Assert(RelationIsValid(relation)); + Assert(relation->rd_rel->relkind == RELKIND_INDEX); /* XXX use accessor */ + numattrs = RelationGetNumberOfAttributes(relation); + + Assert(StrategyEvaluationIsValid(evaluation)); + Assert(AttributeNumberIsValid(attributeNumber)); + Assert( (attributeNumber >= 1) && (attributeNumber < 1 + numattrs)); + + Assert(StrategyNumberIsInBounds(strategy, evaluation->maxStrategy)); + + termData.degree = 1; + + strategyMap = + IndexStrategyGetStrategyMap(RelationGetIndexStrategy(relation), + evaluation->maxStrategy, + attributeNumber); + + entry = StrategyMapGetScanKeyEntry(strategyMap, strategy); + + if (RegProcedureIsValid(entry->sk_procedure)) { + termData.operatorData[0].strategy = strategy; + termData.operatorData[0].flags = 0x0; + + return + StrategyTermEvaluate(&termData, strategyMap, left, right); + } + + + newStrategy = evaluation->negateTransform->strategy[strategy - 1]; + if (newStrategy != strategy && StrategyNumberIsValid(newStrategy)) { + + entry = StrategyMapGetScanKeyEntry(strategyMap, newStrategy); + + if (RegProcedureIsValid(entry->sk_procedure)) { + termData.operatorData[0].strategy = newStrategy; + termData.operatorData[0].flags = SK_NEGATE; + + return + StrategyTermEvaluate(&termData, strategyMap, left, right); + } + } + + newStrategy = evaluation->commuteTransform->strategy[strategy - 1]; + if (newStrategy != strategy && StrategyNumberIsValid(newStrategy)) { + + entry = StrategyMapGetScanKeyEntry(strategyMap, newStrategy); + + if (RegProcedureIsValid(entry->sk_procedure)) { + termData.operatorData[0].strategy = newStrategy; + termData.operatorData[0].flags = SK_COMMUTE; + + return + StrategyTermEvaluate(&termData, strategyMap, left, right); + } + } + + newStrategy = evaluation->negateCommuteTransform->strategy[strategy - 1]; + if (newStrategy != strategy && StrategyNumberIsValid(newStrategy)) { + + entry = StrategyMapGetScanKeyEntry(strategyMap, newStrategy); + + if (RegProcedureIsValid(entry->sk_procedure)) { + termData.operatorData[0].strategy = newStrategy; + termData.operatorData[0].flags = SK_NEGATE | SK_COMMUTE; + + return + StrategyTermEvaluate(&termData, strategyMap, left, right); + } + } + + if (PointerIsValid(evaluation->expression[strategy - 1])) { + StrategyTerm *termP; + + termP = &evaluation->expression[strategy - 1]->term[0]; + while (PointerIsValid(*termP)) { + Index index; + + for (index = 0; index < (*termP)->degree; index += 1) { + entry = StrategyMapGetScanKeyEntry(strategyMap, + (*termP)->operatorData[index].strategy); + + if (! 
RegProcedureIsValid(entry->sk_procedure)) { + break; + } + } + + if (index == (*termP)->degree) { + return + StrategyTermEvaluate(*termP, strategyMap, left, right); + } + + termP += 1; + } + } + + elog(WARN, "RelationInvokeStrategy: cannot evaluate strategy %d", + strategy); + + /* not reached, just to make compiler happy */ + return FALSE; + + +} + +/* ---------------- + * OperatorRelationFillScanKeyEntry + * ---------------- + */ +static void +OperatorRelationFillScanKeyEntry(Relation operatorRelation, + Oid operatorObjectId, + ScanKey entry) +{ + HeapScanDesc scan; + ScanKeyData scanKeyData; + HeapTuple tuple; + + ScanKeyEntryInitialize(&scanKeyData, 0, + ObjectIdAttributeNumber, + ObjectIdEqualRegProcedure, + ObjectIdGetDatum(operatorObjectId)); + + scan = heap_beginscan(operatorRelation, false, NowTimeQual, + 1, &scanKeyData); + + tuple = heap_getnext(scan, false, (Buffer *)NULL); + if (! HeapTupleIsValid(tuple)) { + elog(WARN, "OperatorObjectIdFillScanKeyEntry: unknown operator %lu", + (uint32) operatorObjectId); + } + + entry->sk_flags = 0; + entry->sk_procedure = + ((OperatorTupleForm) GETSTRUCT(tuple))->oprcode; + fmgr_info(entry->sk_procedure, &entry->sk_func, &entry->sk_nargs); + + if (! RegProcedureIsValid(entry->sk_procedure)) { + elog(WARN, + "OperatorObjectIdFillScanKeyEntry: no procedure for operator %lu", + (uint32) operatorObjectId); + } + + heap_endscan(scan); +} + + +/* + * IndexSupportInitialize -- + * Initializes an index strategy and associated support procedures. + */ +void +IndexSupportInitialize(IndexStrategy indexStrategy, + RegProcedure *indexSupport, + Oid indexObjectId, + Oid accessMethodObjectId, + StrategyNumber maxStrategyNumber, + StrategyNumber maxSupportNumber, + AttrNumber maxAttributeNumber) +{ + Relation relation; + Relation operatorRelation; + HeapScanDesc scan; + HeapTuple tuple; + ScanKeyData entry[2]; + StrategyMap map; + AttrNumber attributeNumber; + int attributeIndex; + Oid operatorClassObjectId[ MaxIndexAttributeNumber ]; + + maxStrategyNumber = AMStrategies(maxStrategyNumber); + + ScanKeyEntryInitialize(&entry[0], 0, Anum_pg_index_indexrelid, + ObjectIdEqualRegProcedure, + ObjectIdGetDatum(indexObjectId)); + + relation = heap_openr(IndexRelationName); + scan = heap_beginscan(relation, false, NowTimeQual, 1, entry); + tuple = heap_getnext(scan, 0, (Buffer *)NULL); + if (! HeapTupleIsValid(tuple)) + elog(WARN, "IndexSupportInitialize: corrupted catalogs"); + + /* + * XXX note that the following assumes the INDEX tuple is well formed and + * that the key[] and class[] are 0 terminated. 
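+     *
+     *	E.g. (a sketch) for a two-column index the expectation is
+     *
+     *		indkey[]   = { 2, 7, 0, ... }
+     *		indclass[] = { <opclass oid>, <opclass oid>, 0, ... }
+     *
+     *	so the loop below stops at the first invalid (zero) entry.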
+ */ + for (attributeIndex=0; attributeIndex<maxAttributeNumber; attributeIndex++) + { + IndexTupleForm iform; + + iform = (IndexTupleForm) GETSTRUCT(tuple); + + if (!OidIsValid(iform->indkey[attributeIndex])) { + if (attributeIndex == 0) { + elog(WARN, "IndexSupportInitialize: no pg_index tuple"); + } + break; + } + + operatorClassObjectId[attributeIndex] + = iform->indclass[attributeIndex]; + } + + heap_endscan(scan); + heap_close(relation); + + /* if support routines exist for this access method, load them */ + if (maxSupportNumber > 0) { + + ScanKeyEntryInitialize(&entry[0], 0, Anum_pg_amproc_amid, + ObjectIdEqualRegProcedure, + ObjectIdGetDatum(accessMethodObjectId)); + + ScanKeyEntryInitialize(&entry[1], 0, Anum_pg_amproc_amopclaid, + ObjectIdEqualRegProcedure, 0); + +/* relation = heap_openr(Name_pg_amproc); */ + relation = heap_openr(AccessMethodProcedureRelationName); + + + for (attributeNumber = maxAttributeNumber; attributeNumber > 0; + attributeNumber--) { + + int16 support; + Form_pg_amproc form; + RegProcedure *loc; + + loc = &indexSupport[((attributeNumber - 1) * maxSupportNumber)]; + + for (support = maxSupportNumber; --support >= 0; ) { + loc[support] = InvalidOid; + } + + entry[1].sk_argument = + ObjectIdGetDatum(operatorClassObjectId[attributeNumber - 1]); + + scan = heap_beginscan(relation, false, NowTimeQual, 2, entry); + + while (tuple = heap_getnext(scan, 0, (Buffer *)NULL), + HeapTupleIsValid(tuple)) { + + form = (Form_pg_amproc) GETSTRUCT(tuple); + loc[(form->amprocnum - 1)] = form->amproc; + } + + heap_endscan(scan); + } + heap_close(relation); + } + + ScanKeyEntryInitialize(&entry[0], 0, + Anum_pg_amop_amopid, + ObjectIdEqualRegProcedure, + ObjectIdGetDatum(accessMethodObjectId)); + + ScanKeyEntryInitialize(&entry[1], 0, + Anum_pg_amop_amopclaid, + ObjectIdEqualRegProcedure, 0); + + relation = heap_openr(AccessMethodOperatorRelationName); + operatorRelation = heap_openr(OperatorRelationName); + + for (attributeNumber = maxAttributeNumber; attributeNumber > 0; + attributeNumber--) { + + StrategyNumber strategy; + + entry[1].sk_argument = + ObjectIdGetDatum(operatorClassObjectId[attributeNumber - 1]); + + map = IndexStrategyGetStrategyMap(indexStrategy, + maxStrategyNumber, + attributeNumber); + + for (strategy = 1; strategy <= maxStrategyNumber; strategy++) + ScanKeyEntrySetIllegal(StrategyMapGetScanKeyEntry(map, strategy)); + + scan = heap_beginscan(relation, false, NowTimeQual, 2, entry); + + while (tuple = heap_getnext(scan, 0, (Buffer *)NULL), + HeapTupleIsValid(tuple)) { + Form_pg_amop form; + + form = (Form_pg_amop) GETSTRUCT(tuple); + + OperatorRelationFillScanKeyEntry(operatorRelation, + form->amopopr, + StrategyMapGetScanKeyEntry(map, form->amopstrategy)); + } + + heap_endscan(scan); + } + + heap_close(operatorRelation); + heap_close(relation); +} + +/* ---------------- + * IndexStrategyDisplay + * ---------------- + */ +#ifdef ISTRATDEBUG +int +IndexStrategyDisplay(IndexStrategy indexStrategy, + StrategyNumber numberOfStrategies, + int numberOfAttributes) +{ + StrategyMap strategyMap; + AttrNumber attributeNumber; + StrategyNumber strategyNumber; + + for (attributeNumber = 1; attributeNumber <= numberOfAttributes; + attributeNumber += 1) { + + strategyMap = IndexStrategyGetStrategyMap(indexStrategy, + numberOfStrategies, + attributeNumber); + + for (strategyNumber = 1; + strategyNumber <= AMStrategies(numberOfStrategies); + strategyNumber += 1) { + + printf(":att %d\t:str %d\t:opr 0x%x(%d)\n", + attributeNumber, strategyNumber, + 
strategyMap->entry[strategyNumber - 1].sk_procedure, + strategyMap->entry[strategyNumber - 1].sk_procedure); + } + } +} +#endif /* defined(ISTRATDEBUG) */ + + diff --git a/src/backend/access/iqual.h b/src/backend/access/iqual.h new file mode 100644 index 00000000000..5fab98a15bd --- /dev/null +++ b/src/backend/access/iqual.h @@ -0,0 +1,32 @@ +/*------------------------------------------------------------------------- + * + * iqual.h-- + * Index scan key qualification definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: iqual.h,v 1.1.1.1 1996/07/09 06:21:08 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef IQUAL_H +#define IQUAL_H + +#include "c.h" + +#include "storage/itemid.h" +#include "utils/rel.h" +#include "access/skey.h" + +/* ---------------- + * index tuple qualification support + * ---------------- + */ + +extern int NIndexTupleProcessed; + +extern bool index_keytest(IndexTuple tuple, TupleDesc tupdesc, + int scanKeySize, ScanKey key); + +#endif /* IQUAL_H */ diff --git a/src/backend/access/istrat.h b/src/backend/access/istrat.h new file mode 100644 index 00000000000..201e70e6602 --- /dev/null +++ b/src/backend/access/istrat.h @@ -0,0 +1,80 @@ +/*------------------------------------------------------------------------- + * + * istrat.h-- + * POSTGRES index strategy definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: istrat.h,v 1.1.1.1 1996/07/09 06:21:08 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef ISTRAT_H +#define ISTRAT_H + +#include "postgres.h" +#include "access/attnum.h" +#include "access/skey.h" +#include "access/strat.h" +#include "utils/rel.h" /* for Relation */ + +/* + * StrategyNumberIsValid -- + * True iff the strategy number is valid. + */ +#define StrategyNumberIsValid(strategyNumber) \ + ((bool) ((strategyNumber) != InvalidStrategy)) + +/* + * StrategyNumberIsInBounds -- + * True iff strategy number is within given bounds. + * + * Note: + * Assumes StrategyNumber is an unsigned type. + * Assumes the bounded interval to be (0,max]. + */ +#define StrategyNumberIsInBounds(strategyNumber, maxStrategyNumber) \ + ((bool)(InvalidStrategy < (strategyNumber) && \ + (strategyNumber) <= (maxStrategyNumber))) + +/* + * StrategyMapIsValid -- + * True iff the index strategy mapping is valid. + */ +#define StrategyMapIsValid(map) PointerIsValid(map) + +/* + * IndexStrategyIsValid -- + * True iff the index strategy is valid. 
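+ *
+ * These validity macros are typically used as sanity guards, e.g.
+ * (a sketch, mirroring their use in istrat.c):
+ *
+ *	Assert(IndexStrategyIsValid(indexStrategy));
+ *	Assert(StrategyNumberIsValid(maxStrategyNum));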
+ */ +#define IndexStrategyIsValid(s) PointerIsValid(s) + +extern ScanKey StrategyMapGetScanKeyEntry(StrategyMap map, + StrategyNumber strategyNumber); +extern StrategyMap IndexStrategyGetStrategyMap(IndexStrategy indexStrategy, + StrategyNumber maxStrategyNum, AttrNumber attrNum); + +extern Size +AttributeNumberGetIndexStrategySize(AttrNumber maxAttributeNumber, + StrategyNumber maxStrategyNumber); +extern bool StrategyOperatorIsValid(StrategyOperator operator, + StrategyNumber maxStrategy); +extern bool StrategyTermIsValid(StrategyTerm term, + StrategyNumber maxStrategy); +extern bool StrategyExpressionIsValid(StrategyExpression expression, + StrategyNumber maxStrategy); +extern bool StrategyEvaluationIsValid(StrategyEvaluation evaluation); +extern StrategyNumber RelationGetStrategy(Relation relation, + AttrNumber attributeNumber, StrategyEvaluation evaluation, + RegProcedure procedure); +extern bool RelationInvokeStrategy(Relation relation, + StrategyEvaluation evaluation, AttrNumber attributeNumber, + StrategyNumber strategy, Datum left, Datum right); +extern void IndexSupportInitialize(IndexStrategy indexStrategy, + RegProcedure *indexSupport, Oid indexObjectId, + Oid accessMethodObjectId, StrategyNumber maxStrategyNumber, + StrategyNumber maxSupportNumber, AttrNumber maxAttributeNumber); + + +#endif /* ISTRAT_H */ diff --git a/src/backend/access/itup.h b/src/backend/access/itup.h new file mode 100644 index 00000000000..028bf430b0d --- /dev/null +++ b/src/backend/access/itup.h @@ -0,0 +1,104 @@ +/*------------------------------------------------------------------------- + * + * itup.h-- + * POSTGRES index tuple definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: itup.h,v 1.1.1.1 1996/07/09 06:21:08 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef ITUP_H +#define ITUP_H + +#include "c.h" +#include "access/ibit.h" +#include "access/tupdesc.h" /* for TupleDesc */ +#include "storage/itemptr.h" + +#define MaxIndexAttributeNumber 7 + +typedef struct IndexTupleData { + ItemPointerData t_tid; /* reference TID to base tuple */ + + /* + * t_info is layed out in the following fashion: + * + * 15th (leftmost) bit: "has nulls" bit + * 14th bit: "has varlenas" bit + * 13th bit: "has rules" bit - (removed ay 11/94) + * bits 12-0 bit: size of tuple. + */ + + unsigned short t_info; /* various info about tuple */ + + /* + * please make sure sizeof(IndexTupleData) is MAXALIGN'ed. + * See IndexInfoFindDataOffset() for the reason. 
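+ *
+ * The size and flag bits are meant to be read through the masks
+ * defined below, e.g. (a sketch):
+ *
+ *	Size size    = ((IndexTuple) itup)->t_info & INDEX_SIZE_MASK;
+ *	bool hasnull = (((IndexTuple) itup)->t_info & INDEX_NULL_MASK) != 0;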
+ */ + +} IndexTupleData; /* MORE DATA FOLLOWS AT END OF STRUCT */ + +typedef IndexTupleData *IndexTuple; + + +typedef struct InsertIndexResultData { + ItemPointerData pointerData; +} InsertIndexResultData; + +typedef InsertIndexResultData *InsertIndexResult; + + +typedef struct RetrieveIndexResultData { + ItemPointerData index_iptr; + ItemPointerData heap_iptr; +} RetrieveIndexResultData; + +typedef RetrieveIndexResultData *RetrieveIndexResult; + + +/*----------------- + * PredInfo - + * used for partial indices + *----------------- + */ +typedef struct PredInfo { + Node *pred; + Node *oldPred; +} PredInfo; + + +/* ---------------- + * externs + * ---------------- + */ + +#define INDEX_SIZE_MASK 0x1FFF +#define INDEX_NULL_MASK 0x8000 +#define INDEX_VAR_MASK 0x4000 + +#define IndexTupleSize(itup) (((IndexTuple) (itup))->t_info & 0x1FFF) +#define IndexTupleDSize(itup) ((itup).t_info & 0x1FFF) +#define IndexTupleNoNulls(itup) (!(((IndexTuple) (itup))->t_info & 0x8000)) +#define IndexTupleAllFixed(itup) (!(((IndexTuple) (itup))->t_info & 0x4000)) + +#define IndexTupleHasMinHeader(itup) (IndexTupleNoNulls(itup)) + + +/* indextuple.h */ +extern IndexTuple index_formtuple(TupleDesc tupleDescriptor, + Datum value[], char null[]); +extern char *fastgetiattr(IndexTuple tup, int attnum, + TupleDesc att, bool *isnull); +extern Datum index_getattr(IndexTuple tuple, AttrNumber attNum, + TupleDesc tupDesc, bool *isNullOutP); +extern RetrieveIndexResult +FormRetrieveIndexResult(ItemPointer indexItemPointer, + ItemPointer heapItemPointer); +extern void CopyIndexTuple(IndexTuple source, IndexTuple *target); + + +#endif /* ITUP_H */ + diff --git a/src/backend/access/nbtree.h b/src/backend/access/nbtree.h new file mode 100644 index 00000000000..d5c37a23950 --- /dev/null +++ b/src/backend/access/nbtree.h @@ -0,0 +1,264 @@ +/*------------------------------------------------------------------------- + * + * nbtree.h-- + * header file for postgres btree access method implementation. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: nbtree.h,v 1.1.1.1 1996/07/09 06:21:08 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef NBTREE_H +#define NBTREE_H + +#include "access/attnum.h" +#include "access/itup.h" +#include "access/htup.h" +#include "access/tupdesc.h" + +#include "access/istrat.h" +#include "access/funcindex.h" +#include "access/relscan.h" +#include "access/sdir.h" +#include "nodes/pg_list.h" + +/* + * BTPageOpaqueData -- At the end of every page, we store a pointer + * to both siblings in the tree. See Lehman and Yao's paper for more + * info. In addition, we need to know what sort of page this is + * (leaf or internal), and whether the page is available for reuse. + * + * Lehman and Yao's algorithm requires a ``high key'' on every page. + * The high key on a page is guaranteed to be greater than or equal + * to any key that appears on this page. Our insertion algorithm + * guarantees that we can use the initial least key on our right + * sibling as the high key. We allocate space for the line pointer + * to the high key in the opaque data at the end of the page. + * + * Rightmost pages in the tree have no high key. 
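+ *
+ * A reader's sketch (PageGetSpecialPointer comes from storage/bufpage.h;
+ * P_RIGHTMOST, P_HIKEY and P_FIRSTKEY are defined later in this file):
+ *
+ *	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ *	OffsetNumber first = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;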
+ */ + +typedef struct BTPageOpaqueData { + BlockNumber btpo_prev; + BlockNumber btpo_next; + uint16 btpo_flags; + +#define BTP_LEAF (1 << 0) +#define BTP_ROOT (1 << 1) +#define BTP_FREE (1 << 2) +#define BTP_META (1 << 3) + +} BTPageOpaqueData; + +typedef BTPageOpaqueData *BTPageOpaque; + +/* + * ScanOpaqueData is used to remember which buffers we're currently + * examining in the scan. We keep these buffers locked and pinned + * and recorded in the opaque entry of the scan in order to avoid + * doing a ReadBuffer() for every tuple in the index. This avoids + * semop() calls, which are expensive. + */ + +typedef struct BTScanOpaqueData { + Buffer btso_curbuf; + Buffer btso_mrkbuf; +} BTScanOpaqueData; + +typedef BTScanOpaqueData *BTScanOpaque; + +/* + * BTItems are what we store in the btree. Each item has an index + * tuple, including key and pointer values. In addition, we must + * guarantee that all tuples in the index are unique, in order to + * satisfy some assumptions in Lehman and Yao. The way that we do + * this is by generating a new OID for every insertion that we do in + * the tree. This adds eight bytes to the size of btree index + * tuples. Note that we do not use the OID as part of a composite + * key; the OID only serves as a unique identifier for a given index + * tuple (logical position within a page). + */ + +typedef struct BTItemData { + Oid bti_oid; + int32 bti_dummy; /* padding to make bti_itup + * align at 8-byte boundary + */ + IndexTupleData bti_itup; +} BTItemData; + +typedef BTItemData *BTItem; + +/* + * BTStackData -- As we descend a tree, we push the (key, pointer) + * pairs from internal nodes onto a private stack. If we split a + * leaf, we use this stack to walk back up the tree and insert data + * into parent nodes (and possibly to split them, too). Lehman and + * Yao's update algorithm guarantees that under no circumstances can + * our private stack give us an irredeemably bad picture up the tree. + * Again, see the paper for details. + */ + +typedef struct BTStackData { + BlockNumber bts_blkno; + OffsetNumber bts_offset; + BTItem bts_btitem; + struct BTStackData *bts_parent; +} BTStackData; + +typedef BTStackData *BTStack; + +/* + * We need to be able to tell the difference between read and write + * requests for pages, in order to do locking correctly. + */ + +#define BT_READ 0 +#define BT_WRITE 1 + +/* + * Similarly, the difference between insertion and non-insertion binary + * searches on a given page makes a difference when we're descending the + * tree. + */ + +#define BT_INSERTION 0 +#define BT_DESCENT 1 + +/* + * In general, the btree code tries to localize its knowledge about + * page layout to a couple of routines. However, we need a special + * value to indicate "no page number" in those places where we expect + * page numbers. + */ + +#define P_NONE 0 +#define P_LEFTMOST(opaque) ((opaque)->btpo_prev == P_NONE) +#define P_RIGHTMOST(opaque) ((opaque)->btpo_next == P_NONE) + +#define P_HIKEY ((OffsetNumber) 1) +#define P_FIRSTKEY ((OffsetNumber) 2) + +/* + * Strategy numbers -- ordering of these is <, <=, =, >=, > + */ + +#define BTLessStrategyNumber 1 +#define BTLessEqualStrategyNumber 2 +#define BTEqualStrategyNumber 3 +#define BTGreaterEqualStrategyNumber 4 +#define BTGreaterStrategyNumber 5 +#define BTMaxStrategyNumber 5 + +/* + * When a new operator class is declared, we require that the user + * supply us with an amproc procedure for determining whether, for + * two keys a and b, a < b, a = b, or a > b. 
This routine must + * return < 0, 0, > 0, respectively, in these three cases. Since we + * only have one such proc in amproc, it's number 1. + */ + +#define BTORDER_PROC 1 + + +/* + * prototypes for functions in nbtinsert.c + */ +extern InsertIndexResult _bt_doinsert(Relation rel, BTItem btitem); +extern bool _bt_itemcmp(Relation rel, Size keysz, BTItem item1, BTItem item2, + StrategyNumber strat); + +/* + * prototypes for functions in nbtpage.c + */ +extern void _bt_metapinit(Relation rel); +extern void _bt_checkmeta(Relation rel); +extern Buffer _bt_getroot(Relation rel, int access); +extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access); +extern void _bt_relbuf(Relation rel, Buffer buf, int access); +extern void _bt_wrtbuf(Relation rel, Buffer buf); +extern void _bt_wrtnorelbuf(Relation rel, Buffer buf); +extern void _bt_pageinit(Page page, Size size); +extern void _bt_metaproot(Relation rel, BlockNumber rootbknum); +extern Buffer _bt_getstackbuf(Relation rel, BTStack stack, int access); +extern void _bt_setpagelock(Relation rel, BlockNumber blkno, int access); +extern void _bt_unsetpagelock(Relation rel, BlockNumber blkno, int access); +extern void _bt_pagedel(Relation rel, ItemPointer tid); + +/* + * prototypes for functions in nbtree.c + */ +extern bool BuildingBtree; /* in nbtree.c */ + +extern void btbuild(Relation heap, Relation index, int natts, + AttrNumber *attnum, IndexStrategy istrat, uint16 pcount, + Datum *params, FuncIndexInfo *finfo, PredInfo *predInfo); +extern InsertIndexResult btinsert(Relation rel, IndexTuple itup); +extern char *btgettuple(IndexScanDesc scan, ScanDirection dir); +extern char *btbeginscan(Relation rel, bool fromEnd, uint16 keysz, + ScanKey scankey); + +extern void btrescan(IndexScanDesc scan, bool fromEnd, ScanKey scankey); +extern void btmovescan(IndexScanDesc scan, Datum v); +extern void btendscan(IndexScanDesc scan); +extern void btmarkpos(IndexScanDesc scan); +extern void btrestrpos(IndexScanDesc scan); +extern void btdelete(Relation rel, ItemPointer tid); + +/* + * prototypes for functions in nbtscan.c + */ +extern void _bt_regscan(IndexScanDesc scan); +extern void _bt_dropscan(IndexScanDesc scan); +extern void _bt_adjscans(Relation rel, ItemPointer tid); +extern void _bt_scandel(IndexScanDesc scan, BlockNumber blkno, + OffsetNumber offno); +extern bool _bt_scantouched(IndexScanDesc scan, BlockNumber blkno, + OffsetNumber offno); + +/* + * prototypes for functions in nbtsearch.c + */ +extern BTStack _bt_search(Relation rel, int keysz, ScanKey scankey, + Buffer *bufP); +extern Buffer _bt_moveright(Relation rel, Buffer buf, int keysz, + ScanKey scankey, int access); +extern bool _bt_skeycmp(Relation rel, Size keysz, ScanKey scankey, + Page page, ItemId itemid, StrategyNumber strat); +extern OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz, + ScanKey scankey, int srchtype); +extern RetrieveIndexResult _bt_next(IndexScanDesc scan, ScanDirection dir); +extern RetrieveIndexResult _bt_first(IndexScanDesc scan, ScanDirection dir); +extern bool _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir); + +/* + * prototypes for functions in nbtstrat.c + */ +extern StrategyNumber _bt_getstrat(Relation rel, AttrNumber attno, + RegProcedure proc); +extern bool _bt_invokestrat(Relation rel, AttrNumber attno, + StrategyNumber strat, Datum left, Datum right); + +/* + * prototypes for functions in nbtutils.c + */ +extern ScanKey _bt_mkscankey(Relation rel, IndexTuple itup); +extern void _bt_freeskey(ScanKey skey); +extern void 
_bt_freestack(BTStack stack);
+extern void _bt_orderkeys(Relation relation, uint16 *numberOfKeys,
+			  ScanKey key);
+extern bool _bt_checkqual(IndexScanDesc scan, IndexTuple itup);
+extern BTItem _bt_formitem(IndexTuple itup);
+
+/*
+ * prototypes for functions in nbtsort.c
+ */
+extern void *_bt_spoolinit(Relation index, int ntapes);
+extern void _bt_spooldestroy(void *spool);
+extern void _bt_spool(Relation index, BTItem btitem, void *spool);
+extern void _bt_upperbuild(Relation index, BlockNumber blk, int level);
+extern void _bt_leafbuild(Relation index, void *spool);
+
+#endif /* NBTREE_H */
diff --git a/src/backend/access/nbtree/Makefile.inc b/src/backend/access/nbtree/Makefile.inc
new file mode 100644
index 00000000000..50854008c01
--- /dev/null
+++ b/src/backend/access/nbtree/Makefile.inc
@@ -0,0 +1,15 @@
+#-------------------------------------------------------------------------
+#
+# Makefile.inc--
+#    Makefile for access/nbtree (btree access methods)
+#
+# Copyright (c) 1994, Regents of the University of California
+#
+#
+# IDENTIFICATION
+#    $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:11 scrappy Exp $
+#
+#-------------------------------------------------------------------------
+
+SUBSRCS+= nbtcompare.c nbtinsert.c nbtpage.c nbtree.c nbtscan.c nbtsearch.c \
+	  nbtstrat.c nbtutils.c nbtsort.c
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
new file mode 100644
index 00000000000..a204ad4af08
--- /dev/null
+++ b/src/backend/access/nbtree/README
@@ -0,0 +1,68 @@
+$Header: /cvsroot/pgsql/src/backend/access/nbtree/README,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
+
+This directory contains a correct implementation of Lehman and Yao's
+btree management algorithm that supports concurrent access for Postgres.
+We have made the following changes in order to incorporate their algorithm
+into Postgres:
+
+	+  The requirement that all btree keys be unique is too onerous,
+	   but the algorithm won't work correctly without it.  As a result,
+	   this implementation adds an OID (guaranteed to be unique) to
+	   every key in the index.  This guarantees uniqueness within a set
+	   of duplicates.  Space overhead is four bytes.
+
+	   For this reason, when we're passed an index tuple to store by the
+	   common access method code, we allocate a larger one and copy the
+	   supplied tuple into it.  No Postgres code outside of the btree
+	   access method knows about this OID.
+
+	+  Lehman and Yao don't require read locks, but assume that in-
+	   memory copies of tree nodes are unshared.  Postgres shares
+	   in-memory buffers among backends.  As a result, we do page-
+	   level read locking on btree nodes in order to guarantee that
+	   no record is modified while we are examining it.  This reduces
+	   concurrency but guarantees correct behavior.
+
+	+  Read locks on a page are held for as long as a scan has a pointer
+	   to the page.  However, locks are always surrendered before the
+	   sibling page lock is acquired (for readers), so we remain deadlock-
+	   free.  I will do a formal proof if I get bored anytime soon.
+
+In addition, the following things are handy to know:
+
+	+  Page zero of every btree is a meta-data page.  This page stores
+	   the location of the root page, a pointer to a list of free
+	   pages, and other stuff that's handy to know.
+
+	+  This algorithm doesn't really work, since it requires ordered
+	   writes, and UNIX doesn't support ordered writes.
+
+	+  There's one other case where we may screw up in this
+	   implementation. 
When we start a scan, we descend the tree + to the key nearest the one in the qual, and once we get there, + position ourselves correctly for the qual type (eg, <, >=, etc). + If we happen to step off a page, decide we want to get back to + it, and fetch the page again, and if some bad person has split + the page and moved the last tuple we saw off of it, then the + code complains about botched concurrency in an elog(WARN, ...) + and gives up the ghost. This is the ONLY violation of Lehman + and Yao's guarantee of correct behavior that I am aware of in + this code. + +Notes to operator class implementors: + + With this implementation, we require the user to supply us with + a procedure for pg_amproc. This procedure should take two keys + A and B and return < 0, 0, or > 0 if A < B, A = B, or A > B, + respectively. See the contents of that relation for the btree + access method for some samples. + +Notes to mao for implementation document: + + On deletions, we need to adjust the position of active scans on + the index. The code in nbtscan.c handles this. We don't need to + do this for splits because of the way splits are handled; if they + happen behind us, we'll automatically go to the next page, and if + they happen in front of us, we're not affected by them. For + insertions, if we inserted a tuple behind the current scan location + on the current scan page, we move one space ahead. diff --git a/src/backend/access/nbtree/nbtcompare.c b/src/backend/access/nbtree/nbtcompare.c new file mode 100644 index 00000000000..e567b3c44cb --- /dev/null +++ b/src/backend/access/nbtree/nbtcompare.c @@ -0,0 +1,173 @@ +/*------------------------------------------------------------------------- + * + * btcompare.c-- + * Comparison functions for btree access method. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtcompare.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $ + * + * NOTES + * These functions are stored in pg_amproc. For each operator class + * defined on btrees, they compute + * + * compare(a, b): + * < 0 if a < b, + * = 0 if a == b, + * > 0 if a > b. 
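+ *
+ *	so, for example,
+ *
+ *		btint4cmp(2, 7) < 0
+ *		btint4cmp(7, 7) == 0
+ *		btint4cmp(7, 2) > 0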
+ *------------------------------------------------------------------------- + */ +#include <string.h> +#include "postgres.h" +#include "utils/nabstime.h" + +int32 +btint2cmp(int16 a, int16 b) +{ + return ((int32) (a - b)); +} + +int32 +btint4cmp(int32 a, int32 b) +{ + return (a - b); +} + +int32 +btint24cmp(int16 a, int32 b) +{ + return (((int32) a) - b); +} + +int32 +btint42cmp(int32 a, int16 b) +{ + return (a - ((int32) b)); +} + +int32 +btfloat4cmp(float32 a, float32 b) +{ + if (*a > *b) + return (1); + else if (*a == *b) + return (0); + else + return (-1); +} + +int32 +btfloat8cmp(float64 a, float64 b) +{ + if (*a > *b) + return (1); + else if (*a == *b) + return (0); + else + return (-1); +} + +int32 +btoidcmp(Oid a, Oid b) +{ + if (a > b) + return (1); + else if (a == b) + return (0); + else + return (-1); +} + +int32 +btabstimecmp(AbsoluteTime a, AbsoluteTime b) +{ + if (AbsoluteTimeIsBefore(a, b)) + return (1); + else if (AbsoluteTimeIsBefore(b, a)) + return (-1); + else + return (0); +} + +int32 +btcharcmp(char a, char b) +{ + return ((int32) (a - b)); +} + +int32 +btchar2cmp(uint16 a, uint16 b) +{ + return (strncmp((char *) &a, (char *) &b, 2)); +} + +int32 +btchar4cmp(uint32 a, uint32 b) +{ + return (strncmp((char *) &a, (char *) &b, 4)); +} + +int32 +btchar8cmp(char *a, char *b) +{ + return (strncmp(a, b, 8)); +} + +int32 +btchar16cmp(char *a, char *b) +{ + return (strncmp(a, b, 16)); +} + +int32 +btnamecmp(NameData *a, NameData *b) +{ + return (strncmp(a->data, b->data, NAMEDATALEN)); +} + +int32 +bttextcmp(struct varlena *a, struct varlena *b) +{ + char *ap, *bp; + int len; + int res; + + ap = VARDATA(a); + bp = VARDATA(b); + + /* len is the length of the shorter of the two strings */ + if ((len = VARSIZE(a)) > VARSIZE(b)) + len = VARSIZE(b); + + /* len includes the four bytes in which string length is stored */ + len -= sizeof(VARSIZE(a)); + + /* + * If the two strings differ in the first len bytes, or if they're + * the same in the first len bytes and they're both len bytes long, + * we're done. + */ + + res = 0; + if (len > 0) { + do { + res = (int) (*ap++ - *bp++); + len--; + } while (res == 0 && len != 0); + } + + if (res != 0 || VARSIZE(a) == VARSIZE(b)) + return (res); + + /* + * The two strings are the same in the first len bytes, and they + * are of different lengths. + */ + + if (VARSIZE(a) < VARSIZE(b)) + return (-1); + else + return (1); +} diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c new file mode 100644 index 00000000000..536c0aa385d --- /dev/null +++ b/src/backend/access/nbtree/nbtinsert.c @@ -0,0 +1,831 @@ +/*------------------------------------------------------------------------- + * + * btinsert.c-- + * Item insertion in Lehman and Yao btrees for Postgres. 
+ * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/bufmgr.h" +#include "storage/bufpage.h" + +#include "utils/elog.h" +#include "utils/palloc.h" +#include "utils/rel.h" +#include "utils/excid.h" + +#include "access/heapam.h" +#include "access/genam.h" +#include "access/nbtree.h" + +static InsertIndexResult _bt_insertonpg(Relation rel, Buffer buf, BTStack stack, int keysz, ScanKey scankey, BTItem btitem, BTItem afteritem); +static Buffer _bt_split(Relation rel, Buffer buf); +static OffsetNumber _bt_findsplitloc(Relation rel, Page page, OffsetNumber start, OffsetNumber maxoff, Size llimit); +static void _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf); +static OffsetNumber _bt_pgaddtup(Relation rel, Buffer buf, int keysz, ScanKey itup_scankey, Size itemsize, BTItem btitem, BTItem afteritem); +static bool _bt_goesonpg(Relation rel, Buffer buf, Size keysz, ScanKey scankey, BTItem afteritem); +static void _bt_updateitem(Relation rel, Size keysz, Buffer buf, Oid bti_oid, BTItem newItem); + +/* + * _bt_doinsert() -- Handle insertion of a single btitem in the tree. + * + * This routine is called by the public interface routines, btbuild + * and btinsert. By here, btitem is filled in, and has a unique + * (xid, seqno) pair. + */ +InsertIndexResult +_bt_doinsert(Relation rel, BTItem btitem) +{ + ScanKey itup_scankey; + IndexTuple itup; + BTStack stack; + Buffer buf; + BlockNumber blkno; + int natts; + InsertIndexResult res; + + itup = &(btitem->bti_itup); + + /* we need a scan key to do our search, so build one */ + itup_scankey = _bt_mkscankey(rel, itup); + natts = rel->rd_rel->relnatts; + + /* find the page containing this key */ + stack = _bt_search(rel, natts, itup_scankey, &buf); + blkno = BufferGetBlockNumber(buf); + + /* trade in our read lock for a write lock */ + _bt_relbuf(rel, buf, BT_READ); + buf = _bt_getbuf(rel, blkno, BT_WRITE); + + /* + * If the page was split between the time that we surrendered our + * read lock and acquired our write lock, then this page may no + * longer be the right place for the key we want to insert. In this + * case, we need to move right in the tree. See Lehman and Yao for + * an excruciatingly precise description. + */ + + buf = _bt_moveright(rel, buf, natts, itup_scankey, BT_WRITE); + + /* do the insertion */ + res = _bt_insertonpg(rel, buf, stack, natts, itup_scankey, + btitem, (BTItem) NULL); + + /* be tidy */ + _bt_freestack(stack); + _bt_freeskey(itup_scankey); + + return (res); +} + +/* + * _bt_insertonpg() -- Insert a tuple on a particular page in the index. + * + * This recursive procedure does the following things: + * + * + if necessary, splits the target page. + * + finds the right place to insert the tuple (taking into + * account any changes induced by a split). + * + inserts the tuple. + * + if the page was split, pops the parent stack, and finds the + * right place to insert the new child pointer (by walking + * right using information stored in the parent stack). + * + invoking itself with the appropriate tuple for the right + * child page on the parent. + * + * On entry, we must have the right buffer on which to do the + * insertion, and the buffer must be pinned and locked. On return, + * we will have dropped both the pin and the write lock on the buffer. 
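+ *
+ *	One recursion step, sketched (k' is the low key of the new right
+ *	half after a split; the recursive call inserts it into the parent):
+ *
+ *		parent: [... k ...]            parent: [... k  k' ...]
+ *		            |            =>                |    \
+ *		child:  [k .... full]          [left half]   [right half]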
+ *
+ * The locking interactions in this code are critical.  You should
+ * grok Lehman and Yao's paper before making any changes.  In addition,
+ * you need to understand how we disambiguate duplicate keys in this
+ * implementation, in order to be able to find our location using
+ * L&Y "move right" operations.  Since we may insert duplicate user
+ * keys, and since these dups may propagate up the tree, we use the
+ * 'afteritem' parameter to position ourselves correctly for the
+ * insertion on internal pages.
+ */
+static InsertIndexResult
+_bt_insertonpg(Relation rel,
+	       Buffer buf,
+	       BTStack stack,
+	       int keysz,
+	       ScanKey scankey,
+	       BTItem btitem,
+	       BTItem afteritem)
+{
+    InsertIndexResult res;
+    Page page;
+    Buffer rbuf;
+    Buffer pbuf;
+    Page rpage;
+    ScanKey newskey;
+    BTItem ritem;
+    BTPageOpaque rpageop;
+    BlockNumber rbknum, itup_blkno;
+    OffsetNumber itup_off;
+    int itemsz;
+    InsertIndexResult newres;
+    BTItem new_item = (BTItem) NULL;
+    BTItem lowLeftItem;
+
+    page = BufferGetPage(buf);
+    itemsz = IndexTupleDSize(btitem->bti_itup)
+	+ (sizeof(BTItemData) - sizeof(IndexTupleData));
+
+    itemsz = DOUBLEALIGN(itemsz);	/* be safe, PageAddItem will do this
+					   but we need to be consistent */
+
+    if (PageGetFreeSpace(page) < itemsz) {
+
+	/* split the buffer into left and right halves */
+	rbuf = _bt_split(rel, buf);
+
+	/* which new page (left half or right half) gets the tuple? */
+	if (_bt_goesonpg(rel, buf, keysz, scankey, afteritem)) {
+	    /* left page */
+	    itup_off = _bt_pgaddtup(rel, buf, keysz, scankey,
+				    itemsz, btitem, afteritem);
+	    itup_blkno = BufferGetBlockNumber(buf);
+	} else {
+	    /* right page */
+	    itup_off = _bt_pgaddtup(rel, rbuf, keysz, scankey,
+				    itemsz, btitem, afteritem);
+	    itup_blkno = BufferGetBlockNumber(rbuf);
+	}
+
+	/*
+	 * By here,
+	 *
+	 *	+  our target page has been split;
+	 *	+  the original tuple has been inserted;
+	 *	+  we have write locks on both the old (left half) and new
+	 *	   (right half) buffers, after the split; and
+	 *	+  we have the key we want to insert into the parent.
+	 *
+	 * Do the parent insertion.  We need to hold onto the locks for
+	 * the child pages until we locate the parent, but we can release
+	 * them before doing the actual insertion (see Lehman and Yao for
+	 * the reasoning).
+	 */
+
+	if (stack == (BTStack) NULL) {
+
+	    /* create a new root node and release the split buffers */
+	    _bt_newroot(rel, buf, rbuf);
+	    _bt_relbuf(rel, buf, BT_WRITE);
+	    _bt_relbuf(rel, rbuf, BT_WRITE);
+
+	} else {
+
+	    /* form an index tuple that points at the new right page */
+	    rbknum = BufferGetBlockNumber(rbuf);
+	    rpage = BufferGetPage(rbuf);
+	    rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
+
+	    /*
+	     * By convention, the first entry (0) on every
+	     * non-rightmost page is the high key for that page.  In
+	     * order to get the lowest key on the new right page, we
+	     * actually look at its second (1) entry.
+	     */
+
+	    if (! P_RIGHTMOST(rpageop)) {
+		ritem = (BTItem) PageGetItem(rpage,
+					     PageGetItemId(rpage, P_FIRSTKEY));
+	    } else {
+		ritem = (BTItem) PageGetItem(rpage,
+					     PageGetItemId(rpage, P_HIKEY));
+	    }
+
+	    /* get a unique btitem for this key */
+	    new_item = _bt_formitem(&(ritem->bti_itup));
+
+	    ItemPointerSet(&(new_item->bti_itup.t_tid), rbknum, P_HIKEY);
+
+	    /* find the parent buffer */
+	    pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
+
+	    /*
+	     * If the key of new_item is less than the key of the item
+	     * in the parent page pointing to the left page
+	     * (stack->bts_btitem), we have to update the latter key;
+	     * otherwise the keys on the parent page wouldn't be
+	     * monotonically increasing after we inserted the new
+	     * pointer to the right page (new_item).  This only happens
+	     * if our left page is the leftmost page and a new minimum
+	     * key had been inserted before, which is not reflected in
+	     * the parent page but didn't matter so far.  If there are
+	     * duplicate keys and this new minimum key spills over to
+	     * our new right page, we get an inconsistency if we don't
+	     * update the left key in the parent page.
+	     */
+
+	    if (_bt_itemcmp(rel, keysz, stack->bts_btitem, new_item,
+			    BTGreaterStrategyNumber)) {
+		lowLeftItem =
+		    (BTItem) PageGetItem(page,
+					 PageGetItemId(page, P_FIRSTKEY));
+		/* page must have right pointer after split */
+		_bt_updateitem(rel, keysz, pbuf, stack->bts_btitem->bti_oid,
+			       lowLeftItem);
+	    }
+
+	    /* don't need the children anymore */
+	    _bt_relbuf(rel, buf, BT_WRITE);
+	    _bt_relbuf(rel, rbuf, BT_WRITE);
+
+	    newskey = _bt_mkscankey(rel, &(new_item->bti_itup));
+	    newres = _bt_insertonpg(rel, pbuf, stack->bts_parent,
+				    keysz, newskey, new_item,
+				    stack->bts_btitem);
+
+	    /* be tidy */
+	    pfree(newres);
+	    pfree(newskey);
+	    pfree(new_item);
+	}
+    } else {
+	itup_off = _bt_pgaddtup(rel, buf, keysz, scankey,
+				itemsz, btitem, afteritem);
+	itup_blkno = BufferGetBlockNumber(buf);
+
+	_bt_relbuf(rel, buf, BT_WRITE);
+    }
+
+    /* by here, the new tuple is inserted */
+    res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));
+    ItemPointerSet(&(res->pointerData), itup_blkno, itup_off);
+
+    return (res);
+}
+
+/*
+ * _bt_split() -- split a page in the btree.
+ *
+ *	On entry, buf is the page to split, and is write-locked and pinned.
+ *	Returns the new right sibling of buf, pinned and write-locked.  The
+ *	pin and lock on buf are maintained.
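+ *
+ *	Sibling links after the split, where B is the page being split,
+ *	R is the new right sibling, and S is B's old right sibling (if
+ *	any):
+ *
+ *		before:  ... <-> B <-> S <-> ...
+ *		after:   ... <-> B <-> R <-> S <-> ...
+ *
+ *	B's btpo_next and S's btpo_prev are both redirected to R below.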
+ */ +static Buffer +_bt_split(Relation rel, Buffer buf) +{ + Buffer rbuf; + Page origpage; + Page leftpage, rightpage; + BTPageOpaque ropaque, lopaque, oopaque; + Buffer sbuf; + Page spage; + BTPageOpaque sopaque; + Size itemsz; + ItemId itemid; + BTItem item; + OffsetNumber leftoff, rightoff; + OffsetNumber start; + OffsetNumber maxoff; + OffsetNumber firstright; + OffsetNumber i; + Size llimit; + + rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); + origpage = BufferGetPage(buf); + leftpage = PageGetTempPage(origpage, sizeof(BTPageOpaqueData)); + rightpage = BufferGetPage(rbuf); + + _bt_pageinit(rightpage, BufferGetPageSize(rbuf)); + _bt_pageinit(leftpage, BufferGetPageSize(buf)); + + /* init btree private data */ + oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage); + lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage); + ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage); + + /* if we're splitting this page, it won't be the root when we're done */ + oopaque->btpo_flags &= ~BTP_ROOT; + lopaque->btpo_flags = ropaque->btpo_flags = oopaque->btpo_flags; + lopaque->btpo_prev = oopaque->btpo_prev; + ropaque->btpo_prev = BufferGetBlockNumber(buf); + lopaque->btpo_next = BufferGetBlockNumber(rbuf); + ropaque->btpo_next = oopaque->btpo_next; + + /* + * If the page we're splitting is not the rightmost page at its + * level in the tree, then the first (0) entry on the page is the + * high key for the page. We need to copy that to the right + * half. Otherwise (meaning the rightmost page case), we should + * treat the line pointers beginning at zero as user data. + * + * We leave a blank space at the start of the line table for the + * left page. We'll come back later and fill it in with the high + * key item we get from the right key. + */ + + leftoff = P_FIRSTKEY; + ropaque->btpo_next = oopaque->btpo_next; + if (! P_RIGHTMOST(oopaque)) { + /* splitting a non-rightmost page, start at the first data item */ + start = P_FIRSTKEY; + + /* copy the original high key to the new page */ + itemid = PageGetItemId(origpage, P_HIKEY); + itemsz = ItemIdGetLength(itemid); + item = (BTItem) PageGetItem(origpage, itemid); + (void) PageAddItem(rightpage, (Item) item, itemsz, P_HIKEY, LP_USED); + rightoff = P_FIRSTKEY; + } else { + /* splitting a rightmost page, "high key" is the first data item */ + start = P_HIKEY; + + /* the new rightmost page will not have a high key */ + rightoff = P_HIKEY; + } + maxoff = PageGetMaxOffsetNumber(origpage); + llimit = PageGetFreeSpace(leftpage) / 2; + firstright = _bt_findsplitloc(rel, origpage, start, maxoff, llimit); + + for (i = start; i <= maxoff; i = OffsetNumberNext(i)) { + itemid = PageGetItemId(origpage, i); + itemsz = ItemIdGetLength(itemid); + item = (BTItem) PageGetItem(origpage, itemid); + + /* decide which page to put it on */ + if (i < firstright) { + (void) PageAddItem(leftpage, (Item) item, itemsz, leftoff, + LP_USED); + leftoff = OffsetNumberNext(leftoff); + } else { + (void) PageAddItem(rightpage, (Item) item, itemsz, rightoff, + LP_USED); + rightoff = OffsetNumberNext(rightoff); + } + } + + /* + * Okay, page has been split, high key on right page is correct. Now + * set the high key on the left page to be the min key on the right + * page. + */ + + if (P_RIGHTMOST(ropaque)) { + itemid = PageGetItemId(rightpage, P_HIKEY); + } else { + itemid = PageGetItemId(rightpage, P_FIRSTKEY); + } + itemsz = ItemIdGetLength(itemid); + item = (BTItem) PageGetItem(rightpage, itemid); + + /* + * We left a hole for the high key on the left page; fill it. 
The
+     * modal crap is to tell the page manager to put the new item on the
+     * page and not screw around with anything else.  Whoever designed
+     * this interface has presumably crawled back into the dung heap they
+     * came from.  No one here will admit to it.
+     */
+
+    PageManagerModeSet(OverwritePageManagerMode);
+    (void) PageAddItem(leftpage, (Item) item, itemsz, P_HIKEY, LP_USED);
+    PageManagerModeSet(ShufflePageManagerMode);
+
+    /*
+     * By here, the original data page has been split into two new halves,
+     * and these are correct.  The algorithm requires that the left page
+     * never move during a split, so we copy the new left page back on top
+     * of the original.  Note that this is not a waste of time, since we
+     * also require (in the page management code) that the center of a
+     * page always be clean, and the most efficient way to guarantee this
+     * is just to compact the data by reinserting it into a new left page.
+     */
+
+    PageRestoreTempPage(leftpage, origpage);
+
+    /* write these guys out */
+    _bt_wrtnorelbuf(rel, rbuf);
+    _bt_wrtnorelbuf(rel, buf);
+
+    /*
+     * Finally, we need to grab the right sibling (if any) and fix the
+     * prev pointer there.  We are guaranteed that this is deadlock-free
+     * since no other writer will be holding a lock on that page and
+     * trying to move left, and all readers release locks on a page
+     * before trying to fetch its neighbors.
+     */
+
+    if (! P_RIGHTMOST(ropaque)) {
+	sbuf = _bt_getbuf(rel, ropaque->btpo_next, BT_WRITE);
+	spage = BufferGetPage(sbuf);
+	sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
+	sopaque->btpo_prev = BufferGetBlockNumber(rbuf);
+
+	/* write and release the old right sibling */
+	_bt_wrtbuf(rel, sbuf);
+    }
+
+    /* split's done */
+    return (rbuf);
+}
+
+/*
+ * _bt_findsplitloc() -- find a safe place to split a page.
+ *
+ *	In order to guarantee the proper handling of searches for duplicate
+ *	keys, the first duplicate in the chain must either be the first
+ *	item on the page after the split, or the entire chain must be on
+ *	one of the two pages.  That is,
+ *		[1 2 2 2 3 4 5]
+ *	must become
+ *		[1] [2 2 2 3 4 5]
+ *	or
+ *		[1 2 2 2] [3 4 5]
+ *	but not
+ *		[1 2 2] [2 3 4 5].
+ *	However,
+ *		[2 2 2 2 2 3 4]
+ *	may be split as
+ *		[2 2 2 2] [2 3 4].
+ */
+static OffsetNumber
+_bt_findsplitloc(Relation rel,
+		 Page page,
+		 OffsetNumber start,
+		 OffsetNumber maxoff,
+		 Size llimit)
+{
+    OffsetNumber i;
+    OffsetNumber saferight;
+    ItemId nxtitemid, safeitemid;
+    BTItem safeitem, nxtitem;
+    IndexTuple safetup, nxttup;
+    Size nbytes;
+    TupleDesc itupdesc;
+    int natts;
+    int attno;
+    Datum attsafe;
+    Datum attnext;
+    bool null;
+
+    itupdesc = RelationGetTupleDescriptor(rel);
+    natts = rel->rd_rel->relnatts;
+
+    saferight = start;
+    safeitemid = PageGetItemId(page, saferight);
+    nbytes = ItemIdGetLength(safeitemid) + sizeof(ItemIdData);
+    safeitem = (BTItem) PageGetItem(page, safeitemid);
+    safetup = &(safeitem->bti_itup);
+
+    i = OffsetNumberNext(start);
+
+    while (nbytes < llimit) {
+
+	/* check the next item on the page */
+	nxtitemid = PageGetItemId(page, i);
+	nbytes += (ItemIdGetLength(nxtitemid) + sizeof(ItemIdData));
+	nxtitem = (BTItem) PageGetItem(page, nxtitemid);
+	nxttup = &(nxtitem->bti_itup);
+
+	/* test against last known safe item */
+	for (attno = 1; attno <= natts; attno++) {
+	    attsafe = index_getattr(safetup, attno, itupdesc, &null);
+	    attnext = index_getattr(nxttup, attno, itupdesc, &null);
+
+	    /*
+	     * If the tuple we're looking at isn't equal to the last safe one
+	     * we saw, then it's our new safe tuple.
+ */ + + if (!_bt_invokestrat(rel, attno, BTEqualStrategyNumber, + attsafe, attnext)) { + safetup = nxttup; + saferight = i; + + /* break is for the attno for loop */ + break; + } + } + i = OffsetNumberNext(i); + } + + /* + * If the chain of dups starts at the beginning of the page and extends + * past the halfway mark, we can split it in the middle. + */ + + if (saferight == start) + saferight = i; + + return (saferight); +} + +/* + * _bt_newroot() -- Create a new root page for the index. + * + * We've just split the old root page and need to create a new one. + * In order to do this, we add a new root page to the file, then lock + * the metadata page and update it. This is guaranteed to be deadlock- + * free, because all readers release their locks on the metadata page + * before trying to lock the root, and all writers lock the root before + * trying to lock the metadata page. We have a write lock on the old + * root page, so we have not introduced any cycles into the waits-for + * graph. + * + * On entry, lbuf (the old root) and rbuf (its new peer) are write- + * locked. We don't drop the locks in this routine; that's done by + * the caller. On exit, a new root page exists with entries for the + * two new children. The new root page is neither pinned nor locked. + */ +static void +_bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) +{ + Buffer rootbuf; + Page lpage, rpage, rootpage; + BlockNumber lbkno, rbkno; + BlockNumber rootbknum; + BTPageOpaque rootopaque; + ItemId itemid; + BTItem item; + Size itemsz; + BTItem new_item; + + /* get a new root page */ + rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); + rootpage = BufferGetPage(rootbuf); + _bt_pageinit(rootpage, BufferGetPageSize(rootbuf)); + + /* set btree special data */ + rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); + rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE; + rootopaque->btpo_flags |= BTP_ROOT; + + /* + * Insert the internal tuple pointers. + */ + + lbkno = BufferGetBlockNumber(lbuf); + rbkno = BufferGetBlockNumber(rbuf); + lpage = BufferGetPage(lbuf); + rpage = BufferGetPage(rbuf); + + /* + * step over the high key on the left page while building the + * left page pointer. + */ + itemid = PageGetItemId(lpage, P_FIRSTKEY); + itemsz = ItemIdGetLength(itemid); + item = (BTItem) PageGetItem(lpage, itemid); + new_item = _bt_formitem(&(item->bti_itup)); + ItemPointerSet(&(new_item->bti_itup.t_tid), lbkno, P_FIRSTKEY); + + /* + * insert the left page pointer into the new root page. the root + * page is the rightmost page on its level so the "high key" item + * is the first data item. + */ + (void) PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY, LP_USED); + pfree(new_item); + + /* + * the right page is the rightmost page on the second level, so + * the "high key" item is the first data item on that page as well. + */ + itemid = PageGetItemId(rpage, P_HIKEY); + itemsz = ItemIdGetLength(itemid); + item = (BTItem) PageGetItem(rpage, itemid); + new_item = _bt_formitem(&(item->bti_itup)); + ItemPointerSet(&(new_item->bti_itup.t_tid), rbkno, P_HIKEY); + + /* + * insert the right page pointer into the new root page. + */ + (void) PageAddItem(rootpage, (Item) new_item, itemsz, P_FIRSTKEY, LP_USED); + pfree(new_item); + + /* write and let go of the root buffer */ + rootbknum = BufferGetBlockNumber(rootbuf); + _bt_wrtbuf(rel, rootbuf); + + /* update metadata page with new root block number */ + _bt_metaproot(rel, rootbknum); +} + +/* + * _bt_pgaddtup() -- add a tuple to a particular page in the index. 
+ * + * This routine adds the tuple to the page as requested, and keeps the + * write lock and reference associated with the page's buffer. It is + * an error to call pgaddtup() without a write lock and reference. If + * afteritem is non-null, it's the item that we expect our new item + * to follow. Otherwise, we do a binary search for the correct place + * and insert the new item there. + */ +static OffsetNumber +_bt_pgaddtup(Relation rel, + Buffer buf, + int keysz, + ScanKey itup_scankey, + Size itemsize, + BTItem btitem, + BTItem afteritem) +{ + OffsetNumber itup_off; + OffsetNumber first; + Page page; + BTPageOpaque opaque; + BTItem chkitem; + Oid afteroid; + + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + first = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; + + if (afteritem == (BTItem) NULL) { + itup_off = _bt_binsrch(rel, buf, keysz, itup_scankey, BT_INSERTION); + } else { + afteroid = afteritem->bti_oid; + itup_off = first; + + do { + chkitem = + (BTItem) PageGetItem(page, PageGetItemId(page, itup_off)); + itup_off = OffsetNumberNext(itup_off); + } while (chkitem->bti_oid != afteroid); + } + + (void) PageAddItem(page, (Item) btitem, itemsize, itup_off, LP_USED); + + /* write the buffer, but hold our lock */ + _bt_wrtnorelbuf(rel, buf); + + return (itup_off); +} + +/* + * _bt_goesonpg() -- Does a new tuple belong on this page? + * + * This is part of the complexity introduced by allowing duplicate + * keys into the index. The tuple belongs on this page if: + * + * + there is no page to the right of this one; or + * + it is less than the high key on the page; or + * + the item it is to follow ("afteritem") appears on this + * page. + */ +static bool +_bt_goesonpg(Relation rel, + Buffer buf, + Size keysz, + ScanKey scankey, + BTItem afteritem) +{ + Page page; + ItemId hikey; + BTPageOpaque opaque; + BTItem chkitem; + OffsetNumber offnum, maxoff; + Oid afteroid; + bool found; + + page = BufferGetPage(buf); + + /* no right neighbor? */ + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (P_RIGHTMOST(opaque)) + return (true); + + /* + * this is a non-rightmost page, so it must have a high key item. + * + * If the scan key is < the high key (the min key on the next page), + * then it for sure belongs here. + */ + hikey = PageGetItemId(page, P_HIKEY); + if (_bt_skeycmp(rel, keysz, scankey, page, hikey, BTLessStrategyNumber)) + return (true); + + /* + * If the scan key is > the high key, then it for sure doesn't belong + * here. + */ + + if (_bt_skeycmp(rel, keysz, scankey, page, hikey, BTGreaterStrategyNumber)) + return (false); + + /* + * If we have no adjacency information, and the item is equal to the + * high key on the page (by here it is), then the item does not belong + * on this page. + */ + + if (afteritem == (BTItem) NULL) + return (false); + + /* damn, have to work for it. i hate that. */ + afteroid = afteritem->bti_oid; + maxoff = PageGetMaxOffsetNumber(page); + + /* + * Search the entire page for the afteroid. We need to do this, rather + * than doing a binary search and starting from there, because if the + * key we're searching for is the leftmost key in the tree at this + * level, then a binary search will do the wrong thing. Splits are + * pretty infrequent, so the cost isn't as bad as it could be. 
+ */ + + found = false; + for (offnum = P_FIRSTKEY; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) { + chkitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); + if (chkitem->bti_oid == afteroid) { + found = true; + break; + } + } + + return (found); +} + +/* + * _bt_itemcmp() -- compare item1 to item2 using a requested + * strategy (<, <=, =, >=, >) + * + */ +bool +_bt_itemcmp(Relation rel, + Size keysz, + BTItem item1, + BTItem item2, + StrategyNumber strat) +{ + TupleDesc tupDes; + IndexTuple indexTuple1, indexTuple2; + Datum attrDatum1, attrDatum2; + int i; + bool isNull; + bool compare; + + tupDes = RelationGetTupleDescriptor(rel); + indexTuple1 = &(item1->bti_itup); + indexTuple2 = &(item2->bti_itup); + + for (i = 1; i <= keysz; i++) { + attrDatum1 = index_getattr(indexTuple1, i, tupDes, &isNull); + attrDatum2 = index_getattr(indexTuple2, i, tupDes, &isNull); + compare = _bt_invokestrat(rel, i, strat, attrDatum1, attrDatum2); + if (!compare) { + return (false); + } + } + return (true); +} + +/* + * _bt_updateitem() -- updates the key of the item identified by the + * oid with the key of newItem (done in place) + * + */ +static void +_bt_updateitem(Relation rel, + Size keysz, + Buffer buf, + Oid bti_oid, + BTItem newItem) +{ + Page page; + OffsetNumber maxoff; + OffsetNumber i; + ItemPointerData itemPtrData; + BTItem item; + IndexTuple oldIndexTuple, newIndexTuple; + + page = BufferGetPage(buf); + maxoff = PageGetMaxOffsetNumber(page); + + /* locate item on the page */ + i = P_HIKEY; + do { + item = (BTItem) PageGetItem(page, PageGetItemId(page, i)); + i = OffsetNumberNext(i); + } while (i <= maxoff && item->bti_oid != bti_oid); + + /* this should never happen (in theory) */ + if (item->bti_oid != bti_oid) { + elog(FATAL, "_bt_getstackbuf was lying!!"); + } + + oldIndexTuple = &(item->bti_itup); + newIndexTuple = &(newItem->bti_itup); + + /* keep the original item pointer */ + ItemPointerCopy(&(oldIndexTuple->t_tid), &itemPtrData); + CopyIndexTuple(newIndexTuple, &oldIndexTuple); + ItemPointerCopy(&itemPtrData, &(oldIndexTuple->t_tid)); +} diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c new file mode 100644 index 00000000000..ce411a80d11 --- /dev/null +++ b/src/backend/access/nbtree/nbtpage.c @@ -0,0 +1,523 @@ +/*------------------------------------------------------------------------- + * + * btpage.c-- + * BTree-specific page management code for the Postgres btree access + * method. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $ + * + * NOTES + * Postgres btree pages look like ordinary relation pages. The opaque + * data at high addresses includes pointers to left and right siblings + * and flag data describing page state. The first page in a btree, page + * zero, is special -- it stores meta-information describing the tree. + * Pages one and higher store the actual tree data. 
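+ *
+ *	  A rough sketch of a btree page (see storage/bufpage.h for the
+ *	  generic layout):
+ *
+ *		+-------------+---------------------------------+
+ *		| page header | line pointers (grow downward)   |
+ *		|              free space                       |
+ *		| tuples (grow upward)      | BTPageOpaqueData  |
+ *		+----------------------------+------------------+
+ *
+ *	  The BTPageOpaqueData in the special space holds btpo_prev,
+ *	  btpo_next, and btpo_flags.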
+ * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/bufmgr.h" +#include "storage/bufpage.h" + +#include "utils/elog.h" +#include "utils/rel.h" +#include "utils/excid.h" + +#include "access/genam.h" +#include "access/nbtree.h" + +#define BTREE_METAPAGE 0 +#define BTREE_MAGIC 0x053162 +#define BTREE_VERSION 0 + +typedef struct BTMetaPageData { + uint32 btm_magic; + uint32 btm_version; + BlockNumber btm_root; +} BTMetaPageData; + +#define BTPageGetMeta(p) \ + ((BTMetaPageData *) &((PageHeader) p)->pd_linp[0]) + +extern bool BuildingBtree; + +/* + * We use high-concurrency locking on btrees. There are two cases in + * which we don't do locking. One is when we're building the btree. + * Since the creating transaction has not committed, no one can see + * the index, and there's no reason to share locks. The second case + * is when we're just starting up the database system. We use some + * special-purpose initialization code in the relation cache manager + * (see utils/cache/relcache.c) to allow us to do indexed scans on + * the system catalogs before we'd normally be able to. This happens + * before the lock table is fully initialized, so we can't use it. + * Strictly speaking, this violates 2pl, but we don't do 2pl on the + * system catalogs anyway, so I declare this to be okay. + */ + +#define USELOCKING (!BuildingBtree && !IsInitProcessingMode()) + +/* + * _bt_metapinit() -- Initialize the metadata page of a btree. + */ +void +_bt_metapinit(Relation rel) +{ + Buffer buf; + Page pg; + int nblocks; + BTMetaPageData metad; + BTPageOpaque op; + + /* can't be sharing this with anyone, now... */ + if (USELOCKING) + RelationSetLockForWrite(rel); + + if ((nblocks = RelationGetNumberOfBlocks(rel)) != 0) { + elog(WARN, "Cannot initialize non-empty btree %s", + RelationGetRelationName(rel)); + } + + buf = ReadBuffer(rel, P_NEW); + pg = BufferGetPage(buf); + _bt_pageinit(pg, BufferGetPageSize(buf)); + + metad.btm_magic = BTREE_MAGIC; + metad.btm_version = BTREE_VERSION; + metad.btm_root = P_NONE; + memmove((char *) BTPageGetMeta(pg), (char *) &metad, sizeof(metad)); + + op = (BTPageOpaque) PageGetSpecialPointer(pg); + op->btpo_flags = BTP_META; + + WriteBuffer(buf); + + /* all done */ + if (USELOCKING) + RelationUnsetLockForWrite(rel); +} + +/* + * _bt_checkmeta() -- Verify that the metadata stored in a btree are + * reasonable. + */ +void +_bt_checkmeta(Relation rel) +{ + Buffer metabuf; + Page metap; + BTMetaPageData *metad; + BTPageOpaque op; + int nblocks; + + /* if the relation is empty, this is init time; don't complain */ + if ((nblocks = RelationGetNumberOfBlocks(rel)) == 0) + return; + + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); + metap = BufferGetPage(metabuf); + op = (BTPageOpaque) PageGetSpecialPointer(metap); + if (!(op->btpo_flags & BTP_META)) { + elog(WARN, "Invalid metapage for index %s", + RelationGetRelationName(rel)); + } + metad = BTPageGetMeta(metap); + + if (metad->btm_magic != BTREE_MAGIC) { + elog(WARN, "Index %s is not a btree", + RelationGetRelationName(rel)); + } + + if (metad->btm_version != BTREE_VERSION) { + elog(WARN, "Version mismatch on %s: version %d file, version %d code", + RelationGetRelationName(rel), + metad->btm_version, BTREE_VERSION); + } + + _bt_relbuf(rel, metabuf, BT_READ); +} + +/* + * _bt_getroot() -- Get the root page of the btree. 
+ * + * Since the root page can move around the btree file, we have to read + * its location from the metadata page, and then read the root page + * itself. If no root page exists yet, we have to create one. The + * standard class of race conditions exists here; I think I covered + * them all in the Hopi Indian rain dance of lock requests below. + * + * We pass in the access type (BT_READ or BT_WRITE), and return the + * root page's buffer with the appropriate lock type set. Reference + * count on the root page gets bumped by ReadBuffer. The metadata + * page is unlocked and unreferenced by this process when this routine + * returns. + */ +Buffer +_bt_getroot(Relation rel, int access) +{ + Buffer metabuf; + Page metapg; + BTPageOpaque metaopaque; + Buffer rootbuf; + Page rootpg; + BTPageOpaque rootopaque; + BlockNumber rootblkno; + BTMetaPageData *metad; + + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); + metapg = BufferGetPage(metabuf); + metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg); + Assert(metaopaque->btpo_flags & BTP_META); + metad = BTPageGetMeta(metapg); + + /* if no root page initialized yet, do it */ + if (metad->btm_root == P_NONE) { + + /* turn our read lock in for a write lock */ + _bt_relbuf(rel, metabuf, BT_READ); + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); + metapg = BufferGetPage(metabuf); + metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg); + Assert(metaopaque->btpo_flags & BTP_META); + metad = BTPageGetMeta(metapg); + + /* + * Race condition: if someone else initialized the metadata between + * the time we released the read lock and acquired the write lock, + * above, we want to avoid doing it again. + */ + + if (metad->btm_root == P_NONE) { + + /* + * Get, initialize, write, and leave a lock of the appropriate + * type on the new root page. Since this is the first page in + * the tree, it's a leaf. + */ + + rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); + rootblkno = BufferGetBlockNumber(rootbuf); + rootpg = BufferGetPage(rootbuf); + metad->btm_root = rootblkno; + _bt_pageinit(rootpg, BufferGetPageSize(rootbuf)); + rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpg); + rootopaque->btpo_flags |= (BTP_LEAF | BTP_ROOT); + _bt_wrtnorelbuf(rel, rootbuf); + + /* swap write lock for read lock, if appropriate */ + if (access != BT_WRITE) { + _bt_setpagelock(rel, rootblkno, BT_READ); + _bt_unsetpagelock(rel, rootblkno, BT_WRITE); + } + + /* okay, metadata is correct */ + _bt_wrtbuf(rel, metabuf); + } else { + + /* + * Metadata initialized by someone else. In order to guarantee + * no deadlocks, we have to release the metadata page and start + * all over again. + */ + + _bt_relbuf(rel, metabuf, BT_WRITE); + return (_bt_getroot(rel, access)); + } + } else { + rootbuf = _bt_getbuf(rel, metad->btm_root, access); + + /* done with the meta page */ + _bt_relbuf(rel, metabuf, BT_READ); + } + + /* + * Race condition: If the root page split between the time we looked + * at the metadata page and got the root buffer, then we got the wrong + * buffer. + */ + + rootpg = BufferGetPage(rootbuf); + rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpg); + if (!(rootopaque->btpo_flags & BTP_ROOT)) { + + /* it happened, try again */ + _bt_relbuf(rel, rootbuf, access); + return (_bt_getroot(rel, access)); + } + + /* + * By here, we have a correct lock on the root block, its reference + * count is correct, and we have no lock set on the metadata page. + * Return the root block. 
+ */
+
+    return (rootbuf);
+}
+
+/*
+ * _bt_getbuf() -- Get a buffer by block number for read or write.
+ *
+ *	When this routine returns, the appropriate lock is set on the
+ *	requested buffer and its reference count is correct.
+ */
+Buffer
+_bt_getbuf(Relation rel, BlockNumber blkno, int access)
+{
+    Buffer buf;
+    Page page;
+
+    /*
+     * If we want a new block, we can't set a lock of the appropriate type
+     * until we've instantiated the buffer.
+     */
+
+    if (blkno != P_NEW) {
+	if (access == BT_WRITE)
+	    _bt_setpagelock(rel, blkno, BT_WRITE);
+	else
+	    _bt_setpagelock(rel, blkno, BT_READ);
+
+	buf = ReadBuffer(rel, blkno);
+    } else {
+	buf = ReadBuffer(rel, blkno);
+	blkno = BufferGetBlockNumber(buf);
+	page = BufferGetPage(buf);
+	_bt_pageinit(page, BufferGetPageSize(buf));
+
+	if (access == BT_WRITE)
+	    _bt_setpagelock(rel, blkno, BT_WRITE);
+	else
+	    _bt_setpagelock(rel, blkno, BT_READ);
+    }
+
+    /* ref count and lock type are correct */
+    return (buf);
+}
+
+/*
+ * _bt_relbuf() -- release a locked buffer.
+ */
+void
+_bt_relbuf(Relation rel, Buffer buf, int access)
+{
+    BlockNumber blkno;
+
+    blkno = BufferGetBlockNumber(buf);
+
+    /* access had better be one of read or write */
+    if (access == BT_WRITE)
+	_bt_unsetpagelock(rel, blkno, BT_WRITE);
+    else
+	_bt_unsetpagelock(rel, blkno, BT_READ);
+
+    ReleaseBuffer(buf);
+}
+
+/*
+ * _bt_wrtbuf() -- write a btree page to disk.
+ *
+ *	This routine releases the lock held on the buffer and our reference
+ *	to it.  It is an error to call _bt_wrtbuf() without a write lock
+ *	or a reference to the buffer.
+ */
+void
+_bt_wrtbuf(Relation rel, Buffer buf)
+{
+    BlockNumber blkno;
+
+    blkno = BufferGetBlockNumber(buf);
+    WriteBuffer(buf);
+    _bt_unsetpagelock(rel, blkno, BT_WRITE);
+}
+
+/*
+ * _bt_wrtnorelbuf() -- write a btree page to disk, but do not release
+ *			 our reference or lock.
+ *
+ *	It is an error to call _bt_wrtnorelbuf() without a write lock
+ *	or a reference to the buffer.
+ */
+void
+_bt_wrtnorelbuf(Relation rel, Buffer buf)
+{
+    BlockNumber blkno;
+
+    blkno = BufferGetBlockNumber(buf);
+    WriteNoReleaseBuffer(buf);
+}
+
+/*
+ * _bt_pageinit() -- Initialize a new page.
+ */
+void
+_bt_pageinit(Page page, Size size)
+{
+    /*
+     * Cargo-cult programming -- don't really need this to be zero, but
+     * creating new pages is an infrequent occurrence and it makes me feel
+     * good when I know they're empty.
+     */
+
+    memset(page, 0, size);
+
+    PageInit(page, size, sizeof(BTPageOpaqueData));
+}
+
+/*
+ * _bt_metaproot() -- Change the root page of the btree.
+ *
+ *	Lehman and Yao require that the root page move around in order to
+ *	guarantee deadlock-free short-term, fine-granularity locking.  When
+ *	we split the root page, we record the new root in the metadata page
+ *	for the relation.  This routine does the work.
+ *
+ *	No direct preconditions, but if you don't have a write lock on
+ *	at least the old root page when you call this, you're making a big
+ *	mistake.  On exit, metapage data is correct and we no longer have
+ *	a reference to or lock on the metapage.
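+ *
+ *	The expected calling sequence, as in _bt_newroot() in nbtinsert.c:
+ *
+ *		rootbknum = BufferGetBlockNumber(rootbuf);
+ *		_bt_wrtbuf(rel, rootbuf);
+ *		_bt_metaproot(rel, rootbknum);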
+ */ +void +_bt_metaproot(Relation rel, BlockNumber rootbknum) +{ + Buffer metabuf; + Page metap; + BTPageOpaque metaopaque; + BTMetaPageData *metad; + + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); + metap = BufferGetPage(metabuf); + metaopaque = (BTPageOpaque) PageGetSpecialPointer(metap); + Assert(metaopaque->btpo_flags & BTP_META); + metad = BTPageGetMeta(metap); + metad->btm_root = rootbknum; + _bt_wrtbuf(rel, metabuf); +} + +/* + * _bt_getstackbuf() -- Walk back up the tree one step, and find the item + * we last looked at in the parent. + * + * This is possible because we save a bit image of the last item + * we looked at in the parent, and the update algorithm guarantees + * that if items above us in the tree move, they only move right. + */ +Buffer +_bt_getstackbuf(Relation rel, BTStack stack, int access) +{ + Buffer buf; + BlockNumber blkno; + OffsetNumber start, offnum, maxoff; + OffsetNumber i; + Page page; + ItemId itemid; + BTItem item; + BTPageOpaque opaque; + + blkno = stack->bts_blkno; + buf = _bt_getbuf(rel, blkno, access); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + maxoff = PageGetMaxOffsetNumber(page); + + if (maxoff >= stack->bts_offset) { + itemid = PageGetItemId(page, stack->bts_offset); + item = (BTItem) PageGetItem(page, itemid); + + /* if the item is where we left it, we're done */ + if (item->bti_oid == stack->bts_btitem->bti_oid) + return (buf); + + /* if the item has just moved right on this page, we're done */ + for (i = OffsetNumberNext(stack->bts_offset); + i <= maxoff; + i = OffsetNumberNext(i)) { + itemid = PageGetItemId(page, i); + item = (BTItem) PageGetItem(page, itemid); + + /* if the item is where we left it, we're done */ + if (item->bti_oid == stack->bts_btitem->bti_oid) + return (buf); + } + } + + /* by here, the item we're looking for moved right at least one page */ + for (;;) { + blkno = opaque->btpo_next; + if (P_RIGHTMOST(opaque)) + elog(FATAL, "my bits moved right off the end of the world!"); + + _bt_relbuf(rel, buf, access); + buf = _bt_getbuf(rel, blkno, access); + page = BufferGetPage(buf); + maxoff = PageGetMaxOffsetNumber(page); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* if we have a right sibling, step over the high key */ + start = P_RIGHTMOST(opaque) ? 
P_HIKEY : P_FIRSTKEY; + + /* see if it's on this page */ + for (offnum = start; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) { + itemid = PageGetItemId(page, offnum); + item = (BTItem) PageGetItem(page, itemid); + if (item->bti_oid == stack->bts_btitem->bti_oid) + return (buf); + } + } +} + +void +_bt_setpagelock(Relation rel, BlockNumber blkno, int access) +{ + ItemPointerData iptr; + + if (USELOCKING) { + ItemPointerSet(&iptr, blkno, P_HIKEY); + + if (access == BT_WRITE) + RelationSetSingleWLockPage(rel, &iptr); + else + RelationSetSingleRLockPage(rel, &iptr); + } +} + +void +_bt_unsetpagelock(Relation rel, BlockNumber blkno, int access) +{ + ItemPointerData iptr; + + if (USELOCKING) { + ItemPointerSet(&iptr, blkno, P_HIKEY); + + if (access == BT_WRITE) + RelationUnsetSingleWLockPage(rel, &iptr); + else + RelationUnsetSingleRLockPage(rel, &iptr); + } +} + +void +_bt_pagedel(Relation rel, ItemPointer tid) +{ + Buffer buf; + Page page; + BlockNumber blkno; + OffsetNumber offno; + + blkno = ItemPointerGetBlockNumber(tid); + offno = ItemPointerGetOffsetNumber(tid); + + buf = _bt_getbuf(rel, blkno, BT_WRITE); + page = BufferGetPage(buf); + + PageIndexTupleDelete(page, offno); + + /* write the buffer and release the lock */ + _bt_wrtbuf(rel, buf); +} diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c new file mode 100644 index 00000000000..06016119964 --- /dev/null +++ b/src/backend/access/nbtree/nbtree.c @@ -0,0 +1,516 @@ +/*------------------------------------------------------------------------- + * + * btree.c-- + * Implementation of Lehman and Yao's btree management algorithm for + * Postgres. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $ + * + * NOTES + * This file contains only the public interface routines. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/bufmgr.h" +#include "storage/bufpage.h" + +#include "utils/elog.h" +#include "utils/palloc.h" +#include "utils/rel.h" +#include "utils/excid.h" + +#include "access/heapam.h" +#include "access/genam.h" +#include "access/sdir.h" +#include "access/nbtree.h" +#include "access/funcindex.h" + +#include "nodes/execnodes.h" +#include "nodes/plannodes.h" + +#include "executor/executor.h" +#include "executor/tuptable.h" + +#include "catalog/index.h" + +bool BuildingBtree = false; +bool FastBuild = false; /* turn this on to make bulk builds work*/ + +/* + * btbuild() -- build a new btree index. + * + * We use a global variable to record the fact that we're creating + * a new index. This is used to avoid high-concurrency locking, + * since the index won't be visible until this transaction commits + * and since building is guaranteed to be single-threaded. 
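+ *
+ *	Two build paths appear below: when FastBuild is set, index
+ *	tuples are accumulated in a spool (_bt_spool) and the tree is
+ *	built bottom-up by _bt_leafbuild(); otherwise each tuple is
+ *	inserted one at a time with _bt_doinsert().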
+ */ +void +btbuild(Relation heap, + Relation index, + int natts, + AttrNumber *attnum, + IndexStrategy istrat, + uint16 pcount, + Datum *params, + FuncIndexInfo *finfo, + PredInfo *predInfo) +{ + HeapScanDesc hscan; + Buffer buffer; + HeapTuple htup; + IndexTuple itup; + TupleDesc htupdesc, itupdesc; + Datum *attdata; + bool *nulls; + InsertIndexResult res; + int nhtups, nitups; + int i; + BTItem btitem; + ExprContext *econtext; + TupleTable tupleTable; + TupleTableSlot *slot; + Oid hrelid, irelid; + Node *pred, *oldPred; + void *spool; + + /* note that this is a new btree */ + BuildingBtree = true; + + pred = predInfo->pred; + oldPred = predInfo->oldPred; + + /* initialize the btree index metadata page (if this is a new index) */ + if (oldPred == NULL) + _bt_metapinit(index); + + /* get tuple descriptors for heap and index relations */ + htupdesc = RelationGetTupleDescriptor(heap); + itupdesc = RelationGetTupleDescriptor(index); + + /* get space for data items that'll appear in the index tuple */ + attdata = (Datum *) palloc(natts * sizeof(Datum)); + nulls = (bool *) palloc(natts * sizeof(bool)); + + /* + * If this is a predicate (partial) index, we will need to evaluate the + * predicate using ExecQual, which requires the current tuple to be in a + * slot of a TupleTable. In addition, ExecQual must have an ExprContext + * referring to that slot. Here, we initialize dummy TupleTable and + * ExprContext objects for this purpose. --Nels, Feb '92 + */ +#ifndef OMIT_PARTIAL_INDEX + if (pred != NULL || oldPred != NULL) { + tupleTable = ExecCreateTupleTable(1); + slot = ExecAllocTableSlot(tupleTable); + econtext = makeNode(ExprContext); + FillDummyExprContext(econtext, slot, htupdesc, InvalidBuffer); + } +#endif /* OMIT_PARTIAL_INDEX */ + + /* start a heap scan */ + hscan = heap_beginscan(heap, 0, NowTimeQual, 0, (ScanKey) NULL); + htup = heap_getnext(hscan, 0, &buffer); + + /* build the index */ + nhtups = nitups = 0; + + if (FastBuild) { + spool = _bt_spoolinit(index, 7); + res = (InsertIndexResult) NULL; + } + + for (; HeapTupleIsValid(htup); htup = heap_getnext(hscan, 0, &buffer)) { + + nhtups++; + + /* + * If oldPred != NULL, this is an EXTEND INDEX command, so skip + * this tuple if it was already in the existing partial index + */ + if (oldPred != NULL) { +#ifndef OMIT_PARTIAL_INDEX + + /*SetSlotContents(slot, htup);*/ + slot->val = htup; + if (ExecQual((List*)oldPred, econtext) == true) { + nitups++; + continue; + } +#endif /* OMIT_PARTIAL_INDEX */ + } + + /* Skip this tuple if it doesn't satisfy the partial-index predicate */ + if (pred != NULL) { +#ifndef OMIT_PARTIAL_INDEX + /* SetSlotContents(slot, htup); */ + slot->val = htup; + if (ExecQual((List*)pred, econtext) == false) + continue; +#endif /* OMIT_PARTIAL_INDEX */ + } + + nitups++; + + /* + * For the current heap tuple, extract all the attributes + * we use in this index, and note which are null. + */ + + for (i = 1; i <= natts; i++) { + int attoff; + bool attnull; + + /* + * Offsets are from the start of the tuple, and are + * zero-based; indices are one-based. The next call + * returns i - 1. That's data hiding for you. + */ + + attoff = AttrNumberGetAttrOffset(i); + attdata[attoff] = GetIndexValue(htup, + htupdesc, + attoff, + attnum, + finfo, + &attnull, + buffer); + nulls[attoff] = (attnull ? 'n' : ' '); + } + + /* form an index tuple and point it at the heap tuple */ + itup = index_formtuple(itupdesc, attdata, nulls); + + /* + * If the single index key is null, we don't insert it into + * the index. 
Btrees support scans on <, <=, =, >=, and >. + * Relational algebra says that A op B (where op is one of the + * operators above) returns null if either A or B is null. This + * means that no qualification used in an index scan could ever + * return true on a null attribute. It also means that indices + * can't be used by ISNULL or NOTNULL scans, but that's an + * artifact of the strategy map architecture chosen in 1986, not + * of the way nulls are handled here. + */ + + if (itup->t_info & INDEX_NULL_MASK) { + pfree(itup); + continue; + } + + itup->t_tid = htup->t_ctid; + btitem = _bt_formitem(itup); + + /* + * if we are doing bottom-up btree build, we insert the index + * into a spool page for subsequent processing. otherwise, we + * insert into the btree. + */ + if (FastBuild) { + _bt_spool(index, btitem, spool); + } else { + res = _bt_doinsert(index, btitem); + } + + pfree(btitem); + pfree(itup); + if (res) { + pfree(res); + } + } + + /* okay, all heap tuples are indexed */ + heap_endscan(hscan); + + if (pred != NULL || oldPred != NULL) { +#ifndef OMIT_PARTIAL_INDEX + ExecDestroyTupleTable(tupleTable, true); + pfree(econtext); +#endif /* OMIT_PARTIAL_INDEX */ + } + + /* + * if we are doing bottom-up btree build, we now have a bunch of + * sorted runs in the spool pages. finish the build by (1) + * merging the runs, (2) inserting the sorted tuples into btree + * pages and (3) building the upper levels. + */ + if (FastBuild) { + _bt_spool(index, (BTItem) NULL, spool); /* flush spool */ + _bt_leafbuild(index, spool); + _bt_spooldestroy(spool); + } + + /* + * Since we just counted the tuples in the heap, we update its + * stats in pg_class to guarantee that the planner takes advantage + * of the index we just created. Finally, only update statistics + * during normal index definitions, not for indices on system catalogs + * created during bootstrap processing. We must close the relations + * before updatings statistics to guarantee that the relcache entries + * are flushed when we increment the command counter in UpdateStats(). + */ + if (IsNormalProcessingMode()) + { + hrelid = heap->rd_id; + irelid = index->rd_id; + heap_close(heap); + index_close(index); + UpdateStats(hrelid, nhtups, true); + UpdateStats(irelid, nitups, false); + if (oldPred != NULL) { + if (nitups == nhtups) pred = NULL; + UpdateIndexPredicate(irelid, oldPred, pred); + } + } + + /* be tidy */ + pfree(nulls); + pfree(attdata); + + /* all done */ + BuildingBtree = false; +} + +/* + * btinsert() -- insert an index tuple into a btree. + * + * Descend the tree recursively, find the appropriate location for our + * new tuple, put it there, set its unique OID as appropriate, and + * return an InsertIndexResult to the caller. + */ +InsertIndexResult +btinsert(Relation rel, IndexTuple itup) +{ + BTItem btitem; + InsertIndexResult res; + + if (itup->t_info & INDEX_NULL_MASK) + return ((InsertIndexResult) NULL); + + btitem = _bt_formitem(itup); + + res = _bt_doinsert(rel, btitem); + pfree(btitem); + + return (res); +} + +/* + * btgettuple() -- Get the next tuple in the scan. + */ +char * +btgettuple(IndexScanDesc scan, ScanDirection dir) +{ + RetrieveIndexResult res; + + /* + * If we've already initialized this scan, we can just advance it + * in the appropriate direction. If we haven't done so yet, we + * call a routine to get the first item in the scan. 
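+ *
+ *	A sketch of the calling protocol (normally driven through the
+ *	generic index_beginscan()/index_getnext() layer rather than
+ *	called directly):
+ *
+ *		scan = (IndexScanDesc) btbeginscan(rel, false, 1, key);
+ *		while ((res = btgettuple(scan, ForwardScanDirection)) != NULL)
+ *			... use the RetrieveIndexResult in res ...
+ *		btendscan(scan);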
+ */ + + if (ItemPointerIsValid(&(scan->currentItemData))) + res = _bt_next(scan, dir); + else + res = _bt_first(scan, dir); + + return ((char *) res); +} + +/* + * btbeginscan() -- start a scan on a btree index + */ +char * +btbeginscan(Relation rel, bool fromEnd, uint16 keysz, ScanKey scankey) +{ + IndexScanDesc scan; + StrategyNumber strat; + BTScanOpaque so; + + /* first order the keys in the qualification */ + if (keysz > 1) + _bt_orderkeys(rel, &keysz, scankey); + + /* now get the scan */ + scan = RelationGetIndexScan(rel, fromEnd, keysz, scankey); + so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData)); + so->btso_curbuf = so->btso_mrkbuf = InvalidBuffer; + scan->opaque = so; + + /* finally, be sure that the scan exploits the tree order */ + scan->scanFromEnd = false; + scan->flags = 0x0; + if (keysz > 0) { + strat = _bt_getstrat(scan->relation, 1 /* XXX */, + scankey[0].sk_procedure); + + if (strat == BTLessStrategyNumber + || strat == BTLessEqualStrategyNumber) + scan->scanFromEnd = true; + } else { + scan->scanFromEnd = true; + } + + /* register scan in case we change pages it's using */ + _bt_regscan(scan); + + return ((char *) scan); +} + +/* + * btrescan() -- rescan an index relation + */ +void +btrescan(IndexScanDesc scan, bool fromEnd, ScanKey scankey) +{ + ItemPointer iptr; + BTScanOpaque so; + + so = (BTScanOpaque) scan->opaque; + + /* we hold a read lock on the current page in the scan */ + if (ItemPointerIsValid(iptr = &(scan->currentItemData))) { + _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ); + so->btso_curbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + + /* and we hold a read lock on the last marked item in the scan */ + if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) { + _bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ); + so->btso_mrkbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + + /* reset the scan key */ + if (scan->numberOfKeys > 0) { + memmove(scan->keyData, + scankey, + scan->numberOfKeys * sizeof(ScanKeyData)); + } +} + +void +btmovescan(IndexScanDesc scan, Datum v) +{ + ItemPointer iptr; + BTScanOpaque so; + + so = (BTScanOpaque) scan->opaque; + + /* release any locks we still hold */ + if (ItemPointerIsValid(iptr = &(scan->currentItemData))) { + _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ); + so->btso_curbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + + scan->keyData[0].sk_argument = v; +} + +/* + * btendscan() -- close down a scan + */ +void +btendscan(IndexScanDesc scan) +{ + ItemPointer iptr; + BTScanOpaque so; + + so = (BTScanOpaque) scan->opaque; + + /* release any locks we still hold */ + if (ItemPointerIsValid(iptr = &(scan->currentItemData))) { + if (BufferIsValid(so->btso_curbuf)) + _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ); + so->btso_curbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + + if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) { + if (BufferIsValid(so->btso_mrkbuf)) + _bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ); + so->btso_mrkbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + + /* don't need scan registered anymore */ + _bt_dropscan(scan); + + /* be tidy */ +#ifdef PERFECT_MMGR + pfree (scan->opaque); +#endif /* PERFECT_MMGR */ +} + +/* + * btmarkpos() -- save current scan position + */ +void +btmarkpos(IndexScanDesc scan) +{ + ItemPointer iptr; + BTScanOpaque so; + + so = (BTScanOpaque) scan->opaque; + + /* release lock on old marked data, if any */ + if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) { + _bt_relbuf(scan->relation, 
so->btso_mrkbuf, BT_READ); + so->btso_mrkbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + + /* bump lock on currentItemData and copy to currentMarkData */ + if (ItemPointerIsValid(&(scan->currentItemData))) { + so->btso_mrkbuf = _bt_getbuf(scan->relation, + BufferGetBlockNumber(so->btso_curbuf), + BT_READ); + scan->currentMarkData = scan->currentItemData; + } +} + +/* + * btrestrpos() -- restore scan to last saved position + */ +void +btrestrpos(IndexScanDesc scan) +{ + ItemPointer iptr; + BTScanOpaque so; + + so = (BTScanOpaque) scan->opaque; + + /* release lock on current data, if any */ + if (ItemPointerIsValid(iptr = &(scan->currentItemData))) { + _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ); + so->btso_curbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + + /* bump lock on currentMarkData and copy to currentItemData */ + if (ItemPointerIsValid(&(scan->currentMarkData))) { + so->btso_curbuf = _bt_getbuf(scan->relation, + BufferGetBlockNumber(so->btso_mrkbuf), + BT_READ); + + scan->currentItemData = scan->currentMarkData; + } +} + +/* stubs */ +void +btdelete(Relation rel, ItemPointer tid) +{ + /* adjust any active scans that will be affected by this deletion */ + _bt_adjscans(rel, tid); + + /* delete the data from the page */ + _bt_pagedel(rel, tid); +} diff --git a/src/backend/access/nbtree/nbtscan.c b/src/backend/access/nbtree/nbtscan.c new file mode 100644 index 00000000000..62a029bc06f --- /dev/null +++ b/src/backend/access/nbtree/nbtscan.c @@ -0,0 +1,164 @@ +/*------------------------------------------------------------------------- + * + * btscan.c-- + * manage scans on btrees. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/nbtscan.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $ + * + * + * NOTES + * Because we can be doing an index scan on a relation while we update + * it, we need to avoid missing data that moves around in the index. + * The routines and global variables in this file guarantee that all + * scans in the local address space stay correctly positioned. This + * is all we need to worry about, since write locking guarantees that + * no one else will be on the same page at the same time as we are. + * + * The scheme is to manage a list of active scans in the current backend. + * Whenever we add or remove records from an index, or whenever we + * split a leaf page, we check the list of active scans to see if any + * has been affected. A scan is affected only if it is on the same + * relation, and the same page, as the update. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/bufmgr.h" +#include "storage/bufpage.h" + +#include "utils/elog.h" +#include "utils/palloc.h" +#include "utils/rel.h" +#include "utils/excid.h" + +#include "access/heapam.h" +#include "access/genam.h" +#include "access/sdir.h" +#include "access/nbtree.h" + +typedef struct BTScanListData { + IndexScanDesc btsl_scan; + struct BTScanListData *btsl_next; +} BTScanListData; + +typedef BTScanListData *BTScanList; + +static BTScanList BTScans = (BTScanList) NULL; + +/* + * _bt_regscan() -- register a new scan. 
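+ *
+ *	Called from btbeginscan() so that subsequent insertions and
+ *	deletions in this backend can reposition the scan if they touch
+ *	its current page (see _bt_adjscans() below).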
+ */ +void +_bt_regscan(IndexScanDesc scan) +{ + BTScanList new_el; + + new_el = (BTScanList) palloc(sizeof(BTScanListData)); + new_el->btsl_scan = scan; + new_el->btsl_next = BTScans; + BTScans = new_el; +} + +/* + * _bt_dropscan() -- drop a scan from the scan list + */ +void +_bt_dropscan(IndexScanDesc scan) +{ + BTScanList chk, last; + + last = (BTScanList) NULL; + for (chk = BTScans; + chk != (BTScanList) NULL && chk->btsl_scan != scan; + chk = chk->btsl_next) { + last = chk; + } + + if (chk == (BTScanList) NULL) + elog(WARN, "btree scan list trashed; can't find 0x%lx", scan); + + if (last == (BTScanList) NULL) + BTScans = chk->btsl_next; + else + last->btsl_next = chk->btsl_next; + +#ifdef PERFECT_MEM + pfree (chk); +#endif /* PERFECT_MEM */ +} + +void +_bt_adjscans(Relation rel, ItemPointer tid) +{ + BTScanList l; + Oid relid; + + relid = rel->rd_id; + for (l = BTScans; l != (BTScanList) NULL; l = l->btsl_next) { + if (relid == l->btsl_scan->relation->rd_id) + _bt_scandel(l->btsl_scan, ItemPointerGetBlockNumber(tid), + ItemPointerGetOffsetNumber(tid)); + } +} + +void +_bt_scandel(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno) +{ + ItemPointer current; + Buffer buf; + BTScanOpaque so; + + if (!_bt_scantouched(scan, blkno, offno)) + return; + + so = (BTScanOpaque) scan->opaque; + buf = so->btso_curbuf; + + current = &(scan->currentItemData); + if (ItemPointerIsValid(current) + && ItemPointerGetBlockNumber(current) == blkno + && ItemPointerGetOffsetNumber(current) >= offno) { + _bt_step(scan, &buf, BackwardScanDirection); + so->btso_curbuf = buf; + } + + current = &(scan->currentMarkData); + if (ItemPointerIsValid(current) + && ItemPointerGetBlockNumber(current) == blkno + && ItemPointerGetOffsetNumber(current) >= offno) { + ItemPointerData tmp; + tmp = *current; + *current = scan->currentItemData; + scan->currentItemData = tmp; + _bt_step(scan, &buf, BackwardScanDirection); + so->btso_mrkbuf = buf; + tmp = *current; + *current = scan->currentItemData; + scan->currentItemData = tmp; + } +} + +bool +_bt_scantouched(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno) +{ + ItemPointer current; + + current = &(scan->currentItemData); + if (ItemPointerIsValid(current) + && ItemPointerGetBlockNumber(current) == blkno + && ItemPointerGetOffsetNumber(current) >= offno) + return (true); + + current = &(scan->currentMarkData); + if (ItemPointerIsValid(current) + && ItemPointerGetBlockNumber(current) == blkno + && ItemPointerGetOffsetNumber(current) >= offno) + return (true); + + return (false); +} diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c new file mode 100644 index 00000000000..d7a7fc7d62e --- /dev/null +++ b/src/backend/access/nbtree/nbtsearch.c @@ -0,0 +1,1133 @@ +/*------------------------------------------------------------------------- + * + * btsearch.c-- + * search code for postgres btrees. 
+ * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/bufmgr.h" +#include "storage/bufpage.h" + +#include "utils/elog.h" +#include "utils/palloc.h" +#include "utils/rel.h" +#include "utils/excid.h" + +#include "fmgr.h" + +#include "access/heapam.h" +#include "access/genam.h" +#include "access/skey.h" +#include "access/sdir.h" +#include "access/nbtree.h" + +static BTStack _bt_searchr(Relation rel, int keysz, ScanKey scankey, Buffer *bufP, BTStack stack_in); +static OffsetNumber _bt_firsteq(Relation rel, TupleDesc itupdesc, Page page, Size keysz, ScanKey scankey, OffsetNumber offnum); +static int _bt_compare(Relation rel, TupleDesc itupdesc, Page page, int keysz, ScanKey scankey, OffsetNumber offnum); +static bool _bt_twostep(IndexScanDesc scan, Buffer *bufP, ScanDirection dir); +static RetrieveIndexResult _bt_endpoint(IndexScanDesc scan, ScanDirection dir); + +/* + * _bt_search() -- Search for a scan key in the index. + * + * This routine is actually just a helper that sets things up and + * calls a recursive-descent search routine on the tree. + */ +BTStack +_bt_search(Relation rel, int keysz, ScanKey scankey, Buffer *bufP) +{ + *bufP = _bt_getroot(rel, BT_READ); + return (_bt_searchr(rel, keysz, scankey, bufP, (BTStack) NULL)); +} + +/* + * _bt_searchr() -- Search the tree recursively for a particular scankey. + */ +static BTStack +_bt_searchr(Relation rel, + int keysz, + ScanKey scankey, + Buffer *bufP, + BTStack stack_in) +{ + BTStack stack; + OffsetNumber offnum; + Page page; + BTPageOpaque opaque; + BlockNumber par_blkno; + BlockNumber blkno; + ItemId itemid; + BTItem btitem; + BTItem item_save; + int item_nbytes; + IndexTuple itup; + + /* if this is a leaf page, we're done */ + page = BufferGetPage(*bufP); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (opaque->btpo_flags & BTP_LEAF) + return (stack_in); + + /* + * Find the appropriate item on the internal page, and get the child + * page that it points to. + */ + + par_blkno = BufferGetBlockNumber(*bufP); + offnum = _bt_binsrch(rel, *bufP, keysz, scankey, BT_DESCENT); + itemid = PageGetItemId(page, offnum); + btitem = (BTItem) PageGetItem(page, itemid); + itup = &(btitem->bti_itup); + blkno = ItemPointerGetBlockNumber(&(itup->t_tid)); + + /* + * We need to save the bit image of the index entry we chose in the + * parent page on a stack. In case we split the tree, we'll use this + * bit image to figure out what our real parent page is, in case the + * parent splits while we're working lower in the tree. See the paper + * by Lehman and Yao for how this is detected and handled. (We use + * unique OIDs to disambiguate duplicate keys in the index -- Lehman + * and Yao disallow duplicate keys). 
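+     *
+     * (Saving the bit image, rather than just the offset, matters
+     * because a concurrent split can shift the entry to a different
+     * offset, or move it onto a new right sibling of the parent; the
+     * saved image, OID included, lets us re-find it in either case.)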
+ */ + + item_nbytes = ItemIdGetLength(itemid); + item_save = (BTItem) palloc(item_nbytes); + memmove((char *) item_save, (char *) btitem, item_nbytes); + stack = (BTStack) palloc(sizeof(BTStackData)); + stack->bts_blkno = par_blkno; + stack->bts_offset = offnum; + stack->bts_btitem = item_save; + stack->bts_parent = stack_in; + + /* drop the read lock on the parent page and acquire one on the child */ + _bt_relbuf(rel, *bufP, BT_READ); + *bufP = _bt_getbuf(rel, blkno, BT_READ); + + /* + * Race -- the page we just grabbed may have split since we read its + * pointer in the parent. If it has, we may need to move right to its + * new sibling. Do that. + */ + + *bufP = _bt_moveright(rel, *bufP, keysz, scankey, BT_READ); + + /* okay, all set to move down a level */ + return (_bt_searchr(rel, keysz, scankey, bufP, stack)); +} + +/* + * _bt_moveright() -- move right in the btree if necessary. + * + * When we drop and reacquire a pointer to a page, it is possible that + * the page has changed in the meanwhile. If this happens, we're + * guaranteed that the page has "split right" -- that is, that any + * data that appeared on the page originally is either on the page + * or strictly to the right of it. + * + * This routine decides whether or not we need to move right in the + * tree by examining the high key entry on the page. If that entry + * is strictly less than one we expect to be on the page, then our + * picture of the page is incorrect and we need to move right. + * + * On entry, we have the buffer pinned and a lock of the proper type. + * If we move right, we release the buffer and lock and acquire the + * same on the right sibling. + */ +Buffer +_bt_moveright(Relation rel, + Buffer buf, + int keysz, + ScanKey scankey, + int access) +{ + Page page; + BTPageOpaque opaque; + ItemId hikey; + ItemId itemid; + BlockNumber rblkno; + + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* if we're on a rightmost page, we don't need to move right */ + if (P_RIGHTMOST(opaque)) + return (buf); + + /* by convention, item 0 on non-rightmost pages is the high key */ + hikey = PageGetItemId(page, P_HIKEY); + + /* + * If the scan key that brought us to this page is >= the high key + * stored on the page, then the page has split and we need to move + * right. + */ + + if (_bt_skeycmp(rel, keysz, scankey, page, hikey, + BTGreaterEqualStrategyNumber)) { + + /* move right as long as we need to */ + do { + /* + * If this page consists of all duplicate keys (hikey and first + * key on the page have the same value), then we don't need to + * step right. + */ + if (PageGetMaxOffsetNumber(page) > P_HIKEY) { + itemid = PageGetItemId(page, P_FIRSTKEY); + if (_bt_skeycmp(rel, keysz, scankey, page, itemid, + BTEqualStrategyNumber)) { + /* break is for the "move right" while loop */ + break; + } + } + + /* step right one page */ + rblkno = opaque->btpo_next; + _bt_relbuf(rel, buf, access); + buf = _bt_getbuf(rel, rblkno, access); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + hikey = PageGetItemId(page, P_HIKEY); + + } while (! P_RIGHTMOST(opaque) + && _bt_skeycmp(rel, keysz, scankey, page, hikey, + BTGreaterEqualStrategyNumber)); + } + return (buf); +} + +/* + * _bt_skeycmp() -- compare a scan key to a particular item on a page using + * a requested strategy (<, <=, =, >=, >). + * + * We ignore the unique OIDs stored in the btree item here. Those + * numbers are intended for use internally only, in repositioning a + * scan after a page split. 
They do not impose any meaningful ordering. + * + * The comparison is A <op> B, where A is the scan key and B is the + * tuple pointed at by itemid on page. + */ +bool +_bt_skeycmp(Relation rel, + Size keysz, + ScanKey scankey, + Page page, + ItemId itemid, + StrategyNumber strat) +{ + BTItem item; + IndexTuple indexTuple; + TupleDesc tupDes; + ScanKey entry; + int i; + Datum attrDatum; + Datum keyDatum; + bool compare; + bool isNull; + + item = (BTItem) PageGetItem(page, itemid); + indexTuple = &(item->bti_itup); + + tupDes = RelationGetTupleDescriptor(rel); + + /* see if the comparison is true for all of the key attributes */ + for (i=1; i <= keysz; i++) { + + entry = &scankey[i-1]; + attrDatum = index_getattr(indexTuple, + entry->sk_attno, + tupDes, + &isNull); + keyDatum = entry->sk_argument; + + compare = _bt_invokestrat(rel, i, strat, keyDatum, attrDatum); + if (!compare) + return (false); + } + + return (true); +} + +/* + * _bt_binsrch() -- Do a binary search for a key on a particular page. + * + * The scankey we get has the compare function stored in the procedure + * entry of each data struct. We invoke this regproc to do the + * comparison for every key in the scankey. _bt_binsrch() returns + * the OffsetNumber of the first matching key on the page, or the + * OffsetNumber at which the matching key would appear if it were + * on this page. + * + * By the time this procedure is called, we're sure we're looking + * at the right page -- don't need to walk right. _bt_binsrch() has + * no lock or refcount side effects on the buffer. + */ +OffsetNumber +_bt_binsrch(Relation rel, + Buffer buf, + int keysz, + ScanKey scankey, + int srchtype) +{ + TupleDesc itupdesc; + Page page; + BTPageOpaque opaque; + OffsetNumber low, mid, high; + bool match; + int result; + + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* by convention, item 0 on any non-rightmost page is the high key */ + low = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; + + high = PageGetMaxOffsetNumber(page); + + /* + * Since for non-rightmost pages, the zeroeth item on the page is the + * high key, there are two notions of emptiness. One is if nothing + * appears on the page. The other is if nothing but the high key does. + * The reason we test high <= low, rather than high == low, is that + * after vacuuming there may be nothing *but* the high key on a page. + * In that case, given the scheme above, low = 1 and high = 0. + */ + + if (PageIsEmpty(page) || (! P_RIGHTMOST(opaque) && high <= low)) + return (low); + + itupdesc = RelationGetTupleDescriptor(rel); + match = false; + + while ((high - low) > 1) { + mid = low + ((high - low) / 2); + result = _bt_compare(rel, itupdesc, page, keysz, scankey, mid); + + if (result > 0) + low = mid; + else if (result < 0) + high = mid - 1; + else { + match = true; + break; + } + } + + /* if we found a match, we want to find the first one on the page */ + if (match) { + return (_bt_firsteq(rel, itupdesc, page, keysz, scankey, mid)); + } else { + + /* + * We terminated because the endpoints got too close together. There + * are two cases to take care of. + * + * For non-insertion searches on internal pages, we want to point at + * the last key <, or first key =, the scankey on the page. This + * guarantees that we'll descend the tree correctly. + * + * For all other cases, we want to point at the first key >= + * the scankey on the page. This guarantees that scans and + * insertions will happen correctly. 
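+	 *
+	 * For example, with keys 10 20 30 on an internal page and a
+	 * scankey of 25, a BT_DESCENT search returns the offset of 20
+	 * (the last key less than the scankey), while any other search
+	 * type returns the offset of 30 (the first key >= the scankey).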
+ */ + + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (!(opaque->btpo_flags & BTP_LEAF) && srchtype == BT_DESCENT) { + + /* + * We want the last key <, or first key ==, the scan key. + */ + + result = _bt_compare(rel, itupdesc, page, keysz, scankey, high); + + if (result == 0) { + return (_bt_firsteq(rel, itupdesc, page, keysz, scankey, high)); + } else if (result > 0) { + return (high); + } else { + return (low); + } + } else { + + /* we want the first key >= the scan key */ + result = _bt_compare(rel, itupdesc, page, keysz, scankey, low); + if (result <= 0) { + return (low); + } else { + if (low == high) + return (OffsetNumberNext(low)); + + result = _bt_compare(rel, itupdesc, page, keysz, scankey, high); + if (result <= 0) + return (high); + else + return (OffsetNumberNext(high)); + } + } + } +} + +static OffsetNumber +_bt_firsteq(Relation rel, + TupleDesc itupdesc, + Page page, + Size keysz, + ScanKey scankey, + OffsetNumber offnum) +{ + BTPageOpaque opaque; + OffsetNumber limit; + + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* skip the high key, if any */ + limit = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; + + /* walk backwards looking for the first key in the chain of duplicates */ + while (offnum > limit + && _bt_compare(rel, itupdesc, page, + keysz, scankey, OffsetNumberPrev(offnum)) == 0) { + offnum = OffsetNumberPrev(offnum); + } + + return (offnum); +} + +/* + * _bt_compare() -- Compare scankey to a particular tuple on the page. + * + * This routine returns: + * -1 if scankey < tuple at offnum; + * 0 if scankey == tuple at offnum; + * +1 if scankey > tuple at offnum. + * + * In order to avoid having to propagate changes up the tree any time + * a new minimal key is inserted, the leftmost entry on the leftmost + * page is less than all possible keys, by definition. + */ +static int +_bt_compare(Relation rel, + TupleDesc itupdesc, + Page page, + int keysz, + ScanKey scankey, + OffsetNumber offnum) +{ + Datum datum; + BTItem btitem; + ItemId itemid; + IndexTuple itup; + BTPageOpaque opaque; + ScanKey entry; + AttrNumber attno; + int result; + int i; + bool null; + + /* + * If this is a leftmost internal page, and if our comparison is + * with the first key on the page, then the item at that position is + * by definition less than the scan key. + */ + + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (!(opaque->btpo_flags & BTP_LEAF) + && P_LEFTMOST(opaque) + && offnum == P_HIKEY) { + itemid = PageGetItemId(page, offnum); + + /* + * we just have to believe that this will only be called with + * offnum == P_HIKEY when P_HIKEY is the OffsetNumber of the + * first actual data key (i.e., this is also a rightmost + * page). there doesn't seem to be any code that implies + * that the leftmost page is normally missing a high key as + * well as the rightmost page. but that implies that this + * code path only applies to the root -- which seems + * unlikely.. + */ + if (! P_RIGHTMOST(opaque)) { + elog(WARN, "_bt_compare: invalid comparison to high key"); + } + + /* + * If the item on the page is equal to the scankey, that's + * okay to admit. We just can't claim that the first key on + * the page is greater than anything. + */ + + if (_bt_skeycmp(rel, keysz, scankey, page, itemid, + BTEqualStrategyNumber)) { + return (0); + } + return (1); + } + + btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); + itup = &(btitem->bti_itup); + + /* + * The scan key is set up with the attribute number associated with each + * term in the key. 
It is important that, if the index is multi-key, + * the scan contain the first k key attributes, and that they be in + * order. If you think about how multi-key ordering works, you'll + * understand why this is. + * + * We don't test for violation of this condition here. + */ + + for (i = 1; i <= keysz; i++) { + long tmpres; + + entry = &scankey[i - 1]; + attno = entry->sk_attno; + datum = index_getattr(itup, attno, itupdesc, &null); + tmpres = (long) FMGR_PTR2(entry->sk_func, entry->sk_procedure, + entry->sk_argument, datum); + result = tmpres; + + /* if the keys are unequal, return the difference */ + if (result != 0) + return (result); + } + + /* by here, the keys are equal */ + return (0); +} + +/* + * _bt_next() -- Get the next item in a scan. + * + * On entry, we have a valid currentItemData in the scan, and a + * read lock on the page that contains that item. We do not have + * the page pinned. We return the next item in the scan. On + * exit, we have the page containing the next item locked but not + * pinned. + */ +RetrieveIndexResult +_bt_next(IndexScanDesc scan, ScanDirection dir) +{ + Relation rel; + Buffer buf; + Page page; + OffsetNumber offnum; + RetrieveIndexResult res; + BlockNumber blkno; + ItemPointer current; + ItemPointer iptr; + BTItem btitem; + IndexTuple itup; + BTScanOpaque so; + + rel = scan->relation; + so = (BTScanOpaque) scan->opaque; + current = &(scan->currentItemData); + + /* + * XXX 10 may 91: somewhere there's a bug in our management of the + * cached buffer for this scan. wei discovered it. the following + * is a workaround so he can work until i figure out what's going on. + */ + + if (!BufferIsValid(so->btso_curbuf)) + so->btso_curbuf = _bt_getbuf(rel, ItemPointerGetBlockNumber(current), + BT_READ); + + /* we still have the buffer pinned and locked */ + buf = so->btso_curbuf; + blkno = BufferGetBlockNumber(buf); + + /* step one tuple in the appropriate direction */ + if (!_bt_step(scan, &buf, dir)) + return ((RetrieveIndexResult) NULL); + + /* by here, current is the tuple we want to return */ + offnum = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); + itup = &btitem->bti_itup; + + if (_bt_checkqual(scan, itup)) { + iptr = (ItemPointer) palloc(sizeof(ItemPointerData)); + memmove((char *) iptr, (char *) &(itup->t_tid), + sizeof(ItemPointerData)); + res = FormRetrieveIndexResult(current, iptr); + + /* remember which buffer we have pinned and locked */ + so->btso_curbuf = buf; + } else { + ItemPointerSetInvalid(current); + so->btso_curbuf = InvalidBuffer; + _bt_relbuf(rel, buf, BT_READ); + res = (RetrieveIndexResult) NULL; + } + + return (res); +} + +/* + * _bt_first() -- Find the first item in a scan. + * + * We need to be clever about the type of scan, the operation it's + * performing, and the tree ordering. We return the RetrieveIndexResult + * of the first item in the tree that satisfies the qualification + * associated with the scan descriptor. On exit, the page containing + * the current index tuple is read locked and pinned, and the scan's + * opaque data entry is updated to include the buffer. 
+ */ +RetrieveIndexResult +_bt_first(IndexScanDesc scan, ScanDirection dir) +{ + Relation rel; + TupleDesc itupdesc; + Buffer buf; + Page page; + BTStack stack; + OffsetNumber offnum, maxoff; + BTItem btitem; + IndexTuple itup; + ItemPointer current; + ItemPointer iptr; + BlockNumber blkno; + StrategyNumber strat; + RetrieveIndexResult res; + RegProcedure proc; + int result; + BTScanOpaque so; + ScanKeyData skdata; + + /* if we just need to walk down one edge of the tree, do that */ + if (scan->scanFromEnd) + return (_bt_endpoint(scan, dir)); + + rel = scan->relation; + itupdesc = RelationGetTupleDescriptor(scan->relation); + current = &(scan->currentItemData); + so = (BTScanOpaque) scan->opaque; + + /* + * Okay, we want something more complicated. What we'll do is use + * the first item in the scan key passed in (which has been correctly + * ordered to take advantage of index ordering) to position ourselves + * at the right place in the scan. + */ + + /* + * XXX -- The attribute number stored in the scan key is the attno + * in the heap relation. We need to transmogrify this into + * the index relation attno here. For the moment, we have + * hardwired attno == 1. + */ + proc = index_getprocid(rel, 1, BTORDER_PROC); + ScanKeyEntryInitialize(&skdata, 0x0, 1, proc, + scan->keyData[0].sk_argument); + + stack = _bt_search(rel, 1, &skdata, &buf); + _bt_freestack(stack); + + /* find the nearest match to the manufactured scan key on the page */ + offnum = _bt_binsrch(rel, buf, 1, &skdata, BT_DESCENT); + page = BufferGetPage(buf); + + /* + * This will happen if the tree we're searching is entirely empty, + * or if we're doing a search for a key that would appear on an + * entirely empty internal page. In either case, there are no + * matching tuples in the index. + */ + + if (PageIsEmpty(page)) { + ItemPointerSetInvalid(current); + so->btso_curbuf = InvalidBuffer; + _bt_relbuf(rel, buf, BT_READ); + return ((RetrieveIndexResult) NULL); + } + + maxoff = PageGetMaxOffsetNumber(page); + + if (offnum > maxoff) + offnum = maxoff; + + blkno = BufferGetBlockNumber(buf); + ItemPointerSet(current, blkno, offnum); + + /* + * Now find the right place to start the scan. Result is the + * value we're looking for minus the value we're looking at + * in the index. 
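+     *
+     * As an illustration: for a qual like "a > 5", if the binary
+     * search leaves us on a 5, then result is 0 and the
+     * BTGreaterStrategyNumber arm below steps the scan forward until
+     * the key it is on is actually greater than 5.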
+ */ + + result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum); + strat = _bt_getstrat(rel, 1, scan->keyData[0].sk_procedure); + + switch (strat) { + case BTLessStrategyNumber: + if (result <= 0) { + do { + if (!_bt_twostep(scan, &buf, BackwardScanDirection)) + break; + + offnum = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum); + } while (result <= 0); + + /* if this is true, the key we just looked at is gone */ + if (result > 0) + (void) _bt_twostep(scan, &buf, ForwardScanDirection); + } + break; + + case BTLessEqualStrategyNumber: + if (result >= 0) { + do { + if (!_bt_twostep(scan, &buf, ForwardScanDirection)) + break; + + offnum = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum); + } while (result >= 0); + + if (result < 0) + (void) _bt_twostep(scan, &buf, BackwardScanDirection); + } + break; + + case BTEqualStrategyNumber: + if (result != 0) { + _bt_relbuf(scan->relation, buf, BT_READ); + so->btso_curbuf = InvalidBuffer; + ItemPointerSetInvalid(&(scan->currentItemData)); + return ((RetrieveIndexResult) NULL); + } + break; + + case BTGreaterEqualStrategyNumber: + if (result < 0) { + do { + if (!_bt_twostep(scan, &buf, BackwardScanDirection)) + break; + + page = BufferGetPage(buf); + offnum = ItemPointerGetOffsetNumber(current); + result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum); + } while (result < 0); + + if (result > 0) + (void) _bt_twostep(scan, &buf, ForwardScanDirection); + } + break; + + case BTGreaterStrategyNumber: + if (result >= 0) { + do { + if (!_bt_twostep(scan, &buf, ForwardScanDirection)) + break; + + offnum = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum); + } while (result >= 0); + } + break; + } + + /* okay, current item pointer for the scan is right */ + offnum = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); + itup = &btitem->bti_itup; + + if (_bt_checkqual(scan, itup)) { + iptr = (ItemPointer) palloc(sizeof(ItemPointerData)); + memmove((char *) iptr, (char *) &(itup->t_tid), + sizeof(ItemPointerData)); + res = FormRetrieveIndexResult(current, iptr); + pfree(iptr); + + /* remember which buffer we have pinned */ + so->btso_curbuf = buf; + } else { + ItemPointerSetInvalid(current); + so->btso_curbuf = InvalidBuffer; + _bt_relbuf(rel, buf, BT_READ); + res = (RetrieveIndexResult) NULL; + } + + return (res); +} + +/* + * _bt_step() -- Step one item in the requested direction in a scan on + * the tree. + * + * If no adjacent record exists in the requested direction, return + * false. Else, return true and set the currentItemData for the + * scan to the right thing. 
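+ *
+ *	Note that "one item" may involve crossing to a sibling page:
+ *	the read lock and pin then move to that page, and intervening
+ *	empty pages are skipped without being returned.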
+ */
+bool
+_bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
+{
+    Page page;
+    BTPageOpaque opaque;
+    OffsetNumber offnum, maxoff;
+    OffsetNumber start;
+    BlockNumber blkno;
+    BlockNumber obknum;
+    BTScanOpaque so;
+    ItemPointer current;
+    Relation rel;
+
+    rel = scan->relation;
+    current = &(scan->currentItemData);
+    offnum = ItemPointerGetOffsetNumber(current);
+    page = BufferGetPage(*bufP);
+    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+    so = (BTScanOpaque) scan->opaque;
+    maxoff = PageGetMaxOffsetNumber(page);
+
+    /* get the next tuple */
+    if (ScanDirectionIsForward(dir)) {
+        if (!PageIsEmpty(page) && offnum < maxoff) {
+            offnum = OffsetNumberNext(offnum);
+        } else {
+
+            /* if we're at end of scan, release the buffer and return */
+            blkno = opaque->btpo_next;
+            if (P_RIGHTMOST(opaque)) {
+                _bt_relbuf(rel, *bufP, BT_READ);
+                ItemPointerSetInvalid(current);
+                *bufP = so->btso_curbuf = InvalidBuffer;
+                return (false);
+            } else {
+
+                /* walk right to the next page with data */
+                _bt_relbuf(rel, *bufP, BT_READ);
+                for (;;) {
+                    *bufP = _bt_getbuf(rel, blkno, BT_READ);
+                    page = BufferGetPage(*bufP);
+                    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+                    maxoff = PageGetMaxOffsetNumber(page);
+                    start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+                    if (!PageIsEmpty(page) && start <= maxoff) {
+                        break;
+                    } else {
+                        blkno = opaque->btpo_next;
+                        _bt_relbuf(rel, *bufP, BT_READ);
+                        if (blkno == P_NONE) {
+                            *bufP = so->btso_curbuf = InvalidBuffer;
+                            ItemPointerSetInvalid(current);
+                            return (false);
+                        }
+                    }
+                }
+                offnum = start;
+            }
+        }
+    } else if (ScanDirectionIsBackward(dir)) {
+
+        /* remember that high key is item zero on non-rightmost pages */
+        start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+        if (offnum > start) {
+            offnum = OffsetNumberPrev(offnum);
+        } else {
+
+            /* if we're at end of scan, release the buffer and return */
+            blkno = opaque->btpo_prev;
+            if (P_LEFTMOST(opaque)) {
+                _bt_relbuf(rel, *bufP, BT_READ);
+                *bufP = so->btso_curbuf = InvalidBuffer;
+                ItemPointerSetInvalid(current);
+                return (false);
+            } else {
+
+                obknum = BufferGetBlockNumber(*bufP);
+
+                /* walk left to the next page with data */
+                _bt_relbuf(rel, *bufP, BT_READ);
+                for (;;) {
+                    *bufP = _bt_getbuf(rel, blkno, BT_READ);
+                    page = BufferGetPage(*bufP);
+                    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+                    maxoff = PageGetMaxOffsetNumber(page);
+
+                    /*
+                     * If the adjacent page just split, then we may have the
+                     * wrong block.  Handle this case.  Because pages only
+                     * split right, we don't have to worry about this failing
+                     * to terminate.
+                     */
+
+                    while (opaque->btpo_next != obknum) {
+                        blkno = opaque->btpo_next;
+                        _bt_relbuf(rel, *bufP, BT_READ);
+                        *bufP = _bt_getbuf(rel, blkno, BT_READ);
+                        page = BufferGetPage(*bufP);
+                        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+                        maxoff = PageGetMaxOffsetNumber(page);
+                    }
+
+                    /* don't consider the high key */
+                    start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+                    /* anything to look at here? */
+                    if (!PageIsEmpty(page) && maxoff >= start) {
+                        break;
+                    } else {
+                        blkno = opaque->btpo_prev;
+                        obknum = BufferGetBlockNumber(*bufP);
+                        _bt_relbuf(rel, *bufP, BT_READ);
+                        if (blkno == P_NONE) {
+                            *bufP = so->btso_curbuf = InvalidBuffer;
+                            ItemPointerSetInvalid(current);
+                            return (false);
+                        }
+                    }
+                }
+                offnum = maxoff;	/* XXX PageIsEmpty? */
+            }
+        }
+    }
+    blkno = BufferGetBlockNumber(*bufP);
+    so->btso_curbuf = *bufP;
+    ItemPointerSet(current, blkno, offnum);
+
+    return (true);
+}
+
+/*
+ *	_bt_twostep() -- Move to an adjacent record in a scan on the tree,
+ *			 if an adjacent record exists.
+ *
+ *	This is like _bt_step, except that if no adjacent record exists
+ *	it restores us to where we were before trying the step.  This is
+ *	only hairy when you cross page boundaries, since the page you cross
+ *	from could have records inserted or deleted, or could even split.
+ *	This is unlikely, but we try to handle it correctly here anyway.
+ *
+ *	This routine contains the only case in which we deviate from
+ *	Lehman and Yao's algorithm.
+ *
+ *	Like step, this routine leaves the scan's currentItemData in the
+ *	proper state and acquires a lock and pin on *bufP.  If the twostep
+ *	succeeded, we return true; otherwise, we return false.
+ */
+static bool
+_bt_twostep(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
+{
+    Page page;
+    BTPageOpaque opaque;
+    OffsetNumber offnum, maxoff;
+    OffsetNumber start;
+    ItemPointer current;
+    ItemId itemid;
+    int itemsz;
+    BTItem btitem;
+    BTItem svitem;
+    BlockNumber blkno;
+
+    blkno = BufferGetBlockNumber(*bufP);
+    page = BufferGetPage(*bufP);
+    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+    maxoff = PageGetMaxOffsetNumber(page);
+    current = &(scan->currentItemData);
+    offnum = ItemPointerGetOffsetNumber(current);
+
+    start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+    /* if we're safe, just do it */
+    if (ScanDirectionIsForward(dir) && offnum < maxoff) { /* XXX PageIsEmpty? */
+        ItemPointerSet(current, blkno, OffsetNumberNext(offnum));
+        return (true);
+    } else if (ScanDirectionIsBackward(dir) && offnum > start) {
+        ItemPointerSet(current, blkno, OffsetNumberPrev(offnum));
+        return (true);
+    }
+
+    /* if we've hit end of scan we don't have to do any work */
+    if (ScanDirectionIsForward(dir) && P_RIGHTMOST(opaque)) {
+        return (false);
+    } else if (ScanDirectionIsBackward(dir) && P_LEFTMOST(opaque)) {
+        return (false);
+    }
+
+    /*
+     * Okay, it's off the page; let _bt_step() do the hard work, and we'll
+     * try to remember where we were.  This is not guaranteed to work; this
+     * is the only place in the code where concurrency can screw us up,
+     * and it's because we want to be able to move in two directions in
+     * the scan.
+     */
+
+    itemid = PageGetItemId(page, offnum);
+    itemsz = ItemIdGetLength(itemid);
+    btitem = (BTItem) PageGetItem(page, itemid);
+    svitem = (BTItem) palloc(itemsz);
+    memmove((char *) svitem, (char *) btitem, itemsz);
+
+    if (_bt_step(scan, bufP, dir)) {
+        pfree(svitem);
+        return (true);
+    }
+
+    /* try to find our place again */
+    *bufP = _bt_getbuf(scan->relation, blkno, BT_READ);
+    page = BufferGetPage(*bufP);
+    maxoff = PageGetMaxOffsetNumber(page);
+
+    while (offnum <= maxoff) {
+        itemid = PageGetItemId(page, offnum);
+        btitem = (BTItem) PageGetItem(page, itemid);
+        if (btitem->bti_oid == svitem->bti_oid) {
+            pfree(svitem);
+            ItemPointerSet(current, blkno, offnum);
+            return (false);
+        }
+        /* not it -- advance to the next item on the page */
+        offnum = OffsetNumberNext(offnum);
+    }
+
+    /*
+     * XXX crash and burn -- can't find our place.  We can be a little
+     * smarter -- walk to the next page to the right, for example, since
+     * that's the only direction that splits happen in.  Deletions screw
+     * us up less often since they're only done by the vacuum daemon.
+     */
+
+    elog(WARN, "btree synchronization error: concurrent update botched scan");
+
+    return (false);
+}
+
+/*
+ * _bt_endpoint() -- Find the first or last key in the index.
+ */ +static RetrieveIndexResult +_bt_endpoint(IndexScanDesc scan, ScanDirection dir) +{ + Relation rel; + Buffer buf; + Page page; + BTPageOpaque opaque; + ItemPointer current; + ItemPointer iptr; + OffsetNumber offnum, maxoff; + OffsetNumber start; + BlockNumber blkno; + BTItem btitem; + IndexTuple itup; + BTScanOpaque so; + RetrieveIndexResult res; + + rel = scan->relation; + current = &(scan->currentItemData); + + buf = _bt_getroot(rel, BT_READ); + blkno = BufferGetBlockNumber(buf); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + for (;;) { + if (opaque->btpo_flags & BTP_LEAF) + break; + + if (ScanDirectionIsForward(dir)) { + offnum = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; + } else { + offnum = PageGetMaxOffsetNumber(page); + } + + btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); + itup = &(btitem->bti_itup); + + blkno = ItemPointerGetBlockNumber(&(itup->t_tid)); + + _bt_relbuf(rel, buf, BT_READ); + buf = _bt_getbuf(rel, blkno, BT_READ); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* + * Race condition: If the child page we just stepped onto is + * in the process of being split, we need to make sure we're + * all the way at the right edge of the tree. See the paper + * by Lehman and Yao. + */ + + if (ScanDirectionIsBackward(dir) && ! P_RIGHTMOST(opaque)) { + do { + blkno = opaque->btpo_next; + _bt_relbuf(rel, buf, BT_READ); + buf = _bt_getbuf(rel, blkno, BT_READ); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + } while (! P_RIGHTMOST(opaque)); + } + } + + /* okay, we've got the {left,right}-most page in the tree */ + maxoff = PageGetMaxOffsetNumber(page); + + if (ScanDirectionIsForward(dir)) { + if (PageIsEmpty(page)) { + maxoff = FirstOffsetNumber; + } else { + maxoff = PageGetMaxOffsetNumber(page); + } + start = P_RIGHTMOST(opaque) ? 
P_HIKEY : P_FIRSTKEY; + + if (PageIsEmpty(page) || start > maxoff) { + ItemPointerSet(current, blkno, maxoff); + if (!_bt_step(scan, &buf, BackwardScanDirection)) + return ((RetrieveIndexResult) NULL); + + start = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + } else { + ItemPointerSet(current, blkno, start); + } + } else if (ScanDirectionIsBackward(dir)) { + if (PageIsEmpty(page)) { + ItemPointerSet(current, blkno, FirstOffsetNumber); + if (!_bt_step(scan, &buf, ForwardScanDirection)) + return ((RetrieveIndexResult) NULL); + + start = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + } else { + start = PageGetMaxOffsetNumber(page); + ItemPointerSet(current, blkno, start); + } + } else { + elog(WARN, "Illegal scan direction %d", dir); + } + + btitem = (BTItem) PageGetItem(page, PageGetItemId(page, start)); + itup = &(btitem->bti_itup); + + /* see if we picked a winner */ + if (_bt_checkqual(scan, itup)) { + iptr = (ItemPointer) palloc(sizeof(ItemPointerData)); + memmove((char *) iptr, (char *) &(itup->t_tid), + sizeof(ItemPointerData)); + res = FormRetrieveIndexResult(current, iptr); + + /* remember which buffer we have pinned */ + so = (BTScanOpaque) scan->opaque; + so->btso_curbuf = buf; + } else { + _bt_relbuf(rel, buf, BT_READ); + res = (RetrieveIndexResult) NULL; + } + + return (res); +} diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c new file mode 100644 index 00000000000..3d2676324a0 --- /dev/null +++ b/src/backend/access/nbtree/nbtsort.c @@ -0,0 +1,1196 @@ +/*------------------------------------------------------------------------- + * btsort.c-- + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Id: nbtsort.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $ + * + * NOTES + * + * what we do is: + * - generate a set of initial one-block runs, distributed round-robin + * between the output tapes. + * - for each pass, + * - swap input and output tape sets, rewinding both and truncating + * the output tapes. + * - merge the current run in each input tape to the current output + * tape. + * - when each input run has been exhausted, switch to another output + * tape and start processing another run. + * - when we have fewer runs than tapes, we know we are ready to start + * merging into the btree leaf pages. + * - every time we complete a level of the btree, we can construct the + * next level up. when we have only one page on a level, it can be + * attached to the btree metapage and we are done. + * + * conventions: + * - external interface routines take in and return "void *" for their + * opaque handles. this is for modularity reasons (i prefer not to + * export these structures without good reason). + * + * this code is moderately slow (~10% slower) compared to the regular + * btree (insertion) build code on sorted or well-clustered data. on + * random data, however, the insertion build code is unusable -- the + * difference on a 60MB heap is a factor of 15 because the random + * probes into the btree thrash the buffer pool. + * + * this code currently packs the pages to 100% of capacity. this is + * not wise, since *any* insertion will cause splitting. filling to + * something like the standard 70% steady-state load factor for btrees + * would probably be better. + * + * somebody desperately needs to figure out how to do a better job of + * balancing the merge passes -- the fan-in on the final merges can be + * pretty poor, which is bad for performance. 
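+ *
+ * to make the pass structure concrete (hypothetical numbers): with
+ * MAXTAPES = 7 and 100 initial one-block runs, the first merge pass
+ * leaves ceil(100/7) = 15 runs and the second leaves 3.  since 3 is
+ * not more than the number of tapes, the next pass merges those last
+ * runs directly into the btree leaf pages.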
+ *-------------------------------------------------------------------------
+ */
+
+#include <stdio.h>
+
+#include "c.h"
+
+#include "access/nbtree.h"
+
+#include "storage/bufmgr.h"
+#include "storage/fd.h"
+#include "utils/rel.h"
+#include "utils/palloc.h"
+#include "utils/elog.h"
+
+/*#define FASTBUILD_DEBUG*/		/* turn on debugging output */
+
+#define FASTBUILD
+
+#ifdef FASTBUILD
+
+#define MAXTAPES	(7)
+#define TAPEBLCKSZ	(BLCKSZ << 2)
+#define TAPETEMP	"pg_btsortXXXXXX"
+
+
+/*-------------------------------------------------------------------------
+ * sorting comparison routine - returns {-1,0,1} depending on whether
+ * the key in the left BTItem is {<,=,>} the key in the right BTItem.
+ *
+ * we want to use _bt_isortcmp as a comparison function for qsort(3),
+ * but it needs extra arguments, so we "pass them in" as global
+ * variables.  ick.  fortunately, they are the same throughout the
+ * build, so we need to do this only once.  this is why you must call
+ * _bt_isortcmpinit before the call to qsort(3).
+ *
+ * a NULL BTItem is always assumed to be greater than any actual
+ * value; our heap routines (see below) assume that the smallest
+ * element in the heap is returned.  that way, NULL values from the
+ * exhausted tapes can sift down to the bottom of the heap.  in point
+ * of fact we just don't replace the elements of exhausted tapes, but
+ * what the heck.
+ *-------------------------------------------------------------------------
+ */
+static Relation _bt_sortrel;
+
+static void
+_bt_isortcmpinit(Relation index)
+{
+    _bt_sortrel = index;
+}
+
+static int
+_bt_isortcmp(BTItem *bti1p, BTItem *bti2p)
+{
+    BTItem bti1 = *bti1p;
+    BTItem bti2 = *bti2p;
+
+    if (bti1 == (BTItem) NULL) {
+	if (bti2 == (BTItem) NULL) {
+	    return(0);		/* 1 = 2 */
+	}
+	return(1);		/* 1 > 2 */
+    } else if (bti2 == (BTItem) NULL) {
+	return(-1);		/* 1 < 2 */
+    } else if (_bt_itemcmp(_bt_sortrel, 1, bti1, bti2,
+			   BTGreaterStrategyNumber)) {
+	return(1);		/* 1 > 2 */
+    } else if (_bt_itemcmp(_bt_sortrel, 1, bti2, bti1,
+			   BTGreaterStrategyNumber)) {
+	return(-1);		/* 1 < 2 */
+    }
+    return(0);			/* 1 = 2 */
+}
+
+/*-------------------------------------------------------------------------
+ * priority queue methods
+ *
+ * these were more-or-less lifted from the heap section of the 1984
+ * edition of gonnet's book on algorithms and data structures.  they
+ * are coded so that the smallest element in the heap is returned (we
+ * use them for merging sorted runs).
+ *
+ * XXX these probably ought to be generic library functions.
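+ *
+ * the queue lives in a zero-based array: _bt_pqsift treats slots
+ * 2i+1 and 2i+2 as the children of slot i, and the smallest element
+ * sits in slot 0, which is what _bt_pqnext hands back.  e.g. adding
+ * elements whose items compare as 5, 3, 8 leaves 3 at the root, and
+ * successive _bt_pqnext calls yield 3, then 5, then 8.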
+ *------------------------------------------------------------------------- + */ + +typedef struct { + int btpqe_tape; /* tape identifier */ + BTItem btpqe_item; /* pointer to BTItem in tape buffer */ +} BTPriQueueElem; + +#define MAXELEM MAXTAPES +typedef struct { + int btpq_nelem; + BTPriQueueElem btpq_queue[MAXELEM]; + Relation btpq_rel; +} BTPriQueue; + +/* be sure to call _bt_isortcmpinit first */ +#define GREATER(a, b) \ + (_bt_isortcmp(&((a)->btpqe_item), &((b)->btpqe_item)) > 0) + +static void +_bt_pqsift(BTPriQueue *q, int parent) +{ + int child; + BTPriQueueElem e; + + for (child = parent * 2 + 1; + child < q->btpq_nelem; + child = parent * 2 + 1) { + if (child < q->btpq_nelem - 1) { + if (GREATER(&(q->btpq_queue[child]), &(q->btpq_queue[child+1]))) { + ++child; + } + } + if (GREATER(&(q->btpq_queue[parent]), &(q->btpq_queue[child]))) { + e = q->btpq_queue[child]; /* struct = */ + q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */ + q->btpq_queue[parent] = e; /* struct = */ + parent = child; + } else { + parent = child + 1; + } + } +} + +static int +_bt_pqnext(BTPriQueue *q, BTPriQueueElem *e) +{ + if (q->btpq_nelem < 1) { /* already empty */ + return(-1); + } + *e = q->btpq_queue[0]; /* struct = */ + + if (--q->btpq_nelem < 1) { /* now empty, don't sift */ + return(0); + } + q->btpq_queue[0] = q->btpq_queue[q->btpq_nelem]; /* struct = */ + _bt_pqsift(q, 0); + return(0); +} + +static void +_bt_pqadd(BTPriQueue *q, BTPriQueueElem *e) +{ + int child, parent; + + if (q->btpq_nelem >= MAXELEM) { + elog(WARN, "_bt_pqadd: queue overflow"); + } + + child = q->btpq_nelem++; + while (child > 0) { + parent = child / 2; + if (GREATER(e, &(q->btpq_queue[parent]))) { + break; + } else { + q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */ + child = parent; + } + } + + q->btpq_queue[child] = *e; /* struct = */ +} + +/*------------------------------------------------------------------------- + * tape methods + *------------------------------------------------------------------------- + */ + +#define BTITEMSZ(btitem) \ + ((btitem) ? \ + (IndexTupleDSize((btitem)->bti_itup) + \ + (sizeof(BTItemData) - sizeof(IndexTupleData))) : \ + 0) +#define SPCLEFT(tape) \ + (sizeof((tape)->bttb_data) - (tape)->bttb_top) +#define EMPTYTAPE(tape) \ + ((tape)->bttb_ntup <= 0) +#define BTTAPEMAGIC 0x19660226 + +/* + * this is what we use to shovel BTItems in and out of memory. it's + * bigger than a standard block because we are doing a lot of strictly + * sequential i/o. this is obviously something of a tradeoff since we + * are potentially reading a bunch of zeroes off of disk in many + * cases. + * + * BTItems are packed in and DOUBLEALIGN'd. + * + * the fd should not be going out to disk, strictly speaking, but it's + * the only thing like that so i'm not going to worry about wasting a + * few bytes. + */ +typedef struct { + int bttb_magic; /* magic number */ + int bttb_fd; /* file descriptor */ + int bttb_top; /* top of free space within bttb_data */ + short bttb_ntup; /* number of tuples in this block */ + short bttb_eor; /* End-Of-Run marker */ + char bttb_data[TAPEBLCKSZ - 2 * sizeof(double)]; +} BTTapeBlock; + + +/* + * reset the tape header for its next use without doing anything to + * the physical tape file. (setting bttb_top to 0 makes the block + * empty.) + */ +static void +_bt_tapereset(BTTapeBlock *tape) +{ + tape->bttb_eor = 0; + tape->bttb_top = 0; + tape->bttb_ntup = 0; +} + +/* + * rewind the physical tape file. 
+ */ +static void +_bt_taperewind(BTTapeBlock *tape) +{ + (void) FileSeek(tape->bttb_fd, 0, SEEK_SET); +} + +/* + * destroy the contents of the physical tape file without destroying + * the tape data structure or removing the physical tape file. + * + * we use the VFD version of ftruncate(2) to do this rather than + * unlinking and recreating the file. you still have to wait while + * the OS frees up all of the file system blocks and stuff, but at + * least you don't have to delete and reinsert the directory entries. + */ +static void +_bt_tapeclear(BTTapeBlock *tape) +{ + /* blow away the contents of the old file */ + _bt_taperewind(tape); +#if 0 + FileSync(tape->bttb_fd); +#endif + FileTruncate(tape->bttb_fd, 0); + + /* reset the buffer */ + _bt_tapereset(tape); +} + +/* + * create a new BTTapeBlock, allocating memory for the data structure + * as well as opening a physical tape file. + */ +static BTTapeBlock * +_bt_tapecreate(char *fname) +{ + BTTapeBlock *tape = (BTTapeBlock *) palloc(sizeof(BTTapeBlock)); + + if (tape == (BTTapeBlock *) NULL) { + elog(WARN, "_bt_tapecreate: out of memory"); + } + + tape->bttb_magic = BTTAPEMAGIC; + + tape->bttb_fd = FileNameOpenFile(fname, O_RDWR|O_CREAT|O_TRUNC, 0600); + Assert(tape->bttb_fd >= 0); + + /* initialize the buffer */ + _bt_tapereset(tape); + + return(tape); +} + +/* + * destroy the BTTapeBlock structure and its physical tape file. + */ +static void +_bt_tapedestroy(BTTapeBlock *tape) +{ + FileUnlink(tape->bttb_fd); + pfree((void *) tape); +} + +/* + * flush the tape block to the file, marking End-Of-Run if requested. + */ +static void +_bt_tapewrite(BTTapeBlock *tape, int eor) +{ + tape->bttb_eor = eor; + FileWrite(tape->bttb_fd, (char*)tape, TAPEBLCKSZ); + _bt_tapereset(tape); +} + +/* + * read a tape block from the file, overwriting the current contents + * of the buffer. + * + * returns: + * - 0 if there are no more blocks in the tape or in this run (call + * _bt_tapereset to clear the End-Of-Run marker) + * - 1 if a valid block was read + */ +static int +_bt_taperead(BTTapeBlock *tape) +{ + int fd; + int nread; + + if (tape->bttb_eor) { + return(0); /* we are at End-Of-Run */ + } + + /* + * we're clobbering the old tape block, but we do need to save the + * VFD (the one in the block we're reading is bogus). + */ + fd = tape->bttb_fd; + nread = FileRead(fd, (char*) tape, TAPEBLCKSZ); + tape->bttb_fd = fd; + + if (nread != TAPEBLCKSZ) { + Assert(nread == 0); /* we are at EOF */ + return(0); + } + Assert(tape->bttb_magic == BTTAPEMAGIC); + return(1); +} + +/* + * get the next BTItem from a tape block. + * + * returns: + * - NULL if we have run out of BTItems + * - a pointer to the BTItemData in the block otherwise + * + * side effects: + * - sets 'pos' to the current position within the block. + */ +static BTItem +_bt_tapenext(BTTapeBlock *tape, char **pos) +{ + Size itemsz; + BTItem bti; + + if (*pos >= tape->bttb_data + tape->bttb_top) { + return((BTItem) NULL); + } + bti = (BTItem) *pos; + itemsz = BTITEMSZ(bti); + *pos += DOUBLEALIGN(itemsz); + return(bti); +} + +/* + * copy a BTItem into a tape block. + * + * assumes that we have already checked to see if the block has enough + * space for the item. + * + * side effects: + * + * - advances the 'top' pointer in the tape block header to point to + * the beginning of free space. 
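+ *
+ * as an illustration: on a machine with 8-byte doubles, a 20-byte
+ * BTItem consumes DOUBLEALIGN(20) = 24 bytes of bttb_data, so the
+ * caller's free-space check (SPCLEFT) must use the aligned size, as
+ * _bt_spool and _bt_merge below both do.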
+ */ +static void +_bt_tapeadd(BTTapeBlock *tape, BTItem item, int itemsz) +{ + (void) memcpy(tape->bttb_data + tape->bttb_top, item, itemsz); + ++tape->bttb_ntup; + tape->bttb_top += DOUBLEALIGN(itemsz); +} + +/*------------------------------------------------------------------------- + * spool methods + *------------------------------------------------------------------------- + */ + +/* + * this structure holds the bookkeeping for a simple balanced multiway + * merge. (polyphase merging is hairier than i want to get into right + * now, and i don't see why i have to care how many "tapes" i use + * right now. though if psort was in a condition that i could hack it + * to do this, you bet i would.) + */ +typedef struct { + int bts_ntapes; + int bts_tape; + BTTapeBlock **bts_itape; /* input tape blocks */ + BTTapeBlock **bts_otape; /* output tape blocks */ +} BTSpool; + +/* + * create and initialize a spool structure, including the underlying + * files. + */ +void * +_bt_spoolinit(Relation index, int ntapes) +{ + char *mktemp(); + + BTSpool *btspool = (BTSpool *) palloc(sizeof(BTSpool)); + int i; + char *fname = (char *) palloc(sizeof(TAPETEMP) + 1); + + if (btspool == (BTSpool *) NULL || fname == (char *) NULL) { + elog(WARN, "_bt_spoolinit: out of memory"); + } + (void) memset((char *) btspool, 0, sizeof(BTSpool)); + btspool->bts_ntapes = ntapes; + btspool->bts_tape = 0; + + btspool->bts_itape = + (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes); + btspool->bts_otape = + (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes); + if (btspool->bts_itape == (BTTapeBlock **) NULL || + btspool->bts_otape == (BTTapeBlock **) NULL) { + elog(WARN, "_bt_spoolinit: out of memory"); + } + + for (i = 0; i < ntapes; ++i) { + btspool->bts_itape[i] = + _bt_tapecreate(mktemp(strcpy(fname, TAPETEMP))); + btspool->bts_otape[i] = + _bt_tapecreate(mktemp(strcpy(fname, TAPETEMP))); + } + pfree((void *) fname); + + _bt_isortcmpinit(index); + + return((void *) btspool); +} + +/* + * clean up a spool structure and its substructures. + */ +void +_bt_spooldestroy(void *spool) +{ + BTSpool *btspool = (BTSpool *) spool; + int i; + + for (i = 0; i < btspool->bts_ntapes; ++i) { + _bt_tapedestroy(btspool->bts_otape[i]); + _bt_tapedestroy(btspool->bts_itape[i]); + } + pfree((void *) btspool); +} + +/* + * flush out any dirty output tape blocks + */ +static void +_bt_spoolflush(BTSpool *btspool) +{ + int i; + + for (i = 0; i < btspool->bts_ntapes; ++i) { + if (!EMPTYTAPE(btspool->bts_otape[i])) { + _bt_tapewrite(btspool->bts_otape[i], 1); + } + } +} + +/* + * swap input tapes and output tapes by swapping their file + * descriptors. additional preparation for the next merge pass + * includes rewinding the new input tapes and clearing out the new + * output tapes. + */ +static void +_bt_spoolswap(BTSpool *btspool) +{ + File tmpfd; + BTTapeBlock *itape; + BTTapeBlock *otape; + int i; + + for (i = 0; i < btspool->bts_ntapes; ++i) { + itape = btspool->bts_itape[i]; + otape = btspool->bts_otape[i]; + + /* + * swap the input and output VFDs. + */ + tmpfd = itape->bttb_fd; + itape->bttb_fd = otape->bttb_fd; + otape->bttb_fd = tmpfd; + + /* + * rewind the new input tape. + */ + _bt_taperewind(itape); + _bt_tapereset(itape); + + /* + * clear the new output tape -- it's ok to throw away the old + * inputs. 
+ */ + _bt_tapeclear(otape); + } +} + +/*------------------------------------------------------------------------- + * sorting routines + *------------------------------------------------------------------------- + */ + +/* + * spool 'btitem' into an initial run. as tape blocks are filled, the + * block BTItems are qsorted and written into some output tape (it + * doesn't matter which; we go round-robin for simplicity). the + * initial runs are therefore always just one block. + */ +void +_bt_spool(Relation index, BTItem btitem, void *spool) +{ + BTSpool *btspool = (BTSpool *) spool; + BTTapeBlock *itape; + Size itemsz; + + itape = btspool->bts_itape[btspool->bts_tape]; + itemsz = BTITEMSZ(btitem); + itemsz = DOUBLEALIGN(itemsz); + + /* + * if this buffer is too full for this BTItemData, or if we have + * run out of BTItems, we need to sort the buffer and write it + * out. in this case, the BTItemData will go into the next tape's + * buffer. + */ + if (btitem == (BTItem) NULL || SPCLEFT(itape) < itemsz) { + BTItem *parray; + BTTapeBlock *otape; + BTItem bti; + char *pos; + int btisz; + int i; + + /* + * build an array of pointers to the BTItemDatas on the input + * block. + */ + parray = (BTItem *) palloc(itape->bttb_ntup * sizeof(BTItem)); + if (parray == (BTItem *) NULL) { + elog(WARN, "_bt_spool: out of memory"); + } + pos = itape->bttb_data; + for (i = 0; i < itape->bttb_ntup; ++i) { + parray[i] = _bt_tapenext(itape, &pos); + } + + /* + * qsort the pointer array. + */ + _bt_isortcmpinit(index); + qsort((void *) parray, itape->bttb_ntup, sizeof(BTItem), _bt_isortcmp); + + /* + * write the spooled run into the output tape. we copy the + * BTItemDatas in the order dictated by the sorted array of + * BTItems, not the original order. + * + * (since everything was DOUBLEALIGN'd and is all on a single + * page, everything had *better* still fit on one page..) + */ + otape = btspool->bts_otape[btspool->bts_tape]; + for (i = 0; i < itape->bttb_ntup; ++i) { + bti = parray[i]; + btisz = BTITEMSZ(bti); + btisz = DOUBLEALIGN(btisz); + _bt_tapeadd(otape, bti, btisz); +#ifdef FASTBUILD_DEBUG + { + bool isnull; + Datum d = index_getattr(&(bti->bti_itup), 1, + RelationGetTupleDescriptor(index), + &isnull); + printf("_bt_spool: inserted <%x> into output tape %d\n", + d, btspool->bts_tape); + } +#endif /* FASTBUILD_DEBUG */ + } + + /* + * the initial runs are always single tape blocks. flush the + * output block, marking End-Of-Run. + */ + _bt_tapewrite(otape, 1); + + /* + * reset the input buffer for the next run. we don't have to + * write it out or anything -- we only use it to hold the + * unsorted BTItemDatas, the output tape contains all the + * sorted stuff. + * + * changing bts_tape changes the output tape and input tape; + * we change itape for the code below. + */ + _bt_tapereset(itape); + btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes; + itape = btspool->bts_itape[btspool->bts_tape]; + + /* + * destroy the pointer array. + */ + pfree((void *) parray); + } + + /* insert this item into the current buffer */ + if (btitem != (BTItem) NULL) { + _bt_tapeadd(itape, btitem, itemsz); + } +} + +/* + * allocate a new, clean btree page, not linked to any siblings. 
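+ * the caller is expected to link it into the sibling chain once its
+ * neighbors are known (see the side-link fixup in _bt_buildadd).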
+ */ +static void +_bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags) +{ + BTPageOpaque opaque; + + *buf = _bt_getbuf(index, P_NEW, BT_WRITE); + *page = BufferGetPage(*buf); + _bt_pageinit(*page, BufferGetPageSize(*buf)); + opaque = (BTPageOpaque) PageGetSpecialPointer(*page); + opaque->btpo_prev = opaque->btpo_next = P_NONE; + opaque->btpo_flags = flags; +} + +/* + * slide an array of ItemIds back one slot (from P_FIRSTKEY to + * P_HIKEY). we need to do this when we discover that we have built + * an ItemId array in what has turned out to be a P_RIGHTMOST page. + */ +static void +_bt_slideleft(Relation index, Buffer buf, Page page) +{ + OffsetNumber off; + OffsetNumber maxoff; + ItemId previi; + ItemId thisii; + + maxoff = PageGetMaxOffsetNumber(page); + previi = PageGetItemId(page, P_HIKEY); + for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off)) { + thisii = PageGetItemId(page, off); + *previi = *thisii; + previi = thisii; + } + ((PageHeader) page)->pd_lower -= sizeof(ItemIdData); +} + +typedef struct { + Buffer btps_buf; + Page btps_page; + BTItem btps_lastbti; + OffsetNumber btps_lastoff; + OffsetNumber btps_firstoff; +} BTPageState; + +/* + * add an item to a disk page from a merge tape block. + * + * we must be careful to observe the following restrictions, placed + * upon us by the conventions in nbtsearch.c: + * - rightmost pages start data items at P_HIKEY instead of at + * P_FIRSTKEY. + * - duplicates cannot be split among pages unless the chain of + * duplicates starts at the first data item. + * + * a leaf page being built looks like: + * + * +----------------+---------------------------------+ + * | PageHeaderData | linp0 linp1 linp2 ... | + * +-----------+----+---------------------------------+ + * | ... linpN | ^ first | + * +-----------+--------------------------------------+ + * | ^ last | + * | | + * | v last | + * +-------------+------------------------------------+ + * | | itemN ... | + * +-------------+------------------+-----------------+ + * | ... item3 item2 item1 | "special space" | + * +--------------------------------+-----------------+ + * ^ first + * + * contrast this with the diagram in bufpage.h; note the mismatch + * between linps and items. this is because we reserve linp0 as a + * placeholder for the pointer to the "high key" item; when we have + * filled up the page, we will set linp0 to point to itemN and clear + * linpN. + * + * 'last' pointers indicate the last offset/item added to the page. + * 'first' pointers indicate the first offset/item that is part of a + * chain of duplicates extending from 'first' to 'last'. + * + * if all keys are unique, 'first' will always be the same as 'last'. 
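+ *
+ * a hypothetical example of the split rule: if the old page holds
+ * 10 20 30 30 when it fills up, both 30s are copied to the new page
+ * and the first 30 also becomes the old page's high key; if the old
+ * page holds nothing but 30s, only the last 30 is copied, and the
+ * first 30 again becomes the high key.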
+ */ +static void +_bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags) +{ + Buffer nbuf; + Page npage; + BTItem last_bti; + OffsetNumber first_off; + OffsetNumber last_off; + OffsetNumber off; + Size pgspc; + Size btisz; + + nbuf = state->btps_buf; + npage = state->btps_page; + first_off = state->btps_firstoff; + last_off = state->btps_lastoff; + last_bti = state->btps_lastbti; + + pgspc = PageGetFreeSpace(npage); + btisz = BTITEMSZ(bti); + btisz = DOUBLEALIGN(btisz); + if (pgspc < btisz) { + Buffer obuf = nbuf; + Page opage = npage; + OffsetNumber o, n; + ItemId ii; + ItemId hii; + + _bt_blnewpage(index, &nbuf, &npage, flags); + + /* + * if 'last' is part of a chain of duplicates that does not + * start at the beginning of the old page, the entire chain is + * copied to the new page; we delete all of the duplicates + * from the old page except the first, which becomes the high + * key item of the old page. + * + * if the chain starts at the beginning of the page or there + * is no chain ('first' == 'last'), we need only copy 'last' + * to the new page. again, 'first' (== 'last') becomes the + * high key of the old page. + * + * note that in either case, we copy at least one item to the + * new page, so 'last_bti' will always be valid. 'bti' will + * never be the first data item on the new page. + */ + if (first_off == P_FIRSTKEY) { + Assert(last_off != P_FIRSTKEY); + first_off = last_off; + } + for (o = first_off, n = P_FIRSTKEY; + o <= last_off; + o = OffsetNumberNext(o), n = OffsetNumberNext(n)) { + ii = PageGetItemId(opage, o); + (void) PageAddItem(npage, PageGetItem(opage, ii), + ii->lp_len, n, LP_USED); +#ifdef FASTBUILD_DEBUG + { + bool isnull; + BTItem tmpbti = + (BTItem) PageGetItem(npage, PageGetItemId(npage, n)); + Datum d = index_getattr(&(tmpbti->bti_itup), 1, + RelationGetTupleDescriptor(index), + &isnull); + printf("_bt_buildadd: moved <%x> to offset %d\n", + d, n); + } +#endif /* FASTBUILD_DEBUG */ + } + for (o = last_off; o > first_off; o = OffsetNumberPrev(o)) { + PageIndexTupleDelete(opage, o); + } + hii = PageGetItemId(opage, P_HIKEY); + ii = PageGetItemId(opage, first_off); + *hii = *ii; + ii->lp_flags &= ~LP_USED; + ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData); + + first_off = P_FIRSTKEY; + last_off = PageGetMaxOffsetNumber(npage); + last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, last_off)); + + /* + * set the page (side link) pointers. + */ + { + BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage); + BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage); + + oopaque->btpo_next = BufferGetBlockNumber(nbuf); + nopaque->btpo_prev = BufferGetBlockNumber(obuf); + nopaque->btpo_next = P_NONE; + } + + /* + * write out the old stuff. we never want to see it again, so + * we can give up our lock (if we had one; BuildingBtree is + * set, so we aren't locking). + */ + _bt_wrtbuf(index, obuf); + } + + /* + * if this item is different from the last item added, we start a + * new chain of duplicates. 
+ */ + off = OffsetNumberNext(last_off); + (void) PageAddItem(npage, (Item) bti, btisz, off, LP_USED); +#ifdef FASTBUILD_DEBUG + { + bool isnull; + Datum d = index_getattr(&(bti->bti_itup), 1, + RelationGetTupleDescriptor(index), + &isnull); + printf("_bt_buildadd: inserted <%x> at offset %d\n", + d, off); + } +#endif /* FASTBUILD_DEBUG */ + if (last_bti == (BTItem) NULL) { + first_off = P_FIRSTKEY; + } else if (!_bt_itemcmp(index, 1, bti, last_bti, BTEqualStrategyNumber)) { + first_off = off; + } + last_off = off; + last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, off)); + + state->btps_buf = nbuf; + state->btps_page = npage; + state->btps_lastbti = last_bti; + state->btps_lastoff = last_off; + state->btps_firstoff = first_off; +} + +/* + * take the input tapes stored by 'btspool' and perform successive + * merging passes until at most one run is left in each tape. at that + * point, merge the final tape runs into a set of btree leaves. + * + * XXX three nested loops? gross. cut me up into smaller routines. + */ +static BlockNumber +_bt_merge(Relation index, BTSpool *btspool) +{ + BTPageState state; + BlockNumber firstblk; + BTPriQueue q; + BTPriQueueElem e; + BTItem bti; + BTTapeBlock *itape; + BTTapeBlock *otape; + char *tapepos[MAXTAPES]; + int tapedone[MAXTAPES]; + int t; + int goodtapes; + int nruns; + Size btisz; + bool doleaf = false; + + /* + * initialize state needed for the merge into the btree leaf pages. + */ + (void) memset((char *) &state, 0, sizeof(BTPageState)); + _bt_blnewpage(index, &(state.btps_buf), &(state.btps_page), BTP_LEAF); + state.btps_lastoff = P_HIKEY; + state.btps_lastbti = (BTItem) NULL; + firstblk = BufferGetBlockNumber(state.btps_buf); + + do { /* pass */ + /* + * each pass starts by flushing the previous outputs and + * swapping inputs and outputs. this process also clears the + * new output tapes and rewinds the new input tapes. + */ + btspool->bts_tape = btspool->bts_ntapes - 1; + _bt_spoolflush(btspool); + _bt_spoolswap(btspool); + + nruns = 0; + + for (;;) { /* run */ + /* + * each run starts by selecting a new output tape. the + * merged results of a given run are always sent to this + * one tape. + */ + btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes; + otape = btspool->bts_otape[btspool->bts_tape]; + + /* + * initialize the priority queue by loading it with the + * first element of the given run in each tape. since we + * are starting a new run, we reset the tape (clearing the + * End-Of-Run marker) before reading it. this means that + * _bt_taperead will return 0 only if the tape is actually + * at EOF. + */ + (void) memset((char *) &q, 0, sizeof(BTPriQueue)); + goodtapes = 0; + for (t = 0; t < btspool->bts_ntapes; ++t) { + itape = btspool->bts_itape[t]; + tapepos[t] = itape->bttb_data; + _bt_tapereset(itape); + if (_bt_taperead(itape) == 0) { + tapedone[t] = 1; + } else { + ++goodtapes; + tapedone[t] = 0; + e.btpqe_tape = t; + e.btpqe_item = _bt_tapenext(itape, &tapepos[t]); + if (e.btpqe_item != (BTItem) NULL) { + _bt_pqadd(&q, &e); + } + } + } + /* + * if we don't have any tapes with any input (i.e., they + * are all at EOF), we must be done with this pass. + */ + if (goodtapes == 0) { + break; /* for */ + } + ++nruns; + + /* + * output the smallest element from the queue until there are no + * more. + */ + while (_bt_pqnext(&q, &e) >= 0) { /* item */ + /* + * replace the element taken from priority queue, + * fetching a new block if needed. a tape can run out + * if it hits either End-Of-Run or EOF. 
+ */ + t = e.btpqe_tape; + bti = e.btpqe_item; + if (bti != (BTItem) NULL) { + btisz = BTITEMSZ(bti); + btisz = DOUBLEALIGN(btisz); + if (doleaf) { + _bt_buildadd(index, &state, bti, BTP_LEAF); +#ifdef FASTBUILD_DEBUG + { + bool isnull; + Datum d = index_getattr(&(bti->bti_itup), 1, + RelationGetTupleDescriptor(index), + &isnull); + printf("_bt_merge: inserted <%x> into block %d\n", + d, BufferGetBlockNumber(state.btps_buf)); + } +#endif /* FASTBUILD_DEBUG */ + } else { + if (SPCLEFT(otape) < btisz) { + /* + * if it's full, write it out and add the + * item to the next block. (since we know + * there will be at least one more block, + * we know we do *not* want to set + * End-Of-Run here!) + */ + _bt_tapewrite(otape, 0); + } + _bt_tapeadd(otape, bti, btisz); +#ifdef FASTBUILD_DEBUG + { + bool isnull; + Datum d = index_getattr(&(bti->bti_itup), 1, + RelationGetTupleDescriptor(index), &isnull); + printf("_bt_merge: inserted <%x> into tape %d\n", + d, btspool->bts_tape); + } +#endif /* FASTBUILD_DEBUG */ + } + } +#ifdef FASTBUILD_DEBUG + { + bool isnull; + Datum d = index_getattr(&(bti->bti_itup), 1, + RelationGetTupleDescriptor(index), + &isnull); + printf("_bt_merge: got <%x> from tape %d\n", d, t); + } +#endif /* FASTBUILD_DEBUG */ + + itape = btspool->bts_itape[t]; + if (!tapedone[t]) { + BTItem newbti = _bt_tapenext(itape, &tapepos[t]); + + if (newbti == (BTItem) NULL) { + if (_bt_taperead(itape) == 0) { + tapedone[t] = 1; + } else { + tapepos[t] = itape->bttb_data; + newbti = _bt_tapenext(itape, &tapepos[t]); + } + } + if (newbti != (BTItem) NULL) { + BTPriQueueElem nexte; + + nexte.btpqe_tape = t; + nexte.btpqe_item = newbti; + _bt_pqadd(&q, &nexte); + } + } + } /* item */ + } /* run */ + + /* + * we are here because we ran out of input on all of the input + * tapes. + * + * if this pass did not generate more actual output runs than + * we have tapes, we know we have at most one run in each + * tape. this means that we are ready to merge into the final + * btree leaf pages instead of merging into a tape file. + */ + if (nruns <= btspool->bts_ntapes) { + doleaf = true; + } + } while (nruns > 0); /* pass */ + + /* + * this is the rightmost page, so the ItemId array needs to be + * slid back one slot. + */ + _bt_slideleft(index, state.btps_buf, state.btps_page); + _bt_wrtbuf(index, state.btps_buf); + + return(firstblk); +} + + +/* + * given the block number 'blk' of the first page of a set of linked + * siblings (i.e., the start of an entire level of the btree), + * construct the corresponding next level of the btree. we do this by + * placing minimum keys from each page into this page. the format of + * the internal pages is otherwise the same as for leaf pages. + */ +void +_bt_upperbuild(Relation index, BlockNumber blk, int level) +{ + Buffer rbuf; + Page rpage; + BTPageOpaque ropaque; + BTPageState state; + BlockNumber firstblk; + BTItem bti; + BTItem nbti; + OffsetNumber off; + + rbuf = _bt_getbuf(index, blk, BT_WRITE); + rpage = BufferGetPage(rbuf); + ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage); + + /* + * if we only have one page on a level, we can just make it the + * root. 
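+ *
+ * schematically, the whole fastbuild pipeline is (a sketch):
+ *
+ *	_bt_merge()       sorted runs -> linked level-0 (leaf) pages
+ *	_bt_upperbuild()  one (min key, child block) item per level-n
+ *	                  page -> level n+1, recursing until a level
+ *	                  fits on a single page
+ *
+ * that single page is the base case handled just below: flag it
+ * BTP_ROOT and point the metapage at it.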
+ */ + if (P_RIGHTMOST(ropaque)) { + ropaque->btpo_flags |= BTP_ROOT; + _bt_wrtbuf(index, rbuf); + _bt_metaproot(index, blk); + return; + } + _bt_relbuf(index, rbuf, BT_WRITE); + + (void) memset((char *) &state, 0, sizeof(BTPageState)); + _bt_blnewpage(index, &(state.btps_buf), &(state.btps_page), 0); + state.btps_lastoff = P_HIKEY; + state.btps_lastbti = (BTItem) NULL; + firstblk = BufferGetBlockNumber(state.btps_buf); + + /* for each page... */ + do { + rbuf = _bt_getbuf(index, blk, BT_READ); + rpage = BufferGetPage(rbuf); + ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage); + + /* for each item... */ + if (!PageIsEmpty(rpage)) { + /* + * form a new index tuple corresponding to the minimum key + * of the lower page and insert it into a page at this + * level. + */ + off = P_RIGHTMOST(ropaque) ? P_HIKEY : P_FIRSTKEY; + bti = (BTItem) PageGetItem(rpage, PageGetItemId(rpage, off)); + nbti = _bt_formitem(&(bti->bti_itup)); + ItemPointerSet(&(nbti->bti_itup.t_tid), blk, P_HIKEY); +#ifdef FASTBUILD_DEBUG + { + bool isnull; + Datum d = index_getattr(&(nbti->bti_itup), 1, + RelationGetTupleDescriptor(index), + &isnull); + printf("_bt_upperbuild: inserting <%x> at %d\n", + d, level); + } +#endif /* FASTBUILD_DEBUG */ + _bt_buildadd(index, &state, nbti, 0); + pfree((void *) nbti); + } + blk = ropaque->btpo_next; + _bt_relbuf(index, rbuf, BT_READ); + } while (blk != P_NONE); + + /* + * this is the rightmost page, so the ItemId array needs to be + * slid back one slot. + */ + _bt_slideleft(index, state.btps_buf, state.btps_page); + _bt_wrtbuf(index, state.btps_buf); + + _bt_upperbuild(index, firstblk, level + 1); +} + +/* + * given a spool loading by successive calls to _bt_spool, create an + * entire btree. + */ +void +_bt_leafbuild(Relation index, void *spool) +{ + BTSpool *btspool = (BTSpool *) spool; + BlockNumber firstblk; + + /* + * merge the runs into btree leaf pages. + */ + firstblk = _bt_merge(index, btspool); + + /* + * build the upper levels of the btree. + */ + _bt_upperbuild(index, firstblk, 0); +} + +#else /* !FASTBUILD */ + +void *_bt_spoolinit(Relation index, int ntapes) { return((void *) NULL); } +void _bt_spooldestroy(void *spool) { } +void _bt_spool(Relation index, BTItem btitem, void *spool) { } +void _bt_upperbuild(Relation index, BlockNumber blk, int level) { } +void _bt_leafbuild(Relation index, void *spool) { } + +#endif /* !FASTBUILD */ diff --git a/src/backend/access/nbtree/nbtstrat.c b/src/backend/access/nbtree/nbtstrat.c new file mode 100644 index 00000000000..2214c60950d --- /dev/null +++ b/src/backend/access/nbtree/nbtstrat.c @@ -0,0 +1,134 @@ +/*------------------------------------------------------------------------- + * + * btstrat.c-- + * Srategy map entries for the btree indexed access method + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/nbtstrat.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/bufpage.h" + +#include "utils/elog.h" +#include "utils/rel.h" +#include "utils/excid.h" + +#include "access/genam.h" +#include "access/nbtree.h" + +/* + * Note: + * StrategyNegate, StrategyCommute, and StrategyNegateCommute + * assume <, <=, ==, >=, > ordering. 
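+ *
+ * the maps below are indexed by (strategy number - 1); for example,
+ *
+ *	BTNegate[BTLessStrategyNumber - 1] == BTGreaterEqualStrategyNumber
+ *
+ * since NOT (a < b) is (a >= b), and similarly for the commuted
+ * forms (a < b commutes to b > a).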
+ */ +static StrategyNumber BTNegate[5] = { + BTGreaterEqualStrategyNumber, + BTGreaterStrategyNumber, + InvalidStrategy, + BTLessStrategyNumber, + BTLessEqualStrategyNumber +}; + +static StrategyNumber BTCommute[5] = { + BTGreaterStrategyNumber, + BTGreaterEqualStrategyNumber, + InvalidStrategy, + BTLessEqualStrategyNumber, + BTLessStrategyNumber +}; + +static StrategyNumber BTNegateCommute[5] = { + BTLessEqualStrategyNumber, + BTLessStrategyNumber, + InvalidStrategy, + BTGreaterStrategyNumber, + BTGreaterEqualStrategyNumber +}; + +static uint16 BTLessTermData[] = { /* XXX type clash */ + 2, + BTLessStrategyNumber, + SK_NEGATE, + BTLessStrategyNumber, + SK_NEGATE | SK_COMMUTE +}; + +static uint16 BTLessEqualTermData[] = { /* XXX type clash */ + 2, + BTLessEqualStrategyNumber, + 0x0, + BTLessEqualStrategyNumber, + SK_COMMUTE +}; + +static uint16 BTGreaterEqualTermData[] = { /* XXX type clash */ + 2, + BTGreaterEqualStrategyNumber, + 0x0, + BTGreaterEqualStrategyNumber, + SK_COMMUTE + }; + +static uint16 BTGreaterTermData[] = { /* XXX type clash */ + 2, + BTGreaterStrategyNumber, + SK_NEGATE, + BTGreaterStrategyNumber, + SK_NEGATE | SK_COMMUTE +}; + +static StrategyTerm BTEqualExpressionData[] = { + (StrategyTerm)BTLessTermData, /* XXX */ + (StrategyTerm)BTLessEqualTermData, /* XXX */ + (StrategyTerm)BTGreaterEqualTermData, /* XXX */ + (StrategyTerm)BTGreaterTermData, /* XXX */ + NULL +}; + +static StrategyEvaluationData BTEvaluationData = { + /* XXX static for simplicity */ + + BTMaxStrategyNumber, + (StrategyTransformMap)BTNegate, /* XXX */ + (StrategyTransformMap)BTCommute, /* XXX */ + (StrategyTransformMap)BTNegateCommute, /* XXX */ + + { NULL, NULL, (StrategyExpression)BTEqualExpressionData, NULL, NULL, + NULL,NULL,NULL,NULL,NULL,NULL,NULL} +}; + +/* ---------------------------------------------------------------- + * RelationGetBTStrategy + * ---------------------------------------------------------------- + */ + +StrategyNumber +_bt_getstrat(Relation rel, + AttrNumber attno, + RegProcedure proc) +{ + StrategyNumber strat; + + strat = RelationGetStrategy(rel, attno, &BTEvaluationData, proc); + + Assert(StrategyNumberIsValid(strat)); + + return (strat); +} + +bool +_bt_invokestrat(Relation rel, + AttrNumber attno, + StrategyNumber strat, + Datum left, + Datum right) +{ + return (RelationInvokeStrategy(rel, &BTEvaluationData, attno, strat, + left, right)); +} diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c new file mode 100644 index 00000000000..695a2b637c8 --- /dev/null +++ b/src/backend/access/nbtree/nbtutils.c @@ -0,0 +1,239 @@ +/*------------------------------------------------------------------------- + * + * btutils.c-- + * Utility code for Postgres btree implementation. 
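+ *
+ *	  (the scan-key code below leans on the strategy machinery in
+ *	  nbtstrat.c above; e.g. _bt_orderkeys() fetches this index's
+ *	  strategy map with
+ *
+ *	map = IndexStrategyGetStrategyMap(RelationGetIndexStrategy(rel),
+ *	                                  BTMaxStrategyNumber, 1);
+ *
+ *	  and matches each key's sk_procedure against the map's entries
+ *	  to learn which of <, <=, =, >=, > the key implements.)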
+ * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtutils.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include <stdio.h> +#include "postgres.h" + +#include "storage/bufmgr.h" +#include "storage/bufpage.h" + +#include "fmgr.h" +#include "utils/elog.h" +#include "utils/palloc.h" +#include "utils/rel.h" +#include "utils/excid.h" +#include "utils/datum.h" + +#include "access/heapam.h" +#include "access/genam.h" +#include "access/iqual.h" +#include "access/nbtree.h" + +ScanKey +_bt_mkscankey(Relation rel, IndexTuple itup) +{ + ScanKey skey; + TupleDesc itupdesc; + int natts; + int i; + Datum arg; + RegProcedure proc; + bool null; + + natts = rel->rd_rel->relnatts; + itupdesc = RelationGetTupleDescriptor(rel); + + skey = (ScanKey) palloc(natts * sizeof(ScanKeyData)); + + for (i = 0; i < natts; i++) { + arg = index_getattr(itup, i + 1, itupdesc, &null); + proc = index_getprocid(rel, i + 1, BTORDER_PROC); + ScanKeyEntryInitialize(&skey[i], + 0x0, (AttrNumber) (i + 1), proc, arg); + } + + return (skey); +} + +void +_bt_freeskey(ScanKey skey) +{ + pfree(skey); +} + +void +_bt_freestack(BTStack stack) +{ + BTStack ostack; + + while (stack != (BTStack) NULL) { + ostack = stack; + stack = stack->bts_parent; + pfree(ostack->bts_btitem); + pfree(ostack); + } +} + +/* + * _bt_orderkeys() -- Put keys in a sensible order for conjunctive quals. + * + * The order of the keys in the qual match the ordering imposed by + * the index. This routine only needs to be called if there are + * more than one qual clauses using this index. + */ +void +_bt_orderkeys(Relation relation, uint16 *numberOfKeys, ScanKey key) +{ + ScanKey xform; + ScanKeyData *cur; + StrategyMap map; + int nbytes; + long test; + int i, j; + int init[BTMaxStrategyNumber+1]; + + /* haven't looked at any strategies yet */ + for (i = 0; i <= BTMaxStrategyNumber; i++) + init[i] = 0; + + /* get space for the modified array of keys */ + nbytes = BTMaxStrategyNumber * sizeof(ScanKeyData); + xform = (ScanKey) palloc(nbytes); + memset(xform, 0, nbytes); + + + /* get the strategy map for this index/attribute pair */ + /* + * XXX + * When we support multiple keys in a single index, this is what + * we'll want to do. At present, the planner is hosed, so we + * hard-wire the attribute number below. Postgres only does single- + * key indices... + * map = IndexStrategyGetStrategyMap(RelationGetIndexStrategy(relation), + * BTMaxStrategyNumber, + * key->data[0].attributeNumber); + */ + map = IndexStrategyGetStrategyMap(RelationGetIndexStrategy(relation), + BTMaxStrategyNumber, + 1 /* XXX */ ); + + /* check each key passed in */ + for (i = *numberOfKeys; --i >= 0; ) { + cur = &key[i]; + for (j = BTMaxStrategyNumber; --j >= 0; ) { + if (cur->sk_procedure == map->entry[j].sk_procedure) + break; + } + + /* have we seen one of these before? 
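+ *
+ * e.g. for the conjunctive qual "a < 5 AND a < 3", both keys map
+ * to the less-than slot; the second time through, the FMGR call
+ * below compares the two arguments with the key's own operator and
+ * keeps the tighter one, leaving a single "a < 3" key.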
*/ + if (init[j]) { + /* yup, use the appropriate value */ + test = + (long) FMGR_PTR2(cur->sk_func, cur->sk_procedure, + cur->sk_argument, xform[j].sk_argument); + if (test) + xform[j].sk_argument = cur->sk_argument; + } else { + /* nope, use this value */ + memmove(&xform[j], cur, sizeof(*cur)); + + init[j] = 1; + } + } + + /* if = has been specified, no other key will be used */ + if (init[BTEqualStrategyNumber - 1]) { + init[BTLessStrategyNumber - 1] = 0; + init[BTLessEqualStrategyNumber - 1] = 0; + init[BTGreaterEqualStrategyNumber - 1] = 0; + init[BTGreaterStrategyNumber - 1] = 0; + } + + /* only one of <, <= */ + if (init[BTLessStrategyNumber - 1] + && init[BTLessEqualStrategyNumber - 1]) { + + ScanKeyData *lt, *le; + + lt = &xform[BTLessStrategyNumber - 1]; + le = &xform[BTLessEqualStrategyNumber - 1]; + + /* + * DO NOT use the cached function stuff here -- this is key + * ordering, happens only when the user expresses a hokey + * qualification, and gets executed only once, anyway. The + * transform maps are hard-coded, and can't be initialized + * in the correct way. + */ + + test = (long) fmgr(le->sk_procedure, le->sk_argument, lt->sk_argument); + + if (test) + init[BTLessEqualStrategyNumber - 1] = 0; + else + init[BTLessStrategyNumber - 1] = 0; + } + + /* only one of >, >= */ + if (init[BTGreaterStrategyNumber - 1] + && init[BTGreaterEqualStrategyNumber - 1]) { + + ScanKeyData *gt, *ge; + + gt = &xform[BTGreaterStrategyNumber - 1]; + ge = &xform[BTGreaterEqualStrategyNumber - 1]; + + /* see note above on function cache */ + test = (long) fmgr(ge->sk_procedure, gt->sk_argument, gt->sk_argument); + + if (test) + init[BTGreaterStrategyNumber - 1] = 0; + else + init[BTGreaterEqualStrategyNumber - 1] = 0; + } + + /* okay, reorder and count */ + j = 0; + + for (i = BTMaxStrategyNumber; --i >= 0; ) + if (init[i]) + key[j++] = xform[i]; + + *numberOfKeys = j; + + pfree(xform); +} + +bool +_bt_checkqual(IndexScanDesc scan, IndexTuple itup) +{ + if (scan->numberOfKeys > 0) + return (index_keytest(itup, RelationGetTupleDescriptor(scan->relation), + scan->numberOfKeys, scan->keyData)); + else + return (true); +} + +BTItem +_bt_formitem(IndexTuple itup) +{ + int nbytes_btitem; + BTItem btitem; + Size tuplen; + extern Oid newoid(); + + /* disallow nulls in btree keys */ + if (itup->t_info & INDEX_NULL_MASK) + elog(WARN, "btree indices cannot include null keys"); + + /* make a copy of the index tuple with room for the sequence number */ + tuplen = IndexTupleSize(itup); + nbytes_btitem = tuplen + + (sizeof(BTItemData) - sizeof(IndexTupleData)); + + btitem = (BTItem) palloc(nbytes_btitem); + memmove((char *) &(btitem->bti_itup), (char *) itup, tuplen); + + btitem->bti_oid = newoid(); + return (btitem); +} diff --git a/src/backend/access/printtup.h b/src/backend/access/printtup.h new file mode 100644 index 00000000000..b5843daf7e0 --- /dev/null +++ b/src/backend/access/printtup.h @@ -0,0 +1,26 @@ +/*------------------------------------------------------------------------- + * + * printtup.h-- + * + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: printtup.h,v 1.1.1.1 1996/07/09 06:21:08 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef PRINTTUP_H +#define PRINTTUP_H + +#include "access/htup.h" +#include "access/tupdesc.h" + +extern Oid typtoout(Oid type); +extern void printtup(HeapTuple tuple, TupleDesc typeinfo); +extern void showatts(char *name, TupleDesc attinfo); +extern void debugtup(HeapTuple tuple, 
TupleDesc typeinfo); +extern void printtup_internal(HeapTuple tuple, TupleDesc typeinfo); +extern Oid gettypelem(Oid type); + +#endif /* PRINTTUP_H */ diff --git a/src/backend/access/relscan.h b/src/backend/access/relscan.h new file mode 100644 index 00000000000..7899e9d945f --- /dev/null +++ b/src/backend/access/relscan.h @@ -0,0 +1,87 @@ +/*------------------------------------------------------------------------- + * + * relscan.h-- + * POSTGRES internal relation scan descriptor definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: relscan.h,v 1.1.1.1 1996/07/09 06:21:08 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef RELSCAN_H +#define RELSCAN_H + +#include "c.h" + +#include "access/skey.h" +#include "storage/buf.h" +#include "access/htup.h" +#include "storage/itemptr.h" + +#include "utils/tqual.h" +#include "utils/rel.h" + + +typedef ItemPointerData MarkData; + +typedef struct HeapScanDescData { + Relation rs_rd; /* pointer to relation descriptor */ + HeapTuple rs_ptup; /* previous tuple in scan */ + HeapTuple rs_ctup; /* current tuple in scan */ + HeapTuple rs_ntup; /* next tuple in scan */ + Buffer rs_pbuf; /* previous buffer in scan */ + Buffer rs_cbuf; /* current buffer in scan */ + Buffer rs_nbuf; /* next buffer in scan */ + ItemPointerData rs_mptid; /* marked previous tid */ + ItemPointerData rs_mctid; /* marked current tid */ + ItemPointerData rs_mntid; /* marked next tid */ + ItemPointerData rs_mcd; /* marked current delta XXX ??? */ + bool rs_atend; /* restart scan at end? */ + TimeQual rs_tr; /* time qualification */ + uint16 rs_cdelta; /* current delta in chain */ + uint16 rs_nkeys; /* number of attributes in keys */ + ScanKey rs_key; /* key descriptors */ +} HeapScanDescData; + +typedef HeapScanDescData *HeapScanDesc; + +typedef struct IndexScanDescData { + Relation relation; /* relation descriptor */ + void *opaque; /* am-specific slot */ + ItemPointerData previousItemData; /* previous index pointer */ + ItemPointerData currentItemData; /* current index pointer */ + ItemPointerData nextItemData; /* next index pointer */ + MarkData previousMarkData; /* marked previous pointer */ + MarkData currentMarkData; /* marked current pointer */ + MarkData nextMarkData; /* marked next pointer */ + uint8 flags; /* scan position flags */ + bool scanFromEnd; /* restart scan at end? */ + uint16 numberOfKeys; /* number of key attributes */ + ScanKey keyData; /* key descriptor */ +} IndexScanDescData; + +typedef IndexScanDescData *IndexScanDesc; + +/* ---------------- + * IndexScanDescPtr is used in the executor where we have to + * keep track of several index scans when using several indices + * - cim 9/10/89 + * ---------------- + */ +typedef IndexScanDesc *IndexScanDescPtr; + +/* + * HeapScanIsValid -- + * True iff the heap scan is valid. + */ +#define HeapScanIsValid(scan) PointerIsValid(scan) + +/* + * IndexScanIsValid -- + * True iff the index scan is valid. + */ +#define IndexScanIsValid(scan) PointerIsValid(scan) + +#endif /* RELSCAN_H */ diff --git a/src/backend/access/rtree.h b/src/backend/access/rtree.h new file mode 100644 index 00000000000..79f1622e48b --- /dev/null +++ b/src/backend/access/rtree.h @@ -0,0 +1,98 @@ +/*------------------------------------------------------------------------- + * + * rtree.h-- + * common declarations for the rtree access method code. 
+ * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: rtree.h,v 1.1.1.1 1996/07/09 06:21:08 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef RTREE_H +#define RTREE_H + +/* see rtstrat.c for what all this is about */ +#define RTNStrategies 8 +#define RTLeftStrategyNumber 1 +#define RTOverLeftStrategyNumber 2 +#define RTOverlapStrategyNumber 3 +#define RTOverRightStrategyNumber 4 +#define RTRightStrategyNumber 5 +#define RTSameStrategyNumber 6 +#define RTContainsStrategyNumber 7 +#define RTContainedByStrategyNumber 8 + +#define RTNProcs 3 +#define RT_UNION_PROC 1 +#define RT_INTER_PROC 2 +#define RT_SIZE_PROC 3 + +#define F_LEAF (1 << 0) + +typedef struct RTreePageOpaqueData { + uint32 flags; +} RTreePageOpaqueData; + +typedef RTreePageOpaqueData *RTreePageOpaque; + +/* + * When we descend a tree, we keep a stack of parent pointers. + */ + +typedef struct RTSTACK { + struct RTSTACK *rts_parent; + OffsetNumber rts_child; + BlockNumber rts_blk; +} RTSTACK; + +/* + * When we're doing a scan, we need to keep track of the parent stack + * for the marked and current items. Also, rtrees have the following + * property: if you're looking for the box (1,1,2,2), on the internal + * nodes you have to search for all boxes that *contain* (1,1,2,2), and + * not the ones that match it. We have a private scan key for internal + * nodes in the opaque structure for rtrees for this reason. See + * access/index-rtree/rtscan.c and rtstrat.c for how it gets initialized. + */ + +typedef struct RTreeScanOpaqueData { + struct RTSTACK *s_stack; + struct RTSTACK *s_markstk; + uint16 s_flags; + uint16 s_internalNKey; + ScanKey s_internalKey; +} RTreeScanOpaqueData; + +typedef RTreeScanOpaqueData *RTreeScanOpaque; + +/* + * When we're doing a scan and updating a tree at the same time, the + * updates may affect the scan. We use the flags entry of the scan's + * opaque space to record our actual position in response to updates + * that we can't handle simply by adjusting pointers. + */ + +#define RTS_CURBEFORE ((uint16) (1 << 0)) +#define RTS_MRKBEFORE ((uint16) (1 << 1)) + +/* root page of an rtree */ +#define P_ROOT 0 + +/* + * When we update a relation on which we're doing a scan, we need to + * check the scan and fix it if the update affected any of the pages it + * touches. Otherwise, we can miss records that we should see. The only + * times we need to do this are for deletions and splits. See the code in + * rtscan.c for how the scan is fixed. These two contants tell us what sort + * of operation changed the index. 
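+ *
+ * e.g. for a deletion (a sketch of the RTOP_DEL case): removing
+ * the tuple at offset k compacts the page's line pointer array, so
+ * a registered scan positioned at offset n >= k on that page must
+ * back its pointer up one slot, roughly
+ *
+ *	if (n >= k)
+ *		n = OffsetNumberPrev(n);
+ *
+ * with RTS_CURBEFORE / RTS_MRKBEFORE recording the cases where the
+ * adjusted pointer now sits just *before* the next tuple to return.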
+ */ + +#define RTOP_DEL 0 +#define RTOP_SPLIT 1 + +/* defined in rtree.c */ +extern void freestack(RTSTACK *s); + +#endif /* RTREE_H */ diff --git a/src/backend/access/rtree/Makefile.inc b/src/backend/access/rtree/Makefile.inc new file mode 100644 index 00000000000..a93a5e53290 --- /dev/null +++ b/src/backend/access/rtree/Makefile.inc @@ -0,0 +1,14 @@ +#------------------------------------------------------------------------- +# +# Makefile.inc-- +# Makefile for access/rtree (R-Tree access method) +# +# Copyright (c) 1994, Regents of the University of California +# +# +# IDENTIFICATION +# $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $ +# +#------------------------------------------------------------------------- + +SUBSRCS+= rtget.c rtproc.c rtree.c rtscan.c rtstrat.c diff --git a/src/backend/access/rtree/rtget.c b/src/backend/access/rtree/rtget.c new file mode 100644 index 00000000000..fb2e169297d --- /dev/null +++ b/src/backend/access/rtree/rtget.c @@ -0,0 +1,320 @@ +/*------------------------------------------------------------------------- + * + * rtget.c-- + * fetch tuples from an rtree scan. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtget.c,v 1.1.1.1 1996/07/09 06:21:13 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/bufmgr.h" +#include "storage/bufpage.h" + +#include "utils/elog.h" +#include "utils/palloc.h" +#include "utils/rel.h" + +#include "access/heapam.h" +#include "access/genam.h" +#include "access/iqual.h" +#include "access/rtree.h" +#include "access/sdir.h" + +static OffsetNumber findnext(IndexScanDesc s, Page p, OffsetNumber n, + ScanDirection dir); +static RetrieveIndexResult rtscancache(IndexScanDesc s, ScanDirection dir); +static RetrieveIndexResult rtfirst(IndexScanDesc s, ScanDirection dir); +static RetrieveIndexResult rtnext(IndexScanDesc s, ScanDirection dir); +static ItemPointer rtheapptr(Relation r, ItemPointer itemp); + + +RetrieveIndexResult +rtgettuple(IndexScanDesc s, ScanDirection dir) +{ + RetrieveIndexResult res; + + /* if we have it cached in the scan desc, just return the value */ + if ((res = rtscancache(s, dir)) != (RetrieveIndexResult) NULL) + return (res); + + /* not cached, so we'll have to do some work */ + if (ItemPointerIsValid(&(s->currentItemData))) { + res = rtnext(s, dir); + } else { + res = rtfirst(s, dir); + } + return (res); +} + +static RetrieveIndexResult +rtfirst(IndexScanDesc s, ScanDirection dir) +{ + Buffer b; + Page p; + OffsetNumber n; + OffsetNumber maxoff; + RetrieveIndexResult res; + RTreePageOpaque po; + RTreeScanOpaque so; + RTSTACK *stk; + BlockNumber blk; + IndexTuple it; + ItemPointer ip; + + b = ReadBuffer(s->relation, P_ROOT); + p = BufferGetPage(b); + po = (RTreePageOpaque) PageGetSpecialPointer(p); + so = (RTreeScanOpaque) s->opaque; + + for (;;) { + maxoff = PageGetMaxOffsetNumber(p); + if (ScanDirectionIsBackward(dir)) + n = findnext(s, p, maxoff, dir); + else + n = findnext(s, p, FirstOffsetNumber, dir); + + while (n < FirstOffsetNumber || n > maxoff) { + + ReleaseBuffer(b); + if (so->s_stack == (RTSTACK *) NULL) + return ((RetrieveIndexResult) NULL); + + stk = so->s_stack; + b = ReadBuffer(s->relation, stk->rts_blk); + p = BufferGetPage(b); + po = (RTreePageOpaque) PageGetSpecialPointer(p); + maxoff = PageGetMaxOffsetNumber(p); + + if 
(ScanDirectionIsBackward(dir)) { + n = OffsetNumberPrev(stk->rts_child); + } else { + n = OffsetNumberNext(stk->rts_child); + } + so->s_stack = stk->rts_parent; + pfree(stk); + + n = findnext(s, p, n, dir); + } + if (po->flags & F_LEAF) { + ItemPointerSet(&(s->currentItemData), BufferGetBlockNumber(b), n); + + it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n)); + ip = (ItemPointer) palloc(sizeof(ItemPointerData)); + memmove((char *) ip, (char *) &(it->t_tid), + sizeof(ItemPointerData)); + ReleaseBuffer(b); + + res = FormRetrieveIndexResult(&(s->currentItemData), ip); + + return (res); + } else { + stk = (RTSTACK *) palloc(sizeof(RTSTACK)); + stk->rts_child = n; + stk->rts_blk = BufferGetBlockNumber(b); + stk->rts_parent = so->s_stack; + so->s_stack = stk; + + it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n)); + blk = ItemPointerGetBlockNumber(&(it->t_tid)); + + ReleaseBuffer(b); + b = ReadBuffer(s->relation, blk); + p = BufferGetPage(b); + po = (RTreePageOpaque) PageGetSpecialPointer(p); + } + } +} + +static RetrieveIndexResult +rtnext(IndexScanDesc s, ScanDirection dir) +{ + Buffer b; + Page p; + OffsetNumber n; + OffsetNumber maxoff; + RetrieveIndexResult res; + RTreePageOpaque po; + RTreeScanOpaque so; + RTSTACK *stk; + BlockNumber blk; + IndexTuple it; + ItemPointer ip; + + blk = ItemPointerGetBlockNumber(&(s->currentItemData)); + n = ItemPointerGetOffsetNumber(&(s->currentItemData)); + + if (ScanDirectionIsForward(dir)) { + n = OffsetNumberNext(n); + } else { + n = OffsetNumberPrev(n); + } + + b = ReadBuffer(s->relation, blk); + p = BufferGetPage(b); + po = (RTreePageOpaque) PageGetSpecialPointer(p); + so = (RTreeScanOpaque) s->opaque; + + for (;;) { + maxoff = PageGetMaxOffsetNumber(p); + n = findnext(s, p, n, dir); + + while (n < FirstOffsetNumber || n > maxoff) { + + ReleaseBuffer(b); + if (so->s_stack == (RTSTACK *) NULL) + return ((RetrieveIndexResult) NULL); + + stk = so->s_stack; + b = ReadBuffer(s->relation, stk->rts_blk); + p = BufferGetPage(b); + maxoff = PageGetMaxOffsetNumber(p); + po = (RTreePageOpaque) PageGetSpecialPointer(p); + + if (ScanDirectionIsBackward(dir)) { + n = OffsetNumberPrev(stk->rts_child); + } else { + n = OffsetNumberNext(stk->rts_child); + } + so->s_stack = stk->rts_parent; + pfree(stk); + + n = findnext(s, p, n, dir); + } + if (po->flags & F_LEAF) { + ItemPointerSet(&(s->currentItemData), BufferGetBlockNumber(b), n); + + it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n)); + ip = (ItemPointer) palloc(sizeof(ItemPointerData)); + memmove((char *) ip, (char *) &(it->t_tid), + sizeof(ItemPointerData)); + ReleaseBuffer(b); + + res = FormRetrieveIndexResult(&(s->currentItemData), ip); + + return (res); + } else { + stk = (RTSTACK *) palloc(sizeof(RTSTACK)); + stk->rts_child = n; + stk->rts_blk = BufferGetBlockNumber(b); + stk->rts_parent = so->s_stack; + so->s_stack = stk; + + it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n)); + blk = ItemPointerGetBlockNumber(&(it->t_tid)); + + ReleaseBuffer(b); + b = ReadBuffer(s->relation, blk); + p = BufferGetPage(b); + po = (RTreePageOpaque) PageGetSpecialPointer(p); + + if (ScanDirectionIsBackward(dir)) { + n = PageGetMaxOffsetNumber(p); + } else { + n = FirstOffsetNumber; + } + } + } +} + +static OffsetNumber +findnext(IndexScanDesc s, Page p, OffsetNumber n, ScanDirection dir) +{ + OffsetNumber maxoff; + IndexTuple it; + RTreePageOpaque po; + RTreeScanOpaque so; + + maxoff = PageGetMaxOffsetNumber(p); + po = (RTreePageOpaque) PageGetSpecialPointer(p); + so = (RTreeScanOpaque) s->opaque; + + /* + * If we 
modified the index during the scan, we may have a pointer to + * a ghost tuple, before the scan. If this is the case, back up one. + */ + + if (so->s_flags & RTS_CURBEFORE) { + so->s_flags &= ~RTS_CURBEFORE; + n = OffsetNumberPrev(n); + } + + while (n >= FirstOffsetNumber && n <= maxoff) { + it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n)); + if (po->flags & F_LEAF) { + if (index_keytest(it, + RelationGetTupleDescriptor(s->relation), + s->numberOfKeys, s->keyData)) + break; + } else { + if (index_keytest(it, + RelationGetTupleDescriptor(s->relation), + so->s_internalNKey, so->s_internalKey)) + break; + } + + if (ScanDirectionIsBackward(dir)) { + n = OffsetNumberPrev(n); + } else { + n = OffsetNumberNext(n); + } + } + + return (n); +} + +static RetrieveIndexResult +rtscancache(IndexScanDesc s, ScanDirection dir) +{ + RetrieveIndexResult res; + ItemPointer ip; + + if (!(ScanDirectionIsNoMovement(dir) + && ItemPointerIsValid(&(s->currentItemData)))) { + + return ((RetrieveIndexResult) NULL); + } + + ip = rtheapptr(s->relation, &(s->currentItemData)); + + if (ItemPointerIsValid(ip)) + res = FormRetrieveIndexResult(&(s->currentItemData), ip); + else + res = (RetrieveIndexResult) NULL; + + return (res); +} + +/* + * rtheapptr returns the item pointer to the tuple in the heap relation + * for which itemp is the index relation item pointer. + */ +static ItemPointer +rtheapptr(Relation r, ItemPointer itemp) +{ + Buffer b; + Page p; + IndexTuple it; + ItemPointer ip; + OffsetNumber n; + + ip = (ItemPointer) palloc(sizeof(ItemPointerData)); + if (ItemPointerIsValid(itemp)) { + b = ReadBuffer(r, ItemPointerGetBlockNumber(itemp)); + p = BufferGetPage(b); + n = ItemPointerGetOffsetNumber(itemp); + it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n)); + memmove((char *) ip, (char *) &(it->t_tid), + sizeof(ItemPointerData)); + ReleaseBuffer(b); + } else { + ItemPointerSetInvalid(ip); + } + + return (ip); +} diff --git a/src/backend/access/rtree/rtproc.c b/src/backend/access/rtree/rtproc.c new file mode 100644 index 00000000000..a2f7bef46b4 --- /dev/null +++ b/src/backend/access/rtree/rtproc.c @@ -0,0 +1,150 @@ +/*------------------------------------------------------------------------- + * + * rtproc.c-- + * pg_amproc entries for rtrees. 
+ * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtproc.c,v 1.1.1.1 1996/07/09 06:21:13 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include <math.h> +#include <string.h> + +#include "postgres.h" + +#include "utils/elog.h" +#include "utils/geo-decls.h" +#include "utils/palloc.h" + +BOX +*rt_box_union(BOX *a, BOX *b) +{ + BOX *n; + + if ((n = (BOX *) palloc(sizeof (*n))) == (BOX *) NULL) + elog(WARN, "Cannot allocate box for union"); + + n->xh = Max(a->xh, b->xh); + n->yh = Max(a->yh, b->yh); + n->xl = Min(a->xl, b->xl); + n->yl = Min(a->yl, b->yl); + + return (n); +} + +BOX * +rt_box_inter(BOX *a, BOX *b) +{ + BOX *n; + + if ((n = (BOX *) palloc(sizeof (*n))) == (BOX *) NULL) + elog(WARN, "Cannot allocate box for union"); + + n->xh = Min(a->xh, b->xh); + n->yh = Min(a->yh, b->yh); + n->xl = Max(a->xl, b->xl); + n->yl = Max(a->yl, b->yl); + + if (n->xh < n->xl || n->yh < n->yl) { + pfree(n); + return ((BOX *) NULL); + } + + return (n); +} + +void +rt_box_size(BOX *a, float *size) +{ + if (a == (BOX *) NULL || a->xh <= a->xl || a->yh <= a->yl) + *size = 0.0; + else + *size = (float) ((a->xh - a->xl) * (a->yh - a->yl)); + + return; +} + +/* + * rt_bigbox_size() -- Compute a size for big boxes. + * + * In an earlier release of the system, this routine did something + * different from rt_box_size. We now use floats, rather than ints, + * as the return type for the size routine, so we no longer need to + * have a special return type for big boxes. + */ +void +rt_bigbox_size(BOX *a, float *size) +{ + rt_box_size(a, size); +} + +POLYGON * +rt_poly_union(POLYGON *a, POLYGON *b) +{ + POLYGON *p; + + p = (POLYGON *)PALLOCTYPE(POLYGON); + + if (!PointerIsValid(p)) + elog(WARN, "Cannot allocate polygon for union"); + + memset((char *) p, 0, sizeof(POLYGON)); /* zero any holes */ + p->size = sizeof(POLYGON); + p->npts = 0; + p->boundbox.xh = Max(a->boundbox.xh, b->boundbox.xh); + p->boundbox.yh = Max(a->boundbox.yh, b->boundbox.yh); + p->boundbox.xl = Min(a->boundbox.xl, b->boundbox.xl); + p->boundbox.yl = Min(a->boundbox.yl, b->boundbox.yl); + return p; +} + +void +rt_poly_size(POLYGON *a, float *size) +{ + double xdim, ydim; + + size = (float *) palloc(sizeof(float)); + if (a == (POLYGON *) NULL || + a->boundbox.xh <= a->boundbox.xl || + a->boundbox.yh <= a->boundbox.yl) + *size = 0.0; + else { + xdim = (a->boundbox.xh - a->boundbox.xl); + ydim = (a->boundbox.yh - a->boundbox.yl); + + *size = (float) (xdim * ydim); + } + + return; +} + +POLYGON * +rt_poly_inter(POLYGON *a, POLYGON *b) +{ + POLYGON *p; + + p = (POLYGON *) PALLOCTYPE(POLYGON); + + if (!PointerIsValid(p)) + elog(WARN, "Cannot allocate polygon for intersection"); + + memset((char *) p, 0, sizeof(POLYGON)); /* zero any holes */ + p->size = sizeof(POLYGON); + p->npts = 0; + p->boundbox.xh = Min(a->boundbox.xh, b->boundbox.xh); + p->boundbox.yh = Min(a->boundbox.yh, b->boundbox.yh); + p->boundbox.xl = Max(a->boundbox.xl, b->boundbox.xl); + p->boundbox.yl = Max(a->boundbox.yl, b->boundbox.yl); + + if (p->boundbox.xh < p->boundbox.xl || p->boundbox.yh < p->boundbox.yl) + { + pfree(p); + return ((POLYGON *) NULL); + } + + return (p); +} diff --git a/src/backend/access/rtree/rtree.c b/src/backend/access/rtree/rtree.c new file mode 100644 index 00000000000..96efc3bc90b --- /dev/null +++ b/src/backend/access/rtree/rtree.c @@ -0,0 +1,955 @@ 
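+/*
+ * (a worked example of the rtproc.c support procs this file calls
+ * through RTSTATE, with hypothetical coordinates:
+ *
+ *	a = (0,0)-(2,2), b = (1,1)-(3,3)
+ *	rt_box_union(a, b) = (0,0)-(3,3), size 9.0
+ *	rt_box_inter(a, b) = (1,1)-(2,2), size 1.0
+ *
+ * picksplit() below scores the cost of keeping a and b together as
+ * size(union) - size(inter) = 8.0 and seeds the split with the
+ * worst such pair.)
+ */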
+/*------------------------------------------------------------------------- + * + * rtree.c-- + * interface routines for the postgres rtree indexed access method. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtree.c,v 1.1.1.1 1996/07/09 06:21:13 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/bufmgr.h" +#include "storage/bufpage.h" + +#include "utils/elog.h" +#include "utils/palloc.h" +#include "utils/rel.h" +#include "utils/excid.h" + +#include "access/heapam.h" +#include "access/genam.h" +#include "access/rtree.h" +#include "access/rtscan.h" +#include "access/funcindex.h" +#include "access/tupdesc.h" + +#include "nodes/execnodes.h" +#include "nodes/plannodes.h" + +#include "executor/executor.h" +#include "executor/tuptable.h" + +#include "catalog/index.h" + +typedef struct SPLITVEC { + OffsetNumber *spl_left; + int spl_nleft; + char *spl_ldatum; + OffsetNumber *spl_right; + int spl_nright; + char *spl_rdatum; +} SPLITVEC; + +typedef struct RTSTATE { + func_ptr unionFn; /* union function */ + func_ptr sizeFn; /* size function */ + func_ptr interFn; /* intersection function */ +} RTSTATE; + +/* non-export function prototypes */ +static InsertIndexResult rtdoinsert(Relation r, IndexTuple itup, + RTSTATE *rtstate); +static void rttighten(Relation r, RTSTACK *stk, char *datum, int att_size, + RTSTATE *rtstate); +static InsertIndexResult dosplit(Relation r, Buffer buffer, RTSTACK *stack, + IndexTuple itup, RTSTATE *rtstate); +static void rtintinsert(Relation r, RTSTACK *stk, IndexTuple ltup, + IndexTuple rtup, RTSTATE *rtstate); +static void rtnewroot(Relation r, IndexTuple lt, IndexTuple rt); +static void picksplit(Relation r, Page page, SPLITVEC *v, IndexTuple itup, + RTSTATE *rtstate); +static void RTInitBuffer(Buffer b, uint32 f); +static OffsetNumber choose(Relation r, Page p, IndexTuple it, + RTSTATE *rtstate); +static int nospace(Page p, IndexTuple it); +static void initRtstate(RTSTATE *rtstate, Relation index); + + +void +rtbuild(Relation heap, + Relation index, + int natts, + AttrNumber *attnum, + IndexStrategy istrat, + uint16 pcount, + Datum *params, + FuncIndexInfo *finfo, + PredInfo *predInfo) +{ + HeapScanDesc scan; + Buffer buffer; + AttrNumber i; + HeapTuple htup; + IndexTuple itup; + TupleDesc hd, id; + InsertIndexResult res; + Datum *d; + bool *nulls; + int nb, nh, ni; + ExprContext *econtext; + TupleTable tupleTable; + TupleTableSlot *slot; + Oid hrelid, irelid; + Node *pred, *oldPred; + RTSTATE rtState; + + initRtstate(&rtState, index); + + /* rtrees only know how to do stupid locking now */ + RelationSetLockForWrite(index); + + pred = predInfo->pred; + oldPred = predInfo->oldPred; + + /* + * We expect to be called exactly once for any index relation. + * If that's not the case, big trouble's what we have. 
+ */ + + if (oldPred == NULL && (nb = RelationGetNumberOfBlocks(index)) != 0) + elog(WARN, "%s already contains data", index->rd_rel->relname.data); + + /* initialize the root page (if this is a new index) */ + if (oldPred == NULL) { + buffer = ReadBuffer(index, P_NEW); + RTInitBuffer(buffer, F_LEAF); + WriteBuffer(buffer); + } + + /* init the tuple descriptors and get set for a heap scan */ + hd = RelationGetTupleDescriptor(heap); + id = RelationGetTupleDescriptor(index); + d = (Datum *)palloc(natts * sizeof (*d)); + nulls = (bool *)palloc(natts * sizeof (*nulls)); + + /* + * If this is a predicate (partial) index, we will need to evaluate the + * predicate using ExecQual, which requires the current tuple to be in a + * slot of a TupleTable. In addition, ExecQual must have an ExprContext + * referring to that slot. Here, we initialize dummy TupleTable and + * ExprContext objects for this purpose. --Nels, Feb '92 + */ +#ifndef OMIT_PARTIAL_INDEX + if (pred != NULL || oldPred != NULL) { + tupleTable = ExecCreateTupleTable(1); + slot = ExecAllocTableSlot(tupleTable); + econtext = makeNode(ExprContext); + FillDummyExprContext(econtext, slot, hd, buffer); + } +#endif /* OMIT_PARTIAL_INDEX */ + scan = heap_beginscan(heap, 0, NowTimeQual, 0, (ScanKey) NULL); + htup = heap_getnext(scan, 0, &buffer); + + /* count the tuples as we insert them */ + nh = ni = 0; + + for (; HeapTupleIsValid(htup); htup = heap_getnext(scan, 0, &buffer)) { + + nh++; + + /* + * If oldPred != NULL, this is an EXTEND INDEX command, so skip + * this tuple if it was already in the existing partial index + */ + if (oldPred != NULL) { +#ifndef OMIT_PARTIAL_INDEX + /*SetSlotContents(slot, htup); */ + slot->val = htup; + if (ExecQual((List*)oldPred, econtext) == true) { + ni++; + continue; + } +#endif /* OMIT_PARTIAL_INDEX */ + } + + /* Skip this tuple if it doesn't satisfy the partial-index predicate */ + if (pred != NULL) { +#ifndef OMIT_PARTIAL_INDEX + /*SetSlotContents(slot, htup); */ + slot->val = htup; + if (ExecQual((List*)pred, econtext) == false) + continue; +#endif /* OMIT_PARTIAL_INDEX */ + } + + ni++; + + /* + * For the current heap tuple, extract all the attributes + * we use in this index, and note which are null. + */ + + for (i = 1; i <= natts; i++) { + int attoff; + bool attnull; + + /* + * Offsets are from the start of the tuple, and are + * zero-based; indices are one-based. The next call + * returns i - 1. That's data hiding for you. + */ + + attoff = AttrNumberGetAttrOffset(i); + /* + d[attoff] = HeapTupleGetAttributeValue(htup, buffer, + */ + d[attoff] = GetIndexValue(htup, + hd, + attoff, + attnum, + finfo, + &attnull, + buffer); + nulls[attoff] = (attnull ? 'n' : ' '); + } + + /* form an index tuple and point it at the heap tuple */ + itup = index_formtuple(id, &d[0], nulls); + itup->t_tid = htup->t_ctid; + + /* + * Since we already have the index relation locked, we + * call rtdoinsert directly. Normal access method calls + * dispatch through rtinsert, which locks the relation + * for write. This is the right thing to do if you're + * inserting single tups, but not when you're initializing + * the whole index at once. 
+ */ + + res = rtdoinsert(index, itup, &rtState); + pfree(itup); + pfree(res); + } + + /* okay, all heap tuples are indexed */ + heap_endscan(scan); + RelationUnsetLockForWrite(index); + + if (pred != NULL || oldPred != NULL) { +#ifndef OMIT_PARTIAL_INDEX + ExecDestroyTupleTable(tupleTable, true); + pfree(econtext); +#endif /* OMIT_PARTIAL_INDEX */ + } + + /* + * Since we just counted the tuples in the heap, we update its + * stats in pg_relation to guarantee that the planner takes + * advantage of the index we just created. UpdateStats() does a + * CommandCounterIncrement(), which flushes changed entries from + * the system relcache. The act of constructing an index changes + * these heap and index tuples in the system catalogs, so they + * need to be flushed. We close them to guarantee that they + * will be. + */ + + hrelid = heap->rd_id; + irelid = index->rd_id; + heap_close(heap); + index_close(index); + + UpdateStats(hrelid, nh, true); + UpdateStats(irelid, ni, false); + + if (oldPred != NULL) { + if (ni == nh) pred = NULL; + UpdateIndexPredicate(irelid, oldPred, pred); + } + + /* be tidy */ + pfree(nulls); + pfree(d); +} + +/* + * rtinsert -- wrapper for rtree tuple insertion. + * + * This is the public interface routine for tuple insertion in rtrees. + * It doesn't do any work; just locks the relation and passes the buck. + */ +InsertIndexResult +rtinsert(Relation r, IndexTuple itup) +{ + InsertIndexResult res; + RTSTATE rtState; + + initRtstate(&rtState, r); + + RelationSetLockForWrite(r); + res = rtdoinsert(r, itup, &rtState); + + /* XXX two-phase locking -- don't unlock the relation until EOT */ + return (res); +} + +static InsertIndexResult +rtdoinsert(Relation r, IndexTuple itup, RTSTATE *rtstate) +{ + Page page; + Buffer buffer; + BlockNumber blk; + IndexTuple which; + OffsetNumber l; + RTSTACK *stack; + InsertIndexResult res; + RTreePageOpaque opaque; + char *datum; + + blk = P_ROOT; + buffer = InvalidBuffer; + stack = (RTSTACK *) NULL; + + do { + /* let go of current buffer before getting next */ + if (buffer != InvalidBuffer) + ReleaseBuffer(buffer); + + /* get next buffer */ + buffer = ReadBuffer(r, blk); + page = (Page) BufferGetPage(buffer); + + opaque = (RTreePageOpaque) PageGetSpecialPointer(page); + if (!(opaque->flags & F_LEAF)) { + RTSTACK *n; + ItemId iid; + + n = (RTSTACK *) palloc(sizeof(RTSTACK)); + n->rts_parent = stack; + n->rts_blk = blk; + n->rts_child = choose(r, page, itup, rtstate); + stack = n; + + iid = PageGetItemId(page, n->rts_child); + which = (IndexTuple) PageGetItem(page, iid); + blk = ItemPointerGetBlockNumber(&(which->t_tid)); + } + } while (!(opaque->flags & F_LEAF)); + + if (nospace(page, itup)) { + /* need to do a split */ + res = dosplit(r, buffer, stack, itup, rtstate); + freestack(stack); + WriteBuffer(buffer); /* don't forget to release buffer! 
*/ + return (res); + } + + /* add the item and write the buffer */ + if (PageIsEmpty(page)) { + l = PageAddItem(page, (Item) itup, IndexTupleSize(itup), + FirstOffsetNumber, + LP_USED); + } else { + l = PageAddItem(page, (Item) itup, IndexTupleSize(itup), + OffsetNumberNext(PageGetMaxOffsetNumber(page)), + LP_USED); + } + + WriteBuffer(buffer); + + datum = (((char *) itup) + sizeof(IndexTupleData)); + + /* now expand the page boundary in the parent to include the new child */ + rttighten(r, stack, datum, + (IndexTupleSize(itup) - sizeof(IndexTupleData)), rtstate); + freestack(stack); + + /* build and return an InsertIndexResult for this insertion */ + res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData)); + ItemPointerSet(&(res->pointerData), blk, l); + + return (res); +} + +static void +rttighten(Relation r, + RTSTACK *stk, + char *datum, + int att_size, + RTSTATE *rtstate) +{ + char *oldud; + char *tdatum; + Page p; + float old_size, newd_size; + Buffer b; + + if (stk == (RTSTACK *) NULL) + return; + + b = ReadBuffer(r, stk->rts_blk); + p = BufferGetPage(b); + + oldud = (char *) PageGetItem(p, PageGetItemId(p, stk->rts_child)); + oldud += sizeof(IndexTupleData); + + (*rtstate->sizeFn)(oldud, &old_size); + datum = (char *) (*rtstate->unionFn)(oldud, datum); + + (*rtstate->sizeFn)(datum, &newd_size); + + if (newd_size != old_size) { + TupleDesc td = RelationGetTupleDescriptor(r); + + if (td->attrs[0]->attlen < 0) { + /* + * This is an internal page, so 'oldud' had better be a + * union (constant-length) key, too. (See comment below.) + */ + Assert(VARSIZE(datum) == VARSIZE(oldud)); + memmove(oldud, datum, VARSIZE(datum)); + } else { + memmove(oldud, datum, att_size); + } + WriteBuffer(b); + + /* + * The user may be defining an index on variable-sized data (like + * polygons). If so, we need to get a constant-sized datum for + * insertion on the internal page. We do this by calling the union + * proc, which is guaranteed to return a rectangle. + */ + + tdatum = (char *) (*rtstate->unionFn)(datum, datum); + rttighten(r, stk->rts_parent, tdatum, att_size, rtstate); + pfree(tdatum); + } else { + ReleaseBuffer(b); + } + pfree(datum); +} + +/* + * dosplit -- split a page in the tree. + * + * This is the quadratic-cost split algorithm Guttman describes in + * his paper. The reason we chose it is that you can implement this + * with less information about the data types on which you're operating. + */ +static InsertIndexResult +dosplit(Relation r, + Buffer buffer, + RTSTACK *stack, + IndexTuple itup, + RTSTATE *rtstate) +{ + Page p; + Buffer leftbuf, rightbuf; + Page left, right; + ItemId itemid; + IndexTuple item; + IndexTuple ltup, rtup; + OffsetNumber maxoff; + OffsetNumber i; + OffsetNumber leftoff, rightoff; + BlockNumber lbknum, rbknum; + BlockNumber bufblock; + RTreePageOpaque opaque; + int blank; + InsertIndexResult res; + char *isnull; + SPLITVEC v; + TupleDesc tupDesc; + + isnull = (char *) palloc(r->rd_rel->relnatts); + for (blank = 0; blank < r->rd_rel->relnatts; blank++) + isnull[blank] = ' '; + p = (Page) BufferGetPage(buffer); + opaque = (RTreePageOpaque) PageGetSpecialPointer(p); + + /* + * The root of the tree is the first block in the relation. If + * we're about to split the root, we need to do some hocus-pocus + * to enforce this guarantee. 
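+ *
+ * concretely (a sketch): when block P_ROOT splits, *both* halves
+ * go to newly allocated pages, the old root's contents are thrown
+ * away, and rtnewroot() rewrites block P_ROOT with exactly two
+ * items pointing at the new left and right pages -- so the root
+ * never moves.  a non-root split instead reuses the overflowing
+ * page itself as the left half, via a temp-page copy.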
+ */ + + if (BufferGetBlockNumber(buffer) == P_ROOT) { + leftbuf = ReadBuffer(r, P_NEW); + RTInitBuffer(leftbuf, opaque->flags); + lbknum = BufferGetBlockNumber(leftbuf); + left = (Page) BufferGetPage(leftbuf); + } else { + leftbuf = buffer; + IncrBufferRefCount(buffer); + lbknum = BufferGetBlockNumber(buffer); + left = (Page) PageGetTempPage(p, sizeof(RTreePageOpaqueData)); + } + + rightbuf = ReadBuffer(r, P_NEW); + RTInitBuffer(rightbuf, opaque->flags); + rbknum = BufferGetBlockNumber(rightbuf); + right = (Page) BufferGetPage(rightbuf); + + picksplit(r, p, &v, itup, rtstate); + + leftoff = rightoff = FirstOffsetNumber; + maxoff = PageGetMaxOffsetNumber(p); + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { + itemid = PageGetItemId(p, i); + item = (IndexTuple) PageGetItem(p, itemid); + + if (i == *(v.spl_left)) { + (void) PageAddItem(left, (Item) item, IndexTupleSize(item), + leftoff, LP_USED); + leftoff = OffsetNumberNext(leftoff); + v.spl_left++; /* advance in left split vector */ + } else { + (void) PageAddItem(right, (Item) item, IndexTupleSize(item), + rightoff, LP_USED); + rightoff = OffsetNumberNext(rightoff); + v.spl_right++; /* advance in right split vector */ + } + } + + /* build an InsertIndexResult for this insertion */ + res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData)); + + /* now insert the new index tuple */ + if (*(v.spl_left) != FirstOffsetNumber) { + (void) PageAddItem(left, (Item) itup, IndexTupleSize(itup), + leftoff, LP_USED); + leftoff = OffsetNumberNext(leftoff); + ItemPointerSet(&(res->pointerData), lbknum, leftoff); + } else { + (void) PageAddItem(right, (Item) itup, IndexTupleSize(itup), + rightoff, LP_USED); + rightoff = OffsetNumberNext(rightoff); + ItemPointerSet(&(res->pointerData), rbknum, rightoff); + } + + if ((bufblock = BufferGetBlockNumber(buffer)) != P_ROOT) { + PageRestoreTempPage(left, p); + } + WriteBuffer(leftbuf); + WriteBuffer(rightbuf); + + /* + * Okay, the page is split. We have three things left to do: + * + * 1) Adjust any active scans on this index to cope with changes + * we introduced in its structure by splitting this page. + * + * 2) "Tighten" the bounding box of the pointer to the left + * page in the parent node in the tree, if any. Since we + * moved a bunch of stuff off the left page, we expect it + * to get smaller. This happens in the internal insertion + * routine. + * + * 3) Insert a pointer to the right page in the parent. This + * may cause the parent to split. If it does, we need to + * repeat steps one and two for each split node in the tree. 
+ */ + + /* adjust active scans */ + rtadjscans(r, RTOP_SPLIT, bufblock, FirstOffsetNumber); + + tupDesc = r->rd_att; + ltup = (IndexTuple) index_formtuple(tupDesc, + (Datum *) &(v.spl_ldatum), isnull); + rtup = (IndexTuple) index_formtuple(tupDesc, + (Datum *) &(v.spl_rdatum), isnull); + pfree(isnull); + + /* set pointers to new child pages in the internal index tuples */ + ItemPointerSet(&(ltup->t_tid), lbknum, 1); + ItemPointerSet(&(rtup->t_tid), rbknum, 1); + + rtintinsert(r, stack, ltup, rtup, rtstate); + + pfree(ltup); + pfree(rtup); + + return (res); +} + +static void +rtintinsert(Relation r, + RTSTACK *stk, + IndexTuple ltup, + IndexTuple rtup, + RTSTATE *rtstate) +{ + IndexTuple old; + Buffer b; + Page p; + char *ldatum, *rdatum, *newdatum; + InsertIndexResult res; + + if (stk == (RTSTACK *) NULL) { + rtnewroot(r, ltup, rtup); + return; + } + + b = ReadBuffer(r, stk->rts_blk); + p = BufferGetPage(b); + old = (IndexTuple) PageGetItem(p, PageGetItemId(p, stk->rts_child)); + + /* + * This is a hack. Right now, we force rtree keys to be constant size. + * To fix this, need delete the old key and add both left and right + * for the two new pages. The insertion of left may force a split if + * the new left key is bigger than the old key. + */ + + if (IndexTupleSize(old) != IndexTupleSize(ltup)) + elog(WARN, "Variable-length rtree keys are not supported."); + + /* install pointer to left child */ + memmove(old, ltup,IndexTupleSize(ltup)); + + if (nospace(p, rtup)) { + newdatum = (((char *) ltup) + sizeof(IndexTupleData)); + rttighten(r, stk->rts_parent, newdatum, + (IndexTupleSize(ltup) - sizeof(IndexTupleData)), rtstate); + res = dosplit(r, b, stk->rts_parent, rtup, rtstate); + WriteBuffer(b); /* don't forget to release buffer! - 01/31/94 */ + pfree(res); + } else { + (void) PageAddItem(p, (Item) rtup, IndexTupleSize(rtup), + PageGetMaxOffsetNumber(p), LP_USED); + WriteBuffer(b); + ldatum = (((char *) ltup) + sizeof(IndexTupleData)); + rdatum = (((char *) rtup) + sizeof(IndexTupleData)); + newdatum = (char *) (*rtstate->unionFn)(ldatum, rdatum); + + rttighten(r, stk->rts_parent, newdatum, + (IndexTupleSize(rtup) - sizeof(IndexTupleData)), rtstate); + + pfree(newdatum); + } +} + +static void +rtnewroot(Relation r, IndexTuple lt, IndexTuple rt) +{ + Buffer b; + Page p; + + b = ReadBuffer(r, P_ROOT); + RTInitBuffer(b, 0); + p = BufferGetPage(b); + (void) PageAddItem(p, (Item) lt, IndexTupleSize(lt), + FirstOffsetNumber, LP_USED); + (void) PageAddItem(p, (Item) rt, IndexTupleSize(rt), + OffsetNumberNext(FirstOffsetNumber), LP_USED); + WriteBuffer(b); +} + +static void +picksplit(Relation r, + Page page, + SPLITVEC *v, + IndexTuple itup, + RTSTATE *rtstate) +{ + OffsetNumber maxoff; + OffsetNumber i, j; + IndexTuple item_1, item_2; + char *datum_alpha, *datum_beta; + char *datum_l, *datum_r; + char *union_d, *union_dl, *union_dr; + char *inter_d; + bool firsttime; + float size_alpha, size_beta, size_union, size_inter; + float size_waste, waste; + float size_l, size_r; + int nbytes; + OffsetNumber seed_1 = 0, seed_2 = 0; + OffsetNumber *left, *right; + + maxoff = PageGetMaxOffsetNumber(page); + + nbytes = (maxoff + 2) * sizeof(OffsetNumber); + v->spl_left = (OffsetNumber *) palloc(nbytes); + v->spl_right = (OffsetNumber *) palloc(nbytes); + + firsttime = true; + waste = 0.0; + + for (i = FirstOffsetNumber; i < maxoff; i = OffsetNumberNext(i)) { + item_1 = (IndexTuple) PageGetItem(page, PageGetItemId(page, i)); + datum_alpha = ((char *) item_1) + sizeof(IndexTupleData); + for (j = 
OffsetNumberNext(i); j <= maxoff; j = OffsetNumberNext(j)) { + item_2 = (IndexTuple) PageGetItem(page, PageGetItemId(page, j)); + datum_beta = ((char *) item_2) + sizeof(IndexTupleData); + + /* compute the wasted space by unioning these guys */ + union_d = (char *)(rtstate->unionFn)(datum_alpha, datum_beta); + (rtstate->sizeFn)(union_d, &size_union); + inter_d = (char *)(rtstate->interFn)(datum_alpha, datum_beta); + (rtstate->sizeFn)(inter_d, &size_inter); + size_waste = size_union - size_inter; + + pfree(union_d); + + if (inter_d != (char *) NULL) + pfree(inter_d); + + /* + * are these a more promising split that what we've + * already seen? + */ + + if (size_waste > waste || firsttime) { + waste = size_waste; + seed_1 = i; + seed_2 = j; + firsttime = false; + } + } + } + + left = v->spl_left; + v->spl_nleft = 0; + right = v->spl_right; + v->spl_nright = 0; + + item_1 = (IndexTuple) PageGetItem(page, PageGetItemId(page, seed_1)); + datum_alpha = ((char *) item_1) + sizeof(IndexTupleData); + datum_l = (char *)(*rtstate->unionFn)(datum_alpha, datum_alpha); + (*rtstate->sizeFn)(datum_l, &size_l); + item_2 = (IndexTuple) PageGetItem(page, PageGetItemId(page, seed_2)); + datum_beta = ((char *) item_2) + sizeof(IndexTupleData); + datum_r = (char *)(*rtstate->unionFn)(datum_beta, datum_beta); + (*rtstate->sizeFn)(datum_r, &size_r); + + /* + * Now split up the regions between the two seeds. An important + * property of this split algorithm is that the split vector v + * has the indices of items to be split in order in its left and + * right vectors. We exploit this property by doing a merge in + * the code that actually splits the page. + * + * For efficiency, we also place the new index tuple in this loop. + * This is handled at the very end, when we have placed all the + * existing tuples and i == maxoff + 1. + */ + + maxoff = OffsetNumberNext(maxoff); + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { + + /* + * If we've already decided where to place this item, just + * put it on the right list. Otherwise, we need to figure + * out which page needs the least enlargement in order to + * store the item. + */ + + if (i == seed_1) { + *left++ = i; + v->spl_nleft++; + continue; + } else if (i == seed_2) { + *right++ = i; + v->spl_nright++; + continue; + } + + /* okay, which page needs least enlargement? 
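+ *
+ * e.g. (hypothetical boxes): left so far (0,0)-(2,2), right so far
+ * (6,6)-(8,8), next item (3,3)-(4,4): union with left grows it to
+ * (0,0)-(4,4) (4 -> 16, +12), union with right grows it to
+ * (3,3)-(8,8) (4 -> 25, +21), so the item goes left.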
*/ + if (i == maxoff) { + item_1 = itup; + } else { + item_1 = (IndexTuple) PageGetItem(page, PageGetItemId(page, i)); + } + + datum_alpha = ((char *) item_1) + sizeof(IndexTupleData); + union_dl = (char *)(*rtstate->unionFn)(datum_l, datum_alpha); + union_dr = (char *)(*rtstate->unionFn)(datum_r, datum_alpha); + (*rtstate->sizeFn)(union_dl, &size_alpha); + (*rtstate->sizeFn)(union_dr, &size_beta); + + /* pick which page to add it to */ + if (size_alpha - size_l < size_beta - size_r) { + pfree(datum_l); + pfree(union_dr); + datum_l = union_dl; + size_l = size_alpha; + *left++ = i; + v->spl_nleft++; + } else { + pfree(datum_r); + pfree(union_dl); + datum_r = union_dr; + size_r = size_alpha; + *right++ = i; + v->spl_nright++; + } + } + *left = *right = FirstOffsetNumber; /* sentinel value, see dosplit() */ + + v->spl_ldatum = datum_l; + v->spl_rdatum = datum_r; +} + +static void +RTInitBuffer(Buffer b, uint32 f) +{ + RTreePageOpaque opaque; + Page page; + Size pageSize; + + pageSize = BufferGetPageSize(b); + + page = BufferGetPage(b); + memset(page, 0, (int) pageSize); + PageInit(page, pageSize, sizeof(RTreePageOpaqueData)); + + opaque = (RTreePageOpaque) PageGetSpecialPointer(page); + opaque->flags = f; +} + +static OffsetNumber +choose(Relation r, Page p, IndexTuple it, RTSTATE *rtstate) +{ + OffsetNumber maxoff; + OffsetNumber i; + char *ud, *id; + char *datum; + float usize, dsize; + OffsetNumber which; + float which_grow; + + id = ((char *) it) + sizeof(IndexTupleData); + maxoff = PageGetMaxOffsetNumber(p); + which_grow = -1.0; + which = -1; + + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { + datum = (char *) PageGetItem(p, PageGetItemId(p, i)); + datum += sizeof(IndexTupleData); + (*rtstate->sizeFn)(datum, &dsize); + ud = (char *) (*rtstate->unionFn)(datum, id); + (*rtstate->sizeFn)(ud, &usize); + pfree(ud); + if (which_grow < 0 || usize - dsize < which_grow) { + which = i; + which_grow = usize - dsize; + if (which_grow == 0) + break; + } + } + + return (which); +} + +static int +nospace(Page p, IndexTuple it) +{ + return (PageGetFreeSpace(p) < IndexTupleSize(it)); +} + +void +freestack(RTSTACK *s) +{ + RTSTACK *p; + + while (s != (RTSTACK *) NULL) { + p = s->rts_parent; + pfree(s); + s = p; + } +} + +char * +rtdelete(Relation r, ItemPointer tid) +{ + BlockNumber blkno; + OffsetNumber offnum; + Buffer buf; + Page page; + + /* must write-lock on delete */ + RelationSetLockForWrite(r); + + blkno = ItemPointerGetBlockNumber(tid); + offnum = ItemPointerGetOffsetNumber(tid); + + /* adjust any scans that will be affected by this deletion */ + rtadjscans(r, RTOP_DEL, blkno, offnum); + + /* delete the index tuple */ + buf = ReadBuffer(r, blkno); + page = BufferGetPage(buf); + + PageIndexTupleDelete(page, offnum); + + WriteBuffer(buf); + + /* XXX -- two-phase locking, don't release the write lock */ + return ((char *) NULL); +} + +static void initRtstate(RTSTATE *rtstate, Relation index) +{ + RegProcedure union_proc, size_proc, inter_proc; + func_ptr user_fn; + int pronargs; + + union_proc = index_getprocid(index, 1, RT_UNION_PROC); + size_proc = index_getprocid(index, 1, RT_SIZE_PROC); + inter_proc = index_getprocid(index, 1, RT_INTER_PROC); + fmgr_info(union_proc, &user_fn, &pronargs); + rtstate->unionFn = user_fn; + fmgr_info(size_proc, &user_fn, &pronargs); + rtstate->sizeFn = user_fn; + fmgr_info(inter_proc, &user_fn, &pronargs); + rtstate->interFn = user_fn; + return; +} + +#define RTDEBUG +#ifdef RTDEBUG +#include "utils/geo-decls.h" + +void +_rtdump(Relation r) +{ 
+ Buffer buf; + Page page; + OffsetNumber offnum, maxoff; + BlockNumber blkno; + BlockNumber nblocks; + RTreePageOpaque po; + IndexTuple itup; + BlockNumber itblkno; + OffsetNumber itoffno; + char *datum; + char *itkey; + + nblocks = RelationGetNumberOfBlocks(r); + for (blkno = 0; blkno < nblocks; blkno++) { + buf = ReadBuffer(r, blkno); + page = BufferGetPage(buf); + po = (RTreePageOpaque) PageGetSpecialPointer(page); + maxoff = PageGetMaxOffsetNumber(page); + printf("Page %d maxoff %d <%s>\n", blkno, maxoff, + (po->flags & F_LEAF ? "LEAF" : "INTERNAL")); + + if (PageIsEmpty(page)) { + ReleaseBuffer(buf); + continue; + } + + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) { + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + itblkno = ItemPointerGetBlockNumber(&(itup->t_tid)); + itoffno = ItemPointerGetOffsetNumber(&(itup->t_tid)); + datum = ((char *) itup); + datum += sizeof(IndexTupleData); + itkey = (char *) box_out((BOX *) datum); + printf("\t[%d] size %d heap <%d,%d> key:%s\n", + offnum, IndexTupleSize(itup), itblkno, itoffno, itkey); + pfree(itkey); + } + + ReleaseBuffer(buf); + } +} +#endif /* defined RTDEBUG */ + diff --git a/src/backend/access/rtree/rtscan.c b/src/backend/access/rtree/rtscan.c new file mode 100644 index 00000000000..aa68f0db70b --- /dev/null +++ b/src/backend/access/rtree/rtscan.c @@ -0,0 +1,392 @@ +/*------------------------------------------------------------------------- + * + * rtscan.c-- + * routines to manage scans on index relations + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtscan.c,v 1.1.1.1 1996/07/09 06:21:13 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include "c.h" +#include "postgres.h" + +#include "storage/bufmgr.h" +#include "storage/bufpage.h" + +#include "utils/elog.h" +#include "utils/palloc.h" +#include "utils/rel.h" + +#include "access/heapam.h" +#include "access/genam.h" +#include "access/rtree.h" +#include "access/rtstrat.h" + +/* routines defined and used here */ +static void rtregscan(IndexScanDesc s); +static void rtdropscan(IndexScanDesc s); +static void rtadjone(IndexScanDesc s, int op, BlockNumber blkno, + OffsetNumber offnum); +static void adjuststack(RTSTACK *stk, BlockNumber blkno, + OffsetNumber offnum); +static void adjustiptr(IndexScanDesc s, ItemPointer iptr, + int op, BlockNumber blkno, OffsetNumber offnum); + +/* + * Whenever we start an rtree scan in a backend, we register it in private + * space. Then if the rtree index gets updated, we check all registered + * scans and adjust them if the tuple they point at got moved by the + * update. We only need to do this in private space, because when we update + * an rtree we have a write lock on the tree, so no other process can have + * any locks at all on it. A single transaction can have write and read + * locks on the same object, so that's why we need to handle this case. 
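
The comment above motivates a per-backend registry of open rtree scans. A minimal standalone sketch of that pattern — names hypothetical, plain malloc/free standing in for palloc/pfree; the real RTScanListData list and rtregscan()/rtdropscan() follow below:

    #include <stdlib.h>

    typedef struct ScanRegData {
        void               *scan;    /* the scan descriptor being tracked */
        struct ScanRegData *next;
    } ScanRegData;

    static ScanRegData *scan_list = NULL;    /* private to this backend */

    /* remember a scan so later index updates can adjust it */
    static void
    scan_register(void *scan)
    {
        ScanRegData *node = (ScanRegData *) malloc(sizeof(ScanRegData));
        node->scan = scan;
        node->next = scan_list;
        scan_list = node;
    }

    /* forget a scan when it ends */
    static void
    scan_drop(void *scan)
    {
        ScanRegData **p = &scan_list;
        while (*p != NULL && (*p)->scan != scan)
            p = &(*p)->next;
        if (*p != NULL) {
            ScanRegData *dead = *p;
            *p = dead->next;
            free(dead);
        }
    }
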
+ */ + +typedef struct RTScanListData { + IndexScanDesc rtsl_scan; + struct RTScanListData *rtsl_next; +} RTScanListData; + +typedef RTScanListData *RTScanList; + +/* pointer to list of local scans on rtrees */ +static RTScanList RTScans = (RTScanList) NULL; + +IndexScanDesc +rtbeginscan(Relation r, + bool fromEnd, + uint16 nkeys, + ScanKey key) +{ + IndexScanDesc s; + + RelationSetLockForRead(r); + s = RelationGetIndexScan(r, fromEnd, nkeys, key); + rtregscan(s); + + return (s); +} + +void +rtrescan(IndexScanDesc s, bool fromEnd, ScanKey key) +{ + RTreeScanOpaque p; + RegProcedure internal_proc; + int i; + + if (!IndexScanIsValid(s)) { + elog(WARN, "rtrescan: invalid scan."); + return; + } + + /* + * Clear all the pointers. + */ + + ItemPointerSetInvalid(&s->previousItemData); + ItemPointerSetInvalid(&s->currentItemData); + ItemPointerSetInvalid(&s->nextItemData); + ItemPointerSetInvalid(&s->previousMarkData); + ItemPointerSetInvalid(&s->currentMarkData); + ItemPointerSetInvalid(&s->nextMarkData); + + /* + * Set flags. + */ + if (RelationGetNumberOfBlocks(s->relation) == 0) { + s->flags = ScanUnmarked; + } else if (fromEnd) { + s->flags = ScanUnmarked | ScanUncheckedPrevious; + } else { + s->flags = ScanUnmarked | ScanUncheckedNext; + } + + s->scanFromEnd = fromEnd; + + if (s->numberOfKeys > 0) { + memmove(s->keyData, + key, + s->numberOfKeys * sizeof(ScanKeyData)); + } + + p = (RTreeScanOpaque) s->opaque; + if (p != (RTreeScanOpaque) NULL) { + freestack(p->s_stack); + freestack(p->s_markstk); + p->s_stack = p->s_markstk = (RTSTACK *) NULL; + p->s_flags = 0x0; + } else { + /* initialize opaque data */ + p = (RTreeScanOpaque) palloc(sizeof(RTreeScanOpaqueData)); + p->s_internalKey = + (ScanKey) palloc(sizeof(ScanKeyData) * s->numberOfKeys); + p->s_stack = p->s_markstk = (RTSTACK *) NULL; + p->s_internalNKey = s->numberOfKeys; + p->s_flags = 0x0; + for (i = 0; i < s->numberOfKeys; i++) + p->s_internalKey[i].sk_argument = s->keyData[i].sk_argument; + s->opaque = p; + if (s->numberOfKeys > 0) { + + /* + * Scans on internal pages use different operators than they + * do on leaf pages. For example, if the user wants all boxes + * that exactly match (x1,y1,x2,y2), then on internal pages + * we need to find all boxes that contain (x1,y1,x2,y2). 
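
The remap loop below installs RTMapOperator()'s result into each internal scan key. A toy version of the leaf-to-internal mapping, assuming strategy numbers 1..8 in the ordering left, left-or-overlap, overlap, right-or-overlap, right, same, contains, contained-by; it mirrors the RTOperMap table defined later in rtstrat.c:

    /* internal strategy to probe with, indexed by (leaf strategy - 1) */
    static const int leaf_to_internal[8] = {
        2,    /* left             -> left-or-overlap  */
        2,    /* left-or-overlap  -> left-or-overlap  */
        3,    /* overlap          -> overlap          */
        4,    /* right-or-overlap -> right-or-overlap */
        4,    /* right            -> right-or-overlap */
        7,    /* same             -> contains         */
        7,    /* contains         -> contains         */
        3     /* contained-by     -> overlap          */
    };

    static int
    internal_strategy(int leaf_strategy)    /* 1-based strategy number */
    {
        return leaf_to_internal[leaf_strategy - 1];
    }

So a leaf-level "same" qualification is probed as "contains" on internal pages, exactly as the (x1,y1,x2,y2) example in the comment requires.
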
+ */ + + for (i = 0; i < s->numberOfKeys; i++) { + internal_proc = RTMapOperator(s->relation, + s->keyData[i].sk_attno, + s->keyData[i].sk_procedure); + ScanKeyEntryInitialize(&(p->s_internalKey[i]), + s->keyData[i].sk_flags, + s->keyData[i].sk_attno, + internal_proc, + s->keyData[i].sk_argument); + } + } + } +} + +void +rtmarkpos(IndexScanDesc s) +{ + RTreeScanOpaque p; + RTSTACK *o, *n, *tmp; + + s->currentMarkData = s->currentItemData; + p = (RTreeScanOpaque) s->opaque; + if (p->s_flags & RTS_CURBEFORE) + p->s_flags |= RTS_MRKBEFORE; + else + p->s_flags &= ~RTS_MRKBEFORE; + + o = (RTSTACK *) NULL; + n = p->s_stack; + + /* copy the parent stack from the current item data */ + while (n != (RTSTACK *) NULL) { + tmp = (RTSTACK *) palloc(sizeof(RTSTACK)); + tmp->rts_child = n->rts_child; + tmp->rts_blk = n->rts_blk; + tmp->rts_parent = o; + o = tmp; + n = n->rts_parent; + } + + freestack(p->s_markstk); + p->s_markstk = o; +} + +void +rtrestrpos(IndexScanDesc s) +{ + RTreeScanOpaque p; + RTSTACK *o, *n, *tmp; + + s->currentItemData = s->currentMarkData; + p = (RTreeScanOpaque) s->opaque; + if (p->s_flags & RTS_MRKBEFORE) + p->s_flags |= RTS_CURBEFORE; + else + p->s_flags &= ~RTS_CURBEFORE; + + o = (RTSTACK *) NULL; + n = p->s_markstk; + + /* copy the parent stack from the current item data */ + while (n != (RTSTACK *) NULL) { + tmp = (RTSTACK *) palloc(sizeof(RTSTACK)); + tmp->rts_child = n->rts_child; + tmp->rts_blk = n->rts_blk; + tmp->rts_parent = o; + o = tmp; + n = n->rts_parent; + } + + freestack(p->s_stack); + p->s_stack = o; +} + +void +rtendscan(IndexScanDesc s) +{ + RTreeScanOpaque p; + + p = (RTreeScanOpaque) s->opaque; + + if (p != (RTreeScanOpaque) NULL) { + freestack(p->s_stack); + freestack(p->s_markstk); + } + + rtdropscan(s); + /* XXX don't unset read lock -- two-phase locking */ +} + +static void +rtregscan(IndexScanDesc s) +{ + RTScanList l; + + l = (RTScanList) palloc(sizeof(RTScanListData)); + l->rtsl_scan = s; + l->rtsl_next = RTScans; + RTScans = l; +} + +static void +rtdropscan(IndexScanDesc s) +{ + RTScanList l; + RTScanList prev; + + prev = (RTScanList) NULL; + + for (l = RTScans; + l != (RTScanList) NULL && l->rtsl_scan != s; + l = l->rtsl_next) { + prev = l; + } + + if (l == (RTScanList) NULL) + elog(WARN, "rtree scan list corrupted -- cannot find 0x%lx", s); + + if (prev == (RTScanList) NULL) + RTScans = l->rtsl_next; + else + prev->rtsl_next = l->rtsl_next; + + pfree(l); +} + +void +rtadjscans(Relation r, int op, BlockNumber blkno, OffsetNumber offnum) +{ + RTScanList l; + Oid relid; + + relid = r->rd_id; + for (l = RTScans; l != (RTScanList) NULL; l = l->rtsl_next) { + if (l->rtsl_scan->relation->rd_id == relid) + rtadjone(l->rtsl_scan, op, blkno, offnum); + } +} + +/* + * rtadjone() -- adjust one scan for update. + * + * By here, the scan passed in is on a modified relation. Op tells + * us what the modification is, and blkno and offind tell us what + * block and offset index were affected. This routine checks the + * current and marked positions, and the current and marked stacks, + * to see if any stored location needs to be changed because of the + * update. If so, we make the change here. 
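
The delete rule that adjustiptr() applies below can be shown with plain integers: removing a tuple at or before the saved position slides every later slot down one, so the saved offset backs up one. (The real code additionally flags "before the first tuple" with RTS_CURBEFORE/RTS_MRKBEFORE when the position is already at the first slot.) A self-contained demo, with unsigned offsets standing in for OffsetNumber:

    #include <stdio.h>

    static unsigned
    adjust_after_delete(unsigned saved, unsigned deleted, unsigned first)
    {
        if (saved >= deleted && saved > first)
            return saved - 1;     /* later tuples slid down one slot */
        return saved;             /* deletion was past us: no change */
    }

    int
    main(void)
    {
        printf("%u\n", adjust_after_delete(5, 3, 1));    /* prints 4 */
        printf("%u\n", adjust_after_delete(2, 4, 1));    /* prints 2 */
        return 0;
    }
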
+ */ +static void +rtadjone(IndexScanDesc s, + int op, + BlockNumber blkno, + OffsetNumber offnum) +{ + RTreeScanOpaque so; + + adjustiptr(s, &(s->currentItemData), op, blkno, offnum); + adjustiptr(s, &(s->currentMarkData), op, blkno, offnum); + + so = (RTreeScanOpaque) s->opaque; + + if (op == RTOP_SPLIT) { + adjuststack(so->s_stack, blkno, offnum); + adjuststack(so->s_markstk, blkno, offnum); + } +} + +/* + * adjustiptr() -- adjust current and marked item pointers in the scan + * + * Depending on the type of update and the place it happened, we + * need to do nothing, to back up one record, or to start over on + * the same page. + */ +static void +adjustiptr(IndexScanDesc s, + ItemPointer iptr, + int op, + BlockNumber blkno, + OffsetNumber offnum) +{ + OffsetNumber curoff; + RTreeScanOpaque so; + + if (ItemPointerIsValid(iptr)) { + if (ItemPointerGetBlockNumber(iptr) == blkno) { + curoff = ItemPointerGetOffsetNumber(iptr); + so = (RTreeScanOpaque) s->opaque; + + switch (op) { + case RTOP_DEL: + /* back up one if we need to */ + if (curoff >= offnum) { + + if (curoff > FirstOffsetNumber) { + /* just adjust the item pointer */ + ItemPointerSet(iptr, blkno, OffsetNumberPrev(curoff)); + } else { + /* remember that we're before the current tuple */ + ItemPointerSet(iptr, blkno, FirstOffsetNumber); + if (iptr == &(s->currentItemData)) + so->s_flags |= RTS_CURBEFORE; + else + so->s_flags |= RTS_MRKBEFORE; + } + } + break; + + case RTOP_SPLIT: + /* back to start of page on split */ + ItemPointerSet(iptr, blkno, FirstOffsetNumber); + if (iptr == &(s->currentItemData)) + so->s_flags &= ~RTS_CURBEFORE; + else + so->s_flags &= ~RTS_MRKBEFORE; + break; + + default: + elog(WARN, "Bad operation in rtree scan adjust: %d", op); + } + } + } +} + +/* + * adjuststack() -- adjust the supplied stack for a split on a page in + * the index we're scanning. + * + * If a page on our parent stack has split, we need to back up to the + * beginning of the page and rescan it. The reason for this is that + * the split algorithm for rtrees doesn't order tuples in any useful + * way on a single page. This means on that a split, we may wind up + * looking at some heap tuples more than once. This is handled in the + * access method update code for heaps; if we've modified the tuple we + * are looking at already in this transaction, we ignore the update + * request. + */ +/*ARGSUSED*/ +static void +adjuststack(RTSTACK *stk, + BlockNumber blkno, + OffsetNumber offnum) +{ + while (stk != (RTSTACK *) NULL) { + if (stk->rts_blk == blkno) + stk->rts_child = FirstOffsetNumber; + + stk = stk->rts_parent; + } +} diff --git a/src/backend/access/rtree/rtstrat.c b/src/backend/access/rtree/rtstrat.c new file mode 100644 index 00000000000..c5d934a22a2 --- /dev/null +++ b/src/backend/access/rtree/rtstrat.c @@ -0,0 +1,239 @@ +/*------------------------------------------------------------------------- + * + * rtstrat.c-- + * strategy map data for rtrees. 
+ * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtstrat.c,v 1.1.1.1 1996/07/09 06:21:13 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include "c.h" + +#include "utils/rel.h" + +#include "storage/bufmgr.h" +#include "storage/bufpage.h" + +#include "access/istrat.h" +#include "access/rtree.h" + +/* + * Note: negate, commute, and negatecommute all assume that operators are + * ordered as follows in the strategy map: + * + * left, left-or-overlap, overlap, right-or-overlap, right, same, + * contains, contained-by + * + * The negate, commute, and negatecommute arrays are used by the planner + * to plan indexed scans over data that appears in the qualificiation in + * a boolean negation, or whose operands appear in the wrong order. For + * example, if the operator "<%" means "contains", and the user says + * + * where not rel.box <% "(10,10,20,20)"::box + * + * the planner can plan an index scan by noting that rtree indices have + * an operator in their operator class for negating <%. + * + * Similarly, if the user says something like + * + * where "(10,10,20,20)"::box <% rel.box + * + * the planner can see that the rtree index on rel.box has an operator in + * its opclass for commuting <%, and plan the scan using that operator. + * This added complexity in the access methods makes the planner a lot easier + * to write. + */ + +/* if a op b, what operator tells us if (not a op b)? */ +static StrategyNumber RTNegate[RTNStrategies] = { + InvalidStrategy, + InvalidStrategy, + InvalidStrategy, + InvalidStrategy, + InvalidStrategy, + InvalidStrategy, + InvalidStrategy, + InvalidStrategy + }; + +/* if a op_1 b, what is the operator op_2 such that b op_2 a? */ +static StrategyNumber RTCommute[RTNStrategies] = { + InvalidStrategy, + InvalidStrategy, + InvalidStrategy, + InvalidStrategy, + InvalidStrategy, + InvalidStrategy, + InvalidStrategy, + InvalidStrategy + }; + +/* if a op_1 b, what is the operator op_2 such that (b !op_2 a)? */ +static StrategyNumber RTNegateCommute[RTNStrategies] = { + InvalidStrategy, + InvalidStrategy, + InvalidStrategy, + InvalidStrategy, + InvalidStrategy, + InvalidStrategy, + InvalidStrategy, + InvalidStrategy + }; + +/* + * Now do the TermData arrays. These exist in case the user doesn't give + * us a full set of operators for a particular operator class. The idea + * is that by making multiple comparisons using any one of the supplied + * operators, we can decide whether two n-dimensional polygons are equal. + * For example, if a contains b and b contains a, we may conclude that + * a and b are equal. + * + * The presence of the TermData arrays in all this is a historical accident. + * Early in the development of the POSTGRES access methods, it was believed + * that writing functions was harder than writing arrays. This is wrong; + * TermData is hard to understand and hard to get right. In general, when + * someone populates a new operator class, the populate it completely. If + * Mike Hirohama had forced Cimarron Taylor to populate the strategy map + * for btree int2_ops completely in 1988, you wouldn't have to deal with + * all this now. Too bad for you. + * + * Since you can't necessarily do this in all cases (for example, you can't + * do it given only "intersects" or "disjoint"), TermData arrays for some + * operators don't appear below. 
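
The "contains both ways" recipe for equality reads naturally in plain C. A sketch for axis-aligned boxes, using a hypothetical struct rather than the BOX from utils/geo-decls.h; this is the two-comparison plan that RTContainsTermData encodes below:

    typedef struct { double xl, yl, xh, yh; } MyBox;    /* hypothetical layout */

    static int
    box_contains(const MyBox *a, const MyBox *b)    /* does a contain b? */
    {
        return a->xl <= b->xl && a->yl <= b->yl &&
               a->xh >= b->xh && a->yh >= b->yh;
    }

    /* "same" synthesized from "contains" applied twice, the second
     * time with the operands commuted */
    static int
    box_same(const MyBox *a, const MyBox *b)
    {
        return box_contains(a, b) && box_contains(b, a);
    }
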
+ * + * Note that if you DO supply all the operators required in a given opclass + * by inserting them into the pg_opclass system catalog, you can get away + * without doing all this TermData stuff. Since the rtree code is intended + * to be a reference for access method implementors, I'm doing TermData + * correctly here. + * + * Note on style: these are all actually of type StrategyTermData, but + * since those have variable-length data at the end of the struct we can't + * properly initialize them if we declare them to be what they are. + */ + +/* if you only have "contained-by", how do you determine equality? */ +static uint16 RTContainedByTermData[] = { + 2, /* make two comparisons */ + RTContainedByStrategyNumber, /* use "a contained-by b" */ + 0x0, /* without any magic */ + RTContainedByStrategyNumber, /* then use contained-by, */ + SK_COMMUTE /* swapping a and b */ + }; + +/* if you only have "contains", how do you determine equality? */ +static uint16 RTContainsTermData[] = { + 2, /* make two comparisons */ + RTContainsStrategyNumber, /* use "a contains b" */ + 0x0, /* without any magic */ + RTContainsStrategyNumber, /* then use contains again, */ + SK_COMMUTE /* swapping a and b */ + }; + +/* now put all that together in one place for the planner */ +static StrategyTerm RTEqualExpressionData[] = { + (StrategyTerm) RTContainedByTermData, + (StrategyTerm) RTContainsTermData, + NULL + }; + +/* + * If you were sufficiently attentive to detail, you would go through + * the ExpressionData pain above for every one of the seven strategies + * we defined. I am not. Now we declare the StrategyEvaluationData + * structure that gets shipped around to help the planner and the access + * method decide what sort of scan it should do, based on (a) what the + * user asked for, (b) what operators are defined for a particular opclass, + * and (c) the reams of information we supplied above. + * + * The idea of all of this initialized data is to make life easier on the + * user when he defines a new operator class to use this access method. + * By filling in all the data, we let him get away with leaving holes in his + * operator class, and still let him use the index. The added complexity + * in the access methods just isn't worth the trouble, though. + */ + +static StrategyEvaluationData RTEvaluationData = { + RTNStrategies, /* # of strategies */ + (StrategyTransformMap) RTNegate, /* how to do (not qual) */ + (StrategyTransformMap) RTCommute, /* how to swap operands */ + (StrategyTransformMap) RTNegateCommute, /* how to do both */ + { + NULL, /* express left */ + NULL, /* express overleft */ + NULL, /* express over */ + NULL, /* express overright */ + NULL, /* express right */ + (StrategyExpression) RTEqualExpressionData, /* express same */ + NULL, /* express contains */ + NULL, /* express contained-by */ + NULL, + NULL, + NULL + } +}; + +/* + * Okay, now something peculiar to rtrees that doesn't apply to most other + * indexing structures: When we're searching a tree for a given value, we + * can't do the same sorts of comparisons on internal node entries as we + * do at leaves. The reason is that if we're looking for (say) all boxes + * that are the same as (0,0,10,10), then we need to find all leaf pages + * that overlap that region. So internally we search for overlap, and at + * the leaf we search for equality. + * + * This array maps leaf search operators to the internal search operators. 
+ * We assume the normal ordering on operators: + * + * left, left-or-overlap, overlap, right-or-overlap, right, same, + * contains, contained-by + */ +static StrategyNumber RTOperMap[RTNStrategies] = { + RTOverLeftStrategyNumber, + RTOverLeftStrategyNumber, + RTOverlapStrategyNumber, + RTOverRightStrategyNumber, + RTOverRightStrategyNumber, + RTContainsStrategyNumber, + RTContainsStrategyNumber, + RTOverlapStrategyNumber + }; + +StrategyNumber +RelationGetRTStrategy(Relation r, + AttrNumber attnum, + RegProcedure proc) +{ + return (RelationGetStrategy(r, attnum, &RTEvaluationData, proc)); +} + +bool +RelationInvokeRTStrategy(Relation r, + AttrNumber attnum, + StrategyNumber s, + Datum left, + Datum right) +{ + return (RelationInvokeStrategy(r, &RTEvaluationData, attnum, s, + left, right)); +} + +RegProcedure +RTMapOperator(Relation r, + AttrNumber attnum, + RegProcedure proc) +{ + StrategyNumber procstrat; + StrategyMap strategyMap; + + procstrat = RelationGetRTStrategy(r, attnum, proc); + strategyMap = IndexStrategyGetStrategyMap(RelationGetIndexStrategy(r), + RTNStrategies, + attnum); + + return (strategyMap->entry[RTOperMap[procstrat - 1] - 1].sk_procedure); +} diff --git a/src/backend/access/rtscan.h b/src/backend/access/rtscan.h new file mode 100644 index 00000000000..a928303f3f3 --- /dev/null +++ b/src/backend/access/rtscan.h @@ -0,0 +1,17 @@ +/*------------------------------------------------------------------------- + * + * rtscan.h-- + * routines defined in access/rtree/rtscan.c + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: rtscan.h,v 1.1.1.1 1996/07/09 06:21:09 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef RTSCAN_H + +void rtadjscans(Relation r, int op, BlockNumber blkno, OffsetNumber offnum); + +#endif /* RTSCAN_H */ diff --git a/src/backend/access/rtstrat.h b/src/backend/access/rtstrat.h new file mode 100644 index 00000000000..5b439e7b338 --- /dev/null +++ b/src/backend/access/rtstrat.h @@ -0,0 +1,18 @@ +/*------------------------------------------------------------------------- + * + * rtstrat.h-- + * routines defined in access/rtree/rtstrat.c + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: rtstrat.h,v 1.1.1.1 1996/07/09 06:21:09 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef RTSTRAT_H + +extern RegProcedure RTMapOperator(Relation r, AttrNumber attnum, + RegProcedure proc); + +#endif /* RTSTRAT_H */ diff --git a/src/backend/access/sdir.h b/src/backend/access/sdir.h new file mode 100644 index 00000000000..030007d39c9 --- /dev/null +++ b/src/backend/access/sdir.h @@ -0,0 +1,57 @@ +/*------------------------------------------------------------------------- + * + * sdir.h-- + * POSTGRES scan direction definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: sdir.h,v 1.1.1.1 1996/07/09 06:21:09 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef SDIR_H +#define SDIR_H + +#include "c.h" + +/* + * ScanDirection was an int8 for no apparent reason. I kept the original + * values because I'm not sure if I'll break anything otherwise. -ay 2/95 + */ +typedef enum ScanDirection { + BackwardScanDirection = -1, + NoMovementScanDirection = 0, + ForwardScanDirection = 1 +} ScanDirection; + +/* + * ScanDirectionIsValid -- + * True iff scan direciton is valid. 
+ */ +#define ScanDirectionIsValid(direction) \ + ((bool) (BackwardScanDirection <= direction && \ + direction <= ForwardScanDirection)) + +/* + * ScanDirectionIsBackward -- + * True iff scan direciton is backward. + */ +#define ScanDirectionIsBackward(direction) \ + ((bool) (direction == BackwardScanDirection)) + +/* + * ScanDirectionIsNoMovement -- + * True iff scan direciton indicates no movement. + */ +#define ScanDirectionIsNoMovement(direction) \ + ((bool) (direction == NoMovementScanDirection)) + +/* + * ScanDirectionIsForward -- + * True iff scan direciton is forward. + */ +#define ScanDirectionIsForward(direction) \ + ((bool) (direction == ForwardScanDirection)) + +#endif /* SDIR_H */ diff --git a/src/backend/access/skey.h b/src/backend/access/skey.h new file mode 100644 index 00000000000..3cadf348f42 --- /dev/null +++ b/src/backend/access/skey.h @@ -0,0 +1,52 @@ +/*------------------------------------------------------------------------- + * + * skey.h-- + * POSTGRES scan key definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: skey.h,v 1.1.1.1 1996/07/09 06:21:09 scrappy Exp $ + * + * + * Note: + * Needs more accessor/assignment routines. + *------------------------------------------------------------------------- + */ +#ifndef SKEY_H +#define SKEY_H + +#include "postgres.h" +#include "access/attnum.h" + + +typedef struct ScanKeyData { + bits16 sk_flags; /* flags */ + AttrNumber sk_attno; /* domain number */ + RegProcedure sk_procedure; /* procedure OID */ + func_ptr sk_func; + int32 sk_nargs; + Datum sk_argument; /* data to compare */ +} ScanKeyData; + +typedef ScanKeyData *ScanKey; + + +#define SK_ISNULL 0x1 +#define SK_UNARY 0x2 +#define SK_NEGATE 0x4 +#define SK_COMMUTE 0x8 + +#define ScanUnmarked 0x01 +#define ScanUncheckedPrevious 0x02 +#define ScanUncheckedNext 0x04 + + +/* + * prototypes for functions in access/common/scankey.c + */ +extern void ScanKeyEntrySetIllegal(ScanKey entry); +extern void ScanKeyEntryInitialize(ScanKey entry, bits16 flags, + AttrNumber attributeNumber, RegProcedure procedure, Datum argument); + +#endif /* SKEY_H */ diff --git a/src/backend/access/strat.h b/src/backend/access/strat.h new file mode 100644 index 00000000000..4ddb2190d88 --- /dev/null +++ b/src/backend/access/strat.h @@ -0,0 +1,86 @@ +/*------------------------------------------------------------------------- + * + * strat.h-- + * index strategy type definitions + * (separated out from original istrat.h to avoid circular refs) + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: strat.h,v 1.1.1.1 1996/07/09 06:21:09 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef STRAT_H +#define STRAT_H + +#include "postgres.h" +#include "access/attnum.h" +#include "access/skey.h" + +typedef uint16 StrategyNumber; + +#define InvalidStrategy 0 + +typedef struct StrategyTransformMapData { + StrategyNumber strategy[1]; /* VARIABLE LENGTH ARRAY */ +} StrategyTransformMapData; /* VARIABLE LENGTH STRUCTURE */ + +typedef StrategyTransformMapData *StrategyTransformMap; + +typedef struct StrategyOperatorData { + StrategyNumber strategy; + bits16 flags; /* scan qualification flags h/skey.h */ +} StrategyOperatorData; + +typedef StrategyOperatorData *StrategyOperator; + +typedef struct StrategyTermData { /* conjunctive term */ + uint16 degree; + StrategyOperatorData operatorData[1]; /* VARIABLE LENGTH */ +} StrategyTermData; /* VARIABLE LENGTH STRUCTURE */ + +typedef 
StrategyTermData *StrategyTerm; + +typedef struct StrategyExpressionData { /* disjunctive normal form */ + StrategyTerm term[1]; /* VARIABLE LENGTH ARRAY */ +} StrategyExpressionData; /* VARIABLE LENGTH STRUCTURE */ + +typedef StrategyExpressionData *StrategyExpression; + +typedef struct StrategyEvaluationData { + StrategyNumber maxStrategy; + StrategyTransformMap negateTransform; + StrategyTransformMap commuteTransform; + StrategyTransformMap negateCommuteTransform; + StrategyExpression expression[12]; /* XXX VARIABLE LENGTH */ +} StrategyEvaluationData; /* VARIABLE LENGTH STRUCTURE */ + +typedef StrategyEvaluationData *StrategyEvaluation; + +/* + * StrategyTransformMapIsValid -- + * Returns true iff strategy transformation map is valid. + */ +#define StrategyTransformMapIsValid(transform) PointerIsValid(transform) + + +#ifndef CorrectStrategies /* XXX this should be removable */ +#define AMStrategies(foo) 12 +#else /* !defined(CorrectStrategies) */ +#define AMStrategies(foo) (foo) +#endif /* !defined(CorrectStrategies) */ + +typedef struct StrategyMapData { + ScanKeyData entry[1]; /* VARIABLE LENGTH ARRAY */ +} StrategyMapData; /* VARIABLE LENGTH STRUCTURE */ + +typedef StrategyMapData *StrategyMap; + +typedef struct IndexStrategyData { + StrategyMapData strategyMapData[1]; /* VARIABLE LENGTH ARRAY */ +} IndexStrategyData; /* VARIABLE LENGTH STRUCTURE */ + +typedef IndexStrategyData *IndexStrategy; + +#endif /*STRAT_H */ diff --git a/src/backend/access/transam.h b/src/backend/access/transam.h new file mode 100644 index 00000000000..0f5a9724dc0 --- /dev/null +++ b/src/backend/access/transam.h @@ -0,0 +1,213 @@ +/*------------------------------------------------------------------------- + * + * transam.h-- + * postgres transaction access method support code header + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: transam.h,v 1.1.1.1 1996/07/09 06:21:09 scrappy Exp $ + * + * NOTES + * Transaction System Version 101 now support proper oid + * generation and recording in the variable relation. + * + *------------------------------------------------------------------------- + */ +#ifndef TRANSAM_H +#define TRANSAM_H + +/* ---------------- + * transaction system version id + * + * this is stored on the first page of the log, time and variable + * relations on the first 4 bytes. This is so that if we improve + * the format of the transaction log after postgres version 2, then + * people won't have to rebuild their databases. + * + * TRANS_SYSTEM_VERSION 100 means major version 1 minor version 0. + * Two databases with the same major version should be compatible, + * even if their minor versions differ. + * ---------------- + */ +#define TRANS_SYSTEM_VERSION 101 + +/* ---------------- + * transaction id status values + * + * someday we will use "11" = 3 = XID_INVALID to mean the + * starting of run-length encoded log data. 
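
The status codes defined just below take two bits per transaction, four statuses to the byte. A sketch of reading and writing such an array — the bit placement chosen here is only illustrative; the real accessors, TransBlockGetXidStatus() and TransBlockSetXidStatus() in transsup.c, go through the BitArray macros:

    #include <stdint.h>

    /* xid N occupies bits (N%4)*2 .. (N%4)*2+1 of byte N/4,
     * numbered from the high end of the byte (an assumption) */
    static unsigned
    xid_status_get(const uint8_t *block, unsigned long xid)
    {
        unsigned shift = 6 - (unsigned) (xid % 4) * 2;
        return (block[xid / 4] >> shift) & 0x3;
    }

    static void
    xid_status_set(uint8_t *block, unsigned long xid, unsigned status)
    {
        unsigned shift = 6 - (unsigned) (xid % 4) * 2;
        block[xid / 4] = (uint8_t) ((block[xid / 4] & ~(0x3u << shift))
                                    | ((status & 0x3u) << shift));
    }
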
+ * ----------------
+ */
+#define XID_COMMIT	2	/* transaction committed */
+#define XID_ABORT	1	/* transaction aborted */
+#define XID_INPROGRESS	0	/* transaction in progress */
+#define XID_INVALID	3	/* other */
+
+typedef unsigned char XidStatus; /* (2 bits) */
+
+/* ----------------
+ *	BitIndexOf computes the index of the Nth xid on a given block
+ * ----------------
+ */
+#define BitIndexOf(N)	((N) * 2)
+
+/* ----------------
+ *	transaction page definitions
+ * ----------------
+ */
+#define TP_DataSize		BLCKSZ
+#define TP_NumXidStatusPerBlock	(TP_DataSize * 4)
+#define TP_NumTimePerBlock	(TP_DataSize / 4)
+
+/* ----------------
+ *	LogRelationContents structure
+ *
+ *	This structure describes the storage of the data in the
+ *	first 128 bytes of the log relation.  This storage is never
+ *	used for transaction status because transaction id's begin
+ *	their numbering at 512.
+ *
+ *	The first 4 bytes of this relation store the version
+ *	number of the transaction system.
+ * ----------------
+ */
+typedef struct LogRelationContentsData {
+    int	TransSystemVersion;
+} LogRelationContentsData;
+
+typedef LogRelationContentsData *LogRelationContents;
+
+/* ----------------
+ *	TimeRelationContents structure
+ *
+ *	This structure describes the storage of the data in the
+ *	first 2048 bytes of the time relation.  This storage is never
+ *	used for transaction commit times because transaction id's begin
+ *	their numbering at 512.
+ *
+ *	The first 4 bytes of this relation store the version
+ *	number of the transaction system.
+ * ----------------
+ */
+typedef struct TimeRelationContentsData {
+    int	TransSystemVersion;
+} TimeRelationContentsData;
+
+typedef TimeRelationContentsData *TimeRelationContents;
+
+/* ----------------
+ *	VariableRelationContents structure
+ *
+ *	The variable relation is a special "relation" which
+ *	is used to store various system "variables" persistently.
+ *	Unlike other relations in the system, this relation
+ *	is updated in place whenever the variables change.
+ *
+ *	The first 4 bytes of this relation store the version
+ *	number of the transaction system.
+ *
+ *	Currently, the relation has only one page and the next
+ *	available xid, the last committed xid and the next
+ *	available oid are stored there.
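
Two bits per status means a single BLCKSZ-byte page tracks BLCKSZ * 4 transactions, while a page of 4-byte commit times covers only BLCKSZ / 4. A worked example of the division TransComputeBlockNumber() performs in transsup.c, assuming an 8192-byte page:

    #include <stdio.h>

    #define MY_BLCKSZ       8192UL                /* assumed page size   */
    #define XIDS_PER_BLOCK  (MY_BLCKSZ * 4)       /* 2 status bits each  */
    #define TIMES_PER_BLOCK (MY_BLCKSZ / 4)       /* 4-byte commit times */

    int
    main(void)
    {
        unsigned long xid = 40000;

        /* status of xid 40000: log block 1, slot 7232 */
        printf("log  block %lu, slot %lu\n",
               xid / XIDS_PER_BLOCK, xid % XIDS_PER_BLOCK);

        /* commit time of xid 40000: time block 19, slot 1088 */
        printf("time block %lu, slot %lu\n",
               xid / TIMES_PER_BLOCK, xid % TIMES_PER_BLOCK);
        return 0;
    }
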
+ * ---------------- + */ +typedef struct VariableRelationContentsData { + int TransSystemVersion; + TransactionId nextXidData; + TransactionId lastXidData; + Oid nextOid; +} VariableRelationContentsData; + +typedef VariableRelationContentsData *VariableRelationContents; + +/* ---------------- + * extern declarations + * ---------------- + */ + +/* + * prototypes for functions in transam/transam.c + */ +extern int RecoveryCheckingEnabled(); +extern void SetRecoveryCheckingEnabled(bool state); +extern bool TransactionLogTest(TransactionId transactionId, XidStatus status); +extern void TransactionLogUpdate(TransactionId transactionId, + XidStatus status); +extern AbsoluteTime TransactionIdGetCommitTime(TransactionId transactionId); +extern void TransRecover(Relation logRelation); +extern void InitializeTransactionLog(); +extern bool TransactionIdDidCommit(TransactionId transactionId); +extern bool TransactionIdDidAbort(TransactionId transactionId); +extern bool TransactionIdIsInProgress(TransactionId transactionId); +extern void TransactionIdCommit(TransactionId transactionId); +extern void TransactionIdAbort(TransactionId transactionId); +extern void TransactionIdSetInProgress(TransactionId transactionId); + +/* in transam/transsup.c */ +extern void AmiTransactionOverride(bool flag); +extern void TransComputeBlockNumber(Relation relation, + TransactionId transactionId, BlockNumber *blockNumberOutP); +extern XidStatus TransBlockGetLastTransactionIdStatus(Block tblock, + TransactionId baseXid, TransactionId *returnXidP); +extern XidStatus TransBlockGetXidStatus(Block tblock, + TransactionId transactionId); +extern void TransBlockSetXidStatus(Block tblock, + TransactionId transactionId, XidStatus xstatus); +extern AbsoluteTime TransBlockGetCommitTime(Block tblock, + TransactionId transactionId); +extern void TransBlockSetCommitTime(Block tblock, + TransactionId transactionId, AbsoluteTime commitTime); +extern XidStatus TransBlockNumberGetXidStatus(Relation relation, + BlockNumber blockNumber, TransactionId xid, bool *failP); +extern void TransBlockNumberSetXidStatus(Relation relation, + BlockNumber blockNumber, TransactionId xid, XidStatus xstatus, + bool *failP); +extern AbsoluteTime TransBlockNumberGetCommitTime(Relation relation, + BlockNumber blockNumber, TransactionId xid, bool *failP); +extern void TransBlockNumberSetCommitTime(Relation relation, + BlockNumber blockNumber, TransactionId xid, AbsoluteTime xtime, + bool *failP); +extern void TransGetLastRecordedTransaction(Relation relation, + TransactionId xid, bool *failP); + +/* in transam/varsup.c */ +extern void VariableRelationGetNextXid(TransactionId *xidP); +extern void VariableRelationGetLastXid(TransactionId *xidP); +extern void VariableRelationPutNextXid(TransactionId xid); +extern void VariableRelationPutLastXid(TransactionId xid); +extern void VariableRelationGetNextOid(Oid *oid_return); +extern void VariableRelationPutNextOid(Oid *oidP); +extern void GetNewTransactionId(TransactionId *xid); +extern void UpdateLastCommittedXid(TransactionId xid); +extern void GetNewObjectIdBlock(Oid *oid_return, int oid_block_size); +extern void GetNewObjectId(Oid *oid_return); + +/* ---------------- + * global variable extern declarations + * ---------------- + */ + +/* in transam.c */ +extern Relation LogRelation; +extern Relation TimeRelation; +extern Relation VariableRelation; + +extern TransactionId cachedGetCommitTimeXid; +extern AbsoluteTime cachedGetCommitTime; +extern TransactionId cachedTestXid; +extern XidStatus cachedTestXidStatus; 
+ +extern TransactionId NullTransactionId; +extern TransactionId AmiTransactionId; +extern TransactionId FirstTransactionId; + +extern int RecoveryCheckingEnableState; + +/* in transsup.c */ +extern bool AMI_OVERRIDE; + +/* in varsup.c */ +extern int OidGenLockId; + +#endif /* TRAMSAM_H */ diff --git a/src/backend/access/transam/Makefile.inc b/src/backend/access/transam/Makefile.inc new file mode 100644 index 00000000000..c4f5b95a0ae --- /dev/null +++ b/src/backend/access/transam/Makefile.inc @@ -0,0 +1,14 @@ +#------------------------------------------------------------------------- +# +# Makefile.inc-- +# Makefile for access/transam +# +# Copyright (c) 1994, Regents of the University of California +# +# +# IDENTIFICATION +# $Header: /cvsroot/pgsql/src/backend/access/transam/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:13 scrappy Exp $ +# +#------------------------------------------------------------------------- + +SUBSRCS+= transam.c transsup.c varsup.c xact.c xid.c diff --git a/src/backend/access/transam/transam.c b/src/backend/access/transam/transam.c new file mode 100644 index 00000000000..b3789a8c2c5 --- /dev/null +++ b/src/backend/access/transam/transam.c @@ -0,0 +1,675 @@ +/*------------------------------------------------------------------------- + * + * transam.c-- + * postgres transaction log/time interface routines + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/transam/transam.c,v 1.1.1.1 1996/07/09 06:21:13 scrappy Exp $ + * + * NOTES + * This file contains the high level access-method interface to the + * transaction system. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "machine.h" /* in port/ directory (needed for BLCKSZ) */ + +#include "access/heapam.h" +#include "storage/buf.h" +#include "storage/bufmgr.h" + +#include "utils/memutils.h" +#include "utils/mcxt.h" +#include "utils/rel.h" +#include "utils/elog.h" + +#include "utils/nabstime.h" +#include "catalog/catname.h" + +#include "access/transam.h" +#include "access/xact.h" +#include "commands/vacuum.h" /* for VacuumRunning */ + +/* ---------------- + * global variables holding pointers to relations used + * by the transaction system. These are initialized by + * InitializeTransactionLog(). + * ---------------- + */ + +Relation LogRelation = (Relation) NULL; +Relation TimeRelation = (Relation) NULL; +Relation VariableRelation = (Relation) NULL; + +/* ---------------- + * global variables holding cached transaction id's and statuses. + * ---------------- + */ +TransactionId cachedGetCommitTimeXid; +AbsoluteTime cachedGetCommitTime; +TransactionId cachedTestXid; +XidStatus cachedTestXidStatus; + +/* ---------------- + * transaction system constants + * ---------------- + */ +/* ---------------------------------------------------------------- + * transaction system constants + * + * read the comments for GetNewTransactionId in order to + * understand the initial values for AmiTransactionId and + * FirstTransactionId. -cim 3/23/90 + * ---------------------------------------------------------------- + */ +TransactionId NullTransactionId = (TransactionId) 0; + +TransactionId AmiTransactionId = (TransactionId) 512; + +TransactionId FirstTransactionId = (TransactionId) 514; + +/* ---------------- + * transaction recovery state variables + * + * When the transaction system is initialized, we may + * need to do recovery checking. 
This decision is decided + * by the postmaster or the user by supplying the backend + * with a special flag. In general, we want to do recovery + * checking whenever we are running without a postmaster + * or when the number of backends running under the postmaster + * goes from zero to one. -cim 3/21/90 + * ---------------- + */ +int RecoveryCheckingEnableState = 0; + +/* ------------------ + * spinlock for oid generation + * ----------------- + */ +extern int OidGenLockId; + +/* ---------------- + * globals that must be reset at abort + * ---------------- + */ +extern bool BuildingBtree; + + +/* ---------------- + * recovery checking accessors + * ---------------- + */ +int +RecoveryCheckingEnabled() +{ + return RecoveryCheckingEnableState; +} + +void +SetRecoveryCheckingEnabled(bool state) +{ + RecoveryCheckingEnableState = (state == true); +} + +/* ---------------------------------------------------------------- + * postgres log/time access method interface + * + * TransactionLogTest + * TransactionLogUpdate + * ======== + * these functions do work for the interface + * functions - they search/retrieve and append/update + * information in the log and time relations. + * ---------------------------------------------------------------- + */ + +/* -------------------------------- + * TransactionLogTest + * -------------------------------- + */ + +bool /* true/false: does transaction id have specified status? */ +TransactionLogTest(TransactionId transactionId, /* transaction id to test */ + XidStatus status) /* transaction status */ +{ + BlockNumber blockNumber; + XidStatus xidstatus; /* recorded status of xid */ + bool fail = false; /* success/failure */ + + /* ---------------- + * during initialization consider all transactions + * as having been committed + * ---------------- + */ + if (! RelationIsValid(LogRelation)) + return (bool) (status == XID_COMMIT); + + /* ---------------- + * before going to the buffer manager, check our single + * item cache to see if we didn't just check the transaction + * status a moment ago. + * ---------------- + */ + if (TransactionIdEquals(transactionId, cachedTestXid)) + return (bool) + (status == cachedTestXidStatus); + + /* ---------------- + * compute the item pointer corresponding to the + * page containing our transaction id. We save the item in + * our cache to speed up things if we happen to ask for the + * same xid's status more than once. + * ---------------- + */ + TransComputeBlockNumber(LogRelation, transactionId, &blockNumber); + xidstatus = TransBlockNumberGetXidStatus(LogRelation, + blockNumber, + transactionId, + &fail); + + if (! fail) { + TransactionIdStore(transactionId, &cachedTestXid); + cachedTestXidStatus = xidstatus; + return (bool) + (status == xidstatus); + } + + /* ---------------- + * here the block didn't contain the information we wanted + * ---------------- + */ + elog(WARN, "TransactionLogTest: failed to get xidstatus"); + + /* + * so lint is happy... + */ + return(false); +} + +/* -------------------------------- + * TransactionLogUpdate + * -------------------------------- + */ +void +TransactionLogUpdate(TransactionId transactionId, /* trans id to update */ + XidStatus status) /* new trans status */ +{ + BlockNumber blockNumber; + bool fail = false; /* success/failure */ + AbsoluteTime currentTime; /* time of this transaction */ + + /* ---------------- + * during initialization we don't record any updates. + * ---------------- + */ + if (! 
RelationIsValid(LogRelation)) + return; + + /* ---------------- + * get the transaction commit time + * ---------------- + */ + currentTime = getSystemTime(); + + /* ---------------- + * update the log relation + * ---------------- + */ + TransComputeBlockNumber(LogRelation, transactionId, &blockNumber); + TransBlockNumberSetXidStatus(LogRelation, + blockNumber, + transactionId, + status, + &fail); + + /* ---------------- + * update (invalidate) our single item TransactionLogTest cache. + * ---------------- + */ + TransactionIdStore(transactionId, &cachedTestXid); + cachedTestXidStatus = status; + + /* ---------------- + * now we update the time relation, if necessary + * (we only record commit times) + * ---------------- + */ + if (RelationIsValid(TimeRelation) && status == XID_COMMIT) { + TransComputeBlockNumber(TimeRelation, transactionId, &blockNumber); + TransBlockNumberSetCommitTime(TimeRelation, + blockNumber, + transactionId, + currentTime, + &fail); + /* ---------------- + * update (invalidate) our single item GetCommitTime cache. + * ---------------- + */ + TransactionIdStore(transactionId, &cachedGetCommitTimeXid); + cachedGetCommitTime = currentTime; + } + + /* ---------------- + * now we update the "last committed transaction" field + * in the variable relation if we are recording a commit. + * ---------------- + */ + if (RelationIsValid(VariableRelation) && status == XID_COMMIT) + UpdateLastCommittedXid(transactionId); +} + +/* -------------------------------- + * TransactionIdGetCommitTime + * -------------------------------- + */ + +AbsoluteTime /* commit time of transaction id */ +TransactionIdGetCommitTime(TransactionId transactionId) /* transaction id to test */ +{ + BlockNumber blockNumber; + AbsoluteTime commitTime; /* commit time */ + bool fail = false; /* success/failure */ + + /* ---------------- + * return invalid if we aren't running yet... + * ---------------- + */ + if (! RelationIsValid(TimeRelation)) + return INVALID_ABSTIME; + + /* ---------------- + * before going to the buffer manager, check our single + * item cache to see if we didn't just get the commit time + * a moment ago. + * ---------------- + */ + if (TransactionIdEquals(transactionId, cachedGetCommitTimeXid)) + return cachedGetCommitTime; + + /* ---------------- + * compute the item pointer corresponding to the + * page containing our transaction commit time + * ---------------- + */ + TransComputeBlockNumber(TimeRelation, transactionId, &blockNumber); + commitTime = TransBlockNumberGetCommitTime(TimeRelation, + blockNumber, + transactionId, + &fail); + + /* ---------------- + * update our cache and return the transaction commit time + * ---------------- + */ + if (! fail) { + TransactionIdStore(transactionId, &cachedGetCommitTimeXid); + cachedGetCommitTime = commitTime; + return commitTime; + } else + return INVALID_ABSTIME; +} + +/* ---------------------------------------------------------------- + * transaction recovery code + * ---------------------------------------------------------------- + */ + +/* -------------------------------- + * TransRecover + * + * preform transaction recovery checking. + * + * Note: this should only be preformed if no other backends + * are running. This is known by the postmaster and + * conveyed by the postmaster passing a "do recovery checking" + * flag to the backend. 
+ *
+ *	here we get the last recorded transaction from the log,
+ *	get the "last" and "next" transactions from the variable relation
+ *	and then perform some integrity tests:
+ *
+ *	1) No transaction may exist higher than the "next" available
+ *	   transaction recorded in the variable relation.  If this is the
+ *	   case then it means either the log or the variable relation
+ *	   has become corrupted.
+ *
+ *	2) The last committed transaction may not be higher than the
+ *	   next available transaction for the same reason.
+ *
+ *	3) The last recorded transaction may not be lower than the
+ *	   last committed transaction.  (the reverse is ok - it means
+ *	   that some transactions have aborted since the last commit)
+ *
+ *	Here is what the proper situation looks like.  The line
+ *	represents the data stored in the log.  'c' indicates the
+ *	transaction was recorded as committed, 'a' indicates an
+ *	aborted transaction and '.' represents information not
+ *	recorded.  These may correspond to in progress transactions.
+ *
+ *	     c  c  a  c  .  .  a  .  .  .  .  .  .  .  .  .  .
+ *		      |                 |
+ *		     last	       next
+ *
+ *	"next" is only incremented by GetNewTransactionId(), which
+ *	is called when transactions are started.  Hence if there
+ *	are commits or aborts after "next", it means we committed
+ *	or aborted BEFORE we started the transaction.  This is the
+ *	rationale behind constraint (1).
+ *
+ *	Likewise, "last" should never be greater than "next" for
+ *	essentially the same reason - it would imply we committed
+ *	before we started.  This is the reasoning for (2).
+ *
+ *	(3) implies we may never have a situation such as:
+ *
+ *	     c  c  a  c  .  .  a  c  .  .  .  .  .  .  .  .  .
+ *		      |                 |
+ *		     last	       next
+ *
+ *	where there is a 'c' greater than "last".
+ *
+ *	Recovery checking is more difficult in the case where
+ *	several backends are executing concurrently because the
+ *	transactions may be executing in the other backends.
+ *	So, we only do recovery stuff when the backend is explicitly
+ *	passed a flag on the command line.
+ * --------------------------------
+ */
+void
+TransRecover(Relation logRelation)
+{
+#if 0
+    /* ----------------
+     *	first get the last recorded transaction in the log.
+     * ----------------
+     */
+    TransGetLastRecordedTransaction(logRelation, logLastXid, &fail);
+    if (fail == true)
+	elog(WARN, "TransRecover: failed TransGetLastRecordedTransaction");
+
+    /* ----------------
+     *	next get the "last" and "next" variables
+     * ----------------
+     */
+    VariableRelationGetLastXid(&varLastXid);
+    VariableRelationGetNextXid(&varNextXid);
+
+    /* ----------------
+     *	integrity test (1)
+     * ----------------
+     */
+    if (TransactionIdIsLessThan(varNextXid, logLastXid))
+	elog(WARN, "TransRecover: varNextXid < logLastXid");
+
+    /* ----------------
+     *	integrity test (2)
+     * ----------------
+     */
+
+    /* ----------------
+     *	integrity test (3)
+     * ----------------
+     */
+
+    /* ----------------
+     *	here we have a valid "
+     *
+     *	**** RESUME HERE ****
+     * ----------------
+     */
+    varNextXid = TransactionIdDup(varLastXid);
+    TransactionIdIncrement(&varNextXid);
+
+    VarPut(var, VAR_PUT_LASTXID, varLastXid);
+    VarPut(var, VAR_PUT_NEXTXID, varNextXid);
+#endif
+}
+
+/* ----------------------------------------------------------------
+ *			Interface functions
+ *
+ *	InitializeTransactionLog
+ *	========
+ *	   this function (called near cinit) initializes
+ *	   the transaction log, time and variable relations.
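
Stripped to essentials, the three integrity tests are single comparisons once the three transaction ids are in hand. A compact sketch, assuming xids compare as plain unsigned integers (the disabled body of TransRecover() above is the real skeleton):

    /* logLast = last xid recorded in the log,
     * varLast = "last committed" from the variable relation,
     * varNext = "next available" from the variable relation */
    static const char *
    recovery_check(unsigned long logLast, unsigned long varLast,
                   unsigned long varNext)
    {
        if (varNext < logLast)
            return "(1) log records a transaction beyond \"next\"";
        if (varNext < varLast)
            return "(2) \"last\" committed beyond \"next\"";
        if (logLast < varLast)
            return "(3) log ends before the last recorded commit";
        return (const char *) 0;    /* all three invariants hold */
    }
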
+ * + * TransactionId DidCommit + * TransactionId DidAbort + * TransactionId IsInProgress + * ======== + * these functions test the transaction status of + * a specified transaction id. + * + * TransactionId Commit + * TransactionId Abort + * TransactionId SetInProgress + * ======== + * these functions set the transaction status + * of the specified xid. TransactionIdCommit() also + * records the current time in the time relation + * and updates the variable relation counter. + * + * ---------------------------------------------------------------- + */ + +/* + * InitializeTransactionLog -- + * Initializes transaction logging. + */ +void +InitializeTransactionLog() +{ + Relation logRelation; + Relation timeRelation; + MemoryContext oldContext; + + /* ---------------- + * don't do anything during bootstrapping + * ---------------- + */ + if (AMI_OVERRIDE) + return; + + /* ---------------- + * disable the transaction system so the access methods + * don't interfere during initialization. + * ---------------- + */ + OverrideTransactionSystem(true); + + /* ---------------- + * make sure allocations occur within the top memory context + * so that our log management structures are protected from + * garbage collection at the end of every transaction. + * ---------------- + */ + oldContext = MemoryContextSwitchTo(TopMemoryContext); + + /* ---------------- + * first open the log and time relations + * (these are created by amiint so they are guaranteed to exist) + * ---------------- + */ + logRelation = heap_openr(LogRelationName); + timeRelation = heap_openr(TimeRelationName); + VariableRelation = heap_openr(VariableRelationName); + /* ---------------- + * XXX TransactionLogUpdate requires that LogRelation + * and TimeRelation are valid so we temporarily set + * them so we can initialize things properly. + * This could be done cleaner. + * ---------------- + */ + LogRelation = logRelation; + TimeRelation = timeRelation; + + /* ---------------- + * if we have a virgin database, we initialize the log and time + * relation by committing the AmiTransactionId (id 512) and we + * initialize the variable relation by setting the next available + * transaction id to FirstTransactionId (id 514). OID initialization + * happens as a side effect of bootstrapping in varsup.c. + * ---------------- + */ + SpinAcquire(OidGenLockId); + if (!TransactionIdDidCommit(AmiTransactionId)) { + + /* ---------------- + * SOMEDAY initialize the information stored in + * the headers of the log/time/variable relations. + * ---------------- + */ + TransactionLogUpdate(AmiTransactionId, XID_COMMIT); + VariableRelationPutNextXid(FirstTransactionId); + + } else if (RecoveryCheckingEnabled()) { + /* ---------------- + * if we have a pre-initialized database and if the + * perform recovery checking flag was passed then we + * do our database integrity checking. + * ---------------- + */ + TransRecover(logRelation); + } + LogRelation = (Relation) NULL; + TimeRelation = (Relation) NULL; + SpinRelease(OidGenLockId); + + /* ---------------- + * now re-enable the transaction system + * ---------------- + */ + OverrideTransactionSystem(false); + + /* ---------------- + * instantiate the global variables + * ---------------- + */ + LogRelation = logRelation; + TimeRelation = timeRelation; + + /* ---------------- + * restore the memory context to the previous context + * before we return from initialization. 
+ * ---------------- + */ + MemoryContextSwitchTo(oldContext); +} + +/* -------------------------------- + * TransactionId DidCommit + * TransactionId DidAbort + * TransactionId IsInProgress + * -------------------------------- + */ + +/* + * TransactionIdDidCommit -- + * True iff transaction associated with the identifier did commit. + * + * Note: + * Assumes transaction identifier is valid. + */ +bool /* true if given transaction committed */ +TransactionIdDidCommit(TransactionId transactionId) +{ + if (AMI_OVERRIDE) + return true; + + return + TransactionLogTest(transactionId, XID_COMMIT); +} + +/* + * TransactionIdDidAborted -- + * True iff transaction associated with the identifier did abort. + * + * Note: + * Assumes transaction identifier is valid. + * XXX Is this unneeded? + */ +bool /* true if given transaction aborted */ +TransactionIdDidAbort(TransactionId transactionId) +{ + if (AMI_OVERRIDE) + return false; + + return + TransactionLogTest(transactionId, XID_ABORT); +} + +bool /* true if given transaction neither committed nor aborted */ +TransactionIdIsInProgress(TransactionId transactionId) +{ + if (AMI_OVERRIDE) + return false; + + return + TransactionLogTest(transactionId, XID_INPROGRESS); +} + +/* -------------------------------- + * TransactionId Commit + * TransactionId Abort + * TransactionId SetInProgress + * -------------------------------- + */ + +/* + * TransactionIdCommit -- + * Commits the transaction associated with the identifier. + * + * Note: + * Assumes transaction identifier is valid. + */ +void +TransactionIdCommit(TransactionId transactionId) +{ + if (AMI_OVERRIDE) + return; + + /* + * Within TransactionLogUpdate we call UpdateLastCommited() + * which assumes we have exclusive access to pg_variable. + * Therefore we need to get exclusive access before calling + * TransactionLogUpdate. -mer 18 Aug 1992 + */ + SpinAcquire(OidGenLockId); + TransactionLogUpdate(transactionId, XID_COMMIT); + SpinRelease(OidGenLockId); +} + +/* + * TransactionIdAbort -- + * Aborts the transaction associated with the identifier. + * + * Note: + * Assumes transaction identifier is valid. 
+ */ +void +TransactionIdAbort(TransactionId transactionId) +{ + BuildingBtree = false; + + if (VacuumRunning) + vc_abort(); + + if (AMI_OVERRIDE) + return; + + TransactionLogUpdate(transactionId, XID_ABORT); +} + +void +TransactionIdSetInProgress(TransactionId transactionId) +{ + if (AMI_OVERRIDE) + return; + + TransactionLogUpdate(transactionId, XID_INPROGRESS); +} diff --git a/src/backend/access/transam/transsup.c b/src/backend/access/transam/transsup.c new file mode 100644 index 00000000000..a1e5b17ec13 --- /dev/null +++ b/src/backend/access/transam/transsup.c @@ -0,0 +1,663 @@ +/*------------------------------------------------------------------------- + * + * transsup.c-- + * postgres transaction access method support code + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/transam/Attic/transsup.c,v 1.1.1.1 1996/07/09 06:21:13 scrappy Exp $ + * + * NOTES + * This file contains support functions for the high + * level access method interface routines found in transam.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "machine.h" /* in port/ directory (needed for BLCKSZ) */ + +#include "storage/buf.h" +#include "storage/bufmgr.h" + +#include "utils/rel.h" +#include "utils/elog.h" +#include "utils/memutils.h" +#include "utils/nabstime.h" + +#include "catalog/heap.h" +#include "access/transam.h" /* where the declarations go */ +#include "access/xact.h" /* where the declarations go */ + +#include "storage/smgr.h" + +/* ---------------------------------------------------------------- + * general support routines + * ---------------------------------------------------------------- + */ + +/* -------------------------------- + * AmiTransactionOverride + * + * This function is used to manipulate the bootstrap flag. + * -------------------------------- + */ +void +AmiTransactionOverride(bool flag) +{ + AMI_OVERRIDE = flag; +} + +/* -------------------------------- + * TransComputeBlockNumber + * -------------------------------- + */ +void +TransComputeBlockNumber(Relation relation, /* relation to test */ + TransactionId transactionId, /* transaction id to test */ + BlockNumber *blockNumberOutP) +{ + long itemsPerBlock; + + /* ---------------- + * we calculate the block number of our transaction + * by dividing the transaction id by the number of + * transaction things per block. + * ---------------- + */ + if (relation == LogRelation) + itemsPerBlock = TP_NumXidStatusPerBlock; + else if (relation == TimeRelation) + itemsPerBlock = TP_NumTimePerBlock; + else + elog(WARN, "TransComputeBlockNumber: unknown relation"); + + /* ---------------- + * warning! if the transaction id's get too large + * then a BlockNumber may not be large enough to hold the results + * of our division. + * + * XXX this will all vanish soon when we implement an improved + * transaction id schema -cim 3/23/90 + * + * This has vanished now that xid's are 4 bytes (no longer 5). + * -mer 5/24/92 + * ---------------- + */ + (*blockNumberOutP) = transactionId / itemsPerBlock; +} + + +/* ---------------------------------------------------------------- + * trans block support routines + * ---------------------------------------------------------------- + */ + +/* -------------------------------- + * TransBlockGetLastTransactionIdStatus + * + * This returns the status and transaction id of the last + * transaction information recorded on the given TransBlock. 
+ * -------------------------------- + */ + +XidStatus +TransBlockGetLastTransactionIdStatus(Block tblock, + TransactionId baseXid, + TransactionId *returnXidP) +{ + Index index; + Index maxIndex; + bits8 bit1; + bits8 bit2; + BitIndex offset; + XidStatus xstatus; + + /* ---------------- + * sanity check + * ---------------- + */ + Assert((tblock != NULL)); + + /* ---------------- + * search downward from the top of the block data, looking + * for the first Non-in progress transaction status. Since we + * are scanning backward, this will be last recorded transaction + * status on the block. + * ---------------- + */ + maxIndex = TP_NumXidStatusPerBlock; + for (index = maxIndex-1; index>=0; index--) { + offset = BitIndexOf(index); + bit1 = ((bits8) BitArrayBitIsSet((BitArray) tblock, offset++)) << 1; + bit2 = (bits8) BitArrayBitIsSet((BitArray) tblock, offset); + + xstatus = (bit1 | bit2) ; + + /* ---------------- + * here we have the status of some transaction, so test + * if the status is recorded as "in progress". If so, then + * we save the transaction id in the place specified by the caller. + * ---------------- + */ + if (xstatus != XID_INPROGRESS) { + if (returnXidP != NULL) { + TransactionIdStore(baseXid, returnXidP); + TransactionIdAdd(returnXidP, index); + } + break; + } + } + + /* ---------------- + * if we get here and index is 0 it means we couldn't find + * a non-inprogress transaction on the block. For now we just + * return this info to the user. They can check if the return + * status is "in progress" to know this condition has arisen. + * ---------------- + */ + if (index == 0) { + if (returnXidP != NULL) + TransactionIdStore(baseXid, returnXidP); + } + + /* ---------------- + * return the status to the user + * ---------------- + */ + return xstatus; +} + +/* -------------------------------- + * TransBlockGetXidStatus + * + * This returns the status of the desired transaction + * -------------------------------- + */ + +XidStatus +TransBlockGetXidStatus(Block tblock, + TransactionId transactionId) +{ + Index index; + bits8 bit1; + bits8 bit2; + BitIndex offset; + + /* ---------------- + * sanity check + * ---------------- + */ + if (tblock == NULL) { + return XID_INVALID; + } + + /* ---------------- + * calculate the index into the transaction data where + * our transaction status is located + * + * XXX this will be replaced soon when we move to the + * new transaction id scheme -cim 3/23/90 + * + * The old system has now been replaced. -mer 5/24/92 + * ---------------- + */ + index = transactionId % TP_NumXidStatusPerBlock; + + /* ---------------- + * get the data at the specified index + * ---------------- + */ + offset = BitIndexOf(index); + bit1 = ((bits8) BitArrayBitIsSet((BitArray) tblock, offset++)) << 1; + bit2 = (bits8) BitArrayBitIsSet((BitArray) tblock, offset); + + /* ---------------- + * return the transaction status to the caller + * ---------------- + */ + return (XidStatus) + (bit1 | bit2); +} + +/* -------------------------------- + * TransBlockSetXidStatus + * + * This sets the status of the desired transaction + * -------------------------------- + */ +void +TransBlockSetXidStatus(Block tblock, + TransactionId transactionId, + XidStatus xstatus) +{ + Index index; + BitIndex offset; + + /* ---------------- + * sanity check + * ---------------- + */ + if (tblock == NULL) + return; + + /* ---------------- + * calculate the index into the transaction data where + * we sould store our transaction status. 
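+ *	(Worked example: the two status bits for xid t live at bit
+ *	positions BitIndexOf(t % TP_NumXidStatusPerBlock) and the
+ *	bit following it -- assuming BitIndexOf() maps an entry
+ *	index to its first bit, as the code above suggests.)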
+ * + * XXX this will be replaced soon when we move to the + * new transaction id scheme -cim 3/23/90 + * + * The new scheme is here -mer 5/24/92 + * ---------------- + */ + index = transactionId % TP_NumXidStatusPerBlock; + + offset = BitIndexOf(index); + + /* ---------------- + * store the transaction value at the specified offset + * ---------------- + */ + switch(xstatus) { + case XID_COMMIT: /* set 10 */ + BitArraySetBit((BitArray) tblock, offset); + BitArrayClearBit((BitArray) tblock, offset + 1); + break; + case XID_ABORT: /* set 01 */ + BitArrayClearBit((BitArray) tblock, offset); + BitArraySetBit((BitArray) tblock, offset + 1); + break; + case XID_INPROGRESS: /* set 00 */ + BitArrayClearBit((BitArray) tblock, offset); + BitArrayClearBit((BitArray) tblock, offset + 1); + break; + default: + elog(NOTICE, + "TransBlockSetXidStatus: invalid status: %d (ignored)", + xstatus); + break; + } +} + +/* -------------------------------- + * TransBlockGetCommitTime + * + * This returns the transaction commit time for the + * specified transaction id in the trans block. + * -------------------------------- + */ +AbsoluteTime +TransBlockGetCommitTime(Block tblock, + TransactionId transactionId) +{ + Index index; + AbsoluteTime *timeArray; + + /* ---------------- + * sanity check + * ---------------- + */ + if (tblock == NULL) + return INVALID_ABSTIME; + + /* ---------------- + * calculate the index into the transaction data where + * our transaction commit time is located + * + * XXX this will be replaced soon when we move to the + * new transaction id scheme -cim 3/23/90 + * + * The new scheme is here. -mer 5/24/92 + * ---------------- + */ + index = transactionId % TP_NumTimePerBlock; + + /* ---------------- + * return the commit time to the caller + * ---------------- + */ + timeArray = (AbsoluteTime *) tblock; + return (AbsoluteTime) + timeArray[ index ]; +} + +/* -------------------------------- + * TransBlockSetCommitTime + * + * This sets the commit time of the specified transaction + * -------------------------------- + */ +void +TransBlockSetCommitTime(Block tblock, + TransactionId transactionId, + AbsoluteTime commitTime) +{ + Index index; + AbsoluteTime *timeArray; + + /* ---------------- + * sanity check + * ---------------- + */ + if (tblock == NULL) + return; + + + /* ---------------- + * calculate the index into the transaction data where + * we sould store our transaction status. + * + * XXX this will be replaced soon when we move to the + * new transaction id scheme -cim 3/23/90 + * + * The new scheme is here. 
-mer 5/24/92 + * ---------------- + */ + index = transactionId % TP_NumTimePerBlock; + + /* ---------------- + * store the transaction commit time at the specified index + * ---------------- + */ + timeArray = (AbsoluteTime *) tblock; + timeArray[ index ] = commitTime; +} + +/* ---------------------------------------------------------------- + * transam i/o support routines + * ---------------------------------------------------------------- + */ + +/* -------------------------------- + * TransBlockNumberGetXidStatus + * -------------------------------- + */ +XidStatus +TransBlockNumberGetXidStatus(Relation relation, + BlockNumber blockNumber, + TransactionId xid, + bool *failP) +{ + Buffer buffer; /* buffer associated with block */ + Block block; /* block containing xstatus */ + XidStatus xstatus; /* recorded status of xid */ + bool localfail; /* bool used if failP = NULL */ + + /* ---------------- + * SOMEDAY place a read lock on the log relation + * That someday is today 5 Aug 1991 -mer + * ---------------- + */ + RelationSetLockForRead(relation); + + /* ---------------- + * get the page containing the transaction information + * ---------------- + */ + buffer = ReadBuffer(relation, blockNumber); + block = BufferGetBlock(buffer); + + /* ---------------- + * get the status from the block. note, for now we always + * return false in failP. + * ---------------- + */ + if (failP == NULL) + failP = &localfail; + (*failP) = false; + + xstatus = TransBlockGetXidStatus(block, xid); + + /* ---------------- + * release the buffer and return the status + * ---------------- + */ + ReleaseBuffer(buffer); + + /* ---------------- + * SOMEDAY release our lock on the log relation + * ---------------- + */ + RelationUnsetLockForRead(relation); + + return + xstatus; +} + +/* -------------------------------- + * TransBlockNumberSetXidStatus + * -------------------------------- + */ +void +TransBlockNumberSetXidStatus(Relation relation, + BlockNumber blockNumber, + TransactionId xid, + XidStatus xstatus, + bool *failP) +{ + Buffer buffer; /* buffer associated with block */ + Block block; /* block containing xstatus */ + bool localfail; /* bool used if failP = NULL */ + + /* ---------------- + * SOMEDAY gain exclusive access to the log relation + * + * That someday is today 5 Aug 1991 -mer + * ---------------- + */ + RelationSetLockForWrite(relation); + + /* ---------------- + * get the block containing the transaction status + * ---------------- + */ + buffer = ReadBuffer(relation, blockNumber); + block = BufferGetBlock(buffer); + + /* ---------------- + * attempt to update the status of the transaction on the block. + * if we are successful, write the block. otherwise release the buffer. + * note, for now we always return false in failP. 
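+ *
+ *	A hypothetical caller sketch (failP may also be NULL if the
+ *	caller does not care about failure):
+ *
+ *		bool fail;
+ *		TransBlockNumberSetXidStatus(LogRelation, blockNumber,
+ *					     xid, XID_COMMIT, &fail);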
+ * ---------------- + */ + if (failP == NULL) + failP = &localfail; + (*failP) = false; + + TransBlockSetXidStatus(block, xid, xstatus); + + if ((*failP) == false) + WriteBuffer(buffer); + else + ReleaseBuffer(buffer); + + /* ---------------- + * SOMEDAY release our lock on the log relation + * ---------------- + */ + RelationUnsetLockForWrite(relation); +} + +/* -------------------------------- + * TransBlockNumberGetCommitTime + * -------------------------------- + */ +AbsoluteTime +TransBlockNumberGetCommitTime(Relation relation, + BlockNumber blockNumber, + TransactionId xid, + bool *failP) +{ + Buffer buffer; /* buffer associated with block */ + Block block; /* block containing commit time */ + bool localfail; /* bool used if failP = NULL */ + AbsoluteTime xtime; /* commit time */ + + /* ---------------- + * SOMEDAY place a read lock on the time relation + * + * That someday is today 5 Aug. 1991 -mer + * ---------------- + */ + RelationSetLockForRead(relation); + + /* ---------------- + * get the block containing the transaction information + * ---------------- + */ + buffer = ReadBuffer(relation, blockNumber); + block = BufferGetBlock(buffer); + + /* ---------------- + * get the commit time from the block + * note, for now we always return false in failP. + * ---------------- + */ + if (failP == NULL) + failP = &localfail; + (*failP) = false; + + xtime = TransBlockGetCommitTime(block, xid); + + /* ---------------- + * release the buffer and return the commit time + * ---------------- + */ + ReleaseBuffer(buffer); + + /* ---------------- + * SOMEDAY release our lock on the time relation + * ---------------- + */ + RelationUnsetLockForRead(relation); + + if ((*failP) == false) + return xtime; + else + return INVALID_ABSTIME; + +} + +/* -------------------------------- + * TransBlockNumberSetCommitTime + * -------------------------------- + */ +void +TransBlockNumberSetCommitTime(Relation relation, + BlockNumber blockNumber, + TransactionId xid, + AbsoluteTime xtime, + bool *failP) +{ + Buffer buffer; /* buffer associated with block */ + Block block; /* block containing commit time */ + bool localfail; /* bool used if failP = NULL */ + + /* ---------------- + * SOMEDAY gain exclusive access to the time relation + * + * That someday is today 5 Aug. 1991 -mer + * ---------------- + */ + RelationSetLockForWrite(relation); + + /* ---------------- + * get the block containing our commit time + * ---------------- + */ + buffer = ReadBuffer(relation, blockNumber); + block = BufferGetBlock(buffer); + + /* ---------------- + * attempt to update the commit time of the transaction on the block. + * if we are successful, write the block. otherwise release the buffer. + * note, for now we always return false in failP. 
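+ *	(Our reading of the buffer manager contract: WriteBuffer()
+ *	both dirties and releases the buffer, which is why exactly
+ *	one of WriteBuffer()/ReleaseBuffer() runs on each path.)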
+ * ---------------- + */ + if (failP == NULL) + failP = &localfail; + (*failP) = false; + + TransBlockSetCommitTime(block, xid, xtime); + + if ((*failP) == false) + WriteBuffer(buffer); + else + ReleaseBuffer(buffer); + + /* ---------------- + * SOMEDAY release our lock on the time relation + * ---------------- + */ + RelationUnsetLockForWrite(relation); + +} + +/* -------------------------------- + * TransGetLastRecordedTransaction + * -------------------------------- + */ +void +TransGetLastRecordedTransaction(Relation relation, + TransactionId xid, /* return: transaction id */ + bool *failP) +{ + BlockNumber blockNumber; /* block number */ + Buffer buffer; /* buffer associated with block */ + Block block; /* block containing xid status */ + BlockNumber n; /* number of blocks in the relation */ + TransactionId baseXid; + + (*failP) = false; + + /* ---------------- + * SOMEDAY gain exclusive access to the log relation + * + * That someday is today 5 Aug. 1991 -mer + * It looks to me like we only need to set a read lock here, despite + * the above comment about exclusive access. The block is never + * actually written into, we only check status bits. + * ---------------- + */ + RelationSetLockForRead(relation); + + /* ---------------- + * we assume the last block of the log contains the last + * recorded transaction. If the relation is empty we return + * failure to the user. + * ---------------- + */ + n = RelationGetNumberOfBlocks(relation); + if (n == 0) { + (*failP) = true; + return; + } + + /* ---------------- + * get the block containing the transaction information + * ---------------- + */ + blockNumber = n-1; + buffer = ReadBuffer(relation, blockNumber); + block = BufferGetBlock(buffer); + + /* ---------------- + * get the last xid on the block + * ---------------- + */ + baseXid = blockNumber * TP_NumXidStatusPerBlock; + +/* XXX ???? xid won't get returned! - AY '94 */ + (void) TransBlockGetLastTransactionIdStatus(block, baseXid, &xid); + + ReleaseBuffer(buffer); + + /* ---------------- + * SOMEDAY release our lock on the log relation + * ---------------- + */ + RelationUnsetLockForRead(relation); +} diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c new file mode 100644 index 00000000000..a53cc7d35b1 --- /dev/null +++ b/src/backend/access/transam/varsup.c @@ -0,0 +1,606 @@ +/*------------------------------------------------------------------------- + * + * varsup.c-- + * postgres variable relation support routines + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/transam/varsup.c,v 1.1.1.1 1996/07/09 06:21:13 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#include <math.h> +#include "postgres.h" + +#include "machine.h" /* in port/ directory (needed for BLCKSZ) */ +#include "storage/buf.h" +#include "storage/bufmgr.h" +#include "storage/ipc.h" /* for OIDGENLOCKID */ + +#include "utils/rel.h" +#include "utils/elog.h" + +#include "access/heapam.h" +#include "access/transam.h" /* where the declarations go */ +#include "access/xact.h" /* where the declarations go */ + +#include "catalog/catname.h" + +/* ---------- + * note: we reserve the first 16384 object ids for internal use. + * oid's less than this appear in the .bki files. the choice of + * 16384 is completely arbitrary. 
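+ *
+ * (consequently, when pg_variable does not yet hold a valid
+ * nextOid, the first oid handed out is 16384 -- see
+ * VariableRelationGetNextOid below.)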
+ * ----------
+ */
+#define BootstrapObjectIdData 16384
+
+/* ---------------------
+ *	spin lock for oid generation
+ * ---------------------
+ */
+int OidGenLockId;
+
+/* ----------------------------------------------------------------
+ *	      variable relation query/update routines
+ * ----------------------------------------------------------------
+ */
+
+/* --------------------------------
+ *	VariableRelationGetNextXid
+ * --------------------------------
+ */
+void
+VariableRelationGetNextXid(TransactionId *xidP)
+{
+    Buffer buf;
+    VariableRelationContents var;
+
+    /* ----------------
+     *	We assume that a spinlock has been acquired to guarantee
+     *	exclusive access to the variable relation.
+     * ----------------
+     */
+
+    /* ----------------
+     *	do nothing before things are initialized
+     * ----------------
+     */
+    if (! RelationIsValid(VariableRelation))
+	return;
+
+    /* ----------------
+     *	read the variable page, get the nextXid field and
+     *	release the buffer
+     * ----------------
+     */
+    buf = ReadBuffer(VariableRelation, 0);
+
+    if (! BufferIsValid(buf))
+    {
+	SpinRelease(OidGenLockId);
+	elog(WARN, "VariableRelationGetNextXid: ReadBuffer failed");
+    }
+
+    var = (VariableRelationContents) BufferGetBlock(buf);
+
+    TransactionIdStore(var->nextXidData, xidP);
+    ReleaseBuffer(buf);
+}
+
+/* --------------------------------
+ *	VariableRelationGetLastXid
+ * --------------------------------
+ */
+void
+VariableRelationGetLastXid(TransactionId *xidP)
+{
+    Buffer buf;
+    VariableRelationContents var;
+
+    /* ----------------
+     *	We assume that a spinlock has been acquired to guarantee
+     *	exclusive access to the variable relation.
+     * ----------------
+     */
+
+    /* ----------------
+     *	do nothing before things are initialized
+     * ----------------
+     */
+    if (! RelationIsValid(VariableRelation))
+	return;
+
+    /* ----------------
+     *	read the variable page, get the lastXid field and
+     *	release the buffer
+     * ----------------
+     */
+    buf = ReadBuffer(VariableRelation, 0);
+
+    if (! BufferIsValid(buf))
+    {
+	SpinRelease(OidGenLockId);
+	elog(WARN, "VariableRelationGetLastXid: ReadBuffer failed");
+    }
+
+    var = (VariableRelationContents) BufferGetBlock(buf);
+
+    TransactionIdStore(var->lastXidData, xidP);
+
+    ReleaseBuffer(buf);
+}
+
+/* --------------------------------
+ *	VariableRelationPutNextXid
+ * --------------------------------
+ */
+void
+VariableRelationPutNextXid(TransactionId xid)
+{
+    Buffer buf;
+    VariableRelationContents var;
+
+    /* ----------------
+     *	We assume that a spinlock has been acquired to guarantee
+     *	exclusive access to the variable relation.
+     * ----------------
+     */
+
+    /* ----------------
+     *	do nothing before things are initialized
+     * ----------------
+     */
+    if (! RelationIsValid(VariableRelation))
+	return;
+
+    /* ----------------
+     *	read the variable page, update the nextXid field and
+     *	write the page back out to disk.
+     * ----------------
+     */
+    buf = ReadBuffer(VariableRelation, 0);
+
+    if (! BufferIsValid(buf))
+    {
+	SpinRelease(OidGenLockId);
+	elog(WARN, "VariableRelationPutNextXid: ReadBuffer failed");
+    }
+
+    var = (VariableRelationContents) BufferGetBlock(buf);
+
+    TransactionIdStore(xid, &(var->nextXidData));
+
+    WriteBuffer(buf);
+}
+
+/* --------------------------------
+ *	VariableRelationPutLastXid
+ * --------------------------------
+ */
+void
+VariableRelationPutLastXid(TransactionId xid)
+{
+    Buffer buf;
+    VariableRelationContents var;
+
+    /* ----------------
+     *	We assume that a spinlock has been acquired to guarantee
+     *	exclusive access to the variable relation.
+     * ----------------
+     */
+
+    /* ----------------
+     *	do nothing before things are initialized
+     * ----------------
+     */
+    if (! RelationIsValid(VariableRelation))
+	return;
+
+    /* ----------------
+     *	read the variable page, update the lastXid field and
+     *	force the page back out to disk.
+     * ----------------
+     */
+    buf = ReadBuffer(VariableRelation, 0);
+
+    if (! BufferIsValid(buf))
+    {
+	SpinRelease(OidGenLockId);
+	elog(WARN, "VariableRelationPutLastXid: ReadBuffer failed");
+    }
+
+    var = (VariableRelationContents) BufferGetBlock(buf);
+
+    TransactionIdStore(xid, &(var->lastXidData));
+
+    WriteBuffer(buf);
+}
+
+/* --------------------------------
+ *	VariableRelationGetNextOid
+ * --------------------------------
+ */
+void
+VariableRelationGetNextOid(Oid *oid_return)
+{
+    Buffer buf;
+    VariableRelationContents var;
+
+    /* ----------------
+     *	We assume that a spinlock has been acquired to guarantee
+     *	exclusive access to the variable relation.
+     * ----------------
+     */
+
+    /* ----------------
+     *	if the variable relation is not initialized, then we
+     *	assume we are running at bootstrap time and so we return
+     *	an invalid object id -- during this time GetNextBootstrapObjectId
+     *	should be called instead.
+     * ----------------
+     */
+    if (! RelationIsValid(VariableRelation)) {
+	if (PointerIsValid(oid_return))
+	    (*oid_return) = InvalidOid;
+	return;
+    }
+
+    /* ----------------
+     *	read the variable page, get the nextOid field and
+     *	release the buffer
+     * ----------------
+     */
+    buf = ReadBuffer(VariableRelation, 0);
+
+    if (! BufferIsValid(buf))
+    {
+	SpinRelease(OidGenLockId);
+	elog(WARN, "VariableRelationGetNextOid: ReadBuffer failed");
+    }
+
+    var = (VariableRelationContents) BufferGetBlock(buf);
+
+    if (PointerIsValid(oid_return)) {
+
+	/* ----------------
+	 * nothing up my sleeve... what's going on here is that this code
+	 * is guaranteed never to be called until all files in data/base/
+	 * are created, and the template database exists. at that point,
+	 * we want to append a pg_database tuple. the first time we do
+	 * this, the oid stored in pg_variable will be bogus, so we use
+	 * a bootstrap value defined at the top of this file.
+	 *
+	 * this comment no longer holds true. This code is called before
+	 * all of the files in data/base are created and you can't rely
+	 * on system oid's to be less than BootstrapObjectIdData. mer 9/18/91
+	 * ----------------
+	 */
+	if (OidIsValid(var->nextOid))
+	    (*oid_return) = var->nextOid;
+	else
+	    (*oid_return) = BootstrapObjectIdData;
+    }
+
+    ReleaseBuffer(buf);
+}
+
+/* --------------------------------
+ *	VariableRelationPutNextOid
+ * --------------------------------
+ */
+void
+VariableRelationPutNextOid(Oid *oidP)
+{
+    Buffer buf;
+    VariableRelationContents var;
+
+    /* ----------------
+     *	We assume that a spinlock has been acquired to guarantee
+     *	exclusive access to the variable relation.
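+     *
+     *	(Note the convention in the routines above: since elog(WARN)
+     *	exits non-locally, each error path releases OidGenLockId
+     *	explicitly before calling it.)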
+     * ----------------
+     */
+
+    /* ----------------
+     *	do nothing before things are initialized
+     * ----------------
+     */
+    if (! RelationIsValid(VariableRelation))
+	return;
+
+    /* ----------------
+     *	sanity check
+     * ----------------
+     */
+    if (! PointerIsValid(oidP))
+    {
+	SpinRelease(OidGenLockId);
+	elog(WARN, "VariableRelationPutNextOid: invalid oid pointer");
+    }
+
+    /* ----------------
+     *	read the variable page, update the nextOid field and
+     *	write the page back out to disk.
+     * ----------------
+     */
+    buf = ReadBuffer(VariableRelation, 0);
+
+    if (! BufferIsValid(buf))
+    {
+	SpinRelease(OidGenLockId);
+	elog(WARN, "VariableRelationPutNextOid: ReadBuffer failed");
+    }
+
+    var = (VariableRelationContents) BufferGetBlock(buf);
+
+    var->nextOid = (*oidP);
+
+    WriteBuffer(buf);
+}
+
+/* ----------------------------------------------------------------
+ *	transaction id generation support
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------
+ *	GetNewTransactionId
+ *
+ *	In the version 2 transaction system, transaction id's are
+ *	restricted in several ways.
+ *
+ *	First, all transaction id's are even numbers (4, 88, 121342, etc).
+ *	This means the binary representation of the number will never
+ *	have the least significant bit set. This bit is reserved to
+ *	indicate that the transaction id does not in fact hold an XID,
+ *	but rather a commit time. This makes it possible for the
+ *	vacuum daemon to discard information from the log and time
+ *	relations for committed tuples. This is important when archiving
+ *	tuples to an optical disk because tuples with commit times
+ *	stored in their xid fields will not need to consult the log
+ *	and time relations.
+ *
+ *	Second, since we may someday perform compression of the data
+ *	in the log and time relations, we cause the numbering of the
+ *	transaction ids to begin at 512. This means that some space
+ *	on the page of the log and time relations corresponding to
+ *	transaction id's 0 - 510 will never be used. This space is
+ *	in fact used to store the version number of the postgres
+ *	transaction log and will someday store compression information
+ *	about the log.
+ *
+ *	Lastly, rather than access the variable relation each time
+ *	a backend requests a new transaction id, we "prefetch" 32
+ *	transaction id's by incrementing the nextXid stored in the
+ *	var relation by 64 (remember only even xid's are legal) and then
+ *	returning these id's one at a time until they are exhausted.
+ *	This means we reduce the number of accesses to the variable
+ *	relation by 32 for each backend.
+ *
+ *	(Both the even-xid restriction and the increment-by-64 are
+ *	now obsolete -- xid's may be even or odd, and the code below
+ *	increments by VAR_XID_PREFETCH. See the XXX note in the
+ *	body of GetNewTransactionId.)
+ *
+ *	Note: 32 has no special significance. We don't want the
+ *	number to be too large because when the backend
+ *	terminates, we lose the xid's we cached.
+ *
+ * ----------------
+ */
+
+#define VAR_XID_PREFETCH 32
+
+static int prefetched_xid_count = 0;
+static TransactionId next_prefetched_xid;
+
+void
+GetNewTransactionId(TransactionId *xid)
+{
+    TransactionId nextid;
+
+    /* ----------------
+     *	during bootstrap initialization, we return the special
+     *	bootstrap transaction id.
+     * ----------------
+     */
+    if (AMI_OVERRIDE) {
+	TransactionIdStore(AmiTransactionId, xid);
+	return;
+    }
+
+    /* ----------------
+     *	if we run out of prefetched xids, then we get some
+     *	more before handing them out to the caller.
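+     *
+     *	Example: if pg_variable's nextXid is 512, the first call
+     *	caches 512 here, writes 544 (512 + VAR_XID_PREFETCH) back
+     *	to pg_variable, and this backend then hands out 512..543
+     *	without touching the relation again.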
+     * ----------------
+     */
+
+    if (prefetched_xid_count == 0) {
+	/* ----------------
+	 *   obtain exclusive access to the variable relation page
+	 *
+	 *   get the "next" xid from the variable relation
+	 *   and save it in the prefetched id.
+	 * ----------------
+	 */
+	SpinAcquire(OidGenLockId);
+	VariableRelationGetNextXid(&nextid);
+	TransactionIdStore(nextid, &next_prefetched_xid);
+
+	/* ----------------
+	 *   now increment the variable relation's next xid
+	 *   by the number of xids we prefetched and reset
+	 *   the prefetched_xid_count.
+	 * ----------------
+	 */
+	prefetched_xid_count = VAR_XID_PREFETCH;
+	TransactionIdAdd(&nextid, prefetched_xid_count);
+	VariableRelationPutNextXid(nextid);
+	SpinRelease(OidGenLockId);
+    }
+
+    /* ----------------
+     *	return the next prefetched xid in the pointer passed by
+     *	the user and decrement the prefetch count. We add one
+     *	to the id we will return the next time this is called.
+     *
+     *	XXX Transaction Ids used to be even as the low order bit was
+     *	used to determine commit status. This is no longer true so
+     *	we now use even and odd transaction ids. -mer 5/26/92
+     * ----------------
+     */
+    TransactionIdStore(next_prefetched_xid, xid);
+    TransactionIdAdd(&next_prefetched_xid, 1);
+    prefetched_xid_count--;
+}
+
+/* ----------------
+ *	UpdateLastCommittedXid
+ * ----------------
+ */
+
+void
+UpdateLastCommittedXid(TransactionId xid)
+{
+    TransactionId lastid;
+
+
+    /* we assume that spinlock OidGenLockId has been acquired
+     * prior to entering this function
+     */
+
+    /* ----------------
+     *	get the "last committed" transaction id from
+     *	the variable relation page.
+     * ----------------
+     */
+    VariableRelationGetLastXid(&lastid);
+
+    /* ----------------
+     *	if the transaction id is greater than the last committed
+     *	transaction then we update the last committed transaction
+     *	in the variable relation.
+     * ----------------
+     */
+    if (TransactionIdIsLessThan(lastid, xid))
+	VariableRelationPutLastXid(xid);
+
+}
+
+/* ----------------------------------------------------------------
+ *	object id generation support
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------
+ *	GetNewObjectIdBlock
+ *
+ *	This support function is used to allocate a block of object ids
+ *	of the given size. Applications wishing to do their own object
+ *	id assignments should use this.
+ * ----------------
+ */
+void
+GetNewObjectIdBlock(Oid *oid_return,	/* place to return the new object id */
+		    int oid_block_size)	/* number of oids desired */
+{
+    Oid nextoid;
+
+    /* ----------------
+     *	SOMEDAY obtain exclusive access to the variable relation page
+     *	That someday is today -mer 6 Aug 1992
+     * ----------------
+     */
+    SpinAcquire(OidGenLockId);
+
+    /* ----------------
+     *	get the "next" oid from the variable relation
+     *	and give it to the caller.
+     * ----------------
+     */
+    VariableRelationGetNextOid(&nextoid);
+    if (PointerIsValid(oid_return))
+	(*oid_return) = nextoid;
+
+    /* ----------------
+     *	now increment the variable relation's next oid
+     *	field by the size of the oid block requested.
+     * ----------------
+     */
+    nextoid += oid_block_size;
+    VariableRelationPutNextOid(&nextoid);
+
+    /* ----------------
+     *	SOMEDAY relinquish our lock on the variable relation page
+     *	That someday is today -mer 6 Apr 1992
+     * ----------------
+     */
+    SpinRelease(OidGenLockId);
+}
+
+/* ----------------
+ *	GetNewObjectId
+ *
+ *	This function allocates and parses out object ids.
Like + * GetNewTransactionId(), it "prefetches" 32 object ids by + * incrementing the nextOid stored in the var relation by 32 and then + * returning these id's one at a time until they are exhausted. + * This means we reduce the number of accesses to the variable + * relation by 32 for each backend. + * + * Note: 32 has no special significance. We don't want the + * number to be too large because if when the backend + * terminates, we lose the oids we cached. + * + * ---------------- + */ + +#define VAR_OID_PREFETCH 32 + +static int prefetched_oid_count = 0; +static Oid next_prefetched_oid; + +void +GetNewObjectId(Oid *oid_return) /* place to return the new object id */ +{ + /* ---------------- + * if we run out of prefetched oids, then we get some + * more before handing them out to the caller. + * ---------------- + */ + + if (prefetched_oid_count == 0) { + int oid_block_size = VAR_OID_PREFETCH; + + /* ---------------- + * during bootstrap time, we want to allocate oids + * one at a time. Otherwise there might be some + * bootstrap oid's left in the block we prefetch which + * would be passed out after the variable relation was + * initialized. This would be bad. + * ---------------- + */ + if (! RelationIsValid(VariableRelation)) + VariableRelation = heap_openr(VariableRelationName); + + /* ---------------- + * get a new block of prefetched object ids. + * ---------------- + */ + GetNewObjectIdBlock(&next_prefetched_oid, oid_block_size); + + /* ---------------- + * now reset the prefetched_oid_count. + * ---------------- + */ + prefetched_oid_count = oid_block_size; + } + + /* ---------------- + * return the next prefetched oid in the pointer passed by + * the user and decrement the prefetch count. + * ---------------- + */ + if (PointerIsValid(oid_return)) + (*oid_return) = next_prefetched_oid; + + next_prefetched_oid++; + prefetched_oid_count--; +} diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c new file mode 100644 index 00000000000..1798d09d054 --- /dev/null +++ b/src/backend/access/transam/xact.c @@ -0,0 +1,1314 @@ +/*------------------------------------------------------------------------- + * + * xact.c-- + * top level transaction system support routines + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/transam/xact.c,v 1.1.1.1 1996/07/09 06:21:13 scrappy Exp $ + * + * NOTES + * Transaction aborts can now occur two ways: + * + * 1) system dies from some internal cause (Assert, etc..) + * 2) user types abort + * + * These two cases used to be treated identically, but now + * we need to distinguish them. Why? consider the following + * two situatuons: + * + * case 1 case 2 + * ------ ------ + * 1) user types BEGIN 1) user types BEGIN + * 2) user does something 2) user does something + * 3) user does not like what 3) system aborts for some reason + * she shes and types ABORT + * + * In case 1, we want to abort the transaction and return to the + * default state. In case 2, there may be more commands coming + * our way which are part of the same transaction block and we have + * to ignore these commands until we see an END transaction. + * + * Internal aborts are now handled by AbortTransactionBlock(), just as + * they always have been, and user aborts are now handled by + * UserAbortTransactionBlock(). Both of them rely on AbortTransaction() + * to do all the real work. 
The only difference is what state we
+ *	enter after AbortTransaction() does its work:
+ *
+ *	* AbortTransactionBlock() leaves us in TBLOCK_ABORT and
+ *	* UserAbortTransactionBlock() leaves us in TBLOCK_ENDABORT
+ *
+ * NOTES
+ *	This file is an attempt at a redesign of the upper layer
+ *	of the V1 transaction system which was too poorly thought
+ *	out to describe. This new system aims to be simpler in
+ *	design, easier to extend, and to add functionality that
+ *	solves problems beyond the scope of the V1 system. (In
+ *	particular, communication of transaction information
+ *	between parallel backends has to be supported.)
+ *
+ *	The essential aspects of the transaction system are:
+ *
+ *		o  transaction id generation
+ *		o  transaction log updating
+ *		o  memory cleanup
+ *		o  cache invalidation
+ *		o  lock cleanup
+ *
+ *	Hence, the functional division of the transaction code is
+ *	based on which of the above things need to be done during
+ *	a start/commit/abort transaction. For instance, the
+ *	routine AtCommit_Memory() takes care of all the memory
+ *	cleanup stuff done at commit time.
+ *
+ *	The code is layered as follows:
+ *
+ *		StartTransaction
+ *		CommitTransaction
+ *		AbortTransaction
+ *		UserAbortTransaction
+ *
+ *	are provided to do the lower level work like recording
+ *	the transaction status in the log and doing memory cleanup.
+ *	Above these routines are another set of functions:
+ *
+ *		StartTransactionCommand
+ *		CommitTransactionCommand
+ *		AbortCurrentTransaction
+ *
+ *	These are the routines used in the postgres main processing
+ *	loop. They are sensitive to the current transaction block state
+ *	and make calls to the lower level routines appropriately.
+ *
+ *	Support for transaction blocks is provided via the functions:
+ *
+ *		StartTransactionBlock
+ *		CommitTransactionBlock
+ *		AbortTransactionBlock
+ *
+ *	These are invoked only in response to a user "BEGIN", "END",
+ *	or "ABORT" command. The tricky part about these functions
+ *	is that they are called within the postgres main loop, in between
+ *	the StartTransactionCommand() and CommitTransactionCommand().
+ *
+ *	For example, consider the following sequence of user commands:
+ *
+ *	1)	begin
+ *	2)	retrieve (foo.all)
+ *	3)	append foo (bar = baz)
+ *	4)	end
+ *
+ *	in the main processing loop, this results in the following
+ *	transaction sequence:
+ *
+ *	    /	StartTransactionCommand();
+ *	1) /	ProcessUtility();		<< begin
+ *	   \	    StartTransactionBlock();
+ *	    \	CommitTransactionCommand();
+ *
+ *	    /	StartTransactionCommand();
+ *	2) <	ProcessQuery();			<< retrieve (foo.all)
+ *	    \	CommitTransactionCommand();
+ *
+ *	    /	StartTransactionCommand();
+ *	3) <	ProcessQuery();			<< append foo (bar = baz)
+ *	    \	CommitTransactionCommand();
+ *
+ *	    /	StartTransactionCommand();
+ *	4) /	ProcessUtility();		<< end
+ *	   \	    CommitTransactionBlock();
+ *	    \	CommitTransactionCommand();
+ *
+ *	The point of this example is to demonstrate the need for
+ *	StartTransactionCommand() and CommitTransactionCommand() to
+ *	be state smart -- they should do nothing in between the calls
+ *	to StartTransactionBlock() and EndTransactionBlock() and
+ *	outside these calls they need to do normal start/commit
+ *	processing.
+ *
+ *	Furthermore, suppose the "retrieve (foo.all)" caused an abort
+ *	condition. We would then want to abort the transaction and
+ *	ignore all subsequent commands up to the "end".
+ *	-cim 3/23/90
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+#include "access/xact.h"
+#include "commands/async.h"
+#include "storage/bufmgr.h"
+#include "storage/block.h"
+#include "storage/proc.h"
+#include "utils/inval.h"
+#include "utils/relcache.h"
+#include "access/transam.h"
+#include "catalog/heap.h"
+
+/* ----------------
+ *	global variables holding the current transaction state.
+ *
+ *	Note: when we are running several slave processes, the
+ *	current transaction state data is copied into shared memory
+ *	and the CurrentTransactionState pointer changed to
+ *	point to the shared copy. All this occurs in slaves.c
+ * ----------------
+ */
+TransactionStateData CurrentTransactionStateData = {
+    0,				/* transaction id */
+    FirstCommandId,		/* command id */
+    0x0,			/* start time */
+    TRANS_DEFAULT,		/* transaction state */
+    TBLOCK_DEFAULT		/* transaction block state */
+    };
+
+TransactionState CurrentTransactionState =
+    &CurrentTransactionStateData;
+
+/* ----------------
+ *	info returned when the system is disabled
+ *
+ *	Note: I have no idea what the significance of the
+ *	1073741823 in DisabledStartTime is. I just carried
+ *	this over when converting things from the old
+ *	V1 transaction system. -cim 3/18/90
+ * ----------------
+ */
+TransactionId DisabledTransactionId = (TransactionId)-1;
+
+CommandId DisabledCommandId = (CommandId) -1;
+
+AbsoluteTime DisabledStartTime = (AbsoluteTime) 1073741823;
+
+/* ----------------
+ *	overflow flag
+ * ----------------
+ */
+bool CommandIdCounterOverflowFlag;
+
+/* ----------------
+ *	catalog creation transaction bootstrapping flag.
+ *	This should be eliminated and added to the transaction
+ *	state stuff. -cim 3/19/90
+ * ----------------
+ */
+bool AMI_OVERRIDE = false;
+
+/* ----------------------------------------------------------------
+ *	transaction state accessors
+ * ----------------------------------------------------------------
+ */
+
+/* --------------------------------
+ *	TransactionFlushEnabled()
+ *	SetTransactionFlushEnabled()
+ *
+ *	These are used to test and set the "TransactionFlushState"
+ *	variable. If this variable is true (the default), then
+ *	the system will flush all dirty buffers to disk at the end
+ *	of each transaction. If false then we are assuming the
+ *	buffer pool resides in stable main memory, in which case we
+ *	only do writes as necessary.
+ * --------------------------------
+ */
+static int TransactionFlushState = 1;
+
+int
+TransactionFlushEnabled()
+{
+    return TransactionFlushState;
+}
+
+void
+SetTransactionFlushEnabled(bool state)
+{
+    TransactionFlushState = (state == true);
+}
+
+/* --------------------------------
+ *	IsTransactionState
+ *
+ *	This returns true if we are currently running a query
+ *	within an executing transaction.
+ * --------------------------------
+ */
+bool
+IsTransactionState()
+{
+    TransactionState s = CurrentTransactionState;
+
+    switch (s->state) {
+    case TRANS_DEFAULT:		return false;
+    case TRANS_START:		return true;
+    case TRANS_INPROGRESS:	return true;
+    case TRANS_COMMIT:		return true;
+    case TRANS_ABORT:		return true;
+    case TRANS_DISABLED:	return false;
+    }
+    /*
+     * Shouldn't get here, but lint is not happy with this...
+     */
+    return(false);
+}
+
+/* --------------------------------
+ *	IsAbortedTransactionBlockState
+ *
+ *	This returns true if we are currently running a query
+ *	within an aborted transaction block.
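+ *
+ *	(The main processing loop uses this to ignore queries
+ *	submitted between an abort and the user's END TRANSACTION;
+ *	see the notes at the top of this file.)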
+ * -------------------------------- + */ +bool +IsAbortedTransactionBlockState() +{ + TransactionState s = CurrentTransactionState; + + if (s->blockState == TBLOCK_ABORT) + return true; + + return false; +} + +/* -------------------------------- + * OverrideTransactionSystem + * + * This is used to temporarily disable the transaction + * processing system in order to do initialization of + * the transaction system data structures and relations + * themselves. + * -------------------------------- + */ +int SavedTransactionState; + +void +OverrideTransactionSystem(bool flag) +{ + TransactionState s = CurrentTransactionState; + + if (flag == true) { + if (s->state == TRANS_DISABLED) + return; + + SavedTransactionState = s->state; + s->state = TRANS_DISABLED; + } else { + if (s->state != TRANS_DISABLED) + return; + + s->state = SavedTransactionState; + } +} + +/* -------------------------------- + * GetCurrentTransactionId + * + * This returns the id of the current transaction, or + * the id of the "disabled" transaction. + * -------------------------------- + */ +TransactionId +GetCurrentTransactionId() +{ + TransactionState s = CurrentTransactionState; + + /* ---------------- + * if the transaction system is disabled, we return + * the special "disabled" transaction id. + * ---------------- + */ + if (s->state == TRANS_DISABLED) + return (TransactionId) DisabledTransactionId; + + /* ---------------- + * otherwise return the current transaction id. + * ---------------- + */ + return (TransactionId) s->transactionIdData; +} + + +/* -------------------------------- + * GetCurrentCommandId + * -------------------------------- + */ +CommandId +GetCurrentCommandId() +{ + TransactionState s = CurrentTransactionState; + + /* ---------------- + * if the transaction system is disabled, we return + * the special "disabled" command id. + * ---------------- + */ + if (s->state == TRANS_DISABLED) + return (CommandId) DisabledCommandId; + + return s->commandId; +} + + +/* -------------------------------- + * GetCurrentTransactionStartTime + * -------------------------------- + */ +AbsoluteTime +GetCurrentTransactionStartTime() +{ + TransactionState s = CurrentTransactionState; + + /* ---------------- + * if the transaction system is disabled, we return + * the special "disabled" starting time. + * ---------------- + */ + if (s->state == TRANS_DISABLED) + return (AbsoluteTime) DisabledStartTime; + + return s->startTime; +} + + +/* -------------------------------- + * TransactionIdIsCurrentTransactionId + * -------------------------------- + */ +bool +TransactionIdIsCurrentTransactionId(TransactionId xid) +{ + TransactionState s = CurrentTransactionState; + + if (AMI_OVERRIDE) + return false; + + return (bool) + TransactionIdEquals(xid, s->transactionIdData); +} + + +/* -------------------------------- + * CommandIdIsCurrentCommandId + * -------------------------------- + */ +bool +CommandIdIsCurrentCommandId(CommandId cid) +{ + TransactionState s = CurrentTransactionState; + + if (AMI_OVERRIDE) + return false; + + return + (cid == s->commandId) ? 
true : false; +} + + +/* -------------------------------- + * ClearCommandIdCounterOverflowFlag + * -------------------------------- + */ +void +ClearCommandIdCounterOverflowFlag() +{ + CommandIdCounterOverflowFlag = false; +} + + +/* -------------------------------- + * CommandCounterIncrement + * -------------------------------- + */ +void +CommandCounterIncrement() +{ + CurrentTransactionStateData.commandId += 1; + if (CurrentTransactionStateData.commandId == FirstCommandId) { + CommandIdCounterOverflowFlag = true; + elog(WARN, "You may only have 65535 commands per transaction"); + } + + /* make cache changes visible to me */ + AtCommit_Cache(); + AtStart_Cache(); +} + +/* ---------------------------------------------------------------- + * initialization stuff + * ---------------------------------------------------------------- + */ +void +InitializeTransactionSystem() +{ + InitializeTransactionLog(); +} + +/* ---------------------------------------------------------------- + * StartTransaction stuff + * ---------------------------------------------------------------- + */ + +/* -------------------------------- + * AtStart_Cache + * -------------------------------- + */ +void +AtStart_Cache() +{ + DiscardInvalid(); +} + +/* -------------------------------- + * AtStart_Locks + * -------------------------------- + */ +void +AtStart_Locks() +{ + /* + * at present, it is unknown to me what belongs here -cim 3/18/90 + * + * There isn't anything to do at the start of a xact for locks. + * -mer 5/24/92 + */ +} + +/* -------------------------------- + * AtStart_Memory + * -------------------------------- + */ +void +AtStart_Memory() +{ + Portal portal; + MemoryContext portalContext; + + /* ---------------- + * get the blank portal and its memory context + * ---------------- + */ + portal = GetPortalByName(NULL); + portalContext = (MemoryContext) PortalGetHeapMemory(portal); + + /* ---------------- + * tell system to allocate in the blank portal context + * ---------------- + */ + (void) MemoryContextSwitchTo(portalContext); + StartPortalAllocMode(DefaultAllocMode, 0); +} + + +/* ---------------------------------------------------------------- + * CommitTransaction stuff + * ---------------------------------------------------------------- + */ + +/* -------------------------------- + * RecordTransactionCommit + * + * Note: the two calls to BufferManagerFlush() exist to ensure + * that data pages are written before log pages. These + * explicit calls should be replaced by a more efficient + * ordered page write scheme in the buffer manager + * -cim 3/18/90 + * -------------------------------- + */ +void +RecordTransactionCommit() +{ + TransactionId xid; + int leak; + + /* ---------------- + * get the current transaction id + * ---------------- + */ + xid = GetCurrentTransactionId(); + + /* ---------------- + * flush the buffer manager pages. Note: if we have stable + * main memory, dirty shared buffers are not flushed + * plai 8/7/90 + * ---------------- + */ + leak = BufferPoolCheckLeak(); + FlushBufferPool(!TransactionFlushEnabled()); + if (leak) ResetBufferPool(); + + /* ---------------- + * have the transaction access methods record the status + * of this transaction id in the pg_log / pg_time relations. + * ---------------- + */ + TransactionIdCommit(xid); + + /* ---------------- + * Now write the log/time info to the disk too. 
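+     *	The two flushes thus bracket TransactionIdCommit(), so data
+     *	pages hit the disk before the pg_log/pg_time pages that
+     *	mark the transaction committed.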
+ * ---------------- + */ + leak = BufferPoolCheckLeak(); + FlushBufferPool(!TransactionFlushEnabled()); + if (leak) ResetBufferPool(); +} + + +/* -------------------------------- + * AtCommit_Cache + * -------------------------------- + */ +void +AtCommit_Cache() +{ + /* ---------------- + * Make catalog changes visible to me for the next command. + * Other backends will not process my invalidation messages until + * after I commit and free my locks--though they will do + * unnecessary work if I abort. + * ---------------- + */ + RegisterInvalid(true); +} + +/* -------------------------------- + * AtCommit_Locks + * -------------------------------- + */ +void +AtCommit_Locks() +{ + /* ---------------- + * XXX What if ProcReleaseLocks fails? (race condition?) + * + * Then you're up a creek! -mer 5/24/92 + * ---------------- + */ + ProcReleaseLocks(); +} + +/* -------------------------------- + * AtCommit_Memory + * -------------------------------- + */ +void +AtCommit_Memory() +{ + /* ---------------- + * now that we're "out" of a transaction, have the + * system allocate things in the top memory context instead + * of the blank portal memory context. + * ---------------- + */ + EndPortalAllocMode(); + (void) MemoryContextSwitchTo(TopMemoryContext); +} + +/* ---------------------------------------------------------------- + * AbortTransaction stuff + * ---------------------------------------------------------------- + */ + +/* -------------------------------- + * RecordTransactionAbort + * -------------------------------- + */ +void +RecordTransactionAbort() +{ + TransactionId xid; + + /* ---------------- + * get the current transaction id + * ---------------- + */ + xid = GetCurrentTransactionId(); + + /* ---------------- + * have the transaction access methods record the status + * of this transaction id in the pg_log / pg_time relations. + * ---------------- + */ + TransactionIdAbort(xid); + + /* ---------------- + * flush the buffer manager pages. Note: if we have stable + * main memory, dirty shared buffers are not flushed + * plai 8/7/90 + * ---------------- + */ + ResetBufferPool(); +} + +/* -------------------------------- + * AtAbort_Cache + * -------------------------------- + */ +void +AtAbort_Cache() +{ + RegisterInvalid(false); +} + +/* -------------------------------- + * AtAbort_Locks + * -------------------------------- + */ +void +AtAbort_Locks() +{ + /* ---------------- + * XXX What if ProcReleaseLocks() fails? (race condition?) + * + * Then you're up a creek without a paddle! -mer + * ---------------- + */ + ProcReleaseLocks(); +} + + +/* -------------------------------- + * AtAbort_Memory + * -------------------------------- + */ +void +AtAbort_Memory() +{ + /* ---------------- + * after doing an abort transaction, make certain the + * system uses the top memory context rather then the + * portal memory context (until the next transaction). + * ---------------- + */ + (void) MemoryContextSwitchTo(TopMemoryContext); +} + +/* ---------------------------------------------------------------- + * interface routines + * ---------------------------------------------------------------- + */ + +/* -------------------------------- + * StartTransaction + * + * -------------------------------- + */ +void +StartTransaction() +{ + TransactionState s = CurrentTransactionState; + + /* ---------------- + * Check the current transaction state. If the transaction system + * is switched off, or if we're already in a transaction, do nothing. 
+ * We're already in a transaction when the monitor sends a null + * command to the backend to flush the comm channel. This is a + * hacky fix to a communications problem, and we keep having to + * deal with it here. We should fix the comm channel code. mao 080891 + * ---------------- + */ + if (s->state == TRANS_DISABLED || s->state == TRANS_INPROGRESS) + return; + + /* ---------------- + * set the current transaction state information + * appropriately during start processing + * ---------------- + */ + s->state = TRANS_START; + + /* ---------------- + * generate a new transaction id + * ---------------- + */ + GetNewTransactionId(&(s->transactionIdData)); + + /* ---------------- + * initialize current transaction state fields + * ---------------- + */ + s->commandId = FirstCommandId; + s->startTime = GetCurrentAbsoluteTime(); + + /* ---------------- + * initialize the various transaction subsystems + * ---------------- + */ + AtStart_Cache(); + AtStart_Locks(); + AtStart_Memory(); + + /* -------------- + initialize temporary relations list + the tempRelList is a list of temporary relations that + are created in the course of the transactions + they need to be destroyed properly at the end of the transactions + */ + InitTempRelList(); + + /* ---------------- + * done with start processing, set current transaction + * state to "in progress" + * ---------------- + */ + s->state = TRANS_INPROGRESS; +} + +/* --------------- + * Tell me if we are currently in progress + * --------------- + */ +bool +CurrentXactInProgress() +{ + return (CurrentTransactionState->state == TRANS_INPROGRESS); +} + +/* -------------------------------- + * CommitTransaction + * + * -------------------------------- + */ +void +CommitTransaction() +{ + TransactionState s = CurrentTransactionState; + + /* ---------------- + * check the current transaction state + * ---------------- + */ + if (s->state == TRANS_DISABLED) + return; + + if (s->state != TRANS_INPROGRESS) + elog(NOTICE, "CommitTransaction and not in in-progress state "); + + /* ---------------- + * set the current transaction state information + * appropriately during the abort processing + * ---------------- + */ + s->state = TRANS_COMMIT; + + /* ---------------- + * do commit processing + * ---------------- + */ + DestroyTempRels(); + AtEOXact_portals(); + RecordTransactionCommit(); + RelationPurgeLocalRelation(true); + AtCommit_Cache(); + AtCommit_Locks(); + AtCommit_Memory(); + + /* ---------------- + * done with commit processing, set current transaction + * state back to default + * ---------------- + */ + s->state = TRANS_DEFAULT; + { /* want this after commit */ + if (IsNormalProcessingMode()) + Async_NotifyAtCommit(); + } +} + +/* -------------------------------- + * AbortTransaction + * + * -------------------------------- + */ +void +AbortTransaction() +{ + TransactionState s = CurrentTransactionState; + + /* ---------------- + * check the current transaction state + * ---------------- + */ + if (s->state == TRANS_DISABLED) + return; + + if (s->state != TRANS_INPROGRESS) + elog(NOTICE, "AbortTransaction and not in in-progress state "); + + /* ---------------- + * set the current transaction state information + * appropriately during the abort processing + * ---------------- + */ + s->state = TRANS_ABORT; + + /* ---------------- + * do abort processing + * ---------------- + */ + AtEOXact_portals(); + RecordTransactionAbort(); + RelationPurgeLocalRelation(false); + DestroyTempRels(); + AtAbort_Cache(); + AtAbort_Locks(); + AtAbort_Memory(); + + /* 
---------------- + * done with abort processing, set current transaction + * state back to default + * ---------------- + */ + s->state = TRANS_DEFAULT; + { + /* We need to do this in case another process notified us while + we are in the middle of an aborted transaction. We need to + notify our frontend after we finish the current transaction. + -- jw, 1/3/94 + */ + if (IsNormalProcessingMode()) + Async_NotifyAtAbort(); + } +} + +/* -------------------------------- + * StartTransactionCommand + * -------------------------------- + */ +void +StartTransactionCommand() +{ + TransactionState s = CurrentTransactionState; + + switch(s->blockState) { + /* ---------------- + * if we aren't in a transaction block, we + * just do our usual start transaction. + * ---------------- + */ + case TBLOCK_DEFAULT: + StartTransaction(); + break; + + /* ---------------- + * We should never experience this -- if we do it + * means the BEGIN state was not changed in the previous + * CommitTransactionCommand(). If we get it, we print + * a warning and change to the in-progress state. + * ---------------- + */ + case TBLOCK_BEGIN: + elog(NOTICE, "StartTransactionCommand: unexpected TBLOCK_BEGIN"); + s->blockState = TBLOCK_INPROGRESS; + break; + + /* ---------------- + * This is the case when are somewhere in a transaction + * block and about to start a new command. For now we + * do nothing but someday we may do command-local resource + * initialization. + * ---------------- + */ + case TBLOCK_INPROGRESS: + break; + + /* ---------------- + * As with BEGIN, we should never experience this -- + * if we do it means the END state was not changed in the + * previous CommitTransactionCommand(). If we get it, we + * print a warning, commit the transaction, start a new + * transaction and change to the default state. + * ---------------- + */ + case TBLOCK_END: + elog(NOTICE, "StartTransactionCommand: unexpected TBLOCK_END"); + s->blockState = TBLOCK_DEFAULT; + CommitTransaction(); + StartTransaction(); + break; + + /* ---------------- + * Here we are in the middle of a transaction block but + * one of the commands caused an abort so we do nothing + * but remain in the abort state. Eventually we will get + * to the "END TRANSACTION" which will set things straight. + * ---------------- + */ + case TBLOCK_ABORT: + break; + + /* ---------------- + * This means we somehow aborted and the last call to + * CommitTransactionCommand() didn't clear the state so + * we remain in the ENDABORT state and mabey next time + * we get to CommitTransactionCommand() the state will + * get reset to default. + * ---------------- + */ + case TBLOCK_ENDABORT: + elog(NOTICE, "StartTransactionCommand: unexpected TBLOCK_ENDABORT"); + break; + } +} +/* -------------------------------- + * CommitTransactionCommand + * -------------------------------- + */ +void +CommitTransactionCommand() +{ + TransactionState s = CurrentTransactionState; + + switch(s->blockState) { + /* ---------------- + * if we aren't in a transaction block, we + * just do our usual transaction commit + * ---------------- + */ + case TBLOCK_DEFAULT: + CommitTransaction(); + break; + + /* ---------------- + * This is the case right after we get a "BEGIN TRANSACTION" + * command, but the user hasn't done anything else yet, so + * we change to the "transaction block in progress" state + * and return. 
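+	 *
+	 *  For reference, the transitions made below are:
+	 *
+	 *	TBLOCK_DEFAULT	   -> commit transaction
+	 *	TBLOCK_BEGIN	   -> TBLOCK_INPROGRESS
+	 *	TBLOCK_INPROGRESS  -> CommandCounterIncrement()
+	 *	TBLOCK_END	   -> TBLOCK_DEFAULT, commit
+	 *	TBLOCK_ABORT	   -> no change
+	 *	TBLOCK_ENDABORT	   -> TBLOCK_DEFAULT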
+ * ---------------- + */ + case TBLOCK_BEGIN: + s->blockState = TBLOCK_INPROGRESS; + break; + + /* ---------------- + * This is the case when we have finished executing a command + * someplace within a transaction block. We increment the + * command counter and return. Someday we may free resources + * local to the command. + * ---------------- + */ + case TBLOCK_INPROGRESS: + CommandCounterIncrement(); + break; + + /* ---------------- + * This is the case when we just got the "END TRANSACTION" + * statement, so we go back to the default state and + * commit the transaction. + * ---------------- + */ + case TBLOCK_END: + s->blockState = TBLOCK_DEFAULT; + CommitTransaction(); + break; + + /* ---------------- + * Here we are in the middle of a transaction block but + * one of the commands caused an abort so we do nothing + * but remain in the abort state. Eventually we will get + * to the "END TRANSACTION" which will set things straight. + * ---------------- + */ + case TBLOCK_ABORT: + break; + + /* ---------------- + * Here we were in an aborted transaction block which + * just processed the "END TRANSACTION" command from the + * user, so now we return the to default state. + * ---------------- + */ + case TBLOCK_ENDABORT: + s->blockState = TBLOCK_DEFAULT; + break; + } +} + +/* -------------------------------- + * AbortCurrentTransaction + * -------------------------------- + */ +void +AbortCurrentTransaction() +{ + TransactionState s = CurrentTransactionState; + + switch(s->blockState) { + /* ---------------- + * if we aren't in a transaction block, we + * just do our usual abort transaction. + * ---------------- + */ + case TBLOCK_DEFAULT: + AbortTransaction(); + break; + + /* ---------------- + * If we are in the TBLOCK_BEGIN it means something + * screwed up right after reading "BEGIN TRANSACTION" + * so we enter the abort state. Eventually an "END + * TRANSACTION" will fix things. + * ---------------- + */ + case TBLOCK_BEGIN: + s->blockState = TBLOCK_ABORT; + AbortTransaction(); + break; + + /* ---------------- + * This is the case when are somewhere in a transaction + * block which aborted so we abort the transaction and + * set the ABORT state. Eventually an "END TRANSACTION" + * will fix things and restore us to a normal state. + * ---------------- + */ + case TBLOCK_INPROGRESS: + s->blockState = TBLOCK_ABORT; + AbortTransaction(); + break; + + /* ---------------- + * Here, the system was fouled up just after the + * user wanted to end the transaction block so we + * abort the transaction and put us back into the + * default state. + * ---------------- + */ + case TBLOCK_END: + s->blockState = TBLOCK_DEFAULT; + AbortTransaction(); + break; + + /* ---------------- + * Here, we are already in an aborted transaction + * state and are waiting for an "END TRANSACTION" to + * come along and lo and behold, we abort again! + * So we just remain in the abort state. + * ---------------- + */ + case TBLOCK_ABORT: + break; + + /* ---------------- + * Here we were in an aborted transaction block which + * just processed the "END TRANSACTION" command but somehow + * aborted again.. since we must have done the abort + * processing, we return to the default state. 
+ * ---------------- + */ + case TBLOCK_ENDABORT: + s->blockState = TBLOCK_DEFAULT; + break; + } +} + +/* ---------------------------------------------------------------- + * transaction block support + * ---------------------------------------------------------------- + */ +/* -------------------------------- + * BeginTransactionBlock + * -------------------------------- + */ +void +BeginTransactionBlock() +{ + TransactionState s = CurrentTransactionState; + + /* ---------------- + * check the current transaction state + * ---------------- + */ + if (s->state == TRANS_DISABLED) + return; + + if (s->blockState != TBLOCK_DEFAULT) + elog(NOTICE, "BeginTransactionBlock and not in default state"); + + /* ---------------- + * set the current transaction block state information + * appropriately during begin processing + * ---------------- + */ + s->blockState = TBLOCK_BEGIN; + + /* ---------------- + * do begin processing + * ---------------- + */ + + /* ---------------- + * done with begin processing, set block state to inprogress + * ---------------- + */ + s->blockState = TBLOCK_INPROGRESS; +} + +/* -------------------------------- + * EndTransactionBlock + * -------------------------------- + */ +void +EndTransactionBlock() +{ + TransactionState s = CurrentTransactionState; + + /* ---------------- + * check the current transaction state + * ---------------- + */ + if (s->state == TRANS_DISABLED) + return; + + if (s->blockState == TBLOCK_INPROGRESS) { + /* ---------------- + * here we are in a transaction block which should commit + * when we get to the upcoming CommitTransactionCommand() + * so we set the state to "END". CommitTransactionCommand() + * will recognize this and commit the transaction and return + * us to the default state + * ---------------- + */ + s->blockState = TBLOCK_END; + return; + } + + if (s->blockState == TBLOCK_ABORT) { + /* ---------------- + * here, we are in a transaction block which aborted + * and since the AbortTransaction() was already done, + * we do whatever is needed and change to the special + * "END ABORT" state. The upcoming CommitTransactionCommand() + * will recognize this and then put us back in the default + * state. + * ---------------- + */ + s->blockState = TBLOCK_ENDABORT; + return; + } + + /* ---------------- + * We should not get here, but if we do, we go to the ENDABORT + * state after printing a warning. The upcoming call to + * CommitTransactionCommand() will then put us back into the + * default state. + * ---------------- + */ + elog(NOTICE, "EndTransactionBlock and not inprogress/abort state"); + s->blockState = TBLOCK_ENDABORT; +} + +/* -------------------------------- + * AbortTransactionBlock + * -------------------------------- + */ +void +AbortTransactionBlock() +{ + TransactionState s = CurrentTransactionState; + + /* ---------------- + * check the current transaction state + * ---------------- + */ + if (s->state == TRANS_DISABLED) + return; + + if (s->blockState == TBLOCK_INPROGRESS) { + /* ---------------- + * here we were inside a transaction block and something + * screwed up inside the system so we enter the abort state, + * do the abort processing and then return. + * We remain in the abort state until we see the upcoming + * END TRANSACTION command.
+ * ---------------- + */ + s->blockState = TBLOCK_ABORT; + + /* ---------------- + * do abort processing and return + * ---------------- + */ + AbortTransaction(); + return; + } + + /* ---------------- + * this case should not be possible, because it would mean + * the user entered an "abort" from outside a transaction block. + * So we print an error message, abort the transaction and + * enter the "ENDABORT" state so we will end up in the default + * state after the upcoming CommitTransactionCommand(). + * ---------------- + */ + elog(NOTICE, "AbortTransactionBlock and not inprogress state"); + AbortTransaction(); + s->blockState = TBLOCK_ENDABORT; +} + +/* -------------------------------- + * UserAbortTransactionBlock + * -------------------------------- + */ +void +UserAbortTransactionBlock() +{ + TransactionState s = CurrentTransactionState; + + /* ---------------- + * check the current transaction state + * ---------------- + */ + if (s->state == TRANS_DISABLED) + return; + + if (s->blockState == TBLOCK_INPROGRESS) { + /* ---------------- + * here we were inside a transaction block and we + * got an abort command from the user, so we move to + * the abort state, do the abort processing and + * then change to the ENDABORT state so we will end up + * in the default state after the upcoming + * CommitTransactionCommand(). + * ---------------- + */ + s->blockState = TBLOCK_ABORT; + + /* ---------------- + * do abort processing + * ---------------- + */ + AbortTransaction(); + + /* ---------------- + * change to the end abort state and return + * ---------------- + */ + s->blockState = TBLOCK_ENDABORT; + return; + } + + /* ---------------- + * this case should not be possible, because it would mean + * the user entered an "abort" from outside a transaction block. + * So we print an error message, abort the transaction and + * enter the "ENDABORT" state so we will end up in the default + * state after the upcoming CommitTransactionCommand(). + * ---------------- + */ + elog(NOTICE, "UserAbortTransactionBlock and not inprogress state"); + AbortTransaction(); + s->blockState = TBLOCK_ENDABORT; +} + +bool +IsTransactionBlock() +{ + TransactionState s = CurrentTransactionState; + + if (s->blockState == TBLOCK_INPROGRESS + || s->blockState == TBLOCK_ENDABORT) { + return (true); + } + + return (false); +} diff --git a/src/backend/access/transam/xid.c b/src/backend/access/transam/xid.c new file mode 100644 index 00000000000..faeeb623d58 --- /dev/null +++ b/src/backend/access/transam/xid.c @@ -0,0 +1,156 @@ +/*------------------------------------------------------------------------- + * + * xid.c-- + * POSTGRES transaction identifier code. + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/transam/Attic/xid.c,v 1.1.1.1 1996/07/09 06:21:14 scrappy Exp $ + * + * OLD COMMENTS + * XXX WARNING + * Much of this file will change when we change our representation + * of transaction ids -cim 3/23/90 + * + * It is time to make the switch from 5 byte to 4 byte transaction ids + * This file was totally reworked. 
-mer 5/22/92 + * + *------------------------------------------------------------------------- + */ +#include <stdio.h> +#include <stdlib.h> /* for atol() */ +#include "postgres.h" +#include "utils/palloc.h" +#include "utils/elog.h" +#include "utils/memutils.h" +#include "utils/nabstime.h" + +extern TransactionId NullTransactionId; +extern TransactionId DisabledTransactionId; +extern TransactionId AmiTransactionId; +extern TransactionId FirstTransactionId; + +/* ---------------------------------------------------------------- + * TransactionIdIsValid + * + * Macro-ize me. + * ---------------------------------------------------------------- + */ +bool +TransactionIdIsValid(TransactionId transactionId) +{ + return ((bool) (transactionId != NullTransactionId) ); +} + +/* XXX char16 name for catalogs */ +TransactionId +xidin(char *representation) +{ + return (atol(representation)); +} + +/* XXX char16 name for catalogs */ +char* +xidout(TransactionId transactionId) +{ +/* return(TransactionIdFormString(transactionId)); */ + char *representation; + + /* maximum 32 bit unsigned integer representation takes 10 chars */ + representation = palloc(11); + + (void)sprintf(representation, "%u", transactionId); + + return (representation); +} + +/* ---------------------------------------------------------------- + * StoreInvalidTransactionId + * + * Maybe do away with Pointer types in these routines. + * Macro-ize this one. + * ---------------------------------------------------------------- + */ +void +StoreInvalidTransactionId(TransactionId *destination) +{ + *destination = NullTransactionId; +} + +/* ---------------------------------------------------------------- + * TransactionIdStore + * + * Macro-ize this one. + * ---------------------------------------------------------------- + */ +void +TransactionIdStore(TransactionId transactionId, + TransactionId *destination) +{ + *destination = transactionId; +} + +/* ---------------------------------------------------------------- + * TransactionIdEquals + * ---------------------------------------------------------------- + */ +bool +TransactionIdEquals(TransactionId id1, TransactionId id2) +{ + return ((bool) (id1 == id2)); +} + +/* ---------------------------------------------------------------- + * TransactionIdIsLessThan + * ---------------------------------------------------------------- + */ +bool +TransactionIdIsLessThan(TransactionId id1, TransactionId id2) +{ + return ((bool)(id1 < id2)); +} + +/* ---------------------------------------------------------------- + * xideq + * ---------------------------------------------------------------- + */ + +/* + * xideq - returns true iff xid1 == xid2, false otherwise + */ +bool +xideq(TransactionId xid1, TransactionId xid2) +{ + return( (bool) (xid1 == xid2) ); +} + +/* ---------------------------------------------------------------- + * TransactionIdIncrement + * ---------------------------------------------------------------- + */ +void +TransactionIdIncrement(TransactionId *transactionId) +{ + (*transactionId)++; + if (*transactionId == DisabledTransactionId) + elog(FATAL, "TransactionIdIncrement: exhausted XIDs"); + return; +} + +/* ---------------------------------------------------------------- + * TransactionIdAdd + * ---------------------------------------------------------------- + */ +void +TransactionIdAdd(TransactionId *xid, int value) +{ + *xid += value; + return; +} + diff --git a/src/backend/access/tupdesc.h b/src/backend/access/tupdesc.h new file mode 100644 index 00000000000..a26bbc704da --- /dev/null +++ 
b/src/backend/access/tupdesc.h @@ -0,0 +1,53 @@ +/*------------------------------------------------------------------------- + * + * tupdesc.h-- + * POSTGRES tuple descriptor definitions. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: tupdesc.h,v 1.1.1.1 1996/07/09 06:21:09 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef TUPDESC_H +#define TUPDESC_H + +#include "postgres.h" +#include "access/attnum.h" +#include "nodes/pg_list.h" /* for List */ +#include "catalog/pg_attribute.h" + +/* + * the old representation: a TupleDesc was simply an array of + * AttributeTupleForms, each of which is a pointer to an attribute + * tuple form (a pg_attribute entry) + */ +/* typedef AttributeTupleForm *TupleDesc; */ + +/* + * a TupleDesc is a pointer to a structure which includes an array of + * AttributeTupleForms, i.e. pg_attribute information, and the size of + * the array, i.e. the number of attributes. In short, a TupleDesc + * completely captures the attribute information for a tuple. + */ + +typedef struct tupleDesc { + int natts; + AttributeTupleForm *attrs; +} *TupleDesc; + +extern TupleDesc CreateTemplateTupleDesc(int natts); + +extern TupleDesc CreateTupleDesc(int natts, AttributeTupleForm *attrs); + +extern TupleDesc CreateTupleDescCopy(TupleDesc tupdesc); + +extern bool TupleDescInitEntry(TupleDesc desc, + AttrNumber attributeNumber, + char *attributeName, + char *typeName, + int attdim, + bool attisset); + +extern TupleDesc BuildDescForRelation(List *schema, char *relname); + +#endif /* TUPDESC_H */ diff --git a/src/backend/access/tupmacs.h b/src/backend/access/tupmacs.h new file mode 100644 index 00000000000..9a9bcce3b41 --- /dev/null +++ b/src/backend/access/tupmacs.h @@ -0,0 +1,43 @@ +/*------------------------------------------------------------------------- + * + * tupmacs.h-- + * Tuple macros used by both index tuples and heap tuples. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: tupmacs.h,v 1.1.1.1 1996/07/09 06:21:09 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef TUPMACS_H +#define TUPMACS_H + +/* + * check to see if the ATT'th bit of an array of 8-bit bytes is set. + */ +#define att_isnull(ATT, BITS) (!((BITS)[(ATT) >> 3] & (1 << ((ATT) & 0x07)))) + +/* + * given an AttributeTupleForm and a pointer into a tuple's data + * area, return the correct value or pointer. + * + * note that T must already be properly LONGALIGN/SHORTALIGN'd for + * this to work correctly. + * + * the double-cast is to stop gcc from (correctly) complaining about + * casting integer types with size < sizeof(char *) to (char *). + * sign-extension may get weird if you use an integer type that + * isn't the same size as (char *) for the first cast. (on the other + * hand, it's safe to use another type for the (foo *)(T).) + */ +#define fetchatt(A, T) \ + ((*(A))->attbyval \ + ? ((*(A))->attlen > sizeof(int16) \ + ? (char *) (long) *((int32 *)(T)) \ + : ((*(A))->attlen < sizeof(int16) \ + ? (char *) (long) *((char *)(T)) \ + : (char *) (long) *((int16 *)(T)))) \ + : (char *) (T)) + +#endif /* TUPMACS_H */ diff --git a/src/backend/access/valid.h b/src/backend/access/valid.h new file mode 100644 index 00000000000..1c5cf8cdeb3 --- /dev/null +++ b/src/backend/access/valid.h @@ -0,0 +1,37 @@ +/*------------------------------------------------------------------------- + * + * valid.h-- + * POSTGRES tuple qualification validity definitions.
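+ * + * (Editor's note, not in the original: an illustrative use of the + * fetchatt() macro from tupmacs.h above; the variable names are + * made up for the example -- + * + *	AttributeTupleForm *att = tupdesc->attrs; + *	char *value = fetchatt(&att[i], dataptr); + * + * for a pass-by-value attribute the datum itself comes back stuffed + * into the (char *); for anything else fetchatt() yields a pointer + * into the tuple's data area.)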
+ * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: valid.h,v 1.1.1.1 1996/07/09 06:21:09 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef VALID_H +#define VALID_H + +#include "c.h" +#include "access/skey.h" +#include "storage/buf.h" +#include "utils/tqual.h" +#include "access/tupdesc.h" +#include "utils/rel.h" +#include "storage/bufpage.h" + +/* ---------------- + * extern decl's + * ---------------- + */ + +extern bool heap_keytest(HeapTuple t, TupleDesc tupdesc, + int nkeys, ScanKey keys); + +extern HeapTuple heap_tuple_satisfies(ItemId itemId, Relation relation, + PageHeader disk_page, TimeQual qual, int nKeys, ScanKey key); + +extern bool TupleUpdatedByCurXactAndCmd(HeapTuple t); + +#endif /* VALID_H */ diff --git a/src/backend/access/xact.h b/src/backend/access/xact.h new file mode 100644 index 00000000000..15f376ec5ed --- /dev/null +++ b/src/backend/access/xact.h @@ -0,0 +1,115 @@ +/*------------------------------------------------------------------------- + * + * xact.h-- + * postgres transaction system header + * + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: xact.h,v 1.1.1.1 1996/07/09 06:21:09 scrappy Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef XACT_H +#define XACT_H + +#include <signal.h> + +#include "storage/ipc.h" +#include "miscadmin.h" +#include "utils/portal.h" +#include "utils/elog.h" +#include "utils/mcxt.h" +#include "utils/nabstime.h" + +/* ---------------- + * transaction state structure + * ---------------- + */ +typedef struct TransactionStateData { + TransactionId transactionIdData; + CommandId commandId; + AbsoluteTime startTime; + int state; + int blockState; +} TransactionStateData; + +/* ---------------- + * transaction states + * ---------------- + */ +#define TRANS_DEFAULT 0 +#define TRANS_START 1 +#define TRANS_INPROGRESS 2 +#define TRANS_COMMIT 3 +#define TRANS_ABORT 4 +#define TRANS_DISABLED 5 + +/* ---------------- + * transaction block states + * ---------------- + */ +#define TBLOCK_DEFAULT 0 +#define TBLOCK_BEGIN 1 +#define TBLOCK_INPROGRESS 2 +#define TBLOCK_END 3 +#define TBLOCK_ABORT 4 +#define TBLOCK_ENDABORT 5 + +typedef TransactionStateData *TransactionState; + +/* ---------------- + * extern definitions + * ---------------- + */ +extern int TransactionFlushEnabled(); +extern void SetTransactionFlushEnabled(bool state); + +extern bool IsTransactionState(void); +extern bool IsAbortedTransactionBlockState(void); +extern void OverrideTransactionSystem(bool flag); +extern TransactionId GetCurrentTransactionId(void); +extern CommandId GetCurrentCommandId(void); +extern AbsoluteTime GetCurrentTransactionStartTime(void); +extern bool TransactionIdIsCurrentTransactionId(TransactionId xid); +extern bool CommandIdIsCurrentCommandId(CommandId cid); +extern void ClearCommandIdCounterOverflowFlag(void); +extern void CommandCounterIncrement(void); +extern void InitializeTransactionSystem(void); +extern void AtStart_Cache(void); +extern void AtStart_Locks(void); +extern void AtStart_Memory(void); +extern void RecordTransactionCommit(void); +extern void AtCommit_Cache(void); +extern void AtCommit_Locks(void); +extern void AtCommit_Memory(void); +extern void RecordTransactionAbort(void); +extern void AtAbort_Cache(void); +extern void AtAbort_Locks(void); +extern void AtAbort_Memory(void); +extern void StartTransaction(void); +extern bool CurrentXactInProgress(void); +extern void 
CommitTransaction(void); +extern void AbortTransaction(void); +extern void StartTransactionCommand(void); +extern void CommitTransactionCommand(void); +extern void AbortCurrentTransaction(void); +extern void BeginTransactionBlock(void); +extern void EndTransactionBlock(void); +extern void AbortTransactionBlock(void); +extern bool IsTransactionBlock(void); +extern void UserAbortTransactionBlock(void); + +extern TransactionId DisabledTransactionId; + +/* defined in xid.c */ +extern bool TransactionIdIsValid(TransactionId transactionId); +extern void StoreInvalidTransactionId(TransactionId *destination); +extern void TransactionIdStore(TransactionId transactionId, + TransactionId *destination); +extern bool TransactionIdEquals(TransactionId id1, TransactionId id2); +extern bool TransactionIdIsLessThan(TransactionId id1, TransactionId id2); +extern void TransactionIdIncrement(TransactionId *transactionId); +extern void TransactionIdAdd(TransactionId *xid, int value); + +#endif /* XACT_H */
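(Editor's addition, not part of the diff: a minimal sketch of how a backend command loop might drive the xact.h entry points declared above. The execute_one_command() dispatch hook and its boolean error signalling are invented for illustration; in the real backend, elog(ERROR)/elog(FATAL) unwind control back to the loop rather than returning a status.)

	#include "postgres.h"
	#include "access/xact.h"

	extern bool execute_one_command(char *query);	/* fictitious dispatch hook */

	/*
	 * run one frontend command bracketed by the per-command
	 * transaction entry points
	 */
	void
	process_command(char *query)
	{
		/* starts a fresh transaction, or is a no-op inside BEGIN...END */
		StartTransactionCommand();

		if (execute_one_command(query))
			/* commits, or just bumps the command counter in a block */
			CommitTransactionCommand();
		else
			/* aborts; inside a block, also sets TBLOCK_ABORT */
			AbortCurrentTransaction();
	}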