Diffstat (limited to 'src/backend/access/hash')
-rw-r--r--  src/backend/access/hash/Makefile.inc  |  18
-rw-r--r--  src/backend/access/hash/hash.c        | 467
-rw-r--r--  src/backend/access/hash/hashfunc.c    | 276
-rw-r--r--  src/backend/access/hash/hashinsert.c  | 239
-rw-r--r--  src/backend/access/hash/hashovfl.c    | 614
-rw-r--r--  src/backend/access/hash/hashpage.c    | 669
-rw-r--r--  src/backend/access/hash/hashscan.c    | 172
-rw-r--r--  src/backend/access/hash/hashsearch.c  | 425
-rw-r--r--  src/backend/access/hash/hashstrat.c   | 104
-rw-r--r--  src/backend/access/hash/hashutil.c    | 147
10 files changed, 3131 insertions(+), 0 deletions(-)
diff --git a/src/backend/access/hash/Makefile.inc b/src/backend/access/hash/Makefile.inc
new file mode 100644
index 00000000000..8ea221bc264
--- /dev/null
+++ b/src/backend/access/hash/Makefile.inc
@@ -0,0 +1,18 @@
+#-------------------------------------------------------------------------
+#
+# Makefile.inc--
+# Makefile for access/hash (hash access method)
+#
+# Copyright (c) 1994, Regents of the University of California
+#
+#
+# IDENTIFICATION
+# $Header: /cvsroot/pgsql/src/backend/access/hash/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+#
+#-------------------------------------------------------------------------
+
+SUBSRCS+= hash.c hashfunc.c hashinsert.c hashovfl.c hashpage.c hashscan.c \
+ hashsearch.c hashstrat.c hashutil.c
+
+
+
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
new file mode 100644
index 00000000000..a4a4e16e599
--- /dev/null
+++ b/src/backend/access/hash/hash.c
@@ -0,0 +1,467 @@
+/*-------------------------------------------------------------------------
+ *
+ * hash.c--
+ * Implementation of Margo Seltzer's Hashing package for postgres.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ * NOTES
+ * This file contains only the public interface routines.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/sdir.h"
+#include "access/hash.h"
+#include "access/funcindex.h"
+#include "nodes/execnodes.h"
+#include "nodes/plannodes.h"
+#include "executor/executor.h"
+#include "executor/tuptable.h"
+#include "catalog/index.h"
+
+
+bool BuildingHash = false;
+
+/*
+ * hashbuild() -- build a new hash index.
+ *
+ * We use a global variable to record the fact that we're creating
+ * a new index. This is used to avoid high-concurrency locking,
+ * since the index won't be visible until this transaction commits
+ * and since building is guaranteed to be single-threaded.
+ */
+void
+hashbuild(Relation heap,
+ Relation index,
+ int natts,
+ AttrNumber *attnum,
+ IndexStrategy istrat,
+ uint16 pcount,
+ Datum *params,
+ FuncIndexInfo *finfo,
+ PredInfo *predInfo)
+{
+ HeapScanDesc hscan;
+ Buffer buffer;
+ HeapTuple htup;
+ IndexTuple itup;
+ TupleDesc htupdesc, itupdesc;
+ Datum *attdata;
+ bool *nulls;
+ InsertIndexResult res;
+ int nhtups, nitups;
+ int i;
+ HashItem hitem;
+ ExprContext *econtext;
+ TupleTable tupleTable;
+ TupleTableSlot *slot;
+ Oid hrelid, irelid;
+ Node *pred, *oldPred;
+
+ /* note that this is a new hash index */
+ BuildingHash = true;
+
+ pred = predInfo->pred;
+ oldPred = predInfo->oldPred;
+
+ /* initialize the hash index metadata page (if this is a new index) */
+ if (oldPred == NULL)
+ _hash_metapinit(index);
+
+ /* get tuple descriptors for heap and index relations */
+ htupdesc = RelationGetTupleDescriptor(heap);
+ itupdesc = RelationGetTupleDescriptor(index);
+
+ /* get space for data items that'll appear in the index tuple */
+ attdata = (Datum *) palloc(natts * sizeof(Datum));
+ nulls = (bool *) palloc(natts * sizeof(bool));
+
+ /*
+ * If this is a predicate (partial) index, we will need to evaluate the
+ * predicate using ExecQual, which requires the current tuple to be in a
+ * slot of a TupleTable. In addition, ExecQual must have an ExprContext
+ * referring to that slot. Here, we initialize dummy TupleTable and
+ * ExprContext objects for this purpose. --Nels, Feb '92
+ */
+#ifndef OMIT_PARTIAL_INDEX
+ if (pred != NULL || oldPred != NULL) {
+ tupleTable = ExecCreateTupleTable(1);
+ slot = ExecAllocTableSlot(tupleTable);
+ econtext = makeNode(ExprContext);
+ FillDummyExprContext(econtext, slot, htupdesc, buffer);
+ }
+#endif /* OMIT_PARTIAL_INDEX */
+
+ /* start a heap scan */
+ hscan = heap_beginscan(heap, 0, NowTimeQual, 0, (ScanKey) NULL);
+ htup = heap_getnext(hscan, 0, &buffer);
+
+ /* build the index */
+ nhtups = nitups = 0;
+
+ for (; HeapTupleIsValid(htup); htup = heap_getnext(hscan, 0, &buffer)) {
+
+ nhtups++;
+
+ /*
+ * If oldPred != NULL, this is an EXTEND INDEX command, so skip
+ * this tuple if it was already in the existing partial index
+ */
+ if (oldPred != NULL) {
+ /*SetSlotContents(slot, htup); */
+#ifndef OMIT_PARTIAL_INDEX
+ slot->val = htup;
+ if (ExecQual((List*)oldPred, econtext) == true) {
+ nitups++;
+ continue;
+ }
+#endif /* OMIT_PARTIAL_INDEX */
+ }
+
+ /* Skip this tuple if it doesn't satisfy the partial-index predicate */
+ if (pred != NULL) {
+#ifndef OMIT_PARTIAL_INDEX
+ /*SetSlotContents(slot, htup); */
+ slot->val = htup;
+ if (ExecQual((List*)pred, econtext) == false)
+ continue;
+#endif /* OMIT_PARTIAL_INDEX */
+ }
+
+ nitups++;
+
+ /*
+ * For the current heap tuple, extract all the attributes
+ * we use in this index, and note which are null.
+ */
+ for (i = 1; i <= natts; i++) {
+ int attoff;
+ bool attnull;
+
+ /*
+ * Offsets are from the start of the tuple, and are
+ * zero-based; indices are one-based. The next call
+ * returns i - 1. That's data hiding for you.
+ */
+
+ /* attoff = i - 1 */
+ attoff = AttrNumberGetAttrOffset(i);
+
+ /* below, attdata[attoff] is set to the datum for this attribute, and
+ * attnull is set to indicate whether or not the attribute
+ * is null for this tuple
+ */
+ attdata[attoff] = GetIndexValue(htup,
+ htupdesc,
+ attoff,
+ attnum,
+ finfo,
+ &attnull,
+ buffer);
+ nulls[attoff] = (attnull ? 'n' : ' ');
+ }
+
+ /* form an index tuple and point it at the heap tuple */
+ itup = index_formtuple(itupdesc, attdata, nulls);
+
+ /*
+ * If the single index key is null, we don't insert it into
+ * the index. Hash tables support scans on '='.
+ * Relational algebra says that A = B
+ * returns null if either A or B is null. This
+ * means that no qualification used in an index scan could ever
+ * return true on a null attribute. It also means that indices
+ * can't be used by ISNULL or NOTNULL scans, but that's an
+ * artifact of the strategy map architecture chosen in 1986, not
+ * of the way nulls are handled here.
+ */
+
+ if (itup->t_info & INDEX_NULL_MASK) {
+ pfree(itup);
+ continue;
+ }
+
+ itup->t_tid = htup->t_ctid;
+ hitem = _hash_formitem(itup);
+ res = _hash_doinsert(index, hitem);
+ pfree(hitem);
+ pfree(itup);
+ pfree(res);
+ }
+
+ /* okay, all heap tuples are indexed */
+ heap_endscan(hscan);
+
+ if (pred != NULL || oldPred != NULL) {
+#ifndef OMIT_PARTIAL_INDEX
+ ExecDestroyTupleTable(tupleTable, true);
+ pfree(econtext);
+#endif /* OMIT_PARTIAL_INDEX */
+ }
+
+ /*
+ * Since we just counted the tuples in the heap, we update its
+ * stats in pg_class to guarantee that the planner takes advantage
+ * of the index we just created. Finally, only update statistics
+ * during normal index definitions, not for indices on system catalogs
+ * created during bootstrap processing. We must close the relations
+ * before updating statistics to guarantee that the relcache entries
+ * are flushed when we increment the command counter in UpdateStats().
+ */
+ if (IsNormalProcessingMode())
+ {
+ hrelid = heap->rd_id;
+ irelid = index->rd_id;
+ heap_close(heap);
+ index_close(index);
+ UpdateStats(hrelid, nhtups, true);
+ UpdateStats(irelid, nitups, false);
+ if (oldPred != NULL) {
+ if (nitups == nhtups) pred = NULL;
+ UpdateIndexPredicate(irelid, oldPred, pred);
+ }
+ }
+
+ /* be tidy */
+ pfree(nulls);
+ pfree(attdata);
+
+ /* all done */
+ BuildingHash = false;
+}
+
+/*
+ * hashinsert() -- insert an index tuple into a hash table.
+ *
+ * Hash on the index tuple's key, find the appropriate location
+ * for the new tuple, put it there, and return an InsertIndexResult
+ * to the caller.
+ */
+InsertIndexResult
+hashinsert(Relation rel, IndexTuple itup)
+{
+ HashItem hitem;
+ InsertIndexResult res;
+
+ if (itup->t_info & INDEX_NULL_MASK)
+ return ((InsertIndexResult) NULL);
+
+ hitem = _hash_formitem(itup);
+
+ res = _hash_doinsert(rel, hitem);
+
+ pfree(hitem);
+
+ return (res);
+}
+
+
+/*
+ * hashgettuple() -- Get the next tuple in the scan.
+ */
+char *
+hashgettuple(IndexScanDesc scan, ScanDirection dir)
+{
+ RetrieveIndexResult res;
+
+ /*
+ * If we've already initialized this scan, we can just advance it
+ * in the appropriate direction. If we haven't done so yet, we
+ * call a routine to get the first item in the scan.
+ */
+
+ if (ItemPointerIsValid(&(scan->currentItemData)))
+ res = _hash_next(scan, dir);
+ else
+ res = _hash_first(scan, dir);
+
+ return ((char *) res);
+}
+
+
+/*
+ * hashbeginscan() -- start a scan on a hash index
+ */
+char *
+hashbeginscan(Relation rel,
+ bool fromEnd,
+ uint16 keysz,
+ ScanKey scankey)
+{
+ IndexScanDesc scan;
+ HashScanOpaque so;
+
+ scan = RelationGetIndexScan(rel, fromEnd, keysz, scankey);
+ so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
+ so->hashso_curbuf = so->hashso_mrkbuf = InvalidBuffer;
+ scan->opaque = so;
+ scan->flags = 0x0;
+
+ /* register scan in case we change pages it's using */
+ _hash_regscan(scan);
+
+ return ((char *) scan);
+}
+
+/*
+ * hashrescan() -- rescan an index relation
+ */
+void
+hashrescan(IndexScanDesc scan, bool fromEnd, ScanKey scankey)
+{
+ ItemPointer iptr;
+ HashScanOpaque so;
+
+ so = (HashScanOpaque) scan->opaque;
+
+ /* we hold a read lock on the current page in the scan */
+ if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
+ _hash_relbuf(scan->relation, so->hashso_curbuf, HASH_READ);
+ so->hashso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+ if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) {
+ _hash_relbuf(scan->relation, so->hashso_mrkbuf, HASH_READ);
+ so->hashso_mrkbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+
+ /* reset the scan key */
+ if (scan->numberOfKeys > 0) {
+ memmove(scan->keyData,
+ scankey,
+ scan->numberOfKeys * sizeof(ScanKeyData));
+ }
+}
+
+/*
+ * hashendscan() -- close down a scan
+ */
+void
+hashendscan(IndexScanDesc scan)
+{
+
+ ItemPointer iptr;
+ HashScanOpaque so;
+
+ so = (HashScanOpaque) scan->opaque;
+
+ /* release any locks we still hold */
+ if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
+ _hash_relbuf(scan->relation, so->hashso_curbuf, HASH_READ);
+ so->hashso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+
+ if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) {
+ if (BufferIsValid(so->hashso_mrkbuf))
+ _hash_relbuf(scan->relation, so->hashso_mrkbuf, HASH_READ);
+ so->hashso_mrkbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+
+ /* don't need scan registered anymore */
+ _hash_dropscan(scan);
+
+ /* be tidy */
+#ifdef PERFECT_MMGR
+ pfree (scan->opaque);
+#endif /* PERFECT_MMGR */
+}
+
+/*
+ * hashmarkpos() -- save current scan position
+ *
+ */
+void
+hashmarkpos(IndexScanDesc scan)
+{
+ ItemPointer iptr;
+ HashScanOpaque so;
+
+ /* see if we ever call this code. if we do, then so_mrkbuf is a
+ * useful element in the scan->opaque structure. if this procedure
+ * is never called, so_mrkbuf should be removed from the scan->opaque
+ * structure.
+ */
+ elog(NOTICE, "Hashmarkpos() called.");
+
+ so = (HashScanOpaque) scan->opaque;
+
+ /* release lock on old marked data, if any */
+ if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) {
+ _hash_relbuf(scan->relation, so->hashso_mrkbuf, HASH_READ);
+ so->hashso_mrkbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+
+ /* bump lock on currentItemData and copy to currentMarkData */
+ if (ItemPointerIsValid(&(scan->currentItemData))) {
+ so->hashso_mrkbuf = _hash_getbuf(scan->relation,
+ BufferGetBlockNumber(so->hashso_curbuf),
+ HASH_READ);
+ scan->currentMarkData = scan->currentItemData;
+ }
+}
+
+/*
+ * hashrestrpos() -- restore scan to last saved position
+ */
+void
+hashrestrpos(IndexScanDesc scan)
+{
+ ItemPointer iptr;
+ HashScanOpaque so;
+
+ /* see if we ever call this code. if we do, then so_mrkbuf is a
+ * useful element in the scan->opaque structure. if this procedure
+ * is never called, so_mrkbuf should be removed from the scan->opaque
+ * structure.
+ */
+ elog(NOTICE, "Hashrestrpos() called.");
+
+ so = (HashScanOpaque) scan->opaque;
+
+ /* release lock on current data, if any */
+ if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
+ _hash_relbuf(scan->relation, so->hashso_curbuf, HASH_READ);
+ so->hashso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+
+ /* bump lock on currentMarkData and copy to currentItemData */
+ if (ItemPointerIsValid(&(scan->currentMarkData))) {
+ so->hashso_curbuf =
+ _hash_getbuf(scan->relation,
+ BufferGetBlockNumber(so->hashso_mrkbuf),
+ HASH_READ);
+
+ scan->currentItemData = scan->currentMarkData;
+ }
+}
+
+/* stubs */
+void
+hashdelete(Relation rel, ItemPointer tid)
+{
+ /* adjust any active scans that will be affected by this deletion */
+ _hash_adjscans(rel, tid);
+
+ /* delete the data from the page */
+ _hash_pagedel(rel, tid);
+}
+
diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c
new file mode 100644
index 00000000000..6b37de29911
--- /dev/null
+++ b/src/backend/access/hash/hashfunc.c
@@ -0,0 +1,276 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashfunc.c--
+ * Comparison functions for hash access method.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/hash/hashfunc.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ * NOTES
+ * These functions are stored in pg_amproc. For each operator class
+ * defined on hash tables, they compute the hash value of the argument.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+#include "utils/nabstime.h"
+
+uint32 hashint2(int16 key)
+{
+ return ((uint32) ~key);
+}
+
+uint32 hashint4(uint32 key)
+{
+ return (~key);
+}
+
+/* Hash function from Chris Torek. */
+uint32 hashfloat4(float32 keyp)
+{
+ int len;
+ int loop;
+ uint32 h;
+ char *kp = (char *) keyp;
+
+ len = sizeof(float32data);
+
+#define HASH4a h = (h << 5) - h + *kp++;
+#define HASH4b h = (h << 5) + h + *kp++;
+#define HASH4 HASH4b
+
+
+ h = 0;
+ if (len > 0) {
+ loop = (len + 8 - 1) >> 3;
+
+ switch (len & (8 - 1)) {
+ case 0:
+ do { /* All fall throughs */
+ HASH4;
+ case 7:
+ HASH4;
+ case 6:
+ HASH4;
+ case 5:
+ HASH4;
+ case 4:
+ HASH4;
+ case 3:
+ HASH4;
+ case 2:
+ HASH4;
+ case 1:
+ HASH4;
+ } while (--loop);
+ }
+ }
+ return (h);
+}
+
+
+uint32 hashfloat8(float64 keyp)
+{
+ int len;
+ int loop;
+ uint32 h;
+ char *kp = (char *) keyp;
+
+ len = sizeof(float64data);
+
+#define HASH4a h = (h << 5) - h + *kp++;
+#define HASH4b h = (h << 5) + h + *kp++;
+#define HASH4 HASH4b
+
+
+ h = 0;
+ if (len > 0) {
+ loop = (len + 8 - 1) >> 3;
+
+ switch (len & (8 - 1)) {
+ case 0:
+ do { /* All fall throughs */
+ HASH4;
+ case 7:
+ HASH4;
+ case 6:
+ HASH4;
+ case 5:
+ HASH4;
+ case 4:
+ HASH4;
+ case 3:
+ HASH4;
+ case 2:
+ HASH4;
+ case 1:
+ HASH4;
+ } while (--loop);
+ }
+ }
+ return (h);
+}
+
+
+uint32 hashoid(Oid key)
+{
+ return ((uint32) ~key);
+}
+
+
+uint32 hashchar(char key)
+{
+ int len;
+ uint32 h;
+
+ len = sizeof(char);
+
+#define PRIME1 37
+#define PRIME2 1048583
+
+ h = 0;
+ /* Convert char to integer */
+ h = h * PRIME1 ^ (key - ' ');
+ h %= PRIME2;
+
+ return (h);
+}
+
+uint32 hashchar2(uint16 intkey)
+{
+ uint32 h;
+ int len;
+ char *key = (char *) &intkey;
+
+ h = 0;
+ len = sizeof(uint16);
+ /* Convert string to integer */
+ while (len--)
+ h = h * PRIME1 ^ (*key++ - ' ');
+ h %= PRIME2;
+
+ return (h);
+}
+
+uint32 hashchar4(uint32 intkey)
+{
+ uint32 h;
+ int len;
+ char *key = (char *) &intkey;
+
+ h = 0;
+ len = sizeof(uint32);
+ /* Convert string to integer */
+ while (len--)
+ h = h * PRIME1 ^ (*key++ - ' ');
+ h %= PRIME2;
+
+ return (h);
+}
+
+uint32 hashchar8(char *key)
+{
+ uint32 h;
+ int len;
+
+ h = 0;
+ len = sizeof(char8);
+ /* Convert string to integer */
+ while (len--)
+ h = h * PRIME1 ^ (*key++ - ' ');
+ h %= PRIME2;
+
+ return (h);
+}
+
+uint32 hashname(NameData *n)
+{
+ uint32 h;
+ int len;
+ char *key;
+
+ key = n->data;
+
+ h = 0;
+ len = NAMEDATALEN;
+ /* Convert string to integer */
+ while (len--)
+ h = h * PRIME1 ^ (*key++ - ' ');
+ h %= PRIME2;
+
+ return (h);
+}
+
+
+uint32 hashchar16(char *key)
+{
+ uint32 h;
+ int len;
+
+ h = 0;
+ len = sizeof(char16);
+ /* Convert string to integer */
+ while (len--)
+ h = h * PRIME1 ^ (*key++ - ' ');
+ h %= PRIME2;
+
+ return (h);
+}
+
+
+/*
+ * (Comment from the original db3 hashing code: )
+ *
+ * "This is INCREDIBLY ugly, but fast. We break the string up into 8 byte
+ * units. On the first time through the loop we get the 'leftover bytes'
+ * (strlen % 8). On every other iteration, we perform 8 HASHC's so we handle
+ * all 8 bytes. Essentially, this saves us 7 cmp & branch instructions. If
+ * this routine is heavily used enough, it's worth the ugly coding.
+ *
+ * "OZ's original sdbm hash"
+ */
+uint32 hashtext(struct varlena *key)
+{
+ int keylen;
+ char *keydata;
+ uint32 n;
+ int loop;
+
+ keydata = VARDATA(key);
+ keylen = VARSIZE(key);
+
+ /* keylen includes the four bytes in which string keylength is stored */
+ keylen -= sizeof(VARSIZE(key));
+
+#define HASHC n = *keydata++ + 65599 * n
+
+ n = 0;
+ if (keylen > 0) {
+ loop = (keylen + 8 - 1) >> 3;
+
+ switch (keylen & (8 - 1)) {
+ case 0:
+ do { /* All fall throughs */
+ HASHC;
+ case 7:
+ HASHC;
+ case 6:
+ HASHC;
+ case 5:
+ HASHC;
+ case 4:
+ HASHC;
+ case 3:
+ HASHC;
+ case 2:
+ HASHC;
+ case 1:
+ HASHC;
+ } while (--loop);
+ }
+ }
+ return (n);
+}
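
For reference, the byte-at-a-time recurrence that hashtext() above computes is n = *keydata++ + 65599 * n (the "OZ sdbm" hash); the unrolled switch is only a speed trick. A minimal standalone sketch of the same recurrence, using a plain loop and an arbitrary key, follows (illustrative only, not part of this commit):

#include <stdio.h>
#include <string.h>

typedef unsigned int uint32;

/* same recurrence as the HASHC macro in hashtext() above */
static uint32
sdbm_hash(const char *keydata, int keylen)
{
	uint32 n = 0;

	while (keylen-- > 0)
		n = *keydata++ + 65599 * n;
	return (n);
}

int
main(void)
{
	const char *key = "postgres";	/* arbitrary example key */

	printf("sdbm(\"%s\") = %u\n", key, sdbm_hash(key, (int) strlen(key)));
	return (0);
}
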
diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c
new file mode 100644
index 00000000000..c514cc614d8
--- /dev/null
+++ b/src/backend/access/hash/hashinsert.c
@@ -0,0 +1,239 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashinsert.c--
+ * Item insertion in hash tables for Postgres.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/hash/hashinsert.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/hash.h"
+
+static InsertIndexResult _hash_insertonpg(Relation rel, Buffer buf, int keysz, ScanKey scankey, HashItem hitem, Buffer metabuf);
+static OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf, int keysz, ScanKey itup_scankey, Size itemsize, HashItem hitem);
+
+/*
+ * _hash_doinsert() -- Handle insertion of a single HashItem in the table.
+ *
+ * This routine is called by the public interface routines, hashbuild
+ * and hashinsert. By here, hashitem is filled in, and has a unique
+ * (xid, seqno) pair. The datum to be used as a "key" is in the
+ * hashitem.
+ */
+InsertIndexResult
+_hash_doinsert(Relation rel, HashItem hitem)
+{
+ Buffer buf;
+ Buffer metabuf;
+ BlockNumber blkno;
+ HashMetaPage metap;
+ IndexTuple itup;
+ InsertIndexResult res;
+ ScanKey itup_scankey;
+ int natts;
+ Page page;
+
+ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
+ metap = (HashMetaPage) BufferGetPage(metabuf);
+ _hash_checkpage((Page) metap, LH_META_PAGE);
+
+ /* we need a scan key to do our search, so build one */
+ itup = &(hitem->hash_itup);
+ if ((natts = rel->rd_rel->relnatts) != 1)
+ elog(WARN, "Hash indices valid for only one index key.");
+ itup_scankey = _hash_mkscankey(rel, itup, metap);
+
+ /*
+ * find the first page in the bucket chain containing this key and
+ * place it in buf. _hash_search obtains a read lock for us.
+ */
+ _hash_search(rel, natts, itup_scankey, &buf, metap);
+ page = BufferGetPage(buf);
+ _hash_checkpage(page, LH_BUCKET_PAGE);
+
+ /*
+ * trade in our read lock for a write lock so that we can do the
+ * insertion.
+ */
+ blkno = BufferGetBlockNumber(buf);
+ _hash_relbuf(rel, buf, HASH_READ);
+ buf = _hash_getbuf(rel, blkno, HASH_WRITE);
+
+
+ /*
+ * XXX btree comment (haven't decided what to do in hash): don't
+ * think the bucket can be split while we're reading the metapage.
+ *
+ * If the page was split between the time that we surrendered our
+ * read lock and acquired our write lock, then this page may no
+ * longer be the right place for the key we want to insert.
+ */
+
+ /* do the insertion */
+ res = _hash_insertonpg(rel, buf, natts, itup_scankey,
+ hitem, metabuf);
+
+ /* be tidy */
+ _hash_freeskey(itup_scankey);
+
+ return (res);
+}
+
+/*
+ * _hash_insertonpg() -- Insert a tuple on a particular page in the table.
+ *
+ * This recursive procedure does the following things:
+ *
+ * + if necessary, splits the target page.
+ * + inserts the tuple.
+ *
+ * On entry, we must have the right buffer on which to do the
+ * insertion, and the buffer must be pinned and locked. On return,
+ * we will have dropped both the pin and the write lock on the buffer.
+ *
+ */
+static InsertIndexResult
+_hash_insertonpg(Relation rel,
+ Buffer buf,
+ int keysz,
+ ScanKey scankey,
+ HashItem hitem,
+ Buffer metabuf)
+{
+ InsertIndexResult res;
+ Page page;
+ BlockNumber itup_blkno;
+ OffsetNumber itup_off;
+ int itemsz;
+ HashPageOpaque pageopaque;
+ bool do_expand = false;
+ Buffer ovflbuf;
+ HashMetaPage metap;
+ Bucket bucket;
+
+ metap = (HashMetaPage) BufferGetPage(metabuf);
+ _hash_checkpage((Page) metap, LH_META_PAGE);
+
+ page = BufferGetPage(buf);
+ _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+ pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ bucket = pageopaque->hasho_bucket;
+
+ itemsz = IndexTupleDSize(hitem->hash_itup)
+ + (sizeof(HashItemData) - sizeof(IndexTupleData));
+ itemsz = DOUBLEALIGN(itemsz);
+
+ while (PageGetFreeSpace(page) < itemsz) {
+ /*
+ * no space on this page; check for an overflow page
+ */
+ if (BlockNumberIsValid(pageopaque->hasho_nextblkno)) {
+ /*
+ * ovfl page exists; go get it. if it doesn't have room,
+ * we'll find out next pass through the loop test above.
+ */
+ ovflbuf = _hash_getbuf(rel, pageopaque->hasho_nextblkno,
+ HASH_WRITE);
+ _hash_relbuf(rel, buf, HASH_WRITE);
+ buf = ovflbuf;
+ page = BufferGetPage(buf);
+ } else {
+ /*
+ * we're at the end of the bucket chain and we haven't
+ * found a page with enough room. allocate a new overflow
+ * page.
+ */
+ do_expand = true;
+ ovflbuf = _hash_addovflpage(rel, &metabuf, buf);
+ _hash_relbuf(rel, buf, HASH_WRITE);
+ buf = ovflbuf;
+ page = BufferGetPage(buf);
+
+ if (PageGetFreeSpace(page) < itemsz) {
+ /* it doesn't fit on an empty page -- give up */
+ elog(WARN, "hash item too large");
+ }
+ }
+ _hash_checkpage(page, LH_OVERFLOW_PAGE);
+ pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ Assert(pageopaque->hasho_bucket == bucket);
+ }
+
+ itup_off = _hash_pgaddtup(rel, buf, keysz, scankey, itemsz, hitem);
+ itup_blkno = BufferGetBlockNumber(buf);
+
+ /* by here, the new tuple is inserted */
+ res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));
+
+ ItemPointerSet(&(res->pointerData), itup_blkno, itup_off);
+
+ if (res != NULL) {
+ /*
+ * Increment the number of keys in the table.
+ * We switch lock access type just for a moment
+ * to allow greater accessibility to the metapage.
+ */
+ metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf,
+ HASH_READ, HASH_WRITE);
+ metap->hashm_nkeys += 1;
+ metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf,
+ HASH_WRITE, HASH_READ);
+
+ }
+
+ _hash_wrtbuf(rel, buf);
+
+ if (do_expand ||
+ (metap->hashm_nkeys / (metap->hashm_maxbucket + 1))
+ > metap->hashm_ffactor) {
+ _hash_expandtable(rel, metabuf);
+ }
+ _hash_relbuf(rel, metabuf, HASH_READ);
+ return (res);
+}
+
+/*
+ * _hash_pgaddtup() -- add a tuple to a particular page in the index.
+ *
+ * This routine adds the tuple to the page as requested, and keeps the
+ * write lock and reference associated with the page's buffer. It is
+ * an error to call pgaddtup() without a write lock and reference.
+ */
+static OffsetNumber
+_hash_pgaddtup(Relation rel,
+ Buffer buf,
+ int keysz,
+ ScanKey itup_scankey,
+ Size itemsize,
+ HashItem hitem)
+{
+ OffsetNumber itup_off;
+ Page page;
+
+ page = BufferGetPage(buf);
+ _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+
+ itup_off = OffsetNumberNext(PageGetMaxOffsetNumber(page));
+ (void) PageAddItem(page, (Item) hitem, itemsize, itup_off, LP_USED);
+
+ /* write the buffer, but hold our lock */
+ _hash_wrtnorelbuf(rel, buf);
+
+ return (itup_off);
+}
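
_hash_doinsert() and _hash_insertonpg() above size each HashItem with DOUBLEALIGN before comparing it against PageGetFreeSpace(). A minimal sketch of that rounding, assuming the conventional "round up to a multiple of the double alignment" definition with an 8-byte alignment (both assumptions for illustration; the real macro comes from the Postgres headers):

#include <stdio.h>

/* assumed values, for illustration only */
#define ASSUMED_ALIGNOF_DOUBLE	8
#define DOUBLEALIGN(LEN) \
	(((LEN) + (ASSUMED_ALIGNOF_DOUBLE - 1)) & ~(ASSUMED_ALIGNOF_DOUBLE - 1))

int
main(void)
{
	int len;

	/* e.g. a 13-byte hash item occupies a 16-byte slot on the page */
	for (len = 9; len <= 17; len += 4)
		printf("item size %2d -> aligned size %2d\n", len, DOUBLEALIGN(len));
	return (0);
}
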
diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c
new file mode 100644
index 00000000000..55ee9e9ce79
--- /dev/null
+++ b/src/backend/access/hash/hashovfl.c
@@ -0,0 +1,614 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashovfl.c--
+ * Overflow page management code for the Postgres hash access method
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/hash/hashovfl.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ * NOTES
+ * Overflow pages look like ordinary relation pages.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/genam.h"
+#include "access/hash.h"
+
+static OverflowPageAddress _hash_getovfladdr(Relation rel, Buffer *metabufp);
+static uint32 _hash_firstfreebit(uint32 map);
+
+/*
+ * _hash_addovflpage
+ *
+ * Add an overflow page to the page currently pointed to by the buffer
+ * argument 'buf'.
+ *
+ * *Metabufp has a read lock upon entering the function; buf has a
+ * write lock.
+ *
+ */
+Buffer
+_hash_addovflpage(Relation rel, Buffer *metabufp, Buffer buf)
+{
+
+ OverflowPageAddress oaddr;
+ BlockNumber ovflblkno;
+ Buffer ovflbuf;
+ HashMetaPage metap;
+ HashPageOpaque ovflopaque;
+ HashPageOpaque pageopaque;
+ Page page;
+ Page ovflpage;
+
+ /* this had better be the last page in a bucket chain */
+ page = BufferGetPage(buf);
+ _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+ pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ Assert(!BlockNumberIsValid(pageopaque->hasho_nextblkno));
+
+ metap = (HashMetaPage) BufferGetPage(*metabufp);
+ _hash_checkpage((Page) metap, LH_META_PAGE);
+
+ /* allocate an empty overflow page */
+ oaddr = _hash_getovfladdr(rel, metabufp);
+ if (oaddr == InvalidOvflAddress) {
+ elog(WARN, "_hash_addovflpage: problem with _hash_getovfladdr.");
+ }
+ ovflblkno = OADDR_TO_BLKNO(OADDR_OF(SPLITNUM(oaddr), OPAGENUM(oaddr)));
+ Assert(BlockNumberIsValid(ovflblkno));
+ ovflbuf = _hash_getbuf(rel, ovflblkno, HASH_WRITE);
+ Assert(BufferIsValid(ovflbuf));
+ ovflpage = BufferGetPage(ovflbuf);
+
+ /* initialize the new overflow page */
+ _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf));
+ ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
+ ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
+ ovflopaque->hasho_nextblkno = InvalidBlockNumber;
+ ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
+ ovflopaque->hasho_oaddr = oaddr;
+ ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
+ _hash_wrtnorelbuf(rel, ovflbuf);
+
+ /* logically chain overflow page to previous page */
+ pageopaque->hasho_nextblkno = ovflblkno;
+ _hash_wrtnorelbuf(rel, buf);
+ return (ovflbuf);
+}
+
+/*
+ * _hash_getovfladdr()
+ *
+ * Find an available overflow page and return its address.
+ *
+ * When we enter this function, we have a read lock on *metabufp which
+ * we change to a write lock immediately. Before exiting, the write lock
+ * is exchanged for a read lock.
+ *
+ */
+static OverflowPageAddress
+_hash_getovfladdr(Relation rel, Buffer *metabufp)
+{
+ HashMetaPage metap;
+ Buffer mapbuf;
+ BlockNumber blkno;
+ PageOffset offset;
+ OverflowPageAddress oaddr;
+ SplitNumber splitnum;
+ uint32 *freep;
+ uint32 max_free;
+ uint32 bit;
+ uint32 first_page;
+ uint32 free_bit;
+ uint32 free_page;
+ uint32 in_use_bits;
+ uint32 i, j;
+
+ metap = (HashMetaPage) _hash_chgbufaccess(rel, metabufp, HASH_READ, HASH_WRITE);
+
+ splitnum = metap->OVFL_POINT;
+ max_free = metap->SPARES[splitnum];
+
+ free_page = (max_free - 1) >> (metap->BSHIFT + BYTE_TO_BIT);
+ free_bit = (max_free - 1) & (BMPGSZ_BIT(metap) - 1);
+
+ /* Look through all the free maps to find the first free block */
+ first_page = metap->LAST_FREED >> (metap->BSHIFT + BYTE_TO_BIT);
+ for ( i = first_page; i <= free_page; i++ ) {
+ Page mappage;
+
+ blkno = metap->hashm_mapp[i];
+ mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE);
+ mappage = BufferGetPage(mapbuf);
+ _hash_checkpage(mappage, LH_BITMAP_PAGE);
+ freep = HashPageGetBitmap(mappage);
+ Assert(freep);
+
+ if (i == free_page)
+ in_use_bits = free_bit;
+ else
+ in_use_bits = BMPGSZ_BIT(metap) - 1;
+
+ if (i == first_page) {
+ bit = metap->LAST_FREED & (BMPGSZ_BIT(metap) - 1);
+ j = bit / BITS_PER_MAP;
+ bit = bit & ~(BITS_PER_MAP - 1);
+ } else {
+ bit = 0;
+ j = 0;
+ }
+ for (; bit <= in_use_bits; j++, bit += BITS_PER_MAP)
+ if (freep[j] != ALL_SET)
+ goto found;
+ }
+
+ /* No Free Page Found - have to allocate a new page */
+ metap->LAST_FREED = metap->SPARES[splitnum];
+ metap->SPARES[splitnum]++;
+ offset = metap->SPARES[splitnum] -
+ (splitnum ? metap->SPARES[splitnum - 1] : 0);
+
+#define OVMSG "HASH: Out of overflow pages. Out of luck.\n"
+
+ if (offset > SPLITMASK) {
+ if (++splitnum >= NCACHED) {
+ elog(WARN, OVMSG);
+ }
+ metap->OVFL_POINT = splitnum;
+ metap->SPARES[splitnum] = metap->SPARES[splitnum-1];
+ metap->SPARES[splitnum-1]--;
+ offset = 0;
+ }
+
+ /* Check if we need to allocate a new bitmap page */
+ if (free_bit == BMPGSZ_BIT(metap) - 1) {
+ /* won't be needing old map page */
+
+ _hash_relbuf(rel, mapbuf, HASH_WRITE);
+
+ free_page++;
+ if (free_page >= NCACHED) {
+ elog(WARN, OVMSG);
+ }
+
+ /*
+ * This is tricky. The 1 indicates that you want the new page
+ * allocated with 1 clear bit. Actually, you are going to
+ * allocate 2 pages from this map. The first is going to be
+ * the map page, the second is the overflow page we were
+ * looking for. The init_bitmap routine automatically sets
+ * the first bit of itself to indicate that the bitmap itself
+ * is in use. We would explicitly set the second bit, but
+ * don't have to if we tell init_bitmap not to leave it clear
+ * in the first place.
+ */
+ if (_hash_initbitmap(rel, metap, OADDR_OF(splitnum, offset),
+ 1, free_page)) {
+ elog(WARN, "overflow_page: problem with _hash_initbitmap.");
+ }
+ metap->SPARES[splitnum]++;
+ offset++;
+ if (offset > SPLITMASK) {
+ if (++splitnum >= NCACHED) {
+ elog(WARN, OVMSG);
+ }
+ metap->OVFL_POINT = splitnum;
+ metap->SPARES[splitnum] = metap->SPARES[splitnum-1];
+ metap->SPARES[splitnum-1]--;
+ offset = 0;
+ }
+ } else {
+
+ /*
+ * Free_bit addresses the last used bit. Bump it to address
+ * the first available bit.
+ */
+ free_bit++;
+ SETBIT(freep, free_bit);
+ _hash_wrtbuf(rel, mapbuf);
+ }
+
+ /* Calculate address of the new overflow page */
+ oaddr = OADDR_OF(splitnum, offset);
+ _hash_chgbufaccess(rel, metabufp, HASH_WRITE, HASH_READ);
+ return (oaddr);
+
+ found:
+ bit = bit + _hash_firstfreebit(freep[j]);
+ SETBIT(freep, bit);
+ _hash_wrtbuf(rel, mapbuf);
+
+ /*
+ * Bits are addressed starting with 0, but overflow pages are addressed
+ * beginning at 1. Bit is a bit address number, so we need to increment
+ * it to convert it to a page number.
+ */
+
+ bit = 1 + bit + (i * BMPGSZ_BIT(metap));
+ if (bit >= metap->LAST_FREED) {
+ metap->LAST_FREED = bit - 1;
+ }
+
+ /* Calculate the split number for this page */
+ for (i = 0; (i < splitnum) && (bit > metap->SPARES[i]); i++)
+ ;
+ offset = (i ? bit - metap->SPARES[i - 1] : bit);
+ if (offset >= SPLITMASK) {
+ elog(WARN, OVMSG);
+ }
+
+ /* initialize this page */
+ oaddr = OADDR_OF(i, offset);
+ _hash_chgbufaccess(rel, metabufp, HASH_WRITE, HASH_READ);
+ return (oaddr);
+}
+
+/*
+ * _hash_firstfreebit()
+ *
+ * Return the first bit that is not set in the argument 'map'. This
+ * function is used to find an available overflow page within a
+ * splitnumber.
+ *
+ */
+static uint32
+_hash_firstfreebit(uint32 map)
+{
+ uint32 i, mask;
+
+ mask = 0x1;
+ for (i = 0; i < BITS_PER_MAP; i++) {
+ if (!(mask & map))
+ return (i);
+ mask = mask << 1;
+ }
+ return (i);
+}
+
+/*
+ * _hash_freeovflpage() -
+ *
+ * Mark this overflow page as free and return a buffer with
+ * the page that follows it (which may be defined as
+ * InvalidBuffer).
+ *
+ */
+Buffer
+_hash_freeovflpage(Relation rel, Buffer ovflbuf)
+{
+ HashMetaPage metap;
+ Buffer metabuf;
+ Buffer mapbuf;
+ BlockNumber prevblkno;
+ BlockNumber blkno;
+ BlockNumber nextblkno;
+ HashPageOpaque ovflopaque;
+ Page ovflpage;
+ Page mappage;
+ OverflowPageAddress addr;
+ SplitNumber splitnum;
+ uint32 *freep;
+ uint32 ovflpgno;
+ int32 bitmappage, bitmapbit;
+ Bucket bucket;
+
+ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
+ metap = (HashMetaPage) BufferGetPage(metabuf);
+ _hash_checkpage((Page) metap, LH_META_PAGE);
+
+ ovflpage = BufferGetPage(ovflbuf);
+ _hash_checkpage(ovflpage, LH_OVERFLOW_PAGE);
+ ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
+ addr = ovflopaque->hasho_oaddr;
+ nextblkno = ovflopaque->hasho_nextblkno;
+ prevblkno = ovflopaque->hasho_prevblkno;
+ bucket = ovflopaque->hasho_bucket;
+ (void) memset(ovflpage, 0, BufferGetPageSize(ovflbuf));
+ _hash_wrtbuf(rel, ovflbuf);
+
+ /*
+ * fix up the bucket chain. this is a doubly-linked list, so we
+ * must fix up the bucket chain members behind and ahead of the
+ * overflow page being deleted.
+ *
+ * XXX this should look like:
+ * - lock prev/next
+ * - modify/write prev/next (how to do write ordering with a
+ * doubly-linked list???)
+ * - unlock prev/next
+ */
+ if (BlockNumberIsValid(prevblkno)) {
+ Buffer prevbuf = _hash_getbuf(rel, prevblkno, HASH_WRITE);
+ Page prevpage = BufferGetPage(prevbuf);
+ HashPageOpaque prevopaque =
+ (HashPageOpaque) PageGetSpecialPointer(prevpage);
+
+ _hash_checkpage(prevpage, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+ Assert(prevopaque->hasho_bucket == bucket);
+ prevopaque->hasho_nextblkno = nextblkno;
+ _hash_wrtbuf(rel, prevbuf);
+ }
+ if (BlockNumberIsValid(nextblkno)) {
+ Buffer nextbuf = _hash_getbuf(rel, nextblkno, HASH_WRITE);
+ Page nextpage = BufferGetPage(nextbuf);
+ HashPageOpaque nextopaque =
+ (HashPageOpaque) PageGetSpecialPointer(nextpage);
+
+ _hash_checkpage(nextpage, LH_OVERFLOW_PAGE);
+ Assert(nextopaque->hasho_bucket == bucket);
+ nextopaque->hasho_prevblkno = prevblkno;
+ _hash_wrtbuf(rel, nextbuf);
+ }
+
+ /*
+ * Fix up the overflow page bitmap that tracks this particular
+ * overflow page. The bitmap can be found in the MetaPageData
+ * array element hashm_mapp[bitmappage].
+ */
+ splitnum = (addr >> SPLITSHIFT);
+ ovflpgno =
+ (splitnum ? metap->SPARES[splitnum - 1] : 0) + (addr & SPLITMASK) - 1;
+
+ if (ovflpgno < metap->LAST_FREED) {
+ metap->LAST_FREED = ovflpgno;
+ }
+
+ bitmappage = (ovflpgno >> (metap->BSHIFT + BYTE_TO_BIT));
+ bitmapbit = ovflpgno & (BMPGSZ_BIT(metap) - 1);
+
+ blkno = metap->hashm_mapp[bitmappage];
+ mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE);
+ mappage = BufferGetPage(mapbuf);
+ _hash_checkpage(mappage, LH_BITMAP_PAGE);
+ freep = HashPageGetBitmap(mappage);
+ CLRBIT(freep, bitmapbit);
+ _hash_wrtbuf(rel, mapbuf);
+
+ _hash_relbuf(rel, metabuf, HASH_WRITE);
+
+ /*
+ * now instantiate the page that replaced this one,
+ * if it exists, and return that buffer with a write lock.
+ */
+ if (BlockNumberIsValid(nextblkno)) {
+ return (_hash_getbuf(rel, nextblkno, HASH_WRITE));
+ } else {
+ return (InvalidBuffer);
+ }
+}
+
+
+/*
+ * _hash_initbitmap()
+ *
+ * Initialize a new bitmap page. The metapage has a write-lock upon
+ * entering the function.
+ *
+ * 'pnum' is the OverflowPageAddress of the new bitmap page.
+ * 'nbits' is how many bits to clear (i.e., make available) in the new
+ * bitmap page. the remainder of the bits (as well as the first bit,
+ * representing the bitmap page itself) will be set.
+ * 'ndx' is the 0-based offset of the new bitmap page within the
+ * metapage's array of bitmap page OverflowPageAddresses.
+ */
+
+#define INT_MASK ((1 << INT_TO_BIT) -1)
+
+int32
+_hash_initbitmap(Relation rel,
+ HashMetaPage metap,
+ int32 pnum,
+ int32 nbits,
+ int32 ndx)
+{
+ Buffer buf;
+ BlockNumber blkno;
+ Page pg;
+ HashPageOpaque op;
+ uint32 *freep;
+ int clearbytes, clearints;
+
+ blkno = OADDR_TO_BLKNO(pnum);
+ buf = _hash_getbuf(rel, blkno, HASH_WRITE);
+ pg = BufferGetPage(buf);
+ _hash_pageinit(pg, BufferGetPageSize(buf));
+ op = (HashPageOpaque) PageGetSpecialPointer(pg);
+ op->hasho_oaddr = InvalidOvflAddress;
+ op->hasho_prevblkno = InvalidBlockNumber;
+ op->hasho_nextblkno = InvalidBlockNumber;
+ op->hasho_flag = LH_BITMAP_PAGE;
+ op->hasho_bucket = -1;
+
+ freep = HashPageGetBitmap(pg);
+
+ /* set all of the bits above 'nbits' to 1 */
+ clearints = ((nbits - 1) >> INT_TO_BIT) + 1;
+ clearbytes = clearints << INT_TO_BYTE;
+ (void) memset((char *) freep, 0, clearbytes);
+ (void) memset(((char *) freep) + clearbytes, 0xFF,
+ BMPGSZ_BYTE(metap) - clearbytes);
+ freep[clearints - 1] = ALL_SET << (nbits & INT_MASK);
+
+ /* bit 0 represents the new bitmap page */
+ SETBIT(freep, 0);
+
+ /* metapage already has a write lock */
+ metap->hashm_nmaps++;
+ metap->hashm_mapp[ndx] = blkno;
+
+ /* write out the new bitmap page (releasing its locks) */
+ _hash_wrtbuf(rel, buf);
+
+ return (0);
+}
+
+
+/*
+ * _hash_squeezebucket(rel, bucket)
+ *
+ * Try to squeeze the tuples onto pages occurring earlier in the
+ * bucket chain in an attempt to free overflow pages. When we start
+ * the "squeezing", the page from which we start taking tuples (the
+ * "read" page) is the last bucket in the bucket chain and the page
+ * onto which we start squeezing tuples (the "write" page) is the
+ * first page in the bucket chain. The read page works backward and
+ * the write page works forward; the procedure terminates when the
+ * read page and write page are the same page.
+ */
+void
+_hash_squeezebucket(Relation rel,
+ HashMetaPage metap,
+ Bucket bucket)
+{
+ Buffer wbuf;
+ Buffer rbuf;
+ BlockNumber wblkno;
+ BlockNumber rblkno;
+ Page wpage;
+ Page rpage;
+ HashPageOpaque wopaque;
+ HashPageOpaque ropaque;
+ OffsetNumber woffnum;
+ OffsetNumber roffnum;
+ HashItem hitem;
+ int itemsz;
+
+/* elog(DEBUG, "_hash_squeezebucket: squeezing bucket %d", bucket); */
+
+ /*
+ * start squeezing into the base bucket page.
+ */
+ wblkno = BUCKET_TO_BLKNO(bucket);
+ wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE);
+ wpage = BufferGetPage(wbuf);
+ _hash_checkpage(wpage, LH_BUCKET_PAGE);
+ wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
+
+ /*
+ * if there aren't any overflow pages, there's nothing to squeeze.
+ */
+ if (!BlockNumberIsValid(wopaque->hasho_nextblkno)) {
+ _hash_relbuf(rel, wbuf, HASH_WRITE);
+ return;
+ }
+
+ /*
+ * find the last page in the bucket chain by starting at the base
+ * bucket page and working forward.
+ *
+ * XXX if chains tend to be long, we should probably move forward
+ * using HASH_READ and then _hash_chgbufaccess to HASH_WRITE when
+ * we reach the end. if they are short we probably don't care
+ * very much. if the hash function is working at all, they had
+ * better be short.
+ */
+ ropaque = wopaque;
+ do {
+ rblkno = ropaque->hasho_nextblkno;
+ if (ropaque != wopaque) {
+ _hash_relbuf(rel, rbuf, HASH_WRITE);
+ }
+ rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE);
+ rpage = BufferGetPage(rbuf);
+ _hash_checkpage(rpage, LH_OVERFLOW_PAGE);
+ Assert(!PageIsEmpty(rpage));
+ ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
+ Assert(ropaque->hasho_bucket == bucket);
+ } while (BlockNumberIsValid(ropaque->hasho_nextblkno));
+
+ /*
+ * squeeze the tuples.
+ */
+ roffnum = FirstOffsetNumber;
+ for(;;) {
+ hitem = (HashItem) PageGetItem(rpage, PageGetItemId(rpage, roffnum));
+ itemsz = IndexTupleDSize(hitem->hash_itup)
+ + (sizeof(HashItemData) - sizeof(IndexTupleData));
+ itemsz = DOUBLEALIGN(itemsz);
+
+ /*
+ * walk up the bucket chain, looking for a page big enough for
+ * this item.
+ */
+ while (PageGetFreeSpace(wpage) < itemsz) {
+ wblkno = wopaque->hasho_nextblkno;
+
+ _hash_wrtbuf(rel, wbuf);
+
+ if (!BlockNumberIsValid(wblkno) || (rblkno == wblkno)) {
+ _hash_wrtbuf(rel, rbuf);
+ /* wbuf is already released */
+ return;
+ }
+
+ wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE);
+ wpage = BufferGetPage(wbuf);
+ _hash_checkpage(wpage, LH_OVERFLOW_PAGE);
+ Assert(!PageIsEmpty(wpage));
+ wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
+ Assert(wopaque->hasho_bucket == bucket);
+ }
+
+ /*
+ * if we're here, we have found room so insert on the "write"
+ * page.
+ */
+ woffnum = OffsetNumberNext(PageGetMaxOffsetNumber(wpage));
+ (void) PageAddItem(wpage, (Item) hitem, itemsz, woffnum, LP_USED);
+
+ /*
+ * delete the tuple from the "read" page.
+ * PageIndexTupleDelete repacks the ItemId array, so 'roffnum'
+ * will be "advanced" to the "next" ItemId.
+ */
+ PageIndexTupleDelete(rpage, roffnum);
+ _hash_wrtnorelbuf(rel, rbuf);
+
+ /*
+ * if the "read" page is now empty because of the deletion,
+ * free it.
+ */
+ if (PageIsEmpty(rpage) && (ropaque->hasho_flag & LH_OVERFLOW_PAGE)) {
+ rblkno = ropaque->hasho_prevblkno;
+ Assert(BlockNumberIsValid(rblkno));
+
+ /*
+ * free this overflow page. the extra _hash_relbuf is
+ * because _hash_freeovflpage gratuitously returns the
+ * next page (we want the previous page and will get it
+ * ourselves later).
+ */
+ rbuf = _hash_freeovflpage(rel, rbuf);
+ if (BufferIsValid(rbuf)) {
+ _hash_relbuf(rel, rbuf, HASH_WRITE);
+ }
+
+ if (rblkno == wblkno) {
+ /* rbuf is already released */
+ _hash_wrtbuf(rel, wbuf);
+ return;
+ }
+
+ rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE);
+ rpage = BufferGetPage(rbuf);
+ _hash_checkpage(rpage, LH_OVERFLOW_PAGE);
+ Assert(!PageIsEmpty(rpage));
+ ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
+ Assert(ropaque->hasho_bucket == bucket);
+
+ roffnum = FirstOffsetNumber;
+ }
+ }
+}
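
_hash_getovfladdr() above walks the bitmap pages one word at a time, skipping words equal to ALL_SET and then calling _hash_firstfreebit() to locate the lowest clear bit. A self-contained sketch of that search, with a two-word array standing in for a bitmap page and BITS_PER_MAP assumed to be 32 (assumptions for illustration only):

#include <stdio.h>

typedef unsigned int uint32;

#define ALL_SET		((uint32) ~0)
#define BITS_PER_MAP	32	/* assumed width of one bitmap word */

/* mirrors _hash_firstfreebit() above: lowest clear bit in 'map' */
static uint32
first_free_bit(uint32 map)
{
	uint32 i, mask;

	mask = 0x1;
	for (i = 0; i < BITS_PER_MAP; i++) {
		if (!(mask & map))
			return (i);
		mask = mask << 1;
	}
	return (i);
}

int
main(void)
{
	/* word 0 is fully allocated; word 1 has bits 0..2 in use */
	uint32 freep[] = { ALL_SET, 0x00000007 };
	int j;

	for (j = 0; j < 2; j++) {
		if (freep[j] != ALL_SET) {
			printf("first free bit: word %d, bit %u\n",
			       j, first_free_bit(freep[j]));
			return (0);
		}
	}
	printf("no free bit; a new bitmap page would be needed\n");
	return (0);
}
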
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
new file mode 100644
index 00000000000..2c6ebed8350
--- /dev/null
+++ b/src/backend/access/hash/hashpage.c
@@ -0,0 +1,669 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashpage.c--
+ * Hash table page management code for the Postgres hash access method
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ * NOTES
+ * Postgres hash pages look like ordinary relation pages. The opaque
+ * data at high addresses includes information about the page including
+ * whether a page is an overflow page or a true bucket, the block
+ * numbers of the preceding and following pages, and the overflow
+ * address of the page if it is an overflow page.
+ *
+ * The first page in a hash relation, page zero, is special -- it stores
+ * information describing the hash table; it is referred to as the
+ * "meta page." Pages one and higher store the actual data.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/genam.h"
+#include "access/hash.h"
+
+static void _hash_setpagelock(Relation rel, BlockNumber blkno, int access);
+static void _hash_unsetpagelock(Relation rel, BlockNumber blkno, int access);
+static void _hash_splitpage(Relation rel, Buffer metabuf, Bucket obucket, Bucket nbucket);
+
+/*
+ * We use high-concurrency locking on hash indices. There are two cases in
+ * which we don't do locking. One is when we're building the index.
+ * Since the creating transaction has not committed, no one can see
+ * the index, and there's no reason to share locks. The second case
+ * is when we're just starting up the database system. We use some
+ * special-purpose initialization code in the relation cache manager
+ * (see utils/cache/relcache.c) to allow us to do indexed scans on
+ * the system catalogs before we'd normally be able to. This happens
+ * before the lock table is fully initialized, so we can't use it.
+ * Strictly speaking, this violates 2pl, but we don't do 2pl on the
+ * system catalogs anyway.
+ */
+
+
+#define USELOCKING (!BuildingHash && !IsInitProcessingMode())
+
+
+/*
+ * _hash_metapinit() -- Initialize the metadata page of a hash index,
+ * the two buckets that we begin with and the initial
+ * bitmap page.
+ */
+void
+_hash_metapinit(Relation rel)
+{
+ HashMetaPage metap;
+ HashPageOpaque pageopaque;
+ Buffer metabuf;
+ Buffer buf;
+ Page pg;
+ int nbuckets;
+ uint32 nelem; /* number elements */
+ uint32 lg2nelem; /* _hash_log2(nelem) */
+ uint32 nblocks;
+ uint16 i;
+
+ /* can't be sharing this with anyone, now... */
+ if (USELOCKING)
+ RelationSetLockForWrite(rel);
+
+ if ((nblocks = RelationGetNumberOfBlocks(rel)) != 0) {
+ elog(WARN, "Cannot initialize non-empty hash table %s",
+ RelationGetRelationName(rel));
+ }
+
+ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
+ pg = BufferGetPage(metabuf);
+ metap = (HashMetaPage) pg;
+ _hash_pageinit(pg, BufferGetPageSize(metabuf));
+
+ metap->hashm_magic = HASH_MAGIC;
+ metap->hashm_version = HASH_VERSION;
+ metap->hashm_nkeys = 0;
+ metap->hashm_nmaps = 0;
+ metap->hashm_ffactor = DEFAULT_FFACTOR;
+ metap->hashm_bsize = BufferGetPageSize(metabuf);
+ metap->hashm_bshift = _hash_log2(metap->hashm_bsize);
+ for (i = metap->hashm_bshift; i > 0; --i) {
+ if ((1 << i) < (metap->hashm_bsize -
+ (DOUBLEALIGN(sizeof(PageHeaderData)) +
+ DOUBLEALIGN(sizeof(HashPageOpaqueData))))) {
+ break;
+ }
+ }
+ Assert(i);
+ metap->hashm_bmsize = 1 << i;
+ metap->hashm_procid = index_getprocid(rel, 1, HASHPROC);
+
+ /*
+ * Make nelem = 2 rather than 0 so that we end up allocating space
+ * for the next greater power of two number of buckets.
+ */
+ nelem = 2;
+ lg2nelem = 1; /*_hash_log2(MAX(nelem, 2)) */
+ nbuckets = 2; /*1 << lg2nelem */
+
+ memset((char *) metap->hashm_spares, 0, sizeof(metap->hashm_spares));
+ memset((char *) metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
+
+ metap->hashm_spares[lg2nelem] = 2; /* lg2nelem + 1 */
+ metap->hashm_spares[lg2nelem + 1] = 2; /* lg2nelem + 1 */
+ metap->hashm_ovflpoint = 1; /* lg2nelem */
+ metap->hashm_lastfreed = 2;
+
+ metap->hashm_maxbucket = metap->hashm_lowmask = 1; /* nbuckets - 1 */
+ metap->hashm_highmask = 3; /* (nbuckets << 1) - 1 */
+
+ pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
+ pageopaque->hasho_oaddr = InvalidOvflAddress;
+ pageopaque->hasho_prevblkno = InvalidBlockNumber;
+ pageopaque->hasho_nextblkno = InvalidBlockNumber;
+ pageopaque->hasho_flag = LH_META_PAGE;
+ pageopaque->hasho_bucket = -1;
+
+ /*
+ * First bitmap page is at: splitpoint lg2nelem page offset 1 which
+ * turns out to be page 3. Couldn't initialize page 3 until we created
+ * the first two buckets above.
+ */
+ if (_hash_initbitmap(rel, metap, OADDR_OF(lg2nelem, 1), lg2nelem + 1, 0))
+ elog(WARN, "Problem with _hash_initbitmap.");
+
+ /* all done */
+ _hash_wrtnorelbuf(rel, metabuf);
+
+ /*
+ * initialize the first two buckets
+ */
+ for (i = 0; i <= 1; i++) {
+ buf = _hash_getbuf(rel, BUCKET_TO_BLKNO(i), HASH_WRITE);
+ pg = BufferGetPage(buf);
+ _hash_pageinit(pg, BufferGetPageSize(buf));
+ pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
+ pageopaque->hasho_oaddr = InvalidOvflAddress;
+ pageopaque->hasho_prevblkno = InvalidBlockNumber;
+ pageopaque->hasho_nextblkno = InvalidBlockNumber;
+ pageopaque->hasho_flag = LH_BUCKET_PAGE;
+ pageopaque->hasho_bucket = i;
+ _hash_wrtbuf(rel, buf);
+ }
+
+ _hash_relbuf(rel, metabuf, HASH_WRITE);
+
+ if (USELOCKING)
+ RelationUnsetLockForWrite(rel);
+}
+
+/*
+ * _hash_getbuf() -- Get a buffer by block number for read or write.
+ *
+ * When this routine returns, the appropriate lock is set on the
+ * requested buffer and its reference count is correct.
+ *
+ * XXX P_NEW is not used because, unlike the tree structures, we
+ * need the bucket blocks to be at certain block numbers. we must
+ * depend on the caller to call _hash_pageinit on the block if it
+ * knows that this is a new block.
+ */
+Buffer
+_hash_getbuf(Relation rel, BlockNumber blkno, int access)
+{
+ Buffer buf;
+
+ if (blkno == P_NEW) {
+ elog(WARN, "_hash_getbuf: internal error: hash AM does not use P_NEW");
+ }
+ switch (access) {
+ case HASH_WRITE:
+ case HASH_READ:
+ _hash_setpagelock(rel, blkno, access);
+ break;
+ default:
+ elog(WARN, "_hash_getbuf: invalid access (%d) on new blk: %.*s",
+ access, NAMEDATALEN, RelationGetRelationName(rel));
+ break;
+ }
+ buf = ReadBuffer(rel, blkno);
+
+ /* ref count and lock type are correct */
+ return (buf);
+}
+
+/*
+ * _hash_relbuf() -- release a locked buffer.
+ */
+void
+_hash_relbuf(Relation rel, Buffer buf, int access)
+{
+ BlockNumber blkno;
+
+ blkno = BufferGetBlockNumber(buf);
+
+ switch (access) {
+ case HASH_WRITE:
+ case HASH_READ:
+ _hash_unsetpagelock(rel, blkno, access);
+ break;
+ default:
+ elog(WARN, "_hash_relbuf: invalid access (%d) on blk %x: %.*s",
+ access, blkno, NAMEDATALEN, RelationGetRelationName(rel));
+ }
+
+ ReleaseBuffer(buf);
+}
+
+/*
+ * _hash_wrtbuf() -- write a hash page to disk.
+ *
+ * This routine releases the lock held on the buffer and our reference
+ * to it. It is an error to call _hash_wrtbuf() without a write lock
+ * or a reference to the buffer.
+ */
+void
+_hash_wrtbuf(Relation rel, Buffer buf)
+{
+ BlockNumber blkno;
+
+ blkno = BufferGetBlockNumber(buf);
+ WriteBuffer(buf);
+ _hash_unsetpagelock(rel, blkno, HASH_WRITE);
+}
+
+/*
+ * _hash_wrtnorelbuf() -- write a hash page to disk, but do not release
+ * our reference or lock.
+ *
+ * It is an error to call _hash_wrtnorelbuf() without a write lock
+ * or a reference to the buffer.
+ */
+void
+_hash_wrtnorelbuf(Relation rel, Buffer buf)
+{
+ BlockNumber blkno;
+
+ blkno = BufferGetBlockNumber(buf);
+ WriteNoReleaseBuffer(buf);
+}
+
+Page
+_hash_chgbufaccess(Relation rel,
+ Buffer *bufp,
+ int from_access,
+ int to_access)
+{
+ BlockNumber blkno;
+
+ blkno = BufferGetBlockNumber(*bufp);
+
+ switch (from_access) {
+ case HASH_WRITE:
+ _hash_wrtbuf(rel, *bufp);
+ break;
+ case HASH_READ:
+ _hash_relbuf(rel, *bufp, from_access);
+ break;
+ default:
+ elog(WARN, "_hash_chgbufaccess: invalid access (%d) on blk %x: %.*s",
+ from_access, blkno, NAMEDATALEN, RelationGetRelationName(rel));
+ break;
+ }
+ *bufp = _hash_getbuf(rel, blkno, to_access);
+ return (BufferGetPage(*bufp));
+}
+
+/*
+ * _hash_pageinit() -- Initialize a new page.
+ */
+void
+_hash_pageinit(Page page, Size size)
+{
+ Assert(((PageHeader) page)->pd_lower == 0);
+ Assert(((PageHeader) page)->pd_upper == 0);
+ Assert(((PageHeader) page)->pd_special == 0);
+
+ /*
+ * Cargo-cult programming -- don't really need this to be zero, but
+ * creating new pages is an infrequent occurrence and it makes me feel
+ * good when I know they're empty.
+ */
+ memset(page, 0, size);
+
+ PageInit(page, size, sizeof(HashPageOpaqueData));
+}
+
+static void
+_hash_setpagelock(Relation rel,
+ BlockNumber blkno,
+ int access)
+{
+ ItemPointerData iptr;
+
+ if (USELOCKING) {
+ ItemPointerSet(&iptr, blkno, 1);
+
+ switch (access) {
+ case HASH_WRITE:
+ RelationSetSingleWLockPage(rel, &iptr);
+ break;
+ case HASH_READ:
+ RelationSetSingleRLockPage(rel, &iptr);
+ break;
+ default:
+ elog(WARN, "_hash_setpagelock: invalid access (%d) on blk %x: %.*s",
+ access, blkno, NAMEDATALEN, RelationGetRelationName(rel));
+ break;
+ }
+ }
+}
+
+static void
+_hash_unsetpagelock(Relation rel,
+ BlockNumber blkno,
+ int access)
+{
+ ItemPointerData iptr;
+
+ if (USELOCKING) {
+ ItemPointerSet(&iptr, blkno, 1);
+
+ switch (access) {
+ case HASH_WRITE:
+ RelationUnsetSingleWLockPage(rel, &iptr);
+ break;
+ case HASH_READ:
+ RelationUnsetSingleRLockPage(rel, &iptr);
+ break;
+ default:
+ elog(WARN, "_hash_unsetpagelock: invalid access (%d) on blk %x: %.*s",
+ access, blkno, NAMEDATALEN, RelationGetRelationName(rel));
+ break;
+ }
+ }
+}
+
+void
+_hash_pagedel(Relation rel, ItemPointer tid)
+{
+ Buffer buf;
+ Buffer metabuf;
+ Page page;
+ BlockNumber blkno;
+ OffsetNumber offno;
+ HashMetaPage metap;
+ HashPageOpaque opaque;
+
+ blkno = ItemPointerGetBlockNumber(tid);
+ offno = ItemPointerGetOffsetNumber(tid);
+
+ buf = _hash_getbuf(rel, blkno, HASH_WRITE);
+ page = BufferGetPage(buf);
+ _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+ opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+
+ PageIndexTupleDelete(page, offno);
+ _hash_wrtnorelbuf(rel, buf);
+
+ if (PageIsEmpty(page) && (opaque->hasho_flag & LH_OVERFLOW_PAGE)) {
+ buf = _hash_freeovflpage(rel, buf);
+ if (BufferIsValid(buf)) {
+ _hash_relbuf(rel, buf, HASH_WRITE);
+ }
+ } else {
+ _hash_relbuf(rel, buf, HASH_WRITE);
+ }
+
+ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
+ metap = (HashMetaPage) BufferGetPage(metabuf);
+ _hash_checkpage((Page) metap, LH_META_PAGE);
+ --metap->hashm_nkeys;
+ _hash_wrtbuf(rel, metabuf);
+}
+
+void
+_hash_expandtable(Relation rel, Buffer metabuf)
+{
+ HashMetaPage metap;
+ Bucket old_bucket;
+ Bucket new_bucket;
+ uint32 spare_ndx;
+
+/* elog(DEBUG, "_hash_expandtable: expanding..."); */
+
+ metap = (HashMetaPage) BufferGetPage(metabuf);
+ _hash_checkpage((Page) metap, LH_META_PAGE);
+
+ metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_READ, HASH_WRITE);
+ new_bucket = ++metap->MAX_BUCKET;
+ metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_WRITE, HASH_READ);
+ old_bucket = (metap->MAX_BUCKET & metap->LOW_MASK);
+
+ /*
+ * If the split point is increasing (MAX_BUCKET's log base 2
+ * increases), we need to copy the current contents of the spare
+ * split bucket to the next bucket.
+ */
+ spare_ndx = _hash_log2(metap->MAX_BUCKET + 1);
+ if (spare_ndx > metap->OVFL_POINT) {
+
+ metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_READ, HASH_WRITE);
+ metap->SPARES[spare_ndx] = metap->SPARES[metap->OVFL_POINT];
+ metap->OVFL_POINT = spare_ndx;
+ metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_WRITE, HASH_READ);
+ }
+
+ if (new_bucket > metap->HIGH_MASK) {
+
+ /* Starting a new doubling */
+ metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_READ, HASH_WRITE);
+ metap->LOW_MASK = metap->HIGH_MASK;
+ metap->HIGH_MASK = new_bucket | metap->LOW_MASK;
+ metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_WRITE, HASH_READ);
+
+ }
+ /* Relocate records to the new bucket */
+ _hash_splitpage(rel, metabuf, old_bucket, new_bucket);
+}
+
+
+/*
+ * _hash_splitpage -- split 'obucket' into 'obucket' and 'nbucket'
+ *
+ * this routine is actually misnamed -- we are splitting a bucket that
+ * consists of a base bucket page and zero or more overflow (bucket
+ * chain) pages.
+ */
+static void
+_hash_splitpage(Relation rel,
+ Buffer metabuf,
+ Bucket obucket,
+ Bucket nbucket)
+{
+ Bucket bucket;
+ Buffer obuf;
+ Buffer nbuf;
+ Buffer ovflbuf;
+ BlockNumber oblkno;
+ BlockNumber nblkno;
+ bool null;
+ Datum datum;
+ HashItem hitem;
+ HashPageOpaque oopaque;
+ HashPageOpaque nopaque;
+ HashMetaPage metap;
+ IndexTuple itup;
+ int itemsz;
+ OffsetNumber ooffnum;
+ OffsetNumber noffnum;
+ OffsetNumber omaxoffnum;
+ Page opage;
+ Page npage;
+ TupleDesc itupdesc;
+
+/* elog(DEBUG, "_hash_splitpage: splitting %d into %d,%d",
+ obucket, obucket, nbucket);
+*/
+ metap = (HashMetaPage) BufferGetPage(metabuf);
+ _hash_checkpage((Page) metap, LH_META_PAGE);
+
+ /* get the buffers & pages */
+ oblkno = BUCKET_TO_BLKNO(obucket);
+ nblkno = BUCKET_TO_BLKNO(nbucket);
+ obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
+ nbuf = _hash_getbuf(rel, nblkno, HASH_WRITE);
+ opage = BufferGetPage(obuf);
+ npage = BufferGetPage(nbuf);
+
+ /* initialize the new bucket */
+ _hash_pageinit(npage, BufferGetPageSize(nbuf));
+ nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
+ nopaque->hasho_prevblkno = InvalidBlockNumber;
+ nopaque->hasho_nextblkno = InvalidBlockNumber;
+ nopaque->hasho_flag = LH_BUCKET_PAGE;
+ nopaque->hasho_oaddr = InvalidOvflAddress;
+ nopaque->hasho_bucket = nbucket;
+ _hash_wrtnorelbuf(rel, nbuf);
+
+ /*
+ * make sure the old bucket isn't empty. advance 'opage' and
+ * friends through the overflow bucket chain until we find a
+ * non-empty page.
+ *
+ * XXX we should only need this once, if we are careful to
+ * preserve the invariant that overflow pages are never empty.
+ */
+ _hash_checkpage(opage, LH_BUCKET_PAGE);
+ oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
+ if (PageIsEmpty(opage)) {
+ oblkno = oopaque->hasho_nextblkno;
+ _hash_relbuf(rel, obuf, HASH_WRITE);
+ if (!BlockNumberIsValid(oblkno)) {
+ /*
+ * the old bucket is completely empty; of course, the new
+ * bucket will be as well, but since it's a base bucket
+ * page we don't care.
+ */
+ _hash_relbuf(rel, nbuf, HASH_WRITE);
+ return;
+ }
+ obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
+ opage = BufferGetPage(obuf);
+ _hash_checkpage(opage, LH_OVERFLOW_PAGE);
+ if (PageIsEmpty(opage)) {
+ elog(WARN, "_hash_splitpage: empty overflow page %d", oblkno);
+ }
+ oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
+ }
+
+ /*
+ * we are now guaranteed that 'opage' is not empty. partition the
+ * tuples in the old bucket between the old bucket and the new
+ * bucket, advancing along their respective overflow bucket chains
+ * and adding overflow pages as needed.
+ */
+ ooffnum = FirstOffsetNumber;
+ omaxoffnum = PageGetMaxOffsetNumber(opage);
+ for (;;) {
+ /*
+ * at each iteration through this loop, each of these variables
+ * should be up-to-date: obuf opage oopaque ooffnum omaxoffnum
+ */
+
+ /* check if we're at the end of the page */
+ if (ooffnum > omaxoffnum) {
+ /* at end of page, but check for overflow page */
+ oblkno = oopaque->hasho_nextblkno;
+ if (BlockNumberIsValid(oblkno)) {
+ /*
+ * we ran out of tuples on this particular page, but
+ * we have more overflow pages; re-init values.
+ */
+ _hash_wrtbuf(rel, obuf);
+ obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
+ opage = BufferGetPage(obuf);
+ _hash_checkpage(opage, LH_OVERFLOW_PAGE);
+ oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
+
+ /* we're guaranteed that an ovfl page has at least 1 tuple */
+ if (PageIsEmpty(opage)) {
+ elog(WARN, "_hash_splitpage: empty ovfl page %d!",
+ oblkno);
+ }
+ ooffnum = FirstOffsetNumber;
+ omaxoffnum = PageGetMaxOffsetNumber(opage);
+ } else {
+ /*
+ * we're at the end of the bucket chain, so now we're
+ * really done with everything. before quitting, call
+ * _hash_squeezebucket to ensure the tuples in the
+ * bucket (including the overflow pages) are packed as
+ * tightly as possible.
+ */
+ _hash_wrtbuf(rel, obuf);
+ _hash_wrtbuf(rel, nbuf);
+ _hash_squeezebucket(rel, metap, obucket);
+ return;
+ }
+ }
+
+ /* hash on the tuple */
+ hitem = (HashItem) PageGetItem(opage, PageGetItemId(opage, ooffnum));
+ itup = &(hitem->hash_itup);
+ itupdesc = RelationGetTupleDescriptor(rel);
+ datum = index_getattr(itup, 1, itupdesc, &null);
+ bucket = _hash_call(rel, metap, datum);
+
+ if (bucket == nbucket) {
+ /*
+ * insert the tuple into the new bucket. if it doesn't
+ * fit on the current page in the new bucket, we must
+ * allocate a new overflow page and place the tuple on
+ * that page instead.
+ */
+ itemsz = IndexTupleDSize(hitem->hash_itup)
+ + (sizeof(HashItemData) - sizeof(IndexTupleData));
+
+ itemsz = DOUBLEALIGN(itemsz);
+
+ if (PageGetFreeSpace(npage) < itemsz) {
+ ovflbuf = _hash_addovflpage(rel, &metabuf, nbuf);
+ _hash_wrtbuf(rel, nbuf);
+ nbuf = ovflbuf;
+ npage = BufferGetPage(nbuf);
+ _hash_checkpage(npage, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+ }
+
+ noffnum = OffsetNumberNext(PageGetMaxOffsetNumber(npage));
+ (void) PageAddItem(npage, (Item) hitem, itemsz, noffnum, LP_USED);
+ _hash_wrtnorelbuf(rel, nbuf);
+
+ /*
+ * now delete the tuple from the old bucket. after this
+ * section of code, 'ooffnum' will actually point to the
+ * ItemId to which we would point if we had advanced it
+ * before the deletion (PageIndexTupleDelete repacks the
+ * ItemId array). this also means that 'omaxoffnum' is
+ * exactly one less than it used to be, so we really can
+ * just decrement it instead of calling
+ * PageGetMaxOffsetNumber.
+ */
+ PageIndexTupleDelete(opage, ooffnum);
+ _hash_wrtnorelbuf(rel, obuf);
+ omaxoffnum = OffsetNumberPrev(omaxoffnum);
+
+ /*
+ * tidy up. if the old page was an overflow page and it
+ * is now empty, we must free it (we want to preserve the
+ * invariant that overflow pages cannot be empty).
+ */
+ if (PageIsEmpty(opage) &&
+ (oopaque->hasho_flag & LH_OVERFLOW_PAGE)) {
+ obuf = _hash_freeovflpage(rel, obuf);
+
+ /* check that we're not through the bucket chain */
+ if (BufferIsInvalid(obuf)) {
+ _hash_wrtbuf(rel, nbuf);
+ _hash_squeezebucket(rel, metap, obucket);
+ return;
+ }
+
+ /*
+ * re-init. again, we're guaranteed that an ovfl page
+ * has at least one tuple.
+ */
+ opage = BufferGetPage(obuf);
+ _hash_checkpage(opage, LH_OVERFLOW_PAGE);
+ oblkno = BufferGetBlockNumber(obuf);
+ oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
+ if (PageIsEmpty(opage)) {
+ elog(WARN, "_hash_splitpage: empty overflow page %d",
+ oblkno);
+ }
+ ooffnum = FirstOffsetNumber;
+ omaxoffnum = PageGetMaxOffsetNumber(opage);
+ }
+ } else {
+ /*
+ * the tuple stays on this page. we didn't move anything,
+ * so we didn't delete anything and therefore we don't
+ * have to change 'omaxoffnum'.
+ *
+ * XXX any hash value from [0, nbucket-1] will map to this
+ * bucket, which doesn't make sense to me.
+ */
+ ooffnum = OffsetNumberNext(ooffnum);
+ }
+ }
+ /*NOTREACHED*/
+}
diff --git a/src/backend/access/hash/hashscan.c b/src/backend/access/hash/hashscan.c
new file mode 100644
index 00000000000..c4cce0e70d9
--- /dev/null
+++ b/src/backend/access/hash/hashscan.c
@@ -0,0 +1,172 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashscan.c--
+ * manage scans on hash tables
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/hash/hashscan.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ * NOTES
+ * Because we can be doing an index scan on a relation while we
+ * update it, we need to avoid missing data that moves around in
+ * the index. The routines and global variables in this file
+ * guarantee that all scans in the local address space stay
+ * correctly positioned. This is all we need to worry about, since
+ * write locking guarantees that no one else will be on the same
+ * page at the same time as we are.
+ *
+ * The scheme is to manage a list of active scans in the current
+ * backend. Whenever we add or remove records from an index, we
+ * check the list of active scans to see if any has been affected.
+ * A scan is affected only if it is on the same relation, and the
+ * same page, as the update.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/sdir.h"
+#include "access/hash.h"
+
+static void _hash_scandel(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno);
+static bool _hash_scantouched(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno);
+
+typedef struct HashScanListData {
+ IndexScanDesc hashsl_scan;
+ struct HashScanListData *hashsl_next;
+} HashScanListData;
+
+typedef HashScanListData *HashScanList;
+
+static HashScanList HashScans = (HashScanList) NULL;
+
+/*
+ * _hash_regscan() -- register a new scan.
+ */
+void
+_hash_regscan(IndexScanDesc scan)
+{
+ HashScanList new_el;
+
+ new_el = (HashScanList) palloc(sizeof(HashScanListData));
+ new_el->hashsl_scan = scan;
+ new_el->hashsl_next = HashScans;
+ HashScans = new_el;
+}
+
+/*
+ * _hash_dropscan() -- drop a scan from the scan list
+ */
+void
+_hash_dropscan(IndexScanDesc scan)
+{
+ HashScanList chk, last;
+
+ last = (HashScanList) NULL;
+ for (chk = HashScans;
+ chk != (HashScanList) NULL && chk->hashsl_scan != scan;
+ chk = chk->hashsl_next) {
+ last = chk;
+ }
+
+ if (chk == (HashScanList) NULL)
+	elog(WARN, "hash scan list trashed; can't find 0x%lx", (long) scan);
+
+ if (last == (HashScanList) NULL)
+ HashScans = chk->hashsl_next;
+ else
+ last->hashsl_next = chk->hashsl_next;
+
+#ifdef PERFECT_MEM
+ pfree (chk);
+#endif /* PERFECT_MEM */
+}
+
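+/*
+ * _hash_adjscans() -- adjust all of this backend's active scans on the
+ *	given relation so that deleting the tuple at 'tid' does not leave
+ *	any scan mis-positioned.
+ */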
+void
+_hash_adjscans(Relation rel, ItemPointer tid)
+{
+ HashScanList l;
+ Oid relid;
+
+ relid = rel->rd_id;
+ for (l = HashScans; l != (HashScanList) NULL; l = l->hashsl_next) {
+ if (relid == l->hashsl_scan->relation->rd_id)
+ _hash_scandel(l->hashsl_scan, ItemPointerGetBlockNumber(tid),
+ ItemPointerGetOffsetNumber(tid));
+ }
+}
+
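+/*
+ * _hash_scandel() -- if this scan's current or marked position is on
+ *	page 'blkno' at or after 'offno', step the scan back one tuple.
+ *	PageIndexTupleDelete compacts the line pointer array, so a scan
+ *	left in place would otherwise skip the tuple that shifts into the
+ *	vacated slot.
+ */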
+static void
+_hash_scandel(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno)
+{
+ ItemPointer current;
+ Buffer buf;
+ Buffer metabuf;
+ HashScanOpaque so;
+
+ if (!_hash_scantouched(scan, blkno, offno))
+ return;
+
+ metabuf = _hash_getbuf(scan->relation, HASH_METAPAGE, HASH_READ);
+
+ so = (HashScanOpaque) scan->opaque;
+ buf = so->hashso_curbuf;
+
+ current = &(scan->currentItemData);
+ if (ItemPointerIsValid(current)
+ && ItemPointerGetBlockNumber(current) == blkno
+ && ItemPointerGetOffsetNumber(current) >= offno) {
+ _hash_step(scan, &buf, BackwardScanDirection, metabuf);
+ so->hashso_curbuf = buf;
+ }
+
+ current = &(scan->currentMarkData);
+ if (ItemPointerIsValid(current)
+ && ItemPointerGetBlockNumber(current) == blkno
+ && ItemPointerGetOffsetNumber(current) >= offno) {
+ ItemPointerData tmp;
+ tmp = *current;
+ *current = scan->currentItemData;
+ scan->currentItemData = tmp;
+ _hash_step(scan, &buf, BackwardScanDirection, metabuf);
+ so->hashso_mrkbuf = buf;
+ tmp = *current;
+ *current = scan->currentItemData;
+ scan->currentItemData = tmp;
+ }
+}
+
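+/*
+ * _hash_scantouched() -- is this scan's current or marked position on
+ *	page 'blkno', at or beyond 'offno'?
+ */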
+static bool
+_hash_scantouched(IndexScanDesc scan,
+ BlockNumber blkno,
+ OffsetNumber offno)
+{
+ ItemPointer current;
+
+ current = &(scan->currentItemData);
+ if (ItemPointerIsValid(current)
+ && ItemPointerGetBlockNumber(current) == blkno
+ && ItemPointerGetOffsetNumber(current) >= offno)
+ return (true);
+
+ current = &(scan->currentMarkData);
+ if (ItemPointerIsValid(current)
+ && ItemPointerGetBlockNumber(current) == blkno
+ && ItemPointerGetOffsetNumber(current) >= offno)
+ return (true);
+
+ return (false);
+}
diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c
new file mode 100644
index 00000000000..056235dec85
--- /dev/null
+++ b/src/backend/access/hash/hashsearch.c
@@ -0,0 +1,425 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashsearch.c--
+ * search code for postgres hash tables
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/hash/hashsearch.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "fmgr.h"
+
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/skey.h"
+#include "access/sdir.h"
+#include "access/hash.h"
+
+/*
+ * _hash_search() -- Finds the page/bucket that contains the
+ * scankey and loads it into *bufP.  The buffer has a read lock.
+ */
+void
+_hash_search(Relation rel,
+ int keysz,
+ ScanKey scankey,
+ Buffer *bufP,
+ HashMetaPage metap)
+{
+ BlockNumber blkno;
+ Datum keyDatum;
+ Bucket bucket;
+
+ if (scankey == (ScanKey) NULL ||
+ (keyDatum = scankey[0].sk_argument) == (Datum) NULL) {
+ /*
+ * If the scankey argument is NULL, all tuples will satisfy
+ * the scan so we start the scan at the first bucket (bucket
+ * 0).
+ */
+ bucket = 0;
+ } else {
+ bucket = _hash_call(rel, metap, keyDatum);
+ }
+
+ blkno = BUCKET_TO_BLKNO(bucket);
+
+ *bufP = _hash_getbuf(rel, blkno, HASH_READ);
+}
+
+/*
+ * _hash_next() -- Get the next item in a scan.
+ *
+ * On entry, we have a valid currentItemData in the scan, and a
+ * read lock on the page that contains that item. We do not have
+ * the page pinned. We return the next item in the scan. On
+ * exit, we have the page containing the next item locked but not
+ * pinned.
+ */
+RetrieveIndexResult
+_hash_next(IndexScanDesc scan, ScanDirection dir)
+{
+ Relation rel;
+ Buffer buf;
+ Buffer metabuf;
+ Page page;
+ OffsetNumber offnum;
+ RetrieveIndexResult res;
+ ItemPointer current;
+ ItemPointer iptr;
+ HashItem hitem;
+ IndexTuple itup;
+ HashScanOpaque so;
+
+ rel = scan->relation;
+ so = (HashScanOpaque) scan->opaque;
+ current = &(scan->currentItemData);
+
+ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
+
+ /*
+ * XXX 10 may 91: somewhere there's a bug in our management of the
+ * cached buffer for this scan. wei discovered it. the following
+ * is a workaround so he can work until i figure out what's going on.
+ */
+
+ if (!BufferIsValid(so->hashso_curbuf)) {
+ so->hashso_curbuf = _hash_getbuf(rel,
+ ItemPointerGetBlockNumber(current),
+ HASH_READ);
+ }
+
+ /* we still have the buffer pinned and locked */
+ buf = so->hashso_curbuf;
+
+ /*
+ * step to next valid tuple. note that _hash_step releases our
+ * lock on 'metabuf'; if we switch to a new 'buf' while looking
+ * for the next tuple, we come back with a lock on that buffer.
+ */
+ if (!_hash_step(scan, &buf, dir, metabuf)) {
+ return ((RetrieveIndexResult) NULL);
+ }
+
+ /* if we're here, _hash_step found a valid tuple */
+ current = &(scan->currentItemData);
+ offnum = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(buf);
+ _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+ hitem = (HashItem) PageGetItem(page, PageGetItemId(page, offnum));
+ itup = &hitem->hash_itup;
+ iptr = (ItemPointer) palloc(sizeof(ItemPointerData));
+ memmove((char *) iptr, (char *) &(itup->t_tid), sizeof(ItemPointerData));
+ res = FormRetrieveIndexResult(current, iptr);
+
+ return (res);
+}
+
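+/*
+ * _hash_readnext() -- advance to the next page in the bucket chain,
+ *	releasing the buffer currently held.  at the end of the chain,
+ *	*bufp is set to InvalidBuffer.
+ */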
+static void
+_hash_readnext(Relation rel,
+ Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
+{
+ BlockNumber blkno;
+
+ blkno = (*opaquep)->hasho_nextblkno;
+ _hash_relbuf(rel, *bufp, HASH_READ);
+ *bufp = InvalidBuffer;
+ if (BlockNumberIsValid(blkno)) {
+ *bufp = _hash_getbuf(rel, blkno, HASH_READ);
+ *pagep = BufferGetPage(*bufp);
+ _hash_checkpage(*pagep, LH_OVERFLOW_PAGE);
+ *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
+ Assert(!PageIsEmpty(*pagep));
+ }
+}
+
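+/*
+ * _hash_readprev() -- back up to the previous page in the bucket chain,
+ *	releasing the buffer currently held.  *bufp is set to InvalidBuffer
+ *	at the head of the chain, or if we back onto an empty bucket page.
+ */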
+static void
+_hash_readprev(Relation rel,
+ Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
+{
+ BlockNumber blkno;
+
+ blkno = (*opaquep)->hasho_prevblkno;
+ _hash_relbuf(rel, *bufp, HASH_READ);
+ *bufp = InvalidBuffer;
+ if (BlockNumberIsValid(blkno)) {
+ *bufp = _hash_getbuf(rel, blkno, HASH_READ);
+ *pagep = BufferGetPage(*bufp);
+ _hash_checkpage(*pagep, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+ *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
+ if (PageIsEmpty(*pagep)) {
+ Assert((*opaquep)->hasho_flag & LH_BUCKET_PAGE);
+ _hash_relbuf(rel, *bufp, HASH_READ);
+ *bufp = InvalidBuffer;
+ }
+ }
+}
+
+/*
+ * _hash_first() -- Find the first item in a scan.
+ *
+ * Return the RetrieveIndexResult of the first item in the index that
+ * satisfies the qualification associated with the scan descriptor. On
+ * exit, the page containing the current index tuple is read locked
+ * and pinned, and the scan's opaque data entry is updated to
+ * include the buffer.
+ */
+RetrieveIndexResult
+_hash_first(IndexScanDesc scan, ScanDirection dir)
+{
+ Relation rel;
+ Buffer buf;
+ Buffer metabuf;
+ Page page;
+ HashPageOpaque opaque;
+ HashMetaPage metap;
+ HashItem hitem;
+ IndexTuple itup;
+ ItemPointer current;
+ ItemPointer iptr;
+ OffsetNumber offnum;
+ RetrieveIndexResult res;
+ HashScanOpaque so;
+
+ rel = scan->relation;
+ so = (HashScanOpaque) scan->opaque;
+ current = &(scan->currentItemData);
+
+ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
+ metap = (HashMetaPage) BufferGetPage(metabuf);
+ _hash_checkpage((Page) metap, LH_META_PAGE);
+
+ /*
+ * XXX -- The attribute number stored in the scan key is the attno
+ * in the heap relation. We need to transmogrify this into
+ * the index relation attno here. For the moment, we have
+ * hardwired attno == 1.
+ */
+
+ /* find the correct bucket page and load it into buf */
+ _hash_search(rel, 1, scan->keyData, &buf, metap);
+ page = BufferGetPage(buf);
+ _hash_checkpage(page, LH_BUCKET_PAGE);
+ opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+
+ /*
+ * if we are scanning forward, we need to find the first non-empty
+ * page (if any) in the bucket chain. since overflow pages are
+ * never empty, this had better be either the bucket page or the
+ * first overflow page.
+ *
+ * if we are scanning backward, we always go all the way to the
+ * end of the bucket chain.
+ */
+ if (PageIsEmpty(page)) {
+ if (BlockNumberIsValid(opaque->hasho_nextblkno)) {
+ _hash_readnext(rel, &buf, &page, &opaque);
+	} else {
+	    ItemPointerSetInvalid(current);
+	    so->hashso_curbuf = InvalidBuffer;
+	    /* release the bucket page and the metapage before giving up */
+	    _hash_relbuf(rel, buf, HASH_READ);
+	    _hash_relbuf(rel, metabuf, HASH_READ);
+	    return ((RetrieveIndexResult) NULL);
+	}
+ }
+ if (ScanDirectionIsBackward(dir)) {
+ while (BlockNumberIsValid(opaque->hasho_nextblkno)) {
+ _hash_readnext(rel, &buf, &page, &opaque);
+ }
+ }
+
+ if (!_hash_step(scan, &buf, dir, metabuf)) {
+ return ((RetrieveIndexResult) NULL);
+ }
+
+ /* if we're here, _hash_step found a valid tuple */
+ current = &(scan->currentItemData);
+ offnum = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(buf);
+ _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+ hitem = (HashItem) PageGetItem(page, PageGetItemId(page, offnum));
+ itup = &hitem->hash_itup;
+ iptr = (ItemPointer) palloc(sizeof(ItemPointerData));
+ memmove((char *) iptr, (char *) &(itup->t_tid), sizeof(ItemPointerData));
+ res = FormRetrieveIndexResult(current, iptr);
+
+ return (res);
+}
+
+/*
+ * _hash_step() -- step to the next valid item in a scan in the bucket.
+ *
+ * If no valid record exists in the requested direction, return
+ * false. Else, return true and set the currentItemData for the
+ * scan to the right thing.
+ *
+ * 'bufP' points to the buffer which contains the current page
+ * that we'll step through.
+ *
+ * 'metabuf' is released when this returns.
+ */
+bool
+_hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir, Buffer metabuf)
+{
+ Relation rel;
+ ItemPointer current;
+ HashScanOpaque so;
+ int allbuckets;
+ HashMetaPage metap;
+ Buffer buf;
+ Page page;
+ HashPageOpaque opaque;
+ OffsetNumber maxoff;
+ OffsetNumber offnum;
+ Bucket bucket;
+ BlockNumber blkno;
+ HashItem hitem;
+ IndexTuple itup;
+
+ rel = scan->relation;
+ current = &(scan->currentItemData);
+ so = (HashScanOpaque) scan->opaque;
+ allbuckets = (scan->numberOfKeys < 1);
+
+ metap = (HashMetaPage) BufferGetPage(metabuf);
+ _hash_checkpage((Page) metap, LH_META_PAGE);
+
+ buf = *bufP;
+ page = BufferGetPage(buf);
+ _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+ opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+
+ /*
+ * If _hash_step is called from _hash_first, current will not be
+ * valid, so we can't dereference it. However, in that case, we
+ * presumably want to start at the beginning/end of the page...
+ */
+ maxoff = PageGetMaxOffsetNumber(page);
+ if (ItemPointerIsValid(current)) {
+ offnum = ItemPointerGetOffsetNumber(current);
+ } else {
+ offnum = InvalidOffsetNumber;
+ }
+
+ /*
+ * 'offnum' now points to the last tuple we have seen (if any).
+ *
+ * continue to step through tuples until:
+ * 1) we get to the end of the bucket chain or
+ * 2) we find a valid tuple.
+ */
+ do {
+ bucket = opaque->hasho_bucket;
+
+ switch (dir) {
+ case ForwardScanDirection:
+ if (offnum != InvalidOffsetNumber) {
+ offnum = OffsetNumberNext(offnum); /* move forward */
+ } else {
+ offnum = FirstOffsetNumber; /* new page */
+ }
+ while (offnum > maxoff) {
+ /*
+ * either this page is empty (maxoff ==
+ * InvalidOffsetNumber) or we ran off the end.
+ */
+ _hash_readnext(rel, &buf, &page, &opaque);
+ if (BufferIsInvalid(buf)) { /* end of chain */
+ if (allbuckets && bucket < metap->hashm_maxbucket) {
+ ++bucket;
+ blkno = BUCKET_TO_BLKNO(bucket);
+ buf = _hash_getbuf(rel, blkno, HASH_READ);
+ page = BufferGetPage(buf);
+ _hash_checkpage(page, LH_BUCKET_PAGE);
+ opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ Assert(opaque->hasho_bucket == bucket);
+ while (PageIsEmpty(page) &&
+ BlockNumberIsValid(opaque->hasho_nextblkno)) {
+ _hash_readnext(rel, &buf, &page, &opaque);
+ }
+ maxoff = PageGetMaxOffsetNumber(page);
+ offnum = FirstOffsetNumber;
+ } else {
+ maxoff = offnum = InvalidOffsetNumber;
+ break; /* while */
+ }
+ } else {
+ /* _hash_readnext never returns an empty page */
+ maxoff = PageGetMaxOffsetNumber(page);
+ offnum = FirstOffsetNumber;
+ }
+ }
+ break;
+ case BackwardScanDirection:
+ if (offnum != InvalidOffsetNumber) {
+ offnum = OffsetNumberPrev(offnum); /* move back */
+ } else {
+ offnum = maxoff; /* new page */
+ }
+ while (offnum < FirstOffsetNumber) {
+ /*
+ * either this page is empty (offnum ==
+ * InvalidOffsetNumber) or we ran off the end.
+ */
+ _hash_readprev(rel, &buf, &page, &opaque);
+ if (BufferIsInvalid(buf)) { /* end of chain */
+ if (allbuckets && bucket > 0) {
+ --bucket;
+ blkno = BUCKET_TO_BLKNO(bucket);
+ buf = _hash_getbuf(rel, blkno, HASH_READ);
+ page = BufferGetPage(buf);
+ _hash_checkpage(page, LH_BUCKET_PAGE);
+ opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ Assert(opaque->hasho_bucket == bucket);
+ while (BlockNumberIsValid(opaque->hasho_nextblkno)) {
+ _hash_readnext(rel, &buf, &page, &opaque);
+ }
+ maxoff = offnum = PageGetMaxOffsetNumber(page);
+ } else {
+ maxoff = offnum = InvalidOffsetNumber;
+ break; /* while */
+ }
+ } else {
+ /* _hash_readprev never returns an empty page */
+ maxoff = offnum = PageGetMaxOffsetNumber(page);
+ }
+ }
+ break;
+ default:
+ /* NoMovementScanDirection */
+ /* this should not be reached */
+ break;
+ }
+
+ /* we ran off the end of the world without finding a match */
+ if (offnum == InvalidOffsetNumber) {
+ _hash_relbuf(rel, metabuf, HASH_READ);
+ *bufP = so->hashso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(current);
+ return(false);
+ }
+
+ /* get ready to check this tuple */
+ hitem = (HashItem) PageGetItem(page, PageGetItemId(page, offnum));
+ itup = &hitem->hash_itup;
+ } while (!_hash_checkqual(scan, itup));
+
+ /* if we made it to here, we've found a valid tuple */
+ _hash_relbuf(rel, metabuf, HASH_READ);
+ blkno = BufferGetBlockNumber(buf);
+ *bufP = so->hashso_curbuf = buf;
+ ItemPointerSet(current, blkno, offnum);
+ return(true);
+}
diff --git a/src/backend/access/hash/hashstrat.c b/src/backend/access/hash/hashstrat.c
new file mode 100644
index 00000000000..cac2a58690e
--- /dev/null
+++ b/src/backend/access/hash/hashstrat.c
@@ -0,0 +1,104 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashstrat.c--
+ *    Strategy map entries for the hash indexed access method
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/hash/Attic/hashstrat.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/genam.h"
+#include "access/hash.h"
+
+/*
+ * only one valid strategy for hash tables: equality.
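+ * there is no "not equal" hash strategy, so the negate and
+ * negate-commute maps below contain only InvalidStrategy.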
+ */
+
+static StrategyNumber HTNegate[1] = {
+ InvalidStrategy
+};
+
+static StrategyNumber HTCommute[1] = {
+ HTEqualStrategyNumber
+};
+
+static StrategyNumber HTNegateCommute[1] = {
+ InvalidStrategy
+};
+
+static StrategyEvaluationData HTEvaluationData = {
+ /* XXX static for simplicity */
+
+ HTMaxStrategyNumber,
+ (StrategyTransformMap)HTNegate,
+ (StrategyTransformMap)HTCommute,
+ (StrategyTransformMap)HTNegateCommute,
+ {NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL}
+};
+
+/* ----------------------------------------------------------------
+ *	_hash_getstrat()
+ * ----------------------------------------------------------------
+ */
+
+StrategyNumber
+_hash_getstrat(Relation rel,
+ AttrNumber attno,
+ RegProcedure proc)
+{
+ StrategyNumber strat;
+
+ strat = RelationGetStrategy(rel, attno, &HTEvaluationData, proc);
+
+ Assert(StrategyNumberIsValid(strat));
+
+ return (strat);
+}
+
+bool
+_hash_invokestrat(Relation rel,
+ AttrNumber attno,
+ StrategyNumber strat,
+ Datum left,
+ Datum right)
+{
+ return (RelationInvokeStrategy(rel, &HTEvaluationData, attno, strat,
+ left, right));
+}
+
diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c
new file mode 100644
index 00000000000..f8f49fe7983
--- /dev/null
+++ b/src/backend/access/hash/hashutil.c
@@ -0,0 +1,147 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashutil.c--
+ *    Utility code for Postgres hash implementation.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/hash/hashutil.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "fmgr.h"
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/iqual.h"
+#include "access/hash.h"
+
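+/*
+ * _hash_mkscankey -- build an insertion scan key from an index tuple,
+ *	using the index's hash procedure (hashm_procid) as the procedure
+ *	for every attribute.
+ */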
+ScanKey
+_hash_mkscankey(Relation rel, IndexTuple itup, HashMetaPage metap)
+{
+ ScanKey skey;
+ TupleDesc itupdesc;
+ int natts;
+ AttrNumber i;
+ Datum arg;
+ RegProcedure proc;
+ bool null;
+
+ natts = rel->rd_rel->relnatts;
+ itupdesc = RelationGetTupleDescriptor(rel);
+
+ skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));
+
+ for (i = 0; i < natts; i++) {
+ arg = index_getattr(itup, i + 1, itupdesc, &null);
+ proc = metap->hashm_procid;
+ ScanKeyEntryInitialize(&skey[i],
+ 0x0, (AttrNumber) (i + 1), proc, arg);
+ }
+
+ return (skey);
+}
+
+void
+_hash_freeskey(ScanKey skey)
+{
+ pfree(skey);
+}
+
+
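+/*
+ * _hash_checkqual -- does this index tuple satisfy the scan's keys?
+ *	(trivially true if the scan has no keys.)
+ */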
+bool
+_hash_checkqual(IndexScanDesc scan, IndexTuple itup)
+{
+ if (scan->numberOfKeys > 0)
+ return (index_keytest(itup,
+ RelationGetTupleDescriptor(scan->relation),
+ scan->numberOfKeys, scan->keyData));
+ else
+ return (true);
+}
+
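+/*
+ * _hash_formitem -- copy an index tuple into a newly palloc'd HashItem.
+ *	null keys are not supported, so we elog(WARN) on them.
+ */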
+HashItem
+_hash_formitem(IndexTuple itup)
+{
+ int nbytes_hitem;
+ HashItem hitem;
+ Size tuplen;
+
+ /* disallow nulls in hash keys */
+ if (itup->t_info & INDEX_NULL_MASK)
+ elog(WARN, "hash indices cannot include null keys");
+
+ /* make a copy of the index tuple with room for the sequence number */
+ tuplen = IndexTupleSize(itup);
+ nbytes_hitem = tuplen +
+ (sizeof(HashItemData) - sizeof(IndexTupleData));
+
+ hitem = (HashItem) palloc(nbytes_hitem);
+ memmove((char *) &(hitem->hash_itup), (char *) itup, tuplen);
+
+ return (hitem);
+}
+
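+/*
+ * _hash_call -- compute the bucket for a key.  the key is hashed and
+ *	masked with hashm_highmask; if that names a bucket that has not
+ *	been created yet (greater than hashm_maxbucket), the value is
+ *	re-masked with hashm_lowmask.  e.g., with maxbucket = 5,
+ *	highmask = 7 and lowmask = 3, a hash value of 6 gives 6 & 7 = 6,
+ *	which is > 5, so the key lands in bucket 6 & 3 = 2.
+ */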
+Bucket
+_hash_call(Relation rel, HashMetaPage metap, Datum key)
+{
+ uint32 n;
+ Bucket bucket;
+ RegProcedure proc;
+
+ proc = metap->hashm_procid;
+ n = (uint32) fmgr(proc, key);
+ bucket = n & metap->hashm_highmask;
+ if (bucket > metap->hashm_maxbucket)
+ bucket = bucket & metap->hashm_lowmask;
+ return (bucket);
+}
+
+/*
+ * _hash_log2 -- returns ceil(lg2(num))
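+ *
+ *	e.g. _hash_log2(1) = 0, _hash_log2(2) = 1, _hash_log2(3) = 2,
+ *	_hash_log2(9) = 4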
+ */
+uint32
+_hash_log2(uint32 num)
+{
+ uint32 i, limit;
+
+ limit = 1;
+ for (i = 0; limit < num; limit = limit << 1, i++)
+ ;
+ return (i);
+}
+
+/*
+ * _hash_checkpage -- sanity checks on the format of all hash pages
+ */
+void
+_hash_checkpage(Page page, int flags)
+{
+ PageHeader ph = (PageHeader) page;
+ HashPageOpaque opaque;
+
+ Assert(page);
+ Assert(ph->pd_lower >= (sizeof(PageHeaderData) - sizeof(ItemIdData)));
+#if 1
+ Assert(ph->pd_upper <=
+ (BLCKSZ - DOUBLEALIGN(sizeof(HashPageOpaqueData))));
+ Assert(ph->pd_special ==
+ (BLCKSZ - DOUBLEALIGN(sizeof(HashPageOpaqueData))));
+ Assert(ph->pd_opaque.od_pagesize == BLCKSZ);
+#endif
+ if (flags) {
+ opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ Assert(opaque->hasho_flag & flags);
+ }
+}