aboutsummaryrefslogtreecommitdiff
path: root/src/backend/access/nbtree/nbtsort.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/access/nbtree/nbtsort.c')
-rw-r--r--src/backend/access/nbtree/nbtsort.c1196
1 files changed, 1196 insertions, 0 deletions
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
new file mode 100644
index 00000000000..3d2676324a0
--- /dev/null
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -0,0 +1,1196 @@
+/*-------------------------------------------------------------------------
+ * btsort.c--
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $Id: nbtsort.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
+ *
+ * NOTES
+ *
+ * what we do is:
+ * - generate a set of initial one-block runs, distributed round-robin
+ * between the output tapes.
+ * - for each pass,
+ * - swap input and output tape sets, rewinding both and truncating
+ * the output tapes.
+ * - merge the current run in each input tape to the current output
+ * tape.
+ * - when each input run has been exhausted, switch to another output
+ * tape and start processing another run.
+ * - when we have fewer runs than tapes, we know we are ready to start
+ * merging into the btree leaf pages.
+ * - every time we complete a level of the btree, we can construct the
+ * next level up. when we have only one page on a level, it can be
+ * attached to the btree metapage and we are done.
+ *
+ * conventions:
+ * - external interface routines take in and return "void *" for their
+ * opaque handles. this is for modularity reasons (i prefer not to
+ * export these structures without good reason).
+ *
+ * this code is moderately slow (~10% slower) compared to the regular
+ * btree (insertion) build code on sorted or well-clustered data. on
+ * random data, however, the insertion build code is unusable -- the
+ * difference on a 60MB heap is a factor of 15 because the random
+ * probes into the btree thrash the buffer pool.
+ *
+ * this code currently packs the pages to 100% of capacity. this is
+ * not wise, since *any* insertion will cause splitting. filling to
+ * something like the standard 70% steady-state load factor for btrees
+ * would probably be better.
+ *
+ * somebody desperately needs to figure out how to do a better job of
+ * balancing the merge passes -- the fan-in on the final merges can be
+ * pretty poor, which is bad for performance.
+ *-------------------------------------------------------------------------
+ */
+
+#include <stdio.h>
+
+#include "c.h"
+
+#include "access/nbtree.h"
+
+#include "storage/bufmgr.h"
+#include "storage/fd.h"
+#include "utils/rel.h"
+#include "utils/palloc.h"
+#include "utils/elog.h"
+
+/*#define FASTBUILD_DEBUG*/ /* turn on debugging output */
+
+#define FASTBUILD
+
+#ifdef FASTBUILD
+
+#define MAXTAPES (7)
+#define TAPEBLCKSZ (BLCKSZ << 2)
+#define TAPETEMP "pg_btsortXXXXXX"
+
+
+/*-------------------------------------------------------------------------
+ * sorting comparison routine - returns {-1,0,1} depending on whether
+ * the key in the left BTItem is {<,=,>} the key in the right BTItem.
+ *
+ * we want to use _bt_isortcmp as a comparison function for qsort(3),
+ * but it needs extra arguments, so we "pass them in" as global
+ * variables. ick. fortunately, they are the same throughout the
+ * build, so we need do this only once. this is why you must call
+ * _bt_isortcmpinit before the call to qsort(3).
+ *
+ * a NULL BTItem is always assumed to be greater than any actual
+ * value; our heap routines (see below) assume that the smallest
+ * element in the heap is returned. that way, NULL values from the
+ * exhausted tapes can sift down to the bottom of the heap. in point
+ * of fact we just don't replace the elements of exhausted tapes, but
+ * what the heck.
+ * *-------------------------------------------------------------------------
+ */
+static Relation _bt_sortrel;
+
+static void
+_bt_isortcmpinit(Relation index)
+{
+ _bt_sortrel = index;
+}
+
+static int
+_bt_isortcmp(BTItem *bti1p, BTItem *bti2p)
+{
+ BTItem bti1 = *bti1p;
+ BTItem bti2 = *bti2p;
+
+ if (bti1 == (BTItem) NULL) {
+ if (bti2 == (BTItem) NULL) {
+ return(0); /* 1 = 2 */
+ }
+ return(1); /* 1 > 2 */
+ } else if (bti2 == (BTItem) NULL) {
+ return(-1); /* 1 < 2 */
+ } else if (_bt_itemcmp(_bt_sortrel, 1, bti1, bti2,
+ BTGreaterStrategyNumber)) {
+ return(1); /* 1 > 2 */
+ } else if (_bt_itemcmp(_bt_sortrel, 1, bti2, bti1,
+ BTGreaterStrategyNumber)) {
+ return(-1); /* 1 < 2 */
+ }
+ return(0); /* 1 = 2 */
+}
+
+/*-------------------------------------------------------------------------
+ * priority queue methods
+ *
+ * these were more-or-less lifted from the heap section of the 1984
+ * edition of gonnet's book on algorithms and data structures. they
+ * are coded so that the smallest element in the heap is returned (we
+ * use them for merging sorted runs).
+ *
+ * XXX these probably ought to be generic library functions.
+ *-------------------------------------------------------------------------
+ */
+
+typedef struct {
+ int btpqe_tape; /* tape identifier */
+ BTItem btpqe_item; /* pointer to BTItem in tape buffer */
+} BTPriQueueElem;
+
+#define MAXELEM MAXTAPES
+typedef struct {
+ int btpq_nelem;
+ BTPriQueueElem btpq_queue[MAXELEM];
+ Relation btpq_rel;
+} BTPriQueue;
+
+/* be sure to call _bt_isortcmpinit first */
+#define GREATER(a, b) \
+ (_bt_isortcmp(&((a)->btpqe_item), &((b)->btpqe_item)) > 0)
+
+static void
+_bt_pqsift(BTPriQueue *q, int parent)
+{
+ int child;
+ BTPriQueueElem e;
+
+ for (child = parent * 2 + 1;
+ child < q->btpq_nelem;
+ child = parent * 2 + 1) {
+ if (child < q->btpq_nelem - 1) {
+ if (GREATER(&(q->btpq_queue[child]), &(q->btpq_queue[child+1]))) {
+ ++child;
+ }
+ }
+ if (GREATER(&(q->btpq_queue[parent]), &(q->btpq_queue[child]))) {
+ e = q->btpq_queue[child]; /* struct = */
+ q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */
+ q->btpq_queue[parent] = e; /* struct = */
+ parent = child;
+ } else {
+ parent = child + 1;
+ }
+ }
+}
+
+static int
+_bt_pqnext(BTPriQueue *q, BTPriQueueElem *e)
+{
+ if (q->btpq_nelem < 1) { /* already empty */
+ return(-1);
+ }
+ *e = q->btpq_queue[0]; /* struct = */
+
+ if (--q->btpq_nelem < 1) { /* now empty, don't sift */
+ return(0);
+ }
+ q->btpq_queue[0] = q->btpq_queue[q->btpq_nelem]; /* struct = */
+ _bt_pqsift(q, 0);
+ return(0);
+}
+
+static void
+_bt_pqadd(BTPriQueue *q, BTPriQueueElem *e)
+{
+ int child, parent;
+
+ if (q->btpq_nelem >= MAXELEM) {
+ elog(WARN, "_bt_pqadd: queue overflow");
+ }
+
+ child = q->btpq_nelem++;
+ while (child > 0) {
+ parent = child / 2;
+ if (GREATER(e, &(q->btpq_queue[parent]))) {
+ break;
+ } else {
+ q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */
+ child = parent;
+ }
+ }
+
+ q->btpq_queue[child] = *e; /* struct = */
+}
+
+/*-------------------------------------------------------------------------
+ * tape methods
+ *-------------------------------------------------------------------------
+ */
+
+#define BTITEMSZ(btitem) \
+ ((btitem) ? \
+ (IndexTupleDSize((btitem)->bti_itup) + \
+ (sizeof(BTItemData) - sizeof(IndexTupleData))) : \
+ 0)
+#define SPCLEFT(tape) \
+ (sizeof((tape)->bttb_data) - (tape)->bttb_top)
+#define EMPTYTAPE(tape) \
+ ((tape)->bttb_ntup <= 0)
+#define BTTAPEMAGIC 0x19660226
+
+/*
+ * this is what we use to shovel BTItems in and out of memory. it's
+ * bigger than a standard block because we are doing a lot of strictly
+ * sequential i/o. this is obviously something of a tradeoff since we
+ * are potentially reading a bunch of zeroes off of disk in many
+ * cases.
+ *
+ * BTItems are packed in and DOUBLEALIGN'd.
+ *
+ * the fd should not be going out to disk, strictly speaking, but it's
+ * the only thing like that so i'm not going to worry about wasting a
+ * few bytes.
+ */
+typedef struct {
+ int bttb_magic; /* magic number */
+ int bttb_fd; /* file descriptor */
+ int bttb_top; /* top of free space within bttb_data */
+ short bttb_ntup; /* number of tuples in this block */
+ short bttb_eor; /* End-Of-Run marker */
+ char bttb_data[TAPEBLCKSZ - 2 * sizeof(double)];
+} BTTapeBlock;
+
+
+/*
+ * reset the tape header for its next use without doing anything to
+ * the physical tape file. (setting bttb_top to 0 makes the block
+ * empty.)
+ */
+static void
+_bt_tapereset(BTTapeBlock *tape)
+{
+ tape->bttb_eor = 0;
+ tape->bttb_top = 0;
+ tape->bttb_ntup = 0;
+}
+
+/*
+ * rewind the physical tape file.
+ */
+static void
+_bt_taperewind(BTTapeBlock *tape)
+{
+ (void) FileSeek(tape->bttb_fd, 0, SEEK_SET);
+}
+
+/*
+ * destroy the contents of the physical tape file without destroying
+ * the tape data structure or removing the physical tape file.
+ *
+ * we use the VFD version of ftruncate(2) to do this rather than
+ * unlinking and recreating the file. you still have to wait while
+ * the OS frees up all of the file system blocks and stuff, but at
+ * least you don't have to delete and reinsert the directory entries.
+ */
+static void
+_bt_tapeclear(BTTapeBlock *tape)
+{
+ /* blow away the contents of the old file */
+ _bt_taperewind(tape);
+#if 0
+ FileSync(tape->bttb_fd);
+#endif
+ FileTruncate(tape->bttb_fd, 0);
+
+ /* reset the buffer */
+ _bt_tapereset(tape);
+}
+
+/*
+ * create a new BTTapeBlock, allocating memory for the data structure
+ * as well as opening a physical tape file.
+ */
+static BTTapeBlock *
+_bt_tapecreate(char *fname)
+{
+ BTTapeBlock *tape = (BTTapeBlock *) palloc(sizeof(BTTapeBlock));
+
+ if (tape == (BTTapeBlock *) NULL) {
+ elog(WARN, "_bt_tapecreate: out of memory");
+ }
+
+ tape->bttb_magic = BTTAPEMAGIC;
+
+ tape->bttb_fd = FileNameOpenFile(fname, O_RDWR|O_CREAT|O_TRUNC, 0600);
+ Assert(tape->bttb_fd >= 0);
+
+ /* initialize the buffer */
+ _bt_tapereset(tape);
+
+ return(tape);
+}
+
+/*
+ * destroy the BTTapeBlock structure and its physical tape file.
+ */
+static void
+_bt_tapedestroy(BTTapeBlock *tape)
+{
+ FileUnlink(tape->bttb_fd);
+ pfree((void *) tape);
+}
+
+/*
+ * flush the tape block to the file, marking End-Of-Run if requested.
+ */
+static void
+_bt_tapewrite(BTTapeBlock *tape, int eor)
+{
+ tape->bttb_eor = eor;
+ FileWrite(tape->bttb_fd, (char*)tape, TAPEBLCKSZ);
+ _bt_tapereset(tape);
+}
+
+/*
+ * read a tape block from the file, overwriting the current contents
+ * of the buffer.
+ *
+ * returns:
+ * - 0 if there are no more blocks in the tape or in this run (call
+ * _bt_tapereset to clear the End-Of-Run marker)
+ * - 1 if a valid block was read
+ */
+static int
+_bt_taperead(BTTapeBlock *tape)
+{
+ int fd;
+ int nread;
+
+ if (tape->bttb_eor) {
+ return(0); /* we are at End-Of-Run */
+ }
+
+ /*
+ * we're clobbering the old tape block, but we do need to save the
+ * VFD (the one in the block we're reading is bogus).
+ */
+ fd = tape->bttb_fd;
+ nread = FileRead(fd, (char*) tape, TAPEBLCKSZ);
+ tape->bttb_fd = fd;
+
+ if (nread != TAPEBLCKSZ) {
+ Assert(nread == 0); /* we are at EOF */
+ return(0);
+ }
+ Assert(tape->bttb_magic == BTTAPEMAGIC);
+ return(1);
+}
+
+/*
+ * get the next BTItem from a tape block.
+ *
+ * returns:
+ * - NULL if we have run out of BTItems
+ * - a pointer to the BTItemData in the block otherwise
+ *
+ * side effects:
+ * - sets 'pos' to the current position within the block.
+ */
+static BTItem
+_bt_tapenext(BTTapeBlock *tape, char **pos)
+{
+ Size itemsz;
+ BTItem bti;
+
+ if (*pos >= tape->bttb_data + tape->bttb_top) {
+ return((BTItem) NULL);
+ }
+ bti = (BTItem) *pos;
+ itemsz = BTITEMSZ(bti);
+ *pos += DOUBLEALIGN(itemsz);
+ return(bti);
+}
+
+/*
+ * copy a BTItem into a tape block.
+ *
+ * assumes that we have already checked to see if the block has enough
+ * space for the item.
+ *
+ * side effects:
+ *
+ * - advances the 'top' pointer in the tape block header to point to
+ * the beginning of free space.
+ */
+static void
+_bt_tapeadd(BTTapeBlock *tape, BTItem item, int itemsz)
+{
+ (void) memcpy(tape->bttb_data + tape->bttb_top, item, itemsz);
+ ++tape->bttb_ntup;
+ tape->bttb_top += DOUBLEALIGN(itemsz);
+}
+
+/*-------------------------------------------------------------------------
+ * spool methods
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * this structure holds the bookkeeping for a simple balanced multiway
+ * merge. (polyphase merging is hairier than i want to get into right
+ * now, and i don't see why i have to care how many "tapes" i use
+ * right now. though if psort was in a condition that i could hack it
+ * to do this, you bet i would.)
+ */
+typedef struct {
+ int bts_ntapes;
+ int bts_tape;
+ BTTapeBlock **bts_itape; /* input tape blocks */
+ BTTapeBlock **bts_otape; /* output tape blocks */
+} BTSpool;
+
+/*
+ * create and initialize a spool structure, including the underlying
+ * files.
+ */
+void *
+_bt_spoolinit(Relation index, int ntapes)
+{
+ char *mktemp();
+
+ BTSpool *btspool = (BTSpool *) palloc(sizeof(BTSpool));
+ int i;
+ char *fname = (char *) palloc(sizeof(TAPETEMP) + 1);
+
+ if (btspool == (BTSpool *) NULL || fname == (char *) NULL) {
+ elog(WARN, "_bt_spoolinit: out of memory");
+ }
+ (void) memset((char *) btspool, 0, sizeof(BTSpool));
+ btspool->bts_ntapes = ntapes;
+ btspool->bts_tape = 0;
+
+ btspool->bts_itape =
+ (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes);
+ btspool->bts_otape =
+ (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes);
+ if (btspool->bts_itape == (BTTapeBlock **) NULL ||
+ btspool->bts_otape == (BTTapeBlock **) NULL) {
+ elog(WARN, "_bt_spoolinit: out of memory");
+ }
+
+ for (i = 0; i < ntapes; ++i) {
+ btspool->bts_itape[i] =
+ _bt_tapecreate(mktemp(strcpy(fname, TAPETEMP)));
+ btspool->bts_otape[i] =
+ _bt_tapecreate(mktemp(strcpy(fname, TAPETEMP)));
+ }
+ pfree((void *) fname);
+
+ _bt_isortcmpinit(index);
+
+ return((void *) btspool);
+}
+
+/*
+ * clean up a spool structure and its substructures.
+ */
+void
+_bt_spooldestroy(void *spool)
+{
+ BTSpool *btspool = (BTSpool *) spool;
+ int i;
+
+ for (i = 0; i < btspool->bts_ntapes; ++i) {
+ _bt_tapedestroy(btspool->bts_otape[i]);
+ _bt_tapedestroy(btspool->bts_itape[i]);
+ }
+ pfree((void *) btspool);
+}
+
+/*
+ * flush out any dirty output tape blocks
+ */
+static void
+_bt_spoolflush(BTSpool *btspool)
+{
+ int i;
+
+ for (i = 0; i < btspool->bts_ntapes; ++i) {
+ if (!EMPTYTAPE(btspool->bts_otape[i])) {
+ _bt_tapewrite(btspool->bts_otape[i], 1);
+ }
+ }
+}
+
+/*
+ * swap input tapes and output tapes by swapping their file
+ * descriptors. additional preparation for the next merge pass
+ * includes rewinding the new input tapes and clearing out the new
+ * output tapes.
+ */
+static void
+_bt_spoolswap(BTSpool *btspool)
+{
+ File tmpfd;
+ BTTapeBlock *itape;
+ BTTapeBlock *otape;
+ int i;
+
+ for (i = 0; i < btspool->bts_ntapes; ++i) {
+ itape = btspool->bts_itape[i];
+ otape = btspool->bts_otape[i];
+
+ /*
+ * swap the input and output VFDs.
+ */
+ tmpfd = itape->bttb_fd;
+ itape->bttb_fd = otape->bttb_fd;
+ otape->bttb_fd = tmpfd;
+
+ /*
+ * rewind the new input tape.
+ */
+ _bt_taperewind(itape);
+ _bt_tapereset(itape);
+
+ /*
+ * clear the new output tape -- it's ok to throw away the old
+ * inputs.
+ */
+ _bt_tapeclear(otape);
+ }
+}
+
+/*-------------------------------------------------------------------------
+ * sorting routines
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * spool 'btitem' into an initial run. as tape blocks are filled, the
+ * block BTItems are qsorted and written into some output tape (it
+ * doesn't matter which; we go round-robin for simplicity). the
+ * initial runs are therefore always just one block.
+ */
+void
+_bt_spool(Relation index, BTItem btitem, void *spool)
+{
+ BTSpool *btspool = (BTSpool *) spool;
+ BTTapeBlock *itape;
+ Size itemsz;
+
+ itape = btspool->bts_itape[btspool->bts_tape];
+ itemsz = BTITEMSZ(btitem);
+ itemsz = DOUBLEALIGN(itemsz);
+
+ /*
+ * if this buffer is too full for this BTItemData, or if we have
+ * run out of BTItems, we need to sort the buffer and write it
+ * out. in this case, the BTItemData will go into the next tape's
+ * buffer.
+ */
+ if (btitem == (BTItem) NULL || SPCLEFT(itape) < itemsz) {
+ BTItem *parray;
+ BTTapeBlock *otape;
+ BTItem bti;
+ char *pos;
+ int btisz;
+ int i;
+
+ /*
+ * build an array of pointers to the BTItemDatas on the input
+ * block.
+ */
+ parray = (BTItem *) palloc(itape->bttb_ntup * sizeof(BTItem));
+ if (parray == (BTItem *) NULL) {
+ elog(WARN, "_bt_spool: out of memory");
+ }
+ pos = itape->bttb_data;
+ for (i = 0; i < itape->bttb_ntup; ++i) {
+ parray[i] = _bt_tapenext(itape, &pos);
+ }
+
+ /*
+ * qsort the pointer array.
+ */
+ _bt_isortcmpinit(index);
+ qsort((void *) parray, itape->bttb_ntup, sizeof(BTItem), _bt_isortcmp);
+
+ /*
+ * write the spooled run into the output tape. we copy the
+ * BTItemDatas in the order dictated by the sorted array of
+ * BTItems, not the original order.
+ *
+ * (since everything was DOUBLEALIGN'd and is all on a single
+ * page, everything had *better* still fit on one page..)
+ */
+ otape = btspool->bts_otape[btspool->bts_tape];
+ for (i = 0; i < itape->bttb_ntup; ++i) {
+ bti = parray[i];
+ btisz = BTITEMSZ(bti);
+ btisz = DOUBLEALIGN(btisz);
+ _bt_tapeadd(otape, bti, btisz);
+#ifdef FASTBUILD_DEBUG
+ {
+ bool isnull;
+ Datum d = index_getattr(&(bti->bti_itup), 1,
+ RelationGetTupleDescriptor(index),
+ &isnull);
+ printf("_bt_spool: inserted <%x> into output tape %d\n",
+ d, btspool->bts_tape);
+ }
+#endif /* FASTBUILD_DEBUG */
+ }
+
+ /*
+ * the initial runs are always single tape blocks. flush the
+ * output block, marking End-Of-Run.
+ */
+ _bt_tapewrite(otape, 1);
+
+ /*
+ * reset the input buffer for the next run. we don't have to
+ * write it out or anything -- we only use it to hold the
+ * unsorted BTItemDatas, the output tape contains all the
+ * sorted stuff.
+ *
+ * changing bts_tape changes the output tape and input tape;
+ * we change itape for the code below.
+ */
+ _bt_tapereset(itape);
+ btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes;
+ itape = btspool->bts_itape[btspool->bts_tape];
+
+ /*
+ * destroy the pointer array.
+ */
+ pfree((void *) parray);
+ }
+
+ /* insert this item into the current buffer */
+ if (btitem != (BTItem) NULL) {
+ _bt_tapeadd(itape, btitem, itemsz);
+ }
+}
+
+/*
+ * allocate a new, clean btree page, not linked to any siblings.
+ */
+static void
+_bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags)
+{
+ BTPageOpaque opaque;
+
+ *buf = _bt_getbuf(index, P_NEW, BT_WRITE);
+ *page = BufferGetPage(*buf);
+ _bt_pageinit(*page, BufferGetPageSize(*buf));
+ opaque = (BTPageOpaque) PageGetSpecialPointer(*page);
+ opaque->btpo_prev = opaque->btpo_next = P_NONE;
+ opaque->btpo_flags = flags;
+}
+
+/*
+ * slide an array of ItemIds back one slot (from P_FIRSTKEY to
+ * P_HIKEY). we need to do this when we discover that we have built
+ * an ItemId array in what has turned out to be a P_RIGHTMOST page.
+ */
+static void
+_bt_slideleft(Relation index, Buffer buf, Page page)
+{
+ OffsetNumber off;
+ OffsetNumber maxoff;
+ ItemId previi;
+ ItemId thisii;
+
+ maxoff = PageGetMaxOffsetNumber(page);
+ previi = PageGetItemId(page, P_HIKEY);
+ for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off)) {
+ thisii = PageGetItemId(page, off);
+ *previi = *thisii;
+ previi = thisii;
+ }
+ ((PageHeader) page)->pd_lower -= sizeof(ItemIdData);
+}
+
+typedef struct {
+ Buffer btps_buf;
+ Page btps_page;
+ BTItem btps_lastbti;
+ OffsetNumber btps_lastoff;
+ OffsetNumber btps_firstoff;
+} BTPageState;
+
+/*
+ * add an item to a disk page from a merge tape block.
+ *
+ * we must be careful to observe the following restrictions, placed
+ * upon us by the conventions in nbtsearch.c:
+ * - rightmost pages start data items at P_HIKEY instead of at
+ * P_FIRSTKEY.
+ * - duplicates cannot be split among pages unless the chain of
+ * duplicates starts at the first data item.
+ *
+ * a leaf page being built looks like:
+ *
+ * +----------------+---------------------------------+
+ * | PageHeaderData | linp0 linp1 linp2 ... |
+ * +-----------+----+---------------------------------+
+ * | ... linpN | ^ first |
+ * +-----------+--------------------------------------+
+ * | ^ last |
+ * | |
+ * | v last |
+ * +-------------+------------------------------------+
+ * | | itemN ... |
+ * +-------------+------------------+-----------------+
+ * | ... item3 item2 item1 | "special space" |
+ * +--------------------------------+-----------------+
+ * ^ first
+ *
+ * contrast this with the diagram in bufpage.h; note the mismatch
+ * between linps and items. this is because we reserve linp0 as a
+ * placeholder for the pointer to the "high key" item; when we have
+ * filled up the page, we will set linp0 to point to itemN and clear
+ * linpN.
+ *
+ * 'last' pointers indicate the last offset/item added to the page.
+ * 'first' pointers indicate the first offset/item that is part of a
+ * chain of duplicates extending from 'first' to 'last'.
+ *
+ * if all keys are unique, 'first' will always be the same as 'last'.
+ */
+static void
+_bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags)
+{
+ Buffer nbuf;
+ Page npage;
+ BTItem last_bti;
+ OffsetNumber first_off;
+ OffsetNumber last_off;
+ OffsetNumber off;
+ Size pgspc;
+ Size btisz;
+
+ nbuf = state->btps_buf;
+ npage = state->btps_page;
+ first_off = state->btps_firstoff;
+ last_off = state->btps_lastoff;
+ last_bti = state->btps_lastbti;
+
+ pgspc = PageGetFreeSpace(npage);
+ btisz = BTITEMSZ(bti);
+ btisz = DOUBLEALIGN(btisz);
+ if (pgspc < btisz) {
+ Buffer obuf = nbuf;
+ Page opage = npage;
+ OffsetNumber o, n;
+ ItemId ii;
+ ItemId hii;
+
+ _bt_blnewpage(index, &nbuf, &npage, flags);
+
+ /*
+ * if 'last' is part of a chain of duplicates that does not
+ * start at the beginning of the old page, the entire chain is
+ * copied to the new page; we delete all of the duplicates
+ * from the old page except the first, which becomes the high
+ * key item of the old page.
+ *
+ * if the chain starts at the beginning of the page or there
+ * is no chain ('first' == 'last'), we need only copy 'last'
+ * to the new page. again, 'first' (== 'last') becomes the
+ * high key of the old page.
+ *
+ * note that in either case, we copy at least one item to the
+ * new page, so 'last_bti' will always be valid. 'bti' will
+ * never be the first data item on the new page.
+ */
+ if (first_off == P_FIRSTKEY) {
+ Assert(last_off != P_FIRSTKEY);
+ first_off = last_off;
+ }
+ for (o = first_off, n = P_FIRSTKEY;
+ o <= last_off;
+ o = OffsetNumberNext(o), n = OffsetNumberNext(n)) {
+ ii = PageGetItemId(opage, o);
+ (void) PageAddItem(npage, PageGetItem(opage, ii),
+ ii->lp_len, n, LP_USED);
+#ifdef FASTBUILD_DEBUG
+ {
+ bool isnull;
+ BTItem tmpbti =
+ (BTItem) PageGetItem(npage, PageGetItemId(npage, n));
+ Datum d = index_getattr(&(tmpbti->bti_itup), 1,
+ RelationGetTupleDescriptor(index),
+ &isnull);
+ printf("_bt_buildadd: moved <%x> to offset %d\n",
+ d, n);
+ }
+#endif /* FASTBUILD_DEBUG */
+ }
+ for (o = last_off; o > first_off; o = OffsetNumberPrev(o)) {
+ PageIndexTupleDelete(opage, o);
+ }
+ hii = PageGetItemId(opage, P_HIKEY);
+ ii = PageGetItemId(opage, first_off);
+ *hii = *ii;
+ ii->lp_flags &= ~LP_USED;
+ ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
+
+ first_off = P_FIRSTKEY;
+ last_off = PageGetMaxOffsetNumber(npage);
+ last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, last_off));
+
+ /*
+ * set the page (side link) pointers.
+ */
+ {
+ BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
+ BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage);
+
+ oopaque->btpo_next = BufferGetBlockNumber(nbuf);
+ nopaque->btpo_prev = BufferGetBlockNumber(obuf);
+ nopaque->btpo_next = P_NONE;
+ }
+
+ /*
+ * write out the old stuff. we never want to see it again, so
+ * we can give up our lock (if we had one; BuildingBtree is
+ * set, so we aren't locking).
+ */
+ _bt_wrtbuf(index, obuf);
+ }
+
+ /*
+ * if this item is different from the last item added, we start a
+ * new chain of duplicates.
+ */
+ off = OffsetNumberNext(last_off);
+ (void) PageAddItem(npage, (Item) bti, btisz, off, LP_USED);
+#ifdef FASTBUILD_DEBUG
+ {
+ bool isnull;
+ Datum d = index_getattr(&(bti->bti_itup), 1,
+ RelationGetTupleDescriptor(index),
+ &isnull);
+ printf("_bt_buildadd: inserted <%x> at offset %d\n",
+ d, off);
+ }
+#endif /* FASTBUILD_DEBUG */
+ if (last_bti == (BTItem) NULL) {
+ first_off = P_FIRSTKEY;
+ } else if (!_bt_itemcmp(index, 1, bti, last_bti, BTEqualStrategyNumber)) {
+ first_off = off;
+ }
+ last_off = off;
+ last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, off));
+
+ state->btps_buf = nbuf;
+ state->btps_page = npage;
+ state->btps_lastbti = last_bti;
+ state->btps_lastoff = last_off;
+ state->btps_firstoff = first_off;
+}
+
+/*
+ * take the input tapes stored by 'btspool' and perform successive
+ * merging passes until at most one run is left in each tape. at that
+ * point, merge the final tape runs into a set of btree leaves.
+ *
+ * XXX three nested loops? gross. cut me up into smaller routines.
+ */
+static BlockNumber
+_bt_merge(Relation index, BTSpool *btspool)
+{
+ BTPageState state;
+ BlockNumber firstblk;
+ BTPriQueue q;
+ BTPriQueueElem e;
+ BTItem bti;
+ BTTapeBlock *itape;
+ BTTapeBlock *otape;
+ char *tapepos[MAXTAPES];
+ int tapedone[MAXTAPES];
+ int t;
+ int goodtapes;
+ int nruns;
+ Size btisz;
+ bool doleaf = false;
+
+ /*
+ * initialize state needed for the merge into the btree leaf pages.
+ */
+ (void) memset((char *) &state, 0, sizeof(BTPageState));
+ _bt_blnewpage(index, &(state.btps_buf), &(state.btps_page), BTP_LEAF);
+ state.btps_lastoff = P_HIKEY;
+ state.btps_lastbti = (BTItem) NULL;
+ firstblk = BufferGetBlockNumber(state.btps_buf);
+
+ do { /* pass */
+ /*
+ * each pass starts by flushing the previous outputs and
+ * swapping inputs and outputs. this process also clears the
+ * new output tapes and rewinds the new input tapes.
+ */
+ btspool->bts_tape = btspool->bts_ntapes - 1;
+ _bt_spoolflush(btspool);
+ _bt_spoolswap(btspool);
+
+ nruns = 0;
+
+ for (;;) { /* run */
+ /*
+ * each run starts by selecting a new output tape. the
+ * merged results of a given run are always sent to this
+ * one tape.
+ */
+ btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes;
+ otape = btspool->bts_otape[btspool->bts_tape];
+
+ /*
+ * initialize the priority queue by loading it with the
+ * first element of the given run in each tape. since we
+ * are starting a new run, we reset the tape (clearing the
+ * End-Of-Run marker) before reading it. this means that
+ * _bt_taperead will return 0 only if the tape is actually
+ * at EOF.
+ */
+ (void) memset((char *) &q, 0, sizeof(BTPriQueue));
+ goodtapes = 0;
+ for (t = 0; t < btspool->bts_ntapes; ++t) {
+ itape = btspool->bts_itape[t];
+ tapepos[t] = itape->bttb_data;
+ _bt_tapereset(itape);
+ if (_bt_taperead(itape) == 0) {
+ tapedone[t] = 1;
+ } else {
+ ++goodtapes;
+ tapedone[t] = 0;
+ e.btpqe_tape = t;
+ e.btpqe_item = _bt_tapenext(itape, &tapepos[t]);
+ if (e.btpqe_item != (BTItem) NULL) {
+ _bt_pqadd(&q, &e);
+ }
+ }
+ }
+ /*
+ * if we don't have any tapes with any input (i.e., they
+ * are all at EOF), we must be done with this pass.
+ */
+ if (goodtapes == 0) {
+ break; /* for */
+ }
+ ++nruns;
+
+ /*
+ * output the smallest element from the queue until there are no
+ * more.
+ */
+ while (_bt_pqnext(&q, &e) >= 0) { /* item */
+ /*
+ * replace the element taken from priority queue,
+ * fetching a new block if needed. a tape can run out
+ * if it hits either End-Of-Run or EOF.
+ */
+ t = e.btpqe_tape;
+ bti = e.btpqe_item;
+ if (bti != (BTItem) NULL) {
+ btisz = BTITEMSZ(bti);
+ btisz = DOUBLEALIGN(btisz);
+ if (doleaf) {
+ _bt_buildadd(index, &state, bti, BTP_LEAF);
+#ifdef FASTBUILD_DEBUG
+ {
+ bool isnull;
+ Datum d = index_getattr(&(bti->bti_itup), 1,
+ RelationGetTupleDescriptor(index),
+ &isnull);
+ printf("_bt_merge: inserted <%x> into block %d\n",
+ d, BufferGetBlockNumber(state.btps_buf));
+ }
+#endif /* FASTBUILD_DEBUG */
+ } else {
+ if (SPCLEFT(otape) < btisz) {
+ /*
+ * if it's full, write it out and add the
+ * item to the next block. (since we know
+ * there will be at least one more block,
+ * we know we do *not* want to set
+ * End-Of-Run here!)
+ */
+ _bt_tapewrite(otape, 0);
+ }
+ _bt_tapeadd(otape, bti, btisz);
+#ifdef FASTBUILD_DEBUG
+ {
+ bool isnull;
+ Datum d = index_getattr(&(bti->bti_itup), 1,
+ RelationGetTupleDescriptor(index), &isnull);
+ printf("_bt_merge: inserted <%x> into tape %d\n",
+ d, btspool->bts_tape);
+ }
+#endif /* FASTBUILD_DEBUG */
+ }
+ }
+#ifdef FASTBUILD_DEBUG
+ {
+ bool isnull;
+ Datum d = index_getattr(&(bti->bti_itup), 1,
+ RelationGetTupleDescriptor(index),
+ &isnull);
+ printf("_bt_merge: got <%x> from tape %d\n", d, t);
+ }
+#endif /* FASTBUILD_DEBUG */
+
+ itape = btspool->bts_itape[t];
+ if (!tapedone[t]) {
+ BTItem newbti = _bt_tapenext(itape, &tapepos[t]);
+
+ if (newbti == (BTItem) NULL) {
+ if (_bt_taperead(itape) == 0) {
+ tapedone[t] = 1;
+ } else {
+ tapepos[t] = itape->bttb_data;
+ newbti = _bt_tapenext(itape, &tapepos[t]);
+ }
+ }
+ if (newbti != (BTItem) NULL) {
+ BTPriQueueElem nexte;
+
+ nexte.btpqe_tape = t;
+ nexte.btpqe_item = newbti;
+ _bt_pqadd(&q, &nexte);
+ }
+ }
+ } /* item */
+ } /* run */
+
+ /*
+ * we are here because we ran out of input on all of the input
+ * tapes.
+ *
+ * if this pass did not generate more actual output runs than
+ * we have tapes, we know we have at most one run in each
+ * tape. this means that we are ready to merge into the final
+ * btree leaf pages instead of merging into a tape file.
+ */
+ if (nruns <= btspool->bts_ntapes) {
+ doleaf = true;
+ }
+ } while (nruns > 0); /* pass */
+
+ /*
+ * this is the rightmost page, so the ItemId array needs to be
+ * slid back one slot.
+ */
+ _bt_slideleft(index, state.btps_buf, state.btps_page);
+ _bt_wrtbuf(index, state.btps_buf);
+
+ return(firstblk);
+}
+
+
+/*
+ * given the block number 'blk' of the first page of a set of linked
+ * siblings (i.e., the start of an entire level of the btree),
+ * construct the corresponding next level of the btree. we do this by
+ * placing minimum keys from each page into this page. the format of
+ * the internal pages is otherwise the same as for leaf pages.
+ */
+void
+_bt_upperbuild(Relation index, BlockNumber blk, int level)
+{
+ Buffer rbuf;
+ Page rpage;
+ BTPageOpaque ropaque;
+ BTPageState state;
+ BlockNumber firstblk;
+ BTItem bti;
+ BTItem nbti;
+ OffsetNumber off;
+
+ rbuf = _bt_getbuf(index, blk, BT_WRITE);
+ rpage = BufferGetPage(rbuf);
+ ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage);
+
+ /*
+ * if we only have one page on a level, we can just make it the
+ * root.
+ */
+ if (P_RIGHTMOST(ropaque)) {
+ ropaque->btpo_flags |= BTP_ROOT;
+ _bt_wrtbuf(index, rbuf);
+ _bt_metaproot(index, blk);
+ return;
+ }
+ _bt_relbuf(index, rbuf, BT_WRITE);
+
+ (void) memset((char *) &state, 0, sizeof(BTPageState));
+ _bt_blnewpage(index, &(state.btps_buf), &(state.btps_page), 0);
+ state.btps_lastoff = P_HIKEY;
+ state.btps_lastbti = (BTItem) NULL;
+ firstblk = BufferGetBlockNumber(state.btps_buf);
+
+ /* for each page... */
+ do {
+ rbuf = _bt_getbuf(index, blk, BT_READ);
+ rpage = BufferGetPage(rbuf);
+ ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage);
+
+ /* for each item... */
+ if (!PageIsEmpty(rpage)) {
+ /*
+ * form a new index tuple corresponding to the minimum key
+ * of the lower page and insert it into a page at this
+ * level.
+ */
+ off = P_RIGHTMOST(ropaque) ? P_HIKEY : P_FIRSTKEY;
+ bti = (BTItem) PageGetItem(rpage, PageGetItemId(rpage, off));
+ nbti = _bt_formitem(&(bti->bti_itup));
+ ItemPointerSet(&(nbti->bti_itup.t_tid), blk, P_HIKEY);
+#ifdef FASTBUILD_DEBUG
+ {
+ bool isnull;
+ Datum d = index_getattr(&(nbti->bti_itup), 1,
+ RelationGetTupleDescriptor(index),
+ &isnull);
+ printf("_bt_upperbuild: inserting <%x> at %d\n",
+ d, level);
+ }
+#endif /* FASTBUILD_DEBUG */
+ _bt_buildadd(index, &state, nbti, 0);
+ pfree((void *) nbti);
+ }
+ blk = ropaque->btpo_next;
+ _bt_relbuf(index, rbuf, BT_READ);
+ } while (blk != P_NONE);
+
+ /*
+ * this is the rightmost page, so the ItemId array needs to be
+ * slid back one slot.
+ */
+ _bt_slideleft(index, state.btps_buf, state.btps_page);
+ _bt_wrtbuf(index, state.btps_buf);
+
+ _bt_upperbuild(index, firstblk, level + 1);
+}
+
+/*
+ * given a spool loading by successive calls to _bt_spool, create an
+ * entire btree.
+ */
+void
+_bt_leafbuild(Relation index, void *spool)
+{
+ BTSpool *btspool = (BTSpool *) spool;
+ BlockNumber firstblk;
+
+ /*
+ * merge the runs into btree leaf pages.
+ */
+ firstblk = _bt_merge(index, btspool);
+
+ /*
+ * build the upper levels of the btree.
+ */
+ _bt_upperbuild(index, firstblk, 0);
+}
+
+#else /* !FASTBUILD */
+
+void *_bt_spoolinit(Relation index, int ntapes) { return((void *) NULL); }
+void _bt_spooldestroy(void *spool) { }
+void _bt_spool(Relation index, BTItem btitem, void *spool) { }
+void _bt_upperbuild(Relation index, BlockNumber blk, int level) { }
+void _bt_leafbuild(Relation index, void *spool) { }
+
+#endif /* !FASTBUILD */