Diffstat (limited to 'src/backend/access/nbtree/nbtsort.c')
-rw-r--r--  src/backend/access/nbtree/nbtsort.c | 1196
1 files changed, 1196 insertions, 0 deletions
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c new file mode 100644 index 00000000000..3d2676324a0 --- /dev/null +++ b/src/backend/access/nbtree/nbtsort.c @@ -0,0 +1,1196 @@ +/*------------------------------------------------------------------------- + * btsort.c-- + * + * Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $Id: nbtsort.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $ + * + * NOTES + * + * what we do is: + * - generate a set of initial one-block runs, distributed round-robin + * between the output tapes. + * - for each pass, + * - swap input and output tape sets, rewinding both and truncating + * the output tapes. + * - merge the current run in each input tape to the current output + * tape. + * - when each input run has been exhausted, switch to another output + * tape and start processing another run. + * - when we have fewer runs than tapes, we know we are ready to start + * merging into the btree leaf pages. + * - every time we complete a level of the btree, we can construct the + * next level up. when we have only one page on a level, it can be + * attached to the btree metapage and we are done. + * + * conventions: + * - external interface routines take in and return "void *" for their + * opaque handles. this is for modularity reasons (i prefer not to + * export these structures without good reason). + * + * this code is moderately slow (~10% slower) compared to the regular + * btree (insertion) build code on sorted or well-clustered data. on + * random data, however, the insertion build code is unusable -- the + * difference on a 60MB heap is a factor of 15 because the random + * probes into the btree thrash the buffer pool. + * + * this code currently packs the pages to 100% of capacity. this is + * not wise, since *any* insertion will cause splitting. filling to + * something like the standard 70% steady-state load factor for btrees + * would probably be better. + * + * somebody desperately needs to figure out how to do a better job of + * balancing the merge passes -- the fan-in on the final merges can be + * pretty poor, which is bad for performance. + *------------------------------------------------------------------------- + */ + +#include <stdio.h> + +#include "c.h" + +#include "access/nbtree.h" + +#include "storage/bufmgr.h" +#include "storage/fd.h" +#include "utils/rel.h" +#include "utils/palloc.h" +#include "utils/elog.h" + +/*#define FASTBUILD_DEBUG*/ /* turn on debugging output */ + +#define FASTBUILD + +#ifdef FASTBUILD + +#define MAXTAPES (7) +#define TAPEBLCKSZ (BLCKSZ << 2) +#define TAPETEMP "pg_btsortXXXXXX" + + +/*------------------------------------------------------------------------- + * sorting comparison routine - returns {-1,0,1} depending on whether + * the key in the left BTItem is {<,=,>} the key in the right BTItem. + * + * we want to use _bt_isortcmp as a comparison function for qsort(3), + * but it needs extra arguments, so we "pass them in" as global + * variables. ick. fortunately, they are the same throughout the + * build, so we need do this only once. this is why you must call + * _bt_isortcmpinit before the call to qsort(3). + * + * a NULL BTItem is always assumed to be greater than any actual + * value; our heap routines (see below) assume that the smallest + * element in the heap is returned. that way, NULL values from the + * exhausted tapes can sift down to the bottom of the heap. 
in point + * of fact we just don't replace the elements of exhausted tapes, but + * what the heck. + * *------------------------------------------------------------------------- + */ +static Relation _bt_sortrel; + +static void +_bt_isortcmpinit(Relation index) +{ + _bt_sortrel = index; +} + +static int +_bt_isortcmp(BTItem *bti1p, BTItem *bti2p) +{ + BTItem bti1 = *bti1p; + BTItem bti2 = *bti2p; + + if (bti1 == (BTItem) NULL) { + if (bti2 == (BTItem) NULL) { + return(0); /* 1 = 2 */ + } + return(1); /* 1 > 2 */ + } else if (bti2 == (BTItem) NULL) { + return(-1); /* 1 < 2 */ + } else if (_bt_itemcmp(_bt_sortrel, 1, bti1, bti2, + BTGreaterStrategyNumber)) { + return(1); /* 1 > 2 */ + } else if (_bt_itemcmp(_bt_sortrel, 1, bti2, bti1, + BTGreaterStrategyNumber)) { + return(-1); /* 1 < 2 */ + } + return(0); /* 1 = 2 */ +} + +/*------------------------------------------------------------------------- + * priority queue methods + * + * these were more-or-less lifted from the heap section of the 1984 + * edition of gonnet's book on algorithms and data structures. they + * are coded so that the smallest element in the heap is returned (we + * use them for merging sorted runs). + * + * XXX these probably ought to be generic library functions. + *------------------------------------------------------------------------- + */ + +typedef struct { + int btpqe_tape; /* tape identifier */ + BTItem btpqe_item; /* pointer to BTItem in tape buffer */ +} BTPriQueueElem; + +#define MAXELEM MAXTAPES +typedef struct { + int btpq_nelem; + BTPriQueueElem btpq_queue[MAXELEM]; + Relation btpq_rel; +} BTPriQueue; + +/* be sure to call _bt_isortcmpinit first */ +#define GREATER(a, b) \ + (_bt_isortcmp(&((a)->btpqe_item), &((b)->btpqe_item)) > 0) + +static void +_bt_pqsift(BTPriQueue *q, int parent) +{ + int child; + BTPriQueueElem e; + + for (child = parent * 2 + 1; + child < q->btpq_nelem; + child = parent * 2 + 1) { + if (child < q->btpq_nelem - 1) { + if (GREATER(&(q->btpq_queue[child]), &(q->btpq_queue[child+1]))) { + ++child; + } + } + if (GREATER(&(q->btpq_queue[parent]), &(q->btpq_queue[child]))) { + e = q->btpq_queue[child]; /* struct = */ + q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */ + q->btpq_queue[parent] = e; /* struct = */ + parent = child; + } else { + parent = child + 1; + } + } +} + +static int +_bt_pqnext(BTPriQueue *q, BTPriQueueElem *e) +{ + if (q->btpq_nelem < 1) { /* already empty */ + return(-1); + } + *e = q->btpq_queue[0]; /* struct = */ + + if (--q->btpq_nelem < 1) { /* now empty, don't sift */ + return(0); + } + q->btpq_queue[0] = q->btpq_queue[q->btpq_nelem]; /* struct = */ + _bt_pqsift(q, 0); + return(0); +} + +static void +_bt_pqadd(BTPriQueue *q, BTPriQueueElem *e) +{ + int child, parent; + + if (q->btpq_nelem >= MAXELEM) { + elog(WARN, "_bt_pqadd: queue overflow"); + } + + child = q->btpq_nelem++; + while (child > 0) { + parent = child / 2; + if (GREATER(e, &(q->btpq_queue[parent]))) { + break; + } else { + q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */ + child = parent; + } + } + + q->btpq_queue[child] = *e; /* struct = */ +} + +/*------------------------------------------------------------------------- + * tape methods + *------------------------------------------------------------------------- + */ + +#define BTITEMSZ(btitem) \ + ((btitem) ? 
\ + (IndexTupleDSize((btitem)->bti_itup) + \ + (sizeof(BTItemData) - sizeof(IndexTupleData))) : \ + 0) +#define SPCLEFT(tape) \ + (sizeof((tape)->bttb_data) - (tape)->bttb_top) +#define EMPTYTAPE(tape) \ + ((tape)->bttb_ntup <= 0) +#define BTTAPEMAGIC 0x19660226 + +/* + * this is what we use to shovel BTItems in and out of memory. it's + * bigger than a standard block because we are doing a lot of strictly + * sequential i/o. this is obviously something of a tradeoff since we + * are potentially reading a bunch of zeroes off of disk in many + * cases. + * + * BTItems are packed in and DOUBLEALIGN'd. + * + * the fd should not be going out to disk, strictly speaking, but it's + * the only thing like that so i'm not going to worry about wasting a + * few bytes. + */ +typedef struct { + int bttb_magic; /* magic number */ + int bttb_fd; /* file descriptor */ + int bttb_top; /* top of free space within bttb_data */ + short bttb_ntup; /* number of tuples in this block */ + short bttb_eor; /* End-Of-Run marker */ + char bttb_data[TAPEBLCKSZ - 2 * sizeof(double)]; +} BTTapeBlock; + + +/* + * reset the tape header for its next use without doing anything to + * the physical tape file. (setting bttb_top to 0 makes the block + * empty.) + */ +static void +_bt_tapereset(BTTapeBlock *tape) +{ + tape->bttb_eor = 0; + tape->bttb_top = 0; + tape->bttb_ntup = 0; +} + +/* + * rewind the physical tape file. + */ +static void +_bt_taperewind(BTTapeBlock *tape) +{ + (void) FileSeek(tape->bttb_fd, 0, SEEK_SET); +} + +/* + * destroy the contents of the physical tape file without destroying + * the tape data structure or removing the physical tape file. + * + * we use the VFD version of ftruncate(2) to do this rather than + * unlinking and recreating the file. you still have to wait while + * the OS frees up all of the file system blocks and stuff, but at + * least you don't have to delete and reinsert the directory entries. + */ +static void +_bt_tapeclear(BTTapeBlock *tape) +{ + /* blow away the contents of the old file */ + _bt_taperewind(tape); +#if 0 + FileSync(tape->bttb_fd); +#endif + FileTruncate(tape->bttb_fd, 0); + + /* reset the buffer */ + _bt_tapereset(tape); +} + +/* + * create a new BTTapeBlock, allocating memory for the data structure + * as well as opening a physical tape file. + */ +static BTTapeBlock * +_bt_tapecreate(char *fname) +{ + BTTapeBlock *tape = (BTTapeBlock *) palloc(sizeof(BTTapeBlock)); + + if (tape == (BTTapeBlock *) NULL) { + elog(WARN, "_bt_tapecreate: out of memory"); + } + + tape->bttb_magic = BTTAPEMAGIC; + + tape->bttb_fd = FileNameOpenFile(fname, O_RDWR|O_CREAT|O_TRUNC, 0600); + Assert(tape->bttb_fd >= 0); + + /* initialize the buffer */ + _bt_tapereset(tape); + + return(tape); +} + +/* + * destroy the BTTapeBlock structure and its physical tape file. + */ +static void +_bt_tapedestroy(BTTapeBlock *tape) +{ + FileUnlink(tape->bttb_fd); + pfree((void *) tape); +} + +/* + * flush the tape block to the file, marking End-Of-Run if requested. + */ +static void +_bt_tapewrite(BTTapeBlock *tape, int eor) +{ + tape->bttb_eor = eor; + FileWrite(tape->bttb_fd, (char*)tape, TAPEBLCKSZ); + _bt_tapereset(tape); +} + +/* + * read a tape block from the file, overwriting the current contents + * of the buffer. 
+ * + * returns: + * - 0 if there are no more blocks in the tape or in this run (call + * _bt_tapereset to clear the End-Of-Run marker) + * - 1 if a valid block was read + */ +static int +_bt_taperead(BTTapeBlock *tape) +{ + int fd; + int nread; + + if (tape->bttb_eor) { + return(0); /* we are at End-Of-Run */ + } + + /* + * we're clobbering the old tape block, but we do need to save the + * VFD (the one in the block we're reading is bogus). + */ + fd = tape->bttb_fd; + nread = FileRead(fd, (char*) tape, TAPEBLCKSZ); + tape->bttb_fd = fd; + + if (nread != TAPEBLCKSZ) { + Assert(nread == 0); /* we are at EOF */ + return(0); + } + Assert(tape->bttb_magic == BTTAPEMAGIC); + return(1); +} + +/* + * get the next BTItem from a tape block. + * + * returns: + * - NULL if we have run out of BTItems + * - a pointer to the BTItemData in the block otherwise + * + * side effects: + * - sets 'pos' to the current position within the block. + */ +static BTItem +_bt_tapenext(BTTapeBlock *tape, char **pos) +{ + Size itemsz; + BTItem bti; + + if (*pos >= tape->bttb_data + tape->bttb_top) { + return((BTItem) NULL); + } + bti = (BTItem) *pos; + itemsz = BTITEMSZ(bti); + *pos += DOUBLEALIGN(itemsz); + return(bti); +} + +/* + * copy a BTItem into a tape block. + * + * assumes that we have already checked to see if the block has enough + * space for the item. + * + * side effects: + * + * - advances the 'top' pointer in the tape block header to point to + * the beginning of free space. + */ +static void +_bt_tapeadd(BTTapeBlock *tape, BTItem item, int itemsz) +{ + (void) memcpy(tape->bttb_data + tape->bttb_top, item, itemsz); + ++tape->bttb_ntup; + tape->bttb_top += DOUBLEALIGN(itemsz); +} + +/*------------------------------------------------------------------------- + * spool methods + *------------------------------------------------------------------------- + */ + +/* + * this structure holds the bookkeeping for a simple balanced multiway + * merge. (polyphase merging is hairier than i want to get into right + * now, and i don't see why i have to care how many "tapes" i use + * right now. though if psort was in a condition that i could hack it + * to do this, you bet i would.) + */ +typedef struct { + int bts_ntapes; + int bts_tape; + BTTapeBlock **bts_itape; /* input tape blocks */ + BTTapeBlock **bts_otape; /* output tape blocks */ +} BTSpool; + +/* + * create and initialize a spool structure, including the underlying + * files. + */ +void * +_bt_spoolinit(Relation index, int ntapes) +{ + char *mktemp(); + + BTSpool *btspool = (BTSpool *) palloc(sizeof(BTSpool)); + int i; + char *fname = (char *) palloc(sizeof(TAPETEMP) + 1); + + if (btspool == (BTSpool *) NULL || fname == (char *) NULL) { + elog(WARN, "_bt_spoolinit: out of memory"); + } + (void) memset((char *) btspool, 0, sizeof(BTSpool)); + btspool->bts_ntapes = ntapes; + btspool->bts_tape = 0; + + btspool->bts_itape = + (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes); + btspool->bts_otape = + (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes); + if (btspool->bts_itape == (BTTapeBlock **) NULL || + btspool->bts_otape == (BTTapeBlock **) NULL) { + elog(WARN, "_bt_spoolinit: out of memory"); + } + + for (i = 0; i < ntapes; ++i) { + btspool->bts_itape[i] = + _bt_tapecreate(mktemp(strcpy(fname, TAPETEMP))); + btspool->bts_otape[i] = + _bt_tapecreate(mktemp(strcpy(fname, TAPETEMP))); + } + pfree((void *) fname); + + _bt_isortcmpinit(index); + + return((void *) btspool); +} + +/* + * clean up a spool structure and its substructures. 
+ */ +void +_bt_spooldestroy(void *spool) +{ + BTSpool *btspool = (BTSpool *) spool; + int i; + + for (i = 0; i < btspool->bts_ntapes; ++i) { + _bt_tapedestroy(btspool->bts_otape[i]); + _bt_tapedestroy(btspool->bts_itape[i]); + } + pfree((void *) btspool); +} + +/* + * flush out any dirty output tape blocks + */ +static void +_bt_spoolflush(BTSpool *btspool) +{ + int i; + + for (i = 0; i < btspool->bts_ntapes; ++i) { + if (!EMPTYTAPE(btspool->bts_otape[i])) { + _bt_tapewrite(btspool->bts_otape[i], 1); + } + } +} + +/* + * swap input tapes and output tapes by swapping their file + * descriptors. additional preparation for the next merge pass + * includes rewinding the new input tapes and clearing out the new + * output tapes. + */ +static void +_bt_spoolswap(BTSpool *btspool) +{ + File tmpfd; + BTTapeBlock *itape; + BTTapeBlock *otape; + int i; + + for (i = 0; i < btspool->bts_ntapes; ++i) { + itape = btspool->bts_itape[i]; + otape = btspool->bts_otape[i]; + + /* + * swap the input and output VFDs. + */ + tmpfd = itape->bttb_fd; + itape->bttb_fd = otape->bttb_fd; + otape->bttb_fd = tmpfd; + + /* + * rewind the new input tape. + */ + _bt_taperewind(itape); + _bt_tapereset(itape); + + /* + * clear the new output tape -- it's ok to throw away the old + * inputs. + */ + _bt_tapeclear(otape); + } +} + +/*------------------------------------------------------------------------- + * sorting routines + *------------------------------------------------------------------------- + */ + +/* + * spool 'btitem' into an initial run. as tape blocks are filled, the + * block BTItems are qsorted and written into some output tape (it + * doesn't matter which; we go round-robin for simplicity). the + * initial runs are therefore always just one block. + */ +void +_bt_spool(Relation index, BTItem btitem, void *spool) +{ + BTSpool *btspool = (BTSpool *) spool; + BTTapeBlock *itape; + Size itemsz; + + itape = btspool->bts_itape[btspool->bts_tape]; + itemsz = BTITEMSZ(btitem); + itemsz = DOUBLEALIGN(itemsz); + + /* + * if this buffer is too full for this BTItemData, or if we have + * run out of BTItems, we need to sort the buffer and write it + * out. in this case, the BTItemData will go into the next tape's + * buffer. + */ + if (btitem == (BTItem) NULL || SPCLEFT(itape) < itemsz) { + BTItem *parray; + BTTapeBlock *otape; + BTItem bti; + char *pos; + int btisz; + int i; + + /* + * build an array of pointers to the BTItemDatas on the input + * block. + */ + parray = (BTItem *) palloc(itape->bttb_ntup * sizeof(BTItem)); + if (parray == (BTItem *) NULL) { + elog(WARN, "_bt_spool: out of memory"); + } + pos = itape->bttb_data; + for (i = 0; i < itape->bttb_ntup; ++i) { + parray[i] = _bt_tapenext(itape, &pos); + } + + /* + * qsort the pointer array. + */ + _bt_isortcmpinit(index); + qsort((void *) parray, itape->bttb_ntup, sizeof(BTItem), _bt_isortcmp); + + /* + * write the spooled run into the output tape. we copy the + * BTItemDatas in the order dictated by the sorted array of + * BTItems, not the original order. + * + * (since everything was DOUBLEALIGN'd and is all on a single + * page, everything had *better* still fit on one page..) 
+ */ + otape = btspool->bts_otape[btspool->bts_tape]; + for (i = 0; i < itape->bttb_ntup; ++i) { + bti = parray[i]; + btisz = BTITEMSZ(bti); + btisz = DOUBLEALIGN(btisz); + _bt_tapeadd(otape, bti, btisz); +#ifdef FASTBUILD_DEBUG + { + bool isnull; + Datum d = index_getattr(&(bti->bti_itup), 1, + RelationGetTupleDescriptor(index), + &isnull); + printf("_bt_spool: inserted <%x> into output tape %d\n", + d, btspool->bts_tape); + } +#endif /* FASTBUILD_DEBUG */ + } + + /* + * the initial runs are always single tape blocks. flush the + * output block, marking End-Of-Run. + */ + _bt_tapewrite(otape, 1); + + /* + * reset the input buffer for the next run. we don't have to + * write it out or anything -- we only use it to hold the + * unsorted BTItemDatas, the output tape contains all the + * sorted stuff. + * + * changing bts_tape changes the output tape and input tape; + * we change itape for the code below. + */ + _bt_tapereset(itape); + btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes; + itape = btspool->bts_itape[btspool->bts_tape]; + + /* + * destroy the pointer array. + */ + pfree((void *) parray); + } + + /* insert this item into the current buffer */ + if (btitem != (BTItem) NULL) { + _bt_tapeadd(itape, btitem, itemsz); + } +} + +/* + * allocate a new, clean btree page, not linked to any siblings. + */ +static void +_bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags) +{ + BTPageOpaque opaque; + + *buf = _bt_getbuf(index, P_NEW, BT_WRITE); + *page = BufferGetPage(*buf); + _bt_pageinit(*page, BufferGetPageSize(*buf)); + opaque = (BTPageOpaque) PageGetSpecialPointer(*page); + opaque->btpo_prev = opaque->btpo_next = P_NONE; + opaque->btpo_flags = flags; +} + +/* + * slide an array of ItemIds back one slot (from P_FIRSTKEY to + * P_HIKEY). we need to do this when we discover that we have built + * an ItemId array in what has turned out to be a P_RIGHTMOST page. + */ +static void +_bt_slideleft(Relation index, Buffer buf, Page page) +{ + OffsetNumber off; + OffsetNumber maxoff; + ItemId previi; + ItemId thisii; + + maxoff = PageGetMaxOffsetNumber(page); + previi = PageGetItemId(page, P_HIKEY); + for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off)) { + thisii = PageGetItemId(page, off); + *previi = *thisii; + previi = thisii; + } + ((PageHeader) page)->pd_lower -= sizeof(ItemIdData); +} + +typedef struct { + Buffer btps_buf; + Page btps_page; + BTItem btps_lastbti; + OffsetNumber btps_lastoff; + OffsetNumber btps_firstoff; +} BTPageState; + +/* + * add an item to a disk page from a merge tape block. + * + * we must be careful to observe the following restrictions, placed + * upon us by the conventions in nbtsearch.c: + * - rightmost pages start data items at P_HIKEY instead of at + * P_FIRSTKEY. + * - duplicates cannot be split among pages unless the chain of + * duplicates starts at the first data item. + * + * a leaf page being built looks like: + * + * +----------------+---------------------------------+ + * | PageHeaderData | linp0 linp1 linp2 ... | + * +-----------+----+---------------------------------+ + * | ... linpN | ^ first | + * +-----------+--------------------------------------+ + * | ^ last | + * | | + * | v last | + * +-------------+------------------------------------+ + * | | itemN ... | + * +-------------+------------------+-----------------+ + * | ... 
item3 item2 item1 | "special space" | + * +--------------------------------+-----------------+ + * ^ first + * + * contrast this with the diagram in bufpage.h; note the mismatch + * between linps and items. this is because we reserve linp0 as a + * placeholder for the pointer to the "high key" item; when we have + * filled up the page, we will set linp0 to point to itemN and clear + * linpN. + * + * 'last' pointers indicate the last offset/item added to the page. + * 'first' pointers indicate the first offset/item that is part of a + * chain of duplicates extending from 'first' to 'last'. + * + * if all keys are unique, 'first' will always be the same as 'last'. + */ +static void +_bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags) +{ + Buffer nbuf; + Page npage; + BTItem last_bti; + OffsetNumber first_off; + OffsetNumber last_off; + OffsetNumber off; + Size pgspc; + Size btisz; + + nbuf = state->btps_buf; + npage = state->btps_page; + first_off = state->btps_firstoff; + last_off = state->btps_lastoff; + last_bti = state->btps_lastbti; + + pgspc = PageGetFreeSpace(npage); + btisz = BTITEMSZ(bti); + btisz = DOUBLEALIGN(btisz); + if (pgspc < btisz) { + Buffer obuf = nbuf; + Page opage = npage; + OffsetNumber o, n; + ItemId ii; + ItemId hii; + + _bt_blnewpage(index, &nbuf, &npage, flags); + + /* + * if 'last' is part of a chain of duplicates that does not + * start at the beginning of the old page, the entire chain is + * copied to the new page; we delete all of the duplicates + * from the old page except the first, which becomes the high + * key item of the old page. + * + * if the chain starts at the beginning of the page or there + * is no chain ('first' == 'last'), we need only copy 'last' + * to the new page. again, 'first' (== 'last') becomes the + * high key of the old page. + * + * note that in either case, we copy at least one item to the + * new page, so 'last_bti' will always be valid. 'bti' will + * never be the first data item on the new page. + */ + if (first_off == P_FIRSTKEY) { + Assert(last_off != P_FIRSTKEY); + first_off = last_off; + } + for (o = first_off, n = P_FIRSTKEY; + o <= last_off; + o = OffsetNumberNext(o), n = OffsetNumberNext(n)) { + ii = PageGetItemId(opage, o); + (void) PageAddItem(npage, PageGetItem(opage, ii), + ii->lp_len, n, LP_USED); +#ifdef FASTBUILD_DEBUG + { + bool isnull; + BTItem tmpbti = + (BTItem) PageGetItem(npage, PageGetItemId(npage, n)); + Datum d = index_getattr(&(tmpbti->bti_itup), 1, + RelationGetTupleDescriptor(index), + &isnull); + printf("_bt_buildadd: moved <%x> to offset %d\n", + d, n); + } +#endif /* FASTBUILD_DEBUG */ + } + for (o = last_off; o > first_off; o = OffsetNumberPrev(o)) { + PageIndexTupleDelete(opage, o); + } + hii = PageGetItemId(opage, P_HIKEY); + ii = PageGetItemId(opage, first_off); + *hii = *ii; + ii->lp_flags &= ~LP_USED; + ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData); + + first_off = P_FIRSTKEY; + last_off = PageGetMaxOffsetNumber(npage); + last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, last_off)); + + /* + * set the page (side link) pointers. + */ + { + BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage); + BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage); + + oopaque->btpo_next = BufferGetBlockNumber(nbuf); + nopaque->btpo_prev = BufferGetBlockNumber(obuf); + nopaque->btpo_next = P_NONE; + } + + /* + * write out the old stuff. 
we never want to see it again, so + * we can give up our lock (if we had one; BuildingBtree is + * set, so we aren't locking). + */ + _bt_wrtbuf(index, obuf); + } + + /* + * if this item is different from the last item added, we start a + * new chain of duplicates. + */ + off = OffsetNumberNext(last_off); + (void) PageAddItem(npage, (Item) bti, btisz, off, LP_USED); +#ifdef FASTBUILD_DEBUG + { + bool isnull; + Datum d = index_getattr(&(bti->bti_itup), 1, + RelationGetTupleDescriptor(index), + &isnull); + printf("_bt_buildadd: inserted <%x> at offset %d\n", + d, off); + } +#endif /* FASTBUILD_DEBUG */ + if (last_bti == (BTItem) NULL) { + first_off = P_FIRSTKEY; + } else if (!_bt_itemcmp(index, 1, bti, last_bti, BTEqualStrategyNumber)) { + first_off = off; + } + last_off = off; + last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, off)); + + state->btps_buf = nbuf; + state->btps_page = npage; + state->btps_lastbti = last_bti; + state->btps_lastoff = last_off; + state->btps_firstoff = first_off; +} + +/* + * take the input tapes stored by 'btspool' and perform successive + * merging passes until at most one run is left in each tape. at that + * point, merge the final tape runs into a set of btree leaves. + * + * XXX three nested loops? gross. cut me up into smaller routines. + */ +static BlockNumber +_bt_merge(Relation index, BTSpool *btspool) +{ + BTPageState state; + BlockNumber firstblk; + BTPriQueue q; + BTPriQueueElem e; + BTItem bti; + BTTapeBlock *itape; + BTTapeBlock *otape; + char *tapepos[MAXTAPES]; + int tapedone[MAXTAPES]; + int t; + int goodtapes; + int nruns; + Size btisz; + bool doleaf = false; + + /* + * initialize state needed for the merge into the btree leaf pages. + */ + (void) memset((char *) &state, 0, sizeof(BTPageState)); + _bt_blnewpage(index, &(state.btps_buf), &(state.btps_page), BTP_LEAF); + state.btps_lastoff = P_HIKEY; + state.btps_lastbti = (BTItem) NULL; + firstblk = BufferGetBlockNumber(state.btps_buf); + + do { /* pass */ + /* + * each pass starts by flushing the previous outputs and + * swapping inputs and outputs. this process also clears the + * new output tapes and rewinds the new input tapes. + */ + btspool->bts_tape = btspool->bts_ntapes - 1; + _bt_spoolflush(btspool); + _bt_spoolswap(btspool); + + nruns = 0; + + for (;;) { /* run */ + /* + * each run starts by selecting a new output tape. the + * merged results of a given run are always sent to this + * one tape. + */ + btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes; + otape = btspool->bts_otape[btspool->bts_tape]; + + /* + * initialize the priority queue by loading it with the + * first element of the given run in each tape. since we + * are starting a new run, we reset the tape (clearing the + * End-Of-Run marker) before reading it. this means that + * _bt_taperead will return 0 only if the tape is actually + * at EOF. + */ + (void) memset((char *) &q, 0, sizeof(BTPriQueue)); + goodtapes = 0; + for (t = 0; t < btspool->bts_ntapes; ++t) { + itape = btspool->bts_itape[t]; + tapepos[t] = itape->bttb_data; + _bt_tapereset(itape); + if (_bt_taperead(itape) == 0) { + tapedone[t] = 1; + } else { + ++goodtapes; + tapedone[t] = 0; + e.btpqe_tape = t; + e.btpqe_item = _bt_tapenext(itape, &tapepos[t]); + if (e.btpqe_item != (BTItem) NULL) { + _bt_pqadd(&q, &e); + } + } + } + /* + * if we don't have any tapes with any input (i.e., they + * are all at EOF), we must be done with this pass. 
+ */ + if (goodtapes == 0) { + break; /* for */ + } + ++nruns; + + /* + * output the smallest element from the queue until there are no + * more. + */ + while (_bt_pqnext(&q, &e) >= 0) { /* item */ + /* + * replace the element taken from priority queue, + * fetching a new block if needed. a tape can run out + * if it hits either End-Of-Run or EOF. + */ + t = e.btpqe_tape; + bti = e.btpqe_item; + if (bti != (BTItem) NULL) { + btisz = BTITEMSZ(bti); + btisz = DOUBLEALIGN(btisz); + if (doleaf) { + _bt_buildadd(index, &state, bti, BTP_LEAF); +#ifdef FASTBUILD_DEBUG + { + bool isnull; + Datum d = index_getattr(&(bti->bti_itup), 1, + RelationGetTupleDescriptor(index), + &isnull); + printf("_bt_merge: inserted <%x> into block %d\n", + d, BufferGetBlockNumber(state.btps_buf)); + } +#endif /* FASTBUILD_DEBUG */ + } else { + if (SPCLEFT(otape) < btisz) { + /* + * if it's full, write it out and add the + * item to the next block. (since we know + * there will be at least one more block, + * we know we do *not* want to set + * End-Of-Run here!) + */ + _bt_tapewrite(otape, 0); + } + _bt_tapeadd(otape, bti, btisz); +#ifdef FASTBUILD_DEBUG + { + bool isnull; + Datum d = index_getattr(&(bti->bti_itup), 1, + RelationGetTupleDescriptor(index), &isnull); + printf("_bt_merge: inserted <%x> into tape %d\n", + d, btspool->bts_tape); + } +#endif /* FASTBUILD_DEBUG */ + } + } +#ifdef FASTBUILD_DEBUG + { + bool isnull; + Datum d = index_getattr(&(bti->bti_itup), 1, + RelationGetTupleDescriptor(index), + &isnull); + printf("_bt_merge: got <%x> from tape %d\n", d, t); + } +#endif /* FASTBUILD_DEBUG */ + + itape = btspool->bts_itape[t]; + if (!tapedone[t]) { + BTItem newbti = _bt_tapenext(itape, &tapepos[t]); + + if (newbti == (BTItem) NULL) { + if (_bt_taperead(itape) == 0) { + tapedone[t] = 1; + } else { + tapepos[t] = itape->bttb_data; + newbti = _bt_tapenext(itape, &tapepos[t]); + } + } + if (newbti != (BTItem) NULL) { + BTPriQueueElem nexte; + + nexte.btpqe_tape = t; + nexte.btpqe_item = newbti; + _bt_pqadd(&q, &nexte); + } + } + } /* item */ + } /* run */ + + /* + * we are here because we ran out of input on all of the input + * tapes. + * + * if this pass did not generate more actual output runs than + * we have tapes, we know we have at most one run in each + * tape. this means that we are ready to merge into the final + * btree leaf pages instead of merging into a tape file. + */ + if (nruns <= btspool->bts_ntapes) { + doleaf = true; + } + } while (nruns > 0); /* pass */ + + /* + * this is the rightmost page, so the ItemId array needs to be + * slid back one slot. + */ + _bt_slideleft(index, state.btps_buf, state.btps_page); + _bt_wrtbuf(index, state.btps_buf); + + return(firstblk); +} + + +/* + * given the block number 'blk' of the first page of a set of linked + * siblings (i.e., the start of an entire level of the btree), + * construct the corresponding next level of the btree. we do this by + * placing minimum keys from each page into this page. the format of + * the internal pages is otherwise the same as for leaf pages. + */ +void +_bt_upperbuild(Relation index, BlockNumber blk, int level) +{ + Buffer rbuf; + Page rpage; + BTPageOpaque ropaque; + BTPageState state; + BlockNumber firstblk; + BTItem bti; + BTItem nbti; + OffsetNumber off; + + rbuf = _bt_getbuf(index, blk, BT_WRITE); + rpage = BufferGetPage(rbuf); + ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage); + + /* + * if we only have one page on a level, we can just make it the + * root. 
+ */ + if (P_RIGHTMOST(ropaque)) { + ropaque->btpo_flags |= BTP_ROOT; + _bt_wrtbuf(index, rbuf); + _bt_metaproot(index, blk); + return; + } + _bt_relbuf(index, rbuf, BT_WRITE); + + (void) memset((char *) &state, 0, sizeof(BTPageState)); + _bt_blnewpage(index, &(state.btps_buf), &(state.btps_page), 0); + state.btps_lastoff = P_HIKEY; + state.btps_lastbti = (BTItem) NULL; + firstblk = BufferGetBlockNumber(state.btps_buf); + + /* for each page... */ + do { + rbuf = _bt_getbuf(index, blk, BT_READ); + rpage = BufferGetPage(rbuf); + ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage); + + /* for each item... */ + if (!PageIsEmpty(rpage)) { + /* + * form a new index tuple corresponding to the minimum key + * of the lower page and insert it into a page at this + * level. + */ + off = P_RIGHTMOST(ropaque) ? P_HIKEY : P_FIRSTKEY; + bti = (BTItem) PageGetItem(rpage, PageGetItemId(rpage, off)); + nbti = _bt_formitem(&(bti->bti_itup)); + ItemPointerSet(&(nbti->bti_itup.t_tid), blk, P_HIKEY); +#ifdef FASTBUILD_DEBUG + { + bool isnull; + Datum d = index_getattr(&(nbti->bti_itup), 1, + RelationGetTupleDescriptor(index), + &isnull); + printf("_bt_upperbuild: inserting <%x> at %d\n", + d, level); + } +#endif /* FASTBUILD_DEBUG */ + _bt_buildadd(index, &state, nbti, 0); + pfree((void *) nbti); + } + blk = ropaque->btpo_next; + _bt_relbuf(index, rbuf, BT_READ); + } while (blk != P_NONE); + + /* + * this is the rightmost page, so the ItemId array needs to be + * slid back one slot. + */ + _bt_slideleft(index, state.btps_buf, state.btps_page); + _bt_wrtbuf(index, state.btps_buf); + + _bt_upperbuild(index, firstblk, level + 1); +} + +/* + * given a spool loading by successive calls to _bt_spool, create an + * entire btree. + */ +void +_bt_leafbuild(Relation index, void *spool) +{ + BTSpool *btspool = (BTSpool *) spool; + BlockNumber firstblk; + + /* + * merge the runs into btree leaf pages. + */ + firstblk = _bt_merge(index, btspool); + + /* + * build the upper levels of the btree. + */ + _bt_upperbuild(index, firstblk, 0); +} + +#else /* !FASTBUILD */ + +void *_bt_spoolinit(Relation index, int ntapes) { return((void *) NULL); } +void _bt_spooldestroy(void *spool) { } +void _bt_spool(Relation index, BTItem btitem, void *spool) { } +void _bt_upperbuild(Relation index, BlockNumber blk, int level) { } +void _bt_leafbuild(Relation index, void *spool) { } + +#endif /* !FASTBUILD */ |
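
Editorial note, not part of the commit above: the comparison section of the file works around the fact that qsort(3) comparators cannot take extra arguments. _bt_isortcmpinit stashes the index Relation in a static global (_bt_sortrel) before the sort, and _bt_isortcmp treats a NULL BTItem as greater than any real key so that entries from exhausted tapes sink to the bottom of the heap. The following is a minimal, self-contained sketch of the same pattern; every name in it (cmp_init, cmp_strings, sort_descending, and so on) is invented for the illustration, and it uses the standard const void * comparator signature rather than the file's BTItem ** one.

/*
 * qsort(3) comparators cannot take extra arguments, so (like
 * _bt_isortcmpinit/_bt_isortcmp) the extra state is passed through a
 * static global set up once before the sort.  NULL elements compare
 * greater than everything else, so they collect at the end.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* the "extra argument" to the comparator, set once before qsort(3) runs */
static int sort_descending;

static void
cmp_init(int descending)
{
    sort_descending = descending;
}

/* NULL entries compare greater than any string, so they sort last */
static int
cmp_strings(const void *p1, const void *p2)
{
    const char *s1 = *(const char *const *) p1;
    const char *s2 = *(const char *const *) p2;
    int r;

    if (s1 == NULL)
        return (s2 == NULL) ? 0 : 1;
    if (s2 == NULL)
        return -1;
    r = strcmp(s1, s2);
    return sort_descending ? -r : r;
}

int
main(void)
{
    const char *words[] = {"tape", NULL, "merge", "btree", NULL, "run"};
    int i;

    cmp_init(0);                        /* like _bt_isortcmpinit(index) */
    qsort(words, 6, sizeof(words[0]), cmp_strings);

    for (i = 0; i < 6; i++)
        printf("%s\n", words[i] ? words[i] : "(null)");
    return 0;
}

Here a flag stands in for the Relation; the point is only that the "extra argument" travels through file-scope state set once before qsort is called, exactly as _bt_sortrel does.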
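A second editorial sketch, also not part of the commit: the merge loop in _bt_merge is easier to see without the tape buffering. Prime a min-heap with the first item of each input run, repeatedly pop the smallest item, and refill the heap from whichever run that item came from. The sketch below shows that loop over plain int arrays standing in for tapes, using a textbook zero-indexed binary min-heap rather than the exact index arithmetic of _bt_pqadd/_bt_pqsift; all names here (MinHeap, HeapElem, heap_add, heap_next, runs, pos, len) are invented for the illustration.

/*
 * Sketch of the merge loop: a small zero-indexed min-heap (cf. the
 * "priority queue methods" section) drives a k-way merge of sorted
 * runs, with plain int arrays standing in for the tape blocks.
 */
#include <stdio.h>

#define MAXRUNS 7                       /* cf. MAXTAPES */

typedef struct {
    int run;                            /* which run this value came from */
    int val;                            /* the value itself */
} HeapElem;

typedef struct {
    int      nelem;
    HeapElem elem[MAXRUNS];
} MinHeap;

static void
heap_add(MinHeap *h, HeapElem e)
{
    int child = h->nelem++;

    /* bubble the new element up while it is smaller than its parent */
    while (child > 0) {
        int parent = (child - 1) / 2;

        if (h->elem[parent].val <= e.val)
            break;
        h->elem[child] = h->elem[parent];
        child = parent;
    }
    h->elem[child] = e;
}

static int
heap_next(MinHeap *h, HeapElem *out)
{
    int parent = 0;

    if (h->nelem < 1)
        return -1;                      /* heap is empty */
    *out = h->elem[0];
    h->elem[0] = h->elem[--h->nelem];

    /* sift the moved element down until both children are larger */
    for (;;) {
        int child = 2 * parent + 1;
        HeapElem tmp;

        if (child >= h->nelem)
            break;
        if (child + 1 < h->nelem && h->elem[child + 1].val < h->elem[child].val)
            child++;
        if (h->elem[parent].val <= h->elem[child].val)
            break;
        tmp = h->elem[parent];
        h->elem[parent] = h->elem[child];
        h->elem[child] = tmp;
        parent = child;
    }
    return 0;
}

int
main(void)
{
    int a[] = {1, 4, 9};
    int b[] = {2, 3, 10};
    int c[] = {5, 6, 7, 8};
    const int *runs[3] = {a, b, c};
    int len[3] = {3, 3, 4};
    int pos[3] = {0, 0, 0};
    MinHeap h = {0};
    HeapElem e;
    int t;

    /* prime the heap with the first element of each run */
    for (t = 0; t < 3; t++) {
        HeapElem first = {t, runs[t][pos[t]++]};
        heap_add(&h, first);
    }

    /* repeatedly emit the smallest element and refill from its run */
    while (heap_next(&h, &e) == 0) {
        printf("%d ", e.val);
        t = e.run;
        if (pos[t] < len[t]) {
            HeapElem next = {t, runs[t][pos[t]++]};
            heap_add(&h, next);
        }
    }
    printf("\n");
    return 0;
}

In the real code the refill step can also pull in a fresh tape block via _bt_taperead when the in-memory block is used up, and the merged output goes either to another tape or, on the final pass, straight into btree leaf pages via _bt_buildadd.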