aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>1999-10-17 22:15:09 +0000
committerTom Lane <tgl@sss.pgh.pa.us>1999-10-17 22:15:09 +0000
commit26c48b5e8cffafaf3b8acf345ca9fd8a1e408a54 (patch)
treecbcf32d78330eb3414abed1117b0a54090302a97 /src
parent59ed74e60bb3c1ad2b83ebacbb49f74517d8764e (diff)
downloadpostgresql-26c48b5e8cffafaf3b8acf345ca9fd8a1e408a54.tar.gz
postgresql-26c48b5e8cffafaf3b8acf345ca9fd8a1e408a54.zip
Final stage of psort reconstruction work: replace psort.c with
a generalized module 'tuplesort.c' that can sort either HeapTuples or IndexTuples, and is not tied to execution of a Sort node. Clean up memory leakages in sorting, and replace nbtsort.c's private implementation of mergesorting with calls to tuplesort.c.
Diffstat (limited to 'src')
-rw-r--r--src/backend/access/nbtree/nbtree.c30
-rw-r--r--src/backend/access/nbtree/nbtsort.c1000
-rw-r--r--src/backend/executor/nodeSort.c153
-rw-r--r--src/backend/utils/sort/Makefile4
-rw-r--r--src/backend/utils/sort/logtape.c8
-rw-r--r--src/backend/utils/sort/tuplesort.c1465
-rw-r--r--src/include/access/nbtree.h13
-rw-r--r--src/include/nodes/execnodes.h20
-rw-r--r--src/include/nodes/plannodes.h4
-rw-r--r--src/include/utils/tuplesort.h68
10 files changed, 1747 insertions, 1018 deletions
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 11f527fc3ba..d8d835f424b 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -1,17 +1,17 @@
/*-------------------------------------------------------------------------
*
- * btree.c
+ * nbtree.c
* Implementation of Lehman and Yao's btree management algorithm for
* Postgres.
*
- * Copyright (c) 1994, Regents of the University of California
+ * NOTES
+ * This file contains only the public interface routines.
*
*
- * IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.46 1999/09/18 19:06:10 tgl Exp $
+ * Copyright (c) 1994, Regents of the University of California
*
- * NOTES
- * This file contains only the public interface routines.
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.47 1999/10/17 22:15:03 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -76,7 +76,7 @@ btbuild(Relation heap,
#endif
Node *pred,
*oldPred;
- void *spool = (void *) NULL;
+ BTSpool *spool = NULL;
bool isunique;
bool usefast;
@@ -147,7 +147,7 @@ btbuild(Relation heap,
if (usefast)
{
- spool = _bt_spoolinit(index, 7, isunique);
+ spool = _bt_spoolinit(index, isunique);
res = (InsertIndexResult) NULL;
}
@@ -249,11 +249,11 @@ btbuild(Relation heap,
/*
* if we are doing bottom-up btree build, we insert the index into
- * a spool page for subsequent processing. otherwise, we insert
+ * a spool file for subsequent processing. otherwise, we insert
* into the btree.
*/
if (usefast)
- _bt_spool(index, btitem, spool);
+ _bt_spool(btitem, spool);
else
res = _bt_doinsert(index, btitem, isunique, heap);
@@ -275,15 +275,13 @@ btbuild(Relation heap,
}
/*
- * if we are doing bottom-up btree build, we now have a bunch of
- * sorted runs in the spool pages. finish the build by (1) merging
- * the runs, (2) inserting the sorted tuples into btree pages and (3)
- * building the upper levels.
+ * if we are doing bottom-up btree build, finish the build by
+ * (1) completing the sort of the spool file, (2) inserting the
+ * sorted tuples into btree pages and (3) building the upper levels.
*/
if (usefast)
{
- _bt_spool(index, (BTItem) NULL, spool); /* flush the spool */
- _bt_leafbuild(index, spool);
+ _bt_leafbuild(spool);
_bt_spooldestroy(spool);
}
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index c1564544b03..48386c113f0 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -1,68 +1,47 @@
/*-------------------------------------------------------------------------
- * btsort.c
- *
- * Copyright (c) 1994, Regents of the University of California
- *
- *
- * IDENTIFICATION
- * $Id: nbtsort.c,v 1.46 1999/07/19 07:07:19 momjian Exp $
+ * nbtsort.c
+ * Build a btree from sorted input by loading leaf pages sequentially.
*
* NOTES
*
- * what we do is:
- * - generate a set of initial one-block runs, distributed round-robin
- * between the output tapes.
- * - for each pass,
- * - swap input and output tape sets, rewinding both and truncating
- * the output tapes.
- * - merge the current run in each input tape to the current output
- * tape.
- * - when each input run has been exhausted, switch to another output
- * tape and start processing another run.
- * - when we have fewer runs than tapes, we know we are ready to start
- * merging into the btree leaf pages. (i.e., we do not have to wait
- * until we have exactly one tape.)
- * - as we extract tuples from the final runs, we build the pages for
- * each level. when we have only one page on a level, it must be the
- * root -- it can be attached to the btree metapage and we are done.
- *
- * conventions:
- * - external interface routines take in and return "void *" for their
- * opaque handles. this is for modularity reasons.
+ * We use tuplesort.c to sort the given index tuples into order.
+ * Then we scan the index tuples in order and build the btree pages
+ * for each level. When we have only one page on a level, it must be the
+ * root -- it can be attached to the btree metapage and we are done.
*
* this code is moderately slow (~10% slower) compared to the regular
* btree (insertion) build code on sorted or well-clustered data. on
* random data, however, the insertion build code is unusable -- the
* difference on a 60MB heap is a factor of 15 because the random
- * probes into the btree thrash the buffer pool.
+ * probes into the btree thrash the buffer pool. (NOTE: the above
+ * "10%" estimate is probably obsolete, since it refers to an old and
+ * not very good external sort implementation that used to exist in
+ * this module. tuplesort.c is almost certainly faster.)
*
* this code currently packs the pages to 100% of capacity. this is
* not wise, since *any* insertion will cause splitting. filling to
* something like the standard 70% steady-state load factor for btrees
* would probably be better.
*
- * somebody desperately needs to figure out how to do a better job of
- * balancing the merge passes -- the fan-in on the final merges can be
- * pretty poor, which is bad for performance.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsort.c,v 1.47 1999/10/17 22:15:04 tgl Exp $
+ *
*-------------------------------------------------------------------------
*/
-#include <fcntl.h>
-
#include "postgres.h"
#include "access/nbtree.h"
+#include "utils/tuplesort.h"
#ifdef BTREE_BUILD_STATS
#define ShowExecutorStats pg_options[TRACE_EXECUTORSTATS]
#endif
-static BTItem _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags);
-static BTItem _bt_minitem(Page opage, BlockNumber oblkno, int atend);
-static void *_bt_pagestate(Relation index, int flags, int level, bool doupper);
-static void _bt_uppershutdown(Relation index, BTPageState *state);
-
/*
* turn on debugging output.
*
@@ -70,689 +49,108 @@ static void _bt_uppershutdown(Relation index, BTPageState *state);
* only really useful for integer keys.
*/
/*#define FASTBUILD_DEBUG*/
-#define FASTBUILD_SPOOL
-#define FASTBUILD_MERGE
-
-#define MAXTAPES (7)
-#define TAPEBLCKSZ (BLCKSZ << 2)
-
-extern int NDirectFileRead;
-extern int NDirectFileWrite;
-
-/*
- * this is what we use to shovel BTItems in and out of memory. it's
- * bigger than a standard block because we are doing a lot of strictly
- * sequential i/o. this is obviously something of a tradeoff since we
- * are potentially reading a bunch of zeroes off of disk in many
- * cases.
- *
- * BTItems are packed in and MAXALIGN'd.
- *
- * the fd should not be going out to disk, strictly speaking, but it's
- * the only thing like that so i'm not going to worry about wasting a
- * few bytes.
- */
-typedef struct
-{
- int bttb_magic; /* magic number */
- File bttb_fd; /* file descriptor */
- int bttb_top; /* top of free space within bttb_data */
- short bttb_ntup; /* number of tuples in this block */
- short bttb_eor; /* End-Of-Run marker */
- char bttb_data[TAPEBLCKSZ - 2 * sizeof(double)];
-} BTTapeBlock;
/*
- * this structure holds the bookkeeping for a simple balanced multiway
- * merge. (polyphase merging is hairier than i want to get into right
- * now, and i don't see why i have to care how many "tapes" i use
- * right now. though if psort was in a condition that i could hack it
- * to do this, you bet i would.)
+ * Status record for spooling.
*/
-typedef struct
+struct BTSpool
{
- int bts_ntapes;
- int bts_tape;
- BTTapeBlock **bts_itape; /* input tape blocks */
- BTTapeBlock **bts_otape; /* output tape blocks */
+ Tuplesortstate *sortstate; /* state data for tuplesort.c */
+ Relation index;
bool isunique;
-} BTSpool;
-
-/*-------------------------------------------------------------------------
- * sorting comparison routine - returns {-1,0,1} depending on whether
- * the key in the left BTItem is {<,=,>} the key in the right BTItem.
- *
- * we want to use _bt_isortcmp as a comparison function for qsort(3),
- * but it needs extra arguments, so we "pass them in" as global
- * variables. ick. fortunately, they are the same throughout the
- * build, so we need do this only once. this is why you must call
- * _bt_isortcmpinit before the call to qsort(3).
- *
- * a NULL BTItem is always assumed to be greater than any actual
- * value; our heap routines (see below) assume that the smallest
- * element in the heap is returned. that way, NULL values from the
- * exhausted tapes can sift down to the bottom of the heap. in point
- * of fact we just don't replace the elements of exhausted tapes, but
- * what the heck.
- * *-------------------------------------------------------------------------
- */
-typedef struct
-{
- Datum *btsk_datum;
- char *btsk_nulls;
- BTItem btsk_item;
-} BTSortKey;
-
-static Relation _bt_sortrel;
-static int _bt_nattr;
-static BTSpool *_bt_inspool;
-
-static void
-_bt_isortcmpinit(Relation index, BTSpool *spool)
-{
- _bt_sortrel = index;
- _bt_inspool = spool;
- _bt_nattr = index->rd_att->natts;
-}
-
-static int
-_bt_isortcmp(BTSortKey *k1, BTSortKey *k2)
-{
- Datum *k1_datum = k1->btsk_datum;
- Datum *k2_datum = k2->btsk_datum;
- char *k1_nulls = k1->btsk_nulls;
- char *k2_nulls = k2->btsk_nulls;
- bool equal_isnull = false;
- int i;
-
- if (k1->btsk_item == (BTItem) NULL)
- {
- if (k2->btsk_item == (BTItem) NULL)
- return 0; /* 1 = 2 */
- return 1; /* 1 > 2 */
- }
- else if (k2->btsk_item == (BTItem) NULL)
- return -1; /* 1 < 2 */
-
- for (i = 0; i < _bt_nattr; i++)
- {
- if (k1_nulls[i] != ' ') /* k1 attr is NULL */
- {
- if (k2_nulls[i] != ' ') /* the same for k2 */
- {
- equal_isnull = true;
- continue;
- }
- return 1; /* NULL ">" NOT_NULL */
- }
- else if (k2_nulls[i] != ' ') /* k2 attr is NULL */
- return -1; /* NOT_NULL "<" NULL */
-
- if (_bt_invokestrat(_bt_sortrel, i + 1, BTGreaterStrategyNumber,
- k1_datum[i], k2_datum[i]))
- return 1; /* 1 > 2 */
- else if (_bt_invokestrat(_bt_sortrel, i + 1, BTGreaterStrategyNumber,
- k2_datum[i], k1_datum[i]))
- return -1; /* 1 < 2 */
- }
-
- if (_bt_inspool->isunique && !equal_isnull)
- {
- _bt_spooldestroy((void *) _bt_inspool);
- elog(ERROR, "Cannot create unique index. Table contains non-unique values");
- }
- return 0; /* 1 = 2 */
-}
-
-static void
-_bt_setsortkey(Relation index, BTItem bti, BTSortKey *sk)
-{
- sk->btsk_item = (BTItem) NULL;
- sk->btsk_datum = (Datum *) NULL;
- sk->btsk_nulls = (char *) NULL;
-
- if (bti != (BTItem) NULL)
- {
- IndexTuple it = &(bti->bti_itup);
- TupleDesc itdesc = index->rd_att;
- Datum *dp = (Datum *) palloc(_bt_nattr * sizeof(Datum));
- char *np = (char *) palloc(_bt_nattr * sizeof(char));
- bool isnull;
- int i;
-
- for (i = 0; i < _bt_nattr; i++)
- {
- dp[i] = index_getattr(it, i + 1, itdesc, &isnull);
- if (isnull)
- np[i] = 'n';
- else
- np[i] = ' ';
- }
- sk->btsk_item = bti;
- sk->btsk_datum = dp;
- sk->btsk_nulls = np;
- }
-}
-
-/*-------------------------------------------------------------------------
- * priority queue methods
- *
- * these were more-or-less lifted from the heap section of the 1984
- * edition of gonnet's book on algorithms and data structures. they
- * are coded so that the smallest element in the heap is returned (we
- * use them for merging sorted runs).
- *
- * XXX these probably ought to be generic library functions.
- *-------------------------------------------------------------------------
- */
-typedef struct
-{
- int btpqe_tape; /* tape identifier */
- BTSortKey btpqe_item; /* pointer to BTItem in tape buffer */
-} BTPriQueueElem;
-
-#define MAXELEM MAXTAPES
-typedef struct
-{
- int btpq_nelem;
- BTPriQueueElem btpq_queue[MAXELEM];
- Relation btpq_rel;
-} BTPriQueue;
-
-/* be sure to call _bt_isortcmpinit first */
-#define GREATER(a, b) \
- (_bt_isortcmp(&((a)->btpqe_item), &((b)->btpqe_item)) > 0)
-
-static void
-_bt_pqsift(BTPriQueue *q, int parent)
-{
- int child;
- BTPriQueueElem e;
-
- for (child = parent * 2 + 1;
- child < q->btpq_nelem;
- child = parent * 2 + 1)
- {
- if (child < q->btpq_nelem - 1)
- {
- if (GREATER(&(q->btpq_queue[child]), &(q->btpq_queue[child + 1])))
- ++child;
- }
- if (GREATER(&(q->btpq_queue[parent]), &(q->btpq_queue[child])))
- {
- e = q->btpq_queue[child]; /* struct = */
- q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */
- q->btpq_queue[parent] = e; /* struct = */
- parent = child;
- }
- else
- parent = child + 1;
- }
-}
-
-static int
-_bt_pqnext(BTPriQueue *q, BTPriQueueElem *e)
-{
- if (q->btpq_nelem < 1)
- { /* already empty */
- return -1;
- }
- *e = q->btpq_queue[0]; /* struct = */
-
- if (--q->btpq_nelem < 1)
- { /* now empty, don't sift */
- return 0;
- }
- q->btpq_queue[0] = q->btpq_queue[q->btpq_nelem]; /* struct = */
- _bt_pqsift(q, 0);
- return 0;
-}
-
-static void
-_bt_pqadd(BTPriQueue *q, BTPriQueueElem *e)
-{
- int child,
- parent;
-
- if (q->btpq_nelem >= MAXELEM)
- elog(ERROR, "_bt_pqadd: queue overflow");
-
- child = q->btpq_nelem++;
- while (child > 0)
- {
- parent = child / 2;
- if (GREATER(e, &(q->btpq_queue[parent])))
- break;
- else
- {
- q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */
- child = parent;
- }
- }
-
- q->btpq_queue[child] = *e; /* struct = */
-}
-
-/*-------------------------------------------------------------------------
- * tape methods
- *-------------------------------------------------------------------------
- */
+};
#define BTITEMSZ(btitem) \
((btitem) ? \
(IndexTupleDSize((btitem)->bti_itup) + \
(sizeof(BTItemData) - sizeof(IndexTupleData))) : \
0)
-#define SPCLEFT(tape) \
- (sizeof((tape)->bttb_data) - (tape)->bttb_top)
-#define EMPTYTAPE(tape) \
- ((tape)->bttb_ntup <= 0)
-#define BTTAPEMAGIC 0x19660226
-
-/*
- * reset the tape header for its next use without doing anything to
- * the physical tape file. (setting bttb_top to 0 makes the block
- * empty.)
- */
-static void
-_bt_tapereset(BTTapeBlock *tape)
-{
- tape->bttb_eor = 0;
- tape->bttb_top = 0;
- tape->bttb_ntup = 0;
-}
-
-/*
- * rewind the physical tape file.
- */
-static void
-_bt_taperewind(BTTapeBlock *tape)
-{
- FileSeek(tape->bttb_fd, 0L, SEEK_SET);
-}
-
-/*
- * destroy the contents of the physical tape file without destroying
- * the tape data structure or removing the physical tape file.
- *
- * we use the VFD version of ftruncate(2) to do this rather than
- * unlinking and recreating the file. you still have to wait while
- * the OS frees up all of the file system blocks and stuff, but at
- * least you don't have to delete and reinsert the directory entries.
- */
-static void
-_bt_tapeclear(BTTapeBlock *tape)
-{
- /* blow away the contents of the old file */
- _bt_taperewind(tape);
-#ifdef NOT_USED
- FileSync(tape->bttb_fd);
-#endif
- FileTruncate(tape->bttb_fd, 0);
-
- /* reset the buffer */
- _bt_tapereset(tape);
-}
-
-/*
- * create a new BTTapeBlock, allocating memory for the data structure
- * as well as opening a physical tape file.
- */
-static BTTapeBlock *
-_bt_tapecreate(void)
-{
- BTTapeBlock *tape = (BTTapeBlock *) palloc(sizeof(BTTapeBlock));
-
- if (tape == (BTTapeBlock *) NULL)
- elog(ERROR, "_bt_tapecreate: out of memory");
-
- tape->bttb_magic = BTTAPEMAGIC;
-
- tape->bttb_fd = OpenTemporaryFile();
- Assert(tape->bttb_fd >= 0);
-
- /* initialize the buffer */
- _bt_tapereset(tape);
-
- return tape;
-}
-
-/*
- * destroy the BTTapeBlock structure and its physical tape file.
- */
-static void
-_bt_tapedestroy(BTTapeBlock *tape)
-{
- FileUnlink(tape->bttb_fd);
- pfree((void *) tape);
-}
-
-/*
- * flush the tape block to the file, marking End-Of-Run if requested.
- */
-static void
-_bt_tapewrite(BTTapeBlock *tape, int eor)
-{
- tape->bttb_eor = eor;
- FileWrite(tape->bttb_fd, (char *) tape, TAPEBLCKSZ);
- NDirectFileWrite += TAPEBLCKSZ / BLCKSZ;
- _bt_tapereset(tape);
-}
-
-/*
- * read a tape block from the file, overwriting the current contents
- * of the buffer.
- *
- * returns:
- * - 0 if there are no more blocks in the tape or in this run (call
- * _bt_tapereset to clear the End-Of-Run marker)
- * - 1 if a valid block was read
- */
-static int
-_bt_taperead(BTTapeBlock *tape)
-{
- File fd;
- int nread;
-
- if (tape->bttb_eor)
- {
- return 0; /* we are already at End-Of-Run */
- }
-
- /*
- * we're clobbering the old tape block, but we do need to save the VFD
- * (the one in the block we're reading is bogus).
- */
- fd = tape->bttb_fd;
- nread = FileRead(fd, (char *) tape, TAPEBLCKSZ);
- tape->bttb_fd = fd;
- if (nread != TAPEBLCKSZ)
- {
- Assert(nread == 0); /* we are at EOF */
- return 0;
- }
- Assert(tape->bttb_magic == BTTAPEMAGIC);
- NDirectFileRead += TAPEBLCKSZ / BLCKSZ;
- return 1;
-}
-/*
- * get the next BTItem from a tape block.
- *
- * returns:
- * - NULL if we have run out of BTItems
- * - a pointer to the BTItemData in the block otherwise
- *
- * side effects:
- * - sets 'pos' to the current position within the block.
- */
-static BTItem
-_bt_tapenext(BTTapeBlock *tape, char **pos)
-{
- Size itemsz;
- BTItem bti;
+static void _bt_load(Relation index, BTSpool *btspool);
+static BTItem _bt_buildadd(Relation index, BTPageState *state, BTItem bti,
+ int flags);
+static BTItem _bt_minitem(Page opage, BlockNumber oblkno, int atend);
+static BTPageState *_bt_pagestate(Relation index, int flags,
+ int level, bool doupper);
+static void _bt_uppershutdown(Relation index, BTPageState *state);
- if (*pos >= tape->bttb_data + tape->bttb_top)
- return (BTItem) NULL;
- bti = (BTItem) *pos;
- itemsz = BTITEMSZ(bti);
- *pos += MAXALIGN(itemsz);
- return bti;
-}
/*
- * copy a BTItem into a tape block.
- *
- * assumes that we have already checked to see if the block has enough
- * space for the item.
- *
- * side effects:
- *
- * - advances the 'top' pointer in the tape block header to point to
- * the beginning of free space.
+ * Interface routines
*/
-static void
-_bt_tapeadd(BTTapeBlock *tape, BTItem item, int itemsz)
-{
- memcpy(tape->bttb_data + tape->bttb_top, item, itemsz);
- ++tape->bttb_ntup;
- tape->bttb_top += MAXALIGN(itemsz);
-}
-/*-------------------------------------------------------------------------
- * spool methods
- *-------------------------------------------------------------------------
- */
/*
- * create and initialize a spool structure, including the underlying
- * files.
+ * create and initialize a spool structure
*/
-void *
-_bt_spoolinit(Relation index, int ntapes, bool isunique)
+BTSpool *
+_bt_spoolinit(Relation index, bool isunique)
{
BTSpool *btspool = (BTSpool *) palloc(sizeof(BTSpool));
- int i;
- if (btspool == (BTSpool *) NULL)
- elog(ERROR, "_bt_spoolinit: out of memory");
MemSet((char *) btspool, 0, sizeof(BTSpool));
- btspool->bts_ntapes = ntapes;
- btspool->bts_tape = 0;
- btspool->isunique = isunique;
- btspool->bts_itape = (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes);
- btspool->bts_otape = (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes);
- if (btspool->bts_itape == (BTTapeBlock **) NULL ||
- btspool->bts_otape == (BTTapeBlock **) NULL)
- elog(ERROR, "_bt_spoolinit: out of memory");
+ btspool->index = index;
+ btspool->isunique = isunique;
- for (i = 0; i < ntapes; ++i)
- {
- btspool->bts_itape[i] = _bt_tapecreate();
- btspool->bts_otape[i] = _bt_tapecreate();
- }
+ btspool->sortstate = tuplesort_begin_index(index, isunique, false);
- _bt_isortcmpinit(index, btspool);
+ /*
+ * Currently, tuplesort provides sort functions on IndexTuples.
+ * If we kept anything in a BTItem other than a regular IndexTuple,
+ * we'd need to modify tuplesort to understand BTItems as such.
+ */
+ Assert(sizeof(BTItemData) == sizeof(IndexTupleData));
- return (void *) btspool;
+ return btspool;
}
/*
* clean up a spool structure and its substructures.
*/
void
-_bt_spooldestroy(void *spool)
+_bt_spooldestroy(BTSpool *btspool)
{
- BTSpool *btspool = (BTSpool *) spool;
- int i;
-
- for (i = 0; i < btspool->bts_ntapes; ++i)
- {
- _bt_tapedestroy(btspool->bts_otape[i]);
- _bt_tapedestroy(btspool->bts_itape[i]);
- }
+ tuplesort_end(btspool->sortstate);
pfree((void *) btspool);
}
/*
- * flush out any dirty output tape blocks
+ * spool a btitem into the sort file.
*/
-static void
-_bt_spoolflush(BTSpool *btspool)
+void
+_bt_spool(BTItem btitem, BTSpool *btspool)
{
- int i;
-
- for (i = 0; i < btspool->bts_ntapes; ++i)
- {
- if (!EMPTYTAPE(btspool->bts_otape[i]))
- _bt_tapewrite(btspool->bts_otape[i], 1);
- }
+ /* A BTItem is really just an IndexTuple */
+ tuplesort_puttuple(btspool->sortstate, (void *) btitem);
}
/*
- * swap input tapes and output tapes by swapping their file
- * descriptors. additional preparation for the next merge pass
- * includes rewinding the new input tapes and clearing out the new
- * output tapes.
+ * given a spool loaded by successive calls to _bt_spool,
+ * create an entire btree.
*/
-static void
-_bt_spoolswap(BTSpool *btspool)
+void
+_bt_leafbuild(BTSpool *btspool)
{
- File tmpfd;
- BTTapeBlock *itape;
- BTTapeBlock *otape;
- int i;
-
- for (i = 0; i < btspool->bts_ntapes; ++i)
+#ifdef BTREE_BUILD_STATS
+ if (ShowExecutorStats)
{
- itape = btspool->bts_itape[i];
- otape = btspool->bts_otape[i];
-
- /*
- * swap the input and output VFDs.
- */
- tmpfd = itape->bttb_fd;
- itape->bttb_fd = otape->bttb_fd;
- otape->bttb_fd = tmpfd;
-
- /*
- * rewind the new input tape.
- */
- _bt_taperewind(itape);
- _bt_tapereset(itape);
-
- /*
- * clear the new output tape -- it's ok to throw away the old
- * inputs.
- */
- _bt_tapeclear(otape);
+ fprintf(stderr, "! BtreeBuild (Spool) Stats:\n");
+ ShowUsage();
+ ResetUsage();
}
+#endif
+ tuplesort_performsort(btspool->sortstate);
+
+ _bt_load(btspool->index, btspool);
}
-/*-------------------------------------------------------------------------
- * sorting routines
- *-------------------------------------------------------------------------
- */
/*
- * spool 'btitem' into an initial run. as tape blocks are filled, the
- * block BTItems are qsorted and written into some output tape (it
- * doesn't matter which; we go round-robin for simplicity). the
- * initial runs are therefore always just one block.
+ * Internal routines.
*/
-void
-_bt_spool(Relation index, BTItem btitem, void *spool)
-{
- BTSpool *btspool = (BTSpool *) spool;
- BTTapeBlock *itape;
- Size itemsz;
-
- _bt_isortcmpinit(index, btspool);
- itape = btspool->bts_itape[btspool->bts_tape];
- itemsz = BTITEMSZ(btitem);
- itemsz = MAXALIGN(itemsz);
-
- /*
- * if this buffer is too full for this BTItemData, or if we have run
- * out of BTItems, we need to sort the buffer and write it out. in
- * this case, the BTItemData will go into the next tape's buffer.
- */
- if (btitem == (BTItem) NULL || SPCLEFT(itape) < itemsz)
- {
- BTSortKey *parray = (BTSortKey *) NULL;
- BTTapeBlock *otape;
- BTItem bti;
- char *pos;
- int btisz;
- int it_ntup = itape->bttb_ntup;
- int i;
-
- /*
- * build an array of pointers to the BTItemDatas on the input
- * block.
- */
- if (it_ntup > 0)
- {
- parray = (BTSortKey *) palloc(it_ntup * sizeof(BTSortKey));
- pos = itape->bttb_data;
- for (i = 0; i < it_ntup; ++i)
- _bt_setsortkey(index, _bt_tapenext(itape, &pos), &(parray[i]));
-
- /*
- * qsort the pointer array.
- */
- qsort((void *) parray, it_ntup, sizeof(BTSortKey),
- (int (*) (const void *, const void *)) _bt_isortcmp);
- }
-
- /*
- * write the spooled run into the output tape. we copy the
- * BTItemDatas in the order dictated by the sorted array of
- * BTItems, not the original order.
- *
- * (since everything was MAXALIGN'd and is all on a single tape
- * block, everything had *better* still fit on one tape block..)
- */
- otape = btspool->bts_otape[btspool->bts_tape];
- for (i = 0; i < it_ntup; ++i)
- {
- bti = parray[i].btsk_item;
- btisz = BTITEMSZ(bti);
- btisz = MAXALIGN(btisz);
- _bt_tapeadd(otape, bti, btisz);
-#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_SPOOL)
- {
- bool isnull;
- Datum d = index_getattr(&(bti->bti_itup), 1, index->rd_att,
- &isnull);
-
- printf("_bt_spool: inserted <%x> into output tape %d\n",
- d, btspool->bts_tape);
- }
-#endif /* FASTBUILD_DEBUG && FASTBUILD_SPOOL */
- }
-
- /*
- * the initial runs are always single tape blocks. flush the
- * output block, marking End-Of-Run.
- */
- _bt_tapewrite(otape, 1);
-
- /*
- * reset the input buffer for the next run. we don't have to
- * write it out or anything -- we only use it to hold the unsorted
- * BTItemDatas, the output tape contains all the sorted stuff.
- *
- * changing bts_tape changes the output tape and input tape; we
- * change itape for the code below.
- */
- _bt_tapereset(itape);
- btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes;
- itape = btspool->bts_itape[btspool->bts_tape];
-
- /*
- * destroy the pointer array.
- */
- if (parray != (BTSortKey *) NULL)
- {
- for (i = 0; i < it_ntup; i++)
- {
- if (parray[i].btsk_datum != (Datum *) NULL)
- pfree((void *) (parray[i].btsk_datum));
- if (parray[i].btsk_nulls != (char *) NULL)
- pfree((void *) (parray[i].btsk_nulls));
- }
- pfree((void *) parray);
- }
- }
-
- /* insert this item into the current buffer */
- if (btitem != (BTItem) NULL)
- _bt_tapeadd(itape, btitem, itemsz);
-}
/*
* allocate a new, clean btree page, not linked to any siblings.
@@ -805,7 +203,7 @@ _bt_slideleft(Relation index, Buffer buf, Page page)
* allocate and initialize a new BTPageState. the returned structure
* is suitable for immediate use by _bt_buildadd.
*/
-static void *
+static BTPageState *
_bt_pagestate(Relation index, int flags, int level, bool doupper)
{
BTPageState *state = (BTPageState *) palloc(sizeof(BTPageState));
@@ -819,7 +217,7 @@ _bt_pagestate(Relation index, int flags, int level, bool doupper)
state->btps_level = level;
state->btps_doupper = doupper;
- return (void *) state;
+ return state;
}
/*
@@ -883,9 +281,8 @@ _bt_minitem(Page opage, BlockNumber oblkno, int atend)
* if all keys are unique, 'first' will always be the same as 'last'.
*/
static BTItem
-_bt_buildadd(Relation index, void *pstate, BTItem bti, int flags)
+_bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags)
{
- BTPageState *state = (BTPageState *) pstate;
Buffer nbuf;
Page npage;
BTItem last_bti;
@@ -944,8 +341,7 @@ _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags)
if (PageAddItem(npage, PageGetItem(opage, ii),
ii->lp_len, n, LP_USED) == InvalidOffsetNumber)
elog(FATAL, "btree: failed to add item to the page in _bt_sort (1)");
-#ifdef NOT_USED
-#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
+#ifdef FASTBUILD_DEBUG
{
bool isnull;
BTItem tmpbti =
@@ -956,7 +352,6 @@ _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags)
printf("_bt_buildadd: moved <%x> to offset %d at level %d\n",
d, n, state->btps_level);
}
-#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
#endif
}
@@ -989,7 +384,7 @@ _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags)
nopaque->btpo_prev = BufferGetBlockNumber(obuf);
nopaque->btpo_next = P_NONE;
- if (_bt_itemcmp(index, _bt_nattr,
+ if (_bt_itemcmp(index, index->rd_att->natts,
(BTItem) PageGetItem(opage, PageGetItemId(opage, P_HIKEY)),
(BTItem) PageGetItem(opage, PageGetItemId(opage, P_FIRSTKEY)),
BTEqualStrategyNumber))
@@ -1030,8 +425,7 @@ _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags)
off = OffsetNumberNext(last_off);
if (PageAddItem(npage, (Item) bti, btisz, off, LP_USED) == InvalidOffsetNumber)
elog(FATAL, "btree: failed to add item to the page in _bt_sort (2)");
-#ifdef NOT_USED
-#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
+#ifdef FASTBUILD_DEBUG
{
bool isnull;
Datum d = index_getattr(&(bti->bti_itup), 1, index->rd_att, &isnull);
@@ -1039,11 +433,10 @@ _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags)
printf("_bt_buildadd: inserted <%x> at offset %d at level %d\n",
d, off, state->btps_level);
}
-#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
#endif
if (last_bti == (BTItem) NULL)
first_off = P_FIRSTKEY;
- else if (!_bt_itemcmp(index, _bt_nattr,
+ else if (!_bt_itemcmp(index, index->rd_att->natts,
bti, last_bti, BTEqualStrategyNumber))
first_off = off;
last_off = off;
@@ -1103,224 +496,31 @@ _bt_uppershutdown(Relation index, BTPageState *state)
}
/*
- * take the input tapes stored by 'btspool' and perform successive
- * merging passes until at most one run is left in each tape. at that
- * point, merge the final tape runs into a set of btree leaves.
- *
- * XXX three nested loops? gross. cut me up into smaller routines.
+ * Read tuples in correct sort order from tuplesort, and load them into
+ * btree leaves.
*/
static void
-_bt_merge(Relation index, BTSpool *btspool)
+_bt_load(Relation index, BTSpool *btspool)
{
BTPageState *state;
- BTPriQueue q;
- BTPriQueueElem e;
- BTSortKey btsk;
BTItem bti;
- BTTapeBlock *itape;
- BTTapeBlock *otape;
- char *tapepos[MAXTAPES];
- int tapedone[MAXTAPES];
- int t;
- int goodtapes;
- int npass;
- int nruns;
- Size btisz;
- bool doleaf = false;
+ bool should_free;
/*
* initialize state needed for the merge into the btree leaf pages.
*/
- state = (BTPageState *) _bt_pagestate(index, BTP_LEAF, 0, true);
-
- npass = 0;
- do
- { /* pass */
-
- /*
- * each pass starts by flushing the previous outputs and swapping
- * inputs and outputs. flushing sets End-of-Run for any dirty
- * output tapes. swapping clears the new output tapes and rewinds
- * the new input tapes.
- */
- btspool->bts_tape = btspool->bts_ntapes - 1;
- _bt_spoolflush(btspool);
- _bt_spoolswap(btspool);
-
- ++npass;
- nruns = 0;
-
- for (;;)
- { /* run */
-
- /*
- * each run starts by selecting a new output tape. the merged
- * results of a given run are always sent to this one tape.
- */
- btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes;
- otape = btspool->bts_otape[btspool->bts_tape];
-
- /*
- * initialize the priority queue by loading it with the first
- * element of the given run in each tape. since we are
- * starting a new run, we reset the tape (clearing the
- * End-Of-Run marker) before reading it. this means that
- * _bt_taperead will return 0 only if the tape is actually at
- * EOF.
- */
- MemSet((char *) &q, 0, sizeof(BTPriQueue));
- goodtapes = 0;
- for (t = 0; t < btspool->bts_ntapes; ++t)
- {
- itape = btspool->bts_itape[t];
- tapepos[t] = itape->bttb_data;
- tapedone[t] = 0;
- _bt_tapereset(itape);
- do
- {
- if (_bt_taperead(itape) == 0)
- tapedone[t] = 1;
- } while (!tapedone[t] && EMPTYTAPE(itape));
- if (!tapedone[t])
- {
- ++goodtapes;
- e.btpqe_tape = t;
- _bt_setsortkey(index, _bt_tapenext(itape, &tapepos[t]),
- &(e.btpqe_item));
- if (e.btpqe_item.btsk_item != (BTItem) NULL)
- _bt_pqadd(&q, &e);
- }
- }
-
- /*
- * if we don't have any tapes with any input (i.e., they are
- * all at EOF), there is no work to do in this run -- we must
- * be done with this pass.
- */
- if (goodtapes == 0)
- {
- break; /* for */
- }
- ++nruns;
-
- /*
- * output the smallest element from the queue until there are
- * no more.
- */
- while (_bt_pqnext(&q, &e) >= 0)
- { /* item */
-
- /*
- * replace the element taken from priority queue, fetching
- * a new block if needed. a tape can run out if it hits
- * either End-Of-Run or EOF.
- */
- t = e.btpqe_tape;
- btsk = e.btpqe_item;
- bti = btsk.btsk_item;
- if (bti != (BTItem) NULL)
- {
- btisz = BTITEMSZ(bti);
- btisz = MAXALIGN(btisz);
- if (doleaf)
- {
- _bt_buildadd(index, state, bti, BTP_LEAF);
-#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
- {
- bool isnull;
- Datum d = index_getattr(&(bti->bti_itup), 1,
- index->rd_att, &isnull);
-
- printf("_bt_merge: [pass %d run %d] inserted <%x> from tape %d into block %d\n",
- npass, nruns, d, t,
- BufferGetBlockNumber(state->btps_buf));
- }
-#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
- }
- else
- {
- if (SPCLEFT(otape) < btisz)
- {
-
- /*
- * if it's full, write it out and add the item
- * to the next block. (since we will be
- * adding another tuple immediately after
- * this, we can be sure that there will be at
- * least one more block in this run and so we
- * know we do *not* want to set End-Of-Run
- * here.)
- */
- _bt_tapewrite(otape, 0);
- }
- _bt_tapeadd(otape, bti, btisz);
-#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
- {
- bool isnull;
- Datum d = index_getattr(&(bti->bti_itup), 1,
- index->rd_att, &isnull);
-
- printf("_bt_merge: [pass %d run %d] inserted <%x> from tape %d into output tape %d\n",
- npass, nruns, d, t,
- btspool->bts_tape);
- }
-#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
- }
-
- if (btsk.btsk_datum != (Datum *) NULL)
- pfree((void *) (btsk.btsk_datum));
- if (btsk.btsk_nulls != (char *) NULL)
- pfree((void *) (btsk.btsk_nulls));
-
- }
- itape = btspool->bts_itape[t];
- if (!tapedone[t])
- {
- BTItem newbti = _bt_tapenext(itape, &tapepos[t]);
-
- if (newbti == (BTItem) NULL)
- {
- do
- {
- if (_bt_taperead(itape) == 0)
- tapedone[t] = 1;
- } while (!tapedone[t] && EMPTYTAPE(itape));
- if (!tapedone[t])
- {
- tapepos[t] = itape->bttb_data;
- newbti = _bt_tapenext(itape, &tapepos[t]);
- }
- }
- if (newbti != (BTItem) NULL)
- {
- BTPriQueueElem nexte;
-
- nexte.btpqe_tape = t;
- _bt_setsortkey(index, newbti, &(nexte.btpqe_item));
- _bt_pqadd(&q, &nexte);
- }
- }
- } /* item */
-
- /*
- * that's it for this run. flush the output tape, marking
- * End-of-Run.
- */
- _bt_tapewrite(otape, 1);
- } /* run */
+ state = _bt_pagestate(index, BTP_LEAF, 0, true);
- /*
- * we are here because we ran out of input on all of the input
- * tapes.
- *
- * if this pass did not generate more actual output runs than we have
- * tapes, we know we have at most one run in each tape. this
- * means that we are ready to merge into the final btree leaf
- * pages instead of merging into a tape file.
- */
- if (nruns <= btspool->bts_ntapes)
- doleaf = true;
- } while (nruns > 0); /* pass */
+ for (;;)
+ {
+ bti = (BTItem) tuplesort_getindextuple(btspool->sortstate, true,
+ &should_free);
+ if (bti == (BTItem) NULL)
+ break;
+ _bt_buildadd(index, state, bti, BTP_LEAF);
+ if (should_free)
+ pfree((void *) bti);
+ }
_bt_uppershutdown(index, state);
}
@@ -1359,7 +559,7 @@ _bt_upperbuild(Relation index)
ropaque->btpo_flags &= ~BTP_ROOT;
_bt_wrtbuf(index, rbuf);
- state = (BTPageState *) _bt_pagestate(index, 0, 0, true);
+ state = _bt_pagestate(index, 0, 0, true);
/* for each page... */
do
@@ -1380,7 +580,7 @@ _bt_upperbuild(Relation index)
* the lower page and insert it into a page at this level.
*/
nbti = _bt_minitem(rpage, blk, P_RIGHTMOST(ropaque));
-#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
+#ifdef FASTBUILD_DEBUG
{
bool isnull;
Datum d = index_getattr(&(nbti->bti_itup), 1, index->rd_att,
@@ -1389,7 +589,7 @@ _bt_upperbuild(Relation index)
printf("_bt_upperbuild: inserting <%x> at %d\n",
d, state->btps_level);
}
-#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
+#endif
_bt_buildadd(index, state, nbti, 0);
pfree((void *) nbti);
}
@@ -1401,25 +601,3 @@ _bt_upperbuild(Relation index)
}
#endif
-
-/*
- * given a spool loading by successive calls to _bt_spool, create an
- * entire btree.
- */
-void
-_bt_leafbuild(Relation index, void *spool)
-{
- _bt_isortcmpinit(index, (BTSpool *) spool);
-
-#ifdef BTREE_BUILD_STATS
- if (ShowExecutorStats)
- {
- fprintf(stderr, "! BtreeBuild (Spool) Stats:\n");
- ShowUsage();
- ResetUsage();
- }
-#endif
-
- _bt_merge(index, (BTSpool *) spool);
-
-}
diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c
index f82fecf0d6f..14e8b46aa86 100644
--- a/src/backend/executor/nodeSort.c
+++ b/src/backend/executor/nodeSort.c
@@ -7,16 +7,17 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/executor/nodeSort.c,v 1.23 1999/07/17 20:16:58 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/executor/nodeSort.c,v 1.24 1999/10/17 22:15:02 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
+
#include "executor/executor.h"
#include "executor/execdebug.h"
#include "executor/nodeSort.h"
-#include "utils/psort.h"
+#include "utils/tuplesort.h"
/* ----------------------------------------------------------------
* FormSortKeys(node)
@@ -83,11 +84,9 @@ FormSortKeys(Sort *sortnode)
/* ----------------------------------------------------------------
* ExecSort
*
- * old comments
- * Sorts tuples from the outer subtree of the node in psort,
+ * Sorts tuples from the outer subtree of the node using tuplesort,
* which saves the results in a temporary file or memory. After the
* initial call, returns a tuple from the file with each call.
- * Assumes that heap access method is used.
*
* Conditions:
* -- none.
@@ -101,10 +100,8 @@ ExecSort(Sort *node)
{
EState *estate;
SortState *sortstate;
- Plan *outerNode;
ScanDirection dir;
- int keycount;
- ScanKey sortkeys;
+ Tuplesortstate *tuplesortstate;
HeapTuple heapTuple;
TupleTableSlot *slot;
bool should_free;
@@ -119,44 +116,72 @@ ExecSort(Sort *node)
sortstate = node->sortstate;
estate = node->plan.state;
dir = estate->es_direction;
+ tuplesortstate = (Tuplesortstate *) sortstate->tuplesortstate;
/* ----------------
- * the first time we call this, psort sorts this into a file.
- * Subsequent calls return tuples from psort.
+ * If first time through, read all tuples from outer plan and
+ * pass them to tuplesort.c.
+ * Subsequent calls just fetch tuples from tuplesort.
* ----------------
*/
- if (sortstate->sort_Flag == false)
+ if (! sortstate->sort_Done)
{
+ Plan *outerNode;
+ TupleDesc tupDesc;
+ int keycount;
+ ScanKey sortkeys;
+
SO1_printf("ExecSort: %s\n",
- "sortstate == false -> sorting subplan");
+ "sorting subplan");
/* ----------------
- * set all relations to be scanned in the forward direction
- * while creating the temporary relation.
+ * Want to scan subplan in the forward direction while creating
+ * the sorted data. (Does setting my direction actually affect
+ * the subplan? I bet this is useless code...)
* ----------------
*/
estate->es_direction = ForwardScanDirection;
/* ----------------
- * prepare information for psort_begin()
+ * Initialize tuplesort module.
* ----------------
*/
- outerNode = outerPlan((Plan *) node);
+ SO1_printf("ExecSort: %s\n",
+ "calling tuplesort_begin");
+ outerNode = outerPlan((Plan *) node);
+ tupDesc = ExecGetTupType(outerNode);
keycount = node->keycount;
sortkeys = (ScanKey) sortstate->sort_Keys;
- SO1_printf("ExecSort: %s\n",
- "calling psort_begin");
- if (!psort_begin(node, /* this node */
- keycount, /* number keys */
- sortkeys)) /* keys */
+ tuplesortstate = tuplesort_begin_heap(tupDesc, keycount, sortkeys,
+ true /* randomAccess */);
+
+ sortstate->tuplesortstate = (void *) tuplesortstate;
+
+ /* ----------------
+ * Scan the subplan and feed all the tuples to tuplesort.
+ * ----------------
+ */
+
+ for (;;)
{
- /* Psort says, there are no tuples to be sorted */
- return NULL;
+ slot = ExecProcNode(outerNode, (Plan *) node);
+
+ if (TupIsNull(slot))
+ break;
+
+ tuplesort_puttuple(tuplesortstate, (void *) slot->val);
+ ExecClearTuple(slot);
}
/* ----------------
+ * Complete the sort.
+ * ----------------
+ */
+ tuplesort_performsort(tuplesortstate);
+
+ /* ----------------
* restore to user specified direction
* ----------------
*/
@@ -167,25 +192,29 @@ ExecSort(Sort *node)
* ----------------
*/
slot = (TupleTableSlot *) sortstate->csstate.cstate.cs_ResultTupleSlot;
- slot->ttc_tupleDescriptor = ExecGetTupType(outerNode);
+ slot->ttc_tupleDescriptor = tupDesc;
+
/* ----------------
* finally set the sorted flag to true
* ----------------
*/
- sortstate->sort_Flag = true;
+ sortstate->sort_Done = true;
SO1_printf(stderr, "ExecSort: sorting done.\n");
}
else
slot = (TupleTableSlot *) sortstate->csstate.cstate.cs_ResultTupleSlot;
SO1_printf("ExecSort: %s\n",
- "retrieving tuple from sorted relation");
+ "retrieving tuple from tuplesort");
/* ----------------
- * at this point we grab a tuple from psort
+ * Get the first or next tuple from tuplesort.
+ * Returns NULL if no more tuples.
* ----------------
*/
- heapTuple = psort_grabtuple(node, &should_free);
+ heapTuple = tuplesort_getheaptuple(tuplesortstate,
+ ScanDirectionIsForward(dir),
+ &should_free);
return ExecStoreTuple(heapTuple, slot, InvalidBuffer, should_free);
}
@@ -193,7 +222,6 @@ ExecSort(Sort *node)
/* ----------------------------------------------------------------
* ExecInitSort
*
- * old comments
* Creates the run-time state information for the sort node
* produced by the planner and initailizes its outer subtree.
* ----------------------------------------------------------------
@@ -203,7 +231,6 @@ ExecInitSort(Sort *node, EState *estate, Plan *parent)
{
SortState *sortstate;
Plan *outerPlan;
- ScanKey sortkeys;
SO1_printf("ExecInitSort: %s\n",
"initializing sort node");
@@ -219,14 +246,14 @@ ExecInitSort(Sort *node, EState *estate, Plan *parent)
* ----------------
*/
sortstate = makeNode(SortState);
- sortstate->sort_Flag = 0;
+ sortstate->sort_Done = false;
sortstate->sort_Keys = NULL;
- node->cleaned = FALSE;
+ sortstate->tuplesortstate = NULL;
node->sortstate = sortstate;
/* ----------------
- * Miscellanious initialization
+ * Miscellaneous initialization
*
* + assign node's base_id
* + assign debugging hooks
@@ -259,9 +286,7 @@ ExecInitSort(Sort *node, EState *estate, Plan *parent)
* initialize sortstate information
* ----------------
*/
- sortkeys = FormSortKeys(node);
- sortstate->sort_Keys = sortkeys;
- sortstate->sort_Flag = false;
+ sortstate->sort_Keys = FormSortKeys(node);
/* ----------------
* initialize tuple type. no need to initialize projection
@@ -275,11 +300,6 @@ ExecInitSort(Sort *node, EState *estate, Plan *parent)
SO1_printf("ExecInitSort: %s\n",
"sort node initialized");
- /* ----------------
- * return relation oid of temporary sort relation in a list
- * (someday -- for now we return LispTrue... cim 10/12/89)
- * ----------------
- */
return TRUE;
}
@@ -293,8 +313,6 @@ ExecCountSlotsSort(Sort *node)
/* ----------------------------------------------------------------
* ExecEndSort(node)
- *
- * old comments
* ----------------------------------------------------------------
*/
void
@@ -325,8 +343,13 @@ ExecEndSort(Sort *node)
*/
ExecClearTuple(sortstate->csstate.css_ScanTupleSlot);
- /* Clean up after psort */
- psort_end(node);
+ /* ----------------
+ * Release tuplesort resources
+ * ----------------
+ */
+ if (sortstate->tuplesortstate != NULL)
+ tuplesort_end((Tuplesortstate *) sortstate->tuplesortstate);
+ sortstate->tuplesortstate = NULL;
SO1_printf("ExecEndSort: %s\n",
"sort node shutdown");
@@ -335,51 +358,47 @@ ExecEndSort(Sort *node)
/* ----------------------------------------------------------------
* ExecSortMarkPos
*
- * Calls psort to save the current position in the sorted file.
+ * Calls tuplesort to save the current position in the sorted file.
* ----------------------------------------------------------------
*/
void
ExecSortMarkPos(Sort *node)
{
- SortState *sortstate;
+ SortState *sortstate = node->sortstate;
/* ----------------
* if we haven't sorted yet, just return
* ----------------
*/
- sortstate = node->sortstate;
- if (sortstate->sort_Flag == false)
+ if (! sortstate->sort_Done)
return;
- psort_markpos(node);
-
- return;
+ tuplesort_markpos((Tuplesortstate *) sortstate->tuplesortstate);
}
/* ----------------------------------------------------------------
* ExecSortRestrPos
*
- * Calls psort to restore the last saved sort file position.
+ * Calls tuplesort to restore the last saved sort file position.
* ----------------------------------------------------------------
*/
void
ExecSortRestrPos(Sort *node)
{
- SortState *sortstate;
+ SortState *sortstate = node->sortstate;
/* ----------------
* if we haven't sorted yet, just return.
* ----------------
*/
- sortstate = node->sortstate;
- if (sortstate->sort_Flag == false)
+ if (! sortstate->sort_Done)
return;
/* ----------------
* restore the scan to the previously marked position
* ----------------
*/
- psort_restorepos(node);
+ tuplesort_restorepos((Tuplesortstate *) sortstate->tuplesortstate);
}
void
@@ -392,17 +411,25 @@ ExecReScanSort(Sort *node, ExprContext *exprCtxt, Plan *parent)
* not NULL then it will be re-scanned by ExecProcNode, else - no
* reason to re-scan it at all.
*/
- if (sortstate->sort_Flag == false)
+ if (! sortstate->sort_Done)
return;
ExecClearTuple(sortstate->csstate.cstate.cs_ResultTupleSlot);
- psort_rescan(node);
-
/*
- * If subnode is to be rescanned then we aren't sorted
+ * If subnode is to be rescanned then we forget previous sort
+ * results; we have to re-read the subplan and re-sort.
+ *
+ * Otherwise we can just rewind and rescan the sorted output.
*/
if (((Plan *) node)->lefttree->chgParam != NULL)
- sortstate->sort_Flag = false;
-
+ {
+ sortstate->sort_Done = false;
+ tuplesort_end((Tuplesortstate *) sortstate->tuplesortstate);
+ sortstate->tuplesortstate = NULL;
+ }
+ else
+ {
+ tuplesort_rescan((Tuplesortstate *) sortstate->tuplesortstate);
+ }
}
diff --git a/src/backend/utils/sort/Makefile b/src/backend/utils/sort/Makefile
index d411a89c735..c680a089230 100644
--- a/src/backend/utils/sort/Makefile
+++ b/src/backend/utils/sort/Makefile
@@ -4,7 +4,7 @@
# Makefile for utils/sort
#
# IDENTIFICATION
-# $Header: /cvsroot/pgsql/src/backend/utils/sort/Makefile,v 1.6 1999/10/16 19:49:27 tgl Exp $
+# $Header: /cvsroot/pgsql/src/backend/utils/sort/Makefile,v 1.7 1999/10/17 22:15:05 tgl Exp $
#
#-------------------------------------------------------------------------
@@ -13,7 +13,7 @@ include ../../../Makefile.global
CFLAGS += -I../..
-OBJS = logtape.o lselect.o psort.o
+OBJS = logtape.o tuplesort.o
all: SUBSYS.o
diff --git a/src/backend/utils/sort/logtape.c b/src/backend/utils/sort/logtape.c
index 8d5d34c00a7..46497598b56 100644
--- a/src/backend/utils/sort/logtape.c
+++ b/src/backend/utils/sort/logtape.c
@@ -4,8 +4,8 @@
* Management of "logical tapes" within temporary files.
*
* This module exists to support sorting via multiple merge passes (see
- * psort.c). Merging is an ideal algorithm for tape devices, but if we
- * implement it on disk by creating a separate file for each "tape",
+ * tuplesort.c). Merging is an ideal algorithm for tape devices, but if
+ * we implement it on disk by creating a separate file for each "tape",
* there is an annoying problem: the peak space usage is at least twice
* the volume of actual data to be sorted. (This must be so because each
* datum will appear in both the input and output tapes of the final
@@ -23,7 +23,7 @@
* Few OSes allow arbitrary parts of a file to be released back to the OS,
* so we have to implement this space-recycling ourselves within a single
* logical file. logtape.c exists to perform this bookkeeping and provide
- * the illusion of N independent tape devices to psort.c. Note that
+ * the illusion of N independent tape devices to tuplesort.c. Note that
* logtape.c itself depends on buffile.c to provide a "logical file" of
* larger size than the underlying OS may support.
*
@@ -63,7 +63,7 @@
* Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/utils/sort/logtape.c,v 1.1 1999/10/16 19:49:27 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/utils/sort/logtape.c,v 1.2 1999/10/17 22:15:05 tgl Exp $
*
*-------------------------------------------------------------------------
*/
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
new file mode 100644
index 00000000000..2240564fa25
--- /dev/null
+++ b/src/backend/utils/sort/tuplesort.c
@@ -0,0 +1,1465 @@
+/*-------------------------------------------------------------------------
+ *
+ * tuplesort.c
+ * Generalized tuple sorting routines.
+ *
+ * This module handles sorting of either heap tuples or index tuples
+ * (and could fairly easily support other kinds of sortable objects,
+ * if necessary). It works efficiently for both small and large amounts
+ * of data. Small amounts are sorted in-memory using qsort(). Large
+ * amounts are sorted using temporary files and a standard external sort
+ * algorithm.
+ *
+ * See Knuth, volume 3, for more than you want to know about the external
+ * sorting algorithm. We divide the input into sorted runs using replacement
+ * selection, in the form of a priority tree implemented as a heap
+ * (essentially his Algorithm 5.2.3H), then merge the runs using polyphase
+ * merge, Knuth's Algorithm 5.4.2D. The logical "tapes" used by Algorithm D
+ * are implemented by logtape.c, which avoids space wastage by recycling
+ * disk space as soon as each block is read from its "tape".
+ *
+ * We do not form the initial runs using Knuth's recommended replacement
+ * selection method (Algorithm 5.4.1R), because it uses a fixed number of
+ * records in memory at all times. Since we are dealing with tuples that
+ * may vary considerably in size, we want to be able to vary the number of
+ * records kept in memory to ensure full utilization of the allowed sort
+ * memory space. This is easily done by keeping a variable-size heap in
+ * which the records of the current run are stored, plus a variable-size
+ * unsorted array holding records that must go into the next run.
+ *
+ * The (approximate) amount of memory allowed for any one sort operation
+ * is given in kilobytes by the external variable SortMem. Initially,
+ * we absorb tuples and simply store them in an unsorted array as long as
+ * we haven't exceeded SortMem. If we reach the end of the input without
+ * exceeding SortMem, we sort the array using qsort() and subsequently return
+ * tuples just by scanning the tuple array sequentially. If we do exceed
+ * SortMem, we construct a heap using Algorithm H and begin to emit tuples
+ * into sorted runs in temporary tapes, emitting just enough tuples at each
+ * step to get back within the SortMem limit. New tuples are added to the
+ * heap if they can go into the current run, else they are temporarily added
+ * to the unsorted array. Whenever the heap empties, we construct a new heap
+ * from the current contents of the unsorted array, and begin a new run with a
+ * new output tape (selected per Algorithm D). After the end of the input
+ * is reached, we dump out remaining tuples in memory into a final run
+ * (or two), then merge the runs using Algorithm D.
+ *
+ * When the caller requests random access to the sort result, we form
+ * the final sorted run on a logical tape which is then "frozen", so
+ * that we can access it randomly. When the caller does not need random
+ * access, we return from tuplesort_performsort() as soon as we are down
+ * to one run per logical tape. The final merge is then performed
+ * on-the-fly as the caller repeatedly calls tuplesort_gettuple; this
+ * saves one cycle of writing all the data out to disk and reading it in.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/utils/sort/tuplesort.c,v 1.1 1999/10/17 22:15:05 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/heapam.h"
+#include "access/nbtree.h"
+#include "miscadmin.h"
+#include "utils/logtape.h"
+#include "utils/tuplesort.h"
+
+/*
+ * Possible states of a Tuplesort object. These denote the states that
+ * persist between calls of Tuplesort routines.
+ */
+typedef enum
+{
+ TSS_INITIAL, /* Loading tuples; still within memory limit */
+ TSS_BUILDRUNS, /* Loading tuples; writing to tape */
+ TSS_SORTEDINMEM, /* Sort completed entirely in memory */
+ TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */
+ TSS_FINALMERGE /* Performing final merge on-the-fly */
+} TupSortStatus;
+
+/*
+ * We use a seven-tape polyphase merge, which is the "sweet spot" on the
+ * tapes-to-passes curve according to Knuth's figure 70 (section 5.4.2).
+ */
+#define MAXTAPES 7 /* Knuth's T */
+#define TAPERANGE (MAXTAPES-1) /* Knuth's P */
+
+/*
+ * Private state of a Tuplesort operation.
+ */
+struct Tuplesortstate
+{
+ TupSortStatus status; /* enumerated value as shown above */
+ bool randomAccess; /* did caller request random access? */
+ long availMem; /* remaining memory available, in bytes */
+ LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */
+
+ /*
+ * These function pointers decouple the routines that must know what kind
+ * of tuple we are sorting from the routines that don't need to know it.
+ * They are set up by the tuplesort_begin_xxx routines.
+ *
+ * Function to compare two tuples; result is per qsort() convention,
+ * ie, <0, 0, >0 according as a<b, a=b, a>b.
+ */
+ int (*comparetup) (Tuplesortstate *state, const void *a, const void *b);
+ /*
+ * Function to copy a supplied input tuple into palloc'd space.
+ * (NB: we assume that a single pfree() is enough to release the tuple
+ * later, so the representation must be "flat" in one palloc chunk.)
+ * state->availMem must be decreased by the amount of space used.
+ */
+ void * (*copytup) (Tuplesortstate *state, void *tup);
+ /*
+ * Function to write a stored tuple onto tape. The representation of
+ * the tuple on tape need not be the same as it is in memory; requirements
+ * on the tape representation are given below. After writing the tuple,
+ * pfree() it, and increase state->availMem by the amount of memory space
+ * thereby released.
+ */
+ void (*writetup) (Tuplesortstate *state, int tapenum, void *tup);
+ /*
+ * Function to read a stored tuple from tape back into memory.
+ * 'len' is the already-read length of the stored tuple. Create and
+ * return a palloc'd copy, and decrease state->availMem by the amount
+ * of memory space consumed.
+ */
+ void * (*readtup) (Tuplesortstate *state, int tapenum, unsigned int len);
+
+ /*
+ * This array holds "unsorted" tuples during the input phases.
+ * If we are able to complete the sort in memory, it holds the
+ * final sorted result as well.
+ */
+ void **memtuples; /* array of pointers to palloc'd tuples */
+ int memtupcount; /* number of tuples currently present */
+ int memtupsize; /* allocated length of memtuples array */
+
+ /*
+ * This array holds the partially-sorted "heap" of tuples that will go
+ * out in the current run during BUILDRUNS state. While completing
+ * the sort, we use it to merge runs of tuples from input tapes.
+ * It is never allocated unless we need to use tapes.
+ */
+ void **heaptuples; /* array of pointers to palloc'd tuples */
+ int heaptupcount; /* number of tuples currently present */
+ int heaptupsize; /* allocated length of heaptuples array */
+ /*
+ * While merging, this array holds the actual number of the input tape
+ * that each tuple in heaptuples[] came from.
+ */
+ int *heapsrctapes;
+
+ /*
+ * Variables for Algorithm D. Note that destTape is a "logical" tape
+ * number, ie, an index into the tp_xxx[] arrays. Be careful to keep
+ * "logical" and "actual" tape numbers straight!
+ */
+ int Level; /* Knuth's l */
+ int destTape; /* current output tape (Knuth's j, less 1) */
+ int tp_fib[MAXTAPES]; /* Target Fibonacci run counts (A[]) */
+ int tp_runs[MAXTAPES]; /* # of real runs on each tape */
+ int tp_dummy[MAXTAPES]; /* # of dummy runs for each tape (D[]) */
+ int tp_tapenum[MAXTAPES]; /* Actual tape numbers (TAPE[]) */
+
+ bool multipleRuns; /* T if we have created more than 1 run */
+
+ /*
+ * These variables are used after completion of sorting to keep track
+ * of the next tuple to return. (In the tape case, the tape's current
+ * read position is also critical state.)
+ */
+ int result_tape; /* actual tape number of finished output */
+ int current; /* array index (only used if SORTEDINMEM) */
+ bool eof_reached; /* reached EOF (needed for cursors) */
+
+ /* markpos_xxx holds marked position for mark and restore */
+ long markpos_block; /* tape block# (only used if SORTEDONTAPE) */
+ int markpos_offset; /* saved "current", or offset in tape block */
+ bool markpos_eof; /* saved "eof_reached" */
+
+ /*
+ * These variables are specific to the HeapTuple case; they are set
+ * by tuplesort_begin_heap and used only by the HeapTuple routines.
+ */
+ TupleDesc tupDesc;
+ int nKeys;
+ ScanKey scanKeys;
+
+ /*
+ * These variables are specific to the IndexTuple case; they are set
+ * by tuplesort_begin_index and used only by the IndexTuple routines.
+ */
+ Relation indexRel;
+ bool enforceUnique; /* complain if we find duplicate tuples */
+};
+
+#define COMPARETUP(state,a,b) ((*(state)->comparetup) (state, a, b))
+#define COPYTUP(state,tup) ((*(state)->copytup) (state, tup))
+#define WRITETUP(state,tape,tup) ((*(state)->writetup) (state, tape, tup))
+#define READTUP(state,tape,len) ((*(state)->readtup) (state, tape, len))
+#define LACKMEM(state) ((state)->availMem < 0)
+#define USEMEM(state,amt) ((state)->availMem -= (amt))
+#define FREEMEM(state,amt) ((state)->availMem += (amt))
+
+/*--------------------
+ *
+ * NOTES about on-tape representation of tuples:
+ *
+ * We require the first "unsigned int" of a stored tuple to be the total size
+ * on-tape of the tuple, including itself (so it is never zero; an all-zero
+ * unsigned int is used to delimit runs). The remainder of the stored tuple
+ * may or may not match the in-memory representation of the tuple ---
+ * any conversion needed is the job of the writetup and readtup routines.
+ *
+ * If state->randomAccess is true, then the stored representation of the
+ * tuple must be followed by another "unsigned int" that is a copy of the
+ * length --- so the total tape space used is actually sizeof(unsigned int)
+ * more than the stored length value. This allows read-backwards. When
+ * randomAccess is not true, the write/read routines may omit the extra
+ * length word.
+ *
+ * writetup is expected to write both length words as well as the tuple
+ * data. When readtup is called, the tape is positioned just after the
+ * front length word; readtup must read the tuple data and advance past
+ * the back length word (if present).
+ *
+ * The write/read routines can make use of the tuple description data
+ * stored in the Tuplesortstate record, if needed. They are also expected
+ * to adjust state->availMem by the amount of memory space (not tape space!)
+ * released or consumed. There is no error return from either writetup
+ * or readtup; they should elog() on failure.
+ *
+ *
+ * NOTES about memory consumption calculations:
+ *
+ * We count space requested for tuples against the SortMem limit.
+ * Fixed-size space (primarily the LogicalTapeSet I/O buffers) is not
+ * counted, nor do we count the variable-size memtuples and heaptuples
+ * arrays. (Even though those could grow pretty large, they should be
+ * small compared to the tuples proper, so this is not unreasonable.)
+ *
+ * The major deficiency in this approach is that it ignores palloc overhead.
+ * The memory space actually allocated for a palloc chunk is always more
+ * than the request size, and could be considerably more (as much as 2X
+ * larger, in the current aset.c implementation). So the space used could
+ * be considerably more than SortMem says.
+ *
+ * One way to fix this is to add a memory management function that, given
+ * a pointer to a palloc'd chunk, returns the actual space consumed by the
+ * chunk. This would be very easy in the current aset.c module, but I'm
+ * hesitant to do it because it might be unpleasant to support in future
+ * implementations of memory management. (For example, a direct
+ * implementation of palloc as malloc could not support such a function
+ * portably.)
+ *
+ * A cruder answer is just to apply a fudge factor, say by initializing
+ * availMem to only three-quarters of what SortMem indicates. This is
+ * probably the right answer if anyone complains that SortMem is not being
+ * obeyed very faithfully.
+ *
+ *--------------------
+ */
+
+static Tuplesortstate *tuplesort_begin_common(bool randomAccess);
+static void inittapes(Tuplesortstate *state);
+static void selectnewtape(Tuplesortstate *state);
+static void mergeruns(Tuplesortstate *state);
+static void mergeonerun(Tuplesortstate *state);
+static void beginmerge(Tuplesortstate *state);
+static void beginrun(Tuplesortstate *state);
+static void dumptuples(Tuplesortstate *state, bool alltuples);
+static void tuplesort_heap_insert(Tuplesortstate *state, void *tuple,
+ int tapenum);
+static void tuplesort_heap_siftup(Tuplesortstate *state);
+static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK);
+static void markrunend(Tuplesortstate *state, int tapenum);
+static int qsort_comparetup(const void *a, const void *b);
+static int comparetup_heap(Tuplesortstate *state,
+ const void *a, const void *b);
+static void *copytup_heap(Tuplesortstate *state, void *tup);
+static void writetup_heap(Tuplesortstate *state, int tapenum, void *tup);
+static void *readtup_heap(Tuplesortstate *state, int tapenum,
+ unsigned int len);
+static int comparetup_index(Tuplesortstate *state,
+ const void *a, const void *b);
+static void *copytup_index(Tuplesortstate *state, void *tup);
+static void writetup_index(Tuplesortstate *state, int tapenum, void *tup);
+static void *readtup_index(Tuplesortstate *state, int tapenum,
+ unsigned int len);
+
+/*
+ * Since qsort(3) will not pass any context info to qsort_comparetup(),
+ * we have to use this ugly static variable. It is set to point to the
+ * active Tuplesortstate object just before calling qsort. It should
+ * not be used directly by anything except qsort_comparetup().
+ */
+static Tuplesortstate *qsort_tuplesortstate;
+
+
+/*
+ * tuplesort_begin_xxx
+ *
+ * Initialize for a tuple sort operation.
+ *
+ * After calling tuplesort_begin, the caller should call tuplesort_puttuple
+ * zero or more times, then call tuplesort_performsort when all the tuples
+ * have been supplied. After performsort, retrieve the tuples in sorted
+ * order by calling tuplesort_gettuple until it returns NULL. (If random
+ * access was requested, rescan, markpos, and restorepos can also be called.)
+ * Call tuplesort_end to terminate the operation and release memory/disk space.
+ */
+
+static Tuplesortstate *
+tuplesort_begin_common(bool randomAccess)
+{
+ Tuplesortstate *state;
+
+ state = (Tuplesortstate *) palloc(sizeof(Tuplesortstate));
+
+ MemSet((char *) state, 0, sizeof(Tuplesortstate));
+
+ state->status = TSS_INITIAL;
+ state->randomAccess = randomAccess;
+ state->availMem = SortMem * 1024L;
+ state->tapeset = NULL;
+
+ state->memtupcount = 0;
+ state->memtupsize = 1024; /* initial guess */
+ state->memtuples = (void **) palloc(state->memtupsize * sizeof(void *));
+
+ state->heaptuples = NULL; /* until and unless needed */
+ state->heaptupcount = 0;
+ state->heaptupsize = 0;
+ state->heapsrctapes = NULL;
+
+ /* Algorithm D variables will be initialized by inittapes, if needed */
+
+ state->result_tape = -1; /* flag that result tape has not been formed */
+
+ return state;
+}
+
+Tuplesortstate *
+tuplesort_begin_heap(TupleDesc tupDesc,
+ int nkeys, ScanKey keys,
+ bool randomAccess)
+{
+ Tuplesortstate *state = tuplesort_begin_common(randomAccess);
+
+ AssertArg(nkeys >= 1);
+ AssertArg(keys[0].sk_attno != 0);
+ AssertArg(keys[0].sk_procedure != 0);
+
+ state->comparetup = comparetup_heap;
+ state->copytup = copytup_heap;
+ state->writetup = writetup_heap;
+ state->readtup = readtup_heap;
+
+ state->tupDesc = tupDesc;
+ state->nKeys = nkeys;
+ state->scanKeys = keys;
+
+ return state;
+}
+
+Tuplesortstate *
+tuplesort_begin_index(Relation indexRel,
+ bool enforceUnique,
+ bool randomAccess)
+{
+ Tuplesortstate *state = tuplesort_begin_common(randomAccess);
+
+ state->comparetup = comparetup_index;
+ state->copytup = copytup_index;
+ state->writetup = writetup_index;
+ state->readtup = readtup_index;
+
+ state->indexRel = indexRel;
+ state->enforceUnique = enforceUnique;
+
+ return state;
+}
+
+/*
+ * tuplesort_end
+ *
+ * Release resources and clean up.
+ */
+void
+tuplesort_end(Tuplesortstate *state)
+{
+ int i;
+
+ if (state->tapeset)
+ LogicalTapeSetClose(state->tapeset);
+ if (state->memtuples)
+ {
+ for (i = 0; i < state->memtupcount; i++)
+ pfree(state->memtuples[i]);
+ pfree(state->memtuples);
+ }
+ if (state->heaptuples)
+ {
+ for (i = 0; i < state->heaptupcount; i++)
+ pfree(state->heaptuples[i]);
+ pfree(state->heaptuples);
+ }
+ if (state->heapsrctapes)
+ pfree(state->heapsrctapes);
+}
+
+/*
+ * Accept one tuple while collecting input data for sort.
+ *
+ * Note that the input tuple is always copied; the caller need not save it.
+ */
+void
+tuplesort_puttuple(Tuplesortstate *state, void *tuple)
+{
+ /*
+ * Copy the given tuple into memory we control, and decrease availMem.
+ */
+ tuple = COPYTUP(state, tuple);
+
+ switch (state->status)
+ {
+ case TSS_INITIAL:
+ /*
+ * Save the copied tuple into the unsorted array.
+ */
+ if (state->memtupcount >= state->memtupsize)
+ {
+ /* Grow the unsorted array as needed. */
+ state->memtupsize *= 2;
+ state->memtuples = (void **)
+ repalloc(state->memtuples,
+ state->memtupsize * sizeof(void *));
+ }
+ state->memtuples[state->memtupcount++] = tuple;
+ /*
+ * Done if we still fit in available memory.
+ */
+ if (! LACKMEM(state))
+ return;
+ /*
+ * Nope; time to switch to tape-based operation.
+ */
+ inittapes(state);
+ beginrun(state);
+ /*
+ * Dump tuples until we are back under the limit.
+ */
+ dumptuples(state, false);
+ break;
+ case TSS_BUILDRUNS:
+ /*
+ * Insert the copied tuple into the heap if it can go into the
+ * current run; otherwise add it to the unsorted array, whence
+ * it will go into the next run.
+ *
+ * The tuple can go into the current run if it is >= the first
+ * not-yet-output tuple. (Actually, it could go into the current
+ * run if it is >= the most recently output tuple ... but that
+ * would require keeping around the tuple we last output, and
+ * it's simplest to let writetup free the tuple when written.)
+ *
+ * Note there will always be at least one tuple in the heap
+ * at this point; see dumptuples.
+ */
+ Assert(state->heaptupcount > 0);
+ if (COMPARETUP(state, tuple, state->heaptuples[0]) >= 0)
+ {
+ tuplesort_heap_insert(state, tuple, 0);
+ }
+ else
+ {
+ if (state->memtupcount >= state->memtupsize)
+ {
+ /* Grow the unsorted array as needed. */
+ state->memtupsize *= 2;
+ state->memtuples = (void **)
+ repalloc(state->memtuples,
+ state->memtupsize * sizeof(void *));
+ }
+ state->memtuples[state->memtupcount++] = tuple;
+ }
+ /*
+ * If we are over the memory limit, dump tuples till we're under.
+ */
+ dumptuples(state, false);
+ break;
+ default:
+ elog(ERROR, "tuplesort_puttuple: invalid state");
+ break;
+ }
+}
+
+/*
+ * All tuples have been provided; finish the sort.
+ */
+void
+tuplesort_performsort(Tuplesortstate *state)
+{
+ switch (state->status)
+ {
+ case TSS_INITIAL:
+ /*
+ * We were able to accumulate all the tuples within the
+ * allowed amount of memory. Just qsort 'em and we're done.
+ */
+ if (state->memtupcount > 1)
+ {
+ qsort_tuplesortstate = state;
+ qsort((void *) state->memtuples, state->memtupcount,
+ sizeof(void *), qsort_comparetup);
+ }
+ state->current = 0;
+ state->eof_reached = false;
+ state->markpos_offset = 0;
+ state->markpos_eof = false;
+ state->status = TSS_SORTEDINMEM;
+ break;
+ case TSS_BUILDRUNS:
+ /*
+ * Finish tape-based sort. First, flush all tuples remaining
+ * in memory out to tape; then merge until we have a single
+ * remaining run (or, if !randomAccess, one run per tape).
+ * Note that mergeruns sets the correct status.
+ */
+ dumptuples(state, true);
+ mergeruns(state);
+ state->eof_reached = false;
+ state->markpos_block = 0L;
+ state->markpos_offset = 0;
+ state->markpos_eof = false;
+ break;
+ default:
+ elog(ERROR, "tuplesort_performsort: invalid state");
+ break;
+ }
+}
+
+/*
+ * Fetch the next tuple in either forward or back direction.
+ * Returns NULL if no more tuples. If should_free is set, the
+ * caller must pfree the returned tuple when done with it.
+ */
+void *
+tuplesort_gettuple(Tuplesortstate *state, bool forward,
+ bool *should_free)
+{
+ unsigned int tuplen;
+ void *tup;
+
+ switch (state->status)
+ {
+ case TSS_SORTEDINMEM:
+ Assert(forward || state->randomAccess);
+ *should_free = false;
+ if (forward)
+ {
+ if (state->current < state->memtupcount)
+ return state->memtuples[state->current++];
+ state->eof_reached = true;
+ return NULL;
+ }
+ else
+ {
+ if (state->current <= 0)
+ return NULL;
+				/*
+				 * If all tuples were already fetched, return the last tuple;
+				 * otherwise return the tuple before the last one returned.
+				 */
+ if (state->eof_reached)
+ state->eof_reached = false;
+ else
+ {
+ state->current--; /* last returned tuple */
+ if (state->current <= 0)
+ return NULL;
+ }
+ return state->memtuples[state->current - 1];
+ }
+ break;
+
+ case TSS_SORTEDONTAPE:
+ Assert(forward || state->randomAccess);
+ *should_free = true;
+ if (forward)
+ {
+ if (state->eof_reached)
+ return NULL;
+ if ((tuplen = getlen(state, state->result_tape, true)) != 0)
+ {
+ tup = READTUP(state, state->result_tape, tuplen);
+ return tup;
+ }
+ else
+ {
+ state->eof_reached = true;
+ return NULL;
+ }
+ }
+			/* Backward.
+			 *
+			 * If all tuples were already fetched, return the last tuple;
+			 * otherwise return the tuple before the last one returned.
+			 */
+ if (state->eof_reached)
+ {
+ /*
+ * Seek position is pointing just past the zero tuplen
+ * at the end of file; back up to fetch last tuple's ending
+ * length word. If seek fails we must have a completely empty
+ * file.
+ */
+ if (! LogicalTapeBackspace(state->tapeset,
+ state->result_tape,
+ 2 * sizeof(unsigned int)))
+ return NULL;
+ state->eof_reached = false;
+ }
+ else
+ {
+ /*
+ * Back up and fetch previously-returned tuple's ending length
+ * word. If seek fails, assume we are at start of file.
+ */
+ if (! LogicalTapeBackspace(state->tapeset,
+ state->result_tape,
+ sizeof(unsigned int)))
+ return NULL;
+ tuplen = getlen(state, state->result_tape, false);
+ /*
+ * Back up to get ending length word of tuple before it.
+ */
+ if (! LogicalTapeBackspace(state->tapeset,
+ state->result_tape,
+ tuplen + 2 * sizeof(unsigned int)))
+ {
+ /* If that fails, presumably the prev tuple is the first
+ * in the file. Back up so that it becomes next to read
+ * in forward direction (not obviously right, but that is
+ * what in-memory case does).
+ */
+ if (! LogicalTapeBackspace(state->tapeset,
+ state->result_tape,
+ tuplen + sizeof(unsigned int)))
+ elog(ERROR, "tuplesort_gettuple: bogus tuple len in backward scan");
+ return NULL;
+ }
+ }
+
+ tuplen = getlen(state, state->result_tape, false);
+ /*
+ * Now we have the length of the prior tuple, back up and read it.
+ * Note: READTUP expects we are positioned after the initial
+ * length word of the tuple, so back up to that point.
+ */
+ if (! LogicalTapeBackspace(state->tapeset,
+ state->result_tape,
+ tuplen))
+ elog(ERROR, "tuplesort_gettuple: bogus tuple len in backward scan");
+ tup = READTUP(state, state->result_tape, tuplen);
+ return tup;
+
+ case TSS_FINALMERGE:
+ Assert(forward);
+ *should_free = true;
+ /*
+ * This code should match the inner loop of mergeonerun().
+ */
+ if (state->heaptupcount > 0)
+ {
+ int srcTape = state->heapsrctapes[0];
+
+ tup = state->heaptuples[0];
+ tuplesort_heap_siftup(state);
+ if ((tuplen = getlen(state, srcTape, true)) != 0)
+ {
+ void *newtup = READTUP(state, srcTape, tuplen);
+ tuplesort_heap_insert(state, newtup, srcTape);
+ }
+ return tup;
+ }
+ return NULL;
+
+ default:
+ elog(ERROR, "tuplesort_gettuple: invalid state");
+ return NULL; /* keep compiler quiet */
+ }
+}
+
+/*
+ * inittapes - initialize for tape sorting.
+ *
+ * This is called only if we have found we don't have room to sort in memory.
+ */
+static void
+inittapes(Tuplesortstate *state)
+{
+ int j;
+
+ state->tapeset = LogicalTapeSetCreate(MAXTAPES);
+
+ /*
+ * Initialize heaptuples array slightly larger than current memtuples
+ * usage; memtupcount is probably a good guess at how many tuples we
+ * will be able to have in the heap at once.
+ */
+ state->heaptupcount = 0;
+ state->heaptupsize = state->memtupcount + state->memtupcount / 4;
+ state->heaptuples = (void **) palloc(state->heaptupsize * sizeof(void *));
+
+ /*
+ * Initialize variables of Algorithm D (step D1).
+ */
+ for (j = 0; j < MAXTAPES; j++)
+ {
+ state->tp_fib[j] = 1;
+ state->tp_runs[j] = 0;
+ state->tp_dummy[j] = 1;
+ state->tp_tapenum[j] = j;
+ }
+ state->tp_fib[TAPERANGE] = 0;
+ state->tp_dummy[TAPERANGE] = 0;
+
+ state->Level = 1;
+ state->destTape = 0;
+
+ state->multipleRuns = false;
+
+ state->status = TSS_BUILDRUNS;
+}
+
+/*
+ * selectnewtape -- select new tape for new initial run.
+ *
+ * This is called after finishing a run when we know another run
+ * must be started. This implements steps D3, D4 of Algorithm D.
+ */
+static void
+selectnewtape(Tuplesortstate *state)
+{
+ int j;
+ int a;
+
+ /* We now have at least two initial runs */
+ state->multipleRuns = true;
+
+ /* Step D3: advance j (destTape) */
+ if (state->tp_dummy[state->destTape] < state->tp_dummy[state->destTape+1])
+ {
+ state->destTape++;
+ return;
+ }
+ if (state->tp_dummy[state->destTape] != 0)
+ {
+ state->destTape = 0;
+ return;
+ }
+
+ /* Step D4: increase level */
+ state->Level++;
+ a = state->tp_fib[0];
+ for (j = 0; j < TAPERANGE; j++)
+ {
+ state->tp_dummy[j] = a + state->tp_fib[j+1] - state->tp_fib[j];
+ state->tp_fib[j] = a + state->tp_fib[j+1];
+ }
+ state->destTape = 0;
+}
+
+/*
+ * mergeruns -- merge all the completed initial runs.
+ *
+ * This implements steps D5, D6 of Algorithm D. All input data has
+ * already been written to initial runs on tape (see dumptuples).
+ */
+static void
+mergeruns(Tuplesortstate *state)
+{
+ int tapenum,
+ svTape,
+ svRuns,
+ svDummy;
+
+ Assert(state->status == TSS_BUILDRUNS);
+ Assert(state->memtupcount == 0 && state->heaptupcount == 0);
+ /*
+ * If we produced only one initial run (quite likely if the total
+ * data volume is between 1X and 2X SortMem), we can just use that
+ * tape as the finished output, rather than doing a useless merge.
+ */
+ if (! state->multipleRuns)
+ {
+ state->result_tape = state->tp_tapenum[state->destTape];
+ /* must freeze and rewind the finished output tape */
+ LogicalTapeFreeze(state->tapeset, state->result_tape);
+ state->status = TSS_SORTEDONTAPE;
+ return;
+ }
+
+ /* End of step D2: rewind all output tapes to prepare for merging */
+ for (tapenum = 0; tapenum < TAPERANGE; tapenum++)
+ LogicalTapeRewind(state->tapeset, tapenum, false);
+
+ for (;;)
+ {
+ /* Step D5: merge runs onto tape[T] until tape[P] is empty */
+ while (state->tp_runs[TAPERANGE-1] || state->tp_dummy[TAPERANGE-1])
+ {
+ bool allDummy = true;
+ bool allOneRun = true;
+
+ for (tapenum = 0; tapenum < TAPERANGE; tapenum++)
+ {
+ if (state->tp_dummy[tapenum] == 0)
+ allDummy = false;
+ if (state->tp_runs[tapenum] + state->tp_dummy[tapenum] != 1)
+ allOneRun = false;
+ }
+ /*
+ * If we don't have to produce a materialized sorted tape,
+ * quit as soon as we're down to one real/dummy run per tape.
+ */
+ if (! state->randomAccess && allOneRun)
+ {
+ Assert(! allDummy);
+ /* Initialize for the final merge pass */
+ beginmerge(state);
+ state->status = TSS_FINALMERGE;
+ return;
+ }
+ if (allDummy)
+ {
+ state->tp_dummy[TAPERANGE]++;
+ for (tapenum = 0; tapenum < TAPERANGE; tapenum++)
+ state->tp_dummy[tapenum]--;
+ }
+ else
+ {
+ mergeonerun(state);
+ }
+ }
+ /* Step D6: decrease level */
+ if (--state->Level == 0)
+ break;
+ /* rewind output tape T to use as new input */
+ LogicalTapeRewind(state->tapeset, state->tp_tapenum[TAPERANGE],
+ false);
+ /* rewind used-up input tape P, and prepare it for write pass */
+ LogicalTapeRewind(state->tapeset, state->tp_tapenum[TAPERANGE-1],
+ true);
+ state->tp_runs[TAPERANGE-1] = 0;
+ /* reassign tape units per step D6; note we no longer care about A[] */
+ svTape = state->tp_tapenum[TAPERANGE];
+ svDummy = state->tp_dummy[TAPERANGE];
+ svRuns = state->tp_runs[TAPERANGE];
+ for (tapenum = TAPERANGE; tapenum > 0; tapenum--)
+ {
+ state->tp_tapenum[tapenum] = state->tp_tapenum[tapenum-1];
+ state->tp_dummy[tapenum] = state->tp_dummy[tapenum-1];
+ state->tp_runs[tapenum] = state->tp_runs[tapenum-1];
+ }
+ state->tp_tapenum[0] = svTape;
+ state->tp_dummy[0] = svDummy;
+ state->tp_runs[0] = svRuns;
+ }
+ /*
+ * Done. Knuth says that the result is on TAPE[1], but since we exited
+ * the loop without performing the last iteration of step D6, we have not
+ * rearranged the tape unit assignment, and therefore the result is on
+ * TAPE[T]. We need to do it this way so that we can freeze the final
+ * output tape while rewinding it. The last iteration of step D6 would
+ * be a waste of cycles anyway...
+ */
+ state->result_tape = state->tp_tapenum[TAPERANGE];
+ LogicalTapeFreeze(state->tapeset, state->result_tape);
+ state->status = TSS_SORTEDONTAPE;
+}
+
+/*
+ * Merge one run from each input tape, except ones with dummy runs.
+ *
+ * This is the inner loop of Algorithm D step D5. We know that the
+ * output tape is TAPE[T].
+ */
+static void
+mergeonerun(Tuplesortstate *state)
+{
+ int destTape = state->tp_tapenum[TAPERANGE];
+ int srcTape;
+ unsigned int tuplen;
+ void *tup;
+
+ /*
+ * Start the merge by loading one tuple from each active source tape
+ * into the heap. We can also decrease the input run/dummy run counts.
+ */
+ beginmerge(state);
+
+ /*
+ * Execute merge by repeatedly extracting lowest tuple in heap,
+ * writing it out, and replacing it with next tuple from same tape
+ * (if there is another one).
+ */
+ while (state->heaptupcount > 0)
+ {
+ WRITETUP(state, destTape, state->heaptuples[0]);
+ srcTape = state->heapsrctapes[0];
+ tuplesort_heap_siftup(state);
+ if ((tuplen = getlen(state, srcTape, true)) != 0)
+ {
+ tup = READTUP(state, srcTape, tuplen);
+ tuplesort_heap_insert(state, tup, srcTape);
+ }
+ }
+
+ /*
+ * When the heap empties, we're done. Write an end-of-run marker
+ * on the output tape, and increment its count of real runs.
+ */
+ markrunend(state, destTape);
+ state->tp_runs[TAPERANGE]++;
+}
+
+/*
+ * beginmerge - initialize for a merge pass
+ *
+ * We load the first tuple from each nondummy input run into the heap.
+ * We also decrease the counts of real and dummy runs for each tape.
+ */
+static void
+beginmerge(Tuplesortstate *state)
+{
+ int tapenum;
+ int srcTape;
+ unsigned int tuplen;
+ void *tup;
+
+ Assert(state->heaptuples != NULL && state->heaptupcount == 0);
+ if (state->heapsrctapes == NULL)
+ state->heapsrctapes = (int *) palloc(MAXTAPES * sizeof(int));
+
+ for (tapenum = 0; tapenum < TAPERANGE; tapenum++)
+ {
+ if (state->tp_dummy[tapenum] > 0)
+ {
+ state->tp_dummy[tapenum]--;
+ }
+ else
+ {
+ Assert(state->tp_runs[tapenum] > 0);
+ state->tp_runs[tapenum]--;
+ srcTape = state->tp_tapenum[tapenum];
+ tuplen = getlen(state, srcTape, false);
+ tup = READTUP(state, srcTape, tuplen);
+ tuplesort_heap_insert(state, tup, srcTape);
+ }
+ }
+
+}
+
+/*
+ * beginrun - start a new initial run
+ *
+ * The tuples presently in the unsorted memory array are moved into
+ * the heap.
+ */
+static void
+beginrun(Tuplesortstate *state)
+{
+ int i;
+
+ Assert(state->heaptupcount == 0 && state->memtupcount > 0);
+ for (i = 0; i < state->memtupcount; i++)
+ tuplesort_heap_insert(state, state->memtuples[i], 0);
+ state->memtupcount = 0;
+}
+
+/*
+ * dumptuples - remove tuples from heap and write to tape
+ *
+ * When alltuples = false, dump only enough tuples to get under the
+ * availMem limit (and leave at least one tuple in the heap in any case,
+ * since puttuple assumes it always has a tuple to compare to).
+ *
+ * When alltuples = true, dump everything currently in memory.
+ * (This case is only used at end of input data.)
+ *
+ * If we empty the heap, then start a new run using the tuples that
+ * have accumulated in memtuples[] (if any).
+ */
+static void
+dumptuples(Tuplesortstate *state, bool alltuples)
+{
+ while (alltuples ||
+ (LACKMEM(state) &&
+ (state->heaptupcount > 0 || state->memtupcount > 0)))
+ {
+ /*
+ * Dump the heap's frontmost entry, and sift up to remove it
+ * from the heap.
+ */
+ Assert(state->heaptupcount > 0);
+ WRITETUP(state, state->tp_tapenum[state->destTape],
+ state->heaptuples[0]);
+ tuplesort_heap_siftup(state);
+ /*
+ * If the heap is now empty, we've finished a run.
+ */
+ if (state->heaptupcount == 0)
+ {
+ markrunend(state, state->tp_tapenum[state->destTape]);
+ state->tp_runs[state->destTape]++;
+ state->tp_dummy[state->destTape]--; /* per Alg D step D2 */
+ if (state->memtupcount == 0)
+ break; /* all input data has been written to tape */
+ /* Select new output tape and start a new run */
+ selectnewtape(state);
+ beginrun(state);
+ }
+ }
+}
+
+/*
+ * tuplesort_rescan - rewind and replay the scan
+ */
+void
+tuplesort_rescan(Tuplesortstate *state)
+{
+ Assert(state->randomAccess);
+
+ switch (state->status)
+ {
+ case TSS_SORTEDINMEM:
+ state->current = 0;
+ state->eof_reached = false;
+ state->markpos_offset = 0;
+ state->markpos_eof = false;
+ break;
+ case TSS_SORTEDONTAPE:
+ LogicalTapeRewind(state->tapeset,
+ state->result_tape,
+ false);
+ state->eof_reached = false;
+ state->markpos_block = 0L;
+ state->markpos_offset = 0;
+ state->markpos_eof = false;
+ break;
+ default:
+ elog(ERROR, "tuplesort_rescan: invalid state");
+ break;
+ }
+}
+
+/*
+ * tuplesort_markpos - saves current position in the merged sort file
+ */
+void
+tuplesort_markpos(Tuplesortstate *state)
+{
+ Assert(state->randomAccess);
+
+ switch (state->status)
+ {
+ case TSS_SORTEDINMEM:
+ state->markpos_offset = state->current;
+ state->markpos_eof = state->eof_reached;
+ break;
+ case TSS_SORTEDONTAPE:
+ LogicalTapeTell(state->tapeset,
+ state->result_tape,
+ & state->markpos_block,
+ & state->markpos_offset);
+ state->markpos_eof = state->eof_reached;
+ break;
+ default:
+ elog(ERROR, "tuplesort_markpos: invalid state");
+ break;
+ }
+}
+
+/*
+ * tuplesort_restorepos - restores current position in merged sort file to
+ * last saved position
+ */
+void
+tuplesort_restorepos(Tuplesortstate *state)
+{
+ Assert(state->randomAccess);
+
+ switch (state->status)
+ {
+ case TSS_SORTEDINMEM:
+ state->current = state->markpos_offset;
+ state->eof_reached = state->markpos_eof;
+ break;
+ case TSS_SORTEDONTAPE:
+ if (! LogicalTapeSeek(state->tapeset,
+ state->result_tape,
+ state->markpos_block,
+ state->markpos_offset))
+ elog(ERROR, "tuplesort_restorepos failed");
+ state->eof_reached = state->markpos_eof;
+ break;
+ default:
+ elog(ERROR, "tuplesort_restorepos: invalid state");
+ break;
+ }
+}
+
+
+/*
+ * Heap manipulation routines, per Knuth's Algorithm 5.2.3H.
+ */
+
+/*
+ * Insert a new tuple into an empty or existing heap, maintaining the
+ * heap invariant. The heap lives in state->heaptuples[]. Also, if
+ * state->heapsrctapes is not NULL, we store each tuple's source tapenum
+ * in the corresponding element of state->heapsrctapes[].
+ */
+static void
+tuplesort_heap_insert(Tuplesortstate *state, void *tuple,
+ int tapenum)
+{
+ int j;
+
+ /*
+ * Make sure heaptuples[] can handle another entry.
+ * NOTE: we do not enlarge heapsrctapes[]; it's supposed
+ * to be big enough when created.
+ */
+ if (state->heaptupcount >= state->heaptupsize)
+ {
+		/* Grow the heap array as needed. */
+ state->heaptupsize *= 2;
+ state->heaptuples = (void **)
+ repalloc(state->heaptuples,
+ state->heaptupsize * sizeof(void *));
+ }
+ /*
+ * Sift-up the new entry, per Knuth 5.2.3 exercise 16.
+ * Note that Knuth is using 1-based array indexes, not 0-based.
+ */
+ j = state->heaptupcount++;
+ while (j > 0) {
+ int i = (j-1) >> 1;
+
+ if (COMPARETUP(state, tuple, state->heaptuples[i]) >= 0)
+ break;
+ state->heaptuples[j] = state->heaptuples[i];
+ if (state->heapsrctapes)
+ state->heapsrctapes[j] = state->heapsrctapes[i];
+ j = i;
+ }
+ state->heaptuples[j] = tuple;
+ if (state->heapsrctapes)
+ state->heapsrctapes[j] = tapenum;
+}
+
+/*
+ * The tuple at state->heaptuples[0] has been removed from the heap.
+ * Decrement heaptupcount, and sift up to maintain the heap invariant.
+ */
+static void
+tuplesort_heap_siftup(Tuplesortstate *state)
+{
+ void **heaptuples = state->heaptuples;
+ void *tuple;
+ int i,
+ n;
+
+ if (--state->heaptupcount <= 0)
+ return;
+ n = state->heaptupcount;
+ tuple = heaptuples[n]; /* tuple that must be reinserted */
+ i = 0; /* i is where the "hole" is */
+ for (;;) {
+ int j = 2*i + 1;
+
+ if (j >= n)
+ break;
+ if (j+1 < n &&
+ COMPARETUP(state, heaptuples[j], heaptuples[j+1]) > 0)
+ j++;
+ if (COMPARETUP(state, tuple, heaptuples[j]) <= 0)
+ break;
+ heaptuples[i] = heaptuples[j];
+ if (state->heapsrctapes)
+ state->heapsrctapes[i] = state->heapsrctapes[j];
+ i = j;
+ }
+ heaptuples[i] = tuple;
+ if (state->heapsrctapes)
+ state->heapsrctapes[i] = state->heapsrctapes[n];
+}
+
+
+/*
+ * Tape interface routines
+ */
+
+static unsigned int
+getlen(Tuplesortstate *state, int tapenum, bool eofOK)
+{
+ unsigned int len;
+
+ if (LogicalTapeRead(state->tapeset, tapenum, (void *) &len,
+ sizeof(len)) != sizeof(len))
+ elog(ERROR, "tuplesort: unexpected end of tape");
+ if (len == 0 && !eofOK)
+ elog(ERROR, "tuplesort: unexpected end of data");
+ return len;
+}
+
+static void
+markrunend(Tuplesortstate *state, int tapenum)
+{
+ unsigned int len = 0;
+
+ LogicalTapeWrite(state->tapeset, tapenum, (void *) &len, sizeof(len));
+}
+
+
+/*
+ * qsort interface
+ */
+
+static int
+qsort_comparetup(const void *a, const void *b)
+{
+ /* The passed pointers are pointers to void * ... */
+
+ return COMPARETUP(qsort_tuplesortstate, * (void **) a, * (void **) b);
+}
+
+
+/*
+ * Routines specialized for HeapTuple case
+ */
+
+static int
+comparetup_heap(Tuplesortstate *state, const void *a, const void *b)
+{
+ HeapTuple ltup = (HeapTuple) a;
+ HeapTuple rtup = (HeapTuple) b;
+ int nkey;
+
+ for (nkey = 0; nkey < state->nKeys; nkey++)
+ {
+ ScanKey scanKey = state->scanKeys + nkey;
+ Datum lattr,
+ rattr;
+ bool isnull1,
+ isnull2;
+ int result;
+
+ lattr = heap_getattr(ltup,
+ scanKey->sk_attno,
+ state->tupDesc,
+ &isnull1);
+ rattr = heap_getattr(rtup,
+ scanKey->sk_attno,
+ state->tupDesc,
+ &isnull2);
+ if (isnull1)
+ {
+ if (!isnull2)
+ return 1; /* NULL sorts after non-NULL */
+ }
+ else if (isnull2)
+ return -1;
+ else if (scanKey->sk_flags & SK_COMMUTE)
+ {
+ if (!(result = - (int) (*fmgr_faddr(&scanKey->sk_func)) (rattr, lattr)))
+ result = (int) (*fmgr_faddr(&scanKey->sk_func)) (lattr, rattr);
+ if (result)
+ return result;
+ }
+ else
+ {
+ if (!(result = - (int) (*fmgr_faddr(&scanKey->sk_func)) (lattr, rattr)))
+ result = (int) (*fmgr_faddr(&scanKey->sk_func)) (rattr, lattr);
+ if (result)
+ return result;
+ }
+ }
+
+ return 0;
+}
+
+static void *
+copytup_heap(Tuplesortstate *state, void *tup)
+{
+ HeapTuple tuple = (HeapTuple) tup;
+
+ USEMEM(state, HEAPTUPLESIZE + tuple->t_len);
+ return (void *) heap_copytuple(tuple);
+}
+
+/*
+ * We don't bother to write the HeapTupleData part of the tuple.
+ */
+
+static void
+writetup_heap(Tuplesortstate *state, int tapenum, void *tup)
+{
+ HeapTuple tuple = (HeapTuple) tup;
+ unsigned int tuplen;
+
+ tuplen = tuple->t_len + sizeof(tuplen);
+ LogicalTapeWrite(state->tapeset, tapenum,
+ (void*) &tuplen, sizeof(tuplen));
+ LogicalTapeWrite(state->tapeset, tapenum,
+ (void*) tuple->t_data, tuple->t_len);
+ if (state->randomAccess) /* need trailing length word? */
+ LogicalTapeWrite(state->tapeset, tapenum,
+ (void*) &tuplen, sizeof(tuplen));
+
+ FREEMEM(state, HEAPTUPLESIZE + tuple->t_len);
+ pfree(tuple);
+}
+
+static void *
+readtup_heap(Tuplesortstate *state, int tapenum, unsigned int len)
+{
+ unsigned int tuplen = len - sizeof(unsigned int) + HEAPTUPLESIZE;
+ HeapTuple tuple = (HeapTuple) palloc(tuplen);
+
+ USEMEM(state, tuplen);
+ /* reconstruct the HeapTupleData portion */
+ tuple->t_len = len - sizeof(unsigned int);
+ ItemPointerSetInvalid(&(tuple->t_self));
+ tuple->t_data = (HeapTupleHeader) (((char *) tuple) + HEAPTUPLESIZE);
+ /* read in the tuple proper */
+ if (LogicalTapeRead(state->tapeset, tapenum, (void *) tuple->t_data,
+ tuple->t_len) != tuple->t_len)
+ elog(ERROR, "tuplesort: unexpected end of data");
+ if (state->randomAccess) /* need trailing length word? */
+ if (LogicalTapeRead(state->tapeset, tapenum, (void *) &tuplen,
+ sizeof(tuplen)) != sizeof(tuplen))
+ elog(ERROR, "tuplesort: unexpected end of data");
+ return (void *) tuple;
+}
+
+
+/*
+ * Routines specialized for IndexTuple case
+ *
+ * NOTE: actually, these are specialized for the btree case; it's not
+ * clear whether you could use them for a non-btree index. Possibly
+ * you'd need to make another set of routines if you needed to sort
+ * according to another kind of index.
+ */
+
+static int
+comparetup_index(Tuplesortstate *state, const void *a, const void *b)
+{
+ IndexTuple ltup = (IndexTuple) a;
+ IndexTuple rtup = (IndexTuple) b;
+ TupleDesc itdesc = state->indexRel->rd_att;
+ bool equal_isnull = false;
+ Datum lattr,
+ rattr;
+ bool isnull1,
+ isnull2;
+ int i;
+
+ for (i = 0; i < itdesc->natts; i++)
+ {
+ lattr = index_getattr(ltup, i + 1, itdesc, &isnull1);
+ rattr = index_getattr(rtup, i + 1, itdesc, &isnull2);
+
+ if (isnull1)
+ {
+ if (!isnull2)
+ return 1; /* NULL sorts after non-NULL */
+ equal_isnull = true;
+ continue;
+ }
+ else if (isnull2)
+ return -1;
+
+ if (_bt_invokestrat(state->indexRel, i + 1,
+ BTGreaterStrategyNumber,
+ lattr, rattr))
+ return 1;
+ if (_bt_invokestrat(state->indexRel, i + 1,
+ BTGreaterStrategyNumber,
+ rattr, lattr))
+ return -1;
+ }
+
+ /*
+ * If btree has asked us to enforce uniqueness, complain if two equal
+ * tuples are detected (unless there was at least one NULL field).
+ *
+ * It is sufficient to make the test here, because if two tuples are
+ * equal they *must* get compared at some stage of the sort --- otherwise
+ * the sort algorithm wouldn't have checked whether one must appear
+ * before the other.
+ */
+ if (state->enforceUnique && !equal_isnull)
+ elog(ERROR, "Cannot create unique index. Table contains non-unique values");
+
+ return 0;
+}
+
+static void *
+copytup_index(Tuplesortstate *state, void *tup)
+{
+ IndexTuple tuple = (IndexTuple) tup;
+ unsigned int tuplen = IndexTupleSize(tuple);
+ IndexTuple newtuple;
+
+ USEMEM(state, tuplen);
+ newtuple = (IndexTuple) palloc(tuplen);
+ memcpy(newtuple, tuple, tuplen);
+
+ return (void *) newtuple;
+}
+
+static void
+writetup_index(Tuplesortstate *state, int tapenum, void *tup)
+{
+ IndexTuple tuple = (IndexTuple) tup;
+ unsigned int tuplen;
+
+ tuplen = IndexTupleSize(tuple) + sizeof(tuplen);
+ LogicalTapeWrite(state->tapeset, tapenum,
+ (void*) &tuplen, sizeof(tuplen));
+ LogicalTapeWrite(state->tapeset, tapenum,
+ (void*) tuple, IndexTupleSize(tuple));
+ if (state->randomAccess) /* need trailing length word? */
+ LogicalTapeWrite(state->tapeset, tapenum,
+ (void*) &tuplen, sizeof(tuplen));
+
+ FREEMEM(state, IndexTupleSize(tuple));
+ pfree(tuple);
+}
+
+static void *
+readtup_index(Tuplesortstate *state, int tapenum, unsigned int len)
+{
+ unsigned int tuplen = len - sizeof(unsigned int);
+ IndexTuple tuple = (IndexTuple) palloc(tuplen);
+
+ USEMEM(state, tuplen);
+ if (LogicalTapeRead(state->tapeset, tapenum, (void *) tuple,
+ tuplen) != tuplen)
+ elog(ERROR, "tuplesort: unexpected end of data");
+ if (state->randomAccess) /* need trailing length word? */
+ if (LogicalTapeRead(state->tapeset, tapenum, (void *) &tuplen,
+ sizeof(tuplen)) != sizeof(tuplen))
+ elog(ERROR, "tuplesort: unexpected end of data");
+ return (void *) tuple;
+}
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 7c57a9a4f99..613595febf4 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -6,7 +6,7 @@
*
* Copyright (c) 1994, Regents of the University of California
*
- * $Id: nbtree.h,v 1.31 1999/08/08 20:12:49 tgl Exp $
+ * $Id: nbtree.h,v 1.32 1999/10/17 22:15:03 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -288,9 +288,12 @@ extern BTItem _bt_formitem(IndexTuple itup);
/*
* prototypes for functions in nbtsort.c
*/
-extern void *_bt_spoolinit(Relation index, int ntapes, bool isunique);
-extern void _bt_spooldestroy(void *spool);
-extern void _bt_spool(Relation index, BTItem btitem, void *spool);
-extern void _bt_leafbuild(Relation index, void *spool);
+
+typedef struct BTSpool BTSpool; /* opaque type known only within nbtsort.c */
+
+extern BTSpool *_bt_spoolinit(Relation index, bool isunique);
+extern void _bt_spooldestroy(BTSpool *btspool);
+extern void _bt_spool(BTItem btitem, BTSpool *btspool);
+extern void _bt_leafbuild(BTSpool *btspool);
#endif /* NBTREE_H */
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 092fa57acb1..44aa8b8ace5 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -6,7 +6,7 @@
*
* Copyright (c) 1994, Regents of the University of California
*
- * $Id: execnodes.h,v 1.36 1999/09/26 21:21:04 tgl Exp $
+ * $Id: execnodes.h,v 1.37 1999/10/17 22:15:07 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -597,17 +597,9 @@ typedef struct GroupState
/* ----------------
* SortState information
*
- *| sort nodes are really just a kind of a scan since
- *| we implement sorts by retrieving the entire subplan
- *| into a temp relation, sorting the temp relation into
- *| another sorted relation, and then preforming a simple
- *| unqualified sequential scan on the sorted relation..
- *| -cim 10/15/89
- *
- * Flag indicated whether relation has been sorted
- * Keys scan key structures used to keep info on sort keys
- * TempRelation temporary relation containing result of executing
- * the subplan.
+ * sort_Done indicates whether sort has been performed yet
+ * sort_Keys scan key structures describing the sort keys
+ * tuplesortstate private state of tuplesort.c
*
* CommonScanState information
*
@@ -628,9 +620,9 @@ typedef struct GroupState
typedef struct SortState
{
CommonScanState csstate; /* its first field is NodeTag */
- bool sort_Flag;
+ bool sort_Done;
ScanKey sort_Keys;
- bool cleaned;
+ void *tuplesortstate;
} SortState;
/* ----------------
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index 095ee074d38..a03dacfb02b 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -6,7 +6,7 @@
*
* Copyright (c) 1994, Regents of the University of California
*
- * $Id: plannodes.h,v 1.30 1999/08/21 03:49:09 tgl Exp $
+ * $Id: plannodes.h,v 1.31 1999/10/17 22:15:07 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -288,8 +288,6 @@ typedef struct Sort
Oid nonameid;
int keycount;
SortState *sortstate;
- void *psortstate;
- bool cleaned;
} Sort;
/* ----------------
diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h
new file mode 100644
index 00000000000..7c5a3209897
--- /dev/null
+++ b/src/include/utils/tuplesort.h
@@ -0,0 +1,68 @@
+/*-------------------------------------------------------------------------
+ *
+ * tuplesort.h
+ * Generalized tuple sorting routines.
+ *
+ * This module handles sorting of either heap tuples or index tuples
+ * (and could fairly easily support other kinds of sortable objects,
+ * if necessary). It works efficiently for both small and large amounts
+ * of data. Small amounts are sorted in-memory using qsort(). Large
+ * amounts are sorted using temporary files and a standard external sort
+ * algorithm.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: tuplesort.h,v 1.1 1999/10/17 22:15:09 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef TUPLESORT_H
+#define TUPLESORT_H
+
+#include "access/htup.h"
+#include "access/itup.h"
+#include "access/skey.h"
+#include "access/tupdesc.h"
+#include "utils/rel.h"
+
+/* Tuplesortstate is an opaque type whose details are not known outside tuplesort.c. */
+
+typedef struct Tuplesortstate Tuplesortstate;
+
+/*
+ * We provide two different interfaces to what is essentially the same
+ * code: one for sorting HeapTuples and one for sorting IndexTuples.
+ * They differ primarily in the way that the sort key information is
+ * supplied.
+ */
+
+extern Tuplesortstate *tuplesort_begin_heap(TupleDesc tupDesc,
+ int nkeys, ScanKey keys,
+ bool randomAccess);
+extern Tuplesortstate *tuplesort_begin_index(Relation indexRel,
+ bool enforceUnique,
+ bool randomAccess);
+
+extern void tuplesort_puttuple(Tuplesortstate *state, void *tuple);
+
+extern void tuplesort_performsort(Tuplesortstate *state);
+
+extern void *tuplesort_gettuple(Tuplesortstate *state, bool forward,
+ bool *should_free);
+#define tuplesort_getheaptuple(state, forward, should_free) \
+ ((HeapTuple) tuplesort_gettuple(state, forward, should_free))
+#define tuplesort_getindextuple(state, forward, should_free) \
+ ((IndexTuple) tuplesort_gettuple(state, forward, should_free))
+
+extern void tuplesort_end(Tuplesortstate *state);
+
+/*
+ * These routines may only be called if randomAccess was specified 'true'.
+ * Backwards scan in gettuple is likewise only allowed if randomAccess.
+ */
+
+extern void tuplesort_rescan(Tuplesortstate *state);
+extern void tuplesort_markpos(Tuplesortstate *state);
+extern void tuplesort_restorepos(Tuplesortstate *state);
+
+#endif /* TUPLESORT_H */