aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>1999-10-17 22:15:09 +0000
committerTom Lane <tgl@sss.pgh.pa.us>1999-10-17 22:15:09 +0000
commit26c48b5e8cffafaf3b8acf345ca9fd8a1e408a54 (patch)
treecbcf32d78330eb3414abed1117b0a54090302a97 /src
parent59ed74e60bb3c1ad2b83ebacbb49f74517d8764e (diff)
downloadpostgresql-26c48b5e8cffafaf3b8acf345ca9fd8a1e408a54.tar.gz
postgresql-26c48b5e8cffafaf3b8acf345ca9fd8a1e408a54.zip
Final stage of psort reconstruction work: replace psort.c with
a generalized module 'tuplesort.c' that can sort either HeapTuples or IndexTuples, and is not tied to execution of a Sort node. Clean up memory leakages in sorting, and replace nbtsort.c's private implementation of mergesorting with calls to tuplesort.c.
Diffstat (limited to 'src')
-rw-r--r--src/backend/access/nbtree/nbtree.c30
-rw-r--r--src/backend/access/nbtree/nbtsort.c1000
-rw-r--r--src/backend/executor/nodeSort.c153
-rw-r--r--src/backend/utils/sort/Makefile4
-rw-r--r--src/backend/utils/sort/logtape.c8
-rw-r--r--src/backend/utils/sort/tuplesort.c1465
-rw-r--r--src/include/access/nbtree.h13
-rw-r--r--src/include/nodes/execnodes.h20
-rw-r--r--src/include/nodes/plannodes.h4
-rw-r--r--src/include/utils/tuplesort.h68
10 files changed, 1747 insertions, 1018 deletions
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 11f527fc3ba..d8d835f424b 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -1,17 +1,17 @@
/*-------------------------------------------------------------------------
*
- * btree.c
+ * nbtree.c
* Implementation of Lehman and Yao's btree management algorithm for
* Postgres.
*
- * Copyright (c) 1994, Regents of the University of California
+ * NOTES
+ * This file contains only the public interface routines.
*
*
- * IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.46 1999/09/18 19:06:10 tgl Exp $
+ * Copyright (c) 1994, Regents of the University of California
*
- * NOTES
- * This file contains only the public interface routines.
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.47 1999/10/17 22:15:03 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -76,7 +76,7 @@ btbuild(Relation heap,
#endif
Node *pred,
*oldPred;
- void *spool = (void *) NULL;
+ BTSpool *spool = NULL;
bool isunique;
bool usefast;
@@ -147,7 +147,7 @@ btbuild(Relation heap,
if (usefast)
{
- spool = _bt_spoolinit(index, 7, isunique);
+ spool = _bt_spoolinit(index, isunique);
res = (InsertIndexResult) NULL;
}
@@ -249,11 +249,11 @@ btbuild(Relation heap,
/*
* if we are doing bottom-up btree build, we insert the index into
- * a spool page for subsequent processing. otherwise, we insert
+ * a spool file for subsequent processing. otherwise, we insert
* into the btree.
*/
if (usefast)
- _bt_spool(index, btitem, spool);
+ _bt_spool(btitem, spool);
else
res = _bt_doinsert(index, btitem, isunique, heap);
@@ -275,15 +275,13 @@ btbuild(Relation heap,
}
/*
- * if we are doing bottom-up btree build, we now have a bunch of
- * sorted runs in the spool pages. finish the build by (1) merging
- * the runs, (2) inserting the sorted tuples into btree pages and (3)
- * building the upper levels.
+ * if we are doing bottom-up btree build, finish the build by
+ * (1) completing the sort of the spool file, (2) inserting the
+ * sorted tuples into btree pages and (3) building the upper levels.
*/
if (usefast)
{
- _bt_spool(index, (BTItem) NULL, spool); /* flush the spool */
- _bt_leafbuild(index, spool);
+ _bt_leafbuild(spool);
_bt_spooldestroy(spool);
}
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index c1564544b03..48386c113f0 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -1,68 +1,47 @@
/*-------------------------------------------------------------------------
- * btsort.c
- *
- * Copyright (c) 1994, Regents of the University of California
- *
- *
- * IDENTIFICATION
- * $Id: nbtsort.c,v 1.46 1999/07/19 07:07:19 momjian Exp $
+ * nbtsort.c
+ * Build a btree from sorted input by loading leaf pages sequentially.
*
* NOTES
*
- * what we do is:
- * - generate a set of initial one-block runs, distributed round-robin
- * between the output tapes.
- * - for each pass,
- * - swap input and output tape sets, rewinding both and truncating
- * the output tapes.
- * - merge the current run in each input tape to the current output
- * tape.
- * - when each input run has been exhausted, switch to another output
- * tape and start processing another run.
- * - when we have fewer runs than tapes, we know we are ready to start
- * merging into the btree leaf pages. (i.e., we do not have to wait
- * until we have exactly one tape.)
- * - as we extract tuples from the final runs, we build the pages for
- * each level. when we have only one page on a level, it must be the
- * root -- it can be attached to the btree metapage and we are done.
- *
- * conventions:
- * - external interface routines take in and return "void *" for their
- * opaque handles. this is for modularity reasons.
+ * We use tuplesort.c to sort the given index tuples into order.
+ * Then we scan the index tuples in order and build the btree pages
+ * for each level. When we have only one page on a level, it must be the
+ * root -- it can be attached to the btree metapage and we are done.
*
* this code is moderately slow (~10% slower) compared to the regular
* btree (insertion) build code on sorted or well-clustered data. on
* random data, however, the insertion build code is unusable -- the
* difference on a 60MB heap is a factor of 15 because the random
- * probes into the btree thrash the buffer pool.
+ * probes into the btree thrash the buffer pool. (NOTE: the above
+ * "10%" estimate is probably obsolete, since it refers to an old and
+ * not very good external sort implementation that used to exist in
+ * this module. tuplesort.c is almost certainly faster.)
*
* this code currently packs the pages to 100% of capacity. this is
* not wise, since *any* insertion will cause splitting. filling to
* something like the standard 70% steady-state load factor for btrees
* would probably be better.
*
- * somebody desperately needs to figure out how to do a better job of
- * balancing the merge passes -- the fan-in on the final merges can be
- * pretty poor, which is bad for performance.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsort.c,v 1.47 1999/10/17 22:15:04 tgl Exp $
+ *
*-------------------------------------------------------------------------
*/
-#include <fcntl.h>
-
#include "postgres.h"
#include "access/nbtree.h"
+#include "utils/tuplesort.h"
#ifdef BTREE_BUILD_STATS
#define ShowExecutorStats pg_options[TRACE_EXECUTORSTATS]
#endif
-static BTItem _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags);
-static BTItem _bt_minitem(Page opage, BlockNumber oblkno, int atend);
-static void *_bt_pagestate(Relation index, int flags, int level, bool doupper);
-static void _bt_uppershutdown(Relation index, BTPageState *state);
-
/*
* turn on debugging output.
*
@@ -70,689 +49,108 @@ static void _bt_uppershutdown(Relation index, BTPageState *state);
* only really useful for integer keys.
*/
/*#define FASTBUILD_DEBUG*/
-#define FASTBUILD_SPOOL
-#define FASTBUILD_MERGE
-
-#define MAXTAPES (7)
-#define TAPEBLCKSZ (BLCKSZ << 2)
-
-extern int NDirectFileRead;
-extern int NDirectFileWrite;
-
-/*
- * this is what we use to shovel BTItems in and out of memory. it's
- * bigger than a standard block because we are doing a lot of strictly
- * sequential i/o. this is obviously something of a tradeoff since we
- * are potentially reading a bunch of zeroes off of disk in many
- * cases.
- *
- * BTItems are packed in and MAXALIGN'd.
- *
- * the fd should not be going out to disk, strictly speaking, but it's
- * the only thing like that so i'm not going to worry about wasting a
- * few bytes.
- */
-typedef struct
-{
- int bttb_magic; /* magic number */
- File bttb_fd; /* file descriptor */
- int bttb_top; /* top of free space within bttb_data */
- short bttb_ntup; /* number of tuples in this block */
- short bttb_eor; /* End-Of-Run marker */
- char bttb_data[TAPEBLCKSZ - 2 * sizeof(double)];
-} BTTapeBlock;
/*
- * this structure holds the bookkeeping for a simple balanced multiway
- * merge. (polyphase merging is hairier than i want to get into right
- * now, and i don't see why i have to care how many "tapes" i use
- * right now. though if psort was in a condition that i could hack it
- * to do this, you bet i would.)
+ * Status record for spooling.
*/
-typedef struct
+struct BTSpool
{
- int bts_ntapes;
- int bts_tape;
- BTTapeBlock **bts_itape; /* input tape blocks */
- BTTapeBlock **bts_otape; /* output tape blocks */
+ Tuplesortstate *sortstate; /* state data for tuplesort.c */
+ Relation index;
bool isunique;
-} BTSpool;
-
-/*-------------------------------------------------------------------------
- * sorting comparison routine - returns {-1,0,1} depending on whether
- * the key in the left BTItem is {<,=,>} the key in the right BTItem.
- *
- * we want to use _bt_isortcmp as a comparison function for qsort(3),
- * but it needs extra arguments, so we "pass them in" as global
- * variables. ick. fortunately, they are the same throughout the
- * build, so we need do this only once. this is why you must call
- * _bt_isortcmpinit before the call to qsort(3).
- *
- * a NULL BTItem is always assumed to be greater than any actual
- * value; our heap routines (see below) assume that the smallest
- * element in the heap is returned. that way, NULL values from the
- * exhausted tapes can sift down to the bottom of the heap. in point
- * of fact we just don't replace the elements of exhausted tapes, but
- * what the heck.
- * *-------------------------------------------------------------------------
- */
-typedef struct
-{
- Datum *btsk_datum;
- char *btsk_nulls;
- BTItem btsk_item;
-} BTSortKey;
-
-static Relation _bt_sortrel;
-static int _bt_nattr;
-static BTSpool *_bt_inspool;
-
-static void
-_bt_isortcmpinit(Relation index, BTSpool *spool)
-{
- _bt_sortrel = index;
- _bt_inspool = spool;
- _bt_nattr = index->rd_att->natts;
-}
-
-static int
-_bt_isortcmp(BTSortKey *k1, BTSortKey *k2)
-{
- Datum *k1_datum = k1->btsk_datum;
- Datum *k2_datum = k2->btsk_datum;
- char *k1_nulls = k1->btsk_nulls;
- char *k2_nulls = k2->btsk_nulls;
- bool equal_isnull = false;
- int i;
-
- if (k1->btsk_item == (BTItem) NULL)
- {
- if (k2->btsk_item == (BTItem) NULL)
- return 0; /* 1 = 2 */
- return 1; /* 1 > 2 */
- }
- else if (k2->btsk_item == (BTItem) NULL)
- return -1; /* 1 < 2 */
-
- for (i = 0; i < _bt_nattr; i++)
- {
- if (k1_nulls[i] != ' ') /* k1 attr is NULL */
- {
- if (k2_nulls[i] != ' ') /* the same for k2 */
- {
- equal_isnull = true;
- continue;
- }
- return 1; /* NULL ">" NOT_NULL */
- }
- else if (k2_nulls[i] != ' ') /* k2 attr is NULL */
- return -1; /* NOT_NULL "<" NULL */
-
- if (_bt_invokestrat(_bt_sortrel, i + 1, BTGreaterStrategyNumber,
- k1_datum[i], k2_datum[i]))
- return 1; /* 1 > 2 */
- else if (_bt_invokestrat(_bt_sortrel, i + 1, BTGreaterStrategyNumber,
- k2_datum[i], k1_datum[i]))
- return -1; /* 1 < 2 */
- }
-
- if (_bt_inspool->isunique && !equal_isnull)
- {
- _bt_spooldestroy((void *) _bt_inspool);
- elog(ERROR, "Cannot create unique index. Table contains non-unique values");
- }
- return 0; /* 1 = 2 */
-}
-
-static void
-_bt_setsortkey(Relation index, BTItem bti, BTSortKey *sk)
-{
- sk->btsk_item = (BTItem) NULL;
- sk->btsk_datum = (Datum *) NULL;
- sk->btsk_nulls = (char *) NULL;
-
- if (bti != (BTItem) NULL)
- {
- IndexTuple it = &(bti->bti_itup);
- TupleDesc itdesc = index->rd_att;
- Datum *dp = (Datum *) palloc(_bt_nattr * sizeof(Datum));
- char *np = (char *) palloc(_bt_nattr * sizeof(char));
- bool isnull;
- int i;
-
- for (i = 0; i < _bt_nattr; i++)
- {
- dp[i] = index_getattr(it, i + 1, itdesc, &isnull);
- if (isnull)
- np[i] = 'n';
- else
- np[i] = ' ';
- }
- sk->btsk_item = bti;
- sk->btsk_datum = dp;
- sk->btsk_nulls = np;
- }
-}
-
-/*-------------------------------------------------------------------------
- * priority queue methods
- *
- * these were more-or-less lifted from the heap section of the 1984
- * edition of gonnet's book on algorithms and data structures. they
- * are coded so that the smallest element in the heap is returned (we
- * use them for merging sorted runs).
- *
- * XXX these probably ought to be generic library functions.
- *-------------------------------------------------------------------------
- */
-typedef struct
-{
- int btpqe_tape; /* tape identifier */
- BTSortKey btpqe_item; /* pointer to BTItem in tape buffer */
-} BTPriQueueElem;
-
-#define MAXELEM MAXTAPES
-typedef struct
-{
- int btpq_nelem;
- BTPriQueueElem btpq_queue[MAXELEM];
- Relation btpq_rel;
-} BTPriQueue;
-
-/* be sure to call _bt_isortcmpinit first */
-#define GREATER(a, b) \
- (_bt_isortcmp(&((a)->btpqe_item), &((b)->btpqe_item)) > 0)
-
-static void
-_bt_pqsift(BTPriQueue *q, int parent)
-{
- int child;
- BTPriQueueElem e;
-
- for (child = parent * 2 + 1;
- child < q->btpq_nelem;
- child = parent * 2 + 1)
- {
- if (child < q->btpq_nelem - 1)
- {
- if (GREATER(&(q->btpq_queue[child]), &(q->btpq_queue[child + 1])))
- ++child;
- }
- if (GREATER(&(q->btpq_queue[parent]), &(q->btpq_queue[child])))
- {
- e = q->btpq_queue[child]; /* struct = */
- q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */
- q->btpq_queue[parent] = e; /* struct = */
- parent = child;
- }
- else
- parent = child + 1;
- }
-}
-
-static int
-_bt_pqnext(BTPriQueue *q, BTPriQueueElem *e)
-{
- if (q->btpq_nelem < 1)
- { /* already empty */
- return -1;
- }
- *e = q->btpq_queue[0]; /* struct = */
-
- if (--q->btpq_nelem < 1)
- { /* now empty, don't sift */
- return 0;
- }
- q->btpq_queue[0] = q->btpq_queue[q->btpq_nelem]; /* struct = */
- _bt_pqsift(q, 0);
- return 0;
-}
-
-static void
-_bt_pqadd(BTPriQueue *q, BTPriQueueElem *e)
-{
- int child,
- parent;
-
- if (q->btpq_nelem >= MAXELEM)
- elog(ERROR, "_bt_pqadd: queue overflow");
-
- child = q->btpq_nelem++;
- while (child > 0)
- {
- parent = child / 2;
- if (GREATER(e, &(q->btpq_queue[parent])))
- break;
- else
- {
- q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */
- child = parent;
- }
- }
-
- q->btpq_queue[child] = *e; /* struct = */
-}
-
-/*-------------------------------------------------------------------------
- * tape methods
- *-------------------------------------------------------------------------
- */
+};
#define BTITEMSZ(btitem) \
((btitem) ? \
(IndexTupleDSize((btitem)->bti_itup) + \
(sizeof(BTItemData) - sizeof(IndexTupleData))) : \
0)
-#define SPCLEFT(tape) \
- (sizeof((tape)->bttb_data) - (tape)->bttb_top)
-#define EMPTYTAPE(tape) \
- ((tape)->bttb_ntup <= 0)
-#define BTTAPEMAGIC 0x19660226
-
-/*
- * reset the tape header for its next use without doing anything to
- * the physical tape file. (setting bttb_top to 0 makes the block
- * empty.)
- */
-static void
-_bt_tapereset(BTTapeBlock *tape)
-{
- tape->bttb_eor = 0;
- tape->bttb_top = 0;
- tape->bttb_ntup = 0;
-}
-
-/*
- * rewind the physical tape file.
- */
-static void
-_bt_taperewind(BTTapeBlock *tape)
-{
- FileSeek(tape->bttb_fd, 0L, SEEK_SET);
-}
-
-/*
- * destroy the contents of the physical tape file without destroying
- * the tape data structure or removing the physical tape file.
- *
- * we use the VFD version of ftruncate(2) to do this rather than
- * unlinking and recreating the file. you still have to wait while
- * the OS frees up all of the file system blocks and stuff, but at
- * least you don't have to delete and reinsert the directory entries.
- */
-static void
-_bt_tapeclear(BTTapeBlock *tape)
-{
- /* blow away the contents of the old file */
- _bt_taperewind(tape);
-#ifdef NOT_USED
- FileSync(tape->bttb_fd);
-#endif
- FileTruncate(tape->bttb_fd, 0);
-
- /* reset the buffer */
- _bt_tapereset(tape);
-}
-
-/*
- * create a new BTTapeBlock, allocating memory for the data structure
- * as well as opening a physical tape file.
- */
-static BTTapeBlock *
-_bt_tapecreate(void)
-{
- BTTapeBlock *tape = (BTTapeBlock *) palloc(sizeof(BTTapeBlock));
-
- if (tape == (BTTapeBlock *) NULL)
- elog(ERROR, "_bt_tapecreate: out of memory");
-
- tape->bttb_magic = BTTAPEMAGIC;
-
- tape->bttb_fd = OpenTemporaryFile();
- Assert(tape->bttb_fd >= 0);
-
- /* initialize the buffer */
- _bt_tapereset(tape);
-
- return tape;
-}
-
-/*
- * destroy the BTTapeBlock structure and its physical tape file.
- */
-static void
-_bt_tapedestroy(BTTapeBlock *tape)
-{
- FileUnlink(tape->bttb_fd);
- pfree((void *) tape);
-}
-
-/*
- * flush the tape block to the file, marking End-Of-Run if requested.
- */
-static void
-_bt_tapewrite(BTTapeBlock *tape, int eor)
-{
- tape->bttb_eor = eor;
- FileWrite(tape->bttb_fd, (char *) tape, TAPEBLCKSZ);
- NDirectFileWrite += TAPEBLCKSZ / BLCKSZ;
- _bt_tapereset(tape);
-}
-
-/*
- * read a tape block from the file, overwriting the current contents
- * of the buffer.
- *
- * returns:
- * - 0 if there are no more blocks in the tape or in this run (call
- * _bt_tapereset to clear the End-Of-Run marker)
- * - 1 if a valid block was read
- */
-static int
-_bt_taperead(BTTapeBlock *tape)
-{
- File fd;
- int nread;
-
- if (tape->bttb_eor)
- {
- return 0; /* we are already at End-Of-Run */
- }
-
- /*
- * we're clobbering the old tape block, but we do need to save the VFD
- * (the one in the block we're reading is bogus).
- */
- fd = tape->bttb_fd;
- nread = FileRead(fd, (char *) tape, TAPEBLCKSZ);
- tape->bttb_fd = fd;
- if (nread != TAPEBLCKSZ)
- {
- Assert(nread == 0); /* we are at EOF */
- return 0;
- }
- Assert(tape->bttb_magic == BTTAPEMAGIC);
- NDirectFileRead += TAPEBLCKSZ / BLCKSZ;
- return 1;
-}
-/*
- * get the next BTItem from a tape block.
- *
- * returns:
- * - NULL if we have run out of BTItems
- * - a pointer to the BTItemData in the block otherwise
- *
- * side effects:
- * - sets 'pos' to the current position within the block.
- */
-static BTItem
-_bt_tapenext(BTTapeBlock *tape, char **pos)
-{
- Size itemsz;
- BTItem bti;
+static void _bt_load(Relation index, BTSpool *btspool);
+static BTItem _bt_buildadd(Relation index, BTPageState *state, BTItem bti,
+ int flags);
+static BTItem _bt_minitem(Page opage, BlockNumber oblkno, int atend);
+static BTPageState *_bt_pagestate(Relation index, int flags,
+ int level, bool doupper);
+static void _bt_uppershutdown(Relation index, BTPageState *state);
- if (*pos >= tape->bttb_data + tape->bttb_top)
- return (BTItem) NULL;
- bti = (BTItem) *pos;
- itemsz = BTITEMSZ(bti);
- *pos += MAXALIGN(itemsz);
- return bti;
-}
/*
- * copy a BTItem into a tape block.
- *
- * assumes that we have already checked to see if the block has enough
- * space for the item.
- *
- * side effects:
- *
- * - advances the 'top' pointer in the tape block header to point to
- * the beginning of free space.
+ * Interface routines
*/
-static void
-_bt_tapeadd(BTTapeBlock *tape, BTItem item, int itemsz)
-{
- memcpy(tape->bttb_data + tape->bttb_top, item, itemsz);
- ++tape->bttb_ntup;
- tape->bttb_top += MAXALIGN(itemsz);
-}
-/*-------------------------------------------------------------------------
- * spool methods
- *-------------------------------------------------------------------------
- */
/*
- * create and initialize a spool structure, including the underlying
- * files.
+ * create and initialize a spool structure
*/
-void *
-_bt_spoolinit(Relation index, int ntapes, bool isunique)
+BTSpool *
+_bt_spoolinit(Relation index, bool isunique)
{
BTSpool *btspool = (BTSpool *) palloc(sizeof(BTSpool));
- int i;
- if (btspool == (BTSpool *) NULL)
- elog(ERROR, "_bt_spoolinit: out of memory");
MemSet((char *) btspool, 0, sizeof(BTSpool));
- btspool->bts_ntapes = ntapes;
- btspool->bts_tape = 0;
- btspool->isunique = isunique;
- btspool->bts_itape = (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes);
- btspool->bts_otape = (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes);
- if (btspool->bts_itape == (BTTapeBlock **) NULL ||
- btspool->bts_otape == (BTTapeBlock **) NULL)
- elog(ERROR, "_bt_spoolinit: out of memory");
+ btspool->index = index;
+ btspool->isunique = isunique;
- for (i = 0; i < ntapes; ++i)
- {
- btspool->bts_itape[i] = _bt_tapecreate();
- btspool->bts_otape[i] = _bt_tapecreate();
- }
+ btspool->sortstate = tuplesort_begin_index(index, isunique, false);
- _bt_isortcmpinit(index, btspool);
+ /*
+ * Currently, tuplesort provides sort functions on IndexTuples.
+ * If we kept anything in a BTItem other than a regular IndexTuple,
+ * we'd need to modify tuplesort to understand BTItems as such.
+ */
+ Assert(sizeof(BTItemData) == sizeof(IndexTupleData));
- return (void *) btspool;
+ return btspool;
}
/*
* clean up a spool structure and its substructures.
*/
void
-_bt_spooldestroy(void *spool)
+_bt_spooldestroy(BTSpool *btspool)
{
- BTSpool *btspool = (BTSpool *) spool;
- int i;
-
- for (i = 0; i < btspool->bts_ntapes; ++i)
- {
- _bt_tapedestroy(btspool->bts_otape[i]);
- _bt_tapedestroy(btspool->bts_itape[i]);
- }
+ tuplesort_end(btspool->sortstate);
pfree((void *) btspool);
}
/*
- * flush out any dirty output tape blocks
+ * spool a btitem into the sort file.
*/
-static void
-_bt_spoolflush(BTSpool *btspool)
+void
+_bt_spool(BTItem btitem, BTSpool *btspool)
{
- int i;
-
- for (i = 0; i < btspool->bts_ntapes; ++i)
- {
- if (!EMPTYTAPE(btspool->bts_otape[i]))
- _bt_tapewrite(btspool->bts_otape[i], 1);
- }
+ /* A BTItem is really just an IndexTuple */
+ tuplesort_puttuple(btspool->sortstate, (void *) btitem);
}
/*
- * swap input tapes and output tapes by swapping their file
- * descriptors. additional preparation for the next merge pass
- * includes rewinding the new input tapes and clearing out the new
- * output tapes.
+ * given a spool loaded by successive calls to _bt_spool,
+ * create an entire btree.
*/
-static void
-_bt_spoolswap(BTSpool *btspool)
+void
+_bt_leafbuild(BTSpool *btspool)
{
- File tmpfd;
- BTTapeBlock *itape;
- BTTapeBlock *otape;
- int i;
-
- for (i = 0; i < btspool->bts_ntapes; ++i)
+#ifdef BTREE_BUILD_STATS
+ if (ShowExecutorStats)
{
- itape = btspool->bts_itape[i];
- otape = btspool->bts_otape[i];
-
- /*
- * swap the input and output VFDs.
- */
- tmpfd = itape->bttb_fd;
- itape->bttb_fd = otape->bttb_fd;
- otape->bttb_fd = tmpfd;
-
- /*
- * rewind the new input tape.
- */
- _bt_taperewind(itape);
- _bt_tapereset(itape);
-
- /*
- * clear the new output tape -- it's ok to throw away the old
- * inputs.
- */
- _bt_tapeclear(otape);
+ fprintf(stderr, "! BtreeBuild (Spool) Stats:\n");
+ ShowUsage();
+ ResetUsage();
}
+#endif
+ tuplesort_performsort(btspool->sortstate);
+
+ _bt_load(btspool->index, btspool);
}
-/*-------------------------------------------------------------------------
- * sorting routines
- *-------------------------------------------------------------------------
- */
/*
- * spool 'btitem' into an initial run. as tape blocks are filled, the
- * block BTItems are qsorted and written into some output tape (it
- * doesn't matter which; we go round-robin for simplicity). the
- * initial runs are therefore always just one block.
+ * Internal routines.
*/
-void
-_bt_spool(Relation index, BTItem btitem, void *spool)
-{
- BTSpool *btspool = (BTSpool *) spool;
- BTTapeBlock *itape;
- Size itemsz;
-
- _bt_isortcmpinit(index, btspool);
- itape = btspool->bts_itape[btspool->bts_tape];
- itemsz = BTITEMSZ(btitem);
- itemsz = MAXALIGN(itemsz);
-
- /*
- * if this buffer is too full for this BTItemData, or if we have run
- * out of BTItems, we need to sort the buffer and write it out. in
- * this case, the BTItemData will go into the next tape's buffer.
- */
- if (btitem == (BTItem) NULL || SPCLEFT(itape) < itemsz)
- {
- BTSortKey *parray = (BTSortKey *) NULL;
- BTTapeBlock *otape;
- BTItem bti;
- char *pos;
- int btisz;
- int it_ntup = itape->bttb_ntup;
- int i;
-
- /*
- * build an array of pointers to the BTItemDatas on the input
- * block.
- */
- if (it_ntup > 0)
- {
- parray = (BTSortKey *) palloc(it_ntup * sizeof(BTSortKey));
- pos = itape->bttb_data;
- for (i = 0; i < it_ntup; ++i)
- _bt_setsortkey(index, _bt_tapenext(itape, &pos), &(parray[i]));
-
- /*
- * qsort the pointer array.
- */
- qsort((void *) parray, it_ntup, sizeof(BTSortKey),
- (int (*) (const void *, const void *)) _bt_isortcmp);
- }
-
- /*
- * write the spooled run into the output tape. we copy the
- * BTItemDatas in the order dictated by the sorted array of
- * BTItems, not the original order.
- *
- * (since everything was MAXALIGN'd and is all on a single tape
- * block, everything had *better* still fit on one tape block..)
- */
- otape = btspool->bts_otape[btspool->bts_tape];
- for (i = 0; i < it_ntup; ++i)
- {
- bti = parray[i].btsk_item;
- btisz = BTITEMSZ(bti);
- btisz = MAXALIGN(btisz);
- _bt_tapeadd(otape, bti, btisz);
-#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_SPOOL)
- {
- bool isnull;
- Datum d = index_getattr(&(bti->bti_itup), 1, index->rd_att,
- &isnull);
-
- printf("_bt_spool: inserted <%x> into output tape %d\n",
- d, btspool->bts_tape);
- }
-#endif /* FASTBUILD_DEBUG && FASTBUILD_SPOOL */
- }
-
- /*
- * the initial runs are always single tape blocks. flush the
- * output block, marking End-Of-Run.
- */
- _bt_tapewrite(otape, 1);
-
- /*
- * reset the input buffer for the next run. we don't have to
- * write it out or anything -- we only use it to hold the unsorted
- * BTItemDatas, the output tape contains all the sorted stuff.
- *
- * changing bts_tape changes the output tape and input tape; we
- * change itape for the code below.
- */
- _bt_tapereset(itape);
- btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes;
- itape = btspool->bts_itape[btspool->bts_tape];
-
- /*
- * destroy the pointer array.
- */
- if (parray != (BTSortKey *) NULL)
- {
- for (i = 0; i < it_ntup; i++)
- {
- if (parray[i].btsk_datum != (Datum *) NULL)
- pfree((void *) (parray[i].btsk_datum));
- if (parray[i].btsk_nulls != (char *) NULL)
- pfree((void *) (parray[i].btsk_nulls));
- }
- pfree((void *) parray);
- }
- }
-
- /* insert this item into the current buffer */
- if (btitem != (BTItem) NULL)
- _bt_tapeadd(itape, btitem, itemsz);
-}
/*
* allocate a new, clean btree page, not linked to any siblings.
@@ -805,7 +203,7 @@ _bt_slideleft(Relation index, Buffer buf, Page page)
* allocate and initialize a new BTPageState. the returned structure
* is suitable for immediate use by _bt_buildadd.
*/
-static void *
+static BTPageState *
_bt_pagestate(Relation index, int flags, int level, bool doupper)
{
BTPageState *state = (BTPageState *) palloc(sizeof(BTPageState));
@@ -819,7 +217,7 @@ _bt_pagestate(Relation index, int flags, int level, bool doupper)
state->btps_level = level;
state->btps_doupper = doupper;
- return (void *) state;
+ return state;
}
/*
@@ -883,9 +281,8 @@ _bt_minitem(Page opage, BlockNumber oblkno, int atend)
* if all keys are unique, 'first' will always be the same as 'last'.
*/
static BTItem
-_bt_buildadd(Relation index, void *pstate, BTItem bti, int flags)
+_bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags)
{
- BTPageState *state = (BTPageState *) pstate;
Buffer nbuf;
Page npage;
BTItem last_bti;
@@ -944,8 +341,7 @@ _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags)
if (PageAddItem(npage, PageGetItem(opage, ii),
ii->lp_len, n, LP_USED) == InvalidOffsetNumber)
elog(FATAL, "btree: failed to add item to the page in _bt_sort (1)");
-#ifdef NOT_USED
-#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
+#ifdef FASTBUILD_DEBUG
{
bool isnull;
BTItem tmpbti =
@@ -956,7 +352,6 @@ _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags)
printf("_bt_buildadd: moved <%x> to offset %d at level %d\n",
d, n, state->btps_level);
}
-#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
#endif
}
@@ -989,7 +384,7 @@ _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags)
nopaque->btpo_prev = BufferGetBlockNumber(obuf);
nopaque->btpo_next = P_NONE;
- if (_bt_itemcmp(index, _bt_nattr,
+ if (_bt_itemcmp(index, index->rd_att->natts,
(BTItem) PageGetItem(opage, PageGetItemId(opage, P_HIKEY)),
(BTItem) PageGetItem(opage, PageGetItemId(opage, P_FIRSTKEY)),
BTEqualStrategyNumber))
@@ -1030,8 +425,7 @@ _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags)
off = OffsetNumberNext(last_off);
if (PageAddItem(npage, (Item) bti, btisz, off, LP_USED) == InvalidOffsetNumber)
elog(FATAL, "btree: failed to add item to the page in _bt_sort (2)");
-#ifdef NOT_USED
-#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
+#ifdef FASTBUILD_DEBUG
{
bool isnull;
Datum d = index_getattr(&(bti->bti_itup), 1, index->rd_att, &isnull);
@@ -1039,11 +433,10 @@ _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags)
printf("_bt_buildadd: inserted <%x> at offset %d at level %d\n",
d, off, state->btps_level);
}
-#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
#endif
if (last_bti == (BTItem) NULL)
first_off = P_FIRSTKEY;
- else if (!_bt_itemcmp(index, _bt_nattr,
+ else if (!_bt_itemcmp(index, index->rd_att->natts,
bti, last_bti, BTEqualStrategyNumber))
first_off = off;
last_off = off;
@@ -1103,224 +496,31 @@ _bt_uppershutdown(Relation index, BTPageState *state)
}
/*
- * take the input tapes stored by 'btspool' and perform successive
- * merging passes until at most one run is left in each tape. at that
- * point, merge the final tape runs into a set of btree leaves.
- *
- * XXX three nested loops? gross. cut me up into smaller routines.
+ * Read tuples in correct sort order from tuplesort, and load them into
+ * btree leaves.
*/
static void
-_bt_merge(Relation index, BTSpool *btspool)
+_bt_load(Relation index, BTSpool *btspool)
{
BTPageState *state;
- BTPriQueue q;
- BTPriQueueElem e;
- BTSortKey btsk;
BTItem bti;
- BTTapeBlock *itape;
- BTTapeBlock *otape;
- char *tapepos[MAXTAPES];
- int tapedone[MAXTAPES];
- int t;
- int goodtapes;
- int npass;
- int nruns;
- Size btisz;
- bool doleaf = false;
+ bool should_free;
/*
* initialize state needed for the merge into the btree leaf pages.
*/
- state = (BTPageState *) _bt_pagestate(index, BTP_LEAF, 0, true);
-
- npass = 0;
- do
- { /* pass */
-
- /*
- * each pass starts by flushing the previous outputs and swapping
- * inputs and outputs. flushing sets End-of-Run for any dirty
- * output tapes. swapping clears the new output tapes and rewinds
- * the new input tapes.
- */
- btspool->bts_tape = btspool->bts_ntapes - 1;
- _bt_spoolflush(btspool);
- _bt_spoolswap(btspool);
-
- ++npass;
- nruns = 0;
-
- for (;;)
- { /* run */
-
- /*
- * each run starts by selecting a new output tape. the merged
- * results of a given run are always sent to this one tape.
- */
- btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes;
- otape = btspool->bts_otape[btspool->bts_tape];
-
- /*
- * initialize the priority queue by loading it with the first
- * element of the given run in each tape. since we are
- * starting a new run, we reset the tape (clearing the
- * End-Of-Run marker) before reading it. this means that
- * _bt_taperead will return 0 only if the tape is actually at
- * EOF.
- */
- MemSet((char *) &q, 0, sizeof(BTPriQueue));
- goodtapes = 0;
- for (t = 0; t < btspool->bts_ntapes; ++t)
- {
- itape = btspool->bts_itape[t];
- tapepos[t] = itape->bttb_data;
- tapedone[t] = 0;
- _bt_tapereset(itape);
- do
- {
- if (_bt_taperead(itape) == 0)
- tapedone[t] = 1;
- } while (!tapedone[t] && EMPTYTAPE(itape));
- if (!tapedone[t])
- {
- ++goodtapes;
- e.btpqe_tape = t;
- _bt_setsortkey(index, _bt_tapenext(itape, &tapepos[t]),
- &(e.btpqe_item));
- if (e.btpqe_item.btsk_item != (BTItem) NULL)
- _bt_pqadd(&q, &e);
- }
- }
-
- /*
- * if we don't have any tapes with any input (i.e., they are
- * all at EOF), there is no work to do in this run -- we must
- * be done with this pass.
- */
- if (goodtapes == 0)
- {
- break; /* for */
- }
- ++nruns;
-
- /*
- * output the smallest element from the queue until there are
- * no more.
- */
- while (_bt_pqnext(&q, &e) >= 0)
- { /* item */
-
- /*
- * replace the element taken from priority queue, fetching
- * a new block if needed. a tape can run out if it hits
- * either End-Of-Run or EOF.
- */
- t = e.btpqe_tape;
- btsk = e.btpqe_item;
- bti = btsk.btsk_item;
- if (bti != (BTItem) NULL)
- {
- btisz = BTITEMSZ(bti);
- btisz = MAXALIGN(btisz);
- if (doleaf)
- {
- _bt_buildadd(index, state, bti, BTP_LEAF);
-#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
- {
- bool isnull;
- Datum d = index_getattr(&(bti->bti_itup), 1,
- index->rd_att, &isnull);
-
- printf("_bt_merge: [pass %d run %d] inserted <%x> from tape %d into block %d\n",
- npass, nruns, d, t,
- BufferGetBlockNumber(state->btps_buf));
- }
-#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
- }
- else
- {
- if (SPCLEFT(otape) < btisz)
- {
-
- /*
- * if it's full, write it out and add the item
- * to the next block. (since we will be
- * adding another tuple immediately after
- * this, we can be sure that there will be at
- * least one more block in this run and so we
- * know we do *not* want to set End-Of-Run
- * here.)
- */
- _bt_tapewrite(otape, 0);
- }
- _bt_tapeadd(otape, bti, btisz);
-#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
- {
- bool isnull;
- Datum d = index_getattr(&(bti->bti_itup), 1,
- index->rd_att, &isnull);
-
- printf("_bt_merge: [pass %d run %d] inserted <%x> from tape %d into output tape %d\n",
- npass, nruns, d, t,
- btspool->bts_tape);
- }
-#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
- }
-
- if (btsk.btsk_datum != (Datum *) NULL)
- pfree((void *) (btsk.btsk_datum));
- if (btsk.btsk_nulls != (char *) NULL)
- pfree((void *) (btsk.btsk_nulls));
-
- }
- itape = btspool->bts_itape[t];
- if (!tapedone[t])
- {
- BTItem newbti = _bt_tapenext(itape, &tapepos[t]);
-
- if (newbti == (BTItem) NULL)
- {
- do
- {
- if (_bt_taperead(itape) == 0)
- tapedone[t] = 1;
- } while (!tapedone[t] && EMPTYTAPE(itape));
- if (!tapedone[t])
- {
- tapepos[t] = itape->bttb_data;
- newbti = _bt_tapenext(itape, &tapepos[t]);
- }
- }
- if (newbti != (BTItem) NULL)
- {
- BTPriQueueElem nexte;
-
- nexte.btpqe_tape = t;
- _bt_setsortkey(index, newbti, &(nexte.btpqe_item));
- _bt_pqadd(&q, &nexte);
- }
- }
- } /* item */
-
- /*
- * that's it for this run. flush the output tape, marking
- * End-of-Run.
- */
- _bt_tapewrite(otape, 1);
- } /* run */
+ state = _bt_pagestate(index, BTP_LEAF, 0, true);
- /*
- * we are here because we ran out of input on all of the input
- * tapes.
- *
- * if this pass did not generate more actual output runs than we have
- * tapes, we know we have at most one run in each tape. this
- * means that we are ready to merge into the final btree leaf
- * pages instead of merging into a tape file.
- */
- if (nruns <= btspool->bts_ntapes)
- doleaf = true;
- } while (nruns > 0); /* pass */
+ for (;;)
+ {
+ bti = (BTItem) tuplesort_getindextuple(btspool->sortstate, true,
+ &should_free);
+ if (bti == (BTItem) NULL)
+ break;
+ _bt_buildadd(index, state, bti, BTP_LEAF);
+ if (should_free)
+ pfree((void *) bti);
+ }
_bt_uppershutdown(index, state);
}
@@ -1359,7 +559,7 @@ _bt_upperbuild(Relation index)
ropaque->btpo_flags &= ~BTP_ROOT;
_bt_wrtbuf(index, rbuf);
- state = (BTPageState *) _bt_pagestate(index, 0, 0, true);
+ state = _bt_pagestate(index, 0, 0, true);
/* for each page... */
do
@@ -1380,7 +580,7 @@ _bt_upperbuild(Relation index)
* the lower page and insert it into a page at this level.
*/
nbti = _bt_minitem(rpage, blk, P_RIGHTMOST(ropaque));
-#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
+#ifdef FASTBUILD_DEBUG
{
bool isnull;
Datum d = index_getattr(&(nbti->bti_itup), 1, index->rd_att,
@@ -1389,7 +589,7 @@ _bt_upperbuild(Relation index)
printf("_bt_upperbuild: inserting <%x> at %d\n",
d, state->btps_level);
}
-#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
+#endif
_bt_buildadd(index, state, nbti, 0);
pfree((void *) nbti);
}
@@ -1401,25 +601,3 @@ _bt_upperbuild(Relation index)
}
#endif
-
-/*
- * given a spool loading by successive calls to _bt_spool, create an
- * entire btree.
- */
-void
-_bt_leafbuild(Relation index, void *spool)
-{
- _bt_isortcmpinit(index, (BTSpool *) spool);
-
-#ifdef BTREE_BUILD_STATS
- if (ShowExecutorStats)
- {
- fprintf(stderr, "! BtreeBuild (Spool) Stats:\n");
- ShowUsage();
- ResetUsage();
- }
-#endif
-
- _bt_merge(index, (BTSpool *) spool);
-
-}
diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c
index f82fecf0d6f..14e8b46aa86 100644
--- a/src/backend/executor/nodeSort.c
+++ b/src/backend/executor/nodeSort.c
@@ -7,16 +7,17 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/executor/nodeSort.c,v 1.23 1999/07/17 20:16:58 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/executor/nodeSort.c,v 1.24 1999/10/17 22:15:02 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
+
#include "executor/executor.h"
#include "executor/execdebug.h"
#include "executor/nodeSort.h"
-#include "utils/psort.h"
+#include "utils/tuplesort.h"
/* ----------------------------------------------------------------
* FormSortKeys(node)
@@ -83,11 +84,9 @@ FormSortKeys(Sort *sortnode)
/* ----------------------------------------------------------------
* ExecSort
*
- * old comments
- * Sorts tuples from the outer subtree of the node in psort,
+ * Sorts tuples from the outer subtree of the node using tuplesort,
* which saves the results in a temporary file or memory. After the
* initial call, returns a tuple from the file with each call.
- * Assumes that heap access method is used.
*
* Conditions:
* -- none.
@@ -101,10 +100,8 @@ ExecSort(Sort *node)
{
EState *estate;
SortState *sortstate;
- Plan *outerNode;
ScanDirection dir;
- int keycount;
- ScanKey sortkeys;
+ Tuplesortstate *tuplesortstate;
HeapTuple heapTuple;
TupleTableSlot *slot;
bool should_free;
@@ -119,44 +116,72 @@ ExecSort(Sort *node)
sortstate = node->sortstate;
estate = node->plan.state;
dir = estate->es_direction;
+ tuplesortstate = (Tuplesortstate *) sortstate->tuplesortstate;
/* ----------------
- * the first time we call this, psort sorts this into a file.
- * Subsequent calls return tuples from psort.
+ * If first time through, read all tuples from outer plan and
+ * pass them to tuplesort.c.
+ * Subsequent calls just fetch tuples from tuplesort.
* ----------------
*/
- if (sortstate->sort_Flag == false)
+ if (! sortstate->sort_Done)
{
+ Plan *outerNode;
+ TupleDesc tupDesc;
+ int keycount;
+ ScanKey sortkeys;
+
SO1_printf("ExecSort: %s\n",
- "sortstate == false -> sorting subplan");
+ "sorting subplan");
/* ----------------
- * set all relations to be scanned in the forward direction
- * while creating the temporary relation.
+ * Want to scan subplan in the forward direction while creating
+ * the sorted data. (Does setting my direction actually affect
+ * the subplan? I bet this is useless code...)
* ----------------
*/
estate->es_direction = ForwardScanDirection;
/* ----------------
- * prepare information for psort_begin()
+ * Initialize tuplesort module.
* ----------------
*/
- outerNode = outerPlan((Plan *) node);
+ SO1_printf("ExecSort: %s\n",
+ "calling tuplesort_begin");
+ outerNode = outerPlan((Plan *) node);
+ tupDesc = ExecGetTupType(outerNode);
keycount = node->keycount;
sortkeys = (ScanKey) sortstate->sort_Keys;
- SO1_printf("ExecSort: %s\n",
- "calling psort_begin");
- if (!psort_begin(node, /* this node */
- keycount, /* number keys */
- sortkeys)) /* keys */
+ tuplesortstate = tuplesort_begin_heap(tupDesc, keycount, sortkeys,
+ true /* randomAccess */);
+
+ sortstate->tuplesortstate = (void *) tuplesortstate;
+
+ /* ----------------
+ * Scan the subplan and feed all the tuples to tuplesort.
+ * ----------------
+ */
+
+ for (;;)
{
- /* Psort says, there are no tuples to be sorted */
- return NULL;
+ slot = ExecProcNode(outerNode, (Plan *) node);
+
+ if (TupIsNull(slot))
+ break;
+
+ tuplesort_puttuple(tuplesortstate, (void *) slot->val);
+ ExecClearTuple(slot);
}
/* ----------------
+ * Complete the sort.
+ * ----------------
+ */
+ tuplesort_performsort(tuplesortstate);
+
+ /* ----------------
* restore to user specified direction
* ----------------
*/
@@ -167,25 +192,29 @@ ExecSort(Sort *node)
* ----------------
*/
slot = (TupleTableSlot *) sortstate->csstate.cstate.cs_ResultTupleSlot;
- slot->ttc_tupleDescriptor = ExecGetTupType(outerNode);
+ slot->ttc_tupleDescriptor = tupDesc;
+
/* ----------------
* finally set the sorted flag to true
* ----------------
*/
- sortstate->sort_Flag = true;
+ sortstate->sort_Done = true;
SO1_printf(stderr, "ExecSort: sorting done.\n");
}
else
slot = (TupleTableSlot *) sortstate->csstate.cstate.cs_ResultTupleSlot;
SO1_printf("ExecSort: %s\n",
- "retrieving tuple from sorted relation");
+ "retrieving tuple from tuplesort");
/* ----------------
- * at this point we grab a tuple from psort
+ * Get the first or next tuple from tuplesort.
+ * Returns NULL if no more tuples.
* ----------------
*/
- heapTuple = psort_grabtuple(node, &should_free);
+ heapTuple = tuplesort_getheaptuple(tuplesortstate,
+ ScanDirectionIsForward(dir),
+ &should_free);
return ExecStoreTuple(heapTuple, slot, InvalidBuffer, should_free);
}
@@ -193,7 +222,6 @@ ExecSort(Sort *node)
/* ----------------------------------------------------------------
* ExecInitSort
*
- * old comments
* Creates the run-time state information for the sort node
* produced by the planner and initailizes its outer subtree.
* ----------------------------------------------------------------
@@ -203,7 +231,6 @@ ExecInitSort(Sort *node, EState *estate, Plan *parent)
{
SortState *sortstate;
Plan *outerPlan;
- ScanKey sortkeys;
SO1_printf("ExecInitSort: %s\n",
"initializing sort node");
@@ -219,14 +246,14 @@ ExecInitSort(Sort *node, EState *estate, Plan *parent)
* ----------------
*/
sortstate = makeNode(SortState);
- sortstate->sort_Flag = 0;
+ sortstate->sort_Done = false;
sortstate->sort_Keys = NULL;
- node->cleaned = FALSE;
+ sortstate->tuplesortstate = NULL;
node->sortstate = sortstate;
/* ----------------
- * Miscellanious initialization
+ * Miscellaneous initialization
*
* + assign node's base_id
* + assign debugging hooks
@@ -259,9 +286,7 @@ ExecInitSort(Sort *node, EState *estate, Plan *parent)
* initialize sortstate information
* ----------------
*/
- sortkeys = FormSortKeys(node);
- sortstate->sort_Keys = sortkeys;
- sortstate->sort_Flag = false;
+ sortstate->sort_Keys = FormSortKeys(node);
/* ----------------
* initialize tuple type. no need to initialize projection
@@ -275,11 +300,6 @@ ExecInitSort(Sort *node, EState *estate, Plan *parent)
SO1_printf("ExecInitSort: %s\n",
"sort node initialized");
- /* ----------------
- * return relation oid of temporary sort relation in a list
- * (someday -- for now we return LispTrue... cim 10/12/89)
- * ----------------
- */
return TRUE;
}
@@ -293,8 +313,6 @@ ExecCountSlotsSort(Sort *node)
/* ----------------------------------------------------------------
* ExecEndSort(node)
- *
- * old comments
* ----------------------------------------------------------------
*/
void
@@ -325,8 +343,13 @@ ExecEndSort(Sort *node)
*/
ExecClearTuple(sortstate->csstate.css_ScanTupleSlot);
- /* Clean up after psort */
- psort_end(node);
+ /* ----------------
+ * Release tuplesort resources
+ * ----------------
+ */
+ if (sortstate->tuplesortstate != NULL)
+ tuplesort_end((Tuplesortstate *) sortstate->tuplesortstate);
+ sortstate->tuplesortstate = NULL;
SO1_printf("ExecEndSort: %s\n",
"sort node shutdown");
@@ -335,51 +358,47 @@ ExecEndSort(Sort *node)
/* ----------------------------------------------------------------
* ExecSortMarkPos
*
- * Calls psort to save the current position in the sorted file.
+ * Calls tuplesort to save the current position in the sorted file.
* ----------------------------------------------------------------
*/
void
ExecSortMarkPos(Sort *node)
{
- SortState *sortstate;
+ SortState *sortstate = node->sortstate;
/* ----------------
* if we haven't sorted yet, just return
* ----------------
*/
- sortstate = node->sortstate;
- if (sortstate->sort_Flag == false)
+ if (! sortstate->sort_Done)
return;
- psort_markpos(node);
-
- return;
+ tuplesort_markpos((Tuplesortstate *) sortstate->tuplesortstate);
}
/* ----------------------------------------------------------------
* ExecSortRestrPos
*
- * Calls psort to restore the last saved sort file position.
+ * Calls tuplesort to restore the last saved sort file position.
* ----------------------------------------------------------------
*/
void
ExecSortRestrPos(Sort *node)
{
- SortState *sortstate;
+ SortState *sortstate = node->sortstate;
/* ----------------
* if we haven't sorted yet, just return.
* ----------------
*/
- sortstate = node->sortstate;
- if (sortstate->sort_Flag == false)
+ if (! sortstate->sort_Done)
return;
/* ----------------
* restore the scan to the previously marked position
* ----------------
*/
- psort_restorepos(node);
+ tuplesort_restorepos((Tuplesortstate *) sortstate->tuplesortstate);
}
void
@@ -392,17 +411,25 @@ ExecReScanSort(Sort *node, ExprContext *exprCtxt, Plan *parent)
* not NULL then it will be re-scanned by ExecProcNode, else - no
* reason to re-scan it at all.
*/
- if (sortstate->sort_Flag == false)
+ if (! sortstate->sort_Done)
return;
ExecClearTuple(sortstate->csstate.cstate.cs_ResultTupleSlot);
- psort_rescan(node);
-
/*
- * If subnode is to be rescanned then we aren't sorted
+ * If subnode is to be rescanned then we forget previous sort
+ * results; we have to re-read the subplan and re-sort.
+ *
+ * Otherwise we can just rewind and rescan the sorted output.
*/
if (((Plan *) node)->lefttree->chgParam != NULL)
- sortstate->sort_Flag = false;
-
+ {
+ sortstate->sort_Done = false;
+ tuplesort_end((Tuplesortstate *) sortstate->tuplesortstate);
+ sortstate->tuplesortstate = NULL;
+ }
+ else
+ {
+ tuplesort_rescan((Tuplesortstate *) sortstate->tuplesortstate);
+ }
}
diff --git a/src/backend/utils/sort/Makefile b/src/backend/utils/sort/Makefile
index d411a89c735..c680a089230 100644
--- a/src/backend/utils/sort/Makefile
+++ b/src/backend/utils/sort/Makefile
@@ -4,7 +4,7 @@
# Makefile for utils/sort
#
# IDENTIFICATION
-# $Header: /cvsroot/pgsql/src/backend/utils/sort/Makefile,v 1.6 1999/10/16 19:49:27 tgl Exp $
+# $Header: /cvsroot/pgsql/src/backend/utils/sort/Makefile,v 1.7 1999/10/17 22:15:05 tgl Exp $
#
#-------------------------------------------------------------------------
@@ -13,7 +13,7 @@ include ../../../Makefile.global
CFLAGS += -I../..
-OBJS = logtape.o lselect.o psort.o
+OBJS = logtape.o tuplesort.o
all: SUBSYS.o
diff --git a/src/backend/utils/sort/logtape.c b/src/backend/utils/sort/logtape.c
index 8d5d34c00a7..46497598b56 100644
--- a/src/backend/utils/sort/logtape.c
+++ b/src/backend/utils/sort/logtape.c
@@ -4,8 +4,8 @@
* Management of "logical tapes" within temporary files.
*
* This module exists to support sorting via multiple merge passes (see
- * psort.c). Merging is an ideal algorithm for tape devices, but if we
- * implement it on disk by creating a separate file for each "tape",
+ * tuplesort.c). Merging is an ideal algorithm for tape devices, but if
+ * we implement it on disk by creating a separate file for each "tape",
* there is an annoying problem: the peak space usage is at least twice
* the volume of actual data to be sorted. (This must be so because each
* datum will appear in both the input and output tapes of the final
@@ -23,7 +23,7 @@
* Few OSes allow arbitrary parts of a file to be released back to the OS,
* so we have to implement this space-recycling ourselves within a single
* logical file. logtape.c exists to perform this bookkeeping and provide
- * the illusion of N independent tape devices to psort.c. Note that
+ * the illusion of N independent tape devices to tuplesort.c. Note that
* logtape.c itself depends on buffile.c to provide a "logical file" of
* larger size than the underlying OS may support.
*
@@ -63,7 +63,7 @@
* Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/utils/sort/logtape.c,v 1.1 1999/10/16 19:49:27 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/utils/sort/logtape.c,v 1.2 1999/10/17 22:15:05 tgl Exp $
*
*-------------------------------------------------------------------------
*/
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
new file mode 100644
index 00000000000..2240564fa25
--- /dev/null
+++ b/src/backend/utils/sort/tuplesort.c
@@ -0,0 +1,1465 @@
+/*-------------------------------------------------------------------------
+ *
+ * tuplesort.c
+ * Generalized tuple sorting routines.
+ *
+ * This module handles sorting of either heap tuples or index tuples
+ * (and could fairly easily support other kinds of sortable objects,
+ * if necessary). It works efficiently for both small and large amounts
+ * of data. Small amounts are sorted in-memory using qsort(). Large
+ * amounts are sorted using temporary files and a standard external sort
+ * algorithm.
+ *
+ * See Knuth, volume 3, for more than you want to know about the external
+ * sorting algorithm. We divide the input into sorted runs using replacement
+ * selection, in the form of a priority tree implemented as a heap
+ * (essentially his Algorithm 5.2.3H), then merge the runs using polyphase
+ * merge, Knuth's Algorithm 5.4.2D. The logical "tapes" used by Algorithm D
+ * are implemented by logtape.c, which avoids space wastage by recycling
+ * disk space as soon as each block is read from its "tape".
+ *
+ * We do not form the initial runs using Knuth's recommended replacement
+ * selection method (Algorithm 5.4.1R), because it uses a fixed number of
+ * records in memory at all times. Since we are dealing with tuples that
+ * may vary considerably in size, we want to be able to vary the number of
+ * records kept in memory to ensure full utilization of the allowed sort
+ * memory space. This is easily done by keeping a variable-size heap in
+ * which the records of the current run are stored, plus a variable-size
+ * unsorted array holding records that must go into the next run.
+ *
+ * The (approximate) amount of memory allowed for any one sort operation
+ * is given in kilobytes by the external variable SortMem. Initially,
+ * we absorb tuples and simply store them in an unsorted array as long as
+ * we haven't exceeded SortMem. If we reach the end of the input without
+ * exceeding SortMem, we sort the array using qsort() and subsequently return
+ * tuples just by scanning the tuple array sequentially. If we do exceed
+ * SortMem, we construct a heap using Algorithm H and begin to emit tuples
+ * into sorted runs in temporary tapes, emitting just enough tuples at each
+ * step to get back within the SortMem limit. New tuples are added to the
+ * heap if they can go into the current run, else they are temporarily added
+ * to the unsorted array. Whenever the heap empties, we construct a new heap
+ * from the current contents of the unsorted array, and begin a new run with a
+ * new output tape (selected per Algorithm D). After the end of the input
+ * is reached, we dump out remaining tuples in memory into a final run
+ * (or two), then merge the runs using Algorithm D.
+ *
+ * When the caller requests random access to the sort result, we form
+ * the final sorted run on a logical tape which is then "frozen", so
+ * that we can access it randomly. When the caller does not need random
+ * access, we return from tuplesort_performsort() as soon as we are down
+ * to one run per logical tape. The final merge is then performed
+ * on-the-fly as the caller repeatedly calls tuplesort_gettuple; this
+ * saves one cycle of writing all the data out to disk and reading it in.
+ *
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * $Header: /cvsroot/pgsql/src/backend/utils/sort/tuplesort.c,v 1.1 1999/10/17 22:15:05 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/heapam.h"
+#include "access/nbtree.h"
+#include "miscadmin.h"
+#include "utils/logtape.h"
+#include "utils/tuplesort.h"
+
+/*
+ * Possible states of a Tuplesort object. These denote the states that
+ * persist between calls of Tuplesort routines.
+ */
+typedef enum
+{
+ TSS_INITIAL, /* Loading tuples; still within memory limit */
+ TSS_BUILDRUNS, /* Loading tuples; writing to tape */
+ TSS_SORTEDINMEM, /* Sort completed entirely in memory */
+ TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */
+ TSS_FINALMERGE /* Performing final merge on-the-fly */
+} TupSortStatus;
+
+/*
+ * We use a seven-tape polyphase merge, which is the "sweet spot" on the
+ * tapes-to-passes curve according to Knuth's figure 70 (section 5.4.2).
+ */
+#define MAXTAPES 7 /* Knuth's T */
+#define TAPERANGE (MAXTAPES-1) /* Knuth's P */
+
+/*
+ * Private state of a Tuplesort operation.
+ */
+struct Tuplesortstate
+{
+ TupSortStatus status; /* enumerated value as shown above */
+ bool randomAccess; /* did caller request random access? */
+ long availMem; /* remaining memory available, in bytes */
+ LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */
+
+ /*
+ * These function pointers decouple the routines that must know what kind
+ * of tuple we are sorting from the routines that don't need to know it.
+ * They are set up by the tuplesort_begin_xxx routines.
+ *
+ * Function to compare two tuples; result is per qsort() convention,
+ * ie, <0, 0, >0 according as a<b, a=b, a>b.
+ */
+ int (*comparetup) (Tuplesortstate *state, const void *a, const void *b);
+ /*
+ * Function to copy a supplied input tuple into palloc'd space.
+ * (NB: we assume that a single pfree() is enough to release the tuple
+ * later, so the representation must be "flat" in one palloc chunk.)
+ * state->availMem must be decreased by the amount of space used.
+ */
+ void * (*copytup) (Tuplesortstate *state, void *tup);
+ /*
+ * Function to write a stored tuple onto tape. The representation of
+ * the tuple on tape need not be the same as it is in memory; requirements
+ * on the tape representation are given below. After writing the tuple,
+ * pfree() it, and increase state->availMem by the amount of memory space
+ * thereby released.
+ */
+ void (*writetup) (Tuplesortstate *state, int tapenum, void *tup);
+ /*
+ * Function to read a stored tuple from tape back into memory.
+ * 'len' is the already-read length of the stored tuple. Create and
+ * return a palloc'd copy, and decrease state->availMem by the amount
+ * of memory space consumed.
+ */
+ void * (*readtup) (Tuplesortstate *state, int tapenum, unsigned int len);
+
+ /*
+ * This array holds "unsorted" tuples during the input phases.
+ * If we are able to complete the sort in memory, it holds the
+ * final sorted result as well.
+ */
+ void **memtuples; /* array of pointers to palloc'd tuples */
+ int memtupcount; /* number of tuples currently present */
+ int memtupsize; /* allocated length of memtuples array */
+
+ /*
+ * This array holds the partially-sorted "heap" of tuples that will go
+ * out in the current run during BUILDRUNS state. While completing
+ * the sort, we use it to merge runs of tuples from input tapes.
+ * It is never allocated unless we need to use tapes.
+ */
+ void **heaptuples; /* array of pointers to palloc'd tuples */
+ int heaptupcount; /* number of tuples currently present */
+ int heaptupsize; /* allocated length of heaptuples array */
+ /*
+ * While merging, this array holds the actual number of the input tape
+ * that each tuple in heaptuples[] came from.
+ */
+ int *heapsrctapes;
+
+ /*
+ * Variables for Algorithm D. Note that destTape is a "logical" tape
+ * number, ie, an index into the tp_xxx[] arrays. Be careful to keep
+ * "logical" and "actual" tape numbers straight!
+ */
+ int Level; /* Knuth's l */
+ int destTape; /* current output tape (Knuth's j, less 1) */
+ int tp_fib[MAXTAPES]; /* Target Fibonacci run counts (A[]) */
+ int tp_runs[MAXTAPES]; /* # of real runs on each tape */
+ int tp_dummy[MAXTAPES]; /* # of dummy runs for each tape (D[]) */
+ int tp_tapenum[MAXTAPES]; /* Actual tape numbers (TAPE[]) */
+
+ bool multipleRuns; /* T if we have created more than 1 run */
+
+ /*
+ * These variables are used after completion of sorting to keep track
+ * of the next tuple to return. (In the tape case, the tape's current
+ * read position is also critical state.)
+ */
+ int result_tape; /* actual tape number of finished output */
+ int current; /* array index (only used if SORTEDINMEM) */
+ bool eof_reached; /* reached EOF (needed for cursors) */
+
+ /* markpos_xxx holds marked position for mark and restore */
+ long markpos_block; /* tape block# (only used if SORTEDONTAPE) */
+ int markpos_offset; /* saved "current", or offset in tape block */
+ bool markpos_eof; /* saved "eof_reached" */
+
+ /*
+ * These variables are specific to the HeapTuple case; they are set
+ * by tuplesort_begin_heap and used only by the HeapTuple routines.
+ */
+ TupleDesc tupDesc;
+ int nKeys;
+ ScanKey scanKeys;
+
+ /*
+ * These variables are specific to the IndexTuple case; they are set
+ * by tuplesort_begin_index and used only by the IndexTuple routines.
+ */
+ Relation indexRel;
+ bool enforceUnique; /* complain if we find duplicate tuples */
+};
+
+#define COMPARETUP(state,a,b) ((*(state)->comparetup) (state, a, b))
+#define COPYTUP(state,tup) ((*(state)->copytup) (state, tup))
+#define WRITETUP(state,tape,tup) ((*(state)->writetup) (state, tape, tup))
+#define READTUP(state,tape,len) ((*(state)->readtup) (state, tape, len))
+#define LACKMEM(state) ((state)->availMem < 0)
+#define USEMEM(state,amt) ((state)->availMem -= (amt))
+#define FREEMEM(state,amt) ((state)->availMem += (amt))
+
+/*--------------------
+ *
+ * NOTES about on-tape representation of tuples:
+ *
+ * We require the first "unsigned int" of a stored tuple to be the total size
+ * on-tape of the tuple, including itself (so it is never zero; an all-zero
+ * unsigned int is used to delimit runs). The remainder of the stored tuple
+ * may or may not match the in-memory representation of the tuple ---
+ * any conversion needed is the job of the writetup and readtup routines.
+ *
+ * If state->randomAccess is true, then the stored representation of the
+ * tuple must be followed by another "unsigned int" that is a copy of the
+ * length --- so the total tape space used is actually sizeof(unsigned int)
+ * more than the stored length value. This allows read-backwards. When
+ * randomAccess is not true, the write/read routines may omit the extra
+ * length word.
+ *
+ * writetup is expected to write both length words as well as the tuple
+ * data. When readtup is called, the tape is positioned just after the
+ * front length word; readtup must read the tuple data and advance past
+ * the back length word (if present).
+ *
+ * The write/read routines can make use of the tuple description data
+ * stored in the Tuplesortstate record, if needed. They are also expected
+ * to adjust state->availMem by the amount of memory space (not tape space!)
+ * released or consumed. There is no error return from either writetup
+ * or readtup; they should elog() on failure.
+ *
+ *
+ * NOTES about memory consumption calculations:
+ *
+ * We count space requested for tuples against the SortMem limit.
+ * Fixed-size space (primarily the LogicalTapeSet I/O buffers) is not
+ * counted, nor do we count the variable-size memtuples and heaptuples
+ * arrays. (Even though those could grow pretty large, they should be
+ * small compared to the tuples proper, so this is not unreasonable.)
+ *
+ * The major deficiency in this approach is that it ignores palloc overhead.
+ * The memory space actually allocated for a palloc chunk is always more
+ * than the request size, and could be considerably more (as much as 2X
+ * larger, in the current aset.c implementation). So the space used could
+ * be considerably more than SortMem says.
+ *
+ * One way to fix this is to add a memory management function that, given
+ * a pointer to a palloc'd chunk, returns the actual space consumed by the
+ * chunk. This would be very easy in the current aset.c module, but I'm
+ * hesitant to do it because it might be unpleasant to support in future
+ * implementations of memory management. (For example, a direct
+ * implementation of palloc as malloc could not support such a function
+ * portably.)
+ *
+ * A cruder answer is just to apply a fudge factor, say by initializing
+ * availMem to only three-quarters of what SortMem indicates. This is
+ * probably the right answer if anyone complains that SortMem is not being
+ * obeyed very faithfully.
+ *
+ *--------------------
+ */
+
+static Tuplesortstate *tuplesort_begin_common(bool randomAccess);
+static void inittapes(Tuplesortstate *state);
+static void selectnewtape(Tuplesortstate *state);
+static void mergeruns(Tuplesortstate *state);
+static void mergeonerun(Tuplesortstate *state);
+static void beginmerge(Tuplesortstate *state);
+static void beginrun(Tuplesortstate *state);
+static void dumptuples(Tuplesortstate *state, bool alltuples);
+static void tuplesort_heap_insert(Tuplesortstate *state, void *tuple,
+ int tapenum);
+static void tuplesort_heap_siftup(Tuplesortstate *state);
+static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK);
+static void markrunend(Tuplesortstate *state, int tapenum);
+static int qsort_comparetup(const void *a, const void *b);
+static int comparetup_heap(Tuplesortstate *state,
+ const void *a, const void *b);
+static void *copytup_heap(Tuplesortstate *state, void *tup);
+static void writetup_heap(Tuplesortstate *state, int tapenum, void *tup);
+static void *readtup_heap(Tuplesortstate *state, int tapenum,
+ unsigned int len);
+static int comparetup_index(Tuplesortstate *state,
+ const void *a, const void *b);
+static void *copytup_index(Tuplesortstate *state, void *tup);
+static void writetup_index(Tuplesortstate *state, int tapenum, void *tup);
+static void *readtup_index(Tuplesortstate *state, int tapenum,
+ unsigned int len);
+
+/*
+ * Since qsort(3) will not pass any context info to qsort_comparetup(),
+ * we have to use this ugly static variable. It is set to point to the
+ * active Tuplesortstate object just before calling qsort. It should
+ * not be used directly by anything except qsort_comparetup().
+ */
+static Tuplesortstate *qsort_tuplesortstate;
+
+
+/*
+ * tuplesort_begin_xxx
+ *
+ * Initialize for a tuple sort operation.
+ *
+ * After calling tuplesort_begin, the caller should call tuplesort_puttuple
+ * zero or more times, then call tuplesort_performsort when all the tuples
+ * have been supplied. After performsort, retrieve the tuples in sorted
+ * order by calling tuplesort_gettuple until it returns NULL. (If random
+ * access was requested, rescan, markpos, and restorepos can also be called.)
+ * Call tuplesort_end to terminate the operation and release memory/disk space.
+ */
+
+static Tuplesortstate *
+tuplesort_begin_common(bool randomAccess)
+{
+ Tuplesortstate *state;
+
+ state = (Tuplesortstate *) palloc(sizeof(Tuplesortstate));
+
+ MemSet((char *) state, 0, sizeof(Tuplesortstate));
+
+ state->status = TSS_INITIAL;
+ state->randomAccess = randomAccess;
+ state->availMem = SortMem * 1024L;
+ state->tapeset = NULL;
+
+ state->memtupcount = 0;
+ state->memtupsize = 1024; /* initial guess */
+ state->memtuples = (void **) palloc(state->memtupsize * sizeof(void *));
+
+ state->heaptuples = NULL; /* until and unless needed */
+ state->heaptupcount = 0;
+ state->heaptupsize = 0;
+ state->heapsrctapes = NULL;
+
+ /* Algorithm D variables will be initialized by inittapes, if needed */
+
+ state->result_tape = -1; /* flag that result tape has not been formed */
+
+ return state;
+}
+
+Tuplesortstate *
+tuplesort_begin_heap(TupleDesc tupDesc,
+ int nkeys, ScanKey keys,
+ bool randomAccess)
+{
+ Tuplesortstate *state = tuplesort_begin_common(randomAccess);
+
+ AssertArg(nkeys >= 1);
+ AssertArg(keys[0].sk_attno != 0);
+ AssertArg(keys[0].sk_procedure != 0);
+
+ state->comparetup = comparetup_heap;
+ state->copytup = copytup_heap;
+ state->writetup = writetup_heap;
+ state->readtup = readtup_heap;
+
+ state->tupDesc = tupDesc;
+ state->nKeys = nkeys;
+ state->scanKeys = keys;
+
+ return state;
+}
+
+Tuplesortstate *
+tuplesort_begin_index(Relation indexRel,
+ bool enforceUnique,
+ bool randomAccess)
+{
+ Tuplesortstate *state = tuplesort_begin_common(randomAccess);
+
+ state->comparetup = comparetup_index;
+ state->copytup = copytup_index;
+ state->writetup = writetup_index;
+ state->readtup = readtup_index;
+
+ state->indexRel = indexRel;
+ state->enforceUnique = enforceUnique;
+
+ return state;
+}
+
+/*
+ * tuplesort_end
+ *
+ * Release resources and clean up.
+ */
+void
+tuplesort_end(Tuplesortstate *state)
+{
+ int i;
+
+ if (state->tapeset)
+ LogicalTapeSetClose(state->tapeset);
+ if (state->memtuples)
+ {
+ for (i = 0; i < state->memtupcount; i++)
+ pfree(state->memtuples[i]);
+ pfree(state->memtuples);
+ }
+ if (state->heaptuples)
+ {
+ for (i = 0; i < state->heaptupcount; i++)
+ pfree(state->heaptuples[i]);
+ pfree(state->heaptuples);
+ }
+ if (state->heapsrctapes)
+ pfree(state->heapsrctapes);
+}
+
+/*
+ * Accept one tuple while collecting input data for sort.
+ *
+ * Note that the input tuple is always copied; the caller need not save it.
+ */
+void
+tuplesort_puttuple(Tuplesortstate *state, void *tuple)
+{
+ /*
+ * Copy the given tuple into memory we control, and decrease availMem.
+ */
+ tuple = COPYTUP(state, tuple);
+
+ switch (state->status)
+ {
+ case TSS_INITIAL:
+ /*
+ * Save the copied tuple into the unsorted array.
+ */
+ if (state->memtupcount >= state->memtupsize)
+ {
+ /* Grow the unsorted array as needed. */
+ state->memtupsize *= 2;
+ state->memtuples = (void **)
+ repalloc(state->memtuples,
+ state->memtupsize * sizeof(void *));
+ }
+ state->memtuples[state->memtupcount++] = tuple;
+ /*
+ * Done if we still fit in available memory.
+ */
+ if (! LACKMEM(state))
+ return;
+ /*
+ * Nope; time to switch to tape-based operation.
+ */
+ inittapes(state);
+ beginrun(state);
+ /*
+ * Dump tuples until we are back under the limit.
+ */
+ dumptuples(state, false);
+ break;
+ case TSS_BUILDRUNS:
+ /*
+ * Insert the copied tuple into the heap if it can go into the
+ * current run; otherwise add it to the unsorted array, whence
+ * it will go into the next run.
+ *
+ * The tuple can go into the current run if it is >= the first
+ * not-yet-output tuple. (Actually, it could go into the current
+ * run if it is >= the most recently output tuple ... but that
+ * would require keeping around the tuple we last output, and
+ * it's simplest to let writetup free the tuple when written.)
+ *
+ * Note there will always be at least one tuple in the heap
+ * at this point; see dumptuples.
+ */
+ Assert(state->heaptupcount > 0);
+ if (COMPARETUP(state, tuple, state->heaptuples[0]) >= 0)
+ {
+ tuplesort_heap_insert(state, tuple, 0);
+ }
+ else
+ {
+ if (state->memtupcount >= state->memtupsize)
+ {
+ /* Grow the unsorted array as needed. */
+ state->memtupsize *= 2;
+ state->memtuples = (void **)
+ repalloc(state->memtuples,
+ state->memtupsize * sizeof(void *));
+ }
+ state->memtuples[state->memtupcount++] = tuple;
+ }
+ /*
+ * If we are over the memory limit, dump tuples till we're under.
+ */
+ dumptuples(state, false);
+ break;
+ default:
+ elog(ERROR, "tuplesort_puttuple: invalid state");
+ break;
+ }
+}
+
+/*
+ * All tuples have been provided; finish the sort.
+ */
+void
+tuplesort_performsort(Tuplesortstate *state)
+{
+ switch (state->status)
+ {
+ case TSS_INITIAL:
+ /*
+ * We were able to accumulate all the tuples within the
+ * allowed amount of memory. Just qsort 'em and we're done.
+ */
+ if (state->memtupcount > 1)
+ {
+ qsort_tuplesortstate = state;
+ qsort((void *) state->memtuples, state->memtupcount,
+ sizeof(void *), qsort_comparetup);
+ }
+ state->current = 0;
+ state->eof_reached = false;
+ state->markpos_offset = 0;
+ state->markpos_eof = false;
+ state->status = TSS_SORTEDINMEM;
+ break;
+ case TSS_BUILDRUNS:
+ /*
+ * Finish tape-based sort. First, flush all tuples remaining
+ * in memory out to tape; then merge until we have a single
+ * remaining run (or, if !randomAccess, one run per tape).
+ * Note that mergeruns sets the correct status.
+ */
+ dumptuples(state, true);
+ mergeruns(state);
+ state->eof_reached = false;
+ state->markpos_block = 0L;
+ state->markpos_offset = 0;
+ state->markpos_eof = false;
+ break;
+ default:
+ elog(ERROR, "tuplesort_performsort: invalid state");
+ break;
+ }
+}
+
+/*
+ * Fetch the next tuple in either forward or back direction.
+ * Returns NULL if no more tuples. If should_free is set, the
+ * caller must pfree the returned tuple when done with it.
+ */
+void *
+tuplesort_gettuple(Tuplesortstate *state, bool forward,
+ bool *should_free)
+{
+ unsigned int tuplen;
+ void *tup;
+
+ switch (state->status)
+ {
+ case TSS_SORTEDINMEM:
+ Assert(forward || state->randomAccess);
+ *should_free = false;
+ if (forward)
+ {
+ if (state->current < state->memtupcount)
+ return state->memtuples[state->current++];
+ state->eof_reached = true;
+ return NULL;
+ }
+ else
+ {
+ if (state->current <= 0)
+ return NULL;
+				/*
+				 * If all tuples were already fetched, return the last tuple;
+				 * otherwise return the tuple before the last one returned.
+				 */
+ if (state->eof_reached)
+ state->eof_reached = false;
+ else
+ {
+ state->current--; /* last returned tuple */
+ if (state->current <= 0)
+ return NULL;
+ }
+ return state->memtuples[state->current - 1];
+ }
+ break;
+
+ case TSS_SORTEDONTAPE:
+ Assert(forward || state->randomAccess);
+ *should_free = true;
+ if (forward)
+ {
+ if (state->eof_reached)
+ return NULL;
+ if ((tuplen = getlen(state, state->result_tape, true)) != 0)
+ {
+ tup = READTUP(state, state->result_tape, tuplen);
+ return tup;
+ }
+ else
+ {
+ state->eof_reached = true;
+ return NULL;
+ }
+ }
+			/* Backward.
+			 *
+			 * If all tuples were already fetched, return the last tuple;
+			 * otherwise return the tuple before the last one returned.
+			 */
+ if (state->eof_reached)
+ {
+ /*
+ * Seek position is pointing just past the zero tuplen
+ * at the end of file; back up to fetch last tuple's ending
+ * length word. If seek fails we must have a completely empty
+ * file.
+ */
+ if (! LogicalTapeBackspace(state->tapeset,
+ state->result_tape,
+ 2 * sizeof(unsigned int)))
+ return NULL;
+ state->eof_reached = false;
+ }
+ else
+ {
+ /*
+ * Back up and fetch previously-returned tuple's ending length
+ * word. If seek fails, assume we are at start of file.
+ */
+ if (! LogicalTapeBackspace(state->tapeset,
+ state->result_tape,
+ sizeof(unsigned int)))
+ return NULL;
+ tuplen = getlen(state, state->result_tape, false);
+ /*
+ * Back up to get ending length word of tuple before it.
+ */
+ if (! LogicalTapeBackspace(state->tapeset,
+ state->result_tape,
+ tuplen + 2 * sizeof(unsigned int)))
+ {
+ /* If that fails, presumably the prev tuple is the first
+ * in the file. Back up so that it becomes next to read
+ * in forward direction (not obviously right, but that is
+ * what in-memory case does).
+ */
+ if (! LogicalTapeBackspace(state->tapeset,
+ state->result_tape,
+ tuplen + sizeof(unsigned int)))
+ elog(ERROR, "tuplesort_gettuple: bogus tuple len in backward scan");
+ return NULL;
+ }
+ }
+
+ tuplen = getlen(state, state->result_tape, false);
+ /*
+ * Now we have the length of the prior tuple, back up and read it.
+ * Note: READTUP expects we are positioned after the initial
+ * length word of the tuple, so back up to that point.
+ */
+ if (! LogicalTapeBackspace(state->tapeset,
+ state->result_tape,
+ tuplen))
+ elog(ERROR, "tuplesort_gettuple: bogus tuple len in backward scan");
+ tup = READTUP(state, state->result_tape, tuplen);
+ return tup;
+
+ case TSS_FINALMERGE:
+ Assert(forward);
+ *should_free = true;
+ /*
+ * This code should match the inner loop of mergeonerun().
+ */
+ if (state->heaptupcount > 0)
+ {
+ int srcTape = state->heapsrctapes[0];
+
+ tup = state->heaptuples[0];
+ tuplesort_heap_siftup(state);
+ if ((tuplen = getlen(state, srcTape, true)) != 0)
+ {
+ void *newtup = READTUP(state, srcTape, tuplen);
+ tuplesort_heap_insert(state, newtup, srcTape);
+ }
+ return tup;
+ }
+ return NULL;
+
+ default:
+ elog(ERROR, "tuplesort_gettuple: invalid state");
+ return NULL; /* keep compiler quiet */
+ }
+}
+
+/*
+ * inittapes - initialize for tape sorting.
+ *
+ * This is called only if we have found we don't have room to sort in memory.
+ */
+static void
+inittapes(Tuplesortstate *state)
+{
+ int j;
+
+ state->tapeset = LogicalTapeSetCreate(MAXTAPES);
+
+ /*
+ * Initialize heaptuples array slightly larger than current memtuples
+ * usage; memtupcount is probably a good guess at how many tuples we
+ * will be able to have in the heap at once.
+ */
+ state->heaptupcount = 0;
+ state->heaptupsize = state->memtupcount + state->memtupcount / 4;
+ state->heaptuples = (void **) palloc(state->heaptupsize * sizeof(void *));
+
+ /*
+ * Initialize variables of Algorithm D (step D1).
+ */
+ for (j = 0; j < MAXTAPES; j++)
+ {
+ state->tp_fib[j] = 1;
+ state->tp_runs[j] = 0;
+ state->tp_dummy[j] = 1;
+ state->tp_tapenum[j] = j;
+ }
+ state->tp_fib[TAPERANGE] = 0;
+ state->tp_dummy[TAPERANGE] = 0;
+
+ state->Level = 1;
+ state->destTape = 0;
+
+ state->multipleRuns = false;
+
+ state->status = TSS_BUILDRUNS;
+}
+
+/*
+ * selectnewtape -- select new tape for new initial run.
+ *
+ * This is called after finishing a run when we know another run
+ * must be started. This implements steps D3, D4 of Algorithm D.
+ */
+static void
+selectnewtape(Tuplesortstate *state)
+{
+ int j;
+ int a;
+
+ /* We now have at least two initial runs */
+ state->multipleRuns = true;
+
+ /* Step D3: advance j (destTape) */
+ if (state->tp_dummy[state->destTape] < state->tp_dummy[state->destTape+1])
+ {
+ state->destTape++;
+ return;
+ }
+ if (state->tp_dummy[state->destTape] != 0)
+ {
+ state->destTape = 0;
+ return;
+ }
+
+ /* Step D4: increase level */
+ state->Level++;
+ a = state->tp_fib[0];
+ for (j = 0; j < TAPERANGE; j++)
+ {
+ state->tp_dummy[j] = a + state->tp_fib[j+1] - state->tp_fib[j];
+ state->tp_fib[j] = a + state->tp_fib[j+1];
+ }
+ state->destTape = 0;
+}
+
+/*
+ * mergeruns -- merge all the completed initial runs.
+ *
+ * This implements steps D5, D6 of Algorithm D. All input data has
+ * already been written to initial runs on tape (see dumptuples).
+ */
+static void
+mergeruns(Tuplesortstate *state)
+{
+ int tapenum,
+ svTape,
+ svRuns,
+ svDummy;
+
+ Assert(state->status == TSS_BUILDRUNS);
+ Assert(state->memtupcount == 0 && state->heaptupcount == 0);
+ /*
+ * If we produced only one initial run (quite likely if the total
+ * data volume is between 1X and 2X SortMem), we can just use that
+ * tape as the finished output, rather than doing a useless merge.
+ */
+ if (! state->multipleRuns)
+ {
+ state->result_tape = state->tp_tapenum[state->destTape];
+ /* must freeze and rewind the finished output tape */
+ LogicalTapeFreeze(state->tapeset, state->result_tape);
+ state->status = TSS_SORTEDONTAPE;
+ return;
+ }
+
+ /* End of step D2: rewind all output tapes to prepare for merging */
+ for (tapenum = 0; tapenum < TAPERANGE; tapenum++)
+ LogicalTapeRewind(state->tapeset, tapenum, false);
+
+ for (;;)
+ {
+ /* Step D5: merge runs onto tape[T] until tape[P] is empty */
+ while (state->tp_runs[TAPERANGE-1] || state->tp_dummy[TAPERANGE-1])
+ {
+ bool allDummy = true;
+ bool allOneRun = true;
+
+ for (tapenum = 0; tapenum < TAPERANGE; tapenum++)
+ {
+ if (state->tp_dummy[tapenum] == 0)
+ allDummy = false;
+ if (state->tp_runs[tapenum] + state->tp_dummy[tapenum] != 1)
+ allOneRun = false;
+ }
+ /*
+ * If we don't have to produce a materialized sorted tape,
+ * quit as soon as we're down to one real/dummy run per tape.
+ */
+ if (! state->randomAccess && allOneRun)
+ {
+ Assert(! allDummy);
+ /* Initialize for the final merge pass */
+ beginmerge(state);
+ state->status = TSS_FINALMERGE;
+ return;
+ }
+ if (allDummy)
+ {
+ state->tp_dummy[TAPERANGE]++;
+ for (tapenum = 0; tapenum < TAPERANGE; tapenum++)
+ state->tp_dummy[tapenum]--;
+ }
+ else
+ {
+ mergeonerun(state);
+ }
+ }
+ /* Step D6: decrease level */
+ if (--state->Level == 0)
+ break;
+ /* rewind output tape T to use as new input */
+ LogicalTapeRewind(state->tapeset, state->tp_tapenum[TAPERANGE],
+ false);
+ /* rewind used-up input tape P, and prepare it for write pass */
+ LogicalTapeRewind(state->tapeset, state->tp_tapenum[TAPERANGE-1],
+ true);
+ state->tp_runs[TAPERANGE-1] = 0;
+ /* reassign tape units per step D6; note we no longer care about A[] */
+ svTape = state->tp_tapenum[TAPERANGE];
+ svDummy = state->tp_dummy[TAPERANGE];
+ svRuns = state->tp_runs[TAPERANGE];
+ for (tapenum = TAPERANGE; tapenum > 0; tapenum--)
+ {
+ state->tp_tapenum[tapenum] = state->tp_tapenum[tapenum-1];
+ state->tp_dummy[tapenum] = state->tp_dummy[tapenum-1];
+ state->tp_runs[tapenum] = state->tp_runs[tapenum-1];
+ }
+ state->tp_tapenum[0] = svTape;
+ state->tp_dummy[0] = svDummy;
+ state->tp_runs[0] = svRuns;
+ }
+ /*
+ * Done. Knuth says that the result is on TAPE[1], but since we exited
+ * the loop without performing the last iteration of step D6, we have not
+ * rearranged the tape unit assignment, and therefore the result is on
+ * TAPE[T]. We need to do it this way so that we can freeze the final
+ * output tape while rewinding it. The last iteration of step D6 would
+ * be a waste of cycles anyway...
+ */
+ state->result_tape = state->tp_tapenum[TAPERANGE];
+ LogicalTapeFreeze(state->tapeset, state->result_tape);
+ state->status = TSS_SORTEDONTAPE;
+}
+
+/*
+ * Merge one run from each input tape, except ones with dummy runs.
+ *
+ * This is the inner loop of Algorithm D step D5. We know that the
+ * output tape is TAPE[T].
+ */
+static void
+mergeonerun(Tuplesortstate *state)
+{
+ int destTape = state->tp_tapenum[TAPERANGE];
+ int srcTape;
+ unsigned int tuplen;
+ void *tup;
+
+ /*
+ * Start the merge by loading one tuple from each active source tape
+ * into the heap. We can also decrease the input run/dummy run counts.
+ */
+ beginmerge(state);
+
+ /*
+ * Execute merge by repeatedly extracting lowest tuple in heap,
+ * writing it out, and replacing it with next tuple from same tape
+ * (if there is another one).
+ */
+ while (state->heaptupcount > 0)
+ {
+ WRITETUP(state, destTape, state->heaptuples[0]);
+ srcTape = state->heapsrctapes[0];
+ tuplesort_heap_siftup(state);
+ if ((tuplen = getlen(state, srcTape, true)) != 0)
+ {
+ tup = READTUP(state, srcTape, tuplen);
+ tuplesort_heap_insert(state, tup, srcTape);
+ }
+ }
+
+ /*
+ * When the heap empties, we're done. Write an end-of-run marker
+ * on the output tape, and increment its count of real runs.
+ */
+ markrunend(state, destTape);
+ state->tp_runs[TAPERANGE]++;
+}
+
+/*
+ * beginmerge - initialize for a merge pass
+ *
+ * We load the first tuple from each nondummy input run into the heap.
+ * We also decrease the counts of real and dummy runs for each tape.
+ */
+static void
+beginmerge(Tuplesortstate *state)
+{
+ int tapenum;
+ int srcTape;
+ unsigned int tuplen;
+ void *tup;
+
+ Assert(state->heaptuples != NULL && state->heaptupcount == 0);
+ if (state->heapsrctapes == NULL)
+ state->heapsrctapes = (int *) palloc(MAXTAPES * sizeof(int));
+
+ for (tapenum = 0; tapenum < TAPERANGE; tapenum++)
+ {
+ if (state->tp_dummy[tapenum] > 0)
+ {
+ state->tp_dummy[tapenum]--;
+ }
+ else
+ {
+ Assert(state->tp_runs[tapenum] > 0);
+ state->tp_runs[tapenum]--;
+ srcTape = state->tp_tapenum[tapenum];
+ tuplen = getlen(state, srcTape, false);
+ tup = READTUP(state, srcTape, tuplen);
+ tuplesort_heap_insert(state, tup, srcTape);
+ }
+ }
+
+}
+
+/*
+ * beginrun - start a new initial run
+ *
+ * The tuples presently in the unsorted memory array are moved into
+ * the heap.
+ */
+static void
+beginrun(Tuplesortstate *state)
+{
+ int i;
+
+ Assert(state->heaptupcount == 0 && state->memtupcount > 0);
+ for (i = 0; i < state->memtupcount; i++)
+ tuplesort_heap_insert(state, state->memtuples[i], 0);
+ state->memtupcount = 0;
+}
+
+/*
+ * dumptuples - remove tuples from heap and write to tape
+ *
+ * When alltuples = false, dump only enough tuples to get under the
+ * availMem limit (and leave at least one tuple in the heap in any case,
+ * since puttuple assumes it always has a tuple to compare to).
+ *
+ * When alltuples = true, dump everything currently in memory.
+ * (This case is only used at end of input data.)
+ *
+ * If we empty the heap, then start a new run using the tuples that
+ * have accumulated in memtuples[] (if any).
+ */
+static void
+dumptuples(Tuplesortstate *state, bool alltuples)
+{
+ while (alltuples ||
+ (LACKMEM(state) &&
+ (state->heaptupcount > 0 || state->memtupcount > 0)))
+ {
+ /*
+ * Dump the heap's frontmost entry, and sift up to remove it
+ * from the heap.
+ */
+ Assert(state->heaptupcount > 0);
+ WRITETUP(state, state->tp_tapenum[state->destTape],
+ state->heaptuples[0]);
+ tuplesort_heap_siftup(state);
+ /*
+ * If the heap is now empty, we've finished a run.
+ */
+ if (state->heaptupcount == 0)
+ {
+ markrunend(state, state->tp_tapenum[state->destTape]);
+ state->tp_runs[state->destTape]++;
+ state->tp_dummy[state->destTape]--; /* per Alg D step D2 */
+ if (state->memtupcount == 0)
+ break; /* all input data has been written to tape */
+ /* Select new output tape and start a new run */
+ selectnewtape(state);
+ beginrun(state);
+ }
+ }
+}
+
+/*
+ * tuplesort_rescan - rewind and replay the scan
+ */
+void
+tuplesort_rescan(Tuplesortstate *state)
+{
+ Assert(state->randomAccess);
+
+ switch (state->status)
+ {
+ case TSS_SORTEDINMEM:
+ state->current = 0;
+ state->eof_reached = false;
+ state->markpos_offset = 0;
+ state->markpos_eof = false;
+ break;
+ case TSS_SORTEDONTAPE:
+ LogicalTapeRewind(state->tapeset,
+ state->result_tape,
+ false);
+ state->eof_reached = false;
+ state->markpos_block = 0L;
+ state->markpos_offset = 0;
+ state->markpos_eof = false;
+ break;
+ default:
+ elog(ERROR, "tuplesort_rescan: invalid state");
+ break;
+ }
+}
+
+/*
+ * tuplesort_markpos - saves current position in the merged sort file
+ */
+void
+tuplesort_markpos(Tuplesortstate *state)
+{
+ Assert(state->randomAccess);
+
+ switch (state->status)
+ {
+ case TSS_SORTEDINMEM:
+ state->markpos_offset = state->current;
+ state->markpos_eof = state->eof_reached;
+ break;
+ case TSS_SORTEDONTAPE:
+ LogicalTapeTell(state->tapeset,
+ state->result_tape,
+ & state->markpos_block,
+ & state->markpos_offset);
+ state->markpos_eof = state->eof_reached;
+ break;
+ default:
+ elog(ERROR, "tuplesort_markpos: invalid state");
+ break;
+ }
+}
+
+/*
+ * tuplesort_restorepos - restores current position in merged sort file to
+ * last saved position
+ */
+void
+tuplesort_restorepos(Tuplesortstate *state)
+{
+ Assert(state->randomAccess);
+
+ switch (state->status)
+ {
+ case TSS_SORTEDINMEM:
+ state->current = state->markpos_offset;
+ state->eof_reached = state->markpos_eof;
+ break;
+ case TSS_SORTEDONTAPE:
+ if (! LogicalTapeSeek(state->tapeset,
+ state->result_tape,
+ state->markpos_block,
+ state->markpos_offset))
+ elog(ERROR, "tuplesort_restorepos failed");
+ state->eof_reached = state->markpos_eof;
+ break;
+ default:
+ elog(ERROR, "tuplesort_restorepos: invalid state");
+ break;
+ }
+}
+
+
+/*
+ * Heap manipulation routines, per Knuth's Algorithm 5.2.3H.
+ */
+
+/*
+ * Insert a new tuple into an empty or existing heap, maintaining the
+ * heap invariant. The heap lives in state->heaptuples[]. Also, if
+ * state->heapsrctapes is not NULL, we store each tuple's source tapenum
+ * in the corresponding element of state->heapsrctapes[].
+ */
+static void
+tuplesort_heap_insert(Tuplesortstate *state, void *tuple,
+ int tapenum)
+{
+ int j;
+
+ /*
+ * Make sure heaptuples[] can handle another entry.
+ * NOTE: we do not enlarge heapsrctapes[]; it's supposed
+ * to be big enough when created.
+ */
+ if (state->heaptupcount >= state->heaptupsize)
+ {
+		/* Grow the heap array as needed. */
+ state->heaptupsize *= 2;
+ state->heaptuples = (void **)
+ repalloc(state->heaptuples,
+ state->heaptupsize * sizeof(void *));
+ }
+ /*
+ * Sift-up the new entry, per Knuth 5.2.3 exercise 16.
+ * Note that Knuth is using 1-based array indexes, not 0-based.
+ */
+ j = state->heaptupcount++;
+ while (j > 0) {
+ int i = (j-1) >> 1;
+
+ if (COMPARETUP(state, tuple, state->heaptuples[i]) >= 0)
+ break;
+ state->heaptuples[j] = state->heaptuples[i];
+ if (state->heapsrctapes)
+ state->heapsrctapes[j] = state->heapsrctapes[i];
+ j = i;
+ }
+ state->heaptuples[j] = tuple;
+ if (state->heapsrctapes)
+ state->heapsrctapes[j] = tapenum;
+}
+
+/*
+ * The tuple at state->heaptuples[0] has been removed from the heap.
+ * Decrement heaptupcount, and sift up to maintain the heap invariant.
+ */
+static void
+tuplesort_heap_siftup(Tuplesortstate *state)
+{
+ void **heaptuples = state->heaptuples;
+ void *tuple;
+ int i,
+ n;
+
+ if (--state->heaptupcount <= 0)
+ return;
+ n = state->heaptupcount;
+ tuple = heaptuples[n]; /* tuple that must be reinserted */
+ i = 0; /* i is where the "hole" is */
+ for (;;) {
+ int j = 2*i + 1;
+
+ if (j >= n)
+ break;
+ if (j+1 < n &&
+ COMPARETUP(state, heaptuples[j], heaptuples[j+1]) > 0)
+ j++;
+ if (COMPARETUP(state, tuple, heaptuples[j]) <= 0)
+ break;
+ heaptuples[i] = heaptuples[j];
+ if (state->heapsrctapes)
+ state->heapsrctapes[i] = state->heapsrctapes[j];
+ i = j;
+ }
+ heaptuples[i] = tuple;
+ if (state->heapsrctapes)
+ state->heapsrctapes[i] = state->heapsrctapes[n];
+}
+
+
+/*
+ * Tape interface routines
+ */
+
+static unsigned int
+getlen(Tuplesortstate *state, int tapenum, bool eofOK)
+{
+ unsigned int len;
+
+ if (LogicalTapeRead(state->tapeset, tapenum, (void *) &len,
+ sizeof(len)) != sizeof(len))
+ elog(ERROR, "tuplesort: unexpected end of tape");
+ if (len == 0 && !eofOK)
+ elog(ERROR, "tuplesort: unexpected end of data");
+ return len;
+}
+
+static void
+markrunend(Tuplesortstate *state, int tapenum)
+{
+ unsigned int len = 0;
+
+ LogicalTapeWrite(state->tapeset, tapenum, (void *) &len, sizeof(len));
+}
+
+
+/*
+ * qsort interface
+ */
+
+static int
+qsort_comparetup(const void *a, const void *b)
+{
+ /* The passed pointers are pointers to void * ... */
+
+ return COMPARETUP(qsort_tuplesortstate, * (void **) a, * (void **) b);
+}
+
+
+/*
+ * Routines specialized for HeapTuple case
+ */
+
+static int
+comparetup_heap(Tuplesortstate *state, const void *a, const void *b)
+{
+ HeapTuple ltup = (HeapTuple) a;
+ HeapTuple rtup = (HeapTuple) b;
+ int nkey;
+
+ for (nkey = 0; nkey < state->nKeys; nkey++)
+ {
+ ScanKey scanKey = state->scanKeys + nkey;
+ Datum lattr,
+ rattr;
+ bool isnull1,
+ isnull2;
+ int result;
+
+ lattr = heap_getattr(ltup,
+ scanKey->sk_attno,
+ state->tupDesc,
+ &isnull1);
+ rattr = heap_getattr(rtup,
+ scanKey->sk_attno,
+ state->tupDesc,
+ &isnull2);
+ if (isnull1)
+ {
+ if (!isnull2)
+ return 1; /* NULL sorts after non-NULL */
+ }
+ else if (isnull2)
+ return -1;
+ else if (scanKey->sk_flags & SK_COMMUTE)
+ {
+ if (!(result = - (int) (*fmgr_faddr(&scanKey->sk_func)) (rattr, lattr)))
+ result = (int) (*fmgr_faddr(&scanKey->sk_func)) (lattr, rattr);
+ if (result)
+ return result;
+ }
+ else
+ {
+ if (!(result = - (int) (*fmgr_faddr(&scanKey->sk_func)) (lattr, rattr)))
+ result = (int) (*fmgr_faddr(&scanKey->sk_func)) (rattr, lattr);
+ if (result)
+ return result;
+ }
+ }
+
+ return 0;
+}
+
+static void *
+copytup_heap(Tuplesortstate *state, void *tup)
+{
+ HeapTuple tuple = (HeapTuple) tup;
+
+ USEMEM(state, HEAPTUPLESIZE + tuple->t_len);
+ return (void *) heap_copytuple(tuple);
+}
+
+/*
+ * We don't bother to write the HeapTupleData part of the tuple.
+ */
+
+static void
+writetup_heap(Tuplesortstate *state, int tapenum, void *tup)
+{
+ HeapTuple tuple = (HeapTuple) tup;
+ unsigned int tuplen;
+
+ tuplen = tuple->t_len + sizeof(tuplen);
+ LogicalTapeWrite(state->tapeset, tapenum,
+ (void*) &tuplen, sizeof(tuplen));
+ LogicalTapeWrite(state->tapeset, tapenum,
+ (void*) tuple->t_data, tuple->t_len);
+ if (state->randomAccess) /* need trailing length word? */
+ LogicalTapeWrite(state->tapeset, tapenum,
+ (void*) &tuplen, sizeof(tuplen));
+
+ FREEMEM(state, HEAPTUPLESIZE + tuple->t_len);
+ pfree(tuple);
+}
+
+static void *
+readtup_heap(Tuplesortstate *state, int tapenum, unsigned int len)
+{
+ unsigned int tuplen = len - sizeof(unsigned int) + HEAPTUPLESIZE;
+ HeapTuple tuple = (HeapTuple) palloc(tuplen);
+
+ USEMEM(state, tuplen);
+ /* reconstruct the HeapTupleData portion */
+ tuple->t_len = len - sizeof(unsigned int);
+ ItemPointerSetInvalid(&(tuple->t_self));
+ tuple->t_data = (HeapTupleHeader) (((char *) tuple) + HEAPTUPLESIZE);
+ /* read in the tuple proper */
+ if (LogicalTapeRead(state->tapeset, tapenum, (void *) tuple->t_data,
+ tuple->t_len) != tuple->t_len)
+ elog(ERROR, "tuplesort: unexpected end of data");
+ if (state->randomAccess) /* need trailing length word? */
+ if (LogicalTapeRead(state->tapeset, tapenum, (void *) &tuplen,
+ sizeof(tuplen)) != sizeof(tuplen))
+ elog(ERROR, "tuplesort: unexpected end of data");
+ return (void *) tuple;
+}
+
+
+/*
+ * Routines specialized for IndexTuple case
+ *
+ * NOTE: actually, these are specialized for the btree case; it's not
+ * clear whether you could use them for a non-btree index. Possibly
+ * you'd need to make another set of routines if you needed to sort
+ * according to another kind of index.
+ */
+
+static int
+comparetup_index(Tuplesortstate *state, const void *a, const void *b)
+{
+ IndexTuple ltup = (IndexTuple) a;
+ IndexTuple rtup = (IndexTuple) b;
+ TupleDesc itdesc = state->indexRel->rd_att;
+ bool equal_isnull = false;
+ Datum lattr,
+ rattr;
+ bool isnull1,
+ isnull2;
+ int i;
+
+ for (i = 0; i < itdesc->natts; i++)
+ {
+ lattr = index_getattr(ltup, i + 1, itdesc, &isnull1);
+ rattr = index_getattr(rtup, i + 1, itdesc, &isnull2);
+
+ if (isnull1)
+ {
+ if (!isnull2)
+ return 1; /* NULL sorts after non-NULL */
+ equal_isnull = true;
+ continue;
+ }
+ else if (isnull2)
+ return -1;
+
+ if (_bt_invokestrat(state->indexRel, i + 1,
+ BTGreaterStrategyNumber,
+ lattr, rattr))
+ return 1;
+ if (_bt_invokestrat(state->indexRel, i + 1,
+ BTGreaterStrategyNumber,
+ rattr, lattr))
+ return -1;
+ }
+
+ /*
+ * If btree has asked us to enforce uniqueness, complain if two equal
+ * tuples are detected (unless there was at least one NULL field).
+ *
+ * It is sufficient to make the test here, because if two tuples are
+ * equal they *must* get compared at some stage of the sort --- otherwise
+ * the sort algorithm wouldn't have checked whether one must appear
+ * before the other.
+ */
+ if (state->enforceUnique && !equal_isnull)
+ elog(ERROR, "Cannot create unique index. Table contains non-unique values");
+
+ return 0;
+}
+
+static void *
+copytup_index(Tuplesortstate *state, void *tup)
+{
+ IndexTuple tuple = (IndexTuple) tup;
+ unsigned int tuplen = IndexTupleSize(tuple);
+ IndexTuple newtuple;
+
+ USEMEM(state, tuplen);
+ newtuple = (IndexTuple) palloc(tuplen);
+ memcpy(newtuple, tuple, tuplen);
+
+ return (void *) newtuple;
+}
+
+static void
+writetup_index(Tuplesortstate *state, int tapenum, void *tup)
+{
+ IndexTuple tuple = (IndexTuple) tup;
+ unsigned int tuplen;
+
+ tuplen = IndexTupleSize(tuple) + sizeof(tuplen);
+ LogicalTapeWrite(state->tapeset, tapenum,
+ (void*) &tuplen, sizeof(tuplen));
+ LogicalTapeWrite(state->tapeset, tapenum,
+ (void*) tuple, IndexTupleSize(tuple));
+ if (state->randomAccess) /* need trailing length word? */
+ LogicalTapeWrite(state->tapeset, tapenum,
+ (void*) &tuplen, sizeof(tuplen));
+
+ FREEMEM(state, IndexTupleSize(tuple));
+ pfree(tuple);
+}
+
+static void *
+readtup_index(Tuplesortstate *state, int tapenum, unsigned int len)
+{
+ unsigned int tuplen = len - sizeof(unsigned int);
+ IndexTuple tuple = (IndexTuple) palloc(tuplen);
+
+ USEMEM(state, tuplen);
+ if (LogicalTapeRead(state->tapeset, tapenum, (void *) tuple,
+ tuplen) != tuplen)
+ elog(ERROR, "tuplesort: unexpected end of data");
+ if (state->randomAccess) /* need trailing length word? */
+ if (LogicalTapeRead(state->tapeset, tapenum, (void *) &tuplen,
+ sizeof(tuplen)) != sizeof(tuplen))
+ elog(ERROR, "tuplesort: unexpected end of data");
+ return (void *) tuple;
+}
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 7c57a9a4f99..613595febf4 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -6,7 +6,7 @@
*
* Copyright (c) 1994, Regents of the University of California
*
- * $Id: nbtree.h,v 1.31 1999/08/08 20:12:49 tgl Exp $
+ * $Id: nbtree.h,v 1.32 1999/10/17 22:15:03 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -288,9 +288,12 @@ extern BTItem _bt_formitem(IndexTuple itup);
/*
* prototypes for functions in nbtsort.c
*/
-extern void *_bt_spoolinit(Relation index, int ntapes, bool isunique);
-extern void _bt_spooldestroy(void *spool);
-extern void _bt_spool(Relation index, BTItem btitem, void *spool);
-extern void _bt_leafbuild(Relation index, void *spool);
+
+typedef struct BTSpool BTSpool; /* opaque type known only within nbtsort.c */
+
+extern BTSpool *_bt_spoolinit(Relation index, bool isunique);
+extern void _bt_spooldestroy(BTSpool *btspool);
+extern void _bt_spool(BTItem btitem, BTSpool *btspool);
+extern void _bt_leafbuild(BTSpool *btspool);
#endif /* NBTREE_H */
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 092fa57acb1..44aa8b8ace5 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -6,7 +6,7 @@
*
* Copyright (c) 1994, Regents of the University of California
*
- * $Id: execnodes.h,v 1.36 1999/09/26 21:21:04 tgl Exp $
+ * $Id: execnodes.h,v 1.37 1999/10/17 22:15:07 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -597,17 +597,9 @@ typedef struct GroupState
/* ----------------
* SortState information
*
- *| sort nodes are really just a kind of a scan since
- *| we implement sorts by retrieving the entire subplan
- *| into a temp relation, sorting the temp relation into
- *| another sorted relation, and then preforming a simple
- *| unqualified sequential scan on the sorted relation..
- *| -cim 10/15/89
- *
- * Flag indicated whether relation has been sorted
- * Keys scan key structures used to keep info on sort keys
- * TempRelation temporary relation containing result of executing
- * the subplan.
+ * sort_Done indicates whether sort has been performed yet
+ * sort_Keys scan key structures describing the sort keys
+ * tuplesortstate private state of tuplesort.c
*
* CommonScanState information
*
@@ -628,9 +620,9 @@ typedef struct GroupState
typedef struct SortState
{
CommonScanState csstate; /* its first field is NodeTag */
- bool sort_Flag;
+ bool sort_Done;
ScanKey sort_Keys;
- bool cleaned;
+ void *tuplesortstate;
} SortState;
/* ----------------
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index 095ee074d38..a03dacfb02b 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -6,7 +6,7 @@
*
* Copyright (c) 1994, Regents of the University of California
*
- * $Id: plannodes.h,v 1.30 1999/08/21 03:49:09 tgl Exp $
+ * $Id: plannodes.h,v 1.31 1999/10/17 22:15:07 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -288,8 +288,6 @@ typedef struct Sort
Oid nonameid;
int keycount;
SortState *sortstate;
- void *psortstate;
- bool cleaned;
} Sort;
/* ----------------
diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h
new file mode 100644
index 00000000000..7c5a3209897
--- /dev/null
+++ b/src/include/utils/tuplesort.h
@@ -0,0 +1,68 @@
+/*-------------------------------------------------------------------------
+ *
+ * tuplesort.h
+ * Generalized tuple sorting routines.
+ *
+ * This module handles sorting of either heap tuples or index tuples
+ * (and could fairly easily support other kinds of sortable objects,
+ * if necessary). It works efficiently for both small and large amounts
+ * of data. Small amounts are sorted in-memory using qsort(). Large
+ * amounts are sorted using temporary files and a standard external sort
+ * algorithm.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ * $Id: tuplesort.h,v 1.1 1999/10/17 22:15:09 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef TUPLESORT_H
+#define TUPLESORT_H
+
+#include "access/htup.h"
+#include "access/itup.h"
+#include "access/skey.h"
+#include "access/tupdesc.h"
+#include "utils/rel.h"
+
+/* Tuplesortstate is an opaque type whose details are not known outside tuplesort.c. */
+
+typedef struct Tuplesortstate Tuplesortstate;
+
+/*
+ * We provide two different interfaces to what is essentially the same
+ * code: one for sorting HeapTuples and one for sorting IndexTuples.
+ * They differ primarily in the way that the sort key information is
+ * supplied.
+ */
+
+extern Tuplesortstate *tuplesort_begin_heap(TupleDesc tupDesc,
+ int nkeys, ScanKey keys,
+ bool randomAccess);
+extern Tuplesortstate *tuplesort_begin_index(Relation indexRel,
+ bool enforceUnique,
+ bool randomAccess);
+
+extern void tuplesort_puttuple(Tuplesortstate *state, void *tuple);
+
+extern void tuplesort_performsort(Tuplesortstate *state);
+
+extern void *tuplesort_gettuple(Tuplesortstate *state, bool forward,
+ bool *should_free);
+#define tuplesort_getheaptuple(state, forward, should_free) \
+ ((HeapTuple) tuplesort_gettuple(state, forward, should_free))
+#define tuplesort_getindextuple(state, forward, should_free) \
+ ((IndexTuple) tuplesort_gettuple(state, forward, should_free))
+
+extern void tuplesort_end(Tuplesortstate *state);
+
+/*
+ * These routines may only be called if randomAccess was specified 'true'.
+ * Backwards scan in gettuple is likewise only allowed if randomAccess.
+ */
+
+extern void tuplesort_rescan(Tuplesortstate *state);
+extern void tuplesort_markpos(Tuplesortstate *state);
+extern void tuplesort_restorepos(Tuplesortstate *state);
+
+#endif /* TUPLESORT_H */