Diffstat (limited to 'src')
-rw-r--r--  src/backend/commands/explain.c            |  52
-rw-r--r--  src/backend/executor/nodeSetOp.c          | 602
-rw-r--r--  src/backend/nodes/copyfuncs.c             |   4
-rw-r--r--  src/backend/nodes/outfuncs.c              |   4
-rw-r--r--  src/backend/optimizer/plan/createplan.c   |  20
-rw-r--r--  src/backend/optimizer/prep/prepunion.c    |  41
-rw-r--r--  src/include/nodes/execnodes.h             |  28
-rw-r--r--  src/include/nodes/plannodes.h             |  10
-rw-r--r--  src/include/optimizer/planmain.h          |   7
-rw-r--r--  src/test/regress/expected/union.out       |  14
-rw-r--r--  src/test/regress/sql/union.sql            |  14
11 files changed, 593 insertions(+), 203 deletions(-)
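
For orientation, a hedged sketch of what this patch changes (t1 and t2 are hypothetical tables, not part of the patch): when the planner chooses the new hashed strategy, EXPLAIN labels the node "HashSetOp Intersect" instead of the sorted "SetOp Intersect", per the explain.c hunk below.

-- Hypothetical tables; with this patch the planner may skip the Sort and
-- use the hashed strategy, reported by EXPLAIN as "HashSetOp Intersect".
EXPLAIN
SELECT a, b FROM t1
INTERSECT
SELECT a, b FROM t2;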
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 0892cdbe3eb..1fd6f0cb331 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1994-5, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/commands/explain.c,v 1.175 2008/05/14 19:10:29 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/commands/explain.c,v 1.176 2008/08/07 03:04:03 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -558,19 +558,47 @@ explain_outNode(StringInfo str,
pname = "Unique";
break;
case T_SetOp:
- switch (((SetOp *) plan)->cmd)
+ switch (((SetOp *) plan)->strategy)
{
- case SETOPCMD_INTERSECT:
- pname = "SetOp Intersect";
+ case SETOP_SORTED:
+ switch (((SetOp *) plan)->cmd)
+ {
+ case SETOPCMD_INTERSECT:
+ pname = "SetOp Intersect";
+ break;
+ case SETOPCMD_INTERSECT_ALL:
+ pname = "SetOp Intersect All";
+ break;
+ case SETOPCMD_EXCEPT:
+ pname = "SetOp Except";
+ break;
+ case SETOPCMD_EXCEPT_ALL:
+ pname = "SetOp Except All";
+ break;
+ default:
+ pname = "SetOp ???";
+ break;
+ }
break;
- case SETOPCMD_INTERSECT_ALL:
- pname = "SetOp Intersect All";
- break;
- case SETOPCMD_EXCEPT:
- pname = "SetOp Except";
- break;
- case SETOPCMD_EXCEPT_ALL:
- pname = "SetOp Except All";
+ case SETOP_HASHED:
+ switch (((SetOp *) plan)->cmd)
+ {
+ case SETOPCMD_INTERSECT:
+ pname = "HashSetOp Intersect";
+ break;
+ case SETOPCMD_INTERSECT_ALL:
+ pname = "HashSetOp Intersect All";
+ break;
+ case SETOPCMD_EXCEPT:
+ pname = "HashSetOp Except";
+ break;
+ case SETOPCMD_EXCEPT_ALL:
+ pname = "HashSetOp Except All";
+ break;
+ default:
+ pname = "HashSetOp ???";
+ break;
+ }
break;
default:
pname = "SetOp ???";
diff --git a/src/backend/executor/nodeSetOp.c b/src/backend/executor/nodeSetOp.c
index c60ceff1259..2421b0a6df4 100644
--- a/src/backend/executor/nodeSetOp.c
+++ b/src/backend/executor/nodeSetOp.c
@@ -4,33 +4,38 @@
* Routines to handle INTERSECT and EXCEPT selection
*
* The input of a SetOp node consists of tuples from two relations,
- * which have been combined into one dataset and sorted on all the nonjunk
- * attributes. In addition there is a junk attribute that shows which
- * relation each tuple came from. The SetOp node scans each group of
- * identical tuples to determine how many came from each input relation.
- * Then it is a simple matter to emit the output demanded by the SQL spec
- * for INTERSECT, INTERSECT ALL, EXCEPT, or EXCEPT ALL.
+ * which have been combined into one dataset, with a junk attribute added
+ * that shows which relation each tuple came from. In SETOP_SORTED mode,
+ * the input has furthermore been sorted according to all the grouping
+ * columns (ie, all the non-junk attributes). The SetOp node scans each
+ * group of identical tuples to determine how many came from each input
+ * relation. Then it is a simple matter to emit the output demanded by the
+ * SQL spec for INTERSECT, INTERSECT ALL, EXCEPT, or EXCEPT ALL.
+ *
+ * In SETOP_HASHED mode, the input is delivered in no particular order.
+ * We build a hash table in memory with one entry for each group of
+ * identical tuples, and count the number of tuples in the group from
+ * each relation. After seeing all the input, we scan the hashtable and
+ * generate the correct output using those counts.
*
* This node type is not used for UNION or UNION ALL, since those can be
* implemented more cheaply (there's no need for the junk attribute to
* identify the source relation).
*
+ * Note that SetOp does no qual checking nor projection. The delivered
+ * output tuples are just copies of the first-to-arrive tuple in each
+ * input group.
+ *
*
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/executor/nodeSetOp.c,v 1.25 2008/01/01 19:45:49 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/executor/nodeSetOp.c,v 1.26 2008/08/07 03:04:03 tgl Exp $
*
*-------------------------------------------------------------------------
*/
-/*
- * INTERFACE ROUTINES
- * ExecSetOp - filter input to generate INTERSECT/EXCEPT results
- * ExecInitSetOp - initialize node and subnodes..
- * ExecEndSetOp - shutdown node and subnodes
- */
#include "postgres.h"
@@ -39,6 +44,160 @@
#include "utils/memutils.h"
+/*
+ * SetOpStatePerGroupData - per-group working state
+ *
+ * These values are working state that is initialized at the start of
+ * an input tuple group and updated for each input tuple.
+ *
+ * In SETOP_SORTED mode, we need only one of these structs, and it's kept in
+ * the plan state node. In SETOP_HASHED mode, the hash table contains one
+ * of these for each tuple group.
+ */
+typedef struct SetOpStatePerGroupData
+{
+ long numLeft; /* number of left-input dups in group */
+ long numRight; /* number of right-input dups in group */
+} SetOpStatePerGroupData;
+
+/*
+ * To implement hashed mode, we need a hashtable that stores a
+ * representative tuple and the duplicate counts for each distinct set
+ * of grouping columns. We compute the hash key from the grouping columns.
+ */
+typedef struct SetOpHashEntryData *SetOpHashEntry;
+
+typedef struct SetOpHashEntryData
+{
+ TupleHashEntryData shared; /* common header for hash table entries */
+ SetOpStatePerGroupData pergroup;
+} SetOpHashEntryData;
+
+
+static TupleTableSlot *setop_retrieve_direct(SetOpState *setopstate);
+static void setop_fill_hash_table(SetOpState *setopstate);
+static TupleTableSlot *setop_retrieve_hash_table(SetOpState *setopstate);
+
+
+/*
+ * Initialize state for a new group of input values.
+ */
+static void
+initialize_counts(SetOpState *setopstate, SetOpStatePerGroup pergroup)
+{
+ pergroup->numLeft = pergroup->numRight = 0;
+}
+
+/*
+ * Advance the appropriate counter for one input tuple.
+ */
+static void
+advance_counts(SetOpState *setopstate, SetOpStatePerGroup pergroup,
+ TupleTableSlot *inputslot)
+{
+ SetOp *node = (SetOp *) setopstate->ps.plan;
+ int flag;
+ bool isNull;
+
+ flag = DatumGetInt32(slot_getattr(inputslot,
+ node->flagColIdx,
+ &isNull));
+ Assert(!isNull);
+ if (flag)
+ pergroup->numRight++;
+ else
+ pergroup->numLeft++;
+}
+
+/*
+ * Initialize the hash table to empty.
+ */
+static void
+build_hash_table(SetOpState *setopstate)
+{
+ SetOp *node = (SetOp *) setopstate->ps.plan;
+
+ Assert(node->strategy == SETOP_HASHED);
+ Assert(node->numGroups > 0);
+
+ setopstate->hashtable = BuildTupleHashTable(node->numCols,
+ node->dupColIdx,
+ setopstate->eqfunctions,
+ setopstate->hashfunctions,
+ node->numGroups,
+ sizeof(SetOpHashEntryData),
+ setopstate->tableContext,
+ setopstate->tempContext);
+}
+
+/*
+ * Find or create a hashtable entry for the tuple group containing the
+ * given tuple.
+ */
+static SetOpHashEntry
+lookup_hash_entry(SetOpState *setopstate, TupleTableSlot *inputslot)
+{
+ SetOpHashEntry entry;
+ bool isnew;
+
+ /* find or create the hashtable entry */
+ entry = (SetOpHashEntry) LookupTupleHashEntry(setopstate->hashtable,
+ inputslot,
+ &isnew);
+
+ if (isnew)
+ {
+ /* initialize counts for new tuple group */
+ initialize_counts(setopstate, &entry->pergroup);
+ }
+
+ /* Must reset temp context after each hashtable lookup */
+ MemoryContextReset(setopstate->tempContext);
+
+ return entry;
+}
+
+/*
+ * We've completed processing a tuple group. Decide how many copies (if any)
+ * of its representative row to emit, and store the count into numOutput.
+ * This logic is straight from the SQL92 specification.
+ */
+static void
+set_output_count(SetOpState *setopstate, SetOpStatePerGroup pergroup)
+{
+ SetOp *plannode = (SetOp *) setopstate->ps.plan;
+
+ switch (plannode->cmd)
+ {
+ case SETOPCMD_INTERSECT:
+ if (pergroup->numLeft > 0 && pergroup->numRight > 0)
+ setopstate->numOutput = 1;
+ else
+ setopstate->numOutput = 0;
+ break;
+ case SETOPCMD_INTERSECT_ALL:
+ setopstate->numOutput =
+ (pergroup->numLeft < pergroup->numRight) ?
+ pergroup->numLeft : pergroup->numRight;
+ break;
+ case SETOPCMD_EXCEPT:
+ if (pergroup->numLeft > 0 && pergroup->numRight == 0)
+ setopstate->numOutput = 1;
+ else
+ setopstate->numOutput = 0;
+ break;
+ case SETOPCMD_EXCEPT_ALL:
+ setopstate->numOutput =
+ (pergroup->numLeft < pergroup->numRight) ?
+ 0 : (pergroup->numLeft - pergroup->numRight);
+ break;
+ default:
+ elog(ERROR, "unrecognized set op: %d", (int) plannode->cmd);
+ break;
+ }
+}
+
+
/* ----------------------------------------------------------------
* ExecSetOp
* ----------------------------------------------------------------
@@ -47,14 +206,7 @@ TupleTableSlot * /* return: a tuple or NULL */
ExecSetOp(SetOpState *node)
{
SetOp *plannode = (SetOp *) node->ps.plan;
- TupleTableSlot *resultTupleSlot;
- PlanState *outerPlan;
-
- /*
- * get information from the node
- */
- outerPlan = outerPlanState(node);
- resultTupleSlot = node->ps.ps_ResultTupleSlot;
+ TupleTableSlot *resultTupleSlot = node->ps.ps_ResultTupleSlot;
/*
* If the previously-returned tuple needs to be returned more than once,
@@ -66,142 +218,218 @@ ExecSetOp(SetOpState *node)
return resultTupleSlot;
}
- /* Flag that we have no current tuple */
- ExecClearTuple(resultTupleSlot);
+ /* Otherwise, we're done if we are out of groups */
+ if (node->setop_done)
+ return NULL;
+
+ /* Fetch the next tuple group according to the correct strategy */
+ if (plannode->strategy == SETOP_HASHED)
+ {
+ if (!node->table_filled)
+ setop_fill_hash_table(node);
+ return setop_retrieve_hash_table(node);
+ }
+ else
+ return setop_retrieve_direct(node);
+}
+
+/*
+ * ExecSetOp for non-hashed case
+ */
+static TupleTableSlot *
+setop_retrieve_direct(SetOpState *setopstate)
+{
+ SetOp *node = (SetOp *) setopstate->ps.plan;
+ PlanState *outerPlan;
+ SetOpStatePerGroup pergroup;
+ TupleTableSlot *outerslot;
+ TupleTableSlot *resultTupleSlot;
/*
- * Absorb groups of duplicate tuples, counting them, and saving the first
- * of each group as a possible return value. At the end of each group,
- * decide whether to return anything.
- *
- * We assume that the tuples arrive in sorted order so we can detect
- * duplicates easily.
+ * get state info from node
*/
- for (;;)
- {
- TupleTableSlot *inputTupleSlot;
- bool endOfGroup;
+ outerPlan = outerPlanState(setopstate);
+ pergroup = setopstate->pergroup;
+ resultTupleSlot = setopstate->ps.ps_ResultTupleSlot;
+ /*
+ * We loop retrieving groups until we find one we should return
+ */
+ while (!setopstate->setop_done)
+ {
/*
- * fetch a tuple from the outer subplan, unless we already did.
+ * If we don't already have the first tuple of the new group, fetch it
+ * from the outer plan.
*/
- if (node->ps.ps_OuterTupleSlot == NULL &&
- !node->subplan_done)
- {
- node->ps.ps_OuterTupleSlot =
- ExecProcNode(outerPlan);
- if (TupIsNull(node->ps.ps_OuterTupleSlot))
- node->subplan_done = true;
- }
- inputTupleSlot = node->ps.ps_OuterTupleSlot;
-
- if (TupIsNull(resultTupleSlot))
+ if (setopstate->grp_firstTuple == NULL)
{
- /*
- * First of group: save a copy in result slot, and reset
- * duplicate-counters for new group.
- */
- if (node->subplan_done)
- return NULL; /* no more tuples */
- ExecCopySlot(resultTupleSlot, inputTupleSlot);
- node->numLeft = 0;
- node->numRight = 0;
- endOfGroup = false;
- }
- else if (node->subplan_done)
- {
- /*
- * Reached end of input, so finish processing final group
- */
- endOfGroup = true;
- }
- else
- {
- /*
- * Else test if the new tuple and the previously saved tuple
- * match.
- */
- if (execTuplesMatch(inputTupleSlot,
- resultTupleSlot,
- plannode->numCols, plannode->dupColIdx,
- node->eqfunctions,
- node->tempContext))
- endOfGroup = false;
+ outerslot = ExecProcNode(outerPlan);
+ if (!TupIsNull(outerslot))
+ {
+ /* Make a copy of the first input tuple */
+ setopstate->grp_firstTuple = ExecCopySlotTuple(outerslot);
+ }
else
- endOfGroup = true;
+ {
+ /* outer plan produced no tuples at all */
+ setopstate->setop_done = true;
+ return NULL;
+ }
}
- if (endOfGroup)
+ /*
+ * Store the copied first input tuple in the tuple table slot
+ * reserved for it. The tuple will be deleted when it is cleared
+ * from the slot.
+ */
+ ExecStoreTuple(setopstate->grp_firstTuple,
+ resultTupleSlot,
+ InvalidBuffer,
+ true);
+ setopstate->grp_firstTuple = NULL; /* don't keep two pointers */
+
+ /* Initialize working state for a new input tuple group */
+ initialize_counts(setopstate, pergroup);
+
+ /* Count the first input tuple */
+ advance_counts(setopstate, pergroup, resultTupleSlot);
+
+ /*
+ * Scan the outer plan until we exhaust it or cross a group boundary.
+ */
+ for (;;)
{
+ outerslot = ExecProcNode(outerPlan);
+ if (TupIsNull(outerslot))
+ {
+ /* no more outer-plan tuples available */
+ setopstate->setop_done = true;
+ break;
+ }
+
/*
- * We've reached the end of the group containing resultTuple.
- * Decide how many copies (if any) to emit. This logic is
- * straight from the SQL92 specification.
+ * Check whether we've crossed a group boundary.
*/
- switch (plannode->cmd)
+ if (!execTuplesMatch(resultTupleSlot,
+ outerslot,
+ node->numCols, node->dupColIdx,
+ setopstate->eqfunctions,
+ setopstate->tempContext))
{
- case SETOPCMD_INTERSECT:
- if (node->numLeft > 0 && node->numRight > 0)
- node->numOutput = 1;
- else
- node->numOutput = 0;
- break;
- case SETOPCMD_INTERSECT_ALL:
- node->numOutput =
- (node->numLeft < node->numRight) ?
- node->numLeft : node->numRight;
- break;
- case SETOPCMD_EXCEPT:
- if (node->numLeft > 0 && node->numRight == 0)
- node->numOutput = 1;
- else
- node->numOutput = 0;
- break;
- case SETOPCMD_EXCEPT_ALL:
- node->numOutput =
- (node->numLeft < node->numRight) ?
- 0 : (node->numLeft - node->numRight);
- break;
- default:
- elog(ERROR, "unrecognized set op: %d",
- (int) plannode->cmd);
- break;
- }
- /* Fall out of for-loop if we have tuples to emit */
- if (node->numOutput > 0)
+ /*
+ * Save the first input tuple of the next group.
+ */
+ setopstate->grp_firstTuple = ExecCopySlotTuple(outerslot);
break;
- /* Else flag that we have no current tuple, and loop around */
- ExecClearTuple(resultTupleSlot);
+ }
+
+ /* Still in same group, so count this tuple */
+ advance_counts(setopstate, pergroup, outerslot);
}
- else
+
+ /*
+ * Done scanning input tuple group. See if we should emit any
+ * copies of result tuple, and if so return the first copy.
+ */
+ set_output_count(setopstate, pergroup);
+
+ if (setopstate->numOutput > 0)
{
- /*
- * Current tuple is member of same group as resultTuple. Count it
- * in the appropriate counter.
- */
- int flag;
- bool isNull;
-
- flag = DatumGetInt32(slot_getattr(inputTupleSlot,
- plannode->flagColIdx,
- &isNull));
- Assert(!isNull);
- if (flag)
- node->numRight++;
- else
- node->numLeft++;
- /* Set flag to fetch a new input tuple, and loop around */
- node->ps.ps_OuterTupleSlot = NULL;
+ setopstate->numOutput--;
+ return resultTupleSlot;
}
}
+ /* No more groups */
+ ExecClearTuple(resultTupleSlot);
+ return NULL;
+}
+
+/*
+ * ExecSetOp for hashed case: phase 1, read input and build hash table
+ */
+static void
+setop_fill_hash_table(SetOpState *setopstate)
+{
+ PlanState *outerPlan;
+ SetOpHashEntry entry;
+ TupleTableSlot *outerslot;
+
+ /*
+ * get state info from node
+ */
+ outerPlan = outerPlanState(setopstate);
+
/*
- * If we fall out of loop, then we need to emit at least one copy of
- * resultTuple.
+ * Process each outer-plan tuple, and then fetch the next one, until we
+ * exhaust the outer plan.
*/
- Assert(node->numOutput > 0);
- node->numOutput--;
- return resultTupleSlot;
+ for (;;)
+ {
+ outerslot = ExecProcNode(outerPlan);
+ if (TupIsNull(outerslot))
+ break;
+
+ /* Find or build hashtable entry for this tuple's group */
+ entry = lookup_hash_entry(setopstate, outerslot);
+
+ /* Advance the counts */
+ advance_counts(setopstate, &entry->pergroup, outerslot);
+ }
+
+ setopstate->table_filled = true;
+ /* Initialize to walk the hash table */
+ ResetTupleHashIterator(setopstate->hashtable, &setopstate->hashiter);
+}
+
+/*
+ * ExecSetOp for hashed case: phase 2, retrieving groups from hash table
+ */
+static TupleTableSlot *
+setop_retrieve_hash_table(SetOpState *setopstate)
+{
+ SetOpHashEntry entry;
+ TupleTableSlot *resultTupleSlot;
+
+ /*
+ * get state info from node
+ */
+ resultTupleSlot = setopstate->ps.ps_ResultTupleSlot;
+
+ /*
+ * We loop retrieving groups until we find one we should return
+ */
+ while (!setopstate->setop_done)
+ {
+ /*
+ * Find the next entry in the hash table
+ */
+ entry = (SetOpHashEntry) ScanTupleHashTable(&setopstate->hashiter);
+ if (entry == NULL)
+ {
+ /* No more entries in hashtable, so done */
+ setopstate->setop_done = true;
+ return NULL;
+ }
+
+ /*
+ * See if we should emit any copies of this tuple, and if so return
+ * the first copy.
+ */
+ set_output_count(setopstate, &entry->pergroup);
+
+ if (setopstate->numOutput > 0)
+ {
+ setopstate->numOutput--;
+ return ExecStoreMinimalTuple(entry->shared.firstTuple,
+ resultTupleSlot,
+ false);
+ }
+ }
+
+ /* No more groups */
+ ExecClearTuple(resultTupleSlot);
+ return NULL;
}
/* ----------------------------------------------------------------
@@ -226,9 +454,14 @@ ExecInitSetOp(SetOp *node, EState *estate, int eflags)
setopstate->ps.plan = (Plan *) node;
setopstate->ps.state = estate;
- setopstate->ps.ps_OuterTupleSlot = NULL;
- setopstate->subplan_done = false;
+ setopstate->eqfunctions = NULL;
+ setopstate->hashfunctions = NULL;
+ setopstate->setop_done = false;
setopstate->numOutput = 0;
+ setopstate->pergroup = NULL;
+ setopstate->grp_firstTuple = NULL;
+ setopstate->hashtable = NULL;
+ setopstate->tableContext = NULL;
/*
* Miscellaneous initialization
@@ -244,6 +477,19 @@ ExecInitSetOp(SetOp *node, EState *estate, int eflags)
ALLOCSET_DEFAULT_INITSIZE,
ALLOCSET_DEFAULT_MAXSIZE);
+ /*
+ * If hashing, we also need a longer-lived context to store the hash
+ * table. The table can't just be kept in the per-query context because
+ * we want to be able to throw it away in ExecReScanSetOp.
+ */
+ if (node->strategy == SETOP_HASHED)
+ setopstate->tableContext =
+ AllocSetContextCreate(CurrentMemoryContext,
+ "SetOp hash table",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+
#define SETOP_NSLOTS 1
/*
@@ -252,8 +498,13 @@ ExecInitSetOp(SetOp *node, EState *estate, int eflags)
ExecInitResultTupleSlot(estate, &setopstate->ps);
/*
- * then initialize outer plan
+ * initialize child nodes
+ *
+ * If we are hashing then the child plan does not need
+ * to handle REWIND efficiently; see ExecReScanSetOp.
*/
+ if (node->strategy == SETOP_HASHED)
+ eflags &= ~EXEC_FLAG_REWIND;
outerPlanState(setopstate) = ExecInitNode(outerPlan(node), estate, eflags);
/*
@@ -264,11 +515,30 @@ ExecInitSetOp(SetOp *node, EState *estate, int eflags)
setopstate->ps.ps_ProjInfo = NULL;
/*
- * Precompute fmgr lookup data for inner loop
+ * Precompute fmgr lookup data for inner loop. We need both equality and
+ * hashing functions to do it by hashing, but only equality if not
+ * hashing.
*/
- setopstate->eqfunctions =
- execTuplesMatchPrepare(node->numCols,
- node->dupOperators);
+ if (node->strategy == SETOP_HASHED)
+ execTuplesHashPrepare(node->numCols,
+ node->dupOperators,
+ &setopstate->eqfunctions,
+ &setopstate->hashfunctions);
+ else
+ setopstate->eqfunctions =
+ execTuplesMatchPrepare(node->numCols,
+ node->dupOperators);
+
+ if (node->strategy == SETOP_HASHED)
+ {
+ build_hash_table(setopstate);
+ setopstate->table_filled = false;
+ }
+ else
+ {
+ setopstate->pergroup =
+ (SetOpStatePerGroup) palloc0(sizeof(SetOpStatePerGroupData));
+ }
return setopstate;
}
@@ -293,9 +563,11 @@ ExecEndSetOp(SetOpState *node)
{
/* clean up tuple table */
ExecClearTuple(node->ps.ps_ResultTupleSlot);
- node->ps.ps_OuterTupleSlot = NULL;
+ /* free subsidiary stuff including hashtable */
MemoryContextDelete(node->tempContext);
+ if (node->tableContext)
+ MemoryContextDelete(node->tableContext);
ExecEndNode(outerPlanState(node));
}
@@ -305,10 +577,50 @@ void
ExecReScanSetOp(SetOpState *node, ExprContext *exprCtxt)
{
ExecClearTuple(node->ps.ps_ResultTupleSlot);
- node->ps.ps_OuterTupleSlot = NULL;
- node->subplan_done = false;
+ node->setop_done = false;
node->numOutput = 0;
+ if (((SetOp *) node->ps.plan)->strategy == SETOP_HASHED)
+ {
+ /*
+ * In the hashed case, if we haven't yet built the hash table then we
+ * can just return; nothing done yet, so nothing to undo. If subnode's
+ * chgParam is not NULL then it will be re-scanned by ExecProcNode,
+ * else no reason to re-scan it at all.
+ */
+ if (!node->table_filled)
+ return;
+
+ /*
+ * If we do have the hash table and the subplan does not have any
+ * parameter changes, then we can just rescan the existing hash table;
+ * no need to build it again.
+ */
+ if (((PlanState *) node)->lefttree->chgParam == NULL)
+ {
+ ResetTupleHashIterator(node->hashtable, &node->hashiter);
+ return;
+ }
+ }
+
+ /* Release first tuple of group, if we have made a copy */
+ if (node->grp_firstTuple != NULL)
+ {
+ heap_freetuple(node->grp_firstTuple);
+ node->grp_firstTuple = NULL;
+ }
+
+ /* Release any hashtable storage */
+ if (node->tableContext)
+ MemoryContextResetAndDeleteChildren(node->tableContext);
+
+ /* And rebuild empty hashtable if needed */
+ if (((SetOp *) node->ps.plan)->strategy == SETOP_HASHED)
+ {
+ build_hash_table(node);
+ node->table_filled = false;
+ }
+
/*
* if chgParam of subnode is not null then plan will be re-scanned by
* first ExecProcNode.
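
A quick, hedged illustration of the SQL-spec counting rules that set_output_count() implements (made-up VALUES lists, not from the patch): INTERSECT ALL emits min(numLeft, numRight) copies of each group, and EXCEPT ALL emits numLeft - numRight copies when that difference is positive.

-- The left input has three 1's and the right input one, so EXCEPT ALL keeps
-- 3 - 1 = 2 copies of 1; the single 2 survives and 3 is never emitted.
SELECT x FROM (VALUES (1), (1), (1), (2)) AS l(x)
EXCEPT ALL
SELECT x FROM (VALUES (1), (3)) AS r(x);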
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index e562949d934..4767b924e9e 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -15,7 +15,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/nodes/copyfuncs.c,v 1.397 2008/08/07 01:11:46 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/nodes/copyfuncs.c,v 1.398 2008/08/07 03:04:03 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -652,10 +652,12 @@ _copySetOp(SetOp *from)
* copy remainder of node
*/
COPY_SCALAR_FIELD(cmd);
+ COPY_SCALAR_FIELD(strategy);
COPY_SCALAR_FIELD(numCols);
COPY_POINTER_FIELD(dupColIdx, from->numCols * sizeof(AttrNumber));
COPY_POINTER_FIELD(dupOperators, from->numCols * sizeof(Oid));
COPY_SCALAR_FIELD(flagColIdx);
+ COPY_SCALAR_FIELD(numGroups);
return newnode;
}
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index 26f8fb3810a..8440864b0e6 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.331 2008/08/07 01:11:48 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.332 2008/08/07 03:04:03 tgl Exp $
*
* NOTES
* Every node type that can appear in stored rules' parsetrees *must*
@@ -599,6 +599,7 @@ _outSetOp(StringInfo str, SetOp *node)
_outPlanInfo(str, (Plan *) node);
WRITE_ENUM_FIELD(cmd, SetOpCmd);
+ WRITE_ENUM_FIELD(strategy, SetOpStrategy);
WRITE_INT_FIELD(numCols);
appendStringInfo(str, " :dupColIdx");
@@ -610,6 +611,7 @@ _outSetOp(StringInfo str, SetOp *node)
appendStringInfo(str, " %u", node->dupOperators[i]);
WRITE_INT_FIELD(flagColIdx);
+ WRITE_LONG_FIELD(numGroups);
}
static void
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index 80b6ed2edfe..4cf2a5208b1 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -10,7 +10,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.242 2008/08/02 21:32:00 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.243 2008/08/07 03:04:03 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -3108,8 +3108,9 @@ make_unique(Plan *lefttree, List *distinctList)
* already be sorted accordingly.
*/
SetOp *
-make_setop(SetOpCmd cmd, Plan *lefttree,
- List *distinctList, AttrNumber flagColIdx)
+make_setop(SetOpCmd cmd, SetOpStrategy strategy, Plan *lefttree,
+ List *distinctList, AttrNumber flagColIdx, long numGroups,
+ double outputRows)
{
SetOp *node = makeNode(SetOp);
Plan *plan = &node->plan;
@@ -3120,20 +3121,13 @@ make_setop(SetOpCmd cmd, Plan *lefttree,
ListCell *slitem;
copy_plan_costsize(plan, lefttree);
+ plan->plan_rows = outputRows;
/*
* Charge one cpu_operator_cost per comparison per input tuple. We assume
* all columns get compared at most of the tuples.
*/
- plan->total_cost += cpu_operator_cost * plan->plan_rows * numCols;
-
- /*
- * We make the unsupported assumption that there will be 10% as many
- * tuples out as in. Any way to do better?
- */
- plan->plan_rows *= 0.1;
- if (plan->plan_rows < 1)
- plan->plan_rows = 1;
+ plan->total_cost += cpu_operator_cost * lefttree->plan_rows * numCols;
plan->targetlist = lefttree->targetlist;
plan->qual = NIL;
@@ -3160,10 +3154,12 @@ make_setop(SetOpCmd cmd, Plan *lefttree,
}
node->cmd = cmd;
+ node->strategy = strategy;
node->numCols = numCols;
node->dupColIdx = dupColIdx;
node->dupOperators = dupOperators;
node->flagColIdx = flagColIdx;
+ node->numGroups = numGroups;
return node;
}
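
To make the revised charge in make_setop() concrete (a rough sketch; 0.0025 is only the shipped default for cpu_operator_cost, so treat it as an assumption about the configuration): with 10,000 input rows and two grouping columns, the added comparison cost is 0.0025 * 10000 * 2 = 50, now computed from the input row count rather than the scaled-down output estimate.

-- Check the value actually in effect before reasoning about plan costs.
SHOW cpu_operator_cost;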
diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c
index 750d59a9515..3a155c0d0ad 100644
--- a/src/backend/optimizer/prep/prepunion.c
+++ b/src/backend/optimizer/prep/prepunion.c
@@ -22,7 +22,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/optimizer/prep/prepunion.c,v 1.150 2008/08/07 01:11:50 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/optimizer/prep/prepunion.c,v 1.151 2008/08/07 03:04:03 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -60,6 +60,7 @@ static Plan *generate_union_plan(SetOperationStmt *op, PlannerInfo *root,
double tuple_fraction,
List *refnames_tlist, List **sortClauses);
static Plan *generate_nonunion_plan(SetOperationStmt *op, PlannerInfo *root,
+ double tuple_fraction,
List *refnames_tlist, List **sortClauses);
static List *recurse_union_children(Node *setOp, PlannerInfo *root,
double tuple_fraction,
@@ -229,7 +230,7 @@ recurse_set_operations(Node *setOp, PlannerInfo *root,
refnames_tlist,
sortClauses);
else
- plan = generate_nonunion_plan(op, root,
+ plan = generate_nonunion_plan(op, root, tuple_fraction,
refnames_tlist,
sortClauses);
@@ -341,6 +342,7 @@ generate_union_plan(SetOperationStmt *op, PlannerInfo *root,
*/
static Plan *
generate_nonunion_plan(SetOperationStmt *op, PlannerInfo *root,
+ double tuple_fraction,
List *refnames_tlist,
List **sortClauses)
{
@@ -351,6 +353,10 @@ generate_nonunion_plan(SetOperationStmt *op, PlannerInfo *root,
*groupList,
*planlist,
*child_sortclauses;
+ double dNumDistinctRows;
+ double dNumOutputRows;
+ long numDistinctRows;
+ bool use_hash;
SetOpCmd cmd;
/* Recurse on children, ensuring their outputs are marked */
@@ -394,9 +400,31 @@ generate_nonunion_plan(SetOperationStmt *op, PlannerInfo *root,
}
/*
+ * XXX for the moment, take the number of distinct groups as being the
+ * total input size, ie, the worst case. This is too conservative, but
+ * we don't want to risk having the hashtable overrun memory; also,
+ * it's not clear how to get a decent estimate of the true size.
+ */
+ dNumDistinctRows = plan->plan_rows;
+
+ /* Also convert to long int --- but 'ware overflow! */
+ numDistinctRows = (long) Min(dNumDistinctRows, (double) LONG_MAX);
+
+ /*
+ * The output size is taken as 10% of that, which is a completely bogus
+ * guess, but it's what we've used historically.
+ */
+ dNumOutputRows = ceil(dNumDistinctRows * 0.1);
+
+ /*
* Decide whether to hash or sort, and add a sort node if needed.
*/
- plan = (Plan *) make_sort_from_sortclauses(root, groupList, plan);
+ use_hash = choose_hashed_setop(root, groupList, plan,
+ tuple_fraction, dNumDistinctRows,
+ (op->op == SETOP_INTERSECT) ? "INTERSECT" : "EXCEPT");
+
+ if (!use_hash)
+ plan = (Plan *) make_sort_from_sortclauses(root, groupList, plan);
/*
* Finally, add a SetOp plan node to generate the correct output.
@@ -414,9 +442,12 @@ generate_nonunion_plan(SetOperationStmt *op, PlannerInfo *root,
cmd = SETOPCMD_INTERSECT; /* keep compiler quiet */
break;
}
- plan = (Plan *) make_setop(cmd, plan, groupList, list_length(op->colTypes) + 1);
+ plan = (Plan *) make_setop(cmd, use_hash ? SETOP_HASHED : SETOP_SORTED,
+ plan, groupList, list_length(op->colTypes) + 1,
+ numDistinctRows, dNumOutputRows);
- *sortClauses = groupList;
+ /* Result is sorted only if we're not hashing */
+ *sortClauses = use_hash ? NIL : groupList;
return plan;
}
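
One user-visible consequence of returning *sortClauses = NIL for hashed plans: the output order of a hashed INTERSECT/EXCEPT is unspecified, so queries that relied on the old sort as a side effect need an explicit ORDER BY; that is exactly what the regression-test updates below add, for example:

-- From the updated regression tests below: the ORDER BY keeps the result
-- order deterministic whether the planner sorts or hashes.
SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1;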
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 96b7a16084b..ce41c750afc 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.185 2008/07/26 19:15:35 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.186 2008/08/07 03:04:03 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -1414,21 +1414,31 @@ typedef struct HashState
/* ----------------
* SetOpState information
*
- * SetOp nodes are used "on top of" sort nodes to discard
- * duplicate tuples returned from the sort phase. These are
- * more complex than a simple Unique since we have to count
- * how many duplicates to return.
+ * Even in "sorted" mode, SetOp nodes are more complex than a simple
+ * Unique, since we have to count how many duplicates to return. But
+ * we also support hashing, so this is really more like a cut-down
+ * form of Agg.
* ----------------
*/
+/* this struct is private in nodeSetOp.c: */
+typedef struct SetOpStatePerGroupData *SetOpStatePerGroup;
+
typedef struct SetOpState
{
PlanState ps; /* its first field is NodeTag */
- FmgrInfo *eqfunctions; /* per-field lookup data for equality fns */
- bool subplan_done; /* has subplan returned EOF? */
- long numLeft; /* number of left-input dups of cur group */
- long numRight; /* number of right-input dups of cur group */
+ FmgrInfo *eqfunctions; /* per-grouping-field equality fns */
+ FmgrInfo *hashfunctions; /* per-grouping-field hash fns */
+ bool setop_done; /* indicates completion of output scan */
long numOutput; /* number of dups left to output */
MemoryContext tempContext; /* short-term context for comparisons */
+ /* these fields are used in SETOP_SORTED mode: */
+ SetOpStatePerGroup pergroup; /* per-group working state */
+ HeapTuple grp_firstTuple; /* copy of first tuple of current group */
+ /* these fields are used in SETOP_HASHED mode: */
+ TupleHashTable hashtable; /* hash table with one entry per group */
+ MemoryContext tableContext; /* memory context containing hash table */
+ bool table_filled; /* hash table filled yet? */
+ TupleHashIterator hashiter; /* for iterating through hash table */
} SetOpState;
/* ----------------
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index 6fb93e810da..e2eaadd29ff 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/nodes/plannodes.h,v 1.100 2008/04/13 20:51:21 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/nodes/plannodes.h,v 1.101 2008/08/07 03:04:04 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -525,15 +525,23 @@ typedef enum SetOpCmd
SETOPCMD_EXCEPT_ALL
} SetOpCmd;
+typedef enum SetOpStrategy
+{
+ SETOP_SORTED, /* input must be sorted */
+ SETOP_HASHED /* use internal hashtable */
+} SetOpStrategy;
+
typedef struct SetOp
{
Plan plan;
SetOpCmd cmd; /* what to do */
+ SetOpStrategy strategy; /* how to do it */
int numCols; /* number of columns to check for
* duplicate-ness */
AttrNumber *dupColIdx; /* their indexes in the target list */
Oid *dupOperators; /* equality operators to compare with */
AttrNumber flagColIdx; /* where is the flag column, if any */
+ long numGroups; /* estimated number of groups in input */
} SetOp;
/* ----------------
diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h
index 0b46a173706..616f0ea2ad0 100644
--- a/src/include/optimizer/planmain.h
+++ b/src/include/optimizer/planmain.h
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/optimizer/planmain.h,v 1.108 2008/05/02 21:26:10 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/optimizer/planmain.h,v 1.109 2008/08/07 03:04:04 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -61,8 +61,9 @@ extern Plan *materialize_finished_plan(Plan *subplan);
extern Unique *make_unique(Plan *lefttree, List *distinctList);
extern Limit *make_limit(Plan *lefttree, Node *limitOffset, Node *limitCount,
int64 offset_est, int64 count_est);
-extern SetOp *make_setop(SetOpCmd cmd, Plan *lefttree,
- List *distinctList, AttrNumber flagColIdx);
+extern SetOp *make_setop(SetOpCmd cmd, SetOpStrategy strategy, Plan *lefttree,
+ List *distinctList, AttrNumber flagColIdx, long numGroups,
+ double outputRows);
extern Result *make_result(PlannerInfo *root, List *tlist,
Node *resconstantqual, Plan *subplan);
extern bool is_projection_capable_plan(Plan *plan);
diff --git a/src/test/regress/expected/union.out b/src/test/regress/expected/union.out
index 722f9651be1..295dacea86d 100644
--- a/src/test/regress/expected/union.out
+++ b/src/test/regress/expected/union.out
@@ -275,21 +275,21 @@ SELECT q2 FROM int8_tbl INTERSECT ALL SELECT q1 FROM int8_tbl;
4567890123456789
(3 rows)
-SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl;
+SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1;
q2
-------------------
-4567890123456789
456
(2 rows)
-SELECT q2 FROM int8_tbl EXCEPT ALL SELECT q1 FROM int8_tbl;
+SELECT q2 FROM int8_tbl EXCEPT ALL SELECT q1 FROM int8_tbl ORDER BY 1;
q2
-------------------
-4567890123456789
456
(2 rows)
-SELECT q2 FROM int8_tbl EXCEPT ALL SELECT DISTINCT q1 FROM int8_tbl;
+SELECT q2 FROM int8_tbl EXCEPT ALL SELECT DISTINCT q1 FROM int8_tbl ORDER BY 1;
q2
-------------------
-4567890123456789
@@ -326,7 +326,7 @@ SELECT f1 FROM float8_tbl INTERSECT SELECT f1 FROM int4_tbl;
0
(1 row)
-SELECT f1 FROM float8_tbl EXCEPT SELECT f1 FROM int4_tbl;
+SELECT f1 FROM float8_tbl EXCEPT SELECT f1 FROM int4_tbl ORDER BY 1;
f1
-----------------------
-1.2345678901234e+200
@@ -369,14 +369,14 @@ SELECT q1 FROM int8_tbl INTERSECT (((SELECT q2 FROM int8_tbl UNION ALL SELECT q2
-4567890123456789
(7 rows)
-SELECT q1 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl;
+SELECT q1 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1;
q1
-------------------
-4567890123456789
456
(2 rows)
-SELECT q1 FROM int8_tbl UNION ALL (((SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl)));
+SELECT q1 FROM int8_tbl UNION ALL (((SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1)));
q1
-------------------
123
@@ -388,7 +388,7 @@ SELECT q1 FROM int8_tbl UNION ALL (((SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FR
456
(7 rows)
-(((SELECT q1 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl))) EXCEPT SELECT q1 FROM int8_tbl;
+(((SELECT q1 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl))) EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1;
q1
-------------------
-4567890123456789
diff --git a/src/test/regress/sql/union.sql b/src/test/regress/sql/union.sql
index 0b83d6b185b..81e19d165c4 100644
--- a/src/test/regress/sql/union.sql
+++ b/src/test/regress/sql/union.sql
@@ -97,11 +97,11 @@ SELECT q2 FROM int8_tbl INTERSECT SELECT q1 FROM int8_tbl;
SELECT q2 FROM int8_tbl INTERSECT ALL SELECT q1 FROM int8_tbl;
-SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl;
+SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1;
-SELECT q2 FROM int8_tbl EXCEPT ALL SELECT q1 FROM int8_tbl;
+SELECT q2 FROM int8_tbl EXCEPT ALL SELECT q1 FROM int8_tbl ORDER BY 1;
-SELECT q2 FROM int8_tbl EXCEPT ALL SELECT DISTINCT q1 FROM int8_tbl;
+SELECT q2 FROM int8_tbl EXCEPT ALL SELECT DISTINCT q1 FROM int8_tbl ORDER BY 1;
SELECT q1 FROM int8_tbl EXCEPT SELECT q2 FROM int8_tbl;
@@ -115,7 +115,7 @@ SELECT q1 FROM int8_tbl EXCEPT ALL SELECT DISTINCT q2 FROM int8_tbl;
SELECT f1 FROM float8_tbl INTERSECT SELECT f1 FROM int4_tbl;
-SELECT f1 FROM float8_tbl EXCEPT SELECT f1 FROM int4_tbl;
+SELECT f1 FROM float8_tbl EXCEPT SELECT f1 FROM int4_tbl ORDER BY 1;
--
-- Operator precedence and (((((extra))))) parentheses
@@ -127,11 +127,11 @@ SELECT q1 FROM int8_tbl INTERSECT (((SELECT q2 FROM int8_tbl UNION ALL SELECT q2
(((SELECT q1 FROM int8_tbl INTERSECT SELECT q2 FROM int8_tbl))) UNION ALL SELECT q2 FROM int8_tbl;
-SELECT q1 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl;
+SELECT q1 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1;
-SELECT q1 FROM int8_tbl UNION ALL (((SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl)));
+SELECT q1 FROM int8_tbl UNION ALL (((SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1)));
-(((SELECT q1 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl))) EXCEPT SELECT q1 FROM int8_tbl;
+(((SELECT q1 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl))) EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1;
--
-- Subqueries with ORDER BY & LIMIT clauses