Fix thinko in hash cost estimation.

The average frequency should be computed from the total number of distinct values in the whole relation, not from the number of distinct values we expect to have after restriction clauses are applied.
author: Tom Lane <tgl@sss.pgh.pa.us> 2001-06-10 02:59:35 +0000
committer: Tom Lane <tgl@sss.pgh.pa.us> 2001-06-10 02:59:35 +0000
commit: a8fe109ac186dea6616ec207c349ef92c214bfda (patch)
tree: ecd7c9ac7c001c11d5e01294a2846a01c55f0b1a /src
parent: 26c94c5d9c7ff7753dad85184adfe774b71580ff (diff)
download: postgresql-a8fe109ac186dea6616ec207c349ef92c214bfda.tar.gz postgresql-a8fe109ac186dea6616ec207c349ef92c214bfda.zip
1 files changed, 6 insertions, 16 deletions
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 65c211deaee..06793f1d8b4 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -42,7 +42,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/optimizer/path/costsize.c,v 1.75 2001/06/05 05:26:04 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/optimizer/path/costsize.c,v 1.76 2001/06/10 02:59:35 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -873,6 +873,9 @@ estimate_hash_bucketsize(Query *root, Var *var)
 	if (ndistinct < 0.0)
 		ndistinct = -ndistinct * rel->tuples;
 
+	/* Also compute avg freq of all distinct data values in raw relation */
+	avgfreq = (1.0 - stats->stanullfrac) / ndistinct;
+
 	/*
 	 * Adjust ndistinct to account for restriction clauses.  Observe we are
 	 * assuming that the data distribution is affected uniformly by the
@@ -884,17 +887,6 @@ estimate_hash_bucketsize(Query *root, Var *var)
 	ndistinct *= rel->rows / rel->tuples;
 
 	/*
-	 * Discourage use of hash join if there seem not to be very many distinct
-	 * data values.  The threshold here is somewhat arbitrary, as is the
-	 * fraction used to "discourage" the choice.
-	 */
-	if (ndistinct < 50.0)
-	{
-		ReleaseSysCache(tuple);
-		return 0.5;
-	}
-
-	/*
 	 * Form initial estimate of bucketsize fraction.  Here we use rel->rows,
 	 * ie the number of rows after applying restriction clauses, because
 	 * that's what the fraction will eventually be multiplied by in
@@ -903,8 +895,8 @@ estimate_hash_bucketsize(Query *root, Var *var)
 	estfract = (double) NTUP_PER_BUCKET / rel->rows;
 
 	/*
-	 * Adjust estimated bucketsize if too few distinct values to fill
-	 * all the buckets.
+	 * Adjust estimated bucketsize if too few distinct values (after
+	 * restriction clauses) to fill all the buckets.
 	 */
 	needdistinct = rel->rows / (double) NTUP_PER_BUCKET;
 	if (ndistinct < needdistinct)
@@ -931,8 +923,6 @@ estimate_hash_bucketsize(Query *root, Var *var)
 	/*
 	 * Adjust estimated bucketsize upward to account for skewed distribution.
 	 */
-	avgfreq = (1.0 - stats->stanullfrac) / ndistinct;
-
 	if (avgfreq > 0.0 && mcvfreq > avgfreq)
 		estfract *= mcvfreq / avgfreq;
author	Tom Lane <tgl@sss.pgh.pa.us>	2001-06-10 02:59:35 +0000
committer	Tom Lane <tgl@sss.pgh.pa.us>	2001-06-10 02:59:35 +0000
commit	a8fe109ac186dea6616ec207c349ef92c214bfda (patch)
tree	ecd7c9ac7c001c11d5e01294a2846a01c55f0b1a /src
parent	26c94c5d9c7ff7753dad85184adfe774b71580ff (diff)
download	postgresql-a8fe109ac186dea6616ec207c349ef92c214bfda.tar.gz postgresql-a8fe109ac186dea6616ec207c349ef92c214bfda.zip