Rewrite of planner statistics-gathering code. ANALYZE is now available as

a separate statement (though it can still be invoked as part of VACUUM, too). pg_statistic redesigned to be more flexible about what statistics are stored. ANALYZE now collects a list of several of the most common values, not just one, plus a histogram (not just the min and max values). Random sampling is used to make the process reasonably fast even on very large tables. The number of values and histogram bins collected is now user-settable via an ALTER TABLE command. There is more still to do; the new stats are not being used everywhere they could be in the planner. But the remaining changes for this project should be localized, and the behavior is already better than before. A not-very-related change is that sorting now makes use of btree comparison routines if it can find one, rather than invoking '<' twice.
author: Tom Lane <tgl@sss.pgh.pa.us> 2001-05-07 00:43:27 +0000
committer: Tom Lane <tgl@sss.pgh.pa.us> 2001-05-07 00:43:27 +0000
commit: f905d65ee35b3f84b6d4433a5198af0e2e7bd090 (patch)
tree: 68f5955bb1a7ecaa531cf6b3752f563943dbe079 /src/backend/utils/adt/selfuncs.c
parent: 9583aea9d09f6b3839ede8e57f990262b24e6979 (diff)
download: postgresql-f905d65ee35b3f84b6d4433a5198af0e2e7bd090.tar.gz
postgresql-f905d65ee35b3f84b6d4433a5198af0e2e7bd090.zip
1 files changed, 530 insertions, 393 deletions
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 1fe0afb0a35..41ba82db7b5 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -15,7 +15,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.87 2001/03/23 04:49:54 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.88 2001/05/07 00:43:23 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -57,9 +57,6 @@
 /* default selectivity estimate for pattern-match operators such as LIKE */
 #define DEFAULT_MATCH_SEL	0.01
 
-/* "fudge factor" for estimating frequency of not-most-common values */
-#define NOT_MOST_COMMON_RATIO  0.1
-
 static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
 				  Datum lobound, Datum hibound, Oid boundstypid,
 				  double *scaledlobound, double *scaledhibound);
@@ -75,17 +72,9 @@ static double convert_one_string_to_scalar(unsigned char *value,
 static unsigned char *convert_string_datum(Datum value, Oid typid);
 static double convert_timevalue_to_scalar(Datum value, Oid typid);
 static void getattproperties(Oid relid, AttrNumber attnum,
-				 Oid *typid,
-				 int *typlen,
-				 bool *typbyval,
-				 int32 *typmod);
-static bool getattstatistics(Oid relid, AttrNumber attnum,
-				 Oid typid, int32 typmod,
-				 double *nullfrac,
-				 double *commonfrac,
-				 Datum *commonval,
-				 Datum *loval,
-				 Datum *hival);
+							 Oid *typid, int32 *typmod);
+static double get_att_numdistinct(Oid relid, AttrNumber attnum, Oid typid,
+								  Form_pg_statistic stats);
 static Selectivity prefix_selectivity(char *prefix,
 				   Oid relid,
 				   AttrNumber attno,
@@ -115,134 +104,173 @@ eqsel(PG_FUNCTION_ARGS)
 	AttrNumber	attno = PG_GETARG_INT16(2);
 	Datum		value = PG_GETARG_DATUM(3);
 	int32		flag = PG_GETARG_INT32(4);
-	float8		result;
-
-	if (NONVALUE(attno) || NONVALUE(relid))
-		result = DEFAULT_EQ_SEL;
-	else
+	Oid			typid;
+	int32		typmod;
+	HeapTuple	statsTuple;
+	Datum	   *values;
+	int			nvalues;
+	float4	   *numbers;
+	int			nnumbers;
+	double		selec;
+
+	if (NONVALUE(relid) || NONVALUE(attno))
+		PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
+
+	/* get info about the attribute */
+	getattproperties(relid, attno, &typid, &typmod);
+
+	/* get stats for the attribute, if available */
+	statsTuple = SearchSysCache(STATRELATT,
+								ObjectIdGetDatum(relid),
+								Int16GetDatum(attno),
+								0, 0);
+	if (HeapTupleIsValid(statsTuple))
 	{
-		Oid			typid;
-		int			typlen;
-		bool		typbyval;
-		int32		typmod;
-		double		nullfrac;
-		double		commonfrac;
-		Datum		commonval;
-		double		selec;
-
-		/* get info about the attribute */
-		getattproperties(relid, attno,
-						 &typid, &typlen, &typbyval, &typmod);
-
-		/* get stats for the attribute, if available */
-		if (getattstatistics(relid, attno, typid, typmod,
-							 &nullfrac, &commonfrac, &commonval,
-							 NULL, NULL))
-		{
-			if (flag & SEL_CONSTANT)
-			{
+		Form_pg_statistic stats;
 
-				/*
-				 * Is the constant "=" to the column's most common value?
-				 * (Although the operator may not really be "=", we will
-				 * assume that seeing whether it returns TRUE for the most
-				 * common value is useful information. If you don't like
-				 * it, maybe you shouldn't be using eqsel for your
-				 * operator...)
-				 */
-				RegProcedure eqproc = get_opcode(opid);
-				bool		mostcommon;
+		stats = (Form_pg_statistic) GETSTRUCT(statsTuple);
 
-				if (eqproc == (RegProcedure) NULL)
-					elog(ERROR, "eqsel: no procedure for operator %u",
-						 opid);
+		if (flag & SEL_CONSTANT)
+		{
+			bool	match = false;
+			int		i;
 
-				/* be careful to apply operator right way 'round */
-				if (flag & SEL_RIGHT)
-					mostcommon = DatumGetBool(OidFunctionCall2(eqproc,
-															   commonval,
-															   value));
-				else
-					mostcommon = DatumGetBool(OidFunctionCall2(eqproc,
-															   value,
-															 commonval));
+			/*
+			 * Is the constant "=" to any of the column's most common
+			 * values?  (Although the given operator may not really be
+			 * "=", we will assume that seeing whether it returns TRUE
+			 * is an appropriate test.  If you don't like this, maybe you
+			 * shouldn't be using eqsel for your operator...)
+			 */
+			if (get_attstatsslot(statsTuple, typid, typmod,
+								 STATISTIC_KIND_MCV, InvalidOid,
+								 &values, &nvalues,
+								 &numbers, &nnumbers))
+			{
+				FmgrInfo	eqproc;
 
-				if (mostcommon)
-				{
+				fmgr_info(get_opcode(opid), &eqproc);
 
-					/*
-					 * Constant is "=" to the most common value.  We know
-					 * selectivity exactly (or as exactly as VACUUM could
-					 * calculate it, anyway).
-					 */
-					selec = commonfrac;
-				}
-				else
+				for (i = 0; i < nvalues; i++)
 				{
-
-					/*
-					 * Comparison is against a constant that is neither
-					 * the most common value nor null.	Its selectivity
-					 * cannot be more than this:
-					 */
-					selec = 1.0 - commonfrac - nullfrac;
-					if (selec > commonfrac)
-						selec = commonfrac;
-
-					/*
-					 * and in fact it's probably less, so we should apply
-					 * a fudge factor.	The only case where we don't is
-					 * for a boolean column, where indeed we have
-					 * estimated the less-common value's frequency
-					 * exactly!
-					 */
-					if (typid != BOOLOID)
-						selec *= NOT_MOST_COMMON_RATIO;
+					/* be careful to apply operator right way 'round */
+					if (flag & SEL_RIGHT)
+						match = DatumGetBool(FunctionCall2(&eqproc,
+														   values[i],
+														   value));
+					else
+						match = DatumGetBool(FunctionCall2(&eqproc,
+														   value,
+														   values[i]));
+					if (match)
+						break;
 				}
 			}
 			else
 			{
+				/* no most-common-value info available */
+				values = NULL;
+				numbers = NULL;
+				i = nvalues = nnumbers = 0;
+			}
 
+			if (match)
+			{
+				/*
+				 * Constant is "=" to this common value.  We know
+				 * selectivity exactly (or as exactly as VACUUM
+				 * could calculate it, anyway).
+				 */
+				selec = numbers[i];
+			}
+			else
+			{
 				/*
-				 * Search is for a value that we do not know a priori, but
-				 * we will assume it is not NULL.  Selectivity cannot be
-				 * more than this:
+				 * Comparison is against a constant that is neither
+				 * NULL nor any of the common values.  Its selectivity
+				 * cannot be more than this:
 				 */
-				selec = 1.0 - nullfrac;
-				if (selec > commonfrac)
-					selec = commonfrac;
+				double	sumcommon = 0.0;
+				double	otherdistinct;
 
+				for (i = 0; i < nnumbers; i++)
+					sumcommon += numbers[i];
+				selec = 1.0 - sumcommon - stats->stanullfrac;
+				/*
+				 * and in fact it's probably a good deal less.
+				 * We approximate that all the not-common values
+				 * share this remaining fraction equally, so we
+				 * divide by the number of other distinct values.
+				 */
+				otherdistinct = get_att_numdistinct(relid, attno,
+													typid, stats)
+					- nnumbers;
+				if (otherdistinct > 1)
+					selec /= otherdistinct;
 				/*
-				 * and in fact it's probably less, so apply a fudge
-				 * factor.
+				 * Another cross-check: selectivity shouldn't be
+				 * estimated as more than the least common
+				 * "most common value".
 				 */
-				selec *= NOT_MOST_COMMON_RATIO;
+				if (nnumbers > 0 && selec > numbers[nnumbers-1])
+					selec = numbers[nnumbers-1];
 			}
 
-			/* result should be in range, but make sure... */
-			if (selec < 0.0)
-				selec = 0.0;
-			else if (selec > 1.0)
-				selec = 1.0;
-
-			if (!typbyval)
-				pfree(DatumGetPointer(commonval));
+			free_attstatsslot(typid, values, nvalues, numbers, nnumbers);
 		}
 		else
 		{
+			double		ndistinct;
 
 			/*
-			 * No VACUUM ANALYZE stats available, so make a guess using
-			 * the dispersion stat (if we have that, which is unlikely for
-			 * a normal attribute; but for a system attribute we may be
-			 * able to estimate it).
+			 * Search is for a value that we do not know a priori, but
+			 * we will assume it is not NULL.  Estimate the selectivity
+			 * as non-null fraction divided by number of distinct values,
+			 * so that we get a result averaged over all possible values
+			 * whether common or uncommon.  (Essentially, we are assuming
+			 * that the not-yet-known comparison value is equally likely
+			 * to be any of the possible values, regardless of their
+			 * frequency in the table.  Is that a good idea?)
+			 */
+			selec = 1.0 - stats->stanullfrac;
+			ndistinct = get_att_numdistinct(relid, attno, typid, stats);
+			if (ndistinct > 1)
+				selec /= ndistinct;
+			/*
+			 * Cross-check: selectivity should never be
+			 * estimated as more than the most common value's.
 			 */
-			selec = get_attdispersion(relid, attno, 0.01);
+			if (get_attstatsslot(statsTuple, typid, typmod,
+								 STATISTIC_KIND_MCV, InvalidOid,
+								 NULL, NULL,
+								 &numbers, &nnumbers))
+			{
+				if (nnumbers > 0 && selec > numbers[0])
+					selec = numbers[0];
+				free_attstatsslot(typid, NULL, 0, numbers, nnumbers);
+			}
 		}
 
-		result = (float8) selec;
+		ReleaseSysCache(statsTuple);
 	}
-	PG_RETURN_FLOAT8(result);
+	else
+	{
+		/*
+		 * No VACUUM ANALYZE stats available, so make a guess using
+		 * estimated number of distinct values and assuming they are
+		 * equally common.  (The guess is unlikely to be very good,
+		 * but we do know a few special cases.)
+		 */
+		selec = 1.0 / get_att_numdistinct(relid, attno, typid, NULL);
+	}
+
+	/* result should be in range, but make sure... */
+	if (selec < 0.0)
+		selec = 0.0;
+	else if (selec > 1.0)
+		selec = 1.0;
+
+	PG_RETURN_FLOAT8((float8) selec);
 }
 
 /*
@@ -301,117 +329,263 @@ scalarltsel(PG_FUNCTION_ARGS)
 	AttrNumber	attno = PG_GETARG_INT16(2);
 	Datum		value = PG_GETARG_DATUM(3);
 	int32		flag = PG_GETARG_INT32(4);
-	float8		result;
+	bool		isgt;
+	HeapTuple	oprTuple;
+	HeapTuple	statsTuple;
+	Form_pg_statistic stats;
+	Oid			contype;
+	FmgrInfo	opproc;
+	Oid			typid;
+	int32		typmod;
+	Datum	   *values;
+	int			nvalues;
+	float4	   *numbers;
+	int			nnumbers;
+	double		mcv_selec,
+				hist_selec,
+				sumcommon;
+	double		selec;
+	int			i;
+
+	if (NONVALUE(relid) || NONVALUE(attno))
+		PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
+
+	/* Can't do anything useful if no constant to compare against, either */
+	if (!(flag & SEL_CONSTANT))
+		PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 
-	if (!(flag & SEL_CONSTANT) || NONVALUE(attno) || NONVALUE(relid))
-		result = DEFAULT_INEQ_SEL;
+	/*
+	 * Force the constant to be on the right to simplify later logic.
+	 * This means that we may be dealing with either "<" or ">" cases.
+	 */
+	if (flag & SEL_RIGHT)
+	{
+		/* we have x < const */
+		isgt = false;
+	}
 	else
 	{
-		HeapTuple	oprtuple;
-		Oid			ltype,
-					rtype,
-					contype;
-		Oid			typid;
-		int			typlen;
-		bool		typbyval;
-		int32		typmod;
-		Datum		hival,
-					loval;
-		double		val,
-					high,
-					low,
-					numerator,
-					denominator;
-
-		/*
-		 * Get left and right datatypes of the operator so we know what
-		 * type the constant is.
-		 */
-		oprtuple = SearchSysCache(OPEROID,
-								  ObjectIdGetDatum(opid),
-								  0, 0, 0);
-		if (!HeapTupleIsValid(oprtuple))
-			elog(ERROR, "scalarltsel: no tuple for operator %u", opid);
-		ltype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprleft;
-		rtype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprright;
-		contype = (flag & SEL_RIGHT) ? rtype : ltype;
-		ReleaseSysCache(oprtuple);
-
-		/* Now get info and stats about the attribute */
-		getattproperties(relid, attno,
-						 &typid, &typlen, &typbyval, &typmod);
-
-		if (!getattstatistics(relid, attno, typid, typmod,
-							  NULL, NULL, NULL,
-							  &loval, &hival))
+		/* we have const < x, commute to make x > const */
+		opid = get_commutator(opid);
+		if (!opid)
 		{
-			/* no stats available, so default result */
+			/* Use default selectivity (should we raise an error instead?) */
 			PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 		}
+		isgt = true;
+	}
 
-		/* Convert the values to a uniform comparison scale. */
-		if (!convert_to_scalar(value, contype, &val,
-							   loval, hival, typid,
-							   &low, &high))
-		{
+	/*
+	 * The constant might not be the same datatype as the column;
+	 * look at the operator's input types to find out what it is.
+	 * Also set up to be able to call the operator's execution proc.
+	 */
+	oprTuple = SearchSysCache(OPEROID,
+							  ObjectIdGetDatum(opid),
+							  0, 0, 0);
+	if (!HeapTupleIsValid(oprTuple))
+		elog(ERROR, "scalarltsel: no tuple for operator %u", opid);
+	contype = ((Form_pg_operator) GETSTRUCT(oprTuple))->oprright;
+	fmgr_info(((Form_pg_operator) GETSTRUCT(oprTuple))->oprcode, &opproc);
+	ReleaseSysCache(oprTuple);
+
+	/* Now get info and stats about the attribute */
+	getattproperties(relid, attno, &typid, &typmod);
+
+	statsTuple = SearchSysCache(STATRELATT,
+								ObjectIdGetDatum(relid),
+								Int16GetDatum(attno),
+								0, 0);
+	if (!HeapTupleIsValid(statsTuple))
+	{
+		/* no stats available, so default result */
+		PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
+	}
+	stats = (Form_pg_statistic) GETSTRUCT(statsTuple);
 
-			/*
-			 * Ideally we'd produce an error here, on the grounds that the
-			 * given operator shouldn't have scalarltsel registered as its
-			 * selectivity func unless we can deal with its operand types.
-			 * But currently, all manner of stuff is invoking scalarltsel,
-			 * so give a default estimate until that can be fixed.
-			 */
-			if (!typbyval)
-			{
-				pfree(DatumGetPointer(hival));
-				pfree(DatumGetPointer(loval));
-			}
-			PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
-		}
+	/*
+	 * If we have most-common-values info, add up the fractions of the
+	 * MCV entries that satisfy MCV OP CONST.  These fractions contribute
+	 * directly to the result selectivity.  Also add up the total fraction
+	 * represented by MCV entries.
+	 */
+	mcv_selec = 0.0;
+	sumcommon = 0.0;
 
-		/* release temp storage if needed */
-		if (!typbyval)
+	if (get_attstatsslot(statsTuple, typid, typmod,
+						 STATISTIC_KIND_MCV, InvalidOid,
+						 &values, &nvalues,
+						 &numbers, &nnumbers))
+	{
+		for (i = 0; i < nvalues; i++)
 		{
-			pfree(DatumGetPointer(hival));
-			pfree(DatumGetPointer(loval));
+			if (DatumGetBool(FunctionCall2(&opproc,
+										   values[i],
+										   value)))
+				mcv_selec += numbers[i];
+			sumcommon += numbers[i];
 		}
+		free_attstatsslot(typid, values, nvalues, numbers, nnumbers);
+	}
+
+	/*
+	 * If there is a histogram, determine which bin the constant falls in,
+	 * and compute the resulting contribution to selectivity.
+	 *
+	 * Someday, VACUUM might store more than one histogram per rel/att,
+	 * corresponding to more than one possible sort ordering defined for
+	 * the column type.  However, to make that work we will need to figure
+	 * out which staop to search for --- it's not necessarily the one we
+	 * have at hand!  (For example, we might have a '<=' operator rather
+	 * than the '<' operator that will appear in staop.)  For now, assume
+	 * that whatever appears in pg_statistic is sorted the same way our
+	 * operator sorts.
+	 */
+	hist_selec = 0.0;
 
-		if (high <= low)
+	if (get_attstatsslot(statsTuple, typid, typmod,
+						 STATISTIC_KIND_HISTOGRAM, InvalidOid,
+						 &values, &nvalues,
+						 NULL, NULL))
+	{
+		if (nvalues > 1)
 		{
+			double	histfrac;
+			bool	ltcmp;
+
+			ltcmp = DatumGetBool(FunctionCall2(&opproc,
+											   values[0],
+											   value));
+			if (isgt)
+				ltcmp = !ltcmp;
+			if (!ltcmp)
+			{
+				/* Constant is below lower histogram boundary. */
+				histfrac = 0.0;
+			}
+			else
+			{
+				/*
+				 * Scan to find proper location.  This could be made faster
+				 * by using a binary-search method, but it's probably not
+				 * worth the trouble for typical histogram sizes.
+				 */
+				for (i = 1; i < nvalues; i++)
+				{
+					ltcmp = DatumGetBool(FunctionCall2(&opproc,
+													   values[i],
+													   value));
+					if (isgt)
+						ltcmp = !ltcmp;
+					if (!ltcmp)
+						break;
+				}
+				if (i >= nvalues)
+				{
+					/* Constant is above upper histogram boundary. */
+					histfrac = 1.0;
+				}
+				else
+				{
+					double		val,
+								high,
+								low;
+					double		binfrac;
 
+					/*
+					 * We have values[i-1] < constant < values[i].
+					 *
+					 * Convert the constant and the two nearest bin boundary
+					 * values to a uniform comparison scale, and do a linear
+					 * interpolation within this bin.
+					 */
+					if (convert_to_scalar(value, contype, &val,
+										  values[i-1], values[i], typid,
+										  &low, &high))
+					{
+						if (high <= low)
+						{
+							/* cope if bin boundaries appear identical */
+							binfrac = 0.5;
+						}
+						else if (val <= low)
+							binfrac = 0.0;
+						else if (val >= high)
+							binfrac = 1.0;
+						else
+							binfrac = (val - low) / (high - low);
+					}
+					else
+					{
+						/*
+						 * Ideally we'd produce an error here, on the grounds
+						 * that the given operator shouldn't have scalarltsel
+						 * registered as its selectivity func unless we can
+						 * deal with its operand types.  But currently, all
+						 * manner of stuff is invoking scalarltsel, so give a
+						 * default estimate until that can be fixed.
+						 */
+						binfrac = 0.5;
+					}
+					/*
+					 * Now, compute the overall selectivity across the values
+					 * represented by the histogram.  We have i-1 full bins
+					 * and binfrac partial bin below the constant.
+					 */
+					histfrac = (double) (i-1) + binfrac;
+					histfrac /= (double) (nvalues - 1);
+				}
+			}
 			/*
-			 * If we trusted the stats fully, we could return a small or
-			 * large selec depending on which side of the single data
-			 * point the constant is on.  But it seems better to assume
-			 * that the stats are wrong and return a default...
+			 * Now histfrac = fraction of histogram entries below the constant.
+			 *
+			 * Account for "<" vs ">"
 			 */
-			result = DEFAULT_INEQ_SEL;
-		}
-		else if (val < low || val > high)
-		{
-
+			hist_selec = isgt ? (1.0 - histfrac) : histfrac;
 			/*
-			 * If given value is outside the statistical range, return a
-			 * small or large value; but not 0.0/1.0 since there is a
-			 * chance the stats are out of date.
+			 * The histogram boundaries are only approximate to begin
+			 * with, and may well be out of date anyway.  Therefore,
+			 * don't believe extremely small or large selectivity
+			 * estimates.
 			 */
-			if (flag & SEL_RIGHT)
-				result = (val < low) ? 0.001 : 0.999;
-			else
-				result = (val < low) ? 0.999 : 0.001;
-		}
-		else
-		{
-			denominator = high - low;
-			if (flag & SEL_RIGHT)
-				numerator = val - low;
-			else
-				numerator = high - val;
-			result = numerator / denominator;
+			if (hist_selec < 0.001)
+				hist_selec = 0.001;
+			else if (hist_selec > 0.999)
+				hist_selec = 0.999;
 		}
+
+		free_attstatsslot(typid, values, nvalues, NULL, 0);
 	}
-	PG_RETURN_FLOAT8(result);
+
+	/*
+	 * Now merge the results from the MCV and histogram calculations,
+	 * realizing that the histogram covers only the non-null values that
+	 * are not listed in MCV.
+	 */
+	selec = 1.0 - stats->stanullfrac - sumcommon;
+
+	if (hist_selec > 0.0)
+		selec *= hist_selec;
+	else
+	{
+		/*
+		 * If no histogram but there are values not accounted for by MCV,
+		 * arbitrarily assume half of them will match.
+		 */
+		selec *= 0.5;
+	}
+
+	selec += mcv_selec;
+
+	ReleaseSysCache(statsTuple);
+
+	/* result should be in range, but make sure... */
+	if (selec < 0.0)
+		selec = 0.0;
+	else if (selec > 1.0)
+		selec = 1.0;
+
+	PG_RETURN_FLOAT8((float8) selec);
 }
 
 /*
@@ -428,34 +602,25 @@ scalargtsel(PG_FUNCTION_ARGS)
 	Datum		value = PG_GETARG_DATUM(3);
 	int32		flag = PG_GETARG_INT32(4);
 	Oid			ltopid;
-	float8		result;
 
 	/*
-	 * Compute selectivity of "<", then invert --- but only if we were
-	 * able to produce a non-default estimate.	Note that we get the
-	 * negator which strictly speaking means we are looking at "<=" for
-	 * ">" or "<" for ">=".  We assume this won't matter.
+	 * Commute so that we have a "<" or "<=" operator, then apply
+	 * scalarltsel.
 	 */
-	ltopid = get_negator(opid);
-	if (ltopid)
-	{
-		result = DatumGetFloat8(DirectFunctionCall5(scalarltsel,
-												ObjectIdGetDatum(ltopid),
-												 ObjectIdGetDatum(relid),
-													Int16GetDatum(attno),
-													value,
-													Int32GetDatum(flag)));
-	}
-	else
+	ltopid = get_commutator(opid);
+	if (!ltopid)
 	{
 		/* Use default selectivity (should we raise an error instead?) */
-		result = DEFAULT_INEQ_SEL;
+		PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
 	}
 
-	if (result != DEFAULT_INEQ_SEL)
-		result = 1.0 - result;
-
-	PG_RETURN_FLOAT8(result);
+	flag ^= SEL_RIGHT;
+	return DirectFunctionCall5(scalarltsel,
+							   ObjectIdGetDatum(ltopid),
+							   ObjectIdGetDatum(relid),
+							   Int16GetDatum(attno),
+							   value,
+							   Int32GetDatum(flag));
 }
 
 /*
@@ -476,7 +641,7 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
 		result = DEFAULT_MATCH_SEL;
 	else
 	{
-		HeapTuple	oprtuple;
+		HeapTuple	oprTuple;
 		Oid			ltype,
 					rtype;
 		char	   *patt;
@@ -488,14 +653,14 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
 		 * Get left and right datatypes of the operator so we know what
 		 * type the attribute is.
 		 */
-		oprtuple = SearchSysCache(OPEROID,
+		oprTuple = SearchSysCache(OPEROID,
 								  ObjectIdGetDatum(opid),
 								  0, 0, 0);
-		if (!HeapTupleIsValid(oprtuple))
+		if (!HeapTupleIsValid(oprTuple))
 			elog(ERROR, "patternsel: no tuple for operator %u", opid);
-		ltype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprleft;
-		rtype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprright;
-		ReleaseSysCache(oprtuple);
+		ltype = ((Form_pg_operator) GETSTRUCT(oprTuple))->oprleft;
+		rtype = ((Form_pg_operator) GETSTRUCT(oprTuple))->oprright;
+		ReleaseSysCache(oprTuple);
 
 		/* the right-hand const is type text for all supported operators */
 		Assert(rtype == TEXTOID);
@@ -659,42 +824,88 @@ eqjoinsel(PG_FUNCTION_ARGS)
 	AttrNumber	attno1 = PG_GETARG_INT16(2);
 	Oid			relid2 = PG_GETARG_OID(3);
 	AttrNumber	attno2 = PG_GETARG_INT16(4);
-	float8		result;
-	float8		num1,
-				num2,
-				min;
 	bool		unknown1 = NONVALUE(relid1) || NONVALUE(attno1);
 	bool		unknown2 = NONVALUE(relid2) || NONVALUE(attno2);
+	double		selec;
 
 	if (unknown1 && unknown2)
-		result = DEFAULT_EQ_SEL;
+		selec = DEFAULT_EQ_SEL;
 	else
 	{
-		num1 = unknown1 ? 1.0 : get_attdispersion(relid1, attno1, 0.01);
-		num2 = unknown2 ? 1.0 : get_attdispersion(relid2, attno2, 0.01);
+		Oid			typid1;
+		Oid			typid2;
+		int32		typmod1;
+		int32		typmod2;
+		HeapTuple	statsTuple1 = NULL;
+		HeapTuple	statsTuple2 = NULL;
+		Form_pg_statistic stats1 = NULL;
+		Form_pg_statistic stats2 = NULL;
+		double		nd1,
+					nd2;
+
+		if (unknown1)
+		{
+			nd1 = 100.0;
+		}
+		else
+		{
+			/* get info about the attribute */
+			getattproperties(relid1, attno1, &typid1, &typmod1);
+
+			/* get stats for the attribute, if available */
+			statsTuple1 = SearchSysCache(STATRELATT,
+										 ObjectIdGetDatum(relid1),
+										 Int16GetDatum(attno1),
+										 0, 0);
+			if (HeapTupleIsValid(statsTuple1))
+				stats1 = (Form_pg_statistic) GETSTRUCT(statsTuple1);
+
+			nd1 = get_att_numdistinct(relid1, attno1, typid1, stats1);
+		}
+
+		if (unknown2)
+		{
+			nd2 = 100.0;
+		}
+		else
+		{
+			/* get info about the attribute */
+			getattproperties(relid2, attno2, &typid2, &typmod2);
+
+			/* get stats for the attribute, if available */
+			statsTuple2 = SearchSysCache(STATRELATT,
+										 ObjectIdGetDatum(relid2),
+										 Int16GetDatum(attno2),
+										 0, 0);
+			if (HeapTupleIsValid(statsTuple2))
+				stats2 = (Form_pg_statistic) GETSTRUCT(statsTuple2);
+
+			nd2 = get_att_numdistinct(relid2, attno2, typid2, stats2);
+		}
 
 		/*
-		 * The join selectivity cannot be more than num2, since each tuple
-		 * in table 1 could match no more than num2 fraction of tuples in
-		 * table 2 (and that's only if the table-1 tuple matches the most
-		 * common value in table 2, so probably it's less).  By the same
-		 * reasoning it is not more than num1. The min is therefore an
-		 * upper bound.
+		 * Estimate the join selectivity as 1 / sqrt(nd1*nd2)
+		 * (can we produce any theory for this)?
 		 *
-		 * If we know the dispersion of only one side, use it; the reasoning
-		 * above still works.
+		 * XXX possibility to do better: if both attributes have histograms
+		 * then we could determine the exact join selectivity between the
+		 * MCV sets, and only have to assume the join behavior of the non-MCV
+		 * values.  This could be a big win when the MCVs cover a large part
+		 * of the population.
 		 *
-		 * XXX can we make a better estimate here?	Using the nullfrac
-		 * statistic might be helpful, for example.  Assuming the operator
-		 * is strict (does not succeed for null inputs) then the
-		 * selectivity couldn't be more than (1-nullfrac1)*(1-nullfrac2),
-		 * which might be usefully small if there are many nulls.  How
-		 * about applying the operator to the most common values?
+		 * XXX what about nulls?
 		 */
-		min = (num1 < num2) ? num1 : num2;
-		result = min;
+		selec = 1.0 / sqrt(nd1 * nd2);
+		if (selec > 1.0)
+			selec = 1.0;
+
+		if (HeapTupleIsValid(statsTuple1))
+			ReleaseSysCache(statsTuple1);
+		if (HeapTupleIsValid(statsTuple2))
+			ReleaseSysCache(statsTuple2);
+
 	}
-	PG_RETURN_FLOAT8(result);
+	PG_RETURN_FLOAT8((float8) selec);
 }
 
 /*
@@ -829,7 +1040,8 @@ icnlikejoinsel(PG_FUNCTION_ARGS)
  *	  Returns "true" if successful.
  *
  * All numeric datatypes are simply converted to their equivalent
- * "double" values.
+ * "double" values.  XXX what about NUMERIC values that are outside
+ * the range of "double"?
  *
  * String datatypes are converted by convert_string_to_scalar(),
  * which is explained below.  The reason why this routine deals with
@@ -917,7 +1129,7 @@ convert_numeric_to_scalar(Datum value, Oid typid)
 {
 	switch (typid)
 	{
-			case BOOLOID:
+		case BOOLOID:
 			return (double) DatumGetBool(value);
 		case INT2OID:
 			return (double) DatumGetInt16(value);
@@ -963,6 +1175,8 @@ convert_numeric_to_scalar(Datum value, Oid typid)
  * three strings before computing the scaled values.  This allows us to
  * "zoom in" when we encounter a narrow data range.  An example is a phone
  * number database where all the values begin with the same area code.
+ * (Actually, the bounds will be adjacent histogram-bin-boundary values,
+ * so this is more likely to happen than you might think.)
  */
 static void
 convert_string_to_scalar(unsigned char *value,
@@ -1208,11 +1422,11 @@ convert_timevalue_to_scalar(Datum value, Oid typid)
 /*
  * getattproperties
  *	  Retrieve pg_attribute properties for an attribute,
- *	  including type OID, type len, type byval flag, typmod.
+ *	  including type OID and typmod.
  */
 static void
 getattproperties(Oid relid, AttrNumber attnum,
-				 Oid *typid, int *typlen, bool *typbyval, int32 *typmod)
+				 Oid *typid, int32 *typmod)
 {
 	HeapTuple	atp;
 	Form_pg_attribute att_tup;
@@ -1227,164 +1441,87 @@ getattproperties(Oid relid, AttrNumber attnum,
 	att_tup = (Form_pg_attribute) GETSTRUCT(atp);
 
 	*typid = att_tup->atttypid;
-	*typlen = att_tup->attlen;
-	*typbyval = att_tup->attbyval;
 	*typmod = att_tup->atttypmod;
 
 	ReleaseSysCache(atp);
 }
 
 /*
- * getattstatistics
- *	  Retrieve the pg_statistic data for an attribute.
- *	  Returns 'false' if no stats are available.
+ * get_att_numdistinct
  *
- * Inputs:
- * 'relid' and 'attnum' are the relation and attribute number.
- * 'typid' and 'typmod' are the type and typmod of the column,
- * which the caller must already have looked up.
+ *	  Estimate the number of distinct values of an attribute.
  *
- * Outputs:
- * The available stats are nullfrac, commonfrac, commonval, loval, hival.
- * The caller need not retrieve all five --- pass NULL pointers for the
- * unwanted values.
+ * relid, attnum: identify the attribute to examine.
+ * typid: type of attribute.
+ * stats: pg_statistic tuple for attribute, or NULL if not available.
  *
- * commonval, loval, hival are returned as Datums holding the internal
- * representation of the values.  (Note that these should be pfree'd
- * after use if the data type is not by-value.)
+ * XXX possible future improvement: look to see if there is a unique
+ * index on the attribute.  If so, we can estimate ndistinct = ntuples.
+ * This should probably override any info from pg_statistic.
  */
-static bool
-getattstatistics(Oid relid,
-				 AttrNumber attnum,
-				 Oid typid,
-				 int32 typmod,
-				 double *nullfrac,
-				 double *commonfrac,
-				 Datum *commonval,
-				 Datum *loval,
-				 Datum *hival)
+static double
+get_att_numdistinct(Oid relid, AttrNumber attnum, Oid typid,
+					Form_pg_statistic stats)
 {
-	HeapTuple	tuple;
-	HeapTuple	typeTuple;
-	FmgrInfo	inputproc;
-	Oid			typelem;
-	bool		isnull;
+	HeapTuple	reltup;
+	double		ntuples;
 
 	/*
-	 * We assume that there will only be one entry in pg_statistic for the
-	 * given rel/att, so we search WITHOUT considering the staop column.
-	 * Someday, VACUUM might store more than one entry per rel/att,
-	 * corresponding to more than one possible sort ordering defined for
-	 * the column type.  However, to make that work we will need to figure
-	 * out which staop to search for --- it's not necessarily the one we
-	 * have at hand!  (For example, we might have a '>' operator rather
-	 * than the '<' operator that will appear in staop.)
+	 * Special-case boolean columns: presumably, two distinct values.
+	 *
+	 * Are there any other cases we should wire in special estimates for?
 	 */
-	tuple = SearchSysCache(STATRELID,
-						   ObjectIdGetDatum(relid),
-						   Int16GetDatum((int16) attnum),
-						   0, 0);
-	if (!HeapTupleIsValid(tuple))
-	{
-		/* no such stats entry */
-		return false;
-	}
+	if (typid == BOOLOID)
+		return 2.0;
 
-	if (nullfrac)
-		*nullfrac = ((Form_pg_statistic) GETSTRUCT(tuple))->stanullfrac;
-	if (commonfrac)
-		*commonfrac = ((Form_pg_statistic) GETSTRUCT(tuple))->stacommonfrac;
-
-	/* Get the type input proc for the column datatype */
-	typeTuple = SearchSysCache(TYPEOID,
-							   ObjectIdGetDatum(typid),
-							   0, 0, 0);
-	if (!HeapTupleIsValid(typeTuple))
-		elog(ERROR, "getattstatistics: Cache lookup failed for type %u",
-			 typid);
-	fmgr_info(((Form_pg_type) GETSTRUCT(typeTuple))->typinput, &inputproc);
-	typelem = ((Form_pg_type) GETSTRUCT(typeTuple))->typelem;
-	ReleaseSysCache(typeTuple);
+	/*
+	 * If VACUUM ANALYZE determined a fixed estimate, use it.
+	 */
+	if (stats && stats->stadistinct > 0.0)
+		return stats->stadistinct;
 
 	/*
-	 * Values are variable-length fields, so cannot access as struct
-	 * fields. Must do it the hard way with SysCacheGetAttr.
+	 * Otherwise we need to get the relation size.
 	 */
-	if (commonval)
-	{
-		Datum		val = SysCacheGetAttr(STATRELID, tuple,
-										  Anum_pg_statistic_stacommonval,
-										  &isnull);
+	reltup = SearchSysCache(RELOID,
+							ObjectIdGetDatum(relid),
+							0, 0, 0);
+	if (!HeapTupleIsValid(reltup))
+		elog(ERROR, "get_att_numdistinct: no relation tuple %u", relid);
 
-		if (isnull)
-		{
-			elog(DEBUG, "getattstatistics: stacommonval is null");
-			*commonval = PointerGetDatum(NULL);
-		}
-		else
-		{
-			char	   *strval = DatumGetCString(DirectFunctionCall1(textout,
-																   val));
-
-			*commonval = FunctionCall3(&inputproc,
-									   CStringGetDatum(strval),
-									   ObjectIdGetDatum(typelem),
-									   Int32GetDatum(typmod));
-			pfree(strval);
-		}
-	}
+	ntuples = ((Form_pg_class) GETSTRUCT(reltup))->reltuples;
 
-	if (loval)
-	{
-		Datum		val = SysCacheGetAttr(STATRELID, tuple,
-										  Anum_pg_statistic_staloval,
-										  &isnull);
+	ReleaseSysCache(reltup);
 
-		if (isnull)
-		{
-			elog(DEBUG, "getattstatistics: staloval is null");
-			*loval = PointerGetDatum(NULL);
-		}
-		else
-		{
-			char	   *strval = DatumGetCString(DirectFunctionCall1(textout,
-																   val));
-
-			*loval = FunctionCall3(&inputproc,
-								   CStringGetDatum(strval),
-								   ObjectIdGetDatum(typelem),
-								   Int32GetDatum(typmod));
-			pfree(strval);
-		}
-	}
+	if (ntuples <= 0.0)
+		return 100.0;			/* no data available; return a default */
 
-	if (hival)
-	{
-		Datum		val = SysCacheGetAttr(STATRELID, tuple,
-										  Anum_pg_statistic_stahival,
-										  &isnull);
+	/*
+	 * If VACUUM ANALYZE determined a scaled estimate, use it.
+	 */
+	if (stats && stats->stadistinct < 0.0)
+		return - stats->stadistinct * ntuples;
 
-		if (isnull)
-		{
-			elog(DEBUG, "getattstatistics: stahival is null");
-			*hival = PointerGetDatum(NULL);
-		}
-		else
-		{
-			char	   *strval = DatumGetCString(DirectFunctionCall1(textout,
-																   val));
-
-			*hival = FunctionCall3(&inputproc,
-								   CStringGetDatum(strval),
-								   ObjectIdGetDatum(typelem),
-								   Int32GetDatum(typmod));
-			pfree(strval);
-		}
+	/*
+	 * VACUUM ANALYZE does not compute stats for system attributes,
+	 * but some of them can reasonably be assumed unique anyway.
+	 */
+	switch (attnum)
+	{
+		case ObjectIdAttributeNumber:
+		case SelfItemPointerAttributeNumber:
+			return ntuples;
+		case TableOidAttributeNumber:
+			return 1.0;
 	}
 
-	ReleaseSysCache(tuple);
+	/*
+	 * Estimate ndistinct = ntuples if the table is small, else 100.
+	 */
+	if (ntuples < 100.0)
+		return ntuples;
 
-	return true;
+	return 100.0;
 }
 
 /*-------------------------------------------------------------------------
author	Tom Lane <tgl@sss.pgh.pa.us>	2001-05-07 00:43:27 +0000
committer	Tom Lane <tgl@sss.pgh.pa.us>	2001-05-07 00:43:27 +0000
commit	f905d65ee35b3f84b6d4433a5198af0e2e7bd090 (patch)
tree	68f5955bb1a7ecaa531cf6b3752f563943dbe079 /src/backend/utils/adt/selfuncs.c
parent	9583aea9d09f6b3839ede8e57f990262b24e6979 (diff)
download	postgresql-f905d65ee35b3f84b6d4433a5198af0e2e7bd090.tar.gz postgresql-f905d65ee35b3f84b6d4433a5198af0e2e7bd090.zip