aboutsummaryrefslogtreecommitdiff
path: root/src/backend/utils/adt/selfuncs.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/utils/adt/selfuncs.c')
-rw-r--r--src/backend/utils/adt/selfuncs.c923
1 files changed, 530 insertions, 393 deletions
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 1fe0afb0a35..41ba82db7b5 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -15,7 +15,7 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.87 2001/03/23 04:49:54 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.88 2001/05/07 00:43:23 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -57,9 +57,6 @@
/* default selectivity estimate for pattern-match operators such as LIKE */
#define DEFAULT_MATCH_SEL 0.01
-/* "fudge factor" for estimating frequency of not-most-common values */
-#define NOT_MOST_COMMON_RATIO 0.1
-
static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
Datum lobound, Datum hibound, Oid boundstypid,
double *scaledlobound, double *scaledhibound);
@@ -75,17 +72,9 @@ static double convert_one_string_to_scalar(unsigned char *value,
static unsigned char *convert_string_datum(Datum value, Oid typid);
static double convert_timevalue_to_scalar(Datum value, Oid typid);
static void getattproperties(Oid relid, AttrNumber attnum,
- Oid *typid,
- int *typlen,
- bool *typbyval,
- int32 *typmod);
-static bool getattstatistics(Oid relid, AttrNumber attnum,
- Oid typid, int32 typmod,
- double *nullfrac,
- double *commonfrac,
- Datum *commonval,
- Datum *loval,
- Datum *hival);
+ Oid *typid, int32 *typmod);
+static double get_att_numdistinct(Oid relid, AttrNumber attnum, Oid typid,
+ Form_pg_statistic stats);
static Selectivity prefix_selectivity(char *prefix,
Oid relid,
AttrNumber attno,
@@ -115,134 +104,173 @@ eqsel(PG_FUNCTION_ARGS)
AttrNumber attno = PG_GETARG_INT16(2);
Datum value = PG_GETARG_DATUM(3);
int32 flag = PG_GETARG_INT32(4);
- float8 result;
-
- if (NONVALUE(attno) || NONVALUE(relid))
- result = DEFAULT_EQ_SEL;
- else
+ Oid typid;
+ int32 typmod;
+ HeapTuple statsTuple;
+ Datum *values;
+ int nvalues;
+ float4 *numbers;
+ int nnumbers;
+ double selec;
+
+ if (NONVALUE(relid) || NONVALUE(attno))
+ PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
+
+ /* get info about the attribute */
+ getattproperties(relid, attno, &typid, &typmod);
+
+ /* get stats for the attribute, if available */
+ statsTuple = SearchSysCache(STATRELATT,
+ ObjectIdGetDatum(relid),
+ Int16GetDatum(attno),
+ 0, 0);
+ if (HeapTupleIsValid(statsTuple))
{
- Oid typid;
- int typlen;
- bool typbyval;
- int32 typmod;
- double nullfrac;
- double commonfrac;
- Datum commonval;
- double selec;
-
- /* get info about the attribute */
- getattproperties(relid, attno,
- &typid, &typlen, &typbyval, &typmod);
-
- /* get stats for the attribute, if available */
- if (getattstatistics(relid, attno, typid, typmod,
- &nullfrac, &commonfrac, &commonval,
- NULL, NULL))
- {
- if (flag & SEL_CONSTANT)
- {
+ Form_pg_statistic stats;
- /*
- * Is the constant "=" to the column's most common value?
- * (Although the operator may not really be "=", we will
- * assume that seeing whether it returns TRUE for the most
- * common value is useful information. If you don't like
- * it, maybe you shouldn't be using eqsel for your
- * operator...)
- */
- RegProcedure eqproc = get_opcode(opid);
- bool mostcommon;
+ stats = (Form_pg_statistic) GETSTRUCT(statsTuple);
- if (eqproc == (RegProcedure) NULL)
- elog(ERROR, "eqsel: no procedure for operator %u",
- opid);
+ if (flag & SEL_CONSTANT)
+ {
+ bool match = false;
+ int i;
- /* be careful to apply operator right way 'round */
- if (flag & SEL_RIGHT)
- mostcommon = DatumGetBool(OidFunctionCall2(eqproc,
- commonval,
- value));
- else
- mostcommon = DatumGetBool(OidFunctionCall2(eqproc,
- value,
- commonval));
+ /*
+ * Is the constant "=" to any of the column's most common
+ * values? (Although the given operator may not really be
+ * "=", we will assume that seeing whether it returns TRUE
+ * is an appropriate test. If you don't like this, maybe you
+ * shouldn't be using eqsel for your operator...)
+ */
+ if (get_attstatsslot(statsTuple, typid, typmod,
+ STATISTIC_KIND_MCV, InvalidOid,
+ &values, &nvalues,
+ &numbers, &nnumbers))
+ {
+ FmgrInfo eqproc;
- if (mostcommon)
- {
+ fmgr_info(get_opcode(opid), &eqproc);
- /*
- * Constant is "=" to the most common value. We know
- * selectivity exactly (or as exactly as VACUUM could
- * calculate it, anyway).
- */
- selec = commonfrac;
- }
- else
+ for (i = 0; i < nvalues; i++)
{
-
- /*
- * Comparison is against a constant that is neither
- * the most common value nor null. Its selectivity
- * cannot be more than this:
- */
- selec = 1.0 - commonfrac - nullfrac;
- if (selec > commonfrac)
- selec = commonfrac;
-
- /*
- * and in fact it's probably less, so we should apply
- * a fudge factor. The only case where we don't is
- * for a boolean column, where indeed we have
- * estimated the less-common value's frequency
- * exactly!
- */
- if (typid != BOOLOID)
- selec *= NOT_MOST_COMMON_RATIO;
+ /* be careful to apply operator right way 'round */
+ if (flag & SEL_RIGHT)
+ match = DatumGetBool(FunctionCall2(&eqproc,
+ values[i],
+ value));
+ else
+ match = DatumGetBool(FunctionCall2(&eqproc,
+ value,
+ values[i]));
+ if (match)
+ break;
}
}
else
{
+ /* no most-common-value info available */
+ values = NULL;
+ numbers = NULL;
+ i = nvalues = nnumbers = 0;
+ }
+ if (match)
+ {
+ /*
+ * Constant is "=" to this common value. We know
+ * selectivity exactly (or as exactly as VACUUM
+ * could calculate it, anyway).
+ */
+ selec = numbers[i];
+ }
+ else
+ {
/*
- * Search is for a value that we do not know a priori, but
- * we will assume it is not NULL. Selectivity cannot be
- * more than this:
+ * Comparison is against a constant that is neither
+ * NULL nor any of the common values. Its selectivity
+ * cannot be more than this:
*/
- selec = 1.0 - nullfrac;
- if (selec > commonfrac)
- selec = commonfrac;
+ double sumcommon = 0.0;
+ double otherdistinct;
+ for (i = 0; i < nnumbers; i++)
+ sumcommon += numbers[i];
+ selec = 1.0 - sumcommon - stats->stanullfrac;
+ /*
+ * and in fact it's probably a good deal less.
+ * We approximate that all the not-common values
+ * share this remaining fraction equally, so we
+ * divide by the number of other distinct values.
+ */
+ otherdistinct = get_att_numdistinct(relid, attno,
+ typid, stats)
+ - nnumbers;
+ if (otherdistinct > 1)
+ selec /= otherdistinct;
/*
- * and in fact it's probably less, so apply a fudge
- * factor.
+ * Another cross-check: selectivity shouldn't be
+ * estimated as more than the least common
+ * "most common value".
*/
- selec *= NOT_MOST_COMMON_RATIO;
+ if (nnumbers > 0 && selec > numbers[nnumbers-1])
+ selec = numbers[nnumbers-1];
}
- /* result should be in range, but make sure... */
- if (selec < 0.0)
- selec = 0.0;
- else if (selec > 1.0)
- selec = 1.0;
-
- if (!typbyval)
- pfree(DatumGetPointer(commonval));
+ free_attstatsslot(typid, values, nvalues, numbers, nnumbers);
}
else
{
+ double ndistinct;
/*
- * No VACUUM ANALYZE stats available, so make a guess using
- * the dispersion stat (if we have that, which is unlikely for
- * a normal attribute; but for a system attribute we may be
- * able to estimate it).
+ * Search is for a value that we do not know a priori, but
+ * we will assume it is not NULL. Estimate the selectivity
+ * as non-null fraction divided by number of distinct values,
+ * so that we get a result averaged over all possible values
+ * whether common or uncommon. (Essentially, we are assuming
+ * that the not-yet-known comparison value is equally likely
+ * to be any of the possible values, regardless of their
+ * frequency in the table. Is that a good idea?)
+ */
+ selec = 1.0 - stats->stanullfrac;
+ ndistinct = get_att_numdistinct(relid, attno, typid, stats);
+ if (ndistinct > 1)
+ selec /= ndistinct;
+ /*
+ * Cross-check: selectivity should never be
+ * estimated as more than the most common value's.
*/
- selec = get_attdispersion(relid, attno, 0.01);
+ if (get_attstatsslot(statsTuple, typid, typmod,
+ STATISTIC_KIND_MCV, InvalidOid,
+ NULL, NULL,
+ &numbers, &nnumbers))
+ {
+ if (nnumbers > 0 && selec > numbers[0])
+ selec = numbers[0];
+ free_attstatsslot(typid, NULL, 0, numbers, nnumbers);
+ }
}
- result = (float8) selec;
+ ReleaseSysCache(statsTuple);
}
- PG_RETURN_FLOAT8(result);
+ else
+ {
+ /*
+ * No VACUUM ANALYZE stats available, so make a guess using
+ * estimated number of distinct values and assuming they are
+ * equally common. (The guess is unlikely to be very good,
+ * but we do know a few special cases.)
+ */
+ selec = 1.0 / get_att_numdistinct(relid, attno, typid, NULL);
+ }
+
+ /* result should be in range, but make sure... */
+ if (selec < 0.0)
+ selec = 0.0;
+ else if (selec > 1.0)
+ selec = 1.0;
+
+ PG_RETURN_FLOAT8((float8) selec);
}
/*
@@ -301,117 +329,263 @@ scalarltsel(PG_FUNCTION_ARGS)
AttrNumber attno = PG_GETARG_INT16(2);
Datum value = PG_GETARG_DATUM(3);
int32 flag = PG_GETARG_INT32(4);
- float8 result;
+ bool isgt;
+ HeapTuple oprTuple;
+ HeapTuple statsTuple;
+ Form_pg_statistic stats;
+ Oid contype;
+ FmgrInfo opproc;
+ Oid typid;
+ int32 typmod;
+ Datum *values;
+ int nvalues;
+ float4 *numbers;
+ int nnumbers;
+ double mcv_selec,
+ hist_selec,
+ sumcommon;
+ double selec;
+ int i;
+
+ if (NONVALUE(relid) || NONVALUE(attno))
+ PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
+
+ /* Can't do anything useful if no constant to compare against, either */
+ if (!(flag & SEL_CONSTANT))
+ PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
- if (!(flag & SEL_CONSTANT) || NONVALUE(attno) || NONVALUE(relid))
- result = DEFAULT_INEQ_SEL;
+ /*
+ * Force the constant to be on the right to simplify later logic.
+ * This means that we may be dealing with either "<" or ">" cases.
+ */
+ if (flag & SEL_RIGHT)
+ {
+ /* we have x < const */
+ isgt = false;
+ }
else
{
- HeapTuple oprtuple;
- Oid ltype,
- rtype,
- contype;
- Oid typid;
- int typlen;
- bool typbyval;
- int32 typmod;
- Datum hival,
- loval;
- double val,
- high,
- low,
- numerator,
- denominator;
-
- /*
- * Get left and right datatypes of the operator so we know what
- * type the constant is.
- */
- oprtuple = SearchSysCache(OPEROID,
- ObjectIdGetDatum(opid),
- 0, 0, 0);
- if (!HeapTupleIsValid(oprtuple))
- elog(ERROR, "scalarltsel: no tuple for operator %u", opid);
- ltype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprleft;
- rtype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprright;
- contype = (flag & SEL_RIGHT) ? rtype : ltype;
- ReleaseSysCache(oprtuple);
-
- /* Now get info and stats about the attribute */
- getattproperties(relid, attno,
- &typid, &typlen, &typbyval, &typmod);
-
- if (!getattstatistics(relid, attno, typid, typmod,
- NULL, NULL, NULL,
- &loval, &hival))
+ /* we have const < x, commute to make x > const */
+ opid = get_commutator(opid);
+ if (!opid)
{
- /* no stats available, so default result */
+ /* Use default selectivity (should we raise an error instead?) */
PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
}
+ isgt = true;
+ }
- /* Convert the values to a uniform comparison scale. */
- if (!convert_to_scalar(value, contype, &val,
- loval, hival, typid,
- &low, &high))
- {
+ /*
+ * The constant might not be the same datatype as the column;
+ * look at the operator's input types to find out what it is.
+ * Also set up to be able to call the operator's execution proc.
+ */
+ oprTuple = SearchSysCache(OPEROID,
+ ObjectIdGetDatum(opid),
+ 0, 0, 0);
+ if (!HeapTupleIsValid(oprTuple))
+ elog(ERROR, "scalarltsel: no tuple for operator %u", opid);
+ contype = ((Form_pg_operator) GETSTRUCT(oprTuple))->oprright;
+ fmgr_info(((Form_pg_operator) GETSTRUCT(oprTuple))->oprcode, &opproc);
+ ReleaseSysCache(oprTuple);
+
+ /* Now get info and stats about the attribute */
+ getattproperties(relid, attno, &typid, &typmod);
+
+ statsTuple = SearchSysCache(STATRELATT,
+ ObjectIdGetDatum(relid),
+ Int16GetDatum(attno),
+ 0, 0);
+ if (!HeapTupleIsValid(statsTuple))
+ {
+ /* no stats available, so default result */
+ PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
+ }
+ stats = (Form_pg_statistic) GETSTRUCT(statsTuple);
- /*
- * Ideally we'd produce an error here, on the grounds that the
- * given operator shouldn't have scalarltsel registered as its
- * selectivity func unless we can deal with its operand types.
- * But currently, all manner of stuff is invoking scalarltsel,
- * so give a default estimate until that can be fixed.
- */
- if (!typbyval)
- {
- pfree(DatumGetPointer(hival));
- pfree(DatumGetPointer(loval));
- }
- PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
- }
+ /*
+ * If we have most-common-values info, add up the fractions of the
+ * MCV entries that satisfy MCV OP CONST. These fractions contribute
+ * directly to the result selectivity. Also add up the total fraction
+ * represented by MCV entries.
+ */
+ mcv_selec = 0.0;
+ sumcommon = 0.0;
- /* release temp storage if needed */
- if (!typbyval)
+ if (get_attstatsslot(statsTuple, typid, typmod,
+ STATISTIC_KIND_MCV, InvalidOid,
+ &values, &nvalues,
+ &numbers, &nnumbers))
+ {
+ for (i = 0; i < nvalues; i++)
{
- pfree(DatumGetPointer(hival));
- pfree(DatumGetPointer(loval));
+ if (DatumGetBool(FunctionCall2(&opproc,
+ values[i],
+ value)))
+ mcv_selec += numbers[i];
+ sumcommon += numbers[i];
}
+ free_attstatsslot(typid, values, nvalues, numbers, nnumbers);
+ }
+
+ /*
+ * If there is a histogram, determine which bin the constant falls in,
+ * and compute the resulting contribution to selectivity.
+ *
+ * Someday, VACUUM might store more than one histogram per rel/att,
+ * corresponding to more than one possible sort ordering defined for
+ * the column type. However, to make that work we will need to figure
+ * out which staop to search for --- it's not necessarily the one we
+ * have at hand! (For example, we might have a '<=' operator rather
+ * than the '<' operator that will appear in staop.) For now, assume
+ * that whatever appears in pg_statistic is sorted the same way our
+ * operator sorts.
+ */
+ hist_selec = 0.0;
- if (high <= low)
+ if (get_attstatsslot(statsTuple, typid, typmod,
+ STATISTIC_KIND_HISTOGRAM, InvalidOid,
+ &values, &nvalues,
+ NULL, NULL))
+ {
+ if (nvalues > 1)
{
+ double histfrac;
+ bool ltcmp;
+
+ ltcmp = DatumGetBool(FunctionCall2(&opproc,
+ values[0],
+ value));
+ if (isgt)
+ ltcmp = !ltcmp;
+ if (!ltcmp)
+ {
+ /* Constant is below lower histogram boundary. */
+ histfrac = 0.0;
+ }
+ else
+ {
+ /*
+ * Scan to find proper location. This could be made faster
+ * by using a binary-search method, but it's probably not
+ * worth the trouble for typical histogram sizes.
+ */
+ for (i = 1; i < nvalues; i++)
+ {
+ ltcmp = DatumGetBool(FunctionCall2(&opproc,
+ values[i],
+ value));
+ if (isgt)
+ ltcmp = !ltcmp;
+ if (!ltcmp)
+ break;
+ }
+ if (i >= nvalues)
+ {
+ /* Constant is above upper histogram boundary. */
+ histfrac = 1.0;
+ }
+ else
+ {
+ double val,
+ high,
+ low;
+ double binfrac;
+ /*
+ * We have values[i-1] < constant < values[i].
+ *
+ * Convert the constant and the two nearest bin boundary
+ * values to a uniform comparison scale, and do a linear
+ * interpolation within this bin.
+ */
+ if (convert_to_scalar(value, contype, &val,
+ values[i-1], values[i], typid,
+ &low, &high))
+ {
+ if (high <= low)
+ {
+ /* cope if bin boundaries appear identical */
+ binfrac = 0.5;
+ }
+ else if (val <= low)
+ binfrac = 0.0;
+ else if (val >= high)
+ binfrac = 1.0;
+ else
+ binfrac = (val - low) / (high - low);
+ }
+ else
+ {
+ /*
+ * Ideally we'd produce an error here, on the grounds
+ * that the given operator shouldn't have scalarltsel
+ * registered as its selectivity func unless we can
+ * deal with its operand types. But currently, all
+ * manner of stuff is invoking scalarltsel, so give a
+ * default estimate until that can be fixed.
+ */
+ binfrac = 0.5;
+ }
+ /*
+ * Now, compute the overall selectivity across the values
+ * represented by the histogram. We have i-1 full bins
+ * and binfrac partial bin below the constant.
+ */
+ histfrac = (double) (i-1) + binfrac;
+ histfrac /= (double) (nvalues - 1);
+ }
+ }
/*
- * If we trusted the stats fully, we could return a small or
- * large selec depending on which side of the single data
- * point the constant is on. But it seems better to assume
- * that the stats are wrong and return a default...
+ * Now histfrac = fraction of histogram entries below the constant.
+ *
+ * Account for "<" vs ">"
*/
- result = DEFAULT_INEQ_SEL;
- }
- else if (val < low || val > high)
- {
-
+ hist_selec = isgt ? (1.0 - histfrac) : histfrac;
/*
- * If given value is outside the statistical range, return a
- * small or large value; but not 0.0/1.0 since there is a
- * chance the stats are out of date.
+ * The histogram boundaries are only approximate to begin
+ * with, and may well be out of date anyway. Therefore,
+ * don't believe extremely small or large selectivity
+ * estimates.
*/
- if (flag & SEL_RIGHT)
- result = (val < low) ? 0.001 : 0.999;
- else
- result = (val < low) ? 0.999 : 0.001;
- }
- else
- {
- denominator = high - low;
- if (flag & SEL_RIGHT)
- numerator = val - low;
- else
- numerator = high - val;
- result = numerator / denominator;
+ if (hist_selec < 0.001)
+ hist_selec = 0.001;
+ else if (hist_selec > 0.999)
+ hist_selec = 0.999;
}
+
+ free_attstatsslot(typid, values, nvalues, NULL, 0);
}
- PG_RETURN_FLOAT8(result);
+
+ /*
+ * Now merge the results from the MCV and histogram calculations,
+ * realizing that the histogram covers only the non-null values that
+ * are not listed in MCV.
+ */
+ selec = 1.0 - stats->stanullfrac - sumcommon;
+
+ if (hist_selec > 0.0)
+ selec *= hist_selec;
+ else
+ {
+ /*
+ * If no histogram but there are values not accounted for by MCV,
+ * arbitrarily assume half of them will match.
+ */
+ selec *= 0.5;
+ }
+
+ selec += mcv_selec;
+
+ ReleaseSysCache(statsTuple);
+
+ /* result should be in range, but make sure... */
+ if (selec < 0.0)
+ selec = 0.0;
+ else if (selec > 1.0)
+ selec = 1.0;
+
+ PG_RETURN_FLOAT8((float8) selec);
}
/*
@@ -428,34 +602,25 @@ scalargtsel(PG_FUNCTION_ARGS)
Datum value = PG_GETARG_DATUM(3);
int32 flag = PG_GETARG_INT32(4);
Oid ltopid;
- float8 result;
/*
- * Compute selectivity of "<", then invert --- but only if we were
- * able to produce a non-default estimate. Note that we get the
- * negator which strictly speaking means we are looking at "<=" for
- * ">" or "<" for ">=". We assume this won't matter.
+ * Commute so that we have a "<" or "<=" operator, then apply
+ * scalarltsel.
*/
- ltopid = get_negator(opid);
- if (ltopid)
- {
- result = DatumGetFloat8(DirectFunctionCall5(scalarltsel,
- ObjectIdGetDatum(ltopid),
- ObjectIdGetDatum(relid),
- Int16GetDatum(attno),
- value,
- Int32GetDatum(flag)));
- }
- else
+ ltopid = get_commutator(opid);
+ if (!ltopid)
{
/* Use default selectivity (should we raise an error instead?) */
- result = DEFAULT_INEQ_SEL;
+ PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
}
- if (result != DEFAULT_INEQ_SEL)
- result = 1.0 - result;
-
- PG_RETURN_FLOAT8(result);
+ flag ^= SEL_RIGHT;
+ return DirectFunctionCall5(scalarltsel,
+ ObjectIdGetDatum(ltopid),
+ ObjectIdGetDatum(relid),
+ Int16GetDatum(attno),
+ value,
+ Int32GetDatum(flag));
}
/*
@@ -476,7 +641,7 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
result = DEFAULT_MATCH_SEL;
else
{
- HeapTuple oprtuple;
+ HeapTuple oprTuple;
Oid ltype,
rtype;
char *patt;
@@ -488,14 +653,14 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
* Get left and right datatypes of the operator so we know what
* type the attribute is.
*/
- oprtuple = SearchSysCache(OPEROID,
+ oprTuple = SearchSysCache(OPEROID,
ObjectIdGetDatum(opid),
0, 0, 0);
- if (!HeapTupleIsValid(oprtuple))
+ if (!HeapTupleIsValid(oprTuple))
elog(ERROR, "patternsel: no tuple for operator %u", opid);
- ltype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprleft;
- rtype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprright;
- ReleaseSysCache(oprtuple);
+ ltype = ((Form_pg_operator) GETSTRUCT(oprTuple))->oprleft;
+ rtype = ((Form_pg_operator) GETSTRUCT(oprTuple))->oprright;
+ ReleaseSysCache(oprTuple);
/* the right-hand const is type text for all supported operators */
Assert(rtype == TEXTOID);
@@ -659,42 +824,88 @@ eqjoinsel(PG_FUNCTION_ARGS)
AttrNumber attno1 = PG_GETARG_INT16(2);
Oid relid2 = PG_GETARG_OID(3);
AttrNumber attno2 = PG_GETARG_INT16(4);
- float8 result;
- float8 num1,
- num2,
- min;
bool unknown1 = NONVALUE(relid1) || NONVALUE(attno1);
bool unknown2 = NONVALUE(relid2) || NONVALUE(attno2);
+ double selec;
if (unknown1 && unknown2)
- result = DEFAULT_EQ_SEL;
+ selec = DEFAULT_EQ_SEL;
else
{
- num1 = unknown1 ? 1.0 : get_attdispersion(relid1, attno1, 0.01);
- num2 = unknown2 ? 1.0 : get_attdispersion(relid2, attno2, 0.01);
+ Oid typid1;
+ Oid typid2;
+ int32 typmod1;
+ int32 typmod2;
+ HeapTuple statsTuple1 = NULL;
+ HeapTuple statsTuple2 = NULL;
+ Form_pg_statistic stats1 = NULL;
+ Form_pg_statistic stats2 = NULL;
+ double nd1,
+ nd2;
+
+ if (unknown1)
+ {
+ nd1 = 100.0;
+ }
+ else
+ {
+ /* get info about the attribute */
+ getattproperties(relid1, attno1, &typid1, &typmod1);
+
+ /* get stats for the attribute, if available */
+ statsTuple1 = SearchSysCache(STATRELATT,
+ ObjectIdGetDatum(relid1),
+ Int16GetDatum(attno1),
+ 0, 0);
+ if (HeapTupleIsValid(statsTuple1))
+ stats1 = (Form_pg_statistic) GETSTRUCT(statsTuple1);
+
+ nd1 = get_att_numdistinct(relid1, attno1, typid1, stats1);
+ }
+
+ if (unknown2)
+ {
+ nd2 = 100.0;
+ }
+ else
+ {
+ /* get info about the attribute */
+ getattproperties(relid2, attno2, &typid2, &typmod2);
+
+ /* get stats for the attribute, if available */
+ statsTuple2 = SearchSysCache(STATRELATT,
+ ObjectIdGetDatum(relid2),
+ Int16GetDatum(attno2),
+ 0, 0);
+ if (HeapTupleIsValid(statsTuple2))
+ stats2 = (Form_pg_statistic) GETSTRUCT(statsTuple2);
+
+ nd2 = get_att_numdistinct(relid2, attno2, typid2, stats2);
+ }
/*
- * The join selectivity cannot be more than num2, since each tuple
- * in table 1 could match no more than num2 fraction of tuples in
- * table 2 (and that's only if the table-1 tuple matches the most
- * common value in table 2, so probably it's less). By the same
- * reasoning it is not more than num1. The min is therefore an
- * upper bound.
+ * Estimate the join selectivity as 1 / sqrt(nd1*nd2)
+ * (can we produce any theory for this)?
*
- * If we know the dispersion of only one side, use it; the reasoning
- * above still works.
+ * XXX possibility to do better: if both attributes have histograms
+ * then we could determine the exact join selectivity between the
+ * MCV sets, and only have to assume the join behavior of the non-MCV
+ * values. This could be a big win when the MCVs cover a large part
+ * of the population.
*
- * XXX can we make a better estimate here? Using the nullfrac
- * statistic might be helpful, for example. Assuming the operator
- * is strict (does not succeed for null inputs) then the
- * selectivity couldn't be more than (1-nullfrac1)*(1-nullfrac2),
- * which might be usefully small if there are many nulls. How
- * about applying the operator to the most common values?
+ * XXX what about nulls?
*/
- min = (num1 < num2) ? num1 : num2;
- result = min;
+ selec = 1.0 / sqrt(nd1 * nd2);
+ if (selec > 1.0)
+ selec = 1.0;
+
+ if (HeapTupleIsValid(statsTuple1))
+ ReleaseSysCache(statsTuple1);
+ if (HeapTupleIsValid(statsTuple2))
+ ReleaseSysCache(statsTuple2);
+
}
- PG_RETURN_FLOAT8(result);
+ PG_RETURN_FLOAT8((float8) selec);
}
/*
@@ -829,7 +1040,8 @@ icnlikejoinsel(PG_FUNCTION_ARGS)
* Returns "true" if successful.
*
* All numeric datatypes are simply converted to their equivalent
- * "double" values.
+ * "double" values. XXX what about NUMERIC values that are outside
+ * the range of "double"?
*
* String datatypes are converted by convert_string_to_scalar(),
* which is explained below. The reason why this routine deals with
@@ -917,7 +1129,7 @@ convert_numeric_to_scalar(Datum value, Oid typid)
{
switch (typid)
{
- case BOOLOID:
+ case BOOLOID:
return (double) DatumGetBool(value);
case INT2OID:
return (double) DatumGetInt16(value);
@@ -963,6 +1175,8 @@ convert_numeric_to_scalar(Datum value, Oid typid)
* three strings before computing the scaled values. This allows us to
* "zoom in" when we encounter a narrow data range. An example is a phone
* number database where all the values begin with the same area code.
+ * (Actually, the bounds will be adjacent histogram-bin-boundary values,
+ * so this is more likely to happen than you might think.)
*/
static void
convert_string_to_scalar(unsigned char *value,
@@ -1208,11 +1422,11 @@ convert_timevalue_to_scalar(Datum value, Oid typid)
/*
* getattproperties
* Retrieve pg_attribute properties for an attribute,
- * including type OID, type len, type byval flag, typmod.
+ * including type OID and typmod.
*/
static void
getattproperties(Oid relid, AttrNumber attnum,
- Oid *typid, int *typlen, bool *typbyval, int32 *typmod)
+ Oid *typid, int32 *typmod)
{
HeapTuple atp;
Form_pg_attribute att_tup;
@@ -1227,164 +1441,87 @@ getattproperties(Oid relid, AttrNumber attnum,
att_tup = (Form_pg_attribute) GETSTRUCT(atp);
*typid = att_tup->atttypid;
- *typlen = att_tup->attlen;
- *typbyval = att_tup->attbyval;
*typmod = att_tup->atttypmod;
ReleaseSysCache(atp);
}
/*
- * getattstatistics
- * Retrieve the pg_statistic data for an attribute.
- * Returns 'false' if no stats are available.
+ * get_att_numdistinct
*
- * Inputs:
- * 'relid' and 'attnum' are the relation and attribute number.
- * 'typid' and 'typmod' are the type and typmod of the column,
- * which the caller must already have looked up.
+ * Estimate the number of distinct values of an attribute.
*
- * Outputs:
- * The available stats are nullfrac, commonfrac, commonval, loval, hival.
- * The caller need not retrieve all five --- pass NULL pointers for the
- * unwanted values.
+ * relid, attnum: identify the attribute to examine.
+ * typid: type of attribute.
+ * stats: pg_statistic tuple for attribute, or NULL if not available.
*
- * commonval, loval, hival are returned as Datums holding the internal
- * representation of the values. (Note that these should be pfree'd
- * after use if the data type is not by-value.)
+ * XXX possible future improvement: look to see if there is a unique
+ * index on the attribute. If so, we can estimate ndistinct = ntuples.
+ * This should probably override any info from pg_statistic.
*/
-static bool
-getattstatistics(Oid relid,
- AttrNumber attnum,
- Oid typid,
- int32 typmod,
- double *nullfrac,
- double *commonfrac,
- Datum *commonval,
- Datum *loval,
- Datum *hival)
+static double
+get_att_numdistinct(Oid relid, AttrNumber attnum, Oid typid,
+ Form_pg_statistic stats)
{
- HeapTuple tuple;
- HeapTuple typeTuple;
- FmgrInfo inputproc;
- Oid typelem;
- bool isnull;
+ HeapTuple reltup;
+ double ntuples;
/*
- * We assume that there will only be one entry in pg_statistic for the
- * given rel/att, so we search WITHOUT considering the staop column.
- * Someday, VACUUM might store more than one entry per rel/att,
- * corresponding to more than one possible sort ordering defined for
- * the column type. However, to make that work we will need to figure
- * out which staop to search for --- it's not necessarily the one we
- * have at hand! (For example, we might have a '>' operator rather
- * than the '<' operator that will appear in staop.)
+ * Special-case boolean columns: presumably, two distinct values.
+ *
+ * Are there any other cases we should wire in special estimates for?
*/
- tuple = SearchSysCache(STATRELID,
- ObjectIdGetDatum(relid),
- Int16GetDatum((int16) attnum),
- 0, 0);
- if (!HeapTupleIsValid(tuple))
- {
- /* no such stats entry */
- return false;
- }
+ if (typid == BOOLOID)
+ return 2.0;
- if (nullfrac)
- *nullfrac = ((Form_pg_statistic) GETSTRUCT(tuple))->stanullfrac;
- if (commonfrac)
- *commonfrac = ((Form_pg_statistic) GETSTRUCT(tuple))->stacommonfrac;
-
- /* Get the type input proc for the column datatype */
- typeTuple = SearchSysCache(TYPEOID,
- ObjectIdGetDatum(typid),
- 0, 0, 0);
- if (!HeapTupleIsValid(typeTuple))
- elog(ERROR, "getattstatistics: Cache lookup failed for type %u",
- typid);
- fmgr_info(((Form_pg_type) GETSTRUCT(typeTuple))->typinput, &inputproc);
- typelem = ((Form_pg_type) GETSTRUCT(typeTuple))->typelem;
- ReleaseSysCache(typeTuple);
+ /*
+ * If VACUUM ANALYZE determined a fixed estimate, use it.
+ */
+ if (stats && stats->stadistinct > 0.0)
+ return stats->stadistinct;
/*
- * Values are variable-length fields, so cannot access as struct
- * fields. Must do it the hard way with SysCacheGetAttr.
+ * Otherwise we need to get the relation size.
*/
- if (commonval)
- {
- Datum val = SysCacheGetAttr(STATRELID, tuple,
- Anum_pg_statistic_stacommonval,
- &isnull);
+ reltup = SearchSysCache(RELOID,
+ ObjectIdGetDatum(relid),
+ 0, 0, 0);
+ if (!HeapTupleIsValid(reltup))
+ elog(ERROR, "get_att_numdistinct: no relation tuple %u", relid);
- if (isnull)
- {
- elog(DEBUG, "getattstatistics: stacommonval is null");
- *commonval = PointerGetDatum(NULL);
- }
- else
- {
- char *strval = DatumGetCString(DirectFunctionCall1(textout,
- val));
-
- *commonval = FunctionCall3(&inputproc,
- CStringGetDatum(strval),
- ObjectIdGetDatum(typelem),
- Int32GetDatum(typmod));
- pfree(strval);
- }
- }
+ ntuples = ((Form_pg_class) GETSTRUCT(reltup))->reltuples;
- if (loval)
- {
- Datum val = SysCacheGetAttr(STATRELID, tuple,
- Anum_pg_statistic_staloval,
- &isnull);
+ ReleaseSysCache(reltup);
- if (isnull)
- {
- elog(DEBUG, "getattstatistics: staloval is null");
- *loval = PointerGetDatum(NULL);
- }
- else
- {
- char *strval = DatumGetCString(DirectFunctionCall1(textout,
- val));
-
- *loval = FunctionCall3(&inputproc,
- CStringGetDatum(strval),
- ObjectIdGetDatum(typelem),
- Int32GetDatum(typmod));
- pfree(strval);
- }
- }
+ if (ntuples <= 0.0)
+ return 100.0; /* no data available; return a default */
- if (hival)
- {
- Datum val = SysCacheGetAttr(STATRELID, tuple,
- Anum_pg_statistic_stahival,
- &isnull);
+ /*
+ * If VACUUM ANALYZE determined a scaled estimate, use it.
+ */
+ if (stats && stats->stadistinct < 0.0)
+ return - stats->stadistinct * ntuples;
- if (isnull)
- {
- elog(DEBUG, "getattstatistics: stahival is null");
- *hival = PointerGetDatum(NULL);
- }
- else
- {
- char *strval = DatumGetCString(DirectFunctionCall1(textout,
- val));
-
- *hival = FunctionCall3(&inputproc,
- CStringGetDatum(strval),
- ObjectIdGetDatum(typelem),
- Int32GetDatum(typmod));
- pfree(strval);
- }
+ /*
+ * VACUUM ANALYZE does not compute stats for system attributes,
+ * but some of them can reasonably be assumed unique anyway.
+ */
+ switch (attnum)
+ {
+ case ObjectIdAttributeNumber:
+ case SelfItemPointerAttributeNumber:
+ return ntuples;
+ case TableOidAttributeNumber:
+ return 1.0;
}
- ReleaseSysCache(tuple);
+ /*
+ * Estimate ndistinct = ntuples if the table is small, else 100.
+ */
+ if (ntuples < 100.0)
+ return ntuples;
- return true;
+ return 100.0;
}
/*-------------------------------------------------------------------------