aboutsummaryrefslogtreecommitdiff
path: root/src/backend/utils/adt/selfuncs.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/utils/adt/selfuncs.c')
-rw-r--r--src/backend/utils/adt/selfuncs.c400
1 files changed, 323 insertions, 77 deletions
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 7d5609d447e..d484221232c 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -15,7 +15,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.251 2008/08/14 18:47:59 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.252 2008/08/16 00:01:36 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -55,19 +55,34 @@
*
* float8 oprrest (internal, oid, internal, int4);
*
+ * The result is a selectivity, that is, a fraction (0 to 1) of the rows
+ * of the relation that are expected to produce a TRUE result for the
+ * given operator.
+ *
* The call convention for a join estimator (oprjoin function) is similar
- * except that varRelid is not needed, and instead the join type is
+ * except that varRelid is not needed, and instead join information is
* supplied:
*
* Selectivity oprjoin (PlannerInfo *root,
* Oid operator,
* List *args,
- * JoinType jointype);
+ * JoinType jointype,
+ * SpecialJoinInfo *sjinfo);
+ *
+ * float8 oprjoin (internal, oid, internal, int2, internal);
*
- * float8 oprjoin (internal, oid, internal, int2);
+ * (Before Postgres 8.4, join estimators had only the first four of these
+ * parameters. That signature is still allowed, but deprecated.) The
+ * relationship between jointype and sjinfo is explained in the comments for
+ * clause_selectivity() --- the short version is that jointype is usually
+ * best ignored in favor of examining sjinfo.
*
- * (We deliberately make the SQL signature different to facilitate
- * catching errors.)
+ * Join selectivity for regular inner and outer joins is defined as the
+ * fraction (0 to 1) of the cross product of the relations that is expected
+ * to produce a TRUE result for the given operator. For both semi and anti
+ * joins, however, the selectivity is defined as the fraction of the left-hand
+ * side relation's rows that are expected to have a match (ie, at least one
+ * row with a TRUE result) in the right-hand side.
*----------
*/
@@ -113,6 +128,10 @@ static double var_eq_non_const(VariableStatData *vardata, Oid operator,
static double ineq_histogram_selectivity(VariableStatData *vardata,
FmgrInfo *opproc, bool isgt,
Datum constval, Oid consttype);
+static double eqjoinsel_inner(Oid operator,
+ VariableStatData *vardata1, VariableStatData *vardata2);
+static double eqjoinsel_semi(Oid operator,
+ VariableStatData *vardata1, VariableStatData *vardata2);
static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
Datum lobound, Datum hibound, Oid boundstypid,
double *scaledlobound, double *scaledhibound);
@@ -1579,7 +1598,6 @@ scalararraysel(PlannerInfo *root,
Oid nominal_element_type;
RegProcedure oprsel;
FmgrInfo oprselproc;
- Datum selarg4;
Selectivity s1;
/*
@@ -1587,15 +1605,9 @@ scalararraysel(PlannerInfo *root,
* it hasn't got one.
*/
if (is_join_clause)
- {
oprsel = get_oprjoin(operator);
- selarg4 = Int16GetDatum(jointype);
- }
else
- {
oprsel = get_oprrest(operator);
- selarg4 = Int32GetDatum(varRelid);
- }
if (!oprsel)
return (Selectivity) 0.5;
fmgr_info(oprsel, &oprselproc);
@@ -1660,11 +1672,19 @@ scalararraysel(PlannerInfo *root,
elem_values[i],
elem_nulls[i],
elmbyval));
- s2 = DatumGetFloat8(FunctionCall4(&oprselproc,
- PointerGetDatum(root),
- ObjectIdGetDatum(operator),
- PointerGetDatum(args),
- selarg4));
+ if (is_join_clause)
+ s2 = DatumGetFloat8(FunctionCall5(&oprselproc,
+ PointerGetDatum(root),
+ ObjectIdGetDatum(operator),
+ PointerGetDatum(args),
+ Int16GetDatum(jointype),
+ PointerGetDatum(sjinfo)));
+ else
+ s2 = DatumGetFloat8(FunctionCall4(&oprselproc,
+ PointerGetDatum(root),
+ ObjectIdGetDatum(operator),
+ PointerGetDatum(args),
+ Int32GetDatum(varRelid)));
if (useOr)
s1 = s1 + s2 - s1 * s2;
else
@@ -1694,11 +1714,19 @@ scalararraysel(PlannerInfo *root,
* estimation function would really care ...
*/
args = list_make2(leftop, elem);
- s2 = DatumGetFloat8(FunctionCall4(&oprselproc,
- PointerGetDatum(root),
- ObjectIdGetDatum(operator),
- PointerGetDatum(args),
- selarg4));
+ if (is_join_clause)
+ s2 = DatumGetFloat8(FunctionCall5(&oprselproc,
+ PointerGetDatum(root),
+ ObjectIdGetDatum(operator),
+ PointerGetDatum(args),
+ Int16GetDatum(jointype),
+ PointerGetDatum(sjinfo)));
+ else
+ s2 = DatumGetFloat8(FunctionCall4(&oprselproc,
+ PointerGetDatum(root),
+ ObjectIdGetDatum(operator),
+ PointerGetDatum(args),
+ Int32GetDatum(varRelid)));
if (useOr)
s1 = s1 + s2 - s1 * s2;
else
@@ -1721,11 +1749,19 @@ scalararraysel(PlannerInfo *root,
dummyexpr->typeId = nominal_element_type;
dummyexpr->typeMod = -1;
args = list_make2(leftop, dummyexpr);
- s2 = DatumGetFloat8(FunctionCall4(&oprselproc,
- PointerGetDatum(root),
- ObjectIdGetDatum(operator),
- PointerGetDatum(args),
- selarg4));
+ if (is_join_clause)
+ s2 = DatumGetFloat8(FunctionCall5(&oprselproc,
+ PointerGetDatum(root),
+ ObjectIdGetDatum(operator),
+ PointerGetDatum(args),
+ Int16GetDatum(jointype),
+ PointerGetDatum(sjinfo)));
+ else
+ s2 = DatumGetFloat8(FunctionCall4(&oprselproc,
+ PointerGetDatum(root),
+ ObjectIdGetDatum(operator),
+ PointerGetDatum(args),
+ Int32GetDatum(varRelid)));
s1 = useOr ? 0.0 : 1.0;
/*
@@ -1803,13 +1839,24 @@ rowcomparesel(PlannerInfo *root,
/* Build equivalent arg list for single operator */
opargs = list_make2(linitial(clause->largs), linitial(clause->rargs));
- /* Decide if it's a join clause, same as for OpExpr */
+ /*
+ * Decide if it's a join clause. This should match clausesel.c's
+ * treat_as_join_clause(), except that we intentionally consider only
+ * the leading columns and not the rest of the clause.
+ */
if (varRelid != 0)
{
/*
- * If we are considering a nestloop join then all clauses are
- * restriction clauses, since we are only interested in the one
- * relation.
+ * Caller is forcing restriction mode (eg, because we are examining
+ * an inner indexscan qual).
+ */
+ is_join_clause = false;
+ }
+ else if (sjinfo == NULL)
+ {
+ /*
+ * It must be a restriction clause, since it's being evaluated at
+ * a scan node.
*/
is_join_clause = false;
}
@@ -1817,7 +1864,6 @@ rowcomparesel(PlannerInfo *root,
{
/*
* Otherwise, it's a join if there's more than one relation used.
- * Notice we ignore the low-order columns here.
*/
is_join_clause = (NumRelids((Node *) opargs) > 1);
}
@@ -1827,7 +1873,8 @@ rowcomparesel(PlannerInfo *root,
/* Estimate selectivity for a join clause. */
s1 = join_selectivity(root, opno,
opargs,
- jointype);
+ jointype,
+ sjinfo);
}
else
{
@@ -1849,10 +1896,60 @@ eqjoinsel(PG_FUNCTION_ARGS)
PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
Oid operator = PG_GETARG_OID(1);
List *args = (List *) PG_GETARG_POINTER(2);
+#ifdef NOT_USED
JoinType jointype = (JoinType) PG_GETARG_INT16(3);
+#endif
+ SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4);
double selec;
VariableStatData vardata1;
VariableStatData vardata2;
+ bool join_is_reversed;
+
+ get_join_variables(root, args, sjinfo,
+ &vardata1, &vardata2, &join_is_reversed);
+
+ switch (sjinfo->jointype)
+ {
+ case JOIN_INNER:
+ case JOIN_LEFT:
+ case JOIN_FULL:
+ selec = eqjoinsel_inner(operator, &vardata1, &vardata2);
+ break;
+ case JOIN_SEMI:
+ case JOIN_ANTI:
+ if (!join_is_reversed)
+ selec = eqjoinsel_semi(operator, &vardata1, &vardata2);
+ else
+ selec = eqjoinsel_semi(get_commutator(operator),
+ &vardata2, &vardata1);
+ break;
+ default:
+ /* other values not expected here */
+ elog(ERROR, "unrecognized join type: %d",
+ (int) sjinfo->jointype);
+ selec = 0; /* keep compiler quiet */
+ break;
+ }
+
+ ReleaseVariableStats(vardata1);
+ ReleaseVariableStats(vardata2);
+
+ CLAMP_PROBABILITY(selec);
+
+ PG_RETURN_FLOAT8((float8) selec);
+}
+
+/*
+ * eqjoinsel_inner --- eqjoinsel for normal inner join
+ *
+ * We also use this for LEFT/FULL outer joins; it's not presently clear
+ * that it's worth trying to distinguish them here.
+ */
+static double
+eqjoinsel_inner(Oid operator,
+ VariableStatData *vardata1, VariableStatData *vardata2)
+{
+ double selec;
double nd1;
double nd2;
Form_pg_statistic stats1 = NULL;
@@ -1868,29 +1965,27 @@ eqjoinsel(PG_FUNCTION_ARGS)
float4 *numbers2 = NULL;
int nnumbers2 = 0;
- get_join_variables(root, args, &vardata1, &vardata2);
-
- nd1 = get_variable_numdistinct(&vardata1);
- nd2 = get_variable_numdistinct(&vardata2);
+ nd1 = get_variable_numdistinct(vardata1);
+ nd2 = get_variable_numdistinct(vardata2);
- if (HeapTupleIsValid(vardata1.statsTuple))
+ if (HeapTupleIsValid(vardata1->statsTuple))
{
- stats1 = (Form_pg_statistic) GETSTRUCT(vardata1.statsTuple);
- have_mcvs1 = get_attstatsslot(vardata1.statsTuple,
- vardata1.atttype,
- vardata1.atttypmod,
+ stats1 = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple);
+ have_mcvs1 = get_attstatsslot(vardata1->statsTuple,
+ vardata1->atttype,
+ vardata1->atttypmod,
STATISTIC_KIND_MCV,
InvalidOid,
&values1, &nvalues1,
&numbers1, &nnumbers1);
}
- if (HeapTupleIsValid(vardata2.statsTuple))
+ if (HeapTupleIsValid(vardata2->statsTuple))
{
- stats2 = (Form_pg_statistic) GETSTRUCT(vardata2.statsTuple);
- have_mcvs2 = get_attstatsslot(vardata2.statsTuple,
- vardata2.atttype,
- vardata2.atttypmod,
+ stats2 = (Form_pg_statistic) GETSTRUCT(vardata2->statsTuple);
+ have_mcvs2 = get_attstatsslot(vardata2->statsTuple,
+ vardata2->atttype,
+ vardata2->atttypmod,
STATISTIC_KIND_MCV,
InvalidOid,
&values2, &nvalues2,
@@ -1933,25 +2028,6 @@ eqjoinsel(PG_FUNCTION_ARGS)
hasmatch2 = (bool *) palloc0(nvalues2 * sizeof(bool));
/*
- * If we are doing any variant of JOIN_SEMI, pretend all the values of
- * the righthand relation are unique (ie, act as if it's been
- * DISTINCT'd).
- *
- * NOTE: it would be dangerous to try to be smart about JOIN_LEFT or
- * JOIN_RIGHT here, because we do not have enough information to
- * determine which var is really on which side of the join. Perhaps
- * someday we should pass in more information.
- */
- if (jointype == JOIN_SEMI)
- {
- float4 oneovern = 1.0 / nd2;
-
- for (i = 0; i < nvalues2; i++)
- numbers2[i] = oneovern;
- nullfrac2 = oneovern;
- }
-
- /*
* Note we assume that each MCV will match at most one member of the
* other MCV list. If the operator isn't really equality, there could
* be multiple matches --- but we don't look for them, both for speed
@@ -2075,18 +2151,170 @@ eqjoinsel(PG_FUNCTION_ARGS)
}
if (have_mcvs1)
- free_attstatsslot(vardata1.atttype, values1, nvalues1,
+ free_attstatsslot(vardata1->atttype, values1, nvalues1,
numbers1, nnumbers1);
if (have_mcvs2)
- free_attstatsslot(vardata2.atttype, values2, nvalues2,
+ free_attstatsslot(vardata2->atttype, values2, nvalues2,
numbers2, nnumbers2);
- ReleaseVariableStats(vardata1);
- ReleaseVariableStats(vardata2);
+ return selec;
+}
- CLAMP_PROBABILITY(selec);
+/*
+ * eqjoinsel_semi --- eqjoinsel for semi join
+ *
+ * (Also used for anti join, which we are supposed to estimate the same way.)
+ * Caller has ensured that vardata1 is the LHS variable.
+ */
+static double
+eqjoinsel_semi(Oid operator,
+ VariableStatData *vardata1, VariableStatData *vardata2)
+{
+ double selec;
+ double nd1;
+ double nd2;
+ Form_pg_statistic stats1 = NULL;
+ Form_pg_statistic stats2 = NULL;
+ bool have_mcvs1 = false;
+ Datum *values1 = NULL;
+ int nvalues1 = 0;
+ float4 *numbers1 = NULL;
+ int nnumbers1 = 0;
+ bool have_mcvs2 = false;
+ Datum *values2 = NULL;
+ int nvalues2 = 0;
+ float4 *numbers2 = NULL;
+ int nnumbers2 = 0;
- PG_RETURN_FLOAT8((float8) selec);
+ nd1 = get_variable_numdistinct(vardata1);
+ nd2 = get_variable_numdistinct(vardata2);
+
+ if (HeapTupleIsValid(vardata1->statsTuple))
+ {
+ stats1 = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple);
+ have_mcvs1 = get_attstatsslot(vardata1->statsTuple,
+ vardata1->atttype,
+ vardata1->atttypmod,
+ STATISTIC_KIND_MCV,
+ InvalidOid,
+ &values1, &nvalues1,
+ &numbers1, &nnumbers1);
+ }
+
+ if (HeapTupleIsValid(vardata2->statsTuple))
+ {
+ stats2 = (Form_pg_statistic) GETSTRUCT(vardata2->statsTuple);
+ have_mcvs2 = get_attstatsslot(vardata2->statsTuple,
+ vardata2->atttype,
+ vardata2->atttypmod,
+ STATISTIC_KIND_MCV,
+ InvalidOid,
+ &values2, &nvalues2,
+ &numbers2, &nnumbers2);
+ }
+
+ if (have_mcvs1 && have_mcvs2 && OidIsValid(operator))
+ {
+ /*
+ * We have most-common-value lists for both relations. Run through
+ * the lists to see which MCVs actually join to each other with the
+ * given operator. This allows us to determine the exact join
+ * selectivity for the portion of the relations represented by the MCV
+ * lists. We still have to estimate for the remaining population, but
+ * in a skewed distribution this gives us a big leg up in accuracy.
+ */
+ FmgrInfo eqproc;
+ bool *hasmatch1;
+ bool *hasmatch2;
+ double nullfrac1 = stats1->stanullfrac;
+ double matchfreq1;
+ int i,
+ nmatches;
+
+ fmgr_info(get_opcode(operator), &eqproc);
+ hasmatch1 = (bool *) palloc0(nvalues1 * sizeof(bool));
+ hasmatch2 = (bool *) palloc0(nvalues2 * sizeof(bool));
+
+ /*
+ * Note we assume that each MCV will match at most one member of the
+ * other MCV list. If the operator isn't really equality, there could
+ * be multiple matches --- but we don't look for them, both for speed
+ * and because the math wouldn't add up...
+ */
+ nmatches = 0;
+ for (i = 0; i < nvalues1; i++)
+ {
+ int j;
+
+ for (j = 0; j < nvalues2; j++)
+ {
+ if (hasmatch2[j])
+ continue;
+ if (DatumGetBool(FunctionCall2(&eqproc,
+ values1[i],
+ values2[j])))
+ {
+ hasmatch1[i] = hasmatch2[j] = true;
+ nmatches++;
+ break;
+ }
+ }
+ }
+ /* Sum up frequencies of matched MCVs */
+ matchfreq1 = 0.0;
+ for (i = 0; i < nvalues1; i++)
+ {
+ if (hasmatch1[i])
+ matchfreq1 += numbers1[i];
+ }
+ CLAMP_PROBABILITY(matchfreq1);
+ pfree(hasmatch1);
+ pfree(hasmatch2);
+
+ /*
+ * Now we need to estimate the fraction of relation 1 that has at
+ * least one join partner. We know for certain that the matched
+ * MCVs do, so that gives us a lower bound, but we're really in the
+ * dark about everything else. Our crude approach is: if nd1 <= nd2
+ * then assume all non-null rel1 rows have join partners, else assume
+ * for the uncertain rows that a fraction nd2/nd1 have join partners.
+ * We can discount the known-matched MCVs from the distinct-values
+ * counts before doing the division.
+ */
+ nd1 -= nmatches;
+ nd2 -= nmatches;
+ if (nd1 <= nd2 || nd2 <= 0)
+ selec = Max(matchfreq1, 1.0 - nullfrac1);
+ else
+ {
+ double uncertain = 1.0 - matchfreq1 - nullfrac1;
+
+ CLAMP_PROBABILITY(uncertain);
+ selec = matchfreq1 + (nd2/nd1) * uncertain;
+ }
+ }
+ else
+ {
+ /*
+ * Without MCV lists for both sides, we can only use the heuristic
+ * about nd1 vs nd2.
+ */
+ double nullfrac1 = stats1 ? stats1->stanullfrac : 0.0;
+
+ if (nd1 <= nd2 || nd2 <= 0)
+ selec = 1.0 - nullfrac1;
+ else
+ selec = (nd2/nd1) * (1.0 - nullfrac1);
+ }
+
+ if (have_mcvs1)
+ free_attstatsslot(vardata1->atttype, values1, nvalues1,
+ numbers1, nnumbers1);
+ if (have_mcvs2)
+ free_attstatsslot(vardata2->atttype, values2, nvalues2,
+ numbers2, nnumbers2);
+
+ return selec;
}
/*
@@ -2099,6 +2327,7 @@ neqjoinsel(PG_FUNCTION_ARGS)
Oid operator = PG_GETARG_OID(1);
List *args = (List *) PG_GETARG_POINTER(2);
JoinType jointype = (JoinType) PG_GETARG_INT16(3);
+ SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4);
Oid eqop;
float8 result;
@@ -2109,11 +2338,12 @@ neqjoinsel(PG_FUNCTION_ARGS)
eqop = get_negator(operator);
if (eqop)
{
- result = DatumGetFloat8(DirectFunctionCall4(eqjoinsel,
+ result = DatumGetFloat8(DirectFunctionCall5(eqjoinsel,
PointerGetDatum(root),
ObjectIdGetDatum(eqop),
PointerGetDatum(args),
- Int16GetDatum(jointype)));
+ Int16GetDatum(jointype),
+ PointerGetDatum(sjinfo)));
}
else
{
@@ -3661,10 +3891,17 @@ get_restriction_variable(PlannerInfo *root, List *args, int varRelid,
/*
* get_join_variables
* Apply examine_variable() to each side of a join clause.
+ * Also, attempt to identify whether the join clause has the same
+ * or reversed sense compared to the SpecialJoinInfo.
+ *
+ * We consider the join clause "normal" if it is "lhs_var OP rhs_var",
+ * or "reversed" if it is "rhs_var OP lhs_var". In complicated cases
+ * where we can't tell for sure, we default to assuming it's normal.
*/
void
-get_join_variables(PlannerInfo *root, List *args,
- VariableStatData *vardata1, VariableStatData *vardata2)
+get_join_variables(PlannerInfo *root, List *args, SpecialJoinInfo *sjinfo,
+ VariableStatData *vardata1, VariableStatData *vardata2,
+ bool *join_is_reversed)
{
Node *left,
*right;
@@ -3677,6 +3914,15 @@ get_join_variables(PlannerInfo *root, List *args,
examine_variable(root, left, 0, vardata1);
examine_variable(root, right, 0, vardata2);
+
+ if (vardata1->rel &&
+ bms_is_subset(vardata1->rel->relids, sjinfo->syn_righthand))
+ *join_is_reversed = true; /* var1 is on RHS */
+ else if (vardata2->rel &&
+ bms_is_subset(vardata2->rel->relids, sjinfo->syn_lefthand))
+ *join_is_reversed = true; /* var2 is on LHS */
+ else
+ *join_is_reversed = false;
}
/*