aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/backend/statistics/dependencies.c349
-rw-r--r--src/test/regress/expected/stats_ext.out18
-rw-r--r--src/test/regress/sql/stats_ext.sql8
3 files changed, 266 insertions, 109 deletions
diff --git a/src/backend/statistics/dependencies.c b/src/backend/statistics/dependencies.c
index 5f9b43bf7fb..ada1e78f6fd 100644
--- a/src/backend/statistics/dependencies.c
+++ b/src/backend/statistics/dependencies.c
@@ -30,6 +30,7 @@
#include "utils/fmgroids.h"
#include "utils/fmgrprotos.h"
#include "utils/lsyscache.h"
+#include "utils/selfuncs.h"
#include "utils/syscache.h"
#include "utils/typcache.h"
@@ -73,13 +74,18 @@ static double dependency_degree(int numrows, HeapTuple *rows, int k,
AttrNumber *dependency, VacAttrStats **stats, Bitmapset *attrs);
static bool dependency_is_fully_matched(MVDependency *dependency,
Bitmapset *attnums);
-static bool dependency_implies_attribute(MVDependency *dependency,
- AttrNumber attnum);
static bool dependency_is_compatible_clause(Node *clause, Index relid,
AttrNumber *attnum);
static MVDependency *find_strongest_dependency(MVDependencies **dependencies,
int ndependencies,
Bitmapset *attnums);
+static Selectivity clauselist_apply_dependencies(PlannerInfo *root, List *clauses,
+ int varRelid, JoinType jointype,
+ SpecialJoinInfo *sjinfo,
+ MVDependency **dependencies,
+ int ndependencies,
+ AttrNumber *list_attnums,
+ Bitmapset **estimatedclauses);
static void
generate_dependencies_recurse(DependencyGenerator state, int index,
@@ -614,19 +620,6 @@ dependency_is_fully_matched(MVDependency *dependency, Bitmapset *attnums)
}
/*
- * dependency_implies_attribute
- * check that the attnum matches is implied by the functional dependency
- */
-static bool
-dependency_implies_attribute(MVDependency *dependency, AttrNumber attnum)
-{
- if (attnum == dependency->attributes[dependency->nattributes - 1])
- return true;
-
- return false;
-}
-
-/*
* statext_dependencies_load
* Load the functional dependencies for the indicated pg_statistic_ext tuple
*/
@@ -986,6 +979,183 @@ find_strongest_dependency(MVDependencies **dependencies, int ndependencies,
}
/*
+ * clauselist_apply_dependencies
+ * Apply the specified functional dependencies to a list of clauses and
+ * return the estimated selecvitity of the clauses that are compatible
+ * with any of the given dependencies.
+ *
+ * This will estimate all not-already-estimated clauses that are compatible
+ * with functional dependencies, and which have an attribute mentioned by any
+ * of the given dependencies (either as an implying or implied attribute).
+ *
+ * Given (lists of) clauses on attributes (a,b) and a functional dependency
+ * (a=>b), the per-column selectivities P(a) and P(b) are notionally combined
+ * using the formula
+ *
+ * P(a,b) = f * P(a) + (1-f) * P(a) * P(b)
+ *
+ * where 'f' is the degree of dependency. This reflects the fact that we
+ * expect a fraction f of all rows to be consistent with the dependency
+ * (a=>b), and so have a selectivity of P(a), while the remaining rows are
+ * treated as independent.
+ *
+ * In practice, we use a slightly modified version of this formula, which uses
+ * a selectivity of Min(P(a), P(b)) for the dependent rows, since the result
+ * should obviously not exceed either column's individual selectivity. I.e.,
+ * we actually combine selectivities using the formula
+ *
+ * P(a,b) = f * Min(P(a), P(b)) + (1-f) * P(a) * P(b)
+ *
+ * This can make quite a difference if the specific values matching the
+ * clauses are not consistent with the functional dependency.
+ */
+static Selectivity
+clauselist_apply_dependencies(PlannerInfo *root, List *clauses,
+ int varRelid, JoinType jointype,
+ SpecialJoinInfo *sjinfo,
+ MVDependency **dependencies, int ndependencies,
+ AttrNumber *list_attnums,
+ Bitmapset **estimatedclauses)
+{
+ Bitmapset *attnums;
+ int i;
+ int j;
+ int nattrs;
+ Selectivity *attr_sel;
+ int attidx;
+ int listidx;
+ ListCell *l;
+ Selectivity s1;
+
+ /*
+ * Extract the attnums of all implying and implied attributes from all the
+ * given dependencies. Each of these attributes is expected to have at
+ * least 1 not-already-estimated compatible clause that we will estimate
+ * here.
+ */
+ attnums = NULL;
+ for (i = 0; i < ndependencies; i++)
+ {
+ for (j = 0; j < dependencies[i]->nattributes; j++)
+ {
+ AttrNumber attnum = dependencies[i]->attributes[j];
+
+ attnums = bms_add_member(attnums, attnum);
+ }
+ }
+
+ /*
+ * Compute per-column selectivity estimates for each of these attributes,
+ * and mark all the corresponding clauses as estimated.
+ */
+ nattrs = bms_num_members(attnums);
+ attr_sel = (Selectivity *) palloc(sizeof(Selectivity) * nattrs);
+
+ attidx = 0;
+ i = -1;
+ while ((i = bms_next_member(attnums, i)) >= 0)
+ {
+ List *attr_clauses = NIL;
+ Selectivity simple_sel;
+
+ listidx = -1;
+ foreach(l, clauses)
+ {
+ Node *clause = (Node *) lfirst(l);
+
+ listidx++;
+ if (list_attnums[listidx] == i)
+ {
+ attr_clauses = lappend(attr_clauses, clause);
+ *estimatedclauses = bms_add_member(*estimatedclauses, listidx);
+ }
+ }
+
+ simple_sel = clauselist_selectivity_simple(root, attr_clauses, varRelid,
+ jointype, sjinfo, NULL);
+ attr_sel[attidx++] = simple_sel;
+ }
+
+ /*
+ * Now combine these selectivities using the dependency information. For
+ * chains of dependencies such as a -> b -> c, the b -> c dependency will
+ * come before the a -> b dependency in the array, so we traverse the
+ * array backwards to ensure such chains are computed in the right order.
+ *
+ * As explained above, pairs of selectivities are combined using the
+ * formula
+ *
+ * P(a,b) = f * Min(P(a), P(b)) + (1-f) * P(a) * P(b)
+ *
+ * to ensure that the combined selectivity is never greater than either
+ * individual selectivity.
+ *
+ * Where multiple dependencies apply (e.g., a -> b -> c), we use
+ * conditional probabilities to compute the overall result as follows:
+ *
+ * P(a,b,c) = P(c|a,b) * P(a,b) = P(c|a,b) * P(b|a) * P(a)
+ *
+ * so we replace the selectivities of all implied attributes with
+ * conditional probabilities, that are conditional on all their implying
+ * attributes. The selectivities of all other non-implied attributes are
+ * left as they are.
+ */
+ for (i = ndependencies - 1; i >= 0; i--)
+ {
+ MVDependency *dependency = dependencies[i];
+ AttrNumber attnum;
+ Selectivity s2;
+ double f;
+
+ /* Selectivity of all the implying attributes */
+ s1 = 1.0;
+ for (j = 0; j < dependency->nattributes - 1; j++)
+ {
+ attnum = dependency->attributes[j];
+ attidx = bms_member_index(attnums, attnum);
+ s1 *= attr_sel[attidx];
+ }
+
+ /* Original selectivity of the implied attribute */
+ attnum = dependency->attributes[j];
+ attidx = bms_member_index(attnums, attnum);
+ s2 = attr_sel[attidx];
+
+ /*
+ * Replace s2 with the conditional probability s2 given s1, computed
+ * using the formula P(b|a) = P(a,b) / P(a), which simplifies to
+ *
+ * P(b|a) = f * Min(P(a), P(b)) / P(a) + (1-f) * P(b)
+ *
+ * where P(a) = s1, the selectivity of the implying attributes, and
+ * P(b) = s2, the selectivity of the implied attribute.
+ */
+ f = dependency->degree;
+
+ if (s1 <= s2)
+ attr_sel[attidx] = f + (1 - f) * s2;
+ else
+ attr_sel[attidx] = f * s2 / s1 + (1 - f) * s2;
+ }
+
+ /*
+ * The overall selectivity of all the clauses on all these attributes is
+ * then the product of all the original (non-implied) probabilities and
+ * the new conditional (implied) probabilities.
+ */
+ s1 = 1.0;
+ for (i = 0; i < nattrs; i++)
+ s1 *= attr_sel[i];
+
+ CLAMP_PROBABILITY(s1);
+
+ pfree(attr_sel);
+ bms_free(attnums);
+
+ return s1;
+}
+
+/*
* dependencies_clauselist_selectivity
* Return the estimated selectivity of (a subset of) the given clauses
* using functional dependency statistics, or 1.0 if no useful functional
@@ -999,15 +1169,16 @@ find_strongest_dependency(MVDependencies **dependencies, int ndependencies,
* between them, i.e. either (a=>b) or (b=>a). Assuming (a=>b) is the selected
* dependency, we then combine the per-clause selectivities using the formula
*
- * P(a,b) = P(a) * [f + (1-f)*P(b)]
+ * P(a,b) = f * P(a) + (1-f) * P(a) * P(b)
*
- * where 'f' is the degree of the dependency.
+ * where 'f' is the degree of the dependency. (Actually we use a slightly
+ * modified version of this formula -- see clauselist_apply_dependencies()).
*
* With clauses on more than two attributes, the dependencies are applied
* recursively, starting with the widest/strongest dependencies. For example
* P(a,b,c) is first split like this:
*
- * P(a,b,c) = P(a,b) * [f + (1-f)*P(c)]
+ * P(a,b,c) = f * P(a,b) + (1-f) * P(a,b) * P(c)
*
* assuming (a,b=>c) is the strongest dependency.
*/
@@ -1023,17 +1194,20 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
Selectivity s1 = 1.0;
ListCell *l;
Bitmapset *clauses_attnums = NULL;
- Bitmapset **list_attnums;
+ AttrNumber *list_attnums;
int listidx;
- MVDependencies **dependencies = NULL;
- int ndependencies = 0;
+ MVDependencies **func_dependencies;
+ int nfunc_dependencies;
+ int total_ndeps;
+ MVDependency **dependencies;
+ int ndependencies;
int i;
/* check if there's any stats that might be useful for us. */
if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES))
return 1.0;
- list_attnums = (Bitmapset **) palloc(sizeof(Bitmapset *) *
+ list_attnums = (AttrNumber *) palloc(sizeof(AttrNumber) *
list_length(clauses));
/*
@@ -1056,11 +1230,11 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
if (!bms_is_member(listidx, *estimatedclauses) &&
dependency_is_compatible_clause(clause, rel->relid, &attnum))
{
- list_attnums[listidx] = bms_make_singleton(attnum);
+ list_attnums[listidx] = attnum;
clauses_attnums = bms_add_member(clauses_attnums, attnum);
}
else
- list_attnums[listidx] = NULL;
+ list_attnums[listidx] = InvalidAttrNumber;
listidx++;
}
@@ -1072,6 +1246,7 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
*/
if (bms_num_members(clauses_attnums) < 2)
{
+ bms_free(clauses_attnums);
pfree(list_attnums);
return 1.0;
}
@@ -1083,19 +1258,20 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
*
* To not waste cycles and memory, we deserialize dependencies only for
* statistics that match at least two attributes. The array is allocated
- * with the assumption that all objects match - we could grow the array
- * to make it just the right size, but it's likely wasteful anyway thanks
- * to moving the freed chunks to freelists etc.
+ * with the assumption that all objects match - we could grow the array to
+ * make it just the right size, but it's likely wasteful anyway thanks to
+ * moving the freed chunks to freelists etc.
*/
- ndependencies = 0;
- dependencies = (MVDependencies **) palloc(sizeof(MVDependencies *) *
- list_length(rel->statlist));
+ func_dependencies = (MVDependencies **) palloc(sizeof(MVDependencies *) *
+ list_length(rel->statlist));
+ nfunc_dependencies = 0;
+ total_ndeps = 0;
- foreach(l,rel->statlist)
+ foreach(l, rel->statlist)
{
- StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(l);
- Bitmapset *matched;
- int num_matched;
+ StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(l);
+ Bitmapset *matched;
+ int num_matched;
/* skip statistics that are not of the correct type */
if (stat->kind != STATS_EXT_DEPENDENCIES)
@@ -1109,104 +1285,65 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
if (num_matched < 2)
continue;
- dependencies[ndependencies++]
+ func_dependencies[nfunc_dependencies]
= statext_dependencies_load(stat->statOid);
+
+ total_ndeps += func_dependencies[nfunc_dependencies]->ndeps;
+ nfunc_dependencies++;
}
/* if no matching stats could be found then we've nothing to do */
- if (!ndependencies)
+ if (nfunc_dependencies == 0)
{
+ pfree(func_dependencies);
+ bms_free(clauses_attnums);
pfree(list_attnums);
return 1.0;
}
/*
- * Apply the dependencies recursively, starting with the widest/strongest
- * ones, and proceeding to the smaller/weaker ones. At the end of each
- * round we factor in the selectivity of clauses on the implied attribute,
- * and remove the clauses from the list.
+ * Work out which dependencies we can apply, starting with the
+ * widest/stongest ones, and proceeding to smaller/weaker ones.
*/
+ dependencies = (MVDependency **) palloc(sizeof(MVDependency *) *
+ total_ndeps);
+ ndependencies = 0;
+
while (true)
{
- Selectivity s2 = 1.0;
MVDependency *dependency;
+ AttrNumber attnum;
/* the widest/strongest dependency, fully matched by clauses */
- dependency = find_strongest_dependency(dependencies, ndependencies,
+ dependency = find_strongest_dependency(func_dependencies,
+ nfunc_dependencies,
clauses_attnums);
-
- /* if no suitable dependency was found, we're done */
if (!dependency)
break;
- /*
- * We found an applicable dependency, so find all the clauses on the
- * implied attribute - with dependency (a,b => c) we look for clauses
- * on 'c'.
- */
- listidx = -1;
- foreach(l, clauses)
- {
- Node *clause;
- AttrNumber attnum;
-
- listidx++;
-
- /*
- * Skip incompatible clauses, and ones we've already estimated on.
- */
- if (!list_attnums[listidx])
- continue;
-
- /*
- * We expect the bitmaps ton contain a single attribute number.
- */
- attnum = bms_singleton_member(list_attnums[listidx]);
-
- /*
- * Technically we could find more than one clause for a given
- * attnum. Since these clauses must be equality clauses, we choose
- * to only take the selectivity estimate from the final clause in
- * the list for this attnum. If the attnum happens to be compared
- * to a different Const in another clause then no rows will match
- * anyway. If it happens to be compared to the same Const, then
- * ignoring the additional clause is just the thing to do.
- */
- if (dependency_implies_attribute(dependency, attnum))
- {
- clause = (Node *) lfirst(l);
+ dependencies[ndependencies++] = dependency;
- s2 = clause_selectivity(root, clause, varRelid, jointype,
- sjinfo);
-
- /* mark this one as done, so we don't touch it again. */
- *estimatedclauses = bms_add_member(*estimatedclauses, listidx);
-
- /*
- * Mark that we've got and used the dependency on this clause.
- * We'll want to ignore this when looking for the next
- * strongest dependency above.
- */
- clauses_attnums = bms_del_member(clauses_attnums, attnum);
- }
- }
-
- /*
- * Now factor in the selectivity for all the "implied" clauses into
- * the final one, using this formula:
- *
- * P(a,b) = P(a) * (f + (1-f) * P(b))
- *
- * where 'f' is the degree of validity of the dependency.
- */
- s1 *= (dependency->degree + (1 - dependency->degree) * s2);
+ /* Ignore dependencies using this implied attribute in later loops */
+ attnum = dependency->attributes[dependency->nattributes - 1];
+ clauses_attnums = bms_del_member(clauses_attnums, attnum);
}
+ /*
+ * If we found applicable dependencies, use them to estimate all
+ * compatible clauses on attributes that they refer to.
+ */
+ if (ndependencies != 0)
+ s1 = clauselist_apply_dependencies(root, clauses, varRelid, jointype,
+ sjinfo, dependencies, ndependencies,
+ list_attnums, estimatedclauses);
+
/* free deserialized functional dependencies (and then the array) */
- for (i = 0; i < ndependencies; i++)
- pfree(dependencies[i]);
+ for (i = 0; i < nfunc_dependencies; i++)
+ pfree(func_dependencies[i]);
pfree(dependencies);
+ pfree(func_dependencies);
+ bms_free(clauses_attnums);
pfree(list_attnums);
return s1;
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
index bd6d8be4344..f2f4008283f 100644
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -440,6 +440,12 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE
8 | 200
(1 row)
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ''1''');
+ estimated | actual
+-----------+--------
+ 4 | 100
+(1 row)
+
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c = 1');
estimated | actual
-----------+--------
@@ -600,6 +606,12 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE
200 | 200
(1 row)
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ''1''');
+ estimated | actual
+-----------+--------
+ 100 | 100
+(1 row)
+
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c = 1');
estimated | actual
-----------+--------
@@ -719,12 +731,14 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE
1 | 0
(1 row)
--- check change of column type doesn't break it
+-- changing the type of column c causes its single-column stats to be dropped,
+-- giving a default estimate of 0.005 * 5000 = 25 for (c = 1); check multiple
+-- clauses estimated with functional dependencies does not exceed this
ALTER TABLE functional_dependencies ALTER COLUMN c TYPE numeric;
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
estimated | actual
-----------+--------
- 50 | 50
+ 25 | 50
(1 row)
ANALYZE functional_dependencies;
diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql
index 897ed3051c8..f3b652dc4af 100644
--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@@ -280,6 +280,8 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b IN (''1'', ''2'')');
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ''1''');
+
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c = 1');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c IN (1)');
@@ -342,6 +344,8 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b IN (''1'', ''2'')');
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ''1''');
+
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c = 1');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c IN (1)');
@@ -385,7 +389,9 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ALL (ARRAY[''1'', ''2''])');
--- check change of column type doesn't break it
+-- changing the type of column c causes its single-column stats to be dropped,
+-- giving a default estimate of 0.005 * 5000 = 25 for (c = 1); check multiple
+-- clauses estimated with functional dependencies does not exceed this
ALTER TABLE functional_dependencies ALTER COLUMN c TYPE numeric;
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');