aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/backend/statistics/extended_stats.c139
-rw-r--r--src/test/regress/expected/stats_ext.out57
-rw-r--r--src/test/regress/sql/stats_ext.sql35
3 files changed, 167 insertions, 64 deletions
diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c
index 7a37bb5d1a7..d17b8d9b1f2 100644
--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
@@ -1148,9 +1148,13 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid,
* statext_mcv_clauselist_selectivity
* Estimate clauses using the best multi-column statistics.
*
- * Selects the best extended (multi-column) statistic on a table (measured by
- * the number of attributes extracted from the clauses and covered by it), and
- * computes the selectivity for the supplied clauses.
+ * Applies available extended (multi-column) statistics on a table. There may
+ * be multiple applicable statistics (with respect to the clauses), in which
+ * case we use greedy approach. In each round we select the best statistic on
+ * a table (measured by the number of attributes extracted from the clauses
+ * and covered by it), and compute the selectivity for the supplied clauses.
+ * We repeat this process with the remaining clauses (if any), until none of
+ * the available statistics can be used.
*
* One of the main challenges with using MCV lists is how to extrapolate the
* estimate to the data not covered by the MCV list. To do that, we compute
@@ -1194,11 +1198,6 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid,
* 'estimatedclauses' is an input/output parameter. We set bits for the
* 0-based 'clauses' indexes we estimate for and also skip clause items that
* already have a bit set.
- *
- * XXX If we were to use multiple statistics, this is where it would happen.
- * We would simply repeat this on a loop on the "remaining" clauses, possibly
- * using the already estimated clauses as conditions (and combining the values
- * using conditional probability formula).
*/
static Selectivity
statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varRelid,
@@ -1208,14 +1207,7 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
ListCell *l;
Bitmapset **list_attnums;
int listidx;
- StatisticExtInfo *stat;
- List *stat_clauses;
- Selectivity simple_sel,
- mcv_sel,
- mcv_basesel,
- mcv_totalsel,
- other_sel,
- sel;
+ Selectivity sel = 1.0;
/* check if there's any stats that might be useful for us. */
if (!has_stats_of_kind(rel->statlist, STATS_EXT_MCV))
@@ -1250,65 +1242,84 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
listidx++;
}
- /* find the best suited statistics object for these attnums */
- stat = choose_best_statistics(rel->statlist, STATS_EXT_MCV,
- list_attnums, list_length(clauses));
-
- /* if no matching stats could be found then we've nothing to do */
- if (!stat)
- return 1.0;
+ /* apply as many extended statistics as possible */
+ while (true)
+ {
+ StatisticExtInfo *stat;
+ List *stat_clauses;
+ Selectivity simple_sel,
+ mcv_sel,
+ mcv_basesel,
+ mcv_totalsel,
+ other_sel,
+ stat_sel;
+
+ /* find the best suited statistics object for these attnums */
+ stat = choose_best_statistics(rel->statlist, STATS_EXT_MCV,
+ list_attnums, list_length(clauses));
+
+ /* if no (additional) matching stats could be found then we've nothing to do */
+ if (!stat)
+ break;
- /* Ensure choose_best_statistics produced an expected stats type. */
- Assert(stat->kind == STATS_EXT_MCV);
+ /* Ensure choose_best_statistics produced an expected stats type. */
+ Assert(stat->kind == STATS_EXT_MCV);
- /* now filter the clauses to be estimated using the selected MCV */
- stat_clauses = NIL;
+ /* now filter the clauses to be estimated using the selected MCV */
+ stat_clauses = NIL;
- listidx = 0;
- foreach(l, clauses)
- {
- /*
- * If the clause is compatible with the selected statistics, mark it
- * as estimated and add it to the list to estimate.
- */
- if (list_attnums[listidx] != NULL &&
- bms_is_subset(list_attnums[listidx], stat->keys))
+ listidx = 0;
+ foreach(l, clauses)
{
- stat_clauses = lappend(stat_clauses, (Node *) lfirst(l));
- *estimatedclauses = bms_add_member(*estimatedclauses, listidx);
+ /*
+ * If the clause is compatible with the selected statistics, mark it
+ * as estimated and add it to the list to estimate.
+ */
+ if (list_attnums[listidx] != NULL &&
+ bms_is_subset(list_attnums[listidx], stat->keys))
+ {
+ stat_clauses = lappend(stat_clauses, (Node *) lfirst(l));
+ *estimatedclauses = bms_add_member(*estimatedclauses, listidx);
+
+ bms_free(list_attnums[listidx]);
+ list_attnums[listidx] = NULL;
+ }
+
+ listidx++;
}
- listidx++;
- }
+ /*
+ * First compute "simple" selectivity, i.e. without the extended
+ * statistics, and essentially assuming independence of the
+ * columns/clauses. We'll then use the various selectivities computed from
+ * MCV list to improve it.
+ */
+ simple_sel = clauselist_selectivity_simple(root, stat_clauses, varRelid,
+ jointype, sjinfo, NULL);
- /*
- * First compute "simple" selectivity, i.e. without the extended
- * statistics, and essentially assuming independence of the
- * columns/clauses. We'll then use the various selectivities computed from
- * MCV list to improve it.
- */
- simple_sel = clauselist_selectivity_simple(root, stat_clauses, varRelid,
- jointype, sjinfo, NULL);
+ /*
+ * Now compute the multi-column estimate from the MCV list, along with the
+ * other selectivities (base & total selectivity).
+ */
+ mcv_sel = mcv_clauselist_selectivity(root, stat, stat_clauses, varRelid,
+ jointype, sjinfo, rel,
+ &mcv_basesel, &mcv_totalsel);
- /*
- * Now compute the multi-column estimate from the MCV list, along with the
- * other selectivities (base & total selectivity).
- */
- mcv_sel = mcv_clauselist_selectivity(root, stat, stat_clauses, varRelid,
- jointype, sjinfo, rel,
- &mcv_basesel, &mcv_totalsel);
+ /* Estimated selectivity of values not covered by MCV matches */
+ other_sel = simple_sel - mcv_basesel;
+ CLAMP_PROBABILITY(other_sel);
- /* Estimated selectivity of values not covered by MCV matches */
- other_sel = simple_sel - mcv_basesel;
- CLAMP_PROBABILITY(other_sel);
+ /* The non-MCV selectivity can't exceed the 1 - mcv_totalsel. */
+ if (other_sel > 1.0 - mcv_totalsel)
+ other_sel = 1.0 - mcv_totalsel;
- /* The non-MCV selectivity can't exceed the 1 - mcv_totalsel. */
- if (other_sel > 1.0 - mcv_totalsel)
- other_sel = 1.0 - mcv_totalsel;
+ /* Overall selectivity is the combination of MCV and non-MCV estimates. */
+ stat_sel = mcv_sel + other_sel;
+ CLAMP_PROBABILITY(stat_sel);
- /* Overall selectivity is the combination of MCV and non-MCV estimates. */
- sel = mcv_sel + other_sel;
- CLAMP_PROBABILITY(sel);
+ /* Factor the estimate from this MCV to the oveall estimate. */
+ sel *= stat_sel;
+ }
return sel;
}
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
index d467c0dd7d9..61237dfb113 100644
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -836,6 +836,63 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND
1 | 0
(1 row)
+-- check the ability to use multiple MCV lists
+CREATE TABLE mcv_lists_multi (
+ a INTEGER,
+ b INTEGER,
+ c INTEGER,
+ d INTEGER
+);
+INSERT INTO mcv_lists_multi (a, b, c, d)
+ SELECT
+ mod(i,5),
+ mod(i,5),
+ mod(i,7),
+ mod(i,7)
+ FROM generate_series(1,5000) s(i);
+ANALYZE mcv_lists_multi;
+-- estimates without any mcv statistics
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0');
+ estimated | actual
+-----------+--------
+ 200 | 1000
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE c = 0 AND d = 0');
+ estimated | actual
+-----------+--------
+ 102 | 714
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0 AND c = 0 AND d = 0');
+ estimated | actual
+-----------+--------
+ 4 | 142
+(1 row)
+
+-- create separate MCV statistics
+CREATE STATISTICS mcv_lists_multi_1 (mcv) ON a, b FROM mcv_lists_multi;
+CREATE STATISTICS mcv_lists_multi_2 (mcv) ON c, d FROM mcv_lists_multi;
+ANALYZE mcv_lists_multi;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0');
+ estimated | actual
+-----------+--------
+ 1000 | 1000
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE c = 0 AND d = 0');
+ estimated | actual
+-----------+--------
+ 714 | 714
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0 AND c = 0 AND d = 0');
+ estimated | actual
+-----------+--------
+ 143 | 142
+(1 row)
+
+DROP TABLE mcv_lists_multi;
-- Permission tests. Users should not be able to see specific data values in
-- the extended statistics, if they lack permission to see those values in
-- the underlying table.
diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql
index 734b1f60c94..84f13e8814a 100644
--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@@ -535,6 +535,41 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND b AND NOT c');
+-- check the ability to use multiple MCV lists
+CREATE TABLE mcv_lists_multi (
+ a INTEGER,
+ b INTEGER,
+ c INTEGER,
+ d INTEGER
+);
+
+INSERT INTO mcv_lists_multi (a, b, c, d)
+ SELECT
+ mod(i,5),
+ mod(i,5),
+ mod(i,7),
+ mod(i,7)
+ FROM generate_series(1,5000) s(i);
+
+ANALYZE mcv_lists_multi;
+
+-- estimates without any mcv statistics
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE c = 0 AND d = 0');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0 AND c = 0 AND d = 0');
+
+-- create separate MCV statistics
+CREATE STATISTICS mcv_lists_multi_1 (mcv) ON a, b FROM mcv_lists_multi;
+CREATE STATISTICS mcv_lists_multi_2 (mcv) ON c, d FROM mcv_lists_multi;
+
+ANALYZE mcv_lists_multi;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE c = 0 AND d = 0');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0 AND c = 0 AND d = 0');
+
+DROP TABLE mcv_lists_multi;
+
-- Permission tests. Users should not be able to see specific data values in
-- the extended statistics, if they lack permission to see those values in
-- the underlying table.