aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/backend/optimizer/path/clausesel.c301
-rw-r--r--src/backend/statistics/dependencies.c4
-rw-r--r--src/backend/statistics/extended_stats.c217
-rw-r--r--src/backend/statistics/mcv.c169
-rw-r--r--src/include/optimizer/optimizer.h18
-rw-r--r--src/include/statistics/extended_stats_internal.h15
-rw-r--r--src/include/statistics/statistics.h3
-rw-r--r--src/test/regress/expected/stats_ext.out189
-rw-r--r--src/test/regress/sql/stats_ext.sql85
9 files changed, 798 insertions, 203 deletions
diff --git a/src/backend/optimizer/path/clausesel.c b/src/backend/optimizer/path/clausesel.c
index 37a735b06bb..b88b29ec23a 100644
--- a/src/backend/optimizer/path/clausesel.c
+++ b/src/backend/optimizer/path/clausesel.c
@@ -44,6 +44,12 @@ static void addRangeClause(RangeQueryClause **rqlist, Node *clause,
bool varonleft, bool isLTsel, Selectivity s2);
static RelOptInfo *find_single_rel_for_clauses(PlannerInfo *root,
List *clauses);
+static Selectivity clauselist_selectivity_or(PlannerInfo *root,
+ List *clauses,
+ int varRelid,
+ JoinType jointype,
+ SpecialJoinInfo *sjinfo,
+ bool use_extended_stats);
/****************************************************************************
* ROUTINES TO COMPUTE SELECTIVITIES
@@ -61,64 +67,10 @@ static RelOptInfo *find_single_rel_for_clauses(PlannerInfo *root,
*
* The basic approach is to apply extended statistics first, on as many
* clauses as possible, in order to capture cross-column dependencies etc.
- * The remaining clauses are then estimated using regular statistics tracked
- * for individual columns. This is done by simply passing the clauses to
- * clauselist_selectivity_simple.
- */
-Selectivity
-clauselist_selectivity(PlannerInfo *root,
- List *clauses,
- int varRelid,
- JoinType jointype,
- SpecialJoinInfo *sjinfo)
-{
- Selectivity s1 = 1.0;
- RelOptInfo *rel;
- Bitmapset *estimatedclauses = NULL;
-
- /*
- * Determine if these clauses reference a single relation. If so, and if
- * it has extended statistics, try to apply those.
- */
- rel = find_single_rel_for_clauses(root, clauses);
- if (rel && rel->rtekind == RTE_RELATION && rel->statlist != NIL)
- {
- /*
- * Estimate as many clauses as possible using extended statistics.
- *
- * 'estimatedclauses' tracks the 0-based list position index of
- * clauses that we've estimated using extended statistics, and that
- * should be ignored.
- */
- s1 *= statext_clauselist_selectivity(root, clauses, varRelid,
- jointype, sjinfo, rel,
- &estimatedclauses);
- }
-
- /*
- * Apply normal selectivity estimates for the remaining clauses, passing
- * 'estimatedclauses' so that it skips already estimated ones.
- */
- return s1 * clauselist_selectivity_simple(root, clauses, varRelid,
- jointype, sjinfo,
- estimatedclauses);
-}
-
-/*
- * clauselist_selectivity_simple -
- * Compute the selectivity of an implicitly-ANDed list of boolean
- * expression clauses. The list can be empty, in which case 1.0
- * must be returned. List elements may be either RestrictInfos
- * or bare expression clauses --- the former is preferred since
- * it allows caching of results. The estimatedclauses bitmap tracks
- * clauses that have already been estimated by other means.
- *
- * See clause_selectivity() for the meaning of the additional parameters.
- *
- * Our basic approach is to take the product of the selectivities of the
- * subclauses. However, that's only right if the subclauses have independent
- * probabilities, and in reality they are often NOT independent. So,
- * we want to be smarter where we can.
+ * The remaining clauses are then estimated by taking the product of their
+ * selectivities, but that's only right if they have independent
+ * probabilities, and in reality they are often NOT independent even if they
+ * only refer to a single column. So, we want to be smarter where we can.
*
* We also recognize "range queries", such as "x > 34 AND x < 42". Clauses
* are recognized as possible range query components if they are restriction
@@ -147,28 +99,68 @@ clauselist_selectivity(PlannerInfo *root,
* selectivity functions; perhaps some day we can generalize the approach.
*/
Selectivity
-clauselist_selectivity_simple(PlannerInfo *root,
- List *clauses,
- int varRelid,
- JoinType jointype,
- SpecialJoinInfo *sjinfo,
- Bitmapset *estimatedclauses)
+clauselist_selectivity(PlannerInfo *root,
+ List *clauses,
+ int varRelid,
+ JoinType jointype,
+ SpecialJoinInfo *sjinfo)
+{
+ return clauselist_selectivity_ext(root, clauses, varRelid,
+ jointype, sjinfo, true);
+}
+
+/*
+ * clauselist_selectivity_ext -
+ * Extended version of clauselist_selectivity(). If "use_extended_stats"
+ * is false, all extended statistics will be ignored, and only per-column
+ * statistics will be used.
+ */
+Selectivity
+clauselist_selectivity_ext(PlannerInfo *root,
+ List *clauses,
+ int varRelid,
+ JoinType jointype,
+ SpecialJoinInfo *sjinfo,
+ bool use_extended_stats)
{
Selectivity s1 = 1.0;
+ RelOptInfo *rel;
+ Bitmapset *estimatedclauses = NULL;
RangeQueryClause *rqlist = NULL;
ListCell *l;
int listidx;
/*
- * If there's exactly one clause (and it was not estimated yet), just go
- * directly to clause_selectivity(). None of what we might do below is
- * relevant.
+ * If there's exactly one clause, just go directly to
+ * clause_selectivity_ext(). None of what we might do below is relevant.
*/
- if (list_length(clauses) == 1 && bms_is_empty(estimatedclauses))
- return clause_selectivity(root, (Node *) linitial(clauses),
- varRelid, jointype, sjinfo);
+ if (list_length(clauses) == 1)
+ return clause_selectivity_ext(root, (Node *) linitial(clauses),
+ varRelid, jointype, sjinfo,
+ use_extended_stats);
+
+ /*
+ * Determine if these clauses reference a single relation. If so, and if
+ * it has extended statistics, try to apply those.
+ */
+ rel = find_single_rel_for_clauses(root, clauses);
+ if (use_extended_stats && rel && rel->rtekind == RTE_RELATION && rel->statlist != NIL)
+ {
+ /*
+ * Estimate as many clauses as possible using extended statistics.
+ *
+ * 'estimatedclauses' is populated with the 0-based list position
+ * index of clauses estimated here, and that should be ignored below.
+ */
+ s1 = statext_clauselist_selectivity(root, clauses, varRelid,
+ jointype, sjinfo, rel,
+ &estimatedclauses, false);
+ }
/*
+ * Apply normal selectivity estimates for remaining clauses. We'll be
+ * careful to skip any clauses which were already estimated above.
+ *
* Anything that doesn't look like a potential rangequery clause gets
* multiplied into s1 and forgotten. Anything that does gets inserted into
* an rqlist entry.
@@ -189,8 +181,9 @@ clauselist_selectivity_simple(PlannerInfo *root,
if (bms_is_member(listidx, estimatedclauses))
continue;
- /* Always compute the selectivity using clause_selectivity */
- s2 = clause_selectivity(root, clause, varRelid, jointype, sjinfo);
+ /* Compute the selectivity of this clause in isolation */
+ s2 = clause_selectivity_ext(root, clause, varRelid, jointype, sjinfo,
+ use_extended_stats);
/*
* Check for being passed a RestrictInfo.
@@ -351,6 +344,83 @@ clauselist_selectivity_simple(PlannerInfo *root,
}
/*
+ * clauselist_selectivity_or -
+ * Compute the selectivity of an implicitly-ORed list of boolean
+ * expression clauses. The list can be empty, in which case 0.0
+ * must be returned. List elements may be either RestrictInfos
+ * or bare expression clauses --- the former is preferred since
+ * it allows caching of results.
+ *
+ * See clause_selectivity() for the meaning of the additional parameters.
+ *
+ * The basic approach is to apply extended statistics first, on as many
+ * clauses as possible, in order to capture cross-column dependencies etc.
+ * The remaining clauses are then estimated as if they were independent.
+ */
+static Selectivity
+clauselist_selectivity_or(PlannerInfo *root,
+ List *clauses,
+ int varRelid,
+ JoinType jointype,
+ SpecialJoinInfo *sjinfo,
+ bool use_extended_stats)
+{
+ Selectivity s1 = 0.0;
+ RelOptInfo *rel;
+ Bitmapset *estimatedclauses = NULL;
+ ListCell *lc;
+ int listidx;
+
+ /*
+ * Determine if these clauses reference a single relation. If so, and if
+ * it has extended statistics, try to apply those.
+ */
+ rel = find_single_rel_for_clauses(root, clauses);
+ if (use_extended_stats && rel && rel->rtekind == RTE_RELATION && rel->statlist != NIL)
+ {
+ /*
+ * Estimate as many clauses as possible using extended statistics.
+ *
+ * 'estimatedclauses' is populated with the 0-based list position
+ * index of clauses estimated here, and that should be ignored below.
+ */
+ s1 = statext_clauselist_selectivity(root, clauses, varRelid,
+ jointype, sjinfo, rel,
+ &estimatedclauses, true);
+ }
+
+ /*
+ * Estimate the remaining clauses as if they were independent.
+ *
+ * Selectivities for an OR clause are computed as s1+s2 - s1*s2 to account
+ * for the probable overlap of selected tuple sets.
+ *
+ * XXX is this too conservative?
+ */
+ listidx = -1;
+ foreach(lc, clauses)
+ {
+ Selectivity s2;
+
+ listidx++;
+
+ /*
+ * Skip this clause if it's already been estimated by some other
+ * statistics above.
+ */
+ if (bms_is_member(listidx, estimatedclauses))
+ continue;
+
+ s2 = clause_selectivity_ext(root, (Node *) lfirst(lc), varRelid,
+ jointype, sjinfo, use_extended_stats);
+
+ s1 = s1 + s2 - s1 * s2;
+ }
+
+ return s1;
+}
+
+/*
* addRangeClause --- add a new range clause for clauselist_selectivity
*
* Here is where we try to match up pairs of range-query clauses
@@ -602,6 +672,24 @@ clause_selectivity(PlannerInfo *root,
JoinType jointype,
SpecialJoinInfo *sjinfo)
{
+ return clause_selectivity_ext(root, clause, varRelid,
+ jointype, sjinfo, true);
+}
+
+/*
+ * clause_selectivity_ext -
+ * Extended version of clause_selectivity(). If "use_extended_stats" is
+ * false, all extended statistics will be ignored, and only per-column
+ * statistics will be used.
+ */
+Selectivity
+clause_selectivity_ext(PlannerInfo *root,
+ Node *clause,
+ int varRelid,
+ JoinType jointype,
+ SpecialJoinInfo *sjinfo,
+ bool use_extended_stats)
+{
Selectivity s1 = 0.5; /* default for any unhandled clause type */
RestrictInfo *rinfo = NULL;
bool cacheable = false;
@@ -716,42 +804,35 @@ clause_selectivity(PlannerInfo *root,
else if (is_notclause(clause))
{
/* inverse of the selectivity of the underlying clause */
- s1 = 1.0 - clause_selectivity(root,
- (Node *) get_notclausearg((Expr *) clause),
- varRelid,
- jointype,
- sjinfo);
+ s1 = 1.0 - clause_selectivity_ext(root,
+ (Node *) get_notclausearg((Expr *) clause),
+ varRelid,
+ jointype,
+ sjinfo,
+ use_extended_stats);
}
else if (is_andclause(clause))
{
/* share code with clauselist_selectivity() */
- s1 = clauselist_selectivity(root,
- ((BoolExpr *) clause)->args,
- varRelid,
- jointype,
- sjinfo);
+ s1 = clauselist_selectivity_ext(root,
+ ((BoolExpr *) clause)->args,
+ varRelid,
+ jointype,
+ sjinfo,
+ use_extended_stats);
}
else if (is_orclause(clause))
{
/*
- * Selectivities for an OR clause are computed as s1+s2 - s1*s2 to
- * account for the probable overlap of selected tuple sets.
- *
- * XXX is this too conservative?
+ * Almost the same thing as clauselist_selectivity, but with the
+ * clauses connected by OR.
*/
- ListCell *arg;
-
- s1 = 0.0;
- foreach(arg, ((BoolExpr *) clause)->args)
- {
- Selectivity s2 = clause_selectivity(root,
- (Node *) lfirst(arg),
- varRelid,
- jointype,
- sjinfo);
-
- s1 = s1 + s2 - s1 * s2;
- }
+ s1 = clauselist_selectivity_or(root,
+ ((BoolExpr *) clause)->args,
+ varRelid,
+ jointype,
+ sjinfo,
+ use_extended_stats);
}
else if (is_opclause(clause) || IsA(clause, DistinctExpr))
{
@@ -852,20 +933,22 @@ clause_selectivity(PlannerInfo *root,
else if (IsA(clause, RelabelType))
{
/* Not sure this case is needed, but it can't hurt */
- s1 = clause_selectivity(root,
- (Node *) ((RelabelType *) clause)->arg,
- varRelid,
- jointype,
- sjinfo);
+ s1 = clause_selectivity_ext(root,
+ (Node *) ((RelabelType *) clause)->arg,
+ varRelid,
+ jointype,
+ sjinfo,
+ use_extended_stats);
}
else if (IsA(clause, CoerceToDomain))
{
/* Not sure this case is needed, but it can't hurt */
- s1 = clause_selectivity(root,
- (Node *) ((CoerceToDomain *) clause)->arg,
- varRelid,
- jointype,
- sjinfo);
+ s1 = clause_selectivity_ext(root,
+ (Node *) ((CoerceToDomain *) clause)->arg,
+ varRelid,
+ jointype,
+ sjinfo,
+ use_extended_stats);
}
else
{
diff --git a/src/backend/statistics/dependencies.c b/src/backend/statistics/dependencies.c
index d950b4eabe0..b1abcde9687 100644
--- a/src/backend/statistics/dependencies.c
+++ b/src/backend/statistics/dependencies.c
@@ -1073,8 +1073,8 @@ clauselist_apply_dependencies(PlannerInfo *root, List *clauses,
}
}
- simple_sel = clauselist_selectivity_simple(root, attr_clauses, varRelid,
- jointype, sjinfo, NULL);
+ simple_sel = clauselist_selectivity_ext(root, attr_clauses, varRelid,
+ jointype, sjinfo, false);
attr_sel[attidx++] = simple_sel;
}
diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c
index 36326927c6b..8d3cd091ada 100644
--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
@@ -1239,10 +1239,10 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid,
* One of the main challenges with using MCV lists is how to extrapolate the
* estimate to the data not covered by the MCV list. To do that, we compute
* not only the "MCV selectivity" (selectivities for MCV items matching the
- * supplied clauses), but also a couple of derived selectivities:
+ * supplied clauses), but also the following related selectivities:
*
- * - simple selectivity: Computed without extended statistic, i.e. as if the
- * columns/clauses were independent
+ * - simple selectivity: Computed without extended statistics, i.e. as if the
+ * columns/clauses were independent.
*
* - base selectivity: Similar to simple selectivity, but is computed using
* the extended statistic by adding up the base frequencies (that we compute
@@ -1250,30 +1250,9 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid,
*
* - total selectivity: Selectivity covered by the whole MCV list.
*
- * - other selectivity: A selectivity estimate for data not covered by the MCV
- * list (i.e. satisfying the clauses, but not common enough to make it into
- * the MCV list)
- *
- * Note: While simple and base selectivities are defined in a quite similar
- * way, the values are computed differently and are not therefore equal. The
- * simple selectivity is computed as a product of per-clause estimates, while
- * the base selectivity is computed by adding up base frequencies of matching
- * items of the multi-column MCV list. So the values may differ for two main
- * reasons - (a) the MCV list may not cover 100% of the data and (b) some of
- * the MCV items did not match the estimated clauses.
- *
- * As both (a) and (b) reduce the base selectivity value, it generally holds
- * that (simple_selectivity >= base_selectivity). If the MCV list covers all
- * the data, the values may be equal.
- *
- * So, (simple_selectivity - base_selectivity) is an estimate for the part
- * not covered by the MCV list, and (mcv_selectivity - base_selectivity) may
- * be seen as a correction for the part covered by the MCV list. Those two
- * statements are actually equivalent.
- *
- * Note: Due to rounding errors and minor differences in how the estimates
- * are computed, the inequality may not always hold. Which is why we clamp
- * the selectivities to prevent strange estimate (negative etc.).
+ * These are passed to mcv_combine_selectivities() which combines them to
+ * produce a selectivity estimate that makes use of both per-column statistics
+ * and the multi-column MCV statistics.
*
* 'estimatedclauses' is an input/output parameter. We set bits for the
* 0-based 'clauses' indexes we estimate for and also skip clause items that
@@ -1282,16 +1261,17 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid,
static Selectivity
statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varRelid,
JoinType jointype, SpecialJoinInfo *sjinfo,
- RelOptInfo *rel, Bitmapset **estimatedclauses)
+ RelOptInfo *rel, Bitmapset **estimatedclauses,
+ bool is_or)
{
ListCell *l;
Bitmapset **list_attnums;
int listidx;
- Selectivity sel = 1.0;
+ Selectivity sel = (is_or) ? 0.0 : 1.0;
/* check if there's any stats that might be useful for us. */
if (!has_stats_of_kind(rel->statlist, STATS_EXT_MCV))
- return 1.0;
+ return sel;
list_attnums = (Bitmapset **) palloc(sizeof(Bitmapset *) *
list_length(clauses));
@@ -1327,12 +1307,7 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
{
StatisticExtInfo *stat;
List *stat_clauses;
- Selectivity simple_sel,
- mcv_sel,
- mcv_basesel,
- mcv_totalsel,
- other_sel,
- stat_sel;
+ Bitmapset *simple_clauses;
/* find the best suited statistics object for these attnums */
stat = choose_best_statistics(rel->statlist, STATS_EXT_MCV,
@@ -1351,6 +1326,9 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
/* now filter the clauses to be estimated using the selected MCV */
stat_clauses = NIL;
+ /* record which clauses are simple (single column) */
+ simple_clauses = NULL;
+
listidx = 0;
foreach(l, clauses)
{
@@ -1361,6 +1339,10 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
if (list_attnums[listidx] != NULL &&
bms_is_subset(list_attnums[listidx], stat->keys))
{
+ if (bms_membership(list_attnums[listidx]) == BMS_SINGLETON)
+ simple_clauses = bms_add_member(simple_clauses,
+ list_length(stat_clauses));
+
stat_clauses = lappend(stat_clauses, (Node *) lfirst(l));
*estimatedclauses = bms_add_member(*estimatedclauses, listidx);
@@ -1371,40 +1353,131 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
listidx++;
}
- /*
- * First compute "simple" selectivity, i.e. without the extended
- * statistics, and essentially assuming independence of the
- * columns/clauses. We'll then use the various selectivities computed
- * from MCV list to improve it.
- */
- simple_sel = clauselist_selectivity_simple(root, stat_clauses, varRelid,
- jointype, sjinfo, NULL);
-
- /*
- * Now compute the multi-column estimate from the MCV list, along with
- * the other selectivities (base & total selectivity).
- */
- mcv_sel = mcv_clauselist_selectivity(root, stat, stat_clauses, varRelid,
- jointype, sjinfo, rel,
- &mcv_basesel, &mcv_totalsel);
+ if (is_or)
+ {
+ bool *or_matches = NULL;
+ Selectivity simple_or_sel = 0.0;
+ MCVList *mcv_list;
- /* Estimated selectivity of values not covered by MCV matches */
- other_sel = simple_sel - mcv_basesel;
- CLAMP_PROBABILITY(other_sel);
+ /* Load the MCV list stored in the statistics object */
+ mcv_list = statext_mcv_load(stat->statOid);
- /* The non-MCV selectivity can't exceed the 1 - mcv_totalsel. */
- if (other_sel > 1.0 - mcv_totalsel)
- other_sel = 1.0 - mcv_totalsel;
+ /*
+ * Compute the selectivity of the ORed list of clauses by
+ * estimating each in turn and combining them using the formula
+ * P(A OR B) = P(A) + P(B) - P(A AND B). This allows us to use
+ * the multivariate MCV stats to better estimate each term.
+ *
+ * Each time we iterate this formula, the clause "A" above is
+ * equal to all the clauses processed so far, combined with "OR".
+ */
+ listidx = 0;
+ foreach(l, stat_clauses)
+ {
+ Node *clause = (Node *) lfirst(l);
+ Selectivity simple_sel,
+ overlap_simple_sel,
+ mcv_sel,
+ mcv_basesel,
+ overlap_mcvsel,
+ overlap_basesel,
+ mcv_totalsel,
+ clause_sel,
+ overlap_sel;
+
+ /*
+ * "Simple" selectivity of the next clause and its overlap
+ * with any of the previous clauses. These are our initial
+ * estimates of P(B) and P(A AND B), assuming independence of
+ * columns/clauses.
+ */
+ simple_sel = clause_selectivity_ext(root, clause, varRelid,
+ jointype, sjinfo, false);
+
+ overlap_simple_sel = simple_or_sel * simple_sel;
+
+ /*
+ * New "simple" selectivity of all clauses seen so far,
+ * assuming independence.
+ */
+ simple_or_sel += simple_sel - overlap_simple_sel;
+ CLAMP_PROBABILITY(simple_or_sel);
+
+ /*
+ * Multi-column estimate of this clause using MCV statistics,
+ * along with base and total selectivities, and corresponding
+ * selectivities for the overlap term P(A AND B).
+ */
+ mcv_sel = mcv_clause_selectivity_or(root, stat, mcv_list,
+ clause, &or_matches,
+ &mcv_basesel,
+ &overlap_mcvsel,
+ &overlap_basesel,
+ &mcv_totalsel);
+
+ /*
+ * Combine the simple and multi-column estimates.
+ *
+ * If this clause is a simple single-column clause, then we
+ * just use the simple selectivity estimate for it, since the
+ * multi-column statistics are unlikely to improve on that
+ * (and in fact could make it worse). For the overlap, we
+ * always make use of the multi-column statistics.
+ */
+ if (bms_is_member(listidx, simple_clauses))
+ clause_sel = simple_sel;
+ else
+ clause_sel = mcv_combine_selectivities(simple_sel,
+ mcv_sel,
+ mcv_basesel,
+ mcv_totalsel);
+
+ overlap_sel = mcv_combine_selectivities(overlap_simple_sel,
+ overlap_mcvsel,
+ overlap_basesel,
+ mcv_totalsel);
+
+ /* Factor these into the overall result */
+ sel += clause_sel - overlap_sel;
+ CLAMP_PROBABILITY(sel);
+
+ listidx++;
+ }
+ }
+ else /* Implicitly-ANDed list of clauses */
+ {
+ Selectivity simple_sel,
+ mcv_sel,
+ mcv_basesel,
+ mcv_totalsel,
+ stat_sel;
- /*
- * Overall selectivity is the combination of MCV and non-MCV
- * estimates.
- */
- stat_sel = mcv_sel + other_sel;
- CLAMP_PROBABILITY(stat_sel);
+ /*
+ * "Simple" selectivity, i.e. without any extended statistics,
+ * essentially assuming independence of the columns/clauses.
+ */
+ simple_sel = clauselist_selectivity_ext(root, stat_clauses,
+ varRelid, jointype,
+ sjinfo, false);
- /* Factor the estimate from this MCV to the overall estimate. */
- sel *= stat_sel;
+ /*
+ * Multi-column estimate using MCV statistics, along with base and
+ * total selectivities.
+ */
+ mcv_sel = mcv_clauselist_selectivity(root, stat, stat_clauses,
+ varRelid, jointype, sjinfo,
+ rel, &mcv_basesel,
+ &mcv_totalsel);
+
+ /* Combine the simple and multi-column estimates. */
+ stat_sel = mcv_combine_selectivities(simple_sel,
+ mcv_sel,
+ mcv_basesel,
+ mcv_totalsel);
+
+ /* Factor this into the overall result */
+ sel *= stat_sel;
+ }
}
return sel;
@@ -1417,13 +1490,21 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
Selectivity
statext_clauselist_selectivity(PlannerInfo *root, List *clauses, int varRelid,
JoinType jointype, SpecialJoinInfo *sjinfo,
- RelOptInfo *rel, Bitmapset **estimatedclauses)
+ RelOptInfo *rel, Bitmapset **estimatedclauses,
+ bool is_or)
{
Selectivity sel;
/* First, try estimating clauses using a multivariate MCV list. */
sel = statext_mcv_clauselist_selectivity(root, clauses, varRelid, jointype,
- sjinfo, rel, estimatedclauses);
+ sjinfo, rel, estimatedclauses, is_or);
+
+ /*
+ * Functional dependencies only work for clauses connected by AND, so for
+ * OR clauses we're done.
+ */
+ if (is_or)
+ return sel;
/*
* Then, apply functional dependencies on the remaining clauses by calling
diff --git a/src/backend/statistics/mcv.c b/src/backend/statistics/mcv.c
index 6a262f15436..fae792a2ddf 100644
--- a/src/backend/statistics/mcv.c
+++ b/src/backend/statistics/mcv.c
@@ -32,6 +32,7 @@
#include "utils/fmgroids.h"
#include "utils/fmgrprotos.h"
#include "utils/lsyscache.h"
+#include "utils/selfuncs.h"
#include "utils/syscache.h"
#include "utils/typcache.h"
@@ -1889,15 +1890,79 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
/*
+ * mcv_combine_selectivities
+ * Combine per-column and multi-column MCV selectivity estimates.
+ *
+ * simple_sel is a "simple" selectivity estimate (produced without using any
+ * extended statistics, essentially assuming independence of columns/clauses).
+ *
+ * mcv_sel and mcv_basesel are sums of the frequencies and base frequencies of
+ * all matching MCV items. The difference (mcv_sel - mcv_basesel) is then
+ * essentially interpreted as a correction to be added to simple_sel, as
+ * described below.
+ *
+ * mcv_totalsel is the sum of the frequencies of all MCV items (not just the
+ * matching ones). This is used as an upper bound on the portion of the
+ * selectivity estimates not covered by the MCV statistics.
+ *
+ * Note: While simple and base selectivities are defined in a quite similar
+ * way, the values are computed differently and are not therefore equal. The
+ * simple selectivity is computed as a product of per-clause estimates, while
+ * the base selectivity is computed by adding up base frequencies of matching
+ * items of the multi-column MCV list. So the values may differ for two main
+ * reasons - (a) the MCV list may not cover 100% of the data and (b) some of
+ * the MCV items did not match the estimated clauses.
+ *
+ * As both (a) and (b) reduce the base selectivity value, it generally holds
+ * that (simple_sel >= mcv_basesel). If the MCV list covers all the data, the
+ * values may be equal.
+ *
+ * So, other_sel = (simple_sel - mcv_basesel) is an estimate for the part not
+ * covered by the MCV list, and (mcv_sel - mcv_basesel) may be seen as a
+ * correction for the part covered by the MCV list. Those two statements are
+ * actually equivalent.
+ */
+Selectivity
+mcv_combine_selectivities(Selectivity simple_sel,
+ Selectivity mcv_sel,
+ Selectivity mcv_basesel,
+ Selectivity mcv_totalsel)
+{
+ Selectivity other_sel;
+ Selectivity sel;
+
+ /* estimated selectivity of values not covered by MCV matches */
+ other_sel = simple_sel - mcv_basesel;
+ CLAMP_PROBABILITY(other_sel);
+
+ /* this non-MCV selectivity cannot exceed 1 - mcv_totalsel */
+ if (other_sel > 1.0 - mcv_totalsel)
+ other_sel = 1.0 - mcv_totalsel;
+
+ /* overall selectivity is the sum of the MCV and non-MCV parts */
+ sel = mcv_sel + other_sel;
+ CLAMP_PROBABILITY(sel);
+
+ return sel;
+}
+
+
+/*
* mcv_clauselist_selectivity
- * Return the selectivity estimate computed using an MCV list.
+ * Use MCV statistics to estimate the selectivity of an implicitly-ANDed
+ * list of clauses.
*
- * First builds a bitmap of MCV items matching the clauses, and then sums
- * the frequencies of matching items.
+ * This determines which MCV items match every clause in the list and returns
+ * the sum of the frequencies of those items.
*
- * It also produces two additional interesting selectivities - total
- * selectivity of all the MCV items (not just the matching ones), and the
- * base frequency computed on the assumption of independence.
+ * In addition, it returns the sum of the base frequencies of each of those
+ * items (that is the sum of the selectivities that each item would have if
+ * the columns were independent of one another), and the total selectivity of
+ * all the MCV items (not just the matching ones). These are expected to be
+ * used together with a "simple" selectivity estimate (one based only on
+ * per-column statistics) to produce an overall selectivity estimate that
+ * makes use of both per-column and multi-column statistics --- see
+ * mcv_combine_selectivities().
*/
Selectivity
mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat,
@@ -1928,7 +1993,6 @@ mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat,
if (matches[i] != false)
{
- /* XXX Shouldn't the basesel be outside the if condition? */
*basesel += mcv->items[i].base_frequency;
s += mcv->items[i].frequency;
}
@@ -1936,3 +2000,94 @@ mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat,
return s;
}
+
+
+/*
+ * mcv_clause_selectivity_or
+ * Use MCV statistics to estimate the selectivity of a clause that
+ * appears in an ORed list of clauses.
+ *
+ * As with mcv_clauselist_selectivity() this determines which MCV items match
+ * the clause and returns both the sum of the frequencies and the sum of the
+ * base frequencies of those items, as well as the sum of the frequencies of
+ * all MCV items (not just the matching ones) so that this information can be
+ * used by mcv_combine_selectivities() to produce a selectivity estimate that
+ * makes use of both per-column and multi-column statistics.
+ *
+ * Additionally, we return information to help compute the overall selectivity
+ * of the ORed list of clauses assumed to contain this clause. This function
+ * is intended to be called for each clause in the ORed list of clauses,
+ * allowing the overall selectivity to be computed using the following
+ * algorithm:
+ *
+ * Suppose P[n] = P(C[1] OR C[2] OR ... OR C[n]) is the combined selectivity
+ * of the first n clauses in the list. Then the combined selectivity taking
+ * into account the next clause C[n+1] can be written as
+ *
+ * P[n+1] = P[n] + P(C[n+1]) - P((C[1] OR ... OR C[n]) AND C[n+1])
+ *
+ * The final term above represents the overlap between the clauses examined so
+ * far and the (n+1)'th clause. To estimate its selectivity, we track the
+ * match bitmap for the ORed list of clauses examined so far and examine its
+ * intersection with the match bitmap for the (n+1)'th clause.
+ *
+ * We then also return the sums of the MCV item frequencies and base
+ * frequencies for the match bitmap intersection corresponding to the overlap
+ * term above, so that they can be combined with a simple selectivity estimate
+ * for that term.
+ *
+ * The parameter "or_matches" is an in/out parameter tracking the match bitmap
+ * for the clauses examined so far. The caller is expected to set it to NULL
+ * the first time it calls this function.
+ */
+Selectivity
+mcv_clause_selectivity_or(PlannerInfo *root, StatisticExtInfo *stat,
+ MCVList *mcv, Node *clause, bool **or_matches,
+ Selectivity *basesel, Selectivity *overlap_mcvsel,
+ Selectivity *overlap_basesel, Selectivity *totalsel)
+{
+ Selectivity s = 0.0;
+ bool *new_matches;
+ int i;
+
+ /* build the OR-matches bitmap, if not built already */
+ if (*or_matches == NULL)
+ *or_matches = palloc0(sizeof(bool) * mcv->nitems);
+
+ /* build the match bitmap for the new clause */
+ new_matches = mcv_get_match_bitmap(root, list_make1(clause), stat->keys,
+ mcv, false);
+
+ /*
+ * Sum the frequencies for all the MCV items matching this clause and also
+ * those matching the overlap between this clause and any of the preceding
+ * clauses as described above.
+ */
+ *basesel = 0.0;
+ *overlap_mcvsel = 0.0;
+ *overlap_basesel = 0.0;
+ *totalsel = 0.0;
+ for (i = 0; i < mcv->nitems; i++)
+ {
+ *totalsel += mcv->items[i].frequency;
+
+ if (new_matches[i])
+ {
+ s += mcv->items[i].frequency;
+ *basesel += mcv->items[i].base_frequency;
+
+ if ((*or_matches)[i])
+ {
+ *overlap_mcvsel += mcv->items[i].frequency;
+ *overlap_basesel += mcv->items[i].base_frequency;
+ }
+ }
+
+ /* update the OR-matches bitmap for the next clause */
+ (*or_matches)[i] = (*or_matches)[i] || new_matches[i];
+ }
+
+ pfree(new_matches);
+
+ return s;
+}
diff --git a/src/include/optimizer/optimizer.h b/src/include/optimizer/optimizer.h
index 3e4171056e8..dea0e7338d5 100644
--- a/src/include/optimizer/optimizer.h
+++ b/src/include/optimizer/optimizer.h
@@ -58,17 +58,23 @@ extern Selectivity clause_selectivity(PlannerInfo *root,
int varRelid,
JoinType jointype,
SpecialJoinInfo *sjinfo);
-extern Selectivity clauselist_selectivity_simple(PlannerInfo *root,
- List *clauses,
- int varRelid,
- JoinType jointype,
- SpecialJoinInfo *sjinfo,
- Bitmapset *estimatedclauses);
+extern Selectivity clause_selectivity_ext(PlannerInfo *root,
+ Node *clause,
+ int varRelid,
+ JoinType jointype,
+ SpecialJoinInfo *sjinfo,
+ bool use_extended_stats);
extern Selectivity clauselist_selectivity(PlannerInfo *root,
List *clauses,
int varRelid,
JoinType jointype,
SpecialJoinInfo *sjinfo);
+extern Selectivity clauselist_selectivity_ext(PlannerInfo *root,
+ List *clauses,
+ int varRelid,
+ JoinType jointype,
+ SpecialJoinInfo *sjinfo,
+ bool use_extended_stats);
/* in path/costsize.c: */
diff --git a/src/include/statistics/extended_stats_internal.h b/src/include/statistics/extended_stats_internal.h
index 61e69696cfe..02bf6a05027 100644
--- a/src/include/statistics/extended_stats_internal.h
+++ b/src/include/statistics/extended_stats_internal.h
@@ -99,6 +99,11 @@ extern SortItem *build_sorted_items(int numrows, int *nitems, HeapTuple *rows,
extern bool examine_clause_args(List *args, Var **varp,
Const **cstp, bool *varonleftp);
+extern Selectivity mcv_combine_selectivities(Selectivity simple_sel,
+ Selectivity mcv_sel,
+ Selectivity mcv_basesel,
+ Selectivity mcv_totalsel);
+
extern Selectivity mcv_clauselist_selectivity(PlannerInfo *root,
StatisticExtInfo *stat,
List *clauses,
@@ -109,4 +114,14 @@ extern Selectivity mcv_clauselist_selectivity(PlannerInfo *root,
Selectivity *basesel,
Selectivity *totalsel);
+extern Selectivity mcv_clause_selectivity_or(PlannerInfo *root,
+ StatisticExtInfo *stat,
+ MCVList *mcv,
+ Node *clause,
+ bool **or_matches,
+ Selectivity *basesel,
+ Selectivity *overlap_mcvsel,
+ Selectivity *overlap_basesel,
+ Selectivity *totalsel);
+
#endif /* EXTENDED_STATS_INTERNAL_H */
diff --git a/src/include/statistics/statistics.h b/src/include/statistics/statistics.h
index 50fce4935f3..c9ed21155cd 100644
--- a/src/include/statistics/statistics.h
+++ b/src/include/statistics/statistics.h
@@ -116,7 +116,8 @@ extern Selectivity statext_clauselist_selectivity(PlannerInfo *root,
JoinType jointype,
SpecialJoinInfo *sjinfo,
RelOptInfo *rel,
- Bitmapset **estimatedclauses);
+ Bitmapset **estimatedclauses,
+ bool is_or);
extern bool has_stats_of_kind(List *stats, char requiredkind);
extern StatisticExtInfo *choose_best_statistics(List *stats, char requiredkind,
Bitmapset **clause_attnums,
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
index 4c3edd213fb..dbbe9844b2e 100644
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -1113,6 +1113,12 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = '
200 | 200
(1 row)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL');
+ estimated | actual
+-----------+--------
+ 200 | 200
+(1 row)
+
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52) AND b IN ( ''1'', ''2'')');
estimated | actual
-----------+--------
@@ -1173,13 +1179,6 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a < ALL (ARRAY
100 | 100
(1 row)
--- we can't use the statistic for OR clauses that are not fully covered (missing 'd' attribute)
-SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL');
- estimated | actual
------------+--------
- 343 | 200
-(1 row)
-
-- check change of unrelated column type does not reset the MCV statistics
ALTER TABLE mcv_lists ALTER COLUMN d TYPE VARCHAR(64);
SELECT d.stxdmcv IS NOT NULL
@@ -1477,6 +1476,134 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND
1 | 0
(1 row)
+-- mcv covering just a small fraction of data
+CREATE TABLE mcv_lists_partial (
+ a INT,
+ b INT,
+ c INT
+);
+-- 10 frequent groups, each with 100 elements
+INSERT INTO mcv_lists_partial (a, b, c)
+ SELECT
+ mod(i,10),
+ mod(i,10),
+ mod(i,10)
+ FROM generate_series(0,999) s(i);
+-- 100 groups that will make it to the MCV list (includes the 10 frequent ones)
+INSERT INTO mcv_lists_partial (a, b, c)
+ SELECT
+ i,
+ i,
+ i
+ FROM generate_series(0,99) s(i);
+-- 4000 groups in total, most of which won't make it (just a single item)
+INSERT INTO mcv_lists_partial (a, b, c)
+ SELECT
+ i,
+ i,
+ i
+ FROM generate_series(0,3999) s(i);
+ANALYZE mcv_lists_partial;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 AND b = 0 AND c = 0');
+ estimated | actual
+-----------+--------
+ 1 | 102
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 OR b = 0 OR c = 0');
+ estimated | actual
+-----------+--------
+ 300 | 102
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 10 AND b = 10 AND c = 10');
+ estimated | actual
+-----------+--------
+ 1 | 2
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 10 OR b = 10 OR c = 10');
+ estimated | actual
+-----------+--------
+ 6 | 2
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 AND b = 0 AND c = 10');
+ estimated | actual
+-----------+--------
+ 1 | 0
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 OR b = 0 OR c = 10');
+ estimated | actual
+-----------+--------
+ 204 | 104
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE (a = 0 AND b = 0 AND c = 0) OR (a = 1 AND b = 1 AND c = 1) OR (a = 2 AND b = 2 AND c = 2)');
+ estimated | actual
+-----------+--------
+ 1 | 306
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE (a = 0 AND b = 0) OR (a = 0 AND c = 0) OR (b = 0 AND c = 0)');
+ estimated | actual
+-----------+--------
+ 6 | 102
+(1 row)
+
+CREATE STATISTICS mcv_lists_partial_stats (mcv) ON a, b, c
+ FROM mcv_lists_partial;
+ANALYZE mcv_lists_partial;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 AND b = 0 AND c = 0');
+ estimated | actual
+-----------+--------
+ 102 | 102
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 OR b = 0 OR c = 0');
+ estimated | actual
+-----------+--------
+ 96 | 102
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 10 AND b = 10 AND c = 10');
+ estimated | actual
+-----------+--------
+ 2 | 2
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 10 OR b = 10 OR c = 10');
+ estimated | actual
+-----------+--------
+ 2 | 2
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 AND b = 0 AND c = 10');
+ estimated | actual
+-----------+--------
+ 1 | 0
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 OR b = 0 OR c = 10');
+ estimated | actual
+-----------+--------
+ 102 | 104
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE (a = 0 AND b = 0 AND c = 0) OR (a = 1 AND b = 1 AND c = 1) OR (a = 2 AND b = 2 AND c = 2)');
+ estimated | actual
+-----------+--------
+ 300 | 306
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE (a = 0 AND b = 0) OR (a = 0 AND c = 0) OR (b = 0 AND c = 0)');
+ estimated | actual
+-----------+--------
+ 306 | 102
+(1 row)
+
+DROP TABLE mcv_lists_partial;
-- check the ability to use multiple MCV lists
CREATE TABLE mcv_lists_multi (
a INTEGER,
@@ -1506,12 +1633,36 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE c = 0 AN
102 | 714
(1 row)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE b = 0 AND c = 0');
+ estimated | actual
+-----------+--------
+ 143 | 142
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE b = 0 OR c = 0');
+ estimated | actual
+-----------+--------
+ 1571 | 1572
+(1 row)
+
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0 AND c = 0 AND d = 0');
estimated | actual
-----------+--------
4 | 142
(1 row)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE (a = 0 AND b = 0) OR (c = 0 AND d = 0)');
+ estimated | actual
+-----------+--------
+ 298 | 1572
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 OR b = 0 OR c = 0 OR d = 0');
+ estimated | actual
+-----------+--------
+ 2649 | 1572
+(1 row)
+
-- create separate MCV statistics
CREATE STATISTICS mcv_lists_multi_1 (mcv) ON a, b FROM mcv_lists_multi;
CREATE STATISTICS mcv_lists_multi_2 (mcv) ON c, d FROM mcv_lists_multi;
@@ -1528,12 +1679,36 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE c = 0 AN
714 | 714
(1 row)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE b = 0 AND c = 0');
+ estimated | actual
+-----------+--------
+ 143 | 142
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE b = 0 OR c = 0');
+ estimated | actual
+-----------+--------
+ 1571 | 1572
+(1 row)
+
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0 AND c = 0 AND d = 0');
estimated | actual
-----------+--------
143 | 142
(1 row)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE (a = 0 AND b = 0) OR (c = 0 AND d = 0)');
+ estimated | actual
+-----------+--------
+ 1571 | 1572
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 OR b = 0 OR c = 0 OR d = 0');
+ estimated | actual
+-----------+--------
+ 1714 | 1572
+(1 row)
+
DROP TABLE mcv_lists_multi;
-- Permission tests. Users should not be able to see specific data values in
-- the extended statistics, if they lack permission to see those values in
diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql
index 9781e590a30..7912e733ae6 100644
--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@@ -561,6 +561,8 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 4 >= a AND ''0
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL');
+
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52) AND b IN ( ''1'', ''2'')');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52, NULL) AND b IN ( ''1'', ''2'', NULL)');
@@ -581,9 +583,6 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a < ALL (ARRAY
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a < ALL (ARRAY[4, 5]) AND b IN (''1'', ''2'', NULL, ''3'') AND c > ANY (ARRAY[1, 2, NULL, 3])');
--- we can't use the statistic for OR clauses that are not fully covered (missing 'd' attribute)
-SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL');
-
-- check change of unrelated column type does not reset the MCV statistics
ALTER TABLE mcv_lists ALTER COLUMN d TYPE VARCHAR(64);
@@ -777,6 +776,78 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND b AND NOT c');
+-- mcv covering just a small fraction of data
+CREATE TABLE mcv_lists_partial (
+ a INT,
+ b INT,
+ c INT
+);
+
+-- 10 frequent groups, each with 100 elements
+INSERT INTO mcv_lists_partial (a, b, c)
+ SELECT
+ mod(i,10),
+ mod(i,10),
+ mod(i,10)
+ FROM generate_series(0,999) s(i);
+
+-- 100 groups that will make it to the MCV list (includes the 10 frequent ones)
+INSERT INTO mcv_lists_partial (a, b, c)
+ SELECT
+ i,
+ i,
+ i
+ FROM generate_series(0,99) s(i);
+
+-- 4000 groups in total, most of which won't make it (just a single item)
+INSERT INTO mcv_lists_partial (a, b, c)
+ SELECT
+ i,
+ i,
+ i
+ FROM generate_series(0,3999) s(i);
+
+ANALYZE mcv_lists_partial;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 AND b = 0 AND c = 0');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 OR b = 0 OR c = 0');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 10 AND b = 10 AND c = 10');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 10 OR b = 10 OR c = 10');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 AND b = 0 AND c = 10');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 OR b = 0 OR c = 10');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE (a = 0 AND b = 0 AND c = 0) OR (a = 1 AND b = 1 AND c = 1) OR (a = 2 AND b = 2 AND c = 2)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE (a = 0 AND b = 0) OR (a = 0 AND c = 0) OR (b = 0 AND c = 0)');
+
+CREATE STATISTICS mcv_lists_partial_stats (mcv) ON a, b, c
+ FROM mcv_lists_partial;
+
+ANALYZE mcv_lists_partial;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 AND b = 0 AND c = 0');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 OR b = 0 OR c = 0');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 10 AND b = 10 AND c = 10');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 10 OR b = 10 OR c = 10');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 AND b = 0 AND c = 10');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 OR b = 0 OR c = 10');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE (a = 0 AND b = 0 AND c = 0) OR (a = 1 AND b = 1 AND c = 1) OR (a = 2 AND b = 2 AND c = 2)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE (a = 0 AND b = 0) OR (a = 0 AND c = 0) OR (b = 0 AND c = 0)');
+
+DROP TABLE mcv_lists_partial;
+
-- check the ability to use multiple MCV lists
CREATE TABLE mcv_lists_multi (
a INTEGER,
@@ -799,7 +870,11 @@ ANALYZE mcv_lists_multi;
-- estimates without any mcv statistics
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE c = 0 AND d = 0');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE b = 0 AND c = 0');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE b = 0 OR c = 0');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0 AND c = 0 AND d = 0');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE (a = 0 AND b = 0) OR (c = 0 AND d = 0)');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 OR b = 0 OR c = 0 OR d = 0');
-- create separate MCV statistics
CREATE STATISTICS mcv_lists_multi_1 (mcv) ON a, b FROM mcv_lists_multi;
@@ -809,7 +884,11 @@ ANALYZE mcv_lists_multi;
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE c = 0 AND d = 0');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE b = 0 AND c = 0');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE b = 0 OR c = 0');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0 AND c = 0 AND d = 0');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE (a = 0 AND b = 0) OR (c = 0 AND d = 0)');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 OR b = 0 OR c = 0 OR d = 0');
DROP TABLE mcv_lists_multi;