aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAlexander Korotkov <akorotkov@postgresql.org>2025-04-23 20:13:51 +0300
committerAlexander Korotkov <akorotkov@postgresql.org>2025-04-23 20:25:21 +0300
commit9f404d7922e8831dc49bfa225530ba5309900e4e (patch)
tree72950ab02339a280611e95da553099dd9993c767 /src
parent3db61db48ef5b8898f7e85f98548fdec79d76524 (diff)
downloadpostgresql-9f404d7922e8831dc49bfa225530ba5309900e4e.tar.gz
postgresql-9f404d7922e8831dc49bfa225530ba5309900e4e.zip
Properly prepare varinfos in estimate_multivariate_bucketsize()
To estimate with extended statistics, we need to clear the varnullingrels field in the expression, and duplicates are not allowed in the GroupVarInfo list. We might re-use add_unique_group_var(), but we don't do so for two reasons. 1) We must keep the origin_rinfos list ordered exactly the same way as varinfos. 2) add_unique_group_var() is designed for estimate_num_groups(), where a larger number of groups is worse. While estimating the number of hash buckets, we have the opposite: a lesser number of groups is worse. Therefore, we don't have to remove "known equal" vars: the removed var may valuably contribute to the multivariate statistics to grow the number of groups. This commit adds custom code to estimate_multivariate_bucketsize() to initialize varinfos properly. Reported-by: Robins Tharakan <tharakan@gmail.com> Discussion: https://postgr.es/m/18885-da51324078588253%40postgresql.org Author: Andrei Lepikhov <lepihov@gmail.com> Reviewed-by: David Rowley <dgrowleyml@gmail.com> Reviewed-by: Tomas Vondra <tomas@vondra.me> Reviewed-by: Alexander Korotkov <aekorotkov@gmail.com>
Diffstat (limited to 'src')
-rw-r--r--src/backend/utils/adt/selfuncs.c60
-rw-r--r--src/test/regress/expected/stats_ext.out28
-rw-r--r--src/test/regress/sql/stats_ext.sql11
3 files changed, 96 insertions, 3 deletions
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 987f2154459..a96b1b9c0bc 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -3850,6 +3850,8 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner,
if (bms_get_singleton_member(relids, &relid) &&
root->simple_rel_array[relid]->statlist != NIL)
{
+ bool is_duplicate = false;
+
/*
* This inner-side expression references only one relation.
* Extended statistics on this clause can exist.
@@ -3880,11 +3882,61 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner,
*/
continue;
- varinfo = (GroupVarInfo *) palloc(sizeof(GroupVarInfo));
+ /*
+ * We're going to add the new clause to the varinfos list. We
+ * might re-use add_unique_group_var(), but we don't do so for
+ * two reasons.
+ *
+ * 1) We must keep the origin_rinfos list ordered exactly the
+ * same way as varinfos.
+ *
+ * 2) add_unique_group_var() is designed for
+ * estimate_num_groups(), where a larger number of groups is
+ * worse. While estimating the number of hash buckets, we
+ * have the opposite: a lesser number of groups is worse.
+ * Therefore, we don't have to remove "known equal" vars: the
+ * removed var may valuably contribute to the multivariate
+ * statistics to grow the number of groups.
+ */
+
+ /*
+ * Clear nullingrels to correctly match hash keys. See
+ * add_unique_group_var()'s comment for details.
+ */
+ expr = remove_nulling_relids(expr, root->outer_join_rels, NULL);
+
+ /*
+ * Detect and exclude exact duplicates from the list of hash
+ * keys (like add_unique_group_var does).
+ */
+ foreach(lc1, varinfos)
+ {
+ varinfo = (GroupVarInfo *) lfirst(lc1);
+
+ if (!equal(expr, varinfo->var))
+ continue;
+
+ is_duplicate = true;
+ break;
+ }
+
+ if (is_duplicate)
+ {
+ /*
+ * Skip exact duplicates. Adding them to the otherclauses
+ * list also doesn't make sense.
+ */
+ continue;
+ }
+
+ /*
+ * Initialize GroupVarInfo. We only use it to call
+ * estimate_multivariate_ndistinct(), which doesn't care about
+ * ndistinct and isdefault fields. Thus, skip these fields.
+ */
+ varinfo = (GroupVarInfo *) palloc0(sizeof(GroupVarInfo));
varinfo->var = expr;
varinfo->rel = root->simple_rel_array[relid];
- varinfo->ndistinct = 0.0;
- varinfo->isdefault = false;
varinfos = lappend(varinfos, varinfo);
/*
@@ -3894,8 +3946,10 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner,
origin_rinfos = lappend(origin_rinfos, rinfo);
}
else
+ {
/* This clause can't be estimated with extended statistics */
otherclauses = lappend(otherclauses, rinfo);
+ }
clauses = foreach_delete_current(clauses, lc);
}
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
index 686d8c93aa8..6359e5fb689 100644
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -3427,4 +3427,32 @@ SELECT * FROM sb_1 a, sb_2 b WHERE a.x = b.x AND a.y = b.y AND a.z = b.z;
-> Seq Scan on sb_2 b
(5 rows)
+-- Check that the Hash Join bucket size estimator detects equal clauses correctly.
+SET enable_nestloop = 'off';
+SET enable_mergejoin = 'off';
+EXPLAIN (COSTS OFF)
+SELECT FROM sb_1 LEFT JOIN sb_2 ON (sb_2.x=sb_1.x) AND (sb_1.x=sb_2.x);
+ QUERY PLAN
+--------------------------------------------------------
+ Hash Left Join
+ Hash Cond: ((sb_1.x = sb_2.x) AND (sb_1.x = sb_2.x))
+ -> Seq Scan on sb_1
+ -> Hash
+ -> Seq Scan on sb_2
+(5 rows)
+
+EXPLAIN (COSTS OFF)
+SELECT FROM sb_1 LEFT JOIN sb_2
+ ON (sb_2.x=sb_1.x) AND (sb_1.x=sb_2.x) AND (sb_1.y=sb_2.y);
+ QUERY PLAN
+------------------------------------------------------------------------------
+ Hash Left Join
+ Hash Cond: ((sb_1.x = sb_2.x) AND (sb_1.y = sb_2.y) AND (sb_1.x = sb_2.x))
+ -> Seq Scan on sb_1
+ -> Hash
+ -> Seq Scan on sb_2
+(5 rows)
+
+RESET enable_nestloop;
+RESET enable_mergejoin;
DROP TABLE sb_1, sb_2 CASCADE;
diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql
index b71a6cd089f..da4f2fe9c93 100644
--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@@ -1747,4 +1747,15 @@ ANALYZE sb_2;
EXPLAIN (COSTS OFF) -- Choose hash join
SELECT * FROM sb_1 a, sb_2 b WHERE a.x = b.x AND a.y = b.y AND a.z = b.z;
+-- Check that the Hash Join bucket size estimator detects equal clauses correctly.
+SET enable_nestloop = 'off';
+SET enable_mergejoin = 'off';
+EXPLAIN (COSTS OFF)
+SELECT FROM sb_1 LEFT JOIN sb_2 ON (sb_2.x=sb_1.x) AND (sb_1.x=sb_2.x);
+EXPLAIN (COSTS OFF)
+SELECT FROM sb_1 LEFT JOIN sb_2
+ ON (sb_2.x=sb_1.x) AND (sb_1.x=sb_2.x) AND (sb_1.y=sb_2.y);
+RESET enable_nestloop;
+RESET enable_mergejoin;
+
DROP TABLE sb_1, sb_2 CASCADE;