diff options
author | Alexander Korotkov <akorotkov@postgresql.org> | 2025-04-23 20:13:51 +0300 |
---|---|---|
committer | Alexander Korotkov <akorotkov@postgresql.org> | 2025-04-23 20:25:21 +0300 |
commit | 9f404d7922e8831dc49bfa225530ba5309900e4e (patch) | |
tree | 72950ab02339a280611e95da553099dd9993c767 /src | |
parent | 3db61db48ef5b8898f7e85f98548fdec79d76524 (diff) | |
download | postgresql-9f404d7922e8831dc49bfa225530ba5309900e4e.tar.gz postgresql-9f404d7922e8831dc49bfa225530ba5309900e4e.zip |
Properly prepare varinfos in estimate_multivariate_bucketsize()
To estimate with extended statistics, we need to clear the varnullingrels
field in the expression, and duplicates are not allowed in the GroupVarInfo
list. We might re-use add_unique_group_var(), but we don't do so for two
reasons.
1) We must keep the origin_rinfos list ordered exactly the same way as
varinfos.
2) add_unique_group_var() is designed for estimate_num_groups(), where a
larger number of groups is worse. While estimating the number of hash
buckets, we have the opposite: a lesser number of groups is worse.
Therefore, we don't have to remove "known equal" vars: the removed var
may valuably contribute to the multivariate statistics to grow the number
of groups.
This commit adds custom code to estimate_multivariate_bucketsize() to
initialize varinfos properly.
Reported-by: Robins Tharakan <tharakan@gmail.com>
Discussion: https://postgr.es/m/18885-da51324078588253%40postgresql.org
Author: Andrei Lepikhov <lepihov@gmail.com>
Reviewed-by: David Rowley <dgrowleyml@gmail.com>
Reviewed-by: Tomas Vondra <tomas@vondra.me>
Reviewed-by: Alexander Korotkov <aekorotkov@gmail.com>
Diffstat (limited to 'src')
-rw-r--r-- | src/backend/utils/adt/selfuncs.c | 60 | ||||
-rw-r--r-- | src/test/regress/expected/stats_ext.out | 28 | ||||
-rw-r--r-- | src/test/regress/sql/stats_ext.sql | 11 |
3 files changed, 96 insertions, 3 deletions
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 987f2154459..a96b1b9c0bc 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -3850,6 +3850,8 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner, if (bms_get_singleton_member(relids, &relid) && root->simple_rel_array[relid]->statlist != NIL) { + bool is_duplicate = false; + /* * This inner-side expression references only one relation. * Extended statistics on this clause can exist. @@ -3880,11 +3882,61 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner, */ continue; - varinfo = (GroupVarInfo *) palloc(sizeof(GroupVarInfo)); + /* + * We're going to add the new clause to the varinfos list. We + * might re-use add_unique_group_var(), but we don't do so for + * two reasons. + * + * 1) We must keep the origin_rinfos list ordered exactly the + * same way as varinfos. + * + * 2) add_unique_group_var() is designed for + * estimate_num_groups(), where a larger number of groups is + * worse. While estimating the number of hash buckets, we + * have the opposite: a lesser number of groups is worse. + * Therefore, we don't have to remove "known equal" vars: the + * removed var may valuably contribute to the multivariate + * statistics to grow the number of groups. + */ + + /* + * Clear nullingrels to correctly match hash keys. See + * add_unique_group_var()'s comment for details. + */ + expr = remove_nulling_relids(expr, root->outer_join_rels, NULL); + + /* + * Detect and exclude exact duplicates from the list of hash + * keys (like add_unique_group_var does). + */ + foreach(lc1, varinfos) + { + varinfo = (GroupVarInfo *) lfirst(lc1); + + if (!equal(expr, varinfo->var)) + continue; + + is_duplicate = true; + break; + } + + if (is_duplicate) + { + /* + * Skip exact duplicates. Adding them to the otherclauses + * list also doesn't make sense. + */ + continue; + } + + /* + * Initialize GroupVarInfo. We only use it to call + * estimate_multivariate_ndistinct(), which doesn't care about + * ndistinct and isdefault fields. Thus, skip these fields. + */ + varinfo = (GroupVarInfo *) palloc0(sizeof(GroupVarInfo)); varinfo->var = expr; varinfo->rel = root->simple_rel_array[relid]; - varinfo->ndistinct = 0.0; - varinfo->isdefault = false; varinfos = lappend(varinfos, varinfo); /* @@ -3894,8 +3946,10 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner, origin_rinfos = lappend(origin_rinfos, rinfo); } else + { /* This clause can't be estimated with extended statistics */ otherclauses = lappend(otherclauses, rinfo); + } clauses = foreach_delete_current(clauses, lc); } diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out index 686d8c93aa8..6359e5fb689 100644 --- a/src/test/regress/expected/stats_ext.out +++ b/src/test/regress/expected/stats_ext.out @@ -3427,4 +3427,32 @@ SELECT * FROM sb_1 a, sb_2 b WHERE a.x = b.x AND a.y = b.y AND a.z = b.z; -> Seq Scan on sb_2 b (5 rows) +-- Check that the Hash Join bucket size estimator detects equal clauses correctly. +SET enable_nestloop = 'off'; +SET enable_mergejoin = 'off'; +EXPLAIN (COSTS OFF) +SELECT FROM sb_1 LEFT JOIN sb_2 ON (sb_2.x=sb_1.x) AND (sb_1.x=sb_2.x); + QUERY PLAN +-------------------------------------------------------- + Hash Left Join + Hash Cond: ((sb_1.x = sb_2.x) AND (sb_1.x = sb_2.x)) + -> Seq Scan on sb_1 + -> Hash + -> Seq Scan on sb_2 +(5 rows) + +EXPLAIN (COSTS OFF) +SELECT FROM sb_1 LEFT JOIN sb_2 + ON (sb_2.x=sb_1.x) AND (sb_1.x=sb_2.x) AND (sb_1.y=sb_2.y); + QUERY PLAN +------------------------------------------------------------------------------ + Hash Left Join + Hash Cond: ((sb_1.x = sb_2.x) AND (sb_1.y = sb_2.y) AND (sb_1.x = sb_2.x)) + -> Seq Scan on sb_1 + -> Hash + -> Seq Scan on sb_2 +(5 rows) + +RESET enable_nestloop; +RESET enable_mergejoin; DROP TABLE sb_1, sb_2 CASCADE; diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql index b71a6cd089f..da4f2fe9c93 100644 --- a/src/test/regress/sql/stats_ext.sql +++ b/src/test/regress/sql/stats_ext.sql @@ -1747,4 +1747,15 @@ ANALYZE sb_2; EXPLAIN (COSTS OFF) -- Choose hash join SELECT * FROM sb_1 a, sb_2 b WHERE a.x = b.x AND a.y = b.y AND a.z = b.z; +-- Check that the Hash Join bucket size estimator detects equal clauses correctly. +SET enable_nestloop = 'off'; +SET enable_mergejoin = 'off'; +EXPLAIN (COSTS OFF) +SELECT FROM sb_1 LEFT JOIN sb_2 ON (sb_2.x=sb_1.x) AND (sb_1.x=sb_2.x); +EXPLAIN (COSTS OFF) +SELECT FROM sb_1 LEFT JOIN sb_2 + ON (sb_2.x=sb_1.x) AND (sb_1.x=sb_2.x) AND (sb_1.y=sb_2.y); +RESET enable_nestloop; +RESET enable_mergejoin; + DROP TABLE sb_1, sb_2 CASCADE; |