Properly prepare varinfos in estimate_multivariate_bucketsize()

To estimate with extended statistics, we need to clear the varnullingrels field in the expression, and duplicates are not allowed in the GroupVarInfo list. We might re-use add_unique_group_var(), but we don't do so for two reasons. 1) We must keep the origin_rinfos list ordered exactly the same way as varinfos. 2) add_unique_group_var() is designed for estimate_num_groups(), where a larger number of groups is worse. While estimating the number of hash buckets, we have the opposite: a lesser number of groups is worse. Therefore, we don't have to remove "known equal" vars: the removed var may valuably contribute to the multivariate statistics to grow the number of groups. This commit adds custom code to estimate_multivariate_bucketsize() to initialize varinfos properly. Reported-by: Robins Tharakan <tharakan@gmail.com> Discussion: https://postgr.es/m/18885-da51324078588253%40postgresql.org Author: Andrei Lepikhov <lepihov@gmail.com> Reviewed-by: David Rowley <dgrowleyml@gmail.com> Reviewed-by: Tomas Vondra <tomas@vondra.me> Reviewed-by: Alexander Korotkov <aekorotkov@gmail.com>
author: Alexander Korotkov <akorotkov@postgresql.org> 2025-04-23 20:13:51 +0300
committer: Alexander Korotkov <akorotkov@postgresql.org> 2025-04-23 20:25:21 +0300
commit: 9f404d7922e8831dc49bfa225530ba5309900e4e (patch)
tree: 72950ab02339a280611e95da553099dd9993c767 /src
parent: 3db61db48ef5b8898f7e85f98548fdec79d76524 (diff)
download: postgresql-9f404d7922e8831dc49bfa225530ba5309900e4e.tar.gz
postgresql-9f404d7922e8831dc49bfa225530ba5309900e4e.zip
3 files changed, 96 insertions, 3 deletions
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 987f2154459..a96b1b9c0bc 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -3850,6 +3850,8 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner,
 			if (bms_get_singleton_member(relids, &relid) &&
 				root->simple_rel_array[relid]->statlist != NIL)
 			{
+				bool		is_duplicate = false;
+
 				/*
 				 * This inner-side expression references only one relation.
 				 * Extended statistics on this clause can exist.
@@ -3880,11 +3882,61 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner,
 					 */
 					continue;
 
-				varinfo = (GroupVarInfo *) palloc(sizeof(GroupVarInfo));
+				/*
+				 * We're going to add the new clause to the varinfos list.  We
+				 * might re-use add_unique_group_var(), but we don't do so for
+				 * two reasons.
+				 *
+				 * 1) We must keep the origin_rinfos list ordered exactly the
+				 * same way as varinfos.
+				 *
+				 * 2) add_unique_group_var() is designed for
+				 * estimate_num_groups(), where a larger number of groups is
+				 * worse.   While estimating the number of hash buckets, we
+				 * have the opposite: a lesser number of groups is worse.
+				 * Therefore, we don't have to remove "known equal" vars: the
+				 * removed var may valuably contribute to the multivariate
+				 * statistics to grow the number of groups.
+				 */
+
+				/*
+				 * Clear nullingrels to correctly match hash keys.  See
+				 * add_unique_group_var()'s comment for details.
+				 */
+				expr = remove_nulling_relids(expr, root->outer_join_rels, NULL);
+
+				/*
+				 * Detect and exclude exact duplicates from the list of hash
+				 * keys (like add_unique_group_var does).
+				 */
+				foreach(lc1, varinfos)
+				{
+					varinfo = (GroupVarInfo *) lfirst(lc1);
+
+					if (!equal(expr, varinfo->var))
+						continue;
+
+					is_duplicate = true;
+					break;
+				}
+
+				if (is_duplicate)
+				{
+					/*
+					 * Skip exact duplicates. Adding them to the otherclauses
+					 * list also doesn't make sense.
+					 */
+					continue;
+				}
+
+				/*
+				 * Initialize GroupVarInfo.  We only use it to call
+				 * estimate_multivariate_ndistinct(), which doesn't care about
+				 * ndistinct and isdefault fields.  Thus, skip these fields.
+				 */
+				varinfo = (GroupVarInfo *) palloc0(sizeof(GroupVarInfo));
 				varinfo->var = expr;
 				varinfo->rel = root->simple_rel_array[relid];
-				varinfo->ndistinct = 0.0;
-				varinfo->isdefault = false;
 				varinfos = lappend(varinfos, varinfo);
 
 				/*
@@ -3894,8 +3946,10 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner,
 				origin_rinfos = lappend(origin_rinfos, rinfo);
 			}
 			else
+			{
 				/* This clause can't be estimated with extended statistics */
 				otherclauses = lappend(otherclauses, rinfo);
+			}
 
 			clauses = foreach_delete_current(clauses, lc);
 		}
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
index 686d8c93aa8..6359e5fb689 100644
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -3427,4 +3427,32 @@ SELECT * FROM sb_1 a, sb_2 b WHERE a.x = b.x AND a.y = b.y AND a.z = b.z;
          ->  Seq Scan on sb_2 b
 (5 rows)
 
+-- Check that the Hash Join bucket size estimator detects equal clauses correctly.
+SET enable_nestloop = 'off';
+SET enable_mergejoin = 'off';
+EXPLAIN (COSTS OFF)
+SELECT FROM sb_1 LEFT JOIN sb_2 ON (sb_2.x=sb_1.x) AND (sb_1.x=sb_2.x);
+                       QUERY PLAN                       
+--------------------------------------------------------
+ Hash Left Join
+   Hash Cond: ((sb_1.x = sb_2.x) AND (sb_1.x = sb_2.x))
+   ->  Seq Scan on sb_1
+   ->  Hash
+         ->  Seq Scan on sb_2
+(5 rows)
+
+EXPLAIN (COSTS OFF)
+SELECT FROM sb_1 LEFT JOIN sb_2
+   ON (sb_2.x=sb_1.x) AND (sb_1.x=sb_2.x) AND (sb_1.y=sb_2.y);
+                                  QUERY PLAN                                  
+------------------------------------------------------------------------------
+ Hash Left Join
+   Hash Cond: ((sb_1.x = sb_2.x) AND (sb_1.y = sb_2.y) AND (sb_1.x = sb_2.x))
+   ->  Seq Scan on sb_1
+   ->  Hash
+         ->  Seq Scan on sb_2
+(5 rows)
+
+RESET enable_nestloop;
+RESET enable_mergejoin;
 DROP TABLE sb_1, sb_2 CASCADE;
diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql
index b71a6cd089f..da4f2fe9c93 100644
--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@@ -1747,4 +1747,15 @@ ANALYZE sb_2;
 EXPLAIN (COSTS OFF) -- Choose hash join
 SELECT * FROM sb_1 a, sb_2 b WHERE a.x = b.x AND a.y = b.y AND a.z = b.z;
 
+-- Check that the Hash Join bucket size estimator detects equal clauses correctly.
+SET enable_nestloop = 'off';
+SET enable_mergejoin = 'off';
+EXPLAIN (COSTS OFF)
+SELECT FROM sb_1 LEFT JOIN sb_2 ON (sb_2.x=sb_1.x) AND (sb_1.x=sb_2.x);
+EXPLAIN (COSTS OFF)
+SELECT FROM sb_1 LEFT JOIN sb_2
+   ON (sb_2.x=sb_1.x) AND (sb_1.x=sb_2.x) AND (sb_1.y=sb_2.y);
+RESET enable_nestloop;
+RESET enable_mergejoin;
+
 DROP TABLE sb_1, sb_2 CASCADE;
author	Alexander Korotkov <akorotkov@postgresql.org>	2025-04-23 20:13:51 +0300
committer	Alexander Korotkov <akorotkov@postgresql.org>	2025-04-23 20:25:21 +0300
commit	9f404d7922e8831dc49bfa225530ba5309900e4e (patch)
tree	72950ab02339a280611e95da553099dd9993c767 /src
parent	3db61db48ef5b8898f7e85f98548fdec79d76524 (diff)
download	postgresql-9f404d7922e8831dc49bfa225530ba5309900e4e.tar.gz postgresql-9f404d7922e8831dc49bfa225530ba5309900e4e.zip