Consider the "LIMIT 1" optimization with parallel DISTINCT

Similar to what was done in 5543677ec for non-parallel DISTINCT, apply the same optimization to the partial paths too when distinct_pathkeys is empty. This can be faster than the non-parallel version when the first row matching the WHERE clause of the query takes a while to find, since parallel workers could speed that search up considerably.

Author: Richard Guo
Reviewed-by: David Rowley
Discussion: https://postgr.es/m/CAMbWs49JC0qvfUbzs-TVzgMpSSBiMJ_6sN=BaA9iohBgYkr=LA@mail.gmail.com
author: David Rowley <drowley@postgresql.org> 2024-01-31 17:22:02 +1300
committer: David Rowley <drowley@postgresql.org> 2024-01-31 17:22:02 +1300
commit: b588cad688823b1e996ce05af4d88a954c005a3a (patch)
tree: 41ec222b7f8ad9ca83b80bd9be18e8815608fc60
parent: 3e91dba8b079c02dc5204108c7e797b402c75779 (diff)
download: postgresql-b588cad688823b1e996ce05af4d88a954c005a3a.tar.gz
postgresql-b588cad688823b1e996ce05af4d88a954c005a3a.zip
3 files changed, 72 insertions, 5 deletions
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 01fa45b9255..342f5ad8d0a 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -4737,11 +4737,45 @@ create_partial_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
 																		-1.0);
 			}
 
-			add_partial_path(partial_distinct_rel, (Path *)
-							 create_upper_unique_path(root, partial_distinct_rel,
-													  sorted_path,
-													  list_length(root->distinct_pathkeys),
-													  numDistinctRows));
+			/*
+			 * An empty distinct_pathkeys means all tuples have the same value
+			 * for the DISTINCT clause.  See create_final_distinct_paths().
+			 */
+			if (root->distinct_pathkeys == NIL)
+			{
+				Node	   *limitCount;
+
+				limitCount = (Node *) makeConst(INT8OID, -1, InvalidOid,
+												sizeof(int64),
+												Int64GetDatum(1), false,
+												FLOAT8PASSBYVAL);
+
+				/*
+				 * Apply a LimitPath onto the partial path to restrict the
+				 * tuples from each worker to 1.  create_final_distinct_paths
+				 * will need to apply an additional LimitPath to restrict this
+				 * to a single row after the Gather node.  If the query
+				 * already has a LIMIT clause, then we could end up with three
+				 * Limit nodes in the final plan.  Consolidating the top two
+				 * of these could be done, but does not seem worth troubling
+				 * over.
+				 */
+				add_partial_path(partial_distinct_rel, (Path *)
+								 create_limit_path(root, partial_distinct_rel,
+												   sorted_path,
+												   NULL,
+												   limitCount,
+												   LIMIT_OPTION_COUNT,
+												   0, 1));
+			}
+			else
+			{
+				add_partial_path(partial_distinct_rel, (Path *)
+								 create_upper_unique_path(root, partial_distinct_rel,
+														  sorted_path,
+														  list_length(root->distinct_pathkeys),
+														  numDistinctRows));
+			}
 		}
 	}
 
diff --git a/src/test/regress/expected/select_distinct.out b/src/test/regress/expected/select_distinct.out
index 9d44ea8056d..1f72756ccb4 100644
--- a/src/test/regress/expected/select_distinct.out
+++ b/src/test/regress/expected/select_distinct.out
@@ -348,6 +348,26 @@ SELECT DISTINCT four,1,2,3 FROM tenk1 WHERE four = 0;
     0 |        1 |        2 |        3
 (1 row)
 
+SET parallel_setup_cost=0;
+SET min_parallel_table_scan_size=0;
+SET max_parallel_workers_per_gather=2;
+-- Ensure we get a plan with a Limit 1 in both partial distinct and final
+-- distinct
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT four FROM tenk1 WHERE four = 10;
+                  QUERY PLAN                  
+----------------------------------------------
+ Limit
+   ->  Gather
+         Workers Planned: 2
+         ->  Limit
+               ->  Parallel Seq Scan on tenk1
+                     Filter: (four = 10)
+(6 rows)
+
+RESET max_parallel_workers_per_gather;
+RESET min_parallel_table_scan_size;
+RESET parallel_setup_cost;
 --
 -- Also, some tests of IS DISTINCT FROM, which doesn't quite deserve its
 -- very own regression file.
diff --git a/src/test/regress/sql/select_distinct.sql b/src/test/regress/sql/select_distinct.sql
index 1643526d991..da92c197aba 100644
--- a/src/test/regress/sql/select_distinct.sql
+++ b/src/test/regress/sql/select_distinct.sql
@@ -180,6 +180,19 @@ SELECT DISTINCT four,1,2,3 FROM tenk1 WHERE four = 0;
 -- Ensure we only get 1 row
 SELECT DISTINCT four,1,2,3 FROM tenk1 WHERE four = 0;
 
+SET parallel_setup_cost=0;
+SET min_parallel_table_scan_size=0;
+SET max_parallel_workers_per_gather=2;
+
+-- Ensure we get a plan with a Limit 1 in both partial distinct and final
+-- distinct
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT four FROM tenk1 WHERE four = 10;
+
+RESET max_parallel_workers_per_gather;
+RESET min_parallel_table_scan_size;
+RESET parallel_setup_cost;
+
 --
 -- Also, some tests of IS DISTINCT FROM, which doesn't quite deserve its
 -- very own regression file.
author	David Rowley <drowley@postgresql.org>	2024-01-31 17:22:02 +1300
committer	David Rowley <drowley@postgresql.org>	2024-01-31 17:22:02 +1300
commit	b588cad688823b1e996ce05af4d88a954c005a3a (patch)
tree	41ec222b7f8ad9ca83b80bd9be18e8815608fc60
parent	3e91dba8b079c02dc5204108c7e797b402c75779 (diff)
download	postgresql-b588cad688823b1e996ce05af4d88a954c005a3a.tar.gz postgresql-b588cad688823b1e996ce05af4d88a954c005a3a.zip