1
0
mirror of https://github.com/postgres/postgres.git synced 2025-04-22 23:02:54 +03:00

Consider the "LIMIT 1" optimization with parallel DISTINCT

Similar to what was done in 5543677ec for non-parallel DISTINCT, apply
the same optimization when the distinct_pathkeys are empty for the
partial paths too.

This can be faster than the non-parallel version when the first row
matching the WHERE clause of the query takes a while to find.  Parallel
workers could speed that process up considerably.

Author: Richard Guo
Reviewed-by: David Rowley
Discussion: https://postgr.es/m/CAMbWs49JC0qvfUbzs-TVzgMpSSBiMJ_6sN=BaA9iohBgYkr=LA@mail.gmail.com
This commit is contained in:
David Rowley 2024-01-31 17:22:02 +13:00
parent 3e91dba8b0
commit b588cad688
3 changed files with 72 additions and 5 deletions

View File

@ -4737,6 +4737,39 @@ create_partial_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
-1.0);
}
/*
* An empty distinct_pathkeys means all tuples have the same value
* for the DISTINCT clause. See create_final_distinct_paths()
*/
if (root->distinct_pathkeys == NIL)
{
Node *limitCount;
limitCount = (Node *) makeConst(INT8OID, -1, InvalidOid,
sizeof(int64),
Int64GetDatum(1), false,
FLOAT8PASSBYVAL);
/*
* Apply a LimitPath onto the partial path to restrict the
* tuples from each worker to 1. create_final_distinct_paths
* will need to apply an additional LimitPath to restrict this
* to a single row after the Gather node. If the query
* already has a LIMIT clause, then we could end up with three
* Limit nodes in the final plan. Consolidating the top two
* of these could be done, but does not seem worth troubling
* over.
*/
add_partial_path(partial_distinct_rel, (Path *)
create_limit_path(root, partial_distinct_rel,
sorted_path,
NULL,
limitCount,
LIMIT_OPTION_COUNT,
0, 1));
}
else
{
add_partial_path(partial_distinct_rel, (Path *)
create_upper_unique_path(root, partial_distinct_rel,
sorted_path,
@ -4744,6 +4777,7 @@ create_partial_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
numDistinctRows));
}
}
}
/*
* Now try hash aggregate paths, if enabled and hashing is possible. Since

View File

@ -348,6 +348,26 @@ SELECT DISTINCT four,1,2,3 FROM tenk1 WHERE four = 0;
0 | 1 | 2 | 3
(1 row)
SET parallel_setup_cost=0;
SET min_parallel_table_scan_size=0;
SET max_parallel_workers_per_gather=2;
-- Ensure we get a plan with a Limit 1 in both partial distinct and final
-- distinct
EXPLAIN (COSTS OFF)
SELECT DISTINCT four FROM tenk1 WHERE four = 10;
QUERY PLAN
----------------------------------------------
Limit
-> Gather
Workers Planned: 2
-> Limit
-> Parallel Seq Scan on tenk1
Filter: (four = 10)
(6 rows)
RESET max_parallel_workers_per_gather;
RESET min_parallel_table_scan_size;
RESET parallel_setup_cost;
--
-- Also, some tests of IS DISTINCT FROM, which doesn't quite deserve its
-- very own regression file.

View File

@ -180,6 +180,19 @@ SELECT DISTINCT four,1,2,3 FROM tenk1 WHERE four = 0;
-- Ensure we only get 1 row
SELECT DISTINCT four,1,2,3 FROM tenk1 WHERE four = 0;
SET parallel_setup_cost=0;
SET min_parallel_table_scan_size=0;
SET max_parallel_workers_per_gather=2;
-- Ensure we get a plan with a Limit 1 in both partial distinct and final
-- distinct
EXPLAIN (COSTS OFF)
SELECT DISTINCT four FROM tenk1 WHERE four = 10;
RESET max_parallel_workers_per_gather;
RESET min_parallel_table_scan_size;
RESET parallel_setup_cost;
--
-- Also, some tests of IS DISTINCT FROM, which doesn't quite deserve its
-- very own regression file.