Remove redundant grouping and DISTINCT columns.

Avoid explicitly grouping by columns that we know are redundant for sorting, for example we need group by only one of x and y in SELECT ... WHERE x = y GROUP BY x, y This comes up more often than you might think, as shown by the changes in the regression tests. It's nearly free to detect too, since we are just piggybacking on the existing logic that detects redundant pathkeys. (In some of the existing plans that change, it's visible that a sort step preceding the grouping step already didn't bother to sort by the redundant column, making the old plan a bit silly-looking.) To do this, build processed_groupClause and processed_distinctClause lists that omit any provably-redundant sort items, and consult those not the originals where relevant. This means that within the planner, one should usually consult root->processed_groupClause or root->processed_distinctClause if one wants to know which columns are to be grouped on; but to check whether grouping or distinct-ing is happening at all, check non-NIL-ness of parse->groupClause or parse->distinctClause. This is comparable to longstanding rules about handling the HAVING clause, so I don't think it'll be a huge maintenance problem. nodeAgg.c also needs minor mods, because it's now possible to generate AGG_PLAIN and AGG_SORTED Agg nodes with zero grouping columns. Patch by me; thanks to Richard Guo and David Rowley for review. Discussion: https://postgr.es/m/185315.1672179489@sss.pgh.pa.us
2025-07-27 12:41:57 +03:00 · 2023-01-18 12:37:57 -05:00
parent d540a02a72
commit 8d83a5d0a2
16 changed files with 303 additions and 171 deletions
--- a/contrib/postgres_fdw/deparse.c
+++ b/contrib/postgres_fdw/deparse.c
@ -3736,6 +3736,13 @@ appendGroupByClause(List *tlist, deparse_expr_cxt *context)
 	 */
 	Assert(!query->groupingSets);

+	/*
+	 * We intentionally print query->groupClause not processed_groupClause,
+	 * leaving it to the remote planner to get rid of any redundant GROUP BY
+	 * items again.  This is necessary in case processed_groupClause reduced
+	 * to empty, and in any case the redundancy situation on the remote might
+	 * be different than what we think here.
+	 */
 	foreach(lc, query->groupClause)
 	{
 		SortGroupClause *grp = (SortGroupClause *) lfirst(lc);
--- a/contrib/postgres_fdw/expected/postgres_fdw.out
+++ b/contrib/postgres_fdw/expected/postgres_fdw.out
@ -2293,7 +2293,7 @@ SELECT t1."C 1" FROM "S 1"."T 1" t1, LATERAL (SELECT DISTINCT t2.c1, t3.c1 FROM
               ->  Subquery Scan on q
                     ->  HashAggregate
                           Output: t2.c1, t3.c1
-                           Group Key: t2.c1, t3.c1
+                           Group Key: t2.c1
                           ->  Foreign Scan
                                 Output: t2.c1, t3.c1
                                 Relations: (public.ft1 t2) INNER JOIN (public.ft2 t3)
@ -2864,16 +2864,13 @@ select c2 * (random() <= 1)::int as c2 from ft2 group by c2 * (random() <= 1)::i
 -- GROUP BY clause in various forms, cardinal, alias and constant expression
 explain (verbose, costs off)
 select count(c2) w, c2 x, 5 y, 7.0 z from ft1 group by 2, y, 9.0::int order by 2;
-                                      QUERY PLAN                                       
---------------------------------------------------------------------------------------
- Sort
+                                                 QUERY PLAN                                                 
+------------------------------------------------------------------------------------------------------------
+ Foreign Scan
   Output: (count(c2)), c2, 5, 7.0, 9
-   Sort Key: ft1.c2
-   ->  Foreign Scan
-         Output: (count(c2)), c2, 5, 7.0, 9
-         Relations: Aggregate on (public.ft1)
-         Remote SQL: SELECT count(c2), c2, 5, 7.0, 9 FROM "S 1"."T 1" GROUP BY 2, 3, 5
-(7 rows)
+   Relations: Aggregate on (public.ft1)
+   Remote SQL: SELECT count(c2), c2, 5, 7.0, 9 FROM "S 1"."T 1" GROUP BY 2, 3, 5 ORDER BY c2 ASC NULLS LAST
+(4 rows)

 select count(c2) w, c2 x, 5 y, 7.0 z from ft1 group by 2, y, 9.0::int order by 2;
  w  | x | y |  z  
@ -2990,14 +2987,13 @@ select exists(select 1 from pg_enum), sum(c1) from ft1 group by 1;
                    QUERY PLAN                     
 ---------------------------------------------------
 GroupAggregate
-   Output: ($0), sum(ft1.c1)
-   Group Key: $0
+   Output: $0, sum(ft1.c1)
   InitPlan 1 (returns $0)
     ->  Seq Scan on pg_catalog.pg_enum
   ->  Foreign Scan on public.ft1
-         Output: $0, ft1.c1
+         Output: ft1.c1
         Remote SQL: SELECT "C 1" FROM "S 1"."T 1"
-(8 rows)
+(7 rows)

 select exists(select 1 from pg_enum), sum(c1) from ft1 group by 1;
 exists |  sum   
@ -3382,14 +3378,13 @@ select array_agg(c1 order by c1 using operator(public.<^)) from ft2 where c2 = 6
 --------------------------------------------------------------------------------------------------
 GroupAggregate
   Output: array_agg(c1 ORDER BY c1 USING <^ NULLS LAST), c2
-   Group Key: ft2.c2
   ->  Sort
-         Output: c2, c1
+         Output: c1, c2
         Sort Key: ft2.c1 USING <^
         ->  Foreign Scan on public.ft2
-               Output: c2, c1
+               Output: c1, c2
               Remote SQL: SELECT "C 1", c2 FROM "S 1"."T 1" WHERE (("C 1" < 100)) AND ((c2 = 6))
-(9 rows)
+(8 rows)

 -- This should not be pushed either.
 explain (verbose, costs off)
@ -3458,14 +3453,13 @@ select array_agg(c1 order by c1 using operator(public.<^)) from ft2 where c2 = 6
 --------------------------------------------------------------------------------------------------
 GroupAggregate
   Output: array_agg(c1 ORDER BY c1 USING <^ NULLS LAST), c2
-   Group Key: ft2.c2
   ->  Sort
-         Output: c2, c1
+         Output: c1, c2
         Sort Key: ft2.c1 USING <^
         ->  Foreign Scan on public.ft2
-               Output: c2, c1
+               Output: c1, c2
               Remote SQL: SELECT "C 1", c2 FROM "S 1"."T 1" WHERE (("C 1" < 100)) AND ((c2 = 6))
-(9 rows)
+(8 rows)

 -- Cleanup
 drop operator class my_op_class using btree;
--- a/contrib/postgres_fdw/postgres_fdw.c
+++ b/contrib/postgres_fdw/postgres_fdw.c
@ -3345,9 +3345,9 @@ estimate_path_cost_size(PlannerInfo *root,
 			}

 			/* Get number of grouping columns and possible number of groups */
-			numGroupCols = list_length(root->parse->groupClause);
+			numGroupCols = list_length(root->processed_groupClause);
 			numGroups = estimate_num_groups(root,
-											get_sortgrouplist_exprs(root->parse->groupClause,
+											get_sortgrouplist_exprs(root->processed_groupClause,
 																	fpinfo->grouped_tlist),
 											input_rows, NULL, NULL);

@ -3636,7 +3636,7 @@ adjust_foreign_grouping_path_cost(PlannerInfo *root,
 	 * pathkeys, adjust the costs with that function.  Otherwise, adjust the
 	 * costs by applying the same heuristic as for the scan or join case.
 	 */
-	if (!grouping_is_sortable(root->parse->groupClause) ||
+	if (!grouping_is_sortable(root->processed_groupClause) ||
 		!pathkeys_contained_in(pathkeys, root->group_pathkeys))
 	{
 		Path		sort_path;	/* dummy for result of cost_sort */
@ -6415,7 +6415,11 @@ foreign_grouping_ok(PlannerInfo *root, RelOptInfo *grouped_rel,
 		Index		sgref = get_pathtarget_sortgroupref(grouping_target, i);
 		ListCell   *l;

-		/* Check whether this expression is part of GROUP BY clause */
+		/*
+		 * Check whether this expression is part of GROUP BY clause.  Note we
+		 * check the whole GROUP BY clause not just processed_groupClause,
+		 * because we will ship all of it, cf. appendGroupByClause.
+		 */
 		if (sgref && get_sortgroupref_clause_noerr(sgref, query->groupClause))
 		{
 			TargetEntry *tle;