Remove GROUP BY columns that are functionally dependent on other columns.

If a GROUP BY clause includes all columns of a non-deferred primary key, as well as other columns of the same relation, those other columns are redundant and can be dropped from the grouping; the pkey is enough to ensure that each row of the table corresponds to a separate group. Getting rid of the excess columns will reduce the cost of the sorting or hashing needed to implement GROUP BY, and can indeed remove the need for a sort step altogether. This seems worth testing for since many query authors are not aware of the GROUP-BY-primary-key exception to the rule about queries not being allowed to reference non-grouped-by columns in their targetlists or HAVING clauses. Thus, redundant GROUP BY items are not uncommon. Also, we can make the test pretty cheap in most queries where it won't help by not looking up a rel's primary key until we've found that at least two of its columns are in GROUP BY. David Rowley, reviewed by Julien Rouhaud
2025-10-16 17:07:43 +03:00 · 2016-02-11 17:34:59 -05:00
parent 72eee410d4
commit d4c3a156cb
7 changed files with 360 additions and 5 deletions
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -856,6 +856,70 @@ ERROR:  aggregate function calls cannot be nested
 LINE 1: select (select max(min(unique1)) from int8_tbl) from tenk1;
                           ^
 --
+-- Test removal of redundant GROUP BY columns
+--
+create temp table t1 (a int, b int, c int, d int, primary key (a, b));
+create temp table t2 (x int, y int, z int, primary key (x, y));
+create temp table t3 (a int, b int, c int, primary key(a, b) deferrable);
+-- Non-primary-key columns can be removed from GROUP BY
+explain (costs off) select * from t1 group by a,b,c,d;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: a, b
+   ->  Seq Scan on t1
+(3 rows)
+
+-- No removal can happen if the complete PK is not present in GROUP BY
+explain (costs off) select a,c from t1 group by a,c,d;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: a, c, d
+   ->  Seq Scan on t1
+(3 rows)
+
+-- Test removal across multiple relations
+explain (costs off) select *
+from t1 inner join t2 on t1.a = t2.x and t1.b = t2.y
+group by t1.a,t1.b,t1.c,t1.d,t2.x,t2.y,t2.z;
+                      QUERY PLAN                       
+-------------------------------------------------------
+ Group
+   Group Key: t1.a, t1.b, t2.x, t2.y
+   ->  Merge Join
+         Merge Cond: ((t1.a = t2.x) AND (t1.b = t2.y))
+         ->  Index Scan using t1_pkey on t1
+         ->  Index Scan using t2_pkey on t2
+(6 rows)
+
+-- Test case where t1 can be optimized but not t2
+explain (costs off) select t1.*,t2.x,t2.z
+from t1 inner join t2 on t1.a = t2.x and t1.b = t2.y
+group by t1.a,t1.b,t1.c,t1.d,t2.x,t2.z;
+                      QUERY PLAN                       
+-------------------------------------------------------
+ HashAggregate
+   Group Key: t1.a, t1.b, t2.x, t2.z
+   ->  Merge Join
+         Merge Cond: ((t1.a = t2.x) AND (t1.b = t2.y))
+         ->  Index Scan using t1_pkey on t1
+         ->  Index Scan using t2_pkey on t2
+(6 rows)
+
+-- Cannot optimize when PK is deferrable
+explain (costs off) select * from t3 group by a,b,c;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: a, b, c
+   ->  Seq Scan on t3
+(3 rows)
+
+drop table t1;
+drop table t2;
+drop table t3;
+--
 -- Test combinations of DISTINCT and/or ORDER BY
 --
 select array_agg(a order by b)
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -3943,12 +3943,14 @@ select d.* from d left join (select distinct * from b) s
 (1 row)

 -- join removal is not possible when the GROUP BY contains a column that is
-- not in the join condition
+-- not in the join condition.  (Note: as of 9.6, we notice that b.id is a
+-- primary key and so drop b.c_id from the GROUP BY of the resulting plan;
+-- but this happens too late for join removal in the outer plan level.)
 explain (costs off)
 select d.* from d left join (select * from b group by b.id, b.c_id) s
  on d.a = s.id;
-                 QUERY PLAN                  
---------------------------------------------
+              QUERY PLAN               
+---------------------------------------
 Merge Left Join
   Merge Cond: (d.a = s.id)
   ->  Sort
@@ -3958,7 +3960,7 @@ select d.* from d left join (select * from b group by b.id, b.c_id) s
         Sort Key: s.id
         ->  Subquery Scan on s
               ->  HashAggregate
-                     Group Key: b.id, b.c_id
+                     Group Key: b.id
                     ->  Seq Scan on b
 (11 rows)

--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -300,6 +300,37 @@ drop table minmaxtest cascade;
 select max(min(unique1)) from tenk1;
 select (select max(min(unique1)) from int8_tbl) from tenk1;

+--
+-- Test removal of redundant GROUP BY columns
+--
+
+create temp table t1 (a int, b int, c int, d int, primary key (a, b));
+create temp table t2 (x int, y int, z int, primary key (x, y));
+create temp table t3 (a int, b int, c int, primary key(a, b) deferrable);
+
+-- Non-primary-key columns can be removed from GROUP BY
+explain (costs off) select * from t1 group by a,b,c,d;
+
+-- No removal can happen if the complete PK is not present in GROUP BY
+explain (costs off) select a,c from t1 group by a,c,d;
+
+-- Test removal across multiple relations
+explain (costs off) select *
+from t1 inner join t2 on t1.a = t2.x and t1.b = t2.y
+group by t1.a,t1.b,t1.c,t1.d,t2.x,t2.y,t2.z;
+
+-- Test case where t1 can be optimized but not t2
+explain (costs off) select t1.*,t2.x,t2.z
+from t1 inner join t2 on t1.a = t2.x and t1.b = t2.y
+group by t1.a,t1.b,t1.c,t1.d,t2.x,t2.z;
+
+-- Cannot optimize when PK is deferrable
+explain (costs off) select * from t3 group by a,b,c;
+
+drop table t1;
+drop table t2;
+drop table t3;
+
 --
 -- Test combinations of DISTINCT and/or ORDER BY
 --
--- a/src/test/regress/sql/join.sql
+++ b/src/test/regress/sql/join.sql
@@ -1279,7 +1279,9 @@ select d.* from d left join (select distinct * from b) s
  on d.a = s.id and d.b = s.c_id;

 -- join removal is not possible when the GROUP BY contains a column that is
-- not in the join condition
+-- not in the join condition.  (Note: as of 9.6, we notice that b.id is a
+-- primary key and so drop b.c_id from the GROUP BY of the resulting plan;
+-- but this happens too late for join removal in the outer plan level.)
 explain (costs off)
 select d.* from d left join (select * from b group by b.id, b.c_id) s
  on d.a = s.id;