1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-02 09:02:37 +03:00

Remove GROUP BY columns that are functionally dependent on other columns.

If a GROUP BY clause includes all columns of a non-deferred primary key,
as well as other columns of the same relation, those other columns are
redundant and can be dropped from the grouping; the pkey is enough to
ensure that each row of the table corresponds to a separate group.
Getting rid of the excess columns will reduce the cost of the sorting or
hashing needed to implement GROUP BY, and can indeed remove the need for
a sort step altogether.

This seems worth testing for since many query authors are not aware of
the GROUP-BY-primary-key exception to the rule about queries not being
allowed to reference non-grouped-by columns in their targetlists or
HAVING clauses.  Thus, redundant GROUP BY items are not uncommon.  Also,
we can make the test pretty cheap in most queries where it won't help
by not looking up a rel's primary key until we've found that at least
two of its columns are in GROUP BY.

David Rowley, reviewed by Julien Rouhaud
This commit is contained in:
Tom Lane
2016-02-11 17:34:59 -05:00
parent 72eee410d4
commit d4c3a156cb
7 changed files with 360 additions and 5 deletions

View File

@ -856,6 +856,70 @@ ERROR: aggregate function calls cannot be nested
LINE 1: select (select max(min(unique1)) from int8_tbl) from tenk1;
^
--
-- Test removal of redundant GROUP BY columns
--
create temp table t1 (a int, b int, c int, d int, primary key (a, b));
create temp table t2 (x int, y int, z int, primary key (x, y));
create temp table t3 (a int, b int, c int, primary key(a, b) deferrable);
-- Non-primary-key columns can be removed from GROUP BY
explain (costs off) select * from t1 group by a,b,c,d;
QUERY PLAN
----------------------
HashAggregate
Group Key: a, b
-> Seq Scan on t1
(3 rows)
-- No removal can happen if the complete PK is not present in GROUP BY
explain (costs off) select a,c from t1 group by a,c,d;
QUERY PLAN
----------------------
HashAggregate
Group Key: a, c, d
-> Seq Scan on t1
(3 rows)
-- Test removal across multiple relations
explain (costs off) select *
from t1 inner join t2 on t1.a = t2.x and t1.b = t2.y
group by t1.a,t1.b,t1.c,t1.d,t2.x,t2.y,t2.z;
QUERY PLAN
-------------------------------------------------------
Group
Group Key: t1.a, t1.b, t2.x, t2.y
-> Merge Join
Merge Cond: ((t1.a = t2.x) AND (t1.b = t2.y))
-> Index Scan using t1_pkey on t1
-> Index Scan using t2_pkey on t2
(6 rows)
-- Test case where t1 can be optimized but not t2
explain (costs off) select t1.*,t2.x,t2.z
from t1 inner join t2 on t1.a = t2.x and t1.b = t2.y
group by t1.a,t1.b,t1.c,t1.d,t2.x,t2.z;
QUERY PLAN
-------------------------------------------------------
HashAggregate
Group Key: t1.a, t1.b, t2.x, t2.z
-> Merge Join
Merge Cond: ((t1.a = t2.x) AND (t1.b = t2.y))
-> Index Scan using t1_pkey on t1
-> Index Scan using t2_pkey on t2
(6 rows)
-- Cannot optimize when PK is deferrable
explain (costs off) select * from t3 group by a,b,c;
QUERY PLAN
----------------------
HashAggregate
Group Key: a, b, c
-> Seq Scan on t3
(3 rows)
drop table t1;
drop table t2;
drop table t3;
--
-- Test combinations of DISTINCT and/or ORDER BY
--
select array_agg(a order by b)

View File

@ -3943,12 +3943,14 @@ select d.* from d left join (select distinct * from b) s
(1 row)
-- join removal is not possible when the GROUP BY contains a column that is
-- not in the join condition
-- not in the join condition. (Note: as of 9.6, we notice that b.id is a
-- primary key and so drop b.c_id from the GROUP BY of the resulting plan;
-- but this happens too late for join removal in the outer plan level.)
explain (costs off)
select d.* from d left join (select * from b group by b.id, b.c_id) s
on d.a = s.id;
QUERY PLAN
---------------------------------------------
QUERY PLAN
---------------------------------------
Merge Left Join
Merge Cond: (d.a = s.id)
-> Sort
@ -3958,7 +3960,7 @@ select d.* from d left join (select * from b group by b.id, b.c_id) s
Sort Key: s.id
-> Subquery Scan on s
-> HashAggregate
Group Key: b.id, b.c_id
Group Key: b.id
-> Seq Scan on b
(11 rows)

View File

@ -300,6 +300,37 @@ drop table minmaxtest cascade;
select max(min(unique1)) from tenk1;
select (select max(min(unique1)) from int8_tbl) from tenk1;
--
-- Test removal of redundant GROUP BY columns
--
create temp table t1 (a int, b int, c int, d int, primary key (a, b));
create temp table t2 (x int, y int, z int, primary key (x, y));
create temp table t3 (a int, b int, c int, primary key(a, b) deferrable);
-- Non-primary-key columns can be removed from GROUP BY
explain (costs off) select * from t1 group by a,b,c,d;
-- No removal can happen if the complete PK is not present in GROUP BY
explain (costs off) select a,c from t1 group by a,c,d;
-- Test removal across multiple relations
explain (costs off) select *
from t1 inner join t2 on t1.a = t2.x and t1.b = t2.y
group by t1.a,t1.b,t1.c,t1.d,t2.x,t2.y,t2.z;
-- Test case where t1 can be optimized but not t2
explain (costs off) select t1.*,t2.x,t2.z
from t1 inner join t2 on t1.a = t2.x and t1.b = t2.y
group by t1.a,t1.b,t1.c,t1.d,t2.x,t2.z;
-- Cannot optimize when PK is deferrable
explain (costs off) select * from t3 group by a,b,c;
drop table t1;
drop table t2;
drop table t3;
--
-- Test combinations of DISTINCT and/or ORDER BY
--

View File

@ -1279,7 +1279,9 @@ select d.* from d left join (select distinct * from b) s
on d.a = s.id and d.b = s.c_id;
-- join removal is not possible when the GROUP BY contains a column that is
-- not in the join condition
-- not in the join condition. (Note: as of 9.6, we notice that b.id is a
-- primary key and so drop b.c_id from the GROUP BY of the resulting plan;
-- but this happens too late for join removal in the outer plan level.)
explain (costs off)
select d.* from d left join (select * from b group by b.id, b.c_id) s
on d.a = s.id;