1
0
mirror of https://github.com/postgres/postgres.git synced 2025-08-21 10:42:50 +03:00

Implement GROUP BY DISTINCT

With grouping sets, it's possible that some of the grouping sets are
duplicates.  This is especially common with CUBE and ROLLUP clauses. For
example GROUP BY CUBE (a,b), CUBE (b,c) is equivalent to

  GROUP BY GROUPING SETS (
    (a, b, c),
    (a, b, c),
    (a, b, c),
    (a, b),
    (a, b),
    (a, b),
    (a),
    (a),
    (a),
    (c, a),
    (c, a),
    (c, a),
    (c),
    (b, c),
    (b),
    ()
  )

Some of the grouping sets are calculated multiple times, which is mostly
unnecessary.  This commit implements a new GROUP BY DISTINCT feature, as
defined in the SQL standard, which eliminates the duplicate sets.

Author: Vik Fearing
Reviewed-by: Erik Rijkers, Georgios Kokolatos, Tomas Vondra
Discussion: https://postgr.es/m/bf3805a8-d7d1-ae61-fece-761b7ff41ecc@postgresfriends.org
This commit is contained in:
Tomas Vondra
2021-03-18 17:45:38 +01:00
parent cd91de0d17
commit be45be9c33
18 changed files with 333 additions and 27 deletions

View File

@@ -1071,7 +1071,7 @@ parseCheckAggregates(ParseState *pstate, Query *qry)
* The limit of 4096 is arbitrary and exists simply to avoid resource
* issues from pathological constructs.
*/
List *gsets = expand_grouping_sets(qry->groupingSets, 4096);
List *gsets = expand_grouping_sets(qry->groupingSets, qry->groupDistinct, 4096);
if (!gsets)
ereport(ERROR,
@@ -1735,6 +1735,33 @@ cmp_list_len_asc(const ListCell *a, const ListCell *b)
return (la > lb) ? 1 : (la == lb) ? 0 : -1;
}
/*
 * list_sort comparator to sort sub-lists by length and contents
 *
 * Each cell of the list being sorted holds a pointer to an integer list.
 * Sub-lists are ordered by length first (via cmp_list_len_asc); sub-lists of
 * equal length are then ordered by comparing their elements pairwise, from
 * the first element on.  Returns 1, 0 or -1 in the usual comparator fashion;
 * 0 means the two sub-lists have the same length and the same elements in
 * the same positions.
 *
 * The element comparison is only meaningful as a set comparison if each
 * sub-list has already been sorted by the caller.
 */
static int
cmp_list_len_contents_asc(const ListCell *a, const ListCell *b)
{
	int			res = cmp_list_len_asc(a, b);

	if (res == 0)
	{
		List	   *la = (List *) lfirst(a);
		List	   *lb = (List *) lfirst(b);
		ListCell   *lca;
		ListCell   *lcb;

		/* Lengths are equal here, so forboth visits every element pair. */
		forboth(lca, la, lcb, lb)
		{
			/*
			 * The sub-lists are integer lists, so the values live directly
			 * in the cells and must be fetched with lfirst_int().  intVal()
			 * is for Value nodes; applying it to a ListCell pointer would
			 * reinterpret the cell storage as a node and compare garbage.
			 */
			int			va = lfirst_int(lca);
			int			vb = lfirst_int(lcb);

			if (va > vb)
				return 1;
			if (va < vb)
				return -1;
		}
	}

	return res;
}
/*
* Expand a groupingSets clause to a flat list of grouping sets.
* The returned list is sorted by length, shortest sets first.
@@ -1743,7 +1770,7 @@ cmp_list_len_asc(const ListCell *a, const ListCell *b)
* some consistency checks.
*/
List *
expand_grouping_sets(List *groupingSets, int limit)
expand_grouping_sets(List *groupingSets, bool groupDistinct, int limit)
{
List *expanded_groups = NIL;
List *result = NIL;
@@ -1801,8 +1828,31 @@ expand_grouping_sets(List *groupingSets, int limit)
result = new_result;
}
/* Now sort the lists by length */
list_sort(result, cmp_list_len_asc);
/* Now sort the lists by length and deduplicate if necessary */
if (!groupDistinct || list_length(result) < 2)
list_sort(result, cmp_list_len_asc);
else
{
ListCell *cell;
List *prev;
/* Sort each groupset individually */
foreach(cell, result)
list_sort(lfirst(cell), list_int_cmp);
/* Now sort the list of groupsets by length and contents */
list_sort(result, cmp_list_len_contents_asc);
/* Finally, remove duplicates */
prev = list_nth_node(List, result, 0);
for_each_from(cell, result, 1)
{
if (equal(lfirst(cell), prev))
foreach_delete_current(result, cell);
else
prev = lfirst(cell);
}
}
return result;
}