diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index e0192d4a491..6d6e67f84e9 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -19,6 +19,7 @@ #include "foreign/fdwapi.h" #include "miscadmin.h" #include "nodes/extensible.h" +#include "nodes/makefuncs.h" #include "optimizer/appendinfo.h" #include "optimizer/clauses.h" #include "optimizer/cost.h" @@ -27,7 +28,9 @@ #include "optimizer/paths.h" #include "optimizer/planmain.h" #include "optimizer/tlist.h" +#include "parser/parse_clause.h" #include "parser/parsetree.h" +#include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/selfuncs.h" @@ -1727,6 +1730,8 @@ UniquePath * create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, SpecialJoinInfo *sjinfo) { + List *uniq_exprs; + List *in_operators; UniquePath *pathnode; Path sort_path; /* dummy for result of cost_sort */ Path agg_path; /* dummy for result of cost_agg */ @@ -1760,6 +1765,134 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, */ oldcontext = MemoryContextSwitchTo(GetMemoryChunkContext(rel)); + /* + * First, identify the columns/expressions to be made unique along with + * the associated equality operators. We made lists of these when the + * SpecialJoinInfo was created, but that was before constructing + * EquivalenceClasses. Some of the values to be unique-ified may now be + * known redundant by the EquivalenceClass machinery (e.g., because they + * have been equated to constants). There is no need to compare such + * values during unique-ification, and indeed we had better not try + * because the Vars involved may not have propagated as high as the + * semijoin's level. We use make_pathkeys_for_sortclauses to detect such + * cases, which is a tad inefficient but it doesn't seem worth building + * specialized infrastructure for this. + * + * For a child rel, we can construct these lists from those of its parent. + */ + if (IS_OTHER_REL(rel)) + { + UniquePath *parent_path = (UniquePath *) rel->top_parent->cheapest_unique_path; + + Assert(parent_path && IsA(parent_path, UniquePath)); + uniq_exprs = (List *) + adjust_appendrel_attrs_multilevel(root, + (Node *) parent_path->uniq_exprs, + rel, + rel->top_parent); + in_operators = copyObject(parent_path->in_operators); + } + else + { + List *newtlist; + List *sortList; + ListCell *lc1; + ListCell *lc2; + + uniq_exprs = in_operators = newtlist = sortList = NIL; + forboth(lc1, sjinfo->semi_rhs_exprs, lc2, sjinfo->semi_operators) + { + Expr *uniqexpr = lfirst(lc1); + Oid in_oper = lfirst_oid(lc2); + Oid sortop; + + /* + * Try to build an ORDER BY list to sort the input compatibly. We + * do this for each sortable clause even when the clauses are not + * all sortable, so that we can detect clauses that are redundant + * according to the pathkey machinery. + */ + sortop = get_ordering_op_for_equality_op(in_oper, false); + if (OidIsValid(sortop)) + { + Oid eqop; + TargetEntry *tle; + SortGroupClause *sortcl; + List *sortPathkeys; + + /* + * The Unique node will need equality operators. Normally + * these are the same as the IN clause operators, but if those + * are cross-type operators then the equality operators are + * the ones for the IN clause operators' RHS datatype. + */ + eqop = get_equality_op_for_ordering_op(sortop, NULL); + if (!OidIsValid(eqop)) /* shouldn't happen */ + elog(ERROR, "could not find equality operator for ordering operator %u", + sortop); + + tle = makeTargetEntry((Expr *) uniqexpr, + list_length(newtlist) + 1, + NULL, + false); + newtlist = lappend(newtlist, tle); + + sortcl = makeNode(SortGroupClause); + sortcl->tleSortGroupRef = assignSortGroupRef(tle, newtlist); + sortcl->eqop = eqop; + sortcl->sortop = sortop; + sortcl->reverse_sort = false; + sortcl->nulls_first = false; + sortcl->hashable = false; /* no need to make this accurate */ + sortList = lappend(sortList, sortcl); + + /* + * At each step, convert the SortGroupClause list to pathkey + * form. If the just-added SortGroupClause is redundant, the + * result will be shorter than the SortGroupClause list. + */ + sortPathkeys = make_pathkeys_for_sortclauses(root, sortList, + newtlist); + if (list_length(sortPathkeys) != list_length(sortList)) + { + /* Drop the redundant SortGroupClause */ + sortList = list_delete_last(sortList); + Assert(list_length(sortPathkeys) == list_length(sortList)); + /* Undo tlist addition too */ + newtlist = list_delete_last(newtlist); + /* Don't need this column */ + continue; + } + } + else if (sjinfo->semi_can_btree) /* shouldn't happen */ + elog(ERROR, "could not find ordering operator for equality operator %u", + in_oper); + + /* + * We need to include this column in the output list. + * + * Under GEQO and when planning child joins, the sjinfo might be + * short-lived, so we'd better make copies of data structures we + * extract from it. + */ + uniq_exprs = lappend(uniq_exprs, copyObject(uniqexpr)); + in_operators = lappend_oid(in_operators, in_oper); + } + + /* + * It can happen that all the RHS columns are equated to constants. + * We'd have to do something special to unique-ify in that case, and + * it's such an unlikely-in-the-real-world case that it's not worth + * the effort. So just punt if we found no columns to unique-ify. + */ + if (uniq_exprs == NIL) + { + MemoryContextSwitchTo(oldcontext); + return NULL; + } + } + + /* OK, build the path node */ pathnode = makeNode(UniquePath); pathnode->path.pathtype = T_Unique; @@ -1778,14 +1911,8 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, pathnode->path.pathkeys = NIL; pathnode->subpath = subpath; - - /* - * Under GEQO and when planning child joins, the sjinfo might be - * short-lived, so we'd better make copies of data structures we extract - * from it. - */ - pathnode->in_operators = copyObject(sjinfo->semi_operators); - pathnode->uniq_exprs = copyObject(sjinfo->semi_rhs_exprs); + pathnode->in_operators = in_operators; + pathnode->uniq_exprs = uniq_exprs; /* * If the input is a relation and it has a unique index that proves the @@ -1795,8 +1922,8 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, */ if (rel->rtekind == RTE_RELATION && sjinfo->semi_can_btree && relation_has_unique_index_for(root, rel, NIL, - sjinfo->semi_rhs_exprs, - sjinfo->semi_operators)) + uniq_exprs, + in_operators)) { pathnode->umethod = UNIQUE_PATH_NOOP; pathnode->path.rows = rel->rows; @@ -1829,13 +1956,13 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, { List *sub_tlist_colnos; - sub_tlist_colnos = translate_sub_tlist(sjinfo->semi_rhs_exprs, + sub_tlist_colnos = translate_sub_tlist(uniq_exprs, rel->relid); if (sub_tlist_colnos && query_is_distinct_for(rte->subquery, sub_tlist_colnos, - sjinfo->semi_operators)) + in_operators)) { pathnode->umethod = UNIQUE_PATH_NOOP; pathnode->path.rows = rel->rows; @@ -1855,11 +1982,11 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, /* Estimate number of output rows */ pathnode->path.rows = estimate_num_groups(root, - sjinfo->semi_rhs_exprs, + uniq_exprs, rel->rows, NULL, NULL); - numCols = list_length(sjinfo->semi_rhs_exprs); + numCols = list_length(uniq_exprs); if (sjinfo->semi_can_btree) { diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index 1f1ce2380af..4cfbe424603 100644 --- a/src/test/regress/expected/aggregates.out +++ b/src/test/regress/expected/aggregates.out @@ -3379,26 +3379,6 @@ select v||'a', case when v||'a' = 'aa' then 1 else 0 end, count(*) ba | 0 | 1 (2 rows) --- Make sure that generation of HashAggregate for uniqification purposes --- does not lead to array overflow due to unexpected duplicate hash keys --- see CAFeeJoKKu0u+A_A9R9316djW-YW3-+Gtgvy3ju655qRHR3jtdA@mail.gmail.com -set enable_memoize to off; -explain (costs off) - select 1 from tenk1 - where (hundred, thousand) in (select twothousand, twothousand from onek); - QUERY PLAN -------------------------------------------------------------- - Hash Join - Hash Cond: (tenk1.hundred = onek.twothousand) - -> Seq Scan on tenk1 - Filter: (hundred = thousand) - -> Hash - -> HashAggregate - Group Key: onek.twothousand, onek.twothousand - -> Seq Scan on onek -(8 rows) - -reset enable_memoize; -- -- Hash Aggregation Spill tests -- diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index 390aabfb34b..a30ff88eef8 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -3222,6 +3222,24 @@ where b.unique2 is null; -> Index Only Scan using tenk1_unique2 on tenk1 b (5 rows) +-- check that we avoid de-duplicating columns redundantly +set enable_memoize to off; +explain (costs off) +select 1 from tenk1 +where (hundred, thousand) in (select twothousand, twothousand from onek); + QUERY PLAN +------------------------------------------------- + Hash Join + Hash Cond: (tenk1.hundred = onek.twothousand) + -> Seq Scan on tenk1 + Filter: (hundred = thousand) + -> Hash + -> HashAggregate + Group Key: onek.twothousand + -> Seq Scan on onek +(8 rows) + +reset enable_memoize; -- -- regression test for bogus RTE_GROUP entries -- @@ -6500,6 +6518,68 @@ where t1.a = s.c; ---------- (0 rows) +rollback; +-- check handling of semijoins after join removal: we must suppress +-- unique-ification of known-constant values +begin; +create temp table t (a int unique, b int); +insert into t values (1, 2); +explain (verbose, costs off) +select t1.a from t t1 + left join t t2 on t1.a = t2.a + join t t3 on true +where exists (select 1 from t t4 + join t t5 on t4.b = t5.b + join t t6 on t5.b = t6.b + where t1.a = t4.a and t3.a = t5.a and t4.a = 1); + QUERY PLAN +------------------------------------------------------------------------------------ + Nested Loop + Output: t1.a + Inner Unique: true + -> Nested Loop + Output: t1.a, t5.a + -> Index Only Scan using t_a_key on pg_temp.t t1 + Output: t1.a + Index Cond: (t1.a = 1) + -> HashAggregate + Output: t5.a + Group Key: t5.a + -> Hash Join + Output: t5.a + Hash Cond: (t6.b = t4.b) + -> Seq Scan on pg_temp.t t6 + Output: t6.a, t6.b + -> Hash + Output: t4.b, t5.b, t5.a + -> Hash Join + Output: t4.b, t5.b, t5.a + Inner Unique: true + Hash Cond: (t5.b = t4.b) + -> Seq Scan on pg_temp.t t5 + Output: t5.a, t5.b + -> Hash + Output: t4.b, t4.a + -> Index Scan using t_a_key on pg_temp.t t4 + Output: t4.b, t4.a + Index Cond: (t4.a = 1) + -> Index Only Scan using t_a_key on pg_temp.t t3 + Output: t3.a + Index Cond: (t3.a = t5.a) +(32 rows) + +select t1.a from t t1 + left join t t2 on t1.a = t2.a + join t t3 on true +where exists (select 1 from t t4 + join t t5 on t4.b = t5.b + join t t6 on t5.b = t6.b + where t1.a = t4.a and t3.a = t5.a and t4.a = 1); + a +--- + 1 +(1 row) + rollback; -- test cases where we can remove a join, but not a PHV computed at it begin; diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql index 277b4b198cc..79eca85c985 100644 --- a/src/test/regress/sql/aggregates.sql +++ b/src/test/regress/sql/aggregates.sql @@ -1505,15 +1505,6 @@ select v||'a', case when v||'a' = 'aa' then 1 else 0 end, count(*) from unnest(array['a','b']) u(v) group by v||'a' order by 1; --- Make sure that generation of HashAggregate for uniqification purposes --- does not lead to array overflow due to unexpected duplicate hash keys --- see CAFeeJoKKu0u+A_A9R9316djW-YW3-+Gtgvy3ju655qRHR3jtdA@mail.gmail.com -set enable_memoize to off; -explain (costs off) - select 1 from tenk1 - where (hundred, thousand) in (select twothousand, twothousand from onek); -reset enable_memoize; - -- -- Hash Aggregation Spill tests -- diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index f6e7070db65..23e220afcd2 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -839,6 +839,13 @@ explain (costs off) select a.* from tenk1 a left join tenk1 b on a.unique1 = b.unique2 where b.unique2 is null; +-- check that we avoid de-duplicating columns redundantly +set enable_memoize to off; +explain (costs off) +select 1 from tenk1 +where (hundred, thousand) in (select twothousand, twothousand from onek); +reset enable_memoize; + -- -- regression test for bogus RTE_GROUP entries -- @@ -2420,6 +2427,32 @@ where t1.a = s.c; rollback; +-- check handling of semijoins after join removal: we must suppress +-- unique-ification of known-constant values +begin; + +create temp table t (a int unique, b int); +insert into t values (1, 2); + +explain (verbose, costs off) +select t1.a from t t1 + left join t t2 on t1.a = t2.a + join t t3 on true +where exists (select 1 from t t4 + join t t5 on t4.b = t5.b + join t t6 on t5.b = t6.b + where t1.a = t4.a and t3.a = t5.a and t4.a = 1); + +select t1.a from t t1 + left join t t2 on t1.a = t2.a + join t t3 on true +where exists (select 1 from t t4 + join t t5 on t4.b = t5.b + join t t6 on t5.b = t6.b + where t1.a = t4.a and t3.a = t5.a and t4.a = 1); + +rollback; + -- test cases where we can remove a join, but not a PHV computed at it begin;