1
0
mirror of https://github.com/postgres/postgres.git synced 2025-12-19 17:02:53 +03:00

Pathify RHS unique-ification for semijoin planning

There are two implementation techniques for semijoins: one uses the
JOIN_SEMI jointype, where the executor emits at most one matching row
per left-hand side (LHS) row; the other unique-ifies the right-hand
side (RHS) and then performs a plain inner join.

The latter technique currently has some drawbacks related to the
unique-ification step.

* Only the cheapest-total path of the RHS is considered during
unique-ification.  This may cause us to miss some optimization
opportunities; for example, a path with a better sort order might be
overlooked simply because it is not the cheapest in total cost.  Such
a path could help avoid a sort at a higher level, potentially
resulting in a cheaper overall plan.

* We currently rely on heuristics to choose between hash-based and
sort-based unique-ification.  A better approach would be to generate
paths for both methods and allow add_path() to decide which one is
preferable, consistent with how path selection is handled elsewhere in
the planner.

* In the sort-based implementation, we currently pay no attention to
the pathkeys of the input subpath or the resulting output.  This can
result in redundant sort nodes being added to the final plan.

This patch improves semijoin planning by creating a new RelOptInfo for
the RHS rel to represent its unique-ified version.  It then generates
multiple paths that represent elimination of distinct rows from the
RHS, considering both a hash-based implementation using the cheapest
total path of the original RHS rel, and sort-based implementations
that either exploit presorted input paths or explicitly sort the
cheapest total path.  All resulting paths compete in add_path(), and
those deemed worthy of consideration are added to the new RelOptInfo.
Finally, the unique-ified rel is joined with the other side of the
semijoin using a plain inner join.

As a side effect, most of the code related to the JOIN_UNIQUE_OUTER
and JOIN_UNIQUE_INNER jointypes -- used to indicate that the LHS or
RHS path should be made unique -- has been removed.  Besides, the
T_Unique path now has the same meaning for both semijoins and upper
DISTINCT clauses: it represents adjacent-duplicate removal on
presorted input.  This patch unifies their handling by sharing the
same data structures and functions.

This patch also removes the UNIQUE_PATH_NOOP related code along the
way, as it is dead code -- if the RHS rel is provably unique, the
semijoin should have already been simplified to a plain inner join by
analyzejoins.c.

Author: Richard Guo <guofenglinux@gmail.com>
Reviewed-by: Alexandra Wang <alexandra.wang.oss@gmail.com>
Reviewed-by: wenhui qiu <qiuwenhuifx@gmail.com>
Discussion: https://postgr.es/m/CAMbWs4-EBnaRvEs7frTLbsXiweSTUXifsteF-d3rvv01FKO86w@mail.gmail.com
This commit is contained in:
Richard Guo
2025-08-19 09:35:40 +09:00
parent 3c07944d04
commit 24225ad9aa
18 changed files with 1074 additions and 971 deletions

View File

@@ -9468,23 +9468,20 @@ where exists (select 1 from tenk1 t3
---------------------------------------------------------------------------------
Nested Loop
Output: t1.unique1, t2.hundred
-> Hash Join
-> Merge Join
Output: t1.unique1, t3.tenthous
Hash Cond: (t3.thousand = t1.unique1)
-> HashAggregate
Merge Cond: (t3.thousand = t1.unique1)
-> Unique
Output: t3.thousand, t3.tenthous
Group Key: t3.thousand, t3.tenthous
-> Index Only Scan using tenk1_thous_tenthous on public.tenk1 t3
Output: t3.thousand, t3.tenthous
-> Hash
-> Index Only Scan using onek_unique1 on public.onek t1
Output: t1.unique1
-> Index Only Scan using onek_unique1 on public.onek t1
Output: t1.unique1
Index Cond: (t1.unique1 < 1)
Index Cond: (t1.unique1 < 1)
-> Index Only Scan using tenk1_hundred on public.tenk1 t2
Output: t2.hundred
Index Cond: (t2.hundred = t3.tenthous)
(18 rows)
(15 rows)
-- ... unless it actually is unique
create table j3 as select unique1, tenthous from onek;

View File

@@ -1134,48 +1134,50 @@ EXPLAIN (COSTS OFF)
SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHERE t1.a = 0 AND t1.b = (t2.a + t2.b)/2) AND t1.b = 0 ORDER BY t1.a;
QUERY PLAN
---------------------------------------------------------------------------------
Sort
Merge Append
Sort Key: t1.a
-> Append
-> Nested Loop
Join Filter: (t1_2.a = t1_5.b)
-> HashAggregate
Group Key: t1_5.b
-> Nested Loop
Join Filter: (t1_2.a = t1_5.b)
-> Unique
-> Sort
Sort Key: t1_5.b
-> Hash Join
Hash Cond: (((t2_1.a + t2_1.b) / 2) = t1_5.b)
-> Seq Scan on prt1_e_p1 t2_1
-> Hash
-> Seq Scan on prt2_p1 t1_5
Filter: (a = 0)
-> Index Scan using iprt1_p1_a on prt1_p1 t1_2
Index Cond: (a = ((t2_1.a + t2_1.b) / 2))
Filter: (b = 0)
-> Nested Loop
Join Filter: (t1_3.a = t1_6.b)
-> HashAggregate
Group Key: t1_6.b
-> Index Scan using iprt1_p1_a on prt1_p1 t1_2
Index Cond: (a = ((t2_1.a + t2_1.b) / 2))
Filter: (b = 0)
-> Nested Loop
Join Filter: (t1_3.a = t1_6.b)
-> Unique
-> Sort
Sort Key: t1_6.b
-> Hash Join
Hash Cond: (((t2_2.a + t2_2.b) / 2) = t1_6.b)
-> Seq Scan on prt1_e_p2 t2_2
-> Hash
-> Seq Scan on prt2_p2 t1_6
Filter: (a = 0)
-> Index Scan using iprt1_p2_a on prt1_p2 t1_3
Index Cond: (a = ((t2_2.a + t2_2.b) / 2))
Filter: (b = 0)
-> Nested Loop
Join Filter: (t1_4.a = t1_7.b)
-> HashAggregate
Group Key: t1_7.b
-> Index Scan using iprt1_p2_a on prt1_p2 t1_3
Index Cond: (a = ((t2_2.a + t2_2.b) / 2))
Filter: (b = 0)
-> Nested Loop
Join Filter: (t1_4.a = t1_7.b)
-> Unique
-> Sort
Sort Key: t1_7.b
-> Nested Loop
-> Seq Scan on prt2_p3 t1_7
Filter: (a = 0)
-> Index Scan using iprt1_e_p3_ab2 on prt1_e_p3 t2_3
Index Cond: (((a + b) / 2) = t1_7.b)
-> Index Scan using iprt1_p3_a on prt1_p3 t1_4
Index Cond: (a = ((t2_3.a + t2_3.b) / 2))
Filter: (b = 0)
(41 rows)
-> Index Scan using iprt1_p3_a on prt1_p3 t1_4
Index Cond: (a = ((t2_3.a + t2_3.b) / 2))
Filter: (b = 0)
(43 rows)
SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHERE t1.a = 0 AND t1.b = (t2.a + t2.b)/2) AND t1.b = 0 ORDER BY t1.a;
a | b | c
@@ -1190,46 +1192,48 @@ EXPLAIN (COSTS OFF)
SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a;
QUERY PLAN
---------------------------------------------------------------------------
Sort
Merge Append
Sort Key: t1.a
-> Append
-> Nested Loop
-> HashAggregate
Group Key: t1_6.b
-> Nested Loop
-> Unique
-> Sort
Sort Key: t1_6.b
-> Hash Semi Join
Hash Cond: (t1_6.b = ((t1_9.a + t1_9.b) / 2))
-> Seq Scan on prt2_p1 t1_6
-> Hash
-> Seq Scan on prt1_e_p1 t1_9
Filter: (c = 0)
-> Index Scan using iprt1_p1_a on prt1_p1 t1_3
Index Cond: (a = t1_6.b)
Filter: (b = 0)
-> Nested Loop
-> HashAggregate
Group Key: t1_7.b
-> Index Scan using iprt1_p1_a on prt1_p1 t1_3
Index Cond: (a = t1_6.b)
Filter: (b = 0)
-> Nested Loop
-> Unique
-> Sort
Sort Key: t1_7.b
-> Hash Semi Join
Hash Cond: (t1_7.b = ((t1_10.a + t1_10.b) / 2))
-> Seq Scan on prt2_p2 t1_7
-> Hash
-> Seq Scan on prt1_e_p2 t1_10
Filter: (c = 0)
-> Index Scan using iprt1_p2_a on prt1_p2 t1_4
Index Cond: (a = t1_7.b)
Filter: (b = 0)
-> Nested Loop
-> HashAggregate
Group Key: t1_8.b
-> Index Scan using iprt1_p2_a on prt1_p2 t1_4
Index Cond: (a = t1_7.b)
Filter: (b = 0)
-> Nested Loop
-> Unique
-> Sort
Sort Key: t1_8.b
-> Hash Semi Join
Hash Cond: (t1_8.b = ((t1_11.a + t1_11.b) / 2))
-> Seq Scan on prt2_p3 t1_8
-> Hash
-> Seq Scan on prt1_e_p3 t1_11
Filter: (c = 0)
-> Index Scan using iprt1_p3_a on prt1_p3 t1_5
Index Cond: (a = t1_8.b)
Filter: (b = 0)
(39 rows)
-> Index Scan using iprt1_p3_a on prt1_p3 t1_5
Index Cond: (a = t1_8.b)
Filter: (b = 0)
(41 rows)
SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a;
a | b | c

View File

@@ -707,6 +707,212 @@ select * from numeric_table
3
(4 rows)
--
-- Test that a semijoin implemented by unique-ifying the RHS can explore
-- different paths of the RHS rel.
--
create table semijoin_unique_tbl (a int, b int);
insert into semijoin_unique_tbl select i%10, i%10 from generate_series(1,1000)i;
create index on semijoin_unique_tbl(a, b);
analyze semijoin_unique_tbl;
-- Ensure that we get a plan with Unique + IndexScan
explain (verbose, costs off)
select * from semijoin_unique_tbl t1, semijoin_unique_tbl t2
where (t1.a, t2.a) in (select a, b from semijoin_unique_tbl t3)
order by t1.a, t2.a;
QUERY PLAN
------------------------------------------------------------------------------------------------------
Nested Loop
Output: t1.a, t1.b, t2.a, t2.b
-> Merge Join
Output: t1.a, t1.b, t3.b
Merge Cond: (t3.a = t1.a)
-> Unique
Output: t3.a, t3.b
-> Index Only Scan using semijoin_unique_tbl_a_b_idx on public.semijoin_unique_tbl t3
Output: t3.a, t3.b
-> Index Only Scan using semijoin_unique_tbl_a_b_idx on public.semijoin_unique_tbl t1
Output: t1.a, t1.b
-> Memoize
Output: t2.a, t2.b
Cache Key: t3.b
Cache Mode: logical
-> Index Only Scan using semijoin_unique_tbl_a_b_idx on public.semijoin_unique_tbl t2
Output: t2.a, t2.b
Index Cond: (t2.a = t3.b)
(18 rows)
-- Ensure that we can unique-ify expressions more complex than plain Vars
explain (verbose, costs off)
select * from semijoin_unique_tbl t1, semijoin_unique_tbl t2
where (t1.a, t2.a) in (select a+1, b+1 from semijoin_unique_tbl t3)
order by t1.a, t2.a;
QUERY PLAN
------------------------------------------------------------------------------------------------
Incremental Sort
Output: t1.a, t1.b, t2.a, t2.b
Sort Key: t1.a, t2.a
Presorted Key: t1.a
-> Merge Join
Output: t1.a, t1.b, t2.a, t2.b
Merge Cond: (t1.a = ((t3.a + 1)))
-> Index Only Scan using semijoin_unique_tbl_a_b_idx on public.semijoin_unique_tbl t1
Output: t1.a, t1.b
-> Sort
Output: t2.a, t2.b, t3.a, ((t3.a + 1))
Sort Key: ((t3.a + 1))
-> Hash Join
Output: t2.a, t2.b, t3.a, (t3.a + 1)
Hash Cond: (t2.a = (t3.b + 1))
-> Seq Scan on public.semijoin_unique_tbl t2
Output: t2.a, t2.b
-> Hash
Output: t3.a, t3.b
-> HashAggregate
Output: t3.a, t3.b
Group Key: (t3.a + 1), (t3.b + 1)
-> Seq Scan on public.semijoin_unique_tbl t3
Output: t3.a, t3.b, (t3.a + 1), (t3.b + 1)
(24 rows)
-- encourage use of parallel plans
set parallel_setup_cost=0;
set parallel_tuple_cost=0;
set min_parallel_table_scan_size=0;
set max_parallel_workers_per_gather=4;
set enable_indexscan to off;
-- Ensure that we get a parallel plan for the unique-ification
explain (verbose, costs off)
select * from semijoin_unique_tbl t1, semijoin_unique_tbl t2
where (t1.a, t2.a) in (select a, b from semijoin_unique_tbl t3)
order by t1.a, t2.a;
QUERY PLAN
----------------------------------------------------------------------------------------
Nested Loop
Output: t1.a, t1.b, t2.a, t2.b
-> Merge Join
Output: t1.a, t1.b, t3.b
Merge Cond: (t3.a = t1.a)
-> Unique
Output: t3.a, t3.b
-> Gather Merge
Output: t3.a, t3.b
Workers Planned: 2
-> Sort
Output: t3.a, t3.b
Sort Key: t3.a, t3.b
-> HashAggregate
Output: t3.a, t3.b
Group Key: t3.a, t3.b
-> Parallel Seq Scan on public.semijoin_unique_tbl t3
Output: t3.a, t3.b
-> Materialize
Output: t1.a, t1.b
-> Gather Merge
Output: t1.a, t1.b
Workers Planned: 2
-> Sort
Output: t1.a, t1.b
Sort Key: t1.a
-> Parallel Seq Scan on public.semijoin_unique_tbl t1
Output: t1.a, t1.b
-> Memoize
Output: t2.a, t2.b
Cache Key: t3.b
Cache Mode: logical
-> Bitmap Heap Scan on public.semijoin_unique_tbl t2
Output: t2.a, t2.b
Recheck Cond: (t2.a = t3.b)
-> Bitmap Index Scan on semijoin_unique_tbl_a_b_idx
Index Cond: (t2.a = t3.b)
(37 rows)
reset enable_indexscan;
reset max_parallel_workers_per_gather;
reset min_parallel_table_scan_size;
reset parallel_tuple_cost;
reset parallel_setup_cost;
drop table semijoin_unique_tbl;
create table unique_tbl_p (a int, b int) partition by range(a);
create table unique_tbl_p1 partition of unique_tbl_p for values from (0) to (5);
create table unique_tbl_p2 partition of unique_tbl_p for values from (5) to (10);
create table unique_tbl_p3 partition of unique_tbl_p for values from (10) to (20);
insert into unique_tbl_p select i%12, i from generate_series(0, 1000)i;
create index on unique_tbl_p1(a);
create index on unique_tbl_p2(a);
create index on unique_tbl_p3(a);
analyze unique_tbl_p;
set enable_partitionwise_join to on;
-- Ensure that the unique-ification works for partition-wise join
explain (verbose, costs off)
select * from unique_tbl_p t1, unique_tbl_p t2
where (t1.a, t2.a) in (select a, a from unique_tbl_p t3)
order by t1.a, t2.a;
QUERY PLAN
------------------------------------------------------------------------------------------------
Merge Append
Sort Key: t1.a
-> Nested Loop
Output: t1_1.a, t1_1.b, t2_1.a, t2_1.b
-> Nested Loop
Output: t1_1.a, t1_1.b, t3_1.a
-> Unique
Output: t3_1.a
-> Index Only Scan using unique_tbl_p1_a_idx on public.unique_tbl_p1 t3_1
Output: t3_1.a
-> Index Scan using unique_tbl_p1_a_idx on public.unique_tbl_p1 t1_1
Output: t1_1.a, t1_1.b
Index Cond: (t1_1.a = t3_1.a)
-> Memoize
Output: t2_1.a, t2_1.b
Cache Key: t1_1.a
Cache Mode: logical
-> Index Scan using unique_tbl_p1_a_idx on public.unique_tbl_p1 t2_1
Output: t2_1.a, t2_1.b
Index Cond: (t2_1.a = t1_1.a)
-> Nested Loop
Output: t1_2.a, t1_2.b, t2_2.a, t2_2.b
-> Nested Loop
Output: t1_2.a, t1_2.b, t3_2.a
-> Unique
Output: t3_2.a
-> Index Only Scan using unique_tbl_p2_a_idx on public.unique_tbl_p2 t3_2
Output: t3_2.a
-> Index Scan using unique_tbl_p2_a_idx on public.unique_tbl_p2 t1_2
Output: t1_2.a, t1_2.b
Index Cond: (t1_2.a = t3_2.a)
-> Memoize
Output: t2_2.a, t2_2.b
Cache Key: t1_2.a
Cache Mode: logical
-> Index Scan using unique_tbl_p2_a_idx on public.unique_tbl_p2 t2_2
Output: t2_2.a, t2_2.b
Index Cond: (t2_2.a = t1_2.a)
-> Nested Loop
Output: t1_3.a, t1_3.b, t2_3.a, t2_3.b
-> Nested Loop
Output: t1_3.a, t1_3.b, t3_3.a
-> Unique
Output: t3_3.a
-> Sort
Output: t3_3.a
Sort Key: t3_3.a
-> Seq Scan on public.unique_tbl_p3 t3_3
Output: t3_3.a
-> Index Scan using unique_tbl_p3_a_idx on public.unique_tbl_p3 t1_3
Output: t1_3.a, t1_3.b
Index Cond: (t1_3.a = t3_3.a)
-> Memoize
Output: t2_3.a, t2_3.b
Cache Key: t1_3.a
Cache Mode: logical
-> Index Scan using unique_tbl_p3_a_idx on public.unique_tbl_p3 t2_3
Output: t2_3.a, t2_3.b
Index Cond: (t2_3.a = t1_3.a)
(59 rows)
reset enable_partitionwise_join;
drop table unique_tbl_p;
--
-- Test case for bug #4290: bogus calculation of subplan param sets
--
@@ -2672,18 +2878,17 @@ EXPLAIN (COSTS OFF)
SELECT * FROM onek
WHERE (unique1,ten) IN (VALUES (1,1), (20,0), (99,9), (17,99))
ORDER BY unique1;
QUERY PLAN
-----------------------------------------------------------------
Sort
Sort Key: onek.unique1
-> Nested Loop
-> HashAggregate
Group Key: "*VALUES*".column1, "*VALUES*".column2
QUERY PLAN
----------------------------------------------------------------
Nested Loop
-> Unique
-> Sort
Sort Key: "*VALUES*".column1, "*VALUES*".column2
-> Values Scan on "*VALUES*"
-> Index Scan using onek_unique1 on onek
Index Cond: (unique1 = "*VALUES*".column1)
Filter: ("*VALUES*".column2 = ten)
(9 rows)
-> Index Scan using onek_unique1 on onek
Index Cond: (unique1 = "*VALUES*".column1)
Filter: ("*VALUES*".column2 = ten)
(8 rows)
EXPLAIN (COSTS OFF)
SELECT * FROM onek
@@ -2858,12 +3063,10 @@ SELECT ten FROM onek WHERE unique1 IN (VALUES (1), (2) ORDER BY 1);
-> Unique
-> Sort
Sort Key: "*VALUES*".column1
-> Sort
Sort Key: "*VALUES*".column1
-> Values Scan on "*VALUES*"
-> Values Scan on "*VALUES*"
-> Index Scan using onek_unique1 on onek
Index Cond: (unique1 = "*VALUES*".column1)
(9 rows)
(7 rows)
EXPLAIN (COSTS OFF)
SELECT ten FROM onek WHERE unique1 IN (VALUES (1), (2) LIMIT 1);

View File

@@ -361,6 +361,73 @@ select * from float_table
select * from numeric_table
where num_col in (select float_col from float_table);
--
-- Test that a semijoin implemented by unique-ifying the RHS can explore
-- different paths of the RHS rel.
--
create table semijoin_unique_tbl (a int, b int);
insert into semijoin_unique_tbl select i%10, i%10 from generate_series(1,1000)i;
create index on semijoin_unique_tbl(a, b);
analyze semijoin_unique_tbl;
-- Ensure that we get a plan with Unique + IndexScan
explain (verbose, costs off)
select * from semijoin_unique_tbl t1, semijoin_unique_tbl t2
where (t1.a, t2.a) in (select a, b from semijoin_unique_tbl t3)
order by t1.a, t2.a;
-- Ensure that we can unique-ify expressions more complex than plain Vars
explain (verbose, costs off)
select * from semijoin_unique_tbl t1, semijoin_unique_tbl t2
where (t1.a, t2.a) in (select a+1, b+1 from semijoin_unique_tbl t3)
order by t1.a, t2.a;
-- encourage use of parallel plans
set parallel_setup_cost=0;
set parallel_tuple_cost=0;
set min_parallel_table_scan_size=0;
set max_parallel_workers_per_gather=4;
set enable_indexscan to off;
-- Ensure that we get a parallel plan for the unique-ification
explain (verbose, costs off)
select * from semijoin_unique_tbl t1, semijoin_unique_tbl t2
where (t1.a, t2.a) in (select a, b from semijoin_unique_tbl t3)
order by t1.a, t2.a;
reset enable_indexscan;
reset max_parallel_workers_per_gather;
reset min_parallel_table_scan_size;
reset parallel_tuple_cost;
reset parallel_setup_cost;
drop table semijoin_unique_tbl;
create table unique_tbl_p (a int, b int) partition by range(a);
create table unique_tbl_p1 partition of unique_tbl_p for values from (0) to (5);
create table unique_tbl_p2 partition of unique_tbl_p for values from (5) to (10);
create table unique_tbl_p3 partition of unique_tbl_p for values from (10) to (20);
insert into unique_tbl_p select i%12, i from generate_series(0, 1000)i;
create index on unique_tbl_p1(a);
create index on unique_tbl_p2(a);
create index on unique_tbl_p3(a);
analyze unique_tbl_p;
set enable_partitionwise_join to on;
-- Ensure that the unique-ification works for partition-wise join
explain (verbose, costs off)
select * from unique_tbl_p t1, unique_tbl_p t2
where (t1.a, t2.a) in (select a, a from unique_tbl_p t3)
order by t1.a, t2.a;
reset enable_partitionwise_join;
drop table unique_tbl_p;
--
-- Test case for bug #4290: bogus calculation of subplan param sets
--