1
0
mirror of https://github.com/postgres/postgres.git synced 2025-10-25 13:17:41 +03:00

Enable use of Memoize for ANTI joins

Currently, we do not support Memoize for SEMI and ANTI joins because
nested loop SEMI/ANTI joins do not scan the inner relation to
completion, which prevents Memoize from marking the cache entry as
complete.  One might argue that we could mark the cache entry as
complete after fetching the first inner tuple, but that would not be
safe: if the first inner tuple and the current outer tuple do not
satisfy the join clauses, a second inner tuple matching the parameters
would find the cache entry already marked as complete.

However, if the inner side is provably unique, this issue doesn't
arise, since there would be no second matching tuple.  That said, this
doesn't help in the case of SEMI joins, because a SEMI join with a
provably unique inner side would already have been reduced to an inner
join by reduce_unique_semijoins.

Therefore, in this patch, we check whether the inner relation is
provably unique for ANTI joins and enable the use of Memoize in such
cases.

Author: Richard Guo <guofenglinux@gmail.com>
Reviewed-by: wenhui qiu <qiuwenhuifx@gmail.com>
Reviewed-by: Andrei Lepikhov <lepihov@gmail.com>
Discussion: https://postgr.es/m/CAMbWs48FdLiMNrmJL-g6mDvoQVt0yNyJAqMkv4e2Pk-5GKCZLA@mail.gmail.com
This commit is contained in:
Richard Guo
2025-07-03 10:57:26 +09:00
parent 7b2eb72b1b
commit 0da29e4cb1
3 changed files with 112 additions and 22 deletions

View File

@@ -154,13 +154,17 @@ add_paths_to_joinrel(PlannerInfo *root,
/* /*
* See if the inner relation is provably unique for this outer rel. * See if the inner relation is provably unique for this outer rel.
* *
* We have some special cases: for JOIN_SEMI and JOIN_ANTI, it doesn't * We have some special cases: for JOIN_SEMI, it doesn't matter since the
* matter since the executor can make the equivalent optimization anyway; * executor can make the equivalent optimization anyway. It also doesn't
* we need not expend planner cycles on proofs. For JOIN_UNIQUE_INNER, we * help enable use of Memoize, since a semijoin with a provably unique
* must be considering a semijoin whose inner side is not provably unique * inner side should have been reduced to an inner join in that case.
* (else reduce_unique_semijoins would've simplified it), so there's no * Therefore, we need not expend planner cycles on proofs. (For
* point in calling innerrel_is_unique. However, if the LHS covers all of * JOIN_ANTI, although it doesn't help the executor for the same reason,
* the semijoin's min_lefthand, then it's appropriate to set inner_unique * it can benefit Memoize paths.) For JOIN_UNIQUE_INNER, we must be
* considering a semijoin whose inner side is not provably unique (else
* reduce_unique_semijoins would've simplified it), so there's no point in
* calling innerrel_is_unique. However, if the LHS covers all of the
* semijoin's min_lefthand, then it's appropriate to set inner_unique
* because the path produced by create_unique_path will be unique relative * because the path produced by create_unique_path will be unique relative
* to the LHS. (If we have an LHS that's only part of the min_lefthand, * to the LHS. (If we have an LHS that's only part of the min_lefthand,
* that is *not* true.) For JOIN_UNIQUE_OUTER, pass JOIN_INNER to avoid * that is *not* true.) For JOIN_UNIQUE_OUTER, pass JOIN_INNER to avoid
@@ -169,12 +173,6 @@ add_paths_to_joinrel(PlannerInfo *root,
switch (jointype) switch (jointype)
{ {
case JOIN_SEMI: case JOIN_SEMI:
case JOIN_ANTI:
/*
* XXX it may be worth proving this to allow a Memoize to be
* considered for Nested Loop Semi/Anti Joins.
*/
extra.inner_unique = false; /* well, unproven */ extra.inner_unique = false; /* well, unproven */
break; break;
case JOIN_UNIQUE_INNER: case JOIN_UNIQUE_INNER:
@@ -715,16 +713,21 @@ get_memoize_path(PlannerInfo *root, RelOptInfo *innerrel,
return NULL; return NULL;
/* /*
* Currently we don't do this for SEMI and ANTI joins unless they're * Currently we don't do this for SEMI and ANTI joins, because nested loop
* marked as inner_unique. This is because nested loop SEMI/ANTI joins * SEMI/ANTI joins don't scan the inner node to completion, which means
* don't scan the inner node to completion, which will mean memoize cannot * memoize cannot mark the cache entry as complete. Nor can we mark the
* mark the cache entry as complete. * cache entry as complete after fetching the first inner tuple, because
* * if that tuple and the current outer tuple don't satisfy the join
* XXX Currently we don't attempt to mark SEMI/ANTI joins as inner_unique * clauses, a second inner tuple that satisfies the parameters would find
* = true. Should we? See add_paths_to_joinrel() * the cache entry already marked as complete. The only exception is when
* the inner relation is provably unique, as in that case, there won't be
* a second matching tuple and we can safely mark the cache entry as
* complete after fetching the first inner tuple. Note that in such
* cases, the SEMI join should have been reduced to an inner join by
* reduce_unique_semijoins.
*/ */
if (!extra->inner_unique && (jointype == JOIN_SEMI || if ((jointype == JOIN_SEMI || jointype == JOIN_ANTI) &&
jointype == JOIN_ANTI)) !extra->inner_unique)
return NULL; return NULL;
/* /*

View File

@@ -25,6 +25,7 @@ begin
ln := regexp_replace(ln, 'Heap Fetches: \d+', 'Heap Fetches: N'); ln := regexp_replace(ln, 'Heap Fetches: \d+', 'Heap Fetches: N');
ln := regexp_replace(ln, 'loops=\d+', 'loops=N'); ln := regexp_replace(ln, 'loops=\d+', 'loops=N');
ln := regexp_replace(ln, 'Index Searches: \d+', 'Index Searches: N'); ln := regexp_replace(ln, 'Index Searches: \d+', 'Index Searches: N');
ln := regexp_replace(ln, 'Memory: \d+kB', 'Memory: NkB');
return next ln; return next ln;
end loop; end loop;
end; end;
@@ -500,3 +501,62 @@ RESET max_parallel_workers_per_gather;
RESET parallel_tuple_cost; RESET parallel_tuple_cost;
RESET parallel_setup_cost; RESET parallel_setup_cost;
RESET min_parallel_table_scan_size; RESET min_parallel_table_scan_size;
-- Ensure memoize works for ANTI joins
CREATE TABLE tab_anti (a int, b boolean);
INSERT INTO tab_anti SELECT i%3, false FROM generate_series(1,100)i;
ANALYZE tab_anti;
-- Ensure we get a Memoize plan for ANTI join
SELECT explain_memoize('
SELECT COUNT(*) FROM tab_anti t1 LEFT JOIN
LATERAL (SELECT DISTINCT ON (a) a, b, t1.a AS x FROM tab_anti t2) t2
ON t1.a+1 = t2.a
WHERE t2.a IS NULL;', false);
explain_memoize
--------------------------------------------------------------------------------------------
Aggregate (actual rows=1.00 loops=N)
-> Nested Loop Anti Join (actual rows=33.00 loops=N)
-> Seq Scan on tab_anti t1 (actual rows=100.00 loops=N)
-> Memoize (actual rows=0.67 loops=N)
Cache Key: (t1.a + 1), t1.a
Cache Mode: binary
Hits: 97 Misses: 3 Evictions: Zero Overflows: 0 Memory Usage: NkB
-> Subquery Scan on t2 (actual rows=0.67 loops=N)
Filter: ((t1.a + 1) = t2.a)
Rows Removed by Filter: 2
-> Unique (actual rows=2.67 loops=N)
-> Sort (actual rows=67.33 loops=N)
Sort Key: t2_1.a
Sort Method: quicksort Memory: NkB
-> Seq Scan on tab_anti t2_1 (actual rows=100.00 loops=N)
(15 rows)
-- And check we get the expected results.
SELECT COUNT(*) FROM tab_anti t1 LEFT JOIN
LATERAL (SELECT DISTINCT ON (a) a, b, t1.a AS x FROM tab_anti t2) t2
ON t1.a+1 = t2.a
WHERE t2.a IS NULL;
count
-------
33
(1 row)
-- Ensure we do not add memoize node for SEMI join
EXPLAIN (COSTS OFF)
SELECT * FROM tab_anti t1 WHERE t1.a IN
(SELECT a FROM tab_anti t2 WHERE t2.b IN
(SELECT t1.b FROM tab_anti t3 WHERE t2.a > 1 OFFSET 0));
QUERY PLAN
-------------------------------------------------
Nested Loop Semi Join
-> Seq Scan on tab_anti t1
-> Nested Loop Semi Join
Join Filter: (t1.a = t2.a)
-> Seq Scan on tab_anti t2
-> Subquery Scan on "ANY_subquery"
Filter: (t2.b = "ANY_subquery".b)
-> Result
One-Time Filter: (t2.a > 1)
-> Seq Scan on tab_anti t3
(10 rows)
DROP TABLE tab_anti;

View File

@@ -26,6 +26,7 @@ begin
ln := regexp_replace(ln, 'Heap Fetches: \d+', 'Heap Fetches: N'); ln := regexp_replace(ln, 'Heap Fetches: \d+', 'Heap Fetches: N');
ln := regexp_replace(ln, 'loops=\d+', 'loops=N'); ln := regexp_replace(ln, 'loops=\d+', 'loops=N');
ln := regexp_replace(ln, 'Index Searches: \d+', 'Index Searches: N'); ln := regexp_replace(ln, 'Index Searches: \d+', 'Index Searches: N');
ln := regexp_replace(ln, 'Memory: \d+kB', 'Memory: NkB');
return next ln; return next ln;
end loop; end loop;
end; end;
@@ -244,3 +245,29 @@ RESET max_parallel_workers_per_gather;
RESET parallel_tuple_cost; RESET parallel_tuple_cost;
RESET parallel_setup_cost; RESET parallel_setup_cost;
RESET min_parallel_table_scan_size; RESET min_parallel_table_scan_size;
-- Ensure memoize works for ANTI joins
CREATE TABLE tab_anti (a int, b boolean);
INSERT INTO tab_anti SELECT i%3, false FROM generate_series(1,100)i;
ANALYZE tab_anti;
-- Ensure we get a Memoize plan for ANTI join
SELECT explain_memoize('
SELECT COUNT(*) FROM tab_anti t1 LEFT JOIN
LATERAL (SELECT DISTINCT ON (a) a, b, t1.a AS x FROM tab_anti t2) t2
ON t1.a+1 = t2.a
WHERE t2.a IS NULL;', false);
-- And check we get the expected results.
SELECT COUNT(*) FROM tab_anti t1 LEFT JOIN
LATERAL (SELECT DISTINCT ON (a) a, b, t1.a AS x FROM tab_anti t2) t2
ON t1.a+1 = t2.a
WHERE t2.a IS NULL;
-- Ensure we do not add memoize node for SEMI join
EXPLAIN (COSTS OFF)
SELECT * FROM tab_anti t1 WHERE t1.a IN
(SELECT a FROM tab_anti t2 WHERE t2.b IN
(SELECT t1.b FROM tab_anti t3 WHERE t2.a > 1 OFFSET 0));
DROP TABLE tab_anti;