1
0
mirror of https://github.com/postgres/postgres.git synced 2025-12-19 17:02:53 +03:00

Enable use of Memoize for ANTI joins

Currently, we do not support Memoize for SEMI and ANTI joins because
nested loop SEMI/ANTI joins do not scan the inner relation to
completion, which prevents Memoize from marking the cache entry as
complete.  One might argue that we could mark the cache entry as
complete after fetching the first inner tuple, but that would not be
safe: if the first inner tuple and the current outer tuple do not
satisfy the join clauses, a second inner tuple matching the parameters
would find the cache entry already marked as complete.

However, if the inner side is provably unique, this issue doesn't
arise, since there would be no second matching tuple.  That said, this
doesn't help in the case of SEMI joins, because a SEMI join with a
provably unique inner side would already have been reduced to an inner
join by reduce_unique_semijoins.

Therefore, in this patch, we check whether the inner relation is
provably unique for ANTI joins and enable the use of Memoize in such
cases.

Author: Richard Guo <guofenglinux@gmail.com>
Reviewed-by: wenhui qiu <qiuwenhuifx@gmail.com>
Reviewed-by: Andrei Lepikhov <lepihov@gmail.com>
Discussion: https://postgr.es/m/CAMbWs48FdLiMNrmJL-g6mDvoQVt0yNyJAqMkv4e2Pk-5GKCZLA@mail.gmail.com
This commit is contained in:
Richard Guo
2025-07-03 10:57:26 +09:00
parent 7b2eb72b1b
commit 0da29e4cb1
3 changed files with 112 additions and 22 deletions

View File

@@ -25,6 +25,7 @@ begin
ln := regexp_replace(ln, 'Heap Fetches: \d+', 'Heap Fetches: N');
ln := regexp_replace(ln, 'loops=\d+', 'loops=N');
ln := regexp_replace(ln, 'Index Searches: \d+', 'Index Searches: N');
ln := regexp_replace(ln, 'Memory: \d+kB', 'Memory: NkB');
return next ln;
end loop;
end;
@@ -500,3 +501,62 @@ RESET max_parallel_workers_per_gather;
RESET parallel_tuple_cost;
RESET parallel_setup_cost;
RESET min_parallel_table_scan_size;
-- Ensure memoize works for ANTI joins
CREATE TABLE tab_anti (a int, b boolean);
INSERT INTO tab_anti SELECT i%3, false FROM generate_series(1,100)i;
ANALYZE tab_anti;
-- Ensure we get a Memoize plan for ANTI join
SELECT explain_memoize('
SELECT COUNT(*) FROM tab_anti t1 LEFT JOIN
LATERAL (SELECT DISTINCT ON (a) a, b, t1.a AS x FROM tab_anti t2) t2
ON t1.a+1 = t2.a
WHERE t2.a IS NULL;', false);
explain_memoize
--------------------------------------------------------------------------------------------
Aggregate (actual rows=1.00 loops=N)
-> Nested Loop Anti Join (actual rows=33.00 loops=N)
-> Seq Scan on tab_anti t1 (actual rows=100.00 loops=N)
-> Memoize (actual rows=0.67 loops=N)
Cache Key: (t1.a + 1), t1.a
Cache Mode: binary
Hits: 97 Misses: 3 Evictions: Zero Overflows: 0 Memory Usage: NkB
-> Subquery Scan on t2 (actual rows=0.67 loops=N)
Filter: ((t1.a + 1) = t2.a)
Rows Removed by Filter: 2
-> Unique (actual rows=2.67 loops=N)
-> Sort (actual rows=67.33 loops=N)
Sort Key: t2_1.a
Sort Method: quicksort Memory: NkB
-> Seq Scan on tab_anti t2_1 (actual rows=100.00 loops=N)
(15 rows)
-- And check we get the expected results.
SELECT COUNT(*) FROM tab_anti t1 LEFT JOIN
LATERAL (SELECT DISTINCT ON (a) a, b, t1.a AS x FROM tab_anti t2) t2
ON t1.a+1 = t2.a
WHERE t2.a IS NULL;
count
-------
33
(1 row)
-- Ensure we do not add memoize node for SEMI join
EXPLAIN (COSTS OFF)
SELECT * FROM tab_anti t1 WHERE t1.a IN
(SELECT a FROM tab_anti t2 WHERE t2.b IN
(SELECT t1.b FROM tab_anti t3 WHERE t2.a > 1 OFFSET 0));
QUERY PLAN
-------------------------------------------------
Nested Loop Semi Join
-> Seq Scan on tab_anti t1
-> Nested Loop Semi Join
Join Filter: (t1.a = t2.a)
-> Seq Scan on tab_anti t2
-> Subquery Scan on "ANY_subquery"
Filter: (t2.b = "ANY_subquery".b)
-> Result
One-Time Filter: (t2.a > 1)
-> Seq Scan on tab_anti t3
(10 rows)
DROP TABLE tab_anti;

View File

@@ -26,6 +26,7 @@ begin
ln := regexp_replace(ln, 'Heap Fetches: \d+', 'Heap Fetches: N');
ln := regexp_replace(ln, 'loops=\d+', 'loops=N');
ln := regexp_replace(ln, 'Index Searches: \d+', 'Index Searches: N');
ln := regexp_replace(ln, 'Memory: \d+kB', 'Memory: NkB');
return next ln;
end loop;
end;
@@ -244,3 +245,29 @@ RESET max_parallel_workers_per_gather;
RESET parallel_tuple_cost;
RESET parallel_setup_cost;
RESET min_parallel_table_scan_size;
-- Ensure memoize works for ANTI joins
CREATE TABLE tab_anti (a int, b boolean);
INSERT INTO tab_anti SELECT i%3, false FROM generate_series(1,100)i;
ANALYZE tab_anti;
-- Ensure we get a Memoize plan for ANTI join
SELECT explain_memoize('
SELECT COUNT(*) FROM tab_anti t1 LEFT JOIN
LATERAL (SELECT DISTINCT ON (a) a, b, t1.a AS x FROM tab_anti t2) t2
ON t1.a+1 = t2.a
WHERE t2.a IS NULL;', false);
-- And check we get the expected results.
SELECT COUNT(*) FROM tab_anti t1 LEFT JOIN
LATERAL (SELECT DISTINCT ON (a) a, b, t1.a AS x FROM tab_anti t2) t2
ON t1.a+1 = t2.a
WHERE t2.a IS NULL;
-- Ensure we do not add memoize node for SEMI join
EXPLAIN (COSTS OFF)
SELECT * FROM tab_anti t1 WHERE t1.a IN
(SELECT a FROM tab_anti t2 WHERE t2.b IN
(SELECT t1.b FROM tab_anti t3 WHERE t2.a > 1 OFFSET 0));
DROP TABLE tab_anti;