Fix "variable not found in subplan target lists" in semijoin de-duplication.

One mechanism we have for implementing semi-joins is to de-duplicate the output of the RHS and then treat the join as a plain inner join. Initial construction of the join's SpecialJoinInfo identifies the RHS columns that need to be de-duplicated, but later we may find that some of those don't need to be handled explicitly, either because they're known to be constant or because they are redundant with some previous column. Up to now, while sort-based de-duplication handled such cases well, hash-based de-duplication didn't: we'd still hash on all of the originally-identified columns. This is probably not a very big deal performance-wise, but in the wake of commit a3179ab69 it can cause planner errors. That happens when join elimination causes recalculation of variables' attr_needed bitmapsets, and we decide that a variable mentioned in a semijoin clause doesn't need to be propagated up to the join level anymore. There are a number of ways we could slice the blame for this, but the only fix that doesn't result in pessimizing plans for loosely-related cases is to be more careful about not hashing columns we don't actually need to de-duplicate. We can install that consideration into create_unique_paths in master, or the predecessor code in create_unique_path in v18, without much refactoring. (As follow-up work, it might be a good idea to look at more-invasive refactoring, in hopes of preventing other bugs in this area. But with v18 release so close, there's not time for that now, nor would we be likely to want to put such refactoring into v18 anyway.) Reported-by: Sergey Soloviev <sergey.soloviev@tantorlabs.ru> Diagnosed-by: Richard Guo <guofenglinux@gmail.com> Author: Tom Lane <tgl@sss.pgh.pa.us> Reviewed-by: Richard Guo <guofenglinux@gmail.com> Discussion: https://postgr.es/m/1fd1a421-4609-4d46-a1af-ab74d5de504a@tantorlabs.ru Backpatch-through: 18
2025-12-19 17:02:53 +03:00 · 2025-08-28 13:49:20 -04:00
parent 16a9165ce4
commit b8a1bdc458
5 changed files with 179 additions and 43 deletions
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -3398,26 +3398,6 @@ select v||'a', case when v||'a' = 'aa' then 1 else 0 end, count(*)
 ba       |    0 |     1
 (2 rows)

-- Make sure that generation of HashAggregate for uniqification purposes
-- does not lead to array overflow due to unexpected duplicate hash keys
-- see CAFeeJoKKu0u+A_A9R9316djW-YW3-+Gtgvy3ju655qRHR3jtdA@mail.gmail.com
-set enable_memoize to off;
-explain (costs off)
-  select 1 from tenk1
-   where (hundred, thousand) in (select twothousand, twothousand from onek);
-                         QUERY PLAN                          
-------------------------------------------------------------
- Hash Join
-   Hash Cond: (tenk1.hundred = onek.twothousand)
-   ->  Seq Scan on tenk1
-         Filter: (hundred = thousand)
-   ->  Hash
-         ->  HashAggregate
-               Group Key: onek.twothousand, onek.twothousand
-               ->  Seq Scan on onek
-(8 rows)
-
-reset enable_memoize;
 --
 -- Hash Aggregation Spill tests
 --
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -3222,6 +3222,24 @@ where b.unique2 is null;
         ->  Index Only Scan using tenk1_unique2 on tenk1 b
 (5 rows)

+-- check that we avoid de-duplicating columns redundantly
+set enable_memoize to off;
+explain (costs off)
+select 1 from tenk1
+where (hundred, thousand) in (select twothousand, twothousand from onek);
+                   QUERY PLAN                    
+-------------------------------------------------
+ Hash Join
+   Hash Cond: (tenk1.hundred = onek.twothousand)
+   ->  Seq Scan on tenk1
+         Filter: (hundred = thousand)
+   ->  Hash
+         ->  HashAggregate
+               Group Key: onek.twothousand
+               ->  Seq Scan on onek
+(8 rows)
+
+reset enable_memoize;
 --
 -- regression test for bogus RTE_GROUP entries
 --
@@ -6500,6 +6518,68 @@ where t1.a = s.c;
 ----------
 (0 rows)

+rollback;
+-- check handling of semijoins after join removal: we must suppress
+-- unique-ification of known-constant values
+begin;
+create temp table t (a int unique, b int);
+insert into t values (1, 2);
+explain (verbose, costs off)
+select t1.a from t t1
+  left join t t2 on t1.a = t2.a
+       join t t3 on true
+where exists (select 1 from t t4
+                join t t5 on t4.b = t5.b
+                join t t6 on t5.b = t6.b
+              where t1.a = t4.a and t3.a = t5.a and t4.a = 1);
+                                     QUERY PLAN                                     
+------------------------------------------------------------------------------------
+ Nested Loop
+   Output: t1.a
+   Inner Unique: true
+   ->  Nested Loop
+         Output: t1.a, t5.a
+         ->  Index Only Scan using t_a_key on pg_temp.t t1
+               Output: t1.a
+               Index Cond: (t1.a = 1)
+         ->  HashAggregate
+               Output: t5.a
+               Group Key: t5.a
+               ->  Hash Join
+                     Output: t5.a
+                     Hash Cond: (t6.b = t4.b)
+                     ->  Seq Scan on pg_temp.t t6
+                           Output: t6.a, t6.b
+                     ->  Hash
+                           Output: t4.b, t5.b, t5.a
+                           ->  Hash Join
+                                 Output: t4.b, t5.b, t5.a
+                                 Inner Unique: true
+                                 Hash Cond: (t5.b = t4.b)
+                                 ->  Seq Scan on pg_temp.t t5
+                                       Output: t5.a, t5.b
+                                 ->  Hash
+                                       Output: t4.b, t4.a
+                                       ->  Index Scan using t_a_key on pg_temp.t t4
+                                             Output: t4.b, t4.a
+                                             Index Cond: (t4.a = 1)
+   ->  Index Only Scan using t_a_key on pg_temp.t t3
+         Output: t3.a
+         Index Cond: (t3.a = t5.a)
+(32 rows)
+
+select t1.a from t t1
+  left join t t2 on t1.a = t2.a
+       join t t3 on true
+where exists (select 1 from t t4
+                join t t5 on t4.b = t5.b
+                join t t6 on t5.b = t6.b
+              where t1.a = t4.a and t3.a = t5.a and t4.a = 1);
+ a 
+---
+ 1
+(1 row)
+
 rollback;
 -- test cases where we can remove a join, but not a PHV computed at it
 begin;
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -1510,15 +1510,6 @@ select v||'a', case when v||'a' = 'aa' then 1 else 0 end, count(*)
  from unnest(array['a','b']) u(v)
 group by v||'a' order by 1;

-- Make sure that generation of HashAggregate for uniqification purposes
-- does not lead to array overflow due to unexpected duplicate hash keys
-- see CAFeeJoKKu0u+A_A9R9316djW-YW3-+Gtgvy3ju655qRHR3jtdA@mail.gmail.com
-set enable_memoize to off;
-explain (costs off)
-  select 1 from tenk1
-   where (hundred, thousand) in (select twothousand, twothousand from onek);
-reset enable_memoize;
-
 --
 -- Hash Aggregation Spill tests
 --
--- a/src/test/regress/sql/join.sql
+++ b/src/test/regress/sql/join.sql
@@ -839,6 +839,13 @@ explain (costs off)
 select a.* from tenk1 a left join tenk1 b on a.unique1 = b.unique2
 where b.unique2 is null;

+-- check that we avoid de-duplicating columns redundantly
+set enable_memoize to off;
+explain (costs off)
+select 1 from tenk1
+where (hundred, thousand) in (select twothousand, twothousand from onek);
+reset enable_memoize;
+
 --
 -- regression test for bogus RTE_GROUP entries
 --
@@ -2420,6 +2427,32 @@ where t1.a = s.c;

 rollback;

+-- check handling of semijoins after join removal: we must suppress
+-- unique-ification of known-constant values
+begin;
+
+create temp table t (a int unique, b int);
+insert into t values (1, 2);
+
+explain (verbose, costs off)
+select t1.a from t t1
+  left join t t2 on t1.a = t2.a
+       join t t3 on true
+where exists (select 1 from t t4
+                join t t5 on t4.b = t5.b
+                join t t6 on t5.b = t6.b
+              where t1.a = t4.a and t3.a = t5.a and t4.a = 1);
+
+select t1.a from t t1
+  left join t t2 on t1.a = t2.a
+       join t t3 on true
+where exists (select 1 from t t4
+                join t t5 on t4.b = t5.b
+                join t t6 on t5.b = t6.b
+              where t1.a = t4.a and t3.a = t5.a and t4.a = 1);
+
+rollback;
+
 -- test cases where we can remove a join, but not a PHV computed at it
 begin;