Pathify RHS unique-ification for semijoin planning

There are two implementation techniques for semijoins: one uses the JOIN_SEMI jointype, where the executor emits at most one matching row per left-hand side (LHS) row; the other unique-ifies the right-hand side (RHS) and then performs a plain inner join. The latter technique currently has some drawbacks related to the unique-ification step. * Only the cheapest-total path of the RHS is considered during unique-ification. This may cause us to miss some optimization opportunities; for example, a path with a better sort order might be overlooked simply because it is not the cheapest in total cost. Such a path could help avoid a sort at a higher level, potentially resulting in a cheaper overall plan. * We currently rely on heuristics to choose between hash-based and sort-based unique-ification. A better approach would be to generate paths for both methods and allow add_path() to decide which one is preferable, consistent with how path selection is handled elsewhere in the planner. * In the sort-based implementation, we currently pay no attention to the pathkeys of the input subpath or the resulting output. This can result in redundant sort nodes being added to the final plan. This patch improves semijoin planning by creating a new RelOptInfo for the RHS rel to represent its unique-ified version. It then generates multiple paths that represent elimination of distinct rows from the RHS, considering both a hash-based implementation using the cheapest total path of the original RHS rel, and sort-based implementations that either exploit presorted input paths or explicitly sort the cheapest total path. All resulting paths compete in add_path(), and those deemed worthy of consideration are added to the new RelOptInfo. Finally, the unique-ified rel is joined with the other side of the semijoin using a plain inner join. As a side effect, most of the code related to the JOIN_UNIQUE_OUTER and JOIN_UNIQUE_INNER jointypes -- used to indicate that the LHS or RHS path should be made unique -- has been removed. Besides, the T_Unique path now has the same meaning for both semijoins and upper DISTINCT clauses: it represents adjacent-duplicate removal on presorted input. This patch unifies their handling by sharing the same data structures and functions. This patch also removes the UNIQUE_PATH_NOOP related code along the way, as it is dead code -- if the RHS rel is provably unique, the semijoin should have already been simplified to a plain inner join by analyzejoins.c. Author: Richard Guo <guofenglinux@gmail.com> Reviewed-by: Alexandra Wang <alexandra.wang.oss@gmail.com> Reviewed-by: wenhui qiu <qiuwenhuifx@gmail.com> Discussion: https://postgr.es/m/CAMbWs4-EBnaRvEs7frTLbsXiweSTUXifsteF-d3rvv01FKO86w@mail.gmail.com
2025-12-19 17:02:53 +03:00 · 2025-08-19 09:35:40 +09:00
parent 3c07944d04
commit 24225ad9aa
18 changed files with 1074 additions and 971 deletions
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -703,8 +703,6 @@ typedef struct PartitionSchemeData *PartitionScheme;
 *			(regardless of ordering) among the unparameterized paths;
 *			or if there is no unparameterized path, the path with lowest
 *			total cost among the paths with minimum parameterization
- *		cheapest_unique_path - for caching cheapest path to produce unique
- *			(no duplicates) output from relation; NULL if not yet requested
 *		cheapest_parameterized_paths - best paths for their parameterizations;
 *			always includes cheapest_total_path, even if that's unparameterized
 *		direct_lateral_relids - rels this rel has direct LATERAL references to
@@ -770,6 +768,21 @@ typedef struct PartitionSchemeData *PartitionScheme;
 *					other rels for which we have tried and failed to prove
 *					this one unique
 *
+ * Three fields are used to cache information about unique-ification of this
+ * relation.  This is used to support semijoins where the relation appears on
+ * the RHS: the relation is first unique-ified, and then a regular join is
+ * performed:
+ *
+ *		unique_rel - the unique-ified version of the relation, containing paths
+ *					that produce unique (no duplicates) output from relation;
+ *					NULL if not yet requested
+ *		unique_pathkeys - pathkeys that represent the ordering requirements for
+ *					the relation's output in sort-based unique-ification
+ *					implementations
+ *		unique_groupclause - a list of SortGroupClause nodes that represent the
+ *					columns to be grouped on in hash-based unique-ification
+ *					implementations
+ *
 * The presence of the following fields depends on the restrictions
 * and joins that the relation participates in:
 *
@@ -930,7 +943,6 @@ typedef struct RelOptInfo
 	List	   *partial_pathlist;	/* partial Paths */
 	struct Path *cheapest_startup_path;
 	struct Path *cheapest_total_path;
-	struct Path *cheapest_unique_path;
 	List	   *cheapest_parameterized_paths;

 	/*
@@ -1004,6 +1016,16 @@ typedef struct RelOptInfo
 	/* known not unique for these set(s) */
 	List	   *non_unique_for_rels;

+	/*
+	 * information about unique-ification of this relation
+	 */
+	/* the unique-ified version of the relation */
+	struct RelOptInfo *unique_rel;
+	/* pathkeys for sort-based unique-ification implementations */
+	List	   *unique_pathkeys;
+	/* SortGroupClause nodes for hash-based unique-ification implementations */
+	List	   *unique_groupclause;
+
 	/*
 	 * used by various scans and joins:
 	 */
@@ -1097,6 +1119,17 @@ typedef struct RelOptInfo
 	((rel)->part_scheme && (rel)->boundinfo && (rel)->nparts > 0 && \
 	 (rel)->part_rels && (rel)->partexprs && (rel)->nullable_partexprs)

+/*
+ * Is given relation unique-ified?
+ *
+ * When the nominal jointype is JOIN_INNER, sjinfo->jointype is JOIN_SEMI, and
+ * the given rel is exactly the RHS of the semijoin, it indicates that the rel
+ * has been unique-ified.
+ */
+#define RELATION_WAS_MADE_UNIQUE(rel, sjinfo, nominal_jointype) \
+	((nominal_jointype) == JOIN_INNER && (sjinfo)->jointype == JOIN_SEMI && \
+	 bms_equal((sjinfo)->syn_righthand, (rel)->relids))
+
 /*
 * IndexOptInfo
 *		Per-index information for planning/optimization
@@ -1741,8 +1774,8 @@ typedef struct ParamPathInfo
 * and the specified outer rel(s).
 *
 * "rows" is the same as parent->rows in simple paths, but in parameterized
- * paths and UniquePaths it can be less than parent->rows, reflecting the
- * fact that we've filtered by extra join conditions or removed duplicates.
+ * paths it can be less than parent->rows, reflecting the fact that we've
+ * filtered by extra join conditions.
 *
 * "pathkeys" is a List of PathKey nodes (see above), describing the sort
 * ordering of the path's output rows.
@@ -2141,34 +2174,6 @@ typedef struct MemoizePath
 	double		est_hit_ratio;	/* estimated cache hit ratio, for EXPLAIN */
 } MemoizePath;

-/*
- * UniquePath represents elimination of distinct rows from the output of
- * its subpath.
- *
- * This can represent significantly different plans: either hash-based or
- * sort-based implementation, or a no-op if the input path can be proven
- * distinct already.  The decision is sufficiently localized that it's not
- * worth having separate Path node types.  (Note: in the no-op case, we could
- * eliminate the UniquePath node entirely and just return the subpath; but
- * it's convenient to have a UniquePath in the path tree to signal upper-level
- * routines that the input is known distinct.)
- */
-typedef enum UniquePathMethod
-{
-	UNIQUE_PATH_NOOP,			/* input is known unique already */
-	UNIQUE_PATH_HASH,			/* use hashing */
-	UNIQUE_PATH_SORT,			/* use sorting */
-} UniquePathMethod;
-
-typedef struct UniquePath
-{
-	Path		path;
-	Path	   *subpath;
-	UniquePathMethod umethod;
-	List	   *in_operators;	/* equality operators of the IN clause */
-	List	   *uniq_exprs;		/* expressions to be made unique */
-} UniquePath;
-
 /*
 * GatherPath runs several copies of a plan in parallel and collects the
 * results.  The parallel leader may also execute the plan, unless the
@@ -2375,17 +2380,17 @@ typedef struct GroupPath
 } GroupPath;

 /*
- * UpperUniquePath represents adjacent-duplicate removal (in presorted input)
+ * UniquePath represents adjacent-duplicate removal (in presorted input)
 *
 * The columns to be compared are the first numkeys columns of the path's
 * pathkeys.  The input is presumed already sorted that way.
 */
-typedef struct UpperUniquePath
+typedef struct UniquePath
 {
 	Path		path;
 	Path	   *subpath;		/* path representing input source */
 	int			numkeys;		/* number of pathkey columns to compare */
-} UpperUniquePath;
+} UniquePath;

 /*
 * AggPath represents generic computation of aggregate functions