Optimize joins when the inner relation can be proven unique.

If there can certainly be no more than one matching inner row for a given outer row, then the executor can move on to the next outer row as soon as it's found one match; there's no need to continue scanning the inner relation for this outer row. This saves useless scanning in nestloop and hash joins. In merge joins, it offers the opportunity to skip mark/restore processing, because we know we have not advanced past the first possible match for the next outer row. Of course, the devil is in the details: the proof of uniqueness must depend only on joinquals (not otherquals), and if we want to skip mergejoin mark/restore then it must depend only on merge clauses. To avoid adding more planning overhead than absolutely necessary, the present patch errs in the conservative direction: there are cases where inner_unique or skip_mark_restore processing could be used, but it will not do so because it's not sure that the uniqueness proof depended only on "safe" clauses. This could be improved later. David Rowley, reviewed and rather heavily editorialized on by me Discussion: https://postgr.es/m/CAApHDvqF6Sw-TK98bW48TdtFJ+3a7D2mFyZ7++=D-RyPsL76gw@mail.gmail.com
2025-11-01 21:31:19 +03:00 · 2017-04-07 22:20:03 -04:00
parent f13a9121f9
commit 9c7f5229ad
26 changed files with 987 additions and 206 deletions
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -2081,15 +2081,13 @@ cost_group(Path *path, PlannerInfo *root,
 * 'jointype' is the type of join to be performed
 * 'outer_path' is the outer input to the join
 * 'inner_path' is the inner input to the join
- * 'sjinfo' is extra info about the join for selectivity estimation
- * 'semifactors' contains valid data if jointype is SEMI or ANTI
+ * 'extra' contains miscellaneous information about the join
 */
 void
 initial_cost_nestloop(PlannerInfo *root, JoinCostWorkspace *workspace,
 					  JoinType jointype,
 					  Path *outer_path, Path *inner_path,
-					  SpecialJoinInfo *sjinfo,
-					  SemiAntiJoinFactors *semifactors)
+					  JoinPathExtraData *extra)
 {
 	Cost		startup_cost = 0;
 	Cost		run_cost = 0;
@@ -2120,10 +2118,12 @@ initial_cost_nestloop(PlannerInfo *root, JoinCostWorkspace *workspace,
 	inner_run_cost = inner_path->total_cost - inner_path->startup_cost;
 	inner_rescan_run_cost = inner_rescan_total_cost - inner_rescan_start_cost;

-	if (jointype == JOIN_SEMI || jointype == JOIN_ANTI)
+	if (jointype == JOIN_SEMI || jointype == JOIN_ANTI ||
+		extra->inner_unique)
 	{
 		/*
-		 * SEMI or ANTI join: executor will stop after first match.
+		 * With a SEMI or ANTI join, or if the innerrel is known unique, the
+		 * executor will stop after the first match.
 		 *
 		 * Getting decent estimates requires inspection of the join quals,
 		 * which we choose to postpone to final_cost_nestloop.
@@ -2156,14 +2156,12 @@ initial_cost_nestloop(PlannerInfo *root, JoinCostWorkspace *workspace,
 *
 * 'path' is already filled in except for the rows and cost fields
 * 'workspace' is the result from initial_cost_nestloop
- * 'sjinfo' is extra info about the join for selectivity estimation
- * 'semifactors' contains valid data if path->jointype is SEMI or ANTI
+ * 'extra' contains miscellaneous information about the join
 */
 void
 final_cost_nestloop(PlannerInfo *root, NestPath *path,
 					JoinCostWorkspace *workspace,
-					SpecialJoinInfo *sjinfo,
-					SemiAntiJoinFactors *semifactors)
+					JoinPathExtraData *extra)
 {
 	Path	   *outer_path = path->outerjoinpath;
 	Path	   *inner_path = path->innerjoinpath;
@@ -2206,10 +2204,12 @@ final_cost_nestloop(PlannerInfo *root, NestPath *path,

 	/* cost of inner-relation source data (we already dealt with outer rel) */

-	if (path->jointype == JOIN_SEMI || path->jointype == JOIN_ANTI)
+	if (path->jointype == JOIN_SEMI || path->jointype == JOIN_ANTI ||
+		extra->inner_unique)
 	{
 		/*
-		 * SEMI or ANTI join: executor will stop after first match.
+		 * With a SEMI or ANTI join, or if the innerrel is known unique, the
+		 * executor will stop after the first match.
 		 */
 		Cost		inner_run_cost = workspace->inner_run_cost;
 		Cost		inner_rescan_run_cost = workspace->inner_rescan_run_cost;
@@ -2225,8 +2225,8 @@ final_cost_nestloop(PlannerInfo *root, NestPath *path,
 		 * clamp inner_scan_frac to at most 1.0; but since match_count is at
 		 * least 1, no such clamp is needed now.)
 		 */
-		outer_matched_rows = rint(outer_path_rows * semifactors->outer_match_frac);
-		inner_scan_frac = 2.0 / (semifactors->match_count + 1.0);
+		outer_matched_rows = rint(outer_path_rows * extra->semifactors.outer_match_frac);
+		inner_scan_frac = 2.0 / (extra->semifactors.match_count + 1.0);

 		/*
 		 * Compute number of tuples processed (not number emitted!).  First,
@@ -2350,7 +2350,7 @@ final_cost_nestloop(PlannerInfo *root, NestPath *path,
 * 'inner_path' is the inner input to the join
 * 'outersortkeys' is the list of sort keys for the outer path
 * 'innersortkeys' is the list of sort keys for the inner path
- * 'sjinfo' is extra info about the join for selectivity estimation
+ * 'extra' contains miscellaneous information about the join
 *
 * Note: outersortkeys and innersortkeys should be NIL if no explicit
 * sort is needed because the respective source path is already ordered.
@@ -2361,7 +2361,7 @@ initial_cost_mergejoin(PlannerInfo *root, JoinCostWorkspace *workspace,
 					   List *mergeclauses,
 					   Path *outer_path, Path *inner_path,
 					   List *outersortkeys, List *innersortkeys,
-					   SpecialJoinInfo *sjinfo)
+					   JoinPathExtraData *extra)
 {
 	Cost		startup_cost = 0;
 	Cost		run_cost = 0;
@@ -2562,26 +2562,33 @@ initial_cost_mergejoin(PlannerInfo *root, JoinCostWorkspace *workspace,
 * final_cost_mergejoin
 *	  Final estimate of the cost and result size of a mergejoin path.
 *
- * Unlike other costsize functions, this routine makes one actual decision:
- * whether we should materialize the inner path.  We do that either because
- * the inner path can't support mark/restore, or because it's cheaper to
- * use an interposed Material node to handle mark/restore.  When the decision
- * is cost-based it would be logically cleaner to build and cost two separate
- * paths with and without that flag set; but that would require repeating most
- * of the cost calculations, which are not all that cheap.  Since the choice
- * will not affect output pathkeys or startup cost, only total cost, there is
- * no possibility of wanting to keep both paths.  So it seems best to make
- * the decision here and record it in the path's materialize_inner field.
+ * Unlike other costsize functions, this routine makes two actual decisions:
+ * whether the executor will need to do mark/restore, and whether we should
+ * materialize the inner path.  It would be logically cleaner to build
+ * separate paths testing these alternatives, but that would require repeating
+ * most of the cost calculations, which are not all that cheap.  Since the
+ * choice will not affect output pathkeys or startup cost, only total cost,
+ * there is no possibility of wanting to keep more than one path.  So it seems
+ * best to make the decisions here and record them in the path's
+ * skip_mark_restore and materialize_inner fields.
+ *
+ * Mark/restore overhead is usually required, but can be skipped if we know
+ * that the executor need find only one match per outer tuple, and that the
+ * mergeclauses are sufficient to identify a match.
+ *
+ * We materialize the inner path if we need mark/restore and either the inner
+ * path can't support mark/restore, or it's cheaper to use an interposed
+ * Material node to handle mark/restore.
 *
 * 'path' is already filled in except for the rows and cost fields and
- *		materialize_inner
+ *		skip_mark_restore and materialize_inner
 * 'workspace' is the result from initial_cost_mergejoin
- * 'sjinfo' is extra info about the join for selectivity estimation
+ * 'extra' contains miscellaneous information about the join
 */
 void
 final_cost_mergejoin(PlannerInfo *root, MergePath *path,
 					 JoinCostWorkspace *workspace,
-					 SpecialJoinInfo *sjinfo)
+					 JoinPathExtraData *extra)
 {
 	Path	   *outer_path = path->jpath.outerjoinpath;
 	Path	   *inner_path = path->jpath.innerjoinpath;
@@ -2640,6 +2647,21 @@ final_cost_mergejoin(PlannerInfo *root, MergePath *path,
 	qp_qual_cost.startup -= merge_qual_cost.startup;
 	qp_qual_cost.per_tuple -= merge_qual_cost.per_tuple;

+	/*
+	 * With a SEMI or ANTI join, or if the innerrel is known unique, the
+	 * executor will stop scanning for matches after the first match.  When
+	 * all the joinclauses are merge clauses, this means we don't ever need to
+	 * back up the merge, and so we can skip mark/restore overhead.
+	 */
+	if ((path->jpath.jointype == JOIN_SEMI ||
+		 path->jpath.jointype == JOIN_ANTI ||
+		 extra->inner_unique) &&
+		(list_length(path->jpath.joinrestrictinfo) ==
+		 list_length(path->path_mergeclauses)))
+		path->skip_mark_restore = true;
+	else
+		path->skip_mark_restore = false;
+
 	/*
 	 * Get approx # tuples passing the mergequals.  We use approx_tuple_count
 	 * here because we need an estimate done with JOIN_INNER semantics.
@@ -2670,9 +2692,9 @@ final_cost_mergejoin(PlannerInfo *root, MergePath *path,
 	 * computations?
 	 *
 	 * The whole issue is moot if we are working from a unique-ified outer
-	 * input.
+	 * input, or if we know we don't need to mark/restore at all.
 	 */
-	if (IsA(outer_path, UniquePath))
+	if (IsA(outer_path, UniquePath) ||path->skip_mark_restore)
 		rescannedtuples = 0;
 	else
 	{
@@ -2711,11 +2733,17 @@ final_cost_mergejoin(PlannerInfo *root, MergePath *path,
 	mat_inner_cost = inner_run_cost +
 		cpu_operator_cost * inner_path_rows * rescanratio;

+	/*
+	 * If we don't need mark/restore at all, we don't need materialization.
+	 */
+	if (path->skip_mark_restore)
+		path->materialize_inner = false;
+
 	/*
 	 * Prefer materializing if it looks cheaper, unless the user has asked to
 	 * suppress materialization.
 	 */
-	if (enable_material && mat_inner_cost < bare_inner_cost)
+	else if (enable_material && mat_inner_cost < bare_inner_cost)
 		path->materialize_inner = true;

 	/*
@@ -2876,16 +2904,14 @@ cached_scansel(PlannerInfo *root, RestrictInfo *rinfo, PathKey *pathkey)
 * 'hashclauses' is the list of joinclauses to be used as hash clauses
 * 'outer_path' is the outer input to the join
 * 'inner_path' is the inner input to the join
- * 'sjinfo' is extra info about the join for selectivity estimation
- * 'semifactors' contains valid data if jointype is SEMI or ANTI
+ * 'extra' contains miscellaneous information about the join
 */
 void
 initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace,
 					  JoinType jointype,
 					  List *hashclauses,
 					  Path *outer_path, Path *inner_path,
-					  SpecialJoinInfo *sjinfo,
-					  SemiAntiJoinFactors *semifactors)
+					  JoinPathExtraData *extra)
 {
 	Cost		startup_cost = 0;
 	Cost		run_cost = 0;
@@ -2970,14 +2996,12 @@ initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace,
 * 'path' is already filled in except for the rows and cost fields and
 *		num_batches
 * 'workspace' is the result from initial_cost_hashjoin
- * 'sjinfo' is extra info about the join for selectivity estimation
- * 'semifactors' contains valid data if path->jointype is SEMI or ANTI
+ * 'extra' contains miscellaneous information about the join
 */
 void
 final_cost_hashjoin(PlannerInfo *root, HashPath *path,
 					JoinCostWorkspace *workspace,
-					SpecialJoinInfo *sjinfo,
-					SemiAntiJoinFactors *semifactors)
+					JoinPathExtraData *extra)
 {
 	Path	   *outer_path = path->jpath.outerjoinpath;
 	Path	   *inner_path = path->jpath.innerjoinpath;
@@ -3101,13 +3125,16 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path,

 	/* CPU costs */

-	if (path->jpath.jointype == JOIN_SEMI || path->jpath.jointype == JOIN_ANTI)
+	if (path->jpath.jointype == JOIN_SEMI ||
+		path->jpath.jointype == JOIN_ANTI ||
+		extra->inner_unique)
 	{
 		double		outer_matched_rows;
 		Selectivity inner_scan_frac;

 		/*
-		 * SEMI or ANTI join: executor will stop after first match.
+		 * With a SEMI or ANTI join, or if the innerrel is known unique, the
+		 * executor will stop after the first match.
 		 *
 		 * For an outer-rel row that has at least one match, we can expect the
 		 * bucket scan to stop after a fraction 1/(match_count+1) of the
@@ -3117,8 +3144,8 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path,
 		 * to clamp inner_scan_frac to at most 1.0; but since match_count is
 		 * at least 1, no such clamp is needed now.)
 		 */
-		outer_matched_rows = rint(outer_path_rows * semifactors->outer_match_frac);
-		inner_scan_frac = 2.0 / (semifactors->match_count + 1.0);
+		outer_matched_rows = rint(outer_path_rows * extra->semifactors.outer_match_frac);
+		inner_scan_frac = 2.0 / (extra->semifactors.match_count + 1.0);

 		startup_cost += hash_qual_cost.startup;
 		run_cost += hash_qual_cost.per_tuple * outer_matched_rows *
@@ -3684,11 +3711,12 @@ get_restriction_qual_cost(PlannerInfo *root, RelOptInfo *baserel,

 /*
 * compute_semi_anti_join_factors
- *	  Estimate how much of the inner input a SEMI or ANTI join
+ *	  Estimate how much of the inner input a SEMI, ANTI, or inner_unique join
 *	  can be expected to scan.
 *
 * In a hash or nestloop SEMI/ANTI join, the executor will stop scanning
 * inner rows as soon as it finds a match to the current outer row.
+ * The same happens if we have detected the inner rel is unique.
 * We should therefore adjust some of the cost components for this effect.
 * This function computes some estimates needed for these adjustments.
 * These estimates will be the same regardless of the particular paths used
@@ -3698,7 +3726,7 @@ get_restriction_qual_cost(PlannerInfo *root, RelOptInfo *baserel,
 * Input parameters:
 *	outerrel: outer relation under consideration
 *	innerrel: inner relation under consideration
- *	jointype: must be JOIN_SEMI or JOIN_ANTI
+ *	jointype: if not JOIN_SEMI or JOIN_ANTI, we assume it's inner_unique
 *	sjinfo: SpecialJoinInfo relevant to this join
 *	restrictlist: join quals
 * Output parameters:
@@ -3720,16 +3748,14 @@ compute_semi_anti_join_factors(PlannerInfo *root,
 	List	   *joinquals;
 	ListCell   *l;

-	/* Should only be called in these cases */
-	Assert(jointype == JOIN_SEMI || jointype == JOIN_ANTI);
-
 	/*
 	 * In an ANTI join, we must ignore clauses that are "pushed down", since
 	 * those won't affect the match logic.  In a SEMI join, we do not
 	 * distinguish joinquals from "pushed down" quals, so just use the whole
-	 * restrictinfo list.
+	 * restrictinfo list.  For other outer join types, we should consider only
+	 * non-pushed-down quals, so that this devolves to an IS_OUTER_JOIN check.
 	 */
-	if (jointype == JOIN_ANTI)
+	if (IS_OUTER_JOIN(jointype))
 	{
 		joinquals = NIL;
 		foreach(l, restrictlist)
@@ -3749,7 +3775,7 @@ compute_semi_anti_join_factors(PlannerInfo *root,
 	jselec = clauselist_selectivity(root,
 									joinquals,
 									0,
-									jointype,
+							 (jointype == JOIN_ANTI) ? JOIN_ANTI : JOIN_SEMI,
 									sjinfo);

 	/*
@@ -3776,7 +3802,7 @@ compute_semi_anti_join_factors(PlannerInfo *root,
 									&norm_sjinfo);

 	/* Avoid leaking a lot of ListCells */
-	if (jointype == JOIN_ANTI)
+	if (IS_OUTER_JOIN(jointype))
 		list_free(joinquals);

 	/*
--- a/src/backend/optimizer/path/joinpath.c
+++ b/src/backend/optimizer/path/joinpath.c
@@ -21,6 +21,7 @@
 #include "optimizer/cost.h"
 #include "optimizer/pathnode.h"
 #include "optimizer/paths.h"
+#include "optimizer/planmain.h"

 /* Hook for plugins to get control in add_paths_to_joinrel() */
 set_join_pathlist_hook_type set_join_pathlist_hook = NULL;
@@ -120,6 +121,35 @@ add_paths_to_joinrel(PlannerInfo *root,
 	extra.sjinfo = sjinfo;
 	extra.param_source_rels = NULL;

+	/*
+	 * See if the inner relation is provably unique for this outer rel.
+	 *
+	 * We have some special cases: for JOIN_SEMI and JOIN_ANTI, it doesn't
+	 * matter since the executor can make the equivalent optimization anyway;
+	 * we need not expend planner cycles on proofs.  For JOIN_UNIQUE_INNER, we
+	 * know we're going to force uniqueness of the innerrel below.  For
+	 * JOIN_UNIQUE_OUTER, pass JOIN_INNER to avoid letting that value escape
+	 * this module.
+	 */
+	switch (jointype)
+	{
+		case JOIN_SEMI:
+		case JOIN_ANTI:
+			extra.inner_unique = false; /* well, unproven */
+			break;
+		case JOIN_UNIQUE_INNER:
+			extra.inner_unique = true;
+			break;
+		case JOIN_UNIQUE_OUTER:
+			extra.inner_unique = innerrel_is_unique(root, outerrel, innerrel,
+													JOIN_INNER, restrictlist);
+			break;
+		default:
+			extra.inner_unique = innerrel_is_unique(root, outerrel, innerrel,
+													jointype, restrictlist);
+			break;
+	}
+
 	/*
 	 * Find potential mergejoin clauses.  We can skip this if we are not
 	 * interested in doing a mergejoin.  However, mergejoin may be our only
@@ -136,10 +166,10 @@ add_paths_to_joinrel(PlannerInfo *root,
 														  &mergejoin_allowed);

 	/*
-	 * If it's SEMI or ANTI join, compute correction factors for cost
-	 * estimation.  These will be the same for all paths.
+	 * If it's SEMI, ANTI, or inner_unique join, compute correction factors
+	 * for cost estimation.  These will be the same for all paths.
 	 */
-	if (jointype == JOIN_SEMI || jointype == JOIN_ANTI)
+	if (jointype == JOIN_SEMI || jointype == JOIN_ANTI || extra.inner_unique)
 		compute_semi_anti_join_factors(root, outerrel, innerrel,
 									   jointype, sjinfo, restrictlist,
 									   &extra.semifactors);
@@ -336,8 +366,7 @@ try_nestloop_path(PlannerInfo *root,
 	 * methodology worthwhile.
 	 */
 	initial_cost_nestloop(root, &workspace, jointype,
-						  outer_path, inner_path,
-						  extra->sjinfo, &extra->semifactors);
+						  outer_path, inner_path, extra);

 	if (add_path_precheck(joinrel,
 						  workspace.startup_cost, workspace.total_cost,
@@ -348,8 +377,7 @@ try_nestloop_path(PlannerInfo *root,
 									  joinrel,
 									  jointype,
 									  &workspace,
-									  extra->sjinfo,
-									  &extra->semifactors,
+									  extra,
 									  outer_path,
 									  inner_path,
 									  extra->restrictlist,
@@ -399,8 +427,7 @@ try_partial_nestloop_path(PlannerInfo *root,
 	 * cost.  Bail out right away if it looks terrible.
 	 */
 	initial_cost_nestloop(root, &workspace, jointype,
-						  outer_path, inner_path,
-						  extra->sjinfo, &extra->semifactors);
+						  outer_path, inner_path, extra);
 	if (!add_partial_path_precheck(joinrel, workspace.total_cost, pathkeys))
 		return;

@@ -410,8 +437,7 @@ try_partial_nestloop_path(PlannerInfo *root,
 										  joinrel,
 										  jointype,
 										  &workspace,
-										  extra->sjinfo,
-										  &extra->semifactors,
+										  extra,
 										  outer_path,
 										  inner_path,
 										  extra->restrictlist,
@@ -486,7 +512,7 @@ try_mergejoin_path(PlannerInfo *root,
 	initial_cost_mergejoin(root, &workspace, jointype, mergeclauses,
 						   outer_path, inner_path,
 						   outersortkeys, innersortkeys,
-						   extra->sjinfo);
+						   extra);

 	if (add_path_precheck(joinrel,
 						  workspace.startup_cost, workspace.total_cost,
@@ -497,7 +523,7 @@ try_mergejoin_path(PlannerInfo *root,
 									   joinrel,
 									   jointype,
 									   &workspace,
-									   extra->sjinfo,
+									   extra,
 									   outer_path,
 									   inner_path,
 									   extra->restrictlist,
@@ -562,7 +588,7 @@ try_partial_mergejoin_path(PlannerInfo *root,
 	initial_cost_mergejoin(root, &workspace, jointype, mergeclauses,
 						   outer_path, inner_path,
 						   outersortkeys, innersortkeys,
-						   extra->sjinfo);
+						   extra);

 	if (!add_partial_path_precheck(joinrel, workspace.total_cost, pathkeys))
 		return;
@@ -573,7 +599,7 @@ try_partial_mergejoin_path(PlannerInfo *root,
 										   joinrel,
 										   jointype,
 										   &workspace,
-										   extra->sjinfo,
+										   extra,
 										   outer_path,
 										   inner_path,
 										   extra->restrictlist,
@@ -620,8 +646,7 @@ try_hashjoin_path(PlannerInfo *root,
 	 * never have any output pathkeys, per comments in create_hashjoin_path.
 	 */
 	initial_cost_hashjoin(root, &workspace, jointype, hashclauses,
-						  outer_path, inner_path,
-						  extra->sjinfo, &extra->semifactors);
+						  outer_path, inner_path, extra);

 	if (add_path_precheck(joinrel,
 						  workspace.startup_cost, workspace.total_cost,
@@ -632,8 +657,7 @@ try_hashjoin_path(PlannerInfo *root,
 									  joinrel,
 									  jointype,
 									  &workspace,
-									  extra->sjinfo,
-									  &extra->semifactors,
+									  extra,
 									  outer_path,
 									  inner_path,
 									  extra->restrictlist,
@@ -683,8 +707,7 @@ try_partial_hashjoin_path(PlannerInfo *root,
 	 * cost.  Bail out right away if it looks terrible.
 	 */
 	initial_cost_hashjoin(root, &workspace, jointype, hashclauses,
-						  outer_path, inner_path,
-						  extra->sjinfo, &extra->semifactors);
+						  outer_path, inner_path, extra);
 	if (!add_partial_path_precheck(joinrel, workspace.total_cost, NIL))
 		return;

@@ -694,8 +717,7 @@ try_partial_hashjoin_path(PlannerInfo *root,
 										  joinrel,
 										  jointype,
 										  &workspace,
-										  extra->sjinfo,
-										  &extra->semifactors,
+										  extra,
 										  outer_path,
 										  inner_path,
 										  extra->restrictlist,
--- a/src/backend/optimizer/plan/analyzejoins.c
+++ b/src/backend/optimizer/plan/analyzejoins.c
@@ -41,6 +41,11 @@ static bool rel_supports_distinctness(PlannerInfo *root, RelOptInfo *rel);
 static bool rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel,
 					List *clause_list);
 static Oid	distinct_col_search(int colno, List *colnos, List *opids);
+static bool is_innerrel_unique_for(PlannerInfo *root,
+					   RelOptInfo *outerrel,
+					   RelOptInfo *innerrel,
+					   JoinType jointype,
+					   List *restrictlist);


 /*
@@ -845,3 +850,171 @@ distinct_col_search(int colno, List *colnos, List *opids)
 	}
 	return InvalidOid;
 }
+
+
+/*
+ * innerrel_is_unique
+ *	  Check if the innerrel provably contains at most one tuple matching any
+ *	  tuple from the outerrel, based on join clauses in the 'restrictlist'.
+ *
+ * The proof must be made based only on clauses that will be "joinquals"
+ * rather than "otherquals" at execution.  For an inner join there's no
+ * difference; but if the join is outer, we must ignore pushed-down quals,
+ * as those will become "otherquals".  Note that this means the answer might
+ * vary depending on whether IS_OUTER_JOIN(jointype); since we cache the
+ * answer without regard to that, callers must take care not to call this
+ * with jointypes that would be classified differently by IS_OUTER_JOIN().
+ *
+ * The actual proof is undertaken by is_innerrel_unique_for(); this function
+ * is a frontend that is mainly concerned with caching the answers.
+ */
+bool
+innerrel_is_unique(PlannerInfo *root,
+				   RelOptInfo *outerrel,
+				   RelOptInfo *innerrel,
+				   JoinType jointype,
+				   List *restrictlist)
+{
+	MemoryContext old_context;
+	ListCell   *lc;
+
+	/* Certainly can't prove uniqueness when there are no joinclauses */
+	if (restrictlist == NIL)
+		return false;
+
+	/*
+	 * Make a quick check to eliminate cases in which we will surely be unable
+	 * to prove uniqueness of the innerrel.
+	 */
+	if (!rel_supports_distinctness(root, innerrel))
+		return false;
+
+	/*
+	 * Query the cache to see if we've managed to prove that innerrel is
+	 * unique for any subset of this outerrel.  We don't need an exact match,
+	 * as extra outerrels can't make the innerrel any less unique (or more
+	 * formally, the restrictlist for a join to a superset outerrel must be a
+	 * superset of the conditions we successfully used before).
+	 */
+	foreach(lc, innerrel->unique_for_rels)
+	{
+		Relids		unique_for_rels = (Relids) lfirst(lc);
+
+		if (bms_is_subset(unique_for_rels, outerrel->relids))
+			return true;		/* Success! */
+	}
+
+	/*
+	 * Conversely, we may have already determined that this outerrel, or some
+	 * superset thereof, cannot prove this innerrel to be unique.
+	 */
+	foreach(lc, innerrel->non_unique_for_rels)
+	{
+		Relids		unique_for_rels = (Relids) lfirst(lc);
+
+		if (bms_is_subset(outerrel->relids, unique_for_rels))
+			return false;
+	}
+
+	/* No cached information, so try to make the proof. */
+	if (is_innerrel_unique_for(root, outerrel, innerrel,
+							   jointype, restrictlist))
+	{
+		/*
+		 * Cache the positive result for future probes, being sure to keep it
+		 * in the planner_cxt even if we are working in GEQO.
+		 *
+		 * Note: one might consider trying to isolate the minimal subset of
+		 * the outerrels that proved the innerrel unique.  But it's not worth
+		 * the trouble, because the planner builds up joinrels incrementally
+		 * and so we'll see the minimally sufficient outerrels before any
+		 * supersets of them anyway.
+		 */
+		old_context = MemoryContextSwitchTo(root->planner_cxt);
+		innerrel->unique_for_rels = lappend(innerrel->unique_for_rels,
+											bms_copy(outerrel->relids));
+		MemoryContextSwitchTo(old_context);
+
+		return true;			/* Success! */
+	}
+	else
+	{
+		/*
+		 * None of the join conditions for outerrel proved innerrel unique, so
+		 * we can safely reject this outerrel or any subset of it in future
+		 * checks.
+		 *
+		 * However, in normal planning mode, caching this knowledge is totally
+		 * pointless; it won't be queried again, because we build up joinrels
+		 * from smaller to larger.  It is useful in GEQO mode, where the
+		 * knowledge can be carried across successive planning attempts; and
+		 * it's likely to be useful when using join-search plugins, too. Hence
+		 * cache only when join_search_private is non-NULL.  (Yeah, that's a
+		 * hack, but it seems reasonable.)
+		 */
+		if (root->join_search_private)
+		{
+			old_context = MemoryContextSwitchTo(root->planner_cxt);
+			innerrel->non_unique_for_rels =
+				lappend(innerrel->non_unique_for_rels,
+						bms_copy(outerrel->relids));
+			MemoryContextSwitchTo(old_context);
+		}
+
+		return false;
+	}
+}
+
+/*
+ * is_innerrel_unique_for
+ *	  Check if the innerrel provably contains at most one tuple matching any
+ *	  tuple from the outerrel, based on join clauses in the 'restrictlist'.
+ */
+static bool
+is_innerrel_unique_for(PlannerInfo *root,
+					   RelOptInfo *outerrel,
+					   RelOptInfo *innerrel,
+					   JoinType jointype,
+					   List *restrictlist)
+{
+	List	   *clause_list = NIL;
+	ListCell   *lc;
+
+	/*
+	 * Search for mergejoinable clauses that constrain the inner rel against
+	 * the outer rel.  If an operator is mergejoinable then it behaves like
+	 * equality for some btree opclass, so it's what we want.  The
+	 * mergejoinability test also eliminates clauses containing volatile
+	 * functions, which we couldn't depend on.
+	 */
+	foreach(lc, restrictlist)
+	{
+		RestrictInfo *restrictinfo = (RestrictInfo *) lfirst(lc);
+
+		/*
+		 * As noted above, if it's a pushed-down clause and we're at an outer
+		 * join, we can't use it.
+		 */
+		if (restrictinfo->is_pushed_down && IS_OUTER_JOIN(jointype))
+			continue;
+
+		/* Ignore if it's not a mergejoinable clause */
+		if (!restrictinfo->can_join ||
+			restrictinfo->mergeopfamilies == NIL)
+			continue;			/* not mergejoinable */
+
+		/*
+		 * Check if clause has the form "outer op inner" or "inner op outer",
+		 * and if so mark which side is inner.
+		 */
+		if (!clause_sides_match_join(restrictinfo, outerrel->relids,
+									 innerrel->relids))
+			continue;			/* no good for these input relations */
+
+		/* OK, add to list */
+		clause_list = lappend(clause_list, restrictinfo);
+	}
+
+	/* Let rel_is_distinct_for() do the hard work */
+	return rel_is_distinct_for(root, innerrel, clause_list);
+}
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -215,12 +215,12 @@ static BitmapOr *make_bitmap_or(List *bitmapplans);
 static NestLoop *make_nestloop(List *tlist,
 			  List *joinclauses, List *otherclauses, List *nestParams,
 			  Plan *lefttree, Plan *righttree,
-			  JoinType jointype);
+			  JoinType jointype, bool inner_unique);
 static HashJoin *make_hashjoin(List *tlist,
 			  List *joinclauses, List *otherclauses,
 			  List *hashclauses,
 			  Plan *lefttree, Plan *righttree,
-			  JoinType jointype);
+			  JoinType jointype, bool inner_unique);
 static Hash *make_hash(Plan *lefttree,
 		  Oid skewTable,
 		  AttrNumber skewColumn,
@@ -235,7 +235,8 @@ static MergeJoin *make_mergejoin(List *tlist,
 			   int *mergestrategies,
 			   bool *mergenullsfirst,
 			   Plan *lefttree, Plan *righttree,
-			   JoinType jointype);
+			   JoinType jointype, bool inner_unique,
+			   bool skip_mark_restore);
 static Sort *make_sort(Plan *lefttree, int numCols,
 		  AttrNumber *sortColIdx, Oid *sortOperators,
 		  Oid *collations, bool *nullsFirst);
@@ -3714,7 +3715,8 @@ create_nestloop_plan(PlannerInfo *root,
 							  nestParams,
 							  outer_plan,
 							  inner_plan,
-							  best_path->jointype);
+							  best_path->jointype,
+							  best_path->inner_unique);

 	copy_generic_path_info(&join_plan->join.plan, &best_path->path);

@@ -4016,7 +4018,9 @@ create_mergejoin_plan(PlannerInfo *root,
 							   mergenullsfirst,
 							   outer_plan,
 							   inner_plan,
-							   best_path->jpath.jointype);
+							   best_path->jpath.jointype,
+							   best_path->jpath.inner_unique,
+							   best_path->skip_mark_restore);

 	/* Costs of sort and material steps are included in path cost already */
 	copy_generic_path_info(&join_plan->join.plan, &best_path->jpath.path);
@@ -4156,7 +4160,8 @@ create_hashjoin_plan(PlannerInfo *root,
 							  hashclauses,
 							  outer_plan,
 							  (Plan *) hash_plan,
-							  best_path->jpath.jointype);
+							  best_path->jpath.jointype,
+							  best_path->jpath.inner_unique);

 	copy_generic_path_info(&join_plan->join.plan, &best_path->jpath.path);

@@ -5349,7 +5354,8 @@ make_nestloop(List *tlist,
 			  List *nestParams,
 			  Plan *lefttree,
 			  Plan *righttree,
-			  JoinType jointype)
+			  JoinType jointype,
+			  bool inner_unique)
 {
 	NestLoop   *node = makeNode(NestLoop);
 	Plan	   *plan = &node->join.plan;
@@ -5359,6 +5365,7 @@ make_nestloop(List *tlist,
 	plan->lefttree = lefttree;
 	plan->righttree = righttree;
 	node->join.jointype = jointype;
+	node->join.inner_unique = inner_unique;
 	node->join.joinqual = joinclauses;
 	node->nestParams = nestParams;

@@ -5372,7 +5379,8 @@ make_hashjoin(List *tlist,
 			  List *hashclauses,
 			  Plan *lefttree,
 			  Plan *righttree,
-			  JoinType jointype)
+			  JoinType jointype,
+			  bool inner_unique)
 {
 	HashJoin   *node = makeNode(HashJoin);
 	Plan	   *plan = &node->join.plan;
@@ -5383,6 +5391,7 @@ make_hashjoin(List *tlist,
 	plan->righttree = righttree;
 	node->hashclauses = hashclauses;
 	node->join.jointype = jointype;
+	node->join.inner_unique = inner_unique;
 	node->join.joinqual = joinclauses;

 	return node;
@@ -5424,7 +5433,9 @@ make_mergejoin(List *tlist,
 			   bool *mergenullsfirst,
 			   Plan *lefttree,
 			   Plan *righttree,
-			   JoinType jointype)
+			   JoinType jointype,
+			   bool inner_unique,
+			   bool skip_mark_restore)
 {
 	MergeJoin  *node = makeNode(MergeJoin);
 	Plan	   *plan = &node->join.plan;
@@ -5433,12 +5444,14 @@ make_mergejoin(List *tlist,
 	plan->qual = otherclauses;
 	plan->lefttree = lefttree;
 	plan->righttree = righttree;
+	node->skip_mark_restore = skip_mark_restore;
 	node->mergeclauses = mergeclauses;
 	node->mergeFamilies = mergefamilies;
 	node->mergeCollations = mergecollations;
 	node->mergeStrategies = mergestrategies;
 	node->mergeNullsFirst = mergenullsfirst;
 	node->join.jointype = jointype;
+	node->join.inner_unique = inner_unique;
 	node->join.joinqual = joinclauses;

 	return node;
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -2049,8 +2049,7 @@ calc_non_nestloop_required_outer(Path *outer_path, Path *inner_path)
 * 'joinrel' is the join relation.
 * 'jointype' is the type of join required
 * 'workspace' is the result from initial_cost_nestloop
- * 'sjinfo' is extra info about the join for selectivity estimation
- * 'semifactors' contains valid data if jointype is SEMI or ANTI
+ * 'extra' contains various information about the join
 * 'outer_path' is the outer path
 * 'inner_path' is the inner path
 * 'restrict_clauses' are the RestrictInfo nodes to apply at the join
@@ -2064,8 +2063,7 @@ create_nestloop_path(PlannerInfo *root,
 					 RelOptInfo *joinrel,
 					 JoinType jointype,
 					 JoinCostWorkspace *workspace,
-					 SpecialJoinInfo *sjinfo,
-					 SemiAntiJoinFactors *semifactors,
+					 JoinPathExtraData *extra,
 					 Path *outer_path,
 					 Path *inner_path,
 					 List *restrict_clauses,
@@ -2109,7 +2107,7 @@ create_nestloop_path(PlannerInfo *root,
 								  joinrel,
 								  outer_path,
 								  inner_path,
-								  sjinfo,
+								  extra->sjinfo,
 								  required_outer,
 								  &restrict_clauses);
 	pathnode->path.parallel_aware = false;
@@ -2119,11 +2117,12 @@ create_nestloop_path(PlannerInfo *root,
 	pathnode->path.parallel_workers = outer_path->parallel_workers;
 	pathnode->path.pathkeys = pathkeys;
 	pathnode->jointype = jointype;
+	pathnode->inner_unique = extra->inner_unique;
 	pathnode->outerjoinpath = outer_path;
 	pathnode->innerjoinpath = inner_path;
 	pathnode->joinrestrictinfo = restrict_clauses;

-	final_cost_nestloop(root, pathnode, workspace, sjinfo, semifactors);
+	final_cost_nestloop(root, pathnode, workspace, extra);

 	return pathnode;
 }
@@ -2136,7 +2135,7 @@ create_nestloop_path(PlannerInfo *root,
 * 'joinrel' is the join relation
 * 'jointype' is the type of join required
 * 'workspace' is the result from initial_cost_mergejoin
- * 'sjinfo' is extra info about the join for selectivity estimation
+ * 'extra' contains various information about the join
 * 'outer_path' is the outer path
 * 'inner_path' is the inner path
 * 'restrict_clauses' are the RestrictInfo nodes to apply at the join
@@ -2152,7 +2151,7 @@ create_mergejoin_path(PlannerInfo *root,
 					  RelOptInfo *joinrel,
 					  JoinType jointype,
 					  JoinCostWorkspace *workspace,
-					  SpecialJoinInfo *sjinfo,
+					  JoinPathExtraData *extra,
 					  Path *outer_path,
 					  Path *inner_path,
 					  List *restrict_clauses,
@@ -2172,7 +2171,7 @@ create_mergejoin_path(PlannerInfo *root,
 								  joinrel,
 								  outer_path,
 								  inner_path,
-								  sjinfo,
+								  extra->sjinfo,
 								  required_outer,
 								  &restrict_clauses);
 	pathnode->jpath.path.parallel_aware = false;
@@ -2182,15 +2181,17 @@ create_mergejoin_path(PlannerInfo *root,
 	pathnode->jpath.path.parallel_workers = outer_path->parallel_workers;
 	pathnode->jpath.path.pathkeys = pathkeys;
 	pathnode->jpath.jointype = jointype;
+	pathnode->jpath.inner_unique = extra->inner_unique;
 	pathnode->jpath.outerjoinpath = outer_path;
 	pathnode->jpath.innerjoinpath = inner_path;
 	pathnode->jpath.joinrestrictinfo = restrict_clauses;
 	pathnode->path_mergeclauses = mergeclauses;
 	pathnode->outersortkeys = outersortkeys;
 	pathnode->innersortkeys = innersortkeys;
+	/* pathnode->skip_mark_restore will be set by final_cost_mergejoin */
 	/* pathnode->materialize_inner will be set by final_cost_mergejoin */

-	final_cost_mergejoin(root, pathnode, workspace, sjinfo);
+	final_cost_mergejoin(root, pathnode, workspace, extra);

 	return pathnode;
 }
@@ -2202,8 +2203,7 @@ create_mergejoin_path(PlannerInfo *root,
 * 'joinrel' is the join relation
 * 'jointype' is the type of join required
 * 'workspace' is the result from initial_cost_hashjoin
- * 'sjinfo' is extra info about the join for selectivity estimation
- * 'semifactors' contains valid data if jointype is SEMI or ANTI
+ * 'extra' contains various information about the join
 * 'outer_path' is the cheapest outer path
 * 'inner_path' is the cheapest inner path
 * 'restrict_clauses' are the RestrictInfo nodes to apply at the join
@@ -2216,8 +2216,7 @@ create_hashjoin_path(PlannerInfo *root,
 					 RelOptInfo *joinrel,
 					 JoinType jointype,
 					 JoinCostWorkspace *workspace,
-					 SpecialJoinInfo *sjinfo,
-					 SemiAntiJoinFactors *semifactors,
+					 JoinPathExtraData *extra,
 					 Path *outer_path,
 					 Path *inner_path,
 					 List *restrict_clauses,
@@ -2234,7 +2233,7 @@ create_hashjoin_path(PlannerInfo *root,
 								  joinrel,
 								  outer_path,
 								  inner_path,
-								  sjinfo,
+								  extra->sjinfo,
 								  required_outer,
 								  &restrict_clauses);
 	pathnode->jpath.path.parallel_aware = false;
@@ -2256,13 +2255,14 @@ create_hashjoin_path(PlannerInfo *root,
 	 */
 	pathnode->jpath.path.pathkeys = NIL;
 	pathnode->jpath.jointype = jointype;
+	pathnode->jpath.inner_unique = extra->inner_unique;
 	pathnode->jpath.outerjoinpath = outer_path;
 	pathnode->jpath.innerjoinpath = inner_path;
 	pathnode->jpath.joinrestrictinfo = restrict_clauses;
 	pathnode->path_hashclauses = hashclauses;
 	/* final_cost_hashjoin will fill in pathnode->num_batches */

-	final_cost_hashjoin(root, pathnode, workspace, sjinfo, semifactors);
+	final_cost_hashjoin(root, pathnode, workspace, extra);

 	return pathnode;
 }
--- a/src/backend/optimizer/util/relnode.c
+++ b/src/backend/optimizer/util/relnode.c
@@ -126,6 +126,7 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent)
 	rel->lateral_vars = NIL;
 	rel->lateral_referencers = NULL;
 	rel->indexlist = NIL;
+	rel->statlist = NIL;
 	rel->pages = 0;
 	rel->tuples = 0;
 	rel->allvisfrac = 0;
@@ -137,6 +138,8 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent)
 	rel->useridiscurrent = false;
 	rel->fdwroutine = NULL;
 	rel->fdw_private = NULL;
+	rel->unique_for_rels = NIL;
+	rel->non_unique_for_rels = NIL;
 	rel->baserestrictinfo = NIL;
 	rel->baserestrictcost.startup = 0;
 	rel->baserestrictcost.per_tuple = 0;
@@ -147,7 +150,7 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent)
 	/*
 	 * Pass top parent's relids down the inheritance hierarchy. If the parent
 	 * has top_parent_relids set, it's a direct or an indirect child of the top
-	 * parent indicated by top_parent_relids. By extention this child is also
+	 * parent indicated by top_parent_relids. By extension this child is also
 	 * an indirect child of that parent.
 	 */
 	if (parent)
@@ -503,6 +506,7 @@ build_join_rel(PlannerInfo *root,
 	joinrel->lateral_vars = NIL;
 	joinrel->lateral_referencers = NULL;
 	joinrel->indexlist = NIL;
+	joinrel->statlist = NIL;
 	joinrel->pages = 0;
 	joinrel->tuples = 0;
 	joinrel->allvisfrac = 0;
@@ -514,6 +518,8 @@ build_join_rel(PlannerInfo *root,
 	joinrel->useridiscurrent = false;
 	joinrel->fdwroutine = NULL;
 	joinrel->fdw_private = NULL;
+	joinrel->unique_for_rels = NIL;
+	joinrel->non_unique_for_rels = NIL;
 	joinrel->baserestrictinfo = NIL;
 	joinrel->baserestrictcost.startup = 0;
 	joinrel->baserestrictcost.per_tuple = 0;