Adjust definition of cheapest_total_path to work better with LATERAL.

In the initial cut at LATERAL, I kept the rule that cheapest_total_path was always unparameterized, which meant it had to be NULL if the relation has no unparameterized paths. It turns out to work much more nicely if we always have *some* path nominated as cheapest-total for each relation. In particular, let's still say it's the cheapest unparameterized path if there is one; if not, take the cheapest-total-cost path among those of the minimum available parameterization. (The first rule is actually a special case of the second.)

This allows reversion of some temporary lobotomizations I'd put in place. In particular, the planner can now consider hash and merge joins for joins below a parameter-supplying nestloop, even if there aren't any unparameterized paths available. This should bring planning of LATERAL-containing queries to the same level as queries not using that feature.

Along the way, simplify management of parameterized paths in add_path() and friends. In the original coding for parameterized paths in 9.2, I tried to minimize the logic changes in add_path(), so it just treated parameterization as yet another dimension of comparison for paths. We later made it ignore pathkeys (sort ordering) of parameterized paths, on the grounds that ordering isn't a useful property for the path on the inside of a nestloop, so we might as well get rid of useless parameterized paths as quickly as possible. But we didn't take that reasoning as far as we should have. Startup cost isn't a useful property inside a nestloop either, so add_path() ought to discount startup cost of parameterized paths as well. Having done that, the secondary sorting I'd implemented (in add_parameterized_path) is no longer needed --- any parameterized path that survives add_path() at all is worth considering at higher levels. So this should be a bit faster as well as simpler.
2025-07-02 09:02:37 +03:00 · 2012-08-29 22:05:27 -04:00
parent 9fe6da5c0d
commit e83bb10d6d
8 changed files with 217 additions and 252 deletions
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@ -37,7 +37,6 @@ typedef enum
 	COSTS_DIFFERENT				/* neither path dominates the other on cost */
 } PathCostComparison;

-static void add_parameterized_path(RelOptInfo *parent_rel, Path *new_path);
 static List *translate_sub_tlist(List *tlist, int relid);
 static bool query_is_distinct_for(Query *query, List *colnos, List *opids);
 static Oid	distinct_col_search(int colno, List *colnos, List *opids);
@ -139,6 +138,12 @@ compare_fractional_path_costs(Path *path1, Path *path2,
 * is fuzzily better than the other on startup cost and fuzzily worse on
 * total cost, we just say that their costs are "different", since neither
 * dominates the other across the whole performance spectrum.
+ *
+ * This function also includes special hacks to support a policy enforced
+ * by its sole caller, add_path(): paths that have any parameterization
+ * cannot win comparisons on the grounds of having cheaper startup cost,
+ * since we deem only total cost to be of interest for a parameterized path.
+ * (Unparameterized paths are more common, so we check for this case last.)
 */
 static PathCostComparison
 compare_path_costs_fuzzily(Path *path1, Path *path2, double fuzz_factor)
@ -150,7 +155,8 @@ compare_path_costs_fuzzily(Path *path1, Path *path2, double fuzz_factor)
 	if (path1->total_cost > path2->total_cost * fuzz_factor)
 	{
 		/* path1 fuzzily worse on total cost */
-		if (path2->startup_cost > path1->startup_cost * fuzz_factor)
+		if (path2->startup_cost > path1->startup_cost * fuzz_factor &&
+			path1->param_info == NULL)
 		{
 			/* ... but path2 fuzzily worse on startup, so DIFFERENT */
 			return COSTS_DIFFERENT;
@ -161,7 +167,8 @@ compare_path_costs_fuzzily(Path *path1, Path *path2, double fuzz_factor)
 	if (path2->total_cost > path1->total_cost * fuzz_factor)
 	{
 		/* path2 fuzzily worse on total cost */
-		if (path1->startup_cost > path2->startup_cost * fuzz_factor)
+		if (path1->startup_cost > path2->startup_cost * fuzz_factor &&
+			path2->param_info == NULL)
 		{
 			/* ... but path1 fuzzily worse on startup, so DIFFERENT */
 			return COSTS_DIFFERENT;
@ -170,12 +177,14 @@ compare_path_costs_fuzzily(Path *path1, Path *path2, double fuzz_factor)
 		return COSTS_BETTER1;
 	}
 	/* fuzzily the same on total cost */
-	if (path1->startup_cost > path2->startup_cost * fuzz_factor)
+	if (path1->startup_cost > path2->startup_cost * fuzz_factor &&
+		path2->param_info == NULL)
 	{
 		/* ... but path1 fuzzily worse on startup, so path2 wins */
 		return COSTS_BETTER2;
 	}
-	if (path2->startup_cost > path1->startup_cost * fuzz_factor)
+	if (path2->startup_cost > path1->startup_cost * fuzz_factor &&
+		path1->param_info == NULL)
 	{
 		/* ... but path2 fuzzily worse on startup, so path1 wins */
 		return COSTS_BETTER1;
@ -189,11 +198,19 @@ compare_path_costs_fuzzily(Path *path1, Path *path2, double fuzz_factor)
 *	  Find the minimum-cost paths from among a relation's paths,
 *	  and save them in the rel's cheapest-path fields.
 *
- * Only unparameterized paths are considered candidates for cheapest_startup
- * and cheapest_total.	The cheapest_parameterized_paths list collects paths
- * that are cheapest-total for their parameterization (i.e., there is no
- * cheaper path with the same or weaker parameterization).	This list always
- * includes the unparameterized cheapest-total path, too, if there is one.
+ * cheapest_total_path is normally the cheapest-total-cost unparameterized
+ * path; but if there are no unparameterized paths, we assign it to be the
+ * best (cheapest least-parameterized) parameterized path.  However, only
+ * unparameterized paths are considered candidates for cheapest_startup_path,
+ * so that will be NULL if there are no unparameterized paths.
+ *
+ * The cheapest_parameterized_paths list collects all parameterized paths
+ * that have survived the add_path() tournament for this relation.  (Since
+ * add_path ignores pathkeys and startup cost for a parameterized path,
+ * these will be paths that have best total cost or best row count for their
+ * parameterization.)  cheapest_parameterized_paths always includes the
+ * cheapest-total unparameterized path, too, if there is one; the users of
+ * that list find it more convenient if that's included.
 *
 * This is normally called only after we've finished constructing the path
 * list for the rel node.
@ -203,77 +220,118 @@ set_cheapest(RelOptInfo *parent_rel)
 {
 	Path	   *cheapest_startup_path;
 	Path	   *cheapest_total_path;
-	bool		have_parameterized_paths;
+	Path	   *best_param_path;
+	List	   *parameterized_paths;
 	ListCell   *p;

 	Assert(IsA(parent_rel, RelOptInfo));

-	cheapest_startup_path = cheapest_total_path = NULL;
-	have_parameterized_paths = false;
+	if (parent_rel->pathlist == NIL)
+		elog(ERROR, "could not devise a query plan for the given query");
+
+	cheapest_startup_path = cheapest_total_path = best_param_path = NULL;
+	parameterized_paths = NIL;

 	foreach(p, parent_rel->pathlist)
 	{
 		Path	   *path = (Path *) lfirst(p);
 		int			cmp;

-		/* We only consider unparameterized paths in this step */
 		if (path->param_info)
 		{
-			have_parameterized_paths = true;
-			continue;
-		}
+			/* Parameterized path, so add it to parameterized_paths */
+			parameterized_paths = lappend(parameterized_paths, path);

-		if (cheapest_total_path == NULL)
+			/*
+			 * If we have an unparameterized cheapest-total, we no longer care
+			 * about finding the best parameterized path, so move on.
+			 */
+			if (cheapest_total_path)
+				continue;
+
+			/*
+			 * Otherwise, track the best parameterized path, which is the one
+			 * with least total cost among those of the minimum
+			 * parameterization.
+			 */
+			if (best_param_path == NULL)
+				best_param_path = path;
+			else
+			{
+				switch (bms_subset_compare(PATH_REQ_OUTER(path),
+										   PATH_REQ_OUTER(best_param_path)))
+				{
+					case BMS_EQUAL:
+						/* keep the cheaper one */
+						if (compare_path_costs(path, best_param_path,
+											   TOTAL_COST) < 0)
+							best_param_path = path;
+						break;
+					case BMS_SUBSET1:
+						/* new path is less-parameterized */
+						best_param_path = path;
+						break;
+					case BMS_SUBSET2:
+						/* old path is less-parameterized, keep it */
+						break;
+					case BMS_DIFFERENT:
+						/*
+						 * This means that neither path has the least possible
+						 * parameterization for the rel.  We'll sit on the old
+						 * path until something better comes along.
+						 */
+						break;
+				}
+			}
+		}
+		else
 		{
-			cheapest_startup_path = cheapest_total_path = path;
-			continue;
+			/* Unparameterized path, so consider it for cheapest slots */
+			if (cheapest_total_path == NULL)
+			{
+				cheapest_startup_path = cheapest_total_path = path;
+				continue;
+			}
+
+			/*
+			 * If we find two paths of identical costs, try to keep the
+			 * better-sorted one.  The paths might have unrelated sort
+			 * orderings, in which case we can only guess which might be
+			 * better to keep, but if one is superior then we definitely
+			 * should keep that one.
+			 */
+			cmp = compare_path_costs(cheapest_startup_path, path, STARTUP_COST);
+			if (cmp > 0 ||
+				(cmp == 0 &&
+				 compare_pathkeys(cheapest_startup_path->pathkeys,
+								  path->pathkeys) == PATHKEYS_BETTER2))
+				cheapest_startup_path = path;
+
+			cmp = compare_path_costs(cheapest_total_path, path, TOTAL_COST);
+			if (cmp > 0 ||
+				(cmp == 0 &&
+				 compare_pathkeys(cheapest_total_path->pathkeys,
+								  path->pathkeys) == PATHKEYS_BETTER2))
+				cheapest_total_path = path;
 		}
-
-		/*
-		 * If we find two paths of identical costs, try to keep the
-		 * better-sorted one.  The paths might have unrelated sort orderings,
-		 * in which case we can only guess which might be better to keep, but
-		 * if one is superior then we definitely should keep that one.
-		 */
-		cmp = compare_path_costs(cheapest_startup_path, path, STARTUP_COST);
-		if (cmp > 0 ||
-			(cmp == 0 &&
-			 compare_pathkeys(cheapest_startup_path->pathkeys,
-							  path->pathkeys) == PATHKEYS_BETTER2))
-			cheapest_startup_path = path;
-
-		cmp = compare_path_costs(cheapest_total_path, path, TOTAL_COST);
-		if (cmp > 0 ||
-			(cmp == 0 &&
-			 compare_pathkeys(cheapest_total_path->pathkeys,
-							  path->pathkeys) == PATHKEYS_BETTER2))
-			cheapest_total_path = path;
 	}

-	if (cheapest_total_path == NULL && !have_parameterized_paths)
-		elog(ERROR, "could not devise a query plan for the given query");
+	/* Add cheapest unparameterized path, if any, to parameterized_paths */
+	if (cheapest_total_path)
+		parameterized_paths = lcons(cheapest_total_path, parameterized_paths);
+
+	/*
+	 * If there is no unparameterized path, use the best parameterized path
+	 * as cheapest_total_path (but not as cheapest_startup_path).
+	 */
+	if (cheapest_total_path == NULL)
+		cheapest_total_path = best_param_path;
+	Assert(cheapest_total_path != NULL);

 	parent_rel->cheapest_startup_path = cheapest_startup_path;
 	parent_rel->cheapest_total_path = cheapest_total_path;
 	parent_rel->cheapest_unique_path = NULL;	/* computed only if needed */
-
-	/* Seed the parameterized-paths list with the cheapest total, if any */
-	if (cheapest_total_path)
-		parent_rel->cheapest_parameterized_paths = list_make1(cheapest_total_path);
-	else
-		parent_rel->cheapest_parameterized_paths = NIL;
-
-	/* And, if there are any parameterized paths, add them in one at a time */
-	if (have_parameterized_paths)
-	{
-		foreach(p, parent_rel->pathlist)
-		{
-			Path	   *path = (Path *) lfirst(p);
-
-			if (path->param_info)
-				add_parameterized_path(parent_rel, path);
-		}
-	}
+	parent_rel->cheapest_parameterized_paths = parameterized_paths;
 }

 /*
@ -295,11 +353,12 @@ set_cheapest(RelOptInfo *parent_rel)
 *	  one parameterization can seldom dominate a path of another.  But such
 *	  cases do arise, so we make the full set of checks anyway.
 *
- *	  There is one policy decision embedded in this function, along with its
- *	  sibling add_path_precheck: we treat all parameterized paths as having
- *	  NIL pathkeys, so that they compete only on cost.	This is to reduce
- *	  the number of parameterized paths that are kept.	See discussion in
- *	  src/backend/optimizer/README.
+ *	  There are two policy decisions embedded in this function, along with
+ *	  its sibling add_path_precheck: we treat all parameterized paths as
+ *	  having NIL pathkeys, and we ignore their startup costs, so that they
+ *	  compete only on parameterization, total cost and rowcount.  This is to
+ *	  reduce the number of parameterized paths that are kept.  See discussion
+ *	  in src/backend/optimizer/README.
 *
 *	  The pathlist is kept sorted by total_cost, with cheaper paths
 *	  at the front.  Within this routine, that's simply a speed hack:
@ -552,7 +611,7 @@ add_path_precheck(RelOptInfo *parent_rel,
 	List	   *new_path_pathkeys;
 	ListCell   *p1;

-	/* Pretend parameterized paths have no pathkeys, per add_path comment */
+	/* Pretend parameterized paths have no pathkeys, per add_path policy */
 	new_path_pathkeys = required_outer ? NIL : pathkeys;

 	foreach(p1, parent_rel->pathlist)
@ -572,8 +631,10 @@ add_path_precheck(RelOptInfo *parent_rel,
 		 */
 		if (total_cost >= old_path->total_cost)
 		{
-			if (startup_cost >= old_path->startup_cost)
+			/* can win on startup cost only if unparameterized */
+			if (startup_cost >= old_path->startup_cost || required_outer)
 			{
+				/* new path does not win on cost, so check pathkeys... */
 				List	   *old_path_pathkeys;

 				old_path_pathkeys = old_path->param_info ? NIL : old_path->pathkeys;
@ -582,6 +643,7 @@ add_path_precheck(RelOptInfo *parent_rel,
 				if (keyscmp == PATHKEYS_EQUAL ||
 					keyscmp == PATHKEYS_BETTER2)
 				{
+					/* new path does not win on pathkeys... */
 					if (bms_equal(required_outer, PATH_REQ_OUTER(old_path)))
 					{
 						/* Found an old path that dominates the new one */
@ -604,123 +666,6 @@ add_path_precheck(RelOptInfo *parent_rel,
 	return true;
 }

-/*
- * add_parameterized_path
- *	  Consider a parameterized implementation path for the specified rel,
- *	  and add it to the rel's cheapest_parameterized_paths list if it
- *	  belongs there, removing any old entries that it dominates.
- *
- *	  This is essentially a cut-down form of add_path(): we do not care
- *	  about startup cost or sort ordering, only total cost, rowcount, and
- *	  parameterization.  Also, we must not recycle rejected paths, since
- *	  they will still be present in the rel's pathlist.
- *
- * 'parent_rel' is the relation entry to which the path corresponds.
- * 'new_path' is a parameterized path for parent_rel.
- *
- * Returns nothing, but modifies parent_rel->cheapest_parameterized_paths.
- */
-static void
-add_parameterized_path(RelOptInfo *parent_rel, Path *new_path)
-{
-	bool		accept_new = true;		/* unless we find a superior old path */
-	ListCell   *insert_after = NULL;	/* where to insert new item */
-	ListCell   *p1;
-	ListCell   *p1_prev;
-	ListCell   *p1_next;
-
-	/*
-	 * Loop to check proposed new path against old paths.  Note it is possible
-	 * for more than one old path to be tossed out because new_path dominates
-	 * it.
-	 *
-	 * We can't use foreach here because the loop body may delete the current
-	 * list cell.
-	 */
-	p1_prev = NULL;
-	for (p1 = list_head(parent_rel->cheapest_parameterized_paths);
-		 p1 != NULL; p1 = p1_next)
-	{
-		Path	   *old_path = (Path *) lfirst(p1);
-		bool		remove_old = false; /* unless new proves superior */
-		int			costcmp;
-		BMS_Comparison outercmp;
-
-		p1_next = lnext(p1);
-
-		costcmp = compare_path_costs(new_path, old_path, TOTAL_COST);
-		outercmp = bms_subset_compare(PATH_REQ_OUTER(new_path),
-									  PATH_REQ_OUTER(old_path));
-		if (outercmp != BMS_DIFFERENT)
-		{
-			if (costcmp < 0)
-			{
-				if (outercmp != BMS_SUBSET2 &&
-					new_path->rows <= old_path->rows)
-					remove_old = true;	/* new dominates old */
-			}
-			else if (costcmp > 0)
-			{
-				if (outercmp != BMS_SUBSET1 &&
-					new_path->rows >= old_path->rows)
-					accept_new = false; /* old dominates new */
-			}
-			else if (outercmp == BMS_SUBSET1 &&
-					 new_path->rows <= old_path->rows)
-				remove_old = true;		/* new dominates old */
-			else if (outercmp == BMS_SUBSET2 &&
-					 new_path->rows >= old_path->rows)
-				accept_new = false;		/* old dominates new */
-			else if (new_path->rows < old_path->rows)
-				remove_old = true;		/* new dominates old */
-			else
-			{
-				/* Same cost, rows, and param rels; arbitrarily keep old */
-				accept_new = false;		/* old equals or dominates new */
-			}
-		}
-
-		/*
-		 * Remove current element from cheapest_parameterized_paths if
-		 * dominated by new.
-		 */
-		if (remove_old)
-		{
-			parent_rel->cheapest_parameterized_paths =
-				list_delete_cell(parent_rel->cheapest_parameterized_paths,
-								 p1, p1_prev);
-			/* p1_prev does not advance */
-		}
-		else
-		{
-			/* new belongs after this old path if it has cost >= old's */
-			if (costcmp >= 0)
-				insert_after = p1;
-			/* p1_prev advances */
-			p1_prev = p1;
-		}
-
-		/*
-		 * If we found an old path that dominates new_path, we can quit
-		 * scanning the list; we will not add new_path, and we assume new_path
-		 * cannot dominate any other elements of the list.
-		 */
-		if (!accept_new)
-			break;
-	}
-
-	if (accept_new)
-	{
-		/* Accept the new path: insert it at proper place in list */
-		if (insert_after)
-			lappend_cell(parent_rel->cheapest_parameterized_paths,
-						 insert_after, new_path);
-		else
-			parent_rel->cheapest_parameterized_paths =
-				lcons(new_path, parent_rel->cheapest_parameterized_paths);
-	}
-}
-

 /*****************************************************************************
 *		PATH NODE CREATION ROUTINES
@ -1137,13 +1082,6 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath,
 	int			numCols;
 	ListCell   *lc;

-	/* XXX temporary band-aid to not crash on LATERAL queries */
-	if (subpath == NULL)
-	{
-		Assert(subpath == rel->cheapest_total_path);
-		return NULL;
-	}
-
 	/* Caller made a mistake if subpath isn't cheapest_total ... */
 	Assert(subpath == rel->cheapest_total_path);
 	Assert(subpath->parent == rel);