@@ -54,7 +54,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.210 2009/07/11 04:09:33 tgl Exp $
+ *    $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.211 2009/09/12 22:12:03 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -63,6 +63,7 @@
 
 #include <math.h>
 
+#include "executor/executor.h"
 #include "executor/nodeHash.h"
 #include "miscadmin.h"
 #include "nodes/nodeFuncs.h"
@@ -119,6 +120,8 @@ typedef struct
 static MergeScanSelCache *cached_scansel(PlannerInfo *root,
                RestrictInfo *rinfo,
                PathKey *pathkey);
+static void cost_rescan(PlannerInfo *root, Path *path,
+               Cost *rescan_startup_cost, Cost *rescan_total_cost);
 static bool cost_qual_eval_walker(Node *node, cost_qual_eval_context *context);
 static bool adjust_semi_join(PlannerInfo *root, JoinPath *path,
                SpecialJoinInfo *sjinfo,
@@ -895,15 +898,26 @@ cost_functionscan(Path *path, PlannerInfo *root, RelOptInfo *baserel)
     rte = planner_rt_fetch(baserel->relid, root);
     Assert(rte->rtekind == RTE_FUNCTION);
 
-    /* Estimate costs of executing the function expression */
+    /*
+     * Estimate costs of executing the function expression.
+     *
+     * Currently, nodeFunctionscan.c always executes the function to
+     * completion before returning any rows, and caches the results in a
+     * tuplestore. So the function eval cost is all startup cost, and
+     * per-row costs are minimal.
+     *
+     * XXX in principle we ought to charge tuplestore spill costs if the
+     * number of rows is large. However, given how phony our rowcount
+     * estimates for functions tend to be, there's not a lot of point
+     * in that refinement right now.
+     */
     cost_qual_eval_node(&exprcost, rte->funcexpr, root);
 
-    startup_cost += exprcost.startup;
-    cpu_per_tuple = exprcost.per_tuple;
+    startup_cost += exprcost.startup + exprcost.per_tuple;
 
     /* Add scanning CPU costs */
     startup_cost += baserel->baserestrictcost.startup;
-    cpu_per_tuple += cpu_tuple_cost + baserel->baserestrictcost.per_tuple;
+    cpu_per_tuple = cpu_tuple_cost + baserel->baserestrictcost.per_tuple;
     run_cost += cpu_per_tuple * baserel->tuples;
 
     path->startup_cost = startup_cost;
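
The effect of this change is easiest to see as plain arithmetic. Below is a standalone sketch (not planner code) of the cost split the patched cost_functionscan computes, using illustrative stand-in values for the expression cost, the restriction-clause costs, and the default cpu_tuple_cost:

/* Standalone sketch of the patched function-scan costing; all values
 * are illustrative stand-ins, not PostgreSQL data structures. */
#include <stdio.h>

int
main(void)
{
    double cpu_tuple_cost = 0.01;     /* default GUC value */
    double exprcost_startup = 0.0;    /* hypothetical function expr cost */
    double exprcost_per_tuple = 0.0025;
    double restrict_startup = 0.0;    /* hypothetical qual costs */
    double restrict_per_tuple = 0.005;
    double tuples = 1000.0;           /* estimated result rows */

    /* The whole function evaluation is now charged at startup, because
     * nodeFunctionscan.c runs the function to completion and caches the
     * result in a tuplestore before returning any rows. */
    double startup_cost = exprcost_startup + exprcost_per_tuple;

    startup_cost += restrict_startup;

    /* Only per-row scanning CPU is charged during the run phase. */
    double cpu_per_tuple = cpu_tuple_cost + restrict_per_tuple;
    double run_cost = cpu_per_tuple * tuples;

    printf("startup = %g, total = %g\n", startup_cost, startup_cost + run_cost);
    return 0;
}
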
@@ -1176,41 +1190,44 @@ sort_exceeds_work_mem(Sort *sort)
  *
  * If the total volume of data to materialize exceeds work_mem, we will need
  * to write it to disk, so the cost is much higher in that case.
+ *
+ * Note that here we are estimating the costs for the first scan of the
+ * relation, so the materialization is all overhead --- any savings will
+ * occur only on rescan, which is estimated in cost_rescan.
  */
 void
 cost_material(Path *path,
-              Cost input_cost, double tuples, int width)
+              Cost input_startup_cost, Cost input_total_cost,
+              double tuples, int width)
 {
-    Cost        startup_cost = input_cost;
-    Cost        run_cost = 0;
+    Cost        startup_cost = input_startup_cost;
+    Cost        run_cost = input_total_cost - input_startup_cost;
     double      nbytes = relation_byte_size(tuples, width);
     long        work_mem_bytes = work_mem * 1024L;
 
-    /* disk costs */
+    /*
+     * Whether spilling or not, charge 2x cpu_tuple_cost per tuple to reflect
+     * bookkeeping overhead. (This rate must be more than cpu_tuple_cost;
+     * if it is exactly the same then there will be a cost tie between
+     * nestloop with A outer, materialized B inner and nestloop with B outer,
+     * materialized A inner. The extra cost ensures we'll prefer
+     * materializing the smaller rel.)
+     */
+    run_cost += 2 * cpu_tuple_cost * tuples;
+
+    /*
+     * If we will spill to disk, charge at the rate of seq_page_cost per page.
+     * This cost is assumed to be evenly spread through the plan run phase,
+     * which isn't exactly accurate but our cost model doesn't allow for
+     * nonuniform costs within the run phase.
+     */
     if (nbytes > work_mem_bytes)
     {
         double      npages = ceil(nbytes / BLCKSZ);
 
-        /* We'll write during startup and read during retrieval */
-        startup_cost += seq_page_cost * npages;
         run_cost += seq_page_cost * npages;
     }
 
-    /*
-     * Charge a very small amount per inserted tuple, to reflect bookkeeping
-     * costs. We use cpu_tuple_cost/10 for this. This is needed to break the
-     * tie that would otherwise exist between nestloop with A outer,
-     * materialized B inner and nestloop with B outer, materialized A inner.
-     * The extra cost ensures we'll prefer materializing the smaller rel.
-     */
-    startup_cost += cpu_tuple_cost * 0.1 * tuples;
-
-    /*
-     * Also charge a small amount per extracted tuple. We use cpu_tuple_cost
-     * so that it doesn't appear worthwhile to materialize a bare seqscan.
-     */
-    run_cost += cpu_tuple_cost * tuples;
-
     path->startup_cost = startup_cost;
     path->total_cost = startup_cost + run_cost;
 }
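
The rewritten cost_material reduces to a few formulas: carry the input path's startup and run costs through separately, charge 2 * cpu_tuple_cost per tuple for tuplestore bookkeeping, and charge seq_page_cost per spilled page during the run phase when the materialized data exceeds work_mem. A standalone sketch with illustrative numbers (plain doubles instead of Path fields, and a simplified stand-in for relation_byte_size):

/* Standalone sketch of the patched cost_material arithmetic; the input
 * costs, row count, width and work_mem value are illustrative. */
#include <math.h>
#include <stdio.h>

#define BLCKSZ 8192                       /* assumed default block size */

int
main(void)
{
    double cpu_tuple_cost = 0.01;         /* default GUC values */
    double seq_page_cost = 1.0;
    long   work_mem_bytes = 1024 * 1024L; /* assume work_mem = 1MB */

    double input_startup_cost = 10.0;     /* made-up input path costs */
    double input_total_cost = 500.0;
    double tuples = 100000.0;
    int    width = 64;

    double startup_cost = input_startup_cost;
    double run_cost = input_total_cost - input_startup_cost;
    double nbytes = tuples * width;       /* crude stand-in for relation_byte_size() */

    /* 2x cpu_tuple_cost per tuple for tuplestore bookkeeping overhead */
    run_cost += 2 * cpu_tuple_cost * tuples;

    /* spill charge, spread over the run phase */
    if (nbytes > work_mem_bytes)
        run_cost += seq_page_cost * ceil(nbytes / BLCKSZ);

    printf("startup = %g, total = %g\n", startup_cost, startup_cost + run_cost);
    return 0;
}
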
@@ -1400,7 +1417,10 @@ cost_nestloop(NestPath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
     Path       *inner_path = path->innerjoinpath;
     Cost        startup_cost = 0;
     Cost        run_cost = 0;
+    Cost        inner_rescan_start_cost;
+    Cost        inner_rescan_total_cost;
     Cost        inner_run_cost;
+    Cost        inner_rescan_run_cost;
     Cost        cpu_per_tuple;
     QualCost    restrict_qual_cost;
     double      outer_path_rows = PATH_ROWS(outer_path);
@@ -1413,32 +1433,26 @@ cost_nestloop(NestPath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
     if (!enable_nestloop)
         startup_cost += disable_cost;
 
+    /* estimate costs to rescan the inner relation */
+    cost_rescan(root, inner_path,
+                &inner_rescan_start_cost,
+                &inner_rescan_total_cost);
+
     /* cost of source data */
 
     /*
      * NOTE: clearly, we must pay both outer and inner paths' startup_cost
      * before we can start returning tuples, so the join's startup cost is
-     * their sum. What's not so clear is whether the inner path's
-     * startup_cost must be paid again on each rescan of the inner path. This
-     * is not true if the inner path is materialized or is a hashjoin, but
-     * probably is true otherwise.
+     * their sum. We'll also pay the inner path's rescan startup cost
+     * multiple times.
      */
     startup_cost += outer_path->startup_cost + inner_path->startup_cost;
     run_cost += outer_path->total_cost - outer_path->startup_cost;
-    if (IsA(inner_path, MaterialPath) ||
-        IsA(inner_path, HashPath))
-    {
-        /* charge only run cost for each iteration of inner path */
-    }
-    else
-    {
-        /*
-         * charge startup cost for each iteration of inner path, except we
-         * already charged the first startup_cost in our own startup
-         */
-        run_cost += (outer_path_rows - 1) * inner_path->startup_cost;
-    }
+    if (outer_path_rows > 1)
+        run_cost += (outer_path_rows - 1) * inner_rescan_start_cost;
+
     inner_run_cost = inner_path->total_cost - inner_path->startup_cost;
+    inner_rescan_run_cost = inner_rescan_total_cost - inner_rescan_start_cost;
 
     if (adjust_semi_join(root, path, sjinfo,
                          &outer_match_frac,
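
The reworked cost_nestloop pays both inputs' startup costs before the first row, then pays the inner path's rescan startup cost once per outer row after the first, and keeps first-scan and rescan run costs separate for the branches that follow. A standalone sketch of that composition, with made-up numbers standing in for Path costs and for the output of cost_rescan:

/* Standalone sketch of the startup/rescan composition above; outer_*,
 * inner_* and the rescan figures are illustrative stand-ins for Path
 * costs and for what cost_rescan() would report. */
#include <stdio.h>

int
main(void)
{
    double outer_startup = 0.0, outer_total = 100.0, outer_rows = 50.0;
    double inner_startup = 5.0, inner_total = 30.0;
    double inner_rescan_startup = 0.0;    /* e.g. a materialized inner */
    double inner_rescan_total = 2.0;

    /* both inputs' startup costs are paid before the first row */
    double startup_cost = outer_startup + inner_startup;
    double run_cost = outer_total - outer_startup;

    /* each rescan of the inner path pays its rescan startup cost */
    if (outer_rows > 1)
        run_cost += (outer_rows - 1) * inner_rescan_startup;

    /* per-scan run costs, consumed by the later hunks */
    double inner_run_cost = inner_total - inner_startup;
    double inner_rescan_run_cost = inner_rescan_total - inner_rescan_startup;

    printf("startup = %g, run so far = %g (first scan %g, rescan %g per repeat)\n",
           startup_cost, run_cost, inner_run_cost, inner_rescan_run_cost);
    return 0;
}
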
@@ -1458,12 +1472,22 @@ cost_nestloop(NestPath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
          * that fraction. (If we used a larger fuzz factor, we'd have to
          * clamp inner_scan_frac to at most 1.0; but since match_count is at
          * least 1, no such clamp is needed now.)
+         *
+         * A complicating factor is that rescans may be cheaper than first
+         * scans. If we never scan all the way to the end of the inner rel,
+         * it might be (depending on the plan type) that we'd never pay the
+         * whole inner first-scan run cost. However it is difficult to
+         * estimate whether that will happen, so be conservative and always
+         * charge the whole first-scan cost once.
          */
+        run_cost += inner_run_cost;
+
         outer_matched_rows = rint(outer_path_rows * outer_match_frac);
         inner_scan_frac = 2.0 / (match_count + 1.0);
 
-        /* Add inner run cost for outer tuples having matches */
-        run_cost += outer_matched_rows * inner_run_cost * inner_scan_frac;
+        /* Add inner run cost for additional outer tuples having matches */
+        if (outer_matched_rows > 1)
+            run_cost += (outer_matched_rows - 1) * inner_rescan_run_cost * inner_scan_frac;
 
         /* Compute number of tuples processed (not number emitted!) */
         ntuples = outer_matched_rows * inner_path_rows * inner_scan_frac;
@@ -1479,13 +1503,16 @@ cost_nestloop(NestPath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
         if (indexed_join_quals)
         {
             run_cost += (outer_path_rows - outer_matched_rows) *
-                inner_run_cost / inner_path_rows;
-            /* We won't be evaluating any quals at all for these rows */
+                inner_rescan_run_cost / inner_path_rows;
+            /*
+             * We won't be evaluating any quals at all for these rows,
+             * so don't add them to ntuples.
+             */
         }
         else
         {
             run_cost += (outer_path_rows - outer_matched_rows) *
-                inner_run_cost;
+                inner_rescan_run_cost;
             ntuples += (outer_path_rows - outer_matched_rows) *
                 inner_path_rows;
         }
@@ -1493,7 +1520,9 @@ cost_nestloop(NestPath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
     else
     {
         /* Normal case; we'll scan whole input rel for each outer row */
-        run_cost += outer_path_rows * inner_run_cost;
+        run_cost += inner_run_cost;
+        if (outer_path_rows > 1)
+            run_cost += (outer_path_rows - 1) * inner_rescan_run_cost;
 
         /* Compute number of tuples processed (not number emitted!) */
         ntuples = outer_path_rows * inner_path_rows;
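
For SEMI and ANTI joins the patched code charges one whole first scan of the inner rel, a fuzzed fraction 2/(match_count + 1) of a rescan for each additional matched outer row, and (in the branch with no usable index quals) a full rescan for each unmatched outer row. A worked sketch with illustrative estimates:

/* Standalone sketch of the SEMI/ANTI inner-scan charging above; the
 * row counts, selectivities and per-scan costs are illustrative. */
#include <math.h>
#include <stdio.h>

int
main(void)
{
    double outer_rows = 1000.0;
    double outer_match_frac = 0.3;        /* fraction of outer rows with a match */
    double match_count = 4.0;             /* avg matches per matched outer row */

    double inner_run_cost = 50.0;         /* first scan of the inner rel */
    double inner_rescan_run_cost = 10.0;  /* later scans (results may be cached) */

    double run_cost = 0.0;

    /* conservatively charge one whole first scan */
    run_cost += inner_run_cost;

    double outer_matched_rows = rint(outer_rows * outer_match_frac);

    /* expected fraction of the inner rel scanned before the first match,
     * with a 2x fuzz factor: 2/(match_count + 1) */
    double inner_scan_frac = 2.0 / (match_count + 1.0);

    /* additional matched outer rows pay a partial rescan */
    if (outer_matched_rows > 1)
        run_cost += (outer_matched_rows - 1) * inner_rescan_run_cost * inner_scan_frac;

    /* unmatched outer rows scan the whole inner rel at the rescan rate */
    run_cost += (outer_rows - outer_matched_rows) * inner_rescan_run_cost;

    printf("inner_scan_frac = %g, run_cost = %g\n", inner_scan_frac, run_cost);
    return 0;
}
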
@@ -2190,13 +2219,13 @@ cost_subplan(PlannerInfo *root, SubPlan *subplan, Plan *plan)
 
     /*
      * Also account for subplan's startup cost. If the subplan is
-     * uncorrelated or undirect correlated, AND its topmost node is a Sort
-     * or Material node, assume that we'll only need to pay its startup
-     * cost once; otherwise assume we pay the startup cost every time.
+     * uncorrelated or undirect correlated, AND its topmost node is one
+     * that materializes its output, assume that we'll only need to pay
+     * its startup cost once; otherwise assume we pay the startup cost
+     * every time.
      */
     if (subplan->parParam == NIL &&
-        (IsA(plan, Sort) ||
-         IsA(plan, Material)))
+        ExecMaterializesOutput(nodeTag(plan)))
         sp_cost.startup += plan->startup_cost;
     else
         sp_cost.per_tuple += plan->startup_cost;
@@ -2207,6 +2236,81 @@ cost_subplan(PlannerInfo *root, SubPlan *subplan, Plan *plan)
 }
 
+/*
+ * cost_rescan
+ *      Given a finished Path, estimate the costs of rescanning it after
+ *      having done so the first time.  For some Path types a rescan is
+ *      cheaper than an original scan (if no parameters change), and this
+ *      function embodies knowledge about that.  The default is to return
+ *      the same costs stored in the Path.  (Note that the cost estimates
+ *      actually stored in Paths are always for first scans.)
+ *
+ * This function is not currently intended to model effects such as rescans
+ * being cheaper due to disk block caching; what we are concerned with is
+ * plan types wherein the executor caches results explicitly, or doesn't
+ * redo startup calculations, etc.
+ */
+static void
+cost_rescan(PlannerInfo *root, Path *path,
+            Cost *rescan_startup_cost,  /* output parameters */
+            Cost *rescan_total_cost)
+{
+    switch (path->pathtype)
+    {
+        case T_FunctionScan:
+            /*
+             * Currently, nodeFunctionscan.c always executes the function
+             * to completion before returning any rows, and caches the
+             * results in a tuplestore.  So the function eval cost is
+             * all startup cost and isn't paid over again on rescans.
+             * However, all run costs will be paid over again.
+             */
+            *rescan_startup_cost = 0;
+            *rescan_total_cost = path->total_cost - path->startup_cost;
+            break;
+        case T_HashJoin:
+            /*
+             * Assume that all of the startup cost represents hash table
+             * building, which we won't have to do over.
+             */
+            *rescan_startup_cost = 0;
+            *rescan_total_cost = path->total_cost - path->startup_cost;
+            break;
+        case T_Material:
+        case T_CteScan:
+        case T_WorkTableScan:
+        case T_Sort:
+            {
+                /*
+                 * These plan types materialize their final result in a
+                 * tuplestore or tuplesort object. So the rescan cost is only
+                 * cpu_tuple_cost per tuple, unless the result is large enough
+                 * to spill to disk.
+                 */
+                Cost        run_cost = cpu_tuple_cost * path->parent->rows;
+                double      nbytes = relation_byte_size(path->parent->rows,
+                                                        path->parent->width);
+                long        work_mem_bytes = work_mem * 1024L;
+
+                if (nbytes > work_mem_bytes)
+                {
+                    /* It will spill, so account for re-read cost */
+                    double      npages = ceil(nbytes / BLCKSZ);
+
+                    run_cost += seq_page_cost * npages;
+                }
+                *rescan_startup_cost = 0;
+                *rescan_total_cost = run_cost;
+            }
+            break;
+        default:
+            *rescan_startup_cost = path->startup_cost;
+            *rescan_total_cost = path->total_cost;
+            break;
+    }
+}
+
 /*
  * cost_qual_eval
  *      Estimate the CPU costs of evaluating a WHERE clause.
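
The materialized-output branch of the new cost_rescan (Material, CteScan, WorkTableScan, Sort) charges no startup cost at all, cpu_tuple_cost per stored row, and a sequential re-read of the spilled pages when the stored result exceeds work_mem. A standalone sketch with illustrative numbers and a simplified byte-size estimate:

/* Standalone sketch of the materialized-output branch of cost_rescan;
 * row count, width and work_mem are illustrative stand-ins. */
#include <math.h>
#include <stdio.h>

#define BLCKSZ 8192                       /* assumed default block size */

int
main(void)
{
    double cpu_tuple_cost = 0.01;         /* default GUC values */
    double seq_page_cost = 1.0;
    long   work_mem_bytes = 1024 * 1024L; /* assume work_mem = 1MB */

    double rows = 200000.0;               /* stored result size */
    int    width = 40;
    double nbytes = rows * width;         /* crude stand-in for relation_byte_size() */

    /* re-reading the cached result costs cpu_tuple_cost per tuple;
     * the original startup cost is not paid again */
    double rescan_startup_cost = 0.0;
    double rescan_total_cost = cpu_tuple_cost * rows;

    if (nbytes > work_mem_bytes)
        rescan_total_cost += seq_page_cost * ceil(nbytes / BLCKSZ);

    printf("rescan startup = %g, rescan total = %g\n",
           rescan_startup_cost, rescan_total_cost);
    return 0;
}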