Teach CLUSTER to use seqscan-and-sort when it's faster than indexscan.

... or at least, when the planner's cost estimates say it will be faster. Leonardo Francalanci, reviewed by Itagaki Takahiro and Tom Lane
2025-07-07 00:36:50 +03:00 · 2010-10-07 20:00:28 -04:00
parent 694c56af2b
commit 3ba11d3df2
14 changed files with 716 additions and 141 deletions
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@ -1071,33 +1071,37 @@ cost_recursive_union(Plan *runion, Plan *nrterm, Plan *rterm)
 *	  Determines and returns the cost of sorting a relation, including
 *	  the cost of reading the input data.
 *
- * If the total volume of data to sort is less than work_mem, we will do
+ * If the total volume of data to sort is less than sort_mem, we will do
 * an in-memory sort, which requires no I/O and about t*log2(t) tuple
 * comparisons for t tuples.
 *
- * If the total volume exceeds work_mem, we switch to a tape-style merge
+ * If the total volume exceeds sort_mem, we switch to a tape-style merge
 * algorithm.  There will still be about t*log2(t) tuple comparisons in
 * total, but we will also need to write and read each tuple once per
 * merge pass.	We expect about ceil(logM(r)) merge passes where r is the
 * number of initial runs formed and M is the merge order used by tuplesort.c.
- * Since the average initial run should be about twice work_mem, we have
- *		disk traffic = 2 * relsize * ceil(logM(p / (2*work_mem)))
+ * Since the average initial run should be about twice sort_mem, we have
+ *		disk traffic = 2 * relsize * ceil(logM(p / (2*sort_mem)))
 *		cpu = comparison_cost * t * log2(t)
 *
 * If the sort is bounded (i.e., only the first k result tuples are needed)
- * and k tuples can fit into work_mem, we use a heap method that keeps only
+ * and k tuples can fit into sort_mem, we use a heap method that keeps only
 * k tuples in the heap; this will require about t*log2(k) tuple comparisons.
 *
 * The disk traffic is assumed to be 3/4ths sequential and 1/4th random
 * accesses (XXX can't we refine that guess?)
 *
- * We charge two operator evals per tuple comparison, which should be in
- * the right ballpark in most cases.
+ * By default, we charge two operator evals per tuple comparison, which should
+ * be in the right ballpark in most cases.  The caller can tweak this by
+ * specifying nonzero comparison_cost; typically that's used for any extra
+ * work that has to be done to prepare the inputs to the comparison operators.
 *
 * 'pathkeys' is a list of sort keys
 * 'input_cost' is the total cost for reading the input data
 * 'tuples' is the number of tuples in the relation
 * 'width' is the average tuple width in bytes
+ * 'comparison_cost' is the extra cost per comparison, if any
+ * 'sort_mem' is the number of kilobytes of work memory allowed for the sort
 * 'limit_tuples' is the bound on the number of output tuples; -1 if no bound
 *
 * NOTE: some callers currently pass NIL for pathkeys because they
@ -1110,6 +1114,7 @@ cost_recursive_union(Plan *runion, Plan *nrterm, Plan *rterm)
 void
 cost_sort(Path *path, PlannerInfo *root,
 		  List *pathkeys, Cost input_cost, double tuples, int width,
+		  Cost comparison_cost, int sort_mem,
 		  double limit_tuples)
 {
 	Cost		startup_cost = input_cost;
@ -1117,7 +1122,7 @@ cost_sort(Path *path, PlannerInfo *root,
 	double		input_bytes = relation_byte_size(tuples, width);
 	double		output_bytes;
 	double		output_tuples;
-	long		work_mem_bytes = work_mem * 1024L;
+	long		sort_mem_bytes = sort_mem * 1024L;

 	if (!enable_sort)
 		startup_cost += disable_cost;
@ -1129,6 +1134,9 @@ cost_sort(Path *path, PlannerInfo *root,
 	if (tuples < 2.0)
 		tuples = 2.0;

+	/* Include the default cost-per-comparison */
+	comparison_cost += 2.0 * cpu_operator_cost;
+
 	/* Do we have a useful LIMIT? */
 	if (limit_tuples > 0 && limit_tuples < tuples)
 	{
@ -1141,24 +1149,23 @@ cost_sort(Path *path, PlannerInfo *root,
 		output_bytes = input_bytes;
 	}

-	if (output_bytes > work_mem_bytes)
+	if (output_bytes > sort_mem_bytes)
 	{
 		/*
 		 * We'll have to use a disk-based sort of all the tuples
 		 */
 		double		npages = ceil(input_bytes / BLCKSZ);
-		double		nruns = (input_bytes / work_mem_bytes) * 0.5;
-		double		mergeorder = tuplesort_merge_order(work_mem_bytes);
+		double		nruns = (input_bytes / sort_mem_bytes) * 0.5;
+		double		mergeorder = tuplesort_merge_order(sort_mem_bytes);
 		double		log_runs;
 		double		npageaccesses;

 		/*
 		 * CPU costs
 		 *
-		 * Assume about two operator evals per tuple comparison and N log2 N
-		 * comparisons
+		 * Assume about N log2 N comparisons
 		 */
-		startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples);
+		startup_cost += comparison_cost * tuples * LOG2(tuples);

 		/* Disk costs */

@ -1172,7 +1179,7 @@ cost_sort(Path *path, PlannerInfo *root,
 		startup_cost += npageaccesses *
 			(seq_page_cost * 0.75 + random_page_cost * 0.25);
 	}
-	else if (tuples > 2 * output_tuples || input_bytes > work_mem_bytes)
+	else if (tuples > 2 * output_tuples || input_bytes > sort_mem_bytes)
 	{
 		/*
 		 * We'll use a bounded heap-sort keeping just K tuples in memory, for
@ -1180,12 +1187,12 @@ cost_sort(Path *path, PlannerInfo *root,
 		 * factor is a bit higher than for quicksort.  Tweak it so that the
 		 * cost curve is continuous at the crossover point.
 		 */
-		startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(2.0 * output_tuples);
+		startup_cost += comparison_cost * tuples * LOG2(2.0 * output_tuples);
 	}
 	else
 	{
 		/* We'll use plain quicksort on all the input tuples */
-		startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples);
+		startup_cost += comparison_cost * tuples * LOG2(tuples);
 	}

 	/*
@ -1786,6 +1793,8 @@ cost_mergejoin(MergePath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
 				  outer_path->total_cost,
 				  outer_path_rows,
 				  outer_path->parent->width,
+				  0.0,
+				  work_mem,
 				  -1.0);
 		startup_cost += sort_path.startup_cost;
 		startup_cost += (sort_path.total_cost - sort_path.startup_cost)
@ -1810,6 +1819,8 @@ cost_mergejoin(MergePath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
 				  inner_path->total_cost,
 				  inner_path_rows,
 				  inner_path->parent->width,
+				  0.0,
+				  work_mem,
 				  -1.0);
 		startup_cost += sort_path.startup_cost;
 		startup_cost += (sort_path.total_cost - sort_path.startup_cost)
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@ -20,6 +20,7 @@
 #include <math.h>

 #include "access/skey.h"
+#include "miscadmin.h"
 #include "nodes/makefuncs.h"
 #include "nodes/nodeFuncs.h"
 #include "optimizer/clauses.h"
@ -3041,6 +3042,8 @@ make_sort(PlannerInfo *root, Plan *lefttree, int numCols,
 			  lefttree->total_cost,
 			  lefttree->plan_rows,
 			  lefttree->plan_width,
+			  0.0,
+			  work_mem,
 			  limit_tuples);
 	plan->startup_cost = sort_path.startup_cost;
 	plan->total_cost = sort_path.total_cost;
--- a/src/backend/optimizer/plan/planmain.c
+++ b/src/backend/optimizer/plan/planmain.c
@ -20,6 +20,7 @@
 */
 #include "postgres.h"

+#include "miscadmin.h"
 #include "optimizer/cost.h"
 #include "optimizer/pathnode.h"
 #include "optimizer/paths.h"
@ -415,7 +416,7 @@ query_planner(PlannerInfo *root, List *tlist,
 			cost_sort(&sort_path, root, root->query_pathkeys,
 					  cheapestpath->total_cost,
 					  final_rel->rows, final_rel->width,
-					  limit_tuples);
+					  0.0, work_mem, limit_tuples);
 		}

 		if (compare_fractional_path_costs(sortedpath, &sort_path,
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@ -26,6 +26,7 @@
 #include "optimizer/cost.h"
 #include "optimizer/pathnode.h"
 #include "optimizer/paths.h"
+#include "optimizer/plancat.h"
 #include "optimizer/planmain.h"
 #include "optimizer/planner.h"
 #include "optimizer/prep.h"
@ -2276,7 +2277,8 @@ choose_hashed_grouping(PlannerInfo *root,
 	/* Result of hashed agg is always unsorted */
 	if (target_pathkeys)
 		cost_sort(&hashed_p, root, target_pathkeys, hashed_p.total_cost,
-				  dNumGroups, path_width, limit_tuples);
+				  dNumGroups, path_width,
+				  0.0, work_mem, limit_tuples);

 	if (sorted_path)
 	{
@ -2293,7 +2295,8 @@ choose_hashed_grouping(PlannerInfo *root,
 	if (!pathkeys_contained_in(root->group_pathkeys, current_pathkeys))
 	{
 		cost_sort(&sorted_p, root, root->group_pathkeys, sorted_p.total_cost,
-				  path_rows, path_width, -1.0);
+				  path_rows, path_width,
+				  0.0, work_mem, -1.0);
 		current_pathkeys = root->group_pathkeys;
 	}

@ -2310,7 +2313,8 @@ choose_hashed_grouping(PlannerInfo *root,
 	if (target_pathkeys &&
 		!pathkeys_contained_in(target_pathkeys, current_pathkeys))
 		cost_sort(&sorted_p, root, target_pathkeys, sorted_p.total_cost,
-				  dNumGroups, path_width, limit_tuples);
+				  dNumGroups, path_width,
+				  0.0, work_mem, limit_tuples);

 	/*
 	 * Now make the decision using the top-level tuple fraction.  First we
@ -2427,7 +2431,8 @@ choose_hashed_distinct(PlannerInfo *root,
 	 */
 	if (parse->sortClause)
 		cost_sort(&hashed_p, root, root->sort_pathkeys, hashed_p.total_cost,
-				  dNumDistinctRows, path_width, limit_tuples);
+				  dNumDistinctRows, path_width,
+				  0.0, work_mem, limit_tuples);

 	/*
 	 * Now for the GROUP case.	See comments in grouping_planner about the
@ -2450,7 +2455,8 @@ choose_hashed_distinct(PlannerInfo *root,
 		else
 			current_pathkeys = root->sort_pathkeys;
 		cost_sort(&sorted_p, root, current_pathkeys, sorted_p.total_cost,
-				  path_rows, path_width, -1.0);
+				  path_rows, path_width,
+				  0.0, work_mem, -1.0);
 	}
 	cost_group(&sorted_p, root, numDistinctCols, dNumDistinctRows,
 			   sorted_p.startup_cost, sorted_p.total_cost,
@ -2458,7 +2464,8 @@ choose_hashed_distinct(PlannerInfo *root,
 	if (parse->sortClause &&
 		!pathkeys_contained_in(root->sort_pathkeys, current_pathkeys))
 		cost_sort(&sorted_p, root, root->sort_pathkeys, sorted_p.total_cost,
-				  dNumDistinctRows, path_width, limit_tuples);
+				  dNumDistinctRows, path_width,
+				  0.0, work_mem, limit_tuples);

 	/*
 	 * Now make the decision using the top-level tuple fraction.  First we
@ -2997,3 +3004,107 @@ expression_planner(Expr *expr)

 	return (Expr *) result;
 }
+
+
+/*
+ * plan_cluster_use_sort
+ *		Use the planner to decide how CLUSTER should implement sorting
+ *
+ * tableOid is the OID of a table to be clustered on its index indexOid
+ * (which is already known to be a btree index).  Decide whether it's
+ * cheaper to do an indexscan or a seqscan-plus-sort to execute the CLUSTER.
+ * Return TRUE to use sorting, FALSE to use an indexscan.
+ *
+ * The sort is costed using maintenance_work_mem (not work_mem), and the
+ * per-comparison charge includes the cost of evaluating any index
+ * expressions.
+ *
+ * If the planner's index list for the table has no entry for indexOid, we
+ * return TRUE (use sorting) rather than raising an error: without an
+ * IndexOptInfo there is nothing to cost an indexscan with, while the
+ * seqscan-plus-sort strategy does not need one.
+ *
+ * Note: caller had better already hold some type of lock on the table.
+ */
+bool
+plan_cluster_use_sort(Oid tableOid, Oid indexOid)
+{
+	PlannerInfo *root;
+	Query	   *query;
+	PlannerGlobal *glob;
+	RangeTblEntry *rte;
+	RelOptInfo *rel;
+	IndexOptInfo *indexInfo;
+	QualCost	indexExprCost;
+	Cost		comparisonCost;
+	Path	   *seqScanPath;
+	Path		seqScanAndSortPath;
+	IndexPath  *indexScanPath;
+	ListCell   *lc;
+
+	/* Set up mostly-dummy planner state */
+	query = makeNode(Query);
+	query->commandType = CMD_SELECT;
+
+	glob = makeNode(PlannerGlobal);
+
+	root = makeNode(PlannerInfo);
+	root->parse = query;
+	root->glob = glob;
+	root->query_level = 1;
+	root->planner_cxt = CurrentMemoryContext;
+	root->wt_param_id = -1;
+
+	/* Build a minimal RTE for the rel */
+	rte = makeNode(RangeTblEntry);
+	rte->rtekind = RTE_RELATION;
+	rte->relid = tableOid;
+	rte->inh = false;
+	rte->inFromCl = true;
+	query->rtable = list_make1(rte);
+
+	/* ... and insert it into PlannerInfo (slot 0 is unused, the rel is #1) */
+	root->simple_rel_array_size = 2;
+	root->simple_rel_array = (RelOptInfo **)
+		palloc0(root->simple_rel_array_size * sizeof(RelOptInfo *));
+	root->simple_rte_array = (RangeTblEntry **)
+		palloc0(root->simple_rel_array_size * sizeof(RangeTblEntry *));
+	root->simple_rte_array[1] = rte;
+
+	/* Build RelOptInfo */
+	rel = build_simple_rel(root, 1, RELOPT_BASEREL);
+
+	/*
+	 * Rather than doing all the pushups that would be needed to use
+	 * set_baserel_size_estimates, just do a quick hack for rows and width.
+	 * There are no quals, so every tuple of the table is an output row.
+	 */
+	rel->rows = rel->tuples;
+	rel->width = get_relation_data_width(tableOid);
+
+	root->total_table_pages = rel->pages;
+
+	/* Locate IndexOptInfo for the target index */
+	indexInfo = NULL;
+	foreach(lc, rel->indexlist)
+	{
+		indexInfo = (IndexOptInfo *) lfirst(lc);
+		if (indexInfo->indexoid == indexOid)
+			break;
+	}
+
+	/*
+	 * The planner's index list need not contain an entry for the desired
+	 * index (NOTE(review): presumably indexes the planner deems unusable
+	 * for queries are omitted, e.g. per pg_index.indcheckxmin or when
+	 * ignore_system_indexes is set --- confirm against get_relation_info).
+	 * That is not a reason to fail the whole CLUSTER: just tell the caller
+	 * to use seqscan-and-sort, which does not rely on the index contents.
+	 * Note indexInfo must not be used past this point unless lc is set.
+	 */
+	if (lc == NULL)				/* not in the list? */
+		return true;			/* use sort */
+
+	/*
+	 * Determine eval cost of the index expressions, if any.  We need to
+	 * charge twice that amount for each tuple comparison that happens
+	 * during the sort, since tuplesort.c will have to re-evaluate the
+	 * index expressions each time.  (XXX that's pretty inefficient...)
+	 */
+	cost_qual_eval(&indexExprCost, indexInfo->indexprs, root);
+	comparisonCost = 2.0 * (indexExprCost.startup + indexExprCost.per_tuple);
+
+	/* Estimate the cost of seq scan + sort */
+	seqScanPath = create_seqscan_path(root, rel);
+	cost_sort(&seqScanAndSortPath, root, NIL,
+			  seqScanPath->total_cost, rel->tuples, rel->width,
+			  comparisonCost, maintenance_work_mem, -1.0);
+
+	/* Estimate the cost of index scan */
+	indexScanPath = create_index_path(root, indexInfo,
+									  NIL, NIL,
+									  ForwardScanDirection, NULL);
+
+	return (seqScanAndSortPath.total_cost < indexScanPath->path.total_cost);
+}
--- a/src/backend/optimizer/prep/prepunion.c
+++ b/src/backend/optimizer/prep/prepunion.c
@ -805,7 +805,8 @@ choose_hashed_setop(PlannerInfo *root, List *groupClauses,
 	sorted_p.total_cost = input_plan->total_cost;
 	/* XXX cost_sort doesn't actually look at pathkeys, so just pass NIL */
 	cost_sort(&sorted_p, root, NIL, sorted_p.total_cost,
-			  input_plan->plan_rows, input_plan->plan_width, -1.0);
+			  input_plan->plan_rows, input_plan->plan_width,
+			  0.0, work_mem, -1.0);
 	cost_group(&sorted_p, root, numGroupCols, dNumGroups,
 			   sorted_p.startup_cost, sorted_p.total_cost,
 			   input_plan->plan_rows);
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@ -969,6 +969,8 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath,
 				  subpath->total_cost,
 				  rel->rows,
 				  rel->width,
+				  0.0,
+				  work_mem,
 				  -1.0);

 		/*
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@ -46,6 +46,7 @@ int			constraint_exclusion = CONSTRAINT_EXCLUSION_PARTITION;
 get_relation_info_hook_type get_relation_info_hook = NULL;


+static int32 get_rel_data_width(Relation rel, int32 *attr_widths);
 static List *get_relation_constraints(PlannerInfo *root,
 						 Oid relationObjectId, RelOptInfo *rel,
 						 bool include_notnull);
@ -406,28 +407,9 @@ estimate_rel_size(Relation rel, int32 *attr_widths,
 				 * platform dependencies in the default plans which are kind
 				 * of a headache for regression testing.
 				 */
-				int32		tuple_width = 0;
-				int			i;
+				int32		tuple_width;

-				for (i = 1; i <= RelationGetNumberOfAttributes(rel); i++)
-				{
-					Form_pg_attribute att = rel->rd_att->attrs[i - 1];
-					int32		item_width;
-
-					if (att->attisdropped)
-						continue;
-					/* This should match set_rel_width() in costsize.c */
-					item_width = get_attavgwidth(RelationGetRelid(rel), i);
-					if (item_width <= 0)
-					{
-						item_width = get_typavgwidth(att->atttypid,
-													 att->atttypmod);
-						Assert(item_width > 0);
-					}
-					if (attr_widths != NULL)
-						attr_widths[i] = item_width;
-					tuple_width += item_width;
-				}
+				tuple_width = get_rel_data_width(rel, attr_widths);
 				tuple_width += sizeof(HeapTupleHeaderData);
 				tuple_width += sizeof(ItemPointerData);
 				/* note: integer division is intentional here */
@ -449,6 +431,68 @@ estimate_rel_size(Relation rel, int32 *attr_widths,
 }


+/*
+ * get_rel_data_width
+ *
+ * Estimate the average width of (the data part of) the relation's tuples.
+ * If attr_widths isn't NULL, also store per-column width estimates into
+ * that array.
+ *
+ * The result is simply the sum of the per-column estimates; no per-tuple
+ * header overhead is added here.  attr_widths[] is indexed by attribute
+ * number (starting at 1, so element 0 is not written), which means a
+ * non-NULL array must have room for an entry at index
+ * RelationGetNumberOfAttributes(rel).  Entries belonging to dropped
+ * columns are left untouched.
+ *
+ * Currently we ignore dropped columns.  Ideally those should be included
+ * in the result, but we haven't got any way to get info about them; and
+ * since they might be mostly NULLs, treating them as zero-width is not
+ * necessarily the wrong thing anyway.
+ */
+static int32
+get_rel_data_width(Relation rel, int32 *attr_widths)
+{
+	int32		tuple_width = 0;
+	int			i;
+
+	for (i = 1; i <= RelationGetNumberOfAttributes(rel); i++)
+	{
+		Form_pg_attribute att = rel->rd_att->attrs[i - 1];
+		int32		item_width;
+
+		if (att->attisdropped)
+			continue;
+		/*
+		 * This should match set_rel_width() in costsize.c: use the
+		 * per-column width from get_attavgwidth() when it is positive,
+		 * otherwise fall back to get_typavgwidth() for the column's
+		 * type and typmod.
+		 */
+		item_width = get_attavgwidth(RelationGetRelid(rel), i);
+		if (item_width <= 0)
+		{
+			item_width = get_typavgwidth(att->atttypid, att->atttypmod);
+			Assert(item_width > 0);
+		}
+		if (attr_widths != NULL)
+			attr_widths[i] = item_width;
+		tuple_width += item_width;
+	}
+
+	return tuple_width;
+}
+
+
+/*
+ * get_relation_data_width
+ *
+ * External API for get_rel_data_width
+ *
+ * Opens the relation identified by relid, returns the estimated average
+ * width of the data part of its tuples, and closes the relation again.
+ * No lock is taken or released here (NoLock is passed to both heap_open
+ * and heap_close), so the caller must already hold a suitable lock on the
+ * relation.  Per-column widths are not reported through this entry point.
+ */
+int32
+get_relation_data_width(Oid relid)
+{
+	int32		result;
+	Relation	relation;
+
+	/* Caller is assumed to have locked the relation already; take no lock */
+	relation = heap_open(relid, NoLock);
+
+	result = get_rel_data_width(relation, NULL);
+
+	heap_close(relation, NoLock);
+
+	return result;
+}
+
+
 /*
 * get_relation_constraints
 *