mirror of
https://github.com/postgres/postgres.git
synced 2025-07-05 07:21:24 +03:00
Teach CLUSTER to use seqscan-and-sort when it's faster than indexscan.
... or at least, when the planner's cost estimates say it will be faster. Leonardo Francalanci, reviewed by Itagaki Takahiro and Tom Lane
This commit is contained in:
@ -1071,33 +1071,37 @@ cost_recursive_union(Plan *runion, Plan *nrterm, Plan *rterm)
|
||||
* Determines and returns the cost of sorting a relation, including
|
||||
* the cost of reading the input data.
|
||||
*
|
||||
* If the total volume of data to sort is less than work_mem, we will do
|
||||
* If the total volume of data to sort is less than sort_mem, we will do
|
||||
* an in-memory sort, which requires no I/O and about t*log2(t) tuple
|
||||
* comparisons for t tuples.
|
||||
*
|
||||
* If the total volume exceeds work_mem, we switch to a tape-style merge
|
||||
* If the total volume exceeds sort_mem, we switch to a tape-style merge
|
||||
* algorithm. There will still be about t*log2(t) tuple comparisons in
|
||||
* total, but we will also need to write and read each tuple once per
|
||||
* merge pass. We expect about ceil(logM(r)) merge passes where r is the
|
||||
* number of initial runs formed and M is the merge order used by tuplesort.c.
|
||||
* Since the average initial run should be about twice work_mem, we have
|
||||
* disk traffic = 2 * relsize * ceil(logM(p / (2*work_mem)))
|
||||
* Since the average initial run should be about twice sort_mem, we have
|
||||
* disk traffic = 2 * relsize * ceil(logM(p / (2*sort_mem)))
|
||||
* cpu = comparison_cost * t * log2(t)
|
||||
*
|
||||
* If the sort is bounded (i.e., only the first k result tuples are needed)
|
||||
* and k tuples can fit into work_mem, we use a heap method that keeps only
|
||||
* and k tuples can fit into sort_mem, we use a heap method that keeps only
|
||||
* k tuples in the heap; this will require about t*log2(k) tuple comparisons.
|
||||
*
|
||||
* The disk traffic is assumed to be 3/4ths sequential and 1/4th random
|
||||
* accesses (XXX can't we refine that guess?)
|
||||
*
|
||||
* We charge two operator evals per tuple comparison, which should be in
|
||||
* the right ballpark in most cases.
|
||||
* By default, we charge two operator evals per tuple comparison, which should
|
||||
* be in the right ballpark in most cases. The caller can tweak this by
|
||||
* specifying nonzero comparison_cost; typically that's used for any extra
|
||||
* work that has to be done to prepare the inputs to the comparison operators.
|
||||
*
|
||||
* 'pathkeys' is a list of sort keys
|
||||
* 'input_cost' is the total cost for reading the input data
|
||||
* 'tuples' is the number of tuples in the relation
|
||||
* 'width' is the average tuple width in bytes
|
||||
* 'comparison_cost' is the extra cost per comparison, if any
|
||||
* 'sort_mem' is the number of kilobytes of work memory allowed for the sort
|
||||
* 'limit_tuples' is the bound on the number of output tuples; -1 if no bound
|
||||
*
|
||||
* NOTE: some callers currently pass NIL for pathkeys because they
|
||||
@ -1110,6 +1114,7 @@ cost_recursive_union(Plan *runion, Plan *nrterm, Plan *rterm)
|
||||
void
|
||||
cost_sort(Path *path, PlannerInfo *root,
|
||||
List *pathkeys, Cost input_cost, double tuples, int width,
|
||||
Cost comparison_cost, int sort_mem,
|
||||
double limit_tuples)
|
||||
{
|
||||
Cost startup_cost = input_cost;
|
||||
@ -1117,7 +1122,7 @@ cost_sort(Path *path, PlannerInfo *root,
|
||||
double input_bytes = relation_byte_size(tuples, width);
|
||||
double output_bytes;
|
||||
double output_tuples;
|
||||
long work_mem_bytes = work_mem * 1024L;
|
||||
long sort_mem_bytes = sort_mem * 1024L;
|
||||
|
||||
if (!enable_sort)
|
||||
startup_cost += disable_cost;
|
||||
@ -1129,6 +1134,9 @@ cost_sort(Path *path, PlannerInfo *root,
|
||||
if (tuples < 2.0)
|
||||
tuples = 2.0;
|
||||
|
||||
/* Include the default cost-per-comparison */
|
||||
comparison_cost += 2.0 * cpu_operator_cost;
|
||||
|
||||
/* Do we have a useful LIMIT? */
|
||||
if (limit_tuples > 0 && limit_tuples < tuples)
|
||||
{
|
||||
@ -1141,24 +1149,23 @@ cost_sort(Path *path, PlannerInfo *root,
|
||||
output_bytes = input_bytes;
|
||||
}
|
||||
|
||||
if (output_bytes > work_mem_bytes)
|
||||
if (output_bytes > sort_mem_bytes)
|
||||
{
|
||||
/*
|
||||
* We'll have to use a disk-based sort of all the tuples
|
||||
*/
|
||||
double npages = ceil(input_bytes / BLCKSZ);
|
||||
double nruns = (input_bytes / work_mem_bytes) * 0.5;
|
||||
double mergeorder = tuplesort_merge_order(work_mem_bytes);
|
||||
double nruns = (input_bytes / sort_mem_bytes) * 0.5;
|
||||
double mergeorder = tuplesort_merge_order(sort_mem_bytes);
|
||||
double log_runs;
|
||||
double npageaccesses;
|
||||
|
||||
/*
|
||||
* CPU costs
|
||||
*
|
||||
* Assume about two operator evals per tuple comparison and N log2 N
|
||||
* comparisons
|
||||
* Assume about N log2 N comparisons
|
||||
*/
|
||||
startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples);
|
||||
startup_cost += comparison_cost * tuples * LOG2(tuples);
|
||||
|
||||
/* Disk costs */
|
||||
|
||||
@ -1172,7 +1179,7 @@ cost_sort(Path *path, PlannerInfo *root,
|
||||
startup_cost += npageaccesses *
|
||||
(seq_page_cost * 0.75 + random_page_cost * 0.25);
|
||||
}
|
||||
else if (tuples > 2 * output_tuples || input_bytes > work_mem_bytes)
|
||||
else if (tuples > 2 * output_tuples || input_bytes > sort_mem_bytes)
|
||||
{
|
||||
/*
|
||||
* We'll use a bounded heap-sort keeping just K tuples in memory, for
|
||||
@ -1180,12 +1187,12 @@ cost_sort(Path *path, PlannerInfo *root,
|
||||
* factor is a bit higher than for quicksort. Tweak it so that the
|
||||
* cost curve is continuous at the crossover point.
|
||||
*/
|
||||
startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(2.0 * output_tuples);
|
||||
startup_cost += comparison_cost * tuples * LOG2(2.0 * output_tuples);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* We'll use plain quicksort on all the input tuples */
|
||||
startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples);
|
||||
startup_cost += comparison_cost * tuples * LOG2(tuples);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1786,6 +1793,8 @@ cost_mergejoin(MergePath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
|
||||
outer_path->total_cost,
|
||||
outer_path_rows,
|
||||
outer_path->parent->width,
|
||||
0.0,
|
||||
work_mem,
|
||||
-1.0);
|
||||
startup_cost += sort_path.startup_cost;
|
||||
startup_cost += (sort_path.total_cost - sort_path.startup_cost)
|
||||
@ -1810,6 +1819,8 @@ cost_mergejoin(MergePath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
|
||||
inner_path->total_cost,
|
||||
inner_path_rows,
|
||||
inner_path->parent->width,
|
||||
0.0,
|
||||
work_mem,
|
||||
-1.0);
|
||||
startup_cost += sort_path.startup_cost;
|
||||
startup_cost += (sort_path.total_cost - sort_path.startup_cost)
|
||||
|
@ -20,6 +20,7 @@
|
||||
#include <math.h>
|
||||
|
||||
#include "access/skey.h"
|
||||
#include "miscadmin.h"
|
||||
#include "nodes/makefuncs.h"
|
||||
#include "nodes/nodeFuncs.h"
|
||||
#include "optimizer/clauses.h"
|
||||
@ -3041,6 +3042,8 @@ make_sort(PlannerInfo *root, Plan *lefttree, int numCols,
|
||||
lefttree->total_cost,
|
||||
lefttree->plan_rows,
|
||||
lefttree->plan_width,
|
||||
0.0,
|
||||
work_mem,
|
||||
limit_tuples);
|
||||
plan->startup_cost = sort_path.startup_cost;
|
||||
plan->total_cost = sort_path.total_cost;
|
||||
|
@ -20,6 +20,7 @@
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "miscadmin.h"
|
||||
#include "optimizer/cost.h"
|
||||
#include "optimizer/pathnode.h"
|
||||
#include "optimizer/paths.h"
|
||||
@ -415,7 +416,7 @@ query_planner(PlannerInfo *root, List *tlist,
|
||||
cost_sort(&sort_path, root, root->query_pathkeys,
|
||||
cheapestpath->total_cost,
|
||||
final_rel->rows, final_rel->width,
|
||||
limit_tuples);
|
||||
0.0, work_mem, limit_tuples);
|
||||
}
|
||||
|
||||
if (compare_fractional_path_costs(sortedpath, &sort_path,
|
||||
|
@ -26,6 +26,7 @@
|
||||
#include "optimizer/cost.h"
|
||||
#include "optimizer/pathnode.h"
|
||||
#include "optimizer/paths.h"
|
||||
#include "optimizer/plancat.h"
|
||||
#include "optimizer/planmain.h"
|
||||
#include "optimizer/planner.h"
|
||||
#include "optimizer/prep.h"
|
||||
@ -2276,7 +2277,8 @@ choose_hashed_grouping(PlannerInfo *root,
|
||||
/* Result of hashed agg is always unsorted */
|
||||
if (target_pathkeys)
|
||||
cost_sort(&hashed_p, root, target_pathkeys, hashed_p.total_cost,
|
||||
dNumGroups, path_width, limit_tuples);
|
||||
dNumGroups, path_width,
|
||||
0.0, work_mem, limit_tuples);
|
||||
|
||||
if (sorted_path)
|
||||
{
|
||||
@ -2293,7 +2295,8 @@ choose_hashed_grouping(PlannerInfo *root,
|
||||
if (!pathkeys_contained_in(root->group_pathkeys, current_pathkeys))
|
||||
{
|
||||
cost_sort(&sorted_p, root, root->group_pathkeys, sorted_p.total_cost,
|
||||
path_rows, path_width, -1.0);
|
||||
path_rows, path_width,
|
||||
0.0, work_mem, -1.0);
|
||||
current_pathkeys = root->group_pathkeys;
|
||||
}
|
||||
|
||||
@ -2310,7 +2313,8 @@ choose_hashed_grouping(PlannerInfo *root,
|
||||
if (target_pathkeys &&
|
||||
!pathkeys_contained_in(target_pathkeys, current_pathkeys))
|
||||
cost_sort(&sorted_p, root, target_pathkeys, sorted_p.total_cost,
|
||||
dNumGroups, path_width, limit_tuples);
|
||||
dNumGroups, path_width,
|
||||
0.0, work_mem, limit_tuples);
|
||||
|
||||
/*
|
||||
* Now make the decision using the top-level tuple fraction. First we
|
||||
@ -2427,7 +2431,8 @@ choose_hashed_distinct(PlannerInfo *root,
|
||||
*/
|
||||
if (parse->sortClause)
|
||||
cost_sort(&hashed_p, root, root->sort_pathkeys, hashed_p.total_cost,
|
||||
dNumDistinctRows, path_width, limit_tuples);
|
||||
dNumDistinctRows, path_width,
|
||||
0.0, work_mem, limit_tuples);
|
||||
|
||||
/*
|
||||
* Now for the GROUP case. See comments in grouping_planner about the
|
||||
@ -2450,7 +2455,8 @@ choose_hashed_distinct(PlannerInfo *root,
|
||||
else
|
||||
current_pathkeys = root->sort_pathkeys;
|
||||
cost_sort(&sorted_p, root, current_pathkeys, sorted_p.total_cost,
|
||||
path_rows, path_width, -1.0);
|
||||
path_rows, path_width,
|
||||
0.0, work_mem, -1.0);
|
||||
}
|
||||
cost_group(&sorted_p, root, numDistinctCols, dNumDistinctRows,
|
||||
sorted_p.startup_cost, sorted_p.total_cost,
|
||||
@ -2458,7 +2464,8 @@ choose_hashed_distinct(PlannerInfo *root,
|
||||
if (parse->sortClause &&
|
||||
!pathkeys_contained_in(root->sort_pathkeys, current_pathkeys))
|
||||
cost_sort(&sorted_p, root, root->sort_pathkeys, sorted_p.total_cost,
|
||||
dNumDistinctRows, path_width, limit_tuples);
|
||||
dNumDistinctRows, path_width,
|
||||
0.0, work_mem, limit_tuples);
|
||||
|
||||
/*
|
||||
* Now make the decision using the top-level tuple fraction. First we
|
||||
@ -2997,3 +3004,107 @@ expression_planner(Expr *expr)
|
||||
|
||||
return (Expr *) result;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* plan_cluster_use_sort
|
||||
* Use the planner to decide how CLUSTER should implement sorting
|
||||
*
|
||||
* tableOid is the OID of a table to be clustered on its index indexOid
|
||||
* (which is already known to be a btree index). Decide whether it's
|
||||
* cheaper to do an indexscan or a seqscan-plus-sort to execute the CLUSTER.
|
||||
* Return TRUE to use sorting, FALSE to use an indexscan.
|
||||
*
|
||||
* Note: caller had better already hold some type of lock on the table.
|
||||
*/
|
||||
bool
|
||||
plan_cluster_use_sort(Oid tableOid, Oid indexOid)
|
||||
{
|
||||
PlannerInfo *root;
|
||||
Query *query;
|
||||
PlannerGlobal *glob;
|
||||
RangeTblEntry *rte;
|
||||
RelOptInfo *rel;
|
||||
IndexOptInfo *indexInfo;
|
||||
QualCost indexExprCost;
|
||||
Cost comparisonCost;
|
||||
Path *seqScanPath;
|
||||
Path seqScanAndSortPath;
|
||||
IndexPath *indexScanPath;
|
||||
ListCell *lc;
|
||||
|
||||
/* Set up mostly-dummy planner state */
|
||||
query = makeNode(Query);
|
||||
query->commandType = CMD_SELECT;
|
||||
|
||||
glob = makeNode(PlannerGlobal);
|
||||
|
||||
root = makeNode(PlannerInfo);
|
||||
root->parse = query;
|
||||
root->glob = glob;
|
||||
root->query_level = 1;
|
||||
root->planner_cxt = CurrentMemoryContext;
|
||||
root->wt_param_id = -1;
|
||||
|
||||
/* Build a minimal RTE for the rel */
|
||||
rte = makeNode(RangeTblEntry);
|
||||
rte->rtekind = RTE_RELATION;
|
||||
rte->relid = tableOid;
|
||||
rte->inh = false;
|
||||
rte->inFromCl = true;
|
||||
query->rtable = list_make1(rte);
|
||||
|
||||
/* ... and insert it into PlannerInfo */
|
||||
root->simple_rel_array_size = 2;
|
||||
root->simple_rel_array = (RelOptInfo **)
|
||||
palloc0(root->simple_rel_array_size * sizeof(RelOptInfo *));
|
||||
root->simple_rte_array = (RangeTblEntry **)
|
||||
palloc0(root->simple_rel_array_size * sizeof(RangeTblEntry *));
|
||||
root->simple_rte_array[1] = rte;
|
||||
|
||||
/* Build RelOptInfo */
|
||||
rel = build_simple_rel(root, 1, RELOPT_BASEREL);
|
||||
|
||||
/*
|
||||
* Rather than doing all the pushups that would be needed to use
|
||||
* set_baserel_size_estimates, just do a quick hack for rows and width.
|
||||
*/
|
||||
rel->rows = rel->tuples;
|
||||
rel->width = get_relation_data_width(tableOid);
|
||||
|
||||
root->total_table_pages = rel->pages;
|
||||
|
||||
/* Locate IndexOptInfo for the target index */
|
||||
indexInfo = NULL;
|
||||
foreach(lc, rel->indexlist)
|
||||
{
|
||||
indexInfo = (IndexOptInfo *) lfirst(lc);
|
||||
if (indexInfo->indexoid == indexOid)
|
||||
break;
|
||||
}
|
||||
if (lc == NULL) /* not in the list? */
|
||||
elog(ERROR, "index %u does not belong to table %u",
|
||||
indexOid, tableOid);
|
||||
|
||||
/*
|
||||
* Determine eval cost of the index expressions, if any. We need to
|
||||
* charge twice that amount for each tuple comparison that happens
|
||||
* during the sort, since tuplesort.c will have to re-evaluate the
|
||||
* index expressions each time. (XXX that's pretty inefficient...)
|
||||
*/
|
||||
cost_qual_eval(&indexExprCost, indexInfo->indexprs, root);
|
||||
comparisonCost = 2.0 * (indexExprCost.startup + indexExprCost.per_tuple);
|
||||
|
||||
/* Estimate the cost of seq scan + sort */
|
||||
seqScanPath = create_seqscan_path(root, rel);
|
||||
cost_sort(&seqScanAndSortPath, root, NIL,
|
||||
seqScanPath->total_cost, rel->tuples, rel->width,
|
||||
comparisonCost, maintenance_work_mem, -1.0);
|
||||
|
||||
/* Estimate the cost of index scan */
|
||||
indexScanPath = create_index_path(root, indexInfo,
|
||||
NIL, NIL,
|
||||
ForwardScanDirection, NULL);
|
||||
|
||||
return (seqScanAndSortPath.total_cost < indexScanPath->path.total_cost);
|
||||
}
|
||||
|
@ -805,7 +805,8 @@ choose_hashed_setop(PlannerInfo *root, List *groupClauses,
|
||||
sorted_p.total_cost = input_plan->total_cost;
|
||||
/* XXX cost_sort doesn't actually look at pathkeys, so just pass NIL */
|
||||
cost_sort(&sorted_p, root, NIL, sorted_p.total_cost,
|
||||
input_plan->plan_rows, input_plan->plan_width, -1.0);
|
||||
input_plan->plan_rows, input_plan->plan_width,
|
||||
0.0, work_mem, -1.0);
|
||||
cost_group(&sorted_p, root, numGroupCols, dNumGroups,
|
||||
sorted_p.startup_cost, sorted_p.total_cost,
|
||||
input_plan->plan_rows);
|
||||
|
@ -969,6 +969,8 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath,
|
||||
subpath->total_cost,
|
||||
rel->rows,
|
||||
rel->width,
|
||||
0.0,
|
||||
work_mem,
|
||||
-1.0);
|
||||
|
||||
/*
|
||||
|
@ -46,6 +46,7 @@ int constraint_exclusion = CONSTRAINT_EXCLUSION_PARTITION;
|
||||
get_relation_info_hook_type get_relation_info_hook = NULL;
|
||||
|
||||
|
||||
static int32 get_rel_data_width(Relation rel, int32 *attr_widths);
|
||||
static List *get_relation_constraints(PlannerInfo *root,
|
||||
Oid relationObjectId, RelOptInfo *rel,
|
||||
bool include_notnull);
|
||||
@ -406,28 +407,9 @@ estimate_rel_size(Relation rel, int32 *attr_widths,
|
||||
* platform dependencies in the default plans which are kind
|
||||
* of a headache for regression testing.
|
||||
*/
|
||||
int32 tuple_width = 0;
|
||||
int i;
|
||||
int32 tuple_width;
|
||||
|
||||
for (i = 1; i <= RelationGetNumberOfAttributes(rel); i++)
|
||||
{
|
||||
Form_pg_attribute att = rel->rd_att->attrs[i - 1];
|
||||
int32 item_width;
|
||||
|
||||
if (att->attisdropped)
|
||||
continue;
|
||||
/* This should match set_rel_width() in costsize.c */
|
||||
item_width = get_attavgwidth(RelationGetRelid(rel), i);
|
||||
if (item_width <= 0)
|
||||
{
|
||||
item_width = get_typavgwidth(att->atttypid,
|
||||
att->atttypmod);
|
||||
Assert(item_width > 0);
|
||||
}
|
||||
if (attr_widths != NULL)
|
||||
attr_widths[i] = item_width;
|
||||
tuple_width += item_width;
|
||||
}
|
||||
tuple_width = get_rel_data_width(rel, attr_widths);
|
||||
tuple_width += sizeof(HeapTupleHeaderData);
|
||||
tuple_width += sizeof(ItemPointerData);
|
||||
/* note: integer division is intentional here */
|
||||
@ -449,6 +431,68 @@ estimate_rel_size(Relation rel, int32 *attr_widths,
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* get_rel_data_width
|
||||
*
|
||||
* Estimate the average width of (the data part of) the relation's tuples.
|
||||
* If attr_widths isn't NULL, also store per-column width estimates into
|
||||
* that array.
|
||||
*
|
||||
* Currently we ignore dropped columns. Ideally those should be included
|
||||
* in the result, but we haven't got any way to get info about them; and
|
||||
* since they might be mostly NULLs, treating them as zero-width is not
|
||||
* necessarily the wrong thing anyway.
|
||||
*/
|
||||
static int32
|
||||
get_rel_data_width(Relation rel, int32 *attr_widths)
|
||||
{
|
||||
int32 tuple_width = 0;
|
||||
int i;
|
||||
|
||||
for (i = 1; i <= RelationGetNumberOfAttributes(rel); i++)
|
||||
{
|
||||
Form_pg_attribute att = rel->rd_att->attrs[i - 1];
|
||||
int32 item_width;
|
||||
|
||||
if (att->attisdropped)
|
||||
continue;
|
||||
/* This should match set_rel_width() in costsize.c */
|
||||
item_width = get_attavgwidth(RelationGetRelid(rel), i);
|
||||
if (item_width <= 0)
|
||||
{
|
||||
item_width = get_typavgwidth(att->atttypid, att->atttypmod);
|
||||
Assert(item_width > 0);
|
||||
}
|
||||
if (attr_widths != NULL)
|
||||
attr_widths[i] = item_width;
|
||||
tuple_width += item_width;
|
||||
}
|
||||
|
||||
return tuple_width;
|
||||
}
|
||||
|
||||
/*
|
||||
* get_relation_data_width
|
||||
*
|
||||
* External API for get_rel_data_width
|
||||
*/
|
||||
int32
|
||||
get_relation_data_width(Oid relid)
|
||||
{
|
||||
int32 result;
|
||||
Relation relation;
|
||||
|
||||
/* As above, assume relation is already locked */
|
||||
relation = heap_open(relid, NoLock);
|
||||
|
||||
result = get_rel_data_width(relation, NULL);
|
||||
|
||||
heap_close(relation, NoLock);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* get_relation_constraints
|
||||
*
|
||||
|
Reference in New Issue
Block a user