mirror of
https://github.com/postgres/postgres.git
synced 2025-11-07 19:06:32 +03:00
Add an at-least-marginally-plausible method of estimating the number
of groups produced by GROUP BY. This improves the accuracy of planning estimates for grouped subselects, and is needed to check whether a hashed aggregation plan risks memory overflow.
This commit is contained in:
@@ -10,7 +10,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $Header: /cvsroot/pgsql/src/backend/optimizer/plan/createplan.c,v 1.122 2002/11/15 02:36:53 tgl Exp $
|
||||
* $Header: /cvsroot/pgsql/src/backend/optimizer/plan/createplan.c,v 1.123 2002/11/19 23:21:58 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -1684,7 +1684,8 @@ make_material(List *tlist, Plan *lefttree)
|
||||
|
||||
Agg *
|
||||
make_agg(List *tlist, List *qual, AggStrategy aggstrategy,
|
||||
int ngrp, AttrNumber *grpColIdx, Plan *lefttree)
|
||||
int ngrp, AttrNumber *grpColIdx, long numGroups, int numAggs,
|
||||
Plan *lefttree)
|
||||
{
|
||||
Agg *node = makeNode(Agg);
|
||||
Plan *plan = &node->plan;
|
||||
@@ -1692,6 +1693,7 @@ make_agg(List *tlist, List *qual, AggStrategy aggstrategy,
|
||||
node->aggstrategy = aggstrategy;
|
||||
node->numCols = ngrp;
|
||||
node->grpColIdx = grpColIdx;
|
||||
node->numGroups = numGroups;
|
||||
|
||||
copy_plan_costsize(plan, lefttree);
|
||||
|
||||
@@ -1699,15 +1701,11 @@ make_agg(List *tlist, List *qual, AggStrategy aggstrategy,
|
||||
* Charge one cpu_operator_cost per aggregate function per input
|
||||
* tuple.
|
||||
*/
|
||||
plan->total_cost += cpu_operator_cost * plan->plan_rows *
|
||||
(length(pull_agg_clause((Node *) tlist)) +
|
||||
length(pull_agg_clause((Node *) qual)));
|
||||
plan->total_cost += cpu_operator_cost * plan->plan_rows * numAggs;
|
||||
|
||||
/*
|
||||
* We will produce a single output tuple if not grouping,
|
||||
* and a tuple per group otherwise. For now, estimate the number of
|
||||
* groups as 10% of the number of tuples --- bogus, but how to do
|
||||
* better?
|
||||
* and a tuple per group otherwise.
|
||||
*/
|
||||
if (aggstrategy == AGG_PLAIN)
|
||||
{
|
||||
@@ -1716,10 +1714,7 @@ make_agg(List *tlist, List *qual, AggStrategy aggstrategy,
|
||||
}
|
||||
else
|
||||
{
|
||||
plan->plan_rows *= 0.1;
|
||||
if (plan->plan_rows < 1)
|
||||
plan->plan_rows = 1;
|
||||
node->numGroups = (long) plan->plan_rows;
|
||||
plan->plan_rows = numGroups;
|
||||
}
|
||||
|
||||
plan->state = (EState *) NULL;
|
||||
@@ -1735,6 +1730,7 @@ Group *
|
||||
make_group(List *tlist,
|
||||
int ngrp,
|
||||
AttrNumber *grpColIdx,
|
||||
double numGroups,
|
||||
Plan *lefttree)
|
||||
{
|
||||
Group *node = makeNode(Group);
|
||||
@@ -1748,13 +1744,8 @@ make_group(List *tlist,
|
||||
*/
|
||||
plan->total_cost += cpu_operator_cost * plan->plan_rows * ngrp;
|
||||
|
||||
/*
|
||||
* Estimate the number of groups as 10% of the number of tuples
|
||||
* --- bogus, but how to do better?
|
||||
*/
|
||||
plan->plan_rows *= 0.1;
|
||||
if (plan->plan_rows < 1)
|
||||
plan->plan_rows = 1;
|
||||
/* One output tuple per estimated result group */
|
||||
plan->plan_rows = numGroups;
|
||||
|
||||
plan->state = (EState *) NULL;
|
||||
plan->qual = NULL;
|
||||
@@ -1786,17 +1777,16 @@ make_unique(List *tlist, Plan *lefttree, List *distinctList)
|
||||
|
||||
/*
|
||||
* Charge one cpu_operator_cost per comparison per input tuple. We
|
||||
* assume all columns get compared at most of the tuples.
|
||||
* assume all columns get compared at most of the tuples. (XXX probably
|
||||
* this is an overestimate.)
|
||||
*/
|
||||
plan->total_cost += cpu_operator_cost * plan->plan_rows * numCols;
|
||||
|
||||
/*
|
||||
* As for Group, we make the unsupported assumption that there will be
|
||||
* 10% as many tuples out as in.
|
||||
* plan->plan_rows is left as a copy of the input subplan's plan_rows;
|
||||
* ie, we assume the filter removes nothing. The caller must alter this
|
||||
* if he has a better idea.
|
||||
*/
|
||||
plan->plan_rows *= 0.1;
|
||||
if (plan->plan_rows < 1)
|
||||
plan->plan_rows = 1;
|
||||
|
||||
plan->state = (EState *) NULL;
|
||||
plan->targetlist = tlist;
|
||||
@@ -1850,8 +1840,8 @@ make_setop(SetOpCmd cmd, List *tlist, Plan *lefttree,
|
||||
plan->total_cost += cpu_operator_cost * plan->plan_rows * numCols;
|
||||
|
||||
/*
|
||||
* As for Group, we make the unsupported assumption that there will be
|
||||
* 10% as many tuples out as in.
|
||||
* We make the unsupported assumption that there will be 10% as many
|
||||
* tuples out as in. Any way to do better?
|
||||
*/
|
||||
plan->plan_rows *= 0.1;
|
||||
if (plan->plan_rows < 1)
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $Header: /cvsroot/pgsql/src/backend/optimizer/plan/initsplan.c,v 1.75 2002/09/04 20:31:21 momjian Exp $
|
||||
* $Header: /cvsroot/pgsql/src/backend/optimizer/plan/initsplan.c,v 1.76 2002/11/19 23:21:58 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -784,6 +784,71 @@ process_implied_equality(Query *root, Node *item1, Node *item2,
|
||||
pull_varnos((Node *) clause));
|
||||
}
|
||||
|
||||
/*
|
||||
* vars_known_equal
|
||||
* Detect whether two Vars are known equal due to equijoin clauses.
|
||||
*
|
||||
* This is not completely accurate since we avoid adding redundant restriction
|
||||
* clauses to individual base rels (see qual_is_redundant). However, after
|
||||
* the implied-equality-deduction phase, it is complete for Vars of different
|
||||
* rels; that's sufficient for planned uses.
|
||||
*/
|
||||
bool
|
||||
vars_known_equal(Query *root, Var *var1, Var *var2)
|
||||
{
|
||||
Index irel1;
|
||||
Index irel2;
|
||||
RelOptInfo *rel1;
|
||||
List *restrictlist;
|
||||
List *itm;
|
||||
|
||||
/*
|
||||
* Would need more work here if we wanted to check for known equality
|
||||
* of general clauses: there might be multiple base rels involved.
|
||||
*/
|
||||
Assert(IsA(var1, Var));
|
||||
irel1 = var1->varno;
|
||||
Assert(IsA(var2, Var));
|
||||
irel2 = var2->varno;
|
||||
|
||||
/*
|
||||
* If both vars belong to same rel, we need to look at that rel's
|
||||
* baserestrictinfo list. If different rels, each will have a
|
||||
* joininfo node for the other, and we can scan either list.
|
||||
*/
|
||||
rel1 = find_base_rel(root, irel1);
|
||||
if (irel1 == irel2)
|
||||
restrictlist = rel1->baserestrictinfo;
|
||||
else
|
||||
{
|
||||
JoinInfo *joininfo = find_joininfo_node(rel1,
|
||||
makeListi1(irel2));
|
||||
|
||||
restrictlist = joininfo->jinfo_restrictinfo;
|
||||
}
|
||||
|
||||
/*
|
||||
* Scan to see if equality is known.
|
||||
*/
|
||||
foreach(itm, restrictlist)
|
||||
{
|
||||
RestrictInfo *restrictinfo = (RestrictInfo *) lfirst(itm);
|
||||
Node *left,
|
||||
*right;
|
||||
|
||||
if (restrictinfo->mergejoinoperator == InvalidOid)
|
||||
continue; /* ignore non-mergejoinable clauses */
|
||||
/* We now know the restrictinfo clause is a binary opclause */
|
||||
left = (Node *) get_leftop(restrictinfo->clause);
|
||||
right = (Node *) get_rightop(restrictinfo->clause);
|
||||
if ((equal(var1, left) && equal(var2, right)) ||
|
||||
(equal(var2, left) && equal(var1, right)))
|
||||
return true; /* found a matching clause */
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* qual_is_redundant
|
||||
* Detect whether an implied-equality qual that turns out to be a
|
||||
|
||||
@@ -8,14 +8,17 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $Header: /cvsroot/pgsql/src/backend/optimizer/plan/planner.c,v 1.128 2002/11/14 19:00:36 tgl Exp $
|
||||
* $Header: /cvsroot/pgsql/src/backend/optimizer/plan/planner.c,v 1.129 2002/11/19 23:21:59 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#include "postgres.h"
|
||||
|
||||
#include <limits.h>
|
||||
|
||||
#include "catalog/pg_type.h"
|
||||
#include "miscadmin.h"
|
||||
#include "nodes/makefuncs.h"
|
||||
#ifdef OPTIMIZER_DEBUG
|
||||
#include "nodes/print.h"
|
||||
@@ -35,6 +38,7 @@
|
||||
#include "parser/parse_expr.h"
|
||||
#include "rewrite/rewriteManip.h"
|
||||
#include "utils/lsyscache.h"
|
||||
#include "utils/selfuncs.h"
|
||||
|
||||
|
||||
/* Expression kind codes for preprocess_expression */
|
||||
@@ -160,6 +164,23 @@ subquery_planner(Query *parse, double tuple_fraction)
|
||||
parse->jointree = (FromExpr *)
|
||||
preprocess_jointree(parse, (Node *) parse->jointree);
|
||||
|
||||
/*
|
||||
* Detect whether any rangetable entries are RTE_JOIN kind; if not,
|
||||
* we can avoid the expense of doing flatten_join_alias_vars().
|
||||
* This must be done after we have done pull_up_subqueries, of course.
|
||||
*/
|
||||
parse->hasJoinRTEs = false;
|
||||
foreach(lst, parse->rtable)
|
||||
{
|
||||
RangeTblEntry *rte = (RangeTblEntry *) lfirst(lst);
|
||||
|
||||
if (rte->rtekind == RTE_JOIN)
|
||||
{
|
||||
parse->hasJoinRTEs = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Do expression preprocessing on targetlist and quals.
|
||||
*/
|
||||
@@ -694,9 +715,6 @@ preprocess_jointree(Query *parse, Node *jtnode)
|
||||
static Node *
|
||||
preprocess_expression(Query *parse, Node *expr, int kind)
|
||||
{
|
||||
bool has_join_rtes;
|
||||
List *rt;
|
||||
|
||||
/*
|
||||
* Simplify constant expressions.
|
||||
*
|
||||
@@ -737,22 +755,8 @@ preprocess_expression(Query *parse, Node *expr, int kind)
|
||||
* with base-relation variables, to allow quals to be pushed down. We
|
||||
* must do this after sublink processing, since it does not recurse
|
||||
* into sublinks.
|
||||
*
|
||||
* The flattening pass is expensive enough that it seems worthwhile to
|
||||
* scan the rangetable to see if we can avoid it.
|
||||
*/
|
||||
has_join_rtes = false;
|
||||
foreach(rt, parse->rtable)
|
||||
{
|
||||
RangeTblEntry *rte = lfirst(rt);
|
||||
|
||||
if (rte->rtekind == RTE_JOIN)
|
||||
{
|
||||
has_join_rtes = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (has_join_rtes)
|
||||
if (parse->hasJoinRTEs)
|
||||
expr = flatten_join_alias_vars(expr, parse->rtable, false);
|
||||
|
||||
return expr;
|
||||
@@ -931,6 +935,9 @@ grouping_planner(Query *parse, double tuple_fraction)
|
||||
AttrNumber *groupColIdx = NULL;
|
||||
Path *cheapest_path;
|
||||
Path *sorted_path;
|
||||
double dNumGroups = 0;
|
||||
long numGroups = 0;
|
||||
int numAggs = 0;
|
||||
bool use_hashed_grouping = false;
|
||||
|
||||
/* Preprocess targetlist in case we are inside an INSERT/UPDATE. */
|
||||
@@ -1006,6 +1013,19 @@ grouping_planner(Query *parse, double tuple_fraction)
|
||||
sort_pathkeys = make_pathkeys_for_sortclauses(parse->sortClause,
|
||||
tlist);
|
||||
|
||||
/*
|
||||
* Will need actual number of aggregates for estimating costs.
|
||||
* Also, it's possible that optimization has eliminated all
|
||||
* aggregates, and we may as well check for that here.
|
||||
*/
|
||||
if (parse->hasAggs)
|
||||
{
|
||||
numAggs = length(pull_agg_clause((Node *) tlist)) +
|
||||
length(pull_agg_clause(parse->havingQual));
|
||||
if (numAggs == 0)
|
||||
parse->hasAggs = false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Figure out whether we need a sorted result from query_planner.
|
||||
*
|
||||
@@ -1215,6 +1235,14 @@ grouping_planner(Query *parse, double tuple_fraction)
|
||||
*/
|
||||
if (parse->groupClause)
|
||||
{
|
||||
/*
|
||||
* Always estimate the number of groups.
|
||||
*/
|
||||
dNumGroups = estimate_num_groups(parse,
|
||||
parse->groupClause,
|
||||
cheapest_path->parent->rows);
|
||||
numGroups = (long) Min(dNumGroups, (double) LONG_MAX);
|
||||
|
||||
/*
|
||||
* Executor doesn't support hashed aggregation with DISTINCT
|
||||
* aggregates. (Doing so would imply storing *all* the input
|
||||
@@ -1226,10 +1254,30 @@ grouping_planner(Query *parse, double tuple_fraction)
|
||||
use_hashed_grouping = false;
|
||||
else
|
||||
{
|
||||
#if 0 /* much more to do here */
|
||||
/* TEMPORARY HOTWIRE FOR TESTING */
|
||||
use_hashed_grouping = true;
|
||||
/*
|
||||
* Use hashed grouping if (a) we think we can fit the
|
||||
* hashtable into SortMem, *and* (b) the estimated cost
|
||||
* is no more than doing it the other way. While avoiding
|
||||
* the need for sorted input is usually a win, the fact
|
||||
* that the output won't be sorted may be a loss; so we
|
||||
* need to do an actual cost comparison.
|
||||
*
|
||||
* In most cases we have no good way to estimate the size of
|
||||
* the transition value needed by an aggregate; arbitrarily
|
||||
* assume it is 100 bytes. Also set the overhead per hashtable
|
||||
* entry at 64 bytes.
|
||||
*/
|
||||
int hashentrysize = cheapest_path->parent->width + 64 +
|
||||
numAggs * 100;
|
||||
|
||||
if (hashentrysize * dNumGroups <= SortMem * 1024L)
|
||||
{
|
||||
/* much more to do here */
|
||||
#if 0
|
||||
/* TEMPORARY HOTWIRE FOR TESTING */
|
||||
use_hashed_grouping = true;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1319,6 +1367,8 @@ grouping_planner(Query *parse, double tuple_fraction)
|
||||
AGG_HASHED,
|
||||
length(parse->groupClause),
|
||||
groupColIdx,
|
||||
numGroups,
|
||||
numAggs,
|
||||
result_plan);
|
||||
/* Hashed aggregation produces randomly-ordered results */
|
||||
current_pathkeys = NIL;
|
||||
@@ -1356,6 +1406,8 @@ grouping_planner(Query *parse, double tuple_fraction)
|
||||
aggstrategy,
|
||||
length(parse->groupClause),
|
||||
groupColIdx,
|
||||
numGroups,
|
||||
numAggs,
|
||||
result_plan);
|
||||
}
|
||||
else
|
||||
@@ -1387,6 +1439,7 @@ grouping_planner(Query *parse, double tuple_fraction)
|
||||
result_plan = (Plan *) make_group(tlist,
|
||||
length(parse->groupClause),
|
||||
groupColIdx,
|
||||
dNumGroups,
|
||||
result_plan);
|
||||
}
|
||||
}
|
||||
@@ -1410,6 +1463,16 @@ grouping_planner(Query *parse, double tuple_fraction)
|
||||
{
|
||||
result_plan = (Plan *) make_unique(tlist, result_plan,
|
||||
parse->distinctClause);
|
||||
/*
|
||||
* If there was grouping or aggregation, leave plan_rows as-is
|
||||
* (ie, assume the result was already mostly unique). If not,
|
||||
* it's reasonable to assume the UNIQUE filter has effects
|
||||
* comparable to GROUP BY.
|
||||
*/
|
||||
if (!parse->groupClause && !parse->hasAggs)
|
||||
result_plan->plan_rows = estimate_num_groups(parse,
|
||||
parse->distinctClause,
|
||||
result_plan->plan_rows);
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $Header: /cvsroot/pgsql/src/backend/optimizer/plan/setrefs.c,v 1.81 2002/09/04 20:31:21 momjian Exp $
|
||||
* $Header: /cvsroot/pgsql/src/backend/optimizer/plan/setrefs.c,v 1.82 2002/11/19 23:21:59 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -439,7 +439,14 @@ join_references_mutator(Node *node,
|
||||
return (Node *) newvar;
|
||||
}
|
||||
|
||||
/* Perhaps it's a join alias that can be resolved to input vars? */
|
||||
/* Return the Var unmodified, if it's for acceptable_rel */
|
||||
if (var->varno == context->acceptable_rel)
|
||||
return (Node *) copyObject(var);
|
||||
|
||||
/*
|
||||
* Perhaps it's a join alias that can be resolved to input vars?
|
||||
* We try this last since it's relatively slow.
|
||||
*/
|
||||
newnode = flatten_join_alias_vars((Node *) var,
|
||||
context->rtable,
|
||||
true);
|
||||
@@ -450,13 +457,8 @@ join_references_mutator(Node *node,
|
||||
return newnode;
|
||||
}
|
||||
|
||||
/*
|
||||
* No referent found for Var --- either raise an error, or return
|
||||
* the Var unmodified if it's for acceptable_rel.
|
||||
*/
|
||||
if (var->varno != context->acceptable_rel)
|
||||
elog(ERROR, "join_references: variable not in subplan target lists");
|
||||
return (Node *) copyObject(var);
|
||||
/* No referent found for Var */
|
||||
elog(ERROR, "join_references: variable not in subplan target lists");
|
||||
}
|
||||
return expression_tree_mutator(node,
|
||||
join_references_mutator,
|
||||
|
||||
Reference in New Issue
Block a user