mirror of
https://github.com/postgres/postgres.git
synced 2025-11-04 20:11:56 +03:00
Use extended stats for precise estimation of bucket size in hash join
Recognizing the real-life complexity where columns in the table often have functional dependencies, PostgreSQL's estimation of the number of distinct values over a set of columns can be underestimated (or much rarely, overestimated) when dealing with multi-clause JOIN. In the case of hash join, it can end up with a small number of predicted hash buckets and, as a result, picking non-optimal merge join. To improve the situation, we introduce one additional stage of bucket size estimation - having two or more join clauses estimator lookup for extended statistics and use it for multicolumn estimation. Clauses are grouped into lists, each containing expressions referencing the same relation. The result of the multicolumn estimation made over such a list is combined with others according to the caller's logic. Clauses that are not estimated are returned to the caller for further estimation. Discussion: https://postgr.es/m/52257607-57f6-850d-399a-ec33a654457b%40postgrespro.ru Author: Andrei Lepikhov <lepihov@gmail.com> Reviewed-by: Andy Fan <zhihui.fan1213@gmail.com> Reviewed-by: Tomas Vondra <tomas.vondra@enterprisedb.com> Reviewed-by: Alena Rybakina <lena.ribackina@yandex.ru> Reviewed-by: Alexander Korotkov <aekorotkov@gmail.com>
This commit is contained in:
@@ -3765,6 +3765,181 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
|
||||
return numdistinct;
|
||||
}
|
||||
|
||||
/*
|
||||
* Try to estimate the bucket size of the hash join inner side when the join
|
||||
* condition contains two or more clauses by employing extended statistics.
|
||||
*
|
||||
* The main idea of this approach is that the distinct value generated by
|
||||
* multivariate estimation on two or more columns would provide less bucket size
|
||||
* than estimation on one separate column.
|
||||
*
|
||||
* IMPORTANT: It is crucial to synchronize the approach of combining different
|
||||
* estimations with the caller's method.
|
||||
*
|
||||
* Return a list of clauses that didn't fetch any extended statistics.
|
||||
*/
|
||||
List *
|
||||
estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner,
|
||||
List *hashclauses,
|
||||
Selectivity *innerbucketsize)
|
||||
{
|
||||
List *clauses = list_copy(hashclauses);
|
||||
List *otherclauses = NIL;
|
||||
double ndistinct = 1.0;
|
||||
|
||||
if (list_length(hashclauses) <= 1)
|
||||
|
||||
/*
|
||||
* Nothing to do for a single clause. Could we employ univariate
|
||||
* extended stat here?
|
||||
*/
|
||||
return hashclauses;
|
||||
|
||||
while (clauses != NIL)
|
||||
{
|
||||
ListCell *lc;
|
||||
int relid = -1;
|
||||
List *varinfos = NIL;
|
||||
List *origin_rinfos = NIL;
|
||||
double mvndistinct;
|
||||
List *origin_varinfos;
|
||||
int group_relid = -1;
|
||||
RelOptInfo *group_rel = NULL;
|
||||
ListCell *lc1,
|
||||
*lc2;
|
||||
|
||||
/*
|
||||
* Find clauses, referencing the same single base relation and try to
|
||||
* estimate such a group with extended statistics. Create varinfo for
|
||||
* an approved clause, push it to otherclauses, if it can't be
|
||||
* estimated here or ignore to process at the next iteration.
|
||||
*/
|
||||
foreach(lc, clauses)
|
||||
{
|
||||
RestrictInfo *rinfo = lfirst_node(RestrictInfo, lc);
|
||||
Node *expr;
|
||||
Relids relids;
|
||||
GroupVarInfo *varinfo;
|
||||
|
||||
/*
|
||||
* Find the inner side of the join, which we need to estimate the
|
||||
* number of buckets. Use outer_is_left because the
|
||||
* clause_sides_match_join routine has called on hash clauses.
|
||||
*/
|
||||
relids = rinfo->outer_is_left ?
|
||||
rinfo->right_relids : rinfo->left_relids;
|
||||
expr = rinfo->outer_is_left ?
|
||||
get_rightop(rinfo->clause) : get_leftop(rinfo->clause);
|
||||
|
||||
if (bms_get_singleton_member(relids, &relid) &&
|
||||
root->simple_rel_array[relid]->statlist != NIL)
|
||||
{
|
||||
/*
|
||||
* This inner-side expression references only one relation.
|
||||
* Extended statistics on this clause can exist.
|
||||
*/
|
||||
if (group_relid < 0)
|
||||
{
|
||||
RangeTblEntry *rte = root->simple_rte_array[relid];
|
||||
|
||||
if (!rte || (rte->relkind != RELKIND_RELATION &&
|
||||
rte->relkind != RELKIND_MATVIEW &&
|
||||
rte->relkind != RELKIND_FOREIGN_TABLE &&
|
||||
rte->relkind != RELKIND_PARTITIONED_TABLE))
|
||||
{
|
||||
/* Extended statistics can't exist in principle */
|
||||
otherclauses = lappend(otherclauses, rinfo);
|
||||
clauses = foreach_delete_current(clauses, lc);
|
||||
continue;
|
||||
}
|
||||
|
||||
group_relid = relid;
|
||||
group_rel = root->simple_rel_array[relid];
|
||||
}
|
||||
else if (group_relid != relid)
|
||||
|
||||
/*
|
||||
* Being in the group forming state we don't need other
|
||||
* clauses.
|
||||
*/
|
||||
continue;
|
||||
|
||||
varinfo = (GroupVarInfo *) palloc(sizeof(GroupVarInfo));
|
||||
varinfo->var = expr;
|
||||
varinfo->rel = root->simple_rel_array[relid];
|
||||
varinfo->ndistinct = 0.0;
|
||||
varinfo->isdefault = false;
|
||||
varinfos = lappend(varinfos, varinfo);
|
||||
|
||||
/*
|
||||
* Remember the link to RestrictInfo for the case the clause
|
||||
* is failed to be estimated.
|
||||
*/
|
||||
origin_rinfos = lappend(origin_rinfos, rinfo);
|
||||
}
|
||||
else
|
||||
/* This clause can't be estimated with extended statistics */
|
||||
otherclauses = lappend(otherclauses, rinfo);
|
||||
|
||||
clauses = foreach_delete_current(clauses, lc);
|
||||
}
|
||||
|
||||
if (list_length(varinfos) < 2)
|
||||
{
|
||||
/*
|
||||
* Multivariate statistics doesn't apply to single columns except
|
||||
* for expressions, but it has not been implemented yet.
|
||||
*/
|
||||
otherclauses = list_concat(otherclauses, origin_rinfos);
|
||||
list_free_deep(varinfos);
|
||||
list_free(origin_rinfos);
|
||||
continue;
|
||||
}
|
||||
|
||||
Assert(group_rel != NULL);
|
||||
|
||||
/* Employ the extended statistics. */
|
||||
origin_varinfos = varinfos;
|
||||
for (;;)
|
||||
{
|
||||
bool estimated = estimate_multivariate_ndistinct(root,
|
||||
group_rel,
|
||||
&varinfos,
|
||||
&mvndistinct);
|
||||
|
||||
if (!estimated)
|
||||
break;
|
||||
|
||||
/*
|
||||
* We've got an estimation. Use ndistinct value in a consistent
|
||||
* way - according to the caller's logic (see
|
||||
* final_cost_hashjoin).
|
||||
*/
|
||||
if (ndistinct < mvndistinct)
|
||||
ndistinct = mvndistinct;
|
||||
Assert(ndistinct >= 1.0);
|
||||
}
|
||||
|
||||
Assert(list_length(origin_varinfos) == list_length(origin_rinfos));
|
||||
|
||||
/* Collect unmatched clauses as otherclauses. */
|
||||
forboth(lc1, origin_varinfos, lc2, origin_rinfos)
|
||||
{
|
||||
GroupVarInfo *vinfo = lfirst(lc1);
|
||||
|
||||
if (!list_member_ptr(varinfos, vinfo))
|
||||
/* Already estimated */
|
||||
continue;
|
||||
|
||||
/* Can't be estimated here - push to the returning list */
|
||||
otherclauses = lappend(otherclauses, lfirst(lc2));
|
||||
}
|
||||
}
|
||||
|
||||
*innerbucketsize = 1.0 / ndistinct;
|
||||
return otherclauses;
|
||||
}
|
||||
|
||||
/*
|
||||
* Estimate hash bucket statistics when the specified expression is used
|
||||
* as a hash key for the given number of buckets.
|
||||
|
||||
Reference in New Issue
Block a user