mirror of
https://github.com/postgres/postgres.git
synced 2025-04-24 10:47:04 +03:00
passed to join selectivity estimators. Make use of this in eqjoinsel to derive non-bogus selectivity for IN clauses. Further tweaking of cost estimation for IN. initdb forced because of pg_proc.h changes.
550 lines
15 KiB
C
550 lines
15 KiB
C
/*-------------------------------------------------------------------------
|
|
*
|
|
* clausesel.c
|
|
* Routines to compute clause selectivities
|
|
*
|
|
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
|
*
|
|
*
|
|
* IDENTIFICATION
|
|
* $Header: /cvsroot/pgsql/src/backend/optimizer/path/clausesel.c,v 1.56 2003/01/28 22:13:29 tgl Exp $
|
|
*
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
#include "postgres.h"
|
|
|
|
#include "catalog/pg_operator.h"
|
|
#include "catalog/pg_type.h"
|
|
#include "nodes/makefuncs.h"
|
|
#include "optimizer/clauses.h"
|
|
#include "optimizer/cost.h"
|
|
#include "optimizer/plancat.h"
|
|
#include "optimizer/restrictinfo.h"
|
|
#include "parser/parsetree.h"
|
|
#include "utils/fmgroids.h"
|
|
#include "utils/lsyscache.h"
|
|
#include "utils/selfuncs.h"
|
|
|
|
|
|
/*
 * MAKEBOOLCONST -
 *	  Build a boolean Const node via makeConst() and hand it back as a Node *.
 *
 * The type length passed to makeConst() is the literal 1: pg_type.h
 * hardwires the size of bool as 1, and that value is duplicated here.
 * The trailing "true" argument is presumably makeConst's pass-by-value
 * flag --- confirm against nodes/makefuncs.h.
 */
#define MAKEBOOLCONST(boolval, nullflag) \
	((Node *) makeConst(BOOLOID, 1, (Datum) (boolval), (nullflag), true))
|
|
|
|
|
|
/*
|
|
* Data structure for accumulating info about possible range-query
|
|
* clause pairs in clauselist_selectivity.
|
|
*/
|
|
typedef struct RangeQueryClause
|
|
{
|
|
struct RangeQueryClause *next; /* next in linked list */
|
|
Node *var; /* The common variable of the clauses */
|
|
bool have_lobound; /* found a low-bound clause yet? */
|
|
bool have_hibound; /* found a high-bound clause yet? */
|
|
Selectivity lobound; /* Selectivity of a var > something clause */
|
|
Selectivity hibound; /* Selectivity of a var < something clause */
|
|
} RangeQueryClause;
|
|
|
|
/* Forward declaration; the definition appears further down in this file. */
static void addRangeClause(RangeQueryClause **rqlist,
			   Node *clause,
			   bool varonleft,
			   bool isLTsel,
			   Selectivity s2);
|
|
|
|
|
|
/****************************************************************************
|
|
* ROUTINES TO COMPUTE SELECTIVITIES
|
|
****************************************************************************/
|
|
|
|
/*
|
|
* restrictlist_selectivity -
|
|
* Compute the selectivity of an implicitly-ANDed list of RestrictInfo
|
|
* clauses.
|
|
*
|
|
* This is the same as clauselist_selectivity except for the representation
|
|
* of the clause list.
|
|
*/
|
|
Selectivity
|
|
restrictlist_selectivity(Query *root,
|
|
List *restrictinfo_list,
|
|
int varRelid,
|
|
JoinType jointype)
|
|
{
|
|
List *clauselist = get_actual_clauses(restrictinfo_list);
|
|
Selectivity result;
|
|
|
|
result = clauselist_selectivity(root, clauselist, varRelid, jointype);
|
|
freeList(clauselist);
|
|
return result;
|
|
}
|
|
|
|
/*
|
|
* clauselist_selectivity -
|
|
* Compute the selectivity of an implicitly-ANDed list of boolean
|
|
* expression clauses. The list can be empty, in which case 1.0
|
|
* must be returned.
|
|
*
|
|
* See clause_selectivity() for the meaning of the additional parameters.
|
|
*
|
|
* Our basic approach is to take the product of the selectivities of the
|
|
* subclauses. However, that's only right if the subclauses have independent
|
|
* probabilities, and in reality they are often NOT independent. So,
|
|
* we want to be smarter where we can.
|
|
|
|
* Currently, the only extra smarts we have is to recognize "range queries",
|
|
* such as "x > 34 AND x < 42". Clauses are recognized as possible range
|
|
* query components if they are restriction opclauses whose operators have
|
|
* scalarltsel() or scalargtsel() as their restriction selectivity estimator.
|
|
* We pair up clauses of this form that refer to the same variable. An
|
|
* unpairable clause of this kind is simply multiplied into the selectivity
|
|
* product in the normal way. But when we find a pair, we know that the
|
|
* selectivities represent the relative positions of the low and high bounds
|
|
* within the column's range, so instead of figuring the selectivity as
|
|
* hisel * losel, we can figure it as hisel + losel - 1. (To visualize this,
|
|
* see that hisel is the fraction of the range below the high bound, while
|
|
* losel is the fraction above the low bound; so hisel can be interpreted
|
|
* directly as a 0..1 value but we need to convert losel to 1-losel before
|
|
* interpreting it as a value. Then the available range is 1-losel to hisel.
|
|
* However, this calculation double-excludes nulls, so really we need
|
|
* hisel + losel + null_frac - 1.)
|
|
* If the calculation yields zero or negative, however, we chicken out and
|
|
* use a default estimate; that probably means that one or both
|
|
* selectivities is a default estimate rather than an actual range value.
|
|
* Of course this is all very dependent on the behavior of
|
|
* scalarltsel/scalargtsel; perhaps some day we can generalize the approach.
|
|
*/
|
|
Selectivity
|
|
clauselist_selectivity(Query *root,
|
|
List *clauses,
|
|
int varRelid,
|
|
JoinType jointype)
|
|
{
|
|
Selectivity s1 = 1.0;
|
|
RangeQueryClause *rqlist = NULL;
|
|
List *clist;
|
|
|
|
/*
|
|
* Initial scan over clauses. Anything that doesn't look like a
|
|
* potential rangequery clause gets multiplied into s1 and forgotten.
|
|
* Anything that does gets inserted into an rqlist entry.
|
|
*/
|
|
foreach(clist, clauses)
|
|
{
|
|
Node *clause = (Node *) lfirst(clist);
|
|
Selectivity s2;
|
|
|
|
/*
|
|
* See if it looks like a restriction clause with a pseudoconstant
|
|
* on one side. (Anything more complicated than that might not
|
|
* behave in the simple way we are expecting.)
|
|
*
|
|
* NB: for consistency of results, this fragment of code had better
|
|
* match what clause_selectivity() would do in the cases it
|
|
* handles.
|
|
*/
|
|
if (is_opclause(clause) &&
|
|
(varRelid != 0 || NumRelids(clause) == 1))
|
|
{
|
|
OpExpr *expr = (OpExpr *) clause;
|
|
|
|
if (length(expr->args) == 2)
|
|
{
|
|
bool varonleft = true;
|
|
|
|
if (is_pseudo_constant_clause(lsecond(expr->args)) ||
|
|
(varonleft = false,
|
|
is_pseudo_constant_clause(lfirst(expr->args))))
|
|
{
|
|
Oid opno = expr->opno;
|
|
RegProcedure oprrest = get_oprrest(opno);
|
|
|
|
s2 = restriction_selectivity(root, opno,
|
|
expr->args, varRelid);
|
|
|
|
/*
|
|
* If we reach here, we have computed the same result
|
|
* that clause_selectivity would, so we can just use
|
|
* s2 if it's the wrong oprrest. But if it's the
|
|
* right oprrest, add the clause to rqlist for later
|
|
* processing.
|
|
*/
|
|
switch (oprrest)
|
|
{
|
|
case F_SCALARLTSEL:
|
|
addRangeClause(&rqlist, clause,
|
|
varonleft, true, s2);
|
|
break;
|
|
case F_SCALARGTSEL:
|
|
addRangeClause(&rqlist, clause,
|
|
varonleft, false, s2);
|
|
break;
|
|
default:
|
|
/* Just merge the selectivity in generically */
|
|
s1 = s1 * s2;
|
|
break;
|
|
}
|
|
continue; /* drop to loop bottom */
|
|
}
|
|
}
|
|
}
|
|
/* Not the right form, so treat it generically. */
|
|
s2 = clause_selectivity(root, clause, varRelid, jointype);
|
|
s1 = s1 * s2;
|
|
}
|
|
|
|
/*
|
|
* Now scan the rangequery pair list.
|
|
*/
|
|
while (rqlist != NULL)
|
|
{
|
|
RangeQueryClause *rqnext;
|
|
|
|
if (rqlist->have_lobound && rqlist->have_hibound)
|
|
{
|
|
/* Successfully matched a pair of range clauses */
|
|
Selectivity s2 = rqlist->hibound + rqlist->lobound - 1.0;
|
|
|
|
/* Adjust for double-exclusion of NULLs */
|
|
s2 += nulltestsel(root, IS_NULL, rqlist->var, varRelid);
|
|
|
|
/*
|
|
* A zero or slightly negative s2 should be converted into a
|
|
* small positive value; we probably are dealing with a very
|
|
* tight range and got a bogus result due to roundoff errors.
|
|
* However, if s2 is very negative, then we probably have
|
|
* default selectivity estimates on one or both sides of the
|
|
* range. In that case, insert a not-so-wildly-optimistic
|
|
* default estimate.
|
|
*/
|
|
if (s2 <= 0.0)
|
|
{
|
|
if (s2 < -0.01)
|
|
{
|
|
/*
|
|
* No data available --- use a default estimate that
|
|
* is small, but not real small.
|
|
*/
|
|
s2 = 0.005;
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
* It's just roundoff error; use a small positive
|
|
* value
|
|
*/
|
|
s2 = 1.0e-10;
|
|
}
|
|
}
|
|
/* Merge in the selectivity of the pair of clauses */
|
|
s1 *= s2;
|
|
}
|
|
else
|
|
{
|
|
/* Only found one of a pair, merge it in generically */
|
|
if (rqlist->have_lobound)
|
|
s1 *= rqlist->lobound;
|
|
else
|
|
s1 *= rqlist->hibound;
|
|
}
|
|
/* release storage and advance */
|
|
rqnext = rqlist->next;
|
|
pfree(rqlist);
|
|
rqlist = rqnext;
|
|
}
|
|
|
|
return s1;
|
|
}
|
|
|
|
/*
|
|
* addRangeClause --- add a new range clause for clauselist_selectivity
|
|
*
|
|
* Here is where we try to match up pairs of range-query clauses
|
|
*/
|
|
static void
|
|
addRangeClause(RangeQueryClause **rqlist, Node *clause,
|
|
bool varonleft, bool isLTsel, Selectivity s2)
|
|
{
|
|
RangeQueryClause *rqelem;
|
|
Node *var;
|
|
bool is_lobound;
|
|
|
|
if (varonleft)
|
|
{
|
|
var = get_leftop((Expr *) clause);
|
|
is_lobound = !isLTsel; /* x < something is high bound */
|
|
}
|
|
else
|
|
{
|
|
var = get_rightop((Expr *) clause);
|
|
is_lobound = isLTsel; /* something < x is low bound */
|
|
}
|
|
|
|
for (rqelem = *rqlist; rqelem; rqelem = rqelem->next)
|
|
{
|
|
/*
|
|
* We use full equal() here because the "var" might be a function
|
|
* of one or more attributes of the same relation...
|
|
*/
|
|
if (!equal(var, rqelem->var))
|
|
continue;
|
|
/* Found the right group to put this clause in */
|
|
if (is_lobound)
|
|
{
|
|
if (!rqelem->have_lobound)
|
|
{
|
|
rqelem->have_lobound = true;
|
|
rqelem->lobound = s2;
|
|
}
|
|
else
|
|
{
|
|
|
|
/*------
|
|
* We have found two similar clauses, such as
|
|
* x < y AND x < z.
|
|
* Keep only the more restrictive one.
|
|
*------
|
|
*/
|
|
if (rqelem->lobound > s2)
|
|
rqelem->lobound = s2;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if (!rqelem->have_hibound)
|
|
{
|
|
rqelem->have_hibound = true;
|
|
rqelem->hibound = s2;
|
|
}
|
|
else
|
|
{
|
|
|
|
/*------
|
|
* We have found two similar clauses, such as
|
|
* x > y AND x > z.
|
|
* Keep only the more restrictive one.
|
|
*------
|
|
*/
|
|
if (rqelem->hibound > s2)
|
|
rqelem->hibound = s2;
|
|
}
|
|
}
|
|
return;
|
|
}
|
|
|
|
/* No matching var found, so make a new clause-pair data structure */
|
|
rqelem = (RangeQueryClause *) palloc(sizeof(RangeQueryClause));
|
|
rqelem->var = var;
|
|
if (is_lobound)
|
|
{
|
|
rqelem->have_lobound = true;
|
|
rqelem->have_hibound = false;
|
|
rqelem->lobound = s2;
|
|
}
|
|
else
|
|
{
|
|
rqelem->have_lobound = false;
|
|
rqelem->have_hibound = true;
|
|
rqelem->hibound = s2;
|
|
}
|
|
rqelem->next = *rqlist;
|
|
*rqlist = rqelem;
|
|
}
|
|
|
|
|
|
/*
|
|
* clause_selectivity -
|
|
* Compute the selectivity of a general boolean expression clause.
|
|
*
|
|
* varRelid is either 0 or a rangetable index.
|
|
*
|
|
* When varRelid is not 0, only variables belonging to that relation are
|
|
* considered in computing selectivity; other vars are treated as constants
|
|
* of unknown values. This is appropriate for estimating the selectivity of
|
|
* a join clause that is being used as a restriction clause in a scan of a
|
|
* nestloop join's inner relation --- varRelid should then be the ID of the
|
|
* inner relation.
|
|
*
|
|
* When varRelid is 0, all variables are treated as variables. This
|
|
* is appropriate for ordinary join clauses and restriction clauses.
|
|
*
|
|
* jointype is the join type, if the clause is a join clause. Pass JOIN_INNER
|
|
* if the clause isn't a join clause or the context is uncertain.
|
|
*/
|
|
Selectivity
|
|
clause_selectivity(Query *root,
|
|
Node *clause,
|
|
int varRelid,
|
|
JoinType jointype)
|
|
{
|
|
Selectivity s1 = 1.0; /* default for any unhandled clause type */
|
|
|
|
if (clause == NULL)
|
|
return s1;
|
|
if (IsA(clause, Var))
|
|
{
|
|
Var *var = (Var *) clause;
|
|
|
|
/*
|
|
* We probably shouldn't ever see an uplevel Var here, but if we
|
|
* do, return the default selectivity...
|
|
*/
|
|
if (var->varlevelsup == 0 &&
|
|
(varRelid == 0 || varRelid == (int) var->varno))
|
|
{
|
|
RangeTblEntry *rte = rt_fetch(var->varno, root->rtable);
|
|
|
|
if (rte->rtekind == RTE_SUBQUERY)
|
|
{
|
|
/*
|
|
* XXX not smart about subquery references... any way to
|
|
* do better?
|
|
*/
|
|
s1 = 0.5;
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
* A Var at the top of a clause must be a bool Var. This
|
|
* is equivalent to the clause reln.attribute = 't', so we
|
|
* compute the selectivity as if that is what we have.
|
|
*/
|
|
s1 = restriction_selectivity(root,
|
|
BooleanEqualOperator,
|
|
makeList2(var,
|
|
MAKEBOOLCONST(true,
|
|
false)),
|
|
varRelid);
|
|
}
|
|
}
|
|
}
|
|
else if (IsA(clause, Param))
|
|
{
|
|
/* XXX any way to do better? */
|
|
s1 = 1.0;
|
|
}
|
|
else if (IsA(clause, Const))
|
|
{
|
|
/* bool constant is pretty easy... */
|
|
s1 = ((bool) ((Const *) clause)->constvalue) ? 1.0 : 0.0;
|
|
}
|
|
else if (not_clause(clause))
|
|
{
|
|
/* inverse of the selectivity of the underlying clause */
|
|
s1 = 1.0 - clause_selectivity(root,
|
|
(Node *) get_notclausearg((Expr *) clause),
|
|
varRelid,
|
|
jointype);
|
|
}
|
|
else if (and_clause(clause))
|
|
{
|
|
/* share code with clauselist_selectivity() */
|
|
s1 = clauselist_selectivity(root,
|
|
((BoolExpr *) clause)->args,
|
|
varRelid,
|
|
jointype);
|
|
}
|
|
else if (or_clause(clause))
|
|
{
|
|
/*
|
|
* Selectivities for an 'or' clause are computed as s1+s2 - s1*s2
|
|
* to account for the probable overlap of selected tuple sets. XXX
|
|
* is this too conservative?
|
|
*/
|
|
List *arg;
|
|
|
|
s1 = 0.0;
|
|
foreach(arg, ((BoolExpr *) clause)->args)
|
|
{
|
|
Selectivity s2 = clause_selectivity(root,
|
|
(Node *) lfirst(arg),
|
|
varRelid,
|
|
jointype);
|
|
|
|
s1 = s1 + s2 - s1 * s2;
|
|
}
|
|
}
|
|
else if (is_opclause(clause))
|
|
{
|
|
Oid opno = ((OpExpr *) clause)->opno;
|
|
bool is_join_clause;
|
|
|
|
if (varRelid != 0)
|
|
{
|
|
/*
|
|
* If we are considering a nestloop join then all clauses are
|
|
* restriction clauses, since we are only interested in the
|
|
* one relation.
|
|
*/
|
|
is_join_clause = false;
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
* Otherwise, it's a join if there's more than one relation
|
|
* used.
|
|
*/
|
|
is_join_clause = (NumRelids(clause) > 1);
|
|
}
|
|
|
|
if (is_join_clause)
|
|
{
|
|
/* Estimate selectivity for a join clause. */
|
|
s1 = join_selectivity(root, opno,
|
|
((OpExpr *) clause)->args,
|
|
jointype);
|
|
}
|
|
else
|
|
{
|
|
/* Estimate selectivity for a restriction clause. */
|
|
s1 = restriction_selectivity(root, opno,
|
|
((OpExpr *) clause)->args,
|
|
varRelid);
|
|
}
|
|
}
|
|
else if (is_funcclause(clause))
|
|
{
|
|
/*
|
|
* This is not an operator, so we guess at the selectivity. THIS
|
|
* IS A HACK TO GET V4 OUT THE DOOR. FUNCS SHOULD BE ABLE TO HAVE
|
|
* SELECTIVITIES THEMSELVES. -- JMH 7/9/92
|
|
*/
|
|
s1 = (Selectivity) 0.3333333;
|
|
}
|
|
else if (is_subplan(clause))
|
|
{
|
|
/*
|
|
* Just for the moment! FIX ME! - vadim 02/04/98
|
|
*/
|
|
s1 = (Selectivity) 0.5;
|
|
}
|
|
else if (IsA(clause, NullTest))
|
|
{
|
|
/* Use node specific selectivity calculation function */
|
|
s1 = nulltestsel(root,
|
|
((NullTest *) clause)->nulltesttype,
|
|
(Node *) ((NullTest *) clause)->arg,
|
|
varRelid);
|
|
}
|
|
else if (IsA(clause, BooleanTest))
|
|
{
|
|
/* Use node specific selectivity calculation function */
|
|
s1 = booltestsel(root,
|
|
((BooleanTest *) clause)->booltesttype,
|
|
(Node *) ((BooleanTest *) clause)->arg,
|
|
varRelid,
|
|
jointype);
|
|
}
|
|
else if (IsA(clause, RelabelType))
|
|
{
|
|
/* Not sure this case is needed, but it can't hurt */
|
|
s1 = clause_selectivity(root,
|
|
(Node *) ((RelabelType *) clause)->arg,
|
|
varRelid,
|
|
jointype);
|
|
}
|
|
|
|
#ifdef SELECTIVITY_DEBUG
|
|
elog(DEBUG3, "clause_selectivity: s1 %f", s1);
|
|
#endif /* SELECTIVITY_DEBUG */
|
|
|
|
return s1;
|
|
}
|