/* postgres/src/backend/optimizer/path/orindxpath.c */

/*-------------------------------------------------------------------------
 *
 * orindxpath.c
 *	  Routines to find index paths that match a set of OR clauses
 *
 * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/optimizer/path/orindxpath.c,v 1.67 2005/03/27 06:29:36 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "optimizer/clauses.h"
#include "optimizer/cost.h"
#include "optimizer/pathnode.h"
#include "optimizer/paths.h"
#include "optimizer/restrictinfo.h"


/*
 * Forward declarations of the file-local helpers defined later in this file.
 *
 * best_or_subclause_indexes() takes the subclause list of one OR clause and
 * returns an IndexPath, or NULL if some subclause cannot be matched to an
 * index.  best_or_subclause_index() handles a single subclause and reports
 * its choice through the ret* output pointers, returning false (and leaving
 * the outputs unset) when no index is usable.
 */
static IndexPath *best_or_subclause_indexes(Query *root, RelOptInfo *rel,
						  List *subclauses);
static bool best_or_subclause_index(Query *root,
						RelOptInfo *rel,
						Expr *subclause,
						IndexOptInfo **retIndexInfo,
						List **retIndexClauses,
						List **retIndexQuals,
						Cost *retStartupCost,
						Cost *retTotalCost);


/*----------
 * create_or_index_quals
 *	  Examine join OR-of-AND quals to see if any useful restriction OR
 *	  clauses can be extracted.  If so, add them to the query.
 *
 * Although a join clause must reference other relations overall,
 * an OR of ANDs clause might contain sub-clauses that reference just this
 * relation and can be used to build a restriction clause.
 * For example consider
 *		WHERE ((a.x = 42 AND b.y = 43) OR (a.x = 44 AND b.z = 45));
 * We can transform this into
 *		WHERE ((a.x = 42 AND b.y = 43) OR (a.x = 44 AND b.z = 45))
 *			AND (a.x = 42 OR a.x = 44)
 *			AND (b.y = 43 OR b.z = 45);
 * which opens the potential to build OR indexscans on a and b.  In essence
 * this is a partial transformation to CNF (AND of ORs format).  It is not
 * complete, however, because we do not unravel the original OR --- doing so
 * would usually bloat the qualification expression to little gain.
 *
 * The added quals are partially redundant with the original OR, and therefore
 * will cause the size of the joinrel to be underestimated when it is finally
 * formed.	(This would be true of a full transformation to CNF as well; the
 * fault is not really in the transformation, but in clauselist_selectivity's
 * inability to recognize redundant conditions.)  To minimize the collateral
 * damage, we want to minimize the number of quals added.  Therefore we do
 * not add every possible extracted restriction condition to the query.
 * Instead, we search for the single restriction condition that generates
 * the most useful (cheapest) OR indexscan, and add only that condition.
 * This is a pretty ad-hoc heuristic, but quite useful.
 *
 * We can then compensate for the redundancy of the added qual by poking
 * the recorded selectivity of the original OR clause, thereby ensuring
 * the added qual doesn't change the estimated size of the joinrel when
 * it is finally formed.  This is a MAJOR HACK: it depends on the fact
 * that clause selectivities are cached and on the fact that the same
 * RestrictInfo node will appear in every joininfo list that might be used
 * when the joinrel is formed.	And it probably isn't right in cases where
 * the size estimation is nonlinear (i.e., outer and IN joins).  But it
 * beats not doing anything.
 *
 * NOTE: one might think this messiness could be worked around by generating
 * the indexscan path with a small path->rows value, and not touching the
 * rel's baserestrictinfo or rel->rows.  However, that does not work.
 * The optimizer's fundamental design assumes that every general-purpose
 * Path for a given relation generates the same number of rows.  Without
 * this assumption we'd not be able to optimize solely on the cost of Paths,
 * but would have to take number of output rows into account as well.
 * (Perhaps someday that'd be worth doing, but it's a pretty big change...)
 *
 * 'rel' is the relation entry for which quals are to be created
 *
 * If successful, adds qual(s) to rel->baserestrictinfo and returns TRUE.
 * If no quals available, returns FALSE and doesn't change rel.
 *
 * Note: check_partial_indexes() must have been run previously.
 *----------
 */
bool
create_or_index_quals(Query *root, RelOptInfo *rel)
{
	IndexPath  *cheapest_path = NULL;
	RestrictInfo *source_rinfo = NULL;
	RestrictInfo *added_rinfo;
	List	   *added_list;
	Selectivity added_selec;
	ListCell   *jcell;

	/*
	 * Walk every join clause attached to this rel and ask
	 * best_or_subclause_indexes() whether an OR indexscan can be built from
	 * it.  Remember the cheapest such path together with the clause it came
	 * from.  Join clauses not marked valid_everywhere are skipped, since
	 * outer-join rules forbid pushing them down.
	 */
	foreach(jcell, rel->joininfo)
	{
		JoinInfo   *jinfo = (JoinInfo *) lfirst(jcell);
		ListCell   *rcell;

		foreach(rcell, jinfo->jinfo_restrictinfo)
		{
			RestrictInfo *candidate = (RestrictInfo *) lfirst(rcell);
			IndexPath  *candidate_path;

			/* Only OR clauses can yield an OR indexscan */
			if (!restriction_is_or_clause(candidate))
				continue;

			/* Not allowed to push this one down to the base rel */
			if (!candidate->valid_everywhere)
				continue;

			candidate_path = best_or_subclause_indexes(root,
													   rel,
							   ((BoolExpr *) candidate->orclause)->args);

			/* Some arm of the OR had no usable index */
			if (candidate_path == NULL)
				continue;

			/* Keep it if it is the first, or strictly cheaper than the best */
			if (cheapest_path == NULL ||
				candidate_path->path.total_cost < cheapest_path->path.total_cost)
			{
				cheapest_path = candidate_path;
				source_rinfo = candidate;
			}
		}
	}

	/* Nothing usable was found; leave rel untouched */
	if (cheapest_path == NULL)
		return false;

	/*
	 * Turn the winning path's indexclauses into a RestrictInfo tree (a
	 * single OR clause is expected) and append it to the rel's restriction
	 * list.
	 */
	added_list = make_restrictinfo_from_indexclauses(cheapest_path->indexclauses,
													 true, true);
	Assert(list_length(added_list) == 1);
	added_rinfo = (RestrictInfo *) linitial(added_list);
	rel->baserestrictinfo = list_concat(rel->baserestrictinfo, added_list);

	/*
	 * The new qual is redundant with the join clause it was extracted from.
	 * Compensate by dividing the join clause's cached selectivity by the new
	 * qual's selectivity, so the eventual joinrel size estimate comes out
	 * about the same as if nothing had been added.  (XXX major hack: relies
	 * on the selectivity remaining cached in the RestrictInfo.)
	 */
	added_selec = clause_selectivity(root, (Node *) added_rinfo,
									 0, JOIN_INNER);
	if (added_selec > 0 && added_selec < 1)
	{
		Selectivity join_selec;
		Selectivity adjusted;

		join_selec = clause_selectivity(root, (Node *) source_rinfo,
										0, JOIN_INNER);
		adjusted = join_selec / added_selec;

		/* never let the stored selectivity exceed 1 */
		source_rinfo->this_selec = (adjusted > 1) ? 1 : adjusted;
	}

	/* Caller must now recompute the rel's rows estimate */
	return true;
}

/*
 * create_or_index_paths
 *	  Creates multi-scan index paths for indexes that match OR clauses.
 *
 * 'rel' is the relation entry for which the paths are to be created
 *
 * Returns nothing, but adds paths to rel->pathlist via add_path().
 *
 * Note: check_partial_indexes() must have been run previously.
 */
void
create_or_index_paths(Query *root, RelOptInfo *rel)
{
	ListCell   *cell;

	/*
	 * Look at each restriction clause of the rel in turn; every one that is
	 * an OR clause gets a chance to produce a multi-scan index path.
	 */
	foreach(cell, rel->baserestrictinfo)
	{
		RestrictInfo *restriction = (RestrictInfo *) lfirst(cell);
		IndexPath  *orpath;

		/* Non-OR clauses are of no interest here */
		if (!restriction_is_or_clause(restriction))
			continue;

		orpath = best_or_subclause_indexes(root,
										   rel,
						   ((BoolExpr *) restriction->orclause)->args);

		/* NULL means some arm of the OR could not be matched to an index */
		if (orpath != NULL)
			add_path(rel, (Path *) orpath);
	}
}

/*
 * best_or_subclause_indexes
 *	  Determine the best index to be used in conjunction with each subclause
 *	  of an OR clause, and build a Path for a multi-index scan.
 *
 * 'rel' is the node of the relation to be scanned
 * 'subclauses' are the subclauses of the OR clause (must be the modified
 *		form that includes sub-RestrictInfo clauses)
 *
 * Returns an IndexPath if successful, or NULL if it is not possible to
 * find an index for each OR subclause.
 *
 * NOTE: we choose each scan on the basis of its total cost, ignoring startup
 * cost.  This is reasonable as long as all index types have zero or small
 * startup cost, but we might have to work harder if any index types with
 * nontrivial startup cost are ever invented.
 *
 * This routine also creates the indexqual list that will be needed by
 * the executor.  The indexqual list has one entry for each scan of the base
 * rel, which is a sublist of indexqual conditions to apply in that scan.
 * The implicit semantics are AND across each sublist of quals, and OR across
 * the toplevel list (note that the executor takes care not to return any
 * single tuple more than once).
 */
static IndexPath *
best_or_subclause_indexes(Query *root,
						  RelOptInfo *rel,
						  List *subclauses)
{
	IndexPath  *orpath;
	List	   *scan_indexes = NIL;
	List	   *scan_clauses = NIL;
	List	   *scan_quals = NIL;
	Cost		first_startup = 0;
	Cost		cost_sum = 0;
	bool		on_first_scan = true;
	ListCell   *cell;

	/*
	 * Pick the best index for every arm of the OR.  If any arm has no usable
	 * index, the whole OR cannot be done as a multi-index scan.
	 */
	foreach(cell, subclauses)
	{
		IndexOptInfo *sub_index;
		List	   *sub_clauses;
		List	   *sub_quals;
		Cost		sub_startup;
		Cost		sub_total;
		bool		matched;

		matched = best_or_subclause_index(root, rel, (Expr *) lfirst(cell),
										  &sub_index,
										  &sub_clauses, &sub_quals,
										  &sub_startup, &sub_total);
		if (!matched)
			return NULL;		/* this arm cannot be indexed */

		/* One entry per scan in each of the three parallel lists */
		scan_indexes = lappend(scan_indexes, sub_index);
		scan_clauses = lappend(scan_clauses, sub_clauses);
		scan_quals = lappend(scan_quals, sub_quals);

		/*
		 * Only the first scan's startup cost counts as the path's startup
		 * cost; later scans' startup costs are paid later and so are
		 * visible only through the total, which is the sum over all scans.
		 */
		if (on_first_scan)
		{
			first_startup = sub_startup;
			on_first_scan = false;
		}
		cost_sum += sub_total;
	}

	/* Every arm matched, so assemble the IndexPath */
	orpath = makeNode(IndexPath);

	orpath->indexinfo = scan_indexes;
	orpath->indexclauses = scan_clauses;
	orpath->indexquals = scan_quals;

	/* Not an innerjoin path, and scan direction is of no concern */
	orpath->isjoininner = false;
	orpath->indexscandir = NoMovementScanDirection;

	/*
	 * Since this is not a join inner indexscan, the row count is simply the
	 * parent rel's estimate.
	 */
	orpath->rows = rel->rows;

	orpath->path.pathtype = T_IndexScan;
	orpath->path.parent = rel;
	orpath->path.startup_cost = first_startup;
	orpath->path.total_cost = cost_sum;

	/*
	 * Tuples arrive in several passes (one per OR arm), so no ordering can
	 * be claimed for the combined output.
	 */
	orpath->path.pathkeys = NIL;

	return orpath;
}

/*
 * best_or_subclause_index
 *	  Determines which is the best index to be used with a subclause of an
 *	  OR clause by estimating the cost of using each index and selecting
 *	  the least expensive (considering total cost only, for now).
 *
 * Returns FALSE if no index exists that can be used with this OR subclause;
 * in that case the output parameters are not set.
 *
 * 'rel' is the node of the relation to be scanned
 * 'subclause' is the OR subclause being considered
 *
 * '*retIndexInfo' gets the IndexOptInfo of the best index
 * '*retIndexClauses' gets a list of the index clauses for the best index
 * '*retIndexQuals' gets a list of the expanded indexquals for the best index
 * '*retStartupCost' gets the startup cost of a scan with that index
 * '*retTotalCost' gets the total cost of a scan with that index
 */
static bool
best_or_subclause_index(Query *root,
						RelOptInfo *rel,
						Expr *subclause,
						IndexOptInfo **retIndexInfo,	/* return value */
						List **retIndexClauses, /* return value */
						List **retIndexQuals,	/* return value */
						Cost *retStartupCost,	/* return value */
						Cost *retTotalCost)		/* return value */
{
	bool		have_match = false;
	ListCell   *icell;

	foreach(icell, rel->indexlist)
	{
		IndexOptInfo *candidate = (IndexOptInfo *) lfirst(icell);
		bool		needs_local_proof;
		List	   *clausegroups;
		List	   *quals;
		Path		scan_cost;

		/*
		 * A partial index whose predicate is not already implied by the
		 * top-level restriction clauses (predOK false) may still be usable
		 * for this one scan, provided the predicate follows from the current
		 * OR subclause, possibly combined with the top-level clauses.
		 *
		 * XXX this overlaps with logic in
		 * group_clauses_by_indexkey_for_or(); consider refactoring.
		 */
		needs_local_proof = (candidate->indpred != NIL && !candidate->predOK);

		if (needs_local_proof)
		{
			List	   *proof_clauses;

			if (and_clause((Node *) subclause))
				proof_clauses = list_copy(((BoolExpr *) subclause)->args);
			else if (IsA(subclause, RestrictInfo))
				proof_clauses = list_make1(subclause);
			else
				continue;		/* probably can't happen */

			/* Subclause conditions first, then the rel's own restrictions */
			proof_clauses = list_concat(proof_clauses,
										rel->baserestrictinfo);
			if (!pred_test(candidate->indpred, proof_clauses))
				continue;		/* predicate not provable; skip index */
		}

		/* Find the clauses of the subclause that this index can use */
		clausegroups = group_clauses_by_indexkey_for_or(candidate, subclause);

		/*
		 * An index that matches nothing in the subclause is normally
		 * skipped.  The exception is a partial index that was just proven
		 * from this subclause: there the predicate itself is what matches.
		 * Partial indexes that are predOK get no such exemption, else they
		 * would appear to match *every* OR subclause and produce bogus OR
		 * plans redundant with a plain scan of that index.
		 */
		if (clausegroups == NIL && !needs_local_proof)
			continue;

		/* Expand into the indexqual form the executor works with */
		quals = expand_indexqual_conditions(candidate, clausegroups);

		/* Cost a scan with these quals; only the cost fields are consulted */
		cost_index(&scan_cost, root, candidate, quals, false);

		/* Accept the first usable index, or any strictly cheaper one */
		if (!have_match || scan_cost.total_cost < *retTotalCost)
		{
			*retTotalCost = scan_cost.total_cost;
			*retStartupCost = scan_cost.startup_cost;
			*retIndexQuals = quals;
			*retIndexClauses = flatten_clausegroups_list(clausegroups);
			*retIndexInfo = candidate;
			have_match = true;
		}
	}

	return have_match;
}