
New cost model for planning, incorporating a penalty for random page

accesses versus sequential accesses, a (very crude) estimate of the
effects of caching on random page accesses, and cost to evaluate WHERE-
clause expressions.  Export critical parameters for this model as SET
variables.  Also, create SET variables for the planner's enable flags
(enable_seqscan, enable_indexscan, etc) so that these can be controlled
more conveniently than via PGOPTIONS.

Planner now estimates both startup cost (cost before retrieving
first tuple) and total cost of each path, so it can optimize queries
with LIMIT on a reasonable basis by interpolating between these costs.
Same facility is a win for EXISTS(...) subqueries and some other cases.

Redesign pathkey representation to achieve a major speedup in planning
(I saw as much as 5X on a 10-way join); also minor changes in planner
to reduce memory consumption by recycling discarded Path nodes and
not constructing unnecessary lists.

Minor cleanups to display more-plausible costs in some cases in
EXPLAIN output.

Initdb forced by change in interface to index cost estimation
functions.
Author: Tom Lane
Date:   2000-02-15 20:49:31 +00:00
Parent: 553b5da6a1
Commit: b1577a7c78

50 changed files with 3200 additions and 1723 deletions
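
Before the diffs, a minimal illustrative sketch (not part of the commit) of the interpolation the commit message describes for LIMIT: given a path's startup and total costs, a caller can estimate the cost of fetching only the first N tuples. The helper name and the clamping of the fraction are assumptions made here purely for illustration.

    static double
    partial_fetch_cost(double startup_cost, double total_cost,
                       double tuples_to_fetch, double path_rows)
    {
        /* path_rows is kept >= 1 by the planner, so no zero divide */
        double  fraction = tuples_to_fetch / path_rows;

        if (fraction > 1.0)
            fraction = 1.0;     /* safety clamp: can't fetch more than all rows */
        return startup_cost + (total_cost - startup_cost) * fraction;
    }

For example, with LIMIT 10 over an estimated 10000 rows, a sorted path costing 800..1000 interpolates to about 800.2, while an indexscan costing 0..2000 interpolates to 2.0; the startup/total split is what lets the planner prefer the indexscan for small LIMITs.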

src/backend/optimizer/path/allpaths.c

@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/optimizer/path/allpaths.c,v 1.58 2000/02/07 04:40:59 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/optimizer/path/allpaths.c,v 1.59 2000/02/15 20:49:16 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -100,7 +100,7 @@ set_base_rel_pathlist(Query *root)
/*
* Generate paths and add them to the rel's pathlist.
*
* add_path/add_pathlist will discard any paths that are dominated
* Note: add_path() will discard any paths that are dominated
* by another available path, keeping only those paths that are
* superior along at least one dimension of cost or sortedness.
*/
@@ -109,24 +109,21 @@ set_base_rel_pathlist(Query *root)
add_path(rel, create_seqscan_path(rel));
/* Consider TID scans */
add_pathlist(rel, create_tidscan_paths(root, rel));
create_tidscan_paths(root, rel);
/* Consider index paths for both simple and OR index clauses */
add_pathlist(rel, create_index_paths(root,
rel,
indices,
rel->baserestrictinfo,
rel->joininfo));
create_index_paths(root, rel, indices,
rel->baserestrictinfo,
rel->joininfo);
/* Note: create_or_index_paths depends on create_index_paths
* to have marked OR restriction clauses with relevant indices;
* this is why it doesn't need to be given the full list of indices.
* this is why it doesn't need to be given the list of indices.
*/
add_pathlist(rel, create_or_index_paths(root, rel,
rel->baserestrictinfo));
create_or_index_paths(root, rel, rel->baserestrictinfo);
/* Now find the cheapest of the paths for this rel */
set_cheapest(rel, rel->pathlist);
set_cheapest(rel);
}
}
@@ -196,8 +193,8 @@ make_one_rel_by_joins(Query *root, int levels_needed)
xfunc_trypullup(rel);
#endif
/* Find and save the cheapest path for this rel */
set_cheapest(rel, rel->pathlist);
/* Find and save the cheapest paths for this rel */
set_cheapest(rel);
#ifdef OPTIMIZER_DEBUG
debug_print_rel(root, rel);
@@ -279,15 +276,26 @@ print_path(Query *root, Path *path, int indent)
if (join)
{
jp = (JoinPath *) path;
printf("%s rows=%.0f cost=%f\n",
ptype, path->parent->rows, path->path_cost);
printf("%s rows=%.0f cost=%.2f..%.2f\n",
ptype, path->parent->rows,
path->startup_cost, path->total_cost);
if (path->pathkeys)
{
for (i = 0; i < indent; i++)
printf("\t");
printf(" pathkeys=");
print_pathkeys(path->pathkeys, root->rtable);
}
switch (nodeTag(path))
{
case T_MergePath:
case T_HashPath:
for (i = 0; i < indent + 1; i++)
for (i = 0; i < indent; i++)
printf("\t");
printf(" clauses=(");
printf(" clauses=(");
print_joinclauses(root, jp->joinrestrictinfo);
printf(")\n");
@@ -297,9 +305,9 @@ print_path(Query *root, Path *path, int indent)
if (mp->outersortkeys || mp->innersortkeys)
{
for (i = 0; i < indent + 1; i++)
for (i = 0; i < indent; i++)
printf("\t");
printf(" sortouter=%d sortinner=%d\n",
printf(" sortouter=%d sortinner=%d\n",
((mp->outersortkeys) ? 1 : 0),
((mp->innersortkeys) ? 1 : 0));
}
@@ -315,11 +323,14 @@ print_path(Query *root, Path *path, int indent)
{
int relid = lfirsti(path->parent->relids);
printf("%s(%d) rows=%.0f cost=%f\n",
ptype, relid, path->parent->rows, path->path_cost);
printf("%s(%d) rows=%.0f cost=%.2f..%.2f\n",
ptype, relid, path->parent->rows,
path->startup_cost, path->total_cost);
if (IsA(path, IndexPath))
if (path->pathkeys)
{
for (i = 0; i < indent; i++)
printf("\t");
printf(" pathkeys=");
print_pathkeys(path->pathkeys, root->rtable);
}
@@ -339,8 +350,10 @@ debug_print_rel(Query *root, RelOptInfo *rel)
printf("\tpath list:\n");
foreach(l, rel->pathlist)
print_path(root, lfirst(l), 1);
printf("\tcheapest path:\n");
print_path(root, rel->cheapestpath, 1);
printf("\tcheapest startup path:\n");
print_path(root, rel->cheapest_startup_path, 1);
printf("\tcheapest total path:\n");
print_path(root, rel->cheapest_total_path, 1);
}
#endif /* OPTIMIZER_DEBUG */

src/backend/optimizer/path/costsize.c

@@ -3,23 +3,46 @@
* costsize.c
* Routines to compute (and set) relation sizes and path costs
*
* Path costs are measured in units of disk accesses: one page fetch
* has cost 1. The other primitive unit is the CPU time required to
* process one tuple, which we set at "cpu_page_weight" of a page
* fetch. Obviously, the CPU time per tuple depends on the query
* involved, but the relative CPU and disk speeds of a given platform
* are so variable that we are lucky if we can get useful numbers
* at all. cpu_page_weight is user-settable, in case a particular
* user is clueful enough to have a better-than-default estimate
* of the ratio for his platform. There is also cpu_index_page_weight,
* the cost to process a tuple of an index during an index scan.
* Path costs are measured in units of disk accesses: one sequential page
* fetch has cost 1. All else is scaled relative to a page fetch, using
* the scaling parameters
*
* random_page_cost Cost of a non-sequential page fetch
* cpu_tuple_cost Cost of typical CPU time to process a tuple
* cpu_index_tuple_cost Cost of typical CPU time to process an index tuple
* cpu_operator_cost Cost of CPU time to process a typical WHERE operator
*
* We also use a rough estimate "effective_cache_size" of the number of
* disk pages in Postgres + OS-level disk cache. (We can't simply use
* NBuffers for this purpose because that would ignore the effects of
* the kernel's disk cache.)
*
* Obviously, taking constants for these values is an oversimplification,
* but it's tough enough to get any useful estimates even at this level of
* detail. Note that all of these parameters are user-settable, in case
* the default values are drastically off for a particular platform.
*
* We compute two separate costs for each path:
* total_cost: total estimated cost to fetch all tuples
* startup_cost: cost that is expended before first tuple is fetched
* In some scenarios, such as when there is a LIMIT or we are implementing
* an EXISTS(...) sub-select, it is not necessary to fetch all tuples of the
* path's result. A caller can estimate the cost of fetching a partial
* result by interpolating between startup_cost and total_cost. In detail:
* actual_cost = startup_cost +
* (total_cost - startup_cost) * tuples_to_fetch / path->parent->rows;
* Note that a relation's rows count (and, by extension, a Plan's plan_rows)
* are set without regard to any LIMIT, so that this equation works properly.
* (Also, these routines guarantee not to set the rows count to zero, so there
* will be no zero divide.) RelOptInfos, Paths, and Plans themselves never
* account for LIMIT.
*
*
* Portions Copyright (c) 1996-2000, PostgreSQL, Inc
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/optimizer/path/costsize.c,v 1.51 2000/02/07 04:40:59 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/optimizer/path/costsize.c,v 1.52 2000/02/15 20:49:16 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -27,26 +50,25 @@
#include "postgres.h"
#include <math.h>
#ifdef HAVE_LIMITS_H
#include <limits.h>
#ifndef MAXINT
#define MAXINT INT_MAX
#endif
#else
#ifdef HAVE_VALUES_H
#include <values.h>
#endif
#endif
#include "miscadmin.h"
#include "nodes/plannodes.h"
#include "optimizer/clauses.h"
#include "optimizer/cost.h"
#include "optimizer/internal.h"
#include "optimizer/tlist.h"
#include "utils/lsyscache.h"
Cost cpu_page_weight = CPU_PAGE_WEIGHT;
Cost cpu_index_page_weight = CPU_INDEX_PAGE_WEIGHT;
#define LOG2(x) (log(x) / 0.693147180559945)
#define LOG6(x) (log(x) / 1.79175946922805)
double effective_cache_size = DEFAULT_EFFECTIVE_CACHE_SIZE;
Cost random_page_cost = DEFAULT_RANDOM_PAGE_COST;
Cost cpu_tuple_cost = DEFAULT_CPU_TUPLE_COST;
Cost cpu_index_tuple_cost = DEFAULT_CPU_INDEX_TUPLE_COST;
Cost cpu_operator_cost = DEFAULT_CPU_OPERATOR_COST;
Cost disable_cost = 100000000.0;
@@ -59,53 +81,114 @@ bool enable_mergejoin = true;
bool enable_hashjoin = true;
static bool cost_qual_eval_walker(Node *node, Cost *total);
static void set_rel_width(Query *root, RelOptInfo *rel);
static int compute_attribute_width(TargetEntry *tlistentry);
static double relation_byte_size(double tuples, int width);
static double page_size(double tuples, int width);
static double base_log(double x, double b);
/*
* cost_seqscan
* Determines and returns the cost of scanning a relation sequentially.
* If the relation is a temporary to be materialized from a query
* embedded within a data field (determined by 'relid' containing an
* attribute reference), then a predetermined constant is returned (we
* have NO IDEA how big the result of a POSTQUEL procedure is going to
* be).
*
* disk = p
* cpu = CPU-PAGE-WEIGHT * t
* If the relation is a temporary to be materialized from a query
* embedded within a data field (determined by 'relid' containing an
* attribute reference), then a predetermined constant is returned (we
* have NO IDEA how big the result of a POSTQUEL procedure is going to be).
*
* Note: for historical reasons, this routine and the others in this module
* use the passed result Path only to store their startup_cost and total_cost
* results into. All the input data they need is passed as separate
* parameters, even though much of it could be extracted from the result Path.
*/
Cost
cost_seqscan(RelOptInfo *baserel)
void
cost_seqscan(Path *path, RelOptInfo *baserel)
{
Cost temp = 0;
Cost startup_cost = 0;
Cost run_cost = 0;
Cost cpu_per_tuple;
/* Should only be applied to base relations */
Assert(length(baserel->relids) == 1);
if (!enable_seqscan)
temp += disable_cost;
startup_cost += disable_cost;
/* disk costs */
if (lfirsti(baserel->relids) < 0)
{
/*
* cost of sequentially scanning a materialized temporary relation
*/
temp += _NONAME_SCAN_COST_;
run_cost += _NONAME_SCAN_COST_;
}
else
{
temp += baserel->pages;
temp += cpu_page_weight * baserel->tuples;
/*
* The cost of reading a page sequentially is 1.0, by definition.
* Note that the Unix kernel will typically do some amount of
* read-ahead optimization, so that this cost is less than the true
* cost of reading a page from disk. We ignore that issue here,
* but must take it into account when estimating the cost of
* non-sequential accesses!
*/
run_cost += baserel->pages; /* sequential fetches with cost 1.0 */
}
Assert(temp >= 0);
return temp;
/* CPU costs */
cpu_per_tuple = cpu_tuple_cost + baserel->baserestrictcost;
run_cost += cpu_per_tuple * baserel->tuples;
path->startup_cost = startup_cost;
path->total_cost = startup_cost + run_cost;
}
/*
* cost_nonsequential_access
* Estimate the cost of accessing one page at random from a relation
* (or sort temp file) of the given size in pages.
*
* The simplistic model that the cost is random_page_cost is what we want
* to use for large relations; but for small ones that is a serious
* overestimate because of the effects of caching. This routine tries to
* account for that.
*
* Unfortunately we don't have any good way of estimating the effective cache
* size we are working with --- we know that Postgres itself has NBuffers
* internal buffers, but the size of the kernel's disk cache is uncertain,
* and how much of it we get to use is even less certain. We punt the problem
* for now by assuming we are given an effective_cache_size parameter.
*
* Given a guesstimated cache size, we estimate the actual I/O cost per page
* with the entirely ad-hoc equations:
* for rel_size <= effective_cache_size:
* 1 + (random_page_cost/2-1) * (rel_size/effective_cache_size) ** 2
* for rel_size >= effective_cache_size:
* random_page_cost * (1 - (effective_cache_size/rel_size)/2)
* These give the right asymptotic behavior (=> 1.0 as rel_size becomes
* small, => random_page_cost as it becomes large) and meet in the middle
* with the estimate that the cache is about 50% effective for a relation
* of the same size as effective_cache_size. (XXX this is probably all
* wrong, but I haven't been able to find any theory about how effective
* a disk cache should be presumed to be.)
*/
static Cost
cost_nonsequential_access(double relpages)
{
double relsize;
/* don't crash on bad input data */
if (relpages <= 0.0 || effective_cache_size <= 0.0)
return random_page_cost;
relsize = relpages / effective_cache_size;
if (relsize >= 1.0)
return random_page_cost * (1.0 - 0.5 / relsize);
else
return 1.0 + (random_page_cost * 0.5 - 1.0) * relsize * relsize;
}
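As a quick numeric check of the curve above (not part of the commit; the parameter values are assumed for illustration only):
/*
 * With random_page_cost = 4.0 and effective_cache_size = 1000 pages:
 *   cost_nonsequential_access(10)     = 1 + (4*0.5 - 1) * 0.01^2  ~= 1.0001
 *   cost_nonsequential_access(1000)   = 4 * (1 - 0.5/1)            = 2.0
 *   cost_nonsequential_access(100000) = 4 * (1 - 0.5/100)          = 3.98
 * i.e. a fully-cached relation costs about as much as a sequential fetch,
 * a relation the size of the cache costs half of random_page_cost, and a
 * huge relation approaches random_page_cost from below.
 */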
/*
* cost_index
@@ -126,25 +209,28 @@ cost_seqscan(RelOptInfo *baserel)
* tuples, but they won't reduce the number of tuples we have to fetch from
* the table, so they don't reduce the scan cost.
*/
Cost
cost_index(Query *root,
void
cost_index(Path *path, Query *root,
RelOptInfo *baserel,
IndexOptInfo *index,
List *indexQuals,
bool is_injoin)
{
Cost temp = 0;
Cost indexAccessCost;
Cost startup_cost = 0;
Cost run_cost = 0;
Cost cpu_per_tuple;
Cost indexStartupCost;
Cost indexTotalCost;
Selectivity indexSelectivity;
double reltuples;
double relpages;
double tuples_fetched;
double pages_fetched;
/* Should only be applied to base relations */
Assert(IsA(baserel, RelOptInfo) && IsA(index, IndexOptInfo));
Assert(length(baserel->relids) == 1);
if (!enable_indexscan && !is_injoin)
temp += disable_cost;
startup_cost += disable_cost;
/*
* Call index-access-method-specific code to estimate the processing
@@ -152,31 +238,21 @@ cost_index(Query *root,
* (ie, the fraction of main-table tuples we will have to retrieve).
*/
fmgr(index->amcostestimate, root, baserel, index, indexQuals,
&indexAccessCost, &indexSelectivity);
&indexStartupCost, &indexTotalCost, &indexSelectivity);
/* all costs for touching index itself included here */
temp += indexAccessCost;
startup_cost += indexStartupCost;
run_cost += indexTotalCost - indexStartupCost;
/*--------------------
* Estimate number of main-table tuples and pages touched.
/*
* Estimate number of main-table tuples and pages fetched.
*
* Worst case is that each tuple the index tells us to fetch comes
* from a different base-rel page, in which case the I/O cost would be
* 'reltuples' pages. In practice we can expect the number of page
* fetches to be reduced by the buffer cache, because more than one
* tuple can be retrieved per page fetched. Currently, we estimate
* the number of pages to be retrieved as
* MIN(reltuples, relpages)
* This amounts to assuming that the buffer cache is perfectly efficient
* and never ends up reading the same page twice within one scan, which
* of course is too optimistic. On the other hand, we are assuming that
* the target tuples are perfectly uniformly distributed across the
* relation's pages, which is too pessimistic --- any nonuniformity of
* distribution will reduce the number of pages we have to fetch.
* So, we guess-and-hope that these sources of error will more or less
* balance out.
*
* XXX need to add a penalty for nonsequential page fetches.
* If the number of tuples is much smaller than the number of pages in
* the relation, each tuple will cost a separate nonsequential fetch.
* If it is comparable or larger, then probably we will be able to avoid
* some fetches. We use a growth rate of log(#tuples/#pages + 1) ---
* probably totally bogus, but intuitively it gives the right shape of
* curve at least.
*
* XXX if the relation has recently been "clustered" using this index,
* then in fact the target tuples will be highly nonuniformly distributed,
@@ -184,54 +260,77 @@ cost_index(Query *root,
* have no way to know whether the relation has been clustered, nor how
* much it's been modified since the last clustering, so we ignore this
* effect. Would be nice to do better someday.
*--------------------
*/
reltuples = indexSelectivity * baserel->tuples;
tuples_fetched = indexSelectivity * baserel->tuples;
relpages = reltuples;
if (baserel->pages > 0 && baserel->pages < relpages)
relpages = baserel->pages;
if (tuples_fetched > 0 && baserel->pages > 0)
pages_fetched = baserel->pages *
log(tuples_fetched / baserel->pages + 1.0);
else
pages_fetched = tuples_fetched;
/*
* Now estimate one nonsequential access per page fetched,
* plus appropriate CPU costs per tuple.
*/
/* disk costs for main table */
temp += relpages;
run_cost += pages_fetched * cost_nonsequential_access(baserel->pages);
/* CPU costs for heap tuples */
temp += cpu_page_weight * reltuples;
/* CPU costs */
cpu_per_tuple = cpu_tuple_cost + baserel->baserestrictcost;
/*
* Assume that the indexquals will be removed from the list of
* restriction clauses that we actually have to evaluate as qpquals.
* This is not completely right, but it's close.
* For a lossy index, however, we will have to recheck all the quals.
*/
if (! index->lossy)
cpu_per_tuple -= cost_qual_eval(indexQuals);
Assert(temp >= 0);
return temp;
run_cost += cpu_per_tuple * tuples_fetched;
path->startup_cost = startup_cost;
path->total_cost = startup_cost + run_cost;
}
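A brief worked example of the pages_fetched estimate used above (not part of the commit; the numbers are illustrative only):
/*
 * With baserel->pages = 1000 (natural log throughout):
 *   tuples_fetched =   100:  pages_fetched = 1000 * ln(1.1)  ~=   95
 *   tuples_fetched =  1000:  pages_fetched = 1000 * ln(2.0)  ~=  693
 *   tuples_fetched = 10000:  pages_fetched = 1000 * ln(11.0) ~= 2398
 * So sparse fetches cost about one nonsequential access per tuple, denser
 * fetches get some credit for buffer-cache hits, and very dense fetches can
 * exceed the relation size, reflecting repeat visits to the same pages.
 */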
/*
* cost_tidscan
* Determines and returns the cost of scanning a relation using tid-s.
*
* disk = number of tids
* cpu = CPU-PAGE-WEIGHT * number_of_tids
*/
Cost
cost_tidscan(RelOptInfo *baserel, List *tideval)
void
cost_tidscan(Path *path, RelOptInfo *baserel, List *tideval)
{
Cost temp = 0;
Cost startup_cost = 0;
Cost run_cost = 0;
Cost cpu_per_tuple;
int ntuples = length(tideval);
if (!enable_tidscan)
temp += disable_cost;
startup_cost += disable_cost;
temp += (1.0 + cpu_page_weight) * length(tideval);
/* disk costs --- assume each tuple on a different page */
run_cost += random_page_cost * ntuples;
return temp;
/* CPU costs */
cpu_per_tuple = cpu_tuple_cost + baserel->baserestrictcost;
run_cost += cpu_per_tuple * ntuples;
path->startup_cost = startup_cost;
path->total_cost = startup_cost + run_cost;
}
/*
* cost_sort
* Determines and returns the cost of sorting a relation.
*
* The cost of supplying the input data is NOT included; the caller should
* add that cost to both startup and total costs returned from this routine!
*
* If the total volume of data to sort is less than SortMem, we will do
* an in-memory sort, which requires no I/O and about t*log2(t) tuple
* comparisons for t tuples. We use cpu_index_page_weight as the cost
* of a tuple comparison (is this reasonable, or do we need another
* basic parameter?).
* comparisons for t tuples.
*
* If the total volume exceeds SortMem, we switch to a tape-style merge
* algorithm. There will still be about t*log2(t) tuple comparisons in
@@ -240,8 +339,14 @@ cost_tidscan(RelOptInfo *baserel, List *tideval)
* number of initial runs formed (log6 because tuplesort.c uses six-tape
* merging). Since the average initial run should be about twice SortMem,
* we have
* disk = 2 * p * ceil(log6(p / (2*SortMem)))
* cpu = CPU-INDEX-PAGE-WEIGHT * t * log2(t)
* disk traffic = 2 * relsize * ceil(log6(p / (2*SortMem)))
* cpu = comparison_cost * t * log2(t)
*
* The disk traffic is assumed to be half sequential and half random
* accesses (XXX can't we refine that guess?)
*
* We charge two operator evals per tuple comparison, which should be in
* the right ballpark in most cases.
*
* 'pathkeys' is a list of sort keys
* 'tuples' is the number of tuples in the relation
@@ -252,15 +357,16 @@ cost_tidscan(RelOptInfo *baserel, List *tideval)
* currently do anything with pathkeys anyway, that doesn't matter...
* but if it ever does, it should react gracefully to lack of key data.
*/
Cost
cost_sort(List *pathkeys, double tuples, int width)
void
cost_sort(Path *path, List *pathkeys, double tuples, int width)
{
Cost temp = 0;
Cost startup_cost = 0;
Cost run_cost = 0;
double nbytes = relation_byte_size(tuples, width);
long sortmembytes = SortMem * 1024L;
if (!enable_sort)
temp += disable_cost;
startup_cost += disable_cost;
/*
* We want to be sure the cost of a sort is never estimated as zero,
@@ -270,43 +376,40 @@ cost_sort(List *pathkeys, double tuples, int width)
if (tuples < 2.0)
tuples = 2.0;
temp += cpu_index_page_weight * tuples * base_log(tuples, 2.0);
/*
* CPU costs
*
* Assume about two operator evals per tuple comparison
* and N log2 N comparisons
*/
startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples);
/* disk costs */
if (nbytes > sortmembytes)
{
double npages = ceil(nbytes / BLCKSZ);
double nruns = nbytes / (sortmembytes * 2);
double log_runs = ceil(base_log(nruns, 6.0));
double log_runs = ceil(LOG6(nruns));
double npageaccesses;
if (log_runs < 1.0)
log_runs = 1.0;
temp += 2 * npages * log_runs;
npageaccesses = 2.0 * npages * log_runs;
/* Assume half are sequential (cost 1), half are not */
startup_cost += npageaccesses *
(1.0 + cost_nonsequential_access(npages)) * 0.5;
}
Assert(temp > 0);
return temp;
/*
* Note: should we bother to assign a nonzero run_cost to reflect the
* overhead of extracting tuples from the sort result? Probably not
* worth worrying about.
*/
path->startup_cost = startup_cost;
path->total_cost = startup_cost + run_cost;
}
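A worked example of the external-sort disk term above (not part of the commit; SortMem = 1024, i.e. 1 MB, and BLCKSZ = 8192 are assumed here purely for illustration):
/*
 * Sorting roughly 100 MB of data (nbytes = 104857600):
 *   npages        = ceil(nbytes / BLCKSZ)            = 12800
 *   nruns         = nbytes / (2 * SortMem * 1024)    = 50
 *   log_runs      = ceil(log6(50))                   = 3
 *   npageaccesses = 2 * npages * log_runs            = 76800
 * and the disk component of startup_cost is
 *   76800 * (1.0 + cost_nonsequential_access(12800)) * 0.5
 * i.e. half the page accesses are charged as sequential, half as random.
 */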
/*
* cost_result
* Determines and returns the cost of writing a relation of 'tuples'
* tuples of 'width' bytes out to a result relation.
*/
#ifdef NOT_USED
Cost
cost_result(double tuples, int width)
{
Cost temp = 0;
temp += page_size(tuples, width);
temp += cpu_page_weight * tuples;
Assert(temp >= 0);
return temp;
}
#endif
/*
* cost_nestloop
* Determines and returns the cost of joining two relations using the
@@ -314,23 +417,45 @@ cost_result(double tuples, int width)
*
* 'outer_path' is the path for the outer relation
* 'inner_path' is the path for the inner relation
* 'restrictlist' are the RestrictInfo nodes to be applied at the join
* 'is_indexjoin' is true if we are using an indexscan for the inner relation
* (not currently needed here; the indexscan adjusts its cost...)
*/
Cost
cost_nestloop(Path *outer_path,
void
cost_nestloop(Path *path,
Path *outer_path,
Path *inner_path,
List *restrictlist,
bool is_indexjoin)
{
Cost temp = 0;
Cost startup_cost = 0;
Cost run_cost = 0;
Cost cpu_per_tuple;
double ntuples;
if (!enable_nestloop)
temp += disable_cost;
startup_cost += disable_cost;
temp += outer_path->path_cost;
temp += outer_path->parent->rows * inner_path->path_cost;
/* cost of source data */
/*
* NOTE: we assume that the inner path's startup_cost is paid once, not
* over again on each restart. This is certainly correct if the inner
* path is materialized. Are there any cases where it is wrong?
*/
startup_cost += outer_path->startup_cost + inner_path->startup_cost;
run_cost += outer_path->total_cost - outer_path->startup_cost;
run_cost += outer_path->parent->rows *
(inner_path->total_cost - inner_path->startup_cost);
Assert(temp >= 0);
return temp;
/* number of tuples processed (not number emitted!) */
ntuples = outer_path->parent->rows * inner_path->parent->rows;
/* CPU costs */
cpu_per_tuple = cpu_tuple_cost + cost_qual_eval(restrictlist);
run_cost += cpu_per_tuple * ntuples;
path->startup_cost = startup_cost;
path->total_cost = startup_cost + run_cost;
}
/*
@@ -340,33 +465,66 @@ cost_nestloop(Path *outer_path,
*
* 'outer_path' is the path for the outer relation
* 'inner_path' is the path for the inner relation
* 'restrictlist' are the RestrictInfo nodes to be applied at the join
* 'outersortkeys' and 'innersortkeys' are lists of the keys to be used
* to sort the outer and inner relations, or NIL if no explicit
* sort is needed because the source path is already ordered
*/
Cost
cost_mergejoin(Path *outer_path,
void
cost_mergejoin(Path *path,
Path *outer_path,
Path *inner_path,
List *restrictlist,
List *outersortkeys,
List *innersortkeys)
{
Cost temp = 0;
Cost startup_cost = 0;
Cost run_cost = 0;
Cost cpu_per_tuple;
double ntuples;
Path sort_path; /* dummy for result of cost_sort */
if (!enable_mergejoin)
temp += disable_cost;
startup_cost += disable_cost;
/* cost of source data */
temp += outer_path->path_cost + inner_path->path_cost;
/*
* Note we are assuming that each source tuple is fetched just once,
* which is not right in the presence of equal keys. If we had a way of
* estimating the proportion of equal keys, we could apply a correction
* factor...
*/
if (outersortkeys) /* do we need to sort outer? */
{
startup_cost += outer_path->total_cost;
cost_sort(&sort_path,
outersortkeys,
outer_path->parent->rows,
outer_path->parent->width);
startup_cost += sort_path.startup_cost;
run_cost += sort_path.total_cost - sort_path.startup_cost;
}
else
{
startup_cost += outer_path->startup_cost;
run_cost += outer_path->total_cost - outer_path->startup_cost;
}
if (outersortkeys) /* do we need to sort? */
temp += cost_sort(outersortkeys,
outer_path->parent->rows,
outer_path->parent->width);
if (innersortkeys) /* do we need to sort? */
temp += cost_sort(innersortkeys,
inner_path->parent->rows,
inner_path->parent->width);
if (innersortkeys) /* do we need to sort inner? */
{
startup_cost += inner_path->total_cost;
cost_sort(&sort_path,
innersortkeys,
inner_path->parent->rows,
inner_path->parent->width);
startup_cost += sort_path.startup_cost;
run_cost += sort_path.total_cost - sort_path.startup_cost;
}
else
{
startup_cost += inner_path->startup_cost;
run_cost += inner_path->total_cost - inner_path->startup_cost;
}
/*
* Estimate the number of tuples to be processed in the mergejoin itself
@@ -374,11 +532,14 @@ cost_mergejoin(Path *outer_path,
* underestimate if there are many equal-keyed tuples in either relation,
* but we have no good way of estimating that...
*/
temp += cpu_page_weight * (outer_path->parent->rows +
inner_path->parent->rows);
ntuples = outer_path->parent->rows + inner_path->parent->rows;
Assert(temp >= 0);
return temp;
/* CPU costs */
cpu_per_tuple = cpu_tuple_cost + cost_qual_eval(restrictlist);
run_cost += cpu_per_tuple * ntuples;
path->startup_cost = startup_cost;
path->total_cost = startup_cost + run_cost;
}
/*
@@ -388,15 +549,21 @@ cost_mergejoin(Path *outer_path,
*
* 'outer_path' is the path for the outer relation
* 'inner_path' is the path for the inner relation
* 'restrictlist' are the RestrictInfo nodes to be applied at the join
* 'innerdisbursion' is an estimate of the disbursion statistic
* for the inner hash key.
*/
Cost
cost_hashjoin(Path *outer_path,
void
cost_hashjoin(Path *path,
Path *outer_path,
Path *inner_path,
List *restrictlist,
Selectivity innerdisbursion)
{
Cost temp = 0;
Cost startup_cost = 0;
Cost run_cost = 0;
Cost cpu_per_tuple;
double ntuples;
double outerbytes = relation_byte_size(outer_path->parent->rows,
outer_path->parent->width);
double innerbytes = relation_byte_size(inner_path->parent->rows,
@@ -404,48 +571,169 @@ cost_hashjoin(Path *outer_path,
long hashtablebytes = SortMem * 1024L;
if (!enable_hashjoin)
temp += disable_cost;
startup_cost += disable_cost;
/* cost of source data */
temp += outer_path->path_cost + inner_path->path_cost;
startup_cost += outer_path->startup_cost;
run_cost += outer_path->total_cost - outer_path->startup_cost;
startup_cost += inner_path->total_cost;
/* cost of computing hash function: must do it once per tuple */
temp += cpu_page_weight * (outer_path->parent->rows +
inner_path->parent->rows);
/* cost of computing hash function: must do it once per input tuple */
startup_cost += cpu_operator_cost * inner_path->parent->rows;
run_cost += cpu_operator_cost * outer_path->parent->rows;
/* the number of tuple comparisons needed is the number of outer
* tuples times the typical hash bucket size, which we estimate
* conservatively as the inner disbursion times the inner tuple
* count. The cost per comparison is set at cpu_index_page_weight;
* is that reasonable, or do we need another basic parameter?
* conservatively as the inner disbursion times the inner tuple count.
*/
temp += cpu_index_page_weight * outer_path->parent->rows *
run_cost += cpu_operator_cost * outer_path->parent->rows *
(inner_path->parent->rows * innerdisbursion);
/*
* Estimate the number of tuples that get through the hashing filter
* as one per tuple in the two source relations. This could be a drastic
* underestimate if there are many equal-keyed tuples in either relation,
* but we have no good way of estimating that...
*/
ntuples = outer_path->parent->rows + inner_path->parent->rows;
/* CPU costs */
cpu_per_tuple = cpu_tuple_cost + cost_qual_eval(restrictlist);
run_cost += cpu_per_tuple * ntuples;
/*
* if inner relation is too big then we will need to "batch" the join,
* which implies writing and reading most of the tuples to disk an
* extra time. Charge one cost unit per page of I/O.
* extra time. Charge one cost unit per page of I/O (correct since
* it should be nice and sequential...). Writing the inner rel counts
* as startup cost, all the rest as run cost.
*/
if (innerbytes > hashtablebytes)
temp += 2 * (page_size(outer_path->parent->rows,
outer_path->parent->width) +
page_size(inner_path->parent->rows,
inner_path->parent->width));
{
double outerpages = page_size(outer_path->parent->rows,
outer_path->parent->width);
double innerpages = page_size(inner_path->parent->rows,
inner_path->parent->width);
startup_cost += innerpages;
run_cost += innerpages + 2 * outerpages;
}
/*
* Bias against putting larger relation on inside. We don't want
* an absolute prohibition, though, since larger relation might have
* better disbursion --- and we can't trust the size estimates
* unreservedly, anyway.
* unreservedly, anyway. Instead, inflate the startup cost by
* the square root of the size ratio. (Why square root? No real good
* reason, but it seems reasonable...)
*/
if (innerbytes > outerbytes)
temp *= 1.1; /* is this an OK fudge factor? */
if (innerbytes > outerbytes && outerbytes > 0)
{
startup_cost *= sqrt(innerbytes / outerbytes);
}
Assert(temp >= 0);
return temp;
path->startup_cost = startup_cost;
path->total_cost = startup_cost + run_cost;
}
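A small note on the size-ratio bias above (not part of the commit; the ratio is an illustrative assumption):
/*
 * Example of the startup-cost bias: if the inner relation is estimated at
 * four times the size of the outer (innerbytes / outerbytes = 4), the
 * startup cost is doubled (sqrt(4) = 2); at equal sizes there is no
 * penalty, so the bias grows smoothly rather than acting as a hard rule.
 */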
/*
* cost_qual_eval
* Estimate the CPU cost of evaluating a WHERE clause (once).
* The input can be either an implicitly-ANDed list of boolean
* expressions, or a list of RestrictInfo nodes.
*/
Cost
cost_qual_eval(List *quals)
{
Cost total = 0;
cost_qual_eval_walker((Node *) quals, &total);
return total;
}
static bool
cost_qual_eval_walker(Node *node, Cost *total)
{
if (node == NULL)
return false;
/*
* Our basic strategy is to charge one cpu_operator_cost for each
* operator or function node in the given tree. Vars and Consts
* are charged zero, and so are boolean operators (AND, OR, NOT).
* Simplistic, but a lot better than no model at all.
*
* Should we try to account for the possibility of short-circuit
* evaluation of AND/OR?
*/
if (IsA(node, Expr))
{
Expr *expr = (Expr *) node;
switch (expr->opType)
{
case OP_EXPR:
case FUNC_EXPR:
*total += cpu_operator_cost;
break;
case OR_EXPR:
case AND_EXPR:
case NOT_EXPR:
break;
case SUBPLAN_EXPR:
/*
* A subplan node in an expression indicates that the subplan
* will be executed on each evaluation, so charge accordingly.
* (We assume that sub-selects that can be executed as
* InitPlans have already been removed from the expression.)
*
* NOTE: this logic should agree with make_subplan in
* subselect.c.
*/
{
SubPlan *subplan = (SubPlan *) expr->oper;
Plan *plan = subplan->plan;
Cost subcost;
if (subplan->sublink->subLinkType == EXISTS_SUBLINK)
{
/* we only need to fetch 1 tuple */
subcost = plan->startup_cost +
(plan->total_cost - plan->startup_cost) / plan->plan_rows;
}
else if (subplan->sublink->subLinkType == EXPR_SUBLINK)
{
/* assume we need all tuples */
subcost = plan->total_cost;
}
else
{
/* assume we need 50% of the tuples */
subcost = plan->startup_cost +
0.50 * (plan->total_cost - plan->startup_cost);
}
*total += subcost;
}
break;
}
/* fall through to examine args of Expr node */
}
/*
* expression_tree_walker doesn't know what to do with RestrictInfo nodes,
* but we just want to recurse through them.
*/
if (IsA(node, RestrictInfo))
{
RestrictInfo *restrictinfo = (RestrictInfo *) node;
return cost_qual_eval_walker((Node *) restrictinfo->clause, total);
}
/* Otherwise, recurse. */
return expression_tree_walker(node, cost_qual_eval_walker,
(void *) total);
}
/*
* set_baserel_size_estimates
* Set the size estimates for the given base relation.
@@ -457,6 +745,7 @@ cost_hashjoin(Path *outer_path,
* rows: the estimated number of output tuples (after applying
* restriction clauses).
* width: the estimated average output tuple width in bytes.
* baserestrictcost: estimated cost of evaluating baserestrictinfo clauses.
*/
void
set_baserel_size_estimates(Query *root, RelOptInfo *rel)
@@ -468,7 +757,14 @@ set_baserel_size_estimates(Query *root, RelOptInfo *rel)
restrictlist_selectivity(root,
rel->baserestrictinfo,
lfirsti(rel->relids));
Assert(rel->rows >= 0);
/*
* Force estimate to be at least one row, to make explain output look
* better and to avoid possible divide-by-zero when interpolating cost.
*/
if (rel->rows < 1.0)
rel->rows = 1.0;
rel->baserestrictcost = cost_qual_eval(rel->baserestrictinfo);
set_rel_width(root, rel);
}
@@ -513,7 +809,12 @@ set_joinrel_size_estimates(Query *root, RelOptInfo *rel,
restrictlist,
0);
Assert(temp >= 0);
/*
* Force estimate to be at least one row, to make explain output look
* better and to avoid possible divide-by-zero when interpolating cost.
*/
if (temp < 1.0)
temp = 1.0;
rel->rows = temp;
/*
@@ -582,9 +883,3 @@ page_size(double tuples, int width)
{
return ceil(relation_byte_size(tuples, width) / BLCKSZ);
}
static double
base_log(double x, double b)
{
return log(x) / log(b);
}

src/backend/optimizer/path/indxpath.c

@@ -9,7 +9,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/optimizer/path/indxpath.c,v 1.79 2000/02/05 18:26:09 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/optimizer/path/indxpath.c,v 1.80 2000/02/15 20:49:16 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -83,7 +83,8 @@ static List *index_innerjoin(Query *root, RelOptInfo *rel, IndexOptInfo *index,
static bool useful_for_mergejoin(RelOptInfo *rel, IndexOptInfo *index,
List *joininfo_list);
static bool useful_for_ordering(Query *root, RelOptInfo *rel,
IndexOptInfo *index);
IndexOptInfo *index,
ScanDirection scandir);
static bool match_index_to_operand(int indexkey, Var *operand,
RelOptInfo *rel, IndexOptInfo *index);
static bool function_index_operand(Expr *funcOpnd, RelOptInfo *rel,
@@ -106,6 +107,8 @@ static bool string_lessthan(const char * str1, const char * str2,
/*
* create_index_paths()
* Generate all interesting index paths for the given relation.
* Candidate paths are added to the rel's pathlist (using add_path).
* Additional IndexPath nodes may also be added to rel's innerjoin list.
*
* To be considered for an index scan, an index must match one or more
* restriction clauses or join clauses from the query's qual condition,
@@ -120,29 +123,26 @@ static bool string_lessthan(const char * str1, const char * str2,
* in its join clauses. In that context, values for the other rels'
* attributes are available and fixed during any one scan of the indexpath.
*
* This routine's return value is a list of plain IndexPaths for each
* index the routine deems potentially interesting for the current query
* An IndexPath is generated and submitted to add_path() for each index
* this routine deems potentially interesting for the current query
* (at most one IndexPath per index on the given relation). An innerjoin
* path is also generated for each interesting combination of outer join
* relations. The innerjoin paths are *not* in the return list, but are
* appended to the "innerjoin" list of the relation itself.
* relations. The innerjoin paths are *not* passed to add_path(), but are
* appended to the "innerjoin" list of the relation for later consideration
* in nested-loop joins.
*
* 'rel' is the relation for which we want to generate index paths
* 'indices' is a list of available indexes for 'rel'
* 'restrictinfo_list' is a list of restrictinfo nodes for 'rel'
* 'joininfo_list' is a list of joininfo nodes for 'rel'
*
* Returns a list of IndexPath access path descriptors. Additional
* IndexPath nodes may also be added to the rel->innerjoin list.
*/
List *
void
create_index_paths(Query *root,
RelOptInfo *rel,
List *indices,
List *restrictinfo_list,
List *joininfo_list)
{
List *retval = NIL;
List *ilist;
foreach(ilist, indices)
@@ -189,9 +189,9 @@ create_index_paths(Query *root,
restrictinfo_list);
if (restrictclauses != NIL)
retval = lappend(retval,
create_index_path(root, rel, index,
restrictclauses));
add_path(rel, (Path *) create_index_path(root, rel, index,
restrictclauses,
NoMovementScanDirection));
/*
* 3. If this index can be used for a mergejoin, then create an
@@ -205,10 +205,22 @@ create_index_paths(Query *root,
if (restrictclauses == NIL)
{
if (useful_for_mergejoin(rel, index, joininfo_list) ||
useful_for_ordering(root, rel, index))
retval = lappend(retval,
create_index_path(root, rel, index, NIL));
useful_for_ordering(root, rel, index, ForwardScanDirection))
add_path(rel, (Path *)
create_index_path(root, rel, index,
NIL,
ForwardScanDirection));
}
/*
* Currently, backwards scan is never considered except for the case
* of matching a query result ordering. Possibly should consider
* it in other places?
*/
if (useful_for_ordering(root, rel, index, BackwardScanDirection))
add_path(rel, (Path *)
create_index_path(root, rel, index,
NIL,
BackwardScanDirection));
/*
* 4. Create an innerjoin index path for each combination of
@@ -231,8 +243,6 @@ create_index_paths(Query *root,
joinouterrelids));
}
}
return retval;
}
@@ -892,39 +902,26 @@ useful_for_mergejoin(RelOptInfo *rel,
* Determine whether the given index can produce an ordering matching
* the order that is wanted for the query result.
*
* We check to see whether either forward or backward scan direction can
* match the specified pathkeys.
*
* 'rel' is the relation for which 'index' is defined
* 'scandir' is the contemplated scan direction
*/
static bool
useful_for_ordering(Query *root,
RelOptInfo *rel,
IndexOptInfo *index)
IndexOptInfo *index,
ScanDirection scandir)
{
List *index_pathkeys;
if (root->query_pathkeys == NIL)
return false; /* no special ordering requested */
index_pathkeys = build_index_pathkeys(root, rel, index);
index_pathkeys = build_index_pathkeys(root, rel, index, scandir);
if (index_pathkeys == NIL)
return false; /* unordered index */
if (pathkeys_contained_in(root->query_pathkeys, index_pathkeys))
return true;
/* caution: commute_pathkeys destructively modifies its argument;
* safe because we just built the index_pathkeys for local use here.
*/
if (commute_pathkeys(index_pathkeys))
{
if (pathkeys_contained_in(root->query_pathkeys, index_pathkeys))
return true; /* useful as a reverse-order path */
}
return false;
return pathkeys_contained_in(root->query_pathkeys, index_pathkeys);
}
/****************************************************************************
@@ -1433,7 +1430,12 @@ index_innerjoin(Query *root, RelOptInfo *rel, IndexOptInfo *index,
pathnode->path.pathtype = T_IndexScan;
pathnode->path.parent = rel;
pathnode->path.pathkeys = build_index_pathkeys(root, rel, index);
/*
* There's no point in marking the path with any pathkeys, since
* it will only ever be used as the inner path of a nestloop,
* and so its ordering does not matter.
*/
pathnode->path.pathkeys = NIL;
indexquals = get_actual_clauses(clausegroup);
/* expand special operators to indexquals the executor can handle */
@@ -1446,11 +1448,13 @@ index_innerjoin(Query *root, RelOptInfo *rel, IndexOptInfo *index,
pathnode->indexid = lconsi(index->indexoid, NIL);
pathnode->indexqual = lcons(indexquals, NIL);
/* We don't actually care what order the index scans in ... */
pathnode->indexscandir = NoMovementScanDirection;
/* joinrelids saves the rels needed on the outer side of the join */
pathnode->joinrelids = lfirst(outerrelids_list);
pathnode->path.path_cost = cost_index(root, rel, index, indexquals,
true);
cost_index(&pathnode->path, root, rel, index, indexquals, true);
path_list = lappend(path_list, pathnode);
outerrelids_list = lnext(outerrelids_list);

src/backend/optimizer/path/joinpath.c

@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/optimizer/path/joinpath.c,v 1.51 2000/02/07 04:40:59 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/optimizer/path/joinpath.c,v 1.52 2000/02/15 20:49:17 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -27,24 +27,21 @@
#include "parser/parsetree.h"
#include "utils/lsyscache.h"
static void sort_inner_and_outer(Query *root, RelOptInfo *joinrel,
RelOptInfo *outerrel, RelOptInfo *innerrel,
List *restrictlist, List *mergeclause_list);
static void match_unsorted_outer(Query *root, RelOptInfo *joinrel,
RelOptInfo *outerrel, RelOptInfo *innerrel,
List *restrictlist, List *mergeclause_list);
#ifdef NOT_USED
static void match_unsorted_inner(Query *root, RelOptInfo *joinrel,
RelOptInfo *outerrel, RelOptInfo *innerrel,
List *restrictlist, List *mergeclause_list);
#endif
static void hash_inner_and_outer(Query *root, RelOptInfo *joinrel,
RelOptInfo *outerrel, RelOptInfo *innerrel,
List *restrictlist);
static Path *best_innerjoin(List *join_paths, List *outer_relid);
static List *sort_inner_and_outer(RelOptInfo *joinrel,
RelOptInfo *outerrel,
RelOptInfo *innerrel,
List *restrictlist,
List *mergeclause_list);
static List *match_unsorted_outer(RelOptInfo *joinrel, RelOptInfo *outerrel,
RelOptInfo *innerrel, List *restrictlist,
List *outerpath_list, Path *cheapest_inner,
Path *best_innerjoin,
List *mergeclause_list);
static List *match_unsorted_inner(RelOptInfo *joinrel, RelOptInfo *outerrel,
RelOptInfo *innerrel, List *restrictlist,
List *innerpath_list,
List *mergeclause_list);
static List *hash_inner_and_outer(Query *root, RelOptInfo *joinrel,
RelOptInfo *outerrel, RelOptInfo *innerrel,
List *restrictlist);
static Selectivity estimate_disbursion(Query *root, Var *var);
static List *select_mergejoin_clauses(RelOptInfo *joinrel,
RelOptInfo *outerrel,
@@ -70,14 +67,8 @@ add_paths_to_joinrel(Query *root,
RelOptInfo *innerrel,
List *restrictlist)
{
Path *bestinnerjoin;
List *mergeclause_list = NIL;
/*
* Get the best inner join for match_unsorted_outer().
*/
bestinnerjoin = best_innerjoin(innerrel->innerjoin, outerrel->relids);
/*
* Find potential mergejoin clauses.
*/
@@ -91,84 +82,41 @@ add_paths_to_joinrel(Query *root,
* 1. Consider mergejoin paths where both relations must be
* explicitly sorted.
*/
add_pathlist(joinrel, sort_inner_and_outer(joinrel,
outerrel,
innerrel,
restrictlist,
mergeclause_list));
sort_inner_and_outer(root, joinrel, outerrel, innerrel,
restrictlist, mergeclause_list);
/*
* 2. Consider paths where the outer relation need not be
* explicitly sorted. This includes both nestloops and
* mergejoins where the outer path is already ordered.
*/
add_pathlist(joinrel, match_unsorted_outer(joinrel,
outerrel,
innerrel,
restrictlist,
outerrel->pathlist,
innerrel->cheapestpath,
bestinnerjoin,
mergeclause_list));
match_unsorted_outer(root, joinrel, outerrel, innerrel,
restrictlist, mergeclause_list);
#ifdef NOT_USED
/*
* 3. Consider paths where the inner relation need not be
* explicitly sorted. This includes mergejoins only
* (nestloops were already built in match_unsorted_outer).
*
* Diked out as redundant 2/13/2000 -- tgl. There isn't any
* really significant difference between the inner and outer
* side of a mergejoin, so match_unsorted_inner creates no paths
* that aren't equivalent to those made by match_unsorted_outer
* when add_paths_to_joinrel() is invoked with the two rels given
* in the other order.
*/
add_pathlist(joinrel, match_unsorted_inner(joinrel,
outerrel,
innerrel,
restrictlist,
innerrel->pathlist,
mergeclause_list));
match_unsorted_inner(root, joinrel, outerrel, innerrel,
restrictlist, mergeclause_list);
#endif
/*
* 4. Consider paths where both outer and inner relations must be
* hashed before being joined.
*/
if (enable_hashjoin)
add_pathlist(joinrel, hash_inner_and_outer(root,
joinrel,
outerrel,
innerrel,
restrictlist));
}
/*
* best_innerjoin
* Find the cheapest index path that has already been identified by
* indexable_joinclauses() as being a possible inner path for the given
* outer relation(s) in a nestloop join.
*
* 'join_paths' is a list of potential inner indexscan join paths
* 'outer_relids' is the relid list of the outer join relation
*
* Returns the pathnode of the best path, or NULL if there's no
* usable path.
*/
static Path *
best_innerjoin(List *join_paths, Relids outer_relids)
{
Path *cheapest = (Path *) NULL;
List *join_path;
foreach(join_path, join_paths)
{
Path *path = (Path *) lfirst(join_path);
Assert(IsA(path, IndexPath));
/* path->joinrelids is the set of base rels that must be part of
* outer_relids in order to use this inner path, because those
* rels are used in the index join quals of this inner path.
*/
if (is_subseti(((IndexPath *) path)->joinrelids, outer_relids) &&
(cheapest == NULL ||
path_is_cheaper(path, cheapest)))
cheapest = path;
}
return cheapest;
hash_inner_and_outer(root, joinrel, outerrel, innerrel,
restrictlist);
}
/*
@@ -183,17 +131,15 @@ best_innerjoin(List *join_paths, Relids outer_relids)
* clauses that apply to this join
* 'mergeclause_list' is a list of RestrictInfo nodes for available
* mergejoin clauses in this join
*
* Returns a list of mergejoin paths.
*/
static List *
sort_inner_and_outer(RelOptInfo *joinrel,
static void
sort_inner_and_outer(Query *root,
RelOptInfo *joinrel,
RelOptInfo *outerrel,
RelOptInfo *innerrel,
List *restrictlist,
List *mergeclause_list)
{
List *path_list = NIL;
List *i;
/*
@@ -223,7 +169,6 @@ sort_inner_and_outer(RelOptInfo *joinrel,
List *outerkeys;
List *innerkeys;
List *merge_pathkeys;
MergePath *path_node;
/* Make a mergeclause list with this guy first. */
curclause_list = lcons(restrictinfo,
@@ -231,31 +176,37 @@ sort_inner_and_outer(RelOptInfo *joinrel,
listCopy(mergeclause_list)));
/* Build sort pathkeys for both sides.
*
* Note: it's possible that the cheapest path will already be
* sorted properly --- create_mergejoin_path will detect that case
* and suppress an explicit sort step.
* Note: it's possible that the cheapest paths will already be
* sorted properly. create_mergejoin_path will detect that case
* and suppress an explicit sort step, so we needn't do so here.
*/
outerkeys = make_pathkeys_for_mergeclauses(curclause_list,
outerkeys = make_pathkeys_for_mergeclauses(root,
curclause_list,
outerrel->targetlist);
innerkeys = make_pathkeys_for_mergeclauses(curclause_list,
innerkeys = make_pathkeys_for_mergeclauses(root,
curclause_list,
innerrel->targetlist);
/* Build pathkeys representing output sort order. */
merge_pathkeys = build_join_pathkeys(outerkeys,
joinrel->targetlist,
curclause_list);
/* And now we can make the path. */
path_node = create_mergejoin_path(joinrel,
outerrel->cheapestpath,
innerrel->cheapestpath,
restrictlist,
merge_pathkeys,
get_actual_clauses(curclause_list),
outerkeys,
innerkeys);
root->equi_key_list);
path_list = lappend(path_list, path_node);
/*
* And now we can make the path. We only consider the cheapest-
* total-cost input paths, since we are assuming here that a sort
* is required. We will consider cheapest-startup-cost input paths
* later, and only if they don't need a sort.
*/
add_path(joinrel, (Path *)
create_mergejoin_path(joinrel,
outerrel->cheapest_total_path,
innerrel->cheapest_total_path,
restrictlist,
merge_pathkeys,
get_actual_clauses(curclause_list),
outerkeys,
innerkeys));
}
return path_list;
}
/*
@@ -266,74 +217,56 @@ sort_inner_and_outer(RelOptInfo *joinrel,
* only outer paths that are already ordered well enough for merging).
*
* We always generate a nestloop path for each available outer path.
* If an indexscan inner path exists that is compatible with this outer rel
* and cheaper than the cheapest general-purpose inner path, then we use
* the indexscan inner path; else we use the cheapest general-purpose inner.
* In fact we may generate as many as three: one on the cheapest-total-cost
* inner path, one on the cheapest-startup-cost inner path (if different),
* and one on the best inner-indexscan path (if any).
*
* We also consider mergejoins if mergejoin clauses are available. We have
* two ways to generate the inner path for a mergejoin: use the cheapest
* inner path (sorting it if it's not suitably ordered already), or using an
* inner path that is already suitably ordered for the merge. If the
* cheapest inner path is suitably ordered, then by definition it's the one
* to use. Otherwise, we look for ordered paths that are cheaper than the
* cheapest inner + sort costs. If we have several mergeclauses, it could be
* that there is no inner path (or only a very expensive one) for the full
* list of mergeclauses, but better paths exist if we truncate the
* mergeclause list (thereby discarding some sort key requirements). So, we
* consider truncations of the mergeclause list as well as the full list.
* In any case, we find the cheapest suitable path and generate a single
* output mergejoin path. (Since all the possible mergejoins will have
* identical output pathkeys, there is no need to keep any but the cheapest.)
* two ways to generate the inner path for a mergejoin: sort the cheapest
* inner path, or use an inner path that is already suitably ordered for the
* merge. If we have several mergeclauses, it could be that there is no inner
* path (or only a very expensive one) for the full list of mergeclauses, but
* better paths exist if we truncate the mergeclause list (thereby discarding
* some sort key requirements). So, we consider truncations of the
* mergeclause list as well as the full list. (Ideally we'd consider all
* subsets of the mergeclause list, but that seems way too expensive.)
*
* 'joinrel' is the join relation
* 'outerrel' is the outer join relation
* 'innerrel' is the inner join relation
* 'restrictlist' contains all of the RestrictInfo nodes for restriction
* clauses that apply to this join
* 'outerpath_list' is the list of possible outer paths
* 'cheapest_inner' is the cheapest inner path
* 'best_innerjoin' is the best inner index path (if any)
* 'mergeclause_list' is a list of RestrictInfo nodes for available
* mergejoin clauses in this join
*
* Returns a list of possible join path nodes.
*/
static List *
match_unsorted_outer(RelOptInfo *joinrel,
static void
match_unsorted_outer(Query *root,
RelOptInfo *joinrel,
RelOptInfo *outerrel,
RelOptInfo *innerrel,
List *restrictlist,
List *outerpath_list,
Path *cheapest_inner,
Path *best_innerjoin,
List *mergeclause_list)
{
List *path_list = NIL;
Path *nestinnerpath;
Path *bestinnerjoin;
List *i;
/*
* We only use the best innerjoin indexpath if it is cheaper
* than the cheapest general-purpose inner path.
* Get the best innerjoin indexpath (if any) for this outer rel.
* It's the same for all outer paths.
*/
if (best_innerjoin &&
path_is_cheaper(best_innerjoin, cheapest_inner))
nestinnerpath = best_innerjoin;
else
nestinnerpath = cheapest_inner;
bestinnerjoin = best_innerjoin(innerrel->innerjoin, outerrel->relids);
foreach(i, outerpath_list)
foreach(i, outerrel->pathlist)
{
Path *outerpath = (Path *) lfirst(i);
List *mergeclauses;
List *merge_pathkeys;
List *mergeclauses;
List *innersortkeys;
Path *mergeinnerpath;
int mergeclausecount;
List *trialsortkeys;
Path *cheapest_startup_inner;
Path *cheapest_total_inner;
int clausecnt;
/* Look for useful mergeclauses (if any) */
mergeclauses = find_mergeclauses_for_pathkeys(outerpath->pathkeys,
mergeclause_list);
/*
* The result will have this sort order (even if it is implemented
* as a nestloop, and even if some of the mergeclauses are implemented
@@ -341,91 +274,137 @@ match_unsorted_outer(RelOptInfo *joinrel,
*/
merge_pathkeys = build_join_pathkeys(outerpath->pathkeys,
joinrel->targetlist,
mergeclauses);
root->equi_key_list);
/* Always consider a nestloop join with this outer and best inner. */
path_list = lappend(path_list,
create_nestloop_path(joinrel,
outerpath,
nestinnerpath,
restrictlist,
merge_pathkeys));
/*
* Always consider a nestloop join with this outer and cheapest-
* total-cost inner. Consider nestloops using the cheapest-
* startup-cost inner as well, and the best innerjoin indexpath.
*/
add_path(joinrel, (Path *)
create_nestloop_path(joinrel,
outerpath,
innerrel->cheapest_total_path,
restrictlist,
merge_pathkeys));
if (innerrel->cheapest_startup_path != innerrel->cheapest_total_path)
add_path(joinrel, (Path *)
create_nestloop_path(joinrel,
outerpath,
innerrel->cheapest_startup_path,
restrictlist,
merge_pathkeys));
if (bestinnerjoin != NULL)
add_path(joinrel, (Path *)
create_nestloop_path(joinrel,
outerpath,
bestinnerjoin,
restrictlist,
merge_pathkeys));
/* Look for useful mergeclauses (if any) */
mergeclauses = find_mergeclauses_for_pathkeys(outerpath->pathkeys,
mergeclause_list);
/* Done with this outer path if no chance for a mergejoin */
if (mergeclauses == NIL)
continue;
/* Compute the required ordering of the inner path */
innersortkeys = make_pathkeys_for_mergeclauses(mergeclauses,
innersortkeys = make_pathkeys_for_mergeclauses(root,
mergeclauses,
innerrel->targetlist);
/* Set up on the assumption that we will use the cheapest_inner */
mergeinnerpath = cheapest_inner;
mergeclausecount = length(mergeclauses);
/* If the cheapest_inner doesn't need to be sorted, it is the winner
* by definition.
/*
* Generate a mergejoin on the basis of sorting the cheapest inner.
* Since a sort will be needed, only cheapest total cost matters.
*/
if (pathkeys_contained_in(innersortkeys,
cheapest_inner->pathkeys))
{
/* cheapest_inner is the winner */
innersortkeys = NIL; /* we do not need to sort it... */
}
else
{
/* look for a presorted path that's cheaper */
List *trialsortkeys = listCopy(innersortkeys);
Cost cheapest_cost;
int clausecount;
add_path(joinrel, (Path *)
create_mergejoin_path(joinrel,
outerpath,
innerrel->cheapest_total_path,
restrictlist,
merge_pathkeys,
get_actual_clauses(mergeclauses),
NIL,
innersortkeys));
cheapest_cost = cheapest_inner->path_cost +
cost_sort(innersortkeys, innerrel->rows, innerrel->width);
/*
* Look for presorted inner paths that satisfy the mergeclause list
* or any truncation thereof. Here, we consider both cheap startup
* cost and cheap total cost.
*/
trialsortkeys = listCopy(innersortkeys); /* modifiable copy */
cheapest_startup_inner = NULL;
cheapest_total_inner = NULL;
for (clausecount = mergeclausecount;
clausecount > 0;
clausecount--)
for (clausecnt = length(mergeclauses); clausecnt > 0; clausecnt--)
{
Path *innerpath;
/* Look for an inner path ordered well enough to merge with
* the first 'clausecnt' mergeclauses. NB: trialsortkeys list
* is modified destructively, which is why we made a copy...
*/
trialsortkeys = ltruncate(clausecnt, trialsortkeys);
innerpath = get_cheapest_path_for_pathkeys(innerrel->pathlist,
trialsortkeys,
TOTAL_COST);
if (innerpath != NULL &&
(cheapest_total_inner == NULL ||
compare_path_costs(innerpath, cheapest_total_inner,
TOTAL_COST) < 0))
{
Path *trialinnerpath;
/* Found a cheap (or even-cheaper) sorted path */
List *newclauses;
/* Look for an inner path ordered well enough to merge with
* the first 'clausecount' mergeclauses. NB: trialsortkeys
* is modified destructively, which is why we made a copy...
*/
trialinnerpath =
get_cheapest_path_for_pathkeys(innerrel->pathlist,
ltruncate(clausecount,
trialsortkeys),
false);
if (trialinnerpath != NULL &&
trialinnerpath->path_cost < cheapest_cost)
newclauses = ltruncate(clausecnt,
get_actual_clauses(mergeclauses));
add_path(joinrel, (Path *)
create_mergejoin_path(joinrel,
outerpath,
innerpath,
restrictlist,
merge_pathkeys,
newclauses,
NIL,
NIL));
cheapest_total_inner = innerpath;
}
/* Same on the basis of cheapest startup cost ... */
innerpath = get_cheapest_path_for_pathkeys(innerrel->pathlist,
trialsortkeys,
STARTUP_COST);
if (innerpath != NULL &&
(cheapest_startup_inner == NULL ||
compare_path_costs(innerpath, cheapest_startup_inner,
STARTUP_COST) < 0))
{
/* Found a cheap (or even-cheaper) sorted path */
if (innerpath != cheapest_total_inner)
{
/* Found a cheaper (or even-cheaper) sorted path */
cheapest_cost = trialinnerpath->path_cost;
mergeinnerpath = trialinnerpath;
mergeclausecount = clausecount;
innersortkeys = NIL; /* we will not need to sort it... */
List *newclauses;
newclauses = ltruncate(clausecnt,
get_actual_clauses(mergeclauses));
add_path(joinrel, (Path *)
create_mergejoin_path(joinrel,
outerpath,
innerpath,
restrictlist,
merge_pathkeys,
newclauses,
NIL,
NIL));
}
cheapest_startup_inner = innerpath;
}
}
/* Finally, we can build the mergejoin path */
mergeclauses = ltruncate(mergeclausecount,
get_actual_clauses(mergeclauses));
path_list = lappend(path_list,
create_mergejoin_path(joinrel,
outerpath,
mergeinnerpath,
restrictlist,
merge_pathkeys,
mergeclauses,
NIL,
innersortkeys));
}
return path_list;
}
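/*
 * Illustrative sketch, not part of this commit: the presorted-inner search
 * above, reduced to plain C so the control flow is easier to follow.  A
 * ToyPath is "presorted on its first nkeys_sorted merge keys" and carries
 * separate startup and total costs; ToyPath, emit_mergejoin and the sample
 * data are all invented for this illustration.  For each prefix of the
 * mergeclause list (longest first) we find the cheapest inner path already
 * sorted well enough for that prefix, and emit a new mergejoin only when it
 * beats what a longer prefix already gave us -- once on total cost, once on
 * startup cost, mirroring the loop above.
 */
#include <stdio.h>

typedef struct ToyPath
{
	double		startup_cost;
	double		total_cost;
	int			nkeys_sorted;	/* presorted on this many leading merge keys */
} ToyPath;

static void
emit_mergejoin(const ToyPath *inner, int nclauses, const char *why)
{
	printf("mergejoin with %d clause(s): inner startup=%.1f total=%.1f (%s)\n",
		   nclauses, inner->startup_cost, inner->total_cost, why);
}

int
main(void)
{
	ToyPath		inners[] = {
		{0.0, 120.0, 3},		/* fully presorted, but expensive overall */
		{5.0, 80.0, 2},			/* presorted on a 2-clause prefix, cheaper */
		{1.0, 200.0, 1},		/* cheap startup, presorted on 1 clause only */
	};
	int			ninners = sizeof(inners) / sizeof(inners[0]);
	int			nmergeclauses = 3;
	const ToyPath *best_total = NULL;
	const ToyPath *best_startup = NULL;
	int			k,
				i;

	for (k = nmergeclauses; k > 0; k--)
	{
		const ToyPath *cheap_total = NULL;
		const ToyPath *cheap_startup = NULL;

		for (i = 0; i < ninners; i++)
		{
			const ToyPath *p = &inners[i];

			if (p->nkeys_sorted < k)
				continue;		/* not presorted well enough for this prefix */
			if (cheap_total == NULL || p->total_cost < cheap_total->total_cost)
				cheap_total = p;
			if (cheap_startup == NULL ||
				p->startup_cost < cheap_startup->startup_cost)
				cheap_startup = p;
		}
		if (cheap_total != NULL &&
			(best_total == NULL ||
			 cheap_total->total_cost < best_total->total_cost))
		{
			emit_mergejoin(cheap_total, k, "cheapest total so far");
			best_total = cheap_total;
		}
		if (cheap_startup != NULL && cheap_startup != best_total &&
			(best_startup == NULL ||
			 cheap_startup->startup_cost < best_startup->startup_cost))
		{
			emit_mergejoin(cheap_startup, k, "cheapest startup so far");
			best_startup = cheap_startup;
		}
	}
	return 0;
}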
#ifdef NOT_USED
/*
* match_unsorted_inner
* Generate mergejoin paths that use an explicit sort of the outer path
@@ -436,86 +415,105 @@ match_unsorted_outer(RelOptInfo *joinrel,
* 'innerrel' is the inner join relation
* 'restrictlist' contains all of the RestrictInfo nodes for restriction
* clauses that apply to this join
* 'innerpath_list' is the list of possible inner join paths
* 'mergeclause_list' is a list of RestrictInfo nodes for available
* mergejoin clauses in this join
*
* Returns a list of possible merge paths.
*/
static List *
match_unsorted_inner(RelOptInfo *joinrel,
static void
match_unsorted_inner(Query *root,
RelOptInfo *joinrel,
RelOptInfo *outerrel,
RelOptInfo *innerrel,
List *restrictlist,
List *innerpath_list,
List *mergeclause_list)
{
List *path_list = NIL;
List *i;
foreach(i, innerpath_list)
foreach(i, innerrel->pathlist)
{
Path *innerpath = (Path *) lfirst(i);
List *mergeclauses;
List *outersortkeys;
List *merge_pathkeys;
Path *totalouterpath;
Path *startupouterpath;
/* Look for useful mergeclauses (if any) */
mergeclauses = find_mergeclauses_for_pathkeys(innerpath->pathkeys,
mergeclause_list);
if (mergeclauses == NIL)
continue;
if (mergeclauses)
/* Compute the required ordering of the outer path */
outersortkeys = make_pathkeys_for_mergeclauses(root,
mergeclauses,
outerrel->targetlist);
/*
* Generate a mergejoin on the basis of sorting the cheapest outer.
* Since a sort will be needed, only cheapest total cost matters.
*/
merge_pathkeys = build_join_pathkeys(outersortkeys,
joinrel->targetlist,
root->equi_key_list);
add_path(joinrel, (Path *)
create_mergejoin_path(joinrel,
outerrel->cheapest_total_path,
innerpath,
restrictlist,
merge_pathkeys,
get_actual_clauses(mergeclauses),
outersortkeys,
NIL));
/*
* Now generate mergejoins based on already-sufficiently-ordered
* outer paths. There's likely to be some redundancy here with paths
* already generated by merge_unsorted_outer ... but since
* merge_unsorted_outer doesn't consider all permutations of the
* mergeclause list, it may fail to notice that this particular
* innerpath could have been used with this outerpath.
*/
totalouterpath = get_cheapest_path_for_pathkeys(outerrel->pathlist,
outersortkeys,
TOTAL_COST);
if (totalouterpath == NULL)
continue; /* there won't be a startup-cost path either */
merge_pathkeys = build_join_pathkeys(totalouterpath->pathkeys,
joinrel->targetlist,
root->equi_key_list);
add_path(joinrel, (Path *)
create_mergejoin_path(joinrel,
totalouterpath,
innerpath,
restrictlist,
merge_pathkeys,
get_actual_clauses(mergeclauses),
NIL,
NIL));
startupouterpath = get_cheapest_path_for_pathkeys(outerrel->pathlist,
outersortkeys,
STARTUP_COST);
if (startupouterpath != NULL && startupouterpath != totalouterpath)
{
List *outersortkeys;
Path *mergeouterpath;
List *merge_pathkeys;
/* Compute the required ordering of the outer path */
outersortkeys =
make_pathkeys_for_mergeclauses(mergeclauses,
outerrel->targetlist);
/* Look for an outer path already ordered well enough to merge */
mergeouterpath =
get_cheapest_path_for_pathkeys(outerrel->pathlist,
outersortkeys,
false);
/* Should we use the mergeouter, or sort the cheapest outer? */
if (mergeouterpath != NULL &&
mergeouterpath->path_cost <=
(outerrel->cheapestpath->path_cost +
cost_sort(outersortkeys, outerrel->rows, outerrel->width)))
{
/* Use mergeouterpath */
outersortkeys = NIL; /* no explicit sort step */
}
else
{
/* Use outerrel->cheapestpath, with the outersortkeys */
mergeouterpath = outerrel->cheapestpath;
}
/* Compute pathkeys the result will have */
merge_pathkeys = build_join_pathkeys(
outersortkeys ? outersortkeys : mergeouterpath->pathkeys,
joinrel->targetlist,
mergeclauses);
mergeclauses = get_actual_clauses(mergeclauses);
path_list = lappend(path_list,
create_mergejoin_path(joinrel,
mergeouterpath,
innerpath,
restrictlist,
merge_pathkeys,
mergeclauses,
outersortkeys,
NIL));
merge_pathkeys = build_join_pathkeys(startupouterpath->pathkeys,
joinrel->targetlist,
root->equi_key_list);
add_path(joinrel, (Path *)
create_mergejoin_path(joinrel,
startupouterpath,
innerpath,
restrictlist,
merge_pathkeys,
get_actual_clauses(mergeclauses),
NIL,
NIL));
}
}
return path_list;
}
#endif
/*
* hash_inner_and_outer
* Create hashjoin join paths by explicitly hashing both the outer and
@@ -526,17 +524,14 @@ match_unsorted_inner(RelOptInfo *joinrel,
* 'innerrel' is the inner join relation
* 'restrictlist' contains all of the RestrictInfo nodes for restriction
* clauses that apply to this join
*
* Returns a list of hashjoin paths.
*/
static List *
static void
hash_inner_and_outer(Query *root,
RelOptInfo *joinrel,
RelOptInfo *outerrel,
RelOptInfo *innerrel,
List *restrictlist)
{
List *hpath_list = NIL;
Relids outerrelids = outerrel->relids;
Relids innerrelids = innerrel->relids;
List *i;
@@ -558,7 +553,6 @@ hash_inner_and_outer(Query *root,
*right,
*inner;
Selectivity innerdisbursion;
HashPath *hash_path;
if (restrictinfo->hashjoinoperator == InvalidOid)
continue; /* not hashjoinable */
@@ -581,17 +575,66 @@ hash_inner_and_outer(Query *root,
/* estimate disbursion of inner var for costing purposes */
innerdisbursion = estimate_disbursion(root, inner);
hash_path = create_hashjoin_path(joinrel,
outerrel->cheapestpath,
innerrel->cheapestpath,
restrictlist,
lcons(clause, NIL),
innerdisbursion);
hpath_list = lappend(hpath_list, hash_path);
/*
* We consider both the cheapest-total-cost and cheapest-startup-cost
* outer paths. There's no need to consider any but the cheapest-
* total-cost inner path, however.
*/
add_path(joinrel, (Path *)
create_hashjoin_path(joinrel,
outerrel->cheapest_total_path,
innerrel->cheapest_total_path,
restrictlist,
lcons(clause, NIL),
innerdisbursion));
if (outerrel->cheapest_startup_path != outerrel->cheapest_total_path)
add_path(joinrel, (Path *)
create_hashjoin_path(joinrel,
outerrel->cheapest_startup_path,
innerrel->cheapest_total_path,
restrictlist,
lcons(clause, NIL),
innerdisbursion));
}
}
return hpath_list;
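/*
 * Illustrative sketch, not part of this commit: the outer-path selection
 * used for hashjoins above, in isolation.  ToyPath, ToyRel, make_hashjoin
 * and consider_hashjoin are invented names; make_hashjoin merely stands in
 * for create_hashjoin_path() followed by add_path().  The point is that the
 * cheapest-startup outer is tried only when it is a distinct path from the
 * cheapest-total outer, so the identical hashjoin is never built twice, and
 * only the cheapest-total inner is ever hashed.
 */
#include <stdio.h>

typedef struct ToyPath { double startup_cost, total_cost; } ToyPath;
typedef struct ToyRel  { ToyPath *cheapest_startup_path, *cheapest_total_path; } ToyRel;

/* stand-in for create_hashjoin_path() + add_path() */
static void
make_hashjoin(ToyPath *outer, ToyPath *inner)
{
	printf("hashjoin: outer startup=%.1f total=%.1f, inner total=%.1f\n",
		   outer->startup_cost, outer->total_cost, inner->total_cost);
}

static void
consider_hashjoin(ToyRel *outerrel, ToyRel *innerrel)
{
	ToyPath    *inner = innerrel->cheapest_total_path;	/* only inner worth hashing */

	make_hashjoin(outerrel->cheapest_total_path, inner);
	if (outerrel->cheapest_startup_path != outerrel->cheapest_total_path)
		make_hashjoin(outerrel->cheapest_startup_path, inner);
}

int
main(void)
{
	ToyPath		fast_start = {1.0, 150.0};
	ToyPath		cheap_total = {40.0, 100.0};
	ToyPath		inner = {0.0, 60.0};
	ToyRel		outerrel = {&fast_start, &cheap_total};
	ToyRel		innerrel = {&inner, &inner};

	consider_hashjoin(&outerrel, &innerrel);
	return 0;
}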
/*
* best_innerjoin
* Find the cheapest index path that has already been identified by
* indexable_joinclauses() as being a possible inner path for the given
* outer relation(s) in a nestloop join.
*
* We compare indexpaths on total_cost only, assuming that they will all have
* zero or negligible startup_cost. We might have to think harder someday...
*
* 'join_paths' is a list of potential inner indexscan join paths
* 'outer_relids' is the relid list of the outer join relation
*
* Returns the pathnode of the best path, or NULL if there's no
* usable path.
*/
static Path *
best_innerjoin(List *join_paths, Relids outer_relids)
{
Path *cheapest = (Path *) NULL;
List *join_path;
foreach(join_path, join_paths)
{
Path *path = (Path *) lfirst(join_path);
Assert(IsA(path, IndexPath));
/* path->joinrelids is the set of base rels that must be part of
* outer_relids in order to use this inner path, because those
* rels are used in the index join quals of this inner path.
*/
if (is_subseti(((IndexPath *) path)->joinrelids, outer_relids) &&
(cheapest == NULL ||
compare_path_costs(path, cheapest, TOTAL_COST) < 0))
cheapest = path;
}
return cheapest;
}
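/*
 * Illustrative sketch, not part of this commit: the selection rule in
 * best_innerjoin(), with relid sets reduced to bitmasks and paths to bare
 * costs.  ToyInnerPath and the bitmask representation are invented; the
 * real code uses integer relid lists, is_subseti() and compare_path_costs().
 * A candidate is usable only when every base rel referenced by its index
 * join quals is already part of the outer side, and among usable candidates
 * only total cost decides, as the comment above explains.
 */
#include <stdio.h>

typedef struct ToyInnerPath
{
	unsigned	joinrelids;		/* base rels referenced by the index quals */
	double		total_cost;
} ToyInnerPath;

static const ToyInnerPath *
toy_best_innerjoin(const ToyInnerPath *paths, int npaths, unsigned outer_relids)
{
	const ToyInnerPath *cheapest = NULL;
	int			i;

	for (i = 0; i < npaths; i++)
	{
		/* usable only if every rel its quals need is in the outer side */
		if ((paths[i].joinrelids & outer_relids) != paths[i].joinrelids)
			continue;
		/* compare on total cost only */
		if (cheapest == NULL || paths[i].total_cost < cheapest->total_cost)
			cheapest = &paths[i];
	}
	return cheapest;
}

int
main(void)
{
	ToyInnerPath paths[] = {
		{0x1, 25.0},			/* needs rel 1 */
		{0x6, 10.0},			/* needs rels 2 and 3 */
	};
	const ToyInnerPath *best = toy_best_innerjoin(paths, 2, 0x3); /* outer = {1,2} */

	printf("best inner total_cost = %.1f\n", best ? best->total_cost : -1.0);
	return 0;
}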
/*


@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/optimizer/path/orindxpath.c,v 1.36 2000/02/05 18:26:09 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/optimizer/path/orindxpath.c,v 1.37 2000/02/15 20:49:17 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -19,6 +19,7 @@
#include "optimizer/clauses.h"
#include "optimizer/cost.h"
#include "optimizer/internal.h"
#include "optimizer/pathnode.h"
#include "optimizer/paths.h"
#include "optimizer/plancat.h"
#include "optimizer/restrictinfo.h"
@@ -27,14 +28,13 @@
static void best_or_subclause_indices(Query *root, RelOptInfo *rel,
List *subclauses, List *indices,
List **indexquals,
List **indexids,
Cost *cost);
IndexPath *pathnode);
static void best_or_subclause_index(Query *root, RelOptInfo *rel,
Expr *subclause, List *indices,
List **retIndexQual,
Oid *retIndexid,
Cost *retCost);
Cost *retStartupCost,
Cost *retTotalCost);
/*
@@ -45,14 +45,13 @@ static void best_or_subclause_index(Query *root, RelOptInfo *rel,
* 'rel' is the relation entry for which the paths are to be created
* 'clauses' is the list of available restriction clause nodes
*
* Returns a list of index path nodes.
*
* Returns nothing, but adds paths to rel->pathlist via add_path().
*/
List *
void
create_or_index_paths(Query *root,
RelOptInfo *rel, List *clauses)
RelOptInfo *rel,
List *clauses)
{
List *path_list = NIL;
List *clist;
foreach(clist, clauses)
@@ -86,17 +85,6 @@ create_or_index_paths(Query *root,
* best available index for each subclause.
*/
IndexPath *pathnode = makeNode(IndexPath);
List *indexquals;
List *indexids;
Cost cost;
best_or_subclause_indices(root,
rel,
clausenode->clause->args,
clausenode->subclauseindices,
&indexquals,
&indexids,
&cost);
pathnode->path.pathtype = T_IndexScan;
pathnode->path.parent = rel;
@@ -108,17 +96,21 @@ create_or_index_paths(Query *root,
*/
pathnode->path.pathkeys = NIL;
pathnode->indexid = indexids;
pathnode->indexqual = indexquals;
pathnode->joinrelids = NIL; /* no join clauses here */
pathnode->path.path_cost = cost;
/* We don't actually care what order the index scans in ... */
pathnode->indexscandir = NoMovementScanDirection;
path_list = lappend(path_list, pathnode);
pathnode->joinrelids = NIL; /* no join clauses here */
best_or_subclause_indices(root,
rel,
clausenode->clause->args,
clausenode->subclauseindices,
pathnode);
add_path(rel, (Path *) pathnode);
}
}
}
return path_list;
}
/*
@@ -128,53 +120,68 @@ create_or_index_paths(Query *root,
* indices. The cost is the sum of the individual index costs, since
* the executor will perform a scan for each subclause of the 'or'.
*
* This routine also creates the indexquals and indexids lists that will
* be needed by the executor. The indexquals list has one entry for each
* This routine also creates the indexqual and indexid lists that will
* be needed by the executor. The indexqual list has one entry for each
* scan of the base rel, which is a sublist of indexqual conditions to
* apply in that scan. The implicit semantics are AND across each sublist
* of quals, and OR across the toplevel list (note that the executor
* takes care not to return any single tuple more than once). The indexids
* list gives the index to be used in each scan.
* takes care not to return any single tuple more than once). The indexid
* list gives the OID of the index to be used in each scan.
*
* 'rel' is the node of the relation on which the indexes are defined
* 'subclauses' are the subclauses of the 'or' clause
* 'indices' is a list of sublists of the IndexOptInfo nodes that matched
* each subclause of the 'or' clause
* '*indexquals' gets the constructed indexquals for the path (a list
* 'pathnode' is the IndexPath node being built.
*
* Results are returned by setting these fields of the passed pathnode:
* 'indexqual' gets the constructed indexquals for the path (a list
* of sublists of clauses, one sublist per scan of the base rel)
* '*indexids' gets a list of the index OIDs for each scan of the rel
* '*cost' gets the total cost of the path
* 'indexid' gets a list of the index OIDs for each scan of the rel
* 'startup_cost' and 'total_cost' get the complete path costs.
*
* 'startup_cost' is the startup cost for the first index scan only;
* startup costs for later scans will be paid later on, so they just
* get reflected in total_cost.
*
* NOTE: we choose each scan on the basis of its total cost, ignoring startup
* cost. This is reasonable as long as all index types have zero or small
* startup cost, but we might have to work harder if any index types with
* nontrivial startup cost are ever invented.
*/
static void
best_or_subclause_indices(Query *root,
RelOptInfo *rel,
List *subclauses,
List *indices,
List **indexquals, /* return value */
List **indexids, /* return value */
Cost *cost) /* return value */
IndexPath *pathnode)
{
List *slist;
*indexquals = NIL;
*indexids = NIL;
*cost = (Cost) 0.0;
pathnode->indexqual = NIL;
pathnode->indexid = NIL;
pathnode->path.startup_cost = 0;
pathnode->path.total_cost = 0;
foreach(slist, subclauses)
{
Expr *subclause = lfirst(slist);
List *best_indexqual;
Oid best_indexid;
Cost best_cost;
Cost best_startup_cost;
Cost best_total_cost;
best_or_subclause_index(root, rel, subclause, lfirst(indices),
&best_indexqual, &best_indexid, &best_cost);
&best_indexqual, &best_indexid,
&best_startup_cost, &best_total_cost);
Assert(best_indexid != InvalidOid);
*indexquals = lappend(*indexquals, best_indexqual);
*indexids = lappendi(*indexids, best_indexid);
*cost += best_cost;
pathnode->indexqual = lappend(pathnode->indexqual, best_indexqual);
pathnode->indexid = lappendi(pathnode->indexid, best_indexid);
if (slist == subclauses) /* first scan? */
pathnode->path.startup_cost = best_startup_cost;
pathnode->path.total_cost += best_total_cost;
indices = lnext(indices);
}
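/*
 * Illustrative sketch, not part of this commit: how the OR-path costs are
 * accumulated above, with the planner structures stripped away.  ToyCosts
 * and the sample numbers are invented.  Startup cost is taken from the
 * first scan only, since the later scans' startup work happens after tuples
 * have already been returned, while every scan's total cost contributes to
 * the path's total.
 */
#include <stdio.h>

typedef struct ToyCosts { double startup, total; } ToyCosts;

int
main(void)
{
	ToyCosts	scans[] = {{2.0, 40.0}, {3.0, 55.0}, {1.0, 20.0}};
	int			nscans = sizeof(scans) / sizeof(scans[0]);
	double		path_startup = 0.0;
	double		path_total = 0.0;
	int			i;

	for (i = 0; i < nscans; i++)
	{
		if (i == 0)				/* first scan? */
			path_startup = scans[i].startup;
		path_total += scans[i].total;
	}
	printf("OR path: startup=%.1f total=%.1f\n", path_startup, path_total);
	return 0;
}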
@@ -182,16 +189,17 @@ best_or_subclause_indices(Query *root,
/*
* best_or_subclause_index
* Determines which is the best index to be used with a subclause of
* an 'or' clause by estimating the cost of using each index and selecting
* the least expensive.
* Determines which is the best index to be used with a subclause of an
* 'or' clause by estimating the cost of using each index and selecting
* the least expensive (considering total cost only, for now).
*
* 'rel' is the node of the relation on which the index is defined
* 'subclause' is the OR subclause being considered
* 'indices' is a list of IndexOptInfo nodes that match the subclause
* '*retIndexQual' gets a list of the indexqual conditions for the best index
* '*retIndexid' gets the OID of the best index
* '*retCost' gets the cost of a scan with that index
* '*retStartupCost' gets the startup cost of a scan with that index
* '*retTotalCost' gets the total cost of a scan with that index
*/
static void
best_or_subclause_index(Query *root,
@@ -200,7 +208,8 @@ best_or_subclause_index(Query *root,
List *indices,
List **retIndexQual, /* return value */
Oid *retIndexid, /* return value */
Cost *retCost) /* return value */
Cost *retStartupCost, /* return value */
Cost *retTotalCost) /* return value */
{
bool first_time = true;
List *ilist;
@@ -208,27 +217,28 @@ best_or_subclause_index(Query *root,
/* if we don't match anything, return zeros */
*retIndexQual = NIL;
*retIndexid = InvalidOid;
*retCost = 0.0;
*retStartupCost = 0;
*retTotalCost = 0;
foreach(ilist, indices)
{
IndexOptInfo *index = (IndexOptInfo *) lfirst(ilist);
List *indexqual;
Cost subcost;
Path subclause_path;
Assert(IsA(index, IndexOptInfo));
/* Convert this 'or' subclause to an indexqual list */
indexqual = extract_or_indexqual_conditions(rel, index, subclause);
subcost = cost_index(root, rel, index, indexqual,
false);
cost_index(&subclause_path, root, rel, index, indexqual, false);
if (first_time || subcost < *retCost)
if (first_time || subclause_path.total_cost < *retTotalCost)
{
*retIndexQual = indexqual;
*retIndexid = index->indexoid;
*retCost = subcost;
*retStartupCost = subclause_path.startup_cost;
*retTotalCost = subclause_path.total_cost;
first_time = false;
}
}


@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/optimizer/path/pathkeys.c,v 1.18 2000/01/26 05:56:34 momjian Exp $
* $Header: /cvsroot/pgsql/src/backend/optimizer/path/pathkeys.c,v 1.19 2000/02/15 20:49:17 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -17,6 +17,7 @@
#include "nodes/makefuncs.h"
#include "optimizer/clauses.h"
#include "optimizer/joininfo.h"
#include "optimizer/pathnode.h"
#include "optimizer/paths.h"
#include "optimizer/tlist.h"
#include "optimizer/var.h"
@@ -25,9 +26,9 @@
#include "utils/lsyscache.h"
static PathKeyItem *makePathKeyItem(Node *key, Oid sortop);
static Var *find_indexkey_var(int indexkey, List *tlist);
static List *build_join_pathkey(List *pathkeys, List *join_rel_tlist,
List *joinclauses);
static List *make_canonical_pathkey(Query *root, PathKeyItem *item);
static Var *find_indexkey_var(Query *root, RelOptInfo *rel,
AttrNumber varattno);
/*--------------------
@@ -50,50 +51,122 @@ static List *build_join_pathkey(List *pathkeys, List *join_rel_tlist,
* Note that a multi-pass indexscan (OR clause scan) has NIL pathkeys since
* we can say nothing about the overall order of its result. Also, an
* indexscan on an unordered type of index generates NIL pathkeys. However,
* we can always create a pathkey by doing an explicit sort.
* we can always create a pathkey by doing an explicit sort. The pathkeys
* for a sort plan's output just represent the sort key fields and the
* ordering operators used.
*
* Multi-relation RelOptInfo Path's are more complicated. Mergejoins are
* only performed with equijoins ("="). Because of this, the resulting
* multi-relation path actually has more than one primary key. For example,
* a mergejoin using a clause "tab1.col1 = tab2.col1" would generate pathkeys
* of ( (tab1.col1/sortop1 tab2.col1/sortop2) ), indicating that the major
* sort order of the Path can be taken to be *either* tab1.col1 or tab2.col1.
* They are equal, so they are both primary sort keys. This allows future
* joins to use either var as a pre-sorted key to prevent upper Mergejoins
* from having to re-sort the Path. This is why pathkeys is a List of Lists.
*
* Note that while the order of the top list is meaningful (primary vs.
* secondary sort key), the order of each sublist is arbitrary. No code
* working with pathkeys should generate a result that depends on the order
* of a pathkey sublist.
* Things get more interesting when we consider joins. Suppose we do a
* mergejoin between A and B using the mergeclause A.X = B.Y. The output
* of the mergejoin is sorted by X --- but it is also sorted by Y. We
* represent this fact by listing both keys in a single pathkey sublist:
* ( (A.X/xsortop B.Y/ysortop) ). This pathkey asserts that the major
* sort order of the Path can be taken to be *either* A.X or B.Y.
* They are equal, so they are both primary sort keys. By doing this,
* we allow future joins to use either var as a pre-sorted key, so upper
* Mergejoins may be able to avoid having to re-sort the Path. This is
* why pathkeys is a List of Lists.
*
* We keep a sortop associated with each PathKeyItem because cross-data-type
* mergejoins are possible; for example int4=int8 is mergejoinable. In this
* case we need to remember that the left var is ordered by int4lt while
* the right var is ordered by int8lt. So the different members of each
* sublist could have different sortops.
* mergejoins are possible; for example int4 = int8 is mergejoinable.
* In this case we need to remember that the left var is ordered by int4lt
* while the right var is ordered by int8lt. So the different members of
* each sublist could have different sortops.
*
* When producing the pathkeys for a merge or nestloop join, we can keep
* all of the keys of the outer path, since the ordering of the outer path
* will be preserved in the result. We add to each pathkey sublist any inner
* vars that are equijoined to any of the outer vars in the sublist. In the
* nestloop case we have to be careful to consider only equijoin operators;
* the nestloop's join clauses might include non-equijoin operators.
* (Currently, we do this by considering only mergejoinable operators while
* making the pathkeys, since we have no separate marking for operators that
* are equijoins but aren't mergejoinable.)
* Note that while the order of the top list is meaningful (primary vs.
* secondary sort key), the order of each sublist is arbitrary. Each sublist
* should be regarded as a set of equivalent keys, with no significance
* to the list order.
*
* With a little further thought, it becomes apparent that pathkeys for
* joins need not only come from mergejoins. For example, if we do a
* nestloop join between outer relation A and inner relation B, then any
* pathkeys relevant to A are still valid for the join result: we have
* not altered the order of the tuples from A. Even more interesting,
* if there was a mergeclause (more formally, an "equijoin clause") A.X=B.Y,
* and A.X was a pathkey for the outer relation A, then we can assert that
* B.Y is a pathkey for the join result; X was ordered before and still is,
* and the joined values of Y are equal to the joined values of X, so Y
* must now be ordered too. This is true even though we used no mergejoin.
*
* More generally, whenever we have an equijoin clause A.X = B.Y and a
* pathkey A.X, we can add B.Y to that pathkey if B is part of the joined
* relation the pathkey is for, *no matter how we formed the join*.
*
* In short, then: when producing the pathkeys for a merge or nestloop join,
* we can keep all of the keys of the outer path, since the ordering of the
* outer path will be preserved in the result. Furthermore, we can add to
* each pathkey sublist any inner vars that are equijoined to any of the
* outer vars in the sublist; this works regardless of whether we are
* implementing the join using that equijoin clause as a mergeclause,
* or merely enforcing the clause after-the-fact as a qpqual filter.
*
* Although Hashjoins also work only with equijoin operators, it is *not*
* safe to consider the output of a Hashjoin to be sorted in any particular
* order --- not even the outer path's order. This is true because the
* executor might have to split the join into multiple batches. Therefore
* a Hashjoin is always given NIL pathkeys.
* a Hashjoin is always given NIL pathkeys. (Also, we need to use only
* mergejoinable operators when deducing which inner vars are now sorted,
* because a mergejoin operator tells us which left- and right-datatype
* sortops can be considered equivalent, whereas a hashjoin operator
* doesn't imply anything about sort order.)
*
* Pathkeys are also useful to represent an ordering that we wish to achieve,
* since they are easily compared to the pathkeys of a potential candidate
* path. So, SortClause lists are turned into pathkeys lists for use inside
* the optimizer.
*
* OK, now for how it *really* works:
*
* We did implement pathkeys just as described above, and found that the
* planner spent a huge amount of time comparing pathkeys, because the
* representation of pathkeys as unordered lists made it expensive to decide
* whether two were equal or not. So, we've modified the representation
* as described next.
*
* If we scan the WHERE clause for equijoin clauses (mergejoinable clauses)
* during planner startup, we can construct lists of equivalent pathkey items
* for the query. There could be more than two items per equivalence set;
* for example, WHERE A.X = B.Y AND B.Y = C.Z AND D.R = E.S creates the
* equivalence sets { A.X B.Y C.Z } and { D.R E.S } (plus associated sortops).
* Any pathkey item that belongs to an equivalence set implies that all the
* other items in its set apply to the relation too, or at least all the ones
* that are for fields present in the relation. (Some of the items in the
* set might be for as-yet-unjoined relations.) Furthermore, any multi-item
* pathkey sublist that appears at any stage of planning the query *must* be
* a subset of one or another of these equivalence sets; there's no way we'd
* have put two items in the same pathkey sublist unless they were equijoined
* in WHERE.
*
* Now suppose that we allow a pathkey sublist to contain pathkey items for
* vars that are not yet part of the pathkey's relation. This introduces
* no logical difficulty, because such items can easily be seen to be
* irrelevant; we just mandate that they be ignored. But having allowed
* this, we can declare (by fiat) that any multiple-item pathkey sublist
* must be equal() to the appropriate equivalence set. In effect, whenever
* we make a pathkey sublist that mentions any var appearing in an
* equivalence set, we instantly add all the other vars equivalenced to it,
* whether they appear yet in the pathkey's relation or not. And we also
* mandate that the pathkey sublist appear in the same order as the
* equivalence set it comes from. (In practice, we simply return a pointer
* to the relevant equivalence set without building any new sublist at all.)
* This makes comparing pathkeys very simple and fast, and saves a lot of
* work and memory space for pathkey construction as well.
*
* Note that pathkey sublists having just one item still exist, and are
* not expected to be equal() to any equivalence set. This occurs when
* we describe a sort order that involves a var that's not mentioned in
* any equijoin clause of the WHERE. We could add singleton sets containing
* such vars to the query's list of equivalence sets, but there's little
* point in doing so.
*
* By the way, it's OK and even useful for us to build equivalence sets
* that mention multiple vars from the same relation. For example, if
* we have WHERE A.X = A.Y and we are scanning A using an index on X,
* we can legitimately conclude that the path is sorted by Y as well;
* and this could be handy if Y is the variable used in other join clauses
* or ORDER BY. So, any WHERE clause with a mergejoinable operator can
* contribute to an equivalence set, even if it's not a join clause.
*
* -- bjm & tgl
*--------------------
*/
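/*
 * Illustrative sketch, not part of this commit: the equivalence-set
 * construction described above, reduced to strings and a naive union step.
 * add_equijoined_keys() does the same thing with PathKeyItem sublists in
 * root->equi_key_list; here an int "set id" per item plays the role of the
 * sublist a member belongs to, and everything (items[], setid[],
 * add_equijoin) is invented for the illustration.  Using the worked example
 * from the comment above:
 *
 *   WHERE A.X = B.Y AND B.Y = C.Z AND D.R = E.S
 *   => sets { A.X B.Y C.Z } and { D.R E.S }
 */
#include <stdio.h>
#include <string.h>

#define MAXITEMS 16

static const char *items[MAXITEMS];
static int	setid[MAXITEMS];
static int	nitems = 0;

static int
lookup_or_add(const char *name)
{
	int			i;

	for (i = 0; i < nitems; i++)
		if (strcmp(items[i], name) == 0)
			return i;
	items[nitems] = name;
	setid[nitems] = nitems;		/* starts out in its own set */
	return nitems++;
}

static void
add_equijoin(const char *left, const char *right)
{
	int			l = lookup_or_add(left);
	int			r = lookup_or_add(right);
	int			from = setid[r];
	int			to = setid[l];
	int			i;

	for (i = 0; i < nitems; i++)	/* merge r's set into l's set */
		if (setid[i] == from)
			setid[i] = to;
}

int
main(void)
{
	int			i,
				j;

	add_equijoin("A.X", "B.Y");
	add_equijoin("B.Y", "C.Z");
	add_equijoin("D.R", "E.S");

	for (i = 0; i < nitems; i++)
	{
		if (setid[i] != i)
			continue;			/* print each set once, at its leader */
		printf("{");
		for (j = 0; j < nitems; j++)
			if (setid[j] == setid[i])
				printf(" %s", items[j]);
		printf(" }\n");
	}
	return 0;
}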
@@ -113,6 +186,129 @@ makePathKeyItem(Node *key, Oid sortop)
return item;
}
/*
* add_equijoined_keys
* The given clause has a mergejoinable operator, so its two sides
* can be considered equal after restriction clause application; in
* particular, any pathkey mentioning one side (with the correct sortop)
* can be expanded to include the other as well. Record the vars and
* associated sortops in the query's equi_key_list for future use.
*
* The query's equi_key_list field points to a list of sublists of PathKeyItem
* nodes, where each sublist is a set of two or more vars+sortops that have
* been identified as logically equivalent (and, therefore, we may consider
* any two in a set to be equal). As described above, we will subsequently
* use direct pointers to one of these sublists to represent any pathkey
* that involves an equijoined variable.
*
* This code would actually work fine with expressions more complex than
* a single Var, but currently it won't see any because check_mergejoinable
* won't accept such clauses as mergejoinable.
*/
void
add_equijoined_keys(Query *root, RestrictInfo *restrictinfo)
{
Expr *clause = restrictinfo->clause;
PathKeyItem *item1 = makePathKeyItem((Node *) get_leftop(clause),
restrictinfo->left_sortop);
PathKeyItem *item2 = makePathKeyItem((Node *) get_rightop(clause),
restrictinfo->right_sortop);
List *newset,
*cursetlink;
/* We might see a clause X=X; don't make a single-element list from it */
if (equal(item1, item2))
return;
/*
* Our plan is to make a two-element set, then sweep through the existing
* equijoin sets looking for matches to item1 or item2. When we find one,
* we remove that set from equi_key_list and union it into our new set.
* When done, we add the new set to the front of equi_key_list.
*
* This is a standard UNION-FIND problem, for which there exist better
* data structures than simple lists. If this code ever proves to be
* a bottleneck then it could be sped up --- but for now, simple is
* beautiful.
*/
newset = lcons(item1, lcons(item2, NIL));
foreach(cursetlink, root->equi_key_list)
{
List *curset = lfirst(cursetlink);
if (member(item1, curset) || member(item2, curset))
{
/* Found a set to merge into our new set */
newset = LispUnion(newset, curset);
/* Remove old set from equi_key_list. NOTE this does not change
* lnext(cursetlink), so the outer foreach doesn't break.
*/
root->equi_key_list = lremove(curset, root->equi_key_list);
freeList(curset); /* might as well recycle old cons cells */
}
}
root->equi_key_list = lcons(newset, root->equi_key_list);
}
/*
* make_canonical_pathkey
* Given a PathKeyItem, find the equi_key_list subset it is a member of,
* if any. If so, return a pointer to that sublist, which is the
* canonical representation (for this query) of that PathKeyItem's
* equivalence set. If it is not found, return a single-element list
* containing the PathKeyItem (when the item has no equivalence peers,
* we just allow it to be a standalone list).
*
* Note that this function must not be used until after we have completed
* scanning the WHERE clause for equijoin operators.
*/
static List *
make_canonical_pathkey(Query *root, PathKeyItem *item)
{
List *cursetlink;
foreach(cursetlink, root->equi_key_list)
{
List *curset = lfirst(cursetlink);
if (member(item, curset))
return curset;
}
return lcons(item, NIL);
}
/*
* canonicalize_pathkeys
* Convert a not-necessarily-canonical pathkeys list to canonical form.
*
* Note that this function must not be used until after we have completed
* scanning the WHERE clause for equijoin operators.
*/
List *
canonicalize_pathkeys(Query *root, List *pathkeys)
{
List *new_pathkeys = NIL;
List *i;
foreach(i, pathkeys)
{
List *pathkey = (List *) lfirst(i);
PathKeyItem *item;
/*
* It's sufficient to look at the first entry in the sublist;
* if there are more entries, they're already part of an
* equivalence set by definition.
*/
Assert(pathkey != NIL);
item = (PathKeyItem *) lfirst(pathkey);
new_pathkeys = lappend(new_pathkeys,
make_canonical_pathkey(root, item));
}
return new_pathkeys;
}
/****************************************************************************
* PATHKEY COMPARISONS
****************************************************************************/
@@ -126,15 +322,21 @@ makePathKeyItem(Node *key, Oid sortop)
* it contains all the keys of the other plus more. For example, either
* ((A) (B)) or ((A B)) is better than ((A)).
*
* This gets called a lot, so it is optimized.
* Because we actually only expect to see canonicalized pathkey sublists,
* we don't have to do the full two-way-subset-inclusion test on each
* pair of sublists that is implied by the above statement. Instead we
* just do an equal(). In the normal case where multi-element sublists
* are pointers into the root's equi_key_list, equal() will be very fast:
* it will recognize pointer equality when the sublists are the same,
* and will fail at the first sublist element when they are not.
*
* Yes, this gets called enough to be worth coding it this tensely.
*/
PathKeysComparison
compare_pathkeys(List *keys1, List *keys2)
{
List *key1,
*key2;
bool key1_subsetof_key2 = true,
key2_subsetof_key1 = true;
for (key1 = keys1, key2 = keys2;
key1 != NIL && key2 != NIL;
@@ -142,36 +344,12 @@ compare_pathkeys(List *keys1, List *keys2)
{
List *subkey1 = lfirst(key1);
List *subkey2 = lfirst(key2);
List *i;
/* We have to do this the hard way since the ordering of the subkey
* lists is arbitrary.
*/
/* We will never have two subkeys where one is a subset of the other,
* because of the canonicalization explained above. Either they are
* equal or they ain't.
*/
if (key1_subsetof_key2)
{
foreach(i, subkey1)
{
if (! member(lfirst(i), subkey2))
{
key1_subsetof_key2 = false;
break;
}
}
}
if (key2_subsetof_key1)
{
foreach(i, subkey2)
{
if (! member(lfirst(i), subkey1))
{
key2_subsetof_key1 = false;
break;
}
}
}
if (!key1_subsetof_key2 && !key2_subsetof_key1)
if (! equal(subkey1, subkey2))
return PATHKEYS_DIFFERENT; /* no need to keep looking */
}
@@ -180,18 +358,11 @@ compare_pathkeys(List *keys1, List *keys2)
* of the other list are not NIL --- no pathkey list should ever have
* a NIL sublist.)
*/
if (key1 != NIL)
key1_subsetof_key2 = false;
if (key2 != NIL)
key2_subsetof_key1 = false;
if (key1_subsetof_key2 && key2_subsetof_key1)
if (key1 == NIL && key2 == NIL)
return PATHKEYS_EQUAL;
if (key1_subsetof_key2)
return PATHKEYS_BETTER2;
if (key2_subsetof_key1)
return PATHKEYS_BETTER1;
return PATHKEYS_DIFFERENT;
if (key1 != NIL)
return PATHKEYS_BETTER1; /* key1 is longer */
return PATHKEYS_BETTER2; /* key2 is longer */
}
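/*
 * Illustrative sketch, not part of this commit: compare_pathkeys() with
 * canonical sublists reduced to opaque pointers.  Because canonical
 * multi-element sublists are shared (each is a pointer into the root's
 * equi_key_list), "same sort key at this position" collapses to pointer
 * equality here, standing in for the cheap equal() test above.  The
 * ToyPathKeys type and the sample data are invented.
 */
#include <stdio.h>

typedef enum
{
	PATHKEYS_EQUAL, PATHKEYS_BETTER1, PATHKEYS_BETTER2, PATHKEYS_DIFFERENT
} PathKeysComparison;

typedef struct ToyPathKeys
{
	const void **subkeys;		/* canonical sublists, one per sort level */
	int			nkeys;
} ToyPathKeys;

static PathKeysComparison
toy_compare_pathkeys(const ToyPathKeys *k1, const ToyPathKeys *k2)
{
	int			n = (k1->nkeys < k2->nkeys) ? k1->nkeys : k2->nkeys;
	int			i;

	for (i = 0; i < n; i++)
	{
		/* canonical sublists: either the same object or different keys */
		if (k1->subkeys[i] != k2->subkeys[i])
			return PATHKEYS_DIFFERENT;
	}
	if (k1->nkeys == k2->nkeys)
		return PATHKEYS_EQUAL;
	return (k1->nkeys > k2->nkeys) ? PATHKEYS_BETTER1 : PATHKEYS_BETTER2;
}

int
main(void)
{
	static const char xy_set[] = "{A.X B.Y}";	/* one shared equivalence set */
	static const char z_set[] = "{C.Z}";
	const void *keys1[] = {xy_set, z_set};
	const void *keys2[] = {xy_set};
	ToyPathKeys	pk1 = {keys1, 2};
	ToyPathKeys	pk2 = {keys2, 1};

	printf("%d\n", (int) toy_compare_pathkeys(&pk1, &pk2));	/* 1 = PATHKEYS_BETTER1 */
	return 0;
}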
/*
@@ -215,16 +386,16 @@ pathkeys_contained_in(List *keys1, List *keys2)
/*
* get_cheapest_path_for_pathkeys
* Find the cheapest path in 'paths' that satisfies the given pathkeys.
* Return NULL if no such path.
* Find the cheapest path (according to the specified criterion) that
* satisfies the given pathkeys. Return NULL if no such path.
*
* 'paths' is a list of possible paths (either inner or outer)
* 'pathkeys' represents a required ordering
* if 'indexpaths_only' is true, only IndexPaths will be considered.
* 'paths' is a list of possible paths that all generate the same relation
* 'pathkeys' represents a required ordering (already canonicalized!)
* 'cost_criterion' is STARTUP_COST or TOTAL_COST
*/
Path *
get_cheapest_path_for_pathkeys(List *paths, List *pathkeys,
bool indexpaths_only)
CostSelector cost_criterion)
{
Path *matched_path = NULL;
List *i;
@@ -233,15 +404,55 @@ get_cheapest_path_for_pathkeys(List *paths, List *pathkeys,
{
Path *path = (Path *) lfirst(i);
if (indexpaths_only && ! IsA(path, IndexPath))
/*
* Since cost comparison is a lot cheaper than pathkey comparison,
* do that first. (XXX is that still true?)
*/
if (matched_path != NULL &&
compare_path_costs(matched_path, path, cost_criterion) <= 0)
continue;
if (pathkeys_contained_in(pathkeys, path->pathkeys))
{
if (matched_path == NULL ||
path->path_cost < matched_path->path_cost)
matched_path = path;
}
matched_path = path;
}
return matched_path;
}
/*
* get_cheapest_fractional_path_for_pathkeys
* Find the cheapest path (for retrieving a specified fraction of all
* the tuples) that satisfies the given pathkeys.
* Return NULL if no such path.
*
* See compare_fractional_path_costs() for the interpretation of the fraction
* parameter.
*
* 'paths' is a list of possible paths that all generate the same relation
* 'pathkeys' represents a required ordering (already canonicalized!)
* 'fraction' is the fraction of the total tuples expected to be retrieved
*/
Path *
get_cheapest_fractional_path_for_pathkeys(List *paths,
List *pathkeys,
double fraction)
{
Path *matched_path = NULL;
List *i;
foreach(i, paths)
{
Path *path = (Path *) lfirst(i);
/*
* Since cost comparison is a lot cheaper than pathkey comparison,
* do that first.
*/
if (matched_path != NULL &&
compare_fractional_path_costs(matched_path, path, fraction) <= 0)
continue;
if (pathkeys_contained_in(pathkeys, path->pathkeys))
matched_path = path;
}
return matched_path;
}
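/*
 * Illustrative sketch, not part of this commit: the cost rule implied by
 * compare_fractional_path_costs() above, assuming a linear interpolation
 * between startup and total cost (that interpolation is an assumption here,
 * not something spelled out in this hunk; ToyPath and fractional_cost are
 * invented).  It shows why a path that is slower overall can still win when
 * only a small fraction of its output will actually be fetched.
 */
#include <stdio.h>

typedef struct ToyPath { double startup_cost, total_cost; } ToyPath;

static double
fractional_cost(const ToyPath *p, double fraction)
{
	return p->startup_cost + fraction * (p->total_cost - p->startup_cost);
}

int
main(void)
{
	ToyPath		indexscan = {0.0, 500.0};	/* cheap to start, dear to finish */
	ToyPath		sort_plan = {300.0, 320.0}; /* pays a big sort up front */
	double		fraction = 0.01;			/* only 1% of the rows will be fetched */

	printf("indexscan: %.1f  sort: %.1f\n",
		   fractional_cost(&indexscan, fraction),
		   fractional_cost(&sort_plan, fraction));
	return 0;
}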
@@ -255,18 +466,22 @@ get_cheapest_path_for_pathkeys(List *paths, List *pathkeys,
* Build a pathkeys list that describes the ordering induced by an index
* scan using the given index. (Note that an unordered index doesn't
* induce any ordering; such an index will have no sortop OIDS in
* its "ordering" field.)
* its "ordering" field, and we will return NIL.)
*
* Vars in the resulting pathkeys list are taken from the rel's targetlist.
* If we can't find the indexkey in the targetlist, we assume that the
* ordering of that key is not interesting.
* If 'scandir' is BackwardScanDirection, attempt to build pathkeys
* representing a backwards scan of the index. Return NIL if can't do it.
*/
List *
build_index_pathkeys(Query *root, RelOptInfo *rel, IndexOptInfo *index)
build_index_pathkeys(Query *root,
RelOptInfo *rel,
IndexOptInfo *index,
ScanDirection scandir)
{
List *retval = NIL;
int *indexkeys = index->indexkeys;
Oid *ordering = index->ordering;
PathKeyItem *item;
Oid sortop;
if (!indexkeys || indexkeys[0] == 0 ||
!ordering || ordering[0] == InvalidOid)
@@ -275,8 +490,6 @@ build_index_pathkeys(Query *root, RelOptInfo *rel, IndexOptInfo *index)
if (index->indproc)
{
/* Functional index: build a representation of the function call */
int relid = lfirsti(rel->relids);
Oid reloid = getrelid(relid, root->rtable);
Func *funcnode = makeNode(Func);
List *funcargs = NIL;
@@ -291,43 +504,42 @@ build_index_pathkeys(Query *root, RelOptInfo *rel, IndexOptInfo *index)
while (*indexkeys != 0)
{
int varattno = *indexkeys;
Oid vartypeid = get_atttype(reloid, varattno);
int32 type_mod = get_atttypmod(reloid, varattno);
funcargs = lappend(funcargs,
makeVar(relid, varattno, vartypeid,
type_mod, 0));
find_indexkey_var(root, rel, *indexkeys));
indexkeys++;
}
sortop = *ordering;
if (ScanDirectionIsBackward(scandir))
{
sortop = get_commutator(sortop);
if (sortop == InvalidOid)
return NIL; /* oops, no reverse sort operator? */
}
/* Make a one-sublist pathkeys list for the function expression */
retval = lcons(lcons(
makePathKeyItem((Node *) make_funcclause(funcnode, funcargs),
*ordering),
NIL), NIL);
item = makePathKeyItem((Node *) make_funcclause(funcnode, funcargs),
sortop);
retval = lcons(make_canonical_pathkey(root, item), NIL);
}
else
{
/* Normal non-functional index */
List *rel_tlist = rel->targetlist;
while (*indexkeys != 0 && *ordering != InvalidOid)
{
Var *relvar = find_indexkey_var(*indexkeys, rel_tlist);
Var *relvar = find_indexkey_var(root, rel, *indexkeys);
/* If we can find no tlist entry for the n'th sort key,
* then we're done generating pathkeys; any subsequent sort keys
* no longer apply, since we can't represent the ordering properly
* even if there are tlist entries for them.
*/
if (!relvar)
break;
/* OK, make a one-element sublist for this sort key */
retval = lappend(retval,
lcons(makePathKeyItem((Node *) relvar,
*ordering),
NIL));
sortop = *ordering;
if (ScanDirectionIsBackward(scandir))
{
sortop = get_commutator(sortop);
if (sortop == InvalidOid)
break; /* oops, no reverse sort operator? */
}
/* OK, make a sublist for this sort key */
item = makePathKeyItem((Node *) relvar, sortop);
retval = lappend(retval, make_canonical_pathkey(root, item));
indexkeys++;
ordering++;
@@ -338,21 +550,37 @@ build_index_pathkeys(Query *root, RelOptInfo *rel, IndexOptInfo *index)
}
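/*
 * Illustrative sketch, not part of this commit: the sortop handling for a
 * backwards index scan, as done twice above.  An ordered index is described
 * by its "<"-style sort operators; scanning it backwards yields the ordering
 * of each operator's commutator (">").  toy_get_commutator and the operator
 * name strings are invented stand-ins for get_commutator() and operator
 * OIDs; when no commutator is known we stop describing the ordering, just
 * as the code above returns NIL or breaks out of its loop.
 */
#include <stdio.h>
#include <string.h>

static const char *
toy_get_commutator(const char *sortop)
{
	if (strcmp(sortop, "int4lt") == 0)
		return "int4gt";
	if (strcmp(sortop, "textlt") == 0)
		return "textgt";
	return NULL;				/* no reverse-sort operator known */
}

int
main(void)
{
	const char *ordering[] = {"int4lt", "textlt"};
	int			nkeys = 2;
	int			backward = 1;	/* pretend scandir is BackwardScanDirection */
	int			i;

	for (i = 0; i < nkeys; i++)
	{
		const char *sortop = ordering[i];

		if (backward)
		{
			sortop = toy_get_commutator(sortop);
			if (sortop == NULL)
				break;			/* can't represent any further sort keys */
		}
		printf("pathkey %d: ordered by %s\n", i + 1, sortop);
	}
	return 0;
}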
/*
* Find a var in a relation's targetlist that matches an indexkey attrnum.
* Find or make a Var node for the specified attribute of the rel.
*
* We first look for the var in the rel's target list, because that's
* easy and fast. But the var might not be there (this should normally
* only happen for vars that are used in WHERE restriction clauses,
* but not in join clauses or in the SELECT target list). In that case,
* gin up a Var node the hard way.
*/
static Var *
find_indexkey_var(int indexkey, List *tlist)
find_indexkey_var(Query *root, RelOptInfo *rel, AttrNumber varattno)
{
List *temp;
int relid;
Oid reloid,
vartypeid;
int32 type_mod;
foreach(temp, tlist)
foreach(temp, rel->targetlist)
{
Var *tle_var = get_expr(lfirst(temp));
if (IsA(tle_var, Var) && tle_var->varattno == indexkey)
if (IsA(tle_var, Var) && tle_var->varattno == varattno)
return tle_var;
}
return NULL;
relid = lfirsti(rel->relids);
reloid = getrelid(relid, root->rtable);
vartypeid = get_atttype(reloid, varattno);
type_mod = get_atttypmod(reloid, varattno);
return makeVar(relid, varattno, vartypeid, type_mod, 0);
}
/*
@@ -360,164 +588,33 @@ find_indexkey_var(int indexkey, List *tlist)
* Build the path keys for a join relation constructed by mergejoin or
* nestloop join. These keys should include all the path key vars of the
* outer path (since the join will retain the ordering of the outer path)
* plus any vars of the inner path that are mergejoined to the outer vars.
* plus any vars of the inner path that are equijoined to the outer vars.
*
* Per the discussion at the top of this file, mergejoined inner vars
* Per the discussion at the top of this file, equijoined inner vars
* can be considered path keys of the result, just the same as the outer
* vars they were joined with.
*
* We can also use inner path vars as pathkeys of a nestloop join, but we
* must be careful that we only consider equijoin clauses and not general
* join clauses. For example, "t1.a < t2.b" might be a join clause of a
* nestloop, but it doesn't result in b acquiring the ordering of a!
* joinpath.c handles that problem by only passing this routine clauses
* that are marked mergejoinable, even if a nestloop join is being built.
* Therefore we only have 't1.a = t2.b' style clauses, and can expect that
* the inner var will acquire the outer's ordering no matter which join
* method is actually used.
*
* We drop pathkeys that are not vars of the join relation's tlist,
* on the assumption that they are not interesting to higher levels.
* (Is this correct?? To support expression pathkeys we might want to
* check that all vars mentioned in the key are in the tlist, instead.)
*
* All vars in the result are taken from the join relation's tlist,
* not from the given pathkeys or joinclauses.
* vars they were joined with; furthermore, it doesn't matter what kind
* of join algorithm is actually used.
*
* 'outer_pathkeys' is the list of the outer path's path keys
* 'join_rel_tlist' is the target list of the join relation
* 'joinclauses' is the list of mergejoinable clauses to consider (note this
* is a list of RestrictInfos, not just bare qual clauses); can be NIL
* 'equi_key_list' is the query's list of pathkeyitem equivalence sets
*
* Returns the list of new path keys.
*
*/
List *
build_join_pathkeys(List *outer_pathkeys,
List *join_rel_tlist,
List *joinclauses)
List *equi_key_list)
{
List *final_pathkeys = NIL;
List *i;
foreach(i, outer_pathkeys)
{
List *outer_pathkey = lfirst(i);
List *new_pathkey;
new_pathkey = build_join_pathkey(outer_pathkey, join_rel_tlist,
joinclauses);
/* if we can find no sortable vars for the n'th sort key,
* then we're done generating pathkeys; any subsequent sort keys
* no longer apply, since we can't represent the ordering properly.
*/
if (new_pathkey == NIL)
break;
final_pathkeys = lappend(final_pathkeys, new_pathkey);
}
return final_pathkeys;
}
/*
* build_join_pathkey
* Generate an individual pathkey sublist, consisting of the outer vars
* already mentioned in 'pathkey' plus any inner vars that are joined to
* them (and thus can now also be considered path keys, per discussion
* at the top of this file).
*
* Note that each returned pathkey uses the var node found in
* 'join_rel_tlist' rather than the input pathkey or joinclause var node.
* (Is this important?)
*
* Returns a new pathkey (list of PathKeyItems).
*/
static List *
build_join_pathkey(List *pathkey,
List *join_rel_tlist,
List *joinclauses)
{
List *new_pathkey = NIL;
List *i,
*j;
foreach(i, pathkey)
{
PathKeyItem *key = (PathKeyItem *) lfirst(i);
Node *tlist_key;
Assert(key && IsA(key, PathKeyItem));
tlist_key = matching_tlist_expr(key->key, join_rel_tlist);
if (tlist_key)
new_pathkey = lcons(makePathKeyItem(tlist_key,
key->sortop),
new_pathkey);
foreach(j, joinclauses)
{
RestrictInfo *restrictinfo = (RestrictInfo *) lfirst(j);
Expr *joinclause = restrictinfo->clause;
/* We assume the clause is a binary opclause... */
Node *l = (Node *) get_leftop(joinclause);
Node *r = (Node *) get_rightop(joinclause);
Node *other_var = NULL;
Oid other_sortop = InvalidOid;
if (equal(key->key, l))
{
other_var = r;
other_sortop = restrictinfo->right_sortop;
}
else if (equal(key->key, r))
{
other_var = l;
other_sortop = restrictinfo->left_sortop;
}
if (other_var && other_sortop)
{
tlist_key = matching_tlist_expr(other_var, join_rel_tlist);
if (tlist_key)
new_pathkey = lcons(makePathKeyItem(tlist_key,
other_sortop),
new_pathkey);
}
}
}
return new_pathkey;
}
/*
* commute_pathkeys
* Attempt to commute the operators in a set of pathkeys, producing
* pathkeys that describe the reverse sort order (DESC instead of ASC).
* Returns TRUE if successful (all the operators have commutators).
*
* CAUTION: given pathkeys are modified in place, even if not successful!!
* Usually, caller should have just built or copied the pathkeys list to
* ensure there are no unwanted side-effects.
*/
bool
commute_pathkeys(List *pathkeys)
{
List *i;
foreach(i, pathkeys)
{
List *pathkey = lfirst(i);
List *j;
foreach(j, pathkey)
{
PathKeyItem *key = lfirst(j);
key->sortop = get_commutator(key->sortop);
if (key->sortop == InvalidOid)
return false;
}
}
return true; /* successful */
/*
* This used to be quite a complex bit of code, but now that all
* pathkey sublists start out life canonicalized, we don't have to
* do a darn thing here! The inner-rel vars we used to need to add
* are *already* part of the outer pathkey!
*
* I'd remove the routine entirely, but maybe someday we'll need it...
*/
return outer_pathkeys;
}
/****************************************************************************
@@ -529,11 +626,18 @@ commute_pathkeys(List *pathkeys)
* Generate a pathkeys list that represents the sort order specified
* by a list of SortClauses (GroupClauses will work too!)
*
* NB: the result is NOT in canonical form, but must be passed through
* canonicalize_pathkeys() before it can be used for comparisons or
* labeling relation sort orders. (We do things this way because
* union_planner needs to be able to construct requested pathkeys before
* the pathkey equivalence sets have been created for the query.)
*
* 'sortclauses' is a list of SortClause or GroupClause nodes
* 'tlist' is the targetlist to find the referenced tlist entries in
*/
List *
make_pathkeys_for_sortclauses(List *sortclauses, List *tlist)
make_pathkeys_for_sortclauses(List *sortclauses,
List *tlist)
{
List *pathkeys = NIL;
List *i;
@@ -546,7 +650,11 @@ make_pathkeys_for_sortclauses(List *sortclauses, List *tlist)
sortkey = get_sortgroupclause_expr(sortcl, tlist);
pathkey = makePathKeyItem(sortkey, sortcl->sortop);
/* pathkey becomes a one-element sublist */
/*
* The pathkey becomes a one-element sublist, for now;
* canonicalize_pathkeys() might replace it with a longer
* sublist later.
*/
pathkeys = lappend(pathkeys, lcons(pathkey, NIL));
}
return pathkeys;
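/*
 * Illustrative sketch, not part of this commit: why the two-step build
 * above matters.  An ORDER BY item first becomes a one-element pathkey
 * sublist; once the query's equivalence sets exist, canonicalization can
 * replace it with the full shared set, so that e.g. "ORDER BY b.y" is
 * recognized as satisfied by a path sorted on a.x when WHERE says
 * a.x = b.y.  The string representation, equi_sets and toy_canonicalize
 * are invented stand-ins for the real equi_key_list machinery.
 */
#include <stdio.h>
#include <string.h>

static const char *equi_sets[] = {"{A.X B.Y C.Z}", "{D.R E.S}"};

static const char *
toy_canonicalize(const char *single_key)
{
	int			i;

	for (i = 0; i < 2; i++)
		if (strstr(equi_sets[i], single_key) != NULL)
			return equi_sets[i];	/* use the shared equivalence set */
	return single_key;			/* no equijoin peers: stays a singleton */
}

int
main(void)
{
	/* ORDER BY b.y, f.q  ==> one-element sublists first, then canonicalize */
	printf("%s\n", toy_canonicalize("B.Y"));	/* -> {A.X B.Y C.Z} */
	printf("%s\n", toy_canonicalize("F.Q"));	/* -> F.Q (singleton) */
	return 0;
}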
@@ -599,6 +707,7 @@ find_mergeclauses_for_pathkeys(List *pathkeys, List *restrictinfos)
{
PathKeyItem *keyitem = lfirst(j);
Node *key = keyitem->key;
Oid keyop = keyitem->sortop;
List *k;
foreach(k, restrictinfos)
@@ -607,8 +716,10 @@ find_mergeclauses_for_pathkeys(List *pathkeys, List *restrictinfos)
Assert(restrictinfo->mergejoinoperator != InvalidOid);
if ((equal(key, get_leftop(restrictinfo->clause)) ||
equal(key, get_rightop(restrictinfo->clause))) &&
if (((keyop == restrictinfo->left_sortop &&
equal(key, get_leftop(restrictinfo->clause))) ||
(keyop == restrictinfo->right_sortop &&
equal(key, get_rightop(restrictinfo->clause)))) &&
! member(restrictinfo, mergeclauses))
{
matched_restrictinfo = restrictinfo;
@@ -645,7 +756,7 @@ find_mergeclauses_for_pathkeys(List *pathkeys, List *restrictinfos)
* 'mergeclauses' is a list of RestrictInfos for mergejoin clauses
* that will be used in a merge join.
* 'tlist' is a relation target list for either the inner or outer
* side of the proposed join rel.
* side of the proposed join rel. (Not actually needed anymore)
*
* Returns a pathkeys list that can be applied to the indicated relation.
*
@@ -654,7 +765,9 @@ find_mergeclauses_for_pathkeys(List *pathkeys, List *restrictinfos)
* just make the keys, eh?
*/
List *
make_pathkeys_for_mergeclauses(List *mergeclauses, List *tlist)
make_pathkeys_for_mergeclauses(Query *root,
List *mergeclauses,
List *tlist)
{
List *pathkeys = NIL;
List *i;
@@ -664,32 +777,24 @@ make_pathkeys_for_mergeclauses(List *mergeclauses, List *tlist)
RestrictInfo *restrictinfo = (RestrictInfo *) lfirst(i);
Node *key;
Oid sortop;
PathKeyItem *item;
Assert(restrictinfo->mergejoinoperator != InvalidOid);
/*
* Find the key and sortop needed for this mergeclause.
*
* We can use either side of the mergeclause, since we haven't yet
* committed to which side will be inner.
* Both sides of the mergeclause should appear in one of the
* query's pathkey equivalence classes, so it doesn't matter
* which one we use here.
*/
key = matching_tlist_expr((Node *) get_leftop(restrictinfo->clause),
tlist);
key = (Node *) get_leftop(restrictinfo->clause);
sortop = restrictinfo->left_sortop;
if (! key)
{
key = matching_tlist_expr((Node *) get_rightop(restrictinfo->clause),
tlist);
sortop = restrictinfo->right_sortop;
}
if (! key)
elog(ERROR, "make_pathkeys_for_mergeclauses: can't find key");
/*
* Add a pathkey sublist for this sort item
*/
pathkeys = lappend(pathkeys,
lcons(makePathKeyItem(key, sortop),
NIL));
item = makePathKeyItem(key, sortop);
pathkeys = lappend(pathkeys, make_canonical_pathkey(root, item));
}
return pathkeys;


@@ -9,7 +9,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/optimizer/path/tidpath.c,v 1.4 2000/02/07 04:40:59 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/optimizer/path/tidpath.c,v 1.5 2000/02/15 20:49:17 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -36,7 +36,7 @@
#include "parser/parsetree.h"
#include "utils/lsyscache.h"
static List *create_tidscan_joinpaths(RelOptInfo *);
static void create_tidscan_joinpaths(RelOptInfo *rel);
static List *TidqualFromRestrictinfo(List *relids, List *restrictinfo);
static bool isEvaluable(int varno, Node *node);
static Node *TidequalClause(int varno, Expr *node);
@@ -234,61 +234,54 @@ TidqualFromRestrictinfo(List *relids, List *restrictinfo)
/*
* create_tidscan_joinpaths
* Creates a path corresponding to a tid_direct scan, returning the
* pathnode.
* Create innerjoin paths if there are suitable joinclauses.
*
* XXX does this actually work?
*/
List *
static void
create_tidscan_joinpaths(RelOptInfo *rel)
{
List *rlst = NIL,
*lst;
TidPath *pathnode = (TidPath *) NULL;
List *restinfo,
*tideval;
foreach (lst, rel->joininfo)
{
JoinInfo *joininfo = (JoinInfo *)lfirst(lst);
JoinInfo *joininfo = (JoinInfo *) lfirst(lst);
List *restinfo,
*tideval;
restinfo = joininfo->jinfo_restrictinfo;
tideval = TidqualFromRestrictinfo(rel->relids, restinfo);
if (length(tideval) == 1)
{
pathnode = makeNode(TidPath);
TidPath *pathnode = makeNode(TidPath);
pathnode->path.pathtype = T_TidScan;
pathnode->path.parent = rel;
pathnode->path.pathkeys = NIL;
pathnode->path.path_cost = cost_tidscan(rel, tideval);
pathnode->tideval = tideval;
pathnode->unjoined_relids = joininfo->unjoined_relids;
cost_tidscan(&pathnode->path, rel, tideval);
rlst = lappend(rlst, pathnode);
}
}
rel->innerjoin = nconc(rel->innerjoin, rlst);
return rlst;
}
/*
* create_tidscan_paths
* Creates a path corresponding to a tid direct scan, returning the
* pathnode List.
*
* Creates paths corresponding to tid direct scans of the given rel.
* Candidate paths are added to the rel's pathlist (using add_path).
*/
List *
void
create_tidscan_paths(Query *root, RelOptInfo *rel)
{
List *rlst = NIL;
TidPath *pathnode = (TidPath *) NULL;
List *tideval = TidqualFromRestrictinfo(rel->relids,
rel->baserestrictinfo);
if (tideval)
pathnode = create_tidscan_path(rel, tideval);
if (pathnode)
rlst = lcons(pathnode, rlst);
add_path(rel, (Path *) create_tidscan_path(rel, tideval));
create_tidscan_joinpaths(rel);
return rlst;
}