Improve planner's handling of SetOp plans.

Remove the code for inserting flag columns in the inputs of a SetOp. That was the only reason why there would be resjunk columns in a set-operations plan tree, so we can get rid of some code that supported that, too.

Get rid of choose_hashed_setop() in favor of building Paths for the hashed and sorted alternatives, and letting them fight it out within add_path().

Remove set_operation_ordered_results_useful(), which was giving wrong answers due to examining the wrong ancestor node: we need to examine the immediate SetOperationStmt parent not the topmost node. Instead make each caller of recurse_set_operations() pass down the relevant parent node. (This thinko seems to have led only to wasted planning cycles and possibly-inferior plans, not wrong query answers. Perhaps we should back-patch it, but I'm not doing so right now.)

Teach generate_nonunion_paths() to consider pre-sorted inputs for sorted SetOps, rather than always generating a Sort node.

Patch by me; thanks to Richard Guo and David Rowley for review.

Discussion: https://postgr.es/m/1850138.1731549611@sss.pgh.pa.us
2025-11-12 05:01:15 +03:00 · 2024-12-19 17:02:25 -05:00
parent 2762792952
commit 8d96f57d5c
8 changed files with 365 additions and 332 deletions
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -616,7 +616,7 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions,
 * setops is used for set operation subqueries to provide the subquery with
 * the context in which it's being used so that Paths correctly sorted for the
 * set operation can be generated.  NULL when not planning a set operation
- * child.
+ * child, or when a child of a set op that isn't interested in sorted input.
 *
 * Basically, this routine does the stuff that should only be done once
 * per Query object.  It then calls grouping_planner.  At one time,
@@ -1350,7 +1350,7 @@ preprocess_phv_expression(PlannerInfo *root, Expr *expr)
 * setops is used for set operation subqueries to provide the subquery with
 * the context in which it's being used so that Paths correctly sorted for the
 * set operation can be generated.  NULL when not planning a set operation
- * child.
+ * child, or when a child of a set op that isn't interested in sorted input.
 *
 * Returns nothing; the useful output is in the Paths we attach to the
 * (UPPERREL_FINAL, NULL) upperrel in *root.  In addition,
@@ -3467,8 +3467,7 @@ standard_qp_callback(PlannerInfo *root, void *extra)
 									  tlist);
 	/* setting setop_pathkeys might be useful to the union planner */
-	if (qp_extra->setop != NULL &&
+	if (qp_extra->setop != NULL)
 		set_operation_ordered_results_useful(qp_extra->setop))
 	{
 		List	   *groupClauses;
 		bool		sortable;
--- a/src/backend/optimizer/prep/prepunion.c
+++ b/src/backend/optimizer/prep/prepunion.c
@@ -39,9 +39,9 @@
 static RelOptInfo *recurse_set_operations(Node *setOp, PlannerInfo *root,
 										  SetOperationStmt *parentOp,
 										  List *colTypes, List *colCollations,
-										  bool junkOK,
+										  List *refnames_tlist,
 										  int flag, List *refnames_tlist,
 										  List **pTargetList,
 										  bool *istrivial_tlist);
 static RelOptInfo *generate_recursion_path(SetOperationStmt *setOp,
@@ -64,19 +64,13 @@ static List *plan_union_children(PlannerInfo *root,
 								 List **tlist_list,
 								 List **istrivial_tlist);
 static void postprocess_setop_rel(PlannerInfo *root, RelOptInfo *rel);
 static bool choose_hashed_setop(PlannerInfo *root, List *groupClauses,
 								Path *lpath, Path *rpath,
 								double dNumGroups, double dNumOutputRows,
 								const char *construct);
 static List *generate_setop_tlist(List *colTypes, List *colCollations,
 								  int flag,
 								  Index varno,
 								  bool hack_constants,
 								  List *input_tlist,
 								  List *refnames_tlist,
 								  bool *trivial_tlist);
 static List *generate_append_tlist(List *colTypes, List *colCollations,
 								   bool flag,
 								   List *input_tlists,
 								   List *refnames_tlist);
 static List *generate_setop_grouplist(SetOperationStmt *op, List *targetlist);
@@ -160,12 +154,11 @@ plan_set_operations(PlannerInfo *root)
 		/*
 		 * Recurse on setOperations tree to generate paths for set ops. The
 		 * final output paths should have just the column types shown as the
-		 * output from the top-level node, plus possibly resjunk working
+		 * output from the top-level node.
 		 * columns (we can rely on upper-level nodes to deal with that).
 		 */
 		setop_rel = recurse_set_operations((Node *) topop, root,
 										   NULL,	/* no parent */
 										   topop->colTypes, topop->colCollations,
 										   true, -1,
 										   leftmostQuery->targetList,
 										   &top_tlist,
 										   &trivial_tlist);
@@ -177,50 +170,36 @@ plan_set_operations(PlannerInfo *root)
 	return setop_rel;
 }
 /*
 * set_operation_ordered_results_useful
 *		Return true if the given SetOperationStmt can be executed by utilizing
 *		paths that provide sorted input according to the setop's targetlist.
 *		Returns false when sorted paths are not any more useful than unsorted
 *		ones.
 */
 bool
 set_operation_ordered_results_useful(SetOperationStmt *setop)
 {
 	/*
 	 * Paths sorted by the targetlist are useful for UNION as we can opt to
 	 * MergeAppend the sorted paths then Unique them.  Ordered paths are no
 	 * more useful than unordered ones for UNION ALL.
 	 */
 	if (!setop->all && setop->op == SETOP_UNION)
 		return true;
 	/*
 	 * EXCEPT / EXCEPT ALL / INTERSECT / INTERSECT ALL cannot yet utilize
 	 * correctly sorted input paths.
 	 */
 	return false;
 }
 /*
 * recurse_set_operations
 *	  Recursively handle one step in a tree of set operations
 *
 * setOp: current step (could be a SetOperationStmt or a leaf RangeTblRef)
 * parentOp: parent step, or NULL if none (but see below)
 * colTypes: OID list of set-op's result column datatypes
 * colCollations: OID list of set-op's result column collations
 * junkOK: if true, child resjunk columns may be left in the result
 * flag: if >= 0, add a resjunk output column indicating value of flag
 * refnames_tlist: targetlist to take column names from
 *
 * parentOp should be passed as NULL unless that step is interested in
 * getting sorted output from this step.  ("Sorted" means "sorted according
 * to the default btree opclasses of the result column datatypes".)
 *
 * Returns a RelOptInfo for the subtree, as well as these output parameters:
 * *pTargetList: receives the fully-fledged tlist for the subtree's top plan
 * *istrivial_tlist: true if, and only if, datatypes between parent and child
 * match.
 *
 * If setOp is a leaf node, this function plans the sub-query but does
 * not populate the pathlist of the returned RelOptInfo.  The caller will
 * generate SubqueryScan paths using useful path(s) of the subquery (see
 * build_setop_child_paths).  But this function does build the paths for
 * set-operation nodes.
 *
 * The pTargetList output parameter is mostly redundant with the pathtarget
 * of the returned RelOptInfo, but for the moment we need it because much of
 * the logic in this file depends on flag columns being marked resjunk.
- * Pending a redesign of how that works, this is the easy way out.
+ * XXX Now that there are no flag columns and hence no resjunk columns, we
 * could probably refactor this file to deal only in pathtargets.
 *
 * We don't have to care about typmods here: the only allowed difference
 * between set-op input and output typmods is input is a specific typmod
@@ -228,9 +207,9 @@ set_operation_ordered_results_useful(SetOperationStmt *setop)
 */
 static RelOptInfo *
 recurse_set_operations(Node *setOp, PlannerInfo *root,
 					   SetOperationStmt *parentOp,
 					   List *colTypes, List *colCollations,
-					   bool junkOK,
+					   List *refnames_tlist,
 					   int flag, List *refnames_tlist,
 					   List **pTargetList,
 					   bool *istrivial_tlist)
 {
@@ -245,7 +224,6 @@ recurse_set_operations(Node *setOp, PlannerInfo *root,
 	{
 		RangeTblRef *rtr = (RangeTblRef *) setOp;
 		RangeTblEntry *rte = root->simple_rte_array[rtr->rtindex];
 		SetOperationStmt *setops;
 		Query	   *subquery = rte->subquery;
 		PlannerInfo *subroot;
 		List	   *tlist;
@@ -260,15 +238,13 @@ recurse_set_operations(Node *setOp, PlannerInfo *root,
 		Assert(root->plan_params == NIL);
 		/*
-		 * Pass the set operation details to the subquery_planner to have it
+		 * Generate a subroot and Paths for the subquery.  If we have a
-		 * consider generating Paths correctly ordered for the set operation.
+		 * parentOp, pass that down to encourage subquery_planner to consider
 		 * suitably-sorted Paths.
 		 */
 		setops = castNode(SetOperationStmt, root->parse->setOperations);
 		/* Generate a subroot and Paths for the subquery */
 		subroot = rel->subroot = subquery_planner(root->glob, subquery, root,
 												  false, root->tuple_fraction,
-												  setops);
+												  parentOp);
 		/*
 		 * It should not be possible for the primitive query to contain any
@@ -279,7 +255,6 @@ recurse_set_operations(Node *setOp, PlannerInfo *root,
 		/* Figure out the appropriate target list for this subquery. */
 		tlist = generate_setop_tlist(colTypes, colCollations,
 									 flag,
 									 rtr->rtindex,
 									 true,
 									 subroot->processed_tlist,
@@ -318,16 +293,14 @@ recurse_set_operations(Node *setOp, PlannerInfo *root,
 		 * generate_append_tlist() or generate_setop_tlist(), this will work.
 		 * We just tell generate_setop_tlist() to use varno 0.
 		 */
-		if (flag >= 0 ||
+		if (!tlist_same_datatypes(*pTargetList, colTypes, false) ||
-			!tlist_same_datatypes(*pTargetList, colTypes, junkOK) ||
+			!tlist_same_collations(*pTargetList, colCollations, false))
 			!tlist_same_collations(*pTargetList, colCollations, junkOK))
 		{
 			PathTarget *target;
 			bool		trivial_tlist;
 			ListCell   *lc;
 			*pTargetList = generate_setop_tlist(colTypes, colCollations,
 												flag,
 												0,
 												false,
 												*pTargetList,
@@ -410,8 +383,8 @@ generate_recursion_path(SetOperationStmt *setOp, PlannerInfo *root,
 	 * separately without any intention of combining them into one Append.
 	 */
 	lrel = recurse_set_operations(setOp->larg, root,
 								  NULL, /* no value in sorted results */
 								  setOp->colTypes, setOp->colCollations,
 								  false, -1,
 								  refnames_tlist,
 								  &lpath_tlist,
 								  &lpath_trivial_tlist);
@@ -422,8 +395,8 @@ generate_recursion_path(SetOperationStmt *setOp, PlannerInfo *root,
 	/* The right path will want to look at the left one ... */
 	root->non_recursive_path = lpath;
 	rrel = recurse_set_operations(setOp->rarg, root,
 								  NULL, /* no value in sorted results */
 								  setOp->colTypes, setOp->colCollations,
 								  false, -1,
 								  refnames_tlist,
 								  &rpath_tlist,
 								  &rpath_trivial_tlist);
@@ -436,7 +409,7 @@ generate_recursion_path(SetOperationStmt *setOp, PlannerInfo *root,
 	/*
 	 * Generate tlist for RecursiveUnion path node --- same as in Append cases
 	 */
-	tlist = generate_append_tlist(setOp->colTypes, setOp->colCollations, false,
+	tlist = generate_append_tlist(setOp->colTypes, setOp->colCollations,
 								  list_make2(lpath_tlist, rpath_tlist),
 								  refnames_tlist);
@@ -495,6 +468,10 @@ generate_recursion_path(SetOperationStmt *setOp, PlannerInfo *root,
 * build_setop_child_paths
 *		Build paths for the set op child relation denoted by 'rel'.
 *
 * 'rel' is an RTE_SUBQUERY relation.  We have already generated paths within
 * the subquery's subroot; the task here is to create SubqueryScan paths for
 * 'rel', representing scans of the useful subquery paths.
 *
 * interesting_pathkeys: if not NIL, also include paths that suit these
 * pathkeys, sorting any unsorted paths as required.
 * *pNumGroups: if not NULL, we estimate the number of distinct groups
@@ -736,7 +713,7 @@ generate_union_paths(SetOperationStmt *op, PlannerInfo *root,
 	 * concerned, but we must make it look real anyway for the benefit of the
 	 * next plan level up.
 	 */
-	tlist = generate_append_tlist(op->colTypes, op->colCollations, false,
+	tlist = generate_append_tlist(op->colTypes, op->colCollations,
 								  tlist_list, refnames_tlist);
 	*pTargetList = tlist;
@@ -1033,11 +1010,13 @@ generate_nonunion_paths(SetOperationStmt *op, PlannerInfo *root,
 	bool		lpath_trivial_tlist,
 				rpath_trivial_tlist,
 				result_trivial_tlist;
 	List	   *nonunion_pathkeys = NIL;
 	double		dLeftGroups,
 				dRightGroups,
 				dNumGroups,
 				dNumOutputRows;
-	bool		use_hash;
+	bool		can_sort;
 	bool		can_hash;
 	SetOpCmd	cmd;
 	/*
@@ -1047,26 +1026,69 @@ generate_nonunion_paths(SetOperationStmt *op, PlannerInfo *root,
 	/* Recurse on children */
 	lrel = recurse_set_operations(op->larg, root,
 								  op,
 								  op->colTypes, op->colCollations,
 								  false, -1,
 								  refnames_tlist,
 								  &lpath_tlist,
 								  &lpath_trivial_tlist);
 	if (lrel->rtekind == RTE_SUBQUERY)
 		build_setop_child_paths(root, lrel, lpath_trivial_tlist, lpath_tlist,
 								NIL, &dLeftGroups);
 	else
 		dLeftGroups = lrel->rows;
 	rrel = recurse_set_operations(op->rarg, root,
 								  op,
 								  op->colTypes, op->colCollations,
 								  false, -1,
 								  refnames_tlist,
 								  &rpath_tlist,
 								  &rpath_trivial_tlist);
 	/*
 	 * Generate tlist for SetOp plan node.
 	 *
 	 * The tlist for a SetOp plan isn't important so far as the SetOp is
 	 * concerned, but we must make it look real anyway for the benefit of the
 	 * next plan level up.
 	 */
 	tlist = generate_setop_tlist(op->colTypes, op->colCollations,
 								 0, false, lpath_tlist, refnames_tlist,
 								 &result_trivial_tlist);
 	/* We should not have needed any type coercions in the tlist */
 	Assert(result_trivial_tlist);
 	*pTargetList = tlist;
 	/* Identify the grouping semantics */
 	groupList = generate_setop_grouplist(op, tlist);
 	/* Check whether the operators support sorting or hashing */
 	can_sort = grouping_is_sortable(groupList);
 	can_hash = grouping_is_hashable(groupList);
 	if (!can_sort && !can_hash)
 		ereport(ERROR,
 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 		/* translator: %s is INTERSECT or EXCEPT */
 				 errmsg("could not implement %s",
 						(op->op == SETOP_INTERSECT) ? "INTERSECT" : "EXCEPT"),
 				 errdetail("Some of the datatypes only support hashing, while others only support sorting.")));
 	if (can_sort)
 	{
 		/* Determine the pathkeys for sorting by the whole target list */
 		nonunion_pathkeys = make_pathkeys_for_sortclauses(root, groupList,
 														  tlist);
 		root->query_pathkeys = nonunion_pathkeys;
 	}
 	/*
 	 * Now that we've got all that info, we can build the child paths.
 	 */
 	if (lrel->rtekind == RTE_SUBQUERY)
 		build_setop_child_paths(root, lrel, lpath_trivial_tlist, lpath_tlist,
 								nonunion_pathkeys, &dLeftGroups);
 	else
 		dLeftGroups = lrel->rows;
 	if (rrel->rtekind == RTE_SUBQUERY)
 		build_setop_child_paths(root, rrel, rpath_trivial_tlist, rpath_tlist,
-								NIL, &dRightGroups);
+								nonunion_pathkeys, &dRightGroups);
 	else
 		dRightGroups = rrel->rows;
@@ -1102,30 +1124,11 @@ generate_nonunion_paths(SetOperationStmt *op, PlannerInfo *root,
 	lpath = lrel->cheapest_total_path;
 	rpath = rrel->cheapest_total_path;
 	/*
 	 * Generate tlist for SetOp plan node.
 	 *
 	 * The tlist for a SetOp plan isn't important so far as the SetOp is
 	 * concerned, but we must make it look real anyway for the benefit of the
 	 * next plan level up.
 	 */
 	tlist = generate_setop_tlist(op->colTypes, op->colCollations, -1,
 								 0, false, lpath_tlist, refnames_tlist,
 								 &result_trivial_tlist);
 	/* We should not have needed any type coercions in the tlist */
 	Assert(result_trivial_tlist);
 	*pTargetList = tlist;
 	/* Build result relation. */
 	result_rel = fetch_upper_rel(root, UPPERREL_SETOP,
 								 bms_union(lrel->relids, rrel->relids));
 	result_rel->reltarget = create_pathtarget(root, tlist);
 	/* Identify the grouping semantics */
 	groupList = generate_setop_grouplist(op, tlist);
 	/*
 	 * Estimate number of distinct groups that we'll need hashtable entries
 	 * for; this is the size of the left-hand input for EXCEPT, or the smaller
@@ -1144,41 +1147,9 @@ generate_nonunion_paths(SetOperationStmt *op, PlannerInfo *root,
 		dNumGroups = dLeftGroups;
 		dNumOutputRows = op->all ? Min(lpath->rows, rpath->rows) : dNumGroups;
 	}
 	result_rel->rows = dNumOutputRows;
-	/*
+	/* Select the SetOpCmd type */
 	 * Decide whether to hash or sort, and add sort nodes if needed.
 	 */
 	use_hash = choose_hashed_setop(root, groupList, lpath, rpath,
 								   dNumGroups, dNumOutputRows,
 								   (op->op == SETOP_INTERSECT) ? "INTERSECT" : "EXCEPT");
 	if (groupList && !use_hash)
 	{
 		List	   *pathkeys;
 		pathkeys = make_pathkeys_for_sortclauses(root,
 												 groupList,
 												 lpath_tlist);
 		if (!pathkeys_contained_in(pathkeys, lpath->pathkeys))
 			lpath = (Path *) create_sort_path(root,
 											  lpath->parent,
 											  lpath,
 											  pathkeys,
 											  -1.0);
 		pathkeys = make_pathkeys_for_sortclauses(root,
 												 groupList,
 												 rpath_tlist);
 		if (!pathkeys_contained_in(pathkeys, rpath->pathkeys))
 			rpath = (Path *) create_sort_path(root,
 											  rpath->parent,
 											  rpath,
 											  pathkeys,
 											  -1.0);
 	}
 	/*
 	 * Finally, add a SetOp path node to generate the correct output.
 	 */
 	switch (op->op)
 	{
 		case SETOP_INTERSECT:
@@ -1192,18 +1163,90 @@ generate_nonunion_paths(SetOperationStmt *op, PlannerInfo *root,
 			cmd = SETOPCMD_INTERSECT;	/* keep compiler quiet */
 			break;
 	}
 	path = (Path *) create_setop_path(root,
 									  result_rel,
 									  lpath,
 									  rpath,
 									  cmd,
 									  use_hash ? SETOP_HASHED : SETOP_SORTED,
 									  groupList,
 									  dNumGroups,
 									  dNumOutputRows);
-	result_rel->rows = path->rows;
+	/*
-	add_path(result_rel, path);
+	 * If we can hash, that just requires a SetOp atop the cheapest inputs.
 	 */
 	if (can_hash)
 	{
 		path = (Path *) create_setop_path(root,
 										  result_rel,
 										  lpath,
 										  rpath,
 										  cmd,
 										  SETOP_HASHED,
 										  groupList,
 										  dNumGroups,
 										  dNumOutputRows);
 		add_path(result_rel, path);
 	}
 	/*
 	 * If we can sort, generate the cheapest sorted input paths, and add a
 	 * SetOp atop those.
 	 */
 	if (can_sort)
 	{
 		List	   *pathkeys;
 		Path	   *slpath,
 				   *srpath;
 		/* First the left input ... */
 		pathkeys = make_pathkeys_for_sortclauses(root,
 												 groupList,
 												 lpath_tlist);
 		if (pathkeys_contained_in(pathkeys, lpath->pathkeys))
 			slpath = lpath;		/* cheapest path is already sorted */
 		else
 		{
 			slpath = get_cheapest_path_for_pathkeys(lrel->pathlist,
 													nonunion_pathkeys,
 													NULL,
 													TOTAL_COST,
 													false);
 			/* Subquery failed to produce any presorted paths? */
 			if (slpath == NULL)
 				slpath = (Path *) create_sort_path(root,
 												   lpath->parent,
 												   lpath,
 												   pathkeys,
 												   -1.0);
 		}
 		/* and now the same for the right. */
 		pathkeys = make_pathkeys_for_sortclauses(root,
 												 groupList,
 												 rpath_tlist);
 		if (pathkeys_contained_in(pathkeys, rpath->pathkeys))
 			srpath = rpath;		/* cheapest path is already sorted */
 		else
 		{
 			srpath = get_cheapest_path_for_pathkeys(rrel->pathlist,
 													nonunion_pathkeys,
 													NULL,
 													TOTAL_COST,
 													false);
 			/* Subquery failed to produce any presorted paths? */
 			if (srpath == NULL)
 				srpath = (Path *) create_sort_path(root,
 												   rpath->parent,
 												   rpath,
 												   pathkeys,
 												   -1.0);
 		}
 		path = (Path *) create_setop_path(root,
 										  result_rel,
 										  slpath,
 										  srpath,
 										  cmd,
 										  SETOP_SORTED,
 										  groupList,
 										  dNumGroups,
 										  dNumOutputRows);
 		add_path(result_rel, path);
 	}
 	return result_rel;
 }
@@ -1259,17 +1302,15 @@ plan_union_children(PlannerInfo *root,
 		/*
 		 * Not same, so plan this child separately.
 		 *
-		 * Note we disallow any resjunk columns in child results.  This is
+		 * If top_union isn't a UNION ALL, then we are interested in sorted
-		 * necessary since the Append node that implements the union won't do
+		 * output from the child, so pass top_union as parentOp.  Note that
-		 * any projection, and upper levels will get confused if some of our
+		 * this isn't necessarily the child node's immediate SetOperationStmt
-		 * output tuples have junk and some don't.  This case only arises when
+		 * parent, but that's fine: it's the effective parent.
 		 * we have an EXCEPT or INTERSECT as child, else there won't be
 		 * resjunk anyway.
 		 */
 		result = lappend(result, recurse_set_operations(setOp, root,
 														top_union->all ? NULL : top_union,
 														top_union->colTypes,
 														top_union->colCollations,
 														false, -1,
 														refnames_tlist,
 														&child_tlist,
 														&trivial_tlist));
@@ -1298,121 +1339,11 @@ postprocess_setop_rel(PlannerInfo *root, RelOptInfo *rel)
 	set_cheapest(rel);
 }
 /*
 * choose_hashed_setop - should we use hashing for a set operation?
 *
 * XXX probably this should go away: just make both paths and let
 * add_path sort it out.
 */
 static bool
 choose_hashed_setop(PlannerInfo *root, List *groupClauses,
 					Path *lpath, Path *rpath,
 					double dNumGroups, double dNumOutputRows,
 					const char *construct)
 {
 	int			numGroupCols = list_length(groupClauses);
 	Size		hash_mem_limit = get_hash_memory_limit();
 	bool		can_sort;
 	bool		can_hash;
 	Size		hashentrysize;
 	Path		hashed_p;
 	Path		sorted_p;
 	double		tuple_fraction;
 	/* Check whether the operators support sorting or hashing */
 	can_sort = grouping_is_sortable(groupClauses);
 	can_hash = grouping_is_hashable(groupClauses);
 	if (can_hash && can_sort)
 	{
 		/* we have a meaningful choice to make, continue ... */
 	}
 	else if (can_hash)
 		return true;
 	else if (can_sort)
 		return false;
 	else
 		ereport(ERROR,
 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 		/* translator: %s is UNION, INTERSECT, or EXCEPT */
 				 errmsg("could not implement %s", construct),
 				 errdetail("Some of the datatypes only support hashing, while others only support sorting.")));
 	/* Prefer sorting when enable_hashagg is off */
 	if (!enable_hashagg)
 		return false;
 	/*
 	 * Don't do it if it doesn't look like the hashtable will fit into
 	 * hash_mem.
 	 */
 	hashentrysize = MAXALIGN(lpath->pathtarget->width) + MAXALIGN(SizeofMinimalTupleHeader);
 	if (hashentrysize * dNumGroups > hash_mem_limit)
 		return false;
 	/*
 	 * See if the estimated cost is no more than doing it the other way.
 	 *
 	 * We need to consider input_plan + hashagg versus input_plan + sort +
 	 * group. XXX NOT TRUE: Note that the actual result plan might involve a
 	 * SetOp or Unique node, not Agg or Group, but the cost estimates for Agg
 	 * and Group should be close enough for our purposes here.
 	 *
 	 * These path variables are dummies that just hold cost fields; we don't
 	 * make actual Paths for these steps.
 	 */
 	cost_agg(&hashed_p, root, AGG_HASHED, NULL,
 			 numGroupCols, dNumGroups,
 			 NIL,
 			 lpath->disabled_nodes + rpath->disabled_nodes,
 			 lpath->startup_cost + rpath->startup_cost,
 			 lpath->total_cost + rpath->total_cost,
 			 lpath->rows + rpath->rows,
 			 lpath->pathtarget->width);
 	/*
 	 * Now for the sorted case.  XXX NOT TRUE: Note that the input is *always*
 	 * unsorted, since it was made by appending unrelated sub-relations
 	 * together.
 	 */
 	sorted_p.disabled_nodes = lpath->disabled_nodes + rpath->disabled_nodes;
 	sorted_p.startup_cost = lpath->startup_cost + rpath->startup_cost;
 	sorted_p.total_cost = lpath->total_cost + rpath->total_cost;
 	/* XXX cost_sort doesn't actually look at pathkeys, so just pass NIL */
 	cost_sort(&sorted_p, root, NIL, sorted_p.disabled_nodes,
 			  sorted_p.total_cost,
 			  lpath->rows + rpath->rows,
 			  lpath->pathtarget->width,
 			  0.0, work_mem, -1.0);
 	cost_group(&sorted_p, root, numGroupCols, dNumGroups,
 			   NIL,
 			   sorted_p.disabled_nodes,
 			   sorted_p.startup_cost, sorted_p.total_cost,
 			   lpath->rows + rpath->rows);
 	/*
 	 * Now make the decision using the top-level tuple fraction.  First we
 	 * have to convert an absolute count (LIMIT) into fractional form.
 	 */
 	tuple_fraction = root->tuple_fraction;
 	if (tuple_fraction >= 1.0)
 		tuple_fraction /= dNumOutputRows;
 	if (compare_fractional_path_costs(&hashed_p, &sorted_p,
 									  tuple_fraction) < 0)
 	{
 		/* Hashed is cheaper, so use it */
 		return true;
 	}
 	return false;
 }
 /*
 * Generate targetlist for a set-operation plan node
 *
 * colTypes: OID list of set-op's result column datatypes
 * colCollations: OID list of set-op's result column collations
 * flag: -1 if no flag column needed, 0 or 1 to create a const flag column
 * varno: varno to use in generated Vars
 * hack_constants: true to copy up constants (see comments in code)
 * input_tlist: targetlist of this node's input node
@@ -1421,7 +1352,6 @@ choose_hashed_setop(PlannerInfo *root, List *groupClauses,
 */
 static List *
 generate_setop_tlist(List *colTypes, List *colCollations,
 					 int flag,
 					 Index varno,
 					 bool hack_constants,
 					 List *input_tlist,
@@ -1520,7 +1450,7 @@ generate_setop_tlist(List *colTypes, List *colCollations,
 							  false);
 		/*
-		 * By convention, all non-resjunk columns in a setop tree have
+		 * By convention, all output columns in a setop tree have
 		 * ressortgroupref equal to their resno.  In some cases the ref isn't
 		 * needed, but this is a cleaner way than modifying the tlist later.
 		 */
@@ -1529,25 +1459,6 @@ generate_setop_tlist(List *colTypes, List *colCollations,
 		tlist = lappend(tlist, tle);
 	}
 	if (flag >= 0)
 	{
 		/* Add a resjunk flag column */
 		/* flag value is the given constant */
 		expr = (Node *) makeConst(INT4OID,
 								  -1,
 								  InvalidOid,
 								  sizeof(int32),
 								  Int32GetDatum(flag),
 								  false,
 								  true);
 		tle = makeTargetEntry((Expr *) expr,
 							  (AttrNumber) resno++,
 							  pstrdup("flag"),
 							  true);
 		tlist = lappend(tlist, tle);
 		*trivial_tlist = false; /* the extra entry makes it not trivial */
 	}
 	return tlist;
 }
@@ -1556,7 +1467,6 @@ generate_setop_tlist(List *colTypes, List *colCollations,
 *
 * colTypes: OID list of set-op's result column datatypes
 * colCollations: OID list of set-op's result column collations
 * flag: true to create a flag column copied up from subplans
 * input_tlists: list of tlists for sub-plans of the Append
 * refnames_tlist: targetlist to take column names from
 *
@@ -1570,7 +1480,6 @@ generate_setop_tlist(List *colTypes, List *colCollations,
 */
 static List *
 generate_append_tlist(List *colTypes, List *colCollations,
 					  bool flag,
 					  List *input_tlists,
 					  List *refnames_tlist)
 {
@@ -1604,8 +1513,7 @@ generate_append_tlist(List *colTypes, List *colCollations,
 		{
 			TargetEntry *subtle = (TargetEntry *) lfirst(subtlistl);
-			if (subtle->resjunk)
+			Assert(!subtle->resjunk);
 				continue;
 			Assert(curColType != NULL);
 			if (exprType((Node *) subtle->expr) == lfirst_oid(curColType))
 			{
@@ -1654,7 +1562,7 @@ generate_append_tlist(List *colTypes, List *colCollations,
 							  false);
 		/*
-		 * By convention, all non-resjunk columns in a setop tree have
+		 * By convention, all output columns in a setop tree have
 		 * ressortgroupref equal to their resno.  In some cases the ref isn't
 		 * needed, but this is a cleaner way than modifying the tlist later.
 		 */
@@ -1663,23 +1571,6 @@ generate_append_tlist(List *colTypes, List *colCollations,
 		tlist = lappend(tlist, tle);
 	}
 	if (flag)
 	{
 		/* Add a resjunk flag column */
 		/* flag value is shown as copied up from subplan */
 		expr = (Node *) makeVar(0,
 								resno,
 								INT4OID,
 								-1,
 								InvalidOid,
 								0);
 		tle = makeTargetEntry((Expr *) expr,
 							  (AttrNumber) resno++,
 							  pstrdup("flag"),
 							  true);
 		tlist = lappend(tlist, tle);
 	}
 	pfree(colTypmods);
 	return tlist;
@@ -1709,12 +1600,7 @@ generate_setop_grouplist(SetOperationStmt *op, List *targetlist)
 		TargetEntry *tle = (TargetEntry *) lfirst(lt);
 		SortGroupClause *sgc;
-		if (tle->resjunk)
+		Assert(!tle->resjunk);
 		{
 			/* resjunk columns should not have sortgrouprefs */
 			Assert(tle->ressortgroupref == 0);
 			continue;			/* ignore resjunk columns */
 		}
 		/* non-resjunk columns should have sortgroupref = resno */
 		Assert(tle->ressortgroupref == tle->resno);
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -3681,17 +3681,70 @@ create_setop_path(PlannerInfo *root,
 	pathnode->numGroups = numGroups;
 	/*
-	 * Charge one cpu_operator_cost per comparison per input tuple. We assume
+	 * Compute cost estimates.  As things stand, we end up with the same total
-	 * all columns get compared at most of the tuples.
+	 * cost in this node for sort and hash methods, but different startup
-	 *
+	 * costs.  This could be refined perhaps, but it'll do for now.
 	 * XXX all wrong for hashing
 	 */
 	pathnode->path.disabled_nodes =
 		leftpath->disabled_nodes + rightpath->disabled_nodes;
-	pathnode->path.startup_cost =
+	if (strategy == SETOP_SORTED)
-		leftpath->startup_cost + rightpath->startup_cost;
+	{
-	pathnode->path.total_cost = leftpath->total_cost + rightpath->total_cost +
+		/*
-		cpu_operator_cost * (leftpath->rows + rightpath->rows) * list_length(groupList);
+		 * In sorted mode, we can emit output incrementally.  Charge one
 		 * cpu_operator_cost per comparison per input tuple.  Like cost_group,
 		 * we assume all columns get compared at most of the tuples.
 		 */
 		pathnode->path.startup_cost =
 			leftpath->startup_cost + rightpath->startup_cost;
 		pathnode->path.total_cost =
 			leftpath->total_cost + rightpath->total_cost +
 			cpu_operator_cost * (leftpath->rows + rightpath->rows) * list_length(groupList);
 		/*
 		 * Also charge a small amount per extracted tuple.  Like cost_sort,
 		 * charge only operator cost not cpu_tuple_cost, since SetOp does no
 		 * qual-checking or projection.
 		 */
 		pathnode->path.total_cost += cpu_operator_cost * outputRows;
 	}
 	else
 	{
 		Size		hashentrysize;
 		/*
 		 * In hashed mode, we must read all the input before we can emit
 		 * anything.  Also charge comparison costs to represent the cost of
 		 * hash table lookups.
 		 */
 		pathnode->path.startup_cost =
 			leftpath->total_cost + rightpath->total_cost +
 			cpu_operator_cost * (leftpath->rows + rightpath->rows) * list_length(groupList);
 		pathnode->path.total_cost = pathnode->path.startup_cost;
 		/*
 		 * Also charge a small amount per extracted tuple.  Like cost_sort,
 		 * charge only operator cost not cpu_tuple_cost, since SetOp does no
 		 * qual-checking or projection.
 		 */
 		pathnode->path.total_cost += cpu_operator_cost * outputRows;
 		/*
 		 * Mark the path as disabled if enable_hashagg is off.  While this
 		 * isn't exactly a HashAgg node, it seems close enough to justify
 		 * letting that switch control it.
 		 */
 		if (!enable_hashagg)
 			pathnode->path.disabled_nodes++;
 		/*
 		 * Also disable if it doesn't look like the hashtable will fit into
 		 * hash_mem.
 		 */
 		hashentrysize = MAXALIGN(leftpath->pathtarget->width) +
 			MAXALIGN(SizeofMinimalTupleHeader);
 		if (hashentrysize * numGroups > get_hash_memory_limit())
 			pathnode->path.disabled_nodes++;
 	}
 	pathnode->path.rows = outputRows;
 	return pathnode;
--- a/src/include/optimizer/prep.h
+++ b/src/include/optimizer/prep.h
@@ -53,6 +53,5 @@ extern void preprocess_aggrefs(PlannerInfo *root, Node *clause);
 * prototypes for prepunion.c
 */
 extern RelOptInfo *plan_set_operations(PlannerInfo *root);
-extern bool set_operation_ordered_results_useful(SetOperationStmt *setop);
 #endif							/* PREP_H */
--- a/src/test/regress/expected/subselect.out
+++ b/src/test/regress/expected/subselect.out
@@ -1221,8 +1221,10 @@ where o.ten = 0;
 (1 row)
 --
-- Test rescan of a SetOp node
+-- Test rescan of a hashed SetOp node
 --
 begin;
 set local enable_sort = off;
 explain (costs off)
 select count(*) from
  onek o cross join lateral (
@@ -1256,6 +1258,50 @@ where o.ten = 1;
   100
 (1 row)
 rollback;
 --
 -- Test rescan of a sorted SetOp node
 --
 begin;
 set local enable_hashagg = off;
 explain (costs off)
 select count(*) from
  onek o cross join lateral (
    select * from onek i1 where i1.unique1 = o.unique1
    except
    select * from onek i2 where i2.unique1 = o.unique2
  ) ss
 where o.ten = 1;
                                                                                                     QUERY PLAN                                                                                                      
 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 Aggregate
   ->  Nested Loop
         ->  Seq Scan on onek o
               Filter: (ten = 1)
         ->  SetOp Except
               ->  Sort
                     Sort Key: i1.unique1, i1.unique2, i1.two, i1.four, i1.ten, i1.twenty, i1.hundred, i1.thousand, i1.twothousand, i1.fivethous, i1.tenthous, i1.odd, i1.even, i1.stringu1, i1.stringu2, i1.string4
                     ->  Index Scan using onek_unique1 on onek i1
                           Index Cond: (unique1 = o.unique1)
               ->  Sort
                     Sort Key: i2.unique1, i2.unique2, i2.two, i2.four, i2.ten, i2.twenty, i2.hundred, i2.thousand, i2.twothousand, i2.fivethous, i2.tenthous, i2.odd, i2.even, i2.stringu1, i2.stringu2, i2.string4
                     ->  Index Scan using onek_unique1 on onek i2
                           Index Cond: (unique1 = o.unique2)
 (13 rows)
 select count(*) from
  onek o cross join lateral (
    select * from onek i1 where i1.unique1 = o.unique1
    except
    select * from onek i2 where i2.unique1 = o.unique2
  ) ss
 where o.ten = 1;
 count 
 -------
   100
 (1 row)
 rollback;
 --
 -- Test rescan of a RecursiveUnion node
 --
--- a/src/test/regress/expected/union.out
+++ b/src/test/regress/expected/union.out
@@ -385,13 +385,15 @@ select count(*) from
  5000
 (1 row)
 -- this query will prefer a sorted setop unless we force it.
 set enable_indexscan to off;
 explain (costs off)
 select unique1 from tenk1 except select unique2 from tenk1 where unique2 != 10;
-                         QUERY PLAN                         
+           QUERY PLAN            
------------------------------------------------------------
+---------------------------------
 HashSetOp Except
-   ->  Index Only Scan using tenk1_unique1 on tenk1
+   ->  Seq Scan on tenk1
-   ->  Index Only Scan using tenk1_unique2 on tenk1 tenk1_1
+   ->  Seq Scan on tenk1 tenk1_1
         Filter: (unique2 <> 10)
 (4 rows)
@@ -401,6 +403,7 @@ select unique1 from tenk1 except select unique2 from tenk1 where unique2 != 10;
      10
 (1 row)
 reset enable_indexscan;
 -- the hashed implementation is sensitive to child plans' tuple slot types
 explain (costs off)
 select * from int8_tbl intersect select q2, q1 from int8_tbl order by 1, 2;
@@ -455,17 +458,15 @@ select count(*) from
 explain (costs off)
 select count(*) from
  ( select unique1 from tenk1 intersect select fivethous from tenk1 ) ss;
-                               QUERY PLAN                               
+                            QUERY PLAN                            
------------------------------------------------------------------------
+------------------------------------------------------------------
 Aggregate
   ->  SetOp Intersect
         ->  Sort
               Sort Key: tenk1.fivethous
               ->  Seq Scan on tenk1
-         ->  Sort
+         ->  Index Only Scan using tenk1_unique1 on tenk1 tenk1_1
-               Sort Key: tenk1_1.unique1
+(6 rows)
-               ->  Index Only Scan using tenk1_unique1 on tenk1 tenk1_1
-(8 rows)
 select count(*) from
  ( select unique1 from tenk1 intersect select fivethous from tenk1 ) ss;
@@ -476,17 +477,13 @@ select count(*) from
 explain (costs off)
 select unique1 from tenk1 except select unique2 from tenk1 where unique2 != 10;
-                            QUERY PLAN                            
+                         QUERY PLAN                         
------------------------------------------------------------------
+------------------------------------------------------------
 SetOp Except
-   ->  Sort
+   ->  Index Only Scan using tenk1_unique1 on tenk1
-         Sort Key: tenk1.unique1
+   ->  Index Only Scan using tenk1_unique2 on tenk1 tenk1_1
-         ->  Index Only Scan using tenk1_unique1 on tenk1
+         Filter: (unique2 <> 10)
-   ->  Sort
+(4 rows)
-         Sort Key: tenk1_1.unique2
-         ->  Index Only Scan using tenk1_unique2 on tenk1 tenk1_1
-               Filter: (unique2 <> 10)
-(8 rows)
 select unique1 from tenk1 except select unique2 from tenk1 where unique2 != 10;
 unique1 
@@ -494,6 +491,20 @@ select unique1 from tenk1 except select unique2 from tenk1 where unique2 != 10;
      10
 (1 row)
 explain (costs off)
 select f1 from int4_tbl union all
  (select unique1 from tenk1 union select unique2 from tenk1);
                               QUERY PLAN                               
 ------------------------------------------------------------------------
 Append
   ->  Seq Scan on int4_tbl
   ->  Unique
         ->  Merge Append
               Sort Key: tenk1.unique1
               ->  Index Only Scan using tenk1_unique1 on tenk1
               ->  Index Only Scan using tenk1_unique2 on tenk1 tenk1_1
 (7 rows)
 reset enable_hashagg;
 -- non-hashable type
 set enable_hashagg to on;
@@ -978,7 +989,7 @@ explain (costs off)
 select from generate_series(1,5) intersect select from generate_series(1,3);
                        QUERY PLAN                        
 ----------------------------------------------------------
- HashSetOp Intersect
+ SetOp Intersect
   ->  Function Scan on generate_series
   ->  Function Scan on generate_series generate_series_1
 (3 rows)
--- a/src/test/regress/sql/subselect.sql
+++ b/src/test/regress/sql/subselect.sql
@@ -638,8 +638,11 @@ select sum(ss.tst::int) from
 where o.ten = 0;
 --
-- Test rescan of a SetOp node
+-- Test rescan of a hashed SetOp node
 --
 begin;
 set local enable_sort = off;
 explain (costs off)
 select count(*) from
  onek o cross join lateral (
@@ -657,6 +660,33 @@ select count(*) from
  ) ss
 where o.ten = 1;
 rollback;
 --
 -- Test rescan of a sorted SetOp node
 --
 begin;
 set local enable_hashagg = off;
 explain (costs off)
 select count(*) from
  onek o cross join lateral (
    select * from onek i1 where i1.unique1 = o.unique1
    except
    select * from onek i2 where i2.unique1 = o.unique2
  ) ss
 where o.ten = 1;
 select count(*) from
  onek o cross join lateral (
    select * from onek i1 where i1.unique1 = o.unique1
    except
    select * from onek i2 where i2.unique1 = o.unique2
  ) ss
 where o.ten = 1;
 rollback;
 --
 -- Test rescan of a RecursiveUnion node
 --
--- a/src/test/regress/sql/union.sql
+++ b/src/test/regress/sql/union.sql
@@ -134,10 +134,15 @@ select count(*) from
 select count(*) from
  ( select unique1 from tenk1 intersect select fivethous from tenk1 ) ss;
 -- this query will prefer a sorted setop unless we force it.
 set enable_indexscan to off;
 explain (costs off)
 select unique1 from tenk1 except select unique2 from tenk1 where unique2 != 10;
 select unique1 from tenk1 except select unique2 from tenk1 where unique2 != 10;
 reset enable_indexscan;
 -- the hashed implementation is sensitive to child plans' tuple slot types
 explain (costs off)
 select * from int8_tbl intersect select q2, q1 from int8_tbl order by 1, 2;
@@ -162,6 +167,10 @@ explain (costs off)
 select unique1 from tenk1 except select unique2 from tenk1 where unique2 != 10;
 select unique1 from tenk1 except select unique2 from tenk1 where unique2 != 10;
 explain (costs off)
 select f1 from int4_tbl union all
  (select unique1 from tenk1 union select unique2 from tenk1);
 reset enable_hashagg;
 -- non-hashable type