Support hashing for duplicate-elimination in INTERSECT and EXCEPT queries.

This completes my project of improving usage of hashing for duplicate elimination (aggregate functions with DISTINCT remain undone, but that's for some other day). As with the previous patches, this means we can INTERSECT/EXCEPT on datatypes that can hash but not sort, and it means that INTERSECT/EXCEPT without ORDER BY are no longer certain to produce sorted output.
2025-07-02 09:02:37 +03:00 · 2008-08-07 03:04:04 +00:00
parent 2d1d96b1ce
commit 368df30427
11 changed files with 597 additions and 207 deletions
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -10,7 +10,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.242 2008/08/02 21:32:00 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.243 2008/08/07 03:04:03 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -3108,8 +3108,9 @@ make_unique(Plan *lefttree, List *distinctList)
 * already be sorted accordingly.
 */
 SetOp *
-make_setop(SetOpCmd cmd, Plan *lefttree,
-		   List *distinctList, AttrNumber flagColIdx)
+make_setop(SetOpCmd cmd, SetOpStrategy strategy, Plan *lefttree,
+		   List *distinctList, AttrNumber flagColIdx, long numGroups,
+		   double outputRows)
 {
 	SetOp	   *node = makeNode(SetOp);
 	Plan	   *plan = &node->plan;
@@ -3120,20 +3121,13 @@ make_setop(SetOpCmd cmd, Plan *lefttree,
 	ListCell   *slitem;

 	copy_plan_costsize(plan, lefttree);
+	plan->plan_rows = outputRows;

 	/*
 	 * Charge one cpu_operator_cost per comparison per input tuple. We assume
 	 * all columns get compared at most of the tuples.
 	 */
-	plan->total_cost += cpu_operator_cost * plan->plan_rows * numCols;
-
-	/*
-	 * We make the unsupported assumption that there will be 10% as many
-	 * tuples out as in.  Any way to do better?
-	 */
-	plan->plan_rows *= 0.1;
-	if (plan->plan_rows < 1)
-		plan->plan_rows = 1;
+	plan->total_cost += cpu_operator_cost * lefttree->plan_rows * numCols;

 	plan->targetlist = lefttree->targetlist;
 	plan->qual = NIL;
@@ -3160,10 +3154,12 @@ make_setop(SetOpCmd cmd, Plan *lefttree,
 	}

 	node->cmd = cmd;
+	node->strategy = strategy;
 	node->numCols = numCols;
 	node->dupColIdx = dupColIdx;
 	node->dupOperators = dupOperators;
 	node->flagColIdx = flagColIdx;
+	node->numGroups = numGroups;

 	return node;
 }
--- a/src/backend/optimizer/prep/prepunion.c
+++ b/src/backend/optimizer/prep/prepunion.c
@@ -22,7 +22,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/prep/prepunion.c,v 1.150 2008/08/07 01:11:50 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/prep/prepunion.c,v 1.151 2008/08/07 03:04:03 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -60,6 +60,7 @@ static Plan *generate_union_plan(SetOperationStmt *op, PlannerInfo *root,
 					double tuple_fraction,
 					List *refnames_tlist, List **sortClauses);
 static Plan *generate_nonunion_plan(SetOperationStmt *op, PlannerInfo *root,
+					   double tuple_fraction,
 					   List *refnames_tlist, List **sortClauses);
 static List *recurse_union_children(Node *setOp, PlannerInfo *root,
 					   double tuple_fraction,
@@ -229,7 +230,7 @@ recurse_set_operations(Node *setOp, PlannerInfo *root,
 									   refnames_tlist,
 									   sortClauses);
 		else
-			plan = generate_nonunion_plan(op, root,
+			plan = generate_nonunion_plan(op, root, tuple_fraction,
 										  refnames_tlist,
 										  sortClauses);

@@ -341,6 +342,7 @@ generate_union_plan(SetOperationStmt *op, PlannerInfo *root,
 */
 static Plan *
 generate_nonunion_plan(SetOperationStmt *op, PlannerInfo *root,
+					   double tuple_fraction,
 					   List *refnames_tlist,
 					   List **sortClauses)
 {
@@ -351,6 +353,10 @@ generate_nonunion_plan(SetOperationStmt *op, PlannerInfo *root,
 			   *groupList,
 			   *planlist,
 			   *child_sortclauses;
+	double		dNumDistinctRows;
+	double		dNumOutputRows;
+	long		numDistinctRows;
+	bool		use_hash;
 	SetOpCmd	cmd;

 	/* Recurse on children, ensuring their outputs are marked */
@@ -393,10 +399,32 @@ generate_nonunion_plan(SetOperationStmt *op, PlannerInfo *root,
 		return plan;
 	}

+	/*
+	 * XXX for the moment, take the number of distinct groups as being the
+	 * total input size, ie, the worst case.  This is too conservative, but
+	 * we don't want to risk having the hashtable overrun memory; also,
+	 * it's not clear how to get a decent estimate of the true size.
+	 */
+	dNumDistinctRows = plan->plan_rows;
+
+	/* Also convert to long int --- but 'ware overflow! */
+	numDistinctRows = (long) Min(dNumDistinctRows, (double) LONG_MAX);
+
+	/*
+	 * The output size is taken as 10% of that, which is a completely bogus
+	 * guess, but it's what we've used historically.
+	 */
+	dNumOutputRows = ceil(dNumDistinctRows * 0.1);
+
 	/*
 	 * Decide whether to hash or sort, and add a sort node if needed.
 	 */
-	plan = (Plan *) make_sort_from_sortclauses(root, groupList, plan);
+	use_hash = choose_hashed_setop(root, groupList, plan,
+								   tuple_fraction, dNumDistinctRows,
+								   (op->op == SETOP_INTERSECT) ? "INTERSECT" : "EXCEPT");
+
+	if (!use_hash)
+		plan = (Plan *) make_sort_from_sortclauses(root, groupList, plan);

 	/*
 	 * Finally, add a SetOp plan node to generate the correct output.
@@ -414,9 +442,12 @@ generate_nonunion_plan(SetOperationStmt *op, PlannerInfo *root,
 			cmd = SETOPCMD_INTERSECT;	/* keep compiler quiet */
 			break;
 	}
-	plan = (Plan *) make_setop(cmd, plan, groupList, list_length(op->colTypes) + 1);
+	plan = (Plan *) make_setop(cmd, use_hash ? SETOP_HASHED : SETOP_SORTED,
+							   plan, groupList, list_length(op->colTypes) + 1,
+							   numDistinctRows, dNumOutputRows);

-	*sortClauses = groupList;
+	/* Result is sorted only if we're not hashing */
+	*sortClauses = use_hash ? NIL : groupList;

 	return plan;
 }