Optimize multi-batch hash joins when the outer relation has a nonuniform

distribution, by creating a special fast path for the (first few) most common values of the outer relation. Tuples having hashvalues matching the MCVs are effectively forced to be in the first batch, so that we never write them out to the batch temp files. Bryce Cutt and Ramon Lawrence, with some editorialization by me.
2025-11-15 03:41:20 +03:00 · 2009-03-21 00:04:40 +00:00
parent 249d936bed
commit 596efd27ed
10 changed files with 604 additions and 34 deletions
--- a/src/backend/executor/nodeHashjoin.c
+++ b/src/backend/executor/nodeHashjoin.c
@@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/executor/nodeHashjoin.c,v 1.97 2009/01/01 17:23:41 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/executor/nodeHashjoin.c,v 1.98 2009/03/21 00:04:38 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -198,19 +198,23 @@ ExecHashJoin(HashJoinState *node)
 			node->hj_MatchedOuter = false;

 			/*
-			 * now we have an outer tuple, find the corresponding bucket for
-			 * this tuple from the hash table
+			 * Now we have an outer tuple; find the corresponding bucket for
+			 * this tuple in the main hash table or skew hash table.
 			 */
 			node->hj_CurHashValue = hashvalue;
 			ExecHashGetBucketAndBatch(hashtable, hashvalue,
 									  &node->hj_CurBucketNo, &batchno);
+			node->hj_CurSkewBucketNo = ExecHashGetSkewBucket(hashtable,
+															 hashvalue);
 			node->hj_CurTuple = NULL;

 			/*
 			 * Now we've got an outer tuple and the corresponding hash bucket,
-			 * but this tuple may not belong to the current batch.
+			 * but it might not belong to the current batch, or it might
+			 * match a skew bucket.
 			 */
-			if (batchno != hashtable->curbatch)
+			if (batchno != hashtable->curbatch &&
+				node->hj_CurSkewBucketNo == INVALID_SKEW_BUCKET_NO)
 			{
 				/*
 				 * Need to postpone this outer tuple to a later batch. Save it
@@ -452,6 +456,7 @@ ExecInitHashJoin(HashJoin *node, EState *estate, int eflags)

 	hjstate->hj_CurHashValue = 0;
 	hjstate->hj_CurBucketNo = 0;
+	hjstate->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO;
 	hjstate->hj_CurTuple = NULL;

 	/*
@@ -651,6 +656,19 @@ start_over:
 			BufFileClose(hashtable->outerBatchFile[curbatch]);
 		hashtable->outerBatchFile[curbatch] = NULL;
 	}
+	else						/* we just finished the first batch */
+	{
+		/*
+		 * Reset some of the skew optimization state variables, since we
+		 * no longer need to consider skew tuples after the first batch.
+		 * The memory context reset we are about to do will release the
+		 * skew hashtable itself.
+		 */
+		hashtable->skewEnabled = false;
+		hashtable->skewBucket = NULL;
+		hashtable->skewBucketNums = NULL;
+		hashtable->spaceUsedSkew = 0;
+	}

 	/*
 	 * We can always skip over any batches that are completely empty on both
@@ -880,6 +898,7 @@ ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt)
 	/* Always reset intra-tuple state */
 	node->hj_CurHashValue = 0;
 	node->hj_CurBucketNo = 0;
+	node->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO;
 	node->hj_CurTuple = NULL;

 	node->js.ps.ps_TupFromTlist = false;