Revise hash join and hash aggregation code to use the same datatype-specific hash functions used by hash indexes, rather than the old not-datatype-aware ComputeHashFunc routine. This makes it safe to do hash joining on several datatypes that previously couldn't use hashing. The sets of datatypes that are hash indexable and hash joinable are now exactly the same, whereas before each had some that weren't in the other.
2025-07-02 09:02:37 +03:00 · 2003-06-22 22:04:55 +00:00
parent 0dda75f6eb
commit bff0422b6c
27 changed files with 489 additions and 232 deletions
--- a/src/backend/executor/execGrouping.c
+++ b/src/backend/executor/execGrouping.c
@@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/executor/execGrouping.c,v 1.2 2003/01/12 04:03:34 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/executor/execGrouping.c,v 1.3 2003/06/22 22:04:54 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -19,6 +19,8 @@
 #include "executor/executor.h"
 #include "parser/parse_oper.h"
 #include "utils/memutils.h"
+#include "utils/lsyscache.h"
+#include "utils/syscache.h"


 /*****************************************************************************
@@ -213,76 +215,46 @@ execTuplesMatchPrepare(TupleDesc tupdesc,
 	return eqfunctions;
 }

-
-/*****************************************************************************
- *		Utility routines for hashing
- *****************************************************************************/
-
 /*
- * ComputeHashFunc
+ * execTuplesHashPrepare
+ *		Look up the equality and hashing functions needed for a TupleHashTable.
 *
- *		the hash function for hash joins (also used for hash aggregation)
- *
- *		XXX this probably ought to be replaced with datatype-specific
- *		hash functions, such as those already implemented for hash indexes.
+ * This is similar to execTuplesMatchPrepare, but we also need to find the
+ * hash functions associated with the equality operators.  *eqfunctions and
+ * *hashfunctions receive the palloc'd result arrays.
 */
-uint32
-ComputeHashFunc(Datum key, int typLen, bool byVal)
+void
+execTuplesHashPrepare(TupleDesc tupdesc,
+					  int numCols,
+					  AttrNumber *matchColIdx,
+					  FmgrInfo **eqfunctions,
+					  FmgrInfo **hashfunctions)
 {
-	unsigned char *k;
+	int			i;

-	if (byVal)
+	*eqfunctions = (FmgrInfo *) palloc(numCols * sizeof(FmgrInfo));
+	*hashfunctions = (FmgrInfo *) palloc(numCols * sizeof(FmgrInfo));
+
+	for (i = 0; i < numCols; i++)
 	{
-		/*
-		 * If it's a by-value data type, just hash the whole Datum value.
-		 * This assumes that datatypes narrower than Datum are
-		 * consistently padded (either zero-extended or sign-extended, but
-		 * not random bits) to fill Datum; see the XXXGetDatum macros in
-		 * postgres.h. NOTE: it would not work to do hash_any(&key, len)
-		 * since this would get the wrong bytes on a big-endian machine.
-		 */
-		k = (unsigned char *) &key;
-		typLen = sizeof(Datum);
-	}
-	else
-	{
-		if (typLen > 0)
-		{
-			/* fixed-width pass-by-reference type */
-			k = (unsigned char *) DatumGetPointer(key);
-		}
-		else if (typLen == -1)
-		{
-			/*
-			 * It's a varlena type, so 'key' points to a "struct varlena".
-			 * NOTE: VARSIZE returns the "real" data length plus the
-			 * sizeof the "vl_len" attribute of varlena (the length
-			 * information). 'key' points to the beginning of the varlena
-			 * struct, so we have to use "VARDATA" to find the beginning
-			 * of the "real" data.	Also, we have to be careful to detoast
-			 * the datum if it's toasted.  (We don't worry about freeing
-			 * the detoasted copy; that happens for free when the
-			 * per-tuple memory context is reset in ExecHashGetBucket.)
-			 */
-			struct varlena *vkey = PG_DETOAST_DATUM(key);
+		AttrNumber	att = matchColIdx[i];
+		Oid			typid = tupdesc->attrs[att - 1]->atttypid;
+		Operator	optup;
+		Oid			eq_opr;
+		Oid			eq_function;
+		Oid			hash_function;

-			typLen = VARSIZE(vkey) - VARHDRSZ;
-			k = (unsigned char *) VARDATA(vkey);
-		}
-		else if (typLen == -2)
-		{
-			/* It's a null-terminated C string */
-			typLen = strlen(DatumGetCString(key)) + 1;
-			k = (unsigned char *) DatumGetPointer(key);
-		}
-		else
-		{
-			elog(ERROR, "ComputeHashFunc: Invalid typLen %d", typLen);
-			k = NULL;			/* keep compiler quiet */
-		}
+		optup = equality_oper(typid, false);
+		eq_opr = oprid(optup);
+		eq_function = oprfuncid(optup);
+		ReleaseSysCache(optup);
+		hash_function = get_op_hash_function(eq_opr);
+		if (!OidIsValid(hash_function))
+			elog(ERROR, "Could not find hash function for hash operator %u",
+				 eq_opr);
+		fmgr_info(eq_function, &(*eqfunctions)[i]);
+		fmgr_info(hash_function, &(*hashfunctions)[i]);
 	}
-
-	return DatumGetUInt32(hash_any(k, typLen));
 }


@@ -299,19 +271,21 @@ ComputeHashFunc(Datum key, int typLen, bool byVal)
 *
 *	numCols, keyColIdx: identify the tuple fields to use as lookup key
 *	eqfunctions: equality comparison functions to use
+ *	hashfunctions: datatype-specific hashing functions to use
 *	nbuckets: number of buckets to make
 *	entrysize: size of each entry (at least sizeof(TupleHashEntryData))
 *	tablecxt: memory context in which to store table and table entries
 *	tempcxt: short-lived context for evaluation hash and comparison functions
 *
- * The eqfunctions array may be made with execTuplesMatchPrepare().
+ * The function arrays may be made with execTuplesHashPrepare().
 *
- * Note that keyColIdx and eqfunctions must be allocated in storage that
- * will live as long as the hashtable does.
+ * Note that keyColIdx, eqfunctions, and hashfunctions must be allocated in
+ * storage that will live as long as the hashtable does.
 */
 TupleHashTable
 BuildTupleHashTable(int numCols, AttrNumber *keyColIdx,
 					FmgrInfo *eqfunctions,
+					FmgrInfo *hashfunctions,
 					int nbuckets, Size entrysize,
 					MemoryContext tablecxt, MemoryContext tempcxt)
 {
@@ -328,6 +302,7 @@ BuildTupleHashTable(int numCols, AttrNumber *keyColIdx,
 	hashtable->numCols = numCols;
 	hashtable->keyColIdx = keyColIdx;
 	hashtable->eqfunctions = eqfunctions;
+	hashtable->hashfunctions = hashfunctions;
 	hashtable->tablecxt = tablecxt;
 	hashtable->tempcxt = tempcxt;
 	hashtable->entrysize = entrysize;
@@ -375,11 +350,15 @@ LookupTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot,
 		hashkey = (hashkey << 1) | ((hashkey & 0x80000000) ? 1 : 0);

 		attr = heap_getattr(tuple, att, tupdesc, &isNull);
-		if (isNull)
-			continue;			/* treat nulls as having hash key 0 */
-		hashkey ^= ComputeHashFunc(attr,
-								   (int) tupdesc->attrs[att - 1]->attlen,
-								   tupdesc->attrs[att - 1]->attbyval);
+
+		if (!isNull)			/* treat nulls as having hash key 0 */
+		{
+			uint32		hkey;
+
+			hkey = DatumGetUInt32(FunctionCall1(&hashtable->hashfunctions[i],
+												attr));
+			hashkey ^= hkey;
+		}
 	}
 	bucketno = hashkey % (uint32) hashtable->nbuckets;

--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -45,7 +45,7 @@
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/executor/nodeAgg.c,v 1.106 2003/06/06 15:04:01 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/executor/nodeAgg.c,v 1.107 2003/06/22 22:04:54 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -582,6 +582,7 @@ build_hash_table(AggState *aggstate)
 	aggstate->hashtable = BuildTupleHashTable(node->numCols,
 											  node->grpColIdx,
 											  aggstate->eqfunctions,
+											  aggstate->hashfunctions,
 											  node->numGroups,
 											  entrysize,
 											  aggstate->aggcontext,
@@ -1035,6 +1036,7 @@ ExecInitAgg(Agg *node, EState *estate)
 	aggstate->aggs = NIL;
 	aggstate->numaggs = 0;
 	aggstate->eqfunctions = NULL;
+	aggstate->hashfunctions = NULL;
 	aggstate->peragg = NULL;
 	aggstate->agg_done = false;
 	aggstate->pergroup = NULL;
@@ -1123,14 +1125,23 @@ ExecInitAgg(Agg *node, EState *estate)
 	}

 	/*
-	 * If we are grouping, precompute fmgr lookup data for inner loop
+	 * If we are grouping, precompute fmgr lookup data for inner loop.
+	 * We need both equality and hashing functions to do it by hashing,
+	 * but only equality if not hashing.
 	 */
 	if (node->numCols > 0)
 	{
-		aggstate->eqfunctions =
-			execTuplesMatchPrepare(ExecGetScanType(&aggstate->ss),
-								   node->numCols,
-								   node->grpColIdx);
+		if (node->aggstrategy == AGG_HASHED)
+			execTuplesHashPrepare(ExecGetScanType(&aggstate->ss),
+								  node->numCols,
+								  node->grpColIdx,
+								  &aggstate->eqfunctions,
+								  &aggstate->hashfunctions);
+		else
+			aggstate->eqfunctions =
+				execTuplesMatchPrepare(ExecGetScanType(&aggstate->ss),
+									   node->numCols,
+									   node->grpColIdx);
 	}

 	/*
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/executor/nodeHash.c,v 1.75 2003/03/27 16:51:27 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/executor/nodeHash.c,v 1.76 2003/06/22 22:04:54 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -192,7 +192,7 @@ ExecEndHash(HashState *node)
 * ----------------------------------------------------------------
 */
 HashJoinTable
-ExecHashTableCreate(Hash *node)
+ExecHashTableCreate(Hash *node, List *hashOperators)
 {
 	HashJoinTable hashtable;
 	Plan	   *outerNode;
@@ -201,7 +201,7 @@ ExecHashTableCreate(Hash *node)
 	int			nbatch;
 	int			nkeys;
 	int			i;
-	List	   *hk;
+	List	   *ho;
 	MemoryContext oldcxt;

 	/*
@@ -237,17 +237,20 @@ ExecHashTableCreate(Hash *node)
 	hashtable->outerBatchSize = NULL;

 	/*
-	 * Get info about the datatypes of the hash keys.
+	 * Get info about the hash functions to be used for each hash key.
 	 */
-	nkeys = length(node->hashkeys);
-	hashtable->typLens = (int16 *) palloc(nkeys * sizeof(int16));
-	hashtable->typByVals = (bool *) palloc(nkeys * sizeof(bool));
+	nkeys = length(hashOperators);
+	hashtable->hashfunctions = (FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo));
 	i = 0;
-	foreach(hk, node->hashkeys)
+	foreach(ho, hashOperators)
 	{
-		get_typlenbyval(exprType(lfirst(hk)),
-						&hashtable->typLens[i],
-						&hashtable->typByVals[i]);
+		Oid		hashfn;
+
+		hashfn = get_op_hash_function(lfirsto(ho));
+		if (!OidIsValid(hashfn))
+			elog(ERROR, "Could not find hash function for hash operator %u",
+				 lfirsto(ho));
+		fmgr_info(hashfn, &hashtable->hashfunctions[i]);
 		i++;
 	}

@@ -520,7 +523,7 @@ ExecHashGetBucket(HashJoinTable hashtable,

 	/*
 	 * We reset the eval context each time to reclaim any memory leaked in
-	 * the hashkey expressions or ComputeHashFunc itself.
+	 * the hashkey expressions.
 	 */
 	ResetExprContext(econtext);

@@ -545,9 +548,11 @@ ExecHashGetBucket(HashJoinTable hashtable,
 		 */
 		if (!isNull)			/* treat nulls as having hash key 0 */
 		{
-			hashkey ^= ComputeHashFunc(keyval,
-									   (int) hashtable->typLens[i],
-									   hashtable->typByVals[i]);
+			uint32		hkey;
+
+			hkey = DatumGetUInt32(FunctionCall1(&hashtable->hashfunctions[i],
+												keyval));
+			hashkey ^= hkey;
 		}

 		i++;
--- a/src/backend/executor/nodeHashjoin.c
+++ b/src/backend/executor/nodeHashjoin.c
@@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/executor/nodeHashjoin.c,v 1.51 2003/05/30 20:23:10 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/executor/nodeHashjoin.c,v 1.52 2003/06/22 22:04:54 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -117,7 +117,8 @@ ExecHashJoin(HashJoinState *node)
 		 * create the hash table
 		 */
 		Assert(hashtable == NULL);
-		hashtable = ExecHashTableCreate((Hash *) hashNode->ps.plan);
+		hashtable = ExecHashTableCreate((Hash *) hashNode->ps.plan,
+										node->hj_HashOperators);
 		node->hj_HashTable = hashtable;

 		/*
@@ -305,6 +306,7 @@ ExecInitHashJoin(HashJoin *node, EState *estate)
 	Plan	   *outerNode;
 	Hash	   *hashNode;
 	List	   *hclauses;
+	List	   *hoperators;
 	List	   *hcl;

 	/*
@@ -406,8 +408,9 @@ ExecInitHashJoin(HashJoin *node, EState *estate)

 	/*
 	 * The planner already made a list of the inner hashkeys for us,
-	 * but we also need a list of the outer hashkeys.  Each list of
-	 * exprs must then be prepared for execution.
+	 * but we also need a list of the outer hashkeys, as well as a list
+	 * of the hash operator OIDs.  Both lists of exprs must then be prepared
+	 * for execution.
 	 */
 	hjstate->hj_InnerHashKeys = (List *)
 		ExecInitExpr((Expr *) hashNode->hashkeys,
@@ -416,13 +419,19 @@ ExecInitHashJoin(HashJoin *node, EState *estate)
 		hjstate->hj_InnerHashKeys;

 	hclauses = NIL;
+	hoperators = NIL;
 	foreach(hcl, node->hashclauses)
 	{
-		hclauses = lappend(hclauses, get_leftop(lfirst(hcl)));
+		OpExpr	   *hclause = (OpExpr *) lfirst(hcl);
+
+		Assert(IsA(hclause, OpExpr));
+		hclauses = lappend(hclauses, get_leftop((Expr *) hclause));
+		hoperators = lappendo(hoperators, hclause->opno);
 	}
 	hjstate->hj_OuterHashKeys = (List *)
 		ExecInitExpr((Expr *) hclauses,
 					 (PlanState *) hjstate);
+	hjstate->hj_HashOperators = hoperators;

 	hjstate->js.ps.ps_OuterTupleSlot = NULL;
 	hjstate->js.ps.ps_TupFromTlist = false;
--- a/src/backend/executor/nodeSubplan.c
+++ b/src/backend/executor/nodeSubplan.c
@@ -7,7 +7,7 @@
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/executor/nodeSubplan.c,v 1.46 2003/06/06 15:04:01 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/executor/nodeSubplan.c,v 1.47 2003/06/22 22:04:54 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -519,6 +519,7 @@ buildSubPlanHash(SubPlanState *node)
 	node->hashtable = BuildTupleHashTable(ncols,
 										  node->keyColIdx,
 										  node->eqfunctions,
+										  node->hashfunctions,
 										  nbuckets,
 										  sizeof(TupleHashEntryData),
 										  node->tablecxt,
@@ -537,6 +538,7 @@ buildSubPlanHash(SubPlanState *node)
 		node->hashnulls = BuildTupleHashTable(ncols,
 											  node->keyColIdx,
 											  node->eqfunctions,
+											  node->hashfunctions,
 											  nbuckets,
 											  sizeof(TupleHashEntryData),
 											  node->tablecxt,
@@ -700,6 +702,7 @@ ExecInitSubPlan(SubPlanState *node, EState *estate)
 	node->innerecontext = NULL;
 	node->keyColIdx = NULL;
 	node->eqfunctions = NULL;
+	node->hashfunctions = NULL;

 	/*
 	 * create an EState for the subplan
@@ -797,11 +800,12 @@ ExecInitSubPlan(SubPlanState *node, EState *estate)
 		 * ExecTypeFromTL).
 		 *
 		 * We also extract the combining operators themselves to initialize
-		 * the equality functions for the hash tables.
+		 * the equality and hashing functions for the hash tables.
 		 */
 		lefttlist = righttlist = NIL;
 		leftptlist = rightptlist = NIL;
 		node->eqfunctions = (FmgrInfo *) palloc(ncols * sizeof(FmgrInfo));
+		node->hashfunctions = (FmgrInfo *) palloc(ncols * sizeof(FmgrInfo));
 		i = 1;
 		foreach(lexpr, node->exprs)
 		{
@@ -811,6 +815,7 @@ ExecInitSubPlan(SubPlanState *node, EState *estate)
 			Expr	   *expr;
 			TargetEntry *tle;
 			GenericExprState *tlestate;
+			Oid			hashfn;

 			Assert(IsA(fstate, FuncExprState));
 			Assert(IsA(opexpr, OpExpr));
@@ -850,6 +855,13 @@ ExecInitSubPlan(SubPlanState *node, EState *estate)
 			fmgr_info(opexpr->opfuncid, &node->eqfunctions[i-1]);
 			node->eqfunctions[i-1].fn_expr = (Node *) opexpr;

+			/* Lookup the associated hash function */
+			hashfn = get_op_hash_function(opexpr->opno);
+			if (!OidIsValid(hashfn))
+				elog(ERROR, "Could not find hash function for hash operator %u",
+					 opexpr->opno);
+			fmgr_info(hashfn, &node->hashfunctions[i-1]);
+
 			i++;
 		}