Fix several hash functions that were taking chintzy shortcuts instead of

delivering a well-randomized hash value. I got religion on this after observing that performance of multi-batch hash join degrades terribly if the higher-order bits of hash values aren't random, as indeed was true for say hashes of small integer values. It's now expected and documented that hash functions should use hash_any or some comparable method to ensure that all bits of their output are about equally random. initdb forced because this change invalidates existing hash indexes. For the same reason, this isn't back-patchable; the hash join performance problem will get a band-aid fix in the back branches.
2025-08-28 18:48:04 +03:00 · 2007-06-01 15:33:19 +00:00
parent 397d00af8f
commit 1f559b7d3a
5 changed files with 67 additions and 41 deletions
--- a/src/backend/access/hash/hashfunc.c
+++ b/src/backend/access/hash/hashfunc.c
@@ -1,19 +1,26 @@
 /*-------------------------------------------------------------------------
 *
 * hashfunc.c
- *	  Comparison functions for hash access method.
+ *	  Support functions for hash access method.
 *
 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/hash/hashfunc.c,v 1.51 2007/04/02 03:49:37 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/hash/hashfunc.c,v 1.52 2007/06/01 15:33:18 tgl Exp $
 *
 * NOTES
 *	  These functions are stored in pg_amproc.	For each operator class
- *	  defined on hash tables, they compute the hash value of the argument.
+ *	  defined for hash indexes, they compute the hash value of the argument.
 *
+ *	  Additional hash functions appear in /utils/adt/ files for various
+ *	  specialized datatypes.
+ *
+ *	  It is expected that every bit of a hash function's 32-bit result is
+ *	  as random as every other; failure to ensure this is likely to lead
+ *	  to poor performance of hash joins, for example.  In most cases a hash
+ *	  function should use hash_any() or its variant hash_uint32().
 *-------------------------------------------------------------------------
 */

@@ -26,19 +33,19 @@
 Datum
 hashchar(PG_FUNCTION_ARGS)
 {
-	PG_RETURN_UINT32(~((uint32) PG_GETARG_CHAR(0)));
+	return hash_uint32((int32) PG_GETARG_CHAR(0));
 }

 Datum
 hashint2(PG_FUNCTION_ARGS)
 {
-	PG_RETURN_UINT32(~((uint32) PG_GETARG_INT16(0)));
+	return hash_uint32((int32) PG_GETARG_INT16(0));
 }

 Datum
 hashint4(PG_FUNCTION_ARGS)
 {
-	PG_RETURN_UINT32(~PG_GETARG_UINT32(0));
+	return hash_uint32(PG_GETARG_INT32(0));
 }

 Datum
@@ -59,23 +66,23 @@ hashint8(PG_FUNCTION_ARGS)

 	lohalf ^= (val >= 0) ? hihalf : ~hihalf;

-	PG_RETURN_UINT32(~lohalf);
+	return hash_uint32(lohalf);
 #else
 	/* here if we can't count on "x >> 32" to work sanely */
-	PG_RETURN_UINT32(~((uint32) PG_GETARG_INT64(0)));
+	return hash_uint32((int32) PG_GETARG_INT64(0));
 #endif
 }

 Datum
 hashoid(PG_FUNCTION_ARGS)
 {
-	PG_RETURN_UINT32(~((uint32) PG_GETARG_OID(0)));
+	return hash_uint32((uint32) PG_GETARG_OID(0));
 }

 Datum
 hashenum(PG_FUNCTION_ARGS)
 {
-    PG_RETURN_UINT32(~((uint32) PG_GETARG_OID(0)));
+	return hash_uint32((uint32) PG_GETARG_OID(0));
 }

 Datum
@@ -283,6 +290,31 @@ hash_any(register const unsigned char *k, register int keylen)
 			/* case 0: nothing left to add */
 	}
 	mix(a, b, c);
+
+	/* report the result */
+	return UInt32GetDatum(c);
+}
+
+/*
+ * hash_uint32() -- hash a 32-bit value
+ *
+ * This has the same result (at least on little-endian machines) as
+ *		hash_any(&k, sizeof(uint32))
+ * but is faster and doesn't force the caller to store k into memory.
+ */
+Datum
+hash_uint32(uint32 k)
+{
+	register uint32 a,
+				b,
+				c;
+
+	a = 0x9e3779b9 + k;
+	b = 0x9e3779b9;
+	c = 3923095 + (uint32) sizeof(uint32);
+
+	mix(a, b, c);
+
 	/* report the result */
 	return UInt32GetDatum(c);
 }