Introduce 64-bit hash functions with a 64-bit seed.

This will be useful for hash partitioning, which needs a way to seed the hash functions to avoid problems such as a hash index on a hash partitioned table clumping all values into a small portion of the bucket space; it's also useful for anything that wants a 64-bit hash value rather than a 32-bit hash value.

Just in case somebody wants a 64-bit hash value that is compatible with the existing 32-bit hash values, make the low 32 bits of the 64-bit hash value match the 32-bit hash value when the seed is 0.

Robert Haas and Amul Sul

Discussion: http://postgr.es/m/CA+Tgmoafx2yoJuhCQQOL5CocEi-w_uG4S2xT0EtgiJnPGcHW3g@mail.gmail.com
2025-11-12 05:01:15 +03:00 · 2017-08-31 22:21:21 -04:00
parent 2d44c58c79
commit 81c5e46c49
33 changed files with 1555 additions and 42 deletions
--- a/src/backend/access/hash/hashfunc.c
+++ b/src/backend/access/hash/hashfunc.c
@@ -46,18 +46,36 @@ hashchar(PG_FUNCTION_ARGS)
 	return hash_uint32((int32) PG_GETARG_CHAR(0));
 }

+Datum
+hashcharextended(PG_FUNCTION_ARGS)
+{
+	return hash_uint32_extended((int32) PG_GETARG_CHAR(0), PG_GETARG_INT64(1));
+}
+
 Datum
 hashint2(PG_FUNCTION_ARGS)
 {
 	return hash_uint32((int32) PG_GETARG_INT16(0));
 }

+Datum
+hashint2extended(PG_FUNCTION_ARGS)
+{
+	return hash_uint32_extended((int32) PG_GETARG_INT16(0), PG_GETARG_INT64(1));
+}
+
 Datum
 hashint4(PG_FUNCTION_ARGS)
 {
 	return hash_uint32(PG_GETARG_INT32(0));
 }

+Datum
+hashint4extended(PG_FUNCTION_ARGS)
+{
+	return hash_uint32_extended(PG_GETARG_INT32(0), PG_GETARG_INT64(1));
+}
+
 Datum
 hashint8(PG_FUNCTION_ARGS)
 {
@@ -78,18 +96,43 @@ hashint8(PG_FUNCTION_ARGS)
 	return hash_uint32(lohalf);
 }

+Datum
+hashint8extended(PG_FUNCTION_ARGS)
+{
+	/* Same approach as hashint8 */
+	int64		val = PG_GETARG_INT64(0);
+	uint32		lohalf = (uint32) val;
+	uint32		hihalf = (uint32) (val >> 32);
+
+	lohalf ^= (val >= 0) ? hihalf : ~hihalf;
+
+	return hash_uint32_extended(lohalf, PG_GETARG_INT64(1));
+}
+
 Datum
 hashoid(PG_FUNCTION_ARGS)
 {
 	return hash_uint32((uint32) PG_GETARG_OID(0));
 }

+Datum
+hashoidextended(PG_FUNCTION_ARGS)
+{
+	return hash_uint32_extended((uint32) PG_GETARG_OID(0), PG_GETARG_INT64(1));
+}
+
 Datum
 hashenum(PG_FUNCTION_ARGS)
 {
 	return hash_uint32((uint32) PG_GETARG_OID(0));
 }

+Datum
+hashenumextended(PG_FUNCTION_ARGS)
+{
+	return hash_uint32_extended((uint32) PG_GETARG_OID(0), PG_GETARG_INT64(1));
+}
+
 Datum
 hashfloat4(PG_FUNCTION_ARGS)
 {
@@ -116,6 +159,21 @@ hashfloat4(PG_FUNCTION_ARGS)
 	return hash_any((unsigned char *) &key8, sizeof(key8));
 }

+Datum
+hashfloat4extended(PG_FUNCTION_ARGS)
+{
+	float4		key = PG_GETARG_FLOAT4(0);
+	uint64		seed = PG_GETARG_INT64(1);
+	float8		key8;
+
+	/* Same approach as hashfloat4 */
+	if (key == (float4) 0)
+		PG_RETURN_UINT64(seed);
+	key8 = key;
+
+	return hash_any_extended((unsigned char *) &key8, sizeof(key8), seed);
+}
+
 Datum
 hashfloat8(PG_FUNCTION_ARGS)
 {
@@ -132,6 +190,19 @@ hashfloat8(PG_FUNCTION_ARGS)
 	return hash_any((unsigned char *) &key, sizeof(key));
 }

+Datum
+hashfloat8extended(PG_FUNCTION_ARGS)
+{
+	float8		key = PG_GETARG_FLOAT8(0);
+	uint64		seed = PG_GETARG_INT64(1);
+
+	/* Same approach as hashfloat8 */
+	if (key == (float8) 0)
+		PG_RETURN_UINT64(seed);
+
+	return hash_any_extended((unsigned char *) &key, sizeof(key), seed);
+}
+
 Datum
 hashoidvector(PG_FUNCTION_ARGS)
 {
@@ -140,6 +211,16 @@ hashoidvector(PG_FUNCTION_ARGS)
 	return hash_any((unsigned char *) key->values, key->dim1 * sizeof(Oid));
 }

+Datum
+hashoidvectorextended(PG_FUNCTION_ARGS)
+{
+	oidvector  *key = (oidvector *) PG_GETARG_POINTER(0);
+
+	return hash_any_extended((unsigned char *) key->values,
+							 key->dim1 * sizeof(Oid),
+							 PG_GETARG_INT64(1));
+}
+
 Datum
 hashname(PG_FUNCTION_ARGS)
 {
@@ -148,6 +229,15 @@ hashname(PG_FUNCTION_ARGS)
 	return hash_any((unsigned char *) key, strlen(key));
 }

+Datum
+hashnameextended(PG_FUNCTION_ARGS)
+{
+	char	   *key = NameStr(*PG_GETARG_NAME(0));
+
+	return hash_any_extended((unsigned char *) key, strlen(key),
+							 PG_GETARG_INT64(1));
+}
+
 Datum
 hashtext(PG_FUNCTION_ARGS)
 {
@@ -168,6 +258,22 @@ hashtext(PG_FUNCTION_ARGS)
 	return result;
 }

+Datum
+hashtextextended(PG_FUNCTION_ARGS)
+{
+	text	   *key = PG_GETARG_TEXT_PP(0);
+	Datum		result;
+
+	/* Same approach as hashtext */
+	result = hash_any_extended((unsigned char *) VARDATA_ANY(key),
+							   VARSIZE_ANY_EXHDR(key),
+							   PG_GETARG_INT64(1));
+
+	PG_FREE_IF_COPY(key, 0);
+
+	return result;
+}
+
 /*
 * hashvarlena() can be used for any varlena datatype in which there are
 * no non-significant bits, ie, distinct bitpatterns never compare as equal.
@@ -187,6 +293,21 @@ hashvarlena(PG_FUNCTION_ARGS)
 	return result;
 }

+Datum
+hashvarlenaextended(PG_FUNCTION_ARGS)
+{
+	struct varlena *key = PG_GETARG_VARLENA_PP(0);
+	Datum		result;
+
+	result = hash_any_extended((unsigned char *) VARDATA_ANY(key),
+							   VARSIZE_ANY_EXHDR(key),
+							   PG_GETARG_INT64(1));
+
+	PG_FREE_IF_COPY(key, 0);
+
+	return result;
+}
+
 /*
 * This hash function was written by Bob Jenkins
 * (bob_jenkins@burtleburtle.net), and superficially adapted
@@ -502,7 +623,227 @@ hash_any(register const unsigned char *k, register int keylen)
 }

 /*
- * hash_uint32() -- hash a 32-bit value
+ * hash_any_extended() -- hash into a 64-bit value, using an optional seed
+ *		k		: the key (the unaligned variable-length array of bytes)
+ *		keylen	: the length of the key, counting by bytes
+ *		seed	: a 64-bit seed (0 means no seed)
+ *
+ * Returns a uint64 value.  Otherwise similar to hash_any.
+ */
+Datum
+hash_any_extended(register const unsigned char *k, register int keylen,
+				  uint64 seed)
+{
+	register uint32 a,
+				b,
+				c,
+				len;
+
+	/* Set up the internal state */
+	len = keylen;
+	a = b = c = 0x9e3779b9 + len + 3923095;
+
+	/* If the seed is non-zero, use it to perturb the internal state. */
+	if (seed != 0)
+	{
+		/*
+		 * In essence, the seed is treated as part of the data being hashed,
+		 * but for simplicity, we pretend that it's padded with four bytes of
+		 * zeroes so that the seed constitutes a 12-byte chunk.
+		 */
+		a += (uint32) (seed >> 32);
+		b += (uint32) seed;
+		mix(a, b, c);
+	}
+
+	/* If the source pointer is word-aligned, we use word-wide fetches */
+	if (((uintptr_t) k & UINT32_ALIGN_MASK) == 0)
+	{
+		/* Code path for aligned source data */
+		register const uint32 *ka = (const uint32 *) k;
+
+		/* handle most of the key */
+		while (len >= 12)
+		{
+			a += ka[0];
+			b += ka[1];
+			c += ka[2];
+			mix(a, b, c);
+			ka += 3;
+			len -= 12;
+		}
+
+		/* handle the last 11 bytes */
+		k = (const unsigned char *) ka;
+#ifdef WORDS_BIGENDIAN
+		switch (len)
+		{
+			case 11:
+				c += ((uint32) k[10] << 8);
+				/* fall through */
+			case 10:
+				c += ((uint32) k[9] << 16);
+				/* fall through */
+			case 9:
+				c += ((uint32) k[8] << 24);
+				/* the lowest byte of c is reserved for the length */
+				/* fall through */
+			case 8:
+				b += ka[1];
+				a += ka[0];
+				break;
+			case 7:
+				b += ((uint32) k[6] << 8);
+				/* fall through */
+			case 6:
+				b += ((uint32) k[5] << 16);
+				/* fall through */
+			case 5:
+				b += ((uint32) k[4] << 24);
+				/* fall through */
+			case 4:
+				a += ka[0];
+				break;
+			case 3:
+				a += ((uint32) k[2] << 8);
+				/* fall through */
+			case 2:
+				a += ((uint32) k[1] << 16);
+				/* fall through */
+			case 1:
+				a += ((uint32) k[0] << 24);
+				/* case 0: nothing left to add */
+		}
+#else							/* !WORDS_BIGENDIAN */
+		switch (len)
+		{
+			case 11:
+				c += ((uint32) k[10] << 24);
+				/* fall through */
+			case 10:
+				c += ((uint32) k[9] << 16);
+				/* fall through */
+			case 9:
+				c += ((uint32) k[8] << 8);
+				/* the lowest byte of c is reserved for the length */
+				/* fall through */
+			case 8:
+				b += ka[1];
+				a += ka[0];
+				break;
+			case 7:
+				b += ((uint32) k[6] << 16);
+				/* fall through */
+			case 6:
+				b += ((uint32) k[5] << 8);
+				/* fall through */
+			case 5:
+				b += k[4];
+				/* fall through */
+			case 4:
+				a += ka[0];
+				break;
+			case 3:
+				a += ((uint32) k[2] << 16);
+				/* fall through */
+			case 2:
+				a += ((uint32) k[1] << 8);
+				/* fall through */
+			case 1:
+				a += k[0];
+				/* case 0: nothing left to add */
+		}
+#endif							/* WORDS_BIGENDIAN */
+	}
+	else
+	{
+		/* Code path for non-aligned source data */
+
+		/* handle most of the key */
+		while (len >= 12)
+		{
+#ifdef WORDS_BIGENDIAN
+			a += (k[3] + ((uint32) k[2] << 8) + ((uint32) k[1] << 16) + ((uint32) k[0] << 24));
+			b += (k[7] + ((uint32) k[6] << 8) + ((uint32) k[5] << 16) + ((uint32) k[4] << 24));
+			c += (k[11] + ((uint32) k[10] << 8) + ((uint32) k[9] << 16) + ((uint32) k[8] << 24));
+#else							/* !WORDS_BIGENDIAN */
+			a += (k[0] + ((uint32) k[1] << 8) + ((uint32) k[2] << 16) + ((uint32) k[3] << 24));
+			b += (k[4] + ((uint32) k[5] << 8) + ((uint32) k[6] << 16) + ((uint32) k[7] << 24));
+			c += (k[8] + ((uint32) k[9] << 8) + ((uint32) k[10] << 16) + ((uint32) k[11] << 24));
+#endif							/* WORDS_BIGENDIAN */
+			mix(a, b, c);
+			k += 12;
+			len -= 12;
+		}
+
+		/* handle the last 11 bytes */
+#ifdef WORDS_BIGENDIAN
+		switch (len)			/* all the case statements fall through */
+		{
+			case 11:
+				c += ((uint32) k[10] << 8);
+			case 10:
+				c += ((uint32) k[9] << 16);
+			case 9:
+				c += ((uint32) k[8] << 24);
+				/* the lowest byte of c is reserved for the length */
+			case 8:
+				b += k[7];
+			case 7:
+				b += ((uint32) k[6] << 8);
+			case 6:
+				b += ((uint32) k[5] << 16);
+			case 5:
+				b += ((uint32) k[4] << 24);
+			case 4:
+				a += k[3];
+			case 3:
+				a += ((uint32) k[2] << 8);
+			case 2:
+				a += ((uint32) k[1] << 16);
+			case 1:
+				a += ((uint32) k[0] << 24);
+				/* case 0: nothing left to add */
+		}
+#else							/* !WORDS_BIGENDIAN */
+		switch (len)			/* all the case statements fall through */
+		{
+			case 11:
+				c += ((uint32) k[10] << 24);
+			case 10:
+				c += ((uint32) k[9] << 16);
+			case 9:
+				c += ((uint32) k[8] << 8);
+				/* the lowest byte of c is reserved for the length */
+			case 8:
+				b += ((uint32) k[7] << 24);
+			case 7:
+				b += ((uint32) k[6] << 16);
+			case 6:
+				b += ((uint32) k[5] << 8);
+			case 5:
+				b += k[4];
+			case 4:
+				a += ((uint32) k[3] << 24);
+			case 3:
+				a += ((uint32) k[2] << 16);
+			case 2:
+				a += ((uint32) k[1] << 8);
+			case 1:
+				a += k[0];
+				/* case 0: nothing left to add */
+		}
+#endif							/* WORDS_BIGENDIAN */
+	}
+
+	final(a, b, c);
+
+	/* report the result */
+	PG_RETURN_UINT64(((uint64) b << 32) | c);
+}
+
+/*
+ * hash_uint32() -- hash a 32-bit value to a 32-bit value
 *
 * This has the same result as
 *		hash_any(&k, sizeof(uint32))
@@ -523,3 +864,32 @@ hash_uint32(uint32 k)
 	/* report the result */
 	return UInt32GetDatum(c);
 }
+
+/*
+ * hash_uint32_extended() -- hash a 32-bit value to a 64-bit value, with a seed
+ *
+ * Like hash_uint32, this is a convenience function.
+ */
+Datum
+hash_uint32_extended(uint32 k, uint64 seed)
+{
+	register uint32 a,
+				b,
+				c;
+
+	a = b = c = 0x9e3779b9 + (uint32) sizeof(uint32) + 3923095;
+
+	if (seed != 0)
+	{
+		a += (uint32) (seed >> 32);
+		b += (uint32) seed;
+		mix(a, b, c);
+	}
+
+	a += k;
+
+	final(a, b, c);
+
+	/* report the result */
+	PG_RETURN_UINT64(((uint64) b << 32) | c);
+}
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -373,7 +373,7 @@ _hash_init(Relation rel, double num_tuples, ForkNumber forkNum)
 	if (ffactor < 10)
 		ffactor = 10;

-	procid = index_getprocid(rel, 1, HASHPROC);
+	procid = index_getprocid(rel, 1, HASHSTANDARD_PROC);

 	/*
 	 * We initialize the metapage, the first N bucket pages, and the first
--- a/src/backend/access/hash/hashutil.c
+++ b/src/backend/access/hash/hashutil.c
@@ -85,7 +85,7 @@ _hash_datum2hashkey(Relation rel, Datum key)
 	Oid			collation;

 	/* XXX assumes index has only one attribute */
-	procinfo = index_getprocinfo(rel, 1, HASHPROC);
+	procinfo = index_getprocinfo(rel, 1, HASHSTANDARD_PROC);
 	collation = rel->rd_indcollation[0];

 	return DatumGetUInt32(FunctionCall1Coll(procinfo, collation, key));
@@ -108,10 +108,10 @@ _hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype)
 	hash_proc = get_opfamily_proc(rel->rd_opfamily[0],
 								  keytype,
 								  keytype,
-								  HASHPROC);
+								  HASHSTANDARD_PROC);
 	if (!RegProcedureIsValid(hash_proc))
 		elog(ERROR, "missing support function %d(%u,%u) for index \"%s\"",
-			 HASHPROC, keytype, keytype,
+			 HASHSTANDARD_PROC, keytype, keytype,
 			 RelationGetRelationName(rel));
 	collation = rel->rd_indcollation[0];

--- a/src/backend/access/hash/hashvalidate.c
+++ b/src/backend/access/hash/hashvalidate.c
@@ -29,7 +29,7 @@
 #include "utils/syscache.h"


-static bool check_hash_func_signature(Oid funcid, Oid restype, Oid argtype);
+static bool check_hash_func_signature(Oid funcid, int16 amprocnum, Oid argtype);


 /*
@@ -105,8 +105,9 @@ hashvalidate(Oid opclassoid)
 		/* Check procedure numbers and function signatures */
 		switch (procform->amprocnum)
 		{
-			case HASHPROC:
-				if (!check_hash_func_signature(procform->amproc, INT4OID,
+			case HASHSTANDARD_PROC:
+			case HASHEXTENDED_PROC:
+				if (!check_hash_func_signature(procform->amproc, procform->amprocnum,
 											   procform->amproclefttype))
 				{
 					ereport(INFO,
@@ -264,19 +265,37 @@ hashvalidate(Oid opclassoid)
 * hacks in the core hash opclass definitions.
 */
 static bool
-check_hash_func_signature(Oid funcid, Oid restype, Oid argtype)
+check_hash_func_signature(Oid funcid, int16 amprocnum, Oid argtype)
 {
 	bool		result = true;
+	Oid			restype;
+	int16		nargs;
 	HeapTuple	tp;
 	Form_pg_proc procform;

+	switch (amprocnum)
+	{
+		case HASHSTANDARD_PROC:
+			restype = INT4OID;
+			nargs = 1;
+			break;
+
+		case HASHEXTENDED_PROC:
+			restype = INT8OID;
+			nargs = 2;
+			break;
+
+		default:
+			elog(ERROR, "invalid amprocnum");
+	}
+
 	tp = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid));
 	if (!HeapTupleIsValid(tp))
 		elog(ERROR, "cache lookup failed for function %u", funcid);
 	procform = (Form_pg_proc) GETSTRUCT(tp);

 	if (procform->prorettype != restype || procform->proretset ||
-		procform->pronargs != 1)
+		procform->pronargs != nargs)
 		result = false;

 	if (!IsBinaryCoercible(argtype, procform->proargtypes.values[0]))
@@ -290,24 +309,29 @@ check_hash_func_signature(Oid funcid, Oid restype, Oid argtype)
 		 * identity, not just its input type, because hashvarlena() takes
 		 * INTERNAL and allowing any such function seems too scary.
 		 */
-		if (funcid == F_HASHINT4 &&
+		if ((funcid == F_HASHINT4 || funcid == F_HASHINT4EXTENDED) &&
 			(argtype == DATEOID ||
 			 argtype == ABSTIMEOID || argtype == RELTIMEOID ||
 			 argtype == XIDOID || argtype == CIDOID))
 			 /* okay, allowed use of hashint4() */ ;
-		else if (funcid == F_TIMESTAMP_HASH &&
+		else if ((funcid == F_TIMESTAMP_HASH ||
+				  funcid == F_TIMESTAMP_HASH_EXTENDED) &&
 				 argtype == TIMESTAMPTZOID)
 			 /* okay, allowed use of timestamp_hash() */ ;
-		else if (funcid == F_HASHCHAR &&
+		else if ((funcid == F_HASHCHAR || funcid == F_HASHCHAREXTENDED) &&
 				 argtype == BOOLOID)
 			 /* okay, allowed use of hashchar() */ ;
-		else if (funcid == F_HASHVARLENA &&
+		else if ((funcid == F_HASHVARLENA || funcid == F_HASHVARLENAEXTENDED) &&
 				 argtype == BYTEAOID)
 			 /* okay, allowed use of hashvarlena() */ ;
 		else
 			result = false;
 	}

+	/* If function takes a second argument, it must be for a 64-bit salt. */
+	if (nargs == 2 && procform->proargtypes.values[1] != INT8OID)
+		result = false;
+
 	ReleaseSysCache(tp);
 	return result;
 }