TupleHashTable: store additional data along with tuple.

Previously, the caller needed to allocate the memory and the TupleHashTable would store a pointer to it. That wastes space for the palloc overhead as well as the size of the pointer itself. Now, the TupleHashTable relies on the caller to correctly specify the additionalsize, and allocates that amount of space. The caller can then request a pointer into that space. Discussion: https://postgr.es/m/b9cbf0219a9859dc8d240311643ff4362fd9602c.camel@j-davis.com Reviewed-by: Heikki Linnakangas
2025-12-12 02:37:31 +03:00 · 2025-01-10 17:14:37 -08:00
parent 34c6e65242
commit e0ece2a981
6 changed files with 78 additions and 31 deletions
--- a/src/backend/executor/execGrouping.c
+++ b/src/backend/executor/execGrouping.c
@@ -20,6 +20,13 @@
 #include "miscadmin.h"
 #include "utils/lsyscache.h"

+typedef struct TupleHashEntryData
+{
+	MinimalTuple firstTuple;	/* copy of first tuple in this group */
+	uint32		status;			/* hash status */
+	uint32		hash;			/* hash value (cached) */
+} TupleHashEntryData;
+
 static int	TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1, const MinimalTuple tuple2);
 static inline uint32 TupleHashTableHash_internal(struct tuplehash_hash *tb,
 												 const MinimalTuple tuple);
@@ -196,6 +203,7 @@ BuildTupleHashTable(PlanState *parent,
 	hashtable->tab_collations = collations;
 	hashtable->tablecxt = tablecxt;
 	hashtable->tempcxt = tempcxt;
+	hashtable->additionalsize = additionalsize;
 	hashtable->tableslot = NULL;	/* will be made on first lookup */
 	hashtable->inputslot = NULL;
 	hashtable->in_hash_expr = NULL;
@@ -273,6 +281,15 @@ ResetTupleHashTable(TupleHashTable hashtable)
 	tuplehash_reset(hashtable->hashtab);
 }

+/*
+ * Return size of the hash bucket. Useful for estimating memory usage.
+ */
+size_t
+TupleHashEntrySize(void)
+{
+	return sizeof(TupleHashEntryData);
+}
+
 /*
 * Find or create a hashtable entry for the tuple group containing the
 * given tuple.  The tuple must be the same type as the hashtable entries.
@@ -339,6 +356,24 @@ TupleHashTableHash(TupleHashTable hashtable, TupleTableSlot *slot)
 	return hash;
 }

+MinimalTuple
+TupleHashEntryGetTuple(TupleHashEntry entry)
+{
+	return entry->firstTuple;
+}
+
+/*
+ * Get a pointer into the additional space allocated for this entry. The
+ * amount of space available is the additionalsize specified to
+ * BuildTupleHashTable(). If additionalsize was specified as zero, no
+ * additional space is available and this function should not be called.
+ */
+void *
+TupleHashEntryGetAdditional(TupleHashEntry entry)
+{
+	return (char *) entry->firstTuple + MAXALIGN(entry->firstTuple->t_len);
+}
+
 /*
 * A variant of LookupTupleHashEntry for callers that have already computed
 * the hash value.
@@ -477,13 +512,31 @@ LookupTupleHashEntry_internal(TupleHashTable hashtable, TupleTableSlot *slot,
 		}
 		else
 		{
+			MinimalTuple firstTuple;
+			size_t totalsize; /* including alignment and additionalsize */
+
 			/* created new entry */
 			*isnew = true;
 			/* zero caller data */
-			entry->additional = NULL;
 			MemoryContextSwitchTo(hashtable->tablecxt);
+
 			/* Copy the first tuple into the table context */
-			entry->firstTuple = ExecCopySlotMinimalTuple(slot);
+			firstTuple = ExecCopySlotMinimalTuple(slot);
+
+			/*
+			 * Allocate additional space right after the MinimalTuple of size
+			 * additionalsize. The caller can get a pointer to this data with
+			 * TupleHashEntryGetAdditional(), and store arbitrary data there.
+			 *
+			 * This avoids the need to store an extra pointer or allocate an
+			 * additional chunk, which would waste memory.
+			 */
+			totalsize = MAXALIGN(firstTuple->t_len) + hashtable->additionalsize;
+			firstTuple = repalloc(firstTuple, totalsize);
+			memset((char *) firstTuple + firstTuple->t_len, 0,
+				   totalsize - firstTuple->t_len);
+
+			entry->firstTuple = firstTuple;
 		}
 	}
 	else
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -1713,7 +1713,7 @@ hash_agg_entry_size(int numTrans, Size tupleWidth, Size transitionSpace)
 		transitionChunkSize = 0;

 	return
-		sizeof(TupleHashEntryData) +
+		TupleHashEntrySize() +
 		tupleChunkSize +
 		pergroupChunkSize +
 		transitionChunkSize;
@@ -1954,7 +1954,7 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
 	if (aggstate->hash_ngroups_current > 0)
 	{
 		aggstate->hashentrysize =
-			sizeof(TupleHashEntryData) +
+			TupleHashEntrySize() +
 			(hashkey_mem / (double) aggstate->hash_ngroups_current);
 	}
 }
@@ -2055,11 +2055,7 @@ initialize_hash_entry(AggState *aggstate, TupleHashTable hashtable,
 	if (aggstate->numtrans == 0)
 		return;

-	pergroup = (AggStatePerGroup)
-		MemoryContextAlloc(hashtable->tablecxt,
-						   sizeof(AggStatePerGroupData) * aggstate->numtrans);
-
-	entry->additional = pergroup;
+	pergroup = (AggStatePerGroup) TupleHashEntryGetAdditional(entry);

 	/*
 	 * Initialize aggregates for new tuple group, lookup_hash_entries()
@@ -2123,7 +2119,7 @@ lookup_hash_entries(AggState *aggstate)
 		{
 			if (isnew)
 				initialize_hash_entry(aggstate, hashtable, entry);
-			pergroup[setno] = entry->additional;
+			pergroup[setno] = TupleHashEntryGetAdditional(entry);
 		}
 		else
 		{
@@ -2681,7 +2677,7 @@ agg_refill_hash_table(AggState *aggstate)
 		{
 			if (isnew)
 				initialize_hash_entry(aggstate, perhash->hashtable, entry);
-			aggstate->hash_pergroup[batch->setno] = entry->additional;
+			aggstate->hash_pergroup[batch->setno] = TupleHashEntryGetAdditional(entry);
 			advance_aggregates(aggstate);
 		}
 		else
@@ -2773,7 +2769,7 @@ agg_retrieve_hash_table_in_memory(AggState *aggstate)
 	ExprContext *econtext;
 	AggStatePerAgg peragg;
 	AggStatePerGroup pergroup;
-	TupleHashEntryData *entry;
+	TupleHashEntry entry;
 	TupleTableSlot *firstSlot;
 	TupleTableSlot *result;
 	AggStatePerHash perhash;
@@ -2845,7 +2841,7 @@ agg_retrieve_hash_table_in_memory(AggState *aggstate)
 		 * Transform representative tuple back into one with the right
 		 * columns.
 		 */
-		ExecStoreMinimalTuple(entry->firstTuple, hashslot, false);
+		ExecStoreMinimalTuple(TupleHashEntryGetTuple(entry), hashslot, false);
 		slot_getallattrs(hashslot);

 		ExecClearTuple(firstSlot);
@@ -2861,7 +2857,7 @@ agg_retrieve_hash_table_in_memory(AggState *aggstate)
 		}
 		ExecStoreVirtualTuple(firstSlot);

-		pergroup = (AggStatePerGroup) entry->additional;
+		pergroup = (AggStatePerGroup) TupleHashEntryGetAdditional(entry);

 		/*
 		 * Use the representative input tuple for any references to
--- a/src/backend/executor/nodeSetOp.c
+++ b/src/backend/executor/nodeSetOp.c
@@ -425,6 +425,7 @@ setop_fill_hash_table(SetOpState *setopstate)
 	{
 		TupleTableSlot *outerslot;
 		TupleHashEntryData *entry;
+		SetOpStatePerGroup pergroup;
 		bool		isnew;

 		outerslot = ExecProcNode(outerPlan);
@@ -437,16 +438,16 @@ setop_fill_hash_table(SetOpState *setopstate)
 									 outerslot,
 									 &isnew, NULL);

+		pergroup = TupleHashEntryGetAdditional(entry);
 		/* If new tuple group, initialize counts to zero */
 		if (isnew)
 		{
-			entry->additional = (SetOpStatePerGroup)
-				MemoryContextAllocZero(setopstate->hashtable->tablecxt,
-									   sizeof(SetOpStatePerGroupData));
+			pergroup->numLeft = 0;
+			pergroup->numRight = 0;
 		}

 		/* Advance the counts */
-		((SetOpStatePerGroup) entry->additional)->numLeft++;
+		pergroup->numLeft++;

 		/* Must reset expression context after each hashtable lookup */
 		ResetExprContext(econtext);
@@ -478,7 +479,7 @@ setop_fill_hash_table(SetOpState *setopstate)

 			/* Advance the counts if entry is already present */
 			if (entry)
-				((SetOpStatePerGroup) entry->additional)->numRight++;
+				((SetOpStatePerGroup) TupleHashEntryGetAdditional(entry))->numRight++;

 			/* Must reset expression context after each hashtable lookup */
 			ResetExprContext(econtext);
@@ -496,7 +497,7 @@ setop_fill_hash_table(SetOpState *setopstate)
 static TupleTableSlot *
 setop_retrieve_hash_table(SetOpState *setopstate)
 {
-	TupleHashEntryData *entry;
+	TupleHashEntry entry;
 	TupleTableSlot *resultTupleSlot;

 	/*
@@ -526,12 +527,12 @@ setop_retrieve_hash_table(SetOpState *setopstate)
 		 * See if we should emit any copies of this tuple, and if so return
 		 * the first copy.
 		 */
-		set_output_count(setopstate, (SetOpStatePerGroup) entry->additional);
+		set_output_count(setopstate, (SetOpStatePerGroup) TupleHashEntryGetAdditional(entry));

 		if (setopstate->numOutput > 0)
 		{
 			setopstate->numOutput--;
-			return ExecStoreMinimalTuple(entry->firstTuple,
+			return ExecStoreMinimalTuple(TupleHashEntryGetTuple(entry),
 										 resultTupleSlot,
 										 false);
 		}
--- a/src/backend/executor/nodeSubplan.c
+++ b/src/backend/executor/nodeSubplan.c
@@ -753,7 +753,7 @@ findPartialMatch(TupleHashTable hashtable, TupleTableSlot *slot,
 	{
 		CHECK_FOR_INTERRUPTS();

-		ExecStoreMinimalTuple(entry->firstTuple, hashtable->tableslot, false);
+		ExecStoreMinimalTuple(TupleHashEntryGetTuple(entry), hashtable->tableslot, false);
 		if (!execTuplesUnequal(slot, hashtable->tableslot,
 							   numCols, keyColIdx,
 							   eqfunctions,
--- a/src/include/executor/executor.h
+++ b/src/include/executor/executor.h
@@ -148,9 +148,12 @@ extern TupleHashEntry LookupTupleHashEntry(TupleHashTable hashtable,
 										   bool *isnew, uint32 *hash);
 extern uint32 TupleHashTableHash(TupleHashTable hashtable,
 								 TupleTableSlot *slot);
+extern size_t TupleHashEntrySize(void);
 extern TupleHashEntry LookupTupleHashEntryHash(TupleHashTable hashtable,
 											   TupleTableSlot *slot,
 											   bool *isnew, uint32 hash);
+extern MinimalTuple TupleHashEntryGetTuple(TupleHashEntry entry);
+extern void *TupleHashEntryGetAdditional(TupleHashEntry entry);
 extern TupleHashEntry FindTupleHashEntry(TupleHashTable hashtable,
 										 TupleTableSlot *slot,
 										 ExprState *eqcomp,
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -806,17 +806,10 @@ typedef struct ExecAuxRowMark
 * point to tab_hash_expr and tab_eq_func respectively.
 * ----------------------------------------------------------------
 */
+typedef struct TupleHashEntryData TupleHashEntryData;
 typedef struct TupleHashEntryData *TupleHashEntry;
 typedef struct TupleHashTableData *TupleHashTable;

-typedef struct TupleHashEntryData
-{
-	MinimalTuple firstTuple;	/* copy of first tuple in this group */
-	void	   *additional;		/* user data */
-	uint32		status;			/* hash status */
-	uint32		hash;			/* hash value (cached) */
-} TupleHashEntryData;
-
 /* define parameters necessary to generate the tuple hash table interface */
 #define SH_PREFIX tuplehash
 #define SH_ELEMENT_TYPE TupleHashEntryData
@@ -835,6 +828,7 @@ typedef struct TupleHashTableData
 	Oid		   *tab_collations; /* collations for hash and comparison */
 	MemoryContext tablecxt;		/* memory context containing table */
 	MemoryContext tempcxt;		/* context for function evaluations */
+	Size		additionalsize;	/* size of additional data */
 	TupleTableSlot *tableslot;	/* slot for referencing table entries */
 	/* The following fields are set transiently for each table search: */
 	TupleTableSlot *inputslot;	/* current input tuple's slot */