1
0
mirror of https://github.com/postgres/postgres.git synced 2025-10-25 13:17:41 +03:00

HashAgg: use Bump allocator for hash TupleHashTable entries.

The entries aren't freed until the entire hash table is destroyed, so
use the Bump allocator to improve allocation speed, avoid wasting
space on the chunk header, and avoid wasting space due to the
power-of-two allocations.

Discussion: https://postgr.es/m/CAApHDvqv1aNB4cM36FzRwivXrEvBO_LsG_eQ3nqDXTjECaatOQ@mail.gmail.com
Reviewed-by: David Rowley
This commit is contained in:
Jeff Davis
2025-03-24 22:05:33 -07:00
parent cc4331605a
commit cc721c459d
3 changed files with 104 additions and 29 deletions

View File

@@ -322,19 +322,18 @@ CreateExprContext(EState *estate)
ExprContext *
CreateWorkExprContext(EState *estate)
{
Size minContextSize = ALLOCSET_DEFAULT_MINSIZE;
Size initBlockSize = ALLOCSET_DEFAULT_INITSIZE;
Size maxBlockSize = ALLOCSET_DEFAULT_MAXSIZE;
/* choose the maxBlockSize to be no larger than 1/16 of work_mem */
while (maxBlockSize > work_mem * (Size) 1024 / 16)
maxBlockSize >>= 1;
maxBlockSize = pg_prevpower2_size_t(work_mem * (Size) 1024 / 16);
if (maxBlockSize < ALLOCSET_DEFAULT_INITSIZE)
maxBlockSize = ALLOCSET_DEFAULT_INITSIZE;
/* But no bigger than ALLOCSET_DEFAULT_MAXSIZE */
maxBlockSize = Min(maxBlockSize, ALLOCSET_DEFAULT_MAXSIZE);
return CreateExprContextInternal(estate, minContextSize,
initBlockSize, maxBlockSize);
/* and no smaller than ALLOCSET_DEFAULT_INITSIZE */
maxBlockSize = Max(maxBlockSize, ALLOCSET_DEFAULT_INITSIZE);
return CreateExprContextInternal(estate, ALLOCSET_DEFAULT_MINSIZE,
ALLOCSET_DEFAULT_INITSIZE, maxBlockSize);
}
/* ----------------

View File

@@ -406,6 +406,7 @@ static void build_hash_tables(AggState *aggstate);
static void build_hash_table(AggState *aggstate, int setno, long nbuckets);
static void hashagg_recompile_expressions(AggState *aggstate, bool minslot,
bool nullcheck);
static void hash_create_memory(AggState *aggstate);
static long hash_choose_num_buckets(double hashentrysize,
long ngroups, Size memory);
static int hash_choose_num_partitions(double input_groups,
@@ -1509,7 +1510,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets)
{
AggStatePerHash perhash = &aggstate->perhash[setno];
MemoryContext metacxt = aggstate->hash_metacxt;
MemoryContext hashcxt = aggstate->hashcontext->ecxt_per_tuple_memory;
MemoryContext tablecxt = aggstate->hash_tablecxt;
MemoryContext tmpcxt = aggstate->tmpcontext->ecxt_per_tuple_memory;
Size additionalsize;
@@ -1535,7 +1536,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets)
nbuckets,
additionalsize,
metacxt,
hashcxt,
tablecxt,
tmpcxt,
DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit));
}
@@ -1706,15 +1707,19 @@ hash_agg_entry_size(int numTrans, Size tupleWidth, Size transitionSpace)
tupleWidth);
Size pergroupSize = numTrans * sizeof(AggStatePerGroupData);
tupleChunkSize = CHUNKHDRSZ + tupleSize;
if (pergroupSize > 0)
pergroupChunkSize = CHUNKHDRSZ + pergroupSize;
else
pergroupChunkSize = 0;
/*
* Entries use the Bump allocator, so the chunk sizes are the same as the
* requested sizes.
*/
tupleChunkSize = MAXALIGN(tupleSize);
pergroupChunkSize = pergroupSize;
/*
* Transition values use AllocSet, which has a chunk header and also uses
* power-of-two allocations.
*/
if (transitionSpace > 0)
transitionChunkSize = CHUNKHDRSZ + transitionSpace;
transitionChunkSize = CHUNKHDRSZ + pg_nextpower2_size_t(transitionSpace);
else
transitionChunkSize = 0;
@@ -1864,8 +1869,11 @@ hash_agg_check_limits(AggState *aggstate)
uint64 ngroups = aggstate->hash_ngroups_current;
Size meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt,
true);
Size hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory,
true);
Size entry_mem = MemoryContextMemAllocated(aggstate->hash_tablecxt,
true);
Size tval_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory,
true);
Size total_mem = meta_mem + entry_mem + tval_mem;
bool do_spill = false;
#ifdef USE_INJECTION_POINTS
@@ -1884,7 +1892,7 @@ hash_agg_check_limits(AggState *aggstate)
* can be sure to make progress even in edge cases.
*/
if (aggstate->hash_ngroups_current > 0 &&
(meta_mem + hashkey_mem > aggstate->hash_mem_limit ||
(total_mem > aggstate->hash_mem_limit ||
ngroups > aggstate->hash_ngroups_limit))
{
do_spill = true;
@@ -1939,6 +1947,7 @@ static void
hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
{
Size meta_mem;
Size entry_mem;
Size hashkey_mem;
Size buffer_mem;
Size total_mem;
@@ -1950,7 +1959,10 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
/* memory for the hash table itself */
meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt, true);
/* memory for the group keys and transition states */
/* memory for hash entries */
entry_mem = MemoryContextMemAllocated(aggstate->hash_tablecxt, true);
/* memory for byref transition states */
hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, true);
/* memory for read/write tape buffers, if spilled */
@@ -1959,7 +1971,7 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
buffer_mem += HASHAGG_READ_BUFFER_SIZE;
/* update peak mem */
total_mem = meta_mem + hashkey_mem + buffer_mem;
total_mem = meta_mem + entry_mem + hashkey_mem + buffer_mem;
if (total_mem > aggstate->hash_mem_peak)
aggstate->hash_mem_peak = total_mem;
@@ -1981,6 +1993,64 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
}
}
/*
* Create memory contexts used for hash aggregation.
*/
static void
hash_create_memory(AggState *aggstate)
{
Size maxBlockSize = ALLOCSET_DEFAULT_MAXSIZE;
/*
* The hashcontext's per-tuple memory will be used for byref transition
* values and returned by AggCheckCallContext().
*/
aggstate->hashcontext = CreateWorkExprContext(aggstate->ss.ps.state);
/*
* The meta context will be used for the bucket array of
* TupleHashEntryData (or arrays, in the case of grouping sets). As the
* hash table grows, the bucket array will double in size and the old one
* will be freed, so an AllocSet is appropriate. For large bucket arrays,
* the large allocation path will be used, so it's not worth worrying
* about wasting space due to power-of-two allocations.
*/
aggstate->hash_metacxt = AllocSetContextCreate(aggstate->ss.ps.state->es_query_cxt,
"HashAgg meta context",
ALLOCSET_DEFAULT_SIZES);
/*
* The hash entries themselves, which include the grouping key
* (firstTuple) and pergroup data, are stored in the table context. The
* bump allocator can be used because the entries are not freed until the
* entire hash table is reset. The bump allocator is faster for
* allocations and avoids wasting space on the chunk header or
* power-of-two allocations.
*
* Like CreateWorkExprContext(), use smaller sizings for smaller work_mem,
* to avoid large jumps in memory usage.
*/
/*
* Like CreateWorkExprContext(), use smaller sizings for smaller work_mem,
* to avoid large jumps in memory usage.
*/
maxBlockSize = pg_prevpower2_size_t(work_mem * (Size) 1024 / 16);
/* But no bigger than ALLOCSET_DEFAULT_MAXSIZE */
maxBlockSize = Min(maxBlockSize, ALLOCSET_DEFAULT_MAXSIZE);
/* and no smaller than ALLOCSET_DEFAULT_INITSIZE */
maxBlockSize = Max(maxBlockSize, ALLOCSET_DEFAULT_INITSIZE);
aggstate->hash_tablecxt = BumpContextCreate(aggstate->ss.ps.state->es_query_cxt,
"HashAgg table context",
ALLOCSET_DEFAULT_MINSIZE,
ALLOCSET_DEFAULT_INITSIZE,
maxBlockSize);
}
/*
* Choose a reasonable number of buckets for the initial hash table size.
*/
@@ -2642,6 +2712,7 @@ agg_refill_hash_table(AggState *aggstate)
/* free memory and reset hash tables */
ReScanExprContext(aggstate->hashcontext);
MemoryContextReset(aggstate->hash_tablecxt);
for (int setno = 0; setno < aggstate->num_hashes; setno++)
ResetTupleHashTable(aggstate->perhash[setno].hashtable);
@@ -3326,7 +3397,7 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
}
if (use_hashing)
aggstate->hashcontext = CreateWorkExprContext(estate);
hash_create_memory(aggstate);
ExecAssignExprContext(estate, &aggstate->ss.ps);
@@ -3621,9 +3692,6 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
Plan *outerplan = outerPlan(node);
uint64 totalGroups = 0;
aggstate->hash_metacxt = AllocSetContextCreate(aggstate->ss.ps.state->es_query_cxt,
"HashAgg meta context",
ALLOCSET_DEFAULT_SIZES);
aggstate->hash_spill_rslot = ExecInitExtraTupleSlot(estate, scanDesc,
&TTSOpsMinimalTuple);
aggstate->hash_spill_wslot = ExecInitExtraTupleSlot(estate, scanDesc,
@@ -4368,6 +4436,12 @@ ExecEndAgg(AggState *node)
MemoryContextDelete(node->hash_metacxt);
node->hash_metacxt = NULL;
}
if (node->hash_tablecxt != NULL)
{
MemoryContextDelete(node->hash_tablecxt);
node->hash_tablecxt = NULL;
}
for (transno = 0; transno < node->numtrans; transno++)
{
@@ -4484,6 +4558,7 @@ ExecReScanAgg(AggState *node)
node->hash_ngroups_current = 0;
ReScanExprContext(node->hashcontext);
MemoryContextReset(node->hash_tablecxt);
/* Rebuild an empty hash table */
build_hash_tables(node);
node->table_filled = false;

View File

@@ -2560,7 +2560,8 @@ typedef struct AggState
/* these fields are used in AGG_HASHED and AGG_MIXED modes: */
bool table_filled; /* hash table filled yet? */
int num_hashes;
MemoryContext hash_metacxt; /* memory for hash table itself */
MemoryContext hash_metacxt; /* memory for hash table bucket array */
MemoryContext hash_tablecxt; /* memory for hash table entries */
struct LogicalTapeSet *hash_tapeset; /* tape set for hash spill tapes */
struct HashAggSpill *hash_spills; /* HashAggSpill for each grouping set,
* exists only during first pass */
@@ -2586,7 +2587,7 @@ typedef struct AggState
* per-group pointers */
/* support for evaluation of agg input expressions: */
#define FIELDNO_AGGSTATE_ALL_PERGROUPS 53
#define FIELDNO_AGGSTATE_ALL_PERGROUPS 54
AggStatePerGroup *all_pergroups; /* array of first ->pergroups, than
* ->hash_pergroup */
SharedAggInfo *shared_info; /* one entry per worker */