diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c index 55ab18fb826..772c86e70e9 100644 --- a/src/backend/executor/execUtils.c +++ b/src/backend/executor/execUtils.c @@ -322,19 +322,19 @@ CreateExprContext(EState *estate) ExprContext * CreateWorkExprContext(EState *estate) { - Size minContextSize = ALLOCSET_DEFAULT_MINSIZE; - Size initBlockSize = ALLOCSET_DEFAULT_INITSIZE; Size maxBlockSize = ALLOCSET_DEFAULT_MAXSIZE; /* choose the maxBlockSize to be no larger than 1/16 of work_mem */ - while (maxBlockSize > work_mem * (Size) 1024 / 16) - maxBlockSize >>= 1; + maxBlockSize = pg_prevpower2_size_t(work_mem * (Size) 1024 / 16); - if (maxBlockSize < ALLOCSET_DEFAULT_INITSIZE) - maxBlockSize = ALLOCSET_DEFAULT_INITSIZE; + /* But no bigger than ALLOCSET_DEFAULT_MAXSIZE */ + maxBlockSize = Min(maxBlockSize, ALLOCSET_DEFAULT_MAXSIZE); - return CreateExprContextInternal(estate, minContextSize, - initBlockSize, maxBlockSize); + /* and no smaller than ALLOCSET_DEFAULT_INITSIZE */ + maxBlockSize = Max(maxBlockSize, ALLOCSET_DEFAULT_INITSIZE); + + return CreateExprContextInternal(estate, ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, maxBlockSize); } /* ---------------- diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index b4a7698a0b3..beccbfdc6fe 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c @@ -406,6 +406,7 @@ static void build_hash_tables(AggState *aggstate); static void build_hash_table(AggState *aggstate, int setno, long nbuckets); static void hashagg_recompile_expressions(AggState *aggstate, bool minslot, bool nullcheck); +static void hash_create_memory(AggState *aggstate); static long hash_choose_num_buckets(double hashentrysize, long ngroups, Size memory); static int hash_choose_num_partitions(double input_groups, @@ -1509,7 +1510,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets) { AggStatePerHash perhash = &aggstate->perhash[setno]; MemoryContext
metacxt = aggstate->hash_metacxt; - MemoryContext hashcxt = aggstate->hashcontext->ecxt_per_tuple_memory; + MemoryContext tablecxt = aggstate->hash_tablecxt; MemoryContext tmpcxt = aggstate->tmpcontext->ecxt_per_tuple_memory; Size additionalsize; @@ -1535,7 +1536,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets) nbuckets, additionalsize, metacxt, - hashcxt, + tablecxt, tmpcxt, DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit)); } @@ -1706,15 +1707,19 @@ hash_agg_entry_size(int numTrans, Size tupleWidth, Size transitionSpace) tupleWidth); Size pergroupSize = numTrans * sizeof(AggStatePerGroupData); - tupleChunkSize = CHUNKHDRSZ + tupleSize; - - if (pergroupSize > 0) - pergroupChunkSize = CHUNKHDRSZ + pergroupSize; - else - pergroupChunkSize = 0; + /* + * Entries use the Bump allocator, so the chunk sizes are the same as the + * requested sizes. + */ + tupleChunkSize = MAXALIGN(tupleSize); + pergroupChunkSize = pergroupSize; + /* + * Transition values use AllocSet, which has a chunk header and also uses + * power-of-two allocations. + */ if (transitionSpace > 0) - transitionChunkSize = CHUNKHDRSZ + transitionSpace; + transitionChunkSize = CHUNKHDRSZ + pg_nextpower2_size_t(transitionSpace); else transitionChunkSize = 0; @@ -1864,8 +1869,11 @@ hash_agg_check_limits(AggState *aggstate) uint64 ngroups = aggstate->hash_ngroups_current; Size meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt, true); - Size hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, - true); + Size entry_mem = MemoryContextMemAllocated(aggstate->hash_tablecxt, + true); + Size tval_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, + true); + Size total_mem = meta_mem + entry_mem + tval_mem; bool do_spill = false; #ifdef USE_INJECTION_POINTS @@ -1884,7 +1892,7 @@ hash_agg_check_limits(AggState *aggstate) * can be sure to make progress even in edge cases. 
*/ if (aggstate->hash_ngroups_current > 0 && - (meta_mem + hashkey_mem > aggstate->hash_mem_limit || + (total_mem > aggstate->hash_mem_limit || ngroups > aggstate->hash_ngroups_limit)) { do_spill = true; @@ -1939,6 +1947,7 @@ static void hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions) { Size meta_mem; + Size entry_mem; Size hashkey_mem; Size buffer_mem; Size total_mem; @@ -1950,7 +1959,10 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions) /* memory for the hash table itself */ meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt, true); - /* memory for the group keys and transition states */ + /* memory for hash entries */ + entry_mem = MemoryContextMemAllocated(aggstate->hash_tablecxt, true); + + /* memory for byref transition states */ hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, true); /* memory for read/write tape buffers, if spilled */ @@ -1959,7 +1971,7 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions) buffer_mem += HASHAGG_READ_BUFFER_SIZE; /* update peak mem */ - total_mem = meta_mem + hashkey_mem + buffer_mem; + total_mem = meta_mem + entry_mem + hashkey_mem + buffer_mem; if (total_mem > aggstate->hash_mem_peak) aggstate->hash_mem_peak = total_mem; @@ -1981,6 +1993,64 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions) } } +/* + * Create memory contexts used for hash aggregation. + */ +static void +hash_create_memory(AggState *aggstate) +{ + Size maxBlockSize = ALLOCSET_DEFAULT_MAXSIZE; + + /* + * The hashcontext's per-tuple memory will be used for byref transition + * values and returned by AggCheckCallContext(). + */ + aggstate->hashcontext = CreateWorkExprContext(aggstate->ss.ps.state); + + /* + * The meta context will be used for the bucket array of + * TupleHashEntryData (or arrays, in the case of grouping sets). 
As the + * hash table grows, the bucket array will double in size and the old one + * will be freed, so an AllocSet is appropriate. For large bucket arrays, + * the large allocation path will be used, so it's not worth worrying + * about wasting space due to power-of-two allocations. + */ + aggstate->hash_metacxt = AllocSetContextCreate(aggstate->ss.ps.state->es_query_cxt, + "HashAgg meta context", + ALLOCSET_DEFAULT_SIZES); + + /* + * The hash entries themselves, which include the grouping key + * (firstTuple) and pergroup data, are stored in the table context. The + * bump allocator can be used because the entries are not freed until the + * entire hash table is reset. The bump allocator is faster for + * allocations and avoids wasting space on the chunk header or + * power-of-two allocations. + * + * Like CreateWorkExprContext(), use smaller sizings for smaller work_mem, + * to avoid large jumps in memory usage. + */ + + /* + * Start with the largest power of two that is no more than 1/16 of + * work_mem; the result is then clamped to the default size range. + */ + maxBlockSize = pg_prevpower2_size_t(work_mem * (Size) 1024 / 16); + + /* But no bigger than ALLOCSET_DEFAULT_MAXSIZE */ + maxBlockSize = Min(maxBlockSize, ALLOCSET_DEFAULT_MAXSIZE); + + /* and no smaller than ALLOCSET_DEFAULT_INITSIZE */ + maxBlockSize = Max(maxBlockSize, ALLOCSET_DEFAULT_INITSIZE); + + aggstate->hash_tablecxt = BumpContextCreate(aggstate->ss.ps.state->es_query_cxt, + "HashAgg table context", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + maxBlockSize); + +} + /* * Choose a reasonable number of buckets for the initial hash table size.
*/ @@ -2642,6 +2712,7 @@ agg_refill_hash_table(AggState *aggstate) /* free memory and reset hash tables */ ReScanExprContext(aggstate->hashcontext); + MemoryContextReset(aggstate->hash_tablecxt); for (int setno = 0; setno < aggstate->num_hashes; setno++) ResetTupleHashTable(aggstate->perhash[setno].hashtable); @@ -3326,7 +3397,7 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) } if (use_hashing) - aggstate->hashcontext = CreateWorkExprContext(estate); + hash_create_memory(aggstate); ExecAssignExprContext(estate, &aggstate->ss.ps); @@ -3621,9 +3692,6 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) Plan *outerplan = outerPlan(node); uint64 totalGroups = 0; - aggstate->hash_metacxt = AllocSetContextCreate(aggstate->ss.ps.state->es_query_cxt, - "HashAgg meta context", - ALLOCSET_DEFAULT_SIZES); aggstate->hash_spill_rslot = ExecInitExtraTupleSlot(estate, scanDesc, &TTSOpsMinimalTuple); aggstate->hash_spill_wslot = ExecInitExtraTupleSlot(estate, scanDesc, @@ -4368,6 +4436,12 @@ ExecEndAgg(AggState *node) MemoryContextDelete(node->hash_metacxt); node->hash_metacxt = NULL; } + if (node->hash_tablecxt != NULL) + { + MemoryContextDelete(node->hash_tablecxt); + node->hash_tablecxt = NULL; + } + for (transno = 0; transno < node->numtrans; transno++) { @@ -4484,6 +4558,7 @@ ExecReScanAgg(AggState *node) node->hash_ngroups_current = 0; ReScanExprContext(node->hashcontext); + MemoryContextReset(node->hash_tablecxt); /* Rebuild an empty hash table */ build_hash_tables(node); node->table_filled = false; diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index d4d4e655180..b5539ddb41e 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -2560,7 +2560,8 @@ typedef struct AggState /* these fields are used in AGG_HASHED and AGG_MIXED modes: */ bool table_filled; /* hash table filled yet? 
*/ int num_hashes; - MemoryContext hash_metacxt; /* memory for hash table itself */ + MemoryContext hash_metacxt; /* memory for hash table bucket array */ + MemoryContext hash_tablecxt; /* memory for hash table entries */ struct LogicalTapeSet *hash_tapeset; /* tape set for hash spill tapes */ struct HashAggSpill *hash_spills; /* HashAggSpill for each grouping set, * exists only during first pass */ @@ -2586,7 +2587,7 @@ typedef struct AggState * per-group pointers */ /* support for evaluation of agg input expressions: */ -#define FIELDNO_AGGSTATE_ALL_PERGROUPS 53 +#define FIELDNO_AGGSTATE_ALL_PERGROUPS 54 AggStatePerGroup *all_pergroups; /* array of first ->pergroups, than * ->hash_pergroup */ SharedAggInfo *shared_info; /* one entry per worker */