diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c
index f34530fdacf..b4bdaa3c305 100644
--- a/src/backend/executor/execGrouping.c
+++ b/src/backend/executor/execGrouping.c
@@ -145,8 +145,8 @@ execTuplesHashPrepare(int numCols,
  * collations: collations to use in comparisons
  * nbuckets: initial estimate of hashtable size
  * additionalsize: size of data that may be stored along with the hash entry
- * metacxt: memory context for long-lived allocation, but not per-entry data
- * tablecxt: memory context in which to store table entries
+ * metacxt: memory context for long-lived data and the simplehash table
+ * tuplescxt: memory context in which to store the hashed tuples themselves
  * tempcxt: short-lived context for evaluation hash and comparison functions
  * use_variable_hash_iv: if true, adjust hash IV per-parallel-worker
  *
@@ -157,11 +157,25 @@ execTuplesHashPrepare(int numCols,
  * Note that the keyColIdx, hashfunctions, and collations arrays must be
  * allocated in storage that will live as long as the hashtable does.
  *
+ * The metacxt and tuplescxt are separate because it's usually desirable for
+ * tuplescxt to be a BumpContext to avoid memory wastage, while metacxt must
+ * support pfree in case the simplehash table needs to be enlarged. (We could
+ * simplify the API of TupleHashTables by managing the tuplescxt internally.
+ * But that would be disadvantageous to nodeAgg.c and nodeSubplan.c, which use
+ * a single tuplescxt for multiple TupleHashTables that are reset together.)
+ *
  * LookupTupleHashEntry, FindTupleHashEntry, and related functions may leak
  * memory in the tempcxt. It is caller's responsibility to reset that context
  * reasonably often, typically once per tuple. (We do it that way, rather
  * than managing an extra context within the hashtable, because in many cases
  * the caller can specify a tempcxt that it needs to reset per-tuple anyway.)
+ *
+ * We don't currently provide DestroyTupleHashTable functionality; the hash
+ * table will be cleaned up at destruction of the metacxt. (Some callers
+ * bother to delete the tuplescxt explicitly, though it'd be sufficient to
+ * ensure it's a child of the metacxt.) There's not much point in working
+ * harder than this so long as the expression-evaluation infrastructure
+ * behaves similarly.
  */
 TupleHashTable
 BuildTupleHashTable(PlanState *parent,
@@ -175,7 +189,7 @@ BuildTupleHashTable(PlanState *parent,
                     long nbuckets,
                     Size additionalsize,
                     MemoryContext metacxt,
-                    MemoryContext tablecxt,
+                    MemoryContext tuplescxt,
                     MemoryContext tempcxt,
                     bool use_variable_hash_iv)
 {
@@ -183,14 +197,24 @@ BuildTupleHashTable(PlanState *parent,
     Size        entrysize;
     Size        hash_mem_limit;
     MemoryContext oldcontext;
-    bool        allow_jit;
     uint32      hash_iv = 0;
 
     Assert(nbuckets > 0);
-    additionalsize = MAXALIGN(additionalsize);
-    entrysize = sizeof(TupleHashEntryData) + additionalsize;
 
-    /* Limit initial table size request to not more than hash_mem */
+    /* tuplescxt must be separate, else ResetTupleHashTable breaks things */
+    Assert(metacxt != tuplescxt);
+
+    /* ensure additionalsize is maxalign'ed */
+    additionalsize = MAXALIGN(additionalsize);
+
+    /*
+     * Limit initial table size request to not more than hash_mem.
+     *
+     * XXX this calculation seems pretty misguided, as it counts only overhead
+     * and not the tuples themselves. But we have no knowledge of the
+     * expected tuple width here.
+     */
+    entrysize = sizeof(TupleHashEntryData) + additionalsize;
     hash_mem_limit = get_hash_memory_limit() / entrysize;
     if (nbuckets > hash_mem_limit)
         nbuckets = hash_mem_limit;
@@ -202,7 +226,7 @@
     hashtable->numCols = numCols;
     hashtable->keyColIdx = keyColIdx;
     hashtable->tab_collations = collations;
-    hashtable->tablecxt = tablecxt;
+    hashtable->tuplescxt = tuplescxt;
     hashtable->tempcxt = tempcxt;
     hashtable->additionalsize = additionalsize;
     hashtable->tableslot = NULL;    /* will be made on first lookup */
@@ -230,16 +254,6 @@
     hashtable->tableslot =
         MakeSingleTupleTableSlot(CreateTupleDescCopy(inputDesc),
                                  &TTSOpsMinimalTuple);
-    /*
-     * If the caller fails to make the metacxt different from the tablecxt,
-     * allowing JIT would lead to the generated functions to a) live longer
-     * than the query or b) be re-generated each time the table is being
-     * reset. Therefore prevent JIT from being used in that case, by not
-     * providing a parent node (which prevents accessing the JitContext in the
-     * EState).
-     */
-    allow_jit = (metacxt != tablecxt);
-
     /* build hash ExprState for all columns */
     hashtable->tab_hash_expr = ExecBuildHash32FromAttrs(inputDesc,
                                                         inputOps,
@@ -247,7 +261,7 @@
                                                         collations,
                                                         numCols,
                                                         keyColIdx,
-                                                        allow_jit ? parent : NULL,
+                                                        parent,
                                                         hash_iv);
 
     /* build comparator for all columns */
@@ -256,7 +270,7 @@
                                                 &TTSOpsMinimalTuple,
                                                 numCols, keyColIdx,
                                                 eqfuncoids, collations,
-                                                allow_jit ? parent : NULL);
+                                                parent);
 
     /*
      * While not pretty, it's ok to not shut down this context, but instead
@@ -273,13 +287,19 @@
 
 /*
  * Reset contents of the hashtable to be empty, preserving all the non-content
- * state. Note that the tablecxt passed to BuildTupleHashTable() should
- * also be reset, otherwise there will be leaks.
+ * state.
+ *
+ * Note: in usages where several TupleHashTables share a tuplescxt, all must
+ * be reset together, as the first one's reset call will destroy all their
+ * data. The additional reset calls for the rest will redundantly reset the
+ * tuplescxt. But because of mcxt.c's isReset flag, that's cheap enough that
+ * we need not avoid it.
  */
 void
 ResetTupleHashTable(TupleHashTable hashtable)
 {
     tuplehash_reset(hashtable->hashtab);
+    MemoryContextReset(hashtable->tuplescxt);
 }
 
 /*
@@ -489,10 +509,10 @@ LookupTupleHashEntry_internal(TupleHashTable hashtable, TupleTableSlot *slot,
         {
             /* created new entry */
             *isnew = true;
-            MemoryContextSwitchTo(hashtable->tablecxt);
+            MemoryContextSwitchTo(hashtable->tuplescxt);
 
             /*
-             * Copy the first tuple into the table context, and request
+             * Copy the first tuple into the tuples context, and request
              * additionalsize extra bytes before the allocation.
              *
             * The caller can get a pointer to the additional data with
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index 64643c3943a..759ffeed2e6 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -1457,7 +1457,7 @@ find_cols_walker(Node *node, FindColsContext *context)
  * We have a separate hashtable and associated perhash data structure for each
  * grouping set for which we're doing hashing.
  *
- * The contents of the hash tables always live in the hashcontext's per-tuple
+ * The contents of the hash tables live in the aggstate's hash_tuplescxt
  * memory context (there is only one of these for all tables together, since
 * they are all reset at the same time).
  */
@@ -1509,7 +1509,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets)
 {
     AggStatePerHash perhash = &aggstate->perhash[setno];
     MemoryContext metacxt = aggstate->hash_metacxt;
-    MemoryContext tablecxt = aggstate->hash_tablecxt;
+    MemoryContext tuplescxt = aggstate->hash_tuplescxt;
     MemoryContext tmpcxt = aggstate->tmpcontext->ecxt_per_tuple_memory;
     Size        additionalsize;
 
@@ -1535,7 +1535,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets)
                                              nbuckets,
                                              additionalsize,
                                              metacxt,
-                                             tablecxt,
+                                             tuplescxt,
                                              tmpcxt,
                                              DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit));
 }
@@ -1868,7 +1868,7 @@ hash_agg_check_limits(AggState *aggstate)
     uint64      ngroups = aggstate->hash_ngroups_current;
     Size        meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt,
                                                      true);
-    Size        entry_mem = MemoryContextMemAllocated(aggstate->hash_tablecxt,
+    Size        entry_mem = MemoryContextMemAllocated(aggstate->hash_tuplescxt,
                                                       true);
     Size        tval_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory,
                                                      true);
@@ -1959,7 +1959,7 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
     meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt, true);
 
     /* memory for hash entries */
-    entry_mem = MemoryContextMemAllocated(aggstate->hash_tablecxt, true);
+    entry_mem = MemoryContextMemAllocated(aggstate->hash_tuplescxt, true);
 
     /* memory for byref transition states */
     hashkey_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory, true);
@@ -2042,11 +2042,11 @@ hash_create_memory(AggState *aggstate)
     /* and no smaller than ALLOCSET_DEFAULT_INITSIZE */
     maxBlockSize = Max(maxBlockSize, ALLOCSET_DEFAULT_INITSIZE);
 
-    aggstate->hash_tablecxt = BumpContextCreate(aggstate->ss.ps.state->es_query_cxt,
-                                                "HashAgg table context",
-                                                ALLOCSET_DEFAULT_MINSIZE,
-                                                ALLOCSET_DEFAULT_INITSIZE,
-                                                maxBlockSize);
+    aggstate->hash_tuplescxt = BumpContextCreate(aggstate->ss.ps.state->es_query_cxt,
+                                                 "HashAgg hashed tuples",
+                                                 ALLOCSET_DEFAULT_MINSIZE,
+                                                 ALLOCSET_DEFAULT_INITSIZE,
+                                                 maxBlockSize);
 }
 
@@ -2707,7 +2707,6 @@ agg_refill_hash_table(AggState *aggstate)
 
     /* free memory and reset hash tables */
     ReScanExprContext(aggstate->hashcontext);
-    MemoryContextReset(aggstate->hash_tablecxt);
     for (int setno = 0; setno < aggstate->num_hashes; setno++)
         ResetTupleHashTable(aggstate->perhash[setno].hashtable);
 
@@ -4428,18 +4427,18 @@ ExecEndAgg(AggState *node)
 
     hashagg_reset_spill_state(node);
 
+    /* Release hash tables too */
     if (node->hash_metacxt != NULL)
     {
         MemoryContextDelete(node->hash_metacxt);
         node->hash_metacxt = NULL;
     }
-    if (node->hash_tablecxt != NULL)
+    if (node->hash_tuplescxt != NULL)
     {
-        MemoryContextDelete(node->hash_tablecxt);
-        node->hash_tablecxt = NULL;
+        MemoryContextDelete(node->hash_tuplescxt);
+        node->hash_tuplescxt = NULL;
     }
-
     for (transno = 0; transno < node->numtrans; transno++)
     {
         AggStatePerTrans pertrans = &node->pertrans[transno];
@@ -4555,8 +4554,7 @@ ExecReScanAgg(AggState *node)
         node->hash_ngroups_current = 0;
 
         ReScanExprContext(node->hashcontext);
-        MemoryContextReset(node->hash_tablecxt);
-        /* Rebuild an empty hash table */
+        /* Rebuild empty hash table(s) */
         build_hash_tables(node);
         node->table_filled = false;
         /* iterator will be reset when the table is filled */
diff --git a/src/backend/executor/nodeRecursiveunion.c b/src/backend/executor/nodeRecursiveunion.c
index 40f66fd0680..ebb7919b49b 100644
--- a/src/backend/executor/nodeRecursiveunion.c
+++ b/src/backend/executor/nodeRecursiveunion.c
@@ -53,7 +53,7 @@ build_hash_table(RecursiveUnionState *rustate)
                                      node->numGroups,
                                      0,
                                      rustate->ps.state->es_query_cxt,
-                                     rustate->tableContext,
+                                     rustate->tuplesContext,
                                      rustate->tempContext,
                                      false);
 }
@@ -197,7 +197,7 @@ ExecInitRecursiveUnion(RecursiveUnion *node, EState *estate, int eflags)
     rustate->hashfunctions = NULL;
     rustate->hashtable = NULL;
     rustate->tempContext = NULL;
-    rustate->tableContext = NULL;
+    rustate->tuplesContext = NULL;
 
     /* initialize processing state */
     rustate->recursing = false;
@@ -209,7 +209,8 @@ ExecInitRecursiveUnion(RecursiveUnion *node, EState *estate, int eflags)
      * If hashing, we need a per-tuple memory context for comparisons, and a
      * longer-lived context to store the hash table. The table can't just be
      * kept in the per-query context because we want to be able to throw it
-     * away when rescanning.
+     * away when rescanning. We can use a BumpContext to save storage,
+     * because we will have no need to delete individual table entries.
      */
     if (node->numCols > 0)
     {
@@ -217,10 +218,10 @@ ExecInitRecursiveUnion(RecursiveUnion *node, EState *estate, int eflags)
             AllocSetContextCreate(CurrentMemoryContext,
                                   "RecursiveUnion",
                                   ALLOCSET_DEFAULT_SIZES);
-        rustate->tableContext =
-            AllocSetContextCreate(CurrentMemoryContext,
-                                  "RecursiveUnion hash table",
-                                  ALLOCSET_DEFAULT_SIZES);
+        rustate->tuplesContext =
+            BumpContextCreate(CurrentMemoryContext,
+                              "RecursiveUnion hashed tuples",
+                              ALLOCSET_DEFAULT_SIZES);
     }
 
     /*
@@ -288,11 +289,11 @@ ExecEndRecursiveUnion(RecursiveUnionState *node)
     tuplestore_end(node->working_table);
     tuplestore_end(node->intermediate_table);
 
-    /* free subsidiary stuff including hashtable */
+    /* free subsidiary stuff including hashtable data */
     if (node->tempContext)
         MemoryContextDelete(node->tempContext);
-    if (node->tableContext)
-        MemoryContextDelete(node->tableContext);
+    if (node->tuplesContext)
+        MemoryContextDelete(node->tuplesContext);
 
     /*
      * close down subplans
@@ -328,10 +329,6 @@ ExecReScanRecursiveUnion(RecursiveUnionState *node)
     if (outerPlan->chgParam == NULL)
         ExecReScan(outerPlan);
 
-    /* Release any hashtable storage */
-    if (node->tableContext)
-        MemoryContextReset(node->tableContext);
-
     /* Empty hashtable if needed */
     if (plan->numCols > 0)
         ResetTupleHashTable(node->hashtable);
diff --git a/src/backend/executor/nodeSetOp.c b/src/backend/executor/nodeSetOp.c
index 4068481a523..7b223a7ca3a 100644
--- a/src/backend/executor/nodeSetOp.c
+++ b/src/backend/executor/nodeSetOp.c
@@ -106,7 +106,7 @@ build_hash_table(SetOpState *setopstate)
                                                node->numGroups,
                                                sizeof(SetOpStatePerGroupData),
                                                setopstate->ps.state->es_query_cxt,
-                                               setopstate->tableContext,
+                                               setopstate->tuplesContext,
                                                econtext->ecxt_per_tuple_memory,
                                                false);
 }
@@ -589,13 +589,15 @@
     /*
     * If hashing, we also need a longer-lived context to store the hash
     * table. The table can't just be kept in the per-query context because
-     * we want to be able to throw it away in ExecReScanSetOp.
+     * we want to be able to throw it away in ExecReScanSetOp. We can use a
+     * BumpContext to save storage, because we will have no need to delete
+     * individual table entries.
      */
     if (node->strategy == SETOP_HASHED)
-        setopstate->tableContext =
-            AllocSetContextCreate(CurrentMemoryContext,
-                                  "SetOp hash table",
-                                  ALLOCSET_DEFAULT_SIZES);
+        setopstate->tuplesContext =
+            BumpContextCreate(CurrentMemoryContext,
+                              "SetOp hashed tuples",
+                              ALLOCSET_DEFAULT_SIZES);
 
     /*
      * initialize child nodes
@@ -680,9 +682,9 @@ ExecInitSetOp(SetOp *node, EState *estate, int eflags)
 void
 ExecEndSetOp(SetOpState *node)
 {
-    /* free subsidiary stuff including hashtable */
-    if (node->tableContext)
-        MemoryContextDelete(node->tableContext);
+    /* free subsidiary stuff including hashtable data */
+    if (node->tuplesContext)
+        MemoryContextDelete(node->tuplesContext);
 
     ExecEndNode(outerPlanState(node));
     ExecEndNode(innerPlanState(node));
@@ -721,11 +723,7 @@ ExecReScanSetOp(SetOpState *node)
         return;
     }
 
-    /* Release any hashtable storage */
-    if (node->tableContext)
-        MemoryContextReset(node->tableContext);
-
-    /* And rebuild an empty hashtable */
+    /* Else, we must rebuild the hashtable */
     ResetTupleHashTable(node->hashtable);
     node->table_filled = false;
 }
diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c
index 53fb56f7388..9f6e45bcb0b 100644
--- a/src/backend/executor/nodeSubplan.c
+++ b/src/backend/executor/nodeSubplan.c
@@ -506,7 +506,6 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext)
      * saves a needless fetch inner op step for the hashing ExprState created
      * in BuildTupleHashTable().
      */
-    MemoryContextReset(node->hashtablecxt);
     node->havehashrows = false;
     node->havenullrows = false;
 
@@ -528,7 +527,7 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext)
                                           nbuckets,
                                           0,
                                           node->planstate->state->es_query_cxt,
-                                          node->hashtablecxt,
+                                          node->tuplesContext,
                                           innerecontext->ecxt_per_tuple_memory,
                                           false);
 
@@ -557,7 +556,7 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext)
                                               nbuckets,
                                               0,
                                               node->planstate->state->es_query_cxt,
-                                              node->hashtablecxt,
+                                              node->tuplesContext,
                                               innerecontext->ecxt_per_tuple_memory,
                                               false);
     }
@@ -838,7 +837,7 @@ ExecInitSubPlan(SubPlan *subplan, PlanState *parent)
     sstate->projRight = NULL;
     sstate->hashtable = NULL;
     sstate->hashnulls = NULL;
-    sstate->hashtablecxt = NULL;
+    sstate->tuplesContext = NULL;
     sstate->innerecontext = NULL;
     sstate->keyColIdx = NULL;
     sstate->tab_eq_funcoids = NULL;
@@ -889,11 +888,11 @@ ExecInitSubPlan(SubPlan *subplan, PlanState *parent)
                *righttlist;
         ListCell   *l;
 
-        /* We need a memory context to hold the hash table(s) */
-        sstate->hashtablecxt =
-            AllocSetContextCreate(CurrentMemoryContext,
-                                  "Subplan HashTable Context",
-                                  ALLOCSET_DEFAULT_SIZES);
+        /* We need a memory context to hold the hash table(s)' tuples */
+        sstate->tuplesContext =
+            BumpContextCreate(CurrentMemoryContext,
+                              "SubPlan hashed tuples",
+                              ALLOCSET_DEFAULT_SIZES);
 
         /* and a short-lived exprcontext for function evaluation */
         sstate->innerecontext = CreateExprContext(estate);
diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h
index 26db9522d8b..8e7a5453064 100644
--- a/src/include/executor/executor.h
+++ b/src/include/executor/executor.h
@@ -141,7 +141,7 @@ extern TupleHashTable BuildTupleHashTable(PlanState *parent,
                                           long nbuckets,
                                           Size additionalsize,
                                           MemoryContext metacxt,
-                                          MemoryContext tablecxt,
+                                          MemoryContext tuplescxt,
                                           MemoryContext tempcxt,
                                           bool use_variable_hash_iv);
 extern TupleHashEntry LookupTupleHashEntry(TupleHashTable hashtable,
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index a36653c37f9..18ae8f0d4bb 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -844,9 +844,15 @@ typedef struct ExecAuxRowMark
 typedef struct TupleHashEntryData *TupleHashEntry;
 typedef struct TupleHashTableData *TupleHashTable;
 
+/*
+ * TupleHashEntryData is a slot in the tuplehash_hash table. If it's
+ * populated, it contains a pointer to a MinimalTuple that can also have
+ * associated "additional data". That's stored in the TupleHashTable's
+ * tuplescxt.
+ */
 typedef struct TupleHashEntryData
 {
-    MinimalTuple firstTuple;    /* copy of first tuple in this group */
+    MinimalTuple firstTuple;    /* -> copy of first tuple in this group */
     uint32      status;         /* hash status */
     uint32      hash;           /* hash value (cached) */
 } TupleHashEntryData;
@@ -861,13 +867,13 @@ typedef struct TupleHashEntryData
 
 typedef struct TupleHashTableData
 {
-    tuplehash_hash *hashtab;    /* underlying hash table */
+    tuplehash_hash *hashtab;    /* underlying simplehash hash table */
     int         numCols;        /* number of columns in lookup key */
     AttrNumber *keyColIdx;      /* attr numbers of key columns */
     ExprState  *tab_hash_expr;  /* ExprState for hashing table datatype(s) */
     ExprState  *tab_eq_func;    /* comparator for table datatype(s) */
     Oid        *tab_collations; /* collations for hash and comparison */
-    MemoryContext tablecxt;     /* memory context containing table */
+    MemoryContext tuplescxt;    /* memory context storing hashed tuples */
     MemoryContext tempcxt;      /* context for function evaluations */
     Size        additionalsize; /* size of additional data */
     TupleTableSlot *tableslot;  /* slot for referencing table entries */
@@ -1017,7 +1023,7 @@ typedef struct SubPlanState
     TupleHashTable hashnulls;   /* hash table for rows with null(s) */
     bool        havehashrows;   /* true if hashtable is not empty */
     bool        havenullrows;   /* true if hashnulls is not empty */
-    MemoryContext hashtablecxt; /* memory context containing hash tables */
+    MemoryContext tuplesContext;    /* context containing hash tables' tuples */
     ExprContext *innerecontext; /* econtext for computing inner tuples */
     int         numCols;        /* number of columns being hashed */
     /* each of the remaining fields is an array of length numCols: */
@@ -1566,7 +1572,7 @@ typedef struct RecursiveUnionState
     FmgrInfo   *hashfunctions;  /* per-grouping-field hash fns */
     MemoryContext tempContext;  /* short-term context for comparisons */
     TupleHashTable hashtable;   /* hash table for tuples already seen */
-    MemoryContext tableContext; /* memory context containing hash table */
+    MemoryContext tuplesContext;    /* context containing hash table's tuples */
 } RecursiveUnionState;
 
 /* ----------------
@@ -2567,7 +2573,7 @@ typedef struct AggState
     bool        table_filled;   /* hash table filled yet? */
     int         num_hashes;
     MemoryContext hash_metacxt; /* memory for hash table bucket array */
-    MemoryContext hash_tablecxt;    /* memory for hash table entries */
+    MemoryContext hash_tuplescxt;   /* memory for hash table tuples */
     struct LogicalTapeSet *hash_tapeset;    /* tape set for hash spill tapes */
     struct HashAggSpill *hash_spills;   /* HashAggSpill for each grouping set,
                                          * exists only during first pass */
@@ -2866,7 +2872,7 @@ typedef struct SetOpState
     Oid        *eqfuncoids;     /* per-grouping-field equality fns */
     FmgrInfo   *hashfunctions;  /* per-grouping-field hash fns */
     TupleHashTable hashtable;   /* hash table with one entry per group */
-    MemoryContext tableContext; /* memory context containing hash table */
+    MemoryContext tuplesContext;    /* context containing hash table's tuples */
     bool        table_filled;   /* hash table filled yet? */
     TupleHashIterator hashiter; /* for iterating through hash table */
 } SetOpState;
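
Reviewer notes (cover-letter commentary, not part of the patch): the caller-visible change is that ResetTupleHashTable() now resets the tuples context itself, which is why the nodeAgg.c, nodeRecursiveunion.c, and nodeSetOp.c hunks above drop their explicit MemoryContextReset() calls. A minimal before/after sketch of the caller pattern, using the SetOpState field names from this patch:

    /* Before: the caller had to empty the tuple storage separately. */
    if (node->tableContext)
        MemoryContextReset(node->tableContext);
    ResetTupleHashTable(node->hashtable);

    /*
     * After: a single call empties both the simplehash table (whose memory
     * lives in the metacxt) and the hashed tuples (which live in the
     * caller-supplied tuplescxt).
     */
    ResetTupleHashTable(node->hashtable);

The matching setup change, per the new ExecInitSetOp() hunk, is that the tuples context can now be a BumpContext, which wastes less memory than an AllocSetContext because hashed tuples are never pfree'd individually:

    setopstate->tuplesContext =
        BumpContextCreate(CurrentMemoryContext,
                          "SetOp hashed tuples",
                          ALLOCSET_DEFAULT_SIZES);

Note the constraint spelled out in the execGrouping.c header comment: when several TupleHashTables share one tuplescxt, as in nodeAgg.c and nodeSubplan.c, they must all be reset together, since the first ResetTupleHashTable() call destroys every table's tuples; the later calls' redundant resets are cheap thanks to mcxt.c's isReset flag.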