Use bump memory context for tuplesorts
29f6a959c added a bump allocator type for efficient compact allocations. Here we make use of this for non-bounded tuplesorts to store tuples. This is very space efficient when storing narrow tuples due to bump.c not having chunk headers. This means we can fit more tuples in work_mem before spilling to disk, or perform an in-memory sort while touching fewer cachelines.

Author: David Rowley
Reviewed-by: Nathan Bossart
Reviewed-by: Matthias van de Meent
Reviewed-by: Tomas Vondra
Reviewed-by: John Naylor
Discussion: https://postgr.es/m/CAApHDvqGSpCU95TmM=Bp=6xjL_nLys4zdZOpfNyWBk97Xrdj2w@mail.gmail.com
This commit is contained in:
Parent: f3ff7bf83b
Commit: 6ed83d5fa5
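The bump allocator trades pfree()/repalloc() support for zero per-chunk overhead. A minimal sketch (not part of this commit; bump_example and the 64-byte request are illustrative assumptions) of how backend code can use the bump context API added in 29f6a959c:

#include "postgres.h"
#include "utils/memutils.h"

/* Illustrative only: not from this commit */
static void
bump_example(MemoryContext parent)
{
	MemoryContext cxt = BumpContextCreate(parent,
										  "bump example",
										  ALLOCSET_DEFAULT_SIZES);
	char	   *tup = MemoryContextAlloc(cxt, 64);	/* no chunk header stored */

	/* ... fill and use tuples allocated in "cxt" ... */
	(void) tup;

	/*
	 * pfree(tup) and GetMemoryChunkSpace(tup) are not supported for bump
	 * chunks; memory comes back only by resetting or deleting the context.
	 */
	MemoryContextDelete(cxt);
}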
src/backend/utils/sort/tuplesort.c

@@ -191,6 +191,11 @@ struct Tuplesortstate
 									 * tuples to return? */
 	bool		boundUsed;		/* true if we made use of a bounded heap */
 	int			bound;			/* if bounded, the maximum number of tuples */
+	int64		tupleMem;		/* memory consumed by individual tuples.
+								 * storing this separately from what we track
+								 * in availMem allows us to subtract the
+								 * memory consumed by all tuples when dumping
+								 * tuples to tape */
 	int64		availMem;		/* remaining memory available, in bytes */
 	int64		allowedMem;		/* total memory allowed, in bytes */
 	int			maxTapes;		/* max number of input tapes to merge in each
@@ -764,16 +769,16 @@ tuplesort_begin_batch(Tuplesortstate *state)
 	 * in the parent context, not this context, because there is no need to
 	 * free memtuples early.  For bounded sorts, tuples may be pfreed in any
 	 * order, so we use a regular aset.c context so that it can make use of
-	 * free'd memory.  When the sort is not bounded, we make use of a
-	 * generation.c context as this keeps allocations more compact with less
-	 * wastage.  Allocations are also slightly more CPU efficient.
+	 * free'd memory.  When the sort is not bounded, we make use of a bump.c
+	 * context as this keeps allocations more compact with less wastage.
+	 * Allocations are also slightly more CPU efficient.
 	 */
-	if (state->base.sortopt & TUPLESORT_ALLOWBOUNDED)
-		state->base.tuplecontext = AllocSetContextCreate(state->base.sortcontext,
-														 "Caller tuples",
-														 ALLOCSET_DEFAULT_SIZES);
-	else
-		state->base.tuplecontext = GenerationContextCreate(state->base.sortcontext,
-														   "Caller tuples",
-														   ALLOCSET_DEFAULT_SIZES);
+	if (TupleSortUseBumpTupleCxt(state->base.sortopt))
+		state->base.tuplecontext = BumpContextCreate(state->base.sortcontext,
+													 "Caller tuples",
+													 ALLOCSET_DEFAULT_SIZES);
+	else
+		state->base.tuplecontext = AllocSetContextCreate(state->base.sortcontext,
+														 "Caller tuples",
+														 ALLOCSET_DEFAULT_SIZES);
 
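For context, whether a sort gets the bump context is decided entirely by the caller's sortopt flags. A hedged usage sketch follows; tupDesc, nkeys, attNums, sortOperators, sortCollations and nullsFirstFlags are placeholder variables, not names from this commit:

/*
 * Illustrative only: omitting TUPLESORT_ALLOWBOUNDED means
 * TupleSortUseBumpTupleCxt() is true and tuples go into the cheaper bump.c
 * context; a caller that may later call tuplesort_set_bound() must pass
 * TUPLESORT_ALLOWBOUNDED and gets the pfree-capable aset.c context instead.
 */
Tuplesortstate *sort = tuplesort_begin_heap(tupDesc, nkeys, attNums,
											sortOperators, sortCollations,
											nullsFirstFlags, work_mem,
											NULL,	/* no SortCoordinate */
											TUPLESORT_NONE);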
@@ -1181,15 +1186,16 @@ noalloc:
  * Shared code for tuple and datum cases.
  */
 void
-tuplesort_puttuple_common(Tuplesortstate *state, SortTuple *tuple, bool useAbbrev)
+tuplesort_puttuple_common(Tuplesortstate *state, SortTuple *tuple,
+						  bool useAbbrev, Size tuplen)
 {
 	MemoryContext oldcontext = MemoryContextSwitchTo(state->base.sortcontext);
 
 	Assert(!LEADER(state));
 
-	/* Count the size of the out-of-line data */
-	if (tuple->tuple != NULL)
-		USEMEM(state, GetMemoryChunkSpace(tuple->tuple));
+	/* account for the memory used for this tuple */
+	USEMEM(state, tuplen);
+	state->tupleMem += tuplen;
 
 	if (!useAbbrev)
 	{
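The USEMEM() call above folds the tuple's size into the sort's memory budget while also accumulating it in the new tupleMem field. For reference, the accounting macros near the top of tuplesort.c are simple arithmetic on availMem:

#define USEMEM(state,amt)	((state)->availMem -= (amt))
#define FREEMEM(state,amt)	((state)->availMem += (amt))

Because every tuple's charge is also accumulated in tupleMem, dumptuples() can hand all of it back with a single FREEMEM() after resetting the tuple context, with no need to query each chunk's size on the way out.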
@@ -2397,13 +2403,6 @@ dumptuples(Tuplesortstate *state, bool alltuples)
 		SortTuple  *stup = &state->memtuples[i];
 
 		WRITETUP(state, state->destTape, stup);
-
-		/*
-		 * Account for freeing the tuple, but no need to do the actual pfree
-		 * since the tuplecontext is being reset after the loop.
-		 */
-		if (stup->tuple != NULL)
-			FREEMEM(state, GetMemoryChunkSpace(stup->tuple));
 	}
 
 	state->memtupcount = 0;
@@ -2411,12 +2410,19 @@ dumptuples(Tuplesortstate *state, bool alltuples)
 	/*
 	 * Reset tuple memory.  We've freed all of the tuples that we previously
 	 * allocated.  It's important to avoid fragmentation when there is a stark
-	 * change in the sizes of incoming tuples.  Fragmentation due to
-	 * AllocSetFree's bucketing by size class might be particularly bad if
-	 * this step wasn't taken.
+	 * change in the sizes of incoming tuples.  In bounded sorts,
+	 * fragmentation due to AllocSetFree's bucketing by size class might be
+	 * particularly bad if this step wasn't taken.
 	 */
 	MemoryContextReset(state->base.tuplecontext);
 
+	/*
+	 * Now update the memory accounting to subtract the memory used by the
+	 * tuple.
+	 */
+	FREEMEM(state, state->tupleMem);
+	state->tupleMem = 0;
+
 	markrunend(state->destTape);
 
 #ifdef TRACE_SORT
src/backend/utils/sort/tuplesortvariants.c

@@ -674,6 +674,7 @@ tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot)
 	SortTuple	stup;
 	MinimalTuple tuple;
 	HeapTupleData htup;
+	Size		tuplen;
 
 	/* copy the tuple into sort storage */
 	tuple = ExecCopySlotMinimalTuple(slot);
@@ -686,9 +687,15 @@ tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot)
 									tupDesc,
 									&stup.isnull1);
 
+	/* GetMemoryChunkSpace is not supported for bump contexts */
+	if (TupleSortUseBumpTupleCxt(base->sortopt))
+		tuplen = MAXALIGN(tuple->t_len);
+	else
+		tuplen = GetMemoryChunkSpace(tuple);
+
 	tuplesort_puttuple_common(state, &stup,
 							  base->sortKeys->abbrev_converter &&
-							  !stup.isnull1);
+							  !stup.isnull1, tuplen);
 
 	MemoryContextSwitchTo(oldcontext);
 }
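This if/else pattern repeats in each putter below because GetMemoryChunkSpace() reads the chunk header, which bump chunks do not have. A sketch of the substitution being made (the 52-byte tuple is an illustrative assumption, not a measurement):

/*
 * Illustrative only.  aset.c rounds a request up to a size class and
 * prepends a chunk header, all of which GetMemoryChunkSpace() reports.
 * bump.c writes no header, so the closest available estimate of the space
 * a tuple occupies is its length rounded up to MAXALIGN.
 */
Size		request = tuple->t_len;			/* hypothetical, e.g. 52 bytes */
Size		bump_space = MAXALIGN(request); /* 56 with 8-byte MAXALIGN */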
@@ -705,6 +712,7 @@ tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup)
 	TuplesortPublic *base = TuplesortstateGetPublic(state);
 	MemoryContext oldcontext = MemoryContextSwitchTo(base->tuplecontext);
 	TuplesortClusterArg *arg = (TuplesortClusterArg *) base->arg;
+	Size		tuplen;
 
 	/* copy the tuple into sort storage */
 	tup = heap_copytuple(tup);
@@ -722,10 +730,16 @@ tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup)
 							   &stup.isnull1);
 	}
 
+	/* GetMemoryChunkSpace is not supported for bump contexts */
+	if (TupleSortUseBumpTupleCxt(base->sortopt))
+		tuplen = MAXALIGN(HEAPTUPLESIZE + tup->t_len);
+	else
+		tuplen = GetMemoryChunkSpace(tup);
+
 	tuplesort_puttuple_common(state, &stup,
 							  base->haveDatum1 &&
 							  base->sortKeys->abbrev_converter &&
-							  !stup.isnull1);
+							  !stup.isnull1, tuplen);
 
 	MemoryContextSwitchTo(oldcontext);
 }
@@ -743,6 +757,7 @@ tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel,
 	IndexTuple	tuple;
 	TuplesortPublic *base = TuplesortstateGetPublic(state);
 	TuplesortIndexArg *arg = (TuplesortIndexArg *) base->arg;
+	Size		tuplen;
 
 	stup.tuple = index_form_tuple_context(RelationGetDescr(rel), values,
 										  isnull, base->tuplecontext);
@@ -754,10 +769,16 @@ tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel,
 								  RelationGetDescr(arg->indexRel),
 								  &stup.isnull1);
 
+	/* GetMemoryChunkSpace is not supported for bump contexts */
+	if (TupleSortUseBumpTupleCxt(base->sortopt))
+		tuplen = MAXALIGN(tuple->t_info & INDEX_SIZE_MASK);
+	else
+		tuplen = GetMemoryChunkSpace(tuple);
+
 	tuplesort_puttuple_common(state, &stup,
 							  base->sortKeys &&
 							  base->sortKeys->abbrev_converter &&
-							  !stup.isnull1);
+							  !stup.isnull1, tuplen);
 }
 
 /*
@@ -770,6 +791,7 @@ tuplesort_putbrintuple(Tuplesortstate *state, BrinTuple *tuple, Size size)
 	BrinSortTuple *bstup;
 	TuplesortPublic *base = TuplesortstateGetPublic(state);
 	MemoryContext oldcontext = MemoryContextSwitchTo(base->tuplecontext);
+	Size		tuplen;
 
 	/* allocate space for the whole BRIN sort tuple */
 	bstup = palloc(BRINSORTTUPLE_SIZE(size));
@@ -781,10 +803,16 @@ tuplesort_putbrintuple(Tuplesortstate *state, BrinTuple *tuple, Size size)
 	stup.datum1 = tuple->bt_blkno;
 	stup.isnull1 = false;
 
+	/* GetMemoryChunkSpace is not supported for bump contexts */
+	if (TupleSortUseBumpTupleCxt(base->sortopt))
+		tuplen = MAXALIGN(BRINSORTTUPLE_SIZE(size));
+	else
+		tuplen = GetMemoryChunkSpace(bstup);
+
 	tuplesort_puttuple_common(state, &stup,
 							  base->sortKeys &&
 							  base->sortKeys->abbrev_converter &&
-							  !stup.isnull1);
+							  !stup.isnull1, tuplen);
 
 	MemoryContextSwitchTo(oldcontext);
 }
@@ -833,7 +861,7 @@ tuplesort_putdatum(Tuplesortstate *state, Datum val, bool isNull)
 
 	tuplesort_puttuple_common(state, &stup,
 							  base->tuples &&
-							  base->sortKeys->abbrev_converter && !isNull);
+							  base->sortKeys->abbrev_converter && !isNull, 0);
 
 	MemoryContextSwitchTo(oldcontext);
 }
src/include/utils/tuplesort.h

@@ -98,6 +98,15 @@ typedef enum
 /* specifies if the tuplesort is able to support bounded sorts */
 #define TUPLESORT_ALLOWBOUNDED		(1 << 1)
 
+/*
+ * For bounded sort, tuples get pfree'd when they fall outside of the bound.
+ * When bounded sorts are not required, we can use a bump context for tuple
+ * allocation as there's no risk that pfree will ever be called for a tuple.
+ * Define a macro to make it easier for code to figure out if we're using a
+ * bump allocator.
+ */
+#define TupleSortUseBumpTupleCxt(opt)	(((opt) & TUPLESORT_ALLOWBOUNDED) == 0)
+
 typedef struct TuplesortInstrumentation
 {
 	TuplesortMethod sortMethod; /* sort algorithm used */
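A quick illustration (not from the patch) of how the new macro evaluates for the existing sortopt flags; TUPLESORT_NONE and TUPLESORT_RANDOMACCESS are defined nearby in tuplesort.h:

/* Illustrative only: bump is used whenever bounded sorts are not allowed */
Assert(TupleSortUseBumpTupleCxt(TUPLESORT_NONE));			/* bump context */
Assert(TupleSortUseBumpTupleCxt(TUPLESORT_RANDOMACCESS));	/* bump context */
Assert(!TupleSortUseBumpTupleCxt(TUPLESORT_ALLOWBOUNDED));	/* aset context */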
@@ -109,10 +118,11 @@ typedef struct TuplesortInstrumentation
  * The objects we actually sort are SortTuple structs.  These contain
  * a pointer to the tuple proper (might be a MinimalTuple or IndexTuple),
  * which is a separate palloc chunk --- we assume it is just one chunk and
- * can be freed by a simple pfree() (except during merge, when we use a
- * simple slab allocator).  SortTuples also contain the tuple's first key
- * column in Datum/nullflag format, and a source/input tape number that
- * tracks which tape each heap element/slot belongs to during merging.
+ * can be freed by a simple pfree() (except during merge, where we use a
+ * simple slab allocator, and during a non-bounded sort where we use a bump
+ * allocator).  SortTuples also contain the tuple's first key column in
+ * Datum/nullflag format, and a source/input tape number that tracks which
+ * tape each heap element/slot belongs to during merging.
  *
  * Storing the first key column lets us save heap_getattr or index_getattr
  * calls during tuple comparisons.  We could extract and save all the key
@@ -367,7 +377,8 @@ extern Tuplesortstate *tuplesort_begin_common(int workMem,
 extern void tuplesort_set_bound(Tuplesortstate *state, int64 bound);
 extern bool tuplesort_used_bound(Tuplesortstate *state);
 extern void tuplesort_puttuple_common(Tuplesortstate *state,
-									  SortTuple *tuple, bool useAbbrev);
+									  SortTuple *tuple, bool useAbbrev,
+									  Size tuplen);
 extern void tuplesort_performsort(Tuplesortstate *state);
 extern bool tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
 									  SortTuple *stup);