Simplify tape block format.

No more indirect blocks. The blocks form a linked list instead. This saves some memory, because we don't need to have a buffer in memory to hold the indirect block (or blocks). To reflect that, TAPE_BUFFER_OVERHEAD is reduced from 3 to 1 buffer, which allows using more memory for building the initial runs. Reviewed by Peter Geoghegan and Robert Haas. Discussion: https://www.postgresql.org/message-id/34678beb-938e-646e-db9f-a7def5c44ada%40iki.fi
2025-11-10 17:42:29 +03:00 · 2016-12-22 18:45:00 +02:00
parent b86515da1a
commit 01ec25631f
3 changed files with 218 additions and 478 deletions
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -240,16 +240,16 @@ typedef enum
 * Parameters for calculation of number of tapes to use --- see inittapes()
 * and tuplesort_merge_order().
 *
- * In this calculation we assume that each tape will cost us about 3 blocks
- * worth of buffer space (which is an underestimate for very large data
- * volumes, but it's probably close enough --- see logtape.c).
+ * In this calculation we assume that each tape will cost us about 1 block
+ * worth of buffer space.  This ignores the overhead of all the other data
+ * structures needed for each tape, but it's probably close enough.
 *
 * MERGE_BUFFER_SIZE is how much data we'd like to read from each input
 * tape during a preread cycle (see discussion at top of file).
 */
 #define MINORDER		6		/* minimum merge order */
 #define MAXORDER		500		/* maximum merge order */
-#define TAPE_BUFFER_OVERHEAD		(BLCKSZ * 3)
+#define TAPE_BUFFER_OVERHEAD		BLCKSZ
 #define MERGE_BUFFER_SIZE			(BLCKSZ * 32)

 /*
@@ -1849,6 +1849,7 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
 						  SortTuple *stup)
 {
 	unsigned int tuplen;
+	size_t		nmoved;

 	switch (state->status)
 	{
@@ -1948,10 +1949,13 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
 				 * end of file; back up to fetch last tuple's ending length
 				 * word.  If seek fails we must have a completely empty file.
 				 */
-				if (!LogicalTapeBackspace(state->tapeset,
-										  state->result_tape,
-										  2 * sizeof(unsigned int)))
+				nmoved = LogicalTapeBackspace(state->tapeset,
+											  state->result_tape,
+											  2 * sizeof(unsigned int));
+				if (nmoved == 0)
 					return false;
+				else if (nmoved != 2 * sizeof(unsigned int))
+					elog(ERROR, "unexpected tape position");
 				state->eof_reached = false;
 			}
 			else
@@ -1960,31 +1964,34 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
 				 * Back up and fetch previously-returned tuple's ending length
 				 * word.  If seek fails, assume we are at start of file.
 				 */
-				if (!LogicalTapeBackspace(state->tapeset,
-										  state->result_tape,
-										  sizeof(unsigned int)))
+				nmoved = LogicalTapeBackspace(state->tapeset,
+											  state->result_tape,
+											  sizeof(unsigned int));
+				if (nmoved == 0)
 					return false;
+				else if (nmoved != sizeof(unsigned int))
+					elog(ERROR, "unexpected tape position");
 				tuplen = getlen(state, state->result_tape, false);

 				/*
 				 * Back up to get ending length word of tuple before it.
 				 */
-				if (!LogicalTapeBackspace(state->tapeset,
-										  state->result_tape,
-										  tuplen + 2 * sizeof(unsigned int)))
+				nmoved = LogicalTapeBackspace(state->tapeset,
+											  state->result_tape,
+										  tuplen + 2 * sizeof(unsigned int));
+				if (nmoved == tuplen + sizeof(unsigned int))
 				{
 					/*
-					 * If that fails, presumably the prev tuple is the first
-					 * in the file.  Back up so that it becomes next to read
-					 * in forward direction (not obviously right, but that is
-					 * what in-memory case does).
+					 * We backed up over the previous tuple, but there was no
+					 * ending length word before it.  That means that the prev
+					 * tuple is the first tuple in the file.  It is now the
+					 * next to read in forward direction (not obviously right,
+					 * but that is what in-memory case does).
 					 */
-					if (!LogicalTapeBackspace(state->tapeset,
-											  state->result_tape,
-											  tuplen + sizeof(unsigned int)))
-						elog(ERROR, "bogus tuple length in backward scan");
 					return false;
 				}
+				else if (nmoved != tuplen + 2 * sizeof(unsigned int))
+					elog(ERROR, "bogus tuple length in backward scan");
 			}

 			tuplen = getlen(state, state->result_tape, false);
@@ -1994,9 +2001,10 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
 			 * Note: READTUP expects we are positioned after the initial
 			 * length word of the tuple, so back up to that point.
 			 */
-			if (!LogicalTapeBackspace(state->tapeset,
-									  state->result_tape,
-									  tuplen))
+			nmoved = LogicalTapeBackspace(state->tapeset,
+										  state->result_tape,
+										  tuplen);
+			if (nmoved != tuplen)
 				elog(ERROR, "bogus tuple length in backward scan");
 			READTUP(state, stup, state->result_tape, tuplen);

@@ -3183,11 +3191,10 @@ tuplesort_restorepos(Tuplesortstate *state)
 			state->eof_reached = state->markpos_eof;
 			break;
 		case TSS_SORTEDONTAPE:
-			if (!LogicalTapeSeek(state->tapeset,
-								 state->result_tape,
-								 state->markpos_block,
-								 state->markpos_offset))
-				elog(ERROR, "tuplesort_restorepos failed");
+			LogicalTapeSeek(state->tapeset,
+							state->result_tape,
+							state->markpos_block,
+							state->markpos_offset);
 			state->eof_reached = state->markpos_eof;
 			break;
 		default: