
When creating a large hash index, pre-sort the index entries by estimated
bucket number, so as to ensure locality of access to the index during the
insertion step.  Without this, building an index significantly larger than
available RAM takes a very long time because of thrashing.  On the other
hand, sorting is just useless overhead when the index does fit in RAM.
We choose to sort when the initial index size exceeds effective_cache_size.

This is a revised version of work by Tom Raney and Shreya Bhargava.
Author: Tom Lane
Date: 2008-03-16 23:15:08 +00:00
parent ec6550c6c0, commit 787eba734b
8 changed files with 324 additions and 39 deletions
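The policy stated in the commit message reduces to a single size comparison
made before scanning the heap. A minimal standalone sketch of that rule, with
every name hypothetical (the commit's actual hash-AM changes live in files not
excerpted below):

    static bool
    should_presort_hash_build(double est_index_pages, double cache_pages)
    {
        /*
         * cache_pages stands in for effective_cache_size: sort only when the
         * initial index is too big to stay cached, since random bucket access
         * would otherwise thrash; skip the sort overhead when it fits.
         */
        return est_index_pages > cache_pages;
    }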

src/backend/utils/sort/tuplesort.c

@@ -91,7 +91,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/sort/tuplesort.c,v 1.81 2008/01/01 19:45:55 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/sort/tuplesort.c,v 1.82 2008/03/16 23:15:08 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -100,6 +100,7 @@
 #include <limits.h>
 
+#include "access/hash.h"
 #include "access/heapam.h"
 #include "access/nbtree.h"
 #include "catalog/pg_amop.h"
@@ -336,12 +337,17 @@ struct Tuplesortstate
 	/*
 	 * These variables are specific to the IndexTuple case; they are set by
-	 * tuplesort_begin_index and used only by the IndexTuple routines.
+	 * tuplesort_begin_index_xxx and used only by the IndexTuple routines.
 	 */
-	Relation	indexRel;
+	Relation	indexRel;		/* index being built */
+
+	/* These are specific to the index_btree subcase: */
 	ScanKey		indexScanKey;
 	bool		enforceUnique;	/* complain if we find duplicate tuples */
 
+	/* These are specific to the index_hash subcase: */
+	uint32		hash_mask;		/* mask for sortable part of hash code */
+
 	/*
 	 * These variables are specific to the Datum case; they are set by
 	 * tuplesort_begin_datum and used only by the DatumTuple routines.
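The new hash_mask field is what makes the sort order match bucket order:
masking a hash code keeps only the bits that select the initially allocated
bucket. A toy standalone illustration, assuming a power-of-two bucket count
(not PostgreSQL code):

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        uint32_t    num_buckets = 1024;             /* assumed power of two */
        uint32_t    hash_mask = num_buckets - 1;    /* low-order bucket bits */
        uint32_t    hash = 0xDEADBEEFu;

        /* tuples sorted by (hash & hash_mask) come out in bucket order */
        printf("hash %08X -> bucket %u\n", hash, hash & hash_mask);
        return 0;
    }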
@@ -437,14 +443,17 @@ static void writetup_heap(Tuplesortstate *state, int tapenum,
 static void readtup_heap(Tuplesortstate *state, SortTuple *stup,
 			 int tapenum, unsigned int len);
 static void reversedirection_heap(Tuplesortstate *state);
-static int	comparetup_index(const SortTuple *a, const SortTuple *b,
-				 Tuplesortstate *state);
+static int	comparetup_index_btree(const SortTuple *a, const SortTuple *b,
+					   Tuplesortstate *state);
+static int	comparetup_index_hash(const SortTuple *a, const SortTuple *b,
+					  Tuplesortstate *state);
 static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup);
 static void writetup_index(Tuplesortstate *state, int tapenum,
 			   SortTuple *stup);
 static void readtup_index(Tuplesortstate *state, SortTuple *stup,
 			  int tapenum, unsigned int len);
-static void reversedirection_index(Tuplesortstate *state);
+static void reversedirection_index_btree(Tuplesortstate *state);
+static void reversedirection_index_hash(Tuplesortstate *state);
 static int	comparetup_datum(const SortTuple *a, const SortTuple *b,
 				  Tuplesortstate *state);
 static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup);
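The prototype changes show the shape of the extension: a new index-sort
variant supplies only its own compare and reverse-direction routines, while
copytup/writetup/readtup are shared through Tuplesortstate's function
pointers. A generic, compilable sketch of that dispatch style (all names
invented, not the PostgreSQL definitions):

    /* Per-variant method table, mirroring Tuplesortstate's hooks */
    typedef struct SortVariant
    {
        int     (*comparetup) (const void *a, const void *b, void *state);
        void    (*reversedirection) (void *state);
        /* copy/write/read hooks omitted; common to both subcases */
    } SortVariant;

    /* Stubs standing in for comparetup_index_btree/_hash and friends */
    static int
    cmp_stub(const void *a, const void *b, void *state)
    {
        (void) a; (void) b; (void) state;
        return 0;
    }

    static void
    rev_stub(void *state)
    {
        (void) state;
    }

    static const SortVariant btree_variant = {cmp_stub, rev_stub};
    static const SortVariant hash_variant = {cmp_stub, rev_stub};

    int
    main(void)
    {
        int         use_hash = 1;
        const SortVariant *v = use_hash ? &hash_variant : &btree_variant;

        v->reversedirection(0);     /* dispatches to the installed method */
        return v->comparetup(0, 0, 0);
    }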
@@ -606,9 +615,9 @@ tuplesort_begin_heap(TupleDesc tupDesc,
 }
 
 Tuplesortstate *
-tuplesort_begin_index(Relation indexRel,
-					  bool enforceUnique,
-					  int workMem, bool randomAccess)
+tuplesort_begin_index_btree(Relation indexRel,
+							bool enforceUnique,
+							int workMem, bool randomAccess)
 {
 	Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
 	MemoryContext oldcontext;
@@ -625,14 +634,13 @@ tuplesort_begin_index(Relation indexRel,
 	state->nKeys = RelationGetNumberOfAttributes(indexRel);
 
-	state->comparetup = comparetup_index;
+	state->comparetup = comparetup_index_btree;
 	state->copytup = copytup_index;
 	state->writetup = writetup_index;
 	state->readtup = readtup_index;
-	state->reversedirection = reversedirection_index;
+	state->reversedirection = reversedirection_index_btree;
 
 	state->indexRel = indexRel;
-	/* see comments below about btree dependence of this code... */
 	state->indexScanKey = _bt_mkscankey_nodata(indexRel);
 	state->enforceUnique = enforceUnique;
@@ -641,6 +649,40 @@
 	return state;
 }
 
+Tuplesortstate *
+tuplesort_begin_index_hash(Relation indexRel,
+						   uint32 hash_mask,
+						   int workMem, bool randomAccess)
+{
+	Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
+	MemoryContext oldcontext;
+
+	oldcontext = MemoryContextSwitchTo(state->sortcontext);
+
+#ifdef TRACE_SORT
+	if (trace_sort)
+		elog(LOG,
+			 "begin index sort: hash_mask = 0x%x, workMem = %d, randomAccess = %c",
+			 hash_mask,
+			 workMem, randomAccess ? 't' : 'f');
+#endif
+
+	state->nKeys = 1;			/* Only one sort column, the hash code */
+
+	state->comparetup = comparetup_index_hash;
+	state->copytup = copytup_index;
+	state->writetup = writetup_index;
+	state->readtup = readtup_index;
+	state->reversedirection = reversedirection_index_hash;
+
+	state->indexRel = indexRel;
+	state->hash_mask = hash_mask;
+
+	MemoryContextSwitchTo(oldcontext);
+
+	return state;
+}
+
 Tuplesortstate *
 tuplesort_begin_datum(Oid datumType,
 					  Oid sortOperator, bool nullsFirstFlag,
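The new entry point follows the same client protocol as the existing
tuplesort_begin_* routines. A sketch of the expected call sequence on the
hash build side (which lives elsewhere in this commit); the putter/getter
signatures here are assumed from the 8.3-era tuplesort API and the loop
bodies are elided:

    #include "postgres.h"
    #include "access/itup.h"
    #include "utils/rel.h"
    #include "utils/tuplesort.h"

    static void
    hash_spool_sketch(Relation index, uint32 hash_mask, int sortMem)
    {
        Tuplesortstate *sortstate;
        IndexTuple  itup;
        bool        should_free;

        /* randomAccess = false: the result is read once, in forward order */
        sortstate = tuplesort_begin_index_hash(index, hash_mask,
                                               sortMem, false);

        /* ... tuplesort_putindextuple(sortstate, itup) per heap tuple ... */

        tuplesort_performsort(sortstate);

        /* read back in (hash & hash_mask) order, i.e. bucket order */
        while ((itup = tuplesort_getindextuple(sortstate, true,
                                               &should_free)) != NULL)
        {
            /* ... _hash_doinsert()-style insertion of itup ... */
            if (should_free)
                pfree(itup);
        }

        tuplesort_end(sortstate);
    }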
@@ -2637,14 +2679,14 @@ reversedirection_heap(Tuplesortstate *state)
 /*
  * Routines specialized for IndexTuple case
  *
- * NOTE: actually, these are specialized for the btree case; it's not
- * clear whether you could use them for a non-btree index.  Possibly
- * you'd need to make another set of routines if you needed to sort
- * according to another kind of index.
+ * The btree and hash cases require separate comparison functions, but the
+ * IndexTuple representation is the same so the copy/write/read support
+ * functions can be shared.
  */
 
 static int
-comparetup_index(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
+comparetup_index_btree(const SortTuple *a, const SortTuple *b,
+					   Tuplesortstate *state)
 {
 	/*
 	 * This is similar to _bt_tuplecompare(), but we have already done the
@@ -2748,6 +2790,62 @@ comparetup_index(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
 	return 0;
 }
 
+static int
+comparetup_index_hash(const SortTuple *a, const SortTuple *b,
+					  Tuplesortstate *state)
+{
+	/*
+	 * It's slightly annoying to redo the hash function each time, although
+	 * most hash functions ought to be cheap.  Is it worth having a variant
+	 * tuple storage format so we can store the hash code?
+	 */
+	uint32		hash1;
+	uint32		hash2;
+	IndexTuple	tuple1;
+	IndexTuple	tuple2;
+
+	/* Allow interrupting long sorts */
+	CHECK_FOR_INTERRUPTS();
+
+	/* Compute hash codes and mask off bits we don't want to sort by */
+	Assert(!a->isnull1);
+	Assert(!b->isnull1);
+	hash1 = _hash_datum2hashkey(state->indexRel, a->datum1) & state->hash_mask;
+	hash2 = _hash_datum2hashkey(state->indexRel, b->datum1) & state->hash_mask;
+
+	if (hash1 > hash2)
+		return 1;
+	else if (hash1 < hash2)
+		return -1;
+
+	/*
+	 * If hash values are equal, we sort on ItemPointer.  This does not
+	 * affect validity of the finished index, but it offers cheap insurance
+	 * against performance problems with bad qsort implementations that have
+	 * trouble with large numbers of equal keys.
+	 */
+	tuple1 = (IndexTuple) a->tuple;
+	tuple2 = (IndexTuple) b->tuple;
+
+	{
+		BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid);
+		BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid);
+
+		if (blk1 != blk2)
+			return (blk1 < blk2) ? -1 : 1;
+	}
+	{
+		OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid);
+		OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid);
+
+		if (pos1 != pos2)
+			return (pos1 < pos2) ? -1 : 1;
+	}
+
+	return 0;
+}
+
 static void
 copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup)
 {
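The ItemPointer tie-break above never changes which bucket a tuple lands in;
it only guarantees a total order so no two entries compare equal. The same
pattern in a self-contained comparator (standalone C, not PostgreSQL code),
usable as qsort(array, n, sizeof(DemoTuple), demo_cmp):

    #include <stdint.h>
    #include <stdlib.h>

    typedef struct DemoTuple
    {
        uint32_t    hash;       /* stands in for the column's hash code */
        uint32_t    block;      /* stands in for ItemPointer block */
        uint16_t    offset;     /* stands in for ItemPointer offset */
    } DemoTuple;

    static int
    demo_cmp(const void *pa, const void *pb)
    {
        const DemoTuple *a = pa;
        const DemoTuple *b = pb;
        const uint32_t mask = 1024 - 1;     /* assumed 1024 initial buckets */
        uint32_t    h1 = a->hash & mask;
        uint32_t    h2 = b->hash & mask;

        if (h1 != h2)
            return (h1 < h2) ? -1 : 1;
        /* equal masked hashes: break ties on the unique physical position */
        if (a->block != b->block)
            return (a->block < b->block) ? -1 : 1;
        if (a->offset != b->offset)
            return (a->offset < b->offset) ? -1 : 1;
        return 0;
    }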
@@ -2810,7 +2908,7 @@ readtup_index(Tuplesortstate *state, SortTuple *stup,
 }
 
 static void
-reversedirection_index(Tuplesortstate *state)
+reversedirection_index_btree(Tuplesortstate *state)
 {
 	ScanKey		scanKey = state->indexScanKey;
 	int			nkey;
@@ -2821,6 +2919,13 @@ reversedirection_index(Tuplesortstate *state)
 	}
 }
 
+static void
+reversedirection_index_hash(Tuplesortstate *state)
+{
+	/* We don't support reversing direction in a hash index sort */
+	elog(ERROR, "reversedirection_index_hash is not implemented");
+}
+
 /*
  * Routines specialized for DatumTuple case