When creating a large hash index, pre-sort the index entries by estimated
bucket number, so as to ensure locality of access to the index during the
insertion step. Without this, building an index significantly larger than
available RAM takes a very long time because of thrashing. On the other
hand, sorting is just useless overhead when the index does fit in RAM.
We choose to sort when the initial index size exceeds effective_cache_size.

This is a revised version of work by Tom Raney and Shreya Bhargava.
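The pre-sort works because a tuple's eventual bucket is predictable from its hash code alone: the initial bucket count is a power of two, so the low-order bits of the hash code identify the bucket, and sorting on the masked hash code visits buckets in ascending order. A minimal standalone sketch of that estimate (the helper names here are illustrative; in the patch itself the mask is built with _hash_log2 inside _h_spoolinit):

	#include <stdint.h>

	/* Illustrative stand-in for _hash_log2: the exponent of the smallest
	 * power of two >= n. */
	static uint32_t
	ceil_log2(uint32_t n)
	{
		uint32_t	i = 0;
		uint32_t	limit;

		for (limit = 1; limit < n; limit <<= 1)
			i++;
		return i;
	}

	/* Estimated bucket for a hash code: keep only the low-order bits that
	 * can address one of num_buckets initial buckets, the same hash_mask
	 * computation _h_spoolinit performs below. */
	static uint32_t
	estimated_bucket(uint32_t hashcode, uint32_t num_buckets)
	{
		uint32_t	hash_mask = (((uint32_t) 1) << ceil_log2(num_buckets)) - 1;

		return hashcode & hash_mask;
	}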
src/backend/access/hash/Makefile

@@ -4,7 +4,7 @@
 # Makefile for access/hash
 #
 # IDENTIFICATION
-#    $PostgreSQL: pgsql/src/backend/access/hash/Makefile,v 1.14 2008/02/19 10:30:06 petere Exp $
+#    $PostgreSQL: pgsql/src/backend/access/hash/Makefile,v 1.15 2008/03/16 23:15:08 tgl Exp $
 #
 #-------------------------------------------------------------------------
@@ -13,6 +13,6 @@ top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
 OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashscan.o \
-	hashsearch.o hashutil.o
+	hashsearch.o hashsort.o hashutil.o
 
 include $(top_srcdir)/src/backend/common.mk
src/backend/access/hash/hash.c

@@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
-*	  $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.99 2008/03/15 20:46:31 tgl Exp $
+*	  $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.100 2008/03/16 23:15:08 tgl Exp $
 *
 * NOTES
 *	  This file contains only the public interface routines.
@@ -22,13 +22,15 @@
 #include "access/hash.h"
 #include "catalog/index.h"
 #include "commands/vacuum.h"
+#include "optimizer/cost.h"
 #include "optimizer/plancat.h"
 
 
 /* Working state for hashbuild and its callback */
 typedef struct
 {
-	double		indtuples;
+	HSpool	   *spool;			/* NULL if not using spooling */
+	double		indtuples;		/* # tuples accepted into index */
 } HashBuildState;
 
 static void hashbuildCallback(Relation index,
@@ -51,6 +53,7 @@ hashbuild(PG_FUNCTION_ARGS)
 	IndexBuildResult *result;
 	BlockNumber relpages;
 	double		reltuples;
+	uint32		num_buckets;
 	HashBuildState buildstate;
 
 	/*
@@ -61,19 +64,43 @@ hashbuild(PG_FUNCTION_ARGS)
 		elog(ERROR, "index \"%s\" already contains data",
 			 RelationGetRelationName(index));
 
-	/* estimate the number of rows currently present in the table */
+	/* Estimate the number of rows currently present in the table */
 	estimate_rel_size(heap, NULL, &relpages, &reltuples);
 
-	/* initialize the hash index metadata page and initial buckets */
-	_hash_metapinit(index, reltuples);
+	/* Initialize the hash index metadata page and initial buckets */
+	num_buckets = _hash_metapinit(index, reltuples);
 
-	/* build the index */
+	/*
+	 * If we just insert the tuples into the index in scan order, then
+	 * (assuming their hash codes are pretty random) there will be no locality
+	 * of access to the index, and if the index is bigger than available RAM
+	 * then we'll thrash horribly.  To prevent that scenario, we can sort the
+	 * tuples by (expected) bucket number.  However, such a sort is useless
+	 * overhead when the index does fit in RAM.  We choose to sort if the
+	 * initial index size exceeds effective_cache_size.
+	 *
+	 * NOTE: this test will need adjustment if a bucket is ever different
+	 * from one page.
+	 */
+	if (num_buckets >= (uint32) effective_cache_size)
+		buildstate.spool = _h_spoolinit(index, num_buckets);
+	else
+		buildstate.spool = NULL;
+
+	/* prepare to build the index */
 	buildstate.indtuples = 0;
 
 	/* do the heap scan */
 	reltuples = IndexBuildHeapScan(heap, index, indexInfo,
 								   hashbuildCallback, (void *) &buildstate);
 
+	if (buildstate.spool)
+	{
+		/* sort the tuples and insert them into the index */
+		_h_indexbuild(buildstate.spool);
+		_h_spooldestroy(buildstate.spool);
+	}
+
 	/*
 	 * Return statistics
 	 */
@@ -110,7 +137,11 @@ hashbuildCallback(Relation index,
 		return;
 	}
 
-	_hash_doinsert(index, itup);
+	/* Either spool the tuple for sorting, or just put it into the index */
+	if (buildstate->spool)
+		_h_spool(itup, buildstate->spool);
+	else
+		_hash_doinsert(index, itup);
 
 	buildstate->indtuples += 1;
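To see why bucket-ordered insertion matters, consider how often consecutive insertions touch a different bucket page; once the index exceeds RAM, each switch is a likely cache or disk miss. This toy program (plain C, not PostgreSQL code; the constants are arbitrary) sorts random hash codes by masked bucket number the way the spool does, and counts the switches:

	#include <stdio.h>
	#include <stdint.h>
	#include <stdlib.h>

	#define NTUPLES		100000
	#define NBUCKETS	4096		/* a power of two, as in the patch */

	static uint32_t mask = NBUCKETS - 1;

	/* Order tuples by estimated (masked) bucket number, the same sort
	 * key the patch feeds to tuplesort. */
	static int
	cmp_bucket(const void *a, const void *b)
	{
		uint32_t	ba = *(const uint32_t *) a & mask;
		uint32_t	bb = *(const uint32_t *) b & mask;

		return (ba > bb) - (ba < bb);
	}

	/* Count how often consecutive insertions land in different buckets. */
	static long
	bucket_switches(const uint32_t *h, int n)
	{
		long		s = 0;

		for (int i = 1; i < n; i++)
			if ((h[i] & mask) != (h[i - 1] & mask))
				s++;
		return s;
	}

	int
	main(void)
	{
		uint32_t   *h = malloc(NTUPLES * sizeof(uint32_t));

		for (int i = 0; i < NTUPLES; i++)
			h[i] = (uint32_t) rand();	/* stand-in for random hash codes */

		printf("heap-scan order: %ld bucket switches\n",
			   bucket_switches(h, NTUPLES));

		qsort(h, NTUPLES, sizeof(uint32_t), cmp_bucket);
		printf("bucket-sorted:   %ld bucket switches\n",
			   bucket_switches(h, NTUPLES));

		free(h);
		return 0;
	}

In scan order nearly every insertion changes bucket (roughly NTUPLES * (NBUCKETS - 1) / NBUCKETS switches); after sorting there can be at most NBUCKETS - 1.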
src/backend/access/hash/hashpage.c

@@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
-*	  $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.73 2008/03/15 20:46:31 tgl Exp $
+*	  $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.74 2008/03/16 23:15:08 tgl Exp $
 *
 * NOTES
 *	  Postgres hash pages look like ordinary relation pages.  The opaque
@@ -315,13 +315,14 @@ _hash_chgbufaccess(Relation rel,
 *		the initial buckets, and the initial bitmap page.
 *
 * The initial number of buckets is dependent on num_tuples, an estimate
- * of the number of tuples to be loaded into the index initially.
+ * of the number of tuples to be loaded into the index initially.  The
+ * chosen number of buckets is returned.
 *
 * We are fairly cavalier about locking here, since we know that no one else
 * could be accessing this index.  In particular the rule about not holding
 * multiple buffer locks is ignored.
 */
-void
+uint32
 _hash_metapinit(Relation rel, double num_tuples)
 {
 	HashMetaPage metap;
@@ -437,11 +438,22 @@ _hash_metapinit(Relation rel, double num_tuples)
 	metap->hashm_ovflpoint = log2_num_buckets;
 	metap->hashm_firstfree = 0;
 
+	/*
+	 * Release buffer lock on the metapage while we initialize buckets.
+	 * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS
+	 * won't accomplish anything.  It's a bad idea to hold buffer locks for
+	 * long intervals in any case, since that can block the bgwriter.
+	 */
+	_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
+
 	/*
 	 * Initialize the first N buckets
 	 */
 	for (i = 0; i < num_buckets; i++)
 	{
+		/* Allow interrupts, in case N is huge */
+		CHECK_FOR_INTERRUPTS();
+
 		buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i));
 		pg = BufferGetPage(buf);
 		pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
@@ -453,6 +465,9 @@ _hash_metapinit(Relation rel, double num_tuples)
 		_hash_wrtbuf(rel, buf);
 	}
 
+	/* Now reacquire buffer lock on metapage */
+	_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
+
 	/*
 	 * Initialize first bitmap page
 	 */
@@ -460,6 +475,8 @@ _hash_metapinit(Relation rel, double num_tuples)
 
 	/* all done */
 	_hash_wrtbuf(rel, metabuf);
+
+	return num_buckets;
 }
 
 /*
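Two details about the test in hashbuild are worth spelling out. First, effective_cache_size is measured in pages and each initial bucket is exactly one page, so num_buckets >= effective_cache_size compares like with like; that is also why the patch adds the optimizer/cost.h include, which declares that GUC. Second, the returned bucket count is derived from the tuple estimate. Roughly, as a simplified standalone sketch (not the verbatim _hash_metapinit logic; ffactor stands in for its fill-factor-derived tuples-per-bucket target):

	#include <stdint.h>

	/* Simplified sketch of the bucket-count choice: enough buckets for
	 * the estimated tuples at the fill target, rounded up to a power of
	 * two, with a floor of 2 and a cap well inside the uint32 range. */
	static uint32_t
	choose_num_buckets(double num_tuples, double ffactor)
	{
		double		dnumbuckets = num_tuples / ffactor;
		uint32_t	num_buckets = 2;	/* always at least two buckets */

		while ((double) num_buckets < dnumbuckets &&
			   num_buckets < ((uint32_t) 1 << 30))
			num_buckets <<= 1;			/* round up to a power of two */

		return num_buckets;
	}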
src/backend/access/hash/hashsort.c (new file, 116 lines)

@@ -0,0 +1,116 @@
/*-------------------------------------------------------------------------
 *
 * hashsort.c
 *		Sort tuples for insertion into a new hash index.
 *
 * When building a very large hash index, we pre-sort the tuples by bucket
 * number to improve locality of access to the index, and thereby avoid
 * thrashing.  We use tuplesort.c to sort the given index tuples into order.
 *
 * Note: if the number of rows in the table has been underestimated,
 * bucket splits may occur during the index build.  In that case we'd
 * be inserting into two or more buckets for each possible masked-off
 * hash code value.  That's no big problem though, since we'll still have
 * plenty of locality of access.
 *
 *
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *		$PostgreSQL: pgsql/src/backend/access/hash/hashsort.c,v 1.1 2008/03/16 23:15:08 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/hash.h"
#include "miscadmin.h"
#include "utils/tuplesort.h"


/*
 * Status record for spooling/sorting phase.
 */
struct HSpool
{
	Tuplesortstate *sortstate;	/* state data for tuplesort.c */
	Relation	index;
};


/*
 * create and initialize a spool structure
 */
HSpool *
_h_spoolinit(Relation index, uint32 num_buckets)
{
	HSpool	   *hspool = (HSpool *) palloc0(sizeof(HSpool));
	uint32		hash_mask;

	hspool->index = index;

	/*
	 * Determine the bitmask for hash code values.  Since there are currently
	 * num_buckets buckets in the index, the appropriate mask can be computed
	 * as follows.
	 *
	 * Note: at present, the passed-in num_buckets is always a power of 2,
	 * so we could just compute num_buckets - 1.  We prefer not to assume
	 * that here, though.
	 */
	hash_mask = (((uint32) 1) << _hash_log2(num_buckets)) - 1;

	/*
	 * We size the sort area as maintenance_work_mem rather than work_mem to
	 * speed index creation.  This should be OK since a single backend can't
	 * run multiple index creations in parallel.
	 */
	hspool->sortstate = tuplesort_begin_index_hash(index,
												   hash_mask,
												   maintenance_work_mem,
												   false);

	return hspool;
}

/*
 * clean up a spool structure and its substructures.
 */
void
_h_spooldestroy(HSpool *hspool)
{
	tuplesort_end(hspool->sortstate);
	pfree(hspool);
}

/*
 * spool an index entry into the sort file.
 */
void
_h_spool(IndexTuple itup, HSpool *hspool)
{
	tuplesort_putindextuple(hspool->sortstate, itup);
}

/*
 * given a spool loaded by successive calls to _h_spool,
 * create an entire index.
 */
void
_h_indexbuild(HSpool *hspool)
{
	IndexTuple	itup;
	bool		should_free;

	tuplesort_performsort(hspool->sortstate);

	while ((itup = tuplesort_getindextuple(hspool->sortstate,
										   true, &should_free)) != NULL)
	{
		_hash_doinsert(hspool->index, itup);
		if (should_free)
			pfree(itup);
	}
}
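The "Note" in the file header deserves a concrete example. Sorting uses the mask for the initial bucket count, so if the tuple estimate was low and the index doubles during the build, each sort-equal group of tuples maps to exactly two real buckets, b and b + num_buckets, which still gives contiguous runs rather than random scatter. A standalone illustration (not PostgreSQL code; the masks and hash values are made up):

	#include <stdio.h>
	#include <stdint.h>

	int
	main(void)
	{
		uint32_t	old_mask = 4 - 1;	/* sort assumed 4 buckets */
		uint32_t	new_mask = 8 - 1;	/* ...but the index grew to 8 */
		uint32_t	hashes[] = {0x11, 0x15, 0x21, 0x25};	/* all bucket 1 under old_mask */

		for (int i = 0; i < 4; i++)
			printf("hash 0x%02x: sort key %u, final bucket %u\n",
				   hashes[i], hashes[i] & old_mask, hashes[i] & new_mask);
		/* The sort-key-1 group lands only in buckets 1 and 5 (1 + 4). */
		return 0;
	}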