
Preliminary cleanup for hash index code (doesn't attack the locking problem

yet).  Fix a couple of bugs that would only appear if multiple bitmap pages
are used, including a buffer reference leak and incorrect computation of bit
indexes.  Get rid of 'overflow address' concept, which accomplished nothing
except obfuscating the code and creating a risk of failure due to limited
range of offset field.  Rename some misleadingly-named fields and routines,
and improve documentation.
Tom Lane
2003-09-01 20:26:34 +00:00
parent eaeb8621f8
commit 65c2d427fb
4 changed files with 374 additions and 460 deletions
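
The "overflow address" being removed packed a splitpoint number and a page offset into one 16-bit field (see the SPLITSHIFT/SPLITMASK macros removed below). A minimal standalone C sketch of that encoding, using hypothetical values, shows the limited offset range the message refers to:

    #include <stdint.h>
    #include <stdio.h>

    /* Old-style packed overflow address: upper 5 bits = splitpoint,
     * lower 11 bits = page offset within that splitpoint (offsets start at 1). */
    #define SPLITSHIFT 11
    #define SPLITMASK  0x7FF
    #define OADDR_OF(S,O) ((uint16_t)(((uint32_t)(S) << SPLITSHIFT) + (O)))
    #define SPLITNUM(N)   ((uint32_t)(N) >> SPLITSHIFT)
    #define OPAGENUM(N)   ((N) & SPLITMASK)

    int main(void)
    {
        /* Hypothetical address: 42nd overflow page of splitpoint 3. */
        uint16_t addr = OADDR_OF(3, 42);

        printf("split=%u page=%u\n",
               (unsigned) SPLITNUM(addr), (unsigned) OPAGENUM(addr));
        /* Only 11 bits remain for the offset, so one splitpoint can address
         * at most 2047 overflow pages -- the "limited range of offset field"
         * that motivated removing this encoding. */
        return 0;
    }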

src/include/access/hash.h

@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $Id: hash.h,v 1.49 2003/08/04 02:40:10 momjian Exp $
+ * $Id: hash.h,v 1.50 2003/09/01 20:26:34 tgl Exp $
*
* NOTES
* modeled after Margo Seltzer's hash implementation for unix.
@@ -24,43 +24,18 @@
#include "fmgr.h"
/*
- * An overflow page is a spare page allocated for storing data whose
- * bucket doesn't have room to store it. We use overflow pages rather
- * than just splitting the bucket because there is a linear order in
- * the way we split buckets. In other words, if there isn't enough space
- * in the bucket itself, put it in an overflow page.
- *
- * Overflow page addresses are stored in form: (Splitnumber, Page offset).
- *
- * A splitnumber is the number of the generation where the table doubles
- * in size. The ovflpage's offset within the splitnumber; offsets start
- * at 1.
- *
- * We convert the stored bitmap address into a page address with the
- * macro OADDR_OF(S, O) where S is the splitnumber and O is the page
- * offset.
+ * Mapping from hash bucket number to physical block number of bucket's
+ * starting page. Beware of multiple evaluations of argument! Also notice
+ * macro's implicit dependency on "metap".
*/
typedef uint32 Bucket;
- typedef bits16 OverflowPageAddress;
- typedef uint32 SplitNumber;
- typedef uint32 PageOffset;
- /* A valid overflow address will always have a page offset >= 1 */
- #define InvalidOvflAddress 0
- #define SPLITSHIFT 11
- #define SPLITMASK 0x7FF
- #define SPLITNUM(N) ((SplitNumber)(((uint32)(N)) >> SPLITSHIFT))
- #define OPAGENUM(N) ((PageOffset)((N) & SPLITMASK))
- #define OADDR_OF(S,O) ((OverflowPageAddress)((uint32)((uint32)(S) << SPLITSHIFT) + (O)))
#define BUCKET_TO_BLKNO(B) \
- ((Bucket) ((B) + ((B) ? metap->hashm_spares[_hash_log2((B)+1)-1] : 0)) + 1)
- #define OADDR_TO_BLKNO(B) \
- ((BlockNumber) \
- (BUCKET_TO_BLKNO ( (1 << SPLITNUM((B))) -1 ) + OPAGENUM((B))));
+ ((BlockNumber) ((B) + ((B) ? metap->hashm_spares[_hash_log2((B)+1)-1] : 0)) + 1)
/*
* Special space for hash index pages.
*
* hasho_flag tells us which type of page we're looking at. For
* example, knowing overflow pages from bucket pages is necessary
* information when you're deleting tuples from a page. If all the
@@ -69,7 +44,6 @@ typedef uint32 PageOffset;
* the tuples are deleted from a bucket page, no additional action is
* necessary.
*/
#define LH_UNUSED_PAGE (0)
#define LH_OVERFLOW_PAGE (1 << 0)
#define LH_BUCKET_PAGE (1 << 1)
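
The BUCKET_TO_BLKNO macro kept above computes a bucket's first block as the bucket number plus all overflow pages allocated before its splitpoint, plus one for the metapage at block 0. A self-contained sketch of that arithmetic, assuming _hash_log2(n) is the smallest i with (1 << i) >= n (as its use in the macro implies) and using a made-up spares[] array:

    #include <stdint.h>
    #include <stdio.h>

    /* Ceiling log2, mirroring _hash_log2: smallest i with (1 << i) >= num. */
    static uint32_t hash_log2(uint32_t num)
    {
        uint32_t i, limit;

        for (i = 0, limit = 1; limit < num; i++, limit <<= 1)
            ;
        return i;
    }

    int main(void)
    {
        /* Hypothetical cumulative counts: 0 ovflpages before splitpoint 1,
         * 2 before splitpoint 2, 5 before splitpoint 3. */
        uint32_t spares[32] = {0, 2, 5};

        for (uint32_t bucket = 0; bucket < 4; bucket++)
        {
            /* Mirrors BUCKET_TO_BLKNO: the metapage is block 0, hence the +1. */
            uint32_t blkno = bucket +
                (bucket ? spares[hash_log2(bucket + 1) - 1] : 0) + 1;

            printf("bucket %u starts at block %u\n",
                   (unsigned) bucket, (unsigned) blkno);
        }
        return 0;
    }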
@@ -78,9 +52,9 @@ typedef uint32 PageOffset;
typedef struct HashPageOpaqueData
{
- bits16 hasho_flag; /* is this page a bucket or ovfl */
+ bits16 hasho_flag; /* page type code, see above */
Bucket hasho_bucket; /* bucket number this pg belongs to */
- OverflowPageAddress hasho_oaddr; /* ovfl address of this ovfl pg */
+ bits16 hasho_oaddr; /* no longer used; delete someday */
BlockNumber hasho_nextblkno; /* next ovfl blkno */
BlockNumber hasho_prevblkno; /* previous ovfl (or bucket) blkno */
} HashPageOpaqueData;
@@ -91,10 +65,8 @@ typedef HashPageOpaqueData *HashPageOpaque;
* ScanOpaqueData is used to remember which buffers we're currently
* examining in the scan. We keep these buffers locked and pinned and
* recorded in the opaque entry of the scan in order to avoid doing a
- * ReadBuffer() for every tuple in the index. This avoids semop() calls,
- * which are expensive.
+ * ReadBuffer() for every tuple in the index.
*/
typedef struct HashScanOpaqueData
{
Buffer hashso_curbuf;
@@ -113,60 +85,55 @@ typedef HashScanOpaqueData *HashScanOpaque;
#define HASH_VERSION 0
/*
- * NCACHED is used to set the array sizeof spares[] & bitmaps[].
+ * Spares[] holds the number of overflow pages currently allocated at or
+ * before a certain splitpoint. For example, if spares[3] = 7 then there are
+ * 7 ovflpages before splitpoint 3 (compare BUCKET_TO_BLKNO macro). The
+ * value in spares[ovflpoint] increases as overflow pages are added at the
+ * end of the index. Once ovflpoint increases (ie, we have actually allocated
+ * the bucket pages belonging to that splitpoint) the number of spares at the
+ * prior splitpoint cannot change anymore.
*
- * Spares[] is used to hold the number overflow pages currently
- * allocated at a certain splitpoint. For example, if spares[3] = 7
- * then there are a maximum of 7 ovflpages available at splitpoint 3.
- * The value in spares[] will change as ovflpages are added within
- * a splitpoint.
+ * ovflpages that have been recycled for reuse can be found by looking at
+ * bitmaps that are stored within ovflpages dedicated for the purpose.
+ * The blknos of these bitmap pages are kept in bitmaps[]; nmaps is the
+ * number of currently existing bitmaps.
*
- * Within a splitpoint, one can find which ovflpages are available and
- * which are used by looking at a bitmaps that are stored on the ovfl
- * pages themselves. There is at least one bitmap for every splitpoint's
- * ovflpages. Bitmaps[] contains the ovflpage addresses of the ovflpages
- * that hold the ovflpage bitmaps.
*
- * The reason that the size is restricted to NCACHED (32) is because
- * the bitmaps are 16 bits: upper 5 represent the splitpoint, lower 11
- * indicate the page number within the splitpoint. Since there are
- * only 5 bits to store the splitpoint, there can only be 32 splitpoints.
- * Both spares[] and bitmaps[] use splitpoints as there indices, so there
- * can only be 32 of them.
+ * The limitation on the size of spares[] comes from the fact that there's
+ * no point in having more than 2^32 buckets with only uint32 hashcodes.
+ * There is no particularly good reason for bitmaps[] to be the same size,
+ * but we're stuck with that until we want to force an initdb. (With 8K
+ * block size, 32 bitmaps limit us to 8 Gb of overflow space...)
*/
- #define NCACHED 32
+ #define HASH_MAX_SPLITPOINTS 32
+ #define HASH_MAX_BITMAPS 32
typedef struct HashMetaPageData
{
PageHeaderData hashm_phdr; /* pad for page header (do not use) */
uint32 hashm_magic; /* magic no. for hash tables */
uint32 hashm_version; /* version ID */
- uint32 hashm_nkeys; /* number of keys stored in the table */
- uint16 hashm_ffactor; /* fill factor */
- uint16 hashm_bsize; /* bucket size (bytes) - must be a power
+ uint32 hashm_ntuples; /* number of tuples stored in the table */
+ uint16 hashm_ffactor; /* target fill factor (tuples/bucket) */
+ uint16 hashm_bsize; /* index page size (bytes) - must be a power
* of 2 */
- uint16 hashm_bshift; /* bucket shift */
- uint16 hashm_bmsize; /* bitmap array size (bytes) - must be a
- * power of 2 */
+ uint16 hashm_bshift; /* log2(bsize) */
+ uint16 hashm_bmsize; /* bitmap array size (bytes) - must be
+ * exactly half of hashm_bsize */
uint32 hashm_maxbucket; /* ID of maximum bucket in use */
uint32 hashm_highmask; /* mask to modulo into entire table */
uint32 hashm_lowmask; /* mask to modulo into lower half of table */
- uint32 hashm_ovflpoint;/* pageno. from which ovflpgs being
+ uint32 hashm_ovflpoint;/* splitpoint from which ovflpgs being
* allocated */
- uint32 hashm_lastfreed; /* last ovflpage freed */
- uint32 hashm_nmaps; /* Initial number of bitmaps */
- uint32 hashm_spares[NCACHED]; /* spare pages available at
- * splitpoints */
- BlockNumber hashm_mapp[NCACHED]; /* blknumbers of ovfl page maps */
+ uint32 hashm_firstfree; /* lowest-number free ovflpage (bit#) */
+ uint32 hashm_nmaps; /* number of bitmap pages */
+ uint32 hashm_spares[HASH_MAX_SPLITPOINTS]; /* spare pages before
+ * each splitpoint */
+ BlockNumber hashm_mapp[HASH_MAX_BITMAPS]; /* blknos of ovfl bitmaps */
RegProcedure hashm_procid; /* hash procedure id from pg_proc */
} HashMetaPageData;
typedef HashMetaPageData *HashMetaPage;
- extern bool BuildingHash;
typedef struct HashItemData
{
IndexTupleData hash_itup;
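
The reworded spares[] comment above makes the counts cumulative: spares[i] is the total number of overflow pages allocated at or before splitpoint i, and entries below hashm_ovflpoint are frozen. A toy sketch of that bookkeeping (hypothetical helper functions, not the actual PostgreSQL routines):

    #include <stdint.h>
    #include <stdio.h>

    #define HASH_MAX_SPLITPOINTS 32

    /* Cumulative counts, per the comment: spares[i] = number of ovflpages
     * allocated at or before splitpoint i. */
    static uint32_t spares[HASH_MAX_SPLITPOINTS];
    static uint32_t ovflpoint = 0;  /* splitpoint currently receiving ovflpages */

    /* Record one more overflow page at the current splitpoint. */
    static void add_ovflpage(void)
    {
        spares[ovflpoint]++;
    }

    /* Advance to the next splitpoint; earlier entries are now frozen. */
    static void advance_splitpoint(void)
    {
        ovflpoint++;
        spares[ovflpoint] = spares[ovflpoint - 1];  /* carry the running total */
    }

    int main(void)
    {
        add_ovflpage();             /* spares[0] = 1 */
        advance_splitpoint();
        add_ovflpage();
        add_ovflpage();             /* spares[1] = 3 */
        printf("spares[0]=%u spares[1]=%u\n",
               (unsigned) spares[0], (unsigned) spares[1]);
        return 0;
    }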
@@ -178,31 +145,33 @@ typedef HashItemData *HashItem;
* Constants
*/
#define DEFAULT_FFACTOR 300
- #define SPLITMAX 8
#define BYTE_TO_BIT 3 /* 2^3 bits/byte */
#define INT_TO_BYTE 2 /* 2^2 bytes/int */
#define INT_TO_BIT 5 /* 2^5 bits/int */
#define ALL_SET ((uint32) ~0)
/*
- * bitmap pages do not contain tuples. they do contain the standard
+ * Bitmap pages do not contain tuples. They do contain the standard
* page headers and trailers; however, everything in between is a
- * giant bit array. the number of bits that fit on a page obviously
- * depends on the page size and the header/trailer overhead.
+ * giant bit array. The number of bits that fit on a page obviously
+ * depends on the page size and the header/trailer overhead. In the
+ * present implementation, we use exactly half of a page for bitmap,
+ * so that we have a power-of-2 bits per page.
*
+ * The fact that the metapage has separate bsize and bmsize fields,
+ * but only one bshift field, is a design error that ought to be fixed.
*/
#define BMPGSZ_BYTE(metap) ((metap)->hashm_bmsize)
#define BMPGSZ_BIT(metap) ((metap)->hashm_bmsize << BYTE_TO_BIT)
+ #define BMPG_SHIFT(metap) ((metap)->hashm_bshift - 1 + BYTE_TO_BIT)
+ #define BMPG_MASK(metap) (BMPGSZ_BIT(metap) - 1)
+ #define HashPageGetBitmap(pg) \
+ ((uint32 *) (((char *) (pg)) + MAXALIGN(sizeof(PageHeaderData))))
/*
- * The number of bits in an ovflpage bitmap which
- * tells which ovflpages are empty versus in use (NOT the number of
- * bits in an overflow page *address* bitmap).
+ * The number of bits in an ovflpage bitmap word.
*/
- #define BITS_PER_MAP 32 /* Number of bits in ovflpage bitmap */
+ #define BITS_PER_MAP 32 /* Number of bits in uint32 */
- /* Given the address of the beginning of a big map, clear/set the nth bit */
+ /* Given the address of the beginning of a bit map, clear/set the nth bit */
#define CLRBIT(A, N) ((A)[(N)/BITS_PER_MAP] &= ~(1<<((N)%BITS_PER_MAP)))
#define SETBIT(A, N) ((A)[(N)/BITS_PER_MAP] |= (1<<((N)%BITS_PER_MAP)))
#define ISSET(A, N) ((A)[(N)/BITS_PER_MAP] & (1<<((N)%BITS_PER_MAP)))
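
With bitmap data occupying exactly half of each page, the bits-per-page count is a power of 2, so BMPG_SHIFT and BMPG_MASK can split a global overflow-page bit number with a shift and a mask. A standalone sketch of that arithmetic plus the SETBIT/ISSET macros, assuming an 8K page (hashm_bshift = 13):

    #include <stdint.h>
    #include <stdio.h>

    #define BYTE_TO_BIT   3     /* 2^3 bits per byte */
    #define BITS_PER_MAP  32    /* bits in one uint32 map word */

    #define SETBIT(A, N) ((A)[(N)/BITS_PER_MAP] |= ((uint32_t) 1 << ((N) % BITS_PER_MAP)))
    #define ISSET(A, N)  ((A)[(N)/BITS_PER_MAP] &  ((uint32_t) 1 << ((N) % BITS_PER_MAP)))

    int main(void)
    {
        uint16_t bshift = 13;                   /* log2 of an 8192-byte page */
        uint16_t bmsize = 1 << (bshift - 1);    /* half the page: 4096 bytes */
        uint32_t bits_per_page = bmsize << BYTE_TO_BIT;   /* BMPGSZ_BIT: 32768 */
        uint32_t bmpg_shift = bshift - 1 + BYTE_TO_BIT;   /* BMPG_SHIFT: 15 */
        uint32_t bmpg_mask = bits_per_page - 1;           /* BMPG_MASK */

        /* Split a global overflow-bit number into (bitmap page, bit in page). */
        uint32_t ovflbitnum = 40000;            /* hypothetical bit number */
        printf("map page %u, bit %u of %u bits/page\n",
               (unsigned) (ovflbitnum >> bmpg_shift),
               (unsigned) (ovflbitnum & bmpg_mask),
               (unsigned) bits_per_page);

        /* Mark that bit in an in-memory copy of one bitmap page's bit array. */
        uint32_t bitmap[4096 / sizeof(uint32_t)] = {0};
        SETBIT(bitmap, ovflbitnum & bmpg_mask);
        printf("bit set: %s\n", ISSET(bitmap, ovflbitnum & bmpg_mask) ? "yes" : "no");
        return 0;
    }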
@@ -213,18 +182,9 @@ typedef HashItemData *HashItem;
#define HASH_READ 0
#define HASH_WRITE 1
- /*
- * In general, the hash code tries to localize its knowledge about page
- * layout to a couple of routines. However, we need a special value to
- * indicate "no page number" in those places where we expect page numbers.
- */
- #define P_NONE 0
/*
* Strategy number. There's only one valid strategy for hashing: equality.
*/
#define HTEqualStrategyNumber 1
#define HTMaxStrategyNumber 1
@@ -233,9 +193,11 @@ typedef HashItemData *HashItem;
* us with an amproc procudure for hashing a key of the new type.
* Since we only have one such proc in amproc, it's number 1.
*/
#define HASHPROC 1
+ extern bool BuildingHash;
/* public routines */
extern Datum hashbuild(PG_FUNCTION_ARGS);
@@ -276,36 +238,32 @@ extern Datum hash_any(register const unsigned char *k, register int keylen);
/* hashinsert.c */
extern InsertIndexResult _hash_doinsert(Relation rel, HashItem hitem);
/* hashovfl.c */
- extern Buffer _hash_addovflpage(Relation rel, Buffer *metabufp, Buffer buf);
- extern Buffer _hash_freeovflpage(Relation rel, Buffer ovflbuf);
- extern int32 _hash_initbitmap(Relation rel, HashMetaPage metap, int32 pnum,
- int32 nbits, int32 ndx);
+ extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf);
+ extern BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf);
+ extern void _hash_initbitmap(Relation rel, HashMetaPage metap,
+ BlockNumber blkno);
extern void _hash_squeezebucket(Relation rel, HashMetaPage metap,
Bucket bucket);
/* hashpage.c */
extern void _hash_metapinit(Relation rel);
extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno, int access);
extern void _hash_relbuf(Relation rel, Buffer buf, int access);
extern void _hash_wrtbuf(Relation rel, Buffer buf);
extern void _hash_wrtnorelbuf(Buffer buf);
- extern Page _hash_chgbufaccess(Relation rel, Buffer *bufp, int from_access,
+ extern void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access,
int to_access);
extern void _hash_pageinit(Page page, Size size);
extern void _hash_pagedel(Relation rel, ItemPointer tid);
extern void _hash_expandtable(Relation rel, Buffer metabuf);
/* hashscan.c */
extern void _hash_regscan(IndexScanDesc scan);
extern void _hash_dropscan(IndexScanDesc scan);
extern void _hash_adjscans(Relation rel, ItemPointer tid);
extern void AtEOXact_hash(void);
/* hashsearch.c */
extern void _hash_search(Relation rel, int keysz, ScanKey scankey,
Buffer *bufP, HashMetaPage metap);
@@ -314,7 +272,6 @@ extern bool _hash_first(IndexScanDesc scan, ScanDirection dir);
extern bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir,
Buffer metabuf);
/* hashutil.c */
extern ScanKey _hash_mkscankey(Relation rel, IndexTuple itup);
extern void _hash_freeskey(ScanKey skey);
@@ -324,7 +281,6 @@ extern Bucket _hash_call(Relation rel, HashMetaPage metap, Datum key);
extern uint32 _hash_log2(uint32 num);
extern void _hash_checkpage(Page page, int flags);
/* hash.c */
extern void hash_redo(XLogRecPtr lsn, XLogRecord *record);
extern void hash_undo(XLogRecPtr lsn, XLogRecord *record);