mirror of
				https://github.com/postgres/postgres.git
				synced 2025-10-25 13:17:41 +03:00 
			
		
		
		
	Preliminary cleanup for hash index code (doesn't attack the locking problem
yet). Fix a couple of bugs that would only appear if multiple bitmap pages are used, including a buffer reference leak and incorrect computation of bit indexes. Get rid of 'overflow address' concept, which accomplished nothing except obfuscating the code and creating a risk of failure due to limited range of offset field. Rename some misleadingly-named fields and routines, and improve documentation.
This commit is contained in:
		| @@ -7,7 +7,7 @@ | ||||
|  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group | ||||
|  * Portions Copyright (c) 1994, Regents of the University of California | ||||
|  * | ||||
|  * $Id: hash.h,v 1.49 2003/08/04 02:40:10 momjian Exp $ | ||||
|  * $Id: hash.h,v 1.50 2003/09/01 20:26:34 tgl Exp $ | ||||
|  * | ||||
|  * NOTES | ||||
|  *		modeled after Margo Seltzer's hash implementation for unix. | ||||
| @@ -24,43 +24,18 @@ | ||||
| #include "fmgr.h" | ||||
|  | ||||
| /* | ||||
|  * An overflow page is a spare page allocated for storing data whose | ||||
|  * bucket doesn't have room to store it. We use overflow pages rather | ||||
|  * than just splitting the bucket because there is a linear order in | ||||
|  * the way we split buckets. In other words, if there isn't enough space | ||||
|  * in the bucket itself, put it in an overflow page. | ||||
|  * | ||||
|  * Overflow page addresses are stored in form: (Splitnumber, Page offset). | ||||
|  * | ||||
|  * A splitnumber is the number of the generation where the table doubles | ||||
|  * in size. The ovflpage's offset within the splitnumber; offsets start | ||||
|  * at 1. | ||||
|  * | ||||
|  * We convert the stored bitmap address into a page address with the | ||||
|  * macro OADDR_OF(S, O) where S is the splitnumber and O is the page | ||||
|  * offset. | ||||
|  * Mapping from hash bucket number to physical block number of bucket's | ||||
|  * starting page.  Beware of multiple evaluations of argument!  Also notice | ||||
|  * macro's implicit dependency on "metap". | ||||
|  */ | ||||
| typedef uint32 Bucket; | ||||
| typedef bits16 OverflowPageAddress; | ||||
| typedef uint32 SplitNumber; | ||||
| typedef uint32 PageOffset; | ||||
|  | ||||
| /* A valid overflow address will always have a page offset >= 1 */ | ||||
| #define InvalidOvflAddress		0 | ||||
|  | ||||
| #define SPLITSHIFT		11 | ||||
| #define SPLITMASK		0x7FF | ||||
| #define SPLITNUM(N)		((SplitNumber)(((uint32)(N)) >> SPLITSHIFT)) | ||||
| #define OPAGENUM(N)		((PageOffset)((N) & SPLITMASK)) | ||||
| #define OADDR_OF(S,O)	((OverflowPageAddress)((uint32)((uint32)(S) << SPLITSHIFT) + (O))) | ||||
|  | ||||
| #define BUCKET_TO_BLKNO(B) \ | ||||
| 		((Bucket) ((B) + ((B) ? metap->hashm_spares[_hash_log2((B)+1)-1] : 0)) + 1) | ||||
| #define OADDR_TO_BLKNO(B)		 \ | ||||
| 		((BlockNumber) \ | ||||
| 		 (BUCKET_TO_BLKNO ( (1 << SPLITNUM((B))) -1 ) + OPAGENUM((B)))); | ||||
| 		((BlockNumber) ((B) + ((B) ? metap->hashm_spares[_hash_log2((B)+1)-1] : 0)) + 1) | ||||
|  | ||||
| /* | ||||
|  * Special space for hash index pages. | ||||
|  * | ||||
|  * hasho_flag tells us which type of page we're looking at.  For | ||||
|  * example, knowing overflow pages from bucket pages is necessary | ||||
|  * information when you're deleting tuples from a page. If all the | ||||
| @@ -69,7 +44,6 @@ typedef uint32 PageOffset; | ||||
|  * the tuples are deleted from a bucket page, no additional action is | ||||
|  * necessary. | ||||
|  */ | ||||
|  | ||||
| #define LH_UNUSED_PAGE			(0) | ||||
| #define LH_OVERFLOW_PAGE		(1 << 0) | ||||
| #define LH_BUCKET_PAGE			(1 << 1) | ||||
| @@ -78,9 +52,9 @@ typedef uint32 PageOffset; | ||||
|  | ||||
| typedef struct HashPageOpaqueData | ||||
| { | ||||
| 	bits16		hasho_flag;		/* is this page a bucket or ovfl */ | ||||
| 	bits16		hasho_flag;		/* page type code, see above */ | ||||
| 	Bucket		hasho_bucket;	/* bucket number this pg belongs to */ | ||||
| 	OverflowPageAddress hasho_oaddr;	/* ovfl address of this ovfl pg */ | ||||
| 	bits16		hasho_oaddr;	/* no longer used; delete someday */ | ||||
| 	BlockNumber hasho_nextblkno;	/* next ovfl blkno */ | ||||
| 	BlockNumber hasho_prevblkno;	/* previous ovfl (or bucket) blkno */ | ||||
| } HashPageOpaqueData; | ||||
| @@ -91,10 +65,8 @@ typedef HashPageOpaqueData *HashPageOpaque; | ||||
|  *	ScanOpaqueData is used to remember which buffers we're currently | ||||
|  *	examining in the scan.	We keep these buffers locked and pinned and | ||||
|  *	recorded in the opaque entry of the scan in order to avoid doing a | ||||
|  *	ReadBuffer() for every tuple in the index.	This avoids semop() calls, | ||||
|  *	which are expensive. | ||||
|  *	ReadBuffer() for every tuple in the index. | ||||
|  */ | ||||
|  | ||||
| typedef struct HashScanOpaqueData | ||||
| { | ||||
| 	Buffer		hashso_curbuf; | ||||
| @@ -113,60 +85,55 @@ typedef HashScanOpaqueData *HashScanOpaque; | ||||
| #define HASH_VERSION	0 | ||||
|  | ||||
| /* | ||||
|  * NCACHED is used to set the array sizeof spares[] & bitmaps[]. | ||||
|  * Spares[] holds the number of overflow pages currently allocated at or | ||||
|  * before a certain splitpoint. For example, if spares[3] = 7 then there are | ||||
|  * 7 ovflpages before splitpoint 3 (compare BUCKET_TO_BLKNO macro).  The | ||||
|  * value in spares[ovflpoint] increases as overflow pages are added at the | ||||
|  * end of the index.  Once ovflpoint increases (ie, we have actually allocated | ||||
|  * the bucket pages belonging to that splitpoint) the number of spares at the | ||||
|  * prior splitpoint cannot change anymore. | ||||
|  * | ||||
|  * Spares[] is used to hold the number overflow pages currently | ||||
|  * allocated at a certain splitpoint. For example, if spares[3] = 7 | ||||
|  * then there are a maximum of 7 ovflpages available at splitpoint 3. | ||||
|  * The value in spares[] will change as ovflpages are added within | ||||
|  * a splitpoint. | ||||
|  * ovflpages that have been recycled for reuse can be found by looking at | ||||
|  * bitmaps that are stored within ovflpages dedicated for the purpose. | ||||
|  * The blknos of these bitmap pages are kept in bitmaps[]; nmaps is the | ||||
|  * number of currently existing bitmaps. | ||||
|  * | ||||
|  * Within a splitpoint, one can find which ovflpages are available and | ||||
|  * which are used by looking at a bitmaps that are stored on the ovfl | ||||
|  * pages themselves. There is at least one bitmap for every splitpoint's | ||||
|  * ovflpages. Bitmaps[] contains the ovflpage addresses of the ovflpages | ||||
|  * that hold the ovflpage bitmaps. | ||||
|  * | ||||
|  * The reason that the size is restricted to NCACHED (32) is because | ||||
|  * the bitmaps are 16 bits: upper 5 represent the splitpoint, lower 11 | ||||
|  * indicate the page number within the splitpoint. Since there are | ||||
|  * only 5 bits to store the splitpoint, there can only be 32 splitpoints. | ||||
|  * Both spares[] and bitmaps[] use splitpoints as there indices, so there | ||||
|  * can only be 32 of them. | ||||
|  * The limitation on the size of spares[] comes from the fact that there's | ||||
|  * no point in having more than 2^32 buckets with only uint32 hashcodes. | ||||
|  * There is no particularly good reason for bitmaps[] to be the same size, | ||||
|  * but we're stuck with that until we want to force an initdb.  (With 8K | ||||
|  * block size, 32 bitmaps limit us to 8 Gb of overflow space...) | ||||
|  */ | ||||
|  | ||||
| #define NCACHED			32 | ||||
|  | ||||
| #define HASH_MAX_SPLITPOINTS		32 | ||||
| #define HASH_MAX_BITMAPS			32 | ||||
|  | ||||
| typedef struct HashMetaPageData | ||||
| { | ||||
| 	PageHeaderData hashm_phdr;	/* pad for page header (do not use) */ | ||||
| 	uint32		hashm_magic;	/* magic no. for hash tables */ | ||||
| 	uint32		hashm_version;	/* version ID */ | ||||
| 	uint32		hashm_nkeys;	/* number of keys stored in the table */ | ||||
| 	uint16		hashm_ffactor;	/* fill factor */ | ||||
| 	uint16		hashm_bsize;	/* bucket size (bytes) - must be a power | ||||
| 	uint32		hashm_ntuples;	/* number of tuples stored in the table */ | ||||
| 	uint16		hashm_ffactor;	/* target fill factor (tuples/bucket) */ | ||||
| 	uint16		hashm_bsize;	/* index page size (bytes) - must be a power | ||||
| 								 * of 2 */ | ||||
| 	uint16		hashm_bshift;	/* bucket shift */ | ||||
| 	uint16		hashm_bmsize;	/* bitmap array size (bytes) - must be a | ||||
| 								 * power of 2 */ | ||||
| 	uint16		hashm_bshift;	/* log2(bsize) */ | ||||
| 	uint16		hashm_bmsize;	/* bitmap array size (bytes) - must be | ||||
| 								 * exactly half of hashm_bsize */ | ||||
| 	uint32		hashm_maxbucket;	/* ID of maximum bucket in use */ | ||||
| 	uint32		hashm_highmask; /* mask to modulo into entire table */ | ||||
| 	uint32		hashm_lowmask;	/* mask to modulo into lower half of table */ | ||||
| 	uint32		hashm_ovflpoint;/* pageno. from which ovflpgs being | ||||
| 	uint32		hashm_ovflpoint;/* splitpoint from which ovflpgs being | ||||
| 								 * allocated */ | ||||
| 	uint32		hashm_lastfreed;	/* last ovflpage freed */ | ||||
| 	uint32		hashm_nmaps;	/* Initial number of bitmaps */ | ||||
| 	uint32		hashm_spares[NCACHED];	/* spare pages available at | ||||
| 										 * splitpoints */ | ||||
| 	BlockNumber hashm_mapp[NCACHED];	/* blknumbers of ovfl page maps */ | ||||
| 	uint32		hashm_firstfree;	/* lowest-number free ovflpage (bit#) */ | ||||
| 	uint32		hashm_nmaps;	/* number of bitmap pages */ | ||||
| 	uint32		hashm_spares[HASH_MAX_SPLITPOINTS];	/* spare pages before | ||||
| 													 * each splitpoint */ | ||||
| 	BlockNumber hashm_mapp[HASH_MAX_BITMAPS];	/* blknos of ovfl bitmaps */ | ||||
| 	RegProcedure hashm_procid;	/* hash procedure id from pg_proc */ | ||||
| } HashMetaPageData; | ||||
|  | ||||
| typedef HashMetaPageData *HashMetaPage; | ||||
|  | ||||
| extern bool BuildingHash; | ||||
|  | ||||
| typedef struct HashItemData | ||||
| { | ||||
| 	IndexTupleData hash_itup; | ||||
| @@ -178,31 +145,33 @@ typedef HashItemData *HashItem; | ||||
|  * Constants | ||||
|  */ | ||||
| #define DEFAULT_FFACTOR			300 | ||||
| #define SPLITMAX				8 | ||||
| #define BYTE_TO_BIT				3		/* 2^3 bits/byte */ | ||||
| #define INT_TO_BYTE				2		/* 2^2 bytes/int */ | ||||
| #define INT_TO_BIT				5		/* 2^5 bits/int */ | ||||
| #define ALL_SET					((uint32) ~0) | ||||
|  | ||||
| /* | ||||
|  * bitmap pages do not contain tuples.	they do contain the standard | ||||
|  * Bitmap pages do not contain tuples.	They do contain the standard | ||||
|  * page headers and trailers; however, everything in between is a | ||||
|  * giant bit array.  the number of bits that fit on a page obviously | ||||
|  * depends on the page size and the header/trailer overhead. | ||||
|  * giant bit array.  The number of bits that fit on a page obviously | ||||
|  * depends on the page size and the header/trailer overhead.  In the | ||||
|  * present implementation, we use exactly half of a page for bitmap, | ||||
|  * so that we have a power-of-2 bits per page. | ||||
|  * | ||||
|  * The fact that the metapage has separate bsize and bmsize fields, | ||||
|  * but only one bshift field, is a design error that ought to be fixed. | ||||
|  */ | ||||
| #define BMPGSZ_BYTE(metap)		((metap)->hashm_bmsize) | ||||
| #define BMPGSZ_BIT(metap)		((metap)->hashm_bmsize << BYTE_TO_BIT) | ||||
| #define BMPG_SHIFT(metap)		((metap)->hashm_bshift - 1 + BYTE_TO_BIT) | ||||
| #define BMPG_MASK(metap)		(BMPGSZ_BIT(metap) - 1) | ||||
| #define HashPageGetBitmap(pg) \ | ||||
| 	((uint32 *) (((char *) (pg)) + MAXALIGN(sizeof(PageHeaderData)))) | ||||
|  | ||||
| /* | ||||
|  * The number of bits in an ovflpage bitmap which | ||||
|  * tells which ovflpages are empty versus in use (NOT the number of | ||||
|  * bits in an overflow page *address* bitmap). | ||||
|  * The number of bits in an ovflpage bitmap word. | ||||
|  */ | ||||
| #define BITS_PER_MAP	32		/* Number of bits in ovflpage bitmap */ | ||||
| #define BITS_PER_MAP	32		/* Number of bits in uint32 */ | ||||
|  | ||||
| /* Given the address of the beginning of a big map, clear/set the nth bit */ | ||||
| /* Given the address of the beginning of a bit map, clear/set the nth bit */ | ||||
| #define CLRBIT(A, N)	((A)[(N)/BITS_PER_MAP] &= ~(1<<((N)%BITS_PER_MAP))) | ||||
| #define SETBIT(A, N)	((A)[(N)/BITS_PER_MAP] |= (1<<((N)%BITS_PER_MAP))) | ||||
| #define ISSET(A, N)		((A)[(N)/BITS_PER_MAP] & (1<<((N)%BITS_PER_MAP))) | ||||
| @@ -213,18 +182,9 @@ typedef HashItemData *HashItem; | ||||
| #define HASH_READ		0 | ||||
| #define HASH_WRITE		1 | ||||
|  | ||||
| /* | ||||
|  *	In general, the hash code tries to localize its knowledge about page | ||||
|  *	layout to a couple of routines.  However, we need a special value to | ||||
|  *	indicate "no page number" in those places where we expect page numbers. | ||||
|  */ | ||||
|  | ||||
| #define P_NONE			0 | ||||
|  | ||||
| /* | ||||
|  *	Strategy number. There's only one valid strategy for hashing: equality. | ||||
|  */ | ||||
|  | ||||
| #define HTEqualStrategyNumber			1 | ||||
| #define HTMaxStrategyNumber				1 | ||||
|  | ||||
| @@ -233,9 +193,11 @@ typedef HashItemData *HashItem; | ||||
|  *	us with an amproc procudure for hashing a key of the new type. | ||||
|  *	Since we only have one such proc in amproc, it's number 1. | ||||
|  */ | ||||
|  | ||||
| #define HASHPROC		1 | ||||
|  | ||||
|  | ||||
| extern bool BuildingHash; | ||||
|  | ||||
| /* public routines */ | ||||
|  | ||||
| extern Datum hashbuild(PG_FUNCTION_ARGS); | ||||
| @@ -276,36 +238,32 @@ extern Datum hash_any(register const unsigned char *k, register int keylen); | ||||
| /* hashinsert.c */ | ||||
| extern InsertIndexResult _hash_doinsert(Relation rel, HashItem hitem); | ||||
|  | ||||
|  | ||||
| /* hashovfl.c */ | ||||
| extern Buffer _hash_addovflpage(Relation rel, Buffer *metabufp, Buffer buf); | ||||
| extern Buffer _hash_freeovflpage(Relation rel, Buffer ovflbuf); | ||||
| extern int32 _hash_initbitmap(Relation rel, HashMetaPage metap, int32 pnum, | ||||
| 				 int32 nbits, int32 ndx); | ||||
| extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf); | ||||
| extern BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf); | ||||
| extern void _hash_initbitmap(Relation rel, HashMetaPage metap, | ||||
| 							 BlockNumber blkno); | ||||
| extern void _hash_squeezebucket(Relation rel, HashMetaPage metap, | ||||
| 					Bucket bucket); | ||||
|  | ||||
|  | ||||
| /* hashpage.c */ | ||||
| extern void _hash_metapinit(Relation rel); | ||||
| extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno, int access); | ||||
| extern void _hash_relbuf(Relation rel, Buffer buf, int access); | ||||
| extern void _hash_wrtbuf(Relation rel, Buffer buf); | ||||
| extern void _hash_wrtnorelbuf(Buffer buf); | ||||
| extern Page _hash_chgbufaccess(Relation rel, Buffer *bufp, int from_access, | ||||
| extern void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access, | ||||
| 				   int to_access); | ||||
| extern void _hash_pageinit(Page page, Size size); | ||||
| extern void _hash_pagedel(Relation rel, ItemPointer tid); | ||||
| extern void _hash_expandtable(Relation rel, Buffer metabuf); | ||||
|  | ||||
|  | ||||
| /* hashscan.c */ | ||||
| extern void _hash_regscan(IndexScanDesc scan); | ||||
| extern void _hash_dropscan(IndexScanDesc scan); | ||||
| extern void _hash_adjscans(Relation rel, ItemPointer tid); | ||||
| extern void AtEOXact_hash(void); | ||||
|  | ||||
|  | ||||
| /* hashsearch.c */ | ||||
| extern void _hash_search(Relation rel, int keysz, ScanKey scankey, | ||||
| 			 Buffer *bufP, HashMetaPage metap); | ||||
| @@ -314,7 +272,6 @@ extern bool _hash_first(IndexScanDesc scan, ScanDirection dir); | ||||
| extern bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir, | ||||
| 		   Buffer metabuf); | ||||
|  | ||||
|  | ||||
| /* hashutil.c */ | ||||
| extern ScanKey _hash_mkscankey(Relation rel, IndexTuple itup); | ||||
| extern void _hash_freeskey(ScanKey skey); | ||||
| @@ -324,7 +281,6 @@ extern Bucket _hash_call(Relation rel, HashMetaPage metap, Datum key); | ||||
| extern uint32 _hash_log2(uint32 num); | ||||
| extern void _hash_checkpage(Page page, int flags); | ||||
|  | ||||
|  | ||||
| /* hash.c */ | ||||
| extern void hash_redo(XLogRecPtr lsn, XLogRecord *record); | ||||
| extern void hash_undo(XLogRecPtr lsn, XLogRecord *record); | ||||
|   | ||||
		Reference in New Issue
	
	Block a user