
Preliminary cleanup for hash index code (doesn't attack the locking problem

yet).  Fix a couple of bugs that would only appear if multiple bitmap pages
are used, including a buffer reference leak and incorrect computation of bit
indexes.  Get rid of 'overflow address' concept, which accomplished nothing
except obfuscating the code and creating a risk of failure due to limited
range of offset field.  Rename some misleadingly-named fields and routines,
and improve documentation.
Tom Lane
2003-09-01 20:26:34 +00:00
parent eaeb8621f8
commit 65c2d427fb
4 changed files with 374 additions and 460 deletions
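
The "overflow address" being removed packed a splitpoint number and a page offset into one 16-bit field (see the SPLITSHIFT/SPLITMASK macros removed below). A minimal standalone C sketch of that encoding, using hypothetical values, shows the limited offset range the message refers to:

    #include <stdint.h>
    #include <stdio.h>

    /* Old-style packed overflow address: upper 5 bits = splitpoint,
     * lower 11 bits = page offset within that splitpoint (offsets start at 1). */
    #define SPLITSHIFT 11
    #define SPLITMASK  0x7FF
    #define OADDR_OF(S,O) ((uint16_t)(((uint32_t)(S) << SPLITSHIFT) + (O)))
    #define SPLITNUM(N)   ((uint32_t)(N) >> SPLITSHIFT)
    #define OPAGENUM(N)   ((N) & SPLITMASK)

    int main(void)
    {
        /* Hypothetical address: 42nd overflow page of splitpoint 3. */
        uint16_t addr = OADDR_OF(3, 42);

        printf("split=%u page=%u\n",
               (unsigned) SPLITNUM(addr), (unsigned) OPAGENUM(addr));
        /* Only 11 bits remain for the offset, so one splitpoint can address
         * at most 2047 overflow pages -- the "limited range of offset field"
         * that motivated removing this encoding. */
        return 0;
    }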

src/include/access/hash.h

@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $Id: hash.h,v 1.49 2003/08/04 02:40:10 momjian Exp $
+ * $Id: hash.h,v 1.50 2003/09/01 20:26:34 tgl Exp $
*
* NOTES
* modeled after Margo Seltzer's hash implementation for unix.
@@ -24,43 +24,18 @@
#include "fmgr.h"
/*
- * An overflow page is a spare page allocated for storing data whose
- * bucket doesn't have room to store it. We use overflow pages rather
- * than just splitting the bucket because there is a linear order in
- * the way we split buckets. In other words, if there isn't enough space
- * in the bucket itself, put it in an overflow page.
- *
- * Overflow page addresses are stored in form: (Splitnumber, Page offset).
- *
- * A splitnumber is the number of the generation where the table doubles
- * in size. The ovflpage's offset within the splitnumber; offsets start
- * at 1.
- *
- * We convert the stored bitmap address into a page address with the
- * macro OADDR_OF(S, O) where S is the splitnumber and O is the page
- * offset.
+ * Mapping from hash bucket number to physical block number of bucket's
+ * starting page. Beware of multiple evaluations of argument! Also notice
+ * macro's implicit dependency on "metap".
*/
typedef uint32 Bucket;
- typedef bits16 OverflowPageAddress;
- typedef uint32 SplitNumber;
- typedef uint32 PageOffset;
- /* A valid overflow address will always have a page offset >= 1 */
- #define InvalidOvflAddress 0
- #define SPLITSHIFT 11
- #define SPLITMASK 0x7FF
- #define SPLITNUM(N) ((SplitNumber)(((uint32)(N)) >> SPLITSHIFT))
- #define OPAGENUM(N) ((PageOffset)((N) & SPLITMASK))
- #define OADDR_OF(S,O) ((OverflowPageAddress)((uint32)((uint32)(S) << SPLITSHIFT) + (O)))
#define BUCKET_TO_BLKNO(B) \
- ((Bucket) ((B) + ((B) ? metap->hashm_spares[_hash_log2((B)+1)-1] : 0)) + 1)
- #define OADDR_TO_BLKNO(B) \
- ((BlockNumber) \
- (BUCKET_TO_BLKNO ( (1 << SPLITNUM((B))) -1 ) + OPAGENUM((B))));
+ ((BlockNumber) ((B) + ((B) ? metap->hashm_spares[_hash_log2((B)+1)-1] : 0)) + 1)
/*
* Special space for hash index pages.
*
* hasho_flag tells us which type of page we're looking at. For
* example, knowing overflow pages from bucket pages is necessary
* information when you're deleting tuples from a page. If all the
@@ -69,7 +44,6 @@ typedef uint32 PageOffset;
* the tuples are deleted from a bucket page, no additional action is
* necessary.
*/
#define LH_UNUSED_PAGE (0)
#define LH_OVERFLOW_PAGE (1 << 0)
#define LH_BUCKET_PAGE (1 << 1)
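
The BUCKET_TO_BLKNO macro kept above computes a bucket's first block as the bucket number plus all overflow pages allocated before its splitpoint, plus one for the metapage at block 0. A self-contained sketch of that arithmetic, assuming _hash_log2(n) is the smallest i with (1 << i) >= n (as its use in the macro implies) and using a made-up spares[] array:

    #include <stdint.h>
    #include <stdio.h>

    /* Ceiling log2, mirroring _hash_log2: smallest i with (1 << i) >= num. */
    static uint32_t hash_log2(uint32_t num)
    {
        uint32_t i, limit;

        for (i = 0, limit = 1; limit < num; i++, limit <<= 1)
            ;
        return i;
    }

    int main(void)
    {
        /* Hypothetical cumulative counts: 0 ovflpages before splitpoint 1,
         * 2 before splitpoint 2, 5 before splitpoint 3. */
        uint32_t spares[32] = {0, 2, 5};

        for (uint32_t bucket = 0; bucket < 4; bucket++)
        {
            /* Mirrors BUCKET_TO_BLKNO: the metapage is block 0, hence the +1. */
            uint32_t blkno = bucket +
                (bucket ? spares[hash_log2(bucket + 1) - 1] : 0) + 1;

            printf("bucket %u starts at block %u\n",
                   (unsigned) bucket, (unsigned) blkno);
        }
        return 0;
    }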
@@ -78,9 +52,9 @@ typedef uint32 PageOffset;
typedef struct HashPageOpaqueData
{
- bits16 hasho_flag; /* is this page a bucket or ovfl */
+ bits16 hasho_flag; /* page type code, see above */
Bucket hasho_bucket; /* bucket number this pg belongs to */
- OverflowPageAddress hasho_oaddr; /* ovfl address of this ovfl pg */
+ bits16 hasho_oaddr; /* no longer used; delete someday */
BlockNumber hasho_nextblkno; /* next ovfl blkno */
BlockNumber hasho_prevblkno; /* previous ovfl (or bucket) blkno */
} HashPageOpaqueData;
@@ -91,10 +65,8 @@ typedef HashPageOpaqueData *HashPageOpaque;
* ScanOpaqueData is used to remember which buffers we're currently
* examining in the scan. We keep these buffers locked and pinned and
* recorded in the opaque entry of the scan in order to avoid doing a
- * ReadBuffer() for every tuple in the index. This avoids semop() calls,
- * which are expensive.
+ * ReadBuffer() for every tuple in the index.
*/
typedef struct HashScanOpaqueData
{
Buffer hashso_curbuf;
@@ -113,60 +85,55 @@ typedef HashScanOpaqueData *HashScanOpaque;
#define HASH_VERSION 0
/*
- * NCACHED is used to set the array sizeof spares[] & bitmaps[].
+ * Spares[] holds the number of overflow pages currently allocated at or
+ * before a certain splitpoint. For example, if spares[3] = 7 then there are
+ * 7 ovflpages before splitpoint 3 (compare BUCKET_TO_BLKNO macro). The
+ * value in spares[ovflpoint] increases as overflow pages are added at the
+ * end of the index. Once ovflpoint increases (ie, we have actually allocated
+ * the bucket pages belonging to that splitpoint) the number of spares at the
+ * prior splitpoint cannot change anymore.
*
- * Spares[] is used to hold the number overflow pages currently
- * allocated at a certain splitpoint. For example, if spares[3] = 7
- * then there are a maximum of 7 ovflpages available at splitpoint 3.
- * The value in spares[] will change as ovflpages are added within
- * a splitpoint.
+ * ovflpages that have been recycled for reuse can be found by looking at
+ * bitmaps that are stored within ovflpages dedicated for the purpose.
+ * The blknos of these bitmap pages are kept in bitmaps[]; nmaps is the
+ * number of currently existing bitmaps.
*
- * Within a splitpoint, one can find which ovflpages are available and
- * which are used by looking at a bitmaps that are stored on the ovfl
- * pages themselves. There is at least one bitmap for every splitpoint's
- * ovflpages. Bitmaps[] contains the ovflpage addresses of the ovflpages
- * that hold the ovflpage bitmaps.
*
- * The reason that the size is restricted to NCACHED (32) is because
- * the bitmaps are 16 bits: upper 5 represent the splitpoint, lower 11
- * indicate the page number within the splitpoint. Since there are
- * only 5 bits to store the splitpoint, there can only be 32 splitpoints.
- * Both spares[] and bitmaps[] use splitpoints as there indices, so there
- * can only be 32 of them.
+ * The limitation on the size of spares[] comes from the fact that there's
+ * no point in having more than 2^32 buckets with only uint32 hashcodes.
+ * There is no particularly good reason for bitmaps[] to be the same size,
+ * but we're stuck with that until we want to force an initdb. (With 8K
+ * block size, 32 bitmaps limit us to 8 Gb of overflow space...)
*/
- #define NCACHED 32
+ #define HASH_MAX_SPLITPOINTS 32
+ #define HASH_MAX_BITMAPS 32
typedef struct HashMetaPageData
{
PageHeaderData hashm_phdr; /* pad for page header (do not use) */
uint32 hashm_magic; /* magic no. for hash tables */
uint32 hashm_version; /* version ID */
- uint32 hashm_nkeys; /* number of keys stored in the table */
- uint16 hashm_ffactor; /* fill factor */
- uint16 hashm_bsize; /* bucket size (bytes) - must be a power
+ uint32 hashm_ntuples; /* number of tuples stored in the table */
+ uint16 hashm_ffactor; /* target fill factor (tuples/bucket) */
+ uint16 hashm_bsize; /* index page size (bytes) - must be a power
* of 2 */
- uint16 hashm_bshift; /* bucket shift */
- uint16 hashm_bmsize; /* bitmap array size (bytes) - must be a
- * power of 2 */
+ uint16 hashm_bshift; /* log2(bsize) */
+ uint16 hashm_bmsize; /* bitmap array size (bytes) - must be
+ * exactly half of hashm_bsize */
uint32 hashm_maxbucket; /* ID of maximum bucket in use */
uint32 hashm_highmask; /* mask to modulo into entire table */
uint32 hashm_lowmask; /* mask to modulo into lower half of table */
- uint32 hashm_ovflpoint;/* pageno. from which ovflpgs being
+ uint32 hashm_ovflpoint;/* splitpoint from which ovflpgs being
* allocated */
- uint32 hashm_lastfreed; /* last ovflpage freed */
- uint32 hashm_nmaps; /* Initial number of bitmaps */
- uint32 hashm_spares[NCACHED]; /* spare pages available at
- * splitpoints */
- BlockNumber hashm_mapp[NCACHED]; /* blknumbers of ovfl page maps */
+ uint32 hashm_firstfree; /* lowest-number free ovflpage (bit#) */
+ uint32 hashm_nmaps; /* number of bitmap pages */
+ uint32 hashm_spares[HASH_MAX_SPLITPOINTS]; /* spare pages before
+ * each splitpoint */
+ BlockNumber hashm_mapp[HASH_MAX_BITMAPS]; /* blknos of ovfl bitmaps */
RegProcedure hashm_procid; /* hash procedure id from pg_proc */
} HashMetaPageData;
typedef HashMetaPageData *HashMetaPage;
- extern bool BuildingHash;
typedef struct HashItemData
{
IndexTupleData hash_itup;
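
The reworded spares[] comment above makes the counts cumulative: spares[i] is the total number of overflow pages allocated at or before splitpoint i, and entries below hashm_ovflpoint are frozen. A toy sketch of that bookkeeping (hypothetical helper functions, not the actual PostgreSQL routines):

    #include <stdint.h>
    #include <stdio.h>

    #define HASH_MAX_SPLITPOINTS 32

    /* Cumulative counts, per the comment: spares[i] = number of ovflpages
     * allocated at or before splitpoint i. */
    static uint32_t spares[HASH_MAX_SPLITPOINTS];
    static uint32_t ovflpoint = 0;  /* splitpoint currently receiving ovflpages */

    /* Record one more overflow page at the current splitpoint. */
    static void add_ovflpage(void)
    {
        spares[ovflpoint]++;
    }

    /* Advance to the next splitpoint; earlier entries are now frozen. */
    static void advance_splitpoint(void)
    {
        ovflpoint++;
        spares[ovflpoint] = spares[ovflpoint - 1];  /* carry the running total */
    }

    int main(void)
    {
        add_ovflpage();             /* spares[0] = 1 */
        advance_splitpoint();
        add_ovflpage();
        add_ovflpage();             /* spares[1] = 3 */
        printf("spares[0]=%u spares[1]=%u\n",
               (unsigned) spares[0], (unsigned) spares[1]);
        return 0;
    }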
@@ -178,31 +145,33 @@ typedef HashItemData *HashItem;
* Constants
*/
#define DEFAULT_FFACTOR 300
- #define SPLITMAX 8
#define BYTE_TO_BIT 3 /* 2^3 bits/byte */
#define INT_TO_BYTE 2 /* 2^2 bytes/int */
#define INT_TO_BIT 5 /* 2^5 bits/int */
#define ALL_SET ((uint32) ~0)
/*
- * bitmap pages do not contain tuples. they do contain the standard
+ * Bitmap pages do not contain tuples. They do contain the standard
* page headers and trailers; however, everything in between is a
- * giant bit array. the number of bits that fit on a page obviously
- * depends on the page size and the header/trailer overhead.
+ * giant bit array. The number of bits that fit on a page obviously
+ * depends on the page size and the header/trailer overhead. In the
+ * present implementation, we use exactly half of a page for bitmap,
+ * so that we have a power-of-2 bits per page.
*
+ * The fact that the metapage has separate bsize and bmsize fields,
+ * but only one bshift field, is a design error that ought to be fixed.
*/
#define BMPGSZ_BYTE(metap) ((metap)->hashm_bmsize)
#define BMPGSZ_BIT(metap) ((metap)->hashm_bmsize << BYTE_TO_BIT)
+ #define BMPG_SHIFT(metap) ((metap)->hashm_bshift - 1 + BYTE_TO_BIT)
+ #define BMPG_MASK(metap) (BMPGSZ_BIT(metap) - 1)
+ #define HashPageGetBitmap(pg) \
+ ((uint32 *) (((char *) (pg)) + MAXALIGN(sizeof(PageHeaderData))))
/*
- * The number of bits in an ovflpage bitmap which
- * tells which ovflpages are empty versus in use (NOT the number of
- * bits in an overflow page *address* bitmap).
+ * The number of bits in an ovflpage bitmap word.
*/
- #define BITS_PER_MAP 32 /* Number of bits in ovflpage bitmap */
+ #define BITS_PER_MAP 32 /* Number of bits in uint32 */
- /* Given the address of the beginning of a big map, clear/set the nth bit */
+ /* Given the address of the beginning of a bit map, clear/set the nth bit */
#define CLRBIT(A, N) ((A)[(N)/BITS_PER_MAP] &= ~(1<<((N)%BITS_PER_MAP)))
#define SETBIT(A, N) ((A)[(N)/BITS_PER_MAP] |= (1<<((N)%BITS_PER_MAP)))
#define ISSET(A, N) ((A)[(N)/BITS_PER_MAP] & (1<<((N)%BITS_PER_MAP)))
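
With bitmap data occupying exactly half of each page, the bits-per-page count is a power of 2, so BMPG_SHIFT and BMPG_MASK can split a global overflow-page bit number with a shift and a mask. A standalone sketch of that arithmetic plus the SETBIT/ISSET macros, assuming an 8K page (hashm_bshift = 13):

    #include <stdint.h>
    #include <stdio.h>

    #define BYTE_TO_BIT   3     /* 2^3 bits per byte */
    #define BITS_PER_MAP  32    /* bits in one uint32 map word */

    #define SETBIT(A, N) ((A)[(N)/BITS_PER_MAP] |= ((uint32_t) 1 << ((N) % BITS_PER_MAP)))
    #define ISSET(A, N)  ((A)[(N)/BITS_PER_MAP] &  ((uint32_t) 1 << ((N) % BITS_PER_MAP)))

    int main(void)
    {
        uint16_t bshift = 13;                   /* log2 of an 8192-byte page */
        uint16_t bmsize = 1 << (bshift - 1);    /* half the page: 4096 bytes */
        uint32_t bits_per_page = bmsize << BYTE_TO_BIT;   /* BMPGSZ_BIT: 32768 */
        uint32_t bmpg_shift = bshift - 1 + BYTE_TO_BIT;   /* BMPG_SHIFT: 15 */
        uint32_t bmpg_mask = bits_per_page - 1;           /* BMPG_MASK */

        /* Split a global overflow-bit number into (bitmap page, bit in page). */
        uint32_t ovflbitnum = 40000;            /* hypothetical bit number */
        printf("map page %u, bit %u of %u bits/page\n",
               (unsigned) (ovflbitnum >> bmpg_shift),
               (unsigned) (ovflbitnum & bmpg_mask),
               (unsigned) bits_per_page);

        /* Mark that bit in an in-memory copy of one bitmap page's bit array. */
        uint32_t bitmap[4096 / sizeof(uint32_t)] = {0};
        SETBIT(bitmap, ovflbitnum & bmpg_mask);
        printf("bit set: %s\n", ISSET(bitmap, ovflbitnum & bmpg_mask) ? "yes" : "no");
        return 0;
    }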
@@ -213,18 +182,9 @@ typedef HashItemData *HashItem;
#define HASH_READ 0
#define HASH_WRITE 1
- /*
- * In general, the hash code tries to localize its knowledge about page
- * layout to a couple of routines. However, we need a special value to
- * indicate "no page number" in those places where we expect page numbers.
- */
- #define P_NONE 0
/*
* Strategy number. There's only one valid strategy for hashing: equality.
*/
#define HTEqualStrategyNumber 1
#define HTMaxStrategyNumber 1
@@ -233,9 +193,11 @@ typedef HashItemData *HashItem;
* us with an amproc procudure for hashing a key of the new type.
* Since we only have one such proc in amproc, it's number 1.
*/
#define HASHPROC 1
+ extern bool BuildingHash;
/* public routines */
extern Datum hashbuild(PG_FUNCTION_ARGS);
@@ -276,36 +238,32 @@ extern Datum hash_any(register const unsigned char *k, register int keylen);
/* hashinsert.c */
extern InsertIndexResult _hash_doinsert(Relation rel, HashItem hitem);
/* hashovfl.c */
- extern Buffer _hash_addovflpage(Relation rel, Buffer *metabufp, Buffer buf);
- extern Buffer _hash_freeovflpage(Relation rel, Buffer ovflbuf);
- extern int32 _hash_initbitmap(Relation rel, HashMetaPage metap, int32 pnum,
- int32 nbits, int32 ndx);
+ extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf);
+ extern BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf);
+ extern void _hash_initbitmap(Relation rel, HashMetaPage metap,
+ BlockNumber blkno);
extern void _hash_squeezebucket(Relation rel, HashMetaPage metap,
Bucket bucket);
/* hashpage.c */
extern void _hash_metapinit(Relation rel);
extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno, int access);
extern void _hash_relbuf(Relation rel, Buffer buf, int access);
extern void _hash_wrtbuf(Relation rel, Buffer buf);
extern void _hash_wrtnorelbuf(Buffer buf);
- extern Page _hash_chgbufaccess(Relation rel, Buffer *bufp, int from_access,
+ extern void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access,
int to_access);
extern void _hash_pageinit(Page page, Size size);
extern void _hash_pagedel(Relation rel, ItemPointer tid);
extern void _hash_expandtable(Relation rel, Buffer metabuf);
/* hashscan.c */
extern void _hash_regscan(IndexScanDesc scan);
extern void _hash_dropscan(IndexScanDesc scan);
extern void _hash_adjscans(Relation rel, ItemPointer tid);
extern void AtEOXact_hash(void);
/* hashsearch.c */
extern void _hash_search(Relation rel, int keysz, ScanKey scankey,
Buffer *bufP, HashMetaPage metap);
@@ -314,7 +272,6 @@ extern bool _hash_first(IndexScanDesc scan, ScanDirection dir);
extern bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir,
Buffer metabuf);
/* hashutil.c */
extern ScanKey _hash_mkscankey(Relation rel, IndexTuple itup);
extern void _hash_freeskey(ScanKey skey);
@@ -324,7 +281,6 @@ extern Bucket _hash_call(Relation rel, HashMetaPage metap, Datum key);
extern uint32 _hash_log2(uint32 num);
extern void _hash_checkpage(Page page, int flags);
/* hash.c */
extern void hash_redo(XLogRecPtr lsn, XLogRecord *record);
extern void hash_undo(XLogRecPtr lsn, XLogRecord *record);