diff --git a/src/backend/access/hash/README b/src/backend/access/hash/README
index 3ff70cde3ce..a894236885f 100644
--- a/src/backend/access/hash/README
+++ b/src/backend/access/hash/README
@@ -1,4 +1,4 @@
-$PostgreSQL: pgsql/src/backend/access/hash/README,v 1.4 2003/11/29 19:51:40 pgsql Exp $
+$PostgreSQL: pgsql/src/backend/access/hash/README,v 1.4.8.1 2007/04/19 20:24:10 tgl Exp $
 
 This directory contains an implementation of hash indexing for Postgres.
 
@@ -72,6 +72,18 @@ index, and preparing to allocate additional overflow pages after those
 bucket pages.  hashm_spares[] entries before S cannot change anymore,
 since that would require moving already-created bucket pages.
 
+The last page nominally used by the index is always determinable from
+hashm_spares[S].  To avoid complaints from smgr, the logical EOF as seen by
+the filesystem and smgr must always be greater than or equal to this page.
+We have to allow the case "greater than" because it's possible that during
+an index extension we crash after allocating filesystem space and before
+updating the metapage.  Note that on filesystems that allow "holes" in
+files, it's entirely likely that pages before the logical EOF are not yet
+allocated: when we allocate a new splitpoint's worth of bucket pages, we
+physically zero the last such page to force the EOF up, and the first such
+page will be used immediately, but the intervening pages are not written
+until needed.
+
 Since overflow pages may be recycled if enough tuples are deleted from their
 bucket, we need a way to keep track of currently-free overflow pages.  The
 state of each overflow page (0 = available, 1 = not available)
@@ -305,6 +317,10 @@ we can just error out without any great harm being done.
 Free space management
 ---------------------
 
+(Question: why is this so complicated?  Why not just have a linked list
+of free pages with the list head in the metapage?  It's not like we
+avoid needing to modify the metapage with all this.)
+
 Free space management consists of two sub-algorithms, one for reserving
 an overflow page to add to a bucket chain, and one for returning an empty
 overflow page to the free pool.
diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c
index 4fe0301c75d..f95e1c6d7f8 100644
--- a/src/backend/access/hash/hashovfl.c
+++ b/src/backend/access/hash/hashovfl.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.53 2006/11/19 21:33:22 tgl Exp $
+ *    $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.53.2.1 2007/04/19 20:24:10 tgl Exp $
  *
  * NOTES
  *    Overflow pages look like ordinary relation pages.
@@ -272,19 +272,12 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
     blkno = bitno_to_blkno(metap, bit);
 
     /*
-     * We have to fetch the page with P_NEW to ensure smgr's idea of the
+     * Fetch the page with _hash_getnewbuf to ensure smgr's idea of the
      * relation length stays in sync with ours.  XXX It's annoying to do this
      * with metapage write lock held; would be better to use a lock that
-     * doesn't block incoming searches.  Best way to fix it would be to stop
-     * maintaining hashm_spares[hashm_ovflpoint] and rely entirely on the
-     * smgr relation length to track where new overflow pages come from;
-     * then we could release the metapage before we do the smgrextend.
-     * FIXME later (not in beta...)
+     * doesn't block incoming searches.
      */
-    newbuf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
-    if (BufferGetBlockNumber(newbuf) != blkno)
-        elog(ERROR, "unexpected hash relation size: %u, should be %u",
-             BufferGetBlockNumber(newbuf), blkno);
+    newbuf = _hash_getnewbuf(rel, blkno, HASH_WRITE);
 
     metap->hashm_spares[splitnum]++;
 
@@ -507,19 +500,14 @@ _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno)
     /*
      * It is okay to write-lock the new bitmap page while holding metapage
      * write lock, because no one else could be contending for the new page.
-     * Also, the metapage lock makes it safe to extend the index using P_NEW,
-     * which we want to do to ensure the smgr's idea of the relation size
-     * stays in step with ours.
+     * Also, the metapage lock makes it safe to extend the index using
+     * _hash_getnewbuf.
      *
      * There is some loss of concurrency in possibly doing I/O for the new
      * page while holding the metapage lock, but this path is taken so seldom
      * that it's not worth worrying about.
      */
-    buf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
-    if (BufferGetBlockNumber(buf) != blkno)
-        elog(ERROR, "unexpected hash relation size: %u, should be %u",
-             BufferGetBlockNumber(buf), blkno);
-
+    buf = _hash_getnewbuf(rel, blkno, HASH_WRITE);
     pg = BufferGetPage(buf);
 
     /* initialize the page */
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 0f643836a1c..f08404d0536 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.61 2006/11/19 21:33:23 tgl Exp $
+ *    $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.61.2.1 2007/04/19 20:24:10 tgl Exp $
  *
  * NOTES
  *    Postgres hash pages look like ordinary relation pages.  The opaque
@@ -36,7 +36,8 @@
 #include "utils/lsyscache.h"
 
 
-static BlockNumber _hash_alloc_buckets(Relation rel, uint32 nblocks);
+static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock,
+                    uint32 nblocks);
 static void _hash_splitbucket(Relation rel, Buffer metabuf,
                   Bucket obucket, Bucket nbucket,
                   BlockNumber start_oblkno,
@@ -104,8 +105,9 @@ _hash_droplock(Relation rel, BlockNumber whichlock, int access)
  *        requested buffer and its reference count has been incremented
  *        (ie, the buffer is "locked and pinned").
  *
- *        blkno == P_NEW is allowed, but it is caller's responsibility to
- *        ensure that only one process can extend the index at a time.
+ *        P_NEW is disallowed because this routine should only be used
+ *        to access pages that are known to be before the filesystem EOF.
+ *        Extending the index should be done with _hash_getnewbuf.
  *
  *        All call sites should call either _hash_checkpage or _hash_pageinit
  *        on the returned page, depending on whether the block is expected
@@ -116,6 +118,9 @@ _hash_getbuf(Relation rel, BlockNumber blkno, int access)
 {
     Buffer        buf;
 
+    if (blkno == P_NEW)
+        elog(ERROR, "hash AM does not use P_NEW");
+
     buf = ReadBuffer(rel, blkno);
 
     if (access != HASH_NOLOCK)
@@ -125,6 +130,51 @@ _hash_getbuf(Relation rel, BlockNumber blkno, int access)
     return buf;
 }
 
+/*
+ *    _hash_getnewbuf() -- Get a new page at the end of the index.
+ *
+ *        This has the same API as _hash_getbuf, except that we are adding
+ *        a page to the index, and hence expect the page to be past the
+ *        logical EOF.  (However, we have to support the case where it isn't,
+ *        since a prior try might have crashed after extending the filesystem
+ *        EOF but before updating the metapage to reflect the added page.)
+ *
+ *        It is caller's responsibility to ensure that only one process can
+ *        extend the index at a time.
+ *
+ *        All call sites should call _hash_pageinit on the returned page.
+ *        Also, it's difficult to imagine why access would not be HASH_WRITE.
+ */
+Buffer
+_hash_getnewbuf(Relation rel, BlockNumber blkno, int access)
+{
+    BlockNumber nblocks = RelationGetNumberOfBlocks(rel);
+    Buffer        buf;
+
+    if (blkno == P_NEW)
+        elog(ERROR, "hash AM does not use P_NEW");
+    if (blkno > nblocks)
+        elog(ERROR, "access to noncontiguous page in hash index \"%s\"",
+             RelationGetRelationName(rel));
+
+    /* smgr insists we use P_NEW to extend the relation */
+    if (blkno == nblocks)
+    {
+        buf = ReadBuffer(rel, P_NEW);
+        if (BufferGetBlockNumber(buf) != blkno)
+            elog(ERROR, "unexpected hash relation size: %u, should be %u",
+                 BufferGetBlockNumber(buf), blkno);
+    }
+    else
+        buf = ReadBuffer(rel, blkno);
+
+    if (access != HASH_NOLOCK)
+        LockBuffer(buf, access);
+
+    /* ref count and lock type are correct */
+    return buf;
+}
+
 /*
  *    _hash_relbuf() -- release a locked buffer.
  *
@@ -238,12 +288,11 @@ _hash_metapinit(Relation rel)
 
     /*
      * We initialize the metapage, the first two bucket pages, and the
-     * first bitmap page in sequence, using P_NEW to cause smgrextend()
-     * calls to occur.  This ensures that the smgr level has the right
-     * idea of the physical index length.
+     * first bitmap page in sequence, using _hash_getnewbuf to cause
+     * smgrextend() calls to occur.  This ensures that the smgr level
+     * has the right idea of the physical index length.
      */
-    metabuf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
-    Assert(BufferGetBlockNumber(metabuf) == HASH_METAPAGE);
+    metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, HASH_WRITE);
     pg = BufferGetPage(metabuf);
     _hash_pageinit(pg, BufferGetPageSize(metabuf));
 
@@ -296,8 +345,7 @@ _hash_metapinit(Relation rel)
      */
     for (i = 0; i <= 1; i++)
     {
-        buf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
-        Assert(BufferGetBlockNumber(buf) == BUCKET_TO_BLKNO(metap, i));
+        buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i), HASH_WRITE);
         pg = BufferGetPage(buf);
         _hash_pageinit(pg, BufferGetPageSize(buf));
         pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
@@ -345,7 +393,6 @@ _hash_expandtable(Relation rel, Buffer metabuf)
     Bucket        old_bucket;
     Bucket        new_bucket;
     uint32        spare_ndx;
-    BlockNumber firstblock = InvalidBlockNumber;
     BlockNumber start_oblkno;
     BlockNumber start_nblkno;
     uint32        maxbucket;
@@ -397,11 +444,45 @@ _hash_expandtable(Relation rel, Buffer metabuf)
     if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE)
         goto fail;
 
+    /*
+     * Determine which bucket is to be split, and attempt to lock the old
+     * bucket.  If we can't get the lock, give up.
+     *
+     * The lock protects us against other backends, but not against our own
+     * backend.  Must check for active scans separately.
+     */
+    new_bucket = metap->hashm_maxbucket + 1;
+
+    old_bucket = (new_bucket & metap->hashm_lowmask);
+
+    start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);
+
+    if (_hash_has_active_scan(rel, old_bucket))
+        goto fail;
+
+    if (!_hash_try_getlock(rel, start_oblkno, HASH_EXCLUSIVE))
+        goto fail;
+
+    /*
+     * Likewise lock the new bucket (should never fail).
+     *
+     * Note: it is safe to compute the new bucket's blkno here, even though
+     * we may still need to update the BUCKET_TO_BLKNO mapping.  This is
+     * because the current value of hashm_spares[hashm_ovflpoint] correctly
+     * shows where we are going to put a new splitpoint's worth of buckets.
+     */
+    start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
+
+    if (_hash_has_active_scan(rel, new_bucket))
+        elog(ERROR, "scan in progress on supposedly new bucket");
+
+    if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE))
+        elog(ERROR, "could not get lock on supposedly new bucket");
+
     /*
      * If the split point is increasing (hashm_maxbucket's log base 2
      * increases), we need to allocate a new batch of bucket pages.
      */
-    new_bucket = metap->hashm_maxbucket + 1;
     spare_ndx = _hash_log2(new_bucket + 1);
     if (spare_ndx > metap->hashm_ovflpoint)
     {
@@ -412,34 +493,15 @@ _hash_expandtable(Relation rel, Buffer metabuf)
          * this maps one-to-one to blocks required, but someday we may need
          * a more complicated calculation here.
          */
-        firstblock = _hash_alloc_buckets(rel, new_bucket);
-        if (firstblock == InvalidBlockNumber)
-            goto fail;          /* can't split due to BlockNumber overflow */
+        if (!_hash_alloc_buckets(rel, start_nblkno, new_bucket))
+        {
+            /* can't split due to BlockNumber overflow */
+            _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE);
+            _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE);
+            goto fail;
+        }
     }
 
-    /*
-     * Determine which bucket is to be split, and attempt to lock the old
-     * bucket.  If we can't get the lock, give up.
-     *
-     * The lock protects us against other backends, but not against our own
-     * backend.  Must check for active scans separately.
-     *
-     * Ideally we would lock the new bucket too before proceeding, but if we
-     * are about to cross a splitpoint then the BUCKET_TO_BLKNO mapping isn't
-     * correct yet.  For simplicity we update the metapage first and then
-     * lock.  This should be okay because no one else should be trying to lock
-     * the new bucket yet...
-     */
-    old_bucket = (new_bucket & metap->hashm_lowmask);
-
-    start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);
-
-    if (_hash_has_active_scan(rel, old_bucket))
-        goto fail;
-
-    if (!_hash_try_getlock(rel, start_oblkno, HASH_EXCLUSIVE))
-        goto fail;
-
     /*
      * Okay to proceed with split.  Update the metapage bucket mapping info.
      *
@@ -472,20 +534,6 @@ _hash_expandtable(Relation rel, Buffer metabuf)
         metap->hashm_ovflpoint = spare_ndx;
     }
 
-    /* now we can compute the new bucket's primary block number */
-    start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
-
-    /* if we added a splitpoint, should match result of _hash_alloc_buckets */
-    if (firstblock != InvalidBlockNumber &&
-        firstblock != start_nblkno)
-        elog(PANIC, "unexpected hash relation size: %u, should be %u",
-             firstblock, start_nblkno);
-
-    Assert(!_hash_has_active_scan(rel, new_bucket));
-
-    if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE))
-        elog(PANIC, "could not get lock on supposedly new bucket");
-
     /* Done mucking with metapage */
     END_CRIT_SECTION();
 
@@ -551,23 +599,16 @@ fail:
  * for the purpose.  OTOH, adding a splitpoint is a very infrequent operation,
  * so it may not be worth worrying about.
  *
- * Returns the first block number in the new splitpoint's range, or
- * InvalidBlockNumber if allocation failed due to BlockNumber overflow.
+ * Returns TRUE if successful, or FALSE if allocation failed due to
+ * BlockNumber overflow.
  */
-static BlockNumber
-_hash_alloc_buckets(Relation rel, uint32 nblocks)
+static bool
+_hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
 {
-    BlockNumber firstblock;
     BlockNumber lastblock;
     BlockNumber endblock;
     char        zerobuf[BLCKSZ];
 
-    /*
-     * Since we hold metapage lock, no one else is either splitting or
-     * allocating a new page in _hash_getovflpage(); hence it's safe to
-     * assume that the relation length isn't changing under us.
-     */
-    firstblock = RelationGetNumberOfBlocks(rel);
     lastblock = firstblock + nblocks - 1;
 
     /*
@@ -575,12 +616,12 @@ _hash_alloc_buckets(Relation rel, uint32 nblocks)
      * extend the index anymore.
      */
     if (lastblock < firstblock || lastblock == InvalidBlockNumber)
-        return InvalidBlockNumber;
-
-    /* Note: we assume RelationGetNumberOfBlocks did RelationOpenSmgr for us */
+        return false;
 
     MemSet(zerobuf, 0, sizeof(zerobuf));
 
+    RelationOpenSmgr(rel);
+
     /*
      * XXX If the extension results in creation of new segment files,
      * we have to make sure that each non-last file is correctly filled out to
@@ -597,7 +638,7 @@ _hash_alloc_buckets(Relation rel, uint32 nblocks)
 
     smgrextend(rel->rd_smgr, lastblock, zerobuf, rel->rd_istemp);
 
-    return firstblock;
+    return true;
 }
 
 
diff --git a/src/include/access/hash.h b/src/include/access/hash.h
index 77ed99baa68..016e51d09f9 100644
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/hash.h,v 1.73 2006/07/13 16:49:19 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/access/hash.h,v 1.73.2.1 2007/04/19 20:24:10 tgl Exp $
  *
  * NOTES
  *        modeled after Margo Seltzer's hash implementation for unix.
@@ -281,6 +281,7 @@ extern void _hash_getlock(Relation rel, BlockNumber whichlock, int access);
 extern bool _hash_try_getlock(Relation rel, BlockNumber whichlock, int access);
 extern void _hash_droplock(Relation rel, BlockNumber whichlock, int access);
 extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno, int access);
+extern Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno, int access);
 extern void _hash_relbuf(Relation rel, Buffer buf);
 extern void _hash_dropbuf(Relation rel, Buffer buf);
 extern void _hash_wrtbuf(Relation rel, Buffer buf);
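
The arithmetic behind the README's new invariant can be made concrete with a
short standalone sketch (not part of the patch above).  It assumes a
simplified model: once splitpoint S has been allocated, the index nominally
consists of the metapage (block 0), 2^S bucket pages, and the non-bucket
(overflow/bitmap) pages counted by hashm_spares[S] interleaved among them, so
the last nominally used block is 2^S + hashm_spares[S]; the physical EOF may
legitimately be beyond that, which is exactly the case _hash_getnewbuf now
tolerates.  All *_demo names below are hypothetical, not PostgreSQL symbols.

/*
 * Illustrative sketch only -- not part of the patch.  Under the assumption
 * that spares_demo[S] counts every non-bucket page allocated so far, the
 * last block nominally used at splitpoint S is 2^S + spares_demo[S].
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define HASH_MAX_SPLITPOINTS_DEMO 32

/* Last block nominally used when the highest allocated splitpoint is S. */
static uint32_t
last_used_block_demo(uint32_t S, const uint32_t *spares_demo)
{
    return ((uint32_t) 1 << S) + spares_demo[S];
}

int
main(void)
{
    /* pretend values: 1 non-bucket page by splitpoint 1, 3 by splitpoints 2-3 */
    uint32_t    spares_demo[HASH_MAX_SPLITPOINTS_DEMO] = {0, 1, 3, 3};
    uint32_t    S;

    /* the index starts with two buckets (see _hash_metapinit), i.e. splitpoint 1 */
    for (S = 1; S <= 3; S++)
        printf("splitpoint %" PRIu32 ": %" PRIu32 " buckets, last nominal block %" PRIu32 "\n",
               S, (uint32_t) 1 << S, last_used_block_demo(S, spares_demo));
    return 0;
}

With the pretend spares values above, the sketch prints last nominal blocks
3, 7, and 11 for splitpoints 1 through 3; the filesystem EOF must always be
at or beyond that block, which is what _hash_getnewbuf checks and enforces.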