1
0
mirror of https://github.com/postgres/postgres.git synced 2025-11-12 05:01:15 +03:00

Reduce use of heavyweight locking inside hash AM.

Avoid using LockPage(rel, 0, lockmode) to protect against changes to
the bucket mapping.  Instead, an exclusive buffer content lock is now
viewed as sufficient permission to modify the metapage, and a shared
buffer content lock is used when such modifications need to be
prevented.  This more relaxed locking regimen makes it possible that,
when we're busy getting a heavyweight bucket on the bucket we intend
to search or insert into, a bucket split might occur underneath us.
To compenate for that possibility, we use a loop-and-retry system:
release the metapage content lock, acquire the heavyweight lock on the
target bucket, and then reacquire the metapage content lock and check
that the bucket mapping has not changed.   Normally it hasn't, and
we're done.  But if by chance it has, we simply unlock the metapage,
release the heavyweight lock we acquired previously, lock the new
bucket, and loop around again.  Even in the worst case we cannot loop
very many times here, since we don't split the same bucket again until
we've split all the other buckets, and 2^N gets big pretty fast.

This results in greatly improved concurrency, because we're
effectively replacing two lwlock acquire-and-release cycles in
exclusive mode (on one of the lock manager locks) with a single
acquire-and-release cycle in shared mode (on the metapage buffer
content lock).  Testing shows that it's still not quite as good as
btree; for that, we'd probably have to find some way of getting rid
of the heavyweight bucket locks as well, which does not appear
straightforward.

Patch by me, review by Jeff Janes.
This commit is contained in:
Robert Haas
2012-06-26 06:56:10 -04:00
parent 038f3a0509
commit 76837c1507
4 changed files with 145 additions and 129 deletions

View File

@@ -32,6 +32,8 @@ _hash_doinsert(Relation rel, IndexTuple itup)
Buffer metabuf;
HashMetaPage metap;
BlockNumber blkno;
BlockNumber oldblkno = InvalidBlockNumber;
bool retry = false;
Page page;
HashPageOpaque pageopaque;
Size itemsz;
@@ -49,12 +51,6 @@ _hash_doinsert(Relation rel, IndexTuple itup)
itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
* need to be consistent */
/*
* Acquire shared split lock so we can compute the target bucket safely
* (see README).
*/
_hash_getlock(rel, 0, HASH_SHARE);
/* Read the metapage */
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
metap = HashPageGetMeta(BufferGetPage(metabuf));
@@ -75,24 +71,44 @@ _hash_doinsert(Relation rel, IndexTuple itup)
errhint("Values larger than a buffer page cannot be indexed.")));
/*
* Compute the target bucket number, and convert to block number.
* Loop until we get a lock on the correct target bucket.
*/
bucket = _hash_hashkey2bucket(hashkey,
metap->hashm_maxbucket,
metap->hashm_highmask,
metap->hashm_lowmask);
for (;;)
{
/*
* Compute the target bucket number, and convert to block number.
*/
bucket = _hash_hashkey2bucket(hashkey,
metap->hashm_maxbucket,
metap->hashm_highmask,
metap->hashm_lowmask);
blkno = BUCKET_TO_BLKNO(metap, bucket);
blkno = BUCKET_TO_BLKNO(metap, bucket);
/* release lock on metapage, but keep pin since we'll need it again */
_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
/* Release metapage lock, but keep pin. */
_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
/*
* Acquire share lock on target bucket; then we can release split lock.
*/
_hash_getlock(rel, blkno, HASH_SHARE);
/*
* If the previous iteration of this loop locked what is still the
* correct target bucket, we are done. Otherwise, drop any old lock
* and lock what now appears to be the correct bucket.
*/
if (retry)
{
if (oldblkno == blkno)
break;
_hash_droplock(rel, oldblkno, HASH_SHARE);
}
_hash_getlock(rel, blkno, HASH_SHARE);
_hash_droplock(rel, 0, HASH_SHARE);
/*
* Reacquire metapage lock and check that no bucket split has taken
* place while we were awaiting the bucket lock.
*/
_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ);
oldblkno = blkno;
retry = true;
}
/* Fetch the primary bucket page for the bucket */
buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE);