mirror of
https://github.com/postgres/postgres.git
synced 2025-11-10 17:42:29 +03:00
Major overhaul of btree index code. Eliminate special BTP_CHAIN logic for
duplicate keys by letting search go to the left rather than right when an equal key is seen at an upper tree level. Fix poor choice of page split point (leading to insertion failures) that was forced by chaining logic. Don't store leftmost key in non-leaf pages, since it's not necessary. Don't create root page until something is first stored in the index, so an unused index is now 8K not 16K. (Doesn't seem to be as easy to get rid of the metadata page, unfortunately.) Massive cleanup of unreadable code, fix poor, obsolete, and just plain wrong documentation and comments. See src/backend/access/nbtree/README for the gory details.
This commit is contained in:
@@ -9,7 +9,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.36 2000/04/12 17:14:49 momjian Exp $
|
||||
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.37 2000/07/21 06:42:32 tgl Exp $
|
||||
*
|
||||
* NOTES
|
||||
* Postgres btree pages look like ordinary relation pages. The opaque
|
||||
@@ -90,7 +90,7 @@ _bt_metapinit(Relation rel)
|
||||
metad.btm_version = BTREE_VERSION;
|
||||
metad.btm_root = P_NONE;
|
||||
metad.btm_level = 0;
|
||||
memmove((char *) BTPageGetMeta(pg), (char *) &metad, sizeof(metad));
|
||||
memcpy((char *) BTPageGetMeta(pg), (char *) &metad, sizeof(metad));
|
||||
|
||||
op = (BTPageOpaque) PageGetSpecialPointer(pg);
|
||||
op->btpo_flags = BTP_META;
|
||||
@@ -102,52 +102,6 @@ _bt_metapinit(Relation rel)
|
||||
UnlockRelation(rel, AccessExclusiveLock);
|
||||
}
|
||||
|
||||
#ifdef NOT_USED
|
||||
/*
|
||||
* _bt_checkmeta() -- Verify that the metadata stored in a btree are
|
||||
* reasonable.
|
||||
*/
|
||||
void
|
||||
_bt_checkmeta(Relation rel)
|
||||
{
|
||||
Buffer metabuf;
|
||||
Page metap;
|
||||
BTMetaPageData *metad;
|
||||
BTPageOpaque op;
|
||||
int nblocks;
|
||||
|
||||
/* if the relation is empty, this is init time; don't complain */
|
||||
if ((nblocks = RelationGetNumberOfBlocks(rel)) == 0)
|
||||
return;
|
||||
|
||||
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
|
||||
metap = BufferGetPage(metabuf);
|
||||
op = (BTPageOpaque) PageGetSpecialPointer(metap);
|
||||
if (!(op->btpo_flags & BTP_META))
|
||||
{
|
||||
elog(ERROR, "Invalid metapage for index %s",
|
||||
RelationGetRelationName(rel));
|
||||
}
|
||||
metad = BTPageGetMeta(metap);
|
||||
|
||||
if (metad->btm_magic != BTREE_MAGIC)
|
||||
{
|
||||
elog(ERROR, "Index %s is not a btree",
|
||||
RelationGetRelationName(rel));
|
||||
}
|
||||
|
||||
if (metad->btm_version != BTREE_VERSION)
|
||||
{
|
||||
elog(ERROR, "Version mismatch on %s: version %d file, version %d code",
|
||||
RelationGetRelationName(rel),
|
||||
metad->btm_version, BTREE_VERSION);
|
||||
}
|
||||
|
||||
_bt_relbuf(rel, metabuf, BT_READ);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
* _bt_getroot() -- Get the root page of the btree.
|
||||
*
|
||||
@@ -157,11 +111,15 @@ _bt_checkmeta(Relation rel)
|
||||
* standard class of race conditions exists here; I think I covered
|
||||
* them all in the Hopi Indian rain dance of lock requests below.
|
||||
*
|
||||
* We pass in the access type (BT_READ or BT_WRITE), and return the
|
||||
* root page's buffer with the appropriate lock type set. Reference
|
||||
* count on the root page gets bumped by ReadBuffer. The metadata
|
||||
* page is unlocked and unreferenced by this process when this routine
|
||||
* returns.
|
||||
* The access type parameter (BT_READ or BT_WRITE) controls whether
|
||||
* a new root page will be created or not. If access = BT_READ,
|
||||
* and no root page exists, we just return InvalidBuffer. For
|
||||
* BT_WRITE, we try to create the root page if it doesn't exist.
|
||||
* NOTE that the returned root page will have only a read lock set
|
||||
* on it even if access = BT_WRITE!
|
||||
*
|
||||
* On successful return, the root page is pinned and read-locked.
|
||||
* The metadata page is not locked or pinned on exit.
|
||||
*/
|
||||
Buffer
|
||||
_bt_getroot(Relation rel, int access)
|
||||
@@ -178,78 +136,71 @@ _bt_getroot(Relation rel, int access)
|
||||
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
|
||||
metapg = BufferGetPage(metabuf);
|
||||
metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
|
||||
Assert(metaopaque->btpo_flags & BTP_META);
|
||||
metad = BTPageGetMeta(metapg);
|
||||
|
||||
if (metad->btm_magic != BTREE_MAGIC)
|
||||
{
|
||||
if (!(metaopaque->btpo_flags & BTP_META) ||
|
||||
metad->btm_magic != BTREE_MAGIC)
|
||||
elog(ERROR, "Index %s is not a btree",
|
||||
RelationGetRelationName(rel));
|
||||
}
|
||||
|
||||
if (metad->btm_version != BTREE_VERSION)
|
||||
{
|
||||
elog(ERROR, "Version mismatch on %s: version %d file, version %d code",
|
||||
elog(ERROR, "Version mismatch on %s: version %d file, version %d code",
|
||||
RelationGetRelationName(rel),
|
||||
metad->btm_version, BTREE_VERSION);
|
||||
}
|
||||
|
||||
/* if no root page initialized yet, do it */
|
||||
if (metad->btm_root == P_NONE)
|
||||
{
|
||||
/* If access = BT_READ, caller doesn't want us to create root yet */
|
||||
if (access == BT_READ)
|
||||
{
|
||||
_bt_relbuf(rel, metabuf, BT_READ);
|
||||
return InvalidBuffer;
|
||||
}
|
||||
|
||||
/* turn our read lock in for a write lock */
|
||||
_bt_relbuf(rel, metabuf, BT_READ);
|
||||
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
|
||||
metapg = BufferGetPage(metabuf);
|
||||
metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
|
||||
Assert(metaopaque->btpo_flags & BTP_META);
|
||||
metad = BTPageGetMeta(metapg);
|
||||
/* trade in our read lock for a write lock */
|
||||
LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
|
||||
LockBuffer(metabuf, BT_WRITE);
|
||||
|
||||
/*
|
||||
* Race condition: if someone else initialized the metadata
|
||||
* between the time we released the read lock and acquired the
|
||||
* write lock, above, we want to avoid doing it again.
|
||||
* write lock, above, we must avoid doing it again.
|
||||
*/
|
||||
|
||||
if (metad->btm_root == P_NONE)
|
||||
{
|
||||
|
||||
/*
|
||||
* Get, initialize, write, and leave a lock of the appropriate
|
||||
* type on the new root page. Since this is the first page in
|
||||
* the tree, it's a leaf.
|
||||
* the tree, it's a leaf as well as the root.
|
||||
*/
|
||||
|
||||
rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
|
||||
rootblkno = BufferGetBlockNumber(rootbuf);
|
||||
rootpg = BufferGetPage(rootbuf);
|
||||
|
||||
metad->btm_root = rootblkno;
|
||||
metad->btm_level = 1;
|
||||
|
||||
_bt_pageinit(rootpg, BufferGetPageSize(rootbuf));
|
||||
rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpg);
|
||||
rootopaque->btpo_flags |= (BTP_LEAF | BTP_ROOT);
|
||||
_bt_wrtnorelbuf(rel, rootbuf);
|
||||
|
||||
/* swap write lock for read lock, if appropriate */
|
||||
if (access != BT_WRITE)
|
||||
{
|
||||
LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
|
||||
LockBuffer(rootbuf, BT_READ);
|
||||
}
|
||||
/* swap write lock for read lock */
|
||||
LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
|
||||
LockBuffer(rootbuf, BT_READ);
|
||||
|
||||
/* okay, metadata is correct */
|
||||
/* okay, metadata is correct, write and release it */
|
||||
_bt_wrtbuf(rel, metabuf);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
/*
|
||||
* Metadata initialized by someone else. In order to
|
||||
* guarantee no deadlocks, we have to release the metadata
|
||||
* page and start all over again.
|
||||
*/
|
||||
|
||||
_bt_relbuf(rel, metabuf, BT_WRITE);
|
||||
return _bt_getroot(rel, access);
|
||||
}
|
||||
@@ -259,22 +210,21 @@ _bt_getroot(Relation rel, int access)
|
||||
rootblkno = metad->btm_root;
|
||||
_bt_relbuf(rel, metabuf, BT_READ); /* done with the meta page */
|
||||
|
||||
rootbuf = _bt_getbuf(rel, rootblkno, access);
|
||||
rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
|
||||
}
|
||||
|
||||
/*
|
||||
* Race condition: If the root page split between the time we looked
|
||||
* at the metadata page and got the root buffer, then we got the wrong
|
||||
* buffer.
|
||||
* buffer. Release it and try again.
|
||||
*/
|
||||
|
||||
rootpg = BufferGetPage(rootbuf);
|
||||
rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpg);
|
||||
if (!(rootopaque->btpo_flags & BTP_ROOT))
|
||||
{
|
||||
|
||||
if (! P_ISROOT(rootopaque))
|
||||
{
|
||||
/* it happened, try again */
|
||||
_bt_relbuf(rel, rootbuf, access);
|
||||
_bt_relbuf(rel, rootbuf, BT_READ);
|
||||
return _bt_getroot(rel, access);
|
||||
}
|
||||
|
||||
@@ -283,7 +233,6 @@ _bt_getroot(Relation rel, int access)
|
||||
* count is correct, and we have no lock set on the metadata page.
|
||||
* Return the root block.
|
||||
*/
|
||||
|
||||
return rootbuf;
|
||||
}
|
||||
|
||||
@@ -291,33 +240,38 @@ _bt_getroot(Relation rel, int access)
|
||||
* _bt_getbuf() -- Get a buffer by block number for read or write.
|
||||
*
|
||||
* When this routine returns, the appropriate lock is set on the
|
||||
* requested buffer its reference count is correct.
|
||||
* requested buffer and its reference count has been incremented
|
||||
* (ie, the buffer is "locked and pinned").
|
||||
*/
|
||||
Buffer
|
||||
_bt_getbuf(Relation rel, BlockNumber blkno, int access)
|
||||
{
|
||||
Buffer buf;
|
||||
Page page;
|
||||
|
||||
if (blkno != P_NEW)
|
||||
{
|
||||
/* Read an existing block of the relation */
|
||||
buf = ReadBuffer(rel, blkno);
|
||||
LockBuffer(buf, access);
|
||||
}
|
||||
else
|
||||
{
|
||||
Page page;
|
||||
|
||||
/*
|
||||
* Extend bufmgr code is unclean and so we have to use locking
|
||||
* Extend the relation by one page.
|
||||
*
|
||||
* Extend bufmgr code is unclean and so we have to use extra locking
|
||||
* here.
|
||||
*/
|
||||
LockPage(rel, 0, ExclusiveLock);
|
||||
buf = ReadBuffer(rel, blkno);
|
||||
LockBuffer(buf, access);
|
||||
UnlockPage(rel, 0, ExclusiveLock);
|
||||
blkno = BufferGetBlockNumber(buf);
|
||||
|
||||
/* Initialize the new page before returning it */
|
||||
page = BufferGetPage(buf);
|
||||
_bt_pageinit(page, BufferGetPageSize(buf));
|
||||
LockBuffer(buf, access);
|
||||
}
|
||||
|
||||
/* ref count and lock type are correct */
|
||||
@@ -326,6 +280,8 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
|
||||
|
||||
/*
|
||||
* _bt_relbuf() -- release a locked buffer.
|
||||
*
|
||||
* Lock and pin (refcount) are both dropped.
|
||||
*/
|
||||
void
|
||||
_bt_relbuf(Relation rel, Buffer buf, int access)
|
||||
@@ -337,9 +293,15 @@ _bt_relbuf(Relation rel, Buffer buf, int access)
|
||||
/*
|
||||
* _bt_wrtbuf() -- write a btree page to disk.
|
||||
*
|
||||
* This routine releases the lock held on the buffer and our reference
|
||||
* to it. It is an error to call _bt_wrtbuf() without a write lock
|
||||
* or a reference to the buffer.
|
||||
* This routine releases the lock held on the buffer and our refcount
|
||||
* for it. It is an error to call _bt_wrtbuf() without a write lock
|
||||
* and a pin on the buffer.
|
||||
*
|
||||
* NOTE: actually, the buffer manager just marks the shared buffer page
|
||||
* dirty here, the real I/O happens later. Since we can't persuade the
|
||||
* Unix kernel to schedule disk writes in a particular order, there's not
|
||||
* much point in worrying about this. The most we can say is that all the
|
||||
* writes will occur before commit.
|
||||
*/
|
||||
void
|
||||
_bt_wrtbuf(Relation rel, Buffer buf)
|
||||
@@ -353,7 +315,9 @@ _bt_wrtbuf(Relation rel, Buffer buf)
|
||||
* our reference or lock.
|
||||
*
|
||||
* It is an error to call _bt_wrtnorelbuf() without a write lock
|
||||
* or a reference to the buffer.
|
||||
* and a pin on the buffer.
|
||||
*
|
||||
* See above NOTE.
|
||||
*/
|
||||
void
|
||||
_bt_wrtnorelbuf(Relation rel, Buffer buf)
|
||||
@@ -389,10 +353,10 @@ _bt_pageinit(Page page, Size size)
|
||||
* we split the root page, we record the new parent in the metadata page
|
||||
* for the relation. This routine does the work.
|
||||
*
|
||||
* No direct preconditions, but if you don't have the a write lock on
|
||||
* No direct preconditions, but if you don't have the write lock on
|
||||
* at least the old root page when you call this, you're making a big
|
||||
* mistake. On exit, metapage data is correct and we no longer have
|
||||
* a reference to or lock on the metapage.
|
||||
* a pin or lock on the metapage.
|
||||
*/
|
||||
void
|
||||
_bt_metaproot(Relation rel, BlockNumber rootbknum, int level)
|
||||
@@ -416,127 +380,8 @@ _bt_metaproot(Relation rel, BlockNumber rootbknum, int level)
|
||||
}
|
||||
|
||||
/*
|
||||
* _bt_getstackbuf() -- Walk back up the tree one step, and find the item
|
||||
* we last looked at in the parent.
|
||||
*
|
||||
* This is possible because we save a bit image of the last item
|
||||
* we looked at in the parent, and the update algorithm guarantees
|
||||
* that if items above us in the tree move, they only move right.
|
||||
*
|
||||
* Also, re-set bts_blkno & bts_offset if changed and
|
||||
* bts_btitem (it may be changed - see _bt_insertonpg).
|
||||
* Delete an item from a btree. It had better be a leaf item...
|
||||
*/
|
||||
Buffer
|
||||
_bt_getstackbuf(Relation rel, BTStack stack, int access)
|
||||
{
|
||||
Buffer buf;
|
||||
BlockNumber blkno;
|
||||
OffsetNumber start,
|
||||
offnum,
|
||||
maxoff;
|
||||
OffsetNumber i;
|
||||
Page page;
|
||||
ItemId itemid;
|
||||
BTItem item;
|
||||
BTPageOpaque opaque;
|
||||
BTItem item_save;
|
||||
int item_nbytes;
|
||||
|
||||
blkno = stack->bts_blkno;
|
||||
buf = _bt_getbuf(rel, blkno, access);
|
||||
page = BufferGetPage(buf);
|
||||
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
||||
maxoff = PageGetMaxOffsetNumber(page);
|
||||
|
||||
if (stack->bts_offset == InvalidOffsetNumber ||
|
||||
maxoff >= stack->bts_offset)
|
||||
{
|
||||
|
||||
/*
|
||||
* _bt_insertonpg set bts_offset to InvalidOffsetNumber in the
|
||||
* case of concurrent ROOT page split
|
||||
*/
|
||||
if (stack->bts_offset == InvalidOffsetNumber)
|
||||
i = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
|
||||
else
|
||||
{
|
||||
itemid = PageGetItemId(page, stack->bts_offset);
|
||||
item = (BTItem) PageGetItem(page, itemid);
|
||||
|
||||
/* if the item is where we left it, we're done */
|
||||
if (BTItemSame(item, stack->bts_btitem))
|
||||
{
|
||||
pfree(stack->bts_btitem);
|
||||
item_nbytes = ItemIdGetLength(itemid);
|
||||
item_save = (BTItem) palloc(item_nbytes);
|
||||
memmove((char *) item_save, (char *) item, item_nbytes);
|
||||
stack->bts_btitem = item_save;
|
||||
return buf;
|
||||
}
|
||||
i = OffsetNumberNext(stack->bts_offset);
|
||||
}
|
||||
|
||||
/* if the item has just moved right on this page, we're done */
|
||||
for (;
|
||||
i <= maxoff;
|
||||
i = OffsetNumberNext(i))
|
||||
{
|
||||
itemid = PageGetItemId(page, i);
|
||||
item = (BTItem) PageGetItem(page, itemid);
|
||||
|
||||
/* if the item is where we left it, we're done */
|
||||
if (BTItemSame(item, stack->bts_btitem))
|
||||
{
|
||||
stack->bts_offset = i;
|
||||
pfree(stack->bts_btitem);
|
||||
item_nbytes = ItemIdGetLength(itemid);
|
||||
item_save = (BTItem) palloc(item_nbytes);
|
||||
memmove((char *) item_save, (char *) item, item_nbytes);
|
||||
stack->bts_btitem = item_save;
|
||||
return buf;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* by here, the item we're looking for moved right at least one page */
|
||||
for (;;)
|
||||
{
|
||||
blkno = opaque->btpo_next;
|
||||
if (P_RIGHTMOST(opaque))
|
||||
elog(FATAL, "my bits moved right off the end of the world!\
|
||||
\n\tRecreate index %s.", RelationGetRelationName(rel));
|
||||
|
||||
_bt_relbuf(rel, buf, access);
|
||||
buf = _bt_getbuf(rel, blkno, access);
|
||||
page = BufferGetPage(buf);
|
||||
maxoff = PageGetMaxOffsetNumber(page);
|
||||
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
||||
|
||||
/* if we have a right sibling, step over the high key */
|
||||
start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
|
||||
|
||||
/* see if it's on this page */
|
||||
for (offnum = start;
|
||||
offnum <= maxoff;
|
||||
offnum = OffsetNumberNext(offnum))
|
||||
{
|
||||
itemid = PageGetItemId(page, offnum);
|
||||
item = (BTItem) PageGetItem(page, itemid);
|
||||
if (BTItemSame(item, stack->bts_btitem))
|
||||
{
|
||||
stack->bts_offset = offnum;
|
||||
stack->bts_blkno = blkno;
|
||||
pfree(stack->bts_btitem);
|
||||
item_nbytes = ItemIdGetLength(itemid);
|
||||
item_save = (BTItem) palloc(item_nbytes);
|
||||
memmove((char *) item_save, (char *) item, item_nbytes);
|
||||
stack->bts_btitem = item_save;
|
||||
return buf;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
_bt_pagedel(Relation rel, ItemPointer tid)
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user