mirror of
https://github.com/postgres/postgres.git
synced 2025-07-28 23:42:10 +03:00
Remove block number field from nbtree stack.
The initial value of the nbtree stack downlink block number field recorded during an initial descent of the tree wasn't actually used. Both _bt_getstackbuf() callers overwrote the value with their own value. Remove the block number field from the stack struct, and add a child block number argument to _bt_getstackbuf() in its place. This makes the overall design of _bt_getstackbuf() clearer. Author: Peter Geoghegan Reviewed-By: Anastasia Lubennikova Discussion: https://postgr.es/m/CAH2-Wzmx+UbXt2YNOUCZ-a04VdXU=S=OHuAuD7Z8uQq-PXTYUg@mail.gmail.com
This commit is contained in:
@ -224,7 +224,13 @@ it, but it's still linked to its siblings.
|
||||
|
||||
(Note: Lanin and Shasha prefer to make the key space move left, but their
|
||||
argument for doing so hinges on not having left-links, which we have
|
||||
anyway. So we simplify the algorithm by moving key space right.)
|
||||
anyway. So we simplify the algorithm by moving the key space right. Note
|
||||
also that Lanin and Shasha optimistically avoid holding multiple locks as
|
||||
the tree is ascended. They're willing to release all locks and retry in
|
||||
"rare" cases where the correct location for a new downlink cannot be found
|
||||
immediately. We prefer to stick with Lehman and Yao's approach of
|
||||
pessimistically coupling buffer locks when ascending the tree, since it's
|
||||
far simpler.)
|
||||
|
||||
To preserve consistency on the parent level, we cannot merge the key space
|
||||
of a page into its right sibling unless the right sibling is a child of
|
||||
|
@ -1797,7 +1797,6 @@ _bt_insert_parent(Relation rel,
|
||||
stack = &fakestack;
|
||||
stack->bts_blkno = BufferGetBlockNumber(pbuf);
|
||||
stack->bts_offset = InvalidOffsetNumber;
|
||||
stack->bts_btentry = InvalidBlockNumber;
|
||||
stack->bts_parent = NULL;
|
||||
_bt_relbuf(rel, pbuf);
|
||||
}
|
||||
@ -1819,8 +1818,7 @@ _bt_insert_parent(Relation rel,
|
||||
* new downlink will be inserted at the correct offset. Even buf's
|
||||
* parent may have changed.
|
||||
*/
|
||||
stack->bts_btentry = bknum;
|
||||
pbuf = _bt_getstackbuf(rel, stack);
|
||||
pbuf = _bt_getstackbuf(rel, stack, bknum);
|
||||
|
||||
/*
|
||||
* Now we can unlock the right child. The left child will be unlocked
|
||||
@ -1834,7 +1832,7 @@ _bt_insert_parent(Relation rel,
|
||||
errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
|
||||
RelationGetRelationName(rel), bknum, rbknum)));
|
||||
|
||||
/* Recursively update the parent */
|
||||
/* Recursively insert into the parent */
|
||||
_bt_insertonpg(rel, NULL, pbuf, buf, stack->bts_parent,
|
||||
new_item, stack->bts_offset + 1,
|
||||
is_only);
|
||||
@ -1901,21 +1899,37 @@ _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
|
||||
}
|
||||
|
||||
/*
|
||||
* _bt_getstackbuf() -- Walk back up the tree one step, and find the item
|
||||
* we last looked at in the parent.
|
||||
* _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot
|
||||
* tuple whose downlink points to child page.
|
||||
*
|
||||
* This is possible because we save the downlink from the parent item,
|
||||
* which is enough to uniquely identify it. Insertions into the parent
|
||||
* level could cause the item to move right; deletions could cause it
|
||||
* to move left, but not left of the page we previously found it in.
|
||||
* Caller passes child's block number, which is used to identify
|
||||
* associated pivot tuple in parent page using a linear search that
|
||||
* matches on pivot's downlink/block number. The expected location of
|
||||
* the pivot tuple is taken from the stack one level above the child
|
||||
* page. This is used as a starting point. Insertions into the
|
||||
* parent level could cause the pivot tuple to move right; deletions
|
||||
* could cause it to move left, but not left of the page we previously
|
||||
* found it on.
|
||||
*
|
||||
* Adjusts bts_blkno & bts_offset if changed.
|
||||
* Caller can use its stack to relocate the pivot tuple/downlink for
|
||||
* any same-level page to the right of the page found by its initial
|
||||
* descent. This is necessary because of the possibility that caller
|
||||
* moved right to recover from a concurrent page split. It's also
|
||||
* convenient for certain callers to be able to step right when there
|
||||
* wasn't a concurrent page split, while still using their original
|
||||
* stack. For example, the checkingunique _bt_doinsert() case may
|
||||
* have to step right when there are many physical duplicates, and its
|
||||
* scantid forces an insertion to the right of the "first page the
|
||||
* value could be on".
|
||||
*
|
||||
* Returns write-locked buffer, or InvalidBuffer if item not found
|
||||
* (should not happen).
|
||||
* Returns write-locked parent page buffer, or InvalidBuffer if pivot
|
||||
* tuple not found (should not happen). Adjusts bts_blkno &
|
||||
* bts_offset if changed. Page split caller should insert its new
|
||||
* pivot tuple for its new right sibling page on parent page, at the
|
||||
* offset number bts_offset + 1.
|
||||
*/
|
||||
Buffer
|
||||
_bt_getstackbuf(Relation rel, BTStack stack)
|
||||
_bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child)
|
||||
{
|
||||
BlockNumber blkno;
|
||||
OffsetNumber start;
|
||||
@ -1977,7 +1991,7 @@ _bt_getstackbuf(Relation rel, BTStack stack)
|
||||
itemid = PageGetItemId(page, offnum);
|
||||
item = (IndexTuple) PageGetItem(page, itemid);
|
||||
|
||||
if (BTreeInnerTupleGetDownLink(item) == stack->bts_btentry)
|
||||
if (BTreeInnerTupleGetDownLink(item) == child)
|
||||
{
|
||||
/* Return accurate pointer to where link is now */
|
||||
stack->bts_blkno = blkno;
|
||||
@ -1993,7 +2007,7 @@ _bt_getstackbuf(Relation rel, BTStack stack)
|
||||
itemid = PageGetItemId(page, offnum);
|
||||
item = (IndexTuple) PageGetItem(page, itemid);
|
||||
|
||||
if (BTreeInnerTupleGetDownLink(item) == stack->bts_btentry)
|
||||
if (BTreeInnerTupleGetDownLink(item) == child)
|
||||
{
|
||||
/* Return accurate pointer to where link is now */
|
||||
stack->bts_blkno = blkno;
|
||||
|
@ -1189,8 +1189,7 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
|
||||
* non-unique high keys in leaf level pages. Even heapkeyspace indexes
|
||||
* can have a stale stack due to insertions into the parent.
|
||||
*/
|
||||
stack->bts_btentry = child;
|
||||
pbuf = _bt_getstackbuf(rel, stack);
|
||||
pbuf = _bt_getstackbuf(rel, stack, child);
|
||||
if (pbuf == InvalidBuffer)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INDEX_CORRUPTED),
|
||||
|
@ -146,23 +146,16 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
|
||||
par_blkno = BufferGetBlockNumber(*bufP);
|
||||
|
||||
/*
|
||||
* We need to save the location of the index entry we chose in the
|
||||
* parent page on a stack. In case we split the tree, we'll use the
|
||||
* stack to work back up to the parent page. We also save the actual
|
||||
* downlink (block) to uniquely identify the index entry, in case it
|
||||
* moves right while we're working lower in the tree. See the paper
|
||||
* by Lehman and Yao for how this is detected and handled. (We use the
|
||||
* child link during the second half of a page split -- if caller ends
|
||||
* up splitting the child it usually ends up inserting a new pivot
|
||||
* tuple for child's new right sibling immediately after the original
|
||||
* bts_offset offset recorded here. The downlink block will be needed
|
||||
* to check if bts_offset remains the position of this same pivot
|
||||
* tuple.)
|
||||
* We need to save the location of the pivot tuple we chose in the
|
||||
* parent page on a stack. If we need to split a page, we'll use
|
||||
* the stack to work back up to its parent page. If caller ends up
|
||||
* splitting a page one level down, it usually ends up inserting a
|
||||
* new pivot tuple/downlink immediately after the location recorded
|
||||
* here.
|
||||
*/
|
||||
new_stack = (BTStack) palloc(sizeof(BTStackData));
|
||||
new_stack->bts_blkno = par_blkno;
|
||||
new_stack->bts_offset = offnum;
|
||||
new_stack->bts_btentry = blkno;
|
||||
new_stack->bts_parent = stack_in;
|
||||
|
||||
/*
|
||||
|
@ -403,20 +403,16 @@ typedef struct BTMetaPageData
|
||||
#define BT_WRITE BUFFER_LOCK_EXCLUSIVE
|
||||
|
||||
/*
|
||||
* BTStackData -- As we descend a tree, we push the (location, downlink)
|
||||
* pairs from internal pages onto a private stack. If we split a
|
||||
* leaf, we use this stack to walk back up the tree and insert data
|
||||
* into parent pages (and possibly to split them, too). Lehman and
|
||||
* Yao's update algorithm guarantees that under no circumstances can
|
||||
* our private stack give us an irredeemably bad picture up the tree.
|
||||
* Again, see the paper for details.
|
||||
* BTStackData -- As we descend a tree, we push the location of pivot
|
||||
* tuples whose downlink we are about to follow onto a private stack. If
|
||||
* we split a leaf, we use this stack to walk back up the tree and insert
|
||||
* data into its parent page at the correct location. We may also have to
|
||||
* recursively split a grandparent of the leaf page (and so on).
|
||||
*/
|
||||
|
||||
typedef struct BTStackData
|
||||
{
|
||||
BlockNumber bts_blkno;
|
||||
OffsetNumber bts_offset;
|
||||
BlockNumber bts_btentry;
|
||||
struct BTStackData *bts_parent;
|
||||
} BTStackData;
|
||||
|
||||
@ -731,7 +727,7 @@ extern void _bt_parallel_advance_array_keys(IndexScanDesc scan);
|
||||
*/
|
||||
extern bool _bt_doinsert(Relation rel, IndexTuple itup,
|
||||
IndexUniqueCheck checkUnique, Relation heapRel);
|
||||
extern Buffer _bt_getstackbuf(Relation rel, BTStack stack);
|
||||
extern Buffer _bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child);
|
||||
extern void _bt_finish_split(Relation rel, Buffer bbuf, BTStack stack);
|
||||
|
||||
/*
|
||||
|
Reference in New Issue
Block a user