1
0
mirror of https://github.com/postgres/postgres.git synced 2025-11-16 15:02:33 +03:00

Fix "failed to re-find parent key" btree VACUUM failure by revising page

deletion code to avoid the case where an upper-level btree page remains "half
dead" for a significant period of time, and to block insertions into a key
range that is in process of being re-assigned to the right sibling of the
deleted page's parent.  This prevents the scenario reported by Ed L. wherein
index keys could become out-of-order in the grandparent index level.

Since this is a moderately invasive fix, I'm applying it only to HEAD.
The bug exists back to 7.4, but the back branches will get a different patch.
This commit is contained in:
Tom Lane
2006-11-01 19:43:17 +00:00
parent 19d0c46def
commit 70ce5c9082
6 changed files with 359 additions and 135 deletions

View File

@@ -9,7 +9,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.100 2006/10/04 00:29:49 momjian Exp $
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.101 2006/11/01 19:43:17 tgl Exp $
*
* NOTES
* Postgres btree pages look like ordinary relation pages. The opaque
@@ -723,7 +723,93 @@ _bt_delitems(Relation rel, Buffer buf,
}
/*
* _bt_pagedel() -- Delete a page from the b-tree.
* Subroutine to pre-check whether a page deletion is safe, that is, its
* parent page would be left in a valid or deletable state.
*
* "target" is the page we wish to delete, and "stack" is a search stack
* leading to it (approximately). Note that we will update the stack
* entry(s) to reflect current downlink positions --- this is harmless and
* indeed saves later search effort in _bt_pagedel.
*
* Note: it's OK to release page locks after checking, because a safe
* deletion can't become unsafe due to concurrent activity. A non-rightmost
* page cannot become rightmost unless there's a concurrent page deletion,
* but only VACUUM does page deletion and we only allow one VACUUM on an index
* at a time. An only child could acquire a sibling (of the same parent) only
* by being split ... but that would make it a non-rightmost child so the
* deletion is still safe.
*/
static bool
_bt_parent_deletion_safe(Relation rel, BlockNumber target, BTStack stack)
{
BlockNumber parent;
OffsetNumber poffset,
maxoff;
Buffer pbuf;
Page page;
BTPageOpaque opaque;
/*
* In recovery mode, assume the deletion being replayed is valid. We
* can't always check it because we won't have a full search stack,
* and we should complain if there's a problem, anyway.
*/
if (InRecovery)
return true;
/* Locate the parent's downlink (updating the stack entry if needed) */
ItemPointerSet(&(stack->bts_btentry.t_tid), target, P_HIKEY);
pbuf = _bt_getstackbuf(rel, stack, BT_READ);
if (pbuf == InvalidBuffer)
elog(ERROR, "failed to re-find parent key in \"%s\" for deletion target page %u",
RelationGetRelationName(rel), target);
parent = stack->bts_blkno;
poffset = stack->bts_offset;
page = BufferGetPage(pbuf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
maxoff = PageGetMaxOffsetNumber(page);
/*
* If the target is the rightmost child of its parent, then we can't
* delete, unless it's also the only child.
*/
if (poffset >= maxoff)
{
/* It's rightmost child... */
if (poffset == P_FIRSTDATAKEY(opaque))
{
/*
* It's only child, so safe if parent would itself be removable.
* We have to check the parent itself, and then recurse to
* test the conditions at the parent's parent.
*/
if (P_RIGHTMOST(opaque) || P_ISROOT(opaque))
{
_bt_relbuf(rel, pbuf);
return false;
}
_bt_relbuf(rel, pbuf);
return _bt_parent_deletion_safe(rel, parent, stack->bts_parent);
}
else
{
/* Unsafe to delete */
_bt_relbuf(rel, pbuf);
return false;
}
}
else
{
/* Not rightmost child, so safe to delete */
_bt_relbuf(rel, pbuf);
return true;
}
}
/*
* _bt_pagedel() -- Delete a page from the b-tree, if legal to do so.
*
* This action unlinks the page from the b-tree structure, removing all
* pointers leading to it --- but not touching its own left and right links.
@@ -731,19 +817,25 @@ _bt_delitems(Relation rel, Buffer buf,
* may currently be trying to follow links leading to the page; they have to
* be allowed to use its right-link to recover. See nbtree/README.
*
* On entry, the target buffer must be pinned and read-locked. This lock and
* pin will be dropped before exiting.
* On entry, the target buffer must be pinned and locked (either read or write
* lock is OK). This lock and pin will be dropped before exiting.
*
* Returns the number of pages successfully deleted (zero on failure; could
* be more than one if parent blocks were deleted).
* The "stack" argument can be a search stack leading (approximately) to the
* target page, or NULL --- outside callers typically pass NULL since they
* have not done such a search, but internal recursion cases pass the stack
* to avoid duplicated search effort.
*
* Returns the number of pages successfully deleted (zero if page cannot
* be deleted now; could be more than one if parent pages were deleted too).
*
* NOTE: this leaks memory. Rather than trying to clean up everything
* carefully, it's better to run it in a temp context that can be reset
* frequently.
*/
int
_bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
_bt_pagedel(Relation rel, Buffer buf, BTStack stack, bool vacuum_full)
{
int result;
BlockNumber target,
leftsib,
rightsib,
@@ -756,7 +848,6 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
IndexTuple targetkey,
itup;
ScanKey itup_scankey;
BTStack stack;
Buffer lbuf,
rbuf,
pbuf;
@@ -778,6 +869,9 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
{
/* Should never fail to delete a half-dead page */
Assert(!P_ISHALFDEAD(opaque));
_bt_relbuf(rel, buf);
return 0;
}
@@ -793,36 +887,79 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
/*
* We need to get an approximate pointer to the page's parent page. Use
* the standard search mechanism to search for the page's high key; this
* will give us a link to either the current parent or someplace to its
* left (if there are multiple equal high keys). To avoid deadlocks, we'd
* better drop the target page lock first.
* To avoid deadlocks, we'd better drop the target page lock before
* going further.
*/
_bt_relbuf(rel, buf);
/* we need an insertion scan key to do our search, so build one */
itup_scankey = _bt_mkscankey(rel, targetkey);
/* find the leftmost leaf page containing this key */
stack = _bt_search(rel, rel->rd_rel->relnatts, itup_scankey, false,
&lbuf, BT_READ);
/* don't need a pin on that either */
_bt_relbuf(rel, lbuf);
/*
* If we are trying to delete an interior page, _bt_search did more than
* we needed. Locate the stack item pointing to our parent level.
* We need an approximate pointer to the page's parent page. We use
* the standard search mechanism to search for the page's high key; this
* will give us a link to either the current parent or someplace to its
* left (if there are multiple equal high keys). In recursion cases,
* the caller already generated a search stack and we can just re-use
* that work.
*/
ilevel = 0;
for (;;)
if (stack == NULL)
{
if (stack == NULL)
elog(ERROR, "not enough stack items");
if (ilevel == targetlevel)
break;
stack = stack->bts_parent;
ilevel++;
if (!InRecovery)
{
/* we need an insertion scan key to do our search, so build one */
itup_scankey = _bt_mkscankey(rel, targetkey);
/* find the leftmost leaf page containing this key */
stack = _bt_search(rel, rel->rd_rel->relnatts, itup_scankey, false,
&lbuf, BT_READ);
/* don't need a pin on that either */
_bt_relbuf(rel, lbuf);
/*
* If we are trying to delete an interior page, _bt_search did
* more than we needed. Locate the stack item pointing to our
* parent level.
*/
ilevel = 0;
for (;;)
{
if (stack == NULL)
elog(ERROR, "not enough stack items");
if (ilevel == targetlevel)
break;
stack = stack->bts_parent;
ilevel++;
}
}
else
{
/*
* During WAL recovery, we can't use _bt_search (for one reason,
* it might invoke user-defined comparison functions that expect
* facilities not available in recovery mode). Instead, just
* set up a dummy stack pointing to the left end of the parent
* tree level, from which _bt_getstackbuf will walk right to the
* parent page. Painful, but we don't care too much about
* performance in this scenario.
*/
pbuf = _bt_get_endpoint(rel, targetlevel + 1, false);
stack = (BTStack) palloc(sizeof(BTStackData));
stack->bts_blkno = BufferGetBlockNumber(pbuf);
stack->bts_offset = InvalidOffsetNumber;
/* bts_btentry will be initialized below */
stack->bts_parent = NULL;
_bt_relbuf(rel, pbuf);
}
}
/*
* We cannot delete a page that is the rightmost child of its immediate
* parent, unless it is the only child --- in which case the parent has
* to be deleted too, and the same condition applies recursively to it.
* We have to check this condition all the way up before trying to delete.
* We don't need to re-test when deleting a non-leaf page, though.
*/
if (targetlevel == 0 &&
!_bt_parent_deletion_safe(rel, target, stack))
return 0;
/*
* We have to lock the pages we need to modify in the standard order:
* moving right, then up. Else we will deadlock against other writers.
@@ -898,15 +1035,16 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
ItemPointerSet(&(stack->bts_btentry.t_tid), target, P_HIKEY);
pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
if (pbuf == InvalidBuffer)
elog(ERROR, "failed to re-find parent key in \"%s\"",
RelationGetRelationName(rel));
elog(ERROR, "failed to re-find parent key in \"%s\" for deletion target page %u",
RelationGetRelationName(rel), target);
parent = stack->bts_blkno;
poffset = stack->bts_offset;
/*
* If the target is the rightmost child of its parent, then we can't
* delete, unless it's also the only child --- in which case the parent
* changes to half-dead status.
* changes to half-dead status. The "can't delete" case should have been
* detected by _bt_parent_deletion_safe, so complain if we see it now.
*/
page = BufferGetPage(pbuf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
@@ -918,14 +1056,8 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
if (poffset == P_FIRSTDATAKEY(opaque))
parent_half_dead = true;
else
{
_bt_relbuf(rel, pbuf);
_bt_relbuf(rel, rbuf);
_bt_relbuf(rel, buf);
if (BufferIsValid(lbuf))
_bt_relbuf(rel, lbuf);
return 0;
}
elog(ERROR, "failed to delete rightmost child %u of %u in \"%s\"",
target, parent, RelationGetRelationName(rel));
}
else
{
@@ -940,10 +1072,13 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
* might be possible to push the fast root even further down, but the odds
* of doing so are slim, and the locking considerations daunting.)
*
* We don't support handling this in the case where the parent is
* becoming half-dead, even though it theoretically could occur.
*
* We can safely acquire a lock on the metapage here --- see comments for
* _bt_newroot().
*/
if (leftsib == P_NONE)
if (leftsib == P_NONE && !parent_half_dead)
{
page = BufferGetPage(rbuf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
@@ -1031,6 +1166,7 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
*/
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
opaque->btpo_flags &= ~BTP_HALF_DEAD;
opaque->btpo_flags |= BTP_DELETED;
opaque->btpo.xact =
vacuum_full ? FrozenTransactionId : ReadNewTransactionId();
@@ -1085,6 +1221,8 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
nextrdata++;
xlinfo = XLOG_BTREE_DELETE_PAGE_META;
}
else if (parent_half_dead)
xlinfo = XLOG_BTREE_DELETE_PAGE_HALF;
else
xlinfo = XLOG_BTREE_DELETE_PAGE;
@@ -1138,34 +1276,52 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
END_CRIT_SECTION();
/* release buffers; send out relcache inval if metapage changed */
/* release metapage; send out relcache inval if metapage changed */
if (BufferIsValid(metabuf))
{
CacheInvalidateRelcache(rel);
_bt_relbuf(rel, metabuf);
}
_bt_relbuf(rel, pbuf);
_bt_relbuf(rel, rbuf);
_bt_relbuf(rel, buf);
/* can always release leftsib immediately */
if (BufferIsValid(lbuf))
_bt_relbuf(rel, lbuf);
/*
* If parent became half dead, recurse to try to delete it. Otherwise, if
* If parent became half dead, recurse to delete it. Otherwise, if
* right sibling is empty and is now the last child of the parent, recurse
* to try to delete it. (These cases cannot apply at the same time,
* though the second case might itself recurse to the first.)
*
* When recursing to parent, we hold the lock on the target page until
* done. This delays any insertions into the keyspace that was just
* effectively reassigned to the parent's right sibling. If we allowed
* that, and there were enough such insertions before we finish deleting
* the parent, page splits within that keyspace could lead to inserting
* out-of-order keys into the grandparent level. It is thought that that
* wouldn't have any serious consequences, but it still seems like a
* pretty bad idea.
*/
if (parent_half_dead)
{
buf = _bt_getbuf(rel, parent, BT_READ);
return _bt_pagedel(rel, buf, vacuum_full) + 1;
/* recursive call will release pbuf */
_bt_relbuf(rel, rbuf);
result = _bt_pagedel(rel, pbuf, stack->bts_parent, vacuum_full) + 1;
_bt_relbuf(rel, buf);
}
if (parent_one_child && rightsib_empty)
else if (parent_one_child && rightsib_empty)
{
buf = _bt_getbuf(rel, rightsib, BT_READ);
return _bt_pagedel(rel, buf, vacuum_full) + 1;
_bt_relbuf(rel, pbuf);
_bt_relbuf(rel, buf);
/* recursive call will release rbuf */
result = _bt_pagedel(rel, rbuf, stack, vacuum_full) + 1;
}
else
{
_bt_relbuf(rel, pbuf);
_bt_relbuf(rel, buf);
_bt_relbuf(rel, rbuf);
result = 1;
}
return 1;
return result;
}