1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-27 12:41:57 +03:00

Fix "failed to re-find parent key" btree VACUUM failure by revising page

deletion code to avoid the case where an upper-level btree page remains "half
dead" for a significant period of time, and to block insertions into a key
range that is in process of being re-assigned to the right sibling of the
deleted page's parent.  This prevents the scenario reported by Ed L. wherein
index keys could become out-of-order in the grandparent index level.

Since this is a moderately invasive fix, I'm applying it only to HEAD.
The bug exists back to 7.4, but the back branches will get a different patch.
This commit is contained in:
Tom Lane
2006-11-01 19:43:17 +00:00
parent 19d0c46def
commit 70ce5c9082
6 changed files with 359 additions and 135 deletions

View File

@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.38 2006/10/04 00:29:49 momjian Exp $
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.39 2006/11/01 19:43:17 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -22,31 +22,41 @@
* them manually if they are not seen in the WAL log during replay. This
* makes it safe for page insertion to be a multiple-WAL-action process.
*
* Similarly, deletion of an only child page and deletion of its parent page
* form multiple WAL log entries, and we have to be prepared to follow through
* with the deletion if the log ends between.
*
* The data structure is a simple linked list --- this should be good enough,
* since we don't expect a page split to remain incomplete for long.
* since we don't expect a page split or multi deletion to remain incomplete
* for long. In any case we need to respect the order of operations.
*/
typedef struct bt_incomplete_split
typedef struct bt_incomplete_action
{
RelFileNode node; /* the index */
bool is_split; /* T = pending split, F = pending delete */
/* these fields are for a split: */
bool is_root; /* we split the root */
BlockNumber leftblk; /* left half of split */
BlockNumber rightblk; /* right half of split */
bool is_root; /* we split the root */
} bt_incomplete_split;
/* these fields are for a delete: */
BlockNumber delblk; /* parent block to be deleted */
} bt_incomplete_action;
static List *incomplete_splits;
static List *incomplete_actions;
static void
log_incomplete_split(RelFileNode node, BlockNumber leftblk,
BlockNumber rightblk, bool is_root)
{
bt_incomplete_split *split = palloc(sizeof(bt_incomplete_split));
bt_incomplete_action *action = palloc(sizeof(bt_incomplete_action));
split->node = node;
split->leftblk = leftblk;
split->rightblk = rightblk;
split->is_root = is_root;
incomplete_splits = lappend(incomplete_splits, split);
action->node = node;
action->is_split = true;
action->is_root = is_root;
action->leftblk = leftblk;
action->rightblk = rightblk;
incomplete_actions = lappend(incomplete_actions, action);
}
static void
@ -54,17 +64,50 @@ forget_matching_split(RelFileNode node, BlockNumber downlink, bool is_root)
{
ListCell *l;
foreach(l, incomplete_splits)
foreach(l, incomplete_actions)
{
bt_incomplete_split *split = (bt_incomplete_split *) lfirst(l);
bt_incomplete_action *action = (bt_incomplete_action *) lfirst(l);
if (RelFileNodeEquals(node, split->node) &&
downlink == split->rightblk)
if (RelFileNodeEquals(node, action->node) &&
action->is_split &&
downlink == action->rightblk)
{
if (is_root != split->is_root)
if (is_root != action->is_root)
elog(LOG, "forget_matching_split: fishy is_root data (expected %d, got %d)",
split->is_root, is_root);
incomplete_splits = list_delete_ptr(incomplete_splits, split);
action->is_root, is_root);
incomplete_actions = list_delete_ptr(incomplete_actions, action);
pfree(action);
break; /* need not look further */
}
}
}
static void
log_incomplete_deletion(RelFileNode node, BlockNumber delblk)
{
bt_incomplete_action *action = palloc(sizeof(bt_incomplete_action));
action->node = node;
action->is_split = false;
action->delblk = delblk;
incomplete_actions = lappend(incomplete_actions, action);
}
static void
forget_matching_deletion(RelFileNode node, BlockNumber delblk)
{
ListCell *l;
foreach(l, incomplete_actions)
{
bt_incomplete_action *action = (bt_incomplete_action *) lfirst(l);
if (RelFileNodeEquals(node, action->node) &&
!action->is_split &&
delblk == action->delblk)
{
incomplete_actions = list_delete_ptr(incomplete_actions, action);
pfree(action);
break; /* need not look further */
}
}
@ -389,8 +432,7 @@ btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
}
static void
btree_xlog_delete_page(bool ismeta,
XLogRecPtr lsn, XLogRecord *record)
btree_xlog_delete_page(uint8 info, XLogRecPtr lsn, XLogRecord *record)
{
xl_btree_delete_page *xlrec = (xl_btree_delete_page *) XLogRecGetData(record);
Relation reln;
@ -427,6 +469,7 @@ btree_xlog_delete_page(bool ismeta,
poffset = ItemPointerGetOffsetNumber(&(xlrec->target.tid));
if (poffset >= PageGetMaxOffsetNumber(page))
{
Assert(info == XLOG_BTREE_DELETE_PAGE_HALF);
Assert(poffset == P_FIRSTDATAKEY(pageop));
PageIndexTupleDelete(page, poffset);
pageop->btpo_flags |= BTP_HALF_DEAD;
@ -437,6 +480,7 @@ btree_xlog_delete_page(bool ismeta,
IndexTuple itup;
OffsetNumber nextoffset;
Assert(info != XLOG_BTREE_DELETE_PAGE_HALF);
itemid = PageGetItemId(page, poffset);
itup = (IndexTuple) PageGetItem(page, itemid);
ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY);
@ -523,7 +567,7 @@ btree_xlog_delete_page(bool ismeta,
UnlockReleaseBuffer(buffer);
/* Update metapage if needed */
if (ismeta)
if (info == XLOG_BTREE_DELETE_PAGE_META)
{
xl_btree_metadata md;
@ -533,6 +577,13 @@ btree_xlog_delete_page(bool ismeta,
md.root, md.level,
md.fastroot, md.fastlevel);
}
/* Forget any completed deletion */
forget_matching_deletion(xlrec->target.node, target);
/* If parent became half-dead, remember it for deletion */
if (info == XLOG_BTREE_DELETE_PAGE_HALF)
log_incomplete_deletion(xlrec->target.node, parent);
}
static void
@ -620,10 +671,9 @@ btree_redo(XLogRecPtr lsn, XLogRecord *record)
btree_xlog_delete(lsn, record);
break;
case XLOG_BTREE_DELETE_PAGE:
btree_xlog_delete_page(false, lsn, record);
break;
case XLOG_BTREE_DELETE_PAGE_META:
btree_xlog_delete_page(true, lsn, record);
case XLOG_BTREE_DELETE_PAGE_HALF:
btree_xlog_delete_page(info, lsn, record);
break;
case XLOG_BTREE_NEWROOT:
btree_xlog_newroot(lsn, record);
@ -724,6 +774,7 @@ btree_desc(StringInfo buf, uint8 xl_info, char *rec)
}
case XLOG_BTREE_DELETE_PAGE:
case XLOG_BTREE_DELETE_PAGE_META:
case XLOG_BTREE_DELETE_PAGE_HALF:
{
xl_btree_delete_page *xlrec = (xl_btree_delete_page *) rec;
@ -752,7 +803,7 @@ btree_desc(StringInfo buf, uint8 xl_info, char *rec)
void
btree_xlog_startup(void)
{
incomplete_splits = NIL;
incomplete_actions = NIL;
}
void
@ -760,45 +811,60 @@ btree_xlog_cleanup(void)
{
ListCell *l;
foreach(l, incomplete_splits)
foreach(l, incomplete_actions)
{
bt_incomplete_split *split = (bt_incomplete_split *) lfirst(l);
bt_incomplete_action *action = (bt_incomplete_action *) lfirst(l);
Relation reln;
Buffer lbuf,
rbuf;
Page lpage,
rpage;
BTPageOpaque lpageop,
rpageop;
bool is_only;
reln = XLogOpenRelation(split->node);
lbuf = XLogReadBuffer(reln, split->leftblk, false);
/* failure should be impossible because we wrote this page earlier */
if (!BufferIsValid(lbuf))
elog(PANIC, "btree_xlog_cleanup: left block unfound");
lpage = (Page) BufferGetPage(lbuf);
lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
rbuf = XLogReadBuffer(reln, split->rightblk, false);
/* failure should be impossible because we wrote this page earlier */
if (!BufferIsValid(rbuf))
elog(PANIC, "btree_xlog_cleanup: right block unfound");
rpage = (Page) BufferGetPage(rbuf);
rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
reln = XLogOpenRelation(action->node);
if (action->is_split)
{
/* finish an incomplete split */
Buffer lbuf,
rbuf;
Page lpage,
rpage;
BTPageOpaque lpageop,
rpageop;
bool is_only;
/* if the two pages are all of their level, it's a only-page split */
is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop);
lbuf = XLogReadBuffer(reln, action->leftblk, false);
/* failure is impossible because we wrote this page earlier */
if (!BufferIsValid(lbuf))
elog(PANIC, "btree_xlog_cleanup: left block unfound");
lpage = (Page) BufferGetPage(lbuf);
lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
rbuf = XLogReadBuffer(reln, action->rightblk, false);
/* failure is impossible because we wrote this page earlier */
if (!BufferIsValid(rbuf))
elog(PANIC, "btree_xlog_cleanup: right block unfound");
rpage = (Page) BufferGetPage(rbuf);
rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
_bt_insert_parent(reln, lbuf, rbuf, NULL,
split->is_root, is_only);
/* if the pages are all of their level, it's a only-page split */
is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop);
_bt_insert_parent(reln, lbuf, rbuf, NULL,
action->is_root, is_only);
}
else
{
/* finish an incomplete deletion (of a half-dead page) */
Buffer buf;
buf = XLogReadBuffer(reln, action->delblk, false);
if (BufferIsValid(buf))
if (_bt_pagedel(reln, buf, NULL, true) == 0)
elog(PANIC, "btree_xlog_cleanup: _bt_pagdel failed");
}
}
incomplete_splits = NIL;
incomplete_actions = NIL;
}
bool
btree_safe_restartpoint(void)
{
if (incomplete_splits)
if (incomplete_actions)
return false;
return true;
}