diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 8fc0c5c7bf3..aec11d49028 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -1,4 +1,4 @@ -$Header: /cvsroot/pgsql/src/backend/access/nbtree/README,v 1.7 2003/02/21 00:06:21 tgl Exp $ +$Header: /cvsroot/pgsql/src/backend/access/nbtree/README,v 1.7.4.1 2006/11/01 19:50:14 tgl Exp $ This directory contains a correct implementation of Lehman and Yao's high-concurrency B-tree management algorithm (P. Lehman and S. Yao, @@ -189,9 +189,24 @@ the half-dead page's level and its parent's level may be a little out of whack: key space that appears to belong to the half-dead page's parent on the parent level may really belong to its right sibling. We can tolerate this, however, because insertions and deletions on upper tree levels are always -done by reference to child page numbers, not keys. The only cost is that -searches may sometimes descend to the half-dead page and then have to move -right, rather than going directly to the sibling page. +done by reference to child page numbers, not keys. Searches may sometimes +descend to the half-dead page and then have to move right, rather than going +directly to the sibling page, but this is no different from the behavior +during a split. + +A special case that arises from using half-dead pages for rightmost children +is that it's possible for the grandparent level's sequence of keys to become +out-of-order. This occurs when there are a large number of insertions into +the key space that's been implicitly transferred to the right sibling of the +half-dead page's parent. If the right sibling itself splits, the split +bounding key (which could be less than the high key of the parent page) is +inserted into the grandparent level to the right of the parent page. This +is pretty ugly, but it causes no serious damage. Searches, again, may descend +a bit to the left of the optimal path but will be able to recover. The only +problem is that when it comes time to delete the half-dead page, _bt_pagedel's +normal strategy for finding the target page's parent can fail: the search for +the page's high key may well descend to the right of the parent. In this case +we recover by searching from the left end of the parent level. A deleted page cannot be reclaimed immediately, since there may be other processes waiting to reference it (ie, search processes that just left the diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 299425a1e55..4bbcb0e10cd 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.106.2.2 2005/10/12 17:18:31 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.106.2.3 2006/11/01 19:50:15 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1253,8 +1253,8 @@ _bt_insert_parent(Relation rel, /* Check for error only after writing children */ if (pbuf == InvalidBuffer) - elog(ERROR, "failed to re-find parent key in \"%s\"", - RelationGetRelationName(rel)); + elog(ERROR, "failed to re-find parent key in \"%s\" for split pages %u/%u", + RelationGetRelationName(rel), bknum, rbknum); /* Recursively update the parent */ newres = _bt_insertonpg(rel, pbuf, stack->bts_parent, diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 0051888ad4f..6ec5d1353cd 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -9,7 +9,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.72.2.1 2005/05/07 21:33:21 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.72.2.2 2006/11/01 19:50:15 tgl Exp $ * * NOTES * Postgres btree pages look like ordinary relation pages. The opaque @@ -897,16 +897,28 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full) rbuf = _bt_getbuf(rel, rightsib, BT_WRITE); /* - * Next find and write-lock the current parent of the target page. - * This is essentially the same as the corresponding step of - * splitting. + * Next find and write-lock the current parent of the target page. This is + * essentially the same as the corresponding step of splitting. However, + * it's possible for the search to fail (for reasons explained in README). + * If that happens, we recover by searching the whole parent level, which + * is a tad inefficient but doesn't happen often enough to be a problem. */ ItemPointerSet(&(stack->bts_btitem.bti_itup.t_tid), target, P_HIKEY); pbuf = _bt_getstackbuf(rel, stack, BT_WRITE); if (pbuf == InvalidBuffer) - elog(ERROR, "failed to re-find parent key in \"%s\"", - RelationGetRelationName(rel)); + { + /* Find the leftmost page in the parent level */ + pbuf = _bt_get_endpoint(rel, opaque->btpo.level + 1, false); + stack->bts_blkno = BufferGetBlockNumber(pbuf); + stack->bts_offset = InvalidOffsetNumber; + _bt_relbuf(rel, pbuf); + /* and repeat search from there */ + pbuf = _bt_getstackbuf(rel, stack, BT_WRITE); + if (pbuf == InvalidBuffer) + elog(ERROR, "failed to re-find parent key in \"%s\" for deletion target page %u", + RelationGetRelationName(rel), target); + } parent = stack->bts_blkno; poffset = stack->bts_offset;