More infrastructure for btree compaction project.

Tree-traversal code now knows what to do upon hitting a dead page (in theory anyway, it's untested...). Add a post-VACUUM-cleanup entry point for index AMs, to provide a place for dead-page scavenging to happen. Also, fix oversight that broke btpo_prev links in temporary indexes. initdb forced due to additions in pg_am.
2025-11-13 16:22:44 +03:00 · 2003-02-22 00:45:05 +00:00
parent 4fff132d1b
commit 799bc58dc7
18 changed files with 709 additions and 345 deletions
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -8,7 +8,7 @@
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/gist/gist.c,v 1.99 2002/11/13 00:39:46 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/gist/gist.c,v 1.100 2003/02/22 00:45:03 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -1650,8 +1650,9 @@ gistbulkdelete(PG_FUNCTION_ARGS)

 	result = (IndexBulkDeleteResult *) palloc(sizeof(IndexBulkDeleteResult));
 	result->num_pages = num_pages;
-	result->tuples_removed = tuples_removed;
 	result->num_index_tuples = num_index_tuples;
+	result->tuples_removed = tuples_removed;
+	result->pages_free = 0;

 	PG_RETURN_POINTER(result);
 }
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.60 2002/09/04 20:31:09 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.61 2003/02/22 00:45:03 tgl Exp $
 *
 * NOTES
 *	  This file contains only the public interface routines.
@@ -491,8 +491,9 @@ hashbulkdelete(PG_FUNCTION_ARGS)

 	result = (IndexBulkDeleteResult *) palloc(sizeof(IndexBulkDeleteResult));
 	result->num_pages = num_pages;
-	result->tuples_removed = tuples_removed;
 	result->num_index_tuples = num_index_tuples;
+	result->tuples_removed = tuples_removed;
+	result->pages_free = 0;

 	PG_RETURN_POINTER(result);
 }
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/index/indexam.c,v 1.63 2003/01/08 19:41:40 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/index/indexam.c,v 1.64 2003/02/22 00:45:03 tgl Exp $
 *
 * INTERFACE ROUTINES
 *		index_open		- open an index relation by relation OID
@@ -23,6 +23,7 @@
 *		index_restrpos	- restore a scan position
 *		index_getnext	- get the next tuple from a scan
 *		index_bulk_delete	- bulk deletion of index tuples
+ *		index_vacuum_cleanup	- post-deletion cleanup of an index
 *		index_cost_estimator	- fetch amcostestimate procedure OID
 *		index_getprocid - get a support procedure OID
 *
@@ -579,6 +580,37 @@ index_bulk_delete(Relation indexRelation,
 	return result;
 }

+/* ----------------
+ *		index_vacuum_cleanup - do post-deletion cleanup of an index
+ *
+ *		return value is the AM's result, an optional palloc'd struct of statistics; 'stats' itself is returned when the AM has no vacuumcleanup procedure
+ * ----------------
+ */
+IndexBulkDeleteResult *
+index_vacuum_cleanup(Relation indexRelation,
+					 IndexVacuumCleanupInfo *info,
+					 IndexBulkDeleteResult *stats)
+{
+	RegProcedure procedure;
+	IndexBulkDeleteResult *result;
+
+	RELATION_CHECKS;
+
+	/* It's okay for an index AM not to have a vacuumcleanup procedure; in that case hand back the caller's stats untouched */
+	if (!RegProcedureIsValid(indexRelation->rd_am->amvacuumcleanup))
+		return stats;
+
+	GET_REL_PROCEDURE(vacuum_cleanup, amvacuumcleanup);
+
+	result = (IndexBulkDeleteResult *)
+		DatumGetPointer(OidFunctionCall3(procedure,
+										 PointerGetDatum(indexRelation),
+										 PointerGetDatum((Pointer) info),
+										 PointerGetDatum((Pointer) stats)));
+
+	return result;
+}
+
 /* ----------------
 *		index_cost_estimator
 *
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.97 2003/02/21 00:06:21 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.98 2003/02/22 00:45:03 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -280,12 +280,21 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel,
 			if (!_bt_isequal(itupdesc, page, P_HIKEY,
 							 natts, itup_scankey))
 				break;
-			nblkno = opaque->btpo_next;
-			if (nbuf != InvalidBuffer)
-				_bt_relbuf(rel, nbuf);
-			nbuf = _bt_getbuf(rel, nblkno, BT_READ);
-			page = BufferGetPage(nbuf);
-			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+			/* Advance to next non-dead page --- there must be one */
+			for (;;)
+			{
+				nblkno = opaque->btpo_next;
+				if (nbuf != InvalidBuffer)
+					_bt_relbuf(rel, nbuf);
+				nbuf = _bt_getbuf(rel, nblkno, BT_READ);
+				page = BufferGetPage(nbuf);
+				opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+				if (!P_IGNORE(opaque))
+					break;
+				if (P_RIGHTMOST(opaque))
+					elog(ERROR, "_bt_check_unique: fell off the end of %s",
+						 RelationGetRelationName(rel));
+			}
 			maxoff = PageGetMaxOffsetNumber(page);
 			offset = P_FIRSTDATAKEY(opaque);
 		}
@@ -414,20 +423,34 @@ _bt_insertonpg(Relation rel,
 			   _bt_compare(rel, keysz, scankey, page, P_HIKEY) == 0 &&
 			   random() > (MAX_RANDOM_VALUE / 100))
 		{
-			/* step right one page */
-			BlockNumber rblkno = lpageop->btpo_next;
-			Buffer		rbuf;
-
 			/*
-			 * must write-lock next page before releasing write lock on
+			 * step right to next non-dead page
+			 *
+			 * must write-lock that page before releasing write lock on
 			 * current page; else someone else's _bt_check_unique scan
-			 * could fail to see our insertion.
+			 * could fail to see our insertion.  write locks on intermediate
+			 * dead pages won't do because we don't know when they will get
+			 * de-linked from the tree.
 			 */
-			rbuf = _bt_getbuf(rel, rblkno, BT_WRITE);
+			Buffer		rbuf = InvalidBuffer;
+
+			for (;;)
+			{
+				BlockNumber rblkno = lpageop->btpo_next;
+
+				if (rbuf != InvalidBuffer)
+					_bt_relbuf(rel, rbuf);
+				rbuf = _bt_getbuf(rel, rblkno, BT_WRITE);
+				page = BufferGetPage(rbuf);
+				lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
+				if (!P_IGNORE(lpageop))
+					break;
+				if (P_RIGHTMOST(lpageop))
+					elog(ERROR, "_bt_insertonpg: fell off the end of %s",
+						 RelationGetRelationName(rel));
+			}
 			_bt_relbuf(rel, buf);
 			buf = rbuf;
-			page = BufferGetPage(buf);
-			lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
 			movedright = true;
 		}

@@ -633,8 +656,9 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
 	BTPageOpaque ropaque,
 				lopaque,
 				oopaque;
-	Buffer		sbuf = 0;
-	Page		spage = 0;
+	Buffer		sbuf = InvalidBuffer;
+	Page		spage = NULL;
+	BTPageOpaque sopaque = NULL;
 	Size		itemsz;
 	ItemId		itemid;
 	BTItem		item;
@@ -792,6 +816,9 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
 	{
 		sbuf = _bt_getbuf(rel, ropaque->btpo_next, BT_WRITE);
 		spage = BufferGetPage(sbuf);
+		sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
+		if (sopaque->btpo_prev != ropaque->btpo_prev)
+			elog(PANIC, "btree: right sibling's left-link doesn't match");
 	}

 	/*
@@ -802,6 +829,9 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
 	 */
 	START_CRIT_SECTION();

+	if (!P_RIGHTMOST(ropaque))
+		sopaque->btpo_prev = BufferGetBlockNumber(rbuf);
+
 	/* XLOG stuff */
 	if (!rel->rd_istemp)
 	{
@@ -847,10 +877,6 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,

 		if (!P_RIGHTMOST(ropaque))
 		{
-			BTPageOpaque sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
-
-			sopaque->btpo_prev = BufferGetBlockNumber(rbuf);
-
 			rdata[2].next = &(rdata[3]);
 			rdata[3].buffer = sbuf;
 			rdata[3].data = NULL;
@@ -1250,58 +1276,63 @@ _bt_getstackbuf(Relation rel, BTStack stack, int access)
 		Buffer		buf;
 		Page		page;
 		BTPageOpaque opaque;
-		OffsetNumber offnum,
-					minoff,
-					maxoff;
-		ItemId		itemid;
-		BTItem		item;

 		buf = _bt_getbuf(rel, blkno, access);
 		page = BufferGetPage(buf);
 		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-		minoff = P_FIRSTDATAKEY(opaque);
-		maxoff = PageGetMaxOffsetNumber(page);

-		/*
-		 * start = InvalidOffsetNumber means "search the whole page".
-		 * We need this test anyway due to possibility that
-		 * page has a high key now when it didn't before.
-		 */
-		if (start < minoff)
-			start = minoff;
-
-		/*
-		 * These loops will check every item on the page --- but in an order
-		 * that's attuned to the probability of where it actually is.  Scan
-		 * to the right first, then to the left.
-		 */
-		for (offnum = start;
-			 offnum <= maxoff;
-			 offnum = OffsetNumberNext(offnum))
+		if (!P_IGNORE(opaque))
 		{
-			itemid = PageGetItemId(page, offnum);
-			item = (BTItem) PageGetItem(page, itemid);
-			if (BTItemSame(item, &stack->bts_btitem))
+			OffsetNumber offnum,
+						minoff,
+						maxoff;
+			ItemId		itemid;
+			BTItem		item;
+
+			minoff = P_FIRSTDATAKEY(opaque);
+			maxoff = PageGetMaxOffsetNumber(page);
+
+			/*
+			 * start = InvalidOffsetNumber means "search the whole page".
+			 * We need this test anyway due to possibility that
+			 * page has a high key now when it didn't before.
+			 */
+			if (start < minoff)
+				start = minoff;
+
+			/*
+			 * These loops will check every item on the page --- but in an
+			 * order that's attuned to the probability of where it actually
+			 * is.  Scan to the right first, then to the left.
+			 */
+			for (offnum = start;
+				 offnum <= maxoff;
+				 offnum = OffsetNumberNext(offnum))
 			{
-				/* Return accurate pointer to where link is now */
-				stack->bts_blkno = blkno;
-				stack->bts_offset = offnum;
-				return buf;
+				itemid = PageGetItemId(page, offnum);
+				item = (BTItem) PageGetItem(page, itemid);
+				if (BTItemSame(item, &stack->bts_btitem))
+				{
+					/* Return accurate pointer to where link is now */
+					stack->bts_blkno = blkno;
+					stack->bts_offset = offnum;
+					return buf;
+				}
 			}
-		}

-		for (offnum = OffsetNumberPrev(start);
-			 offnum >= minoff;
-			 offnum = OffsetNumberPrev(offnum))
-		{
-			itemid = PageGetItemId(page, offnum);
-			item = (BTItem) PageGetItem(page, itemid);
-			if (BTItemSame(item, &stack->bts_btitem))
+			for (offnum = OffsetNumberPrev(start);
+				 offnum >= minoff;
+				 offnum = OffsetNumberPrev(offnum))
 			{
-				/* Return accurate pointer to where link is now */
-				stack->bts_blkno = blkno;
-				stack->bts_offset = offnum;
-				return buf;
+				itemid = PageGetItemId(page, offnum);
+				item = (BTItem) PageGetItem(page, itemid);
+				if (BTItemSame(item, &stack->bts_btitem))
+				{
+					/* Return accurate pointer to where link is now */
+					stack->bts_blkno = blkno;
+					stack->bts_offset = offnum;
+					return buf;
+				}
 			}
 		}

@@ -1365,6 +1396,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
 	rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
 	rootpage = BufferGetPage(rootbuf);
 	rootblknum = BufferGetBlockNumber(rootbuf);
+
+	/* acquire lock on the metapage */
 	metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
 	metapg = BufferGetPage(metabuf);
 	metad = BTPageGetMeta(metapg);
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -9,7 +9,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.59 2003/02/21 00:06:21 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.60 2003/02/22 00:45:04 tgl Exp $
 *
 *	NOTES
 *	   Postgres btree pages look like ordinary relation pages.	The opaque
@@ -22,34 +22,17 @@
 */
 #include "postgres.h"

-#include <time.h>
-
 #include "access/nbtree.h"
 #include "miscadmin.h"
 #include "storage/lmgr.h"

-extern bool FixBTree;			/* comments in nbtree.c */
-extern Buffer _bt_fixroot(Relation rel, Buffer oldrootbuf, bool release);
-
-/*
- *	We use high-concurrency locking on btrees.	There are two cases in
- *	which we don't do locking.  One is when we're building the btree.
- *	Since the creating transaction has not committed, no one can see
- *	the index, and there's no reason to share locks.  The second case
- *	is when we're just starting up the database system.  We use some
- *	special-purpose initialization code in the relation cache manager
- *	(see utils/cache/relcache.c) to allow us to do indexed scans on
- *	the system catalogs before we'd normally be able to.  This happens
- *	before the lock table is fully initialized, so we can't use it.
- *	Strictly speaking, this violates 2pl, but we don't do 2pl on the
- *	system catalogs anyway, so I declare this to be okay.
- */
-
-#define USELOCKING		(!BuildingBtree && !IsInitProcessingMode())
-

 /*
 *	_bt_metapinit() -- Initialize the metadata page of a new btree.
+ *
+ * Note: there's no real need for any locking here.  Since the transaction
+ * creating the index hasn't committed yet, no one else can even see the index
+ * much less be trying to use it.
 */
 void
 _bt_metapinit(Relation rel)
@@ -59,10 +42,6 @@ _bt_metapinit(Relation rel)
 	BTMetaPageData *metad;
 	BTPageOpaque op;

-	/* can't be sharing this with anyone, now... */
-	if (USELOCKING)
-		LockRelation(rel, AccessExclusiveLock);
-
 	if (RelationGetNumberOfBlocks(rel) != 0)
 		elog(ERROR, "Cannot initialize non-empty btree %s",
 			 RelationGetRelationName(rel));
@@ -114,10 +93,6 @@ _bt_metapinit(Relation rel)
 	END_CRIT_SECTION();

 	WriteBuffer(buf);
-
-	/* all done */
-	if (USELOCKING)
-		UnlockRelation(rel, AccessExclusiveLock);
 }

 /*
@@ -142,7 +117,8 @@ _bt_metapinit(Relation rel)
 *		what we will return is the old root, which is now just the leftmost
 *		page on a probably-not-very-wide level.  For most purposes this is
 *		as good as or better than the true root, so we do not bother to
- *		insist on finding the true root.
+ *		insist on finding the true root.  We do, however, guarantee to
+ *		return a live (not deleted or half-dead) page.
 *
 *		On successful return, the root page is pinned and read-locked.
 *		The metadata page is not locked or pinned on exit.
@@ -157,6 +133,7 @@ _bt_getroot(Relation rel, int access)
 	Page		rootpage;
 	BTPageOpaque rootopaque;
 	BlockNumber rootblkno;
+	uint32		rootlevel;
 	BTMetaPageData *metad;

 	metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
@@ -164,6 +141,7 @@ _bt_getroot(Relation rel, int access)
 	metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
 	metad = BTPageGetMeta(metapg);

+	/* sanity-check the metapage */
 	if (!(metaopaque->btpo_flags & BTP_META) ||
 		metad->btm_magic != BTREE_MAGIC)
 		elog(ERROR, "Index %s is not a btree",
@@ -191,90 +169,113 @@ _bt_getroot(Relation rel, int access)
 		/*
 		 * Race condition:	if someone else initialized the metadata
 		 * between the time we released the read lock and acquired the
-		 * write lock, above, we must avoid doing it again.
+		 * write lock, we must avoid doing it again.
 		 */
-		if (metad->btm_root == P_NONE)
-		{
-			/*
-			 * Get, initialize, write, and leave a lock of the appropriate
-			 * type on the new root page.  Since this is the first page in
-			 * the tree, it's a leaf as well as the root.
-			 */
-			rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
-			rootblkno = BufferGetBlockNumber(rootbuf);
-			rootpage = BufferGetPage(rootbuf);
-
-			_bt_pageinit(rootpage, BufferGetPageSize(rootbuf));
-			rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
-			rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
-			rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
-			rootopaque->btpo.level = 0;
-
-			/* NO ELOG(ERROR) till meta is updated */
-			START_CRIT_SECTION();
-
-			metad->btm_root = rootblkno;
-			metad->btm_level = 0;
-			metad->btm_fastroot = rootblkno;
-			metad->btm_fastlevel = 0;
-
-			/* XLOG stuff */
-			if (!rel->rd_istemp)
-			{
-				xl_btree_newroot xlrec;
-				XLogRecPtr	recptr;
-				XLogRecData rdata;
-
-				xlrec.node = rel->rd_node;
-				xlrec.rootblk = rootblkno;
-				xlrec.level = 0;
-
-				rdata.buffer = InvalidBuffer;
-				rdata.data = (char *) &xlrec;
-				rdata.len = SizeOfBtreeNewroot;
-				rdata.next = NULL;
-
-				recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, &rdata);
-
-				PageSetLSN(rootpage, recptr);
-				PageSetSUI(rootpage, ThisStartUpID);
-				PageSetLSN(metapg, recptr);
-				PageSetSUI(metapg, ThisStartUpID);
-			}
-
-			END_CRIT_SECTION();
-
-			_bt_wrtnorelbuf(rel, rootbuf);
-
-			/*
-			 * swap root write lock for read lock.  There is no danger of
-			 * anyone else accessing the new root page while it's unlocked,
-			 * since no one else knows where it is yet.
-			 */
-			LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
-			LockBuffer(rootbuf, BT_READ);
-
-			/* okay, metadata is correct, write and release it */
-			_bt_wrtbuf(rel, metabuf);
-		}
-		else
+		if (metad->btm_root != P_NONE)
 		{
 			/*
 			 * Metadata initialized by someone else.  In order to
 			 * guarantee no deadlocks, we have to release the metadata
-			 * page and start all over again.
+			 * page and start all over again.  (Is that really true?
+			 * But it's hardly worth trying to optimize this case.)
 			 */
 			_bt_relbuf(rel, metabuf);
 			return _bt_getroot(rel, access);
 		}
+
+		/*
+		 * Get, initialize, write, and leave a lock of the appropriate
+		 * type on the new root page.  Since this is the first page in
+		 * the tree, it's a leaf as well as the root.
+		 */
+		rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
+		rootblkno = BufferGetBlockNumber(rootbuf);
+		rootpage = BufferGetPage(rootbuf);
+
+		_bt_pageinit(rootpage, BufferGetPageSize(rootbuf));
+		rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
+		rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
+		rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
+		rootopaque->btpo.level = 0;
+
+		/* NO ELOG(ERROR) till meta is updated */
+		START_CRIT_SECTION();
+
+		metad->btm_root = rootblkno;
+		metad->btm_level = 0;
+		metad->btm_fastroot = rootblkno;
+		metad->btm_fastlevel = 0;
+
+		/* XLOG stuff */
+		if (!rel->rd_istemp)
+		{
+			xl_btree_newroot xlrec;
+			XLogRecPtr	recptr;
+			XLogRecData rdata;
+
+			xlrec.node = rel->rd_node;
+			xlrec.rootblk = rootblkno;
+			xlrec.level = 0;
+
+			rdata.buffer = InvalidBuffer;
+			rdata.data = (char *) &xlrec;
+			rdata.len = SizeOfBtreeNewroot;
+			rdata.next = NULL;
+
+			recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, &rdata);
+
+			PageSetLSN(rootpage, recptr);
+			PageSetSUI(rootpage, ThisStartUpID);
+			PageSetLSN(metapg, recptr);
+			PageSetSUI(metapg, ThisStartUpID);
+		}
+
+		END_CRIT_SECTION();
+
+		_bt_wrtnorelbuf(rel, rootbuf);
+
+		/*
+		 * swap root write lock for read lock.  There is no danger of
+		 * anyone else accessing the new root page while it's unlocked,
+		 * since no one else knows where it is yet.
+		 */
+		LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
+		LockBuffer(rootbuf, BT_READ);
+
+		/* okay, metadata is correct, write and release it */
+		_bt_wrtbuf(rel, metabuf);
 	}
 	else
 	{
 		rootblkno = metad->btm_fastroot;
+		Assert(rootblkno != P_NONE);
+		rootlevel = metad->btm_fastlevel;

 		_bt_relbuf(rel, metabuf);		/* done with the meta page */

-		rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
+		for (;;)
+		{
+			rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
+			rootpage = BufferGetPage(rootbuf);
+			rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
+
+			if (!P_IGNORE(rootopaque))
+				break;
+
+			/* it's dead, Jim.  step right one page */
+			if (P_RIGHTMOST(rootopaque))
+				elog(ERROR, "No live root page found in %s",
+					 RelationGetRelationName(rel));
+			rootblkno = rootopaque->btpo_next;
+
+			_bt_relbuf(rel, rootbuf);
+		}
+
+		/* Note: can't check btpo.level on deleted pages */
+		if (rootopaque->btpo.level != rootlevel)
+			elog(ERROR, "Root page %u of %s has level %u, expected %u",
+				 rootblkno, RelationGetRelationName(rel),
+				 rootopaque->btpo.level, rootlevel);
 	}

 	/*
@@ -305,7 +306,10 @@ _bt_gettrueroot(Relation rel)
 	Page		metapg;
 	BTPageOpaque metaopaque;
 	Buffer		rootbuf;
+	Page		rootpage;
+	BTPageOpaque rootopaque;
 	BlockNumber rootblkno;
+	uint32		rootlevel;
 	BTMetaPageData *metad;

 	metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
@@ -331,10 +335,33 @@ _bt_gettrueroot(Relation rel)
 	}

 	rootblkno = metad->btm_root;
+	rootlevel = metad->btm_level;

 	_bt_relbuf(rel, metabuf);	/* done with the meta page */

-	rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
+	for (;;)
+	{
+		rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
+		rootpage = BufferGetPage(rootbuf);
+		rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
+
+		if (!P_IGNORE(rootopaque))
+			break;
+
+		/* it's dead, Jim.  step right one page */
+		if (P_RIGHTMOST(rootopaque))
+			elog(ERROR, "No live root page found in %s",
+				 RelationGetRelationName(rel));
+		rootblkno = rootopaque->btpo_next;
+
+		_bt_relbuf(rel, rootbuf);
+	}
+
+	/* Note: can't check btpo.level on deleted pages */
+	if (rootopaque->btpo.level != rootlevel)
+		elog(ERROR, "Root page %u of %s has level %u, expected %u",
+			 rootblkno, RelationGetRelationName(rel),
+			 rootopaque->btpo.level, rootlevel);

 	return rootbuf;
 }
@@ -342,6 +369,8 @@ _bt_gettrueroot(Relation rel)
 /*
 *	_bt_getbuf() -- Get a buffer by block number for read or write.
 *
+ *		blkno == P_NEW means to get an unallocated index page.
+ *
 *		When this routine returns, the appropriate lock is set on the
 *		requested buffer and its reference count has been incremented
 *		(ie, the buffer is "locked and pinned").
@@ -359,18 +388,35 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
 	}
 	else
 	{
+		bool		needLock;
 		Page		page;

+		/* XXX soon: ask FSM about free space */
+
 		/*
 		 * Extend the relation by one page.
 		 *
-		 * Extend bufmgr code is unclean and so we have to use extra locking
-		 * here.
+		 * We have to use a lock to ensure no one else is extending the rel at
+		 * the same time, else we will both try to initialize the same new
+		 * page.  We can skip locking for new or temp relations, however,
+		 * since no one else could be accessing them.
 		 */
-		LockPage(rel, 0, ExclusiveLock);
-		buf = ReadBuffer(rel, blkno);
+		needLock = !(rel->rd_isnew || rel->rd_istemp);
+
+		if (needLock)
+			LockPage(rel, 0, ExclusiveLock);
+
+		buf = ReadBuffer(rel, P_NEW);
+
+		/*
+		 * Release the file-extension lock; it's now OK for someone else to
+		 * extend the relation some more.
+		 */
+		if (needLock)
+			UnlockPage(rel, 0, ExclusiveLock);
+
+		/* Acquire appropriate buffer lock on new page */
 		LockBuffer(buf, access);
-		UnlockPage(rel, 0, ExclusiveLock);

 		/* Initialize the new page before returning it */
 		page = BufferGetPage(buf);
@@ -403,10 +449,9 @@ _bt_relbuf(Relation rel, Buffer buf)
 *		and a pin on the buffer.
 *
 * NOTE: actually, the buffer manager just marks the shared buffer page
- * dirty here, the real I/O happens later.	Since we can't persuade the
- * Unix kernel to schedule disk writes in a particular order, there's not
- * much point in worrying about this.  The most we can say is that all the
- * writes will occur before commit.
+ * dirty here; the real I/O happens later.  This is okay since we are not
+ * relying on write ordering anyway.  The WAL mechanism is responsible for
+ * guaranteeing correctness after a crash.
 */
 void
 _bt_wrtbuf(Relation rel, Buffer buf)
@@ -455,8 +500,9 @@ _bt_pageinit(Page page, Size size)
 *		mistake.  On exit, metapage data is correct and we no longer have
 *		a pin or lock on the metapage.
 *
- * XXX this is not used for splitting anymore, only in nbtsort.c at the
- * completion of btree building.
+ * Actually this is not used for splitting on-the-fly anymore.  It's only used
+ * in nbtsort.c at the completion of btree building, where we know we have
+ * sole access to the index anyway.
 */
 void
 _bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level)
@@ -512,6 +558,10 @@ _bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level)
 /*
 * Delete an item from a btree page.
 *
+ * This must only be used for deleting leaf items.  Deleting an item on a
+ * non-leaf page has to be done as part of an atomic action that includes
+ * deleting the page it points to.
+ *
 * This routine assumes that the caller has pinned and locked the buffer,
 * and will write the buffer afterwards.
 */
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -12,7 +12,7 @@
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.95 2003/02/21 00:06:21 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.96 2003/02/22 00:45:04 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -23,6 +23,7 @@
 #include "access/nbtree.h"
 #include "catalog/index.h"
 #include "miscadmin.h"
+#include "storage/freespace.h"


 /* Working state for btbuild and its callback */
@@ -44,7 +45,6 @@ typedef struct
 } BTBuildState;


-bool		BuildingBtree = false;		/* see comment in btbuild() */
 bool		FastBuild = true;	/* use SORT instead of insertion build */

 /*
@@ -68,13 +68,7 @@ static void btbuildCallback(Relation index,
 void
 AtEOXact_nbtree(void)
 {
-	/*
-	 * Note: these actions should only be necessary during xact abort; but
-	 * they can't hurt during a commit.
-	 */
-
-	/* If we were building a btree, we ain't anymore. */
-	BuildingBtree = false;
+	/* nothing to do at the moment */
 }


@@ -95,9 +89,6 @@ btbuild(PG_FUNCTION_ARGS)
 	double		reltuples;
 	BTBuildState buildstate;

-	/* set flag to disable locking */
-	BuildingBtree = true;
-
 	/*
 	 * bootstrap processing does something strange, so don't use
 	 * sort/build for initial catalog indices.	at some point i need to
@@ -172,9 +163,6 @@ btbuild(PG_FUNCTION_ARGS)
 	}
 #endif   /* BTREE_BUILD_STATS */

-	/* all done */
-	BuildingBtree = false;
-
 	/*
 	 * Since we just counted the tuples in the heap, we update its stats
 	 * in pg_class to guarantee that the planner takes advantage of the
@@ -689,10 +677,6 @@ btbulkdelete(PG_FUNCTION_ARGS)
 				 * We now need to back up the scan one item, so that the next
 				 * cycle will re-examine the same offnum on this page (which
 				 * now holds the next item).
-				 *
-				 * For now, just hack the current-item index.  Will need to
-				 * be smarter when deletion includes removal of empty
-				 * index pages.
 				 */
 				current->ip_posid--;
 			}
@@ -708,12 +692,89 @@ btbulkdelete(PG_FUNCTION_ARGS)

 	result = (IndexBulkDeleteResult *) palloc(sizeof(IndexBulkDeleteResult));
 	result->num_pages = num_pages;
-	result->tuples_removed = tuples_removed;
 	result->num_index_tuples = num_index_tuples;
+	result->tuples_removed = tuples_removed;
+	result->pages_free = 0;		/* not computed here */

 	PG_RETURN_POINTER(result);
 }

+/*
+ * Post-VACUUM cleanup.
+ *
+ * Here, we scan every page after the metapage looking for pages already marked deleted, and report up to MaxFSMPages of them to the shared Free Space Map.  (No page is deleted by this routine as yet.)
+ *
+ * Result: the passed-in stats struct (must not be NULL), with num_pages and pages_free updated, for VACUUM displays.
+ */
+Datum
+btvacuumcleanup(PG_FUNCTION_ARGS)
+{
+	Relation	rel = (Relation) PG_GETARG_POINTER(0);
+#ifdef NOT_USED
+	IndexVacuumCleanupInfo *info = (IndexVacuumCleanupInfo *) PG_GETARG_POINTER(1);
+#endif
+	IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(2);
+	BlockNumber num_pages;
+	BlockNumber blkno;
+	PageFreeSpaceInfo *pageSpaces;
+	int			nFreePages,
+				maxFreePages;
+
+	Assert(stats != NULL);
+
+	num_pages = RelationGetNumberOfBlocks(rel);
+
+	/* No point in remembering more than MaxFSMPages pages, nor more than the index has */
+	maxFreePages = MaxFSMPages;
+	if ((BlockNumber) maxFreePages > num_pages)
+		maxFreePages = (int) num_pages + 1;	/* +1 to avoid palloc(0) */
+	pageSpaces = (PageFreeSpaceInfo *) palloc(maxFreePages * sizeof(PageFreeSpaceInfo));
+	nFreePages = 0;
+
+	/*
+	 * Scan through all pages of index, except metapage.  (Any pages added
+	 * after we start the scan will not be examined; this should be fine,
+	 * since they can't possibly be empty.)
+	 */
+	for (blkno = BTREE_METAPAGE+1; blkno < num_pages; blkno++)
+	{
+		Buffer	buf;
+		Page	page;
+		BTPageOpaque opaque;
+
+		buf = _bt_getbuf(rel, blkno, BT_READ);
+		page = BufferGetPage(buf);
+		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+		if (P_ISDELETED(opaque))
+		{
+			/* XXX if safe-to-reclaim... (no such test yet: every deleted page is recorded while the array has room) */
+			if (nFreePages < maxFreePages)
+			{
+				pageSpaces[nFreePages].blkno = blkno;
+				/* The avail-space value is bogus, but must be < BLCKSZ */
+				pageSpaces[nFreePages].avail = BLCKSZ-1;
+				nFreePages++;
+			}
+		}
+		_bt_relbuf(rel, buf);
+	}
+
+	/*
+	 * Update the shared Free Space Map with the info we now have about
+	 * free space in the index, discarding any old info the map may have.
+	 * We do not need to sort the page numbers; they're in order already.
+	 */
+	MultiRecordFreeSpace(&rel->rd_node, 0, nFreePages, pageSpaces);
+
+	pfree(pageSpaces);
+
+	/* update statistics in the caller's struct, which is also our return value */
+	stats->num_pages = num_pages;
+	stats->pages_free = nFreePages;
+
+	PG_RETURN_POINTER(stats);
+}
+
 /*
 * Restore scan position when btgettuple is called to continue a scan.
 *
@@ -739,7 +800,7 @@ _bt_restscan(IndexScanDesc scan)
 				maxoff;
 	BTPageOpaque opaque;
 	Buffer		nextbuf;
-	ItemPointerData target = so->curHeapIptr;
+	ItemPointer target = &(so->curHeapIptr);
 	BTItem		item;
 	BlockNumber blkno;

@@ -759,7 +820,7 @@ _bt_restscan(IndexScanDesc scan)
 	 * current->ip_posid before first index tuple on the current page
 	 * (_bt_step will move it right)...  XXX still needed?
 	 */
-	if (!ItemPointerIsValid(&target))
+	if (!ItemPointerIsValid(target))
 	{
 		ItemPointerSetOffsetNumber(current,
 							   OffsetNumberPrev(P_FIRSTDATAKEY(opaque)));
@@ -778,11 +839,7 @@ _bt_restscan(IndexScanDesc scan)
 			 offnum = OffsetNumberNext(offnum))
 		{
 			item = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
-			if (item->bti_itup.t_tid.ip_blkid.bi_hi ==
-				target.ip_blkid.bi_hi &&
-				item->bti_itup.t_tid.ip_blkid.bi_lo ==
-				target.ip_blkid.bi_lo &&
-				item->bti_itup.t_tid.ip_posid == target.ip_posid)
+			if (BTTidSame(item->bti_itup.t_tid, *target))
 			{
 				/* Found it */
 				current->ip_posid = offnum;
@@ -793,22 +850,33 @@ _bt_restscan(IndexScanDesc scan)
 		/*
 		 * The item we're looking for moved right at least one page, so
 		 * move right.  We are careful here to pin and read-lock the next
-		 * page before releasing the current one.  This ensures that a
-		 * concurrent btbulkdelete scan cannot pass our position --- if it
+		 * non-dead page before releasing the current one.  This ensures that
+		 * a concurrent btbulkdelete scan cannot pass our position --- if it
 		 * did, it might be able to reach and delete our target item before
 		 * we can find it again.
 		 */
 		if (P_RIGHTMOST(opaque))
-			elog(FATAL, "_bt_restscan: my bits moved right off the end of the world!"
+			elog(ERROR, "_bt_restscan: my bits moved right off the end of the world!"
 				 "\n\tRecreate index %s.", RelationGetRelationName(rel));
-
-		blkno = opaque->btpo_next;
-		nextbuf = _bt_getbuf(rel, blkno, BT_READ);
+		/* Advance to next non-dead page --- there must be one */
+		nextbuf = InvalidBuffer;
+		for (;;)
+		{
+			blkno = opaque->btpo_next;
+			if (nextbuf != InvalidBuffer)
+				_bt_relbuf(rel, nextbuf);
+			nextbuf = _bt_getbuf(rel, blkno, BT_READ);
+			page = BufferGetPage(nextbuf);
+			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+			if (!P_IGNORE(opaque))
+				break;
+			if (P_RIGHTMOST(opaque))
+				elog(ERROR, "_bt_restscan: fell off the end of %s",
+					 RelationGetRelationName(rel));
+		}
 		_bt_relbuf(rel, buf);
 		so->btso_curbuf = buf = nextbuf;
-		page = BufferGetPage(buf);
 		maxoff = PageGetMaxOffsetNumber(page);
-		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 		offnum = P_FIRSTDATAKEY(opaque);
 		ItemPointerSet(current, blkno, offnum);
 	}
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1,14 +1,14 @@
 /*-------------------------------------------------------------------------
 *
 * nbtsearch.c
- *	  search code for postgres btrees.
+ *	  Search code for postgres btrees.
 *
 *
 * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.73 2003/02/21 00:06:21 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.74 2003/02/22 00:45:04 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -19,6 +19,7 @@
 #include "access/nbtree.h"


+static Buffer _bt_walk_left(Relation rel, Buffer buf);
 static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);


@@ -79,10 +80,11 @@ _bt_search(Relation rel, int keysz, ScanKey scankey,
 		par_blkno = BufferGetBlockNumber(*bufP);

 		/*
-		 * We need to save the bit image of the index entry we chose in
+		 * We need to save the location of the index entry we chose in
 		 * the parent page on a stack. In case we split the tree, we'll
-		 * use this bit image to figure out what our real parent page is,
-		 * in case the parent splits while we're working lower in the
+		 * use the stack to work back up to the parent page.  We also save
+		 * the actual downlink (TID) to uniquely identify the index entry,
+		 * in case it moves right while we're working lower in the
 		 * tree.  See the paper by Lehman and Yao for how this is detected
 		 * and handled. (We use the child link to disambiguate duplicate
 		 * keys in the index -- Lehman and Yao disallow duplicate keys.)
@@ -114,7 +116,7 @@ _bt_search(Relation rel, int keysz, ScanKey scankey,
 /*
 *	_bt_moveright() -- move right in the btree if necessary.
 *
- *		When we drop and reacquire a pointer to a page, it is possible that
+ *		When we follow a pointer to reach a page, it is possible that
 *		the page has changed in the meanwhile.	If this happens, we're
 *		guaranteed that the page has "split right" -- that is, that any
 *		data that appeared on the page originally is either on the page
@@ -148,9 +150,13 @@ _bt_moveright(Relation rel,
 	 * right.  (If the scan key is equal to the high key, we might or
 	 * might not need to move right; have to scan the page first anyway.)
 	 * It could even have split more than once, so scan as far as needed.
+	 *
+	 * We also have to move right if we followed a link that brought us to
+	 * a dead page.
 	 */
 	while (!P_RIGHTMOST(opaque) &&
-		   _bt_compare(rel, keysz, scankey, page, P_HIKEY) > 0)
+		   (P_IGNORE(opaque) ||
+			_bt_compare(rel, keysz, scankey, page, P_HIKEY) > 0))
 	{
 		/* step right one page */
 		BlockNumber rblkno = opaque->btpo_next;
@@ -161,6 +167,10 @@ _bt_moveright(Relation rel,
 		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 	}

+	if (P_IGNORE(opaque))
+		elog(ERROR, "_bt_moveright: fell off the end of %s",
+			 RelationGetRelationName(rel));
+
 	return buf;
 }

@@ -796,7 +806,6 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
 	OffsetNumber offnum,
 				maxoff;
 	BlockNumber blkno;
-	BlockNumber obknum;

 	/*
 	 * Don't use ItemPointerGetOffsetNumber or you risk to get assertion
@@ -814,7 +823,7 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
 			offnum = OffsetNumberNext(offnum);
 		else
 		{
-			/* walk right to the next page with data */
+			/* Walk right to the next page with data */
 			for (;;)
 			{
 				/* if we're at end of scan, release the buffer and return */
@@ -831,58 +840,56 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
 				*bufP = _bt_getbuf(rel, blkno, BT_READ);
 				page = BufferGetPage(*bufP);
 				opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-				maxoff = PageGetMaxOffsetNumber(page);
-				/* done if it's not empty */
-				offnum = P_FIRSTDATAKEY(opaque);
-				if (!PageIsEmpty(page) && offnum <= maxoff)
-					break;
+				if (!P_IGNORE(opaque))
+				{
+					maxoff = PageGetMaxOffsetNumber(page);
+					/* done if it's not empty */
+					offnum = P_FIRSTDATAKEY(opaque);
+					if (!PageIsEmpty(page) && offnum <= maxoff)
+						break;
+				}
 			}
 		}
 	}
-	else
+	else						/* backwards scan */
 	{
 		if (offnum > P_FIRSTDATAKEY(opaque))
 			offnum = OffsetNumberPrev(offnum);
 		else
 		{
-			/* walk left to the next page with data */
+			/*
+			 * Walk left to the next page with data.  This is much more
+			 * complex than the walk-right case because of the possibility
+			 * that the page to our left splits while we are in flight to it,
+			 * plus the possibility that the page we were on gets deleted
+			 * after we leave it.  See nbtree/README for details.
+			 */
 			for (;;)
 			{
-				/* if we're at end of scan, release the buffer and return */
-				if (P_LEFTMOST(opaque))
+				*bufP = _bt_walk_left(rel, *bufP);
+
+				/* if we're at end of scan, return failure */
+				if (*bufP == InvalidBuffer)
 				{
-					_bt_relbuf(rel, *bufP);
 					ItemPointerSetInvalid(current);
-					*bufP = so->btso_curbuf = InvalidBuffer;
+					so->btso_curbuf = InvalidBuffer;
 					return false;
 				}
-				/* step left */
-				obknum = BufferGetBlockNumber(*bufP);
-				blkno = opaque->btpo_prev;
-				_bt_relbuf(rel, *bufP);
-				*bufP = _bt_getbuf(rel, blkno, BT_READ);
 				page = BufferGetPage(*bufP);
 				opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-
 				/*
-				 * If the adjacent page just split, then we have to walk
-				 * right to find the block that's now adjacent to where we
-				 * were.  Because pages only split right, we don't have to
-				 * worry about this failing to terminate.
+				 * Okay, we managed to move left to a non-deleted page.
+				 * Done if it's not half-dead and not empty.  Else loop back
+				 * and do it all again.
 				 */
-				while (opaque->btpo_next != obknum)
+				if (!P_IGNORE(opaque))
 				{
-					blkno = opaque->btpo_next;
-					_bt_relbuf(rel, *bufP);
-					*bufP = _bt_getbuf(rel, blkno, BT_READ);
-					page = BufferGetPage(*bufP);
-					opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+					maxoff = PageGetMaxOffsetNumber(page);
+					offnum = maxoff;
+					if (!PageIsEmpty(page) &&
+						maxoff >= P_FIRSTDATAKEY(opaque))
+						break;
 				}
-				/* done if it's not empty */
-				maxoff = PageGetMaxOffsetNumber(page);
-				offnum = maxoff;
-				if (!PageIsEmpty(page) && maxoff >= P_FIRSTDATAKEY(opaque))
-					break;
 			}
 		}
 	}
@@ -895,11 +902,133 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
 	return true;
 }

+/*
+ * _bt_walk_left() -- step left one page, if possible
+ *
+ * The given buffer must be pinned and read-locked.  That pin and lock are
+ * released before stepping left.  On return, we instead hold pin and read
+ * lock on the returned page.
+ *
+ * Returns InvalidBuffer if there is no page to the left (no lock is held
+ * in that case).
+ *
+ * When working on a non-leaf level, it is possible for the returned page
+ * to be half-dead; the caller should check that condition and step left
+ * again if it's important.
+ */
+static Buffer
+_bt_walk_left(Relation rel, Buffer buf)
+{
+	Page		page;
+	BTPageOpaque opaque;
+
+	page = BufferGetPage(buf);
+	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+	for (;;)
+	{
+		BlockNumber obknum;
+		BlockNumber lblkno;
+		BlockNumber blkno;
+		int			tries;
+
+		/* if we're at end of tree, release buf and return failure */
+		if (P_LEFTMOST(opaque))
+		{
+			_bt_relbuf(rel, buf);
+			break;
+		}
+		/* remember original page we are stepping left from */
+		obknum = BufferGetBlockNumber(buf);
+		/* step left */
+		blkno = lblkno = opaque->btpo_prev;
+		_bt_relbuf(rel, buf);
+		buf = _bt_getbuf(rel, blkno, BT_READ);
+		page = BufferGetPage(buf);
+		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+		/*
+		 * If this isn't the page we want, walk right till we find
+		 * what we want --- but go no more than four hops (an
+		 * arbitrary limit).  If we don't find the correct page by then,
+		 * the most likely bet is that the original page got deleted
+		 * and isn't in the sibling chain at all anymore, not that its
+		 * left sibling got split more than four times.
+		 *
+		 * Note that it is correct to test P_ISDELETED not P_IGNORE
+		 * here, because half-dead pages are still in the sibling
+		 * chain.  Caller must reject half-dead pages if wanted.
+		 */
+		tries = 0;
+		for (;;)
+		{
+			if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
+			{
+				/* Found desired page, return it */
+				return buf;
+			}
+			if (P_RIGHTMOST(opaque) || ++tries > 4)
+				break;
+			blkno = opaque->btpo_next;
+			_bt_relbuf(rel, buf);
+			buf = _bt_getbuf(rel, blkno, BT_READ);
+			page = BufferGetPage(buf);
+			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+		}
+
+		/* Return to the original page to see what's up */
+		_bt_relbuf(rel, buf);
+		buf = _bt_getbuf(rel, obknum, BT_READ);
+		page = BufferGetPage(buf);
+		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+		if (P_ISDELETED(opaque))
+		{
+			/*
+			 * It was deleted.  Move right to first nondeleted page
+			 * (there must be one); that is the page that has acquired the
+			 * deleted one's keyspace, so stepping left from it will take
+			 * us where we want to be.
+			 */
+			for (;;)
+			{
+				if (P_RIGHTMOST(opaque))
+					elog(ERROR, "_bt_walk_left: fell off the end of %s",
+						 RelationGetRelationName(rel));
+				blkno = opaque->btpo_next;
+				_bt_relbuf(rel, buf);
+				buf = _bt_getbuf(rel, blkno, BT_READ);
+				page = BufferGetPage(buf);
+				opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+				if (!P_ISDELETED(opaque))
+					break;
+			}
+			/*
+			 * Now return to top of loop, resetting obknum to
+			 * point to this nondeleted page, and try again.
+			 */
+		}
+		else
+		{
+			/*
+			 * It wasn't deleted; the explanation had better be
+			 * that the page to the left got split or deleted.
+			 * Without this check, we'd go into an infinite loop
+			 * if there's anything wrong.
+			 */
+			if (opaque->btpo_prev == lblkno)
+				elog(ERROR, "_bt_walk_left: can't find left sibling in %s",
+					 RelationGetRelationName(rel));
+			/* Okay to try again with new lblkno value */
+		}
+	}
+
+	return InvalidBuffer;
+}
+
 /*
 * _bt_get_endpoint() -- Find the first or last page on a given tree level
 *
 * If the index is empty, we will return InvalidBuffer; any other failure
- * condition causes elog().
+ * condition causes elog().  We will not return a dead page.
 *
 * The returned buffer is pinned and read-locked.
 */
@@ -941,12 +1070,13 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
 		 * step right if needed to get to it (this could happen if the
 		 * page split since we obtained a pointer to it).
 		 */
-		while (P_ISDELETED(opaque) ||
+		while (P_IGNORE(opaque) ||
 			   (rightmost && !P_RIGHTMOST(opaque)))
 		{
 			blkno = opaque->btpo_next;
 			if (blkno == P_NONE)
-				elog(ERROR, "_bt_get_endpoint: ran off end of btree");
+				elog(ERROR, "_bt_get_endpoint: fell off the end of %s",
+					 RelationGetRelationName(rel));
 			_bt_relbuf(rel, buf);
 			buf = _bt_getbuf(rel, blkno, BT_READ);
 			page = BufferGetPage(buf);
@@ -959,7 +1089,7 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
 		if (opaque->btpo.level < level)
 			elog(ERROR, "_bt_get_endpoint: btree level %u not found", level);

-		/* Step to leftmost or rightmost child page */
+		/* Descend to leftmost or rightmost child page */
 		if (rightmost)
 			offnum = PageGetMaxOffsetNumber(page);
 		else
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -1,4 +1,5 @@
 /*-------------------------------------------------------------------------
+ *
 * nbtsort.c
 *		Build a btree from sorted input by loading leaf pages sequentially.
 *
@@ -35,7 +36,7 @@
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsort.c,v 1.71 2003/02/21 00:06:21 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsort.c,v 1.72 2003/02/22 00:45:04 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -164,8 +165,8 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
 		ResetUsage();
 	}
 #endif   /* BTREE_BUILD_STATS */
-	tuplesort_performsort(btspool->sortstate);

+	tuplesort_performsort(btspool->sortstate);
 	if (btspool2)
 		tuplesort_performsort(btspool2->sortstate);
 	_bt_load(btspool->index, btspool, btspool2);
@@ -331,7 +332,7 @@ _bt_sortaddtup(Page page,

 	if (PageAddItem(page, (Item) btitem, itemsize, itup_off,
 					LP_USED) == InvalidOffsetNumber)
-		elog(FATAL, "btree: failed to add item to the page in _bt_sort");
+		elog(ERROR, "btree: failed to add item to the page in _bt_sort");
 }

 /*----------
@@ -470,8 +471,7 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti)

 		/*
 		 * Write out the old page.	We never want to see it again, so we
-		 * can give up our lock (if we had one; most likely BuildingBtree
-		 * is set, so we aren't locking).
+		 * can give up our lock.
 		 */
 		_bt_blwritepage(index, obuf);

@@ -534,7 +534,7 @@ _bt_uppershutdown(Relation index, BTPageState *state)
 		if (s->btps_next == (BTPageState *) NULL)
 		{
 			opaque->btpo_flags |= BTP_ROOT;
-			_bt_metaproot(index, blkno, s->btps_level + 1);
+			_bt_metaproot(index, blkno, s->btps_level);
 		}
 		else
 		{
--- a/src/backend/access/rtree/rtree.c
+++ b/src/backend/access/rtree/rtree.c
@@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtree.c,v 1.75 2002/09/04 20:31:13 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtree.c,v 1.76 2003/02/22 00:45:04 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -1250,8 +1250,9 @@ rtbulkdelete(PG_FUNCTION_ARGS)

 	result = (IndexBulkDeleteResult *) palloc(sizeof(IndexBulkDeleteResult));
 	result->num_pages = num_pages;
-	result->tuples_removed = tuples_removed;
 	result->num_index_tuples = num_index_tuples;
+	result->tuples_removed = tuples_removed;
+	result->pages_free = 0;

 	PG_RETURN_POINTER(result);
 }
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -13,7 +13,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.247 2003/02/09 06:56:27 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.248 2003/02/22 00:45:05 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -2603,17 +2603,25 @@ static void
 scan_index(Relation indrel, double num_tuples)
 {
 	IndexBulkDeleteResult *stats;
+	IndexVacuumCleanupInfo vcinfo;
 	VacRUsage	ru0;

 	vac_init_rusage(&ru0);

 	/*
-	 * Even though we're not planning to delete anything, use the
-	 * ambulkdelete call, so that the scan happens within the index AM for
-	 * more speed.
+	 * Even though we're not planning to delete anything, we use the
+	 * ambulkdelete call, because (a) the scan happens within the index AM
+	 * for more speed, and (b) it may want to pass private statistics to
+	 * the amvacuumcleanup call.
 	 */
 	stats = index_bulk_delete(indrel, dummy_tid_reaped, NULL);

+	/* Do post-VACUUM cleanup, even though we deleted nothing */
+	vcinfo.vacuum_full = true;
+	vcinfo.message_level = elevel;
+
+	stats = index_vacuum_cleanup(indrel, &vcinfo, stats);
+
 	if (!stats)
 		return;

@@ -2622,9 +2630,9 @@ scan_index(Relation indrel, double num_tuples)
 						stats->num_pages, stats->num_index_tuples,
 						false);

-	elog(elevel, "Index %s: Pages %u; Tuples %.0f.\n\t%s",
+	elog(elevel, "Index %s: Pages %u, %u free; Tuples %.0f.\n\t%s",
 		 RelationGetRelationName(indrel),
-		 stats->num_pages, stats->num_index_tuples,
+		 stats->num_pages, stats->pages_free, stats->num_index_tuples,
 		 vac_show_rusage(&ru0));

 	/*
@@ -2661,6 +2669,7 @@ vacuum_index(VacPageList vacpagelist, Relation indrel,
 			 double num_tuples, int keep_tuples)
 {
 	IndexBulkDeleteResult *stats;
+	IndexVacuumCleanupInfo vcinfo;
 	VacRUsage	ru0;

 	vac_init_rusage(&ru0);
@@ -2668,6 +2677,12 @@ vacuum_index(VacPageList vacpagelist, Relation indrel,
 	/* Do bulk deletion */
 	stats = index_bulk_delete(indrel, tid_reaped, (void *) vacpagelist);

+	/* Do post-VACUUM cleanup */
+	vcinfo.vacuum_full = true;
+	vcinfo.message_level = elevel;
+
+	stats = index_vacuum_cleanup(indrel, &vcinfo, stats);
+
 	if (!stats)
 		return;

@@ -2676,8 +2691,9 @@ vacuum_index(VacPageList vacpagelist, Relation indrel,
 						stats->num_pages, stats->num_index_tuples,
 						false);

-	elog(elevel, "Index %s: Pages %u; Tuples %.0f: Deleted %.0f.\n\t%s",
-		 RelationGetRelationName(indrel), stats->num_pages,
+	elog(elevel, "Index %s: Pages %u, %u free; Tuples %.0f: Deleted %.0f.\n\t%s",
+		 RelationGetRelationName(indrel),
+		 stats->num_pages, stats->pages_free,
 		 stats->num_index_tuples - keep_tuples, stats->tuples_removed,
 		 vac_show_rusage(&ru0));

--- a/src/backend/commands/vacuumlazy.c
+++ b/src/backend/commands/vacuumlazy.c
@@ -31,7 +31,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/commands/vacuumlazy.c,v 1.23 2002/11/13 00:39:46 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/commands/vacuumlazy.c,v 1.24 2003/02/22 00:45:05 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -200,7 +200,6 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 				tups_vacuumed,
 				nkeep,
 				nunused;
-	bool		did_vacuum_index = false;
 	int			i;
 	VacRUsage	ru0;

@@ -244,7 +243,6 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 			/* Remove index entries */
 			for (i = 0; i < nindexes; i++)
 				lazy_vacuum_index(Irel[i], vacrelstats);
-			did_vacuum_index = true;
 			/* Remove tuples from heap */
 			lazy_vacuum_heap(onerel, vacrelstats);
 			/* Forget the now-vacuumed tuples, and press on */
@@ -415,7 +413,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 	vacrelstats->rel_tuples = num_tuples;

 	/* If any tuples need to be deleted, perform final vacuum cycle */
-	/* XXX put a threshold on min nuber of tuples here? */
+	/* XXX put a threshold on min number of tuples here? */
 	if (vacrelstats->num_dead_tuples > 0)
 	{
 		/* Remove index entries */
@@ -424,9 +422,9 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 		/* Remove tuples from heap */
 		lazy_vacuum_heap(onerel, vacrelstats);
 	}
-	else if (!did_vacuum_index)
+	else
 	{
-		/* Scan indexes just to update pg_class statistics about them */
+		/* Must do post-vacuum cleanup and statistics update anyway */
 		for (i = 0; i < nindexes; i++)
 			lazy_scan_index(Irel[i], vacrelstats);
 	}
@@ -551,42 +549,36 @@ static void
 lazy_scan_index(Relation indrel, LVRelStats *vacrelstats)
 {
 	IndexBulkDeleteResult *stats;
+	IndexVacuumCleanupInfo vcinfo;
 	VacRUsage	ru0;

 	vac_init_rusage(&ru0);

 	/*
-	 * If the index is not partial, skip the scan, and just assume it has
-	 * the same number of tuples as the heap.
-	 */
-	if (!vac_is_partial_index(indrel))
-	{
-		vac_update_relstats(RelationGetRelid(indrel),
-							RelationGetNumberOfBlocks(indrel),
-							vacrelstats->rel_tuples,
-							false);
-		return;
-	}
-
-	/*
-	 * If index is unsafe for concurrent access, must lock it; but a
-	 * shared lock should be sufficient.
+	 * If index is unsafe for concurrent access, must lock it.
 	 */
 	if (!indrel->rd_am->amconcurrent)
-		LockRelation(indrel, AccessShareLock);
+		LockRelation(indrel, AccessExclusiveLock);

 	/*
-	 * Even though we're not planning to delete anything, use the
-	 * ambulkdelete call, so that the scan happens within the index AM for
-	 * more speed.
+	 * Even though we're not planning to delete anything, we use the
+	 * ambulkdelete call, because (a) the scan happens within the index AM
+	 * for more speed, and (b) it may want to pass private statistics to
+	 * the amvacuumcleanup call.
 	 */
 	stats = index_bulk_delete(indrel, dummy_tid_reaped, NULL);

+	/* Do post-VACUUM cleanup, even though we deleted nothing */
+	vcinfo.vacuum_full = false;
+	vcinfo.message_level = elevel;
+
+	stats = index_vacuum_cleanup(indrel, &vcinfo, stats);
+
 	/*
 	 * Release lock acquired above.
 	 */
 	if (!indrel->rd_am->amconcurrent)
-		UnlockRelation(indrel, AccessShareLock);
+		UnlockRelation(indrel, AccessExclusiveLock);

 	if (!stats)
 		return;
@@ -596,9 +588,9 @@ lazy_scan_index(Relation indrel, LVRelStats *vacrelstats)
 						stats->num_pages, stats->num_index_tuples,
 						false);

-	elog(elevel, "Index %s: Pages %u; Tuples %.0f.\n\t%s",
+	elog(elevel, "Index %s: Pages %u, %u free; Tuples %.0f.\n\t%s",
 		 RelationGetRelationName(indrel),
-		 stats->num_pages, stats->num_index_tuples,
+		 stats->num_pages, stats->pages_free, stats->num_index_tuples,
 		 vac_show_rusage(&ru0));

 	pfree(stats);
@@ -617,6 +609,7 @@ static void
 lazy_vacuum_index(Relation indrel, LVRelStats *vacrelstats)
 {
 	IndexBulkDeleteResult *stats;
+	IndexVacuumCleanupInfo vcinfo;
 	VacRUsage	ru0;

 	vac_init_rusage(&ru0);
@@ -630,26 +623,33 @@ lazy_vacuum_index(Relation indrel, LVRelStats *vacrelstats)
 	/* Do bulk deletion */
 	stats = index_bulk_delete(indrel, lazy_tid_reaped, (void *) vacrelstats);

+	/* Do post-VACUUM cleanup */
+	vcinfo.vacuum_full = false;
+	vcinfo.message_level = elevel;
+
+	stats = index_vacuum_cleanup(indrel, &vcinfo, stats);
+
 	/*
 	 * Release lock acquired above.
 	 */
 	if (!indrel->rd_am->amconcurrent)
 		UnlockRelation(indrel, AccessExclusiveLock);

+	if (!stats)
+		return;
+
 	/* now update statistics in pg_class */
-	if (stats)
-	{
-		vac_update_relstats(RelationGetRelid(indrel),
-							stats->num_pages, stats->num_index_tuples,
-							false);
+	vac_update_relstats(RelationGetRelid(indrel),
+						stats->num_pages, stats->num_index_tuples,
+						false);

-		elog(elevel, "Index %s: Pages %u; Tuples %.0f: Deleted %.0f.\n\t%s",
-			 RelationGetRelationName(indrel), stats->num_pages,
-			 stats->num_index_tuples, stats->tuples_removed,
-			 vac_show_rusage(&ru0));
+	elog(elevel, "Index %s: Pages %u, %u free; Tuples %.0f: Deleted %.0f.\n\t%s",
+		 RelationGetRelationName(indrel),
+		 stats->num_pages, stats->pages_free,
+		 stats->num_index_tuples, stats->tuples_removed,
+		 vac_show_rusage(&ru0));

-		pfree(stats);
-	}
+	pfree(stats);
 }

 /*