mirror of
https://github.com/postgres/postgres.git
synced 2025-08-31 17:02:12 +03:00
Postgres95 1.01 Distribution - Virgin Sources
This commit is contained in:
15
src/backend/access/nbtree/Makefile.inc
Normal file
15
src/backend/access/nbtree/Makefile.inc
Normal file
@@ -0,0 +1,15 @@
|
||||
#-------------------------------------------------------------------------
|
||||
#
|
||||
# Makefile.inc--
|
||||
# Makefile for access/nbtree (btree access methods)
|
||||
#
|
||||
# Copyright (c) 1994, Regents of the University of California
|
||||
#
|
||||
#
|
||||
# IDENTIFICATION
|
||||
# $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:11 scrappy Exp $
|
||||
#
|
||||
#-------------------------------------------------------------------------
|
||||
|
||||
SUBSRCS+= nbtcompare.c nbtinsert.c nbtpage.c nbtree.c nbtscan.c nbtsearch.c \
|
||||
nbtstrat.c nbtutils.c nbtsort.c
|
68
src/backend/access/nbtree/README
Normal file
68
src/backend/access/nbtree/README
Normal file
@@ -0,0 +1,68 @@
|
||||
$Header: /cvsroot/pgsql/src/backend/access/nbtree/README,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
|
||||
|
||||
This directory contains a correct implementation of Lehman and Yao's
|
||||
btree management algorithm that supports concurrent access for Postgres.
|
||||
We have made the following changes in order to incorporate their algorithm
|
||||
into Postgres:
|
||||
|
||||
+ The requirement that all btree keys be unique is too onerous,
|
||||
but the algorithm won't work correctly without it. As a result,
|
||||
this implementation adds an OID (guaranteed to be unique) to
|
||||
every key in the index. This guarantees uniqueness within a set
|
||||
of duplicates. Space overhead is four bytes.
|
||||
|
||||
For this reason, when we're passed an index tuple to store by the
|
||||
common access method code, we allocate a larger one and copy the
|
||||
supplied tuple into it. No Postgres code outside of the btree
|
||||
access method knows about this xid or sequence number.
|
||||
|
||||
+ Lehman and Yao don't require read locks, but assume that in-
|
||||
memory copies of tree nodes are unshared. Postgres shares
|
||||
in-memory buffers among backends. As a result, we do page-
|
||||
level read locking on btree nodes in order to guarantee that
|
||||
no record is modified while we are examining it. This reduces
|
||||
concurrency but guarantees correct behavior.
|
||||
|
||||
+ Read locks on a page are held for as long as a scan has a pointer
|
||||
to the page. However, locks are always surrendered before the
|
||||
sibling page lock is acquired (for readers), so we remain deadlock-
|
||||
free. I will do a formal proof if I get bored anytime soon.
|
||||
|
||||
In addition, the following things are handy to know:
|
||||
|
||||
+ Page zero of every btree is a meta-data page. This page stores
|
||||
the location of the root page, a pointer to a list of free
|
||||
pages, and other stuff that's handy to know.
|
||||
|
||||
+ This algorithm doesn't really work, since it requires ordered
|
||||
writes, and UNIX doesn't support ordered writes.
|
||||
|
||||
+ There's one other case where we may screw up in this
|
||||
implementation. When we start a scan, we descend the tree
|
||||
to the key nearest the one in the qual, and once we get there,
|
||||
position ourselves correctly for the qual type (eg, <, >=, etc).
|
||||
If we happen to step off a page, decide we want to get back to
|
||||
it, and fetch the page again, and if some bad person has split
|
||||
the page and moved the last tuple we saw off of it, then the
|
||||
code complains about botched concurrency in an elog(WARN, ...)
|
||||
and gives up the ghost. This is the ONLY violation of Lehman
|
||||
and Yao's guarantee of correct behavior that I am aware of in
|
||||
this code.
|
||||
|
||||
Notes to operator class implementors:
|
||||
|
||||
With this implementation, we require the user to supply us with
|
||||
a procedure for pg_amproc. This procedure should take two keys
|
||||
A and B and return < 0, 0, or > 0 if A < B, A = B, or A > B,
|
||||
respectively. See the contents of that relation for the btree
|
||||
access method for some samples.
|
||||
|
||||
Notes to mao for implementation document:
|
||||
|
||||
On deletions, we need to adjust the position of active scans on
|
||||
the index. The code in nbtscan.c handles this. We don't need to
|
||||
do this for splits because of the way splits are handled; if they
|
||||
happen behind us, we'll automatically go to the next page, and if
|
||||
they happen in front of us, we're not affected by them. For
|
||||
insertions, if we inserted a tuple behind the current scan location
|
||||
on the current scan page, we move one space ahead.
|
173
src/backend/access/nbtree/nbtcompare.c
Normal file
173
src/backend/access/nbtree/nbtcompare.c
Normal file
@@ -0,0 +1,173 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* nbtcompare.c--
|
||||
* Comparison functions for btree access method.
|
||||
*
|
||||
* Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtcompare.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
|
||||
*
|
||||
* NOTES
|
||||
* These functions are stored in pg_amproc. For each operator class
|
||||
* defined on btrees, they compute
|
||||
*
|
||||
* compare(a, b):
|
||||
* < 0 if a < b,
|
||||
* = 0 if a == b,
|
||||
* > 0 if a > b.
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include <string.h>
|
||||
#include "postgres.h"
|
||||
#include "utils/nabstime.h"
|
||||
|
||||
int32
|
||||
btint2cmp(int16 a, int16 b)
|
||||
{
|
||||
return ((int32) (a - b));
|
||||
}
|
||||
|
||||
int32
|
||||
btint4cmp(int32 a, int32 b)
|
||||
{
|
||||
return (a - b);
|
||||
}
|
||||
|
||||
int32
|
||||
btint24cmp(int16 a, int32 b)
|
||||
{
|
||||
return (((int32) a) - b);
|
||||
}
|
||||
|
||||
int32
|
||||
btint42cmp(int32 a, int16 b)
|
||||
{
|
||||
return (a - ((int32) b));
|
||||
}
|
||||
|
||||
int32
|
||||
btfloat4cmp(float32 a, float32 b)
|
||||
{
|
||||
if (*a > *b)
|
||||
return (1);
|
||||
else if (*a == *b)
|
||||
return (0);
|
||||
else
|
||||
return (-1);
|
||||
}
|
||||
|
||||
int32
|
||||
btfloat8cmp(float64 a, float64 b)
|
||||
{
|
||||
if (*a > *b)
|
||||
return (1);
|
||||
else if (*a == *b)
|
||||
return (0);
|
||||
else
|
||||
return (-1);
|
||||
}
|
||||
|
||||
int32
|
||||
btoidcmp(Oid a, Oid b)
|
||||
{
|
||||
if (a > b)
|
||||
return (1);
|
||||
else if (a == b)
|
||||
return (0);
|
||||
else
|
||||
return (-1);
|
||||
}
|
||||
|
||||
int32
|
||||
btabstimecmp(AbsoluteTime a, AbsoluteTime b)
|
||||
{
|
||||
if (AbsoluteTimeIsBefore(a, b))
|
||||
return (1);
|
||||
else if (AbsoluteTimeIsBefore(b, a))
|
||||
return (-1);
|
||||
else
|
||||
return (0);
|
||||
}
|
||||
|
||||
int32
|
||||
btcharcmp(char a, char b)
|
||||
{
|
||||
return ((int32) (a - b));
|
||||
}
|
||||
|
||||
int32
|
||||
btchar2cmp(uint16 a, uint16 b)
|
||||
{
|
||||
return (strncmp((char *) &a, (char *) &b, 2));
|
||||
}
|
||||
|
||||
int32
|
||||
btchar4cmp(uint32 a, uint32 b)
|
||||
{
|
||||
return (strncmp((char *) &a, (char *) &b, 4));
|
||||
}
|
||||
|
||||
int32
|
||||
btchar8cmp(char *a, char *b)
|
||||
{
|
||||
return (strncmp(a, b, 8));
|
||||
}
|
||||
|
||||
int32
|
||||
btchar16cmp(char *a, char *b)
|
||||
{
|
||||
return (strncmp(a, b, 16));
|
||||
}
|
||||
|
||||
int32
|
||||
btnamecmp(NameData *a, NameData *b)
|
||||
{
|
||||
return (strncmp(a->data, b->data, NAMEDATALEN));
|
||||
}
|
||||
|
||||
int32
|
||||
bttextcmp(struct varlena *a, struct varlena *b)
|
||||
{
|
||||
char *ap, *bp;
|
||||
int len;
|
||||
int res;
|
||||
|
||||
ap = VARDATA(a);
|
||||
bp = VARDATA(b);
|
||||
|
||||
/* len is the length of the shorter of the two strings */
|
||||
if ((len = VARSIZE(a)) > VARSIZE(b))
|
||||
len = VARSIZE(b);
|
||||
|
||||
/* len includes the four bytes in which string length is stored */
|
||||
len -= sizeof(VARSIZE(a));
|
||||
|
||||
/*
|
||||
* If the two strings differ in the first len bytes, or if they're
|
||||
* the same in the first len bytes and they're both len bytes long,
|
||||
* we're done.
|
||||
*/
|
||||
|
||||
res = 0;
|
||||
if (len > 0) {
|
||||
do {
|
||||
res = (int) (*ap++ - *bp++);
|
||||
len--;
|
||||
} while (res == 0 && len != 0);
|
||||
}
|
||||
|
||||
if (res != 0 || VARSIZE(a) == VARSIZE(b))
|
||||
return (res);
|
||||
|
||||
/*
|
||||
* The two strings are the same in the first len bytes, and they
|
||||
* are of different lengths.
|
||||
*/
|
||||
|
||||
if (VARSIZE(a) < VARSIZE(b))
|
||||
return (-1);
|
||||
else
|
||||
return (1);
|
||||
}
|
831
src/backend/access/nbtree/nbtinsert.c
Normal file
831
src/backend/access/nbtree/nbtinsert.c
Normal file
@@ -0,0 +1,831 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* nbtinsert.c--
|
||||
* Item insertion in Lehman and Yao btrees for Postgres.
|
||||
*
|
||||
* Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "storage/bufmgr.h"
|
||||
#include "storage/bufpage.h"
|
||||
|
||||
#include "utils/elog.h"
|
||||
#include "utils/palloc.h"
|
||||
#include "utils/rel.h"
|
||||
#include "utils/excid.h"
|
||||
|
||||
#include "access/heapam.h"
|
||||
#include "access/genam.h"
|
||||
#include "access/nbtree.h"
|
||||
|
||||
static InsertIndexResult _bt_insertonpg(Relation rel, Buffer buf, BTStack stack, int keysz, ScanKey scankey, BTItem btitem, BTItem afteritem);
|
||||
static Buffer _bt_split(Relation rel, Buffer buf);
|
||||
static OffsetNumber _bt_findsplitloc(Relation rel, Page page, OffsetNumber start, OffsetNumber maxoff, Size llimit);
|
||||
static void _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
|
||||
static OffsetNumber _bt_pgaddtup(Relation rel, Buffer buf, int keysz, ScanKey itup_scankey, Size itemsize, BTItem btitem, BTItem afteritem);
|
||||
static bool _bt_goesonpg(Relation rel, Buffer buf, Size keysz, ScanKey scankey, BTItem afteritem);
|
||||
static void _bt_updateitem(Relation rel, Size keysz, Buffer buf, Oid bti_oid, BTItem newItem);
|
||||
|
||||
/*
 *	_bt_doinsert() -- Handle insertion of a single btitem in the tree.
 *
 *	This routine is called by the public interface routines, btbuild
 *	and btinsert.  By here, btitem is filled in, and has a unique
 *	(xid, seqno) pair.
 *
 *	Returns an InsertIndexResult (palloc'd; caller frees) whose pointer
 *	records where the tuple landed.
 */
InsertIndexResult
_bt_doinsert(Relation rel, BTItem btitem)
{
    ScanKey itup_scankey;
    IndexTuple itup;
    BTStack stack;
    Buffer buf;
    BlockNumber blkno;
    int natts;
    InsertIndexResult res;

    itup = &(btitem->bti_itup);

    /* we need a scan key to do our search, so build one */
    itup_scankey = _bt_mkscankey(rel, itup);
    natts = rel->rd_rel->relnatts;

    /* find the page containing this key (returned buf is read-locked) */
    stack = _bt_search(rel, natts, itup_scankey, &buf);
    blkno = BufferGetBlockNumber(buf);

    /*
     * Trade in our read lock for a write lock.  The lock is dropped
     * entirely before the write lock is acquired, so a concurrent
     * splitter may change the page in between -- hence the moveright
     * below.
     */
    _bt_relbuf(rel, buf, BT_READ);
    buf = _bt_getbuf(rel, blkno, BT_WRITE);

    /*
     * If the page was split between the time that we surrendered our
     * read lock and acquired our write lock, then this page may no
     * longer be the right place for the key we want to insert.  In this
     * case, we need to move right in the tree.  See Lehman and Yao for
     * an excruciatingly precise description.
     */
    buf = _bt_moveright(rel, buf, natts, itup_scankey, BT_WRITE);

    /* do the insertion; _bt_insertonpg drops the pin and write lock */
    res = _bt_insertonpg(rel, buf, stack, natts, itup_scankey,
			 btitem, (BTItem) NULL);

    /* be tidy */
    _bt_freestack(stack);
    _bt_freeskey(itup_scankey);

    return (res);
}
|
||||
|
||||
/*
 *	_bt_insertonpg() -- Insert a tuple on a particular page in the index.
 *
 *	This recursive procedure does the following things:
 *
 *		+  if necessary, splits the target page.
 *		+  finds the right place to insert the tuple (taking into
 *		   account any changes induced by a split).
 *		+  inserts the tuple.
 *		+  if the page was split, pops the parent stack, and finds the
 *		   right place to insert the new child pointer (by walking
 *		   right using information stored in the parent stack).
 *		+  invoking itself with the appropriate tuple for the right
 *		   child page on the parent.
 *
 *	On entry, we must have the right buffer on which to do the
 *	insertion, and the buffer must be pinned and locked.  On return,
 *	we will have dropped both the pin and the write lock on the buffer.
 *
 *	The locking interactions in this code are critical.  You should
 *	grok Lehman and Yao's paper before making any changes.  In addition,
 *	you need to understand how we disambiguate duplicate keys in this
 *	implementation, in order to be able to find our location using
 *	L&Y "move right" operations.  Since we may insert duplicate user
 *	keys, and since these dups may propagate up the tree, we use the
 *	'afteritem' parameter to position ourselves correctly for the
 *	insertion on internal pages.
 */
static InsertIndexResult
_bt_insertonpg(Relation rel,
	       Buffer buf,
	       BTStack stack,
	       int keysz,
	       ScanKey scankey,
	       BTItem btitem,
	       BTItem afteritem)
{
    InsertIndexResult res;
    Page page;
    Buffer rbuf;
    Buffer pbuf;
    Page rpage;
    ScanKey newskey;
    BTItem ritem;
    BTPageOpaque rpageop;
    BlockNumber rbknum, itup_blkno;
    OffsetNumber itup_off;
    int itemsz;
    InsertIndexResult newres;
    BTItem new_item = (BTItem) NULL;
    BTItem lowLeftItem;

    page = BufferGetPage(buf);

    /* on-page size of the item: index tuple plus BTItem header overhead */
    itemsz = IndexTupleDSize(btitem->bti_itup)
	+ (sizeof(BTItemData) - sizeof(IndexTupleData));

    itemsz = DOUBLEALIGN(itemsz);	/* be safe, PageAddItem will do this
					   but we need to be consistent */

    if (PageGetFreeSpace(page) < itemsz) {

	/* split the buffer into left and right halves */
	rbuf = _bt_split(rel, buf);

	/* which new page (left half or right half) gets the tuple? */
	if (_bt_goesonpg(rel, buf, keysz, scankey, afteritem)) {
	    /* left page */
	    itup_off = _bt_pgaddtup(rel, buf, keysz, scankey,
				    itemsz, btitem, afteritem);
	    itup_blkno = BufferGetBlockNumber(buf);
	} else {
	    /* right page */
	    itup_off = _bt_pgaddtup(rel, rbuf, keysz, scankey,
				    itemsz, btitem, afteritem);
	    itup_blkno = BufferGetBlockNumber(rbuf);
	}

	/*
	 * By here,
	 *
	 *	+  our target page has been split;
	 *	+  the original tuple has been inserted;
	 *	+  we have write locks on both the old (left half) and new
	 *	   (right half) buffers, after the split; and
	 *	+  we have the key we want to insert into the parent.
	 *
	 * Do the parent insertion.  We need to hold onto the locks for
	 * the child pages until we locate the parent, but we can release
	 * them before doing the actual insertion (see Lehman and Yao for
	 * the reasoning).
	 */

	if (stack == (BTStack) NULL) {

	    /* create a new root node and release the split buffers */
	    _bt_newroot(rel, buf, rbuf);
	    _bt_relbuf(rel, buf, BT_WRITE);
	    _bt_relbuf(rel, rbuf, BT_WRITE);

	} else {

	    /* form a index tuple that points at the new right page */
	    rbknum = BufferGetBlockNumber(rbuf);
	    rpage = BufferGetPage(rbuf);
	    rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);

	    /*
	     * By convention, the first entry (0) on every
	     * non-rightmost page is the high key for that page.  In
	     * order to get the lowest key on the new right page, we
	     * actually look at its second (1) entry.
	     */

	    if (! P_RIGHTMOST(rpageop)) {
		ritem = (BTItem) PageGetItem(rpage,
					     PageGetItemId(rpage, P_FIRSTKEY));
	    } else {
		ritem = (BTItem) PageGetItem(rpage,
					     PageGetItemId(rpage, P_HIKEY));
	    }

	    /* get a unique btitem for this key */
	    new_item = _bt_formitem(&(ritem->bti_itup));

	    ItemPointerSet(&(new_item->bti_itup.t_tid), rbknum, P_HIKEY);

	    /* find the parent buffer */
	    pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);

	    /*
	     * If the key of new_item is < than the key of the item
	     * in the parent page pointing to the left page
	     * (stack->bts_btitem), we have to update the latter key;
	     * otherwise the keys on the parent page wouldn't be
	     * monotonically increasing after we inserted the new
	     * pointer to the right page (new_item).  This only
	     * happens if our left page is the leftmost page and a
	     * new minimum key had been inserted before, which is not
	     * reflected in the parent page but didn't matter so
	     * far.  If there are duplicate keys and this new minimum
	     * key spills over to our new right page, we get an
	     * inconsistency if we don't update the left key in the
	     * parent page.
	     */

	    if (_bt_itemcmp(rel, keysz, stack->bts_btitem, new_item,
			    BTGreaterStrategyNumber)) {
		lowLeftItem =
		    (BTItem) PageGetItem(page,
					 PageGetItemId(page, P_FIRSTKEY));
		/* page must have right pointer after split */
		_bt_updateitem(rel, keysz, pbuf, stack->bts_btitem->bti_oid,
			       lowLeftItem);
	    }

	    /* don't need the children anymore */
	    _bt_relbuf(rel, buf, BT_WRITE);
	    _bt_relbuf(rel, rbuf, BT_WRITE);

	    /* recurse: insert the pointer to the new right page into
	     * the parent level, positioning after the old child item */
	    newskey = _bt_mkscankey(rel, &(new_item->bti_itup));
	    newres = _bt_insertonpg(rel, pbuf, stack->bts_parent,
				    keysz, newskey, new_item,
				    stack->bts_btitem);

	    /* be tidy */
	    pfree(newres);
	    pfree(newskey);
	    pfree(new_item);
	}
    } else {
	/* no split needed: just add the tuple and release the buffer */
	itup_off = _bt_pgaddtup(rel, buf, keysz, scankey,
				itemsz, btitem, afteritem);
	itup_blkno = BufferGetBlockNumber(buf);

	_bt_relbuf(rel, buf, BT_WRITE);
    }

    /* by here, the new tuple is inserted */
    res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));
    ItemPointerSet(&(res->pointerData), itup_blkno, itup_off);

    return (res);
}
|
||||
|
||||
/*
 *	_bt_split() -- split a page in the btree.
 *
 *	On entry, buf is the page to split, and is write-locked and pinned.
 *	Returns the new right sibling of buf, pinned and write-locked.  The
 *	pin and lock on buf are maintained.
 */
static Buffer
_bt_split(Relation rel, Buffer buf)
{
    Buffer rbuf;
    Page origpage;
    Page leftpage, rightpage;
    BTPageOpaque ropaque, lopaque, oopaque;
    Buffer sbuf;
    Page spage;
    BTPageOpaque sopaque;
    Size itemsz;
    ItemId itemid;
    BTItem item;
    OffsetNumber leftoff, rightoff;
    OffsetNumber start;
    OffsetNumber maxoff;
    OffsetNumber firstright;
    OffsetNumber i;
    Size llimit;

    /* new right sibling page; left half is rebuilt in a temp page */
    rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
    origpage = BufferGetPage(buf);
    leftpage = PageGetTempPage(origpage, sizeof(BTPageOpaqueData));
    rightpage = BufferGetPage(rbuf);

    _bt_pageinit(rightpage, BufferGetPageSize(rbuf));
    _bt_pageinit(leftpage, BufferGetPageSize(buf));

    /* init btree private data */
    oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
    lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
    ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);

    /* if we're splitting this page, it won't be the root when we're done */
    oopaque->btpo_flags &= ~BTP_ROOT;
    lopaque->btpo_flags = ropaque->btpo_flags = oopaque->btpo_flags;
    /* link the two halves into the sibling chain at this level */
    lopaque->btpo_prev = oopaque->btpo_prev;
    ropaque->btpo_prev = BufferGetBlockNumber(buf);
    lopaque->btpo_next = BufferGetBlockNumber(rbuf);
    ropaque->btpo_next = oopaque->btpo_next;

    /*
     * If the page we're splitting is not the rightmost page at its
     * level in the tree, then the first (0) entry on the page is the
     * high key for the page.  We need to copy that to the right
     * half.  Otherwise (meaning the rightmost page case), we should
     * treat the line pointers beginning at zero as user data.
     *
     * We leave a blank space at the start of the line table for the
     * left page.  We'll come back later and fill it in with the high
     * key item we get from the right key.
     */

    leftoff = P_FIRSTKEY;
    /* NOTE(review): this btpo_next assignment repeats the one above;
     * it is redundant but harmless */
    ropaque->btpo_next = oopaque->btpo_next;
    if (! P_RIGHTMOST(oopaque)) {
	/* splitting a non-rightmost page, start at the first data item */
	start = P_FIRSTKEY;

	/* copy the original high key to the new page */
	itemid = PageGetItemId(origpage, P_HIKEY);
	itemsz = ItemIdGetLength(itemid);
	item = (BTItem) PageGetItem(origpage, itemid);
	(void) PageAddItem(rightpage, (Item) item, itemsz, P_HIKEY, LP_USED);
	rightoff = P_FIRSTKEY;
    } else {
	/* splitting a rightmost page, "high key" is the first data item */
	start = P_HIKEY;

	/* the new rightmost page will not have a high key */
	rightoff = P_HIKEY;
    }
    maxoff = PageGetMaxOffsetNumber(origpage);
    /* aim to fill the left half to roughly 50% */
    llimit = PageGetFreeSpace(leftpage) / 2;
    firstright = _bt_findsplitloc(rel, origpage, start, maxoff, llimit);

    /* distribute the original items: before firstright -> left half,
     * firstright and after -> right half */
    for (i = start; i <= maxoff; i = OffsetNumberNext(i)) {
	itemid = PageGetItemId(origpage, i);
	itemsz = ItemIdGetLength(itemid);
	item = (BTItem) PageGetItem(origpage, itemid);

	/* decide which page to put it on */
	if (i < firstright) {
	    (void) PageAddItem(leftpage, (Item) item, itemsz, leftoff,
			       LP_USED);
	    leftoff = OffsetNumberNext(leftoff);
	} else {
	    (void) PageAddItem(rightpage, (Item) item, itemsz, rightoff,
			       LP_USED);
	    rightoff = OffsetNumberNext(rightoff);
	}
    }

    /*
     * Okay, page has been split, high key on right page is correct.  Now
     * set the high key on the left page to be the min key on the right
     * page.
     */

    if (P_RIGHTMOST(ropaque)) {
	itemid = PageGetItemId(rightpage, P_HIKEY);
    } else {
	itemid = PageGetItemId(rightpage, P_FIRSTKEY);
    }
    itemsz = ItemIdGetLength(itemid);
    item = (BTItem) PageGetItem(rightpage, itemid);

    /*
     * We left a hole for the high key on the left page; fill it.  The
     * modal crap is to tell the page manager to put the new item on the
     * page and not screw around with anything else.  Whoever designed
     * this interface has presumably crawled back into the dung heap they
     * came from.  No one here will admit to it.
     */

    PageManagerModeSet(OverwritePageManagerMode);
    (void) PageAddItem(leftpage, (Item) item, itemsz, P_HIKEY, LP_USED);
    PageManagerModeSet(ShufflePageManagerMode);

    /*
     * By here, the original data page has been split into two new halves,
     * and these are correct.  The algorithm requires that the left page
     * never move during a split, so we copy the new left page back on top
     * of the original.  Note that this is not a waste of time, since we
     * also require (in the page management code) that the center of a
     * page always be clean, and the most efficient way to guarantee this
     * is just to compact the data by reinserting it into a new left page.
     */

    PageRestoreTempPage(leftpage, origpage);

    /* write these guys out (locks and pins are retained) */
    _bt_wrtnorelbuf(rel, rbuf);
    _bt_wrtnorelbuf(rel, buf);

    /*
     * Finally, we need to grab the right sibling (if any) and fix the
     * prev pointer there.  We are guaranteed that this is deadlock-free
     * since no other writer will be moving holding a lock on that page
     * and trying to move left, and all readers release locks on a page
     * before trying to fetch its neighbors.
     */

    if (! P_RIGHTMOST(ropaque)) {
	sbuf = _bt_getbuf(rel, ropaque->btpo_next, BT_WRITE);
	spage = BufferGetPage(sbuf);
	sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
	sopaque->btpo_prev = BufferGetBlockNumber(rbuf);

	/* write and release the old right sibling */
	_bt_wrtbuf(rel, sbuf);
    }

    /* split's done */
    return (rbuf);
}
|
||||
|
||||
/*
 *	_bt_findsplitloc() -- find a safe place to split a page.
 *
 *	In order to guarantee the proper handling of searches for duplicate
 *	keys, the first duplicate in the chain must either be the first
 *	item on the page after the split, or the entire chain must be on
 *	one of the two pages.  That is,
 *		[1 2 2 2 3 4 5]
 *	must become
 *		[1] [2 2 2 3 4 5]
 *	or
 *		[1 2 2 2] [3 4 5]
 *	but not
 *		[1 2 2] [2 3 4 5].
 *	However,
 *		[2 2 2 2 2 3 4]
 *	may be split as
 *		[2 2 2 2] [2 3 4].
 *
 *	Returns the offset of the first item that should go to the right
 *	half; scans items from 'start' accumulating their on-page size
 *	until 'llimit' bytes would be kept on the left.
 */
static OffsetNumber
_bt_findsplitloc(Relation rel,
		 Page page,
		 OffsetNumber start,
		 OffsetNumber maxoff,
		 Size llimit)
{
    OffsetNumber i;
    OffsetNumber saferight;
    ItemId nxtitemid, safeitemid;
    BTItem safeitem, nxtitem;
    IndexTuple safetup, nxttup;
    Size nbytes;
    TupleDesc itupdesc;
    int natts;
    int attno;
    Datum attsafe;
    Datum attnext;
    bool null;

    itupdesc = RelationGetTupleDescriptor(rel);
    natts = rel->rd_rel->relnatts;

    /* the first item is trivially a "safe" split point */
    saferight = start;
    safeitemid = PageGetItemId(page, saferight);
    nbytes = ItemIdGetLength(safeitemid) + sizeof(ItemIdData);
    safeitem = (BTItem) PageGetItem(page, safeitemid);
    safetup = &(safeitem->bti_itup);

    i = OffsetNumberNext(start);

    /*
     * NOTE(review): the loop is bounded only by the accumulated size
     * reaching llimit, not by maxoff; this relies on the page being
     * overfull (a split was required), so the total item size always
     * exceeds llimit before i passes maxoff -- confirm if reused.
     */
    while (nbytes < llimit) {

	/* check the next item on the page */
	nxtitemid = PageGetItemId(page, i);
	nbytes += (ItemIdGetLength(nxtitemid) + sizeof(ItemIdData));
	nxtitem = (BTItem) PageGetItem(page, nxtitemid);
	nxttup = &(nxtitem->bti_itup);

	/* test against last known safe item */
	for (attno = 1; attno <= natts; attno++) {
	    attsafe = index_getattr(safetup, attno, itupdesc, &null);
	    attnext = index_getattr(nxttup, attno, itupdesc, &null);

	    /*
	     * If the tuple we're looking at isn't equal to the last safe one
	     * we saw, then it's our new safe tuple.
	     */

	    if (!_bt_invokestrat(rel, attno, BTEqualStrategyNumber,
				 attsafe, attnext)) {
		safetup = nxttup;
		saferight = i;

		/* break is for the attno for loop */
		break;
	    }
	}
	i = OffsetNumberNext(i);
    }

    /*
     * If the chain of dups starts at the beginning of the page and extends
     * past the halfway mark, we can split it in the middle.
     */

    if (saferight == start)
	saferight = i;

    return (saferight);
}
|
||||
|
||||
/*
 *	_bt_newroot() -- Create a new root page for the index.
 *
 *	We've just split the old root page and need to create a new one.
 *	In order to do this, we add a new root page to the file, then lock
 *	the metadata page and update it.  This is guaranteed to be deadlock-
 *	free, because all readers release their locks on the metadata page
 *	before trying to lock the root, and all writers lock the root before
 *	trying to lock the metadata page.  We have a write lock on the old
 *	root page, so we have not introduced any cycles into the waits-for
 *	graph.
 *
 *	On entry, lbuf (the old root) and rbuf (its new peer) are write-
 *	locked.  We don't drop the locks in this routine; that's done by
 *	the caller.  On exit, a new root page exists with entries for the
 *	two new children.  The new root page is neither pinned nor locked.
 */
static void
_bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
{
    Buffer rootbuf;
    Page lpage, rpage, rootpage;
    BlockNumber lbkno, rbkno;
    BlockNumber rootbknum;
    BTPageOpaque rootopaque;
    ItemId itemid;
    BTItem item;
    Size itemsz;
    BTItem new_item;

    /* get a new root page */
    rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
    rootpage = BufferGetPage(rootbuf);
    _bt_pageinit(rootpage, BufferGetPageSize(rootbuf));

    /* set btree special data: the new root has no siblings */
    rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
    rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
    rootopaque->btpo_flags |= BTP_ROOT;

    /*
     * Insert the internal tuple pointers.
     */

    lbkno = BufferGetBlockNumber(lbuf);
    rbkno = BufferGetBlockNumber(rbuf);
    lpage = BufferGetPage(lbuf);
    rpage = BufferGetPage(rbuf);

    /*
     * step over the high key on the left page while building the
     * left page pointer.
     */
    itemid = PageGetItemId(lpage, P_FIRSTKEY);
    itemsz = ItemIdGetLength(itemid);
    item = (BTItem) PageGetItem(lpage, itemid);
    new_item = _bt_formitem(&(item->bti_itup));
    ItemPointerSet(&(new_item->bti_itup.t_tid), lbkno, P_FIRSTKEY);

    /*
     * insert the left page pointer into the new root page.  the root
     * page is the rightmost page on its level so the "high key" item
     * is the first data item.
     */
    (void) PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY, LP_USED);
    pfree(new_item);

    /*
     * the right page is the rightmost page on the second level, so
     * the "high key" item is the first data item on that page as well.
     */
    itemid = PageGetItemId(rpage, P_HIKEY);
    itemsz = ItemIdGetLength(itemid);
    item = (BTItem) PageGetItem(rpage, itemid);
    new_item = _bt_formitem(&(item->bti_itup));
    ItemPointerSet(&(new_item->bti_itup.t_tid), rbkno, P_HIKEY);

    /*
     * insert the right page pointer into the new root page.
     */
    (void) PageAddItem(rootpage, (Item) new_item, itemsz, P_FIRSTKEY, LP_USED);
    pfree(new_item);

    /* write and let go of the root buffer */
    rootbknum = BufferGetBlockNumber(rootbuf);
    _bt_wrtbuf(rel, rootbuf);

    /* update metadata page with new root block number */
    _bt_metaproot(rel, rootbknum);
}
|
||||
|
||||
/*
 *	_bt_pgaddtup() -- add a tuple to a particular page in the index.
 *
 *	This routine adds the tuple to the page as requested, and keeps the
 *	write lock and reference associated with the page's buffer.  It is
 *	an error to call pgaddtup() without a write lock and reference.  If
 *	afteritem is non-null, it's the item that we expect our new item
 *	to follow.  Otherwise, we do a binary search for the correct place
 *	and insert the new item there.
 *
 *	Returns the offset at which the new item was placed.
 */
static OffsetNumber
_bt_pgaddtup(Relation rel,
	     Buffer buf,
	     int keysz,
	     ScanKey itup_scankey,
	     Size itemsize,
	     BTItem btitem,
	     BTItem afteritem)
{
    OffsetNumber itup_off;
    OffsetNumber first;
    Page page;
    BTPageOpaque opaque;
    BTItem chkitem;
    Oid afteroid;

    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    /* first data item: rightmost pages have no high key at offset 0 */
    first = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;

    if (afteritem == (BTItem) NULL) {
	/* no position hint: binary-search for the insertion point */
	itup_off = _bt_binsrch(rel, buf, keysz, itup_scankey, BT_INSERTION);
    } else {
	/*
	 * Position hint: place the new item immediately after the item
	 * whose OID matches afteritem's.  NOTE(review): this scan assumes
	 * the afteroid item is present on this page; if it is not, the
	 * loop walks past the end of the line table -- confirm callers
	 * guarantee presence.
	 */
	afteroid = afteritem->bti_oid;
	itup_off = first;

	do {
	    chkitem =
		(BTItem) PageGetItem(page, PageGetItemId(page, itup_off));
	    itup_off = OffsetNumberNext(itup_off);
	} while (chkitem->bti_oid != afteroid);
    }

    (void) PageAddItem(page, (Item) btitem, itemsize, itup_off, LP_USED);

    /* write the buffer, but hold our lock */
    _bt_wrtnorelbuf(rel, buf);

    return (itup_off);
}
|
||||
|
||||
/*
|
||||
* _bt_goesonpg() -- Does a new tuple belong on this page?
|
||||
*
|
||||
* This is part of the complexity introduced by allowing duplicate
|
||||
* keys into the index. The tuple belongs on this page if:
|
||||
*
|
||||
* + there is no page to the right of this one; or
|
||||
* + it is less than the high key on the page; or
|
||||
* + the item it is to follow ("afteritem") appears on this
|
||||
* page.
|
||||
*/
|
||||
static bool
|
||||
_bt_goesonpg(Relation rel,
|
||||
Buffer buf,
|
||||
Size keysz,
|
||||
ScanKey scankey,
|
||||
BTItem afteritem)
|
||||
{
|
||||
Page page;
|
||||
ItemId hikey;
|
||||
BTPageOpaque opaque;
|
||||
BTItem chkitem;
|
||||
OffsetNumber offnum, maxoff;
|
||||
Oid afteroid;
|
||||
bool found;
|
||||
|
||||
page = BufferGetPage(buf);
|
||||
|
||||
/* no right neighbor? */
|
||||
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
||||
if (P_RIGHTMOST(opaque))
|
||||
return (true);
|
||||
|
||||
/*
|
||||
* this is a non-rightmost page, so it must have a high key item.
|
||||
*
|
||||
* If the scan key is < the high key (the min key on the next page),
|
||||
* then it for sure belongs here.
|
||||
*/
|
||||
hikey = PageGetItemId(page, P_HIKEY);
|
||||
if (_bt_skeycmp(rel, keysz, scankey, page, hikey, BTLessStrategyNumber))
|
||||
return (true);
|
||||
|
||||
/*
|
||||
* If the scan key is > the high key, then it for sure doesn't belong
|
||||
* here.
|
||||
*/
|
||||
|
||||
if (_bt_skeycmp(rel, keysz, scankey, page, hikey, BTGreaterStrategyNumber))
|
||||
return (false);
|
||||
|
||||
/*
|
||||
* If we have no adjacency information, and the item is equal to the
|
||||
* high key on the page (by here it is), then the item does not belong
|
||||
* on this page.
|
||||
*/
|
||||
|
||||
if (afteritem == (BTItem) NULL)
|
||||
return (false);
|
||||
|
||||
/* damn, have to work for it. i hate that. */
|
||||
afteroid = afteritem->bti_oid;
|
||||
maxoff = PageGetMaxOffsetNumber(page);
|
||||
|
||||
/*
|
||||
* Search the entire page for the afteroid. We need to do this, rather
|
||||
* than doing a binary search and starting from there, because if the
|
||||
* key we're searching for is the leftmost key in the tree at this
|
||||
* level, then a binary search will do the wrong thing. Splits are
|
||||
* pretty infrequent, so the cost isn't as bad as it could be.
|
||||
*/
|
||||
|
||||
found = false;
|
||||
for (offnum = P_FIRSTKEY;
|
||||
offnum <= maxoff;
|
||||
offnum = OffsetNumberNext(offnum)) {
|
||||
chkitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
|
||||
if (chkitem->bti_oid == afteroid) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return (found);
|
||||
}
|
||||
|
||||
/*
|
||||
* _bt_itemcmp() -- compare item1 to item2 using a requested
|
||||
* strategy (<, <=, =, >=, >)
|
||||
*
|
||||
*/
|
||||
bool
|
||||
_bt_itemcmp(Relation rel,
|
||||
Size keysz,
|
||||
BTItem item1,
|
||||
BTItem item2,
|
||||
StrategyNumber strat)
|
||||
{
|
||||
TupleDesc tupDes;
|
||||
IndexTuple indexTuple1, indexTuple2;
|
||||
Datum attrDatum1, attrDatum2;
|
||||
int i;
|
||||
bool isNull;
|
||||
bool compare;
|
||||
|
||||
tupDes = RelationGetTupleDescriptor(rel);
|
||||
indexTuple1 = &(item1->bti_itup);
|
||||
indexTuple2 = &(item2->bti_itup);
|
||||
|
||||
for (i = 1; i <= keysz; i++) {
|
||||
attrDatum1 = index_getattr(indexTuple1, i, tupDes, &isNull);
|
||||
attrDatum2 = index_getattr(indexTuple2, i, tupDes, &isNull);
|
||||
compare = _bt_invokestrat(rel, i, strat, attrDatum1, attrDatum2);
|
||||
if (!compare) {
|
||||
return (false);
|
||||
}
|
||||
}
|
||||
return (true);
|
||||
}
|
||||
|
||||
/*
 *	_bt_updateitem() -- updates the key of the item identified by the
 *			    oid with the key of newItem (done in place)
 *
 *	The caller must hold a write lock on buf.  Only the key data of
 *	the located item is replaced; its original item pointer (the
 *	downlink it carries) is preserved across the copy.
 *
 *	NOTE(review): the replacement is done in place via CopyIndexTuple,
 *	which appears to assume the new tuple is no larger than the old
 *	one -- confirm with callers before changing key representations.
 */
static void
_bt_updateitem(Relation rel,
	       Size keysz,
	       Buffer buf,
	       Oid bti_oid,
	       BTItem newItem)
{
    Page page;
    OffsetNumber maxoff;
    OffsetNumber i;
    ItemPointerData itemPtrData;
    BTItem item;
    IndexTuple oldIndexTuple, newIndexTuple;

    page = BufferGetPage(buf);
    maxoff = PageGetMaxOffsetNumber(page);

    /* locate item on the page by scanning for a matching OID */
    i = P_HIKEY;
    do {
	item = (BTItem) PageGetItem(page, PageGetItemId(page, i));
	i = OffsetNumberNext(i);
    } while (i <= maxoff && item->bti_oid != bti_oid);

    /* this should never happen (in theory) */
    if (item->bti_oid != bti_oid) {
	elog(FATAL, "_bt_getstackbuf was lying!!");
    }

    oldIndexTuple = &(item->bti_itup);
    newIndexTuple = &(newItem->bti_itup);

    /* keep the original item pointer; only the key bytes change */
    ItemPointerCopy(&(oldIndexTuple->t_tid), &itemPtrData);
    CopyIndexTuple(newIndexTuple, &oldIndexTuple);
    ItemPointerCopy(&itemPtrData, &(oldIndexTuple->t_tid));
}
|
523
src/backend/access/nbtree/nbtpage.c
Normal file
523
src/backend/access/nbtree/nbtpage.c
Normal file
@@ -0,0 +1,523 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
 *	nbtpage.c--
|
||||
* BTree-specific page management code for the Postgres btree access
|
||||
* method.
|
||||
*
|
||||
* Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
|
||||
*
|
||||
* NOTES
|
||||
* Postgres btree pages look like ordinary relation pages. The opaque
|
||||
* data at high addresses includes pointers to left and right siblings
|
||||
* and flag data describing page state. The first page in a btree, page
|
||||
* zero, is special -- it stores meta-information describing the tree.
|
||||
* Pages one and higher store the actual tree data.
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "storage/bufmgr.h"
|
||||
#include "storage/bufpage.h"
|
||||
|
||||
#include "utils/elog.h"
|
||||
#include "utils/rel.h"
|
||||
#include "utils/excid.h"
|
||||
|
||||
#include "access/genam.h"
|
||||
#include "access/nbtree.h"
|
||||
|
||||
/* block number of the metadata page; always block zero of the index */
#define BTREE_METAPAGE	0
/* magic number identifying a heap page as a btree metapage */
#define BTREE_MAGIC	0x053162
/* on-disk format version stamped into (and checked against) the metapage */
#define BTREE_VERSION	0

/*
 * Contents of the btree metapage.  Because Lehman & Yao root splits
 * move the root page around, its current location is recorded here.
 */
typedef struct BTMetaPageData {
    uint32	btm_magic;	/* should contain BTREE_MAGIC */
    uint32	btm_version;	/* should contain BTREE_VERSION */
    BlockNumber btm_root;	/* current root block, or P_NONE if none */
} BTMetaPageData;

/* the metadata lives where an ordinary page keeps its line pointers */
#define BTPageGetMeta(p) \
    ((BTMetaPageData *) &((PageHeader) p)->pd_linp[0])

extern bool BuildingBtree;	/* set in nbtree.c while btbuild() runs */

/*
 *  We use high-concurrency locking on btrees.  There are two cases in
 *  which we don't do locking.  One is when we're building the btree.
 *  Since the creating transaction has not committed, no one can see
 *  the index, and there's no reason to share locks.  The second case
 *  is when we're just starting up the database system.  We use some
 *  special-purpose initialization code in the relation cache manager
 *  (see utils/cache/relcache.c) to allow us to do indexed scans on
 *  the system catalogs before we'd normally be able to.  This happens
 *  before the lock table is fully initialized, so we can't use it.
 *  Strictly speaking, this violates 2pl, but we don't do 2pl on the
 *  system catalogs anyway, so I declare this to be okay.
 */

#define USELOCKING	(!BuildingBtree && !IsInitProcessingMode())
|
||||
|
||||
/*
|
||||
* _bt_metapinit() -- Initialize the metadata page of a btree.
|
||||
*/
|
||||
void
|
||||
_bt_metapinit(Relation rel)
|
||||
{
|
||||
Buffer buf;
|
||||
Page pg;
|
||||
int nblocks;
|
||||
BTMetaPageData metad;
|
||||
BTPageOpaque op;
|
||||
|
||||
/* can't be sharing this with anyone, now... */
|
||||
if (USELOCKING)
|
||||
RelationSetLockForWrite(rel);
|
||||
|
||||
if ((nblocks = RelationGetNumberOfBlocks(rel)) != 0) {
|
||||
elog(WARN, "Cannot initialize non-empty btree %s",
|
||||
RelationGetRelationName(rel));
|
||||
}
|
||||
|
||||
buf = ReadBuffer(rel, P_NEW);
|
||||
pg = BufferGetPage(buf);
|
||||
_bt_pageinit(pg, BufferGetPageSize(buf));
|
||||
|
||||
metad.btm_magic = BTREE_MAGIC;
|
||||
metad.btm_version = BTREE_VERSION;
|
||||
metad.btm_root = P_NONE;
|
||||
memmove((char *) BTPageGetMeta(pg), (char *) &metad, sizeof(metad));
|
||||
|
||||
op = (BTPageOpaque) PageGetSpecialPointer(pg);
|
||||
op->btpo_flags = BTP_META;
|
||||
|
||||
WriteBuffer(buf);
|
||||
|
||||
/* all done */
|
||||
if (USELOCKING)
|
||||
RelationUnsetLockForWrite(rel);
|
||||
}
|
||||
|
||||
/*
|
||||
* _bt_checkmeta() -- Verify that the metadata stored in a btree are
|
||||
* reasonable.
|
||||
*/
|
||||
void
|
||||
_bt_checkmeta(Relation rel)
|
||||
{
|
||||
Buffer metabuf;
|
||||
Page metap;
|
||||
BTMetaPageData *metad;
|
||||
BTPageOpaque op;
|
||||
int nblocks;
|
||||
|
||||
/* if the relation is empty, this is init time; don't complain */
|
||||
if ((nblocks = RelationGetNumberOfBlocks(rel)) == 0)
|
||||
return;
|
||||
|
||||
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
|
||||
metap = BufferGetPage(metabuf);
|
||||
op = (BTPageOpaque) PageGetSpecialPointer(metap);
|
||||
if (!(op->btpo_flags & BTP_META)) {
|
||||
elog(WARN, "Invalid metapage for index %s",
|
||||
RelationGetRelationName(rel));
|
||||
}
|
||||
metad = BTPageGetMeta(metap);
|
||||
|
||||
if (metad->btm_magic != BTREE_MAGIC) {
|
||||
elog(WARN, "Index %s is not a btree",
|
||||
RelationGetRelationName(rel));
|
||||
}
|
||||
|
||||
if (metad->btm_version != BTREE_VERSION) {
|
||||
elog(WARN, "Version mismatch on %s: version %d file, version %d code",
|
||||
RelationGetRelationName(rel),
|
||||
metad->btm_version, BTREE_VERSION);
|
||||
}
|
||||
|
||||
_bt_relbuf(rel, metabuf, BT_READ);
|
||||
}
|
||||
|
||||
/*
 *	_bt_getroot() -- Get the root page of the btree.
 *
 *	Since the root page can move around the btree file, we have to read
 *	its location from the metadata page, and then read the root page
 *	itself.  If no root page exists yet, we have to create one.  The
 *	standard class of race conditions exists here; I think I covered
 *	them all in the Hopi Indian rain dance of lock requests below.
 *
 *	We pass in the access type (BT_READ or BT_WRITE), and return the
 *	root page's buffer with the appropriate lock type set.  Reference
 *	count on the root page gets bumped by ReadBuffer.  The metadata
 *	page is unlocked and unreferenced by this process when this routine
 *	returns.
 */
Buffer
_bt_getroot(Relation rel, int access)
{
    Buffer metabuf;
    Page metapg;
    BTPageOpaque metaopaque;
    Buffer rootbuf;
    Page rootpg;
    BTPageOpaque rootopaque;
    BlockNumber rootblkno;
    BTMetaPageData *metad;

    /* read-lock the metapage and find the current root pointer */
    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metapg = BufferGetPage(metabuf);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
    Assert(metaopaque->btpo_flags & BTP_META);
    metad = BTPageGetMeta(metapg);

    /* if no root page initialized yet, do it */
    if (metad->btm_root == P_NONE) {

	/* turn our read lock in for a write lock */
	_bt_relbuf(rel, metabuf, BT_READ);
	metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
	metapg = BufferGetPage(metabuf);
	metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
	Assert(metaopaque->btpo_flags & BTP_META);
	metad = BTPageGetMeta(metapg);

	/*
	 * Race condition:  if someone else initialized the metadata between
	 * the time we released the read lock and acquired the write lock,
	 * above, we want to avoid doing it again.
	 */

	if (metad->btm_root == P_NONE) {

	    /*
	     * Get, initialize, write, and leave a lock of the appropriate
	     * type on the new root page.  Since this is the first page in
	     * the tree, it's a leaf.
	     */

	    rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
	    rootblkno = BufferGetBlockNumber(rootbuf);
	    rootpg = BufferGetPage(rootbuf);
	    metad->btm_root = rootblkno;
	    _bt_pageinit(rootpg, BufferGetPageSize(rootbuf));
	    rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpg);
	    rootopaque->btpo_flags |= (BTP_LEAF | BTP_ROOT);
	    /* write the new root but keep our pin and write lock on it */
	    _bt_wrtnorelbuf(rel, rootbuf);

	    /* swap write lock for read lock, if appropriate */
	    if (access != BT_WRITE) {
		/* acquire the read lock before dropping the write lock */
		_bt_setpagelock(rel, rootblkno, BT_READ);
		_bt_unsetpagelock(rel, rootblkno, BT_WRITE);
	    }

	    /* okay, metadata is correct */
	    _bt_wrtbuf(rel, metabuf);
	} else {

	    /*
	     * Metadata initialized by someone else.  In order to guarantee
	     * no deadlocks, we have to release the metadata page and start
	     * all over again.
	     */

	    _bt_relbuf(rel, metabuf, BT_WRITE);
	    return (_bt_getroot(rel, access));
	}
    } else {
	/* a root exists; lock it with the caller's requested access */
	rootbuf = _bt_getbuf(rel, metad->btm_root, access);

	/* done with the meta page */
	_bt_relbuf(rel, metabuf, BT_READ);
    }

    /*
     * Race condition:  If the root page split between the time we looked
     * at the metadata page and got the root buffer, then we got the wrong
     * buffer.
     */

    rootpg = BufferGetPage(rootbuf);
    rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpg);
    if (!(rootopaque->btpo_flags & BTP_ROOT)) {

	/* it happened, try again */
	_bt_relbuf(rel, rootbuf, access);
	return (_bt_getroot(rel, access));
    }

    /*
     * By here, we have a correct lock on the root block, its reference
     * count is correct, and we have no lock set on the metadata page.
     * Return the root block.
     */

    return (rootbuf);
}
|
||||
|
||||
/*
|
||||
* _bt_getbuf() -- Get a buffer by block number for read or write.
|
||||
*
|
||||
* When this routine returns, the appropriate lock is set on the
|
||||
* requested buffer its reference count is correct.
|
||||
*/
|
||||
Buffer
|
||||
_bt_getbuf(Relation rel, BlockNumber blkno, int access)
|
||||
{
|
||||
Buffer buf;
|
||||
Page page;
|
||||
|
||||
/*
|
||||
* If we want a new block, we can't set a lock of the appropriate type
|
||||
* until we've instantiated the buffer.
|
||||
*/
|
||||
|
||||
if (blkno != P_NEW) {
|
||||
if (access == BT_WRITE)
|
||||
_bt_setpagelock(rel, blkno, BT_WRITE);
|
||||
else
|
||||
_bt_setpagelock(rel, blkno, BT_READ);
|
||||
|
||||
buf = ReadBuffer(rel, blkno);
|
||||
} else {
|
||||
buf = ReadBuffer(rel, blkno);
|
||||
blkno = BufferGetBlockNumber(buf);
|
||||
page = BufferGetPage(buf);
|
||||
_bt_pageinit(page, BufferGetPageSize(buf));
|
||||
|
||||
if (access == BT_WRITE)
|
||||
_bt_setpagelock(rel, blkno, BT_WRITE);
|
||||
else
|
||||
_bt_setpagelock(rel, blkno, BT_READ);
|
||||
}
|
||||
|
||||
/* ref count and lock type are correct */
|
||||
return (buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* _bt_relbuf() -- release a locked buffer.
|
||||
*/
|
||||
void
|
||||
_bt_relbuf(Relation rel, Buffer buf, int access)
|
||||
{
|
||||
BlockNumber blkno;
|
||||
|
||||
blkno = BufferGetBlockNumber(buf);
|
||||
|
||||
/* access had better be one of read or write */
|
||||
if (access == BT_WRITE)
|
||||
_bt_unsetpagelock(rel, blkno, BT_WRITE);
|
||||
else
|
||||
_bt_unsetpagelock(rel, blkno, BT_READ);
|
||||
|
||||
ReleaseBuffer(buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* _bt_wrtbuf() -- write a btree page to disk.
|
||||
*
|
||||
* This routine releases the lock held on the buffer and our reference
|
||||
* to it. It is an error to call _bt_wrtbuf() without a write lock
|
||||
* or a reference to the buffer.
|
||||
*/
|
||||
void
|
||||
_bt_wrtbuf(Relation rel, Buffer buf)
|
||||
{
|
||||
BlockNumber blkno;
|
||||
|
||||
blkno = BufferGetBlockNumber(buf);
|
||||
WriteBuffer(buf);
|
||||
_bt_unsetpagelock(rel, blkno, BT_WRITE);
|
||||
}
|
||||
|
||||
/*
|
||||
* _bt_wrtnorelbuf() -- write a btree page to disk, but do not release
|
||||
* our reference or lock.
|
||||
*
|
||||
* It is an error to call _bt_wrtnorelbuf() without a write lock
|
||||
* or a reference to the buffer.
|
||||
*/
|
||||
void
|
||||
_bt_wrtnorelbuf(Relation rel, Buffer buf)
|
||||
{
|
||||
BlockNumber blkno;
|
||||
|
||||
blkno = BufferGetBlockNumber(buf);
|
||||
WriteNoReleaseBuffer(buf);
|
||||
}
|
||||
|
||||
/*
 *	_bt_pageinit() -- Initialize a new page.
 *
 *	Zeroes the full page and then lays out the standard page header
 *	plus room for the btree-specific opaque data at the high end.
 */
void
_bt_pageinit(Page page, Size size)
{
    /*
     * Cargo-cult programming -- don't really need this to be zero, but
     * creating new pages is an infrequent occurrence and it makes me feel
     * good when I know they're empty.
     */

    memset(page, 0, size);

    /* reserve special space for BTPageOpaqueData (sibling links, flags) */
    PageInit(page, size, sizeof(BTPageOpaqueData));
}
|
||||
|
||||
/*
 *	_bt_metaproot() -- Change the root page of the btree.
 *
 *	Lehman and Yao require that the root page move around in order to
 *	guarantee deadlock-free short-term, fine-granularity locking.  When
 *	we split the root page, we record the new parent in the metadata page
 *	for the relation.  This routine does the work.
 *
 *	No direct preconditions, but if you don't have the a write lock on
 *	at least the old root page when you call this, you're making a big
 *	mistake.  On exit, metapage data is correct and we no longer have
 *	a reference to or lock on the metapage.
 */
void
_bt_metaproot(Relation rel, BlockNumber rootbknum)
{
    Buffer metabuf;
    Page metap;
    BTPageOpaque metaopaque;
    BTMetaPageData *metad;

    /* write-lock the metapage, stamp in the new root, write it back */
    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
    metap = BufferGetPage(metabuf);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(metap);
    Assert(metaopaque->btpo_flags & BTP_META);
    metad = BTPageGetMeta(metap);
    metad->btm_root = rootbknum;
    /* _bt_wrtbuf releases both our lock and our reference */
    _bt_wrtbuf(rel, metabuf);
}
|
||||
|
||||
/*
 *	_bt_getstackbuf() -- Walk back up the tree one step, and find the item
 *			     we last looked at in the parent.
 *
 *	This is possible because we save a bit image of the last item
 *	we looked at in the parent, and the update algorithm guarantees
 *	that if items above us in the tree move, they only move right.
 *
 *	Returns the buffer holding the page on which the saved item now
 *	lives, locked with the requested access type.  If the item cannot
 *	be found before running off the rightmost page of the level, we
 *	die with elog(FATAL) -- the L&Y invariant says that cannot happen.
 */
Buffer
_bt_getstackbuf(Relation rel, BTStack stack, int access)
{
    Buffer buf;
    BlockNumber blkno;
    OffsetNumber start, offnum, maxoff;
    OffsetNumber i;
    Page page;
    ItemId itemid;
    BTItem item;
    BTPageOpaque opaque;

    /* start on the page where we last saw the item */
    blkno = stack->bts_blkno;
    buf = _bt_getbuf(rel, blkno, access);
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    maxoff = PageGetMaxOffsetNumber(page);

    if (maxoff >= stack->bts_offset) {
	itemid = PageGetItemId(page, stack->bts_offset);
	item = (BTItem) PageGetItem(page, itemid);

	/* if the item is where we left it, we're done */
	if (item->bti_oid == stack->bts_btitem->bti_oid)
	    return (buf);

	/* if the item has just moved right on this page, we're done */
	for (i = OffsetNumberNext(stack->bts_offset);
	     i <= maxoff;
	     i = OffsetNumberNext(i)) {
	    itemid = PageGetItemId(page, i);
	    item = (BTItem) PageGetItem(page, itemid);

	    /* if the item is where we left it, we're done */
	    if (item->bti_oid == stack->bts_btitem->bti_oid)
		return (buf);
	}
    }

    /* by here, the item we're looking for moved right at least one page */
    for (;;) {
	blkno = opaque->btpo_next;
	if (P_RIGHTMOST(opaque))
	    elog(FATAL, "my bits moved right off the end of the world!");

	/* release this page and move right to its sibling */
	_bt_relbuf(rel, buf, access);
	buf = _bt_getbuf(rel, blkno, access);
	page = BufferGetPage(buf);
	maxoff = PageGetMaxOffsetNumber(page);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	/*
	 * Non-rightmost pages keep their high key at offset P_HIKEY, so
	 * real data starts at P_FIRSTKEY; a rightmost page has no high
	 * key and its data starts at P_HIKEY.
	 */
	start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;

	/* see if it's on this page */
	for (offnum = start;
	     offnum <= maxoff;
	     offnum = OffsetNumberNext(offnum)) {
	    itemid = PageGetItemId(page, offnum);
	    item = (BTItem) PageGetItem(page, itemid);
	    if (item->bti_oid == stack->bts_btitem->bti_oid)
		return (buf);
	}
    }
}
|
||||
|
||||
void
|
||||
_bt_setpagelock(Relation rel, BlockNumber blkno, int access)
|
||||
{
|
||||
ItemPointerData iptr;
|
||||
|
||||
if (USELOCKING) {
|
||||
ItemPointerSet(&iptr, blkno, P_HIKEY);
|
||||
|
||||
if (access == BT_WRITE)
|
||||
RelationSetSingleWLockPage(rel, &iptr);
|
||||
else
|
||||
RelationSetSingleRLockPage(rel, &iptr);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
_bt_unsetpagelock(Relation rel, BlockNumber blkno, int access)
|
||||
{
|
||||
ItemPointerData iptr;
|
||||
|
||||
if (USELOCKING) {
|
||||
ItemPointerSet(&iptr, blkno, P_HIKEY);
|
||||
|
||||
if (access == BT_WRITE)
|
||||
RelationUnsetSingleWLockPage(rel, &iptr);
|
||||
else
|
||||
RelationUnsetSingleRLockPage(rel, &iptr);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
_bt_pagedel(Relation rel, ItemPointer tid)
|
||||
{
|
||||
Buffer buf;
|
||||
Page page;
|
||||
BlockNumber blkno;
|
||||
OffsetNumber offno;
|
||||
|
||||
blkno = ItemPointerGetBlockNumber(tid);
|
||||
offno = ItemPointerGetOffsetNumber(tid);
|
||||
|
||||
buf = _bt_getbuf(rel, blkno, BT_WRITE);
|
||||
page = BufferGetPage(buf);
|
||||
|
||||
PageIndexTupleDelete(page, offno);
|
||||
|
||||
/* write the buffer and release the lock */
|
||||
_bt_wrtbuf(rel, buf);
|
||||
}
|
516
src/backend/access/nbtree/nbtree.c
Normal file
516
src/backend/access/nbtree/nbtree.c
Normal file
@@ -0,0 +1,516 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
 *	nbtree.c--
|
||||
* Implementation of Lehman and Yao's btree management algorithm for
|
||||
* Postgres.
|
||||
*
|
||||
* Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
|
||||
*
|
||||
* NOTES
|
||||
* This file contains only the public interface routines.
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "storage/bufmgr.h"
|
||||
#include "storage/bufpage.h"
|
||||
|
||||
#include "utils/elog.h"
|
||||
#include "utils/palloc.h"
|
||||
#include "utils/rel.h"
|
||||
#include "utils/excid.h"
|
||||
|
||||
#include "access/heapam.h"
|
||||
#include "access/genam.h"
|
||||
#include "access/sdir.h"
|
||||
#include "access/nbtree.h"
|
||||
#include "access/funcindex.h"
|
||||
|
||||
#include "nodes/execnodes.h"
|
||||
#include "nodes/plannodes.h"
|
||||
|
||||
#include "executor/executor.h"
|
||||
#include "executor/tuptable.h"
|
||||
|
||||
#include "catalog/index.h"
|
||||
|
||||
/* true while btbuild() runs; checked by USELOCKING to skip page locks */
bool	BuildingBtree = false;
/* enables the bottom-up (spool/sort) build path in btbuild() */
bool	FastBuild = false; /* turn this on to make bulk builds work */
|
||||
|
||||
/*
 *	btbuild() -- build a new btree index.
 *
 *	We use a global variable to record the fact that we're creating
 *	a new index.  This is used to avoid high-concurrency locking,
 *	since the index won't be visible until this transaction commits
 *	and since building is guaranteed to be single-threaded.
 *
 *	Scans every heap tuple, forms an index tuple for each one that
 *	passes the (optional) partial-index predicate and has a non-null
 *	key, and either inserts it directly (_bt_doinsert) or spools it
 *	for a bottom-up build (FastBuild path).  Afterwards, pg_class
 *	statistics for both relations are refreshed.
 */
void
btbuild(Relation heap,
	Relation index,
	int natts,
	AttrNumber *attnum,
	IndexStrategy istrat,
	uint16 pcount,
	Datum *params,
	FuncIndexInfo *finfo,
	PredInfo *predInfo)
{
    HeapScanDesc hscan;
    Buffer buffer;
    HeapTuple htup;
    IndexTuple itup;
    TupleDesc htupdesc, itupdesc;
    Datum *attdata;
    bool *nulls;
    InsertIndexResult res;
    int nhtups, nitups;
    int i;
    BTItem btitem;
    ExprContext *econtext;
    TupleTable tupleTable;
    TupleTableSlot *slot;
    Oid hrelid, irelid;
    Node *pred, *oldPred;
    void *spool;	/* opaque spool handle; only used when FastBuild */

    /* note that this is a new btree */
    BuildingBtree = true;

    pred = predInfo->pred;
    oldPred = predInfo->oldPred;

    /* initialize the btree index metadata page (if this is a new index) */
    if (oldPred == NULL)
	_bt_metapinit(index);

    /* get tuple descriptors for heap and index relations */
    htupdesc = RelationGetTupleDescriptor(heap);
    itupdesc = RelationGetTupleDescriptor(index);

    /* get space for data items that'll appear in the index tuple */
    attdata = (Datum *) palloc(natts * sizeof(Datum));
    nulls = (bool *) palloc(natts * sizeof(bool));

    /*
     * If this is a predicate (partial) index, we will need to evaluate the
     * predicate using ExecQual, which requires the current tuple to be in a
     * slot of a TupleTable.  In addition, ExecQual must have an ExprContext
     * referring to that slot.  Here, we initialize dummy TupleTable and
     * ExprContext objects for this purpose. --Nels, Feb '92
     */
#ifndef OMIT_PARTIAL_INDEX
    if (pred != NULL || oldPred != NULL) {
	tupleTable = ExecCreateTupleTable(1);
	slot = ExecAllocTableSlot(tupleTable);
	econtext = makeNode(ExprContext);
	FillDummyExprContext(econtext, slot, htupdesc, InvalidBuffer);
    }
#endif /* OMIT_PARTIAL_INDEX */

    /* start a heap scan */
    hscan = heap_beginscan(heap, 0, NowTimeQual, 0, (ScanKey) NULL);
    htup = heap_getnext(hscan, 0, &buffer);

    /* build the index */
    nhtups = nitups = 0;

    if (FastBuild) {
	/* 7 is the spool's page count; res stays NULL on this path */
	spool = _bt_spoolinit(index, 7);
	res = (InsertIndexResult) NULL;
    }

    for (; HeapTupleIsValid(htup); htup = heap_getnext(hscan, 0, &buffer)) {

	nhtups++;

	/*
	 * If oldPred != NULL, this is an EXTEND INDEX command, so skip
	 * this tuple if it was already in the existing partial index
	 * (it still counts toward nitups, since it is in the index).
	 */
	if (oldPred != NULL) {
#ifndef OMIT_PARTIAL_INDEX

	    /*SetSlotContents(slot, htup);*/
	    slot->val = htup;
	    if (ExecQual((List*)oldPred, econtext) == true) {
		nitups++;
		continue;
	    }
#endif /* OMIT_PARTIAL_INDEX */
	}

	/* Skip this tuple if it doesn't satisfy the partial-index predicate */
	if (pred != NULL) {
#ifndef OMIT_PARTIAL_INDEX
	    /* SetSlotContents(slot, htup); */
	    slot->val = htup;
	    if (ExecQual((List*)pred, econtext) == false)
		continue;
#endif /* OMIT_PARTIAL_INDEX */
	}

	nitups++;

	/*
	 * For the current heap tuple, extract all the attributes
	 * we use in this index, and note which are null.
	 */

	for (i = 1; i <= natts; i++) {
	    int attoff;
	    bool attnull;

	    /*
	     * Offsets are from the start of the tuple, and are
	     * zero-based; indices are one-based.  The next call
	     * returns i - 1.  That's data hiding for you.
	     */

	    attoff = AttrNumberGetAttrOffset(i);
	    attdata[attoff] = GetIndexValue(htup,
					    htupdesc,
					    attoff,
					    attnum,
					    finfo,
					    &attnull,
					    buffer);
	    /* index_formtuple expects 'n' / ' ' markers, not booleans */
	    nulls[attoff] = (attnull ? 'n' : ' ');
	}

	/* form an index tuple and point it at the heap tuple */
	itup = index_formtuple(itupdesc, attdata, nulls);

	/*
	 * If the single index key is null, we don't insert it into
	 * the index.  Btrees support scans on <, <=, =, >=, and >.
	 * Relational algebra says that A op B (where op is one of the
	 * operators above) returns null if either A or B is null.  This
	 * means that no qualification used in an index scan could ever
	 * return true on a null attribute.  It also means that indices
	 * can't be used by ISNULL or NOTNULL scans, but that's an
	 * artifact of the strategy map architecture chosen in 1986, not
	 * of the way nulls are handled here.
	 */

	if (itup->t_info & INDEX_NULL_MASK) {
	    pfree(itup);
	    continue;
	}

	itup->t_tid = htup->t_ctid;
	btitem = _bt_formitem(itup);

	/*
	 * if we are doing bottom-up btree build, we insert the index
	 * into a spool page for subsequent processing.  otherwise, we
	 * insert into the btree.
	 */
	if (FastBuild) {
	    _bt_spool(index, btitem, spool);
	} else {
	    res = _bt_doinsert(index, btitem);
	}

	pfree(btitem);
	pfree(itup);
	/* res is NULL on the FastBuild path, set by _bt_doinsert otherwise */
	if (res) {
	    pfree(res);
	}
    }

    /* okay, all heap tuples are indexed */
    heap_endscan(hscan);

    if (pred != NULL || oldPred != NULL) {
#ifndef OMIT_PARTIAL_INDEX
	ExecDestroyTupleTable(tupleTable, true);
	pfree(econtext);
#endif /* OMIT_PARTIAL_INDEX */
    }

    /*
     * if we are doing bottom-up btree build, we now have a bunch of
     * sorted runs in the spool pages.  finish the build by (1)
     * merging the runs, (2) inserting the sorted tuples into btree
     * pages and (3) building the upper levels.
     */
    if (FastBuild) {
	_bt_spool(index, (BTItem) NULL, spool);	/* flush spool */
	_bt_leafbuild(index, spool);
	_bt_spooldestroy(spool);
    }

    /*
     * Since we just counted the tuples in the heap, we update its
     * stats in pg_class to guarantee that the planner takes advantage
     * of the index we just created.  Finally, only update statistics
     * during normal index definitions, not for indices on system catalogs
     * created during bootstrap processing.  We must close the relations
     * before updatings statistics to guarantee that the relcache entries
     * are flushed when we increment the command counter in UpdateStats().
     */
    if (IsNormalProcessingMode())
    {
	hrelid = heap->rd_id;
	irelid = index->rd_id;
	heap_close(heap);
	index_close(index);
	UpdateStats(hrelid, nhtups, true);
	UpdateStats(irelid, nitups, false);
	if (oldPred != NULL) {
	    /* a full-coverage extension makes the index unconditional */
	    if (nitups == nhtups) pred = NULL;
	    UpdateIndexPredicate(irelid, oldPred, pred);
	}
    }

    /* be tidy */
    pfree(nulls);
    pfree(attdata);

    /* all done */
    BuildingBtree = false;
}
|
||||
|
||||
/*
|
||||
* btinsert() -- insert an index tuple into a btree.
|
||||
*
|
||||
* Descend the tree recursively, find the appropriate location for our
|
||||
* new tuple, put it there, set its unique OID as appropriate, and
|
||||
* return an InsertIndexResult to the caller.
|
||||
*/
|
||||
InsertIndexResult
|
||||
btinsert(Relation rel, IndexTuple itup)
|
||||
{
|
||||
BTItem btitem;
|
||||
InsertIndexResult res;
|
||||
|
||||
if (itup->t_info & INDEX_NULL_MASK)
|
||||
return ((InsertIndexResult) NULL);
|
||||
|
||||
btitem = _bt_formitem(itup);
|
||||
|
||||
res = _bt_doinsert(rel, btitem);
|
||||
pfree(btitem);
|
||||
|
||||
return (res);
|
||||
}
|
||||
|
||||
/*
|
||||
* btgettuple() -- Get the next tuple in the scan.
|
||||
*/
|
||||
char *
|
||||
btgettuple(IndexScanDesc scan, ScanDirection dir)
|
||||
{
|
||||
RetrieveIndexResult res;
|
||||
|
||||
/*
|
||||
* If we've already initialized this scan, we can just advance it
|
||||
* in the appropriate direction. If we haven't done so yet, we
|
||||
* call a routine to get the first item in the scan.
|
||||
*/
|
||||
|
||||
if (ItemPointerIsValid(&(scan->currentItemData)))
|
||||
res = _bt_next(scan, dir);
|
||||
else
|
||||
res = _bt_first(scan, dir);
|
||||
|
||||
return ((char *) res);
|
||||
}
|
||||
|
||||
/*
|
||||
* btbeginscan() -- start a scan on a btree index
|
||||
*/
|
||||
char *
|
||||
btbeginscan(Relation rel, bool fromEnd, uint16 keysz, ScanKey scankey)
|
||||
{
|
||||
IndexScanDesc scan;
|
||||
StrategyNumber strat;
|
||||
BTScanOpaque so;
|
||||
|
||||
/* first order the keys in the qualification */
|
||||
if (keysz > 1)
|
||||
_bt_orderkeys(rel, &keysz, scankey);
|
||||
|
||||
/* now get the scan */
|
||||
scan = RelationGetIndexScan(rel, fromEnd, keysz, scankey);
|
||||
so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
|
||||
so->btso_curbuf = so->btso_mrkbuf = InvalidBuffer;
|
||||
scan->opaque = so;
|
||||
|
||||
/* finally, be sure that the scan exploits the tree order */
|
||||
scan->scanFromEnd = false;
|
||||
scan->flags = 0x0;
|
||||
if (keysz > 0) {
|
||||
strat = _bt_getstrat(scan->relation, 1 /* XXX */,
|
||||
scankey[0].sk_procedure);
|
||||
|
||||
if (strat == BTLessStrategyNumber
|
||||
|| strat == BTLessEqualStrategyNumber)
|
||||
scan->scanFromEnd = true;
|
||||
} else {
|
||||
scan->scanFromEnd = true;
|
||||
}
|
||||
|
||||
/* register scan in case we change pages it's using */
|
||||
_bt_regscan(scan);
|
||||
|
||||
return ((char *) scan);
|
||||
}
|
||||
|
||||
/*
|
||||
* btrescan() -- rescan an index relation
|
||||
*/
|
||||
void
|
||||
btrescan(IndexScanDesc scan, bool fromEnd, ScanKey scankey)
|
||||
{
|
||||
ItemPointer iptr;
|
||||
BTScanOpaque so;
|
||||
|
||||
so = (BTScanOpaque) scan->opaque;
|
||||
|
||||
/* we hold a read lock on the current page in the scan */
|
||||
if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
|
||||
_bt_relbuf(scan->relation, so->btso_curbuf, BT_READ);
|
||||
so->btso_curbuf = InvalidBuffer;
|
||||
ItemPointerSetInvalid(iptr);
|
||||
}
|
||||
|
||||
/* and we hold a read lock on the last marked item in the scan */
|
||||
if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) {
|
||||
_bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ);
|
||||
so->btso_mrkbuf = InvalidBuffer;
|
||||
ItemPointerSetInvalid(iptr);
|
||||
}
|
||||
|
||||
/* reset the scan key */
|
||||
if (scan->numberOfKeys > 0) {
|
||||
memmove(scan->keyData,
|
||||
scankey,
|
||||
scan->numberOfKeys * sizeof(ScanKeyData));
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
btmovescan(IndexScanDesc scan, Datum v)
|
||||
{
|
||||
ItemPointer iptr;
|
||||
BTScanOpaque so;
|
||||
|
||||
so = (BTScanOpaque) scan->opaque;
|
||||
|
||||
/* release any locks we still hold */
|
||||
if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
|
||||
_bt_relbuf(scan->relation, so->btso_curbuf, BT_READ);
|
||||
so->btso_curbuf = InvalidBuffer;
|
||||
ItemPointerSetInvalid(iptr);
|
||||
}
|
||||
|
||||
scan->keyData[0].sk_argument = v;
|
||||
}
|
||||
|
||||
/*
|
||||
* btendscan() -- close down a scan
|
||||
*/
|
||||
void
|
||||
btendscan(IndexScanDesc scan)
|
||||
{
|
||||
ItemPointer iptr;
|
||||
BTScanOpaque so;
|
||||
|
||||
so = (BTScanOpaque) scan->opaque;
|
||||
|
||||
/* release any locks we still hold */
|
||||
if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
|
||||
if (BufferIsValid(so->btso_curbuf))
|
||||
_bt_relbuf(scan->relation, so->btso_curbuf, BT_READ);
|
||||
so->btso_curbuf = InvalidBuffer;
|
||||
ItemPointerSetInvalid(iptr);
|
||||
}
|
||||
|
||||
if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) {
|
||||
if (BufferIsValid(so->btso_mrkbuf))
|
||||
_bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ);
|
||||
so->btso_mrkbuf = InvalidBuffer;
|
||||
ItemPointerSetInvalid(iptr);
|
||||
}
|
||||
|
||||
/* don't need scan registered anymore */
|
||||
_bt_dropscan(scan);
|
||||
|
||||
/* be tidy */
|
||||
#ifdef PERFECT_MMGR
|
||||
pfree (scan->opaque);
|
||||
#endif /* PERFECT_MMGR */
|
||||
}
|
||||
|
||||
/*
|
||||
* btmarkpos() -- save current scan position
|
||||
*/
|
||||
void
|
||||
btmarkpos(IndexScanDesc scan)
|
||||
{
|
||||
ItemPointer iptr;
|
||||
BTScanOpaque so;
|
||||
|
||||
so = (BTScanOpaque) scan->opaque;
|
||||
|
||||
/* release lock on old marked data, if any */
|
||||
if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) {
|
||||
_bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ);
|
||||
so->btso_mrkbuf = InvalidBuffer;
|
||||
ItemPointerSetInvalid(iptr);
|
||||
}
|
||||
|
||||
/* bump lock on currentItemData and copy to currentMarkData */
|
||||
if (ItemPointerIsValid(&(scan->currentItemData))) {
|
||||
so->btso_mrkbuf = _bt_getbuf(scan->relation,
|
||||
BufferGetBlockNumber(so->btso_curbuf),
|
||||
BT_READ);
|
||||
scan->currentMarkData = scan->currentItemData;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* btrestrpos() -- restore scan to last saved position
|
||||
*/
|
||||
void
|
||||
btrestrpos(IndexScanDesc scan)
|
||||
{
|
||||
ItemPointer iptr;
|
||||
BTScanOpaque so;
|
||||
|
||||
so = (BTScanOpaque) scan->opaque;
|
||||
|
||||
/* release lock on current data, if any */
|
||||
if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
|
||||
_bt_relbuf(scan->relation, so->btso_curbuf, BT_READ);
|
||||
so->btso_curbuf = InvalidBuffer;
|
||||
ItemPointerSetInvalid(iptr);
|
||||
}
|
||||
|
||||
/* bump lock on currentMarkData and copy to currentItemData */
|
||||
if (ItemPointerIsValid(&(scan->currentMarkData))) {
|
||||
so->btso_curbuf = _bt_getbuf(scan->relation,
|
||||
BufferGetBlockNumber(so->btso_mrkbuf),
|
||||
BT_READ);
|
||||
|
||||
scan->currentItemData = scan->currentMarkData;
|
||||
}
|
||||
}
|
||||
|
||||
/* stubs */
|
||||
void
|
||||
btdelete(Relation rel, ItemPointer tid)
|
||||
{
|
||||
/* adjust any active scans that will be affected by this deletion */
|
||||
_bt_adjscans(rel, tid);
|
||||
|
||||
/* delete the data from the page */
|
||||
_bt_pagedel(rel, tid);
|
||||
}
|
164
src/backend/access/nbtree/nbtscan.c
Normal file
164
src/backend/access/nbtree/nbtscan.c
Normal file
@@ -0,0 +1,164 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* btscan.c--
|
||||
* manage scans on btrees.
|
||||
*
|
||||
* Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/nbtscan.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
|
||||
*
|
||||
*
|
||||
* NOTES
|
||||
* Because we can be doing an index scan on a relation while we update
|
||||
* it, we need to avoid missing data that moves around in the index.
|
||||
* The routines and global variables in this file guarantee that all
|
||||
* scans in the local address space stay correctly positioned. This
|
||||
* is all we need to worry about, since write locking guarantees that
|
||||
* no one else will be on the same page at the same time as we are.
|
||||
*
|
||||
* The scheme is to manage a list of active scans in the current backend.
|
||||
* Whenever we add or remove records from an index, or whenever we
|
||||
* split a leaf page, we check the list of active scans to see if any
|
||||
* has been affected. A scan is affected only if it is on the same
|
||||
* relation, and the same page, as the update.
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "storage/bufmgr.h"
|
||||
#include "storage/bufpage.h"
|
||||
|
||||
#include "utils/elog.h"
|
||||
#include "utils/palloc.h"
|
||||
#include "utils/rel.h"
|
||||
#include "utils/excid.h"
|
||||
|
||||
#include "access/heapam.h"
|
||||
#include "access/genam.h"
|
||||
#include "access/sdir.h"
|
||||
#include "access/nbtree.h"
|
||||
|
||||
/*
 *  A singly-linked list of the btree scans active in this backend.
 *  Update operations consult this list (via _bt_adjscans) so that open
 *  scans stay correctly positioned when index items move.
 */
typedef struct BTScanListData {
    IndexScanDesc btsl_scan;		/* the registered scan */
    struct BTScanListData *btsl_next;	/* next entry, or NULL at list end */
} BTScanListData;

typedef BTScanListData *BTScanList;

/* head of the active-scan list; private to this backend */
static BTScanList BTScans = (BTScanList) NULL;
|
||||
|
||||
/*
|
||||
* _bt_regscan() -- register a new scan.
|
||||
*/
|
||||
void
|
||||
_bt_regscan(IndexScanDesc scan)
|
||||
{
|
||||
BTScanList new_el;
|
||||
|
||||
new_el = (BTScanList) palloc(sizeof(BTScanListData));
|
||||
new_el->btsl_scan = scan;
|
||||
new_el->btsl_next = BTScans;
|
||||
BTScans = new_el;
|
||||
}
|
||||
|
||||
/*
|
||||
* _bt_dropscan() -- drop a scan from the scan list
|
||||
*/
|
||||
void
|
||||
_bt_dropscan(IndexScanDesc scan)
|
||||
{
|
||||
BTScanList chk, last;
|
||||
|
||||
last = (BTScanList) NULL;
|
||||
for (chk = BTScans;
|
||||
chk != (BTScanList) NULL && chk->btsl_scan != scan;
|
||||
chk = chk->btsl_next) {
|
||||
last = chk;
|
||||
}
|
||||
|
||||
if (chk == (BTScanList) NULL)
|
||||
elog(WARN, "btree scan list trashed; can't find 0x%lx", scan);
|
||||
|
||||
if (last == (BTScanList) NULL)
|
||||
BTScans = chk->btsl_next;
|
||||
else
|
||||
last->btsl_next = chk->btsl_next;
|
||||
|
||||
#ifdef PERFECT_MEM
|
||||
pfree (chk);
|
||||
#endif /* PERFECT_MEM */
|
||||
}
|
||||
|
||||
void
|
||||
_bt_adjscans(Relation rel, ItemPointer tid)
|
||||
{
|
||||
BTScanList l;
|
||||
Oid relid;
|
||||
|
||||
relid = rel->rd_id;
|
||||
for (l = BTScans; l != (BTScanList) NULL; l = l->btsl_next) {
|
||||
if (relid == l->btsl_scan->relation->rd_id)
|
||||
_bt_scandel(l->btsl_scan, ItemPointerGetBlockNumber(tid),
|
||||
ItemPointerGetOffsetNumber(tid));
|
||||
}
|
||||
}
|
||||
|
||||
/*
 *  _bt_scandel() -- adjust one scan to compensate for the removal of
 *	the item at (blkno, offno).
 *
 *  If the scan's current position is on the affected page at or past
 *  the removed offset, step it backward one slot so the next forward
 *  step lands on the right item.  The same adjustment is applied to
 *  the marked position.
 */
void
_bt_scandel(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno)
{
    ItemPointer current;
    Buffer buf;
    BTScanOpaque so;

    /* fast exit if neither position is on the affected page/offset */
    if (!_bt_scantouched(scan, blkno, offno))
	return;

    so = (BTScanOpaque) scan->opaque;
    buf = so->btso_curbuf;

    current = &(scan->currentItemData);
    if (ItemPointerIsValid(current)
	&& ItemPointerGetBlockNumber(current) == blkno
	&& ItemPointerGetOffsetNumber(current) >= offno) {
	_bt_step(scan, &buf, BackwardScanDirection);
	so->btso_curbuf = buf;
    }

    current = &(scan->currentMarkData);
    if (ItemPointerIsValid(current)
	&& ItemPointerGetBlockNumber(current) == blkno
	&& ItemPointerGetOffsetNumber(current) >= offno) {
	ItemPointerData tmp;

	/*
	 *  _bt_step operates on currentItemData, so temporarily swap
	 *  the mark into that slot, step it backward, then swap back.
	 *  The order of these statements is load-bearing.
	 */
	tmp = *current;
	*current = scan->currentItemData;
	scan->currentItemData = tmp;
	_bt_step(scan, &buf, BackwardScanDirection);
	so->btso_mrkbuf = buf;
	tmp = *current;
	*current = scan->currentItemData;
	scan->currentItemData = tmp;
    }
}
|
||||
|
||||
bool
|
||||
_bt_scantouched(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno)
|
||||
{
|
||||
ItemPointer current;
|
||||
|
||||
current = &(scan->currentItemData);
|
||||
if (ItemPointerIsValid(current)
|
||||
&& ItemPointerGetBlockNumber(current) == blkno
|
||||
&& ItemPointerGetOffsetNumber(current) >= offno)
|
||||
return (true);
|
||||
|
||||
current = &(scan->currentMarkData);
|
||||
if (ItemPointerIsValid(current)
|
||||
&& ItemPointerGetBlockNumber(current) == blkno
|
||||
&& ItemPointerGetOffsetNumber(current) >= offno)
|
||||
return (true);
|
||||
|
||||
return (false);
|
||||
}
|
1133
src/backend/access/nbtree/nbtsearch.c
Normal file
1133
src/backend/access/nbtree/nbtsearch.c
Normal file
File diff suppressed because it is too large
Load Diff
1196
src/backend/access/nbtree/nbtsort.c
Normal file
1196
src/backend/access/nbtree/nbtsort.c
Normal file
File diff suppressed because it is too large
Load Diff
134
src/backend/access/nbtree/nbtstrat.c
Normal file
134
src/backend/access/nbtree/nbtstrat.c
Normal file
@@ -0,0 +1,134 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* btstrat.c--
|
||||
* Srategy map entries for the btree indexed access method
|
||||
*
|
||||
* Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/nbtstrat.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "storage/bufpage.h"
|
||||
|
||||
#include "utils/elog.h"
|
||||
#include "utils/rel.h"
|
||||
#include "utils/excid.h"
|
||||
|
||||
#include "access/genam.h"
|
||||
#include "access/nbtree.h"
|
||||
|
||||
/*
 * Note:
 *	StrategyNegate, StrategyCommute, and StrategyNegateCommute
 *	assume <, <=, ==, >=, > ordering.
 */

/* strategy obtained by negating each strategy, in the order above */
static StrategyNumber BTNegate[5] = {
    BTGreaterEqualStrategyNumber,
    BTGreaterStrategyNumber,
    InvalidStrategy,
    BTLessStrategyNumber,
    BTLessEqualStrategyNumber
};

/* strategy obtained by commuting (swapping the operands of) each strategy */
static StrategyNumber BTCommute[5] = {
    BTGreaterStrategyNumber,
    BTGreaterEqualStrategyNumber,
    InvalidStrategy,
    BTLessEqualStrategyNumber,
    BTLessStrategyNumber
};

/* strategy obtained by negating and commuting each strategy */
static StrategyNumber BTNegateCommute[5] = {
    BTLessEqualStrategyNumber,
    BTLessStrategyNumber,
    InvalidStrategy,
    BTGreaterStrategyNumber,
    BTGreaterEqualStrategyNumber
};

/*
 *  Term data below: first element is the term count, followed by
 *  (strategy number, flags) pairs.
 */
static uint16 BTLessTermData[] = {	/* XXX type clash */
    2,
    BTLessStrategyNumber,
    SK_NEGATE,
    BTLessStrategyNumber,
    SK_NEGATE | SK_COMMUTE
};

static uint16 BTLessEqualTermData[] = {	/* XXX type clash */
    2,
    BTLessEqualStrategyNumber,
    0x0,
    BTLessEqualStrategyNumber,
    SK_COMMUTE
};

static uint16 BTGreaterEqualTermData[] = {	/* XXX type clash */
    2,
    BTGreaterEqualStrategyNumber,
    0x0,
    BTGreaterEqualStrategyNumber,
    SK_COMMUTE
};

static uint16 BTGreaterTermData[] = {	/* XXX type clash */
    2,
    BTGreaterStrategyNumber,
    SK_NEGATE,
    BTGreaterStrategyNumber,
    SK_NEGATE | SK_COMMUTE
};

/* NULL-terminated list of the term expressions for equality */
static StrategyTerm BTEqualExpressionData[] = {
    (StrategyTerm)BTLessTermData,	/* XXX */
    (StrategyTerm)BTLessEqualTermData,	/* XXX */
    (StrategyTerm)BTGreaterEqualTermData,	/* XXX */
    (StrategyTerm)BTGreaterTermData,	/* XXX */
    NULL
};

/*
 *  The evaluation data handed to the generic strategy-map machinery by
 *  _bt_getstrat() and _bt_invokestrat() below.
 */
static StrategyEvaluationData BTEvaluationData = {
    /* XXX static for simplicity */

    BTMaxStrategyNumber,
    (StrategyTransformMap)BTNegate,	/* XXX */
    (StrategyTransformMap)BTCommute,	/* XXX */
    (StrategyTransformMap)BTNegateCommute,	/* XXX */

    { NULL, NULL, (StrategyExpression)BTEqualExpressionData, NULL, NULL,
      NULL,NULL,NULL,NULL,NULL,NULL,NULL}
};
|
||||
|
||||
/* ----------------------------------------------------------------
|
||||
* RelationGetBTStrategy
|
||||
* ----------------------------------------------------------------
|
||||
*/
|
||||
|
||||
StrategyNumber
|
||||
_bt_getstrat(Relation rel,
|
||||
AttrNumber attno,
|
||||
RegProcedure proc)
|
||||
{
|
||||
StrategyNumber strat;
|
||||
|
||||
strat = RelationGetStrategy(rel, attno, &BTEvaluationData, proc);
|
||||
|
||||
Assert(StrategyNumberIsValid(strat));
|
||||
|
||||
return (strat);
|
||||
}
|
||||
|
||||
bool
|
||||
_bt_invokestrat(Relation rel,
|
||||
AttrNumber attno,
|
||||
StrategyNumber strat,
|
||||
Datum left,
|
||||
Datum right)
|
||||
{
|
||||
return (RelationInvokeStrategy(rel, &BTEvaluationData, attno, strat,
|
||||
left, right));
|
||||
}
|
239
src/backend/access/nbtree/nbtutils.c
Normal file
239
src/backend/access/nbtree/nbtutils.c
Normal file
@@ -0,0 +1,239 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* btutils.c--
|
||||
* Utility code for Postgres btree implementation.
|
||||
*
|
||||
* Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtutils.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include "postgres.h"
|
||||
|
||||
#include "storage/bufmgr.h"
|
||||
#include "storage/bufpage.h"
|
||||
|
||||
#include "fmgr.h"
|
||||
#include "utils/elog.h"
|
||||
#include "utils/palloc.h"
|
||||
#include "utils/rel.h"
|
||||
#include "utils/excid.h"
|
||||
#include "utils/datum.h"
|
||||
|
||||
#include "access/heapam.h"
|
||||
#include "access/genam.h"
|
||||
#include "access/iqual.h"
|
||||
#include "access/nbtree.h"
|
||||
|
||||
ScanKey
|
||||
_bt_mkscankey(Relation rel, IndexTuple itup)
|
||||
{
|
||||
ScanKey skey;
|
||||
TupleDesc itupdesc;
|
||||
int natts;
|
||||
int i;
|
||||
Datum arg;
|
||||
RegProcedure proc;
|
||||
bool null;
|
||||
|
||||
natts = rel->rd_rel->relnatts;
|
||||
itupdesc = RelationGetTupleDescriptor(rel);
|
||||
|
||||
skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));
|
||||
|
||||
for (i = 0; i < natts; i++) {
|
||||
arg = index_getattr(itup, i + 1, itupdesc, &null);
|
||||
proc = index_getprocid(rel, i + 1, BTORDER_PROC);
|
||||
ScanKeyEntryInitialize(&skey[i],
|
||||
0x0, (AttrNumber) (i + 1), proc, arg);
|
||||
}
|
||||
|
||||
return (skey);
|
||||
}
|
||||
|
||||
void
|
||||
_bt_freeskey(ScanKey skey)
|
||||
{
|
||||
pfree(skey);
|
||||
}
|
||||
|
||||
void
|
||||
_bt_freestack(BTStack stack)
|
||||
{
|
||||
BTStack ostack;
|
||||
|
||||
while (stack != (BTStack) NULL) {
|
||||
ostack = stack;
|
||||
stack = stack->bts_parent;
|
||||
pfree(ostack->bts_btitem);
|
||||
pfree(ostack);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* _bt_orderkeys() -- Put keys in a sensible order for conjunctive quals.
|
||||
*
|
||||
* The order of the keys in the qual match the ordering imposed by
|
||||
* the index. This routine only needs to be called if there are
|
||||
* more than one qual clauses using this index.
|
||||
*/
|
||||
void
|
||||
_bt_orderkeys(Relation relation, uint16 *numberOfKeys, ScanKey key)
|
||||
{
|
||||
ScanKey xform;
|
||||
ScanKeyData *cur;
|
||||
StrategyMap map;
|
||||
int nbytes;
|
||||
long test;
|
||||
int i, j;
|
||||
int init[BTMaxStrategyNumber+1];
|
||||
|
||||
/* haven't looked at any strategies yet */
|
||||
for (i = 0; i <= BTMaxStrategyNumber; i++)
|
||||
init[i] = 0;
|
||||
|
||||
/* get space for the modified array of keys */
|
||||
nbytes = BTMaxStrategyNumber * sizeof(ScanKeyData);
|
||||
xform = (ScanKey) palloc(nbytes);
|
||||
memset(xform, 0, nbytes);
|
||||
|
||||
|
||||
/* get the strategy map for this index/attribute pair */
|
||||
/*
|
||||
* XXX
|
||||
* When we support multiple keys in a single index, this is what
|
||||
* we'll want to do. At present, the planner is hosed, so we
|
||||
* hard-wire the attribute number below. Postgres only does single-
|
||||
* key indices...
|
||||
* map = IndexStrategyGetStrategyMap(RelationGetIndexStrategy(relation),
|
||||
* BTMaxStrategyNumber,
|
||||
* key->data[0].attributeNumber);
|
||||
*/
|
||||
map = IndexStrategyGetStrategyMap(RelationGetIndexStrategy(relation),
|
||||
BTMaxStrategyNumber,
|
||||
1 /* XXX */ );
|
||||
|
||||
/* check each key passed in */
|
||||
for (i = *numberOfKeys; --i >= 0; ) {
|
||||
cur = &key[i];
|
||||
for (j = BTMaxStrategyNumber; --j >= 0; ) {
|
||||
if (cur->sk_procedure == map->entry[j].sk_procedure)
|
||||
break;
|
||||
}
|
||||
|
||||
/* have we seen one of these before? */
|
||||
if (init[j]) {
|
||||
/* yup, use the appropriate value */
|
||||
test =
|
||||
(long) FMGR_PTR2(cur->sk_func, cur->sk_procedure,
|
||||
cur->sk_argument, xform[j].sk_argument);
|
||||
if (test)
|
||||
xform[j].sk_argument = cur->sk_argument;
|
||||
} else {
|
||||
/* nope, use this value */
|
||||
memmove(&xform[j], cur, sizeof(*cur));
|
||||
|
||||
init[j] = 1;
|
||||
}
|
||||
}
|
||||
|
||||
/* if = has been specified, no other key will be used */
|
||||
if (init[BTEqualStrategyNumber - 1]) {
|
||||
init[BTLessStrategyNumber - 1] = 0;
|
||||
init[BTLessEqualStrategyNumber - 1] = 0;
|
||||
init[BTGreaterEqualStrategyNumber - 1] = 0;
|
||||
init[BTGreaterStrategyNumber - 1] = 0;
|
||||
}
|
||||
|
||||
/* only one of <, <= */
|
||||
if (init[BTLessStrategyNumber - 1]
|
||||
&& init[BTLessEqualStrategyNumber - 1]) {
|
||||
|
||||
ScanKeyData *lt, *le;
|
||||
|
||||
lt = &xform[BTLessStrategyNumber - 1];
|
||||
le = &xform[BTLessEqualStrategyNumber - 1];
|
||||
|
||||
/*
|
||||
* DO NOT use the cached function stuff here -- this is key
|
||||
* ordering, happens only when the user expresses a hokey
|
||||
* qualification, and gets executed only once, anyway. The
|
||||
* transform maps are hard-coded, and can't be initialized
|
||||
* in the correct way.
|
||||
*/
|
||||
|
||||
test = (long) fmgr(le->sk_procedure, le->sk_argument, lt->sk_argument);
|
||||
|
||||
if (test)
|
||||
init[BTLessEqualStrategyNumber - 1] = 0;
|
||||
else
|
||||
init[BTLessStrategyNumber - 1] = 0;
|
||||
}
|
||||
|
||||
/* only one of >, >= */
|
||||
if (init[BTGreaterStrategyNumber - 1]
|
||||
&& init[BTGreaterEqualStrategyNumber - 1]) {
|
||||
|
||||
ScanKeyData *gt, *ge;
|
||||
|
||||
gt = &xform[BTGreaterStrategyNumber - 1];
|
||||
ge = &xform[BTGreaterEqualStrategyNumber - 1];
|
||||
|
||||
/* see note above on function cache */
|
||||
test = (long) fmgr(ge->sk_procedure, gt->sk_argument, gt->sk_argument);
|
||||
|
||||
if (test)
|
||||
init[BTGreaterStrategyNumber - 1] = 0;
|
||||
else
|
||||
init[BTGreaterEqualStrategyNumber - 1] = 0;
|
||||
}
|
||||
|
||||
/* okay, reorder and count */
|
||||
j = 0;
|
||||
|
||||
for (i = BTMaxStrategyNumber; --i >= 0; )
|
||||
if (init[i])
|
||||
key[j++] = xform[i];
|
||||
|
||||
*numberOfKeys = j;
|
||||
|
||||
pfree(xform);
|
||||
}
|
||||
|
||||
bool
|
||||
_bt_checkqual(IndexScanDesc scan, IndexTuple itup)
|
||||
{
|
||||
if (scan->numberOfKeys > 0)
|
||||
return (index_keytest(itup, RelationGetTupleDescriptor(scan->relation),
|
||||
scan->numberOfKeys, scan->keyData));
|
||||
else
|
||||
return (true);
|
||||
}
|
||||
|
||||
BTItem
|
||||
_bt_formitem(IndexTuple itup)
|
||||
{
|
||||
int nbytes_btitem;
|
||||
BTItem btitem;
|
||||
Size tuplen;
|
||||
extern Oid newoid();
|
||||
|
||||
/* disallow nulls in btree keys */
|
||||
if (itup->t_info & INDEX_NULL_MASK)
|
||||
elog(WARN, "btree indices cannot include null keys");
|
||||
|
||||
/* make a copy of the index tuple with room for the sequence number */
|
||||
tuplen = IndexTupleSize(itup);
|
||||
nbytes_btitem = tuplen +
|
||||
(sizeof(BTItemData) - sizeof(IndexTupleData));
|
||||
|
||||
btitem = (BTItem) palloc(nbytes_btitem);
|
||||
memmove((char *) &(btitem->bti_itup), (char *) itup, tuplen);
|
||||
|
||||
btitem->bti_oid = newoid();
|
||||
return (btitem);
|
||||
}
|
Reference in New Issue
Block a user