Make btree index structure adjustments and WAL logging changes needed to

support btree compaction, as per proposal of a few days ago. btree index pages no longer store parent links, instead they have a level indicator (counting up from zero for leaf pages). The FixBTree recovery logic is removed, and replaced by code that detects missing parent-level insertions during WAL replay. Also, generate appropriate WAL entries when updating btree metapage and when building a btree index from scratch. I believe btree indexes are now completely WAL-legal for the first time. initdb forced due to index and WAL changes.
2025-07-28 23:42:10 +03:00 · 2003-02-21 00:06:22 +00:00
parent 4df0f1d26f
commit 70508ba7ae
13 changed files with 2179 additions and 1875 deletions
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $Id: nbtree.h,v 1.63 2002/07/02 05:48:44 momjian Exp $
+ * $Id: nbtree.h,v 1.64 2003/02/21 00:06:22 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -22,46 +22,55 @@
 /*
 *	BTPageOpaqueData -- At the end of every page, we store a pointer
 *	to both siblings in the tree.  This is used to do forward/backward
- *	index scans.  See Lehman and Yao's paper for more
- *	info.  In addition, we need to know what type of page this is
- *	(leaf or internal), and whether the page is available for reuse.
+ *	index scans.  The next-page link is also critical for recovery when
+ *	a search has navigated to the wrong page due to concurrent page splits
+ *	or deletions; see src/backend/access/nbtree/README for more info.
 *
- *	We also store a back-link to the parent page, but this cannot be trusted
- *	very far since it does not get updated when the parent is split.
- *	See backend/access/nbtree/README for details.
+ *  In addition, we store the page's btree level (counting upwards from
+ *	zero at a leaf page) as well as some flag bits indicating the page type
+ *	and status.  If the page is deleted, we replace the level with the
+ *	next-transaction-ID value indicating when it is safe to reclaim the page.
+ *
+ *	NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested
+ *	instead.
 */

 typedef struct BTPageOpaqueData
 {
-	BlockNumber btpo_prev;		/* used for backward index scans */
-	BlockNumber btpo_next;		/* used for forward index scans */
-	BlockNumber btpo_parent;	/* pointer to parent, but not updated on
-								 * parent split */
-	uint16		btpo_flags;		/* LEAF?, ROOT?, FREE?, META?, REORDER? */
-
+	BlockNumber btpo_prev;		/* left sibling, or P_NONE if leftmost */
+	BlockNumber btpo_next;		/* right sibling, or P_NONE if rightmost */
+	union
+	{
+		uint32	level;			/* tree level --- zero for leaf pages */
+		TransactionId xact;		/* next transaction ID, if deleted */
+	} btpo;
+	uint16		btpo_flags;		/* flag bits, see below */
 } BTPageOpaqueData;

 typedef BTPageOpaqueData *BTPageOpaque;

 /* Bits defined in btpo_flags */
-#define BTP_LEAF		(1 << 0)	/* leaf page, if not internal page */
+#define BTP_LEAF		(1 << 0)	/* leaf page, i.e. not internal page */
 #define BTP_ROOT		(1 << 1)	/* root page (has no parent) */
-#define BTP_FREE		(1 << 2)	/* page not in use */
+#define BTP_DELETED		(1 << 2)	/* page has been deleted from tree */
 #define BTP_META		(1 << 3)	/* meta-page */
-#define BTP_REORDER		(1 << 4)	/* items need reordering */


 /*
 * The Meta page is always the first page in the btree index.
 * Its primary purpose is to point to the location of the btree root page.
+ * We also point to the "fast" root, which is the current effective root;
+ * see README for discussion.
 */

 typedef struct BTMetaPageData
 {
-	uint32		btm_magic;
-	uint32		btm_version;
-	BlockNumber btm_root;
-	int32		btm_level;
+	uint32		btm_magic;		/* should contain BTREE_MAGIC */
+	uint32		btm_version;	/* should contain BTREE_VERSION */
+	BlockNumber btm_root;		/* current root location */
+	uint32		btm_level;		/* tree level of the root page */
+	BlockNumber btm_fastroot;	/* current "fast" root location */
+	uint32		btm_fastlevel;	/* tree level of the "fast" root page */
 } BTMetaPageData;

 #define BTPageGetMeta(p) \
@ -69,12 +78,7 @@ typedef struct BTMetaPageData

 #define BTREE_METAPAGE	0		/* first page is meta */
 #define BTREE_MAGIC		0x053162	/* magic number of btree pages */
-
-#define BTreeInvalidParent(opaque)	\
-	(opaque->btpo_parent == InvalidBlockNumber || \
-		opaque->btpo_parent == BTREE_METAPAGE)
-
-#define BTREE_VERSION	1
+#define BTREE_VERSION	2		/* current version number */

 /*
 * We actually need to be able to fit three items on every page,
@ -84,6 +88,295 @@ typedef struct BTMetaPageData
 	((PageGetPageSize(page) - \
 	  sizeof(PageHeaderData) - \
 	  MAXALIGN(sizeof(BTPageOpaqueData))) / 3 - sizeof(ItemIdData))
+
+/*
+ *	BTItems are what we store in the btree.  Each item is an index tuple,
+ *	including key and pointer values.  (In some cases either the key or the
+ *	pointer may go unused, see backend/access/nbtree/README for details.)
+ *
+ *	Old comments:
+ *	In addition, we must guarantee that all tuples in the index are unique,
+ *	in order to satisfy some assumptions in Lehman and Yao.  The way that we
+ *	do this is by generating a new OID for every insertion that we do in the
+ *	tree.  This adds eight bytes to the size of btree index tuples.  Note
+ *	that we do not use the OID as part of a composite key; the OID only
+ *	serves as a unique identifier for a given index tuple (logical position
+ *	within a page).
+ *
+ *	New comments:
+ *	actually, we must guarantee that all tuples in A LEVEL
+ *	are unique, not in ALL INDEX. So, we can use bti_itup->t_tid
+ *	as unique identifier for a given index tuple (logical position
+ *	within a level). - vadim 04/09/97
+ */
+
+typedef struct BTItemData
+{
+	IndexTupleData bti_itup;
+} BTItemData;
+
+typedef BTItemData *BTItem;
+
+/*
+ * For XLOG: size without alignment. Sizeof works as long as
+ * IndexTupleData has exactly 8 bytes.
+ */
+#define SizeOfBTItem	sizeof(BTItemData)
+
+/* Test whether items are the "same" per the above notes */
+#define BTItemSame(i1, i2)	  ( (i1)->bti_itup.t_tid.ip_blkid.bi_hi == \
+								(i2)->bti_itup.t_tid.ip_blkid.bi_hi && \
+								(i1)->bti_itup.t_tid.ip_blkid.bi_lo == \
+								(i2)->bti_itup.t_tid.ip_blkid.bi_lo && \
+								(i1)->bti_itup.t_tid.ip_posid == \
+								(i2)->bti_itup.t_tid.ip_posid )
+
+/*
+ *	In general, the btree code tries to localize its knowledge about
+ *	page layout to a couple of routines.  However, we need a special
+ *	value to indicate "no page number" in those places where we expect
+ *	page numbers.  We can use zero for this because we never need to
+ *	make a pointer to the metadata page.
+ */
+
+#define P_NONE			0
+
+/*
+ * Macros to test whether a page is leftmost or rightmost on its tree level,
+ * as well as other state info kept in the opaque data.
+ */
+#define P_LEFTMOST(opaque)		((opaque)->btpo_prev == P_NONE)
+#define P_RIGHTMOST(opaque)		((opaque)->btpo_next == P_NONE)
+#define P_ISLEAF(opaque)		((opaque)->btpo_flags & BTP_LEAF)
+#define P_ISROOT(opaque)		((opaque)->btpo_flags & BTP_ROOT)
+#define P_ISDELETED(opaque)		((opaque)->btpo_flags & BTP_DELETED)
+
+/*
+ *	Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
+ *	page.  The high key is not a data key, but gives info about what range of
+ *	keys is supposed to be on this page.  The high key on a page is required
+ *	to be greater than or equal to any data key that appears on the page.
+ *	If we find ourselves trying to insert a key > high key, we know we need
+ *	to move right (this should only happen if the page was split since we
+ *	examined the parent page).
+ *
+ *	Our insertion algorithm guarantees that we can use the initial least key
+ *	on our right sibling as the high key.  Once a page is created, its high
+ *	key changes only if the page is split.
+ *
+ *	On a non-rightmost page, the high key lives in item 1 and data items
+ *	start in item 2.  Rightmost pages have no high key, so we store data
+ *	items beginning in item 1.
+ */
+
+#define P_HIKEY				((OffsetNumber) 1)
+#define P_FIRSTKEY			((OffsetNumber) 2)
+#define P_FIRSTDATAKEY(opaque)	(P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
+
+/*
+ * XLOG records for btree operations
+ *
+ * XLOG allows to store some information in high 4 bits of log
+ * record xl_info field
+ */
+#define XLOG_BTREE_INSERT_LEAF	0x00	/* add btitem without split */
+#define XLOG_BTREE_INSERT_UPPER	0x10	/* same, on a non-leaf page */
+#define XLOG_BTREE_INSERT_META	0x20	/* same, plus update metapage */
+#define XLOG_BTREE_SPLIT_L		0x30	/* add btitem with split */
+#define XLOG_BTREE_SPLIT_R		0x40	/* as above, new item on right */
+#define XLOG_BTREE_SPLIT_L_ROOT	0x50	/* add btitem with split of root */
+#define XLOG_BTREE_SPLIT_R_ROOT	0x60	/* as above, new item on right */
+#define XLOG_BTREE_DELETE		0x70	/* delete leaf btitem */
+#define XLOG_BTREE_DELETE_PAGE	0x80	/* delete an entire page */
+#define XLOG_BTREE_DELETE_PAGE_META	0x90 /* same, plus update metapage */
+#define XLOG_BTREE_NEWROOT		0xA0	/* new root page */
+#define XLOG_BTREE_NEWMETA		0xB0	/* update metadata page */
+#define XLOG_BTREE_NEWPAGE		0xC0	/* new index page during build */
+
+/*
+ * All that we need to find changed index tuple
+ */
+typedef struct xl_btreetid
+{
+	RelFileNode node;
+	ItemPointerData tid;		/* changed tuple id */
+} xl_btreetid;
+
+/*
+ * All that we need to regenerate the meta-data page
+ */
+typedef struct xl_btree_metadata
+{
+	BlockNumber root;
+	uint32		level;
+	BlockNumber fastroot;
+	uint32		fastlevel;
+} xl_btree_metadata;
+
+/*
+ * This is what we need to know about simple (without split) insert.
+ *
+ * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META.
+ * Note that INSERT_META implies it's not a leaf page.
+ */
+typedef struct xl_btree_insert
+{
+	xl_btreetid target;			/* inserted tuple id */
+	/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_INSERT_META */
+	/* BTITEM FOLLOWS AT END OF STRUCT */
+} xl_btree_insert;
+
+#define SizeOfBtreeInsert	(offsetof(xl_btreetid, tid) + SizeOfIptrData)
+
+/*
+ * On insert with split we save items of both left and right siblings
+ * and restore content of both pages from log record.  This way takes less
+ * xlog space than the normal approach, because if we did it standardly,
+ * XLogInsert would almost always think the right page is new and store its
+ * whole page image.
+ *
+ * Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record.
+ * The _L and _R variants indicate whether the inserted btitem went into the
+ * left or right split page (and thus, whether otherblk is the right or left
+ * page of the split pair).  The _ROOT variants indicate that we are splitting
+ * the root page, and thus that a newroot record rather than an insert or
+ * split record should follow.  Note that a split record never carries a
+ * metapage update --- we'll do that in the parent-level update.
+ */
+typedef struct xl_btree_split
+{
+	xl_btreetid target;			/* inserted tuple id */
+	BlockNumber otherblk;		/* second block participated in split: */
+	/* first one is stored in target' tid */
+	BlockNumber leftblk;		/* prev/left block */
+	BlockNumber rightblk;		/* next/right block */
+	uint32		level;			/* tree level of page being split */
+	uint16		leftlen;		/* len of left page items below */
+	/* LEFT AND RIGHT PAGES TUPLES FOLLOW AT THE END */
+} xl_btree_split;
+
+#define SizeOfBtreeSplit	(offsetof(xl_btree_split, leftlen) + sizeof(uint16))
+
+/*
+ * This is what we need to know about delete of an individual leaf btitem
+ */
+typedef struct xl_btree_delete
+{
+	xl_btreetid target;			/* deleted tuple id */
+} xl_btree_delete;
+
+#define SizeOfBtreeDelete	(offsetof(xl_btreetid, tid) + SizeOfIptrData)
+
+/*
+ * This is what we need to know about deletion of a btree page.  The target
+ * identifies the tuple removed from the parent page (note that we remove
+ * this tuple's downlink and the *following* tuple's key).  Note we do not
+ * store any content for the deleted page --- it is just rewritten as empty
+ * during recovery.
+ */
+typedef struct xl_btree_delete_page
+{
+	xl_btreetid target;			/* deleted tuple id in parent page */
+	BlockNumber deadblk;		/* child block being deleted */
+	BlockNumber leftblk;		/* child block's left sibling, if any */
+	BlockNumber rightblk;		/* child block's right sibling */
+	/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_DELETE_PAGE_META */
+} xl_btree_delete_page;
+
+#define SizeOfBtreeDeletePage	(offsetof(xl_btree_delete_page, rightblk) + sizeof(BlockNumber))
+
+/*
+ * New root log record.  There are zero btitems if this is to establish an
+ * empty root, or two if it is the result of splitting an old root.
+ *
+ * Note that although this implies rewriting the metadata page, we don't need
+ * an xl_btree_metadata record --- the rootblk and level are sufficient.
+ */
+typedef struct xl_btree_newroot
+{
+	RelFileNode node;
+	BlockNumber rootblk;		/* location of new root */
+	uint32		level;			/* its tree level */
+	/* 0 or 2 BTITEMS FOLLOW AT END OF STRUCT */
+} xl_btree_newroot;
+
+#define SizeOfBtreeNewroot	(offsetof(xl_btree_newroot, level) + sizeof(uint32))
+
+/*
+ * New metapage log record.  This is not issued during routine operations;
+ * it's only used when initializing an empty index and at completion of
+ * index build.
+ */
+typedef struct xl_btree_newmeta
+{
+	RelFileNode node;
+	xl_btree_metadata meta;
+} xl_btree_newmeta;
+
+#define SizeOfBtreeNewmeta	(sizeof(xl_btree_newmeta))
+
+/*
+ * New index page log record.  This is only used while building a new index.
+ */
+typedef struct xl_btree_newpage
+{
+	RelFileNode node;
+	BlockNumber blkno;			/* location of new page */
+	/* entire page contents follow at end of record */
+} xl_btree_newpage;
+
+#define SizeOfBtreeNewpage	(offsetof(xl_btree_newpage, blkno) + sizeof(BlockNumber))
+
+
+/*
+ *	Operator strategy numbers -- ordering of these is <, <=, =, >=, >
+ */
+
+#define BTLessStrategyNumber			1
+#define BTLessEqualStrategyNumber		2
+#define BTEqualStrategyNumber			3
+#define BTGreaterEqualStrategyNumber	4
+#define BTGreaterStrategyNumber			5
+#define BTMaxStrategyNumber				5
+
+/*
+ *	When a new operator class is declared, we require that the user
+ *	supply us with an amproc procedure for determining whether, for
+ *	two keys a and b, a < b, a = b, or a > b.  This routine must
+ *	return < 0, 0, > 0, respectively, in these three cases.  Since we
+ *	only have one such proc in amproc, it's number 1.
+ */
+
+#define BTORDER_PROC	1
+
+/*
+ *	We need to be able to tell the difference between read and write
+ *	requests for pages, in order to do locking correctly.
+ */
+
+#define BT_READ			BUFFER_LOCK_SHARE
+#define BT_WRITE		BUFFER_LOCK_EXCLUSIVE
+
+/*
+ *	BTStackData -- As we descend a tree, we push the (location, downlink)
+ *	pairs from internal pages onto a private stack.  If we split a
+ *	leaf, we use this stack to walk back up the tree and insert data
+ *	into parent pages (and possibly to split them, too).  Lehman and
+ *	Yao's update algorithm guarantees that under no circumstances can
+ *	our private stack give us an irredeemably bad picture up the tree.
+ *	Again, see the paper for details.
+ */
+
+typedef struct BTStackData
+{
+	BlockNumber bts_blkno;
+	OffsetNumber bts_offset;
+	BTItemData	bts_btitem;
+	struct BTStackData *bts_parent;
+} BTStackData;
+
+typedef BTStackData *BTStack;
+
 /*
 *	BTScanOpaqueData is used to remember which buffers we're currently
 *	examining in the scan.	We keep these buffers pinned (but not locked,
@ -116,212 +409,6 @@ typedef struct BTScanOpaqueData

 typedef BTScanOpaqueData *BTScanOpaque;

-/*
- *	BTItems are what we store in the btree.  Each item is an index tuple,
- *	including key and pointer values.  (In some cases either the key or the
- *	pointer may go unused, see backend/access/nbtree/README for details.)
- *
- *	Old comments:
- *	In addition, we must guarantee that all tuples in the index are unique,
- *	in order to satisfy some assumptions in Lehman and Yao.  The way that we
- *	do this is by generating a new OID for every insertion that we do in the
- *	tree.  This adds eight bytes to the size of btree index tuples.  Note
- *	that we do not use the OID as part of a composite key; the OID only
- *	serves as a unique identifier for a given index tuple (logical position
- *	within a page).
- *
- *	New comments:
- *	actually, we must guarantee that all tuples in A LEVEL
- *	are unique, not in ALL INDEX. So, we can use bti_itup->t_tid
- *	as unique identifier for a given index tuple (logical position
- *	within a level). - vadim 04/09/97
- */
-
-typedef struct BTItemData
-{
-	IndexTupleData bti_itup;
-} BTItemData;
-
-typedef BTItemData *BTItem;
-
-/*
- * For XLOG: size without alignement. Sizeof works as long as
- * IndexTupleData has exactly 8 bytes.
- */
-#define SizeOfBTItem	sizeof(BTItemData)
-
-/* Test whether items are the "same" per the above notes */
-#define BTItemSame(i1, i2)	  ( (i1)->bti_itup.t_tid.ip_blkid.bi_hi == \
-								(i2)->bti_itup.t_tid.ip_blkid.bi_hi && \
-								(i1)->bti_itup.t_tid.ip_blkid.bi_lo == \
-								(i2)->bti_itup.t_tid.ip_blkid.bi_lo && \
-								(i1)->bti_itup.t_tid.ip_posid == \
-								(i2)->bti_itup.t_tid.ip_posid )
-
-/*
- *	BTStackData -- As we descend a tree, we push the (key, pointer)
- *	pairs from internal nodes onto a private stack.  If we split a
- *	leaf, we use this stack to walk back up the tree and insert data
- *	into parent nodes (and possibly to split them, too).  Lehman and
- *	Yao's update algorithm guarantees that under no circumstances can
- *	our private stack give us an irredeemably bad picture up the tree.
- *	Again, see the paper for details.
- */
-
-typedef struct BTStackData
-{
-	BlockNumber bts_blkno;
-	OffsetNumber bts_offset;
-	BTItemData	bts_btitem;
-	struct BTStackData *bts_parent;
-} BTStackData;
-
-typedef BTStackData *BTStack;
-
-/*
- *	We need to be able to tell the difference between read and write
- *	requests for pages, in order to do locking correctly.
- */
-
-#define BT_READ			BUFFER_LOCK_SHARE
-#define BT_WRITE		BUFFER_LOCK_EXCLUSIVE
-
-/*
- *	In general, the btree code tries to localize its knowledge about
- *	page layout to a couple of routines.  However, we need a special
- *	value to indicate "no page number" in those places where we expect
- *	page numbers.  We can use zero for this because we never need to
- *	make a pointer to the metadata page.
- */
-
-#define P_NONE			0
-
-/*
- * Macros to test whether a page is leftmost or rightmost on its tree level,
- * as well as other state info kept in the opaque data.
- */
-#define P_LEFTMOST(opaque)		((opaque)->btpo_prev == P_NONE)
-#define P_RIGHTMOST(opaque)		((opaque)->btpo_next == P_NONE)
-#define P_ISLEAF(opaque)		((opaque)->btpo_flags & BTP_LEAF)
-#define P_ISROOT(opaque)		((opaque)->btpo_flags & BTP_ROOT)
-
-/*
- *	Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
- *	page.  The high key is not a data key, but gives info about what range of
- *	keys is supposed to be on this page.  The high key on a page is required
- *	to be greater than or equal to any data key that appears on the page.
- *	If we find ourselves trying to insert a key > high key, we know we need
- *	to move right (this should only happen if the page was split since we
- *	examined the parent page).
- *
- *	Our insertion algorithm guarantees that we can use the initial least key
- *	on our right sibling as the high key.  Once a page is created, its high
- *	key changes only if the page is split.
- *
- *	On a non-rightmost page, the high key lives in item 1 and data items
- *	start in item 2.  Rightmost pages have no high key, so we store data
- *	items beginning in item 1.
- */
-
-#define P_HIKEY				((OffsetNumber) 1)
-#define P_FIRSTKEY			((OffsetNumber) 2)
-#define P_FIRSTDATAKEY(opaque)	(P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
-
-/*
- * XLOG allows to store some information in high 4 bits of log
- * record xl_info field
- */
-#define XLOG_BTREE_DELETE	0x00	/* delete btitem */
-#define XLOG_BTREE_INSERT	0x10	/* add btitem without split */
-#define XLOG_BTREE_SPLIT	0x20	/* add btitem with split */
-#define XLOG_BTREE_SPLEFT	0x30	/* as above + flag that new btitem */
- /* goes to the left sibling */
-#define XLOG_BTREE_NEWROOT	0x40	/* new root page */
-
-#define XLOG_BTREE_LEAF		0x80	/* leaf/internal page was changed */
-
-/*
- * All what we need to find changed index tuple
- */
-typedef struct xl_btreetid
-{
-	RelFileNode node;
-	ItemPointerData tid;		/* changed tuple id */
-} xl_btreetid;
-
-/*
- * This is what we need to know about delete
- */
-typedef struct xl_btree_delete
-{
-	xl_btreetid target;			/* deleted tuple id */
-} xl_btree_delete;
-
-#define SizeOfBtreeDelete	(offsetof(xl_btreetid, tid) + SizeOfIptrData)
-
-/*
- * This is what we need to know about pure (without split) insert
- */
-typedef struct xl_btree_insert
-{
-	xl_btreetid target;			/* inserted tuple id */
-	/* BTITEM FOLLOWS AT END OF STRUCT */
-} xl_btree_insert;
-
-#define SizeOfBtreeInsert	(offsetof(xl_btreetid, tid) + SizeOfIptrData)
-
-/*
- * On insert with split we save items of both left and right siblings
- * and restore content of both pages from log record
- */
-typedef struct xl_btree_split
-{
-	xl_btreetid target;			/* inserted tuple id */
-	BlockIdData otherblk;		/* second block participated in split: */
-	/* first one is stored in target' tid */
-	BlockIdData parentblk;		/* parent block */
-	BlockIdData leftblk;		/* prev left block */
-	BlockIdData rightblk;		/* next right block */
-	uint16		leftlen;		/* len of left page items below */
-	/* LEFT AND RIGHT PAGES ITEMS FOLLOW AT THE END */
-} xl_btree_split;
-
-#define SizeOfBtreeSplit	(offsetof(xl_btree_split, leftlen) + sizeof(uint16))
-
-/*
- * New root log record.
- */
-typedef struct xl_btree_newroot
-{
-	RelFileNode node;
-	int32		level;
-	BlockIdData rootblk;
-	/* 0 or 2 BTITEMS FOLLOW AT END OF STRUCT */
-} xl_btree_newroot;
-
-#define SizeOfBtreeNewroot	(offsetof(xl_btree_newroot, rootblk) + sizeof(BlockIdData))
-
-/*
- *	Operator strategy numbers -- ordering of these is <, <=, =, >=, >
- */
-
-#define BTLessStrategyNumber			1
-#define BTLessEqualStrategyNumber		2
-#define BTEqualStrategyNumber			3
-#define BTGreaterEqualStrategyNumber	4
-#define BTGreaterStrategyNumber			5
-#define BTMaxStrategyNumber				5
-
-/*
- *	When a new operator class is declared, we require that the user
- *	supply us with an amproc procedure for determining whether, for
- *	two keys a and b, a < b, a = b, or a > b.  This routine must
- *	return < 0, 0, > 0, respectively, in these three cases.  Since we
- *	only have one such proc in amproc, it's number 1.
- */
-
-#define BTORDER_PROC	1
-
 /*
 * prototypes for functions in nbtree.c (external entry points for btree)
 */
@ -340,27 +427,26 @@ extern Datum btmarkpos(PG_FUNCTION_ARGS);
 extern Datum btrestrpos(PG_FUNCTION_ARGS);
 extern Datum btbulkdelete(PG_FUNCTION_ARGS);

-extern void btree_redo(XLogRecPtr lsn, XLogRecord *record);
-extern void btree_undo(XLogRecPtr lsn, XLogRecord *record);
-extern void btree_desc(char *buf, uint8 xl_info, char *rec);
-
 /*
 * prototypes for functions in nbtinsert.c
 */
 extern InsertIndexResult _bt_doinsert(Relation rel, BTItem btitem,
 			 bool index_is_unique, Relation heapRel);
+extern void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
+							  BTStack stack, bool is_root, bool is_only);

 /*
 * prototypes for functions in nbtpage.c
 */
 extern void _bt_metapinit(Relation rel);
 extern Buffer _bt_getroot(Relation rel, int access);
+extern Buffer _bt_gettrueroot(Relation rel);
 extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
 extern void _bt_relbuf(Relation rel, Buffer buf);
 extern void _bt_wrtbuf(Relation rel, Buffer buf);
 extern void _bt_wrtnorelbuf(Relation rel, Buffer buf);
 extern void _bt_pageinit(Page page, Size size);
-extern void _bt_metaproot(Relation rel, BlockNumber rootbknum, int level);
+extern void _bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level);
 extern void _bt_itemdel(Relation rel, Buffer buf, ItemPointer tid);

 /*
@ -377,6 +463,7 @@ extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
 extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
 extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
 extern bool _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir);
+extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);

 /*
 * prototypes for functions in nbtstrat.c
@ -407,4 +494,13 @@ extern void _bt_spooldestroy(BTSpool *btspool);
 extern void _bt_spool(BTItem btitem, BTSpool *btspool);
 extern void _bt_leafbuild(BTSpool *btspool, BTSpool *spool2);

+/*
+ * prototypes for functions in nbtxlog.c
+ */
+extern void btree_redo(XLogRecPtr lsn, XLogRecord *record);
+extern void btree_undo(XLogRecPtr lsn, XLogRecord *record);
+extern void btree_desc(char *buf, uint8 xl_info, char *rec);
+extern void btree_xlog_startup(void);
+extern void btree_xlog_cleanup(void);
+
 #endif   /* NBTREE_H */