mirror of
https://github.com/postgres/postgres.git
synced 2025-07-28 23:42:10 +03:00
Make btree index structure adjustments and WAL logging changes needed to
support btree compaction, as per proposal of a few days ago. btree index pages no longer store parent links, instead they have a level indicator (counting up from zero for leaf pages). The FixBTree recovery logic is removed, and replaced by code that detects missing parent-level insertions during WAL replay. Also, generate appropriate WAL entries when updating btree metapage and when building a btree index from scratch. I believe btree indexes are now completely WAL-legal for the first time. initdb forced due to index and WAL changes.
This commit is contained in:
@ -7,7 +7,7 @@
|
||||
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* $Id: nbtree.h,v 1.63 2002/07/02 05:48:44 momjian Exp $
|
||||
* $Id: nbtree.h,v 1.64 2003/02/21 00:06:22 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@ -22,46 +22,55 @@
|
||||
/*
|
||||
* BTPageOpaqueData -- At the end of every page, we store a pointer
|
||||
* to both siblings in the tree. This is used to do forward/backward
|
||||
* index scans. See Lehman and Yao's paper for more
|
||||
* info. In addition, we need to know what type of page this is
|
||||
* (leaf or internal), and whether the page is available for reuse.
|
||||
* index scans. The next-page link is also critical for recovery when
|
||||
* a search has navigated to the wrong page due to concurrent page splits
|
||||
* or deletions; see src/backend/access/nbtree/README for more info.
|
||||
*
|
||||
* We also store a back-link to the parent page, but this cannot be trusted
|
||||
* very far since it does not get updated when the parent is split.
|
||||
* See backend/access/nbtree/README for details.
|
||||
* In addition, we store the page's btree level (counting upwards from
|
||||
* zero at a leaf page) as well as some flag bits indicating the page type
|
||||
* and status. If the page is deleted, we replace the level with the
|
||||
* next-transaction-ID value indicating when it is safe to reclaim the page.
|
||||
*
|
||||
* NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested
|
||||
* instead.
|
||||
*/
|
||||
|
||||
typedef struct BTPageOpaqueData
|
||||
{
|
||||
BlockNumber btpo_prev; /* used for backward index scans */
|
||||
BlockNumber btpo_next; /* used for forward index scans */
|
||||
BlockNumber btpo_parent; /* pointer to parent, but not updated on
|
||||
* parent split */
|
||||
uint16 btpo_flags; /* LEAF?, ROOT?, FREE?, META?, REORDER? */
|
||||
|
||||
BlockNumber btpo_prev; /* left sibling, or P_NONE if leftmost */
|
||||
BlockNumber btpo_next; /* right sibling, or P_NONE if rightmost */
|
||||
union
|
||||
{
|
||||
uint32 level; /* tree level --- zero for leaf pages */
|
||||
TransactionId xact; /* next transaction ID, if deleted */
|
||||
} btpo;
|
||||
uint16 btpo_flags; /* flag bits, see below */
|
||||
} BTPageOpaqueData;
|
||||
|
||||
typedef BTPageOpaqueData *BTPageOpaque;
|
||||
|
||||
/* Bits defined in btpo_flags */
|
||||
#define BTP_LEAF (1 << 0) /* leaf page, if not internal page */
|
||||
#define BTP_LEAF (1 << 0) /* leaf page, i.e. not internal page */
|
||||
#define BTP_ROOT (1 << 1) /* root page (has no parent) */
|
||||
#define BTP_FREE (1 << 2) /* page not in use */
|
||||
#define BTP_DELETED (1 << 2) /* page has been deleted from tree */
|
||||
#define BTP_META (1 << 3) /* meta-page */
|
||||
#define BTP_REORDER (1 << 4) /* items need reordering */
|
||||
|
||||
|
||||
/*
|
||||
* The Meta page is always the first page in the btree index.
|
||||
* Its primary purpose is to point to the location of the btree root page.
|
||||
* We also point to the "fast" root, which is the current effective root;
|
||||
* see README for discussion.
|
||||
*/
|
||||
|
||||
typedef struct BTMetaPageData
|
||||
{
|
||||
uint32 btm_magic;
|
||||
uint32 btm_version;
|
||||
BlockNumber btm_root;
|
||||
int32 btm_level;
|
||||
uint32 btm_magic; /* should contain BTREE_MAGIC */
|
||||
uint32 btm_version; /* should contain BTREE_VERSION */
|
||||
BlockNumber btm_root; /* current root location */
|
||||
uint32 btm_level; /* tree level of the root page */
|
||||
BlockNumber btm_fastroot; /* current "fast" root location */
|
||||
uint32 btm_fastlevel; /* tree level of the "fast" root page */
|
||||
} BTMetaPageData;
|
||||
|
||||
#define BTPageGetMeta(p) \
|
||||
@ -69,12 +78,7 @@ typedef struct BTMetaPageData
|
||||
|
||||
#define BTREE_METAPAGE 0 /* first page is meta */
|
||||
#define BTREE_MAGIC 0x053162 /* magic number of btree pages */
|
||||
|
||||
#define BTreeInvalidParent(opaque) \
|
||||
(opaque->btpo_parent == InvalidBlockNumber || \
|
||||
opaque->btpo_parent == BTREE_METAPAGE)
|
||||
|
||||
#define BTREE_VERSION 1
|
||||
#define BTREE_VERSION 2 /* current version number */
|
||||
|
||||
/*
|
||||
* We actually need to be able to fit three items on every page,
|
||||
@ -84,6 +88,295 @@ typedef struct BTMetaPageData
|
||||
((PageGetPageSize(page) - \
|
||||
sizeof(PageHeaderData) - \
|
||||
MAXALIGN(sizeof(BTPageOpaqueData))) / 3 - sizeof(ItemIdData))
|
||||
|
||||
/*
|
||||
* BTItems are what we store in the btree. Each item is an index tuple,
|
||||
* including key and pointer values. (In some cases either the key or the
|
||||
* pointer may go unused, see backend/access/nbtree/README for details.)
|
||||
*
|
||||
* Old comments:
|
||||
* In addition, we must guarantee that all tuples in the index are unique,
|
||||
* in order to satisfy some assumptions in Lehman and Yao. The way that we
|
||||
* do this is by generating a new OID for every insertion that we do in the
|
||||
* tree. This adds eight bytes to the size of btree index tuples. Note
|
||||
* that we do not use the OID as part of a composite key; the OID only
|
||||
* serves as a unique identifier for a given index tuple (logical position
|
||||
* within a page).
|
||||
*
|
||||
* New comments:
|
||||
* actually, we must guarantee that all tuples in A LEVEL
|
||||
* are unique, not in ALL INDEX. So, we can use bti_itup->t_tid
|
||||
* as unique identifier for a given index tuple (logical position
|
||||
* within a level). - vadim 04/09/97
|
||||
*/
|
||||
|
||||
typedef struct BTItemData
|
||||
{
|
||||
IndexTupleData bti_itup;
|
||||
} BTItemData;
|
||||
|
||||
typedef BTItemData *BTItem;
|
||||
|
||||
/*
|
||||
* For XLOG: size without alignment. Sizeof works as long as
|
||||
* IndexTupleData has exactly 8 bytes.
|
||||
*/
|
||||
#define SizeOfBTItem sizeof(BTItemData)
|
||||
|
||||
/* Test whether items are the "same" per the above notes */
|
||||
#define BTItemSame(i1, i2) ( (i1)->bti_itup.t_tid.ip_blkid.bi_hi == \
|
||||
(i2)->bti_itup.t_tid.ip_blkid.bi_hi && \
|
||||
(i1)->bti_itup.t_tid.ip_blkid.bi_lo == \
|
||||
(i2)->bti_itup.t_tid.ip_blkid.bi_lo && \
|
||||
(i1)->bti_itup.t_tid.ip_posid == \
|
||||
(i2)->bti_itup.t_tid.ip_posid )
|
||||
|
||||
/*
|
||||
* In general, the btree code tries to localize its knowledge about
|
||||
* page layout to a couple of routines. However, we need a special
|
||||
* value to indicate "no page number" in those places where we expect
|
||||
* page numbers. We can use zero for this because we never need to
|
||||
* make a pointer to the metadata page.
|
||||
*/
|
||||
|
||||
#define P_NONE 0
|
||||
|
||||
/*
|
||||
* Macros to test whether a page is leftmost or rightmost on its tree level,
|
||||
* as well as other state info kept in the opaque data.
|
||||
*/
|
||||
#define P_LEFTMOST(opaque) ((opaque)->btpo_prev == P_NONE)
|
||||
#define P_RIGHTMOST(opaque) ((opaque)->btpo_next == P_NONE)
|
||||
#define P_ISLEAF(opaque) ((opaque)->btpo_flags & BTP_LEAF)
|
||||
#define P_ISROOT(opaque) ((opaque)->btpo_flags & BTP_ROOT)
|
||||
#define P_ISDELETED(opaque) ((opaque)->btpo_flags & BTP_DELETED)
|
||||
|
||||
/*
|
||||
* Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
|
||||
* page. The high key is not a data key, but gives info about what range of
|
||||
* keys is supposed to be on this page. The high key on a page is required
|
||||
* to be greater than or equal to any data key that appears on the page.
|
||||
* If we find ourselves trying to insert a key > high key, we know we need
|
||||
* to move right (this should only happen if the page was split since we
|
||||
* examined the parent page).
|
||||
*
|
||||
* Our insertion algorithm guarantees that we can use the initial least key
|
||||
* on our right sibling as the high key. Once a page is created, its high
|
||||
* key changes only if the page is split.
|
||||
*
|
||||
* On a non-rightmost page, the high key lives in item 1 and data items
|
||||
* start in item 2. Rightmost pages have no high key, so we store data
|
||||
* items beginning in item 1.
|
||||
*/
|
||||
|
||||
#define P_HIKEY ((OffsetNumber) 1)
|
||||
#define P_FIRSTKEY ((OffsetNumber) 2)
|
||||
#define P_FIRSTDATAKEY(opaque) (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
|
||||
|
||||
/*
|
||||
* XLOG records for btree operations
|
||||
*
|
||||
* XLOG allows us to store some information in the high 4 bits of the log
|
||||
* record's xl_info field
|
||||
*/
|
||||
#define XLOG_BTREE_INSERT_LEAF 0x00 /* add btitem without split */
|
||||
#define XLOG_BTREE_INSERT_UPPER 0x10 /* same, on a non-leaf page */
|
||||
#define XLOG_BTREE_INSERT_META 0x20 /* same, plus update metapage */
|
||||
#define XLOG_BTREE_SPLIT_L 0x30 /* add btitem with split */
|
||||
#define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */
|
||||
#define XLOG_BTREE_SPLIT_L_ROOT 0x50 /* add btitem with split of root */
|
||||
#define XLOG_BTREE_SPLIT_R_ROOT 0x60 /* as above, new item on right */
|
||||
#define XLOG_BTREE_DELETE 0x70 /* delete leaf btitem */
|
||||
#define XLOG_BTREE_DELETE_PAGE 0x80 /* delete an entire page */
|
||||
#define XLOG_BTREE_DELETE_PAGE_META 0x90 /* same, plus update metapage */
|
||||
#define XLOG_BTREE_NEWROOT 0xA0 /* new root page */
|
||||
#define XLOG_BTREE_NEWMETA 0xB0 /* update metadata page */
|
||||
#define XLOG_BTREE_NEWPAGE 0xC0 /* new index page during build */
|
||||
|
||||
/*
|
||||
* All that we need to find the changed index tuple
|
||||
*/
|
||||
typedef struct xl_btreetid
|
||||
{
|
||||
RelFileNode node;
|
||||
ItemPointerData tid; /* changed tuple id */
|
||||
} xl_btreetid;
|
||||
|
||||
/*
|
||||
* All that we need to regenerate the meta-data page
|
||||
*/
|
||||
typedef struct xl_btree_metadata
|
||||
{
|
||||
BlockNumber root;
|
||||
uint32 level;
|
||||
BlockNumber fastroot;
|
||||
uint32 fastlevel;
|
||||
} xl_btree_metadata;
|
||||
|
||||
/*
|
||||
* This is what we need to know about simple (without split) insert.
|
||||
*
|
||||
* This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META.
|
||||
* Note that INSERT_META implies it's not a leaf page.
|
||||
*/
|
||||
typedef struct xl_btree_insert
|
||||
{
|
||||
xl_btreetid target; /* inserted tuple id */
|
||||
/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_INSERT_META */
|
||||
/* BTITEM FOLLOWS AT END OF STRUCT */
|
||||
} xl_btree_insert;
|
||||
|
||||
#define SizeOfBtreeInsert (offsetof(xl_btreetid, tid) + SizeOfIptrData)
|
||||
|
||||
/*
|
||||
* On insert with split we save items of both left and right siblings
|
||||
* and restore content of both pages from log record. This way takes less
|
||||
* xlog space than the normal approach, because if we did it standardly,
|
||||
* XLogInsert would almost always think the right page is new and store its
|
||||
* whole page image.
|
||||
*
|
||||
* Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record.
|
||||
* The _L and _R variants indicate whether the inserted btitem went into the
|
||||
* left or right split page (and thus, whether otherblk is the right or left
|
||||
* page of the split pair). The _ROOT variants indicate that we are splitting
|
||||
* the root page, and thus that a newroot record rather than an insert or
|
||||
* split record should follow. Note that a split record never carries a
|
||||
* metapage update --- we'll do that in the parent-level update.
|
||||
*/
|
||||
typedef struct xl_btree_split
|
||||
{
|
||||
xl_btreetid target; /* inserted tuple id */
|
||||
BlockNumber otherblk; /* second block participating in split: */
|
||||
/* first one is stored in target's tid */
|
||||
BlockNumber leftblk; /* prev/left block */
|
||||
BlockNumber rightblk; /* next/right block */
|
||||
uint32 level; /* tree level of page being split */
|
||||
uint16 leftlen; /* len of left page items below */
|
||||
/* LEFT AND RIGHT PAGES TUPLES FOLLOW AT THE END */
|
||||
} xl_btree_split;
|
||||
|
||||
#define SizeOfBtreeSplit (offsetof(xl_btree_split, leftlen) + sizeof(uint16))
|
||||
|
||||
/*
|
||||
* This is what we need to know about delete of an individual leaf btitem
|
||||
*/
|
||||
typedef struct xl_btree_delete
|
||||
{
|
||||
xl_btreetid target; /* deleted tuple id */
|
||||
} xl_btree_delete;
|
||||
|
||||
#define SizeOfBtreeDelete (offsetof(xl_btreetid, tid) + SizeOfIptrData)
|
||||
|
||||
/*
|
||||
* This is what we need to know about deletion of a btree page. The target
|
||||
* identifies the tuple removed from the parent page (note that we remove
|
||||
* this tuple's downlink and the *following* tuple's key). Note we do not
|
||||
* store any content for the deleted page --- it is just rewritten as empty
|
||||
* during recovery.
|
||||
*/
|
||||
typedef struct xl_btree_delete_page
|
||||
{
|
||||
xl_btreetid target; /* deleted tuple id in parent page */
|
||||
BlockNumber deadblk; /* child block being deleted */
|
||||
BlockNumber leftblk; /* child block's left sibling, if any */
|
||||
BlockNumber rightblk; /* child block's right sibling */
|
||||
/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_DELETE_PAGE_META */
|
||||
} xl_btree_delete_page;
|
||||
|
||||
#define SizeOfBtreeDeletePage (offsetof(xl_btree_delete_page, rightblk) + sizeof(BlockNumber))
|
||||
|
||||
/*
|
||||
* New root log record. There are zero btitems if this is to establish an
|
||||
* empty root, or two if it is the result of splitting an old root.
|
||||
*
|
||||
* Note that although this implies rewriting the metadata page, we don't need
|
||||
* an xl_btree_metadata record --- the rootblk and level are sufficient.
|
||||
*/
|
||||
typedef struct xl_btree_newroot
|
||||
{
|
||||
RelFileNode node;
|
||||
BlockNumber rootblk; /* location of new root */
|
||||
uint32 level; /* its tree level */
|
||||
/* 0 or 2 BTITEMS FOLLOW AT END OF STRUCT */
|
||||
} xl_btree_newroot;
|
||||
|
||||
#define SizeOfBtreeNewroot (offsetof(xl_btree_newroot, level) + sizeof(uint32))
|
||||
|
||||
/*
|
||||
* New metapage log record. This is not issued during routine operations;
|
||||
* it's only used when initializing an empty index and at completion of
|
||||
* index build.
|
||||
*/
|
||||
typedef struct xl_btree_newmeta
|
||||
{
|
||||
RelFileNode node;
|
||||
xl_btree_metadata meta;
|
||||
} xl_btree_newmeta;
|
||||
|
||||
#define SizeOfBtreeNewmeta (sizeof(xl_btree_newmeta))
|
||||
|
||||
/*
|
||||
* New index page log record. This is only used while building a new index.
|
||||
*/
|
||||
typedef struct xl_btree_newpage
|
||||
{
|
||||
RelFileNode node;
|
||||
BlockNumber blkno; /* location of new page */
|
||||
/* entire page contents follow at end of record */
|
||||
} xl_btree_newpage;
|
||||
|
||||
#define SizeOfBtreeNewpage (offsetof(xl_btree_newpage, blkno) + sizeof(BlockNumber))
|
||||
|
||||
|
||||
/*
|
||||
* Operator strategy numbers -- ordering of these is <, <=, =, >=, >
|
||||
*/
|
||||
|
||||
#define BTLessStrategyNumber 1
|
||||
#define BTLessEqualStrategyNumber 2
|
||||
#define BTEqualStrategyNumber 3
|
||||
#define BTGreaterEqualStrategyNumber 4
|
||||
#define BTGreaterStrategyNumber 5
|
||||
#define BTMaxStrategyNumber 5
|
||||
|
||||
/*
|
||||
* When a new operator class is declared, we require that the user
|
||||
* supply us with an amproc procedure for determining whether, for
|
||||
* two keys a and b, a < b, a = b, or a > b. This routine must
|
||||
* return < 0, 0, > 0, respectively, in these three cases. Since we
|
||||
* only have one such proc in amproc, it's number 1.
|
||||
*/
|
||||
|
||||
#define BTORDER_PROC 1
|
||||
|
||||
/*
|
||||
* We need to be able to tell the difference between read and write
|
||||
* requests for pages, in order to do locking correctly.
|
||||
*/
|
||||
|
||||
#define BT_READ BUFFER_LOCK_SHARE
|
||||
#define BT_WRITE BUFFER_LOCK_EXCLUSIVE
|
||||
|
||||
/*
|
||||
* BTStackData -- As we descend a tree, we push the (location, downlink)
|
||||
* pairs from internal pages onto a private stack. If we split a
|
||||
* leaf, we use this stack to walk back up the tree and insert data
|
||||
* into parent pages (and possibly to split them, too). Lehman and
|
||||
* Yao's update algorithm guarantees that under no circumstances can
|
||||
* our private stack give us an irredeemably bad picture up the tree.
|
||||
* Again, see the paper for details.
|
||||
*/
|
||||
|
||||
typedef struct BTStackData
|
||||
{
|
||||
BlockNumber bts_blkno;
|
||||
OffsetNumber bts_offset;
|
||||
BTItemData bts_btitem;
|
||||
struct BTStackData *bts_parent;
|
||||
} BTStackData;
|
||||
|
||||
typedef BTStackData *BTStack;
|
||||
|
||||
/*
|
||||
* BTScanOpaqueData is used to remember which buffers we're currently
|
||||
* examining in the scan. We keep these buffers pinned (but not locked,
|
||||
@ -116,212 +409,6 @@ typedef struct BTScanOpaqueData
|
||||
|
||||
typedef BTScanOpaqueData *BTScanOpaque;
|
||||
|
||||
/*
|
||||
* BTItems are what we store in the btree. Each item is an index tuple,
|
||||
* including key and pointer values. (In some cases either the key or the
|
||||
* pointer may go unused, see backend/access/nbtree/README for details.)
|
||||
*
|
||||
* Old comments:
|
||||
* In addition, we must guarantee that all tuples in the index are unique,
|
||||
* in order to satisfy some assumptions in Lehman and Yao. The way that we
|
||||
* do this is by generating a new OID for every insertion that we do in the
|
||||
* tree. This adds eight bytes to the size of btree index tuples. Note
|
||||
* that we do not use the OID as part of a composite key; the OID only
|
||||
* serves as a unique identifier for a given index tuple (logical position
|
||||
* within a page).
|
||||
*
|
||||
* New comments:
|
||||
* actually, we must guarantee that all tuples in A LEVEL
|
||||
* are unique, not in ALL INDEX. So, we can use bti_itup->t_tid
|
||||
* as unique identifier for a given index tuple (logical position
|
||||
* within a level). - vadim 04/09/97
|
||||
*/
|
||||
|
||||
typedef struct BTItemData
|
||||
{
|
||||
IndexTupleData bti_itup;
|
||||
} BTItemData;
|
||||
|
||||
typedef BTItemData *BTItem;
|
||||
|
||||
/*
|
||||
* For XLOG: size without alignement. Sizeof works as long as
|
||||
* IndexTupleData has exactly 8 bytes.
|
||||
*/
|
||||
#define SizeOfBTItem sizeof(BTItemData)
|
||||
|
||||
/* Test whether items are the "same" per the above notes */
|
||||
#define BTItemSame(i1, i2) ( (i1)->bti_itup.t_tid.ip_blkid.bi_hi == \
|
||||
(i2)->bti_itup.t_tid.ip_blkid.bi_hi && \
|
||||
(i1)->bti_itup.t_tid.ip_blkid.bi_lo == \
|
||||
(i2)->bti_itup.t_tid.ip_blkid.bi_lo && \
|
||||
(i1)->bti_itup.t_tid.ip_posid == \
|
||||
(i2)->bti_itup.t_tid.ip_posid )
|
||||
|
||||
/*
|
||||
* BTStackData -- As we descend a tree, we push the (key, pointer)
|
||||
* pairs from internal nodes onto a private stack. If we split a
|
||||
* leaf, we use this stack to walk back up the tree and insert data
|
||||
* into parent nodes (and possibly to split them, too). Lehman and
|
||||
* Yao's update algorithm guarantees that under no circumstances can
|
||||
* our private stack give us an irredeemably bad picture up the tree.
|
||||
* Again, see the paper for details.
|
||||
*/
|
||||
|
||||
typedef struct BTStackData
|
||||
{
|
||||
BlockNumber bts_blkno;
|
||||
OffsetNumber bts_offset;
|
||||
BTItemData bts_btitem;
|
||||
struct BTStackData *bts_parent;
|
||||
} BTStackData;
|
||||
|
||||
typedef BTStackData *BTStack;
|
||||
|
||||
/*
|
||||
* We need to be able to tell the difference between read and write
|
||||
* requests for pages, in order to do locking correctly.
|
||||
*/
|
||||
|
||||
#define BT_READ BUFFER_LOCK_SHARE
|
||||
#define BT_WRITE BUFFER_LOCK_EXCLUSIVE
|
||||
|
||||
/*
|
||||
* In general, the btree code tries to localize its knowledge about
|
||||
* page layout to a couple of routines. However, we need a special
|
||||
* value to indicate "no page number" in those places where we expect
|
||||
* page numbers. We can use zero for this because we never need to
|
||||
* make a pointer to the metadata page.
|
||||
*/
|
||||
|
||||
#define P_NONE 0
|
||||
|
||||
/*
|
||||
* Macros to test whether a page is leftmost or rightmost on its tree level,
|
||||
* as well as other state info kept in the opaque data.
|
||||
*/
|
||||
#define P_LEFTMOST(opaque) ((opaque)->btpo_prev == P_NONE)
|
||||
#define P_RIGHTMOST(opaque) ((opaque)->btpo_next == P_NONE)
|
||||
#define P_ISLEAF(opaque) ((opaque)->btpo_flags & BTP_LEAF)
|
||||
#define P_ISROOT(opaque) ((opaque)->btpo_flags & BTP_ROOT)
|
||||
|
||||
/*
|
||||
* Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
|
||||
* page. The high key is not a data key, but gives info about what range of
|
||||
* keys is supposed to be on this page. The high key on a page is required
|
||||
* to be greater than or equal to any data key that appears on the page.
|
||||
* If we find ourselves trying to insert a key > high key, we know we need
|
||||
* to move right (this should only happen if the page was split since we
|
||||
* examined the parent page).
|
||||
*
|
||||
* Our insertion algorithm guarantees that we can use the initial least key
|
||||
* on our right sibling as the high key. Once a page is created, its high
|
||||
* key changes only if the page is split.
|
||||
*
|
||||
* On a non-rightmost page, the high key lives in item 1 and data items
|
||||
* start in item 2. Rightmost pages have no high key, so we store data
|
||||
* items beginning in item 1.
|
||||
*/
|
||||
|
||||
#define P_HIKEY ((OffsetNumber) 1)
|
||||
#define P_FIRSTKEY ((OffsetNumber) 2)
|
||||
#define P_FIRSTDATAKEY(opaque) (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
|
||||
|
||||
/*
|
||||
* XLOG allows to store some information in high 4 bits of log
|
||||
* record xl_info field
|
||||
*/
|
||||
#define XLOG_BTREE_DELETE 0x00 /* delete btitem */
|
||||
#define XLOG_BTREE_INSERT 0x10 /* add btitem without split */
|
||||
#define XLOG_BTREE_SPLIT 0x20 /* add btitem with split */
|
||||
#define XLOG_BTREE_SPLEFT 0x30 /* as above + flag that new btitem */
|
||||
/* goes to the left sibling */
|
||||
#define XLOG_BTREE_NEWROOT 0x40 /* new root page */
|
||||
|
||||
#define XLOG_BTREE_LEAF 0x80 /* leaf/internal page was changed */
|
||||
|
||||
/*
|
||||
* All what we need to find changed index tuple
|
||||
*/
|
||||
typedef struct xl_btreetid
|
||||
{
|
||||
RelFileNode node;
|
||||
ItemPointerData tid; /* changed tuple id */
|
||||
} xl_btreetid;
|
||||
|
||||
/*
|
||||
* This is what we need to know about delete
|
||||
*/
|
||||
typedef struct xl_btree_delete
|
||||
{
|
||||
xl_btreetid target; /* deleted tuple id */
|
||||
} xl_btree_delete;
|
||||
|
||||
#define SizeOfBtreeDelete (offsetof(xl_btreetid, tid) + SizeOfIptrData)
|
||||
|
||||
/*
|
||||
* This is what we need to know about pure (without split) insert
|
||||
*/
|
||||
typedef struct xl_btree_insert
|
||||
{
|
||||
xl_btreetid target; /* inserted tuple id */
|
||||
/* BTITEM FOLLOWS AT END OF STRUCT */
|
||||
} xl_btree_insert;
|
||||
|
||||
#define SizeOfBtreeInsert (offsetof(xl_btreetid, tid) + SizeOfIptrData)
|
||||
|
||||
/*
|
||||
* On insert with split we save items of both left and right siblings
|
||||
* and restore content of both pages from log record
|
||||
*/
|
||||
typedef struct xl_btree_split
|
||||
{
|
||||
xl_btreetid target; /* inserted tuple id */
|
||||
BlockIdData otherblk; /* second block participated in split: */
|
||||
/* first one is stored in target' tid */
|
||||
BlockIdData parentblk; /* parent block */
|
||||
BlockIdData leftblk; /* prev left block */
|
||||
BlockIdData rightblk; /* next right block */
|
||||
uint16 leftlen; /* len of left page items below */
|
||||
/* LEFT AND RIGHT PAGES ITEMS FOLLOW AT THE END */
|
||||
} xl_btree_split;
|
||||
|
||||
#define SizeOfBtreeSplit (offsetof(xl_btree_split, leftlen) + sizeof(uint16))
|
||||
|
||||
/*
|
||||
* New root log record.
|
||||
*/
|
||||
typedef struct xl_btree_newroot
|
||||
{
|
||||
RelFileNode node;
|
||||
int32 level;
|
||||
BlockIdData rootblk;
|
||||
/* 0 or 2 BTITEMS FOLLOW AT END OF STRUCT */
|
||||
} xl_btree_newroot;
|
||||
|
||||
#define SizeOfBtreeNewroot (offsetof(xl_btree_newroot, rootblk) + sizeof(BlockIdData))
|
||||
|
||||
/*
|
||||
* Operator strategy numbers -- ordering of these is <, <=, =, >=, >
|
||||
*/
|
||||
|
||||
#define BTLessStrategyNumber 1
|
||||
#define BTLessEqualStrategyNumber 2
|
||||
#define BTEqualStrategyNumber 3
|
||||
#define BTGreaterEqualStrategyNumber 4
|
||||
#define BTGreaterStrategyNumber 5
|
||||
#define BTMaxStrategyNumber 5
|
||||
|
||||
/*
|
||||
* When a new operator class is declared, we require that the user
|
||||
* supply us with an amproc procedure for determining whether, for
|
||||
* two keys a and b, a < b, a = b, or a > b. This routine must
|
||||
* return < 0, 0, > 0, respectively, in these three cases. Since we
|
||||
* only have one such proc in amproc, it's number 1.
|
||||
*/
|
||||
|
||||
#define BTORDER_PROC 1
|
||||
|
||||
/*
|
||||
* prototypes for functions in nbtree.c (external entry points for btree)
|
||||
*/
|
||||
@ -340,27 +427,26 @@ extern Datum btmarkpos(PG_FUNCTION_ARGS);
|
||||
extern Datum btrestrpos(PG_FUNCTION_ARGS);
|
||||
extern Datum btbulkdelete(PG_FUNCTION_ARGS);
|
||||
|
||||
extern void btree_redo(XLogRecPtr lsn, XLogRecord *record);
|
||||
extern void btree_undo(XLogRecPtr lsn, XLogRecord *record);
|
||||
extern void btree_desc(char *buf, uint8 xl_info, char *rec);
|
||||
|
||||
/*
|
||||
* prototypes for functions in nbtinsert.c
|
||||
*/
|
||||
extern InsertIndexResult _bt_doinsert(Relation rel, BTItem btitem,
|
||||
bool index_is_unique, Relation heapRel);
|
||||
extern void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
|
||||
BTStack stack, bool is_root, bool is_only);
|
||||
|
||||
/*
|
||||
* prototypes for functions in nbtpage.c
|
||||
*/
|
||||
extern void _bt_metapinit(Relation rel);
|
||||
extern Buffer _bt_getroot(Relation rel, int access);
|
||||
extern Buffer _bt_gettrueroot(Relation rel);
|
||||
extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
|
||||
extern void _bt_relbuf(Relation rel, Buffer buf);
|
||||
extern void _bt_wrtbuf(Relation rel, Buffer buf);
|
||||
extern void _bt_wrtnorelbuf(Relation rel, Buffer buf);
|
||||
extern void _bt_pageinit(Page page, Size size);
|
||||
extern void _bt_metaproot(Relation rel, BlockNumber rootbknum, int level);
|
||||
extern void _bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level);
|
||||
extern void _bt_itemdel(Relation rel, Buffer buf, ItemPointer tid);
|
||||
|
||||
/*
|
||||
@ -377,6 +463,7 @@ extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
|
||||
extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
|
||||
extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
|
||||
extern bool _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir);
|
||||
extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);
|
||||
|
||||
/*
|
||||
* prototypes for functions in nbtstrat.c
|
||||
@ -407,4 +494,13 @@ extern void _bt_spooldestroy(BTSpool *btspool);
|
||||
extern void _bt_spool(BTItem btitem, BTSpool *btspool);
|
||||
extern void _bt_leafbuild(BTSpool *btspool, BTSpool *spool2);
|
||||
|
||||
/*
|
||||
* prototypes for functions in nbtxlog.c
|
||||
*/
|
||||
extern void btree_redo(XLogRecPtr lsn, XLogRecord *record);
|
||||
extern void btree_undo(XLogRecPtr lsn, XLogRecord *record);
|
||||
extern void btree_desc(char *buf, uint8 xl_info, char *rec);
|
||||
extern void btree_xlog_startup(void);
|
||||
extern void btree_xlog_cleanup(void);
|
||||
|
||||
#endif /* NBTREE_H */
|
||||
|
Reference in New Issue
Block a user