1
0
mirror of https://github.com/postgres/postgres.git synced 2025-10-24 01:29:19 +03:00

Major overhaul of btree index code. Eliminate special BTP_CHAIN logic for

duplicate keys by letting search go to the left rather than right when an
equal key is seen at an upper tree level.  Fix poor choice of page split
point (leading to insertion failures) that was forced by chaining logic.
Don't store leftmost key in non-leaf pages, since it's not necessary.
Don't create root page until something is first stored in the index, so an
unused index is now 8K not 16K.  (Doesn't seem to be as easy to get rid of
the metadata page, unfortunately.)  Massive cleanup of unreadable code,
fix poor, obsolete, and just plain wrong documentation and comments.
See src/backend/access/nbtree/README for the gory details.
This commit is contained in:
Tom Lane
2000-07-21 06:42:39 +00:00
parent c9537ca88f
commit 9e85183bfc
11 changed files with 1629 additions and 2859 deletions

View File

@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2000, PostgreSQL, Inc
* Portions Copyright (c) 1994, Regents of the University of California
*
* $Id: nbtree.h,v 1.38 2000/06/15 03:32:31 momjian Exp $
* $Id: nbtree.h,v 1.39 2000/07/21 06:42:35 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -24,14 +24,9 @@
* info. In addition, we need to know what sort of page this is
* (leaf or internal), and whether the page is available for reuse.
*
* Lehman and Yao's algorithm requires a ``high key'' on every page.
* The high key on a page is guaranteed to be greater than or equal
* to any key that appears on this page. Our insertion algorithm
* guarantees that we can use the initial least key on our right
* sibling as the high key. We allocate space for the line pointer
* to the high key in the opaque data at the end of the page.
*
* Rightmost pages in the tree have no high key.
* We also store a back-link to the parent page, but this cannot be trusted
* very far since it does not get updated when the parent is split.
* See backend/access/nbtree/README for details.
*/
typedef struct BTPageOpaqueData
@@ -41,11 +36,11 @@ typedef struct BTPageOpaqueData
BlockNumber btpo_parent;
uint16 btpo_flags;
#define BTP_LEAF (1 << 0)
#define BTP_ROOT (1 << 1)
#define BTP_FREE (1 << 2)
#define BTP_META (1 << 3)
#define BTP_CHAIN (1 << 4)
/* Bits defined in btpo_flags */
#define BTP_LEAF (1 << 0) /* It's a leaf page */
#define BTP_ROOT (1 << 1) /* It's the root page (has no parent) */
#define BTP_FREE (1 << 2) /* not currently used... */
#define BTP_META (1 << 3) /* Set in the meta-page only */
} BTPageOpaqueData;
@@ -84,21 +79,24 @@ typedef struct BTScanOpaqueData
typedef BTScanOpaqueData *BTScanOpaque;
/*
* BTItems are what we store in the btree. Each item has an index
* tuple, including key and pointer values. In addition, we must
* guarantee that all tuples in the index are unique, in order to
* satisfy some assumptions in Lehman and Yao. The way that we do
* this is by generating a new OID for every insertion that we do in
* the tree. This adds eight bytes to the size of btree index
* tuples. Note that we do not use the OID as part of a composite
* key; the OID only serves as a unique identifier for a given index
* tuple (logical position within a page).
* BTItems are what we store in the btree. Each item is an index tuple,
* including key and pointer values. (In some cases either the key or the
* pointer may go unused, see backend/access/nbtree/README for details.)
*
* Old comments:
* In addition, we must guarantee that all tuples in the index are unique,
* in order to satisfy some assumptions in Lehman and Yao. The way that we
* do this is by generating a new OID for every insertion that we do in the
* tree. This adds eight bytes to the size of btree index tuples. Note
* that we do not use the OID as part of a composite key; the OID only
* serves as a unique identifier for a given index tuple (logical position
* within a page).
*
* New comments:
* actually, we must guarantee that all tuples in A LEVEL
* are unique, not in ALL INDEX. So, we can use bti_itup->t_tid
* as unique identifier for a given index tuple (logical position
* within a level). - vadim 04/09/97
* within a level). - vadim 04/09/97
*/
typedef struct BTItemData
@@ -108,12 +106,13 @@ typedef struct BTItemData
typedef BTItemData *BTItem;
#define BTItemSame(i1, i2) ( i1->bti_itup.t_tid.ip_blkid.bi_hi == \
i2->bti_itup.t_tid.ip_blkid.bi_hi && \
i1->bti_itup.t_tid.ip_blkid.bi_lo == \
i2->bti_itup.t_tid.ip_blkid.bi_lo && \
i1->bti_itup.t_tid.ip_posid == \
i2->bti_itup.t_tid.ip_posid )
/* Test whether items are the "same" per the above notes */
#define BTItemSame(i1, i2) ( (i1)->bti_itup.t_tid.ip_blkid.bi_hi == \
(i2)->bti_itup.t_tid.ip_blkid.bi_hi && \
(i1)->bti_itup.t_tid.ip_blkid.bi_lo == \
(i2)->bti_itup.t_tid.ip_blkid.bi_lo && \
(i1)->bti_itup.t_tid.ip_posid == \
(i2)->bti_itup.t_tid.ip_posid )
/*
* BTStackData -- As we descend a tree, we push the (key, pointer)
@@ -129,24 +128,12 @@ typedef struct BTStackData
{
BlockNumber bts_blkno;
OffsetNumber bts_offset;
BTItem bts_btitem;
BTItemData bts_btitem;
struct BTStackData *bts_parent;
} BTStackData;
typedef BTStackData *BTStack;
typedef struct BTPageState
{
Buffer btps_buf;
Page btps_page;
BTItem btps_lastbti;
OffsetNumber btps_lastoff;
OffsetNumber btps_firstoff;
int btps_level;
bool btps_doupper;
struct BTPageState *btps_next;
} BTPageState;
/*
* We need to be able to tell the difference between read and write
* requests for pages, in order to do locking correctly.
@@ -155,31 +142,49 @@ typedef struct BTPageState
#define BT_READ BUFFER_LOCK_SHARE
#define BT_WRITE BUFFER_LOCK_EXCLUSIVE
/*
* Similarly, the difference between insertion and non-insertion binary
* searches on a given page makes a difference when we're descending the
* tree.
*/
#define BT_INSERTION 0
#define BT_DESCENT 1
/*
* In general, the btree code tries to localize its knowledge about
* page layout to a couple of routines. However, we need a special
* value to indicate "no page number" in those places where we expect
* page numbers.
* page numbers. We can use zero for this because we never need to
* make a pointer to the metadata page.
*/
#define P_NONE 0
/*
* Macros to test whether a page is leftmost or rightmost on its tree level,
* as well as other state info kept in the opaque data.
*/
#define P_LEFTMOST(opaque) ((opaque)->btpo_prev == P_NONE)
#define P_RIGHTMOST(opaque) ((opaque)->btpo_next == P_NONE)
#define P_ISLEAF(opaque) ((opaque)->btpo_flags & BTP_LEAF)
#define P_ISROOT(opaque) ((opaque)->btpo_flags & BTP_ROOT)
/*
* Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
* page. The high key is not a data key, but gives info about what range of
* keys is supposed to be on this page. The high key on a page is required
* to be greater than or equal to any data key that appears on the page.
* If we find ourselves trying to insert a key > high key, we know we need
* to move right (this should only happen if the page was split since we
* examined the parent page).
*
* Our insertion algorithm guarantees that we can use the initial least key
* on our right sibling as the high key. Once a page is created, its high
* key changes only if the page is split.
*
* On a non-rightmost page, the high key lives in item 1 and data items
* start in item 2. Rightmost pages have no high key, so we store data
* items beginning in item 1.
*/
#define P_HIKEY ((OffsetNumber) 1)
#define P_FIRSTKEY ((OffsetNumber) 2)
#define P_FIRSTDATAKEY(opaque) (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
/*
* Strategy numbers -- ordering of these is <, <=, =, >=, >
* Operator strategy numbers -- ordering of these is <, <=, =, >=, >
*/
#define BTLessStrategyNumber 1
@@ -200,29 +205,7 @@ typedef struct BTPageState
#define BTORDER_PROC 1
/*
* prototypes for functions in nbtinsert.c
*/
extern InsertIndexResult _bt_doinsert(Relation rel, BTItem btitem,
bool index_is_unique, Relation heapRel);
extern bool _bt_itemcmp(Relation rel, Size keysz, ScanKey scankey,
BTItem item1, BTItem item2, StrategyNumber strat);
/*
* prototypes for functions in nbtpage.c
*/
extern void _bt_metapinit(Relation rel);
extern Buffer _bt_getroot(Relation rel, int access);
extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
extern void _bt_relbuf(Relation rel, Buffer buf, int access);
extern void _bt_wrtbuf(Relation rel, Buffer buf);
extern void _bt_wrtnorelbuf(Relation rel, Buffer buf);
extern void _bt_pageinit(Page page, Size size);
extern void _bt_metaproot(Relation rel, BlockNumber rootbknum, int level);
extern Buffer _bt_getstackbuf(Relation rel, BTStack stack, int access);
extern void _bt_pagedel(Relation rel, ItemPointer tid);
/*
* prototypes for functions in nbtree.c
* prototypes for functions in nbtree.c (external entry points for btree)
*/
extern bool BuildingBtree; /* in nbtree.c */
@@ -237,6 +220,25 @@ extern Datum btmarkpos(PG_FUNCTION_ARGS);
extern Datum btrestrpos(PG_FUNCTION_ARGS);
extern Datum btdelete(PG_FUNCTION_ARGS);
/*
* prototypes for functions in nbtinsert.c
*/
extern InsertIndexResult _bt_doinsert(Relation rel, BTItem btitem,
bool index_is_unique, Relation heapRel);
/*
* prototypes for functions in nbtpage.c
*/
extern void _bt_metapinit(Relation rel);
extern Buffer _bt_getroot(Relation rel, int access);
extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
extern void _bt_relbuf(Relation rel, Buffer buf, int access);
extern void _bt_wrtbuf(Relation rel, Buffer buf);
extern void _bt_wrtnorelbuf(Relation rel, Buffer buf);
extern void _bt_pageinit(Page page, Size size);
extern void _bt_metaproot(Relation rel, BlockNumber rootbknum, int level);
extern void _bt_pagedel(Relation rel, ItemPointer tid);
/*
* prototypes for functions in nbtscan.c
*/
@@ -249,13 +251,13 @@ extern void AtEOXact_nbtree(void);
* prototypes for functions in nbtsearch.c
*/
extern BTStack _bt_search(Relation rel, int keysz, ScanKey scankey,
Buffer *bufP);
Buffer *bufP, int access);
extern Buffer _bt_moveright(Relation rel, Buffer buf, int keysz,
ScanKey scankey, int access);
extern bool _bt_skeycmp(Relation rel, Size keysz, ScanKey scankey,
Page page, ItemId itemid, StrategyNumber strat);
extern OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz,
ScanKey scankey, int srchtype);
ScanKey scankey);
extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
Page page, OffsetNumber offnum);
extern RetrieveIndexResult _bt_next(IndexScanDesc scan, ScanDirection dir);
extern RetrieveIndexResult _bt_first(IndexScanDesc scan, ScanDirection dir);
extern bool _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir);