mirror of
https://github.com/postgres/postgres.git
synced 2025-10-21 02:52:47 +03:00
Indexes with INCLUDE columns and their support in B-tree
This patch introduces the INCLUDE clause to index definitions. This clause specifies a list of columns which will be included as a non-key part in the index. The INCLUDE columns exist solely to allow more queries to benefit from index-only scans. Also, such columns don't need to have appropriate operator classes. Expressions are not supported as INCLUDE columns since they cannot be used in index-only scans. Index access methods supporting INCLUDE are indicated by the amcaninclude flag in IndexAmRoutine. For now, only B-tree indexes support the INCLUDE clause. In B-tree indexes INCLUDE columns are truncated from pivot index tuples (tuples located in non-leaf pages and high keys). Therefore, B-tree indexes now might have a variable number of attributes. This patch also provides a generic facility to support that: pivot tuples contain the number of their attributes in t_tid.ip_posid. The free 13th bit of t_info is used for indicating that. This facility will simplify further support of index suffix truncation. The above changes are backward-compatible; pg_upgrade doesn't need special handling of B-tree indexes for that. Bump catalog version. Author: Anastasia Lubennikova with contributions by Alexander Korotkov and me Reviewed by: Peter Geoghegan, Tomas Vondra, Antonin Houska, Jeff Janes, David Rowley, Alexander Korotkov Discussion: https://www.postgresql.org/message-id/flat/56168952.4010101@postgrespro.ru
This commit is contained in:
@@ -191,6 +191,8 @@ typedef struct IndexAmRoutine
|
||||
bool ampredlocks;
|
||||
/* does AM support parallel scan? */
|
||||
bool amcanparallel;
|
||||
/* does AM support columns included with clause INCLUDE? */
|
||||
bool amcaninclude;
|
||||
/* type of data stored in index, or InvalidOid if variable */
|
||||
Oid amkeytype;
|
||||
|
||||
|
@@ -280,7 +280,7 @@ typedef HashMetaPageData *HashMetaPage;
|
||||
sizeof(ItemIdData) - \
|
||||
MAXALIGN(sizeof(HashPageOpaqueData)))
|
||||
|
||||
#define INDEX_MOVED_BY_SPLIT_MASK 0x2000
|
||||
#define INDEX_MOVED_BY_SPLIT_MASK INDEX_AM_RESERVED_BIT
|
||||
|
||||
#define HASH_MIN_FILLFACTOR 10
|
||||
#define HASH_DEFAULT_FILLFACTOR 75
|
||||
|
@@ -41,7 +41,7 @@ typedef struct IndexTupleData
|
||||
*
|
||||
* 15th (high) bit: has nulls
|
||||
* 14th bit: has var-width attributes
|
||||
* 13th bit: unused
|
||||
* 13th bit: AM-defined meaning
|
||||
* 12-0 bit: size of tuple
|
||||
* ---------------
|
||||
*/
|
||||
@@ -63,7 +63,8 @@ typedef IndexAttributeBitMapData * IndexAttributeBitMap;
|
||||
* t_info manipulation macros
|
||||
*/
|
||||
#define INDEX_SIZE_MASK 0x1FFF
|
||||
/* bit 0x2000 is reserved for index-AM specific usage */
|
||||
#define INDEX_AM_RESERVED_BIT 0x2000 /* reserved for index-AM specific
|
||||
* usage */
|
||||
#define INDEX_VAR_MASK 0x4000
|
||||
#define INDEX_NULL_MASK 0x8000
|
||||
|
||||
@@ -146,5 +147,7 @@ extern Datum nocache_index_getattr(IndexTuple tup, int attnum,
|
||||
extern void index_deform_tuple(IndexTuple tup, TupleDesc tupleDescriptor,
|
||||
Datum *values, bool *isnull);
|
||||
extern IndexTuple CopyIndexTuple(IndexTuple source);
|
||||
extern IndexTuple index_truncate_tuple(TupleDesc tupleDescriptor,
|
||||
IndexTuple olditup, int new_indnatts);
|
||||
|
||||
#endif /* ITUP_H */
|
||||
|
@@ -139,31 +139,6 @@ typedef struct BTMetaPageData
|
||||
#define BTREE_DEFAULT_FILLFACTOR 90
|
||||
#define BTREE_NONLEAF_FILLFACTOR 70
|
||||
|
||||
/*
|
||||
* Test whether two btree entries are "the same".
|
||||
*
|
||||
* Old comments:
|
||||
* In addition, we must guarantee that all tuples in the index are unique,
|
||||
* in order to satisfy some assumptions in Lehman and Yao. The way that we
|
||||
* do this is by generating a new OID for every insertion that we do in the
|
||||
* tree. This adds eight bytes to the size of btree index tuples. Note
|
||||
* that we do not use the OID as part of a composite key; the OID only
|
||||
* serves as a unique identifier for a given index tuple (logical position
|
||||
* within a page).
|
||||
*
|
||||
* New comments:
|
||||
* actually, we must guarantee that all tuples in A LEVEL
|
||||
* are unique, not in ALL INDEX. So, we can use the t_tid
|
||||
* as unique identifier for a given index tuple (logical position
|
||||
* within a level). - vadim 04/09/97
|
||||
*/
|
||||
#define BTTidSame(i1, i2) \
|
||||
((ItemPointerGetBlockNumber(&(i1)) == ItemPointerGetBlockNumber(&(i2))) && \
|
||||
(ItemPointerGetOffsetNumber(&(i1)) == ItemPointerGetOffsetNumber(&(i2))))
|
||||
#define BTEntrySame(i1, i2) \
|
||||
BTTidSame((i1)->t_tid, (i2)->t_tid)
|
||||
|
||||
|
||||
/*
|
||||
* In general, the btree code tries to localize its knowledge about
|
||||
* page layout to a couple of routines. However, we need a special
|
||||
@@ -212,6 +187,68 @@ typedef struct BTMetaPageData
|
||||
#define P_FIRSTDATAKEY(opaque) (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
|
||||
|
||||
|
||||
/*
|
||||
* B-tree index with INCLUDE clause has non-key (included) attributes, which
|
||||
* are used solely in index-only scans. Those non-key attributes are present
|
||||
* in leaf index tuples which point to corresponding heap tuples. However,
|
||||
* tree also contains "pivot" tuples. Pivot tuples are used for navigation
|
||||
* during tree traversal. Pivot tuples include tuples on non-leaf pages and
|
||||
* high key tuples. Such tuples don't need the included attributes, because
|
||||
* they have no use during tree traversal. This is why we truncate them in
|
||||
* order to save some space. Therefore, B-tree index with INCLUDE clause
|
||||
* contains tuples with a variable number of attributes.
|
||||
*
|
||||
* In order to keep on-disk compatibility with upcoming suffix truncation of
|
||||
* pivot tuples, we store number of attributes present inside tuple itself.
|
||||
* Thankfully, offset number is always unused in pivot tuple. So, we use free
|
||||
* bit of index tuple flags as a sign that the offset has an alternative meaning: it
|
||||
* stores number of keys present in index tuple (12 bits are more than enough for that).
|
||||
* And we have 4 bits reserved for future usage.
|
||||
*
|
||||
* Right now INDEX_ALT_TID_MASK is set only on truncation of non-key
|
||||
* attributes of included indexes. But potentially every pivot index tuple
|
||||
* might have INDEX_ALT_TID_MASK set. Then this tuple should have number of
|
||||
* attributes correctly set in BT_N_KEYS_OFFSET_MASK, and in future it might
|
||||
* use some bits of BT_RESERVED_OFFSET_MASK.
|
||||
*
|
||||
* Non-pivot tuples might also use bits of BT_RESERVED_OFFSET_MASK. Although
|
||||
* they store the heap tuple offset, the higher bits of the offset are always free.
|
||||
*/
|
||||
#define INDEX_ALT_TID_MASK INDEX_AM_RESERVED_BIT /* flag indicating t_tid
|
||||
* offset has an
|
||||
* alternative meaning */
|
||||
#define BT_RESERVED_OFFSET_MASK 0xF000 /* mask of bits in t_tid offset
|
||||
* reserved for future usage */
|
||||
#define BT_N_KEYS_OFFSET_MASK 0x0FFF /* mask of bits in t_tid offset
|
||||
* holding number of attributes
|
||||
* actually present in index tuple */
|
||||
|
||||
/* Access to downlink block number */
|
||||
#define BTreeInnerTupleGetDownLink(itup) \
|
||||
ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid))
|
||||
|
||||
#define BTreeInnerTupleSetDownLink(itup, blkno) \
|
||||
ItemPointerSetBlockNumber(&((itup)->t_tid), (blkno))
|
||||
|
||||
/* Set number of attributes to B-tree index tuple overriding t_tid offset */
|
||||
#define BTreeTupSetNAtts(itup, n) \
|
||||
do { \
|
||||
(itup)->t_info |= INDEX_ALT_TID_MASK; \
|
||||
ItemPointerSetOffsetNumber(&(itup)->t_tid, n); \
|
||||
} while(0)
|
||||
|
||||
/* Get number of attributes in B-tree index tuple */
|
||||
#define BTreeTupGetNAtts(itup, index) \
|
||||
( \
|
||||
(itup)->t_info & INDEX_ALT_TID_MASK ? \
|
||||
( \
|
||||
AssertMacro((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_RESERVED_OFFSET_MASK) == 0), \
|
||||
ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_KEYS_OFFSET_MASK \
|
||||
) \
|
||||
: \
|
||||
IndexRelationGetNumberOfAttributes(index) \
|
||||
)
|
||||
|
||||
/*
|
||||
* Operator strategy numbers for B-tree have been moved to access/stratnum.h,
|
||||
* because many places need to use them in ScanKeyInit() calls.
|
||||
@@ -265,7 +302,7 @@ typedef struct BTStackData
|
||||
{
|
||||
BlockNumber bts_blkno;
|
||||
OffsetNumber bts_offset;
|
||||
IndexTupleData bts_btentry;
|
||||
BlockNumber bts_btentry;
|
||||
struct BTStackData *bts_parent;
|
||||
} BTStackData;
|
||||
|
||||
@@ -524,6 +561,7 @@ extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
|
||||
extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
|
||||
extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
|
||||
Snapshot snapshot);
|
||||
extern bool _bt_check_natts(Relation index, Page page, OffsetNumber offnum);
|
||||
|
||||
/*
|
||||
* prototypes for functions in nbtutils.c
|
||||
@@ -552,6 +590,7 @@ extern bytea *btoptions(Datum reloptions, bool validate);
|
||||
extern bool btproperty(Oid index_oid, int attno,
|
||||
IndexAMProperty prop, const char *propname,
|
||||
bool *res, bool *isnull);
|
||||
extern IndexTuple _bt_truncate_tuple(Relation idxrel, IndexTuple olditup);
|
||||
|
||||
/*
|
||||
* prototypes for functions in nbtvalidate.c
|
||||
|
@@ -28,7 +28,8 @@
|
||||
#define XLOG_BTREE_INSERT_META 0x20 /* same, plus update metapage */
|
||||
#define XLOG_BTREE_SPLIT_L 0x30 /* add index tuple with split */
|
||||
#define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */
|
||||
/* 0x50 and 0x60 are unused */
|
||||
#define XLOG_BTREE_SPLIT_L_HIGHKEY 0x50 /* as above, include truncated highkey */
|
||||
#define XLOG_BTREE_SPLIT_R_HIGHKEY 0x60 /* as above, include truncated highkey */
|
||||
#define XLOG_BTREE_DELETE 0x70 /* delete leaf index tuples for a page */
|
||||
#define XLOG_BTREE_UNLINK_PAGE 0x80 /* delete a half-dead page */
|
||||
#define XLOG_BTREE_UNLINK_PAGE_META 0x90 /* same, and update metapage */
|
||||
@@ -82,10 +83,11 @@ typedef struct xl_btree_insert
|
||||
* Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record.
|
||||
* The _L and _R variants indicate whether the inserted tuple went into the
|
||||
* left or right split page (and thus, whether newitemoff and the new item
|
||||
* are stored or not). The _ROOT variants indicate that we are splitting
|
||||
* the root page, and thus that a newroot record rather than an insert or
|
||||
* split record should follow. Note that a split record never carries a
|
||||
* metapage update --- we'll do that in the parent-level update.
|
||||
* are stored or not). The _HIGHKEY variants indicate that we've logged
|
||||
* the left page high key value explicitly; otherwise redo should use the right page's
|
||||
* leftmost key as the left page high key. _HIGHKEY is specified for internal
|
||||
* pages where right page leftmost key is suppressed, and for leaf pages
|
||||
* of covering indexes where the high key has its non-key attributes truncated.
|
||||
*
|
||||
* Backup Blk 0: original page / new left page
|
||||
*
|
||||
|
Reference in New Issue
Block a user