Indexes with INCLUDE columns and their support in B-tree

This patch introduces the INCLUDE clause to index definitions. This clause specifies a list of columns which will be included as a non-key part of the index. The INCLUDE columns exist solely to allow more queries to benefit from index-only scans. Also, such columns don't need to have appropriate operator classes. Expressions are not supported as INCLUDE columns since they cannot be used in index-only scans. Index access methods supporting INCLUDE are indicated by the amcaninclude flag in IndexAmRoutine. For now, only B-tree indexes support the INCLUDE clause. In B-tree indexes, INCLUDE columns are truncated from pivot index tuples (tuples located in non-leaf pages and high keys). Therefore, B-tree indexes might now have a variable number of attributes. This patch also provides a generic facility to support that: pivot tuples contain the number of their attributes in t_tid.ip_posid. The free 13th bit of t_info is used to indicate that. This facility will simplify further support of index suffix truncation. The changes above are backward-compatible; pg_upgrade doesn't need special handling of B-tree indexes for that. Bump catalog version. Author: Anastasia Lubennikova, with contributions by Alexander Korotkov and me. Reviewed by: Peter Geoghegan, Tomas Vondra, Antonin Houska, Jeff Janes, David Rowley, Alexander Korotkov. Discussion: https://www.postgresql.org/message-id/flat/56168952.4010101@postgrespro.ru
2025-12-07 12:02:30 +03:00 · 2018-04-07 23:00:39 +03:00
parent 01bb85169a
commit 8224de4f42
89 changed files with 2112 additions and 467 deletions
--- a/src/include/access/amapi.h
+++ b/src/include/access/amapi.h
@@ -191,6 +191,8 @@ typedef struct IndexAmRoutine
 	bool		ampredlocks;
 	/* does AM support parallel scan? */
 	bool		amcanparallel;
+	/* does AM support columns included with the INCLUDE clause? */
+	bool		amcaninclude;
 	/* type of data stored in index, or InvalidOid if variable */
 	Oid			amkeytype;

--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -280,7 +280,7 @@ typedef HashMetaPageData *HashMetaPage;
 				  sizeof(ItemIdData) - \
 				  MAXALIGN(sizeof(HashPageOpaqueData)))

-#define INDEX_MOVED_BY_SPLIT_MASK	0x2000
+#define INDEX_MOVED_BY_SPLIT_MASK	INDEX_AM_RESERVED_BIT

 #define HASH_MIN_FILLFACTOR			10
 #define HASH_DEFAULT_FILLFACTOR		75
--- a/src/include/access/itup.h
+++ b/src/include/access/itup.h
@@ -41,7 +41,7 @@ typedef struct IndexTupleData
 	 *
 	 * 15th (high) bit: has nulls
 	 * 14th bit: has var-width attributes
-	 * 13th bit: unused
+	 * 13th bit: AM-defined meaning
 	 * 12-0 bit: size of tuple
 	 * ---------------
 	 */
@@ -63,7 +63,8 @@ typedef IndexAttributeBitMapData * IndexAttributeBitMap;
 * t_info manipulation macros
 */
 #define INDEX_SIZE_MASK 0x1FFF
-/* bit 0x2000 is reserved for index-AM specific usage */
+#define INDEX_AM_RESERVED_BIT 0x2000	/* reserved for index-AM specific
+										 * usage */
 #define INDEX_VAR_MASK	0x4000
 #define INDEX_NULL_MASK 0x8000

@@ -146,5 +147,7 @@ extern Datum nocache_index_getattr(IndexTuple tup, int attnum,
 extern void index_deform_tuple(IndexTuple tup, TupleDesc tupleDescriptor,
 				   Datum *values, bool *isnull);
 extern IndexTuple CopyIndexTuple(IndexTuple source);
+extern IndexTuple index_truncate_tuple(TupleDesc tupleDescriptor,
+					 IndexTuple olditup, int new_indnatts);

 #endif							/* ITUP_H */
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -139,31 +139,6 @@ typedef struct BTMetaPageData
 #define BTREE_DEFAULT_FILLFACTOR	90
 #define BTREE_NONLEAF_FILLFACTOR	70

-/*
- *	Test whether two btree entries are "the same".
- *
- *	Old comments:
- *	In addition, we must guarantee that all tuples in the index are unique,
- *	in order to satisfy some assumptions in Lehman and Yao.  The way that we
- *	do this is by generating a new OID for every insertion that we do in the
- *	tree.  This adds eight bytes to the size of btree index tuples.  Note
- *	that we do not use the OID as part of a composite key; the OID only
- *	serves as a unique identifier for a given index tuple (logical position
- *	within a page).
- *
- *	New comments:
- *	actually, we must guarantee that all tuples in A LEVEL
- *	are unique, not in ALL INDEX. So, we can use the t_tid
- *	as unique identifier for a given index tuple (logical position
- *	within a level). - vadim 04/09/97
- */
-#define BTTidSame(i1, i2)	\
-	((ItemPointerGetBlockNumber(&(i1)) == ItemPointerGetBlockNumber(&(i2))) && \
-	 (ItemPointerGetOffsetNumber(&(i1)) == ItemPointerGetOffsetNumber(&(i2))))
-#define BTEntrySame(i1, i2) \
-	BTTidSame((i1)->t_tid, (i2)->t_tid)
-
-
 /*
 *	In general, the btree code tries to localize its knowledge about
 *	page layout to a couple of routines.  However, we need a special
@@ -212,6 +187,68 @@ typedef struct BTMetaPageData
 #define P_FIRSTDATAKEY(opaque)	(P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)


+/*
+ * A B-tree index with an INCLUDE clause has non-key (included) attributes,
+ * which are used solely in index-only scans.  Those non-key attributes are
+ * present in leaf index tuples, which point to the corresponding heap tuples.
+ * However, the tree also contains "pivot" tuples, which are used for
+ * navigation during tree traversal.  Pivot tuples comprise the tuples on
+ * non-leaf pages and the high key tuples.  Such tuples don't need the included
+ * attributes, because those have no use during tree traversal, so we truncate
+ * them away in order to save some space.  Therefore, a B-tree index with an
+ * INCLUDE clause contains tuples with a variable number of attributes.
+ *
+ * In order to keep on-disk compatibility with upcoming suffix truncation of
+ * pivot tuples, we store the number of attributes inside the tuple itself.
+ * Thankfully, the offset number is always unused in a pivot tuple, so we use
+ * the free bit of the index tuple flags as a sign that the offset has an
+ * alternative meaning: it stores the number of keys present in the tuple (12
+ * bits are plenty for that).  The other 4 bits are reserved for future use.
+ *
+ * Right now INDEX_ALT_TID_MASK is set only on truncation of non-key
+ * attributes of indexes with INCLUDE columns.  But potentially every pivot
+ * index tuple might have INDEX_ALT_TID_MASK set.  Such a tuple must have its
+ * number of attributes correctly set in BT_N_KEYS_OFFSET_MASK, and in future
+ * it might use some bits of BT_RESERVED_OFFSET_MASK.
+ *
+ * Non-pivot tuples might also use bits of BT_RESERVED_OFFSET_MASK.  Although
+ * they store a heap tuple offset, its higher bits are always free.
+ */
+#define INDEX_ALT_TID_MASK		INDEX_AM_RESERVED_BIT	/* flag indicating t_tid
+														 * offset has an
+														 * alternative meaning */
+#define BT_RESERVED_OFFSET_MASK	0xF000	/* mask of bits in t_tid offset
+										 * reserved for future usage */
+#define BT_N_KEYS_OFFSET_MASK	0x0FFF	/* mask of bits in t_tid offset
+										 * holding number of attributes
+										 * actually present in index tuple */
+
+/* Access to downlink block number */
+#define BTreeInnerTupleGetDownLink(itup) \
+	ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid))
+
+#define BTreeInnerTupleSetDownLink(itup, blkno) \
+	ItemPointerSetBlockNumber(&((itup)->t_tid), (blkno))
+
+/* Set number of attributes in B-tree index tuple, overriding t_tid offset */
+#define BTreeTupSetNAtts(itup, n) \
+	do { \
+		(itup)->t_info |= INDEX_ALT_TID_MASK; \
+		ItemPointerSetOffsetNumber(&(itup)->t_tid, n); \
+	} while(0)
+
+/* Get number of attributes in B-tree index tuple */
+#define BTreeTupGetNAtts(itup, index)	\
+	( \
+		(itup)->t_info & INDEX_ALT_TID_MASK ? \
+		( \
+			AssertMacro((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_RESERVED_OFFSET_MASK) == 0), \
+			ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_KEYS_OFFSET_MASK \
+		) \
+		: \
+		IndexRelationGetNumberOfAttributes(index) \
+	)
+
 /*
 *	Operator strategy numbers for B-tree have been moved to access/stratnum.h,
 *	because many places need to use them in ScanKeyInit() calls.
@@ -265,7 +302,7 @@ typedef struct BTStackData
 {
 	BlockNumber bts_blkno;
 	OffsetNumber bts_offset;
-	IndexTupleData bts_btentry;
+	BlockNumber bts_btentry;
 	struct BTStackData *bts_parent;
 } BTStackData;

@@ -524,6 +561,7 @@ extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
 extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
 extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
 				 Snapshot snapshot);
+extern bool _bt_check_natts(Relation index, Page page, OffsetNumber offnum);

 /*
 * prototypes for functions in nbtutils.c
@@ -552,6 +590,7 @@ extern bytea *btoptions(Datum reloptions, bool validate);
 extern bool btproperty(Oid index_oid, int attno,
 		   IndexAMProperty prop, const char *propname,
 		   bool *res, bool *isnull);
+extern IndexTuple _bt_truncate_tuple(Relation idxrel, IndexTuple olditup);

 /*
 * prototypes for functions in nbtvalidate.c
--- a/src/include/access/nbtxlog.h
+++ b/src/include/access/nbtxlog.h
@@ -28,7 +28,8 @@
 #define XLOG_BTREE_INSERT_META	0x20	/* same, plus update metapage */
 #define XLOG_BTREE_SPLIT_L		0x30	/* add index tuple with split */
 #define XLOG_BTREE_SPLIT_R		0x40	/* as above, new item on right */
-/* 0x50 and 0x60 are unused */
+#define XLOG_BTREE_SPLIT_L_HIGHKEY 0x50 /* as above, include truncated highkey */
+#define XLOG_BTREE_SPLIT_R_HIGHKEY 0x60 /* as above, include truncated highkey */
 #define XLOG_BTREE_DELETE		0x70	/* delete leaf index tuples for a page */
 #define XLOG_BTREE_UNLINK_PAGE	0x80	/* delete a half-dead page */
 #define XLOG_BTREE_UNLINK_PAGE_META 0x90	/* same, and update metapage */
@@ -82,10 +83,11 @@ typedef struct xl_btree_insert
 * Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record.
 * The _L and _R variants indicate whether the inserted tuple went into the
 * left or right split page (and thus, whether newitemoff and the new item
- * are stored or not).  The _ROOT variants indicate that we are splitting
- * the root page, and thus that a newroot record rather than an insert or
- * split record should follow.  Note that a split record never carries a
- * metapage update --- we'll do that in the parent-level update.
+ * are stored or not).  The _HIGHKEY variants indicate that we've explicitly
+ * logged the left page's high key; otherwise redo should use the right page's
+ * leftmost key as the left page's high key.  _HIGHKEY is specified for
+ * internal pages, where the right page's leftmost key is suppressed, and for
+ * leaf pages of covering indexes, whose high keys lack non-key attributes.
 *
 * Backup Blk 0: original page / new left page
 *