diff --git a/contrib/pageinspect/btreefuncs.c b/contrib/pageinspect/btreefuncs.c
index 564c818558f..2ddedef7141 100644
--- a/contrib/pageinspect/btreefuncs.c
+++ b/contrib/pageinspect/btreefuncs.c
@@ -31,9 +31,11 @@
#include "access/relation.h"
#include "catalog/namespace.h"
#include "catalog/pg_am.h"
+#include "catalog/pg_type.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "pageinspect.h"
+#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/rel.h"
#include "utils/varlena.h"
@@ -45,6 +47,8 @@ PG_FUNCTION_INFO_V1(bt_page_stats);
#define IS_INDEX(r) ((r)->rd_rel->relkind == RELKIND_INDEX)
#define IS_BTREE(r) ((r)->rd_rel->relam == BTREE_AM_OID)
+#define DatumGetItemPointer(X) ((ItemPointer) DatumGetPointer(X))
+#define ItemPointerGetDatum(X) PointerGetDatum(X)
/* note: BlockNumber is unsigned, hence can't be negative */
#define CHECK_RELATION_BLOCK_RANGE(rel, blkno) { \
@@ -243,6 +247,9 @@ struct user_args
{
Page page;
OffsetNumber offset;
+ bool leafpage;
+ bool rightmost;
+ TupleDesc tupd;
};
/*-------------------------------------------------------
@@ -252,17 +259,25 @@ struct user_args
* ------------------------------------------------------
*/
static Datum
-bt_page_print_tuples(FuncCallContext *fctx, Page page, OffsetNumber offset)
+bt_page_print_tuples(FuncCallContext *fctx, struct user_args *uargs)
{
- char *values[6];
+ Page page = uargs->page;
+ OffsetNumber offset = uargs->offset;
+ bool leafpage = uargs->leafpage;
+ bool rightmost = uargs->rightmost;
+ bool ispivottuple;
+ Datum values[9];
+ bool nulls[9];
HeapTuple tuple;
ItemId id;
IndexTuple itup;
int j;
int off;
int dlen;
- char *dump;
+ char *dump,
+ *datacstring;
char *ptr;
+ ItemPointer htid;
id = PageGetItemId(page, offset);
@@ -272,18 +287,49 @@ bt_page_print_tuples(FuncCallContext *fctx, Page page, OffsetNumber offset)
itup = (IndexTuple) PageGetItem(page, id);
j = 0;
- values[j++] = psprintf("%d", offset);
- values[j++] = psprintf("(%u,%u)",
- ItemPointerGetBlockNumberNoCheck(&itup->t_tid),
- ItemPointerGetOffsetNumberNoCheck(&itup->t_tid));
- values[j++] = psprintf("%d", (int) IndexTupleSize(itup));
- values[j++] = psprintf("%c", IndexTupleHasNulls(itup) ? 't' : 'f');
- values[j++] = psprintf("%c", IndexTupleHasVarwidths(itup) ? 't' : 'f');
+ memset(nulls, 0, sizeof(nulls));
+ values[j++] = DatumGetInt16(offset);
+ values[j++] = ItemPointerGetDatum(&itup->t_tid);
+ values[j++] = Int32GetDatum((int) IndexTupleSize(itup));
+ values[j++] = BoolGetDatum(IndexTupleHasNulls(itup));
+ values[j++] = BoolGetDatum(IndexTupleHasVarwidths(itup));
ptr = (char *) itup + IndexInfoFindDataOffset(itup->t_info);
dlen = IndexTupleSize(itup) - IndexInfoFindDataOffset(itup->t_info);
+
+ /*
+ * Make sure that "data" column does not include posting list or pivot
+ * tuple representation of heap TID(s).
+ *
+ * Note: BTreeTupleIsPivot() won't work reliably on !heapkeyspace indexes
+ * (those built before BTREE_VERSION 4), but we have no way of determining
+ * if this page came from a !heapkeyspace index. We may only have a bytea
+ * nbtree page image to go on, so in general there is no metapage that we
+ * can check.
+ *
+ * That's okay here because BTreeTupleIsPivot() can only return false for
+ * a !heapkeyspace pivot, never true for a !heapkeyspace non-pivot. Since
+ * heap TID isn't part of the keyspace in a !heapkeyspace index anyway,
+ * there cannot possibly be a pivot tuple heap TID representation that we
+ * fail to make an adjustment for. A !heapkeyspace index can have
+ * BTreeTupleIsPivot() return true (due to things like suffix truncation
+ * for INCLUDE indexes in Postgres v11), but when that happens
+ * BTreeTupleGetHeapTID() can be trusted to work reliably (i.e. return
+ * NULL).
+ *
+ * Note: BTreeTupleIsPosting() always works reliably, even with
+ * !heapkeyspace indexes.
+ */
+ if (BTreeTupleIsPosting(itup))
+ dlen -= IndexTupleSize(itup) - BTreeTupleGetPostingOffset(itup);
+ else if (BTreeTupleIsPivot(itup) && BTreeTupleGetHeapTID(itup) != NULL)
+ dlen -= MAXALIGN(sizeof(ItemPointerData));
+
+ if (dlen < 0 || dlen > INDEX_SIZE_MASK)
+ elog(ERROR, "invalid tuple length %d for tuple at offset number %u",
+ dlen, offset);
dump = palloc0(dlen * 3 + 1);
- values[j] = dump;
+ datacstring = dump;
for (off = 0; off < dlen; off++)
{
if (off > 0)
@@ -291,8 +337,62 @@ bt_page_print_tuples(FuncCallContext *fctx, Page page, OffsetNumber offset)
sprintf(dump, "%02x", *(ptr + off) & 0xff);
dump += 2;
}
+ values[j++] = CStringGetTextDatum(datacstring);
+ pfree(datacstring);
- tuple = BuildTupleFromCStrings(fctx->attinmeta, values);
+ /*
+ * We need to work around the BTreeTupleIsPivot() !heapkeyspace limitation
+ * again. Deduce whether or not tuple must be a pivot tuple based on
+ * whether or not the page is a leaf page, as well as the page offset
+ * number of the tuple.
+ */
+ ispivottuple = (!leafpage || (!rightmost && offset == P_HIKEY));
+
+ /* LP_DEAD bit can never be set for pivot tuples, so show a NULL there */
+ if (!ispivottuple)
+ values[j++] = BoolGetDatum(ItemIdIsDead(id));
+ else
+ {
+ Assert(!ItemIdIsDead(id));
+ nulls[j++] = true;
+ }
+
+ htid = BTreeTupleGetHeapTID(itup);
+ if (ispivottuple && !BTreeTupleIsPivot(itup))
+ {
+ /* Don't show bogus heap TID in !heapkeyspace pivot tuple */
+ htid = NULL;
+ }
+
+ if (htid)
+ values[j++] = ItemPointerGetDatum(htid);
+ else
+ nulls[j++] = true;
+
+ if (BTreeTupleIsPosting(itup))
+ {
+ /* Build an array of item pointers */
+ ItemPointer tids;
+ Datum *tids_datum;
+ int nposting;
+
+ tids = BTreeTupleGetPosting(itup);
+ nposting = BTreeTupleGetNPosting(itup);
+ tids_datum = (Datum *) palloc(nposting * sizeof(Datum));
+ for (int i = 0; i < nposting; i++)
+ tids_datum[i] = ItemPointerGetDatum(&tids[i]);
+ values[j++] = PointerGetDatum(construct_array(tids_datum,
+ nposting,
+ TIDOID,
+ sizeof(ItemPointerData),
+ false, 's'));
+ pfree(tids_datum);
+ }
+ else
+ nulls[j++] = true;
+
+ /* Build and return the result tuple */
+ tuple = heap_form_tuple(uargs->tupd, values, nulls);
return HeapTupleGetDatum(tuple);
}
@@ -378,12 +478,15 @@ bt_page_items(PG_FUNCTION_ARGS)
elog(NOTICE, "page is deleted");
fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
+ uargs->leafpage = P_ISLEAF(opaque);
+ uargs->rightmost = P_RIGHTMOST(opaque);
/* Build a tuple descriptor for our result type */
if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
elog(ERROR, "return type must be a row type");
+ tupleDesc = BlessTupleDesc(tupleDesc);
- fctx->attinmeta = TupleDescGetAttInMetadata(tupleDesc);
+ uargs->tupd = tupleDesc;
fctx->user_fctx = uargs;
@@ -395,7 +498,7 @@ bt_page_items(PG_FUNCTION_ARGS)
if (fctx->call_cntr < fctx->max_calls)
{
- result = bt_page_print_tuples(fctx, uargs->page, uargs->offset);
+ result = bt_page_print_tuples(fctx, uargs);
uargs->offset++;
SRF_RETURN_NEXT(fctx, result);
}
@@ -463,12 +566,15 @@ bt_page_items_bytea(PG_FUNCTION_ARGS)
elog(NOTICE, "page is deleted");
fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
+ uargs->leafpage = P_ISLEAF(opaque);
+ uargs->rightmost = P_RIGHTMOST(opaque);
/* Build a tuple descriptor for our result type */
if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
elog(ERROR, "return type must be a row type");
+ tupleDesc = BlessTupleDesc(tupleDesc);
- fctx->attinmeta = TupleDescGetAttInMetadata(tupleDesc);
+ uargs->tupd = tupleDesc;
fctx->user_fctx = uargs;
@@ -480,7 +586,7 @@ bt_page_items_bytea(PG_FUNCTION_ARGS)
if (fctx->call_cntr < fctx->max_calls)
{
- result = bt_page_print_tuples(fctx, uargs->page, uargs->offset);
+ result = bt_page_print_tuples(fctx, uargs);
uargs->offset++;
SRF_RETURN_NEXT(fctx, result);
}
@@ -510,7 +616,7 @@ bt_metap(PG_FUNCTION_ARGS)
BTMetaPageData *metad;
TupleDesc tupleDesc;
int j;
- char *values[8];
+ char *values[9];
Buffer buffer;
Page page;
HeapTuple tuple;
@@ -557,17 +663,21 @@ bt_metap(PG_FUNCTION_ARGS)
/*
* Get values of extended metadata if available, use default values
- * otherwise.
+ * otherwise. Note that we rely on the assumption that btm_allequalimage
+ * is initialized to zero with indexes that were built on versions prior
+ * to Postgres 13 (just like _bt_metaversion()).
*/
if (metad->btm_version >= BTREE_NOVAC_VERSION)
{
values[j++] = psprintf("%u", metad->btm_oldest_btpo_xact);
values[j++] = psprintf("%f", metad->btm_last_cleanup_num_heap_tuples);
+ values[j++] = metad->btm_allequalimage ? "t" : "f";
}
else
{
values[j++] = "0";
values[j++] = "-1";
+ values[j++] = "f";
}
tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
diff --git a/contrib/pageinspect/expected/btree.out b/contrib/pageinspect/expected/btree.out
index 07c2dcd7714..17bf0c54708 100644
--- a/contrib/pageinspect/expected/btree.out
+++ b/contrib/pageinspect/expected/btree.out
@@ -12,6 +12,7 @@ fastroot | 1
fastlevel | 0
oldest_xact | 0
last_cleanup_num_tuples | -1
+allequalimage | t
SELECT * FROM bt_page_stats('test1_a_idx', 0);
ERROR: block 0 is a meta page
@@ -41,6 +42,9 @@ itemlen | 16
nulls | f
vars | f
data | 01 00 00 00 00 00 00 01
+dead | f
+htid | (0,1)
+tids |
SELECT * FROM bt_page_items('test1_a_idx', 2);
ERROR: block number out of range
@@ -54,6 +58,9 @@ itemlen | 16
nulls | f
vars | f
data | 01 00 00 00 00 00 00 01
+dead | f
+htid | (0,1)
+tids |
SELECT * FROM bt_page_items(get_raw_page('test1_a_idx', 2));
ERROR: block number 2 is out of range for relation "test1_a_idx"
diff --git a/contrib/pageinspect/pageinspect--1.7--1.8.sql b/contrib/pageinspect/pageinspect--1.7--1.8.sql
index 2a7c4b35165..e34c214c93c 100644
--- a/contrib/pageinspect/pageinspect--1.7--1.8.sql
+++ b/contrib/pageinspect/pageinspect--1.7--1.8.sql
@@ -14,3 +14,56 @@ CREATE FUNCTION heap_tuple_infomask_flags(
RETURNS record
AS 'MODULE_PATHNAME', 'heap_tuple_infomask_flags'
LANGUAGE C STRICT PARALLEL SAFE;
+
+--
+-- bt_metap()
+--
+DROP FUNCTION bt_metap(text);
+CREATE FUNCTION bt_metap(IN relname text,
+ OUT magic int4,
+ OUT version int4,
+ OUT root int4,
+ OUT level int4,
+ OUT fastroot int4,
+ OUT fastlevel int4,
+ OUT oldest_xact int4,
+ OUT last_cleanup_num_tuples real,
+ OUT allequalimage boolean)
+AS 'MODULE_PATHNAME', 'bt_metap'
+LANGUAGE C STRICT PARALLEL SAFE;
+
+--
+-- bt_page_items(text, int4)
+--
+DROP FUNCTION bt_page_items(text, int4);
+CREATE FUNCTION bt_page_items(IN relname text, IN blkno int4,
+ OUT itemoffset smallint,
+ OUT ctid tid,
+ OUT itemlen smallint,
+ OUT nulls bool,
+ OUT vars bool,
+ OUT data text,
+ OUT dead boolean,
+ OUT htid tid,
+ OUT tids tid[])
+RETURNS SETOF record
+AS 'MODULE_PATHNAME', 'bt_page_items'
+LANGUAGE C STRICT PARALLEL SAFE;
+
+--
+-- bt_page_items(bytea)
+--
+DROP FUNCTION bt_page_items(bytea);
+CREATE FUNCTION bt_page_items(IN page bytea,
+ OUT itemoffset smallint,
+ OUT ctid tid,
+ OUT itemlen smallint,
+ OUT nulls bool,
+ OUT vars bool,
+ OUT data text,
+ OUT dead boolean,
+ OUT htid tid,
+ OUT tids tid[])
+RETURNS SETOF record
+AS 'MODULE_PATHNAME', 'bt_page_items_bytea'
+LANGUAGE C STRICT PARALLEL SAFE;
diff --git a/doc/src/sgml/pageinspect.sgml b/doc/src/sgml/pageinspect.sgml
index 7e2e1487d79..fedc94e8b22 100644
--- a/doc/src/sgml/pageinspect.sgml
+++ b/doc/src/sgml/pageinspect.sgml
@@ -300,13 +300,14 @@ test=# SELECT t_ctid, raw_flags, combined_flags
test=# SELECT * FROM bt_metap('pg_cast_oid_index');
-[ RECORD 1 ]-----------+-------
magic | 340322
-version | 3
+version | 4
root | 1
level | 0
fastroot | 1
fastlevel | 0
oldest_xact | 582
last_cleanup_num_tuples | 1000
+allequalimage | f
@@ -329,11 +330,11 @@ test=# SELECT * FROM bt_page_stats('pg_cast_oid_index', 1);
-[ RECORD 1 ]-+-----
blkno | 1
type | l
-live_items | 256
+live_items | 224
dead_items | 0
-avg_item_size | 12
+avg_item_size | 16
page_size | 8192
-free_size | 4056
+free_size | 3668
btpo_prev | 0
btpo_next | 0
btpo | 0
@@ -356,33 +357,75 @@ btpo_flags | 3
bt_page_items returns detailed information about
all of the items on a B-tree index page. For example:
-test=# SELECT * FROM bt_page_items('pg_cast_oid_index', 1);
- itemoffset | ctid | itemlen | nulls | vars | data
-------------+---------+---------+-------+------+-------------
- 1 | (0,1) | 12 | f | f | 23 27 00 00
- 2 | (0,2) | 12 | f | f | 24 27 00 00
- 3 | (0,3) | 12 | f | f | 25 27 00 00
- 4 | (0,4) | 12 | f | f | 26 27 00 00
- 5 | (0,5) | 12 | f | f | 27 27 00 00
- 6 | (0,6) | 12 | f | f | 28 27 00 00
- 7 | (0,7) | 12 | f | f | 29 27 00 00
- 8 | (0,8) | 12 | f | f | 2a 27 00 00
+ test=# SELECT itemoffset, ctid, itemlen, nulls, vars, data, dead, htid, tids[0:2] AS some_tids
+ FROM bt_page_items(get_raw_page('tenk2_hundred', 5));
+ itemoffset | ctid | itemlen | nulls | vars | data | dead | htid | some_tids
+------------+-----------+---------+-------+------+-------------------------+------+--------+---------------------
+ 1 | (16,1) | 16 | f | f | 30 00 00 00 00 00 00 00 | | |
+ 2 | (16,8292) | 616 | f | f | 24 00 00 00 00 00 00 00 | f | (1,6) | {"(1,6)","(10,22)"}
+ 3 | (16,8292) | 616 | f | f | 25 00 00 00 00 00 00 00 | f | (1,18) | {"(1,18)","(4,22)"}
+ 4 | (16,8292) | 616 | f | f | 26 00 00 00 00 00 00 00 | f | (4,18) | {"(4,18)","(6,17)"}
+ 5 | (16,8292) | 616 | f | f | 27 00 00 00 00 00 00 00 | f | (1,2) | {"(1,2)","(1,19)"}
+ 6 | (16,8292) | 616 | f | f | 28 00 00 00 00 00 00 00 | f | (2,24) | {"(2,24)","(4,11)"}
+ 7 | (16,8292) | 616 | f | f | 29 00 00 00 00 00 00 00 | f | (2,17) | {"(2,17)","(11,2)"}
+ 8 | (16,8292) | 616 | f | f | 2a 00 00 00 00 00 00 00 | f | (0,25) | {"(0,25)","(3,20)"}
+ 9 | (16,8292) | 616 | f | f | 2b 00 00 00 00 00 00 00 | f | (0,10) | {"(0,10)","(0,14)"}
+ 10 | (16,8292) | 616 | f | f | 2c 00 00 00 00 00 00 00 | f | (1,3) | {"(1,3)","(3,9)"}
+ 11 | (16,8292) | 616 | f | f | 2d 00 00 00 00 00 00 00 | f | (6,28) | {"(6,28)","(11,1)"}
+ 12 | (16,8292) | 616 | f | f | 2e 00 00 00 00 00 00 00 | f | (0,27) | {"(0,27)","(1,13)"}
+ 13 | (16,8292) | 616 | f | f | 2f 00 00 00 00 00 00 00 | f | (4,17) | {"(4,17)","(4,21)"}
+(13 rows)
- In a B-tree leaf page, ctid points to a heap tuple.
- In an internal page, the block number part of ctid
- points to another page in the index itself, while the offset part
- (the second number) is ignored and is usually 1.
+ This is a B-tree leaf page. All tuples that point to the table
+ happen to be posting list tuples (all of which store a total of
+ 100 6 byte TIDs). There is also a high key
tuple
+ at itemoffset number 1.
+ ctid is used to store encoded
+ information about each tuple in this example, though leaf page
+ tuples often store a heap TID directly in the
+ ctid field instead.
+ tids is the list of TIDs stored as a
+ posting list.
+
+
+ In an internal page (not shown), the block number part of
+ ctid is a downlink
,
+ which is a block number of another page in the index itself.
+ The offset part (the second number) of
+ ctid stores encoded information about
+ the tuple, such as the number of columns present (suffix
+ truncation may have removed unneeded suffix columns). Truncated
+ columns are treated as having the value minus
+ infinity
.
+
+
+ htid shows a heap TID for the tuple,
+ regardless of the underlying tuple representation. This value
+ may match ctid, or may be decoded
+ from the alternative representations used by posting list tuples
+ and tuples from internal pages. Tuples in internal pages
+ usually have the implementation level heap TID column truncated
+ away, which is represented as a NULL
+ htid value.
Note that the first item on any non-rightmost page (any page with
a non-zero value in the btpo_next field) is the
page's high key
, meaning its data
serves as an upper bound on all items appearing on the page, while
- its ctid field is meaningless. Also, on non-leaf
- pages, the first real data item (the first item that is not a high
- key) is a minus infinity
item, with no actual value
- in its data field. Such an item does have a valid
- downlink in its ctid field, however.
+ its ctid field does not point to
+ another block. Also, on internal pages, the first real data
+ item (the first item that is not a high key) reliably has every
+ column truncated away, leaving no actual value in its
+ data field. Such an item does have a
+ valid downlink in its ctid field,
+ however.
+
+
+ For more details about the structure of B-tree indexes, see
+ . For more details about
+ deduplication and posting lists, see .
@@ -402,17 +445,24 @@ test=# SELECT * FROM bt_page_items('pg_cast_oid_index', 1);
with get_raw_page should be passed as argument. So
the last example could also be rewritten like this:
-test=# SELECT * FROM bt_page_items(get_raw_page('pg_cast_oid_index', 1));
- itemoffset | ctid | itemlen | nulls | vars | data
-------------+---------+---------+-------+------+-------------
- 1 | (0,1) | 12 | f | f | 23 27 00 00
- 2 | (0,2) | 12 | f | f | 24 27 00 00
- 3 | (0,3) | 12 | f | f | 25 27 00 00
- 4 | (0,4) | 12 | f | f | 26 27 00 00
- 5 | (0,5) | 12 | f | f | 27 27 00 00
- 6 | (0,6) | 12 | f | f | 28 27 00 00
- 7 | (0,7) | 12 | f | f | 29 27 00 00
- 8 | (0,8) | 12 | f | f | 2a 27 00 00
+ test=# SELECT itemoffset, ctid, itemlen, nulls, vars, data, dead, htid, tids[0:2] AS some_tids
+ FROM bt_page_items(get_raw_page('tenk2_hundred', 5));
+ itemoffset | ctid | itemlen | nulls | vars | data | dead | htid | some_tids
+------------+-----------+---------+-------+------+-------------------------+------+--------+---------------------
+ 1 | (16,1) | 16 | f | f | 30 00 00 00 00 00 00 00 | | |
+ 2 | (16,8292) | 616 | f | f | 24 00 00 00 00 00 00 00 | f | (1,6) | {"(1,6)","(10,22)"}
+ 3 | (16,8292) | 616 | f | f | 25 00 00 00 00 00 00 00 | f | (1,18) | {"(1,18)","(4,22)"}
+ 4 | (16,8292) | 616 | f | f | 26 00 00 00 00 00 00 00 | f | (4,18) | {"(4,18)","(6,17)"}
+ 5 | (16,8292) | 616 | f | f | 27 00 00 00 00 00 00 00 | f | (1,2) | {"(1,2)","(1,19)"}
+ 6 | (16,8292) | 616 | f | f | 28 00 00 00 00 00 00 00 | f | (2,24) | {"(2,24)","(4,11)"}
+ 7 | (16,8292) | 616 | f | f | 29 00 00 00 00 00 00 00 | f | (2,17) | {"(2,17)","(11,2)"}
+ 8 | (16,8292) | 616 | f | f | 2a 00 00 00 00 00 00 00 | f | (0,25) | {"(0,25)","(3,20)"}
+ 9 | (16,8292) | 616 | f | f | 2b 00 00 00 00 00 00 00 | f | (0,10) | {"(0,10)","(0,14)"}
+ 10 | (16,8292) | 616 | f | f | 2c 00 00 00 00 00 00 00 | f | (1,3) | {"(1,3)","(3,9)"}
+ 11 | (16,8292) | 616 | f | f | 2d 00 00 00 00 00 00 00 | f | (6,28) | {"(6,28)","(11,1)"}
+ 12 | (16,8292) | 616 | f | f | 2e 00 00 00 00 00 00 00 | f | (0,27) | {"(0,27)","(1,13)"}
+ 13 | (16,8292) | 616 | f | f | 2f 00 00 00 00 00 00 00 | f | (4,17) | {"(4,17)","(4,21)"}
+(13 rows)
All the other details are the same as explained in the previous item.