diff --git a/contrib/amcheck/Makefile b/contrib/amcheck/Makefile index c5764b544fd..dcec3b85203 100644 --- a/contrib/amcheck/Makefile +++ b/contrib/amcheck/Makefile @@ -4,7 +4,7 @@ MODULE_big = amcheck OBJS = verify_nbtree.o $(WIN32RES) EXTENSION = amcheck -DATA = amcheck--1.0--1.1.sql amcheck--1.0.sql +DATA = amcheck--1.1--1.2.sql amcheck--1.0--1.1.sql amcheck--1.0.sql PGFILEDESC = "amcheck - function for verifying relation integrity" REGRESS = check check_btree diff --git a/contrib/amcheck/amcheck--1.1--1.2.sql b/contrib/amcheck/amcheck--1.1--1.2.sql new file mode 100644 index 00000000000..883530dec74 --- /dev/null +++ b/contrib/amcheck/amcheck--1.1--1.2.sql @@ -0,0 +1,19 @@ +/* contrib/amcheck/amcheck--1.1--1.2.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION amcheck UPDATE TO '1.2'" to load this file. \quit + +-- In order to avoid issues with dependencies when updating amcheck to 1.2, +-- create new, overloaded version of the 1.1 function signature + +-- +-- bt_index_parent_check() +-- +CREATE FUNCTION bt_index_parent_check(index regclass, + heapallindexed boolean, rootdescend boolean) +RETURNS VOID +AS 'MODULE_PATHNAME', 'bt_index_parent_check' +LANGUAGE C STRICT PARALLEL RESTRICTED; + +-- Don't want this to be available to public +REVOKE ALL ON FUNCTION bt_index_parent_check(regclass, boolean, boolean) FROM PUBLIC; diff --git a/contrib/amcheck/amcheck.control b/contrib/amcheck/amcheck.control index 469048403db..c6e310046d4 100644 --- a/contrib/amcheck/amcheck.control +++ b/contrib/amcheck/amcheck.control @@ -1,5 +1,5 @@ # amcheck extension comment = 'functions for verifying relation integrity' -default_version = '1.1' +default_version = '1.2' module_pathname = '$libdir/amcheck' relocatable = true diff --git a/contrib/amcheck/expected/check_btree.out b/contrib/amcheck/expected/check_btree.out index 1e6079ddd29..6103aa4e472 100644 --- a/contrib/amcheck/expected/check_btree.out +++ b/contrib/amcheck/expected/check_btree.out @@ -126,7 +126,8 @@ SELECT bt_index_parent_check('bttest_multi_idx', true); (1 row) -- --- Test for multilevel page deletion/downlink present checks +-- Test for multilevel page deletion/downlink present checks, and rootdescend +-- checks -- INSERT INTO delete_test_table SELECT i, 1, 2, 3 FROM generate_series(1,80000) i; ALTER TABLE delete_test_table ADD PRIMARY KEY (a,b,c,d); @@ -137,7 +138,7 @@ VACUUM delete_test_table; -- root" DELETE FROM delete_test_table WHERE a < 79990; VACUUM delete_test_table; -SELECT bt_index_parent_check('delete_test_table_pkey', true); +SELECT bt_index_parent_check('delete_test_table_pkey', true, true); bt_index_parent_check ----------------------- diff --git a/contrib/amcheck/sql/check_btree.sql b/contrib/amcheck/sql/check_btree.sql index 3f1e0d17efe..f3e365f4bdf 100644 --- a/contrib/amcheck/sql/check_btree.sql +++ b/contrib/amcheck/sql/check_btree.sql @@ -78,7 +78,8 @@ INSERT INTO bttest_multi SELECT i, i%2 FROM generate_series(1, 100000) as i; SELECT bt_index_parent_check('bttest_multi_idx', true); -- --- Test for multilevel page deletion/downlink present checks +-- Test for multilevel page deletion/downlink present checks, and rootdescend +-- checks -- INSERT INTO delete_test_table SELECT i, 1, 2, 3 FROM generate_series(1,80000) i; ALTER TABLE delete_test_table ADD PRIMARY KEY (a,b,c,d); @@ -89,7 +90,7 @@ VACUUM delete_test_table; -- root" DELETE FROM delete_test_table WHERE a < 79990; VACUUM delete_test_table; -SELECT bt_index_parent_check('delete_test_table_pkey', true); +SELECT bt_index_parent_check('delete_test_table_pkey', true, true); -- -- BUG #15597: must not assume consistent input toasting state when forming diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 4363e6b82e7..6ae3bca9536 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -75,6 +75,8 @@ typedef struct BtreeCheckState bool readonly; /* Also verifying heap has no unindexed tuples? */ bool heapallindexed; + /* Also making sure non-pivot tuples can be found by new search? */ + bool rootdescend; /* Per-page context */ MemoryContext targetcontext; /* Buffer access strategy */ @@ -124,10 +126,11 @@ PG_FUNCTION_INFO_V1(bt_index_check); PG_FUNCTION_INFO_V1(bt_index_parent_check); static void bt_index_check_internal(Oid indrelid, bool parentcheck, - bool heapallindexed); + bool heapallindexed, bool rootdescend); static inline void btree_index_checkable(Relation rel); static void bt_check_every_level(Relation rel, Relation heaprel, - bool heapkeyspace, bool readonly, bool heapallindexed); + bool heapkeyspace, bool readonly, bool heapallindexed, + bool rootdescend); static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level); static void bt_target_page_check(BtreeCheckState *state); @@ -140,6 +143,7 @@ static void bt_tuple_present_callback(Relation index, HeapTuple htup, bool tupleIsAlive, void *checkstate); static IndexTuple bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup); +static bool bt_rootdescend(BtreeCheckState *state, IndexTuple itup); static inline bool offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset); static inline bool invariant_l_offset(BtreeCheckState *state, BTScanInsert key, @@ -177,7 +181,7 @@ bt_index_check(PG_FUNCTION_ARGS) if (PG_NARGS() == 2) heapallindexed = PG_GETARG_BOOL(1); - bt_index_check_internal(indrelid, false, heapallindexed); + bt_index_check_internal(indrelid, false, heapallindexed, false); PG_RETURN_VOID(); } @@ -196,11 +200,14 @@ bt_index_parent_check(PG_FUNCTION_ARGS) { Oid indrelid = PG_GETARG_OID(0); bool heapallindexed = false; + bool rootdescend = false; - if (PG_NARGS() == 2) + if (PG_NARGS() >= 2) heapallindexed = PG_GETARG_BOOL(1); + if (PG_NARGS() == 3) + rootdescend = PG_GETARG_BOOL(2); - bt_index_check_internal(indrelid, true, heapallindexed); + bt_index_check_internal(indrelid, true, heapallindexed, rootdescend); PG_RETURN_VOID(); } @@ -209,7 +216,8 @@ bt_index_parent_check(PG_FUNCTION_ARGS) * Helper for bt_index_[parent_]check, coordinating the bulk of the work. */ static void -bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed) +bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed, + bool rootdescend) { Oid heapid; Relation indrel; @@ -267,7 +275,7 @@ bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed) /* Check index, possibly against table it is an index on */ heapkeyspace = _bt_heapkeyspace(indrel); bt_check_every_level(indrel, heaprel, heapkeyspace, parentcheck, - heapallindexed); + heapallindexed, rootdescend); /* * Release locks early. That's ok here because nothing in the called @@ -338,7 +346,7 @@ btree_index_checkable(Relation rel) */ static void bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, - bool readonly, bool heapallindexed) + bool readonly, bool heapallindexed, bool rootdescend) { BtreeCheckState *state; Page metapage; @@ -362,6 +370,7 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, state->heapkeyspace = heapkeyspace; state->readonly = readonly; state->heapallindexed = heapallindexed; + state->rootdescend = rootdescend; if (state->heapallindexed) { @@ -430,6 +439,14 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, } } + Assert(!state->rootdescend || state->readonly); + if (state->rootdescend && !state->heapkeyspace) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot verify that tuples from index \"%s\" can each be found by an independent index search", + RelationGetRelationName(rel)), + errhint("Only B-Tree version 4 indexes support rootdescend verification."))); + /* Create context for page */ state->targetcontext = AllocSetContextCreate(CurrentMemoryContext, "amcheck context", @@ -922,6 +939,31 @@ bt_target_page_check(BtreeCheckState *state) if (offset_is_negative_infinity(topaque, offset)) continue; + /* + * Readonly callers may optionally verify that non-pivot tuples can + * each be found by an independent search that starts from the root + */ + if (state->rootdescend && P_ISLEAF(topaque) && + !bt_rootdescend(state, itup)) + { + char *itid, + *htid; + + itid = psprintf("(%u,%u)", state->targetblock, offset); + htid = psprintf("(%u,%u)", + ItemPointerGetBlockNumber(&(itup->t_tid)), + ItemPointerGetOffsetNumber(&(itup->t_tid))); + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("could not find tuple using search from root page in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Index tid=%s points to heap tid=%s page lsn=%X/%X.", + itid, htid, + (uint32) (state->targetlsn >> 32), + (uint32) state->targetlsn))); + } + /* Build insertion scankey for current page offset */ skey = bt_mkscankey_pivotsearch(state->rel, itup); @@ -1526,6 +1568,9 @@ bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey, * internal pages. In more general terms, a negative infinity item is * only negative infinity with respect to the subtree that the page is * at the root of. + * + * See also: bt_rootdescend(), which can even detect transitive + * inconsistencies on cousin leaf pages. */ if (offset_is_negative_infinity(copaque, offset)) continue; @@ -1926,6 +1971,81 @@ bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup) return reformed; } +/* + * Search for itup in index, starting from fast root page. itup must be a + * non-pivot tuple. This is only supported with heapkeyspace indexes, since + * we rely on having fully unique keys to find a match with only a signle + * visit to a leaf page, barring an interrupted page split, where we may have + * to move right. (A concurrent page split is impossible because caller must + * be readonly caller.) + * + * This routine can detect very subtle transitive consistency issues across + * more than one level of the tree. Leaf pages all have a high key (even the + * rightmost page has a conceptual positive infinity high key), but not a low + * key. Their downlink in parent is a lower bound, which along with the high + * key is almost enough to detect every possible inconsistency. A downlink + * separator key value won't always be available from parent, though, because + * the first items of internal pages are negative infinity items, truncated + * down to zero attributes during internal page splits. While it's true that + * bt_downlink_check() and the high key check can detect most imaginable key + * space problems, there are remaining problems it won't detect with non-pivot + * tuples in cousin leaf pages. Starting a search from the root for every + * existing leaf tuple detects small inconsistencies in upper levels of the + * tree that cannot be detected any other way. (Besides all this, this is + * probably also useful as a direct test of the code used by index scans + * themselves.) + */ +static bool +bt_rootdescend(BtreeCheckState *state, IndexTuple itup) +{ + BTScanInsert key; + BTStack stack; + Buffer lbuf; + bool exists; + + key = _bt_mkscankey(state->rel, itup); + Assert(key->heapkeyspace && key->scantid != NULL); + + /* + * Search from root. + * + * Ideally, we would arrange to only move right within _bt_search() when + * an interrupted page split is detected (i.e. when the incomplete split + * bit is found to be set), but for now we accept the possibility that + * that could conceal an inconsistency. + */ + Assert(state->readonly && state->rootdescend); + exists = false; + stack = _bt_search(state->rel, key, &lbuf, BT_READ, NULL); + + if (BufferIsValid(lbuf)) + { + BTInsertStateData insertstate; + OffsetNumber offnum; + Page page; + + insertstate.itup = itup; + insertstate.itemsz = MAXALIGN(IndexTupleSize(itup)); + insertstate.itup_key = key; + insertstate.bounds_valid = false; + insertstate.buf = lbuf; + + /* Get matching tuple on leaf page */ + offnum = _bt_binsrch_insert(state->rel, &insertstate); + /* Compare first >= matching item on leaf page, if any */ + page = BufferGetPage(lbuf); + if (offnum <= PageGetMaxOffsetNumber(page) && + _bt_compare(state->rel, key, page, offnum) == 0) + exists = true; + _bt_relbuf(state->rel, lbuf); + } + + _bt_freestack(stack); + pfree(key); + + return exists; +} + /* * Is particular offset within page (whose special state is passed by caller) * the page negative-infinity item? diff --git a/doc/src/sgml/amcheck.sgml b/doc/src/sgml/amcheck.sgml index 8bb60d5c2da..627651d8d4a 100644 --- a/doc/src/sgml/amcheck.sgml +++ b/doc/src/sgml/amcheck.sgml @@ -112,7 +112,7 @@ ORDER BY c.relpages DESC LIMIT 10; - bt_index_parent_check(index regclass, heapallindexed boolean) returns void + bt_index_parent_check(index regclass, heapallindexed boolean, rootdescend boolean) returns void bt_index_parent_check @@ -126,7 +126,10 @@ ORDER BY c.relpages DESC LIMIT 10; argument is true, the function verifies the presence of all heap tuples that should be found within the index, and that there are no missing downlinks in the index - structure. The checks that can be performed by + structure. When the optional rootdescend + argument is true, verification re-finds + tuples on the leaf level by performing a new search from the + root page for each tuple. The checks that can be performed by bt_index_parent_check are a superset of the checks that can be performed by bt_index_check. bt_index_parent_check can be thought of as