From c1afd175b5b2e5c44f6da34988342e00ecdfb518 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Wed, 20 Mar 2019 10:41:36 -0700 Subject: [PATCH] Allow amcheck to re-find tuples using new search. Teach contrib/amcheck's bt_index_parent_check() function to take advantage of the uniqueness property of heapkeyspace indexes in support of a new verification option: non-pivot tuples (non-highkey tuples on the leaf level) can optionally be re-found using a new search for each, that starts from the root page. If a tuple cannot be re-found, report that the index is corrupt. The new "rootdescend" verification option is exhaustive, and can therefore make a call to bt_index_parent_check() take a lot longer. Re-finding tuples during verification is mostly intended as an option for backend developers, since the corruption scenarios that it alone is uniquely capable of detecting seem fairly far-fetched. For example, "rootdescend" verification is much more likely to detect corruption of the least significant byte of a key from a pivot tuple in the root page of a B-Tree that already has at least three levels. Typically, only a few tuples on a cousin leaf page are at risk of "getting overlooked" by index scans in this scenario. The corrupt key in the root page is only slightly corrupt: corrupt enough to give wrong answers to some queries, and yet not corrupt enough to allow the problem to be detected without verifying agreement between the leaf page and the root page, skipping at least one internal page level. The existing bt_index_parent_check() checks never cross more than a single level. Author: Peter Geoghegan Reviewed-By: Heikki Linnakangas Discussion: https://postgr.es/m/CAH2-Wz=yTWnVu+HeHGKb2AGiADL9eprn-cKYAto4MkKOuiGtRQ@mail.gmail.com --- contrib/amcheck/Makefile | 2 +- contrib/amcheck/amcheck--1.1--1.2.sql | 19 ++++ contrib/amcheck/amcheck.control | 2 +- contrib/amcheck/expected/check_btree.out | 5 +- contrib/amcheck/sql/check_btree.sql | 5 +- contrib/amcheck/verify_nbtree.c | 136 +++++++++++++++++++++-- doc/src/sgml/amcheck.sgml | 7 +- 7 files changed, 160 insertions(+), 16 deletions(-) create mode 100644 contrib/amcheck/amcheck--1.1--1.2.sql diff --git a/contrib/amcheck/Makefile b/contrib/amcheck/Makefile index c5764b544fd..dcec3b85203 100644 --- a/contrib/amcheck/Makefile +++ b/contrib/amcheck/Makefile @@ -4,7 +4,7 @@ MODULE_big = amcheck OBJS = verify_nbtree.o $(WIN32RES) EXTENSION = amcheck -DATA = amcheck--1.0--1.1.sql amcheck--1.0.sql +DATA = amcheck--1.1--1.2.sql amcheck--1.0--1.1.sql amcheck--1.0.sql PGFILEDESC = "amcheck - function for verifying relation integrity" REGRESS = check check_btree diff --git a/contrib/amcheck/amcheck--1.1--1.2.sql b/contrib/amcheck/amcheck--1.1--1.2.sql new file mode 100644 index 00000000000..883530dec74 --- /dev/null +++ b/contrib/amcheck/amcheck--1.1--1.2.sql @@ -0,0 +1,19 @@ +/* contrib/amcheck/amcheck--1.1--1.2.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION amcheck UPDATE TO '1.2'" to load this file. \quit + +-- In order to avoid issues with dependencies when updating amcheck to 1.2, +-- create new, overloaded version of the 1.1 function signature + +-- +-- bt_index_parent_check() +-- +CREATE FUNCTION bt_index_parent_check(index regclass, + heapallindexed boolean, rootdescend boolean) +RETURNS VOID +AS 'MODULE_PATHNAME', 'bt_index_parent_check' +LANGUAGE C STRICT PARALLEL RESTRICTED; + +-- Don't want this to be available to public +REVOKE ALL ON FUNCTION bt_index_parent_check(regclass, boolean, boolean) FROM PUBLIC; diff --git a/contrib/amcheck/amcheck.control b/contrib/amcheck/amcheck.control index 469048403db..c6e310046d4 100644 --- a/contrib/amcheck/amcheck.control +++ b/contrib/amcheck/amcheck.control @@ -1,5 +1,5 @@ # amcheck extension comment = 'functions for verifying relation integrity' -default_version = '1.1' +default_version = '1.2' module_pathname = '$libdir/amcheck' relocatable = true diff --git a/contrib/amcheck/expected/check_btree.out b/contrib/amcheck/expected/check_btree.out index 1e6079ddd29..6103aa4e472 100644 --- a/contrib/amcheck/expected/check_btree.out +++ b/contrib/amcheck/expected/check_btree.out @@ -126,7 +126,8 @@ SELECT bt_index_parent_check('bttest_multi_idx', true); (1 row) -- --- Test for multilevel page deletion/downlink present checks +-- Test for multilevel page deletion/downlink present checks, and rootdescend +-- checks -- INSERT INTO delete_test_table SELECT i, 1, 2, 3 FROM generate_series(1,80000) i; ALTER TABLE delete_test_table ADD PRIMARY KEY (a,b,c,d); @@ -137,7 +138,7 @@ VACUUM delete_test_table; -- root" DELETE FROM delete_test_table WHERE a < 79990; VACUUM delete_test_table; -SELECT bt_index_parent_check('delete_test_table_pkey', true); +SELECT bt_index_parent_check('delete_test_table_pkey', true, true); bt_index_parent_check ----------------------- diff --git a/contrib/amcheck/sql/check_btree.sql b/contrib/amcheck/sql/check_btree.sql index 3f1e0d17efe..f3e365f4bdf 100644 --- a/contrib/amcheck/sql/check_btree.sql +++ b/contrib/amcheck/sql/check_btree.sql @@ -78,7 +78,8 @@ INSERT INTO bttest_multi SELECT i, i%2 FROM generate_series(1, 100000) as i; SELECT bt_index_parent_check('bttest_multi_idx', true); -- --- Test for multilevel page deletion/downlink present checks +-- Test for multilevel page deletion/downlink present checks, and rootdescend +-- checks -- INSERT INTO delete_test_table SELECT i, 1, 2, 3 FROM generate_series(1,80000) i; ALTER TABLE delete_test_table ADD PRIMARY KEY (a,b,c,d); @@ -89,7 +90,7 @@ VACUUM delete_test_table; -- root" DELETE FROM delete_test_table WHERE a < 79990; VACUUM delete_test_table; -SELECT bt_index_parent_check('delete_test_table_pkey', true); +SELECT bt_index_parent_check('delete_test_table_pkey', true, true); -- -- BUG #15597: must not assume consistent input toasting state when forming diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 4363e6b82e7..6ae3bca9536 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -75,6 +75,8 @@ typedef struct BtreeCheckState bool readonly; /* Also verifying heap has no unindexed tuples? */ bool heapallindexed; + /* Also making sure non-pivot tuples can be found by new search? */ + bool rootdescend; /* Per-page context */ MemoryContext targetcontext; /* Buffer access strategy */ @@ -124,10 +126,11 @@ PG_FUNCTION_INFO_V1(bt_index_check); PG_FUNCTION_INFO_V1(bt_index_parent_check); static void bt_index_check_internal(Oid indrelid, bool parentcheck, - bool heapallindexed); + bool heapallindexed, bool rootdescend); static inline void btree_index_checkable(Relation rel); static void bt_check_every_level(Relation rel, Relation heaprel, - bool heapkeyspace, bool readonly, bool heapallindexed); + bool heapkeyspace, bool readonly, bool heapallindexed, + bool rootdescend); static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level); static void bt_target_page_check(BtreeCheckState *state); @@ -140,6 +143,7 @@ static void bt_tuple_present_callback(Relation index, HeapTuple htup, bool tupleIsAlive, void *checkstate); static IndexTuple bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup); +static bool bt_rootdescend(BtreeCheckState *state, IndexTuple itup); static inline bool offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset); static inline bool invariant_l_offset(BtreeCheckState *state, BTScanInsert key, @@ -177,7 +181,7 @@ bt_index_check(PG_FUNCTION_ARGS) if (PG_NARGS() == 2) heapallindexed = PG_GETARG_BOOL(1); - bt_index_check_internal(indrelid, false, heapallindexed); + bt_index_check_internal(indrelid, false, heapallindexed, false); PG_RETURN_VOID(); } @@ -196,11 +200,14 @@ bt_index_parent_check(PG_FUNCTION_ARGS) { Oid indrelid = PG_GETARG_OID(0); bool heapallindexed = false; + bool rootdescend = false; - if (PG_NARGS() == 2) + if (PG_NARGS() >= 2) heapallindexed = PG_GETARG_BOOL(1); + if (PG_NARGS() == 3) + rootdescend = PG_GETARG_BOOL(2); - bt_index_check_internal(indrelid, true, heapallindexed); + bt_index_check_internal(indrelid, true, heapallindexed, rootdescend); PG_RETURN_VOID(); } @@ -209,7 +216,8 @@ bt_index_parent_check(PG_FUNCTION_ARGS) * Helper for bt_index_[parent_]check, coordinating the bulk of the work. */ static void -bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed) +bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed, + bool rootdescend) { Oid heapid; Relation indrel; @@ -267,7 +275,7 @@ bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed) /* Check index, possibly against table it is an index on */ heapkeyspace = _bt_heapkeyspace(indrel); bt_check_every_level(indrel, heaprel, heapkeyspace, parentcheck, - heapallindexed); + heapallindexed, rootdescend); /* * Release locks early. That's ok here because nothing in the called @@ -338,7 +346,7 @@ btree_index_checkable(Relation rel) */ static void bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, - bool readonly, bool heapallindexed) + bool readonly, bool heapallindexed, bool rootdescend) { BtreeCheckState *state; Page metapage; @@ -362,6 +370,7 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, state->heapkeyspace = heapkeyspace; state->readonly = readonly; state->heapallindexed = heapallindexed; + state->rootdescend = rootdescend; if (state->heapallindexed) { @@ -430,6 +439,14 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, } } + Assert(!state->rootdescend || state->readonly); + if (state->rootdescend && !state->heapkeyspace) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot verify that tuples from index \"%s\" can each be found by an independent index search", + RelationGetRelationName(rel)), + errhint("Only B-Tree version 4 indexes support rootdescend verification."))); + /* Create context for page */ state->targetcontext = AllocSetContextCreate(CurrentMemoryContext, "amcheck context", @@ -922,6 +939,31 @@ bt_target_page_check(BtreeCheckState *state) if (offset_is_negative_infinity(topaque, offset)) continue; + /* + * Readonly callers may optionally verify that non-pivot tuples can + * each be found by an independent search that starts from the root + */ + if (state->rootdescend && P_ISLEAF(topaque) && + !bt_rootdescend(state, itup)) + { + char *itid, + *htid; + + itid = psprintf("(%u,%u)", state->targetblock, offset); + htid = psprintf("(%u,%u)", + ItemPointerGetBlockNumber(&(itup->t_tid)), + ItemPointerGetOffsetNumber(&(itup->t_tid))); + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("could not find tuple using search from root page in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Index tid=%s points to heap tid=%s page lsn=%X/%X.", + itid, htid, + (uint32) (state->targetlsn >> 32), + (uint32) state->targetlsn))); + } + /* Build insertion scankey for current page offset */ skey = bt_mkscankey_pivotsearch(state->rel, itup); @@ -1526,6 +1568,9 @@ bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey, * internal pages. In more general terms, a negative infinity item is * only negative infinity with respect to the subtree that the page is * at the root of. + * + * See also: bt_rootdescend(), which can even detect transitive + * inconsistencies on cousin leaf pages. */ if (offset_is_negative_infinity(copaque, offset)) continue; @@ -1926,6 +1971,81 @@ bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup) return reformed; } +/* + * Search for itup in index, starting from fast root page. itup must be a + * non-pivot tuple. This is only supported with heapkeyspace indexes, since + * we rely on having fully unique keys to find a match with only a signle + * visit to a leaf page, barring an interrupted page split, where we may have + * to move right. (A concurrent page split is impossible because caller must + * be readonly caller.) + * + * This routine can detect very subtle transitive consistency issues across + * more than one level of the tree. Leaf pages all have a high key (even the + * rightmost page has a conceptual positive infinity high key), but not a low + * key. Their downlink in parent is a lower bound, which along with the high + * key is almost enough to detect every possible inconsistency. A downlink + * separator key value won't always be available from parent, though, because + * the first items of internal pages are negative infinity items, truncated + * down to zero attributes during internal page splits. While it's true that + * bt_downlink_check() and the high key check can detect most imaginable key + * space problems, there are remaining problems it won't detect with non-pivot + * tuples in cousin leaf pages. Starting a search from the root for every + * existing leaf tuple detects small inconsistencies in upper levels of the + * tree that cannot be detected any other way. (Besides all this, this is + * probably also useful as a direct test of the code used by index scans + * themselves.) + */ +static bool +bt_rootdescend(BtreeCheckState *state, IndexTuple itup) +{ + BTScanInsert key; + BTStack stack; + Buffer lbuf; + bool exists; + + key = _bt_mkscankey(state->rel, itup); + Assert(key->heapkeyspace && key->scantid != NULL); + + /* + * Search from root. + * + * Ideally, we would arrange to only move right within _bt_search() when + * an interrupted page split is detected (i.e. when the incomplete split + * bit is found to be set), but for now we accept the possibility that + * that could conceal an inconsistency. + */ + Assert(state->readonly && state->rootdescend); + exists = false; + stack = _bt_search(state->rel, key, &lbuf, BT_READ, NULL); + + if (BufferIsValid(lbuf)) + { + BTInsertStateData insertstate; + OffsetNumber offnum; + Page page; + + insertstate.itup = itup; + insertstate.itemsz = MAXALIGN(IndexTupleSize(itup)); + insertstate.itup_key = key; + insertstate.bounds_valid = false; + insertstate.buf = lbuf; + + /* Get matching tuple on leaf page */ + offnum = _bt_binsrch_insert(state->rel, &insertstate); + /* Compare first >= matching item on leaf page, if any */ + page = BufferGetPage(lbuf); + if (offnum <= PageGetMaxOffsetNumber(page) && + _bt_compare(state->rel, key, page, offnum) == 0) + exists = true; + _bt_relbuf(state->rel, lbuf); + } + + _bt_freestack(stack); + pfree(key); + + return exists; +} + /* * Is particular offset within page (whose special state is passed by caller) * the page negative-infinity item? diff --git a/doc/src/sgml/amcheck.sgml b/doc/src/sgml/amcheck.sgml index 8bb60d5c2da..627651d8d4a 100644 --- a/doc/src/sgml/amcheck.sgml +++ b/doc/src/sgml/amcheck.sgml @@ -112,7 +112,7 @@ ORDER BY c.relpages DESC LIMIT 10; - bt_index_parent_check(index regclass, heapallindexed boolean) returns void + bt_index_parent_check(index regclass, heapallindexed boolean, rootdescend boolean) returns void bt_index_parent_check @@ -126,7 +126,10 @@ ORDER BY c.relpages DESC LIMIT 10; argument is true, the function verifies the presence of all heap tuples that should be found within the index, and that there are no missing downlinks in the index - structure. The checks that can be performed by + structure. When the optional rootdescend + argument is true, verification re-finds + tuples on the leaf level by performing a new search from the + root page for each tuple. The checks that can be performed by bt_index_parent_check are a superset of the checks that can be performed by bt_index_check. bt_index_parent_check can be thought of as