mirror of
https://github.com/postgres/postgres.git
synced 2025-07-26 01:22:12 +03:00
Add missing and dangling downlink checks to amcheck
When bt_index_parent_check() is called with the heapallindexed option,
allocate a second Bloom filter to fingerprint block numbers that appear
in the downlinks of internal pages. Use Bloom filter probes when
walking the B-Tree to detect missing downlinks. This can detect subtle
problems with page deletion/VACUUM, such as corruption caused by the bug
just fixed in commit 6db4b499
.
The downlink Bloom filter is bound in size by work_mem. Its optimal
size is typically far smaller than that of the regular heapallindexed
Bloom filter, especially when the index has high fan-out.
Author: Peter Geoghegan
Reviewer: Teodor Sigaev
Discussion: https://postgr.es/m/CAH2-WznUzY4fWTjm1tBB3JpVz8cCfz7k_qVp5BhuPyhivmWJFg@mail.gmail.com
This commit is contained in:
@ -91,6 +91,10 @@ typedef struct BtreeCheckState
|
|||||||
|
|
||||||
/* Bloom filter fingerprints B-Tree index */
|
/* Bloom filter fingerprints B-Tree index */
|
||||||
bloom_filter *filter;
|
bloom_filter *filter;
|
||||||
|
/* Bloom filter fingerprints downlink blocks within tree */
|
||||||
|
bloom_filter *downlinkfilter;
|
||||||
|
/* Right half of incomplete split marker */
|
||||||
|
bool rightsplit;
|
||||||
/* Debug counter */
|
/* Debug counter */
|
||||||
int64 heaptuplespresent;
|
int64 heaptuplespresent;
|
||||||
} BtreeCheckState;
|
} BtreeCheckState;
|
||||||
@ -124,6 +128,7 @@ static void bt_target_page_check(BtreeCheckState *state);
|
|||||||
static ScanKey bt_right_page_check_scankey(BtreeCheckState *state);
|
static ScanKey bt_right_page_check_scankey(BtreeCheckState *state);
|
||||||
static void bt_downlink_check(BtreeCheckState *state, BlockNumber childblock,
|
static void bt_downlink_check(BtreeCheckState *state, BlockNumber childblock,
|
||||||
ScanKey targetkey);
|
ScanKey targetkey);
|
||||||
|
static void bt_downlink_missing_check(BtreeCheckState *state);
|
||||||
static void bt_tuple_present_callback(Relation index, HeapTuple htup,
|
static void bt_tuple_present_callback(Relation index, HeapTuple htup,
|
||||||
Datum *values, bool *isnull,
|
Datum *values, bool *isnull,
|
||||||
bool tupleIsAlive, void *checkstate);
|
bool tupleIsAlive, void *checkstate);
|
||||||
@ -360,6 +365,9 @@ bt_check_every_level(Relation rel, Relation heaprel, bool readonly,
|
|||||||
* before index fingerprinting begins, so we can later be certain that
|
* before index fingerprinting begins, so we can later be certain that
|
||||||
* index fingerprinting should have reached all tuples returned by
|
* index fingerprinting should have reached all tuples returned by
|
||||||
* IndexBuildHeapScan().
|
* IndexBuildHeapScan().
|
||||||
|
*
|
||||||
|
* In readonly case, we also check for problems with missing downlinks.
|
||||||
|
* A second Bloom filter is used for this.
|
||||||
*/
|
*/
|
||||||
if (!state->readonly)
|
if (!state->readonly)
|
||||||
{
|
{
|
||||||
@ -386,6 +394,23 @@ bt_check_every_level(Relation rel, Relation heaprel, bool readonly,
|
|||||||
errmsg("index \"%s\" cannot be verified using transaction snapshot",
|
errmsg("index \"%s\" cannot be verified using transaction snapshot",
|
||||||
RelationGetRelationName(rel))));
|
RelationGetRelationName(rel))));
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
int64 total_pages;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Extra readonly downlink check.
|
||||||
|
*
|
||||||
|
* In readonly case, we know that there cannot be a concurrent page
|
||||||
|
* split or a concurrent page deletion, which gives us the
|
||||||
|
* opportunity to verify that every non-ignorable page had a
|
||||||
|
* downlink one level up. We must be tolerant of interrupted page
|
||||||
|
* splits and page deletions, though. This is taken care of in
|
||||||
|
* bt_downlink_missing_check().
|
||||||
|
*/
|
||||||
|
total_pages = (int64) state->rel->rd_rel->relpages;
|
||||||
|
state->downlinkfilter = bloom_create(total_pages, work_mem, seed);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Create context for page */
|
/* Create context for page */
|
||||||
@ -426,6 +451,12 @@ bt_check_every_level(Relation rel, Relation heaprel, bool readonly,
|
|||||||
current.istruerootlevel = true;
|
current.istruerootlevel = true;
|
||||||
while (current.leftmost != P_NONE)
|
while (current.leftmost != P_NONE)
|
||||||
{
|
{
|
||||||
|
/*
|
||||||
|
* Leftmost page on level cannot be right half of incomplete split.
|
||||||
|
* This can go stale immediately in !readonly case.
|
||||||
|
*/
|
||||||
|
state->rightsplit = false;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Verify this level, and get left most page for next level down, if
|
* Verify this level, and get left most page for next level down, if
|
||||||
* not at leaf level
|
* not at leaf level
|
||||||
@ -449,6 +480,16 @@ bt_check_every_level(Relation rel, Relation heaprel, bool readonly,
|
|||||||
IndexInfo *indexinfo = BuildIndexInfo(state->rel);
|
IndexInfo *indexinfo = BuildIndexInfo(state->rel);
|
||||||
HeapScanDesc scan;
|
HeapScanDesc scan;
|
||||||
|
|
||||||
|
/* Report on extra downlink checks performed in readonly case */
|
||||||
|
if (state->readonly)
|
||||||
|
{
|
||||||
|
ereport(DEBUG1,
|
||||||
|
(errmsg_internal("finished verifying presence of downlink blocks within index \"%s\" with bitset %.2f%% set",
|
||||||
|
RelationGetRelationName(rel),
|
||||||
|
100.0 * bloom_prop_bits_set(state->downlinkfilter))));
|
||||||
|
bloom_free(state->downlinkfilter);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Create our own scan for IndexBuildHeapScan(), rather than getting it
|
* Create our own scan for IndexBuildHeapScan(), rather than getting it
|
||||||
* to do so for us. This is required so that we can actually use the
|
* to do so for us. This is required so that we can actually use the
|
||||||
@ -564,6 +605,25 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
|
|||||||
|
|
||||||
if (P_IGNORE(opaque))
|
if (P_IGNORE(opaque))
|
||||||
{
|
{
|
||||||
|
/*
|
||||||
|
* Since there cannot be a concurrent VACUUM operation in readonly
|
||||||
|
* mode, and since a page has no links within other pages (siblings
|
||||||
|
* and parent) once it is marked fully deleted, it should be
|
||||||
|
* impossible to land on a fully deleted page in readonly mode.
|
||||||
|
* See bt_downlink_check() for further details.
|
||||||
|
*
|
||||||
|
* The bt_downlink_check() P_ISDELETED() check is repeated here so
|
||||||
|
* that pages that are only reachable through sibling links get
|
||||||
|
* checked.
|
||||||
|
*/
|
||||||
|
if (state->readonly && P_ISDELETED(opaque))
|
||||||
|
ereport(ERROR,
|
||||||
|
(errcode(ERRCODE_INDEX_CORRUPTED),
|
||||||
|
errmsg("downlink or sibling link points to deleted block in index \"%s\"",
|
||||||
|
RelationGetRelationName(state->rel)),
|
||||||
|
errdetail_internal("Block=%u left block=%u left link from block=%u.",
|
||||||
|
current, leftcurrent, opaque->btpo_prev)));
|
||||||
|
|
||||||
if (P_RIGHTMOST(opaque))
|
if (P_RIGHTMOST(opaque))
|
||||||
ereport(ERROR,
|
ereport(ERROR,
|
||||||
(errcode(ERRCODE_INDEX_CORRUPTED),
|
(errcode(ERRCODE_INDEX_CORRUPTED),
|
||||||
@ -617,7 +677,7 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
|
|||||||
/* Internal page -- downlink gets leftmost on next level */
|
/* Internal page -- downlink gets leftmost on next level */
|
||||||
itemid = PageGetItemId(state->target, P_FIRSTDATAKEY(opaque));
|
itemid = PageGetItemId(state->target, P_FIRSTDATAKEY(opaque));
|
||||||
itup = (IndexTuple) PageGetItem(state->target, itemid);
|
itup = (IndexTuple) PageGetItem(state->target, itemid);
|
||||||
nextleveldown.leftmost = ItemPointerGetBlockNumberNoCheck(&(itup->t_tid));
|
nextleveldown.leftmost = BTreeInnerTupleGetDownLink(itup);
|
||||||
nextleveldown.level = opaque->btpo.level - 1;
|
nextleveldown.level = opaque->btpo.level - 1;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -639,6 +699,10 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
|
|||||||
*/
|
*/
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* readonly mode can only ever land on live pages and half-dead pages,
|
||||||
|
* so sibling pointers should always be in mutual agreement
|
||||||
|
*/
|
||||||
if (state->readonly && opaque->btpo_prev != leftcurrent)
|
if (state->readonly && opaque->btpo_prev != leftcurrent)
|
||||||
ereport(ERROR,
|
ereport(ERROR,
|
||||||
(errcode(ERRCODE_INDEX_CORRUPTED),
|
(errcode(ERRCODE_INDEX_CORRUPTED),
|
||||||
@ -668,6 +732,13 @@ nextpage:
|
|||||||
errmsg("circular link chain found in block %u of index \"%s\"",
|
errmsg("circular link chain found in block %u of index \"%s\"",
|
||||||
current, RelationGetRelationName(state->rel))));
|
current, RelationGetRelationName(state->rel))));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Record if page that is about to become target is the right half of
|
||||||
|
* an incomplete page split. This can go stale immediately in
|
||||||
|
* !readonly case.
|
||||||
|
*/
|
||||||
|
state->rightsplit = P_INCOMPLETE_SPLIT(opaque);
|
||||||
|
|
||||||
leftcurrent = current;
|
leftcurrent = current;
|
||||||
current = opaque->btpo_next;
|
current = opaque->btpo_next;
|
||||||
|
|
||||||
@ -812,6 +883,16 @@ bt_target_page_check(BtreeCheckState *state)
|
|||||||
(uint32) state->targetlsn)));
|
(uint32) state->targetlsn)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Fingerprint downlink blocks in heapallindexed + readonly case */
|
||||||
|
if (state->heapallindexed && state->readonly && !P_ISLEAF(topaque))
|
||||||
|
{
|
||||||
|
BlockNumber childblock = BTreeInnerTupleGetDownLink(itup);
|
||||||
|
|
||||||
|
bloom_add_element(state->downlinkfilter,
|
||||||
|
(unsigned char *) &childblock,
|
||||||
|
sizeof(BlockNumber));
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Don't try to generate scankey using "negative infinity" item on
|
* Don't try to generate scankey using "negative infinity" item on
|
||||||
* internal pages. They are always truncated to zero attributes.
|
* internal pages. They are always truncated to zero attributes.
|
||||||
@ -984,11 +1065,19 @@ bt_target_page_check(BtreeCheckState *state)
|
|||||||
*/
|
*/
|
||||||
if (!P_ISLEAF(topaque) && state->readonly)
|
if (!P_ISLEAF(topaque) && state->readonly)
|
||||||
{
|
{
|
||||||
BlockNumber childblock = ItemPointerGetBlockNumberNoCheck(&(itup->t_tid));
|
BlockNumber childblock = BTreeInnerTupleGetDownLink(itup);
|
||||||
|
|
||||||
bt_downlink_check(state, childblock, skey);
|
bt_downlink_check(state, childblock, skey);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* * Check if page has a downlink in parent *
|
||||||
|
*
|
||||||
|
* This can only be checked in readonly + heapallindexed case.
|
||||||
|
*/
|
||||||
|
if (state->heapallindexed && state->readonly)
|
||||||
|
bt_downlink_missing_check(state);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -1272,6 +1361,40 @@ bt_downlink_check(BtreeCheckState *state, BlockNumber childblock,
|
|||||||
copaque = (BTPageOpaque) PageGetSpecialPointer(child);
|
copaque = (BTPageOpaque) PageGetSpecialPointer(child);
|
||||||
maxoffset = PageGetMaxOffsetNumber(child);
|
maxoffset = PageGetMaxOffsetNumber(child);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Since there cannot be a concurrent VACUUM operation in readonly mode,
|
||||||
|
* and since a page has no links within other pages (siblings and parent)
|
||||||
|
* once it is marked fully deleted, it should be impossible to land on a
|
||||||
|
* fully deleted page.
|
||||||
|
*
|
||||||
|
* It does not quite make sense to enforce that the page cannot even be
|
||||||
|
* half-dead, despite the fact the downlink is modified at the same stage
|
||||||
|
* that the child leaf page is marked half-dead. That's incorrect because
|
||||||
|
* there may occasionally be multiple downlinks from a chain of pages
|
||||||
|
* undergoing deletion, where multiple successive calls are made to
|
||||||
|
* _bt_unlink_halfdead_page() by VACUUM before it can finally safely mark
|
||||||
|
* the leaf page as fully dead. While _bt_mark_page_halfdead() usually
|
||||||
|
* removes the downlink to the leaf page that is marked half-dead, that's
|
||||||
|
* not guaranteed, so it's possible we'll land on a half-dead page with a
|
||||||
|
* downlink due to an interrupted multi-level page deletion.
|
||||||
|
*
|
||||||
|
* We go ahead with our checks if the child page is half-dead. It's safe
|
||||||
|
* to do so because we do not test the child's high key, so it does not
|
||||||
|
* matter that the original high key will have been replaced by a dummy
|
||||||
|
* truncated high key within _bt_mark_page_halfdead(). All other page
|
||||||
|
* items are left intact on a half-dead page, so there is still something
|
||||||
|
* to test.
|
||||||
|
*/
|
||||||
|
if (P_ISDELETED(copaque))
|
||||||
|
ereport(ERROR,
|
||||||
|
(errcode(ERRCODE_INDEX_CORRUPTED),
|
||||||
|
errmsg("downlink to deleted page found in index \"%s\"",
|
||||||
|
RelationGetRelationName(state->rel)),
|
||||||
|
errdetail_internal("Parent block=%u child block=%u parent page lsn=%X/%X.",
|
||||||
|
state->targetblock, childblock,
|
||||||
|
(uint32) (state->targetlsn >> 32),
|
||||||
|
(uint32) state->targetlsn)));
|
||||||
|
|
||||||
for (offset = P_FIRSTDATAKEY(copaque);
|
for (offset = P_FIRSTDATAKEY(copaque);
|
||||||
offset <= maxoffset;
|
offset <= maxoffset;
|
||||||
offset = OffsetNumberNext(offset))
|
offset = OffsetNumberNext(offset))
|
||||||
@ -1300,6 +1423,191 @@ bt_downlink_check(BtreeCheckState *state, BlockNumber childblock,
|
|||||||
pfree(child);
|
pfree(child);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Checks if page is missing a downlink that it should have.
|
||||||
|
*
|
||||||
|
* A page that lacks a downlink/parent may indicate corruption. However, we
|
||||||
|
* must account for the fact that a missing downlink can occasionally be
|
||||||
|
* encountered in a non-corrupt index. This can be due to an interrupted page
|
||||||
|
* split, or an interrupted multi-level page deletion (i.e. there was a hard
|
||||||
|
* crash or an error during a page split, or while VACUUM was deleting a
|
||||||
|
* multi-level chain of pages).
|
||||||
|
*
|
||||||
|
* Note that this can only be called in readonly mode, so there is no need to
|
||||||
|
* be concerned about concurrent page splits or page deletions.
|
||||||
|
*/
|
||||||
|
static void
|
||||||
|
bt_downlink_missing_check(BtreeCheckState *state)
|
||||||
|
{
|
||||||
|
BTPageOpaque topaque = (BTPageOpaque) PageGetSpecialPointer(state->target);
|
||||||
|
ItemId itemid;
|
||||||
|
IndexTuple itup;
|
||||||
|
Page child;
|
||||||
|
BTPageOpaque copaque;
|
||||||
|
uint32 level;
|
||||||
|
BlockNumber childblk;
|
||||||
|
|
||||||
|
Assert(state->heapallindexed && state->readonly);
|
||||||
|
Assert(!P_IGNORE(topaque));
|
||||||
|
|
||||||
|
/* No next level up with downlinks to fingerprint from the true root */
|
||||||
|
if (P_ISROOT(topaque))
|
||||||
|
return;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Incomplete (interrupted) page splits can account for the lack of a
|
||||||
|
* downlink. Some inserting transaction should eventually complete the
|
||||||
|
* page split in passing, when it notices that the left sibling page is
|
||||||
|
* P_INCOMPLETE_SPLIT().
|
||||||
|
*
|
||||||
|
* In general, VACUUM is not prepared for there to be no downlink to a page
|
||||||
|
* that it deletes. This is the main reason why the lack of a downlink can
|
||||||
|
* be reported as corruption here. It's not obvious that an invalid
|
||||||
|
* missing downlink can result in wrong answers to queries, though, since
|
||||||
|
* index scans that land on the child may end up consistently moving right.
|
||||||
|
* The handling of concurrent page splits (and page deletions) within
|
||||||
|
* _bt_moveright() cannot distinguish inconsistencies that last for a
|
||||||
|
* moment from inconsistencies that are permanent and irrecoverable.
|
||||||
|
*
|
||||||
|
* VACUUM isn't even prepared to delete pages that have no downlink due to
|
||||||
|
* an incomplete page split, but it can detect and reason about that case
|
||||||
|
* by design, so it shouldn't be taken to indicate corruption. See
|
||||||
|
* _bt_pagedel() for full details.
|
||||||
|
*/
|
||||||
|
if (state->rightsplit)
|
||||||
|
{
|
||||||
|
ereport(DEBUG1,
|
||||||
|
(errcode(ERRCODE_NO_DATA),
|
||||||
|
errmsg("harmless interrupted page split detected in index %s",
|
||||||
|
RelationGetRelationName(state->rel)),
|
||||||
|
errdetail_internal("Block=%u level=%u left sibling=%u page lsn=%X/%X.",
|
||||||
|
state->targetblock, topaque->btpo.level,
|
||||||
|
topaque->btpo_prev,
|
||||||
|
(uint32) (state->targetlsn >> 32),
|
||||||
|
(uint32) state->targetlsn)));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Target's downlink is typically present in parent/fingerprinted */
|
||||||
|
if (!bloom_lacks_element(state->downlinkfilter,
|
||||||
|
(unsigned char *) &state->targetblock,
|
||||||
|
sizeof(BlockNumber)))
|
||||||
|
return;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Target is probably the "top parent" of a multi-level page deletion.
|
||||||
|
* We'll need to descend the subtree to make sure that descendant pages are
|
||||||
|
* consistent with that, though.
|
||||||
|
*
|
||||||
|
* If the target page (which must be non-ignorable) is a leaf page, then
|
||||||
|
* clearly it can't be the top parent. The lack of a downlink is probably
|
||||||
|
* a symptom of a broad problem that could just as easily cause
|
||||||
|
* inconsistencies anywhere else.
|
||||||
|
*/
|
||||||
|
if (P_ISLEAF(topaque))
|
||||||
|
ereport(ERROR,
|
||||||
|
(errcode(ERRCODE_INDEX_CORRUPTED),
|
||||||
|
errmsg("leaf index block lacks downlink in index \"%s\"",
|
||||||
|
RelationGetRelationName(state->rel)),
|
||||||
|
errdetail_internal("Block=%u page lsn=%X/%X.",
|
||||||
|
state->targetblock,
|
||||||
|
(uint32) (state->targetlsn >> 32),
|
||||||
|
(uint32) state->targetlsn)));
|
||||||
|
|
||||||
|
/* Descend from the target page, which is an internal page */
|
||||||
|
elog(DEBUG1, "checking for interrupted multi-level deletion due to missing downlink in index \"%s\"",
|
||||||
|
RelationGetRelationName(state->rel));
|
||||||
|
|
||||||
|
level = topaque->btpo.level;
|
||||||
|
itemid = PageGetItemId(state->target, P_FIRSTDATAKEY(topaque));
|
||||||
|
itup = (IndexTuple) PageGetItem(state->target, itemid);
|
||||||
|
childblk = BTreeInnerTupleGetDownLink(itup);
|
||||||
|
for (;;)
|
||||||
|
{
|
||||||
|
CHECK_FOR_INTERRUPTS();
|
||||||
|
|
||||||
|
child = palloc_btree_page(state, childblk);
|
||||||
|
copaque = (BTPageOpaque) PageGetSpecialPointer(child);
|
||||||
|
|
||||||
|
if (P_ISLEAF(copaque))
|
||||||
|
break;
|
||||||
|
|
||||||
|
/* Do an extra sanity check in passing on internal pages */
|
||||||
|
if (copaque->btpo.level != level - 1)
|
||||||
|
ereport(ERROR,
|
||||||
|
(errcode(ERRCODE_INDEX_CORRUPTED),
|
||||||
|
errmsg_internal("downlink points to block in index \"%s\" whose level is not one level down",
|
||||||
|
RelationGetRelationName(state->rel)),
|
||||||
|
errdetail_internal("Top parent/target block=%u block pointed to=%u expected level=%u level in pointed to block=%u.",
|
||||||
|
state->targetblock, childblk,
|
||||||
|
level - 1, copaque->btpo.level)));
|
||||||
|
|
||||||
|
level = copaque->btpo.level;
|
||||||
|
itemid = PageGetItemId(child, P_FIRSTDATAKEY(copaque));
|
||||||
|
itup = (IndexTuple) PageGetItem(child, itemid);
|
||||||
|
childblk = BTreeInnerTupleGetDownLink(itup);
|
||||||
|
/* Be slightly more pro-active in freeing this memory, just in case */
|
||||||
|
pfree(child);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Since there cannot be a concurrent VACUUM operation in readonly mode,
|
||||||
|
* and since a page has no links within other pages (siblings and parent)
|
||||||
|
* once it is marked fully deleted, it should be impossible to land on a
|
||||||
|
* fully deleted page. See bt_downlink_check() for further details.
|
||||||
|
*
|
||||||
|
* The bt_downlink_check() P_ISDELETED() check is repeated here because
|
||||||
|
* bt_downlink_check() does not visit pages reachable through negative
|
||||||
|
* infinity items. Besides, bt_downlink_check() is unwilling to descend
|
||||||
|
* multiple levels. (The similar bt_downlink_check() P_ISDELETED() check
|
||||||
|
* within bt_check_level_from_leftmost() won't reach the page either, since
|
||||||
|
* the leaf's live siblings should have their sibling links updating to
|
||||||
|
* bypass the deletion target page when it is marked fully dead.)
|
||||||
|
*
|
||||||
|
* If this error is raised, it might be due to a previous multi-level page
|
||||||
|
* deletion that failed to realize that it wasn't yet safe to mark the leaf
|
||||||
|
* page as fully dead. A "dangling downlink" will still remain when this
|
||||||
|
* happens. The fact that the dangling downlink's page (the leaf's
|
||||||
|
* parent/ancestor page) lacked a downlink is incidental.
|
||||||
|
*/
|
||||||
|
if (P_ISDELETED(copaque))
|
||||||
|
ereport(ERROR,
|
||||||
|
(errcode(ERRCODE_INDEX_CORRUPTED),
|
||||||
|
errmsg_internal("downlink to deleted leaf page found in index \"%s\"",
|
||||||
|
RelationGetRelationName(state->rel)),
|
||||||
|
errdetail_internal("Top parent/target block=%u leaf block=%u top parent/target lsn=%X/%X.",
|
||||||
|
state->targetblock, childblk,
|
||||||
|
(uint32) (state->targetlsn >> 32),
|
||||||
|
(uint32) state->targetlsn)));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Iff leaf page is half-dead, its high key top parent link should point to
|
||||||
|
* what VACUUM considered to be the top parent page at the instant it was
|
||||||
|
* interrupted. Provided the high key link actually points to the target
|
||||||
|
* page, the missing downlink we detected is consistent with there having
|
||||||
|
* been an interrupted multi-level page deletion. This means that the
|
||||||
|
* subtree with the target page at its root (a page deletion chain) is in a
|
||||||
|
* consistent state, enabling VACUUM to resume deleting the entire chain
|
||||||
|
* the next time it encounters the half-dead leaf page.
|
||||||
|
*/
|
||||||
|
if (P_ISHALFDEAD(copaque) && !P_RIGHTMOST(copaque))
|
||||||
|
{
|
||||||
|
itemid = PageGetItemId(child, P_HIKEY);
|
||||||
|
itup = (IndexTuple) PageGetItem(child, itemid);
|
||||||
|
if (BTreeTupleGetTopParent(itup) == state->targetblock)
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
ereport(ERROR,
|
||||||
|
(errcode(ERRCODE_INDEX_CORRUPTED),
|
||||||
|
errmsg("internal index block lacks downlink in index \"%s\"",
|
||||||
|
RelationGetRelationName(state->rel)),
|
||||||
|
errdetail_internal("Block=%u level=%u page lsn=%X/%X.",
|
||||||
|
state->targetblock, topaque->btpo.level,
|
||||||
|
(uint32) (state->targetlsn >> 32),
|
||||||
|
(uint32) state->targetlsn)));
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Per-tuple callback from IndexBuildHeapScan, used to determine if index has
|
* Per-tuple callback from IndexBuildHeapScan, used to determine if index has
|
||||||
* all the entries that definitely should have been observed in leaf pages of
|
* all the entries that definitely should have been observed in leaf pages of
|
||||||
@ -1376,13 +1684,11 @@ bt_tuple_present_callback(Relation index, HeapTuple htup, Datum *values,
|
|||||||
* Note that we rely on deterministic index_form_tuple() TOAST compression.
|
* Note that we rely on deterministic index_form_tuple() TOAST compression.
|
||||||
* If index_form_tuple() was ever enhanced to compress datums out-of-line,
|
* If index_form_tuple() was ever enhanced to compress datums out-of-line,
|
||||||
* or otherwise varied when or how compression was applied, our assumption
|
* or otherwise varied when or how compression was applied, our assumption
|
||||||
* would break, leading to false positive reports of corruption. For now,
|
* would break, leading to false positive reports of corruption. It's also
|
||||||
* we don't decompress/normalize toasted values as part of fingerprinting.
|
* possible that non-pivot tuples could in the future have alternative
|
||||||
*
|
* equivalent representations (e.g. by using the INDEX_ALT_TID_MASK bit).
|
||||||
* In future, non-pivot index tuples might get use of
|
* For now, we don't decompress/normalize toasted values as part of
|
||||||
* BT_N_KEYS_OFFSET_MASK. Then binary representation of index tuple linked
|
* fingerprinting.
|
||||||
* to particular heap tuple might vary and meeds to be normalized before
|
|
||||||
* bloom filter lookup.
|
|
||||||
*/
|
*/
|
||||||
itup = index_form_tuple(RelationGetDescr(index), values, isnull);
|
itup = index_form_tuple(RelationGetDescr(index), values, isnull);
|
||||||
itup->t_tid = htup->t_self;
|
itup->t_tid = htup->t_self;
|
||||||
@ -1393,8 +1699,8 @@ bt_tuple_present_callback(Relation index, HeapTuple htup, Datum *values,
|
|||||||
ereport(ERROR,
|
ereport(ERROR,
|
||||||
(errcode(ERRCODE_DATA_CORRUPTED),
|
(errcode(ERRCODE_DATA_CORRUPTED),
|
||||||
errmsg("heap tuple (%u,%u) from table \"%s\" lacks matching index tuple within index \"%s\"",
|
errmsg("heap tuple (%u,%u) from table \"%s\" lacks matching index tuple within index \"%s\"",
|
||||||
ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
|
ItemPointerGetBlockNumber(&(itup->t_tid)),
|
||||||
ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)),
|
ItemPointerGetOffsetNumber(&(itup->t_tid)),
|
||||||
RelationGetRelationName(state->heaprel),
|
RelationGetRelationName(state->heaprel),
|
||||||
RelationGetRelationName(state->rel)),
|
RelationGetRelationName(state->rel)),
|
||||||
!state->readonly
|
!state->readonly
|
||||||
@ -1520,6 +1826,7 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
|
|||||||
Buffer buffer;
|
Buffer buffer;
|
||||||
Page page;
|
Page page;
|
||||||
BTPageOpaque opaque;
|
BTPageOpaque opaque;
|
||||||
|
OffsetNumber maxoffset;
|
||||||
|
|
||||||
page = palloc(BLCKSZ);
|
page = palloc(BLCKSZ);
|
||||||
|
|
||||||
@ -1566,9 +1873,13 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
|
|||||||
ereport(ERROR,
|
ereport(ERROR,
|
||||||
(errcode(ERRCODE_INDEX_CORRUPTED),
|
(errcode(ERRCODE_INDEX_CORRUPTED),
|
||||||
errmsg("version mismatch in index \"%s\": file version %d, "
|
errmsg("version mismatch in index \"%s\": file version %d, "
|
||||||
"current version %d, minimal supported version %d",
|
"current version %d, minimum supported version %d",
|
||||||
RelationGetRelationName(state->rel),
|
RelationGetRelationName(state->rel),
|
||||||
metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
|
metad->btm_version, BTREE_VERSION,
|
||||||
|
BTREE_MIN_VERSION)));
|
||||||
|
|
||||||
|
/* Finished with metapage checks */
|
||||||
|
return page;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -1581,12 +1892,66 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
|
|||||||
errmsg("invalid leaf page level %u for block %u in index \"%s\"",
|
errmsg("invalid leaf page level %u for block %u in index \"%s\"",
|
||||||
opaque->btpo.level, blocknum, RelationGetRelationName(state->rel))));
|
opaque->btpo.level, blocknum, RelationGetRelationName(state->rel))));
|
||||||
|
|
||||||
if (blocknum != BTREE_METAPAGE && !P_ISLEAF(opaque) &&
|
if (!P_ISLEAF(opaque) && !P_ISDELETED(opaque) &&
|
||||||
!P_ISDELETED(opaque) && opaque->btpo.level == 0)
|
opaque->btpo.level == 0)
|
||||||
ereport(ERROR,
|
ereport(ERROR,
|
||||||
(errcode(ERRCODE_INDEX_CORRUPTED),
|
(errcode(ERRCODE_INDEX_CORRUPTED),
|
||||||
errmsg("invalid internal page level 0 for block %u in index \"%s\"",
|
errmsg("invalid internal page level 0 for block %u in index \"%s\"",
|
||||||
opaque->btpo.level, RelationGetRelationName(state->rel))));
|
blocknum, RelationGetRelationName(state->rel))));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Sanity checks for number of items on page.
|
||||||
|
*
|
||||||
|
* As noted at the beginning of _bt_binsrch(), an internal page must have
|
||||||
|
* children, since there must always be a negative infinity downlink (there
|
||||||
|
* may also be a highkey). In the case of non-rightmost leaf pages, there
|
||||||
|
* must be at least a highkey.
|
||||||
|
*
|
||||||
|
* This is correct when pages are half-dead, since internal pages are never
|
||||||
|
* half-dead, and leaf pages must have a high key when half-dead (the
|
||||||
|
* rightmost page can never be deleted). It's also correct with fully
|
||||||
|
* deleted pages: _bt_unlink_halfdead_page() doesn't change anything about
|
||||||
|
* the target page other than setting the page as fully dead, and setting
|
||||||
|
* its xact field. In particular, it doesn't change the sibling links in
|
||||||
|
* the deletion target itself, since they're required when index scans land
|
||||||
|
* on the deletion target, and then need to move right (or need to move
|
||||||
|
* left, in the case of backward index scans).
|
||||||
|
*/
|
||||||
|
maxoffset = PageGetMaxOffsetNumber(page);
|
||||||
|
if (maxoffset > MaxIndexTuplesPerPage)
|
||||||
|
ereport(ERROR,
|
||||||
|
(errcode(ERRCODE_INDEX_CORRUPTED),
|
||||||
|
errmsg("Number of items on block %u of index \"%s\" exceeds MaxIndexTuplesPerPage (%u)",
|
||||||
|
blocknum, RelationGetRelationName(state->rel),
|
||||||
|
MaxIndexTuplesPerPage)));
|
||||||
|
|
||||||
|
if (!P_ISLEAF(opaque) && maxoffset < P_FIRSTDATAKEY(opaque))
|
||||||
|
ereport(ERROR,
|
||||||
|
(errcode(ERRCODE_INDEX_CORRUPTED),
|
||||||
|
errmsg("internal block %u in index \"%s\" lacks high key and/or at least one downlink",
|
||||||
|
blocknum, RelationGetRelationName(state->rel))));
|
||||||
|
|
||||||
|
if (P_ISLEAF(opaque) && !P_RIGHTMOST(opaque) && maxoffset < P_HIKEY)
|
||||||
|
ereport(ERROR,
|
||||||
|
(errcode(ERRCODE_INDEX_CORRUPTED),
|
||||||
|
errmsg("non-rightmost leaf block %u in index \"%s\" lacks high key item",
|
||||||
|
blocknum, RelationGetRelationName(state->rel))));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* In general, internal pages are never marked half-dead, except on
|
||||||
|
* versions of Postgres prior to 9.4, where it can be valid transient
|
||||||
|
* state. This state is nonetheless treated as corruption by VACUUM on
|
||||||
|
* from version 9.4 on, so do the same here. See _bt_pagedel() for full
|
||||||
|
* details.
|
||||||
|
*
|
||||||
|
* Internal pages should never have garbage items, either.
|
||||||
|
*/
|
||||||
|
if (!P_ISLEAF(opaque) && P_ISHALFDEAD(opaque))
|
||||||
|
ereport(ERROR,
|
||||||
|
(errcode(ERRCODE_INDEX_CORRUPTED),
|
||||||
|
errmsg("internal page block %u in index \"%s\" is half-dead",
|
||||||
|
blocknum, RelationGetRelationName(state->rel)),
|
||||||
|
errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
|
||||||
|
|
||||||
if (!P_ISLEAF(opaque) && P_HAS_GARBAGE(opaque))
|
if (!P_ISLEAF(opaque) && P_HAS_GARBAGE(opaque))
|
||||||
ereport(ERROR,
|
ereport(ERROR,
|
||||||
|
@ -55,7 +55,7 @@
|
|||||||
<function>bt_index_check</function> tests that its target, a
|
<function>bt_index_check</function> tests that its target, a
|
||||||
B-Tree index, respects a variety of invariants. Example usage:
|
B-Tree index, respects a variety of invariants. Example usage:
|
||||||
<screen>
|
<screen>
|
||||||
test=# SELECT bt_index_check(index => c.oid, heapallindexed => i.indisunique)
|
test=# SELECT bt_index_check(index => c.oid, heapallindexed => i.indisunique),
|
||||||
c.relname,
|
c.relname,
|
||||||
c.relpages
|
c.relpages
|
||||||
FROM pg_index i
|
FROM pg_index i
|
||||||
@ -67,7 +67,7 @@ WHERE am.amname = 'btree' AND n.nspname = 'pg_catalog'
|
|||||||
-- Don't check temp tables, which may be from another session:
|
-- Don't check temp tables, which may be from another session:
|
||||||
AND c.relpersistence != 't'
|
AND c.relpersistence != 't'
|
||||||
-- Function may throw an error when this is omitted:
|
-- Function may throw an error when this is omitted:
|
||||||
AND i.indisready AND i.indisvalid
|
AND c.relkind = 'i' AND i.indisready AND i.indisvalid
|
||||||
ORDER BY c.relpages DESC LIMIT 10;
|
ORDER BY c.relpages DESC LIMIT 10;
|
||||||
bt_index_check | relname | relpages
|
bt_index_check | relname | relpages
|
||||||
----------------+---------------------------------+----------
|
----------------+---------------------------------+----------
|
||||||
@ -126,7 +126,8 @@ ORDER BY c.relpages DESC LIMIT 10;
|
|||||||
Optionally, when the <parameter>heapallindexed</parameter>
|
Optionally, when the <parameter>heapallindexed</parameter>
|
||||||
argument is <literal>true</literal>, the function verifies the
|
argument is <literal>true</literal>, the function verifies the
|
||||||
presence of all heap tuples that should be found within the
|
presence of all heap tuples that should be found within the
|
||||||
index. The checks that can be performed by
|
index, and that there are no missing downlinks in the index
|
||||||
|
structure. The checks that can be performed by
|
||||||
<function>bt_index_parent_check</function> are a superset of the
|
<function>bt_index_parent_check</function> are a superset of the
|
||||||
checks that can be performed by <function>bt_index_check</function>.
|
checks that can be performed by <function>bt_index_check</function>.
|
||||||
<function>bt_index_parent_check</function> can be thought of as
|
<function>bt_index_parent_check</function> can be thought of as
|
||||||
|
Reference in New Issue
Block a user