1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-31 22:04:40 +03:00

Use full 64-bit XIDs in deleted nbtree pages.

Otherwise we risk "leaking" deleted pages by making them non-recyclable
indefinitely.  Commit 6655a729 did the same thing for deleted pages in
GiST indexes.  That work was used as a starting point here.

Stop storing an XID indicating the oldest btpo.xact across all deleted
though unrecycled pages in nbtree metapages.  There is no longer any
reason to care about that condition/the oldest XID.  It only ever made
sense when wraparound was something _bt_vacuum_needs_cleanup() had to
consider.

The btm_oldest_btpo_xact metapage field has been repurposed and renamed.
It is now btm_last_cleanup_num_delpages, which is used to remember how
many non-recycled deleted pages remain from the last VACUUM (in practice
its value is usually the precise number of pages that were _newly
deleted_ during the specific VACUUM operation that last set the field).

The general idea behind storing btm_last_cleanup_num_delpages is to use
it to give _some_ consideration to non-recycled deleted pages inside
_bt_vacuum_needs_cleanup() -- though never too much.  We only really
need to avoid leaving a truly excessive number of deleted pages in an
unrecycled state forever.  We only do this to cover certain narrow cases
where no other factor makes VACUUM do a full scan, and yet the index
continues to grow (and so actually misses out on recycling existing
deleted pages).

These metapage changes result in a clear user-visible benefit: We no
longer trigger full index scans during VACUUM operations solely due to
the presence of only 1 or 2 known deleted (though unrecycled) blocks
from a very large index.  All that matters now is keeping the costs and
benefits in balance over time.

Fix an issue that has been around since commit 857f9c36, which added the
"skip full scan of index" mechanism (i.e. the _bt_vacuum_needs_cleanup()
logic).  The accuracy of btm_last_cleanup_num_heap_tuples accidentally
hinged upon _when_ the source value gets stored.  We now always store
btm_last_cleanup_num_heap_tuples in btvacuumcleanup().  This fixes the
issue because IndexVacuumInfo.num_heap_tuples (the source field) is
expected to accurately indicate the state of the table _after_ the
VACUUM completes inside btvacuumcleanup().

A backpatchable fix cannot easily be extracted from this commit.  A
targeted fix for the issue will follow in a later commit, though that
won't happen today.

I (pgeoghegan) have chosen to remove any mention of deleted pages in the
documentation of the vacuum_cleanup_index_scale_factor GUC/param, since
the presence of deleted (though unrecycled) pages is no longer of much
concern to users.  The vacuum_cleanup_index_scale_factor description in
the docs now seems rather unclear in any case, and it should probably be
rewritten in the near future.  Perhaps some passing mention of page
deletion will be added back at the same time.

Bump XLOG_PAGE_MAGIC due to nbtree WAL records using full XIDs now.

Author: Peter Geoghegan <pg@bowt.ie>
Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com>
Discussion: https://postgr.es/m/CAH2-WznpdHvujGUwYZ8sihX=d5u-tRYhi-F4wnV2uN2zHpMUXw@mail.gmail.com
This commit is contained in:
Peter Geoghegan
2021-02-24 18:41:34 -08:00
parent 8a4f9522d0
commit e5d8a99903
20 changed files with 623 additions and 407 deletions

View File

@ -769,7 +769,7 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
P_FIRSTDATAKEY(opaque));
itup = (IndexTuple) PageGetItem(state->target, itemid);
nextleveldown.leftmost = BTreeTupleGetDownLink(itup);
nextleveldown.level = opaque->btpo.level - 1;
nextleveldown.level = opaque->btpo_level - 1;
}
else
{
@ -794,14 +794,14 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
if (opaque->btpo_prev != leftcurrent)
bt_recheck_sibling_links(state, opaque->btpo_prev, leftcurrent);
/* Check level, which must be valid for non-ignorable page */
if (level.level != opaque->btpo.level)
/* Check level */
if (level.level != opaque->btpo_level)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("leftmost down link for level points to block in index \"%s\" whose level is not one level down",
RelationGetRelationName(state->rel)),
errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.",
current, level.level, opaque->btpo.level)));
current, level.level, opaque->btpo_level)));
/* Verify invariants for page */
bt_target_page_check(state);
@ -1164,7 +1164,7 @@ bt_target_page_check(BtreeCheckState *state)
bt_child_highkey_check(state,
offset,
NULL,
topaque->btpo.level);
topaque->btpo_level);
}
continue;
}
@ -1520,7 +1520,7 @@ bt_target_page_check(BtreeCheckState *state)
if (!P_ISLEAF(topaque) && P_RIGHTMOST(topaque) && state->readonly)
{
bt_child_highkey_check(state, InvalidOffsetNumber,
NULL, topaque->btpo.level);
NULL, topaque->btpo_level);
}
}
@ -1597,7 +1597,7 @@ bt_right_page_check_scankey(BtreeCheckState *state)
ereport(DEBUG1,
(errcode(ERRCODE_NO_DATA),
errmsg_internal("level %u leftmost page of index \"%s\" was found deleted or half dead",
opaque->btpo.level, RelationGetRelationName(state->rel)),
opaque->btpo_level, RelationGetRelationName(state->rel)),
errdetail_internal("Deleted page found when building scankey from right sibling.")));
/* Be slightly more pro-active in freeing this memory, just in case */
@ -1900,14 +1900,15 @@ bt_child_highkey_check(BtreeCheckState *state,
state->targetblock, blkno,
LSN_FORMAT_ARGS(state->targetlsn))));
/* Check level for non-ignorable page */
if (!P_IGNORE(opaque) && opaque->btpo.level != target_level - 1)
/* Do level sanity check */
if ((!P_ISDELETED(opaque) || P_HAS_FULLXID(opaque)) &&
opaque->btpo_level != target_level - 1)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("block found while following rightlinks from child of index \"%s\" has invalid level",
RelationGetRelationName(state->rel)),
errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.",
blkno, target_level - 1, opaque->btpo.level)));
blkno, target_level - 1, opaque->btpo_level)));
/* Try to detect circular links */
if ((!first && blkno == state->prevrightlink) || blkno == opaque->btpo_prev)
@ -2132,7 +2133,7 @@ bt_child_check(BtreeCheckState *state, BTScanInsert targetkey,
* check for downlink connectivity.
*/
bt_child_highkey_check(state, downlinkoffnum,
child, topaque->btpo.level);
child, topaque->btpo_level);
/*
* Since there cannot be a concurrent VACUUM operation in readonly mode,
@ -2275,7 +2276,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
errmsg_internal("harmless interrupted page split detected in index %s",
RelationGetRelationName(state->rel)),
errdetail_internal("Block=%u level=%u left sibling=%u page lsn=%X/%X.",
blkno, opaque->btpo.level,
blkno, opaque->btpo_level,
opaque->btpo_prev,
LSN_FORMAT_ARGS(pagelsn))));
return;
@ -2304,7 +2305,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
elog(DEBUG1, "checking for interrupted multi-level deletion due to missing downlink in index \"%s\"",
RelationGetRelationName(state->rel));
level = opaque->btpo.level;
level = opaque->btpo_level;
itemid = PageGetItemIdCareful(state, blkno, page, P_FIRSTDATAKEY(opaque));
itup = (IndexTuple) PageGetItem(page, itemid);
childblk = BTreeTupleGetDownLink(itup);
@ -2319,16 +2320,16 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
break;
/* Do an extra sanity check in passing on internal pages */
if (copaque->btpo.level != level - 1)
if (copaque->btpo_level != level - 1)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg_internal("downlink points to block in index \"%s\" whose level is not one level down",
RelationGetRelationName(state->rel)),
errdetail_internal("Top parent/under check block=%u block pointed to=%u expected level=%u level in pointed to block=%u.",
blkno, childblk,
level - 1, copaque->btpo.level)));
level - 1, copaque->btpo_level)));
level = copaque->btpo.level;
level = copaque->btpo_level;
itemid = PageGetItemIdCareful(state, childblk, child,
P_FIRSTDATAKEY(copaque));
itup = (IndexTuple) PageGetItem(child, itemid);
@ -2389,7 +2390,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
errmsg("internal index block lacks downlink in index \"%s\"",
RelationGetRelationName(state->rel)),
errdetail_internal("Block=%u level=%u page lsn=%X/%X.",
blkno, opaque->btpo.level,
blkno, opaque->btpo_level,
LSN_FORMAT_ARGS(pagelsn))));
}
@ -2983,21 +2984,28 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
}
/*
* Deleted pages have no sane "level" field, so can only check non-deleted
* page level
* Deleted pages that still use the old 32-bit XID representation have no
* sane "level" field because they type pun the field, but all other pages
* (including pages deleted on Postgres 14+) have a valid value.
*/
if (P_ISLEAF(opaque) && !P_ISDELETED(opaque) && opaque->btpo.level != 0)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("invalid leaf page level %u for block %u in index \"%s\"",
opaque->btpo.level, blocknum, RelationGetRelationName(state->rel))));
if (!P_ISDELETED(opaque) || P_HAS_FULLXID(opaque))
{
/* Okay, no reason not to trust btpo_level field from page */
if (!P_ISLEAF(opaque) && !P_ISDELETED(opaque) &&
opaque->btpo.level == 0)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("invalid internal page level 0 for block %u in index \"%s\"",
blocknum, RelationGetRelationName(state->rel))));
if (P_ISLEAF(opaque) && opaque->btpo_level != 0)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg_internal("invalid leaf page level %u for block %u in index \"%s\"",
opaque->btpo_level, blocknum,
RelationGetRelationName(state->rel))));
if (!P_ISLEAF(opaque) && opaque->btpo_level == 0)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg_internal("invalid internal page level 0 for block %u in index \"%s\"",
blocknum,
RelationGetRelationName(state->rel))));
}
/*
* Sanity checks for number of items on page.
@ -3044,8 +3052,6 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
* state. This state is nonetheless treated as corruption by VACUUM
* from version 9.4 on, so do the same here. See _bt_pagedel() for full
* details.
*
* Internal pages should never have garbage items, either.
*/
if (!P_ISLEAF(opaque) && P_ISHALFDEAD(opaque))
ereport(ERROR,
@ -3054,11 +3060,27 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
blocknum, RelationGetRelationName(state->rel)),
errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
/*
* Check that internal pages have no garbage items, and that no page has
* an invalid combination of deletion-related page level flags
*/
if (!P_ISLEAF(opaque) && P_HAS_GARBAGE(opaque))
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("internal page block %u in index \"%s\" has garbage items",
blocknum, RelationGetRelationName(state->rel))));
errmsg_internal("internal page block %u in index \"%s\" has garbage items",
blocknum, RelationGetRelationName(state->rel))));
if (P_HAS_FULLXID(opaque) && !P_ISDELETED(opaque))
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg_internal("full transaction id page flag appears in non-deleted block %u in index \"%s\"",
blocknum, RelationGetRelationName(state->rel))));
if (P_ISDELETED(opaque) && P_ISHALFDEAD(opaque))
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg_internal("deleted page block %u in index \"%s\" is half-dead",
blocknum, RelationGetRelationName(state->rel))));
return page;
}

View File

@ -75,11 +75,7 @@ typedef struct BTPageStat
/* opaque data */
BlockNumber btpo_prev;
BlockNumber btpo_next;
union
{
uint32 level;
TransactionId xact;
} btpo;
uint32 btpo_level;
uint16 btpo_flags;
BTCycleId btpo_cycleid;
} BTPageStat;
@ -112,9 +108,33 @@ GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat)
/* page type (flags) */
if (P_ISDELETED(opaque))
{
stat->type = 'd';
stat->btpo.xact = opaque->btpo.xact;
return;
/* We divide deleted pages into leaf ('d') or internal ('D') */
if (P_ISLEAF(opaque) || !P_HAS_FULLXID(opaque))
stat->type = 'd';
else
stat->type = 'D';
/*
* Report safexid in a deleted page.
*
* Handle pg_upgrade'd deleted pages that used the previous safexid
* representation in btpo_level field (this used to be a union type
* called "btpo").
*/
if (P_HAS_FULLXID(opaque))
{
FullTransactionId safexid = BTPageGetDeleteXid(page);
elog(NOTICE, "deleted page from block %u has safexid %u:%u",
blkno, EpochFromFullTransactionId(safexid),
XidFromFullTransactionId(safexid));
}
else
elog(NOTICE, "deleted page from block %u has safexid %u",
blkno, opaque->btpo_level);
/* Don't interpret BTDeletedPageData as index tuples */
maxoff = InvalidOffsetNumber;
}
else if (P_IGNORE(opaque))
stat->type = 'e';
@ -128,7 +148,7 @@ GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat)
/* btpage opaque data */
stat->btpo_prev = opaque->btpo_prev;
stat->btpo_next = opaque->btpo_next;
stat->btpo.level = opaque->btpo.level;
stat->btpo_level = opaque->btpo_level;
stat->btpo_flags = opaque->btpo_flags;
stat->btpo_cycleid = opaque->btpo_cycleid;
@ -237,7 +257,7 @@ bt_page_stats_internal(PG_FUNCTION_ARGS, enum pageinspect_version ext_version)
values[j++] = psprintf("%u", stat.free_size);
values[j++] = psprintf("%u", stat.btpo_prev);
values[j++] = psprintf("%u", stat.btpo_next);
values[j++] = psprintf("%u", (stat.type == 'd') ? stat.btpo.xact : stat.btpo.level);
values[j++] = psprintf("%u", stat.btpo_level);
values[j++] = psprintf("%d", stat.btpo_flags);
tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
@ -503,10 +523,14 @@ bt_page_items_internal(PG_FUNCTION_ARGS, enum pageinspect_version ext_version)
opaque = (BTPageOpaque) PageGetSpecialPointer(uargs->page);
if (P_ISDELETED(opaque))
elog(NOTICE, "page is deleted");
fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
if (!P_ISDELETED(opaque))
fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
else
{
/* Don't interpret BTDeletedPageData as index tuples */
elog(NOTICE, "page from block " INT64_FORMAT " is deleted", blkno);
fctx->max_calls = 0;
}
uargs->leafpage = P_ISLEAF(opaque);
uargs->rightmost = P_RIGHTMOST(opaque);
@ -603,7 +627,14 @@ bt_page_items_bytea(PG_FUNCTION_ARGS)
if (P_ISDELETED(opaque))
elog(NOTICE, "page is deleted");
fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
if (!P_ISDELETED(opaque))
fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
else
{
/* Don't interpret BTDeletedPageData as index tuples */
elog(NOTICE, "page from block is deleted");
fctx->max_calls = 0;
}
uargs->leafpage = P_ISLEAF(opaque);
uargs->rightmost = P_RIGHTMOST(opaque);
@ -692,10 +723,7 @@ bt_metap(PG_FUNCTION_ARGS)
/*
* We need a kluge here to detect API versions prior to 1.8. Earlier
* versions incorrectly used int4 for certain columns. This caused
* various problems. For example, an int4 version of the "oldest_xact"
* column would not work with TransactionId values that happened to exceed
* PG_INT32_MAX.
* versions incorrectly used int4 for certain columns.
*
* There is no way to reliably avoid the problems created by the old
* function definition at this point, so insist that the user update the
@ -723,7 +751,8 @@ bt_metap(PG_FUNCTION_ARGS)
*/
if (metad->btm_version >= BTREE_NOVAC_VERSION)
{
values[j++] = psprintf("%u", metad->btm_oldest_btpo_xact);
values[j++] = psprintf(INT64_FORMAT,
(int64) metad->btm_last_cleanup_num_delpages);
values[j++] = psprintf("%f", metad->btm_last_cleanup_num_heap_tuples);
values[j++] = metad->btm_allequalimage ? "t" : "f";
}

View File

@ -3,16 +3,16 @@ INSERT INTO test1 VALUES (72057594037927937, 'text');
CREATE INDEX test1_a_idx ON test1 USING btree (a);
\x
SELECT * FROM bt_metap('test1_a_idx');
-[ RECORD 1 ]-----------+-------
magic | 340322
version | 4
root | 1
level | 0
fastroot | 1
fastlevel | 0
oldest_xact | 0
last_cleanup_num_tuples | -1
allequalimage | t
-[ RECORD 1 ]-------------+-------
magic | 340322
version | 4
root | 1
level | 0
fastroot | 1
fastlevel | 0
last_cleanup_num_delpages | 0
last_cleanup_num_tuples | -1
allequalimage | t
SELECT * FROM bt_page_stats('test1_a_idx', -1);
ERROR: invalid block number
@ -29,7 +29,7 @@ page_size | 8192
free_size | 8128
btpo_prev | 0
btpo_next | 0
btpo | 0
btpo_level | 0
btpo_flags | 3
SELECT * FROM bt_page_stats('test1_a_idx', 2);

View File

@ -66,6 +66,23 @@ RETURNS smallint
AS 'MODULE_PATHNAME', 'page_checksum_1_9'
LANGUAGE C STRICT PARALLEL SAFE;
--
-- bt_metap()
--
-- Replace the existing bt_metap(text) definition with one whose result
-- columns match the new metapage layout: last_cleanup_num_delpages takes
-- the place of the former oldest_xact column.  The old function is dropped
-- first and then recreated with the new OUT column list.
DROP FUNCTION bt_metap(text);
-- relname is the only input; every OUT parameter becomes one column of the
-- single-row result.
CREATE FUNCTION bt_metap(IN relname text,
OUT magic int4,
OUT version int4,
OUT root int8,
OUT level int8,
OUT fastroot int8,
OUT fastlevel int8,
-- Count of deleted-but-not-yet-recycled pages remembered from the last
-- VACUUM; the C implementation formats it as a 64-bit integer.
OUT last_cleanup_num_delpages int8,
OUT last_cleanup_num_tuples float8,
OUT allequalimage boolean)
-- Bound to the C symbol bt_metap in the extension's shared library.
AS 'MODULE_PATHNAME', 'bt_metap'
LANGUAGE C STRICT PARALLEL SAFE;
--
-- bt_page_stats()
--
@ -80,7 +97,7 @@ CREATE FUNCTION bt_page_stats(IN relname text, IN blkno int8,
OUT free_size int4,
OUT btpo_prev int8,
OUT btpo_next int8,
OUT btpo int4,
OUT btpo_level int8,
OUT btpo_flags int4)
AS 'MODULE_PATHNAME', 'bt_page_stats_1_9'
LANGUAGE C STRICT PARALLEL SAFE;

View File

@ -283,8 +283,12 @@ pgstatindex_impl(Relation rel, FunctionCallInfo fcinfo)
page = BufferGetPage(buffer);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
/* Determine page type, and update totals */
/*
* Determine page type, and update totals.
*
* Note that we arbitrarily bucket deleted pages together without
* considering if they're leaf pages or internal pages.
*/
if (P_ISDELETED(opaque))
indexStat.deleted_pages++;
else if (P_IGNORE(opaque))