mirror of
https://github.com/postgres/postgres.git
synced 2025-07-27 12:41:57 +03:00
Use full 64-bit XIDs in deleted nbtree pages.
Otherwise we risk "leaking" deleted pages by making them non-recyclable indefinitely. Commit 6655a729
did the same thing for deleted pages in GiST indexes. That work was used as a starting point here. Stop storing an XID indicating the oldest btpo.xact across all deleted though unrecycled pages in nbtree metapages. There is no longer any reason to care about that condition/the oldest XID. It only ever made sense when wraparound was something _bt_vacuum_needs_cleanup() had to consider. The btm_oldest_btpo_xact metapage field has been repurposed and renamed. It is now btm_last_cleanup_num_delpages, which is used to remember how many non-recycled deleted pages remain from the last VACUUM (in practice its value is usually the precise number of pages that were _newly deleted_ during the specific VACUUM operation that last set the field). The general idea behind storing btm_last_cleanup_num_delpages is to use it to give _some_ consideration to non-recycled deleted pages inside _bt_vacuum_needs_cleanup() -- though never too much. We only really need to avoid leaving a truly excessive number of deleted pages in an unrecycled state forever. We only do this to cover certain narrow cases where no other factor makes VACUUM do a full scan, and yet the index continues to grow (and so actually misses out on recycling existing deleted pages). These metapage changes result in a clear user-visible benefit: We no longer trigger full index scans during VACUUM operations solely due to the presence of only 1 or 2 known deleted (though unrecycled) blocks from a very large index. All that matters now is keeping the costs and benefits in balance over time. Fix an issue that has been around since commit 857f9c36,
which added the "skip full scan of index" mechanism (i.e. the _bt_vacuum_needs_cleanup() logic). The accuracy of btm_last_cleanup_num_heap_tuples accidentally hinged upon _when_ the source value gets stored. We now always store btm_last_cleanup_num_heap_tuples in btvacuumcleanup(). This fixes the issue because IndexVacuumInfo.num_heap_tuples (the source field) is expected to accurately indicate the state of the table _after_ the VACUUM completes inside btvacuumcleanup(). A backpatchable fix cannot easily be extracted from this commit. A targeted fix for the issue will follow in a later commit, though that won't happen today. I (pgeoghegan) have chosen to remove any mention of deleted pages in the documentation of the vacuum_cleanup_index_scale_factor GUC/param, since the presence of deleted (though unrecycled) pages is no longer of much concern to users. The vacuum_cleanup_index_scale_factor description in the docs now seems rather unclear in any case, and it should probably be rewritten in the near future. Perhaps some passing mention of page deletion will be added back at the same time. Bump XLOG_PAGE_MAGIC due to nbtree WAL records using full XIDs now. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Discussion: https://postgr.es/m/CAH2-WznpdHvujGUwYZ8sihX=d5u-tRYhi-F4wnV2uN2zHpMUXw@mail.gmail.com
This commit is contained in:
@ -75,11 +75,7 @@ typedef struct BTPageStat
|
||||
/* opaque data */
|
||||
BlockNumber btpo_prev;
|
||||
BlockNumber btpo_next;
|
||||
union
|
||||
{
|
||||
uint32 level;
|
||||
TransactionId xact;
|
||||
} btpo;
|
||||
uint32 btpo_level;
|
||||
uint16 btpo_flags;
|
||||
BTCycleId btpo_cycleid;
|
||||
} BTPageStat;
|
||||
@ -112,9 +108,33 @@ GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat)
|
||||
/* page type (flags) */
|
||||
if (P_ISDELETED(opaque))
|
||||
{
|
||||
stat->type = 'd';
|
||||
stat->btpo.xact = opaque->btpo.xact;
|
||||
return;
|
||||
/* We divide deleted pages into leaf ('d') or internal ('D') */
|
||||
if (P_ISLEAF(opaque) || !P_HAS_FULLXID(opaque))
|
||||
stat->type = 'd';
|
||||
else
|
||||
stat->type = 'D';
|
||||
|
||||
/*
|
||||
* Report safexid in a deleted page.
|
||||
*
|
||||
* Handle pg_upgrade'd deleted pages that used the previous safexid
|
||||
* representation in btpo_level field (this used to be a union type
|
||||
* called "btpo").
|
||||
*/
|
||||
if (P_HAS_FULLXID(opaque))
|
||||
{
|
||||
FullTransactionId safexid = BTPageGetDeleteXid(page);
|
||||
|
||||
elog(NOTICE, "deleted page from block %u has safexid %u:%u",
|
||||
blkno, EpochFromFullTransactionId(safexid),
|
||||
XidFromFullTransactionId(safexid));
|
||||
}
|
||||
else
|
||||
elog(NOTICE, "deleted page from block %u has safexid %u",
|
||||
blkno, opaque->btpo_level);
|
||||
|
||||
/* Don't interpret BTDeletedPageData as index tuples */
|
||||
maxoff = InvalidOffsetNumber;
|
||||
}
|
||||
else if (P_IGNORE(opaque))
|
||||
stat->type = 'e';
|
||||
@ -128,7 +148,7 @@ GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat)
|
||||
/* btpage opaque data */
|
||||
stat->btpo_prev = opaque->btpo_prev;
|
||||
stat->btpo_next = opaque->btpo_next;
|
||||
stat->btpo.level = opaque->btpo.level;
|
||||
stat->btpo_level = opaque->btpo_level;
|
||||
stat->btpo_flags = opaque->btpo_flags;
|
||||
stat->btpo_cycleid = opaque->btpo_cycleid;
|
||||
|
||||
@ -237,7 +257,7 @@ bt_page_stats_internal(PG_FUNCTION_ARGS, enum pageinspect_version ext_version)
|
||||
values[j++] = psprintf("%u", stat.free_size);
|
||||
values[j++] = psprintf("%u", stat.btpo_prev);
|
||||
values[j++] = psprintf("%u", stat.btpo_next);
|
||||
values[j++] = psprintf("%u", (stat.type == 'd') ? stat.btpo.xact : stat.btpo.level);
|
||||
values[j++] = psprintf("%u", stat.btpo_level);
|
||||
values[j++] = psprintf("%d", stat.btpo_flags);
|
||||
|
||||
tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
|
||||
@ -503,10 +523,14 @@ bt_page_items_internal(PG_FUNCTION_ARGS, enum pageinspect_version ext_version)
|
||||
|
||||
opaque = (BTPageOpaque) PageGetSpecialPointer(uargs->page);
|
||||
|
||||
if (P_ISDELETED(opaque))
|
||||
elog(NOTICE, "page is deleted");
|
||||
|
||||
fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
|
||||
if (!P_ISDELETED(opaque))
|
||||
fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
|
||||
else
|
||||
{
|
||||
/* Don't interpret BTDeletedPageData as index tuples */
|
||||
elog(NOTICE, "page from block " INT64_FORMAT " is deleted", blkno);
|
||||
fctx->max_calls = 0;
|
||||
}
|
||||
uargs->leafpage = P_ISLEAF(opaque);
|
||||
uargs->rightmost = P_RIGHTMOST(opaque);
|
||||
|
||||
@ -603,7 +627,14 @@ bt_page_items_bytea(PG_FUNCTION_ARGS)
|
||||
if (P_ISDELETED(opaque))
|
||||
elog(NOTICE, "page is deleted");
|
||||
|
||||
fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
|
||||
if (!P_ISDELETED(opaque))
|
||||
fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
|
||||
else
|
||||
{
|
||||
/* Don't interpret BTDeletedPageData as index tuples */
|
||||
elog(NOTICE, "page from block is deleted");
|
||||
fctx->max_calls = 0;
|
||||
}
|
||||
uargs->leafpage = P_ISLEAF(opaque);
|
||||
uargs->rightmost = P_RIGHTMOST(opaque);
|
||||
|
||||
@ -692,10 +723,7 @@ bt_metap(PG_FUNCTION_ARGS)
|
||||
|
||||
/*
|
||||
* We need a kluge here to detect API versions prior to 1.8. Earlier
|
||||
* versions incorrectly used int4 for certain columns. This caused
|
||||
* various problems. For example, an int4 version of the "oldest_xact"
|
||||
* column would not work with TransactionId values that happened to exceed
|
||||
* PG_INT32_MAX.
|
||||
* versions incorrectly used int4 for certain columns.
|
||||
*
|
||||
* There is no way to reliably avoid the problems created by the old
|
||||
* function definition at this point, so insist that the user update the
|
||||
@ -723,7 +751,8 @@ bt_metap(PG_FUNCTION_ARGS)
|
||||
*/
|
||||
if (metad->btm_version >= BTREE_NOVAC_VERSION)
|
||||
{
|
||||
values[j++] = psprintf("%u", metad->btm_oldest_btpo_xact);
|
||||
values[j++] = psprintf(INT64_FORMAT,
|
||||
(int64) metad->btm_last_cleanup_num_delpages);
|
||||
values[j++] = psprintf("%f", metad->btm_last_cleanup_num_heap_tuples);
|
||||
values[j++] = metad->btm_allequalimage ? "t" : "f";
|
||||
}
|
||||
|
Reference in New Issue
Block a user