diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index a5a76278391..c4ca6339182 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -769,7 +769,7 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) P_FIRSTDATAKEY(opaque)); itup = (IndexTuple) PageGetItem(state->target, itemid); nextleveldown.leftmost = BTreeTupleGetDownLink(itup); - nextleveldown.level = opaque->btpo.level - 1; + nextleveldown.level = opaque->btpo_level - 1; } else { @@ -794,14 +794,14 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) if (opaque->btpo_prev != leftcurrent) bt_recheck_sibling_links(state, opaque->btpo_prev, leftcurrent); - /* Check level, which must be valid for non-ignorable page */ - if (level.level != opaque->btpo.level) + /* Check level */ + if (level.level != opaque->btpo_level) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("leftmost down link for level points to block in index \"%s\" whose level is not one level down", RelationGetRelationName(state->rel)), errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.", - current, level.level, opaque->btpo.level))); + current, level.level, opaque->btpo_level))); /* Verify invariants for page */ bt_target_page_check(state); @@ -1164,7 +1164,7 @@ bt_target_page_check(BtreeCheckState *state) bt_child_highkey_check(state, offset, NULL, - topaque->btpo.level); + topaque->btpo_level); } continue; } @@ -1520,7 +1520,7 @@ bt_target_page_check(BtreeCheckState *state) if (!P_ISLEAF(topaque) && P_RIGHTMOST(topaque) && state->readonly) { bt_child_highkey_check(state, InvalidOffsetNumber, - NULL, topaque->btpo.level); + NULL, topaque->btpo_level); } } @@ -1597,7 +1597,7 @@ bt_right_page_check_scankey(BtreeCheckState *state) ereport(DEBUG1, (errcode(ERRCODE_NO_DATA), errmsg_internal("level %u leftmost page of index \"%s\" was found deleted or half dead", - opaque->btpo.level, RelationGetRelationName(state->rel)), + opaque->btpo_level, RelationGetRelationName(state->rel)), errdetail_internal("Deleted page found when building scankey from right sibling."))); /* Be slightly more pro-active in freeing this memory, just in case */ @@ -1900,14 +1900,15 @@ bt_child_highkey_check(BtreeCheckState *state, state->targetblock, blkno, LSN_FORMAT_ARGS(state->targetlsn)))); - /* Check level for non-ignorable page */ - if (!P_IGNORE(opaque) && opaque->btpo.level != target_level - 1) + /* Do level sanity check */ + if ((!P_ISDELETED(opaque) || P_HAS_FULLXID(opaque)) && + opaque->btpo_level != target_level - 1) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("block found while following rightlinks from child of index \"%s\" has invalid level", RelationGetRelationName(state->rel)), errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.", - blkno, target_level - 1, opaque->btpo.level))); + blkno, target_level - 1, opaque->btpo_level))); /* Try to detect circular links */ if ((!first && blkno == state->prevrightlink) || blkno == opaque->btpo_prev) @@ -2132,7 +2133,7 @@ bt_child_check(BtreeCheckState *state, BTScanInsert targetkey, * check for downlink connectivity. 
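	 * (Passing the target's btpo_level here, rather than the old
	 * btpo.level union member, is what lets bt_child_highkey_check()
	 * apply the level sanity check shown in the earlier hunk to every
	 * sibling page it visits on the child level.)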
*/ bt_child_highkey_check(state, downlinkoffnum, - child, topaque->btpo.level); + child, topaque->btpo_level); /* * Since there cannot be a concurrent VACUUM operation in readonly mode, @@ -2275,7 +2276,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, errmsg_internal("harmless interrupted page split detected in index %s", RelationGetRelationName(state->rel)), errdetail_internal("Block=%u level=%u left sibling=%u page lsn=%X/%X.", - blkno, opaque->btpo.level, + blkno, opaque->btpo_level, opaque->btpo_prev, LSN_FORMAT_ARGS(pagelsn)))); return; @@ -2304,7 +2305,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, elog(DEBUG1, "checking for interrupted multi-level deletion due to missing downlink in index \"%s\"", RelationGetRelationName(state->rel)); - level = opaque->btpo.level; + level = opaque->btpo_level; itemid = PageGetItemIdCareful(state, blkno, page, P_FIRSTDATAKEY(opaque)); itup = (IndexTuple) PageGetItem(page, itemid); childblk = BTreeTupleGetDownLink(itup); @@ -2319,16 +2320,16 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, break; /* Do an extra sanity check in passing on internal pages */ - if (copaque->btpo.level != level - 1) + if (copaque->btpo_level != level - 1) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg_internal("downlink points to block in index \"%s\" whose level is not one level down", RelationGetRelationName(state->rel)), errdetail_internal("Top parent/under check block=%u block pointed to=%u expected level=%u level in pointed to block=%u.", blkno, childblk, - level - 1, copaque->btpo.level))); + level - 1, copaque->btpo_level))); - level = copaque->btpo.level; + level = copaque->btpo_level; itemid = PageGetItemIdCareful(state, childblk, child, P_FIRSTDATAKEY(copaque)); itup = (IndexTuple) PageGetItem(child, itemid); @@ -2389,7 +2390,7 @@ bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, errmsg("internal index block lacks downlink in index \"%s\"", RelationGetRelationName(state->rel)), errdetail_internal("Block=%u level=%u page lsn=%X/%X.", - blkno, opaque->btpo.level, + blkno, opaque->btpo_level, LSN_FORMAT_ARGS(pagelsn)))); } @@ -2983,21 +2984,28 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum) } /* - * Deleted pages have no sane "level" field, so can only check non-deleted - * page level + * Deleted pages that still use the old 32-bit XID representation have no + * sane "level" field because they type pun the field, but all other pages + * (including pages deleted on Postgres 14+) have a valid value. 
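+	 * Whether btpo_level can be trusted therefore reduces to the same flag
+	 * test used below -- only Postgres 14+ page deletion sets
+	 * BTP_HAS_FULLXID in btpo_flags:
+	 *
+	 *		!P_ISDELETED(opaque) || P_HAS_FULLXID(opaque)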
*/ - if (P_ISLEAF(opaque) && !P_ISDELETED(opaque) && opaque->btpo.level != 0) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("invalid leaf page level %u for block %u in index \"%s\"", - opaque->btpo.level, blocknum, RelationGetRelationName(state->rel)))); + if (!P_ISDELETED(opaque) || P_HAS_FULLXID(opaque)) + { + /* Okay, no reason not to trust btpo_level field from page */ - if (!P_ISLEAF(opaque) && !P_ISDELETED(opaque) && - opaque->btpo.level == 0) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("invalid internal page level 0 for block %u in index \"%s\"", - blocknum, RelationGetRelationName(state->rel)))); + if (P_ISLEAF(opaque) && opaque->btpo_level != 0) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("invalid leaf page level %u for block %u in index \"%s\"", + opaque->btpo_level, blocknum, + RelationGetRelationName(state->rel)))); + + if (!P_ISLEAF(opaque) && opaque->btpo_level == 0) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("invalid internal page level 0 for block %u in index \"%s\"", + blocknum, + RelationGetRelationName(state->rel)))); + } /* * Sanity checks for number of items on page. @@ -3044,8 +3052,6 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum) * state. This state is nonetheless treated as corruption by VACUUM on * from version 9.4 on, so do the same here. See _bt_pagedel() for full * details. - * - * Internal pages should never have garbage items, either. */ if (!P_ISLEAF(opaque) && P_ISHALFDEAD(opaque)) ereport(ERROR, @@ -3054,11 +3060,27 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum) blocknum, RelationGetRelationName(state->rel)), errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. 
Please REINDEX it.")));
 
+	/*
+	 * Check that internal pages have no garbage items, and that no page has
+	 * an invalid combination of deletion-related page level flags
+	 */
 	if (!P_ISLEAF(opaque) && P_HAS_GARBAGE(opaque))
 		ereport(ERROR,
 				(errcode(ERRCODE_INDEX_CORRUPTED),
-				 errmsg("internal page block %u in index \"%s\" has garbage items",
-						blocknum, RelationGetRelationName(state->rel))));
+				 errmsg_internal("internal page block %u in index \"%s\" has garbage items",
+								 blocknum, RelationGetRelationName(state->rel))));
+
+	if (P_HAS_FULLXID(opaque) && !P_ISDELETED(opaque))
+		ereport(ERROR,
+				(errcode(ERRCODE_INDEX_CORRUPTED),
+				 errmsg_internal("full transaction id page flag appears in non-deleted block %u in index \"%s\"",
+								 blocknum, RelationGetRelationName(state->rel))));
+
+	if (P_ISDELETED(opaque) && P_ISHALFDEAD(opaque))
+		ereport(ERROR,
+				(errcode(ERRCODE_INDEX_CORRUPTED),
+				 errmsg_internal("deleted page block %u in index \"%s\" is half-dead",
+								 blocknum, RelationGetRelationName(state->rel))));
 
 	return page;
 }
diff --git a/contrib/pageinspect/btreefuncs.c b/contrib/pageinspect/btreefuncs.c
index 8bb180bbbe0..b7725b572f0 100644
--- a/contrib/pageinspect/btreefuncs.c
+++ b/contrib/pageinspect/btreefuncs.c
@@ -75,11 +75,7 @@ typedef struct BTPageStat
 	/* opaque data */
 	BlockNumber btpo_prev;
 	BlockNumber btpo_next;
-	union
-	{
-		uint32		level;
-		TransactionId xact;
-	}			btpo;
+	uint32		btpo_level;
 	uint16		btpo_flags;
 	BTCycleId	btpo_cycleid;
 } BTPageStat;
@@ -112,9 +108,33 @@ GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat)
 	/* page type (flags) */
 	if (P_ISDELETED(opaque))
 	{
-		stat->type = 'd';
-		stat->btpo.xact = opaque->btpo.xact;
-		return;
+		/* We divide deleted pages into leaf ('d') or internal ('D') */
+		if (P_ISLEAF(opaque) || !P_HAS_FULLXID(opaque))
+			stat->type = 'd';
+		else
+			stat->type = 'D';
+
+		/*
+		 * Report safexid in a deleted page.
+		 *
+		 * Handle pg_upgrade'd deleted pages that used the previous safexid
+		 * representation in btpo_level field (this used to be a union type
+		 * called "btpo").
+		 */
+		if (P_HAS_FULLXID(opaque))
+		{
+			FullTransactionId safexid = BTPageGetDeleteXid(page);
+
+			elog(NOTICE, "deleted page from block %u has safexid %u:%u",
+				 blkno, EpochFromFullTransactionId(safexid),
+				 XidFromFullTransactionId(safexid));
+		}
+		else
+			elog(NOTICE, "deleted page from block %u has safexid %u",
+				 blkno, opaque->btpo_level);
+
+		/* Don't interpret BTDeletedPageData as index tuples */
+		maxoff = InvalidOffsetNumber;
 	}
 	else if (P_IGNORE(opaque))
 		stat->type = 'e';
@@ -128,7 +148,7 @@ GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat)
 	/* btpage opaque data */
 	stat->btpo_prev = opaque->btpo_prev;
 	stat->btpo_next = opaque->btpo_next;
-	stat->btpo.level = opaque->btpo.level;
+	stat->btpo_level = opaque->btpo_level;
 	stat->btpo_flags = opaque->btpo_flags;
 	stat->btpo_cycleid = opaque->btpo_cycleid;
@@ -237,7 +257,7 @@ bt_page_stats_internal(PG_FUNCTION_ARGS, enum pageinspect_version ext_version)
 	values[j++] = psprintf("%u", stat.free_size);
 	values[j++] = psprintf("%u", stat.btpo_prev);
 	values[j++] = psprintf("%u", stat.btpo_next);
-	values[j++] = psprintf("%u", (stat.type == 'd') ?
-						   stat.btpo.xact : stat.btpo.level);
+	values[j++] = psprintf("%u", stat.btpo_level);
 	values[j++] = psprintf("%d", stat.btpo_flags);
 
 	tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
@@ -503,10 +523,14 @@ bt_page_items_internal(PG_FUNCTION_ARGS, enum pageinspect_version ext_version)
 
 		opaque = (BTPageOpaque) PageGetSpecialPointer(uargs->page);
 
-		if (P_ISDELETED(opaque))
-			elog(NOTICE, "page is deleted");
-
-		fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
+		if (!P_ISDELETED(opaque))
+			fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
+		else
+		{
+			/* Don't interpret BTDeletedPageData as index tuples */
+			elog(NOTICE, "page from block " INT64_FORMAT " is deleted", blkno);
+			fctx->max_calls = 0;
+		}
 
 		uargs->leafpage = P_ISLEAF(opaque);
 		uargs->rightmost = P_RIGHTMOST(opaque);
@@ -603,7 +627,11 @@ bt_page_items_bytea(PG_FUNCTION_ARGS)
 
-		if (P_ISDELETED(opaque))
-			elog(NOTICE, "page is deleted");
-
-		fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
+		if (!P_ISDELETED(opaque))
+			fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
+		else
+		{
+			/* Don't interpret BTDeletedPageData as index tuples */
+			elog(NOTICE, "page is deleted");
+			fctx->max_calls = 0;
+		}
 
 		uargs->leafpage = P_ISLEAF(opaque);
 		uargs->rightmost = P_RIGHTMOST(opaque);
@@ -692,10 +723,7 @@ bt_metap(PG_FUNCTION_ARGS)
 
 	/*
 	 * We need a kluge here to detect API versions prior to 1.8.  Earlier
-	 * versions incorrectly used int4 for certain columns.  This caused
-	 * various problems.  For example, an int4 version of the "oldest_xact"
-	 * column would not work with TransactionId values that happened to exceed
-	 * PG_INT32_MAX.
+	 * versions incorrectly used int4 for certain columns.
 	 *
 	 * There is no way to reliably avoid the problems created by the old
 	 * function definition at this point, so insist that the user update the
@@ -723,7 +751,8 @@ bt_metap(PG_FUNCTION_ARGS)
 	 */
 	if (metad->btm_version >= BTREE_NOVAC_VERSION)
 	{
-		values[j++] = psprintf("%u", metad->btm_oldest_btpo_xact);
+		values[j++] = psprintf(INT64_FORMAT,
+							   (int64) metad->btm_last_cleanup_num_delpages);
 		values[j++] = psprintf("%f", metad->btm_last_cleanup_num_heap_tuples);
 		values[j++] = metad->btm_allequalimage ?
"t" : "f"; } diff --git a/contrib/pageinspect/expected/btree.out b/contrib/pageinspect/expected/btree.out index a7632be36a1..c60bc88560c 100644 --- a/contrib/pageinspect/expected/btree.out +++ b/contrib/pageinspect/expected/btree.out @@ -3,16 +3,16 @@ INSERT INTO test1 VALUES (72057594037927937, 'text'); CREATE INDEX test1_a_idx ON test1 USING btree (a); \x SELECT * FROM bt_metap('test1_a_idx'); --[ RECORD 1 ]-----------+------- -magic | 340322 -version | 4 -root | 1 -level | 0 -fastroot | 1 -fastlevel | 0 -oldest_xact | 0 -last_cleanup_num_tuples | -1 -allequalimage | t +-[ RECORD 1 ]-------------+------- +magic | 340322 +version | 4 +root | 1 +level | 0 +fastroot | 1 +fastlevel | 0 +last_cleanup_num_delpages | 0 +last_cleanup_num_tuples | -1 +allequalimage | t SELECT * FROM bt_page_stats('test1_a_idx', -1); ERROR: invalid block number @@ -29,7 +29,7 @@ page_size | 8192 free_size | 8128 btpo_prev | 0 btpo_next | 0 -btpo | 0 +btpo_level | 0 btpo_flags | 3 SELECT * FROM bt_page_stats('test1_a_idx', 2); diff --git a/contrib/pageinspect/pageinspect--1.8--1.9.sql b/contrib/pageinspect/pageinspect--1.8--1.9.sql index 79a42a7b11e..be89a64ca14 100644 --- a/contrib/pageinspect/pageinspect--1.8--1.9.sql +++ b/contrib/pageinspect/pageinspect--1.8--1.9.sql @@ -66,6 +66,23 @@ RETURNS smallint AS 'MODULE_PATHNAME', 'page_checksum_1_9' LANGUAGE C STRICT PARALLEL SAFE; +-- +-- bt_metap() +-- +DROP FUNCTION bt_metap(text); +CREATE FUNCTION bt_metap(IN relname text, + OUT magic int4, + OUT version int4, + OUT root int8, + OUT level int8, + OUT fastroot int8, + OUT fastlevel int8, + OUT last_cleanup_num_delpages int8, + OUT last_cleanup_num_tuples float8, + OUT allequalimage boolean) +AS 'MODULE_PATHNAME', 'bt_metap' +LANGUAGE C STRICT PARALLEL SAFE; + -- -- bt_page_stats() -- @@ -80,7 +97,7 @@ CREATE FUNCTION bt_page_stats(IN relname text, IN blkno int8, OUT free_size int4, OUT btpo_prev int8, OUT btpo_next int8, - OUT btpo int4, + OUT btpo_level int8, OUT btpo_flags int4) AS 'MODULE_PATHNAME', 'bt_page_stats_1_9' LANGUAGE C STRICT PARALLEL SAFE; diff --git a/contrib/pgstattuple/pgstatindex.c b/contrib/pgstattuple/pgstatindex.c index b1ce0d77d73..5368bb30f0c 100644 --- a/contrib/pgstattuple/pgstatindex.c +++ b/contrib/pgstattuple/pgstatindex.c @@ -283,8 +283,12 @@ pgstatindex_impl(Relation rel, FunctionCallInfo fcinfo) page = BufferGetPage(buffer); opaque = (BTPageOpaque) PageGetSpecialPointer(page); - /* Determine page type, and update totals */ - + /* + * Determine page type, and update totals. + * + * Note that we arbitrarily bucket deleted pages together without + * considering if they're leaf pages or internal pages. + */ if (P_ISDELETED(opaque)) indexStat.deleted_pages++; else if (P_IGNORE(opaque)) diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index d7a73767984..b5718fc1366 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -8529,11 +8529,10 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv; If no tuples were deleted from the heap, B-tree indexes are still - scanned at the VACUUM cleanup stage when at least one - of the following conditions is met: the index statistics are stale, or - the index contains deleted pages that can be recycled during cleanup. - Index statistics are considered to be stale if the number of newly - inserted tuples exceeds the vacuum_cleanup_index_scale_factor + scanned at the VACUUM cleanup stage when the + index's statistics are stale. 
Index statistics are considered + stale if the number of newly inserted tuples exceeds the + vacuum_cleanup_index_scale_factor fraction of the total number of heap tuples detected by the previous statistics collection. The total number of heap tuples is stored in the index meta-page. Note that the meta-page does not include this data diff --git a/doc/src/sgml/pageinspect.sgml b/doc/src/sgml/pageinspect.sgml index e29eb0783ab..59620faec00 100644 --- a/doc/src/sgml/pageinspect.sgml +++ b/doc/src/sgml/pageinspect.sgml @@ -298,16 +298,16 @@ test=# SELECT t_ctid, raw_flags, combined_flags index's metapage. For example: test=# SELECT * FROM bt_metap('pg_cast_oid_index'); --[ RECORD 1 ]-----------+------- -magic | 340322 -version | 4 -root | 1 -level | 0 -fastroot | 1 -fastlevel | 0 -oldest_xact | 582 -last_cleanup_num_tuples | 1000 -allequalimage | f +-[ RECORD 1 ]-------------+------- +magic | 340322 +version | 4 +root | 1 +level | 0 +fastroot | 1 +fastlevel | 0 +last_cleanup_num_delpages | 0 +last_cleanup_num_tuples | 230 +allequalimage | f @@ -337,7 +337,7 @@ page_size | 8192 free_size | 3668 btpo_prev | 0 btpo_next | 0 -btpo | 0 +btpo_level | 0 btpo_flags | 3 diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c index f2eda79bc1a..1c80eae044a 100644 --- a/src/backend/access/gist/gistxlog.c +++ b/src/backend/access/gist/gistxlog.c @@ -394,28 +394,8 @@ gistRedoPageReuse(XLogReaderState *record) * same exclusion effect on primary and standby. */ if (InHotStandby) - { - FullTransactionId latestRemovedFullXid = xlrec->latestRemovedFullXid; - FullTransactionId nextXid = ReadNextFullTransactionId(); - uint64 diff; - - /* - * ResolveRecoveryConflictWithSnapshot operates on 32-bit - * TransactionIds, so truncate the logged FullTransactionId. If the - * logged value is very old, so that XID wrap-around already happened - * on it, there can't be any snapshots that still see it. 
- */ - diff = U64FromFullTransactionId(nextXid) - - U64FromFullTransactionId(latestRemovedFullXid); - if (diff < MaxTransactionId / 2) - { - TransactionId latestRemovedXid; - - latestRemovedXid = XidFromFullTransactionId(latestRemovedFullXid); - ResolveRecoveryConflictWithSnapshot(latestRemovedXid, - xlrec->node); - } - } + ResolveRecoveryConflictWithSnapshotFullXid(xlrec->latestRemovedFullXid, + xlrec->node); } void diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index e3336039125..1edb9f95797 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -1241,7 +1241,7 @@ _bt_insertonpg(Relation rel, metapg = BufferGetPage(metabuf); metad = BTPageGetMeta(metapg); - if (metad->btm_fastlevel >= opaque->btpo.level) + if (metad->btm_fastlevel >= opaque->btpo_level) { /* no update wanted */ _bt_relbuf(rel, metabuf); @@ -1268,7 +1268,7 @@ _bt_insertonpg(Relation rel, if (metad->btm_version < BTREE_NOVAC_VERSION) _bt_upgrademetapage(metapg); metad->btm_fastroot = BufferGetBlockNumber(buf); - metad->btm_fastlevel = opaque->btpo.level; + metad->btm_fastlevel = opaque->btpo_level; MarkBufferDirty(metabuf); } @@ -1331,7 +1331,7 @@ _bt_insertonpg(Relation rel, xlmeta.level = metad->btm_level; xlmeta.fastroot = metad->btm_fastroot; xlmeta.fastlevel = metad->btm_fastlevel; - xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact; + xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages; xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; xlmeta.allequalimage = metad->btm_allequalimage; @@ -1537,7 +1537,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT; lopaque->btpo_prev = oopaque->btpo_prev; /* handle btpo_next after rightpage buffer acquired */ - lopaque->btpo.level = oopaque->btpo.level; + lopaque->btpo_level = oopaque->btpo_level; /* handle btpo_cycleid after rightpage buffer acquired */ /* @@ -1722,7 +1722,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE); ropaque->btpo_prev = origpagenumber; ropaque->btpo_next = oopaque->btpo_next; - ropaque->btpo.level = oopaque->btpo.level; + ropaque->btpo_level = oopaque->btpo_level; ropaque->btpo_cycleid = lopaque->btpo_cycleid; /* @@ -1950,7 +1950,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, uint8 xlinfo; XLogRecPtr recptr; - xlrec.level = ropaque->btpo.level; + xlrec.level = ropaque->btpo_level; /* See comments below on newitem, orignewitem, and posting lists */ xlrec.firstrightoff = firstrightoff; xlrec.newitemoff = newitemoff; @@ -2142,7 +2142,7 @@ _bt_insert_parent(Relation rel, BlockNumberIsValid(RelationGetTargetBlock(rel)))); /* Find the leftmost page at the next level up */ - pbuf = _bt_get_endpoint(rel, opaque->btpo.level + 1, false, NULL); + pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL); /* Set up a phony stack entry pointing there */ stack = &fakestack; stack->bts_blkno = BufferGetBlockNumber(pbuf); @@ -2480,15 +2480,15 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE; rootopaque->btpo_flags = BTP_ROOT; - rootopaque->btpo.level = - ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo.level + 1; + rootopaque->btpo_level = + ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo_level + 1; 
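+	/* new root is one level above the page that just split (the old root) */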
rootopaque->btpo_cycleid = 0; /* update metapage data */ metad->btm_root = rootblknum; - metad->btm_level = rootopaque->btpo.level; + metad->btm_level = rootopaque->btpo_level; metad->btm_fastroot = rootblknum; - metad->btm_fastlevel = rootopaque->btpo.level; + metad->btm_fastlevel = rootopaque->btpo_level; /* * Insert the left page pointer into the new root page. The root page is @@ -2548,7 +2548,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) md.level = metad->btm_level; md.fastroot = rootblknum; md.fastlevel = metad->btm_level; - md.oldest_btpo_xact = metad->btm_oldest_btpo_xact; + md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages; md.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; md.allequalimage = metad->btm_allequalimage; diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 8c326a4774c..a43805a7b09 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -37,7 +37,7 @@ static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf); static void _bt_log_reuse_page(Relation rel, BlockNumber blkno, - TransactionId latestRemovedXid); + FullTransactionId safexid); static void _bt_delitems_delete(Relation rel, Buffer buf, TransactionId latestRemovedXid, OffsetNumber *deletable, int ndeletable, @@ -50,7 +50,6 @@ static bool _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, bool *rightsib_empty, - TransactionId *oldestBtpoXact, uint32 *ndeleted); static bool _bt_lock_subtree_parent(Relation rel, BlockNumber child, BTStack stack, @@ -78,7 +77,7 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, metad->btm_level = level; metad->btm_fastroot = rootbknum; metad->btm_fastlevel = level; - metad->btm_oldest_btpo_xact = InvalidTransactionId; + metad->btm_last_cleanup_num_delpages = 0; metad->btm_last_cleanup_num_heap_tuples = -1.0; metad->btm_allequalimage = allequalimage; @@ -118,7 +117,7 @@ _bt_upgrademetapage(Page page) /* Set version number and fill extra fields added into version 3 */ metad->btm_version = BTREE_NOVAC_VERSION; - metad->btm_oldest_btpo_xact = InvalidTransactionId; + metad->btm_last_cleanup_num_delpages = 0; metad->btm_last_cleanup_num_heap_tuples = -1.0; /* Only a REINDEX can set this field */ Assert(!metad->btm_allequalimage); @@ -169,35 +168,61 @@ _bt_getmeta(Relation rel, Buffer metabuf) } /* - * _bt_update_meta_cleanup_info() -- Update cleanup-related information in - * the metapage. + * _bt_set_cleanup_info() -- Update metapage for btvacuumcleanup(). * - * This routine checks if provided cleanup-related information is matching - * to those written in the metapage. On mismatch, metapage is overwritten. + * This routine is called at the end of each VACUUM's btvacuumcleanup() + * call. Its purpose is to maintain the metapage fields that are used by + * _bt_vacuum_needs_cleanup() to decide whether or not a btvacuumscan() + * call should go ahead for an entire VACUUM operation. + * + * See btvacuumcleanup() and _bt_vacuum_needs_cleanup() for details of + * the two fields that we maintain here. + * + * The information that we maintain for btvacuumcleanup() describes the + * state of the index (as well as the table it indexes) just _after_ the + * ongoing VACUUM operation. 
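+ * (For a concrete sketch of the caller's side: btvacuumcleanup() below
+ * ends up doing
+ *
+ *		num_delpages = stats->pages_deleted - stats->pages_free;
+ *		_bt_set_cleanup_info(info->index, num_delpages,
+ *							 info->num_heap_tuples);
+ *
+ * exactly once per VACUUM operation.)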
The next _bt_vacuum_needs_cleanup() call + * will consider the information we saved for it during the next VACUUM + * operation (assuming that there will be no btbulkdelete() call during + * the next VACUUM operation -- if there is then the question of skipping + * btvacuumscan() doesn't even arise). */ void -_bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact, - float8 numHeapTuples) +_bt_set_cleanup_info(Relation rel, BlockNumber num_delpages, + float8 num_heap_tuples) { Buffer metabuf; Page metapg; BTMetaPageData *metad; - bool needsRewrite = false; + bool rewrite = false; XLogRecPtr recptr; - /* read the metapage and check if it needs rewrite */ + /* + * On-disk compatibility note: The btm_last_cleanup_num_delpages metapage + * field started out as a TransactionId field called btm_oldest_btpo_xact. + * Both "versions" are just uint32 fields. It was convenient to repurpose + * the field when we began to use 64-bit XIDs in deleted pages. + * + * It's possible that a pg_upgrade'd database will contain an XID value in + * what is now recognized as the metapage's btm_last_cleanup_num_delpages + * field. _bt_vacuum_needs_cleanup() may even believe that this value + * indicates that there are lots of pages that it needs to recycle, when + * in reality there are only one or two. The worst that can happen is + * that there will be a call to btvacuumscan a little earlier, which will + * set btm_last_cleanup_num_delpages to a sane value when we're called. + */ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); metapg = BufferGetPage(metabuf); metad = BTPageGetMeta(metapg); - /* outdated version of metapage always needs rewrite */ + /* Always dynamically upgrade index/metapage when BTREE_MIN_VERSION */ if (metad->btm_version < BTREE_NOVAC_VERSION) - needsRewrite = true; - else if (metad->btm_oldest_btpo_xact != oldestBtpoXact || - metad->btm_last_cleanup_num_heap_tuples != numHeapTuples) - needsRewrite = true; + rewrite = true; + else if (metad->btm_last_cleanup_num_delpages != num_delpages) + rewrite = true; + else if (metad->btm_last_cleanup_num_heap_tuples != num_heap_tuples) + rewrite = true; - if (!needsRewrite) + if (!rewrite) { _bt_relbuf(rel, metabuf); return; @@ -214,8 +239,8 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact, _bt_upgrademetapage(metapg); /* update cleanup-related information */ - metad->btm_oldest_btpo_xact = oldestBtpoXact; - metad->btm_last_cleanup_num_heap_tuples = numHeapTuples; + metad->btm_last_cleanup_num_delpages = num_delpages; + metad->btm_last_cleanup_num_heap_tuples = num_heap_tuples; MarkBufferDirty(metabuf); /* write wal record if needed */ @@ -232,8 +257,8 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact, md.level = metad->btm_level; md.fastroot = metad->btm_fastroot; md.fastlevel = metad->btm_fastlevel; - md.oldest_btpo_xact = oldestBtpoXact; - md.last_cleanup_num_heap_tuples = numHeapTuples; + md.last_cleanup_num_delpages = num_delpages; + md.last_cleanup_num_heap_tuples = num_heap_tuples; md.allequalimage = metad->btm_allequalimage; XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata)); @@ -244,6 +269,7 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact, } END_CRIT_SECTION(); + _bt_relbuf(rel, metabuf); } @@ -316,7 +342,7 @@ _bt_getroot(Relation rel, int access) * because that's not set in a "fast root". 
*/ if (!P_IGNORE(rootopaque) && - rootopaque->btpo.level == rootlevel && + rootopaque->btpo_level == rootlevel && P_LEFTMOST(rootopaque) && P_RIGHTMOST(rootopaque)) { @@ -377,7 +403,7 @@ _bt_getroot(Relation rel, int access) rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE; rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT); - rootopaque->btpo.level = 0; + rootopaque->btpo_level = 0; rootopaque->btpo_cycleid = 0; /* Get raw page pointer for metapage */ metapg = BufferGetPage(metabuf); @@ -393,7 +419,7 @@ _bt_getroot(Relation rel, int access) metad->btm_level = 0; metad->btm_fastroot = rootblkno; metad->btm_fastlevel = 0; - metad->btm_oldest_btpo_xact = InvalidTransactionId; + metad->btm_last_cleanup_num_delpages = 0; metad->btm_last_cleanup_num_heap_tuples = -1.0; MarkBufferDirty(rootbuf); @@ -416,7 +442,7 @@ _bt_getroot(Relation rel, int access) md.level = 0; md.fastroot = rootblkno; md.fastlevel = 0; - md.oldest_btpo_xact = InvalidTransactionId; + md.last_cleanup_num_delpages = 0; md.last_cleanup_num_heap_tuples = -1.0; md.allequalimage = metad->btm_allequalimage; @@ -481,11 +507,10 @@ _bt_getroot(Relation rel, int access) rootblkno = rootopaque->btpo_next; } - /* Note: can't check btpo.level on deleted pages */ - if (rootopaque->btpo.level != rootlevel) + if (rootopaque->btpo_level != rootlevel) elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u", rootblkno, RelationGetRelationName(rel), - rootopaque->btpo.level, rootlevel); + rootopaque->btpo_level, rootlevel); } /* @@ -585,11 +610,10 @@ _bt_gettrueroot(Relation rel) rootblkno = rootopaque->btpo_next; } - /* Note: can't check btpo.level on deleted pages */ - if (rootopaque->btpo.level != rootlevel) + if (rootopaque->btpo_level != rootlevel) elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u", rootblkno, RelationGetRelationName(rel), - rootopaque->btpo.level, rootlevel); + rootopaque->btpo_level, rootlevel); return rootbuf; } @@ -762,7 +786,7 @@ _bt_checkpage(Relation rel, Buffer buf) * Log the reuse of a page from the FSM. */ static void -_bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid) +_bt_log_reuse_page(Relation rel, BlockNumber blkno, FullTransactionId safexid) { xl_btree_reuse_page xlrec_reuse; @@ -775,7 +799,7 @@ _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedX /* XLOG stuff */ xlrec_reuse.node = rel->rd_node; xlrec_reuse.block = blkno; - xlrec_reuse.latestRemovedXid = latestRemovedXid; + xlrec_reuse.latestRemovedFullXid = safexid; XLogBeginInsert(); XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage); @@ -856,26 +880,34 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access) if (_bt_conditionallockbuf(rel, buf)) { page = BufferGetPage(buf); - if (_bt_page_recyclable(page)) + + /* + * It's possible to find an all-zeroes page in an index. For + * example, a backend might successfully extend the relation + * one page and then crash before it is able to make a WAL + * entry for adding the page. If we find a zeroed page then + * reclaim it immediately. + */ + if (PageIsNew(page)) + { + /* Okay to use page. Initialize and return it. */ + _bt_pageinit(page, BufferGetPageSize(buf)); + return buf; + } + + if (BTPageIsRecyclable(page)) { /* * If we are generating WAL for Hot Standby then create a * WAL record that will allow us to conflict with queries * running on standby, in case they have snapshots older - * than btpo.xact. 
This can only apply if the page does - * have a valid btpo.xact value, ie not if it's new. (We - * must check that because an all-zero page has no special - * space.) + * than safexid value */ - if (XLogStandbyInfoActive() && RelationNeedsWAL(rel) && - !PageIsNew(page)) - { - BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (XLogStandbyInfoActive() && RelationNeedsWAL(rel)) + _bt_log_reuse_page(rel, blkno, + BTPageGetDeleteXid(page)); - _bt_log_reuse_page(rel, blkno, opaque->btpo.xact); - } - - /* Okay to use page. Re-initialize and return it */ + /* Okay to use page. Re-initialize and return it. */ _bt_pageinit(page, BufferGetPageSize(buf)); return buf; } @@ -1073,40 +1105,6 @@ _bt_pageinit(Page page, Size size) PageInit(page, size, sizeof(BTPageOpaqueData)); } -/* - * _bt_page_recyclable() -- Is an existing page recyclable? - * - * This exists to make sure _bt_getbuf and btvacuumscan have the same - * policy about whether a page is safe to re-use. But note that _bt_getbuf - * knows enough to distinguish the PageIsNew condition from the other one. - * At some point it might be appropriate to redesign this to have a three-way - * result value. - */ -bool -_bt_page_recyclable(Page page) -{ - BTPageOpaque opaque; - - /* - * It's possible to find an all-zeroes page in an index --- for example, a - * backend might successfully extend the relation one page and then crash - * before it is able to make a WAL entry for adding the page. If we find a - * zeroed page then reclaim it. - */ - if (PageIsNew(page)) - return true; - - /* - * Otherwise, recycle if deleted and too old to have any processes - * interested in it. - */ - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - if (P_ISDELETED(opaque) && - GlobalVisCheckRemovableXid(NULL, opaque->btpo.xact)) - return true; - return false; -} - /* * Delete item(s) from a btree leaf page during VACUUM. * @@ -1768,16 +1766,12 @@ _bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib) * that the btvacuumscan scan has yet to reach; they'll get counted later * instead. * - * Maintains *oldestBtpoXact for any pages that get deleted. Caller is - * responsible for maintaining *oldestBtpoXact in the case of pages that were - * deleted by a previous VACUUM. - * * NOTE: this leaks memory. Rather than trying to clean up everything * carefully, it's better to run it in a temp context that can be reset * frequently. */ uint32 -_bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact) +_bt_pagedel(Relation rel, Buffer leafbuf) { uint32 ndeleted = 0; BlockNumber rightsib; @@ -1985,8 +1979,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact) { /* Check for interrupts in _bt_unlink_halfdead_page */ if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno, - &rightsib_empty, oldestBtpoXact, - &ndeleted)) + &rightsib_empty, &ndeleted)) { /* * _bt_unlink_halfdead_page should never fail, since we @@ -2002,8 +1995,6 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact) } Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque)); - Assert(TransactionIdFollowsOrEquals(opaque->btpo.xact, - *oldestBtpoXact)); rightsib = opaque->btpo_next; @@ -2264,12 +2255,6 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack) * containing leafbuf. (We always set *rightsib_empty for caller, just to be * consistent.) * - * We maintain *oldestBtpoXact for pages that are deleted by the current - * VACUUM operation here. 
This must be handled here because we conservatively - * assume that there needs to be a new call to ReadNextTransactionId() each - * time a page gets deleted. See comments about the underlying assumption - * below. - * * Must hold pin and lock on leafbuf at entry (read or write doesn't matter). * On success exit, we'll be holding pin and write lock. On failure exit, * we'll release both pin and lock before returning (we define it that way @@ -2277,8 +2262,7 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack) */ static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, - bool *rightsib_empty, TransactionId *oldestBtpoXact, - uint32 *ndeleted) + bool *rightsib_empty, uint32 *ndeleted) { BlockNumber leafblkno = BufferGetBlockNumber(leafbuf); BlockNumber leafleftsib; @@ -2294,12 +2278,12 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, BTMetaPageData *metad = NULL; ItemId itemid; Page page; - PageHeader header; BTPageOpaque opaque; + FullTransactionId safexid; bool rightsib_is_rightmost; - int targetlevel; + uint32 targetlevel; IndexTuple leafhikey; - BlockNumber nextchild; + BlockNumber leaftopparent; page = BufferGetPage(leafbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -2343,7 +2327,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); leftsib = opaque->btpo_prev; - targetlevel = opaque->btpo.level; + targetlevel = opaque->btpo_level; Assert(targetlevel > 0); /* @@ -2450,20 +2434,26 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, !P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque)) elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"", target, RelationGetRelationName(rel)); - nextchild = InvalidBlockNumber; + + /* Leaf page is also target page: don't set leaftopparent */ + leaftopparent = InvalidBlockNumber; } else { + IndexTuple finaldataitem; + if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) || P_ISLEAF(opaque)) elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"", target, RelationGetRelationName(rel)); - /* Remember the next non-leaf child down in the subtree */ + /* Target is internal: set leaftopparent for next call here... */ itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque)); - nextchild = BTreeTupleGetDownLink((IndexTuple) PageGetItem(page, itemid)); - if (nextchild == leafblkno) - nextchild = InvalidBlockNumber; + finaldataitem = (IndexTuple) PageGetItem(page, itemid); + leaftopparent = BTreeTupleGetDownLink(finaldataitem); + /* ...except when it would be a redundant pointer-to-self */ + if (leaftopparent == leafblkno) + leaftopparent = InvalidBlockNumber; } /* @@ -2553,13 +2543,13 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, * no lock was held. */ if (target != leafblkno) - BTreeTupleSetTopParent(leafhikey, nextchild); + BTreeTupleSetTopParent(leafhikey, leaftopparent); /* * Mark the page itself deleted. It can be recycled when all current * transactions are gone. Storing GetTopTransactionId() would work, but * we're in VACUUM and would not otherwise have an XID. Having already - * updated links to the target, ReadNextTransactionId() suffices as an + * updated links to the target, ReadNextFullTransactionId() suffices as an * upper bound. 
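	 * (BTPageSetDeleted() is a new nbtree.h helper that is not visible in
	 * this excerpt.  Roughly, and assuming the new BTDeletedPageData struct
	 * that now holds a deleted page's safexid, it does:
	 *
	 *		opaque->btpo_flags &= ~BTP_HALF_DEAD;
	 *		opaque->btpo_flags |= BTP_DELETED | BTP_HAS_FULLXID;
	 *		header->pd_lower = MAXALIGN(SizeOfPageHeaderData) +
	 *			sizeof(BTDeletedPageData);
	 *		header->pd_upper = header->pd_special;
	 *		((BTDeletedPageData *) PageGetContents(page))->safexid = safexid;
	 *
	 * BTPageGetDeleteXid() is its inverse, and reads the value back out.)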
Any scan having retained a now-stale link is advertising * in its PGPROC an xmin less than or equal to the value we read here. It * will continue to do so, holding back the xmin horizon, for the duration @@ -2568,17 +2558,14 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque)); - opaque->btpo_flags &= ~BTP_HALF_DEAD; - opaque->btpo_flags |= BTP_DELETED; - opaque->btpo.xact = ReadNextTransactionId(); /* - * Remove the remaining tuples on the page. This keeps things simple for - * WAL consistency checking. + * Store upper bound XID that's used to determine when deleted page is no + * longer needed as a tombstone */ - header = (PageHeader) page; - header->pd_lower = SizeOfPageHeaderData; - header->pd_upper = header->pd_special; + safexid = ReadNextFullTransactionId(); + BTPageSetDeleted(page, safexid); + opaque->btpo_cycleid = 0; /* And update the metapage, if needed */ if (BufferIsValid(metabuf)) @@ -2616,15 +2603,16 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, if (target != leafblkno) XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT); - /* information on the unlinked block */ + /* information stored on the target/to-be-unlinked block */ xlrec.leftsib = leftsib; xlrec.rightsib = rightsib; - xlrec.btpo_xact = opaque->btpo.xact; + xlrec.level = targetlevel; + xlrec.safexid = safexid; /* information needed to recreate the leaf block (if not the target) */ xlrec.leafleftsib = leafleftsib; xlrec.leafrightsib = leafrightsib; - xlrec.topparent = nextchild; + xlrec.leaftopparent = leaftopparent; XLogRegisterData((char *) &xlrec, SizeOfBtreeUnlinkPage); @@ -2638,7 +2626,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, xlmeta.level = metad->btm_level; xlmeta.fastroot = metad->btm_fastroot; xlmeta.fastlevel = metad->btm_fastlevel; - xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact; + xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages; xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; xlmeta.allequalimage = metad->btm_allequalimage; @@ -2681,9 +2669,9 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, _bt_relbuf(rel, lbuf); _bt_relbuf(rel, rbuf); - if (!TransactionIdIsValid(*oldestBtpoXact) || - TransactionIdPrecedes(opaque->btpo.xact, *oldestBtpoXact)) - *oldestBtpoXact = opaque->btpo.xact; + /* If the target is not leafbuf, we're done with it now -- release it */ + if (target != leafblkno) + _bt_relbuf(rel, buf); /* * If btvacuumscan won't revisit this page in a future btvacuumpage call @@ -2693,10 +2681,6 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, if (target <= scanblkno) (*ndeleted)++; - /* If the target is not leafbuf, we're done with it now -- release it */ - if (target != leafblkno) - _bt_relbuf(rel, buf); - return true; } diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 289bd3c15da..3b2e0aa5cb7 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -46,8 +46,6 @@ typedef struct IndexBulkDeleteCallback callback; void *callback_state; BTCycleId cycleid; - BlockNumber totFreePages; /* true total # of free pages */ - TransactionId oldestBtpoXact; MemoryContext pagedelcontext; } BTVacState; @@ -790,7 +788,7 @@ _bt_parallel_advance_array_keys(IndexScanDesc scan) * 
_bt_vacuum_needs_cleanup() -- Checks if index needs cleanup * * Called by btvacuumcleanup when btbulkdelete was never called because no - * tuples need to be deleted. + * tuples needed to be deleted by VACUUM. * * When we return false, VACUUM can even skip the cleanup-only call to * btvacuumscan (i.e. there will be no btvacuumscan call for this index at @@ -802,66 +800,75 @@ _bt_vacuum_needs_cleanup(IndexVacuumInfo *info) Buffer metabuf; Page metapg; BTMetaPageData *metad; - bool result = false; + BTOptions *relopts; + float8 cleanup_scale_factor; + uint32 btm_version; + BlockNumber prev_num_delpages; + float8 prev_num_heap_tuples; + /* + * Copy details from metapage to local variables quickly. + * + * Note that we deliberately avoid using cached version of metapage here. + */ metabuf = _bt_getbuf(info->index, BTREE_METAPAGE, BT_READ); metapg = BufferGetPage(metabuf); metad = BTPageGetMeta(metapg); + btm_version = metad->btm_version; + + if (btm_version < BTREE_NOVAC_VERSION) + { + /* + * Metapage needs to be dynamically upgraded to store fields that are + * only present when btm_version >= BTREE_NOVAC_VERSION + */ + _bt_relbuf(info->index, metabuf); + return true; + } + + prev_num_delpages = metad->btm_last_cleanup_num_delpages; + prev_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; + _bt_relbuf(info->index, metabuf); /* - * XXX: If IndexVacuumInfo contained the heap relation, we could be more - * aggressive about vacuuming non catalog relations by passing the table - * to GlobalVisCheckRemovableXid(). + * If the underlying table has received a sufficiently high number of + * insertions since the last VACUUM operation that called btvacuumscan(), + * then have the current VACUUM operation call btvacuumscan() now. This + * happens when the statistics are deemed stale. + * + * XXX: We should have a more principled way of determining what + * "staleness" means. The vacuum_cleanup_index_scale_factor GUC (and the + * index-level storage param) seem hard to tune in a principled way. */ + relopts = (BTOptions *) info->index->rd_options; + cleanup_scale_factor = (relopts && + relopts->vacuum_cleanup_index_scale_factor >= 0) + ? relopts->vacuum_cleanup_index_scale_factor + : vacuum_cleanup_index_scale_factor; - if (metad->btm_version < BTREE_NOVAC_VERSION) - { - /* - * Do cleanup if metapage needs upgrade, because we don't have - * cleanup-related meta-information yet. - */ - result = true; - } - else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) && - GlobalVisCheckRemovableXid(NULL, metad->btm_oldest_btpo_xact)) - { - /* - * If any oldest btpo.xact from a previously deleted page in the index - * is visible to everyone, then at least one deleted page can be - * recycled -- don't skip cleanup. - */ - result = true; - } - else - { - BTOptions *relopts; - float8 cleanup_scale_factor; - float8 prev_num_heap_tuples; + if (cleanup_scale_factor <= 0 || + info->num_heap_tuples < 0 || + prev_num_heap_tuples <= 0 || + (info->num_heap_tuples - prev_num_heap_tuples) / + prev_num_heap_tuples >= cleanup_scale_factor) + return true; - /* - * If table receives enough insertions and no cleanup was performed, - * then index would appear have stale statistics. If scale factor is - * set, we avoid that by performing cleanup if the number of inserted - * tuples exceeds vacuum_cleanup_index_scale_factor fraction of - * original tuples count. - */ - relopts = (BTOptions *) info->index->rd_options; - cleanup_scale_factor = (relopts && - relopts->vacuum_cleanup_index_scale_factor >= 0) - ? 
relopts->vacuum_cleanup_index_scale_factor - : vacuum_cleanup_index_scale_factor; - prev_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; + /* + * Trigger cleanup in rare cases where prev_num_delpages exceeds 5% of the + * total size of the index. We can reasonably expect (though are not + * guaranteed) to be able to recycle this many pages if we decide to do a + * btvacuumscan call during the ongoing btvacuumcleanup. + * + * Our approach won't reliably avoid "wasted" cleanup-only btvacuumscan + * calls. That is, we can end up scanning the entire index without ever + * placing even 1 of the prev_num_delpages pages in the free space map, at + * least in certain narrow cases (see nbtree/README section on recycling + * deleted pages for details). This rarely matters in practice. + */ + if (prev_num_delpages > RelationGetNumberOfBlocks(info->index) / 20) + return true; - if (cleanup_scale_factor <= 0 || - info->num_heap_tuples < 0 || - prev_num_heap_tuples <= 0 || - (info->num_heap_tuples - prev_num_heap_tuples) / - prev_num_heap_tuples >= cleanup_scale_factor) - result = true; - } - - _bt_relbuf(info->index, metabuf); - return result; + return false; } /* @@ -904,30 +911,62 @@ btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteResult * btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) { + BlockNumber num_delpages; + /* No-op in ANALYZE ONLY mode */ if (info->analyze_only) return stats; /* - * If btbulkdelete was called, we need not do anything, just return the - * stats from the latest btbulkdelete call. If it wasn't called, we might - * still need to do a pass over the index, to recycle any newly-recyclable - * pages or to obtain index statistics. _bt_vacuum_needs_cleanup - * determines if either are needed. + * If btbulkdelete was called, we need not do anything (we just maintain + * the information used within _bt_vacuum_needs_cleanup() by calling + * _bt_set_cleanup_info() below). * - * Since we aren't going to actually delete any leaf items, there's no - * need to go through all the vacuum-cycle-ID pushups. + * If btbulkdelete was _not_ called, then we have a choice to make: we + * must decide whether or not a btvacuumscan() call is needed now (i.e. + * whether the ongoing VACUUM operation can entirely avoid a physical scan + * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us + * now. */ if (stats == NULL) { - /* Check if we need a cleanup */ + /* Check if VACUUM operation can entirely avoid btvacuumscan() call */ if (!_bt_vacuum_needs_cleanup(info)) return NULL; + /* + * Since we aren't going to actually delete any leaf items, there's no + * need to go through all the vacuum-cycle-ID pushups here + */ stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); btvacuumscan(info, stats, NULL, NULL, 0); } + /* + * By here, we know for sure that this VACUUM operation won't be skipping + * its btvacuumscan() call. Maintain the count of the current number of + * heap tuples in the metapage. Also maintain the num_delpages value. + * This information will be used by _bt_vacuum_needs_cleanup() during + * future VACUUM operations that don't need to call btbulkdelete(). + * + * num_delpages is the number of deleted pages now in the index that were + * not safe to place in the FSM to be recycled just yet. We expect that + * it will almost certainly be possible to place all of these pages in the + * FSM during the next VACUUM operation. 
That factor alone might cause + * _bt_vacuum_needs_cleanup() to force the next VACUUM to proceed with a + * btvacuumscan() call. + * + * Note: We must delay the _bt_set_cleanup_info() call until this late + * stage of VACUUM (the btvacuumcleanup() phase), to keep num_heap_tuples + * accurate. The btbulkdelete()-time num_heap_tuples value is generally + * just pg_class.reltuples for the heap relation _before_ VACUUM began. + * In general cleanup info should describe the state of the index/table + * _after_ VACUUM finishes. + */ + Assert(stats->pages_deleted >= stats->pages_free); + num_delpages = stats->pages_deleted - stats->pages_free; + _bt_set_cleanup_info(info->index, num_delpages, info->num_heap_tuples); + /* * It's quite possible for us to be fooled by concurrent page splits into * double-counting some index tuples, so disbelieve any total that exceeds @@ -957,8 +996,6 @@ btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) * deleted, and looking for old deleted pages that can be recycled. Both * btbulkdelete and btvacuumcleanup invoke this (the latter only if no * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true). - * Note that this is also where the metadata used by _bt_vacuum_needs_cleanup - * is maintained. * * The caller is responsible for initially allocating/zeroing a stats struct * and for obtaining a vacuum cycle ID if necessary. @@ -975,12 +1012,25 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, bool needLock; /* - * Reset counts that will be incremented during the scan; needed in case - * of multiple scans during a single VACUUM command + * Reset fields that track information about the entire index now. This + * avoids double-counting in the case where a single VACUUM command + * requires multiple scans of the index. + * + * Avoid resetting the tuples_removed field here, since it tracks + * information about the VACUUM command, and so must last across each call + * to btvacuumscan(). + * + * (Note that pages_free is treated as state about the whole index, not + * the current VACUUM. This is appropriate because RecordFreeIndexPage() + * calls are idempotent, and get repeated for the same deleted pages in + * some scenarios. The point for us is to track the number of recyclable + * pages in the index at the end of the VACUUM command.) */ + stats->num_pages = 0; stats->estimated_count = false; stats->num_index_tuples = 0; stats->pages_deleted = 0; + stats->pages_free = 0; /* Set up info to pass down to btvacuumpage */ vstate.info = info; @@ -988,8 +1038,6 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, vstate.callback = callback; vstate.callback_state = callback_state; vstate.cycleid = cycleid; - vstate.totFreePages = 0; - vstate.oldestBtpoXact = InvalidTransactionId; /* Create a temporary memory context to run _bt_pagedel in */ vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext, @@ -1048,6 +1096,9 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, } } + /* Set statistics num_pages field to final size of index */ + stats->num_pages = num_pages; + MemoryContextDelete(vstate.pagedelcontext); /* @@ -1062,27 +1113,8 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, * Note that if no recyclable pages exist, we don't bother vacuuming the * FSM at all. 
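	 *
	 * (BTPageIsRecyclable() is the nbtree.h replacement for the removed
	 * _bt_page_recyclable(); it is not visible in this excerpt.  Roughly,
	 * assuming the BTPageGetDeleteXid() helper described earlier:
	 *
	 *		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	 *		if (P_ISDELETED(opaque))
	 *			return GlobalVisCheckRemovableFullXid(NULL,
	 *												  BTPageGetDeleteXid(page));
	 *		return false;
	 *
	 * i.e. the old 32-bit GlobalVisCheckRemovableXid() test against
	 * btpo.xact becomes a 64-bit test against the deleted page's safexid.)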
*/ - if (vstate.totFreePages > 0) + if (stats->pages_free > 0) IndexFreeSpaceMapVacuum(rel); - - /* - * Maintain the oldest btpo.xact and a count of the current number of heap - * tuples in the metapage (for the benefit of _bt_vacuum_needs_cleanup). - * - * The page with the oldest btpo.xact is typically a page deleted by this - * VACUUM operation, since pages deleted by a previous VACUUM operation - * tend to be placed in the FSM (by the current VACUUM operation) -- such - * pages are not candidates to be the oldest btpo.xact. (Note that pages - * placed in the FSM are reported as deleted pages in the bulk delete - * statistics, despite not counting as deleted pages for the purposes of - * determining the oldest btpo.xact.) - */ - _bt_update_meta_cleanup_info(rel, vstate.oldestBtpoXact, - info->num_heap_tuples); - - /* update statistics */ - stats->num_pages = num_pages; - stats->pages_free = vstate.totFreePages; } /* @@ -1188,13 +1220,12 @@ backtrack: } } - /* Page is valid, see what to do with it */ - if (_bt_page_recyclable(page)) + if (!opaque || BTPageIsRecyclable(page)) { /* Okay to recycle this page (which could be leaf or internal) */ RecordFreeIndexPage(rel, blkno); - vstate->totFreePages++; stats->pages_deleted++; + stats->pages_free++; } else if (P_ISDELETED(opaque)) { @@ -1203,17 +1234,12 @@ backtrack: * recycle yet. */ stats->pages_deleted++; - - /* Maintain the oldest btpo.xact */ - if (!TransactionIdIsValid(vstate->oldestBtpoXact) || - TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact)) - vstate->oldestBtpoXact = opaque->btpo.xact; } else if (P_ISHALFDEAD(opaque)) { /* * Half-dead leaf page. Try to delete now. Might update - * oldestBtpoXact and pages_deleted below. + * pages_deleted below. */ attempt_pagedel = true; } @@ -1430,7 +1456,7 @@ backtrack: * count. There will be no double-counting. */ Assert(blkno == scanblkno); - stats->pages_deleted += _bt_pagedel(rel, buf, &vstate->oldestBtpoXact); + stats->pages_deleted += _bt_pagedel(rel, buf); MemoryContextSwitchTo(oldcontext); /* pagedel released buffer, so we shouldn't */ diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 2e3bda8171d..d1177d8772c 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -169,7 +169,7 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, * we're on the level 1 and asked to lock leaf page in write mode, * then lock next page in write mode, because it must be a leaf. */ - if (opaque->btpo.level == 1 && access == BT_WRITE) + if (opaque->btpo_level == 1 && access == BT_WRITE) page_access = BT_WRITE; /* drop the read lock on the page, then acquire one on its child */ @@ -2341,9 +2341,9 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, } /* Done? 
*/ - if (opaque->btpo.level == level) + if (opaque->btpo_level == level) break; - if (opaque->btpo.level < level) + if (opaque->btpo_level < level) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg_internal("btree level %u not found in index \"%s\"", diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 5683daa34d3..2c4d7f6e25a 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -620,7 +620,7 @@ _bt_blnewpage(uint32 level) /* Initialize BT opaque state */ opaque = (BTPageOpaque) PageGetSpecialPointer(page); opaque->btpo_prev = opaque->btpo_next = P_NONE; - opaque->btpo.level = level; + opaque->btpo_level = level; opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF; opaque->btpo_cycleid = 0; diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index c1d578cc016..8b7c143db48 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -112,7 +112,7 @@ _bt_restore_meta(XLogReaderState *record, uint8 block_id) md->btm_fastlevel = xlrec->fastlevel; /* Cannot log BTREE_MIN_VERSION index metapage without upgrade */ Assert(md->btm_version >= BTREE_NOVAC_VERSION); - md->btm_oldest_btpo_xact = xlrec->oldest_btpo_xact; + md->btm_last_cleanup_num_delpages = xlrec->last_cleanup_num_delpages; md->btm_last_cleanup_num_heap_tuples = xlrec->last_cleanup_num_heap_tuples; md->btm_allequalimage = xlrec->allequalimage; @@ -297,7 +297,7 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) ropaque->btpo_prev = origpagenumber; ropaque->btpo_next = spagenumber; - ropaque->btpo.level = xlrec->level; + ropaque->btpo_level = xlrec->level; ropaque->btpo_flags = isleaf ? BTP_LEAF : 0; ropaque->btpo_cycleid = 0; @@ -773,7 +773,7 @@ btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record) pageop->btpo_prev = xlrec->leftblk; pageop->btpo_next = xlrec->rightblk; - pageop->btpo.level = 0; + pageop->btpo_level = 0; pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF; pageop->btpo_cycleid = 0; @@ -802,6 +802,9 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) XLogRecGetData(record); BlockNumber leftsib; BlockNumber rightsib; + uint32 level; + bool isleaf; + FullTransactionId safexid; Buffer leftbuf; Buffer target; Buffer rightbuf; @@ -810,6 +813,12 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) leftsib = xlrec->leftsib; rightsib = xlrec->rightsib; + level = xlrec->level; + isleaf = (level == 0); + safexid = xlrec->safexid; + + /* No leaftopparent for level 0 (leaf page) or level 1 target */ + Assert(xlrec->leaftopparent == InvalidBlockNumber || level > 1); /* * In normal operation, we would lock all the pages this WAL record @@ -844,9 +853,9 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) pageop->btpo_prev = leftsib; pageop->btpo_next = rightsib; - pageop->btpo.xact = xlrec->btpo_xact; - pageop->btpo_flags = BTP_DELETED; - if (!BlockNumberIsValid(xlrec->topparent)) + pageop->btpo_level = level; + BTPageSetDeleted(page, safexid); + if (isleaf) pageop->btpo_flags |= BTP_LEAF; pageop->btpo_cycleid = 0; @@ -892,6 +901,8 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) Buffer leafbuf; IndexTupleData trunctuple; + Assert(!isleaf); + leafbuf = XLogInitBufferForRedo(record, 3); page = (Page) BufferGetPage(leafbuf); @@ -901,13 +912,13 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF; pageop->btpo_prev 
= xlrec->leafleftsib;
pageop->btpo_next = xlrec->leafrightsib;
- pageop->btpo.level = 0;
+ pageop->btpo_level = 0;
pageop->btpo_cycleid = 0;
/* Add a dummy hikey item */
MemSet(&trunctuple, 0, sizeof(IndexTupleData));
trunctuple.t_info = sizeof(IndexTupleData);
- BTreeTupleSetTopParent(&trunctuple, xlrec->topparent);
+ BTreeTupleSetTopParent(&trunctuple, xlrec->leaftopparent);
if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
false, false) == InvalidOffsetNumber)
@@ -942,7 +953,7 @@ btree_xlog_newroot(XLogReaderState *record)
pageop->btpo_flags = BTP_ROOT;
pageop->btpo_prev = pageop->btpo_next = P_NONE;
- pageop->btpo.level = xlrec->level;
+ pageop->btpo_level = xlrec->level;
if (xlrec->level == 0)
pageop->btpo_flags |= BTP_LEAF;
pageop->btpo_cycleid = 0;
@@ -963,26 +974,40 @@ btree_xlog_newroot(XLogReaderState *record)
_bt_restore_meta(record, 2);
}
+/*
+ * In general VACUUM must defer recycling as a way of avoiding certain race
+ * conditions. Deleted pages contain a safexid value that VACUUM uses to
+ * determine whether it is now safe to place a page that it deleted earlier
+ * into the FSM. See nbtree/README.
+ *
+ * As far as any backend operating during original execution is concerned, the
+ * FSM is a cache of recycle-safe pages; the mere presence of the page in the
+ * FSM indicates that the page must already be safe to recycle (actually,
+ * _bt_getbuf() verifies it's safe using BTPageIsRecyclable(), but that's just
+ * because it would be unwise to completely trust the FSM, given its current
+ * limitations).
+ *
+ * This isn't sufficient to prevent similar concurrent recycling race
+ * conditions during Hot Standby, though. For that we need to log an
+ * xl_btree_reuse_page record at the point that a page is actually recycled
+ * and reused for an entirely unrelated page inside _bt_split(). These
+ * records include the same safexid value from the original deleted page,
+ * stored in the record's latestRemovedFullXid field.
+ *
+ * The GlobalVisCheckRemovableFullXid() test in BTPageIsRecyclable() is used
+ * to determine if it's safe to recycle a page. This mirrors our own test:
+ * the PGPROC->xmin > limitXmin test inside GetConflictingVirtualXIDs().
+ * Consequently, one XID value achieves the same exclusion effect on primary
+ * and standby.
+ */
static void
btree_xlog_reuse_page(XLogReaderState *record)
{
xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) XLogRecGetData(record);
- /*
- * Btree reuse_page records exist to provide a conflict point when we
- * reuse pages in the index via the FSM. That's all they do though.
- *
- * latestRemovedXid was the page's btpo.xact. The
- * GlobalVisCheckRemovableXid test in _bt_page_recyclable() conceptually
- * mirrors the pgxact->xmin > limitXmin test in
- * GetConflictingVirtualXIDs(). Consequently, one XID value achieves the
- * same exclusion effect on primary and standby.
- */
if (InHotStandby)
- {
- ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid,
- xlrec->node);
- }
+ ResolveRecoveryConflictWithSnapshotFullXid(xlrec->latestRemovedFullXid,
+ xlrec->node);
}
void
diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c
index 6e0d6a2b729..f7cc4dd3e6d 100644
--- a/src/backend/access/rmgrdesc/nbtdesc.c
+++ b/src/backend/access/rmgrdesc/nbtdesc.c
@@ -80,12 +80,13 @@ btree_desc(StringInfo buf, XLogReaderState *record)
{
xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) rec;
- appendStringInfo(buf, "left %u; right %u; btpo_xact %u; ",
- xlrec->leftsib, xlrec->rightsib,
- xlrec->btpo_xact);
- appendStringInfo(buf, "leafleft %u; leafright %u; topparent %u",
+ appendStringInfo(buf, "left %u; right %u; level %u; safexid %u:%u; ",
+ xlrec->leftsib, xlrec->rightsib, xlrec->level,
+ EpochFromFullTransactionId(xlrec->safexid),
+ XidFromFullTransactionId(xlrec->safexid));
+ appendStringInfo(buf, "leafleft %u; leafright %u; leaftopparent %u",
xlrec->leafleftsib, xlrec->leafrightsib,
- xlrec->topparent);
+ xlrec->leaftopparent);
break;
}
case XLOG_BTREE_NEWROOT:
@@ -99,9 +100,11 @@ btree_desc(StringInfo buf, XLogReaderState *record)
{
xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) rec;
- appendStringInfo(buf, "rel %u/%u/%u; latestRemovedXid %u",
+ appendStringInfo(buf, "rel %u/%u/%u; latestRemovedXid %u:%u",
xlrec->node.spcNode, xlrec->node.dbNode,
- xlrec->node.relNode, xlrec->latestRemovedXid);
+ xlrec->node.relNode,
+ EpochFromFullTransactionId(xlrec->latestRemovedFullXid),
+ XidFromFullTransactionId(xlrec->latestRemovedFullXid));
break;
}
case XLOG_BTREE_META_CLEANUP:
@@ -110,8 +113,8 @@ btree_desc(StringInfo buf, XLogReaderState *record)
xlrec = (xl_btree_metadata *) XLogRecGetBlockData(record, 0, NULL);
- appendStringInfo(buf, "oldest_btpo_xact %u; last_cleanup_num_heap_tuples: %f",
- xlrec->oldest_btpo_xact,
+ appendStringInfo(buf, "last_cleanup_num_delpages %u; last_cleanup_num_heap_tuples: %f",
+ xlrec->last_cleanup_num_delpages,
xlrec->last_cleanup_num_heap_tuples);
break;
}
diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c
index a3ee652030c..17de5a6d0ed 100644
--- a/src/backend/storage/ipc/standby.c
+++ b/src/backend/storage/ipc/standby.c
@@ -452,6 +452,34 @@ ResolveRecoveryConflictWithSnapshot(TransactionId latestRemovedXid, RelFileNode
true);
}
+/*
+ * Variant of ResolveRecoveryConflictWithSnapshot that works with
+ * FullTransactionId values
+ */
+void
+ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId latestRemovedFullXid,
+ RelFileNode node)
+{
+ /*
+ * ResolveRecoveryConflictWithSnapshot operates on 32-bit TransactionIds,
+ * so truncate the logged FullTransactionId. If the logged value is so
+ * old that XID wrap-around has already passed it, there cannot be any
+ * snapshots that still see it.
+ */ + FullTransactionId nextXid = ReadNextFullTransactionId(); + uint64 diff; + + diff = U64FromFullTransactionId(nextXid) - + U64FromFullTransactionId(latestRemovedFullXid); + if (diff < MaxTransactionId / 2) + { + TransactionId latestRemovedXid; + + latestRemovedXid = XidFromFullTransactionId(latestRemovedFullXid); + ResolveRecoveryConflictWithSnapshot(latestRemovedXid, node); + } +} + void ResolveRecoveryConflictWithTablespace(Oid tsid) { diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index cad4f2bdeb9..9ac90d74398 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -37,8 +37,9 @@ typedef uint16 BTCycleId; * * In addition, we store the page's btree level (counting upwards from * zero at a leaf page) as well as some flag bits indicating the page type - * and status. If the page is deleted, we replace the level with the - * next-transaction-ID value indicating when it is safe to reclaim the page. + * and status. If the page is deleted, a BTDeletedPageData struct is stored + * in the page's tuple area, while a standard BTPageOpaqueData struct is + * stored in the page special area. * * We also store a "vacuum cycle ID". When a page is split while VACUUM is * processing the index, a nonzero value associated with the VACUUM run is @@ -52,17 +53,17 @@ typedef uint16 BTCycleId; * * NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested * instead. + * + * NOTE: the btpo_level field used to be a union type in order to allow + * deleted pages to store a 32-bit safexid in the same field. We now store + * 64-bit/full safexid values using BTDeletedPageData instead. */ typedef struct BTPageOpaqueData { BlockNumber btpo_prev; /* left sibling, or P_NONE if leftmost */ BlockNumber btpo_next; /* right sibling, or P_NONE if rightmost */ - union - { - uint32 level; /* tree level --- zero for leaf pages */ - TransactionId xact; /* next transaction ID, if deleted */ - } btpo; + uint32 btpo_level; /* tree level --- zero for leaf pages */ uint16 btpo_flags; /* flag bits, see below */ BTCycleId btpo_cycleid; /* vacuum cycle ID of latest split */ } BTPageOpaqueData; @@ -78,6 +79,7 @@ typedef BTPageOpaqueData *BTPageOpaque; #define BTP_SPLIT_END (1 << 5) /* rightmost page of split group */ #define BTP_HAS_GARBAGE (1 << 6) /* page has LP_DEAD tuples (deprecated) */ #define BTP_INCOMPLETE_SPLIT (1 << 7) /* right sibling's downlink is missing */ +#define BTP_HAS_FULLXID (1 << 8) /* contains BTDeletedPageData */ /* * The max allowed value of a cycle ID is a bit less than 64K. This is @@ -105,10 +107,12 @@ typedef struct BTMetaPageData BlockNumber btm_fastroot; /* current "fast" root location */ uint32 btm_fastlevel; /* tree level of the "fast" root page */ /* remaining fields only valid when btm_version >= BTREE_NOVAC_VERSION */ - TransactionId btm_oldest_btpo_xact; /* oldest btpo_xact among all deleted - * pages */ - float8 btm_last_cleanup_num_heap_tuples; /* number of heap tuples - * during last cleanup */ + + /* number of deleted, non-recyclable pages during last cleanup */ + uint32 btm_last_cleanup_num_delpages; + /* number of heap tuples during last cleanup */ + float8 btm_last_cleanup_num_heap_tuples; + bool btm_allequalimage; /* are all columns "equalimage"? 
*/ } BTMetaPageData;
@@ -220,6 +224,93 @@ typedef struct BTMetaPageData
#define P_IGNORE(opaque) (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0)
#define P_HAS_GARBAGE(opaque) (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)
#define P_INCOMPLETE_SPLIT(opaque) (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)
+#define P_HAS_FULLXID(opaque) (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)
+
+/*
+ * BTDeletedPageData is the page contents of a deleted page
+ */
+typedef struct BTDeletedPageData
+{
+ FullTransactionId safexid; /* See BTPageIsRecyclable() */
+} BTDeletedPageData;
+
+static inline void
+BTPageSetDeleted(Page page, FullTransactionId safexid)
+{
+ BTPageOpaque opaque;
+ PageHeader header;
+ BTDeletedPageData *contents;
+
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ header = ((PageHeader) page);
+
+ opaque->btpo_flags &= ~BTP_HALF_DEAD;
+ opaque->btpo_flags |= BTP_DELETED | BTP_HAS_FULLXID;
+ header->pd_lower = MAXALIGN(SizeOfPageHeaderData) +
+ sizeof(BTDeletedPageData);
+ header->pd_upper = header->pd_special;
+
+ /* Set safexid in deleted page */
+ contents = ((BTDeletedPageData *) PageGetContents(page));
+ contents->safexid = safexid;
+}
+
+static inline FullTransactionId
+BTPageGetDeleteXid(Page page)
+{
+ BTPageOpaque opaque;
+ BTDeletedPageData *contents;
+
+ /* We only expect to be called with a deleted page */
+ Assert(!PageIsNew(page));
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ Assert(P_ISDELETED(opaque));
+
+ /* pg_upgrade'd deleted page -- must be safe to recycle now */
+ if (!P_HAS_FULLXID(opaque))
+ return FirstNormalFullTransactionId;
+
+ /* Get safexid from deleted page */
+ contents = ((BTDeletedPageData *) PageGetContents(page));
+ return contents->safexid;
+}
+
+/*
+ * Is an existing page recyclable?
+ *
+ * This exists to centralize the policy on which deleted pages are now safe to
+ * re-use.
+ *
+ * Note: PageIsNew() pages are always safe to recycle, but we can't deal with
+ * them here (callers must handle that case themselves). Callers might well
+ * need special handling for new pages anyway.
+ */
+static inline bool
+BTPageIsRecyclable(Page page)
+{
+ BTPageOpaque opaque;
+
+ Assert(!PageIsNew(page));
+
+ /* Recycling okay iff page is deleted and safexid is old enough */
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (P_ISDELETED(opaque))
+ {
+ /*
+ * The page was deleted, but when? If it was just deleted, a scan
+ * might have seen the downlink to it, and will read the page later.
+ * As long as that can happen, we must keep the deleted page around as
+ * a tombstone.
+ *
+ * To decide, check whether the deletion XID could still be visible to
+ * anyone. If not, then no scan that's still in progress could have
+ * seen its downlink, and we can recycle it.
+ */
+ return GlobalVisCheckRemovableFullXid(NULL, BTPageGetDeleteXid(page));
+ }
+
+ return false;
+}
/*
* Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
@@ -962,7 +1053,7 @@ typedef struct BTOptions
{
int32 varlena_header_; /* varlena header (do not touch directly!) */
int fillfactor; /* page fill factor in percent (0..100) */
- /* fraction of newly inserted tuples prior to trigger index cleanup */
+ /* fraction of newly inserted tuples needed to trigger index cleanup */
float8 vacuum_cleanup_index_scale_factor;
bool deduplicate_items; /* Try to deduplicate items?
*/ } BTOptions;
@@ -1066,8 +1157,8 @@ extern OffsetNumber _bt_findsplitloc(Relation rel, Page origpage,
*/
extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
bool allequalimage);
-extern void _bt_update_meta_cleanup_info(Relation rel,
- TransactionId oldestBtpoXact, float8 numHeapTuples);
+extern void _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages,
+ float8 num_heap_tuples);
extern void _bt_upgrademetapage(Page page);
extern Buffer _bt_getroot(Relation rel, int access);
extern Buffer _bt_gettrueroot(Relation rel);
@@ -1084,15 +1175,13 @@ extern void _bt_unlockbuf(Relation rel, Buffer buf);
extern bool _bt_conditionallockbuf(Relation rel, Buffer buf);
extern void _bt_upgradelockbufcleanup(Relation rel, Buffer buf);
extern void _bt_pageinit(Page page, Size size);
-extern bool _bt_page_recyclable(Page page);
extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
OffsetNumber *deletable, int ndeletable,
BTVacuumPosting *updatable, int nupdatable);
extern void _bt_delitems_delete_check(Relation rel, Buffer buf,
Relation heapRel,
TM_IndexDeleteOp *delstate);
-extern uint32 _bt_pagedel(Relation rel, Buffer leafbuf,
- TransactionId *oldestBtpoXact);
+extern uint32 _bt_pagedel(Relation rel, Buffer leafbuf);
/*
* prototypes for functions in nbtsearch.c
diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h
index 7ae5c98c2b8..3df34fcda2d 100644
--- a/src/include/access/nbtxlog.h
+++ b/src/include/access/nbtxlog.h
@@ -13,6 +13,7 @@
#ifndef NBTXLOG_H
#define NBTXLOG_H
+#include "access/transam.h"
#include "access/xlogreader.h"
#include "lib/stringinfo.h"
#include "storage/off.h"
@@ -52,7 +53,7 @@ typedef struct xl_btree_metadata
uint32 level;
BlockNumber fastroot;
uint32 fastlevel;
- TransactionId oldest_btpo_xact;
+ uint32 last_cleanup_num_delpages;
float8 last_cleanup_num_heap_tuples;
bool allequalimage;
} xl_btree_metadata;
@@ -187,7 +188,7 @@ typedef struct xl_btree_reuse_page
{
RelFileNode node;
BlockNumber block;
- TransactionId latestRemovedXid;
+ FullTransactionId latestRemovedFullXid;
} xl_btree_reuse_page;
#define SizeOfBtreeReusePage (sizeof(xl_btree_reuse_page))
@@ -282,9 +283,12 @@ typedef struct xl_btree_mark_page_halfdead
#define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, topparent) + sizeof(BlockNumber))
/*
- * This is what we need to know about deletion of a btree page. Note we do
- * not store any content for the deleted page --- it is just rewritten as empty
- * during recovery, apart from resetting the btpo.xact.
+ * This is what we need to know about deletion of a btree page. Note that we
+ * only leave behind a small amount of bookkeeping information in deleted
+ * pages (deleted pages must be kept around as tombstones for a while). It is
+ * convenient for the REDO routine to regenerate its target page from scratch.
+ * This is why the WAL record describes certain details that are actually
+ * directly available from the target page.
*
* Backup Blk 0: target block being deleted
* Backup Blk 1: target block's left sibling, if any
@@ -296,20 +300,24 @@ typedef struct xl_btree_unlink_page
{
BlockNumber leftsib; /* target block's left sibling, if any */
BlockNumber rightsib; /* target block's right sibling */
+ uint32 level; /* target block's level */
+ FullTransactionId safexid; /* target block's BTPageSetDeleted() XID */
/*
- * Information needed to recreate the leaf page, when target is an
- * internal page.
+ * Information needed to recreate a half-dead leaf page with the correct
+ * topparent link. The fields are only used when the deletion operation's
+ * target page is an internal page. The REDO routine creates the half-dead
+ * page from scratch to keep things simple (this is the same convenient
+ * approach used for the target page itself).
*/
BlockNumber leafleftsib;
BlockNumber leafrightsib;
- BlockNumber topparent; /* next child down in the subtree */
+ BlockNumber leaftopparent; /* next child down in the subtree */
- TransactionId btpo_xact; /* value of btpo.xact for use in recovery */
/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */
} xl_btree_unlink_page;
-#define SizeOfBtreeUnlinkPage (offsetof(xl_btree_unlink_page, btpo_xact) + sizeof(TransactionId))
+#define SizeOfBtreeUnlinkPage (offsetof(xl_btree_unlink_page, leaftopparent) + sizeof(BlockNumber))
/*
* New root log record. There are zero tuples if this is to establish an
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index 224cae0246f..8d09eaec93d 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -31,7 +31,7 @@
/*
* Each page of XLOG file has a header like this:
*/
-#define XLOG_PAGE_MAGIC 0xD109 /* can be used as WAL version indicator */
+#define XLOG_PAGE_MAGIC 0xD10A /* can be used as WAL version indicator */
typedef struct XLogPageHeaderData
{
diff --git a/src/include/storage/standby.h b/src/include/storage/standby.h
index 94d33851d09..38fd85a4316 100644
--- a/src/include/storage/standby.h
+++ b/src/include/storage/standby.h
@@ -31,6 +31,8 @@ extern void ShutdownRecoveryTransactionEnvironment(void);
extern void ResolveRecoveryConflictWithSnapshot(TransactionId latestRemovedXid,
RelFileNode node);
+extern void ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId latestRemovedFullXid,
+ RelFileNode node);
extern void ResolveRecoveryConflictWithTablespace(Oid tsid);
extern void ResolveRecoveryConflictWithDatabase(Oid dbid);
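
The wraparound guard in ResolveRecoveryConflictWithSnapshotFullXid() above is easier to see with concrete numbers. What follows is a minimal standalone model, not PostgreSQL code: FullTransactionId is represented as a bare uint64 whose low 32 bits are the on-disk XID, and the names used are local to the example.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define MAX_XID32 ((uint32_t) 0xFFFFFFFF)   /* models MaxTransactionId */

    /*
     * Models the guard: only values within half the 32-bit XID space of
     * nextXid still have a meaningful truncated 32-bit form.
     */
    static bool
    needs_conflict_check(uint64_t next_full_xid, uint64_t removed_full_xid)
    {
        uint64_t diff = next_full_xid - removed_full_xid;

        return diff < MAX_XID32 / 2;
    }

    int
    main(void)
    {
        uint64_t next = ((uint64_t) 5 << 32) | 1000;    /* epoch 5, xid 1000 */

        /* Recent removal: must resolve conflicts with the truncated XID */
        printf("%d\n", needs_conflict_check(next, ((uint64_t) 5 << 32) | 900));

        /*
         * Removal from two epochs ago: wraparound has already passed it, so
         * no snapshot can still see it and the check is safely skipped.
         */
        printf("%d\n", needs_conflict_check(next, ((uint64_t) 3 << 32) | 900));
        return 0;
    }

Half the 32-bit XID space is the natural cutoff because that is the horizon within which 32-bit XID comparisons remain unambiguous; anything older could be confused with a future XID after truncation.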
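
BTPageSetDeleted() overwrites a deleted page so that its tuple area holds exactly one BTDeletedPageData, which is what lets BTPageGetDeleteXid() read the safexid back via PageGetContents(). A standalone model of the pd_lower/pd_upper arithmetic; the 8192-byte block size and 24-byte page header are illustrative assumptions, not values taken from this patch.

    #include <stdint.h>
    #include <stdio.h>

    #define BLCKSZ 8192                     /* assumed default block size */
    #define SIZE_OF_PAGE_HEADER 24          /* assumed; see bufpage.h */
    #define MAXALIGN(LEN) (((uint64_t) (LEN) + 7) & ~(uint64_t) 7)

    typedef struct
    {
        uint64_t safexid;                   /* models FullTransactionId */
    } BTDeletedPageData;

    int
    main(void)
    {
        /* special area holds BTPageOpaqueData: prev, next, level, flags, cycleid */
        uint64_t special = MAXALIGN(4 + 4 + 4 + 2 + 2);
        uint64_t pd_special = BLCKSZ - special;

        /* what BTPageSetDeleted() computes */
        uint64_t pd_lower = MAXALIGN(SIZE_OF_PAGE_HEADER) + sizeof(BTDeletedPageData);
        uint64_t pd_upper = pd_special;

        /* everything between pd_lower and pd_upper reads as free space */
        printf("pd_lower=%llu pd_upper=%llu free=%llu\n",
               (unsigned long long) pd_lower,
               (unsigned long long) pd_upper,
               (unsigned long long) (pd_upper - pd_lower));
        return 0;
    }

The design point is that a tombstone remains a normal-looking page: the special area still carries a valid BTPageOpaqueData (including the now-stable btpo_level), while the single struct between the page header and pd_lower carries the 64-bit safexid payload.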
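
BTPageIsRecyclable() centralizes the recycle-safety policy, so a caller only needs the PageIsNew() pre-check plus one call before handing a block back to the FSM. Below is a sketch of such a caller; ReadBufferExtended(), BufferGetPage(), RecordFreeIndexPage(), and ReleaseBuffer() are existing PostgreSQL APIs, but the loop itself is illustrative and omits the locking and buffer-access strategy a real VACUUM pass uses.

    #include "postgres.h"

    #include "access/nbtree.h"
    #include "storage/bufmgr.h"
    #include "storage/indexfsm.h"

    /* Sketch only: return fully recycle-safe deleted pages to the FSM */
    static void
    sketch_recycle_scan(Relation rel, BlockNumber nblocks)
    {
        for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
        {
            Buffer      buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
                                                 RBM_NORMAL, NULL);
            Page        page = BufferGetPage(buf);

            /*
             * BTPageIsRecyclable() must never see a PageIsNew() page; its
             * header comment makes that the caller's responsibility.
             */
            if (!PageIsNew(page) && BTPageIsRecyclable(page))
                RecordFreeIndexPage(rel, blkno);    /* safe to reuse now */

            ReleaseBuffer(buf);
        }
    }

Note how pg_upgrade'd tombstones fall out of this automatically: BTPageGetDeleteXid() returns FirstNormalFullTransactionId for them, which GlobalVisCheckRemovableFullXid() always treats as old enough.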