diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index 46d49bf0251..bfe33b6b431 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -413,7 +413,30 @@ to be "visible to everyone". As collateral damage, we wait for snapshots
 taken until the next transaction to allocate an XID commits. We also wait
 for running XIDs with no snapshots.
 
-The need for this additional indirection after a page deletion operation
+Prior to PostgreSQL 14, VACUUM would only place _old_ deleted pages that
+it encounters during its linear scan (pages deleted by a previous VACUUM
+operation) in the FSM. Newly deleted pages were never placed in the FSM,
+because that was assumed to _always_ be unsafe. That assumption was
+unnecessarily pessimistic in practice, though -- it often doesn't take
+very long for newly deleted pages to become safe to place in the FSM.
+
+There is no truly principled way to predict when deleted pages will become
+safe to place in the FSM for recycling -- it might become safe almost
+immediately (long before the current VACUUM completes), or it might not
+even be safe by the time the next VACUUM takes place. Recycle safety is
+purely a question of maintaining the consistency (or at least the apparent
+consistency) of a physical data structure. The state within the backend
+running VACUUM is simply not relevant.
+
+PostgreSQL 14 added the ability for VACUUM to consider if it's possible to
+recycle newly deleted pages at the end of the full index scan where the
+page deletion took place. It is convenient to check if it's safe at that
+point. This does require that VACUUM keep around a little bookkeeping
+information about newly deleted pages, but that's very cheap. Using
+in-memory state for this avoids the need to revisit newly deleted pages a
+second time later on -- we can just use safexid values from the local
+bookkeeping state to determine recycle safety in a deferred fashion.
+
+The need for additional FSM indirection after a page deletion operation
 takes place is a natural consequence of the highly permissive rules for
 index scans with Lehman and Yao's design. In general an index scan doesn't
 have to hold a lock or even a pin on any page when it descends the
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index fc744cf9fdb..530d924bff8 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -32,7 +32,9 @@
 #include "storage/indexfsm.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
+#include "storage/procarray.h"
 #include "utils/memdebug.h"
+#include "utils/memutils.h"
 #include "utils/snapmgr.h"
 
 static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
@@ -57,6 +59,8 @@ static bool _bt_lock_subtree_parent(Relation rel, BlockNumber child,
                                     OffsetNumber *poffset,
                                     BlockNumber *topparent,
                                     BlockNumber *topparentrightsib);
+static void _bt_pendingfsm_add(BTVacState *vstate, BlockNumber target,
+                               FullTransactionId safexid);
 
 /*
  * _bt_initmetapage() -- Fill a page buffer with a correct metapage image
@@ -209,13 +213,9 @@ _bt_vacuum_needs_cleanup(Relation rel)
      * Trigger cleanup in rare cases where prev_num_delpages exceeds 5% of the
      * total size of the index. We can reasonably expect (though are not
      * guaranteed) to be able to recycle this many pages if we decide to do a
-     * btvacuumscan call during the ongoing btvacuumcleanup.
-     *
-     * Our approach won't reliably avoid "wasted" cleanup-only btvacuumscan
-     * calls. That is, we can end up scanning the entire index without ever
-     * placing even 1 of the prev_num_delpages pages in the free space map, at
-     * least in certain narrow cases (see nbtree/README section on recycling
-     * deleted pages for details). This rarely comes up in practice.
+     * btvacuumscan call during the ongoing btvacuumcleanup. For further
+     * details see the nbtree/README section on placing deleted pages in the
+     * FSM.
      */
     if (prev_num_delpages > 0 &&
         prev_num_delpages > RelationGetNumberOfBlocks(rel) / 20)
@@ -2729,6 +2729,14 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
     if (target <= scanblkno)
         stats->pages_deleted++;
 
+    /*
+     * Remember information about the target page (now a newly deleted page)
+     * in dedicated vstate space for later. The page will be considered as a
+     * candidate to place in the FSM at the end of the current btvacuumscan()
+     * call.
+     */
+    _bt_pendingfsm_add(vstate, target, safexid);
+
     return true;
 }
 
@@ -2873,3 +2881,177 @@ _bt_lock_subtree_parent(Relation rel, BlockNumber child, BTStack stack,
                                    subtreeparent, poffset,
                                    topparent, topparentrightsib);
 }
+
+/*
+ * Initialize local memory state used by VACUUM for _bt_pendingfsm_finalize
+ * optimization.
+ *
+ * Called at the start of a btvacuumscan(). Caller's cleanuponly argument
+ * indicates if ongoing VACUUM has not (and will not) call btbulkdelete().
+ *
+ * We expect to allocate memory inside VACUUM's top-level memory context here.
+ * The working buffer is subject to a limit based on work_mem. Our strategy
+ * when the array can no longer grow within the bounds of that limit is to
+ * stop saving additional newly deleted pages, while proceeding as usual with
+ * the pages that we can fit.
+ */
+void
+_bt_pendingfsm_init(Relation rel, BTVacState *vstate, bool cleanuponly)
+{
+    int64       maxbufsize;
+
+    /*
+     * Don't bother with optimization in cleanup-only case -- we don't expect
+     * any newly deleted pages. Besides, cleanup-only calls to btvacuumscan()
+     * can only take place because this optimization didn't work out during
+     * the last VACUUM.
+     */
+    if (cleanuponly)
+        return;
+
+    /*
+     * Cap maximum size of array so that we always respect work_mem. Avoid
+     * int overflow here.
+     */
+    vstate->bufsize = 256;
+    maxbufsize = (work_mem * 1024L) / sizeof(BTPendingFSM);
+    maxbufsize = Min(maxbufsize, INT_MAX);
+    maxbufsize = Min(maxbufsize, MaxAllocSize / sizeof(BTPendingFSM));
+    /* Stay sane with small work_mem */
+    maxbufsize = Max(maxbufsize, vstate->bufsize);
+    vstate->maxbufsize = maxbufsize;
+
+    /* Allocate buffer, indicate that there are currently 0 pending pages */
+    vstate->pendingpages = palloc(sizeof(BTPendingFSM) * vstate->bufsize);
+    vstate->npendingpages = 0;
+}
+
+/*
+ * Place any newly deleted pages (i.e. pages that _bt_pagedel() deleted during
+ * the ongoing VACUUM operation) into the free space map -- though only when
+ * it is actually safe to do so by now.
+ *
+ * Called at the end of a btvacuumscan(), just before free space map vacuuming
+ * takes place.
+ *
+ * Frees memory allocated by _bt_pendingfsm_init(), if any.
+ */
+void
+_bt_pendingfsm_finalize(Relation rel, BTVacState *vstate)
+{
+    IndexBulkDeleteResult *stats = vstate->stats;
+
+    Assert(stats->pages_newly_deleted >= vstate->npendingpages);
+
+    if (vstate->npendingpages == 0)
+    {
+        /* Just free memory when nothing to do */
+        if (vstate->pendingpages)
+            pfree(vstate->pendingpages);
+
+        return;
+    }
+
+#ifdef DEBUG_BTREE_PENDING_FSM
+
+    /*
+     * Debugging aid: Sleep for 5 seconds to greatly increase the chances of
+     * placing pending pages in the FSM. Note that the optimization will
+     * never be effective without some other backend concurrently consuming
+     * an XID.
+     */
+    pg_usleep(5000000L);
+#endif
+
+    /*
+     * Recompute VACUUM XID boundaries.
+     *
+     * We don't actually care about the oldest non-removable XID. Computing
+     * the oldest such XID has a useful side-effect that we rely on: it
+     * forcibly updates the XID horizon state for this backend. This step is
+     * essential; GlobalVisCheckRemovableFullXid() will not reliably recognize
+     * that it is now safe to recycle newly deleted pages without this step.
+     */
+    GetOldestNonRemovableTransactionId(NULL);
+
+    for (int i = 0; i < vstate->npendingpages; i++)
+    {
+        BlockNumber target = vstate->pendingpages[i].target;
+        FullTransactionId safexid = vstate->pendingpages[i].safexid;
+
+        /*
+         * Do the equivalent of checking BTPageIsRecyclable(), but without
+         * accessing the page again a second time.
+         *
+         * Give up on finding the first non-recyclable page -- all later
+         * pages must be non-recyclable too, since _bt_pendingfsm_add() adds
+         * pages to the array in safexid order.
+         */
+        if (!GlobalVisCheckRemovableFullXid(NULL, safexid))
+            break;
+
+        RecordFreeIndexPage(rel, target);
+        stats->pages_free++;
+    }
+
+    pfree(vstate->pendingpages);
+}
+
+/*
+ * Maintain array of pages that were deleted during current btvacuumscan()
+ * call, for use in _bt_pendingfsm_finalize()
+ */
+static void
+_bt_pendingfsm_add(BTVacState *vstate,
+                   BlockNumber target,
+                   FullTransactionId safexid)
+{
+    Assert(vstate->npendingpages <= vstate->bufsize);
+    Assert(vstate->bufsize <= vstate->maxbufsize);
+
+#ifdef USE_ASSERT_CHECKING
+
+    /*
+     * Verify an assumption made by _bt_pendingfsm_finalize(): pages from the
+     * array will always be in safexid order (since that is the order that we
+     * save them in here)
+     */
+    if (vstate->npendingpages > 0)
+    {
+        FullTransactionId lastsafexid =
+        vstate->pendingpages[vstate->npendingpages - 1].safexid;
+
+        Assert(FullTransactionIdFollowsOrEquals(safexid, lastsafexid));
+    }
+#endif
+
+    /*
+     * If temp buffer reaches maxbufsize/work_mem capacity then we discard
+     * information about this page.
+     *
+     * Note that this also covers the case where we opted to not use the
+     * optimization in _bt_pendingfsm_init().
+     */
+    if (vstate->npendingpages == vstate->maxbufsize)
+        return;
+
+    /* Consider enlarging buffer */
+    if (vstate->npendingpages == vstate->bufsize)
+    {
+        int         newbufsize = vstate->bufsize * 2;
+
+        /* Respect work_mem */
+        if (newbufsize > vstate->maxbufsize)
+            newbufsize = vstate->maxbufsize;
+
+        vstate->bufsize = newbufsize;
+        vstate->pendingpages =
+            repalloc(vstate->pendingpages,
+                     sizeof(BTPendingFSM) * vstate->bufsize);
+    }
+
+    /* Save metadata for newly deleted page */
+    vstate->pendingpages[vstate->npendingpages].target = target;
+    vstate->pendingpages[vstate->npendingpages].safexid = safexid;
+    vstate->npendingpages++;
+}
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index c02c4e7710d..9282c9ea22f 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -859,9 +859,13 @@ btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
      * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
      *
      * num_delpages is the number of deleted pages now in the index that were
-     * not safe to place in the FSM to be recycled just yet. We expect that
-     * it will almost certainly be possible to place all of these pages in the
-     * FSM during the next VACUUM operation.
+     * not safe to place in the FSM to be recycled just yet. num_delpages is
+     * greater than 0 only when _bt_pagedel() actually deleted pages during
+     * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
+     * have failed to place any newly deleted pages in the FSM just moments
+     * ago. (Actually, there are edge cases where recycling of the current
+     * VACUUM's newly deleted pages does not even become safe by the time the
+     * next VACUUM comes around. See nbtree/README.)
      */
     Assert(stats->pages_deleted >= stats->pages_free);
     num_delpages = stats->pages_deleted - stats->pages_free;
@@ -937,6 +941,14 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
                                          "_bt_pagedel",
                                          ALLOCSET_DEFAULT_SIZES);
 
+    /* Initialize vstate fields used by _bt_pendingfsm_finalize */
+    vstate.bufsize = 0;
+    vstate.maxbufsize = 0;
+    vstate.pendingpages = NULL;
+    vstate.npendingpages = 0;
+    /* Consider applying _bt_pendingfsm_finalize optimization */
+    _bt_pendingfsm_init(rel, &vstate, (callback == NULL));
+
     /*
      * The outer loop iterates over all index pages except the metapage, in
      * physical order (we hope the kernel will cooperate in providing
@@ -995,17 +1007,15 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
     MemoryContextDelete(vstate.pagedelcontext);
 
     /*
-     * If we found any recyclable pages (and recorded them in the FSM), then
-     * forcibly update the upper-level FSM pages to ensure that searchers can
-     * find them. It's possible that the pages were also found during
-     * previous scans and so this is a waste of time, but it's cheap enough
-     * relative to scanning the index that it shouldn't matter much, and
-     * making sure that free pages are available sooner not later seems
-     * worthwhile.
+     * If there were any calls to _bt_pagedel() during scan of the index then
+     * see if any of the resulting pages can be placed in the FSM now. When
+     * it's not safe we'll have to leave it up to a future VACUUM operation.
      *
-     * Note that if no recyclable pages exist, we don't bother vacuuming the
-     * FSM at all.
+     * Finally, if we placed any pages in the FSM (either just now or during
+     * the scan), forcibly update the upper-level FSM pages to ensure that
+     * searchers can find them.
      */
+    _bt_pendingfsm_finalize(rel, &vstate);
     if (stats->pages_free > 0)
         IndexFreeSpaceMapVacuum(rel);
 }
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 88eccfcb732..a645c42e685 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -279,7 +279,8 @@ BTPageGetDeleteXid(Page page)
  * Is an existing page recyclable?
  *
  * This exists to centralize the policy on which deleted pages are now safe to
- * re-use.
+ * re-use. However, _bt_pendingfsm_finalize() duplicates some of the same
+ * logic because it doesn't work directly with pages -- keep the two in sync.
  *
  * Note: PageIsNew() pages are always safe to recycle, but we can't deal with
  * them here (caller is responsible for that case themselves). Caller might
@@ -305,6 +306,10 @@ BTPageIsRecyclable(Page page)
          * For that check if the deletion XID could still be visible to
          * anyone. If not, then no scan that's still in progress could have
          * seen its downlink, and we can recycle it.
+         *
+         * XXX: If we had the heap relation we could be more aggressive about
+         * recycling deleted pages in non-catalog relations. For now we just
+         * pass NULL. That is at least simple and consistent.
          */
         return GlobalVisCheckRemovableFullXid(NULL, BTPageGetDeleteXid(page));
     }
@@ -313,9 +318,15 @@ BTPageIsRecyclable(Page page)
 }
 
 /*
- * BTVacState is private nbtree.c state used during VACUUM. It is exported
- * for use by page deletion related code in nbtpage.c.
+ * BTVacState and BTPendingFSM are private nbtree.c state used during VACUUM.
+ * They are exported for use by page deletion related code in nbtpage.c.
  */
+typedef struct BTPendingFSM
+{
+    BlockNumber target;         /* Page deleted by current VACUUM */
+    FullTransactionId safexid;  /* Page's BTDeletedPageData.safexid */
+} BTPendingFSM;
+
 typedef struct BTVacState
 {
     IndexVacuumInfo *info;
@@ -324,6 +335,14 @@ typedef struct BTVacState
     void       *callback_state;
     BTCycleId   cycleid;
     MemoryContext pagedelcontext;
+
+    /*
+     * _bt_pendingfsm_finalize() state
+     */
+    int         bufsize;        /* pendingpages space (in # elements) */
+    int         maxbufsize;     /* max bufsize that respects work_mem */
+    BTPendingFSM *pendingpages; /* One entry per newly deleted page */
+    int         npendingpages;  /* current # valid pendingpages */
 } BTVacState;
 
 /*
@@ -1195,6 +1214,9 @@ extern void _bt_delitems_delete_check(Relation rel, Buffer buf,
                                       Relation heapRel,
                                       TM_IndexDeleteOp *delstate);
 extern void _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate);
+extern void _bt_pendingfsm_init(Relation rel, BTVacState *vstate,
+                                bool cleanuponly);
+extern void _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate);
 
 /*
  * prototypes for functions in nbtsearch.c
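
For readers who want to see the bookkeeping strategy above in isolation, the
following standalone sketch models the pending-pages array maintained by
_bt_pendingfsm_init()/_bt_pendingfsm_add() and the deferred scan performed by
_bt_pendingfsm_finalize(): a small array that doubles on demand, is capped by
a work_mem-style budget, remembers entries in safexid order, and is walked
until the first entry that is not yet safe. This is illustrative only -- the
PendingPage/pending_* names are invented here, the visibility test is reduced
to a plain XID comparison, and none of this is PostgreSQL code.

/* Standalone model of the pending-pages bookkeeping; not PostgreSQL code. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>

typedef struct PendingPage
{
    uint32_t    target;         /* block number of a newly deleted page */
    uint64_t    safexid;        /* XID that must be "old" before recycling */
} PendingPage;

typedef struct PendingState
{
    PendingPage *pages;
    int         npages;         /* current number of remembered pages */
    int         bufsize;        /* currently allocated slots */
    int         maxbufsize;     /* cap derived from a memory budget */
} PendingState;

static void
pending_init(PendingState *st, long work_mem_kb)
{
    int64_t     maxbufsize = (work_mem_kb * 1024L) / (int64_t) sizeof(PendingPage);

    /* Clamp the cap, and never let it fall below the initial allocation */
    if (maxbufsize > INT_MAX)
        maxbufsize = INT_MAX;
    st->bufsize = 256;
    if (maxbufsize < st->bufsize)
        maxbufsize = st->bufsize;
    st->maxbufsize = (int) maxbufsize;
    st->pages = malloc(sizeof(PendingPage) * st->bufsize);  /* error handling omitted */
    st->npages = 0;
}

static void
pending_add(PendingState *st, uint32_t target, uint64_t safexid)
{
    /* Once the cap is hit, silently forget about additional pages */
    if (st->npages == st->maxbufsize)
        return;

    /* Grow geometrically, but never beyond the cap */
    if (st->npages == st->bufsize)
    {
        int         newbufsize = st->bufsize * 2;

        if (newbufsize > st->maxbufsize)
            newbufsize = st->maxbufsize;
        st->bufsize = newbufsize;
        st->pages = realloc(st->pages, sizeof(PendingPage) * st->bufsize);
    }

    st->pages[st->npages].target = target;
    st->pages[st->npages].safexid = safexid;
    st->npages++;
}

/*
 * Deferred safety check: walk the array in order and stop at the first entry
 * whose safexid is not yet older than the cutoff, since entries were added in
 * safexid order and later ones cannot be safe either.
 */
static int
pending_recycle(PendingState *st, uint64_t oldest_visible_xid)
{
    int         nrecycled = 0;

    for (int i = 0; i < st->npages; i++)
    {
        if (st->pages[i].safexid >= oldest_visible_xid)
            break;              /* not yet safe; stop scanning */
        nrecycled++;            /* real code would record the page in the FSM */
    }
    return nrecycled;
}

int
main(void)
{
    PendingState st;

    pending_init(&st, 64);      /* pretend the memory budget is 64kB */
    for (uint32_t blk = 0; blk < 10000; blk++)
        pending_add(&st, blk, 1000 + blk);  /* safexid values are ascending */

    printf("remembered %d of 10000 deleted pages (cap %d), %d recyclable now\n",
           st.npages, st.maxbufsize, pending_recycle(&st, 3500));
    free(st.pages);
    return 0;
}

With a 16-byte entry and a 64kB budget the cap works out to 4096 entries, so
the sketch remembers the first 4096 deleted pages and forgets the rest --
mirroring how the patch degrades gracefully when work_mem is small rather than
allocating without bound.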