Compute XID horizon for page level index vacuum on primary.
Previously the xid horizon was only computed during WAL replay. That had two
major problems:

1) It relied on knowing what the table pointed to looks like. That was easy
   enough before the introduction of tableam (we knew it had to be heap,
   although some trickery around logging the heap relfilenodes was required).
   But to properly handle table AMs we need per-database catalog access to
   look up the AM handler, which recovery doesn't allow.

2) Not knowing the xid horizon also makes it hard to support logical decoding
   on standbys. For a catalog table, we need to be able to conflict with
   slots that have an xid horizon that's too old. Computing the horizon by
   visiting the heap, however, only works once consistency is reached,
   whereas we always need to be able to detect conflicts.

There's also a secondary problem, in that the current method performs
redundant work on every standby. But that's counterbalanced by potentially
computing the value when it isn't necessary (either because there's no
standby, or because there are no connected backends).

Solve 1) and 2) by moving computation of the xid horizon to the primary and
by involving tableam in the computation of the horizon.

To address the potentially increased overhead, increase the efficiency of the
xid horizon computation for heap by sorting the tids and eliminating
redundant buffer accesses. When prefetching is available, additionally
prefetch the buffers. As this is more of a maintenance task, rather than
something routinely done in every read-only query, we add an arbitrary 10 to
the effective concurrency - thereby using IO concurrency even when it is not
globally enabled. That's possibly not the perfect formula, but it seems good
enough for now.

Bumps the WAL format, as latestRemovedXid is now part of the records and the
heap's relfilenode isn't anymore.

Author: Andres Freund, Amit Khandekar, Robert Haas
Reviewed-By: Robert Haas
Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de
    https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de
    https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
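To illustrate the efficiency point above, here is a small standalone C sketch
(not PostgreSQL source; the demo_tid type and the printed "read"/"examine"
actions are purely illustrative stand-ins for buffer accesses): sorting the
to-be-deleted TIDs by heap block makes repeat references to a block adjacent,
so each block is read at most once, and a prefetcher can simply run a fixed
distance ahead of the read position.

    #include <stdio.h>
    #include <stdlib.h>
    #include <stdint.h>

    typedef struct
    {
        uint32_t    block;      /* heap block number */
        uint16_t    offset;     /* line pointer offset within the block */
    } demo_tid;

    /* Order TIDs by (block, offset) so accesses to one block are adjacent. */
    static int
    tid_cmp(const void *a, const void *b)
    {
        const demo_tid *ta = a;
        const demo_tid *tb = b;

        if (ta->block != tb->block)
            return ta->block < tb->block ? -1 : 1;
        if (ta->offset != tb->offset)
            return ta->offset < tb->offset ? -1 : 1;
        return 0;
    }

    int
    main(void)
    {
        demo_tid    tids[] = {{7, 3}, {2, 1}, {7, 1}, {2, 5}, {9, 2}};
        size_t      ntids = sizeof(tids) / sizeof(tids[0]);
        uint32_t    last_block = UINT32_MAX;

        qsort(tids, ntids, sizeof(demo_tid), tid_cmp);

        for (size_t i = 0; i < ntids; i++)
        {
            if (tids[i].block != last_block)
            {
                /*
                 * First TID on this block: read the buffer once.  The real
                 * heap code additionally prefetches buffers up to roughly
                 * effective_io_concurrency + 10 blocks ahead of this point.
                 */
                printf("read heap block %u\n", (unsigned) tids[i].block);
                last_block = tids[i].block;
            }
            printf("  examine offset %u\n", (unsigned) tids[i].offset);
        }
        return 0;
    }

The sketch only demonstrates the visit-each-block-once pattern; the actual
sort key, prefetch bookkeeping, and xid-horizon accounting live in the heap
table AM code.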
@@ -1101,10 +1101,16 @@ _bt_delitems_delete(Relation rel, Buffer buf,
 {
 	Page		page = BufferGetPage(buf);
 	BTPageOpaque opaque;
+	TransactionId latestRemovedXid = InvalidTransactionId;
 
 	/* Shouldn't be called unless there's something to do */
 	Assert(nitems > 0);
 
+	if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
+		latestRemovedXid =
+			index_compute_xid_horizon_for_tuples(rel, heapRel, buf,
+												 itemnos, nitems);
+
 	/* No ereport(ERROR) until changes are logged */
 	START_CRIT_SECTION();
 
@@ -1134,7 +1140,7 @@ _bt_delitems_delete(Relation rel, Buffer buf,
 		XLogRecPtr	recptr;
 		xl_btree_delete xlrec_delete;
 
-		xlrec_delete.hnode = heapRel->rd_node;
+		xlrec_delete.latestRemovedXid = latestRemovedXid;
 		xlrec_delete.nitems = nitems;
 
 		XLogBeginInsert();
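The hunk above, together with the "Bumps the WAL format" note in the commit
message, amounts to the following change in what the delete record carries.
The structs below are an illustrative before/after sketch with stand-in
typedefs, not the actual nbtxlog.h definitions:

    #include <stdint.h>

    /* Stand-in typedefs so the sketch is self-contained; not the real ones. */
    typedef uint32_t TransactionId;
    typedef struct { uint32_t spcNode, dbNode, relNode; } RelFileNode;

    /* Before: recovery received the heap relfilenode and computed the horizon. */
    typedef struct xl_btree_delete_before
    {
        RelFileNode hnode;          /* heap relation the index entries point into */
        int         nitems;        /* number of deleted index item offsets */
        /* deleted item offset numbers follow */
    } xl_btree_delete_before;

    /* After: the primary computes latestRemovedXid and logs it directly. */
    typedef struct xl_btree_delete_after
    {
        TransactionId latestRemovedXid; /* horizon computed via the table AM */
        int           nitems;
        /* deleted item offset numbers follow */
    } xl_btree_delete_after;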
@@ -501,159 +501,6 @@ btree_xlog_vacuum(XLogReaderState *record)
 	UnlockReleaseBuffer(buffer);
 }
 
-/*
- * Get the latestRemovedXid from the heap pages pointed at by the index
- * tuples being deleted. This puts the work for calculating latestRemovedXid
- * into the recovery path rather than the primary path.
- *
- * It's possible that this generates a fair amount of I/O, since an index
- * block may have hundreds of tuples being deleted. Repeat accesses to the
- * same heap blocks are common, though are not yet optimised.
- *
- * XXX optimise later with something like XLogPrefetchBuffer()
- */
-static TransactionId
-btree_xlog_delete_get_latestRemovedXid(XLogReaderState *record)
-{
-	xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
-	OffsetNumber *unused;
-	Buffer		ibuffer,
-				hbuffer;
-	Page		ipage,
-				hpage;
-	RelFileNode rnode;
-	BlockNumber blkno;
-	ItemId		iitemid,
-				hitemid;
-	IndexTuple	itup;
-	HeapTupleHeader htuphdr;
-	BlockNumber hblkno;
-	OffsetNumber hoffnum;
-	TransactionId latestRemovedXid = InvalidTransactionId;
-	int			i;
-
-	/*
-	 * If there's nothing running on the standby we don't need to derive a
-	 * full latestRemovedXid value, so use a fast path out of here. This
-	 * returns InvalidTransactionId, and so will conflict with all HS
-	 * transactions; but since we just worked out that that's zero people,
-	 * it's OK.
-	 *
-	 * XXX There is a race condition here, which is that a new backend might
-	 * start just after we look. If so, it cannot need to conflict, but this
-	 * coding will result in throwing a conflict anyway.
-	 */
-	if (CountDBBackends(InvalidOid) == 0)
-		return latestRemovedXid;
-
-	/*
-	 * In what follows, we have to examine the previous state of the index
-	 * page, as well as the heap page(s) it points to. This is only valid if
-	 * WAL replay has reached a consistent database state; which means that
-	 * the preceding check is not just an optimization, but is *necessary*. We
-	 * won't have let in any user sessions before we reach consistency.
-	 */
-	if (!reachedConsistency)
-		elog(PANIC, "btree_xlog_delete_get_latestRemovedXid: cannot operate with inconsistent data");
-
-	/*
-	 * Get index page. If the DB is consistent, this should not fail, nor
-	 * should any of the heap page fetches below. If one does, we return
-	 * InvalidTransactionId to cancel all HS transactions. That's probably
-	 * overkill, but it's safe, and certainly better than panicking here.
-	 */
-	XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
-	ibuffer = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, RBM_NORMAL);
-	if (!BufferIsValid(ibuffer))
-		return InvalidTransactionId;
-	LockBuffer(ibuffer, BT_READ);
-	ipage = (Page) BufferGetPage(ibuffer);
-
-	/*
-	 * Loop through the deleted index items to obtain the TransactionId from
-	 * the heap items they point to.
-	 */
-	unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeDelete);
-
-	for (i = 0; i < xlrec->nitems; i++)
-	{
-		/*
-		 * Identify the index tuple about to be deleted
-		 */
-		iitemid = PageGetItemId(ipage, unused[i]);
-		itup = (IndexTuple) PageGetItem(ipage, iitemid);
-
-		/*
-		 * Locate the heap page that the index tuple points at
-		 */
-		hblkno = ItemPointerGetBlockNumber(&(itup->t_tid));
-		hbuffer = XLogReadBufferExtended(xlrec->hnode, MAIN_FORKNUM, hblkno, RBM_NORMAL);
-		if (!BufferIsValid(hbuffer))
-		{
-			UnlockReleaseBuffer(ibuffer);
-			return InvalidTransactionId;
-		}
-		LockBuffer(hbuffer, BT_READ);
-		hpage = (Page) BufferGetPage(hbuffer);
-
-		/*
-		 * Look up the heap tuple header that the index tuple points at by
-		 * using the heap node supplied with the xlrec. We can't use
-		 * heap_fetch, since it uses ReadBuffer rather than XLogReadBuffer.
-		 * Note that we are not looking at tuple data here, just headers.
-		 */
-		hoffnum = ItemPointerGetOffsetNumber(&(itup->t_tid));
-		hitemid = PageGetItemId(hpage, hoffnum);
-
-		/*
-		 * Follow any redirections until we find something useful.
-		 */
-		while (ItemIdIsRedirected(hitemid))
-		{
-			hoffnum = ItemIdGetRedirect(hitemid);
-			hitemid = PageGetItemId(hpage, hoffnum);
-			CHECK_FOR_INTERRUPTS();
-		}
-
-		/*
-		 * If the heap item has storage, then read the header and use that to
-		 * set latestRemovedXid.
-		 *
-		 * Some LP_DEAD items may not be accessible, so we ignore them.
-		 */
-		if (ItemIdHasStorage(hitemid))
-		{
-			htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid);
-
-			HeapTupleHeaderAdvanceLatestRemovedXid(htuphdr, &latestRemovedXid);
-		}
-		else if (ItemIdIsDead(hitemid))
-		{
-			/*
-			 * Conjecture: if hitemid is dead then it had xids before the xids
-			 * marked on LP_NORMAL items. So we just ignore this item and move
-			 * onto the next, for the purposes of calculating
-			 * latestRemovedxids.
-			 */
-		}
-		else
-			Assert(!ItemIdIsUsed(hitemid));
-
-		UnlockReleaseBuffer(hbuffer);
-	}
-
-	UnlockReleaseBuffer(ibuffer);
-
-	/*
-	 * If all heap tuples were LP_DEAD then we will be returning
-	 * InvalidTransactionId here, which avoids conflicts. This matches
-	 * existing logic which assumes that LP_DEAD tuples must already be older
-	 * than the latestRemovedXid on the cleanup record that set them as
-	 * LP_DEAD, hence must already have generated a conflict.
-	 */
-	return latestRemovedXid;
-}
-
 static void
 btree_xlog_delete(XLogReaderState *record)
 {
@@ -676,12 +523,11 @@ btree_xlog_delete(XLogReaderState *record)
 	 */
 	if (InHotStandby)
 	{
-		TransactionId latestRemovedXid = btree_xlog_delete_get_latestRemovedXid(record);
 		RelFileNode rnode;
 
 		XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
 
-		ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode);
+		ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, rnode);
 	}
 
 	/*