Mirror of https://github.com/postgres/postgres.git
Compute XID horizon for page level index vacuum on primary.
Previously the xid horizon was only computed during WAL replay. That had two major problems:

1) It relied on knowing what the table pointed to by the index looks like. That was easy enough before the introduction of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow.

2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When cleanup happens on a catalog table, we need to be able to conflict with slots that have an xid horizon that is too old. But computing the horizon by visiting the heap only works once consistency is reached, whereas we always need to be able to detect conflicts.

There's also a secondary problem: the current method performs redundant work on every standby. That is counterbalanced by the new approach potentially computing the value when it is not necessary (either because there is no standby, or because there are no connected backends).

Solve 1) and 2) by moving the computation of the xid horizon to the primary and by involving tableam in that computation. To address the potentially increased overhead, make the heap's xid horizon computation more efficient by sorting the tids and eliminating redundant buffer accesses. When prefetching is available, additionally prefetch the buffers. As this is more of a maintenance task than something done routinely in every read-only query, we add an arbitrary 10 to the effective IO concurrency, thereby using some IO concurrency even when it is not globally enabled. That is possibly not the perfect formula, but it seems good enough for now.

Bumps the WAL format, as latestRemovedXid is now part of the records and the heap's relfilenode no longer is.

Author: Andres Freund, Amit Khandekar, Robert Haas
Reviewed-By: Robert Haas
Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de
    https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de
    https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
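As a rough illustration of the new primary-side flow described above (not part of this commit), a caller could compute the horizon from the heap before emitting its page-level deletion WAL record. heap_compute_xid_horizon_for_tuples() is the function added in the diff below; the logging helper and function name in this sketch are hypothetical placeholders, and the real callers go through the tableam interface rather than calling heapam directly.

/*
 * Minimal sketch only, assuming the caller already collected the dead index
 * items' heap tids.  log_page_level_deletion() is a hypothetical helper that
 * stands in for the index AM's WAL logging.
 */
#include "postgres.h"

#include "access/heapam.h"
#include "access/transam.h"
#include "storage/itemptr.h"

/* Hypothetical WAL-logging helper, declared here only for the sketch. */
extern void log_page_level_deletion(TransactionId latestRemovedXid,
									ItemPointerData *tids, int ntids);

static void
delete_index_items_on_primary(Relation heapRel,
							  ItemPointerData *dead_tids, int ndead)
{
	TransactionId latestRemovedXid = InvalidTransactionId;

	/*
	 * Ask the heap for the xid horizon of the tuples the dead index items
	 * point to.  Internally this sorts the tids, combines accesses to the
	 * same heap block, and prefetches further blocks where supported.
	 */
	if (ndead > 0)
		latestRemovedXid = heap_compute_xid_horizon_for_tuples(heapRel,
															   dead_tids,
															   ndead);

	/*
	 * The horizon now travels in the deletion WAL record itself, so standbys
	 * no longer have to recompute it during replay.
	 */
	log_page_level_deletion(latestRemovedXid, dead_tids, ndead);
}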
@@ -67,6 +67,7 @@
#include "utils/lsyscache.h"
#include "utils/relcache.h"
#include "utils/snapmgr.h"
#include "utils/spccache.h"


static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup,

@@ -162,6 +163,20 @@ static const struct
#define ConditionalLockTupleTuplock(rel, tup, mode) \
	ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)

#ifdef USE_PREFETCH
/*
 * heap_compute_xid_horizon_for_tuples and xid_horizon_prefetch_buffer use
 * this structure to coordinate prefetching activity.
 */
typedef struct
{
	BlockNumber cur_hblkno;
	int			next_item;
	int			nitems;
	ItemPointerData *tids;
} XidHorizonPrefetchState;
#endif

/*
 * This table maps tuple lock strength values for each particular
 * MultiXactStatus value.

@@ -6861,6 +6876,212 @@ HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
	/* *latestRemovedXid may still be invalid at end */
}

#ifdef USE_PREFETCH
/*
 * Helper function for heap_compute_xid_horizon_for_tuples. Issue prefetch
 * requests for the number of buffers indicated by prefetch_count. The
 * prefetch_state keeps track of all the buffers that we can prefetch and
 * which ones have already been prefetched; each call to this function picks
 * up where the previous call left off.
 */
static void
xid_horizon_prefetch_buffer(Relation rel,
							XidHorizonPrefetchState *prefetch_state,
							int prefetch_count)
{
	BlockNumber cur_hblkno = prefetch_state->cur_hblkno;
	int			count = 0;
	int			i;
	int			nitems = prefetch_state->nitems;
	ItemPointerData *tids = prefetch_state->tids;

	for (i = prefetch_state->next_item;
		 i < nitems && count < prefetch_count;
		 i++)
	{
		ItemPointer htid = &tids[i];

		if (cur_hblkno == InvalidBlockNumber ||
			ItemPointerGetBlockNumber(htid) != cur_hblkno)
		{
			cur_hblkno = ItemPointerGetBlockNumber(htid);
			PrefetchBuffer(rel, MAIN_FORKNUM, cur_hblkno);
			count++;
		}
	}

	/*
	 * Save the prefetch position so that next time we can continue from that
	 * position.
	 */
	prefetch_state->next_item = i;
	prefetch_state->cur_hblkno = cur_hblkno;
}
#endif

/*
 * Get the latestRemovedXid from the heap pages pointed at by the index
 * tuples being deleted.
 *
 * We used to do this during recovery rather than on the primary, but that
 * approach now appears inferior. It meant that the master could generate
 * a lot of work for the standby without any back-pressure to slow down the
 * master, and it required the standby to have reached consistency, whereas
 * we want to have correct information available even before that point.
 *
 * It's possible for this to generate a fair amount of I/O, since we may be
 * deleting hundreds of tuples from a single index block. To amortize that
 * cost to some degree, this uses prefetching and combines repeat accesses to
 * the same block.
 */
TransactionId
heap_compute_xid_horizon_for_tuples(Relation rel,
									ItemPointerData *tids,
									int nitems)
{
	TransactionId latestRemovedXid = InvalidTransactionId;
	BlockNumber hblkno;
	Buffer		buf = InvalidBuffer;
	Page		hpage;
#ifdef USE_PREFETCH
	XidHorizonPrefetchState prefetch_state;
	int			io_concurrency;
	int			prefetch_distance;
#endif

	/*
	 * Sort to avoid repeated lookups for the same page, and to make it more
	 * likely to access items in an efficient order. In particular, this
	 * ensures that if there are multiple pointers to the same page, they all
	 * get processed looking up and locking the page just once.
	 */
	qsort((void *) tids, nitems, sizeof(ItemPointerData),
		  (int (*) (const void *, const void *)) ItemPointerCompare);

#ifdef USE_PREFETCH
	/* Initialize prefetch state. */
	prefetch_state.cur_hblkno = InvalidBlockNumber;
	prefetch_state.next_item = 0;
	prefetch_state.nitems = nitems;
	prefetch_state.tids = tids;

	/*
	 * Compute the prefetch distance that we will attempt to maintain.
	 *
	 * We don't use the regular formula to determine how much to prefetch
	 * here, but instead just add a constant to effective_io_concurrency.
	 * That's because it seems best to do some prefetching here even when
	 * effective_io_concurrency is set to 0, but if the DBA thinks it's OK to
	 * do more prefetching for other operations, then it's probably OK to do
	 * more prefetching in this case, too. It may be that this formula is too
	 * simplistic, but at the moment there is no evidence of that or any idea
	 * about what would work better.
	 */
	io_concurrency = get_tablespace_io_concurrency(rel->rd_rel->reltablespace);
	prefetch_distance = Min((io_concurrency) + 10, MAX_IO_CONCURRENCY);

	/* Start prefetching. */
	xid_horizon_prefetch_buffer(rel, &prefetch_state, prefetch_distance);
#endif

	/* Iterate over all tids, and check their horizon */
	hblkno = InvalidBlockNumber;
	for (int i = 0; i < nitems; i++)
	{
		ItemPointer htid = &tids[i];
		ItemId		hitemid;
		OffsetNumber hoffnum;

		/*
		 * Read heap buffer, but avoid refetching if it's the same block as
		 * required for the last tid.
		 */
		if (hblkno == InvalidBlockNumber ||
			ItemPointerGetBlockNumber(htid) != hblkno)
		{
			/* release old buffer */
			if (BufferIsValid(buf))
			{
				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
				ReleaseBuffer(buf);
			}

			hblkno = ItemPointerGetBlockNumber(htid);

			buf = ReadBuffer(rel, hblkno);

#ifdef USE_PREFETCH

			/*
			 * To maintain the prefetch distance, prefetch one more page for
			 * each page we read.
			 */
			xid_horizon_prefetch_buffer(rel, &prefetch_state, 1);
#endif

			hpage = BufferGetPage(buf);

			LockBuffer(buf, BUFFER_LOCK_SHARE);
		}

		hoffnum = ItemPointerGetOffsetNumber(htid);
		hitemid = PageGetItemId(hpage, hoffnum);

		/*
		 * Follow any redirections until we find something useful.
		 */
		while (ItemIdIsRedirected(hitemid))
		{
			hoffnum = ItemIdGetRedirect(hitemid);
			hitemid = PageGetItemId(hpage, hoffnum);
			CHECK_FOR_INTERRUPTS();
		}

		/*
		 * If the heap item has storage, then read the header and use that to
		 * set latestRemovedXid.
		 *
		 * Some LP_DEAD items may not be accessible, so we ignore them.
		 */
		if (ItemIdHasStorage(hitemid))
		{
			HeapTupleHeader htuphdr;

			htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid);

			HeapTupleHeaderAdvanceLatestRemovedXid(htuphdr, &latestRemovedXid);
		}
		else if (ItemIdIsDead(hitemid))
		{
			/*
			 * Conjecture: if hitemid is dead then it had xids before the xids
			 * marked on LP_NORMAL items. So we just ignore this item and move
			 * onto the next, for the purposes of calculating
			 * latestRemovedxids.
			 */
		}
		else
			Assert(!ItemIdIsUsed(hitemid));

	}

	if (BufferIsValid(buf))
	{
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		ReleaseBuffer(buf);
	}

	/*
	 * If all heap tuples were LP_DEAD then we will be returning
	 * InvalidTransactionId here, which avoids conflicts. This matches
	 * existing logic which assumes that LP_DEAD tuples must already be older
	 * than the latestRemovedXid on the cleanup record that set them as
	 * LP_DEAD, hence must already have generated a conflict.
	 */

	return latestRemovedXid;
}

/*
 * Perform XLogInsert to register a heap cleanup info message. These
 * messages are sent once per VACUUM and are required because