
Allow read only connections during recovery, known as Hot Standby.

Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter a consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict, and in some cases deadlock, with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though they introduce four new types of WAL record.
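
As a rough illustration of the setup described above (the archive path and delay value are placeholders, not taken from this commit):

    # postgresql.conf on the standby
    recovery_connections = on          # default; permit read only connections during recovery
    max_standby_delay = 30             # seconds to wait before cancelling conflicting queries

    # recovery.conf in the standby's data directory, forcing archive recovery
    restore_command = 'cp /path/to/archive/%f "%p"'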

New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port-specific behaviours have been used, though primary testing has been on Linux only so far.
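
For example, with a primary and an archive-recovery standby already running, the new target might be invoked roughly as follows (assuming it is wired into the regression test directory; standby setup is not shown here):

    cd src/test/regress
    make standbycheck        # static command behaviour tests against the running standby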

This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.

Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.

Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
Simon Riggs
2009-12-19 01:32:45 +00:00
parent 78a09145e0
commit efc16ea520
87 changed files with 6165 additions and 428 deletions

src/backend/access/nbtree/nbtxlog.c

@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.55 2009/06/11 14:48:54 momjian Exp $
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.56 2009/12/19 01:32:33 sriggs Exp $
*
*-------------------------------------------------------------------------
*/
@ -16,7 +16,11 @@
#include "access/nbtree.h"
#include "access/transam.h"
#include "access/xact.h"
#include "storage/bufmgr.h"
#include "storage/procarray.h"
#include "storage/standby.h"
#include "miscadmin.h"
/*
* We must keep track of expected insertions due to page splits, and apply
@ -458,6 +462,97 @@ btree_xlog_split(bool onleft, bool isroot,
xlrec->leftsib, xlrec->rightsib, isroot);
}
static void
btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record)
{
xl_btree_vacuum *xlrec;
Buffer buffer;
Page page;
BTPageOpaque opaque;
xlrec = (xl_btree_vacuum *) XLogRecGetData(record);
/*
* If queries might be active then we need to ensure every block is unpinned
* between the lastBlockVacuumed and the current block, if there are any.
* This ensures that every block in the index is touched during VACUUM as
* required to ensure scans work correctly.
*/
if (standbyState == STANDBY_SNAPSHOT_READY &&
(xlrec->lastBlockVacuumed + 1) != xlrec->block)
{
BlockNumber blkno = xlrec->lastBlockVacuumed + 1;
for (; blkno < xlrec->block; blkno++)
{
/*
* XXX we don't actually need to read the block, we
* just need to confirm it is unpinned. If we had a special call
* into the buffer manager we could optimise this so that
* if the block is not in shared_buffers we confirm it as unpinned.
*
* Another simple optimization would be to check if there's any
* backends running; if not, we could just skip this.
*/
buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, blkno, RBM_NORMAL);
if (BufferIsValid(buffer))
{
LockBufferForCleanup(buffer);
UnlockReleaseBuffer(buffer);
}
}
}
/*
* If the block was restored from a full page image, nothing more to do.
* The RestoreBkpBlocks() call already pinned and took cleanup lock on
* it. XXX: Perhaps we should call RestoreBkpBlocks() *after* the loop
* above, to make the disk access more sequential.
*/
if (record->xl_info & XLR_BKP_BLOCK_1)
return;
/*
* Like in btvacuumpage(), we need to take a cleanup lock on every leaf
* page. See nbtree/README for details.
*/
buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, xlrec->block, RBM_NORMAL);
if (!BufferIsValid(buffer))
return;
LockBufferForCleanup(buffer);
page = (Page) BufferGetPage(buffer);
if (XLByteLE(lsn, PageGetLSN(page)))
{
UnlockReleaseBuffer(buffer);
return;
}
if (record->xl_len > SizeOfBtreeVacuum)
{
OffsetNumber *unused;
OffsetNumber *unend;
unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeVacuum);
unend = (OffsetNumber *) ((char *) xlrec + record->xl_len);
if ((unend - unused) > 0)
PageIndexMultiDelete(page, unused, unend - unused);
}
/*
* Mark the page as not containing any LP_DEAD items --- see comments in
* _bt_delitems().
*/
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
PageSetLSN(page, lsn);
PageSetTLI(page, ThisTimeLineID);
MarkBufferDirty(buffer);
UnlockReleaseBuffer(buffer);
}
static void
btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
{
@ -470,6 +565,11 @@ btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
return;
xlrec = (xl_btree_delete *) XLogRecGetData(record);
/*
* We don't need to take a cleanup lock to apply these changes.
* See nbtree/README for details.
*/
buffer = XLogReadBuffer(xlrec->node, xlrec->block, false);
if (!BufferIsValid(buffer))
return;
@ -714,7 +814,43 @@ btree_redo(XLogRecPtr lsn, XLogRecord *record)
{
uint8 info = record->xl_info & ~XLR_INFO_MASK;
RestoreBkpBlocks(lsn, record, false);
/*
* Btree delete records can conflict with standby queries. You might
* think that vacuum records would conflict as well, but we've handled
* that already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid
* cleaned by the vacuum of the heap and so we can resolve any conflicts
just once when that arrives. After that we know that no conflicts
* exist from individual btree vacuum records on that index.
*/
if (InHotStandby)
{
if (info == XLOG_BTREE_DELETE)
{
xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
VirtualTransactionId *backends;
/*
* XXX Currently we put everybody on death row, because
* currently _bt_delitems() supplies InvalidTransactionId.
* This can be fairly painful, so providing a better value
* here is worth some thought and possibly some effort to
* improve.
*/
backends = GetConflictingVirtualXIDs(xlrec->latestRemovedXid,
InvalidOid,
true);
ResolveRecoveryConflictWithVirtualXIDs(backends,
"b-tree delete",
CONFLICT_MODE_ERROR);
}
}
/*
* Vacuum needs to pin and take cleanup lock on every leaf page,
* a regular exclusive lock is enough for all other purposes.
*/
RestoreBkpBlocks(lsn, record, (info == XLOG_BTREE_VACUUM));
switch (info)
{
@ -739,6 +875,9 @@ btree_redo(XLogRecPtr lsn, XLogRecord *record)
case XLOG_BTREE_SPLIT_R_ROOT:
btree_xlog_split(false, true, lsn, record);
break;
case XLOG_BTREE_VACUUM:
btree_xlog_vacuum(lsn, record);
break;
case XLOG_BTREE_DELETE:
btree_xlog_delete(lsn, record);
break;
@ -843,13 +982,24 @@ btree_desc(StringInfo buf, uint8 xl_info, char *rec)
xlrec->level, xlrec->firstright);
break;
}
case XLOG_BTREE_VACUUM:
{
xl_btree_vacuum *xlrec = (xl_btree_vacuum *) rec;
appendStringInfo(buf, "vacuum: rel %u/%u/%u; blk %u, lastBlockVacuumed %u",
xlrec->node.spcNode, xlrec->node.dbNode,
xlrec->node.relNode, xlrec->block,
xlrec->lastBlockVacuumed);
break;
}
case XLOG_BTREE_DELETE:
{
xl_btree_delete *xlrec = (xl_btree_delete *) rec;
appendStringInfo(buf, "delete: rel %u/%u/%u; blk %u",
appendStringInfo(buf, "delete: rel %u/%u/%u; blk %u, latestRemovedXid %u",
xlrec->node.spcNode, xlrec->node.dbNode,
xlrec->node.relNode, xlrec->block);
xlrec->node.relNode, xlrec->block,
xlrec->latestRemovedXid);
break;
}
case XLOG_BTREE_DELETE_PAGE: