Add GUC to log long wait times on recovery conflicts.

This commit adds GUC log_recovery_conflict_waits that controls whether a log message is produced when the startup process is waiting longer than deadlock_timeout for recovery conflicts. This is useful in determining if recovery conflicts prevent the recovery from applying WAL. Note that currently a log message is produced only when recovery conflict has not been resolved yet even after deadlock_timeout passes, i.e., only when the startup process is still waiting for recovery conflict even after deadlock_timeout. Author: Bertrand Drouvot, Masahiko Sawada Reviewed-by: Alvaro Herrera, Kyotaro Horiguchi, Fujii Masao Discussion: https://postgr.es/m/9a60178c-a853-1440-2cdc-c3af916cff59@amazon.com
2025-09-02 04:21:28 +03:00 · 2021-01-08 00:47:03 +09:00
parent f7a1a805cb
commit 0650ff2303
8 changed files with 292 additions and 30 deletions
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -1064,8 +1064,10 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
 	LWLock	   *partitionLock = LockHashPartitionLock(hashcode);
 	PROC_QUEUE *waitQueue = &(lock->waitProcs);
 	LOCKMASK	myHeldLocks = MyProc->heldLocks;
+	TimestampTz standbyWaitStart = 0;
 	bool		early_deadlock = false;
 	bool		allow_autovacuum_cancel = true;
+	bool		logged_recovery_conflict = false;
 	ProcWaitStatus myWaitStatus;
 	PGPROC	   *proc;
 	PGPROC	   *leader = MyProc->lockGroupLeader;
@@ -1261,6 +1263,14 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
 		else
 			enable_timeout_after(DEADLOCK_TIMEOUT, DeadlockTimeout);
 	}
+	else if (log_recovery_conflict_waits)
+	{
+		/*
+		 * Set the wait start timestamp if logging is enabled and in hot
+		 * standby.
+		 */
+		standbyWaitStart = GetCurrentTimestamp();
+	}

 	/*
 	 * If somebody wakes us between LWLockRelease and WaitLatch, the latch
@@ -1280,8 +1290,42 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
 	{
 		if (InHotStandby)
 		{
-			/* Set a timer and wait for that or for the Lock to be granted */
-			ResolveRecoveryConflictWithLock(locallock->tag.lock);
+			bool		maybe_log_conflict =
+			(standbyWaitStart != 0 && !logged_recovery_conflict);
+
+			/* Set a timer and wait for that or for the lock to be granted */
+			ResolveRecoveryConflictWithLock(locallock->tag.lock,
+											maybe_log_conflict);
+
+			/*
+			 * Emit the log message if the startup process is waiting longer
+			 * than deadlock_timeout for recovery conflict on lock.
+			 */
+			if (maybe_log_conflict)
+			{
+				TimestampTz now = GetCurrentTimestamp();
+
+				if (TimestampDifferenceExceeds(standbyWaitStart, now,
+											   DeadlockTimeout))
+				{
+					VirtualTransactionId *vxids;
+					int			cnt;
+
+					vxids = GetLockConflicts(&locallock->tag.lock,
+											 AccessExclusiveLock, &cnt);
+
+					/*
+					 * Log the recovery conflict and the list of PIDs of
+					 * backends holding the conflicting lock. Note that we do
+					 * logging even if there are no such backends right now
+					 * because the startup process here has already waited
+					 * longer than deadlock_timeout.
+					 */
+					LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_LOCK,
+										standbyWaitStart, now, cnt > 0 ? vxids : NULL);
+					logged_recovery_conflict = true;
+				}
+			}
 		}
 		else
 		{