mirror of
				https://github.com/postgres/postgres.git
				synced 2025-11-03 09:13:20 +03:00 
			
		
		
		
	Fix LOCK_TIMEOUT handling during parallel apply.
Previously, the parallel apply worker used SIGINT to receive a graceful shutdown signal from the leader apply worker. However, SIGINT is also used by the LOCK_TIMEOUT handler to trigger a query-cancel interrupt. This overlap caused the parallel apply worker to miss LOCK_TIMEOUT signals, leading to incorrect behavior during lock wait/contention. This patch resolves the conflict by switching the graceful shutdown signal from SIGINT to SIGUSR2. Reported-by: Zane Duffield <duffieldzane@gmail.com> Diagnosed-by: Zhijie Hou <houzj.fnst@fujitsu.com> Author: Hayato Kuroda <kuroda.hayato@fujitsu.com> Reviewed-by: Amit Kapila <amit.kapila16@gmail.com> Backpatch-through: 16, where it was introduced Discussion: https://postgr.es/m/CACMiCkXyC4au74kvE2g6Y=mCEF8X6r-Ne_ty4r7qWkUjRE4+oQ@mail.gmail.com
This commit is contained in:
		@@ -94,9 +94,8 @@ SignalHandlerForCrashExit(SIGNAL_ARGS)
 | 
				
			|||||||
 * shut down and exit.
 | 
					 * shut down and exit.
 | 
				
			||||||
 *
 | 
					 *
 | 
				
			||||||
 * Typically, this handler would be used for SIGTERM, but some processes use
 | 
					 * Typically, this handler would be used for SIGTERM, but some processes use
 | 
				
			||||||
 * other signals. In particular, the checkpointer exits on SIGUSR2, and the WAL
 | 
					 * other signals. In particular, the checkpointer and parallel apply worker
 | 
				
			||||||
 * writer and the logical replication parallel apply worker exits on either
 | 
					 * exit on SIGUSR2, and the WAL writer exits on either SIGINT or SIGTERM.
 | 
				
			||||||
 * SIGINT or SIGTERM.
 | 
					 | 
				
			||||||
 *
 | 
					 *
 | 
				
			||||||
 * ShutdownRequestPending should be checked at a convenient place within the
 | 
					 * ShutdownRequestPending should be checked at a convenient place within the
 | 
				
			||||||
 * main loop, or else the main loop should call HandleMainLoopInterrupts.
 | 
					 * main loop, or else the main loop should call HandleMainLoopInterrupts.
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -869,10 +869,17 @@ ParallelApplyWorkerMain(Datum main_arg)
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
	InitializingApplyWorker = true;
 | 
						InitializingApplyWorker = true;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	/* Setup signal handling. */
 | 
						/*
 | 
				
			||||||
 | 
						 * Setup signal handling.
 | 
				
			||||||
 | 
						 *
 | 
				
			||||||
 | 
						 * Note: We intentionally used SIGUSR2 to trigger a graceful shutdown
 | 
				
			||||||
 | 
						 * initiated by the leader apply worker. This helps to differentiate it
 | 
				
			||||||
 | 
						 * from the case where we abort the current transaction and exit on
 | 
				
			||||||
 | 
						 * receiving SIGTERM.
 | 
				
			||||||
 | 
						 */
 | 
				
			||||||
	pqsignal(SIGHUP, SignalHandlerForConfigReload);
 | 
						pqsignal(SIGHUP, SignalHandlerForConfigReload);
 | 
				
			||||||
	pqsignal(SIGINT, SignalHandlerForShutdownRequest);
 | 
					 | 
				
			||||||
	pqsignal(SIGTERM, die);
 | 
						pqsignal(SIGTERM, die);
 | 
				
			||||||
 | 
						pqsignal(SIGUSR2, SignalHandlerForShutdownRequest);
 | 
				
			||||||
	BackgroundWorkerUnblockSignals();
 | 
						BackgroundWorkerUnblockSignals();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	/*
 | 
						/*
 | 
				
			||||||
@@ -971,9 +978,9 @@ ParallelApplyWorkerMain(Datum main_arg)
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
	/*
 | 
						/*
 | 
				
			||||||
	 * The parallel apply worker must not get here because the parallel apply
 | 
						 * The parallel apply worker must not get here because the parallel apply
 | 
				
			||||||
	 * worker will only stop when it receives a SIGTERM or SIGINT from the
 | 
						 * worker will only stop when it receives a SIGTERM or SIGUSR2 from the
 | 
				
			||||||
	 * leader, or when there is an error. None of these cases will allow the
 | 
						 * leader, or SIGINT from itself, or when there is an error. None of these
 | 
				
			||||||
	 * code to reach here.
 | 
						 * cases will allow the code to reach here.
 | 
				
			||||||
	 */
 | 
						 */
 | 
				
			||||||
	Assert(false);
 | 
						Assert(false);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -639,7 +639,7 @@ logicalrep_worker_stop(Oid subid, Oid relid)
 | 
				
			|||||||
/*
 | 
					/*
 | 
				
			||||||
 * Stop the given logical replication parallel apply worker.
 | 
					 * Stop the given logical replication parallel apply worker.
 | 
				
			||||||
 *
 | 
					 *
 | 
				
			||||||
 * Node that the function sends SIGINT instead of SIGTERM to the parallel apply
 | 
					 * Node that the function sends SIGUSR2 instead of SIGTERM to the parallel apply
 | 
				
			||||||
 * worker so that the worker exits cleanly.
 | 
					 * worker so that the worker exits cleanly.
 | 
				
			||||||
 */
 | 
					 */
 | 
				
			||||||
void
 | 
					void
 | 
				
			||||||
@@ -677,7 +677,7 @@ logicalrep_pa_worker_stop(ParallelApplyWorkerInfo *winfo)
 | 
				
			|||||||
	 * Only stop the worker if the generation matches and the worker is alive.
 | 
						 * Only stop the worker if the generation matches and the worker is alive.
 | 
				
			||||||
	 */
 | 
						 */
 | 
				
			||||||
	if (worker->generation == generation && worker->proc)
 | 
						if (worker->generation == generation && worker->proc)
 | 
				
			||||||
		logicalrep_worker_stop_internal(worker, SIGINT);
 | 
							logicalrep_worker_stop_internal(worker, SIGUSR2);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	LWLockRelease(LogicalRepWorkerLock);
 | 
						LWLockRelease(LogicalRepWorkerLock);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user