mirror of
https://github.com/postgres/postgres.git
synced 2025-10-27 00:12:01 +03:00
Fix LOCK_TIMEOUT handling during parallel apply.
Previously, the parallel apply worker used SIGINT to receive a graceful shutdown signal from the leader apply worker. However, SIGINT is also used by the LOCK_TIMEOUT handler to trigger a query-cancel interrupt. This overlap caused the parallel apply worker to miss LOCK_TIMEOUT signals, leading to incorrect behavior during lock wait/contention. This patch resolves the conflict by switching the graceful shutdown signal from SIGINT to SIGUSR2. Reported-by: Zane Duffield <duffieldzane@gmail.com> Diagnosed-by: Zhijie Hou <houzj.fnst@fujitsu.com> Author: Hayato Kuroda <kuroda.hayato@fujitsu.com> Reviewed-by: Amit Kapila <amit.kapila16@gmail.com> Backpatch-through: 16, where it was introduced Discussion: https://postgr.es/m/CACMiCkXyC4au74kvE2g6Y=mCEF8X6r-Ne_ty4r7qWkUjRE4+oQ@mail.gmail.com
This commit is contained in:
@@ -94,9 +94,8 @@ SignalHandlerForCrashExit(SIGNAL_ARGS)
|
||||
* shut down and exit.
|
||||
*
|
||||
* Typically, this handler would be used for SIGTERM, but some processes use
|
||||
* other signals. In particular, the checkpointer exits on SIGUSR2, and the WAL
|
||||
* writer and the logical replication parallel apply worker exits on either
|
||||
* SIGINT or SIGTERM.
|
||||
* other signals. In particular, the checkpointer and parallel apply worker
|
||||
* exit on SIGUSR2, and the WAL writer exits on either SIGINT or SIGTERM.
|
||||
*
|
||||
* ShutdownRequestPending should be checked at a convenient place within the
|
||||
* main loop, or else the main loop should call ProcessMainLoopInterrupts.
|
||||
|
||||
@@ -869,10 +869,17 @@ ParallelApplyWorkerMain(Datum main_arg)
|
||||
|
||||
InitializingApplyWorker = true;
|
||||
|
||||
/* Setup signal handling. */
|
||||
/*
|
||||
* Setup signal handling.
|
||||
*
|
||||
* Note: We intentionally used SIGUSR2 to trigger a graceful shutdown
|
||||
* initiated by the leader apply worker. This helps to differentiate it
|
||||
* from the case where we abort the current transaction and exit on
|
||||
* receiving SIGTERM.
|
||||
*/
|
||||
pqsignal(SIGHUP, SignalHandlerForConfigReload);
|
||||
pqsignal(SIGINT, SignalHandlerForShutdownRequest);
|
||||
pqsignal(SIGTERM, die);
|
||||
pqsignal(SIGUSR2, SignalHandlerForShutdownRequest);
|
||||
BackgroundWorkerUnblockSignals();
|
||||
|
||||
/*
|
||||
@@ -971,9 +978,9 @@ ParallelApplyWorkerMain(Datum main_arg)
|
||||
|
||||
/*
|
||||
* The parallel apply worker must not get here because the parallel apply
|
||||
* worker will only stop when it receives a SIGTERM or SIGINT from the
|
||||
* leader, or when there is an error. None of these cases will allow the
|
||||
* code to reach here.
|
||||
* worker will only stop when it receives a SIGTERM or SIGUSR2 from the
|
||||
* leader, or SIGINT from itself, or when there is an error. None of these
|
||||
* cases will allow the code to reach here.
|
||||
*/
|
||||
Assert(false);
|
||||
}
|
||||
|
||||
@@ -636,7 +636,7 @@ logicalrep_worker_stop(Oid subid, Oid relid)
|
||||
/*
|
||||
* Stop the given logical replication parallel apply worker.
|
||||
*
|
||||
* Node that the function sends SIGINT instead of SIGTERM to the parallel apply
|
||||
* Node that the function sends SIGUSR2 instead of SIGTERM to the parallel apply
|
||||
* worker so that the worker exits cleanly.
|
||||
*/
|
||||
void
|
||||
@@ -674,7 +674,7 @@ logicalrep_pa_worker_stop(ParallelApplyWorkerInfo *winfo)
|
||||
* Only stop the worker if the generation matches and the worker is alive.
|
||||
*/
|
||||
if (worker->generation == generation && worker->proc)
|
||||
logicalrep_worker_stop_internal(worker, SIGINT);
|
||||
logicalrep_worker_stop_internal(worker, SIGUSR2);
|
||||
|
||||
LWLockRelease(LogicalRepWorkerLock);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user