mirror of
https://github.com/postgres/postgres.git
synced 2025-12-13 14:22:43 +03:00
Enhance slot synchronization API to respect promotion signal.
Previously, during a promotion, only the slot synchronization worker was signaled to shut down. The backend executing slot synchronization via the pg_sync_replication_slots() SQL function was not signaled, allowing it to complete its synchronization cycle before exiting. An upcoming patch improves pg_sync_replication_slots() to wait until replication slots are fully persisted before finishing. This behaviour requires the backend to exit promptly if a promotion occurs. This patch ensures that, during promotion, a signal is also sent to the backend running pg_sync_replication_slots(), allowing it to be interrupted and exit immediately. Author: Ajin Cherian <itsajin@gmail.com> Reviewed-by: Shveta Malik <shveta.malik@gmail.com> Reviewed-by: Chao Li <li.evan.chao@gmail.com> Reviewed-by: Amit Kapila <amit.kapila16@gmail.com> Discussion: https://postgr.es/m/CAFPTHDZAA%2BgWDntpa5ucqKKba41%3DtXmoXqN3q4rpjO9cdxgQrw%40mail.gmail.com
This commit is contained in:
@@ -71,11 +71,14 @@
|
|||||||
/*
|
/*
|
||||||
* Struct for sharing information to control slot synchronization.
|
* Struct for sharing information to control slot synchronization.
|
||||||
*
|
*
|
||||||
* The slot sync worker's pid is needed by the startup process to shut it
|
* The 'pid' is either the slot sync worker's pid or the backend's pid running
|
||||||
* down during promotion. The startup process shuts down the slot sync worker
|
* the SQL function pg_sync_replication_slots(). When the startup process sets
|
||||||
* and also sets stopSignaled=true to handle the race condition when the
|
* 'stopSignaled' during promotion, it uses this 'pid' to wake up the currently
|
||||||
|
* synchronizing process so that the process can immediately stop its
|
||||||
|
* synchronizing work on seeing 'stopSignaled' set.
|
||||||
|
* Setting 'stopSignaled' is also used to handle the race condition when the
|
||||||
* postmaster has not noticed the promotion yet and thus may end up restarting
|
* postmaster has not noticed the promotion yet and thus may end up restarting
|
||||||
* the slot sync worker. If stopSignaled is set, the worker will exit in such a
|
* the slot sync worker. If 'stopSignaled' is set, the worker will exit in such a
|
||||||
* case. The SQL function pg_sync_replication_slots() will also error out if
|
* case. The SQL function pg_sync_replication_slots() will also error out if
|
||||||
* this flag is set. Note that we don't need to reset this variable as after
|
* this flag is set. Note that we don't need to reset this variable as after
|
||||||
* promotion the slot sync worker won't be restarted because the pmState
|
* promotion the slot sync worker won't be restarted because the pmState
|
||||||
@@ -1195,10 +1198,10 @@ ValidateSlotSyncParams(int elevel)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Re-read the config file.
|
* Re-read the config file for slot synchronization.
|
||||||
*
|
*
|
||||||
* Exit if any of the slot sync GUCs have changed. The postmaster will
|
* Exit or throw error if relevant GUCs have changed depending on whether
|
||||||
* restart it.
|
* called from slot sync worker or from the SQL function pg_sync_replication_slots()
|
||||||
*/
|
*/
|
||||||
static void
|
static void
|
||||||
slotsync_reread_config(void)
|
slotsync_reread_config(void)
|
||||||
@@ -1209,7 +1212,10 @@ slotsync_reread_config(void)
|
|||||||
bool old_hot_standby_feedback = hot_standby_feedback;
|
bool old_hot_standby_feedback = hot_standby_feedback;
|
||||||
bool conninfo_changed;
|
bool conninfo_changed;
|
||||||
bool primary_slotname_changed;
|
bool primary_slotname_changed;
|
||||||
|
bool is_slotsync_worker = AmLogicalSlotSyncWorkerProcess();
|
||||||
|
bool parameter_changed = false;
|
||||||
|
|
||||||
|
if (is_slotsync_worker)
|
||||||
Assert(sync_replication_slots);
|
Assert(sync_replication_slots);
|
||||||
|
|
||||||
ConfigReloadPending = false;
|
ConfigReloadPending = false;
|
||||||
@@ -1221,33 +1227,61 @@ slotsync_reread_config(void)
|
|||||||
pfree(old_primary_slotname);
|
pfree(old_primary_slotname);
|
||||||
|
|
||||||
if (old_sync_replication_slots != sync_replication_slots)
|
if (old_sync_replication_slots != sync_replication_slots)
|
||||||
|
{
|
||||||
|
if (is_slotsync_worker)
|
||||||
{
|
{
|
||||||
ereport(LOG,
|
ereport(LOG,
|
||||||
/* translator: %s is a GUC variable name */
|
/* translator: %s is a GUC variable name */
|
||||||
errmsg("replication slot synchronization worker will shut down because \"%s\" is disabled", "sync_replication_slots"));
|
errmsg("replication slot synchronization worker will stop because \"%s\" is disabled",
|
||||||
|
"sync_replication_slots"));
|
||||||
|
|
||||||
proc_exit(0);
|
proc_exit(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
parameter_changed = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
if (conninfo_changed ||
|
if (conninfo_changed ||
|
||||||
primary_slotname_changed ||
|
primary_slotname_changed ||
|
||||||
(old_hot_standby_feedback != hot_standby_feedback))
|
(old_hot_standby_feedback != hot_standby_feedback))
|
||||||
|
{
|
||||||
|
|
||||||
|
if (is_slotsync_worker)
|
||||||
{
|
{
|
||||||
ereport(LOG,
|
ereport(LOG,
|
||||||
errmsg("replication slot synchronization worker will restart because of a parameter change"));
|
errmsg("replication slot synchronization worker will restart because of a parameter change"));
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Reset the last-start time for this worker so that the postmaster
|
* Reset the last-start time for this worker so that the
|
||||||
* can restart it without waiting for SLOTSYNC_RESTART_INTERVAL_SEC.
|
* postmaster can restart it without waiting for
|
||||||
|
* SLOTSYNC_RESTART_INTERVAL_SEC.
|
||||||
*/
|
*/
|
||||||
SlotSyncCtx->last_start_time = 0;
|
SlotSyncCtx->last_start_time = 0;
|
||||||
|
|
||||||
proc_exit(0);
|
proc_exit(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
parameter_changed = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If we have reached here with a parameter change, we must be running in
|
||||||
|
* SQL function, emit error in such a case.
|
||||||
|
*/
|
||||||
|
if (parameter_changed)
|
||||||
|
{
|
||||||
|
Assert(!is_slotsync_worker);
|
||||||
|
ereport(ERROR,
|
||||||
|
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
||||||
|
errmsg("replication slot synchronization will stop because of a parameter change"));
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Interrupt handler for main loop of slot sync worker.
|
* Interrupt handler for process performing slot synchronization.
|
||||||
*/
|
*/
|
||||||
static void
|
static void
|
||||||
ProcessSlotSyncInterrupts(void)
|
ProcessSlotSyncInterrupts(void)
|
||||||
@@ -1255,12 +1289,25 @@ ProcessSlotSyncInterrupts(void)
|
|||||||
CHECK_FOR_INTERRUPTS();
|
CHECK_FOR_INTERRUPTS();
|
||||||
|
|
||||||
if (SlotSyncCtx->stopSignaled)
|
if (SlotSyncCtx->stopSignaled)
|
||||||
|
{
|
||||||
|
if (AmLogicalSlotSyncWorkerProcess())
|
||||||
{
|
{
|
||||||
ereport(LOG,
|
ereport(LOG,
|
||||||
errmsg("replication slot synchronization worker is shutting down because promotion is triggered"));
|
errmsg("replication slot synchronization worker will stop because promotion is triggered"));
|
||||||
|
|
||||||
proc_exit(0);
|
proc_exit(0);
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* For the backend executing SQL function
|
||||||
|
* pg_sync_replication_slots().
|
||||||
|
*/
|
||||||
|
ereport(ERROR,
|
||||||
|
errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||||
|
errmsg("replication slot synchronization will stop because promotion is triggered"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (ConfigReloadPending)
|
if (ConfigReloadPending)
|
||||||
slotsync_reread_config();
|
slotsync_reread_config();
|
||||||
@@ -1362,29 +1409,14 @@ wait_for_slot_activity(bool some_slot_updated)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Emit an error if a promotion or a concurrent sync call is in progress.
|
* Emit an error if a concurrent sync call is in progress.
|
||||||
* Otherwise, advertise that a sync is in progress.
|
* Otherwise, advertise that a sync is in progress.
|
||||||
*/
|
*/
|
||||||
static void
|
static void
|
||||||
check_and_set_sync_info(pid_t worker_pid)
|
check_and_set_sync_info(pid_t sync_process_pid)
|
||||||
{
|
{
|
||||||
SpinLockAcquire(&SlotSyncCtx->mutex);
|
SpinLockAcquire(&SlotSyncCtx->mutex);
|
||||||
|
|
||||||
/* The worker pid must not be already assigned in SlotSyncCtx */
|
|
||||||
Assert(worker_pid == InvalidPid || SlotSyncCtx->pid == InvalidPid);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Emit an error if startup process signaled the slot sync machinery to
|
|
||||||
* stop. See comments atop SlotSyncCtxStruct.
|
|
||||||
*/
|
|
||||||
if (SlotSyncCtx->stopSignaled)
|
|
||||||
{
|
|
||||||
SpinLockRelease(&SlotSyncCtx->mutex);
|
|
||||||
ereport(ERROR,
|
|
||||||
errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
|
||||||
errmsg("cannot synchronize replication slots when standby promotion is ongoing"));
|
|
||||||
}
|
|
||||||
|
|
||||||
if (SlotSyncCtx->syncing)
|
if (SlotSyncCtx->syncing)
|
||||||
{
|
{
|
||||||
SpinLockRelease(&SlotSyncCtx->mutex);
|
SpinLockRelease(&SlotSyncCtx->mutex);
|
||||||
@@ -1393,13 +1425,16 @@ check_and_set_sync_info(pid_t worker_pid)
|
|||||||
errmsg("cannot synchronize replication slots concurrently"));
|
errmsg("cannot synchronize replication slots concurrently"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* The pid must not be already assigned in SlotSyncCtx */
|
||||||
|
Assert(SlotSyncCtx->pid == InvalidPid);
|
||||||
|
|
||||||
SlotSyncCtx->syncing = true;
|
SlotSyncCtx->syncing = true;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Advertise the required PID so that the startup process can kill the
|
* Advertise the required PID so that the startup process can kill the
|
||||||
* slot sync worker on promotion.
|
* slot sync process on promotion.
|
||||||
*/
|
*/
|
||||||
SlotSyncCtx->pid = worker_pid;
|
SlotSyncCtx->pid = sync_process_pid;
|
||||||
|
|
||||||
SpinLockRelease(&SlotSyncCtx->mutex);
|
SpinLockRelease(&SlotSyncCtx->mutex);
|
||||||
|
|
||||||
@@ -1414,6 +1449,7 @@ reset_syncing_flag(void)
|
|||||||
{
|
{
|
||||||
SpinLockAcquire(&SlotSyncCtx->mutex);
|
SpinLockAcquire(&SlotSyncCtx->mutex);
|
||||||
SlotSyncCtx->syncing = false;
|
SlotSyncCtx->syncing = false;
|
||||||
|
SlotSyncCtx->pid = InvalidPid;
|
||||||
SpinLockRelease(&SlotSyncCtx->mutex);
|
SpinLockRelease(&SlotSyncCtx->mutex);
|
||||||
|
|
||||||
syncing_slots = false;
|
syncing_slots = false;
|
||||||
@@ -1622,7 +1658,7 @@ update_synced_slots_inactive_since(void)
|
|||||||
if (!StandbyMode)
|
if (!StandbyMode)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
/* The slot sync worker or SQL function mustn't be running by now */
|
/* The slot sync worker or the SQL function mustn't be running by now */
|
||||||
Assert((SlotSyncCtx->pid == InvalidPid) && !SlotSyncCtx->syncing);
|
Assert((SlotSyncCtx->pid == InvalidPid) && !SlotSyncCtx->syncing);
|
||||||
|
|
||||||
LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
|
LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
|
||||||
@@ -1651,16 +1687,18 @@ update_synced_slots_inactive_since(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Shut down the slot sync worker.
|
* Shut down slot synchronization.
|
||||||
*
|
*
|
||||||
* This function sends signal to shutdown slot sync worker, if required. It
|
* This function sets stopSignaled=true and wakes up the slot sync process
|
||||||
* also waits till the slot sync worker has exited or
|
* (either worker or backend running the SQL function pg_sync_replication_slots())
|
||||||
|
* so that worker can exit or the SQL function pg_sync_replication_slots() can
|
||||||
|
* finish. It also waits till the slot sync worker has exited or
|
||||||
* pg_sync_replication_slots() has finished.
|
* pg_sync_replication_slots() has finished.
|
||||||
*/
|
*/
|
||||||
void
|
void
|
||||||
ShutDownSlotSync(void)
|
ShutDownSlotSync(void)
|
||||||
{
|
{
|
||||||
pid_t worker_pid;
|
pid_t sync_process_pid;
|
||||||
|
|
||||||
SpinLockAcquire(&SlotSyncCtx->mutex);
|
SpinLockAcquire(&SlotSyncCtx->mutex);
|
||||||
|
|
||||||
@@ -1677,16 +1715,16 @@ ShutDownSlotSync(void)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
worker_pid = SlotSyncCtx->pid;
|
sync_process_pid = SlotSyncCtx->pid;
|
||||||
|
|
||||||
SpinLockRelease(&SlotSyncCtx->mutex);
|
SpinLockRelease(&SlotSyncCtx->mutex);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Signal slotsync worker if it was still running. The worker will stop
|
* Signal process doing slotsync, if any. The process will stop upon
|
||||||
* upon detecting that the stopSignaled flag is set to true.
|
* detecting that the stopSignaled flag is set to true.
|
||||||
*/
|
*/
|
||||||
if (worker_pid != InvalidPid)
|
if (sync_process_pid != InvalidPid)
|
||||||
kill(worker_pid, SIGUSR1);
|
kill(sync_process_pid, SIGUSR1);
|
||||||
|
|
||||||
/* Wait for slot sync to end */
|
/* Wait for slot sync to end */
|
||||||
for (;;)
|
for (;;)
|
||||||
@@ -1835,7 +1873,10 @@ SyncReplicationSlots(WalReceiverConn *wrconn)
|
|||||||
{
|
{
|
||||||
PG_ENSURE_ERROR_CLEANUP(slotsync_failure_callback, PointerGetDatum(wrconn));
|
PG_ENSURE_ERROR_CLEANUP(slotsync_failure_callback, PointerGetDatum(wrconn));
|
||||||
{
|
{
|
||||||
check_and_set_sync_info(InvalidPid);
|
check_and_set_sync_info(MyProcPid);
|
||||||
|
|
||||||
|
/* Check for interrupts and config changes */
|
||||||
|
ProcessSlotSyncInterrupts();
|
||||||
|
|
||||||
validate_remote_info(wrconn);
|
validate_remote_info(wrconn);
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user