mirror of
https://github.com/postgres/postgres.git
synced 2025-11-16 15:02:33 +03:00
Fix unconditional WAL receiver shutdown during stream-archive transition
Commit b4f584f9d2 (affecting v15~, later backpatched down to 13 as of 3635a0a35a) introduced an unconditional WAL receiver shutdown when switching from streaming to archive WAL sources. This causes problems during a timeline switch, when a WAL receiver enters WALRCV_WAITING state but remains alive, waiting for instructions. The unconditional shutdown can break some monitoring scenarios as the WAL receiver gets repeatedly terminated and re-spawned, causing pg_stat_wal_receiver.status to show a "streaming" instead of a "waiting" status, masking the fact that the WAL receiver is waiting for a new TLI and a new LSN to be able to continue streaming. This commit changes the WAL receiver behavior so that the shutdown becomes conditional, with InstallXLogFileSegmentActive always being reset to prevent the regression fixed by b4f584f9d2: only terminate the WAL receiver when it is actively streaming (WALRCV_STREAMING, WALRCV_STARTING, or WALRCV_RESTARTING). When in WALRCV_WAITING state, just reset the InstallXLogFileSegmentActive flag to allow archive restoration without killing the process. WALRCV_STOPPED and WALRCV_STOPPING are not reachable states in this code path. For the latter, the startup process is the one in charge of setting WALRCV_STOPPING via ShutdownWalRcv(), waiting for the WAL receiver to reach a WALRCV_STOPPED state after switching walRcvState, so WaitForWALToBecomeAvailable() cannot be reached while a WAL receiver is in a WALRCV_STOPPING state. A regression test is added to check that a WAL receiver is not stopped on a timeline jump; it fails when the fix of this commit is reverted. Reported-by: Ryan Bird <ryanzxg@gmail.com> Author: Xuneng Zhou <xunengzhou@gmail.com> Reviewed-by: Noah Misch <noah@leadboat.com> Reviewed-by: Michael Paquier <michael@paquier.xyz> Discussion: https://postgr.es/m/19093-c4fff49a608f82a0@postgresql.org Backpatch-through: 13
This commit is contained in:
@@ -936,6 +936,7 @@ static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
|
|||||||
int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
|
int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
|
||||||
static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
|
static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
|
||||||
bool fetching_ckpt, XLogRecPtr tliRecPtr);
|
bool fetching_ckpt, XLogRecPtr tliRecPtr);
|
||||||
|
static void ResetInstallXLogFileSegmentActive(void);
|
||||||
static void XLogShutdownWalRcv(void);
|
static void XLogShutdownWalRcv(void);
|
||||||
static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
|
static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
|
||||||
static void XLogFileClose(void);
|
static void XLogFileClose(void);
|
||||||
@@ -12611,8 +12612,18 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
|
|||||||
* Before we leave XLOG_FROM_STREAM state, make sure that
|
* Before we leave XLOG_FROM_STREAM state, make sure that
|
||||||
* walreceiver is not active, so that it won't overwrite
|
* walreceiver is not active, so that it won't overwrite
|
||||||
* WAL that we restore from archive.
|
* WAL that we restore from archive.
|
||||||
|
* If walreceiver is actively streaming (or attempting to
|
||||||
|
* connect), we must shut it down. However, if it's
|
||||||
|
* already in WAITING state (e.g., due to timeline
|
||||||
|
* divergence), we only need to reset the install flag to
|
||||||
|
* allow archive restoration.
|
||||||
*/
|
*/
|
||||||
XLogShutdownWalRcv();
|
if (WalRcvStreaming())
|
||||||
|
XLogShutdownWalRcv();
|
||||||
|
else
|
||||||
|
{
|
||||||
|
ResetInstallXLogFileSegmentActive();
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Before we sleep, re-scan for possible new timelines if
|
* Before we sleep, re-scan for possible new timelines if
|
||||||
@@ -12958,15 +12969,21 @@ StartupRequestWalReceiverRestart(void)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Disable WAL file recycling and preallocation. */
|
||||||
|
static void
|
||||||
|
ResetInstallXLogFileSegmentActive(void)
|
||||||
|
{
|
||||||
|
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
|
||||||
|
XLogCtl->InstallXLogFileSegmentActive = false;
|
||||||
|
LWLockRelease(ControlFileLock);
|
||||||
|
}
|
||||||
|
|
||||||
/* Thin wrapper around ShutdownWalRcv(). */
|
/* Thin wrapper around ShutdownWalRcv(). */
|
||||||
static void
|
static void
|
||||||
XLogShutdownWalRcv(void)
|
XLogShutdownWalRcv(void)
|
||||||
{
|
{
|
||||||
ShutdownWalRcv();
|
ShutdownWalRcv();
|
||||||
|
ResetInstallXLogFileSegmentActive();
|
||||||
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
|
|
||||||
XLogCtl->InstallXLogFileSegmentActive = false;
|
|
||||||
LWLockRelease(ControlFileLock);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ use warnings;
|
|||||||
use File::Path qw(rmtree);
|
use File::Path qw(rmtree);
|
||||||
use PostgresNode;
|
use PostgresNode;
|
||||||
use TestLib;
|
use TestLib;
|
||||||
use Test::More tests => 3;
|
use Test::More tests => 4;
|
||||||
|
|
||||||
$ENV{PGDATABASE} = 'postgres';
|
$ENV{PGDATABASE} = 'postgres';
|
||||||
|
|
||||||
@@ -68,6 +68,14 @@ my $result =
|
|||||||
$node_standby_2->safe_psql('postgres', "SELECT count(*) FROM tab_int");
|
$node_standby_2->safe_psql('postgres', "SELECT count(*) FROM tab_int");
|
||||||
is($result, qq(2000), 'check content of standby 2');
|
is($result, qq(2000), 'check content of standby 2');
|
||||||
|
|
||||||
|
# Check the logs, WAL receiver should not have been stopped while
|
||||||
|
# transitioning to its new timeline. There is no need to rely on an
|
||||||
|
# offset in this check of the server logs: a new log file is used on
|
||||||
|
# node restart when primary_conninfo is updated above.
|
||||||
|
ok( !$node_standby_2->log_contains(
|
||||||
|
"FATAL: .* terminating walreceiver process due to administrator command"
|
||||||
|
),
|
||||||
|
'WAL receiver should not be stopped across timeline jumps');
|
||||||
|
|
||||||
# Ensure that a standby is able to follow a master on a newer timeline
|
# Ensure that a standby is able to follow a master on a newer timeline
|
||||||
# when WAL archiving is enabled.
|
# when WAL archiving is enabled.
|
||||||
|
|||||||
Reference in New Issue
Block a user