Fix postmaster's handling of a startup-process crash.
Ordinarily, a failure (unexpected exit status) of the startup subprocess should be considered fatal, so the postmaster should just close up shop and quit. However, if we sent the startup process a SIGQUIT or SIGKILL signal, the failure is hardly "unexpected", and we should attempt restart; this is necessary for recovery from ordinary backend crashes in hot-standby scenarios.

I attempted to implement the latter rule with a two-line patch in commit 442231d7f71764b8c628044e7ce2225f9aa43b67, but it now emerges that that patch was a few bricks shy of a load: it failed to distinguish the case of a signaled startup process from the case where the new startup process crashes before reaching database consistency. That resulted in infinitely respawning a new startup process only to have it crash again.

To handle this properly, we really must track whether we have sent the *current* startup process a kill signal. Rather than add yet another ad-hoc boolean to the postmaster's state, I chose to unify this with the existing RecoveryError flag into an enum tracking the startup process's state. That seems more consistent with the postmaster's general state machine design.

Back-patch to 9.0, like the previous patch.
commit 45811be94e
parent 6ba365aa46
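For orientation, here is a condensed sketch of the state tracking this patch introduces, distilled from the diff below. The enum, its values, and the StartupStatus variable are taken from the patch itself; the helper function wrapping the transition is hypothetical, added only to illustrate the decision the commit message describes (the real logic lives in reaper()).

    /* Startup process's status (names from the patch below) */
    typedef enum
    {
        STARTUP_NOT_RUNNING,
        STARTUP_RUNNING,
        STARTUP_SIGNALED,       /* we sent it a SIGQUIT or SIGKILL */
        STARTUP_CRASHED
    } StartupStatusEnum;

    static StartupStatusEnum StartupStatus = STARTUP_NOT_RUNNING;

    /*
     * Illustrative (hypothetical) helper for an unexpected startup-process
     * exit: a startup process we deliberately signaled may be restarted,
     * while one that crashed on its own is marked STARTUP_CRASHED so the
     * postmaster exits instead of respawning it forever.
     */
    static void
    startup_exit_unexpected(void)
    {
        if (StartupStatus == STARTUP_SIGNALED)
            StartupStatus = STARTUP_NOT_RUNNING;    /* our signal killed it */
        else
            StartupStatus = STARTUP_CRASHED;        /* genuine crash */
    }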
@@ -249,6 +249,17 @@ static pid_t StartupPID = 0,
 			PgStatPID = 0,
 			SysLoggerPID = 0;
 
+/* Startup process's status */
+typedef enum
+{
+	STARTUP_NOT_RUNNING,
+	STARTUP_RUNNING,
+	STARTUP_SIGNALED,			/* we sent it a SIGQUIT or SIGKILL */
+	STARTUP_CRASHED
+} StartupStatusEnum;
+
+static StartupStatusEnum StartupStatus = STARTUP_NOT_RUNNING;
+
 /* Startup/shutdown state */
 #define			NoShutdown		0
 #define			SmartShutdown	1
@@ -258,7 +269,6 @@ static pid_t StartupPID = 0,
 static int	Shutdown = NoShutdown;
 
 static bool FatalError = false; /* T if recovering from backend crash */
-static bool RecoveryError = false;	/* T if WAL recovery failed */
 
 /*
  * We use a simple state machine to control startup, shutdown, and
@@ -301,8 +311,6 @@ static bool RecoveryError = false;	/* T if WAL recovery failed */
  * states, nor in PM_SHUTDOWN states (because we don't enter those states
  * when trying to recover from a crash).  It can be true in PM_STARTUP state,
  * because we don't clear it until we've successfully started WAL redo.
- * Similarly, RecoveryError means that we have crashed during recovery, and
- * should not try to restart.
  */
 typedef enum
 {
@@ -1246,6 +1254,7 @@ PostmasterMain(int argc, char *argv[])
 	 */
 	StartupPID = StartupDataBase();
 	Assert(StartupPID != 0);
+	StartupStatus = STARTUP_RUNNING;
 	pmState = PM_STARTUP;
 
 	/* Some workers may be scheduled to start now */
@@ -2591,6 +2600,7 @@ reaper(SIGNAL_ARGS)
 			if (Shutdown > NoShutdown &&
 				(EXIT_STATUS_0(exitstatus) || EXIT_STATUS_1(exitstatus)))
 			{
+				StartupStatus = STARTUP_NOT_RUNNING;
 				pmState = PM_WAIT_BACKENDS;
 				/* PostmasterStateMachine logic does the rest */
 				continue;
@@ -2600,6 +2610,7 @@ reaper(SIGNAL_ARGS)
 			{
 				ereport(LOG,
 						(errmsg("shutdown at recovery target")));
+				StartupStatus = STARTUP_NOT_RUNNING;
 				Shutdown = SmartShutdown;
 				TerminateChildren(SIGTERM);
 				pmState = PM_WAIT_BACKENDS;
@@ -2624,16 +2635,18 @@ reaper(SIGNAL_ARGS)
 			/*
 			 * After PM_STARTUP, any unexpected exit (including FATAL exit) of
 			 * the startup process is catastrophic, so kill other children,
-			 * and set RecoveryError so we don't try to reinitialize after
-			 * they're gone.  Exception: if FatalError is already set, that
-			 * implies we previously sent the startup process a SIGQUIT, so
+			 * and set StartupStatus so we don't try to reinitialize after
+			 * they're gone.  Exception: if StartupStatus is STARTUP_SIGNALED,
+			 * then we previously sent the startup process a SIGQUIT; so
 			 * that's probably the reason it died, and we do want to try to
 			 * restart in that case.
 			 */
 			if (!EXIT_STATUS_0(exitstatus))
 			{
-				if (!FatalError)
-					RecoveryError = true;
+				if (StartupStatus == STARTUP_SIGNALED)
+					StartupStatus = STARTUP_NOT_RUNNING;
+				else
+					StartupStatus = STARTUP_CRASHED;
 				HandleChildCrash(pid, exitstatus,
 								 _("startup process"));
 				continue;
@@ -2642,6 +2655,7 @@ reaper(SIGNAL_ARGS)
 			/*
 			 * Startup succeeded, commence normal operations
 			 */
+			StartupStatus = STARTUP_NOT_RUNNING;
 			FatalError = false;
 			Assert(AbortStartTime == 0);
 			ReachedNormalRunning = true;
@@ -3190,7 +3204,10 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
 
 	/* Take care of the startup process too */
 	if (pid == StartupPID)
+	{
 		StartupPID = 0;
+		StartupStatus = STARTUP_CRASHED;
+	}
 	else if (StartupPID != 0 && take_action)
 	{
 		ereport(DEBUG2,
@@ -3198,6 +3215,7 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
 				(SendStop ? "SIGSTOP" : "SIGQUIT"),
 				 (int) StartupPID)));
 		signal_child(StartupPID, (SendStop ? SIGSTOP : SIGQUIT));
+		StartupStatus = STARTUP_SIGNALED;
 	}
 
 	/* Take care of the bgwriter too */
@@ -3589,13 +3607,14 @@ PostmasterStateMachine(void)
 	}
 
 	/*
-	 * If recovery failed, or the user does not want an automatic restart
-	 * after backend crashes, wait for all non-syslogger children to exit, and
-	 * then exit postmaster. We don't try to reinitialize when recovery fails,
-	 * because more than likely it will just fail again and we will keep
-	 * trying forever.
+	 * If the startup process failed, or the user does not want an automatic
+	 * restart after backend crashes, wait for all non-syslogger children to
+	 * exit, and then exit postmaster.  We don't try to reinitialize when the
+	 * startup process fails, because more than likely it will just fail again
+	 * and we will keep trying forever.
 	 */
-	if (pmState == PM_NO_CHILDREN && (RecoveryError || !restart_after_crash))
+	if (pmState == PM_NO_CHILDREN &&
+		(StartupStatus == STARTUP_CRASHED || !restart_after_crash))
 		ExitPostmaster(1);
 
 	/*
@@ -3615,6 +3634,7 @@ PostmasterStateMachine(void)
 
 		StartupPID = StartupDataBase();
 		Assert(StartupPID != 0);
+		StartupStatus = STARTUP_RUNNING;
 		pmState = PM_STARTUP;
 		/* crash recovery started, reset SIGKILL flag */
 		AbortStartTime = 0;
@@ -3746,7 +3766,11 @@ TerminateChildren(int signal)
 {
 	SignalChildren(signal);
 	if (StartupPID != 0)
+	{
 		signal_child(StartupPID, signal);
+		if (signal == SIGQUIT || signal == SIGKILL)
+			StartupStatus = STARTUP_SIGNALED;
+	}
 	if (BgWriterPID != 0)
 		signal_child(BgWriterPID, signal);
 	if (CheckpointerPID != 0)