diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index bd50ea8e480..24b1624bad1 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -11500,6 +11500,62 @@ LOG: CleanUpLock: deleting: lock(0xb7acd844) id(24688,24696,0,0,0,1)
+
+ send_abort_for_crash (boolean)
+
+ send_abort_for_crash configuration parameter
+
+
+
+
+ By default, after a backend crash the postmaster will stop remaining
+ child processes by sending them SIGQUIT
+ signals, which permits them to exit more-or-less gracefully. When
+ this option is set to on,
+ SIGABRT is sent instead. That normally
+ results in production of a core dump file for each such child
+ process.
+ This can be handy for investigating the states of other processes
+ after a crash. It can also consume lots of disk space in the event
+ of repeated crashes, so do not enable this on systems you are not
+ monitoring carefully.
+ Beware that no support exists for cleaning up the core file(s)
+ automatically.
+ This parameter can only be set in
+ the postgresql.conf file or on the server
+ command line.
+
+
+
+
+
+ send_abort_for_kill (boolean)
+
+ send_abort_for_kill configuration parameter
+
+
+
+
+ By default, after attempting to stop a child process with
+ SIGQUIT, the postmaster will wait five
+ seconds and then send SIGKILL to force
+ immediate termination. When this option is set
+ to on, SIGABRT is sent
+ instead of SIGKILL. That normally results
+ in production of a core dump file for each such child process.
+ This can be handy for investigating the states
+ of stuck child processes. It can also consume lots
+ of disk space in the event of repeated crashes, so do not enable
+ this on systems you are not monitoring carefully.
+ Beware that no support exists for cleaning up the core file(s)
+ automatically.
+ This parameter can only be set in
+ the postgresql.conf file or on the server
+ command line.
+
+
+
+
diff --git a/doc/src/sgml/ref/postgres-ref.sgml b/doc/src/sgml/ref/postgres-ref.sgml
index 55a3f6c69d1..b13a16a117f 100644
--- a/doc/src/sgml/ref/postgres-ref.sgml
+++ b/doc/src/sgml/ref/postgres-ref.sgml
@@ -409,24 +409,6 @@ PostgreSQL documentation
-
-
-
-
- This option is for debugging problems that cause a server
- process to die abnormally. The ordinary strategy in this
- situation is to notify all other server processes that they
- must terminate and then reinitialize the shared memory and
- semaphores. This is because an errant server process could
- have corrupted some shared state before terminating. This
- option specifies that postgres will
- not reinitialize shared data structures. A knowledgeable
- system programmer can then use a debugger to examine shared
- memory and semaphore state.
-
-
-
-
@@ -466,14 +448,9 @@ PostgreSQL documentation
This option is for debugging problems that cause a server
process to die abnormally. The ordinary strategy in this
situation is to notify all other server processes that they
- must terminate and then reinitialize the shared memory and
- semaphores. This is because an errant server process could
- have corrupted some shared state before terminating. This
- option specifies that postgres will
- stop all other server processes by sending the signal
- SIGSTOP, but will not cause them to
- terminate. This permits system programmers to collect core
- dumps from all server processes by hand.
+ must terminate, by sending them SIGQUIT
+ signals. With this option, SIGABRT
+ will be sent instead, resulting in production of core dump files.
diff --git a/src/backend/main/main.c b/src/backend/main/main.c
index f5da4260a13..67f556b4dd8 100644
--- a/src/backend/main/main.c
+++ b/src/backend/main/main.c
@@ -352,11 +352,10 @@ help(const char *progname)
printf(_("\nDeveloper options:\n"));
printf(_(" -f s|i|o|b|t|n|m|h forbid use of some plan types\n"));
- printf(_(" -n do not reinitialize shared memory after abnormal exit\n"));
printf(_(" -O allow system table structure changes\n"));
printf(_(" -P disable system indexes\n"));
printf(_(" -t pa|pl|ex show timings after each query\n"));
- printf(_(" -T send SIGSTOP to all backend processes if one dies\n"));
+ printf(_(" -T send SIGABRT to all backend processes if one dies\n"));
printf(_(" -W NUM wait NUM seconds to allow attach from a debugger\n"));
printf(_("\nOptions for single-user mode:\n"));
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 1da5752047f..c83cc8cc6cd 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -219,16 +219,6 @@ int ReservedBackends;
#define MAXLISTEN 64
static pgsocket ListenSocket[MAXLISTEN];
-/*
- * These globals control the behavior of the postmaster in case some
- * backend dumps core. Normally, it kills all peers of the dead backend
- * and reinitializes shared memory. By specifying -s or -n, we can have
- * the postmaster stop (rather than kill) peers and not reinitialize
- * shared data structures. (Reinit is currently dead code, though.)
- */
-static bool Reinit = true;
-static int SendStop = false;
-
/* still more option variables */
bool EnableSSL = false;
@@ -243,6 +233,8 @@ bool enable_bonjour = false;
char *bonjour_name;
bool restart_after_crash = true;
bool remove_temp_files_after_crash = true;
+bool send_abort_for_crash = false;
+bool send_abort_for_kill = false;
/* PIDs of special child processes; 0 when not running */
static pid_t StartupPID = 0,
@@ -414,6 +406,7 @@ static void report_fork_failure_to_client(Port *port, int errnum);
static CAC_state canAcceptConnections(int backend_type);
static bool RandomCancelKey(int32 *cancel_key);
static void signal_child(pid_t pid, int signal);
+static void sigquit_child(pid_t pid);
static bool SignalSomeChildren(int signal, int target);
static void TerminateChildren(int signal);
@@ -697,7 +690,7 @@ PostmasterMain(int argc, char *argv[])
* tcop/postgres.c (the option sets should not conflict) and with the
* common help() function in main/main.c.
*/
- while ((opt = getopt(argc, argv, "B:bc:C:D:d:EeFf:h:ijk:lN:nOPp:r:S:sTt:W:-:")) != -1)
+ while ((opt = getopt(argc, argv, "B:bc:C:D:d:EeFf:h:ijk:lN:OPp:r:S:sTt:W:-:")) != -1)
{
switch (opt)
{
@@ -767,11 +760,6 @@ PostmasterMain(int argc, char *argv[])
SetConfigOption("max_connections", optarg, PGC_POSTMASTER, PGC_S_ARGV);
break;
- case 'n':
- /* Don't reinit shared mem after abnormal exit */
- Reinit = false;
- break;
-
case 'O':
SetConfigOption("allow_system_table_mods", "true", PGC_POSTMASTER, PGC_S_ARGV);
break;
@@ -799,11 +787,10 @@ PostmasterMain(int argc, char *argv[])
case 'T':
/*
- * In the event that some backend dumps core, send SIGSTOP,
- * rather than SIGQUIT, to all its peers. This lets the wily
- * post_hacker collect core dumps from everyone.
+ * This option used to be defined as sending SIGSTOP after a
+ * backend crash, but sending SIGABRT seems more useful.
*/
- SendStop = true;
+ SetConfigOption("send_abort_for_crash", "true", PGC_POSTMASTER, PGC_S_ARGV);
break;
case 't':
@@ -1896,20 +1883,23 @@ ServerLoop(void)
/*
* If we already sent SIGQUIT to children and they are slow to shut
- * down, it's time to send them SIGKILL. This doesn't happen
- * normally, but under certain conditions backends can get stuck while
- * shutting down. This is a last measure to get them unwedged.
+ * down, it's time to send them SIGKILL (or SIGABRT if requested).
+ * This doesn't happen normally, but under certain conditions backends
+ * can get stuck while shutting down. This is a last measure to get
+ * them unwedged.
*
* Note we also do this during recovery from a process crash.
*/
- if ((Shutdown >= ImmediateShutdown || (FatalError && !SendStop)) &&
+ if ((Shutdown >= ImmediateShutdown || FatalError) &&
AbortStartTime != 0 &&
(now - AbortStartTime) >= SIGKILL_CHILDREN_AFTER_SECS)
{
/* We were gentle with them before. Not anymore */
ereport(LOG,
- (errmsg("issuing SIGKILL to recalcitrant children")));
- TerminateChildren(SIGKILL);
+ /* translator: %s is SIGKILL or SIGABRT */
+ (errmsg("issuing %s to recalcitrant children",
+ send_abort_for_kill ? "SIGABRT" : "SIGKILL")));
+ TerminateChildren(send_abort_for_kill ? SIGABRT : SIGKILL);
/* reset flag so we don't SIGKILL again */
AbortStartTime = 0;
}
@@ -2903,6 +2893,7 @@ pmdie(SIGNAL_ARGS)
#endif
/* tell children to shut down ASAP */
+ /* (note we don't apply send_abort_for_crash here) */
SetQuitSignalReason(PMQUIT_FOR_STOP);
TerminateChildren(SIGQUIT);
pmState = PM_WAIT_BACKENDS;
@@ -3470,20 +3461,9 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
/*
* This worker is still alive. Unless we did so already, tell it
* to commit hara-kiri.
- *
- * SIGQUIT is the special signal that says exit without proc_exit
- * and let the user know what's going on. But if SendStop is set
- * (-T on command line), then we send SIGSTOP instead, so that we
- * can get core dumps from all backends by hand.
*/
if (take_action)
- {
- ereport(DEBUG2,
- (errmsg_internal("sending %s to process %d",
- (SendStop ? "SIGSTOP" : "SIGQUIT"),
- (int) rw->rw_pid)));
- signal_child(rw->rw_pid, (SendStop ? SIGSTOP : SIGQUIT));
- }
+ sigquit_child(rw->rw_pid);
}
}
@@ -3514,13 +3494,8 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
* This backend is still alive. Unless we did so already, tell it
* to commit hara-kiri.
*
- * SIGQUIT is the special signal that says exit without proc_exit
- * and let the user know what's going on. But if SendStop is set
- * (-T on command line), then we send SIGSTOP instead, so that we
- * can get core dumps from all backends by hand.
- *
- * We could exclude dead_end children here, but at least in the
- * SIGSTOP case it seems better to include them.
+ * We could exclude dead_end children here, but at least when
+ * sending SIGABRT it seems better to include them.
*
* Background workers were already processed above; ignore them
* here.
@@ -3529,13 +3504,7 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
continue;
if (take_action)
- {
- ereport(DEBUG2,
- (errmsg_internal("sending %s to process %d",
- (SendStop ? "SIGSTOP" : "SIGQUIT"),
- (int) bp->pid)));
- signal_child(bp->pid, (SendStop ? SIGSTOP : SIGQUIT));
- }
+ sigquit_child(bp->pid);
}
}
@@ -3547,11 +3516,7 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
}
else if (StartupPID != 0 && take_action)
{
- ereport(DEBUG2,
- (errmsg_internal("sending %s to process %d",
- (SendStop ? "SIGSTOP" : "SIGQUIT"),
- (int) StartupPID)));
- signal_child(StartupPID, (SendStop ? SIGSTOP : SIGQUIT));
+ sigquit_child(StartupPID);
StartupStatus = STARTUP_SIGNALED;
}
@@ -3559,73 +3524,37 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
if (pid == BgWriterPID)
BgWriterPID = 0;
else if (BgWriterPID != 0 && take_action)
- {
- ereport(DEBUG2,
- (errmsg_internal("sending %s to process %d",
- (SendStop ? "SIGSTOP" : "SIGQUIT"),
- (int) BgWriterPID)));
- signal_child(BgWriterPID, (SendStop ? SIGSTOP : SIGQUIT));
- }
+ sigquit_child(BgWriterPID);
/* Take care of the checkpointer too */
if (pid == CheckpointerPID)
CheckpointerPID = 0;
else if (CheckpointerPID != 0 && take_action)
- {
- ereport(DEBUG2,
- (errmsg_internal("sending %s to process %d",
- (SendStop ? "SIGSTOP" : "SIGQUIT"),
- (int) CheckpointerPID)));
- signal_child(CheckpointerPID, (SendStop ? SIGSTOP : SIGQUIT));
- }
+ sigquit_child(CheckpointerPID);
/* Take care of the walwriter too */
if (pid == WalWriterPID)
WalWriterPID = 0;
else if (WalWriterPID != 0 && take_action)
- {
- ereport(DEBUG2,
- (errmsg_internal("sending %s to process %d",
- (SendStop ? "SIGSTOP" : "SIGQUIT"),
- (int) WalWriterPID)));
- signal_child(WalWriterPID, (SendStop ? SIGSTOP : SIGQUIT));
- }
+ sigquit_child(WalWriterPID);
/* Take care of the walreceiver too */
if (pid == WalReceiverPID)
WalReceiverPID = 0;
else if (WalReceiverPID != 0 && take_action)
- {
- ereport(DEBUG2,
- (errmsg_internal("sending %s to process %d",
- (SendStop ? "SIGSTOP" : "SIGQUIT"),
- (int) WalReceiverPID)));
- signal_child(WalReceiverPID, (SendStop ? SIGSTOP : SIGQUIT));
- }
+ sigquit_child(WalReceiverPID);
/* Take care of the autovacuum launcher too */
if (pid == AutoVacPID)
AutoVacPID = 0;
else if (AutoVacPID != 0 && take_action)
- {
- ereport(DEBUG2,
- (errmsg_internal("sending %s to process %d",
- (SendStop ? "SIGSTOP" : "SIGQUIT"),
- (int) AutoVacPID)));
- signal_child(AutoVacPID, (SendStop ? SIGSTOP : SIGQUIT));
- }
+ sigquit_child(AutoVacPID);
/* Take care of the archiver too */
if (pid == PgArchPID)
PgArchPID = 0;
else if (PgArchPID != 0 && take_action)
- {
- ereport(DEBUG2,
- (errmsg_internal("sending %s to process %d",
- (SendStop ? "SIGSTOP" : "SIGQUIT"),
- (int) PgArchPID)));
- signal_child(PgArchPID, (SendStop ? SIGSTOP : SIGQUIT));
- }
+ sigquit_child(PgArchPID);
/* We do NOT restart the syslogger */
@@ -3834,6 +3763,10 @@ PostmasterStateMachine(void)
* Any required cleanup will happen at next restart. We
* set FatalError so that an "abnormal shutdown" message
* gets logged when we exit.
+ *
+ * We don't consult send_abort_for_crash here, as it's
+ * unlikely that dumping cores would illuminate the reason
+ * for checkpointer fork failure.
*/
FatalError = true;
pmState = PM_WAIT_DEAD_END;
@@ -4003,8 +3936,8 @@ signal_child(pid_t pid, int signal)
case SIGINT:
case SIGTERM:
case SIGQUIT:
- case SIGSTOP:
case SIGKILL:
+ case SIGABRT:
if (kill(-pid, signal) < 0)
elog(DEBUG3, "kill(%ld,%d) failed: %m", (long) (-pid), signal);
break;
@@ -4014,6 +3947,24 @@ signal_child(pid_t pid, int signal)
#endif
}
+/*
+ * Convenience function for killing a child process after a crash of some
+ * other child process. We log the action at a higher level than we would
+ * otherwise do, and we apply send_abort_for_crash to decide which signal
+ * to send. Normally it's SIGQUIT -- and most other comments in this file
+ * are written on the assumption that it is -- but developers might prefer
+ * to use SIGABRT to collect per-child core dumps.
+ */
+static void
+sigquit_child(pid_t pid)
+{
+ ereport(DEBUG2,
+ (errmsg_internal("sending %s to process %d",
+ (send_abort_for_crash ? "SIGABRT" : "SIGQUIT"),
+ (int) pid)));
+ signal_child(pid, (send_abort_for_crash ? SIGABRT : SIGQUIT));
+}
+
/*
* Send a signal to the targeted children (but NOT special children;
* dead_end children are never signaled, either).
@@ -4069,7 +4020,7 @@ TerminateChildren(int signal)
if (StartupPID != 0)
{
signal_child(StartupPID, signal);
- if (signal == SIGQUIT || signal == SIGKILL)
+ if (signal == SIGQUIT || signal == SIGKILL || signal == SIGABRT)
StartupStatus = STARTUP_SIGNALED;
}
if (BgWriterPID != 0)
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 836b49484a1..349dd6a5379 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -1227,6 +1227,26 @@ struct config_bool ConfigureNamesBool[] =
true,
NULL, NULL, NULL
},
+ {
+ {"send_abort_for_crash", PGC_SIGHUP, DEVELOPER_OPTIONS,
+ gettext_noop("Send SIGABRT not SIGQUIT to child processes after backend crash."),
+ NULL,
+ GUC_NOT_IN_SAMPLE
+ },
+ &send_abort_for_crash,
+ false,
+ NULL, NULL, NULL
+ },
+ {
+ {"send_abort_for_kill", PGC_SIGHUP, DEVELOPER_OPTIONS,
+ gettext_noop("Send SIGABRT not SIGKILL to stuck child processes."),
+ NULL,
+ GUC_NOT_IN_SAMPLE
+ },
+ &send_abort_for_kill,
+ false,
+ NULL, NULL, NULL
+ },
{
{"log_duration", PGC_SUSET, LOGGING_WHAT,
diff --git a/src/include/postmaster/postmaster.h b/src/include/postmaster/postmaster.h
index 90e333ccd20..f078321df24 100644
--- a/src/include/postmaster/postmaster.h
+++ b/src/include/postmaster/postmaster.h
@@ -30,6 +30,8 @@ extern PGDLLIMPORT bool enable_bonjour;
extern PGDLLIMPORT char *bonjour_name;
extern PGDLLIMPORT bool restart_after_crash;
extern PGDLLIMPORT bool remove_temp_files_after_crash;
+extern PGDLLIMPORT bool send_abort_for_crash;
+extern PGDLLIMPORT bool send_abort_for_kill;
#ifdef WIN32
extern PGDLLIMPORT HANDLE PostmasterHandle;