1
0
mirror of https://github.com/postgres/postgres.git synced 2025-10-24 01:29:19 +03:00

Check for too many postmaster children before spawning a bgworker.

The postmaster's code path for spawning a bgworker neglected to check
whether we already have the max number of live child processes.  That's
a bit hard to hit, since it would necessarily be a transient condition;
but if we do, AssignPostmasterChildSlot() fails causing a postmaster
crash, as seen in a report from Bhargav Kamineni.

To fix, invoke canAcceptConnections() in the bgworker code path, as we
do in the other code paths that spawn children.  Since we don't want
the same pmState tests in this case, add a child-process-type parameter
to canAcceptConnections() so that it can know what to do.

Back-patch to 9.5.  In principle the same hazard exists in 9.4, but the
code is enough different that this patch wouldn't quite fix it there.
Given the tiny usage of bgworkers in that branch it doesn't seem worth
creating a variant patch for it.

Discussion: https://postgr.es/m/18733.1570382257@sss.pgh.pa.us
This commit is contained in:
Tom Lane
2019-10-07 12:39:09 -04:00
parent e4d050e5f7
commit 7e8d0eb63f

View File

@@ -409,7 +409,7 @@ static void SendNegotiateProtocolVersion(List *unrecognized_protocol_options);
static void processCancelRequest(Port *port, void *pkt); static void processCancelRequest(Port *port, void *pkt);
static int initMasks(fd_set *rmask); static int initMasks(fd_set *rmask);
static void report_fork_failure_to_client(Port *port, int errnum); static void report_fork_failure_to_client(Port *port, int errnum);
static CAC_state canAcceptConnections(void); static CAC_state canAcceptConnections(int backend_type);
static bool RandomCancelKey(int32 *cancel_key); static bool RandomCancelKey(int32 *cancel_key);
static void signal_child(pid_t pid, int signal); static void signal_child(pid_t pid, int signal);
static bool SignalSomeChildren(int signal, int targets); static bool SignalSomeChildren(int signal, int targets);
@@ -2384,16 +2384,21 @@ processCancelRequest(Port *port, void *pkt)
} }
/* /*
* canAcceptConnections --- check to see if database state allows connections. * canAcceptConnections --- check to see if database state allows connections
* of the specified type. backend_type can be BACKEND_TYPE_NORMAL,
* BACKEND_TYPE_AUTOVAC, or BACKEND_TYPE_BGWORKER. (Note that we don't yet
* know whether a NORMAL connection might turn into a walsender.)
*/ */
static CAC_state static CAC_state
canAcceptConnections(void) canAcceptConnections(int backend_type)
{ {
CAC_state result = CAC_OK; CAC_state result = CAC_OK;
/* /*
* Can't start backends when in startup/shutdown/inconsistent recovery * Can't start backends when in startup/shutdown/inconsistent recovery
* state. * state. We treat autovac workers the same as user backends for this
* purpose. However, bgworkers are excluded from this test; we expect
* bgworker_should_start_now() decided whether the DB state allows them.
* *
* In state PM_WAIT_BACKUP only superusers can connect (this must be * In state PM_WAIT_BACKUP only superusers can connect (this must be
* allowed so that a superuser can end online backup mode); we return * allowed so that a superuser can end online backup mode); we return
@@ -2401,7 +2406,8 @@ canAcceptConnections(void)
* that neither CAC_OK nor CAC_WAITBACKUP can safely be returned until we * that neither CAC_OK nor CAC_WAITBACKUP can safely be returned until we
* have checked for too many children. * have checked for too many children.
*/ */
if (pmState != PM_RUN) if (pmState != PM_RUN &&
backend_type != BACKEND_TYPE_BGWORKER)
{ {
if (pmState == PM_WAIT_BACKUP) if (pmState == PM_WAIT_BACKUP)
result = CAC_WAITBACKUP; /* allow superusers only */ result = CAC_WAITBACKUP; /* allow superusers only */
@@ -2421,7 +2427,7 @@ canAcceptConnections(void)
/* /*
* Don't start too many children. * Don't start too many children.
* *
* We allow more connections than we can have backends here because some * We allow more connections here than we can have backends because some
* might still be authenticating; they might fail auth, or some existing * might still be authenticating; they might fail auth, or some existing
* backend might exit before the auth cycle is completed. The exact * backend might exit before the auth cycle is completed. The exact
* MaxBackends limit is enforced when a new backend tries to join the * MaxBackends limit is enforced when a new backend tries to join the
@@ -4086,7 +4092,7 @@ BackendStartup(Port *port)
bn->cancel_key = MyCancelKey; bn->cancel_key = MyCancelKey;
/* Pass down canAcceptConnections state */ /* Pass down canAcceptConnections state */
port->canAcceptConnections = canAcceptConnections(); port->canAcceptConnections = canAcceptConnections(BACKEND_TYPE_NORMAL);
bn->dead_end = (port->canAcceptConnections != CAC_OK && bn->dead_end = (port->canAcceptConnections != CAC_OK &&
port->canAcceptConnections != CAC_WAITBACKUP); port->canAcceptConnections != CAC_WAITBACKUP);
@@ -5458,7 +5464,7 @@ StartAutovacuumWorker(void)
* we have to check to avoid race-condition problems during DB state * we have to check to avoid race-condition problems during DB state
* changes. * changes.
*/ */
if (canAcceptConnections() == CAC_OK) if (canAcceptConnections(BACKEND_TYPE_AUTOVAC) == CAC_OK)
{ {
/* /*
* Compute the cancel key that will be assigned to this session. We * Compute the cancel key that will be assigned to this session. We
@@ -5703,12 +5709,13 @@ do_start_bgworker(RegisteredBgWorker *rw)
/* /*
* Allocate and assign the Backend element. Note we must do this before * Allocate and assign the Backend element. Note we must do this before
* forking, so that we can handle out of memory properly. * forking, so that we can handle failures (out of memory or child-process
* slots) cleanly.
* *
* Treat failure as though the worker had crashed. That way, the * Treat failure as though the worker had crashed. That way, the
* postmaster will wait a bit before attempting to start it again; if it * postmaster will wait a bit before attempting to start it again; if we
* tried again right away, most likely it'd find itself repeating the * tried again right away, most likely we'd find ourselves hitting the
* out-of-memory or fork failure condition. * same resource-exhaustion condition.
*/ */
if (!assign_backendlist_entry(rw)) if (!assign_backendlist_entry(rw))
{ {
@@ -5834,6 +5841,19 @@ assign_backendlist_entry(RegisteredBgWorker *rw)
{ {
Backend *bn; Backend *bn;
/*
* Check that database state allows another connection. Currently the
* only possible failure is CAC_TOOMANY, so we just log an error message
* based on that rather than checking the error code precisely.
*/
if (canAcceptConnections(BACKEND_TYPE_BGWORKER) != CAC_OK)
{
ereport(LOG,
(errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
errmsg("no slot available for new worker process")));
return false;
}
/* /*
* Compute the cancel key that will be assigned to this session. We * Compute the cancel key that will be assigned to this session. We
* probably don't need cancel keys for background workers, but we'd better * probably don't need cancel keys for background workers, but we'd better