mirror of
https://github.com/quay/quay.git
synced 2025-04-18 10:44:06 +03:00
health: Add statement timeout to health check (PROJQUAY-7950) (#3228)
* health: Add statement timeout to health check (PROJQUAY-7950) Currently, our query to check team roles (part of the db health check) does not time out after a certain period but runs indefinitely. This causes worker timeouts after 30 seconds, which results in the worker being killed by the master gunicorn process. We limit the maximum execution time of the query to 5000 ms so that a proper exception is raised if that timeout is reached. PostgreSQL logs: ~~~ 2024-09-16 09:38:56.431 EDT [115775] testuser@quayclone2 LOG: duration: 0.011 ms statement: BEGIN 2024-09-16 09:38:56.431 EDT [115775] testuser@quayclone2 LOG: duration: 0.034 ms statement: SET statement_timeout=5000 2024-09-16 09:38:56.431 EDT [115775] testuser@quayclone2 LOG: duration: 0.010 ms statement: COMMIT 2024-09-16 09:38:56.432 EDT [115775] testuser@quayclone2 LOG: duration: 0.004 ms statement: BEGIN 2024-09-16 09:38:56.432 EDT [115775] testuser@quayclone2 LOG: duration: 0.300 ms statement: SELECT "t1"."id", "t1"."name" FROM "teamrole" AS "t1" LIMIT 1 2024-09-16 09:38:56.433 EDT [115775] testuser@quayclone2 LOG: duration: 0.010 ms statement: COMMIT 2024-09-16 09:38:56.433 EDT [115775] testuser@quayclone2 LOG: duration: 0.005 ms statement: BEGIN 2024-09-16 09:38:56.433 EDT [115775] testuser@quayclone2 LOG: duration: 0.012 ms statement: SET statement_timeout=0 2024-09-16 09:38:56.433 EDT [115775] testuser@quayclone2 LOG: duration: 0.006 ms statement: COMMIT ~~~ Quay logs: ~~~ gunicorn-web stdout | 2024-09-16 13:38:56,412 [287] [DEBUG] [peewee.pool] Created new connection 127610088683136. gunicorn-web stdout | 2024-09-16 13:38:56,417 [287] [DEBUG] [data.model.health] Validating database connection. 
gunicorn-web stdout | 2024-09-16 13:38:56,418 [287] [INFO] [data.database] Connection pooling disabled for postgresql gunicorn-web stdout | 2024-09-16 13:38:56,431 [287] [DEBUG] [peewee] ('SET statement_timeout=%s', (5000,)) gunicorn-web stdout | 2024-09-16 13:38:56,431 [287] [DEBUG] [data.model.health] Checking for existence of team roles, timeout 5000 ms. gunicorn-web stdout | 2024-09-16 13:38:56,432 [287] [DEBUG] [peewee] ('SELECT "t1"."id", "t1"."name" FROM "teamrole" AS "t1" LIMIT %s', [1]) gunicorn-web stdout | 2024-09-16 13:38:56,433 [287] [DEBUG] [peewee] ('SET statement_timeout=0', None) gunicorn-web stdout | 2024-09-16 13:38:56,434 [287] [DEBUG] [app] Ending request: urn:request:d039265b-414e-4d03-b29f-3e481286bf0f (/health/instance)... ~~~ * Fix generator function
This commit is contained in:
parent
6da65c5003
commit
84249a153b
@ -1,16 +1,36 @@
|
||||
import logging
|
||||
from contextlib import contextmanager
|
||||
|
||||
from data.database import TeamRole, validate_database_url
|
||||
from data.database import TeamRole, db, validate_database_url
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@contextmanager
def sql_timeout(app_config, database, timeout):
    """
    Context manager that bounds statement execution time while its body runs.

    When ``app_config["DB_URI"]`` contains ``"postgresql"``, a
    ``SET statement_timeout`` is issued with ``timeout`` before the body runs,
    and the setting is put back to ``0`` (PostgreSQL's "no timeout" value) on
    exit, whether or not the body raised. For any other DB_URI no statement is
    issued and the body runs without a limit.

    :param app_config: mapping that provides the ``DB_URI`` connection string.
    :param database: object exposing ``execute_sql(sql, params)``; it is also
        the value yielded to the ``with`` block.
    :param timeout: statement timeout in milliseconds.
    """
    # statement_timeout is only applied for PostgreSQL; other backends are
    # passed through untouched.
    if "postgresql" not in app_config["DB_URI"]:
        logger.debug("Checking for existence of team roles.")
        yield database
        return

    # Log the timeout actually in effect rather than a hard-coded value.
    logger.debug("Checking for existence of team roles, timeout %s ms.", timeout)
    database.execute_sql("SET statement_timeout=%s;", (timeout,))
    try:
        yield database
    finally:
        # Always lift the limit again so later statements issued through this
        # database object are not affected by the health-check timeout.
        database.execute_sql("SET statement_timeout=%s;", (0,))
|
||||
|
||||
|
||||
def check_health(app_config):
|
||||
# Attempt to connect to the database first. If the DB is not responding,
|
||||
# using the validate_database_url will timeout quickly, as opposed to
|
||||
# making a normal connect which will just hang (thus breaking the health
|
||||
# check).
|
||||
try:
|
||||
logger.debug("Validating database connection.")
|
||||
validate_database_url(
|
||||
app_config["DB_URI"], app_config["DB_CONNECTION_ARGS"], connect_timeout=3
|
||||
)
|
||||
@ -19,7 +39,8 @@ def check_health(app_config):
|
||||
|
||||
# We will connect to the db, check that it contains some team role kinds
|
||||
try:
|
||||
okay = bool(list(TeamRole.select().limit(1)))
|
||||
return (okay, "Could not connect to the database" if not okay else None)
|
||||
with sql_timeout(app_config, db, 5000):
|
||||
okay = bool(list(TeamRole.select().limit(1)))
|
||||
return (okay, "Could not execute query, timeout reached" if not okay else None)
|
||||
except Exception as ex:
|
||||
return (False, "Could not connect to the database: %s" % str(ex))
|
||||
|
Loading…
x
Reference in New Issue
Block a user