1
0
mirror of https://github.com/quay/quay.git synced 2026-01-29 08:42:15 +03:00
Files
quay/util/metrics/prometheus.py

188 lines
5.5 KiB
Python

import logging
import os
import socket
import sys
import threading
import time
import urllib.error
import urllib.parse
import urllib.request
from collections import namedtuple
from cachetools.func import lru_cache
from flask import g, request
from prometheus_client import REGISTRY, Counter, Gauge, Histogram, push_to_gateway
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# DB connections: pool usage gauges and connect()/close() call counters.
db_pooled_connections_in_use = Gauge(
    "quay_db_pooled_connections_in_use",
    documentation="number of pooled db connections in use",
)
db_pooled_connections_available = Gauge(
    "quay_db_pooled_connections_available",
    documentation="number of pooled db connections available",
)
db_connect_calls = Counter(
    "quay_db_connect_calls",
    documentation="number of connect() calls made to db",
)
db_close_calls = Counter(
    "quay_db_close_calls",
    documentation="number of close() calls made to db",
)


# Request handling: duration histogram labelled by method, route and status.
request_duration = Histogram(
    "quay_request_duration_seconds",
    documentation="seconds taken to process a request",
    labelnames=["method", "route", "status"],
)
# GC: DB table rows (one time series per table name).
gc_table_rows_deleted = Counter(
    "quay_gc_table_rows_deleted",
    documentation="number of table rows deleted by GC",
    labelnames=["table"],
)


# GC: Storage blob
gc_storage_blobs_deleted = Counter(
    "quay_gc_storage_blobs_deleted",
    documentation="number of storage blobs deleted",
)


# GC iterations
gc_repos_purged = Counter(
    "quay_gc_repos_purged",
    documentation="number of repositories purged by the RepositoryGCWorker or NamespaceGCWorker",
)
gc_namespaces_purged = Counter(
    "quay_gc_namespaces_purged",
    documentation="number of namespaces purged by the NamespaceGCWorker",
)
gc_iterations = Counter(
    "quay_gc_iterations",
    documentation="number of iterations by the GCWorker",
)
# Security scanner: duration of index requests, labelled by method, action
# and status.
secscan_request_duration = Histogram(
    "quay_secscan_request_duration_seconds",
    documentation="seconds taken to make an index request to the secscan service",
    labelnames=["method", "action", "status"],
)

# Security scanner: size in bytes of what is submitted for indexing.
secscan_index_layer_size = Histogram(
    "quay_secscan_index_layer_size_bytes",
    documentation="bytes submitted to index to the secscan service",
)
# Positive infinity, used as the final (catch-all) histogram bucket bound.
INF = float("inf")

# Bucket upper bounds in seconds: 60, then every 300 seconds from 300 up to
# and including 2400, then +Inf.
SECSCAN_RESULT_BUCKETS = (60, *range(300, 2400 + 1, 300), INF)
# Security scanner: time between pushing an image and receiving its scan
# results, using the custom SECSCAN_RESULT_BUCKETS bounds instead of the
# library defaults.
secscan_result_duration = Histogram(
    "quay_secscan_result_duration_seconds",
    documentation="how long it takes to receive scan results after pushing an image",
    buckets=SECSCAN_RESULT_BUCKETS,
)
# Seconds the pusher thread sleeps before each push to the pushgateway.
PROMETHEUS_PUSH_INTERVAL_SECONDS = 30

# Seconds in one day; the pusher thread sleeps this long per loop iteration
# when pushing is disabled.
ONE_DAY_IN_SECONDS = 24 * 60 * 60
@lru_cache(maxsize=1)
def process_grouping_key():
    """
    Build the pushgateway grouping key identifying the current process.

    The key holds the hostname, the basename of the last command-line argument
    the process was started with, and the process id as a string. The result
    is memoized by ``lru_cache(maxsize=1)``, so it is computed only once.

    https://github.com/prometheus/client_python#exporting-to-a-pushgateway
    """
    hostname = socket.gethostname()
    process_name = os.path.basename(sys.argv[-1])
    pid = str(os.getpid())
    return dict(host=hostname, process_name=process_name, pid=pid)
class PrometheusPlugin(object):
    """
    Application plugin for reporting metrics to Prometheus.

    When constructed with an app, a ThreadPusher is started for it and kept in
    ``self.state``; otherwise ``self.state`` is ``None``. Attributes that are
    not found on the plugin itself are looked up on ``self.state`` and default
    to ``None`` when missing there as well.
    """

    def __init__(self, app=None):
        """
        Store the app and, if one is given, initialize the plugin for it.
        """
        self.app = app
        self.state = self.init_app(app) if app is not None else None

    def init_app(self, app):
        """
        Start a ThreadPusher for ``app`` and register it as an extension.

        The pusher is stored under ``app.extensions["prometheus"]`` (creating
        the ``extensions`` dict if the app has none) and returned.
        """
        pusher = ThreadPusher(app)
        pusher.start()

        # register extension with app
        app.extensions = getattr(app, "extensions", {})
        app.extensions["prometheus"] = pusher
        return pusher

    def __getattr__(self, name):
        """
        Proxy unknown attributes to ``self.state``, defaulting to ``None``.

        Only called when normal attribute lookup fails. Unresolved dunder
        names raise AttributeError instead of being proxied, so protocol
        probes such as ``hasattr(plugin, "__html__")`` or the lookups done by
        ``copy``/``pickle`` do not receive a bogus ``None``.
        """
        if name.startswith("__") and name.endswith("__"):
            raise AttributeError(name)

        # Read `state` straight from the instance dict: using `self.state`
        # here would re-enter __getattr__ and recurse without bound whenever
        # `state` has not been assigned yet (e.g. an instance created through
        # __new__ without __init__).
        state = self.__dict__.get("state")
        return getattr(state, name, None)
class ThreadPusher(threading.Thread):
    """
    Daemon thread that periodically pushes the metrics registry to a
    Prometheus pushgateway.

    The gateway URL is read from the app config key
    ``PROMETHEUS_PUSHGATEWAY_URL`` and the job name from
    ``PROMETHEUS_NAMESPACE`` (default ``"quay"``).
    """

    def __init__(self, app):
        """
        Create the (not yet started) daemon thread for ``app``.
        """
        super(ThreadPusher, self).__init__()
        self.daemon = True
        self._app = app

    def run(self):
        """
        Push the registry every PROMETHEUS_PUSH_INTERVAL_SECONDS, forever.

        When no gateway URL is configured, or the ``TEST`` environment variable
        is ``"true"``, the thread only sleeps. A failed push never terminates
        the loop: transport errors are ignored (logged only when the
        ``DEBUGLOG`` environment variable is ``"true"``) and any other
        exception is logged before the next iteration.
        """
        # Imported here because http.client is not among the imports visible
        # at the top of this file.
        import http.client

        # Failures that just mean "the gateway could not be reached or did not
        # answer properly". URLError is already an OSError; it is listed
        # explicitly to keep the original intent visible.
        transport_errors = (urllib.error.URLError, OSError, http.client.HTTPException)

        agg_url = self._app.config.get("PROMETHEUS_PUSHGATEWAY_URL")
        while True:
            # Practically disable this worker, if there is no pushgateway.
            if agg_url is None or os.getenv("TEST", "false").lower() == "true":
                time.sleep(ONE_DAY_IN_SECONDS)
                continue

            time.sleep(PROMETHEUS_PUSH_INTERVAL_SECONDS)
            try:
                push_to_gateway(
                    agg_url,
                    job=self._app.config.get("PROMETHEUS_NAMESPACE", "quay"),
                    registry=REGISTRY,
                    grouping_key=process_grouping_key(),
                )
                logger.debug(
                    "pushed registry to pushgateway at %s with grouping key %s",
                    agg_url,
                    process_grouping_key(),
                )
            except transport_errors:
                # There are many scenarios when the gateway might not be running.
                # These could be testing scenarios or simply processes racing to start.
                # Rather than try to guess all of them, keep it simple and let it fail.
                if os.getenv("DEBUGLOG", "false").lower() == "true":
                    logger.exception(
                        "failed to push registry to pushgateway at %s with grouping key %s",
                        agg_url,
                        process_grouping_key(),
                    )
            except Exception:
                # Thread boundary: an uncaught exception here would end run()
                # and silently stop all further pushes for this process, so
                # log it and keep going. The grouping key is deliberately not
                # recomputed here, since computing it may be what failed.
                logger.exception(
                    "unexpected error while pushing registry to pushgateway at %s",
                    agg_url,
                )
def timed_blueprint(bp):
    """
    Register request hooks on a blueprint so Prometheus tracks how long its
    requests take.

    A before-request hook stores the start time on ``g``; an after-request
    hook observes the elapsed seconds on ``request_duration``, labelled with
    the request method, the request endpoint and the response status code.
    The same blueprint is returned.
    """

    def _record_start():
        g._request_start_time = time.time()

    def _observe_duration(response):
        started_at = getattr(g, "_request_start_time", None)
        # No start time means the before-request hook did not run for this
        # request; pass the response through without observing anything.
        if started_at is not None:
            elapsed = time.time() - started_at
            request_duration.labels(
                request.method, request.endpoint, response.status_code
            ).observe(elapsed)
        return response

    bp.before_request(_record_start)
    bp.after_request(_observe_duration)
    return bp