diff --git a/example/otel/README.md b/example/otel/README.md index f2883207..5d1431e0 100644 --- a/example/otel/README.md +++ b/example/otel/README.md @@ -40,9 +40,25 @@ UPTRACE_DSN=http://project2_secret_token@localhost:14317/2 go run client.go trace: http://localhost:14318/traces/ee029d8782242c8ed38b16d961093b35 ``` +![Redis trace](./image/redis-trace.png) + You can also open Uptrace UI at [http://localhost:14318](http://localhost:14318) to view available spans, logs, and metrics. +## Redis monitoring + +You can also [monitor Redis performance](https://uptrace.dev/opentelemetry/redis-monitoring.html) +metrics by installing OpenTelemetry Collector. + +[OpenTelemetry Collector](https://uptrace.dev/opentelemetry/collector.html) is an agent that pulls +telemetry data from systems you want to monitor and sends it to APM tools using the OpenTelemetry +protocol (OTLP). + +When telemetry data reaches Uptrace, it automatically generates a Redis dashboard from a pre-defined +template. + +![Redis dashboard](./image/metrics.png) + ## Links - [Uptrace open-source APM](https://uptrace.dev/get/open-source-apm.html) diff --git a/example/otel/config/alertmanager.yml b/example/otel/config/alertmanager.yml new file mode 100644 index 00000000..9ea790db --- /dev/null +++ b/example/otel/config/alertmanager.yml @@ -0,0 +1,53 @@ +# See https://prometheus.io/docs/alerting/latest/configuration/ for details. + +global: + # The smarthost and SMTP sender used for mail notifications. + smtp_smarthost: 'mailhog:1025' + smtp_from: 'alertmanager@example.com' + smtp_require_tls: false + +receivers: + - name: 'team-X' + email_configs: + - to: 'some-receiver@example.com' + send_resolved: true + +# The root route on which each incoming alert enters. +route: + # The labels by which incoming alerts are grouped together. For example, + # multiple alerts coming in for cluster=A and alertname=LatencyHigh would + # be batched into a single group. 
+ group_by: ['alertname', 'cluster', 'service'] + + # When a new group of alerts is created by an incoming alert, wait at + # least 'group_wait' to send the initial notification. + # This ensures that multiple alerts for the same group that start + # firing shortly after one another are batched together on the first + # notification. + group_wait: 30s + + # When the first notification was sent, wait 'group_interval' to send a batch + # of new alerts that started firing for that group. + group_interval: 5m + + # If an alert has successfully been sent, wait 'repeat_interval' to + # resend them. + repeat_interval: 3h + + # A default receiver + receiver: team-X + + # All the above attributes are inherited by all child routes and can + # be overwritten on each. + + # The child route trees. + routes: + # This route matches error alerts created from spans or logs. + - matchers: + - alert_kind="error" + group_interval: 24h + receiver: team-X + +# The directory from which notification templates are read. 
+templates: + - '/etc/alertmanager/template/*.tmpl' diff --git a/example/otel/otel-collector.yaml b/example/otel/config/otel-collector.yaml similarity index 100% rename from example/otel/otel-collector.yaml rename to example/otel/config/otel-collector.yaml diff --git a/example/otel/vector.toml b/example/otel/config/vector.toml similarity index 100% rename from example/otel/vector.toml rename to example/otel/config/vector.toml diff --git a/example/otel/docker-compose.yml b/example/otel/docker-compose.yml index d4c5a695..786f6e1a 100644 --- a/example/otel/docker-compose.yml +++ b/example/otel/docker-compose.yml @@ -18,7 +18,7 @@ services: - '9000:9000' uptrace: - image: 'uptrace/uptrace:1.1.0' + image: 'uptrace/uptrace:1.2.0' #image: 'uptrace/uptrace-dev:latest' restart: on-failure volumes: @@ -36,11 +36,8 @@ services: otel-collector: image: otel/opentelemetry-collector-contrib:0.58.0 restart: on-failure - user: '0:0' # required for logs volumes: - - ./otel-collector.yaml:/etc/otelcol-contrib/config.yaml - - /var/lib/docker/containers:/var/lib/docker/containers:ro - - /var/log:/var/log:ro + - ./config/otel-collector.yaml:/etc/otelcol-contrib/config.yaml ports: - '4317:4317' - '4318:4318' @@ -48,7 +45,25 @@ services: vector: image: timberio/vector:0.24.X-alpine volumes: - - ./vector.toml:/etc/vector/vector.toml:ro + - ./config/vector.toml:/etc/vector/vector.toml:ro + + alertmanager: + image: prom/alertmanager:v0.24.0 + restart: on-failure + volumes: + - ./config/alertmanager.yml:/etc/alertmanager/config.yml + - alertmanager_data:/alertmanager + ports: + - 9093:9093 + command: + - '--config.file=/etc/alertmanager/config.yml' + - '--storage.path=/alertmanager' + + mailhog: + image: mailhog/mailhog:v1.0.1 + restart: on-failure + ports: + - '8025:8025' redis-server: image: redis diff --git a/example/otel/image/metrics.png b/example/otel/image/metrics.png new file mode 100644 index 00000000..7c2beb4d Binary files /dev/null and b/example/otel/image/metrics.png differ diff 
--git a/example/otel/image/redis-trace.png b/example/otel/image/redis-trace.png new file mode 100644 index 00000000..4f7115e1 Binary files /dev/null and b/example/otel/image/redis-trace.png differ diff --git a/example/otel/uptrace.yml b/example/otel/uptrace.yml index 7c21e54f..9116d42d 100644 --- a/example/otel/uptrace.yml +++ b/example/otel/uptrace.yml @@ -13,6 +13,16 @@ ## foo: $$FOO_BAR ## +## +## ClickHouse database credentials. +## +ch: + # Connection string for ClickHouse database. For example: + # clickhouse://:@:/?sslmode=disable + # + # See https://clickhouse.uptrace.dev/guide/golang-clickhouse.html#options + dsn: 'clickhouse://default:@clickhouse:9000/uptrace?sslmode=disable' + ## ## A list of pre-configured projects. Each project is fully isolated. ## @@ -26,6 +36,10 @@ projects: - service.name - host.name - deployment.environment + # Group spans by deployment.environment attribute. + group_by_env: false + # Group funcs spans by service.name attribute. + group_funcs_by_service: false # Other projects can be used to monitor your applications. # To monitor micro-services or multiple related services, use a single project. @@ -36,6 +50,49 @@ projects: - service.name - host.name - deployment.environment + # Group spans by deployment.environment attribute. + group_by_env: false + # Group funcs spans by service.name attribute. + group_funcs_by_service: false + +## +## Create metrics from spans and events. 
+## +metrics_from_spans: + - name: uptrace.tracing.spans_duration + description: Spans duration (excluding events) + instrument: histogram + unit: microseconds + value: span.duration / 1000 + attrs: + - span.system as system + - service.name as service + - host.name as host + - span.status_code as status + where: not span.is_event + + - name: uptrace.tracing.spans + description: Spans count (excluding events) + instrument: counter + unit: 1 + value: span.count + attrs: + - span.system as system + - service.name as service + - host.name as host + - span.status_code as status + where: not span.is_event + + - name: uptrace.tracing.events + description: Events count (excluding spans) + instrument: counter + unit: 1 + value: span.count + attrs: + - span.system as system + - service.name as service + - host.name as host + where: span.is_event ## ## To require authentication, uncomment the following section. @@ -78,16 +135,6 @@ auth: # # Defaults to 'preferred_username'. # claim: preferred_username -## -## ClickHouse database credentials. -## -ch: - # Connection string for ClickHouse database. For example: - # clickhouse://:@:/?sslmode=disable - # - # See https://clickhouse.uptrace.dev/guide/golang-clickhouse.html#options - dsn: 'clickhouse://default:@clickhouse:9000/uptrace?sslmode=disable' - ## ## Alerting rules for monitoring metrics. 
## @@ -102,8 +149,8 @@ alerting: - $net_errors > 0 group by host.name # for the last 5 minutes for: 5m - # in the project id=1 - projects: [1] + annotations: + summary: '{{ $labels.host_name }} has high number of net errors: {{ $values.net_errors }}' - name: Filesystem usage >= 90% metrics: @@ -114,7 +161,8 @@ alerting: - where device !~ "loop" - $fs_usage{state="used"} / $fs_usage >= 0.9 for: 5m - projects: [1] + annotations: + summary: '{{ $labels.host_name }} has high FS usage: {{ $values.fs_usage }}' - name: Uptrace is dropping spans metrics: @@ -122,7 +170,17 @@ alerting: query: - $spans{type=dropped} > 0 for: 1m - projects: [1] + annotations: + summary: 'Uptrace has dropped {{ $values.spans }} spans' + + - name: Always firing (for fun and testing) + metrics: + - process.runtime.go.goroutines as $goroutines + query: + - $goroutines >= 0 group by host.name + for: 1m + annotations: + summary: '{{ $labels.host_name }} has high number of goroutines: {{ $values.goroutines }}' # Create alerts from error logs and span events. create_alerts_from_spans: @@ -139,8 +197,8 @@ alerting: ## alertmanager_client: # AlertManager API endpoints that Uptrace uses to manage alerts. - # urls: - # - 'http://alertmanager:9093/api/v2/alerts' + urls: + - 'http://alertmanager:9093/api/v2/alerts' ## ## Various options to tweak ClickHouse schema.