From 4a48182677cd09a912275e22d880d9b342377db3 Mon Sep 17 00:00:00 2001 From: Till JS Date: Sun, 29 Mar 2026 19:22:44 +0200 Subject: [PATCH] feat(monitoring): integrate Promtail for centralized log collection via Loki Loki was already running but had no log shipper. Adds Promtail to collect Docker logs from all 66 containers with automatic tier labeling (infra, auth, core, app, matrix, games) and a Grafana Logs Explorer dashboard. Co-Authored-By: Claude Opus 4.6 (1M context) --- docker-compose.macmini.yml | 19 + docker/grafana/dashboards/logs-explorer.json | 334 ++++++++++++++++++ .../grafana/provisioning/datasources/loki.yml | 7 + docker/promtail/config.yaml | 109 ++++++ scripts/mac-mini/build-app.sh | 2 + 5 files changed, 471 insertions(+) create mode 100644 docker/grafana/dashboards/logs-explorer.json create mode 100644 docker/promtail/config.yaml diff --git a/docker-compose.macmini.yml b/docker-compose.macmini.yml index 1178ef9e2..ff6d73bd6 100644 --- a/docker-compose.macmini.yml +++ b/docker-compose.macmini.yml @@ -1635,6 +1635,25 @@ services: retries: 3 start_period: 15s + promtail: + image: grafana/promtail:3.0.0 + container_name: mana-mon-promtail + restart: always + mem_limit: 96m + command: -config.file=/etc/promtail/config.yaml -config.expand-env=true + volumes: + - ./docker/promtail:/etc/promtail:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + depends_on: + loki: + condition: service_healthy + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:9080/ready"] + interval: 300s + timeout: 10s + retries: 3 + start_period: 10s + pushgateway: image: prom/pushgateway:v1.7.0 container_name: mana-mon-pushgateway diff --git a/docker/grafana/dashboards/logs-explorer.json b/docker/grafana/dashboards/logs-explorer.json new file mode 100644 index 000000000..4b4eed447 --- /dev/null +++ b/docker/grafana/dashboards/logs-explorer.json @@ -0,0 +1,334 @@ +{ + "annotations": { "list": [] }, + "editable": true, + 
"fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 1, + "panels": [], + "title": "Overview", + "type": "row" + }, + { + "datasource": { "type": "loki", "uid": "loki" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisLabel": "", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 80, + "stacking": { "group": "A", "mode": "normal" } + } + } + }, + "gridPos": { "h": 6, "w": 12, "x": 0, "y": 1 }, + "id": 2, + "title": "Log Volume by Tier", + "type": "timeseries", + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "expr": "sum by (tier) (count_over_time({tier=~\".+\"} [$__interval]))", + "refId": "A" + } + ] + }, + { + "datasource": { "type": "loki", "uid": "loki" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "drawStyle": "bars", + "fillOpacity": 80, + "stacking": { "group": "A", "mode": "normal" } + } + } + }, + "gridPos": { "h": 6, "w": 12, "x": 12, "y": 1 }, + "id": 3, + "title": "Errors & Warnings", + "type": "timeseries", + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "expr": "sum by (level) (count_over_time({tier=~\".+\"} |~ \"(?i)(error|warn|fatal|panic)\" [$__interval]))", + "refId": "A" + } + ] + }, + { + "datasource": { "type": "loki", "uid": "loki" }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 50 }, + { "color": "red", "value": 200 } + ] + } + } + }, + "gridPos": { "h": 6, "w": 6, "x": 0, "y": 7 }, + "id": 4, + "title": "Errors (last 1h)", + "type": "stat", + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "expr": "sum(count_over_time({tier=~\".+\"} |~ 
\"(?i)(error|fatal|panic)\" [1h]))", + "refId": "A", + "instant": true + } + ] + }, + { + "datasource": { "type": "loki", "uid": "loki" }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 100 }, + { "color": "red", "value": 500 } + ] + } + } + }, + "gridPos": { "h": 6, "w": 6, "x": 6, "y": 7 }, + "id": 5, + "title": "Warnings (last 1h)", + "type": "stat", + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "expr": "sum(count_over_time({tier=~\".+\"} |~ \"(?i)warn\" [1h]))", + "refId": "A", + "instant": true + } + ] + }, + { + "datasource": { "type": "loki", "uid": "loki" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "displayMode": "gradient", "showValue": "auto" } + } + }, + "gridPos": { "h": 6, "w": 12, "x": 12, "y": 7 }, + "id": 6, + "title": "Top 10 Noisiest Services", + "type": "barchart", + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "expr": "topk(10, sum by (service) (count_over_time({tier=~\".+\"} [1h])))", + "refId": "A", + "instant": true + } + ] + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }, + "id": 10, + "panels": [], + "title": "Error Logs", + "type": "row" + }, + { + "datasource": { "type": "loki", "uid": "loki" }, + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 14 }, + "id": 11, + "title": "Errors across all services", + "type": "logs", + "options": { + "showTime": true, + "showLabels": true, + "showCommonLabels": false, + "wrapLogMessage": true, + "prettifyLogMessage": false, + "enableLogDetails": true, + "dedupStrategy": "none", + "sortOrder": "Descending" + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "expr": "{tier=~\".+\"} |~ \"(?i)(error|fatal|panic)\"", + "refId": "A" + } + ] + }, + { + "collapsed": true, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 }, + "id": 
20, + "panels": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "gridPos": { "h": 12, "w": 24, "x": 0, "y": 25 }, + "id": 21, + "title": "Auth Service Logs", + "type": "logs", + "options": { + "showTime": true, + "showLabels": true, + "wrapLogMessage": true, + "enableLogDetails": true, + "sortOrder": "Descending" + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "expr": "{tier=\"auth\"}", + "refId": "A" + } + ] + } + ], + "title": "Auth Services", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 25 }, + "id": 30, + "panels": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "gridPos": { "h": 12, "w": 24, "x": 0, "y": 26 }, + "id": 31, + "title": "Core Service Logs", + "type": "logs", + "options": { + "showTime": true, + "showLabels": true, + "wrapLogMessage": true, + "enableLogDetails": true, + "sortOrder": "Descending" + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "expr": "{tier=\"core\"}", + "refId": "A" + } + ] + } + ], + "title": "Core Services (Sync, Search, Gateway)", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 }, + "id": 40, + "panels": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "gridPos": { "h": 12, "w": 24, "x": 0, "y": 27 }, + "id": 41, + "title": "Web App Logs", + "type": "logs", + "options": { + "showTime": true, + "showLabels": true, + "wrapLogMessage": true, + "enableLogDetails": true, + "sortOrder": "Descending" + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "expr": "{tier=\"app\"} | service =~ \"$service\"", + "refId": "A" + } + ] + } + ], + "title": "Web Apps", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 27 }, + "id": 50, + "panels": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "gridPos": { "h": 12, "w": 24, "x": 0, "y": 28 }, + "id": 51, + "title": "Matrix Stack Logs", + 
"type": "logs", + "options": { + "showTime": true, + "showLabels": true, + "wrapLogMessage": true, + "enableLogDetails": true, + "sortOrder": "Descending" + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "expr": "{tier=\"matrix\"}", + "refId": "A" + } + ] + } + ], + "title": "Matrix Stack", + "type": "row" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": ["logs", "loki"], + "templating": { + "list": [ + { + "current": { "selected": false, "text": "All", "value": "$__all" }, + "datasource": { "type": "loki", "uid": "loki" }, + "definition": "label_values(service)", + "includeAll": true, + "multi": true, + "name": "service", + "query": "label_values(service)", + "refresh": 2, + "type": "query" + }, + { + "current": { "selected": false, "text": "All", "value": "$__all" }, + "datasource": { "type": "loki", "uid": "loki" }, + "definition": "label_values(tier)", + "includeAll": true, + "multi": true, + "name": "tier", + "query": "label_values(tier)", + "refresh": 2, + "type": "query" + } + ] + }, + "time": { "from": "now-1h", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "Logs Explorer", + "uid": "logs-explorer", + "version": 1 +} diff --git a/docker/grafana/provisioning/datasources/loki.yml b/docker/grafana/provisioning/datasources/loki.yml index 11f3fa87a..b430f2e29 100644 --- a/docker/grafana/provisioning/datasources/loki.yml +++ b/docker/grafana/provisioning/datasources/loki.yml @@ -3,7 +3,14 @@ apiVersion: 1 datasources: - name: Loki type: loki + uid: loki access: proxy url: http://loki:3100 isDefault: false editable: true + jsonData: + maxLines: 1000 + derivedFields: + - name: request_id + matcherRegex: '"request_id":"([a-f0-9-]+)"' + url: "" diff --git a/docker/promtail/config.yaml b/docker/promtail/config.yaml new file mode 100644 index 000000000..3b55a41eb --- /dev/null +++ b/docker/promtail/config.yaml @@ -0,0 +1,109 @@ +server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + 
filename: /tmp/positions.yaml + +clients: + - url: http://loki:3100/loki/api/v1/push + batchwait: 3s + batchsize: 1048576 # 1 MB + +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 30s + filters: + # Only collect from our compose project + - name: label + values: ["com.docker.compose.project"] + + relabel_configs: + # Extract compose service name → label "service" + - source_labels: ["__meta_docker_container_label_com_docker_compose_service"] + target_label: "service" + + # Extract container name → label "container" + - source_labels: ["__meta_docker_container_name"] + regex: "/(.*)" + target_label: "container" + + # Extract compose project → label "project" + - source_labels: ["__meta_docker_container_label_com_docker_compose_project"] + target_label: "project" + + # Tier labels based on container name prefix for easy filtering + # mana-infra-* → tier=infra + - source_labels: ["container"] + regex: "mana-infra-.*" + target_label: "tier" + replacement: "infra" + # mana-core-* → tier=core + - source_labels: ["container"] + regex: "mana-core-.*" + target_label: "tier" + replacement: "core" + # mana-auth/credits/user/subscriptions/analytics → tier=auth + - source_labels: ["container"] + regex: "mana-(auth|credits|user|subscriptions|analytics)" + target_label: "tier" + replacement: "auth" + # mana-app-* → tier=app + - source_labels: ["container"] + regex: "mana-app-.*" + target_label: "tier" + replacement: "app" + # mana-mon-* → tier=monitoring + - source_labels: ["container"] + regex: "mana-mon-.*" + target_label: "tier" + replacement: "monitoring" + # mana-matrix-* → tier=matrix + - source_labels: ["container"] + regex: "mana-matrix-.*" + target_label: "tier" + replacement: "matrix" + # mana-game-* → tier=games + - source_labels: ["container"] + regex: "mana-game-.*" + target_label: "tier" + replacement: "games" + # mana-service-* → tier=service + - source_labels: ["container"] + regex: 
"mana-service-.*" + target_label: "tier" + replacement: "service" + + # Drop monitoring container logs to save space (they're noisy) + - source_labels: ["tier"] + regex: "monitoring" + action: drop + + pipeline_stages: + # Try to parse JSON logs (Go services, Hono services) + - json: + expressions: + level: level + msg: msg + error: error + status: status + method: method + path: path + duration: duration + request_id: request_id + # Fall back: extract level from common log patterns + - regex: + expression: '(?i)(?P<level>error|warn|info|debug|fatal|panic)' + # Normalize level label + - labels: + level: + # Add timestamp from log if available + - timestamp: + source: time + format: RFC3339Nano + fallback_formats: + - "2006-01-02T15:04:05.000Z" + - "2006-01-02 15:04:05" + action_on_failure: fudge diff --git a/scripts/mac-mini/build-app.sh b/scripts/mac-mini/build-app.sh index 29e96cb3e..f3726a8cc 100755 --- a/scripts/mac-mini/build-app.sh +++ b/scripts/mac-mini/build-app.sh @@ -34,6 +34,8 @@ MONITORING_CONTAINERS=( mana-mon-alert-notifier mana-mon-glitchtip mana-mon-glitchtip-worker + mana-mon-loki + mana-mon-promtail ) # Track if we stopped monitoring