diff --git a/docker-compose.macmini.yml b/docker-compose.macmini.yml
index 1178ef9e2..ff6d73bd6 100644
--- a/docker-compose.macmini.yml
+++ b/docker-compose.macmini.yml
@@ -1635,6 +1635,27 @@ services:
       retries: 3
       start_period: 15s

+  promtail:
+    image: grafana/promtail:3.0.0
+    container_name: mana-mon-promtail
+    restart: always
+    mem_limit: 96m
+    command: -config.file=/etc/promtail/config.yaml -config.expand-env=true
+    volumes:
+      - ./docker/promtail:/etc/promtail:ro
+      - /var/run/docker.sock:/var/run/docker.sock:ro
+    depends_on:
+      loki:
+        condition: service_healthy
+    healthcheck:
+      # grafana/promtail is Debian-slim based and ships neither wget nor curl
+      # (verify for the pinned tag), so probe /ready through bash's /dev/tcp.
+      test: ["CMD", "bash", "-c", "exec 3<>/dev/tcp/127.0.0.1/9080 && printf 'GET /ready HTTP/1.0\\r\\n\\r\\n' >&3 && head -n 1 <&3 | grep -q ' 200 '"]
+      interval: 300s
+      timeout: 10s
+      retries: 3
+      start_period: 10s
+
   pushgateway:
     image: prom/pushgateway:v1.7.0
     container_name: mana-mon-pushgateway
diff --git a/docker/grafana/dashboards/logs-explorer.json b/docker/grafana/dashboards/logs-explorer.json
new file mode 100644
index 000000000..4b4eed447
--- /dev/null
+++ b/docker/grafana/dashboards/logs-explorer.json
@@ -0,0 +1,334 @@
+{
+  "annotations": { "list": [] },
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 1,
+  "id": null,
+  "links": [],
+  "panels": [
+    {
+      "collapsed": false,
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
+      "id": 1,
+      "panels": [],
+      "title": "Overview",
+      "type": "row"
+    },
+    {
+      "datasource": { "type": "loki", "uid": "loki" },
+      "fieldConfig": {
+        "defaults": {
+          "color": { "mode": "palette-classic" },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisLabel": "",
+            "barAlignment": 0,
+            "drawStyle": "bars",
+            "fillOpacity": 80,
+            "stacking": { "group": "A", "mode": "normal" }
+          }
+        }
+      },
+      "gridPos": { "h": 6, "w": 12, "x": 0, "y": 1 },
+      "id": 2,
+      "title": "Log Volume by Tier",
+      "type": "timeseries",
+      "targets": [
+        {
+          "datasource": { "type": "loki", "uid": "loki" },
+          "expr": "sum by (tier) (count_over_time({tier=~\".+\"} [$__interval]))",
+          "refId": "A"
+        }
+      ]
+    },
+ { + "datasource": { "type": "loki", "uid": "loki" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "drawStyle": "bars", + "fillOpacity": 80, + "stacking": { "group": "A", "mode": "normal" } + } + } + }, + "gridPos": { "h": 6, "w": 12, "x": 12, "y": 1 }, + "id": 3, + "title": "Errors & Warnings", + "type": "timeseries", + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "expr": "sum by (level) (count_over_time({tier=~\".+\"} |~ \"(?i)(error|warn|fatal|panic)\" [$__interval]))", + "refId": "A" + } + ] + }, + { + "datasource": { "type": "loki", "uid": "loki" }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 50 }, + { "color": "red", "value": 200 } + ] + } + } + }, + "gridPos": { "h": 6, "w": 6, "x": 0, "y": 7 }, + "id": 4, + "title": "Errors (last 1h)", + "type": "stat", + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "expr": "sum(count_over_time({tier=~\".+\"} |~ \"(?i)(error|fatal|panic)\" [1h]))", + "refId": "A", + "instant": true + } + ] + }, + { + "datasource": { "type": "loki", "uid": "loki" }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 100 }, + { "color": "red", "value": 500 } + ] + } + } + }, + "gridPos": { "h": 6, "w": 6, "x": 6, "y": 7 }, + "id": 5, + "title": "Warnings (last 1h)", + "type": "stat", + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "expr": "sum(count_over_time({tier=~\".+\"} |~ \"(?i)warn\" [1h]))", + "refId": "A", + "instant": true + } + ] + }, + { + "datasource": { "type": "loki", "uid": "loki" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "displayMode": "gradient", "showValue": "auto" } + } + }, + "gridPos": { "h": 6, 
"w": 12, "x": 12, "y": 7 }, + "id": 6, + "title": "Top 10 Noisiest Services", + "type": "barchart", + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "expr": "topk(10, sum by (service) (count_over_time({tier=~\".+\"} [1h])))", + "refId": "A", + "instant": true + } + ] + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }, + "id": 10, + "panels": [], + "title": "Error Logs", + "type": "row" + }, + { + "datasource": { "type": "loki", "uid": "loki" }, + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 14 }, + "id": 11, + "title": "Errors across all services", + "type": "logs", + "options": { + "showTime": true, + "showLabels": true, + "showCommonLabels": false, + "wrapLogMessage": true, + "prettifyLogMessage": false, + "enableLogDetails": true, + "dedupStrategy": "none", + "sortOrder": "Descending" + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "expr": "{tier=~\".+\"} |~ \"(?i)(error|fatal|panic)\"", + "refId": "A" + } + ] + }, + { + "collapsed": true, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 }, + "id": 20, + "panels": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "gridPos": { "h": 12, "w": 24, "x": 0, "y": 25 }, + "id": 21, + "title": "Auth Service Logs", + "type": "logs", + "options": { + "showTime": true, + "showLabels": true, + "wrapLogMessage": true, + "enableLogDetails": true, + "sortOrder": "Descending" + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "expr": "{tier=\"auth\"}", + "refId": "A" + } + ] + } + ], + "title": "Auth Services", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 25 }, + "id": 30, + "panels": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "gridPos": { "h": 12, "w": 24, "x": 0, "y": 26 }, + "id": 31, + "title": "Core Service Logs", + "type": "logs", + "options": { + "showTime": true, + "showLabels": true, + "wrapLogMessage": true, + "enableLogDetails": true, + 
"sortOrder": "Descending" + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "expr": "{tier=\"core\"}", + "refId": "A" + } + ] + } + ], + "title": "Core Services (Sync, Search, Gateway)", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 }, + "id": 40, + "panels": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "gridPos": { "h": 12, "w": 24, "x": 0, "y": 27 }, + "id": 41, + "title": "Web App Logs", + "type": "logs", + "options": { + "showTime": true, + "showLabels": true, + "wrapLogMessage": true, + "enableLogDetails": true, + "sortOrder": "Descending" + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "expr": "{tier=\"app\"} | service =~ \"$service\"", + "refId": "A" + } + ] + } + ], + "title": "Web Apps", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 27 }, + "id": 50, + "panels": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "gridPos": { "h": 12, "w": 24, "x": 0, "y": 28 }, + "id": 51, + "title": "Matrix Stack Logs", + "type": "logs", + "options": { + "showTime": true, + "showLabels": true, + "wrapLogMessage": true, + "enableLogDetails": true, + "sortOrder": "Descending" + }, + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "expr": "{tier=\"matrix\"}", + "refId": "A" + } + ] + } + ], + "title": "Matrix Stack", + "type": "row" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": ["logs", "loki"], + "templating": { + "list": [ + { + "current": { "selected": false, "text": "All", "value": "$__all" }, + "datasource": { "type": "loki", "uid": "loki" }, + "definition": "label_values(service)", + "includeAll": true, + "multi": true, + "name": "service", + "query": "label_values(service)", + "refresh": 2, + "type": "query" + }, + { + "current": { "selected": false, "text": "All", "value": "$__all" }, + "datasource": { "type": "loki", "uid": "loki" }, + "definition": 
"label_values(tier)", + "includeAll": true, + "multi": true, + "name": "tier", + "query": "label_values(tier)", + "refresh": 2, + "type": "query" + } + ] + }, + "time": { "from": "now-1h", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "Logs Explorer", + "uid": "logs-explorer", + "version": 1 +} diff --git a/docker/grafana/provisioning/datasources/loki.yml b/docker/grafana/provisioning/datasources/loki.yml index 11f3fa87a..b430f2e29 100644 --- a/docker/grafana/provisioning/datasources/loki.yml +++ b/docker/grafana/provisioning/datasources/loki.yml @@ -3,7 +3,14 @@ apiVersion: 1 datasources: - name: Loki type: loki + uid: loki access: proxy url: http://loki:3100 isDefault: false editable: true + jsonData: + maxLines: 1000 + derivedFields: + - name: request_id + matcherRegex: '"request_id":"([a-f0-9-]+)"' + url: "" diff --git a/docker/promtail/config.yaml b/docker/promtail/config.yaml new file mode 100644 index 000000000..3b55a41eb --- /dev/null +++ b/docker/promtail/config.yaml @@ -0,0 +1,109 @@ +server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: /tmp/positions.yaml + +clients: + - url: http://loki:3100/loki/api/v1/push + batchwait: 3s + batchsize: 1048576 # 1 MB + +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 30s + filters: + # Only collect from our compose project + - name: label + values: ["com.docker.compose.project"] + + relabel_configs: + # Extract compose service name → label "service" + - source_labels: ["__meta_docker_container_label_com_docker_compose_service"] + target_label: "service" + + # Extract container name → label "container" + - source_labels: ["__meta_docker_container_name"] + regex: "/(.*)" + target_label: "container" + + # Extract compose project → label "project" + - source_labels: ["__meta_docker_container_label_com_docker_compose_project"] + target_label: "project" + + # Tier labels based on container name prefix for 
easy filtering
+      # mana-infra-* → tier=infra
+      - source_labels: ["container"]
+        regex: "mana-infra-.*"
+        target_label: "tier"
+        replacement: "infra"
+      # mana-core-* → tier=core
+      - source_labels: ["container"]
+        regex: "mana-core-.*"
+        target_label: "tier"
+        replacement: "core"
+      # mana-auth/credits/user/subscriptions/analytics → tier=auth
+      - source_labels: ["container"]
+        regex: "mana-(auth|credits|user|subscriptions|analytics)"
+        target_label: "tier"
+        replacement: "auth"
+      # mana-app-* → tier=app
+      - source_labels: ["container"]
+        regex: "mana-app-.*"
+        target_label: "tier"
+        replacement: "app"
+      # mana-mon-* → tier=monitoring
+      - source_labels: ["container"]
+        regex: "mana-mon-.*"
+        target_label: "tier"
+        replacement: "monitoring"
+      # mana-matrix-* → tier=matrix
+      - source_labels: ["container"]
+        regex: "mana-matrix-.*"
+        target_label: "tier"
+        replacement: "matrix"
+      # mana-game-* → tier=games
+      - source_labels: ["container"]
+        regex: "mana-game-.*"
+        target_label: "tier"
+        replacement: "games"
+      # mana-service-* → tier=service
+      - source_labels: ["container"]
+        regex: "mana-service-.*"
+        target_label: "tier"
+        replacement: "service"
+
+      # Drop monitoring container logs to save space (they're noisy)
+      - source_labels: ["tier"]
+        regex: "monitoring"
+        action: drop
+
+    pipeline_stages:
+      # Try to parse JSON logs (Go services, Hono services)
+      - json:
+          expressions:
+            level: level
+            msg: msg
+            error: error
+            status: status
+            method: method
+            path: path
+            duration: duration
+            request_id: request_id
+            # Consumed by the timestamp stage at the end of this pipeline
+            time: time
+      # Fall back: capture the first level keyword in the line as "level"
+      - regex:
+          expression: '(?i)(?P<level>error|warn|info|debug|fatal|panic)'
+      # Promote the extracted "level" value to a Loki label
+      - labels:
+          level:
+      # Add timestamp from log if available
+      - timestamp:
+          source: time
+          format: RFC3339Nano
+          fallback_formats: ["2006-01-02T15:04:05.000Z", "2006-01-02 15:04:05"]
+          action_on_failure: fudge
diff --git a/scripts/mac-mini/build-app.sh b/scripts/mac-mini/build-app.sh
index 29e96cb3e..f3726a8cc
100755 --- a/scripts/mac-mini/build-app.sh +++ b/scripts/mac-mini/build-app.sh @@ -34,6 +34,8 @@ MONITORING_CONTAINERS=( mana-mon-alert-notifier mana-mon-glitchtip mana-mon-glitchtip-worker + mana-mon-loki + mana-mon-promtail ) # Track if we stopped monitoring