feat(monitoring): integrate Promtail for centralized log collection via Loki

Loki was already running but had no log shipper. Adds Promtail to collect
Docker logs from all 66 containers with automatic tier labeling (infra,
auth, core, app, matrix, games) and a Grafana Logs Explorer dashboard.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Till JS 2026-03-29 19:22:44 +02:00
parent 58bef0ab25
commit 4a48182677
5 changed files with 471 additions and 0 deletions

View file

@ -1635,6 +1635,25 @@ services:
retries: 3
start_period: 15s
promtail:
  # Promtail log shipper: discovers containers through the Docker socket and
  # pushes their logs to Loki (see docker/promtail/config.yaml).
  image: "grafana/promtail:3.0.0"
  container_name: "mana-mon-promtail"
  restart: "always"
  mem_limit: "96m"
  # -config.expand-env=true turns on environment-variable expansion inside
  # the mounted config file.
  command: "-config.file=/etc/promtail/config.yaml -config.expand-env=true"
  volumes:
    # Configuration directory, mounted read-only.
    - "./docker/promtail:/etc/promtail:ro"
    # Read-only Docker socket, used for container discovery and log reads.
    - "/var/run/docker.sock:/var/run/docker.sock:ro"
    # NOTE(review): the config stores positions in /tmp/positions.yaml and no
    # volume is mounted there, so positions are presumably lost whenever the
    # container is recreated - confirm this is acceptable.
  depends_on:
    # Start only after Loki reports healthy, so early pushes are not rejected.
    loki:
      condition: service_healthy
  healthcheck:
    # Probes Promtail's /ready endpoint on its HTTP port (9080).
    # NOTE(review): assumes the image ships a wget binary - confirm.
    test:
      - "CMD"
      - "wget"
      - "--no-verbose"
      - "--tries=1"
      - "--spider"
      - "http://127.0.0.1:9080/ready"
    interval: 300s
    timeout: 10s
    retries: 3
    start_period: 10s
pushgateway:
image: prom/pushgateway:v1.7.0
container_name: mana-mon-pushgateway

View file

@ -0,0 +1,334 @@
{
  "annotations": { "list": [] },
  "editable": true,
  "fiscalYearStartMonth": 0,
  "graphTooltip": 1,
  "id": null,
  "links": [],
  "panels": [
    {
      "collapsed": false,
      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
      "id": 1,
      "panels": [],
      "title": "Overview",
      "type": "row"
    },
    {
      "datasource": { "type": "loki", "uid": "loki" },
      "fieldConfig": {
        "defaults": {
          "color": { "mode": "palette-classic" },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisLabel": "",
            "barAlignment": 0,
            "drawStyle": "bars",
            "fillOpacity": 80,
            "stacking": { "group": "A", "mode": "normal" }
          }
        }
      },
      "gridPos": { "h": 6, "w": 12, "x": 0, "y": 1 },
      "id": 2,
      "title": "Log Volume by Tier",
      "type": "timeseries",
      "targets": [
        {
          "datasource": { "type": "loki", "uid": "loki" },
          "expr": "sum by (tier) (count_over_time({tier=~\"$tier\", service=~\"$service\"} [$__interval]))",
          "refId": "A"
        }
      ]
    },
    {
      "datasource": { "type": "loki", "uid": "loki" },
      "fieldConfig": {
        "defaults": {
          "color": { "mode": "palette-classic" },
          "custom": {
            "drawStyle": "bars",
            "fillOpacity": 80,
            "stacking": { "group": "A", "mode": "normal" }
          }
        }
      },
      "gridPos": { "h": 6, "w": 12, "x": 12, "y": 1 },
      "id": 3,
      "title": "Errors & Warnings",
      "type": "timeseries",
      "targets": [
        {
          "datasource": { "type": "loki", "uid": "loki" },
          "expr": "sum by (level) (count_over_time({tier=~\"$tier\", service=~\"$service\"} |~ \"(?i)(error|warn|fatal|panic)\" [$__interval]))",
          "refId": "A"
        }
      ]
    },
    {
      "datasource": { "type": "loki", "uid": "loki" },
      "fieldConfig": {
        "defaults": {
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "green", "value": null },
              { "color": "yellow", "value": 50 },
              { "color": "red", "value": 200 }
            ]
          }
        }
      },
      "gridPos": { "h": 6, "w": 6, "x": 0, "y": 7 },
      "id": 4,
      "title": "Errors (last 1h)",
      "type": "stat",
      "targets": [
        {
          "datasource": { "type": "loki", "uid": "loki" },
          "expr": "sum(count_over_time({tier=~\"$tier\", service=~\"$service\"} |~ \"(?i)(error|fatal|panic)\" [1h]))",
          "refId": "A",
          "instant": true
        }
      ]
    },
    {
      "datasource": { "type": "loki", "uid": "loki" },
      "fieldConfig": {
        "defaults": {
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "green", "value": null },
              { "color": "yellow", "value": 100 },
              { "color": "red", "value": 500 }
            ]
          }
        }
      },
      "gridPos": { "h": 6, "w": 6, "x": 6, "y": 7 },
      "id": 5,
      "title": "Warnings (last 1h)",
      "type": "stat",
      "targets": [
        {
          "datasource": { "type": "loki", "uid": "loki" },
          "expr": "sum(count_over_time({tier=~\"$tier\", service=~\"$service\"} |~ \"(?i)warn\" [1h]))",
          "refId": "A",
          "instant": true
        }
      ]
    },
    {
      "datasource": { "type": "loki", "uid": "loki" },
      "fieldConfig": {
        "defaults": {
          "color": { "mode": "palette-classic" },
          "custom": { "displayMode": "gradient", "showValue": "auto" }
        }
      },
      "gridPos": { "h": 6, "w": 12, "x": 12, "y": 7 },
      "id": 6,
      "title": "Top 10 Noisiest Services",
      "type": "barchart",
      "targets": [
        {
          "datasource": { "type": "loki", "uid": "loki" },
          "expr": "topk(10, sum by (service) (count_over_time({tier=~\"$tier\", service=~\"$service\"} [1h])))",
          "refId": "A",
          "instant": true
        }
      ]
    },
    {
      "collapsed": false,
      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 },
      "id": 10,
      "panels": [],
      "title": "Error Logs",
      "type": "row"
    },
    {
      "datasource": { "type": "loki", "uid": "loki" },
      "gridPos": { "h": 10, "w": 24, "x": 0, "y": 14 },
      "id": 11,
      "title": "Errors across all services",
      "type": "logs",
      "options": {
        "showTime": true,
        "showLabels": true,
        "showCommonLabels": false,
        "wrapLogMessage": true,
        "prettifyLogMessage": false,
        "enableLogDetails": true,
        "dedupStrategy": "none",
        "sortOrder": "Descending"
      },
      "targets": [
        {
          "datasource": { "type": "loki", "uid": "loki" },
          "expr": "{tier=~\"$tier\", service=~\"$service\"} |~ \"(?i)(error|fatal|panic)\"",
          "refId": "A"
        }
      ]
    },
    {
      "collapsed": true,
      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 },
      "id": 20,
      "panels": [
        {
          "datasource": { "type": "loki", "uid": "loki" },
          "gridPos": { "h": 12, "w": 24, "x": 0, "y": 25 },
          "id": 21,
          "title": "Auth Service Logs",
          "type": "logs",
          "options": {
            "showTime": true,
            "showLabels": true,
            "wrapLogMessage": true,
            "enableLogDetails": true,
            "sortOrder": "Descending"
          },
          "targets": [
            {
              "datasource": { "type": "loki", "uid": "loki" },
              "expr": "{tier=\"auth\", service=~\"$service\"}",
              "refId": "A"
            }
          ]
        }
      ],
      "title": "Auth Services",
      "type": "row"
    },
    {
      "collapsed": true,
      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 25 },
      "id": 30,
      "panels": [
        {
          "datasource": { "type": "loki", "uid": "loki" },
          "gridPos": { "h": 12, "w": 24, "x": 0, "y": 26 },
          "id": 31,
          "title": "Core Service Logs",
          "type": "logs",
          "options": {
            "showTime": true,
            "showLabels": true,
            "wrapLogMessage": true,
            "enableLogDetails": true,
            "sortOrder": "Descending"
          },
          "targets": [
            {
              "datasource": { "type": "loki", "uid": "loki" },
              "expr": "{tier=\"core\", service=~\"$service\"}",
              "refId": "A"
            }
          ]
        }
      ],
      "title": "Core Services (Sync, Search, Gateway)",
      "type": "row"
    },
    {
      "collapsed": true,
      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 },
      "id": 40,
      "panels": [
        {
          "datasource": { "type": "loki", "uid": "loki" },
          "gridPos": { "h": 12, "w": 24, "x": 0, "y": 27 },
          "id": 41,
          "title": "Web App Logs",
          "type": "logs",
          "options": {
            "showTime": true,
            "showLabels": true,
            "wrapLogMessage": true,
            "enableLogDetails": true,
            "sortOrder": "Descending"
          },
          "targets": [
            {
              "datasource": { "type": "loki", "uid": "loki" },
              "expr": "{tier=\"app\", service=~\"$service\"}",
              "refId": "A"
            }
          ]
        }
      ],
      "title": "Web Apps",
      "type": "row"
    },
    {
      "collapsed": true,
      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 27 },
      "id": 50,
      "panels": [
        {
          "datasource": { "type": "loki", "uid": "loki" },
          "gridPos": { "h": 12, "w": 24, "x": 0, "y": 28 },
          "id": 51,
          "title": "Matrix Stack Logs",
          "type": "logs",
          "options": {
            "showTime": true,
            "showLabels": true,
            "wrapLogMessage": true,
            "enableLogDetails": true,
            "sortOrder": "Descending"
          },
          "targets": [
            {
              "datasource": { "type": "loki", "uid": "loki" },
              "expr": "{tier=\"matrix\", service=~\"$service\"}",
              "refId": "A"
            }
          ]
        }
      ],
      "title": "Matrix Stack",
      "type": "row"
    }
  ],
  "refresh": "30s",
  "schemaVersion": 39,
  "tags": ["logs", "loki"],
  "templating": {
    "list": [
      {
        "allValue": ".+",
        "current": { "selected": false, "text": "All", "value": "$__all" },
        "datasource": { "type": "loki", "uid": "loki" },
        "definition": "label_values(service)",
        "includeAll": true,
        "multi": true,
        "name": "service",
        "query": "label_values(service)",
        "refresh": 2,
        "type": "query"
      },
      {
        "allValue": ".+",
        "current": { "selected": false, "text": "All", "value": "$__all" },
        "datasource": { "type": "loki", "uid": "loki" },
        "definition": "label_values(tier)",
        "includeAll": true,
        "multi": true,
        "name": "tier",
        "query": "label_values(tier)",
        "refresh": 2,
        "type": "query"
      }
    ]
  },
  "time": { "from": "now-1h", "to": "now" },
  "timepicker": {},
  "timezone": "browser",
  "title": "Logs Explorer",
  "uid": "logs-explorer",
  "version": 1
}

View file

@ -3,7 +3,14 @@ apiVersion: 1
datasources:
  # Loki log store, reached through the Grafana backend (proxy access).
  - name: "Loki"
    type: "loki"
    # Fixed uid so provisioned dashboards can reference this datasource
    # as {"type": "loki", "uid": "loki"}.
    uid: "loki"
    access: "proxy"
    url: "http://loki:3100"
    isDefault: false
    editable: true
    jsonData:
      # Upper bound on log lines returned per query.
      maxLines: 1000
      derivedFields:
        # Extracts the request_id value from JSON log lines as a derived
        # field; the empty url means no link target is configured for it.
        - name: "request_id"
          matcherRegex: '"request_id":"([a-f0-9-]+)"'
          url: ""

109
docker/promtail/config.yaml Normal file
View file

@ -0,0 +1,109 @@
# Promtail configuration: discovers Docker containers through the Docker
# socket, derives service/container/project/tier labels from container
# metadata, and ships the logs to Loki.
server:
  http_listen_port: 9080
  grpc_listen_port: 0

positions:
  # NOTE(review): this path lives inside the container's /tmp. Unless a
  # volume is mounted there, read positions are presumably lost when the
  # container is recreated - confirm this is acceptable.
  filename: /tmp/positions.yaml

clients:
  - url: http://loki:3100/loki/api/v1/push
    batchwait: 3s
    batchsize: 1048576  # 1 MB

scrape_configs:
  - job_name: docker
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
        refresh_interval: 30s
        filters:
          # Keep only containers that carry the Compose project label, i.e.
          # any Compose-managed container. This matches on the label key
          # only; it does not pin a specific project name.
          - name: label
            values: ["com.docker.compose.project"]
    relabel_configs:
      # Extract compose service name → label "service"
      - source_labels: ["__meta_docker_container_label_com_docker_compose_service"]
        target_label: "service"
      # Extract container name (leading "/" stripped) → label "container"
      - source_labels: ["__meta_docker_container_name"]
        regex: "/(.*)"
        target_label: "container"
      # Extract compose project → label "project"
      - source_labels: ["__meta_docker_container_label_com_docker_compose_project"]
        target_label: "project"
      # Tier labels based on container name prefix for easy filtering.
      # Relabel regexes are fully anchored, so each rule only fires on a
      # whole-name match; containers matching no rule get no "tier" label.
      # mana-infra-* → tier=infra
      - source_labels: ["container"]
        regex: "mana-infra-.*"
        target_label: "tier"
        replacement: "infra"
      # mana-core-* → tier=core
      - source_labels: ["container"]
        regex: "mana-core-.*"
        target_label: "tier"
        replacement: "core"
      # mana-auth/credits/user/subscriptions/analytics → tier=auth
      - source_labels: ["container"]
        regex: "mana-(auth|credits|user|subscriptions|analytics)"
        target_label: "tier"
        replacement: "auth"
      # mana-app-* → tier=app
      - source_labels: ["container"]
        regex: "mana-app-.*"
        target_label: "tier"
        replacement: "app"
      # mana-mon-* → tier=monitoring
      - source_labels: ["container"]
        regex: "mana-mon-.*"
        target_label: "tier"
        replacement: "monitoring"
      # mana-matrix-* → tier=matrix
      - source_labels: ["container"]
        regex: "mana-matrix-.*"
        target_label: "tier"
        replacement: "matrix"
      # mana-game-* → tier=games
      - source_labels: ["container"]
        regex: "mana-game-.*"
        target_label: "tier"
        replacement: "games"
      # mana-service-* → tier=service
      - source_labels: ["container"]
        regex: "mana-service-.*"
        target_label: "tier"
        replacement: "service"
      # Drop monitoring container logs to save space (they're noisy)
      - source_labels: ["tier"]
        regex: "monitoring"
        action: drop
    pipeline_stages:
      # Try to parse JSON logs (Go services, Hono services).
      # "time" is extracted so the timestamp stage below has a source value;
      # without it that stage could never parse anything.
      - json:
          expressions:
            level: level
            msg: msg
            error: error
            status: status
            method: method
            path: path
            duration: duration
            request_id: request_id
            time: time
      # Fallback for non-JSON lines: capture the first level keyword into a
      # separate key so it cannot overwrite a level parsed from JSON.
      - regex:
          expression: '(?i)(?P<level_fallback>error|warn|info|debug|fatal|panic)'
      # Normalize: prefer the JSON level, else the fallback keyword, and
      # lower-case the result so "ERROR" and "error" share one label value.
      - template:
          source: level
          template: '{{ if .Value }}{{ ToLower .Value }}{{ else if .level_fallback }}{{ ToLower .level_fallback }}{{ end }}'
      # Promote the normalized level to a stream label.
      - labels:
          level:
      # Use the timestamp from the log line if one was extracted and parses.
      # On failure, "fudge" reuses the last successfully parsed timestamp of
      # the stream (+1ns), or keeps the scrape timestamp if there is none.
      - timestamp:
          source: time
          format: RFC3339Nano
          fallback_formats:
            - "2006-01-02T15:04:05.000Z"
            - "2006-01-02 15:04:05"
          action_on_failure: fudge

View file

@ -34,6 +34,8 @@ MONITORING_CONTAINERS=(
mana-mon-alert-notifier
mana-mon-glitchtip
mana-mon-glitchtip-worker
mana-mon-loki
mana-mon-promtail
)
# Track if we stopped monitoring