mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-14 20:01:09 +02:00
feat(monitoring): integrate Promtail for centralized log collection via Loki
Loki was already running but had no log shipper. Adds Promtail to collect Docker logs from all 66 containers with automatic tier labeling (infra, auth, core, app, matrix, games) and a Grafana Logs Explorer dashboard. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
58bef0ab25
commit
4a48182677
5 changed files with 471 additions and 0 deletions
|
|
@ -1635,6 +1635,25 @@ services:
|
|||
retries: 3
|
||||
start_period: 15s
|
||||
|
||||
promtail:
  # Log shipper for the monitoring stack: reads its configuration from
  # /etc/promtail/config.yaml and is started only once Loki reports healthy.
  container_name: mana-mon-promtail
  image: grafana/promtail:3.0.0
  restart: always
  mem_limit: 96m
  # -config.expand-env=true enables environment-variable expansion inside
  # the configuration file.
  command:
    - "-config.file=/etc/promtail/config.yaml"
    - "-config.expand-env=true"
  depends_on:
    loki:
      condition: service_healthy
  volumes:
    # Promtail configuration directory, mounted read-only.
    - "./docker/promtail:/etc/promtail:ro"
    # Docker socket, mounted read-only; the Promtail config points its
    # docker_sd_configs host at this path.
    - "/var/run/docker.sock:/var/run/docker.sock:ro"
  healthcheck:
    # Probes Promtail's /ready endpoint on its HTTP port (9080).
    test:
      - "CMD"
      - "wget"
      - "--no-verbose"
      - "--tries=1"
      - "--spider"
      - "http://127.0.0.1:9080/ready"
    start_period: 10s
    interval: 300s
    timeout: 10s
    retries: 3
|
||||
|
||||
pushgateway:
|
||||
image: prom/pushgateway:v1.7.0
|
||||
container_name: mana-mon-pushgateway
|
||||
|
|
|
|||
334
docker/grafana/dashboards/logs-explorer.json
Normal file
334
docker/grafana/dashboards/logs-explorer.json
Normal file
|
|
@ -0,0 +1,334 @@
|
|||
{
|
||||
"annotations": { "list": [] },
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
|
||||
"id": 1,
|
||||
"panels": [],
|
||||
"title": "Overview",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisLabel": "",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "bars",
|
||||
"fillOpacity": 80,
|
||||
"stacking": { "group": "A", "mode": "normal" }
|
||||
}
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 12, "x": 0, "y": 1 },
|
||||
"id": 2,
|
||||
"title": "Log Volume by Tier",
|
||||
"type": "timeseries",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"expr": "sum by (tier) (count_over_time({tier=~\".+\"} [$__interval]))",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"drawStyle": "bars",
|
||||
"fillOpacity": 80,
|
||||
"stacking": { "group": "A", "mode": "normal" }
|
||||
}
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 12, "x": 12, "y": 1 },
|
||||
"id": 3,
|
||||
"title": "Errors & Warnings",
|
||||
"type": "timeseries",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"expr": "sum by (level) (count_over_time({tier=~\".+\"} |~ \"(?i)(error|warn|fatal|panic)\" [$__interval]))",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 50 },
|
||||
{ "color": "red", "value": 200 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 6, "x": 0, "y": 7 },
|
||||
"id": 4,
|
||||
"title": "Errors (last 1h)",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"expr": "sum(count_over_time({tier=~\".+\"} |~ \"(?i)(error|fatal|panic)\" [1h]))",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 100 },
|
||||
{ "color": "red", "value": 500 }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 6, "x": 6, "y": 7 },
|
||||
"id": 5,
|
||||
"title": "Warnings (last 1h)",
|
||||
"type": "stat",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"expr": "sum(count_over_time({tier=~\".+\"} |~ \"(?i)warn\" [1h]))",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "displayMode": "gradient", "showValue": "auto" }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 12, "x": 12, "y": 7 },
|
||||
"id": 6,
|
||||
"title": "Top 10 Noisiest Services",
|
||||
"type": "barchart",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"expr": "topk(10, sum by (service) (count_over_time({tier=~\".+\"} [1h])))",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 },
|
||||
"id": 10,
|
||||
"panels": [],
|
||||
"title": "Error Logs",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"gridPos": { "h": 10, "w": 24, "x": 0, "y": 14 },
|
||||
"id": 11,
|
||||
"title": "Errors across all services",
|
||||
"type": "logs",
|
||||
"options": {
|
||||
"showTime": true,
|
||||
"showLabels": true,
|
||||
"showCommonLabels": false,
|
||||
"wrapLogMessage": true,
|
||||
"prettifyLogMessage": false,
|
||||
"enableLogDetails": true,
|
||||
"dedupStrategy": "none",
|
||||
"sortOrder": "Descending"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"expr": "{tier=~\".+\"} |~ \"(?i)(error|fatal|panic)\"",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"collapsed": true,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 },
|
||||
"id": 20,
|
||||
"panels": [
|
||||
{
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"gridPos": { "h": 12, "w": 24, "x": 0, "y": 25 },
|
||||
"id": 21,
|
||||
"title": "Auth Service Logs",
|
||||
"type": "logs",
|
||||
"options": {
|
||||
"showTime": true,
|
||||
"showLabels": true,
|
||||
"wrapLogMessage": true,
|
||||
"enableLogDetails": true,
|
||||
"sortOrder": "Descending"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"expr": "{tier=\"auth\"}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"title": "Auth Services",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"collapsed": true,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 25 },
|
||||
"id": 30,
|
||||
"panels": [
|
||||
{
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"gridPos": { "h": 12, "w": 24, "x": 0, "y": 26 },
|
||||
"id": 31,
|
||||
"title": "Core Service Logs",
|
||||
"type": "logs",
|
||||
"options": {
|
||||
"showTime": true,
|
||||
"showLabels": true,
|
||||
"wrapLogMessage": true,
|
||||
"enableLogDetails": true,
|
||||
"sortOrder": "Descending"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"expr": "{tier=\"core\"}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"title": "Core Services (Sync, Search, Gateway)",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"collapsed": true,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 },
|
||||
"id": 40,
|
||||
"panels": [
|
||||
{
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"gridPos": { "h": 12, "w": 24, "x": 0, "y": 27 },
|
||||
"id": 41,
|
||||
"title": "Web App Logs",
|
||||
"type": "logs",
|
||||
"options": {
|
||||
"showTime": true,
|
||||
"showLabels": true,
|
||||
"wrapLogMessage": true,
|
||||
"enableLogDetails": true,
|
||||
"sortOrder": "Descending"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"expr": "{tier=\"app\"} | service =~ \"$service\"",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"title": "Web Apps",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"collapsed": true,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 27 },
|
||||
"id": 50,
|
||||
"panels": [
|
||||
{
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"gridPos": { "h": 12, "w": 24, "x": 0, "y": 28 },
|
||||
"id": 51,
|
||||
"title": "Matrix Stack Logs",
|
||||
"type": "logs",
|
||||
"options": {
|
||||
"showTime": true,
|
||||
"showLabels": true,
|
||||
"wrapLogMessage": true,
|
||||
"enableLogDetails": true,
|
||||
"sortOrder": "Descending"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"expr": "{tier=\"matrix\"}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"title": "Matrix Stack",
|
||||
"type": "row"
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 39,
|
||||
"tags": ["logs", "loki"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": { "selected": false, "text": "All", "value": "$__all" },
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"definition": "label_values(service)",
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"name": "service",
|
||||
"query": "label_values(service)",
|
||||
"refresh": 2,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"current": { "selected": false, "text": "All", "value": "$__all" },
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"definition": "label_values(tier)",
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"name": "tier",
|
||||
"query": "label_values(tier)",
|
||||
"refresh": 2,
|
||||
"type": "query"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"timepicker": {},
|
||||
"timezone": "browser",
|
||||
"title": "Logs Explorer",
|
||||
"uid": "logs-explorer",
|
||||
"version": 1
|
||||
}
|
||||
|
|
@ -3,7 +3,14 @@ apiVersion: 1
|
|||
datasources:
  # Loki datasource. The fixed uid "loki" is what the provisioned dashboards
  # reference in their "datasource" objects.
  - name: Loki
    uid: loki
    type: loki
    url: http://loki:3100
    access: proxy
    editable: true
    isDefault: false
    jsonData:
      # Upper bound on the number of log lines returned per query.
      maxLines: 1000
      derivedFields:
        # Surfaces the request_id found in JSON log lines as a derived field.
        # NOTE(review): url is empty, so presumably no link target is
        # attached to the field — confirm this is intended.
        - name: request_id
          matcherRegex: '"request_id":"([a-f0-9-]+)"'
          url: ""
|
||||
|
|
|
|||
109
docker/promtail/config.yaml
Normal file
109
docker/promtail/config.yaml
Normal file
|
|
@ -0,0 +1,109 @@
|
|||
# Promtail configuration: discovers containers through the Docker socket,
# attaches service/container/project/tier labels to their log streams,
# extracts a normalized "level" label and pushes everything to Loki.
server:
  http_listen_port: 9080
  grpc_listen_port: 0

positions:
  # NOTE(review): the compose service mounts only /etc/promtail and the Docker
  # socket, so this file presumably does not survive container re-creation —
  # confirm whether losing read positions on re-create is acceptable.
  filename: /tmp/positions.yaml

clients:
  - url: http://loki:3100/loki/api/v1/push
    batchwait: 3s
    batchsize: 1048576  # 1 MB

scrape_configs:
  - job_name: docker
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
        refresh_interval: 30s
        filters:
          # Keep only containers that carry a Compose project label. The
          # filter names the label key without a value, so containers of ANY
          # Compose project on this host match, not just this repository's.
          - name: label
            values: ["com.docker.compose.project"]

    relabel_configs:
      # Extract compose service name → label "service"
      - source_labels: ["__meta_docker_container_label_com_docker_compose_service"]
        target_label: "service"

      # Extract container name (without the leading "/") → label "container"
      - source_labels: ["__meta_docker_container_name"]
        regex: "/(.*)"
        target_label: "container"

      # Extract compose project → label "project"
      - source_labels: ["__meta_docker_container_label_com_docker_compose_project"]
        target_label: "project"

      # Tier labels based on container name prefix for easy filtering.
      # A container matching none of the patterns below gets no "tier" label.
      # mana-infra-* → tier=infra
      - source_labels: ["container"]
        regex: "mana-infra-.*"
        target_label: "tier"
        replacement: "infra"
      # mana-core-* → tier=core
      - source_labels: ["container"]
        regex: "mana-core-.*"
        target_label: "tier"
        replacement: "core"
      # mana-auth/credits/user/subscriptions/analytics → tier=auth
      - source_labels: ["container"]
        regex: "mana-(auth|credits|user|subscriptions|analytics)"
        target_label: "tier"
        replacement: "auth"
      # mana-app-* → tier=app
      - source_labels: ["container"]
        regex: "mana-app-.*"
        target_label: "tier"
        replacement: "app"
      # mana-mon-* → tier=monitoring
      - source_labels: ["container"]
        regex: "mana-mon-.*"
        target_label: "tier"
        replacement: "monitoring"
      # mana-matrix-* → tier=matrix
      - source_labels: ["container"]
        regex: "mana-matrix-.*"
        target_label: "tier"
        replacement: "matrix"
      # mana-game-* → tier=games
      - source_labels: ["container"]
        regex: "mana-game-.*"
        target_label: "tier"
        replacement: "games"
      # mana-service-* → tier=service
      - source_labels: ["container"]
        regex: "mana-service-.*"
        target_label: "tier"
        replacement: "service"

      # Drop monitoring container logs to save space (they're noisy)
      - source_labels: ["tier"]
        regex: "monitoring"
        action: drop

    pipeline_stages:
      # Try to parse JSON logs (Go services, Hono services).
      - json:
          expressions:
            level: level
            msg: msg
            error: error
            status: status
            method: method
            path: path
            duration: duration
            request_id: request_id
            # Required by the timestamp stage below, which reads "time".
            time: time
      # Fallback: find the first level keyword in the raw line. It is captured
      # under a separate name so it cannot overwrite a level that the json
      # stage already extracted (e.g. an "info" line whose message contains
      # the word "error").
      - regex:
          expression: '(?i)(?P<detected_level>error|warn|info|debug|fatal|panic)'
      # Normalize level: prefer the JSON value, otherwise use the detected
      # keyword; lower-case it so "ERROR" and "error" share one stream.
      - template:
          source: level
          template: '{{ if .Value }}{{ ToLower .Value }}{{ else if .detected_level }}{{ ToLower .detected_level }}{{ end }}'
      # Promote the normalized level to a stream label.
      - labels:
          level:
      # Use the timestamp from the log entry if available.
      - timestamp:
          source: time
          format: RFC3339Nano
          fallback_formats:
            - "2006-01-02T15:04:05.000Z"
            - "2006-01-02 15:04:05"
          action_on_failure: fudge
|
||||
|
|
@ -34,6 +34,8 @@ MONITORING_CONTAINERS=(
|
|||
mana-mon-alert-notifier
|
||||
mana-mon-glitchtip
|
||||
mana-mon-glitchtip-worker
|
||||
mana-mon-loki
|
||||
mana-mon-promtail
|
||||
)
|
||||
|
||||
# Track if we stopped monitoring
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue