feat(monitoring): add comprehensive Grafana dashboards and alerting

New dashboards:
- Application Details: Node.js runtime (heap, event loop, GC),
  HTTP details (status codes, methods, top routes), error analysis
- Database Details: PostgreSQL and Redis metrics with detailed breakdowns

Alerting rules (docker/prometheus/alerts.yml):
- Service: down, high/very high error rate, slow response time
- Infrastructure: high CPU/memory/disk usage
- Database: PostgreSQL/Redis down, high connections, low cache hit
- Container: high CPU/memory, restarts

All dashboards include service selector variable for filtering.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Till-JS 2026-01-26 09:47:18 +01:00
parent 41dea775a6
commit 8c259a008b
5 changed files with 2029 additions and 0 deletions

View file

@ -533,6 +533,7 @@ services:
- '--web.enable-lifecycle'
volumes:
- ./docker/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- ./docker/prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
- prometheus_data:/prometheus
ports:
- "9090:9090"

View file

@ -0,0 +1,871 @@
{
"annotations": {
"list": []
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": null,
"links": [],
"panels": [
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
"id": 1,
"panels": [],
"title": "Service Selection",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{
"options": {
"0": { "color": "red", "index": 1, "text": "DOWN" },
"1": { "color": "green", "index": 0, "text": "UP" }
},
"type": "value"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
}
},
"overrides": []
},
"gridPos": { "h": 4, "w": 24, "x": 0, "y": 1 },
"id": 2,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"orientation": "horizontal",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "10.0.0",
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "up{job=~\"$service\"}",
"legendFormat": "{{job}}",
"refId": "A"
}
],
"title": "Service Status",
"type": "stat"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
"id": 3,
"panels": [],
"title": "HTTP Requests",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "opacity",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
},
"unit": "reqps"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 6 },
"id": 4,
"options": {
"legend": {
"calcs": ["mean", "max"],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "sum(rate(http_requests_total{job=~\"$service\"}[5m])) by (job)",
"legendFormat": "{{job}}",
"refId": "A"
}
],
"title": "Request Rate",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "opacity",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "normal" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
},
"unit": "reqps"
},
"overrides": [
{
"matcher": { "id": "byRegexp", "options": ".*2[0-9]{2}.*" },
"properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }]
},
{
"matcher": { "id": "byRegexp", "options": ".*4[0-9]{2}.*" },
"properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }]
},
{
"matcher": { "id": "byRegexp", "options": ".*5[0-9]{2}.*" },
"properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }]
}
]
},
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 6 },
"id": 5,
"options": {
"legend": {
"calcs": ["mean"],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "sum(rate(http_requests_total{job=~\"$service\"}[5m])) by (status)",
"legendFormat": "{{status}}",
"refId": "A"
}
],
"title": "Requests by Status Code",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "opacity",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "normal" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
},
"unit": "reqps"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 6 },
"id": 6,
"options": {
"legend": {
"calcs": ["mean"],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "sum(rate(http_requests_total{job=~\"$service\"}[5m])) by (method)",
"legendFormat": "{{method}}",
"refId": "A"
}
],
"title": "Requests by HTTP Method",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "opacity",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
},
"unit": "s"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 },
"id": 7,
"options": {
"legend": {
"calcs": ["mean", "max"],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{job=~\"$service\"}[5m])) by (le, job))",
"legendFormat": "p50 {{job}}",
"refId": "A"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job=~\"$service\"}[5m])) by (le, job))",
"legendFormat": "p95 {{job}}",
"refId": "B"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{job=~\"$service\"}[5m])) by (le, job))",
"legendFormat": "p99 {{job}}",
"refId": "C"
}
],
"title": "Response Time Percentiles",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"align": "auto",
"cellOptions": { "type": "auto" },
"inspect": false
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
},
"unit": "reqps"
},
"overrides": [
{
"matcher": { "id": "byName", "options": "Rate" },
"properties": [
{ "id": "custom.cellOptions", "value": { "mode": "gradient", "type": "gauge" } }
]
}
]
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 },
"id": 8,
"options": {
"cellHeight": "sm",
"footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false },
"showHeader": true,
"sortBy": [{ "desc": true, "displayName": "Rate" }]
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "topk(10, sum(rate(http_requests_total{job=~\"$service\"}[5m])) by (route))",
"format": "table",
"instant": true,
"legendFormat": "",
"refId": "A"
}
],
"title": "Top 10 Routes",
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": { "Time": true },
"indexByName": {},
"renameByName": { "Value": "Rate", "route": "Route" }
}
}
],
"type": "table"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 },
"id": 9,
"panels": [],
"title": "Node.js Runtime",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "opacity",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
},
"unit": "decbytes"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 23 },
"id": 10,
"options": {
"legend": {
"calcs": ["mean", "max"],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "nodejs_heap_size_used_bytes{job=~\"$service\"}",
"legendFormat": "{{job}} used",
"refId": "A"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "nodejs_heap_size_total_bytes{job=~\"$service\"}",
"legendFormat": "{{job}} total",
"refId": "B"
}
],
"title": "Heap Memory",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "opacity",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
},
"unit": "s"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 23 },
"id": 11,
"options": {
"legend": {
"calcs": ["mean", "max"],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "nodejs_eventloop_lag_seconds{job=~\"$service\"}",
"legendFormat": "{{job}}",
"refId": "A"
}
],
"title": "Event Loop Lag",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "opacity",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
},
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 23 },
"id": 12,
"options": {
"legend": {
"calcs": ["mean", "max"],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "nodejs_active_handles_total{job=~\"$service\"}",
"legendFormat": "{{job}} handles",
"refId": "A"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "nodejs_active_requests_total{job=~\"$service\"}",
"legendFormat": "{{job}} requests",
"refId": "B"
}
],
"title": "Active Handles & Requests",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "opacity",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
},
"unit": "s"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 31 },
"id": 13,
"options": {
"legend": {
"calcs": ["mean", "max"],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "rate(nodejs_gc_duration_seconds_sum{job=~\"$service\"}[5m])",
"legendFormat": "{{job}} {{kind}}",
"refId": "A"
}
],
"title": "GC Duration",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "opacity",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 31 },
"id": 14,
"options": {
"legend": {
"calcs": ["mean", "max"],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "rate(process_cpu_user_seconds_total{job=~\"$service\"}[5m]) * 100",
"legendFormat": "{{job}} user",
"refId": "A"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "rate(process_cpu_system_seconds_total{job=~\"$service\"}[5m]) * 100",
"legendFormat": "{{job}} system",
"refId": "B"
}
],
"title": "Process CPU Usage",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 39 },
"id": 15,
"panels": [],
"title": "Error Analysis",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "opacity",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
},
"unit": "percentunit"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 40 },
"id": 16,
"options": {
"legend": {
"calcs": ["mean", "max"],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "sum(rate(http_requests_total{job=~\"$service\", status=~\"5..\"}[5m])) by (job) / sum(rate(http_requests_total{job=~\"$service\"}[5m])) by (job)",
"legendFormat": "{{job}}",
"refId": "A"
}
],
"title": "Error Rate (5xx)",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"align": "auto",
"cellOptions": { "type": "auto" },
"inspect": false
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
},
"unit": "short"
},
"overrides": [
{
"matcher": { "id": "byName", "options": "Count" },
"properties": [
{ "id": "custom.cellOptions", "value": { "mode": "gradient", "type": "gauge" } }
]
}
]
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 40 },
"id": 17,
"options": {
"cellHeight": "sm",
"footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false },
"showHeader": true,
"sortBy": [{ "desc": true, "displayName": "Count" }]
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "topk(10, sum(increase(http_requests_total{job=~\"$service\", status=~\"4..|5..\"}[1h])) by (route, status, method))",
"format": "table",
"instant": true,
"refId": "A"
}
],
"title": "Top Error Routes (Last Hour)",
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": { "Time": true },
"renameByName": {
"Value": "Count",
"method": "Method",
"route": "Route",
"status": "Status"
}
}
}
],
"type": "table"
}
],
"refresh": "30s",
"schemaVersion": 38,
"tags": ["manacore", "application", "nodejs"],
"templating": {
"list": [
{
"current": { "selected": false, "text": "Prometheus", "value": "Prometheus" },
"hide": 0,
"includeAll": false,
"multi": false,
"name": "datasource",
"options": [],
"query": "prometheus",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"type": "datasource"
},
{
"allValue": ".*",
"current": { "selected": true, "text": "All", "value": "$__all" },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"definition": "label_values(up{job=~\".*backend|mana-core-auth\"}, job)",
"hide": 0,
"includeAll": true,
"multi": true,
"name": "service",
"options": [],
"query": {
"query": "label_values(up{job=~\".*backend|mana-core-auth\"}, job)",
"refId": "PrometheusVariableQueryEditor-VariableQuery"
},
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
}
]
},
"time": { "from": "now-1h", "to": "now" },
"timepicker": {},
"timezone": "browser",
"title": "Application Details",
"uid": "application-details",
"version": 1,
"weekStart": ""
}

View file

@ -0,0 +1,908 @@
{
"annotations": {
"list": []
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": null,
"links": [],
"panels": [
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
"id": 1,
"panels": [],
"title": "PostgreSQL Overview",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 50 },
{ "color": "red", "value": 80 }
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 },
"id": 2,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "sum(pg_stat_activity_count)",
"refId": "A"
}
],
"title": "Total Connections",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 20 },
{ "color": "red", "value": 50 }
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 },
"id": 3,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "sum(pg_stat_activity_count{state=\"active\"})",
"refId": "A"
}
],
"title": "Active Queries",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
},
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 },
"id": 4,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "count(pg_database_size_bytes{datname!~\"template.*|postgres\"})",
"refId": "A"
}
],
"title": "Databases",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "yellow", "value": 90 },
{ "color": "green", "value": 95 }
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 },
"id": 5,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["mean"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "avg(pg_stat_database_blks_hit{datname!~\"template.*|postgres\"} / (pg_stat_database_blks_hit{datname!~\"template.*|postgres\"} + pg_stat_database_blks_read{datname!~\"template.*|postgres\"} + 0.0001)) * 100",
"refId": "A"
}
],
"title": "Avg Cache Hit Ratio",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
},
"unit": "decbytes"
},
"overrides": []
},
"gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 },
"id": 6,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "sum(pg_database_size_bytes{datname!~\"template.*|postgres\"})",
"refId": "A"
}
],
"title": "Total Database Size",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{
"options": {
"0": { "color": "red", "index": 1, "text": "DOWN" },
"1": { "color": "green", "index": 0, "text": "UP" }
},
"type": "value"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
}
},
"overrides": []
},
"gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 },
"id": 7,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "pg_up",
"refId": "A"
}
],
"title": "PostgreSQL Status",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "opacity",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
},
"unit": "decbytes"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 5 },
"id": 8,
"options": {
"legend": {
"calcs": ["lastNotNull"],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "pg_database_size_bytes{datname!~\"template.*|postgres\"}",
"legendFormat": "{{datname}}",
"refId": "A"
}
],
"title": "Database Size by App",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "opacity",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
},
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 5 },
"id": 9,
"options": {
"legend": {
"calcs": ["mean", "max"],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "pg_stat_activity_count{state=\"active\",datname!~\"template.*|postgres\"}",
"legendFormat": "{{datname}}",
"refId": "A"
}
],
"title": "Active Connections by Database",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "opacity",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
},
"unit": "ops"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 13 },
"id": 10,
"options": {
"legend": {
"calcs": ["mean", "max"],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "rate(pg_stat_database_xact_commit{datname!~\"template.*|postgres\"}[5m])",
"legendFormat": "{{datname}} commits",
"refId": "A"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "rate(pg_stat_database_xact_rollback{datname!~\"template.*|postgres\"}[5m])",
"legendFormat": "{{datname}} rollbacks",
"refId": "B"
}
],
"title": "Transactions per Second",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "opacity",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 13 },
"id": 11,
"options": {
"legend": {
"calcs": ["mean", "min"],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "(pg_stat_database_blks_hit{datname!~\"template.*|postgres\"} / (pg_stat_database_blks_hit{datname!~\"template.*|postgres\"} + pg_stat_database_blks_read{datname!~\"template.*|postgres\"} + 0.0001)) * 100",
"legendFormat": "{{datname}}",
"refId": "A"
}
],
"title": "Cache Hit Ratio by Database",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 },
"id": 12,
"panels": [],
"title": "Redis Overview",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{
"options": {
"0": { "color": "red", "index": 1, "text": "DOWN" },
"1": { "color": "green", "index": 0, "text": "UP" }
},
"type": "value"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
}
},
"overrides": []
},
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 22 },
"id": 13,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "redis_up",
"refId": "A"
}
],
"title": "Redis Status",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 50 },
{ "color": "red", "value": 100 }
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 4, "w": 4, "x": 4, "y": 22 },
"id": 14,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "redis_connected_clients",
"refId": "A"
}
],
"title": "Connected Clients",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
},
"unit": "decbytes"
},
"overrides": []
},
"gridPos": { "h": 4, "w": 4, "x": 8, "y": 22 },
"id": 15,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "redis_memory_used_bytes",
"refId": "A"
}
],
"title": "Memory Used",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
},
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 4, "w": 4, "x": 12, "y": 22 },
"id": 16,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "redis_db_keys",
"refId": "A"
}
],
"title": "Total Keys",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "red", "value": 1 }
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 4, "w": 4, "x": 16, "y": 22 },
"id": 17,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "redis_blocked_clients",
"refId": "A"
}
],
"title": "Blocked Clients",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "yellow", "value": 90 },
{ "color": "green", "value": 95 }
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": { "h": 4, "w": 4, "x": 20, "y": 22 },
"id": 18,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "redis_keyspace_hits_total / (redis_keyspace_hits_total + redis_keyspace_misses_total) * 100",
"refId": "A"
}
],
"title": "Hit Rate",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "opacity",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
},
"unit": "ops"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 26 },
"id": 19,
"options": {
"legend": {
"calcs": ["mean", "max"],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "rate(redis_commands_processed_total[5m])",
"legendFormat": "Commands/s",
"refId": "A"
}
],
"title": "Commands per Second",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "opacity",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
},
"unit": "decbytes"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 26 },
"id": 20,
"options": {
"legend": {
"calcs": ["mean", "max"],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "redis_memory_used_bytes",
"legendFormat": "Used",
"refId": "A"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"expr": "redis_memory_max_bytes",
"legendFormat": "Max",
"refId": "B"
}
],
"title": "Memory Usage",
"type": "timeseries"
}
],
"refresh": "30s",
"schemaVersion": 38,
"tags": ["manacore", "database", "postgresql", "redis"],
"templating": {
"list": [
{
"current": { "selected": false, "text": "Prometheus", "value": "Prometheus" },
"hide": 0,
"includeAll": false,
"multi": false,
"name": "datasource",
"options": [],
"query": "prometheus",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"type": "datasource"
}
]
},
"time": { "from": "now-1h", "to": "now" },
"timepicker": {},
"timezone": "browser",
"title": "Database Details",
"uid": "database-details",
"version": 1,
"weekStart": ""
}

View file

@ -0,0 +1,245 @@
groups:
- name: service_alerts
rules:
# Service Down Alert
- alert: ServiceDown
expr: up{job=~"mana-core-auth|chat-backend|todo-backend|calendar-backend|clock-backend|contacts-backend"} == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Service {{ $labels.job }} is down"
description: "{{ $labels.job }} has been down for more than 1 minute."
# High Error Rate (> 5% of requests are 5xx)
- alert: HighErrorRate
expr: |
sum(rate(http_requests_total{status=~"5.."}[5m])) by (job)
/ sum(rate(http_requests_total[5m])) by (job) > 0.05
for: 5m
labels:
severity: warning
annotations:
summary: "High error rate on {{ $labels.job }}"
description: "{{ $labels.job }} has error rate of {{ $value | humanizePercentage }} (> 5%)"
# Very High Error Rate (> 20% of requests are 5xx)
- alert: VeryHighErrorRate
expr: |
sum(rate(http_requests_total{status=~"5.."}[5m])) by (job)
/ sum(rate(http_requests_total[5m])) by (job) > 0.20
for: 2m
labels:
severity: critical
annotations:
summary: "Very high error rate on {{ $labels.job }}"
description: "{{ $labels.job }} has error rate of {{ $value | humanizePercentage }} (> 20%)"
# Slow Response Time (p95 > 2s)
- alert: SlowResponseTime
expr: |
histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, job)) > 2
for: 5m
labels:
severity: warning
annotations:
summary: "Slow response time on {{ $labels.job }}"
description: "{{ $labels.job }} p95 response time is {{ $value | humanizeDuration }}"
# Very Slow Response Time (p95 > 5s)
- alert: VerySlowResponseTime
expr: |
histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, job)) > 5
for: 2m
labels:
severity: critical
annotations:
summary: "Very slow response time on {{ $labels.job }}"
description: "{{ $labels.job }} p95 response time is {{ $value | humanizeDuration }}"
# High Memory Usage (Node.js heap > 500MB)
- alert: HighHeapMemory
expr: nodejs_heap_size_used_bytes > 500 * 1024 * 1024
for: 10m
labels:
severity: warning
annotations:
summary: "High heap memory on {{ $labels.job }}"
description: "{{ $labels.job }} heap usage is {{ $value | humanize1024 }}B"
# Event Loop Lag (> 100ms)
- alert: HighEventLoopLag
expr: nodejs_eventloop_lag_seconds > 0.1
for: 5m
labels:
severity: warning
annotations:
summary: "High event loop lag on {{ $labels.job }}"
description: "{{ $labels.job }} event loop lag is {{ $value | humanizeDuration }}"
- name: infrastructure_alerts
rules:
# High CPU Usage (> 80%)
- alert: HighCPUUsage
expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
for: 10m
labels:
severity: warning
annotations:
summary: "High CPU usage on host"
description: "CPU usage is {{ $value | humanize }}%"
# Very High CPU Usage (> 95%)
- alert: VeryHighCPUUsage
expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
for: 5m
labels:
severity: critical
annotations:
summary: "Very high CPU usage on host"
description: "CPU usage is {{ $value | humanize }}%"
# High Memory Usage (> 85%)
- alert: HighMemoryUsage
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
for: 10m
labels:
severity: warning
annotations:
summary: "High memory usage on host"
description: "Memory usage is {{ $value | humanize }}%"
# Very High Memory Usage (> 95%)
- alert: VeryHighMemoryUsage
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
for: 5m
labels:
severity: critical
annotations:
summary: "Very high memory usage on host"
description: "Memory usage is {{ $value | humanize }}%"
# High Disk Usage (> 80%)
- alert: HighDiskUsage
expr: |
(1 - (node_filesystem_avail_bytes{mountpoint=~"/host_mnt/Users|/"}
/ node_filesystem_size_bytes{mountpoint=~"/host_mnt/Users|/"})) * 100 > 80
for: 10m
labels:
severity: warning
annotations:
summary: "High disk usage on {{ $labels.mountpoint }}"
description: "Disk usage is {{ $value | humanize }}%"
# Very High Disk Usage (> 90%)
- alert: VeryHighDiskUsage
expr: |
(1 - (node_filesystem_avail_bytes{mountpoint=~"/host_mnt/Users|/"}
/ node_filesystem_size_bytes{mountpoint=~"/host_mnt/Users|/"})) * 100 > 90
for: 5m
labels:
severity: critical
annotations:
summary: "Very high disk usage on {{ $labels.mountpoint }}"
description: "Disk usage is {{ $value | humanize }}%"
- name: database_alerts
rules:
# PostgreSQL Down
- alert: PostgreSQLDown
expr: pg_up == 0
for: 1m
labels:
severity: critical
annotations:
summary: "PostgreSQL is down"
description: "PostgreSQL has been down for more than 1 minute."
# Redis Down
- alert: RedisDown
expr: redis_up == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Redis is down"
description: "Redis has been down for more than 1 minute."
# PostgreSQL High Connections (> 80)
- alert: PostgreSQLHighConnections
expr: sum(pg_stat_activity_count) > 80
for: 5m
labels:
severity: warning
annotations:
summary: "High PostgreSQL connections"
description: "PostgreSQL has {{ $value }} connections (> 80)"
# PostgreSQL Low Cache Hit Ratio (< 90%)
- alert: PostgreSQLLowCacheHitRatio
expr: |
avg(pg_stat_database_blks_hit{datname!~"template.*|postgres"}
/ (pg_stat_database_blks_hit{datname!~"template.*|postgres"}
+ pg_stat_database_blks_read{datname!~"template.*|postgres"} + 0.0001)) * 100 < 90
for: 10m
labels:
severity: warning
annotations:
summary: "PostgreSQL low cache hit ratio"
description: "PostgreSQL cache hit ratio is {{ $value | humanize }}%"
# Redis High Memory (> 1GB)
- alert: RedisHighMemory
expr: redis_memory_used_bytes > 1024 * 1024 * 1024
for: 10m
labels:
severity: warning
annotations:
summary: "Redis high memory usage"
description: "Redis memory usage is {{ $value | humanize1024 }}B"
# Redis Blocked Clients
- alert: RedisBlockedClients
expr: redis_blocked_clients > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Redis has blocked clients"
description: "Redis has {{ $value }} blocked clients"
- name: container_alerts
rules:
# Container High CPU (> 80% of limit)
- alert: ContainerHighCPU
expr: |
sum(rate(container_cpu_usage_seconds_total{id=~"/docker/.+"}[5m])) by (name) * 100 > 80
for: 10m
labels:
severity: warning
annotations:
summary: "Container {{ $labels.name }} high CPU"
description: "Container {{ $labels.name }} CPU usage is {{ $value | humanize }}%"
# Container High Memory (> 80% of limit)
- alert: ContainerHighMemory
expr: |
container_memory_usage_bytes{id=~"/docker/.+"}
/ container_spec_memory_limit_bytes{id=~"/docker/.+"} * 100 > 80
for: 10m
labels:
severity: warning
annotations:
summary: "Container {{ $labels.name }} high memory"
description: "Container {{ $labels.name }} memory usage is {{ $value | humanize }}%"
# Container Restart
- alert: ContainerRestarted
expr: |
increase(container_start_time_seconds{id=~"/docker/.+"}[5m]) > 0
for: 0m
labels:
severity: info
annotations:
summary: "Container {{ $labels.name }} restarted"
description: "Container {{ $labels.name }} has restarted."

View file

@ -5,6 +5,10 @@ global:
scrape_interval: 15s
evaluation_interval: 15s
# Load alerting rules
rule_files:
- /etc/prometheus/alerts.yml
# Alertmanager configuration (optional, for future use)
# alerting:
# alertmanagers: