diff --git a/docker-compose.macmini.yml b/docker-compose.macmini.yml index e1e9fb36d..f8fe73bc8 100644 --- a/docker-compose.macmini.yml +++ b/docker-compose.macmini.yml @@ -533,6 +533,7 @@ services: - '--web.enable-lifecycle' volumes: - ./docker/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - ./docker/prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro - prometheus_data:/prometheus ports: - "9090:9090" diff --git a/docker/grafana/dashboards/application-details.json b/docker/grafana/dashboards/application-details.json new file mode 100644 index 000000000..ed3772d41 --- /dev/null +++ b/docker/grafana/dashboards/application-details.json @@ -0,0 +1,871 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 1, + "panels": [], + "title": "Service Selection", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [ + { + "options": { + "0": { "color": "red", "index": 1, "text": "DOWN" }, + "1": { "color": "green", "index": 0, "text": "UP" } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 24, "x": 0, "y": 1 }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "horizontal", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "up{job=~\"$service\"}", + "legendFormat": "{{job}}", + "refId": "A" + } + ], + "title": "Service Status", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, + "id": 3, + "panels": [], + "title": "HTTP Requests", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 6 }, + "id": 4, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(rate(http_requests_total{job=~\"$service\"}[5m])) by (job)", + "legendFormat": "{{job}}", + "refId": "A" + } + ], + "title": "Request Rate", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "normal" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "reqps" + }, + "overrides": [ + { + "matcher": { "id": "byRegexp", "options": ".*2[0-9]{2}.*" }, + "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] + }, + { + "matcher": { "id": "byRegexp", "options": ".*4[0-9]{2}.*" }, + "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] + }, + { + "matcher": { "id": "byRegexp", "options": ".*5[0-9]{2}.*" }, + "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] + } + ] + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 6 }, + "id": 5, + "options": { + "legend": { + "calcs": ["mean"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(rate(http_requests_total{job=~\"$service\"}[5m])) by (status)", + "legendFormat": "{{status}}", + "refId": "A" + } + ], + "title": "Requests by Status Code", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "normal" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 6 }, + "id": 6, + "options": { + "legend": { + "calcs": ["mean"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(rate(http_requests_total{job=~\"$service\"}[5m])) by (method)", + "legendFormat": "{{method}}", + "refId": "A" + } + ], + "title": "Requests by HTTP Method", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }, + "id": 7, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{job=~\"$service\"}[5m])) by (le, job))", + "legendFormat": "p50 {{job}}", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job=~\"$service\"}[5m])) by (le, job))", + "legendFormat": "p95 {{job}}", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{job=~\"$service\"}[5m])) by (le, job))", + "legendFormat": "p99 {{job}}", + "refId": "C" + } + ], + "title": "Response Time Percentiles", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "align": "auto", + "cellOptions": { "type": "auto" }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "reqps" + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Rate" }, + "properties": [ + { "id": "custom.cellOptions", "value": { "mode": "gradient", "type": "gauge" } } + ] + } + ] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }, + "id": 8, + "options": { + "cellHeight": "sm", + "footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false }, + "showHeader": true, + "sortBy": [{ "desc": true, "displayName": "Rate" }] + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "topk(10, sum(rate(http_requests_total{job=~\"$service\"}[5m])) by (route))", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Top 10 Routes", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { "Time": true }, + "indexByName": {}, + "renameByName": { "Value": "Rate", "route": "Route" } + } + } + ], + "type": "table" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }, + "id": 9, + "panels": [], + "title": "Node.js Runtime", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 23 }, + "id": 10, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "nodejs_heap_size_used_bytes{job=~\"$service\"}", + "legendFormat": "{{job}} used", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "nodejs_heap_size_total_bytes{job=~\"$service\"}", + "legendFormat": "{{job}} total", + "refId": "B" + } + ], + "title": "Heap Memory", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 23 }, + "id": 11, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "nodejs_eventloop_lag_seconds{job=~\"$service\"}", + "legendFormat": "{{job}}", + "refId": "A" + } + ], + "title": "Event Loop Lag", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 23 }, + "id": 12, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "nodejs_active_handles_total{job=~\"$service\"}", + "legendFormat": "{{job}} handles", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "nodejs_active_requests_total{job=~\"$service\"}", + "legendFormat": "{{job}} requests", + "refId": "B" + } + ], + "title": "Active Handles & Requests", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 31 }, + "id": 13, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "rate(nodejs_gc_duration_seconds_sum{job=~\"$service\"}[5m])", + "legendFormat": "{{job}} {{kind}}", + "refId": "A" + } + ], + "title": "GC Duration", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 31 }, + "id": 14, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "rate(process_cpu_user_seconds_total{job=~\"$service\"}[5m]) * 100", + "legendFormat": "{{job}} user", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "rate(process_cpu_system_seconds_total{job=~\"$service\"}[5m]) * 100", + "legendFormat": "{{job}} system", + "refId": "B" + } + ], + "title": "Process CPU Usage", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 39 }, + "id": 15, + "panels": [], + "title": "Error Analysis", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 40 }, + "id": 16, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(rate(http_requests_total{job=~\"$service\", status=~\"5..\"}[5m])) by (job) / sum(rate(http_requests_total{job=~\"$service\"}[5m])) by (job)", + "legendFormat": "{{job}}", + "refId": "A" + } + ], + "title": "Error Rate (5xx)", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "align": "auto", + "cellOptions": { "type": "auto" }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Count" }, + "properties": [ + { "id": "custom.cellOptions", "value": { "mode": "gradient", "type": "gauge" } } + ] + } + ] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 40 }, + "id": 17, + "options": { + "cellHeight": "sm", + "footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false }, + "showHeader": true, + "sortBy": [{ "desc": true, "displayName": "Count" }] + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "topk(10, sum(increase(http_requests_total{job=~\"$service\", status=~\"4..|5..\"}[1h])) by (route, status, method))", + "format": "table", + "instant": true, + "refId": "A" + } + ], + "title": "Top Error Routes (Last Hour)", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { "Time": true }, + "renameByName": { + "Value": "Count", + "method": "Method", + "route": "Route", + "status": "Status" + } + } + } + ], + "type": "table" + } + ], + "refresh": "30s", + "schemaVersion": 38, + "tags": ["manacore", "application", "nodejs"], + "templating": { + "list": [ + { + "current": { "selected": false, "text": "Prometheus", "value": "Prometheus" }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".*", + "current": { "selected": true, "text": "All", "value": "$__all" }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "definition": "label_values(up{job=~\".*backend|mana-core-auth\"}, job)", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "service", + "options": [], + "query": { + "query": "label_values(up{job=~\".*backend|mana-core-auth\"}, job)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { "from": "now-1h", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "Application Details", + "uid": "application-details", + "version": 1, + "weekStart": "" +} diff --git a/docker/grafana/dashboards/database-details.json b/docker/grafana/dashboards/database-details.json new file mode 100644 index 000000000..ef41ffa75 --- /dev/null +++ b/docker/grafana/dashboards/database-details.json @@ -0,0 +1,908 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 1, + "panels": [], + "title": "PostgreSQL Overview", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 50 }, + { "color": "red", "value": 80 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(pg_stat_activity_count)", + "refId": "A" + } + ], + "title": "Total Connections", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 20 }, + { "color": "red", "value": 50 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(pg_stat_activity_count{state=\"active\"})", + "refId": "A" + } + ], + "title": "Active Queries", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "count(pg_database_size_bytes{datname!~\"template.*|postgres\"})", + "refId": "A" + } + ], + "title": "Databases", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 90 }, + { "color": "green", "value": 95 } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["mean"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "avg(pg_stat_database_blks_hit{datname!~\"template.*|postgres\"} / (pg_stat_database_blks_hit{datname!~\"template.*|postgres\"} + pg_stat_database_blks_read{datname!~\"template.*|postgres\"} + 0.0001)) * 100", + "refId": "A" + } + ], + "title": "Avg Cache Hit Ratio", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(pg_database_size_bytes{datname!~\"template.*|postgres\"})", + "refId": "A" + } + ], + "title": "Total Database Size", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [ + { + "options": { + "0": { "color": "red", "index": 1, "text": "DOWN" }, + "1": { "color": "green", "index": 0, "text": "UP" } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 }, + "id": 7, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "pg_up", + "refId": "A" + } + ], + "title": "PostgreSQL Status", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 5 }, + "id": 8, + "options": { + "legend": { + "calcs": ["lastNotNull"], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "pg_database_size_bytes{datname!~\"template.*|postgres\"}", + "legendFormat": "{{datname}}", + "refId": "A" + } + ], + "title": "Database Size by App", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 5 }, + "id": 9, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "pg_stat_activity_count{state=\"active\",datname!~\"template.*|postgres\"}", + "legendFormat": "{{datname}}", + "refId": "A" + } + ], + "title": "Active Connections by Database", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 13 }, + "id": 10, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "rate(pg_stat_database_xact_commit{datname!~\"template.*|postgres\"}[5m])", + "legendFormat": "{{datname}} commits", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "rate(pg_stat_database_xact_rollback{datname!~\"template.*|postgres\"}[5m])", + "legendFormat": "{{datname}} rollbacks", + "refId": "B" + } + ], + "title": "Transactions per Second", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 13 }, + "id": 11, + "options": { + "legend": { + "calcs": ["mean", "min"], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "(pg_stat_database_blks_hit{datname!~\"template.*|postgres\"} / (pg_stat_database_blks_hit{datname!~\"template.*|postgres\"} + pg_stat_database_blks_read{datname!~\"template.*|postgres\"} + 0.0001)) * 100", + "legendFormat": "{{datname}}", + "refId": "A" + } + ], + "title": "Cache Hit Ratio by Database", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 }, + "id": 12, + "panels": [], + "title": "Redis Overview", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [ + { + "options": { + "0": { "color": "red", "index": 1, "text": "DOWN" }, + "1": { "color": "green", "index": 0, "text": "UP" } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 22 }, + "id": 13, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "redis_up", + "refId": "A" + } + ], + "title": "Redis Status", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 50 }, + { "color": "red", "value": 100 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 4, "y": 22 }, + "id": 14, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "redis_connected_clients", + "refId": "A" + } + ], + "title": "Connected Clients", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 22 }, + "id": 15, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "redis_memory_used_bytes", + "refId": "A" + } + ], + "title": "Memory Used", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 12, "y": 22 }, + "id": 16, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "redis_db_keys", + "refId": "A" + } + ], + "title": "Total Keys", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 16, "y": 22 }, + "id": 17, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "redis_blocked_clients", + "refId": "A" + } + ], + "title": "Blocked Clients", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 90 }, + { "color": "green", "value": 95 } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 20, "y": 22 }, + "id": 18, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "redis_keyspace_hits_total / (redis_keyspace_hits_total + redis_keyspace_misses_total) * 100", + "refId": "A" + } + ], + "title": "Hit Rate", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 26 }, + "id": 19, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "rate(redis_commands_processed_total[5m])", + "legendFormat": "Commands/s", + "refId": "A" + } + ], + "title": "Commands per Second", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 26 }, + "id": 20, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "redis_memory_used_bytes", + "legendFormat": "Used", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "redis_memory_max_bytes", + "legendFormat": "Max", + "refId": "B" + } + ], + "title": "Memory Usage", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 38, + "tags": ["manacore", "database", "postgresql", "redis"], + "templating": { + "list": [ + { + "current": { "selected": false, "text": "Prometheus", "value": "Prometheus" }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + } + ] + }, + "time": { "from": "now-1h", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "Database Details", + "uid": "database-details", + "version": 1, + "weekStart": "" +} diff --git a/docker/prometheus/alerts.yml b/docker/prometheus/alerts.yml new file mode 100644 index 000000000..f05400d47 --- /dev/null +++ b/docker/prometheus/alerts.yml @@ -0,0 +1,245 @@ +groups: + - name: service_alerts + rules: + # Service Down Alert + - alert: ServiceDown + expr: up{job=~"mana-core-auth|chat-backend|todo-backend|calendar-backend|clock-backend|contacts-backend"} == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Service {{ $labels.job }} is down" + description: "{{ $labels.job }} has been down for more than 1 minute." + + # High Error Rate (> 5% of requests are 5xx) + - alert: HighErrorRate + expr: | + sum(rate(http_requests_total{status=~"5.."}[5m])) by (job) + / sum(rate(http_requests_total[5m])) by (job) > 0.05 + for: 5m + labels: + severity: warning + annotations: + summary: "High error rate on {{ $labels.job }}" + description: "{{ $labels.job }} has error rate of {{ $value | humanizePercentage }} (> 5%)" + + # Very High Error Rate (> 20% of requests are 5xx) + - alert: VeryHighErrorRate + expr: | + sum(rate(http_requests_total{status=~"5.."}[5m])) by (job) + / sum(rate(http_requests_total[5m])) by (job) > 0.20 + for: 2m + labels: + severity: critical + annotations: + summary: "Very high error rate on {{ $labels.job }}" + description: "{{ $labels.job }} has error rate of {{ $value | humanizePercentage }} (> 20%)" + + # Slow Response Time (p95 > 2s) + - alert: SlowResponseTime + expr: | + histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, job)) > 2 + for: 5m + labels: + severity: warning + annotations: + summary: "Slow response time on {{ $labels.job }}" + description: "{{ $labels.job }} p95 response time is {{ $value | humanizeDuration }}" + + # Very Slow Response Time (p95 > 5s) + - alert: VerySlowResponseTime + expr: | + histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, job)) > 5 + for: 2m + labels: + severity: critical + annotations: + summary: "Very slow response time on {{ $labels.job }}" + description: "{{ $labels.job }} p95 response time is {{ $value | humanizeDuration }}" + + # High Memory Usage (Node.js heap > 500MB) + - alert: HighHeapMemory + expr: nodejs_heap_size_used_bytes > 500 * 1024 * 1024 + for: 10m + labels: + severity: warning + annotations: + summary: "High heap memory on {{ $labels.job }}" + description: "{{ $labels.job }} heap usage is {{ $value | humanize1024 }}B" + + # Event Loop Lag (> 100ms) + - alert: HighEventLoopLag + expr: nodejs_eventloop_lag_seconds > 0.1 + for: 5m + labels: + severity: warning + annotations: + summary: "High event loop lag on {{ $labels.job }}" + description: "{{ $labels.job }} event loop lag is {{ $value | humanizeDuration }}" + + - name: infrastructure_alerts + rules: + # High CPU Usage (> 80%) + - alert: HighCPUUsage + expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80 + for: 10m + labels: + severity: warning + annotations: + summary: "High CPU usage on host" + description: "CPU usage is {{ $value | humanize }}%" + + # Very High CPU Usage (> 95%) + - alert: VeryHighCPUUsage + expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95 + for: 5m + labels: + severity: critical + annotations: + summary: "Very high CPU usage on host" + description: "CPU usage is {{ $value | humanize }}%" + + # High Memory Usage (> 85%) + - alert: HighMemoryUsage + expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85 + for: 10m + labels: + severity: warning + annotations: + summary: "High memory usage on host" + description: "Memory usage is {{ $value | humanize }}%" + + # Very High Memory Usage (> 95%) + - alert: VeryHighMemoryUsage + expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95 + for: 5m + labels: + severity: critical + annotations: + summary: "Very high memory usage on host" + description: "Memory usage is {{ $value | humanize }}%" + + # High Disk Usage (> 80%) + - alert: HighDiskUsage + expr: | + (1 - (node_filesystem_avail_bytes{mountpoint=~"/host_mnt/Users|/"} + / node_filesystem_size_bytes{mountpoint=~"/host_mnt/Users|/"})) * 100 > 80 + for: 10m + labels: + severity: warning + annotations: + summary: "High disk usage on {{ $labels.mountpoint }}" + description: "Disk usage is {{ $value | humanize }}%" + + # Very High Disk Usage (> 90%) + - alert: VeryHighDiskUsage + expr: | + (1 - (node_filesystem_avail_bytes{mountpoint=~"/host_mnt/Users|/"} + / node_filesystem_size_bytes{mountpoint=~"/host_mnt/Users|/"})) * 100 > 90 + for: 5m + labels: + severity: critical + annotations: + summary: "Very high disk usage on {{ $labels.mountpoint }}" + description: "Disk usage is {{ $value | humanize }}%" + + - name: database_alerts + rules: + # PostgreSQL Down + - alert: PostgreSQLDown + expr: pg_up == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "PostgreSQL is down" + description: "PostgreSQL has been down for more than 1 minute." + + # Redis Down + - alert: RedisDown + expr: redis_up == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Redis is down" + description: "Redis has been down for more than 1 minute." + + # PostgreSQL High Connections (> 80) + - alert: PostgreSQLHighConnections + expr: sum(pg_stat_activity_count) > 80 + for: 5m + labels: + severity: warning + annotations: + summary: "High PostgreSQL connections" + description: "PostgreSQL has {{ $value }} connections (> 80)" + + # PostgreSQL Low Cache Hit Ratio (< 90%) + - alert: PostgreSQLLowCacheHitRatio + expr: | + avg(pg_stat_database_blks_hit{datname!~"template.*|postgres"} + / (pg_stat_database_blks_hit{datname!~"template.*|postgres"} + + pg_stat_database_blks_read{datname!~"template.*|postgres"} + 0.0001)) * 100 < 90 + for: 10m + labels: + severity: warning + annotations: + summary: "PostgreSQL low cache hit ratio" + description: "PostgreSQL cache hit ratio is {{ $value | humanize }}%" + + # Redis High Memory (> 1GB) + - alert: RedisHighMemory + expr: redis_memory_used_bytes > 1024 * 1024 * 1024 + for: 10m + labels: + severity: warning + annotations: + summary: "Redis high memory usage" + description: "Redis memory usage is {{ $value | humanize1024 }}B" + + # Redis Blocked Clients + - alert: RedisBlockedClients + expr: redis_blocked_clients > 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Redis has blocked clients" + description: "Redis has {{ $value }} blocked clients" + + - name: container_alerts + rules: + # Container High CPU (> 80% of limit) + - alert: ContainerHighCPU + expr: | + sum(rate(container_cpu_usage_seconds_total{id=~"/docker/.+"}[5m])) by (name) * 100 > 80 + for: 10m + labels: + severity: warning + annotations: + summary: "Container {{ $labels.name }} high CPU" + description: "Container {{ $labels.name }} CPU usage is {{ $value | humanize }}%" + + # Container High Memory (> 80% of limit) + - alert: ContainerHighMemory + expr: | + container_memory_usage_bytes{id=~"/docker/.+"} + / container_spec_memory_limit_bytes{id=~"/docker/.+"} * 100 > 80 + for: 10m + labels: + severity: warning + annotations: + summary: "Container {{ $labels.name }} high memory" + description: "Container {{ $labels.name }} memory usage is {{ $value | humanize }}%" + + # Container Restart + - alert: ContainerRestarted + expr: | + increase(container_start_time_seconds{id=~"/docker/.+"}[5m]) > 0 + for: 0m + labels: + severity: info + annotations: + summary: "Container {{ $labels.name }} restarted" + description: "Container {{ $labels.name }} has restarted." diff --git a/docker/prometheus/prometheus.yml b/docker/prometheus/prometheus.yml index 337e9d64a..ad8d73f6d 100644 --- a/docker/prometheus/prometheus.yml +++ b/docker/prometheus/prometheus.yml @@ -5,6 +5,10 @@ global: scrape_interval: 15s evaluation_interval: 15s +# Load alerting rules +rule_files: + - /etc/prometheus/alerts.yml + # Alertmanager configuration (optional, for future use) # alerting: # alertmanagers: