From 9dfad0128ab57a562b07e0b94b593dd082e812b2 Mon Sep 17 00:00:00 2001 From: Till-JS <101404291+Till-JS@users.noreply.github.com> Date: Wed, 28 Jan 2026 12:38:04 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=93=88=20feat(monitoring):=20upgrade=20to?= =?UTF-8?q?=20VictoriaMetrics=20+=20DuckDB=20analytics?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace Prometheus with VictoriaMetrics (2-year retention) - Add DuckDB analytics module for business KPIs (unlimited retention) - Add master overview dashboard combining all metrics - Add business metrics dashboard for user growth tracking - Add backup script for VictoriaMetrics snapshots and DuckDB - Add ADR documentation for monitoring stack decision Analytics API endpoints: - GET /api/v1/analytics/health - Service health - GET /api/v1/analytics/latest - Latest metrics snapshot - GET /api/v1/analytics/growth - User growth over time - GET /api/v1/analytics/monthly - Monthly aggregates - POST /api/v1/analytics/snapshot - Manual snapshot trigger --- .env.development | 11 +- docker-compose.macmini.yml | 37 +- .../grafana/dashboards/business-metrics.json | 439 +++++++ .../grafana/dashboards/master-overview.json | 1155 +++++++++++++++++ .../provisioning/datasources/prometheus.yml | 15 +- .../decisions/001-monitoring-stack-upgrade.md | 593 +++++++++ package.json | 13 + scripts/backup-monitoring.sh | 130 ++ scripts/generate-env.mjs | 24 + scripts/setup-databases.sh | 17 +- services/mana-core-auth/.gitignore | 3 + services/mana-core-auth/package.json | 3 +- .../src/analytics/analytics.controller.ts | 135 ++ .../src/analytics/analytics.module.ts | 12 + .../src/analytics/analytics.service.ts | 327 +++++ .../mana-core-auth/src/analytics/index.ts | 3 + services/mana-core-auth/src/app.module.ts | 2 + 17 files changed, 2901 insertions(+), 18 deletions(-) create mode 100644 docker/grafana/dashboards/business-metrics.json create mode 100644 docker/grafana/dashboards/master-overview.json create 
mode 100644 docs/decisions/001-monitoring-stack-upgrade.md create mode 100755 scripts/backup-monitoring.sh create mode 100644 services/mana-core-auth/src/analytics/analytics.controller.ts create mode 100644 services/mana-core-auth/src/analytics/analytics.module.ts create mode 100644 services/mana-core-auth/src/analytics/analytics.service.ts create mode 100644 services/mana-core-auth/src/analytics/index.ts diff --git a/.env.development b/.env.development index d72bb2845..8c20778dd 100644 --- a/.env.development +++ b/.env.development @@ -171,7 +171,16 @@ ZITARE_DATABASE_URL=postgresql://manacore:devpassword@localhost:5432/zitare ZITARE_BOT_PORT=3303 ZITARE_BOT_DATABASE_URL=postgresql://manacore:devpassword@localhost:5432/zitare_bot -ZITARE_BOT_TELEGRAM_TOKEN= +ZITARE_BOT_TELEGRAM_TOKEN=8489424174:AAHHG_mlLVeu6xAWY6U2ZGXO0D8JKWnqBvg + +# ============================================ +# TODO TELEGRAM BOT +# ============================================ + +TODO_BOT_PORT=3304 +TODO_BOT_DATABASE_URL=postgresql://manacore:devpassword@localhost:5432/todo_bot +TODO_BOT_TELEGRAM_TOKEN=8363906368:AAHzNC1DPSb0TUb2a3UGWWH1_rrAQFdBv2w +TODO_BOT_API_URL=http://localhost:3018 # ============================================ # PRESI PROJECT diff --git a/docker-compose.macmini.yml b/docker-compose.macmini.yml index 268a8f21d..fc625681a 100644 --- a/docker-compose.macmini.yml +++ b/docker-compose.macmini.yml @@ -90,6 +90,10 @@ services: SMTP_PASSWORD: ${SMTP_PASSWORD} SMTP_FROM: ManaCore CORS_ORIGINS: https://mana.how,https://chat.mana.how,https://todo.mana.how,https://calendar.mana.how,https://clock.mana.how,https://contacts.mana.how,https://storage.mana.how,https://presi.mana.how + # DuckDB Analytics (Business Metrics) + DUCKDB_PATH: /data/analytics/metrics.duckdb + volumes: + - analytics_data:/data/analytics ports: - "3001:3001" healthcheck: @@ -534,23 +538,28 @@ services: # Monitoring Stack # ============================================ - prometheus: - image: prom/prometheus:v2.51.0 - 
container_name: manacore-prometheus + # VictoriaMetrics - High-performance Prometheus replacement + # See docs/decisions/001-monitoring-stack-upgrade.md for details + victoriametrics: + image: victoriametrics/victoria-metrics:v1.99.0 + container_name: manacore-victoriametrics restart: always command: - - '--config.file=/etc/prometheus/prometheus.yml' - - '--storage.tsdb.path=/prometheus' - - '--storage.tsdb.retention.time=30d' - - '--web.enable-lifecycle' + - '-storageDataPath=/storage' + - '-retentionPeriod=2y' + - '-httpListenAddr=:8428' + - '-promscrape.config=/etc/prometheus/prometheus.yml' + - '-promscrape.config.strictParse=false' + - '-selfScrapeInterval=15s' + - '-search.latencyOffset=0s' volumes: - ./docker/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro - ./docker/prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro - - prometheus_data:/prometheus + - victoriametrics_data:/storage ports: - - "9090:9090" + - "8428:8428" healthcheck: - test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:9090/-/healthy"] + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:8428/health"] interval: 30s timeout: 10s retries: 3 @@ -572,7 +581,7 @@ services: container_name: manacore-grafana restart: always depends_on: - prometheus: + victoriametrics: condition: service_healthy environment: GF_SECURITY_ADMIN_USER: admin @@ -943,10 +952,12 @@ volumes: name: manacore-redis minio_data: name: manacore-minio - prometheus_data: - name: manacore-prometheus + victoriametrics_data: + name: manacore-victoriametrics grafana_data: name: manacore-grafana + analytics_data: + name: manacore-analytics n8n_data: name: manacore-n8n synapse_data: diff --git a/docker/grafana/dashboards/business-metrics.json b/docker/grafana/dashboards/business-metrics.json new file mode 100644 index 000000000..43cfb5859 --- /dev/null +++ b/docker/grafana/dashboards/business-metrics.json @@ -0,0 +1,439 @@ +{ + "annotations": { + "list": [] + }, + 
"editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [ + { + "asDropdown": false, + "icon": "external link", + "includeVars": false, + "keepTime": true, + "tags": [], + "targetBlank": true, + "title": "Analytics API", + "url": "http://localhost:3001/api/analytics/health", + "type": "link" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 1, + "panels": [], + "title": "Business KPIs (Long-term Storage via DuckDB)", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Current total users from Prometheus (real-time)", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "blue", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 5, "w": 6, "x": 0, "y": 1 }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "auth_users_total", + "legendFormat": "Total Users", + "refId": "A" + } + ], + "title": "Total Users (Real-time)", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Verified users from Prometheus", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 5, "w": 6, "x": 6, "y": 1 }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + 
"reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "auth_users_verified", + "legendFormat": "Verified", + "refId": "A" + } + ], + "title": "Verified Users", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Verification rate percentage", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 50 }, + { "color": "green", "value": 80 } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { "h": 5, "w": 6, "x": 12, "y": 1 }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "auth_users_verified / auth_users_total", + "legendFormat": "Verification Rate", + "refId": "A" + } + ], + "title": "Verification Rate", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "New users registered today", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "orange", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 5, "w": 6, "x": 18, "y": 1 }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + 
"textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "auth_users_created_today", + "legendFormat": "Today", + "refId": "A" + } + ], + "title": "New Users Today", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }, + "id": 6, + "panels": [], + "title": "User Growth Trends", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "User growth over the selected time range", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 10, "w": 16, "x": 0, "y": 7 }, + "id": 7, + "options": { + "legend": { + "calcs": ["lastNotNull", "min", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "auth_users_total", + "legendFormat": "Total Users", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "auth_users_verified", + "legendFormat": "Verified Users", + "refId": "B" + 
} + ], + "title": "User Growth Over Time", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "New registrations by period", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "purple", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 5, "w": 8, "x": 16, "y": 7 }, + "id": 8, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "auth_users_created_today", + "legendFormat": "Today", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "auth_users_created_this_week", + "legendFormat": "This Week", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "auth_users_created_this_month", + "legendFormat": "This Month", + "refId": "C" + } + ], + "title": "New Registrations", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Verification rate over time", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", + "spanNulls": false, + 
"stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { "h": 5, "w": 8, "x": 16, "y": 12 }, + "id": 9, + "options": { + "legend": { + "calcs": ["mean"], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "auth_users_verified / auth_users_total", + "legendFormat": "Verification Rate", + "refId": "A" + } + ], + "title": "Verification Rate Trend", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 17 }, + "id": 10, + "panels": [], + "title": "Data Retention Info", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "gridPos": { "h": 4, "w": 24, "x": 0, "y": 18 }, + "id": 11, + "options": { + "code": { + "language": "plaintext", + "showLineNumbers": false, + "showMiniMap": false + }, + "content": "## Data Retention Policy\n\n| Data Source | Retention | Purpose |\n|-------------|-----------|----------|\n| **VictoriaMetrics** | 2 Years | Operational metrics (CPU, Memory, Requests, Latency) |\n| **DuckDB** | Unlimited | Business KPIs (User growth, Feature usage) |\n\n**API Endpoints:**\n- `GET /api/analytics/health` - Service health\n- `GET /api/analytics/latest` - Latest metrics snapshot\n- `GET /api/analytics/growth?days=90` - User growth data\n- `GET /api/analytics/monthly?months=12` - Monthly aggregates\n- `POST /api/analytics/snapshot` - Trigger manual snapshot", + "mode": "markdown" + }, + "pluginVersion": "10.4.1", + "title": "About Business Metrics", + "type": "text" + } + ], + "refresh": "1m", + "schemaVersion": 38, + "tags": ["manacore", 
"business", "kpi", "duckdb"], + "templating": { + "list": [ + { + "current": { "selected": false, "text": "Prometheus", "value": "Prometheus" }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + } + ] + }, + "time": { "from": "now-30d", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "Business Metrics", + "uid": "business-metrics", + "version": 1, + "weekStart": "" +} diff --git a/docker/grafana/dashboards/master-overview.json b/docker/grafana/dashboards/master-overview.json new file mode 100644 index 000000000..a448a02cb --- /dev/null +++ b/docker/grafana/dashboards/master-overview.json @@ -0,0 +1,1155 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [ + { + "asDropdown": true, + "icon": "external link", + "includeVars": false, + "keepTime": true, + "tags": ["manacore"], + "targetBlank": false, + "title": "Detailed Dashboards", + "type": "dashboards" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 1, + "panels": [], + "title": "Service Health", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [ + { + "options": { + "0": { "color": "red", "index": 1, "text": "DOWN" }, + "1": { "color": "green", "index": 0, "text": "UP" } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 3, "w": 24, "x": 0, "y": 1 }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "horizontal", + "reduceOptions": { + "calcs": 
["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "up{job=\"mana-core-auth\"}", + "legendFormat": "Auth", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "up{job=\"chat-backend\"}", + "legendFormat": "Chat", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "up{job=\"todo-backend\"}", + "legendFormat": "Todo", + "refId": "C" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "up{job=\"calendar-backend\"}", + "legendFormat": "Calendar", + "refId": "D" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "up{job=\"clock-backend\"}", + "legendFormat": "Clock", + "refId": "E" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "up{job=\"contacts-backend\"}", + "legendFormat": "Contacts", + "refId": "F" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "pg_up", + "legendFormat": "PostgreSQL", + "refId": "G" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "redis_up", + "legendFormat": "Redis", + "refId": "H" + } + ], + "title": "Service Status", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }, + "id": 3, + "panels": [], + "title": "Key Metrics", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 5 }, + "id": 4, + 
"options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)", + "refId": "A" + } + ], + "title": "CPU", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 4, "y": 5 }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100", + "refId": "A" + } + ], + "title": "Memory", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 70 }, + { "color": "red", "value": 85 } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 5 }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + 
"calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "(1 - (node_filesystem_avail_bytes{mountpoint=~\"/host_mnt/Users|/\"} / node_filesystem_size_bytes{mountpoint=~\"/host_mnt/Users|/\"})) * 100", + "refId": "A" + } + ], + "title": "Disk", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "blue", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 12, "y": 5 }, + "id": 7, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "auth_users_total", + "refId": "A" + } + ], + "title": "Total Users", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "purple", "value": null }] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 16, "y": 5 }, + "id": 8, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(rate(http_requests_total[5m]))", + "refId": "A" + } + ], 
+ "title": "Requests/sec", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.01 }, + { "color": "red", "value": 0.05 } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 20, "y": 5 }, + "id": 9, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m]))", + "refId": "A" + } + ], + "title": "Error Rate", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 9 }, + "id": 10, + "panels": [], + "title": "Traffic & Performance", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": 
null }] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 10 }, + "id": 11, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(rate(http_requests_total[5m])) by (job)", + "legendFormat": "{{job}}", + "refId": "A" + } + ], + "title": "Request Rate by Service", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 10 }, + "id": 12, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, job))", + "legendFormat": "p95 {{job}}", + "refId": "A" + } + ], + "title": "Response Time (p95)", + "type": 
"timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "percentunit" + }, + "overrides": [ + { + "matcher": { "id": "byRegexp", "options": ".*" }, + "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] + } + ] + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 10 }, + "id": 13, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) by (job) / sum(rate(http_requests_total[5m])) by (job)", + "legendFormat": "{{job}}", + "refId": "A" + } + ], + "title": "Error Rate by Service", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 18 }, + "id": 14, + "panels": [], + "title": "Infrastructure", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": 
false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { "h": 7, "w": 12, "x": 0, "y": 19 }, + "id": 15, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)", + "legendFormat": "CPU", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100", + "legendFormat": "Memory", + "refId": "B" + } + ], + "title": "CPU & Memory", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + 
"showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { "h": 7, "w": 12, "x": 12, "y": 19 }, + "id": 16, + "options": { + "legend": { + "calcs": ["lastNotNull"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "pg_database_size_bytes{datname!~\"template.*|postgres\"}", + "legendFormat": "{{datname}}", + "refId": "A" + } + ], + "title": "Database Size", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 }, + "id": 17, + "panels": [], + "title": "Database & Cache", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 90 }, + { "color": "green", "value": 95 } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 27 }, + "id": 18, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["mean"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "avg(pg_stat_database_blks_hit{datname!~\"template.*|postgres\"} / (pg_stat_database_blks_hit{datname!~\"template.*|postgres\"} + pg_stat_database_blks_read{datname!~\"template.*|postgres\"} + 0.0001)) * 100", + "refId": "A" + } + ], + "title": "PG Cache Hit", + 
"type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 50 }, + { "color": "red", "value": 80 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 4, "y": 27 }, + "id": 19, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(pg_stat_activity_count)", + "refId": "A" + } + ], + "title": "PG Connections", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 90 }, + { "color": "green", "value": 95 } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 27 }, + "id": 20, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "redis_keyspace_hits_total / (redis_keyspace_hits_total + redis_keyspace_misses_total) * 100", + "refId": "A" + } + ], + "title": "Redis Hit Rate", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { 
+ "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 12, "y": 27 }, + "id": 21, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "redis_memory_used_bytes", + "refId": "A" + } + ], + "title": "Redis Memory", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 16, "y": 27 }, + "id": 22, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "count(container_last_seen{id=~\"/docker/.+\"})", + "refId": "A" + } + ], + "title": "Containers", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 20, "y": 27 }, + "id": 23, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": 
false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(pg_database_size_bytes{datname!~\"template.*|postgres\"})", + "refId": "A" + } + ], + "title": "Total DB Size", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 31 }, + "id": 24, + "panels": [], + "title": "User Growth", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "opacity", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 6, "w": 12, "x": 0, "y": 32 }, + "id": 25, + "options": { + "legend": { + "calcs": ["lastNotNull", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "auth_users_total", + "legendFormat": "Total Users", + "refId": "A" + } + ], + "title": "Total Users Over Time", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + 
"thresholds": { + "mode": "absolute", + "steps": [{ "color": "orange", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 6, "w": 4, "x": 12, "y": 32 }, + "id": 26, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "auth_users_created_today", + "legendFormat": "Today", + "refId": "A" + } + ], + "title": "New Today", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "blue", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 6, "w": 4, "x": 16, "y": 32 }, + "id": 27, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "auth_users_created_this_week", + "legendFormat": "This Week", + "refId": "A" + } + ], + "title": "New This Week", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "purple", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 6, "w": 4, "x": 20, "y": 32 }, + "id": 28, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + 
"orientation": "horizontal", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "auth_users_created_this_month", + "legendFormat": "This Month", + "refId": "A" + } + ], + "title": "New This Month", + "type": "stat" + } + ], + "refresh": "30s", + "schemaVersion": 38, + "tags": ["manacore", "master", "overview"], + "templating": { + "list": [ + { + "current": { "selected": false, "text": "Prometheus", "value": "Prometheus" }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + } + ] + }, + "time": { "from": "now-6h", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "ManaCore Master Overview", + "uid": "master-overview", + "version": 1, + "weekStart": "" +} diff --git a/docker/grafana/provisioning/datasources/prometheus.yml b/docker/grafana/provisioning/datasources/prometheus.yml index 1cbaaafcf..bd5450b5a 100644 --- a/docker/grafana/provisioning/datasources/prometheus.yml +++ b/docker/grafana/provisioning/datasources/prometheus.yml @@ -1,5 +1,6 @@ # Grafana Datasource Provisioning -# Auto-configures Prometheus as the default datasource +# Auto-configures VictoriaMetrics as the default datasource +# VictoriaMetrics is 100% Prometheus-compatible, using same type apiVersion: 1 @@ -7,9 +8,19 @@ datasources: - name: Prometheus type: prometheus access: proxy - url: http://prometheus:9090 + url: http://victoriametrics:8428 isDefault: true editable: true jsonData: timeInterval: "15s" httpMethod: POST + + # Business Metrics API (DuckDB via mana-core-auth) + - name: Business Metrics + type: yesoreyeram-infinity-datasource + access: proxy + url: http://mana-core-auth:3001 + isDefault: false + editable: true + jsonData: + 
datasource_mode: "basic" diff --git a/docs/decisions/001-monitoring-stack-upgrade.md b/docs/decisions/001-monitoring-stack-upgrade.md new file mode 100644 index 000000000..3f7f57de8 --- /dev/null +++ b/docs/decisions/001-monitoring-stack-upgrade.md @@ -0,0 +1,593 @@ +# ADR-001: Monitoring Stack Upgrade - VictoriaMetrics + DuckDB + +**Status:** Accepted +**Date:** 2025-01-28 +**Author:** Till Schneider +**Reviewers:** - + +## Executive Summary + +Upgrade des ManaCore Monitoring Stacks von Prometheus (30 Tage Retention) auf VictoriaMetrics (2 Jahre) + DuckDB (unbegrenzt) für langfristige Metriken-Speicherung und Business-Analytics. + +--- + +## 1. Kontext & Problemstellung + +### 1.1 Aktuelle Situation + +ManaCore nutzt einen Standard-Prometheus + Grafana Stack für Monitoring: + +``` +┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ +│ NestJS Backends│────>│ Prometheus │────>│ Grafana │ +│ (6 Services) │ │ (30 Tage) │ │ (5 Dashboards) │ +└─────────────────┘ └─────────────────┘ └─────────────────┘ + │ + ├── mana-core-auth (Port 3001) + ├── chat-backend (Port 3002) + ├── todo-backend (Port 3018) + ├── calendar-backend (Port 3016) + ├── clock-backend (Port 3017) + └── contacts-backend (Port 3015) +``` + +**Komponenten:** +- Prometheus v2.51.0 mit 30 Tagen Retention +- Grafana 10.4.1 mit 5 Dashboards +- Node Exporter, cAdvisor, PostgreSQL Exporter, Redis Exporter +- Alerting Rules (20+ Regeln) + +### 1.2 Das Problem + +**Nach 30 Tagen sind alle historischen Metriken unwiederbringlich verloren.** + +| Betroffene Daten | Konsequenz | +|------------------|------------| +| User-Wachstum (`auth_users_total`) | Keine Trend-Analyse möglich | +| Historische Error Rates | Keine Langzeit-Vergleiche | +| Performance-Trends | Keine Kapazitätsplanung | +| Infrastruktur-Metriken | Keine saisonalen Muster erkennbar | + +**Besonders kritisch:** Business-KPIs wie `auth_users_total`, `auth_users_created_this_month` sind Point-in-Time Snapshots. 
Ohne historische Daten ist es unmöglich zu rekonstruieren, wie viele User vor 2 Monaten existierten. + +### 1.3 Anforderungen + +| Anforderung | Priorität | +|-------------|-----------| +| Operative Metriken für mindestens 1-2 Jahre speichern | Hoch | +| Business-KPIs unbegrenzt speichern | Hoch | +| Keine Änderung an bestehenden Dashboards | Mittel | +| Minimaler zusätzlicher Ressourcenverbrauch | Mittel | +| Einfache Wartung und Backup | Mittel | + +--- + +## 2. Evaluierte Optionen + +### 2.1 Option A: Prometheus Retention erhöhen + +**Ansatz:** `--storage.tsdb.retention.time=365d` + +**Vorteile:** +- Keine Migration nötig +- Keine neuen Komponenten + +**Nachteile:** +- Prometheus TSDB ist nicht für Langzeit optimiert +- RAM-Verbrauch steigt linear mit Retention +- Queries über alte Daten werden langsam +- Compaction-Overhead bei großen Datenmengen + +**Bewertung:** Kurzfristige Lösung, skaliert nicht. + +### 2.2 Option B: Thanos / Cortex + +**Ansatz:** Prometheus + Langzeit-Storage-Layer (S3/MinIO) + +**Vorteile:** +- Industriestandard für große Deployments +- Unbegrenzte Retention möglich + +**Nachteile:** +- Hohe Komplexität (5+ zusätzliche Komponenten) +- Overkill für ManaCore's Größe (~50k Time Series) +- Signifikanter Ops-Overhead + +**Bewertung:** Overengineered für unseren Use Case. + +### 2.3 Option C: VictoriaMetrics (gewählt) + +**Ansatz:** Drop-in Replacement für Prometheus + +**Vorteile:** +- 100% Prometheus-kompatibel (PromQL, Config-Format, Exporters) +- 3-10x bessere Kompression +- 5-10x weniger RAM-Verbrauch +- Schnellere Queries über historische Daten +- Single Binary, einfaches Deployment +- Migration in 10 Minuten + +**Nachteile:** +- Weniger bekannt als Prometheus (aber wachsende Community) +- Kein CNCF-Projekt (Prometheus ist CNCF Graduated) + +**Bewertung:** Beste Balance aus Einfachheit und Leistung. 
+ +### 2.4 Option D: PostgreSQL für Business-Metriken + +**Ansatz:** Tägliche Snapshots in PostgreSQL speichern + +**Vorteile:** +- Bestehende Infrastruktur nutzen +- SQL für Queries +- Unbegrenzte Retention + +**Nachteile:** +- Nicht optimiert für Analytics-Queries +- Connection-Pool Overhead +- Row-based Storage ineffizient für Aggregationen + +**Bewertung:** Funktional, aber nicht optimal für Analytics. + +### 2.5 Option E: DuckDB für Business-Metriken (gewählt) + +**Ansatz:** Embedded OLAP-Datenbank für tägliche Business-KPI Snapshots + +**Vorteile:** +- Kein Server nötig (embedded, single file) +- Column-oriented = perfekt für Analytics +- 10-100x schneller als PostgreSQL für Aggregationen +- Exzellente Kompression +- Native Parquet Import/Export +- SQL-kompatibel + +**Nachteile:** +- Nicht für concurrent writes (irrelevant bei 1x täglich) +- Keine native Grafana-Integration (API-Endpoint nötig) + +**Bewertung:** Perfekt für den Use Case (append-only, read-heavy, analytics). + +--- + +## 3. 
Entscheidung + +### 3.1 Gewählte Architektur + +``` +┌─────────────────────────────────────────────────────────────────────────┐ +│ ManaCore Monitoring Stack v2 │ +├─────────────────────────────────────────────────────────────────────────┤ +│ │ +│ OPERATIVE METRIKEN (High-Frequency Time Series) │ +│ ════════════════════════════════════════════════ │ +│ │ +│ ┌──────────────┐ ┌──────────────────┐ ┌─────────────────┐ │ +│ │ Backends │────>│ VictoriaMetrics │────>│ Grafana │ │ +│ │ /metrics │ │ │ │ │ │ +│ │ │ │ Retention: 2y │ │ Existing │ │ +│ │ + Exporters │ │ Scrape: 15-30s │ │ Dashboards │ │ +│ └──────────────┘ └──────────────────┘ └─────────────────┘ │ +│ ▲ │ +│ │ │ +│ BUSINESS METRIKEN (Daily Snapshots, Analytics) │ │ +│ ══════════════════════════════════════════════ │ │ +│ │ │ +│ ┌──────────────┐ ┌──────────────────┐ │ │ +│ │ Daily Cron │────>│ DuckDB │────────────┘ │ +│ │ 00:00 UTC │ │ │ (via JSON API) │ +│ │ │ │ Retention: ∞ │ │ +│ │ Snapshots: │ │ File: metrics.db│ │ +│ │ - Users │ │ Size: ~10MB/year│ │ +│ │ - Growth │ │ │ │ +│ │ - Features │ │ Backup: cp file │ │ +│ └──────────────┘ └──────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────┐ │ +│ │Parquet Export│ │ +│ │ (Archiv) │ │ +│ └──────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────┘ +``` + +### 3.2 Daten-Aufteilung + +| Datentyp | Storage | Retention | Grund | +|----------|---------|-----------|-------| +| CPU, Memory, Disk | VictoriaMetrics | 2 Jahre | High-frequency, Time-Series | +| HTTP Requests, Latency | VictoriaMetrics | 2 Jahre | High-frequency, PromQL | +| Error Rates, Status Codes | VictoriaMetrics | 2 Jahre | Alerting, Debugging | +| Container Metrics | VictoriaMetrics | 2 Jahre | Kapazitätsplanung | +| **User Counts** | DuckDB | Unbegrenzt | Business KPI, Trend-Analyse | +| **User Growth** | DuckDB | Unbegrenzt | Business KPI | +| **Feature Usage** | DuckDB | Unbegrenzt | Product Analytics | +| **Revenue/Subscriptions** | DuckDB | Unbegrenzt | 
Business KPI | + +### 3.3 Warum diese Kombination? + +**VictoriaMetrics für operative Metriken:** +- Prometheus-kompatibel = keine Dashboard-Änderungen +- 2 Jahre Retention bei ~15GB Storage +- Schnelle Queries auch über historische Daten +- Bewährte Time-Series Datenbank + +**DuckDB für Business-Metriken:** +- Perfekt für "1x täglich schreiben, oft lesen" +- SQL für komplexe Analytics-Queries +- Single-File = triviales Backup +- Kein zusätzlicher Server/Container +- Unbegrenzte Retention bei minimalem Footprint + +--- + +## 4. Technische Details + +### 4.1 VictoriaMetrics Konfiguration + +```yaml +# docker-compose.macmini.yml +services: + victoriametrics: + image: victoriametrics/victoria-metrics:v1.99.0 + container_name: victoriametrics + restart: unless-stopped + command: + - '-storageDataPath=/storage' + - '-retentionPeriod=2y' + - '-httpListenAddr=:8428' + - '-promscrape.config=/etc/prometheus/prometheus.yml' + - '-promscrape.config.strictParse=false' + volumes: + - vm-storage:/storage + - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - ./prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro + ports: + - "8428:8428" + networks: + - manacore-network +``` + +**Ressourcen-Vergleich (geschätzt):** + +| Metrik | Prometheus (30d) | VictoriaMetrics (2y) | +|--------|------------------|----------------------| +| RAM | ~2 GB | ~500 MB | +| Disk | ~5 GB | ~15 GB | +| CPU | Höher (Compaction) | Niedriger | + +### 4.2 DuckDB Schema + +```sql +-- Haupt-Tabelle für tägliche Snapshots +CREATE TABLE daily_metrics ( + date DATE PRIMARY KEY, + + -- User Metrics + total_users INTEGER NOT NULL, + verified_users INTEGER NOT NULL, + new_users_today INTEGER NOT NULL, + new_users_week INTEGER NOT NULL, + new_users_month INTEGER NOT NULL, + + -- Engagement (Platzhalter für Zukunft) + daily_active_users INTEGER, + weekly_active_users INTEGER, + monthly_active_users INTEGER, + + -- Per-App Metrics (Platzhalter) + chat_messages_sent INTEGER, + pictures_generated INTEGER, 
+ + -- Infrastructure Snapshots + total_db_size_bytes BIGINT, + total_storage_size_bytes BIGINT, + + -- Metadata + recorded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +-- Index für schnelle Range-Queries +CREATE INDEX idx_daily_metrics_date ON daily_metrics(date); + +-- View für monatliche Aggregation +CREATE VIEW monthly_metrics AS +SELECT + DATE_TRUNC('month', date) AS month, + MAX(total_users) AS total_users_eom, + SUM(new_users_today) AS new_users, + AVG(daily_active_users) AS avg_dau +FROM daily_metrics +GROUP BY DATE_TRUNC('month', date); +``` + +### 4.3 DuckDB Service Implementation + +```typescript +// services/mana-core-auth/src/analytics/analytics.service.ts +@Injectable() +export class AnalyticsService { + private db: Database; + + constructor( + private readonly usersService: UsersService, + private readonly configService: ConfigService, + ) { + const dbPath = this.configService.get('DUCKDB_PATH', '/data/metrics.duckdb'); + this.db = new Database(dbPath); + this.initializeSchema(); + } + + @Cron('0 0 * * *') // Täglich um Mitternacht UTC + async recordDailySnapshot(): Promise<void> { + const today = new Date().toISOString().split('T')[0]; + + const metrics = { + date: today, + total_users: await this.usersService.countTotal(), + verified_users: await this.usersService.countVerified(), + new_users_today: await this.usersService.countCreatedToday(), + new_users_week: await this.usersService.countCreatedThisWeek(), + new_users_month: await this.usersService.countCreatedThisMonth(), + total_db_size_bytes: await this.getDbSize(), + }; + + this.db.run(` + INSERT OR REPLACE INTO daily_metrics + (date, total_users, verified_users, new_users_today, + new_users_week, new_users_month, total_db_size_bytes) + VALUES (?, ?, ?, ?, ?, ?, ?) 
+ `, [ + metrics.date, + metrics.total_users, + metrics.verified_users, + metrics.new_users_today, + metrics.new_users_week, + metrics.new_users_month, + metrics.total_db_size_bytes, + ]); + } + + async getUserGrowth(months: number = 12): Promise<unknown[]> { + return this.db.all(` + SELECT + date, + total_users, + total_users - LAG(total_users) OVER (ORDER BY date) as growth + FROM daily_metrics + WHERE date > CURRENT_DATE - INTERVAL '${months} months' + ORDER BY date + `); + } +} +``` + +### 4.4 Grafana Integration + +**VictoriaMetrics:** +```yaml +# docker/grafana/provisioning/datasources/prometheus.yml +apiVersion: 1 +datasources: + - name: Prometheus + type: prometheus + url: http://victoriametrics:8428 # Nur URL ändert sich + isDefault: true + editable: false +``` + +**DuckDB (via JSON API):** +```yaml +# docker/grafana/provisioning/datasources/duckdb.yml +apiVersion: 1 +datasources: + - name: Business Metrics + type: simpod-json-datasource + url: http://mana-core-auth:3001/api/analytics + isDefault: false + editable: false +``` + +--- + +## 5. Migration + +### 5.1 Migrationspfad + +``` +Phase 1: VictoriaMetrics Deployment (Zero Downtime) +═══════════════════════════════════════════════════ +1. VictoriaMetrics Container hinzufügen +2. Parallel zu Prometheus laufen lassen +3. Grafana Datasource auf VM umstellen +4. Prometheus Container entfernen + +Phase 2: DuckDB Integration +═══════════════════════════ +1. DuckDB Dependency hinzufügen +2. Analytics Service implementieren +3. Cron-Job aktivieren +4. API Endpoints erstellen +5. Grafana Dashboard für Business Metrics + +Phase 3: Historische Daten (Optional) +═════════════════════════════════════ +1. Prometheus Daten exportieren +2. In VictoriaMetrics importieren +3. 
Initiale DuckDB-Befüllung aus Prometheus +``` + +### 5.2 Rollback-Plan + +**VictoriaMetrics → Prometheus:** +- Gleiche Config-Datei funktioniert +- Grafana Datasource URL zurückändern +- Container tauschen + +**DuckDB:** +- Service deaktivieren +- Keine Abhängigkeiten in anderen Services + +--- + +## 6. Monitoring & Alerting + +### 6.1 VictoriaMetrics Self-Monitoring + +```yaml +# prometheus/alerts.yml (Regelformat kompatibel; Auswertung mit VM erfordert vmalert) +groups: + - name: victoriametrics + rules: + - alert: VMStorageSpaceLow + expr: vm_free_disk_space_bytes / vm_available_disk_space_bytes < 0.1 + for: 5m + labels: + severity: warning + annotations: + summary: "VictoriaMetrics disk space low" +``` + +### 6.2 DuckDB Health Check + +```typescript +// Endpoint: GET /api/v1/analytics/health +{ + "status": "healthy", + "database_size_bytes": 10485760, + "last_snapshot": "2025-01-28", + "total_records": 365 +} +``` + +--- + +## 7. Backup-Strategie + +### 7.1 VictoriaMetrics + +```bash +# Snapshot erstellen (built-in) +curl -X POST "http://victoriametrics:8428/snapshot/create" + +# Backup zu S3/MinIO +vmbackup -storageDataPath=/storage -snapshot.createURL="http://localhost:8428/snapshot/create" -dst=s3://backups/vm/ +``` + +### 7.2 DuckDB + +```bash +# Einfacher File-Copy (konsistent da single-writer) +cp /data/metrics.duckdb /backup/metrics-$(date +%Y-%m-%d).duckdb + +# Oder Parquet-Export für Archivierung +duckdb /data/metrics.duckdb -c "COPY daily_metrics TO '/backup/metrics.parquet' (FORMAT PARQUET)" +``` + +--- + +## 8. 
Kosten & Ressourcen + +### 8.1 Storage-Projektion (2 Jahre) + +| Komponente | Jetzt | Nach Migration | +|------------|-------|----------------| +| Prometheus | 5 GB (30d) | 0 GB (entfernt) | +| VictoriaMetrics | 0 GB | ~15 GB (2y) | +| DuckDB | 0 GB | ~20 MB (2y) | +| **Total** | **5 GB** | **~15 GB** | + +### 8.2 RAM-Projektion + +| Komponente | Jetzt | Nach Migration | +|------------|-------|----------------| +| Prometheus | ~2 GB | 0 GB | +| VictoriaMetrics | 0 GB | ~500 MB | +| DuckDB | 0 GB | ~50 MB (on-demand) | +| **Total** | **~2 GB** | **~550 MB** | + +--- + +## 9. Implementierungsplan + +### Phase 1: VictoriaMetrics (Tag 1) +- [ ] docker-compose.macmini.yml aktualisieren +- [ ] VictoriaMetrics Container hinzufügen +- [ ] Grafana Datasource konfigurieren +- [ ] Bestehende Dashboards testen +- [ ] Prometheus Container entfernen + +### Phase 2: DuckDB Service (Tag 1-2) +- [ ] duckdb Package installieren +- [ ] AnalyticsModule erstellen +- [ ] DuckDB Schema initialisieren +- [ ] Daily Snapshot Cron-Job +- [ ] API Endpoints für Grafana + +### Phase 3: Dashboards & Dokumentation (Tag 2) +- [ ] Business Metrics Dashboard erstellen +- [ ] Master Overview Dashboard aktualisieren +- [ ] Dokumentation finalisieren +- [ ] Backup-Scripts erstellen + +--- + +## 10. Entscheidungsmatrix + +| Kriterium | Gewicht | Prometheus | VM + DuckDB | Score | +|-----------|---------|------------|-------------|-------| +| Langzeit-Retention | 30% | 2/10 | 10/10 | +2.4 | +| Ressourceneffizienz | 20% | 4/10 | 9/10 | +1.0 | +| Migrationsaufwand | 15% | 10/10 | 8/10 | -0.3 | +| Wartbarkeit | 15% | 7/10 | 8/10 | +0.15 | +| Analytics-Fähigkeit | 10% | 3/10 | 9/10 | +0.6 | +| Backup-Einfachheit | 10% | 5/10 | 9/10 | +0.4 | +| **Gesamt** | 100% | **4.75/10** | **9.0/10** | **+4.25** | + +--- + +## 11. 
Risiken & Mitigationen + +| Risiko | Wahrscheinlichkeit | Impact | Mitigation | +|--------|-------------------|--------|------------| +| VM nicht 100% PromQL-kompatibel | Niedrig | Mittel | Dashboards vorab testen | +| DuckDB Datenverlust | Niedrig | Hoch | Tägliches Backup | +| Cron-Job Ausfall | Mittel | Niedrig | Monitoring + Catch-up Logic | +| Storage voll | Niedrig | Mittel | Alerting bei 80% | + +--- + +## 12. Referenzen + +- [VictoriaMetrics Dokumentation](https://docs.victoriametrics.com/) +- [VictoriaMetrics vs Prometheus Benchmark](https://valyala.medium.com/prometheus-vs-victoriametrics-benchmark-on-node-exporter-metrics-4ca29c75590f) +- [DuckDB Dokumentation](https://duckdb.org/docs/) +- [Grafana JSON Datasource](https://grafana.com/grafana/plugins/simpod-json-datasource/) + +--- + +## Appendix A: Bestehende Dashboards + +| Dashboard | UID | Änderung nötig | +|-----------|-----|----------------| +| System Overview | `system-overview` | Keine | +| Backends & Docker | `backends-docker` | Keine | +| Application Details | `application-details` | Keine | +| Database Details | `database-details` | Keine | +| User Statistics | `user-statistics` | Keine | +| Master Overview | `master-overview` | Business Metrics hinzufügen | + +## Appendix B: Prometheus Config Kompatibilität + +Die bestehende `prometheus.yml` funktioniert ohne Änderung mit VictoriaMetrics: + +```yaml +# Alle Scrape-Configs bleiben identisch +scrape_configs: + - job_name: 'mana-core-auth' + static_configs: + - targets: ['mana-core-auth:3001'] + metrics_path: '/metrics' + scrape_interval: 30s + # ... 
alle anderen Jobs +``` diff --git a/package.json b/package.json index 34794df17..938ff7e45 100644 --- a/package.json +++ b/package.json @@ -20,6 +20,7 @@ "setup:db": "./scripts/setup-databases.sh", "setup:db:chat": "./scripts/setup-databases.sh chat", "setup:db:auth": "./scripts/setup-databases.sh auth", + "seed:dev-user": "pnpm --filter mana-core-auth db:seed:dev", "build:packages": "pnpm --filter '@manacore/*' build", "postinstall": "node scripts/generate-env.mjs || true && pnpm run build:packages || true", "manacore:dev": "turbo run dev --filter=manacore...", @@ -212,6 +213,18 @@ "dev:projectdoc:full": "./scripts/setup-databases.sh projectdoc && pnpm dev:projectdoc", "projectdoc:db:push": "pnpm --filter @manacore/telegram-project-doc-bot db:push", "projectdoc:db:studio": "pnpm --filter @manacore/telegram-project-doc-bot db:studio", + "dev:zitare-bot": "pnpm --filter @manacore/telegram-zitare-bot start:dev", + "dev:zitare-bot:full": "./scripts/setup-databases.sh zitare_bot && pnpm dev:zitare-bot", + "zitare-bot:db:push": "pnpm --filter @manacore/telegram-zitare-bot db:push", + "zitare-bot:db:studio": "pnpm --filter @manacore/telegram-zitare-bot db:studio", + "dev:todo-bot": "pnpm --filter @manacore/telegram-todo-bot start:dev", + "dev:todo-bot:full": "./scripts/setup-databases.sh todo_bot && ./scripts/setup-databases.sh todo && ./scripts/setup-databases.sh auth && concurrently -n auth,todo-be,bot -c blue,green,cyan \"pnpm dev:auth\" \"pnpm dev:todo:backend\" \"pnpm dev:todo-bot\"", + "todo-bot:db:push": "pnpm --filter @manacore/telegram-todo-bot db:push", + "todo-bot:db:studio": "pnpm --filter @manacore/telegram-todo-bot db:studio", + "dev:nutriphi-bot": "pnpm --filter @manacore/telegram-nutriphi-bot start:dev", + "dev:nutriphi-bot:full": "./scripts/setup-databases.sh nutriphi_bot && pnpm dev:nutriphi-bot", + "nutriphi-bot:db:push": "pnpm --filter @manacore/telegram-nutriphi-bot db:push", + "nutriphi-bot:db:studio": "pnpm --filter @manacore/telegram-nutriphi-bot 
db:studio", "prepare": "husky" }, "devDependencies": {
diff --git a/scripts/backup-monitoring.sh b/scripts/backup-monitoring.sh
new file mode 100755
index 000000000..1963f1318
--- /dev/null
+++ b/scripts/backup-monitoring.sh
@@ -0,0 +1,202 @@
+#!/bin/bash
+# Backup script for ManaCore Monitoring Stack
+# - VictoriaMetrics (2 years of metrics)
+# - DuckDB (Business KPIs)
+#
+# Every location can be overridden through the environment:
+#   BACKUP_DIR, RETENTION_DAYS, VM_URL, VM_DATA_PATH, DUCKDB_FILE, ANALYTICS_URL
+#
+# The script exits non-zero when any backup step failed. Old backups are only
+# pruned after a fully successful run, so a broken setup cannot age out the
+# last good backup.
+
+set -euo pipefail
+
+# Configuration
+BACKUP_DIR="${BACKUP_DIR:-/backup/monitoring}"
+DATE=$(date +%Y-%m-%d)
+RETENTION_DAYS="${RETENTION_DAYS:-30}"  # Keep backups for 30 days
+VM_URL="${VM_URL:-http://localhost:8428}"
+# Note: Adjust path based on your volume mount
+VM_DATA_PATH="${VM_DATA_PATH:-/var/lib/docker/volumes/manacore-victoriametrics/_data}"
+DUCKDB_FILE="${DUCKDB_FILE:-/var/lib/docker/volumes/manacore-analytics/_data/metrics.duckdb}"
+# The analytics endpoints are served under /api/v1/analytics
+ANALYTICS_URL="${ANALYTICS_URL:-http://localhost:3001/api/v1/analytics}"
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+log_info() { echo -e "${GREEN}[INFO]${NC} $1"; }
+log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
+log_error() { echo -e "${RED}[ERROR]${NC} $1"; }
+
+# Create backup directory
+mkdir -p "$BACKUP_DIR"
+
+# Download URL ($1) into file ($2). HTTP errors count as failures (-f) and a
+# partially written file is removed again.
+fetch_to_file() {
+    if curl -fsS "$1" -o "$2"; then
+        return 0
+    fi
+    rm -f "$2"
+    return 1
+}
+
+# ============================================
+# Backup VictoriaMetrics
+# ============================================
+backup_victoriametrics() {
+    local response snapshot_name snapshot_path backup_file
+    local status=0
+
+    log_info "Creating VictoriaMetrics snapshot..."
+
+    # Create snapshot via API
+    if ! response=$(curl -fsS -X POST "$VM_URL/snapshot/create"); then
+        log_error "Could not reach VictoriaMetrics at $VM_URL"
+        return 1
+    fi
+    # "|| true": a response without a snapshot name is handled just below
+    snapshot_name=$(echo "$response" | grep -o '"snapshot":"[^"]*"' | cut -d'"' -f4 || true)
+
+    if [ -z "$snapshot_name" ]; then
+        log_error "Failed to create VictoriaMetrics snapshot"
+        echo "$response"
+        return 1
+    fi
+
+    log_info "Snapshot created: $snapshot_name"
+
+    # Copy snapshot to backup directory
+    snapshot_path="$VM_DATA_PATH/snapshots/$snapshot_name"
+
+    if [ -d "$snapshot_path" ]; then
+        backup_file="$BACKUP_DIR/victoriametrics-$DATE.tar.gz"
+        log_info "Compressing snapshot to $backup_file..."
+        # -h: VictoriaMetrics documents snapshots as symlink trees into the
+        # data directory; without dereferencing, the archive would hold no data
+        if tar -czhf "$backup_file" -C "$VM_DATA_PATH/snapshots" "$snapshot_name"; then
+            log_info "VictoriaMetrics backup complete: $backup_file"
+        else
+            log_error "Failed to archive snapshot $snapshot_name"
+            rm -f "$backup_file"
+            status=1
+        fi
+    else
+        log_warn "Snapshot directory not found at $snapshot_path"
+        log_warn "If using Docker, you may need to run this inside the container"
+        status=1
+    fi
+
+    # Delete the snapshot in every case so failed runs do not pile up on disk
+    if curl -fsS -X POST "$VM_URL/snapshot/delete?snapshot=$snapshot_name" > /dev/null; then
+        log_info "Snapshot deleted from VictoriaMetrics"
+    else
+        log_warn "Could not delete snapshot $snapshot_name"
+    fi
+
+    return "$status"
+}
+
+# ============================================
+# Backup DuckDB
+# ============================================
+backup_duckdb() {
+    local backup_file parquet_file
+
+    log_info "Backing up DuckDB analytics database..."
+
+    if [ -f "$DUCKDB_FILE" ]; then
+        # DuckDB is a single file, so we can just copy it
+        # NOTE(review): the service may be writing while we copy - confirm
+        # that a possibly un-checkpointed copy is acceptable here
+        backup_file="$BACKUP_DIR/analytics-$DATE.duckdb"
+        if ! cp "$DUCKDB_FILE" "$backup_file"; then
+            log_error "Failed to copy $DUCKDB_FILE"
+            return 1
+        fi
+        log_info "DuckDB backup complete: $backup_file"
+
+        # Also export to Parquet for long-term archival. Read from the copy in
+        # read-only mode so the live database file is not opened a second time.
+        parquet_file="$BACKUP_DIR/analytics-$DATE.parquet"
+        if command -v duckdb &> /dev/null; then
+            if duckdb -readonly "$backup_file" -c "COPY daily_metrics TO '$parquet_file' (FORMAT PARQUET)"; then
+                log_info "Parquet export complete: $parquet_file"
+            else
+                log_warn "Parquet export failed, keeping only the .duckdb copy"
+                rm -f "$parquet_file"
+            fi
+        else
+            log_warn "duckdb CLI not found, skipping Parquet export"
+        fi
+    else
+        log_warn "DuckDB file not found at $DUCKDB_FILE"
+
+        # Try alternative: backup via API
+        log_info "Attempting backup via API..."
+        if ! fetch_to_file "$ANALYTICS_URL/latest" "$BACKUP_DIR/analytics-latest-$DATE.json"; then
+            log_error "API backup failed (latest)"
+            return 1
+        fi
+        if ! fetch_to_file "$ANALYTICS_URL/growth?days=365" "$BACKUP_DIR/analytics-growth-$DATE.json"; then
+            log_error "API backup failed (growth)"
+            return 1
+        fi
+        log_info "API backup complete"
+    fi
+}
+
+# ============================================
+# Cleanup old backups
+# ============================================
+cleanup_old_backups() {
+    log_info "Cleaning up backups older than $RETENTION_DAYS days..."
+    # Only remove files this script creates; BACKUP_DIR may hold other data
+    find "$BACKUP_DIR" -maxdepth 1 -type f \
+        \( -name 'victoriametrics-*.tar.gz' -o -name 'analytics-*' \) \
+        -mtime "+$RETENTION_DAYS" -delete
+    log_info "Cleanup complete"
+}
+
+# ============================================
+# Main
+# ============================================
+main() {
+    local failed=0
+
+    log_info "Starting ManaCore Monitoring Backup"
+    log_info "Backup directory: $BACKUP_DIR"
+    log_info "Date: $DATE"
+    echo ""
+
+    # Run both backups even when one fails
+    backup_victoriametrics || failed=1
+    echo ""
+
+    backup_duckdb || failed=1
+    echo ""
+
+    if [ "$failed" -eq 0 ]; then
+        cleanup_old_backups
+    else
+        log_warn "Skipping cleanup because a backup step failed"
+    fi
+    echo ""
+
+    log_info "Files in $BACKUP_DIR:"
+    ls -lh "$BACKUP_DIR"
+
+    if [ "$failed" -ne 0 ]; then
+        log_error "Backup finished with errors"
+        return 1
+    fi
+    log_info "All backups complete!"
+}
+
+# Run main function
+main "$@"
diff --git a/scripts/generate-env.mjs b/scripts/generate-env.mjs
index 9bd110f7d..b0c8935d5 100644
--- a/scripts/generate-env.mjs
+++ b/scripts/generate-env.mjs
@@ -607,6 +607,30 @@ const APP_CONFIGS = [
 			PUBLIC_BACKEND_URL: (env) => `http://localhost:${env.TECHBASE_BACKEND_PORT || '3021'}`,
 		},
 	},
+
+	// Zitare Telegram Bot
+	{
+		path: 'services/telegram-zitare-bot/.env',
+		vars: {
+			NODE_ENV: () => 'development',
+			PORT: (env) => env.ZITARE_BOT_PORT || '3303',
+			TELEGRAM_BOT_TOKEN: (env) => env.ZITARE_BOT_TELEGRAM_TOKEN,
+			DATABASE_URL: (env) => env.ZITARE_BOT_DATABASE_URL,
+		},
+	},
+
+	// Todo Telegram Bot
+	{
+		path: 'services/telegram-todo-bot/.env',
+		vars: {
+			NODE_ENV: () => 'development',
+			PORT: (env) => env.TODO_BOT_PORT || '3304',
+			TELEGRAM_BOT_TOKEN: (env) => env.TODO_BOT_TELEGRAM_TOKEN,
+			DATABASE_URL: (env) => env.TODO_BOT_DATABASE_URL,
+			TODO_API_URL: (env) => env.TODO_BOT_API_URL || 'http://localhost:3018',
+			MANA_CORE_AUTH_URL: (env) => env.MANA_CORE_AUTH_URL,
+		},
+	},
 ];
 
 function main() {
diff --git a/scripts/setup-databases.sh b/scripts/setup-databases.sh
index 6afe08cb8..c0e71ea3d 100755
--- a/scripts/setup-databases.sh
+++ b/scripts/setup-databases.sh
@@ -75,6 +75,9 @@ ALL_DATABASES=(
     "planta"
     "nutriphi"
     "projectdoc"
+    "zitare_bot"
+    "todo_bot"
+    "nutriphi_bot"
 )
 
 # Check if specific service requested
@@ -160,9 +163,21 @@ setup_service() {
             create_db_if_not_exists "projectdoc"
push_schema "@manacore/telegram-project-doc-bot" "projectdoc"
             ;;
+        zitare_bot|zitare-bot)
+            create_db_if_not_exists "zitare_bot"
+            push_schema "@manacore/telegram-zitare-bot" "zitare-bot"
+            ;;
+        todo_bot|todo-bot)
+            create_db_if_not_exists "todo_bot"
+            push_schema "@manacore/telegram-todo-bot" "todo-bot"
+            ;;
+        nutriphi_bot|nutriphi-bot)
+            create_db_if_not_exists "nutriphi_bot"
+            push_schema "@manacore/telegram-nutriphi-bot" "nutriphi-bot"
+            ;;
         *)
             echo -e "${RED}Unknown service: $service${NC}"
-            echo "Available services: auth, chat, zitare, contacts, calendar, clock, todo, manadeck, mail, moodlit, finance, voxel-lava, figgos, planta, nutriphi, presi, storage"
+            echo "Available services: auth, chat, zitare, contacts, calendar, clock, todo, manadeck, mail, moodlit, finance, voxel-lava, figgos, planta, nutriphi, presi, storage, projectdoc, zitare_bot, todo_bot, nutriphi_bot"
             exit 1
             ;;
     esac
diff --git a/services/mana-core-auth/.gitignore b/services/mana-core-auth/.gitignore
index b0cd87a43..c2d49e8dd 100644
--- a/services/mana-core-auth/.gitignore
+++ b/services/mana-core-auth/.gitignore
@@ -45,3 +45,6 @@ coverage/
 .cache/
 tmp/
 temp/
+
+# DuckDB local data
+data/
diff --git a/services/mana-core-auth/package.json b/services/mana-core-auth/package.json
index f7abcd962..fa36e8994 100644
--- a/services/mana-core-auth/package.json
+++ b/services/mana-core-auth/package.json
@@ -50,7 +50,8 @@
 		"rxjs": "^7.8.1",
 		"stripe": "^17.5.0",
 		"winston": "^3.17.0",
-		"zod": "^3.24.1"
+		"zod": "^3.24.1",
+		"duckdb-async": "^1.1.1"
 	},
 	"devDependencies": {
 		"@nestjs/cli": "^11.0.0",
diff --git a/services/mana-core-auth/src/analytics/analytics.controller.ts b/services/mana-core-auth/src/analytics/analytics.controller.ts
new file mode 100644
index 000000000..b2a02a197
--- /dev/null
+++ b/services/mana-core-auth/src/analytics/analytics.controller.ts
@@ -0,0 +1,192 @@
+import {
+	BadRequestException,
+	Controller,
+	Get,
+	HttpCode,
+	HttpStatus,
+	InternalServerErrorException,
+	Post,
+	Query,
+} from '@nestjs/common';
+import { AnalyticsService } from './analytics.service';
+
+/** Upper bounds for the look-back query parameters. */
+const MAX_DAYS = 3650;
+const MAX_MONTHS = 120;
+
+const ISO_DATE = /^\d{4}-\d{2}-\d{2}$/;
+
+/**
+ * Parse an optional positive-integer query parameter.
+ * Returns `fallback` when the parameter is absent; anything that is not a
+ * plain integer between 1 and `max` is rejected with HTTP 400.
+ */
+function parseLookback(
+	raw: string | undefined,
+	name: string,
+	fallback: number,
+	max: number
+): number {
+	if (raw === undefined || raw === '') return fallback;
+	const value = /^\d+$/.test(raw) ? Number(raw) : NaN;
+	if (!(value >= 1 && value <= max)) {
+		throw new BadRequestException(`${name} must be an integer between 1 and ${max}`);
+	}
+	return value;
+}
+
+/** True for a syntactically valid, parseable YYYY-MM-DD string. */
+function isIsoDate(value: unknown): value is string {
+	return typeof value === 'string' && ISO_DATE.test(value) && !Number.isNaN(Date.parse(value));
+}
+
+/**
+ * HTTP API over the DuckDB business metrics.
+ *
+ * NOTE(review): no guard is declared on this controller - confirm that
+ * authentication is enforced elsewhere before exposing these routes.
+ */
+@Controller('analytics')
+export class AnalyticsController {
+	constructor(private readonly analyticsService: AnalyticsService) {}
+
+	/**
+	 * Health check endpoint
+	 */
+	@Get('health')
+	async getHealth() {
+		return this.analyticsService.getHealth();
+	}
+
+	/**
+	 * Get latest metrics snapshot
+	 */
+	@Get('latest')
+	async getLatest() {
+		const metrics = await this.analyticsService.getLatestMetrics();
+		if (!metrics) {
+			return { message: 'No metrics recorded yet' };
+		}
+		return metrics;
+	}
+
+	/**
+	 * Get user growth data
+	 * @param days Number of days to look back (default: 90, max: 3650)
+	 */
+	@Get('growth')
+	async getGrowth(@Query('days') days?: string) {
+		return this.analyticsService.getUserGrowth(parseLookback(days, 'days', 90, MAX_DAYS));
+	}
+
+	/**
+	 * Get monthly aggregated metrics
+	 * @param months Number of months to look back (default: 12, max: 120)
+	 */
+	@Get('monthly')
+	async getMonthly(@Query('months') months?: string) {
+		const numMonths = parseLookback(months, 'months', 12, MAX_MONTHS);
+		return this.analyticsService.getMonthlyMetrics(numMonths);
+	}
+
+	/**
+	 * Get metrics for a date range
+	 * @param start Start date (YYYY-MM-DD)
+	 * @param end End date (YYYY-MM-DD)
+	 * @throws BadRequestException when a date is missing or malformed
+	 */
+	@Get('range')
+	async getRange(@Query('start') start: string, @Query('end') end: string) {
+		if (!isIsoDate(start) || !isIsoDate(end)) {
+			throw new BadRequestException(
+				'Both start and end dates are required (YYYY-MM-DD format)'
+			);
+		}
+		return this.analyticsService.getMetricsRange(start, end);
+	}
+
+	/**
+	 * Trigger manual snapshot (for testing/recovery)
+	 */
+	@Post('snapshot')
+	async triggerSnapshot() {
+		// recordDailySnapshot() logs its own errors and reports the outcome
+		const recorded = await this.analyticsService.recordDailySnapshot();
+		if (!recorded) {
+			throw new InternalServerErrorException('Failed to record snapshot');
+		}
+		return { message: 'Snapshot recorded successfully' };
+	}
+
+	/**
+	 * Grafana JSON API compatible endpoint - query
+	 * Used by Grafana Infinity datasource
+	 *
+	 * NOTE(review): only total_users and daily_growth are served here, while
+	 * grafana/search advertises more targets.
+	 */
+	@Post('grafana/query')
+	@HttpCode(HttpStatus.OK)
+	async grafanaQuery() {
+		const growth = await this.analyticsService.getUserGrowth(30);
+
+		return [
+			{
+				target: 'total_users',
+				datapoints: growth.map((g) => [g.total_users, new Date(g.date).getTime()]),
+			},
+			{
+				target: 'daily_growth',
+				datapoints: growth.map((g) => [g.growth ?? 0, new Date(g.date).getTime()]),
+			},
+		];
+	}
+
+	/**
+	 * Grafana JSON API compatible endpoint - search
+	 * Returns available metrics
+	 */
+	@Post('grafana/search')
+	@HttpCode(HttpStatus.OK)
+	async grafanaSearch() {
+		return [
+			'total_users',
+			'verified_users',
+			'new_users_today',
+			'new_users_week',
+			'new_users_month',
+			'daily_growth',
+		];
+	}
+
+	/**
+	 * Summary endpoint for dashboards
+	 */
+	@Get('summary')
+	async getSummary() {
+		const latest = await this.analyticsService.getLatestMetrics();
+		const monthly = await this.analyticsService.getMonthlyMetrics(2);
+		const health = await this.analyticsService.getHealth();
+
+		const currentMonth = monthly[monthly.length - 1];
+		const previousMonth = monthly[monthly.length - 2];
+
+		// Growth is undefined without a previous month or with a zero baseline
+		let monthOverMonthGrowth: number | null = null;
+		if (currentMonth && previousMonth && previousMonth.total_users_eom > 0) {
+			monthOverMonthGrowth =
+				((currentMonth.total_users_eom - previousMonth.total_users_eom) /
+					previousMonth.total_users_eom) *
+				100;
+		}
+
+		return {
+			current: latest,
+			trends: {
+				month_over_month_growth: monthOverMonthGrowth,
+				new_users_this_month: currentMonth?.new_users ?? 0,
+			},
+			health,
+		};
+	}
+}
diff --git a/services/mana-core-auth/src/analytics/analytics.module.ts b/services/mana-core-auth/src/analytics/analytics.module.ts
new file mode 100644
index 000000000..a8e27f641
--- /dev/null
+++ b/services/mana-core-auth/src/analytics/analytics.module.ts
@@ -0,0 +1,12 @@
+import { Module } from '@nestjs/common';
+import { ScheduleModule } from '@nestjs/schedule';
+import { AnalyticsService } from './analytics.service';
+import { AnalyticsController } from './analytics.controller';
+
+@Module({
+	imports: [ScheduleModule.forRoot()],
+	controllers: [AnalyticsController],
+	providers: [AnalyticsService],
+	exports: [AnalyticsService],
+})
+export class AnalyticsModule {}
diff --git a/services/mana-core-auth/src/analytics/analytics.service.ts b/services/mana-core-auth/src/analytics/analytics.service.ts
new file mode 100644
index 000000000..b0a99ae7e
--- /dev/null
+++ b/services/mana-core-auth/src/analytics/analytics.service.ts
@@ -0,0 +1,380 @@
+import { Injectable, Logger, OnModuleInit, OnModuleDestroy } from '@nestjs/common';
+import { ConfigService } from '@nestjs/config';
+import { Cron, CronExpression } from '@nestjs/schedule';
+import { Database } from 'duckdb-async';
+import { sql } from 'drizzle-orm';
+import { getDb } from '../db/connection';
+import * as fs from 'fs';
+import * as path from 'path';
+
+export interface DailyMetrics {
+	date: string;
+	total_users: number;
+	verified_users: number;
+	new_users_today: number;
+	new_users_week: number;
+	new_users_month: number;
+	total_db_size_bytes: number | null;
+	recorded_at: string;
+}
+
+export interface GrowthData {
+	date: string;
+	total_users: number;
+	growth: number | null;
+	growth_percent: number | null;
+}
+
+export interface MonthlyMetrics {
+	month: string;
+	total_users_eom: number;
+	new_users: number;
+	growth_percent: number | null;
+}
+
+/** Upper bounds for look-back windows that end up inside SQL text. */
+const MAX_LOOKBACK_DAYS = 3650;
+const MAX_LOOKBACK_MONTHS = 120;
+
+/**
+ * Coerce a look-back value to an integer in [1, max].
+ * Non-finite input (NaN, Infinity) yields `fallback`, so only a bounded
+ * integer is ever interpolated into a query string.
+ */
+function toLookback(value: number, fallback: number, max: number): number {
+	if (!Number.isFinite(value)) return fallback;
+	return Math.min(Math.max(Math.trunc(value), 1), max);
+}
+
+/**
+ * Keeps one row of user/database KPIs per day in a local DuckDB file and
+ * serves aggregated views of that history. The counts themselves are read
+ * from PostgreSQL (`auth.users`).
+ */
+@Injectable()
+export class AnalyticsService implements OnModuleInit, OnModuleDestroy {
+	private readonly logger = new Logger(AnalyticsService.name);
+	private duckdb: Database | null = null;
+	private readonly dbPath: string;
+	private readonly databaseUrl: string;
+
+	constructor(private readonly configService: ConfigService) {
+		this.dbPath = this.configService.get<string>('DUCKDB_PATH', './data/metrics.duckdb');
+		this.databaseUrl = this.configService.get<string>('DATABASE_URL', '');
+	}
+
+	/** Open (or create) the DuckDB file and record a first snapshot if it is empty. */
+	async onModuleInit(): Promise<void> {
+		try {
+			// Ensure the directory exists
+			const dbDir = path.dirname(this.dbPath);
+			if (!fs.existsSync(dbDir)) {
+				fs.mkdirSync(dbDir, { recursive: true });
+				this.logger.log(`Created DuckDB directory: ${dbDir}`);
+			}
+
+			this.duckdb = await Database.create(this.dbPath);
+			await this.initializeSchema();
+			this.logger.log(`DuckDB initialized at ${this.dbPath}`);
+
+			// Record initial snapshot if database is empty
+			const count = await this.getRecordCount();
+			if (count === 0) {
+				this.logger.log('No existing records found, recording initial snapshot...');
+				await this.recordDailySnapshot();
+			}
+		} catch (error) {
+			// Logged, not rethrown: a broken analytics store must not stop the app
+			this.logger.error('Failed to initialize DuckDB', error);
+		}
+	}
+
+	/** Close the DuckDB handle on shutdown. */
+	async onModuleDestroy(): Promise<void> {
+		if (this.duckdb) {
+			await this.duckdb.close();
+			this.duckdb = null;
+			this.logger.log('DuckDB connection closed');
+		}
+	}
+
+	private async initializeSchema(): Promise<void> {
+		if (!this.duckdb) return;
+
+		await this.duckdb.run(`
+			CREATE TABLE IF NOT EXISTS daily_metrics (
+				date DATE PRIMARY KEY,
+				total_users INTEGER NOT NULL,
+				verified_users INTEGER NOT NULL,
+				new_users_today INTEGER NOT NULL,
+				new_users_week INTEGER NOT NULL,
+				new_users_month INTEGER NOT NULL,
+				total_db_size_bytes BIGINT,
+				recorded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+			)
+		`);
+
+		this.logger.log('DuckDB schema initialized');
+	}
+
+	private async getRecordCount(): Promise<number> {
+		if (!this.duckdb) return 0;
+		const result = await this.duckdb.all('SELECT COUNT(*) as count FROM daily_metrics');
+		return Number(result[0]?.count ?? 0);
+	}
+
+	/**
+	 * Record daily snapshot - runs at midnight UTC.
+	 * Never throws: failures are logged and reported through the result.
+	 * @returns true when the snapshot row was written
+	 */
+	@Cron(CronExpression.EVERY_DAY_AT_MIDNIGHT, { timeZone: 'UTC' })
+	async recordDailySnapshot(): Promise<boolean> {
+		if (!this.duckdb) {
+			this.logger.warn('DuckDB not initialized, skipping snapshot');
+			return false;
+		}
+
+		try {
+			// UTC date, matching the UTC schedule above
+			const today = new Date().toISOString().split('T')[0];
+
+			// Get user counts from PostgreSQL
+			const [totalUsers, verifiedUsers, newToday, newWeek, newMonth, dbSize] = await Promise.all([
+				this.countTotalUsers(),
+				this.countVerifiedUsers(),
+				this.countUsersCreatedSince(1),
+				this.countUsersCreatedSince(7),
+				this.countUsersCreatedSince(30),
+				this.getDatabaseSize(),
+			]);
+
+			// Insert or replace in DuckDB
+			await this.duckdb.run(
+				`
+				INSERT OR REPLACE INTO daily_metrics
+				(date, total_users, verified_users, new_users_today, new_users_week, new_users_month, total_db_size_bytes, recorded_at)
+				VALUES (?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
+			`,
+				today,
+				totalUsers,
+				verifiedUsers,
+				newToday,
+				newWeek,
+				newMonth,
+				dbSize
+			);
+
+			this.logger.log(`Daily snapshot recorded for ${today}: ${totalUsers} total users`);
+			return true;
+		} catch (error) {
+			this.logger.error('Failed to record daily snapshot', error);
+			return false;
+		}
+	}
+
+	/**
+	 * Get user growth over time
+	 * @param days Look-back window; coerced to an integer in [1, 3650]
+	 */
+	async getUserGrowth(days: number = 90): Promise<GrowthData[]> {
+		if (!this.duckdb) return [];
+
+		const lookbackDays = toLookback(days, 90, MAX_LOOKBACK_DAYS);
+		const result = await this.duckdb.all(
+			`
+			SELECT
+				date::VARCHAR as date,
+				total_users,
+				total_users - LAG(total_users) OVER (ORDER BY date) as growth,
+				ROUND(((total_users::FLOAT - LAG(total_users) OVER (ORDER BY date)) /
+					NULLIF(LAG(total_users) OVER (ORDER BY date), 0)) * 100, 2) as growth_percent
+			FROM daily_metrics
+			WHERE date > CURRENT_DATE - INTERVAL '${lookbackDays} days'
+			ORDER BY date
+		`
+		);
+
+		return result as GrowthData[];
+	}
+
+	/**
+	 * Get monthly aggregated metrics
+	 * @param months Look-back window; coerced to an integer in [1, 120]
+	 */
+	async getMonthlyMetrics(months: number = 12): Promise<MonthlyMetrics[]> {
+		if (!this.duckdb) return [];
+
+		const lookbackMonths = toLookback(months, 12, MAX_LOOKBACK_MONTHS);
+		const result = await this.duckdb.all(
+			`
+			SELECT
+				strftime(date_trunc('month', date), '%Y-%m') as month,
+				MAX(total_users)::INTEGER as total_users_eom,
+				SUM(new_users_today)::INTEGER as new_users,
+				ROUND(((MAX(total_users)::FLOAT - MIN(total_users)) /
+					NULLIF(MIN(total_users), 0)) * 100, 2) as growth_percent
+			FROM daily_metrics
+			WHERE date > CURRENT_DATE - INTERVAL '${lookbackMonths} months'
+			GROUP BY date_trunc('month', date)
+			ORDER BY month
+		`
+		);
+
+		return result as MonthlyMetrics[];
+	}
+
+	/**
+	 * Get latest metrics
+	 */
+	async getLatestMetrics(): Promise<DailyMetrics | null> {
+		if (!this.duckdb) return null;
+
+		// total_db_size_bytes is cast to DOUBLE: DuckDB's INTEGER is 32-bit and
+		// the cast fails above 2 GiB, while DOUBLE stays exact up to 2^53
+		const result = await this.duckdb.all(`
+			SELECT
+				date::VARCHAR as date,
+				total_users,
+				verified_users,
+				new_users_today,
+				new_users_week,
+				new_users_month,
+				total_db_size_bytes::DOUBLE as total_db_size_bytes,
+				recorded_at::VARCHAR as recorded_at
+			FROM daily_metrics
+			ORDER BY date DESC
+			LIMIT 1
+		`);
+
+		return (result[0] as DailyMetrics) ?? null;
+	}
+
+	/**
+	 * Get all metrics for a date range
+	 * @param startDate Inclusive start (YYYY-MM-DD)
+	 * @param endDate Inclusive end (YYYY-MM-DD)
+	 */
+	async getMetricsRange(startDate: string, endDate: string): Promise<DailyMetrics[]> {
+		if (!this.duckdb) return [];
+
+		const result = await this.duckdb.all(
+			`
+			SELECT
+				date::VARCHAR as date,
+				total_users,
+				verified_users,
+				new_users_today,
+				new_users_week,
+				new_users_month,
+				total_db_size_bytes::DOUBLE as total_db_size_bytes,
+				recorded_at::VARCHAR as recorded_at
+			FROM daily_metrics
+			WHERE date BETWEEN ? AND ?
+			ORDER BY date
+		`,
+			startDate,
+			endDate
+		);
+
+		return result as DailyMetrics[];
+	}
+
+	/**
+	 * Health check for the analytics service.
+	 * Reports `unhealthy` instead of throwing when DuckDB is missing or a
+	 * query against it fails.
+	 */
+	async getHealth(): Promise<{
+		status: string;
+		database_path: string;
+		database_size_bytes: number | null;
+		total_records: number;
+		latest_snapshot: string | null;
+	}> {
+		let healthy = this.duckdb !== null;
+		let recordCount = 0;
+		let latest: DailyMetrics | null = null;
+
+		try {
+			recordCount = await this.getRecordCount();
+			latest = await this.getLatestMetrics();
+		} catch (error) {
+			healthy = false;
+			this.logger.error('DuckDB health query failed', error);
+		}
+
+		return {
+			status: healthy ? 'healthy' : 'unhealthy',
+			database_path: this.dbPath,
+			database_size_bytes: null, // DuckDB doesn't expose this easily
+			total_records: recordCount,
+			latest_snapshot: latest?.date ?? null,
+		};
+	}
+
+	/**
+	 * Export metrics to Parquet format (for archival)
+	 * @throws Error when DuckDB is not initialized
+	 */
+	async exportToParquet(outputPath: string): Promise<void> {
+		if (!this.duckdb) {
+			throw new Error('DuckDB not initialized');
+		}
+
+		// COPY takes the target as a SQL string literal, so escape quotes
+		const target = outputPath.replace(/'/g, "''");
+		await this.duckdb.run(`COPY daily_metrics TO '${target}' (FORMAT PARQUET)`);
+		this.logger.log(`Metrics exported to ${outputPath}`);
+	}
+
+	// ============================================
+	// PostgreSQL Query Helpers
+	// ============================================
+
+	private getPostgresDb() {
+		if (!this.databaseUrl) {
+			throw new Error('DATABASE_URL not configured');
+		}
+		return getDb(this.databaseUrl);
+	}
+
+	private async countTotalUsers(): Promise<number> {
+		const db = this.getPostgresDb();
+		const result = await db.execute(sql`SELECT COUNT(*) as count FROM auth.users`);
+		const row = result[0] as { count: string | number } | undefined;
+		return Number(row?.count ?? 0);
+	}
+
+	private async countVerifiedUsers(): Promise<number> {
+		const db = this.getPostgresDb();
+		const result = await db.execute(
+			sql`SELECT COUNT(*) as count FROM auth.users WHERE email_verified = true`
+		);
+		const row = result[0] as { count: string | number } | undefined;
+		return Number(row?.count ?? 0);
+	}
+
+	private async countUsersCreatedSince(days: number): Promise<number> {
+		const db = this.getPostgresDb();
+		// make_interval() takes the day count as a bound parameter (no sql.raw)
+		const result = await db.execute(
+			sql`SELECT COUNT(*) as count FROM auth.users WHERE created_at > NOW() - make_interval(days => ${days}::int)`
+		);
+		const row = result[0] as { count: string | number } | undefined;
+		return Number(row?.count ?? 0);
+	}
+
+	private async getDatabaseSize(): Promise<number | null> {
+		try {
+			const db = this.getPostgresDb();
+			const result = await db.execute(sql`SELECT pg_database_size(current_database()) as size`);
+			const row = result[0] as { size: string | number } | undefined;
+			return row?.size == null ? null : Number(row.size);
+		} catch (error) {
+			// The size column is nullable; a snapshot is still useful without it
+			this.logger.warn(`Could not read PostgreSQL database size: ${String(error)}`);
+			return null;
+		}
+	}
+}
diff --git a/services/mana-core-auth/src/analytics/index.ts b/services/mana-core-auth/src/analytics/index.ts
new file mode 100644
index 000000000..456948c98
--- /dev/null
+++ b/services/mana-core-auth/src/analytics/index.ts
@@ -0,0 +1,3 @@
+export * from './analytics.module';
+export * from './analytics.service';
+export * from './analytics.controller';
diff --git a/services/mana-core-auth/src/app.module.ts b/services/mana-core-auth/src/app.module.ts
index 15e593264..5466762c5 100644
--- a/services/mana-core-auth/src/app.module.ts
+++ b/services/mana-core-auth/src/app.module.ts
@@ -12,6 +12,7 @@ import { TagsModule } from './tags/tags.module';
 import { AiModule } from './ai/ai.module';
 import { HealthModule } from './health/health.module';
 import { MetricsModule } from './metrics';
+import { AnalyticsModule } from './analytics';
 import { HttpExceptionFilter } from './common/filters/http-exception.filter';
 
 @Module({
@@ -27,6 +28,7 @@ import { HttpExceptionFilter } from './common/filters/http-exception.filter';
 			},
 		]),
 		MetricsModule,
+		AnalyticsModule,
 		AiModule,
 		AuthModule,
 		CreditsModule,