This commit bundles two unrelated changes that were swept together by an
accidental `git add -A` in another working session. Documented here so the
history reflects what's actually inside.
═══════════════════════════════════════════════════════════════════════
1. fix(mana-auth): /api/v1/auth/login mints JWT via auth.handler instead
of api.signInEmail
═══════════════════════════════════════════════════════════════════════
Previous attempt (commit 55cc75e7d) tried to fix the broken JWT mint in
/api/v1/auth/login by switching the cookie name from `mana.session_token`
to `__Secure-mana.session_token` in production. That was necessary but
not sufficient: Better Auth's session cookie value isn't just the raw
session token; it's `<token>.<HMAC>`, where the HMAC is derived from the
better-auth secret. Reconstructing the cookie from auth.api.signInEmail's
JSON response only gave us the raw token, so /api/auth/token's
get-session middleware still couldn't validate it and the JWT mint kept
failing silently.
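For intuition, the envelope looks roughly like this (a hedged sketch
only; the exact digest and encoding are Better Auth internals, and
signedCookieValue is a hypothetical name, not our code):

  import { createHmac } from "node:crypto";

  // assumed shape only: HMAC over the raw session token, keyed with the
  // better-auth secret; get-session rejects cookies without a valid suffix
  function signedCookieValue(sessionToken: string, secret: string): string {
    const mac = createHmac("sha256", secret)
      .update(sessionToken)
      .digest("base64url");
    return `${sessionToken}.${mac}`;
  }

A raw token with no `.<HMAC>` suffix can never pass that check, no
matter which cookie name it's sent under.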
Real fix: do the sign-in via auth.handler (the HTTP path) rather than
auth.api.signInEmail (the SDK path). The handler returns a real fetch
Response with a Set-Cookie header containing the fully signed cookie
envelope. We capture that header verbatim and forward it as the cookie
on the /api/auth/token request, which now passes validation and mints
the JWT correctly.
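Roughly, in code (a sketch of the approach, not the verbatim diff: the
internal base URL, clientIp, and emailNotVerifiedError are illustrative,
and the route paths assume Better Auth's default /api/auth basePath):

  // sign in over the HTTP path so we get a real Response with Set-Cookie
  const signInResponse = await auth.handler(
    new Request("http://internal/api/auth/sign-in/email", {
      method: "POST",
      headers: {
        "content-type": "application/json",
        // forward the real client IP for the rate limiter + security log
        "x-forwarded-for": clientIp,
      },
      body: JSON.stringify({ email, password }),
    }),
  );

  // email-not-verified is now a plain status check (no APIError catching)
  if (signInResponse.status === 403) return emailNotVerifiedError();

  // the fully signed cookie envelope, captured verbatim
  const cookie = signInResponse.headers.get("set-cookie") ?? "";

  // mint the JWT with a cookie that actually validates
  const tokenResponse = await auth.handler(
    new Request("http://internal/api/auth/token", {
      headers: { cookie },
    }),
  );
  const { token: accessToken } = await tokenResponse.json();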
Verified end-to-end on auth.mana.how:
$ curl -X POST https://auth.mana.how/api/v1/auth/login \
-d '{"email":"...","password":"..."}'
{
"user": {...},
"token": "<session token>",
"accessToken": "eyJhbGciOiJFZERTQSI...", ← real JWT now
"refreshToken": "<session token>"
}
Side benefits:
- The email-not-verified path is now handled by checking
  signInResponse.status === 403 directly (see the sketch above), instead
  of catching APIError with the async-stream footgun the old comment
  warned about.
- X-Forwarded-For is forwarded explicitly so Better Auth's rate limiter
  and our security log see the real client IP.
- The leftover catch block now only handles unexpected exceptions
  (network errors, etc.); the FORBIDDEN-checking logic inside it is dead
  but harmless, left in as defense in depth.
═══════════════════════════════════════════════════════════════════════
2. chore: remove the entire self-hosted Matrix stack (Synapse, Element,
Manalink, mana-matrix-bot)
═══════════════════════════════════════════════════════════════════════
The Matrix subsystem ran parallel to the main Mana product without any
load-bearing integration: the unified web app never imported matrix-js-sdk,
the chat module uses mana-sync (local-first), and mana-matrix-bot's
plugins duplicated features the unified app already ships natively.
Keeping it alive cost a Synapse + Element + matrix-web + bot container
quartet, three Cloudflare routes, an OIDC provider plugin in mana-auth,
and a steady drip of devlog/dependency churn.
Removed:
- apps/matrix (Manalink web + mobile, ~150 files)
- services/mana-matrix-bot (Go bot with ~20 plugins)
- docker/matrix configs (Synapse + Element)
- synapse/element-web/matrix-web/mana-matrix-bot services in
docker-compose.macmini.yml
- matrix.mana.how/element.mana.how/link.mana.how Cloudflare tunnel routes
- OIDC provider plugin + matrix-synapse trustedClient + matrixUserLinks
table from mana-auth (oauth_* schema definitions also removed)
- MatrixService import path in mana-media (importFromMatrix endpoint)
- Matrix notification channel in mana-notify (worker, metrics, config,
channel_type enum, MatrixOptions handler)
- Matrix entries from shared-branding (mana-apps + app-icons),
notify-client, the i18n bundle, the observatory map, the credits
app-label list, the landing footer/apps page, the prometheus + alerts
+ promtail tier mappings, and the matrix-related deploy paths in
cd-macmini.yml + ci.yml
Devlog/manascore/blueprint entries that mention Matrix are left intact
as historical record. The oauth_* + matrix_user_links Postgres tables
stay on the existing prod databases; code can no longer write to them.
Drop them in a follow-up migration (sketched below) if you want them
gone for real.
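A minimal sketch of that follow-up (hypothetical, plain node-postgres;
only matrix_user_links is named in this commit, and the concrete oauth_*
table names live in the removed schema, so they stay unspelled here):

  import { Client } from "pg";

  const db = new Client({ connectionString: process.env.DATABASE_URL });
  await db.connect();
  // repeat for each oauth_* table once its exact name is confirmed
  await db.query("DROP TABLE IF EXISTS matrix_user_links CASCADE;");
  await db.end();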
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
467 lines · 17 KiB · YAML
groups:
  - name: service_alerts
    rules:
      # Service Down Alert
      - alert: ServiceDown
        expr: up{job=~"mana-auth|.*-backend|mana-search|mana-media|mana-llm"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.job }} is down"
          description: "{{ $labels.job }} has been down for more than 1 minute."

      # High Error Rate (> 5% of requests are 5xx)
      - alert: HighErrorRate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m])) by (job)
          / sum(rate(http_requests_total[5m])) by (job) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.job }}"
          description: "{{ $labels.job }} has error rate of {{ $value | humanizePercentage }} (> 5%)"

      # Very High Error Rate (> 20% of requests are 5xx)
      - alert: VeryHighErrorRate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m])) by (job)
          / sum(rate(http_requests_total[5m])) by (job) > 0.20
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Very high error rate on {{ $labels.job }}"
          description: "{{ $labels.job }} has error rate of {{ $value | humanizePercentage }} (> 20%)"

      # Slow Response Time (p95 > 2s)
      - alert: SlowResponseTime
        expr: |
          histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, job)) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Slow response time on {{ $labels.job }}"
          description: "{{ $labels.job }} p95 response time is {{ $value | humanizeDuration }}"

      # Very Slow Response Time (p95 > 5s)
      - alert: VerySlowResponseTime
        expr: |
          histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, job)) > 5
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Very slow response time on {{ $labels.job }}"
          description: "{{ $labels.job }} p95 response time is {{ $value | humanizeDuration }}"

      # High Memory Usage (Node.js heap > 500MB)
      - alert: HighHeapMemory
        expr: nodejs_heap_size_used_bytes > 500 * 1024 * 1024
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High heap memory on {{ $labels.job }}"
          description: "{{ $labels.job }} heap usage is {{ $value | humanize1024 }}B"

      # Event Loop Lag (> 100ms)
      - alert: HighEventLoopLag
        expr: nodejs_eventloop_lag_seconds > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High event loop lag on {{ $labels.job }}"
          description: "{{ $labels.job }} event loop lag is {{ $value | humanizeDuration }}"

  - name: infrastructure_alerts
    rules:
      # High CPU Usage (> 80%)
      - alert: HighCPUUsage
        expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on host"
          description: "CPU usage is {{ $value | humanize }}%"

      # Very High CPU Usage (> 95%)
      - alert: VeryHighCPUUsage
        expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Very high CPU usage on host"
          description: "CPU usage is {{ $value | humanize }}%"

      # High Memory Usage (> 85%)
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on host"
          description: "Memory usage is {{ $value | humanize }}%"

      # Very High Memory Usage (> 95%)
      - alert: VeryHighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Very high memory usage on host"
          description: "Memory usage is {{ $value | humanize }}%"

      # High Disk Usage — macOS host disks (via Pushgateway, since node-exporter runs in VM)
      # Metrics pushed by scripts/mac-mini/disk-metrics.sh (runs every 5 min via launchd)
      - alert: HighDiskUsage
        expr: |
          mac_disk_used_percent{disk=~"internal|manaData"} > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High disk usage on {{ $labels.disk }} ({{ $labels.mountpoint }})"
          description: "Disk usage is {{ $value | humanize }}% — {{ $labels.avail_human }} free"

      # Very High Disk Usage (> 90%) — immediate alert
      - alert: VeryHighDiskUsage
        expr: |
          mac_disk_used_percent{disk=~"internal|manaData"} > 90
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "CRITICAL: Disk {{ $labels.disk }} almost full ({{ $labels.mountpoint }})"
          description: "Disk usage is {{ $value | humanize }}% — only {{ $labels.avail_human }} free. Server may crash."

      # Colima VM disk large (> 150GB actual usage on sparse datadisk)
      - alert: ColimaVMDiskLarge
        expr: |
          mac_colima_disk_used_gb > 150
        for: 30m
        labels:
          severity: warning
        annotations:
          summary: "Colima VM disk is {{ $value | humanize }}GB — consider pruning Docker images"
          description: "Run: docker system prune -f && docker image prune -a"

  - name: database_alerts
    rules:
      # PostgreSQL Down
      - alert: PostgreSQLDown
        expr: pg_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "PostgreSQL is down"
          description: "PostgreSQL has been down for more than 1 minute."

      # Redis Down
      - alert: RedisDown
        expr: redis_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Redis is down"
          description: "Redis has been down for more than 1 minute."

      # PostgreSQL High Connections (> 80)
      - alert: PostgreSQLHighConnections
        expr: sum(pg_stat_activity_count) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High PostgreSQL connections"
          description: "PostgreSQL has {{ $value }} connections (> 80)"

      # PostgreSQL Low Cache Hit Ratio (< 90%)
      - alert: PostgreSQLLowCacheHitRatio
        expr: |
          avg(pg_stat_database_blks_hit{datname!~"template.*|postgres"}
          / (pg_stat_database_blks_hit{datname!~"template.*|postgres"}
          + pg_stat_database_blks_read{datname!~"template.*|postgres"} + 0.0001)) * 100 < 90
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "PostgreSQL low cache hit ratio"
          description: "PostgreSQL cache hit ratio is {{ $value | humanize }}%"

      # Redis High Memory (> 1GB)
      - alert: RedisHighMemory
        expr: redis_memory_used_bytes > 1024 * 1024 * 1024
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Redis high memory usage"
          description: "Redis memory usage is {{ $value | humanize1024 }}B"

      # Redis Blocked Clients
      - alert: RedisBlockedClients
        expr: redis_blocked_clients > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Redis has blocked clients"
          description: "Redis has {{ $value }} blocked clients"

  - name: container_alerts
    rules:
      # Container High CPU (> 80% of limit)
      - alert: ContainerHighCPU
        expr: |
          sum(rate(container_cpu_usage_seconds_total{id=~"/docker/.+"}[5m])) by (name) * 100 > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} high CPU"
          description: "Container {{ $labels.name }} CPU usage is {{ $value | humanize }}%"

      # Container High Memory (> 80% of limit)
      - alert: ContainerHighMemory
        expr: |
          container_memory_usage_bytes{id=~"/docker/.+"}
          / container_spec_memory_limit_bytes{id=~"/docker/.+"} * 100 > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} high memory"
          description: "Container {{ $labels.name }} memory usage is {{ $value | humanize }}%"

      # Container Restart
      - alert: ContainerRestarted
        expr: |
          increase(container_start_time_seconds{id=~"/docker/.+"}[5m]) > 0
        for: 0m
        labels:
          severity: info
        annotations:
          summary: "Container {{ $labels.name }} restarted"
          description: "Container {{ $labels.name }} has restarted."

  - name: auth_service_alerts
    rules:
      # Auth Service Down
      - alert: AuthServiceDown
        expr: up{job="mana-auth"} == 0
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: "Auth Service is down"
          description: "mana-auth has been down for more than 30 seconds. All authentication will fail."

      # High Login Failure Rate (> 50% of logins fail with 401)
      - alert: HighLoginFailureRate
        expr: |
          sum(rate(http_requests_total{job="mana-auth",route="/auth/login",status="401"}[5m]))
          / sum(rate(http_requests_total{job="mana-auth",route="/auth/login"}[5m])) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High login failure rate"
          description: "{{ $value | humanizePercentage }} of login attempts are failing."

      # Rate Limiting Triggered Frequently
      - alert: HighRateLimitHits
        expr: |
          sum(rate(http_requests_total{job="mana-auth",status="429"}[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Frequent rate limiting on Auth Service"
          description: "Rate limit (429) is being hit {{ $value | humanize }} times/second. Possible attack or misconfiguration."

      # Brute Force Detection (> 100 failed logins in 5 min)
      - alert: PossibleBruteForce
        expr: |
          sum(increase(http_requests_total{job="mana-auth",route="/auth/login",status="401"}[5m])) > 100
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Possible brute force attack detected"
          description: "{{ $value | humanize }} failed login attempts in the last 5 minutes."

      # Registration Spike (unusual registration activity)
      - alert: RegistrationSpike
        expr: |
          sum(rate(http_requests_total{job="mana-auth",route="/auth/register",status="201"}[5m])) > 1
        for: 5m
        labels:
          severity: info
        annotations:
          summary: "High registration activity"
          description: "{{ $value | humanize }} registrations per second. Verify this is expected."

      # Token Refresh Failures
      - alert: HighTokenRefreshFailures
        expr: |
          sum(rate(http_requests_total{job="mana-auth",route="/auth/refresh",status=~"4.."}[5m]))
          / sum(rate(http_requests_total{job="mana-auth",route="/auth/refresh"}[5m])) > 0.3
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High token refresh failure rate"
          description: "{{ $value | humanizePercentage }} of token refresh attempts are failing."

      # Password Reset Flood (possible enumeration attack)
      - alert: PasswordResetFlood
        expr: |
          sum(increase(http_requests_total{job="mana-auth",route="/auth/forgot-password"}[5m])) > 50
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "Unusual password reset activity"
          description: "{{ $value | humanize }} password reset requests in the last 5 minutes."

      # Low User Verification Rate (less than 50% verified after 1 week)
      - alert: LowVerificationRate
        expr: |
          auth_users_verified{job="mana-auth"} / auth_users_total{job="mana-auth"} < 0.5
        for: 1h
        labels:
          severity: info
        annotations:
          summary: "Low email verification rate"
          description: "Only {{ $value | humanizePercentage }} of users have verified their email."

      # Auth Service Slow (p95 > 500ms)
      - alert: AuthServiceSlow
        expr: |
          histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job="mana-auth"}[5m])) by (le)) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Auth Service responding slowly"
          description: "Auth service p95 latency is {{ $value | humanizeDuration }}. This may impact all services."

      # OIDC Token Endpoint Errors
      - alert: OIDCTokenErrors
        expr: |
          sum(rate(http_requests_total{job="mana-auth",route=~"/api/auth/oauth2/token|/api/oidc/token",status=~"5.."}[5m])) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "OIDC token endpoint errors"
          description: "OIDC token endpoint is returning 5xx errors. SSO may be affected."
  - name: uptime_alerts
    rules:
      # Web App offline (HTTP probe failed)
      - alert: WebAppDown
        expr: probe_success{job="blackbox-web"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Web App offline: {{ $labels.instance }}"
          description: "{{ $labels.instance }} has not returned a valid HTTP response for 2 minutes."

      # API Health Endpoint offline
      - alert: APIDown
        expr: probe_success{job="blackbox-api"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "API offline: {{ $labels.instance }}"
          description: "{{ $labels.instance }} is not responding on the health endpoint."

      # Infra Tool offline (Grafana, Git, etc.)
      - alert: InfraToolDown
        expr: probe_success{job="blackbox-infra"} == 0
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "Infra service offline: {{ $labels.instance }}"
          description: "{{ $labels.instance }} has been unreachable for 3 minutes."

      # GPU Server Service offline
      - alert: GPUServiceDown
        expr: probe_success{job="blackbox-gpu"} == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "GPU service offline: {{ $labels.instance }}"
          description: "{{ $labels.instance }} (GPU server) has been unreachable for 5 minutes."

      # Slow HTTP response (> 5s)
      - alert: SlowHTTPResponse
        expr: probe_duration_seconds{job=~"blackbox-web|blackbox-api"} > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Slow HTTP response: {{ $labels.instance }}"
          description: "{{ $labels.instance }} is responding in {{ $value | humanizeDuration }} (> 5s)."
  - name: llm_alerts
    rules:
      # mana-llm Down
      - alert: LLMServiceDown
        expr: up{job="mana-llm"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "mana-llm service is down"
          description: "mana-llm has been down for more than 1 minute. All AI features will fail."

      # High LLM Error Rate (> 10%)
      - alert: LLMHighErrorRate
        expr: |
          sum(rate(mana_llm_llm_errors_total[5m]))
          / (sum(rate(mana_llm_llm_requests_total[5m])) + 0.001) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High LLM error rate"
          description: "{{ $value | humanizePercentage }} of LLM requests are failing."

      # Ollama Provider Down (all requests going to fallback)
      - alert: OllamaProviderDown
        expr: |
          sum(rate(mana_llm_llm_requests_total{provider="google"}[5m]))
          / (sum(rate(mana_llm_llm_requests_total[5m])) + 0.001) > 0.5
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Ollama appears down — most requests going to Google fallback"
          description: "{{ $value | humanizePercentage }} of LLM requests are using Google Gemini fallback."

      # LLM Slow Responses (p95 > 30s)
      - alert: LLMSlowResponses
        expr: |
          histogram_quantile(0.95, sum(rate(mana_llm_llm_latency_seconds_bucket[5m])) by (le)) > 30
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "LLM responses are slow"
          description: "LLM p95 latency is {{ $value | humanizeDuration }}."