managarten/scripts/mac-mini/health-check.sh
Till JS b1b9bbc269
Some checks are pending
CD Mac Mini / Detect Changes (push) Waiting to run
CD Mac Mini / Deploy (push) Blocked by required conditions
CI / Detect Changes (push) Waiting to run
CI / Validate (push) Waiting to run
CI / Build mana-search (push) Blocked by required conditions
CI / Build mana-sync (push) Blocked by required conditions
CI / Build mana-api-gateway (push) Blocked by required conditions
CI / Build mana-crawler (push) Blocked by required conditions
Docker Validate / Validate Dockerfiles (push) Waiting to run
Docker Validate / Build calendar-web (push) Blocked by required conditions
Docker Validate / Build quotes-web (push) Blocked by required conditions
Docker Validate / Build todo-backend (push) Blocked by required conditions
Docker Validate / Build todo-web (push) Blocked by required conditions
Docker Validate / Build mana-auth (push) Blocked by required conditions
Docker Validate / Build mana-sync (push) Blocked by required conditions
Docker Validate / Build mana-media (push) Blocked by required conditions
Mirror to Forgejo / Push to Forgejo (push) Waiting to run
chore: rename repo mana-monorepo → managarten
Phase-3-Rename des ehemaligen Multi-App-Monorepos zum eigenständigen
Produkt-Repo. Verein heißt mana e.V., Plattform-Domain bleibt mana.how,
apps/mana/ bleibt unverändert — nur der Repo-Container kriegt den
neuen Namen "managarten" (Garten der mana-Apps).

Geändert:
- package.json#name + #description
- README.md (Titel + erster Absatz)
- TROUBLESHOOTING.md
- alle Mac-Mini-Skripte (Pfade ~/projects/mana-monorepo → ~/projects/managarten)
- COMPOSE_PROJECT_NAME-default in scripts/mac-mini/status.sh
- .github/workflows/cd-macmini.yml + mirror-to-forgejo.yml
- apps/docs (astro.config.mjs + content)
- .claude/settings.local.json (Bash-Permission-Pfade)
- alle docs/*.md Pfad-Referenzen
- launchd plists, .env.macmini.example, infrastructure/

Forgejo-Repo + GitHub-Repo bereits via API umbenannt. Lokales
Verzeichnis-Rename + Mac-Mini-Cutover folgen separat.
2026-05-09 01:16:02 +02:00

373 lines
12 KiB
Bash
Executable file

#!/bin/bash
# Mana Health Check Script
# Checks all services and sends notifications on failure
#
# Notification channels (configure via environment or .env.notifications):
# - Telegram: TELEGRAM_BOT_TOKEN + TELEGRAM_CHAT_ID
# - Email: EMAIL_TO + EMAIL_FROM + SMTP_* settings
# - ntfy: NTFY_TOPIC
# Ensure PATH includes docker
export PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# Load notification config if exists
if [ -f "$PROJECT_ROOT/.env.notifications" ]; then
source "$PROJECT_ROOT/.env.notifications"
fi
# Colors for terminal output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
# Track failures
FAILURES=()
# ============================================
# Notification Functions
# ============================================
send_telegram() {
local message="$1"
if [ -z "$TELEGRAM_BOT_TOKEN" ] || [ -z "$TELEGRAM_CHAT_ID" ]; then
return 0
fi
curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
-d "chat_id=${TELEGRAM_CHAT_ID}" \
-d "text=${message}" \
-d "parse_mode=HTML" \
>/dev/null 2>&1
if [ $? -eq 0 ]; then
echo " [Telegram] Notification sent"
else
echo " [Telegram] Failed to send"
fi
}
send_email() {
local subject="$1"
local body="$2"
if [ -z "$EMAIL_TO" ]; then
return 0
fi
# Use msmtp if available, otherwise try mail command
if command -v msmtp &> /dev/null; then
echo -e "Subject: ${subject}\nFrom: ${EMAIL_FROM:-mana@localhost}\nTo: ${EMAIL_TO}\n\n${body}" | \
msmtp -a default "$EMAIL_TO" 2>/dev/null
elif command -v mail &> /dev/null; then
echo "$body" | mail -s "$subject" "$EMAIL_TO" 2>/dev/null
elif command -v sendmail &> /dev/null; then
echo -e "Subject: ${subject}\nFrom: ${EMAIL_FROM:-mana@localhost}\nTo: ${EMAIL_TO}\n\n${body}" | \
sendmail "$EMAIL_TO" 2>/dev/null
else
echo " [Email] No mail client available (install msmtp)"
return 1
fi
if [ $? -eq 0 ]; then
echo " [Email] Notification sent to $EMAIL_TO"
else
echo " [Email] Failed to send"
fi
}
send_ntfy() {
local message="$1"
if [ -z "$NTFY_TOPIC" ]; then
return 0
fi
curl -s -d "$message" \
-H "Title: Mac Mini Alert" \
-H "Priority: high" \
-H "Tags: warning" \
"https://ntfy.sh/$NTFY_TOPIC" >/dev/null 2>&1
if [ $? -eq 0 ]; then
echo " [ntfy] Notification sent"
else
echo " [ntfy] Failed to send"
fi
}
send_all_notifications() {
local failed_services="$1"
local timestamp=$(date '+%Y-%m-%d %H:%M:%S')
# Telegram message (HTML format)
local telegram_msg="🚨 <b>Mana Health Check Failed</b>
<b>Time:</b> ${timestamp}
<b>Host:</b> $(hostname)
<b>Failed Services:</b>
${failed_services}
Check logs: <code>ssh mac-mini</code>"
# Email message
local email_subject="[ALERT] Mana Health Check Failed"
local email_body="Mana Health Check Failed
=============================
Time: ${timestamp}
Host: $(hostname)
Failed Services:
${failed_services}
To investigate:
ssh mac-mini
cd ~/projects/managarten
./scripts/mac-mini/status.sh
docker logs <container-name>"
# Plain text for ntfy
local ntfy_msg="Mana Failed: ${failed_services}"
echo ""
echo "Sending notifications..."
send_telegram "$telegram_msg"
send_email "$email_subject" "$email_body"
send_ntfy "$ntfy_msg"
}
# ============================================
# Health Check Functions
# ============================================
check_service() {
local name=$1
local url=$2
local timeout=${3:-5}
local status=$(curl -s -o /dev/null -w "%{http_code}" --max-time "$timeout" "$url" 2>/dev/null)
if [ "$status" = "200" ]; then
echo -e " ${GREEN}[OK]${NC} $name"
return 0
else
echo -e " ${RED}[FAIL]${NC} $name (HTTP $status)"
FAILURES+=("$name")
return 1
fi
}
# ============================================
# Main Health Check
# ============================================
echo ""
echo "=== Mana Health Check ==="
echo "Time: $(date)"
echo ""
echo "Infrastructure:"
# Check postgres via docker
if docker exec mana-infra-postgres pg_isready -U postgres >/dev/null 2>&1; then
echo -e " ${GREEN}[OK]${NC} PostgreSQL"
else
echo -e " ${RED}[FAIL]${NC} PostgreSQL"
FAILURES+=("PostgreSQL")
fi
# Check redis via docker
if docker exec mana-infra-redis redis-cli ping >/dev/null 2>&1; then
echo -e " ${GREEN}[OK]${NC} Redis"
else
echo -e " ${RED}[FAIL]${NC} Redis"
FAILURES+=("Redis")
fi
# Check for stuck containers (Created/Exited status). The only exclusion
# is *-init containers, which are one-shot init pods by design — Exit 0
# is success and the container intentionally never re-runs.
STUCK_CONTAINERS=$(docker ps -a --filter "status=created" --filter "status=exited" --format "{{.Names}}" \
| grep "^mana-" \
| grep -vE -- "-init$" \
|| true)
if [ -n "$STUCK_CONTAINERS" ]; then
echo -e " ${RED}[FAIL]${NC} Stuck containers detected:"
echo "$STUCK_CONTAINERS" | while read c; do echo " - $c"; done
FAILURES+=("Stuck containers: $(echo $STUCK_CONTAINERS | tr '\n' ' ')")
fi
echo ""
echo "Local services (auth + unified web):"
check_service "Auth API" "http://localhost:3001/health"
check_service "Unified Web" "http://localhost:5000/health"
# ────────────────────────────────────────────────────────────
# Public hostnames via Cloudflare Tunnel
# ────────────────────────────────────────────────────────────
# Walk every `hostname:` entry in cloudflared's ingress config and
# probe the live public URL. This catches the failure modes that the
# old port-by-port probes missed:
#
# - Tunnel ingress points at a port that no container listens on (502)
# - DNS CNAME for a tunnel hostname is missing (530 / NXDOMAIN)
# - Cloudflared is misrouted or stale on a different config
# - The container is healthy on LAN but unreachable from the public side
#
# A 200/204/301/302/308 from the public hostname is OK; anything else
# (including 404/502/530/timeout) is reported as a failure.
#
# Hostnames matching the patterns in TUNNEL_INGRESS_SKIP are excluded
# (typically internal-only routes or things we know don't expose
# /health, like raw IMAP / SSH ingress).
CLOUDFLARED_CONFIG="${HOME}/.cloudflared/config.yml"
TUNNEL_INGRESS_SKIP_REGEX='^(ssh|smtp|imap|pop3)\.'
check_public_hostname() {
local host=$1
local timeout=${2:-8}
# Resolve via Cloudflare's public DNS (1.1.1.1) instead of the local
# resolver. The Mac Mini's home-router DNS keeps a negative cache
# for hostnames that didn't exist when first queried — newly added
# CNAMEs like the 2026-04-07 sync/media records take hours to clear
# there, even though they resolve fine for external users. Asking
# 1.1.1.1 directly gives us the same view the public internet has.
local ip
ip=$(dig +short "$host" @1.1.1.1 2>/dev/null | head -1)
if [ -z "$ip" ]; then
echo -e " ${RED}[FAIL]${NC} ${host} (no DNS record on Cloudflare zone)"
FAILURES+=("${host} (no DNS)")
return 1
fi
# Try /health, accept anything <500. We use --resolve to bypass any
# local DNS cache and pin the lookup to the IP we just got back.
local status
status=$(curl -sk -o /dev/null -w "%{http_code}" --max-time "$timeout" \
--resolve "${host}:443:${ip}" \
"https://${host}/health" 2>/dev/null)
if [ -z "$status" ] || [ "$status" = "000" ]; then
echo -e " ${RED}[FAIL]${NC} ${host} (no response — tunnel timeout?)"
FAILURES+=("${host} (no response)")
return 1
fi
case "$status" in
2*|3*|404)
# 404 is OK for hostnames whose backend has no /health route
# but the tunnel + DNS are working.
echo -e " ${GREEN}[OK]${NC} ${host} (HTTP ${status})"
return 0
;;
5*)
echo -e " ${RED}[FAIL]${NC} ${host} (HTTP ${status} — origin / tunnel)"
FAILURES+=("${host} (HTTP ${status})")
return 1
;;
*)
echo -e " ${YELLOW}[WARN]${NC} ${host} (HTTP ${status})"
return 0
;;
esac
}
if [ -f "$CLOUDFLARED_CONFIG" ]; then
echo ""
echo "Public hostnames (Cloudflare Tunnel ingress):"
HOSTNAMES=$(awk '/^[[:space:]]*-[[:space:]]*hostname:/{print $3}' "$CLOUDFLARED_CONFIG" \
| grep -vE "$TUNNEL_INGRESS_SKIP_REGEX" \
| sort -u)
for host in $HOSTNAMES; do
check_public_hostname "$host"
done
else
echo ""
echo -e " ${YELLOW}[SKIP]${NC} cloudflared config not found at $CLOUDFLARED_CONFIG"
fi
echo ""
echo "GPU Server (192.168.178.11, LAN-only probe):"
# Direct LAN check — catches GPU box being down even when the public
# tunnel hostnames in the ingress walk above happen to time out or
# return cached errors.
check_service "GPU Ollama" "http://192.168.178.11:11434/api/version" 3
check_service "GPU STT" "http://192.168.178.11:3020/health" 3
check_service "GPU TTS" "http://192.168.178.11:3022/health" 3
check_service "GPU Image Gen" "http://192.168.178.11:3023/health" 3
# GPU Video Gen (LTX) is intentionally not probed — it's planned but
# not deployed yet, so its absence is expected and shouldn't page.
echo ""
echo "Monitoring (LAN, not exposed via tunnel):"
check_service "VictoriaMetrics" "http://localhost:9090/health"
echo ""
echo "Alerting:"
check_service "vmalert" "http://localhost:8880/health"
check_service "Alertmanager" "http://localhost:9093/-/healthy"
check_service "Alert Notifier" "http://localhost:9095/health"
echo ""
echo "Disk Space:"
check_disk() {
local name=$1
local path=$2
local warn_pct=${3:-80}
local crit_pct=${4:-90}
if [ ! -d "$path" ]; then
echo -e " ${YELLOW}[SKIP]${NC} $name ($path not found)"
return 0
fi
local usage_pct=$(df "$path" | tail -1 | awk '{gsub(/%/,""); print $5}')
local avail=$(df -h "$path" | tail -1 | awk '{print $4}')
if [ "$usage_pct" -ge "$crit_pct" ]; then
echo -e " ${RED}[CRIT]${NC} $name: ${usage_pct}% used ($avail free)"
FAILURES+=("Disk $name: ${usage_pct}% (critical)")
elif [ "$usage_pct" -ge "$warn_pct" ]; then
echo -e " ${YELLOW}[WARN]${NC} $name: ${usage_pct}% used ($avail free)"
FAILURES+=("Disk $name: ${usage_pct}% (warning)")
else
echo -e " ${GREEN}[OK]${NC} $name: ${usage_pct}% used ($avail free)"
fi
}
check_disk "System (/)" "/"
check_disk "ManaData" "/Volumes/ManaData"
echo ""
echo "Cloudflare Tunnel:"
if pgrep -x "cloudflared" >/dev/null; then
echo -e " ${GREEN}[OK]${NC} cloudflared running"
else
echo -e " ${RED}[FAIL]${NC} cloudflared not running"
FAILURES+=("cloudflared")
fi
echo ""
echo "=== Summary ==="
if [ ${#FAILURES[@]} -eq 0 ]; then
echo -e "${GREEN}All services healthy!${NC}"
exit 0
else
echo -e "${RED}Failed services (${#FAILURES[@]}):${NC}"
FAILED_LIST=""
for f in "${FAILURES[@]}"; do
echo " - $f"
FAILED_LIST="${FAILED_LIST}- ${f}\n"
done
# Send notifications
send_all_notifications "$(echo -e "$FAILED_LIST")"
exit 1
fi