#!/bin/bash
# ManaCore Health Check Script
# Checks all services and sends notifications on failure
#
# Provenance: mirror of https://github.com/Memo-2023/mana-monorepo.git
# (synced 2026-05-15 08:41:10 +02:00). The accompanying commit note lists,
# for the health check, added checks for vmalert, alertmanager and
# alert-notifier.
#
# Notification channels (configure via environment or .env.notifications):
#   - Telegram: TELEGRAM_BOT_TOKEN + TELEGRAM_CHAT_ID
#   - Email:    EMAIL_TO + EMAIL_FROM + SMTP_* settings
#   - ntfy:     NTFY_TOPIC
#
# Exit status: 0 when every check passes, 1 when at least one check failed.

# Ensure PATH includes docker
export PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"

# Resolve the directory of this script, then the project root two levels up.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Load notification config if exists
if [ -f "$PROJECT_ROOT/.env.notifications" ]; then
  # shellcheck source=/dev/null
  source "$PROJECT_ROOT/.env.notifications"
fi

# Colors for terminal output (escape sequences are expanded when printed)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Track failures: every failed check appends its display name here.
FAILURES=()

# ============================================
# Notification Functions
# ============================================

#######################################
# Send a message through the Telegram Bot API.
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID (read)
# Arguments: $1 - message text (sent with parse_mode=HTML)
# Outputs:   one status line on stdout when a send is attempted
# Returns:   0 (also when Telegram is not configured, which is a silent no-op)
#######################################
send_telegram() {
  local message="$1"

  if [ -z "$TELEGRAM_BOT_TOKEN" ] || [ -z "$TELEGRAM_CHAT_ID" ]; then
    return 0
  fi

  # --data-urlencode keeps '&', '+', '%' and newlines in the message intact;
  # --fail turns HTTP 4xx/5xx answers into a non-zero exit status so that a
  # rejected message is not reported as sent; --max-time bounds a hung call.
  if curl -s --fail --max-time 10 -X POST \
    "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
    --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
    --data-urlencode "text=${message}" \
    --data-urlencode "parse_mode=HTML" \
    >/dev/null 2>&1; then
    echo " [Telegram] Notification sent"
  else
    echo " [Telegram] Failed to send"
  fi
}

#######################################
# Send a plain-text e-mail with the first available client
# (msmtp, then mail, then sendmail).
# Globals:   EMAIL_TO (read; empty disables e-mail),
#            EMAIL_FROM (read; defaults to manacore@localhost)
# Arguments: $1 - subject
#            $2 - body
# Outputs:   one status line on stdout when a send is attempted
# Returns:   1 when no mail client is installed, otherwise 0
#######################################
send_email() {
  local subject="$1"
  local body="$2"
  local from="${EMAIL_FROM:-manacore@localhost}"
  local rc=0

  if [ -z "$EMAIL_TO" ]; then
    return 0
  fi

  # printf '%s' passes subject and body through verbatim; the previous
  # 'echo -e' expanded backslash sequences found inside them.
  # Use msmtp if available, otherwise try mail command
  if command -v msmtp >/dev/null 2>&1; then
    printf 'Subject: %s\nFrom: %s\nTo: %s\n\n%s\n' \
      "$subject" "$from" "$EMAIL_TO" "$body" |
      msmtp -a default "$EMAIL_TO" 2>/dev/null || rc=$?
  elif command -v mail >/dev/null 2>&1; then
    printf '%s\n' "$body" | mail -s "$subject" "$EMAIL_TO" 2>/dev/null || rc=$?
  elif command -v sendmail >/dev/null 2>&1; then
    printf 'Subject: %s\nFrom: %s\nTo: %s\n\n%s\n' \
      "$subject" "$from" "$EMAIL_TO" "$body" |
      sendmail "$EMAIL_TO" 2>/dev/null || rc=$?
  else
    echo " [Email] No mail client available (install msmtp)"
    return 1
  fi

  # rc holds the exit status of the mail client that was actually run.
  if [ "$rc" -eq 0 ]; then
    echo " [Email] Notification sent to $EMAIL_TO"
  else
    echo " [Email] Failed to send"
  fi
}

#######################################
# Publish a high-priority alert to an ntfy topic.
# Globals:   NTFY_TOPIC (read; empty disables ntfy),
#            NTFY_SERVER (read; optional base URL, defaults to https://ntfy.sh)
# Arguments: $1 - message text
# Outputs:   one status line on stdout when a send is attempted
# Returns:   0
#######################################
send_ntfy() {
  local message="$1"
  local server="${NTFY_SERVER:-https://ntfy.sh}"

  if [ -z "$NTFY_TOPIC" ]; then
    return 0
  fi

  # --fail makes HTTP 4xx/5xx answers count as a failed send;
  # --max-time keeps an unreachable server from stalling the health check.
  if curl -s --fail --max-time 10 -d "$message" \
    -H "Title: Mac Mini Alert" \
    -H "Priority: high" \
    -H "Tags: warning" \
    "${server%/}/$NTFY_TOPIC" >/dev/null 2>&1; then
    echo " [ntfy] Notification sent"
  else
    echo " [ntfy] Failed to send"
  fi
}

# Escape '&', '<' and '>' in $1 so the text can be embedded in a Telegram
# HTML-mode message; the escaped text is written to stdout.
_html_escape() {
  printf '%s' "$1" | sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g'
}

#######################################
# Build the alert text for every channel and dispatch it.
# Arguments: $1 - failed services, already formatted as a "- name" list
# Outputs:   progress lines on stdout (plus whatever the senders print)
# Calls:     send_telegram, send_email, send_ntfy
#######################################
send_all_notifications() {
  local failed_services="$1"
  local timestamp host safe_host safe_services

  # Declared separately from the assignments so a failing command is not
  # hidden behind the exit status of 'local'.
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  host=$(hostname)

  # The Telegram message is sent with parse_mode=HTML, so dynamic text has to
  # be escaped or a stray '<' or '&' makes the API reject the whole message.
  safe_host=$(_html_escape "$host")
  safe_services=$(_html_escape "$failed_services")

  # Telegram message (HTML format)
  local telegram_msg="🚨 <b>ManaCore Health Check Failed</b>

<b>Time:</b> ${timestamp}
<b>Host:</b> ${safe_host}

<b>Failed Services:</b>
${safe_services}

Check logs: <code>ssh mac-mini</code>"

  # Email message
  local email_subject="[ALERT] ManaCore Health Check Failed"
  local email_body="ManaCore Health Check Failed
=============================

Time: ${timestamp}
Host: ${host}

Failed Services:
${failed_services}

To investigate:
ssh mac-mini
cd ~/projects/manacore-monorepo
./scripts/mac-mini/status.sh
docker logs <container-name>"

  # Plain text for ntfy
  local ntfy_msg="ManaCore Failed: ${failed_services}"

  echo ""
  echo "Sending notifications..."
  send_telegram "$telegram_msg"
  send_email "$email_subject" "$email_body"
  send_ntfy "$ntfy_msg"
}

# ============================================
# Health Check Functions
# ============================================

#######################################
# Probe an HTTP endpoint and record the service as failed unless it
# answers with status 200.
# Globals:   GREEN, RED, NC (read); FAILURES (appended on failure)
# Arguments: $1 - display name of the service
#            $2 - URL to request
#            $3 - timeout in seconds passed to curl --max-time (default 5)
# Outputs:   one [OK]/[FAIL] line on stdout
# Returns:   0 when the endpoint answered 200, 1 otherwise
#######################################
check_service() {
  local name="$1"
  local url="$2"
  local timeout="${3:-5}"
  local status

  # Only the HTTP status code is captured; the response body is discarded.
  status=$(curl -s -o /dev/null -w "%{http_code}" --max-time "$timeout" "$url" 2>/dev/null)
  # If nothing was printed at all (for example curl is missing), show 000
  # instead of an empty code.
  status="${status:-000}"

  if [ "$status" = "200" ]; then
    # %b expands the colour escapes; %s prints the name verbatim.
    printf ' %b[OK]%b %s\n' "$GREEN" "$NC" "$name"
    return 0
  fi

  printf ' %b[FAIL]%b %s (HTTP %s)\n' "$RED" "$NC" "$name" "$status"
  FAILURES+=("$name")
  return 1
}

# ============================================
# Main Health Check
# ============================================

echo ""
echo "=== ManaCore Health Check ==="
echo "Time: $(date)"
echo ""

echo "Infrastructure:"
# Check postgres via docker
if docker exec mana-infra-postgres pg_isready -U postgres >/dev/null 2>&1; then
  echo -e " ${GREEN}[OK]${NC} PostgreSQL"
else
  echo -e " ${RED}[FAIL]${NC} PostgreSQL"
  FAILURES+=("PostgreSQL")
fi

# Check redis via docker
if docker exec mana-infra-redis redis-cli ping >/dev/null 2>&1; then
  echo -e " ${GREEN}[OK]${NC} Redis"
else
  echo -e " ${RED}[FAIL]${NC} Redis"
  FAILURES+=("Redis")
fi

# Check for stuck containers (Created/Exited status)
# '|| true' keeps an empty grep result from counting as an error.
STUCK_CONTAINERS=$(docker ps -a --filter "status=created" --filter "status=exited" --format "{{.Names}}" | grep "^mana-" || true)
if [ -n "$STUCK_CONTAINERS" ]; then
  echo -e " ${RED}[FAIL]${NC} Stuck containers detected:"
  while IFS= read -r stuck_name; do
    echo " - $stuck_name"
  done <<< "$STUCK_CONTAINERS"
  # Join the names with single spaces for the one-line failure entry
  # (printf '%s' avoids the trailing separator the old echo|tr left behind).
  FAILURES+=("Stuck containers: $(printf '%s' "$STUCK_CONTAINERS" | tr '\n' ' ')")
fi

# HTTP health probes, grouped by application. Each call prints one status
# line and, on failure, appends the service name to FAILURES.

echo ""
echo "Auth & Dashboard:"
check_service "Auth API" "http://localhost:3001/health"
check_service "Dashboard Web" "http://localhost:5000/health"

echo ""
echo "Chat:"
check_service "Chat Backend" "http://localhost:3030/health"
check_service "Chat Web" "http://localhost:5010/health"

echo ""
echo "Todo:"
check_service "Todo Backend" "http://localhost:3031/health"
check_service "Todo Web" "http://localhost:5011/health"

echo ""
echo "Calendar:"
check_service "Calendar Backend" "http://localhost:3032/health"
check_service "Calendar Web" "http://localhost:5012/health"

echo ""
echo "Clock:"
check_service "Clock Backend" "http://localhost:3033/health"
check_service "Clock Web" "http://localhost:5013/health"

echo ""
echo "Contacts:"
check_service "Contacts Backend" "http://localhost:3034/health"
check_service "Contacts Web" "http://localhost:5014/health"

echo ""
echo "Storage:"
check_service "Storage Backend" "http://localhost:3035/api/v1/health"
check_service "Storage Web" "http://localhost:5015/health"

echo ""
echo "Presi:"
check_service "Presi Backend" "http://localhost:3036/api/v1/health"
check_service "Presi Web" "http://localhost:5016/health"

echo ""
echo "NutriPhi:"
check_service "NutriPhi Backend" "http://localhost:3037/api/v1/health"
check_service "NutriPhi Web" "http://localhost:5017/health"

echo ""
echo "SkillTree:"
check_service "SkillTree Backend" "http://localhost:3038/health"
# SkillTree Web disabled - Dockerfile needs fix for shared packages

echo ""
echo "Photos:"
check_service "Photos Backend" "http://localhost:3039/api/v1/health"
check_service "Photos Web" "http://localhost:5019/health"

echo ""
echo "Core Services:"
# API Gateway disabled - no GHCR image, no Dockerfile
check_service "Search Service" "http://localhost:3020/api/v1/health"
check_service "Media Service" "http://localhost:3015/api/v1/health"
check_service "LLM Service" "http://localhost:3025/health"

echo ""
echo "Matrix:"
check_service "Synapse" "http://localhost:4000/health"
check_service "Element Web" "http://localhost:4080/"
check_service "Matrix Web" "http://localhost:4090/health"
check_service "Matrix Mana Bot" "http://localhost:4010/health"
check_service "Matrix Ollama Bot" "http://localhost:4011/health"
check_service "Matrix Stats Bot" "http://localhost:4012/health"
check_service "Matrix Project Doc Bot" "http://localhost:4013/health"

echo ""
echo "Monitoring:"
check_service "Grafana" "http://localhost:8000/api/health"
check_service "Umami" "http://localhost:8010/api/heartbeat"
check_service "VictoriaMetrics" "http://localhost:9090/health"

echo ""
echo "Alerting:"
check_service "vmalert" "http://localhost:8880/health"
check_service "Alertmanager" "http://localhost:9093/-/healthy"
check_service "Alert Notifier" "http://localhost:9095/health"

echo ""
echo "Cloudflare Tunnel:"
# pgrep -x requires an exact process-name match for "cloudflared".
if pgrep -x "cloudflared" >/dev/null; then
  echo -e " ${GREEN}[OK]${NC} cloudflared running"
else
  echo -e " ${RED}[FAIL]${NC} cloudflared not running"
  FAILURES+=("cloudflared")
fi

echo ""
echo "=== Summary ==="

# Nothing recorded in FAILURES: report success and stop here.
if [ "${#FAILURES[@]}" -eq 0 ]; then
  echo -e "${GREEN}All services healthy!${NC}"
  exit 0
fi

echo -e "${RED}Failed services (${#FAILURES[@]}):${NC}"
for f in "${FAILURES[@]}"; do
  echo " - $f"
done

# One "- name" line per failure. printf emits the names verbatim, whereas the
# earlier literal "\n" + 'echo -e' approach also expanded any backslash
# sequence contained in a name. '--' stops the leading '-' of the format from
# being parsed as an option.
FAILED_LIST=$(printf -- '- %s\n' "${FAILURES[@]}")

# Send notifications
send_all_notifications "$FAILED_LIST"

exit 1