mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-17 15:49:40 +02:00
🔧 fix(mac-mini): add container recovery and update health check ports
- Add ensure-containers-running.sh to detect and auto-start stuck containers
- Add LaunchD plist for automatic container health checks every 5 minutes
- Update health-check.sh with correct ports (3031/5011 for todo, etc.)
- Update deploy.sh health checks to match docker-compose.macmini.yml
- Fix container name references (mana-infra-postgres instead of manacore-postgres)

This prevents 502 errors when containers get stuck in "Created" status.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
03c9267a69
commit
2fe7f842c6
4 changed files with 261 additions and 33 deletions
|
|
@ -92,16 +92,18 @@ check_health() {
|
|||
fi
|
||||
}
|
||||
|
||||
check_health "Auth API" "http://localhost:3001/api/v1/health"
|
||||
check_health "ManaCore Web" "http://localhost:5173/health"
|
||||
check_health "Chat Backend" "http://localhost:3002/api/v1/health"
|
||||
check_health "Chat Web" "http://localhost:3000/health"
|
||||
check_health "Todo Backend" "http://localhost:3018/api/health"
|
||||
check_health "Todo Web" "http://localhost:5188/health"
|
||||
check_health "Calendar Backend" "http://localhost:3016/api/v1/health"
|
||||
check_health "Calendar Web" "http://localhost:5186/health"
|
||||
check_health "Clock Backend" "http://localhost:3017/api/v1/health"
|
||||
check_health "Clock Web" "http://localhost:5187/health"
|
||||
check_health "Auth API" "http://localhost:3001/health"
|
||||
check_health "ManaCore Web" "http://localhost:5000/health"
|
||||
check_health "Chat Backend" "http://localhost:3030/health"
|
||||
check_health "Chat Web" "http://localhost:5010/health"
|
||||
check_health "Todo Backend" "http://localhost:3031/health"
|
||||
check_health "Todo Web" "http://localhost:5011/health"
|
||||
check_health "Calendar Backend" "http://localhost:3032/health"
|
||||
check_health "Calendar Web" "http://localhost:5012/health"
|
||||
check_health "Clock Backend" "http://localhost:3033/health"
|
||||
check_health "Clock Web" "http://localhost:5013/health"
|
||||
check_health "Contacts Backend" "http://localhost:3034/health"
|
||||
check_health "Contacts Web" "http://localhost:5014/health"
|
||||
|
||||
echo ""
|
||||
echo "=== Deployment Complete ==="
|
||||
|
|
|
|||
156
scripts/mac-mini/ensure-containers-running.sh
Executable file
156
scripts/mac-mini/ensure-containers-running.sh
Executable file
|
|
@ -0,0 +1,156 @@
|
|||
#!/bin/bash
# ManaCore Container Health Enforcer
#
# Ensures all containers are actually running, not just created: detects
# containers that are stuck in "Created" or "Exited" status and starts them.
#
# Run via LaunchD every 5 minutes or after system startup.
#
# Environment:
#   LOG_FILE  Optional. Overrides the log file location
#             (default: /tmp/manacore-container-health.log).

set -e

# Ensure PATH includes docker
export PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"

# All paths are derived from this script's own location so the script can be
# invoked from any working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
COMPOSE_FILE="$PROJECT_ROOT/docker-compose.macmini.yml"
ENV_FILE="$PROJECT_ROOT/.env.macmini"
LOG_FILE="${LOG_FILE:-/tmp/manacore-container-health.log}"

# Load notification config if it exists (TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID
# and NTFY_TOPIC are read later when notifications are sent).
if [ -f "$PROJECT_ROOT/.env.notifications" ]; then
    # shellcheck source=/dev/null
    source "$PROJECT_ROOT/.env.notifications"
fi
|
||||
|
||||
# Print a timestamped message to stdout and append it to $LOG_FILE.
#
# Arguments:
#   $1 - message text
# Globals:
#   LOG_FILE (read) - file the line is appended to
# Returns:
#   Always 0. The script runs under `set -e`, so an unwritable log file must
#   not abort the recovery run; the file append is therefore best-effort.
log() {
    local line
    line="[$(date '+%Y-%m-%d %H:%M:%S')] $1"
    printf '%s\n' "$line"
    # Group the redirection so the shell's own "cannot open" diagnostic is
    # silenced as well.
    { printf '%s\n' "$line" >>"$LOG_FILE"; } 2>/dev/null || true
}
|
||||
|
||||
# Send a best-effort notification to every configured channel.
#
# Arguments:
#   $1 - message text; a literal backslash-n sequence is turned into a real
#        line break before sending
# Globals (read):
#   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID - Telegram is used when both are set
#   NTFY_TOPIC                           - ntfy.sh is used when set
# Returns:
#   Always 0; delivery failures are deliberately ignored so that a
#   notification problem never fails the calling script.
send_notification() {
    local message="$1"
    local newline=$'\n'

    # Callers write "\n" inside double quotes, which the shell leaves as a
    # literal backslash + n. Convert those sequences into real line breaks.
    message=${message//\\n/$newline}

    # Telegram
    if [ -n "${TELEGRAM_BOT_TOKEN:-}" ] && [ -n "${TELEGRAM_CHAT_ID:-}" ]; then
        # --data-urlencode keeps '&', '+', '%' and line breaks in the text
        # from corrupting the form-encoded request body.
        curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
            --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
            --data-urlencode "text=${message}" \
            --data-urlencode "parse_mode=HTML" \
            >/dev/null 2>&1 || true
    fi

    # ntfy
    if [ -n "${NTFY_TOPIC:-}" ]; then
        curl -s -d "$message" \
            -H "Title: ManaCore Container Fixed" \
            -H "Priority: default" \
            -H "Tags: white_check_mark" \
            "https://ntfy.sh/$NTFY_TOPIC" >/dev/null 2>&1 || true
    fi
}
|
||||
|
||||
# Check if docker is running
if ! docker info >/dev/null 2>&1; then
    log "ERROR: Docker is not running"
    exit 1
fi

# Get containers that are NOT running (Created or Exited).
# The listing is captured on its own so that a failing `docker ps` is reported
# as an error instead of being mistaken for "nothing is stuck".
if ! NOT_RUNNING=$(docker ps -a --filter "status=created" --filter "status=exited" --format "{{.Names}}"); then
    log "ERROR: Could not list containers"
    exit 1
fi

# Filter only mana-* containers from our compose file.
# grep exits non-zero when nothing matches; that is the healthy case here.
STUCK_CONTAINERS=$(printf '%s\n' "$NOT_RUNNING" | grep "^mana-" || true)

if [ -z "$STUCK_CONTAINERS" ]; then
    log "OK: All containers are running"
    exit 0
fi

log "WARNING: Found containers not running:"
while IFS= read -r container; do
    # Fall back to "unknown" if the container vanished between the two calls.
    STATUS=$(docker inspect "$container" --format '{{.State.Status}}' 2>/dev/null || echo "unknown")
    log " - $container (status: $STATUS)"
done <<<"$STUCK_CONTAINERS"
|
||||
|
||||
# Bring every stuck container back up. Known containers are started through
# docker compose so that service dependencies are respected; anything else is
# started directly with `docker start`.
log "Starting stuck containers via docker compose..."

cd "$PROJECT_ROOT"

for container in $STUCK_CONTAINERS; do
    # Translate the container name into its compose service name.
    # Container naming: mana-{category}-{service} or mana-app-{service}-{type}
    SERVICE_NAME=""

    case "$container" in
        # App containers whose service name is simply the container name
        # without the "mana-app-" prefix.
        mana-app-todo-web | mana-app-todo-backend | \
        mana-app-chat-web | mana-app-chat-backend | \
        mana-app-calendar-web | mana-app-calendar-backend | \
        mana-app-clock-web | mana-app-clock-backend | \
        mana-app-contacts-web | mana-app-contacts-backend | \
        mana-app-storage-web | mana-app-storage-backend | \
        mana-app-presi-web | mana-app-presi-backend | \
        mana-app-nutriphi-web | mana-app-nutriphi-backend | \
        mana-app-skilltree-web | mana-app-skilltree-backend | \
        mana-app-photos-web | mana-app-photos-backend | \
        mana-app-llm-playground)
            SERVICE_NAME="${container#mana-app-}"
            ;;

        # Containers whose service name does not follow from the prefix.
        mana-app-web)        SERVICE_NAME="mana-web" ;;
        mana-core-auth)      SERVICE_NAME="mana-auth" ;;
        mana-core-gateway)   SERVICE_NAME="api-gateway" ;;
        mana-core-search)    SERVICE_NAME="mana-search" ;;
        mana-core-searxng)   SERVICE_NAME="searxng" ;;
        mana-core-media)     SERVICE_NAME="mana-media" ;;
        mana-matrix-synapse) SERVICE_NAME="synapse" ;;
        mana-matrix-element) SERVICE_NAME="element-web" ;;
        mana-matrix-web)     SERVICE_NAME="matrix-web" ;;

        # Infrastructure: strip the "mana-infra-" prefix.
        mana-infra-postgres | mana-infra-redis | mana-infra-minio)
            SERVICE_NAME="${container#mana-infra-}"
            ;;

        # Matrix bots: mana-matrix-bot-<name> -> matrix-<name>-bot
        mana-matrix-bot-*)
            SERVICE_NAME="matrix-${container#mana-matrix-bot-}-bot"
            ;;

        # Monitoring, automation and generic services: strip the prefix.
        mana-mon-*)     SERVICE_NAME="${container#mana-mon-}" ;;
        mana-auto-*)    SERVICE_NAME="${container#mana-auto-}" ;;
        mana-service-*) SERVICE_NAME="${container#mana-service-}" ;;

        *)
            log " Unknown container pattern: $container, trying direct start"
            docker start "$container" 2>&1 || true
            continue
            ;;
    esac

    # A wildcard arm can produce an empty name (bare prefix); nothing to start.
    [ -n "$SERVICE_NAME" ] || continue

    log " Starting service: $SERVICE_NAME"
    if ! docker compose -f "$COMPOSE_FILE" --env-file "$ENV_FILE" up -d "$SERVICE_NAME" 2>&1; then
        log " WARNING: Failed to start $SERVICE_NAME via compose, trying direct start"
        docker start "$container" 2>&1 || true
    fi
done
|
||||
|
||||
# Wait for containers to start
sleep 10

# Verify containers are now running
STILL_STUCK=$(docker ps -a --filter "status=created" --filter "status=exited" --format "{{.Names}}" | grep "^mana-" || true)

# The name lists hold one container per line. They are joined with ", " via
# parameter expansion; piping an unquoted `echo` through `tr '\n' ', '` only
# produced a space-separated list with a trailing comma.
NEWLINE=$'\n'

if [ -z "$STILL_STUCK" ]; then
    FIXED_MSG="Auto-fixed stuck containers: ${STUCK_CONTAINERS//$NEWLINE/, }"
    log "SUCCESS: $FIXED_MSG"
    send_notification "🔧 <b>ManaCore Auto-Recovery</b>\n\n$FIXED_MSG"
else
    log "ERROR: Some containers still not running:"
    while IFS= read -r container; do
        log " - $container"
    done <<<"$STILL_STUCK"
    send_notification "⚠️ <b>ManaCore Container Issue</b>\n\nContainers still stuck: ${STILL_STUCK//$NEWLINE/, }"
    # Non-zero exit marks this run as failed.
    exit 1
fi
|
||||
|
|
@ -174,7 +174,7 @@ echo ""
|
|||
|
||||
echo "Infrastructure:"
|
||||
# Check postgres via docker
|
||||
if docker exec manacore-postgres pg_isready -U postgres >/dev/null 2>&1; then
|
||||
if docker exec mana-infra-postgres pg_isready -U postgres >/dev/null 2>&1; then
|
||||
echo -e " ${GREEN}[OK]${NC} PostgreSQL"
|
||||
else
|
||||
echo -e " ${RED}[FAIL]${NC} PostgreSQL"
|
||||
|
|
@ -182,60 +182,98 @@ else
|
|||
fi
|
||||
|
||||
# Check redis via docker
|
||||
if docker exec manacore-redis redis-cli ping >/dev/null 2>&1; then
|
||||
if docker exec mana-infra-redis redis-cli ping >/dev/null 2>&1; then
|
||||
echo -e " ${GREEN}[OK]${NC} Redis"
|
||||
else
|
||||
echo -e " ${RED}[FAIL]${NC} Redis"
|
||||
FAILURES+=("Redis")
|
||||
fi
|
||||
|
||||
# Check for stuck containers (Created/Exited status)
|
||||
STUCK_CONTAINERS=$(docker ps -a --filter "status=created" --filter "status=exited" --format "{{.Names}}" | grep "^mana-" || true)
|
||||
if [ -n "$STUCK_CONTAINERS" ]; then
|
||||
echo -e " ${RED}[FAIL]${NC} Stuck containers detected:"
|
||||
echo "$STUCK_CONTAINERS" | while read c; do echo " - $c"; done
|
||||
FAILURES+=("Stuck containers: $(echo $STUCK_CONTAINERS | tr '\n' ' ')")
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "Auth & Dashboard:"
|
||||
check_service "Auth API" "http://localhost:3001/health"
|
||||
check_service "Dashboard Web" "http://localhost:5173/health"
|
||||
check_service "Dashboard Web" "http://localhost:5000/health"
|
||||
|
||||
echo ""
|
||||
echo "Chat:"
|
||||
check_service "Chat Backend" "http://localhost:3002/health"
|
||||
check_service "Chat Web" "http://localhost:3000/health"
|
||||
check_service "Chat Backend" "http://localhost:3030/health"
|
||||
check_service "Chat Web" "http://localhost:5010/health"
|
||||
|
||||
echo ""
|
||||
echo "Todo:"
|
||||
check_service "Todo Backend" "http://localhost:3018/health"
|
||||
check_service "Todo Web" "http://localhost:5188/health"
|
||||
check_service "Todo Backend" "http://localhost:3031/health"
|
||||
check_service "Todo Web" "http://localhost:5011/health"
|
||||
|
||||
echo ""
|
||||
echo "Calendar:"
|
||||
check_service "Calendar Backend" "http://localhost:3016/health"
|
||||
check_service "Calendar Web" "http://localhost:5186/health"
|
||||
check_service "Calendar Backend" "http://localhost:3032/health"
|
||||
check_service "Calendar Web" "http://localhost:5012/health"
|
||||
|
||||
echo ""
|
||||
echo "Clock:"
|
||||
check_service "Clock Backend" "http://localhost:3017/health"
|
||||
check_service "Clock Web" "http://localhost:5187/health"
|
||||
check_service "Clock Backend" "http://localhost:3033/health"
|
||||
check_service "Clock Web" "http://localhost:5013/health"
|
||||
|
||||
echo ""
|
||||
echo "Contacts:"
|
||||
check_service "Contacts Backend" "http://localhost:3015/health"
|
||||
check_service "Contacts Web" "http://localhost:5184/health"
|
||||
check_service "Contacts Backend" "http://localhost:3034/health"
|
||||
check_service "Contacts Web" "http://localhost:5014/health"
|
||||
|
||||
echo ""
|
||||
echo "Storage:"
|
||||
check_service "Storage Backend" "http://localhost:3019/api/v1/health"
|
||||
check_service "Storage Web" "http://localhost:5185/health"
|
||||
check_service "Storage Backend" "http://localhost:3035/api/v1/health"
|
||||
check_service "Storage Web" "http://localhost:5015/health"
|
||||
|
||||
echo ""
|
||||
echo "Presi:"
|
||||
check_service "Presi Backend" "http://localhost:3008/api/v1/health"
|
||||
check_service "Presi Web" "http://localhost:5178/health"
|
||||
check_service "Presi Backend" "http://localhost:3036/api/v1/health"
|
||||
check_service "Presi Web" "http://localhost:5016/health"
|
||||
|
||||
echo ""
|
||||
echo "Matrix (DSGVO-konform):"
|
||||
check_service "Synapse" "http://localhost:8008/health"
|
||||
check_service "Element Web" "http://localhost:8087/"
|
||||
check_service "Matrix Ollama Bot" "http://localhost:3311/health"
|
||||
check_service "Matrix Stats Bot" "http://localhost:3312/health"
|
||||
check_service "Matrix Project Doc Bot" "http://localhost:3313/health"
|
||||
echo "NutriPhi:"
|
||||
check_service "NutriPhi Backend" "http://localhost:3037/api/v1/health"
|
||||
check_service "NutriPhi Web" "http://localhost:5017/health"
|
||||
|
||||
echo ""
|
||||
echo "SkillTree:"
|
||||
check_service "SkillTree Backend" "http://localhost:3038/health"
|
||||
check_service "SkillTree Web" "http://localhost:5018/health"
|
||||
|
||||
echo ""
|
||||
echo "Photos:"
|
||||
check_service "Photos Backend" "http://localhost:3039/api/v1/health"
|
||||
check_service "Photos Web" "http://localhost:5019/health"
|
||||
|
||||
echo ""
|
||||
echo "Core Services:"
|
||||
check_service "API Gateway" "http://localhost:3010/health"
|
||||
check_service "Search Service" "http://localhost:3020/health"
|
||||
check_service "Media Service" "http://localhost:3015/api/v1/health"
|
||||
check_service "LLM Service" "http://localhost:3025/health"
|
||||
|
||||
echo ""
|
||||
echo "Matrix:"
|
||||
check_service "Synapse" "http://localhost:4000/health"
|
||||
check_service "Element Web" "http://localhost:4080/"
|
||||
check_service "Matrix Web" "http://localhost:4090/health"
|
||||
check_service "Matrix Mana Bot" "http://localhost:4010/health"
|
||||
check_service "Matrix Ollama Bot" "http://localhost:4011/health"
|
||||
check_service "Matrix Stats Bot" "http://localhost:4012/health"
|
||||
check_service "Matrix Project Doc Bot" "http://localhost:4013/health"
|
||||
|
||||
echo ""
|
||||
echo "Monitoring:"
|
||||
check_service "Grafana" "http://localhost:8000/api/health"
|
||||
check_service "Umami" "http://localhost:8010/api/heartbeat"
|
||||
check_service "VictoriaMetrics" "http://localhost:9090/health"
|
||||
|
||||
echo ""
|
||||
echo "Cloudflare Tunnel:"
|
||||
|
|
|
|||
|
|
@ -0,0 +1,32 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<!-- LaunchD job that periodically runs ensure-containers-running.sh. -->
<plist version="1.0">
<dict>
    <!-- Unique job identifier. -->
    <key>Label</key>
    <string>com.manacore.ensure-containers</string>

    <!-- Command line: run the recovery script with /bin/bash. -->
    <key>ProgramArguments</key>
    <array>
        <string>/bin/bash</string>
        <string>/Users/till/projects/manacore-monorepo/scripts/mac-mini/ensure-containers-running.sh</string>
    </array>

    <!-- Run every 300 seconds (5 minutes). -->
    <key>StartInterval</key>
    <integer>300</integer>

    <!-- Also run once as soon as the job is loaded. -->
    <key>RunAtLoad</key>
    <true/>

    <!-- stdout and stderr are both written to the same log file. -->
    <key>StandardOutPath</key>
    <string>/tmp/manacore-ensure-containers.log</string>

    <key>StandardErrorPath</key>
    <string>/tmp/manacore-ensure-containers.log</string>

    <!-- PATH handed to the script. -->
    <key>EnvironmentVariables</key>
    <dict>
        <key>PATH</key>
        <string>/usr/local/bin:/opt/homebrew/bin:/usr/bin:/bin</string>
    </dict>
</dict>
</plist>
|
||||
Loading…
Add table
Add a link
Reference in a new issue