diff --git a/docker-compose.macmini.yml b/docker-compose.macmini.yml
index 0a4044e11..87afdb919 100644
--- a/docker-compose.macmini.yml
+++ b/docker-compose.macmini.yml
@@ -1431,6 +1431,11 @@ services:
- /dev/disk/:/dev/disk:ro
ports:
- "9110:8080"
+ healthcheck:
+ test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:8080/healthz"]
+ interval: 30s
+ timeout: 10s
+ retries: 3
postgres-exporter:
image: prometheuscommunity/postgres-exporter:v0.15.0
@@ -1481,6 +1486,80 @@ services:
timeout: 10s
retries: 3
+ # ============================================
+ # Alerting Stack (Ports 8880, 9093, 9095)
+ # ============================================
+
+  # Rule evaluator: loads the rule file mounted under /etc/alerts, queries
+  # VictoriaMetrics every 30s and hands alerts to Alertmanager.
+  vmalert:
+    container_name: mana-mon-vmalert
+    image: victoriametrics/vmalert:v1.99.0
+    restart: always
+    depends_on:
+      alertmanager:
+        condition: service_healthy
+      victoriametrics:
+        condition: service_healthy
+    volumes:
+      - ./docker/prometheus/alerts.yml:/etc/alerts/alerts.yml:ro
+    command:
+      - "-rule=/etc/alerts/*.yml"
+      - "-evaluationInterval=30s"
+      # Query, remote-read and remote-write all use the same VictoriaMetrics instance
+      - "-datasource.url=http://victoriametrics:9090"
+      - "-remoteRead.url=http://victoriametrics:9090"
+      - "-remoteWrite.url=http://victoriametrics:9090"
+      - "-notifier.url=http://alertmanager:9093"
+      - "-httpListenAddr=:8880"
+    ports:
+      - "8880:8880"
+    healthcheck:
+      test:
+        - "CMD"
+        - "wget"
+        - "--no-verbose"
+        - "--tries=1"
+        - "--spider"
+        - "http://127.0.0.1:8880/health"
+      interval: 30s
+      timeout: 10s
+      retries: 3
+
+  # Routes and deduplicates alerts, then calls the alert-notifier webhook
+  # (see docker/alertmanager/alertmanager.yml).
+  alertmanager:
+    container_name: mana-mon-alertmanager
+    image: prom/alertmanager:v0.27.0
+    restart: always
+    depends_on:
+      alert-notifier:
+        condition: service_healthy
+    volumes:
+      - ./docker/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
+      - alertmanager_data:/alertmanager
+    command:
+      - "--config.file=/etc/alertmanager/alertmanager.yml"
+      - "--storage.path=/alertmanager"
+      - "--web.listen-address=:9093"
+    ports:
+      - "9093:9093"
+    healthcheck:
+      test:
+        - "CMD"
+        - "wget"
+        - "--no-verbose"
+        - "--tries=1"
+        - "--spider"
+        - "http://127.0.0.1:9093/-/healthy"
+      interval: 30s
+      timeout: 10s
+      retries: 3
+
+  # Webhook receiver built from ./docker/alert-notifier; forwards Alertmanager
+  # notifications to Telegram and ntfy. Channels whose variables are empty
+  # are skipped by the service.
+  alert-notifier:
+    build:
+      context: ./docker/alert-notifier
+      dockerfile: Dockerfile
+    image: alert-notifier:local
+    container_name: mana-mon-alert-notifier
+    restart: always
+    environment:
+      # Quoted so the value stays a string instead of a YAML integer
+      PORT: "8080"
+      TELEGRAM_BOT_TOKEN: ${TELEGRAM_BOT_TOKEN:-}
+      TELEGRAM_CHAT_ID: ${TELEGRAM_CHAT_ID:-}
+      NTFY_TOPIC: ${NTFY_TOPIC:-}
+    ports:
+      - "9095:8080"
+    healthcheck:
+      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:8080/health"]
+      interval: 30s
+      timeout: 5s
+      retries: 3
+      start_period: 5s
+
# ============================================
# Auto-Update (Watchtower)
# ============================================
@@ -1508,6 +1587,8 @@ volumes:
name: mana-redis-data
victoriametrics_data:
name: mana-victoria-data
+ alertmanager_data:
+ name: mana-alertmanager-data
grafana_data:
name: mana-grafana-data
analytics_data:
diff --git a/docker/alert-notifier/Dockerfile b/docker/alert-notifier/Dockerfile
new file mode 100644
index 000000000..cb045d43b
--- /dev/null
+++ b/docker/alert-notifier/Dockerfile
@@ -0,0 +1,17 @@
+FROM python:3.12-alpine
+
+WORKDIR /app
+
+COPY main.py .
+
+# No dependencies needed - uses only stdlib.
+# The service listens on an unprivileged port and only makes outbound HTTPS
+# calls, so it runs as a dedicated non-root user.
+RUN chmod +x main.py \
+    && adduser -D -H -u 10001 notifier
+
+USER notifier
+
+EXPOSE 8080
+
+ENV PORT=8080
+
+# Shell form so ${PORT} is expanded at probe time and follows an overridden PORT.
+HEALTHCHECK --interval=30s --timeout=5s --start-period=5s --retries=3 \
+    CMD wget --no-verbose --tries=1 --spider "http://127.0.0.1:${PORT}/health" || exit 1
+
+CMD ["python", "main.py"]
diff --git a/docker/alert-notifier/main.py b/docker/alert-notifier/main.py
new file mode 100644
index 000000000..f7cc6fcb5
--- /dev/null
+++ b/docker/alert-notifier/main.py
@@ -0,0 +1,204 @@
+#!/usr/bin/env python3
+"""
+Alert Notifier - Webhook receiver for Alertmanager
+Forwards alerts to Telegram and ntfy
+
+Environment Variables:
+ TELEGRAM_BOT_TOKEN - Telegram bot token
+ TELEGRAM_CHAT_ID - Telegram chat ID
+ NTFY_TOPIC - ntfy.sh topic name (optional)
+"""
+
+import os
+import json
+import logging
+from http.server import HTTPServer, BaseHTTPRequestHandler
+import urllib.request
+import urllib.parse
+
+logging.basicConfig(
+ level=logging.INFO,
+ format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+TELEGRAM_BOT_TOKEN = os.environ.get('TELEGRAM_BOT_TOKEN', '')
+TELEGRAM_CHAT_ID = os.environ.get('TELEGRAM_CHAT_ID', '')
+NTFY_TOPIC = os.environ.get('NTFY_TOPIC', '')
+
+SEVERITY_EMOJI = {
+ 'critical': 'đ¨',
+ 'warning': 'â ī¸',
+ 'info': 'âšī¸',
+}
+
+def format_alert_telegram(alert: dict, status: str) -> str:
+ """Format a single alert for Telegram."""
+ labels = alert.get('labels', {})
+ annotations = alert.get('annotations', {})
+
+ severity = labels.get('severity', 'unknown')
+ emoji = SEVERITY_EMOJI.get(severity, 'đ')
+
+ if status == 'resolved':
+ emoji = 'â
'
+
+ alertname = labels.get('alertname', 'Unknown')
+ job = labels.get('job', '')
+ summary = annotations.get('summary', alertname)
+ description = annotations.get('description', '')
+
+ msg = f"{emoji} {status.upper()}: {summary}\n"
+ if job:
+ msg += f"Service: {job}\n"
+ if description:
+ msg += f"{description}\n"
+
+ return msg
+
+def send_telegram(message: str) -> bool:
+ """Send message to Telegram."""
+ if not TELEGRAM_BOT_TOKEN or not TELEGRAM_CHAT_ID:
+ logger.warning("Telegram not configured")
+ return False
+
+ url = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
+ data = {
+ 'chat_id': TELEGRAM_CHAT_ID,
+ 'text': message,
+ 'parse_mode': 'HTML',
+ 'disable_web_page_preview': True
+ }
+
+ try:
+ req = urllib.request.Request(
+ url,
+ data=urllib.parse.urlencode(data).encode(),
+ headers={'Content-Type': 'application/x-www-form-urlencoded'}
+ )
+ with urllib.request.urlopen(req, timeout=10) as resp:
+ return resp.status == 200
+ except Exception as e:
+ logger.error(f"Telegram send failed: {e}")
+ return False
+
+def send_ntfy(title: str, message: str, priority: str = 'default') -> bool:
+ """Send message to ntfy."""
+ if not NTFY_TOPIC:
+ return False
+
+ url = f"https://ntfy.sh/{NTFY_TOPIC}"
+
+ priority_map = {
+ 'critical': 'urgent',
+ 'warning': 'high',
+ 'info': 'low'
+ }
+ ntfy_priority = priority_map.get(priority, 'default')
+
+ try:
+ req = urllib.request.Request(
+ url,
+ data=message.encode('utf-8'),
+ headers={
+ 'Title': title,
+ 'Priority': ntfy_priority,
+ 'Tags': 'warning' if priority == 'critical' else 'loudspeaker'
+ }
+ )
+ with urllib.request.urlopen(req, timeout=10) as resp:
+ return resp.status == 200
+ except Exception as e:
+ logger.error(f"ntfy send failed: {e}")
+ return False
+
+class AlertHandler(BaseHTTPRequestHandler):
+ def do_POST(self):
+ if self.path != '/webhook':
+ self.send_response(404)
+ self.end_headers()
+ return
+
+ content_length = int(self.headers.get('Content-Length', 0))
+ body = self.rfile.read(content_length)
+
+ try:
+ payload = json.loads(body)
+ self.process_alerts(payload)
+ self.send_response(200)
+ self.end_headers()
+ self.wfile.write(b'OK')
+ except Exception as e:
+ logger.error(f"Error processing webhook: {e}")
+ self.send_response(500)
+ self.end_headers()
+ self.wfile.write(str(e).encode())
+
+ def do_GET(self):
+ if self.path == '/health':
+ self.send_response(200)
+ self.end_headers()
+ self.wfile.write(b'OK')
+ else:
+ self.send_response(404)
+ self.end_headers()
+
+ def process_alerts(self, payload: dict):
+ """Process Alertmanager webhook payload."""
+ status = payload.get('status', 'unknown')
+ alerts = payload.get('alerts', [])
+
+ if not alerts:
+ return
+
+ logger.info(f"Received {len(alerts)} alerts with status: {status}")
+
+ # Build message
+ messages = []
+ highest_severity = 'info'
+
+ for alert in alerts:
+ msg = format_alert_telegram(alert, alert.get('status', status))
+ messages.append(msg)
+
+ severity = alert.get('labels', {}).get('severity', 'info')
+ if severity == 'critical':
+ highest_severity = 'critical'
+ elif severity == 'warning' and highest_severity != 'critical':
+ highest_severity = 'warning'
+
+ combined_message = '\n'.join(messages)
+
+ # Send notifications
+ if TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID:
+ success = send_telegram(combined_message)
+ logger.info(f"Telegram: {'sent' if success else 'failed'}")
+
+ if NTFY_TOPIC:
+ title = f"ManaCore Alert ({len(alerts)} alerts)"
+ # Strip HTML for ntfy
+ plain_message = combined_message.replace('', '').replace('', '')
+ plain_message = plain_message.replace('', '').replace('', '')
+ success = send_ntfy(title, plain_message, highest_severity)
+ logger.info(f"ntfy: {'sent' if success else 'failed'}")
+
+ def log_message(self, format, *args):
+ logger.info(f"{self.client_address[0]} - {format % args}")
+
+def main():
+ port = int(os.environ.get('PORT', 8080))
+
+ logger.info(f"Starting Alert Notifier on port {port}")
+ logger.info(f"Telegram configured: {bool(TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID)}")
+ logger.info(f"ntfy configured: {bool(NTFY_TOPIC)}")
+
+ server = HTTPServer(('0.0.0.0', port), AlertHandler)
+
+ try:
+ server.serve_forever()
+ except KeyboardInterrupt:
+ logger.info("Shutting down")
+ server.shutdown()
+
+if __name__ == '__main__':
+ main()
diff --git a/docker/alertmanager/alertmanager.yml b/docker/alertmanager/alertmanager.yml
new file mode 100644
index 000000000..98d0cfd19
--- /dev/null
+++ b/docker/alertmanager/alertmanager.yml
@@ -0,0 +1,62 @@
+# Alertmanager Configuration for ManaCore
+# Sends alerts via webhook to custom notification handler
+
+global:
+  resolve_timeout: 5m
+
+route:
+  # Default receiver for all alerts
+  receiver: 'webhook'
+  # One notification group per alert name, severity and job
+  group_by: ['alertname', 'severity', 'job']
+  # Wait before sending the first notification for a new group
+  group_wait: 30s
+  # Wait before notifying about new alerts added to an already-notified group
+  group_interval: 5m
+  # Wait before repeating a notification for alerts that are still firing
+  repeat_interval: 4h
+
+  routes:
+    # Critical alerts - short initial delay, hourly reminders
+    - matchers:
+        - 'severity = "critical"'
+      receiver: 'webhook'
+      group_wait: 10s
+      repeat_interval: 1h
+
+    # Warning alerts - less frequent
+    - matchers:
+        - 'severity = "warning"'
+      receiver: 'webhook'
+      group_wait: 1m
+      repeat_interval: 6h
+
+    # Info alerts - batched for 5 minutes, reminded at most once a day
+    # (no time-of-day restriction is configured in this file)
+    - matchers:
+        - 'severity = "info"'
+      receiver: 'webhook'
+      group_wait: 5m
+      repeat_interval: 24h
+
+receivers:
+  - name: 'webhook'
+    webhook_configs:
+      - url: 'http://alert-notifier:8080/webhook'
+        send_resolved: true
+        max_alerts: 10
+
+# Inhibition rules - prevent redundant alerts
+inhibit_rules:
+  # Don't alert on warnings if critical is firing for same alert name and job
+  - source_matchers:
+      - 'severity = "critical"'
+    target_matchers:
+      - 'severity = "warning"'
+    equal: ['alertname', 'job']
+
+  # Don't alert on service-specific issues if PostgreSQL is down
+  - source_matchers:
+      - 'alertname = "PostgreSQLDown"'
+    target_matchers:
+      - 'alertname =~ ".*(Backend|Service).*"'
+    equal: []
diff --git a/docker/prometheus/prometheus.yml b/docker/prometheus/prometheus.yml
index 4dcf8f5b6..a53e9d4d9 100644
--- a/docker/prometheus/prometheus.yml
+++ b/docker/prometheus/prometheus.yml
@@ -9,11 +9,11 @@ global:
rule_files:
- /etc/prometheus/alerts.yml
-# Alertmanager configuration (optional, for future use)
-# alerting:
-# alertmanagers:
-# - static_configs:
-# - targets: []
+# Alertmanager configuration
+alerting:
+ alertmanagers:
+ - static_configs:
+ - targets: ['alertmanager:9093']
scrape_configs:
# Prometheus self-monitoring
@@ -30,10 +30,10 @@ scrape_configs:
target_label: instance
replacement: 'mac-mini'
- # Docker container metrics via cAdvisor (disabled - container not deployed)
- # - job_name: 'cadvisor'
- # static_configs:
- # - targets: ['cadvisor:8080']
+ # Docker container metrics via cAdvisor
+ - job_name: 'cadvisor'
+ static_configs:
+ - targets: ['cadvisor:8080']
# PostgreSQL metrics
- job_name: 'postgres'
diff --git a/scripts/mac-mini/check-disk-space.sh b/scripts/mac-mini/check-disk-space.sh
new file mode 100755
index 000000000..5b805470a
--- /dev/null
+++ b/scripts/mac-mini/check-disk-space.sh
@@ -0,0 +1,233 @@
+#!/bin/bash
+# ManaCore Disk Space Monitor
+# Checks disk usage on system and data volumes
+# Alerts via Telegram/ntfy when thresholds are exceeded
+#
+# Thresholds:
+# - Warning: 80%
+# - Critical: 90%
+#
+# Run via LaunchD hourly
+
+set -e
+
+# Ensure PATH includes required tools
+PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"
+export PATH
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+LOG_FILE="/tmp/manacore-disk-check.log"
+
+# Usage percentages at which check_disk raises a warning / critical alert
+WARNING_THRESHOLD=80
+CRITICAL_THRESHOLD=90
+
+# Optional notification settings, read if the file is present
+if [[ -f "$PROJECT_ROOT/.env.notifications" ]]; then
+    . "$PROJECT_ROOT/.env.notifications"
+fi
+
+log() {
+ echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
+}
+
+send_notification() {
+ local message="$1"
+ local priority="${2:-default}"
+
+ # Telegram
+ if [ -n "$TELEGRAM_BOT_TOKEN" ] && [ -n "$TELEGRAM_CHAT_ID" ]; then
+ curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
+ -d "chat_id=${TELEGRAM_CHAT_ID}" \
+ -d "text=${message}" \
+ -d "parse_mode=HTML" \
+ >/dev/null 2>&1 || true
+ fi
+
+ # ntfy
+ if [ -n "$NTFY_TOPIC" ]; then
+ local ntfy_priority="default"
+ [ "$priority" = "high" ] && ntfy_priority="high"
+ [ "$priority" = "critical" ] && ntfy_priority="urgent"
+
+ curl -s -d "$message" \
+ -H "Title: ManaCore Disk Alert" \
+ -H "Priority: $ntfy_priority" \
+ -H "Tags: warning" \
+ "https://ntfy.sh/$NTFY_TOPIC" >/dev/null 2>&1 || true
+ fi
+}
+
+check_disk() {
+ local mount_point="$1"
+ local name="$2"
+
+ # Check if mount point exists
+ if [ ! -d "$mount_point" ]; then
+ log "WARNING: Mount point $mount_point does not exist"
+ return 1
+ fi
+
+ # Get disk usage percentage (macOS compatible)
+ local usage
+ usage=$(df -h "$mount_point" 2>/dev/null | awk 'NR==2 {gsub(/%/,""); print $5}')
+
+ if [ -z "$usage" ]; then
+ log "ERROR: Could not get disk usage for $mount_point"
+ return 1
+ fi
+
+ # Get available space
+ local available
+ available=$(df -h "$mount_point" 2>/dev/null | awk 'NR==2 {print $4}')
+
+ log "$name: ${usage}% used (${available} free)"
+
+ # Check thresholds
+ if [ "$usage" -ge "$CRITICAL_THRESHOLD" ]; then
+ log "CRITICAL: $name at ${usage}%!"
+ send_notification "đ¨ CRITICAL: Disk Space
+
+$name is at ${usage}%
+Available: ${available}
+
+Immediate action required!" "critical"
+ return 2
+ elif [ "$usage" -ge "$WARNING_THRESHOLD" ]; then
+ log "WARNING: $name at ${usage}%"
+ send_notification "â ī¸ WARNING: Disk Space
+
+$name is at ${usage}%
+Available: ${available}
+
+Consider cleaning up old files." "high"
+ return 1
+ fi
+
+ return 0
+}
+
+check_docker_disk() {
+ # Check Docker disk usage
+ if ! command -v docker &> /dev/null; then
+ log "Docker not found in PATH"
+ return 0
+ fi
+
+ if ! docker info >/dev/null 2>&1; then
+ log "Docker is not running"
+ return 0
+ fi
+
+ # Get Docker disk usage
+ local docker_usage
+ docker_usage=$(docker system df --format '{{.Size}}' 2>/dev/null | head -1)
+
+ log "Docker disk usage: $docker_usage"
+
+ # Check for dangling images and unused volumes
+ local dangling_images
+ dangling_images=$(docker images -f "dangling=true" -q 2>/dev/null | wc -l | tr -d ' ')
+
+ local unused_volumes
+ unused_volumes=$(docker volume ls -f "dangling=true" -q 2>/dev/null | wc -l | tr -d ' ')
+
+ if [ "$dangling_images" -gt 10 ] || [ "$unused_volumes" -gt 5 ]; then
+ log "Docker cleanup recommended: $dangling_images dangling images, $unused_volumes unused volumes"
+
+ # Auto-cleanup if critical
+ local system_usage
+ system_usage=$(df -h / 2>/dev/null | awk 'NR==2 {gsub(/%/,""); print $5}')
+
+ if [ "$system_usage" -ge "$CRITICAL_THRESHOLD" ]; then
+ log "Running docker system prune due to critical disk usage..."
+ docker system prune -f --volumes 2>/dev/null || true
+ log "Docker cleanup completed"
+ fi
+ fi
+}
+
+# Log the size of the PostgreSQL backup directory and how many daily dumps
+# are older than 30 days. Does nothing when the directory is absent.
+check_postgres_backups() {
+    local backup_dir="/Volumes/ManaData/backups/postgres"
+
+    [ -d "$backup_dir" ] || return 0
+
+    # Total size of the backup tree (first column of du -sh)
+    local backup_size
+    backup_size=$(du -sh "$backup_dir" 2>/dev/null | awk '{print $1}')
+    log "PostgreSQL backups: $backup_size"
+
+    # Daily dumps last modified more than 30 days ago
+    local old_backups
+    old_backups=$(find "$backup_dir/daily" -name "*.sql.gz" -mtime +30 2>/dev/null | wc -l | tr -d ' ')
+
+    if [ "$old_backups" -le 0 ]; then
+        return 0
+    fi
+    log "Note: $old_backups old daily backups could be cleaned up"
+}
+
+# Report running containers whose log file is larger than 100 MB.
+# NOTE(review): this function is not invoked by the main section visible in
+# this script - confirm whether it is meant to be called.
+check_docker_logs() {
+    # Check for large Docker log files
+    # NOTE(review): docker_logs_dir is assigned but never read below.
+    local docker_logs_dir="/var/lib/docker/containers"
+
+    # On macOS with Docker Desktop, logs are in the VM
+    # We can check via docker inspect instead
+    if ! docker info >/dev/null 2>&1; then
+        return 0
+    fi
+
+    # Get containers with largest log sizes
+    local large_logs=0
+    for container in $(docker ps -q 2>/dev/null); do
+        local log_size
+        # Resolve the container's LogPath, then stat it from a throwaway
+        # alpine container that mounts /var/lib/docker read-only.
+        # Falls back to "0" when the pipeline fails.
+        log_size=$(docker inspect "$container" --format '{{.LogPath}}' 2>/dev/null | xargs -I {} docker run --rm -v /var/lib/docker:/var/lib/docker:ro alpine stat -c%s {} 2>/dev/null || echo "0")
+
+        # Convert to MB (if size > 100MB, flag it)
+        # 104857600 bytes = 100 MiB; errors from a non-numeric size are discarded
+        if [ "$log_size" -gt 104857600 ] 2>/dev/null; then
+            local container_name
+            container_name=$(docker inspect "$container" --format '{{.Name}}' 2>/dev/null | tr -d '/')
+            log "Large log file: $container_name ($(($log_size / 1048576))MB)"
+            large_logs=$((large_logs + 1))
+        fi
+    done 2>/dev/null || true
+
+    if [ "$large_logs" -gt 0 ]; then
+        log "Found $large_logs containers with large log files"
+    fi
+}
+
+# Main execution
+log "=== ManaCore Disk Space Check ==="
+
+ALERT_STATUS=0
+
+# Check system disk
+check_disk "/" "System Disk" || ALERT_STATUS=$?
+
+# Check ManaData volume (external SSD)
+if [ -d "/Volumes/ManaData" ]; then
+ check_disk "/Volumes/ManaData" "ManaData SSD" || ALERT_STATUS=$?
+fi
+
+# Check Docker disk usage
+check_docker_disk
+
+# Check backup sizes
+check_postgres_backups
+
+# Summary
+log "=== Check Complete ==="
+
+if [ "$ALERT_STATUS" -eq 2 ]; then
+ log "Status: CRITICAL - Immediate action required"
+ exit 2
+elif [ "$ALERT_STATUS" -eq 1 ]; then
+ log "Status: WARNING - Attention needed"
+ exit 1
+else
+ log "Status: OK - All disks within thresholds"
+ exit 0
+fi
diff --git a/scripts/mac-mini/health-check.sh b/scripts/mac-mini/health-check.sh
index e6296d9c2..a4342df73 100755
--- a/scripts/mac-mini/health-check.sh
+++ b/scripts/mac-mini/health-check.sh
@@ -275,6 +275,12 @@ check_service "Grafana" "http://localhost:8000/api/health"
check_service "Umami" "http://localhost:8010/api/heartbeat"
check_service "VictoriaMetrics" "http://localhost:9090/health"
+echo ""
+echo "Alerting:"
+check_service "vmalert" "http://localhost:8880/health"
+check_service "Alertmanager" "http://localhost:9093/-/healthy"
+check_service "Alert Notifier" "http://localhost:9095/health"
+
echo ""
echo "Cloudflare Tunnel:"
if pgrep -x "cloudflared" >/dev/null; then
diff --git a/scripts/mac-mini/launchd/README.md b/scripts/mac-mini/launchd/README.md
index 57a4b26dc..b294a81eb 100644
--- a/scripts/mac-mini/launchd/README.md
+++ b/scripts/mac-mini/launchd/README.md
@@ -17,8 +17,11 @@ for f in *.plist; do launchctl load ~/Library/LaunchAgents/$f; done
| Service | Description | Interval |
|---------|-------------|----------|
| `docker-startup` | Starts Docker containers on boot | At login |
-| `ensure-containers` | Detects and restarts stuck containers | Every 5 min |
+| `ensure-containers` | Detects and restarts stuck/crash-looping containers | Every 5 min |
| `health-check` | Checks all services and sends alerts | Every 5 min |
+| `backup-databases` | PostgreSQL backup with daily/weekly rotation | Daily 3 AM |
+| `disk-check` | Monitors disk space, alerts on thresholds | Hourly |
+| `weekly-report` | Generates system health summary | Sunday 10 AM |
| `ssd-check` | Monitors SSD health | Periodic |
| `mana-stt` | Speech-to-text service (Whisper) | At login |
| `mana-tts` | Text-to-speech service (Kokoro) | At login |
diff --git a/scripts/mac-mini/launchd/com.manacore.disk-check.plist b/scripts/mac-mini/launchd/com.manacore.disk-check.plist
new file mode 100644
index 000000000..35f7d31a9
--- /dev/null
+++ b/scripts/mac-mini/launchd/com.manacore.disk-check.plist
@@ -0,0 +1,34 @@
+
+
+
+
+ Label
+ com.manacore.disk-check
+
+ ProgramArguments
+
+ /bin/bash
+ /Users/mana/projects/manacore-monorepo/scripts/mac-mini/check-disk-space.sh
+
+
+
+ StartInterval
+ 3600
+
+
+ RunAtLoad
+
+
+ StandardOutPath
+ /tmp/manacore-disk-check.log
+
+ StandardErrorPath
+ /tmp/manacore-disk-check.error.log
+
+ EnvironmentVariables
+
+ PATH
+ /usr/local/bin:/opt/homebrew/bin:/usr/bin:/bin
+
+
+
diff --git a/scripts/mac-mini/launchd/com.manacore.weekly-report.plist b/scripts/mac-mini/launchd/com.manacore.weekly-report.plist
new file mode 100644
index 000000000..c1abf0e1a
--- /dev/null
+++ b/scripts/mac-mini/launchd/com.manacore.weekly-report.plist
@@ -0,0 +1,37 @@
+
+
+
+
+ Label
+ com.manacore.weekly-report
+
+ ProgramArguments
+
+ /bin/bash
+ /Users/mana/projects/manacore-monorepo/scripts/mac-mini/weekly-report.sh
+
+
+
+ StartCalendarInterval
+
+ Weekday
+ 0
+ Hour
+ 10
+ Minute
+ 0
+
+
+ StandardOutPath
+ /tmp/manacore-weekly-report.log
+
+ StandardErrorPath
+ /tmp/manacore-weekly-report.error.log
+
+ EnvironmentVariables
+
+ PATH
+ /usr/local/bin:/opt/homebrew/bin:/usr/bin:/bin
+
+
+
diff --git a/scripts/mac-mini/weekly-report.sh b/scripts/mac-mini/weekly-report.sh
new file mode 100755
index 000000000..b2dcf3ae9
--- /dev/null
+++ b/scripts/mac-mini/weekly-report.sh
@@ -0,0 +1,309 @@
+#!/bin/bash
+# ManaCore Weekly Maintenance Report
+# Generates a comprehensive system health summary
+#
+# Includes:
+# - Backup status
+# - Disk usage
+# - Container health & restart counts
+# - Database statistics
+# - Error log summary
+#
+# Run via LaunchD every Sunday at 10:00 AM
+
+set -e
+
+# Ensure PATH includes required tools
+PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"
+export PATH
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+LOG_FILE="/tmp/manacore-weekly-report.log"
+REPORT_FILE="/tmp/manacore-weekly-report.txt"
+
+# Optional notification settings, read if the file is present
+if [[ -f "$PROJECT_ROOT/.env.notifications" ]]; then
+    . "$PROJECT_ROOT/.env.notifications"
+fi
+
+# Load env for database password
+if [[ -f "$PROJECT_ROOT/.env.macmini" ]]; then
+    . "$PROJECT_ROOT/.env.macmini"
+fi
+
+log() {
+ echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
+}
+
+send_notification() {
+ local message="$1"
+
+ # Telegram
+ if [ -n "$TELEGRAM_BOT_TOKEN" ] && [ -n "$TELEGRAM_CHAT_ID" ]; then
+ curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
+ -d "chat_id=${TELEGRAM_CHAT_ID}" \
+ -d "text=${message}" \
+ -d "parse_mode=HTML" \
+ >/dev/null 2>&1 || true
+ fi
+}
+
+# Initialize report
+init_report() {
+ cat > "$REPORT_FILE" << EOF
+đ ManaCore Weekly Report
+$(date '+%Y-%m-%d %H:%M')
+ââââââââââââââââââââââ
+
+EOF
+}
+
+# Add section to report
+add_section() {
+ local title="$1"
+ echo "" >> "$REPORT_FILE"
+ echo "$title" >> "$REPORT_FILE"
+}
+
+# Check backup status
+# Report section: number, age and size of the PostgreSQL backups.
+check_backups() {
+    add_section "💾 Backup Status"
+
+    local backup_dir="/Volumes/ManaData/backups/postgres"
+
+    if [ ! -d "$backup_dir" ]; then
+        echo "⚠️ Backup directory not found" >> "$REPORT_FILE"
+        return
+    fi
+
+    # Daily dumps modified within the last 7 days
+    local daily_count
+    daily_count=$(find "$backup_dir/daily" -name "*.sql.gz" -mtime -7 2>/dev/null | wc -l | tr -d ' ')
+
+    # One directory per weekly backup
+    local weekly_count
+    weekly_count=$(find "$backup_dir/weekly" -mindepth 1 -maxdepth 1 -type d 2>/dev/null | wc -l | tr -d ' ')
+
+    # Newest daily dump and its modification time (BSD stat first, GNU stat as fallback)
+    local latest_backup
+    latest_backup=$(ls -t "$backup_dir/daily"/*.sql.gz 2>/dev/null | head -1)
+    local latest_date=""
+    if [ -n "$latest_backup" ]; then
+        latest_date=$(stat -f "%Sm" -t "%Y-%m-%d %H:%M" "$latest_backup" 2>/dev/null || stat -c "%y" "$latest_backup" 2>/dev/null | cut -d'.' -f1)
+    fi
+
+    # Get total backup size
+    local total_size
+    total_size=$(du -sh "$backup_dir" 2>/dev/null | awk '{print $1}')
+
+    echo "Daily backups (7 days): $daily_count" >> "$REPORT_FILE"
+    echo "Weekly backups: $weekly_count" >> "$REPORT_FILE"
+    # Make a missing backup explicit instead of printing an empty value
+    echo "Latest: ${latest_date:-none found}" >> "$REPORT_FILE"
+    echo "Total size: $total_size" >> "$REPORT_FILE"
+
+    # Verify backup integrity (check if files are not empty)
+    local empty_backups
+    empty_backups=$(find "$backup_dir/daily" -name "*.sql.gz" -size 0 2>/dev/null | wc -l | tr -d ' ')
+    if [ "$empty_backups" -gt 0 ]; then
+        echo "⚠️ $empty_backups empty backup files found!" >> "$REPORT_FILE"
+    fi
+}
+
+# Check disk usage
+# Report section: usage of the system disk, the ManaData volume and Docker.
+check_disk_usage() {
+    add_section "💿 Disk Usage"
+
+    # System disk
+    local system_usage
+    system_usage=$(df -h / 2>/dev/null | awk 'NR==2 {print $5 " used (" $4 " free)"}')
+    echo "System: $system_usage" >> "$REPORT_FILE"
+
+    # ManaData SSD
+    if [ -d "/Volumes/ManaData" ]; then
+        local data_usage
+        data_usage=$(df -h "/Volumes/ManaData" 2>/dev/null | awk 'NR==2 {print $5 " used (" $4 " free)"}')
+        echo "ManaData: $data_usage" >> "$REPORT_FILE"
+    fi
+
+    # Docker disk usage
+    if docker info >/dev/null 2>&1; then
+        local docker_images
+        # Join the first three "Type: Size" rows with ", ".
+        # (tr maps one character to one character, so it cannot insert the space.)
+        docker_images=$(docker system df --format '{{.Type}}: {{.Size}}' 2>/dev/null | head -3 | awk 'NR>1 {printf ", "} {printf "%s", $0}')
+        echo "Docker: $docker_images" >> "$REPORT_FILE"
+    fi
+}
+
+# Check container health
+# Report section: container counts, unhealthy containers and restart counts.
+check_containers() {
+    add_section "🐳 Container Health"
+
+    if ! docker info >/dev/null 2>&1; then
+        echo "⚠️ Docker not running" >> "$REPORT_FILE"
+        return
+    fi
+
+    # Count containers by status
+    local running total healthy unhealthy
+    running=$(docker ps -q 2>/dev/null | wc -l | tr -d ' ')
+    total=$(docker ps -aq 2>/dev/null | wc -l | tr -d ' ')
+    healthy=$(docker ps --filter "health=healthy" -q 2>/dev/null | wc -l | tr -d ' ')
+    unhealthy=$(docker ps --filter "health=unhealthy" -q 2>/dev/null | wc -l | tr -d ' ')
+
+    echo "Running: $running / $total" >> "$REPORT_FILE"
+    echo "Healthy: $healthy" >> "$REPORT_FILE"
+
+    if [ "$unhealthy" -gt 0 ]; then
+        echo "⚠️ Unhealthy: $unhealthy" >> "$REPORT_FILE"
+        # List unhealthy containers
+        docker ps --filter "health=unhealthy" --format " - {{.Names}}" 2>/dev/null >> "$REPORT_FILE"
+    fi
+
+    # Restart counts of the mana-* containers, as reported by docker inspect
+    # (presumably counted since each container was created, not per week - verify)
+    echo "" >> "$REPORT_FILE"
+    echo "Top restarts:" >> "$REPORT_FILE"
+
+    # Process substitution keeps the loop in this shell so $listed survives it.
+    local listed=0 name restarts
+    while read -r name; do
+        restarts=$(docker inspect "$name" --format '{{.RestartCount}}' 2>/dev/null || echo "0")
+        if [ "${restarts:-0}" -gt 0 ] 2>/dev/null; then
+            echo "  $name: $restarts" >> "$REPORT_FILE"
+            listed=$((listed + 1))
+        fi
+    done < <(docker ps -a --format '{{.Names}}' 2>/dev/null | grep -E "mana-")
+
+    # "None" is decided over the same set that was listed above, so the
+    # heading is never left without any line below it.
+    if [ "$listed" -eq 0 ]; then
+        echo "  None (stable week!)" >> "$REPORT_FILE"
+    fi
+}
+
+# Check database health
+# Report section: sizes of the five largest databases and the number of
+# active connections, queried through the mana-infra-postgres container.
+check_database() {
+    add_section "🗄️ Database"
+
+    if ! docker ps --format '{{.Names}}' 2>/dev/null | grep -q "mana-infra-postgres"; then
+        echo "⚠️ PostgreSQL not running" >> "$REPORT_FILE"
+        return
+    fi
+
+    # Get database sizes
+    local db_sizes
+    db_sizes=$(docker exec mana-infra-postgres psql -U postgres -t -c "
+        SELECT datname, pg_size_pretty(pg_database_size(datname))
+        FROM pg_database
+        WHERE datistemplate = false AND datname != 'postgres'
+        ORDER BY pg_database_size(datname) DESC
+        LIMIT 5;
+    " 2>/dev/null | grep -v "^$" || echo "Could not fetch")
+
+    echo "Database sizes:" >> "$REPORT_FILE"
+    echo "$db_sizes" | while read -r line; do
+        # `if` rather than `[ ... ] &&`: a blank last line must not give the
+        # loop a non-zero status under `set -e`.
+        if [ -n "$line" ]; then
+            echo "  $line" >> "$REPORT_FILE"
+        fi
+    done
+
+    # Get total connection count
+    local connections
+    connections=$(docker exec mana-infra-postgres psql -U postgres -t -c "
+        SELECT count(*) FROM pg_stat_activity WHERE state = 'active';
+    " 2>/dev/null | tr -d '[:space:]')
+
+    # The pipeline's status is tr's, so a failed query shows up as an empty
+    # value; substitute the placeholder here instead of via `||`.
+    echo "Active connections: ${connections:-?}" >> "$REPORT_FILE"
+}
+
+# Check for errors in logs
+# Report section: running mana-* containers that logged more than 10
+# error-like lines during the last week.
+check_errors() {
+    add_section "⚠️ Recent Errors"
+
+    local error_count=0
+    local containers_with_errors=""
+
+    # Check each mana container for errors in the last 7 days (168h)
+    for container in $(docker ps --format '{{.Names}}' 2>/dev/null | grep "^mana-"); do
+        local errors
+        # grep -c already prints 0 when nothing matches (and exits 1), so the
+        # fallback must not print a second value.
+        errors=$(docker logs "$container" --since 168h 2>&1 | grep -ci "error\|exception\|fatal" || true)
+
+        if [ "${errors:-0}" -gt 10 ]; then
+            containers_with_errors="$containers_with_errors\n $container: $errors errors"
+            error_count=$((error_count + errors))
+        fi
+    done
+
+    if [ -z "$containers_with_errors" ]; then
+        echo "No significant errors in the last week" >> "$REPORT_FILE"
+    else
+        echo "Total errors: $error_count" >> "$REPORT_FILE"
+        echo -e "$containers_with_errors" >> "$REPORT_FILE"
+    fi
+}
+
+# Check uptime and system resources
+check_system() {
+ add_section "đĨī¸ System"
+
+ # System uptime
+ local uptime_str
+ uptime_str=$(uptime | sed 's/.*up //' | sed 's/,.*//')
+ echo "Uptime: $uptime_str" >> "$REPORT_FILE"
+
+ # Load average
+ local load
+ load=$(uptime | sed 's/.*load averages: //' | awk '{print $1 " " $2 " " $3}')
+ echo "Load: $load" >> "$REPORT_FILE"
+
+ # Memory (macOS specific)
+ if command -v vm_stat &> /dev/null; then
+ local pages_free
+ pages_free=$(vm_stat | grep "Pages free" | awk '{print $3}' | tr -d '.')
+ local pages_active
+ pages_active=$(vm_stat | grep "Pages active" | awk '{print $3}' | tr -d '.')
+ # Calculate rough memory usage (page size is 16384 on Apple Silicon, 4096 on Intel)
+ local page_size
+ page_size=$(pagesize 2>/dev/null || echo "16384")
+ local mem_free_gb
+ mem_free_gb=$(echo "scale=1; $pages_free * $page_size / 1024 / 1024 / 1024" | bc 2>/dev/null || echo "?")
+ echo "Memory free: ~${mem_free_gb}GB" >> "$REPORT_FILE"
+ fi
+}
+
+# Generate summary
+generate_summary() {
+ echo "" >> "$REPORT_FILE"
+ echo "ââââââââââââââââââââââ" >> "$REPORT_FILE"
+ echo "Generated by ManaCore" >> "$REPORT_FILE"
+}
+
+# Main execution
+log "=== Generating Weekly Report ==="
+
+init_report
+check_backups
+check_disk_usage
+check_containers
+check_database
+check_errors
+check_system
+generate_summary
+
+log "Report generated at $REPORT_FILE"
+
+# Send report via Telegram
+REPORT_CONTENT=$(cat "$REPORT_FILE")
+send_notification "$REPORT_CONTENT"
+
+log "Report sent via Telegram"
+log "=== Weekly Report Complete ==="