mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-14 19:01:08 +02:00
feat(monitoring): add alerting stack and maintenance scripts
Medium priority stability improvements: Alerting: - Add vmalert for evaluating Prometheus alert rules - Add alertmanager for alert routing and grouping - Add alert-notifier service for Telegram/ntfy notifications - Enable cadvisor scraping in prometheus config Disk Monitoring: - Add check-disk-space.sh for hourly disk monitoring - Alert on 80% (warning) and 90% (critical) thresholds - Auto-cleanup Docker when disk is critical - Add com.manacore.disk-check.plist for LaunchD Weekly Reports: - Add weekly-report.sh for system health summary - Includes: backup status, disk usage, container health, database stats, error log summary - Runs every Sunday at 10 AM via LaunchD Health Check Updates: - Add checks for vmalert, alertmanager, alert-notifier Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
02a5172c7c
commit
acc8de36ee
11 changed files with 996 additions and 10 deletions
|
|
@ -1431,6 +1431,11 @@ services:
|
|||
- /dev/disk/:/dev/disk:ro
|
||||
ports:
|
||||
- "9110:8080"
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:8080/healthz"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
postgres-exporter:
|
||||
image: prometheuscommunity/postgres-exporter:v0.15.0
|
||||
|
|
@ -1481,6 +1486,80 @@ services:
|
|||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
# ============================================
|
||||
# Alerting Stack (Ports 9093-9095)
|
||||
# ============================================
|
||||
|
||||
vmalert:
|
||||
image: victoriametrics/vmalert:v1.99.0
|
||||
container_name: mana-mon-vmalert
|
||||
restart: always
|
||||
depends_on:
|
||||
victoriametrics:
|
||||
condition: service_healthy
|
||||
alertmanager:
|
||||
condition: service_healthy
|
||||
command:
|
||||
- '-datasource.url=http://victoriametrics:9090'
|
||||
- '-notifier.url=http://alertmanager:9093'
|
||||
- '-remoteWrite.url=http://victoriametrics:9090'
|
||||
- '-remoteRead.url=http://victoriametrics:9090'
|
||||
- '-rule=/etc/alerts/*.yml'
|
||||
- '-evaluationInterval=30s'
|
||||
- '-httpListenAddr=:8880'
|
||||
volumes:
|
||||
- ./docker/prometheus/alerts.yml:/etc/alerts/alerts.yml:ro
|
||||
ports:
|
||||
- "8880:8880"
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:8880/health"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
alertmanager:
|
||||
image: prom/alertmanager:v0.27.0
|
||||
container_name: mana-mon-alertmanager
|
||||
restart: always
|
||||
depends_on:
|
||||
alert-notifier:
|
||||
condition: service_healthy
|
||||
command:
|
||||
- '--config.file=/etc/alertmanager/alertmanager.yml'
|
||||
- '--storage.path=/alertmanager'
|
||||
- '--web.listen-address=:9093'
|
||||
volumes:
|
||||
- ./docker/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
|
||||
- alertmanager_data:/alertmanager
|
||||
ports:
|
||||
- "9093:9093"
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:9093/-/healthy"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
alert-notifier:
|
||||
build:
|
||||
context: ./docker/alert-notifier
|
||||
dockerfile: Dockerfile
|
||||
image: alert-notifier:local
|
||||
container_name: mana-mon-alert-notifier
|
||||
restart: always
|
||||
environment:
|
||||
PORT: 8080
|
||||
TELEGRAM_BOT_TOKEN: ${TELEGRAM_BOT_TOKEN:-}
|
||||
TELEGRAM_CHAT_ID: ${TELEGRAM_CHAT_ID:-}
|
||||
NTFY_TOPIC: ${NTFY_TOPIC:-}
|
||||
ports:
|
||||
- "9095:8080"
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:8080/health"]
|
||||
interval: 30s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
start_period: 5s
|
||||
|
||||
# ============================================
|
||||
# Auto-Update (Watchtower)
|
||||
# ============================================
|
||||
|
|
@ -1508,6 +1587,8 @@ volumes:
|
|||
name: mana-redis-data
|
||||
victoriametrics_data:
|
||||
name: mana-victoria-data
|
||||
alertmanager_data:
|
||||
name: mana-alertmanager-data
|
||||
grafana_data:
|
||||
name: mana-grafana-data
|
||||
analytics_data:
|
||||
|
|
|
|||
17
docker/alert-notifier/Dockerfile
Normal file
17
docker/alert-notifier/Dockerfile
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
# Image for the alert-notifier webhook service.
# Single-file app using only the Python standard library, so no pip step.
FROM python:3.12-alpine

WORKDIR /app

COPY main.py .

# No dependencies needed - uses only stdlib
RUN chmod +x main.py

# Port served by main.py (webhook receiver + /health probe).
EXPOSE 8080

ENV PORT=8080

# NOTE: relies on wget being present in the base image (busybox wget on
# alpine); probes the app's own /health endpoint.
HEALTHCHECK --interval=30s --timeout=5s --start-period=5s --retries=3 \
  CMD wget --no-verbose --tries=1 --spider http://127.0.0.1:8080/health || exit 1

CMD ["python", "main.py"]
|
||||
204
docker/alert-notifier/main.py
Normal file
204
docker/alert-notifier/main.py
Normal file
|
|
@ -0,0 +1,204 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Alert Notifier - Webhook receiver for Alertmanager
|
||||
Forwards alerts to Telegram and ntfy
|
||||
|
||||
Environment Variables:
|
||||
TELEGRAM_BOT_TOKEN - Telegram bot token
|
||||
TELEGRAM_CHAT_ID - Telegram chat ID
|
||||
NTFY_TOPIC - ntfy.sh topic name (optional)
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import logging
|
||||
from http.server import HTTPServer, BaseHTTPRequestHandler
|
||||
import urllib.request
|
||||
import urllib.parse
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(levelname)s - %(message)s'
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
TELEGRAM_BOT_TOKEN = os.environ.get('TELEGRAM_BOT_TOKEN', '')
|
||||
TELEGRAM_CHAT_ID = os.environ.get('TELEGRAM_CHAT_ID', '')
|
||||
NTFY_TOPIC = os.environ.get('NTFY_TOPIC', '')
|
||||
|
||||
# Emoji prefix per alert severity; unknown severities fall back to 🔔.
SEVERITY_EMOJI = {
    'critical': '🚨',
    'warning': '⚠️',
    'info': 'ℹ️',
}

def format_alert_telegram(alert: dict, status: str) -> str:
    """Format a single Alertmanager alert as a Telegram HTML fragment.

    Args:
        alert: One entry from the webhook payload's "alerts" list; reads
            its "labels" and "annotations" sub-dicts.
        status: "firing" or "resolved" (callers pass the per-alert status,
            falling back to the group status).

    Returns:
        An HTML-formatted message fragment, newline-terminated.
    """
    import html  # stdlib; local import keeps this fix self-contained

    labels = alert.get('labels', {})
    annotations = alert.get('annotations', {})

    severity = labels.get('severity', 'unknown')
    emoji = SEVERITY_EMOJI.get(severity, '🔔')

    # Resolved alerts always get a green check, regardless of severity.
    if status == 'resolved':
        emoji = '✅'

    alertname = labels.get('alertname', 'Unknown')
    job = labels.get('job', '')
    summary = annotations.get('summary', alertname)
    description = annotations.get('description', '')

    # BUGFIX: this text is sent with parse_mode=HTML, so raw '<', '>' or
    # '&' coming from alert labels/annotations would make Telegram reject
    # the whole message. Escape user-controlled fields before embedding.
    msg = f"{emoji} <b>{status.upper()}: {html.escape(summary)}</b>\n"
    if job:
        msg += f"Service: <code>{html.escape(job)}</code>\n"
    if description:
        msg += f"{html.escape(description)}\n"

    return msg
|
||||
|
||||
def send_telegram(message: str) -> bool:
    """Deliver *message* to the configured Telegram chat.

    Returns True on HTTP 200; False when Telegram is not configured or
    the request fails for any reason (errors are logged, never raised).
    """
    if not (TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID):
        logger.warning("Telegram not configured")
        return False

    endpoint = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
    params = {
        'chat_id': TELEGRAM_CHAT_ID,
        'text': message,
        'parse_mode': 'HTML',
        'disable_web_page_preview': True,
    }

    try:
        request = urllib.request.Request(
            endpoint,
            data=urllib.parse.urlencode(params).encode(),
            headers={'Content-Type': 'application/x-www-form-urlencoded'},
        )
        with urllib.request.urlopen(request, timeout=10) as response:
            return response.status == 200
    except Exception as exc:
        logger.error(f"Telegram send failed: {exc}")
        return False
|
||||
|
||||
def send_ntfy(title: str, message: str, priority: str = 'default') -> bool:
    """Push *message* to the configured ntfy.sh topic.

    Maps alert severity names to ntfy priorities; returns True on HTTP
    200, False when unconfigured or on any error (logged, never raised).
    """
    if not NTFY_TOPIC:
        return False

    severity_to_priority = {
        'critical': 'urgent',
        'warning': 'high',
        'info': 'low',
    }
    headers = {
        'Title': title,
        'Priority': severity_to_priority.get(priority, 'default'),
        'Tags': 'warning' if priority == 'critical' else 'loudspeaker',
    }

    try:
        request = urllib.request.Request(
            f"https://ntfy.sh/{NTFY_TOPIC}",
            data=message.encode('utf-8'),
            headers=headers,
        )
        with urllib.request.urlopen(request, timeout=10) as response:
            return response.status == 200
    except Exception as exc:
        logger.error(f"ntfy send failed: {exc}")
        return False
|
||||
|
||||
class AlertHandler(BaseHTTPRequestHandler):
    """HTTP handler for the notifier service.

    POST /webhook accepts an Alertmanager webhook payload; GET /health is
    the liveness probe used by the container healthcheck.
    """

    def do_POST(self):
        """Handle the Alertmanager webhook; any other path returns 404.

        Replies 200 'OK' on success, 500 with the error text on failure.
        """
        if self.path != '/webhook':
            self.send_response(404)
            self.end_headers()
            return

        content_length = int(self.headers.get('Content-Length', 0))
        body = self.rfile.read(content_length)

        try:
            payload = json.loads(body)
            self.process_alerts(payload)
            self.send_response(200)
            self.end_headers()
            self.wfile.write(b'OK')
        except Exception as e:
            logger.error(f"Error processing webhook: {e}")
            self.send_response(500)
            self.end_headers()
            self.wfile.write(str(e).encode())

    def do_GET(self):
        """Serve the /health liveness endpoint; everything else is 404."""
        if self.path == '/health':
            self.send_response(200)
            self.end_headers()
            self.wfile.write(b'OK')
        else:
            self.send_response(404)
            self.end_headers()

    def process_alerts(self, payload: dict):
        """Process Alertmanager webhook payload.

        Formats every alert into one combined message, tracks the highest
        severity seen, then fans out to Telegram and/or ntfy depending on
        which are configured.
        """
        status = payload.get('status', 'unknown')
        alerts = payload.get('alerts', [])

        if not alerts:
            return

        logger.info(f"Received {len(alerts)} alerts with status: {status}")

        # Build message
        messages = []
        highest_severity = 'info'

        for alert in alerts:
            # Per-alert status wins over the group-level status.
            msg = format_alert_telegram(alert, alert.get('status', status))
            messages.append(msg)

            # Escalate: critical > warning > info.
            severity = alert.get('labels', {}).get('severity', 'info')
            if severity == 'critical':
                highest_severity = 'critical'
            elif severity == 'warning' and highest_severity != 'critical':
                highest_severity = 'warning'

        combined_message = '\n'.join(messages)

        # Send notifications
        if TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID:
            success = send_telegram(combined_message)
            logger.info(f"Telegram: {'sent' if success else 'failed'}")

        if NTFY_TOPIC:
            title = f"ManaCore Alert ({len(alerts)} alerts)"
            # Strip HTML for ntfy
            plain_message = combined_message.replace('<b>', '').replace('</b>', '')
            plain_message = plain_message.replace('<code>', '').replace('</code>', '')
            success = send_ntfy(title, plain_message, highest_severity)
            logger.info(f"ntfy: {'sent' if success else 'failed'}")

    def log_message(self, format, *args):
        """Route BaseHTTPRequestHandler's access log through our logger."""
        logger.info(f"{self.client_address[0]} - {format % args}")
|
||||
|
||||
def main():
    """Start the webhook HTTP server and block until interrupted."""
    listen_port = int(os.environ.get('PORT', 8080))

    telegram_ready = bool(TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID)
    ntfy_ready = bool(NTFY_TOPIC)
    logger.info(f"Starting Alert Notifier on port {listen_port}")
    logger.info(f"Telegram configured: {telegram_ready}")
    logger.info(f"ntfy configured: {ntfy_ready}")

    httpd = HTTPServer(('0.0.0.0', listen_port), AlertHandler)

    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        logger.info("Shutting down")
        httpd.shutdown()

if __name__ == '__main__':
    main()
|
||||
62
docker/alertmanager/alertmanager.yml
Normal file
62
docker/alertmanager/alertmanager.yml
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
# Alertmanager Configuration for ManaCore
|
||||
# Sends alerts via webhook to custom notification handler
|
||||
|
||||
global:
|
||||
resolve_timeout: 5m
|
||||
|
||||
route:
|
||||
# Default receiver for all alerts
|
||||
receiver: 'webhook'
|
||||
# Group alerts by severity and service
|
||||
group_by: ['alertname', 'severity', 'job']
|
||||
# Wait before sending first notification
|
||||
group_wait: 30s
|
||||
# Wait before sending follow-up notifications for same group
|
||||
group_interval: 5m
|
||||
# Minimum wait before re-sending a notification for an alert that is still firing
|
||||
repeat_interval: 4h
|
||||
|
||||
routes:
|
||||
# Critical alerts - immediate notification
|
||||
- match:
|
||||
severity: critical
|
||||
receiver: 'webhook'
|
||||
group_wait: 10s
|
||||
repeat_interval: 1h
|
||||
|
||||
# Warning alerts - less frequent
|
||||
- match:
|
||||
severity: warning
|
||||
receiver: 'webhook'
|
||||
group_wait: 1m
|
||||
repeat_interval: 6h
|
||||
|
||||
# Info alerts - batch together, notify at most once per day
|
||||
- match:
|
||||
severity: info
|
||||
receiver: 'webhook'
|
||||
group_wait: 5m
|
||||
repeat_interval: 24h
|
||||
|
||||
receivers:
|
||||
- name: 'webhook'
|
||||
webhook_configs:
|
||||
- url: 'http://alert-notifier:8080/webhook'
|
||||
send_resolved: true
|
||||
max_alerts: 10
|
||||
|
||||
# Inhibition rules - prevent redundant alerts
|
||||
inhibit_rules:
|
||||
# Don't alert on warnings if critical is firing for same service
|
||||
- source_match:
|
||||
severity: 'critical'
|
||||
target_match:
|
||||
severity: 'warning'
|
||||
equal: ['alertname', 'job']
|
||||
|
||||
# Don't alert on service-specific issues if PostgreSQL is down
|
||||
- source_match:
|
||||
alertname: 'PostgreSQLDown'
|
||||
target_match_re:
|
||||
alertname: '.*(Backend|Service).*'
|
||||
equal: []
|
||||
|
|
@ -9,11 +9,11 @@ global:
|
|||
rule_files:
|
||||
- /etc/prometheus/alerts.yml
|
||||
|
||||
# Alertmanager configuration (optional, for future use)
|
||||
# alerting:
|
||||
# alertmanagers:
|
||||
# - static_configs:
|
||||
# - targets: []
|
||||
# Alertmanager configuration
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets: ['alertmanager:9093']
|
||||
|
||||
scrape_configs:
|
||||
# Prometheus self-monitoring
|
||||
|
|
@ -30,10 +30,10 @@ scrape_configs:
|
|||
target_label: instance
|
||||
replacement: 'mac-mini'
|
||||
|
||||
# Docker container metrics via cAdvisor (disabled - container not deployed)
|
||||
# - job_name: 'cadvisor'
|
||||
# static_configs:
|
||||
# - targets: ['cadvisor:8080']
|
||||
# Docker container metrics via cAdvisor
|
||||
- job_name: 'cadvisor'
|
||||
static_configs:
|
||||
- targets: ['cadvisor:8080']
|
||||
|
||||
# PostgreSQL metrics
|
||||
- job_name: 'postgres'
|
||||
|
|
|
|||
233
scripts/mac-mini/check-disk-space.sh
Executable file
233
scripts/mac-mini/check-disk-space.sh
Executable file
|
|
@ -0,0 +1,233 @@
|
|||
#!/bin/bash
|
||||
# ManaCore Disk Space Monitor
|
||||
# Checks disk usage on system and data volumes
|
||||
# Alerts via Telegram/ntfy when thresholds are exceeded
|
||||
#
|
||||
# Thresholds:
|
||||
# - Warning: 80%
|
||||
# - Critical: 90%
|
||||
#
|
||||
# Run via LaunchD hourly
|
||||
|
||||
set -e
|
||||
|
||||
# Ensure PATH includes required tools
|
||||
export PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
|
||||
LOG_FILE="/tmp/manacore-disk-check.log"
|
||||
|
||||
# Thresholds
|
||||
WARNING_THRESHOLD=80
|
||||
CRITICAL_THRESHOLD=90
|
||||
|
||||
# Load notification config if exists
|
||||
if [ -f "$PROJECT_ROOT/.env.notifications" ]; then
|
||||
source "$PROJECT_ROOT/.env.notifications"
|
||||
fi
|
||||
|
||||
# Append a timestamped line to $LOG_FILE and echo it to stdout.
log() {
    local stamp
    stamp=$(date '+%Y-%m-%d %H:%M:%S')
    printf '[%s] %s\n' "$stamp" "$1" | tee -a "$LOG_FILE"
}
|
||||
|
||||
# Send an alert via Telegram and/or ntfy; silently no-ops when neither
# is configured. Delivery failures are ignored (best-effort).
#   $1 - message text (Telegram HTML allowed)
#   $2 - priority: default|high|critical (optional, default "default")
send_notification() {
    local msg="$1"
    local prio="${2:-default}"

    # Telegram
    if [ -n "$TELEGRAM_BOT_TOKEN" ] && [ -n "$TELEGRAM_CHAT_ID" ]; then
        curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
            -d "chat_id=${TELEGRAM_CHAT_ID}" \
            -d "text=${msg}" \
            -d "parse_mode=HTML" \
            >/dev/null 2>&1 || true
    fi

    # ntfy
    if [ -n "$NTFY_TOPIC" ]; then
        local ntfy_prio
        case "$prio" in
            critical) ntfy_prio="urgent" ;;
            high)     ntfy_prio="high" ;;
            *)        ntfy_prio="default" ;;
        esac

        curl -s -d "$msg" \
            -H "Title: ManaCore Disk Alert" \
            -H "Priority: $ntfy_prio" \
            -H "Tags: warning" \
            "https://ntfy.sh/$NTFY_TOPIC" >/dev/null 2>&1 || true
    fi
}
|
||||
|
||||
# Check one mount point against the warning/critical thresholds.
#   $1 - mount point path
#   $2 - human-readable name used in log lines and alerts
# Returns: 0 = OK, 1 = warning (or mount missing / unreadable),
#          2 = critical. Sends a notification on threshold breach.
check_disk() {
    local mount_point="$1"
    local name="$2"

    # Check if mount point exists
    if [ ! -d "$mount_point" ]; then
        log "WARNING: Mount point $mount_point does not exist"
        return 1
    fi

    # Get disk usage percentage (macOS compatible)
    # NOTE(review): $5 is the "Capacity" column of macOS `df -h`; on GNU
    # df the percentage is also column 5 ("Use%") — confirm if ported.
    local usage
    usage=$(df -h "$mount_point" 2>/dev/null | awk 'NR==2 {gsub(/%/,""); print $5}')

    if [ -z "$usage" ]; then
        log "ERROR: Could not get disk usage for $mount_point"
        return 1
    fi

    # Get available space
    local available
    available=$(df -h "$mount_point" 2>/dev/null | awk 'NR==2 {print $4}')

    log "$name: ${usage}% used (${available} free)"

    # Check thresholds
    if [ "$usage" -ge "$CRITICAL_THRESHOLD" ]; then
        log "CRITICAL: $name at ${usage}%!"
        send_notification "🚨 <b>CRITICAL: Disk Space</b>

<b>$name</b> is at <b>${usage}%</b>
Available: ${available}

Immediate action required!" "critical"
        return 2
    elif [ "$usage" -ge "$WARNING_THRESHOLD" ]; then
        log "WARNING: $name at ${usage}%"
        send_notification "⚠️ <b>WARNING: Disk Space</b>

<b>$name</b> is at <b>${usage}%</b>
Available: ${available}

Consider cleaning up old files." "high"
        return 1
    fi

    return 0
}
|
||||
|
||||
# Log Docker's disk usage and, when the system disk is critical, run an
# automatic `docker system prune`. Always returns 0 so the caller's
# overall status is unaffected.
check_docker_disk() {
    # Check Docker disk usage
    if ! command -v docker &> /dev/null; then
        log "Docker not found in PATH"
        return 0
    fi

    if ! docker info >/dev/null 2>&1; then
        log "Docker is not running"
        return 0
    fi

    # Get Docker disk usage
    # NOTE(review): `head -1` takes only the first row of
    # `docker system df` (images), not the total — confirm intent.
    local docker_usage
    docker_usage=$(docker system df --format '{{.Size}}' 2>/dev/null | head -1)

    log "Docker disk usage: $docker_usage"

    # Check for dangling images and unused volumes
    local dangling_images
    dangling_images=$(docker images -f "dangling=true" -q 2>/dev/null | wc -l | tr -d ' ')

    local unused_volumes
    unused_volumes=$(docker volume ls -f "dangling=true" -q 2>/dev/null | wc -l | tr -d ' ')

    if [ "$dangling_images" -gt 10 ] || [ "$unused_volumes" -gt 5 ]; then
        log "Docker cleanup recommended: $dangling_images dangling images, $unused_volumes unused volumes"

        # Auto-cleanup if critical
        local system_usage
        system_usage=$(df -h / 2>/dev/null | awk 'NR==2 {gsub(/%/,""); print $5}')

        if [ "$system_usage" -ge "$CRITICAL_THRESHOLD" ]; then
            # NOTE(review): --volumes deletes ALL unused volumes, which is
            # destructive; intentional for emergency disk recovery.
            log "Running docker system prune due to critical disk usage..."
            docker system prune -f --volumes 2>/dev/null || true
            log "Docker cleanup completed"
        fi
    fi
}
|
||||
|
||||
# Report the total size of PostgreSQL backups and note daily dumps older
# than 30 days. No-op (returns 0) when the backup volume is not mounted.
check_postgres_backups() {
    local backup_dir="/Volumes/ManaData/backups/postgres"

    [ -d "$backup_dir" ] || return 0

    # Total on-disk size of the backup tree.
    local backup_size
    backup_size=$(du -sh "$backup_dir" 2>/dev/null | awk '{print $1}')

    log "PostgreSQL backups: $backup_size"

    # Daily dumps older than 30 days are retention candidates.
    local old_backups
    old_backups=$(find "$backup_dir/daily" -name "*.sql.gz" -mtime +30 2>/dev/null | wc -l | tr -d ' ')

    if [ "$old_backups" -gt 0 ]; then
        log "Note: $old_backups old daily backups could be cleaned up"
    fi
}
|
||||
|
||||
# Flag running containers whose JSON log file exceeds 100 MB.
# NOTE(review): this function is defined but not invoked from the main
# section below — presumably kept for manual use; confirm.
check_docker_logs() {
    # Check for large Docker log files
    local docker_logs_dir="/var/lib/docker/containers"   # unused — kept for reference

    # On macOS with Docker Desktop, logs are in the VM
    # We can check via docker inspect instead
    if ! docker info >/dev/null 2>&1; then
        return 0
    fi

    # Get containers with largest log sizes
    local large_logs=0
    for container in $(docker ps -q 2>/dev/null); do
        local log_size
        # NOTE(review): spawns a throwaway alpine container per running
        # container to stat the log inside the Docker Desktop VM —
        # heavyweight, and silently yields "0" on any failure.
        log_size=$(docker inspect "$container" --format '{{.LogPath}}' 2>/dev/null | xargs -I {} docker run --rm -v /var/lib/docker:/var/lib/docker:ro alpine stat -c%s {} 2>/dev/null || echo "0")

        # Convert to MB (if size > 100MB, flag it)
        if [ "$log_size" -gt 104857600 ] 2>/dev/null; then
            local container_name
            container_name=$(docker inspect "$container" --format '{{.Name}}' 2>/dev/null | tr -d '/')
            log "Large log file: $container_name ($(($log_size / 1048576))MB)"
            large_logs=$((large_logs + 1))
        fi
    done 2>/dev/null || true

    if [ "$large_logs" -gt 0 ]; then
        log "Found $large_logs containers with large log files"
    fi
}
|
||||
|
||||
# Main execution
log "=== ManaCore Disk Space Check ==="

# Worst status seen so far: 0 = OK, 1 = warning, 2 = critical.
ALERT_STATUS=0

# Record a check's exit code, keeping the worst (highest) value seen.
# BUGFIX: the previous `|| ALERT_STATUS=$?` let a later warning (1)
# overwrite an earlier critical (2), downgrading the final exit status.
update_status() {
    local rv="$1"
    if [ "$rv" -gt "$ALERT_STATUS" ]; then
        ALERT_STATUS="$rv"
    fi
}

# Check system disk
check_disk "/" "System Disk" || update_status $?

# Check ManaData volume (external SSD)
if [ -d "/Volumes/ManaData" ]; then
    check_disk "/Volumes/ManaData" "ManaData SSD" || update_status $?
fi

# Check Docker disk usage
check_docker_disk

# Check backup sizes
check_postgres_backups

# Summary
log "=== Check Complete ==="

if [ "$ALERT_STATUS" -eq 2 ]; then
    log "Status: CRITICAL - Immediate action required"
    exit 2
elif [ "$ALERT_STATUS" -eq 1 ]; then
    log "Status: WARNING - Attention needed"
    exit 1
else
    log "Status: OK - All disks within thresholds"
    exit 0
fi
|
||||
|
|
@ -275,6 +275,12 @@ check_service "Grafana" "http://localhost:8000/api/health"
|
|||
check_service "Umami" "http://localhost:8010/api/heartbeat"
|
||||
check_service "VictoriaMetrics" "http://localhost:9090/health"
|
||||
|
||||
echo ""
|
||||
echo "Alerting:"
|
||||
check_service "vmalert" "http://localhost:8880/health"
|
||||
check_service "Alertmanager" "http://localhost:9093/-/healthy"
|
||||
check_service "Alert Notifier" "http://localhost:9095/health"
|
||||
|
||||
echo ""
|
||||
echo "Cloudflare Tunnel:"
|
||||
if pgrep -x "cloudflared" >/dev/null; then
|
||||
|
|
|
|||
|
|
@ -17,8 +17,11 @@ for f in *.plist; do launchctl load ~/Library/LaunchAgents/$f; done
|
|||
| Service | Description | Interval |
|
||||
|---------|-------------|----------|
|
||||
| `docker-startup` | Starts Docker containers on boot | At login |
|
||||
| `ensure-containers` | Detects and restarts stuck containers | Every 5 min |
|
||||
| `ensure-containers` | Detects and restarts stuck/crash-looping containers | Every 5 min |
|
||||
| `health-check` | Checks all services and sends alerts | Every 5 min |
|
||||
| `backup-databases` | PostgreSQL backup with daily/weekly rotation | Daily 3 AM |
|
||||
| `disk-check` | Monitors disk space, alerts on thresholds | Hourly |
|
||||
| `weekly-report` | Generates system health summary | Sunday 10 AM |
|
||||
| `ssd-check` | Monitors SSD health | Periodic |
|
||||
| `mana-stt` | Speech-to-text service (Whisper) | At login |
|
||||
| `mana-tts` | Text-to-speech service (Kokoro) | At login |
|
||||
|
|
|
|||
34
scripts/mac-mini/launchd/com.manacore.disk-check.plist
Normal file
34
scripts/mac-mini/launchd/com.manacore.disk-check.plist
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
||||
<plist version="1.0">
|
||||
<dict>
|
||||
<key>Label</key>
|
||||
<string>com.manacore.disk-check</string>
|
||||
|
||||
<key>ProgramArguments</key>
|
||||
<array>
|
||||
<string>/bin/bash</string>
|
||||
<string>/Users/mana/projects/manacore-monorepo/scripts/mac-mini/check-disk-space.sh</string>
|
||||
</array>
|
||||
|
||||
<!-- Run hourly -->
|
||||
<key>StartInterval</key>
|
||||
<integer>3600</integer>
|
||||
|
||||
<!-- Also run at startup -->
|
||||
<key>RunAtLoad</key>
|
||||
<true/>
|
||||
|
||||
<key>StandardOutPath</key>
|
||||
<string>/tmp/manacore-disk-check.log</string>
|
||||
|
||||
<key>StandardErrorPath</key>
|
||||
<string>/tmp/manacore-disk-check.error.log</string>
|
||||
|
||||
<key>EnvironmentVariables</key>
|
||||
<dict>
|
||||
<key>PATH</key>
|
||||
<string>/usr/local/bin:/opt/homebrew/bin:/usr/bin:/bin</string>
|
||||
</dict>
|
||||
</dict>
|
||||
</plist>
|
||||
37
scripts/mac-mini/launchd/com.manacore.weekly-report.plist
Normal file
37
scripts/mac-mini/launchd/com.manacore.weekly-report.plist
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
||||
<plist version="1.0">
|
||||
<dict>
|
||||
<key>Label</key>
|
||||
<string>com.manacore.weekly-report</string>
|
||||
|
||||
<key>ProgramArguments</key>
|
||||
<array>
|
||||
<string>/bin/bash</string>
|
||||
<string>/Users/mana/projects/manacore-monorepo/scripts/mac-mini/weekly-report.sh</string>
|
||||
</array>
|
||||
|
||||
<!-- Run every Sunday at 10:00 AM -->
|
||||
<key>StartCalendarInterval</key>
|
||||
<dict>
|
||||
<key>Weekday</key>
|
||||
<integer>0</integer>
|
||||
<key>Hour</key>
|
||||
<integer>10</integer>
|
||||
<key>Minute</key>
|
||||
<integer>0</integer>
|
||||
</dict>
|
||||
|
||||
<key>StandardOutPath</key>
|
||||
<string>/tmp/manacore-weekly-report.log</string>
|
||||
|
||||
<key>StandardErrorPath</key>
|
||||
<string>/tmp/manacore-weekly-report.error.log</string>
|
||||
|
||||
<key>EnvironmentVariables</key>
|
||||
<dict>
|
||||
<key>PATH</key>
|
||||
<string>/usr/local/bin:/opt/homebrew/bin:/usr/bin:/bin</string>
|
||||
</dict>
|
||||
</dict>
|
||||
</plist>
|
||||
309
scripts/mac-mini/weekly-report.sh
Executable file
309
scripts/mac-mini/weekly-report.sh
Executable file
|
|
@ -0,0 +1,309 @@
|
|||
#!/bin/bash
# ManaCore Weekly Maintenance Report
#
# Produces a system health summary covering:
#   - backup status
#   - disk usage
#   - container health & restart counts
#   - database statistics
#   - error-log summary
# and delivers it via Telegram.
#
# Scheduled by LaunchD every Sunday at 10:00 AM
# (com.manacore.weekly-report.plist).

set -e

# LaunchD runs with a minimal environment; make sure Homebrew and
# /usr/local tools (docker, curl, bc, ...) are reachable.
export PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
LOG_FILE="/tmp/manacore-weekly-report.log"
REPORT_FILE="/tmp/manacore-weekly-report.txt"

# Optional notification settings (e.g. TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID).
if [ -f "$PROJECT_ROOT/.env.notifications" ]; then
    source "$PROJECT_ROOT/.env.notifications"
fi

# Optional environment file carrying the database password.
if [ -f "$PROJECT_ROOT/.env.macmini" ]; then
    source "$PROJECT_ROOT/.env.macmini"
fi
# Append a timestamped message to $LOG_FILE and echo it to stdout.
log() {
    local stamp
    stamp=$(date '+%Y-%m-%d %H:%M:%S')
    echo "[$stamp] $1" | tee -a "$LOG_FILE"
}
# Send a message via the Telegram Bot API.
# No-op when TELEGRAM_BOT_TOKEN / TELEGRAM_CHAT_ID are not configured;
# curl failures are swallowed so a flaky network never fails the report run.
send_notification() {
    local message="$1"

    if [ -z "$TELEGRAM_BOT_TOKEN" ] || [ -z "$TELEGRAM_CHAT_ID" ]; then
        return 0
    fi

    curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
        -d "chat_id=${TELEGRAM_CHAT_ID}" \
        -d "text=${message}" \
        -d "parse_mode=HTML" \
        >/dev/null 2>&1 || true
}
# Start a fresh report file with the header banner, timestamp,
# separator, and a trailing blank line.
init_report() {
    {
        echo "📊 <b>ManaCore Weekly Report</b>"
        echo "$(date '+%Y-%m-%d %H:%M')"
        echo "━━━━━━━━━━━━━━━━━━━━━━"
        echo ""
    } > "$REPORT_FILE"
}
# Append a bold section heading to the report, preceded by a blank line.
add_section() {
    local title="$1"
    printf '\n<b>%s</b>\n' "$title" >> "$REPORT_FILE"
}
# Summarize PostgreSQL backup state: daily/weekly counts, timestamp of the
# newest dump, total size on disk, and a warning for zero-byte dump files.
check_backups() {
    add_section "💾 Backup Status"

    local backup_dir="/Volumes/ManaData/backups/postgres"

    if [ ! -d "$backup_dir" ]; then
        echo "⚠️ Backup directory not found" >> "$REPORT_FILE"
        return
    fi

    # Daily dumps produced in the past week, and weekly snapshot dirs.
    local daily_count weekly_count
    daily_count=$(find "$backup_dir/daily" -name "*.sql.gz" -mtime -7 2>/dev/null | wc -l | tr -d ' ')
    weekly_count=$(find "$backup_dir/weekly" -mindepth 1 -maxdepth 1 -type d 2>/dev/null | wc -l | tr -d ' ')

    # Most recent daily dump; BSD stat first, GNU stat as fallback.
    local latest_backup latest_date=""
    latest_backup=$(ls -t "$backup_dir/daily"/*.sql.gz 2>/dev/null | head -1)
    if [ -n "$latest_backup" ]; then
        latest_date=$(stat -f "%Sm" -t "%Y-%m-%d %H:%M" "$latest_backup" 2>/dev/null || stat -c "%y" "$latest_backup" 2>/dev/null | cut -d'.' -f1)
    fi

    local total_size
    total_size=$(du -sh "$backup_dir" 2>/dev/null | awk '{print $1}')

    {
        echo "Daily backups (7 days): $daily_count"
        echo "Weekly backups: $weekly_count"
        echo "Latest: $latest_date"
        echo "Total size: $total_size"
    } >> "$REPORT_FILE"

    # Flag zero-byte dump files (likely failed backup runs).
    local empty_backups
    empty_backups=$(find "$backup_dir/daily" -name "*.sql.gz" -size 0 2>/dev/null | wc -l | tr -d ' ')
    if [ "$empty_backups" -gt 0 ]; then
        echo "⚠️ $empty_backups empty backup files found!" >> "$REPORT_FILE"
    fi
}
# Report disk consumption: system volume, the ManaData SSD (when mounted),
# and Docker's own storage footprint.
check_disk_usage() {
    add_section "💿 Disk Usage"

    # Root filesystem: "NN% used (X free)".
    local root_line
    root_line=$(df -h / 2>/dev/null | awk 'NR==2 {print $5 " used (" $4 " free)"}')
    echo "System: $root_line" >> "$REPORT_FILE"

    # External data volume, only if present.
    if [ -d "/Volumes/ManaData" ]; then
        local data_line
        data_line=$(df -h "/Volumes/ManaData" 2>/dev/null | awk 'NR==2 {print $5 " used (" $4 " free)"}')
        echo "ManaData: $data_line" >> "$REPORT_FILE"
    fi

    # Docker disk summary — first three rows of `docker system df`,
    # joined onto one comma-separated line.
    if docker info >/dev/null 2>&1; then
        local docker_summary
        docker_summary=$(docker system df --format '{{.Type}}: {{.Size}}' 2>/dev/null | head -3 | tr '\n' ', ' | sed 's/,$//')
        echo "Docker: $docker_summary" >> "$REPORT_FILE"
    fi
}
# Summarize container health: running/healthy counts, names of unhealthy
# containers, and per-container restart counts for mana-* containers.
#
# NOTE: Docker's RestartCount is cumulative since container creation —
# not "restarts this week" — so "Top restarts" reflects lifetime restarts.
check_containers() {
    add_section "🐳 Container Health"

    if ! docker info >/dev/null 2>&1; then
        echo "⚠️ Docker not running" >> "$REPORT_FILE"
        return
    fi

    # Container counts by state/health.
    local running total healthy unhealthy
    running=$(docker ps -q 2>/dev/null | wc -l | tr -d ' ')
    total=$(docker ps -aq 2>/dev/null | wc -l | tr -d ' ')
    healthy=$(docker ps --filter "health=healthy" -q 2>/dev/null | wc -l | tr -d ' ')
    unhealthy=$(docker ps --filter "health=unhealthy" -q 2>/dev/null | wc -l | tr -d ' ')

    echo "Running: $running / $total" >> "$REPORT_FILE"
    echo "Healthy: $healthy" >> "$REPORT_FILE"

    if [ "$unhealthy" -gt 0 ]; then
        echo "⚠️ Unhealthy: $unhealthy" >> "$REPORT_FILE"
        # List the offending containers by name.
        docker ps --filter "health=unhealthy" --format " - {{.Names}}" 2>/dev/null >> "$REPORT_FILE"
    fi

    echo "" >> "$REPORT_FILE"
    echo "Top restarts:" >> "$REPORT_FILE"

    # Single pass over mana-* containers: list those with restarts and
    # tally the total.  The list is captured into a variable first because
    # a pipeline's while-loop runs in a subshell, where counter updates
    # would be lost.  (The original also counted restarts of non-mana
    # containers in its total, inconsistently with the listing — fixed.)
    local names
    names=$(docker ps -a --format '{{.Names}}' 2>/dev/null | grep -E "mana-" || true)

    local name restarts total_restarts=0
    while IFS= read -r name; do
        [ -n "$name" ] || continue
        restarts=$(docker inspect "$name" --format '{{.RestartCount}}' 2>/dev/null || echo "0")
        if [ "$restarts" -gt 0 ]; then
            echo "  $name: $restarts" >> "$REPORT_FILE"
            total_restarts=$((total_restarts + restarts))
        fi
    done <<< "$names"

    if [ "$total_restarts" -eq 0 ]; then
        echo "  None (stable week!)" >> "$REPORT_FILE"
    fi
}
# Report the five largest databases and the active connection count
# from the mana-infra-postgres container.
check_database() {
    add_section "🗄️ Database"

    if ! docker ps --format '{{.Names}}' 2>/dev/null | grep -q "mana-infra-postgres"; then
        echo "⚠️ PostgreSQL not running" >> "$REPORT_FILE"
        return
    fi

    # Five largest non-template databases, human-readable sizes.
    local db_sizes
    db_sizes=$(docker exec mana-infra-postgres psql -U postgres -t -c "
        SELECT datname, pg_size_pretty(pg_database_size(datname))
        FROM pg_database
        WHERE datistemplate = false AND datname != 'postgres'
        ORDER BY pg_database_size(datname) DESC
        LIMIT 5;
    " 2>/dev/null | grep -v "^$" || echo "Could not fetch")

    echo "Database sizes:" >> "$REPORT_FILE"
    while read line; do
        [ -n "$line" ] && echo "  $line" >> "$REPORT_FILE"
    done <<< "$db_sizes"

    # Currently active backend connections.
    local connections
    connections=$(docker exec mana-infra-postgres psql -U postgres -t -c "
        SELECT count(*) FROM pg_stat_activity WHERE state = 'active';
    " 2>/dev/null | tr -d ' ' || echo "?")

    echo "Active connections: $connections" >> "$REPORT_FILE"
}
# Scan each running mana-* container's logs for the past 7 days
# (--since 168h) and report containers with >10 error-like lines.
check_errors() {
    add_section "⚠️ Recent Errors"

    local error_count=0
    local containers_with_errors=""

    local container errors
    for container in $(docker ps --format '{{.Names}}' 2>/dev/null | grep "^mana-"); do
        # grep -c always prints a count (including 0), but exits non-zero
        # when nothing matched.  The original's `|| echo "0"` therefore
        # produced "0\n0" on no match, breaking the integer comparison
        # below — use `|| true` plus an empty-string fallback instead.
        # Also use -E: the BRE escape `\|` is not portable to BSD grep.
        errors=$(docker logs "$container" --since 168h 2>&1 | grep -ciE "error|exception|fatal" || true)
        errors=${errors:-0}

        if [ "$errors" -gt 10 ]; then
            containers_with_errors="$containers_with_errors\n  $container: $errors errors"
            error_count=$((error_count + errors))
        fi
    done

    if [ -z "$containers_with_errors" ]; then
        echo "No significant errors in the last week" >> "$REPORT_FILE"
    else
        echo "Total errors: $error_count" >> "$REPORT_FILE"
        echo -e "$containers_with_errors" >> "$REPORT_FILE"
    fi
}
# Report host uptime, load average, and (on macOS) approximate free memory.
check_system() {
    add_section "🖥️ System"

    # Uptime: the text between "up " and the first comma.
    local uptime_str
    uptime_str=$(uptime | sed 's/.*up //' | sed 's/,.*//')
    echo "Uptime: $uptime_str" >> "$REPORT_FILE"

    # The three load-average figures (macOS prints "load averages:").
    local load
    load=$(uptime | sed 's/.*load averages: //' | awk '{print $1 " " $2 " " $3}')
    echo "Load: $load" >> "$REPORT_FILE"

    # Rough free-memory estimate from vm_stat (macOS only).  Page size is
    # 16384 on Apple Silicon, 4096 on Intel; fall back to 16384 when the
    # pagesize tool is unavailable.
    if command -v vm_stat &> /dev/null; then
        local pages_free pages_active page_size mem_free_gb
        pages_free=$(vm_stat | grep "Pages free" | awk '{print $3}' | tr -d '.')
        # NOTE(review): pages_active is computed but not currently used.
        pages_active=$(vm_stat | grep "Pages active" | awk '{print $3}' | tr -d '.')
        page_size=$(pagesize 2>/dev/null || echo "16384")
        mem_free_gb=$(echo "scale=1; $pages_free * $page_size / 1024 / 1024 / 1024" | bc 2>/dev/null || echo "?")
        echo "Memory free: ~${mem_free_gb}GB" >> "$REPORT_FILE"
    fi
}
# Close the report with a separator and footer line.
generate_summary() {
    {
        echo ""
        echo "━━━━━━━━━━━━━━━━━━━━━━"
        echo "<i>Generated by ManaCore</i>"
    } >> "$REPORT_FILE"
}
# --- Main execution -------------------------------------------------------
log "=== Generating Weekly Report ==="

init_report
check_backups
check_disk_usage
check_containers
check_database
check_errors
check_system
generate_summary

log "Report generated at $REPORT_FILE"

# Deliver the full report text via Telegram (no-op when unconfigured).
REPORT_CONTENT=$(<"$REPORT_FILE")
send_notification "$REPORT_CONTENT"

log "Report sent via Telegram"
log "=== Weekly Report Complete ==="
Loading…
Add table
Add a link
Reference in a new issue