feat(monitoring): add alerting stack and maintenance scripts

Medium priority stability improvements:

Alerting:
- Add vmalert for evaluating Prometheus alert rules
- Add alertmanager for alert routing and grouping
- Add alert-notifier service for Telegram/ntfy notifications
- Enable cadvisor scraping in prometheus config

Disk Monitoring:
- Add check-disk-space.sh for hourly disk monitoring
- Alert on 80% (warning) and 90% (critical) thresholds
- Auto-cleanup Docker when disk is critical
- Add com.manacore.disk-check.plist for LaunchD

Weekly Reports:
- Add weekly-report.sh for system health summary
- Includes: backup status, disk usage, container health,
  database stats, error log summary
- Runs every Sunday at 10 AM via LaunchD

Health Check Updates:
- Add checks for vmalert, alertmanager, alert-notifier

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Till-JS 2026-02-12 13:46:57 +01:00
parent 02a5172c7c
commit acc8de36ee
11 changed files with 996 additions and 10 deletions

View file

@ -0,0 +1,17 @@
FROM python:3.12-alpine
WORKDIR /app
COPY main.py .
# No dependencies needed - uses only stdlib
RUN chmod +x main.py
EXPOSE 8080
ENV PORT=8080
HEALTHCHECK --interval=30s --timeout=5s --start-period=5s --retries=3 \
CMD wget --no-verbose --tries=1 --spider http://127.0.0.1:8080/health || exit 1
CMD ["python", "main.py"]

View file

@ -0,0 +1,204 @@
#!/usr/bin/env python3
"""
Alert Notifier - Webhook receiver for Alertmanager
Forwards alerts to Telegram and ntfy
Environment Variables:
TELEGRAM_BOT_TOKEN - Telegram bot token
TELEGRAM_CHAT_ID - Telegram chat ID
NTFY_TOPIC - ntfy.sh topic name (optional)
"""
import os
import json
import logging
from http.server import HTTPServer, BaseHTTPRequestHandler
import urllib.request
import urllib.parse
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
TELEGRAM_BOT_TOKEN = os.environ.get('TELEGRAM_BOT_TOKEN', '')
TELEGRAM_CHAT_ID = os.environ.get('TELEGRAM_CHAT_ID', '')
NTFY_TOPIC = os.environ.get('NTFY_TOPIC', '')
SEVERITY_EMOJI = {
'critical': '🚨',
'warning': '⚠️',
'info': '',
}
def format_alert_telegram(alert: dict, status: str) -> str:
"""Format a single alert for Telegram."""
labels = alert.get('labels', {})
annotations = alert.get('annotations', {})
severity = labels.get('severity', 'unknown')
emoji = SEVERITY_EMOJI.get(severity, '🔔')
if status == 'resolved':
emoji = ''
alertname = labels.get('alertname', 'Unknown')
job = labels.get('job', '')
summary = annotations.get('summary', alertname)
description = annotations.get('description', '')
msg = f"{emoji} <b>{status.upper()}: {summary}</b>\n"
if job:
msg += f"Service: <code>{job}</code>\n"
if description:
msg += f"{description}\n"
return msg
def send_telegram(message: str) -> bool:
"""Send message to Telegram."""
if not TELEGRAM_BOT_TOKEN or not TELEGRAM_CHAT_ID:
logger.warning("Telegram not configured")
return False
url = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
data = {
'chat_id': TELEGRAM_CHAT_ID,
'text': message,
'parse_mode': 'HTML',
'disable_web_page_preview': True
}
try:
req = urllib.request.Request(
url,
data=urllib.parse.urlencode(data).encode(),
headers={'Content-Type': 'application/x-www-form-urlencoded'}
)
with urllib.request.urlopen(req, timeout=10) as resp:
return resp.status == 200
except Exception as e:
logger.error(f"Telegram send failed: {e}")
return False
def send_ntfy(title: str, message: str, priority: str = 'default') -> bool:
"""Send message to ntfy."""
if not NTFY_TOPIC:
return False
url = f"https://ntfy.sh/{NTFY_TOPIC}"
priority_map = {
'critical': 'urgent',
'warning': 'high',
'info': 'low'
}
ntfy_priority = priority_map.get(priority, 'default')
try:
req = urllib.request.Request(
url,
data=message.encode('utf-8'),
headers={
'Title': title,
'Priority': ntfy_priority,
'Tags': 'warning' if priority == 'critical' else 'loudspeaker'
}
)
with urllib.request.urlopen(req, timeout=10) as resp:
return resp.status == 200
except Exception as e:
logger.error(f"ntfy send failed: {e}")
return False
class AlertHandler(BaseHTTPRequestHandler):
def do_POST(self):
if self.path != '/webhook':
self.send_response(404)
self.end_headers()
return
content_length = int(self.headers.get('Content-Length', 0))
body = self.rfile.read(content_length)
try:
payload = json.loads(body)
self.process_alerts(payload)
self.send_response(200)
self.end_headers()
self.wfile.write(b'OK')
except Exception as e:
logger.error(f"Error processing webhook: {e}")
self.send_response(500)
self.end_headers()
self.wfile.write(str(e).encode())
def do_GET(self):
if self.path == '/health':
self.send_response(200)
self.end_headers()
self.wfile.write(b'OK')
else:
self.send_response(404)
self.end_headers()
def process_alerts(self, payload: dict):
"""Process Alertmanager webhook payload."""
status = payload.get('status', 'unknown')
alerts = payload.get('alerts', [])
if not alerts:
return
logger.info(f"Received {len(alerts)} alerts with status: {status}")
# Build message
messages = []
highest_severity = 'info'
for alert in alerts:
msg = format_alert_telegram(alert, alert.get('status', status))
messages.append(msg)
severity = alert.get('labels', {}).get('severity', 'info')
if severity == 'critical':
highest_severity = 'critical'
elif severity == 'warning' and highest_severity != 'critical':
highest_severity = 'warning'
combined_message = '\n'.join(messages)
# Send notifications
if TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID:
success = send_telegram(combined_message)
logger.info(f"Telegram: {'sent' if success else 'failed'}")
if NTFY_TOPIC:
title = f"ManaCore Alert ({len(alerts)} alerts)"
# Strip HTML for ntfy
plain_message = combined_message.replace('<b>', '').replace('</b>', '')
plain_message = plain_message.replace('<code>', '').replace('</code>', '')
success = send_ntfy(title, plain_message, highest_severity)
logger.info(f"ntfy: {'sent' if success else 'failed'}")
def log_message(self, format, *args):
logger.info(f"{self.client_address[0]} - {format % args}")
def main():
port = int(os.environ.get('PORT', 8080))
logger.info(f"Starting Alert Notifier on port {port}")
logger.info(f"Telegram configured: {bool(TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID)}")
logger.info(f"ntfy configured: {bool(NTFY_TOPIC)}")
server = HTTPServer(('0.0.0.0', port), AlertHandler)
try:
server.serve_forever()
except KeyboardInterrupt:
logger.info("Shutting down")
server.shutdown()
if __name__ == '__main__':
main()

View file

@ -0,0 +1,62 @@
# Alertmanager Configuration for ManaCore
# Sends alerts via webhook to custom notification handler
global:
resolve_timeout: 5m
route:
# Default receiver for all alerts
receiver: 'webhook'
# Group alerts by severity and service
group_by: ['alertname', 'severity', 'job']
# Wait before sending first notification
group_wait: 30s
# Wait before sending follow-up notifications for same group
group_interval: 5m
# Wait before re-sending resolved alerts
repeat_interval: 4h
routes:
# Critical alerts - immediate notification
- match:
severity: critical
receiver: 'webhook'
group_wait: 10s
repeat_interval: 1h
# Warning alerts - less frequent
- match:
severity: warning
receiver: 'webhook'
group_wait: 1m
repeat_interval: 6h
# Info alerts - only during business hours, batch together
- match:
severity: info
receiver: 'webhook'
group_wait: 5m
repeat_interval: 24h
receivers:
- name: 'webhook'
webhook_configs:
- url: 'http://alert-notifier:8080/webhook'
send_resolved: true
max_alerts: 10
# Inhibition rules - prevent redundant alerts
inhibit_rules:
# Don't alert on warnings if critical is firing for same service
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'job']
# Don't alert on service-specific issues if PostgreSQL is down
- source_match:
alertname: 'PostgreSQLDown'
target_match_re:
alertname: '.*(Backend|Service).*'
equal: []

View file

@ -9,11 +9,11 @@ global:
rule_files:
- /etc/prometheus/alerts.yml
# Alertmanager configuration (optional, for future use)
# alerting:
# alertmanagers:
# - static_configs:
# - targets: []
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets: ['alertmanager:9093']
scrape_configs:
# Prometheus self-monitoring
@ -30,10 +30,10 @@ scrape_configs:
target_label: instance
replacement: 'mac-mini'
# Docker container metrics via cAdvisor (disabled - container not deployed)
# - job_name: 'cadvisor'
# static_configs:
# - targets: ['cadvisor:8080']
# Docker container metrics via cAdvisor
- job_name: 'cadvisor'
static_configs:
- targets: ['cadvisor:8080']
# PostgreSQL metrics
- job_name: 'postgres'