mirror of
https://github.com/Memo-2023/mana-monorepo.git
synced 2026-05-19 13:33:35 +02:00
feat(monitoring): add alerting stack and maintenance scripts
Medium priority stability improvements: Alerting: - Add vmalert for evaluating Prometheus alert rules - Add alertmanager for alert routing and grouping - Add alert-notifier service for Telegram/ntfy notifications - Enable cadvisor scraping in prometheus config Disk Monitoring: - Add check-disk-space.sh for hourly disk monitoring - Alert on 80% (warning) and 90% (critical) thresholds - Auto-cleanup Docker when disk is critical - Add com.manacore.disk-check.plist for LaunchD Weekly Reports: - Add weekly-report.sh for system health summary - Includes: backup status, disk usage, container health, database stats, error log summary - Runs every Sunday at 10 AM via LaunchD Health Check Updates: - Add checks for vmalert, alertmanager, alert-notifier Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
02a5172c7c
commit
acc8de36ee
11 changed files with 996 additions and 10 deletions
17
docker/alert-notifier/Dockerfile
Normal file
17
docker/alert-notifier/Dockerfile
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
# Minimal image for the alert-notifier webhook service.
FROM python:3.12-alpine

WORKDIR /app

COPY main.py .

# No dependencies needed - uses only stdlib
RUN chmod +x main.py

EXPOSE 8080

# Port the server listens on; main.py reads this variable at startup.
ENV PORT=8080

# The service needs no privileges and binds an unprivileged port,
# so do not run it as root.
USER nobody

# Shell-form CMD so ${PORT} is expanded at run time: the probe keeps working
# when PORT is overridden instead of always hitting 8080.
HEALTHCHECK --interval=30s --timeout=5s --start-period=5s --retries=3 \
    CMD wget --no-verbose --tries=1 --spider "http://127.0.0.1:${PORT}/health" || exit 1

CMD ["python", "main.py"]
|
||||
204
docker/alert-notifier/main.py
Normal file
204
docker/alert-notifier/main.py
Normal file
|
|
@ -0,0 +1,204 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Alert Notifier - Webhook receiver for Alertmanager
|
||||
Forwards alerts to Telegram and ntfy
|
||||
|
||||
Environment Variables:
|
||||
TELEGRAM_BOT_TOKEN - Telegram bot token
|
||||
TELEGRAM_CHAT_ID - Telegram chat ID
|
||||
NTFY_TOPIC - ntfy.sh topic name (optional)
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import logging
|
||||
from http.server import HTTPServer, BaseHTTPRequestHandler
|
||||
import urllib.request
|
||||
import urllib.parse
|
||||
|
||||
# Configure root logging at INFO level with timestamped lines.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Notification targets, read once at import time. An empty string means the
# corresponding channel is treated as not configured.
TELEGRAM_BOT_TOKEN = os.environ.get('TELEGRAM_BOT_TOKEN', '')
TELEGRAM_CHAT_ID = os.environ.get('TELEGRAM_CHAT_ID', '')
NTFY_TOPIC = os.environ.get('NTFY_TOPIC', '')

# Emoji prefix keyed by the alert's `severity` label.
SEVERITY_EMOJI = {
    'critical': '🚨',
    'warning': '⚠️',
    'info': 'ℹ️',
}
|
||||
|
||||
def format_alert_telegram(alert: dict, status: str) -> str:
    """Format a single alert as a Telegram HTML message fragment.

    Args:
        alert: One entry of the webhook payload's ``alerts`` list. Its
            ``labels`` (``severity``, ``alertname``, ``job``) and
            ``annotations`` (``summary``, ``description``) are read; all of
            them are optional.
        status: Alert status such as ``'firing'`` or ``'resolved'``.

    Returns:
        A newline-terminated message: a bold headline, followed by an
        optional ``Service:`` line and an optional description line.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    def _esc(value) -> str:
        # The message is sent with parse_mode=HTML, so any literal '<', '>'
        # or '&' coming from alert text must be escaped; otherwise Telegram
        # refuses to parse the message and the notification is lost.
        return escape(str(value), quote=False)

    labels = alert.get('labels') or {}
    annotations = alert.get('annotations') or {}

    # A resolved alert always gets the check mark, whatever its severity.
    if status == 'resolved':
        emoji = '✅'
    else:
        emoji = SEVERITY_EMOJI.get(labels.get('severity', 'unknown'), '🔔')

    alertname = labels.get('alertname', 'Unknown')
    job = labels.get('job', '')
    summary = annotations.get('summary', alertname)
    description = annotations.get('description', '')

    lines = [f"{emoji} <b>{_esc(status.upper())}: {_esc(summary)}</b>"]
    if job:
        lines.append(f"Service: <code>{_esc(job)}</code>")
    if description:
        lines.append(_esc(description))

    return '\n'.join(lines) + '\n'
|
||||
|
||||
def send_telegram(message: str) -> bool:
    """Post ``message`` to the configured Telegram chat.

    The text is sent with ``parse_mode=HTML`` and link previews disabled.

    Returns:
        True when the Telegram API answers with HTTP 200. False when the
        bot token or chat id is missing, or when the request fails for any
        reason (the failure is logged).
    """
    if not (TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID):
        logger.warning("Telegram not configured")
        return False

    endpoint = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
    fields = {
        'chat_id': TELEGRAM_CHAT_ID,
        'text': message,
        'parse_mode': 'HTML',
        'disable_web_page_preview': True
    }

    try:
        form_body = urllib.parse.urlencode(fields).encode()
        request = urllib.request.Request(
            endpoint,
            data=form_body,
            headers={'Content-Type': 'application/x-www-form-urlencoded'}
        )
        with urllib.request.urlopen(request, timeout=10) as response:
            delivered = response.status == 200
        return delivered
    except Exception as e:
        # Best effort: a failed notification must not crash the webhook.
        logger.error(f"Telegram send failed: {e}")
        return False
|
||||
|
||||
def send_ntfy(title: str, message: str, priority: str = 'default') -> bool:
    """Publish ``message`` to the configured ntfy.sh topic.

    Args:
        title: Notification title, sent as the ``Title`` header.
        message: Plain-text body, sent UTF-8 encoded.
        priority: Alert severity (``'critical'``, ``'warning'`` or
            ``'info'``); any other value maps to ntfy's ``'default'``.

    Returns:
        True when ntfy answers with HTTP 200. False when no topic is
        configured or the request fails (the failure is logged).
    """
    if not NTFY_TOPIC:
        return False

    # Translate alert severity into an ntfy priority name.
    severity_to_priority = {
        'critical': 'urgent',
        'warning': 'high',
        'info': 'low'
    }
    ntfy_priority = severity_to_priority.get(priority, 'default')

    if priority == 'critical':
        tags = 'warning'
    else:
        tags = 'loudspeaker'

    try:
        request = urllib.request.Request(
            f"https://ntfy.sh/{NTFY_TOPIC}",
            data=message.encode('utf-8'),
            headers={
                'Title': title,
                'Priority': ntfy_priority,
                'Tags': tags
            }
        )
        with urllib.request.urlopen(request, timeout=10) as response:
            delivered = response.status == 200
        return delivered
    except Exception as e:
        # Best effort: a failed notification must not crash the webhook.
        logger.error(f"ntfy send failed: {e}")
        return False
|
||||
|
||||
class AlertHandler(BaseHTTPRequestHandler):
    """HTTP request handler for the Alertmanager webhook.

    Routes:
        POST /webhook - accept a webhook payload and forward its alerts to
            the configured notification channels.
        GET /health   - liveness probe, answers ``200 OK``.

    Every other path is answered with 404.
    """

    def do_POST(self):
        """Handle ``POST /webhook``.

        Responds 400 for requests that can never succeed (unparsable or
        negative Content-Length, invalid JSON, non-object payload), 500
        when processing raises, and 200 otherwise.
        """
        if self.path != '/webhook':
            self.send_response(404)
            self.end_headers()
            return

        try:
            content_length = int(self.headers.get('Content-Length', 0))
            if content_length < 0:
                # rfile.read(-1) would block until the client disconnects.
                raise ValueError("negative Content-Length")
            # JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
            payload = json.loads(self.rfile.read(content_length))
            if not isinstance(payload, dict):
                raise ValueError("payload must be a JSON object")
        except ValueError as e:
            # A malformed request is a client error; answering with 5xx
            # would invite the sender to retry a request that cannot succeed.
            logger.warning(f"Rejecting malformed webhook request: {e}")
            self.send_response(400)
            self.end_headers()
            self.wfile.write(str(e).encode())
            return

        try:
            self.process_alerts(payload)
        except Exception as e:
            logger.error(f"Error processing webhook: {e}")
            self.send_response(500)
            self.end_headers()
            self.wfile.write(str(e).encode())
            return

        self.send_response(200)
        self.end_headers()
        self.wfile.write(b'OK')

    def do_GET(self):
        """Answer ``GET /health`` with 200 and any other path with 404."""
        if self.path == '/health':
            self.send_response(200)
            self.end_headers()
            self.wfile.write(b'OK')
        else:
            self.send_response(404)
            self.end_headers()

    def process_alerts(self, payload: dict):
        """Process Alertmanager webhook payload.

        Formats every alert in ``payload['alerts']``, joins them into one
        message and sends it to each configured channel. Does nothing when
        the payload carries no alerts. Send failures are logged, not raised.
        """
        from html import unescape

        status = payload.get('status', 'unknown')
        alerts = payload.get('alerts', [])

        if not alerts:
            return

        logger.info(f"Received {len(alerts)} alerts with status: {status}")

        # Build message; track the most severe level for the ntfy priority.
        # Severities outside this table never raise the level.
        severity_rank = {'info': 0, 'warning': 1, 'critical': 2}
        messages = []
        highest_severity = 'info'

        for alert in alerts:
            # A per-alert status takes precedence over the group status.
            messages.append(
                format_alert_telegram(alert, alert.get('status', status))
            )

            severity = alert.get('labels', {}).get('severity', 'info')
            if severity_rank.get(severity, 0) > severity_rank[highest_severity]:
                highest_severity = severity

        combined_message = '\n'.join(messages)

        # Send notifications
        if TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID:
            success = send_telegram(combined_message)
            logger.info(f"Telegram: {'sent' if success else 'failed'}")

        if NTFY_TOPIC:
            title = f"ManaCore Alert ({len(alerts)} alerts)"
            # ntfy shows plain text: drop the markup tags and decode HTML
            # entities (e.g. "&amp;") so they are not displayed literally.
            plain_message = combined_message
            for tag in ('<b>', '</b>', '<code>', '</code>'):
                plain_message = plain_message.replace(tag, '')
            plain_message = unescape(plain_message)
            success = send_ntfy(title, plain_message, highest_severity)
            logger.info(f"ntfy: {'sent' if success else 'failed'}")

    def log_message(self, format, *args):
        """Route the base class's access log through the module logger."""
        logger.info(f"{self.client_address[0]} - {format % args}")
|
||||
|
||||
def main():
    """Start the webhook server and serve until interrupted.

    Listens on all interfaces on the port given by the ``PORT`` environment
    variable (default 8080). Stops on Ctrl-C (SIGINT) or SIGTERM and always
    closes the listening socket before returning.
    """
    # Function-scope import keeps this block self-contained.
    import signal

    port = int(os.environ.get('PORT', 8080))

    logger.info(f"Starting Alert Notifier on port {port}")
    logger.info(f"Telegram configured: {bool(TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID)}")
    logger.info(f"ntfy configured: {bool(NTFY_TOPIC)}")

    server = HTTPServer(('0.0.0.0', port), AlertHandler)

    def _terminate(signum, frame):
        # Reuse the Ctrl-C path so both signals shut down the same way.
        raise KeyboardInterrupt

    try:
        # `docker stop` sends SIGTERM, which Python does not turn into an
        # exception on its own; without a handler the clean shutdown path
        # below would never run.
        signal.signal(signal.SIGTERM, _terminate)
    except ValueError:
        # Handlers can only be installed from the main thread; when called
        # from elsewhere, fall back to SIGINT-only shutdown.
        pass

    try:
        server.serve_forever()
    except KeyboardInterrupt:
        logger.info("Shutting down")
    finally:
        # serve_forever() has already returned here, so shutdown() is not
        # needed; release the listening socket instead.
        server.server_close()
|
||||
|
||||
# Start the server only when this file is executed as a script.
if __name__ == '__main__':
    main()
|
||||
62
docker/alertmanager/alertmanager.yml
Normal file
62
docker/alertmanager/alertmanager.yml
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
# Alertmanager Configuration for ManaCore
|
||||
# Sends alerts via webhook to custom notification handler
|
||||
|
||||
global:
|
||||
resolve_timeout: 5m
|
||||
|
||||
route:
|
||||
# Default receiver for all alerts
|
||||
receiver: 'webhook'
|
||||
# Group alerts by alert name, severity and service (job)
|
||||
group_by: ['alertname', 'severity', 'job']
|
||||
# Wait before sending first notification
|
||||
group_wait: 30s
|
||||
# Wait before sending follow-up notifications for same group
|
||||
group_interval: 5m
|
||||
# Wait before re-sending a notification for alerts that are still firing
|
||||
repeat_interval: 4h
|
||||
|
||||
routes:
|
||||
# Critical alerts - immediate notification
|
||||
- match:
|
||||
severity: critical
|
||||
receiver: 'webhook'
|
||||
group_wait: 10s
|
||||
repeat_interval: 1h
|
||||
|
||||
# Warning alerts - less frequent
|
||||
- match:
|
||||
severity: warning
|
||||
receiver: 'webhook'
|
||||
group_wait: 1m
|
||||
repeat_interval: 6h
|
||||
|
||||
# Info alerts - batched together and repeated at most once a day (no business-hours restriction is configured here)
|
||||
- match:
|
||||
severity: info
|
||||
receiver: 'webhook'
|
||||
group_wait: 5m
|
||||
repeat_interval: 24h
|
||||
|
||||
receivers:
|
||||
- name: 'webhook'
|
||||
webhook_configs:
|
||||
- url: 'http://alert-notifier:8080/webhook'
|
||||
send_resolved: true
|
||||
max_alerts: 10
|
||||
|
||||
# Inhibition rules - prevent redundant alerts
|
||||
inhibit_rules:
|
||||
# Don't alert on warnings if a critical alert with the same alert name is firing for the same service (job)
|
||||
- source_match:
|
||||
severity: 'critical'
|
||||
target_match:
|
||||
severity: 'warning'
|
||||
equal: ['alertname', 'job']
|
||||
|
||||
# Don't alert on service-specific issues if PostgreSQL is down
|
||||
- source_match:
|
||||
alertname: 'PostgreSQLDown'
|
||||
target_match_re:
|
||||
alertname: '.*(Backend|Service).*'
|
||||
equal: []
|
||||
|
|
@ -9,11 +9,11 @@ global:
|
|||
rule_files:
|
||||
- /etc/prometheus/alerts.yml
|
||||
|
||||
# Alertmanager configuration (optional, for future use)
|
||||
# alerting:
|
||||
# alertmanagers:
|
||||
# - static_configs:
|
||||
# - targets: []
|
||||
# Alertmanager configuration
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets: ['alertmanager:9093']
|
||||
|
||||
scrape_configs:
|
||||
# Prometheus self-monitoring
|
||||
|
|
@ -30,10 +30,10 @@ scrape_configs:
|
|||
target_label: instance
|
||||
replacement: 'mac-mini'
|
||||
|
||||
# Docker container metrics via cAdvisor (disabled - container not deployed)
|
||||
# - job_name: 'cadvisor'
|
||||
# static_configs:
|
||||
# - targets: ['cadvisor:8080']
|
||||
# Docker container metrics via cAdvisor
|
||||
- job_name: 'cadvisor'
|
||||
static_configs:
|
||||
- targets: ['cadvisor:8080']
|
||||
|
||||
# PostgreSQL metrics
|
||||
- job_name: 'postgres'
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue