feat(infra): add deploy tracking with PostgreSQL, Pushgateway & Grafana dashboard

Instrument the CD pipeline to record per-deploy and per-service metrics
(build time, image size, startup time, health status) into PostgreSQL and
push gauges to Pushgateway. Adds a Grafana dashboard with 13 panels covering
deploy frequency, build performance, service health, and history.

New files:
- scripts/mac-mini/init-deploy-tracking.sql (idempotent DDL)
- scripts/deploy-metrics.sh (bash library for CI)
- docker/grafana/provisioning/datasources/deploy-tracking.yml
- docker/grafana/dashboards/deploy-tracking.json

Modified:
- docker/prometheus/prometheus.yml (pushgateway scrape job)
- .github/workflows/cd-macmini.yml (build/health instrumentation)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Till JS 2026-03-20 17:08:03 +01:00
parent a5940abfc2
commit 3f91c4656a
6 changed files with 933 additions and 38 deletions

View file

@ -152,6 +152,15 @@ jobs:
cd "${{ env.PROJECT_DIR }}" cd "${{ env.PROJECT_DIR }}"
git pull origin main git pull origin main
- name: Init deploy tracking
id: init
run: |
cd "${{ env.PROJECT_DIR }}"
source scripts/deploy-metrics.sh
deploy_timer_start
echo "start_epoch=$DEPLOY_START_EPOCH" >> $GITHUB_OUTPUT
ensure_deploy_schema
- name: Ensure env vars exist - name: Ensure env vars exist
run: | run: |
cd "${{ env.PROJECT_DIR }}" cd "${{ env.PROJECT_DIR }}"
@ -197,61 +206,241 @@ jobs:
echo "deploy-all=false" >> $GITHUB_OUTPUT echo "deploy-all=false" >> $GITHUB_OUTPUT
echo "Services to deploy: $SERVICES" echo "Services to deploy: $SERVICES"
- name: Deploy all services - name: Build and deploy services
if: steps.services.outputs.deploy-all == 'true' id: build
run: | run: |
cd "${{ env.PROJECT_DIR }}" cd "${{ env.PROJECT_DIR }}"
echo "=== Rebuilding and restarting ALL services ===" source scripts/deploy-metrics.sh
docker compose -f "${{ env.COMPOSE_FILE }}" --env-file "${{ env.ENV_FILE }}" up -d --build
echo "=== Waiting for services to start ==="
sleep 15
docker compose -f "${{ env.COMPOSE_FILE }}" ps
- name: Deploy changed services DEPLOY_ALL="${{ steps.services.outputs.deploy-all }}"
if: steps.services.outputs.deploy-all == 'false' && steps.services.outputs.services != ''
run: |
cd "${{ env.PROJECT_DIR }}"
SERVICES="${{ steps.services.outputs.services }}" SERVICES="${{ steps.services.outputs.services }}"
echo "=== Rebuilding: $SERVICES ==="
docker compose -f "${{ env.COMPOSE_FILE }}" --env-file "${{ env.ENV_FILE }}" up -d --build --no-deps $SERVICES # Determine final service list
if [ "$DEPLOY_ALL" == "true" ]; then
# Get all service names from compose file
SERVICES=$(docker compose -f "$COMPOSE_FILE" --env-file "$ENV_FILE" config --services | tr '\n' ' ')
echo "=== Rebuilding ALL services ==="
elif [ -z "$SERVICES" ]; then
echo "No services to deploy"
echo "build-times=" >> $GITHUB_OUTPUT
exit 0
else
echo "=== Rebuilding: $SERVICES ==="
fi
# Build each service individually to capture build times
BUILD_TIMES=""
for svc in $SERVICES; do
echo "--- Building $svc ---"
build_start=$(date +%s)
docker compose -f "$COMPOSE_FILE" --env-file "$ENV_FILE" build "$svc" 2>&1 || true
build_end=$(date +%s)
build_dur=$(( build_end - build_start ))
BUILD_TIMES="$BUILD_TIMES $svc:$build_dur"
echo " $svc built in ${build_dur}s"
done
# Start all services at once (no rebuild, images already built)
echo "=== Starting services ==="
if [ "$DEPLOY_ALL" == "true" ]; then
docker compose -f "$COMPOSE_FILE" --env-file "$ENV_FILE" up -d
else
docker compose -f "$COMPOSE_FILE" --env-file "$ENV_FILE" up -d --no-deps $SERVICES
fi
echo "=== Waiting for services to start ===" echo "=== Waiting for services to start ==="
sleep 10 sleep 10
echo "build-times=$BUILD_TIMES" >> $GITHUB_OUTPUT
- name: Health checks - name: Health checks
id: health
run: |
cd "${{ env.PROJECT_DIR }}"
source scripts/deploy-metrics.sh
# Map of service -> health URL
declare -A HEALTH_URLS=(
["mana-auth"]="http://localhost:3001/health"
["matrix-web"]="http://localhost:5180/health"
["chat-backend"]="http://localhost:3030/health"
["chat-web"]="http://localhost:5010/health"
["todo-backend"]="http://localhost:3031/health"
["todo-web"]="http://localhost:5011/health"
["calendar-backend"]="http://localhost:3032/health"
["calendar-web"]="http://localhost:5012/health"
["clock-backend"]="http://localhost:3033/health"
["clock-web"]="http://localhost:5013/health"
["contacts-backend"]="http://localhost:3034/health"
["contacts-web"]="http://localhost:5014/health"
)
DEPLOY_ALL="${{ steps.services.outputs.deploy-all }}"
SERVICES="${{ steps.services.outputs.services }}"
HEALTH_RESULTS=""
echo "=== Health Checks ==="
# Check all known health endpoints
for svc in "${!HEALTH_URLS[@]}"; do
url="${HEALTH_URLS[$svc]}"
result=$(check_health_timed "$svc" "$url" 2>/dev/null) || true
status=$(echo "$result" | awk '{print $1}')
elapsed=$(echo "$result" | awk '{print $2}')
http_code=$(echo "$result" | awk '{print $3}')
if [ -z "$status" ]; then
status="skipped"
elapsed="0"
http_code="0"
fi
if [ "$status" = "ok" ]; then
echo " ✓ $svc: OK (${elapsed}s)"
else
echo " ✗ $svc: $status (HTTP $http_code, ${elapsed}s)"
fi
HEALTH_RESULTS="$HEALTH_RESULTS $svc:$status:$http_code:$elapsed"
done
echo "health-results=$HEALTH_RESULTS" >> $GITHUB_OUTPUT
- name: Record deploy metrics
  if: always()
  # Workflow expressions are handed to the script through environment
  # variables instead of being pasted into the shell source. Values such as
  # the branch name or actor are attacker-influenced text; interpolating them
  # directly into `run:` would let them inject shell code.
  env:
    START_EPOCH: ${{ steps.init.outputs.start_epoch }}
    JOB_STATUS: ${{ job.status }}
    DEPLOY_ALL: ${{ steps.services.outputs.deploy-all }}
    SERVICES: ${{ steps.services.outputs.services }}
    BRANCH: ${{ github.ref_name }}
    RUN_ID: ${{ github.run_id }}
    RUN_ATTEMPT: ${{ github.run_attempt }}
    COMMIT_SHA: ${{ github.sha }}
    EVENT_NAME: ${{ github.event_name }}
    ACTOR: ${{ github.actor }}
    BUILD_TIMES: ${{ steps.build.outputs.build-times }}
    HEALTH_RESULTS: ${{ steps.health.outputs.health-results }}
  run: |
    cd "${{ env.PROJECT_DIR }}"
    source scripts/deploy-metrics.sh

    NOW=$(date +%s)
    # This step runs even when earlier steps failed, so the init output may be
    # empty. An empty value would evaluate to 0 in the arithmetic below and
    # report "seconds since 1970" as the duration; fall back to a 0s duration.
    if ! [[ "$START_EPOCH" =~ ^[0-9]+$ ]]; then
      START_EPOCH="$NOW"
    fi
    DURATION=$(( NOW - START_EPOCH ))

    # Determine overall status
    STATUS="success"
    if [ "$JOB_STATUS" != "success" ]; then
      STATUS="failure"
    fi

    # Determine services list
    if [ "$DEPLOY_ALL" == "true" ]; then
      SERVICES_CSV="all"
    else
      SERVICES_CSV=$(echo "$SERVICES" | tr ' ' ',')
    fi

    COMMIT_MSG=$(git log -1 --pretty=%s 2>/dev/null | head -c 200 || echo "unknown")

    # Insert deployment row (best effort: tracking must never fail the deploy)
    DEPLOY_ID=$(insert_deployment \
      "$RUN_ID" \
      "$RUN_ATTEMPT" \
      "$COMMIT_SHA" \
      "$COMMIT_MSG" \
      "$BRANCH" \
      "$EVENT_NAME" \
      "$ACTOR" \
      "$SERVICES_CSV" \
      "$STATUS" 2>/dev/null) || DEPLOY_ID=""

    if [ -n "$DEPLOY_ID" ]; then
      # Finalise with duration
      finalise_deployment "$DEPLOY_ID" "$STATUS" "$DURATION" 2>/dev/null || true

      # Parse build times: "svc1:42 svc2:31"
      declare -A BUILD_DUR_MAP
      for entry in $BUILD_TIMES; do
        svc="${entry%%:*}"
        dur="${entry#*:}"
        BUILD_DUR_MAP["$svc"]="$dur"
      done

      # Parse health results: "svc1:ok:200:5.0 svc2:failed:503:30.0"
      declare -A HEALTH_MAP HTTP_MAP STARTUP_MAP
      for entry in $HEALTH_RESULTS; do
        svc=$(echo "$entry" | cut -d: -f1)
        h_status=$(echo "$entry" | cut -d: -f2)
        h_code=$(echo "$entry" | cut -d: -f3)
        h_time=$(echo "$entry" | cut -d: -f4)
        HEALTH_MAP["$svc"]="$h_status"
        HTTP_MAP["$svc"]="$h_code"
        STARTUP_MAP["$svc"]="$h_time"
      done

      # Combine: for each service that was built or health-checked, insert a row
      ALL_SVCS=$(echo "$BUILD_TIMES $HEALTH_RESULTS" | tr ' ' '\n' | cut -d: -f1 | sort -u | tr '\n' ' ')
      for svc in $ALL_SVCS; do
        [ -z "$svc" ] && continue
        build_dur="${BUILD_DUR_MAP[$svc]:-0}"
        img_mb=$(get_image_size_mb "$svc" 2>/dev/null || echo "0")
        startup="${STARTUP_MAP[$svc]:-0}"
        health="${HEALTH_MAP[$svc]:-skipped}"
        http_code="${HTTP_MAP[$svc]:-0}"
        insert_deploy_service "$DEPLOY_ID" "$svc" "$build_dur" "$img_mb" "$startup" "$health" "$http_code" 2>/dev/null || true
        push_service_metrics "$svc" "$build_dur" "$img_mb" "$health" 2>/dev/null || true
      done
    fi

    # Push overall metrics to Pushgateway
    push_deploy_metrics "$STATUS" "$DURATION" "$BRANCH" 2>/dev/null || true
    echo "Deploy tracking recorded: status=$STATUS duration=${DURATION}s"
- name: Summary
if: always()
run: | run: |
cd "${{ env.PROJECT_DIR }}" cd "${{ env.PROJECT_DIR }}"
check_health() { START_EPOCH="${{ steps.init.outputs.start_epoch }}"
local name=$1 NOW=$(date +%s)
local url=$2 DURATION=$(( NOW - START_EPOCH ))
local status=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 "$url" 2>/dev/null || echo "000")
if [ "$status" == "200" ]; then
echo " ✓ $name: OK"
else
echo " ✗ $name: FAILED (HTTP $status)"
fi
}
echo "=== Health Checks ==="
check_health "Auth API" "http://localhost:3001/health"
check_health "Matrix Web" "http://localhost:5180/health"
check_health "Chat Backend" "http://localhost:3030/health"
check_health "Chat Web" "http://localhost:5010/health"
check_health "Todo Backend" "http://localhost:3031/health"
check_health "Todo Web" "http://localhost:5011/health"
check_health "Calendar Backend" "http://localhost:3032/health"
check_health "Calendar Web" "http://localhost:5012/health"
check_health "Clock Backend" "http://localhost:3033/health"
check_health "Clock Web" "http://localhost:5013/health"
- name: Summary
run: |
echo "## Deployment Summary" >> $GITHUB_STEP_SUMMARY echo "## Deployment Summary" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY
echo "**Trigger:** ${{ github.event_name }}" >> $GITHUB_STEP_SUMMARY echo "**Trigger:** ${{ github.event_name }}" >> $GITHUB_STEP_SUMMARY
echo "**Commit:** ${{ github.sha }}" >> $GITHUB_STEP_SUMMARY echo "**Commit:** \`${{ github.sha }}\`" >> $GITHUB_STEP_SUMMARY
echo "**Duration:** ${DURATION}s" >> $GITHUB_STEP_SUMMARY
echo "**Status:** ${{ job.status }}" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
if [ "${{ steps.services.outputs.deploy-all }}" == "true" ]; then if [ "${{ steps.services.outputs.deploy-all }}" == "true" ]; then
echo "**Services:** All" >> $GITHUB_STEP_SUMMARY echo "**Services:** All" >> $GITHUB_STEP_SUMMARY
else else
echo "**Services:** ${{ steps.services.outputs.services }}" >> $GITHUB_STEP_SUMMARY echo "**Services:** ${{ steps.services.outputs.services }}" >> $GITHUB_STEP_SUMMARY
fi fi
# Build times table
BUILD_TIMES="${{ steps.build.outputs.build-times }}"
if [ -n "$BUILD_TIMES" ]; then
echo "" >> $GITHUB_STEP_SUMMARY
echo "### Build Times" >> $GITHUB_STEP_SUMMARY
echo "| Service | Duration |" >> $GITHUB_STEP_SUMMARY
echo "|---------|----------|" >> $GITHUB_STEP_SUMMARY
for entry in $BUILD_TIMES; do
svc="${entry%%:*}"
dur="${entry#*:}"
echo "| $svc | ${dur}s |" >> $GITHUB_STEP_SUMMARY
done
fi
# Health results table
HEALTH_RESULTS="${{ steps.health.outputs.health-results }}"
if [ -n "$HEALTH_RESULTS" ]; then
echo "" >> $GITHUB_STEP_SUMMARY
echo "### Health Checks" >> $GITHUB_STEP_SUMMARY
echo "| Service | Status | HTTP | Startup |" >> $GITHUB_STEP_SUMMARY
echo "|---------|--------|------|---------|" >> $GITHUB_STEP_SUMMARY
for entry in $HEALTH_RESULTS; do
svc=$(echo "$entry" | cut -d: -f1)
h_status=$(echo "$entry" | cut -d: -f2)
h_code=$(echo "$entry" | cut -d: -f3)
h_time=$(echo "$entry" | cut -d: -f4)
icon="✓"
[ "$h_status" != "ok" ] && icon="✗"
echo "| $svc | $icon $h_status | $h_code | ${h_time}s |" >> $GITHUB_STEP_SUMMARY
done
fi

View file

@ -0,0 +1,487 @@
{
"annotations": { "list": [] },
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": null,
"links": [],
"panels": [
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
"id": 100,
"panels": [],
"title": "Status Overview",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 3600 },
{ "color": "red", "value": 86400 }
]
},
"unit": "s"
}
},
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 },
"id": 1,
"options": {
"colorMode": "value",
"graphMode": "none",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [
{
"expr": "time() - deploy_last_timestamp_seconds{branch=\"main\"}",
"legendFormat": ""
}
],
"title": "Last Deploy",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"mappings": [
{
"options": {
"0": { "color": "red", "text": "FAILED" },
"1": { "color": "green", "text": "OK" }
},
"type": "value"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
}
}
},
"gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 },
"id": 2,
"options": {
"colorMode": "background",
"graphMode": "none",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [
{
"expr": "deploy_last_status{branch=\"main\"}",
"legendFormat": ""
}
],
"title": "Status",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 300 },
{ "color": "red", "value": 600 }
]
},
"unit": "s"
}
},
"gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 },
"id": 3,
"options": {
"colorMode": "value",
"graphMode": "none",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [
{
"expr": "deploy_last_duration_seconds{branch=\"main\"}",
"legendFormat": ""
}
],
"title": "Duration",
"type": "stat"
},
{
"datasource": { "type": "postgres", "uid": "deploy-tracking" },
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "blue", "value": null }]
}
}
},
"gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 },
"id": 4,
"options": {
"colorMode": "value",
"graphMode": "none",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [
{
"datasource": { "type": "postgres", "uid": "deploy-tracking" },
"format": "table",
"rawQuery": true,
"rawSql": "SELECT COUNT(*) AS \"Deploys\" FROM deploy_tracking.deployments WHERE started_at > NOW() - INTERVAL '30 days';"
}
],
"title": "Deploys (30d)",
"type": "stat"
},
{
"datasource": { "type": "postgres", "uid": "deploy-tracking" },
"fieldConfig": {
"defaults": {
"unit": "percentunit",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "yellow", "value": 0.8 },
{ "color": "green", "value": 0.95 }
]
}
}
},
"gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 },
"id": 5,
"options": {
"colorMode": "value",
"graphMode": "none",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [
{
"datasource": { "type": "postgres", "uid": "deploy-tracking" },
"format": "table",
"rawQuery": true,
"rawSql": "SELECT CASE WHEN COUNT(*) = 0 THEN 0 ELSE COUNT(*) FILTER (WHERE status = 'success')::float / COUNT(*)::float END AS \"Rate\" FROM deploy_tracking.deployments WHERE started_at > NOW() - INTERVAL '30 days';"
}
],
"title": "Success Rate (30d)",
"type": "stat"
},
{
"datasource": { "type": "postgres", "uid": "deploy-tracking" },
"fieldConfig": {
"defaults": {
"unit": "s",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 300 },
{ "color": "red", "value": 600 }
]
}
}
},
"gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 },
"id": 6,
"options": {
"colorMode": "value",
"graphMode": "none",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [
{
"datasource": { "type": "postgres", "uid": "deploy-tracking" },
"format": "table",
"rawQuery": true,
"rawSql": "SELECT COALESCE(AVG(duration_s), 0) AS \"Avg\" FROM deploy_tracking.deployments WHERE status = 'success' AND started_at > NOW() - INTERVAL '30 days';"
}
],
"title": "Avg Duration (30d)",
"type": "stat"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
"id": 101,
"panels": [],
"title": "Deploy Frequency",
"type": "row"
},
{
"datasource": { "type": "postgres", "uid": "deploy-tracking" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"drawStyle": "bars",
"fillOpacity": 80,
"stacking": { "mode": "normal" }
}
},
"overrides": [
{
"matcher": { "id": "byName", "options": "success" },
"properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }]
},
{
"matcher": { "id": "byName", "options": "failure" },
"properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }]
}
]
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 },
"id": 7,
"options": { "legend": { "displayMode": "list" }, "tooltip": { "mode": "multi" } },
"targets": [
{
"datasource": { "type": "postgres", "uid": "deploy-tracking" },
"format": "time_series",
"rawQuery": true,
"rawSql": "SELECT date_trunc('day', started_at) AS time, status AS metric, COUNT(*) AS value FROM deploy_tracking.deployments WHERE $__timeFilter(started_at) GROUP BY 1, 2 ORDER BY 1;"
}
],
"title": "Deploys per Day",
"type": "timeseries"
},
{
"datasource": { "type": "postgres", "uid": "deploy-tracking" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "blue", "value": null }]
}
}
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 },
"id": 8,
"options": { "orientation": "horizontal" },
"targets": [
{
"datasource": { "type": "postgres", "uid": "deploy-tracking" },
"format": "table",
"rawQuery": true,
"rawSql": "SELECT s AS \"Service\", COUNT(*) AS \"Deploys\" FROM deploy_tracking.deployments, unnest(services) AS s WHERE started_at > NOW() - INTERVAL '30 days' GROUP BY s ORDER BY COUNT(*) DESC;"
}
],
"title": "Deploys per Service (30d)",
"type": "barchart"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 },
"id": 102,
"panels": [],
"title": "Build Performance",
"type": "row"
},
{
"datasource": { "type": "postgres", "uid": "deploy-tracking" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"unit": "s",
"custom": { "drawStyle": "line", "pointSize": 5, "showPoints": "auto" }
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 15 },
"id": 9,
"options": { "legend": { "displayMode": "list" }, "tooltip": { "mode": "multi" } },
"targets": [
{
"datasource": { "type": "postgres", "uid": "deploy-tracking" },
"format": "time_series",
"rawQuery": true,
"rawSql": "SELECT d.started_at AS time, ds.service_name AS metric, ds.build_duration_s AS value FROM deploy_tracking.deploy_services ds JOIN deploy_tracking.deployments d ON d.id = ds.deployment_id WHERE $__timeFilter(d.started_at) ORDER BY d.started_at;"
}
],
"title": "Build Duration Trend",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"unit": "decmbytes",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 500 },
{ "color": "red", "value": 1000 }
]
}
}
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 15 },
"id": 10,
"options": { "orientation": "horizontal" },
"targets": [
{
"expr": "deploy_service_image_size_mb",
"legendFormat": "{{service}}"
}
],
"title": "Image Sizes",
"type": "barchart"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 },
"id": 103,
"panels": [],
"title": "Startup & Health",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"mappings": [
{
"options": {
"0": { "color": "red", "text": "UNHEALTHY" },
"1": { "color": "green", "text": "HEALTHY" }
},
"type": "value"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
}
}
},
"gridPos": { "h": 4, "w": 12, "x": 0, "y": 24 },
"id": 11,
"options": {
"colorMode": "background",
"graphMode": "none",
"orientation": "horizontal",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [
{
"expr": "deploy_service_healthy",
"legendFormat": "{{service}}"
}
],
"title": "Service Health",
"type": "stat"
},
{
"datasource": { "type": "postgres", "uid": "deploy-tracking" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"unit": "s",
"custom": { "drawStyle": "line", "pointSize": 5, "showPoints": "auto" }
}
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 },
"id": 12,
"options": { "legend": { "displayMode": "list" }, "tooltip": { "mode": "multi" } },
"targets": [
{
"datasource": { "type": "postgres", "uid": "deploy-tracking" },
"format": "time_series",
"rawQuery": true,
"rawSql": "SELECT d.started_at AS time, ds.service_name AS metric, ds.startup_time_s AS value FROM deploy_tracking.deploy_services ds JOIN deploy_tracking.deployments d ON d.id = ds.deployment_id WHERE ds.startup_time_s IS NOT NULL AND $__timeFilter(d.started_at) ORDER BY d.started_at;"
}
],
"title": "Startup Time Trend",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 32 },
"id": 104,
"panels": [],
"title": "Deploy History",
"type": "row"
},
{
"datasource": { "type": "postgres", "uid": "deploy-tracking" },
"fieldConfig": {
"defaults": {},
"overrides": [
{
"matcher": { "id": "byName", "options": "status" },
"properties": [
{
"id": "mappings",
"value": [
{
"options": {
"failure": { "color": "red", "text": "FAILED" },
"success": { "color": "green", "text": "OK" },
"running": { "color": "yellow", "text": "RUNNING" }
},
"type": "value"
}
]
}
]
},
{
"matcher": { "id": "byName", "options": "duration_s" },
"properties": [{ "id": "unit", "value": "s" }]
}
]
},
"gridPos": { "h": 10, "w": 24, "x": 0, "y": 33 },
"id": 13,
"options": { "showHeader": true, "sortBy": [{ "desc": true, "displayName": "started_at" }] },
"targets": [
{
"datasource": { "type": "postgres", "uid": "deploy-tracking" },
"format": "table",
"rawQuery": true,
"rawSql": "SELECT started_at, commit_sha, commit_message, deployer, array_to_string(services, ', ') AS services, status, duration_s FROM deploy_tracking.deployments ORDER BY started_at DESC LIMIT 50;"
}
],
"title": "Recent Deploys",
"type": "table"
}
],
"schemaVersion": 39,
"tags": ["deploy", "ci-cd"],
"templating": {
"list": [
{
"current": { "selected": false, "text": "Prometheus", "value": "Prometheus" },
"hide": 0,
"includeAll": false,
"label": "Datasource",
"multi": false,
"name": "datasource",
"options": [],
"query": "prometheus",
"refresh": 1,
"type": "datasource"
}
]
},
"time": { "from": "now-30d", "to": "now" },
"timepicker": {},
"timezone": "browser",
"title": "Deploy Tracking",
"uid": "deploy-tracking",
"version": 1
}

View file

@ -0,0 +1,23 @@
# Deploy Tracking PostgreSQL Datasource
# Queries deploy_tracking schema in the mana database
apiVersion: 1
datasources:
  - name: DeployTracking
    # Pinned uid: the Deploy Tracking dashboard's PostgreSQL panels reference
    # this datasource as uid "deploy-tracking". Without an explicit uid Grafana
    # assigns a generated one and those panels cannot resolve their datasource.
    uid: deploy-tracking
    type: postgres
    access: proxy
    url: postgres:5432
    user: postgres
    secureJsonData:
      # Expanded by Grafana from the container environment at provisioning time.
      password: ${POSTGRES_PASSWORD}
    jsonData:
      database: mana
      sslmode: disable
      maxOpenConns: 5
      maxIdleConns: 2
      connMaxLifetime: 14400
      postgresVersion: 1600
      timescaledb: false
    isDefault: false
    editable: true

View file

@ -111,3 +111,11 @@ scrape_configs:
# - targets: ['nutriphi-backend:3037'] # - targets: ['nutriphi-backend:3037']
# metrics_path: '/metrics' # metrics_path: '/metrics'
# scrape_interval: 30s # scrape_interval: 30s
# ============================================
# Pushgateway (deploy metrics, batch jobs)
# ============================================
- job_name: 'pushgateway'
honor_labels: true
static_configs:
- targets: ['pushgateway:9091']

148
scripts/deploy-metrics.sh Executable file
View file

@ -0,0 +1,148 @@
#!/usr/bin/env bash
# Deploy Metrics Library
# Source this file in CI/CD: source scripts/deploy-metrics.sh
#
# Provides functions for timing, DB inserts, and Pushgateway pushes.
# NOTE: this file is sourced, so these options also apply to the calling
# shell for the rest of its run (errexit, nounset, pipefail).
set -euo pipefail

# Epoch second recorded by deploy_timer_start; empty until the timer is started.
DEPLOY_START_EPOCH=""

# Both endpoints keep their previous values by default but can be overridden
# from the environment (e.g. a different Pushgateway host or a stubbed psql
# command) without editing this file.
PUSHGATEWAY_URL="${PUSHGATEWAY_URL:-http://localhost:9091}"
PSQL_CMD="${PSQL_CMD:-docker exec -i mana-infra-postgres psql -U postgres -d mana -tAq}"
# ── Timing ──────────────────────────────────────────────────
# Start the deploy timer.
# Stores the current Unix time (whole seconds) in the global
# DEPLOY_START_EPOCH; prints nothing.
deploy_timer_start() {
    DEPLOY_START_EPOCH="$(date '+%s')"
}
# Print the whole seconds elapsed since deploy_timer_start was called.
#
# If the timer was never started, DEPLOY_START_EPOCH is empty and would be
# treated as 0 by shell arithmetic, reporting the seconds since 1970 as the
# "elapsed" time. In that case a warning goes to stderr and 0 is printed.
deploy_timer_elapsed() {
    if [ -z "${DEPLOY_START_EPOCH:-}" ]; then
        echo "deploy_timer_elapsed: timer was not started; reporting 0" >&2
        echo 0
        return 0
    fi

    local now
    now=$(date +%s)
    echo $(( now - DEPLOY_START_EPOCH ))
}
# ── Docker helpers ──────────────────────────────────────────
# Get image size in MB for a compose service
# Usage: get_image_size_mb <compose-service-name>
#
# Reads COMPOSE_FILE and ENV_FILE from the environment.
# Prints the size with two decimals (bytes / 1048576), or "0" when the image
# cannot be determined. Always returns 0 so callers under `set -e` keep going.
get_image_size_mb() {
    local service="$1"
    local image_id size_bytes

    # `compose images -q` prints one id per container; several ids used to be
    # passed to `inspect` as a single multi-line argument, which always failed.
    # Only the first id is used.
    image_id=$(docker compose -f "$COMPOSE_FILE" --env-file "$ENV_FILE" images "$service" -q 2>/dev/null | head -n 1) || image_id=""
    if [ -z "$image_id" ]; then
        echo "0"
        return 0
    fi

    size_bytes=$(docker image inspect "$image_id" --format='{{.Size}}' 2>/dev/null | head -n 1) || size_bytes=""

    # Refuse anything that is not a plain byte count before doing arithmetic.
    case "$size_bytes" in
        '' | *[!0-9]*)
            echo "0"
            return 0
            ;;
    esac

    awk -v bytes="$size_bytes" 'BEGIN { printf "%.2f\n", bytes / 1048576 }'
}
# Health check with retry and timing
# Usage: check_health_timed <service-name> <url> [timeout_s] [interval_s]
# Output: <status> <seconds> <http_code> (e.g. "ok 4.2 200")
#
#   service-name  accepted for call-site readability; not used by the check
#   url           endpoint polled until it answers HTTP 200
#   timeout_s     give up once this many seconds have elapsed (default 30)
#   interval_s    pause between attempts in seconds (default 2)
#
# Returns 0 and prints "ok ..." on the first HTTP 200; returns 1 and prints
# "failed ..." with the last observed code once the timeout is reached.
check_health_timed() {
    local url="$2"
    local timeout="${3:-30}"
    local interval="${4:-2}"
    local start http_code elapsed

    start=$(date +%s)
    while true; do
        # With -w "%{http_code}" curl already prints "000" when it gets no
        # response. Appending another "000" on a non-zero exit would produce
        # "000000", so only fill in a default when curl printed nothing.
        http_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 3 "$url" 2>/dev/null) || true
        http_code="${http_code:-000}"

        elapsed=$(( $(date +%s) - start ))
        if [ "$http_code" = "200" ]; then
            echo "ok ${elapsed}.0 $http_code"
            return 0
        fi
        if [ "$elapsed" -ge "$timeout" ]; then
            echo "failed ${elapsed}.0 $http_code"
            return 1
        fi
        sleep "$interval"
    done
}
# ── Database inserts ────────────────────────────────────────
# Ensure the deploy_tracking schema and its tables exist (idempotent guard).
#
# Creating only the schema leaves a fresh database without tables, and every
# later insert then fails silently. When the idempotent DDL file is readable
# it is applied in full; otherwise only the schema is created, as before.
#
# The DDL path defaults to mac-mini/init-deploy-tracking.sql next to this
# script and can be overridden with DEPLOY_TRACKING_DDL.
# Best effort: problems are reported on stderr but the function returns 0 so
# that tracking can never abort a deploy.
ensure_deploy_schema() {
    local ddl_file
    ddl_file="${DEPLOY_TRACKING_DDL:-$(dirname "${BASH_SOURCE[0]}")/mac-mini/init-deploy-tracking.sql}"

    if [ -r "$ddl_file" ]; then
        if ! $PSQL_CMD < "$ddl_file" >/dev/null 2>&1; then
            echo "ensure_deploy_schema: could not apply $ddl_file; deploy tracking may be unavailable" >&2
        fi
        return 0
    fi

    if ! $PSQL_CMD -c "CREATE SCHEMA IF NOT EXISTS deploy_tracking;" 2>/dev/null; then
        echo "ensure_deploy_schema: could not create schema deploy_tracking; deploy tracking may be unavailable" >&2
    fi
    return 0
}
# Insert a deployment row and print the new row id on stdout.
# Usage: insert_deployment <run_id> <run_attempt> <commit_sha> <commit_message> <branch> <trigger> <deployer> <services_csv> <status>
#
# run_id and run_attempt are written into the statement unquoted, so they are
# required to be unsigned integers (returns 1 otherwise). Every text argument
# has its single quotes doubled before being placed in a SQL literal, not just
# the commit message. services_csv is a comma-separated list; empty elements
# are dropped and an empty list is stored as an empty array rather than
# ARRAY[''].
insert_deployment() {
    local run_id="$1" run_attempt="$2" commit_sha="$3" commit_message="$4"
    local branch="$5" trigger="$6" deployer="$7" services_csv="$8" status="$9"

    if ! [[ "$run_id" =~ ^[0-9]+$ && "$run_attempt" =~ ^[0-9]+$ ]]; then
        echo "insert_deployment: run_id and run_attempt must be unsigned integers" >&2
        return 1
    fi

    # Escape via a variable holding the quote character so the substitution
    # behaves the same in every bash version.
    local q="'"
    commit_sha=${commit_sha//$q/$q$q}
    commit_message=${commit_message//$q/$q$q}
    branch=${branch//$q/$q$q}
    trigger=${trigger//$q/$q$q}
    deployer=${deployer//$q/$q$q}
    status=${status//$q/$q$q}

    # Convert comma-separated to a PostgreSQL array constructor
    local remaining="${services_csv},"
    local element elements=""
    while [ -n "$remaining" ]; do
        element=${remaining%%,*}
        remaining=${remaining#*,}
        [ -n "$element" ] || continue
        element=${element//$q/$q$q}
        elements="${elements:+$elements,}'${element}'"
    done
    local services_sql="ARRAY[]::text[]"
    if [ -n "$elements" ]; then
        services_sql="ARRAY[$elements]"
    fi

    $PSQL_CMD <<SQL
INSERT INTO deploy_tracking.deployments
(run_id, run_attempt, commit_sha, commit_message, branch, trigger, deployer, services, status)
VALUES
($run_id, $run_attempt, '$commit_sha', '$commit_message', '$branch', '$trigger', '$deployer', $services_sql, '$status')
RETURNING id;
SQL
}
# Mark a deployment row as finished.
# Usage: finalise_deployment <id> <status> <duration_s>
#
# Sends one UPDATE to $PSQL_CMD on stdin that sets the row's status,
# stamps finished_at with NOW() and stores the duration in seconds.
# The arguments are written into the statement as given.
finalise_deployment() {
    local deployment_id="$1"
    local final_status="$2"
    local elapsed_s="$3"

    printf '%s\n' \
        "UPDATE deploy_tracking.deployments" \
        "SET status = '${final_status}', finished_at = NOW(), duration_s = ${elapsed_s}" \
        "WHERE id = ${deployment_id};" \
        | $PSQL_CMD
}
# Record the per-service result of one deployment.
# Usage: insert_deploy_service <deployment_id> <service_name> <build_duration_s> <image_size_mb> <startup_time_s> <health_status> <health_http_code>
#
# Sends a single INSERT into deploy_tracking.deploy_services to $PSQL_CMD on
# stdin. service_name and health_status are placed in quoted literals; the
# remaining arguments are written unquoted and must therefore be numeric.
insert_deploy_service() {
    local deployment_id="$1"
    local service_name="$2"
    local build_seconds="$3"
    local image_mb="$4"
    local startup_seconds="$5"
    local health_status="$6"
    local health_code="$7"

    printf '%s\n' \
        "INSERT INTO deploy_tracking.deploy_services" \
        "(deployment_id, service_name, build_duration_s, image_size_mb, startup_time_s, health_status, health_http_code)" \
        "VALUES" \
        "(${deployment_id}, '${service_name}', ${build_seconds}, ${image_mb}, ${startup_seconds}, '${health_status}', ${health_code});" \
        | $PSQL_CMD
}
# ── Pushgateway ─────────────────────────────────────────────
# Push overall deploy metrics
# Usage: push_deploy_metrics <status> <duration_s> <branch>
#
# Pushes three gauges (last timestamp, last duration, last status as 1/0) to
# the Pushgateway under job "deploy", grouped by branch. Best effort: a
# failing push never fails the caller.
#
# The branch is part of the URL path. A name containing "/" (feature/foo) or
# other non-path-safe characters would split the grouping key, and an empty
# name would collapse the path, so such values are sent with the
# Pushgateway's "@base64" label encoding (URL-safe alphabet, padding
# stripped; a lone "=" stands for the empty string).
push_deploy_metrics() {
    local status="$1" duration_s="$2" branch="$3"
    local status_val=0
    if [ "$status" = "success" ]; then
        status_val=1
    fi

    local branch_path encoded
    if [[ "$branch" =~ ^[A-Za-z0-9._-]+$ ]]; then
        branch_path="branch/${branch}"
    elif [ -z "$branch" ]; then
        branch_path="branch@base64/="
    else
        encoded=$(printf '%s' "$branch" | base64 | tr -d '\n=' | tr '+/' '-_')
        branch_path="branch@base64/${encoded}"
    fi

    cat <<PROM | curl -s --data-binary @- "${PUSHGATEWAY_URL}/metrics/job/deploy/${branch_path}" || true
# TYPE deploy_last_timestamp_seconds gauge
deploy_last_timestamp_seconds $(date +%s)
# TYPE deploy_last_duration_seconds gauge
deploy_last_duration_seconds $duration_s
# TYPE deploy_last_status gauge
deploy_last_status $status_val
PROM
}
# Publish the gauges for one service to the Pushgateway.
# Usage: push_service_metrics <service> <build_duration_s> <image_size_mb> <healthy>
#
# <healthy> is the health status word: "ok" is exported as 1, anything else
# as 0. Metrics are grouped under job "deploy_service" with the service name
# as grouping label. A failed push is ignored.
push_service_metrics() {
    local service_name="$1"
    local build_seconds="$2"
    local image_mb="$3"
    local health="$4"

    local healthy_flag
    if [ "$health" = "ok" ]; then
        healthy_flag=1
    else
        healthy_flag=0
    fi

    local endpoint="${PUSHGATEWAY_URL}/metrics/job/deploy_service/service/${service_name}"

    {
        printf '%s\n' "# TYPE deploy_service_build_duration_seconds gauge"
        printf '%s\n' "deploy_service_build_duration_seconds ${build_seconds}"
        printf '%s\n' "# TYPE deploy_service_image_size_mb gauge"
        printf '%s\n' "deploy_service_image_size_mb ${image_mb}"
        printf '%s\n' "# TYPE deploy_service_healthy gauge"
        printf '%s\n' "deploy_service_healthy ${healthy_flag}"
    } | curl -s --data-binary @- "$endpoint" || true
}

View file

@ -0,0 +1,40 @@
-- Deploy Tracking Schema
-- Run once: docker exec -i mana-infra-postgres psql -U postgres -d mana < scripts/mac-mini/init-deploy-tracking.sql
-- All statements are idempotent (IF NOT EXISTS).
-- Namespace that holds every deploy-tracking object.
CREATE SCHEMA IF NOT EXISTS deploy_tracking;

-- One row per CI/CD run.
CREATE TABLE IF NOT EXISTS deploy_tracking.deployments (
    id             BIGSERIAL,
    run_id         BIGINT        NOT NULL,                   -- CI run identifier
    run_attempt    INTEGER       NOT NULL DEFAULT 1,         -- re-run counter of that run
    commit_sha     VARCHAR(40)   NOT NULL,
    commit_message TEXT,
    branch         VARCHAR(255)  NOT NULL DEFAULT 'main',
    trigger        VARCHAR(20)   NOT NULL,                   -- event that started the run
    deployer       VARCHAR(255),
    services       TEXT[],                                   -- deployed service names
    status         VARCHAR(20)   NOT NULL DEFAULT 'running',
    started_at     TIMESTAMPTZ   NOT NULL DEFAULT NOW(),
    finished_at    TIMESTAMPTZ,                              -- NULL until the run is finalised
    duration_s     NUMERIC(10,2),                            -- total run time in seconds
    PRIMARY KEY (id)
);
-- One row per service per deploy.
-- Rows are removed together with their parent deployment (ON DELETE CASCADE).
CREATE TABLE IF NOT EXISTS deploy_tracking.deploy_services (
    id               BIGSERIAL,
    deployment_id    BIGINT        NOT NULL,
    service_name     VARCHAR(100)  NOT NULL,
    build_duration_s NUMERIC(10,2),          -- image build time in seconds
    image_size_mb    NUMERIC(10,2),          -- image size in megabytes
    startup_time_s   NUMERIC(10,2),          -- seconds until the health check answered
    health_status    VARCHAR(10),            -- health check outcome word
    health_http_code INTEGER,                -- last HTTP status seen by the health check
    PRIMARY KEY (id),
    FOREIGN KEY (deployment_id)
        REFERENCES deploy_tracking.deployments (id)
        ON DELETE CASCADE
);
-- Indexes

-- Deployments are listed newest-first and filtered by time range.
CREATE INDEX IF NOT EXISTS idx_deployments_started_at
    ON deploy_tracking.deployments (started_at DESC);

-- Deployments are filtered and grouped by outcome.
CREATE INDEX IF NOT EXISTS idx_deployments_status
    ON deploy_tracking.deployments (status);

-- Service rows are joined to their deployment through deployment_id.
CREATE INDEX IF NOT EXISTS idx_deploy_services_deployment_id
    ON deploy_tracking.deploy_services (deployment_id);

-- Lookup of service rows by service name.
CREATE INDEX IF NOT EXISTS idx_deploy_services_service_name
    ON deploy_tracking.deploy_services (service_name);