diff --git a/.github/workflows/cd-macmini.yml b/.github/workflows/cd-macmini.yml
index 6489bebbf..90efd7981 100644
--- a/.github/workflows/cd-macmini.yml
+++ b/.github/workflows/cd-macmini.yml
@@ -152,6 +152,17 @@ jobs:
           cd "${{ env.PROJECT_DIR }}"
           git pull origin main
 
+      # Records the deploy start time as a step output (later steps derive the
+      # total duration from it) and makes sure the tracking schema exists.
+      - name: Init deploy tracking
+        id: init
+        run: |
+          cd "${{ env.PROJECT_DIR }}"
+          source scripts/deploy-metrics.sh
+          deploy_timer_start
+          echo "start_epoch=$DEPLOY_START_EPOCH" >> $GITHUB_OUTPUT
+          ensure_deploy_schema
+
       - name: Ensure env vars exist
         run: |
           cd "${{ env.PROJECT_DIR }}"
@@ -197,61 +208,256 @@ jobs:
           echo "deploy-all=false" >> $GITHUB_OUTPUT
           echo "Services to deploy: $SERVICES"
 
-      - name: Deploy all services
-        if: steps.services.outputs.deploy-all == 'true'
+      # Builds every selected service one at a time so per-service build times
+      # can be reported, then starts them. Fails the job if any build fails.
+      - name: Build and deploy services
+        id: build
         run: |
           cd "${{ env.PROJECT_DIR }}"
-          echo "=== Rebuilding and restarting ALL services ==="
-          docker compose -f "${{ env.COMPOSE_FILE }}" --env-file "${{ env.ENV_FILE }}" up -d --build
-          echo "=== Waiting for services to start ==="
-          sleep 15
-          docker compose -f "${{ env.COMPOSE_FILE }}" ps
+          source scripts/deploy-metrics.sh
 
-      - name: Deploy changed services
-        if: steps.services.outputs.deploy-all == 'false' && steps.services.outputs.services != ''
-        run: |
-          cd "${{ env.PROJECT_DIR }}"
+          DEPLOY_ALL="${{ steps.services.outputs.deploy-all }}"
           SERVICES="${{ steps.services.outputs.services }}"
-          echo "=== Rebuilding: $SERVICES ==="
-          docker compose -f "${{ env.COMPOSE_FILE }}" --env-file "${{ env.ENV_FILE }}" up -d --build --no-deps $SERVICES
+
+          # Determine final service list
+          if [ "$DEPLOY_ALL" == "true" ]; then
+            # Get all service names from compose file
+            SERVICES=$(docker compose -f "$COMPOSE_FILE" --env-file "$ENV_FILE" config --services | tr '\n' ' ')
+            echo "=== Rebuilding ALL services ==="
+          elif [ -z "$SERVICES" ]; then
+            echo "No services to deploy"
+            echo "build-times=" >> $GITHUB_OUTPUT
+            exit 0
+          else
+            echo "=== Rebuilding: $SERVICES ==="
+          fi
+
+          # Build each service individually to capture build times
+          BUILD_TIMES=""
+          BUILD_FAILED=""
+          for svc in $SERVICES; do
+            echo "--- Building $svc ---"
+            build_start=$(date +%s)
+            # Keep building the remaining services, but remember the failure
+            if ! docker compose -f "$COMPOSE_FILE" --env-file "$ENV_FILE" build "$svc" 2>&1; then
+              BUILD_FAILED="$BUILD_FAILED $svc"
+            fi
+            build_end=$(date +%s)
+            build_dur=$(( build_end - build_start ))
+            BUILD_TIMES="$BUILD_TIMES $svc:$build_dur"
+            echo " $svc built in ${build_dur}s"
+          done
+
+          # Publish timings before any exit so the metrics step still sees them
+          echo "build-times=$BUILD_TIMES" >> $GITHUB_OUTPUT
+
+          # Do not start containers from stale images when a build failed
+          if [ -n "$BUILD_FAILED" ]; then
+            echo "::error::Build failed for:$BUILD_FAILED"
+            exit 1
+          fi
+
+          # Start all services at once (no rebuild, images already built)
+          echo "=== Starting services ==="
+          if [ "$DEPLOY_ALL" == "true" ]; then
+            docker compose -f "$COMPOSE_FILE" --env-file "$ENV_FILE" up -d
+          else
+            # $SERVICES is intentionally unquoted: one argument per service
+            docker compose -f "$COMPOSE_FILE" --env-file "$ENV_FILE" up -d --no-deps $SERVICES
+          fi
 
           echo "=== Waiting for services to start ==="
           sleep 10
 
+      # Polls every known health endpoint and outputs `health-results` as
+      # "svc:status:http_code:seconds ..." for the metrics and summary steps.
       - name: Health checks
+        id: health
+        run: |
+          cd "${{ env.PROJECT_DIR }}"
+          source scripts/deploy-metrics.sh
+
+          # Map of service -> health URL
+          declare -A HEALTH_URLS=(
+            ["mana-auth"]="http://localhost:3001/health"
+            ["matrix-web"]="http://localhost:5180/health"
+            ["chat-backend"]="http://localhost:3030/health"
+            ["chat-web"]="http://localhost:5010/health"
+            ["todo-backend"]="http://localhost:3031/health"
+            ["todo-web"]="http://localhost:5011/health"
+            ["calendar-backend"]="http://localhost:3032/health"
+            ["calendar-web"]="http://localhost:5012/health"
+            ["clock-backend"]="http://localhost:3033/health"
+            ["clock-web"]="http://localhost:5013/health"
+            ["contacts-backend"]="http://localhost:3034/health"
+            ["contacts-web"]="http://localhost:5014/health"
+          )
+
+          HEALTH_RESULTS=""
+          echo "=== Health Checks ==="
+
+          # Check all known health endpoints
+          for svc in "${!HEALTH_URLS[@]}"; do
+            url="${HEALTH_URLS[$svc]}"
+            result=$(check_health_timed "$svc" "$url" 2>/dev/null) || true
+            status=$(echo "$result" | awk '{print $1}')
+            elapsed=$(echo "$result" | awk '{print $2}')
+            http_code=$(echo "$result" | awk '{print $3}')
+
+            if [ -z "$status" ]; then
+              status="skipped"
+              elapsed="0"
+              http_code="0"
+            fi
+
+            if [ "$status" = "ok" ]; then
+              echo " ✓ $svc: OK (${elapsed}s)"
+            else
+              echo " ✗ $svc: $status (HTTP $http_code, ${elapsed}s)"
+            fi
+
+            HEALTH_RESULTS="$HEALTH_RESULTS $svc:$status:$http_code:$elapsed"
+          done
+
+          echo "health-results=$HEALTH_RESULTS" >> $GITHUB_OUTPUT
+
+      # Persists the run (DB rows + Pushgateway metrics), also after failures.
+      - name: Record deploy metrics
+        if: always()
+        run: |
+          cd "${{ env.PROJECT_DIR }}"
+          source scripts/deploy-metrics.sh
+
+          START_EPOCH="${{ steps.init.outputs.start_epoch }}"
+          NOW=$(date +%s)
+          # Without start_epoch (init failed), record 0s instead of an epoch-sized duration
+          DURATION=$(( NOW - ${START_EPOCH:-$NOW} ))
+
+          # Determine overall status
+          STATUS="success"
+          if [ "${{ job.status }}" != "success" ]; then
+            STATUS="failure"
+          fi
+
+          # Determine services list
+          DEPLOY_ALL="${{ steps.services.outputs.deploy-all }}"
+          SERVICES="${{ steps.services.outputs.services }}"
+          if [ "$DEPLOY_ALL" == "true" ]; then
+            SERVICES_CSV="all"
+          else
+            SERVICES_CSV=$(echo "$SERVICES" | tr ' ' ',')
+          fi
+
+          COMMIT_MSG=$(git log -1 --pretty=%s 2>/dev/null | head -c 200 || echo "unknown")
+          BRANCH="${{ github.ref_name }}"
+
+          # Insert deployment row
+          DEPLOY_ID=$(insert_deployment \
+            "${{ github.run_id }}" \
+            "${{ github.run_attempt }}" \
+            "${{ github.sha }}" \
+            "$COMMIT_MSG" \
+            "$BRANCH" \
+            "${{ github.event_name }}" \
+            "${{ github.actor }}" \
+            "$SERVICES_CSV" \
+            "$STATUS" 2>/dev/null) || DEPLOY_ID=""
+
+          if [ -n "$DEPLOY_ID" ]; then
+            # Finalise with duration
+            finalise_deployment "$DEPLOY_ID" "$STATUS" "$DURATION" 2>/dev/null || true
+
+            # Parse build times: "svc1:42 svc2:31"
+            BUILD_TIMES="${{ steps.build.outputs.build-times }}"
+            declare -A BUILD_DUR_MAP
+            for entry in $BUILD_TIMES; do
+              svc="${entry%%:*}"
+              dur="${entry#*:}"
+              BUILD_DUR_MAP["$svc"]="$dur"
+            done
+
+            # Parse health results: "svc1:ok:200:5.0 svc2:failed:503:30.0"
+            HEALTH_RESULTS="${{ steps.health.outputs.health-results }}"
+            declare -A HEALTH_MAP HTTP_MAP STARTUP_MAP
+            for entry in $HEALTH_RESULTS; do
+              svc=$(echo "$entry" | cut -d: -f1)
+              h_status=$(echo "$entry" | cut -d: -f2)
+              h_code=$(echo "$entry" | cut -d: -f3)
+              h_time=$(echo "$entry" | cut -d: -f4)
+              HEALTH_MAP["$svc"]="$h_status"
+              HTTP_MAP["$svc"]="$h_code"
+              STARTUP_MAP["$svc"]="$h_time"
+            done
+
+            # Combine: for each service that was built or health-checked, insert a row
+            ALL_SVCS=$(echo "$BUILD_TIMES $HEALTH_RESULTS" | tr ' ' '\n' | cut -d: -f1 | sort -u | tr '\n' ' ')
+            for svc in $ALL_SVCS; do
+              [ -z "$svc" ] && continue
+              build_dur="${BUILD_DUR_MAP[$svc]:-0}"
+              img_mb=$(get_image_size_mb "$svc" 2>/dev/null || echo "0")
+              startup="${STARTUP_MAP[$svc]:-0}"
+              health="${HEALTH_MAP[$svc]:-skipped}"
+              http_code="${HTTP_MAP[$svc]:-0}"
+
+              insert_deploy_service "$DEPLOY_ID" "$svc" "$build_dur" "$img_mb" "$startup" "$health" "$http_code" 2>/dev/null || true
+              push_service_metrics "$svc" "$build_dur" "$img_mb" "$health" 2>/dev/null || true
+            done
+          fi
+
+          # Push overall metrics to Pushgateway
+          push_deploy_metrics "$STATUS" "$DURATION" "$BRANCH" 2>/dev/null || true
+          echo "Deploy tracking recorded: status=$STATUS duration=${DURATION}s"
+
+      - name: Summary
+        if: always()
         run: |
           cd "${{ env.PROJECT_DIR }}"
-          check_health() {
-            local name=$1
-            local url=$2
-            local status=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 "$url" 2>/dev/null || echo "000")
-            if [ "$status" == "200" ]; then
-              echo " ✓ $name: OK"
-            else
-              echo " ✗ $name: FAILED (HTTP $status)"
-            fi
-          }
+          START_EPOCH="${{ steps.init.outputs.start_epoch }}"
+          NOW=$(date +%s)
+          DURATION=$(( NOW - ${START_EPOCH:-$NOW} ))
 
-          echo "=== Health Checks ==="
-          check_health "Auth API" "http://localhost:3001/health"
-          check_health "Matrix Web" "http://localhost:5180/health"
-          check_health "Chat Backend" "http://localhost:3030/health"
-          check_health "Chat Web" "http://localhost:5010/health"
-          check_health "Todo Backend" "http://localhost:3031/health"
-          check_health "Todo Web" "http://localhost:5011/health"
-          check_health "Calendar Backend" "http://localhost:3032/health"
-          check_health "Calendar Web" "http://localhost:5012/health"
-          check_health "Clock Backend" "http://localhost:3033/health"
-          check_health "Clock Web" "http://localhost:5013/health"
-
-      - name: Summary
-        run: |
           echo "## Deployment Summary" >> $GITHUB_STEP_SUMMARY
           echo "" >> $GITHUB_STEP_SUMMARY
           echo "**Trigger:** ${{ github.event_name }}" >> $GITHUB_STEP_SUMMARY
-          echo "**Commit:** ${{ github.sha }}" >> $GITHUB_STEP_SUMMARY
+          echo "**Commit:** \`${{ github.sha }}\`" >> $GITHUB_STEP_SUMMARY
+          echo "**Duration:** ${DURATION}s" >> $GITHUB_STEP_SUMMARY
+          echo "**Status:** ${{ job.status }}" >> $GITHUB_STEP_SUMMARY
+          echo "" >> $GITHUB_STEP_SUMMARY
+
           if [ "${{ steps.services.outputs.deploy-all }}" == "true" ]; then
             echo "**Services:** All" >> $GITHUB_STEP_SUMMARY
           else
             echo "**Services:** ${{ steps.services.outputs.services }}" >> $GITHUB_STEP_SUMMARY
           fi
+
+          # Build times table
+          BUILD_TIMES="${{ steps.build.outputs.build-times }}"
+          if [ -n "$BUILD_TIMES" ]; then
+            echo "" >> $GITHUB_STEP_SUMMARY
+            echo "### Build Times" >> $GITHUB_STEP_SUMMARY
+            echo "| Service | Duration |" >> $GITHUB_STEP_SUMMARY
+            echo "|---------|----------|" >> $GITHUB_STEP_SUMMARY
+            for entry in $BUILD_TIMES; do
+              svc="${entry%%:*}"
+              dur="${entry#*:}"
+              echo "| $svc | ${dur}s |" >> $GITHUB_STEP_SUMMARY
+            done
+          fi
+
+          # Health results table
+          HEALTH_RESULTS="${{ steps.health.outputs.health-results }}"
+          if [ -n "$HEALTH_RESULTS" ]; then
+            echo "" >> $GITHUB_STEP_SUMMARY
+            echo "### Health Checks" >> $GITHUB_STEP_SUMMARY
+            echo "| Service | Status | HTTP | Startup |" >> $GITHUB_STEP_SUMMARY
+            echo "|---------|--------|------|---------|" >> $GITHUB_STEP_SUMMARY
+            for entry in $HEALTH_RESULTS; do
+              svc=$(echo "$entry" | cut -d: -f1)
+              h_status=$(echo "$entry" | cut -d: -f2)
+              h_code=$(echo "$entry" | cut -d: -f3)
+              h_time=$(echo "$entry" | cut -d: -f4)
+              icon="✓"
+              [ "$h_status" != "ok" ] && icon="✗"
+              echo "| $svc | $icon $h_status | $h_code | ${h_time}s |" >>
$GITHUB_STEP_SUMMARY + done + fi diff --git a/docker/grafana/dashboards/deploy-tracking.json b/docker/grafana/dashboards/deploy-tracking.json new file mode 100644 index 000000000..47b0f4796 --- /dev/null +++ b/docker/grafana/dashboards/deploy-tracking.json @@ -0,0 +1,487 @@ +{ + "annotations": { "list": [] }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 100, + "panels": [], + "title": "Status Overview", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 3600 }, + { "color": "red", "value": 86400 } + ] + }, + "unit": "s" + } + }, + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "none", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "targets": [ + { + "expr": "time() - deploy_last_timestamp_seconds{branch=\"main\"}", + "legendFormat": "" + } + ], + "title": "Last Deploy", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "0": { "color": "red", "text": "FAILED" }, + "1": { "color": "green", "text": "OK" } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + } + }, + "gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "targets": [ + { + "expr": "deploy_last_status{branch=\"main\"}", + "legendFormat": "" + } + ], + "title": "Status", + "type": "stat" + }, + { + "datasource": { "type": 
"prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 300 }, + { "color": "red", "value": 600 } + ] + }, + "unit": "s" + } + }, + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "none", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "targets": [ + { + "expr": "deploy_last_duration_seconds{branch=\"main\"}", + "legendFormat": "" + } + ], + "title": "Duration", + "type": "stat" + }, + { + "datasource": { "type": "postgres", "uid": "deploy-tracking" }, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "blue", "value": null }] + } + } + }, + "gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "none", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "targets": [ + { + "datasource": { "type": "postgres", "uid": "deploy-tracking" }, + "format": "table", + "rawQuery": true, + "rawSql": "SELECT COUNT(*) AS \"Deploys\" FROM deploy_tracking.deployments WHERE started_at > NOW() - INTERVAL '30 days';" + } + ], + "title": "Deploys (30d)", + "type": "stat" + }, + { + "datasource": { "type": "postgres", "uid": "deploy-tracking" }, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 0.8 }, + { "color": "green", "value": 0.95 } + ] + } + } + }, + "gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "none", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "targets": [ + { + "datasource": { "type": "postgres", "uid": "deploy-tracking" }, + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CASE WHEN COUNT(*) = 0 THEN 0 ELSE COUNT(*) FILTER 
(WHERE status = 'success')::float / COUNT(*)::float END AS \"Rate\" FROM deploy_tracking.deployments WHERE started_at > NOW() - INTERVAL '30 days';" + } + ], + "title": "Success Rate (30d)", + "type": "stat" + }, + { + "datasource": { "type": "postgres", "uid": "deploy-tracking" }, + "fieldConfig": { + "defaults": { + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 300 }, + { "color": "red", "value": 600 } + ] + } + } + }, + "gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "none", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "targets": [ + { + "datasource": { "type": "postgres", "uid": "deploy-tracking" }, + "format": "table", + "rawQuery": true, + "rawSql": "SELECT COALESCE(AVG(duration_s), 0) AS \"Avg\" FROM deploy_tracking.deployments WHERE status = 'success' AND started_at > NOW() - INTERVAL '30 days';" + } + ], + "title": "Avg Duration (30d)", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, + "id": 101, + "panels": [], + "title": "Deploy Frequency", + "type": "row" + }, + { + "datasource": { "type": "postgres", "uid": "deploy-tracking" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "drawStyle": "bars", + "fillOpacity": 80, + "stacking": { "mode": "normal" } + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "success" }, + "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] + }, + { + "matcher": { "id": "byName", "options": "failure" }, + "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] + } + ] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }, + "id": 7, + "options": { "legend": { "displayMode": "list" }, "tooltip": { "mode": "multi" } }, + "targets": [ + { + "datasource": { "type": "postgres", "uid": 
"deploy-tracking" }, + "format": "time_series", + "rawQuery": true, + "rawSql": "SELECT date_trunc('day', started_at) AS time, status AS metric, COUNT(*) AS value FROM deploy_tracking.deployments WHERE $__timeFilter(started_at) GROUP BY 1, 2 ORDER BY 1;" + } + ], + "title": "Deploys per Day", + "type": "timeseries" + }, + { + "datasource": { "type": "postgres", "uid": "deploy-tracking" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "blue", "value": null }] + } + } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }, + "id": 8, + "options": { "orientation": "horizontal" }, + "targets": [ + { + "datasource": { "type": "postgres", "uid": "deploy-tracking" }, + "format": "table", + "rawQuery": true, + "rawSql": "SELECT s AS \"Service\", COUNT(*) AS \"Deploys\" FROM deploy_tracking.deployments, unnest(services) AS s WHERE started_at > NOW() - INTERVAL '30 days' GROUP BY s ORDER BY COUNT(*) DESC;" + } + ], + "title": "Deploys per Service (30d)", + "type": "barchart" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 }, + "id": 102, + "panels": [], + "title": "Build Performance", + "type": "row" + }, + { + "datasource": { "type": "postgres", "uid": "deploy-tracking" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "unit": "s", + "custom": { "drawStyle": "line", "pointSize": 5, "showPoints": "auto" } + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 15 }, + "id": 9, + "options": { "legend": { "displayMode": "list" }, "tooltip": { "mode": "multi" } }, + "targets": [ + { + "datasource": { "type": "postgres", "uid": "deploy-tracking" }, + "format": "time_series", + "rawQuery": true, + "rawSql": "SELECT d.started_at AS time, ds.service_name AS metric, ds.build_duration_s AS value FROM deploy_tracking.deploy_services ds JOIN deploy_tracking.deployments d ON d.id = ds.deployment_id WHERE $__timeFilter(d.started_at) 
ORDER BY d.started_at;" + } + ], + "title": "Build Duration Trend", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "unit": "decmbytes", + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 500 }, + { "color": "red", "value": 1000 } + ] + } + } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 15 }, + "id": 10, + "options": { "orientation": "horizontal" }, + "targets": [ + { + "expr": "deploy_service_image_size_mb", + "legendFormat": "{{service}}" + } + ], + "title": "Image Sizes", + "type": "barchart" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 }, + "id": 103, + "panels": [], + "title": "Startup & Health", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "0": { "color": "red", "text": "UNHEALTHY" }, + "1": { "color": "green", "text": "HEALTHY" } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + } + }, + "gridPos": { "h": 4, "w": 12, "x": 0, "y": 24 }, + "id": 11, + "options": { + "colorMode": "background", + "graphMode": "none", + "orientation": "horizontal", + "reduceOptions": { "calcs": ["lastNotNull"] } + }, + "targets": [ + { + "expr": "deploy_service_healthy", + "legendFormat": "{{service}}" + } + ], + "title": "Service Health", + "type": "stat" + }, + { + "datasource": { "type": "postgres", "uid": "deploy-tracking" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "unit": "s", + "custom": { "drawStyle": "line", "pointSize": 5, "showPoints": "auto" } + } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, + "id": 12, + "options": { "legend": { "displayMode": "list" 
}, "tooltip": { "mode": "multi" } }, + "targets": [ + { + "datasource": { "type": "postgres", "uid": "deploy-tracking" }, + "format": "time_series", + "rawQuery": true, + "rawSql": "SELECT d.started_at AS time, ds.service_name AS metric, ds.startup_time_s AS value FROM deploy_tracking.deploy_services ds JOIN deploy_tracking.deployments d ON d.id = ds.deployment_id WHERE ds.startup_time_s IS NOT NULL AND $__timeFilter(d.started_at) ORDER BY d.started_at;" + } + ], + "title": "Startup Time Trend", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 32 }, + "id": 104, + "panels": [], + "title": "Deploy History", + "type": "row" + }, + { + "datasource": { "type": "postgres", "uid": "deploy-tracking" }, + "fieldConfig": { + "defaults": {}, + "overrides": [ + { + "matcher": { "id": "byName", "options": "status" }, + "properties": [ + { + "id": "mappings", + "value": [ + { + "options": { + "failure": { "color": "red", "text": "FAILED" }, + "success": { "color": "green", "text": "OK" }, + "running": { "color": "yellow", "text": "RUNNING" } + }, + "type": "value" + } + ] + } + ] + }, + { + "matcher": { "id": "byName", "options": "duration_s" }, + "properties": [{ "id": "unit", "value": "s" }] + } + ] + }, + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 33 }, + "id": 13, + "options": { "showHeader": true, "sortBy": [{ "desc": true, "displayName": "started_at" }] }, + "targets": [ + { + "datasource": { "type": "postgres", "uid": "deploy-tracking" }, + "format": "table", + "rawQuery": true, + "rawSql": "SELECT started_at, commit_sha, commit_message, deployer, array_to_string(services, ', ') AS services, status, duration_s FROM deploy_tracking.deployments ORDER BY started_at DESC LIMIT 50;" + } + ], + "title": "Recent Deploys", + "type": "table" + } + ], + "schemaVersion": 39, + "tags": ["deploy", "ci-cd"], + "templating": { + "list": [ + { + "current": { "selected": false, "text": "Prometheus", "value": "Prometheus" }, + "hide": 0, 
+        "includeAll": false,
+        "label": "Datasource",
+        "multi": false,
+        "name": "datasource",
+        "options": [],
+        "query": "prometheus",
+        "refresh": 1,
+        "type": "datasource"
+      }
+    ]
+  },
+  "time": { "from": "now-30d", "to": "now" },
+  "timepicker": {},
+  "timezone": "browser",
+  "title": "Deploy Tracking",
+  "uid": "deploy-tracking",
+  "version": 1
+}
diff --git a/docker/grafana/provisioning/datasources/deploy-tracking.yml b/docker/grafana/provisioning/datasources/deploy-tracking.yml
new file mode 100644
index 000000000..3386f999a
--- /dev/null
+++ b/docker/grafana/provisioning/datasources/deploy-tracking.yml
@@ -0,0 +1,25 @@
+# Deploy Tracking PostgreSQL Datasource
+# Queries deploy_tracking schema in the mana database
+
+apiVersion: 1
+
+datasources:
+  - name: DeployTracking
+    # The deploy-tracking dashboard references this datasource by uid
+    uid: deploy-tracking
+    type: postgres
+    access: proxy
+    url: postgres:5432
+    user: postgres
+    secureJsonData:
+      password: ${POSTGRES_PASSWORD}
+    jsonData:
+      database: mana
+      sslmode: disable
+      maxOpenConns: 5
+      maxIdleConns: 2
+      connMaxLifetime: 14400
+      postgresVersion: 1600
+      timescaledb: false
+    isDefault: false
+    editable: true
diff --git a/docker/prometheus/prometheus.yml b/docker/prometheus/prometheus.yml
index a53e9d4d9..9c317d4d8 100644
--- a/docker/prometheus/prometheus.yml
+++ b/docker/prometheus/prometheus.yml
@@ -111,3 +111,11 @@ scrape_configs:
   #     - targets: ['nutriphi-backend:3037']
   #     metrics_path: '/metrics'
   #     scrape_interval: 30s
+
+  # ============================================
+  # Pushgateway (deploy metrics, batch jobs)
+  # ============================================
+  - job_name: 'pushgateway'
+    honor_labels: true
+    static_configs:
+      - targets: ['pushgateway:9091']
diff --git a/scripts/deploy-metrics.sh b/scripts/deploy-metrics.sh
new file mode 100755
index 000000000..03ef598fa
--- /dev/null
+++ b/scripts/deploy-metrics.sh
@@ -0,0 +1,172 @@
+#!/usr/bin/env bash
+# Deploy Metrics Library
+# Source this file in CI/CD: source scripts/deploy-metrics.sh
+#
+# Provides functions for timing, DB inserts, and Pushgateway pushes.
+#
+# NOTE: because this file is sourced, the `set -euo pipefail` below also
+# applies to the calling shell. The Docker helpers read COMPOSE_FILE and
+# ENV_FILE from the caller's environment.
+
+set -euo pipefail
+
+DEPLOY_START_EPOCH=""
+PUSHGATEWAY_URL="http://localhost:9091"
+PSQL_CMD="docker exec -i mana-infra-postgres psql -U postgres -d mana -tAq"
+
+# ── Timing ──────────────────────────────────────────────────
+
+# Store the current Unix time (seconds) in DEPLOY_START_EPOCH.
+deploy_timer_start() {
+  DEPLOY_START_EPOCH=$(date +%s)
+}
+
+# Print the whole seconds elapsed since deploy_timer_start.
+# Prints 0 (not an epoch-sized number) when the timer was never started in
+# this shell; every `source` of this file resets DEPLOY_START_EPOCH to "".
+deploy_timer_elapsed() {
+  local now
+  now=$(date +%s)
+  echo $(( now - ${DEPLOY_START_EPOCH:-$now} ))
+}
+
+# ── Docker helpers ──────────────────────────────────────────
+
+# Get image size in MB for a compose service
+# Usage: get_image_size_mb SERVICE
+# Output: size in MB as printed by bc (scale=2), or "0" if it cannot be resolved
+get_image_size_mb() {
+  local service="$1"
+  local image_id size_bytes
+  # `images -q` may print more than one ID; inspect only the first.
+  image_id=$(docker compose -f "$COMPOSE_FILE" --env-file "$ENV_FILE" images "$service" -q 2>/dev/null | head -n 1) || image_id=""
+  if [ -z "$image_id" ]; then
+    echo "0"
+    return 0
+  fi
+  size_bytes=$(docker image inspect "$image_id" --format='{{.Size}}' 2>/dev/null || echo "0")
+  # Guard bc against an empty or non-numeric operand.
+  case "$size_bytes" in
+    '' | *[!0-9]*) size_bytes=0 ;;
+  esac
+  echo "scale=2; $size_bytes / 1048576" | bc 2>/dev/null || echo "0"
+}
+
+# Health check with retry and timing
+# Usage: check_health_timed SERVICE URL
+# Output: "STATUS ELAPSED HTTP_CODE" (e.g. "ok 4.0 200"); STATUS is ok or failed
+# Returns: 0 once URL answers HTTP 200, 1 when about 30s pass without one
+check_health_timed() {
+  local service="$1"
+  local url="$2"
+  local timeout=30
+  local interval=2
+  local start http_code elapsed
+
+  start=$(date +%s)
+  while true; do
+    # curl already prints "000" for %{http_code} when the request fails, so
+    # only fall back when it printed nothing (avoids a doubled "000000").
+    http_code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 3 "$url" 2>/dev/null) || http_code="${http_code:-000}"
+    elapsed=$(( $(date +%s) - start ))
+
+    if [ "$http_code" = "200" ]; then
+      echo "ok ${elapsed}.0 $http_code"
+      return 0
+    fi
+
+    if [ "$elapsed" -ge "$timeout" ]; then
+      echo "failed ${elapsed}.0 $http_code"
+      return 1
+    fi
+
+    sleep "$interval"
+  done
+}
+
+# ── Database inserts ────────────────────────────────────────
+
+# Ensure schema exists (idempotent guard)
+# Only the schema itself is created here, not its tables. Best-effort:
+# errors are discarded and the function always succeeds.
+ensure_deploy_schema() {
+  $PSQL_CMD -c "CREATE SCHEMA IF NOT EXISTS deploy_tracking;" 2>/dev/null || true
+}
+
+# Insert a deployment row, returns the new id
+# Usage: insert_deployment RUN_ID RUN_ATTEMPT COMMIT_SHA COMMIT_MESSAGE BRANCH TRIGGER DEPLOYER SERVICES_CSV STATUS
+insert_deployment() {
+  local run_id="$1" run_attempt="$2" commit_sha="$3" commit_message="$4"
+  local branch="$5" trigger="$6" deployer="$7" services_csv="$8" status="$9"
+
+  # Convert
comma-separated to PostgreSQL array literal + local pg_array + pg_array=$(echo "$services_csv" | sed "s/,/','/g") + + $PSQL_CMD < +finalise_deployment() { + local id="$1" status="$2" duration_s="$3" + $PSQL_CMD < +insert_deploy_service() { + local dep_id="$1" svc="$2" build_dur="$3" img_mb="$4" startup="$5" health="$6" http_code="$7" + $PSQL_CMD < +push_deploy_metrics() { + local status="$1" duration_s="$2" branch="$3" + local status_val=0 + [ "$status" = "success" ] && status_val=1 + +cat < +push_service_metrics() { + local svc="$1" build_dur="$2" img_mb="$3" healthy="$4" + local healthy_val=0 + [ "$healthy" = "ok" ] && healthy_val=1 + +cat <