From ac663a6c911214a3ca201fb2f34fcb235881d217 Mon Sep 17 00:00:00 2001 From: Till-JS <101404291+Till-JS@users.noreply.github.com> Date: Sun, 25 Jan 2026 14:01:11 +0100 Subject: [PATCH] chore: remove staging/Hetzner infra, add Watchtower auto-deploy - Remove old Hetzner deployment workflows (cd-staging, cd-production) - Remove staging docker-compose files - Remove outdated staging/Hetzner documentation - Add Watchtower to docker-compose.macmini.yml for auto-updates - Update CLAUDE.md with Mac Mini server access - Simplify docs/DEPLOYMENT.md for new architecture Production now runs on Mac Mini with automatic deployments via Watchtower. Co-Authored-By: Claude --- .github/workflows/cd-production.yml | 389 --- .github/workflows/cd-staging-tagged.yml | 555 ---- .github/workflows/cd-staging.yml | 371 --- .github/workflows/cd-staging.yml.bak | 264 -- .github/workflows/ci-main.yml.bak | 168 -- .github/workflows/ci-pull-request.yml.bak | 314 --- .github/workflows/dependency-update.yml.bak | 249 -- .github/workflows/staging-config-check.yml | 103 - .github/workflows/test-coverage.yml.bak | 180 -- .github/workflows/test.yml.bak | 389 --- CLAUDE.md | 28 +- docker-compose.macmini.yml | 26 + docker-compose.production.yml | 429 --- docker-compose.staging.full.yml | 290 -- docker-compose.staging.yml | 421 --- docs/CI_CD_SETUP.md | 522 ---- docs/DEPLOYMENT.md | 780 +---- docs/DEPLOYMENT_ARCHITECTURE.md | 2816 ------------------- docs/DEPLOYMENT_DIAGRAMS.md | 949 ------- docs/DEPLOYMENT_HETZNER.md | 602 ---- docs/DEPLOYMENT_RUNBOOKS.md | 1314 --------- docs/DOCKER_SETUP_ANALYSIS.md | 750 ----- docs/HETZNER_DEPLOYMENT_SUMMARY.md | 625 ---- docs/HETZNER_PRODUCTION_GUIDE.md | 2007 ------------- docs/PRODUCTION_LAUNCH.md | 296 -- docs/STAGING_DEPLOYMENT_ISSUES.md | 408 --- docs/STAGING_SETUP.md | 441 --- 27 files changed, 104 insertions(+), 15582 deletions(-) delete mode 100644 .github/workflows/cd-production.yml delete mode 100644 .github/workflows/cd-staging-tagged.yml delete mode 
100644 .github/workflows/cd-staging.yml delete mode 100644 .github/workflows/cd-staging.yml.bak delete mode 100644 .github/workflows/ci-main.yml.bak delete mode 100644 .github/workflows/ci-pull-request.yml.bak delete mode 100644 .github/workflows/dependency-update.yml.bak delete mode 100644 .github/workflows/staging-config-check.yml delete mode 100644 .github/workflows/test-coverage.yml.bak delete mode 100644 .github/workflows/test.yml.bak delete mode 100644 docker-compose.production.yml delete mode 100644 docker-compose.staging.full.yml delete mode 100644 docker-compose.staging.yml delete mode 100644 docs/CI_CD_SETUP.md delete mode 100644 docs/DEPLOYMENT_ARCHITECTURE.md delete mode 100644 docs/DEPLOYMENT_DIAGRAMS.md delete mode 100644 docs/DEPLOYMENT_HETZNER.md delete mode 100644 docs/DEPLOYMENT_RUNBOOKS.md delete mode 100644 docs/DOCKER_SETUP_ANALYSIS.md delete mode 100644 docs/HETZNER_DEPLOYMENT_SUMMARY.md delete mode 100644 docs/HETZNER_PRODUCTION_GUIDE.md delete mode 100644 docs/PRODUCTION_LAUNCH.md delete mode 100644 docs/STAGING_DEPLOYMENT_ISSUES.md delete mode 100644 docs/STAGING_SETUP.md diff --git a/.github/workflows/cd-production.yml b/.github/workflows/cd-production.yml deleted file mode 100644 index 564aa0c84..000000000 --- a/.github/workflows/cd-production.yml +++ /dev/null @@ -1,389 +0,0 @@ -# Production Deployment -# -# Triggered by: -# - Manual only (workflow_dispatch with confirmation) -# -# Flow: dev (staging) → main (production) -# Requires typing "deploy" to confirm -name: CD - Production Deployment - -on: - workflow_dispatch: - inputs: - service: - description: 'Service to deploy' - required: true - type: choice - options: - - all - - mana-core-auth - - maerchenzauber-backend - - chat-backend - - manadeck-backend - - nutriphi-backend - - news-api - environment: - description: 'Deployment environment' - required: true - type: choice - options: - - production - confirm: - description: 'Type "deploy" to confirm production deployment' - required: 
true - type: string - -env: - NODE_VERSION: '20' - PNPM_VERSION: '9.15.0' - -jobs: - validate-deployment: - name: Validate Deployment Request - runs-on: ubuntu-latest - steps: - - name: Validate confirmation - run: | - if [ "${{ github.event.inputs.confirm }}" != "deploy" ]; then - echo "❌ Deployment not confirmed. Please type 'deploy' to confirm." - exit 1 - fi - echo "✅ Deployment confirmed" - - - name: Validate branch - run: | - if [ "${{ github.ref }}" != "refs/heads/main" ]; then - echo "❌ Production deployments must be from main branch" - exit 1 - fi - echo "✅ Deploying from main branch" - - - name: Check recent commits - uses: actions/checkout@v4 - with: - fetch-depth: 10 - - - name: Verify recent CI passes - run: | - echo "Checking recent CI status..." - # This would check recent CI runs, simplified for now - echo "✅ Recent CI checks verified" - - # Request manual approval for production - request-approval: - name: Request Production Approval - runs-on: ubuntu-latest - needs: validate-deployment - environment: - name: production-approval - steps: - - name: Approval granted - run: | - echo "## Production Deployment Approved" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "- **Approved by**: ${{ github.actor }}" >> $GITHUB_STEP_SUMMARY - echo "- **Service**: ${{ github.event.inputs.service }}" >> $GITHUB_STEP_SUMMARY - echo "- **Timestamp**: $(date -u +'%Y-%m-%d %H:%M:%S UTC')" >> $GITHUB_STEP_SUMMARY - - # Create deployment backup - create-backup: - name: Create Production Backup - runs-on: ubuntu-latest - needs: request-approval - environment: - name: production - steps: - - name: Setup SSH - uses: webfactory/ssh-agent@v0.9.0 - with: - ssh-private-key: ${{ secrets.PRODUCTION_SSH_KEY }} - - - name: Add production server to known hosts - run: | - mkdir -p ~/.ssh - ssh-keyscan -H ${{ secrets.PRODUCTION_HOST }} >> ~/.ssh/known_hosts - - - name: Create database backup - run: | - ssh ${{ secrets.PRODUCTION_USER }}@${{ secrets.PRODUCTION_HOST }} 
<< 'EOF' - cd ~/manacore-production - - # Backup timestamp - TIMESTAMP=$(date +%Y%m%d_%H%M%S) - BACKUP_DIR="backups/$TIMESTAMP" - mkdir -p $BACKUP_DIR - - # Backup PostgreSQL - docker compose exec -T postgres pg_dumpall -U $POSTGRES_USER > $BACKUP_DIR/postgres_backup.sql - - # Backup Redis (if applicable) - docker compose exec -T redis redis-cli SAVE || echo "Redis backup skipped" - - # Backup docker-compose and env files - cp docker-compose.yml $BACKUP_DIR/ - cp .env $BACKUP_DIR/.env.backup - - echo "Backup created at: $BACKUP_DIR" - ls -lh $BACKUP_DIR/ - EOF - - - name: Tag current deployment - run: | - ssh ${{ secrets.PRODUCTION_USER }}@${{ secrets.PRODUCTION_HOST }} << 'EOF' - cd ~/manacore-production - docker compose images > deployment_images.txt - echo "Current deployment tagged: $(date -u +'%Y-%m-%d %H:%M:%S UTC')" - EOF - - # Deploy to production - deploy-production: - name: Deploy to Production - runs-on: ubuntu-latest - needs: create-backup - environment: - name: production - url: https://api.manacore.app - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Setup SSH - uses: webfactory/ssh-agent@v0.9.0 - with: - ssh-private-key: ${{ secrets.PRODUCTION_SSH_KEY }} - - - name: Add production server to known hosts - run: | - mkdir -p ~/.ssh - ssh-keyscan -H ${{ secrets.PRODUCTION_HOST }} >> ~/.ssh/known_hosts - - - name: Copy deployment files - run: | - scp docker-compose.production.yml ${{ secrets.PRODUCTION_USER }}@${{ secrets.PRODUCTION_HOST }}:~/manacore-production/docker-compose.yml - - - name: Update environment variables - run: | - # Create production env file from secrets - cat > .env.production << EOF - # Database - POSTGRES_HOST=${{ secrets.PRODUCTION_POSTGRES_HOST }} - POSTGRES_PORT=${{ secrets.PRODUCTION_POSTGRES_PORT }} - POSTGRES_DB=${{ secrets.PRODUCTION_POSTGRES_DB }} - POSTGRES_USER=${{ secrets.PRODUCTION_POSTGRES_USER }} - POSTGRES_PASSWORD=${{ secrets.PRODUCTION_POSTGRES_PASSWORD }} - - # Redis - REDIS_HOST=${{ 
secrets.PRODUCTION_REDIS_HOST }} - REDIS_PORT=${{ secrets.PRODUCTION_REDIS_PORT }} - REDIS_PASSWORD=${{ secrets.PRODUCTION_REDIS_PASSWORD }} - - # Mana Core Auth - MANA_SERVICE_URL=${{ secrets.PRODUCTION_MANA_SERVICE_URL }} - JWT_SECRET=${{ secrets.PRODUCTION_JWT_SECRET }} - JWT_PUBLIC_KEY=${{ secrets.PRODUCTION_JWT_PUBLIC_KEY }} - JWT_PRIVATE_KEY=${{ secrets.PRODUCTION_JWT_PRIVATE_KEY }} - - # Supabase - SUPABASE_URL=${{ secrets.PRODUCTION_SUPABASE_URL }} - SUPABASE_ANON_KEY=${{ secrets.PRODUCTION_SUPABASE_ANON_KEY }} - SUPABASE_SERVICE_ROLE_KEY=${{ secrets.PRODUCTION_SUPABASE_SERVICE_ROLE_KEY }} - - # Azure OpenAI - AZURE_OPENAI_ENDPOINT=${{ secrets.PRODUCTION_AZURE_OPENAI_ENDPOINT }} - AZURE_OPENAI_API_KEY=${{ secrets.PRODUCTION_AZURE_OPENAI_API_KEY }} - AZURE_OPENAI_API_VERSION=2024-12-01-preview - - # Environment - NODE_ENV=production - EOF - - scp .env.production ${{ secrets.PRODUCTION_USER }}@${{ secrets.PRODUCTION_HOST }}:~/manacore-production/.env - rm .env.production - - - name: Pull latest images - run: | - ssh ${{ secrets.PRODUCTION_USER }}@${{ secrets.PRODUCTION_HOST }} << 'EOF' - cd ~/manacore-production - docker compose pull - EOF - - - name: Run database migrations - run: | - ssh ${{ secrets.PRODUCTION_USER }}@${{ secrets.PRODUCTION_HOST }} << 'EOF' - cd ~/manacore-production - - echo "=== Running Database Migrations ===" - echo "" - - # Migration function with retry logic - run_migration() { - local service=$1 - local max_attempts=3 - local timeout=300 # 5 minutes - local attempt=1 - - while [ $attempt -le $max_attempts ]; do - echo "[$service] Migration attempt $attempt/$max_attempts..." - - # Run migration with timeout using a temporary container - if timeout $timeout docker compose run --rm $service pnpm run db:migrate 2>&1; then - echo "✅ [$service] Migration succeeded" - return 0 - else - exit_code=$? 
- if [ $exit_code -eq 124 ]; then - echo "⚠️ [$service] Migration timeout after ${timeout}s" - else - echo "⚠️ [$service] Migration failed with exit code $exit_code" - fi - - attempt=$((attempt + 1)) - if [ $attempt -le $max_attempts ]; then - wait_time=$((10 * attempt)) # Backoff: 10s, 20s, 30s - echo " Waiting ${wait_time}s before retry..." - sleep $wait_time - fi - fi - done - - echo "❌ [$service] Migration failed after $max_attempts attempts" - return 1 - } - - # Run migrations for mana-core-auth (central auth service) - run_migration mana-core-auth || { - echo "❌ mana-core-auth migration failed" - echo "⚠️ Continuing with deployment - manual migration may be required" - } - - echo "" - echo "✅ Migration step completed" - EOF - - - name: Deploy with zero-downtime - run: | - SERVICE="${{ github.event.inputs.service }}" - - ssh ${{ secrets.PRODUCTION_USER }}@${{ secrets.PRODUCTION_HOST }} << EOF - cd ~/manacore-production - - if [ "$SERVICE" == "all" ]; then - # Rolling update for all services - for service in mana-core-auth maerchenzauber-backend chat-backend manadeck-backend nutriphi-backend news-api; do - echo "Deploying \$service..." - docker compose up -d --no-deps --scale \$service=2 \$service - sleep 10 - docker compose up -d --no-deps --scale \$service=1 \$service - done - else - # Single service deployment - echo "Deploying $SERVICE..." - docker compose up -d --no-deps $SERVICE - fi - - # Cleanup old images - docker image prune -f - EOF - - - name: Verify deployment - run: | - # Wait for services to stabilize - sleep 30 - - SERVICES=( - "mana-core-auth:3001:/api/v1/health" - "maerchenzauber-backend:3002:/health" - "chat-backend:3002:/api/health" - ) - - for SERVICE_CONFIG in "${SERVICES[@]}"; do - IFS=':' read -r SERVICE PORT PATH <<< "$SERVICE_CONFIG" - - echo "Verifying $SERVICE..." 
- ssh ${{ secrets.PRODUCTION_USER }}@${{ secrets.PRODUCTION_HOST }} << EOF - HEALTH=\$(docker compose -f ~/manacore-production/docker-compose.yml exec -T $SERVICE wget -q -O - http://localhost:$PORT$PATH || echo "FAILED") - - if [[ "\$HEALTH" == *"FAILED"* ]]; then - echo "❌ Health check failed for $SERVICE" - docker compose -f ~/manacore-production/docker-compose.yml logs --tail=100 $SERVICE - exit 1 - else - echo "✅ Health check passed for $SERVICE" - fi - EOF - done - - - name: Monitor for 5 minutes - run: | - echo "Monitoring services for 5 minutes..." - for i in {1..5}; do - echo "Check $i/5..." - sleep 60 - ssh ${{ secrets.PRODUCTION_USER }}@${{ secrets.PRODUCTION_HOST }} << 'EOF' - cd ~/manacore-production - docker compose ps - EOF - done - echo "✅ Monitoring complete - services stable" - - # Post-deployment verification - post-deployment-checks: - name: Post-Deployment Checks - runs-on: ubuntu-latest - needs: deploy-production - steps: - - name: Run smoke tests - run: | - # Test key endpoints - ENDPOINTS=( - "${{ secrets.PRODUCTION_API_URL }}/api/v1/health" - "${{ secrets.PRODUCTION_API_URL }}/health" - ) - - for ENDPOINT in "${ENDPOINTS[@]}"; do - echo "Testing: $ENDPOINT" - RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" $ENDPOINT) - - if [ "$RESPONSE" -eq 200 ]; then - echo "✅ $ENDPOINT is healthy" - else - echo "❌ $ENDPOINT returned $RESPONSE" - exit 1 - fi - done - - - name: Deployment summary - run: | - echo "## Production Deployment Summary" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "- **Environment**: Production" >> $GITHUB_STEP_SUMMARY - echo "- **Deployed by**: ${{ github.actor }}" >> $GITHUB_STEP_SUMMARY - echo "- **Service**: ${{ github.event.inputs.service }}" >> $GITHUB_STEP_SUMMARY - echo "- **Commit**: ${{ github.sha }}" >> $GITHUB_STEP_SUMMARY - echo "- **Timestamp**: $(date -u +'%Y-%m-%d %H:%M:%S UTC')" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "### Deployment Status" >> $GITHUB_STEP_SUMMARY 
- echo "✅ All services deployed and verified successfully" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "### Backup Information" >> $GITHUB_STEP_SUMMARY - echo "Pre-deployment backup created and stored" >> $GITHUB_STEP_SUMMARY - - # Notify team - notify-deployment: - name: Notify Team - runs-on: ubuntu-latest - needs: post-deployment-checks - if: always() - steps: - - name: Deployment notification - run: | - STATUS="${{ needs.post-deployment-checks.result }}" - - if [ "$STATUS" == "success" ]; then - echo "✅ Production deployment completed successfully" - echo "Service: ${{ github.event.inputs.service }}" - else - echo "❌ Production deployment failed" - echo "Please check logs and consider rollback" - exit 1 - fi diff --git a/.github/workflows/cd-staging-tagged.yml b/.github/workflows/cd-staging-tagged.yml deleted file mode 100644 index 0f7db0fb5..000000000 --- a/.github/workflows/cd-staging-tagged.yml +++ /dev/null @@ -1,555 +0,0 @@ -name: CD - Staging (Tagged Releases) - -on: - push: - tags: - # Pattern: {project}-staging-v{version} or {project}-v{version}-staging - # Examples: chat-staging-v1.0.0, picture-v2.1.0-staging, mana-core-auth-staging-v1.0.0 - # For multi-app: chat-all-staging-v1.0.0 (deploys backend + web + landing) - - '*-staging-v*' - - '*-v*-staging' - workflow_dispatch: - inputs: - project: - description: 'Project to deploy' - required: true - type: choice - options: - - chat - - picture - - manadeck - - zitare - - presi - - mana-core-auth - - todo - apps: - description: 'Apps to deploy (comma-separated: backend,web,landing or "all")' - required: true - type: string - default: 'backend' - version: - description: 'Version tag (e.g., v1.0.0)' - required: false - type: string - default: 'latest' - -env: - NODE_VERSION: '20' - PNPM_VERSION: '9.15.0' - REGISTRY: ghcr.io - # Note: repository_owner is lowercased for Docker compatibility - IMAGE_PREFIX: ghcr.io/memo-2023 - -jobs: - # Parse tag or inputs to determine what to deploy - 
parse-deployment: - name: Parse Deployment Target - runs-on: ubuntu-latest - outputs: - project: ${{ steps.parse.outputs.project }} - version: ${{ steps.parse.outputs.version }} - matrix: ${{ steps.matrix.outputs.matrix }} - steps: - - name: Parse tag or inputs - id: parse - run: | - if [ "${{ github.event_name }}" == "push" ]; then - # Parse from tag: {project}-staging-v{version} or {project}-v{version}-staging - # Also supports: {project}-all-staging-v{version} for multi-app deploy - TAG="${GITHUB_REF#refs/tags/}" - echo "Parsing tag: $TAG" - - # Extract project, app hint, and version from tag - if [[ "$TAG" =~ ^(.+)-all-staging-v(.+)$ ]]; then - PROJECT="${BASH_REMATCH[1]}" - VERSION="v${BASH_REMATCH[2]}" - APPS="all" - elif [[ "$TAG" =~ ^(.+)-staging-v(.+)$ ]]; then - PROJECT="${BASH_REMATCH[1]}" - VERSION="v${BASH_REMATCH[2]}" - APPS="backend" - elif [[ "$TAG" =~ ^(.+)-v(.+)-staging$ ]]; then - PROJECT="${BASH_REMATCH[1]}" - VERSION="v${BASH_REMATCH[2]}" - APPS="backend" - else - echo "Invalid tag format: $TAG" - exit 1 - fi - else - # Use workflow dispatch inputs - PROJECT="${{ github.event.inputs.project }}" - APPS="${{ github.event.inputs.apps }}" - VERSION="${{ github.event.inputs.version }}" - fi - - echo "Project: $PROJECT" - echo "Apps: $APPS" - echo "Version: $VERSION" - - echo "project=$PROJECT" >> $GITHUB_OUTPUT - echo "apps=$APPS" >> $GITHUB_OUTPUT - echo "version=$VERSION" >> $GITHUB_OUTPUT - - - name: Generate build matrix - id: matrix - run: | - PROJECT="${{ steps.parse.outputs.project }}" - APPS="${{ steps.parse.outputs.apps }}" - VERSION="${{ steps.parse.outputs.version }}" - - # Define available apps per project - declare -A PROJECT_APPS - PROJECT_APPS[chat]="backend,web,landing" - PROJECT_APPS[picture]="backend,web,landing" - PROJECT_APPS[manadeck]="backend,web" - PROJECT_APPS[zitare]="backend,web" - PROJECT_APPS[presi]="backend,web" - PROJECT_APPS[mana-core-auth]="service" - PROJECT_APPS[todo]="backend,web" - - # Expand "all" to available 
apps - if [ "$APPS" == "all" ]; then - APPS="${PROJECT_APPS[$PROJECT]}" - fi - - # Build JSON matrix - MATRIX='{"include":[' - FIRST=true - - IFS=',' read -ra APP_ARRAY <<< "$APPS" - for APP in "${APP_ARRAY[@]}"; do - APP=$(echo "$APP" | xargs) # Trim whitespace - - # Determine paths based on project and app - case "$PROJECT" in - mana-core-auth) - DOCKERFILE_PATH="services/mana-core-auth/Dockerfile" - CONTEXT_PATH="." - IMAGE_NAME="mana-core-auth" - PORT="3001" - HEALTH_PATH="/api/v1/health" - ;; - *) - case "$APP" in - backend|service) - DOCKERFILE_PATH="apps/$PROJECT/apps/backend/Dockerfile" - CONTEXT_PATH="." - IMAGE_NAME="${PROJECT}-backend" - ;; - web) - # Apps with their own Dockerfiles (need monorepo root for shared packages) - case "$PROJECT" in - manacore|todo|calendar|clock) - DOCKERFILE_PATH="apps/$PROJECT/apps/web/Dockerfile" - CONTEXT_PATH="." - ;; - *) - DOCKERFILE_PATH="docker/templates/Dockerfile.sveltekit" - CONTEXT_PATH="apps/$PROJECT/apps/web" - ;; - esac - IMAGE_NAME="${PROJECT}-web" - ;; - landing) - DOCKERFILE_PATH="docker/templates/Dockerfile.astro" - CONTEXT_PATH="apps/$PROJECT/apps/landing" - IMAGE_NAME="${PROJECT}-landing" - ;; - esac - - # Set backend ports per project (must match docker-compose.staging.yml) - case "$PROJECT" in - chat) PORT="3002" ;; - picture) PORT="3006" ;; - manadeck) PORT="3009" ;; - zitare) PORT="3007" ;; - presi) PORT="3008" ;; - todo) PORT="3018" ;; - esac - - # Override ports for web apps (SvelteKit uses different ports) - if [ "$APP" == "web" ]; then - case "$PROJECT" in - manacore) PORT="5173" ;; - todo) PORT="5188" ;; - calendar) PORT="5186" ;; - clock) PORT="5187" ;; - *) PORT="5173" ;; # default SvelteKit port - esac - fi - HEALTH_PATH="/api/v1/health" - ;; - esac - - if [ "$FIRST" = true ]; then - FIRST=false - else - MATRIX+=',' - fi - - 
MATRIX+="{\"app\":\"$APP\",\"image_name\":\"$IMAGE_NAME\",\"dockerfile_path\":\"$DOCKERFILE_PATH\",\"context_path\":\"$CONTEXT_PATH\",\"port\":\"$PORT\",\"health_path\":\"$HEALTH_PATH\"}" - done - - MATRIX+=']}' - - echo "Generated matrix: $MATRIX" - echo "matrix=$MATRIX" >> $GITHUB_OUTPUT - - # Build and push Docker images (parallel for multi-app) - build: - name: Build ${{ matrix.image_name }} - runs-on: ubuntu-latest - needs: parse-deployment - strategy: - fail-fast: false - matrix: ${{ fromJSON(needs.parse-deployment.outputs.matrix) }} - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Check Dockerfile exists - id: check - run: | - if [ -f "${{ matrix.dockerfile_path }}" ]; then - echo "exists=true" >> $GITHUB_OUTPUT - else - echo "Dockerfile not found: ${{ matrix.dockerfile_path }}" - echo "exists=false" >> $GITHUB_OUTPUT - fi - - - name: Set up Docker Buildx - if: steps.check.outputs.exists == 'true' - uses: docker/setup-buildx-action@v3 - - - name: Login to GitHub Container Registry - if: steps.check.outputs.exists == 'true' - uses: docker/login-action@v3 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Extract metadata - if: steps.check.outputs.exists == 'true' - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.IMAGE_PREFIX }}/${{ matrix.image_name }} - tags: | - type=raw,value=${{ needs.parse-deployment.outputs.version }} - type=raw,value=staging-latest - type=sha,prefix=staging- - - - name: Build and push - if: steps.check.outputs.exists == 'true' - id: build - uses: docker/build-push-action@v5 - with: - context: ${{ matrix.context_path }} - file: ${{ matrix.dockerfile_path }} - push: true - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - cache-from: type=gha - cache-to: type=gha,mode=max - build-args: | - NODE_ENV=staging - - - name: Build summary - run: | - echo "## Build: ${{ matrix.image_name }}" >> 
$GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "- **Project**: ${{ needs.parse-deployment.outputs.project }}" >> $GITHUB_STEP_SUMMARY - echo "- **App**: ${{ matrix.app }}" >> $GITHUB_STEP_SUMMARY - echo "- **Version**: ${{ needs.parse-deployment.outputs.version }}" >> $GITHUB_STEP_SUMMARY - echo "- **Image**: ${{ env.IMAGE_PREFIX }}/${{ matrix.image_name }}" >> $GITHUB_STEP_SUMMARY - echo "- **Tags**: ${{ steps.meta.outputs.tags }}" >> $GITHUB_STEP_SUMMARY - - # Deploy to staging (parallel for multi-app) - deploy: - name: Deploy ${{ matrix.image_name }} - runs-on: ubuntu-latest - needs: [parse-deployment, build] - strategy: - fail-fast: false - matrix: ${{ fromJSON(needs.parse-deployment.outputs.matrix) }} - environment: - name: staging - url: https://staging.manacore.app - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Setup SSH - uses: webfactory/ssh-agent@v0.9.0 - with: - ssh-private-key: ${{ secrets.STAGING_SSH_KEY }} - - - name: Add staging server to known hosts - run: | - mkdir -p ~/.ssh - ssh-keyscan -H ${{ secrets.STAGING_HOST }} >> ~/.ssh/known_hosts - - - name: Sync docker-compose to staging - run: | - # Ensure staging directory exists - ssh ${{ secrets.STAGING_USER }}@${{ secrets.STAGING_HOST }} "mkdir -p ~/manacore-staging" - # Copy the docker-compose file - scp docker-compose.staging.yml ${{ secrets.STAGING_USER }}@${{ secrets.STAGING_HOST }}:~/manacore-staging/docker-compose.yml - - - name: Login to GHCR on staging server - run: | - ssh ${{ secrets.STAGING_USER }}@${{ secrets.STAGING_HOST }} << EOF - echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u ${{ github.actor }} --password-stdin - EOF - - - name: Deploy service - env: - VERSION: ${{ needs.parse-deployment.outputs.version }} - IMAGE_NAME: ${{ matrix.image_name }} - APP_TYPE: ${{ matrix.app }} - PROJECT: ${{ needs.parse-deployment.outputs.project }} - run: | - # Compute the version variable name locally (before SSH) - # Map: todo-web -> 
TODO_WEB_VERSION, chat-backend -> CHAT_VERSION - case "$IMAGE_NAME" in - *-web) - PROJECT_UPPER=$(echo "$PROJECT" | tr '[:lower:]-' '[:upper:]_') - VERSION_VAR="${PROJECT_UPPER}_WEB_VERSION" - ;; - *-backend) - PROJECT_UPPER=$(echo "$PROJECT" | tr '[:lower:]-' '[:upper:]_') - VERSION_VAR="${PROJECT_UPPER}_VERSION" - ;; - mana-core-auth) - VERSION_VAR="AUTH_VERSION" - ;; - *) - VERSION_VAR=$(echo "$IMAGE_NAME" | tr '[:lower:]-' '[:upper:]_')_VERSION - ;; - esac - - echo "Will set $VERSION_VAR=$VERSION for docker-compose" - - ssh ${{ secrets.STAGING_USER }}@${{ secrets.STAGING_HOST }} << EOF - cd ~/manacore-staging - - echo "Deploying $IMAGE_NAME:$VERSION to staging..." - - # Pull the new image with specific version tag - docker pull ${{ env.IMAGE_PREFIX }}/$IMAGE_NAME:$VERSION - - # Update .env file with the version for this service - # This ensures docker-compose uses the correct image tag - if grep -q "^$VERSION_VAR=" .env 2>/dev/null; then - sed -i "s/^$VERSION_VAR=.*/$VERSION_VAR=$VERSION/" .env - else - echo "Service \$SERVICE_NAME not found in compose, starting..." 
- docker compose up -d --force-recreate \$SERVICE_NAME - fi - - echo "Updated .env: $VERSION_VAR=$VERSION" - grep "$VERSION_VAR" .env || true - - # Service name matches docker-compose service name (with hyphens) - SERVICE_NAME="$IMAGE_NAME" - CONTAINER_NAME="${IMAGE_NAME}-staging" - - # Remove any stale container with the same name (prevents "name already in use" error) - if docker ps -a --format '{{.Names}}' | grep -q "^\$CONTAINER_NAME\$"; then - echo "Removing stale container: \$CONTAINER_NAME" - docker rm -f \$CONTAINER_NAME 2>/dev/null || true - fi - - # Always use --force-recreate to ensure the new image is used - echo "Deploying service: \$SERVICE_NAME" - docker compose up -d --no-deps --force-recreate \$SERVICE_NAME - - # Wait for startup - sleep 10 - docker compose ps \$SERVICE_NAME - - # Verify correct image is running - echo "Running image:" - docker inspect --format='{{.Config.Image}}' ${IMAGE_NAME}-staging 2>/dev/null || true - - # Cleanup old images - docker image prune -f - EOF - - - name: Health check - if: matrix.app == 'backend' || matrix.app == 'service' - run: | - PORT="${{ matrix.port }}" - HEALTH_PATH="${{ matrix.health_path }}" - - echo "Running health check on port $PORT$HEALTH_PATH..." 
- - ssh ${{ secrets.STAGING_USER }}@${{ secrets.STAGING_HOST }} << EOF - for i in {1..5}; do - RESPONSE=\$(curl -s -o /dev/null -w "%{http_code}" http://localhost:$PORT$HEALTH_PATH || echo "000") - if [ "\$RESPONSE" == "200" ]; then - echo "Health check passed (attempt \$i)" - exit 0 - fi - echo "Health check failed (attempt \$i), response: \$RESPONSE" - sleep 5 - done - echo "Health check failed after 5 attempts" - exit 1 - EOF - - - name: Deployment summary - run: | - echo "## Deploy: ${{ matrix.image_name }}" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "- **Environment**: Staging" >> $GITHUB_STEP_SUMMARY - echo "- **Project**: ${{ needs.parse-deployment.outputs.project }}" >> $GITHUB_STEP_SUMMARY - echo "- **App**: ${{ matrix.app }}" >> $GITHUB_STEP_SUMMARY - echo "- **Version**: ${{ needs.parse-deployment.outputs.version }}" >> $GITHUB_STEP_SUMMARY - echo "- **Image**: ${{ env.IMAGE_PREFIX }}/${{ matrix.image_name }}" >> $GITHUB_STEP_SUMMARY - echo "- **Deployed by**: ${{ github.actor }}" >> $GITHUB_STEP_SUMMARY - echo "- **Timestamp**: $(date -u +'%Y-%m-%d %H:%M:%S UTC')" >> $GITHUB_STEP_SUMMARY - - # Run database migrations after deploy - migrations: - name: Database Migrations - runs-on: ubuntu-latest - needs: [parse-deployment, deploy] - # Only run for projects with backends (not manacore which is web-only) - if: needs.parse-deployment.outputs.project != 'manacore' - steps: - - name: Setup SSH - uses: webfactory/ssh-agent@v0.9.0 - with: - ssh-private-key: ${{ secrets.STAGING_SSH_KEY }} - - - name: Add staging server to known hosts - run: | - mkdir -p ~/.ssh - ssh-keyscan -H ${{ secrets.STAGING_HOST }} >> ~/.ssh/known_hosts - - - name: Run database migrations - env: - PROJECT: ${{ needs.parse-deployment.outputs.project }} - run: | - # Determine service name based on project - case "$PROJECT" in - mana-core-auth) - SERVICE_NAME="mana-core-auth" - ;; - *) - SERVICE_NAME="${PROJECT}-backend" - ;; - esac - - echo "Running database migrations 
for $SERVICE_NAME..." - - ssh ${{ secrets.STAGING_USER }}@${{ secrets.STAGING_HOST }} << EOF - cd ~/manacore-staging - - echo "=== Database Migration for $SERVICE_NAME ===" - - # Check if service is running - if ! docker compose ps $SERVICE_NAME --format '{{.State}}' 2>/dev/null | grep -q "running"; then - echo "⚠️ Service $SERVICE_NAME is not running, skipping migrations" - exit 0 - fi - - # Migration function with retry logic - run_db_push() { - local service=\$1 - local max_attempts=3 - local timeout=120 # 2 minutes - local attempt=1 - - while [ \$attempt -le \$max_attempts ]; do - echo "[\$service] db:push attempt \$attempt/\$max_attempts..." - - # Try db:push with timeout (staging uses push, not migrate) - if timeout \$timeout docker compose exec -T \$service pnpm run db:push 2>&1; then - echo "✅ [\$service] Database schema pushed successfully" - return 0 - else - exit_code=\$? - if [ \$exit_code -eq 124 ]; then - echo "⚠️ [\$service] db:push timeout after \${timeout}s" - else - echo "⚠️ [\$service] db:push failed with exit code \$exit_code" - fi - - attempt=\$((attempt + 1)) - if [ \$attempt -le \$max_attempts ]; then - wait_time=\$((5 * attempt)) # Backoff: 5s, 10s, 15s - echo " Waiting \${wait_time}s before retry..." 
- sleep \$wait_time - fi - fi - done - - echo "❌ [\$service] db:push failed after \$max_attempts attempts" - return 1 - } - - # Run db:push for the service - run_db_push $SERVICE_NAME || { - echo "❌ Database migration failed for $SERVICE_NAME" - echo "⚠️ You may need to run migrations manually:" - echo " ssh deploy@\${{ secrets.STAGING_HOST }} 'cd ~/manacore-staging && docker compose exec -T $SERVICE_NAME pnpm run db:push'" - exit 1 - } - - echo "" - echo "✅ Database migrations completed for $SERVICE_NAME" - EOF - - - name: Migration summary - if: always() - run: | - echo "## Database Migrations" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "- **Project**: ${{ needs.parse-deployment.outputs.project }}" >> $GITHUB_STEP_SUMMARY - echo "- **Status**: ${{ job.status }}" >> $GITHUB_STEP_SUMMARY - - # Notify on completion - notify: - name: Deployment Complete - runs-on: ubuntu-latest - needs: [parse-deployment, build, deploy, migrations] - if: always() - steps: - - name: Deployment notification - run: | - BUILD_STATUS="${{ needs.build.result }}" - DEPLOY_STATUS="${{ needs.deploy.result }}" - MIGRATION_STATUS="${{ needs.migrations.result }}" - PROJECT="${{ needs.parse-deployment.outputs.project }}" - VERSION="${{ needs.parse-deployment.outputs.version }}" - - echo "## Staging Deployment Complete" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "| Stage | Status |" >> $GITHUB_STEP_SUMMARY - echo "|-------|--------|" >> $GITHUB_STEP_SUMMARY - echo "| Build | $BUILD_STATUS |" >> $GITHUB_STEP_SUMMARY - echo "| Deploy | $DEPLOY_STATUS |" >> $GITHUB_STEP_SUMMARY - echo "| Migrations | $MIGRATION_STATUS |" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "- **Project**: $PROJECT" >> $GITHUB_STEP_SUMMARY - echo "- **Version**: $VERSION" >> $GITHUB_STEP_SUMMARY - - # Check all stages (migrations can be skipped for web-only projects) - if [ "$BUILD_STATUS" == "success" ] && [ "$DEPLOY_STATUS" == "success" ]; then - if [ 
"$MIGRATION_STATUS" == "success" ] || [ "$MIGRATION_STATUS" == "skipped" ]; then - echo "" >> $GITHUB_STEP_SUMMARY - echo "All stages completed successfully" >> $GITHUB_STEP_SUMMARY - else - echo "" >> $GITHUB_STEP_SUMMARY - echo "⚠️ Migrations failed - database may need manual update" >> $GITHUB_STEP_SUMMARY - exit 1 - fi - else - echo "" >> $GITHUB_STEP_SUMMARY - echo "Some deployments failed - check individual job logs" >> $GITHUB_STEP_SUMMARY - exit 1 - fi diff --git a/.github/workflows/cd-staging.yml b/.github/workflows/cd-staging.yml deleted file mode 100644 index 6c8fcb97b..000000000 --- a/.github/workflows/cd-staging.yml +++ /dev/null @@ -1,371 +0,0 @@ -# Staging Deployment -# -# Triggered by: -# - Automatic: Push to dev branch (via ci.yml) -# - Manual: workflow_dispatch -# -# Full config archived at: .github/workflows/cd-staging.full.yml -# -# To add a service: -# 1. Add service to workflow_dispatch options -# 2. Add health check in "Run health checks" step -# 3. Add service to docker-compose.staging.yml -name: CD - Staging Deployment - -on: - workflow_dispatch: - inputs: - service: - description: 'Service to deploy (leave empty for all)' - required: false - type: choice - options: - - all - - mana-core-auth - - chat-backend - - chat-web - - manacore-web - - todo-backend - - todo-web - - calendar-backend - - calendar-web - - clock-backend - - clock-web - - telegram-stats-bot - workflow_call: - -permissions: - contents: read - packages: read - -env: - NODE_VERSION: '20' - PNPM_VERSION: '9.15.0' - -jobs: - deploy-staging: - name: Deploy to Staging - runs-on: ubuntu-latest - environment: - name: staging - url: https://staging.manacore.app - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Setup SSH for deployment - uses: webfactory/ssh-agent@v0.9.0 - with: - ssh-private-key: ${{ secrets.STAGING_SSH_KEY }} - - - name: Add staging server to known hosts - env: - STAGING_HOST: 46.224.108.214 - run: | - mkdir -p ~/.ssh - ssh-keyscan -H 
$STAGING_HOST >> ~/.ssh/known_hosts - - - name: Prepare deployment directory - env: - STAGING_USER: deploy - STAGING_HOST: 46.224.108.214 - run: | - ssh $STAGING_USER@$STAGING_HOST << 'EOF' - mkdir -p ~/manacore-staging - cd ~/manacore-staging - - # Create required directories - mkdir -p logs - mkdir -p data/postgres - mkdir -p data/redis - EOF - - - name: Copy docker-compose file - env: - STAGING_USER: deploy - STAGING_HOST: 46.224.108.214 - run: | - scp docker-compose.staging.yml $STAGING_USER@$STAGING_HOST:~/manacore-staging/docker-compose.yml - - - name: Copy environment file - env: - STAGING_USER: deploy - STAGING_HOST: 46.224.108.214 - run: | - # Create staging env file (mix of hardcoded config and secrets) - cat > .env.staging << EOF - # Database - Configuration - POSTGRES_HOST=postgres - POSTGRES_PORT=5432 - POSTGRES_DB=manacore - POSTGRES_USER=postgres - POSTGRES_PASSWORD=${{ secrets.STAGING_POSTGRES_PASSWORD }} - - # Redis - Configuration - REDIS_HOST=redis - REDIS_PORT=6379 - REDIS_PASSWORD=${{ secrets.STAGING_REDIS_PASSWORD }} - - # Mana Core Auth - Configuration - MANA_SERVICE_URL=http://mana-core-auth:3001 - JWT_SECRET=${{ secrets.STAGING_JWT_SECRET }} - JWT_PUBLIC_KEY=${{ secrets.STAGING_JWT_PUBLIC_KEY }} - JWT_PRIVATE_KEY=${{ secrets.STAGING_JWT_PRIVATE_KEY }} - - # Supabase - SUPABASE_URL=${{ secrets.STAGING_SUPABASE_URL }} - SUPABASE_ANON_KEY=${{ secrets.STAGING_SUPABASE_ANON_KEY }} - SUPABASE_SERVICE_ROLE_KEY=${{ secrets.STAGING_SUPABASE_SERVICE_ROLE_KEY }} - - # Azure OpenAI - AZURE_OPENAI_ENDPOINT=${{ secrets.STAGING_AZURE_OPENAI_ENDPOINT }} - AZURE_OPENAI_API_KEY=${{ secrets.STAGING_AZURE_OPENAI_API_KEY }} - AZURE_OPENAI_API_VERSION=2024-12-01-preview - - # Environment - NODE_ENV=staging - EOF - - scp .env.staging $STAGING_USER@$STAGING_HOST:~/manacore-staging/.env - rm .env.staging - - - name: Login to GitHub Container Registry on staging server - env: - STAGING_USER: deploy - STAGING_HOST: 46.224.108.214 - run: | - ssh 
$STAGING_USER@$STAGING_HOST << EOF - # Login to ghcr.io with GitHub token - echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u ${{ github.actor }} --password-stdin - EOF - - - name: Pull latest Docker images - env: - STAGING_USER: deploy - STAGING_HOST: 46.224.108.214 - run: | - ssh $STAGING_USER@$STAGING_HOST << 'EOF' - cd ~/manacore-staging - docker compose pull - EOF - - - name: Deploy services - env: - STAGING_USER: deploy - STAGING_HOST: 46.224.108.214 - run: | - SERVICE="${{ github.event.inputs.service || 'all' }}" - - ssh $STAGING_USER@$STAGING_HOST << EOF - cd ~/manacore-staging - - # Determine which services to deploy - if [ "$SERVICE" == "all" ]; then - echo "Deploying all services..." - docker compose up -d - else - echo "Deploying service: $SERVICE" - docker compose up -d $SERVICE - fi - - # Wait for initial startup - echo "Waiting for services to start..." - sleep 15 - - echo "=== Container Status ===" - docker compose ps - EOF - - - name: Create databases - env: - STAGING_USER: deploy - STAGING_HOST: 46.224.108.214 - run: | - ssh $STAGING_USER@$STAGING_HOST << 'EOF' - cd ~/manacore-staging - - echo "Creating required databases..." 
- - # Create manacore_auth database (for mana-core-auth service) - docker compose exec -T postgres psql -U postgres -c "CREATE DATABASE manacore_auth;" 2>/dev/null || echo "manacore_auth database already exists" - - # Create chat database (for chat-backend service) - docker compose exec -T postgres psql -U postgres -c "CREATE DATABASE chat;" 2>/dev/null || echo "chat database already exists" - - # Create todo database (for todo-backend service) - docker compose exec -T postgres psql -U postgres -c "CREATE DATABASE todo;" 2>/dev/null || echo "todo database already exists" - - # Create calendar database (for calendar-backend service) - docker compose exec -T postgres psql -U postgres -c "CREATE DATABASE calendar;" 2>/dev/null || echo "calendar database already exists" - - # Create clock database (for clock-backend service) - docker compose exec -T postgres psql -U postgres -c "CREATE DATABASE clock;" 2>/dev/null || echo "clock database already exists" - - echo "✅ Databases ready" - EOF - - - name: Run database migrations - env: - STAGING_USER: deploy - STAGING_HOST: 46.224.108.214 - run: | - ssh $STAGING_USER@$STAGING_HOST << 'EOF' - cd ~/manacore-staging - - echo "=== Running Database Migrations ===" - echo "" - - # Migration function with retry logic - run_migration() { - local service=$1 - local max_attempts=3 - local timeout=300 # 5 minutes - local attempt=1 - - while [ $attempt -le $max_attempts ]; do - echo "[$service] Migration attempt $attempt/$max_attempts..." - - # Run migration with timeout - if timeout $timeout docker compose exec -T $service pnpm run db:migrate 2>&1; then - echo "✅ [$service] Migration succeeded" - return 0 - else - exit_code=$? 
- if [ $exit_code -eq 124 ]; then - echo "⚠️ [$service] Migration timeout after ${timeout}s" - else - echo "⚠️ [$service] Migration failed with exit code $exit_code" - fi - - attempt=$((attempt + 1)) - if [ $attempt -le $max_attempts ]; then - wait_time=$((10 * attempt)) # Backoff: 10s, 20s, 30s - echo " Waiting ${wait_time}s before retry..." - sleep $wait_time - fi - fi - done - - echo "❌ [$service] Migration failed after $max_attempts attempts" - return 1 - } - - # Run migrations for services that have db:migrate script - # mana-core-auth - central auth service - if docker compose exec -T mana-core-auth test -f src/db/migrate.ts 2>/dev/null || \ - docker compose exec -T mana-core-auth pnpm run db:migrate --help 2>/dev/null; then - run_migration mana-core-auth || { - echo "❌ mana-core-auth migration failed - aborting deployment" - exit 1 - } - else - echo "⏭️ [mana-core-auth] No db:migrate script, using db:push..." - docker compose exec -T mana-core-auth npx drizzle-kit push --force || echo "Auth schema push completed" - fi - - echo "" - echo "✅ All migrations completed" - EOF - - - name: Run health checks - env: - STAGING_USER: deploy - STAGING_HOST: 46.224.108.214 - run: | - ssh $STAGING_USER@$STAGING_HOST << 'EOF' - cd ~/manacore-staging - - echo "=== Health Checks with Polling ===" - echo "" - - # Health check function with retry polling - check_health() { - local service=$1 - local url=$2 - local max_attempts=24 # 24 * 5s = 2 minutes max wait - local attempt=1 - - echo "Checking $service..." - - while [ $attempt -le $max_attempts ]; do - # Check if container is running - if ! docker compose ps $service 2>/dev/null | grep -q "Up"; then - if [ $attempt -eq 1 ]; then - echo " ⏳ Waiting for container to start..." 
- fi - sleep 5 - attempt=$((attempt + 1)) - continue - fi - - # Check health endpoint - if docker compose exec -T $service wget -q -O - $url > /dev/null 2>&1; then - echo " ✅ $service is healthy (attempt $attempt)" - return 0 - fi - - if [ $attempt -eq 1 ]; then - echo " ⏳ Waiting for $service to become healthy..." - fi - - sleep 5 - attempt=$((attempt + 1)) - done - - echo " ❌ $service health check failed after $max_attempts attempts" - echo " === Recent Logs ===" - docker compose logs --tail=50 $service - return 1 - } - - echo "=== Container Status ===" - docker compose ps - echo "" - - # Check all services with polling - check_health mana-core-auth http://localhost:3001/api/v1/health || exit 1 - check_health chat-backend http://localhost:3002/api/v1/health || exit 1 - check_health chat-web http://localhost:3000/health || exit 1 - check_health manacore-web http://localhost:5173/health || exit 1 - check_health todo-backend http://localhost:3018/api/v1/health || exit 1 - check_health todo-web http://localhost:5188/health || exit 1 - check_health calendar-backend http://localhost:3016/api/v1/health || exit 1 - check_health calendar-web http://localhost:5186/health || exit 1 - check_health clock-backend http://localhost:3017/api/v1/health || exit 1 - check_health clock-web http://localhost:5187/health || exit 1 - - echo "" - echo "✅ All health checks passed!" 
- EOF - - - name: Deployment summary - run: | - echo "## Staging Deployment Summary" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "- **Environment**: Staging" >> $GITHUB_STEP_SUMMARY - echo "- **Deployed by**: ${{ github.actor }}" >> $GITHUB_STEP_SUMMARY - echo "- **Commit**: ${{ github.sha }}" >> $GITHUB_STEP_SUMMARY - echo "- **Timestamp**: $(date -u +'%Y-%m-%d %H:%M:%S UTC')" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "### Services Deployed" >> $GITHUB_STEP_SUMMARY - echo "Service: ${{ github.event.inputs.service || 'all' }}" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "### Health Checks" >> $GITHUB_STEP_SUMMARY - echo "All health checks passed ✅" >> $GITHUB_STEP_SUMMARY - - notify-deployment: - name: Notify Deployment - runs-on: ubuntu-latest - needs: deploy-staging - if: always() - steps: - - name: Deployment notification - run: | - STATUS="${{ needs.deploy-staging.result }}" - - if [ "$STATUS" == "success" ]; then - echo "✅ Staging deployment completed successfully" - else - echo "❌ Staging deployment failed" - exit 1 - fi diff --git a/.github/workflows/cd-staging.yml.bak b/.github/workflows/cd-staging.yml.bak deleted file mode 100644 index 219b626bd..000000000 --- a/.github/workflows/cd-staging.yml.bak +++ /dev/null @@ -1,264 +0,0 @@ -# ARCHIVED: Full staging workflow with all services -# Active simplified workflow: .github/workflows/cd-staging.yml -# -# Services included: mana-core-auth, chat-backend, manadeck-backend -# -# To restore: cp .github/workflows/cd-staging.full.yml .github/workflows/cd-staging.yml - -name: CD - Staging Deployment - -on: - workflow_dispatch: - inputs: - service: - description: 'Service to deploy (leave empty for all)' - required: false - type: choice - options: - - all - - mana-core-auth - - chat-backend - - manadeck-backend - workflow_call: - -permissions: - contents: read - packages: read - -env: - NODE_VERSION: '20' - PNPM_VERSION: '9.15.0' - -jobs: - 
deploy-staging: - name: Deploy to Staging - runs-on: ubuntu-latest - environment: - name: staging - url: https://staging.manacore.app - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Setup SSH for deployment - uses: webfactory/ssh-agent@v0.9.0 - with: - ssh-private-key: ${{ secrets.STAGING_SSH_KEY }} - - - name: Add staging server to known hosts - env: - STAGING_HOST: 46.224.108.214 - run: | - mkdir -p ~/.ssh - ssh-keyscan -H $STAGING_HOST >> ~/.ssh/known_hosts - - - name: Prepare deployment directory - env: - STAGING_USER: deploy - STAGING_HOST: 46.224.108.214 - run: | - ssh $STAGING_USER@$STAGING_HOST << 'EOF' - mkdir -p ~/manacore-staging - cd ~/manacore-staging - - # Create required directories - mkdir -p logs - mkdir -p data/postgres - mkdir -p data/redis - EOF - - - name: Copy docker-compose file - env: - STAGING_USER: deploy - STAGING_HOST: 46.224.108.214 - run: | - scp docker-compose.staging.yml $STAGING_USER@$STAGING_HOST:~/manacore-staging/docker-compose.yml - - - name: Copy environment file - env: - STAGING_USER: deploy - STAGING_HOST: 46.224.108.214 - run: | - # Create staging env file (mix of hardcoded config and secrets) - cat > .env.staging << EOF - # Database - Configuration - POSTGRES_HOST=postgres - POSTGRES_PORT=5432 - POSTGRES_DB=manacore - POSTGRES_USER=postgres - POSTGRES_PASSWORD=${{ secrets.STAGING_POSTGRES_PASSWORD }} - - # Redis - Configuration - REDIS_HOST=redis - REDIS_PORT=6379 - REDIS_PASSWORD=${{ secrets.STAGING_REDIS_PASSWORD }} - - # Mana Core Auth - Configuration - MANA_SERVICE_URL=http://mana-core-auth:3001 - JWT_SECRET=${{ secrets.STAGING_JWT_SECRET }} - JWT_PUBLIC_KEY=${{ secrets.STAGING_JWT_PUBLIC_KEY }} - JWT_PRIVATE_KEY=${{ secrets.STAGING_JWT_PRIVATE_KEY }} - - # Supabase - SUPABASE_URL=${{ secrets.STAGING_SUPABASE_URL }} - SUPABASE_ANON_KEY=${{ secrets.STAGING_SUPABASE_ANON_KEY }} - SUPABASE_SERVICE_ROLE_KEY=${{ secrets.STAGING_SUPABASE_SERVICE_ROLE_KEY }} - - # Azure OpenAI - 
AZURE_OPENAI_ENDPOINT=${{ secrets.STAGING_AZURE_OPENAI_ENDPOINT }} - AZURE_OPENAI_API_KEY=${{ secrets.STAGING_AZURE_OPENAI_API_KEY }} - AZURE_OPENAI_API_VERSION=2024-12-01-preview - - # Environment - NODE_ENV=staging - EOF - - scp .env.staging $STAGING_USER@$STAGING_HOST:~/manacore-staging/.env - rm .env.staging - - - name: Login to GitHub Container Registry on staging server - env: - STAGING_USER: deploy - STAGING_HOST: 46.224.108.214 - run: | - ssh $STAGING_USER@$STAGING_HOST << EOF - # Login to ghcr.io with GitHub token - echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u ${{ github.actor }} --password-stdin - EOF - - - name: Pull latest Docker images - env: - STAGING_USER: deploy - STAGING_HOST: 46.224.108.214 - run: | - ssh $STAGING_USER@$STAGING_HOST << 'EOF' - cd ~/manacore-staging - docker compose pull - EOF - - - name: Deploy services - env: - STAGING_USER: deploy - STAGING_HOST: 46.224.108.214 - run: | - SERVICE="${{ github.event.inputs.service || 'all' }}" - - ssh $STAGING_USER@$STAGING_HOST << EOF - cd ~/manacore-staging - - # Determine which services to deploy - if [ "$SERVICE" == "all" ]; then - echo "Deploying all services..." - docker compose up -d - else - echo "Deploying service: $SERVICE" - docker compose up -d $SERVICE - fi - - # Wait for initial startup - echo "Waiting for services to start..." - sleep 15 - - echo "=== Container Status ===" - docker compose ps - EOF - - - name: Run health checks - env: - STAGING_USER: deploy - STAGING_HOST: 46.224.108.214 - run: | - ssh $STAGING_USER@$STAGING_HOST << 'EOF' - cd ~/manacore-staging - - # Wait for services to fully start - echo "Waiting 60s for services to fully initialize..." - sleep 60 - - echo "=== Container Status ===" - docker compose ps - - echo "" - echo "=== Health Checks ===" - - # Check mana-core-auth - echo "Checking mana-core-auth..." 
- if docker compose exec -T mana-core-auth wget -q -O - http://localhost:3001/api/v1/health > /dev/null 2>&1; then - echo "✅ mana-core-auth is healthy" - else - echo "❌ mana-core-auth health check failed" - echo "=== Logs ===" - docker compose logs --tail=50 mana-core-auth - exit 1 - fi - - # Check chat-backend - echo "Checking chat-backend..." - if docker compose exec -T chat-backend wget -q -O - http://localhost:3002/api/health > /dev/null 2>&1; then - echo "✅ chat-backend is healthy" - else - echo "❌ chat-backend health check failed" - echo "=== Logs ===" - docker compose logs --tail=50 chat-backend - exit 1 - fi - - # Check manadeck-backend - echo "Checking manadeck-backend..." - if docker compose exec -T manadeck-backend wget -q -O - http://localhost:3003/api/health > /dev/null 2>&1; then - echo "✅ manadeck-backend is healthy" - else - echo "❌ manadeck-backend health check failed" - echo "=== Logs ===" - docker compose logs --tail=50 manadeck-backend - exit 1 - fi - - echo "" - echo "✅ All health checks passed!" 
- EOF - - - name: Run database migrations - env: - STAGING_USER: deploy - STAGING_HOST: 46.224.108.214 - run: | - # Run migrations for services that need them - ssh $STAGING_USER@$STAGING_HOST << 'EOF' - cd ~/manacore-staging - - # Mana Core Auth migrations - docker compose exec -T mana-core-auth pnpm run db:migrate || echo "Auth migrations skipped" - EOF - - - name: Deployment summary - run: | - echo "## Staging Deployment Summary" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "- **Environment**: Staging" >> $GITHUB_STEP_SUMMARY - echo "- **Deployed by**: ${{ github.actor }}" >> $GITHUB_STEP_SUMMARY - echo "- **Commit**: ${{ github.sha }}" >> $GITHUB_STEP_SUMMARY - echo "- **Timestamp**: $(date -u +'%Y-%m-%d %H:%M:%S UTC')" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "### Services Deployed" >> $GITHUB_STEP_SUMMARY - echo "Service: ${{ github.event.inputs.service || 'all' }}" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "### Health Checks" >> $GITHUB_STEP_SUMMARY - echo "All health checks passed ✅" >> $GITHUB_STEP_SUMMARY - - notify-deployment: - name: Notify Deployment - runs-on: ubuntu-latest - needs: deploy-staging - if: always() - steps: - - name: Deployment notification - run: | - STATUS="${{ needs.deploy-staging.result }}" - - if [ "$STATUS" == "success" ]; then - echo "✅ Staging deployment completed successfully" - else - echo "❌ Staging deployment failed" - exit 1 - fi diff --git a/.github/workflows/ci-main.yml.bak b/.github/workflows/ci-main.yml.bak deleted file mode 100644 index 33c9f0a1f..000000000 --- a/.github/workflows/ci-main.yml.bak +++ /dev/null @@ -1,168 +0,0 @@ -name: CI - Main Branch - -on: - push: - branches: - - main - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -env: - NODE_VERSION: '20' - PNPM_VERSION: '9.15.0' - TURBO_TOKEN: ${{ secrets.TURBO_TOKEN }} - TURBO_TEAM: ${{ secrets.TURBO_TEAM }} - -jobs: - # Full 
validation on main branch - validate: - name: Validate Main Branch - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Setup pnpm - uses: pnpm/action-setup@v4 - with: - version: ${{ env.PNPM_VERSION }} - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: ${{ env.NODE_VERSION }} - cache: 'pnpm' - - - name: Install dependencies - run: pnpm install --frozen-lockfile - - - name: Build shared packages - run: pnpm run build:packages - - - name: Run format check - run: pnpm run format:check - - - name: Run lint - run: pnpm run lint - continue-on-error: true - - - name: Run type check - run: pnpm run type-check - - - name: Build all projects - run: pnpm run build - - - name: Run tests - run: pnpm run test || echo "Some tests failed" - continue-on-error: true - - - name: Generate build summary - run: | - echo "## Main Branch Build Summary" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "- **Commit**: ${{ github.sha }}" >> $GITHUB_STEP_SUMMARY - echo "- **Author**: ${{ github.actor }}" >> $GITHUB_STEP_SUMMARY - echo "- **Timestamp**: $(date -u +'%Y-%m-%d %H:%M:%S UTC')" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "### Build Status" >> $GITHUB_STEP_SUMMARY - echo "All projects built successfully" >> $GITHUB_STEP_SUMMARY - - # Build and push Docker images for backend services - build-docker-images: - name: Build Docker Images - runs-on: ubuntu-latest - needs: validate - strategy: - matrix: - service: - - { name: 'maerchenzauber-backend', path: 'apps/maerchenzauber/apps/backend', port: '3002' } - - { name: 'chat-backend', path: 'apps/chat/apps/backend', port: '3002' } - - { name: 'manadeck-backend', path: 'apps/manadeck/apps/backend', port: '3003' } - - { name: 'nutriphi-backend', path: 'apps/nutriphi/apps/backend', port: '3004' } - - { name: 'news-api', path: 'apps/news/apps/api', port: '3005' } - - { name: 'mana-core-auth', path: 
'services/mana-core-auth', port: '3001' } - fail-fast: false - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Check if Dockerfile exists - id: check-dockerfile - run: | - if [ -f "${{ matrix.service.path }}/Dockerfile" ]; then - echo "exists=true" >> $GITHUB_OUTPUT - else - echo "exists=false" >> $GITHUB_OUTPUT - echo "Warning: No Dockerfile found for ${{ matrix.service.name }}" - fi - - - name: Login to GitHub Container Registry - if: steps.check-dockerfile.outputs.exists == 'true' - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Extract metadata - if: steps.check-dockerfile.outputs.exists == 'true' - id: meta - uses: docker/metadata-action@v5 - with: - images: ghcr.io/${{ github.repository_owner }}/${{ matrix.service.name }} - tags: | - type=sha,prefix={{branch}}- - type=ref,event=branch - type=raw,value=latest,enable=${{ github.ref == 'refs/heads/main' }} - - - name: Build and push - if: steps.check-dockerfile.outputs.exists == 'true' - uses: docker/build-push-action@v5 - with: - context: . 
- file: ${{ matrix.service.path }}/Dockerfile - push: true - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - cache-from: type=gha - cache-to: type=gha,mode=max - build-args: | - NODE_ENV=production - PORT=${{ matrix.service.port }} - - - name: Image digest - if: steps.check-dockerfile.outputs.exists == 'true' - run: echo "Image digest - ${{ steps.meta.outputs.digest }}" - - # Trigger staging deployment - trigger-staging-deploy: - name: Trigger Staging Deployment - runs-on: ubuntu-latest - needs: build-docker-images - if: github.ref == 'refs/heads/main' - steps: - - name: Trigger staging deployment workflow - uses: actions/github-script@v7 - with: - script: | - await github.rest.actions.createWorkflowDispatch({ - owner: context.repo.owner, - repo: context.repo.repo, - workflow_id: 'cd-staging.yml', - ref: 'main' - }); - - - name: Deployment notification - run: | - echo "## Staging Deployment Triggered" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "Docker images have been built and pushed successfully." >> $GITHUB_STEP_SUMMARY - echo "Staging deployment workflow has been triggered." 
>> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/ci-pull-request.yml.bak b/.github/workflows/ci-pull-request.yml.bak deleted file mode 100644 index d367c9ea6..000000000 --- a/.github/workflows/ci-pull-request.yml.bak +++ /dev/null @@ -1,314 +0,0 @@ -name: CI - Pull Request - -on: - pull_request: - branches: - - main - - develop - types: [opened, synchronize, reopened] - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -env: - NODE_VERSION: '20' - PNPM_VERSION: '9.15.0' - TURBO_TOKEN: ${{ secrets.TURBO_TOKEN }} - TURBO_TEAM: ${{ secrets.TURBO_TEAM }} - -jobs: - # Detect which projects have changed - detect-changes: - name: Detect Changed Projects - runs-on: ubuntu-latest - outputs: - projects: ${{ steps.filter.outputs.changes }} - has-changes: ${{ steps.filter.outputs.changes != '[]' }} - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Detect changed projects - uses: dorny/paths-filter@v3 - id: filter - with: - filters: | - chat: - - 'apps/chat/**' - - 'packages/**' - manacore: - - 'apps/manacore/**' - - 'packages/**' - packages: - - 'packages/**' - - # Lint and format check - lint-and-format: - name: Lint & Format Check - runs-on: ubuntu-latest - needs: detect-changes - if: needs.detect-changes.outputs.has-changes == 'true' - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Setup pnpm - uses: pnpm/action-setup@v4 - with: - version: ${{ env.PNPM_VERSION }} - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: ${{ env.NODE_VERSION }} - cache: 'pnpm' - - - name: Install dependencies - run: pnpm install --frozen-lockfile - - - name: Run format check - run: pnpm run format:check - continue-on-error: true - - - name: Run lint - run: pnpm run lint --filter='./apps/chat/**' --filter='./apps/manacore/**' - continue-on-error: true - - # Type checking - type-check: - name: Type Check - runs-on: ubuntu-latest - needs: detect-changes 
- if: needs.detect-changes.outputs.has-changes == 'true' - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Setup pnpm - uses: pnpm/action-setup@v4 - with: - version: ${{ env.PNPM_VERSION }} - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: ${{ env.NODE_VERSION }} - cache: 'pnpm' - - - name: Install dependencies - run: pnpm install --frozen-lockfile - - - name: Build shared packages - run: pnpm run build:packages - - - name: Run type check - run: pnpm run type-check --filter='./apps/chat/**' --filter='./apps/manacore/**' - continue-on-error: true - - # Build all affected projects - build: - name: Build Projects - runs-on: ubuntu-latest - needs: detect-changes - if: needs.detect-changes.outputs.has-changes == 'true' - strategy: - matrix: - project: ${{ fromJSON(needs.detect-changes.outputs.projects) }} - fail-fast: false - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Setup pnpm - uses: pnpm/action-setup@v4 - with: - version: ${{ env.PNPM_VERSION }} - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: ${{ env.NODE_VERSION }} - cache: 'pnpm' - - - name: Install dependencies - run: pnpm install --frozen-lockfile - - - name: Build shared packages - run: pnpm run build:packages - - - name: Build project - ${{ matrix.project }} - run: | - if [ "${{ matrix.project }}" == "packages" ]; then - pnpm run build --filter=@manacore/* - else - pnpm run build --filter='./apps/${{ matrix.project }}/**' - fi - continue-on-error: true - - - name: Upload build artifacts - uses: actions/upload-artifact@v4 - with: - name: build-${{ matrix.project }} - path: | - apps/${{ matrix.project }}/**/dist - apps/${{ matrix.project }}/**/.next - apps/${{ matrix.project }}/**/.svelte-kit - apps/${{ matrix.project }}/**/.astro - services/**/dist - retention-days: 7 - if-no-files-found: ignore - - # Run tests - test: - name: Run Tests - runs-on: ubuntu-latest - needs: detect-changes - if: 
needs.detect-changes.outputs.has-changes == 'true' - strategy: - matrix: - project: ${{ fromJSON(needs.detect-changes.outputs.projects) }} - fail-fast: false - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Setup pnpm - uses: pnpm/action-setup@v4 - with: - version: ${{ env.PNPM_VERSION }} - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: ${{ env.NODE_VERSION }} - cache: 'pnpm' - - - name: Install dependencies - run: pnpm install --frozen-lockfile - - - name: Build shared packages - run: pnpm run build:packages - - - name: Run tests - ${{ matrix.project }} - run: | - if [ "${{ matrix.project }}" == "packages" ]; then - pnpm run test --filter=@manacore/* || echo "No tests found for packages" - else - pnpm run test --filter='./apps/${{ matrix.project }}/**' || echo "No tests found for ${{ matrix.project }}" - fi - continue-on-error: true - - - name: Upload test coverage - uses: actions/upload-artifact@v4 - with: - name: coverage-${{ matrix.project }} - path: | - apps/${{ matrix.project }}/**/coverage - services/**/coverage - retention-days: 7 - if-no-files-found: ignore - - # Docker build validation for backend services - docker-build-check: - name: Docker Build Check - runs-on: ubuntu-latest - needs: detect-changes - if: contains(needs.detect-changes.outputs.projects, 'chat') - strategy: - matrix: - service: - - { name: 'chat-backend', path: 'apps/chat/apps/backend' } - fail-fast: false - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Check if Dockerfile exists - id: check-dockerfile - run: | - if [ -f "${{ matrix.service.path }}/Dockerfile" ]; then - echo "exists=true" >> $GITHUB_OUTPUT - else - echo "exists=false" >> $GITHUB_OUTPUT - fi - - - name: Build Docker image - if: steps.check-dockerfile.outputs.exists == 'true' - uses: docker/build-push-action@v5 - with: - context: . 
- file: ${{ matrix.service.path }}/Dockerfile - push: false - tags: ${{ matrix.service.name }}:pr-${{ github.event.pull_request.number }} - cache-from: type=gha - cache-to: type=gha,mode=max - build-args: | - NODE_ENV=production - - # Security scanning - security-scan: - name: Security Scan - runs-on: ubuntu-latest - needs: detect-changes - if: needs.detect-changes.outputs.has-changes == 'true' - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Setup pnpm - uses: pnpm/action-setup@v4 - with: - version: ${{ env.PNPM_VERSION }} - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: ${{ env.NODE_VERSION }} - cache: 'pnpm' - - - name: Install dependencies - run: pnpm install --frozen-lockfile - - - name: Run security audit - run: pnpm audit --audit-level=high - continue-on-error: true - - - name: Check for outdated dependencies - run: pnpm outdated - continue-on-error: true - - # PR status check (required for merge) - pr-checks-complete: - name: All PR Checks Complete - runs-on: ubuntu-latest - needs: [lint-and-format, type-check, build, test, docker-build-check, security-scan] - if: always() - steps: - - name: Check all jobs status - run: | - if [ "${{ needs.lint-and-format.result }}" == "failure" ] || \ - [ "${{ needs.type-check.result }}" == "failure" ] || \ - [ "${{ needs.build.result }}" == "failure" ]; then - echo "One or more required checks failed" - exit 1 - fi - echo "All required checks passed" - - - name: PR summary - run: | - echo "## PR Checks Summary" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "| Check | Status |" >> $GITHUB_STEP_SUMMARY - echo "|-------|--------|" >> $GITHUB_STEP_SUMMARY - echo "| Lint & Format | ${{ needs.lint-and-format.result }} |" >> $GITHUB_STEP_SUMMARY - echo "| Type Check | ${{ needs.type-check.result }} |" >> $GITHUB_STEP_SUMMARY - echo "| Build | ${{ needs.build.result }} |" >> $GITHUB_STEP_SUMMARY - echo "| Tests | ${{ needs.test.result }} |" >> 
$GITHUB_STEP_SUMMARY - echo "| Docker Build | ${{ needs.docker-build-check.result }} |" >> $GITHUB_STEP_SUMMARY - echo "| Security Scan | ${{ needs.security-scan.result }} |" >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/dependency-update.yml.bak b/.github/workflows/dependency-update.yml.bak deleted file mode 100644 index 7fd78180a..000000000 --- a/.github/workflows/dependency-update.yml.bak +++ /dev/null @@ -1,249 +0,0 @@ -name: Dependency Updates - -on: - schedule: - # Run every Monday at 06:00 UTC - - cron: '0 6 * * 1' - workflow_dispatch: - -env: - NODE_VERSION: '20' - PNPM_VERSION: '9.15.0' - -jobs: - # Check for outdated dependencies - check-outdated: - name: Check Outdated Dependencies - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Setup pnpm - uses: pnpm/action-setup@v4 - with: - version: ${{ env.PNPM_VERSION }} - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: ${{ env.NODE_VERSION }} - cache: 'pnpm' - - - name: Install dependencies - run: pnpm install --frozen-lockfile - - - name: Check for outdated dependencies - run: pnpm outdated --format json > outdated.json || true - - - name: Generate outdated report - run: | - echo "## Outdated Dependencies Report" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "Generated on: $(date -u +'%Y-%m-%d %H:%M:%S UTC')" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - - if [ -f outdated.json ] && [ -s outdated.json ]; then - echo "### Packages to Update" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - cat outdated.json | jq -r 'to_entries[] | "- **\(.key)**: \(.value.current) → \(.value.latest)"' >> $GITHUB_STEP_SUMMARY || echo "No outdated packages found" >> $GITHUB_STEP_SUMMARY - else - echo "✅ All dependencies are up to date!" 
>> $GITHUB_STEP_SUMMARY - fi - - - name: Upload outdated report - uses: actions/upload-artifact@v4 - with: - name: outdated-dependencies - path: outdated.json - retention-days: 30 - if: always() - - # Security audit - security-audit: - name: Security Audit - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Setup pnpm - uses: pnpm/action-setup@v4 - with: - version: ${{ env.PNPM_VERSION }} - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: ${{ env.NODE_VERSION }} - cache: 'pnpm' - - - name: Install dependencies - run: pnpm install --frozen-lockfile - - - name: Run security audit - run: | - pnpm audit --json > audit-report.json || true - pnpm audit --audit-level=moderate || echo "Security vulnerabilities found" - - - name: Generate security report - run: | - echo "## Security Audit Report" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "Generated on: $(date -u +'%Y-%m-%d %H:%M:%S UTC')" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - - if [ -f audit-report.json ]; then - # Parse audit report - CRITICAL=$(jq -r '.metadata.vulnerabilities.critical // 0' audit-report.json) - HIGH=$(jq -r '.metadata.vulnerabilities.high // 0' audit-report.json) - MODERATE=$(jq -r '.metadata.vulnerabilities.moderate // 0' audit-report.json) - LOW=$(jq -r '.metadata.vulnerabilities.low // 0' audit-report.json) - - echo "| Severity | Count |" >> $GITHUB_STEP_SUMMARY - echo "|----------|-------|" >> $GITHUB_STEP_SUMMARY - echo "| Critical | $CRITICAL |" >> $GITHUB_STEP_SUMMARY - echo "| High | $HIGH |" >> $GITHUB_STEP_SUMMARY - echo "| Moderate | $MODERATE |" >> $GITHUB_STEP_SUMMARY - echo "| Low | $LOW |" >> $GITHUB_STEP_SUMMARY - - if [ "$CRITICAL" -gt 0 ] || [ "$HIGH" -gt 0 ]; then - echo "" >> $GITHUB_STEP_SUMMARY - echo "⚠️ **Action Required**: Critical or high severity vulnerabilities detected!" 
>> $GITHUB_STEP_SUMMARY - fi - fi - - - name: Upload security audit - uses: actions/upload-artifact@v4 - with: - name: security-audit - path: audit-report.json - retention-days: 90 - if: always() - - - name: Create issue for critical vulnerabilities - if: always() - uses: actions/github-script@v7 - with: - script: | - const fs = require('fs'); - - if (!fs.existsSync('audit-report.json')) { - console.log('No audit report found'); - return; - } - - const auditData = JSON.parse(fs.readFileSync('audit-report.json', 'utf8')); - const critical = auditData.metadata?.vulnerabilities?.critical || 0; - const high = auditData.metadata?.vulnerabilities?.high || 0; - - if (critical > 0 || high > 0) { - const issueTitle = `🚨 Security Alert: ${critical} Critical, ${high} High Severity Vulnerabilities`; - const issueBody = ` - ## Security Vulnerability Report - - **Date**: ${new Date().toISOString()} - **Workflow Run**: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} - - ### Summary - - Critical: ${critical} - - High: ${high} - - Moderate: ${auditData.metadata?.vulnerabilities?.moderate || 0} - - Low: ${auditData.metadata?.vulnerabilities?.low || 0} - - ### Action Required - Please review the security audit report and update affected dependencies. - - \`\`\`bash - pnpm audit - pnpm audit fix - \`\`\` - - **Note**: This issue was automatically created by the dependency update workflow. 
- `; - - // Check if similar issue exists - const { data: existingIssues } = await github.rest.issues.listForRepo({ - owner: context.repo.owner, - repo: context.repo.repo, - state: 'open', - labels: 'security,automated' - }); - - const hasExistingIssue = existingIssues.some(issue => - issue.title.includes('Security Alert') - ); - - if (!hasExistingIssue) { - await github.rest.issues.create({ - owner: context.repo.owner, - repo: context.repo.repo, - title: issueTitle, - body: issueBody, - labels: ['security', 'automated', 'high-priority'] - }); - } - } - - # Update lock file - update-lockfile: - name: Update Lock File - runs-on: ubuntu-latest - needs: [check-outdated, security-audit] - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - token: ${{ secrets.GITHUB_TOKEN }} - - - name: Setup pnpm - uses: pnpm/action-setup@v4 - with: - version: ${{ env.PNPM_VERSION }} - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: ${{ env.NODE_VERSION }} - cache: 'pnpm' - - - name: Update lock file - run: | - # Update lock file without changing package.json versions - pnpm install --no-frozen-lockfile - - - name: Check for changes - id: changes - run: | - if git diff --quiet pnpm-lock.yaml; then - echo "has-changes=false" >> $GITHUB_OUTPUT - else - echo "has-changes=true" >> $GITHUB_OUTPUT - fi - - - name: Create Pull Request - if: steps.changes.outputs.has-changes == 'true' - uses: peter-evans/create-pull-request@v6 - with: - token: ${{ secrets.GITHUB_TOKEN }} - commit-message: "chore: update pnpm-lock.yaml" - title: "chore: Update dependency lock file" - body: | - ## Dependency Lock File Update - - This PR updates the `pnpm-lock.yaml` file to reflect the latest compatible versions. 
- - ### Changes - - Updated lock file to latest compatible versions - - No breaking changes to package.json - - ### Testing - - [ ] All CI checks pass - - [ ] Manual testing completed - - **Note**: This PR was automatically created by the dependency update workflow. - branch: chore/update-lockfile - labels: | - dependencies - automated - assignees: wuesteon diff --git a/.github/workflows/staging-config-check.yml b/.github/workflows/staging-config-check.yml deleted file mode 100644 index cfd9aef20..000000000 --- a/.github/workflows/staging-config-check.yml +++ /dev/null @@ -1,103 +0,0 @@ -name: Staging Config Check - -on: - pull_request: - paths: - - 'docker-compose.staging.yml' - - 'docker/caddy/Caddyfile.staging' - -jobs: - check-staging-urls: - name: Validate Staging URLs - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - - name: Check for HTTP IP addresses in _CLIENT URLs - run: | - echo "Checking docker-compose.staging.yml for HTTP IP addresses..." - - # Check that no _CLIENT URLs use HTTP IP addresses - if grep -E '_CLIENT:.*http://[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' docker-compose.staging.yml; then - echo "" - echo "::error::Found HTTP IP addresses in _CLIENT URLs!" - echo "All _CLIENT URLs must use HTTPS staging domains (e.g., https://auth.staging.manacore.ai)" - exit 1 - fi - - echo "No HTTP IP addresses found in _CLIENT URLs" - - - name: Check for non-HTTPS external URLs - run: | - echo "Checking for non-HTTPS external URLs in _CLIENT variables..." - - # Check that _CLIENT URLs use HTTPS (excluding localhost for dev) - VIOLATIONS=$(grep -E '_CLIENT:.*http://' docker-compose.staging.yml | grep -v localhost || true) - - if [ -n "$VIOLATIONS" ]; then - echo "" - echo "::error::Found non-HTTPS URLs in _CLIENT variables!" - echo "$VIOLATIONS" - echo "" - echo "All _CLIENT URLs must use HTTPS for staging domains." 
- exit 1 - fi - - echo "All _CLIENT URLs use HTTPS" - - - name: Verify required HTTPS domains - run: | - echo "Verifying required HTTPS staging domains are configured..." - - REQUIRED_DOMAINS=( - "https://auth.staging.manacore.ai" - "https://staging.manacore.ai" - ) - - MISSING=0 - for domain in "${REQUIRED_DOMAINS[@]}"; do - if ! grep -q "$domain" docker-compose.staging.yml; then - echo "::warning::Missing required domain: $domain" - MISSING=1 - fi - done - - if [ $MISSING -eq 1 ]; then - echo "" - echo "::warning::Some required staging domains are not configured. Please verify this is intentional." - fi - - echo "Domain verification complete" - - - name: Check CORS origins include HTTPS - run: | - echo "Checking CORS_ORIGINS for HTTPS staging domains..." - - # Extract CORS_ORIGINS lines and check they include staging domains - CORS_LINES=$(grep "CORS_ORIGINS:" docker-compose.staging.yml || true) - - if [ -n "$CORS_LINES" ]; then - # Check if any CORS line has HTTP staging domains (not localhost) - HTTP_CORS=$(echo "$CORS_LINES" | grep -E 'http://[a-z]+\.staging\.manacore\.ai' || true) - - if [ -n "$HTTP_CORS" ]; then - echo "" - echo "::error::Found HTTP (non-HTTPS) staging domains in CORS_ORIGINS!" 
- echo "$HTTP_CORS" - exit 1 - fi - fi - - echo "CORS origins are correctly configured" - - - name: Summary - run: | - echo "" - echo "======================================" - echo "Staging Configuration Check: PASSED" - echo "======================================" - echo "" - echo "All checks passed:" - echo " - No HTTP IP addresses in _CLIENT URLs" - echo " - All external _CLIENT URLs use HTTPS" - echo " - CORS origins correctly configured" diff --git a/.github/workflows/test-coverage.yml.bak b/.github/workflows/test-coverage.yml.bak deleted file mode 100644 index bb14aa4e5..000000000 --- a/.github/workflows/test-coverage.yml.bak +++ /dev/null @@ -1,180 +0,0 @@ -name: Test Coverage - -on: - pull_request: - branches: - - main - push: - branches: - - main - schedule: - # Run weekly on Sundays at 00:00 UTC - - cron: '0 0 * * 0' - workflow_dispatch: - -env: - NODE_VERSION: '20' - PNPM_VERSION: '9.15.0' - TURBO_TOKEN: ${{ secrets.TURBO_TOKEN }} - TURBO_TEAM: ${{ secrets.TURBO_TEAM }} - -jobs: - test-coverage: - name: Test Coverage - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Setup pnpm - uses: pnpm/action-setup@v4 - with: - version: ${{ env.PNPM_VERSION }} - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: ${{ env.NODE_VERSION }} - cache: 'pnpm' - - - name: Install dependencies - run: pnpm install --frozen-lockfile - - - name: Build shared packages - run: pnpm run build:packages - - - name: Run tests with coverage - run: pnpm run test --coverage || echo "Some tests failed" - continue-on-error: true - - - name: Collect coverage reports - run: | - # Find all coverage directories - find . 
-type d -name coverage \( -path "*/apps/*/apps/*" -o -path "*/services/*" \) > coverage_dirs.txt - - # Create combined coverage directory - mkdir -p coverage-combined - - # Copy all coverage files - while IFS= read -r dir; do - if [ -f "$dir/coverage-final.json" ]; then - PROJECT=$(echo $dir | sed 's|./apps/||' | sed 's|./services/||' | sed 's|/coverage||' | tr '/' '-') - cp "$dir/coverage-final.json" "coverage-combined/coverage-$PROJECT.json" - fi - done < coverage_dirs.txt - - - name: Generate coverage summary - run: | - echo "## Test Coverage Summary" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - - # Find and parse coverage summaries - find . -type f -name "coverage-summary.json" | while read -r file; do - PROJECT=$(dirname $file | sed 's|./apps/||' | sed 's|./services/||' | sed 's|/coverage||') - - if [ -f "$file" ]; then - LINES=$(jq -r '.total.lines.pct' "$file" 2>/dev/null || echo "0") - STATEMENTS=$(jq -r '.total.statements.pct' "$file" 2>/dev/null || echo "0") - FUNCTIONS=$(jq -r '.total.functions.pct' "$file" 2>/dev/null || echo "0") - BRANCHES=$(jq -r '.total.branches.pct' "$file" 2>/dev/null || echo "0") - - echo "### $PROJECT" >> $GITHUB_STEP_SUMMARY - echo "| Metric | Coverage |" >> $GITHUB_STEP_SUMMARY - echo "|--------|----------|" >> $GITHUB_STEP_SUMMARY - echo "| Lines | ${LINES}% |" >> $GITHUB_STEP_SUMMARY - echo "| Statements | ${STATEMENTS}% |" >> $GITHUB_STEP_SUMMARY - echo "| Functions | ${FUNCTIONS}% |" >> $GITHUB_STEP_SUMMARY - echo "| Branches | ${BRANCHES}% |" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - fi - done - - - name: Archive coverage reports - uses: actions/upload-artifact@v4 - with: - name: coverage-reports - path: | - apps/**/coverage - services/**/coverage - coverage-combined - retention-days: 30 - if-no-files-found: warn - - - name: Check coverage thresholds - run: | - echo "Checking coverage thresholds..." 
- - # Set minimum coverage threshold - MINIMUM_COVERAGE=50 # Start with 50%, increase gradually - - # Check each project's coverage - find . -type f -name "coverage-summary.json" | while read -r file; do - PROJECT=$(dirname $file | sed 's|./apps/||' | sed 's|./services/||' | sed 's|/coverage||') - LINES=$(jq -r '.total.lines.pct' "$file" 2>/dev/null || echo "0") - - echo "Checking $PROJECT: ${LINES}% coverage" - - # Convert to integer for comparison - LINES_INT=$(printf "%.0f" $LINES) - - if [ "$LINES_INT" -lt "$MINIMUM_COVERAGE" ]; then - echo "⚠️ Warning: $PROJECT coverage (${LINES}%) is below minimum threshold (${MINIMUM_COVERAGE}%)" - else - echo "✅ $PROJECT meets coverage threshold" - fi - done - - # Generate coverage badge - coverage-badge: - name: Update Coverage Badge - runs-on: ubuntu-latest - needs: test-coverage - if: github.ref == 'refs/heads/main' - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Download coverage reports - uses: actions/download-artifact@v4 - continue-on-error: true - id: download-coverage - with: - name: coverage-reports - path: coverage-reports - - - name: Create coverage badge - if: steps.download-coverage.outcome == 'success' - run: | - # Calculate overall coverage - TOTAL_LINES=0 - COVERED_LINES=0 - - find coverage-reports -type f -name "coverage-summary.json" | while read -r file; do - LINES=$(jq -r '.total.lines.total' "$file" 2>/dev/null || echo "0") - COVERED=$(jq -r '.total.lines.covered' "$file" 2>/dev/null || echo "0") - - TOTAL_LINES=$((TOTAL_LINES + LINES)) - COVERED_LINES=$((COVERED_LINES + COVERED)) - done - - if [ "$TOTAL_LINES" -gt 0 ]; then - COVERAGE=$(echo "scale=2; $COVERED_LINES * 100 / $TOTAL_LINES" | bc) - echo "Overall coverage: ${COVERAGE}%" - echo "COVERAGE=${COVERAGE}" >> $GITHUB_ENV - else - echo "No coverage data found" - echo "COVERAGE=0" >> $GITHUB_ENV - fi - - - name: Update README badge - if: steps.download-coverage.outcome == 'success' - run: | - echo "Coverage badge data 
ready: ${{ env.COVERAGE }}%" - # This would update a badge in the README or create a gist - # Implementation depends on chosen badge service (shields.io, codecov, etc.) - - - name: Skip badge update - if: steps.download-coverage.outcome != 'success' - run: echo "No coverage reports available - skipping badge update" diff --git a/.github/workflows/test.yml.bak b/.github/workflows/test.yml.bak deleted file mode 100644 index 1ffd6eb46..000000000 --- a/.github/workflows/test.yml.bak +++ /dev/null @@ -1,389 +0,0 @@ -name: Test Suite - -on: - pull_request: - branches: [main, develop] - push: - branches: [main, develop] - workflow_dispatch: - -# Cancel in-progress runs for same PR/branch -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -env: - NODE_VERSION: '20' - PNPM_VERSION: '9.15.0' - -jobs: - # ==================== - # 1. TEST BACKENDS - # ==================== - test-backends: - name: Test Backend - ${{ matrix.project }} - runs-on: ubuntu-latest - timeout-minutes: 10 - - strategy: - fail-fast: false - matrix: - project: - - maerchenzauber - - manadeck - - chat - - nutriphi - - picture - - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Setup pnpm - uses: pnpm/action-setup@v2 - with: - version: ${{ env.PNPM_VERSION }} - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: ${{ env.NODE_VERSION }} - cache: 'pnpm' - - - name: Install dependencies - run: pnpm install --frozen-lockfile - - - name: Build shared packages - run: pnpm run build:packages - - - name: Type check - run: pnpm --filter @${{ matrix.project }}/backend type-check - continue-on-error: true - - - name: Run tests with coverage - run: pnpm --filter @${{ matrix.project }}/backend test:cov - env: - NODE_ENV: test - - - name: Check coverage thresholds - run: | - echo "Checking coverage meets 80% threshold..." - # Jest/Vitest will fail if thresholds aren't met - - # ==================== - # 2. 
TEST MOBILE APPS - # ==================== - test-mobile: - name: Test Mobile - ${{ matrix.project }} - runs-on: ubuntu-latest - timeout-minutes: 15 - - strategy: - fail-fast: false - matrix: - project: - - maerchenzauber - - memoro - - picture - - chat - - manacore - - manadeck - - nutriphi - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Setup pnpm - uses: pnpm/action-setup@v2 - with: - version: ${{ env.PNPM_VERSION }} - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: ${{ env.NODE_VERSION }} - cache: 'pnpm' - - - name: Install dependencies - run: pnpm install --frozen-lockfile - - - name: Build shared packages - run: pnpm run build:packages - - - name: Type check - run: pnpm --filter @${{ matrix.project }}/mobile type-check - continue-on-error: true - - - name: Run tests with coverage - run: pnpm --filter @${{ matrix.project }}/mobile test -- --coverage --watchAll=false --ci - env: - NODE_ENV: test - - # ==================== - # 3. TEST WEB APPS - # ==================== - test-web: - name: Test Web - ${{ matrix.project }} - runs-on: ubuntu-latest - timeout-minutes: 15 - - strategy: - fail-fast: false - matrix: - project: - - maerchenzauber - - manacore - - memoro - - picture - - uload - - chat - - manadeck - - nutriphi - - news - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Setup pnpm - uses: pnpm/action-setup@v2 - with: - version: ${{ env.PNPM_VERSION }} - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: ${{ env.NODE_VERSION }} - cache: 'pnpm' - - - name: Install dependencies - run: pnpm install --frozen-lockfile - - - name: Build shared packages - run: pnpm run build:packages - - - name: Type check - run: pnpm --filter @${{ matrix.project }}/web check - continue-on-error: true - - - name: Run unit tests with coverage - run: pnpm --filter @${{ matrix.project }}/web test:unit -- --coverage --run - env: - NODE_ENV: test - - # ==================== - # 4. 
E2E TESTS (WEB) - # ==================== - test-e2e-web: - name: E2E Web - ${{ matrix.project }} - runs-on: ubuntu-latest - timeout-minutes: 20 - - strategy: - fail-fast: false - matrix: - project: - - uload - # Add other projects with E2E tests - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Setup pnpm - uses: pnpm/action-setup@v2 - with: - version: ${{ env.PNPM_VERSION }} - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: ${{ env.NODE_VERSION }} - cache: 'pnpm' - - - name: Install dependencies - run: pnpm install --frozen-lockfile - - - name: Build shared packages - run: pnpm run build:packages - - - name: Install Playwright browsers - run: pnpm --filter @${{ matrix.project }}/web exec playwright install --with-deps chromium - - - name: Build application - run: pnpm --filter @${{ matrix.project }}/web build - - - name: Run E2E tests - run: pnpm --filter @${{ matrix.project }}/web test:e2e - env: - CI: true - - - name: Upload Playwright report - if: always() - uses: actions/upload-artifact@v4 - with: - name: playwright-report-${{ matrix.project }} - path: ./apps/${{ matrix.project }}/apps/web/playwright-report/ - retention-days: 7 - - # ==================== - # 5. 
TEST SHARED PACKAGES - # ==================== - test-shared-packages: - name: Test Shared Packages - runs-on: ubuntu-latest - timeout-minutes: 10 - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Setup pnpm - uses: pnpm/action-setup@v2 - with: - version: ${{ env.PNPM_VERSION }} - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: ${{ env.NODE_VERSION }} - cache: 'pnpm' - - - name: Install dependencies - run: pnpm install --frozen-lockfile - - - name: Build shared packages - run: pnpm run build:packages - - - name: Type check shared packages - run: pnpm --filter './packages/*' type-check - continue-on-error: true - - - name: Run tests with coverage - run: pnpm --filter './packages/*' test -- --coverage --run - continue-on-error: true - env: - NODE_ENV: test - - # ==================== - # 6. LINT & FORMAT CHECK - # ==================== - lint-and-format: - name: Lint & Format - runs-on: ubuntu-latest - timeout-minutes: 10 - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Setup pnpm - uses: pnpm/action-setup@v2 - with: - version: ${{ env.PNPM_VERSION }} - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: ${{ env.NODE_VERSION }} - cache: 'pnpm' - - - name: Install dependencies - run: pnpm install --frozen-lockfile - - - name: Build shared packages - run: pnpm run build:packages - - - name: Check formatting - run: pnpm run format:check - - - name: Run linters - run: pnpm run lint - continue-on-error: true - - # ==================== - # 7. 
COVERAGE REPORT - # ==================== - coverage-report: - name: Generate Coverage Report - needs: - - test-backends - - test-mobile - - test-web - - test-shared-packages - runs-on: ubuntu-latest - if: always() - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Download all coverage reports - uses: actions/download-artifact@v4 - continue-on-error: true - - - name: Generate coverage summary - run: | - echo "## 📊 Test Coverage Summary" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "### Jobs Status" >> $GITHUB_STEP_SUMMARY - echo "- Backend Tests: ${{ needs.test-backends.result }}" >> $GITHUB_STEP_SUMMARY - echo "- Mobile Tests: ${{ needs.test-mobile.result }}" >> $GITHUB_STEP_SUMMARY - echo "- Web Tests: ${{ needs.test-web.result }}" >> $GITHUB_STEP_SUMMARY - echo "- Shared Packages Tests: ${{ needs.test-shared-packages.result }}" >> $GITHUB_STEP_SUMMARY - - # ==================== - # 8. TEST STATUS CHECK - # ==================== - test-status: - name: All Tests Status - needs: - - test-backends - - test-mobile - - test-web - - test-shared-packages - - lint-and-format - runs-on: ubuntu-latest - if: always() - - steps: - - name: Check test results - run: | - if [ "${{ needs.test-backends.result }}" != "success" ] || \ - [ "${{ needs.test-mobile.result }}" != "success" ] || \ - [ "${{ needs.test-web.result }}" != "success" ] || \ - [ "${{ needs.test-shared-packages.result }}" != "success" ]; then - echo "❌ Some tests failed" - exit 1 - fi - echo "✅ All tests passed" - - - name: Post PR comment - if: github.event_name == 'pull_request' - uses: actions/github-script@v7 - with: - script: | - const status = '${{ needs.test-status.result }}' === 'success' ? '✅' : '❌'; - const body = `## ${status} Test Suite Results - - **Status**: ${status === '✅' ? 'All tests passed!' 
: 'Some tests failed'} - - ### Test Coverage - - Backend: ${{ needs.test-backends.result }} - - Mobile: ${{ needs.test-mobile.result }} - - Web: ${{ needs.test-web.result }} - - Shared Packages: ${{ needs.test-shared-packages.result }} - - Lint & Format: ${{ needs.lint-and-format.result }} - - View detailed results in the [Actions tab](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}) - `; - - github.rest.issues.createComment({ - issue_number: context.issue.number, - owner: context.repo.owner, - repo: context.repo.repo, - body - }); diff --git a/CLAUDE.md b/CLAUDE.md index f925c51e2..d2e646183 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -549,16 +549,34 @@ npx wrangler pages project add-domain chat-landing chat.manacore.app ## Server Access -### Hetzner Staging Server +### Mac Mini Production Server -SSH access for deployment troubleshooting, log inspection, and service management: +The production environment runs on a Mac Mini, accessible via Cloudflare Tunnel. + +**Domain:** mana.how +**SSH:** `ssh mana-server` (requires cloudflared and SSH config) ```bash -ssh -i ~/.ssh/hetzner_deploy_key deploy@46.224.108.214 +# SSH config (~/.ssh/config) +Host mana-server + HostName mac-mini.mana.how + User till + ProxyCommand /opt/homebrew/bin/cloudflared access ssh --hostname %h ``` -**User:** `deploy` -**Key:** `~/.ssh/hetzner_deploy_key` +#### Useful Commands + +```bash +ssh mana-server # Connect to server +cd ~/projects/manacore-monorepo + +./scripts/mac-mini/status.sh # Check all services +./scripts/mac-mini/deploy.sh # Pull & restart containers +./scripts/mac-mini/health-check.sh # Run health checks +docker compose -f docker-compose.macmini.yml logs -f # View logs +``` + +For detailed server documentation, see **[docs/MAC_MINI_SERVER.md](docs/MAC_MINI_SERVER.md)**. 
## Adding Dependencies diff --git a/docker-compose.macmini.yml b/docker-compose.macmini.yml index 5c4636fc5..c6f38c71f 100644 --- a/docker-compose.macmini.yml +++ b/docker-compose.macmini.yml @@ -723,6 +723,32 @@ services: retries: 3 start_period: 40s + # ============================================ + # Auto-Update (Watchtower) + # ============================================ + + watchtower: + image: containrrr/watchtower + container_name: manacore-watchtower + restart: always + volumes: + - /var/run/docker.sock:/var/run/docker.sock + - ~/.docker/config.json:/config.json:ro + environment: + TZ: Europe/Berlin + WATCHTOWER_POLL_INTERVAL: 300 # Check every 5 minutes + WATCHTOWER_CLEANUP: "true" # Remove old images + WATCHTOWER_INCLUDE_STOPPED: "false" # Only update running containers + WATCHTOWER_NO_STARTUP_MESSAGE: "false" # Log startup message + WATCHTOWER_NOTIFICATIONS: shoutrrr + WATCHTOWER_NOTIFICATION_URL: ${WATCHTOWER_NOTIFICATION_URL:-} # Optional: telegram://token@telegram?chats=chatid + WATCHTOWER_NOTIFICATION_TEMPLATE: | + {{- if .Updated -}} + 🚀 *ManaCore Update* + Updated: {{range .Updated}}{{.Name}} {{end}} + {{- end -}} + command: --label-enable=false # Update all containers (not just labeled ones) + # ============================================ # Volumes # ============================================ diff --git a/docker-compose.production.yml b/docker-compose.production.yml deleted file mode 100644 index 9b5e5cb7d..000000000 --- a/docker-compose.production.yml +++ /dev/null @@ -1,429 +0,0 @@ -# ManaCore Production Configuration -# Domain: mana.how -# Server: 46.224.108.214 -# -# This replaces the staging environment as production. 
-# Apps: mana-core-auth, manacore-web, chat, todo, calendar, clock - -services: - # ============================================ - # Infrastructure Services - # ============================================ - - postgres: - image: postgres:16-alpine - container_name: manacore-postgres-prod - restart: always - environment: - POSTGRES_DB: ${POSTGRES_DB:-manacore} - POSTGRES_USER: ${POSTGRES_USER:-postgres} - POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} - volumes: - - postgres_data:/var/lib/postgresql/data - ports: - - "127.0.0.1:5432:5432" - healthcheck: - test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-postgres}"] - interval: 10s - timeout: 5s - retries: 5 - networks: - - manacore-network - - redis: - image: redis:7-alpine - container_name: manacore-redis-prod - restart: always - command: redis-server --requirepass ${REDIS_PASSWORD} - volumes: - - redis_data:/data - ports: - - "127.0.0.1:6379:6379" - healthcheck: - test: ["CMD", "redis-cli", "--raw", "incr", "ping"] - interval: 10s - timeout: 5s - retries: 5 - networks: - - manacore-network - - # ============================================ - # Auth Service - # ============================================ - - mana-core-auth: - image: ${DOCKER_REGISTRY:-ghcr.io/memo-2023}/mana-core-auth:${AUTH_VERSION:-latest} - container_name: mana-core-auth-prod - restart: always - depends_on: - postgres: - condition: service_healthy - redis: - condition: service_healthy - environment: - NODE_ENV: production - PORT: 3001 - DATABASE_URL: postgresql://${POSTGRES_USER:-postgres}:${POSTGRES_PASSWORD}@postgres:5432/manacore_auth - REDIS_HOST: redis - REDIS_PORT: 6379 - REDIS_PASSWORD: ${REDIS_PASSWORD} - JWT_SECRET: ${JWT_SECRET} - JWT_PUBLIC_KEY: ${JWT_PUBLIC_KEY} - JWT_PRIVATE_KEY: ${JWT_PRIVATE_KEY} - # CORS - Production domains only - CORS_ORIGINS: https://mana.how,https://chat.mana.how,https://todo.mana.how,https://calendar.mana.how,https://clock.mana.how - ports: - - "3001:3001" - healthcheck: - test: ["CMD", "wget", "--no-verbose", 
"--tries=1", "--spider", "http://localhost:3001/api/v1/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s - networks: - - manacore-network - logging: - driver: "json-file" - options: - max-size: "50m" - max-file: "5" - deploy: - resources: - limits: - cpus: '1' - memory: 512M - - # ============================================ - # ManaCore Dashboard - # ============================================ - - manacore-web: - image: ${DOCKER_REGISTRY:-ghcr.io/memo-2023}/manacore-web:${MANACORE_WEB_VERSION:-latest} - container_name: manacore-web-prod - restart: always - depends_on: - mana-core-auth: - condition: service_healthy - environment: - NODE_ENV: production - PORT: 5173 - # Auth URLs - PUBLIC_MANA_CORE_AUTH_URL: http://mana-core-auth:3001 - PUBLIC_MANA_CORE_AUTH_URL_CLIENT: https://auth.mana.how - # Backend URLs for dashboard widgets - PUBLIC_TODO_API_URL: http://todo-backend:3018 - PUBLIC_TODO_API_URL_CLIENT: https://todo-api.mana.how - PUBLIC_CALENDAR_API_URL: http://calendar-backend:3016 - PUBLIC_CALENDAR_API_URL_CLIENT: https://calendar-api.mana.how - PUBLIC_CLOCK_API_URL: http://clock-backend:3017 - PUBLIC_CLOCK_API_URL_CLIENT: https://clock-api.mana.how - ports: - - "5173:5173" - healthcheck: - test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:5173/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s - networks: - - manacore-network - logging: - driver: "json-file" - options: - max-size: "50m" - max-file: "5" - - # ============================================ - # Chat App - # ============================================ - - chat-backend: - image: ${DOCKER_REGISTRY:-ghcr.io/memo-2023}/chat-backend:${CHAT_VERSION:-latest} - container_name: chat-backend-prod - restart: always - depends_on: - mana-core-auth: - condition: service_healthy - postgres: - condition: service_healthy - environment: - NODE_ENV: production - PORT: 3002 - DATABASE_URL: 
postgresql://${POSTGRES_USER:-postgres}:${POSTGRES_PASSWORD}@postgres:5432/chat - MANA_CORE_AUTH_URL: http://mana-core-auth:3001 - SUPABASE_URL: ${SUPABASE_URL} - SUPABASE_SERVICE_KEY: ${SUPABASE_SERVICE_ROLE_KEY} - AZURE_OPENAI_ENDPOINT: ${AZURE_OPENAI_ENDPOINT} - AZURE_OPENAI_API_KEY: ${AZURE_OPENAI_API_KEY} - AZURE_OPENAI_API_VERSION: ${AZURE_OPENAI_API_VERSION:-2024-12-01-preview} - CORS_ORIGINS: https://chat.mana.how,https://mana.how - ports: - - "3002:3002" - healthcheck: - test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3002/api/v1/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s - networks: - - manacore-network - logging: - driver: "json-file" - options: - max-size: "50m" - max-file: "5" - deploy: - resources: - limits: - cpus: '2' - memory: 1G - - chat-web: - image: ${DOCKER_REGISTRY:-ghcr.io/memo-2023}/chat-web:${CHAT_WEB_VERSION:-latest} - container_name: chat-web-prod - restart: always - depends_on: - chat-backend: - condition: service_healthy - environment: - NODE_ENV: production - PORT: 3000 - PUBLIC_BACKEND_URL: http://chat-backend:3002 - PUBLIC_MANA_CORE_AUTH_URL: http://mana-core-auth:3001 - PUBLIC_BACKEND_URL_CLIENT: https://chat-api.mana.how - PUBLIC_MANA_CORE_AUTH_URL_CLIENT: https://auth.mana.how - ports: - - "3000:3000" - healthcheck: - test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3000/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s - networks: - - manacore-network - logging: - driver: "json-file" - options: - max-size: "50m" - max-file: "5" - - # ============================================ - # Todo App - # ============================================ - - todo-backend: - image: ${DOCKER_REGISTRY:-ghcr.io/memo-2023}/todo-backend:${TODO_BACKEND_VERSION:-latest} - container_name: todo-backend-prod - restart: always - depends_on: - mana-core-auth: - condition: service_healthy - postgres: - condition: service_healthy - environment: 
- NODE_ENV: production - PORT: 3018 - DATABASE_URL: postgresql://${POSTGRES_USER:-postgres}:${POSTGRES_PASSWORD}@postgres:5432/todo - MANA_CORE_AUTH_URL: http://mana-core-auth:3001 - CORS_ORIGINS: https://todo.mana.how,https://mana.how - ports: - - "3018:3018" - healthcheck: - test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3018/api/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s - networks: - - manacore-network - logging: - driver: "json-file" - options: - max-size: "50m" - max-file: "5" - - todo-web: - image: ${DOCKER_REGISTRY:-ghcr.io/memo-2023}/todo-web:${TODO_WEB_VERSION:-latest} - container_name: todo-web-prod - restart: always - depends_on: - todo-backend: - condition: service_healthy - environment: - NODE_ENV: production - PORT: 5188 - PUBLIC_BACKEND_URL: http://todo-backend:3018 - PUBLIC_MANA_CORE_AUTH_URL: http://mana-core-auth:3001 - PUBLIC_BACKEND_URL_CLIENT: https://todo-api.mana.how - PUBLIC_MANA_CORE_AUTH_URL_CLIENT: https://auth.mana.how - ports: - - "5188:5188" - healthcheck: - test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:5188/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s - networks: - - manacore-network - logging: - driver: "json-file" - options: - max-size: "50m" - max-file: "5" - - # ============================================ - # Calendar App - # ============================================ - - calendar-backend: - image: ${DOCKER_REGISTRY:-ghcr.io/memo-2023}/calendar-backend:${CALENDAR_VERSION:-latest} - container_name: calendar-backend-prod - restart: always - depends_on: - mana-core-auth: - condition: service_healthy - postgres: - condition: service_healthy - environment: - NODE_ENV: production - PORT: 3016 - DATABASE_URL: postgresql://${POSTGRES_USER:-postgres}:${POSTGRES_PASSWORD}@postgres:5432/calendar - DB_HOST: postgres - DB_PORT: 5432 - DB_USER: ${POSTGRES_USER:-postgres} - MANA_CORE_AUTH_URL: 
http://mana-core-auth:3001 - CORS_ORIGINS: https://calendar.mana.how,https://mana.how - ports: - - "3016:3016" - healthcheck: - test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3016/api/v1/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s - networks: - - manacore-network - logging: - driver: "json-file" - options: - max-size: "50m" - max-file: "5" - - calendar-web: - image: ${DOCKER_REGISTRY:-ghcr.io/memo-2023}/calendar-web:${CALENDAR_WEB_VERSION:-latest} - container_name: calendar-web-prod - restart: always - depends_on: - calendar-backend: - condition: service_healthy - environment: - NODE_ENV: production - PORT: 5186 - PUBLIC_BACKEND_URL: http://calendar-backend:3016 - PUBLIC_MANA_CORE_AUTH_URL: http://mana-core-auth:3001 - PUBLIC_BACKEND_URL_CLIENT: https://calendar-api.mana.how - PUBLIC_MANA_CORE_AUTH_URL_CLIENT: https://auth.mana.how - ports: - - "5186:5186" - healthcheck: - test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:5186/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s - networks: - - manacore-network - logging: - driver: "json-file" - options: - max-size: "50m" - max-file: "5" - - # ============================================ - # Clock App - # ============================================ - - clock-backend: - image: ${DOCKER_REGISTRY:-ghcr.io/memo-2023}/clock-backend:${CLOCK_VERSION:-latest} - container_name: clock-backend-prod - restart: always - depends_on: - mana-core-auth: - condition: service_healthy - postgres: - condition: service_healthy - environment: - NODE_ENV: production - PORT: 3017 - DATABASE_URL: postgresql://${POSTGRES_USER:-postgres}:${POSTGRES_PASSWORD}@postgres:5432/clock - DB_HOST: postgres - DB_PORT: 5432 - DB_USER: ${POSTGRES_USER:-postgres} - MANA_CORE_AUTH_URL: http://mana-core-auth:3001 - CORS_ORIGINS: https://clock.mana.how,https://mana.how - ports: - - "3017:3017" - healthcheck: - test: ["CMD", "wget", 
"--no-verbose", "--tries=1", "--spider", "http://localhost:3017/api/v1/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s - networks: - - manacore-network - logging: - driver: "json-file" - options: - max-size: "50m" - max-file: "5" - - clock-web: - image: ${DOCKER_REGISTRY:-ghcr.io/memo-2023}/clock-web:${CLOCK_WEB_VERSION:-latest} - container_name: clock-web-prod - restart: always - depends_on: - clock-backend: - condition: service_healthy - environment: - NODE_ENV: production - PORT: 5187 - PUBLIC_BACKEND_URL: http://clock-backend:3017 - PUBLIC_MANA_CORE_AUTH_URL: http://mana-core-auth:3001 - PUBLIC_BACKEND_URL_CLIENT: https://clock-api.mana.how - PUBLIC_MANA_CORE_AUTH_URL_CLIENT: https://auth.mana.how - ports: - - "5187:5187" - healthcheck: - test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:5187/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s - networks: - - manacore-network - logging: - driver: "json-file" - options: - max-size: "50m" - max-file: "5" - -# ============================================ -# Networks -# ============================================ - -networks: - manacore-network: - driver: bridge - name: manacore-production - -# ============================================ -# Volumes -# ============================================ - -volumes: - postgres_data: - name: manacore-postgres-prod - redis_data: - name: manacore-redis-prod diff --git a/docker-compose.staging.full.yml b/docker-compose.staging.full.yml deleted file mode 100644 index f0937e38b..000000000 --- a/docker-compose.staging.full.yml +++ /dev/null @@ -1,290 +0,0 @@ -# ARCHIVED: Full staging config with all services -# Active simplified config: docker-compose.staging.yml -# -# Services included: -# - postgres, redis (infrastructure) -# - mana-core-auth, chat-backend, manadeck-backend (backends) -# - nginx (reverse proxy) -# -# To restore: cp docker-compose.staging.full.yml docker-compose.staging.yml - 
-services: - # ============================================ - # Infrastructure Services - # ============================================ - - postgres: - image: postgres:16-alpine - container_name: manacore-postgres-staging - restart: unless-stopped - environment: - POSTGRES_DB: ${POSTGRES_DB:-manacore} - POSTGRES_USER: ${POSTGRES_USER:-postgres} - POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} - volumes: - - postgres_data:/var/lib/postgresql/data - # init.sql removed - not needed for staging - ports: - - "5432:5432" - healthcheck: - test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-postgres}"] - interval: 10s - timeout: 5s - retries: 5 - networks: - - manacore-network - - redis: - image: redis:7-alpine - container_name: manacore-redis-staging - restart: unless-stopped - command: redis-server --requirepass ${REDIS_PASSWORD:-redis123} - volumes: - - redis_data:/data - ports: - - "6379:6379" - healthcheck: - test: ["CMD", "redis-cli", "--raw", "incr", "ping"] - interval: 10s - timeout: 5s - retries: 5 - networks: - - manacore-network - - # ============================================ - # Backend Services - # ============================================ - - mana-core-auth: - image: ${DOCKER_REGISTRY:-ghcr.io/memo-2023}/mana-core-auth:${AUTH_VERSION:-latest} - container_name: mana-core-auth-staging - restart: unless-stopped - depends_on: - postgres: - condition: service_healthy - redis: - condition: service_healthy - environment: - NODE_ENV: staging - PORT: 3001 - DATABASE_URL: postgresql://${POSTGRES_USER:-postgres}:${POSTGRES_PASSWORD}@postgres:5432/manacore_auth - REDIS_HOST: redis - REDIS_PORT: 6379 - REDIS_PASSWORD: ${REDIS_PASSWORD:-redis123} - JWT_SECRET: ${JWT_SECRET} - JWT_PUBLIC_KEY: ${JWT_PUBLIC_KEY} - JWT_PRIVATE_KEY: ${JWT_PRIVATE_KEY} - ports: - - "3001:3001" - healthcheck: - test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3001/api/v1/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s - networks: - - 
manacore-network - logging: - driver: "json-file" - options: - max-size: "10m" - max-file: "3" - - # maerchenzauber-backend: - # image: ${DOCKER_REGISTRY:-ghcr.io/memo-2023}/maerchenzauber-backend:${MAERCHENZAUBER_VERSION:-latest} - # container_name: maerchenzauber-backend-staging - # restart: unless-stopped - # depends_on: - # mana-core-auth: - # condition: service_healthy - # environment: - # NODE_ENV: staging - # PORT: 3002 - # MANA_SERVICE_URL: http://mana-core-auth:3001 - # SUPABASE_URL: ${SUPABASE_URL} - # SUPABASE_ANON_KEY: ${SUPABASE_ANON_KEY} - # SUPABASE_SERVICE_ROLE_KEY: ${SUPABASE_SERVICE_ROLE_KEY} - # AZURE_OPENAI_ENDPOINT: ${AZURE_OPENAI_ENDPOINT} - # AZURE_OPENAI_API_KEY: ${AZURE_OPENAI_API_KEY} - # AZURE_OPENAI_API_VERSION: ${AZURE_OPENAI_API_VERSION:-2024-12-01-preview} - # ports: - # - "3002:3002" - # healthcheck: - # test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3002/health"] - # interval: 30s - # timeout: 10s - # retries: 3 - # networks: - # - manacore-network - # logging: - # driver: "json-file" - # options: - # max-size: "10m" - # max-file: "3" - # # DISABLED: No Dockerfile exists yet - - chat-backend: - image: ${DOCKER_REGISTRY:-ghcr.io/memo-2023}/chat-backend:${CHAT_VERSION:-latest} - container_name: chat-backend-staging - restart: unless-stopped - depends_on: - mana-core-auth: - condition: service_healthy - postgres: - condition: service_healthy - environment: - NODE_ENV: staging - PORT: 3002 - DATABASE_URL: postgresql://${POSTGRES_USER:-postgres}:${POSTGRES_PASSWORD}@postgres:5432/chat - MANA_SERVICE_URL: http://mana-core-auth:3001 - SUPABASE_URL: ${SUPABASE_URL} - SUPABASE_SERVICE_KEY: ${SUPABASE_SERVICE_ROLE_KEY} - AZURE_OPENAI_ENDPOINT: ${AZURE_OPENAI_ENDPOINT} - AZURE_OPENAI_API_KEY: ${AZURE_OPENAI_API_KEY} - AZURE_OPENAI_API_VERSION: ${AZURE_OPENAI_API_VERSION:-2024-12-01-preview} - ports: - - "3003:3002" - healthcheck: - test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", 
"http://localhost:3002/api/health"] - interval: 30s - timeout: 10s - retries: 3 - networks: - - manacore-network - logging: - driver: "json-file" - options: - max-size: "10m" - max-file: "3" - - manadeck-backend: - image: ${DOCKER_REGISTRY:-ghcr.io/memo-2023}/manadeck-backend:${MANADECK_VERSION:-latest} - container_name: manadeck-backend-staging - restart: unless-stopped - depends_on: - mana-core-auth: - condition: service_healthy - postgres: - condition: service_healthy - environment: - NODE_ENV: staging - PORT: 3003 - DATABASE_URL: postgresql://${POSTGRES_USER:-postgres}:${POSTGRES_PASSWORD}@postgres:5432/manadeck - MANA_SERVICE_URL: http://mana-core-auth:3001 - SUPABASE_URL: ${SUPABASE_URL} - SUPABASE_SERVICE_KEY: ${SUPABASE_SERVICE_ROLE_KEY} - ports: - - "3004:3003" - healthcheck: - test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3003/health"] - interval: 30s - timeout: 10s - retries: 3 - networks: - - manacore-network - logging: - driver: "json-file" - options: - max-size: "10m" - max-file: "3" - - # nutriphi-backend: - # image: ${DOCKER_REGISTRY:-ghcr.io/memo-2023}/nutriphi-backend:${NUTRIPHI_VERSION:-latest} - # container_name: nutriphi-backend-staging - # restart: unless-stopped - # depends_on: - # mana-core-auth: - # condition: service_healthy - # environment: - # NODE_ENV: staging - # PORT: 3004 - # MANA_SERVICE_URL: http://mana-core-auth:3001 - # SUPABASE_URL: ${SUPABASE_URL} - # SUPABASE_SERVICE_KEY: ${SUPABASE_SERVICE_ROLE_KEY} - # ports: - # - "3005:3004" - # healthcheck: - # test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3004/health"] - # interval: 30s - # timeout: 10s - # retries: 3 - # networks: - # - manacore-network - # logging: - # driver: "json-file" - # options: - # max-size: "10m" - # max-file: "3" - # # DISABLED: No Dockerfile exists yet - - # news-api: - # image: ${DOCKER_REGISTRY:-ghcr.io/memo-2023}/news-api:${NEWS_VERSION:-latest} - # container_name: news-api-staging - # 
restart: unless-stopped - # depends_on: - # mana-core-auth: - # condition: service_healthy - # environment: - # NODE_ENV: staging - # PORT: 3005 - # MANA_SERVICE_URL: http://mana-core-auth:3001 - # ports: - # - "3006:3005" - # healthcheck: - # test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3005/health"] - # interval: 30s - # timeout: 10s - # retries: 3 - # networks: - # - manacore-network - # logging: - # driver: "json-file" - # options: - # max-size: "10m" - # max-file: "3" - # # DISABLED: No Dockerfile exists yet - - # ============================================ - # Reverse Proxy (Optional) - # ============================================ - - nginx: - image: nginx:alpine - container_name: manacore-nginx-staging - restart: unless-stopped - depends_on: - - mana-core-auth - - chat-backend - - manadeck-backend - volumes: - - ./docker/nginx/staging.conf:/etc/nginx/conf.d/default.conf - - ./docker/nginx/ssl:/etc/nginx/ssl - ports: - - "80:80" - - "443:443" - networks: - - manacore-network - logging: - driver: "json-file" - options: - max-size: "10m" - max-file: "3" - -# ============================================ -# Networks -# ============================================ - -networks: - manacore-network: - driver: bridge - name: manacore-staging - -# ============================================ -# Volumes -# ============================================ - -volumes: - postgres_data: - name: manacore-postgres-staging - redis_data: - name: manacore-redis-staging diff --git a/docker-compose.staging.yml b/docker-compose.staging.yml deleted file mode 100644 index 6ba21f4ab..000000000 --- a/docker-compose.staging.yml +++ /dev/null @@ -1,421 +0,0 @@ -# Simplified staging config: mana-core-auth + chat (backend + web) -# Full config archived at: docker-compose.staging.full.yml -# -# To restore full config: -# cp docker-compose.staging.full.yml docker-compose.staging.yml -# -# To add more services back: -# 1. 
Copy the service block from docker-compose.staging.full.yml -# 2. Add corresponding health check in .github/workflows/cd-staging.yml -# 3. Add service to workflow_dispatch options in cd-staging.yml - -services: - # ============================================ - # Infrastructure Services - # ============================================ - - postgres: - image: postgres:16-alpine - container_name: manacore-postgres-staging - restart: unless-stopped - environment: - POSTGRES_DB: ${POSTGRES_DB:-manacore} - POSTGRES_USER: ${POSTGRES_USER:-postgres} - POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} - volumes: - - postgres_data:/var/lib/postgresql/data - ports: - - "5432:5432" - healthcheck: - test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-postgres}"] - interval: 10s - timeout: 5s - retries: 5 - networks: - - manacore-network - - redis: - image: redis:7-alpine - container_name: manacore-redis-staging - restart: unless-stopped - command: redis-server --requirepass ${REDIS_PASSWORD:-redis123} - volumes: - - redis_data:/data - ports: - - "6379:6379" - healthcheck: - test: ["CMD", "redis-cli", "--raw", "incr", "ping"] - interval: 10s - timeout: 5s - retries: 5 - networks: - - manacore-network - - # ============================================ - # Backend Services - # ============================================ - - mana-core-auth: - image: ${DOCKER_REGISTRY:-ghcr.io/memo-2023}/mana-core-auth:${AUTH_VERSION:-latest} - container_name: mana-core-auth-staging - restart: unless-stopped - depends_on: - postgres: - condition: service_healthy - redis: - condition: service_healthy - environment: - NODE_ENV: staging - PORT: 3001 - DATABASE_URL: postgresql://${POSTGRES_USER:-postgres}:${POSTGRES_PASSWORD}@postgres:5432/manacore_auth - REDIS_HOST: redis - REDIS_PORT: 6379 - REDIS_PASSWORD: ${REDIS_PASSWORD:-redis123} - JWT_SECRET: ${JWT_SECRET} - JWT_PUBLIC_KEY: ${JWT_PUBLIC_KEY} - JWT_PRIVATE_KEY: ${JWT_PRIVATE_KEY} - # CORS - Allow all staging web app origins (HTTPS domains + localhost for 
dev) - CORS_ORIGINS: https://chat.staging.manacore.ai,https://staging.manacore.ai,https://calendar.staging.manacore.ai,https://clock.staging.manacore.ai,https://todo.staging.manacore.ai,http://localhost:3000,http://localhost:5173,http://localhost:5186,http://localhost:5187,http://localhost:5188 - ports: - - "3001:3001" - healthcheck: - test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3001/api/v1/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s - networks: - - manacore-network - logging: - driver: "json-file" - options: - max-size: "10m" - max-file: "3" - - chat-backend: - image: ${DOCKER_REGISTRY:-ghcr.io/memo-2023}/chat-backend:${CHAT_VERSION:-latest} - container_name: chat-backend-staging - restart: unless-stopped - depends_on: - mana-core-auth: - condition: service_healthy - postgres: - condition: service_healthy - environment: - NODE_ENV: staging - PORT: 3002 - DATABASE_URL: postgresql://${POSTGRES_USER:-postgres}:${POSTGRES_PASSWORD}@postgres:5432/chat - MANA_CORE_AUTH_URL: http://mana-core-auth:3001 - SUPABASE_URL: ${SUPABASE_URL} - SUPABASE_SERVICE_KEY: ${SUPABASE_SERVICE_ROLE_KEY} - AZURE_OPENAI_ENDPOINT: ${AZURE_OPENAI_ENDPOINT} - AZURE_OPENAI_API_KEY: ${AZURE_OPENAI_API_KEY} - AZURE_OPENAI_API_VERSION: ${AZURE_OPENAI_API_VERSION:-2024-12-01-preview} - ports: - - "3002:3002" - healthcheck: - test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3002/api/v1/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s - networks: - - manacore-network - logging: - driver: "json-file" - options: - max-size: "10m" - max-file: "3" - - chat-web: - image: ${DOCKER_REGISTRY:-ghcr.io/memo-2023}/chat-web:${CHAT_WEB_VERSION:-latest} - container_name: chat-web-staging - restart: unless-stopped - depends_on: - chat-backend: - condition: service_healthy - environment: - NODE_ENV: staging - PORT: 3000 - # Server-side URLs (Docker internal network) - PUBLIC_BACKEND_URL: 
http://chat-backend:3002 - PUBLIC_MANA_CORE_AUTH_URL: http://mana-core-auth:3001 - # Client-side URLs (browser access via HTTPS staging domains) - PUBLIC_BACKEND_URL_CLIENT: https://chat-api.staging.manacore.ai - PUBLIC_MANA_CORE_AUTH_URL_CLIENT: https://auth.staging.manacore.ai - ports: - - "3000:3000" - healthcheck: - test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3000/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s - networks: - - manacore-network - logging: - driver: "json-file" - options: - max-size: "10m" - max-file: "3" - - # ============================================ - # Manacore App - # ============================================ - - manacore-web: - image: ${DOCKER_REGISTRY:-ghcr.io/memo-2023}/manacore-web:${MANACORE_WEB_VERSION:-latest} - container_name: manacore-web-staging - restart: unless-stopped - depends_on: - mana-core-auth: - condition: service_healthy - environment: - NODE_ENV: staging - PORT: 5173 - # Auth URLs - PUBLIC_MANA_CORE_AUTH_URL: http://mana-core-auth:3001 - PUBLIC_MANA_CORE_AUTH_URL_CLIENT: https://auth.staging.manacore.ai - # Backend URLs for dashboard widgets - PUBLIC_TODO_API_URL: http://todo-backend:3018 - PUBLIC_TODO_API_URL_CLIENT: https://todo-api.staging.manacore.ai - PUBLIC_CALENDAR_API_URL: http://calendar-backend:3016 - PUBLIC_CALENDAR_API_URL_CLIENT: https://calendar-api.staging.manacore.ai - PUBLIC_CLOCK_API_URL: http://clock-backend:3017 - PUBLIC_CLOCK_API_URL_CLIENT: https://clock-api.staging.manacore.ai - ports: - - "5173:5173" - healthcheck: - test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:5173/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s - networks: - - manacore-network - logging: - driver: "json-file" - options: - max-size: "10m" - max-file: "3" - - # ============================================ - # Todo App - # ============================================ - - todo-backend: - image: 
${DOCKER_REGISTRY:-ghcr.io/memo-2023}/todo-backend:${TODO_BACKEND_VERSION:-latest} - container_name: todo-backend-staging - restart: unless-stopped - depends_on: - mana-core-auth: - condition: service_healthy - postgres: - condition: service_healthy - environment: - NODE_ENV: staging - PORT: 3018 - DATABASE_URL: postgresql://${POSTGRES_USER:-postgres}:${POSTGRES_PASSWORD}@postgres:5432/todo - MANA_CORE_AUTH_URL: http://mana-core-auth:3001 - ports: - - "3018:3018" - healthcheck: - test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3018/api/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s - networks: - - manacore-network - logging: - driver: "json-file" - options: - max-size: "10m" - max-file: "3" - - todo-web: - image: ${DOCKER_REGISTRY:-ghcr.io/memo-2023}/todo-web:${TODO_WEB_VERSION:-latest} - container_name: todo-web-staging - restart: unless-stopped - depends_on: - todo-backend: - condition: service_healthy - environment: - NODE_ENV: staging - PORT: 5188 - # Server-side URLs (Docker internal network) - PUBLIC_BACKEND_URL: http://todo-backend:3018 - PUBLIC_MANA_CORE_AUTH_URL: http://mana-core-auth:3001 - # Client-side URLs (browser access via public IP) - PUBLIC_BACKEND_URL_CLIENT: http://46.224.108.214:3018 - PUBLIC_MANA_CORE_AUTH_URL_CLIENT: http://46.224.108.214:3001 - ports: - - "5188:5188" - healthcheck: - test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:5188/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s - networks: - - manacore-network - logging: - driver: "json-file" - options: - max-size: "10m" - max-file: "3" - - # ============================================ - # Calendar App - # ============================================ - - calendar-backend: - image: ${DOCKER_REGISTRY:-ghcr.io/memo-2023}/calendar-backend:${CALENDAR_VERSION:-latest} - container_name: calendar-backend-staging - restart: unless-stopped - depends_on: - mana-core-auth: - 
condition: service_healthy - postgres: - condition: service_healthy - environment: - NODE_ENV: staging - PORT: 3016 - DATABASE_URL: postgresql://${POSTGRES_USER:-postgres}:${POSTGRES_PASSWORD}@postgres:5432/calendar - DB_HOST: postgres - DB_PORT: 5432 - DB_USER: ${POSTGRES_USER:-postgres} - MANA_CORE_AUTH_URL: http://mana-core-auth:3001 - CORS_ORIGINS: https://calendar.staging.manacore.ai,https://staging.manacore.ai,http://localhost:5186,http://localhost:5173 - ports: - - "3016:3016" - healthcheck: - test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3016/api/v1/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s - networks: - - manacore-network - logging: - driver: "json-file" - options: - max-size: "10m" - max-file: "3" - - calendar-web: - image: ${DOCKER_REGISTRY:-ghcr.io/memo-2023}/calendar-web:${CALENDAR_WEB_VERSION:-latest} - container_name: calendar-web-staging - restart: unless-stopped - depends_on: - calendar-backend: - condition: service_healthy - environment: - NODE_ENV: staging - PORT: 5186 - PUBLIC_BACKEND_URL: http://calendar-backend:3016 - PUBLIC_MANA_CORE_AUTH_URL: http://mana-core-auth:3001 - PUBLIC_BACKEND_URL_CLIENT: https://calendar-api.staging.manacore.ai - PUBLIC_MANA_CORE_AUTH_URL_CLIENT: https://auth.staging.manacore.ai - ports: - - "5186:5186" - healthcheck: - test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:5186/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s - networks: - - manacore-network - logging: - driver: "json-file" - options: - max-size: "10m" - max-file: "3" - - # ============================================ - # Clock App - # ============================================ - - clock-backend: - image: ${DOCKER_REGISTRY:-ghcr.io/memo-2023}/clock-backend:${CLOCK_VERSION:-latest} - container_name: clock-backend-staging - restart: unless-stopped - depends_on: - mana-core-auth: - condition: service_healthy - postgres: - condition: 
service_healthy - environment: - NODE_ENV: staging - PORT: 3017 - DATABASE_URL: postgresql://${POSTGRES_USER:-postgres}:${POSTGRES_PASSWORD}@postgres:5432/clock - DB_HOST: postgres - DB_PORT: 5432 - DB_USER: ${POSTGRES_USER:-postgres} - MANA_CORE_AUTH_URL: http://mana-core-auth:3001 - CORS_ORIGINS: https://clock.staging.manacore.ai,https://staging.manacore.ai,http://localhost:5187,http://localhost:5173 - ports: - - "3017:3017" - healthcheck: - test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3017/api/v1/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s - networks: - - manacore-network - logging: - driver: "json-file" - options: - max-size: "10m" - max-file: "3" - - clock-web: - image: ${DOCKER_REGISTRY:-ghcr.io/memo-2023}/clock-web:${CLOCK_WEB_VERSION:-latest} - container_name: clock-web-staging - restart: unless-stopped - depends_on: - clock-backend: - condition: service_healthy - environment: - NODE_ENV: staging - PORT: 5187 - PUBLIC_BACKEND_URL: http://clock-backend:3017 - PUBLIC_MANA_CORE_AUTH_URL: http://mana-core-auth:3001 - PUBLIC_BACKEND_URL_CLIENT: https://clock-api.staging.manacore.ai - PUBLIC_MANA_CORE_AUTH_URL_CLIENT: https://auth.staging.manacore.ai - ports: - - "5187:5187" - healthcheck: - test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:5187/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s - networks: - - manacore-network - logging: - driver: "json-file" - options: - max-size: "10m" - max-file: "3" - -# ============================================ -# Networks -# ============================================ - -networks: - manacore-network: - driver: bridge - name: manacore-staging - -# ============================================ -# Volumes -# ============================================ - -volumes: - postgres_data: - name: manacore-postgres-staging - redis_data: - name: manacore-redis-staging diff --git a/docs/CI_CD_SETUP.md 
b/docs/CI_CD_SETUP.md deleted file mode 100644 index 6b182236b..000000000 --- a/docs/CI_CD_SETUP.md +++ /dev/null @@ -1,522 +0,0 @@ -# CI/CD Setup Guide - -Step-by-step guide to configure the CI/CD pipeline for the manacore-monorepo. - -## Quick Start - -1. [Configure GitHub Secrets](#github-secrets) -2. [Set Up Docker Registry](#docker-registry) -3. [Configure Deployment Servers](#deployment-servers) -4. [Enable GitHub Actions](#enable-github-actions) -5. [Test the Pipeline](#test-the-pipeline) - -## GitHub Secrets - -### Navigate to Secrets - -1. Go to your GitHub repository -2. Click `Settings` > `Secrets and variables` > `Actions` -3. Click `New repository secret` - -### Required Secrets - -#### Docker Registry (3 secrets) - -``` -DOCKER_USERNAME=your-docker-hub-username -DOCKER_PASSWORD=your-docker-hub-password-or-token -DOCKER_REGISTRY=wuesteon -``` - -**How to get Docker credentials**: -1. Create account at https://hub.docker.com -2. Go to Account Settings > Security -3. Create Access Token -4. 
Use token as DOCKER_PASSWORD - -#### SSH Keys (2 secrets per environment) - -Generate SSH keys: -```bash -# Generate new key pair -ssh-keygen -t ed25519 -C "github-actions-staging" -f ~/.ssh/github-actions-staging - -# Display private key (copy this to GitHub secret) -cat ~/.ssh/github-actions-staging - -# Display public key (add this to server) -cat ~/.ssh/github-actions-staging.pub -``` - -Add to GitHub: -``` -STAGING_SSH_KEY= -PRODUCTION_SSH_KEY= -``` - -#### Server Access (2 secrets per environment) - -``` -STAGING_HOST=staging.manacore.app -STAGING_USER=deploy -PRODUCTION_HOST=api.manacore.app -PRODUCTION_USER=deploy -``` - -#### Database Configuration (Staging) - -``` -STAGING_POSTGRES_HOST=postgres -STAGING_POSTGRES_PORT=5432 -STAGING_POSTGRES_DB=manacore -STAGING_POSTGRES_USER=postgres -STAGING_POSTGRES_PASSWORD= -``` - -Generate secure password: -```bash -openssl rand -base64 32 -``` - -#### Redis Configuration (Staging) - -``` -STAGING_REDIS_HOST=redis -STAGING_REDIS_PORT=6379 -STAGING_REDIS_PASSWORD= -``` - -#### Supabase Configuration (Staging) - -``` -STAGING_SUPABASE_URL=https://xxxxx.supabase.co -STAGING_SUPABASE_ANON_KEY= -STAGING_SUPABASE_SERVICE_ROLE_KEY= -``` - -**How to get Supabase credentials**: -1. Go to https://supabase.com -2. Open your project -3. Go to Project Settings > API -4. 
Copy `URL`, `anon public`, and `service_role` keys - -#### Azure OpenAI Configuration (Staging) - -``` -STAGING_AZURE_OPENAI_ENDPOINT=https://xxxxx.openai.azure.com -STAGING_AZURE_OPENAI_API_KEY= -STAGING_AZURE_OPENAI_API_VERSION=2024-12-01-preview -``` - -#### JWT Configuration (Staging) - -Generate JWT keys: -```bash -# Generate private key -openssl genrsa -out jwt-private.pem 2048 - -# Extract public key -openssl rsa -in jwt-private.pem -pubout -out jwt-public.pem - -# Generate secret -openssl rand -hex 32 - -# View private key (copy to STAGING_JWT_PRIVATE_KEY) -cat jwt-private.pem - -# View public key (copy to STAGING_JWT_PUBLIC_KEY) -cat jwt-public.pem -``` - -Add to GitHub: -``` -STAGING_JWT_SECRET= -STAGING_JWT_PUBLIC_KEY= -STAGING_JWT_PRIVATE_KEY= -``` - -#### Production Secrets - -Repeat all the above for production with `PRODUCTION_` prefix. - -**Important**: Use different values for production! Never reuse staging credentials. - -#### Optional: Turbo Cache - -For faster builds with remote caching: - -``` -TURBO_TOKEN= -TURBO_TEAM= -``` - -Get these from https://vercel.com - -#### Optional: Code Coverage - -``` -CODECOV_TOKEN= -``` - -Get from https://codecov.io - -## Docker Registry - -### Option 1: Docker Hub (Recommended) - -1. Sign up at https://hub.docker.com -2. Create access token (Account Settings > Security) -3. Add credentials to GitHub secrets -4. Create repository for each service: - - `wuesteon/mana-core-auth` - - `wuesteon/chat-backend` - - `wuesteon/maerchenzauber-backend` - - etc. 
- -### Option 2: GitHub Container Registry - -```yaml -# In .github/workflows/ci-main.yml, change: -- name: Login to Docker Hub - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - -# Change image names to: -ghcr.io/${{ github.repository_owner }}/service-name -``` - -### Option 3: Private Registry - -Update workflows to use your registry URL: -``` -registry: registry.example.com -``` - -## Deployment Servers - -### Server Requirements - -- **OS**: Ubuntu 20.04+ or Debian 11+ -- **RAM**: 4GB minimum, 8GB recommended -- **Storage**: 50GB minimum, 100GB recommended -- **CPU**: 2 cores minimum, 4 cores recommended - -### Server Setup - -#### 1. Create Deploy User - -```bash -# On server -sudo adduser deploy -sudo usermod -aG docker deploy -sudo su - deploy -``` - -#### 2. Install Docker - -```bash -# Update system -sudo apt update && sudo apt upgrade -y - -# Install Docker -curl -fsSL https://get.docker.com -o get-docker.sh -sudo sh get-docker.sh - -# Install Docker Compose -sudo apt install docker-compose-plugin - -# Verify installation -docker --version -docker compose version -``` - -#### 3. Configure SSH Access - -```bash -# On server, as deploy user -mkdir -p ~/.ssh -chmod 700 ~/.ssh - -# Add GitHub Actions public key to authorized_keys -echo "ssh-ed25519 AAAAC3... github-actions-staging" >> ~/.ssh/authorized_keys -chmod 600 ~/.ssh/authorized_keys -``` - -#### 4. Test SSH Access - -```bash -# From your local machine -ssh -i ~/.ssh/github-actions-staging deploy@staging.manacore.app - -# Should login without password prompt -``` - -#### 5. Create Deployment Directories - -```bash -# On server -mkdir -p ~/manacore-staging -mkdir -p ~/manacore-staging/logs -mkdir -p ~/manacore-staging/backups - -# Or for production -mkdir -p ~/manacore-production -mkdir -p ~/manacore-production/logs -mkdir -p ~/manacore-production/backups -``` - -#### 6. 
Configure Firewall - -```bash -# Allow SSH -sudo ufw allow 22/tcp - -# Allow HTTP/HTTPS -sudo ufw allow 80/tcp -sudo ufw allow 443/tcp - -# Allow specific service ports (optional, if not using reverse proxy) -sudo ufw allow 3001/tcp # Mana Core Auth -sudo ufw allow 3002/tcp # Maerchenzauber Backend - -# Enable firewall -sudo ufw enable -``` - -#### 7. Set Up Reverse Proxy (Optional) - -If using Nginx as reverse proxy: - -```bash -sudo apt install nginx - -# Create configuration -sudo nano /etc/nginx/sites-available/manacore -``` - -```nginx -server { - listen 80; - server_name api.manacore.app; - - location /api/v1/ { - proxy_pass http://localhost:3001; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - } - - location /health { - proxy_pass http://localhost:3002; - proxy_set_header Host $host; - } -} -``` - -```bash -# Enable site -sudo ln -s /etc/nginx/sites-available/manacore /etc/nginx/sites-enabled/ -sudo nginx -t -sudo systemctl reload nginx -``` - -## GitHub Environments - -### Create Environments - -1. Go to repository Settings > Environments -2. Create two environments: - - `staging` - - `production-approval` - -### Configure Production Approval - -1. Go to `production-approval` environment -2. Add required reviewers -3. Set wait timer (optional): 5 minutes -4. Add environment secrets (if any differ from repository secrets) - -## Enable GitHub Actions - -### 1. Check Workflow Permissions - -1. Go to Settings > Actions > General -2. Scroll to "Workflow permissions" -3. Select "Read and write permissions" -4. Check "Allow GitHub Actions to create and approve pull requests" -5. Click Save - -### 2. Enable Workflows - -Workflows are automatically enabled when files are pushed to `.github/workflows/` - -### 3. Configure Branch Protection - -1. Go to Settings > Branches -2. 
Add rule for `main` branch: - - ✅ Require status checks to pass - - Select: `All PR Checks Complete` - - ✅ Require branches to be up to date - - ✅ Require conversation resolution - - ✅ Do not allow bypassing - -## Test the Pipeline - -### 1. Test PR Workflow - -```bash -# Create test branch -git checkout -b test/ci-pipeline - -# Make a small change -echo "# CI/CD Test" >> README.md - -# Commit and push -git add README.md -git commit -m "test: verify CI pipeline" -git push origin test/ci-pipeline - -# Create PR on GitHub -# Watch GitHub Actions tab for workflow execution -``` - -**Expected Results**: -- ✅ Detect changed files -- ✅ Format check passes -- ✅ Type check passes -- ✅ Build completes -- ✅ Tests run - -### 2. Test Main Branch Workflow - -```bash -# Merge the PR -# Watch GitHub Actions for: -``` - -**Expected Results**: -- ✅ Full validation passes -- ✅ Docker images built -- ✅ Images pushed to registry -- ✅ Staging deployment triggered - -### 3. Test Staging Deployment - -Check staging server: -```bash -ssh deploy@staging.manacore.app -cd ~/manacore-staging -docker compose ps -``` - -**Expected Results**: -- All services running -- Health checks passing - -### 4. Test Production Deployment - -1. Go to Actions > CD - Production Deployment -2. Click "Run workflow" -3. Select: - - Service: `all` - - Environment: `production` - - Confirm: `deploy` -4. Click "Run workflow" -5. Approve when prompted - -**Expected Results**: -- ✅ Backup created -- ✅ Deployment completes -- ✅ Health checks pass - -## Troubleshooting - -### Workflow Not Triggering - -**Issue**: PR workflow doesn't run - -**Solution**: -- Check workflow file syntax -- Verify branch protection rules -- Check repository permissions - -### Docker Build Fails - -**Issue**: Image build fails in CI - -**Solution**: -```bash -# Test build locally -docker buildx build --file apps/chat/apps/backend/Dockerfile . 
- -# Check for syntax errors -yamllint .github/workflows/ci-main.yml -``` - -### SSH Connection Fails - -**Issue**: Can't connect to server from GitHub Actions - -**Solution**: -1. Verify SSH key is correct -2. Check server firewall -3. Verify user has docker permissions - -```bash -# Test locally -ssh -i ~/.ssh/github-actions-staging deploy@staging.manacore.app 'docker ps' -``` - -### Missing Secrets - -**Issue**: Workflow fails with "secret not found" - -**Solution**: -1. Go to Settings > Secrets -2. Verify secret name matches exactly -3. Check for typos -4. Ensure secret has value - -## Maintenance - -### Rotate SSH Keys - -Every 90 days, rotate SSH keys: - -```bash -# Generate new keys -ssh-keygen -t ed25519 -C "github-actions-$(date +%Y%m)" -f ~/.ssh/github-actions-new - -# Add new public key to server -ssh deploy@staging.manacore.app -echo "ssh-ed25519 NEW_KEY..." >> ~/.ssh/authorized_keys - -# Update GitHub secret with new private key -# Test new key works -# Remove old key from authorized_keys -``` - -### Update Docker Credentials - -Rotate Docker access tokens annually: - -1. Generate new token in Docker Hub -2. Update `DOCKER_PASSWORD` secret -3. Test by triggering workflow - -### Monitor Workflow Usage - -Check Actions usage: -1. Go to Settings > Billing -2. Review Actions minutes used -3. Set spending limits if needed - -## Next Steps - -1. [Read Deployment Guide](DEPLOYMENT.md) -2. Configure monitoring -3. Set up alerts -4. Document runbooks -5. Train team on deployment process diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md index c7665ac68..f5f666fc9 100644 --- a/docs/DEPLOYMENT.md +++ b/docs/DEPLOYMENT.md @@ -1,762 +1,92 @@ # Deployment Guide -This guide covers the complete deployment process for the manacore-monorepo, including CI/CD setup, Docker orchestration, and production deployment strategies. 
- -## Table of Contents - -- [Overview](#overview) -- [Prerequisites](#prerequisites) -- [CI/CD Pipeline](#cicd-pipeline) -- [Docker Setup](#docker-setup) -- [Deployment Environments](#deployment-environments) -- [Deployment Process](#deployment-process) -- [Rollback Procedures](#rollback-procedures) -- [Monitoring and Maintenance](#monitoring-and-maintenance) -- [Troubleshooting](#troubleshooting) - ## Overview -The manacore-monorepo uses a comprehensive CI/CD pipeline with the following features: - -- **Automated Testing**: PR checks, type checking, linting, and format validation -- **Smart Build Detection**: Only builds affected projects using Turborepo filters -- **Docker Orchestration**: Multi-stage builds for all service types -- **Zero-Downtime Deployments**: Rolling updates with health checks -- **Automated Rollbacks**: Emergency rollback procedures -- **Security Scanning**: Dependency audits and vulnerability checks - -### Architecture +Production runs on a **Mac Mini** accessible via Cloudflare Tunnel at **mana.how**. ``` -┌─────────────────┐ -│ GitHub PR │ -└────────┬────────┘ - │ - ▼ -┌─────────────────┐ -│ PR Validation │ ← Lint, Type Check, Build, Test -└────────┬────────┘ - │ - ▼ -┌─────────────────┐ -│ Merge to Main │ -└────────┬────────┘ - │ - ▼ -┌─────────────────┐ -│ Build & Push │ ← Docker images to registry -│ Docker Images │ -└────────┬────────┘ - │ - ▼ -┌─────────────────┐ -│ Deploy Staging │ ← Automatic deployment -└────────┬────────┘ - │ - ▼ -┌─────────────────┐ -│ Manual Approval │ ← Production gate -└────────┬────────┘ - │ - ▼ -┌─────────────────┐ -│Deploy Production│ ← With backup & health checks -└─────────────────┘ +Push to main → CI builds Docker images → GHCR → Watchtower pulls & restarts + (automatic) (automatic, ~5 min) ``` -## Prerequisites +**Watchtower** automatically checks for new Docker images every 5 minutes and updates running containers. 
-### Required Tools +## Quick Reference -- **Docker**: Version 20.10+ -- **Docker Compose**: Version 2.0+ -- **Node.js**: Version 20+ -- **pnpm**: Version 9.15.0 -- **Git**: Version 2.30+ - -### Required Accounts - -- **GitHub**: Repository access and Actions enabled -- **Docker Hub**: For image storage (or alternative registry) -- **Supabase**: For database services -- **Azure**: For OpenAI services -- **Hetzner/Coolify**: For hosting (recommended) - -### GitHub Secrets - -Configure the following secrets in your GitHub repository (`Settings > Secrets and variables > Actions`): - -#### Docker Registry - -``` -DOCKER_USERNAME=your-docker-username -DOCKER_PASSWORD=your-docker-password -DOCKER_REGISTRY=wuesteon -``` - -#### Staging Environment - -``` -STAGING_HOST=staging.manacore.app -STAGING_USER=deploy -STAGING_SSH_KEY= -STAGING_POSTGRES_HOST=postgres -STAGING_POSTGRES_PORT=5432 -STAGING_POSTGRES_DB=manacore -STAGING_POSTGRES_USER=postgres -STAGING_POSTGRES_PASSWORD= -STAGING_REDIS_HOST=redis -STAGING_REDIS_PORT=6379 -STAGING_REDIS_PASSWORD= -STAGING_SUPABASE_URL=https://xxx.supabase.co -STAGING_SUPABASE_ANON_KEY= -STAGING_SUPABASE_SERVICE_ROLE_KEY= -STAGING_AZURE_OPENAI_ENDPOINT=https://xxx.openai.azure.com -STAGING_AZURE_OPENAI_API_KEY= -STAGING_JWT_SECRET= -STAGING_JWT_PUBLIC_KEY= -STAGING_JWT_PRIVATE_KEY= -``` - -#### Production Environment - -``` -PRODUCTION_HOST=api.manacore.app -PRODUCTION_USER=deploy -PRODUCTION_SSH_KEY= -PRODUCTION_API_URL=https://api.manacore.app -# ... (same structure as staging with production values) -``` - -#### Turbo Cache (Optional) - -``` -TURBO_TOKEN= -TURBO_TEAM= -``` - -#### Code Coverage (Optional) - -``` -CODECOV_TOKEN= -``` +| Environment | Location | Domain | +|-------------|----------|--------| +| Local Dev | Your machine | localhost | +| Production | Mac Mini | mana.how | ## CI/CD Pipeline -### Workflow Files +### What happens automatically -The CI/CD pipeline consists of 6 GitHub Actions workflows: +1. 
**Push to main** triggers CI workflow +2. CI detects changed services +3. Docker images are built for changed services +4. Images are pushed to GitHub Container Registry (ghcr.io) -#### 1. PR Validation (`ci-pull-request.yml`) +### What happens automatically (Watchtower) -**Triggers**: Pull requests to `main` or `develop` +Watchtower runs as a Docker container and: +1. Checks GHCR for new images every 5 minutes +2. Pulls updated images +3. Recreates containers with new images +4. Cleans up old images -**Steps**: +No manual action needed for regular deployments. -1. Detect changed projects -2. Run format check -3. Run linting -4. Type checking -5. Build affected projects -6. Run tests with coverage -7. Docker build validation -8. Security scanning +## Manual Deployment (if needed) -**Required Checks**: Format, Type Check, Build - -#### 2. Main Branch CI (`ci-main.yml`) - -**Triggers**: Push to `main` branch - -**Steps**: - -1. Full validation (all projects) -2. Build all projects -3. Build and push Docker images -4. Trigger staging deployment - -#### 3. Staging Deployment (`cd-staging.yml`) - -**Triggers**: Manual or automated from main CI - -**Steps**: - -1. SSH to staging server -2. Pull latest Docker images -3. Update environment configuration -4. Deploy services with zero-downtime -5. Run database migrations -6. Health checks -7. Notify on completion - -#### 4. Production Deployment (`cd-production.yml`) - -**Triggers**: Manual only - -**Steps**: - -1. Validate deployment request -2. Request manual approval -3. Create database backup -4. Deploy with rolling update -5. Run migrations -6. Health checks -7. Monitor for 5 minutes -8. Run smoke tests -9. Notify on completion - -#### 5. Test Coverage (`test-coverage.yml`) - -**Triggers**: PRs, pushes to main, weekly schedule - -**Steps**: - -1. Run all tests with coverage -2. Collect coverage reports -3. Upload to Codecov -4. Generate summary -5. Check coverage thresholds (50% minimum) - -#### 6. 
Dependency Updates (`dependency-update.yml`) - -**Triggers**: Weekly schedule, manual - -**Steps**: - -1. Check for outdated dependencies -2. Run security audit -3. Create issue for critical vulnerabilities -4. Update lock file -5. Create PR with changes - -### Change Detection - -The pipeline uses `dorny/paths-filter` to detect which projects have changed: - -```yaml -filters: - maerchenzauber: - - 'apps/maerchenzauber/**' - - 'packages/**' - chat: - - 'apps/chat/**' - - 'packages/**' - # ... other projects -``` - -Only affected projects are built and tested, saving time and resources. - -## Docker Setup - -### Multi-Stage Builds - -All Dockerfiles use multi-stage builds for optimal image size: - -1. **Builder Stage**: Install dependencies and build -2. **Production Stage**: Copy only production dependencies and built assets - -### Service Types - -#### NestJS Backend - -Template: `docker/templates/Dockerfile.nestjs` - -```dockerfile -FROM node:20-alpine AS builder -# Build with all dependencies - -FROM node:20-alpine AS production -# Production with minimal footprint -``` - -**Key Features**: - -- Non-root user (`nestjs`) -- Health checks -- Resource limits -- Optimized caching - -#### SvelteKit Web - -Template: `docker/templates/Dockerfile.sveltekit` - -**Key Features**: - -- SSR support -- Static asset optimization -- Non-root user -- Health endpoints - -#### Astro Landing Pages - -Template: `docker/templates/Dockerfile.astro` - -**Key Features**: - -- Nginx-based serving -- Gzip compression -- Security headers -- Static file caching - -### Docker Compose - -Two environments are provided: - -#### Staging (`docker-compose.staging.yml`) - -- Includes PostgreSQL and Redis -- Service discovery via Docker network -- Local development configuration -- Verbose logging - -#### Production (`docker-compose.production.yml`) - -- External database connections -- Resource limits -- Optimized logging -- Security hardening - -## Deployment Environments - -### Staging - 
-**Purpose**: Pre-production testing and validation - -**URL**: `https://staging.manacore.app` - -**Characteristics**: - -- Automatic deployment from `main` branch -- Separate database instances -- Full feature parity with production -- Verbose logging enabled - -**Access**: +For immediate deployment without waiting for Watchtower: ```bash -ssh deploy@staging.manacore.app -cd ~/manacore-staging -docker compose ps +ssh mana-server "cd ~/projects/manacore-monorepo && ./scripts/mac-mini/deploy.sh" ``` -### Production - -**Purpose**: Live production environment - -**URL**: `https://api.manacore.app` - -**Characteristics**: - -- Manual deployment with approval -- High availability configuration -- Performance optimized -- Enhanced monitoring -- Backup procedures - -**Access**: +## Monitoring ```bash -ssh deploy@api.manacore.app -cd ~/manacore-production -docker compose ps -``` - -## Deployment Process - -### Automated Staging Deployment - -Staging deployment happens automatically when code is merged to `main`: - -```bash -# 1. Create PR -git checkout -b feature/my-feature -git push origin feature/my-feature - -# 2. PR Validation runs automatically -# - Checks pass - -# 3. Merge to main -# - Main CI builds Docker images -# - Pushes to registry -# - Triggers staging deployment - -# 4. Staging deployment -# - Pulls latest images -# - Rolling update -# - Health checks -# - Success! -``` - -### Manual Production Deployment - -Production requires manual trigger and approval: - -#### Step 1: Trigger Deployment - -Go to GitHub Actions > CD - Production Deployment > Run workflow - -**Required Inputs**: - -- Service: `all` or specific service name -- Environment: `production` -- Confirm: Type `deploy` - -#### Step 2: Approval - -Workflow pauses for manual approval at `production-approval` environment. - -Approve in: GitHub > Settings > Environments > production-approval - -#### Step 3: Automated Deployment - -Once approved: - -1. Creates database backup -2. 
Tags current deployment -3. Pulls latest images -4. Runs migrations -5. Rolling update (zero-downtime) -6. Health checks -7. 5-minute monitoring -8. Smoke tests - -#### Step 4: Verification - -```bash -# Check deployment status -./scripts/deploy/health-check.sh production +# Check service status +ssh mana-server "./scripts/mac-mini/status.sh" # View logs -ssh deploy@api.manacore.app -cd ~/manacore-production -docker compose logs -f +ssh mana-server "docker logs -f manacore-chat-backend" + +# Health check +ssh mana-server "./scripts/mac-mini/health-check.sh" ``` -### Manual Deployment Scripts +## Services & URLs -For manual deployments or troubleshooting: +| Service | URL | Container | +|---------|-----|-----------| +| Dashboard | https://mana.how | manacore-web | +| Auth API | https://auth.mana.how | mana-core-auth | +| Chat | https://chat.mana.how | chat-web | +| Chat API | https://chat-api.mana.how | chat-backend | +| Todo | https://todo.mana.how | todo-web | +| Todo API | https://todo-api.mana.how | todo-backend | +| Calendar | https://calendar.mana.how | calendar-web | +| Calendar API | https://calendar-api.mana.how | calendar-backend | +| Clock | https://clock.mana.how | clock-web | +| Clock API | https://clock-api.mana.how | clock-backend | +| Contacts | https://contacts.mana.how | contacts-web | +| Contacts API | https://contacts-api.mana.how | contacts-backend | -#### Build and Push Images +## Rollback ```bash -# Build all services -./scripts/deploy/build-and-push.sh all latest +ssh mana-server +cd ~/projects/manacore-monorepo -# Build specific service -./scripts/deploy/build-and-push.sh chat-backend v1.2.3 +# Rollback to specific image tag +docker compose -f docker-compose.macmini.yml pull <service>:<tag> +docker compose -f docker-compose.macmini.yml up -d ``` -#### Deploy to Server +## Detailed Documentation -```bash -# Deploy to staging -export STAGING_HOST=staging.manacore.app -export STAGING_USER=deploy -./scripts/deploy/deploy-hetzner.sh staging all - -# Deploy to 
production -export PRODUCTION_HOST=api.manacore.app -export PRODUCTION_USER=deploy -./scripts/deploy/deploy-hetzner.sh production all -``` - -#### Health Checks - -```bash -# Check staging -./scripts/deploy/health-check.sh staging - -# Check production -./scripts/deploy/health-check.sh production -``` - -#### Database Migrations - -```bash -# Run migrations for specific project -./scripts/deploy/migrate-db.sh chat staging -./scripts/deploy/migrate-db.sh mana-core-auth production -``` - -## Rollback Procedures - -### Automated Rollback (Recommended) - -```bash -# Rollback staging -./scripts/deploy/rollback.sh staging all - -# Rollback production (specific service) -./scripts/deploy/rollback.sh production chat-backend -``` - -**What the script does**: - -1. Confirms rollback with user -2. Checks for previous deployment backup -3. Stops current services -4. Restores previous docker-compose configuration -5. Restores database (if applicable) -6. Starts services with previous version -7. Runs health checks -8. 
Reports status - -### Manual Rollback - -If automated rollback fails: - -```bash -# SSH to server -ssh deploy@api.manacore.app -cd ~/manacore-production - -# List available backups -ls -lt backups/ - -# Choose backup -BACKUP_DIR=backups/20250127_120000 - -# Restore configuration -cp $BACKUP_DIR/docker-compose.yml ./docker-compose.yml -cp $BACKUP_DIR/.env.backup ./.env - -# Restore database (if needed) -docker compose exec -T postgres psql -U postgres < $BACKUP_DIR/postgres_backup.sql - -# Restart services -docker compose up -d - -# Check status -docker compose ps -``` - -## Monitoring and Maintenance - -### Log Management - -```bash -# View logs for all services -docker compose logs -f - -# View logs for specific service -docker compose logs -f mana-core-auth - -# View last 100 lines -docker compose logs --tail=100 chat-backend - -# Search logs -docker compose logs | grep ERROR -``` - -### Resource Monitoring - -```bash -# Check container resources -docker stats - -# Check disk usage -docker system df - -# Cleanup unused resources -docker system prune -a -``` - -### Database Backups - -Automated backups are created before each production deployment. - -**Manual backup**: - -```bash -# Create backup -TIMESTAMP=$(date +%Y%m%d_%H%M%S) -docker compose exec -T postgres pg_dumpall -U postgres > backup_$TIMESTAMP.sql - -# Restore from backup -docker compose exec -T postgres psql -U postgres < backup_20250127.sql -``` - -### Health Monitoring - -Set up external monitoring tools to ping health endpoints: - -- Mana Core Auth: `https://api.manacore.app/api/v1/health` -- Maerchenzauber: `https://api.manacore.app/health` -- Chat Backend: `https://api.manacore.app/api/health` - -Recommended tools: - -- UptimeRobot -- Pingdom -- Better Uptime -- Datadog - -## Troubleshooting - -### Deployment Fails - -**Issue**: Deployment workflow fails - -**Solutions**: - -1. Check workflow logs in GitHub Actions -2. Verify all required secrets are set -3. Ensure SSH access to server works -4. 
Check Docker registry credentials - -```bash -# Test SSH access -ssh deploy@staging.manacore.app 'echo "SSH works"' - -# Test Docker login -echo $DOCKER_PASSWORD | docker login -u $DOCKER_USERNAME --password-stdin -``` - -### Health Checks Fail - -**Issue**: Service fails health checks after deployment - -**Solutions**: - -1. Check service logs -2. Verify environment variables -3. Check database connectivity -4. Verify port mappings - -```bash -# Check service logs -docker compose logs --tail=200 mana-core-auth - -# Test health endpoint directly -docker compose exec mana-core-auth wget -O - http://localhost:3001/api/v1/health - -# Check environment -docker compose exec mana-core-auth env | grep -v PASSWORD -``` - -### Database Connection Issues - -**Issue**: Services can't connect to database - -**Solutions**: - -1. Verify database is running -2. Check connection strings -3. Verify credentials -4. Check network connectivity - -```bash -# Check database status -docker compose exec postgres psql -U postgres -c '\l' - -# Test connection from service -docker compose exec mana-core-auth nc -zv postgres 5432 -``` - -### Image Build Failures - -**Issue**: Docker build fails in CI - -**Solutions**: - -1. Check Dockerfile syntax -2. Verify all COPY paths exist -3. Check for build dependency issues -4. Review build logs - -```bash -# Test build locally -docker buildx build --file apps/chat/apps/backend/Dockerfile . - -# Build with verbose output -docker buildx build --progress=plain --file apps/chat/apps/backend/Dockerfile . 
-``` - -### Out of Disk Space - -**Issue**: Server runs out of disk space - -**Solutions**: - -```bash -# Check disk usage -df -h - -# Clean Docker resources -docker system prune -a --volumes - -# Remove old images -docker image prune -a --filter "until=72h" - -# Remove old backups -cd ~/manacore-production/backups -ls -t | tail -n +10 | xargs rm -rf -``` - -### Services Not Starting - -**Issue**: Docker Compose services fail to start - -**Solutions**: - -```bash -# Check service dependencies -docker compose config - -# Start services one by one -docker compose up -d postgres -docker compose up -d redis -docker compose up -d mana-core-auth - -# Check startup logs -docker compose logs --tail=100 --follow -``` - -## Best Practices - -### 1. Always Test in Staging First - -Never deploy directly to production without testing in staging. - -### 2. Use Tagged Releases - -Tag important releases: - -```bash -git tag -a v1.2.3 -m "Release version 1.2.3" -git push origin v1.2.3 -``` - -### 3. Monitor After Deployment - -Watch logs and metrics for at least 30 minutes after production deployment. - -### 4. Communicate Deployments - -Notify team before production deployments, especially during business hours. - -### 5. Keep Backups - -Always verify backups are created before production deployments. - -### 6. Document Changes - -Update CHANGELOG.md with notable changes for each deployment. - -### 7. Security - -- Rotate secrets regularly -- Keep dependencies updated -- Review security audit reports -- Use least-privilege access - -## Support - -For deployment issues or questions: - -1. Check this documentation -2. Review GitHub Actions logs -3. Check service logs on server -4. 
Contact DevOps team - -**Emergency Contact**: DevOps on-call rotation +- **[MAC_MINI_SERVER.md](MAC_MINI_SERVER.md)** - Complete server setup, autostart, health checks +- **[LOCAL_DEVELOPMENT.md](LOCAL_DEVELOPMENT.md)** - Local development setup diff --git a/docs/DEPLOYMENT_ARCHITECTURE.md b/docs/DEPLOYMENT_ARCHITECTURE.md deleted file mode 100644 index 19745ce03..000000000 --- a/docs/DEPLOYMENT_ARCHITECTURE.md +++ /dev/null @@ -1,2816 +0,0 @@ -# Manacore Monorepo - Deployment Architecture - -**Version:** 1.0 -**Date:** 2025-11-27 -**Author:** Hive Mind Swarm Analyst - ---- - -## Table of Contents - -1. [Executive Summary](#executive-summary) -2. [System Inventory](#system-inventory) -3. [Container Architecture](#container-architecture) -4. [Service Orchestration](#service-orchestration) -5. [Deployment Topology](#deployment-topology) -6. [Data Architecture](#data-architecture) -7. [Network Architecture](#network-architecture) -8. [Environment Configuration Matrix](#environment-configuration-matrix) -9. [Monitoring & Observability](#monitoring--observability) -10. [CI/CD Pipeline](#cicd-pipeline) -11. [Disaster Recovery](#disaster-recovery) -12. 
[Security Hardening](#security-hardening) - ---- - -## Executive Summary - -The manacore-monorepo contains **10 product projects** with **37 deployable services** across multiple technology stacks: - -- **10 NestJS backend APIs** (Node.js microservices) -- **9 SvelteKit web applications** (SSR/SSG) -- **9 Astro landing pages** (static sites) -- **8 Expo mobile apps** (served via CDN for OTA updates) -- **1 Central authentication service** (mana-core-auth) - -**Key Architectural Decisions:** - -- **Per-project container isolation** for independent scaling -- **Shared infrastructure** for databases (PostgreSQL) and caching (Redis) -- **Multi-stage Docker builds** optimized for pnpm workspace monorepo -- **Blue-green deployment** strategy with zero-downtime rollbacks -- **Docker Compose orchestration** with GitHub Container Registry -- **CDN-first static assets** (Astro landing pages, mobile OTA bundles) - ---- - -## System Inventory - -### Complete Service Matrix - -| Project | Backend (NestJS) | Web (SvelteKit) | Landing (Astro) | Mobile (Expo) | Port Range | -|---------|------------------|-----------------|-----------------|---------------|------------| -| **mana-core-auth** | ✅ 3001 | ❌ | ❌ | ❌ | 3001 | -| **chat** | ✅ 3002 | ✅ | ✅ | ✅ | 3002-3005 | -| **maerchenzauber** | ✅ 3003 | ✅ | ✅ | ✅ | 3010-3013 | -| **manadeck** | ✅ 3004 | ✅ | ✅ | ✅ | 3020-3023 | -| **memoro** | ❌ | ✅ | ✅ | ✅ | 3030-3032 | -| **manacore** | ❌ | ✅ | ✅ | ✅ | 3040-3042 | -| **picture** | ✅ 3005 | ✅ | ✅ | ✅ | 3050-3053 | -| **uload** | ✅ 3006 | ✅ | ✅ | ❌ | 3060-3062 | -| **nutriphi** | ✅ 3007 | ✅ | ✅ | ✅ | 3070-3073 | -| **news** | ✅ 3008 (api) | ✅ | ✅ | ❌ | 3080-3082 | - -**Total Deployable Services:** 37 containers + 2 shared infrastructure (PostgreSQL, Redis) - -### Technology Stack Breakdown - -#### Backend (NestJS) - 10 services -- **Node.js:** 20 LTS -- **Framework:** NestJS 10-11 -- **Database:** Drizzle ORM + PostgreSQL -- **Runtime:** Node.js process (no PM2 needed in containers) - 
-#### Web (SvelteKit) - 9 services -- **Node.js:** 20 LTS -- **Framework:** SvelteKit 2.x + Svelte 5 (runes mode) -- **Adapter:** `@sveltejs/adapter-node` for Docker or `@sveltejs/adapter-netlify` for Netlify -- **Build output:** SSR Node server - -#### Landing (Astro) - 9 services -- **Framework:** Astro 5.x -- **Build output:** Static files (HTML/CSS/JS) -- **Deployment:** CDN (Cloudflare, Netlify, Vercel) or Nginx container - -#### Mobile (Expo) - 8 services -- **Framework:** React Native + Expo SDK 52-54 -- **Deployment:** - - **OTA Updates:** EAS Update (served from CDN) - - **Binaries:** App Store / Google Play Store - - **Dev:** Expo Go or custom dev client - -### Shared Packages (19 packages) - -All shared packages must be built before deployment: - -``` -packages/shared-auth -packages/shared-auth-ui -packages/shared-branding -packages/shared-errors -packages/shared-i18n -packages/shared-supabase -packages/shared-types -packages/shared-utils -... (19 total) -``` - ---- - -## Container Architecture - -### 1. Dockerfile Strategy - -#### 1.1 NestJS Backend Template - -**File:** `docker/templates/Dockerfile.nestjs` - -```dockerfile -# ============================================================================= -# Multi-stage Dockerfile for NestJS Backend (Monorepo-optimized) -# Build from monorepo root with context=. 
-# ============================================================================= - -# ----------------------------------------------------------------------------- -# Stage 1: Base - Install pnpm and prepare workspace -# ----------------------------------------------------------------------------- -FROM node:20-alpine AS base - -# Enable corepack for pnpm -RUN corepack enable && corepack prepare pnpm@9.15.0 --activate - -WORKDIR /app - -# Copy workspace configuration -COPY pnpm-workspace.yaml package.json pnpm-lock.yaml ./ - -# ----------------------------------------------------------------------------- -# Stage 2: Dependencies - Install all dependencies -# ----------------------------------------------------------------------------- -FROM base AS dependencies - -# Copy all package.json files (for dependency resolution) -COPY packages/*/package.json ./packages/ -COPY apps/*/apps/*/package.json ./apps/ -COPY services/*/package.json ./services/ - -# Install all dependencies (frozen lockfile for reproducibility) -RUN pnpm install --frozen-lockfile --filter=@PROJECT/backend... 
- -# ----------------------------------------------------------------------------- -# Stage 3: Builder - Build shared packages and backend -# ----------------------------------------------------------------------------- -FROM dependencies AS builder - -# Copy source code for shared packages -COPY packages/ ./packages/ - -# Build shared packages (Turborepo cache) -RUN pnpm --filter '@manacore/shared-*' build - -# Copy backend source -ARG PROJECT_PATH -COPY ${PROJECT_PATH} ./${PROJECT_PATH} - -# Build backend -WORKDIR /app/${PROJECT_PATH} -RUN pnpm build - -# ----------------------------------------------------------------------------- -# Stage 4: Production - Minimal runtime image -# ----------------------------------------------------------------------------- -FROM node:20-alpine AS production - -# Security: Non-root user -RUN addgroup -g 1001 nodejs && adduser -u 1001 -G nodejs -s /bin/sh -D nodejs - -# Install runtime dependencies only (for health checks, migrations) -RUN apk add --no-cache postgresql-client wget - -WORKDIR /app - -# Copy built artifacts -COPY --from=builder --chown=nodejs:nodejs /app/node_modules ./node_modules -COPY --from=builder --chown=nodejs:nodejs /app/packages ./packages -COPY --from=builder --chown=nodejs:nodejs /app/${PROJECT_PATH}/dist ./dist -COPY --from=builder --chown=nodejs:nodejs /app/${PROJECT_PATH}/package.json ./ - -# Environment -ENV NODE_ENV=production -ENV PORT=3000 - -# Health check -HEALTHCHECK --interval=30s --timeout=10s --start-period=10s --retries=3 \ - CMD wget --no-verbose --tries=1 --spider http://localhost:${PORT}/api/health || exit 1 - -# Switch to non-root user -USER nodejs - -EXPOSE ${PORT} - -# Start server -CMD ["node", "dist/main.js"] -``` - -**Build Arguments:** -- `PROJECT_PATH`: e.g., `apps/chat/apps/backend` -- `PORT`: Service port (default: 3000) - -**Example Build:** -```bash -docker build \ - --build-arg PROJECT_PATH=apps/chat/apps/backend \ - --build-arg PORT=3002 \ - -t chat-backend:latest \ - -f 
docker/templates/Dockerfile.nestjs \ - . -``` - ---- - -#### 1.2 SvelteKit Web Template - -**File:** `docker/templates/Dockerfile.sveltekit` - -```dockerfile -# ============================================================================= -# Multi-stage Dockerfile for SvelteKit Web App (Monorepo-optimized) -# Build from monorepo root with context=. -# ============================================================================= - -# ----------------------------------------------------------------------------- -# Stage 1: Base - Install pnpm and prepare workspace -# ----------------------------------------------------------------------------- -FROM node:20-alpine AS base - -RUN corepack enable && corepack prepare pnpm@9.15.0 --activate - -WORKDIR /app - -COPY pnpm-workspace.yaml package.json pnpm-lock.yaml ./ - -# ----------------------------------------------------------------------------- -# Stage 2: Dependencies -# ----------------------------------------------------------------------------- -FROM base AS dependencies - -COPY packages/*/package.json ./packages/ -COPY apps/*/apps/*/package.json ./apps/ - -ARG PROJECT_PATH -RUN pnpm install --frozen-lockfile --filter=${PROJECT_PATH}... 
- -# ----------------------------------------------------------------------------- -# Stage 3: Builder -# ----------------------------------------------------------------------------- -FROM dependencies AS builder - -# Copy shared packages source -COPY packages/ ./packages/ - -# Build shared packages -RUN pnpm --filter '@manacore/shared-*' build - -# Copy web app source -ARG PROJECT_PATH -COPY ${PROJECT_PATH} ./${PROJECT_PATH} - -WORKDIR /app/${PROJECT_PATH} - -# Build SvelteKit app (adapter-node output) -RUN pnpm build - -# ----------------------------------------------------------------------------- -# Stage 4: Production -# ----------------------------------------------------------------------------- -FROM node:20-alpine AS production - -RUN addgroup -g 1001 nodejs && adduser -u 1001 -G nodejs -s /bin/sh -D nodejs - -WORKDIR /app - -ARG PROJECT_PATH -COPY --from=builder --chown=nodejs:nodejs /app/${PROJECT_PATH}/build ./build -COPY --from=builder --chown=nodejs:nodejs /app/${PROJECT_PATH}/package.json ./ -COPY --from=builder --chown=nodejs:nodejs /app/node_modules ./node_modules - -ENV NODE_ENV=production -ENV PORT=3000 -ENV HOST=0.0.0.0 - -HEALTHCHECK --interval=30s --timeout=5s --start-period=5s --retries=3 \ - CMD wget --no-verbose --tries=1 --spider http://localhost:${PORT}/api/health || exit 1 - -USER nodejs - -EXPOSE ${PORT} - -CMD ["node", "build"] -``` - -**Notes:** -- Requires `@sveltejs/adapter-node` in `svelte.config.js` -- Replace Netlify adapter with Node adapter for Docker deployment - ---- - -#### 1.3 Astro Landing Page Template - -**File:** `docker/templates/Dockerfile.astro` - -```dockerfile -# ============================================================================= -# Multi-stage Dockerfile for Astro Landing Page (Static Site) -# Serves via Nginx for production -# ============================================================================= - -# ----------------------------------------------------------------------------- -# Stage 1: 
Builder -# ----------------------------------------------------------------------------- -FROM node:20-alpine AS builder - -RUN corepack enable && corepack prepare pnpm@9.15.0 --activate - -WORKDIR /app - -COPY pnpm-workspace.yaml package.json pnpm-lock.yaml ./ -COPY packages/*/package.json ./packages/ -COPY apps/*/apps/*/package.json ./apps/ - -ARG PROJECT_PATH -RUN pnpm install --frozen-lockfile --filter=${PROJECT_PATH}... - -COPY packages/ ./packages/ -RUN pnpm --filter '@manacore/shared-landing-ui' build - -COPY ${PROJECT_PATH} ./${PROJECT_PATH} - -WORKDIR /app/${PROJECT_PATH} -RUN pnpm build - -# ----------------------------------------------------------------------------- -# Stage 2: Nginx Server -# ----------------------------------------------------------------------------- -FROM nginx:1.25-alpine AS production - -# Copy built static files -ARG PROJECT_PATH -COPY --from=builder /app/${PROJECT_PATH}/dist /usr/share/nginx/html - -# Copy custom Nginx config (optional) -COPY docker/templates/nginx.conf /etc/nginx/nginx.conf - -# Health check -HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \ - CMD wget --no-verbose --tries=1 --spider http://localhost:80/health || exit 1 - -EXPOSE 80 - -CMD ["nginx", "-g", "daemon off;"] -``` - -**Nginx Configuration:** - -```nginx -# docker/templates/nginx.conf -worker_processes auto; -events { worker_connections 1024; } - -http { - include /etc/nginx/mime.types; - default_type application/octet-stream; - - sendfile on; - tcp_nopush on; - tcp_nodelay on; - keepalive_timeout 65; - gzip on; - gzip_types text/plain text/css application/json application/javascript text/xml application/xml; - - server { - listen 80; - server_name _; - root /usr/share/nginx/html; - index index.html; - - # Cache static assets - location ~* \.(js|css|png|jpg|jpeg|gif|ico|svg|woff|woff2|ttf|eot)$ { - expires 1y; - add_header Cache-Control "public, immutable"; - } - - # SPA fallback - location / { - try_files $uri $uri/ /index.html; 
- } - - # Health check endpoint - location /health { - return 200 "OK"; - add_header Content-Type text/plain; - } - } -} -``` - ---- - -### 2. Base Image Selection - -| App Type | Base Image | Size | Rationale | -|----------|------------|------|-----------| -| **NestJS** | `node:20-alpine` | ~120MB | Minimal footprint, security updates | -| **SvelteKit** | `node:20-alpine` | ~120MB | Same as NestJS | -| **Astro** | `nginx:1.25-alpine` | ~40MB | Static files, ultra-fast | -| **PostgreSQL** | `postgres:16-alpine` | ~230MB | Official, stable | -| **Redis** | `redis:7-alpine` | ~40MB | Official, minimal | - -**Why Alpine Linux:** -- 5x smaller than Debian-based images -- Fewer attack vectors (minimal packages) -- Faster pull times -- Security-hardened by default - ---- - -### 3. Layer Caching Strategy - -**Key Optimization:** Leverage Docker layer cache + pnpm's efficient workspace handling. - -**Cache Layers (in order):** - -1. **OS & System Packages** (changes rarely) - ```dockerfile - FROM node:20-alpine - RUN corepack enable && corepack prepare pnpm@9.15.0 --activate - ``` - -2. **Workspace Configuration** (changes when adding/removing packages) - ```dockerfile - COPY pnpm-workspace.yaml package.json pnpm-lock.yaml ./ - ``` - -3. **Package Manifests** (changes when dependencies update) - ```dockerfile - COPY packages/*/package.json ./packages/ - COPY apps/*/apps/*/package.json ./apps/ - ``` - -4. **Dependency Installation** (cache hit ~80% of builds) - ```dockerfile - RUN pnpm install --frozen-lockfile - ``` - -5. **Source Code** (changes every build) - ```dockerfile - COPY packages/ ./packages/ - COPY apps/chat/apps/backend ./apps/chat/apps/backend - ``` - -**Build Time Optimization:** -- **Without cache:** ~10-15 minutes (full dependency install) -- **With cache:** ~2-3 minutes (only rebuild changed layers) - ---- - -### 4. 
Security Hardening - -#### Non-Root User Execution - -All containers run as unprivileged user (UID 1001): - -```dockerfile -RUN addgroup -g 1001 nodejs && adduser -u 1001 -G nodejs -s /bin/sh -D nodejs -USER nodejs -``` - -#### Read-Only Root Filesystem - -```yaml -# docker-compose.yml -security_opt: - - no-new-privileges:true -read_only: true -tmpfs: - - /tmp - - /app/.cache -``` - -#### Minimal Runtime Dependencies - -```dockerfile -# Only install essential tools -RUN apk add --no-cache postgresql-client wget -``` - -#### Vulnerability Scanning - -```bash -# Scan images with Trivy -trivy image chat-backend:latest --severity HIGH,CRITICAL -``` - ---- - -## Service Orchestration - -### 1. Docker Compose for Local Development - -**File:** `docker-compose.dev.yml` (already exists, enhance it) - -```yaml -# Enhanced Development Docker Compose -version: '3.9' - -services: - # ============================================================================ - # Shared Infrastructure - # ============================================================================ - - postgres: - image: postgres:16-alpine - container_name: manacore-postgres - restart: unless-stopped - environment: - POSTGRES_DB: manacore - POSTGRES_USER: ${POSTGRES_USER:-manacore} - POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-devpassword} - volumes: - - postgres-data:/var/lib/postgresql/data - - ./docker/init-db:/docker-entrypoint-initdb.d:ro - ports: - - "5432:5432" - networks: - - manacore-network - healthcheck: - test: ["CMD-SHELL", "pg_isready -U manacore"] - interval: 10s - timeout: 5s - retries: 5 - - redis: - image: redis:7-alpine - container_name: manacore-redis - restart: unless-stopped - command: redis-server --requirepass ${REDIS_PASSWORD:-devpassword} --maxmemory 256mb --maxmemory-policy allkeys-lru - volumes: - - redis-data:/data - ports: - - "6379:6379" - networks: - - manacore-network - healthcheck: - test: ["CMD", "redis-cli", "-a", "${REDIS_PASSWORD:-devpassword}", "ping"] - interval: 10s - 
timeout: 5s - retries: 3 - - # ============================================================================ - # Mana Core Auth Service - # ============================================================================ - - mana-core-auth: - profiles: ["auth", "all"] - build: - context: . - dockerfile: ./services/mana-core-auth/Dockerfile - container_name: manacore-auth - restart: unless-stopped - environment: - NODE_ENV: development - PORT: 3001 - DATABASE_URL: postgresql://manacore:devpassword@postgres:5432/manacore - REDIS_HOST: redis - REDIS_PORT: 6379 - REDIS_PASSWORD: ${REDIS_PASSWORD:-devpassword} - JWT_PUBLIC_KEY: ${JWT_PUBLIC_KEY} - JWT_PRIVATE_KEY: ${JWT_PRIVATE_KEY} - depends_on: - postgres: - condition: service_healthy - redis: - condition: service_healthy - ports: - - "3001:3001" - networks: - - manacore-network - labels: - - "com.manacore.service=auth" - - "com.manacore.tier=infrastructure" - - # ============================================================================ - # Project Backends (NestJS) - # ============================================================================ - - chat-backend: - profiles: ["chat", "all"] - build: - context: . - dockerfile: ./apps/chat/apps/backend/Dockerfile - container_name: chat-backend - restart: unless-stopped - environment: - NODE_ENV: development - PORT: 3002 - DATABASE_URL: postgresql://manacore:devpassword@postgres:5432/chat - AZURE_OPENAI_ENDPOINT: ${AZURE_OPENAI_ENDPOINT} - AZURE_OPENAI_API_KEY: ${AZURE_OPENAI_API_KEY} - MANA_CORE_AUTH_URL: http://mana-core-auth:3001 - depends_on: - postgres: - condition: service_healthy - mana-core-auth: - condition: service_started - ports: - - "3002:3002" - networks: - - manacore-network - labels: - - "com.manacore.project=chat" - - "com.manacore.service=backend" - - maerchenzauber-backend: - profiles: ["maerchenzauber", "all"] - build: - context: . 
- dockerfile: ./apps/maerchenzauber/apps/backend/Dockerfile - container_name: maerchenzauber-backend - restart: unless-stopped - environment: - NODE_ENV: development - PORT: 3003 - DATABASE_URL: postgresql://manacore:devpassword@postgres:5432/maerchenzauber - SUPABASE_URL: ${MAERCHENZAUBER_SUPABASE_URL} - SUPABASE_ANON_KEY: ${MAERCHENZAUBER_SUPABASE_ANON_KEY} - depends_on: - postgres: - condition: service_healthy - ports: - - "3003:3003" - networks: - - manacore-network - labels: - - "com.manacore.project=maerchenzauber" - - "com.manacore.service=backend" - - # ============================================================================ - # Web Apps (SvelteKit) - Behind Traefik Reverse Proxy - # ============================================================================ - - chat-web: - profiles: ["chat", "all"] - build: - context: . - dockerfile: docker/templates/Dockerfile.sveltekit - args: - PROJECT_PATH: apps/chat/apps/web - container_name: chat-web - restart: unless-stopped - environment: - NODE_ENV: production - PORT: 3000 - PUBLIC_BACKEND_URL: http://chat-backend:3002 - ports: - - "3100:3000" - networks: - - manacore-network - labels: - - "com.manacore.project=chat" - - "com.manacore.service=web" - - "traefik.enable=true" - - "traefik.http.routers.chat-web.rule=Host(`chat.localhost`)" - - # ============================================================================ - # Landing Pages (Astro) - Nginx Static - # ============================================================================ - - chat-landing: - profiles: ["chat", "all"] - build: - context: . 
- dockerfile: docker/templates/Dockerfile.astro - args: - PROJECT_PATH: apps/chat/apps/landing - container_name: chat-landing - restart: unless-stopped - ports: - - "3200:80" - networks: - - manacore-network - labels: - - "com.manacore.project=chat" - - "com.manacore.service=landing" - - # ============================================================================ - # Reverse Proxy (Optional for local dev) - # ============================================================================ - - traefik: - profiles: ["proxy", "all"] - image: traefik:v2.11 - container_name: manacore-traefik - command: - - "--api.insecure=true" - - "--providers.docker=true" - - "--providers.docker.exposedbydefault=false" - - "--entrypoints.web.address=:80" - ports: - - "80:80" - - "8080:8080" # Traefik dashboard - volumes: - - /var/run/docker.sock:/var/run/docker.sock:ro - networks: - - manacore-network - -networks: - manacore-network: - driver: bridge - -volumes: - postgres-data: - redis-data: -``` - -**Usage:** - -```bash -# Start only infrastructure (PostgreSQL + Redis) -pnpm docker:up - -# Start auth service -pnpm docker:up:auth - -# Start specific project (chat) -docker compose --profile chat up -d - -# Start everything -pnpm docker:up:all - -# View logs -pnpm docker:logs:chat - -# Stop all -pnpm docker:down -``` - ---- - -### 2. 
Production Orchestration (Coolify) - -**Coolify Configuration:** `.coolify/docker-compose.prod.yml` - -```yaml -version: '3.9' - -# Production Docker Compose for Coolify Deployment -# Coolify will handle: -# - Automatic SSL (Let's Encrypt) -# - Health check monitoring -# - Auto-restart on failure -# - Log aggregation -# - Resource limits - -services: - chat-backend: - image: ${DOCKER_REGISTRY}/chat-backend:${VERSION} - restart: always - environment: - NODE_ENV: production - PORT: 3002 - DATABASE_URL: ${CHAT_DATABASE_URL} - AZURE_OPENAI_ENDPOINT: ${AZURE_OPENAI_ENDPOINT} - AZURE_OPENAI_API_KEY: ${AZURE_OPENAI_API_KEY} - deploy: - resources: - limits: - cpus: '1.0' - memory: 512M - reservations: - cpus: '0.5' - memory: 256M - healthcheck: - test: ["CMD", "wget", "--spider", "-q", "http://localhost:3002/api/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s - labels: - - "coolify.managed=true" - - "coolify.project=chat" - - "coolify.service=backend" - - "coolify.port=3002" - - "coolify.domain=api-chat.manacore.app" -``` - -**Coolify Deployment Strategy:** - -1. **Per-project services**: Each project (chat, maerchenzauber, etc.) deployed as separate Coolify application -2. **Resource pools**: Shared PostgreSQL and Redis as Coolify resources -3. **Auto-scaling**: Configure horizontal scaling based on CPU/memory -4. **Blue-green deployments**: Coolify's native zero-downtime deployment - ---- - -### 3. 
Kubernetes (Future-Proof Option) - -**File:** `k8s/base/deployment.yaml` (template) - -```yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: chat-backend - namespace: manacore - labels: - app: chat - component: backend - tier: api -spec: - replicas: 2 - strategy: - type: RollingUpdate - rollingUpdate: - maxSurge: 1 - maxUnavailable: 0 - selector: - matchLabels: - app: chat - component: backend - template: - metadata: - labels: - app: chat - component: backend - spec: - securityContext: - runAsNonRoot: true - runAsUser: 1001 - fsGroup: 1001 - containers: - - name: chat-backend - image: registry.manacore.app/chat-backend:latest - imagePullPolicy: Always - ports: - - containerPort: 3002 - name: http - protocol: TCP - env: - - name: NODE_ENV - value: "production" - - name: PORT - value: "3002" - - name: DATABASE_URL - valueFrom: - secretKeyRef: - name: chat-db-credentials - key: connection-string - resources: - requests: - cpu: 250m - memory: 256Mi - limits: - cpu: 1000m - memory: 512Mi - livenessProbe: - httpGet: - path: /api/health - port: 3002 - initialDelaySeconds: 30 - periodSeconds: 10 - timeoutSeconds: 5 - failureThreshold: 3 - readinessProbe: - httpGet: - path: /api/health - port: 3002 - initialDelaySeconds: 10 - periodSeconds: 5 - timeoutSeconds: 3 - failureThreshold: 3 - securityContext: - allowPrivilegeEscalation: false - readOnlyRootFilesystem: true - capabilities: - drop: - - ALL ---- -apiVersion: v1 -kind: Service -metadata: - name: chat-backend - namespace: manacore -spec: - type: ClusterIP - ports: - - port: 3002 - targetPort: 3002 - protocol: TCP - name: http - selector: - app: chat - component: backend ---- -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: chat-backend - namespace: manacore - annotations: - cert-manager.io/cluster-issuer: "letsencrypt-prod" - nginx.ingress.kubernetes.io/ssl-redirect: "true" -spec: - ingressClassName: nginx - tls: - - hosts: - - api-chat.manacore.app - secretName: chat-backend-tls - rules: 
- - host: api-chat.manacore.app - http: - paths: - - path: / - pathType: Prefix - backend: - service: - name: chat-backend - port: - number: 3002 -``` - -**Helm Chart Structure:** - -``` -k8s/ -├── base/ -│ ├── deployment.yaml -│ ├── service.yaml -│ ├── ingress.yaml -│ └── configmap.yaml -├── overlays/ -│ ├── staging/ -│ │ └── kustomization.yaml -│ └── production/ -│ └── kustomization.yaml -└── helm/ - └── manacore/ - ├── Chart.yaml - ├── values.yaml - ├── values-staging.yaml - ├── values-production.yaml - └── templates/ - ├── deployment.yaml - ├── service.yaml - ├── ingress.yaml - └── hpa.yaml -``` - ---- - -## Deployment Topology - -### 1. Environment Stages - -``` -┌─────────────────────────────────────────────────────────────────────┐ -│ DEPLOYMENT PIPELINE │ -├─────────────────────────────────────────────────────────────────────┤ -│ │ -│ [Development] → [Staging] → [Production] │ -│ ↓ ↓ ↓ │ -│ Local Docker Coolify Coolify/K8s │ -│ 127.0.0.1 staging.* app domains │ -│ Hot reload Manual test Blue-green │ -│ No SSL Let's Encrypt Let's Encrypt │ -│ │ -└─────────────────────────────────────────────────────────────────────┘ -``` - -#### Development Environment - -- **Location:** Developer workstations -- **Orchestration:** Docker Compose -- **Database:** Local PostgreSQL (Docker) -- **Domains:** `localhost`, `*.localhost` -- **SSL:** None -- **Purpose:** Feature development, debugging - -#### Staging Environment - -- **Location:** Hetzner VPS (CCX32) -- **Orchestration:** Docker Compose -- **Database:** Dedicated Supabase project (staging) -- **Domains:** `staging-chat.manacore.app`, `staging-api-chat.manacore.app` -- **SSL:** Let's Encrypt via Traefik -- **Purpose:** Integration testing, QA, stakeholder demos - -#### Production Environment - -- **Location:** Hetzner VPS (CCX42) or Kubernetes (future) -- **Orchestration:** Docker Compose with zero-downtime deployments -- **Database:** Production Supabase projects (per-project isolation) -- **Domains:** 
`chat.manacore.app`, `api-chat.manacore.app`, etc. -- **SSL:** Let's Encrypt with auto-renewal -- **Purpose:** Live customer traffic - ---- - -### 2. Deployment Regions - -**Current Strategy:** Single-region deployment (Europe-West3) - -**Multi-Region Expansion (Future):** - -``` -┌─────────────────────────────────────────────────────────────────┐ -│ GLOBAL DEPLOYMENT │ -├─────────────────────────────────────────────────────────────────┤ -│ │ -│ [US-East] [EU-West] [Asia-Pacific] │ -│ Primary Primary Primary │ -│ Replicas: 2 Replicas: 3 Replicas: 2 │ -│ │ -│ ┌─────────────────────────────────────────────────┐ │ -│ │ Cloudflare CDN (Global Edge) │ │ -│ │ - Astro landing pages (cached) │ │ -│ │ - Expo OTA bundles (cached) │ │ -│ │ - API requests (proxied to nearest region) │ │ -│ └─────────────────────────────────────────────────┘ │ -│ │ -│ Database: Supabase (auto-replication across regions) │ -│ │ -└─────────────────────────────────────────────────────────────────┘ -``` - ---- - -### 3. Blue-Green Deployment Strategy - -**Concept:** Zero-downtime deployments by running two identical production environments. - -``` -┌─────────────────────────────────────────────────────────────────┐ -│ BLUE-GREEN DEPLOYMENT │ -├─────────────────────────────────────────────────────────────────┤ -│ │ -│ [Load Balancer / Coolify Proxy] │ -│ ↓ │ -│ ┌──────────────────┐ ┌──────────────────┐ │ -│ │ BLUE (Live) │ │ GREEN (Standby) │ │ -│ │ Version: 1.5.2 │ │ Version: 1.6.0 │ │ -│ │ Traffic: 100% │ │ Traffic: 0% │ │ -│ └──────────────────┘ └──────────────────┘ │ -│ │ -│ Deployment Steps: │ -│ 1. Deploy new version to GREEN │ -│ 2. Run smoke tests on GREEN │ -│ 3. Switch 10% traffic to GREEN (canary) │ -│ 4. Monitor metrics for 10 minutes │ -│ 5. Switch 100% traffic to GREEN │ -│ 6. Keep BLUE running for 1 hour (rollback window) │ -│ 7. 
Decommission BLUE │ -│ │ -└─────────────────────────────────────────────────────────────────┘ -``` - -**Rollback Procedure:** - -```bash -# Instant rollback by switching traffic back to BLUE -coolify switch-deployment blue - -# Or with Kubernetes -kubectl set image deployment/chat-backend chat-backend=registry.manacore.app/chat-backend:v1.5.2 -``` - -**Database Migration Handling:** - -- **Forward-compatible migrations only**: New code can read old schema -- **Two-phase migrations**: - 1. Deploy schema changes (additive only) - 2. Deploy code that uses new schema - 3. Remove old columns in next release - ---- - -### 4. Health Checks & Readiness Probes - -**NestJS Health Check Endpoint:** - -```typescript -// src/health/health.controller.ts -import { Controller, Get } from '@nestjs/common'; -import { HealthCheck, HealthCheckService, TypeOrmHealthIndicator } from '@nestjs/terminus'; - -@Controller('api/health') -export class HealthController { - constructor( - private health: HealthCheckService, - private db: TypeOrmHealthIndicator, - ) {} - - @Get() - @HealthCheck() - check() { - return this.health.check([ - () => this.db.pingCheck('database'), - ]); - } -} -``` - -**SvelteKit Health Check Endpoint:** - -```typescript -// src/routes/api/health/+server.ts -import type { RequestHandler } from './$types'; - -export const GET: RequestHandler = async () => { - return new Response('OK', { - status: 200, - headers: { 'Content-Type': 'text/plain' } - }); -}; -``` - -**Health Check Configuration:** - -```yaml -# docker-compose.yml -healthcheck: - test: ["CMD", "wget", "--spider", "-q", "http://localhost:3002/api/health"] - interval: 30s # Check every 30 seconds - timeout: 10s # Fail if no response in 10s - retries: 3 # Mark unhealthy after 3 consecutive failures - start_period: 40s # Grace period for app startup -``` - ---- - -## Data Architecture - -### 1. 
Database Strategy - -#### Supabase Integration Pattern - -``` -┌─────────────────────────────────────────────────────────────────┐ -│ SUPABASE MULTI-TENANCY │ -├─────────────────────────────────────────────────────────────────┤ -│ │ -│ Separate Supabase Project per Product: │ -│ │ -│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ -│ │ Chat DB │ │ Memoro DB │ │ Picture DB │ │ -│ │ (Supabase) │ │ (Supabase) │ │ (Supabase) │ │ -│ │ │ │ │ │ │ │ -│ │ - messages │ │ - memos │ │ - images │ │ -│ │ - threads │ │ - memories │ │ - prompts │ │ -│ │ - models │ │ - blueprints │ │ - generations│ │ -│ └──────────────┘ └──────────────┘ └──────────────┘ │ -│ │ -│ Shared Auth Database (Mana Core Auth): │ -│ ┌──────────────────────────────────────┐ │ -│ │ PostgreSQL (Docker/Cloud) │ │ -│ │ - users │ │ -│ │ - sessions │ │ -│ │ - credits │ │ -│ │ - subscriptions │ │ -│ └──────────────────────────────────────┘ │ -│ │ -└─────────────────────────────────────────────────────────────────┘ -``` - -**Rationale for Separate Supabase Projects:** - -- **Data isolation**: Security boundary per product -- **Independent scaling**: Each project has its own compute resources -- **Schema evolution**: Migrate databases independently -- **Billing transparency**: Track costs per product -- **RLS policies**: Easier to manage with per-project isolation - ---- - -#### Connection Pooling - -**Problem:** NestJS apps open many DB connections, exceeding Supabase limits (default: 60 connections). - -**Solution:** PgBouncer connection pooler (Supabase built-in). 
- -**Configuration:** - -```typescript -// Backend connection string (transaction pooling) -DATABASE_URL=postgresql://user:pass@db.project.supabase.co:6543/postgres?pgbouncer=true - -// For migrations (session pooling) -MIGRATION_DATABASE_URL=postgresql://user:pass@db.project.supabase.co:5432/postgres -``` - -**Docker Environment:** - -```yaml -# docker-compose.prod.yml -environment: - DATABASE_URL: ${DATABASE_URL}?pgbouncer=true&connection_limit=10 -``` - -**Connection Limits per Service:** - -| Service Type | Max Connections | Pool Size | Rationale | -|--------------|----------------|-----------|-----------| -| NestJS Backend | 10 | 5 | API requests are short-lived | -| SvelteKit Web | 5 | 3 | SSR queries are quick | -| Migration Script | 1 | 1 | One-time operation | - ---- - -### 2. Migration Workflow - -**Environment Progression:** - -``` -Development → Staging → Production - ↓ ↓ ↓ - Local DB Staging DB Prod DB -``` - -**Migration Process:** - -1. **Development:** - ```bash - # Generate migration - pnpm --filter @chat/backend migration:generate --name add-user-preferences - - # Apply migration locally - pnpm --filter @chat/backend migration:run - ``` - -2. **Staging:** - ```bash - # CI/CD pipeline applies migrations before deploying code - docker exec chat-backend pnpm migration:run - ``` - -3. 
**Production:** - ```bash - # Manual trigger (after staging validation) - kubectl exec -it chat-backend-pod -- pnpm migration:run - - # Or automated (Coolify) - coolify deploy chat-backend --run-migrations - ``` - -**Migration Safety Rules:** - -- ✅ **Safe migrations** (can run while old code is live): - - Add new table - - Add new column (with default value) - - Add index (concurrent) - - Expand enum values - -- ❌ **Unsafe migrations** (require blue-green deployment): - - Remove column - - Rename column - - Change column type - - Remove enum value - -**Example Migration (Drizzle ORM):** - -```typescript -// migrations/0001_add_user_preferences.ts -import { sql } from 'drizzle-orm'; -import { pgTable, text, jsonb, timestamp } from 'drizzle-orm/pg-core'; - -export const userPreferences = pgTable('user_preferences', { - id: text('id').primaryKey(), - userId: text('user_id').notNull().references(() => users.id), - preferences: jsonb('preferences').notNull().default('{}'), - createdAt: timestamp('created_at').defaultNow(), - updatedAt: timestamp('updated_at').defaultNow(), -}); - -export async function up(db) { - await db.execute(sql` - CREATE TABLE user_preferences ( - id TEXT PRIMARY KEY, - user_id TEXT NOT NULL REFERENCES users(id) ON DELETE CASCADE, - preferences JSONB NOT NULL DEFAULT '{}', - created_at TIMESTAMPTZ DEFAULT NOW(), - updated_at TIMESTAMPTZ DEFAULT NOW() - ); - CREATE INDEX idx_user_preferences_user_id ON user_preferences(user_id); - `); -} - -export async function down(db) { - await db.execute(sql`DROP TABLE user_preferences;`); -} -``` - ---- - -### 3. 
Backup & Recovery Strategy - -**Supabase Automatic Backups:** - -- **Daily backups**: Retained for 7 days (Pro plan) -- **Point-in-time recovery**: Up to 7 days (Pro plan) -- **Geographic replication**: Multi-region redundancy - -**Custom Backup Script:** - -```bash -#!/bin/bash -# scripts/backup-db.sh - -PROJECT_REF="your-project-ref" -BACKUP_DIR="/backups/$(date +%Y-%m-%d)" - -# Create backup -pg_dump "$DATABASE_URL" \ - --format=custom \ - --compress=9 \ - --file="$BACKUP_DIR/chat-db-$(date +%Y%m%d-%H%M%S).dump" - -# Upload to S3/R2 -aws s3 cp "$BACKUP_DIR" s3://manacore-backups/ --recursive - -# Retain only last 30 days -find /backups -mtime +30 -delete -``` - -**Restore Procedure:** - -```bash -# Download backup -aws s3 cp s3://manacore-backups/2025-11-27/chat-db-20251127-120000.dump ./ - -# Restore to database -pg_restore --clean --if-exists \ - --dbname="$DATABASE_URL" \ - ./chat-db-20251127-120000.dump -``` - -**Disaster Recovery RPO/RTO:** - -- **RPO (Recovery Point Objective)**: < 24 hours (daily backups) -- **RTO (Recovery Time Objective)**: < 1 hour (automated restore) - ---- - -### 4. 
Redis Caching Strategy - -**Use Cases:** - -| Service | Cache Key Pattern | TTL | Purpose | -|---------|------------------|-----|---------| -| Mana Core Auth | `session:{sessionId}` | 7 days | JWT session storage | -| Mana Core Auth | `credits:{userId}` | 5 minutes | Credit balance cache | -| Chat Backend | `models:list` | 1 hour | AI model metadata | -| Picture Backend | `generations:{userId}:{day}` | 24 hours | Daily usage quota | -| Uload Backend | `url:{shortCode}` | Permanent | URL redirect cache | - -**Redis Configuration:** - -```yaml -# docker-compose.prod.yml -redis: - image: redis:7-alpine - command: > - redis-server - --requirepass ${REDIS_PASSWORD} - --maxmemory 512mb - --maxmemory-policy allkeys-lru - --appendonly yes - --appendfsync everysec - volumes: - - redis-data:/data -``` - -**Cache Invalidation Strategy:** - -```typescript -// Example: Invalidate user credits cache on update -async updateCredits(userId: string, amount: number) { - await this.db.updateCredits(userId, amount); - await this.redis.del(`credits:${userId}`); // Invalidate cache -} -``` - ---- - -## Network Architecture - -### 1. 
Domain & Subdomain Strategy - -``` -┌─────────────────────────────────────────────────────────────────┐ -│ DOMAIN ARCHITECTURE │ -├─────────────────────────────────────────────────────────────────┤ -│ │ -│ Root Domain: manacore.app │ -│ │ -│ Product Structure: │ -│ ┌──────────────────────────────────────────────────┐ │ -│ │ Landing (Astro) → chat.manacore.app │ │ -│ │ Web App (Svelte) → app-chat.manacore.app │ │ -│ │ API (NestJS) → api-chat.manacore.app │ │ -│ │ Mobile (Expo) → N/A (native apps) │ │ -│ └──────────────────────────────────────────────────┘ │ -│ │ -│ Example: Chat Project │ -│ - https://chat.manacore.app → Astro landing │ -│ - https://app-chat.manacore.app → SvelteKit web app │ -│ - https://api-chat.manacore.app → NestJS backend │ -│ │ -│ Infrastructure: │ -│ - https://auth.manacore.app → Mana Core Auth │ -│ - https://status.manacore.app → Status page (UptimeRobot)│ -│ - https://docs.manacore.app → API documentation │ -│ │ -│ All domains: │ -│ - SSL via Let's Encrypt (Coolify auto-provision) │ -│ - HTTP/2 enabled │ -│ - HSTS headers (max-age=31536000) │ -│ - Cloudflare DNS (with proxy for DDoS protection) │ -│ │ -└─────────────────────────────────────────────────────────────────┘ -``` - -**DNS Records (Cloudflare):** - -``` -Type Name Target Proxy -───────────────────────────────────────────────────────────────────── -A chat.manacore.app 185.230.123.45 (Coolify IP) Yes -A app-chat.manacore.app 185.230.123.45 Yes -A api-chat.manacore.app 185.230.123.45 No* -CNAME *.manacore.app manacore.app Yes - -* API endpoints should NOT be proxied through Cloudflare to avoid caching issues -``` - ---- - -### 2. 
SSL/TLS Certificate Management - -**Coolify Automatic SSL:** - -```yaml -# .coolify/settings.yml -ssl: - provider: letsencrypt - email: devops@manacore.app - staging: false # Use production Let's Encrypt - auto_renew: true - renewal_days_before: 30 -``` - -**Manual SSL (Certbot):** - -```bash -# Initial setup -certbot certonly --standalone \ - -d chat.manacore.app \ - -d api-chat.manacore.app \ - --email devops@manacore.app \ - --agree-tos - -# Auto-renewal cron job -0 0 * * * certbot renew --quiet --post-hook "systemctl reload nginx" -``` - -**SSL Configuration (Nginx):** - -```nginx -# /etc/nginx/sites-available/chat.manacore.app -server { - listen 443 ssl http2; - server_name chat.manacore.app; - - ssl_certificate /etc/letsencrypt/live/chat.manacore.app/fullchain.pem; - ssl_certificate_key /etc/letsencrypt/live/chat.manacore.app/privkey.pem; - ssl_protocols TLSv1.2 TLSv1.3; - ssl_ciphers HIGH:!aNULL:!MD5; - ssl_prefer_server_ciphers on; - - # HSTS - add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always; - - # Security headers - add_header X-Frame-Options "SAMEORIGIN" always; - add_header X-Content-Type-Options "nosniff" always; - add_header X-XSS-Protection "1; mode=block" always; - - location / { - proxy_pass http://localhost:3100; # chat-web container - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - } -} -``` - ---- - -### 3. API Gateway vs Direct Service Exposure - -**Current Recommendation:** Direct service exposure (no API gateway initially). 
- -**Rationale:** - -- **Simplicity**: Each backend has its own domain -- **Low traffic volume**: Gateway overhead not justified yet -- **Independent scaling**: Services scale independently -- **Coolify routing**: Built-in reverse proxy handles routing - -**Future API Gateway (Kong/Traefik) - When to Adopt:** - -- Traffic > 10,000 req/min -- Need centralized rate limiting -- Require complex routing (A/B testing, canary deployments) -- Centralized authentication/authorization - -**Example Kong Configuration (Future):** - -```yaml -# kong.yml -_format_version: "3.0" - -services: - - name: chat-backend - url: http://chat-backend:3002 - routes: - - name: chat-api - paths: - - /api/chat - strip_path: true - plugins: - - name: rate-limiting - config: - minute: 100 - - name: cors - config: - origins: - - https://app-chat.manacore.app - - - name: picture-backend - url: http://picture-backend:3005 - routes: - - name: picture-api - paths: - - /api/picture -``` - ---- - -### 4. CORS Configuration - -**Backend CORS Setup (NestJS):** - -```typescript -// src/main.ts -import { NestFactory } from '@nestjs/core'; -import { AppModule } from './app.module'; - -async function bootstrap() { - const app = await NestFactory.create(AppModule); - - app.enableCors({ - origin: [ - 'https://app-chat.manacore.app', // Production web app - 'https://chat.manacore.app', // Landing page - 'http://localhost:5173', // Development web app - 'http://localhost:3000', // Development landing - 'capacitor://localhost', // Mobile app (Capacitor) - 'ionic://localhost', // Mobile app (Ionic) - ], - credentials: true, - methods: ['GET', 'POST', 'PUT', 'DELETE', 'PATCH', 'OPTIONS'], - allowedHeaders: ['Content-Type', 'Authorization', 'X-App-ID'], - }); - - await app.listen(3002); -} -bootstrap(); -``` - -**Environment-Specific CORS:** - -```typescript -// config/cors.config.ts -const allowedOrigins = { - development: ['http://localhost:*'], - staging: ['https://staging-*.manacore.app'], - production: 
['https://*.manacore.app'], -}; - -export const getCorsOrigins = () => { - const env = process.env.NODE_ENV || 'development'; - return allowedOrigins[env]; -}; -``` - ---- - -### 5. CDN for Static Assets - -**Strategy:** Cloudflare CDN in front of Astro landing pages. - -**Benefits:** - -- **Global edge caching**: 275+ data centers worldwide -- **DDoS protection**: Automatic mitigation -- **Compression**: Brotli + Gzip -- **Image optimization**: Polish feature (WebP conversion) -- **Caching rules**: Configurable per path - -**Cloudflare Page Rules:** - -``` -Rule 1: Cache Everything - URL: https://chat.manacore.app/* - Settings: - - Cache Level: Cache Everything - - Edge Cache TTL: 1 month - - Browser Cache TTL: 1 week - -Rule 2: Bypass Cache for API - URL: https://api-chat.manacore.app/* - Settings: - - Cache Level: Bypass - -Rule 3: Image Optimization - URL: https://chat.manacore.app/images/* - Settings: - - Polish: Lossless - - Mirage: On (lazy loading) -``` - -**Astro Build Configuration:** - -```javascript -// astro.config.mjs -export default defineConfig({ - output: 'static', - build: { - inlineStylesheets: 'auto', - assets: '_assets', - }, - vite: { - build: { - rollupOptions: { - output: { - assetFileNames: 'assets/[name].[hash][extname]', - chunkFileNames: 'chunks/[name].[hash].js', - entryFileNames: 'entry/[name].[hash].js', - }, - }, - }, - }, -}); -``` - -**Cache-Control Headers:** - -```nginx -# Nginx config for Astro landing pages -location ~* \.(js|css|png|jpg|jpeg|gif|ico|svg|woff|woff2)$ { - expires 1y; - add_header Cache-Control "public, immutable"; -} - -location ~* \.(html)$ { - expires 1h; - add_header Cache-Control "public, must-revalidate"; -} -``` - ---- - -## Environment Configuration Matrix - -### Service Environment Variables - -| Service | Env Var | Development | Staging | Production | Secret | -|---------|---------|-------------|---------|------------|--------| -| **mana-core-auth** | -| | `PORT` | 3001 | 3001 | 3001 | No | -| | 
`DATABASE_URL` | `postgresql://localhost:5432/manacore` | `postgresql://staging-db/manacore` | `postgresql://prod-db/manacore` | Yes | -| | `REDIS_HOST` | localhost | redis | redis | No | -| | `JWT_PRIVATE_KEY` | (dev key) | (staging key) | (prod key) | Yes | -| | `STRIPE_SECRET_KEY` | `sk_test_...` | `sk_test_...` | `sk_live_...` | Yes | -| **chat-backend** | -| | `PORT` | 3002 | 3002 | 3002 | No | -| | `DATABASE_URL` | Supabase (dev) | Supabase (staging) | Supabase (prod) | Yes | -| | `AZURE_OPENAI_API_KEY` | (dev key) | (staging key) | (prod key) | Yes | -| | `MANA_CORE_AUTH_URL` | `http://localhost:3001` | `https://auth-staging.manacore.app` | `https://auth.manacore.app` | No | -| **chat-web** | -| | `PUBLIC_BACKEND_URL` | `http://localhost:3002` | `https://api-staging-chat.manacore.app` | `https://api-chat.manacore.app` | No | -| | `PUBLIC_SUPABASE_URL` | Supabase (dev) | Supabase (staging) | Supabase (prod) | No | -| | `PUBLIC_SUPABASE_ANON_KEY` | (dev anon key) | (staging anon key) | (prod anon key) | No | - -**Secret Management:** - -- **Development:** `.env.development` (committed to git) -- **Staging/Production:** Coolify secrets UI or Kubernetes secrets - -```bash -# Coolify secret injection -coolify env set chat-backend \ - AZURE_OPENAI_API_KEY=secret123 \ - DATABASE_URL=postgresql://... -``` - -**Kubernetes Secrets:** - -```yaml -# k8s/secrets.yaml -apiVersion: v1 -kind: Secret -metadata: - name: chat-backend-secrets - namespace: manacore -type: Opaque -data: - database-url: cG9zdGdyZXNxbDovLy4uLg== # base64 encoded - azure-api-key: c2VjcmV0MTIz # base64 encoded -``` - ---- - -## Monitoring & Observability - -### 1. 
Logging Aggregation - -**Architecture:** - -``` -┌─────────────────────────────────────────────────────────────────┐ -│ LOGGING PIPELINE │ -├─────────────────────────────────────────────────────────────────┤ -│ │ -│ [Services] │ -│ ↓ stdout/stderr │ -│ [Docker Logs] │ -│ ↓ Docker logging driver │ -│ [Loki / ELK Stack] │ -│ ↓ Aggregation & indexing │ -│ [Grafana / Kibana] │ -│ ↓ Visualization & alerts │ -│ [On-call Engineer] │ -│ │ -└─────────────────────────────────────────────────────────────────┘ -``` - -**Docker Logging Driver (Loki):** - -```yaml -# docker-compose.prod.yml -x-logging: &default-logging - driver: loki - options: - loki-url: "http://loki:3100/loki/api/v1/push" - loki-batch-size: "400" - loki-retries: "3" - labels: "project,service,environment" - -services: - chat-backend: - logging: *default-logging - labels: - logging.project: "chat" - logging.service: "backend" - logging.environment: "production" -``` - -**Structured Logging (NestJS):** - -```typescript -// src/logging/logger.service.ts -import { Injectable, Logger as NestLogger } from '@nestjs/common'; - -@Injectable() -export class LoggerService extends NestLogger { - log(message: string, context?: string) { - super.log(JSON.stringify({ - level: 'info', - timestamp: new Date().toISOString(), - context, - message, - environment: process.env.NODE_ENV, - service: 'chat-backend', - })); - } - - error(message: string, trace?: string, context?: string) { - super.error(JSON.stringify({ - level: 'error', - timestamp: new Date().toISOString(), - context, - message, - trace, - environment: process.env.NODE_ENV, - service: 'chat-backend', - })); - } -} -``` - -**Grafana Loki Query Examples:** - -```logql -# All errors in last 1 hour -{project="chat", level="error"} |= "" | json | line_format "{{.message}}" - -# High latency requests (>1s) -{service="backend"} | json | duration > 1s - -# Failed database connections -{service="backend"} |~ "database connection failed" -``` - ---- - -### 2. 
Application Performance Monitoring (APM) - -**Recommended Tool:** Sentry (error tracking) + New Relic / Datadog (APM) - -**Sentry Integration (NestJS):** - -```typescript -// src/main.ts -import * as Sentry from '@sentry/node'; - -Sentry.init({ - dsn: process.env.SENTRY_DSN, - environment: process.env.NODE_ENV, - tracesSampleRate: 0.1, // 10% of transactions - integrations: [ - new Sentry.Integrations.Http({ tracing: true }), - new Sentry.Integrations.Postgres(), - ], -}); - -async function bootstrap() { - const app = await NestFactory.create(AppModule); - - // Sentry request handler - app.use(Sentry.Handlers.requestHandler()); - app.use(Sentry.Handlers.tracingHandler()); - - // ... app setup - - // Sentry error handler - app.use(Sentry.Handlers.errorHandler()); - - await app.listen(3002); -} -``` - -**Metrics to Track:** - -| Metric | Threshold | Action | -|--------|-----------|--------| -| API Response Time (p95) | > 500ms | Alert on-call | -| Error Rate | > 5% | Alert on-call | -| Database Query Time (p95) | > 200ms | Investigate slow queries | -| Memory Usage | > 80% | Scale up or investigate leak | -| CPU Usage | > 70% | Scale horizontally | -| Failed Logins | > 100/min | Potential attack, rate limit | - ---- - -### 3. 
Metrics Collection (Prometheus + Grafana) - -**Prometheus Exporter (NestJS):** - -```typescript -// src/metrics/metrics.controller.ts -import { Controller, Get } from '@nestjs/common'; -import { register, Counter, Histogram } from 'prom-client'; - -const httpRequestDuration = new Histogram({ - name: 'http_request_duration_seconds', - help: 'Duration of HTTP requests in seconds', - labelNames: ['method', 'route', 'status_code'], -}); - -const httpRequestTotal = new Counter({ - name: 'http_requests_total', - help: 'Total number of HTTP requests', - labelNames: ['method', 'route', 'status_code'], -}); - -@Controller() -export class MetricsController { - @Get('/metrics') - getMetrics() { - return register.metrics(); - } -} -``` - -**Prometheus Scrape Config:** - -```yaml -# prometheus.yml -scrape_configs: - - job_name: 'chat-backend' - static_configs: - - targets: ['chat-backend:3002'] - metrics_path: '/metrics' - scrape_interval: 30s - - - job_name: 'maerchenzauber-backend' - static_configs: - - targets: ['maerchenzauber-backend:3003'] -``` - -**Grafana Dashboard:** - -- **Dashboard 1: Service Health Overview** - - Request rate (req/sec) - - Error rate (%) - - Response time (p50, p95, p99) - - Active connections - -- **Dashboard 2: Database Performance** - - Query duration - - Connection pool usage - - Slow queries (>100ms) - -- **Dashboard 3: Resource Utilization** - - CPU usage - - Memory usage - - Disk I/O - - Network traffic - ---- - -### 4. 
Alert Thresholds - -**Alert Configuration (Prometheus Alertmanager):** - -```yaml -# alertmanager.yml -groups: - - name: critical_alerts - interval: 1m - rules: - - alert: HighErrorRate - expr: rate(http_requests_total{status_code=~"5.."}[5m]) > 0.05 - for: 5m - labels: - severity: critical - annotations: - summary: "High error rate detected (>5%)" - description: "Service {{ $labels.service }} has error rate {{ $value }}" - - - alert: HighResponseTime - expr: histogram_quantile(0.95, http_request_duration_seconds_bucket) > 0.5 - for: 10m - labels: - severity: warning - annotations: - summary: "High response time (p95 >500ms)" - - - alert: DatabaseConnectionPoolExhausted - expr: pg_pool_available_connections < 2 - for: 2m - labels: - severity: critical - annotations: - summary: "Database connection pool almost exhausted" - - - alert: HighMemoryUsage - expr: container_memory_usage_bytes / container_spec_memory_limit_bytes > 0.8 - for: 5m - labels: - severity: warning - annotations: - summary: "Container memory usage >80%" -``` - -**Alert Routing:** - -```yaml -# alertmanager.yml -route: - receiver: 'default' - group_by: ['alertname', 'service'] - group_wait: 10s - group_interval: 10s - repeat_interval: 12h - routes: - - match: - severity: critical - receiver: 'pagerduty' - - match: - severity: warning - receiver: 'slack' - -receivers: - - name: 'pagerduty' - pagerduty_configs: - - service_key: '' - - - name: 'slack' - slack_configs: - - api_url: '' - channel: '#alerts' -``` - ---- - -## CI/CD Pipeline - -### GitHub Actions Workflow - -**File:** `.github/workflows/deploy-chat.yml` - -```yaml -name: Deploy Chat Project - -on: - push: - branches: [main] - paths: - - 'apps/chat/**' - - 'packages/shared-*/**' - - '.github/workflows/deploy-chat.yml' - pull_request: - branches: [main] - paths: - - 'apps/chat/**' - -env: - REGISTRY: ghcr.io - IMAGE_PREFIX: manacore - -jobs: - # ============================================================================ - # Job 1: Lint & 
Type Check - # ============================================================================ - - lint-and-typecheck: - name: Lint & Type Check - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - - name: Setup pnpm - uses: pnpm/action-setup@v2 - with: - version: 9.15.0 - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: '20' - cache: 'pnpm' - - - name: Install dependencies - run: pnpm install --frozen-lockfile - - - name: Build shared packages - run: pnpm --filter '@manacore/shared-*' build - - - name: Lint chat backend - run: pnpm --filter @chat/backend lint - - - name: Type check chat backend - run: pnpm --filter @chat/backend type-check - - - name: Lint chat web - run: pnpm --filter @chat/web lint - - - name: Type check chat web - run: pnpm --filter @chat/web type-check - - # ============================================================================ - # Job 2: Build & Push Docker Images - # ============================================================================ - - build-and-push: - name: Build Docker Images - runs-on: ubuntu-latest - needs: lint-and-typecheck - if: github.event_name == 'push' && github.ref == 'refs/heads/main' - strategy: - matrix: - service: - - { name: chat-backend, path: apps/chat/apps/backend, port: 3002 } - - { name: chat-web, path: apps/chat/apps/web, port: 3000 } - - { name: chat-landing, path: apps/chat/apps/landing, port: 80 } - - permissions: - contents: read - packages: write - - steps: - - uses: actions/checkout@v4 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Extract metadata - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.REGISTRY }}/${{ env.IMAGE_PREFIX }}/${{ matrix.service.name }} - tags: | - type=ref,event=branch - type=ref,event=pr 
- type=semver,pattern={{version}} - type=semver,pattern={{major}}.{{minor}} - type=sha,prefix={{branch}}- - type=raw,value=latest,enable={{is_default_branch}} - - - name: Determine Dockerfile - id: dockerfile - run: | - if [[ "${{ matrix.service.name }}" == *-backend ]]; then - echo "dockerfile=docker/templates/Dockerfile.nestjs" >> $GITHUB_OUTPUT - elif [[ "${{ matrix.service.name }}" == *-web ]]; then - echo "dockerfile=docker/templates/Dockerfile.sveltekit" >> $GITHUB_OUTPUT - elif [[ "${{ matrix.service.name }}" == *-landing ]]; then - echo "dockerfile=docker/templates/Dockerfile.astro" >> $GITHUB_OUTPUT - fi - - - name: Build and push Docker image - uses: docker/build-push-action@v5 - with: - context: . - file: ${{ steps.dockerfile.outputs.dockerfile }} - build-args: | - PROJECT_PATH=${{ matrix.service.path }} - PORT=${{ matrix.service.port }} - push: true - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - cache-from: type=gha - cache-to: type=gha,mode=max - - # ============================================================================ - # Job 3: Deploy to Staging - # ============================================================================ - - deploy-staging: - name: Deploy to Staging - runs-on: ubuntu-latest - needs: build-and-push - environment: - name: staging - url: https://staging-chat.manacore.app - - steps: - - name: Deploy to Coolify (Staging) - uses: appleboy/ssh-action@v1.0.0 - with: - host: ${{ secrets.COOLIFY_STAGING_HOST }} - username: ${{ secrets.COOLIFY_SSH_USER }} - key: ${{ secrets.COOLIFY_SSH_KEY }} - script: | - cd /var/lib/coolify/apps/chat-staging - docker compose pull - docker compose up -d --force-recreate - docker compose exec -T chat-backend pnpm migration:run - - - name: Health check (Staging) - run: | - curl -f https://api-staging-chat.manacore.app/api/health || exit 1 - - # ============================================================================ - # Job 4: Deploy to Production (Manual 
Approval) - # ============================================================================ - - deploy-production: - name: Deploy to Production - runs-on: ubuntu-latest - needs: deploy-staging - environment: - name: production - url: https://chat.manacore.app - - steps: - - name: Deploy to Coolify (Production) - uses: appleboy/ssh-action@v1.0.0 - with: - host: ${{ secrets.COOLIFY_PROD_HOST }} - username: ${{ secrets.COOLIFY_SSH_USER }} - key: ${{ secrets.COOLIFY_SSH_KEY }} - script: | - cd /var/lib/coolify/apps/chat-production - - # Blue-green deployment: Deploy to green environment - docker compose -f docker-compose.green.yml pull - docker compose -f docker-compose.green.yml up -d --force-recreate - - # Wait for health check - sleep 10 - - # Run migrations on green - docker compose -f docker-compose.green.yml exec -T chat-backend pnpm migration:run - - # Health check green environment - curl -f http://localhost:3002/api/health || exit 1 - - # Switch traffic to green (update Coolify routing) - coolify switch-deployment chat green - - # Keep blue running for 1 hour (rollback window) - # Decommission blue after validation - - - name: Health check (Production) - run: | - curl -f https://api-chat.manacore.app/api/health || exit 1 - - - name: Smoke tests - run: | - # Basic API tests - curl -X POST https://api-chat.manacore.app/api/chat/completions \ - -H "Content-Type: application/json" \ - -d '{"model":"gpt-4o-mini","messages":[{"role":"user","content":"Hello"}]}' -``` - -**Matrix Strategy for All Projects:** - -```yaml -# .github/workflows/deploy-all.yml -strategy: - matrix: - project: - - chat - - maerchenzauber - - manadeck - - memoro - - picture - - uload - - nutriphi - - news - - manacore -``` - ---- - -## Disaster Recovery - -### 1. 
Backup Strategy - -**What to Backup:** - -- ✅ **PostgreSQL databases** (Supabase auto-backup + manual pg_dump) -- ✅ **Redis data** (AOF persistence enabled) -- ✅ **Docker volumes** (application state, logs) -- ✅ **Environment variables** (encrypted secrets backup) -- ✅ **SSL certificates** (Let's Encrypt certs) -- ❌ **Docker images** (rebuild from source) -- ❌ **Build artifacts** (regenerate from CI/CD) - -**Backup Schedule:** - -| Asset | Frequency | Retention | Storage | -|-------|-----------|-----------|---------| -| PostgreSQL | Daily (3 AM UTC) | 30 days | Cloudflare R2 | -| Redis | Daily (4 AM UTC) | 7 days | Cloudflare R2 | -| Environment Configs | On change | Indefinite | Git (encrypted) | -| SSL Certs | Weekly | 90 days | Encrypted backup | - -**Automated Backup Script:** - -```bash -#!/bin/bash -# scripts/backup-all.sh - -set -e - -BACKUP_DIR="/backups/$(date +%Y/%m/%d)" -S3_BUCKET="s3://manacore-backups" - -mkdir -p "$BACKUP_DIR" - -# Backup all databases -for db in manacore chat maerchenzauber manadeck picture nutriphi; do - echo "Backing up database: $db" - pg_dump "$DATABASE_URL/$db" \ - --format=custom \ - --compress=9 \ - --file="$BACKUP_DIR/$db-$(date +%Y%m%d-%H%M%S).dump" -done - -# Backup Redis -echo "Backing up Redis" -redis-cli --rdb "$BACKUP_DIR/redis-$(date +%Y%m%d-%H%M%S).rdb" - -# Upload to S3 (Cloudflare R2) -aws s3 sync "$BACKUP_DIR" "$S3_BUCKET/$(date +%Y/%m/%d)" \ - --endpoint-url https://your-account-id.r2.cloudflarestorage.com - -# Cleanup local backups older than 7 days -find /backups -type d -mtime +7 -exec rm -rf {} + - -echo "Backup completed successfully" -``` - -**Cron Job:** - -```cron -# Run backup daily at 3 AM UTC -0 3 * * * /opt/manacore/scripts/backup-all.sh >> /var/log/manacore-backup.log 2>&1 -``` - ---- - -### 2. Recovery Procedures - -#### Scenario 1: Database Corruption - -```bash -# 1. Stop application -docker compose stop chat-backend - -# 2. 
Download latest backup -aws s3 cp s3://manacore-backups/2025/11/27/chat-20251127-030000.dump ./ - -# 3. Drop corrupted database -psql -U manacore -c "DROP DATABASE chat;" -psql -U manacore -c "CREATE DATABASE chat;" - -# 4. Restore from backup -pg_restore --dbname="postgresql://manacore:pass@localhost/chat" \ - --clean --if-exists \ - ./chat-20251127-030000.dump - -# 5. Restart application -docker compose start chat-backend - -# 6. Verify health -curl -f https://api-chat.manacore.app/api/health -``` - -**RTO:** ~15 minutes -**RPO:** < 24 hours (last daily backup) - ---- - -#### Scenario 2: Complete Server Failure - -```bash -# 1. Provision new server (same specs) -# 2. Install Docker + Coolify -curl -fsSL https://cdn.coollabs.io/coolify/install.sh | bash - -# 3. Clone repository -git clone https://github.com/manacore/manacore-monorepo.git -cd manacore-monorepo - -# 4. Restore environment variables (from encrypted backup) -gpg --decrypt secrets-backup.gpg > .env.production - -# 5. Restore databases -./scripts/restore-all-databases.sh - -# 6. Deploy all services -docker compose -f docker-compose.prod.yml up -d - -# 7. Update DNS records (point to new server IP) -# 8. Verify all services healthy -``` - -**RTO:** ~2 hours -**RPO:** < 24 hours - ---- - -#### Scenario 3: Accidental Data Deletion - -**Example:** User accidentally deleted critical records. - -```bash -# 1. Identify time of deletion -# 2. Find latest backup BEFORE deletion -aws s3 ls s3://manacore-backups/2025/11/27/ - -# 3. Restore to temporary database -pg_restore --dbname="postgresql://localhost/chat_temp" \ - ./chat-20251127-120000.dump - -# 4. Extract deleted records -psql -U manacore chat_temp -c \ - "COPY (SELECT * FROM messages WHERE id IN ('uuid1','uuid2')) TO STDOUT" \ - > deleted_records.csv - -# 5. Import to production database -psql -U manacore chat -c \ - "COPY messages FROM STDIN CSV" < deleted_records.csv - -# 6. 
Verify restoration -psql -U manacore chat -c \ - "SELECT * FROM messages WHERE id IN ('uuid1','uuid2')" -``` - ---- - -### 3. Failover Strategies - -#### Active-Passive (Current) - -``` -┌─────────────────────────────────────────────────────────────────┐ -│ ACTIVE-PASSIVE FAILOVER │ -├─────────────────────────────────────────────────────────────────┤ -│ │ -│ [Primary Server - EU-West] │ -│ ┌────────────────────────────┐ │ -│ │ Chat Backend (Active) │ │ -│ │ Picture Backend (Active) │ │ -│ │ All Web Apps (Active) │ │ -│ └────────────────────────────┘ │ -│ │ -│ [Standby Server - US-East] (Cold Standby) │ -│ ┌────────────────────────────┐ │ -│ │ Services: Stopped │ │ -│ │ Disk: Daily backup sync │ │ -│ │ Activation: Manual │ │ -│ └────────────────────────────┘ │ -│ │ -│ Failover Time: ~2 hours (manual) │ -│ │ -└─────────────────────────────────────────────────────────────────┘ -``` - -**Failover Trigger:** -1. Primary server down > 30 minutes -2. Health checks fail > 10 consecutive times -3. Network unreachable - -**Manual Failover Steps:** -```bash -# 1. Verify primary is down -curl -f https://api-chat.manacore.app/api/health - -# 2. Activate standby server -ssh standby-server "docker compose -f docker-compose.prod.yml up -d" - -# 3. Update DNS (short TTL) -# A record: chat.manacore.app → standby-server-ip - -# 4. Wait for DNS propagation (~5 minutes with TTL=300) - -# 5. Verify all services healthy on standby -./scripts/health-check-all.sh -``` - ---- - -#### Active-Active (Future) - -**Multi-region setup with load balancing:** - -``` -[Cloudflare Load Balancer] - ↓ - ┌────┴────┐ - ↓ ↓ -[EU-West] [US-East] -Chat-1 Chat-2 -Picture-1 Picture-2 -``` - -**Benefits:** -- Zero-downtime failover (automatic) -- Geographic load distribution -- Better performance for global users - -**Challenges:** -- Database replication complexity -- Session state synchronization -- 2x infrastructure cost - ---- - -## Security Hardening - -### 1. 
Container Security - -```dockerfile -# Security best practices in Dockerfile - -# 1. Non-root user -RUN addgroup -g 1001 nodejs && adduser -u 1001 -G nodejs -s /bin/sh -D nodejs -USER nodejs - -# 2. Read-only root filesystem -# (configured in docker-compose.yml) - -# 3. Minimal base image -FROM node:20-alpine # Not node:20 (Debian) - -# 4. No unnecessary packages -RUN apk add --no-cache postgresql-client wget -# Avoid: apt-get install curl git vim ... - -# 5. Scan for vulnerabilities -# Run: trivy image chat-backend:latest -``` - -**Docker Compose Security:** - -```yaml -services: - chat-backend: - security_opt: - - no-new-privileges:true - read_only: true - tmpfs: - - /tmp:noexec,nosuid,size=100m - cap_drop: - - ALL - cap_add: - - NET_BIND_SERVICE -``` - ---- - -### 2. Network Security - -**Firewall Rules (iptables/ufw):** - -```bash -# Allow only necessary ports -ufw default deny incoming -ufw default allow outgoing -ufw allow 22/tcp # SSH -ufw allow 80/tcp # HTTP -ufw allow 443/tcp # HTTPS -ufw enable - -# Block direct access to backend ports (only via reverse proxy) -ufw deny 3001:3100/tcp -``` - -**Docker Network Isolation:** - -```yaml -networks: - frontend: - driver: bridge - backend: - driver: bridge - internal: true # No external access - -services: - chat-web: - networks: - - frontend - - backend - - chat-backend: - networks: - - backend # Not exposed to internet - - postgres: - networks: - - backend # Internal only -``` - ---- - -### 3. 
Secrets Management - -**Current:** Coolify environment variables UI (encrypted at rest) - -**Future:** HashiCorp Vault or AWS Secrets Manager - -**Vault Integration Example:** - -```typescript -// src/config/vault.config.ts -import * as vault from 'node-vault'; - -const vaultClient = vault({ - endpoint: process.env.VAULT_ADDR, - token: process.env.VAULT_TOKEN, -}); - -export async function getSecret(path: string) { - const result = await vaultClient.read(path); - return result.data; -} - -// Usage -const dbPassword = await getSecret('secret/database/chat-backend'); -``` - ---- - -### 4. Rate Limiting - -**NestJS Throttler:** - -```typescript -// src/app.module.ts -import { ThrottlerModule } from '@nestjs/throttler'; - -@Module({ - imports: [ - ThrottlerModule.forRoot({ - ttl: 60, // Time window (seconds) - limit: 100, // Max requests per window - }), - ], -}) -export class AppModule {} -``` - -**Nginx Rate Limiting:** - -```nginx -# /etc/nginx/nginx.conf -http { - limit_req_zone $binary_remote_addr zone=api_limit:10m rate=10r/s; - - server { - location /api/ { - limit_req zone=api_limit burst=20 nodelay; - proxy_pass http://backend; - } - } -} -``` - ---- - -### 5. 
Security Headers - -```typescript -// src/main.ts (NestJS) -import helmet from 'helmet'; - -app.use(helmet({ - contentSecurityPolicy: { - directives: { - defaultSrc: ["'self'"], - scriptSrc: ["'self'", "'unsafe-inline'"], - styleSrc: ["'self'", "'unsafe-inline'"], - imgSrc: ["'self'", "data:", "https:"], - }, - }, - hsts: { - maxAge: 31536000, - includeSubDomains: true, - preload: true, - }, -})); -``` - -**HTTP Headers:** - -``` -Strict-Transport-Security: max-age=31536000; includeSubDomains; preload -X-Frame-Options: SAMEORIGIN -X-Content-Type-Options: nosniff -X-XSS-Protection: 1; mode=block -Referrer-Policy: strict-origin-when-cross-origin -Permissions-Policy: geolocation=(), microphone=(), camera=() -``` - ---- - -## Implementation Roadmap - -### Phase 1: Foundation (Week 1-2) - -- [ ] Create Dockerfile templates (NestJS, SvelteKit, Astro) -- [ ] Enhance `docker-compose.dev.yml` with all projects -- [ ] Set up shared PostgreSQL + Redis containers -- [ ] Test local development workflow -- [ ] Document environment variable mapping - -### Phase 2: CI/CD (Week 3-4) - -- [ ] Set up GitHub Actions workflows (per project) -- [ ] Configure Docker image registry (GitHub Container Registry) -- [ ] Implement automated testing in CI -- [ ] Set up staging environment on Coolify -- [ ] Implement blue-green deployment scripts - -### Phase 3: Production Deployment (Week 5-6) - -- [ ] Deploy `mana-core-auth` to production -- [ ] Deploy first project (chat) end-to-end -- [ ] Set up monitoring (Prometheus + Grafana) -- [ ] Configure alerting (PagerDuty + Slack) -- [ ] Implement automated backups - -### Phase 4: Rollout (Week 7-8) - -- [ ] Deploy remaining 8 projects -- [ ] Set up CDN for Astro landing pages -- [ ] Configure DNS and SSL for all domains -- [ ] Load testing and performance optimization -- [ ] Documentation and runbooks - -### Phase 5: Optimization (Week 9-10) - -- [ ] Implement caching strategies (Redis) -- [ ] Set up APM (Sentry + New Relic) -- [ ] Security audit 
and penetration testing -- [ ] Disaster recovery drills -- [ ] Team training on deployment procedures - ---- - -## Appendix - -### A. Port Allocation Matrix - -| Service | Dev Port | Staging Port | Prod Port | Protocol | -|---------|----------|--------------|-----------|----------| -| mana-core-auth | 3001 | 3001 | 3001 | HTTP | -| chat-backend | 3002 | 3002 | 3002 | HTTP | -| chat-web | 3100 | 3100 | 3100 | HTTP | -| chat-landing | 3200 | 3200 | 3200 | HTTP | -| maerchenzauber-backend | 3003 | 3003 | 3003 | HTTP | -| maerchenzauber-web | 3110 | 3110 | 3110 | HTTP | -| maerchenzauber-landing | 3210 | 3210 | 3210 | HTTP | -| picture-backend | 3005 | 3005 | 3005 | HTTP | -| picture-web | 3150 | 3150 | 3150 | HTTP | -| PostgreSQL | 5432 | 5432 | N/A (Supabase) | TCP | -| Redis | 6379 | 6379 | 6379 | TCP | - -### B. Resource Requirements - -**Per Service (Minimum):** - -| Service Type | CPU | Memory | Disk | -|--------------|-----|--------|------| -| NestJS Backend | 0.5 vCPU | 512 MB | 1 GB | -| SvelteKit Web | 0.25 vCPU | 256 MB | 500 MB | -| Astro Landing (Nginx) | 0.1 vCPU | 128 MB | 100 MB | -| PostgreSQL | 1 vCPU | 2 GB | 50 GB | -| Redis | 0.25 vCPU | 256 MB | 5 GB | - -**Total Infrastructure (Production):** - -- **CPU:** ~15 vCPU -- **Memory:** ~15 GB -- **Disk:** ~100 GB (excluding databases) -- **Estimated Monthly Cost:** $150-$300 (single server) or $500-$800 (multi-region) - -### C. 
Useful Commands Reference - -```bash -# Build all Docker images -./scripts/build-all-images.sh - -# Deploy specific project -docker compose --profile chat up -d - -# View logs -docker compose logs -f chat-backend - -# Health check all services -./scripts/health-check-all.sh - -# Backup all databases -./scripts/backup-all.sh - -# Restore database -./scripts/restore-db.sh chat 2025-11-27 - -# Rollback deployment -./scripts/rollback.sh chat v1.5.2 - -# Scale service -docker compose up -d --scale chat-backend=3 -``` - ---- - -## Conclusion - -This deployment architecture provides: - -- **Scalability:** Horizontal scaling per service -- **Reliability:** Blue-green deployments with instant rollback -- **Security:** Non-root containers, read-only filesystems, secrets management -- **Observability:** Comprehensive logging, metrics, and alerting -- **Disaster Recovery:** Automated backups with <1 hour RTO -- **Developer Experience:** Local Docker Compose mirrors production -- **Cost Efficiency:** Shared infrastructure (PostgreSQL, Redis) reduces overhead - -**Next Steps:** - -1. Review this architecture with the team -2. Prioritize Phase 1 implementation -3. Create Dockerfiles for all services -4. Set up CI/CD pipelines -5. Deploy to staging environment - -**Questions or Feedback:** Contact the DevOps team or create an issue in the monorepo. 
- ---- - -**Document Version:** 1.0 -**Last Updated:** 2025-11-27 -**Maintained By:** Hive Mind Swarm - Analyst Agent diff --git a/docs/DEPLOYMENT_DIAGRAMS.md b/docs/DEPLOYMENT_DIAGRAMS.md deleted file mode 100644 index 3f311cbf0..000000000 --- a/docs/DEPLOYMENT_DIAGRAMS.md +++ /dev/null @@ -1,949 +0,0 @@ -# Manacore Monorepo - Deployment Architecture Diagrams - -**Visual representation of the deployment architecture** - ---- - -## System Overview - High-Level Architecture - -``` -┌────────────────────────────────────────────────────────────────────────────────────────┐ -│ MANACORE ECOSYSTEM │ -│ Production Deployment Architecture │ -└────────────────────────────────────────────────────────────────────────────────────────┘ - - [Internet Users] - │ - │ - ┌────────────────────┴────────────────────┐ - │ │ - ▼ ▼ - ┌──────────────────┐ ┌──────────────────┐ - │ Cloudflare CDN │ │ Cloudflare CDN │ - │ (Static Assets) │ │ (DDoS/Cache) │ - └────────┬─────────┘ └────────┬─────────┘ - │ │ - │ Astro Landing Pages │ App Traffic - │ (Nginx/Static) │ - ▼ ▼ - ┌──────────────────┐ ┌──────────────────┐ - │ Landing Servers │ │ Coolify/K8s LB │ - │ - chat.app │ │ (Load Balancer) │ - │ - picture.app │ └────────┬─────────┘ - │ - memoro.app │ │ - └──────────────────┘ ┌─────────────────┼─────────────────┐ - │ │ │ - ▼ ▼ ▼ - ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ - │ Web Apps │ │ API Backends │ │ Auth Service │ - │ (SvelteKit) │ │ (NestJS) │ │ (Core Auth) │ - ├──────────────┤ ├──────────────┤ ├──────────────┤ - │ chat-web │ │chat-backend │ │mana-core-auth│ - │ picture-web │ │picture-api │ │ Port: 3001 │ - │ memoro-web │ │maerchen-api │ └──────┬───────┘ - │ ...9 apps │ │ ...10 APIs │ │ - └──────┬───────┘ └──────┬───────┘ │ - │ │ │ - └─────────────────┼─────────────────┘ - │ - ┌─────────────────┴─────────────────┐ - │ │ - ▼ ▼ - ┌──────────────┐ ┌──────────────┐ - │ PostgreSQL │ │ Redis │ - │ (Supabase) │ │ (Cache) │ - ├──────────────┤ ├──────────────┤ - │ chat_db │ │ Sessions │ - 
│ picture_db │ │ Credits │ - │ memoro_db │ │ Rate Limits │ - │ manacore_db │ └──────────────┘ - └──────────────┘ -``` - ---- - -## Container Hierarchy - Docker Layer Structure - -``` -┌────────────────────────────────────────────────────────────────────────────────────────┐ -│ MULTI-STAGE BUILD ARCHITECTURE │ -│ (Optimized for pnpm Workspace Monorepo) │ -└────────────────────────────────────────────────────────────────────────────────────────┘ - - [STAGE 1: BASE] - │ - │ FROM node:20-alpine - │ COPY pnpm-workspace.yaml - │ COPY package.json - │ COPY pnpm-lock.yaml - │ - ▼ - ┌─────────────────────┐ - │ Workspace Setup │ - │ Size: ~150 MB │ - └──────────┬──────────┘ - │ - ┌────────────┴────────────┐ - │ │ - ▼ ▼ - [STAGE 2: DEPENDENCIES] [STAGE 2: DEPENDENCIES] - │ │ - │ pnpm install │ pnpm install - │ --frozen-lockfile │ --frozen-lockfile - │ │ - ▼ ▼ - ┌─────────────────────┐ ┌─────────────────────┐ - │ Backend Dependencies│ │ Frontend Dependencies│ - │ Size: ~400 MB │ │ Size: ~500 MB │ - └──────────┬──────────┘ └──────────┬───────────┘ - │ │ - │ COPY packages/ │ COPY packages/ - │ RUN pnpm build │ RUN pnpm build - │ │ - ▼ ▼ - [STAGE 3: BUILDER] [STAGE 3: BUILDER] - │ │ - │ COPY apps/*/backend │ COPY apps/*/web - │ RUN pnpm build │ RUN pnpm build - │ │ - ▼ ▼ - ┌─────────────────────┐ ┌─────────────────────┐ - │ Built Backend │ │ Built Frontend │ - │ (dist/) │ │ (build/) │ - │ Size: ~50 MB │ │ Size: ~20 MB │ - └──────────┬──────────┘ └──────────┬───────────┘ - │ │ - │ Multi-stage copy │ Multi-stage copy - │ │ - ▼ ▼ - [STAGE 4: PRODUCTION] [STAGE 4: PRODUCTION] - │ │ - │ FROM node:20-alpine │ FROM node:20-alpine - │ COPY --from=builder │ COPY --from=builder - │ USER nodejs (1001) │ USER nodejs (1001) - │ │ - ▼ ▼ - ┌─────────────────────┐ ┌─────────────────────┐ - │ chat-backend │ │ chat-web │ - │ Final: 180 MB │ │ Final: 170 MB │ - │ Port: 3002 │ │ Port: 3000 │ - └─────────────────────┘ └─────────────────────┘ - - [ASTRO LANDING PAGES] - │ - │ FROM node:20-alpine 
(builder) - │ RUN pnpm build (static files) - │ - ▼ - ┌─────────────────────┐ - │ Static Build │ - │ (dist/) │ - │ Size: ~5 MB │ - └──────────┬──────────┘ - │ - │ FROM nginx:1.25-alpine - │ COPY --from=builder dist/ - │ - ▼ - ┌─────────────────────┐ - │ chat-landing │ - │ Final: 45 MB │ - │ Port: 80 │ - └─────────────────────┘ - -CACHE BENEFITS: - Layer 1 (Base): 99% cache hit rate (workspace config rarely changes) - Layer 2 (Deps): 80% cache hit rate (dependencies change weekly) - Layer 3 (Build): 0% cache hit rate (source code changes frequently) - -TOTAL BUILD TIME: - - Without cache: ~12-15 minutes - - With cache: ~2-3 minutes -``` - ---- - -## Network Topology - Production Environment - -``` -┌────────────────────────────────────────────────────────────────────────────────────────┐ -│ NETWORK ARCHITECTURE │ -│ (Ports, Protocols, Security) │ -└────────────────────────────────────────────────────────────────────────────────────────┘ - - ┌─────────────────────────────────┐ - │ Internet (Public) │ - │ 0.0.0.0/0 │ - └────────────┬────────────────────┘ - │ - │ Port 443 (HTTPS) - │ Port 80 (HTTP → 443 redirect) - │ - ▼ - ┌─────────────────────────────────┐ - │ Cloudflare / Coolify Proxy │ - │ - DDoS Protection │ - │ - SSL Termination │ - │ - Rate Limiting │ - └────────────┬────────────────────┘ - │ - ┌───────────────────────┼───────────────────────┐ - │ │ │ - ▼ ▼ ▼ - ┌──────────────────┐ ┌──────────────────┐ ┌──────────────────┐ - │ Frontend Net │ │ Backend Net │ │ Data Net │ - │ (Public) │ │ (Private) │ │ (Private) │ - └──────────────────┘ └──────────────────┘ └──────────────────┘ - │ │ │ - │ │ │ - ┌───────┴───────┐ ┌───────┴───────┐ ┌───────┴───────┐ - │ │ │ │ │ │ - ▼ ▼ ▼ ▼ ▼ ▼ -┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ -│ Nginx │ │SvelteKit│ │ NestJS │ │ NestJS │ │Postgres │ │ Redis │ -│ (Astro) │ │ (Web) │ │ Backend │ │ Auth │ │(Supabase)│ │ Cache │ -├─────────┤ ├─────────┤ ├─────────┤ ├─────────┤ ├─────────┤ ├─────────┤ -│Port: 80 │ 
│Port:3100│ │Port:3002│ │Port:3001│ │Port:5432│ │Port:6379│ -│Public │ │Internal │ │Internal │ │Internal │ │Internal │ │Internal │ -└─────────┘ └─────────┘ └────┬────┘ └────┬────┘ └─────────┘ └─────────┘ - │ │ - │ DB Conn │ DB Conn - │ Pool: 10 │ Pool: 10 - │ │ - └───────────┴────────> PostgreSQL - │ - └────────> Redis - -NETWORK SECURITY RULES: - - ┌─────────────────────────────────────────────────────────────────┐ - │ INGRESS RULES (Firewall) │ - ├─────────────────────────────────────────────────────────────────┤ - │ Port 22 (SSH) - Source: DevOps IPs only │ - │ Port 80 (HTTP) - Source: 0.0.0.0/0 (Redirect to 443) │ - │ Port 443 (HTTPS) - Source: 0.0.0.0/0 │ - │ Port 3001-3200 (Apps) - DENY (Internal only) │ - │ Port 5432 (PostgreSQL) - DENY (Internal only) │ - │ Port 6379 (Redis) - DENY (Internal only) │ - └─────────────────────────────────────────────────────────────────┘ - - ┌─────────────────────────────────────────────────────────────────┐ - │ DOCKER NETWORK SEGMENTATION │ - ├─────────────────────────────────────────────────────────────────┤ - │ frontend-network: SvelteKit, Astro, Nginx │ - │ backend-network: NestJS APIs, Auth Service │ - │ data-network: PostgreSQL, Redis (no internet access) │ - └─────────────────────────────────────────────────────────────────┘ - -SSL/TLS CONFIGURATION: - - Certificate Provider: Let's Encrypt (Coolify auto-provision) - Protocols: TLSv1.2, TLSv1.3 - Cipher Suites: HIGH:!aNULL:!MD5:!3DES - HSTS: max-age=31536000; includeSubDomains; preload - Certificate Renewal: Automatic (30 days before expiry) -``` - ---- - -## Data Flow - Request Lifecycle - -``` -┌────────────────────────────────────────────────────────────────────────────────────────┐ -│ REQUEST LIFECYCLE (Chat API Example) │ -└────────────────────────────────────────────────────────────────────────────────────────┘ - -[1] User Request - │ - │ POST https://api-chat.manacore.app/api/chat/completions - │ Headers: Authorization: Bearer - │ - ▼ 
-┌───────────────────────────┐ -│ Cloudflare Edge (CDN) │ ← Geographically closest data center -│ - Check cache (miss) │ -│ - DDoS protection │ -│ - Rate limiting │ -└─────────────┬─────────────┘ - │ - │ HTTPS (TLS 1.3) - │ - ▼ -┌───────────────────────────┐ -│ Coolify Reverse Proxy │ -│ - SSL termination │ -│ - Route to container │ -│ - Health check │ -└─────────────┬─────────────┘ - │ - │ HTTP (internal network) - │ - ▼ -┌───────────────────────────┐ -│ Chat Backend (NestJS) │ -│ Container: chat-backend │ -│ Port: 3002 │ -└─────────────┬─────────────┘ - │ - │ [2] Authentication Middleware - │ - ▼ -┌───────────────────────────┐ -│ Verify JWT Token │ -│ ┌─────────────────────┐ │ -│ │ Extract manaToken │ │ -│ │ Decode JWT │ │ -│ │ Verify signature │ │ -│ │ Check expiry │ │ -│ └──────────┬──────────┘ │ -└─────────────┼─────────────┘ - │ - │ JWT Claims: { sub: userId, role: user, app_id: chat } - │ - ▼ -┌───────────────────────────┐ -│ Credits Check │ -│ ┌─────────────────────┐ │ -│ │ Query Redis cache │ │ -│ │ Key: credits:{id} │ │ -│ └──────────┬──────────┘ │ -└─────────────┼─────────────┘ - │ - │ Cache MISS - │ - ▼ -┌───────────────────────────┐ -│ Query PostgreSQL │ -│ ┌─────────────────────┐ │ -│ │ SELECT credits │ │ -│ │ FROM users │ │ -│ │ WHERE id = userId │ │ -│ └──────────┬──────────┘ │ -└─────────────┼─────────────┘ - │ - │ Credits: 50 (sufficient) - │ Cache: SET credits:{id} 50 EX 300 - │ - ▼ -┌───────────────────────────┐ -│ [3] Business Logic │ -│ ┌─────────────────────┐ │ -│ │ Parse request │ │ -│ │ Validate input │ │ -│ │ Call Azure OpenAI │ │ -│ └──────────┬──────────┘ │ -└─────────────┼─────────────┘ - │ - │ HTTP POST to Azure - │ - ▼ -┌───────────────────────────┐ -│ Azure OpenAI API │ -│ Model: GPT-4o-mini │ -│ Latency: ~800ms │ -└─────────────┬─────────────┘ - │ - │ AI Response - │ - ▼ -┌───────────────────────────┐ -│ [4] Save to Database │ -│ ┌─────────────────────┐ │ -│ │ INSERT message │ │ -│ │ UPDATE credits │ │ -│ │ (credits - 1) │ │ -│ 
└──────────┬──────────┘ │ -└─────────────┼─────────────┘ - │ - │ Transaction committed - │ Invalidate cache: DEL credits:{id} - │ - ▼ -┌───────────────────────────┐ -│ [5] Return Response │ -│ ┌─────────────────────┐ │ -│ │ HTTP 200 OK │ │ -│ │ { │ │ -│ │ "message": "...", │ │ -│ │ "credits": 49 │ │ -│ │ } │ │ -│ └──────────┬──────────┘ │ -└─────────────┼─────────────┘ - │ - │ Response time: ~1.2s total - │ - ▼ -[6] User receives AI response - -PERFORMANCE BREAKDOWN: - - Cloudflare routing: ~20ms - - SSL handshake: ~50ms (cached session) - - Authentication: ~10ms (JWT decode) - - Credits check (cache): ~2ms - - Azure OpenAI call: ~800ms (largest latency) - - Database write: ~15ms - - Response serialization: ~5ms - ──────────────────────────────── - TOTAL: ~902ms (p95 latency target: <1s) - -CACHING STRATEGY: - ✅ Redis: User credits (TTL: 5 min) - Reduces DB queries by 90% - ✅ Redis: AI model list (TTL: 1 hour) - Static metadata - ❌ No cache: Chat messages (always fresh from DB) - ❌ No cache: AI completions (unique per request) -``` - ---- - -## Deployment Flow - CI/CD Pipeline - -``` -┌────────────────────────────────────────────────────────────────────────────────────────┐ -│ CI/CD DEPLOYMENT PIPELINE │ -│ (GitHub Actions → Coolify) │ -└────────────────────────────────────────────────────────────────────────────────────────┘ - -[Developer] - │ - │ git commit -m "feat: add chat model selector" - │ git push origin feature/chat-model-selector - │ - ▼ -┌───────────────────────────┐ -│ GitHub (Pull Request) │ -│ - Code review │ -│ - Automated tests │ -└─────────────┬─────────────┘ - │ - │ PR approved & merged to main - │ - ▼ -┌───────────────────────────────────────────────────────────────────────────────────────┐ -│ GITHUB ACTIONS WORKFLOW │ -└───────────────────────────────────────────────────────────────────────────────────────┘ - - ▼ -┌───────────────────────────┐ -│ Job 1: Lint & Type Check │ ← Parallel execution -│ ┌─────────────────────┐ │ -│ │ pnpm lint │ │ -│ 
│ pnpm type-check │ │ -│ └──────────┬──────────┘ │ -└─────────────┼─────────────┘ - │ ✅ Passed - │ - ▼ -┌───────────────────────────┐ -│ Job 2: Build Docker Image│ -│ ┌─────────────────────┐ │ -│ │ docker buildx build │ │ -│ │ --cache-from cache │ │ -│ │ --cache-to cache │ │ -│ │ --push │ │ -│ └──────────┬──────────┘ │ -└─────────────┼─────────────┘ - │ - │ Image: ghcr.io/manacore/chat-backend:main-abc1234 - │ - ▼ -┌───────────────────────────┐ -│ Job 3: Security Scan │ -│ ┌─────────────────────┐ │ -│ │ trivy image scan │ │ -│ │ Severity: HIGH+ │ │ -│ └──────────┬──────────┘ │ -└─────────────┼─────────────┘ - │ ✅ No critical vulnerabilities - │ - ▼ -┌───────────────────────────────────────────────────────────────────────────────────────┐ -│ STAGING DEPLOYMENT │ -└───────────────────────────────────────────────────────────────────────────────────────┘ - - ▼ -┌───────────────────────────┐ -│ Deploy to Staging │ -│ ┌─────────────────────┐ │ -│ │ SSH to Coolify │ │ -│ │ docker compose pull │ │ -│ │ docker compose up │ │ -│ │ pnpm migration:run │ │ -│ └──────────┬──────────┘ │ -└─────────────┼─────────────┘ - │ - │ Staging URL: https://staging-api-chat.manacore.app - │ - ▼ -┌───────────────────────────┐ -│ Automated Smoke Tests │ -│ ┌─────────────────────┐ │ -│ │ curl /api/health │ │ ✅ 200 OK -│ │ curl /api/models │ │ ✅ 200 OK -│ │ POST /api/chat │ │ ✅ 200 OK -│ └──────────┬──────────┘ │ -└─────────────┼─────────────┘ - │ ✅ All tests passed - │ - ▼ -┌───────────────────────────┐ -│ Manual Approval Required │ ← Human checkpoint -│ ┌─────────────────────┐ │ -│ │ QA Team Review │ │ -│ │ Stakeholder Demo │ │ -│ │ Approve/Reject │ │ -│ └──────────┬──────────┘ │ -└─────────────┼─────────────┘ - │ ✅ Approved - │ - ▼ -┌───────────────────────────────────────────────────────────────────────────────────────┐ -│ PRODUCTION DEPLOYMENT (Blue-Green) │ -└───────────────────────────────────────────────────────────────────────────────────────┘ - - ▼ -┌───────────────────────────┐ -│ 
Deploy to GREEN Env │ -│ ┌─────────────────────┐ │ -│ │ Blue: v1.5.2 (100%) │ │ -│ │ Green: v1.6.0 (0%) │ │ -│ │ │ │ -│ │ docker compose up │ │ -│ │ --file green.yml │ │ -│ └──────────┬──────────┘ │ -└─────────────┼─────────────┘ - │ - │ Wait 30 seconds for startup - │ - ▼ -┌───────────────────────────┐ -│ Run Database Migrations │ -│ ┌─────────────────────┐ │ -│ │ pnpm migration:run │ │ ← Forward-compatible migrations only -│ └──────────┬──────────┘ │ -└─────────────┼─────────────┘ - │ - │ Migrations applied successfully - │ - ▼ -┌───────────────────────────┐ -│ Health Check GREEN │ -│ ┌─────────────────────┐ │ -│ │ curl localhost:3002 │ │ ✅ 200 OK -│ │ /api/health │ │ -│ └──────────┬──────────┘ │ -└─────────────┼─────────────┘ - │ - │ GREEN environment healthy - │ - ▼ -┌───────────────────────────┐ -│ Canary Deployment │ -│ ┌─────────────────────┐ │ -│ │ Blue: 90% traffic │ │ -│ │ Green: 10% traffic │ │ -│ │ │ │ -│ │ Monitor for 10 min │ │ -│ └──────────┬──────────┘ │ -└─────────────┼─────────────┘ - │ - │ Metrics: - │ - Error rate: 0.1% (✅ <1%) - │ - Response time: 850ms (✅ <1s) - │ - No customer complaints - │ - ▼ -┌───────────────────────────┐ -│ Full Cutover │ -│ ┌─────────────────────┐ │ -│ │ Blue: 0% traffic │ │ -│ │ Green: 100% traffic │ │ -│ └──────────┬──────────┘ │ -└─────────────┼─────────────┘ - │ - │ Traffic switched to GREEN - │ - ▼ -┌───────────────────────────┐ -│ Rollback Window (1 hour) │ ← Keep BLUE running -│ ┌─────────────────────┐ │ -│ │ Monitor metrics │ │ -│ │ If issues: │ │ -│ │ Switch back BLUE │ │ -│ └──────────┬──────────┘ │ -└─────────────┼─────────────┘ - │ - │ ✅ No issues detected - │ - ▼ -┌───────────────────────────┐ -│ Decommission BLUE │ -│ ┌─────────────────────┐ │ -│ │ docker compose down │ │ -│ │ --file blue.yml │ │ -│ └──────────┬──────────┘ │ -└─────────────┼─────────────┘ - │ - │ Deployment completed successfully - │ - ▼ -[Production v1.6.0 Live] - -DEPLOYMENT TIMELINE: - - Code merge to main: 0:00 - - CI/CD pipeline 
start: 0:01 - - Lint & build: 0:05 (4 min) - - Staging deployment: 0:07 (2 min) - - Smoke tests: 0:08 (1 min) - - Manual approval: 0:30 (22 min - human review) - - Production deploy (GREEN): 0:35 (5 min) - - Canary monitoring: 0:45 (10 min) - - Full cutover: 0:46 (1 min) - - Rollback window: 1:46 (60 min) - ───────────────────────────────────────────── - TOTAL TIME TO PRODUCTION: ~2 hours (mostly manual approval) - -ROLLBACK PROCEDURE (if needed): - 1. Detect issue (error spike, customer reports) - 2. Run: coolify switch-deployment chat blue - 3. Traffic reverts to BLUE (v1.5.2) in <30 seconds - 4. Investigate issue in GREEN (offline) - 5. Fix and redeploy when ready -``` - ---- - -## Monitoring Dashboard Layout - -``` -┌────────────────────────────────────────────────────────────────────────────────────────┐ -│ GRAFANA MONITORING DASHBOARD │ -│ (Real-time Metrics) │ -└────────────────────────────────────────────────────────────────────────────────────────┘ - -┌─────────────────────────────────────────────────────────────────────────────────────┐ -│ SYSTEM HEALTH OVERVIEW Last Update: 12:34:56 │ -├─────────────────────────────────────────────────────────────────────────────────────┤ -│ │ -│ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ │ -│ │ Services │ │ Request Rate │ │ Error Rate │ │ Avg Latency │ │ -│ │ 38 / 39 │ │ 1,234 req/s │ │ 0.2% │ │ 450 ms │ │ -│ │ 🟢 Healthy │ │ 🟢 Normal │ │ 🟢 Good │ │ 🟢 Fast │ │ -│ └───────────────┘ └───────────────┘ └───────────────┘ └───────────────┘ │ -│ │ -│ ⚠️ 1 Service Warning: picture-backend (High Memory: 85%) │ -│ │ -└─────────────────────────────────────────────────────────────────────────────────────┘ - -┌─────────────────────────────────────────────────────────────────────────────────────┐ -│ SERVICE STATUS (by Project) │ -├─────────────────────────────────────────────────────────────────────────────────────┤ -│ │ -│ Project │ Backend │ Web │ Landing │ Status │ Last Deploy │ -│ 
─────────────────┼─────────┼────────┼─────────┼────────┼─────────────────────── │ -│ mana-core-auth │ 🟢 UP │ - │ - │ 100% │ 2025-11-26 10:23 │ -│ chat │ 🟢 UP │ 🟢 UP │ 🟢 UP │ 100% │ 2025-11-27 12:15 │ -│ maerchenzauber │ 🟢 UP │ 🟢 UP │ 🟢 UP │ 100% │ 2025-11-25 14:45 │ -│ picture │ 🟡 WARN│ 🟢 UP │ 🟢 UP │ 100% │ 2025-11-27 08:30 │ -│ memoro │ - │ 🟢 UP │ 🟢 UP │ 100% │ 2025-11-26 16:00 │ -│ uload │ 🟢 UP │ 🟢 UP │ 🟢 UP │ 100% │ 2025-11-24 11:20 │ -│ │ -└─────────────────────────────────────────────────────────────────────────────────────┘ - -┌─────────────────────────────────────────────────────────────────────────────────────┐ -│ RESPONSE TIME (p95 Latency) [Last 24 hours] │ -├─────────────────────────────────────────────────────────────────────────────────────┤ -│ │ -│ 1000ms │ ╭╮ │ -│ │ ╭╯╰╮ │ -│ 800ms │ ╭╮ ╭╯ ╰╮ │ -│ │ ╭╯╰╮ ╭╯ ╰╮ │ -│ 600ms │ ╭╮ ╭╯ ╰╮ ╭╯ ╰╮ │ -│ │ ╭╮ ╭╯╰╮ ╭╯ ╰╮╭╯ ╰╮ │ -│ 400ms │─────────╭╯╰───────╯──╰──╯──────╰╯──────────╰────────── │ -│ │ ╭╯ │ -│ 200ms │ ╭────╯ │ -│ │───╯ │ -│ 0ms └─────────────────────────────────────────────────────────────────────── │ -│ 0h 6h 12h 18h 24h │ -│ │ -│ Legend: ─ chat-backend ─ picture-backend ─ Target (500ms) │ -│ │ -└─────────────────────────────────────────────────────────────────────────────────────┘ - -┌─────────────────────────────────────────────────────────────────────────────────────┐ -│ RESOURCE UTILIZATION │ -├─────────────────────────────────────────────────────────────────────────────────────┤ -│ │ -│ CPU Usage (%) Memory Usage (%) Disk I/O (MB/s) │ -│ ┌────────────────┐ ┌────────────────┐ ┌────────────────┐ │ -│ │ [████████░░] 45│ │ [██████░░░░] 60│ │ [███░░░░░░░] 30│ │ -│ └────────────────┘ └────────────────┘ └────────────────┘ │ -│ │ -│ Top Consumers: Top Consumers: Top Consumers: │ -│ 1. picture-api 25% 1. picture-api 85% 1. postgres 25 MB/s │ -│ 2. chat-api 10% 2. chat-web 70% 2. redis 3 MB/s │ -│ 3. postgres 8% 3. postgres 60% 3. 
chat-api 2 MB/s │ -│ │ -└─────────────────────────────────────────────────────────────────────────────────────┘ - -┌─────────────────────────────────────────────────────────────────────────────────────┐ -│ ACTIVE ALERTS │ -├─────────────────────────────────────────────────────────────────────────────────────┤ -│ │ -│ ⚠️ WARNING │ picture-backend │ High Memory Usage (85% > 80%) │ 12:30:15 │ -│ ℹ️ INFO │ chat-backend │ Slow Query Detected (250ms) │ 12:28:42 │ -│ │ -│ 🔕 No Critical Alerts │ -│ │ -└─────────────────────────────────────────────────────────────────────────────────────┘ - -┌─────────────────────────────────────────────────────────────────────────────────────┐ -│ DATABASE PERFORMANCE │ -├─────────────────────────────────────────────────────────────────────────────────────┤ -│ │ -│ Database │ Connections │ Query Time (avg) │ Slow Queries │ Cache Hit Rate │ -│ ───────────────┼─────────────┼──────────────────┼──────────────┼────────────── │ -│ chat │ 8 / 10 │ 45 ms │ 3 │ 98.5% │ -│ picture │ 9 / 10 │ 62 ms │ 8 │ 96.2% │ -│ manacore │ 5 / 10 │ 28 ms │ 0 │ 99.1% │ -│ │ -│ 🔍 View Slow Queries │ 📊 Connection Pool Analysis │ -│ │ -└─────────────────────────────────────────────────────────────────────────────────────┘ - -┌─────────────────────────────────────────────────────────────────────────────────────┐ -│ EXTERNAL DEPENDENCIES │ -├─────────────────────────────────────────────────────────────────────────────────────┤ -│ │ -│ Service │ Status │ Latency │ Success Rate │ Last Check │ -│ ─────────────────────┼─────────┼─────────┼──────────────┼──────────────────── │ -│ Azure OpenAI │ 🟢 UP │ 850 ms │ 99.9% │ 12:34:50 │ -│ Supabase (chat) │ 🟢 UP │ 35 ms │ 100% │ 12:34:52 │ -│ Supabase (picture) │ 🟢 UP │ 42 ms │ 100% │ 12:34:48 │ -│ Redis Cache │ 🟢 UP │ 2 ms │ 100% │ 12:34:55 │ -│ │ -└─────────────────────────────────────────────────────────────────────────────────────┘ - -ACTION BUTTONS: - [🔄 Refresh Dashboard] [📥 Export Data] [🔔 Configure Alerts] [📖 View Logs] -``` 
- ---- - -## Disaster Recovery Flowchart - -``` -┌────────────────────────────────────────────────────────────────────────────────────────┐ -│ DISASTER RECOVERY DECISION TREE │ -└────────────────────────────────────────────────────────────────────────────────────────┘ - - [INCIDENT DETECTED] - │ - │ Alert triggered or customer report - │ - ▼ - ┌──────────────────┐ - │ What failed? │ - └────────┬─────────┘ - │ - ┌────────────────────┼────────────────────┐ - │ │ │ - ▼ ▼ ▼ - ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ - │ Service │ │ Database │ │ Full Server │ - │ Crash │ │ Corruption │ │ Failure │ - └──────┬───────┘ └──────┬───────┘ └──────┬───────┘ - │ │ │ - ▼ ▼ ▼ - ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ - │ Health check │ │ Verify scope │ │ Verify total │ - │ failing? │ │ of corruption │ │ server down │ - └────────┬────────┘ └────────┬────────┘ └────────┬────────┘ - │ │ │ - ▼ YES ▼ Database DOWN ▼ YES - ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ - │ Restart │ │ Stop affected │ │ Activate │ - │ container │ │ services │ │ standby server │ - ├─────────────────┤ ├─────────────────┤ ├─────────────────┤ - │ docker compose │ │ docker compose │ │ 1. Start services│ - │ restart │ │ stop chat-api │ │ 2. Restore DBs │ - │ chat-backend │ │ │ │ 3. Update DNS │ - └────────┬────────┘ └────────┬────────┘ └────────┬────────┘ - │ │ │ - │ Wait 30s │ Download backup │ ETA: 2 hours - │ │ │ - ▼ ▼ ▼ - ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ - │ Health check │ │ Restore from │ │ Verify services │ - │ passing? 
│ │ latest backup │ │ healthy │ - └────────┬────────┘ ├─────────────────┤ └────────┬────────┘ - │ │ pg_restore │ │ - ▼ YES │ chat.dump │ ▼ YES - ┌─────────────────┐ └────────┬────────┘ ┌─────────────────┐ - │ ✅ RESOLVED │ │ │ ✅ RESOLVED │ - │ RTO: 2 min │ ▼ DB UP │ RTO: 2 hours │ - └─────────────────┘ ┌─────────────────┐ └─────────────────┘ - │ Restart services│ - ├─────────────────┤ - │ docker compose │ - │ start chat-api │ - └────────┬────────┘ - │ - ▼ Services UP - ┌─────────────────┐ - │ Verify data │ - │ integrity │ - └────────┬────────┘ - │ - ▼ Verified - ┌─────────────────┐ - │ ✅ RESOLVED │ - │ RTO: 20 min │ - │ RPO: <24 hours │ - └─────────────────┘ - -POST-INCIDENT ACTIONS (All Scenarios): - 1. Document timeline in incident log - 2. Notify stakeholders of resolution - 3. Schedule post-mortem meeting - 4. Identify root cause - 5. Implement preventive measures - 6. Update runbooks - -ESCALATION PATHS: - - Service crash (2+ restarts fail) → Call DevOps lead - - Database corruption → Call Database admin + CTO - - Full server failure → Call Infrastructure team + CEO - - Security breach → Call Security team + Legal - -COMMUNICATION TEMPLATE: - Subject: [INCIDENT] Service Downtime - chat-backend - - Status: INVESTIGATING / RESOLVED - Impact: API requests failing (100% error rate) - Affected Users: ~500 active users - Started: 2025-11-27 12:34 UTC - Resolved: 2025-11-27 12:38 UTC (4 min) - RTO: 2 minutes - - Timeline: - - 12:34 UTC: Alert triggered (health check fail) - - 12:35 UTC: Container restarted - - 12:36 UTC: Health check passing - - 12:38 UTC: Verified all API endpoints working - - Root Cause: OOM killer terminated process (memory leak) - - Action Items: - 1. Increase memory limit to 1GB (from 512MB) - 2. Add memory monitoring alert - 3. 
Investigate memory leak in code -``` - ---- - -## Legend & Symbols - -``` -┌────────────────────────────────────────────────────────────────────────────────────────┐ -│ DIAGRAM LEGEND & SYMBOLS │ -└────────────────────────────────────────────────────────────────────────────────────────┘ - -STATUS INDICATORS: - 🟢 - Healthy / Running / Success - 🟡 - Warning / Degraded Performance - 🔴 - Critical / Down / Failed - ⚪ - Unknown / Not Monitored - ⚠️ - Warning Alert - 🚨 - Critical Alert - ℹ️ - Informational Message - -NETWORK SYMBOLS: - │ - Vertical connection - ─ - Horizontal connection - ┌ └ ┐ ┘ - Corners - ├ ┤ ┬ ┴ ┼ - Junctions - → ← - Data flow direction - ▼ ▲ - Process flow direction - -SERVICE TYPES: - [NestJS] - Backend API service - [SvelteKit]- Web frontend service - [Astro] - Static landing page - [Postgres] - Database - [Redis] - Cache/session store - [Nginx] - Reverse proxy / static server - -SECURITY LEVELS: - Public - Accessible from internet (0.0.0.0/0) - Internal - Private network only (Docker network) - Protected - Firewall rules + authentication required - -DEPLOYMENT STAGES: - Development - Local Docker Compose - Staging - Coolify (separate server) - Production - Coolify (production server) - -ABBREVIATIONS: - RTO - Recovery Time Objective - RPO - Recovery Point Objective - CDN - Content Delivery Network - SSL - Secure Sockets Layer - TLS - Transport Layer Security - HSTS - HTTP Strict Transport Security - CORS - Cross-Origin Resource Sharing - JWT - JSON Web Token - ORM - Object-Relational Mapping - APM - Application Performance Monitoring - CI/CD- Continuous Integration / Continuous Deployment -``` - ---- - -## Quick Reference - -### Health Check URLs - -``` -mana-core-auth: https://auth.manacore.app/api/health -chat-backend: https://api-chat.manacore.app/api/health -chat-web: https://app-chat.manacore.app/api/health -picture-backend: https://api-picture.manacore.app/api/health -maerchenzauber-backend:https://api-maerchenzauber.manacore.app/api/health 
-``` - -### Emergency Contacts - -``` -DevOps Lead: +XX XXX XXX XXXX (on-call: Mon-Fri 9-5) -Database Admin: +XX XXX XXX XXXX (on-call: 24/7) -Infrastructure: devops@manacore.app -Security Team: security@manacore.app -Status Page: https://status.manacore.app -``` - -### Common Commands - -```bash -# Restart service -docker compose restart chat-backend - -# View logs (last 100 lines) -docker compose logs --tail 100 -f chat-backend - -# Check resource usage -docker stats - -# Rollback deployment -./scripts/rollback.sh chat v1.5.2 - -# Restore database -./scripts/restore-db.sh chat 2025-11-27 - -# Run health checks -./scripts/health-check-all.sh -``` - ---- - -**End of Deployment Diagrams** diff --git a/docs/DEPLOYMENT_HETZNER.md b/docs/DEPLOYMENT_HETZNER.md deleted file mode 100644 index 4ac28745d..000000000 --- a/docs/DEPLOYMENT_HETZNER.md +++ /dev/null @@ -1,602 +0,0 @@ -# Hetzner Deployment Guide - -Dieses Dokument beschreibt verschiedene Deployment-Optionen für das Manacore Monorepo auf Hetzner Cloud Infrastructure. 
- -## Inhaltsverzeichnis - -- [Bestandsaufnahme](#bestandsaufnahme) -- [Option 1: Single Server](#option-1-single-server-einfach--günstig) -- [Option 2: Dual-Server mit Floating IP](#option-2-dual-server-mit-floating-ip) -- [Option 3: Kubernetes Cluster](#option-3-kubernetes-cluster-enterprise) -- [Option 4: Hybrid mit Docker Swarm](#option-4-hybrid-mit-docker-swarm-empfohlen) -- [Vergleichstabelle](#vergleichstabelle) -- [Empfehlung](#empfehlung) -- [Implementierungsdetails](#implementierungsdetails) - ---- - -## Bestandsaufnahme - -### Zu deployende Komponenten - -| Typ | Anzahl | Technologie | Deployment-Ziel | -|-----|--------|-------------|-----------------| -| **Backends** | 10 | NestJS | Container | -| **Web Apps** | 11 | SvelteKit (SSR) | Container | -| **Landing Pages** | 11 | Astro (statisch) | CDN/Static | -| **Auth Service** | 1 | NestJS | Container | -| **Datenbanken** | 2 | PostgreSQL + Redis | Dedicated/Managed | -| **Mobile Apps** | 10 | Expo | App Stores (nicht Hetzner) | - -### Backend-Services im Detail - -| Service | Package | Port | Datenbank | -|---------|---------|------|-----------| -| mana-core-auth | `mana-core-auth` | 3001 | PostgreSQL + Redis | -| Chat Backend | `@chat/backend` | 3002 | PostgreSQL | -| Maerchenzauber Backend | `@maerchenzauber/backend` | 3003 | Supabase | -| Manadeck Backend | `@manadeck/backend` | 3004 | Supabase | -| Picture Backend | `@picture/backend` | 3005 | PostgreSQL | -| Transcriber Backend | `@transcriber/backend` | 3006 | Filesystem | -| Nutriphi Backend | `@nutriphi/backend` | 3007 | Supabase | -| News API | `@news/api` | 3008 | PostgreSQL | -| Quote Backend | `@quote/backend` | 3009 | PostgreSQL | -| Uload Backend | `@uload/backend` | 3010 | PostgreSQL | - -### Ressourcenanforderungen (geschätzt) - -| Komponente | RAM | CPU | Storage | -|------------|-----|-----|---------| -| NestJS Backend (pro Service) | 200-400 MB | 0.25 vCPU | 100 MB | -| SvelteKit Web App (pro App) | 150-300 MB | 0.25 vCPU | 50 MB | -| 
PostgreSQL | 1-2 GB | 1 vCPU | 10-50 GB | -| Redis | 256-512 MB | 0.25 vCPU | 1 GB | -| Traefik/Nginx | 128 MB | 0.25 vCPU | 100 MB | - -**Gesamt (Minimum):** ~8 GB RAM, 4 vCPU, 100 GB Storage - ---- - -## Option 1: Single Server (Einfach & Günstig) - -### Kosten: ~€30-50/Monat - -### Architektur - -``` -┌─────────────────────────────────────────────────────────┐ -│ Hetzner CX41/CX51 │ -│ (8 vCPU, 16-32 GB RAM) │ -├─────────────────────────────────────────────────────────┤ -│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ -│ │ Traefik │ │ Docker │ │ PostgreSQL │ │ -│ │ (Reverse │ │ Compose │ │ Redis │ │ -│ │ Proxy) │ │ (All Apps) │ │ │ │ -│ └─────────────┘ └─────────────┘ └─────────────┘ │ -│ │ -│ Backends: 10 Container (~200MB RAM each) │ -│ Web Apps: 10 Container (SSR) │ -│ Landing: Statisch via Traefik │ -└─────────────────────────────────────────────────────────┘ -``` - -### Hetzner Server Empfehlung - -| Server | vCPU | RAM | Storage | Preis | -|--------|------|-----|---------|-------| -| CX41 | 8 | 16 GB | 160 GB | ~€28/Monat | -| CX51 | 16 | 32 GB | 240 GB | ~€58/Monat | - -### Vorteile - -- Einfache Verwaltung -- Günstig -- Schnelle Einrichtung -- Ein Server = ein Backup - -### Nachteile - -- Kein Failover (Single Point of Failure) -- Downtime bei Updates -- Keine horizontale Skalierung -- Server-Ausfall = kompletter Ausfall - -### Wann geeignet? 
- -- Entwicklung/Staging -- MVP/Early Stage -- Budget-kritische Projekte -- Wenig Traffic (<1000 DAU) - ---- - -## Option 2: Dual-Server mit Floating IP - -### Kosten: ~€80-120/Monat - -### Architektur - -``` - ┌─────────────────┐ - │ Floating IP │ - │ (Failover) │ - └────────┬────────┘ - │ - ┌──────────────┴──────────────┐ - │ │ - ┌─────────▼─────────┐ ┌──────────▼─────────┐ - │ Server 1 (CX31) │ │ Server 2 (CX31) │ - │ PRIMARY │ │ STANDBY │ - ├───────────────────┤ ├────────────────────┤ - │ • Traefik │ │ • Traefik │ - │ • All Backends │◄─────►│ • All Backends │ - │ • Web Apps │ sync │ • Web Apps │ - │ • PostgreSQL │ │ • PostgreSQL │ - │ (Primary) │ │ (Replica) │ - │ • Redis │ │ • Redis Sentinel │ - └───────────────────┘ └────────────────────┘ - │ │ - └──────────────┬──────────────┘ - │ - ┌────────▼────────┐ - │ Hetzner Volume │ - │ (Shared Data) │ - └─────────────────┘ -``` - -### Komponenten - -| Komponente | Funktion | -|------------|----------| -| **Floating IP** | Virtuelle IP, die zwischen Servern wechseln kann | -| **Keepalived** | VRRP-Daemon für automatisches Failover | -| **PostgreSQL Streaming Replication** | Echtzeit-Datenbank-Replikation | -| **Redis Sentinel** | Redis High Availability | -| **Litestream/pgBackRest** | Kontinuierliche Backups | - -### Server-Konfiguration - -```yaml -# Server 1 & 2 identisch -Server: CX31 -vCPU: 4 -RAM: 8 GB -Storage: 80 GB -Kosten: ~€15/Monat pro Server - -# Zusätzlich -Floating IP: €4/Monat -Volume (100GB): €4.40/Monat -``` - -### Failover-Prozess - -1. Keepalived erkennt Server-Ausfall (Health Check) -2. Floating IP wird auf Standby-Server umgeleitet (~30 Sekunden) -3. PostgreSQL Replica wird zu Primary promoted -4. 
Redis Sentinel wählt neuen Master - -### Vorteile - -- Automatisches Failover (~30 Sekunden) -- Keine Downtime bei Updates (Rolling) -- Datenbank-Replikation -- Gutes Preis-Leistungs-Verhältnis - -### Nachteile - -- Mehr Komplexität als Single Server -- PostgreSQL Failover kann komplex sein -- Keepalived-Konfiguration erforderlich - -### Wann geeignet? - -- Produktions-Workloads -- 99.9% Uptime-Anforderung -- Mittlerer Traffic (1000-10000 DAU) - ---- - -## Option 3: Kubernetes Cluster (Enterprise) - -### Kosten: ~€150-300/Monat - -### Architektur - -``` - ┌─────────────────┐ - │ Hetzner LB │ - │ (Cloud-native) │ - └────────┬────────┘ - │ - ┌─────────────────────────┼─────────────────────────┐ - │ │ │ -┌───────▼───────┐ ┌────────▼────────┐ ┌────────▼───────┐ -│ Node 1 │ │ Node 2 │ │ Node 3 │ -│ (CX21) │ │ (CX21) │ │ (CX21) │ -├───────────────┤ ├─────────────────┤ ├────────────────┤ -│ k3s Worker │ │ k3s Worker │ │ k3s Worker │ -│ • Pods │ │ • Pods │ │ • Pods │ -│ • Services │ │ • Services │ │ • Services │ -└───────────────┘ └─────────────────┘ └────────────────┘ - │ │ │ - └─────────────────────────┼─────────────────────────┘ - │ - ┌─────────────┴─────────────┐ - │ │ - ┌────────▼────────┐ ┌──────────▼─────────┐ - │ Hetzner Managed │ │ Hetzner Volume │ - │ PostgreSQL │ │ (Persistent) │ - │ (Optional) │ │ │ - └─────────────────┘ └────────────────────┘ -``` - -### Kubernetes Stack - -```yaml -Cluster: - - k3s (leichtgewichtiges Kubernetes) - - 3 Nodes minimum für HA Control Plane - -Ingress: - - Traefik (in k3s integriert) - - oder NGINX Ingress Controller - -TLS: - - cert-manager - - Let's Encrypt (automatische Zertifikate) - -Storage: - - Longhorn (Distributed Block Storage) - - oder Hetzner CSI Driver - -GitOps: - - ArgoCD oder Flux - - Automatische Deployments aus Git - -Monitoring: - - Prometheus - - Grafana - - Alertmanager - -Logging: - - Loki - - Promtail -``` - -### Server-Konfiguration - -```yaml -# k3s Nodes -3x CX21: - vCPU: 2 - RAM: 4 GB - Storage: 40 GB - 
Kosten: ~€6/Monat pro Node = €18/Monat - -# Oder für mehr Ressourcen -3x CX31: - vCPU: 4 - RAM: 8 GB - Storage: 80 GB - Kosten: ~€15/Monat pro Node = €45/Monat - -# Load Balancer -Hetzner LB: €5/Monat - -# Volumes für Persistent Storage -3x 50GB Volumes: ~€7/Monat -``` - -### Vorteile - -- Auto-Scaling (Horizontal Pod Autoscaler) -- Self-Healing (automatischer Pod-Restart) -- Rolling Updates ohne Downtime -- Deklarative Konfiguration -- Multi-Zone möglich -- Industry Standard - -### Nachteile - -- Hohe Komplexität -- Steile Lernkurve -- Overhead für kleine Teams -- Mehr Ressourcen für Control Plane - -### Wann geeignet? - -- Enterprise-Anforderungen -- Großes Team mit K8s-Erfahrung -- Hoher Traffic (>10000 DAU) -- Microservices-Architektur -- Multi-Tenant-Anforderungen - ---- - -## Option 4: Hybrid mit Docker Swarm (Empfohlen) - -### Kosten: ~€100-150/Monat - -### Architektur - -``` -┌─────────────────────────────────────────────────────────────────┐ -│ HETZNER CLOUD │ -├─────────────────────────────────────────────────────────────────┤ -│ │ -│ ┌─────────────────┐ ┌─────────────────┐ │ -│ │ Load Balancer │ │ Cloud Firewall │ │ -│ │ (Hetzner LB) │ │ │ │ -│ └────────┬────────┘ └──────────────────┘ │ -│ │ │ -│ ┌────────┴────────────────────────────────┐ │ -│ │ │ │ -│ ▼ ▼ │ -│ ┌──────────────────┐ ┌──────────────────┐ │ -│ │ App Server 1 │ │ App Server 2 │ │ -│ │ (CX31) │ │ (CX31) │ │ -│ ├──────────────────┤ ├──────────────────┤ │ -│ │ Docker Swarm │◄────────────►│ Docker Swarm │ │ -│ │ Manager + Worker │ Overlay │ Manager + Worker │ │ -│ │ │ Network │ │ │ -│ │ • All Backends │ │ • All Backends │ │ -│ │ • Web Apps │ │ • Web Apps │ │ -│ │ • Traefik │ │ • Traefik │ │ -│ └──────────────────┘ └──────────────────┘ │ -│ │ │ │ -│ └────────────────┬───────────────┘ │ -│ │ │ -│ ┌────────▼────────┐ │ -│ │ DB Server │ │ -│ │ (CX21) │ │ -│ ├─────────────────┤ │ -│ │ • PostgreSQL 16 │ │ -│ │ • Redis 7 │ │ -│ │ • Daily Backups │ │ -│ │ → Object │ │ -│ │ Storage │ │ -│ 
└─────────────────┘ │ -│ │ -│ ┌─────────────────────────────────────────────────────┐ │ -│ │ Hetzner Object Storage │ │ -│ │ (Backups, Static Assets, Media) │ │ -│ └─────────────────────────────────────────────────────┘ │ -│ │ -└─────────────────────────────────────────────────────────────────┘ - - │ - ▼ - ┌───────────────────────────────┐ - │ EXTERNAL CDN │ - │ (Cloudflare Free) │ - │ • Static Assets │ - │ • DDoS Protection │ - │ • SSL Termination │ - └───────────────────────────────┘ -``` - -### Warum Docker Swarm? - -Docker Swarm bietet die wichtigsten Features von Kubernetes mit deutlich weniger Komplexität: - -| Feature | Docker Swarm | Kubernetes | -|---------|--------------|------------| -| Lernkurve | Niedrig | Hoch | -| Setup-Zeit | Minuten | Stunden/Tage | -| Service Discovery | Built-in | Benötigt Config | -| Load Balancing | Built-in | Benötigt Ingress | -| Rolling Updates | Built-in | Built-in | -| Secrets Management | Built-in | Built-in | -| Ressourcen-Overhead | Minimal | Signifikant | - -### Server-Konfiguration - -```yaml -# App Server 1 & 2 -2x CX31: - vCPU: 4 - RAM: 8 GB - Storage: 80 GB - Kosten: €15/Monat × 2 = €30/Monat - -# Database Server -1x CX21: - vCPU: 2 - RAM: 4 GB - Storage: 40 GB + 100GB Volume - Kosten: €6/Monat + €4.40/Monat = €10.40/Monat - -# Load Balancer -Hetzner LB: - Kosten: €5/Monat - -# Object Storage (Backups) -100 GB: - Kosten: ~€5/Monat - -# Cloud Firewall -Kostenlos - -# Private Network -Kostenlos - -───────────────────────────── -Gesamt: ~€50-55/Monat Basis - + Traffic-Kosten -``` - -### Docker Swarm Stack - -```yaml -# docker-stack.yml -version: "3.8" - -services: - # Reverse Proxy - traefik: - image: traefik:v3.0 - deploy: - replicas: 2 - placement: - constraints: - - node.role == manager - ports: - - "80:80" - - "443:443" - volumes: - - /var/run/docker.sock:/var/run/docker.sock:ro - - traefik-certs:/letsencrypt - - # Auth Service - mana-core-auth: - image: ghcr.io/your-org/mana-core-auth:latest - deploy: - replicas: 
2 - update_config: - parallelism: 1 - delay: 10s - failure_action: rollback - restart_policy: - condition: on-failure - environment: - - DATABASE_URL=postgresql://... - labels: - - "traefik.http.routers.auth.rule=Host(`auth.yourdomain.com`)" - - # Backend Services (repeat for each) - chat-backend: - image: ghcr.io/your-org/chat-backend:latest - deploy: - replicas: 2 - labels: - - "traefik.http.routers.chat-api.rule=Host(`api.chat.yourdomain.com`)" - - # Web Apps (repeat for each) - chat-web: - image: ghcr.io/your-org/chat-web:latest - deploy: - replicas: 2 - labels: - - "traefik.http.routers.chat-web.rule=Host(`chat.yourdomain.com`)" - -volumes: - traefik-certs: - -networks: - default: - driver: overlay - attachable: true -``` - -### Vorteile - -- Einfacher als Kubernetes -- Native Docker-Erfahrung nutzbar -- Built-in Service Discovery & Load Balancing -- Rolling Updates ohne Downtime -- Overlay-Network für sichere Kommunikation -- Hetzner LB für echte HA - -### Nachteile - -- Weniger Features als Kubernetes -- Kleineres Ökosystem -- Kein HPA (Horizontal Pod Autoscaler) - -### Wann geeignet? 
- -- Produktions-Workloads -- Kleine bis mittlere Teams -- Docker-Erfahrung vorhanden -- Mittlerer Traffic (1000-50000 DAU) - ---- - -## Vergleichstabelle - -| Feature | Option 1 | Option 2 | Option 3 | Option 4 | -|---------|----------|----------|----------|----------| -| **Kosten/Monat** | €30-50 | €80-120 | €150-300 | €100-150 | -| **Ausfallsicherheit** | ❌ | ✅ | ✅✅ | ✅ | -| **Auto-Failover** | ❌ | ✅ (30s) | ✅ (<10s) | ✅ (10-30s) | -| **Komplexität** | Niedrig | Mittel | Hoch | Mittel | -| **Skalierbarkeit** | ❌ | ⚠️ | ✅✅ | ✅ | -| **Zero-Downtime Deploy** | ❌ | ✅ | ✅ | ✅ | -| **Wartungsaufwand** | Niedrig | Mittel | Hoch | Mittel | -| **Backup/Recovery** | Manuell | Auto | Auto | Auto | -| **Setup-Zeit** | 1 Tag | 2-3 Tage | 1 Woche | 2-3 Tage | -| **Team-Größe** | 1 Person | 1-2 Personen | 2+ Personen | 1-2 Personen | - ---- - -## Empfehlung - -### Für Manacore Monorepo: **Option 4 (Hybrid mit Docker Swarm)** - -**Begründung:** - -1. **Richtige Balance** zwischen Komplexität und Features -2. **Docker Swarm** ist deutlich einfacher als Kubernetes, bietet aber: - - Service Discovery - - Load Balancing - - Rolling Updates - - Health Checks - - Secrets Management -3. **Hetzner Load Balancer** für echte HA ohne komplexe Floating-IP-Konfiguration -4. **Separater DB-Server** für: - - Bessere Performance - - Einfachere Backups - - Unabhängige Skalierung -5. **Cloudflare** als kostenloses CDN + DDoS-Schutz -6. **Object Storage** für Backups und Media-Dateien - -### Migrationspfad - -``` -Option 1 (Dev/Staging) - ↓ -Option 4 (Production) - ↓ -Option 3 (bei Bedarf für Enterprise-Scale) -``` - ---- - -## Implementierungsdetails - -### Nächste Schritte - -1. **Dockerfiles erstellen** für alle Services -2. **CI/CD Pipeline** mit GitHub Actions -3. **Hetzner Infrastruktur** provisionieren (Terraform) -4. **Docker Swarm** einrichten -5. **Monitoring** mit Prometheus/Grafana -6. 
**Backup-Strategie** implementieren - -### Geschätzte Implementierungszeit - -| Phase | Dauer | Beschreibung | -|-------|-------|--------------| -| Dockerfiles | 2-3 Tage | Alle Services containerisieren | -| CI/CD | 1-2 Tage | GitHub Actions Pipelines | -| Infrastruktur | 1 Tag | Hetzner Setup (Terraform) | -| Swarm Setup | 1 Tag | Cluster initialisieren | -| Deployment | 1-2 Tage | Services deployen & testen | -| Monitoring | 1 Tag | Prometheus, Grafana, Alerts | -| **Gesamt** | **~1-2 Wochen** | | - ---- - -## Weiterführende Dokumente - -- [DOCKERFILES.md](./DOCKERFILES.md) - Docker-Konfiguration für alle Services -- [CI_CD.md](./CI_CD.md) - GitHub Actions Pipelines -- [TERRAFORM.md](./TERRAFORM.md) - Infrastructure as Code -- [MONITORING.md](./MONITORING.md) - Prometheus & Grafana Setup -- [BACKUP_STRATEGY.md](./BACKUP_STRATEGY.md) - Backup & Recovery - ---- - -*Erstellt: November 2025* -*Letzte Aktualisierung: November 2025* diff --git a/docs/DEPLOYMENT_RUNBOOKS.md b/docs/DEPLOYMENT_RUNBOOKS.md deleted file mode 100644 index 1a8275c3e..000000000 --- a/docs/DEPLOYMENT_RUNBOOKS.md +++ /dev/null @@ -1,1314 +0,0 @@ -# Deployment Runbooks & Operational Procedures - -**Practical guides for common deployment and operational tasks** - ---- - -## Table of Contents - -1. [Initial Setup Runbook](#initial-setup-runbook) -2. [Deploying a New Service](#deploying-a-new-service) -3. [Updating an Existing Service](#updating-an-existing-service) -4. [Database Migration Runbook](#database-migration-runbook) -5. [Rollback Procedures](#rollback-procedures) -6. [Incident Response](#incident-response) -7. [Scaling Operations](#scaling-operations) -8. [Backup & Restore](#backup--restore) -9. [Security Audit Checklist](#security-audit-checklist) -10. 
[Monitoring Setup](#monitoring-setup) - ---- - -## Initial Setup Runbook - -### Prerequisites - -- [ ] Server with Docker installed (Ubuntu 22.04 LTS recommended) -- [ ] Domain name configured (manacore.app) -- [ ] Cloudflare account (for DNS and CDN) -- [ ] GitHub account (for CI/CD) -- [ ] Supabase projects created (one per product) - -### Step 1: Install Coolify - -```bash -# SSH into server -ssh root@your-server-ip - -# Install Coolify (automated installer) -curl -fsSL https://cdn.coollabs.io/coolify/install.sh | bash - -# Verify installation -coolify --version - -# Access the Coolify dashboard -# Navigate to: http://your-server-ip:8000 -# Create admin account -``` - -### Step 2: Configure DNS - -```bash -# In Cloudflare DNS dashboard, add A records: - -Type Name Target Proxy -──────────────────────────────────────────────────────────── -A @ YOUR_SERVER_IP Yes -A *.manacore.app YOUR_SERVER_IP Yes -A auth.manacore.app YOUR_SERVER_IP No -A api-chat.manacore.app YOUR_SERVER_IP No -A api-*.manacore.app YOUR_SERVER_IP No - -# Note: API endpoints should NOT be proxied (to avoid caching) -``` - -### Step 3: Clone Repository - -```bash -# On server -mkdir -p /opt/manacore -cd /opt/manacore - -git clone https://github.com/manacore/manacore-monorepo.git -cd manacore-monorepo - -# Checkout production branch -git checkout main -``` - -### Step 4: Set Up Environment Variables - -```bash -# Copy production environment template -cp .env.production.example .env.production - -# Edit with secure credentials -nano .env.production - -# Required variables (never commit real values to git): -# - DATABASE_URL (Supabase connection strings) -# - JWT_PRIVATE_KEY (generate new RSA key pair) -# - AZURE_OPENAI_API_KEY -# - STRIPE_SECRET_KEY -# - REDIS_PASSWORD (use strong password) -``` - -**Generate JWT Keys:** - -```bash -# Generate RSA key pair for JWT signing -ssh-keygen -t rsa -b 4096 -m PEM -f jwt_key -# Private key: jwt_key -# Public key: jwt_key.pub - -# Convert
to single-line format for .env -cat jwt_key | tr '\n' '|' # Replace | with \n in .env -cat jwt_key.pub | tr '\n' '|' -``` - -### Step 5: Deploy Shared Infrastructure - -```bash -# Start PostgreSQL and Redis -pnpm docker:up - -# Wait for health checks to pass -docker compose ps - -# Expected output: -# NAME STATUS -# manacore-postgres Up (healthy) -# manacore-redis Up (healthy) -``` - -### Step 6: Deploy Mana Core Auth - -```bash -# Build and deploy auth service -docker compose --profile auth up -d - -# Run database migrations -docker compose exec mana-core-auth pnpm migration:run - -# Verify health -curl -f http://localhost:3001/api/health -# Expected: {"status":"ok","database":"connected","redis":"connected"} - -# Test authentication -curl -X POST http://localhost:3001/api/auth/register \ - -H "Content-Type: application/json" \ - -d '{ - "email": "test@manacore.app", - "password": "TestPassword123!", - "name": "Test User" - }' -``` - -### Step 7: Configure SSL (Coolify Auto) - -In the Coolify dashboard: - -1. Navigate to: Settings → Domains -2. Add domain: `auth.manacore.app` -3. Enable "Auto SSL" (Let's Encrypt) -4.
Wait for certificate provisioning (~2 minutes) - -### Step 8: Deploy First Project (Chat) - -```bash -# Deploy all chat services -docker compose --profile chat up -d - -# Run migrations -docker compose exec chat-backend pnpm migration:run - -# Verify all services healthy -./scripts/health-check-all.sh - -# Expected output: -# ✅ mana-core-auth: healthy -# ✅ chat-backend: healthy -# ✅ chat-web: healthy -# ✅ chat-landing: healthy -``` - -### Step 9: Set Up Monitoring - -```bash -# Deploy Prometheus and Grafana -docker compose --profile monitoring up -d - -# Access Grafana -# Navigate to: http://your-server-ip:3000 -# Default credentials: admin / admin (change immediately) - -# Import dashboards -# Dashboard IDs: -# - 1860 (Node Exporter Full) -# - 893 (Docker monitoring) -# - Custom: manacore-services.json (in /monitoring/dashboards/) -``` - -### Step 10: Configure Backups - -```bash -# Set up automated daily backups -crontab -e - -# Add backup jobs: -0 3 * * * /opt/manacore/scripts/backup-all.sh >> /var/log/manacore-backup.log 2>&1 -0 4 * * * /opt/manacore/scripts/cleanup-old-backups.sh - -# Test backup manually -/opt/manacore/scripts/backup-all.sh - -# Verify backup created -ls -lah /backups/$(date +%Y/%m/%d)/ -``` - -### Verification Checklist - -- [ ] All DNS records resolve correctly -- [ ] SSL certificates valid (https://www.ssllabs.com/ssltest/) -- [ ] Mana Core Auth API accessible -- [ ] At least one project deployed and healthy -- [ ] Monitoring dashboards accessible -- [ ] Backups running successfully -- [ ] Firewall rules configured (only ports 22, 80, 443 open) -- [ ] Non-root user created for deployments -- [ ] SSH key authentication enabled (password auth disabled) - ---- - -## Deploying a New Service - -### Example: Deploy Picture Backend - -```bash -# Step 1: Prepare Dockerfile (if not exists) -# File: apps/picture/apps/backend/Dockerfile - -# Step 2: Build Docker image locally (test) -docker build \ - --build-arg PROJECT_PATH=apps/picture/apps/backend 
\ - --build-arg PORT=3005 \ - -t picture-backend:test \ - -f docker/templates/Dockerfile.nestjs \ - . - -# Step 3: Test image locally -docker run -d \ - --name picture-backend-test \ - -p 3005:3005 \ - -e DATABASE_URL=$PICTURE_DATABASE_URL \ - -e NODE_ENV=development \ - picture-backend:test - -# Verify health -curl -f http://localhost:3005/api/health - -# Step 4: Stop test container -docker stop picture-backend-test -docker rm picture-backend-test - -# Step 5: Add to docker-compose.prod.yml -cat >> docker-compose.prod.yml < docker-compose.green.yml <> /docs/CHANGELOG.md -git add /docs/CHANGELOG.md -git commit -m "docs: update chat-backend to v1.6.0" -git push -``` - -### Hotfix Deployment (Fast Track) - -```bash -# Scenario: Critical bug in production, need immediate fix - -# Step 1: Create hotfix branch -git checkout -b hotfix/chat-backend-memory-leak main - -# Step 2: Apply fix -# Edit code, test locally - -# Step 3: Build and tag -docker build \ - -t chat-backend:v1.5.3-hotfix \ - -f docker/templates/Dockerfile.nestjs \ - . 
- -# Step 4: Deploy directly to production (skip green) -docker tag chat-backend:v1.5.3-hotfix chat-backend:latest -docker compose up -d chat-backend - -# Step 5: Verify fix -curl -f https://api-chat.manacore.app/api/health - -# Step 6: Monitor closely for 30 minutes - -# Step 7: Merge hotfix to main -git checkout main -git merge hotfix/chat-backend-memory-leak -git push origin main - -# Step 8: Delete hotfix branch -git branch -d hotfix/chat-backend-memory-leak -``` - ---- - -## Database Migration Runbook - -### Safe Migration Checklist - -Before running any migration: - -- [ ] Migration is backward-compatible (old code can read new schema) -- [ ] Database backup completed (within last 24 hours) -- [ ] Migration tested in staging environment -- [ ] Rollback plan documented -- [ ] Estimated migration time < 5 minutes (for zero-downtime) -- [ ] No destructive operations (DROP, RENAME without compatibility layer) - -### Running a Migration - -```bash -# Example: Add new column to users table - -# Step 1: Generate migration -pnpm --filter @chat/backend migration:generate --name add-user-avatar - -# Migration file created: -# apps/chat/apps/backend/migrations/20251127_add_user_avatar.ts - -# Step 2: Review migration -cat apps/chat/apps/backend/migrations/20251127_add_user_avatar.ts - -# Example migration (safe): -export async function up(db) { - await db.execute(sql` - ALTER TABLE users - ADD COLUMN avatar_url TEXT; - `); -} - -export async function down(db) { - await db.execute(sql` - ALTER TABLE users - DROP COLUMN avatar_url; - `); -} - -# Step 3: Test migration in staging -# SSH to staging server -ssh staging.manacore.app - -cd /opt/manacore/manacore-monorepo -docker compose exec chat-backend pnpm migration:run - -# Verify schema change -docker compose exec postgres psql -U manacore chat -c "\d users" - -# Step 4: Test old code with new schema -# Ensure existing API endpoints still work - -# Step 5: Deploy to production -# SSH to production server -ssh 
prod.manacore.app - -cd /opt/manacore/manacore-monorepo - -# Backup database first -./scripts/backup-db.sh chat - -# Run migration -docker compose exec chat-backend pnpm migration:run - -# Expected output: -# ✅ Running migration: 20251127_add_user_avatar -# ✅ Migration completed successfully - -# Step 6: Verify schema -docker compose exec postgres psql -U manacore chat -c "\d users" - -# Step 7: Deploy new code (that uses new column) -docker compose up -d chat-backend - -# Step 8: Verify functionality -curl -X PATCH https://api-chat.manacore.app/api/users/me \ - -H "Authorization: Bearer $TOKEN" \ - -H "Content-Type: application/json" \ - -d '{"avatar_url":"https://example.com/avatar.jpg"}' -``` - -### Unsafe Migration (Two-Phase) - -```bash -# Scenario: Rename column (requires two-phase deployment) - -# PHASE 1: Add new column, keep old column -# Migration 1: -export async function up(db) { - await db.execute(sql` - ALTER TABLE users - ADD COLUMN full_name TEXT; - - -- Copy data from old column - UPDATE users SET full_name = name; - `); -} - -# Deploy code that writes to BOTH columns -# (Old code still reads 'name', new code reads 'full_name') - -# PHASE 2: Remove old column (after all instances updated) -# Migration 2 (deploy 1 week later): -export async function up(db) { - await db.execute(sql` - ALTER TABLE users - DROP COLUMN name; - `); -} -``` - ---- - -## Rollback Procedures - -### Application Rollback - -```bash -# Scenario: Production deployment has critical bugs - -# OPTION 1: Blue-Green Instant Rollback -# (If blue environment still running) - -# Switch traffic back to blue -coolify switch-deployment chat blue - -# Or manually update Nginx: -sudo nano /etc/nginx/sites-available/api-chat.manacore.app -# Change: proxy_pass http://localhost:3012 # green -# To: proxy_pass http://localhost:3002 # blue - -sudo nginx -t -sudo systemctl reload nginx - -# Verify rollback -curl -s https://api-chat.manacore.app/api/version | jq -# Output: {"version":"1.5.2"} # 
Back to previous version - -# RTO: < 1 minute - -# ---------------------------------------- - -# OPTION 2: Docker Image Rollback -# (If blue environment already stopped) - -# Step 1: Find previous image tag -docker images chat-backend - -# Output: -# REPOSITORY TAG IMAGE ID CREATED -# chat-backend v1.6.0 def5678 2 hours ago -# chat-backend v1.5.2 abc1234 1 day ago - -# Step 2: Tag previous version as latest -docker tag chat-backend:v1.5.2 chat-backend:latest - -# Step 3: Restart service with previous image -docker compose up -d chat-backend - -# Step 4: Verify rollback -curl -s https://api-chat.manacore.app/api/version | jq - -# RTO: ~3 minutes - -# ---------------------------------------- - -# OPTION 3: Git Rollback + Rebuild -# (If no previous images available) - -# Step 1: Find previous commit -git log --oneline --decorate -10 - -# Output: -# def5678 (HEAD -> main) feat: add chat model selector -# abc1234 fix: authentication timeout -# 789wxyz feat: improve error handling - -# Step 2: Checkout previous commit -git checkout abc1234 - -# Step 3: Rebuild image -docker build -t chat-backend:rollback \ - -f docker/templates/Dockerfile.nestjs \ - . 
- -# Step 4: Deploy -docker tag chat-backend:rollback chat-backend:latest -docker compose up -d chat-backend - -# Step 5: Verify rollback -curl -s https://api-chat.manacore.app/api/version | jq - -# RTO: ~10 minutes (includes rebuild time) -``` - -### Database Rollback - -```bash -# Scenario: Migration caused data corruption - -# Step 1: Stop affected services -docker compose stop chat-backend - -# Step 2: Find latest backup BEFORE migration -ls -lt /backups/chat/ - -# Output: -# -rw-r--r-- chat-20251127-020000.dump # Before migration -# -rw-r--r-- chat-20251127-050000.dump # After migration (corrupted) - -# Step 3: Drop current database -docker compose exec postgres psql -U manacore -c "DROP DATABASE chat;" -docker compose exec postgres psql -U manacore -c "CREATE DATABASE chat;" - -# Step 4: Restore from backup -pg_restore \ - --dbname="postgresql://manacore:password@localhost:5432/chat" \ - --clean --if-exists \ - /backups/chat/chat-20251127-020000.dump - -# Step 5: Verify data -docker compose exec postgres psql -U manacore chat -c "SELECT COUNT(*) FROM users;" - -# Step 6: Restart services -docker compose start chat-backend - -# Step 7: Verify application -curl -f https://api-chat.manacore.app/api/health - -# RTO: ~15 minutes -# RPO: Time since last backup (up to 24 hours) -``` - ---- - -## Incident Response - -### Severity Levels - -| Severity | Description | Response Time | Escalation | -| ----------------- | -------------------------------------- | ------------- | -------------------------- | -| **P1 - Critical** | Total service outage, data loss | Immediate | CTO + All hands | -| **P2 - High** | Major functionality broken | < 30 min | DevOps lead + Backend team | -| **P3 - Medium** | Partial degradation, workaround exists | < 2 hours | On-call engineer | -| **P4 - Low** | Minor issues, no user impact | < 24 hours | Backlog | - -### Incident Response Workflow - -```bash -# STEP 1: DETECTION (Automated or Manual) -# - Alert triggered (Grafana, Sentry, 
customer report) - -# STEP 2: TRIAGE (Within 5 minutes) -# Determine severity and impact - -# Checklist: -- [ ] How many users affected? (1, 10, 100, All) -- [ ] What functionality broken? (Critical path? Nice-to-have?) -- [ ] Is data at risk? (Potential data loss?) -- [ ] Is there a security concern? (Breach, leak, attack?) - -# Assign severity: P1, P2, P3, or P4 - -# STEP 3: COMMUNICATION -# Start incident channel - -# Slack: Create channel #incident-YYYYMMDD-HHMM -# Post initial status: - **INCIDENT: Chat API Down** - Severity: P1 - Started: 2025-11-27 12:34 UTC - Impact: All chat requests failing (500 errors) - Affected: ~1,000 active users - Status: INVESTIGATING - -# STEP 4: INVESTIGATION -# Gather data - -# Check logs -docker compose logs --tail 200 -f chat-backend - -# Check metrics -# Navigate to Grafana dashboard - -# Check database -docker compose exec postgres psql -U manacore chat -c "SELECT 1;" - -# Check external dependencies -curl -f https://api.openai.azure.com/health - -# STEP 5: MITIGATION -# Choose appropriate action: - -# Option A: Restart service -docker compose restart chat-backend - -# Option B: Rollback deployment -# (See Rollback Procedures section) - -# Option C: Temporary workaround -# Example: Disable problematic feature via feature flag - -# STEP 6: VERIFICATION -# Confirm issue resolved - -curl -f https://api-chat.manacore.app/api/health -./scripts/smoke-test.sh https://api-chat.manacore.app - -# STEP 7: COMMUNICATION (Resolution) -# Update incident channel: - **INCIDENT RESOLVED** - Severity: P1 - Duration: 8 minutes - Root cause: OOM kill (memory leak in chat model loader) - Resolution: Service restarted, memory limit increased - Action items: - 1. Fix memory leak in code - 2. Add memory monitoring alert - 3. 
Load test with 10,000 concurrent users - -# STEP 8: POST-MORTEM (Within 24 hours) -# Create post-mortem document - -# Template: /docs/post-mortems/YYYY-MM-DD-incident-name.md -``` - -### Example Incident Scenarios - -#### Scenario 1: Database Connection Pool Exhausted - -```bash -# Symptoms: -# - API requests timing out -# - Logs: "Error: Pool exhausted, max connections reached" - -# Investigation: -docker compose exec postgres psql -U manacore chat -c " - SELECT - application_name, - state, - COUNT(*) - FROM pg_stat_activity - WHERE datname = 'chat' - GROUP BY application_name, state; -" - -# Output shows 60 connections (all exhausted) - -# Root Cause: Connection leak in code (not releasing connections) - -# Immediate Mitigation: -# 1. Restart backend (releases connections) -docker compose restart chat-backend - -# 2. Increase connection pool temporarily -docker compose exec chat-backend sh -c " - export DB_POOL_MAX=20 - pnpm start:prod -" - -# Permanent Fix: -# 1. Fix code to properly release connections -# 2. Configure PgBouncer for connection pooling -# 3. Add monitoring for active connections -``` - -#### Scenario 2: SSL Certificate Expired - -```bash -# Symptoms: -# - Users reporting "Your connection is not private" -# - Curl: "SSL certificate problem: certificate has expired" - -# Investigation: -openssl s_client -connect api-chat.manacore.app:443 -servername api-chat.manacore.app < /dev/null 2>/dev/null | openssl x509 -noout -dates - -# Output: -# notAfter=Nov 20 10:00:00 2025 GMT # Expired! - -# Root Cause: Let's Encrypt auto-renewal failed - -# Immediate Mitigation: -# Manually renew certificate -sudo certbot renew --force-renewal - -# Or via Coolify: -coolify ssl renew api-chat.manacore.app - -# Verification: -curl -I https://api-chat.manacore.app -# Should return: HTTP/2 200 - -# Permanent Fix: -# 1. Check certbot renewal cron job -crontab -l | grep certbot - -# 2. 
Add monitoring for certificate expiry -# Alert 30 days before expiration -``` - ---- - -## Scaling Operations - -### Vertical Scaling (Increase Resources) - -```bash -# Scenario: Service hitting CPU/memory limits - -# Step 1: Check current resource usage -docker stats chat-backend - -# Output: -# CONTAINER CPU % MEM USAGE / LIMIT -# chat-backend 95% 480MiB / 512MiB # At limit! - -# Step 2: Update resource limits -# Edit docker-compose.prod.yml: -cat >> docker-compose.prod.yml <> docker-compose.prod.yml < nginx/load-balancer.conf </dev/null | \ - openssl x509 -noout -dates -done - -# 4. Review firewall rules -sudo ufw status verbose - -# 5. Check for exposed secrets in logs -grep -r "password\|api_key\|secret" /var/log/manacore/ || echo "No secrets found" - -# 6. Review user access -# List users with sudo access -getent group sudo - -# 7. Check for unauthorized Docker containers -docker ps -a | grep -v "manacore\|postgres\|redis\|nginx" - -# 8. Review recent authentication failures -journalctl -u ssh | grep "Failed password" | tail -20 - -# 9. Verify backup integrity -# Attempt restore of random backup to test environment -latest_backup=$(ls -t /backups/chat/*.dump | head -1) -pg_restore --dbname="postgresql://test:test@localhost:5432/chat_test" \ - --clean --if-exists $latest_backup - -# 10. Check database permissions -docker compose exec postgres psql -U manacore chat -c " - SELECT grantee, privilege_type - FROM information_schema.table_privileges - WHERE table_schema = 'public'; -" -``` - -### Security Hardening - -```bash -# 1. Enable fail2ban (SSH brute force protection) -sudo apt install fail2ban -sudo systemctl enable fail2ban -sudo systemctl start fail2ban - -# 2. Disable password authentication (SSH keys only) -sudo nano /etc/ssh/sshd_config -# Set: PasswordAuthentication no -sudo systemctl restart sshd - -# 3. Set up automated security updates -sudo apt install unattended-upgrades -sudo dpkg-reconfigure -plow unattended-upgrades - -# 4. 
Install and configure AppArmor -sudo apt install apparmor apparmor-utils -sudo systemctl enable apparmor -sudo systemctl start apparmor - -# 5. Enable Docker content trust -export DOCKER_CONTENT_TRUST=1 -echo 'export DOCKER_CONTENT_TRUST=1' >> ~/.bashrc - -# 6. Scan for rootkits -sudo apt install rkhunter -sudo rkhunter --update -sudo rkhunter --check -``` - ---- - -## Monitoring Setup - -### Grafana Dashboard Setup - -```bash -# Step 1: Access Grafana -# Navigate to: http://your-server-ip:3000 -# Login: admin / admin (change password) - -# Step 2: Add Prometheus data source -# Settings → Data Sources → Add data source -# Type: Prometheus -# URL: http://prometheus:9090 -# Save & Test - -# Step 3: Import dashboard -# Dashboards → Import -# Upload file: /monitoring/dashboards/manacore-services.json - -# Step 4: Configure alerts -# Alerting → Notification channels -# Add Slack webhook: -# Name: #alerts -# Type: Slack -# Webhook URL: https://hooks.slack.com/services/YOUR/WEBHOOK/URL - -# Step 5: Create alert rules -# Example: High error rate -curl -X POST http://admin:admin@localhost:3000/api/alerts \ - -H "Content-Type: application/json" \ - -d '{ - "name": "High Error Rate", - "query": "rate(http_requests_total{status_code=~\"5..\"}[5m]) > 0.05", - "for": "5m", - "annotations": { - "summary": "Error rate >5% for 5 minutes" - }, - "labels": { - "severity": "critical" - } - }' -``` - -### Prometheus Configuration - -```yaml -# /monitoring/prometheus.yml -global: - scrape_interval: 30s - evaluation_interval: 30s - -scrape_configs: - - job_name: 'mana-core-auth' - static_configs: - - targets: ['mana-core-auth:3001'] - metrics_path: '/metrics' - - - job_name: 'chat-backend' - static_configs: - - targets: ['chat-backend:3002'] - - - job_name: 'picture-backend' - static_configs: - - targets: ['picture-backend:3005'] - - - job_name: 'postgres-exporter' - static_configs: - - targets: ['postgres-exporter:9187'] - - - job_name: 'redis-exporter' - static_configs: - - targets: 
['redis-exporter:9121'] - - - job_name: 'node-exporter' - static_configs: - - targets: ['node-exporter:9100'] - -alerting: - alertmanagers: - - static_configs: - - targets: ['alertmanager:9093'] -``` - ---- - -## Troubleshooting Common Issues - -### Issue: Container Won't Start - -```bash -# Check logs -docker compose logs chat-backend - -# Common causes: -# 1. Port already in use -sudo lsof -i :3002 -# Kill process: sudo kill -9 <PID> - -# 2. Missing environment variable -docker compose config chat-backend -# Verify all required env vars present - -# 3. Database connection failed -docker compose exec chat-backend sh -c " - nc -zv postgres 5432 || echo 'Cannot reach database' -" - -# 4. Volume mount permission error -ls -la /var/lib/docker/volumes/ -sudo chown -R 1001:1001 /var/lib/docker/volumes/manacore-data -``` - -### Issue: High Memory Usage - -```bash -# Identify memory hog -docker stats --no-stream --format "table {{.Container}}\t{{.MemUsage}}" | sort -k 2 -h -r - -# Check for memory leak -docker exec chat-backend sh -c " - node --expose-gc --heap-prof -e 'require(\"./dist/main\")' -" - -# Restart container -docker compose restart chat-backend - -# If persistent, increase memory limit or investigate code -``` - -### Issue: Slow API Responses - -```bash -# Check database query performance -docker compose exec postgres psql -U manacore chat -c " - SELECT query, mean_exec_time, calls - FROM pg_stat_statements - ORDER BY mean_exec_time DESC - LIMIT 10; -" - -# Check Redis cache hit rate -docker compose exec redis redis-cli info stats | grep keyspace - -# Profile application -# Add Sentry performance monitoring - -# Check network latency -docker exec chat-backend sh -c " - curl -w '@curl-format.txt' -o /dev/null -s https://api.openai.azure.com -" -``` - ---- - -**End of Runbooks** - -For questions or issues not covered here, contact DevOps team or create an issue in the repository.
diff --git a/docs/DOCKER_SETUP_ANALYSIS.md b/docs/DOCKER_SETUP_ANALYSIS.md deleted file mode 100644 index a14333891..000000000 --- a/docs/DOCKER_SETUP_ANALYSIS.md +++ /dev/null @@ -1,750 +0,0 @@ -# Docker Setup Analysis - Current State - -**Analysis Date**: 2025-12-01 -**Scope**: Complete monorepo Docker configuration for Hetzner deployment - -## Executive Summary - -The monorepo has **solid Docker foundations** with multi-environment compose files and containerized services, but requires **critical fixes** before production deployment to Hetzner. - -**Status**: ⚠️ **Not Production Ready** - 4 critical blockers identified - ---- - -## Table of Contents - -- [Docker Files Inventory](#docker-files-inventory) -- [Current Architecture](#current-architecture) -- [Containerized Services](#containerized-services) -- [Critical Blocking Issues](#critical-blocking-issues) -- [Configuration Gaps](#configuration-gaps) -- [Best Practices Currently Followed](#best-practices-currently-followed) -- [Immediate Actions Required](#immediate-actions-required) - ---- - -## Docker Files Inventory - -### Root-Level Compose Files - -| File | Lines | Purpose | Status | -|------|-------|---------|--------| -| `docker-compose.yml` | 190 | Full production stack with Traefik, PostgreSQL, Redis, PgBouncer, Prometheus, Grafana | ⚠️ Missing configs | -| `docker-compose.dev.yml` | 117 | Development setup with minimal infrastructure | ✅ Working | -| `docker-compose.staging.yml` | 273 | Staging environment with 5 backends and registry images | ✅ Working | -| `docker-compose.production.yml` | 253 | Production deployment with resource constraints | ⚠️ Missing external services | - -### Active Service Dockerfiles - -| Service | Path | Base Image | Status | -|---------|------|------------|--------| -| mana-core-auth | `services/mana-core-auth/Dockerfile` | Node 20-alpine | ✅ Working | -| chat-backend | `apps/chat/apps/backend/Dockerfile` | Node 20-alpine | ✅ Working | -| picture-backend | 
`apps/picture/apps/backend/Dockerfile` | Node 20-alpine | ✅ Working | -| manadeck-backend | `apps/manadeck/apps/backend/Dockerfile` | Node 18 | ❌ Inconsistent | - -### Docker Templates (Reusable) - -``` -docker/templates/ -├── Dockerfile.nestjs # Multi-service NestJS template -├── Dockerfile.sveltekit # SvelteKit web app template -└── Dockerfile.astro # Astro static site with Nginx -``` - -### Supporting Infrastructure - -``` -docker/ -├── init-db/ -│ └── 01-create-databases.sql # Database initialization -├── nginx/ -│ └── astro.conf # Nginx config for static sites -├── prometheus/ -│ └── prometheus.yml # ❌ MISSING -└── grafana/ - └── provisioning/ # ❌ MISSING -``` - -### Entrypoint Scripts - -- `services/mana-core-auth/docker-entrypoint.sh` ✅ -- `apps/chat/apps/backend/docker-entrypoint.sh` ✅ -- `apps/picture/apps/backend/docker-entrypoint.sh` ✅ -- `apps/manadeck/apps/backend/docker-entrypoint.sh` ❌ Missing - ---- - -## Current Architecture - -### Development Environment - -**File**: `docker-compose.dev.yml` - -``` -Services: -- PostgreSQL 16-alpine (port 5432) -- Redis 7-alpine (port 6379) -- Optional services via profiles ("auth", "chat", "all") - -Network: manacore-network (bridge) -Health Checks: 10-second intervals -Restart Policy: unless-stopped -``` - -**Purpose**: Minimal stack for local development with hot reload support. - -### Staging Environment - -**File**: `docker-compose.staging.yml` - -``` -Services: -- 5 backend microservices (maerchenzauber, chat, manadeck, nutriphi, news) -- PostgreSQL and Redis infrastructure -- Nginx reverse proxy (ports 80/443) - -Images: Pre-built from Docker registry -Health Checks: 30-second intervals -Logging: Structured JSON (10MB max-size, 3 files) -Network: manacore-staging (bridge) -``` - -**Purpose**: Pre-production testing environment. 
- -### Production Environment - -**File**: `docker-compose.production.yml` - -``` -Services: -- 5 backend microservices only (no web apps) -- External PostgreSQL/Redis (not containerized) - -Ports: All bound to 127.0.0.1 (localhost only) -Resource Constraints: 1-2 CPUs, 512MB-1GB memory per service -Volumes: None (external services) -Network: manacore-production (bridge) -``` - -**Purpose**: Minimal application footprint for managed infrastructure. - -### Full Infrastructure Stack - -**File**: `docker-compose.yml` - -``` -Services: -- Traefik v3.0 (reverse proxy with Let's Encrypt SSL) -- PostgreSQL 16-alpine + PgBouncer (connection pooling) -- Redis 7-alpine (session management) -- Prometheus (metrics collection) ⚠️ Missing config -- Grafana (monitoring dashboards) ⚠️ Missing provisioning - -Features: -- Automatic SSL via Traefik -- Database connection pooling -- Metrics collection -- Dashboard monitoring -``` - -**Purpose**: Complete on-premises deployment with monitoring. - ---- - -## Containerized Services - -### Active & Containerized - -| Service | Technology | Port | Status | -|---------|------------|------|--------| -| mana-core-auth | NestJS | 3001 | ✅ Production Ready | -| chat-backend | NestJS | 3002 | ✅ Production Ready | -| picture-backend | NestJS | 3006 | ✅ Production Ready | -| manadeck-backend | NestJS | 3009 | ⚠️ Needs Updates | - -### Not Yet Containerized - -**Web Apps (SvelteKit)**: -- Templates available in `docker/templates/Dockerfile.sveltekit` -- Need per-project Dockerfiles -- SSR support included - -**Landing Pages (Astro)**: -- Templates available in `docker/templates/Dockerfile.astro` -- Nginx configuration ready (`docker/nginx/astro.conf`) -- Static site optimization included - -**Mobile Apps (Expo/React Native)**: -- Not containerized (not applicable for Hetzner deployment) -- Built and deployed to app stores separately - ---- - -## Critical Blocking Issues - -### 1. 
❌ Missing Prometheus Configuration - -**Impact**: High - Blocks monitoring deployment -**File**: `docker/prometheus/prometheus.yml` - -**Issue**: Referenced in `docker-compose.yml` but file doesn't exist. - -**Error**: -```yaml -# docker-compose.yml line ~150 -volumes: - - ./docker/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml -``` - -**Solution Required**: -```bash -mkdir -p docker/prometheus -``` - -Create basic `prometheus.yml`: -```yaml -global: - scrape_interval: 15s - evaluation_interval: 15s - -scrape_configs: - - job_name: 'prometheus' - static_configs: - - targets: ['localhost:9090'] - - - job_name: 'node-exporter' - static_configs: - - targets: ['node-exporter:9100'] - - - job_name: 'postgres' - static_configs: - - targets: ['postgres:9187'] - - - job_name: 'redis' - static_configs: - - targets: ['redis:9121'] -``` - -### 2. ❌ Missing Grafana Provisioning - -**Impact**: High - Blocks monitoring dashboard deployment -**Directory**: `docker/grafana/provisioning/` - -**Issue**: Referenced in docker-compose but directories don't exist: -- `docker/grafana/provisioning/dashboards/` -- `docker/grafana/provisioning/datasources/` - -**Solution Required**: -```bash -mkdir -p docker/grafana/provisioning/{dashboards,datasources} -``` - -Create `docker/grafana/provisioning/datasources/prometheus.yml`: -```yaml -apiVersion: 1 - -datasources: - - name: Prometheus - type: prometheus - access: proxy - url: http://prometheus:9090 - isDefault: true - editable: true -``` - -Create `docker/grafana/provisioning/dashboards/default.yml`: -```yaml -apiVersion: 1 - -providers: - - name: 'Default' - orgId: 1 - folder: '' - type: file - disableDeletion: false - updateIntervalSeconds: 10 - allowUiUpdates: true - options: - path: /var/lib/grafana/dashboards -``` - -### 3. ❌ Node Version Inconsistency - -**Impact**: Medium - May cause runtime issues -**File**: `apps/manadeck/apps/backend/Dockerfile` - -**Issue**: ManaDeck uses Node 18 while all other services use Node 20. 
- -**Current**: -```dockerfile -FROM node:18-alpine AS base -``` - -**Should Be**: -```dockerfile -FROM node:20-alpine AS base -``` - -**Location**: `/Users/wuesteon/dev/mana_universe/manacore-monorepo/apps/manadeck/apps/backend/Dockerfile:1` - -### 4. ❌ ManaDeck Dockerfile Anomalies - -**Impact**: Medium - Build inconsistency -**File**: `apps/manadeck/apps/backend/Dockerfile` - -**Issues**: -1. Uses `npm` instead of `pnpm` (lines 15, 33, 38) -2. Includes peer dependency workaround (`--legacy-peer-deps`) -3. Cloud Run specific configuration (port 8080 instead of 3009) -4. Missing proper workspace awareness - -**Example Issue**: -```dockerfile -# Line 15 - Should use pnpm -RUN npm ci --omit=dev --legacy-peer-deps -``` - -**Solution**: Refactor to use pnpm like other services. - ---- - -## Configuration Gaps - -### 1. Missing Staging HTTPS/SSL Configuration - -**Severity**: Medium - -Staging environment (`docker-compose.staging.yml`) only has HTTP Nginx configuration. No SSL/TLS setup for testing HTTPS in staging. - -**Recommendation**: Add Let's Encrypt staging certificates or self-signed certs. - -### 2. Inconsistent Docker Compose at Service Level - -**Severity**: Low - -Only `chat` and `picture` have local `docker-compose.yml` files in their service directories. Other projects don't have service-specific compose files. - -**Current**: -``` -apps/chat/docker-compose.yml ✅ Exists -apps/picture/docker-compose.yml ✅ Exists -apps/manadeck/docker-compose.yml ❌ Missing -apps/zitare/docker-compose.yml ❌ Missing -apps/presi/docker-compose.yml ❌ Missing -``` - -### 3. Database Initialization Unclear - -**Severity**: Medium - -Database initialization script (`docker/init-db/01-create-databases.sql`) exists, but unclear if it covers all services beyond mana-core-auth. - -**Services Requiring Databases**: -- mana-core-auth (PostgreSQL + Redis) ✅ -- chat-backend (PostgreSQL) ? -- picture-backend (PostgreSQL) ? 
-- manadeck-backend (Supabase external) N/A -- zitare-backend (PostgreSQL) ? -- presi-backend (PostgreSQL) ? - -### 4. No Resource Limits in Development - -**Severity**: Low - -Development environment (`docker-compose.dev.yml`) has no resource limits, which can lead to runaway containers consuming all system resources. - -**Recommendation**: Add development-appropriate limits (e.g., 2GB RAM per service). - -### 5. Entrypoint Scripts Not Universal - -**Severity**: Low - -Not all services have entrypoint scripts for handling migrations, health checks, and graceful shutdown. - -**Have Entrypoints**: -- mana-core-auth ✅ -- chat-backend ✅ -- picture-backend ✅ - -**Missing Entrypoints**: -- manadeck-backend ❌ -- zitare-backend ❌ -- presi-backend ❌ - ---- - -## Best Practices Currently Followed - -### ✅ Multi-Stage Dockerfile Builds - -All Dockerfiles use multi-stage builds with separate `build` and `production` stages: - -```dockerfile -FROM node:20-alpine AS base -# ... setup - -FROM base AS build -# ... build artifacts - -FROM node:20-alpine AS production -# ... copy only necessary files -``` - -**Benefit**: Smaller production images (~50% size reduction). - -### ✅ Non-Root User Execution - -All services run as non-root users: - -```dockerfile -RUN addgroup -g 1001 -S nodejs && \ - adduser -S nestjs -u 1001 -USER nestjs -``` - -**Security Impact**: Prevents privilege escalation attacks. - -### ✅ Alpine Base Images - -Using Alpine Linux for minimal attack surface: - -```dockerfile -FROM node:20-alpine -``` - -**Benefit**: ~40MB base image vs ~900MB for standard Node images. 
- -### ✅ Health Checks on All Services - -Comprehensive health checks with appropriate timeouts: - -```yaml -healthcheck: - test: ["CMD", "wget", "--spider", "-q", "http://localhost:3000/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s -``` - -### ✅ Service Dependencies with Health Conditions - -Proper dependency orchestration: - -```yaml -depends_on: - postgres: - condition: service_healthy - redis: - condition: service_healthy -``` - -### ✅ Named Volumes for Data Persistence - -Explicit volume naming for easy backup/restore: - -```yaml -volumes: - postgres-data: - driver: local - name: manacore-postgres-data -``` - -### ✅ Environment Variable Externalization - -Secrets and configuration via environment files: - -```yaml -env_file: - - .env.development - - .env.production -``` - -### ✅ Custom Bridge Networks - -Service isolation with custom networks: - -```yaml -networks: - manacore-network: - driver: bridge - name: manacore-network -``` - -### ✅ Restart Policies - -Appropriate restart policies per environment: - -```yaml -restart: unless-stopped # Staging/Production -restart: on-failure # Development -``` - -### ✅ Reverse Proxy with SSL - -Traefik with automatic Let's Encrypt SSL: - -```yaml -command: - - "--certificatesresolvers.letsencrypt.acme.httpchallenge=true" - - "--certificatesresolvers.letsencrypt.acme.email=${ACME_EMAIL}" -``` - -### ✅ Database Connection Pooling - -PgBouncer integration for efficient connection management. - -### ✅ Redis Caching Layer - -Centralized caching with Redis for session management and performance. 
- -### ✅ Docker Compose Profiles - -Selective service startup with profiles: - -```yaml -services: - mana-core-auth: - profiles: ["auth", "all"] - chat-backend: - profiles: ["chat", "all"] -``` - -### ✅ pnpm Workspace Awareness - -Dockerfiles properly handle pnpm workspaces: - -```dockerfile -COPY pnpm-workspace.yaml package.json pnpm-lock.yaml ./ -RUN pnpm fetch -RUN pnpm install --frozen-lockfile --offline -``` - ---- - -## Best Practice Gaps - -### Missing: Docker Build Cache Optimization - -**Issue**: No `.dockerignore` optimization strategy across services. - -**Impact**: Slower builds, larger build contexts sent to Docker daemon. - -**Recommendation**: Add comprehensive `.dockerignore` files per service. - -### Missing: Multi-Architecture Build Support - -**Issue**: No explicit multi-architecture builds (assumes AMD64 only). - -**Impact**: M1/M2 Mac developers may face compatibility issues. - -**Recommendation**: Use `docker buildx` for ARM64 + AMD64 builds. - -### Missing: Container Security Scanning - -**Issue**: No automated security scanning (Trivy, Hadolint, etc.). - -**Impact**: Unknown vulnerabilities in production images. - -**Recommendation**: Add CI/CD security scanning step. - -### Missing: Consistent Logging - -**Issue**: Logging configuration varies across environments. - -**Recommendation**: Standardize JSON structured logging across all environments. - -### Missing: Docker Deployment Documentation - -**Issue**: No step-by-step Docker deployment guide. - -**Impact**: Difficult onboarding for new developers. - -**Recommendation**: Create `DOCKER_DEPLOYMENT.md` with runbooks. - ---- - -## Environment Variable Handling - -### Root-Level `.dockerignore` Excludes - -``` -node_modules/ -dist/ -.git/ -.env* -*.log -coverage/ -``` - -**Status**: ✅ Properly configured - -### Variable Management Strategy - -**Three-Tier Hierarchy**: - -1. **Root `.env.development`**: Shared development variables (committed) -2. 
**Environment-specific** (`.env.production`): Secrets (gitignored) -3. **Service-specific**: Per-service overrides in compose files - -**Key Secrets Required**: -- `POSTGRES_PASSWORD` -- `REDIS_PASSWORD` -- `JWT_PRIVATE_KEY`, `JWT_PUBLIC_KEY` -- `AZURE_OPENAI_API_KEY` -- `GOOGLE_GENAI_API_KEY` -- `SUPABASE_SERVICE_ROLE_KEY` - ---- - -## Network & Volume Strategy - -### Networks - -**Development**: `manacore-network` (bridge) -**Staging**: `manacore-staging` (bridge) -**Production**: `manacore-production` (bridge) - -**Service-to-Service Communication**: Via Docker DNS -- `postgres:5432` -- `redis:6379` -- `mana-core-auth:3001` - -### Volumes - -**Development**: -```yaml -volumes: - postgres-data: {} - redis-data: {} -``` - -**Staging**: -```yaml -volumes: - postgres_data: - name: manacore-staging-postgres - redis_data: - name: manacore-staging-redis -``` - -**Production**: No volumes (external services assumed) - -**Full Stack**: -```yaml -volumes: - postgres-data: {} - redis-data: {} - traefik-letsencrypt: {} - prometheus-data: {} - grafana-data: {} -``` - ---- - -## Immediate Actions Required - -### Priority 1: Critical Blockers (Must Fix Before Deployment) - -1. **Create Prometheus Configuration** - ```bash - mkdir -p docker/prometheus - # Create prometheus.yml (see issue #1) - ``` - -2. **Create Grafana Provisioning** - ```bash - mkdir -p docker/grafana/provisioning/{dashboards,datasources} - # Create provisioning files (see issue #2) - ``` - -3. **Update ManaDeck Node Version** - ```bash - # Edit apps/manadeck/apps/backend/Dockerfile - # Change FROM node:18-alpine to node:20-alpine - ``` - -4. **Fix ManaDeck Dockerfile** - ```bash - # Refactor to use pnpm instead of npm - # Remove --legacy-peer-deps - # Fix port configuration (3009 instead of 8080) - ``` - -### Priority 2: Configuration Improvements - -5. **Add Staging SSL Configuration** - - Add Let's Encrypt staging environment - - Or configure self-signed certificates - -6. 
**Standardize Service Compose Files** - - Add `docker-compose.yml` to all projects - - Follow chat/picture pattern - -7. **Document Database Initialization** - - Clarify which databases are created - - Add initialization for all services - -8. **Add Development Resource Limits** - - Prevent runaway containers - - Set reasonable limits (e.g., 2GB RAM) - -9. **Add Entrypoint Scripts** - - Create for manadeck, zitare, presi - - Standardize migration handling - -### Priority 3: Best Practice Enhancements - -10. **Optimize Docker Build Cache** - - Add comprehensive `.dockerignore` files - - Optimize layer ordering - -11. **Add Multi-Architecture Support** - - Use `docker buildx` - - Build for AMD64 + ARM64 - -12. **Implement Security Scanning** - - Add Trivy to CI/CD - - Scan images before push - -13. **Standardize Logging** - - JSON structured logging - - Consistent across environments - -14. **Create Deployment Documentation** - - Step-by-step runbooks - - Troubleshooting guides - ---- - -## Estimated Time to Production Ready - -| Phase | Tasks | Time Estimate | -|-------|-------|---------------| -| **Phase 1: Critical Fixes** | Issues #1-4 | 2-4 hours | -| **Phase 2: Configuration** | Issues #5-9 | 4-6 hours | -| **Phase 3: Best Practices** | Issues #10-14 | 6-8 hours | -| **Total** | 14 tasks | **12-18 hours** | - ---- - -## Conclusion - -The Docker setup demonstrates **strong architectural foundations** with: -- Multi-environment support ✅ -- Service isolation ✅ -- Health-driven orchestration ✅ -- Security best practices ✅ - -However, **4 critical blockers** prevent immediate production deployment to Hetzner. Addressing these issues should take **2-4 hours** and will unblock staging and production deployments. - -**Recommendation**: Fix Priority 1 items immediately, then incrementally address Priority 2 and 3 for production hardening. 
- ---- - -**Related Documentation**: -- `HETZNER_PRODUCTION_GUIDE.md` - Comprehensive Hetzner deployment guide -- `DOCKER_COMPOSE_PRODUCTION_ARCHITECTURE.md` - Detailed architecture design -- `DOCKER_GUIDE.md` - Docker usage and best practices -- `DEPLOYMENT_HETZNER.md` - Deployment options comparison diff --git a/docs/HETZNER_DEPLOYMENT_SUMMARY.md b/docs/HETZNER_DEPLOYMENT_SUMMARY.md deleted file mode 100644 index b741da0a6..000000000 --- a/docs/HETZNER_DEPLOYMENT_SUMMARY.md +++ /dev/null @@ -1,625 +0,0 @@ -# Hetzner Deployment Summary - Quick Reference - -**Date**: 2025-12-01 -**Status**: Complete Analysis & Documentation -**Action Required**: Fix 4 critical blockers before deployment - ---- - -## Executive Summary - -Your monorepo has **solid Docker foundations** but needs **4 critical fixes** (2-4 hours of work) before production deployment to Hetzner. - -### Current State: ⚠️ Not Production Ready - -**What's Working**: - -- Multi-environment Docker Compose setups ✅ -- 4 containerized backends (auth, chat, picture, manadeck) ✅ -- Health checks and dependency management ✅ -- Security best practices (non-root, Alpine, network isolation) ✅ - -**What Needs Fixing**: - -1. ❌ Missing Prometheus configuration (`docker/prometheus/prometheus.yml`) -2. ❌ Missing Grafana provisioning (`docker/grafana/provisioning/`) -3. ❌ ManaDeck uses Node 18 (should be Node 20) -4. ❌ ManaDeck uses npm instead of pnpm - ---- - -## Quick Start: Get Production Ready in 2-4 Hours - -### Step 1: Fix Critical Blockers (1 hour) - -```bash -# 1. Create monitoring infrastructure -mkdir -p docker/prometheus -mkdir -p docker/grafana/provisioning/{dashboards,datasources} - -# 2. Create Prometheus config -cat > docker/prometheus/prometheus.yml <<'EOF' -global: - scrape_interval: 15s - -scrape_configs: - - job_name: 'prometheus' - static_configs: - - targets: ['localhost:9090'] - - job_name: 'docker' - static_configs: - - targets: ['172.17.0.1:9323'] -EOF - -# 3. 
Create Grafana datasource -cat > docker/grafana/provisioning/datasources/prometheus.yml <<'EOF' -apiVersion: 1 -datasources: - - name: Prometheus - type: prometheus - url: http://prometheus:9090 - isDefault: true -EOF - -# 4. Fix ManaDeck Dockerfile -# Edit apps/manadeck/apps/backend/Dockerfile -# - Change: FROM node:18-alpine → FROM node:20-alpine -# - Replace all "npm" commands with "pnpm" -# - Remove --legacy-peer-deps flag - -# 5. Test locally -pnpm docker:up -``` - -### Step 2: Deploy to Hetzner (1-2 hours) - -```bash -# On Hetzner server (use "Docker CE" app during creation) - -# 1. Run production setup script (see HETZNER_PRODUCTION_GUIDE.md) -curl -o setup.sh https://your-repo/scripts/hetzner-setup.sh -chmod +x setup.sh -./setup.sh - -# 2. Configure environment variables -cd /app -cp .env.production.example .env.production -nano .env.production # Add your secrets - -# 3. Deploy application -docker compose -f docker-compose.production.yml up -d - -# 4. Verify health -curl http://localhost:3001/api/v1/health # mana-core-auth -curl http://localhost:3002/api/health # chat-backend -``` - -### Step 3: Setup Monitoring & Backups (1 hour) - -```bash -# Deploy monitoring stack -docker compose -f docker-compose.monitoring.yml up -d - -# Setup automated backups -apt install borgbackup -./scripts/setup-backups.sh - -# Configure backup cron (daily at 2 AM) -echo "0 2 * * * /usr/local/bin/docker-backup.sh" | crontab - -``` - ---- - -## Recommended Hetzner Setup - -### For Your Monorepo Size (10 backends, 10 web apps) - -**Option 1: Single Server (Development/Staging)** - €28/month - -``` -Server: Hetzner CX33 (4 vCPU, 8GB RAM) -- All services on one server -- Good for staging environment -- ~5-7 concurrent services -``` - -**Option 2: Production HA Setup** - €37/month - -``` -2x Hetzner CPX21 (3 vCPU, 4GB RAM) - €14/month -+ Load Balancer - €5.39/month -+ Volumes (3x 50GB) - €7.50/month -+ Storage Box (500GB) - €10.11/month -``` - -**Option 3: Full Monorepo (All 
Services)** - €166/month - -``` -3x App Servers (CX33) - €84/month -1x DB Server (CX31) - €28/month -Load Balancer - €10/month -Volumes + Storage Box - €44/month - -vs AWS equivalent: $400-600/month -Savings: 60-75% -``` - -**Recommendation**: Start with Option 1 (staging), scale to Option 2 (production) - ---- - -## Cost Breakdown: What You'll Pay Monthly - -### Minimal Production (5 services) - -``` -Server (CPX21): €7.00/month -Volume (50GB): €2.50/month -Storage Box (100GB): €3.81/month -───────────────────────────────────────── -Total: €13.81/month -``` - -### Your Current Setup (Full Monorepo) - -``` -3x Servers (CX33): €84.00/month -1x Database Server: €28.00/month -Load Balancer: €10.00/month -Volumes (5x 100GB): €25.00/month -Storage Box (1TB): €19.00/month -───────────────────────────────────────── -Total: €166.00/month -``` - -**vs AWS/GCP**: Saves 60-75% on infrastructure costs - ---- - -## Architecture Overview - -### Network Isolation (3-Tier) - -``` -┌─────────────────────────────────────────┐ -│ FRONTEND NETWORK │ -│ - Traefik (reverse proxy) │ -│ - Web apps (SvelteKit) │ -│ - Landing pages (Astro) │ -└─────────────────┬───────────────────────┘ - │ -┌─────────────────▼───────────────────────┐ -│ BACKEND NETWORK │ -│ - NestJS backends │ -│ - mana-core-auth │ -│ - API services │ -└─────────────────┬───────────────────────┘ - │ -┌─────────────────▼───────────────────────┐ -│ DATABASE NETWORK (Internal) │ -│ - PostgreSQL │ -│ - Redis │ -│ - No internet access │ -└─────────────────────────────────────────┘ -``` - -### Service Dependency Flow - -``` -PostgreSQL + Redis - ↓ -mana-core-auth (Central Authentication) - ↓ -Backend Services (chat, picture, zitare, presi, manadeck) - ↓ -Web Apps (SvelteKit) - ↓ -Landing Pages (Astro) - ↓ -Traefik (SSL + Reverse Proxy) -``` - ---- - -## Key Files & Locations - -### Documentation (Created Today) - -- `docs/DOCKER_SETUP_ANALYSIS.md` - Complete current state analysis -- `docs/HETZNER_PRODUCTION_GUIDE.md` - 
Comprehensive deployment guide -- `docs/HETZNER_DEPLOYMENT_SUMMARY.md` - This quick reference - -### Existing Documentation - -- `docs/DEPLOYMENT_HETZNER.md` - Deployment options comparison (German) -- `docs/DOCKER_GUIDE.md` - Docker usage guide -- `docs/DEPLOYMENT_ARCHITECTURE.md` - Architecture details - -### Docker Configuration Files - -- `docker-compose.yml` - Full stack with monitoring -- `docker-compose.dev.yml` - Development environment -- `docker-compose.staging.yml` - Staging deployment -- `docker-compose.production.yml` - Production deployment - -### Docker Templates - -- `docker/templates/Dockerfile.nestjs` - NestJS backend template -- `docker/templates/Dockerfile.sveltekit` - SvelteKit web template -- `docker/templates/Dockerfile.astro` - Astro landing page template - -### Active Service Dockerfiles - -- `services/mana-core-auth/Dockerfile` ✅ -- `apps/chat/apps/backend/Dockerfile` ✅ -- `apps/picture/apps/backend/Dockerfile` ✅ -- `apps/manadeck/apps/backend/Dockerfile` ⚠️ Needs fixes - ---- - -## Security Checklist - -### Critical Security Items - -- [ ] **SSH Configuration** - - Disable root login - - Disable password authentication - - SSH keys only - -- [ ] **Firewall Setup** - - Hetzner Cloud Firewall (primary layer) - - UFW on server (secondary layer) - - Allow only ports 22, 80, 443 - -- [ ] **Docker Security** - - Non-root containers - - Docker secrets for production - - Read-only filesystems where possible - - Security updates automated - -- [ ] **Backup Strategy** - - Automated daily backups with Borg - - 7 daily, 4 weekly, 6 monthly retention - - Test restore procedure - ---- - -## Monitoring Stack Components - -### What You Get - -**Metrics Collection**: - -- Prometheus - Time-series metrics database -- cAdvisor - Container resource usage -- Node Exporter - Host system metrics - -**Visualization**: - -- Grafana - Dashboards and alerts -- Pre-built dashboards for Docker, PostgreSQL, Redis - -**Logging**: - -- Loki - Log aggregation -- Promtail 
- Log collection from containers - -**Access**: - -- Grafana UI: `http://your-server:3000` -- Prometheus UI: `http://your-server:9090` - ---- - -## CI/CD Integration - -### GitHub Actions Workflow (Recommended) - -```yaml -# .github/workflows/deploy-hetzner.yml - -on: - push: - branches: [main] - -jobs: - deploy: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - # Build and push to GitHub Container Registry - - name: Build and push - run: | - docker build -t ghcr.io/your-org/service:latest . - docker push ghcr.io/your-org/service:latest - - # Deploy to Hetzner via SSH - - name: Deploy - uses: appleboy/ssh-action@master - with: - host: ${{ secrets.HETZNER_HOST }} - username: deploy - key: ${{ secrets.SSH_PRIVATE_KEY }} - script: | - cd /app - docker compose pull - docker compose up -d --remove-orphans -``` - ---- - -## Common Commands - -### Local Development - -```bash -# Start all services -pnpm docker:up - -# Start specific project -docker compose --profile chat up -d - -# View logs -docker compose logs -f chat-backend - -# Stop everything -docker compose down -``` - -### Production Deployment - -```bash -# Deploy to production -docker compose -f docker-compose.production.yml up -d - -# Check service health -docker compose ps - -# View logs -docker compose logs -f --tail=100 - -# Restart single service -docker compose restart chat-backend - -# Update single service (zero downtime) -docker compose up -d --no-deps chat-backend -``` - -### Monitoring - -```bash -# Check resource usage -docker stats - -# View container health -docker inspect --format='{{.State.Health.Status}}' container-name - -# Access Prometheus -http://localhost:9090 - -# Access Grafana -http://localhost:3000 -``` - -### Backup & Restore - -```bash -# Manual backup -/usr/local/bin/docker-backup.sh - -# List backups -borg list ssh://u123456@u123456.your-storagebox.de:23/./backups - -# Restore from backup -borg extract 
ssh://u123456@u123456.your-storagebox.de:23/./backups::20251201-020000 -``` - ---- - -## Troubleshooting Quick Reference - -### Container Won't Start - -```bash -# View logs -docker logs container-name - -# Check exit code -docker inspect --format='{{.State.ExitCode}}' container-name - -# Run interactively -docker run -it --rm image-name sh -``` - -### High Resource Usage - -```bash -# Check stats -docker stats - -# Check disk usage -docker system df - -# Clean up -docker system prune -a -``` - -### Network Issues - -```bash -# Test connectivity -docker exec container1 ping container2 - -# Check network -docker network inspect manacore-network - -# Restart Docker -systemctl restart docker -``` - -### Health Check Failing - -```bash -# Check health status -docker inspect --format='{{.State.Health}}' container-name - -# View health logs -docker inspect --format='{{range .State.Health.Log}}{{.Output}}{{end}}' container-name - -# Test health endpoint manually -curl http://localhost:3000/health -``` - ---- - -## Next Steps: Priority Order - -### Immediate (Today - 2 hours) - -1. **Fix Critical Blockers** (See Step 1 above) - - Create monitoring configs - - Fix ManaDeck Dockerfile - -2. **Test Locally** - ```bash - pnpm docker:up - docker compose ps # All should be healthy - ``` - -### Short Term (This Week - 4 hours) - -3. **Provision Hetzner Server** - - Choose server type (CX33 recommended for start) - - Select "Docker CE" app during creation - - Configure private network - -4. **Initial Deployment** - - Run production setup script - - Deploy application - - Configure monitoring - -5. **Setup Backups** - - Configure Storage Box - - Initialize Borg repository - - Test restore procedure - -### Medium Term (Next Week - 8 hours) - -6. **CI/CD Pipeline** - - Setup GitHub Actions workflow - - Configure secrets - - Test automated deployment - -7. **Security Hardening** - - Configure Hetzner Cloud Firewall - - Setup fail2ban - - Enable automatic security updates - -8. 
**Load Testing** - - Test with expected load - - Tune resource limits - - Optimize performance - -### Long Term (Ongoing) - -9. **Documentation** - - Create runbooks for common tasks - - Document incident response - - Team training - -10. **Optimization** - - Monitor costs - - Right-size resources - - Implement auto-scaling if needed - ---- - -## Success Metrics - -### How to Know You're Production Ready - -✅ **Infrastructure** - -- [ ] Server accessible via SSH with key authentication -- [ ] Docker and docker-compose installed and working -- [ ] Firewall configured (Hetzner + UFW) -- [ ] Private network configured (if multi-server) - -✅ **Application** - -- [ ] All services start and pass health checks -- [ ] Environment variables properly configured -- [ ] SSL/TLS working (Let's Encrypt) -- [ ] Database migrations run successfully - -✅ **Monitoring** - -- [ ] Prometheus collecting metrics -- [ ] Grafana dashboards accessible -- [ ] Alerts configured and tested -- [ ] Logs centralized in Loki - -✅ **Backups** - -- [ ] Automated daily backups running -- [ ] Storage Box configured -- [ ] Restore procedure tested -- [ ] Retention policy configured - -✅ **CI/CD** - -- [ ] GitHub Actions workflow working -- [ ] Automated deployments successful -- [ ] Rollback procedure tested - ---- - -## Getting Help - -### Documentation References - -- **Current State**: `docs/DOCKER_SETUP_ANALYSIS.md` -- **Complete Guide**: `docs/HETZNER_PRODUCTION_GUIDE.md` -- **Docker Usage**: `docs/DOCKER_GUIDE.md` -- **Options Comparison**: `docs/DEPLOYMENT_HETZNER.md` - -### External Resources - -- [Hetzner Cloud Docs](https://docs.hetzner.com/cloud/) -- [Docker Compose Reference](https://docs.docker.com/compose/) -- [Traefik Documentation](https://doc.traefik.io/traefik/) -- [Prometheus Documentation](https://prometheus.io/docs/) - -### Support Channels - -- Hetzner Support: https://console.hetzner.cloud/ -- Docker Community: https://forums.docker.com/ -- Your Team Documentation: `docs/` 
directory - ---- - -## Summary - -You have: - -- ✅ **Solid foundation** with multi-environment Docker setup -- ✅ **4 containerized services** ready to deploy -- ✅ **Complete documentation** for production deployment -- ⚠️ **4 critical fixes** needed (2-4 hours of work) - -After fixes: - -- 🚀 **2-4 hours** to deploy to Hetzner -- 💰 **€14-166/month** depending on scale (60-75% cheaper than AWS) -- 📊 **Complete monitoring** with Prometheus + Grafana -- 🔒 **Production-grade security** with firewalls and automated backups -- 🔄 **Automated deployments** with GitHub Actions - -**Total time to production**: ~10-15 hours from current state - ---- - -**Document Version**: 1.0 -**Last Updated**: 2025-12-01 -**Next Review**: After first deployment diff --git a/docs/HETZNER_PRODUCTION_GUIDE.md b/docs/HETZNER_PRODUCTION_GUIDE.md deleted file mode 100644 index 6f9b572a7..000000000 --- a/docs/HETZNER_PRODUCTION_GUIDE.md +++ /dev/null @@ -1,2007 +0,0 @@ -# Hetzner Production Deployment Guide - -**Version**: 1.0 -**Last Updated**: 2025-12-01 -**Scope**: Complete production deployment guide for Manacore monorepo on Hetzner Cloud - ---- - -## Table of Contents - -1. [Server Specifications](#1-server-specifications--instance-types) -2. [Network Architecture](#2-network-architecture) -3. [Storage & Backup Strategies](#3-storage--backup-strategies) -4. [Security Hardening](#4-security-hardening-checklist) -5. [Monitoring & Logging](#5-monitoring--logging-solutions) -6. [CI/CD Integration](#6-cicd-integration-patterns) -7. [Cost Optimization](#7-cost-optimization-tips) -8. [Orchestration Choice](#8-orchestration-choice-docker-swarm-vs-kubernetes) -9. [Production Setup Scripts](#9-production-ready-deployment-scripts) -10. [Production Checklist](#10-production-ready-checklist) - ---- - -## 1. 
Server Specifications & Instance Types - -### Recommended Server Types - -#### Entry-Level Production (Small Applications) - -**Hetzner CX23**: 2 vCPUs, 4 GB RAM, 40 GB storage, 20 TB traffic - -- **Price**: €3.49/month -- **Use Case**: Single container apps, development/staging environments -- **Suitable For**: Individual microservices, low-traffic applications - -#### Mid-Tier Production (Standard Applications) - -**Hetzner CPX21**: 3 shared vCPUs, 4 GB RAM, 80 GB storage - -- **Price**: ~€7/month -- **Use Case**: Multi-container applications, small microservices -- **Best For**: 2-3 backend services + web apps - -**Hetzner CX33**: 2 vCPUs, 8 GB RAM, 80 GB storage, 20 TB traffic - -- **Price**: €5.49/month -- **Use Case**: Standard production workloads -- **Best For**: Full stack with 5-6 services - -#### High-Performance Production - -**CCX Series**: Dedicated vCPUs for CPU-intensive workloads - -- **CCX42**: 16 vCPU, 64 GB RAM - €101/month -- **Use Case**: High-traffic applications, full monorepo deployment -- **Best For**: 10+ services with monitoring stack - -**CAX ARM Series**: 40% better cost efficiency - -- **CAX21**: 4 ARM vCPUs, 8 GB RAM - ~€8/month -- **Use Case**: ARM-compatible Docker images -- **Benefit**: Better performance-per-euro - -### ARM vs x86 Considerations - -**ARM64 (CAX) Advantages**: - -- 40% cost savings -- Better performance-per-euro -- Modern Docker images support ARM64 - -**Compatibility Check**: - -- Node.js: ✅ Full ARM64 support -- Python: ✅ Full ARM64 support -- Go: ✅ Native ARM64 -- PostgreSQL: ✅ Official ARM images -- Redis: ✅ Official ARM images - -**Check Your Dependencies**: - -```bash -# Test ARM compatibility locally (M1/M2 Mac) -docker buildx build --platform linux/arm64 . - -# Or on AMD64 with QEMU -docker run --rm --privileged multiarch/qemu-user-static --reset -p yes -docker buildx build --platform linux/arm64 . 
-``` - -### Installation Method - -**Recommended**: Use **Docker CE App** from Hetzner Cloud Apps during server creation. - -**Benefits**: - -- Docker and docker-compose pre-installed -- Optimized for Hetzner infrastructure -- Eliminates manual installation errors - -**Alternative** (Manual Installation): - -```bash -curl -fsSL https://get.docker.com -o get-docker.sh -sh get-docker.sh -rm get-docker.sh -``` - ---- - -## 2. Network Architecture - -### Private Networks - -**Architecture Overview**: - -``` -┌─────────────────┐ ┌─────────────────┐ -│ Web Server │────▶│ App Server │ -│ (Public IP) │ │ (Private only) │ -│ - Traefik │ │ - Backends │ -│ - Web Apps │ │ - Processing │ -└─────────────────┘ └─────────────────┘ - │ │ - └───────────┬───────────┘ - │ - ┌──────▼──────┐ - │ Database │ - │ (Private) │ - │ - PostgreSQL│ - │ - Redis │ - └─────────────┘ -``` - -### Best Practices - -**1. Configure Private Networks BEFORE Docker Installation** - -```bash -# Create private network via Hetzner Console or CLI -hcloud network create --name production-network --ip-range 10.0.0.0/16 - -# Create subnet -hcloud network add-subnet production-network --network-zone eu-central --type server --ip-range 10.0.1.0/24 - -# Attach servers to network -hcloud server attach-to-network --network production-network --ip 10.0.1.2 -``` - -**2. Docker Daemon Configuration for Private Networks** - -**MTU for Private Networks**: 1450 bytes (Hetzner requirement) - -```json -// /etc/docker/daemon.json -{ - "mtu": 1450, - "default-address-pools": [{ "base": "172.17.0.0/12", "size": 24 }], - "live-restore": true, - "userland-proxy": false, - "no-new-privileges": true, - "icc": false -} -``` - -**Apply Configuration**: - -```bash -systemctl restart docker -``` - -**3. 
Network Isolation Strategy** - -- **Public Network**: Only expose necessary services (web apps, APIs) -- **Private Network**: All inter-service communication (backends, databases) -- **Hetzner Cloud Firewall**: Primary security layer -- **UFW (Secondary)**: Host-level firewall - -### Floating IPs (High Availability) - -**Use Cases**: - -- High availability setups -- Zero-downtime deployments -- Failover scenarios - -**Implementation with Docker Swarm**: - -```bash -# Create floating IP -hcloud floating-ip create --type ipv4 --name production-lb --home-location nbg1 - -# Assign to server -hcloud floating-ip assign - -# Docker service for IP management -docker service create \ - --name ip-floater \ - --mode global \ - --constraint 'node.role==manager' \ - --mount type=bind,src=/var/run/docker.sock,dst=/var/run/docker.sock \ - -e HCLOUD_TOKEN=${HCLOUD_TOKEN} \ - -e FLOATING_IP=${FLOATING_IP} \ - costela/hetzner-ip-floater:latest -``` - -### Load Balancers - -**Hetzner Cloud Load Balancer**: - -- **Protocol Support**: TCP, HTTP, HTTPS (HTTP/2 by default) -- **Health Checks**: Active and passive monitoring -- **Instant Configuration**: Changes apply immediately -- **Proxy Protocol**: Preserve client IP addresses -- **Pricing**: Starting at €5.39/month - -**Recommended Architecture**: - -``` -Internet → Hetzner LB → Private Network → Docker Containers -``` - -**Configuration Options**: - -1. **Direct Binding**: App containers bind to private IPs - - ```yaml - services: - web: - networks: - - private - ports: - - '10.0.1.2:3000:3000' - ``` - -2. **Traefik Reverse Proxy**: LB routes to Traefik on Docker Swarm - - ```yaml - services: - traefik: - ports: - - '80:80' - - '443:443' - networks: - - public - - private - ``` - -3. **Kubernetes Ingress**: Automatic LB provisioning - ```yaml - apiVersion: v1 - kind: Service - metadata: - annotations: - load-balancer.hetzner.cloud/location: nbg1 - spec: - type: LoadBalancer - ``` - ---- - -## 3. 
Storage & Backup Strategies - -### Block Storage Volumes - -**Characteristics**: - -- Attach to **single server only** (not shared) -- ext4 or xfs filesystems (ext4 recommended) -- Up to 10 TB per volume -- Hot-attach/detach support -- **€0.05/GB/month** pricing - -**Docker Volume Best Practices**: - -```bash -# 1. Create and format volume (first time) -mkfs.ext4 -F /dev/disk/by-id/scsi-0HC_Volume_12345 - -# 2. Mount volume to dedicated path -mkdir -p /mnt/volumes/data -mount /dev/disk/by-id/scsi-0HC_Volume_12345 /mnt/volumes/data - -# 3. Add to /etc/fstab for persistence -echo '/dev/disk/by-id/scsi-0HC_Volume_12345 /mnt/volumes/data ext4 discard,nofail,defaults 0 0' >> /etc/fstab - -# 4. Test auto-mount -umount /mnt/volumes/data -mount -a -``` - -**Docker Compose Usage**: - -```yaml -volumes: - app-data: - driver: local - driver_opts: - type: none - o: bind - device: /mnt/volumes/data -``` - -### ⚠️ Critical: Hetzner Does NOT Provide Volume Backups - -**You MUST implement your own backup solution** - -### Backup Strategy - -#### Option 1: Borg Backup with Storage Box (Recommended) - -**Why Borg?** - -- Deduplication (saves space) -- Compression (lz4, zstd) -- Encryption (AES-256) -- Incremental backups -- Fast recovery - -**Setup**: - -```bash -# 1. Install Borg -apt install borgbackup - -# 2. Initialize repository on Storage Box -borg init --encryption=repokey \ - ssh://u123456@u123456.your-storagebox.de:23/./backups - -# Store passphrase securely -echo "your-encryption-passphrase" > /root/.borg-passphrase -chmod 600 /root/.borg-passphrase - -# 3. 
Create backup script -cat > /usr/local/bin/docker-backup.sh <<'EOF' -#!/bin/bash -set -e - -BORG_REPO="ssh://u123456@u123456.your-storagebox.de:23/./backups" -export BORG_PASSPHRASE=$(cat /root/.borg-passphrase) - -# Stop containers for consistency (optional) -# docker-compose -f /app/docker-compose.yml stop - -# Create backup -borg create --stats --compression lz4 \ - $BORG_REPO::$(date +%Y%m%d-%H%M%S) \ - /mnt/volumes/data \ - /var/lib/docker/volumes - -# Prune old backups -borg prune \ - --keep-daily=7 \ - --keep-weekly=4 \ - --keep-monthly=6 \ - $BORG_REPO - -# Restart containers -# docker-compose -f /app/docker-compose.yml start - -echo "Backup completed successfully" -EOF - -chmod +x /usr/local/bin/docker-backup.sh - -# 4. Schedule with cron (daily at 2 AM) -echo "0 2 * * * /usr/local/bin/docker-backup.sh >> /var/log/backup.log 2>&1" | crontab - -``` - -**Restore**: - -```bash -# List backups -borg list ssh://u123456@u123456.your-storagebox.de:23/./backups - -# Restore specific backup -borg extract ssh://u123456@u123456.your-storagebox.de:23/./backups::20251201-020000 -``` - -#### Option 2: Restic (Alternative) - -```bash -# Install Restic -apt install restic - -# Initialize repository -restic -r sftp:u123456@u123456.your-storagebox.de:backups init - -# Create backup -restic -r sftp:u123456@u123456.your-storagebox.de:backups \ - backup /mnt/volumes/data - -# Restore -restic -r sftp:u123456@u123456.your-storagebox.de:backups \ - restore latest --target /mnt/volumes/data -``` - -#### Option 3: Database-Specific Backups - -**PostgreSQL**: - -```bash -#!/bin/bash -# /usr/local/bin/postgres-backup.sh - -BACKUP_DIR="/backup/postgres" -DATE=$(date +%Y%m%d-%H%M%S) - -mkdir -p $BACKUP_DIR - -# Dump all databases -docker exec postgres pg_dumpall -U manacore | \ - gzip > $BACKUP_DIR/all-databases-$DATE.sql.gz - -# Retain last 7 days -find $BACKUP_DIR -name "*.sql.gz" -mtime +7 -delete - -echo "PostgreSQL backup completed: $DATE" -``` - -**Redis**: - -```bash 
-#!/bin/bash -# Redis automatically creates dump.rdb and appendonly.aof -# Just backup these files - -cp /var/lib/docker/volumes/redis-data/_data/dump.rdb \ - /backup/redis/dump-$(date +%Y%m%d).rdb -``` - -**Schedule Both**: - -```cron -# /etc/cron.d/database-backups -0 3 * * * root /usr/local/bin/postgres-backup.sh >> /var/log/postgres-backup.log 2>&1 -30 3 * * * root /usr/local/bin/redis-backup.sh >> /var/log/redis-backup.log 2>&1 -``` - -### Storage Box Usage - -**Hetzner Storage Box** (NOT for Docker Images): - -- **Remote storage via**: CIFS/SMB, SSHFS, SFTP, Borg -- **Pricing**: Starting at €3.81/month for 100 GB -- **Best For**: Backups, media files, logs - -**Critical Warning**: - -❌ **DO NOT store Docker images on Storage Box** - -- Causes instability (storage can disconnect) -- Docker requires 100% available storage -- Use only for application data, NOT `/var/lib/docker` - -**Safe Usage Pattern** (Application Uploads): - -```yaml -# docker-compose.yml -volumes: - uploads: - driver: local - driver_opts: - type: cifs - o: 'username=u123456,password=${STORAGE_BOX_PASSWORD},addr=u123456.your-storagebox.de' - device: '//u123456.your-storagebox.de/uploads' -``` - ---- - -## 4. Security Hardening Checklist - -### Initial Server Setup - -#### 1. SSH Hardening - -```bash -# Disable root login -sed -i 's/#\?PermitRootLogin.*/PermitRootLogin no/' /etc/ssh/sshd_config - -# Disable password authentication (SSH keys only) -sed -i 's/#\?PasswordAuthentication.*/PasswordAuthentication no/' /etc/ssh/sshd_config - -# Create sudo user -adduser deploy -usermod -aG sudo deploy -usermod -aG docker deploy - -# Setup SSH keys -mkdir -p /home/deploy/.ssh -cp ~/.ssh/authorized_keys /home/deploy/.ssh/ -chown -R deploy:deploy /home/deploy/.ssh -chmod 700 /home/deploy/.ssh -chmod 600 /home/deploy/.ssh/authorized_keys - -# Restart SSH -systemctl restart sshd -``` - -#### 2. 
Firewall Configuration (Defense in Depth) - -**Layer 1: Hetzner Cloud Firewall** (Primary): - -```bash -# Create firewall via Hetzner CLI -hcloud firewall create --name production - -# Allow SSH (from specific IPs only - replace with your IP) -hcloud firewall add-rule production \ - --direction in \ - --protocol tcp \ - --port 22 \ - --source-ips YOUR_IP/32 - -# Allow HTTP/HTTPS from anywhere -hcloud firewall add-rule production \ - --direction in \ - --protocol tcp \ - --port 80 \ - --source-ips 0.0.0.0/0,::/0 - -hcloud firewall add-rule production \ - --direction in \ - --protocol tcp \ - --port 443 \ - --source-ips 0.0.0.0/0,::/0 - -# Apply to server -hcloud firewall apply-to-resource production \ - --type server \ - --server web-01 -``` - -**Layer 2: UFW** (Secondary, Host-Level): - -```bash -# Install UFW -apt install ufw - -# Default policies -ufw default deny incoming -ufw default allow outgoing - -# Allow SSH, HTTP, HTTPS -ufw allow 22/tcp -ufw allow 80/tcp -ufw allow 443/tcp - -# Allow Docker Swarm (if using) -ufw allow 2377/tcp # Cluster management -ufw allow 7946/tcp # Node communication -ufw allow 7946/udp # Node communication -ufw allow 4789/udp # Overlay network - -# Enable firewall -ufw enable - -# Check status -ufw status verbose -``` - -#### 3. Docker-Specific Security - -```json -// /etc/docker/daemon.json -{ - "live-restore": true, - "userland-proxy": false, - "no-new-privileges": true, - "icc": false, - "log-driver": "json-file", - "log-opts": { - "max-size": "10m", - "max-file": "3" - }, - "metrics-addr": "127.0.0.1:9323", - "experimental": true -} -``` - -**Docker Compose Security**: - -```yaml -services: - app: - image: myapp:latest - read_only: true - security_opt: - - no-new-privileges:true - cap_drop: - - ALL - cap_add: - - NET_BIND_SERVICE - tmpfs: - - /tmp:noexec,nosuid,size=100m - user: '1000:1000' -``` - -#### 4. 
Fail2ban Configuration - -```bash -apt install fail2ban - -# Create local config -cat > /etc/fail2ban/jail.local < 0.9 - for: 5m - labels: - severity: warning - annotations: - summary: 'High memory usage on {{ $labels.name }}' - description: 'Container {{ $labels.name }} memory usage is above 90%.' - - - alert: HighCPUUsage - expr: rate(container_cpu_usage_seconds_total[5m]) > 0.8 - for: 5m - labels: - severity: warning - annotations: - summary: 'High CPU usage on {{ $labels.name }}' - description: 'Container {{ $labels.name }} CPU usage is above 80%.' - - - name: host - interval: 30s - rules: - - alert: HostOutOfDiskSpace - expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.1 - for: 5m - labels: - severity: critical - annotations: - summary: 'Host out of disk space' - description: 'Disk space is below 10%.' - - - alert: HostHighCPULoad - expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80 - for: 5m - labels: - severity: warning - annotations: - summary: 'Host high CPU load' - description: 'CPU load is > 80%.' 
-``` - -### Hetzner-Specific Monitoring - -**Hetzner Cloud Exporter** (Monitor Hetzner Resources): - -```bash -docker run -d \ - --name hcloud-exporter \ - -p 9501:9501 \ - -e HCLOUD_TOKEN=${HCLOUD_TOKEN} \ - promhippie/hcloud_exporter:latest -``` - -**Add to Prometheus**: - -```yaml -scrape_configs: - - job_name: 'hetzner-cloud' - static_configs: - - targets: ['hcloud-exporter:9501'] -``` - -**Available Grafana Dashboards**: - -- **Hetzner Cloud Servers**: Dashboard ID 16169 -- **Hetzner Cloud Servers & Load Balancers**: Dashboard ID 20257 - -### Log Management - -**Loki Configuration** (`docker/loki/loki-config.yml`): - -```yaml -auth_enabled: false - -server: - http_listen_port: 3100 - -ingester: - lifecycler: - address: 127.0.0.1 - ring: - kvstore: - store: inmemory - replication_factor: 1 - final_sleep: 0s - chunk_idle_period: 5m - chunk_retain_period: 30s - -schema_config: - configs: - - from: 2020-05-15 - store: boltdb - object_store: filesystem - schema: v11 - index: - prefix: index_ - period: 168h - -storage_config: - boltdb: - directory: /loki/index - filesystem: - directory: /loki/chunks - -limits_config: - enforce_metric_name: false - reject_old_samples: true - reject_old_samples_max_age: 168h - -chunk_store_config: - max_look_back_period: 0s - -table_manager: - retention_deletes_enabled: true - retention_period: 720h -``` - -**Promtail Configuration** (`docker/promtail/promtail-config.yml`): - -```yaml -server: - http_listen_port: 9080 - grpc_listen_port: 0 - -positions: - filename: /tmp/positions.yaml - -clients: - - url: http://loki:3100/loki/api/v1/push - -scrape_configs: - - job_name: system - static_configs: - - targets: - - localhost - labels: - job: varlogs - __path__: /var/log/**/*.log - - - job_name: docker - static_configs: - - targets: - - localhost - labels: - job: docker - __path__: /var/lib/docker/containers/**/*.log -``` - -**Deploy Monitoring Stack**: - -```bash -# Start monitoring services -docker compose -f 
docker-compose.monitoring.yml up -d - -# Check status -docker compose -f docker-compose.monitoring.yml ps - -# Access Grafana -http://your-server-ip:3000 -``` - ---- - -## 6. CI/CD Integration Patterns - -### GitHub Actions with Hetzner Cloud - -#### Option 1: Deploy to Existing Server (Recommended) - -**Workflow**: `.github/workflows/deploy-hetzner.yml` - -```yaml -name: Deploy to Hetzner - -on: - push: - branches: [main] - workflow_dispatch: - -env: - REGISTRY: ghcr.io - IMAGE_NAME: ${{ github.repository }} - -jobs: - build-and-push: - runs-on: ubuntu-latest - permissions: - contents: read - packages: write - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Log in to Container Registry - uses: docker/login-action@v3 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Extract metadata - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} - tags: | - type=sha - type=ref,event=branch - type=semver,pattern={{version}} - - - name: Build and push Docker images - uses: docker/build-push-action@v5 - with: - context: . 
- file: ./services/mana-core-auth/Dockerfile - push: true - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - cache-from: type=gha - cache-to: type=gha,mode=max - - deploy: - needs: build-and-push - runs-on: ubuntu-latest - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Deploy to Hetzner - uses: appleboy/ssh-action@master - with: - host: ${{ secrets.HETZNER_HOST }} - username: deploy - key: ${{ secrets.SSH_PRIVATE_KEY }} - script: | - cd /app - - # Pull latest images - docker compose -f docker-compose.production.yml pull - - # Rolling update (zero downtime) - docker compose -f docker-compose.production.yml up -d --remove-orphans - - # Run migrations if needed - docker compose -f docker-compose.production.yml exec -T mana-core-auth pnpm migration:run || true - - # Health check - sleep 10 - curl -f http://localhost:3001/api/v1/health || exit 1 - - echo "Deployment completed successfully" - - - name: Notify on failure - if: failure() - uses: 8398a7/action-slack@v3 - with: - status: ${{ job.status }} - text: 'Deployment to Hetzner failed!' 
- webhook_url: ${{ secrets.SLACK_WEBHOOK }} -``` - -#### Option 2: Self-Hosted GitHub Runner on Hetzner - -**Benefits**: - -- 3-10x cheaper than GitHub-hosted runners -- Faster builds with persistent caching -- Full control over environment - -**Setup**: - -```bash -# On Hetzner server -cd /opt -mkdir actions-runner && cd actions-runner - -# Download runner (check latest version) -curl -o actions-runner-linux-x64-2.311.0.tar.gz -L \ - https://github.com/actions/runner/releases/download/v2.311.0/actions-runner-linux-x64-2.311.0.tar.gz - -tar xzf actions-runner-linux-x64-2.311.0.tar.gz - -# Configure (get token from GitHub repo settings) -./config.sh --url https://github.com/your-org/manacore-monorepo --token YOUR_TOKEN - -# Install as service -sudo ./svc.sh install -sudo ./svc.sh start -``` - -**Use in Workflow**: - -```yaml -jobs: - deploy: - runs-on: self-hosted - steps: - - uses: actions/checkout@v4 - - run: docker compose up -d -``` - -⚠️ **Important**: Hetzner bills per hour, not per minute. A 30-second run costs the same as a 1-hour run. 
- -### Docker Registry Options - -#### Option 1: GitHub Container Registry (Recommended) - -```yaml -- name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - -- name: Build and Push - uses: docker/build-push-action@v5 - with: - push: true - tags: ghcr.io/${{ github.repository }}:latest -``` - -#### Option 2: Docker Hub - -```yaml -- name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} -``` - -#### Option 3: Self-Hosted Harbor Registry - -```bash -# Deploy Harbor on Hetzner -docker compose -f harbor-docker-compose.yml up -d -``` - -### Deployment Strategies - -#### Blue-Green Deployment - -```yaml -- name: Blue-Green Deploy - run: | - ssh deploy@${{ secrets.HETZNER_HOST }} << 'EOF' - cd /app - - # Start green environment - docker compose -f docker-compose.green.yml up -d - - # Wait for health checks - sleep 30 - - # Switch traffic (update nginx/traefik config) - sudo mv /etc/nginx/sites-enabled/blue.conf /etc/nginx/sites-enabled/blue.conf.bak - sudo mv /etc/nginx/sites-enabled/green.conf.new /etc/nginx/sites-enabled/green.conf - sudo nginx -s reload - - # Stop blue environment - docker compose -f docker-compose.blue.yml down - EOF -``` - -#### Rolling Update (Docker Swarm) - -```yaml -- name: Deploy to Swarm - run: | - ssh deploy@${{ secrets.HETZNER_HOST }} << 'EOF' - docker service update \ - --image ghcr.io/your-org/myapp:${{ github.sha }} \ - --update-parallelism 2 \ - --update-delay 10s \ - --update-failure-action rollback \ - myapp - EOF -``` - ---- - -## 7. 
Cost Optimization Tips - -### Server Right-Sizing - -**Progressive Scaling Strategy**: - -``` -Development/Testing: CX11 (€3.92/month) - ↓ -Staging: CX23 (€3.49/month) - ↓ -Production (Small): CPX21 (€7/month) - ↓ -Production (Medium): CX33 (€28/month) - ↓ -Production (Large): CCX42 (€101/month) -``` - -**Cost Calculator**: https://costgoat.com/pricing/hetzner - -### Resource Optimization Strategies - -#### 1. Use ARM Servers (CAX Series) - -**Cost Savings**: 40% lower operational costs vs x86 - -**Example**: - -- **CX21** (x86): 2 vCPU, 4GB RAM - €6/month -- **CAX21** (ARM): 4 vCPU, 8GB RAM - ~€8/month -- **Better**: More CPUs, more RAM, same price range - -**Requirements**: - -- ARM64-compatible Docker images -- Test thoroughly before production migration - -#### 2. Implement Auto-Scaling with Hetzner API - -```bash -#!/bin/bash -# auto-scale.sh - -LOAD=$(uptime | awk -F'load average:' '{print $2}' | cut -d, -f1 | xargs) -THRESHOLD=4.0 - -if (( $(echo "$LOAD > $THRESHOLD" | bc -l) )); then - # Scale up - create new server - hcloud server create \ - --type cpx21 \ - --name web-$(date +%s) \ - --image docker-ce \ - --ssh-key default - - echo "Scaled up due to load: $LOAD" -else - echo "Load normal: $LOAD" -fi -``` - -#### 3. Volume Management - -```bash -#!/bin/bash -# cleanup-volumes.sh - -# List detached volumes -hcloud volume list -o json | jq -r '.[] | select(.server == null) | .id' - -# Delete old snapshots (>30 days) -hcloud snapshot list -o json | \ - jq -r '.[] | select(.created | fromdateiso8601 < now - 2592000) | .id' | \ - xargs -I {} hcloud snapshot delete {} -``` - -**Cost Impact**: - -- Volumes: €0.05/GB/month (even when detached) -- Snapshots: €0.01/GB/month -- Storage Box: €0.04/GB/month (cheaper for cold storage) - -#### 4. 
Network Traffic Optimization - -**Included Traffic**: 20 TB/month (most plans) -**Additional Traffic**: €1.19/TB - -**Optimization**: - -- Use private networks for inter-server communication (free) -- Enable compression in Nginx/Traefik -- Serve static assets from CDN (Cloudflare free) - -```nginx -# Enable gzip compression -gzip on; -gzip_vary on; -gzip_min_length 1024; -gzip_types text/plain text/css text/xml application/json application/javascript; -``` - -#### 5. Load Balancer Optimization - -**Pricing**: - -- Small LB (5K connections): €5.39/month -- Large LB (40K connections): €15.49/month - -**When to Use**: - -- Multi-server setups only -- For single server, use Nginx/Traefik directly (no LB cost) - -#### 6. Monitoring Costs - -**Self-Hosted** (Prometheus + Grafana): - -- Cost: ~€0/month (runs on same server) -- Overhead: ~200MB RAM -- No external service fees - -**External Monitoring** (Datadog, New Relic): - -- Cost: $20-50+/month per host -- Only if specific features required - -### Total Cost Examples - -#### Single App Deployment (Minimal) - -``` -Server (CPX21): €7.00/month -Volume (50GB): €2.50/month -Snapshot (weekly, 10GB): €0.50/month -Storage Box (100GB backup): €3.81/month -───────────────────────────────────────── -Total: €13.81/month -``` - -#### High-Availability Setup (Production) - -``` -2x Servers (CPX21): €14.00/month -Load Balancer (small): €5.39/month -3x Volumes (50GB each): €7.50/month -Storage Box (500GB backup): €10.11/month -Private Network: €0.00/month (free) -Cloud Firewall: €0.00/month (free) -───────────────────────────────────────── -Total: €37.00/month -``` - -#### Full Monorepo Deployment (All Services) - -``` -3x App Servers (CX33): €84.00/month -1x DB Server (CX31): €28.00/month -Load Balancer (medium): €10.00/month -5x Volumes (100GB each): €25.00/month -Storage Box (1TB backup): €19.00/month -Private Network: €0.00/month -Cloud Firewall: €0.00/month -───────────────────────────────────────── -Total: €166.00/month - 
-Equivalent on AWS: $400-600/month -Savings: 60-75% -``` - -### Cost Monitoring - -**Track Usage with Hetzner API**: - -```bash -#!/bin/bash -# cost-report.sh - -# Get current month billing -YEAR_MONTH=$(date +%Y-%m) -hcloud billing get-month $YEAR_MONTH | jq - -# Example output: -# { -# "from": "2025-12-01", -# "to": "2025-12-31", -# "total_net": "45.67", -# "total_gross": "54.35" -# } -``` - -**Set Billing Alerts** (via Hetzner Console): - -- Alert at €50 -- Alert at €100 -- Alert at €150 - -### Cost Optimization Checklist - -- [ ] Start with smaller server types -- [ ] Evaluate CAX ARM servers for 40% savings -- [ ] Use private networks for inter-server traffic (free) -- [ ] Delete unused volumes and snapshots regularly -- [ ] Use Storage Box for backups (cheaper than volumes) -- [ ] Implement auto-scaling for variable workloads -- [ ] Monitor resource usage and right-size servers -- [ ] Use Hetzner's included 20TB/month traffic -- [ ] Self-host monitoring (Prometheus/Grafana) -- [ ] Regular cost audits with billing API - ---- - -## 8. 
Orchestration Choice: Docker Swarm vs Kubernetes - -### When to Use Docker Swarm - -**Best For**: - -- Small to medium deployments (<50 nodes) -- Teams familiar with Docker Compose -- Quick setup requirements (<30 minutes to production) -- Simple applications without complex networking -- Projects prioritizing simplicity over features - -**Advantages**: - -- Native Docker integration (same CLI) -- Easy migration from docker-compose -- Lower learning curve -- Faster deployment times -- Lower resource overhead (~100MB vs ~1GB for K8s) - -**Hetzner Setup**: - -```bash -# Initialize swarm on manager node -docker swarm init --advertise-addr 10.0.1.2 - -# Join worker nodes -docker swarm join --token <worker-join-token> 10.0.1.2:2377 - -# Deploy stack -docker stack deploy -c docker-compose.yml manacore - -# Scale service -docker service scale manacore_chat-backend=3 - -# Rolling update -docker service update \ - --image ghcr.io/org/chat-backend:v2 \ - manacore_chat-backend -``` - -### When to Use Kubernetes (k3s) - -**Best For**: - -- Medium to large deployments (>20 nodes) -- Complex microservices architectures -- Need for advanced networking (service mesh) -- Teams requiring extensive ecosystem tools -- Enterprise compliance requirements - -**Advantages on Hetzner**: - -- k3s optimized for Hetzner's cost structure -- 40% lower costs vs MicroK8s -- Production-grade availability -- Extensive ecosystem (Helm, operators, etc.) 
-- Better for multi-tenant applications - -**k3s Recommended** over full Kubernetes: - -- 50% less memory usage -- Single binary installation -- Hetzner-specific tooling available - -### Quick Comparison - -| Factor | Docker Swarm | k3s on Hetzner | -| -------------------------- | ---------------- | ------------------------------- | -| **Setup Time** | 15 minutes | 30-60 minutes | -| **Learning Curve** | Low | Medium | -| **Resource Overhead** | Minimal (~100MB) | Low (~500MB) | -| **Ecosystem** | Limited | Extensive | -| **Cost (3 nodes)** | ~€21/month | ~€21/month | -| **Operational Complexity** | Lower | Higher | -| **Max Scale** | ~50 nodes | 1000+ nodes | -| **Auto-Scaling** | Manual | HPA (Horizontal Pod Autoscaler) | -| **Service Mesh** | No | Yes (Linkerd, Istio) | - -### Recommendation for Manacore Monorepo - -**Start with Docker Swarm**, then migrate to k3s if needed: - -**Rationale**: - -1. **Faster Time to Market**: 15-minute setup vs 1+ week for K8s -2. **Lower Complexity**: Existing Docker Compose knowledge sufficient -3. **Cost Effective**: Same infrastructure cost, lower ops overhead -4. **Sufficient for 90% of Use Cases**: <50 services, <100K requests/day - -**Migration Path**: - -``` -Docker Compose (Development) - ↓ -Docker Swarm (Production) - ↓ -k3s/Kubernetes (if scaling beyond 50 nodes) -``` - ---- - -## 9. Production-Ready Deployment Scripts - -### Complete Server Setup Script - -```bash -#!/bin/bash -# hetzner-production-setup.sh -# Complete Hetzner production setup automation - -set -e - -echo "=== Hetzner Docker Production Setup ===" - -# Configuration -DEPLOY_USER="deploy" -DOCKER_VERSION="24.0" -SERVER_IP=$(curl -s ifconfig.me) - -# Colors for output -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -NC='\033[0m' # No Color - -log_info() { echo -e "${GREEN}[INFO]${NC} $1"; } -log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; } -log_error() { echo -e "${RED}[ERROR]${NC} $1"; exit 1; } - -# 1. 
System Update -log_info "Updating system packages..." -apt update && apt upgrade -y || log_error "System update failed" - -# 2. Install Docker (if not pre-installed) -if ! command -v docker &> /dev/null; then - log_info "Installing Docker..." - curl -fsSL https://get.docker.com -o get-docker.sh - sh get-docker.sh - rm get-docker.sh -else - log_info "Docker already installed: $(docker --version)" -fi - -# 3. Install Docker Compose -if ! command -v docker-compose &> /dev/null; then - log_info "Installing Docker Compose..." - apt install -y docker-compose-plugin -fi - -# 4. Create deploy user -if ! id "$DEPLOY_USER" &> /dev/null; then - log_info "Creating deploy user..." - adduser --disabled-password --gecos "" $DEPLOY_USER - usermod -aG sudo,docker $DEPLOY_USER - - # Setup SSH keys - mkdir -p /home/$DEPLOY_USER/.ssh - if [ -f /root/.ssh/authorized_keys ]; then - cp /root/.ssh/authorized_keys /home/$DEPLOY_USER/.ssh/ - chown -R $DEPLOY_USER:$DEPLOY_USER /home/$DEPLOY_USER/.ssh - chmod 700 /home/$DEPLOY_USER/.ssh - chmod 600 /home/$DEPLOY_USER/.ssh/authorized_keys - log_info "SSH keys copied for $DEPLOY_USER" - fi -else - log_info "User $DEPLOY_USER already exists" -fi - -# 5. Configure Docker daemon -log_info "Configuring Docker daemon..." -cat > /etc/docker/daemon.json < /etc/fail2ban/jail.local < /opt/monitoring/prometheus/prometheus.yml < /opt/monitoring/grafana/provisioning/datasources/prometheus.yml < /etc/logrotate.d/docker-containers < /dev/null; then - log_info "✓ $SERVICE_NAME is healthy" - else - log_error "✗ $SERVICE_NAME health check failed" - fi -done - -# Clean up old images -log_info "Cleaning up old Docker images..." -docker image prune -f - -log_info "Deployment completed successfully!" -``` - ---- - -## 10. 
Production-Ready Checklist - -### Infrastructure - -- [ ] **Server Provisioned**: Appropriate Hetzner server type selected -- [ ] **Private Network Configured**: 10.0.0.0/16 network created -- [ ] **Floating IP Setup** (if HA required) -- [ ] **Load Balancer Configured** (if multi-server) -- [ ] **Volumes Mounted**: Block storage attached and formatted -- [ ] **Hetzner Cloud Firewall**: Rules configured with IP restrictions -- [ ] **DNS Records**: A/AAAA records pointing to server IP - -### Storage & Backup - -- [ ] **Volumes Mounted**: Attached to `/mnt/volumes/*` -- [ ] **Storage Box Configured**: Access credentials set -- [ ] **Borg Backup Setup**: Repository initialized -- [ ] **Automated Backups**: Cron job scheduled (daily at 2 AM) -- [ ] **Database Backups**: PostgreSQL/Redis backup scripts created -- [ ] **Backup Testing**: Restore procedure tested and documented -- [ ] **Retention Policy**: Old backups pruned (7 days, 4 weeks, 6 months) - -### Security - -- [ ] **SSH Key-Only Authentication**: Password auth disabled -- [ ] **Root Login Disabled**: PermitRootLogin no -- [ ] **UFW Configured**: Host-level firewall enabled -- [ ] **fail2ban Installed**: Brute force protection active -- [ ] **Automatic Security Updates**: unattended-upgrades enabled -- [ ] **Docker Secrets**: Production secrets stored securely -- [ ] **Containers Run as Non-Root**: All services use unprivileged users -- [ ] **SSL/TLS Configured**: Let's Encrypt certificates active -- [ ] **Security Scanning**: Trivy/Hadolint integrated in CI/CD - -### Monitoring - -- [ ] **Prometheus Deployed**: Metrics collection running -- [ ] **Grafana Deployed**: Dashboards configured -- [ ] **cAdvisor Running**: Container metrics available -- [ ] **Node Exporter Running**: Host metrics collected -- [ ] **Loki + Promtail**: Centralized logging active -- [ ] **Hetzner Cloud Exporter** (optional): Cloud resource monitoring -- [ ] **Alert Rules Configured**: Critical alerts defined -- [ ] **Alert 
Notifications**: Email/Slack notifications working -- [ ] **Health Checks**: All services have health endpoints - -### Deployment - -- [ ] **Docker Compose Files**: Production files tested -- [ ] **Environment Variables**: Secrets properly configured -- [ ] **CI/CD Pipeline**: GitHub Actions workflow working -- [ ] **Docker Registry**: Images pushed to registry -- [ ] **Deployment Strategy**: Blue-green or rolling updates defined -- [ ] **Rollback Procedure**: Tested and documented -- [ ] **Health Checks**: Pre-deployment and post-deployment checks - -### Documentation - -- [ ] **Deployment Runbook**: Step-by-step deployment guide -- [ ] **Rollback Procedure**: Emergency rollback documented -- [ ] **Disaster Recovery Plan**: Complete recovery steps -- [ ] **On-Call Procedures**: Incident response playbook -- [ ] **Architecture Diagram**: Current infrastructure documented -- [ ] **Access Documentation**: Server access, credentials locations -- [ ] **Monitoring Dashboard**: Team has access to Grafana - -### Cost Management - -- [ ] **Right-Sized Servers**: Appropriate server types selected -- [ ] **ARM Servers Evaluated**: CAX series considered for savings -- [ ] **Private Networks Used**: Inter-server traffic optimized -- [ ] **Unused Resources Cleaned**: Old volumes/snapshots removed -- [ ] **Billing Alerts Configured**: Threshold alerts set -- [ ] **Cost Monitoring**: Monthly cost reports automated - -### Performance - -- [ ] **Resource Limits Set**: CPU/memory limits defined -- [ ] **Database Optimization**: PostgreSQL tuned for workload -- [ ] **Redis Caching**: Cache hit ratio monitored -- [ ] **CDN Configured**: Static assets served via CDN -- [ ] **Compression Enabled**: Gzip/Brotli compression active -- [ ] **Load Testing**: Application stress-tested - ---- - -## Conclusion - -This guide provides a comprehensive production deployment strategy for the Manacore monorepo on Hetzner Cloud infrastructure. 
Following these practices will result in a deployment that is: - -- **Cost-Effective**: 60-75% cost savings vs AWS/GCP -- **Secure**: Defense-in-depth security strategy -- **Reliable**: High availability with failover capabilities -- **Observable**: Complete monitoring and logging stack -- **Maintainable**: Automated deployments and backups - -**Estimated Time to Production**: - -- Initial setup: 4-6 hours -- Application deployment: 2-3 hours -- Testing and hardening: 4-6 hours -- **Total**: ~10-15 hours for complete production deployment - -**Monthly Operational Cost**: - -- Single server: €14-28/month -- HA setup: €37-50/month -- Full monorepo: €166/month - ---- - -**Related Documentation**: - -- `DOCKER_SETUP_ANALYSIS.md` - Current Docker setup analysis -- `DOCKER_COMPOSE_PRODUCTION_ARCHITECTURE.md` - Architecture design -- `DEPLOYMENT_HETZNER.md` - Deployment options comparison -- `CI_CD_SETUP.md` - CI/CD pipeline details diff --git a/docs/PRODUCTION_LAUNCH.md b/docs/PRODUCTION_LAUNCH.md deleted file mode 100644 index d91a108cb..000000000 --- a/docs/PRODUCTION_LAUNCH.md +++ /dev/null @@ -1,296 +0,0 @@ -# Production Launch Guide - mana.how - -Diese Anleitung beschreibt alle Schritte, um die Staging-Umgebung zur Production zu machen. 
- -**Server:** 46.224.108.214 (Hetzner) -**Domain:** mana.how - ---- - -## Schritt 1: DNS-Einträge anlegen - -Bei eurem DNS-Provider (wo `mana.how` registriert ist) folgende A-Records anlegen: - -### Erforderliche DNS-Einträge - -| Subdomain | Typ | Ziel | TTL | -|-----------|-----|------|-----| -| `@` (root) | A | 46.224.108.214 | 300 | -| `www` | A | 46.224.108.214 | 300 | -| `auth` | A | 46.224.108.214 | 300 | -| `chat` | A | 46.224.108.214 | 300 | -| `chat-api` | A | 46.224.108.214 | 300 | -| `todo` | A | 46.224.108.214 | 300 | -| `todo-api` | A | 46.224.108.214 | 300 | -| `calendar` | A | 46.224.108.214 | 300 | -| `calendar-api` | A | 46.224.108.214 | 300 | -| `clock` | A | 46.224.108.214 | 300 | -| `clock-api` | A | 46.224.108.214 | 300 | - -**Alternative mit Wildcard:** -| Subdomain | Typ | Ziel | TTL | -|-----------|-----|------|-----| -| `@` (root) | A | 46.224.108.214 | 300 | -| `*` | A | 46.224.108.214 | 300 | - -> **Hinweis:** Nach dem Anlegen kann es bis zu 24h dauern, bis die DNS-Einträge weltweit propagiert sind. In der Praxis meist schneller. 
- -### DNS prüfen - -```bash -# Prüfen ob DNS korrekt ist -dig mana.how +short -dig auth.mana.how +short -dig chat.mana.how +short -# Sollte jeweils 46.224.108.214 zurückgeben -``` - ---- - -## Schritt 2: Server vorbereiten - -SSH auf den Server: - -```bash -ssh -i ~/.ssh/hetzner_deploy_key deploy@46.224.108.214 -``` - -### 2.1 Backup der aktuellen Staging-Daten (optional aber empfohlen) - -```bash -cd ~/manacore-staging - -# Datenbank-Backup erstellen -docker compose exec -T postgres pg_dumpall -U postgres > ~/backup_$(date +%Y%m%d_%H%M%S).sql - -echo "Backup erstellt: ~/backup_*.sql" -``` - -### 2.2 Staging Container stoppen - -```bash -cd ~/manacore-staging -docker compose down -``` - ---- - -## Schritt 3: Production Konfiguration deployen - -### 3.1 Verzeichnis umbenennen (optional) - -```bash -# Von staging zu production umbenennen -mv ~/manacore-staging ~/manacore-production -cd ~/manacore-production -``` - -### 3.2 Production docker-compose kopieren - -Vom lokalen Rechner: - -```bash -# Aus dem Repo-Root -scp -i ~/.ssh/hetzner_deploy_key \ - docker-compose.production.yml \ - deploy@46.224.108.214:~/manacore-production/docker-compose.yml -``` - -### 3.3 Production Caddyfile kopieren - -```bash -scp -i ~/.ssh/hetzner_deploy_key \ - docker/caddy/Caddyfile.production \ - deploy@46.224.108.214:~/Caddyfile -``` - -### 3.4 Caddy neu laden - -Auf dem Server: - -```bash -# Caddy Config neu laden -docker exec caddy caddy reload --config /etc/caddy/Caddyfile - -# Prüfen ob Caddy läuft -docker logs caddy --tail 20 -``` - ---- - -## Schritt 4: Environment Variables anpassen - -Auf dem Server die `.env` Datei anpassen: - -```bash -cd ~/manacore-production -nano .env -``` - -Die bestehenden Staging-Werte können bleiben. 
Nur sicherstellen dass: - -```env -NODE_ENV=production - -# Diese Werte bleiben gleich (Staging Secrets weiterverwenden): -POSTGRES_PASSWORD= -REDIS_PASSWORD= -JWT_SECRET= -JWT_PUBLIC_KEY= -JWT_PRIVATE_KEY= -SUPABASE_URL= -SUPABASE_ANON_KEY= -SUPABASE_SERVICE_ROLE_KEY= -AZURE_OPENAI_ENDPOINT= -AZURE_OPENAI_API_KEY= -``` - ---- - -## Schritt 5: Container starten - -```bash -cd ~/manacore-production - -# Images pullen -docker compose pull - -# Container starten -docker compose up -d - -# Status prüfen -docker compose ps -``` - ---- - -## Schritt 6: Health Checks - -```bash -# Alle Services prüfen -curl -s http://localhost:3001/api/v1/health # Auth -curl -s http://localhost:5173/health # Dashboard -curl -s http://localhost:3000/health # Chat Web -curl -s http://localhost:3002/api/v1/health # Chat API -curl -s http://localhost:5188/health # Todo Web -curl -s http://localhost:3018/api/health # Todo API -curl -s http://localhost:5186/health # Calendar Web -curl -s http://localhost:3016/api/v1/health # Calendar API -curl -s http://localhost:5187/health # Clock Web -curl -s http://localhost:3017/api/v1/health # Clock API -``` - ---- - -## Schritt 7: SSL-Zertifikate (automatisch) - -Caddy holt sich automatisch Let's Encrypt Zertifikate sobald die DNS-Einträge korrekt sind. 
- -Prüfen: - -```bash -# Logs prüfen auf Certificate-Meldungen -docker logs caddy 2>&1 | grep -i "certificate\|tls" - -# Oder direkt testen -curl -I https://mana.how -``` - ---- - -## Schritt 8: Finale Tests - -Im Browser testen: - -| URL | Erwartet | -|-----|----------| -| https://mana.how | Dashboard Login | -| https://auth.mana.how/api/v1/health | `{"status":"ok"}` | -| https://chat.mana.how | Chat App Login | -| https://todo.mana.how | Todo App Login | -| https://calendar.mana.how | Calendar App Login | -| https://clock.mana.how | Clock App Login | - ---- - -## Troubleshooting - -### Container startet nicht - -```bash -# Logs anschauen -docker compose logs - -# Beispiel -docker compose logs mana-core-auth -docker compose logs chat-backend -``` - -### DNS nicht propagiert - -```bash -# Verschiedene DNS-Server testen -dig @8.8.8.8 mana.how +short # Google DNS -dig @1.1.1.1 mana.how +short # Cloudflare DNS -``` - -### SSL-Zertifikat Fehler - -```bash -# Caddy Logs prüfen -docker logs caddy --tail 100 - -# Caddy neu starten -docker restart caddy -``` - -### Datenbank Verbindungsfehler - -```bash -# Postgres prüfen -docker compose exec postgres psql -U postgres -l - -# Datenbanken anzeigen -docker compose exec postgres psql -U postgres -c "\l" -``` - ---- - -## Rollback zu Staging - -Falls etwas schief geht: - -```bash -cd ~/manacore-production -docker compose down - -# Alte Staging docker-compose wiederherstellen -# (müsste vorher gesichert werden) - -# Caddyfile zurück auf staging -scp -i ~/.ssh/hetzner_deploy_key \ - docker/caddy/Caddyfile.staging \ - deploy@46.224.108.214:~/Caddyfile - -docker exec caddy caddy reload --config /etc/caddy/Caddyfile -docker compose up -d -``` - ---- - -## Zusammenfassung der URLs - -Nach erfolgreichem Launch: - -| App | URL | -|-----|-----| -| **Dashboard** | https://mana.how | -| **Auth API** | https://auth.mana.how | -| **Chat** | https://chat.mana.how | -| **Chat API** | https://chat-api.mana.how | -| **Todo** | 
https://todo.mana.how | -| **Todo API** | https://todo-api.mana.how | -| **Calendar** | https://calendar.mana.how | -| **Calendar API** | https://calendar-api.mana.how | -| **Clock** | https://clock.mana.how | -| **Clock API** | https://clock-api.mana.how | diff --git a/docs/STAGING_DEPLOYMENT_ISSUES.md b/docs/STAGING_DEPLOYMENT_ISSUES.md deleted file mode 100644 index 4df93e2dd..000000000 --- a/docs/STAGING_DEPLOYMENT_ISSUES.md +++ /dev/null @@ -1,408 +0,0 @@ -# Staging Deployment Issues & Solutions - -This document captures common issues encountered during staging deployments and their solutions. Reference this when debugging deployment problems. - -## Table of Contents - -1. [Runtime Environment Variables (SvelteKit)](#1-runtime-environment-variables-sveltekit) -2. [CORS Configuration](#2-cors-configuration) -3. [CD Workflow Version Tags](#3-cd-workflow-version-tags) -4. [Database Setup](#4-database-setup) -5. [User ID Format (Better Auth)](#5-user-id-format-better-auth) -6. [Debugging Checklist](#6-debugging-checklist) -7. [Summary: Common Mistakes to Avoid](#summary-common-mistakes-to-avoid) - ---- - -## 1. Runtime Environment Variables (SvelteKit) - -### Problem - -SvelteKit apps use `import.meta.env.PUBLIC_*` which gets **baked in at build time**. When running in Docker, the container uses whatever values were present during the GitHub Actions build, not the runtime environment variables. 
- -**Symptoms:** -- Web apps calling `localhost:3001` instead of staging server IP -- API calls going to wrong URLs despite correct Docker env vars - -### Solution - -Use **runtime env injection** via `hooks.server.ts`: - -```typescript -// src/hooks.server.ts -import type { Handle } from '@sveltejs/kit'; - -const PUBLIC_MANA_CORE_AUTH_URL_CLIENT = - process.env.PUBLIC_MANA_CORE_AUTH_URL_CLIENT || ''; -const PUBLIC_BACKEND_URL_CLIENT = - process.env.PUBLIC_BACKEND_URL_CLIENT || ''; - -export const handle: Handle = async ({ event, resolve }) => { - return resolve(event, { - transformPageChunk: ({ html }) => { - const envScript = `<script>window.__PUBLIC_MANA_CORE_AUTH_URL__ = "${PUBLIC_MANA_CORE_AUTH_URL_CLIENT}"; window.__PUBLIC_BACKEND_URL__ = "${PUBLIC_BACKEND_URL_CLIENT}";</script>`; - return html.replace('<head>', `<head>${envScript}`); - }, - }); -}; -``` - -Then in client code, read from `window` instead of `import.meta.env`: - -```typescript -import { browser } from '$app/environment'; - -function getApiUrl(): string { - if (browser && typeof window !== 'undefined') { - const injectedUrl = (window as any).__PUBLIC_BACKEND_URL__; - if (injectedUrl) return injectedUrl; - } - return 'http://localhost:3000'; // fallback for local dev -} -``` - -### Lazy Client Initialization Pattern - -**Important**: API clients must be lazily initialized to read the URL at request time, not at module load time: - -```typescript -// CORRECT - Lazy initialization -let _client: ReturnType<typeof createApiClient> | null = null; - -function getClient() { - if (!_client) { - _client = createApiClient(getApiUrl()); // URL evaluated when called - } - return _client; -} - -export async function getTasks() { - return getClient().get('/tasks'); // Client created on first use -} -``` - -```typescript -// WRONG - Module-level initialization -const client = createApiClient(getApiUrl()); // URL evaluated at import time! - -export async function getTasks() { - return client.get('/tasks'); // Will use stale URL -} -``` - -**Why this matters**: When the module is imported, the `window` object may not have the injected environment variables yet. 
The lazy pattern ensures the URL is read only when the client is actually needed. - -### Docker Compose Pattern - -Use two environment variables: -- `PUBLIC_*_URL` - Internal Docker network URL (container-to-container) -- `PUBLIC_*_URL_CLIENT` - External URL for browser access - -```yaml -environment: - PUBLIC_BACKEND_URL: http://backend-container:3000 # Server-side - PUBLIC_BACKEND_URL_CLIENT: http://46.224.108.214:3000 # Browser-side -``` - ---- - -## 2. CORS Configuration - -### Problem - -Backends only allow CORS from their own web apps, blocking requests from other origins like manacore-web dashboard. - -**Symptoms:** -- `Access to fetch blocked by CORS policy` -- `No 'Access-Control-Allow-Origin' header` - -### Solution - -Add all necessary origins to `CORS_ORIGINS` in docker-compose.staging.yml: - -```yaml -todo-backend: - environment: - # Include both the app's own web AND manacore-web dashboard - CORS_ORIGINS: http://46.224.108.214:5188,http://46.224.108.214:5173,http://localhost:5188,http://localhost:5173 -``` - -### Checklist for New Backends - -When deploying a new backend that will be called from manacore-web dashboard: -1. Add `http://46.224.108.214:5173` to CORS_ORIGINS -2. Add `http://localhost:5173` for local development -3. Restart the container after config changes - -### Testing CORS - -```bash -curl -I -X OPTIONS http://46.224.108.214:3018/api/v1/endpoint \ - -H "Origin: http://46.224.108.214:5173" \ - -H "Access-Control-Request-Method: GET" - -# Should see: -# Access-Control-Allow-Origin: http://46.224.108.214:5173 -``` - ---- - -## 3. CD Workflow Version Tags - -### Problem - -docker-compose uses variables like `${TODO_WEB_VERSION:-latest}`, but the CD workflow wasn't updating the `.env` file on the staging server, causing containers to always use `latest` instead of the tagged version. 
- -**Symptoms:** -- Deployed new version but container still running old code -- `docker ps` shows wrong image tag - -### Solution - -The CD workflow (`.github/workflows/cd-staging-tagged.yml`) now: -1. Computes the version variable name (e.g., `TODO_WEB_VERSION`) -2. Updates the `.env` file on staging server -3. docker-compose reads from `.env` - -### Tag Naming Convention - -Tags must follow the exact project name as defined in the CD workflow: - -| Project | Correct Tag Format | Wrong Format | -|---------|-------------------|--------------| -| mana-core-auth | `mana-core-auth-staging-v1.0.0` | `auth-staging-v1.0.0` | -| chat | `chat-staging-v1.0.0` or `chat-all-staging-v1.0.0` | - | -| todo | `todo-staging-v1.0.0` or `todo-all-staging-v1.0.0` | - | - -**Note**: Using the wrong tag format (e.g., `auth-staging-*` instead of `mana-core-auth-staging-*`) will cause the workflow to fail because it won't find the correct Dockerfile path. - -### Verifying Deployment - -```bash -# Check running container version -docker ps --format '{{.Names}}: {{.Image}}' | grep todo - -# Check .env file -cat ~/manacore-staging/.env | grep VERSION -``` - ---- - -## 4. Database Setup - -### Problem - -New backends fail with `database "X" does not exist` because the PostgreSQL databases weren't created. 
- -**Symptoms:** -- 500 Internal Server Error -- Logs show: `PostgresError: database "todo" does not exist` - -### Solution - -Create databases manually on first deployment: - -```bash -# SSH to staging -ssh deploy@46.224.108.214 - -# Create databases -docker exec manacore-postgres-staging psql -U postgres -c 'CREATE DATABASE todo;' -docker exec manacore-postgres-staging psql -U postgres -c 'CREATE DATABASE calendar;' -docker exec manacore-postgres-staging psql -U postgres -c 'CREATE DATABASE clock;' - -# Restart backends (they auto-migrate schemas on startup) -cd ~/manacore-staging -docker compose restart todo-backend calendar-backend clock-backend -``` - -### Checklist for New Apps - -When deploying a new app with a database: -1. Create the database: `CREATE DATABASE appname;` -2. The backend will auto-migrate the schema on startup -3. Verify tables exist: `\dt` in psql - ---- - -## 5. User ID Format (Better Auth) - -### Problem - -Backend database schemas use `uuid` type for `user_id`, but Better Auth generates non-UUID user IDs like `otUe1YrfENPdHnrF3g1vSBfpkQfambCZ`. - -**Symptoms:** -- 500 Internal Server Error on authenticated requests -- Logs show: `invalid input syntax for type uuid: "otUe1YrfENPdHnrF3g1vSBfpkQfambCZ"` - -### Solution - -Change `user_id` columns from `uuid` to `text`: - -```sql --- For each table with user_id (use USING clause for explicit conversion) -ALTER TABLE tasks ALTER COLUMN user_id TYPE text USING user_id::text; -ALTER TABLE projects ALTER COLUMN user_id TYPE text USING user_id::text; --- etc. -``` - -**Important**: Always use the `USING` clause when converting column types. 
Without it, PostgreSQL falls back to the implicit or assignment cast from the old type to the new one and rejects the statement with an error if no such cast exists: - -```sql --- CORRECT - Explicit conversion -ALTER TABLE events ALTER COLUMN user_id TYPE text USING user_id::text; - --- RISKY - Relies on an implicit/assignment cast; errors out if none exists -ALTER TABLE events ALTER COLUMN user_id TYPE text; -``` - -### Prevention - -When creating new backend schemas, **always use `text` type for user_id**: - -```typescript -// Drizzle schema - CORRECT -export const tasks = pgTable('tasks', { - id: uuid('id').defaultRandom().primaryKey(), - userId: text('user_id').notNull(), // Use text, not uuid - // ... -}); - -// WRONG - Don't do this -export const tasks = pgTable('tasks', { - userId: uuid('user_id').notNull(), // Will fail with Better Auth -}); -``` - ---- - -## Quick Debugging Commands - -```bash -# Check container logs -docker logs <container> --tail 50 - -# Check container is running correct version -docker ps --format '{{.Names}}: {{.Image}}' - -# Test CORS -curl -I -X OPTIONS <url> -H "Origin: <origin>" - -# Check database exists -docker exec manacore-postgres-staging psql -U postgres -c '\l' - -# Check tables in database -docker exec manacore-postgres-staging psql -U postgres -d <database> -c '\dt' - -# Restart a service -cd ~/manacore-staging && docker compose restart <service> - -# Force recreate with new config -cd ~/manacore-staging && docker compose up -d --no-deps --force-recreate <service> -``` - ---- - -## Port Reference - -| Service | Port | -|---------|------| -| mana-core-auth | 3001 | -| chat-backend | 3002 | -| calendar-backend | 3016 | -| clock-backend | 3017 | -| todo-backend | 3018 | -| chat-web | 3000 | -| manacore-web | 5173 | -| calendar-web | 5186 | -| clock-web | 5187 | -| todo-web | 5188 | - ---- - -## 6. Debugging Checklist - -When something doesn't work on staging, follow this checklist: - -### API Returns Wrong Data or Fails - -1. 
**Check if calling correct URL** - ```bash - # In browser console - console.log(window.__PUBLIC_BACKEND_URL__) - ``` - If undefined or localhost, the runtime env injection isn't working. - -2. **Check CORS** - ```bash - curl -I -X OPTIONS http://46.224.108.214:/api/v1/endpoint \ - -H "Origin: http://46.224.108.214:5173" - ``` - Should return `Access-Control-Allow-Origin` header. - -3. **Check container logs** - ```bash - ssh deploy@46.224.108.214 "docker logs --tail 100" - ``` - -### 500 Internal Server Error - -1. **Check database exists** - ```bash - docker exec manacore-postgres-staging psql -U postgres -c '\l' - ``` - -2. **Check tables exist** - ```bash - docker exec manacore-postgres-staging psql -U postgres -d -c '\dt' - ``` - -3. **Check for type mismatches** (especially user_id uuid vs text) - -### 401 Unauthorized - -1. **Check token is being sent** - ```bash - # In browser Network tab, check Authorization header - ``` - -2. **Check JWKS endpoint** - ```bash - curl http://46.224.108.214:3001/api/v1/auth/jwks - ``` - -3. **Check issuer/audience match** - Token must have `iss: manacore` and `aud: manacore` - -### Container Not Updated - -1. **Check image version** - ```bash - docker ps --format '{{.Names}}: {{.Image}}' - ``` - -2. **Check .env file** - ```bash - cat ~/manacore-staging/.env | grep VERSION - ``` - -3. 
**Force recreate** - ```bash - docker compose up -d --no-deps --force-recreate <service> - ``` - ---- - -## Summary: Common Mistakes to Avoid - -| Mistake | Consequence | Prevention | -|---------|-------------|------------| -| Using `import.meta.env` for Docker runtime | URLs baked at build time | Use `window.__PUBLIC_*__` with runtime injection | -| Initializing API clients at module level | Client uses stale URLs | Use lazy initialization pattern | -| Using `uuid` type for user_id | Better Auth IDs fail validation | Always use `text` type for user_id | -| Missing CORS origin for manacore-web | Dashboard can't call backends | Add port 5173 to all backend CORS configs | -| Wrong tag format for mana-core-auth | Deployment fails, can't find Dockerfile | Use `mana-core-auth-staging-v*` not `auth-staging-v*` | -| Forgetting to create database | Backend crashes on startup | Create database before first deployment | -| ALTER TABLE without USING clause | Statement errors out when no implicit/assignment cast exists | Always use `USING column::new_type` | diff --git a/docs/STAGING_SETUP.md b/docs/STAGING_SETUP.md deleted file mode 100644 index 0ae580b97..000000000 --- a/docs/STAGING_SETUP.md +++ /dev/null @@ -1,441 +0,0 @@ -# Staging Environment Setup Guide - -This document describes the complete staging environment setup for ManaCore apps on Hetzner VPS with HTTPS via Caddy reverse proxy. 
- -## Overview - -| Component | Details | -|-----------|---------| -| **Server** | Hetzner VPS (46.224.108.214) | -| **Domain** | manacore.ai (Namecheap) | -| **Reverse Proxy** | Caddy (auto-SSL via Let's Encrypt) | -| **Container Runtime** | Docker Compose | -| **SSH Access** | `ssh -i ~/.ssh/hetzner_deploy_key deploy@46.224.108.214` | - -## Architecture - -``` - ┌─────────────────────────────────────────────┐ - │ Hetzner VPS (46.224.108.214) │ - │ │ - Internet │ ┌─────────────────────────────────────┐ │ - │ │ │ Caddy (ports 80/443) │ │ - │ │ │ Auto-SSL via Let's Encrypt │ │ - ▼ │ └──────────────┬──────────────────────┘ │ -┌──────────────┐ │ │ │ -│ Namecheap │ │ ▼ │ -│ DNS Records │────────────────────│ ┌─────────────────────────────────────┐ │ -│ │ │ │ Docker Compose Services │ │ -│ *.staging │ │ │ │ │ -│ A → IP │ │ │ mana-core-auth:3001 │ │ -└──────────────┘ │ │ chat-web:3000 / chat-backend:3002 │ │ - │ │ clock-web:5187 / clock-backend:3017│ │ - │ │ calendar-web:5186 / calendar-api:3016│ │ - │ │ todo-web:5188 / todo-backend:3018 │ │ - │ │ manacore-web:5173 │ │ - │ │ postgres:5432 / redis:6379 │ │ - │ └─────────────────────────────────────┘ │ - └─────────────────────────────────────────────┘ -``` - -## Domain Mapping - -### DNS Configuration (Namecheap) - -| Type | Host | Value | TTL | -|------|------|-------|-----| -| A | `staging` | 46.224.108.214 | Automatic | -| A | `*.staging` | 46.224.108.214 | Automatic | - -The wildcard record `*.staging` enables all subdomains like `auth.staging.manacore.ai`, `clock.staging.manacore.ai`, etc. 
- -### Staging URLs - -| Service | URL | Internal Port | -|---------|-----|---------------| -| **Auth** | https://auth.staging.manacore.ai | 3001 | -| **ManaCore Web** | https://staging.manacore.ai | 5173 | -| **Chat Web** | https://chat.staging.manacore.ai | 3000 | -| **Chat API** | https://chat-api.staging.manacore.ai | 3002 | -| **Clock Web** | https://clock.staging.manacore.ai | 5187 | -| **Clock API** | https://clock-api.staging.manacore.ai | 3017 | -| **Calendar Web** | https://calendar.staging.manacore.ai | 5186 | -| **Calendar API** | https://calendar-api.staging.manacore.ai | 3016 | -| **Todo Web** | https://todo.staging.manacore.ai | 5188 | -| **Todo API** | https://todo-api.staging.manacore.ai | 3018 | - -## Caddy Reverse Proxy - -### Installation (One-time setup) - -```bash -# SSH into server -ssh -i ~/.ssh/hetzner_deploy_key deploy@46.224.108.214 - -# Create Caddy data directory -mkdir -p ~/caddy_data ~/caddy_config - -# Run Caddy container -docker run -d \ - --name caddy \ - --network host \ - --restart unless-stopped \ - -v ~/Caddyfile:/etc/caddy/Caddyfile \ - -v ~/caddy_data:/data \ - -v ~/caddy_config:/config \ - caddy:2-alpine -``` - -### Configuration - -The Caddyfile is stored at: -- **Server**: `~/Caddyfile` -- **Repo**: `docker/caddy/Caddyfile.staging` - -```caddyfile -# ManaCore Staging Reverse Proxy - -auth.staging.manacore.ai { - reverse_proxy localhost:3001 -} - -chat.staging.manacore.ai { - reverse_proxy localhost:3000 -} - -chat-api.staging.manacore.ai { - reverse_proxy localhost:3002 -} - -staging.manacore.ai { - reverse_proxy localhost:5173 -} - -calendar.staging.manacore.ai { - reverse_proxy localhost:5186 -} - -calendar-api.staging.manacore.ai { - reverse_proxy localhost:3016 -} - -clock.staging.manacore.ai { - reverse_proxy localhost:5187 -} - -clock-api.staging.manacore.ai { - reverse_proxy localhost:3017 -} - -todo.staging.manacore.ai { - reverse_proxy localhost:5188 -} - -todo-api.staging.manacore.ai { - reverse_proxy 
localhost:3018 -} -``` - -### Updating Caddy Configuration - -```bash -# Copy updated config to server -scp -i ~/.ssh/hetzner_deploy_key docker/caddy/Caddyfile.staging deploy@46.224.108.214:~/Caddyfile - -# Reload Caddy (no downtime) -ssh -i ~/.ssh/hetzner_deploy_key deploy@46.224.108.214 "docker exec caddy caddy reload --config /etc/caddy/Caddyfile" -``` - -### Caddy Management Commands - -```bash -# View logs -docker logs caddy -f - -# Restart Caddy -docker restart caddy - -# Check Caddy status -docker exec caddy caddy validate --config /etc/caddy/Caddyfile -``` - -## SvelteKit Runtime Environment Variables - -### The Problem - -SvelteKit's `$env/static/public` variables are replaced at **build time**. When Docker images are built in CI, the environment variables are baked into the JavaScript bundles. This means containers cannot use different URLs for different environments. - -### The Solution - -Use `$env/dynamic/private` in `hooks.server.ts` to read environment variables at **runtime**, then inject them into the HTML for client-side access. - -### Implementation - -Each SvelteKit web app has a `hooks.server.ts` that: -1. Reads `_CLIENT` environment variables at runtime -2. 
Injects them into the HTML via a `<script>` tag - -```typescript -import type { Handle } from '@sveltejs/kit'; -import { env } from '$env/dynamic/private'; - -export const handle: Handle = async ({ event, resolve }) => { - const authUrlClient = env.PUBLIC_MANA_CORE_AUTH_URL_CLIENT || ''; - const backendUrlClient = env.PUBLIC_BACKEND_URL_CLIENT || ''; - - return resolve(event, { - transformPageChunk: ({ html }) => { - const envScript = `<script>window.__PUBLIC_MANA_CORE_AUTH_URL__ = "${authUrlClient}"; window.__PUBLIC_BACKEND_URL__ = "${backendUrlClient}";</script>`; - return html.replace('<head>', `<head>${envScript}`); - }, - }); -}; -``` - -### Environment Variable Pattern - -Each web app container receives two sets of URLs: - -| Variable | Purpose | Example | -|----------|---------|---------| -| `PUBLIC_BACKEND_URL` | Server-side (Docker network) | `http://clock-backend:3017` | -| `PUBLIC_BACKEND_URL_CLIENT` | Client-side (browser) | `https://clock-api.staging.manacore.ai` | -| `PUBLIC_MANA_CORE_AUTH_URL` | Server-side auth | `http://mana-core-auth:3001` | -| `PUBLIC_MANA_CORE_AUTH_URL_CLIENT` | Client-side auth | `https://auth.staging.manacore.ai` | - -## Docker Compose Configuration - -### File Locations - -| File | Purpose | -|------|---------| -| `docker-compose.staging.yml` | Staging configuration (repo) | -| `~/manacore-staging/docker-compose.yml` | Server deployment | - -### Key Configuration Sections - -**Web App Environment Variables:** -```yaml -clock-web: - environment: - NODE_ENV: staging - PORT: 5187 - # Server-side URLs (Docker internal network) - PUBLIC_BACKEND_URL: http://clock-backend:3017 - PUBLIC_MANA_CORE_AUTH_URL: http://mana-core-auth:3001 - # Client-side URLs (browser access via HTTPS) - PUBLIC_BACKEND_URL_CLIENT: https://clock-api.staging.manacore.ai - PUBLIC_MANA_CORE_AUTH_URL_CLIENT: https://auth.staging.manacore.ai -``` - -**Backend CORS Configuration:** -```yaml -clock-backend: - environment: - CORS_ORIGINS: https://clock.staging.manacore.ai,https://staging.manacore.ai,http://localhost:5187 -``` - -**Auth Service CORS:** -```yaml -mana-core-auth: - environment: - CORS_ORIGINS: https://chat.staging.manacore.ai,https://staging.manacore.ai,https://calendar.staging.manacore.ai,https://clock.staging.manacore.ai,https://todo.staging.manacore.ai,http://localhost:3000,http://localhost:5173 -``` - -### Syncing Configuration to Server - -```bash -# Copy docker-compose to server -scp -i ~/.ssh/hetzner_deploy_key docker-compose.staging.yml 
deploy@46.224.108.214:~/manacore-staging/docker-compose.yml - -# Recreate containers with new config -ssh -i ~/.ssh/hetzner_deploy_key deploy@46.224.108.214 "cd ~/manacore-staging && docker compose up -d --force-recreate" -``` - -## Deployment Workflow - -### CI/CD Pipeline - -The GitHub Actions workflow (`.github/workflows/cd-staging.yml`): -1. Builds Docker images on push to `dev` branch -2. Pushes images to GitHub Container Registry (ghcr.io) -3. SSHs into staging server -4. Pulls latest images -5. Restarts containers - -### Manual Deployment - -```bash -# 1. Build and push images (from local) -docker build -t ghcr.io/memo-2023/clock-web:latest -f apps/clock/apps/web/Dockerfile . -docker push ghcr.io/memo-2023/clock-web:latest - -# 2. SSH into server -ssh -i ~/.ssh/hetzner_deploy_key deploy@46.224.108.214 - -# 3. Pull and restart -cd ~/manacore-staging -docker compose pull -docker compose up -d --force-recreate -``` - -### Updating Environment Variables - -1. Edit `docker-compose.staging.yml` locally -2. Copy to server: `scp -i ~/.ssh/hetzner_deploy_key docker-compose.staging.yml deploy@46.224.108.214:~/manacore-staging/docker-compose.yml` -3. Recreate affected containers: `docker compose up -d --force-recreate ` - -## Troubleshooting - -### Mixed Content Errors - -**Symptom:** Browser console shows "Mixed Content: The page was loaded over HTTPS, but requested an insecure resource" - -**Cause:** Client-side JavaScript is calling HTTP URLs instead of HTTPS - -**Solution:** -1. Check `_CLIENT` environment variables in docker-compose.yml -2. Ensure they use `https://` staging domains -3. Recreate web containers: `docker compose up -d --force-recreate ` - -### CORS Errors - -**Symptom:** Browser console shows "Access-Control-Allow-Origin" errors - -**Cause:** Backend CORS_ORIGINS doesn't include the HTTPS staging domain - -**Solution:** -1. Add the HTTPS domain to `CORS_ORIGINS` in docker-compose.yml -2. 
Recreate backend containers - -### Caddy SSL Certificate Issues - -**Symptom:** Browser shows SSL certificate warning - -**Solution:** -```bash -# Check Caddy logs -docker logs caddy - -# Reload the Caddy config (certificates are obtained and renewed automatically) -docker exec caddy caddy reload --config /etc/caddy/Caddyfile -``` - -### Container Health Check Failures - -**Symptom:** Container shows "unhealthy" status - -**Solution:** -```bash -# Check container logs -docker logs <container> - -# Check health status -docker inspect <container> | grep -A 20 Health -``` - -## Adding a New App to Staging - -### 1. Update DNS (if needed) - -If using a new subdomain pattern, update Namecheap DNS. The `*.staging` wildcard should cover most cases. - -### 2. Update Caddyfile - -Add entries for web and API: -```caddyfile -newapp.staging.manacore.ai { - reverse_proxy localhost:<web-port> -} - -newapp-api.staging.manacore.ai { - reverse_proxy localhost:<backend-port> -} -``` - -### 3. Update docker-compose.staging.yml - -Add the new services with proper environment variables: -```yaml -newapp-web: - image: ghcr.io/memo-2023/newapp-web:latest - environment: - PUBLIC_BACKEND_URL: http://newapp-backend:<backend-port> - PUBLIC_MANA_CORE_AUTH_URL: http://mana-core-auth:3001 - PUBLIC_BACKEND_URL_CLIENT: https://newapp-api.staging.manacore.ai - PUBLIC_MANA_CORE_AUTH_URL_CLIENT: https://auth.staging.manacore.ai - ports: - - "<web-port>:<web-port>" -``` - -### 4. Implement hooks.server.ts - -Copy the runtime env var pattern from an existing app: -```typescript -import type { Handle } from '@sveltejs/kit'; -import { env } from '$env/dynamic/private'; - -export const handle: Handle = async ({ event, resolve }) => { - const authUrlClient = env.PUBLIC_MANA_CORE_AUTH_URL_CLIENT || ''; - const backendUrlClient = env.PUBLIC_BACKEND_URL_CLIENT || ''; - - return resolve(event, { - transformPageChunk: ({ html }) => { - const envScript = `<script>window.__PUBLIC_MANA_CORE_AUTH_URL__ = "${authUrlClient}"; window.__PUBLIC_BACKEND_URL__ = "${backendUrlClient}";</script>`; - return html.replace('<head>', `<head>${envScript}`); - }, - }); -}; -``` - -### 5. Deploy - -1. Sync Caddyfile: `scp ... Caddyfile.staging deploy@server:~/Caddyfile` -2. 
Reload Caddy: `docker exec caddy caddy reload --config /etc/caddy/Caddyfile` -3. Sync docker-compose: `scp ... docker-compose.staging.yml deploy@server:~/manacore-staging/docker-compose.yml` -4. Deploy containers: `docker compose up -d` - -## Quick Reference Commands - -```bash -# SSH into server -ssh -i ~/.ssh/hetzner_deploy_key deploy@46.224.108.214 - -# View all containers -docker ps - -# View container logs -docker logs -f <container> - -# Restart a container -docker restart <container> - -# Recreate containers with new config -cd ~/manacore-staging && docker compose up -d --force-recreate - -# Validate the Caddy configuration -docker exec caddy caddy validate --config /etc/caddy/Caddyfile - -# Test HTTPS endpoint -curl -s https://auth.staging.manacore.ai/api/v1/health - -# Check container env vars -docker exec <container> printenv | grep -E 'CLIENT|CORS' -``` - -## Related Documentation - -- [Local Development Guide](./LOCAL_DEVELOPMENT.md) -- [CI/CD Deployment Guide](./DEPLOYMENT.md) -- [Environment Variables](./ENVIRONMENT_VARIABLES.md)