feat(db): add production-safe migration system with advisory locks

- Add migrate.ts script with PostgreSQL advisory locks to prevent concurrent migrations
- Add retry logic with exponential backoff for transient connection errors
- Update CI/CD workflows to run migrations before deployment with health polling
- Create comprehensive DATABASE_MIGRATIONS.md documentation covering:
  - Drizzle ORM internals (push vs generate/migrate modes)
  - Migration tracking (journal files, __drizzle_migrations table)
  - Advisory lock architecture and timeout handling
  - Zero-downtime migration patterns (expand-contract)
  - Troubleshooting guide
- Update .claude/guidelines/database.md with migration quick reference
- Remove stale migration files that caused schema conflicts
This commit is contained in:
Wuesteon 2025-12-09 02:13:11 +01:00
parent 18a7b2d9a0
commit 8af01724d7
10 changed files with 1146 additions and 1696 deletions

View file

@ -349,6 +349,15 @@ async function getPaginated(
## Migrations
> **Comprehensive Documentation**: See **[docs/DATABASE_MIGRATIONS.md](/docs/DATABASE_MIGRATIONS.md)** for full migration internals, CI/CD integration, zero-downtime patterns, and troubleshooting.
### Quick Reference
| Environment | Command | Purpose |
| --------------- | ----------------- | ------------------------------- |
| **Development** | `pnpm db:push` | Fast iteration, direct sync |
| **Production** | `pnpm db:migrate` | Tracked migrations with history |
### Configuration
```typescript
@ -358,9 +367,9 @@ import { defineConfig } from 'drizzle-kit';
export default defineConfig({
schema: './src/db/schema/index.ts',
out: './src/db/migrations',
driver: 'pg',
dialect: 'postgresql',
dbCredentials: {
connectionString: process.env.DATABASE_URL!,
url: process.env.DATABASE_URL!,
},
verbose: true,
strict: true,
@ -370,41 +379,85 @@ export default defineConfig({
### Commands
```bash
# Generate migration from schema changes
pnpm drizzle-kit generate
# Development - push schema directly (fast, no history)
pnpm db:push
# Push schema directly (development only)
pnpm drizzle-kit push
# Open Drizzle Studio
pnpm drizzle-kit studio
# Run migrations (production)
# Production - generate and run migrations
pnpm db:generate --name add_user_preferences
pnpm db:migrate
# Open Drizzle Studio for database inspection
pnpm db:studio
```
### Migration Runner
### Migration Workflow
```
┌─────────────────────────────────────────────────────────────────┐
│ Which command should I use? │
├─────────────────────────────────────────────────────────────────┤
│ │
│ Local development? │
│ └── YES → pnpm db:push (fast, no tracking) │
│ │
│ Staging/Production? │
│ └── YES → pnpm db:generate + pnpm db:migrate (tracked) │
│ │
│ Schema changed by someone else? │
│ └── YES → git pull + pnpm db:push (local) │
│ git pull + pnpm db:migrate (staging/prod) │
│ │
└─────────────────────────────────────────────────────────────────┘
```
### Key Concepts
1. **Advisory Locks**: Migrations use PostgreSQL advisory locks to prevent concurrent execution
2. **Migration Tracking**: `__drizzle_migrations` table + `meta/_journal.json` file
3. **Migrations run BEFORE code deployment**: Ensures database is ready for new code
4. **Never modify applied migrations**: Create new migrations instead
5. **Zero-downtime**: Use expand-contract pattern for breaking schema changes
### Production Migration Script
Production backends use a migration script with advisory locks:
```typescript
// src/db/migrate.ts
import { drizzle } from 'drizzle-orm/postgres-js';
import { migrate } from 'drizzle-orm/postgres-js/migrator';
import postgres from 'postgres';
// src/db/migrate.ts - Key features:
// - Advisory lock (pg_try_advisory_lock) prevents concurrent migrations
// - Retry logic with exponential backoff for transient failures
// - Timeout protection (default 5 minutes)
// - Graceful handling when no migrations exist
async function runMigrations() {
const connection = postgres(process.env.DATABASE_URL!, { max: 1 });
const db = drizzle(connection);
const MIGRATION_LOCK_ID = 987654321; // Unique per service
console.log('Running migrations...');
await migrate(db, { migrationsFolder: './src/db/migrations' });
console.log('Migrations complete');
await connection.end();
async function acquireLock(db) {
const result = await db.execute(
sql`SELECT pg_try_advisory_lock(${MIGRATION_LOCK_ID}) as acquired`
);
return result[0]?.acquired === true;
}
runMigrations().catch(console.error);
```
See `services/mana-core-auth/src/db/migrate.ts` for the full implementation.
### Best Practices
**DO:**
- Run migrations before deploying new code
- Test migrations in staging before production
- Use `CONCURRENTLY` for index creation
- Keep migrations small and focused
- Commit migration files to version control
**DON'T:**
- Run `db:push` in production
- Delete or modify applied migrations
- Add NOT NULL without default or backfill
- Drop columns immediately (wait 1-2 weeks)
## Query Patterns
### Select with Joins

View file

@ -212,8 +212,52 @@ jobs:
ssh ${{ secrets.PRODUCTION_USER }}@${{ secrets.PRODUCTION_HOST }} << 'EOF'
cd ~/manacore-production
# Run migrations before deploying new code
docker compose run --rm mana-core-auth pnpm run db:migrate || echo "Migrations completed or skipped"
echo "=== Running Database Migrations ==="
echo ""
# Migration function with retry logic
run_migration() {
local service=$1
local max_attempts=3
local timeout=300 # 5 minutes
local attempt=1
while [ $attempt -le $max_attempts ]; do
echo "[$service] Migration attempt $attempt/$max_attempts..."
# Run migration with timeout using a temporary container
if timeout $timeout docker compose run --rm $service pnpm run db:migrate 2>&1; then
echo "✅ [$service] Migration succeeded"
return 0
else
exit_code=$?
if [ $exit_code -eq 124 ]; then
echo "⚠️ [$service] Migration timeout after ${timeout}s"
else
echo "⚠️ [$service] Migration failed with exit code $exit_code"
fi
attempt=$((attempt + 1))
if [ $attempt -le $max_attempts ]; then
wait_time=$((10 * attempt)) # Backoff: 10s, 20s, 30s
echo " Waiting ${wait_time}s before retry..."
sleep $wait_time
fi
fi
done
echo "❌ [$service] Migration failed after $max_attempts attempts"
return 1
}
# Run migrations for mana-core-auth (central auth service)
run_migration mana-core-auth || {
echo "❌ mana-core-auth migration failed"
echo "⚠️ Continuing with deployment - manual migration may be required"
}
echo ""
echo "✅ Migration step completed"
EOF
- name: Deploy with zero-downtime

View file

@ -203,6 +203,69 @@ jobs:
echo "✅ Databases ready"
EOF
- name: Run database migrations
env:
STAGING_USER: deploy
STAGING_HOST: 46.224.108.214
run: |
ssh $STAGING_USER@$STAGING_HOST << 'EOF'
cd ~/manacore-staging
echo "=== Running Database Migrations ==="
echo ""
# Migration function with retry logic
run_migration() {
local service=$1
local max_attempts=3
local timeout=300 # 5 minutes
local attempt=1
while [ $attempt -le $max_attempts ]; do
echo "[$service] Migration attempt $attempt/$max_attempts..."
# Run migration with timeout
if timeout $timeout docker compose exec -T $service pnpm run db:migrate 2>&1; then
echo "✅ [$service] Migration succeeded"
return 0
else
exit_code=$?
if [ $exit_code -eq 124 ]; then
echo "⚠️ [$service] Migration timeout after ${timeout}s"
else
echo "⚠️ [$service] Migration failed with exit code $exit_code"
fi
attempt=$((attempt + 1))
if [ $attempt -le $max_attempts ]; then
wait_time=$((10 * attempt)) # Backoff: 10s, 20s, 30s
echo " Waiting ${wait_time}s before retry..."
sleep $wait_time
fi
fi
done
echo "❌ [$service] Migration failed after $max_attempts attempts"
return 1
}
# Run migrations for services that have db:migrate script
# mana-core-auth - central auth service
if docker compose exec -T mana-core-auth test -f src/db/migrate.ts 2>/dev/null || \
docker compose exec -T mana-core-auth pnpm run db:migrate --help 2>/dev/null; then
run_migration mana-core-auth || {
echo "❌ mana-core-auth migration failed - aborting deployment"
exit 1
}
else
echo "⏭️ [mana-core-auth] No db:migrate script, using db:push..."
docker compose exec -T mana-core-auth npx drizzle-kit push --force || echo "Auth schema push completed"
fi
echo ""
echo "✅ All migrations completed"
EOF
- name: Run health checks
env:
STAGING_USER: deploy
@ -211,143 +274,69 @@ jobs:
ssh $STAGING_USER@$STAGING_HOST << 'EOF'
cd ~/manacore-staging
# Wait for services to fully start
echo "Waiting 60s for services to fully initialize..."
sleep 60
echo "=== Health Checks with Polling ==="
echo ""
# Health check function with retry polling
check_health() {
local service=$1
local url=$2
local max_attempts=24 # 24 * 5s = 2 minutes max wait
local attempt=1
echo "Checking $service..."
while [ $attempt -le $max_attempts ]; do
# Check if container is running
if ! docker compose ps $service 2>/dev/null | grep -q "Up"; then
if [ $attempt -eq 1 ]; then
echo " ⏳ Waiting for container to start..."
fi
sleep 5
attempt=$((attempt + 1))
continue
fi
# Check health endpoint
if docker compose exec -T $service wget -q -O - $url > /dev/null 2>&1; then
echo " ✅ $service is healthy (attempt $attempt)"
return 0
fi
if [ $attempt -eq 1 ]; then
echo " ⏳ Waiting for $service to become healthy..."
fi
sleep 5
attempt=$((attempt + 1))
done
echo " ❌ $service health check failed after $max_attempts attempts"
echo " === Recent Logs ==="
docker compose logs --tail=50 $service
return 1
}
echo "=== Container Status ==="
docker compose ps
echo ""
echo "=== Health Checks ==="
# Check mana-core-auth
echo "Checking mana-core-auth..."
if docker compose exec -T mana-core-auth wget -q -O - http://localhost:3001/api/v1/health > /dev/null 2>&1; then
echo "✅ mana-core-auth is healthy"
else
echo "❌ mana-core-auth health check failed"
echo "=== Logs ==="
docker compose logs --tail=50 mana-core-auth
exit 1
fi
# Check chat-backend
echo "Checking chat-backend..."
if docker compose exec -T chat-backend wget -q -O - http://localhost:3002/api/v1/health > /dev/null 2>&1; then
echo "✅ chat-backend is healthy"
else
echo "❌ chat-backend health check failed"
echo "=== Logs ==="
docker compose logs --tail=50 chat-backend
exit 1
fi
# Check chat-web
echo "Checking chat-web..."
if docker compose exec -T chat-web wget -q -O - http://localhost:3000/health > /dev/null 2>&1; then
echo "✅ chat-web is healthy"
else
echo "❌ chat-web health check failed"
echo "=== Logs ==="
docker compose logs --tail=50 chat-web
exit 1
fi
# Check manacore-web
echo "Checking manacore-web..."
if docker compose exec -T manacore-web wget -q -O - http://localhost:5173/health > /dev/null 2>&1; then
echo "✅ manacore-web is healthy"
else
echo "❌ manacore-web health check failed"
echo "=== Logs ==="
docker compose logs --tail=50 manacore-web
exit 1
fi
# Check todo-backend
echo "Checking todo-backend..."
if docker compose exec -T todo-backend wget -q -O - http://localhost:3018/api/v1/health > /dev/null 2>&1; then
echo "✅ todo-backend is healthy"
else
echo "❌ todo-backend health check failed"
echo "=== Logs ==="
docker compose logs --tail=50 todo-backend
exit 1
fi
# Check todo-web
echo "Checking todo-web..."
if docker compose exec -T todo-web wget -q -O - http://localhost:5188/health > /dev/null 2>&1; then
echo "✅ todo-web is healthy"
else
echo "❌ todo-web health check failed"
echo "=== Logs ==="
docker compose logs --tail=50 todo-web
exit 1
fi
# Check calendar-backend
echo "Checking calendar-backend..."
if docker compose exec -T calendar-backend wget -q -O - http://localhost:3016/api/v1/health > /dev/null 2>&1; then
echo "✅ calendar-backend is healthy"
else
echo "❌ calendar-backend health check failed"
echo "=== Logs ==="
docker compose logs --tail=50 calendar-backend
exit 1
fi
# Check calendar-web
echo "Checking calendar-web..."
if docker compose exec -T calendar-web wget -q -O - http://localhost:5186/health > /dev/null 2>&1; then
echo "✅ calendar-web is healthy"
else
echo "❌ calendar-web health check failed"
echo "=== Logs ==="
docker compose logs --tail=50 calendar-web
exit 1
fi
# Check clock-backend
echo "Checking clock-backend..."
if docker compose exec -T clock-backend wget -q -O - http://localhost:3017/api/v1/health > /dev/null 2>&1; then
echo "✅ clock-backend is healthy"
else
echo "❌ clock-backend health check failed"
echo "=== Logs ==="
docker compose logs --tail=50 clock-backend
exit 1
fi
# Check clock-web
echo "Checking clock-web..."
if docker compose exec -T clock-web wget -q -O - http://localhost:5187/health > /dev/null 2>&1; then
echo "✅ clock-web is healthy"
else
echo "❌ clock-web health check failed"
echo "=== Logs ==="
docker compose logs --tail=50 clock-web
exit 1
fi
# Check all services with polling
check_health mana-core-auth http://localhost:3001/api/v1/health || exit 1
check_health chat-backend http://localhost:3002/api/v1/health || exit 1
check_health chat-web http://localhost:3000/health || exit 1
check_health manacore-web http://localhost:5173/health || exit 1
check_health todo-backend http://localhost:3018/api/v1/health || exit 1
check_health todo-web http://localhost:5188/health || exit 1
check_health calendar-backend http://localhost:3016/api/v1/health || exit 1
check_health calendar-web http://localhost:5186/health || exit 1
check_health clock-backend http://localhost:3017/api/v1/health || exit 1
check_health clock-web http://localhost:5187/health || exit 1
echo ""
echo "✅ All health checks passed!"
EOF
- name: Run database migrations
env:
STAGING_USER: deploy
STAGING_HOST: 46.224.108.214
run: |
# Run migrations for services that need them
ssh $STAGING_USER@$STAGING_HOST << 'EOF'
cd ~/manacore-staging
# Mana Core Auth - push schema using Drizzle (--force skips interactive confirmation)
docker compose exec -T mana-core-auth npx drizzle-kit push --force || echo "Auth schema push skipped"
EOF
- name: Deployment summary
run: |
echo "## Staging Deployment Summary" >> $GITHUB_STEP_SUMMARY

View file

@ -636,6 +636,7 @@ PORT=...
- **[docs/LOCAL_DEVELOPMENT.md](docs/LOCAL_DEVELOPMENT.md)** - Database setup and `dev:*:full` commands
- **[docs/ENVIRONMENT_VARIABLES.md](docs/ENVIRONMENT_VARIABLES.md)** - Complete environment setup guide
- **[docs/DATABASE_MIGRATIONS.md](docs/DATABASE_MIGRATIONS.md)** - Migration best practices, CI/CD, rollback procedures
Each project has its own `CLAUDE.md` with detailed information:

667
docs/DATABASE_MIGRATIONS.md Normal file
View file

@ -0,0 +1,667 @@
# Database Migration Guide
This document describes database migration best practices, procedures, and tooling for the ManaCore monorepo. **This is a core system concept** - all developers should understand these patterns.
## Table of Contents
1. [Overview](#overview)
2. [Drizzle Migration Internals](#drizzle-migration-internals)
3. [Migration Commands](#migration-commands)
4. [Development vs Production](#development-vs-production)
5. [CI/CD Pipeline](#cicd-pipeline)
6. [Advisory Locks](#advisory-locks)
7. [Zero-Downtime Migrations](#zero-downtime-migrations)
8. [Rollback Procedures](#rollback-procedures)
9. [Troubleshooting](#troubleshooting)
---
## Overview
All backends in the ManaCore monorepo use **Drizzle ORM** for database schema management. We use two different approaches depending on the environment:
| Environment | Command | Purpose |
|-------------|---------|---------|
| **Development** | `drizzle-kit push` | Fast iteration, direct schema sync |
| **Production** | `drizzle-kit generate` + `migrate` | Tracked migrations with history |
### Key Principles
1. **Migrations run BEFORE code deployment** - Ensures database is ready for new code
2. **Advisory locks prevent concurrent migrations** - Safe for multi-replica deployments
3. **Expand-contract pattern for breaking changes** - Zero-downtime schema changes
4. **Data persistence** - Migrations never delete user data unless explicitly requested
### Quick Decision Guide
```
┌─────────────────────────────────────────────────────────────────┐
│ Which command should I use? │
├─────────────────────────────────────────────────────────────────┤
│ │
│ Local development? │
│ └── YES → pnpm db:push (fast, no tracking) │
│ │
│ Staging/Production? │
│ └── YES → pnpm db:generate + pnpm db:migrate (tracked) │
│ │
│ Need to inspect data? │
│ └── YES → pnpm db:studio (opens Drizzle Studio) │
│ │
│ Schema changed by someone else? │
│ └── YES → git pull + pnpm db:push (local) │
│ git pull + pnpm db:migrate (staging/prod) │
│ │
└─────────────────────────────────────────────────────────────────┘
```
---
## Drizzle Migration Internals
Understanding how Drizzle manages migrations is essential for debugging issues.
### The Two Modes
#### 1. Push Mode (`drizzle-kit push`)
**How it works:**
1. Drizzle introspects your TypeScript schema files
2. Drizzle introspects the current database schema
3. Drizzle computes the diff between them
4. Drizzle generates and **immediately executes** the SQL to sync them
**Characteristics:**
- No migration files created
- No history tracking
- Direct database modification
- Interactive confirmation (use `--force` to skip)
**When to use:** Local development, experimentation, prototyping
#### 2. Generate + Migrate Mode (`drizzle-kit generate` + `migrate`)
**How it works:**
**Step 1: Generate** (`drizzle-kit generate`)
1. Drizzle introspects your TypeScript schema files
2. Drizzle reads the last snapshot from `migrations/meta/`
3. Drizzle computes the diff
4. Drizzle creates migration files (SQL + snapshot)
**Step 2: Migrate** (`pnpm db:migrate`)
1. Script reads `migrations/meta/_journal.json`
2. Script queries `__drizzle_migrations` table in database
3. Script determines which migrations haven't been applied
4. Script executes pending migrations in order
5. Script records applied migrations in `__drizzle_migrations`
**Characteristics:**
- Creates versioned SQL files
- Full history tracking
- Repeatable deployments
- Can be reviewed before applying
**When to use:** Staging, production, CI/CD pipelines
### Migration File Structure
```
src/db/migrations/
├── 0000_initial_schema/
│ ├── migration.sql # The actual SQL to execute
│ └── snapshot.json # Schema snapshot AFTER this migration
├── 0001_add_user_preferences/
│ ├── migration.sql
│ └── snapshot.json
├── 0002_add_credits_table/
│ ├── migration.sql
│ └── snapshot.json
└── meta/
└── _journal.json # Migration registry (order + metadata)
```
### The Journal File (`_journal.json`)
This file tracks all generated migrations:
```json
{
"version": "7",
"dialect": "postgresql",
"entries": [
{
"idx": 0,
"version": "7",
"when": 1733066521000,
"tag": "0000_initial_schema",
"breakpoints": true
},
{
"idx": 1,
"version": "7",
"when": 1733152921000,
"tag": "0001_add_user_preferences",
"breakpoints": true
}
]
}
```
**Key fields:**
- `idx`: Sequential index (order matters!)
- `tag`: Folder name containing the migration
- `when`: Unix timestamp when generated
- `breakpoints`: Whether to use statement breakpoints
### The Database Tracking Table (`__drizzle_migrations`)
Drizzle creates this table automatically to track applied migrations:
```sql
-- Schema: drizzle
-- Table: __drizzle_migrations
CREATE TABLE drizzle.__drizzle_migrations (
id SERIAL PRIMARY KEY,
hash TEXT NOT NULL,
created_at BIGINT NOT NULL
);
```
**Query applied migrations:**
```sql
SELECT * FROM drizzle.__drizzle_migrations ORDER BY created_at;
```
### How Migration Tracking Works
```
┌─────────────────┐ ┌─────────────────┐
│ _journal.json │ │ __drizzle_ │
│ (filesystem) │ │ migrations (db) │
└────────┬────────┘ └────────┬────────┘
│ │
▼ ▼
[0000, 0001, 0002] [hash_0000, hash_0001]
│ │
└───────────┬───────────┘
Pending: [0002]
Execute 0002/migration.sql
Insert into __drizzle_migrations
```
### Snapshot Files
Each migration includes a `snapshot.json` that captures the **complete schema state** after that migration. This allows Drizzle to:
1. Compute diffs for the next migration
2. Detect schema drift
3. Generate accurate SQL
**Important:** Never modify snapshots manually!
---
## Migration Commands
### All Backends
```bash
# Development - push schema directly (fast, no history)
pnpm db:push
# Generate migration files from schema changes
pnpm db:generate
# Run migrations with advisory locks (production-safe)
pnpm db:migrate
# Open Drizzle Studio for database inspection
pnpm db:studio
```
### Root-Level Commands
```bash
# Setup all databases (creates DBs + pushes schemas)
pnpm setup:db
# Setup specific service
pnpm setup:db:auth
pnpm setup:db:chat
```
### Per-Service Commands
```bash
# mana-core-auth
pnpm --filter mana-core-auth db:push
pnpm --filter mana-core-auth db:generate
pnpm --filter mana-core-auth db:migrate
# chat-backend
pnpm --filter @chat/backend db:push
pnpm --filter @chat/backend db:migrate
```
---
## Development vs Production
### Development Workflow
For local development, use `db:push` for fast iteration:
```bash
# 1. Make schema changes in src/db/schema/*.ts
# 2. Push changes to local database
pnpm db:push
# Or use the full dev command which handles this automatically
pnpm dev:chat:full
```
**Why `push` for development?**
- Instant feedback on schema changes
- No migration file clutter during experimentation
- Automatically handled by `dev:*:full` commands
### Production Workflow
For staging/production, use migration files for trackability:
```bash
# 1. Make schema changes in src/db/schema/*.ts
# 2. Generate migration file
pnpm db:generate --name add_user_preferences
# 3. Review generated SQL
cat src/db/migrations/*/migration.sql
# 4. Commit migration files
git add src/db/migrations/
git commit -m "feat: add user preferences table"
# 5. CI/CD runs migrations automatically on deploy
```
**Why migrations for production?**
- Audit trail of all schema changes
- Repeatable deployments
- Rollback capability (with manual down migrations)
---
## CI/CD Pipeline
### Deployment Flow
```
┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐
│ Build │───>│ Create DB │───>│ Migrate │───>│ Deploy │
│ Images │ │ (if new) │ │ Database │ │ Code │
└─────────────┘ └─────────────┘ └─────────────┘ └─────────────┘
```
### Migration Step Features
1. **Retry logic** - up to 3 attempts; the deploy scripts wait 20s, then 30s between retries, and the `migrate.ts` runner itself retries transient errors with exponential backoff (2s, 4s, 8s)
2. **Timeout protection** - 5-minute timeout per migration
3. **Advisory locks** - Prevents concurrent migrations
4. **Graceful fallback** - Falls back to `db:push` if `db:migrate` unavailable
### Staging Deployment
Migrations run automatically after database creation:
```yaml
# .github/workflows/cd-staging.yml
- name: Run database migrations
run: |
docker compose exec -T mana-core-auth pnpm run db:migrate
```
### Production Deployment
Migrations run BEFORE deploying new code:
```yaml
# .github/workflows/cd-production.yml
- name: Run database migrations
run: |
docker compose run --rm mana-core-auth pnpm run db:migrate
- name: Deploy with zero-downtime
run: |
docker compose up -d
```
---
## Advisory Locks
Advisory locks prevent multiple instances from running migrations simultaneously.
### How It Works
```typescript
// services/mana-core-auth/src/db/migrate.ts
const MIGRATION_LOCK_ID = 987654321;
// Acquire lock before migration
await db.execute(sql`SELECT pg_try_advisory_lock(${MIGRATION_LOCK_ID})`);
// Run migrations...
// Release lock after migration
await db.execute(sql`SELECT pg_advisory_unlock(${MIGRATION_LOCK_ID})`);
```
### Lock Behavior
| Scenario | Behavior |
|----------|----------|
| Lock acquired | Migration runs immediately |
| Lock held by another process | Waits up to 5 minutes, then fails |
| Lock stuck | Manual release required (see Troubleshooting) |
### Lock IDs by Service
| Service | Lock ID |
|---------|---------|
| mana-core-auth | `987654321` |
| chat-backend | (to be assigned) |
| todo-backend | (to be assigned) |
### Migration Script Architecture
The production migration script (`src/db/migrate.ts`) is designed for safe, concurrent-safe deployments:
```
┌─────────────────────────────────────────────────────────────────┐
│ migrate.ts Execution Flow │
├─────────────────────────────────────────────────────────────────┤
│ │
│ 1. Load environment variables (.env) │
│ └── DATABASE_URL, MIGRATION_TIMEOUT │
│ │
│ 2. Create single-connection pool │
│ └── max: 1 (dedicated migration connection) │
│ │
│ 3. Test database connectivity (with retry) │
│ └── SELECT 1 (max 3 attempts, exponential backoff) │
│ │
│ 4. Acquire advisory lock │
│ ├── pg_try_advisory_lock() - non-blocking attempt │
│ └── If busy: poll every 5s until timeout (default: 5 min) │
│ │
│ 5. Check for migration files │
│ └── If meta/_journal.json missing: exit gracefully │
│ │
│ 6. Run Drizzle migrations │
│ └── migrate(db, { migrationsFolder }) │
│ │
│ 7. Cleanup (always runs, even on error) │
│ ├── Release advisory lock │
│ └── Close database connection │
│ │
└─────────────────────────────────────────────────────────────────┘
```
**Key Components:**
| Component | Purpose | Configuration |
|-----------|---------|---------------|
| `withRetry()` | Retry transient errors (network, connection) | 3 attempts, exponential backoff |
| `acquireLock()` | Non-blocking lock attempt | `pg_try_advisory_lock()` |
| `waitForLock()` | Polling wait for lock | 5s intervals, configurable timeout |
| `releaseLock()` | Release lock in finally block | Always runs |
**Error Handling:**
```typescript
// Transient errors (will retry):
- ECONNREFUSED, ETIMEDOUT, ENOTFOUND
- Connection errors
- PostgreSQL 57P03 (cannot connect now)
// Non-transient errors (immediate failure):
- Missing DATABASE_URL
- SQL syntax errors
- Schema conflicts
- Lock timeout
```
**Exit Codes:**
| Code | Meaning |
|------|---------|
| 0 | Success - all migrations applied |
| 1 | Failure - check logs for details |
---
## Zero-Downtime Migrations
For breaking schema changes, use the **expand-contract pattern**:
### Phase 1: Expand
Add new schema elements alongside existing ones:
```sql
-- Migration: 001_add_full_name.sql
ALTER TABLE users ADD COLUMN full_name TEXT;
```
### Phase 2: Migrate
Update application to write to both, backfill data:
```typescript
// Application code - dual write
await db.update(users).set({
name: newName, // Old column
fullName: newName, // New column
});
// Backfill script
UPDATE users SET full_name = name WHERE full_name IS NULL;
```
### Phase 3: Contract
After 1-2 weeks, remove old column:
```sql
-- Migration: 002_drop_name_column.sql
ALTER TABLE users DROP COLUMN name;
```
### Common Patterns
| Change Type | Approach |
|-------------|----------|
| Add column | Direct `ALTER TABLE ADD COLUMN` |
| Drop column | Remove from code first, wait 2 weeks, then drop |
| Rename column | Add new → dual-write → backfill → drop old |
| Change type | Add new column → backfill with cast → swap |
| Add NOT NULL | Add nullable → backfill → add constraint |
### Index Creation
Always use `CONCURRENTLY` to avoid table locks:
```sql
-- Good
CREATE INDEX CONCURRENTLY idx_users_email ON users(email);
-- Bad (locks table)
CREATE INDEX idx_users_email ON users(email);
```
---
## Rollback Procedures
### Automatic Rollback (Not Supported)
Drizzle ORM does not support automatic rollbacks. Plan your migrations carefully.
### Manual Rollback
1. **Write down migration scripts** alongside up migrations:
```
src/db/migrations/
├── 001_add_referrals.up.sql
├── 001_add_referrals.down.sql # Manual rollback script
```
2. **Execute rollback manually**:
```bash
# Connect to database
docker compose exec -T postgres psql -U postgres -d manacore_auth
# Run down migration
\i /path/to/001_add_referrals.down.sql
```
### Rollback Checklist
- [ ] Identify affected migration
- [ ] Verify rollback script exists and is tested
- [ ] Create database backup before rollback
- [ ] Execute rollback in staging first
- [ ] Monitor for issues after rollback
- [ ] Update application code if needed
---
## Troubleshooting
### Migration Lock Stuck
If a migration lock is stuck (process crashed without releasing):
```sql
-- Check for stuck locks
SELECT * FROM pg_locks WHERE locktype = 'advisory';
-- Release specific lock (replace LOCK_ID)
SELECT pg_advisory_unlock(987654321);
-- Release all advisory locks for current session
SELECT pg_advisory_unlock_all();
```
### Migration Timeout
If migrations time out:
1. Check for long-running queries: `SELECT * FROM pg_stat_activity;`
2. Increase timeout: `MIGRATION_TIMEOUT=600 pnpm db:migrate`
3. Break large migrations into smaller steps
### Schema Drift
If staging/production schema differs from expected:
```bash
# Generate migration from current schema
pnpm db:generate --name sync_schema
# Review and apply
pnpm db:migrate
```
### Connection Issues
```bash
# Test database connectivity
docker compose exec -T postgres pg_isready -U postgres
# Check environment variables
echo $DATABASE_URL
# Manual connection test
docker compose exec -T postgres psql -U postgres -d manacore_auth -c "SELECT 1"
```
### Migration Fails in CI/CD
1. Check GitHub Actions logs for specific error
2. Verify DATABASE_URL is correctly set in secrets
3. Ensure database exists before migration runs
4. Check if another migration is running (advisory lock)
---
## Best Practices
### DO
- Run migrations before deploying new code
- Test migrations in staging before production
- Use `CONCURRENTLY` for index creation
- Keep migrations small and focused
- Commit migration files to version control
- Wait 1-2 weeks before dropping columns
### DON'T
- Run `db:push` in production
- Delete migration files after they've been applied
- Modify migration files after they've been applied
- Add NOT NULL without default or backfill
- Create indexes without `CONCURRENTLY`
- Drop columns immediately after removing from code
---
## Migration File Structure
```
services/mana-core-auth/
├── src/db/
│ ├── schema/
│ │ ├── index.ts # Export all schemas
│ │ ├── auth.schema.ts # User, session tables
│ │ └── credits.schema.ts # Credit system tables
│ ├── migrations/
│ │ ├── 0001_initial/
│ │ │ ├── snapshot.json
│ │ │ └── migration.sql
│ │ └── meta/
│ │ └── _journal.json # Migration history
│ ├── connection.ts # Database connection
│ └── migrate.ts # Migration script with locks
└── drizzle.config.ts # Drizzle configuration
```
---
## Environment Variables
| Variable | Description | Default |
|----------|-------------|---------|
| `DATABASE_URL` | PostgreSQL connection string | Required |
| `MIGRATION_TIMEOUT` | Max seconds for migration | `300` |
---
## References
- [Drizzle ORM Migrations](https://orm.drizzle.team/docs/migrations)
- [PostgreSQL Advisory Locks](https://www.postgresql.org/docs/current/explicit-locking.html#ADVISORY-LOCKS)
- [Expand-Contract Pattern](https://martinfowler.com/bliki/ParallelChange.html)
- [Zero-Downtime PostgreSQL Migrations](https://postgres.ai/blog/20210923-zero-downtime-postgres-schema-migrations-lock-timeout-and-retries)

View file

@ -91,7 +91,9 @@ services/mana-core-auth/
│ ├── credits/ # Credit system
│ ├── db/
│ │ ├── schema/ # Drizzle schemas
│ │ └── connection.ts # DB connection
│ │ ├── migrations/ # Generated migration files
│ │ ├── connection.ts # DB connection
│ │ └── migrate.ts # Migration script with advisory locks
│ └── config/
│ └── configuration.ts # App config
├── docs/
@ -99,6 +101,16 @@ services/mana-core-auth/
└── test/
```
## Database Migrations
For comprehensive migration documentation, see **[docs/DATABASE_MIGRATIONS.md](/docs/DATABASE_MIGRATIONS.md)**.
Key points:
- Use `db:push` for development (fast iteration)
- Use `db:generate` + `db:migrate` for production (tracked migrations)
- Migrations use advisory locks to prevent concurrent execution
- CI/CD runs migrations automatically before code deployment
## Key Files
| File | Purpose |

View file

@ -16,6 +16,8 @@
"test:cov": "jest --coverage",
"test:e2e": "jest --config ./test/jest-e2e.json",
"db:push": "drizzle-kit push",
"db:generate": "drizzle-kit generate",
"db:migrate": "tsx src/db/migrate.ts",
"db:studio": "drizzle-kit studio"
},
"dependencies": {

View file

@ -0,0 +1,222 @@
/**
* Database Migration Script with Advisory Locks
*
* This script safely runs database migrations with the following features:
* - Advisory locks to prevent concurrent migrations
* - Retry logic for transient network failures
* - Timeout protection
* - Proper cleanup on exit
* - Graceful handling when no migrations exist
*
* Usage:
* pnpm db:migrate # Run migrations
* MIGRATION_TIMEOUT=600 pnpm db:migrate # With custom timeout (seconds)
*/
import { drizzle } from 'drizzle-orm/postgres-js';
import { migrate } from 'drizzle-orm/postgres-js/migrator';
import { sql } from 'drizzle-orm';
import postgres from 'postgres';
import * as dotenv from 'dotenv';
import * as fs from 'fs';
import * as path from 'path';
// Load environment variables from .env before reading process.env below.
dotenv.config();
// Configuration
// Arbitrary but fixed advisory-lock key; must be unique per application so
// that only migrators of *this* service serialize on it.
const MIGRATION_LOCK_ID = 987654321; // Unique lock ID for mana-core-auth migrations
// MIGRATION_TIMEOUT is supplied in seconds and converted to ms here.
const MAX_LOCK_WAIT_MS = parseInt(process.env.MIGRATION_TIMEOUT || '300', 10) * 1000; // Default 5 minutes
// Retry policy for transient connection errors (see withRetry).
const MAX_RETRIES = 3;
const RETRY_DELAY_MS = 2000; // Base delay; doubled on each attempt (exponential backoff)
/**
 * Retry wrapper for transient errors.
 *
 * Runs `operation` up to `maxRetries` times, retrying with exponential
 * backoff (RETRY_DELAY_MS * 2^(attempt-1)) only when the failure looks
 * transient: common Node network error codes in the message, any mention of
 * "connection", or PostgreSQL SQLSTATE 57P03 ("cannot connect now").
 * Non-transient errors and the final attempt's error are rethrown as-is.
 *
 * @param operation - Async operation to run; must be safe to re-invoke.
 * @param operationName - Label used in retry log messages.
 * @param maxRetries - Total number of attempts (defaults to MAX_RETRIES).
 * @returns The resolved value of the first successful attempt.
 * @throws The operation's error (immediately when non-transient, otherwise
 *   after the last attempt), or an Error when maxRetries is less than 1.
 */
async function withRetry<T>(
  operation: () => Promise<T>,
  operationName: string,
  maxRetries = MAX_RETRIES
): Promise<T> {
  // Guard the degenerate case: previously the loop body never ran for
  // maxRetries < 1 and the function ended in `throw lastError!`, i.e.
  // `throw undefined`, which is very hard to diagnose at the call site.
  if (maxRetries < 1) {
    throw new Error(`[${operationName}] maxRetries must be >= 1, got ${maxRetries}`);
  }
  let lastError: Error | undefined;
  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    try {
      return await operation();
    } catch (error) {
      lastError = error as Error;
      // Check if error is transient (network-related); message sniffing is a
      // heuristic — the driver does not expose a structured "transient" flag.
      const isTransient =
        lastError.message?.includes('ECONNREFUSED') ||
        lastError.message?.includes('ETIMEDOUT') ||
        lastError.message?.includes('ENOTFOUND') ||
        lastError.message?.includes('connection') ||
        (lastError as any).code === '57P03'; // PostgreSQL: cannot connect now
      if (!isTransient || attempt === maxRetries) {
        throw error;
      }
      const delay = RETRY_DELAY_MS * Math.pow(2, attempt - 1); // Exponential backoff
      console.log(
        `\u26a0\ufe0f [${operationName}] Transient error, retrying in ${delay}ms... (attempt ${attempt}/${maxRetries})`
      );
      console.log(` Error: ${lastError.message}`);
      await new Promise((resolve) => setTimeout(resolve, delay));
    }
  }
  // Unreachable: the final attempt either returned or rethrew above.
  throw lastError!;
}
/**
 * Try to take the migration advisory lock without blocking.
 *
 * @param db - Drizzle instance bound to the migration session.
 * @returns true when this session now holds the lock, false when another
 *   session already holds it.
 */
async function acquireLock(db: ReturnType<typeof drizzle>): Promise<boolean> {
  const rows = await db.execute(
    sql`SELECT pg_try_advisory_lock(${MIGRATION_LOCK_ID}) as acquired`
  );
  const firstRow = (rows as any)[0];
  return firstRow?.acquired === true;
}
/**
 * Hand the migration advisory lock back to PostgreSQL.
 *
 * @param db - Drizzle instance bound to the session that holds the lock.
 */
async function releaseLock(db: ReturnType<typeof drizzle>): Promise<void> {
  const unlockQuery = sql`SELECT pg_advisory_unlock(${MIGRATION_LOCK_ID})`;
  await db.execute(unlockQuery);
}
/**
 * Poll for the migration advisory lock until it is acquired or the
 * MAX_LOCK_WAIT_MS budget is exhausted, retrying every 5 seconds and
 * logging progress between attempts.
 *
 * @param db - Drizzle instance bound to the migration session.
 * @returns true once the lock is acquired, false on timeout.
 */
async function waitForLock(db: ReturnType<typeof drizzle>): Promise<boolean> {
  const startedAt = Date.now();
  while (true) {
    if (Date.now() - startedAt >= MAX_LOCK_WAIT_MS) {
      return false;
    }
    if (await acquireLock(db)) {
      return true;
    }
    const elapsed = Math.round((Date.now() - startedAt) / 1000);
    console.log(`\u23f3 Waiting for migration lock... (${elapsed}s / ${MAX_LOCK_WAIT_MS / 1000}s)`);
    await new Promise((resolve) => setTimeout(resolve, 5000));
  }
}
/**
 * Main migration function.
 *
 * Orchestrates the full flow against DATABASE_URL:
 *  1. Verifies connectivity (retried for transient failures).
 *  2. Serializes concurrent migrators via a PostgreSQL advisory lock
 *     (MIGRATION_LOCK_ID), polling up to MAX_LOCK_WAIT_MS if another
 *     instance holds it.
 *  3. Returns gracefully when no generated migrations exist yet.
 *  4. Runs Drizzle migrations from ./src/db/migrations.
 *
 * The finally block always releases the lock (if held) and closes the
 * connection — including on the early "no migrations" return.
 *
 * @throws Error when DATABASE_URL is unset, when the lock wait times out,
 *   or when the migration itself fails (after logging the failure).
 */
async function runMigrations(): Promise<void> {
  const databaseUrl = process.env.DATABASE_URL;
  if (!databaseUrl) {
    throw new Error('DATABASE_URL environment variable is not set');
  }
  console.log('\n\ud83d\udd04 Starting database migration process...');
  console.log(` Lock ID: ${MIGRATION_LOCK_ID}`);
  console.log(` Timeout: ${MAX_LOCK_WAIT_MS / 1000}s`);
  console.log('');
  // Create connection with single connection for migrations.
  // NOTE(review): max: 1 presumably keeps the (session-scoped) advisory lock
  // and the migration statements on the same session — confirm against the
  // postgres-js pooling semantics.
  const connection = postgres(databaseUrl, {
    max: 1,
    idle_timeout: 20,
    connect_timeout: 30,
  });
  const db = drizzle(connection);
  let lockAcquired = false;
  try {
    // Test database connection
    console.log('\ud83d\udd0c Testing database connection...');
    await withRetry(async () => {
      await db.execute(sql`SELECT 1`);
    }, 'Database connection');
    console.log('\u2705 Database connection successful\n');
    // Attempt to acquire advisory lock: non-blocking fast path first,
    // then fall back to polling with a timeout.
    console.log('\ud83d\udd12 Attempting to acquire migration lock...');
    lockAcquired = await withRetry(() => acquireLock(db), 'Acquire lock');
    if (!lockAcquired) {
      console.log('\u23f3 Another instance is running migrations. Waiting for lock...');
      lockAcquired = await waitForLock(db);
      if (!lockAcquired) {
        throw new Error(
          `Migration lock timeout after ${MAX_LOCK_WAIT_MS / 1000}s - another migration may be stuck`
        );
      }
    }
    console.log('\u2705 Migration lock acquired\n');
    // Check if migration files exist: the journal is produced by
    // `pnpm db:generate`; without it there is nothing to apply.
    const migrationsFolder = './src/db/migrations';
    const journalPath = path.join(migrationsFolder, 'meta', '_journal.json');
    if (!fs.existsSync(journalPath)) {
      console.log('\u26a0\ufe0f No migration files found (meta/_journal.json missing)');
      console.log(' This is normal if you have not generated any migrations yet.');
      console.log(' To generate migrations, run: pnpm db:generate');
      console.log(' For development, you can use: pnpm db:push');
      console.log('\n\u2705 No migrations to run\n');
      return; // finally below still releases the lock and closes the connection
    }
    // Run migrations
    console.log('\ud83d\udce6 Running database migrations...');
    await withRetry(
      async () => {
        await migrate(db, {
          migrationsFolder,
        });
      },
      'Run migrations',
      1 // Only 1 attempt for actual migrations (they should be idempotent)
    );
    console.log('\u2705 Migrations completed successfully\n');
  } catch (error) {
    console.error('\n\u274c Migration failed:', error);
    throw error; // rethrow so the top-level runner exits with code 1
  } finally {
    // Always attempt to release lock
    if (lockAcquired) {
      try {
        await releaseLock(db);
        console.log('\ud83d\udd13 Migration lock released');
      } catch (unlockError) {
        console.error('\u26a0\ufe0f Failed to release lock:', unlockError);
      }
    }
    // Close connection
    try {
      await connection.end();
      console.log('\ud83d\udd0c Database connection closed\n');
    } catch (closeError) {
      console.error('\u26a0\ufe0f Failed to close connection:', closeError);
    }
  }
}
// Script entry point: run the migration flow and map the outcome onto the
// process exit code (0 = success, 1 = failure).
runMigrations().then(
  () => {
    console.log('\ud83c\udf89 Migration process completed successfully');
    process.exit(0);
  },
  (error) => {
    console.error('\n\ud83d\udca5 Migration process failed:', error.message);
    process.exit(1);
  }
);

View file

@ -1,39 +0,0 @@
-- Creates the "feedback" schema: enum types, the main user_feedback table,
-- a per-user vote table, and supporting indexes.
-- The "--> statement-breakpoint" markers are Drizzle's statement separators.
CREATE SCHEMA "feedback";
--> statement-breakpoint
-- Enum types are created in "public", not in the new "feedback" schema.
CREATE TYPE "public"."feedback_category" AS ENUM('bug', 'feature', 'improvement', 'question', 'other');--> statement-breakpoint
CREATE TYPE "public"."feedback_status" AS ENUM('submitted', 'under_review', 'planned', 'in_progress', 'completed', 'declined');--> statement-breakpoint
-- One row per (feedback, user) vote; uniqueness enforced by the
-- feedback_vote_unique index below.
CREATE TABLE "feedback"."feedback_votes" (
	"id" uuid PRIMARY KEY DEFAULT gen_random_uuid() NOT NULL,
	"feedback_id" uuid NOT NULL,
	"user_id" uuid NOT NULL,
	"created_at" timestamp with time zone DEFAULT now() NOT NULL
);
--> statement-breakpoint
-- Main feedback table. vote_count is a denormalized counter — presumably
-- maintained by application code; not enforced by this migration.
CREATE TABLE "feedback"."user_feedback" (
	"id" uuid PRIMARY KEY DEFAULT gen_random_uuid() NOT NULL,
	"user_id" uuid NOT NULL,
	"app_id" text NOT NULL,
	"title" text,
	"feedback_text" text NOT NULL,
	"category" "feedback_category" DEFAULT 'feature' NOT NULL,
	"status" "feedback_status" DEFAULT 'submitted' NOT NULL,
	"is_public" boolean DEFAULT false NOT NULL,
	"admin_response" text,
	"vote_count" integer DEFAULT 0 NOT NULL,
	"device_info" jsonb,
	"created_at" timestamp with time zone DEFAULT now() NOT NULL,
	"updated_at" timestamp with time zone DEFAULT now() NOT NULL,
	"published_at" timestamp with time zone,
	"completed_at" timestamp with time zone
);
--> statement-breakpoint
-- Foreign keys: votes cascade-delete with their feedback row and with their
-- user; feedback rows cascade-delete with their author in auth.users.
ALTER TABLE "feedback"."feedback_votes" ADD CONSTRAINT "feedback_votes_feedback_id_user_feedback_id_fk" FOREIGN KEY ("feedback_id") REFERENCES "feedback"."user_feedback"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
ALTER TABLE "feedback"."feedback_votes" ADD CONSTRAINT "feedback_votes_user_id_users_id_fk" FOREIGN KEY ("user_id") REFERENCES "auth"."users"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
ALTER TABLE "feedback"."user_feedback" ADD CONSTRAINT "user_feedback_user_id_users_id_fk" FOREIGN KEY ("user_id") REFERENCES "auth"."users"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
-- Prevents double-voting: at most one vote per user per feedback item.
CREATE UNIQUE INDEX "feedback_vote_unique" ON "feedback"."feedback_votes" USING btree ("feedback_id","user_id");--> statement-breakpoint
CREATE INDEX "feedback_votes_feedback_idx" ON "feedback"."feedback_votes" USING btree ("feedback_id");--> statement-breakpoint
-- Lookup indexes for the common user_feedback query filters.
CREATE INDEX "feedback_user_idx" ON "feedback"."user_feedback" USING btree ("user_id");--> statement-breakpoint
CREATE INDEX "feedback_app_idx" ON "feedback"."user_feedback" USING btree ("app_id");--> statement-breakpoint
CREATE INDEX "feedback_public_idx" ON "feedback"."user_feedback" USING btree ("is_public");--> statement-breakpoint
CREATE INDEX "feedback_status_idx" ON "feedback"."user_feedback" USING btree ("status");--> statement-breakpoint
CREATE INDEX "feedback_created_at_idx" ON "feedback"."user_feedback" USING btree ("created_at");