Harden prod deploy: versioned secrets, healthchecks, migration lock, dry-run

Swarm stack
- Resource limits on all services, stop_grace_period 60s on api/worker/admin
- Dozzle bound to manager loopback only (ssh -L required for access)
- Worker health server on :6060, admin /api/health endpoint
- Redis 200M LRU cap, B2/S3 env vars wired through to api service

Deploy script
- DRY_RUN=1 prints plan + exits
- Auto-rollback on failed healthcheck, docker logout at end
- Versioned-secret pruning keeps last SECRET_KEEP_VERSIONS (default 3)
- PUSH_LATEST_TAG default flipped to false
- B2 all-or-none validation before deploy

Code
- cmd/api takes pg_advisory_lock on a dedicated connection before
  AutoMigrate, serialising boot-time migrations across replicas
- cmd/worker exposes an HTTP /health endpoint with graceful shutdown

Docs
- deploy/DEPLOYING.md: step-by-step walkthrough for a real deploy
- deploy/shit_deploy_cant_do.md: manual prerequisites + recurring ops
- deploy/README.md updated with storage toggle, worker-replica caveat,
  multi-arch recipe, connection-pool tuning, renumbered sections

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-04-14 15:22:43 -05:00
parent ca818e8478
commit 33eee812b6
11 changed files with 908 additions and 30 deletions

View File

@@ -1,6 +1,7 @@
package database
import (
"context"
"fmt"
"time"
@@ -15,6 +16,11 @@ import (
"github.com/treytartt/honeydue-api/internal/models"
)
// migrationAdvisoryLockKey is the pg_advisory_lock key that serializes
// Migrate() across API replicas booting in parallel. Value is arbitrary but
// stable ("hdmg" as bytes = honeydue migration).
const migrationAdvisoryLockKey int64 = 0x68646d67
// zerologGormWriter adapts zerolog for GORM's logger interface
type zerologGormWriter struct{}
@@ -121,6 +127,54 @@ func Paginate(page, pageSize int) func(db *gorm.DB) *gorm.DB {
}
}
// MigrateWithLock runs Migrate() under a Postgres session-level advisory lock
// so that multiple API replicas booting in parallel don't race on AutoMigrate.
// On non-Postgres dialects (sqlite in tests) it falls through to Migrate().
func MigrateWithLock() error {
if db == nil {
return fmt.Errorf("database not initialised")
}
if db.Dialector.Name() != "postgres" {
return Migrate()
}
sqlDB, err := db.DB()
if err != nil {
return fmt.Errorf("get underlying sql.DB: %w", err)
}
// Give ourselves up to 5 min to acquire the lock — long enough for a
// slow migration on a peer replica, short enough to fail fast if Postgres
// is hung.
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel()
conn, err := sqlDB.Conn(ctx)
if err != nil {
return fmt.Errorf("acquire dedicated migration connection: %w", err)
}
defer conn.Close()
log.Info().Int64("lock_key", migrationAdvisoryLockKey).Msg("Acquiring migration advisory lock...")
if _, err := conn.ExecContext(ctx, "SELECT pg_advisory_lock($1)", migrationAdvisoryLockKey); err != nil {
return fmt.Errorf("pg_advisory_lock: %w", err)
}
log.Info().Msg("Migration advisory lock acquired")
defer func() {
// Unlock with a fresh context — the outer ctx may have expired.
unlockCtx, unlockCancel := context.WithTimeout(context.Background(), 10*time.Second)
defer unlockCancel()
if _, err := conn.ExecContext(unlockCtx, "SELECT pg_advisory_unlock($1)", migrationAdvisoryLockKey); err != nil {
log.Warn().Err(err).Msg("Failed to release migration advisory lock (session close will also release)")
} else {
log.Info().Msg("Migration advisory lock released")
}
}()
return Migrate()
}
// Migrate runs database migrations for all models
func Migrate() error {
log.Info().Msg("Running database migrations...")