Harden prod deploy: versioned secrets, healthchecks, migration lock, dry-run
Swarm stack
- Resource limits on all services, stop_grace_period 60s on api/worker/admin
- Dozzle bound to manager loopback only (ssh -L required for access)
- Worker health server on :6060, admin /api/health endpoint
- Redis 200M LRU cap, B2/S3 env vars wired through to api service

Deploy script
- DRY_RUN=1 prints plan + exits
- Auto-rollback on failed healthcheck, docker logout at end
- Versioned-secret pruning keeps last SECRET_KEEP_VERSIONS (default 3)
- PUSH_LATEST_TAG default flipped to false
- B2 all-or-none validation before deploy

Code
- cmd/api takes pg_advisory_lock on a dedicated connection before AutoMigrate, serialising boot-time migrations across replicas
- cmd/worker exposes an HTTP /health endpoint with graceful shutdown

Docs
- deploy/DEPLOYING.md: step-by-step walkthrough for a real deploy
- deploy/shit_deploy_cant_do.md: manual prerequisites + recurring ops
- deploy/README.md updated with storage toggle, worker-replica caveat, multi-arch recipe, connection-pool tuning, renumbered sections

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
package database
|
||||
|
||||
import (
	"context"
	"database/sql/driver"
	"fmt"
	"time"
|
||||
|
||||
@@ -15,6 +16,11 @@ import (
|
||||
"github.com/treytartt/honeydue-api/internal/models"
|
||||
)
|
||||
|
||||
// migrationAdvisoryLockKey is the pg_advisory_lock key that serializes
|
||||
// Migrate() across API replicas booting in parallel. Value is arbitrary but
|
||||
// stable ("hdmg" as bytes = honeydue migration).
|
||||
// Built from the four ASCII bytes 'h', 'd', 'm', 'g' (big-endian); the
// resulting value is 0x68646d67.
const migrationAdvisoryLockKey int64 = 'h'<<24 | 'd'<<16 | 'm'<<8 | 'g'
|
||||
|
||||
// zerologGormWriter adapts zerolog for GORM's logger interface
|
||||
// zerologGormWriter is declared as an empty struct: it carries no state of
// its own.
type zerologGormWriter struct{}
|
||||
|
||||
@@ -121,6 +127,54 @@ func Paginate(page, pageSize int) func(db *gorm.DB) *gorm.DB {
|
||||
}
|
||||
}
|
||||
|
||||
// MigrateWithLock runs Migrate() under a Postgres session-level advisory lock
|
||||
// so that multiple API replicas booting in parallel don't race on AutoMigrate.
|
||||
// On non-Postgres dialects (sqlite in tests) it falls through to Migrate().
|
||||
func MigrateWithLock() error {
|
||||
if db == nil {
|
||||
return fmt.Errorf("database not initialised")
|
||||
}
|
||||
if db.Dialector.Name() != "postgres" {
|
||||
return Migrate()
|
||||
}
|
||||
|
||||
sqlDB, err := db.DB()
|
||||
if err != nil {
|
||||
return fmt.Errorf("get underlying sql.DB: %w", err)
|
||||
}
|
||||
|
||||
// Give ourselves up to 5 min to acquire the lock — long enough for a
|
||||
// slow migration on a peer replica, short enough to fail fast if Postgres
|
||||
// is hung.
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
conn, err := sqlDB.Conn(ctx)
|
||||
if err != nil {
|
||||
return fmt.Errorf("acquire dedicated migration connection: %w", err)
|
||||
}
|
||||
defer conn.Close()
|
||||
|
||||
log.Info().Int64("lock_key", migrationAdvisoryLockKey).Msg("Acquiring migration advisory lock...")
|
||||
if _, err := conn.ExecContext(ctx, "SELECT pg_advisory_lock($1)", migrationAdvisoryLockKey); err != nil {
|
||||
return fmt.Errorf("pg_advisory_lock: %w", err)
|
||||
}
|
||||
log.Info().Msg("Migration advisory lock acquired")
|
||||
|
||||
defer func() {
|
||||
// Unlock with a fresh context — the outer ctx may have expired.
|
||||
unlockCtx, unlockCancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer unlockCancel()
|
||||
if _, err := conn.ExecContext(unlockCtx, "SELECT pg_advisory_unlock($1)", migrationAdvisoryLockKey); err != nil {
|
||||
log.Warn().Err(err).Msg("Failed to release migration advisory lock (session close will also release)")
|
||||
} else {
|
||||
log.Info().Msg("Migration advisory lock released")
|
||||
}
|
||||
}()
|
||||
|
||||
return Migrate()
|
||||
}
|
||||
|
||||
// Migrate runs database migrations for all models
|
||||
func Migrate() error {
|
||||
log.Info().Msg("Running database migrations...")
|
||||
|
||||
Reference in New Issue
Block a user