From 30966c6f5e525c80c4833ac9292b0a482abd1e50 Mon Sep 17 00:00:00 2001 From: Trey t Date: Sun, 26 Apr 2026 21:53:52 -0500 Subject: [PATCH] Fix migration deadlock under Neon pooler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Today's pooler-endpoint switch broke MigrateWithLock: pg_advisory_lock is session-scoped, but PgBouncer transaction-mode releases the underlying Postgres session after every transaction. The lock was being released the moment we acquired it, and on the next pod's startup the migration either deadlocked or proceeded without serialization. Visible as \"Acquiring migration advisory lock...\" hanging until the startup probe killed the pod (as just happened on the b67f7f9 deploy). Fix: open a parallel *gorm.DB pointed at the *direct* Neon endpoint (DB_HOST with the -pooler segment stripped) for migrations only. That keeps a real persistent session, so pg_advisory_lock works correctly. The migration runs against this direct connection; the runtime pool keeps using the pooler for everything else. Side effects: - Migrate() now delegates to migrate(target *gorm.DB) which lets MigrateWithLock pass the direct DB. Tests on SQLite still call Migrate() through the global pool unchanged. - Migration DB uses MaxOpenConns=1, MaxIdleConns=1 — we just hold the lock on it, no connection pressure. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/database/database.go | 92 +++++++++++++++++++++++++++++++---- 1 file changed, 83 insertions(+), 9 deletions(-) diff --git a/internal/database/database.go b/internal/database/database.go index f7e134e..f1519b6 100644 --- a/internal/database/database.go +++ b/internal/database/database.go @@ -3,6 +3,7 @@ package database import ( "context" "fmt" + "strings" "time" "github.com/rs/zerolog/log" @@ -120,6 +121,48 @@ func Connect(cfg *config.DatabaseConfig, debug bool) (*gorm.DB, error) { return db, nil } +// openMigrationDB opens a parallel *gorm.DB pointed at the *direct* (non-pooler) +// Neon compute endpoint, used exclusively by MigrateWithLock so that +// pg_advisory_lock (session-scoped) survives across statements. +// +// The direct host is derived by stripping the "-pooler" segment from the +// configured DB_HOST. Example: +// pooler → ep-floral-truth-amttbc5a-pooler.c-5.us-east-1.aws.neon.tech +// direct → ep-floral-truth-amttbc5a.c-5.us-east-1.aws.neon.tech +// +// If DB_HOST already lacks the suffix (e.g., a fallback config or a non-Neon +// PG instance), it's used as-is. +func openMigrationDB() (*gorm.DB, error) { + host := viper.GetString("DB_HOST") + port := viper.GetInt("DB_PORT") + user := viper.GetString("POSTGRES_USER") + pass := viper.GetString("POSTGRES_PASSWORD") + name := viper.GetString("POSTGRES_DB") + sslmode := viper.GetString("DB_SSLMODE") + + directHost := host + if idx := strings.Index(host, "-pooler."); idx > 0 { + directHost = host[:idx] + host[idx+len("-pooler"):] + } + + dsn := fmt.Sprintf("host=%s port=%d user=%s password=%s dbname=%s sslmode=%s", + directHost, port, user, pass, name, sslmode) + + migrationDB, err := gorm.Open(postgres.Open(dsn), &gorm.Config{ + Logger: logger.Default.LogMode(logger.Silent), + }) + if err != nil { + return nil, err + } + if sqlDB, _ := migrationDB.DB(); sqlDB != nil { + // Single connection is enough — we hold the advisory lock on it. + sqlDB.SetMaxOpenConns(1) + sqlDB.SetMaxIdleConns(1) + } + log.Info().Str("direct_host", directHost).Msg("Opened direct DB connection for migration") + return migrationDB, nil +} + // warmUpPool issues N parallel pings so the pool fills with established // connections before the first user request lands. Failures are logged but // not fatal — the pool will fill on demand under traffic if pre-warm fails. @@ -200,18 +243,37 @@ func MigrateWithLock() error { return Migrate() } - sqlDB, err := db.DB() - if err != nil { - return fmt.Errorf("get underlying sql.DB: %w", err) - } - // Give ourselves up to 5 min to acquire the lock — long enough for a // slow migration on a peer replica, short enough to fail fast if Postgres // is hung. ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) defer cancel() - conn, err := sqlDB.Conn(ctx) + // CRITICAL: pg_advisory_lock is *session-scoped*. The runtime DB pool + // connects to Neon's pooler endpoint (PgBouncer in transaction mode), + // which releases the underlying Postgres session after each transaction — + // which would silently release our lock the moment we acquired it. + // + // So for migrations only, we open a parallel *gorm.DB pointed at the + // *direct* compute endpoint (DB_HOST with the "-pooler" segment stripped). + // AutoMigrate runs against this direct connection, holding the lock the + // whole time. Closed on return. + migrationDB, err := openMigrationDB() + if err != nil { + return fmt.Errorf("open direct migration connection: %w", err) + } + defer func() { + if sqlDB, _ := migrationDB.DB(); sqlDB != nil { + _ = sqlDB.Close() + } + }() + + migSQLDB, err := migrationDB.DB() + if err != nil { + return fmt.Errorf("get underlying sql.DB: %w", err) + } + + conn, err := migSQLDB.Conn(ctx) if err != nil { return fmt.Errorf("acquire dedicated migration connection: %w", err) } @@ -234,15 +296,27 @@ func MigrateWithLock() error { } }() - return Migrate() + // Run migrations against the *direct* migrationDB while holding the + // session-scoped lock on `conn`. Migrate() defaults to the global + // `db` (the pooler-backed pool) but the package-level `db` swap during + // migration is risky; instead, just call AutoMigrate on migrationDB. + return migrate(migrationDB) } -// Migrate runs database migrations for all models +// Migrate runs database migrations against the global pool. This is fine +// when the global pool is connected via a session (no PgBouncer) — e.g., +// in tests with SQLite. In production, MigrateWithLock should be used, +// which targets a direct Postgres connection. func Migrate() error { + return migrate(db) +} + +// migrate runs AutoMigrate on the supplied gorm.DB. +func migrate(target *gorm.DB) error { log.Info().Msg("Running database migrations...") // Migrate all models in order (respecting foreign key constraints) - err := db.AutoMigrate( + err := target.AutoMigrate( // Lookup tables first (no foreign keys) &models.ResidenceType{}, &models.TaskCategory{},