Fix migration deadlock under Neon pooler
Today's pooler-endpoint switch broke MigrateWithLock: pg_advisory_lock is
session-scoped, but PgBouncer transaction-mode releases the underlying
Postgres session after every transaction. The lock was being released
the moment we acquired it, and on the next pod's startup the migration
either deadlocked or proceeded without serialization. Visible as
\"Acquiring migration advisory lock...\" hanging until the startup
probe killed the pod (as just happened on the b67f7f9 deploy).
Fix: open a parallel *gorm.DB pointed at the *direct* Neon endpoint
(DB_HOST with the -pooler segment stripped) for migrations only. That
keeps a real persistent session, so pg_advisory_lock works correctly.
The migration runs against this direct connection; the runtime pool
keeps using the pooler for everything else.
Side effects:
- Migrate() now delegates to migrate(target *gorm.DB) which lets
MigrateWithLock pass the direct DB. Tests on SQLite still call
Migrate() through the global pool unchanged.
- Migration DB uses MaxOpenConns=1, MaxIdleConns=1 — we just hold
the lock on it, no connection pressure.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -3,6 +3,7 @@ package database
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/rs/zerolog/log"
|
"github.com/rs/zerolog/log"
|
||||||
@@ -120,6 +121,48 @@ func Connect(cfg *config.DatabaseConfig, debug bool) (*gorm.DB, error) {
|
|||||||
return db, nil
|
return db, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// openMigrationDB opens a parallel *gorm.DB pointed at the *direct* (non-pooler)
|
||||||
|
// Neon compute endpoint, used exclusively by MigrateWithLock so that
|
||||||
|
// pg_advisory_lock (session-scoped) survives across statements.
|
||||||
|
//
|
||||||
|
// The direct host is derived by stripping the "-pooler" segment from the
|
||||||
|
// configured DB_HOST. Example:
|
||||||
|
// pooler → ep-floral-truth-amttbc5a-pooler.c-5.us-east-1.aws.neon.tech
|
||||||
|
// direct → ep-floral-truth-amttbc5a.c-5.us-east-1.aws.neon.tech
|
||||||
|
//
|
||||||
|
// If DB_HOST already lacks the suffix (e.g., a fallback config or a non-Neon
|
||||||
|
// PG instance), it's used as-is.
|
||||||
|
func openMigrationDB() (*gorm.DB, error) {
|
||||||
|
host := viper.GetString("DB_HOST")
|
||||||
|
port := viper.GetInt("DB_PORT")
|
||||||
|
user := viper.GetString("POSTGRES_USER")
|
||||||
|
pass := viper.GetString("POSTGRES_PASSWORD")
|
||||||
|
name := viper.GetString("POSTGRES_DB")
|
||||||
|
sslmode := viper.GetString("DB_SSLMODE")
|
||||||
|
|
||||||
|
directHost := host
|
||||||
|
if idx := strings.Index(host, "-pooler."); idx > 0 {
|
||||||
|
directHost = host[:idx] + host[idx+len("-pooler"):]
|
||||||
|
}
|
||||||
|
|
||||||
|
dsn := fmt.Sprintf("host=%s port=%d user=%s password=%s dbname=%s sslmode=%s",
|
||||||
|
directHost, port, user, pass, name, sslmode)
|
||||||
|
|
||||||
|
migrationDB, err := gorm.Open(postgres.Open(dsn), &gorm.Config{
|
||||||
|
Logger: logger.Default.LogMode(logger.Silent),
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if sqlDB, _ := migrationDB.DB(); sqlDB != nil {
|
||||||
|
// Single connection is enough — we hold the advisory lock on it.
|
||||||
|
sqlDB.SetMaxOpenConns(1)
|
||||||
|
sqlDB.SetMaxIdleConns(1)
|
||||||
|
}
|
||||||
|
log.Info().Str("direct_host", directHost).Msg("Opened direct DB connection for migration")
|
||||||
|
return migrationDB, nil
|
||||||
|
}
|
||||||
|
|
||||||
// warmUpPool issues N parallel pings so the pool fills with established
|
// warmUpPool issues N parallel pings so the pool fills with established
|
||||||
// connections before the first user request lands. Failures are logged but
|
// connections before the first user request lands. Failures are logged but
|
||||||
// not fatal — the pool will fill on demand under traffic if pre-warm fails.
|
// not fatal — the pool will fill on demand under traffic if pre-warm fails.
|
||||||
@@ -200,18 +243,37 @@ func MigrateWithLock() error {
|
|||||||
return Migrate()
|
return Migrate()
|
||||||
}
|
}
|
||||||
|
|
||||||
sqlDB, err := db.DB()
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("get underlying sql.DB: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Give ourselves up to 5 min to acquire the lock — long enough for a
|
// Give ourselves up to 5 min to acquire the lock — long enough for a
|
||||||
// slow migration on a peer replica, short enough to fail fast if Postgres
|
// slow migration on a peer replica, short enough to fail fast if Postgres
|
||||||
// is hung.
|
// is hung.
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
|
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
|
|
||||||
conn, err := sqlDB.Conn(ctx)
|
// CRITICAL: pg_advisory_lock is *session-scoped*. The runtime DB pool
|
||||||
|
// connects to Neon's pooler endpoint (PgBouncer in transaction mode),
|
||||||
|
// which releases the underlying Postgres session after each transaction —
|
||||||
|
// which would silently release our lock the moment we acquired it.
|
||||||
|
//
|
||||||
|
// So for migrations only, we open a parallel *gorm.DB pointed at the
|
||||||
|
// *direct* compute endpoint (DB_HOST with the "-pooler" segment stripped).
|
||||||
|
// AutoMigrate runs against this direct connection, holding the lock the
|
||||||
|
// whole time. Closed on return.
|
||||||
|
migrationDB, err := openMigrationDB()
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("open direct migration connection: %w", err)
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
if sqlDB, _ := migrationDB.DB(); sqlDB != nil {
|
||||||
|
_ = sqlDB.Close()
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
migSQLDB, err := migrationDB.DB()
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("get underlying sql.DB: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
conn, err := migSQLDB.Conn(ctx)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("acquire dedicated migration connection: %w", err)
|
return fmt.Errorf("acquire dedicated migration connection: %w", err)
|
||||||
}
|
}
|
||||||
@@ -234,15 +296,27 @@ func MigrateWithLock() error {
|
|||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
return Migrate()
|
// Run migrations against the *direct* migrationDB while holding the
|
||||||
|
// session-scoped lock on `conn`. Migrate() defaults to the global
|
||||||
|
// `db` (the pooler-backed pool) but the package-level `db` swap during
|
||||||
|
// migration is risky; instead, just call AutoMigrate on migrationDB.
|
||||||
|
return migrate(migrationDB)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Migrate runs database migrations for all models
|
// Migrate runs database migrations against the global pool. This is fine
|
||||||
|
// when the global pool is connected via a session (no PgBouncer) — e.g.,
|
||||||
|
// in tests with SQLite. In production, MigrateWithLock should be used,
|
||||||
|
// which targets a direct Postgres connection.
|
||||||
func Migrate() error {
|
func Migrate() error {
|
||||||
|
return migrate(db)
|
||||||
|
}
|
||||||
|
|
||||||
|
// migrate runs AutoMigrate on the supplied gorm.DB.
|
||||||
|
func migrate(target *gorm.DB) error {
|
||||||
log.Info().Msg("Running database migrations...")
|
log.Info().Msg("Running database migrations...")
|
||||||
|
|
||||||
// Migrate all models in order (respecting foreign key constraints)
|
// Migrate all models in order (respecting foreign key constraints)
|
||||||
err := db.AutoMigrate(
|
err := target.AutoMigrate(
|
||||||
// Lookup tables first (no foreign keys)
|
// Lookup tables first (no foreign keys)
|
||||||
&models.ResidenceType{},
|
&models.ResidenceType{},
|
||||||
&models.TaskCategory{},
|
&models.TaskCategory{},
|
||||||
|
|||||||
Reference in New Issue
Block a user