diff --git a/internal/database/database.go b/internal/database/database.go index f7e134e..f1519b6 100644 --- a/internal/database/database.go +++ b/internal/database/database.go @@ -3,6 +3,7 @@ package database import ( "context" "fmt" + "strings" "time" "github.com/rs/zerolog/log" @@ -120,6 +121,48 @@ func Connect(cfg *config.DatabaseConfig, debug bool) (*gorm.DB, error) { return db, nil } +// openMigrationDB opens a parallel *gorm.DB pointed at the *direct* (non-pooler) +// Neon compute endpoint, used exclusively by MigrateWithLock so that +// pg_advisory_lock (session-scoped) survives across statements. +// +// The direct host is derived by stripping the "-pooler" segment from the +// configured DB_HOST. Example: +// pooler → ep-floral-truth-amttbc5a-pooler.c-5.us-east-1.aws.neon.tech +// direct → ep-floral-truth-amttbc5a.c-5.us-east-1.aws.neon.tech +// +// If DB_HOST already lacks the suffix (e.g., a fallback config or a non-Neon +// PG instance), it's used as-is. +func openMigrationDB() (*gorm.DB, error) { + host := viper.GetString("DB_HOST") + port := viper.GetInt("DB_PORT") + user := viper.GetString("POSTGRES_USER") + pass := viper.GetString("POSTGRES_PASSWORD") + name := viper.GetString("POSTGRES_DB") + sslmode := viper.GetString("DB_SSLMODE") + + directHost := host + if idx := strings.Index(host, "-pooler."); idx > 0 { + directHost = host[:idx] + host[idx+len("-pooler"):] + } + + dsn := fmt.Sprintf("host=%s port=%d user=%s password=%s dbname=%s sslmode=%s", + directHost, port, user, pass, name, sslmode) + + migrationDB, err := gorm.Open(postgres.Open(dsn), &gorm.Config{ + Logger: logger.Default.LogMode(logger.Silent), + }) + if err != nil { + return nil, err + } + if sqlDB, _ := migrationDB.DB(); sqlDB != nil { + // Single connection is enough — we hold the advisory lock on it. + sqlDB.SetMaxOpenConns(1) + sqlDB.SetMaxIdleConns(1) + } + log.Info().Str("direct_host", directHost).Msg("Opened direct DB connection for migration") + return migrationDB, nil +} + // warmUpPool issues N parallel pings so the pool fills with established // connections before the first user request lands. Failures are logged but // not fatal — the pool will fill on demand under traffic if pre-warm fails. @@ -200,18 +243,37 @@ func MigrateWithLock() error { return Migrate() } - sqlDB, err := db.DB() - if err != nil { - return fmt.Errorf("get underlying sql.DB: %w", err) - } - // Give ourselves up to 5 min to acquire the lock — long enough for a // slow migration on a peer replica, short enough to fail fast if Postgres // is hung. ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) defer cancel() - conn, err := sqlDB.Conn(ctx) + // CRITICAL: pg_advisory_lock is *session-scoped*. The runtime DB pool + // connects to Neon's pooler endpoint (PgBouncer in transaction mode), + // which releases the underlying Postgres session after each transaction — + // which would silently release our lock the moment we acquired it. + // + // So for migrations only, we open a parallel *gorm.DB pointed at the + // *direct* compute endpoint (DB_HOST with the "-pooler" segment stripped). + // AutoMigrate runs against this direct connection, holding the lock the + // whole time. Closed on return. + migrationDB, err := openMigrationDB() + if err != nil { + return fmt.Errorf("open direct migration connection: %w", err) + } + defer func() { + if sqlDB, _ := migrationDB.DB(); sqlDB != nil { + _ = sqlDB.Close() + } + }() + + migSQLDB, err := migrationDB.DB() + if err != nil { + return fmt.Errorf("get underlying sql.DB: %w", err) + } + + conn, err := migSQLDB.Conn(ctx) if err != nil { return fmt.Errorf("acquire dedicated migration connection: %w", err) } @@ -234,15 +296,27 @@ func MigrateWithLock() error { } }() - return Migrate() + // Run migrations against the *direct* migrationDB while holding the + // session-scoped lock on `conn`. Migrate() defaults to the global + // `db` (the pooler-backed pool) but the package-level `db` swap during + // migration is risky; instead, just call AutoMigrate on migrationDB. + return migrate(migrationDB) } -// Migrate runs database migrations for all models +// Migrate runs database migrations against the global pool. This is fine +// when the global pool is connected via a session (no PgBouncer) — e.g., +// in tests with SQLite. In production, MigrateWithLock should be used, +// which targets a direct Postgres connection. func Migrate() error { + return migrate(db) +} + +// migrate runs AutoMigrate on the supplied gorm.DB. +func migrate(target *gorm.DB) error { log.Info().Msg("Running database migrations...") // Migrate all models in order (respecting foreign key constraints) - err := db.AutoMigrate( + err := target.AutoMigrate( // Lookup tables first (no foreign keys) &models.ResidenceType{}, &models.TaskCategory{},