// notif-diag is a CLI for inspecting and (optionally) cleaning up stuck // notification rows. Default mode is read-only — runs SELECTs and prints a // summary. With --mark-failed-as-sent, marks pending rows that already have a // recorded error as sent (cosmetic — no retry, no resend). // // Usage: // // set -a && source deploy/prod.env && set +a // go run ./cmd/notif-diag # diagnose // go run ./cmd/notif-diag --mark-failed-as-sent --yes # clean up errored backlog package main import ( "bufio" "fmt" "os" "strconv" "strings" "time" "github.com/rs/zerolog" "github.com/rs/zerolog/log" "gorm.io/driver/postgres" "gorm.io/gorm" "gorm.io/gorm/logger" ) func main() { passwordFile := stringFlag("password-file", "deploy/secrets/postgres_password.txt", "Path to file containing POSTGRES_PASSWORD (used if env var is empty)") markFailed := boolFlag("mark-failed-as-sent", "Mark every pending row with a non-empty error_message as sent. Cosmetic only — does not retry the push.") yes := boolFlag("yes", "Skip the interactive confirmation prompt for destructive actions.") log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr, TimeFormat: time.RFC3339}) dsn, host, err := buildDSN(*passwordFile) if err != nil { log.Fatal().Err(err).Msg("failed to build database DSN") } db, err := gorm.Open(postgres.Open(dsn), &gorm.Config{ Logger: logger.Default.LogMode(logger.Silent), }) if err != nil { log.Fatal().Err(err).Msg("failed to connect to database") } fmt.Printf("DB host: %s\n", host) fmt.Println(strings.Repeat("=", 80)) overallTotals(db) pendingByType(db) recentPending(db) deviceCounts(db) if *markFailed { markFailedAsSent(db, *yes) } } // markFailedAsSent updates pending rows whose error_message is non-empty, // flipping them to sent=true with sent_at=updated_at. This is purely cosmetic: // it removes them from the "pending" count so dashboards and the diag tool // don't keep flagging an old, unfixable backlog. It does NOT re-send anything. func markFailedAsSent(db *gorm.DB, skipPrompt bool) { var candidate int64 if err := db.Raw(` SELECT COUNT(*) FROM notifications_notification WHERE sent = false AND error_message IS NOT NULL AND error_message <> '' `).Scan(&candidate).Error; err != nil { log.Fatal().Err(err).Msg("failed to count cleanup candidates") } fmt.Printf("\n# Cleanup candidate count: %d\n", candidate) if candidate == 0 { fmt.Println(" (nothing to clean up)") return } fmt.Println(" These rows have a recorded send error and will never be retried.") fmt.Println(" Marking them sent=true is cosmetic — it just prevents them from") fmt.Println(" showing up as pending in admin dashboards going forward.") if !skipPrompt { fmt.Printf("\nProceed? Type 'yes' to update %d rows: ", candidate) s, err := bufio.NewReader(os.Stdin).ReadString('\n') if err != nil { log.Fatal().Err(err).Msg("failed to read confirmation") } if strings.TrimSpace(s) != "yes" { fmt.Println("Aborted.") return } } res := db.Exec(` UPDATE notifications_notification SET sent = true, sent_at = COALESCE(updated_at, NOW()) WHERE sent = false AND error_message IS NOT NULL AND error_message <> '' `) if res.Error != nil { log.Fatal().Err(res.Error).Msg("failed to update rows") } fmt.Printf("OK — updated %d rows.\n", res.RowsAffected) } // overallTotals shows the high-level sent/pending/read split. func overallTotals(db *gorm.DB) { type row struct { Total int64 Sent int64 Pending int64 Read int64 Errored int64 } var r row db.Raw(` SELECT COUNT(*) AS total, COUNT(*) FILTER (WHERE sent = true) AS sent, COUNT(*) FILTER (WHERE sent = false) AS pending, COUNT(*) FILTER (WHERE read = true) AS read, COUNT(*) FILTER (WHERE error_message IS NOT NULL AND error_message <> '') AS errored FROM notifications_notification `).Scan(&r) fmt.Println("\n# Overall notification counts") fmt.Printf(" total: %d\n", r.Total) fmt.Printf(" sent: %d\n", r.Sent) fmt.Printf(" pending: %d\n", r.Pending) fmt.Printf(" read: %d\n", r.Read) fmt.Printf(" errored: %d (rows with non-empty error_message)\n", r.Errored) } // pendingByType breaks the pending rows down by type and age. func pendingByType(db *gorm.DB) { type row struct { NotificationType string PendingCount int64 Oldest *time.Time Newest *time.Time WithErrors int64 Last24h int64 Last7d int64 } var rows []row db.Raw(` SELECT notification_type, COUNT(*) AS pending_count, MIN(created_at) AS oldest, MAX(created_at) AS newest, COUNT(*) FILTER (WHERE error_message IS NOT NULL AND error_message <> '') AS with_errors, COUNT(*) FILTER (WHERE created_at > NOW() - INTERVAL '24 hours') AS last_24h, COUNT(*) FILTER (WHERE created_at > NOW() - INTERVAL '7 days') AS last_7d FROM notifications_notification WHERE sent = false GROUP BY notification_type ORDER BY MAX(created_at) DESC NULLS LAST `).Scan(&rows) fmt.Println("\n# Pending rows by type") if len(rows) == 0 { fmt.Println(" (no pending notifications)") return } fmt.Printf(" %-22s %7s %7s %7s %7s %-19s %-19s\n", "TYPE", "PENDING", "ERRORED", "LAST24H", "LAST7D", "OLDEST", "NEWEST") for _, r := range rows { fmt.Printf(" %-22s %7d %7d %7d %7d %-19s %-19s\n", r.NotificationType, r.PendingCount, r.WithErrors, r.Last24h, r.Last7d, fmtTime(r.Oldest), fmtTime(r.Newest)) } } // recentPending shows the 5 most recent pending rows with full detail. func recentPending(db *gorm.DB) { type row struct { ID uint UserID uint NotificationType string Title string Body string ErrorMessage string CreatedAt time.Time } var rows []row db.Raw(` SELECT id, user_id, notification_type, title, body, COALESCE(error_message, '') AS error_message, created_at FROM notifications_notification WHERE sent = false ORDER BY created_at DESC LIMIT 5 `).Scan(&rows) fmt.Println("\n# 5 most recent pending notifications") if len(rows) == 0 { fmt.Println(" (none)") return } for _, r := range rows { errPart := "" if r.ErrorMessage != "" { errPart = fmt.Sprintf("\n error: %s", r.ErrorMessage) } fmt.Printf(" [%d] user=%d %s %s%s\n title: %s\n body: %s\n", r.ID, r.UserID, r.CreatedAt.Format("2006-01-02 15:04:05"), r.NotificationType, errPart, truncate(r.Title, 100), truncate(r.Body, 100)) } } // deviceCounts shows how many push devices are registered (active vs inactive). func deviceCounts(db *gorm.DB) { type row struct { Total int64 Active int64 WithUser int64 DistinctUsers int64 } fmt.Println("\n# Registered push devices") for _, t := range []struct { label string table string }{ {"APNs (iOS)", "push_notifications_apnsdevice"}, {"GCM (Android)", "push_notifications_gcmdevice"}, } { var r row err := db.Raw(fmt.Sprintf(` SELECT COUNT(*) AS total, COUNT(*) FILTER (WHERE active = true) AS active, COUNT(*) FILTER (WHERE user_id IS NOT NULL) AS with_user, COUNT(DISTINCT user_id) AS distinct_users FROM %s `, t.table)).Scan(&r).Error if err != nil { fmt.Printf(" %-15s ERROR: %v\n", t.label, err) continue } fmt.Printf(" %-15s total=%-5d active=%-5d with_user=%-5d distinct_users=%d\n", t.label, r.Total, r.Active, r.WithUser, r.DistinctUsers) } } func buildDSN(passwordFile string) (dsn, host string, err error) { host = os.Getenv("DB_HOST") user := os.Getenv("POSTGRES_USER") dbname := os.Getenv("POSTGRES_DB") sslmode := os.Getenv("DB_SSLMODE") if sslmode == "" { sslmode = "require" } port := 5432 if s := os.Getenv("DB_PORT"); s != "" { p, perr := strconv.Atoi(s) if perr != nil { return "", "", fmt.Errorf("invalid DB_PORT %q: %w", s, perr) } port = p } password := os.Getenv("POSTGRES_PASSWORD") if password == "" && passwordFile != "" { b, rerr := os.ReadFile(passwordFile) if rerr != nil { return "", "", fmt.Errorf("POSTGRES_PASSWORD not set and could not read %s: %w", passwordFile, rerr) } password = strings.TrimRight(string(b), "\r\n") } missing := []string{} if host == "" { missing = append(missing, "DB_HOST") } if user == "" { missing = append(missing, "POSTGRES_USER") } if dbname == "" { missing = append(missing, "POSTGRES_DB") } if password == "" { missing = append(missing, "POSTGRES_PASSWORD") } if len(missing) > 0 { return "", "", fmt.Errorf("missing required env vars: %s", strings.Join(missing, ", ")) } dsn = fmt.Sprintf("host=%s port=%d user=%s password=%s dbname=%s sslmode=%s", host, port, user, password, dbname, sslmode) return dsn, host, nil } // stringFlag is a tiny stand-in for flag.String to keep imports lean — using it // also dodges flag-package quirks when this file is rebuilt with go run. func stringFlag(name, def, _usage string) *string { v := def prefix := "--" + name + "=" for _, a := range os.Args[1:] { if strings.HasPrefix(a, prefix) { v = strings.TrimPrefix(a, prefix) } } return &v } // boolFlag is true if --name is present in os.Args (no value form). func boolFlag(name, _usage string) *bool { want := "--" + name v := false for _, a := range os.Args[1:] { if a == want { v = true } } return &v } func fmtTime(t *time.Time) string { if t == nil { return "-" } return t.Format("2006-01-02 15:04:05") } func truncate(s string, n int) string { if len(s) <= n { return s } return s[:n] + "…" }