// honeyDueAPI/cmd/notif-diag/main.go

// notif-diag is a CLI for inspecting and (optionally) cleaning up stuck
// notification rows. Default mode is read-only — runs SELECTs and prints a
// summary. With --mark-failed-as-sent, marks pending rows that already have a
// recorded error as sent (cosmetic — no retry, no resend).
//
// Usage:
//
//	set -a && source deploy/prod.env && set +a
//	go run ./cmd/notif-diag                              # diagnose
//	go run ./cmd/notif-diag --mark-failed-as-sent --yes  # clean up errored backlog
package main

import (
	"bufio"
	"fmt"
	"os"
	"strconv"
	"strings"
	"time"

	"github.com/rs/zerolog"
	"github.com/rs/zerolog/log"
	"gorm.io/driver/postgres"
	"gorm.io/gorm"
	"gorm.io/gorm/logger"
)

func main() {
	passwordFile := stringFlag("password-file", "deploy/secrets/postgres_password.txt",
		"Path to file containing POSTGRES_PASSWORD (used if env var is empty)")
	markFailed := boolFlag("mark-failed-as-sent",
		"Mark every pending row with a non-empty error_message as sent. Cosmetic only — does not retry the push.")
	yes := boolFlag("yes", "Skip the interactive confirmation prompt for destructive actions.")

	log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr, TimeFormat: time.RFC3339})

	dsn, host, err := buildDSN(*passwordFile)
	if err != nil {
		log.Fatal().Err(err).Msg("failed to build database DSN")
	}

	db, err := gorm.Open(postgres.Open(dsn), &gorm.Config{
		Logger: logger.Default.LogMode(logger.Silent),
	})
	if err != nil {
		log.Fatal().Err(err).Msg("failed to connect to database")
	}

	fmt.Printf("DB host: %s\n", host)
	fmt.Println(strings.Repeat("=", 80))

	overallTotals(db)
	pendingByType(db)
	recentPending(db)
	deviceCounts(db)

	if *markFailed {
		markFailedAsSent(db, *yes)
	}
}

// markFailedAsSent flips every unsent row that carries a non-empty
// error_message to sent=true, setting sent_at to updated_at (or NOW() when
// updated_at is NULL). Nothing is re-sent: the only effect is that these rows
// stop counting as pending in dashboards and in this tool's own reports.
//
// The candidate count is printed first. Unless skipPrompt is true the user
// must type "yes" on stdin before the UPDATE runs; any other answer aborts.
// Database or stdin failures terminate the process via log.Fatal.
func markFailedAsSent(db *gorm.DB, skipPrompt bool) {
	var pending int64
	countErr := db.Raw(`
		SELECT COUNT(*) FROM notifications_notification
		WHERE sent = false AND error_message IS NOT NULL AND error_message <> ''
	`).Scan(&pending).Error
	if countErr != nil {
		log.Fatal().Err(countErr).Msg("failed to count cleanup candidates")
	}

	fmt.Printf("\n# Cleanup candidate count: %d\n", pending)
	if pending == 0 {
		fmt.Println("  (nothing to clean up)")
		return
	}
	for _, line := range []string{
		"  These rows have a recorded send error and will never be retried.",
		"  Marking them sent=true is cosmetic — it just prevents them from",
		"  showing up as pending in admin dashboards going forward.",
	} {
		fmt.Println(line)
	}

	if !skipPrompt {
		fmt.Printf("\nProceed? Type 'yes' to update %d rows: ", pending)
		answer, readErr := bufio.NewReader(os.Stdin).ReadString('\n')
		if readErr != nil {
			log.Fatal().Err(readErr).Msg("failed to read confirmation")
		}
		// Only the exact word "yes" (ignoring surrounding whitespace) confirms.
		if confirmed := strings.TrimSpace(answer) == "yes"; !confirmed {
			fmt.Println("Aborted.")
			return
		}
	}

	update := db.Exec(`
		UPDATE notifications_notification
		SET sent = true, sent_at = COALESCE(updated_at, NOW())
		WHERE sent = false AND error_message IS NOT NULL AND error_message <> ''
	`)
	if err := update.Error; err != nil {
		log.Fatal().Err(err).Msg("failed to update rows")
	}
	fmt.Printf("OK — updated %d rows.\n", update.RowsAffected)
}

// overallTotals prints the high-level split of notifications_notification:
// total rows, sent, pending (sent = false), read, and rows with a non-empty
// error_message.
//
// If the query fails, the error is printed under the section heading and no
// counts are shown, so a failed query cannot be mistaken for an empty table.
func overallTotals(db *gorm.DB) {
	type row struct {
		Total   int64
		Sent    int64
		Pending int64
		Read    int64
		Errored int64
	}
	var r row
	err := db.Raw(`
		SELECT
			COUNT(*)                                                  AS total,
			COUNT(*) FILTER (WHERE sent = true)                       AS sent,
			COUNT(*) FILTER (WHERE sent = false)                      AS pending,
			COUNT(*) FILTER (WHERE read = true)                       AS read,
			COUNT(*) FILTER (WHERE error_message IS NOT NULL AND error_message <> '') AS errored
		FROM notifications_notification
	`).Scan(&r).Error

	fmt.Println("\n# Overall notification counts")
	if err != nil {
		// Report and carry on: the remaining diagnostic sections are independent.
		fmt.Printf("  ERROR: %v\n", err)
		return
	}
	fmt.Printf("  total:   %d\n", r.Total)
	fmt.Printf("  sent:    %d\n", r.Sent)
	fmt.Printf("  pending: %d\n", r.Pending)
	fmt.Printf("  read:    %d\n", r.Read)
	fmt.Printf("  errored: %d  (rows with non-empty error_message)\n", r.Errored)
}

// pendingByType prints one line per notification_type for rows that are still
// unsent: the pending count, how many of those carry a non-empty
// error_message, how many were created in the last 24 hours and last 7 days,
// and the oldest and newest created_at. Types are ordered by their newest
// pending row, most recent first.
//
// If the query fails, the error is printed under the section heading rather
// than the misleading "(no pending notifications)".
func pendingByType(db *gorm.DB) {
	type row struct {
		NotificationType string
		PendingCount     int64
		Oldest           *time.Time
		Newest           *time.Time
		WithErrors       int64
		Last24h          int64
		Last7d           int64
	}
	var rows []row
	err := db.Raw(`
		SELECT
			notification_type,
			COUNT(*)                                       AS pending_count,
			MIN(created_at)                                AS oldest,
			MAX(created_at)                                AS newest,
			COUNT(*) FILTER (WHERE error_message IS NOT NULL AND error_message <> '') AS with_errors,
			COUNT(*) FILTER (WHERE created_at > NOW() - INTERVAL '24 hours')          AS last_24h,
			COUNT(*) FILTER (WHERE created_at > NOW() - INTERVAL '7 days')            AS last_7d
		FROM notifications_notification
		WHERE sent = false
		GROUP BY notification_type
		ORDER BY MAX(created_at) DESC NULLS LAST
	`).Scan(&rows).Error

	fmt.Println("\n# Pending rows by type")
	if err != nil {
		// Report and carry on: the remaining diagnostic sections are independent.
		fmt.Printf("  ERROR: %v\n", err)
		return
	}
	if len(rows) == 0 {
		fmt.Println("  (no pending notifications)")
		return
	}
	fmt.Printf("  %-22s  %7s  %7s  %7s  %7s  %-19s  %-19s\n",
		"TYPE", "PENDING", "ERRORED", "LAST24H", "LAST7D", "OLDEST", "NEWEST")
	for _, r := range rows {
		fmt.Printf("  %-22s  %7d  %7d  %7d  %7d  %-19s  %-19s\n",
			r.NotificationType, r.PendingCount, r.WithErrors, r.Last24h, r.Last7d,
			fmtTime(r.Oldest), fmtTime(r.Newest))
	}
}

// recentPending prints the 5 most recently created unsent rows: id, user,
// creation time, type, the recorded error (if any), and the title and body
// each passed through truncate with a limit of 100.
//
// If the query fails, the error is printed under the section heading rather
// than the misleading "(none)".
func recentPending(db *gorm.DB) {
	type row struct {
		ID               uint
		UserID           uint
		NotificationType string
		Title            string
		Body             string
		ErrorMessage     string
		CreatedAt        time.Time
	}
	var rows []row
	// error_message is COALESCEd so a NULL column scans cleanly into a string.
	err := db.Raw(`
		SELECT id, user_id, notification_type, title, body, COALESCE(error_message, '') AS error_message, created_at
		FROM notifications_notification
		WHERE sent = false
		ORDER BY created_at DESC
		LIMIT 5
	`).Scan(&rows).Error

	fmt.Println("\n# 5 most recent pending notifications")
	if err != nil {
		// Report and carry on: the remaining diagnostic sections are independent.
		fmt.Printf("  ERROR: %v\n", err)
		return
	}
	if len(rows) == 0 {
		fmt.Println("  (none)")
		return
	}
	for _, r := range rows {
		// The error line is only emitted for rows that actually recorded one.
		errPart := ""
		if r.ErrorMessage != "" {
			errPart = fmt.Sprintf("\n      error: %s", r.ErrorMessage)
		}
		fmt.Printf("  [%d] user=%d  %s  %s%s\n      title: %s\n      body:  %s\n",
			r.ID, r.UserID, r.CreatedAt.Format("2006-01-02 15:04:05"), r.NotificationType, errPart,
			truncate(r.Title, 100), truncate(r.Body, 100))
	}
}

// deviceCounts prints, for each push-device table (APNs and GCM), the total
// number of rows, how many are active, how many have a user_id, and the
// number of distinct user_ids. A failing query for one table is reported on
// that table's line and does not stop the other table from being reported.
func deviceCounts(db *gorm.DB) {
	type counts struct {
		Total         int64
		Active        int64
		WithUser      int64
		DistinctUsers int64
	}

	// Table names are fixed constants, so formatting them into the SQL is safe.
	sources := []struct {
		label string
		table string
	}{
		{label: "APNs (iOS)", table: "push_notifications_apnsdevice"},
		{label: "GCM (Android)", table: "push_notifications_gcmdevice"},
	}

	fmt.Println("\n# Registered push devices")
	for _, src := range sources {
		query := fmt.Sprintf(`
			SELECT
				COUNT(*)                                              AS total,
				COUNT(*) FILTER (WHERE active = true)                 AS active,
				COUNT(*) FILTER (WHERE user_id IS NOT NULL)           AS with_user,
				COUNT(DISTINCT user_id)                               AS distinct_users
			FROM %s
		`, src.table)

		var c counts
		if err := db.Raw(query).Scan(&c).Error; err != nil {
			fmt.Printf("  %-15s  ERROR: %v\n", src.label, err)
			continue
		}
		fmt.Printf("  %-15s  total=%-5d  active=%-5d  with_user=%-5d  distinct_users=%d\n",
			src.label, c.Total, c.Active, c.WithUser, c.DistinctUsers)
	}
}

// buildDSN assembles a keyword/value Postgres connection string from the
// environment and returns it together with the database host (which the
// caller prints for orientation).
//
// Environment variables read:
//   - DB_HOST, POSTGRES_USER, POSTGRES_DB: required.
//   - POSTGRES_PASSWORD: required; when it is empty and passwordFile is
//     non-empty, the password is read from that file with trailing CR/LF
//     characters stripped.
//   - DB_PORT: optional, defaults to 5432; must be an integer in 1-65535.
//   - DB_SSLMODE: optional, defaults to "require".
//
// Values that are empty or contain whitespace, single quotes or backslashes
// are single-quoted and escaped so that, for example, a password with a space
// in it does not corrupt the connection string. Plain values are emitted
// unchanged.
//
// An error is returned when DB_PORT is invalid, when the password file is
// needed but cannot be read, or when any required setting is missing; the
// other results are empty in that case.
func buildDSN(passwordFile string) (dsn, host string, err error) {
	host = os.Getenv("DB_HOST")
	user := os.Getenv("POSTGRES_USER")
	dbname := os.Getenv("POSTGRES_DB")
	sslmode := os.Getenv("DB_SSLMODE")
	if sslmode == "" {
		sslmode = "require"
	}

	port := 5432
	if s := os.Getenv("DB_PORT"); s != "" {
		p, perr := strconv.Atoi(s)
		if perr != nil {
			return "", "", fmt.Errorf("invalid DB_PORT %q: %w", s, perr)
		}
		if p < 1 || p > 65535 {
			return "", "", fmt.Errorf("invalid DB_PORT %q: must be between 1 and 65535", s)
		}
		port = p
	}

	// The environment variable wins; the file is only a fallback.
	password := os.Getenv("POSTGRES_PASSWORD")
	if password == "" && passwordFile != "" {
		b, rerr := os.ReadFile(passwordFile)
		if rerr != nil {
			return "", "", fmt.Errorf("POSTGRES_PASSWORD not set and could not read %s: %w", passwordFile, rerr)
		}
		password = strings.TrimRight(string(b), "\r\n")
	}

	// Collect every missing setting so the user can fix them all in one go.
	var missing []string
	if host == "" {
		missing = append(missing, "DB_HOST")
	}
	if user == "" {
		missing = append(missing, "POSTGRES_USER")
	}
	if dbname == "" {
		missing = append(missing, "POSTGRES_DB")
	}
	if password == "" {
		missing = append(missing, "POSTGRES_PASSWORD")
	}
	if len(missing) > 0 {
		return "", "", fmt.Errorf("missing required env vars: %s", strings.Join(missing, ", "))
	}

	dsn = fmt.Sprintf("host=%s port=%d user=%s password=%s dbname=%s sslmode=%s",
		quoteDSNValue(host), port, quoteDSNValue(user), quoteDSNValue(password),
		quoteDSNValue(dbname), quoteDSNValue(sslmode))
	return dsn, host, nil
}

// quoteDSNValue renders v for use as a value in a keyword/value connection
// string. Values without whitespace, single quotes or backslashes are returned
// as-is; anything else (including the empty string) is wrapped in single
// quotes with backslashes and single quotes backslash-escaped.
func quoteDSNValue(v string) string {
	if v != "" && !strings.ContainsAny(v, " \t\n\r\v\f'\\") {
		return v
	}
	escaped := strings.NewReplacer(`\`, `\\`, `'`, `\'`).Replace(v)
	return "'" + escaped + "'"
}

// stringFlag returns a pointer to the value of the string flag --name found in
// os.Args, or to def when the flag is absent. Both "--name=value" and
// "--name value" are accepted; when the flag is repeated the last occurrence
// wins. A trailing "--name" with nothing after it is ignored. As with the
// standard flag package, the argument following a bare "--name" is consumed
// as its value even if it looks like another flag.
//
// It is a tiny stand-in for flag.String that keeps imports lean. _usage is
// accepted so call sites stay self-documenting; it is never printed.
func stringFlag(name, def, _usage string) *string {
	v := def
	bare := "--" + name
	withEq := bare + "="
	args := os.Args[1:]
	for i := 0; i < len(args); i++ {
		a := args[i]
		switch {
		case strings.HasPrefix(a, withEq):
			v = strings.TrimPrefix(a, withEq)
		case a == bare && i+1 < len(args):
			// Space-separated form: the next argument is the value.
			i++
			v = args[i]
		}
	}
	return &v
}

// boolFlag reports, through the returned pointer, whether the exact argument
// "--name" appears anywhere in os.Args after the program name. Only the bare
// form is recognised; "--name=true" or "-name" do not count. _usage is
// accepted for call-site documentation and is otherwise unused.
func boolFlag(name, _usage string) *bool {
	token := "--" + name
	present := false
	for _, arg := range os.Args[1:] {
		if arg != token {
			continue
		}
		present = true
		break
	}
	return &present
}

// fmtTime renders t as "YYYY-MM-DD HH:MM:SS", or "-" when t is nil (e.g. an
// aggregate that came back NULL).
func fmtTime(t *time.Time) string {
	const layout = "2006-01-02 15:04:05"
	if t != nil {
		return t.Format(layout)
	}
	return "-"
}

// truncate shortens s to at most n characters, appending "…" when anything
// was cut off. n counts Unicode code points rather than bytes, so the cut
// never lands inside a multi-byte UTF-8 sequence. A negative n is treated
// as 0. Strings that already fit are returned unchanged.
func truncate(s string, n int) string {
	if n < 0 {
		n = 0
	}
	// Fast path: the byte length bounds the rune count from above.
	if len(s) <= n {
		return s
	}
	seen := 0
	// Ranging over a string yields the byte offset of each rune start, which
	// is exactly the safe place to cut.
	for i := range s {
		if seen == n {
			return s[:i] + "…"
		}
		seen++
	}
	return s
}