dev: add Kratos + Mailpit local-dev stack

docker-compose.dev.yml gains a Kratos identity service (public :4433 / admin :4434) and a Mailpit SMTP catcher for local onboarding email codes, plus a postgres-init mount. deploy/local/kratos/ holds the local Kratos config + identity schema (placeholder dev cookie secret only). Supports the local backend the XCUITest suite seeds against. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
backend: GDPR export + retention cleanups + worker metrics (BE-1/2/3)
2026-06-09 00:11:06 -05:00 · 2026-06-08 22:15:26 -05:00 · 2026-06-08 21:41:40 -05:00 · 2026-06-06 10:49:37 -05:00 · 2026-06-04 20:54:54 -05:00 · 2026-06-03 22:30:33 -05:00
251 changed files with 20347 additions and 12078 deletions
@@ -28,12 +28,22 @@ EMAIL_HOST_USER=your-email@gmail.com
 EMAIL_HOST_PASSWORD=your-app-password
 DEFAULT_FROM_EMAIL=honeyDue <noreply@honeyDue.treytartt.com>
 # Sign in with Apple
 # APPLE_CLIENT_ID must equal the iOS bundle ID of the build hitting this
 # backend. The Apple identity-token `aud` claim is checked against it
 # (see internal/services/apple_auth.go::verifyAudience). With DEBUG=false
 # an empty value rejects every Apple token.
 #   Release builds: com.myhoneydue.honeyDue
 #   Debug builds:   com.myhoneydue.honeyDue.dev
 APPLE_CLIENT_ID=com.myhoneydue.honeyDue.dev
 APPLE_TEAM_ID=X86BR9WTLD
 # APNs Settings (iOS Push Notifications)
 # Direct APNs integration - no external push server needed
 APNS_AUTH_KEY_PATH=/path/to/AuthKey_XXXXXX.p8
 APNS_AUTH_KEY_ID=XXXXXXXXXX
 APNS_TEAM_ID=XXXXXXXXXX
-APNS_TOPIC=com.tt.honeyDue
+APNS_TOPIC=com.myhoneydue.honeyDue.dev
 APNS_PRODUCTION=false  # Set to true for production APNs, false for sandbox
 # FCM Settings (Android Push Notifications)
@@ -8,6 +8,9 @@ bin/
 /api
 /worker
 /admin
 /admin-reset
 /notif-diag
 /send-test-push
 !admin/
 *.exe
 *.exe~
@@ -42,3 +45,4 @@ push_certs/
 # Vendor (if not using go modules)
 # vendor/
 /migrate
@@ -1,5 +1,5 @@
 # Admin panel build stage
-FROM node:20-alpine AS admin-builder
+FROM node:20-alpine@sha256:fb4cd12c85ee03686f6af5362a0b0d56d50c58a04632e6c0fb8363f609372293 AS admin-builder
 WORKDIR /app
@@ -49,6 +49,19 @@ RUN CGO_ENABLED=0 GOOS=linux GOARCH=${TARGETARCH} go build -ldflags="-w -s" -o /
 # Build the worker binary
 RUN CGO_ENABLED=0 GOOS=linux GOARCH=${TARGETARCH} go build -ldflags="-w -s" -o /app/worker ./cmd/worker
 # Install goose CLI for production migrations. Pinned to a specific version
 # so an upstream behavioural change can't break a deploy unannounced.
 # Bumping is a deliberate, reviewable diff. We `go build` rather than
 # `go install` so the output path is predictable across host platforms —
 # `go install` with cross-compile env vars drops the binary in
 # /go/bin/<goos>_<goarch>/, which is awkward to COPY from.
 RUN cd /tmp && \
    git clone --depth=1 --branch=v3.22.1 https://github.com/pressly/goose.git goose-src && \
    cd goose-src && \
    CGO_ENABLED=0 GOOS=linux GOARCH=${TARGETARCH} \
      go build -ldflags="-w -s" -o /app/goose ./cmd/goose && \
    cd / && rm -rf /tmp/goose-src
 # Base runtime stage for Go services
 FROM alpine:3.19 AS go-base
@@ -64,6 +77,9 @@ WORKDIR /app
 # Copy all binaries from builder
 COPY --from=builder /app/api /app/api
 COPY --from=builder /app/worker /app/worker
 # goose is the migration runner — same image is reused as the migrate Job
 # entrypoint via `command: ["/usr/local/bin/goose", ...]`.
 COPY --from=builder /app/goose /usr/local/bin/goose
 # Copy templates directory
 COPY --from=builder /app/templates /app/templates
@@ -93,7 +109,7 @@ FROM go-base AS worker
 CMD ["/app/worker"]
 # Admin panel runtime stage
-FROM node:20-alpine AS admin
+FROM node:20-alpine@sha256:fb4cd12c85ee03686f6af5362a0b0d56d50c58a04632e6c0fb8363f609372293 AS admin
 WORKDIR /app
@@ -115,7 +131,7 @@ ENV HOSTNAME="0.0.0.0"
 CMD ["node", "server.js"]
 # Default production stage (for Dokku - runs API + Admin)
-FROM node:20-alpine AS production
+FROM node:20-alpine@sha256:fb4cd12c85ee03686f6af5362a0b0d56d50c58a04632e6c0fb8363f609372293 AS production
 # Install runtime dependencies
 RUN apk add --no-cache ca-certificates tzdata curl
@@ -89,15 +89,36 @@ docker-build-prod:
 	docker build --target worker -t $${REGISTRY:-ghcr.io/treytartt}/honeydue-worker:$${TAG:-latest} .
 	docker build --target admin -t $${REGISTRY:-ghcr.io/treytartt}/honeydue-admin:$${TAG:-latest} .
-# Database migrations
+# Database migrations (goose)
 #
 # DATABASE_URL must point at the *direct* (non-pooler) Neon endpoint —
 # goose's session-scoped advisory lock won't survive PgBouncer transaction
 # mode. Example:
 #   export DATABASE_URL='host=ep-floral-truth-amttbc5a.c-5.us-east-1.aws.neon.tech \
 #                        user=neondb_owner password=... dbname=honeyDue sslmode=require'
 #
 # Bootstrap (one-time, when adopting goose against an existing DB):
 #   make migrate-status                       # creates goose_db_version
 #   psql ... -c "INSERT INTO goose_db_version (version_id, is_applied, tstamp) VALUES (1, true, NOW());"
 #
 # Day-to-day:
 #   make migrate-status   # show what's pending
 #   make migrate-up       # apply pending migrations
 #   make migrate-down     # roll back the latest migration
 #   make migrate-new name=add_some_column   # scaffold a new SQL migration
 migrate-up:
-	migrate -path migrations -database "$(DATABASE_URL)" up
+	goose -dir migrations postgres "$(DATABASE_URL)" up
 migrate-down:
-	migrate -path migrations -database "$(DATABASE_URL)" down
+	goose -dir migrations postgres "$(DATABASE_URL)" down
-migrate-create:
+migrate-status:
-	migrate create -ext sql -dir migrations -seq $(name)
+	goose -dir migrations postgres "$(DATABASE_URL)" status
 migrate-new:
 	@if [ -z "$(name)" ]; then echo "usage: make migrate-new name=<short_name>"; exit 1; fi
 	goose -dir migrations create $(name) sql
 # Encrypt existing uploads at rest (run after setting STORAGE_ENCRYPTION_KEY)
 migrate-encrypt:
@@ -184,6 +184,15 @@ needed for local dev. For the complete production env var reference
 Leave all four `B2_*` empty in dev to fall back to a local `/app/uploads` volume.
 **Upload architecture (since `b7f8329`)**: Image and document uploads go
 **directly from the client to B2** via a presigned POST policy issued by
 `POST /api/uploads/presign`. Bytes never traverse the api server. B2
 enforces a 10 MB per-object cap at the protocol level. The worker reaps
 orphaned upload sessions hourly via the `maintenance:upload_cleanup`
 cron. See [`docs/deployment/09-storage.md`](./docs/deployment/09-storage.md)
 for the full flow, and [`docs/deployment/14-deployment-process.md`](./docs/deployment/14-deployment-process.md#one-time-b2-bucket-lifecycle-manual)
 for the one-time bucket lifecycle setup.
 ### Worker schedules (UTC hours)
 | Variable | Description | Default |
@@ -349,7 +358,11 @@ All protected endpoints require an `Authorization: Token <token>` header.
 Production runs on a **3-node K3s HA cluster** on Hetzner Cloud, fronted
 by Cloudflare, with Neon Postgres, Backblaze B2, and a self-hosted Gitea
-container registry. See the full deployment book for every detail:
+container registry. Live observability (VictoriaMetrics + Jaeger +
 Grafana) runs on a separate Linode VPS at
 [`grafana.88oakapps.com`](https://grafana.88oakapps.com) and is fed by a
 `vmagent` sidecar in-cluster. See the full deployment book for every
 detail:
 **→ [docs/deployment/](./docs/deployment/README.md) — The Deployment Book**
@@ -371,7 +384,10 @@ Quick links:
 - **Runbook** — [docs/deployment/17-runbook.md](./docs/deployment/17-runbook.md) — 22 common ops procedures
 - **kubectl cheat sheet** — [docs/deployment/appendices/b-commands.md](./docs/deployment/appendices/b-commands.md)
- **Deploy process** — [docs/deployment/14-deployment-process.md](./docs/deployment/14-deployment-process.md) — build → push → rollout
+- **Deploy process** — [docs/deployment/14-deployment-process.md](./docs/deployment/14-deployment-process.md) — `bash deploy-k3s/scripts/03-deploy.sh` builds → pushes → rolls out
 - **Observability** — [docs/deployment/15-observability.md](./docs/deployment/15-observability.md) — VictoriaMetrics + Jaeger + Grafana on `obs.88oakapps.com`
 - **Observability plan** — [docs/observability-plan.md](./docs/observability-plan.md) — design doc and rollout phases
 - **Database / pool tuning** — [docs/deployment/08-database.md](./docs/deployment/08-database.md) — Neon pooler endpoint, GORM pool, warm-up, RTT budget
 - **Failure modes** — [docs/deployment/16-failure-modes.md](./docs/deployment/16-failure-modes.md) — what happens when X dies
 - **Swarm postmortem** — [docs/deployment/19-postmortem-swarm.md](./docs/deployment/19-postmortem-swarm.md) — why we migrated
@@ -0,0 +1,257 @@
 // admin-reset is a one-off CLI for resetting an admin_users row's password.
 //
 // It reads DB connection settings from environment variables (the same names
 // the API uses), looks up the admin user by email, prompts for a new password
 // twice (no echo), bcrypts it, and updates the row. Safe to keep in the repo
 // — running it requires DB credentials.
 //
 // Usage:
 //
 //	# load env (host, user, db, sslmode) and password from secrets file
 //	set -a && source deploy/prod.env && set +a
 //	go run ./cmd/admin-reset
 //
 //	# or with a non-default secrets path / different admin
 //	go run ./cmd/admin-reset --password-file path/to/postgres_password.txt
 //	go run ./cmd/admin-reset --email someone@example.com
 package main
 import (
 	"bufio"
 	"errors"
 	"flag"
 	"fmt"
 	"os"
 	"strconv"
 	"strings"
 	"time"
 	"github.com/rs/zerolog"
 	"github.com/rs/zerolog/log"
 	"golang.org/x/crypto/bcrypt"
 	"golang.org/x/term"
 	"gorm.io/driver/postgres"
 	"gorm.io/gorm"
 	"gorm.io/gorm/logger"
 	"github.com/treytartt/honeydue-api/internal/models"
 )
 const minPasswordLen = 12
 // main parses the command-line flags, connects to Postgres, and then runs
 // exactly one mode against the admin_users data: --list, --new-email,
 // --verify, or (when none of those is given) an interactive password reset.
 // The modes are checked in that order and the first one that applies returns.
 // All human-readable output goes to stderr; unrecoverable errors are reported
 // through log.Fatal(), which terminates the process.
 func main() {
 	email := flag.String("email", "admin@myhoneydue.com", "Admin email to reset")
 	passwordFile := flag.String("password-file", "deploy/secrets/postgres_password.txt",
 		"Path to file containing POSTGRES_PASSWORD (used if env var is empty)")
 	list := flag.Bool("list", false, "List all rows in admin_users and exit (no changes)")
 	verify := flag.Bool("verify", false, "Prompt for a password and check it against the stored hash; no changes")
 	newEmail := flag.String("new-email", "", "If set: rename the matched admin's email to this value and exit (no password change)")
 	flag.Parse()
 	// Human-friendly console logging on stderr for this interactive tool.
 	log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr, TimeFormat: time.RFC3339})
 	// host is returned separately so it can be echoed before any change is
 	// made, letting the operator confirm which database is targeted.
 	dsn, host, err := buildDSN(*passwordFile)
 	if err != nil {
 		log.Fatal().Err(err).Msg("failed to build database DSN")
 	}
 	// GORM's own query logging is silenced so only this tool's output is shown.
 	db, err := gorm.Open(postgres.Open(dsn), &gorm.Config{
 		Logger: logger.Default.LogMode(logger.Silent),
 	})
 	if err != nil {
 		log.Fatal().Err(err).Msg("failed to connect to database")
 	}
 	// --list: read-only dump of every admin row, ordered by id.
 	if *list {
 		var admins []models.AdminUser
 		if err := db.Order("id").Find(&admins).Error; err != nil {
 			log.Fatal().Err(err).Msg("failed to list admin users")
 		}
 		fmt.Fprintf(os.Stderr, "DB host: %s\n%d admin user(s):\n\n", host, len(admins))
 		fmt.Fprintf(os.Stderr, "%-4s  %-40s  %-12s  %-6s  %s\n", "ID", "EMAIL", "ROLE", "ACTIVE", "LAST_LOGIN")
 		for _, a := range admins {
 			// LastLogin is a nullable timestamp; "-" stands in for "never".
 			last := "-"
 			if a.LastLogin != nil {
 				last = a.LastLogin.Format(time.RFC3339)
 			}
 			fmt.Fprintf(os.Stderr, "%-4d  %-40s  %-12s  %-6t  %s\n", a.ID, a.Email, a.Role, a.IsActive, last)
 		}
 		return
 	}
 	// Mirror the live API's case-insensitive lookup so --verify reflects what
 	// /api/admin/auth/login actually does. The reset path uses the same query
 	// for consistency.
 	var admin models.AdminUser
 	if err := db.Where("LOWER(email) = LOWER(?)", *email).First(&admin).Error; err != nil {
 		if errors.Is(err, gorm.ErrRecordNotFound) {
 			log.Fatal().Str("email", *email).Msg("admin user not found (try --list to see existing rows)")
 		}
 		log.Fatal().Err(err).Msg("failed to look up admin user")
 	}
 	// --new-email: rename the matched admin and exit without touching the password.
 	if *newEmail != "" {
 		target := strings.TrimSpace(*newEmail)
 		// Deliberately minimal sanity check: non-blank and contains an "@".
 		if target == "" || !strings.Contains(target, "@") {
 			log.Fatal().Str("new_email", *newEmail).Msg("--new-email must be a valid email address")
 		}
 		// A case-only difference counts as "already matches" and is a no-op.
 		if strings.EqualFold(target, admin.Email) {
 			fmt.Fprintf(os.Stderr, "No change — current email already matches %q\n", target)
 			return
 		}
 		// Catch the unique-index conflict early with a clear message instead of a Postgres error.
 		var collisionCount int64
 		if err := db.Model(&models.AdminUser{}).
 			Where("LOWER(email) = LOWER(?) AND id <> ?", target, admin.ID).
 			Count(&collisionCount).Error; err != nil {
 			log.Fatal().Err(err).Msg("failed to check for email collision")
 		}
 		if collisionCount > 0 {
 			log.Fatal().Str("new_email", target).Msg("another admin row already uses this email — aborting")
 		}
 		fmt.Fprintf(os.Stderr, "Renaming admin email: %s → %s (id=%d)\n", admin.Email, target, admin.ID)
 		fmt.Fprintf(os.Stderr, "DB host: %s\n\n", host)
 		// Update by primary key and set updated_at explicitly alongside the email.
 		res := db.Model(&models.AdminUser{}).
 			Where("id = ?", admin.ID).
 			Updates(map[string]any{
 				"email":      target,
 				"updated_at": time.Now().UTC(),
 			})
 		if res.Error != nil {
 			log.Fatal().Err(res.Error).Msg("failed to rename admin email")
 		}
 		// Anything other than one affected row means the row vanished or the
 		// filter matched unexpectedly; treat it as a failure.
 		if res.RowsAffected != 1 {
 			log.Fatal().Int64("rows", res.RowsAffected).Msg("expected exactly 1 row updated")
 		}
 		fmt.Fprintf(os.Stderr, "OK — email is now %s\n", target)
 		return
 	}
 	// --verify: check a typed password against the stored hash; writes nothing.
 	if *verify {
 		fmt.Fprintf(os.Stderr, "Verifying password for: %s (id=%d, role=%s, active=%t)\n",
 			admin.Email, admin.ID, admin.Role, admin.IsActive)
 		fmt.Fprintf(os.Stderr, "DB host: %s\n\n", host)
 		pw, err := readPassword("Password: ")
 		if err != nil {
 			log.Fatal().Err(err).Msg("failed to read password")
 		}
 		if admin.CheckPassword(pw) {
 			fmt.Fprintln(os.Stderr, "PASS — bcrypt hash matches the supplied password")
 			// A correct password is not sufficient to log in when the account is disabled.
 			if !admin.IsActive {
 				fmt.Fprintln(os.Stderr, "WARNING: is_active = false — login will still be rejected with \"Account is disabled\"")
 			}
 		} else {
 			fmt.Fprintln(os.Stderr, "FAIL — bcrypt hash does NOT match the supplied password")
 			// Non-zero exit so scripts can detect a mismatch.
 			os.Exit(1)
 		}
 		return
 	}
 	// Default mode: interactive password reset for the matched admin.
 	fmt.Fprintf(os.Stderr, "Resetting password for: %s (id=%d, role=%s, active=%t)\n",
 		admin.Email, admin.ID, admin.Role, admin.IsActive)
 	fmt.Fprintf(os.Stderr, "DB host: %s\n\n", host)
 	pw1, err := readPassword("New password: ")
 	if err != nil {
 		log.Fatal().Err(err).Msg("failed to read password")
 	}
 	// Length is measured in bytes and checked before asking for confirmation.
 	if len(pw1) < minPasswordLen {
 		log.Fatal().Int("min", minPasswordLen).Msg("password too short")
 	}
 	pw2, err := readPassword("Confirm password: ")
 	if err != nil {
 		log.Fatal().Err(err).Msg("failed to read password")
 	}
 	if pw1 != pw2 {
 		log.Fatal().Msg("passwords do not match")
 	}
 	hash, err := bcrypt.GenerateFromPassword([]byte(pw1), bcrypt.DefaultCost)
 	if err != nil {
 		log.Fatal().Err(err).Msg("failed to hash password")
 	}
 	// Only the bcrypt hash is stored; the update is keyed on the primary key.
 	res := db.Model(&models.AdminUser{}).
 		Where("id = ?", admin.ID).
 		Updates(map[string]any{
 			"password":   string(hash),
 			"updated_at": time.Now().UTC(),
 		})
 	if res.Error != nil {
 		log.Fatal().Err(res.Error).Msg("failed to update admin user")
 	}
 	if res.RowsAffected != 1 {
 		log.Fatal().Int64("rows", res.RowsAffected).Msg("expected exactly 1 row updated")
 	}
 	fmt.Fprintf(os.Stderr, "\nOK — password reset for %s\n", admin.Email)
 }
 // buildDSN assembles a keyword/value Postgres connection string from the
 // environment and returns it together with the bare host name (for display).
 //
 // It reads DB_HOST, POSTGRES_USER, POSTGRES_DB, DB_SSLMODE (default
 // "require"), DB_PORT (default 5432) and POSTGRES_PASSWORD. When
 // POSTGRES_PASSWORD is empty and passwordFile is non-empty, the password is
 // read from that file with trailing CR/LF stripped.
 //
 // An error is returned when DB_PORT is not an integer, when the password file
 // cannot be read, or when any required setting is still empty; in that case
 // both returned strings are empty.
 func buildDSN(passwordFile string) (dsn, host string, err error) {
 	host = os.Getenv("DB_HOST")
 	user := os.Getenv("POSTGRES_USER")
 	dbname := os.Getenv("POSTGRES_DB")
 	sslmode := os.Getenv("DB_SSLMODE")
 	if sslmode == "" {
 		sslmode = "require"
 	}
 	port := 5432
 	if s := os.Getenv("DB_PORT"); s != "" {
 		p, perr := strconv.Atoi(s)
 		if perr != nil {
 			return "", "", fmt.Errorf("invalid DB_PORT %q: %w", s, perr)
 		}
 		port = p
 	}
 	password := os.Getenv("POSTGRES_PASSWORD")
 	if password == "" && passwordFile != "" {
 		b, rerr := os.ReadFile(passwordFile)
 		if rerr != nil {
 			return "", "", fmt.Errorf("POSTGRES_PASSWORD not set and could not read %s: %w", passwordFile, rerr)
 		}
 		password = strings.TrimRight(string(b), "\r\n")
 	}
 	// Collect every missing setting so the operator sees them all at once.
 	var missing []string
 	if host == "" {
 		missing = append(missing, "DB_HOST")
 	}
 	if user == "" {
 		missing = append(missing, "POSTGRES_USER")
 	}
 	if dbname == "" {
 		missing = append(missing, "POSTGRES_DB")
 	}
 	if password == "" {
 		missing = append(missing, "POSTGRES_PASSWORD")
 	}
 	if len(missing) > 0 {
 		return "", "", fmt.Errorf("missing required env vars: %s", strings.Join(missing, ", "))
 	}
 	// Each value is quoted when necessary: an unquoted space, quote or
 	// backslash (most likely in the password) would otherwise end the value
 	// early or be misread by the DSN parser.
 	dsn = fmt.Sprintf("host=%s port=%d user=%s password=%s dbname=%s sslmode=%s",
 		quoteDSNValue(host), port, quoteDSNValue(user), quoteDSNValue(password),
 		quoteDSNValue(dbname), quoteDSNValue(sslmode))
 	return dsn, host, nil
 }
 // quoteDSNValue returns v in a form that is safe inside a keyword/value
 // connection string. Plain values are returned unchanged; empty values and
 // values containing whitespace, a single quote or a backslash are wrapped in
 // single quotes with embedded backslashes and quotes backslash-escaped.
 func quoteDSNValue(v string) string {
 	if v != "" && !strings.ContainsAny(v, " \t\r\n\v\f'\\") {
 		return v
 	}
 	escaped := strings.NewReplacer(`\`, `\\`, `'`, `\'`).Replace(v)
 	return "'" + escaped + "'"
 }
 // stdinLines is the single line reader used for every non-terminal
 // readPassword call. Sharing it matters: a buffered reader pulls more than one
 // line from a pipe at a time, so a reader created per call would swallow the
 // input meant for the next prompt (e.g. the confirmation password).
 var stdinLines = bufio.NewScanner(os.Stdin)
 // readPassword writes prompt to stderr and reads one line of input.
 //
 // When stdin is a terminal the input is read without echo. Otherwise the next
 // line is taken from stdin, which allows the password to be piped in; a final
 // line without a trailing newline is accepted. Trailing CR/LF characters are
 // stripped from the result in both cases.
 //
 // An error is returned when the read fails or when stdin is exhausted before a
 // line could be read.
 func readPassword(prompt string) (string, error) {
 	fmt.Fprint(os.Stderr, prompt)
 	fd := int(os.Stdin.Fd())
 	if term.IsTerminal(fd) {
 		b, err := term.ReadPassword(fd)
 		// The newline typed by the user is not echoed, so emit one ourselves.
 		fmt.Fprintln(os.Stderr)
 		if err != nil {
 			return "", err
 		}
 		return strings.TrimRight(string(b), "\r\n"), nil
 	}
 	if !stdinLines.Scan() {
 		if err := stdinLines.Err(); err != nil {
 			return "", err
 		}
 		// Scan reports end of input as "false with a nil error".
 		return "", errors.New("unexpected end of input while reading password")
 	}
 	return strings.TrimRight(stdinLines.Text(), "\r\n"), nil
 }
@@ -9,6 +9,7 @@ import (
 	"syscall"
 	"time"
 	"github.com/hibiken/asynq"
 	"github.com/rs/zerolog/log"
 	"gorm.io/gorm"
@@ -19,6 +20,8 @@ import (
 	"github.com/treytartt/honeydue-api/internal/push"
 	"github.com/treytartt/honeydue-api/internal/router"
 	"github.com/treytartt/honeydue-api/internal/services"
 	"github.com/treytartt/honeydue-api/internal/tracing"
 	"github.com/treytartt/honeydue-api/internal/worker"
 	"github.com/treytartt/honeydue-api/pkg/utils"
 )
@@ -50,6 +53,29 @@ func main() {
 		Str("redis_url", config.MaskURLCredentials(cfg.Redis.URL)).
 		Msg("Starting HoneyDue API server")
 	// Initialize OpenTelemetry tracing — exports to obs.88oakapps.com
 	// (Jaeger via OTLP/HTTP) when OBS_TRACES_URL is set; otherwise installs
 	// a no-op tracer so call sites can use otel.Tracer() unconditionally.
 	// config.SecretValue (not os.Getenv) so file-mounted secrets resolve
 	// after audit F8 removed these from the process environment.
 	tracingShutdown, err := tracing.Init(context.Background(), tracing.Config{
 		ServiceName: "honeydue-api",
 		Environment: deploymentEnvironment(cfg.Server.Debug),
 		EndpointURL: config.SecretValue("OBS_TRACES_URL"),
 		BearerToken: config.SecretValue("OBS_INGEST_TOKEN"),
 		SampleRatio: tracing.SampleRatioFromEnv(),
 	})
 	if err != nil {
 		log.Error().Err(err).Msg("tracing init failed — continuing without traces")
 	}
 	defer func() {
 		shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
 		defer cancel()
 		if err := tracingShutdown(shutdownCtx); err != nil {
 			log.Warn().Err(err).Msg("tracing shutdown error")
 		}
 	}()
 	// Connect to database (retry with backoff)
 	var db *gorm.DB
 	var dbErr error
@@ -65,11 +91,14 @@ func main() {
 		log.Error().Err(dbErr).Msg("Failed to connect to database - API will start but database operations will fail")
 	} else {
 		defer database.Close()
-		// Run database migrations only if connected.
+		// Migrations are managed out-of-band by goose (see
-		// MigrateWithLock serialises parallel replica starts via a Postgres
+		// cmd/migrate and deploy-k3s/manifests/migrate/job.yaml) so the api
-		// advisory lock so concurrent AutoMigrate calls don't race on DDL.
+		// no longer runs AutoMigrate at startup. Instead we verify the
-		if err := database.MigrateWithLock(); err != nil {
+		// schema is at the expected version and refuse to start if not —
-			log.Error().Err(err).Msg("Failed to run database migrations")
+		// this catches the "operator forgot to run migrate" footgun loudly,
 		// at boot, instead of with mysterious runtime errors.
 		if err := database.RequireSchemaApplied(); err != nil {
 			log.Fatal().Err(err).Msg("Schema precondition failed — run `kubectl -n honeydue create job --from=cronjob/honeydue-migrate` (or `make migrate-up` locally) and retry")
 		}
 	}
@@ -167,6 +196,28 @@ func main() {
 			Msg("Push notification client initialized")
 	}
 	// Initialize Asynq enqueuer (api-side). Used by services that move
 	// long-running work off the request path (currently: task-completion
 	// notification fan-out). Same Redis as cmd/worker — file-mounted password
 	// applied separately because cfg.Redis.URL does not embed it (audit HIGH-1).
 	var taskEnqueuer *worker.TaskClient
 	if redisOpt, parseErr := asynq.ParseRedisURI(cfg.Redis.URL); parseErr != nil {
 		log.Warn().Err(parseErr).Msg("Failed to parse Redis URL for Asynq enqueuer — completion notifications will run inline")
 	} else if clientOpt, ok := redisOpt.(asynq.RedisClientOpt); ok {
 		if cfg.Redis.Password != "" {
 			clientOpt.Password = cfg.Redis.Password
 		}
 		taskEnqueuer = worker.NewTaskClient(clientOpt)
 		defer func() {
 			if cerr := taskEnqueuer.Close(); cerr != nil {
 				log.Warn().Err(cerr).Msg("Failed to close Asynq enqueuer on shutdown")
 			}
 		}()
 		log.Info().Msg("Asynq enqueuer initialized")
 	} else {
 		log.Warn().Msg("Redis opt is not RedisClientOpt — Asynq enqueuer skipped; completion notifications will run inline")
 	}
 	// Setup router with dependencies (includes admin panel at /admin)
 	deps := &router.Dependencies{
 		DB:                db,
@@ -178,6 +229,12 @@ func main() {
 		StorageService:    storageService,
 		MonitoringService: monitoringService,
 	}
 	// Only assign the enqueuer when we actually constructed one. Assigning a
 	// nil *worker.TaskClient directly would create a typed-nil interface that
 	// fails the `if deps.TaskEnqueuer != nil` check in router.SetupRouter.
 	if taskEnqueuer != nil {
 		deps.TaskEnqueuer = taskEnqueuer
 	}
 	e := router.SetupRouter(deps)
 	// Create HTTP server
@@ -217,3 +274,15 @@ func main() {
 	log.Info().Msg("Server exited")
 }
 // deploymentEnvironment picks the environment label that spans are tagged
 // with. A non-empty DEPLOYMENT_ENVIRONMENT variable takes precedence;
 // otherwise the Debug flag maps to "dev" when true and "prod" when false.
 func deploymentEnvironment(debug bool) string {
 	override := os.Getenv("DEPLOYMENT_ENVIRONMENT")
 	switch {
 	case override != "":
 		return override
 	case debug:
 		return "dev"
 	default:
 		return "prod"
 	}
 }
@@ -0,0 +1,32 @@
 package main
 import "time"
 // shouldInitEmail reports whether the email configuration has both a host and
 // a user; either one being empty yields false.
 func shouldInitEmail(host, user string) bool {
 	if host == "" || user == "" {
 		return false
 	}
 	return true
 }
 // shouldInitStorage reports whether an upload directory has been configured,
 // i.e. whether uploadDir is a non-empty string.
 func shouldInitStorage(uploadDir string) bool {
 	return len(uploadDir) > 0
 }
 // shouldInitEncryption reports whether an encryption key has been supplied,
 // i.e. whether encryptionKey is a non-empty string.
 func shouldInitEncryption(encryptionKey string) bool {
 	return len(encryptionKey) > 0
 }
 // connectWithRetry calls connect until it succeeds or the attempt budget is
 // used up, pausing between attempts with a linearly growing delay (1ms after
 // the first failure, 2ms after the second, and so on).
 //
 // maxRetries is the total number of attempts, not the number of retries after
 // the first call. Values below 1 are treated as 1 so that a nil result always
 // means connect actually succeeded.
 //
 // It returns nil on success, or the error from the last attempt.
 func connectWithRetry(connect func() error, maxRetries int) error {
 	attempts := maxRetries
 	if attempts < 1 {
 		attempts = 1
 	}
 	var err error
 	for i := 0; i < attempts; i++ {
 		if err = connect(); err == nil {
 			return nil
 		}
 		// No point in waiting once the last attempt has failed.
 		if i < attempts-1 {
 			time.Sleep(time.Duration(i+1) * time.Millisecond) // use ms in tests
 		}
 	}
 	return err
 }
@@ -0,0 +1,107 @@
 package main
 import (
 	"errors"
 	"testing"
 )
 // --- shouldInitEmail ---
 // TestShouldInitEmail_BothSet_True: a host plus a user enables email.
 func TestShouldInitEmail_BothSet_True(t *testing.T) {
 	enabled := shouldInitEmail("smtp.example.com", "user@example.com")
 	if enabled {
 		return
 	}
 	t.Error("expected true when both set")
 }
 // TestShouldInitEmail_MissingHost_False: an empty host disables email.
 func TestShouldInitEmail_MissingHost_False(t *testing.T) {
 	if enabled := shouldInitEmail("", "user@example.com"); enabled {
 		t.Error("expected false when host empty")
 	}
 }
 // TestShouldInitEmail_MissingUser_False: an empty user disables email.
 func TestShouldInitEmail_MissingUser_False(t *testing.T) {
 	if enabled := shouldInitEmail("smtp.example.com", ""); enabled {
 		t.Error("expected false when user empty")
 	}
 }
 // TestShouldInitEmail_BothEmpty_False: no host and no user disables email.
 func TestShouldInitEmail_BothEmpty_False(t *testing.T) {
 	if enabled := shouldInitEmail("", ""); enabled {
 		t.Error("expected false when both empty")
 	}
 }
 // --- shouldInitStorage ---
 // TestShouldInitStorage_Set_True: a configured upload directory enables storage.
 func TestShouldInitStorage_Set_True(t *testing.T) {
 	enabled := shouldInitStorage("/uploads")
 	if enabled {
 		return
 	}
 	t.Error("expected true")
 }
 // TestShouldInitStorage_Empty_False: an empty upload directory disables storage.
 func TestShouldInitStorage_Empty_False(t *testing.T) {
 	if enabled := shouldInitStorage(""); enabled {
 		t.Error("expected false")
 	}
 }
 // --- shouldInitEncryption ---
 // TestShouldInitEncryption_Set_True: a non-empty key enables encryption.
 func TestShouldInitEncryption_Set_True(t *testing.T) {
 	enabled := shouldInitEncryption("secret-key-123")
 	if enabled {
 		return
 	}
 	t.Error("expected true")
 }
 // TestShouldInitEncryption_Empty_False: an empty key disables encryption.
 func TestShouldInitEncryption_Empty_False(t *testing.T) {
 	if enabled := shouldInitEncryption(""); enabled {
 		t.Error("expected false")
 	}
 }
 // --- connectWithRetry ---
 // TestConnectWithRetry_SucceedsFirst_NoRetry: a connect that works on the
 // first call is invoked exactly once and no error is reported.
 func TestConnectWithRetry_SucceedsFirst_NoRetry(t *testing.T) {
 	var attempts int
 	alwaysOK := func() error {
 		attempts++
 		return nil
 	}
 	if err := connectWithRetry(alwaysOK, 3); err != nil {
 		t.Errorf("unexpected error: %v", err)
 	}
 	if attempts != 1 {
 		t.Errorf("calls = %d, want 1", attempts)
 	}
 }
 // TestConnectWithRetry_SucceedsSecond_OneRetry: a connect that fails once and
 // then works is invoked exactly twice and no error is reported.
 func TestConnectWithRetry_SucceedsSecond_OneRetry(t *testing.T) {
 	var attempts int
 	failOnce := func() error {
 		attempts++
 		if attempts > 1 {
 			return nil
 		}
 		return errors.New("fail")
 	}
 	if err := connectWithRetry(failOnce, 3); err != nil {
 		t.Errorf("unexpected error: %v", err)
 	}
 	if attempts != 2 {
 		t.Errorf("calls = %d, want 2", attempts)
 	}
 }
 // TestConnectWithRetry_AllFail_ReturnsError: a connect that never works is
 // invoked once per allowed attempt and the failure is returned.
 func TestConnectWithRetry_AllFail_ReturnsError(t *testing.T) {
 	var attempts int
 	alwaysFail := func() error {
 		attempts++
 		return errors.New("fail")
 	}
 	if err := connectWithRetry(alwaysFail, 3); err == nil {
 		t.Error("expected error")
 	}
 	if attempts != 3 {
 		t.Errorf("calls = %d, want 3", attempts)
 	}
 }
@@ -0,0 +1,333 @@
 // notif-diag is a CLI for inspecting and (optionally) cleaning up stuck
 // notification rows. Default mode is read-only — runs SELECTs and prints a
 // summary. With --mark-failed-as-sent, marks pending rows that already have a
 // recorded error as sent (cosmetic — no retry, no resend).
 //
 // Usage:
 //
 //	set -a && source deploy/prod.env && set +a
 //	go run ./cmd/notif-diag                              # diagnose
 //	go run ./cmd/notif-diag --mark-failed-as-sent --yes  # clean up errored backlog
 package main
 import (
 	"bufio"
 	"fmt"
 	"os"
 	"strconv"
 	"strings"
 	"time"
 	"github.com/rs/zerolog"
 	"github.com/rs/zerolog/log"
 	"gorm.io/driver/postgres"
 	"gorm.io/gorm"
 	"gorm.io/gorm/logger"
 )
 // main connects to Postgres, prints four read-only report sections to stdout
 // (overall totals, pending rows by type, the most recent pending rows, and
 // registered device counts) and then, only when --mark-failed-as-sent was
 // given, runs the cleanup step.
 func main() {
 	// Flags are read with the local stringFlag/boolFlag helpers instead of the
 	// standard flag package.
 	passwordFile := stringFlag("password-file", "deploy/secrets/postgres_password.txt",
 		"Path to file containing POSTGRES_PASSWORD (used if env var is empty)")
 	markFailed := boolFlag("mark-failed-as-sent",
 		"Mark every pending row with a non-empty error_message as sent. Cosmetic only — does not retry the push.")
 	yes := boolFlag("yes", "Skip the interactive confirmation prompt for destructive actions.")
 	// Log messages go to stderr so they stay separate from the report on stdout.
 	log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr, TimeFormat: time.RFC3339})
 	dsn, host, err := buildDSN(*passwordFile)
 	if err != nil {
 		log.Fatal().Err(err).Msg("failed to build database DSN")
 	}
 	// GORM's own query logging is silenced so only the report is printed.
 	db, err := gorm.Open(postgres.Open(dsn), &gorm.Config{
 		Logger: logger.Default.LogMode(logger.Silent),
 	})
 	if err != nil {
 		log.Fatal().Err(err).Msg("failed to connect to database")
 	}
 	// Show the target host first so the operator can see which database is inspected.
 	fmt.Printf("DB host: %s\n", host)
 	fmt.Println(strings.Repeat("=", 80))
 	overallTotals(db)
 	pendingByType(db)
 	recentPending(db)
 	deviceCounts(db)
 	// The only code path that writes; it runs after the read-only report.
 	if *markFailed {
 		markFailedAsSent(db, *yes)
 	}
 }
 // markFailedAsSent updates pending rows whose error_message is non-empty,
 // flipping them to sent=true with sent_at=updated_at. This is purely cosmetic:
 // it removes them from the "pending" count so dashboards and the diag tool
 // don't keep flagging an old, unfixable backlog. It does NOT re-send anything.
 //
 // Unless skipPrompt is true, the operator must type "yes" on stdin before any
 // row is changed. Database and input failures terminate the process through
 // log.Fatal().
 func markFailedAsSent(db *gorm.DB, skipPrompt bool) {
 	// Count first so the operator sees how many rows the update would touch.
 	var candidate int64
 	if err := db.Raw(`
 		SELECT COUNT(*) FROM notifications_notification
 		WHERE sent = false AND error_message IS NOT NULL AND error_message <> ''
 	`).Scan(&candidate).Error; err != nil {
 		log.Fatal().Err(err).Msg("failed to count cleanup candidates")
 	}
 	fmt.Printf("\n# Cleanup candidate count: %d\n", candidate)
 	if candidate == 0 {
 		fmt.Println("  (nothing to clean up)")
 		return
 	}
 	fmt.Println("  These rows have a recorded send error and will never be retried.")
 	fmt.Println("  Marking them sent=true is cosmetic — it just prevents them from")
 	fmt.Println("  showing up as pending in admin dashboards going forward.")
 	if !skipPrompt {
 		fmt.Printf("\nProceed? Type 'yes' to update %d rows: ", candidate)
 		s, err := bufio.NewReader(os.Stdin).ReadString('\n')
 		if err != nil {
 			log.Fatal().Err(err).Msg("failed to read confirmation")
 		}
 		// Only the exact word "yes" (surrounding whitespace ignored) proceeds.
 		if strings.TrimSpace(s) != "yes" {
 			fmt.Println("Aborted.")
 			return
 		}
 	}
 	// Same predicate as the count above. The count and the update are separate
 	// statements, so the number of rows actually updated may differ from the
 	// number announced if rows change in between; the real figure is printed.
 	res := db.Exec(`
 		UPDATE notifications_notification
 		SET sent = true, sent_at = COALESCE(updated_at, NOW())
 		WHERE sent = false AND error_message IS NOT NULL AND error_message <> ''
 	`)
 	if res.Error != nil {
 		log.Fatal().Err(res.Error).Msg("failed to update rows")
 	}
 	fmt.Printf("OK — updated %d rows.\n", res.RowsAffected)
 }
 // overallTotals prints the high-level total/sent/pending/read/errored counts
 // for notifications_notification. If the query fails, the error is printed
 // under the section heading instead of a misleading row of zeros.
 func overallTotals(db *gorm.DB) {
 	type row struct {
 		Total   int64
 		Sent    int64
 		Pending int64
 		Read    int64
 		Errored int64
 	}
 	var r row
 	err := db.Raw(`
 		SELECT
 			COUNT(*)                                                  AS total,
 			COUNT(*) FILTER (WHERE sent = true)                       AS sent,
 			COUNT(*) FILTER (WHERE sent = false)                      AS pending,
 			COUNT(*) FILTER (WHERE read = true)                       AS read,
 			COUNT(*) FILTER (WHERE error_message IS NOT NULL AND error_message <> '') AS errored
 		FROM notifications_notification
 	`).Scan(&r).Error
 	fmt.Println("\n# Overall notification counts")
 	if err != nil {
 		// Report and carry on so the remaining sections still get printed.
 		fmt.Printf("  ERROR: %v\n", err)
 		return
 	}
 	fmt.Printf("  total:   %d\n", r.Total)
 	fmt.Printf("  sent:    %d\n", r.Sent)
 	fmt.Printf("  pending: %d\n", r.Pending)
 	fmt.Printf("  read:    %d\n", r.Read)
 	fmt.Printf("  errored: %d  (rows with non-empty error_message)\n", r.Errored)
 }
 // pendingByType prints one line per notification_type for rows that are still
 // unsent: how many are pending, how many carry an error message, how many were
 // created in the last 24 hours and 7 days, and the oldest/newest creation
 // times. Types are listed newest-first. If the query fails, the error is
 // printed instead of the "(no pending notifications)" placeholder, which would
 // wrongly suggest an empty backlog.
 func pendingByType(db *gorm.DB) {
 	type row struct {
 		NotificationType string
 		PendingCount     int64
 		Oldest           *time.Time
 		Newest           *time.Time
 		WithErrors       int64
 		Last24h          int64
 		Last7d           int64
 	}
 	var rows []row
 	err := db.Raw(`
 		SELECT
 			notification_type,
 			COUNT(*)                                       AS pending_count,
 			MIN(created_at)                                AS oldest,
 			MAX(created_at)                                AS newest,
 			COUNT(*) FILTER (WHERE error_message IS NOT NULL AND error_message <> '') AS with_errors,
 			COUNT(*) FILTER (WHERE created_at > NOW() - INTERVAL '24 hours')          AS last_24h,
 			COUNT(*) FILTER (WHERE created_at > NOW() - INTERVAL '7 days')            AS last_7d
 		FROM notifications_notification
 		WHERE sent = false
 		GROUP BY notification_type
 		ORDER BY MAX(created_at) DESC NULLS LAST
 	`).Scan(&rows).Error
 	fmt.Println("\n# Pending rows by type")
 	if err != nil {
 		// Report and carry on so the remaining sections still get printed.
 		fmt.Printf("  ERROR: %v\n", err)
 		return
 	}
 	if len(rows) == 0 {
 		fmt.Println("  (no pending notifications)")
 		return
 	}
 	fmt.Printf("  %-22s  %7s  %7s  %7s  %7s  %-19s  %-19s\n",
 		"TYPE", "PENDING", "ERRORED", "LAST24H", "LAST7D", "OLDEST", "NEWEST")
 	for _, r := range rows {
 		fmt.Printf("  %-22s  %7d  %7d  %7d  %7d  %-19s  %-19s\n",
 			r.NotificationType, r.PendingCount, r.WithErrors, r.Last24h, r.Last7d,
 			fmtTime(r.Oldest), fmtTime(r.Newest))
 	}
 }
 // recentPending prints the five most recently created unsent notifications
 // with their id, user, creation time, type, any recorded error, and the title
 // and body (each shortened with truncate to 100). If the query fails, the error
 // is printed instead of the "(none)" placeholder, which would wrongly suggest
 // there is nothing pending.
 func recentPending(db *gorm.DB) {
 	type row struct {
 		ID               uint
 		UserID           uint
 		NotificationType string
 		Title            string
 		Body             string
 		ErrorMessage     string
 		CreatedAt        time.Time
 	}
 	var rows []row
 	// COALESCE turns a NULL error_message into "" so it scans into a plain string.
 	err := db.Raw(`
 		SELECT id, user_id, notification_type, title, body, COALESCE(error_message, '') AS error_message, created_at
 		FROM notifications_notification
 		WHERE sent = false
 		ORDER BY created_at DESC
 		LIMIT 5
 	`).Scan(&rows).Error
 	fmt.Println("\n# 5 most recent pending notifications")
 	if err != nil {
 		// Report and carry on so the remaining sections still get printed.
 		fmt.Printf("  ERROR: %v\n", err)
 		return
 	}
 	if len(rows) == 0 {
 		fmt.Println("  (none)")
 		return
 	}
 	for _, r := range rows {
 		// The error line is only emitted for rows that recorded one.
 		errPart := ""
 		if r.ErrorMessage != "" {
 			errPart = fmt.Sprintf("\n      error: %s", r.ErrorMessage)
 		}
 		fmt.Printf("  [%d] user=%d  %s  %s%s\n      title: %s\n      body:  %s\n",
 			r.ID, r.UserID, r.CreatedAt.Format("2006-01-02 15:04:05"), r.NotificationType, errPart,
 			truncate(r.Title, 100), truncate(r.Body, 100))
 	}
 }
 // deviceCounts prints, for the APNs and the GCM device tables in turn, the
 // total number of rows, how many are active, how many have a user attached,
 // and how many distinct users they belong to. A failing query is reported on
 // that table's line and the next table is still processed.
 func deviceCounts(db *gorm.DB) {
 	type counts struct {
 		Total         int64
 		Active        int64
 		WithUser      int64
 		DistinctUsers int64
 	}
 	type source struct {
 		label string
 		table string
 	}
 	sources := []source{
 		{label: "APNs (iOS)", table: "push_notifications_apnsdevice"},
 		{label: "GCM (Android)", table: "push_notifications_gcmdevice"},
 	}
 	fmt.Println("\n# Registered push devices")
 	for _, src := range sources {
 		// The table name comes from the fixed list above, never from input.
 		query := fmt.Sprintf(`
 			SELECT
 				COUNT(*)                                              AS total,
 				COUNT(*) FILTER (WHERE active = true)                 AS active,
 				COUNT(*) FILTER (WHERE user_id IS NOT NULL)           AS with_user,
 				COUNT(DISTINCT user_id)                               AS distinct_users
 			FROM %s
 		`, src.table)
 		var c counts
 		if err := db.Raw(query).Scan(&c).Error; err != nil {
 			fmt.Printf("  %-15s  ERROR: %v\n", src.label, err)
 			continue
 		}
 		fmt.Printf("  %-15s  total=%-5d  active=%-5d  with_user=%-5d  distinct_users=%d\n",
 			src.label, c.Total, c.Active, c.WithUser, c.DistinctUsers)
 	}
 }
 // buildDSN assembles a keyword/value Postgres connection string from the
 // environment and returns it together with the bare host name (for display).
 //
 // It reads DB_HOST, POSTGRES_USER, POSTGRES_DB, DB_SSLMODE (default
 // "require"), DB_PORT (default 5432) and POSTGRES_PASSWORD. When
 // POSTGRES_PASSWORD is empty and passwordFile is non-empty, the password is
 // read from that file with trailing CR/LF stripped.
 //
 // An error is returned when DB_PORT is not an integer, when the password file
 // cannot be read, or when any required setting is still empty; in that case
 // both returned strings are empty.
 func buildDSN(passwordFile string) (dsn, host string, err error) {
 	host = os.Getenv("DB_HOST")
 	user := os.Getenv("POSTGRES_USER")
 	dbname := os.Getenv("POSTGRES_DB")
 	sslmode := os.Getenv("DB_SSLMODE")
 	if sslmode == "" {
 		sslmode = "require"
 	}
 	port := 5432
 	if s := os.Getenv("DB_PORT"); s != "" {
 		p, perr := strconv.Atoi(s)
 		if perr != nil {
 			return "", "", fmt.Errorf("invalid DB_PORT %q: %w", s, perr)
 		}
 		port = p
 	}
 	password := os.Getenv("POSTGRES_PASSWORD")
 	if password == "" && passwordFile != "" {
 		b, rerr := os.ReadFile(passwordFile)
 		if rerr != nil {
 			return "", "", fmt.Errorf("POSTGRES_PASSWORD not set and could not read %s: %w", passwordFile, rerr)
 		}
 		password = strings.TrimRight(string(b), "\r\n")
 	}
 	// Collect every missing setting so the operator sees them all at once.
 	var missing []string
 	if host == "" {
 		missing = append(missing, "DB_HOST")
 	}
 	if user == "" {
 		missing = append(missing, "POSTGRES_USER")
 	}
 	if dbname == "" {
 		missing = append(missing, "POSTGRES_DB")
 	}
 	if password == "" {
 		missing = append(missing, "POSTGRES_PASSWORD")
 	}
 	if len(missing) > 0 {
 		return "", "", fmt.Errorf("missing required env vars: %s", strings.Join(missing, ", "))
 	}
 	// Each value is quoted when necessary: an unquoted space, quote or
 	// backslash (most likely in the password) would otherwise end the value
 	// early or be misread by the DSN parser.
 	dsn = fmt.Sprintf("host=%s port=%d user=%s password=%s dbname=%s sslmode=%s",
 		quoteDSNValue(host), port, quoteDSNValue(user), quoteDSNValue(password),
 		quoteDSNValue(dbname), quoteDSNValue(sslmode))
 	return dsn, host, nil
 }
 // quoteDSNValue returns v in a form that is safe inside a keyword/value
 // connection string. Plain values are returned unchanged; empty values and
 // values containing whitespace, a single quote or a backslash are wrapped in
 // single quotes with embedded backslashes and quotes backslash-escaped.
 func quoteDSNValue(v string) string {
 	if v != "" && !strings.ContainsAny(v, " \t\r\n\v\f'\\") {
 		return v
 	}
 	escaped := strings.NewReplacer(`\`, `\\`, `'`, `\'`).Replace(v)
 	return "'" + escaped + "'"
 }
 // stringFlag is a tiny stand-in for flag.String that keeps imports lean. It
 // looks through the command-line arguments for --name=value or --name value
 // and returns a pointer to the value found, or to def when the flag is absent.
 // The usage text is accepted for call-site documentation only.
 func stringFlag(name, def, _usage string) *string {
 	v := stringFlagFrom(os.Args[1:], name, def)
 	return &v
 }
 // stringFlagFrom returns the value given for --name in args, or def when it is
 // not present. Both "--name=value" and "--name value" are understood; in the
 // second form the following argument is only consumed when it does not itself
 // start with "--", so "--name --other" leaves the default in place. When the
 // flag is repeated, the last occurrence wins.
 func stringFlagFrom(args []string, name, def string) string {
 	flagName := "--" + name
 	prefix := flagName + "="
 	value := def
 	for i := 0; i < len(args); i++ {
 		arg := args[i]
 		switch {
 		case strings.HasPrefix(arg, prefix):
 			value = strings.TrimPrefix(arg, prefix)
 		case arg == flagName && i+1 < len(args) && !strings.HasPrefix(args[i+1], "--"):
 			value = args[i+1]
 			i++ // the value has been consumed; do not inspect it as a flag
 		}
 	}
 	return value
 }
 // boolFlag returns a pointer to true when the bare argument --name appears
 // anywhere on the command line, and to false otherwise. Only the exact
 // "--name" spelling counts; there is no "--name=value" form. The usage text is
 // accepted for call-site documentation only.
 func boolFlag(name, _usage string) *bool {
 	present := false
 	for _, arg := range os.Args[1:] {
 		if arg != "--"+name {
 			continue
 		}
 		present = true
 	}
 	return &present
 }
 // fmtTime renders t in the "2006-01-02 15:04:05" layout, or returns "-" when
 // t is nil.
 func fmtTime(t *time.Time) string {
 	if t != nil {
 		return t.Format("2006-01-02 15:04:05")
 	}
 	return "-"
 }
 // truncate shortens s to at most n bytes and appends "…" when anything was cut
 // off; strings that already fit are returned unchanged. The cut point is moved
 // back to the nearest rune boundary so that a multi-byte UTF-8 character is
 // never split in half, and a negative n is treated as 0 rather than panicking.
 func truncate(s string, n int) string {
 	if n < 0 {
 		n = 0
 	}
 	if len(s) <= n {
 		return s
 	}
 	// Bytes of the form 10xxxxxx are UTF-8 continuation bytes; step back until
 	// the cut lands on the first byte of a character. n < len(s) here, so s[n]
 	// is always in range.
 	for n > 0 && s[n]&0xC0 == 0x80 {
 		n--
 	}
 	return s[:n] + "…"
 }
@@ -0,0 +1,59 @@
 // send-test-push enqueues a one-shot Asynq push notification task. The worker
 // picks it up and routes it through internal/push/Client.SendToAll, which now
 // hits APNs production. Verifies end-to-end that push delivery is working
 // without waiting for the next cron tick.
 //
 // Usage:
 //
 //	# Port-forward Redis from the cluster first:
 //	kubectl --kubeconfig ~/.kube/honeydue-k3s.yaml -n honeydue port-forward svc/redis 6379:6379
 //
 //	# Then in another shell:
 //	go run ./cmd/send-test-push --user-id 6 --title "Test" --message "Hello from notif-diag"
 package main
 import (
 	"flag"
 	"fmt"
 	"os"
 	"strconv"
 	"github.com/hibiken/asynq"
 	"github.com/treytartt/honeydue-api/internal/worker/jobs"
 )
 // main runs the enqueue helper and exits with its status code. The work lives
 // in a helper so that deferred cleanup (closing the Asynq client) actually
 // runs: os.Exit skips deferred calls in the function that invokes it.
 func main() {
 	os.Exit(enqueueTestPush())
 }
 // enqueueTestPush parses the command-line flags, builds a one-shot push task
 // for the requested user and enqueues it on the "default" Asynq queue with up
 // to 3 retries.
 //
 // It returns the process exit code: 0 on success, 2 when --user-id is missing
 // (usage error), 1 when the task cannot be built or enqueued. Diagnostics go
 // to stderr; the success summary goes to stdout.
 func enqueueTestPush() int {
 	userID := flag.Uint("user-id", 0, "Target auth_user.id (required)")
 	title := flag.String("title", "Test push", "Notification title")
 	message := flag.String("message", "Hello from send-test-push", "Notification body")
 	redisAddr := flag.String("redis", "localhost:6379", "Redis host:port (use kubectl port-forward to reach the in-cluster redis)")
 	flag.Parse()
 	if *userID == 0 {
 		fmt.Fprintln(os.Stderr, "--user-id is required")
 		return 2
 	}
 	task, err := jobs.NewSendPushTask(*userID, *title, *message, map[string]string{
 		"type":    "test",
 		"user_id": strconv.FormatUint(uint64(*userID), 10),
 	})
 	if err != nil {
 		fmt.Fprintf(os.Stderr, "build task: %v\n", err)
 		return 1
 	}
 	client := asynq.NewClient(asynq.RedisClientOpt{Addr: *redisAddr})
 	// Best-effort close: the process is about to exit either way, so a close
 	// error is not actionable.
 	defer func() { _ = client.Close() }()
 	info, err := client.Enqueue(task, asynq.Queue("default"), asynq.MaxRetry(3))
 	if err != nil {
 		fmt.Fprintf(os.Stderr, "enqueue: %v\n", err)
 		return 1
 	}
 	fmt.Printf("Enqueued task: id=%s queue=%s type=%s\n", info.ID, info.Queue, info.Type)
 	fmt.Printf("Tail worker logs to see the result:\n")
 	// Space-separated --kubeconfig so the shell expands "~"; shells do not
 	// tilde-expand inside a --flag=~/path word.
 	fmt.Printf("  kubectl --kubeconfig ~/.kube/honeydue-k3s.yaml -n honeydue logs deploy/worker --tail=20 -f\n")
 	return 0
 }
@@ -11,13 +11,19 @@ import (
 	"github.com/hibiken/asynq"
 	"github.com/redis/go-redis/v9"
 	"github.com/rs/zerolog/log"
 	"go.opentelemetry.io/otel/attribute"
 	"go.opentelemetry.io/otel/codes"
 	"go.opentelemetry.io/otel/trace"
 	"github.com/treytartt/honeydue-api/internal/config"
 	"github.com/treytartt/honeydue-api/internal/database"
 	"github.com/treytartt/honeydue-api/internal/monitoring"
 	"github.com/treytartt/honeydue-api/internal/prom"
 	"github.com/treytartt/honeydue-api/internal/push"
 	"github.com/treytartt/honeydue-api/internal/repositories"
 	"github.com/treytartt/honeydue-api/internal/services"
 	"github.com/treytartt/honeydue-api/internal/tracing"
 	"github.com/treytartt/honeydue-api/internal/worker"
 	"github.com/treytartt/honeydue-api/internal/worker/jobs"
 	"github.com/treytartt/honeydue-api/pkg/utils"
 )
@@ -40,6 +46,29 @@ func main() {
 		os.Exit(0)
 	}
 	// Initialize OpenTelemetry tracing for the worker process. Same OTLP
 	// destination as the api; service.name distinguishes them in Jaeger.
 	// config.SecretValue (not os.Getenv) so file-mounted secrets resolve
 	// after audit F8 removed these from the process environment.
 	tracingShutdown, err := tracing.Init(context.Background(), tracing.Config{
 		ServiceName: "honeydue-worker",
 		Environment: workerDeploymentEnv(cfg.Server.Debug),
 		EndpointURL: config.SecretValue("OBS_TRACES_URL"),
 		BearerToken: config.SecretValue("OBS_INGEST_TOKEN"),
 		SampleRatio: tracing.SampleRatioFromEnv(),
 	})
 	if err != nil {
 		log.Error().Err(err).Msg("worker tracing init failed — continuing without traces")
 	}
 	defer func() {
 		shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
 		defer cancel()
 		if err := tracingShutdown(shutdownCtx); err != nil {
 			log.Warn().Err(err).Msg("worker tracing shutdown error")
 		}
 	}()
 	asynqTracer := tracing.Tracer("honeydue/worker/asynq")
 	// Initialize database
 	db, err := database.Connect(&cfg.Database, cfg.Server.Debug)
 	if err != nil {
@@ -80,6 +109,17 @@ func main() {
 	if err != nil {
 		log.Fatal().Err(err).Msg("Failed to parse Redis URL")
 	}
 	// Audit HIGH-1: the Redis password is a file-mounted secret (REDIS_PASSWORD),
 	// not embedded in REDIS_URL — REDIS_URL travels in the honeydue-config
 	// ConfigMap. Apply the password onto the parsed opt so the Asynq server,
 	// inspector and monitoring client (all derived from redisOpt below)
 	// authenticate against a requirepass-protected Redis.
 	if cfg.Redis.Password != "" {
 		if clientOpt, ok := redisOpt.(asynq.RedisClientOpt); ok {
 			clientOpt.Password = cfg.Redis.Password
 			redisOpt = clientOpt
 		}
 	}
 	// Initialize monitoring service (if Redis is available)
 	var monitoringService *monitoring.Service
@@ -141,14 +181,62 @@ func main() {
 	// Create job handler
 	jobHandler := jobs.NewHandler(db, pushClient, emailService, notificationService, cfg)
 	// Wire upload service for the pending_uploads cleanup cron AND share the
 	// underlying storage service with the TaskService below so the worker can
 	// load completion images for email embedding. Storage may be local-disk
 	// (no S3 backend), in which case the upload service stays nil and the
 	// cleanup handler no-ops. Cache is optional — the cleanup path doesn't
 	// rate-limit and works fine with a nil cache.
 	var sharedStorageService *services.StorageService
 	if storageService, sErr := services.NewStorageService(&cfg.Storage); sErr == nil {
 		sharedStorageService = storageService
 		if s3 := storageService.S3Backend(); s3 != nil {
 			pendingUploadRepo := repositories.NewPendingUploadRepository(db)
 			uploadService := services.NewUploadService(pendingUploadRepo, s3, &cfg.Storage, nil)
 			jobHandler.SetUploadService(uploadService)
 		}
 	} else {
 		log.Warn().Err(sErr).Msg("Failed to initialize storage service for upload cleanup; cleanup cron will no-op")
 	}
 	// Wire a TaskService for the task-completed notification handler. The
 	// worker re-creates this (vs. importing the api's wired instance) because
 	// each binary owns its own dependency graph. The handler is fully nil-safe
 	// — if any of the wired services are absent, the corresponding side of
 	// notification delivery (push or email) is skipped.
 	taskRepo := repositories.NewTaskRepository(db)
 	residenceRepo := repositories.NewResidenceRepository(db)
 	workerTaskService := services.NewTaskService(taskRepo, residenceRepo)
 	if notificationService != nil {
 		workerTaskService.SetNotificationService(notificationService)
 	}
 	if emailService != nil {
 		workerTaskService.SetEmailService(emailService)
 	}
 	if sharedStorageService != nil {
 		workerTaskService.SetStorageService(sharedStorageService)
 	}
 	jobHandler.SetTaskService(workerTaskService)
 	// Create Asynq mux and register handlers
 	mux := asynq.NewServeMux()
 	// Tracing + metrics middleware: every job runs inside a span and emits
 	// asynq_job_duration_seconds{task_type,result}.
 	mux.Use(asynqTracingMiddleware(asynqTracer))
 	mux.HandleFunc(jobs.TypeSmartReminder, jobHandler.HandleSmartReminder)
 	mux.HandleFunc(jobs.TypeDailyDigest, jobHandler.HandleDailyDigest)
 	mux.HandleFunc(jobs.TypeSendEmail, jobHandler.HandleSendEmail)
 	mux.HandleFunc(jobs.TypeSendPush, jobHandler.HandleSendPush)
 	mux.HandleFunc(jobs.TypeOnboardingEmails, jobHandler.HandleOnboardingEmails)
 	mux.HandleFunc(jobs.TypeReminderLogCleanup, jobHandler.HandleReminderLogCleanup)
 	mux.HandleFunc(jobs.TypeUploadCleanup, jobHandler.HandleUploadCleanup)
 	mux.HandleFunc(jobs.TypeNotificationCleanup, jobHandler.HandleNotificationCleanup)
 	mux.HandleFunc(jobs.TypeWebhookLogCleanup, jobHandler.HandleWebhookLogCleanup)
 	mux.HandleFunc(jobs.TypeAuditLogCleanup, jobHandler.HandleAuditLogCleanup)
 	mux.HandleFunc(worker.TypeTaskCompletedNotification, jobHandler.HandleTaskCompletedNotification)
 	mux.HandleFunc(worker.TypeDataExport, jobHandler.HandleDataExport)
 	// Register email job handlers (welcome, verification, password reset, password changed)
 	if emailService != nil {
@@ -188,6 +276,32 @@ func main() {
 	}
 	log.Info().Str("cron", "0 3 * * *").Msg("Registered reminder log cleanup job (runs daily at 3:00 AM UTC)")
 	// Schedule pending_uploads cleanup (hourly at :30 to avoid colliding with
 	// the top-of-hour reminder + digest crons). Reaps unclaimed expired
 	// upload sessions; the B2 bucket lifecycle (7 days on uploads/ prefix)
 	// is the backstop if this worker is offline for an extended period.
 	if _, err := scheduler.Register("30 * * * *", asynq.NewTask(jobs.TypeUploadCleanup, nil)); err != nil {
 		log.Fatal().Err(err).Msg("Failed to register upload cleanup job")
 	}
 	log.Info().Str("cron", "30 * * * *").Msg("Registered pending_uploads cleanup job (runs hourly)")
 	// Data-retention cleanups (BE-2). Staggered off the 3:00 reminder cleanup to
 	// avoid piling DELETEs onto the same Neon connection window.
 	if _, err := scheduler.Register("0 2 * * *", asynq.NewTask(jobs.TypeNotificationCleanup, nil)); err != nil {
 		log.Fatal().Err(err).Msg("Failed to register notification cleanup job")
 	}
 	log.Info().Str("cron", "0 2 * * *").Msg("Registered notification cleanup job (daily 02:00 UTC, 90d retention)")
 	if _, err := scheduler.Register("30 2 * * 0", asynq.NewTask(jobs.TypeWebhookLogCleanup, nil)); err != nil {
 		log.Fatal().Err(err).Msg("Failed to register webhook log cleanup job")
 	}
 	log.Info().Str("cron", "30 2 * * 0").Msg("Registered webhook log cleanup job (weekly Sun 02:30 UTC, 180d retention)")
 	if _, err := scheduler.Register("30 3 * * 0", asynq.NewTask(jobs.TypeAuditLogCleanup, nil)); err != nil {
 		log.Fatal().Err(err).Msg("Failed to register audit log cleanup job")
 	}
 	log.Info().Str("cron", "30 3 * * 0").Msg("Registered audit log cleanup job (weekly Sun 03:30 UTC, 365d retention)")
 	// Handle graceful shutdown
 	quit := make(chan os.Signal, 1)
 	signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)
@@ -199,6 +313,12 @@ func main() {
 		w.WriteHeader(http.StatusOK)
 		_, _ = w.Write([]byte(`{"status":"ok"}`))
 	})
 	// Expose Prometheus metrics so vmagent can scrape the worker. The
 	// apns_send_*, fcm_send_*, asynq_job_* and cache_ops_* series have been
 	// recorded on this process all along — they were just never exposed, which
 	// is why those dashboard panels read empty. Same :6060 as health; in-cluster
 	// only (not externally published).
 	healthMux.Handle("/metrics", prom.HTTPHandler())
 	healthSrv := &http.Server{
 		Addr:              workerHealthAddr,
 		Handler:           healthMux,
@@ -238,3 +358,44 @@ func main() {
 	log.Info().Msg("Worker stopped")
 }
 // asynqTracingMiddleware returns an asynq.MiddlewareFunc that wraps every task
 // execution in a span named "asynq.handle:<task type>" and records the
 // execution time via prom.ObserveAsynqJob, labelled with the task type and a
 // result of "ok" or "error".
 //
 // Span attributes: asynq.task_type, asynq.payload_bytes, asynq.result, plus
 // asynq.queue and asynq.retry_count when Asynq has attached that metadata to
 // the handler context. On failure the span status is set to Error and the
 // error is recorded on the span. The handler's error is returned unchanged so
 // Asynq's retry logic is unaffected.
 func asynqTracingMiddleware(tracer trace.Tracer) asynq.MiddlewareFunc {
 	return func(next asynq.Handler) asynq.Handler {
 		return asynq.HandlerFunc(func(ctx context.Context, t *asynq.Task) error {
 			attrs := []attribute.KeyValue{
 				attribute.String("asynq.task_type", t.Type()),
 				attribute.Int("asynq.payload_bytes", len(t.Payload())),
 			}
 			// Queue name and retry count live in the context Asynq hands to
 			// handlers; the ok result is false if the context did not come
 			// from an Asynq server, in which case the attribute is omitted.
 			if queue, ok := asynq.GetQueueName(ctx); ok {
 				attrs = append(attrs, attribute.String("asynq.queue", queue))
 			}
 			if retries, ok := asynq.GetRetryCount(ctx); ok {
 				attrs = append(attrs, attribute.Int("asynq.retry_count", retries))
 			}
 			ctx, span := tracer.Start(ctx, "asynq.handle:"+t.Type(),
 				trace.WithAttributes(attrs...),
 			)
 			defer span.End()
 			start := time.Now()
 			err := next.ProcessTask(ctx, t)
 			dur := time.Since(start)
 			result := "ok"
 			if err != nil {
 				result = "error"
 				span.SetStatus(codes.Error, err.Error())
 				span.RecordError(err)
 			}
 			span.SetAttributes(attribute.String("asynq.result", result))
 			prom.ObserveAsynqJob(t.Type(), result, dur)
 			return err
 		})
 	}
 }
 // workerDeploymentEnv mirrors deploymentEnvironment in cmd/api/main.go. It
 // picks the environment label passed to tracing: a non-empty
 // DEPLOYMENT_ENVIRONMENT variable takes precedence; otherwise the label is
 // "dev" when debug is set and "prod" when it is not.
 func workerDeploymentEnv(debug bool) string {
 	override := os.Getenv("DEPLOYMENT_ENVIRONMENT")
 	switch {
 	case override != "":
 		return override
 	case debug:
 		return "dev"
 	default:
 		return "prod"
 	}
 }
@@ -42,7 +42,7 @@ email:
 push:
  apns_key_id: ""
  apns_team_id: ""
-  apns_topic: com.tt.honeyDue
+  apns_topic: com.myhoneydue.honeyDue.dev
  apns_production: false
  apns_use_sandbox: true                # Sandbox for dev
@@ -85,8 +85,9 @@ tls:
  # If mode=cloudflare, create secrets/cloudflare-origin.crt and .key
 # --- Apple Auth / IAP (optional) ---
 # client_id MUST equal the iOS Debug bundle ID for the dev backend.
 apple_auth:
-  client_id: ""
+  client_id: "com.myhoneydue.honeyDue.dev"
  team_id: ""
  iap_key_id: ""
  iap_issuer_id: ""
@@ -92,7 +92,7 @@ ADMIN_PW="$(openssl rand -base64 16)"
 EMAIL_USER="treytartt@fastmail.com"
 APNS_KEY_ID="9R5Q7ZX874"
-APNS_TEAM_ID="V3PF3M6B6U"
+APNS_TEAM_ID="X86BR9WTLD"
 log ""
 log "Pre-filled from existing dev server:"
@@ -147,7 +147,7 @@ email:
 push:
  apns_key_id: "${APNS_KEY_ID}"
  apns_team_id: "${APNS_TEAM_ID}"
-  apns_topic: com.tt.honeyDue
+  apns_topic: com.myhoneydue.honeyDue.dev
  apns_production: false
  apns_use_sandbox: true
@@ -189,7 +189,7 @@ tls:
 # --- Apple Auth / IAP ---
 apple_auth:
-  client_id: "com.tt.honeyDue"
+  client_id: "com.myhoneydue.honeyDue.dev"
  team_id: "${APNS_TEAM_ID}"
  iap_key_id: ""
  iap_issuer_id: ""
@@ -3,6 +3,7 @@ config.yaml
 # Generated files
 kubeconfig
 kubeconfig.*
 cluster-config.yaml
 prod.env
@@ -0,0 +1,966 @@
 # honeyDue k3s Cluster — Operations Runbook
 Living document for the honeyDue production cluster. Add entries when you hit
 something non-obvious so future-you (or your replacement) doesn't have to
 rediscover it.
 Last full revision: **2026-06-03** (Hetzner → OVH BHS cutover; this cluster has
 been the sole production cluster since that date). For pre-OVH history, see
 `MIGRATION_NOTES.md` (Swarm → k3s migration on Hetzner, 2026-04-24).
 ---
 ## 1. Topology and inventory
 ### Hosting
 | | |
 |---|---|
 | Provider | OVHcloud (us.ovhcloud.com) |
 | Datacenter | BHS — Beauharnois, Quebec, Canada |
 | Plan | VPS-1 × 3 (~$6.46/mo each, ~$19/mo total) |
 | Node spec | 4 vCPU (Intel Haswell, shared), 7.6 GB RAM, 75 GB NVMe |
 | Public bandwidth | 400 Mbps per node, unlimited traffic |
 | Private network | **None.** Nodes have public IPv4 + IPv6 only; inter-node traffic crosses the public internet (encrypted by flannel WireGuard backend — see §3) |
 ### Nodes
 | SSH alias | Kubernetes node name | Public IPv4 | Public IPv6 | Roles |
 |---|---|---|---|---|
 | `ovhcloud1` | `vps-1624d691` | `51.81.83.33` | `2604:2dc0:101:200::5a9a` | control-plane, etcd, redis-pinned |
 | `ovhcloud2` | `vps-c0f51be2` | `51.81.87.86` | `2604:2dc0:101:200::30d4` | control-plane, etcd |
 | `ovhcloud3` | `vps-dbca24c7` | `51.81.85.248` | `2604:2dc0:101:200::450f` | control-plane, etcd |
 The cluster is **all-control-plane** (workloads schedule on the same nodes that
 run etcd and the API server). `vps-1624d691` carries the
 `honeydue/redis=true` label so the Redis Deployment's `nodeSelector` binds
 there; the Redis PVC (`local-path`, host-pinned) lives on that node's disk.
 ### SSH access
 `~/.ssh/config` entries (operator workstation):
 ```
 Host ovhcloud1
    HostName 51.81.83.33
    Port 22
    User ubuntu
    IdentityFile ~/.ssh/ovhcloud
    IdentitiesOnly yes
 Host ovhcloud2
    HostName 51.81.87.86
    Port 22
    User ubuntu
    IdentityFile ~/.ssh/ovhcloud
    IdentitiesOnly yes
 Host ovhcloud3
    HostName 51.81.85.248
    Port 22
    User ubuntu
    IdentityFile ~/.ssh/ovhcloud
    IdentitiesOnly yes
 ```
 `ubuntu` has passwordless sudo (`/etc/sudoers.d/90-cloud-init-users` from OVH's
 cloud-init).
 ### kubectl access
 ```bash
 export KUBECONFIG=/Users/treyt/Desktop/code/honeyDue/honeyDueAPI-go/deploy-k3s/kubeconfig
 kubectl get nodes
 ```
 The `deploy-k3s/kubeconfig` file (mode 0600, gitignored) is the OVH cluster's
 admin kubeconfig with `server: https://51.81.83.33:6443`. A stale Hetzner copy
 lives next to it as `kubeconfig.hetzner.bak` for historical reference; the
 Hetzner cluster is powered off and that file's API server is unreachable.
 To refresh from the cluster (if the local copy is lost or rotated):
 ```bash
 ssh ovhcloud1 'sudo cat /etc/rancher/k3s/k3s.yaml' \
  | sed 's|server: https://127.0.0.1:6443|server: https://51.81.83.33:6443|' \
  > deploy-k3s/kubeconfig
 chmod 600 deploy-k3s/kubeconfig
 ```
 The k3s API at `:6443` is open to the public internet (token-protected).
 ---
 ## 2. Software
 ### Kernel-level
 | | |
 |---|---|
 | OS | Ubuntu 26.04 LTS (set by OVH's VPS-1 image) |
 | Kernel | `7.0.0-14-generic` |
 | Init | systemd |
 | Container runtime | containerd 2.2.2 (bundled with k3s) |
 | Firewall | `ufw` (per-node, configured at install — see §3) |
 | Other host packages | `fail2ban` (SSH brute-force protection, default jail), `unattended-upgrades` (security updates), `open-iscsi` (k3s prereq for some storage backends), `curl` |
 ### Kubernetes
 | | |
 |---|---|
 | Distribution | k3s |
 | Version | **`v1.34.6+k3s1`** (pinned in `config.yaml:cluster.k3s_version`) |
 | Control plane | 3-node HA, embedded etcd (no external Postgres backing store) |
 | CNI / networking | flannel with **WireGuard-native backend** (`--flannel-backend=wireguard-native`). Encrypts pod-to-pod and etcd peer traffic because nodes only have public IPs (no private network). ~3-5% CPU overhead under load. |
 | Service LB | klipper-lb (default k3s `servicelb`). The `svclb-traefik` DaemonSet binds host ports `:80` and `:443` on each node and forwards to the Traefik Service. **Not** the DaemonSet-w/-hostNetwork Traefik pattern used on the old Hetzner cluster — see §10 *Differences from MIGRATION_NOTES*. |
 | Ingress controller | Traefik (k3s default), single-replica Deployment, exposed via klipper-lb |
 | DNS | CoreDNS (k3s default) |
 | Secrets encryption | Enabled (`--secrets-encryption`); etcd values are AES-CBC encrypted at rest |
 | kubeconfig perms | `0600` (`--write-kubeconfig-mode=0600`) |
 | Cloud controller | Disabled (`--disable-cloud-controller`) — no provider integration on OVH |
 | Misc | `--node-ip` / `--node-external-ip` / `--advertise-address` all set to each node's public IPv4. TLS SANs cover all 3 IPs so any IP can serve the API. |
 ### Application stack (in cluster, `honeydue` namespace)
 | Deployment | Replicas | Image (digest-pinned) | Notes |
 |---|---:|---|---|
 | `api` | 3 | `gitea.treytartt.com/admin/honeydue-api@sha256:34fde6...` | Go REST API on `:8000`, exposes `/metrics` |
 | `web` | 3 | `gitea.treytartt.com/admin/honeydue-web@sha256:8c62cf...` | Next.js, server-side proxy to api |
 | `admin` | 1 | `gitea.treytartt.com/admin/honeydue-admin@sha256:b81263...` | Next.js admin panel, gated behind Traefik basic-auth |
 | `worker` | 1 | `gitea.treytartt.com/admin/honeydue-worker@sha256:fe1f5e...` | Asynq scheduler + Redis-backed jobs (singleton — must not run as >1 replica or every cron fires N×) |
 | `redis` | 1 | `redis:7-alpine@sha256:6ab0b6...` | Pinned to `vps-1624d691` via `honeydue/redis=true`. PVC `redis-data` (local-path, 5 Gi). Password-auth required. |
 | `vmagent` | 1 | `victoriametrics/vmagent@sha256:...` (default tag) | Scrapes api `/metrics` + kube-state-metrics; remote-writes to obs.88oakapps.com |
 | `kube-state-metrics` | 1 | `kube-state-metrics@sha256:...` | In `kube-system`, scraped by vmagent for `kube_*` cluster-state metrics |
 | `alloy-logs` (DaemonSet) | 3 (1/node) | `grafana/alloy@sha256:...` | Tails `/var/log/pods/*` and ships to Loki at obs.88oakapps.com |
 The Asynq scheduler inside `worker` registers these cron jobs:
 | Cron | Job | Notes |
 |---|---|---|
 | `0 * * * *` | Smart reminder check (per-user hour) | Default user hour: 14:00 UTC |
 | `0 * * * *` | Daily digest check (per-user hour) | Default user hour: 03:00 UTC |
 | `0 10 * * *` | Onboarding emails | 10:00 UTC |
 | `0 3 * * *` | Reminder log cleanup | 03:00 UTC |
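 | `30 * * * *` | Pending uploads cleanup | xx:30 every hour |
 | `0 2 * * *` | Notification cleanup | 02:00 UTC daily, 90-day retention |
 | `30 2 * * 0` | Webhook log cleanup | Sundays 02:30 UTC, 180-day retention |
 | `30 3 * * 0` | Audit log cleanup | Sundays 03:30 UTC, 365-day retention |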
 ### External dependencies
 | Service | Endpoint | Purpose | Failure mode |
 |---|---|---|---|
 | Neon Postgres | `ep-floral-truth-amttbc5a-pooler.c-5.us-east-1.aws.neon.tech:5432` | App data. Pooler endpoint (transaction-mode PgBouncer in front of Neon compute) so connections stay warm. | api / worker pods crash-loop with `dial tcp: connection refused`. Health endpoint returns `postgres: error`. |
 | Backblaze B2 (S3-compatible) | `s3.us-east-005.backblazeb2.com` (bucket `honeyDueProd`) | User uploads (photos, PDFs, completion attachments) | Upload routes return 5xx; reads of cached/static files still work. |
 | Cloudflare | `myhoneydue.com` zone | DNS + TLS termination + edge cache + DDoS | Traffic stops reaching origin. Direct `https://51.81.x.x` still works for diagnostics. |
 | obs.88oakapps.com | Operator-run Grafana + VictoriaMetrics + Loki | Metrics & logs | vmagent + alloy-logs back off and retry. No app-side impact. |
 | Apple APNs | `api.push.apple.com:443` (production) | iOS push notifications | Push fails; circuit breaker opens; failure logged. App functionality unaffected. |
 | Fastmail SMTP | `smtp.fastmail.com:587` | Transactional emails (verification, recovery, digests) | Email send fails in the worker; logged; user reset/digest flow degrades. |
 | Gitea registry | `gitea.treytartt.com` | Container image registry | Deploys can't pull. Existing pods keep running on cached images. |
 ---
 ## 3. Network and firewall
 ### Per-node `ufw` configuration
 Applied during install (same on all 3 nodes):
 ```
 default deny incoming
 default allow outgoing
 allow 22/tcp                  (SSH, world)
 allow 80/tcp                  (HTTP via Cloudflare, world — see GAP-1)
 allow 443/tcp                 (HTTPS, same — GAP-1)
 allow 6443/tcp                (k3s API, world, token-protected)
 allow 2379:2380/tcp from <other 2 OVH IPs>   (etcd client + peer)
 allow 10250/tcp from <other 2 OVH IPs>       (kubelet)
 allow 51820/udp from <other 2 OVH IPs>       (WireGuard tunnel)
 allow 8472/udp  from <other 2 OVH IPs>       (VXLAN, defense-in-depth fallback)
 ```
 To inspect: `ssh ovhcloudN sudo ufw status numbered`.
 ### Cluster networking
 - **Pod CIDR**: `10.42.0.0/16` (default k3s)
 - **Service CIDR**: `10.43.0.0/16` (default k3s)
 - **Flannel backend**: WireGuard-native. Each node hosts a `flannel-wg` interface on UDP 51820 and tunnels pod traffic to peers. Verify: `ssh ovhcloudN ip -d link show flannel-wg`.
 ### Traefik ingress flow
 ```
 Cloudflare → node:80/443 (public)
  → klipper-lb svclb-traefik DaemonSet pod (hostPort:80/443)
  → Traefik Service (ClusterIP 10.43.245.127:80/443)
  → Traefik Deployment pod (single replica)
  → matches Ingress host rule (api.myhoneydue.com etc.)
  → routes to backend Service (api / web / admin)
  → backend Pod
 ```
 The Traefik default also lives in `kube-system` and is managed by k3s's
 HelmChart. **No HelmChartConfig override is applied on OVH** (unlike Hetzner
 — see §10).
 ---
 ## 4. DNS configuration (Cloudflare)
 The `myhoneydue.com` zone in Cloudflare has these public records. **All
 hostnames are proxied (orange cloud)** — required by the `cloudflare-only`
 Traefik middleware which 403s any non-CF source IP.
 | Host | Type | Values | Proxy |
 |---|---|---|---|
 | `api.myhoneydue.com` | A × 3 | `51.81.83.33`, `51.81.87.86`, `51.81.85.248` | Proxied |
 | `app.myhoneydue.com` | A × 3 | (same trio) | Proxied |
 | `admin.myhoneydue.com` | A × 3 | (same trio) | Proxied |
 | `myhoneydue.com` (apex `@`) | A × 3 | (same trio) | Proxied |
 Cloudflare round-robins among the 3 origins, klipper-lb on whichever node CF
 hits forwards to Traefik, and Traefik routes by Host header. Per-request,
 effectively load-balanced across the 3 nodes for ingress, with no central LB.
 **SSL/TLS mode**: Flexible (CF terminates TLS at the edge; origin is plain
 HTTP on `:80`). Upgrading to Full (strict) is on the deferred list — would
 need an origin certificate provisioned to `cloudflare-origin-cert` secret and
 Traefik configured for TLS termination.
 ---
 ## 5. Filesystem layout (`deploy-k3s/`)
 ```
 deploy-k3s/
 ├── config.yaml                 # Single config source (gitignored; contains tokens)
 ├── config.yaml.example         # Template
 ├── kubeconfig                  # OVH admin kubeconfig (gitignored, 0600)
 ├── kubeconfig.hetzner.bak      # Old Hetzner kubeconfig (unreachable, kept for history)
 ├── kubeconfig.tunnel           # Optional: localhost-pointing copy for SSH-tunnel use
 ├── secrets/
 │   ├── README.md
 │   ├── postgres_password.txt   # Neon DB password
 │   ├── secret_key.txt          # 32+ char app-token signing secret
 │   ├── email_host_password.txt # Fastmail SMTP app password
 │   ├── fcm_server_key.txt      # FCM server key (currently unused — Android push disabled)
 │   ├── apns_auth_key.p8        # APNs auth key (binary)
 │   ├── cloudflare-origin.crt   # Origin certificate (currently unused — CF Flexible)
 │   └── cloudflare-origin.key
 │   (all gitignored except README.md)
 ├── manifests/
 │   ├── namespace.yaml
 │   ├── network-policies.yaml   # default-deny + per-app egress/ingress (13 NetPols total)
 │   ├── rbac.yaml               # api/worker/admin/web/redis ServiceAccounts (NOT applied by 03-deploy.sh; manual once)
 │   ├── pod-disruption-budgets.yaml  # api-pdb, web-pdb, worker-pdb (NOT applied by 03-deploy.sh; manual once)
 │   ├── traefik-helmchartconfig.yaml # Hetzner-only DaemonSet+hostNetwork override (do NOT apply on OVH; we use default klipper-lb)
 │   ├── kyverno-verify-images.yaml   # Operator-gated policy (do NOT apply blindly — see file comment)
 │   ├── api/{deployment,service,hpa}.yaml
 │   ├── worker/deployment.yaml
 │   ├── admin/{deployment,service}.yaml
 │   ├── web/{deployment,service}.yaml
 │   ├── redis/{deployment,service,pvc}.yaml
 │   ├── ingress/{middleware,ingress-simple}.yaml
 │   ├── migrate/job.yaml        # goose migration Job (image-subbed at deploy time)
 │   ├── observability/{kube-state-metrics,vmagent,alloy-logs}.yaml
 │   └── kratos/                 # Ory Kratos identity service (NOT yet deployed; gated on operator OIDC setup)
 └── scripts/
    ├── _config.sh              # Sourced by all scripts: cfg(), generate_env(), generate_cluster_config()
    ├── 01-provision-cluster.sh # Hetzner-Cloud-specific (uses hetzner-k3s CLI) — DO NOT RUN ON OVH
    ├── 02-setup-secrets.sh     # Creates honeydue-secrets etc. from secrets/ + config.yaml; kubeconfig-driven
    ├── 03-deploy.sh            # Build + push + apply manifests + roll deployments; kubeconfig-driven
    ├── 04-verify.sh            # Post-deploy health + security checks; kubeconfig-driven
    └── rollback.sh             # `kubectl rollout undo` across all deployments
 ```
 The `deploy/prod.env` file (sibling to `deploy-k3s/`, gitignored) holds
 observability + admin credentials that `02-setup-secrets.sh` and
 `03-deploy.sh` read but never display:
 ```
 OBS_INGEST_URL       (https://obs.88oakapps.com/api/v1/write)
 OBS_TRACES_URL       (https://obs.88oakapps.com/v1/traces)
 OBS_INGEST_TOKEN     (bearer token for VM + Loki + traces — all use same token)
 GRAFANA_URL          (https://grafana.88oakapps.com)
 GRAFANA_ADMIN_USER   (admin)
 GRAFANA_ADMIN_PASSWORD
 ADMIN_EMAIL / ADMIN_PASSWORD (in-app admin login)
 ```
 ---
 ## 6. Install from clean boxes — the truthful procedure
 This is what we ran on 2026-06-03 to stand up the live cluster, exactly. If
 you ever rebuild from zero this is the canonical sequence. Total wall-clock:
 ~12 min for cluster bootstrap; ~10 min for workloads.
 ### 6.1 Prerequisites
 - 3 fresh Ubuntu VPS instances (any provider with public IPv4, ≥4 GB RAM,
  ≥40 GB disk)
 - `~/.ssh/config` entries (`ovhcloud1/2/3`) pointing at them, with
  passwordless sudo
 - Local `kubectl` and `curl`
 - The repo's `deploy-k3s/secrets/` populated (or the ability to copy live
  secrets from another running cluster — see §7.2)
 - `deploy/prod.env` populated with obs token + Grafana creds
 ### 6.2 Per-node OS hardening + firewall (all 3 in parallel)
 For each `ovhcloudN`, over SSH:
 ```sh
 export DEBIAN_FRONTEND=noninteractive
 sudo apt-get update -qq
 sudo apt-get install -y -qq fail2ban unattended-upgrades open-iscsi curl ufw
 sudo systemctl enable --now iscsid fail2ban
 sudo dpkg-reconfigure -f noninteractive -plow unattended-upgrades
 sudo ufw --force reset
 sudo ufw default deny incoming
 sudo ufw default allow outgoing
 sudo ufw allow 22/tcp
 sudo ufw allow 80/tcp
 sudo ufw allow 443/tcp
 sudo ufw allow 6443/tcp
 SELF=$(hostname -I | awk '{print $1}')
 for peer in 51.81.83.33 51.81.87.86 51.81.85.248; do
  [ "$peer" = "$SELF" ] && continue
  sudo ufw allow from "$peer" to any port 2379:2380 proto tcp
  sudo ufw allow from "$peer" to any port 10250        proto tcp
  sudo ufw allow from "$peer" to any port 51820        proto udp
  sudo ufw allow from "$peer" to any port 8472         proto udp
 done
 sudo ufw --force enable
 ```
 **Watch ordering:** `allow 22/tcp` MUST precede `ufw enable`. Existing SSH
 sessions survive (`ufw` only affects new connections), but a misordered script
 locks you out of fresh logins.
 ### 6.3 Install k3s on `ovhcloud1` (the init node)
 ```sh
 ssh ovhcloud1 'curl -sfL https://get.k3s.io | \
  INSTALL_K3S_VERSION=v1.34.6+k3s1 \
  sh -s - server \
    --cluster-init \
    --node-ip=51.81.83.33 \
    --node-external-ip=51.81.83.33 \
    --advertise-address=51.81.83.33 \
    --flannel-backend=wireguard-native \
    --flannel-external-ip \
    --secrets-encryption \
    --write-kubeconfig-mode=0600 \
    --tls-san=51.81.83.33 \
    --tls-san=51.81.87.86 \
    --tls-san=51.81.85.248 \
    --disable-cloud-controller'
 ```
 Wait for `sudo k3s kubectl get nodes` to show this node Ready (~2-5 s).
 Read the cluster token:
 ```sh
 ssh ovhcloud1 'sudo cat /var/lib/rancher/k3s/server/node-token'
 ```
 ### 6.4 Join `ovhcloud2`, then `ovhcloud3` (sequential)
 Joining etcd one node at a time avoids split-brain on slow networks.
 Replace `<TOKEN>` with the value from 6.3.
 For `ovhcloud2`:
 ```sh
 ssh ovhcloud2 'curl -sfL https://get.k3s.io | \
  INSTALL_K3S_VERSION=v1.34.6+k3s1 \
  K3S_TOKEN=<TOKEN> \
  sh -s - server \
    --server=https://51.81.83.33:6443 \
    --node-ip=51.81.87.86 \
    --node-external-ip=51.81.87.86 \
    --advertise-address=51.81.87.86 \
    --flannel-backend=wireguard-native \
    --flannel-external-ip \
    --secrets-encryption \
    --write-kubeconfig-mode=0600 \
    --tls-san=51.81.83.33 --tls-san=51.81.87.86 --tls-san=51.81.85.248 \
    --disable-cloud-controller'
 ```
 Then run the identical command for `ovhcloud3`, replacing all three per-node
 addresses: `--node-ip=51.81.85.248`, `--node-external-ip=51.81.85.248` and
 `--advertise-address=51.81.85.248`. After each join, wait for
 `kubectl get nodes` to show the new node Ready before proceeding.
 ### 6.5 Pull kubeconfig to the operator workstation
 ```sh
 ssh ovhcloud1 'sudo cat /etc/rancher/k3s/k3s.yaml' \
  | sed 's|server: https://127.0.0.1:6443|server: https://51.81.83.33:6443|' \
  > deploy-k3s/kubeconfig
 chmod 600 deploy-k3s/kubeconfig
 export KUBECONFIG=$(pwd)/deploy-k3s/kubeconfig
 kubectl get nodes -o wide       # All 3 Ready, INTERNAL-IP = public IP
 ```
 ### 6.6 Label the redis node
 ```sh
 kubectl label node vps-1624d691 honeydue/redis=true --overwrite
 ```
 (Use whichever k8s node name corresponds to `ovhcloud1`. The Redis
 Deployment's `nodeSelector` binds to this label.)
 ### 6.7 Bootstrap manifests NOT applied by `03-deploy.sh`
 These must be applied manually on a fresh cluster, **before** running
 `03-deploy.sh`, or workloads will fail to schedule:
 ```sh
 kubectl apply -f deploy-k3s/manifests/rbac.yaml
 kubectl apply -f deploy-k3s/manifests/pod-disruption-budgets.yaml
 ```
 `rbac.yaml` creates the 5 ServiceAccounts (`api`, `worker`, `admin`, `web`,
 `redis`) referenced by the Deployment manifests. Without these, ReplicaSets
 hang on `FailedCreate: error looking up service account` and pods never
 start. Symptom on first deploy: `kubectl get deploy` shows `0 up-to-date`
 across the board with no pod activity — see §9 *Gotchas*.
 **Do NOT apply** `traefik-helmchartconfig.yaml` (Hetzner-only — see §10) or
 `kyverno-verify-images.yaml` (gated on operator Kyverno install).
 ### 6.8 Seed secrets
 Two paths; pick whichever fits your situation:
 **Path A — clean install from local files** (the original design):
 ```sh
 KUBECONFIG=$(pwd)/deploy-k3s/kubeconfig ./deploy-k3s/scripts/02-setup-secrets.sh
 ```
 Requires `deploy-k3s/secrets/` to contain real `postgres_password.txt`,
 `secret_key.txt`, `email_host_password.txt`, `fcm_server_key.txt`,
 `apns_auth_key.p8`, `cloudflare-origin.crt`, `cloudflare-origin.key`. The
 script reads `config.yaml` for `registry.*`, `redis.password`,
 `admin.basic_auth_*`, and `storage.b2_*`.
 **Path B — clone live secrets from another running cluster** (what we
 actually did during the migration; useful if `secrets/` is empty or you want
 exact-byte equivalence):
 ```sh
 HETZNER=$(pwd)/deploy-k3s/kubeconfig.hetzner.bak   # or any kubeconfig with the secrets
 OVH=$(pwd)/deploy-k3s/kubeconfig
 kubectl --kubeconfig=$OVH apply -f deploy-k3s/manifests/namespace.yaml
 for S in honeydue-secrets honeydue-apns-key gitea-credentials cloudflare-origin-cert admin-basic-auth; do
  kubectl --kubeconfig=$HETZNER -n honeydue get secret $S -o json \
    | python3 -c "
 import json, sys
 d = json.load(sys.stdin)
 m = d['metadata']
 for k in ('uid','resourceVersion','creationTimestamp','generation','managedFields','ownerReferences','selfLink'):
    m.pop(k, None)
 m.pop('annotations', None)
 print(json.dumps(d))" \
    | kubectl --kubeconfig=$OVH apply -f -
 done
 ```
 After either path, verify:
 ```sh
 kubectl -n honeydue get secrets
 # Expect: admin-basic-auth, cloudflare-origin-cert, gitea-credentials,
 #         honeydue-apns-key, honeydue-secrets
 ```
 ### 6.9 Deploy workloads
 ```sh
 KUBECONFIG=$(pwd)/deploy-k3s/kubeconfig \
  ./deploy-k3s/scripts/03-deploy.sh --skip-build --tag latest
 ```
 - `--skip-build` skips Docker build + push, deploys whatever's already in the
  registry at the named tag. Use this when migrating between clusters to
  guarantee both run identical bits.
 - Without flags it builds the api / worker / admin / web images from the
  local repo HEAD and pushes to `gitea.treytartt.com` first.
 - The script applies (in order): namespace, network-policies (13 of them),
  redis, ingress, then runs the goose migration Job (blocking on success),
  then api / worker / admin / web Deployments, then observability
  (kube-state-metrics, vmagent, alloy-logs).
 - It does NOT apply: `rbac.yaml`, `pod-disruption-budgets.yaml`,
  `traefik-helmchartconfig.yaml`, `kyverno-verify-images.yaml`. The first
  two must be applied manually (see §6.7); the latter two are Hetzner-only
  or operator-gated.
 - It does NOT apply: anything under `kratos/` (skipped until
  `kratos-secrets` exists, which requires real OIDC client IDs).
 ### 6.10 Verify
 ```sh
 KUBECONFIG=$(pwd)/deploy-k3s/kubeconfig ./deploy-k3s/scripts/04-verify.sh
 ```
 Expect: all deployments `READY=desired`, 13 NetworkPolicies, 7 ServiceAccounts
 (api, worker, admin, web, redis, vmagent, alloy-logs), 3 PDBs, cloudflare-only
 middleware present, in-cluster `/api/health/` returns 200.
 External smoke test (bypasses DNS by sending the `Host` header straight to
 each node IP; the api `/api/health/` route is not behind the cloudflare-only
 middleware, so direct-IP requests work for diagnostics):
 ```sh
 for IP in 51.81.83.33 51.81.87.86 51.81.85.248; do
  curl -s -o /dev/null -w "$IP -> %{http_code}\n" \
    -H 'Host: api.myhoneydue.com' http://$IP/api/health/
 done
 # All three should return 200.
 ```
 ### 6.11 DNS cutover (if migrating)
 In the Cloudflare dashboard for `myhoneydue.com`, set the 4 hostnames in §4 to
 the OVH IPs and keep proxied. Effective propagation ~30 s to 5 min through
 the Cloudflare proxy.
 If you have a previous cluster, **scale its worker to 0 before flipping** to
 avoid scheduled-job double-fires:
 ```sh
 KUBECONFIG=<previous>    kubectl -n honeydue scale deploy/worker --replicas=0
 # (cut DNS)
 KUBECONFIG=<new>         kubectl -n honeydue scale deploy/worker --replicas=1
 ```
 Run the two `scale` commands back-to-back, with the DNS change in between.
 Worker work is mostly scheduled (hourly+), so a brief gap with no worker
 running is harmless; overlap would cause duplicate emails.
 ---
 ## 7. Day-to-day operations
 ### Common kubectl one-liners
 ```sh
 export KUBECONFIG=$(pwd)/deploy-k3s/kubeconfig
 # Cluster state
 kubectl get nodes -o wide
 kubectl -n honeydue get pods
 kubectl -n honeydue get deploy
 kubectl top nodes
 kubectl -n honeydue top pods
 # Tail logs
 kubectl -n honeydue logs deploy/api -f --tail=50
 kubectl -n honeydue logs -l app.kubernetes.io/name=api -f --tail=20
 stern -n honeydue api               # if stern is installed (multi-pod)
 # Restart a deployment (no image change, picks up ConfigMap changes)
 kubectl -n honeydue rollout restart deploy/api
 # Rollback one revision
 kubectl -n honeydue rollout undo deploy/api
 # Scale (worker MUST stay at 0 or 1)
 kubectl -n honeydue scale deploy/api --replicas=4
 # Get into a pod
 kubectl -n honeydue exec -it deploy/api -- sh
 ```
 ### Redeploy after code changes
 ```sh
 KUBECONFIG=$(pwd)/deploy-k3s/kubeconfig ./deploy-k3s/scripts/03-deploy.sh
 ```
 Builds images from local HEAD, tags with the git short SHA, pushes to Gitea,
 runs `goose up` (idempotent), rolls api/worker/admin/web. Total: ~3-5 min
 when images change.
 To deploy without rebuilding (pin to a specific tag):
 ```sh
 ./deploy-k3s/scripts/03-deploy.sh --skip-build --tag <tag>   # e.g. a git short SHA, or latest
 ```
 ### Migrations
 Goose migrations live in `migrations/`. New file pattern:
 ```
 make migrate-new name=add_foo_column     # generates migrations/YYYYMMDDHHMMSS_add_foo_column.sql
 # Edit the file with -- +goose Up / -- +goose Down sections
 ```
 `03-deploy.sh` runs a one-shot Job (`manifests/migrate/job.yaml`) that
 executes `goose up` against Neon (direct compute endpoint, not pooler — see
 file comment). The Job blocks api/worker rollout and aborts the deploy on
 failure. No app pod runs `AutoMigrate`; api/worker startup verifies
 `goose_db_version` is current and refuses to boot on mismatch.
 ### Grafana
 URL: https://grafana.88oakapps.com (creds in `deploy/prod.env`)
 Three dashboards in the `honeyDue` folder:
 | UID | Title | Use |
 |---|---|---|
 | `honeydue-eli5-overview` | honeyDue — Overview (ELI5) | Single-screen at-a-glance health: pods up, crashes, errors, RPS, latency, Postgres, memory, top endpoints, push failures, worker activity, recent error logs. Created 2026-06-03. |
 | `honeydue-red` | honeyDue API — RED | Rate/Errors/Duration cuts (legacy) |
 | `honeydue-logs` | honeyDue — Production Logs | Live log explorer |
 For the ELI5 dashboard's queries, **api-side metrics use `service="api"`,
 NOT `namespace="honeydue"`.** vmagent's scrape config drops the namespace
 label from api metrics — only `service`, `pod`, `node`, `job`, plus the
 metric's own labels (route, method, status, etc.) survive. Queries that
 filter on `namespace="honeydue"` for api metrics silently match nothing.
 ### kubectl tunnel (if 6443 is firewalled to your IP)
 Currently `6443` is open WAN-side (matching the previous Hetzner posture).
 If you tighten that to operator-IPs-only and your IP changes, use an SSH
 tunnel:
 ```sh
 ssh -fN -o ExitOnForwardFailure=yes -o ServerAliveInterval=30 \
    -i ~/.ssh/ovhcloud \
    -L 127.0.0.1:6443:127.0.0.1:6443 \
    ubuntu@51.81.83.33
 cp deploy-k3s/kubeconfig deploy-k3s/kubeconfig.tunnel
 sed -i.bak 's|https://51.81.83.33:6443|https://127.0.0.1:6443|' deploy-k3s/kubeconfig.tunnel
 export KUBECONFIG="$(pwd)/deploy-k3s/kubeconfig.tunnel"
 ```
 ---
 ## 8. Disaster recovery
 ### "I lost the kubeconfig"
 ```sh
 ssh ovhcloud1 'sudo cat /etc/rancher/k3s/k3s.yaml' \
  | sed 's|server: https://127.0.0.1:6443|server: https://51.81.83.33:6443|' \
  > deploy-k3s/kubeconfig
 chmod 600 deploy-k3s/kubeconfig
 ```
 If `ovhcloud1` is down but `ovhcloud2` or `3` is up, swap host and IP — the
 TLS SAN covers all three.
 ### "A node is unresponsive"
 ```sh
 kubectl drain vps-XXX --ignore-daemonsets --delete-emptydir-data
 # Reboot via OVH manager or:
 ssh ovhcloudN sudo reboot
 # Wait for Ready, then:
 kubectl uncordon vps-XXX
 ```
 The cluster tolerates 1 node down (etcd quorum 2/3). With 2 down, etcd
 loses quorum and the API server stops accepting writes.
 ### "etcd quorum lost (2+ nodes dead)"
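 Bring nodes back online if possible. If not, stop the k3s service on
 `ovhcloud1` first (`sudo systemctl stop k3s`, per the k3s restore
 procedure), then reset the cluster from the most recent snapshot: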
 ```sh
 ssh ovhcloud1 'sudo k3s server --cluster-reset --cluster-reset-restore-path=/var/lib/rancher/k3s/server/db/snapshots/<latest>'
 ```
 k3s takes automatic etcd snapshots every 12h, keeping 5. List with:
 ```sh
 ssh ovhcloud1 sudo ls -la /var/lib/rancher/k3s/server/db/snapshots/
 ```
 The cluster reset is destructive — workload state since the snapshot is
 lost, but Neon (actual app data) is unaffected.
 ### "I have to rebuild the whole cluster from scratch"
 Provision 3 fresh boxes, then follow the sequence in §6 exactly. End-to-end
 is ~30 min. The dependencies that make this possible:
 | Stays put through rebuild | Where |
 |---|---|
 | Application data | Neon Postgres (managed) |
 | User uploads | Backblaze B2 (managed) |
 | Container images | `gitea.treytartt.com` (self-hosted, but not on the OVH cluster) |
 | Operator secrets | `deploy-k3s/secrets/` + `config.yaml` + `deploy/prod.env` on the operator workstation (gitignored) |
 | DNS | Cloudflare control panel |
 If `gitea.treytartt.com` is on the same OVH cluster, you have a circular
 dependency — rebuilding requires images you can't pull until the cluster is
 up. Currently Gitea is NOT in the honeyDue cluster (separate Hetzner-era
 host), so this isn't a problem today, but worth flagging if that ever
 changes.
 ### "Cutover back to Hetzner / failover to a backup cluster"
 There is **no warm standby today.** Bringing up a second cluster is the
 same §6 procedure on different hardware, then a Cloudflare DNS swap. The
 worker-swap dance is critical:
 ```sh
 KUBECONFIG=<current>  kubectl -n honeydue scale deploy/worker --replicas=0
 # (Update Cloudflare DNS to new cluster's IPs — proxied)
 KUBECONFIG=<new>      kubectl -n honeydue scale deploy/worker --replicas=1
 ```
 ---
 ## 9. Known gotchas
 ### 9.1 First-deploy "0 up-to-date" across all Deployments
 **Symptoms:** `kubectl get deploy` shows `READY 0/N, UP-TO-DATE 0` for
 api/worker/admin/web/redis. `kubectl get events` shows
 `FailedCreate: error looking up service account honeydue/<name>: serviceaccount "..." not found`.
 **Cause:** `rbac.yaml` (ServiceAccounts) is NOT applied by `03-deploy.sh`. On
 a fresh cluster the SAs don't exist; the ReplicaSet controller can't create
 pods.
 **Fix:**
 ```sh
 kubectl apply -f deploy-k3s/manifests/rbac.yaml
 kubectl -n honeydue rollout restart deploy/api deploy/worker deploy/admin deploy/web deploy/redis
 ```
 This was hit during the 2026-06-03 OVH bootstrap. Permanently fix by adding
 `kubectl apply -f rbac.yaml` to `03-deploy.sh` between the namespace and
 network-policies apply, but until that lands, follow §6.7 on every fresh
 cluster.
 ### 9.2 vmagent SD broken on fresh deploy ("0 pods up" in Grafana)
 **Symptoms:**
 - Grafana panels using `kube_*` metrics or `up{job=...}` show 0
 - vmagent logs: `dial tcp 10.43.0.1:443: connect: connection refused` every ~30 s
 - Direct test from a pod also refused
 **Cause:** k3s's NetworkPolicy controller evaluates egress rules *after*
 kube-proxy's DNAT (not before, contrary to spec). Pod-to-`kubernetes`-Service
 (`10.43.0.1:443`) gets DNAT'd to `<node_ip>:6443`, *then* the policy check
 runs. Without an explicit egress rule for `:6443`, the packet is rejected.
 The `allow-egress-from-vmagent` NetPol in `network-policies.yaml` includes
 both rules:
 ```yaml
 - to:
    - ipBlock: { cidr: 10.43.0.0/16 }
  ports:
    - { port: 443, protocol: TCP }
 - to:
    - ipBlock:
        cidr: 0.0.0.0/0
        except: [10.42.0.0/16]
  ports:
    - { port: 6443, protocol: TCP }
 ```
 **If this happens:** confirm `network-policies.yaml` was applied:
 ```sh
 kubectl -n honeydue get netpol allow-egress-from-vmagent -o yaml | grep -A 5 6443
 ```
 Counter-evidence that confirms diagnosis: `kube-state-metrics` in
 `kube-system` works fine (no NetPols in that namespace).
 ### 9.3 vmagent appears healthy but no data in Grafana
 vmagent's `/-/healthy` returns 200 as long as the process is alive and
 remote-write is TCP-functional. It doesn't check that scrapes are actually
 *succeeding*. The liveness probe in `vmagent.yaml` queries `/api/v1/targets`
 and fails the pod if no target is `up`. After ~3 failures (~3 min), kubelet
 recycles it.
 If vmagent runs for weeks but Grafana is empty, the probe was disabled or
 the exec command broke.
 ### 9.4 vmagent bearer token destroyed by direct `kubectl apply`
 The committed `vmagent.yaml` has `bearer_token: TOKEN_PLACEHOLDER`. The real
 token is `sed`-substituted at deploy time by `03-deploy.sh`. Applying the
 file directly:
 ```sh
 kubectl apply -f deploy-k3s/manifests/observability/vmagent.yaml   # WRONG
 ```
 overwrites the Secret with the literal `TOKEN_PLACEHOLDER`, and every
 remote-write then fails with HTTP 401. To restore without a full redeploy:
 ```sh
 OBS_TOKEN_B64=$(kubectl -n honeydue get secret honeydue-secrets \
                  -o jsonpath='{.data.OBS_INGEST_TOKEN}')
 kubectl -n honeydue patch secret vmagent-remote-write --type=json \
  -p="[{\"op\":\"replace\",\"path\":\"/data/bearer_token\",\"value\":\"${OBS_TOKEN_B64}\"}]"
 kubectl -n honeydue rollout restart deploy/vmagent
 ```
 Or just re-run `./deploy-k3s/scripts/03-deploy.sh` — the sed handles it.
 ### 9.5 Dashboard queries: api metrics need `service="api"` not `namespace="honeydue"`
 vmagent's scrape config (`vmagent-config` ConfigMap) explicitly chooses which
 Kubernetes pod-metadata labels to copy onto each scraped series. **Namespace
 isn't one of them.** Labels you can use on api-side metrics:
 - `service` (literal `"api"`)
 - `job` (literal `"api"`)
 - `pod` (the api pod name)
 - `node` (the k8s node name)
 - `cluster` (vmagent external_label, currently `"honeydue-k3s"`)
 - `environment` (vmagent external_label, currently `"prod"`)
 - Plus each metric's own labels (`method`, `route`, `status` for HTTP; etc.)
 `kube_*` metrics from kube-state-metrics DO carry `namespace` natively
 (KSM publishes it as a label, vmagent passes it through). Loki streams have
 `namespace` because alloy-logs explicitly relabels it. So the rule is:
 | Metric prefix | Use |
 |---|---|
 | `kube_*` | `namespace="honeydue"` |
 | `http_*`, `gorm_*`, `go_*`, `process_*` (api) | `service="api"` |
 | Loki logs `{...}` | `namespace="honeydue"` |
 ### 9.6 Cluster-label collision when two clusters run together
 Both Hetzner and OVH vmagents push as `cluster=honeydue-k3s, environment=prod`
 (same external_labels). During the migration overlap this made dashboards
 sum both clusters' data. The simplest narrowing during overlap is by node
 name pattern (`node=~"vps-.*"` for OVH, `node=~"ubuntu-.*"` for Hetzner). If
 you ever bring up a backup cluster long-term, change one cluster's
 `external_labels.cluster` to something distinct (e.g. `honeydue-ovh`
 vs. `honeydue-backup`).
 ### 9.7 Worker double-firing scheduled jobs
 If two `worker` Deployments run concurrently (e.g. two clusters both pointing
 at the same Neon DB), Asynq schedulers each fire crons independently — users
 get duplicate emails. Workaround: scale all-but-one worker to 0. This is the
 exact mechanic used during cutovers (§6.11).
 ### 9.8 Node kubeconfig mode
 `/etc/rancher/k3s/k3s.yaml` on each node is mode `0600` because we install
 with `--write-kubeconfig-mode=0600`. That is also the k3s default; passing
 the flag explicitly pins the restrictive mode rather than leaving it
 implied. Don't change it without coordinating — any tooling on the node
 that expects a different mode (none today) will break.
 ---
 ## 10. Differences from MIGRATION_NOTES.md (Hetzner-era)
 `MIGRATION_NOTES.md` documents the Swarm → k3s migration on Hetzner
 (2026-04-24). Most of it still applies, with these OVH-specific deltas:
 | What MIGRATION_NOTES says | What OVH actually has |
 |---|---|
 | `hetzner-k3s` provisioner | Manual k3s install (§6) |
 | Hetzner Load Balancer (not used) → Cloudflare round-robin | Same — Cloudflare round-robin (§4) |
 | Traefik as DaemonSet + hostNetwork via HelmChartConfig | Traefik default Deployment + klipper-lb svclb DaemonSet. The `traefik-helmchartconfig.yaml` file is **NOT applied** on OVH. |
 | `servicelb` disabled (`--disable=servicelb`) | `servicelb` enabled (we didn't pass `--disable=servicelb`). This is what makes klipper-lb work. |
 | sysctl `net.ipv4.ip_unprivileged_port_start=0` for hostNetwork Traefik | Not needed — klipper-lb proxies the port binding instead |
 | UFW rules between 3 Hetzner IPs | UFW rules between 3 OVH IPs (51.81.83.33, 51.81.87.86, 51.81.85.248) |
 | Kubeconfig at `~/.kube/honeydue-k3s.yaml` | Kubeconfig at `deploy-k3s/kubeconfig` |
 | TLS at origin: not configured (CF Flexible) | Same — CF Flexible. `cloudflare-origin-cert` Secret exists (carried over) but Ingress doesn't reference it. |
 ---
 ## 11. Outstanding follow-ups (deferred, not blocking)
 1. **No warm standby / rollback cluster.** OVH is solo production. An OVH
   outage is a real outage; recovery time = §6 procedure (~30 min). User
   plans to bring a second cluster up as a target.
 2. **UFW allows 80/443 from world.** Hetzner had a network-layer Cloudflare-IP
   allowlist on these ports. OVH currently relies on the L7
   `cloudflare-only` Traefik middleware, which protects admin but NOT api /
    web / apex (those routes have to be reachable from anywhere, which leaves
    them trivially DDoSable by traffic that bypasses Cloudflare and hits the
    node IPs directly). Fix: add ufw allow rules restricting `80/tcp` and
    `443/tcp` to Cloudflare's published IP ranges (currently ~15 IPv4
    prefixes from https://www.cloudflare.com/ips-v4/, plus ~7 IPv6 prefixes
    from https://www.cloudflare.com/ips-v6/).
 3. **Cloudflare TLS Flexible → Full(strict).** Origin certs exist as Secret
   but Ingress doesn't terminate TLS. Upgrading to Full(strict) requires
   Traefik configured with the cert + an HTTPS entrypoint + Ingress
   `tls:` block.
 4. **`rbac.yaml` + `pod-disruption-budgets.yaml` should be in `03-deploy.sh`.**
   They're currently bootstrap-only. Adding them is idempotent and prevents
   the §9.1 footgun.
 5. **Push notification metrics are log-derived, not counters.** Successes
   aren't logged or counted. Proper Prometheus instrumentation (~15 lines in
   `internal/push/client.go`) would give a real success/failure ratio.
 6. **Worker has no `/metrics` endpoint.** `cmd/worker/main.go` serves `:6060`
   for healthz only. Adding Asynq's `metrics.NewPrometheusExporter()` + a
   ServiceMonitor + uncommenting the `worker` job stanza in
   `vmagent-config` ConfigMap would give real queue depth and job latency.
 7. **Ory Kratos.** Manifests exist (`manifests/kratos/`) but the deploy
   is gated on operator-side prerequisites (Neon `kratos` database,
   `auth.myhoneydue.com` DNS, real Apple+Google OIDC clients, Kratos image
   tag pinned). Until `kratos-secrets` exists, `03-deploy.sh` silently
   skips the Kratos apply.
 8. **Hetzner cluster fully retired?** `config.yaml` `nodes:` block describes
   OVH; the bak kubeconfig is at `kubeconfig.hetzner.bak`. Boxes themselves
   are operator-managed.
 ### 11.1 Dashboard observability gaps (raised 2026-06-03 during dashboard build)
 Surfaced while building the `honeydue-eli5-overview` Grafana dashboard. Each
 needs code or infra changes to expose; none blocks today's operations.
 9. **node-exporter not deployed.** No node-level metrics today
   (`node_filesystem_avail_bytes`, `node_memory_*`, `node_load1`, etc.).
   The dashboard's pod-level memory/CPU panels are app-process only — a
   node running out of disk would silently fail the cluster before any
   dashboard signal showed it. Highest-priority Tier-3 item. Fix: deploy
   `node-exporter` as a DaemonSet (~50 lines of YAML), add a scrape stanza
   to `vmagent-config`, add a `Node disk free` stat panel.
 10. **Traefik metrics not enabled.** Traefik can expose `/metrics` with
    `traefik_entrypoint_requests_total` + `traefik_service_request_duration_seconds`,
    giving edge-level visibility into requests that never reached api
    pods (404s, redirects, middleware blocks). Enable via a
    HelmChartConfig override that sets `metrics.prometheus.entryPoint=metrics`
    + adds a `:9100` entryPoint + a scrape stanza. Skipped today to avoid
    Traefik restart risk; safe additive change when ready.
 11. **Push notification success/failure counters** (already #5). Add
    `prometheus.NewCounterVec` in `internal/push/client.go` with labels
    `platform={ios,android}, outcome={success,failed,breaker_open,disabled}`.
    Increments at every Send/SendActionable branch. Replaces the
    log-derived "Push failures" stat on the dashboard with a real success
    rate.
 12. **Worker queue / job metrics** (already #6). Asynq has a built-in
    Prometheus exporter (`asynq/x/metrics`). Wire it into the worker's
    `:6060` health server (a single `healthMux.Handle` line) and
    uncomment the worker scrape stanza in `vmagent-config`. Surfaces
    queue depth, retry count, processing time per task type.
 13. **Cache hit / miss rate.** `internal/services/cache_service.go` has
    no counters. Add a Counter with labels `{operation=get|set, result=hit|miss}`
    around the cache wrapper. ~10 lines. Useful once real traffic flows
    to verify the ETag and Redis caches are paying their keep.
 14. **APNs send-latency histogram.** Wrap `internal/push/apns.go::Send`
    in a `prometheus.NewHistogramVec` keyed on outcome. Tells you when
    Apple's gateway is slow (which correlates with their incident page).
 ---
 ## 12. Audit trail
 | Date | Change |
 |---|---|
 | 2026-04-24 | Initial k3s cluster on Hetzner (Swarm → k3s migration) — see MIGRATION_NOTES.md |
 | 2026-04-25 | `config.yaml` reconstructed from live ConfigMap (original file lost) |
 | 2026-05-15 | Audit fixes: Redis auth required, admin basic auth, secrets-encryption flag |
 | 2026-05-16 | `02-setup-secrets.sh` started carrying B2 credentials (was a manifest/script drift) |
 | 2026-06-02 | Kratos scaffolding committed (not deployed) |
 | 2026-06-03 | **Hetzner → OVH BHS cutover.** New 3-node cluster on 51.81.83.33, .87.86, .85.248. DNS cut on Cloudflare. Hetzner kubeconfig moved to `.bak`. Grafana `honeydue-eli5-overview` dashboard created. Hetzner cluster powered off later same day. |
 | 2026-06-03 | Dashboard build-out: extended `honeydue-eli5-overview` to 22 panels covering Tier-1 (HTTP status, CPU per pod, goroutines, top slow) and Tier-2 (GC, network I/O, pod uptime, top 5xx) signals. Surfaced Tier-3 instrumentation gaps in §11.1. |
@@ -30,6 +30,7 @@ load_balancer_ip: ""
 domains:
  api: api.myhoneydue.com
  admin: admin.myhoneydue.com
  app: app.myhoneydue.com                   # web client host — added to CORS_ALLOWED_ORIGINS
  base: myhoneydue.com
 # --- Container Registry (GHCR) ---
@@ -62,7 +63,7 @@ email:
 push:
  apns_key_id: ""
  apns_team_id: ""
-  apns_topic: com.tt.honeyDue
+  apns_topic: com.myhoneydue.honeyDue
  apns_production: true
  apns_use_sandbox: false
@@ -72,8 +73,13 @@ storage:
  b2_app_key: ""
  b2_bucket: ""
  b2_endpoint: ""                           # e.g. s3.us-west-004.backblazeb2.com
  b2_region: ""                             # e.g. us-west-004 (must match the region in b2_endpoint)
  b2_use_ssl: true
  max_file_size: 10485760
  allowed_types: "image/jpeg,image/png,image/gif,image/webp,application/pdf"
  upload_dir: /app/uploads                  # filesystem path inside the api container
  base_url: /uploads                        # public URL prefix served by the api
  static_dir: /app/static                   # static asset path inside the api container
 # --- Worker Schedules (UTC hours) ---
 worker:
@@ -100,8 +106,10 @@ admin:
  basic_auth_password: ""                   # HTTP basic auth password for admin panel
 # --- Apple Auth / IAP (optional, leave empty if unused) ---
 # client_id MUST equal the iOS Release bundle ID — Apple identity tokens
 # are rejected if the `aud` claim doesn't match.
 apple_auth:
-  client_id: ""
+  client_id: "com.myhoneydue.honeyDue"
  team_id: ""
  iap_key_id: ""
  iap_issuer_id: ""
@@ -23,8 +23,11 @@ spec:
        app.kubernetes.io/part-of: honeydue
    spec:
      serviceAccountName: admin
      # Explicit pod-level opt-out (audit F11) — defense-in-depth on top of
      # the ServiceAccount-level setting in rbac.yaml.
      automountServiceAccountToken: false
      imagePullSecrets:
-        - name: ghcr-credentials
+        - name: gitea-credentials
      securityContext:
        runAsNonRoot: true
        runAsUser: 1001
@@ -35,6 +38,7 @@ spec:
      containers:
        - name: admin
          image: IMAGE_PLACEHOLDER  # Replaced by 03-deploy.sh
          imagePullPolicy: IfNotPresent  # audit CODE-L4 — explicit; images are SHA/digest-pinned
          ports:
            - containerPort: 3000
              protocol: TCP
@@ -82,7 +86,7 @@ spec:
            timeoutSeconds: 5
          livenessProbe:
            httpGet:
-              path: /admin/
+              path: /
              port: 3000
            initialDelaySeconds: 30
            periodSeconds: 30
@@ -23,8 +23,11 @@ spec:
        app.kubernetes.io/part-of: honeydue
    spec:
      serviceAccountName: api
      # Explicit pod-level opt-out (audit F11) — defense-in-depth on top of
      # the ServiceAccount-level setting in rbac.yaml.
      automountServiceAccountToken: false
      imagePullSecrets:
-        - name: ghcr-credentials
+        - name: gitea-credentials
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
@@ -35,6 +38,7 @@ spec:
      containers:
        - name: api
          image: IMAGE_PLACEHOLDER  # Replaced by 03-deploy.sh
          imagePullPolicy: IfNotPresent  # audit CODE-L4 — explicit; images are SHA/digest-pinned
          ports:
            - containerPort: 8000
              protocol: TCP
@@ -46,34 +50,16 @@ spec:
          envFrom:
            - configMapRef:
                name: honeydue-config
-          env:
+          # Audit CODE-F8: secrets are NOT injected as environment variables.
-            - name: POSTGRES_PASSWORD
+          # Env vars are readable for the life of the pod via /proc/<pid>/environ
-              valueFrom:
+          # and leak into crash dumps / child processes. honeydue-secrets is
-                secretKeyRef:
+          # mounted read-only at /etc/honeydue/secrets (mode 0400) and the Go
-                  name: honeydue-secrets
+          # config layer (config.loadFileSecrets) reads each key from its file.
-                  key: POSTGRES_PASSWORD
+          # Non-secret config still arrives via the configMapRef above.
            - name: SECRET_KEY
              valueFrom:
                secretKeyRef:
                  name: honeydue-secrets
                  key: SECRET_KEY
            - name: EMAIL_HOST_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: honeydue-secrets
                  key: EMAIL_HOST_PASSWORD
            - name: FCM_SERVER_KEY
              valueFrom:
                secretKeyRef:
                  name: honeydue-secrets
                  key: FCM_SERVER_KEY
            - name: REDIS_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: honeydue-secrets
                  key: REDIS_PASSWORD
                  optional: true
          volumeMounts:
            - name: app-secrets
              mountPath: /etc/honeydue/secrets
              readOnly: true
            - name: apns-key
              mountPath: /secrets/apns
              readOnly: true
@@ -90,11 +76,12 @@ spec:
            httpGet:
              path: /api/health/
              port: 8000
-            # MigrateWithLock in cmd/api/main.go runs pg_advisory_lock on
+            # Schema migrations run separately in the honeydue-migrate Job
-            # every startup. On a cold boot with 3 replicas, the first does
+            # *before* this Deployment rolls — the api itself does not migrate
-            # AutoMigrate (~90s) and the others wait on the lock, so real
+            # (it only verifies goose_db_version at boot). Cold start still
-            # startup runs 90–240s. 48 × 5s = 240s grace absorbs it without
+            # pays the DB pool warm-up + Redis connect + APNs/FCM client init
-            # healthcheck killing a still-starting replica.
+            # before /api/health/ goes green. 48 × 5s = 240s grace keeps the
            # probe from killing a still-starting replica.
            failureThreshold: 48
            periodSeconds: 5
          readinessProbe:
@@ -112,6 +99,12 @@ spec:
            periodSeconds: 30
            timeoutSeconds: 10
      volumes:
        # Audit CODE-F8: the whole honeydue-secrets Secret, projected as files.
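        # defaultMode 0400 → owner-read only. NOTE(review): secret-volume files are
        # owned by root unless the pod sets fsGroup — confirm the pod securityContext
        # sets fsGroup so the container's runAsUser (1000) can actually read them.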
        - name: app-secrets
          secret:
            secretName: honeydue-secrets
            defaultMode: 0400
        - name: apns-key
          secret:
            secretName: honeydue-apns-key
@@ -0,0 +1,57 @@
 # B2 bucket lifecycle — `uploads/` prefix
 The `pending_uploads` cleanup worker (cron `30 * * * *`, see
 `internal/worker/jobs/handler.go::HandleUploadCleanup`) reaps unclaimed
 upload sessions every hour, deleting both the row and the corresponding B2
 object. This bucket-level lifecycle rule is a **backstop** — it catches B2
 objects that survive the row deletion (e.g. worker crashed mid-loop, B2
 delete errored, manual DB tampering).
 ## Rule
 Apply via the Backblaze web console: **Bucket → `honeyDueProd` → Lifecycle Settings → Custom**
 ```json
 [
  {
    "fileNamePrefix": "uploads/",
    "daysFromUploadingToHiding": 7,
    "daysFromHidingToDeleting": 1
  }
 ]
 ```
 Effect: any object under the `uploads/` prefix is hidden 7 days after
 upload, then permanently deleted 1 day after that. Total maximum lifetime
 of an orphaned object: about 8 days (lifecycle rules are evaluated roughly
 once per day, so allow up to a day of slack).
 This rule does NOT affect:
 - `images/`, `documents/`, `completions/` — legacy multipart-uploaded
  objects, which are managed by the existing `task_completion_image` /
  `document_image` / `document.file_url` references.
 ## Why a backstop, not the primary mechanism
 The application worker is the primary mechanism because:
 1. It can delete the **DB row** alongside the B2 object — lifecycle alone
   would leave dangling `pending_uploads` rows.
 2. It runs hourly vs. lifecycle's once-per-day evaluation — much tighter
   recovery window for the common case.
 3. It produces logs / metrics for orphan rate observability.
 ## Verification
 After applying:
 ```bash
 b2 bucket get-info honeyDueProd | jq '.lifecycleRules'
 ```
 Should show the rule above. If you don't have the B2 CLI:
 ```bash
 curl -u "$B2_KEY_ID:$B2_APP_KEY" https://api.backblazeb2.com/b2api/v3/b2_authorize_account
 # Then use the returned authorization_token + apiUrl to call b2_get_bucket
 ```
@@ -1,11 +1,10 @@
-# Simple hostname-based Ingress — no TLS (Cloudflare Flexible handles edge
+# Hostname-based Ingress with TLS terminated at Traefik using the
-# TLS, CF→origin is plain HTTP on 80). Upgrade to Full (strict) by
+# Cloudflare Origin CA cert (secret/cloudflare-origin-cert). CF→origin
-# adding back a `tls:` block with a Cloudflare Origin CA cert stored in
+# encryption enables CF SSL mode "Full (strict)".
 # secret/cloudflare-origin-cert.
 #
 # Middleware chain (security headers, rate limit, CF-only allowlist, admin
-# basic auth) is defined in `middleware.yaml` but NOT attached here —
+# basic auth) is defined in `middleware.yaml`. security-headers + rate-limit
-# annotate this ingress to turn any of them on.
+# are attached below via annotation.
 apiVersion: networking.k8s.io/v1
 kind: Ingress
 metadata:
@@ -13,8 +12,15 @@ metadata:
  namespace: honeydue
  labels:
    app.kubernetes.io/part-of: honeydue
  annotations:
    traefik.ingress.kubernetes.io/router.middlewares: honeydue-security-headers@kubernetescrd,honeydue-rate-limit@kubernetescrd
 spec:
  ingressClassName: traefik
  tls:
    - hosts:
        - api.myhoneydue.com
        - myhoneydue.com
      secretName: cloudflare-origin-cert
  rules:
    - host: api.myhoneydue.com
      http:
@@ -46,8 +52,19 @@ metadata:
  namespace: honeydue
  labels:
    app.kubernetes.io/part-of: honeydue
  annotations:
    # cloudflare-only + admin-auth wired in (audit F2/F3/CODE-L6). Order
    # matters: reject non-Cloudflare IPs, then basic auth, then headers,
    # then rate limit. The admin-basic-auth secret is created by
    # 02-setup-secrets.sh from config.yaml admin.basic_auth_* — that runs
    # before 03-deploy.sh, so the middleware always has its secret.
    traefik.ingress.kubernetes.io/router.middlewares: honeydue-cloudflare-only@kubernetescrd,honeydue-admin-auth@kubernetescrd,honeydue-security-headers@kubernetescrd,honeydue-rate-limit@kubernetescrd
 spec:
  ingressClassName: traefik
  tls:
    - hosts:
        - admin.myhoneydue.com
      secretName: cloudflare-origin-cert
  rules:
    - host: admin.myhoneydue.com
      http:
@@ -67,8 +84,14 @@ metadata:
  namespace: honeydue
  labels:
    app.kubernetes.io/part-of: honeydue
  annotations:
    traefik.ingress.kubernetes.io/router.middlewares: honeydue-security-headers@kubernetescrd,honeydue-rate-limit@kubernetescrd
 spec:
  ingressClassName: traefik
  tls:
    - hosts:
        - app.myhoneydue.com
      secretName: cloudflare-origin-cert
  rules:
    - host: app.myhoneydue.com
      http:
@@ -80,3 +103,98 @@ spec:
                name: web
                port:
                  number: 3000
 ---
 # Auth-endpoint Ingress (audit F10 / LIVE-L12). A dedicated Ingress for the
 # auth paths so Traefik gives their longer path-prefix routers a higher
 # priority than honeydue-api's "/" router — these paths then get
 # auth-rate-limit (5/min) instead of the general rate-limit (100/min).
 # Anything not matched here falls through to honeydue-api unchanged.
 apiVersion: networking.k8s.io/v1
 kind: Ingress
 metadata:
  name: honeydue-api-auth
  namespace: honeydue
  labels:
    app.kubernetes.io/part-of: honeydue
  annotations:
    traefik.ingress.kubernetes.io/router.middlewares: honeydue-auth-rate-limit@kubernetescrd,honeydue-security-headers@kubernetescrd
 spec:
  ingressClassName: traefik
  tls:
    - hosts:
        - api.myhoneydue.com
      secretName: cloudflare-origin-cert
  rules:
    - host: api.myhoneydue.com
      http:
        paths:
          - path: /api/auth/login
            pathType: Prefix
            backend:
              service:
                name: api
                port:
                  number: 8000
          - path: /api/auth/register
            pathType: Prefix
            backend:
              service:
                name: api
                port:
                  number: 8000
          - path: /api/auth/forgot-password
            pathType: Prefix
            backend:
              service:
                name: api
                port:
                  number: 8000
          - path: /api/auth/reset-password
            pathType: Prefix
            backend:
              service:
                name: api
                port:
                  number: 8000
          - path: /api/residences/join-with-code
            pathType: Prefix
            backend:
              service:
                name: api
                port:
                  number: 8000
          - path: /api/auth/verify-reset-code
            pathType: Prefix
            backend:
              service:
                name: api
                port:
                  number: 8000
          - path: /api/auth/apple-sign-in
            pathType: Prefix
            backend:
              service:
                name: api
                port:
                  number: 8000
          - path: /api/auth/google-sign-in
            pathType: Prefix
            backend:
              service:
                name: api
                port:
                  number: 8000
          - path: /api/auth/refresh
            pathType: Prefix
            backend:
              service:
                name: api
                port:
                  number: 8000
          - path: /api/auth/account
            pathType: Prefix
            backend:
              service:
                name: api
                port:
                  number: 8000
@@ -1,54 +0,0 @@
 # API Ingress — Cloudflare-only + security headers + rate limiting
 apiVersion: networking.k8s.io/v1
 kind: Ingress
 metadata:
  name: honeydue-api
  namespace: honeydue
  labels:
    app.kubernetes.io/part-of: honeydue
  annotations:
    traefik.ingress.kubernetes.io/router.middlewares: honeydue-cloudflare-only@kubernetescrd,honeydue-security-headers@kubernetescrd,honeydue-rate-limit@kubernetescrd
 spec:
  tls:
    - hosts:
        - api.myhoneydue.com
      secretName: cloudflare-origin-cert
  rules:
    - host: api.myhoneydue.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: api
                port:
                  number: 8000
 ---
 # Admin Ingress — Cloudflare-only + security headers + rate limiting + basic auth
 apiVersion: networking.k8s.io/v1
 kind: Ingress
 metadata:
  name: honeydue-admin
  namespace: honeydue
  labels:
    app.kubernetes.io/part-of: honeydue
  annotations:
    traefik.ingress.kubernetes.io/router.middlewares: honeydue-cloudflare-only@kubernetescrd,honeydue-security-headers@kubernetescrd,honeydue-rate-limit@kubernetescrd,honeydue-admin-auth@kubernetescrd
 spec:
  tls:
    - hosts:
        - admin.myhoneydue.com
      secretName: cloudflare-origin-cert
  rules:
    - host: admin.myhoneydue.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: admin
                port:
                  number: 3000
@@ -21,13 +21,24 @@ spec:
  headers:
    frameDeny: true
    contentTypeNosniff: true
-    browserXssFilter: true
+    # browserXssFilter removed (audit L7): it emits the deprecated
    # X-XSS-Protection header, which can itself introduce XSS in legacy
    # browsers. Modern browsers ignore it.
    referrerPolicy: "strict-origin-when-cross-origin"
    customResponseHeaders:
      X-Content-Type-Options: "nosniff"
      X-Frame-Options: "DENY"
-      Strict-Transport-Security: "max-age=31536000; includeSubDomains"
+      # HSTS: 2-year max-age + preload (audit L5/CODE-L3). After this is
-      Content-Security-Policy: "default-src 'self'; frame-ancestors 'none'"
+      # live on api/admin/app, submit myhoneydue.com to hstspreload.org.
      Strict-Transport-Security: "max-age=63072000; includeSubDomains; preload"
      # Cross-origin isolation (audit F9). COEP (require-corp) is omitted —
      # it commonly breaks third-party embeds; add only after testing.
      Cross-Origin-Opener-Policy: "same-origin"
      Cross-Origin-Resource-Policy: "same-origin"
      # Content-Security-Policy is intentionally NOT set here — the Go API
      # sets a CSP in internal/router/router.go that permits Google Fonts
      # for the landing page. Two CSP headers would intersect and break it.
      # admin and web apps set their own CSP via Next.js middleware.
      Permissions-Policy: "camera=(), microphone=(), geolocation=()"
      X-Permitted-Cross-Domain-Policies: "none"
@@ -80,3 +91,24 @@ spec:
  basicAuth:
    secret: admin-basic-auth
    realm: "honeyDue Admin"
 ---
 # Strict rate limit for auth endpoints (audit F10 / LIVE-L12).
 # Applied via the honeydue-api-auth Ingress to every path it routes: login,
 # register, forgot-password, reset-password, verify-reset-code, apple-sign-in,
 # google-sign-in, refresh, account and join-with-code. depth: 2 makes the
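 # limiter key on the real client IP rather than the Cloudflare edge IP
 # (request path: client -> Cloudflare -> Traefik). This is the edge half;
 # the per-account lockout in the Go app is the robust half.
 # NOTE(review): Traefik counts ipStrategy.depth from the right of
 # X-Forwarded-For — confirm depth 2 (not 1) selects the client IP when
 # Cloudflare is the only proxy in front of Traefik.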
 apiVersion: traefik.io/v1alpha1
 kind: Middleware
 metadata:
  name: auth-rate-limit
  namespace: honeydue
 spec:
  rateLimit:
    average: 5
    burst: 10
    period: 1m
    sourceCriterion:
      ipStrategy:
        depth: 2
@@ -0,0 +1,92 @@
 # Ory Kratos — honeyDue identity service (Phase 1: infrastructure)
 This directory deploys [Ory Kratos](https://www.ory.sh/kratos/) into the
 `honeydue` namespace as the identity provider — replacing the hand-rolled auth
 in `internal/services/auth_service.go` etc.
 **Phase 1 is infrastructure only.** Once deployed, Kratos runs but nothing uses
 it yet — the honeyDue Go API still does its own auth. Phase 2 (backend swap)
 and Phase 3 (KMP/web clients) follow. Migrating onto Kratos does not carry
 over existing user data — honeyDue is pre-production, so no user import is done.
 The deploy is **gated**: `03-deploy.sh` applies Kratos only when the
 `kratos-secrets` Secret exists, and `02-setup-secrets.sh` creates that Secret
 only when `config.yaml` has a `kratos:` block. Until then the existing stack
 deploys completely unaffected.
 ## Files
 | File | What |
 |---|---|
 | `configmap.yaml` | `kratos.yml`, identity schema, Google/Apple OIDC claim mappers (no secrets) |
 | `migrate-job.yaml` | `kratos migrate sql` — schema migration, run before the Deployment |
 | `kratos.yaml` | Deployment (×2), Service, NetworkPolicies |
 | `ingress.yaml` | `auth.myhoneydue.com` → Kratos public API :4433 |
 ## Operator prerequisites (must be done before deploying)
 1. **Kratos version** — Ory uses CalVer (`v25.x` / `v26.x`). Pick the current
   stable, then replace `REPLACE_WITH_CURRENT_STABLE_TAG` in `kratos.yaml` and
   `migrate-job.yaml` with `oryd/kratos:vXX.Y@sha256:<digest>`, and set the
   matching `version:` in `configmap.yaml`.
 2. **Kratos database** — create a separate Neon database named `kratos` (do not
   share honeyDue's). Capture its connection string as the DSN.
 3. **DNS** — add `auth.myhoneydue.com` in Cloudflare (proxied), pointing at the
   cluster ingress like the other honeyDue hosts. Confirm the
   `cloudflare-origin-cert` TLS secret covers `auth.myhoneydue.com`.
 4. **Google OAuth client** — Google Cloud Console → create an OAuth 2.0 client.
   Redirect URI: `https://auth.myhoneydue.com/self-service/methods/oidc/callback/google`.
   Put the **client ID** into `configmap.yaml` (`GOOGLE_OAUTH_CLIENT_ID`); the
   **client secret** goes in `config.yaml`.
 5. **Apple Sign In** — Apple Developer → a Services ID + a Sign in with Apple
   key. Return URL: `https://auth.myhoneydue.com/self-service/methods/oidc/callback/apple`.
   Put the **Services ID / Team ID / Key ID** into `configmap.yaml`
   (`APPLE_SERVICES_ID` / `APPLE_TEAM_ID` / `APPLE_PRIVATE_KEY_ID`); the **.p8
   private key** goes in `config.yaml`.
 6. **`config.yaml`** — add a `kratos:` block:
   ```yaml
   kratos:
     dsn: "postgres://USER:PASS@HOST/kratos?sslmode=require"
     secrets_cookie: "<openssl rand -hex 16>"   # generate ONCE, keep stable
     secrets_cipher: "<openssl rand -hex 16>"   # must be exactly 32 chars
     smtp_connection_uri: "smtps://USER:PASS@smtp.fastmail.com:465/"
     google_client_secret: "<from Google Cloud Console>"
     apple_private_key: |
       -----BEGIN PRIVATE KEY-----
       ...
       -----END PRIVATE KEY-----
   ```
   `secrets_cookie` / `secrets_cipher` must stay stable forever — rotating them
   invalidates every session and makes encrypted data unreadable.
 ## Deploy
 ```bash
 cd honeyDueAPI-go
 export KUBECONFIG="$(pwd)/deploy-k3s/kubeconfig"
 ./deploy-k3s/scripts/02-setup-secrets.sh   # creates kratos-secrets from config.yaml
 ./deploy-k3s/scripts/03-deploy.sh          # applies kratos manifests, runs migrate, rolls
 ```
 `03-deploy.sh` applies `configmap.yaml` → runs `migrate-job.yaml` → waits →
 applies `kratos.yaml` + `ingress.yaml`.
 ## Verify
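 - `kubectl -n honeydue get pods -l app.kubernetes.io/name=kratos` — two Deployment pods, each `1/1` `Running` (the completed `kratos-migrate` pod shares this label and may also be listed)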
 - `kubectl -n honeydue logs job/kratos-migrate` — migration succeeded
 - `curl https://auth.myhoneydue.com/health/ready` — `{"status":"ok"}`
 - `curl https://auth.myhoneydue.com/self-service/registration/api` — returns a flow
 ## Not yet done (later phases)
 - **Phase 2** — honeyDue Go backend: swap `middleware/auth.go` for Kratos
  session validation, drop the hand-rolled auth code, rebuild the `users`
  table keyed on the Kratos identity ID.
 - **Phase 3** — KMP mobile + Next.js web clients point at Kratos flows.
 - Admin-panel auth stays on its own JWT (out of scope).
@@ -0,0 +1,232 @@
 # Ory Kratos configuration for honeyDue.
 #
 # Secrets are NOT in this ConfigMap. The DSN, cookie/cipher secrets, SMTP URI
 # and OIDC client secrets are injected as environment variables from the
 # kratos-secrets Secret (see kratos.yaml). Kratos is configured natively via
 # env vars, so this is the idiomatic split — only non-secret config here.
 #
 # OIDC scope: Apple-only as of 2026-06-03. Google is intentionally absent;
 # adding it later is additive — append a `- id: google` block under
 # selfservice.methods.oidc.config.providers (it becomes index 1) and bind a
 # matching CLIENT_SECRET env in kratos.yaml.
 apiVersion: v1
 kind: ConfigMap
 metadata:
  name: kratos-config
  namespace: honeydue
  labels:
    app.kubernetes.io/name: kratos
    app.kubernetes.io/part-of: honeydue
 data:
  kratos.yml: |
    # `version` below is the Kratos config schema version, not the image tag.
    # The image itself is pinned in kratos.yaml + migrate-job.yaml
    # (oryd/kratos:v26.2.0, 2026-06-03). See kratos/README.md.
    version: v1.3.0  # internal config schema version; do not change unless Kratos release notes require it
    serve:
      public:
        base_url: https://auth.myhoneydue.com/
        cors:
          enabled: true
          allowed_origins:
            - https://myhoneydue.com
            - https://app.myhoneydue.com
            - https://admin.myhoneydue.com
          allowed_methods: [GET, POST, PUT, PATCH, DELETE, OPTIONS]
          allowed_headers: [Authorization, Content-Type, X-Session-Token, Cookie]
          exposed_headers: [Content-Type, Set-Cookie]
          # Required: the web clients call Kratos browser flows with
          # credentials (the ory_kratos_session cookie). Safe here because
          # allowed_origins is an explicit list, never a wildcard.
          allow_credentials: true
      admin:
        base_url: http://kratos.honeydue.svc.cluster.local:4434/
    selfservice:
      default_browser_return_url: https://app.myhoneydue.com/
      allowed_return_urls:
        - https://app.myhoneydue.com
        - https://myhoneydue.com
        - honeydue://callback
      methods:
        password:
          enabled: true
        code:                       # email one-time codes (verify/recover)
          enabled: true
        oidc:
          enabled: true
          config:
            providers:
              # index 0 — Apple Sign In. apple_private_key (.p8 contents) is
              # injected via env SELFSERVICE_METHODS_OIDC_CONFIG_PROVIDERS_0_APPLE_PRIVATE_KEY.
              # client_id is the Apple Services ID (here: the bundle ID, which
              # was configured as a Services ID with Sign In with Apple
              # capability — see operator notes in README.md §5).
              - id: apple
                provider: apple
                # Production bundle id. Apple issues id_tokens with
                # `aud` = the requesting app's bundle id, so this is the
                # primary audience Kratos verifies against.
                client_id: com.myhoneydue.honeyDue
                # Debug builds out of Xcode use a `.dev` bundle id (see
                # iosApp/honeyDue.xcodeproj — Debug config). Their id_tokens
                # therefore have `aud: com.myhoneydue.honeyDue.dev`, which
                # the primary client_id check rejects. Whitelist the dev
                # audience so Apple Sign In works from a non-Release Xcode
                # build without per-build Kratos reconfiguration.
                additional_id_token_audiences:
                  - com.myhoneydue.honeyDue.dev
                apple_team_id: X86BR9WTLD
                apple_private_key_id: HQD3NCF99C
                mapper_url: file:///etc/kratos/oidc.apple.jsonnet
                scope: [openid, email, name]
      flows:
        error:
          ui_url: https://app.myhoneydue.com/auth/error
        login:
          ui_url: https://app.myhoneydue.com/auth/login
          lifespan: 10m
        registration:
          ui_url: https://app.myhoneydue.com/auth/registration
          lifespan: 10m
          after:
            password:
              hooks:
                - hook: session     # auto-login after registration
            oidc:
              hooks:
                - hook: session
        verification:
          enabled: true
          ui_url: https://app.myhoneydue.com/auth/verification
          use: code
          after:
            default_browser_return_url: https://app.myhoneydue.com/
        recovery:
          enabled: true
          ui_url: https://app.myhoneydue.com/auth/recovery
          use: code
        settings:
          ui_url: https://app.myhoneydue.com/auth/settings
          privileged_session_max_age: 15m
        logout:
          after:
            default_browser_return_url: https://app.myhoneydue.com/
    log:
      level: info
      format: json
      leak_sensitive_values: false
    ciphers:
      algorithm: xchacha20-poly1305
    hashers:
      algorithm: bcrypt
      bcrypt:
        cost: 12
    identity:
      default_schema_id: honeydue
      schemas:
        - id: honeydue
          url: file:///etc/kratos/identity.schema.json
    courier:
      smtp:
        from_address: noreply@myhoneydue.com
        from_name: honeyDue
        # connection_uri is injected via env COURIER_SMTP_CONNECTION_URI
    session:
      lifespan: 720h                # 30-day sessions (mobile)
      cookie:
        domain: myhoneydue.com
        same_site: Lax
  identity.schema.json: |
    {
      "$id": "https://honeydue.app/identity.schema.json",
      "$schema": "http://json-schema.org/draft-07/schema#",
      "title": "honeyDue user",
      "type": "object",
      "properties": {
        "traits": {
          "type": "object",
          "properties": {
            "email": {
              "type": "string",
              "format": "email",
              "title": "Email",
              "minLength": 3,
              "maxLength": 320,
              "ory.sh/kratos": {
                "credentials": {
                  "password": { "identifier": true },
                  "code": { "identifier": true, "via": "email" },
                  "totp": { "account_name": true }
                },
                "verification": { "via": "email" },
                "recovery": { "via": "email" }
              }
            },
            "name": {
              "type": "object",
              "title": "Name",
              "properties": {
                "first": { "type": "string", "title": "First name", "maxLength": 100 },
                "last": { "type": "string", "title": "Last name", "maxLength": 100 }
              }
            }
          },
          "required": ["email"],
          "additionalProperties": false
        }
      }
    }
  oidc.google.jsonnet: |
    // Maps Google OIDC claims onto the honeyDue identity schema.
    local claims = std.extVar('claims');
    {
      identity: {
        traits: {
          email: claims.email,
          [if 'given_name' in claims || 'family_name' in claims then 'name']: {
            first: if 'given_name' in claims then claims.given_name else '',
            last: if 'family_name' in claims then claims.family_name else '',
          },
        },
      },
    }
  oidc.apple.jsonnet: |
    // Maps Apple OIDC claims onto the honeyDue identity schema. Apple only
    // returns the name on the very first authorization and not in the ID
    // token claims, so only email is mapped here.
    //
    // Sign in with Apple emails are marked verified UNCONDITIONALLY: completing
    // SIWA cryptographically proves the user controls that Apple ID, and Apple
    // owns/verifies the (relay or real) email, so a 6-digit code would be
    // redundant. We deliberately do NOT gate this on Apple's `email_verified`
    // claim — Apple omits that claim on many authorizations (only sends it on
    // the first grant), which made auto-verification random: sometimes verified,
    // sometimes a surprise code prompt (observed 2026-06-03). Marking it
    // verified on every SIWA makes the behaviour consistent: Apple users never
    // see a code; password sign-ups still verify via the honeyDue API flow.
    local claims = std.extVar('claims');
    {
      identity: {
        traits: {
          email: claims.email,
        },
        verified_addresses: std.prune([
          if 'email' in claims then {
            via: 'email',
            value: claims.email,
          },
        ]),
      },
    }
@@ -0,0 +1,44 @@
 # Public ingress for Ory Kratos — auth.myhoneydue.com → Kratos public API :4433.
 #
 # Middlewares match the honeyDue API ingress (security-headers + rate-limit).
 # The cloudflare-only middleware is intentionally NOT applied here: on this
 # cluster, klipper-lb SNATs the source IP before Traefik sees it, so
 # cloudflare-only's IP allowlist rejects every legitimate Cloudflare request
 # (verified 2026-06-03 — iOS Apple Sign In failed silently because Kratos
 # never received the request). The api ingress doesn't use cloudflare-only
 # for the same reason. DDoS protection still rides on Cloudflare's edge.
 #
 # Kratos's self-service flows are multi-request, so the strict auth-rate-limit
 # (5/min) is intentionally NOT used here — Kratos applies its own per-flow
 # protections.
 #
 # OPERATOR: confirm the cloudflare-origin-cert TLS secret covers
 # auth.myhoneydue.com (apex + wildcard origin cert), and add the
 # auth.myhoneydue.com DNS record in Cloudflare (proxied) → cluster ingress.
 apiVersion: networking.k8s.io/v1
 kind: Ingress
 metadata:
  name: honeydue-auth
  namespace: honeydue
  labels:
    app.kubernetes.io/name: kratos
    app.kubernetes.io/part-of: honeydue
  annotations:
    traefik.ingress.kubernetes.io/router.middlewares: honeydue-security-headers@kubernetescrd,honeydue-rate-limit@kubernetescrd
 spec:
  ingressClassName: traefik
  tls:
    - hosts:
        - auth.myhoneydue.com
      secretName: cloudflare-origin-cert
  rules:
    - host: auth.myhoneydue.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: kratos
                port:
                  number: 4433
@@ -0,0 +1,208 @@
 # Ory Kratos — identity service for honeyDue.
 #
 # Deployed once the operator has completed the prerequisites in kratos/README.md
 # (Neon `kratos` database, auth.myhoneydue.com DNS, Apple Sign In OIDC client,
 # and the kratos-secrets Secret). Until then 03-deploy.sh skips the Kratos
 # apply, so the existing stack is unaffected.
 #
 # IMAGE: pinned to oryd/kratos v26.2.0 (CalVer current stable as of 2026-06-03)
 # with the linux/amd64 digest. The schema-migration Job is in migrate-job.yaml
 # and runs before this Deployment rolls.
 #
 # OIDC: currently Apple-only (configmap.yaml providers[0]). Google was scoped
 # out at deploy time; adding it later is additive — append to providers[] in
 # configmap.yaml and add the matching CLIENT_SECRET env binding here.
 ---
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  name: kratos
  namespace: honeydue
  labels:
    app.kubernetes.io/name: kratos
    app.kubernetes.io/part-of: honeydue
 spec:
  replicas: 2
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 0
      maxSurge: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: kratos
  template:
    metadata:
      labels:
        app.kubernetes.io/name: kratos
        app.kubernetes.io/part-of: honeydue
    spec:
      automountServiceAccountToken: false
      securityContext:
        runAsNonRoot: true
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: kratos
          image: oryd/kratos:v26.2.0@sha256:92eedc292ff8e1a918ac442c88ed0abe44610c75121700963114549908a45ac3
          imagePullPolicy: IfNotPresent
          args:
            - serve
            - --config
            - /etc/kratos/kratos.yml
            - --watch-courier      # send verification/recovery email in-process
          ports:
            - name: public
              containerPort: 4433
            - name: admin
              containerPort: 4434
          env:
            # Kratos is configured natively via env vars; secrets come from
            # the kratos-secrets Secret rather than the ConfigMap.
            - name: DSN
              valueFrom: { secretKeyRef: { name: kratos-secrets, key: dsn } }
            - name: SECRETS_COOKIE
              valueFrom: { secretKeyRef: { name: kratos-secrets, key: secrets_cookie } }
            - name: SECRETS_CIPHER
              valueFrom: { secretKeyRef: { name: kratos-secrets, key: secrets_cipher } }
            - name: COURIER_SMTP_CONNECTION_URI
              valueFrom: { secretKeyRef: { name: kratos-secrets, key: smtp_connection_uri } }
            # OIDC provider secrets — index must match the providers list
            # order in configmap.yaml. Apple-only for now (index 0).
            - name: SELFSERVICE_METHODS_OIDC_CONFIG_PROVIDERS_0_APPLE_PRIVATE_KEY
              valueFrom: { secretKeyRef: { name: kratos-secrets, key: apple_private_key } }
          volumeMounts:
            - name: config
              mountPath: /etc/kratos
              readOnly: true
            - name: tmp
              mountPath: /tmp
          readinessProbe:
            httpGet:
              path: /health/ready
              port: 4434
            initialDelaySeconds: 5
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /health/alive
              port: 4434
            initialDelaySeconds: 10
            periodSeconds: 30
          resources:
            requests:
              cpu: 100m
              memory: 128Mi
            limits:
              cpu: "1"
              memory: 512Mi
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            capabilities:
              drop: ["ALL"]
      volumes:
        - name: config
          configMap:
            name: kratos-config
        - name: tmp
          emptyDir:
            sizeLimit: 64Mi
 ---
 apiVersion: v1
 kind: Service
 metadata:
  name: kratos
  namespace: honeydue
  labels:
    app.kubernetes.io/name: kratos
    app.kubernetes.io/part-of: honeydue
 spec:
  selector:
    app.kubernetes.io/name: kratos
  ports:
    - name: public
      port: 4433
      targetPort: 4433
    - name: admin
      port: 4434
      targetPort: 4434
 ---
 # Ingress to Kratos. Traefik (routing the auth.myhoneydue.com Ingress) reaches
 # only the public API :4433. The honeyDue api pods reach the public API :4433
 # (session whoami) AND the admin API :4434 (identity deletion on account
 # close). The admin API :4434 takes no other cluster ingress.
 apiVersion: networking.k8s.io/v1
 kind: NetworkPolicy
 metadata:
  name: allow-ingress-to-kratos
  namespace: honeydue
 spec:
  podSelector:
    matchLabels:
      app.kubernetes.io/name: kratos
  policyTypes:
    - Ingress
  ingress:
    # Traefik ingress controller -> public API only.
    - from:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: kube-system
      ports:
        - port: 4433
          protocol: TCP
    # honeyDue api pods -> public API (whoami) + admin API (identity deletion).
    - from:
        - podSelector:
            matchLabels:
              app.kubernetes.io/name: api
      ports:
        - port: 4433
          protocol: TCP
        - port: 4434
          protocol: TCP
 ---
 # Kratos egress: DNS, the Neon Postgres database, SMTP, and HTTPS to the
 # OIDC providers (Apple/Google token + JWKS endpoints).
 apiVersion: networking.k8s.io/v1
 kind: NetworkPolicy
 metadata:
  name: allow-egress-from-kratos
  namespace: honeydue
 spec:
  podSelector:
    matchLabels:
      app.kubernetes.io/name: kratos
  policyTypes:
    - Egress
  egress:
    - to:
        - namespaceSelector: {}
      ports:
        - port: 53
          protocol: UDP
        - port: 53
          protocol: TCP
    # Neon Postgres (external)
    - to:
        - ipBlock:
            cidr: 0.0.0.0/0
            except:
              - 10.42.0.0/16
              - 10.43.0.0/16
      ports:
        - port: 5432
          protocol: TCP
    # SMTP (Fastmail) + HTTPS to Apple/Google OIDC endpoints (external)
    - to:
        - ipBlock:
            cidr: 0.0.0.0/0
            except:
              - 10.42.0.0/16
              - 10.43.0.0/16
      ports:
        - port: 465
          protocol: TCP
        - port: 443
          protocol: TCP
@@ -0,0 +1,51 @@
 # Ory Kratos schema migration — runs `kratos migrate sql` against the Kratos
 # database before the Kratos Deployment rolls. 03-deploy.sh applies this,
 # waits for completion, then applies kratos.yaml.
 #
 # IMAGE: pinned to oryd/kratos v26.2.0 (CalVer current stable as of 2026-06-03)
 # with the linux/amd64 digest. Bump in sync with kratos.yaml's image.
 apiVersion: batch/v1
 kind: Job
 metadata:
  name: kratos-migrate
  namespace: honeydue
  labels:
    app.kubernetes.io/name: kratos
    app.kubernetes.io/part-of: honeydue
 spec:
  backoffLimit: 0
  template:
    metadata:
      labels:
        app.kubernetes.io/name: kratos
        app.kubernetes.io/part-of: honeydue
    spec:
      restartPolicy: Never
      automountServiceAccountToken: false
      securityContext:
        runAsNonRoot: true
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: kratos-migrate
          image: oryd/kratos:v26.2.0@sha256:92eedc292ff8e1a918ac442c88ed0abe44610c75121700963114549908a45ac3
          imagePullPolicy: IfNotPresent
          args: ["migrate", "sql", "-e", "--yes"]
          env:
            - name: DSN
              valueFrom:
                secretKeyRef:
                  name: kratos-secrets
                  key: dsn
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            capabilities:
              drop: ["ALL"]
          resources:
            requests:
              cpu: 50m
              memory: 64Mi
            limits:
              cpu: 500m
              memory: 256Mi
@@ -0,0 +1,61 @@
 # Kyverno image-signature verification policy (audit CODE-L5).
 #
 # ──────────────────────────────────────────────────────────────────────────
 # THIS MANIFEST IS NOT APPLIED BY 03-deploy.sh. It is intentionally outside
 # the script's apply set. Applying it before the prerequisites are in place
 # would block every honeydue Pod from scheduling. Operator steps:
 #
 #   1. Install Kyverno in the cluster (it is an admission controller):
 #        kubectl create -f https://github.com/kyverno/kyverno/releases/latest/download/install.yaml
 #   2. Generate a cosign key pair and keep the private key safe:
 #        cosign generate-key-pair                 # -> cosign.key (PRIVATE) + cosign.pub
 #      Set COSIGN_KEY=cosign.key in the deploy environment so 03-deploy.sh
 #      signs images after pushing them (the signing step is already wired,
 #      guarded, into 03-deploy.sh).
 #   3. Paste the contents of cosign.pub into the publicKeys block below.
 #   4. Apply this policy:  kubectl apply -f deploy-k3s/manifests/kyverno-verify-images.yaml
 #   5. After confirming honeydue Pods still schedule, flip
 #      validationFailureAction from Audit to Enforce.
 #
 # Until then it is a documented, ready-to-use template — not active config.
 # ──────────────────────────────────────────────────────────────────────────
 apiVersion: kyverno.io/v1
 kind: ClusterPolicy
 metadata:
  name: verify-honeydue-images
  annotations:
    policies.kyverno.io/title: Verify honeyDue image signatures
    policies.kyverno.io/description: >-
      Requires that honeyDue application images pulled into the honeydue
      namespace carry a valid cosign signature made with the operator's key.
 spec:
  # Audit first — logs violations without blocking. Switch to Enforce once
  # signing is confirmed working end to end.
  validationFailureAction: Audit
  # Admission-time checks only; existing resources are not scanned in the
  # background.
  background: false
  webhookTimeoutSeconds: 30
  rules:
    - name: verify-gitea-image-signatures
      # Applies to Pods created in the honeydue namespace.
      match:
        any:
          - resources:
              kinds:
                - Pod
              namespaces:
                - honeydue
      verifyImages:
        # Only the images we build and sign. Public base images
        # (redis, vmagent) are pinned by digest instead — see their manifests.
        - imageReferences:
            - "gitea.treytartt.com/admin/honeydue-api*"
            - "gitea.treytartt.com/admin/honeydue-worker*"
            - "gitea.treytartt.com/admin/honeydue-admin*"
            - "gitea.treytartt.com/admin/honeydue-web*"
          # mutateDigest defaults to true, and Kyverno refuses a policy that
          # combines mutateDigest: true with the Audit failure action. Keep it
          # false while auditing; when validationFailureAction is flipped to
          # Enforce, this line may be removed (or set to true) to have Kyverno
          # rewrite image tags to digests.
          mutateDigest: false
          # One attestor entry must verify: the operator's cosign public key.
          attestors:
            - count: 1
              entries:
                - keys:
                    publicKeys: |-
                      -----BEGIN PUBLIC KEY-----
                      REPLACE_WITH_CONTENTS_OF_cosign.pub
                      -----END PUBLIC KEY-----
@@ -0,0 +1,78 @@
 # One-shot migration Job. Runs goose against Neon's *direct* (non-pooler)
 # endpoint, applies any pending migrations from /app/migrations (baked into
 # the api image), exits.
 #
 # 03-deploy.sh deletes any prior Job, applies this one, waits for completion
 # with `kubectl wait --for=condition=complete`, and rolls api/worker only
 # after the Job succeeds. A Job failure aborts the whole deploy.
 #
 # We reuse the api image rather than build a separate one — the api Dockerfile
 # already installs the goose CLI to /usr/local/bin/goose and copies the
 # migrations directory to /app/migrations.
 apiVersion: batch/v1
 kind: Job
 metadata:
  name: honeydue-migrate
  namespace: honeydue
  labels:
    app.kubernetes.io/name: migrate
    app.kubernetes.io/part-of: honeydue
 spec:
  backoffLimit: 0                  # fail fast — no silent retries on a bad migration
  ttlSecondsAfterFinished: 86400   # keep finished Job for 24h so logs are inspectable
  template:
    metadata:
      labels:
        app.kubernetes.io/name: migrate
        app.kubernetes.io/part-of: honeydue
    spec:
      restartPolicy: Never
      # The migrate Job never calls the k8s API (audit F11).
      automountServiceAccountToken: false
      imagePullSecrets:
        - name: gitea-credentials
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        runAsGroup: 1000
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: goose
          image: IMAGE_PLACEHOLDER  # Replaced by 03-deploy.sh — same as api
          imagePullPolicy: IfNotPresent  # audit CODE-L4 — explicit
          command: ["/bin/sh", "-c"]
          # DB_HOST in the ConfigMap points at the -pooler endpoint for runtime.
          # goose's session-scoped advisory lock can't survive PgBouncer
          # transaction-mode, so we strip the -pooler segment for migrations.
          # `set -e` so any sub-command failure exits non-zero.
          args:
            - |
              set -e
              DIRECT_HOST=$(echo "$DB_HOST" | sed 's/-pooler\.\(.*\)$/.\1/')
              echo "[migrate] running goose up against $DIRECT_HOST"
              exec /usr/local/bin/goose \
                -dir /app/migrations \
                postgres "host=$DIRECT_HOST port=$DB_PORT user=$POSTGRES_USER password=$POSTGRES_PASSWORD dbname=$POSTGRES_DB sslmode=$DB_SSLMODE" \
                up
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            capabilities:
              drop: ["ALL"]
          envFrom:
            - configMapRef:
                name: honeydue-config
          env:
            - name: POSTGRES_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: honeydue-secrets
                  key: POSTGRES_PASSWORD
          resources:
            requests:
              cpu: 100m
              memory: 64Mi
            limits:
              cpu: 500m
              memory: 256Mi
@@ -47,10 +47,19 @@ spec:
  policyTypes:
    - Ingress
  ingress:
    # Traefik runs as a DaemonSet with hostNetwork=true, so traffic from it
    # arrives with the NODE IP as source (not a pod IP). The cluster pod CIDR
    # 10.42.0.0/16 covers any intra-cluster caller; the three node IPs
    # cover Traefik on hostNetwork.
    - from:
-        - namespaceSelector:
+        - ipBlock:
-            matchLabels:
+            cidr: 178.105.32.198/32   # ubuntu-8gb-nbg1-1
-              kubernetes.io/metadata.name: kube-system
+        - ipBlock:
            cidr: 178.104.247.152/32  # ubuntu-8gb-nbg1-2
        - ipBlock:
            cidr: 178.104.249.189/32  # ubuntu-8gb-nbg1-3
        - ipBlock:
            cidr: 10.42.0.0/16        # cluster pod CIDR
      ports:
        - protocol: TCP
          port: 8000
@@ -69,10 +78,17 @@ spec:
  policyTypes:
    - Ingress
  ingress:
    # Traefik runs as DaemonSet with hostNetwork=true — see allow-ingress-to-api
    # for the rationale. Same ipBlock list.
    - from:
-        - namespaceSelector:
+        - ipBlock:
-            matchLabels:
+            cidr: 178.105.32.198/32
-              kubernetes.io/metadata.name: kube-system
+        - ipBlock:
            cidr: 178.104.247.152/32
        - ipBlock:
            cidr: 178.104.249.189/32
        - ipBlock:
            cidr: 10.42.0.0/16
      ports:
        - protocol: TCP
          port: 3000
@@ -124,6 +140,20 @@ spec:
      ports:
        - protocol: TCP
          port: 6379
    # Kratos (in-cluster). The auth middleware validates every session via
    # http://kratos:4433/sessions/whoami; the AuthService also uses :4434
    # for account deletion (DELETE /admin/identities/{id}). k3s evaluates
    # egress rules AFTER kube-proxy DNAT (runbook §9.2), so this podSelector
    # rule covers Service ClusterIP traffic correctly.
    - to:
        - podSelector:
            matchLabels:
              app.kubernetes.io/name: kratos
      ports:
        - protocol: TCP
          port: 4433
        - protocol: TCP
          port: 4434
    # External services: Neon DB (5432), SMTP (587), HTTPS (443 — APNs, FCM, B2, PostHog)
    - to:
        - ipBlock:
@@ -200,3 +230,213 @@ spec:
      ports:
        - protocol: TCP
          port: 8000
 ---
 # --- Web: allow ingress from Traefik (hostNetwork, so matched by node IP) ---
 # Admits TCP :3000 to pods labelled app.kubernetes.io/name=web from the
 # listed /32 addresses and from 10.42.0.0/16.
 apiVersion: networking.k8s.io/v1
 kind: NetworkPolicy
 metadata:
  name: allow-ingress-to-web
  namespace: honeydue
 spec:
  podSelector:
    matchLabels:
      app.kubernetes.io/name: web
  policyTypes:
    - Ingress
  ingress:
    # Traefik runs as DaemonSet with hostNetwork=true — see allow-ingress-to-api
    # for the rationale. Same ipBlock list.
    - from:
        - ipBlock:
            cidr: 178.105.32.198/32
        - ipBlock:
            cidr: 178.104.247.152/32
        - ipBlock:
            cidr: 178.104.249.189/32
        - ipBlock:
            cidr: 10.42.0.0/16
      ports:
        - protocol: TCP
          port: 3000
 ---
 # --- Web: allow egress for the Next.js server-side proxy routes ---
 # Browser → app.myhoneydue.com → web pod (Node.js) → api.myhoneydue.com
 # The web pod resolves api.myhoneydue.com via public DNS and reaches it
 # through Cloudflare. The Cloudflare edge IP is not known when the policy
 # is written, so allow HTTPS to any public ipBlock (private CIDRs excluded).
 # NOTE(review): this policy opens only TCP :443; DNS (:53) egress for the
 # web pods is presumably granted by a separate policy — confirm.
 apiVersion: networking.k8s.io/v1
 kind: NetworkPolicy
 metadata:
  name: allow-egress-from-web
  namespace: honeydue
 spec:
  podSelector:
    matchLabels:
      app.kubernetes.io/name: web
  policyTypes:
    - Egress
  egress:
    # HTTPS to public (api.myhoneydue.com via CF, PostHog, any other remote)
    - to:
        - ipBlock:
            cidr: 0.0.0.0/0
            # RFC 1918 private ranges are carved out so this rule never
            # grants access to in-cluster or other private addresses.
            except:
              - 10.0.0.0/8
              - 172.16.0.0/12
              - 192.168.0.0/16
      ports:
        - protocol: TCP
          port: 443
 ---
 # vmagent egress.
 #
 # IMPORTANT (gotcha): k3s's built-in NetworkPolicy controller appears to
 # evaluate egress rules AFTER kube-proxy's DNAT, not before (contrary to
 # the k8s spec). So traffic from a pod to the kubernetes Service
 # (ClusterIP 10.43.0.1:443) is policy-checked as dst=<node_public_ip>:6443.
 # That's why we need an explicit rule for :6443 to public IPs, even though
 # we already allow :443 to the cluster service CIDR.
 #
 # Without the :6443 rule, vmagent's k8s service discovery silently fails
 # and zero pods get scraped. See deploy-k3s/RUNBOOK.md ("vmagent SD broken").
 apiVersion: networking.k8s.io/v1
 kind: NetworkPolicy
 metadata:
  name: allow-egress-from-vmagent
  namespace: honeydue
 spec:
  podSelector:
    matchLabels:
      app.kubernetes.io/name: vmagent
  policyTypes:
    - Egress
  egress:
    # DNS (cluster-internal)
    - to:
        - namespaceSelector: {}
      ports:
        - port: 53
          protocol: UDP
        - port: 53
          protocol: TCP
    # k8s API server via ClusterIP (pre-DNAT view)
    - to:
        - ipBlock:
            cidr: 10.43.0.0/16
      ports:
        - port: 443
          protocol: TCP
    # k8s API server post-DNAT (real path k3s NetPol enforcer sees) — REQUIRED
    - to:
        - ipBlock:
            cidr: 0.0.0.0/0
            except:
              - 10.42.0.0/16
      ports:
        - port: 6443
          protocol: TCP
    # Scrape api Pods on :8000
    - to:
        - ipBlock:
            cidr: 10.42.0.0/16
      ports:
        - port: 8000
          protocol: TCP
    # Scrape kube-state-metrics Pod on :8080 (pod CIDR)
    - to:
        - ipBlock:
            cidr: 10.42.0.0/16
      ports:
        - port: 8080
          protocol: TCP
    # HTTPS to public (remote-write to obs.88oakapps.com via Cloudflare)
    - to:
        - ipBlock:
            cidr: 0.0.0.0/0
            except:
              - 10.42.0.0/16
              - 10.43.0.0/16
      ports:
        - port: 443
          protocol: TCP
 ---
 # Allow vmagent → api ingress on :8000 so api pods accept scrapes.
 # api Pods are otherwise locked down by default-deny-all + allow-ingress-to-api
 # (which only allows Traefik). This adds vmagent specifically.
 apiVersion: networking.k8s.io/v1
 kind: NetworkPolicy
 metadata:
  name: allow-vmagent-to-api
  namespace: honeydue
 spec:
  podSelector:
    matchLabels:
      app.kubernetes.io/name: api
  policyTypes:
    - Ingress
  ingress:
    - from:
        - podSelector:
            matchLabels:
              app.kubernetes.io/name: vmagent
      ports:
        - port: 8000
          protocol: TCP
 ---
 # alloy-logs egress — Grafana Alloy discovers honeydue pods via the k8s API
 # and pushes their logs to Loki at obs.88oakapps.com. Same k3s NetworkPolicy
 # DNAT gotcha as vmagent: API-server traffic is policy-checked as
 # dst=<node_public_ip>:6443, so an explicit :6443 rule is required.
 # Alloy reads log FILES from a hostPath, so it needs no ingress and no
 # egress to pod :8000/:8080 — only DNS, the API server, and obs HTTPS.
 apiVersion: networking.k8s.io/v1
 kind: NetworkPolicy
 metadata:
  name: allow-egress-from-alloy-logs
  namespace: honeydue
 spec:
  podSelector:
    matchLabels:
      app.kubernetes.io/name: alloy-logs
  policyTypes:
    - Egress
  egress:
    # DNS (cluster-internal)
    - to:
        - namespaceSelector: {}
      ports:
        - port: 53
          protocol: UDP
        - port: 53
          protocol: TCP
    # k8s API server via ClusterIP (pre-DNAT view)
    - to:
        - ipBlock:
            cidr: 10.43.0.0/16
      ports:
        - port: 443
          protocol: TCP
    # k8s API server post-DNAT (real path k3s NetPol enforcer sees) — REQUIRED
    - to:
        - ipBlock:
            cidr: 0.0.0.0/0
            except:
              - 10.42.0.0/16
      ports:
        - port: 6443
          protocol: TCP
    # HTTPS to public (log push to obs.88oakapps.com via Cloudflare)
    - to:
        - ipBlock:
            cidr: 0.0.0.0/0
            except:
              - 10.42.0.0/16
              - 10.43.0.0/16
      ports:
        - port: 443
          protocol: TCP
@@ -0,0 +1,278 @@
 # honeyDue log shipper — Grafana Alloy as a DaemonSet.
 #
 # Each node runs one Alloy pod that tails the honeydue-namespace pod logs in
 # /var/log/pods and pushes them to Loki at obs.88oakapps.com/loki/api/v1/push
 # (the same nginx ingest endpoint + bearer token vmagent uses for metrics).
 #
 # Runs as root: /var/log/pods is 0750 root:root on the k3s nodes, so a
 # non-root uid cannot even traverse it. The container is otherwise locked
 # down — all capabilities dropped, read-only root filesystem, seccomp
 # RuntimeDefault. Its only host access is a read-only hostPath mount of
 # /var/log/pods plus a writable hostPath (/var/lib/honeydue-alloy-logs) that
 # holds Alloy's read offsets. This is the one root-running workload in the
 # namespace (standard for log collectors); see docs/deployment.
 #
 # 03-deploy.sh substitutes TOKEN_PLACEHOLDER with OBS_INGEST_TOKEN from
 # deploy/prod.env before applying — the token never lands in the repo.
 ---
 apiVersion: v1
 kind: ServiceAccount
 metadata:
  name: alloy-logs
  namespace: honeydue
  labels:
    app.kubernetes.io/name: alloy-logs
    app.kubernetes.io/part-of: honeydue
 ---
 # Least privilege: Alloy's discovery.kubernetes only lists/watches pods, and
 # only in the honeydue namespace — so a namespaced Role, not a ClusterRole.
 apiVersion: rbac.authorization.k8s.io/v1
 kind: Role
 metadata:
  name: alloy-logs
  namespace: honeydue
  labels:
    app.kubernetes.io/name: alloy-logs
    app.kubernetes.io/part-of: honeydue
 rules:
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["get", "list", "watch"]
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: RoleBinding
 metadata:
  name: alloy-logs
  namespace: honeydue
  labels:
    app.kubernetes.io/name: alloy-logs
    app.kubernetes.io/part-of: honeydue
 subjects:
  - kind: ServiceAccount
    name: alloy-logs
    namespace: honeydue
 roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: alloy-logs
 ---
 # Bearer token for the Loki push endpoint. TOKEN_PLACEHOLDER is replaced by
 # 03-deploy.sh with OBS_INGEST_TOKEN (same token vmagent uses).
 apiVersion: v1
 kind: Secret
 metadata:
  name: alloy-logs-auth
  namespace: honeydue
  labels:
    app.kubernetes.io/name: alloy-logs
    app.kubernetes.io/part-of: honeydue
 type: Opaque
 stringData:
  bearer_token: TOKEN_PLACEHOLDER
 ---
 apiVersion: v1
 kind: ConfigMap
 metadata:
  name: alloy-logs
  namespace: honeydue
  labels:
    app.kubernetes.io/name: alloy-logs
    app.kubernetes.io/part-of: honeydue
 data:
  config.alloy: |
    // honeyDue log shipper. Each DaemonSet instance discovers honeydue-namespace
    // pods via the Kubernetes API, tails the container log files present on its
    // own node (/var/log/pods), and pushes them to Loki at obs.88oakapps.com.
    logging {
      level  = "warn"
      format = "logfmt"
    }
    discovery.kubernetes "pods" {
      role = "pod"
      namespaces {
        names = ["honeydue"]
      }
    }
    // Turn pod metadata into Loki labels and build the on-disk log path.
    discovery.relabel "pod_logs" {
      targets = discovery.kubernetes.pods.targets
      rule {
        source_labels = ["__meta_kubernetes_namespace"]
        action        = "replace"
        target_label  = "namespace"
      }
      rule {
        source_labels = ["__meta_kubernetes_pod_name"]
        action        = "replace"
        target_label  = "pod"
      }
      rule {
        source_labels = ["__meta_kubernetes_pod_container_name"]
        action        = "replace"
        target_label  = "container"
      }
      rule {
        source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"]
        action        = "replace"
        target_label  = "app"
      }
      rule {
        source_labels = ["__meta_kubernetes_pod_node_name"]
        action        = "replace"
        target_label  = "node"
      }
      // /var/log/pods/<namespace>_<pod>_<uid>/<container>/<n>.log
      rule {
        source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"]
        separator     = "/"
        action        = "replace"
        replacement   = "/var/log/pods/*$1/*.log"
        target_label  = "__path__"
      }
    }
    local.file_match "pod_logs" {
      path_targets = discovery.relabel.pod_logs.output
    }
    loki.source.file "pod_logs" {
      targets       = local.file_match.pod_logs.targets
      forward_to    = [loki.process.pod_logs.receiver]
      // With no stored read offset (fresh node, or positions wiped), start
      // at the END of each file instead of re-shipping history — otherwise
      // Loki rejects the now-too-old entries ("entry too far behind") and
      // shipping stalls. Offsets persist on a hostPath (see volumes), so a
      // normal pod restart resumes exactly where it left off.
      tail_from_end = true
    }
    // Parse the CRI log format (timestamp / stream / flags / message),
    // then drop probe/scrape noise before shipping.
    loki.process "pod_logs" {
      forward_to = [loki.write.obs.receiver]
      stage.cri {}
      // Drop successful probe/scrape access logs. k8s liveness/readiness
      // hits /api/health/ every few seconds and vmagent scrapes /metrics
      // on a 15s interval — all 2xx, pure noise that drowns real logs.
      // A non-2xx health check, or one logged above info level, does NOT
      // match this regex and is kept.
      stage.drop {
        expression          = "\"level\":\"info\".*\"path\":\"/(api/health/?|metrics)\".*\"status\":2[0-9][0-9]"
        drop_counter_reason = "probe_access_ok"
      }
    }
    loki.write "obs" {
      endpoint {
        url               = "https://obs.88oakapps.com/loki/api/v1/push"
        bearer_token_file = "/etc/alloy-secrets/bearer_token"
      }
      external_labels = {
        cluster     = "honeydue-k3s",
        environment = "prod",
      }
    }
 ---
 apiVersion: apps/v1
 kind: DaemonSet
 metadata:
  name: alloy-logs
  namespace: honeydue
  labels:
    app.kubernetes.io/name: alloy-logs
    app.kubernetes.io/part-of: honeydue
 spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: alloy-logs
  updateStrategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 1
  template:
    metadata:
      labels:
        app.kubernetes.io/name: alloy-logs
        app.kubernetes.io/part-of: honeydue
    spec:
      serviceAccountName: alloy-logs
      # Alloy needs its SA token — discovery.kubernetes talks to the API server.
      automountServiceAccountToken: true
      # Root is required to traverse /var/log/pods (0750 root:root). The
      # container is otherwise fully confined (see container securityContext).
      securityContext:
        runAsUser: 0
        runAsGroup: 0
        seccompProfile:
          type: RuntimeDefault
      tolerations:
        # DaemonSet must run on every node, including any control-plane taint.
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule
      containers:
        - name: alloy
          image: grafana/alloy:v1.5.1@sha256:01a63f4e032ce54ee94b22049bc27f597e74f85566478c377f4b5c7f020c1eb3
          imagePullPolicy: IfNotPresent
          args:
            - run
            - /etc/alloy/config.alloy
            - --storage.path=/tmp/alloy
            - --server.http.listen-addr=0.0.0.0:12345
          ports:
            - name: http
              containerPort: 12345
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            capabilities:
              drop: ["ALL"]
          volumeMounts:
            - name: config
              mountPath: /etc/alloy
              readOnly: true
            - name: auth
              mountPath: /etc/alloy-secrets
              readOnly: true
            - name: varlogpods
              mountPath: /var/log/pods
              readOnly: true
            - name: tmp
              mountPath: /tmp/alloy
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 12345
            initialDelaySeconds: 10
            periodSeconds: 20
          resources:
            requests:
              cpu: 25m
              memory: 64Mi
            limits:
              cpu: 150m
              memory: 256Mi
      volumes:
        - name: config
          configMap:
            name: alloy-logs
        - name: auth
          secret:
            secretName: alloy-logs-auth
            defaultMode: 0400
        - name: varlogpods
          hostPath:
            path: /var/log/pods
            type: Directory
        # Alloy's positions/WAL store. A hostPath (not emptyDir) so file read
        # offsets survive pod restarts — otherwise every restart re-reads log
        # files from the start and Loki rejects the now-too-old entries.
        - name: tmp
          hostPath:
            path: /var/lib/honeydue-alloy-logs
            type: DirectoryOrCreate
@@ -0,0 +1,223 @@
 # kube-state-metrics — exposes cluster object state (pods, deployments,
 # services, etc.) as Prometheus metrics. vmagent scrapes it via the
 # kube-state-metrics job defined in vmagent-config; Grafana panels that
 # count pods, replicas, etc. consume the `kube_*` metrics this produces.
 #
 # Lives in kube-system because it watches resources cluster-wide.
 # RBAC is cluster-scoped (ClusterRole + ClusterRoleBinding).
 #
 # Image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.13.0
 # (latest stable as of authoring; bump when a newer minor is released)
 ---
 apiVersion: v1
 kind: ServiceAccount
 metadata:
  name: kube-state-metrics
  namespace: kube-system
  labels:
    app.kubernetes.io/name: kube-state-metrics
    app.kubernetes.io/part-of: honeydue-observability
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
 metadata:
  name: kube-state-metrics
  labels:
    app.kubernetes.io/name: kube-state-metrics
    app.kubernetes.io/part-of: honeydue-observability
 rules:
  # Core resources
  - apiGroups: [""]
    resources:
      - configmaps
      - secrets
      - nodes
      - pods
      - services
      - serviceaccounts
      - resourcequotas
      - replicationcontrollers
      - limitranges
      - persistentvolumeclaims
      - persistentvolumes
      - namespaces
      - endpoints
    verbs: [list, watch]
  # Apps
  - apiGroups: ["apps"]
    resources:
      - statefulsets
      - daemonsets
      - deployments
      - replicasets
    verbs: [list, watch]
  # Batch
  - apiGroups: ["batch"]
    resources:
      - cronjobs
      - jobs
    verbs: [list, watch]
  # Autoscaling
  - apiGroups: ["autoscaling"]
    resources:
      - horizontalpodautoscalers
    verbs: [list, watch]
  # Authentication / authorization (used by some ksm collectors)
  - apiGroups: ["authentication.k8s.io"]
    resources: [tokenreviews]
    verbs: [create]
  - apiGroups: ["authorization.k8s.io"]
    resources: [subjectaccessreviews]
    verbs: [create]
  # Policy
  - apiGroups: ["policy"]
    resources: [poddisruptionbudgets]
    verbs: [list, watch]
  # Certificate signing
  - apiGroups: ["certificates.k8s.io"]
    resources: [certificatesigningrequests]
    verbs: [list, watch]
  # Discovery
  - apiGroups: ["discovery.k8s.io"]
    resources: [endpointslices]
    verbs: [list, watch]
  # Storage
  - apiGroups: ["storage.k8s.io"]
    resources:
      - storageclasses
      - volumeattachments
    verbs: [list, watch]
  # Admission policy
  - apiGroups: ["admissionregistration.k8s.io"]
    resources:
      - mutatingwebhookconfigurations
      - validatingwebhookconfigurations
    verbs: [list, watch]
  # Networking
  - apiGroups: ["networking.k8s.io"]
    resources:
      - networkpolicies
      - ingressclasses
      - ingresses
    verbs: [list, watch]
  # Coordination (Lease objects — read-only list/watch, not leader election)
  - apiGroups: ["coordination.k8s.io"]
    resources: [leases]
    verbs: [list, watch]
  # RBAC
  - apiGroups: ["rbac.authorization.k8s.io"]
    resources:
      - clusterrolebindings
      - clusterroles
      - rolebindings
      - roles
    verbs: [list, watch]
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
 metadata:
  name: kube-state-metrics
  labels:
    app.kubernetes.io/name: kube-state-metrics
    app.kubernetes.io/part-of: honeydue-observability
 roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: kube-state-metrics
 subjects:
  - kind: ServiceAccount
    name: kube-state-metrics
    namespace: kube-system
 ---
 apiVersion: v1
 kind: Service
 metadata:
  name: kube-state-metrics
  namespace: kube-system
  labels:
    app.kubernetes.io/name: kube-state-metrics
    app.kubernetes.io/part-of: honeydue-observability
 spec:
  type: ClusterIP
  selector:
    app.kubernetes.io/name: kube-state-metrics
  ports:
    - name: http-metrics
      port: 8080
      targetPort: http-metrics
      protocol: TCP
    - name: telemetry
      port: 8081
      targetPort: telemetry
      protocol: TCP
 ---
 # kube-state-metrics Deployment: a single replica serving metrics on :8080
 # (http-metrics) and telemetry on :8081 (telemetry).
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  name: kube-state-metrics
  namespace: kube-system
  labels:
    app.kubernetes.io/name: kube-state-metrics
    app.kubernetes.io/part-of: honeydue-observability
 spec:
  replicas: 1
  # Recreate: the old pod is terminated before the replacement is started.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app.kubernetes.io/name: kube-state-metrics
  template:
    metadata:
      labels:
        app.kubernetes.io/name: kube-state-metrics
        app.kubernetes.io/part-of: honeydue-observability
    spec:
      # Runs as the kube-state-metrics ServiceAccount with its token mounted.
      serviceAccountName: kube-state-metrics
      automountServiceAccountToken: true
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534
        fsGroup: 65534
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: kube-state-metrics
          # TODO digest-pin — this is a tag-only reference, unlike the
          # digest-pinned kratos and alloy images in this deploy.
          image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.13.0
          imagePullPolicy: IfNotPresent
          ports:
            - containerPort: 8080
              name: http-metrics
            - containerPort: 8081
              name: telemetry
          args:
            - --port=8080
            - --telemetry-port=8081
          resources:
            requests:
              cpu: 25m
              memory: 64Mi
            limits:
              cpu: 200m
              memory: 256Mi
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: [ALL]
            readOnlyRootFilesystem: true
          livenessProbe:
            httpGet:
              path: /livez
              port: http-metrics
            initialDelaySeconds: 5
            periodSeconds: 30
          # NOTE(review): upstream's example manifest for this release appears
          # to probe /readyz on the telemetry port rather than http-metrics —
          # confirm /readyz is actually served on :8080 for this version.
          readinessProbe:
            httpGet:
              path: /readyz
              port: http-metrics
            initialDelaySeconds: 5
            periodSeconds: 10
@@ -0,0 +1,126 @@
 # node-exporter — per-node host metrics (filesystem, memory, load, CPU).
 # Runs as a normal pod (NOT hostNetwork) so vmagent scrapes it pod-to-pod over
 # the cluster CIDR, avoiding any dependency on node public IPs (the netpol
 # node-IP list is OVH-stale). Host /proc, /sys and / are bind-mounted read-only
 # so the filesystem/memory/load collectors read the real host, not the pod ns.
 # Added 2026-06-08 to close RUNBOOK §11.1 gap #9 (node disk/mem were unmonitored).
 # node-exporter DaemonSet: one pod per node exposing host metrics on :9100.
 apiVersion: apps/v1
 kind: DaemonSet
 metadata:
  name: node-exporter
  namespace: honeydue
  labels:
    app.kubernetes.io/name: node-exporter
    app.kubernetes.io/part-of: honeydue
 spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: node-exporter
  template:
    metadata:
      labels:
        app.kubernetes.io/name: node-exporter
        app.kubernetes.io/part-of: honeydue
    spec:
      # node-exporter only reads host files and never calls the k8s API, so
      # no ServiceAccount token is mounted (same hardening as the migrate Jobs).
      automountServiceAccountToken: false
      # Run on every node, including any tainted control-plane nodes.
      tolerations:
        - operator: Exists
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: node-exporter
          image: quay.io/prometheus/node-exporter:v1.8.2  # TODO digest-pin (audit K3S-F14)
          imagePullPolicy: IfNotPresent
          # NOTE(review): the pod is not on hostNetwork, so network collectors
          # presumably report the pod's network namespace rather than the
          # host's — confirm before relying on node_network_* series.
          args:
            # Point the collectors at the read-only host mounts declared below.
            - --path.procfs=/host/proc
            - --path.sysfs=/host/sys
            - --path.rootfs=/host/root
            # Only report real host mounts; drop the kubelet/container churn.
            - --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run|var/lib/kubelet/.+|var/lib/docker/.+|var/lib/containerd/.+)($|/)
            - --collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$
            - --no-collector.wifi
            - --no-collector.hwmon
            - --web.listen-address=:9100
          ports:
            - name: metrics
              containerPort: 9100
              protocol: TCP
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            capabilities:
              drop: ["ALL"]
          resources:
            requests:
              cpu: 30m
              memory: 32Mi
            limits:
              cpu: 200m
              memory: 128Mi
          volumeMounts:
            - name: proc
              mountPath: /host/proc
              readOnly: true
            - name: sys
              mountPath: /host/sys
              readOnly: true
            - name: root
              mountPath: /host/root
              mountPropagation: HostToContainer
              readOnly: true
      # Host paths backing the three read-only mounts above.
      volumes:
        - name: proc
          hostPath:
            path: /proc
        - name: sys
          hostPath:
            path: /sys
        - name: root
          hostPath:
            path: /
 ---
 # default-deny-all blocks ingress; allow vmagent to scrape :9100.
 apiVersion: networking.k8s.io/v1
 kind: NetworkPolicy
 metadata:
  name: allow-ingress-to-node-exporter
  namespace: honeydue
 spec:
  podSelector:
    matchLabels:
      app.kubernetes.io/name: node-exporter
  policyTypes:
    - Ingress
  ingress:
    - from:
        - podSelector:
            matchLabels:
              app.kubernetes.io/name: vmagent
      ports:
        - port: 9100
          protocol: TCP
 ---
 # vmagent's existing egress policy only opens :8000/:8080 to the pod CIDR.
 # Additive policy (NetworkPolicies are OR'd) opening :9100 for the node-exporter
 # scrape — leaves the working allow-egress-from-vmagent policy untouched.
 apiVersion: networking.k8s.io/v1
 kind: NetworkPolicy
 metadata:
  name: allow-egress-from-vmagent-to-node-exporter
  namespace: honeydue
 spec:
  podSelector:
    matchLabels:
      app.kubernetes.io/name: vmagent
  policyTypes:
    - Egress
  egress:
    - to:
        - ipBlock:
            cidr: 10.42.0.0/16
      ports:
        - port: 9100
          protocol: TCP
@@ -0,0 +1,289 @@
 # vmagent — scrapes Prometheus /metrics from in-cluster services and
 # remote-writes them to https://obs.88oakapps.com/api/v1/write
 # (VictoriaMetrics on 88oakappsUpdate, fronted by Cloudflare + nginx
 # bearer-token auth). Single replica is fine — vmagent buffers locally
 # during transient remote outages.
 ---
 apiVersion: v1
 kind: ConfigMap
 metadata:
  name: vmagent-config
  namespace: honeydue
  labels:
    app.kubernetes.io/name: vmagent
    app.kubernetes.io/part-of: honeydue
 data:
  scrape.yaml: |
    global:
      scrape_interval: 15s
      external_labels:
        cluster: honeydue-k3s
        environment: prod
    scrape_configs:
      # honeyDue Go API — exposes /metrics on :8000
      - job_name: api
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names: [honeydue]
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
            action: keep
            regex: api
          - source_labels: [__meta_kubernetes_pod_container_port_number]
            action: keep
            regex: "8000"
          - source_labels: [__meta_kubernetes_pod_name]
            target_label: pod
          - source_labels: [__meta_kubernetes_pod_node_name]
            target_label: node
          - target_label: service
            replacement: api
      # kube-state-metrics — cluster object state (kube_pod_*, kube_deployment_*,
      # etc.) needed for Grafana panels that count pods/replicas/etc.
      - job_name: kube-state-metrics
        kubernetes_sd_configs:
          - role: endpoints
            namespaces:
              names: [kube-system]
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name]
            action: keep
            regex: kube-state-metrics
          - source_labels: [__meta_kubernetes_endpoint_port_name]
            action: keep
            regex: http-metrics
      # node-exporter — per-node host metrics (node_filesystem_*, node_memory_*,
      # node_load*). Pod-networked DaemonSet scraped on :9100 over the pod CIDR.
      - job_name: node-exporter
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names: [honeydue]
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
            action: keep
            regex: node-exporter
          - source_labels: [__meta_kubernetes_pod_container_port_number]
            action: keep
            regex: "9100"
          - source_labels: [__meta_kubernetes_pod_name]
            target_label: pod
          - source_labels: [__meta_kubernetes_pod_node_name]
            target_label: node
          - target_label: service
            replacement: node-exporter
      # honeyDue worker — exposes /metrics on :6060 (apns/fcm/asynq/cache series).
      - job_name: worker
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names: [honeydue]
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
            action: keep
            regex: worker
          - source_labels: [__meta_kubernetes_pod_container_port_number]
            action: keep
            regex: "6060"
          - source_labels: [__meta_kubernetes_pod_name]
            target_label: pod
          - source_labels: [__meta_kubernetes_pod_node_name]
            target_label: node
          - target_label: service
            replacement: worker
 ---
 apiVersion: v1
 kind: Secret
 metadata:
  name: vmagent-remote-write
  namespace: honeydue
  labels:
    app.kubernetes.io/name: vmagent
    app.kubernetes.io/part-of: honeydue
 type: Opaque
 stringData:
  # Bearer token for obs.88oakapps.com. Provisioned at deploy time from
  # deploy/prod.env (OBS_INGEST_TOKEN). The cluster-side token must match
  # the token in /etc/honeydue-obs/ingest_token on 88oakappsUpdate.
  bearer_token: TOKEN_PLACEHOLDER
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: Role
 metadata:
  name: vmagent
  namespace: honeydue
 rules:
  - apiGroups: [""]
    resources: [pods, services, endpoints]
    verbs: [get, list, watch]
 ---
 apiVersion: v1
 kind: ServiceAccount
 metadata:
  name: vmagent
  namespace: honeydue
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: RoleBinding
 metadata:
  name: vmagent
  namespace: honeydue
 subjects:
  - kind: ServiceAccount
    name: vmagent
    namespace: honeydue
 roleRef:
  kind: Role
  name: vmagent
  apiGroup: rbac.authorization.k8s.io
 ---
 # Allow vmagent to discover the kube-state-metrics Service/Endpoints in
 # kube-system so the kube-state-metrics scrape job can find its target.
 # Cross-namespace SD needs an explicit RoleBinding here.
 apiVersion: rbac.authorization.k8s.io/v1
 kind: Role
 metadata:
  name: vmagent-kube-system
  namespace: kube-system
 rules:
  - apiGroups: [""]
    resources: [services, endpoints, pods]
    verbs: [get, list, watch]
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: RoleBinding
 metadata:
  name: vmagent-kube-system
  namespace: kube-system
 subjects:
  - kind: ServiceAccount
    name: vmagent
    namespace: honeydue
 roleRef:
  kind: Role
  name: vmagent-kube-system
  apiGroup: rbac.authorization.k8s.io
 ---
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  name: vmagent
  namespace: honeydue
  labels:
    app.kubernetes.io/name: vmagent
    app.kubernetes.io/part-of: honeydue
 spec:
  replicas: 1
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app.kubernetes.io/name: vmagent
  template:
    metadata:
      labels:
        app.kubernetes.io/name: vmagent
        app.kubernetes.io/part-of: honeydue
    spec:
      serviceAccountName: vmagent
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        fsGroup: 1000
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: vmagent
          # Pinned by digest (audit K3S-F14).
          image: victoriametrics/vmagent:v1.106.1@sha256:90208a667c0baf65f7536b92a84c40b6e35ffe8e88bda7e4447b97b06c6ba6b8
          imagePullPolicy: IfNotPresent  # audit CODE-L4 — explicit
          # Container-level hardening (audit F7) — matches the other 5
          # workloads. vmagent only writes to the /tmp/vmagent emptyDir
          # (its remoteWrite buffer), so a read-only root filesystem holds.
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            capabilities:
              drop: ["ALL"]
          args:
            - "-promscrape.config=/etc/vmagent/scrape.yaml"
            - "-remoteWrite.url=https://obs.88oakapps.com/api/v1/write"
            - "-remoteWrite.bearerTokenFile=/etc/vmagent-secrets/bearer_token"
            - "-remoteWrite.tmpDataPath=/tmp/vmagent"
            - "-remoteWrite.maxDiskUsagePerURL=512MB"
            - "-loggerLevel=INFO"
          ports:
            - containerPort: 8429
              name: http
          resources:
            requests:
              cpu: 25m
              memory: 64Mi
            limits:
              cpu: 200m
              memory: 256Mi
          volumeMounts:
            - name: config
              mountPath: /etc/vmagent
              readOnly: true
            - name: secrets
              mountPath: /etc/vmagent-secrets
              readOnly: true
            - name: buffer
              mountPath: /tmp/vmagent
          # Process startup gate. /-/healthy returns 200 once vmagent has
          # parsed config — gives the agent up to 2 min to come up before
          # liveness starts evaluating.
          startupProbe:
            httpGet:
              path: /-/healthy
              port: http
            initialDelaySeconds: 5
            periodSeconds: 5
            failureThreshold: 24
          # Real liveness check: are scrapes actually succeeding?
          # /-/healthy was the old probe and returned 200 for 17 days even
          # while vmagent had zero healthy targets (stale k8s SD watch).
          # This exec probe queries vmagent's own targets API and fails if
          # NO target is in state "up". Five consecutive failures at a 120s
          # period (~10 min) → kubelet kills the pod → fresh SD watch.
          livenessProbe:
            exec:
              command:
                - sh
                - -c
                - 'n=$(wget -qO- -T 4 http://localhost:8429/api/v1/targets 2>/dev/null | grep -c ''"health":"up"''); [ "$n" -gt 0 ]'
            initialDelaySeconds: 180
            periodSeconds: 120
            timeoutSeconds: 5
            failureThreshold: 5
          readinessProbe:
            httpGet:
              path: /-/healthy
              port: http
            initialDelaySeconds: 5
            periodSeconds: 10
      volumes:
        - name: config
          configMap:
            name: vmagent-config
        - name: secrets
          secret:
            secretName: vmagent-remote-write
            defaultMode: 0400
        - name: buffer
          emptyDir:
            sizeLimit: 512Mi
@@ -20,6 +20,9 @@ spec:
        app.kubernetes.io/part-of: honeydue
    spec:
      serviceAccountName: redis
      # Explicit pod-level opt-out (audit F11) — defense-in-depth on top of
      # the ServiceAccount-level setting in rbac.yaml.
      automountServiceAccountToken: false
      nodeSelector:
        honeydue/redis: "true"
      securityContext:
@@ -31,12 +34,18 @@ spec:
          type: RuntimeDefault
      containers:
        - name: redis
-          image: redis:7-alpine
+          # Pinned by digest (audit K3S-F14) — redis:7-alpine is 7.4.9-alpine.
          image: redis:7-alpine@sha256:6ab0b6e7381779332f97b8ca76193e45b0756f38d4c0dcda72dbb3c32061ab99
          imagePullPolicy: IfNotPresent  # audit CODE-L4 — explicit
          command:
            - sh
            - -c
            - |
-              ARGS="--appendonly yes --appendfsync everysec --maxmemory 256mb --maxmemory-policy noeviction"
+              # allkeys-lru: under memory pressure, evict the least-recently-used key.
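              # honeyDue uses Redis as a cache + asynq queue. The cache layer falls
              # through to DB on miss, so cache eviction is graceful. Recently used
              # asynq keys are evicted only after less-recently-used entries are gone.
              # NOTE(review): allkeys-lru considers every key, with or without a TTL,
              # so idle asynq keys are eviction candidates too — confirm that is acceptable.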
              ARGS="--appendonly yes --appendfsync everysec --maxmemory 256mb --maxmemory-policy allkeys-lru"
              if [ -n "$REDIS_PASSWORD" ]; then
                ARGS="$ARGS --requirepass $REDIS_PASSWORD"
              fi
@@ -23,8 +23,11 @@ spec:
        app.kubernetes.io/part-of: honeydue
    spec:
      serviceAccountName: web
      # Explicit pod-level opt-out (audit F11) — defense-in-depth on top of
      # the ServiceAccount-level setting in rbac.yaml.
      automountServiceAccountToken: false
      imagePullSecrets:
-        - name: ghcr-credentials
+        - name: gitea-credentials
      securityContext:
        runAsNonRoot: true
        runAsUser: 1001
@@ -43,6 +46,7 @@ spec:
      containers:
        - name: web
          image: IMAGE_PLACEHOLDER  # Replaced by 03-deploy.sh or manual sed
          imagePullPolicy: IfNotPresent  # audit CODE-L4 — explicit; images are SHA/digest-pinned
          ports:
            - containerPort: 3000
              protocol: TCP
@@ -27,8 +27,11 @@ spec:
        app.kubernetes.io/part-of: honeydue
    spec:
      serviceAccountName: worker
      # Explicit pod-level opt-out (audit F11) — defense-in-depth on top of
      # the ServiceAccount-level setting in rbac.yaml.
      automountServiceAccountToken: false
      imagePullSecrets:
-        - name: ghcr-credentials
+        - name: gitea-credentials
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
@@ -39,6 +42,12 @@ spec:
      containers:
        - name: worker
          image: IMAGE_PLACEHOLDER  # Replaced by 03-deploy.sh
          imagePullPolicy: IfNotPresent  # audit CODE-L4 — explicit; images are SHA/digest-pinned
          ports:
            # health + Prometheus /metrics (in-cluster only; scraped by vmagent)
            - name: metrics
              containerPort: 6060
              protocol: TCP
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
@@ -47,34 +56,16 @@ spec:
          envFrom:
            - configMapRef:
                name: honeydue-config
-          env:
+          # Audit CODE-F8: secrets are NOT injected as environment variables.
-            - name: POSTGRES_PASSWORD
+          # Env vars are readable for the life of the pod via /proc/<pid>/environ
-              valueFrom:
+          # and leak into crash dumps / child processes. honeydue-secrets is
-                secretKeyRef:
+          # mounted read-only at /etc/honeydue/secrets (mode 0400) and the Go
-                  name: honeydue-secrets
+          # config layer (config.loadFileSecrets) reads each key from its file.
-                  key: POSTGRES_PASSWORD
+          # Non-secret config still arrives via the configMapRef above.
            - name: SECRET_KEY
              valueFrom:
                secretKeyRef:
                  name: honeydue-secrets
                  key: SECRET_KEY
            - name: EMAIL_HOST_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: honeydue-secrets
                  key: EMAIL_HOST_PASSWORD
            - name: FCM_SERVER_KEY
              valueFrom:
                secretKeyRef:
                  name: honeydue-secrets
                  key: FCM_SERVER_KEY
            - name: REDIS_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: honeydue-secrets
                  key: REDIS_PASSWORD
                  optional: true
          volumeMounts:
            - name: app-secrets
              mountPath: /etc/honeydue/secrets
              readOnly: true
            - name: apns-key
              mountPath: /secrets/apns
              readOnly: true
@@ -94,6 +85,12 @@ spec:
            periodSeconds: 30
            timeoutSeconds: 5
      volumes:
        # Audit CODE-F8: the whole honeydue-secrets Secret, projected as files.
        # defaultMode 0400 → readable only by the container's runAsUser (1000).
        - name: app-secrets
          secret:
            secretName: honeydue-secrets
            defaultMode: 0400
        - name: apns-key
          secret:
            secretName: honeydue-apns-key
@@ -103,3 +100,46 @@ spec:
        - name: tmp
          emptyDir:
            sizeLimit: 64Mi
 ---
 # Allow vmagent to scrape the worker's /metrics on :6060 (default-deny-all is in
 # force; the worker otherwise receives no ingress). Additive — see node-exporter.
 apiVersion: networking.k8s.io/v1
 kind: NetworkPolicy
 metadata:
  name: allow-ingress-to-worker-metrics
  namespace: honeydue
 spec:
  podSelector:
    matchLabels:
      app.kubernetes.io/name: worker
  policyTypes:
    - Ingress
  ingress:
    - from:
        - podSelector:
            matchLabels:
              app.kubernetes.io/name: vmagent
      ports:
        - port: 6060
          protocol: TCP
 ---
 # vmagent's base egress policy only opens :8000/:8080 to the pod CIDR; this
 # additive policy opens :6060 for the worker scrape (leaves the base untouched).
 apiVersion: networking.k8s.io/v1
 kind: NetworkPolicy
 metadata:
  name: allow-egress-from-vmagent-to-worker
  namespace: honeydue
 spec:
  podSelector:
    matchLabels:
      app.kubernetes.io/name: vmagent
  policyTypes:
    - Egress
  egress:
    - to:
        - ipBlock:
            cidr: 10.42.0.0/16
      ports:
        - port: 6060
          protocol: TCP
@@ -68,6 +68,43 @@ SECRET_ARGS=(
 if [[ -n "${REDIS_PASSWORD}" ]]; then
  log "  Including REDIS_PASSWORD in secrets"
  SECRET_ARGS+=(--from-literal="REDIS_PASSWORD=${REDIS_PASSWORD}")
 else
  # Audit K3S-F1 (CRITICAL) / MEDIUM-4: refuse to deploy with an unauthenticated
  # Redis. A previous version only warned here, which let a deploy from an
  # unedited config.yaml silently bring Redis up with no password.
  die "redis.password is empty in config.yaml — refusing to deploy: Redis would run with NO authentication (audit K3S-F1). Set a strong value, e.g.: openssl rand -base64 32"
 fi
 # B2 (Backblaze) object-storage credentials. The api/worker manifests
 # reference B2_KEY_ID / B2_APP_KEY as required secret keys, so honeydue-secrets
 # MUST carry them or those pods fail to start. Sourced from config.yaml so the
 # script and the manifests no longer drift (was a latent gap before 2026-05-16).
 B2_KEY_ID_VAL="$(cfg storage.b2_key_id 2>/dev/null || true)"
 B2_APP_KEY_VAL="$(cfg storage.b2_app_key 2>/dev/null || true)"
 if [[ -n "${B2_KEY_ID_VAL}" && -n "${B2_APP_KEY_VAL}" ]]; then
  log "  Including B2_KEY_ID / B2_APP_KEY in secrets"
  SECRET_ARGS+=(--from-literal="B2_KEY_ID=${B2_KEY_ID_VAL}")
  SECRET_ARGS+=(--from-literal="B2_APP_KEY=${B2_APP_KEY_VAL}")
 else
  warn "storage.b2_key_id / b2_app_key not set in config.yaml — B2 uploads will be disabled."
 fi
 # Observability ingest credentials live in deploy/prod.env (gitignored) so
 # the values aren't checked into config.yaml. Skipped silently when the
 # file or keys are absent — the api/worker manifests mark these env vars
 # optional, so the deployment still rolls without traces.
 PROD_ENV_FILE="${DEPLOY_DIR}/../deploy/prod.env"
 if [[ -f "${PROD_ENV_FILE}" ]]; then
  OBS_TOKEN_VAL="$(grep -E '^OBS_INGEST_TOKEN=' "${PROD_ENV_FILE}" 2>/dev/null | cut -d= -f2- || true)"
  OBS_URL_VAL="$(grep -E '^OBS_TRACES_URL=' "${PROD_ENV_FILE}" 2>/dev/null | cut -d= -f2- || true)"
  if [[ -n "${OBS_TOKEN_VAL}" ]]; then
    log "  Including OBS_INGEST_TOKEN in secrets"
    SECRET_ARGS+=(--from-literal="OBS_INGEST_TOKEN=${OBS_TOKEN_VAL}")
  fi
  if [[ -n "${OBS_URL_VAL}" ]]; then
    log "  Including OBS_TRACES_URL in secrets"
    SECRET_ARGS+=(--from-literal="OBS_TRACES_URL=${OBS_URL_VAL}")
  fi
 fi
 kubectl create secret generic honeydue-secrets \
@@ -82,22 +119,24 @@ kubectl create secret generic honeydue-apns-key \
  --from-file="apns_auth_key.p8=${SECRETS_DIR}/apns_auth_key.p8" \
  --dry-run=client -o yaml | kubectl apply -f -
-# --- Create GHCR registry credentials ---
+# --- Create container registry credentials ---
 # Secret name is gitea-credentials (audit F6): the registry is self-hosted
 # Gitea, not GHCR. Every deployment manifest references this same name.
 REGISTRY_SERVER="$(cfg registry.server)"
 REGISTRY_USER="$(cfg registry.username)"
 REGISTRY_TOKEN="$(cfg registry.token)"
 if [[ -n "${REGISTRY_SERVER}" && -n "${REGISTRY_USER}" && -n "${REGISTRY_TOKEN}" ]]; then
-  log "Creating ghcr-credentials..."
+  log "Creating gitea-credentials..."
-  kubectl create secret docker-registry ghcr-credentials \
+  kubectl create secret docker-registry gitea-credentials \
    --namespace="${NAMESPACE}" \
    --docker-server="${REGISTRY_SERVER}" \
    --docker-username="${REGISTRY_USER}" \
    --docker-password="${REGISTRY_TOKEN}" \
    --dry-run=client -o yaml | kubectl apply -f -
 else
-  warn "Registry credentials incomplete in config.yaml — skipping ghcr-credentials."
+  warn "Registry credentials incomplete in config.yaml — skipping gitea-credentials."
 fi
 # --- Create Cloudflare origin cert ---
@@ -114,7 +153,8 @@ kubectl create secret tls cloudflare-origin-cert \
 if [[ -n "${ADMIN_AUTH_USER}" && -n "${ADMIN_AUTH_PASSWORD}" ]]; then
  command -v htpasswd >/dev/null 2>&1 || die "Missing: htpasswd (install apache2-utils)"
  log "Creating admin-basic-auth secret..."
-  HTPASSWD="$(htpasswd -nb "${ADMIN_AUTH_USER}" "${ADMIN_AUTH_PASSWORD}")"
+  # -B forces bcrypt (Traefik BasicAuth supports it; avoids weak apr1-MD5).
  HTPASSWD="$(htpasswd -nbB "${ADMIN_AUTH_USER}" "${ADMIN_AUTH_PASSWORD}")"
  kubectl create secret generic admin-basic-auth \
    --namespace="${NAMESPACE}" \
    --from-literal=users="${HTPASSWD}" \
@@ -124,6 +164,35 @@ else
  warn "Admin panel will NOT have basic auth protection."
 fi
 # --- Create Kratos secrets (Ory Kratos identity service) ---
 # Created only when config.yaml has a kratos.dsn. Until then 03-deploy.sh skips
 # the Kratos deploy entirely, so the existing stack is unaffected.
 KRATOS_DSN="$(cfg kratos.dsn 2>/dev/null || true)"
 if [[ -n "${KRATOS_DSN}" ]]; then
  log "Creating kratos-secrets..."
  KR_COOKIE="$(cfg kratos.secrets_cookie 2>/dev/null || true)"
  KR_CIPHER="$(cfg kratos.secrets_cipher 2>/dev/null || true)"
  KR_SMTP="$(cfg kratos.smtp_connection_uri 2>/dev/null || true)"
  KR_GOOGLE="$(cfg kratos.google_client_secret 2>/dev/null || true)"
  KR_APPLE="$(cfg kratos.apple_private_key 2>/dev/null || true)"
  [[ -n "${KR_COOKIE}" && -n "${KR_CIPHER}" ]] \
    || die "kratos.secrets_cookie / secrets_cipher must be set (generate once: openssl rand -hex 16)"
  [[ ${#KR_CIPHER} -eq 32 ]] \
    || die "kratos.secrets_cipher must be exactly 32 characters (openssl rand -hex 16)"
  kubectl create secret generic kratos-secrets \
    --namespace="${NAMESPACE}" \
    --from-literal="dsn=${KRATOS_DSN}" \
    --from-literal="secrets_cookie=${KR_COOKIE}" \
    --from-literal="secrets_cipher=${KR_CIPHER}" \
    --from-literal="smtp_connection_uri=${KR_SMTP}" \
    --from-literal="google_client_secret=${KR_GOOGLE}" \
    --from-literal="apple_private_key=${KR_APPLE}" \
    --dry-run=client -o yaml | kubectl apply -f -
 else
  warn "config.yaml has no kratos.dsn — skipping kratos-secrets (Kratos not yet configured)."
 fi
 # --- Done ---
 log ""
@@ -81,20 +81,24 @@ if [[ "${SKIP_BUILD}" == "false" ]]; then
  log "Logging in to ${REGISTRY_SERVER}..."
  printf '%s' "${REGISTRY_TOKEN}" | docker login "${REGISTRY_SERVER}" -u "${REGISTRY_USER}" --password-stdin >/dev/null
-  log "Building API image: ${API_IMAGE}"
+  # k3s nodes are linux/amd64 (Hetzner CX). Force the build platform so
-  docker build --target api -t "${API_IMAGE}" "${REPO_DIR}"
+  # local arm64 Macs don't push images that crash with "exec format error".
  BUILD_PLATFORM="linux/amd64"
-  log "Building Worker image: ${WORKER_IMAGE}"
+  log "Building API image: ${API_IMAGE} (${BUILD_PLATFORM})"
-  docker build --target worker -t "${WORKER_IMAGE}" "${REPO_DIR}"
+  docker build --platform "${BUILD_PLATFORM}" --target api -t "${API_IMAGE}" "${REPO_DIR}"
-  log "Building Admin image: ${ADMIN_IMAGE} (NEXT_PUBLIC_API_URL=${ADMIN_API_URL})"
+  log "Building Worker image: ${WORKER_IMAGE} (${BUILD_PLATFORM})"
-  docker build --target admin \
+  docker build --platform "${BUILD_PLATFORM}" --target worker -t "${WORKER_IMAGE}" "${REPO_DIR}"
  log "Building Admin image: ${ADMIN_IMAGE} (${BUILD_PLATFORM}, NEXT_PUBLIC_API_URL=${ADMIN_API_URL})"
  docker build --platform "${BUILD_PLATFORM}" --target admin \
    --build-arg "NEXT_PUBLIC_API_URL=${ADMIN_API_URL}" \
    -t "${ADMIN_IMAGE}" "${REPO_DIR}"
  if [[ -n "${WEB_REPO_DIR}" && -f "${WEB_REPO_DIR}/Dockerfile" ]]; then
-    log "Building Web image: ${WEB_IMAGE} (NEXT_PUBLIC_API_URL=${WEB_API_URL})"
+    log "Building Web image: ${WEB_IMAGE} (${BUILD_PLATFORM}, NEXT_PUBLIC_API_URL=${WEB_API_URL})"
-    docker build \
+    docker build --platform "${BUILD_PLATFORM}" \
      --build-arg "NEXT_PUBLIC_API_URL=${WEB_API_URL}" \
      --build-arg "NEXT_PUBLIC_POSTHOG_KEY=${NEXT_PUBLIC_POSTHOG_KEY}" \
      --build-arg "NEXT_PUBLIC_POSTHOG_HOST=${NEXT_PUBLIC_POSTHOG_HOST}" \
@@ -124,6 +128,56 @@ else
  warn "Skipping build. Using images for tag: ${DEPLOY_TAG}"
 fi
 # --- Resolve immutable image digests (audit F5) ---
 # A short-SHA tag is mutable — anyone who can push to the registry can
 # overwrite it, and imagePullPolicy then pulls the new bits silently. We
 # deploy by @sha256: digest instead, pinning the exact image that was just
 # built and pushed. `docker push` populates RepoDigests; with --skip-build
 # (no local image) resolve_ref falls back to the tag.
 # resolve_ref IMAGE — print the reference to deploy for IMAGE on stdout.
 #
 # Prints the first RepoDigests entry containing "@sha256:" that
 # `docker inspect` reports for IMAGE. When none is found (inspect fails or
 # the image has no repo digest) it warns and prints IMAGE unchanged, so the
 # caller still receives a usable tag-based reference.
 #
 # Callers capture stdout with $(...), so the warning is explicitly sent to
 # stderr to guarantee it can never end up inside the captured reference.
 resolve_ref() {
  local img="$1" digest
  # `|| true` keeps a failed inspect / non-matching grep from yielding a
  # non-zero status; an empty ${digest} is handled below instead.
  digest="$(docker inspect --format='{{range .RepoDigests}}{{println .}}{{end}}' "${img}" 2>/dev/null | grep -m1 '@sha256:' || true)"
  if [[ -z "${digest}" ]]; then
    warn "could not resolve a digest for ${img} — deploying by mutable tag" >&2
    printf '%s' "${img}"
    return 0
  fi
  printf '%s' "${digest}"
 }
 API_REF="$(resolve_ref "${API_IMAGE}")"
 WORKER_REF="$(resolve_ref "${WORKER_IMAGE}")"
 ADMIN_REF="$(resolve_ref "${ADMIN_IMAGE}")"
 WEB_REF="$(resolve_ref "${WEB_IMAGE}")"
 log "Deploying by digest:"
 log "  API:    ${API_REF}"
 log "  Worker: ${WORKER_REF}"
 log "  Admin:  ${ADMIN_REF}"
 # --- Image scan + signing (audit CODE-L5) ---
 # Both steps are best-effort: the deploy does NOT fail if the tools are
 # absent, so an operator who has not set up cosign/trivy yet is not blocked.
 # Install trivy + cosign and export COSIGN_KEY to enforce. Cluster-side
 # admission verification (Kyverno/Connaisseur) is a separate operator step.
 if [[ "${SKIP_BUILD}" == "false" ]]; then
  # Vulnerability scan (best-effort). --exit-code 1 makes Trivy exit non-zero
  # when HIGH/CRITICAL findings exist, so the `|| warn` below actually fires
  # for findings (with --exit-code 0 it could only fire when the scan itself
  # errored). The `|| warn` still swallows the non-zero status, so the deploy
  # is never blocked by scan results or scan errors.
  if command -v trivy >/dev/null 2>&1; then
    log "Scanning images with Trivy (HIGH,CRITICAL)..."
    for img in "${API_IMAGE}" "${WORKER_IMAGE}" "${ADMIN_IMAGE}"; do
      trivy image --severity HIGH,CRITICAL --exit-code 1 --quiet "${img}" \
        || warn "Trivy reported findings for ${img}"
    done
  else
    warn "trivy not installed — skipping image vulnerability scan (audit L5)"
  fi
  if command -v cosign >/dev/null 2>&1 && [[ -n "${COSIGN_KEY:-}" ]]; then
    log "Signing images with cosign..."
    for ref in "${API_REF}" "${WORKER_REF}" "${ADMIN_REF}"; do
      cosign sign --yes --key "${COSIGN_KEY}" "${ref}" || warn "cosign sign failed for ${ref}"
    done
  else
    warn "cosign not configured (need cosign + COSIGN_KEY) — skipping image signing (audit L5)"
  fi
 fi
 # --- Generate and apply ConfigMap from config.yaml ---
 log "Generating env from config.yaml..."
@@ -142,24 +196,95 @@ kubectl create configmap honeydue-config \
 log "Applying manifests..."
 kubectl apply -f "${MANIFESTS}/namespace.yaml"
 # NetworkPolicies first — default-deny-all + per-app allow rules.
 # These MUST be applied; without them the cluster falls back to default-allow
 # (worse posture) AND the vmagent egress rule for :6443 (which fixes a k3s
 # post-DNAT enforcement quirk for k8s API discovery) is missing.
 # See deploy-k3s/RUNBOOK.md ("vmagent SD broken on fresh deploy").
 kubectl apply -f "${MANIFESTS}/network-policies.yaml"
 kubectl apply -f "${MANIFESTS}/redis/"
 kubectl apply -f "${MANIFESTS}/ingress/"
 # --- Run migrations BEFORE rolling api/worker ---
 #
 # goose-based migration Job. We delete any prior Job (Jobs are immutable —
 # applying a duplicate name otherwise fails), apply a fresh one with the new
 # api image (which includes /usr/local/bin/goose and /app/migrations), and
 # block until it succeeds. A failure aborts the deploy before any new app
 # pod sees a stale schema.
 log "Running database migrations (goose Job)..."
 kubectl delete job honeydue-migrate -n "${NAMESPACE}" --ignore-not-found --wait=true >/dev/null
 sed "s|image: IMAGE_PLACEHOLDER|image: ${API_REF}|" "${MANIFESTS}/migrate/job.yaml" | kubectl apply -f -
 if ! kubectl wait --namespace="${NAMESPACE}" --for=condition=complete --timeout=10m job/honeydue-migrate; then
  warn "migration Job failed — see logs:"
  kubectl logs -n "${NAMESPACE}" job/honeydue-migrate --tail=200 || true
  die "migrations did not complete cleanly; aborting deploy"
 fi
 log "Migrations applied; proceeding with api/worker rollout"
 # Apply deployments with image substitution
-sed "s|image: IMAGE_PLACEHOLDER|image: ${API_IMAGE}|" "${MANIFESTS}/api/deployment.yaml" | kubectl apply -f -
+sed "s|image: IMAGE_PLACEHOLDER|image: ${API_REF}|" "${MANIFESTS}/api/deployment.yaml" | kubectl apply -f -
 kubectl apply -f "${MANIFESTS}/api/service.yaml"
 kubectl apply -f "${MANIFESTS}/api/hpa.yaml"
-sed "s|image: IMAGE_PLACEHOLDER|image: ${WORKER_IMAGE}|" "${MANIFESTS}/worker/deployment.yaml" | kubectl apply -f -
+sed "s|image: IMAGE_PLACEHOLDER|image: ${WORKER_REF}|" "${MANIFESTS}/worker/deployment.yaml" | kubectl apply -f -
-sed "s|image: IMAGE_PLACEHOLDER|image: ${ADMIN_IMAGE}|" "${MANIFESTS}/admin/deployment.yaml" | kubectl apply -f -
+sed "s|image: IMAGE_PLACEHOLDER|image: ${ADMIN_REF}|" "${MANIFESTS}/admin/deployment.yaml" | kubectl apply -f -
 kubectl apply -f "${MANIFESTS}/admin/service.yaml"
 if [[ -d "${MANIFESTS}/web" ]]; then
-  sed "s|image: IMAGE_PLACEHOLDER|image: ${WEB_IMAGE}|" "${MANIFESTS}/web/deployment.yaml" | kubectl apply -f -
+  sed "s|image: IMAGE_PLACEHOLDER|image: ${WEB_REF}|" "${MANIFESTS}/web/deployment.yaml" | kubectl apply -f -
  kubectl apply -f "${MANIFESTS}/web/service.yaml"
 fi
 # Observability — vmagent scrapes api Pods :8000/metrics, worker Pods
 # :6060/metrics, node-exporter Pods :9100/metrics and kube-state-metrics
 # :8080/metrics, and remote-writes everything to obs.88oakapps.com. The bearer
 # token comes from deploy/prod.env so it stays out of the repo; the manifest
 # holds TOKEN_PLACEHOLDER. kube-state-metrics provides the kube_* metrics
 # Grafana panels need to count pods, deployments, etc.
 if [[ -d "${MANIFESTS}/observability" ]]; then
  # kube-state-metrics — no secrets, plain apply
  kubectl apply -f "${MANIFESTS}/observability/kube-state-metrics.yaml"
  # vmagent — needs the bearer-token substitution
  # prod.env lives at the repo's deploy/ dir (sibling of deploy-k3s/), not
  # under deploy-k3s/. It's gitignored — operator copies values there once.
  OBS_TOKEN="$(grep -E '^OBS_INGEST_TOKEN=' "${REPO_DIR}/deploy/prod.env" 2>/dev/null | cut -d= -f2- || true)"
  if [[ -z "${OBS_TOKEN}" ]]; then
    warn "OBS_INGEST_TOKEN not found in deploy/prod.env — skipping vmagent + alloy-logs apply"
  else
    sed "s|TOKEN_PLACEHOLDER|${OBS_TOKEN}|" "${MANIFESTS}/observability/vmagent.yaml" | kubectl apply -f -
    # alloy-logs — DaemonSet that tails honeydue pod logs and pushes them to
    # Loki at obs.88oakapps.com. Same OBS_INGEST_TOKEN as vmagent.
    if [[ -f "${MANIFESTS}/observability/alloy-logs.yaml" ]]; then
      sed "s|TOKEN_PLACEHOLDER|${OBS_TOKEN}|" "${MANIFESTS}/observability/alloy-logs.yaml" | kubectl apply -f -
    fi
  fi
 fi
 # --- Ory Kratos (identity service) ---
 # Applied only when kratos-secrets exists — i.e. the operator has completed the
 # Kratos prerequisites in deploy-k3s/manifests/kratos/README.md. Otherwise
 # skipped, so the existing stack deploys unaffected.
 if kubectl -n "${NAMESPACE}" get secret kratos-secrets >/dev/null 2>&1; then
  log "Deploying Ory Kratos..."
  kubectl apply -f "${MANIFESTS}/kratos/configmap.yaml"
  # The migrate Job is immutable — delete any prior run, then apply + wait.
  kubectl delete job kratos-migrate -n "${NAMESPACE}" --ignore-not-found --wait=true >/dev/null
  kubectl apply -f "${MANIFESTS}/kratos/migrate-job.yaml"
  if ! kubectl wait --namespace="${NAMESPACE}" --for=condition=complete --timeout=5m job/kratos-migrate; then
    warn "Kratos migration Job failed — logs:"
    kubectl logs -n "${NAMESPACE}" job/kratos-migrate --tail=100 || true
    die "aborting: Kratos schema migration failed"
  fi
  kubectl apply -f "${MANIFESTS}/kratos/kratos.yaml"
  kubectl apply -f "${MANIFESTS}/kratos/ingress.yaml"
 else
  log "kratos-secrets not present — skipping Kratos deploy (see manifests/kratos/README.md)."
 fi
 # --- Wait for rollouts ---
 log "Waiting for rollouts..."
@@ -171,6 +296,15 @@ kubectl rollout status deployment/admin -n "${NAMESPACE}" --timeout=300s
 if [[ -d "${MANIFESTS}/web" ]]; then
  kubectl rollout status deployment/web -n "${NAMESPACE}" --timeout=300s
 fi
 if kubectl -n "${NAMESPACE}" get deployment vmagent >/dev/null 2>&1; then
  kubectl rollout status deployment/vmagent -n "${NAMESPACE}" --timeout=120s
 fi
 if kubectl -n "${NAMESPACE}" get daemonset alloy-logs >/dev/null 2>&1; then
  kubectl rollout status daemonset/alloy-logs -n "${NAMESPACE}" --timeout=120s
 fi
 if kubectl -n "${NAMESPACE}" get deployment kratos >/dev/null 2>&1; then
  kubectl rollout status deployment/kratos -n "${NAMESPACE}" --timeout=180s
 fi
 # --- Done ---
@@ -100,7 +100,7 @@ lines = [
    # API
    'DEBUG=false',
    f\"ALLOWED_HOSTS={d['api']},{d['base']}\",
-    f\"CORS_ALLOWED_ORIGINS=https://{d['base']},https://{d['admin']}\",
+    f\"CORS_ALLOWED_ORIGINS=https://{d['base']},https://{d['admin']},https://{d.get('app', 'app.' + d['base'])}\",
    'TIMEZONE=UTC',
    f\"BASE_URL=https://{d['base']}\",
    'PORT=8000',
@@ -118,8 +118,15 @@ lines = [
    f\"DB_MAX_OPEN_CONNS={db['max_open_conns']}\",
    f\"DB_MAX_IDLE_CONNS={db['max_idle_conns']}\",
    f\"DB_MAX_LIFETIME={db['max_lifetime']}\",
-    # Redis (K8s internal DNS — password injected if configured)
+    f\"DB_MAX_IDLE_TIME={db.get('max_idle_time', '0s')}\",
-    f\"REDIS_URL=redis://{':%s@' % val(rd.get('password')) if rd.get('password') else ''}redis.honeydue.svc.cluster.local:6379/0\",
+    # Redis — in-namespace DNS short form (works because pod /etc/resolv.conf
    # searches honeydue.svc.cluster.local). Audit HIGH-1: the password is
    # intentionally NOT embedded here. This URL is emitted into the
    # honeydue-config ConfigMap, which is NOT encrypted at rest and is
    # readable by anyone with `get configmap`. The Redis password travels
    # only in honeydue-secrets as REDIS_PASSWORD (file-mounted, F8); the API
    # applies it in cache_service.go and the worker onto its Asynq opt.
    'REDIS_URL=redis://redis:6379/0',
    'REDIS_DB=0',
    # Email
    f\"EMAIL_HOST={em['host']}\",
@@ -139,12 +146,21 @@ lines = [
    f\"OVERDUE_REMINDER_HOUR={wk['overdue_reminder_hour']}\",
    f\"DAILY_DIGEST_HOUR={wk['daily_digest_hour']}\",
    # B2 Storage
-    f\"B2_KEY_ID={val(st['b2_key_id'])}\",
+    # B2_KEY_ID and B2_APP_KEY are intentionally NOT emitted into the
-    f\"B2_APP_KEY={val(st['b2_app_key'])}\",
+    # ConfigMap — they're credentials and belong in honeydue-secrets
    # (set by 02-setup-secrets.sh). Wire them into the api/worker
    # deployments via envFrom: secretRef when B2 uploads need to be
    # active. Leaving them in cleartext here would leak via
    # \"kubectl get cm\".
    f\"B2_BUCKET_NAME={val(st['b2_bucket'])}\",
    f\"B2_ENDPOINT={val(st['b2_endpoint'])}\",
    f\"B2_REGION={val(st.get('b2_region'))}\",
    f\"B2_USE_SSL={b(st.get('b2_use_ssl', True))}\",
    f\"STORAGE_MAX_FILE_SIZE={st['max_file_size']}\",
    f\"STORAGE_ALLOWED_TYPES={st['allowed_types']}\",
    f\"STORAGE_UPLOAD_DIR={val(st.get('upload_dir', '/app/uploads'))}\",
    f\"STORAGE_BASE_URL={val(st.get('base_url', '/uploads'))}\",
    f\"STATIC_DIR={val(st.get('static_dir', '/app/static'))}\",
    # Features
    f\"FEATURE_PUSH_ENABLED={b(ft['push_enabled'])}\",
    f\"FEATURE_EMAIL_ENABLED={b(ft['email_enabled'])}\",
@@ -207,8 +223,18 @@ config = {
        'image': 'ubuntu-24.04',
    },
    'additional_packages': ['open-iscsi'],
-    'post_create_commands': ['sudo systemctl enable --now iscsid'],
+    # Audit K3S-CG2: harden the node OS at provision time — fail2ban for SSH
-    'k3s_config_file': 'secrets-encryption: true\n',
+    # brute-force, unattended-upgrades for automatic security patches.
    'post_create_commands': [
        'sudo systemctl enable --now iscsid',
        'sudo apt-get update -qq',
        'sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq fail2ban unattended-upgrades',
        'sudo systemctl enable --now fail2ban',
        'sudo dpkg-reconfigure -f noninteractive -plow unattended-upgrades',
    ],
    # Audit K3S-CG1 / K3S-F4: encrypt Secrets at rest in etcd, and write the
    # node kubeconfig as mode 0600 (not world-readable).
    'k3s_config_file': 'secrets-encryption: true\nwrite-kubeconfig-mode: \"0600\"\n',
 }
 print(yaml.dump(config, default_flow_style=False, sort_keys=False))
@@ -0,0 +1,39 @@
 {
  "$id": "https://honeydue.app/identity.schema.json",
  "$schema": "http://json-schema.org/draft-07/schema#",
  "title": "honeyDue user",
  "type": "object",
  "properties": {
    "traits": {
      "type": "object",
      "properties": {
        "email": {
          "type": "string",
          "format": "email",
          "title": "Email",
          "minLength": 3,
          "maxLength": 320,
          "ory.sh/kratos": {
            "credentials": {
              "password": { "identifier": true },
              "code": { "identifier": true, "via": "email" },
              "totp": { "account_name": true }
            },
            "verification": { "via": "email" },
            "recovery": { "via": "email" }
          }
        },
        "name": {
          "type": "object",
          "title": "Name",
          "properties": {
            "first": { "type": "string", "title": "First name", "maxLength": 100 },
            "last": { "type": "string", "title": "Last name", "maxLength": 100 }
          }
        }
      },
      "required": ["email"],
      "additionalProperties": false
    }
  }
 }
@@ -0,0 +1,101 @@
 version: v1.3.0
 serve:
  public:
    base_url: http://localhost:4433/
    cors:
      enabled: true
      allowed_origins:
        - http://localhost
        - http://localhost:3000
        - http://localhost:8000
        - http://127.0.0.1
      allowed_methods: [GET, POST, PUT, PATCH, DELETE, OPTIONS]
      allowed_headers: [Authorization, Content-Type, X-Session-Token, Cookie]
      exposed_headers: [Content-Type, Set-Cookie]
      allow_credentials: true
  admin:
    base_url: http://kratos:4434/
 selfservice:
  default_browser_return_url: http://localhost:8000/
  allowed_return_urls:
    - http://localhost:8000
    - honeydue://callback
  methods:
    password:
      enabled: true
      config:
        min_password_length: 8
        identifier_similarity_check_enabled: false
    code:
      enabled: true
    oidc:
      enabled: false
  flows:
    error:
      ui_url: http://localhost:8000/auth/error
    login:
      ui_url: http://localhost:8000/auth/login
      lifespan: 10m
    registration:
      ui_url: http://localhost:8000/auth/registration
      lifespan: 10m
      after:
        password:
          hooks:
            - hook: session
    verification:
      enabled: true
      ui_url: http://localhost:8000/auth/verification
      use: code
      after:
        default_browser_return_url: http://localhost:8000/
    recovery:
      enabled: true
      ui_url: http://localhost:8000/auth/recovery
      use: code
    settings:
      ui_url: http://localhost:8000/auth/settings
      privileged_session_max_age: 15m
    logout:
      after:
        default_browser_return_url: http://localhost:8000/
 # Local development only: debug-level text logs with sensitive values left
 # unredacted. Do not copy these log settings into a deployed config.
 log:
  level: debug
  format: text
  leak_sensitive_values: true
 # Placeholder secrets, committed on purpose for the local stack only.
 # NOTE(review): any non-local deployment must supply its own values.
 secrets:
  cookie:
    - local-dev-cookie-secret-please-change-this-32chars
  # 32-character key; presumably the length the xchacha20-poly1305 cipher
  # configured below expects — confirm against the Kratos config reference.
  cipher:
    - 0123456789abcdef0123456789abcdef
 ciphers:
  algorithm: xchacha20-poly1305
 hashers:
  algorithm: bcrypt
  bcrypt:
    # Low work factor — presumably chosen to keep local registration/login
    # fast. NOTE(review): confirm deployed configs use a higher cost.
    cost: 8
 identity:
  default_schema_id: honeydue
  schemas:
    - id: honeydue
      url: file:///etc/config/kratos/identity.schema.json
 courier:
  smtp:
    connection_uri: smtp://mailpit:1025/?disable_starttls=true
    from_address: noreply@localhost
    from_name: honeyDue Local
 session:
  lifespan: 720h
  cookie:
    same_site: Lax
@@ -35,7 +35,7 @@ DEFAULT_FROM_EMAIL=honeyDue <noreply@honeyDue.treytartt.com>
 # APNS private key goes in deploy/secrets/apns_auth_key.p8
 APNS_AUTH_KEY_ID=CHANGEME_APNS_KEY_ID
 APNS_TEAM_ID=CHANGEME_APNS_TEAM_ID
-APNS_TOPIC=com.tt.honeyDue
+APNS_TOPIC=com.myhoneydue.honeyDue
 APNS_USE_SANDBOX=false
 APNS_PRODUCTION=true
@@ -80,7 +80,11 @@ FEATURE_PDF_REPORTS_ENABLED=true
 FEATURE_WORKER_ENABLED=true
 # Optional auth/iap values
-APPLE_CLIENT_ID=
+# APPLE_CLIENT_ID must equal the iOS Release bundle ID. The Apple
 # identity-token `aud` claim is verified against this value
 # (internal/services/apple_auth.go::verifyAudience). Leaving it empty
 # with DEBUG=false rejects every Apple token as invalid audience.
 APPLE_CLIENT_ID=com.myhoneydue.honeyDue
 APPLE_TEAM_ID=
 GOOGLE_CLIENT_ID=
 GOOGLE_ANDROID_CLIENT_ID=
@@ -1,6 +1,31 @@
 #!/usr/bin/env bash
 set -euo pipefail
 # DEPRECATED — production migrated from Docker Swarm to k3s on 2026-04-24.
 # This script targets the old Swarm manager + registry flow and will fail
 # at the SSH/Swarm validation step because hetzner1 no longer runs dockerd.
 #
 # Use the k3s deploy stack instead:
 #
 #   export KUBECONFIG="$(pwd)/deploy-k3s/kubeconfig"
 #   ./deploy-k3s/scripts/03-deploy.sh
 #
 # If you don't have deploy-k3s/kubeconfig locally, fetch it once:
 #   ssh -i ~/.ssh/hetzner deploy@hetzner1 'sudo cat /etc/rancher/k3s/k3s.yaml' \
 #     | sed 's|server: https://127.0.0.1:6443|server: https://178.104.247.152:6443|' \
 #     > deploy-k3s/kubeconfig
 #   chmod 600 deploy-k3s/kubeconfig
 #
 # To override and run anyway (do NOT do this casually), set:
 #   ALLOW_LEGACY_SWARM_DEPLOY=1 ./deploy/scripts/deploy_prod.sh
 if [[ "${ALLOW_LEGACY_SWARM_DEPLOY:-0}" != "1" ]]; then
  # Opt-in guard: anything other than ALLOW_LEGACY_SWARM_DEPLOY=1 (including
  # unset, which defaults to 0) stops the script here.
  # printf reuses the format string for each argument, so every message
  # below is emitted on its own "[deploy][error] "-prefixed line, to stderr.
  printf '[deploy][error] %s\n' \
    "deploy_prod.sh is the legacy Docker Swarm flow. Production now runs on k3s." \
    "Use ./deploy-k3s/scripts/03-deploy.sh instead (see top of this script for setup)." \
    "If you really need the old Swarm path, set ALLOW_LEGACY_SWARM_DEPLOY=1." >&2
  exit 1
 fi
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 DEPLOY_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
 REPO_DIR="$(cd "${DEPLOY_DIR}/.." && pwd)"
@@ -14,6 +14,7 @@ services:
      POSTGRES_DB: ${POSTGRES_DB:-honeydue}
    volumes:
      - postgres_data:/var/lib/postgresql/data
      - ./deploy/local/postgres-init:/docker-entrypoint-initdb.d:ro
    ports:
      - "${DB_PORT:-5433}:5432"  # 5433 externally to avoid conflicts with local postgres
    healthcheck:
@@ -85,12 +86,16 @@ services:
      APNS_AUTH_KEY_PATH: ${APNS_AUTH_KEY_PATH}
      APNS_AUTH_KEY_ID: ${APNS_AUTH_KEY_ID}
      APNS_TEAM_ID: ${APNS_TEAM_ID}
-      APNS_TOPIC: ${APNS_TOPIC:-com.tt.honeyDue}
+      APNS_TOPIC: ${APNS_TOPIC:-com.myhoneydue.honeyDue.dev}
      APNS_USE_SANDBOX: "true"
      FCM_SERVER_KEY: ${FCM_SERVER_KEY}
      # Storage encryption
      STORAGE_ENCRYPTION_KEY: ${STORAGE_ENCRYPTION_KEY}
      # Kratos (identity service)
      KRATOS_PUBLIC_URL: "http://kratos:4433"
      KRATOS_ADMIN_URL: "http://kratos:4434"
    volumes:
      - ./push_certs:/certs:ro
      - ./uploads:/app/uploads
@@ -99,6 +104,8 @@ services:
        condition: service_healthy
      redis:
        condition: service_healthy
      kratos:
        condition: service_healthy
    healthcheck:
      test: ["CMD", "curl", "-f", "http://127.0.0.1:8000/api/health/"]
      interval: 30s
@@ -158,7 +165,7 @@ services:
      APNS_AUTH_KEY_PATH: "/certs/apns_key.p8"
      APNS_AUTH_KEY_ID: ${APNS_AUTH_KEY_ID}
      APNS_TEAM_ID: ${APNS_TEAM_ID}
-      APNS_TOPIC: ${APNS_TOPIC:-com.tt.honeyDue}
+      APNS_TOPIC: ${APNS_TOPIC:-com.myhoneydue.honeyDue.dev}
      APNS_USE_SANDBOX: "true"
      FCM_SERVER_KEY: ${FCM_SERVER_KEY}
@@ -184,6 +191,59 @@ services:
    networks:
      - honeydue-network
  # Mailpit — local SMTP catcher (for Kratos email codes during onboarding)
  mailpit:
    image: axllent/mailpit:latest
    container_name: honeydue-mailpit
    restart: unless-stopped
    ports:
      - "${MAILPIT_SMTP_PORT:-1025}:1025"
      - "${MAILPIT_HTTP_PORT:-8025}:8025"
    networks:
      - honeydue-network
  # Kratos schema migration (one-shot, runs before kratos starts)
  kratos-migrate:
    image: oryd/kratos:v1.3.0
    container_name: honeydue-kratos-migrate
    command: ["migrate", "sql", "-e", "--yes"]
    environment:
      DSN: "postgres://${POSTGRES_USER:-honeydue}:${POSTGRES_PASSWORD:-honeydue_dev_password}@db:5432/kratos?sslmode=disable"
    depends_on:
      db:
        condition: service_healthy
    networks:
      - honeydue-network
    restart: "no"
  # Ory Kratos — identity service
  kratos:
    image: oryd/kratos:v1.3.0
    container_name: honeydue-kratos
    restart: unless-stopped
    command: ["serve", "--config", "/etc/config/kratos/kratos.yml", "--watch-courier", "--dev"]
    ports:
      - "${KRATOS_PUBLIC_PORT:-4433}:4433"
      - "${KRATOS_ADMIN_PORT:-4434}:4434"
    environment:
      DSN: "postgres://${POSTGRES_USER:-honeydue}:${POSTGRES_PASSWORD:-honeydue_dev_password}@db:5432/kratos?sslmode=disable"
      LOG_LEVEL: "debug"
    volumes:
      - ./deploy/local/kratos:/etc/config/kratos:ro
    depends_on:
      kratos-migrate:
        condition: service_completed_successfully
      mailpit:
        condition: service_started
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://127.0.0.1:4434/health/ready"]
      interval: 10s
      timeout: 5s
      retries: 10
      start_period: 10s
    networks:
      - honeydue-network
  # Dozzle — lightweight real-time log viewer
  dozzle:
    image: amir20/dozzle:latest
@@ -194,10 +194,17 @@ See [Chapter 8](./08-database.md), [9](./09-storage.md), and
  until we have Apple Developer / Google Play accounts. The env vars are
  set to sentinel values that let the Go app boot; `FEATURE_PUSH_ENABLED=false`
  gates all call sites.
- **External metrics/monitoring (Prometheus, Grafana, Betterstack).**
+- **In-cluster Prometheus / Grafana.** Self-hosted Prometheus-compatible
-  Right now we rely on `kubectl logs`, `kubectl top`, and Cloudflare's own
+  metrics + tracing + dashboards live **outside** the k3s cluster on
-  analytics. See [Chapter 15](./15-observability.md) for what's there and
+  `88oakappsUpdate` (the same Linode VPS that hosts PostHog), reached
-  what we'd add.
+  via `https://obs.88oakapps.com` (Cloudflare-fronted, bearer-gated).
  A `vmagent` Deployment in the honeydue namespace scrapes the api Pods
  and remote-writes out. This frees ~700 MB of cluster RAM and means
  observability survives a k3s control-plane incident. See
  [Chapter 15](./15-observability.md).
 - **Alerting.** No PagerDuty, Slack hooks, or pages-on-error wired up
  yet. Histograms are flowing into Grafana — adding alert rules on top of
  them is the next step. See [Chapter 15 — Future](./15-observability.md).
 - **Automated backups of Redis state.** Redis is configured with AOF
  (append-only file) persistence, but the PVC is only on one node. Redis
  holds only cache + Asynq queue state; losing it re-populates on first
@@ -27,23 +27,27 @@ that every legitimate port be enumerated in a rule.
 Run `sudo ufw status verbose` on any node to see the live ruleset. The
 canonical ruleset below, grouped by purpose.
-### Public-facing (anywhere)
+### Public-facing
-| Port | Protocol | From | Purpose | Comment |
+| Port | Protocol | From | Purpose |
-|---|---|---|---|---|
+|---|---|---|---|
-| 22 | TCP | Anywhere | SSH | |
+| 22 | TCP | Anywhere | SSH (key-only) |
-| 80 | TCP | Anywhere | HTTP (Cloudflare → Traefik) | |
+| 443 | TCP | Cloudflare ranges (15 IPv4 + 7 IPv6) | HTTPS (CF → Traefik, TLS-terminated at Traefik) |
 | 443 | TCP | Anywhere | HTTPS (future, currently unused at origin) | |
-**Why 443 is open but unused**: We're on Cloudflare SSL=Flexible, so
+**Port :80 is closed** on all three nodes. CF is in Full (strict) mode
-Cloudflare talks to origin over plain HTTP:80. Port 443 on origin is
+and initiates every request on :443 to the origin. Cloudflare's
-only hit by misconfigured clients (who bypass CF DNS and hit node IPs
+"Always Use HTTPS" turns any plaintext client request into HTTPS at
-directly). Traefik's config accepts it but we don't require it. Keeping
+the edge, so the origin never needs to accept :80.
 it open smooths a future switch to Full (strict) SSL mode.
-**Future hardening**: Restrict 80 and 443 to Cloudflare's published IP
+**Port :443 is restricted to Cloudflare** via 22 UFW allow rules per
-ranges (15 IPv4 CIDRs, 7 IPv6 CIDRs). See [Chapter 13](./13-cloudflare.md)
+node (one per CF CIDR). Direct-connect from any non-CF IP is dropped
-for the ranges and the UFW rule format. Today they're open to anyone.
+at the kernel. This closes the "node IP leak = bypass CF WAF/DDoS"
 hole entirely. See [Chapter 13](./13-cloudflare.md#cloudflare-ip-ranges-used-in-traefik-trustedips)
 for the exact ranges and UFW rule format.
 **Refresh cadence**: CF updates its IP ranges rarely. A monthly
 `curl https://www.cloudflare.com/ips-v4` diff and UFW re-apply is
 enough. Automation TODO (Chapter 20).
 ### SSH (operator access)
@@ -8,6 +8,13 @@ long-haul components, and dedicated service accounts with dropped
 capabilities inside containers. This chapter documents each layer, the
 rationale, and what's currently missing (and why).
 > **Updated 2026-05-15 — security remediation.** The 2026-05 audits
 > (`live_scan_5_12.md`, `k3_audit_5_12.md`, `security_scan_5_12.md`) drove a
 > full remediation pass. **`deploy-k3s/SECURITY.md` is the authoritative,
 > per-finding current-state record.** This chapter is corrected for the
 > major items below; where any other detail conflicts with `SECURITY.md`,
 > `SECURITY.md` wins.
 ## Threat model
 Who we're defending against, in rough order of likelihood:
@@ -54,8 +61,8 @@ Cloudflare sits in front of every public request.
 - **Authorize requests** — that's the app's job
 - **Protect origin if origin IP leaks** — once someone knows a node IP
  they can bypass CF. Mitigation: keep origin firewall strict (Chapter 4).
- **Encrypt between CF and origin** — we're on SSL=Flexible, so CF↔origin
+- **~~Encrypt between CF and origin~~** — done (2026-04-24): SSL mode is
-  is HTTP. This is in our TODO (Chapter 20, upgrade to Full-strict).
+  Full (strict); CF↔origin is TLS with a Cloudflare Origin CA cert.
 ### The proxy-IP problem
@@ -75,8 +82,8 @@ This means a malicious request that bypasses CF (by hitting the node IP
 directly) can't spoof headers — Traefik ignores `X-Forwarded-*` unless
 the source IP is in CF's ranges.
-**TODO** (Chapter 20): Enforce at UFW level — allow 80/tcp only from
+**Done (2026-04-24):** the node UFW allowlist permits `:443` only from
-CF IP ranges. Today any IP can reach the origin on port 80.
+Cloudflare's IP ranges; the `Anywhere` rules on `:80`/`:443` were removed.
 ## Layer 2 — Node (OS, SSH, firewall)
@@ -297,15 +304,13 @@ The `deploy-k3s/manifests/network-policies.yaml` scaffold defines:
  reach api pods on port 8000
 - **allow-ingress-to-admin** — same, for admin:3000
-**These are not currently applied.** Without them, our pods can freely
+**Applied.** `03-deploy.sh` applies
-talk to anything — including, theoretically, malicious destinations if
+`deploy-k3s/manifests/network-policies.yaml` on every deploy — default-deny
-an attacker gets RCE inside a pod.
+plus the explicit per-app allows below. Traefik runs `hostNetwork`, so its
 traffic is matched by node-IP `ipBlock`s plus the pod CIDR `10.42.0.0/16`,
 not a `namespaceSelector`.
-**TODO** (Chapter 20): Apply network policies. The scaffold is there; we
+### What network policies prevent
 just need to `kubectl apply -f deploy-k3s/manifests/network-policies.yaml`
 and test that nothing breaks.
 ### What network policies would prevent
 | Attack scenario | NetworkPolicy blocks |
 |---|---|
@@ -324,13 +329,10 @@ renewed Let's Encrypt or CF-managed cert for `*.myhoneydue.com`.
 ### CF ↔ origin
-**Plaintext HTTP** (SSL = Flexible). An attacker with access to the
+**TLS — SSL = Full (strict)** (since 2026-04-24). A Cloudflare Origin CA
-Cloudflare-to-Hetzner path could read traffic. In practice nobody who
+certificate (`cloudflare-origin-cert` secret) is installed on all three
-isn't Cloudflare or Hetzner sits on that path.
+ingresses; Cloudflare validates it. Both user↔CF and CF↔origin are
-
+encrypted, and a DNS-hijack MitM is defeated by the origin-cert check.
 **TODO** (Chapter 20): Upgrade to SSL = Full (strict) with a Cloudflare
 Origin CA certificate. This encrypts CF ↔ origin and verifies that
 origin's cert is the CF-issued one (prevents MitM if DNS is compromised).
 ### API ↔ Neon Postgres
@@ -454,11 +456,14 @@ Mitigations:
 - Gitea itself is behind login; PAT is scoped to read:packages +
  write:packages only
 - Gitea runs on the operator's infrastructure (same operator account)
- Image tags are SHA-pinned (`:237c6b8`) not `:latest` → attacker can't
+- Workloads deploy by immutable `@sha256:` digest, not by mutable tag
-  replace an existing tag's image without us noticing the digest change
+  (`03-deploy.sh` resolves the digest after push; the redis/vmagent/node
  base images are digest-pinned too) — a swapped tag cannot reach the
  cluster.
-**TODO** (Chapter 20): Add cosign signing at build time, verify at pull
+**TODO**: cosign signing is wired into `03-deploy.sh` (guarded — runs when
-time.
+`cosign` + `COSIGN_KEY` are present); cluster-side admission verification
 (Kyverno/Connaisseur) is still pending. See `deploy-k3s/SECURITY.md` → L5.
 ## Operator workstation security
@@ -1,5 +1,13 @@
 # 06 — Traefik Ingress
 > **Updated 2026-05-15 (security remediation):** the Traefik middleware set
 > changed — `cloudflare-only` + `admin-auth` are now attached to the admin
 > ingress, a strict `auth-rate-limit` middleware fronts the auth endpoints
 > (via a dedicated `honeydue-api-auth` Ingress), and `security-headers`
 > gained COOP/CORP + a 2-year preload HSTS and dropped the deprecated
 > `X-XSS-Protection`. `deploy-k3s/SECURITY.md` is the authoritative
 > current-state record.
 ## Summary
 Traefik is the reverse proxy that routes external HTTP requests to the
@@ -280,16 +288,22 @@ most Ingress controllers and matches how users think about URL routing.
 ## How requests flow
-1. **Cloudflare DNS** resolves `api.myhoneydue.com` to one of three IPs
+1. **Cloudflare DNS** resolves `api.myhoneydue.com` to a CF edge IP
-   (round-robin). Say it picks `178.105.32.198` (hetzner2).
+   (client never sees the three origin IPs — CF proxies).
-2. **Cloudflare edge** establishes TCP to `178.105.32.198:80` (plain HTTP,
+2. **Cloudflare edge** terminates TLS from the browser, then opens a
-   SSL=Flexible). Original HTTPS terminated at CF.
+   fresh TCP to one of the origin IPs on `:443` (SSL=Full (strict)).
-3. **UFW on hetzner2** accepts the SYN (80/tcp open from anywhere).
+   Say it picks `178.105.32.198` (hetzner2).
-4. **Linux kernel** sees a listener on 0.0.0.0:80 (the Traefik pod).
+3. **UFW on hetzner2** accepts the SYN — the source IP is in one of
-   Hands off the SYN.
+   the 15 CF IPv4 CIDRs allowed on `:443`. (Any non-CF source IP is
-5. **Traefik accepts** the connection. Reads the HTTP request.
+   dropped at the kernel.)
 4. **Linux kernel** sees a listener on `0.0.0.0:443` (the Traefik pod,
   hostNetwork). Hands off the SYN.
 5. **Traefik accepts** the connection, completes the TLS handshake
   using the `cloudflare-origin-cert` secret (CF Origin CA — CF
   verifies this chain on its side). Reads the plaintext HTTP request.
 6. **Traefik matches** the `Host:` header against its router table.
   `Host: api.myhoneydue.com` → `honeydue-api` Ingress → `api` Service.
   Attached middlewares (`security-headers`, `rate-limit`) run here.
 7. **Traefik dials** `10.43.167.83:8000` (api Service ClusterIP). This
   goes through the cluster DNS (CoreDNS) and kube-proxy (IPVS).
 8. **kube-proxy IPVS** rewrites the destination to a live api pod endpoint
@@ -1,10 +1,17 @@
 # 07 — Services
 > **Updated 2026-05-15 (security remediation):** Redis now requires a
 > password (`config.yaml` `redis.password` → `honeydue-secrets`), all
 > workloads deploy by immutable `@sha256:` digest, and the redis/vmagent
 > base images are digest-pinned. `deploy-k3s/SECURITY.md` is the
 > authoritative current-state record.
 ## Summary
-Four workloads run in the `honeydue` namespace: **api** (Go REST API, 3
+Five workloads run in the `honeydue` namespace: **api** (Go REST API, 3
-replicas), **admin** (Next.js panel, 1 replica), **worker** (Go background
+replicas), **admin** (Next.js admin panel, 1 replica), **web** (Next.js
-jobs, 1 replica), and **redis** (cache + job queue, 1 replica, PVC-backed).
+customer-facing app, 3 replicas), **worker** (Go background jobs, 1
 replica), and **redis** (cache + job queue, 1 replica, PVC-backed).
 This chapter deep-dives each: container image, resource limits, probes,
 volumes, and why each knob is set the way it is.
@@ -14,10 +21,11 @@ volumes, and why each knob is set the way it is.
 |---|---|---|---|---|
 | `api` | `gitea.treytartt.com/admin/honeydue-api:<sha>` | 3 | 8000 | HTTP REST API |
 | `admin` | `gitea.treytartt.com/admin/honeydue-admin:<sha>` | 1 | 3000 | Next.js admin panel |
 | `web` | `gitea.treytartt.com/admin/honeydue-web:<sha>` | 3 | 3000 | Next.js customer-facing web client at `app.myhoneydue.com` |
 | `worker` | `gitea.treytartt.com/admin/honeydue-worker:<sha>` | 1 | — | Background job processor |
 | `redis` | `redis:7-alpine` | 1 | 6379 | Cache + Asynq queue |
-All four are Kubernetes `Deployment` workloads (not StatefulSets, not
+All five are Kubernetes `Deployment` workloads (not StatefulSets, not
 DaemonSets). They share:
 - ServiceAccount with `automountServiceAccountToken: false` (Chapter 5)
 - `imagePullSecrets: [gitea-credentials]` (Chapter 11)
@@ -25,6 +33,66 @@ DaemonSets). They share:
 - Individual env vars wired to `honeydue-secrets` keys
 - Read-only root filesystem with `tmp` emptyDir mounted at `/tmp`
 ## Service — web (Next.js customer app)
 ### What it does
 Lives at `https://app.myhoneydue.com`. Next.js 16 standalone build,
 served by `node server.js` inside the container. Sibling repo:
 `/Users/treyt/Desktop/code/honeyDue/honeyDueAPI-Web/`.
 ### Architecture: server-side proxy pattern
 Unlike the admin panel (which makes CORS requests directly to
 `api.myhoneydue.com`), the web app uses a proxy pattern:
 ```
 Browser → https://app.myhoneydue.com/api/proxy/tasks/123/
       → Next.js route handler (src/app/api/proxy/[...path]/route.ts)
       → reads honeydue-token httpOnly cookie
       → attaches Authorization: Token <value>
       → https://api.myhoneydue.com/api/tasks/123/ (server-side fetch)
       → response flows back
 ```
 **Consequences:**
 - Browser never makes cross-origin requests, so the proxy path itself needs
  no CORS entry on the Go API for `app.myhoneydue.com`. (The generated
  `CORS_ALLOWED_ORIGINS` nonetheless lists the app origin.)
 - Auth tokens live in httpOnly cookies, not localStorage. XSS can't
  exfiltrate them.
 - The web pod needs outbound HTTPS to `api.myhoneydue.com` — covered
  in the `allow-egress-from-web` NetworkPolicy (Chapter 5).
 ### Env vars
 Build-time (baked into the client bundle by the Dockerfile `ARG`):
 - `NEXT_PUBLIC_API_URL` — only used as a fallback; baked for safety
 - `NEXT_PUBLIC_POSTHOG_KEY` — PostHog project API key
 - `NEXT_PUBLIC_POSTHOG_HOST` — `https://analytics.88oakapps.com`
 Runtime (ConfigMap):
 - `API_URL=https://api.myhoneydue.com/api` — consumed by the
  server-side proxy handlers
 - `PORT=3000`, `HOSTNAME=0.0.0.0`
 ### Deployment spec highlights
 - **3 replicas**, same as api — this is a production customer surface
 - `topologySpreadConstraints` across `kubernetes.io/hostname` —
  evicting one node kills at most one pod
 - `readOnlyRootFilesystem: true`; `emptyDir`s at `/app/.next/cache`
  (Next.js build cache) and `/tmp`
 - PDB `web-pdb` with `minAvailable: 2`
 - runAsUser/runAsGroup `1001` (matches the `nextjs` user created in
  the Dockerfile)
 ### Why same availability as api
 The web client is now the primary user-facing surface. Users hitting
 `app.myhoneydue.com/login` should never see a 502 because a single
 node went down. 3 replicas × `minAvailable: 2` guarantees at least
 two pods stay up through any voluntary disruption.
 ## Service 1 — api (Go REST API)
 ### What it does
@@ -113,13 +181,15 @@ doesn't run as root.
 file writes to the image layer. Go binary doesn't need to write to `/`;
 only `/tmp` is mutable.
-**`startupProbe.failureThreshold: 48`** (= 48 × 5s = 240s grace) — this
+**`startupProbe.failureThreshold: 48`** (= 48 × 5s = 240s grace) —
-was bumped up from the scaffold default of 12. Reason: on first boot,
+historically bumped from the scaffold default of 12 to absorb in-replica
-the Go app runs `MigrateWithLock()` which acquires a Postgres advisory
+migration time. Now that migrations run out-of-band as a Kubernetes
-lock and runs AutoMigrate. First replica takes ~90s; subsequent
+Job ([Chapter 8 §Schema management](./08-database.md)), pods boot in
-replicas wait on the lock. With 3 replicas all starting simultaneously
+seconds and only need a few probe failures of grace, but the budget
-and the lock serializing them, 240s is the right grace. See
+stays at 240s because cold pods on a fresh Hetzner node still pay
-[Chapter 19](./19-postmortem-swarm.md) for the detailed story.
+~10s for image pull + startup. See
 [Chapter 19 §13](./19-postmortem-swarm.md) for the historical
 context (the in-replica advisory-lock approach this replaced).
 **`readinessProbe.initialDelaySeconds: 5`** — after the startupProbe
 passes, wait 5s before starting readiness checks. Prevents a racy
@@ -4,8 +4,10 @@
 Authoritative user data lives in a Neon-managed Postgres database in AWS
 us-east-1. Connections use TLS (`DB_SSLMODE=require`). Schema is managed
-via GORM AutoMigrate inside the api binary, coordinated across replicas
+via [pressly/goose](https://github.com/pressly/goose) running as a
-by a Postgres advisory lock to prevent concurrent migration attempts.
+one-shot Kubernetes Job before every api/worker rollout. See §Schema
 management below for the full shape; ch19 §13 documents the previous
 in-replica AutoMigrate approach this replaced.
 ## Why Neon
@@ -32,7 +34,7 @@ Neon Launch won on:
 | Field | Value |
 |---|---|
-| Hostname | `ep-floral-truth-amttbc5a.c-5.us-east-1.aws.neon.tech` |
+| Hostname | `ep-floral-truth-amttbc5a-pooler.c-5.us-east-1.aws.neon.tech` |
 | Port | 5432 |
 | Username | `neondb_owner` |
 | Database | `honeyDue` (case-sensitive!) |
@@ -58,9 +60,19 @@ paid tiers much higher.
 ### PgBouncer on Neon
-Neon provides a built-in PgBouncer at `-pooler` subdomain. Our hostname
+Neon provides a built-in PgBouncer at the `-pooler` subdomain. The
-already includes `-pooler` handling in the route, so connections go
+non-pooler endpoint (`ep-floral-truth-amttbc5a.c-5.us-east-1...`) is
-through PgBouncer transparently.
+the direct compute endpoint and connects straight to Postgres,
 paying the full TCP+TLS+startup handshake on every cold connection.
 The `-pooler` endpoint multiplexes through PgBouncer in Neon's
 infrastructure.
 **We use the `-pooler` endpoint** because the direct endpoint paid
 ~440ms per cold handshake on a transatlantic link, visible as
 1500ms-tail spikes in /api/tasks/ traces. The pooler keeps backend
 Postgres connections warm in Neon's data center, so the only
 latency our Go pods see is one TCP+TLS to PgBouncer (already
 warm via our pool) plus one query round-trip.
 Modes PgBouncer supports:
 - **session** — one server connection held per client session (transparent)
@@ -68,26 +80,59 @@ Modes PgBouncer supports:
 - **statement** — per-statement (most aggressive; breaks many features)
 Neon's pooler runs in **transaction mode**. This is compatible with GORM
-out of the box (we don't use session-level features like prepared
+runtime queries (we don't use session-level features like LISTEN/NOTIFY
-statements or session variables).
+or session-scope advisory locks in the data path). The one place this
 matters is migrations: goose's session-scoped advisory lock can't
 survive PgBouncer transaction-mode pooling. The migrate Job
 (`deploy-k3s/manifests/migrate/job.yaml`) handles this by stripping
 the `-pooler` segment from `DB_HOST` before invoking goose — runtime
 keeps using the pooler, only migrations bypass it.
 ### Connection pool settings
-In `prod.env`:
+In `config.yaml` (rendered into ConfigMap → env vars):
-```
+```yaml
-DB_MAX_OPEN_CONNS=25
+database:
-DB_MAX_IDLE_CONNS=10
+  max_open_conns: 25
-DB_MAX_LIFETIME=600s
+  max_idle_conns: 20
  max_lifetime: "1800s"
  max_idle_time: "0s"
 ```
-These are the Go `database/sql` pool settings (GORM uses `database/sql`
+These map to Go `database/sql` pool settings:
 underneath):
- **MaxOpenConns: 25** — at most 25 concurrent connections per replica
+- **MaxOpenConns: 25** — at most 25 concurrent connections per replica.
- **MaxIdleConns: 10** — keep up to 10 warm connections ready to reuse
+- **MaxIdleConns: 20** — keep up to 20 warm connections per replica
- **MaxLifetime: 600s** — recycle connections after 10 min (prevents
+  ready to reuse. Bumped from 10 because the pooler tolerates many
-  stale state in long-lived connections, good for Neon's idle timeout)
+  client connections cheaply, and the cost of a cold handshake (~440ms
  transatlantic) is far higher than the cost of holding an idle
  connection.
 - **MaxLifetime: 1800s** — recycle connections after 30 min. Bumped
  from 600s; with the pooler keeping things warm, longer lifetime
  reduces churn.
 - **MaxIdleTime: 0s** — never close idle connections. Lifetime drives
  recycling instead.
 ### Pool warm-up at boot
 `database.Connect()` issues 20 parallel `PingContext` calls
 immediately after opening the pool. This pre-establishes
 `MaxIdleConns` connections to the pooler so the first user request
 doesn't pay any handshake.
 The warm-up is bounded by *one* round-trip time (~440ms cold), not
 one round-trip per connection — pings run concurrently. Confirmed
 in pod logs at boot:
 ```
 {"level":"info","requested":20,"warmed":20,"message":"DB pool warm-up complete"}
 ```
 If warm-up partially fails (e.g., 18/20 succeed), the pod still
 starts; the pool fills the rest under traffic. Failure to ping at all
 would be caught by the synchronous `sqlDB.Ping()` immediately before,
 which is fatal.
 ### Worst-case connection count
@@ -107,66 +152,110 @@ the default 25/10. If we hit connection errors in prod, adjust.
 ## Schema management
-### GORM AutoMigrate
+### goose
-On startup, the Go API's `cmd/api/main.go` calls
+We use [pressly/goose](https://github.com/pressly/goose) (pinned in the
-`database.MigrateWithLock()` which:
+api `Dockerfile` to v3.22.1) for schema migrations. Why goose specifically:
-1. Opens a dedicated Postgres connection
+- Each migration file runs inside its own transaction by default —
-2. `SELECT pg_advisory_lock(1751412071)` — acquires a session-level
+  partial-failure recovery is built in (no "dirty" state to manually
-   advisory lock on a hardcoded key
+  unstick like golang-migrate).
-3. Calls `db.AutoMigrate(&models.*{})` for every GORM model
+- Locking is opt-in. We *don't* opt in. Migrations run as a single
-4. `SELECT pg_advisory_unlock(...)` via deferred function
+  Kubernetes Job — that's the singleton process. No advisory-lock vs
-5. Close the connection
+  PgBouncer-transaction-mode foot-gun.
 - Plain SQL files. No DSL, no library integration in our Go code.
-The advisory lock serializes migrations across replicas: when 3 api
+See `docs/deployment/19-postmortem-swarm.md` (Schema Versioning section)
-pods start simultaneously, one acquires the lock and migrates; the
+for the AutoMigrate-with-advisory-lock approach this replaced and why.
 others block on the lock. Once the first finishes (≤2s for already-
 migrated schema, up to 90s on first cold boot), the next acquires and
 sees the schema is current (no-op migrate).
-### Why an advisory lock
+### Migration files
-Without it, concurrent `CREATE TABLE IF NOT EXISTS ...` statements from
+Live under `migrations/`, named `<NNNNNN>_<short_name>.sql`. Each file
-multiple replicas would race — Postgres usually handles it, but GORM's
+has both the up and down migration in one file, separated by goose
-AutoMigrate also alters tables (adds columns, indexes) which can deadlock
+markers:
 under concurrency.
-The advisory lock pattern (also used by Rails + Django + Alembic) is the
+```sql
-canonical solution.
+-- +goose Up
 CREATE TABLE example (id bigint PRIMARY KEY);
-### The lock key
+-- +goose Down
 DROP TABLE example;
 ```
-`1751412071` is a hardcoded integer in `internal/database/database.go`.
+Multi-statement constructs (`CREATE FUNCTION`, `DO $$ BEGIN ... END $$`)
-Arbitrary but unique — as long as nothing else in the Postgres instance
+need `-- +goose StatementBegin` / `-- +goose StatementEnd` wrappers
-uses the same advisory lock key, no conflicts.
+because goose splits on semicolons by default.
-### First-boot behavior
+`migrations/000001_init.sql` is the baseline — captures every
 table/index/sequence as it existed when goose was adopted, generated
 via `pg_dump --schema-only --no-owner --no-privileges`. The pre-goose
 hand-numbered migrations (002-022 in git history at commit
 58e6997) had their effects folded into this baseline; they're gone
 from the live tree but remain in git for archaeology.
-On a **fresh database** (new Neon project), the first api pod runs
+### Production migration flow
 through every model's `CREATE TABLE` statement. This is ~50 tables for
 honeyDue and takes ~90 seconds.
-On a **warm database** (tables already exist), AutoMigrate is fast —
+`deploy-k3s/scripts/03-deploy.sh` runs migrations as part of every
-typically under 2 seconds. It still runs (GORM checks every model
+deploy, **before** the api/worker rollout starts:
 against the schema) but finds no work to do.
-### Where this bit us
+```
 1. kubectl delete job honeydue-migrate (idempotent)
 2. kubectl apply -f manifests/migrate/job.yaml (with current api image)
 3. kubectl wait --for=condition=complete --timeout=10m job/honeydue-migrate
 4. (only if Job succeeded) kubectl apply -f manifests/api/...
 ```
-With 3 api pods starting simultaneously and migrations taking 90s first
+The Job uses the api image — we install the goose CLI binary at
-time, the lock queue for the last replica is ~180s. We needed a
+`/usr/local/bin/goose` during the api Dockerfile build, so any pod that
-startupProbe grace of 240s to cover this without false restart loops.
+can run api can run goose. No separate image to build/push.
 See Chapter 7 §startupProbe and Chapter 19 §MigrateWithLock.
-### Downside: no schema versioning
+The Job's `command` runs `goose ... up` against the **direct**
 (non-pooler) Neon endpoint. Goose's session-scoped advisory lock can't
 survive PgBouncer transaction-mode pooling, so the Job script strips
 the `-pooler` segment from `DB_HOST` before connecting. The api/worker
 runtime continues to use the pooler endpoint for everything else; only
 this one Job needs the direct connection.
-AutoMigrate can only *add* — new tables, new columns, new indexes. It
+### Schema-version precondition
 won't drop columns, rename them, or change types destructively. For
 those we'd need raw SQL migrations (a tool like `golang-migrate` or
 `dbmate`).
-Today: we accept that schema changes are additive-only. When we need
+`internal/database/database.go::RequireSchemaApplied()` runs at api and
-destructive changes, we'd hand-write them.
+worker startup. It queries `goose_db_version` for the highest applied
 version and refuses to start if the table is missing or the latest row
 is `is_applied=false`. This catches "operator forgot to run migrate" as
 a clear boot error instead of a mysterious runtime "relation does not
 exist" later.
 ### Local migration workflow
 ```bash
 # Set the direct-endpoint DSN once
 export DATABASE_URL="host=ep-floral-truth-amttbc5a.c-5.us-east-1.aws.neon.tech \
                     user=neondb_owner password=$PG_PASSWORD dbname=honeyDue sslmode=require"
 make migrate-status                  # what's pending
 make migrate-up                      # apply
 make migrate-down                    # roll back the latest
 make migrate-new name=add_widget_col # scaffold a new SQL file
 ```
 Each new migration file goes through code review like any other code
 change. The deploy-script Job applies it on the next deploy.
 ### Bootstrap (one-time, when the prod DB already had a schema)
 Bootstrapping a goose-managed DB whose schema already exists requires
 seeding `goose_db_version` so goose treats version 1 as already-applied:
 ```bash
 # Once. After this, future migrations append normally.
 goose -dir migrations postgres "$DATABASE_URL" version  # creates the table
 psql "$DATABASE_URL" -c \
  "INSERT INTO goose_db_version (version_id, is_applied, tstamp) VALUES (1, true, NOW());"
 ```
 This was done for honeyDue's prod Neon project at the time of goose
 adoption — no need to repeat unless we set up a fresh DB from a
 schema dump.
 ## What's in the database
@@ -229,17 +318,45 @@ value.
 ## Neon regions
 Neon's default region for new projects is `aws-us-east-1` (Virginia).
-Our DB is there. Latency from Nuremberg to us-east-1 is **~90-120ms
+Our DB is there. Latency from Nuremberg to us-east-1 is **~108ms one-way**
-round trip**.
+TCP-level (verified by `nc -z -w 5` from `hetzner1`), so **~220ms RTT
 through Neon's pooler stack**.
 This is the slowest hop in our data flow. Every api request that needs
-a DB query (most of them) pays this latency at least once.
+a DB query pays this latency at least once. Sub-millisecond Postgres
 execution time (verified via `EXPLAIN ANALYZE`: 0.04-0.34 ms on every
 hot path) means **wall-clock latency = network + Neon proxy overhead**.
-**When this matters**: When we start seeing ~200ms+ response times from
+### Optimizations layered on top to minimize round trips
-complex endpoints, it's likely DB latency dominant. Options:
+
- Migrate Neon to `aws-eu-central-1` (Frankfurt) — shaves ~90ms off
+We don't move the DB region (yet) but we cut the *number* of RTTs per
- Add Redis caching for hot reads (Chapter 7)
+request via:
- Read replicas (Neon supports them on paid tiers)
+
 1. **Auth caching** (Chapter 7 §Redis) — token + user lookups served
   from Redis (1-hour TTL) and per-pod in-memory cache (5-min TTL).
   On warm cache: 0 SQL round-trips for auth.
 2. **JOIN consolidation** — two-step
   `find residence-IDs → find tasks IN ids` collapsed into a single
   query with a Postgres subquery. One RTT instead of two.
 3. **Single-query auth** — token + user fetched in one INNER JOIN
   instead of GORM's two-query Preload pattern.
 4. **Residence-IDs Redis cache** — cached per user with 5-min TTL,
   invalidated on Create/Delete/Join/Remove. Saves 1 RTT per
   `/api/documents/`, `/api/contractors/`, `/api/residences/summary/`
   request.
 After these, a fully-warm `/api/tasks/` is **1 SQL round-trip total
 (~220ms wall-clock)**. Verified via Jaeger trace — see Chapter 15.
 ### When this still matters
 - Any cold-cache request still pays 2-3 RTTs (~500-700ms).
 - Pod startup pays one cold handshake × 20 (warm-up), but those run in
  parallel: ~440ms one-shot.
 Long-term fix: migrate Neon to `aws-eu-central-1` (Frankfurt) — drops
 RTT to ~5ms and brings warm-cache requests under 50ms. Tracked in
 `docs/observability-plan.md` and Chapter 18 §migration triggers.
 ## Environment variables the app reads
@@ -247,14 +364,15 @@ From ConfigMap:
 | Var | Purpose |
 |---|---|
-| `DB_HOST` | Neon pooler hostname |
+| `DB_HOST` | Neon pooler hostname (`-pooler` suffix) |
 | `DB_PORT` | 5432 |
 | `POSTGRES_USER` | `neondb_owner` |
 | `POSTGRES_DB` | `honeyDue` |
 | `DB_SSLMODE` | `require` |
 | `DB_MAX_OPEN_CONNS` | 25 |
-| `DB_MAX_IDLE_CONNS` | 10 |
+| `DB_MAX_IDLE_CONNS` | 20 |
-| `DB_MAX_LIFETIME` | `600s` |
+| `DB_MAX_LIFETIME` | `1800s` |
 | `DB_MAX_IDLE_TIME` | `0s` (never close idle) |
 From Secret (`honeydue-secrets`):
@@ -288,11 +406,13 @@ GROUP BY usename, state, application_name;
 - [Neon docs][neon-docs]
 - [Neon pricing][neon-pricing]
 - [Postgres advisory locks][pg-locks]
- [GORM AutoMigrate][gorm-automigrate]
+- [pressly/goose][goose] — production migration tool
 - [GORM AutoMigrate][gorm-automigrate] (tests only)
 - [honeyDue task architecture][task-arch] (repo-local)
 [neon-docs]: https://neon.com/docs/introduction
 [neon-pricing]: https://neon.com/pricing
 [pg-locks]: https://www.postgresql.org/docs/current/explicit-locking.html#ADVISORY-LOCKS
 [goose]: https://github.com/pressly/goose
 [gorm-automigrate]: https://gorm.io/docs/migration.html
 [task-arch]: ../../docs/TASK_LOGIC_ARCHITECTURE.md
@@ -150,18 +150,64 @@ Allowed MIME types: `image/jpeg`, `image/png`, `image/gif`, `image/webp`,
 ## Access control
-### Upload flow
+### Upload flow (current — direct-to-B2 with presigned POST)
-1. Client POSTs to `/api/upload/`
+Image and document uploads go **directly from the client to B2**. The
-2. Go API validates the user is authenticated and authorized for the
+api server only signs a short-lived POST policy; the bytes never
-   target resource
+traverse our cluster. This is the WhatsApp / Slack architecture and
-3. Go API streams the upload to B2 via minio-go's `PutObject`
+sidesteps the api as a proxy bottleneck.
 4. B2 returns a key
 5. Go API stores the key in Postgres
 6. Returns the key to the client
-The B2 bucket is **private**. Clients can't GET directly; they always
+1. Client `POST /api/uploads/presign` with `{category, content_type, content_length}`.
-go through the Go API.
+2. api validates auth, per-user quota (10 concurrent in-flight,
   50/hour rate limit), allowed mime, and the 10 MB cap. On success it
   creates a `pending_uploads` row, signs a B2 POST policy with a
   `content-length-range` condition bound to the claimed length ±256
   bytes, and returns `{id, upload_url, fields, key, expires_at}`.
 3. Client multipart-POSTs the bytes directly to B2 using the returned
   fields. **B2 enforces the size cap at the protocol level** — clients
   can't bypass it by lying about Content-Length.
 4. Client POSTs to the entity-creation endpoint (`/api/task-completions/`,
   `/api/documents/`) with `upload_ids: [id]`. The service `HEAD`s each
   B2 object, verifies its size matches `expected_bytes` (within the
   ±256-byte tolerance), sets `pending_uploads.claimed_at`, and writes
   the `task_completion_image` / `document_image` row referencing the
   upload.
 The signed URL is valid for 15 minutes; presigns are not reusable.
 The B2 bucket stays **private** — only the api ever holds the key
 material. Clients can't list or GET directly without a presign.
 ```
 ┌──────────┐   1) presign        ┌────────┐
 │  client  │ ──────────────────► │  api   │
 │          │ ◄────────────────── │        │  POST policy + key
 │          │                     └────────┘
 │          │                                   row in
 │          │                          pending_uploads
 │          │                          (claimed_at NULL)
 │          │   2) POST bytes      ┌────────┐
 │          │ ──────────────────►  │   B2   │  enforces policy
 │          │ ◄────────────────── │        │
 │          │                     └────────┘
 │          │   3) attach          ┌────────┐
 │          │ ──────────────────►  │  api   │  HEAD B2 object,
 │          │  upload_ids: [id]    │        │  mark claimed_at,
 │          │                     └────────┘  insert image row
 └──────────┘
 ```
 Server-side enforcement summary:
 | Check | Where | Reject if |
 |---|---|---|
 | Auth | api middleware | unauthenticated |
 | Mime allowlist | `upload_service.go:allowedContentTypes` | not in list for category |
 | Size cap (10 MB) | api before signing + B2 policy | content_length > 10 MiB |
 | Concurrency cap (10) | `CountUnclaimedActiveForUser` | already 10 unclaimed in-flight |
 | Rate limit (50/hr) | Redis sliding window `upload:presign:<uid>:<bucket>` | 51st presign in the same hour |
 | Size at upload time | B2 (signed policy) | bytes outside content-length-range |
 | Ownership at attach | `FindUnclaimedForUser` | upload_id belongs to a different user |
 | Bytes match claim | `s3.Stat()` + bytes comparison | actual size differs from expected by more than 256 bytes |
 ### Download flow (current)
@@ -170,34 +216,55 @@ go through the Go API.
 3. Go API fetches from B2 and streams back to the client
 This proxies every download through the api. For high-traffic media
-that's inefficient (api becomes an egress bottleneck).
+that's inefficient (api becomes an egress bottleneck) — could be
-
+replaced with presigned GET URLs on the same bucket. Not yet shipped;
-### Future: signed URLs
+download volume is low enough that the proxy is fine for now.
 We could generate time-limited signed URLs for B2 objects:
 ```go
 url, err := s3Client.PresignedGetObject(ctx, bucket, key, 1*time.Hour, nil)
 ```
 Returns a URL the client can GET directly from B2, scoped to a specific
 object, valid for 1h. Saves api bandwidth and latency.
 Not yet implemented. TODO (Chapter 20).
 ## Lifecycle and retention
-We have **no lifecycle rules** set on the bucket. Objects live forever
+### Orphan cleanup (`pending_uploads`)
 unless the app deletes them.
-When a user deletes their account, the app should delete their B2
+Every presign creates a row in `pending_uploads` with `expires_at =
-objects. This is currently not automated — a compliance gap for any
+now + 15 min`. If the client never finishes the upload, or finishes
-"right to be forgotten" request.
+but never calls the attach endpoint, the row stays unclaimed. An
 hourly cron in the worker reaps them:
-**TODO** (Chapter 20): Either:
+- **`maintenance:upload_cleanup`** — cron `30 * * * *`. Selects
- Implement explicit cleanup in the user deletion handler, or
+  unclaimed rows past `expires_at`, deletes the corresponding B2
- Add B2 lifecycle rule tied to object metadata (tag objects with
+  object, deletes the row. Up to 500 per tick; the next tick picks up
-  user ID; rule deletes tagged objects when user is soft-deleted)
+  any overflow. Worker logs include `reaped` count.
 The worker constructs a `StorageService` at startup; if storage init
 fails (e.g. `B2_KEY_ID` / `B2_APP_KEY` not wired into the worker
 deployment), the cleanup handler logs a warning and no-ops. See
 `deploy-k3s/manifests/worker/deployment.yaml` — both B2 secrets are
 required envs on this pod.
 ### Bucket lifecycle (backstop)
 A B2 lifecycle rule on the `uploads/` prefix is the safety net if the
 worker is offline for an extended period:
 - Hide objects 7 days after upload.
 - Delete 1 day after hidden.
 This is configured manually via the Backblaze console (B2's S3
 lifecycle API isn't fully implemented). See
 `deploy-k3s/manifests/b2-lifecycle.md` for the exact rule and
 `b2 bucket get-info` verification command.
 ### User-deletion cascade
 When a user deletes their account, the app deletes their `task_*` /
 `document` rows. The associated B2 objects survive — same compliance
 gap as before, not yet automated. Two approaches:
 - Walk the image rows on user delete and `RemoveObject` each (simple,
  synchronous, slow for users with many uploads).
 - Tag objects with a `user_id` metadata header at upload time, then
  use a B2 lifecycle rule scoped to a deleted-users prefix.
 The first approach (walking the image rows on user delete) is the next
 item in the upload roadmap.
 ## Backup of B2
@@ -1,5 +1,11 @@
 # 10 — Secrets & Config
 > **Updated 2026-05-15 (security remediation):** `honeydue-secrets` now
 > carries `REDIS_PASSWORD`; an `admin-basic-auth` Secret backs the admin
 > ingress; rotation is documented in `docs/runbooks/secret-rotation.md`;
 > and the Go config can read file-mounted secrets (`HONEYDUE_SECRETS_DIR`).
 > `deploy-k3s/SECURITY.md` is the authoritative current-state record.
 ## Summary
 Non-sensitive config (hostnames, ports, feature flags, etc.) lives in
@@ -55,7 +61,7 @@ APNS_AUTH_KEY_ID=DISABLED01
 APNS_AUTH_KEY_PATH=/secrets/apns/apns_auth_key.p8
 APNS_PRODUCTION=false
 APNS_TEAM_ID=DISABLED01
-APNS_TOPIC=com.tt.honeyDue
+APNS_TOPIC=com.myhoneydue.honeyDue
 APNS_USE_SANDBOX=false
 BASE_URL=https://myhoneydue.com
 B2_BUCKET_NAME=honeyDueProd
@@ -272,7 +272,7 @@ sequenceDiagram
    participant NewPod as api pod v2 (starting)
    Note over NewPod: kubelet starts new pod
-    Note over NewPod: pod connects to Postgres<br/>MigrateWithLock runs (no-op)<br/>HTTP server starts<br/>readinessProbe passes
+    Note over NewPod: pod connects to Postgres<br/>RequireSchemaApplied checks goose_db_version<br/>HTTP server starts<br/>readinessProbe passes
    Note over NewPod: kube-proxy updates endpoints<br/>NewPod added to Service pool
    CF->>Traefik: request 1
    Traefik->>OldPod: routed (old pod still in pool)
@@ -5,8 +5,9 @@
 Cloudflare sits in front of every public request. It provides DNS
 (authoritative nameservers for `myhoneydue.com`), TLS termination at
 the edge, DDoS mitigation, caching, and the round-robin fan-out across
-our three node IPs. We use the Free plan. TLS mode is "Flexible"
+our three node IPs. We use the Free plan. TLS mode is **Full (strict)**
-(HTTP between CF and origin). This chapter documents every Cloudflare
+— CF connects to origin over HTTPS and verifies the origin's cert
 against CF's own Origin CA. This chapter documents every Cloudflare
 setting that matters.
 ## DNS
@@ -72,53 +73,49 @@ when you want sub-second failover.
 ## TLS
-### Mode: Flexible
+### Mode: Full (strict)
-CF Dashboard → SSL/TLS → Overview → **Flexible**.
+CF Dashboard → SSL/TLS → Overview → **Full (strict)**.
 **What this means:**
- User ↔ Cloudflare: **TLS** (HTTPS)
+- User ↔ Cloudflare: **TLS** (HTTPS) — CF serves its own Let's Encrypt cert
- Cloudflare ↔ Origin: **plaintext HTTP** (port 80)
+- Cloudflare ↔ Origin: **TLS** (HTTPS :443) — origin serves our CF Origin CA cert; CF verifies it chains to CF's Origin CA root
-**Why we chose it:**
+**How it's wired:**
- No origin cert required on the Hetzner nodes
+- k8s secret `cloudflare-origin-cert` (type `kubernetes.io/tls`) holds
- Zero Traefik cert-management complexity
+  `tls.crt` + `tls.key`. The cert is valid for `*.myhoneydue.com` +
- Fine for a site where CF terminates all user-facing TLS
+  `myhoneydue.com`, 15-year validity, issued by
  `CloudFlare Origin CA SSL Certificate Authority`.
 - All three `Ingress` resources in `deploy-k3s/manifests/ingress/ingress-simple.yaml`
  reference the secret via `spec.tls[].secretName`.
 - Traefik terminates TLS on :443 using the cert. Backend pods still
  speak plain HTTP over the cluster network (Traefik → pod is an
  intra-cluster hop, encrypted at the Flannel overlay layer).
-**Downsides:**
+**Why we chose Full (strict) over Flexible:**
- An attacker with network access between CF and Hetzner could read
+- CF → origin traffic was plaintext on Flexible. Between Cloudflare's
-  traffic. Realistically: nobody between CF's POPs and Hetzner's
+  POPs and Hetzner Nuremberg is a lot of internet. Full (strict)
-  Nuremberg DC, but it's theoretically plaintext on the wire.
+  closes that gap.
- MitM risk if DNS gets hijacked and traffic is routed through an
+- Origin cert is issued by a CF-internal-only CA, so it's useless to anyone who
-  unintended origin.
+  isn't CF. Non-CF clients that somehow bypass the UFW CF-IP allowlist
  can't impersonate the origin because their cert wouldn't chain to
  CF's Origin CA root.
-### Future: Full (strict)
+**Maintenance:** the Origin CA cert is valid for 15 years (expires
 Apr 2041). No action needed until then. If rotation is ever required,
 regenerate in CF dashboard → SSL/TLS → Origin Server, then re-run
 `kubectl -n honeydue create secret tls cloudflare-origin-cert --cert=cf-origin-cert.pem --key=cf-origin-key.pem --dry-run=client -o yaml | kubectl apply -f -`.
 Traefik picks it up on the next secret reload (no pod restart).
-The next step up is **Full (strict)**: CF verifies origin's TLS cert
+### Regenerating the cert (for the record)
 and connects over HTTPS. Cloudflare provides free **Origin CA
 certificates** for this: they're issued by a CF-internal CA that only
 CF's own edge accepts. An attacker without a CF-signed cert can't
 impersonate our origin.
-Path to enable:
+```bash
-1. Generate Origin CA cert in CF dashboard → SSL/TLS → Origin Server
+# After downloading cf-origin-cert.pem + cf-origin-key.pem from CF dashboard:
-2. Download as PEM
+kubectl -n honeydue create secret tls cloudflare-origin-cert \
-3. Create k8s Secret `cloudflare-origin-cert`:
+  --cert=cf-origin-cert.pem \
-   ```bash
+  --key=cf-origin-key.pem \
-   kubectl create secret tls cloudflare-origin-cert -n honeydue \
+  --dry-run=client -o yaml | kubectl apply -f -
-     --cert=origin.crt --key=origin.key
+```
   ```
 4. Add `tls:` block to our Ingress:
   ```yaml
   spec:
     tls:
       - hosts: [api.myhoneydue.com]
         secretName: cloudflare-origin-cert
   ```
 5. Switch CF SSL mode to Full (strict)
 Trade-off: the `cloudflare-origin-cert` expires (default 15 years), so
 low maintenance. **TODO** (Chapter 20).
 ### Edge certificate
@@ -8,23 +8,62 @@ No downtime if the change is backward-compatible. Rollback is
 `kubectl rollout undo`. This chapter walks through the full process,
 plus alternate paths (config-only changes, manifest changes, hotfixes).
-## TL;DR for a code change
+## TL;DR using the unified deploy script
 The recommended path. `deploy-k3s/scripts/03-deploy.sh` builds all four
 images (api, worker, admin, web), pushes to Gitea, regenerates the
 ConfigMap from `config.yaml`, applies every manifest under
 `deploy-k3s/manifests/` (including the observability vmagent), and
 waits for all rollouts.
 ```bash
 cd /Users/treyt/Desktop/code/honeyDue/honeyDueAPI-go
 git add . && git commit -m "..." && git push gitea master
 export KUBECONFIG=~/.kube/honeydue.yaml
 bash deploy-k3s/scripts/03-deploy.sh         # full build + push + rollout
 # or, to redeploy without rebuilding:
 bash deploy-k3s/scripts/03-deploy.sh --skip-build
 # or, to pin a specific tag:
 bash deploy-k3s/scripts/03-deploy.sh --tag d3708e6
 ```
 What the script does, in order:
 1. Read registry creds from `deploy-k3s/config.yaml`.
 2. `docker login gitea.treytartt.com`.
 3. Build all four images with `--platform linux/amd64` (so arm64 Macs
   don't push images that crash on Hetzner amd64 nodes with
   "exec format error").
 4. Push to the gitea registry, plus tag and push `:latest`.
 5. Generate the env file from `config.yaml` and apply as ConfigMap
   `honeydue-config` (uses dry-run + apply for diff-free idempotence).
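 6. Apply `manifests/namespace.yaml`, `redis/`, `ingress/`,
   `api/{deployment,service,hpa}`, `worker/`, `admin/`, `web/`. The
   `honeydue-migrate` Job is run first and must complete before the
   api/worker manifests are applied (see §"Migrations are gated, not
   interleaved").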
 7. Apply `manifests/observability/vmagent.yaml`, substituting
   `TOKEN_PLACEHOLDER` with `OBS_INGEST_TOKEN` from `deploy/prod.env`
   (gitignored). Skipped with a warning if the token isn't present.
 8. `kubectl rollout status` for every Deployment, including vmagent.
 ~7–10 minutes for a full rebuild. ~1–2 minutes with `--skip-build`.
 ## TL;DR for a single-service code change (manual)
 ```bash
 # 1. Commit + get SHA
 cd /Users/treyt/Desktop/code/honeyDue/honeyDueAPI-go
 git add . && git commit -m "..." && SHA=$(git rev-parse --short HEAD)
-# 2. Login to Gitea registry
+# 2. Login to Gitea registry (creds in config.yaml)
-set -a; source deploy/registry.env; set +a
+docker login gitea.treytartt.com -u admin
 printf '%s' "$REGISTRY_TOKEN" | docker login "$REGISTRY" -u "$REGISTRY_USERNAME" --password-stdin
 # 3. Build + push amd64 image
-docker buildx build --platform linux/amd64 --target api \
+docker build --platform linux/amd64 --target api \
-  -t "gitea.treytartt.com/admin/honeydue-api:${SHA}" --push .
+  -t "gitea.treytartt.com/admin/honeydue-api:${SHA}" .
 docker push "gitea.treytartt.com/admin/honeydue-api:${SHA}"
 # 4. Roll it in
-export KUBECONFIG=~/.kube/honeydue-k3s.yaml
+export KUBECONFIG=~/.kube/honeydue.yaml
 kubectl set image deployment/api -n honeydue \
  api="gitea.treytartt.com/admin/honeydue-api:${SHA}"
@@ -32,11 +71,18 @@ kubectl set image deployment/api -n honeydue \
 kubectl rollout status -n honeydue deployment/api
 # 6. Log out
-docker logout "$REGISTRY"
+docker logout gitea.treytartt.com
 ```
 ~3–5 minutes end to end for api.
 > **Gotcha:** for any tag other than `:latest`, Deployments default to
 > `imagePullPolicy: IfNotPresent`,
 > which means kubelet won't re-fetch an image with a tag it already
 > has cached locally — even if the registry now has different bytes
 > at that tag. Always change tags (use the SHA), or temporarily flip
 > `imagePullPolicy: Always` and `kubectl rollout restart` if you need
 > to overwrite a tag.
 ## The build
 ### Step 1 — Prepare
@@ -201,6 +247,38 @@ kubectl patch secret honeydue-secrets -n honeydue \
 kubectl rollout restart -n honeydue deployment/api deployment/worker
 ```
 ## One-time B2 bucket lifecycle (manual)
 The `pending_uploads` cleanup cron (`30 * * * *` on the worker) handles
 the common case of reaping orphaned uploads. The B2 bucket lifecycle
 rule on the `uploads/` prefix is the **backstop** if the worker is
 offline for >24 hours. It's configured once via the Backblaze web
 console — B2's S3 lifecycle API isn't fully implemented, so this can't
 be in the deploy script.
 One-time setup:
 1. Open https://secure.backblaze.com/b2_buckets.htm → bucket
   `honeyDueProd` → **Lifecycle Settings** → **Custom**
 2. Add rule:
   - File name prefix: `uploads/`
   - Hide files older than: **7 days**
   - Delete hidden files older than: **1 day**
 Maximum lifetime of an orphaned object under this rule alone: 8 days
 (7 days until hidden + 1 day until deleted). The worker normally reaps
 within an hour, so the rule should almost never trigger.
 Verify:
 ```bash
 # Requires the b2 CLI: brew install b2-tools
 b2 bucket get-info honeyDueProd | jq '.lifecycleRules'
 ```
 See `deploy-k3s/manifests/b2-lifecycle.md` for the canonical rule
 definition and a curl-based fallback if the b2 CLI isn't available.
 ## Manifest changes
 When you add/modify a deployment YAML:
@@ -271,10 +349,47 @@ Timeline (approximate, warm state):
 - t=60s: another old pod terminates
 - ...continues until all on new RS
-For cold-boot (e.g., first deploy on a rebuilt cluster), the
+Migrations run as a separate Kubernetes Job that completes before any
-MigrateWithLock advisory lock extends this to several minutes. But the
+api/worker pod is rolled. So the rollout above never includes migration
-rollout is serialized — only one pod starts per iteration, so the lock
+work — pods that boot are guaranteed to find the schema already at the
-queue is small.
+expected version. See §"Migrations are gated, not interleaved" below.
 ## Migrations are gated, not interleaved
 `03-deploy.sh` runs `goose up` as a one-shot Job before applying any
 api/worker manifests:
 ```
 1. kubectl delete job honeydue-migrate (idempotent, removes prior run)
 2. kubectl apply -f manifests/migrate/job.yaml (with current api image)
 3. kubectl wait --for=condition=complete --timeout=10m job/honeydue-migrate
 4. (only if Job succeeded) kubectl apply -f manifests/api/...
 ```
 The Job uses the api image — `/usr/local/bin/goose` is baked in at
 Dockerfile build time. The Job script strips the `-pooler` segment
 from `DB_HOST` before connecting (goose's session-scoped advisory
 lock can't survive PgBouncer transaction-mode), runs `goose up`, exits.
 If the Job fails, the script aborts before any new app pod sees a
 stale schema. To debug:
 ```bash
 kubectl -n honeydue logs job/honeydue-migrate --tail=200
 kubectl -n honeydue describe job honeydue-migrate
 ```
 After investigating, fix the migration file and re-run `03-deploy.sh`.
 The Job is idempotent — successful migrations stay applied, only the
 new/failed file gets retried.
 api/worker pods run a `RequireSchemaApplied` check at startup that
 queries `goose_db_version` and refuses to boot if the table is missing
 or the latest row is `is_applied=false`. This is the fail-fast for
 "someone bypassed the deploy script and the schema isn't current."
 For full schema management background, see
 [Chapter 8 §Schema management](./08-database.md).
 ## Hotfix workflow
@@ -314,14 +429,10 @@ Contrast: `deploy/scripts/deploy_prod.sh` (Swarm-era) did:
 9. Healthcheck the final URL; auto-rollback on failure
 10. Log out of registries
-Our current k3s deploy is more manual but simpler. We'd write a similar
+The current k3s replacement, `deploy-k3s/scripts/03-deploy.sh`, covers
-script for k3s if deploys become frequent:
+the same ground in fewer steps because Kubernetes does the
-
+versioning/rollout/health bookkeeping natively. See the TL;DR section
-```bash
+at the top of this chapter.
 # deploy-k3s/scripts/04-deploy.sh (not yet updated for Gitea)
 ```
 See the scaffold in `deploy-k3s/scripts/`.
 ## Common deploy failures
@@ -2,15 +2,119 @@
 ## Summary
-We have minimal observability today: `kubectl logs`, `kubectl top`,
+Production has live metrics and tracing infrastructure as of 2026-04-25.
-Cloudflare Analytics, and the Neon dashboard. No Prometheus, no Grafana,
+A self-hosted **VictoriaMetrics + Jaeger + Grafana** stack runs on
-no centralized log aggregator, no APM. This is adequate for the
+`88oakappsUpdate` (Linode VPS, also home to the self-hosted PostHog
-current traffic volume (low) but is a known gap. This chapter documents
+deployment). A `vmagent` sidecar in the honeyDue k3s namespace scrapes
-what we *have* and what we'd add as traffic grows.
+the api Pods' `/metrics` endpoint every 15 seconds and remote-writes to
 `https://obs.88oakapps.com/api/v1/write`. Grafana is at
 `https://grafana.88oakapps.com` with a pre-provisioned RED dashboard.
 What we still don't have: log aggregation (Dozzle and `kubectl logs`
 fill the niche for now), alerting (no PagerDuty/Slack on errors), and
 full distributed tracing (OTel SDK is wired in app code but app-side
 instrumentation beyond HTTP routes hasn't shipped yet).
 The whole observability stack costs **$0** incremental and uses ~700 MB
 RAM on `88oakappsUpdate` (5% of its free RAM). It runs as a separate
 docker-compose project from PostHog so neither product's lifecycle
 touches the other.
 ## What we have
-### 1. `kubectl logs`
+### 1. Metrics — VictoriaMetrics + vmagent
 ```
 honeyDue k3s (Hetzner)                   88oakappsUpdate (Linode)
 ┌───────────────────────────┐            ┌──────────────────────────┐
 │ api Pods (3) :8000/metrics│            │ /opt/honeydue-obs/       │
 │   prometheus/client_golang│            │ ┌──────────────────┐     │
 │                           │            │ │ VictoriaMetrics  │     │
 │ vmagent ──── scrape 15s   │            │ │  30d retention   │     │
 │         remote_write ─────┼────────────┼─→ /api/v1/write   │     │
 │         (HTTPS, bearer)   │            │ │  (mem 256 MB)    │     │
 └───────────────────────────┘            │ └──────────────────┘     │
                                          └──────────────────────────┘
 ```
 The Go API exposes `/metrics` in Prometheus exposition format. Histograms
 are defined in `internal/prom/metrics.go` and registered globally:
 | Metric | Labels | Source |
 |---|---|---|
 | `http_request_duration_seconds` | `route, method, status` | Echo middleware around every handler |
 | `gorm_query_duration_seconds` | `table, operation` | GORM before/after callbacks (no ctx threading needed) |
 | `b2_upload_duration_seconds` | `bucket, result` | Wrapped `s.backend.Write` in `internal/services/storage_service.go` |
 | `b2_upload_bytes_total` | `bucket, result` | Counter alongside the duration histogram |
 | `apns_send_duration_seconds` | `result` (`ok`/`bad_token`/`error`) | Wrapped APNs `PushWithContext` in `internal/push/apns.go` |
 | `fcm_send_duration_seconds` | `result` | Wrapped FCM HTTP v1 send in `internal/push/fcm.go` |
 | `asynq_job_duration_seconds` | `task_type, result` | Histograms registered; middleware not yet attached (Step 3) |
 | `go_*`, `process_*` | (standard) | `prometheus/client_golang/prometheus/collectors` defaults |
 The previous custom monitoring at `/metrics` was renamed to
 `/metrics/legacy` so the canonical `/metrics` emits proper histograms
 suitable for `histogram_quantile()` rollups. The legacy endpoint stays
 because the GoAdmin dashboard reads it.
 #### vmagent in k3s
 Lives at `deploy-k3s/manifests/observability/vmagent.yaml`. One replica,
 `mem_limit: 256Mi`, scrapes by Kubernetes pod-discovery filtered to
 `app.kubernetes.io/name=api` and remote-writes to
 `https://obs.88oakapps.com/api/v1/write` with a bearer token from
 `OBS_INGEST_TOKEN` in `deploy/prod.env` (substituted into a Secret at
 deploy time).
 The agent buffers locally to `/tmp/vmagent` (emptyDir, 512 MB cap), so
 brief obs outages don't drop samples. Persistent queue replays on
 reconnect.
 NetworkPolicies in the honeydue namespace allow egress from vmagent to:
 - DNS (kube-dns / coredns)
 - Kubernetes API (`10.43.0.0/16:443`) for pod discovery
 - api Pods on `10.42.0.0/16:8000`
 - The public obs endpoint over `0.0.0.0/0:443`
 These are scoped tight — vmagent can't reach Postgres or Redis. Note
 that the `0.0.0.0/0:443` rule still permits outbound HTTPS to any
 host (B2 included), not just the obs endpoint.
 ### 2. Tracing — Jaeger all-in-one
 Jaeger 1.62 with badger storage runs alongside VictoriaMetrics. The
 collector accepts:
 - OTLP/HTTP at `https://obs.88oakapps.com/v1/traces` (bearer-token gated)
 - OTLP/gRPC at `:4317` (localhost-only)
 - Native Jaeger protocols at `:14268` etc. (localhost-only)
 Retention: ~7 days at current scale before badger rotates. UI at
 `https://grafana.88oakapps.com` via the Jaeger datasource.
 **Status of app-side instrumentation**: the histograms are populating
 metrics. The OTel exporter wiring in `cmd/api/main.go` is **not yet
 shipped**. When it does ship, every `POST /api/auth/login/` will produce
 a flame-graph trace with HTTP → handler → SQL → B2 → APNs spans.
 Tracking issue: gitea#3.
 ### 3. Dashboards — Grafana
 `https://grafana.88oakapps.com` (Cloudflare-fronted, basic auth via
 Grafana itself, admin credentials in `deploy/prod.env`).
 Datasources auto-provisioned at container startup from
 `/opt/honeydue-obs/data/grafana-provisioning/datasources/datasources.yaml`:
 - VictoriaMetrics (Prometheus type, `http://victoriametrics:8428` in-network)
 - Jaeger (`http://jaeger:16686` in-network)
 Pre-provisioned dashboard: `honeyDue API — RED` at
 `/d/honeydue-red`. Top row uses the legacy custom metrics
 (`http_endpoint_requests_total`, `http_requests_total`) which started
 flowing the moment vmagent attached. Lower rows use the new histograms
 (`http_request_duration_seconds_bucket` p50/p95/p99 by route, GORM p95
 by table, B2 upload p95, APNs/FCM send p95, Go memory + goroutines).
 Lower rows populated immediately after the api rebuild that shipped
 `internal/prom`.
 ### 4. `kubectl logs`
 Every container's stdout/stderr is captured by containerd and readable
 via kubectl:
@@ -33,9 +137,10 @@ kubectl get events -n honeydue --sort-by=.lastTimestamp
 Only the last ~20 MB of logs is retained per container, on-disk on the
 node. Once a pod is deleted, its logs are gone.
-For persistent log access we'd need aggregation (see §what we'd add).
+For persistent log access we'd need aggregation (see §What we still
 don't have).
-### 2. `kubectl top`
+### 5. `kubectl top`
 Pod and node resource usage via metrics-server:
@@ -43,43 +148,32 @@ Pod and node resource usage via metrics-server:
 kubectl top nodes
 # NAME                CPU(cores)   CPU(%)   MEMORY(bytes)   MEMORY(%)
 # ubuntu-8gb-nbg1-1   169m         4%       748Mi           9%
 # ubuntu-8gb-nbg1-2   229m         5%       1043Mi          13%
 # ubuntu-8gb-nbg1-3   124m         3%       770Mi           9%
 kubectl top pods -n honeydue
 ```
-**Retention**: In-memory only. Last few minutes of data. No
+In-memory only; last few minutes of data. For historical trends use
-historical view.
+the Grafana dashboard, which exposes the same data via the `go_*` and
 `container_*` (kubelet cAdvisor) metrics.
-### 3. Cloudflare Analytics
+### 6. Cloudflare Analytics
-CF Dashboard → Analytics & Logs. Per-zone stats:
+CF Dashboard → Analytics & Logs. Per-zone aggregate stats:
- Requests per second
+requests/sec, bandwidth, cache hit ratio, top status codes, top paths,
- Bandwidth
+bot traffic score. Good for spotting macro trends ("suddenly 10× more
- Cache hit ratio
+502s today") that wouldn't show up in a single-pod sample.
 - Top HTTP status codes
 - Top request paths
 - Bot traffic score
-All aggregated, no individual request traces. Good for spotting macro
+Free tier retention: 7 days of aggregate stats.
 trends ("suddenly 10× more 502s today"), poor for debugging specific
 issues.
-Free tier retention: 7 days of aggregate stats. Pro extends this.
+### 7. Neon dashboard
-### 4. Neon dashboard
+Neon console → project → Monitoring: compute utilization (CU-hours),
 slow queries, active connections, storage usage. Useful for "is the
 DB busy?" and free-tier limit watching. The new
 `gorm_query_duration_seconds` histogram covers the application side
 of the same question with much better latency tail visibility.
-Neon console → project → Monitoring:
+### 8. Kubernetes events
 - Compute utilization (CU-hours consumed)
 - Query performance (slow queries)
 - Active connections
 - Storage usage
 Good for "is the DB busy?" and "am I close to my free tier limit?"
 Not real-time.
 ### 5. Kubernetes events
 `kubectl get events` shows cluster-level state changes: pod scheduling,
 failures, image pulls, probe failures. Useful for post-mortem on
@@ -87,7 +181,7 @@ deploys.
 Retention: events are stored in etcd but expire after 1 hour by default.
-## What we don't have (the gap)
+## What we still don't have
 ### No log aggregation
@@ -98,64 +192,108 @@ all api pod logs for user X") we have to:
 # Query all at once with stern (if installed)
 stern -n honeydue api
-# Or for specific pod
+# Or per-pod
 kubectl logs -n honeydue <pod> | grep user_id=12345
 ```
-This works but doesn't scale. Grep across 3 pods for a specific
+This works but doesn't scale across many pods.
 user_id is OK. Across 30 pods, intractable.
-**What we'd add**: [Loki](https://grafana.com/oss/loki/) — a lightweight
+**What we'd add**: [Loki](https://grafana.com/oss/loki/) on
-log aggregator designed for k8s. ~$0 to self-host; integrates with
+`88oakappsUpdate` next to the existing obs stack. Adds ~512 MB RAM
-Grafana for queries. Or [Betterstack](https://betterstack.com/logs)
+plus a Promtail (or Vector/Alloy) DaemonSet in k3s. Defer until log
-($10/mo, hosted).
+search becomes a recurring pain point — `stern` + `grep` is fine at
-
+current pod count.
 ### No metrics/dashboards
 `kubectl top` tells us "is this pod hot right now?" but not "has CPU
 been climbing over the past hour?" We'd need:
 - **Prometheus** — scrapes metrics from kubelet and pods' `/metrics`
  endpoints, stores time series
 - **Grafana** — queries Prometheus, renders dashboards
 K3s can install these via Helm in ~10 minutes. Adds ~500MB RAM to the
 cluster. Stability and operational load: moderate.
 **Alternative**: [Kubernetes Dashboard](https://github.com/kubernetes/dashboard)
 bundled with k3s (disabled by default). Minimal UI over the existing
 metrics API. Cheaper than Prometheus but less queryable.
 ### No distributed tracing
 "This request took 800ms — which hop was slow?" is currently unanswerable
 beyond "the DB query, probably." A real trace would show:
 - TLS handshake time
 - Traefik routing time
 - Go handler time
 - Postgres query time
 - Redis call time
 - Each B2 request time
 We'd add OpenTelemetry to the Go app and export to Jaeger/Tempo. Work
 is moderate; value kicks in when we have complex request flows.
 ### No alerting
 No PagerDuty, no Slack webhooks, no email on "api is returning 500s."
 The operator finds out when users complain.
-Cheapest fix: [Uptime Kuma](https://github.com/louislam/uptime-kuma)
+Cheapest fix path:
-(self-hosted) or Better Stack Uptime (free for small teams). Ping
+1. Grafana alerting (built into Grafana 11) — alert rules over the
-`https://api.myhoneydue.com/api/health/` every minute; alert if it fails.
+   existing histograms (e.g., `histogram_quantile(0.95, ...) > 1s`).
   Routes to Slack via webhook. **Zero infra cost.**
 2. [Uptime Kuma](https://github.com/louislam/uptime-kuma) on
   `88oakappsUpdate` — pings `/api/health/` from outside the cluster
   every minute; complements the in-cluster view.
 We'd want both eventually. Grafana alerting first because the data is
 already there.
 ### Distributed tracing — fully integrated
 The OTel SDK is wired in `cmd/api/main.go` and `cmd/worker/main.go` and
 ships traces to Jaeger via `obs.88oakapps.com/v1/traces`. Every public
 service method now takes `ctx context.Context` and routes its SQL through
 `repo.WithContext(ctx)`, which means **every authenticated API endpoint
 produces a fully-nested flame graph** in Jaeger.
 | Span source | Status |
 |---|---|
 | `otelecho.Middleware` — span per HTTP request | ✅ live |
 | Auth middleware DB lookups (`m.db.WithContext(ctx)`) | ✅ live |
 | All repos via `repo.WithContext(ctx)` (`otelgorm` plugin) | ✅ live |
 | Manual span around `storage_service.Upload` (B2 PutObject) | ✅ live |
 | Manual span around APNs `Send` / `SendWithCategory` | ✅ live |
 | Manual span around FCM `sendOne` | ✅ live |
 | Asynq middleware — span per task type with retry/payload attrs | ✅ live |
 Migrated services (every public method takes ctx):
 - `AuthService` — login, register, refresh, logout, me, verify-email,
  forgot/reset-password, update-profile
 - `TaskService` — all 25+ task and completion methods
 - `ResidenceService` — all 15 methods including share-codes
 - `ContractorService` — all 9 methods
 - `DocumentService` — all 10 methods
 - `NotificationService` — all 12 methods
 - `SubscriptionService` — all 12 methods including Apple/Google IAP
 Sample trace for `GET /api/tasks/` (warm cache, post-optimization):
 ```
 GET /api/tasks/                                              (229ms)
 └── service: SELECT * FROM task_task WHERE residence_id IN
              (SELECT id FROM residence_residence WHERE...)   (227ms)
 ```
 Two spans total. The auth path runs entirely from Redis + in-memory
 cache (zero SQL queries) thanks to the 1-hour token TTL and 5-min user
 TTL. The residence-ID lookup is folded into the tasks query as a
 Postgres subquery, so a single network round-trip to Neon services the
 whole request. See Chapter 8 §"Optimizations layered on top" for the
 optimization stack.
 Earlier trace, before the optimization stack landed (commit 88fb175):
 ```
 GET /api/tasks/                                              (2473ms)
 ├── auth: SELECT * FROM user_authtoken WHERE key=...           (1506ms)
 ├── auth: SELECT * FROM auth_user WHERE id=7                    (333ms)
 ├── service: SELECT id FROM residence_residence WHERE...        (736ms)
 └── service: SELECT * FROM task_task WHERE residence_id IN(...) (226ms)
 ```
 10× improvement from 2,473ms to 229ms by cutting query count
 (4 SQL → 1 SQL on warm cache). The 227ms in the surviving query is
 **1 transatlantic round-trip** to Neon us-east-1 from Hetzner
 Nuremberg — the physical floor on the current setup. It would be
 eliminated by migrating Neon to an EU region; tracked in
 [Chapter 18 §migration triggers](./18-cost.md) and
 `docs/observability-plan.md`.
 **Migration pattern (for any future services or middleware):** add
 `ctx context.Context` as the first arg, change the handler call site
 to pass `c.Request().Context()`, and replace `s.repo.X(...)` with
 `s.repo.WithContext(ctx).X(...)`. Tests pass `context.Background()`.
 ### No APM (Application Performance Monitoring)
-No request-level profiling. We can't see "which endpoint has the highest
+No continuous profiling. We can answer "which endpoint has the highest
-p99 latency?" or "which SQL query is hot this week?"
+p99 latency?" from the histograms, but not "where in the call stack is
 the time going?" without ad-hoc `pprof` runs.
-Options: Datadog, New Relic, Honeycomb, self-hosted Tempo+Grafana.
+If/when needed: Grafana Pyroscope is the OSS continuous profiler that
-All are meaningful work to set up and cost $$$.
+fits our stack. Adds ~512 MB RAM. Defer until a CPU performance
 incident shows up.
 ## The app's logging conventions
@@ -172,28 +310,12 @@ The Go app uses zerolog and emits structured JSON:
 ```
 Log levels: `debug`, `info`, `warn`, `error`, `fatal`. Controlled by
-`DEBUG=true|false` in ConfigMap (true sets level to debug, false sets
+`DEBUG=true|false` in the ConfigMap (true sets level to debug, false
-level to info).
+sets level to info).
-Every request is logged with:
+Every request is logged with method, path, status, request_id, user_id
- Method, path, status code
+(if authenticated), latency. Queryable by grep today; ready to ingest
- Request ID (for correlating logs across pods)
+into Loki when we add it.
 - User ID (if authenticated)
 - Latency
 ```json
 {
  "level": "info",
  "method": "GET",
  "path": "/api/tasks/",
  "status": 200,
  "latency_ms": 42,
  "user_id": 123,
  "request_id": "a6b5db35-..."
 }
 ```
 This is queryable by grep. Better with log aggregation.
 ## Health endpoints
@@ -202,71 +324,58 @@ Each service exposes a health endpoint:
 | Service | Endpoint | What it checks |
 |---|---|---|
 | api | `/api/health/` | Process alive (doesn't verify DB) |
 | api | `/api/health/live` | Process alive |
 | admin | `/` | Next.js is up |
 | worker | (none public) | Internal Asynq status |
 | api | `/metrics` | Prometheus exposition (vmagent scrapes here) |
 | api | `/metrics/legacy` | Custom monitoring metrics for GoAdmin |
 Health endpoints are **shallow** — they return 200 if the process is
 running and listening. They don't try to reach Postgres/Redis/etc.
 Rationale: if Postgres is briefly down, we don't want all api pods to
 start failing liveness and cascade-restart.
-## Dozzle (deprecated)
+## obs.88oakapps.com — the ingest endpoint
-The Swarm era had [Dozzle](https://github.com/amir20/dozzle) — a
+Public hostname for cross-cluster metric and trace ingest. Cloudflare
-lightweight web UI for Docker logs. Accessible via SSH tunnel to the
+in front, nginx on `88oakappsUpdate` enforces a bearer-token check
-manager node. Not deployed on k3s; `kubectl logs` + `stern` fills the
+before forwarding to the local VM/Jaeger containers.
 niche.
-## Kubernetes metrics the k8s API exposes
+| Path | Forwards to | Purpose |
 |---|---|---|
 | `/api/v1/write` | `http://127.0.0.1:8428` | Prometheus remote-write (vmagent → VM) |
 | `/v1/traces` | `http://127.0.0.1:4318/v1/traces` | OTLP/HTTP traces (app → Jaeger) |
 | `/health` | (returns 200) | Reachability probe — also requires auth |
 | anything else | 404 | |
-Even without Prometheus, these are queryable:
+Token lives at `/etc/honeydue-obs/secrets.env` (mode 0600 on the box)
 and at `OBS_INGEST_TOKEN=` in `deploy/prod.env` (gitignored). To rotate:
 generate a new value, update both ends, restart vmagent.
 ```bash
-# Resource metrics (via metrics-server)
+# Operator: rotate the bearer token
-kubectl get --raw /apis/metrics.k8s.io/v1beta1/nodes
+NEW=$(openssl rand -hex 32)
-kubectl get --raw /apis/metrics.k8s.io/v1beta1/namespaces/honeydue/pods
+ssh 88oakappsUpdate "sudo sed -i 's|OBS_INGEST_TOKEN=.*|OBS_INGEST_TOKEN=$NEW|' /etc/honeydue-obs/secrets.env"
-
+ssh 88oakappsUpdate "sudo sed -i 's|Bearer [a-f0-9]\{64\}|Bearer $NEW|' /etc/nginx/sites-available/obs.88oakapps.com && sudo nginx -s reload"
-# Core API (k8s state)
+sed -i.bak "s|^OBS_INGEST_TOKEN=.*|OBS_INGEST_TOKEN=$NEW|" deploy/prod.env
-kubectl get --raw /api/v1/namespaces/honeydue/pods/<name>
+KUBECONFIG=~/.kube/honeydue.yaml kubectl -n honeydue create secret generic vmagent-remote-write \
-
+  --from-literal=bearer_token=$NEW --dry-run=client -o yaml | KUBECONFIG=~/.kube/honeydue.yaml kubectl apply -f -
-# Kubelet metrics (per-node; requires tunneling)
+KUBECONFIG=~/.kube/honeydue.yaml kubectl -n honeydue rollout restart deploy/vmagent
 kubectl get --raw /api/v1/nodes/<node>/proxy/metrics
 ```
-If we ever spin up Prometheus, these are the endpoints it would scrape.
+## Resource budget
-## Future: what to add and when
+| Service | mem_limit | Disk | Retention |
 |---|---|---|---|
 | VictoriaMetrics | 256 MB | 10 GB | 30 days |
 | Jaeger all-in-one (badger) | 256 MB | 10 GB | ~7 days |
 | Grafana OSS | 256 MB | 1 GB | — |
 | vmagent (in k3s) | 256 MB | 512 MB emptyDir | — |
 | **Total** | **~1 GB hard cap** | **~21 GB** | |
-| Trigger | Add |
+Resident usage at idle is much lower (~90 MB on the obs side, ~30 MB
-|---|---|
+for vmagent). Hard limits exist so a memory leak in any one component
-| 10k+ daily users | Loki + Grafana for logs |
+can't squeeze the cohabiting PostHog stack on `88oakappsUpdate`.
 | 100+ req/s sustained | Prometheus + Grafana for metrics |
 | Performance incidents | OpenTelemetry tracing |
 | Revenue > $5k/mo | Paid monitoring (Datadog or similar) |
 | First production outage | Alerting to phone/Slack |
 The overall philosophy: observability is an investment that compounds.
 Add it before you need it, not after. But also don't over-invest at
 idle.
 **Next quarter**: set up Uptime Kuma + Loki at minimum.
 ## Checking what's installed
 ```bash
 # In kube-system namespace
 kubectl get pods -n kube-system
 # Should see: coredns, metrics-server, traefik, local-path-provisioner,
 # and some k3s-related helm install jobs
 # In honeydue namespace
 kubectl get pods -n honeydue
 # api, admin, worker, redis
 # No monitoring namespace (yet)
 kubectl get namespaces
 # default, honeydue, kube-node-lease, kube-public, kube-system
 ```
 ## Operator cheat sheet
@@ -274,32 +383,61 @@ kubectl get namespaces
 # Tail all logs in the namespace
 kubectl logs -n honeydue --all-containers=true --tail=50 -l app.kubernetes.io/part-of=honeydue
 # Scrape state from vmagent self-metrics
 kubectl -n honeydue exec deploy/vmagent -- wget -qO- http://127.0.0.1:8429/metrics \
  | grep -E "scrapes_total|targets|remotewrite"
 # Force vmagent to reload scrape config
 kubectl -n honeydue rollout restart deploy/vmagent
 # Query VictoriaMetrics directly (PromQL)
 ssh 88oakappsUpdate 'curl -s "http://127.0.0.1:8428/api/v1/query?query=histogram_quantile(0.95,sum%20by%20(route,le)(rate(http_request_duration_seconds_bucket%5B5m%5D)))" | python3 -m json.tool'
 # Restart the obs stack on 88oakappsUpdate
 ssh 88oakappsUpdate 'cd /opt/honeydue-obs && sudo docker compose restart'
 # Live obs container memory
 ssh 88oakappsUpdate 'sudo docker stats --no-stream | grep honeydue-obs'
 # Pod resource usage (k3s side)
 kubectl top pods -n honeydue --sort-by=memory
 # With stern (if installed: brew install stern)
 stern -n honeydue .
 # Follow specific pod (current container; use --previous without -f to read the last terminated run)
 kubectl logs -n honeydue <pod> -f --previous=false
 # Pod resource usage
 kubectl top pods -n honeydue --sort-by=memory
 kubectl top pods -n honeydue --sort-by=cpu
 # Events (cluster-wide)
 kubectl get events -A --sort-by=.lastTimestamp | tail -20
 # Full state dump for a pod (debugging)
 kubectl describe pod -n honeydue <pod> > /tmp/pod-dump.txt
 kubectl logs -n honeydue <pod> > /tmp/pod-logs.txt
 ```
 ## Future: what to add and when
 | Trigger | Add |
 |---|---|
 | First production incident | Grafana alerting (free, data already there) |
 | 10k+ daily users | Loki + Vector for log aggregation |
 | Performance incident the histograms can't explain | Wire OTel exporter → Jaeger from the Go app |
 | CPU pressure on api pods | Pyroscope continuous profiler |
 | Multi-product obs needs | Migrate obs stack to dedicated CX32 ($8/mo) |
 The overall philosophy: observability is an investment that compounds.
 Add it before you need it, not after. But also don't over-invest at
 idle.
 ## References
- [Kubernetes metrics-server][ms]
+- [VictoriaMetrics docs][vm]
- [K3s metrics][k3s-metrics]
+- [vmagent kubernetes_sd_configs][vmagent-k8s]
- [Loki][loki]
+- [Jaeger all-in-one with badger][jaeger]
 - [prometheus/client_golang][promclient]
 - [Grafana provisioning datasources][gf-prov]
 - [Loki][loki] (future)
 - [Stern (multi-pod log tail)][stern]
-[ms]: https://github.com/kubernetes-sigs/metrics-server
+[vm]: https://docs.victoriametrics.com/single-server-victoriametrics/
-[k3s-metrics]: https://docs.k3s.io/advanced#enabling-metrics-server
+[vmagent-k8s]: https://docs.victoriametrics.com/vmagent.html#kubernetes-monitoring-with-vmagent
 [jaeger]: https://www.jaegertracing.io/docs/1.62/getting-started/#all-in-one
 [promclient]: https://pkg.go.dev/github.com/prometheus/client_golang
 [gf-prov]: https://grafana.com/docs/grafana/latest/administration/provisioning/#datasources
 [loki]: https://grafana.com/oss/loki/
 [stern]: https://github.com/stern/stern
@@ -115,6 +115,41 @@ kubectl rollout restart deployment/coredns -n kube-system
 kubectl rollout restart deployment/metrics-server -n kube-system
 ```
 #### vmagent can't reach obs.88oakapps.com
 **Symptom**: dashboards stop updating; vmagent logs show 401 / TLS /
 network errors against `obs.88oakapps.com`. App is unaffected.
 **Recovery**: vmagent buffers up to 512 MB locally and replays on
 reconnect, so brief outages self-heal. If sustained:
 ```bash
 # Is the obs endpoint up?
 curl -s -o /dev/null -w "%{http_code}\n" https://obs.88oakapps.com/health \
  -H "Authorization: Bearer $(grep ^OBS_INGEST_TOKEN= deploy/prod.env | cut -d= -f2)"
 # 200 = ingest endpoint healthy.
 # Inspect vmagent's failure metric
 kubectl -n honeydue exec deploy/vmagent -- wget -qO- http://127.0.0.1:8429/metrics \
  | grep -E "remotewrite_(packets|samples)_dropped|persistentqueue_blocks_dropped"
 # Restart vmagent (forces config reload). The emptyDir buffer does not survive pod replacement, so any still-queued samples are lost.
 kubectl -n honeydue rollout restart deploy/vmagent
 ```
 **If 88oakappsUpdate itself is down** (PostHog runs there too):
 SSH and check `sudo docker compose -f /opt/honeydue-obs/docker-compose.yml ps`.
 **Non-critical**: nothing app-facing depends on the obs stack.
 #### Grafana dashboard shows "no data"
 **Possible causes, in order of frequency**:
 1. New histogram name — query targets a metric the api hasn't emitted
   yet. Check `kubectl -n honeydue exec deploy/vmagent -- wget -qO- http://api:8000/metrics`
   for the metric name.
 2. vmagent isn't scraping (see above).
 3. Time range is before the obs stack came up (2026-04-25). Adjust
   the dashboard time picker.
 4. Cardinality blowup — VM rejected high-label-count series. Check
   `vm_rows_inserted_total` vs `vm_rows_dropped_total` on the obs box.
 ### Networking failures
 #### UFW rule accidentally blocks essential traffic
@@ -210,12 +245,58 @@ finds an empty data directory (or can't mount at all).
 - If the original node is gone: Redis starts empty. Cache regenerates.
  Asynq queue state is lost; pending jobs re-queue on retry, cron
  fires re-schedule on next tick.
 - Auth caches (token + residence-IDs) regenerate on first user
  request — first request per user pays full DB lookup, then warm
  again. Visible as a brief latency spike in the Grafana RED
  dashboard, not a functional failure.
 - Ensure the node label `honeydue/redis=true` is on a healthy node:
 ```bash
 kubectl label node <new-node> honeydue/redis=true --overwrite
 kubectl label node <dead-node> honeydue/redis- 2>/dev/null || true
 ```
 #### Stale residence-IDs cache (data freshness bug)
 **Symptom**: a user accepts a share-code or has a residence
 removed, but `/api/tasks/`, `/api/documents/`, `/api/contractors/`,
 or `/api/residences/summary/` continues to show the old
 membership for up to 5 minutes.
 **Cause**: a residence-membership-mutating code path landed
 without calling `cache.InvalidateResidenceIDsForUsers(...)`. The
 cache TTL is 5 min so the issue self-heals, but it's user-visible.
 **Recovery (immediate)**: flush the affected user's cache key
 manually. See [Chapter 17 §residence-IDs cache invalidation](./17-runbook.md).
 **Prevention (permanent)**: every mutation that changes
 `residence_residence.owner_id`, `residence_residence_users.user_id`,
 or deletes a residence MUST invalidate. Existing call sites for
 reference: `CreateResidence` (owner), `DeleteResidence`
 (all members), `JoinWithCode` (joining user), `RemoveUser`
 (removed user). The pattern lives in
 `internal/services/residence_id_cache.go`.
 #### Redis at maxmemory limit
 **Symptom**: Redis logs `OOM command not allowed when used memory > 'maxmemory'`.
 Should be rare — current production usage is ~2.4 MB against a 256 MB
 limit and the policy is `allkeys-lru` (cache writes evict cold keys
 instead of erroring).
 **Recovery**: confirm the policy is still `allkeys-lru`:
 ```bash
 kubectl -n honeydue exec deploy/redis -- redis-cli CONFIG GET maxmemory-policy
 ```
 If it's somehow `noeviction`, set it live:
 ```bash
 kubectl -n honeydue exec deploy/redis -- redis-cli CONFIG SET maxmemory-policy allkeys-lru
 ```
 And re-apply the manifest at `deploy-k3s/manifests/redis/deployment.yaml`
 so the change survives a pod restart.
 If memory usage is genuinely climbing toward the cap, check for
 runaway keys without TTLs:
 ```bash
 kubectl -n honeydue exec deploy/redis -- redis-cli --bigkeys
 ```
 ### External service failures
 #### Neon Postgres outage
@@ -229,6 +310,72 @@ until Neon is back.
 Postgres-level failover.
 **Frequency**: Neon has had a handful of hours-scale outages since launch.
 #### Neon pooler endpoint unreachable but direct endpoint up
 **Symptom**: `dial tcp ep-floral-truth-amttbc5a-pooler.c-5...: i/o
 timeout` in api logs but the direct compute endpoint is reachable.
 Rare — Neon's pooler runs in their infra alongside compute — but
 possible during pooler maintenance.
 **Recovery (emergency)**: switch `DB_HOST` in `config.yaml` from the
 `-pooler` hostname to the direct hostname (drop the `-pooler` segment),
 re-apply the ConfigMap, and rolling-restart api and worker:
 ```bash
 # Edit deploy-k3s/config.yaml: database.host: ep-floral-truth-amttbc5a.c-5...
 # Then:
 KUBECONFIG=~/.kube/honeydue.yaml bash deploy-k3s/scripts/03-deploy.sh --skip-build
 ```
 Cold-handshake latency goes back up (~440ms first hit) but the API
 keeps serving. Switch back when the pooler recovers.
 #### Migrate Job fails during deploy
 **Symptom**: `03-deploy.sh` aborts at the migrations step:
 ```
 [deploy][error] migrations did not complete cleanly; aborting deploy
 ```
 api/worker pods are NOT updated — they keep running the previous
 revision. This is the intentional fail-fast.
 **Recovery**:
 ```bash
 # 1. See the failure
 kubectl -n honeydue logs job/honeydue-migrate --tail=200
 # 2. Common cause: a SQL error in the migration file. Fix the file
 #    locally, commit, retry the deploy. The Job is idempotent —
 #    successful prior versions stay applied; only the failed file
 #    re-runs.
 git add migrations/000NNN_*.sql
 git commit -m "Fix migration NNN"
 git push gitea master
 bash deploy-k3s/scripts/03-deploy.sh
 # 3. Other cause: Neon down or auth changed. Test direct connection:
 DB_PASS=$(kubectl -n honeydue get secret honeydue-secrets \
  -o jsonpath='{.data.POSTGRES_PASSWORD}' | base64 -d)
 docker run --rm -e PGPASSWORD="$DB_PASS" postgres:17-alpine \
  psql "host=ep-floral-truth-amttbc5a.c-5.us-east-1.aws.neon.tech \
        user=neondb_owner dbname=honeyDue sslmode=require" -c "SELECT 1;"
 ```
 **Why no automatic retry**: `backoffLimit: 0` on the Job is deliberate.
 A failing migration almost never gets unstuck by retrying — it needs an
 operator to look. See [Chapter 17 §27](./17-runbook.md) for the recovery
 playbook.
 #### api refuses to start: "Schema precondition failed"
 **Symptom**: api pods log `Schema precondition failed` and exit
 immediately after DB connect.
 **Cause**: the `goose_db_version` table is missing or its latest row has
 `is_applied=false`. This means the migrate Job either was never run or
 ran and rolled back.
 **Recovery**: run the migrate Job manually (see
 [Chapter 17 §26](./17-runbook.md)). After it completes successfully,
 restart the api Deployment so the failing pods are replaced and re-run the schema check:
 ```bash
 kubectl -n honeydue rollout restart deploy/api
 ```
 #### Backblaze B2 outage
 **Symptom**: image uploads fail; image downloads fail unless cached by
@@ -358,6 +358,165 @@ Workaround: in each pod's logs, search for a unique user identifier:
 stern -n honeydue api | grep "user_id=12345"
 ```
 ## 23. Invalidate residence-IDs cache for a user
 Used when a user reports stale data ("I joined a residence but my
 tasks list still shows the old one"). The cache is keyed on user ID
 with a 5-min TTL, so most issues self-heal, but you can flush it manually.
 ```bash
 # Single user
 kubectl -n honeydue exec deploy/redis -- redis-cli DEL "residence_ids_user:7"
 # All users (nuclear; everyone pays one DB lookup on next request)
 kubectl -n honeydue exec deploy/redis -- redis-cli --scan --pattern "residence_ids_user:*" \
  | xargs -r -n 100 kubectl -n honeydue exec deploy/redis -- redis-cli DEL
 ```
 Mutation paths that should invalidate this cache automatically (any
 new code that changes membership must call
 `cache.InvalidateResidenceIDsForUsers(ctx, userIDs...)`):
 - `ResidenceService.CreateResidence` → owner
 - `ResidenceService.DeleteResidence` → all members
 - `ResidenceService.JoinWithCode` → joining user
 - `ResidenceService.RemoveUser` → removed user
 If a user keeps reporting stale data, grep for missing invalidation:
 ```bash
 grep -rn "residenceRepo.*Add\|RemoveUser\|residence_residence_users" internal/ \
  | grep -v cache | grep -v _test
 ```
 ## 24. Verify DB pool warm-up is working
 After a deploy, check the api pod log for the warm-up confirmation:
 ```bash
 kubectl -n honeydue logs -l app.kubernetes.io/name=api --tail=50 \
  | grep "DB pool warm-up complete"
 ```
 Expected output (per pod):
 ```json
 {"level":"info","requested":20,"warmed":20,"message":"DB pool warm-up complete"}
 ```
 If `warmed` < `requested`, the pool partially failed at boot — the pod
 still starts and the pool fills from there. If `warmed=0`, something's wrong with
 either Neon connectivity or auth — check the next log line for the
 specific error.
 To test impact: hit the api right after a rollout. With warm-up
 working, the first request should be ~250ms (1 RTT). Without warm-up,
 the first request is ~700ms (full handshake).
 ## 25. Switch DB host between pooler and direct endpoints
 The pooler endpoint (`-pooler` suffix) is the default — it cuts
 cold-handshake latency by ~3 RTTs. The direct endpoint
 (`ep-floral-truth-amttbc5a.c-5...`) is the fallback.
 ```bash
 # Edit deploy-k3s/config.yaml — change database.host
 # To pooler:   ep-floral-truth-amttbc5a-pooler.c-5.us-east-1.aws.neon.tech
 # To direct:   ep-floral-truth-amttbc5a.c-5.us-east-1.aws.neon.tech
 KUBECONFIG=~/.kube/honeydue.yaml bash deploy-k3s/scripts/03-deploy.sh --skip-build
 ```
 The pooler runs in transaction mode so any session-scope feature
 (LISTEN/NOTIFY, session advisory locks) won't work over it. Migrations
 already handle this — the migrate Job script strips `-pooler` from
 `DB_HOST` before invoking goose. If you add new session-level features
 in the data path, they'll need the same workaround.
 ## 26. Run migrations manually (rare)
 Day-to-day, migrations run as part of every `03-deploy.sh`. But
 sometimes you want to apply or inspect them outside a deploy:
 ```bash
 # Direct-endpoint DSN (goose's advisory lock won't survive the pooler)
 DB_PASS=$(kubectl -n honeydue get secret honeydue-secrets \
  -o jsonpath='{.data.POSTGRES_PASSWORD}' | base64 -d)
 export DATABASE_URL="host=ep-floral-truth-amttbc5a.c-5.us-east-1.aws.neon.tech \
                     port=5432 user=neondb_owner password=$DB_PASS \
                     dbname=honeyDue sslmode=require"
 # What's pending? (read-only; safe to run anytime)
 make migrate-status
 # Apply pending migrations (or `goose -dir migrations postgres "$DATABASE_URL" up`)
 make migrate-up
 # Roll back the most recent migration
 make migrate-down
 # Scaffold a new migration file
 make migrate-new name=add_widget_count_to_residences
 # → migrations/000002_add_widget_count_to_residences.sql
 # Edit, then `make migrate-up` to test, then commit.
 ```
 To run goose from inside the cluster (e.g., to bypass a network policy
 that blocks Neon from your laptop), use the migrate Job manifest as a
 one-shot:
 ```bash
 # Re-runs the latest migrate Job with whatever args you need
 kubectl -n honeydue delete job honeydue-migrate --ignore-not-found
 sed "s|image: IMAGE_PLACEHOLDER|image: $(kubectl -n honeydue get deploy api -o jsonpath='{.spec.template.spec.containers[0].image}')|" \
  deploy-k3s/manifests/migrate/job.yaml | kubectl apply -f -
 kubectl -n honeydue wait --for=condition=complete --timeout=5m job/honeydue-migrate
 kubectl -n honeydue logs job/honeydue-migrate
 ```
 ## 27. Recover from a failed/dirty migration
 If `goose up` fails partway through, the migration file's transaction
 rolls back and `goose_db_version` reflects the last *complete*
 version. Goose marks no row as "dirty" — that's a golang-migrate
 concept. So recovery is just: fix the migration file, re-run.
 If you've genuinely corrupted state (dropped tables you shouldn't have,
 applied a destructive migration in error):
 ```bash
 # See current goose state
 make migrate-status
 psql "$DATABASE_URL" -c \
  "SELECT version_id, is_applied, tstamp FROM goose_db_version ORDER BY id DESC LIMIT 10;"
 # To force the version table back to a known-good number after
 # manually fixing the schema:
 psql "$DATABASE_URL" -c \
  "INSERT INTO goose_db_version (version_id, is_applied, tstamp) VALUES (<N>, true, NOW());"
 ```
 ## 28. Bootstrap goose on a fresh clone of the schema
 If you create a new Neon branch / dev DB and need to bring it under
 goose management:
 ```bash
 export DATABASE_URL="...<the new DB>..."
 # Option A: fresh DB, no schema → just run up
 make migrate-up
 # Option B: schema already populated (e.g., restored from a dump) →
 #          mark v1 as already-applied
 goose -dir migrations postgres "$DATABASE_URL" version  # creates table
 psql "$DATABASE_URL" -c \
  "INSERT INTO goose_db_version (version_id, is_applied, tstamp) VALUES (1, true, NOW());"
 ```
 This is also what was done for the live prod DB at goose-adoption time
 (commit `12b2f9d`).
 ## References
 - [kubectl cheat sheet][kubectl-cs]
@@ -58,6 +58,20 @@ honeyDue.
 |---|---:|
 | Gitea container registry | **$0** |
 ### Observability (88oakappsUpdate)
 VictoriaMetrics + Jaeger + Grafana run co-tenant on the existing Linode
 VPS that hosts PostHog. ~700 MB RAM, 21 GB disk — fits inside the
 existing instance. Not charged to honeyDue.
 | Item | Monthly |
 |---|---:|
 | Self-hosted obs stack on `88oakappsUpdate` | **$0** |
 Migration trigger: when the obs stack starts pressuring PostHog or
 needs hard isolation, move to a dedicated Hetzner CX32 (~$8/mo).
 See [Chapter 15 — When to move off](./15-observability.md).
 ### Total infrastructure
 | Category | Monthly |
@@ -67,6 +81,7 @@ honeyDue.
 | Storage | ~$0.30 |
 | Edge | $0 |
 | Registry | $0 |
 | Observability | $0 |
 | **Total** | **~$30** |
 ## External SaaS
@@ -397,6 +397,35 @@ should reflect reality, not be optimistic.
 **Moral**: Healthchecks should be realistic, not aspirational. Know
 what your app actually does at startup.
 #### Postscript (2026-04-26): the whole `MigrateWithLock` shape was wrong
 A few months after the Swarm migration, switching `DB_HOST` to Neon's
 `-pooler` endpoint for runtime perf wins broke this code completely:
 `pg_advisory_lock` is session-scoped, but PgBouncer transaction-mode
 multiplexes statements across backend Postgres sessions, so the lock
 appeared to be held but actually wasn't. Pods hung at
 "Acquiring migration advisory lock..." and the startup probe killed
 them in turn.
 After a brief band-aid (route migrations through the direct endpoint;
 bump probe to 600s to absorb 5-minute AutoMigrate runs over the slow
 direct connection — both reverted), we abandoned the runtime-side
 migration story entirely and adopted [pressly/goose](https://github.com/pressly/goose)
 in commit `12b2f9d`:
 - Migrations run as a one-shot Kubernetes Job before any api/worker
  pod rolls. No more in-replica migration, no more advisory lock,
  no more startup probe gymnastics.
 - `RequireSchemaApplied` checks `goose_db_version` at startup and
  refuses to boot on a stale schema — fail-fast for "operator
  forgot to run migrate," instead of mysterious runtime errors.
 - `failureThreshold` reverted to its pre-MigrateWithLock value.
  Pods boot in seconds again.
 See [Chapter 8 §Schema management](./08-database.md) for the goose
 shape. This entire sub-section is preserved as historical context
 for why we walked the path we did.
 ## What we learned
 ### Docker Swarm is in a bad place in 2026
@@ -19,84 +19,72 @@ minute, with Slack/email alerts on failure.
 **Effort**: ~30 min for Uptime Kuma deploy, ~10 min for Better Stack
 signup.
-### Cloudflare origin IP restriction
+### ~~Cloudflare origin IP restriction~~ ✓ DONE (2026-04-24)
-**Why**: UFW allows :80 from anywhere. If node IPs leak, direct-connect
+Both `:80` and `:443` `Anywhere` rules removed on all 3 nodes. Only
-attackers bypass CF's WAF/DDoS protection.
+CF's 15 IPv4 + 7 IPv6 ranges allowed on `:443`. Direct-connect attempts
 from non-CF IPs time out.
-**How**: Replace the anywhere-80 UFW rule with 15 IPv4 + 7 IPv6 CF
+**Still TODO**: monthly automated refresh of the CF IP list. Ranges
-ranges. See [Chapter 13 §CF IP ranges](./13-cloudflare.md#cloudflare-ip-ranges-used-in-traefik-trustedips).
+change rarely; manual re-run of `scripts/ufw-cf-refresh.sh` (not yet
 written) on cadence is acceptable for now.
-Automation: a small script that refreshes the CF IP list monthly and
+### ~~Enable network policies in k3s~~ ✓ DONE (2026-04-24)
 re-applies UFW rules.
-**Effort**: 1 hour.
+Applied with one scaffold correction: Traefik runs as a DaemonSet with
 `hostNetwork: true`, so traffic from it arrives with the **node IP** as
 source rather than a pod IP. The original scaffold used
 `namespaceSelector: kube-system` which doesn't match hostNetwork
 traffic. Fixed by using an `ipBlock` list of the three node IPs plus
 the cluster pod CIDR `10.42.0.0/16`.
-### Enable network policies in k3s
+Also added policies for `web` (missing from the original scaffold).
-**Why**: Currently pods can freely egress anywhere. A compromised pod
+### ~~Apply Traefik security middleware~~ ✓ DONE (2026-04-24)
 could exfiltrate data or attack lateral services.
-**How**: `kubectl apply -f deploy-k3s/manifests/network-policies.yaml`.
+`security-headers` + `rate-limit` attached to all three ingresses
-The scaffold defines default-deny + explicit allows for:
+(api, admin, web). `admin-auth` is defined but not attached (needs an
- DNS egress for all pods
+`admin-basic-auth` secret we haven't created). `cloudflare-only` IP
- Traefik → api (port 8000)
+allowlist exists but is redundant with the UFW-level CF restriction —
- Traefik → admin (port 3000)
+keep for defense in depth if we ever expose another layer.
 - api/worker → Redis
 - api/worker → external services (Postgres, B2, Fastmail)
-Then test that nothing breaks (might need to adjust allow rules).
+One scaffold correction: the `Content-Security-Policy` header in
-
+`security-headers.customResponseHeaders` was stripped. The Go API sets
-**Effort**: 1-2 hours including testing.
+its own CSP in `internal/router/router.go`, and two CSP headers combine
-
+via intersection (most restrictive wins), which would break the Google
-### Apply Traefik security middleware
+Fonts on the marketing landing page. Next.js apps set their own via
-
+middleware.
 **Why**: Our current Ingress has no rate limiting or security headers
 beyond what Traefik adds by default.
 **How**: Apply `deploy-k3s/manifests/ingress/middleware.yaml`, annotate
 Ingresses to use them:
 ```yaml
 metadata:
  annotations:
    traefik.ingress.kubernetes.io/router.middlewares: honeydue-security-headers@kubernetescrd,honeydue-rate-limit@kubernetescrd
 ```
 **Effort**: 15 min.
 ## Medium priority
-### Upgrade to CF Full (strict) SSL
+### ~~Upgrade to CF Full (strict) SSL~~ ✓ DONE (2026-04-24)
-**Why**: Currently CF↔origin is plain HTTP. An attacker between CF and
+Origin CA cert (`*.myhoneydue.com` + `myhoneydue.com`, 15-year
-Hetzner could read traffic. Full (strict) mode encrypts this leg with
+validity) stored as `cloudflare-origin-cert` TLS secret. All three
-a CF-issued origin cert.
+ingresses reference it via `tls:` blocks. CF mode flipped from
 Flexible to Full (strict). Verified by:
-**How**:
+- direct-connect to origin on `:443` serves the Origin cert (subject
-1. Generate Origin CA cert in CF dashboard → SSL/TLS → Origin Server
+  `CN=CloudFlare Origin Certificate`)
-2. Create `cloudflare-origin-cert` Secret in k8s
+- CF edge continues to serve its own Let's Encrypt cert to browsers
-3. Add `tls:` block to Ingresses
+- both layers now TLS-encrypted
 4. Switch CF SSL mode to Full (strict)
-**Effort**: 30 min.
+### ~~Migration Job for schema changes~~ — done (2026-04-26, commit 12b2f9d)
-**Citations**: [Cloudflare Origin CA docs][cf-origin-ca]
+**What shipped**: pressly/goose as the migration tool, run as a one-shot
 Kubernetes Job from `deploy-k3s/manifests/migrate/job.yaml` before
 api/worker rollout. The Job uses the api image (goose CLI is baked in
 during the Dockerfile build), strips `-pooler` from `DB_HOST` for the
 direct-endpoint connection migrations need, and exits in seconds when
 there's nothing to apply. `RequireSchemaApplied` in the api/worker
 startup checks `goose_db_version` and fails fast on a stale schema.
-### Migration Job for schema changes
+The Go-code-with-`--migrate-only` shape originally proposed here was
 rejected in favor of using the upstream goose binary directly — see
 [Chapter 8 §Schema management](./08-database.md) for the trade-offs.
-**Why**: Currently every api pod runs `MigrateWithLock()` on startup,
+Pre-goose `MigrateWithLock` is gone; ch19 §13 has the historical
-serializing on a Postgres advisory lock. Adds 90-240s to cold startup
+postmortem context.
 and caused bug #13 in Chapter 19.
 **How**: Create a Kubernetes `Job` resource that runs the api image
 with a `--migrate-only` flag. Job runs once per deploy, completes when
 schema is current. api pods get an initContainer that waits for the
 Job to complete.
 Requires Go code change to support `--migrate-only` flag.
 **Effort**: 3-4 hours (code + job manifest + testing).
 ### Redis password
@@ -312,7 +300,16 @@ k3s server on each node with the new backend.
 As items are done, mark them here. Think of this as a running changelog.
 - [x] k3s migration from Swarm (2026-04-24)
- [x] Traefik DaemonSet + hostNetwork
+- [x] Traefik DaemonSet + hostNetwork (2026-04-24)
- [x] Admin seed via ADMIN_EMAIL + ADMIN_PASSWORD
+- [x] Admin seed via ADMIN_EMAIL + ADMIN_PASSWORD (2026-04-24)
- [x] Documentation book (this doc set)
+- [x] Documentation book (this doc set) (2026-04-24)
 - [x] Web client deployed at `app.myhoneydue.com` (2026-04-24) — Next.js 16 standalone, 3 replicas with PDB, proxy pattern to api, see Chapter 7.
 - [x] Admin URL-baking fix (2026-04-24) — Dockerfile `ARG NEXT_PUBLIC_API_URL`, `.dockerignore` hardening for `admin/.env.*`.
 - [x] Auto-seed initial data on first API boot (2026-04-24) — `20260414_seed_initial_data` migration populates lookups, admin user, task templates. See commit `4ec4bbb`.
 - [x] APNs wired up (2026-04-24) — Key ID `5L5BVF5G48`, Team ID `X86BR9WTLD`, sandbox mode. Secret `honeydue-apns-key`, `FEATURE_PUSH_ENABLED=true`.
 - [x] Traefik middleware: `security-headers` + `rate-limit` attached to all three ingresses (2026-04-24). CSP is stripped from the middleware because the Go API sets its own.
 - [x] Admin liveness probe path fix (2026-04-24) — was hitting `/admin/` (404) and crashlooping every ~90s for 6 hours before the bug was caught. Fixed to `/`.
 - [x] Network policies applied (2026-04-24) — default-deny + explicit allows. Traefik hostNetwork is matched via node IP `ipBlock`s, not namespaceSelector. See Chapter 5.
 - [x] Cloudflare Full (strict) SSL (2026-04-24) — Origin CA cert installed as `cloudflare-origin-cert` secret, ingresses have `tls:` blocks, CF mode flipped from Flexible. Both user↔CF and CF↔origin now TLS.
 - [x] UFW CF-IP allowlist on all 3 nodes (2026-04-24) — 15 IPv4 + 7 IPv6 CF ranges allow `:443`; `Anywhere` rules for `:80` and `:443` deleted. Direct-connect from non-CF IPs times out.
 - [ ] All other items above
@@ -40,7 +40,7 @@ they do, and how to operate them.
 - [07 — Services](./07-services.md) — api, admin, worker, redis per-service deep dive
 - [08 — Database](./08-database.md) — Neon Postgres, goose migrations
- [09 — Storage](./09-storage.md) — Backblaze B2, minio-go client details
+- [09 — Storage](./09-storage.md) — Backblaze B2, minio-go, presigned-URL direct uploads
 - [10 — Secrets & Config](./10-secrets-config.md) — ConfigMap, Secret, env mapping
 - [11 — Registry](./11-registry.md) — Gitea container registry, multi-arch builds
@@ -48,7 +48,7 @@ they do, and how to operate them.
 - [12 — Data Flow](./12-data-flow.md) — end-to-end request lifecycle
 - [14 — Deployment Process](./14-deployment-process.md) — how to roll new code
- [15 — Observability](./15-observability.md) — logs, metrics, tracing
+- [15 — Observability](./15-observability.md) — VictoriaMetrics + Jaeger + Grafana on `obs.88oakapps.com`, vmagent in-cluster, Prometheus histograms in the Go API
 - [16 — Failure Modes](./16-failure-modes.md) — what happens when X dies
 - [17 — Runbook](./17-runbook.md) — common ops tasks
@@ -173,11 +173,21 @@ suffix. (Chapter 8)
 ## Go + Asynq
 **AutoMigrate**: GORM function that syncs DB schema to Go structs.
-(Chapter 8)
+We used this in production until 2026-04, replaced by goose. Tests
 still use it via `testutil.SetupTestDB`. (Chapter 8)
 **Asynq**: Go library for background job queues. Redis-backed.
 (Chapter 7)
 **goose**: pressly/goose — the SQL migration tool we use in production
 (commit 12b2f9d onward). Migration files live in `migrations/`, one
 file per version with `-- +goose Up` / `-- +goose Down` markers.
 (Chapter 8)
 **goose_db_version**: goose's version-tracking table. One row per
 applied migration. `RequireSchemaApplied` reads the latest row at
 api/worker startup to fail fast on a stale schema. (Chapter 8)
 **GORM**: Go ORM we use. (Chapter 8)
 **pgx**: Go Postgres driver used by GORM. (Chapter 8)
@@ -278,6 +278,43 @@ ssh -i ~/.ssh/hetzner deploy@<node> 'sudo systemctl start k3s'
 # then re-join via the k3s install command
 ```
 ## Observability
 ```bash
 # Hit api /metrics from inside the cluster
 kubectl -n honeydue exec deploy/vmagent -- wget -qO- http://api:8000/metrics | head -30
 # vmagent self-stats: scrapes succeeded, samples shipped, queue health
 kubectl -n honeydue exec deploy/vmagent -- wget -qO- http://127.0.0.1:8429/metrics \
  | grep -E "scrapes_total|targets|remotewrite_samples_dropped|persistentqueue_blocks_dropped"
 # Force vmagent to reload config (after editing the ConfigMap)
 kubectl -n honeydue rollout restart deploy/vmagent
 # Query VictoriaMetrics by SSH'ing to the obs box
 ssh 88oakappsUpdate 'curl -s "http://127.0.0.1:8428/api/v1/query?query=up"'
 # p95 latency by route, last 5m
 ssh 88oakappsUpdate 'curl -s "http://127.0.0.1:8428/api/v1/query?query=histogram_quantile(0.95,sum%20by%20(route,le)(rate(http_request_duration_seconds_bucket%5B5m%5D)))" | python3 -m json.tool'
 # All metric names landing in VM
 ssh 88oakappsUpdate 'curl -s http://127.0.0.1:8428/api/v1/label/__name__/values | python3 -m json.tool'
 # Restart the obs stack on 88oakappsUpdate (VM + Jaeger + Grafana)
 ssh 88oakappsUpdate 'cd /opt/honeydue-obs && sudo docker compose restart'
 # Live RAM usage of the obs containers
 ssh 88oakappsUpdate 'sudo docker stats --no-stream | grep honeydue-obs'
 # Test the obs ingest endpoint with auth
 TOKEN=$(grep ^OBS_INGEST_TOKEN= deploy/prod.env | cut -d= -f2-)
 curl -s -o /dev/null -w "%{http_code}\n" https://obs.88oakapps.com/health \
  -H "Authorization: Bearer $TOKEN"  # 200 = healthy
 ```
 Dashboards live at `https://grafana.88oakapps.com/d/honeydue-red`.
 Admin credentials in `deploy/prod.env`.
 ## One-liners worth memorizing
 ```bash
@@ -34,6 +34,14 @@ ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIBU9xTTBD78tYUqHijgyU9PDqtmS4NuM/6uy8XgDzva+
 | `~/.docker/config.json` | Docker CLI config. After `docker login` to Gitea, contains creds. **Log out after each deploy** to avoid leaving PATs on disk. |
 | `~/Library/Containers/com.docker.docker/` | Docker Desktop state (macOS). |
 ### Apple / Cloudflare credentials on disk
 | Path | Purpose |
 |---|---|
 | `~/Desktop/code/honeyDue/AuthKey_5L5BVF5G48.p8` | APNs auth key (Apple). Source file for the `honeydue-apns-key` k8s secret. Sensitive — treat as a credential. |
 | `~/Desktop/code/honeyDue/cf-origin-cert.pem` | Cloudflare Origin CA cert (PEM). Source file for the `cloudflare-origin-cert` k8s secret. `*.myhoneydue.com` + `myhoneydue.com`, expires 2041. |
 | `~/Desktop/code/honeyDue/cf-origin-key.pem` | Private key for the Origin cert. CF only shows this **once** at generation time. Sensitive — treat as a credential. |
 ## Git repo (`/Users/treyt/Desktop/code/honeyDue/honeyDueAPI-go/`)
 ### Top-level
@@ -90,19 +98,21 @@ ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIBU9xTTBD78tYUqHijgyU9PDqtmS4NuM/6uy8XgDzva+
 | `deploy-k3s/manifests/namespace.yaml` | Creates `honeydue` namespace. |
 | `deploy-k3s/manifests/rbac.yaml` | ServiceAccounts + `automountServiceAccountToken: false`. |
 | `deploy-k3s/manifests/pod-disruption-budgets.yaml` | PDBs for api (2/3) and worker (0/1). |
-| `deploy-k3s/manifests/network-policies.yaml` | Default-deny + allows. NOT currently applied. |
+| `deploy-k3s/manifests/network-policies.yaml` | Default-deny + allows. **Applied.** Includes web policies; Traefik hostNetwork handled via node IP `ipBlock`s rather than namespaceSelector. |
 | `deploy-k3s/manifests/api/deployment.yaml` | api Deployment. |
 | `deploy-k3s/manifests/api/service.yaml` | api ClusterIP Service. |
 | `deploy-k3s/manifests/api/hpa.yaml` | api HorizontalPodAutoscaler. NOT currently applied. |
 | `deploy-k3s/manifests/admin/deployment.yaml` | admin Deployment. |
 | `deploy-k3s/manifests/admin/service.yaml` | admin Service. |
 | `deploy-k3s/manifests/web/deployment.yaml` | web Deployment (3 replicas, customer-facing Next.js at app.myhoneydue.com). |
 | `deploy-k3s/manifests/web/service.yaml` | web ClusterIP Service. |
 | `deploy-k3s/manifests/worker/deployment.yaml` | worker Deployment. |
 | `deploy-k3s/manifests/redis/deployment.yaml` | Redis Deployment. |
 | `deploy-k3s/manifests/redis/service.yaml` | Redis Service. |
 | `deploy-k3s/manifests/redis/pvc.yaml` | Redis PersistentVolumeClaim. |
-| `deploy-k3s/manifests/ingress/ingress.yaml` | Full Ingress with TLS + middleware (scaffold; needs CF origin cert). |
+| `deploy-k3s/manifests/ingress/ingress.yaml` | Alternate full Ingress scaffold (unused; we apply ingress-simple.yaml). |
-| `deploy-k3s/manifests/ingress/ingress-simple.yaml` | Simple Ingress without TLS (what we actually apply). |
+| `deploy-k3s/manifests/ingress/ingress-simple.yaml` | **Primary Ingress**. TLS via CF Origin cert, `security-headers` + `rate-limit` middleware attached to all three rules (api/admin/web). |
-| `deploy-k3s/manifests/ingress/middleware.yaml` | Traefik middleware CRDs. Not currently applied. |
+| `deploy-k3s/manifests/ingress/middleware.yaml` | Traefik middleware CRDs (`rate-limit`, `security-headers`, `cloudflare-only`). Applied. `admin-auth` was defined but removed at runtime (needs an unset basic-auth secret). |
 | `deploy-k3s/manifests/traefik-helmchartconfig.yaml` | Our DaemonSet + hostNetwork override for Traefik. |
 | `deploy-k3s/manifests/secrets.yaml.example` | Template (never deployed). |
 | `deploy-k3s/scripts/01-provision-cluster.sh` | hetzner-k3s provisioning (we didn't use it; existing nodes). |
@@ -65,7 +65,9 @@ Every external link cited anywhere in this book, grouped by topic.
 - [Neon usage-based pricing announcement][neon-blog]
 - [Neon connect from any app][neon-connect]
 - [Postgres advisory locks][pg-locks]
- [GORM AutoMigrate][gorm-automigrate]
+- [GORM AutoMigrate][gorm-automigrate] (tests only — production migrations use goose)
 - [pressly/goose — SQL migration tool][goose]
 - [Goose documentation][goose-docs]
 ## Backblaze B2
@@ -168,6 +170,8 @@ Every external link cited anywhere in this book, grouped by topic.
 [neon-connect]: https://neon.com/docs/connect/connect-from-any-app
 [pg-locks]: https://www.postgresql.org/docs/current/explicit-locking.html#ADVISORY-LOCKS
 [gorm-automigrate]: https://gorm.io/docs/migration.html
 [goose]: https://github.com/pressly/goose
 [goose-docs]: https://pressly.github.io/goose/
 <!-- B2 -->
 [b2-docs]: https://www.backblaze.com/docs/
@@ -0,0 +1,166 @@
 # Observability Plan — honeyDue (100% self-hosted)
 **Goal:** Live request-timing visibility (HTTP, DB, B2 uploads, APNs, asynq jobs) without paying any SaaS vendor.
 **Deployment target:** `88oakappsUpdate` (Linode VPS at `185.143.228.16`, Ubuntu 24.04, 8 vCPU / 32 GB RAM / 193 GB disk). This box already runs the self-hosted PostHog stack and has nginx + Let's Encrypt set up for `*.88oakapps.com`. Free RAM at rest ≈ 15 GB; the obs stack budget is ≈ 700 MB → ~5% of free RAM. Costs $0 incremental.
 **Why not in the honeyDue k3s cluster:** Frees ~700 MB across the 3 Hetzner nodes, no PVC plumbing, and no need to expose anything from k3s — everything is push-from-app to a public TLS endpoint.
 **Status:** Fully shipped. VictoriaMetrics + Jaeger + Grafana on `obs.88oakapps.com`, vmagent in-cluster, OTel SDK and otelgorm wired into the api+worker, every authed endpoint produces nested HTTP→service→SQL flame graphs in Jaeger.
 The first round of traces revealed every visible ms was network/proxy overhead — DB execution itself is sub-millisecond. The follow-up work (`internal/services/residence_id_cache.go`, GORM pool warm-up, auth-query JOIN consolidation, switching `DB_HOST` to Neon's `-pooler` endpoint, bumped cache TTLs) cut warm-cache `/api/tasks/` from 2,473 ms / 5 spans to **229 ms / 2 spans** — see commit `88fb175` and Chapter 8 §"Optimizations layered on top".
 ---
 ## Stack
 | Role | Choice | Why this vs. the obvious alternative |
 |---|---|---|
 | Metrics store | **VictoriaMetrics** (single-node) | Drop-in Prometheus-compatible. ~2.5× lower RAM (~200 MB vs ~500 MB) and ~7× better compression. Single binary. |
 | Tracing | **Jaeger all-in-one** | ~150 MB RAM with embedded badger storage. Tempo monolithic mode needs 1-2 GB minimum — overkill for honeyDue's scale. |
 | Dashboards | **Grafana OSS** | Connects to both VM (Prometheus protocol) and Jaeger natively. |
 | App instrumentation | **OpenTelemetry SDK** + `prometheus/client_golang` | OTel is vendor-neutral — backends are swappable without code change. |
 | Logs | **Keep Dozzle**; add Loki only when log search becomes painful | Loki adds ~512 MB RAM + a daemonset for log shipping. Not worth it until there's a concrete pain point. |
 ### Why not the LGTM stack (Loki + Grafana + Tempo + Mimir)?
 - **Tempo** wants 1-2 GB RAM minimum in monolithic mode ([Grafana community report](https://community.grafana.com/t/tempo-ram-usage-for-6k-spans-per-hour/63801)). Stacking that on top of Loki + Mimir would consume ~3-4 GB RAM. On a 3×8 GB cluster that's 12-17% of capacity for observability infra.
 - **Mimir** is wonderful for multi-tenant Prometheus at scale — you have one tenant.
 - **Loki** is great if you live in `kubectl logs` and need full-text search across them. You currently use Dozzle and are not feeling that pain.
 VictoriaMetrics + Jaeger all-in-one gives you 90% of the value at 25% of the resource cost.
 ---
 ## Resource budget on `88oakappsUpdate`
 Three Docker containers in a separate compose project under `/opt/honeydue-obs/` — fully isolated from the existing PostHog compose stack so PostHog's lifecycle never touches the obs stack and vice versa.
 | Service | `mem_limit` | Disk (bind mount) | Retention |
 |---|---|---|---|
 | VictoriaMetrics single-node | 256 MB | 10 GB | 30 days metrics |
 | Jaeger all-in-one (badger storage) | 256 MB | 10 GB | 7 days traces |
 | Grafana OSS | 256 MB | 1 GB | — |
 | **Total** | **~768 MB hard cap** | **21 GB** | |
 **~5% of the box's free RAM and ~14% of free disk.** The hard `mem_limit` per container matters: ClickHouse on the same VM can spike under PostHog analytics load, so bounding the obs stack prevents it from competing in a memory pinch.
 **Don't reuse PostHog's ClickHouse / Kafka / Redis.** Tempting because they're sitting right there, but coupling honeyDue's observability to PostHog's storage means a PostHog incident takes honeyDue's incident-response telemetry down with it. Keep them fully separate.
 **Shared blast radius caveat:** A kernel panic on `88oakappsUpdate` loses both PostHog and honeyDue obs at once. At current scale, fine — call it out, don't fix.
 ---
 ## App-side instrumentation
 | Surface | Library / approach | Import path |
 |---|---|---|
 | Echo HTTP middleware | `otelecho` — span per request, tagged route/method/status | `go.opentelemetry.io/contrib/instrumentation/github.com/labstack/echo/otelecho` |
 | GORM queries | `uptrace/otelgorm` plugin — `db.Use(otelgorm.NewPlugin())`. Requires threading `ctx` through repositories so `db.WithContext(ctx)` works. | `github.com/uptrace/opentelemetry-go-extra/otelgorm` |
 | B2 / minio-go uploads | Manual span around `storage_service.Upload` with attributes for bucket, object size, MIME type | `go.opentelemetry.io/otel` |
 | APNs / FCM | Manual span in `internal/push/apns.go` and `fcm.go`; record device-token, response status code | `go.opentelemetry.io/otel` |
 | asynq jobs | Custom `asynq.MiddlewareFunc` (~20 lines) — span per task type, attached to ctx, records duration + retry count | `go.opentelemetry.io/otel` + `asynq.MiddlewareFunc` |
 | Prometheus `/metrics` endpoint | `prometheus/client_golang` direct — register histograms for HTTP duration / GORM op / B2 op / APNs send | `github.com/prometheus/client_golang/prometheus`, `.../prometheus/promhttp` |
 | OTLP exporter | OTLP/HTTP → `https://obs.88oakapps.com/v1/traces` with bearer token. 100% sample in dev, 10% in prod. | `go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp` |
 | Metrics push | `vmagent` sidecar in k3s scrapes the api Pod's `/metrics` and remote-writes to `https://obs.88oakapps.com/api/v1/write` with bearer token. Cleaner than exposing `/metrics` publicly. | `victoriametrics/vmagent` image |
 **Note on GORM context propagation:** the existing repository methods don't take `ctx context.Context`. Adding `otelgorm` requires plumbing ctx down from the Echo handler through the service layer to the repository call site. ~10 repository files, many call sites. Save for last because the diff is large.
 ---
 ## Implementation order (smallest first)
 ### Step 1 — Metrics + dashboards (highest immediate ROI)
 **On `88oakappsUpdate`:**
 1. `mkdir -p /opt/honeydue-obs/{data/vm,data/jaeger,data/grafana}` and add a `docker-compose.yml` defining the three services with `mem_limit: 256m`, bind mounts for persistence, and an isolated bridge network
 2. Add nginx vhosts (DNS A records first):
   - `grafana.88oakapps.com` → `127.0.0.1:3000` (basic auth via htpasswd, Let's Encrypt)
   - `obs.88oakapps.com` → routes by path:
     - `/api/v1/write` → `127.0.0.1:8428` (VictoriaMetrics remote-write, bearer-token check)
     - `/v1/traces`     → `127.0.0.1:4318` (OTLP/HTTP traces, bearer-token check)
 3. Generate a 32-byte token, store in `/etc/honeydue-obs/token` (mode 0600), reference from nginx as `auth_request` or simple `if ($http_authorization != ...)`
 4. Pre-provision Grafana with the VM datasource pointing at `http://victoriametrics:8428` (in-network)
 **On the honeyDue k3s cluster:**
 5. Add `prometheus/client_golang` to `honeyDueAPI-go/go.mod` and a `/metrics` endpoint to the Go API
 6. Register histograms:
   - `http_request_duration_seconds{route,method,status}` via Echo middleware
   - `gorm_query_duration_seconds{table,operation}` via a GORM `Plugin` callback (no ctx needed for this one — operates at the SQL string level)
   - `b2_upload_duration_seconds{bucket,result}`
   - `apns_send_duration_seconds{result}`
 7. Deploy a `vmagent` sidecar (or DaemonSet) in the `honeydue` namespace with:
   - Scrape: api Service `/metrics` every 15s
   - `-remoteWrite.url`: `https://obs.88oakapps.com/api/v1/write`
   - `-remoteWrite.bearerToken` (or `-remoteWrite.bearerTokenFile`): from k8s Secret
 8. Build the RED dashboard in Grafana: rate, errors, duration p50/p95/p99 per route
 **ROI:** "Is the API healthy? Where is time being spent right now?" answered live, served from `grafana.88oakapps.com`.
 ### Step 2 — Tracing baseline
 (Jaeger is already up from Step 1. This step adds the app-side wiring.)
 1. Add Grafana datasource for Jaeger pointing at `http://jaeger:16686` (in-network)
 2. Wire OTel SDK in `cmd/api/main.go`:
   - `otel.SetTracerProvider(tracerProvider)`
   - `otelecho.Middleware("honeydue-api")` on Echo
   - OTLP/HTTP exporter pointing at `https://obs.88oakapps.com/v1/traces` with `Authorization: Bearer <token>` header (token from env)
   - Sampling: `TraceIDRatioBased(0.1)` in prod, `AlwaysSample()` in dev
 3. Verify: a single `POST /api/auth/login/` produces a trace in Jaeger
 **ROI:** "Why is this one request slow?" — answered with a flame graph.
 ### Step 3 — Manual spans for the work that actually matters
 Wrap each in `tracer.Start(ctx, ...)` with attributes:
 - `storage_service.Upload` → span "b2.PutObject" with `bucket`, `key`, `size_bytes`, result
 - `push/apns.go` → span "apns.send" with `device_token_hash`, `status_code`, `reason`
 - `asynq` middleware → span per task type with `task.type`, `retry_count`, `payload_size`
 **ROI:** Specific high-value debugging questions ("why did this upload take 30 seconds", "why did these 5 push notifications fail") answered without code archaeology.
 ### Step 4 — Repository ctx + `otelgorm` (biggest diff, save for last)
 1. Refactor every repository method to accept `ctx context.Context` as first arg
 2. Update every call site to pass `c.Request().Context()` from handlers / propagate through services
 3. Add `db.Use(otelgorm.NewPlugin())` in `internal/database/database.go`
 4. Verify: a request now has nested spans `http → service → query → query → b2.PutObject → apns.send` with full SQL on the query spans
 **ROI:** Every DB query in every trace, with SQL + table + rows. The "find the N+1" tool you'd otherwise build by hand.
 ---
 ## Hard skips (revisit only when explicitly proven needed)
 | Tool | Why skip |
 |---|---|
 | Loki / Promtail | Dozzle covers the immediate need. Loki adds 512 Mi RAM + a daemonset; defer until log search becomes a hot pain point. |
 | Mimir / VM cluster mode | Single-node VM handles honeyDue scale for years. |
 | Pyroscope continuous profiling | Overkill at 3 small nodes. Use `pprof` endpoints ad-hoc when CPU pressure shows up. |
 | OTel Collector | Only worth running when 3+ services emit telemetry. App → Jaeger direct is fine for now. |
 | Any SaaS vendor (Datadog, NR, Honeycomb, Grafana Cloud, Sentry Performance) | User constraint: nothing paid. |
 ---
 ## When to move off `88oakappsUpdate`
 Triggers — any one is enough:
 - `88oakappsUpdate` available memory drops below ~3 GB sustained (PostHog growth squeezing it)
 - ClickHouse OOM events start showing up in `dmesg` (PostHog under load)
 - You want fully separate failure domains for honeyDue vs. 88oakapps
 Migration path: the obs stack is a single docker-compose project on a bind-mount, so moving it = `rsync /opt/honeydue-obs/` to a new box, recreate the nginx vhosts (with fresh Let's Encrypt certs) and `/etc/honeydue-obs/token` there (they live outside the compose directory), update DNS for `grafana.88oakapps.com` and `obs.88oakapps.com`, `docker compose up -d`. ~30 min of work. Until then: cohabiting on `88oakappsUpdate` is correct.
 ---
 ## Quick reference: what shows up where
 | Question | Where to look |
 |---|---|
 | Is the API up right now? Latency? Errors? | Grafana RED dashboard |
 | Why is this specific request slow? | Jaeger trace view |
 | What did the slow part of that request actually do (which SQL, which B2 PUT)? | Span details inside the trace |
 | Background job throughput / queue depth | VictoriaMetrics + asynq metrics |
 | What did the app print to stdout 5 minutes ago? | Dozzle |
 | What error did the app log? | Dozzle (search) — or Loki if/when added |
@@ -0,0 +1,146 @@
 # Runbook — Secret Rotation
 Closes audit finding `K3S-F12` (secrets unrotated since cluster bootstrap,
 no rotation cadence). See `deploy-k3s/SECURITY.md` Stage 2.
 **Cadence:** rotate every secret at least **annually**. Rotate
 **immediately** on suspected exposure, on loss of an operator device, or
 when anyone who has seen a secret leaves the project.
 **Record keeping:** after each rotation, annotate the secret so the age is
 visible:
 ```bash
 kubectl -n honeydue annotate secret <name> \
  honeydue.dev/last-rotated="$(date -u +%Y-%m-%d)" --overwrite
 ```
 ---
 ## How rotation works
 Every secret has a **source of truth** on the operator workstation. The
 deploy scripts read those sources and (re)create the Kubernetes Secrets.
 Rotation is always: **update the source → re-run `02-setup-secrets.sh` →
 restart the pods that consume it → revoke the old credential at its
 provider.**
 `02-setup-secrets.sh` renders each Secret with `--dry-run=client -o yaml` and
 feeds the result to `kubectl apply`, so re-running it is idempotent and only
 modifies the Secrets whose sources you changed.
 | Kubernetes Secret | Source of truth | Consumed by |
 |---|---|---|
 | `honeydue-secrets` → `POSTGRES_PASSWORD` | `deploy-k3s/secrets/postgres_password.txt` | api, worker |
 | `honeydue-secrets` → `SECRET_KEY` | `deploy-k3s/secrets/secret_key.txt` | api, worker |
 | `honeydue-secrets` → `EMAIL_HOST_PASSWORD` | `deploy-k3s/secrets/email_host_password.txt` | api, worker |
 | `honeydue-secrets` → `FCM_SERVER_KEY` | `deploy-k3s/secrets/fcm_server_key.txt` | api, worker |
 | `honeydue-secrets` → `REDIS_PASSWORD` | `config.yaml` key `redis.password` | api, worker, redis |
 | `honeydue-secrets` → `OBS_INGEST_TOKEN` | `deploy/prod.env` | api, worker |
 | `honeydue-apns-key` → `apns_auth_key.p8` | `deploy-k3s/secrets/apns_auth_key.p8` | api, worker |
 | `cloudflare-origin-cert` | `deploy-k3s/secrets/cloudflare-origin.{crt,key}` | Traefik ingress |
 | `ghcr-credentials` | `config.yaml` block `registry.*` | image pulls (all pods) |
 | `admin-basic-auth` | `config.yaml` keys `admin.basic_auth_user` / `..._password` | Traefik `admin-auth` middleware |
 The `deploy-k3s/secrets/` directory and `config.yaml` are **gitignored** —
 never commit them.
 ---
 ## Standard rotation procedure
 ```bash
 cd honeyDueAPI-go
 export KUBECONFIG="$(pwd)/deploy-k3s/kubeconfig"
 # 1. Update the source (file under deploy-k3s/secrets/ or a config.yaml key)
 # 2. Recreate the Kubernetes Secrets from sources
 ./deploy-k3s/scripts/02-setup-secrets.sh
 # 3. Restart the consumers (see per-secret notes below for which)
 kubectl -n honeydue rollout restart deploy/api deploy/worker
 # 4. Confirm health
 kubectl -n honeydue rollout status deploy/api
 kubectl -n honeydue rollout status deploy/worker
 # 5. Revoke the OLD credential at its provider (see per-secret notes)
 # 6. Annotate the rotated secret with today's date
 ```
 ---
 ## Per-secret notes
 ### `POSTGRES_PASSWORD`
 1. Rotate the role password in the Neon dashboard.
 2. Write the new value to `deploy-k3s/secrets/postgres_password.txt`.
 3. `02-setup-secrets.sh`, then `rollout restart deploy/api deploy/worker`.
 4. Watch logs for connection errors; the old password stops working the
   moment Neon applies the change, so do steps 2–3 promptly.
 ### `SECRET_KEY`  ⚠️ user-visible
 This signs auth tokens. **Rotating it logs every user out** — all existing
 tokens become invalid and every client must re-authenticate.
 1. Generate: `openssl rand -hex 32`.
 2. Write to `deploy-k3s/secrets/secret_key.txt` (must be ≥32 chars — the
   script enforces this; the app refuses to start in production without it).
 3. `02-setup-secrets.sh`, then `rollout restart deploy/api deploy/worker`.
 - Only rotate on a schedule or on suspected compromise — not casually.
 - A future improvement (overlap window via a key-id header) would let old
  tokens validate during the transition; not implemented today.
 ### `EMAIL_HOST_PASSWORD`
 1. Generate a new app password in Fastmail; keep the old one alive briefly.
 2. Write to `deploy-k3s/secrets/email_host_password.txt`.
 3. `02-setup-secrets.sh`, `rollout restart deploy/api deploy/worker`.
 4. Delete the old Fastmail app password.
 ### `FCM_SERVER_KEY`
 1. Rotate the key in the Firebase console.
 2. Write to `deploy-k3s/secrets/fcm_server_key.txt`.
 3. `02-setup-secrets.sh`, `rollout restart deploy/api deploy/worker`.
 ### `REDIS_PASSWORD`
 Source is `config.yaml` key `redis.password`. Keep it hex-only: it is embedded in
 the `REDIS_URL`, where URL-reserved characters (`@`, `:`, `/`, `#`, …) would break parsing.
 1. Generate: `openssl rand -hex 32`.
 2. Set `redis.password` in `config.yaml`.
 3. `02-setup-secrets.sh`.
 4. Restart **redis as well as** api/worker so the new `--requirepass` and
   the new `REDIS_URL` land together:
   `kubectl -n honeydue rollout restart deploy/redis deploy/api deploy/worker`.
   Expect a few seconds where api/worker reconnect.
 ### `apns_auth_key.p8`
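 1. Generate a new `.p8` key in the Apple Developer console; keep the old key active for now so pushes keep working during the rollout.
 2. Replace `deploy-k3s/secrets/apns_auth_key.p8`.
 3. `02-setup-secrets.sh`, `rollout restart deploy/api deploy/worker`.
 4. If the Key ID changed, update `push.apns_key_id` in `config.yaml` too.
 5. Once pushes succeed with the new key, revoke the old key in the Apple Developer console.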
 ### `cloudflare-origin-cert`
 1. Generate a new Origin CA certificate in the Cloudflare dashboard.
 2. Replace `deploy-k3s/secrets/cloudflare-origin.crt` and `.key`.
 3. `02-setup-secrets.sh`. Traefik picks up the new TLS secret; no app
   restart needed. Verify the served cert with `openssl s_client`.
 ### `ghcr-credentials` (Gitea registry)
 1. Generate a new PAT in Gitea (scope: `read:package`).
 2. Update the `registry.token` value in `config.yaml`.
 3. `02-setup-secrets.sh`. No restart needed unless a pull is pending.
 4. Revoke the old PAT in Gitea.
 ### `admin-basic-auth`
 Source is `config.yaml` keys `admin.basic_auth_user` / `admin.basic_auth_password`.
 1. Set a new `admin.basic_auth_password` in `config.yaml` (e.g. generate with `openssl rand -hex 24`).
 2. `02-setup-secrets.sh` regenerates the bcrypt htpasswd secret.
 3. No app restart needed — Traefik reloads the `admin-auth` middleware.
 4. Distribute the new credential to whoever uses the admin panel.
 ---
 ## After any rotation
 - Run `./deploy-k3s/scripts/04-verify.sh` and confirm no `✗` lines.
 - Annotate the rotated secret (see "Record keeping" above).
 - If the rotation was due to a compromise, also follow the relevant
  playbook in `deploy-k3s/SECURITY.md` → Appendix (Incident response).
@@ -1,6 +1,6 @@
 module github.com/treytartt/honeydue-api
-go 1.25
+go 1.25.0
 require (
 	github.com/go-pdf/fpdf v0.9.0
@@ -9,9 +9,10 @@ require (
 	github.com/google/uuid v1.6.0
 	github.com/gorilla/websocket v1.5.3
 	github.com/hibiken/asynq v0.25.1
-	github.com/labstack/echo/v4 v4.11.4
+	github.com/labstack/echo/v4 v4.15.1
 	github.com/minio/minio-go/v7 v7.0.99
 	github.com/nicksnyder/go-i18n/v2 v2.6.0
 	github.com/prometheus/client_golang v1.23.2
 	github.com/redis/go-redis/v9 v9.17.1
 	github.com/rs/zerolog v1.34.0
 	github.com/shirou/gopsutil/v3 v3.24.5
@@ -20,11 +21,17 @@ require (
 	github.com/spf13/viper v1.20.1
 	github.com/stretchr/testify v1.11.1
 	github.com/stripe/stripe-go/v81 v81.4.0
 	github.com/uptrace/opentelemetry-go-extra/otelgorm v0.3.2
 	github.com/wneessen/go-mail v0.7.2
-	golang.org/x/crypto v0.46.0
+	go.opentelemetry.io/contrib/instrumentation/github.com/labstack/echo/otelecho v0.68.0
-	golang.org/x/oauth2 v0.34.0
+	go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0
-	golang.org/x/text v0.32.0
+	go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0
-	golang.org/x/time v0.14.0
+	go.opentelemetry.io/otel/sdk v1.43.0
 	golang.org/x/crypto v0.51.0
 	golang.org/x/oauth2 v0.35.0
 	golang.org/x/term v0.43.0
 	golang.org/x/text v0.37.0
 	golang.org/x/time v0.15.0
 	google.golang.org/api v0.257.0
 	gopkg.in/yaml.v3 v3.0.1
 	gorm.io/driver/postgres v1.6.0
@@ -33,17 +40,28 @@ require (
 )
 require (
 	github.com/beorn7/perks v1.0.1 // indirect
 	github.com/cenkalti/backoff/v5 v5.0.3 // indirect
 	github.com/dustin/go-humanize v1.0.1 // indirect
 	github.com/go-ini/ini v1.67.0 // indirect
 	github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 // indirect
 	github.com/klauspost/compress v1.18.2 // indirect
 	github.com/klauspost/cpuid/v2 v2.2.11 // indirect
 	github.com/klauspost/crc32 v1.3.0 // indirect
 	github.com/minio/crc64nvme v1.1.1 // indirect
 	github.com/minio/md5-simd v1.1.2 // indirect
 	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
 	github.com/philhofer/fwd v1.2.0 // indirect
 	github.com/prometheus/client_model v0.6.2 // indirect
 	github.com/prometheus/common v0.66.1 // indirect
 	github.com/prometheus/procfs v0.16.1 // indirect
 	github.com/rs/xid v1.6.0 // indirect
 	github.com/tinylib/msgp v1.6.1 // indirect
 	github.com/uptrace/opentelemetry-go-extra/otelsql v0.3.2 // indirect
 	go.opentelemetry.io/proto/otlp v1.10.0 // indirect
 	go.yaml.in/yaml/v2 v2.4.2 // indirect
 	go.yaml.in/yaml/v3 v3.0.4 // indirect
 	google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 // indirect
 )
 require (
@@ -51,7 +69,7 @@ require (
 	cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect
 	cloud.google.com/go/compute/metadata v0.9.0 // indirect
 	github.com/cespare/xxhash/v2 v2.3.0 // indirect
-	github.com/davecgh/go-spew v1.1.1 // indirect
+	github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
 	github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect
 	github.com/felixge/httpsnoop v1.0.4 // indirect
 	github.com/fsnotify/fsnotify v1.9.0 // indirect
@@ -62,7 +80,6 @@ require (
 	github.com/go-playground/locales v0.14.1 // indirect
 	github.com/go-playground/universal-translator v0.18.1 // indirect
 	github.com/go-viper/mapstructure/v2 v2.4.0 // indirect
 	github.com/golang-jwt/jwt v3.2.2+incompatible // indirect; TODO(S-19): Pulled by echo/v4 middleware — upgrade Echo to v4.12+ which removes built-in JWT middleware (uses echo-jwt/v4 with jwt/v5 instead), eliminating this vulnerable transitive dep
 	github.com/golang-jwt/jwt/v4 v4.5.2 // indirect
 	github.com/google/s2a-go v0.1.9 // indirect
 	github.com/googleapis/enterprise-certificate-proxy v0.3.7 // indirect
@@ -76,11 +93,11 @@ require (
 	github.com/labstack/gommon v0.4.2 // indirect
 	github.com/leodido/go-urn v1.4.0 // indirect
 	github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect
-	github.com/mattn/go-colorable v0.1.13 // indirect
+	github.com/mattn/go-colorable v0.1.14 // indirect
 	github.com/mattn/go-isatty v0.0.20 // indirect
 	github.com/mattn/go-sqlite3 v2.0.3+incompatible // indirect
 	github.com/pelletier/go-toml/v2 v2.2.4 // indirect
-	github.com/pmezard/go-difflib v1.0.0 // indirect
+	github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
 	github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect
 	github.com/robfig/cron/v3 v3.0.1 // indirect
 	github.com/sagikazarmark/locafero v0.9.0 // indirect
@@ -97,13 +114,13 @@ require (
 	github.com/yusufpapurcu/wmi v1.2.4 // indirect
 	go.opentelemetry.io/auto/sdk v1.2.1 // indirect
 	go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 // indirect
-	go.opentelemetry.io/otel v1.38.0 // indirect
+	go.opentelemetry.io/otel v1.43.0
-	go.opentelemetry.io/otel/metric v1.38.0 // indirect
+	go.opentelemetry.io/otel/metric v1.43.0 // indirect
-	go.opentelemetry.io/otel/trace v1.38.0 // indirect
+	go.opentelemetry.io/otel/trace v1.43.0
-	golang.org/x/net v0.48.0 // indirect
+	golang.org/x/net v0.53.0 // indirect
-	golang.org/x/sync v0.19.0 // indirect
+	golang.org/x/sync v0.20.0
-	golang.org/x/sys v0.39.0 // indirect
+	golang.org/x/sys v0.44.0 // indirect
-	google.golang.org/genproto/googleapis/rpc v0.0.0-20251124214823-79d6a2a48846 // indirect
+	google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 // indirect
-	google.golang.org/grpc v1.77.0 // indirect
+	google.golang.org/grpc v1.80.0 // indirect
-	google.golang.org/protobuf v1.36.10 // indirect
+	google.golang.org/protobuf v1.36.11 // indirect
 )
@@ -8,16 +8,20 @@ github.com/BurntSushi/toml v1.5.0 h1:W5quZX/G/csjUnuI8SUYlsHs9M38FC7znL0lIO+DvMg
 github.com/BurntSushi/toml v1.5.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho=
 github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
 github.com/alecthomas/units v0.0.0-20201120081800-1786d5ef83d4/go.mod h1:OMCwj8VM1Kc9e19TLln2VL61YJF0x1XFtfdL4JdbSyE=
 github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
 github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
 github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs=
 github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c=
 github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA=
 github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0=
 github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM=
 github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw=
 github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
 github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
 github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
-github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
-github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78=
 github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc=
 github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
@@ -52,8 +56,6 @@ github.com/go-playground/validator/v10 v10.23.0/go.mod h1:dbuPbCMFw/DrkbEynArYaC
 github.com/go-viper/mapstructure/v2 v2.4.0 h1:EBsztssimR/CONLSZZ04E8qAkxNYq4Qp9LvH92wZUgs=
 github.com/go-viper/mapstructure/v2 v2.4.0/go.mod h1:oJDH3BJKyqBA2TXFhDsKDGDTlndYOZ6rGS0BRZIxGhM=
 github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
 github.com/golang-jwt/jwt v3.2.2+incompatible h1:IfV12K8xAKAnZqdXVzCZ+TOjboZ2keLg81eXfW3O+oY=
 github.com/golang-jwt/jwt v3.2.2+incompatible/go.mod h1:8pz2t5EyA70fFQQSrl6XZXzqecmYZeUEB8OUGHkxJ+I=
 github.com/golang-jwt/jwt/v4 v4.4.1/go.mod h1:m21LjoU+eqJr34lmDMbreY2eSTRJ1cv77w39/MY0Ch0=
 github.com/golang-jwt/jwt/v4 v4.5.2 h1:YtQM7lnr8iZ+j5q71MGKkNw9Mn7AjHM68uc9g5fXeUI=
 github.com/golang-jwt/jwt/v4 v4.5.2/go.mod h1:m21LjoU+eqJr34lmDMbreY2eSTRJ1cv77w39/MY0Ch0=
@@ -74,6 +76,8 @@ github.com/googleapis/gax-go/v2 v2.15.0 h1:SyjDc1mGgZU5LncH8gimWo9lW1DtIfPibOG81
 github.com/googleapis/gax-go/v2 v2.15.0/go.mod h1:zVVkkxAQHa1RQpg9z2AUCMnKhi0Qld9rcmyfL1OZhoc=
 github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg=
 github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
 github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 h1:HWRh5R2+9EifMyIHV7ZV+MIZqgz+PMpZ14Jynv3O2Zs=
 github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0/go.mod h1:JfhWUomR1baixubs02l85lZYYOm7LV6om4ceouMv45c=
 github.com/hibiken/asynq v0.25.1 h1:phj028N0nm15n8O2ims+IvJ2gz4k2auvermngh9JhTw=
 github.com/hibiken/asynq v0.25.1/go.mod h1:pazWNOLBu0FEynQRBvHA26qdIKRSmfdIfUm4HdsLmXg=
 github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM=
@@ -99,16 +103,19 @@ github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
 github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
 github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
 github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
-github.com/labstack/echo/v4 v4.11.4 h1:vDZmA+qNeh1pd/cCkEicDMrjtrnMGQ1QFI9gWN1zGq8=
+github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
-github.com/labstack/echo/v4 v4.11.4/go.mod h1:noh7EvLwqDsmh/X/HWKPUl1AjzJrhyptRyEbQJfxen8=
+github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
 github.com/labstack/echo/v4 v4.15.1 h1:S9keusg26gZpjMmPqB5hOEvNKnmd1lNmcHrbbH2lnFs=
 github.com/labstack/echo/v4 v4.15.1/go.mod h1:xmw1clThob0BSVRX1CRQkGQ/vjwcpOMjQZSZa9fKA/c=
 github.com/labstack/gommon v0.4.2 h1:F8qTUNXgG1+6WQmqoUWnz8WiEU60mXVVw0P4ht1WRA0=
 github.com/labstack/gommon v0.4.2/go.mod h1:QlUFxVM+SNXhDL/Z7YhocGIBYOiwB0mXm1+1bAPHPyU=
 github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ=
 github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI=
 github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4=
 github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I=
 github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
 github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
 github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE=
 github.com/mattn/go-colorable v0.1.14/go.mod h1:6LmQG8QLFO4G5z1gPvYEzlUgJ2wF+stgPZH1UqBm1s8=
 github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
 github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
 github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
@@ -121,6 +128,8 @@ github.com/minio/md5-simd v1.1.2 h1:Gdi1DZK69+ZVMoNHRXJyNcxrMA4dSxoYHZSQbirFg34=
 github.com/minio/md5-simd v1.1.2/go.mod h1:MzdKDxYpY2BT9XQFocsiZf/NKVtR7nkE4RoEpN+20RM=
 github.com/minio/minio-go/v7 v7.0.99 h1:2vH/byrwUkIpFQFOilvTfaUpvAX3fEFhEzO+DR3DlCE=
 github.com/minio/minio-go/v7 v7.0.99/go.mod h1:EtGNKtlX20iL2yaYnxEigaIvj0G0GwSDnifnG8ClIdw=
 github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
 github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
 github.com/nicksnyder/go-i18n/v2 v2.6.0 h1:C/m2NNWNiTB6SK4Ao8df5EWm3JETSTIGNXBpMJTxzxQ=
 github.com/nicksnyder/go-i18n/v2 v2.6.0/go.mod h1:88sRqr0C6OPyJn0/KRNaEz1uWorjxIKP7rUUcvycecE=
 github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4=
@@ -128,10 +137,19 @@ github.com/pelletier/go-toml/v2 v2.2.4/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8
 github.com/philhofer/fwd v1.2.0 h1:e6DnBTl7vGY+Gz322/ASL4Gyp1FspeMvx1RNDoToZuM=
 github.com/philhofer/fwd v1.2.0/go.mod h1:RqIHx9QI14HlwKwm98g9Re5prTQ6LdeRQn+gXJFxsJM=
 github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
 github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c h1:ncq/mPwQF4JjgDlrVEn3C11VoGHZN7m8qihwgMEtzYw=
 github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE=
 github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o=
 github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg=
 github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
 github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
 github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs=
 github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA=
 github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg=
 github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is=
 github.com/redis/go-redis/v9 v9.17.1 h1:7tl732FjYPRT9H9aNfyTwKg9iTETjWjGKEJ2t/5iWTs=
 github.com/redis/go-redis/v9 v9.17.1/go.mod h1:u410H11HMLoB+TP67dz8rL9s6QW2j76l0//kSOd3370=
 github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs=
@@ -180,6 +198,10 @@ github.com/tklauser/go-sysconf v0.3.12 h1:0QaGUFOdQaIVdPgfITYzaTegZvdCjmYO52cSFA
 github.com/tklauser/go-sysconf v0.3.12/go.mod h1:Ho14jnntGE1fpdOqQEEaiKRpvIavV0hSfmBq8nJbHYI=
 github.com/tklauser/numcpus v0.6.1 h1:ng9scYS7az0Bk4OZLvrNXNSAO2Pxr1XXRAPyjhIx+Fk=
 github.com/tklauser/numcpus v0.6.1/go.mod h1:1XfjsgE2zo8GVw7POkMbHENHzVg3GzmoZ9fESEdAacY=
 github.com/uptrace/opentelemetry-go-extra/otelgorm v0.3.2 h1:Jjn3zoRz13f8b1bR6LrXWglx93Sbh4kYfwgmPju3E2k=
 github.com/uptrace/opentelemetry-go-extra/otelgorm v0.3.2/go.mod h1:wocb5pNrj/sjhWB9J5jctnC0K2eisSdz/nJJBNFHo+A=
 github.com/uptrace/opentelemetry-go-extra/otelsql v0.3.2 h1:ZjUj9BLYf9PEqBn8W/OapxhPjVRdC6CsXTdULHsyk5c=
 github.com/uptrace/opentelemetry-go-extra/otelsql v0.3.2/go.mod h1:O8bHQfyinKwTXKkiKNGmLQS7vRsqRxIQTFZpYpHK3IQ=
 github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
 github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
 github.com/valyala/fasttemplate v1.2.2 h1:lxLXG0uE3Qnshl9QyaK6XJxMXlQZELvChBOCmQD0Loo=
@@ -190,33 +212,45 @@ github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo
 github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0=
 go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64=
 go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y=
 go.opentelemetry.io/contrib/instrumentation/github.com/labstack/echo/otelecho v0.68.0 h1:7N94HrYgVc2tng6xEjmbycupxteYLll7lPlEi/UK5ok=
 go.opentelemetry.io/contrib/instrumentation/github.com/labstack/echo/otelecho v0.68.0/go.mod h1:1i+7wBOfx0kn7PSGRKZ8e7zIhs+AmvLCiCloySDUeck=
 go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 h1:F7Jx+6hwnZ41NSFTO5q4LYDtJRXBf2PD0rNBkeB/lus=
 go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0/go.mod h1:UHB22Z8QsdRDrnAtX4PntOl36ajSxcdUMt1sF7Y6E7Q=
-go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8=
+go.opentelemetry.io/contrib/propagators/b3 v1.43.0 h1:CETqV3QLLPTy5yNrqyMr41VnAOOD4lsRved7n4QG00A=
-go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM=
+go.opentelemetry.io/contrib/propagators/b3 v1.43.0/go.mod h1:Q4mCiCdziYzpNR0g+6UqVotAlCDZdzz6L8jwY4knOrw=
-go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA=
+go.opentelemetry.io/otel v1.43.0 h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I=
-go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI=
+go.opentelemetry.io/otel v1.43.0/go.mod h1:JuG+u74mvjvcm8vj8pI5XiHy1zDeoCS2LB1spIq7Ay0=
-go.opentelemetry.io/otel/sdk v1.38.0 h1:l48sr5YbNf2hpCUj/FoGhW9yDkl+Ma+LrVl8qaM5b+E=
+go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 h1:88Y4s2C8oTui1LGM6bTWkw0ICGcOLCAI5l6zsD1j20k=
-go.opentelemetry.io/otel/sdk v1.38.0/go.mod h1:ghmNdGlVemJI3+ZB5iDEuk4bWA3GkTpW+DOoZMYBVVg=
+go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0/go.mod h1:Vl1/iaggsuRlrHf/hfPJPvVag77kKyvrLeD10kpMl+A=
-go.opentelemetry.io/otel/sdk/metric v1.38.0 h1:aSH66iL0aZqo//xXzQLYozmWrXxyFkBJ6qT5wthqPoM=
+go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0 h1:3iZJKlCZufyRzPzlQhUIWVmfltrXuGyfjREgGP3UUjc=
-go.opentelemetry.io/otel/sdk/metric v1.38.0/go.mod h1:dg9PBnW9XdQ1Hd6ZnRz689CbtrUp0wMMs9iPcgT9EZA=
+go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0/go.mod h1:/G+nUPfhq2e+qiXMGxMwumDrP5jtzU+mWN7/sjT2rak=
-go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE=
+go.opentelemetry.io/otel/metric v1.43.0 h1:d7638QeInOnuwOONPp4JAOGfbCEpYb+K6DVWvdxGzgM=
-go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs=
+go.opentelemetry.io/otel/metric v1.43.0/go.mod h1:RDnPtIxvqlgO8GRW18W6Z/4P462ldprJtfxHxyKd2PY=
 go.opentelemetry.io/otel/sdk v1.43.0 h1:pi5mE86i5rTeLXqoF/hhiBtUNcrAGHLKQdhg4h4V9Dg=
 go.opentelemetry.io/otel/sdk v1.43.0/go.mod h1:P+IkVU3iWukmiit/Yf9AWvpyRDlUeBaRg6Y+C58QHzg=
 go.opentelemetry.io/otel/sdk/metric v1.43.0 h1:S88dyqXjJkuBNLeMcVPRFXpRw2fuwdvfCGLEo89fDkw=
 go.opentelemetry.io/otel/sdk/metric v1.43.0/go.mod h1:C/RJtwSEJ5hzTiUz5pXF1kILHStzb9zFlIEe85bhj6A=
 go.opentelemetry.io/otel/trace v1.43.0 h1:BkNrHpup+4k4w+ZZ86CZoHHEkohws8AY+WTX09nk+3A=
 go.opentelemetry.io/otel/trace v1.43.0/go.mod h1:/QJhyVBUUswCphDVxq+8mld+AvhXZLhe+8WVFxiFff0=
 go.opentelemetry.io/proto/otlp v1.10.0 h1:IQRWgT5srOCYfiWnpqUYz9CVmbO8bFmKcwYxpuCSL2g=
 go.opentelemetry.io/proto/otlp v1.10.0/go.mod h1:/CV4QoCR/S9yaPj8utp3lvQPoqMtxXdzn7ozvvozVqk=
 go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
 go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
 go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI=
 go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU=
 go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=
 go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg=
 golang.org/x/crypto v0.0.0-20170512130425-ab89591268e0/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
-golang.org/x/crypto v0.46.0 h1:cKRW/pmt1pKAfetfu+RCEvjvZkA9RimPbh7bhFjGVBU=
+golang.org/x/crypto v0.51.0 h1:IBPXwPfKxY7cWQZ38ZCIRPI50YLeevDLlLnyC5wRGTI=
-golang.org/x/crypto v0.46.0/go.mod h1:Evb/oLKmMraqjZ2iQTwDwvCtJkczlDuTmdJXoZVzqU0=
+golang.org/x/crypto v0.51.0/go.mod h1:8AdwkbraGNABw2kOX6YFPs3WM22XqI4EXEd8g+x7Oc8=
 golang.org/x/net v0.0.0-20210520170846-37e1c6afe023/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
 golang.org/x/net v0.0.0-20220403103023-749bd193bc2b/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk=
-golang.org/x/net v0.48.0 h1:zyQRTTrjc33Lhh0fBgT/H3oZq9WuvRR5gPC70xpDiQU=
+golang.org/x/net v0.53.0 h1:d+qAbo5L0orcWAr0a9JweQpjXF19LMXJE8Ey7hwOdUA=
-golang.org/x/net v0.48.0/go.mod h1:+ndRgGjkh8FGtu1w1FGbEC31if4VrNVMuKTgcAAnQRY=
+golang.org/x/net v0.53.0/go.mod h1:JvMuJH7rrdiCfbeHoo3fCQU24Lf5JJwT9W3sJFulfgs=
-golang.org/x/oauth2 v0.34.0 h1:hqK/t4AKgbqWkdkcAeI8XLmbK+4m4G5YeQRrmiotGlw=
+golang.org/x/oauth2 v0.35.0 h1:Mv2mzuHuZuY2+bkyWXIHMfhNdJAdwW3FuWeCPYN5GVQ=
-golang.org/x/oauth2 v0.34.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
+golang.org/x/oauth2 v0.35.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
-golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
+golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
-golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
+golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
 golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
@@ -228,32 +262,34 @@ golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.39.0 h1:CvCKL8MeisomCi6qNZ+wbb0DN9E5AATixKsvNtMoMFk=
+golang.org/x/sys v0.44.0 h1:ildZl3J4uzeKP07r2F++Op7E9B29JRUy+a27EibtBTQ=
-golang.org/x/sys v0.39.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
+golang.org/x/sys v0.44.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
 golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
 golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
 golang.org/x/term v0.43.0 h1:S4RLU2sB31O/NCl+zFN9Aru9A/Cq2aqKpTZJ6B+DwT4=
 golang.org/x/term v0.43.0/go.mod h1:lrhlHNdQJHO+1qVYiHfFKVuVioJIheAc3fBSMFYEIsk=
 golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
-golang.org/x/text v0.32.0 h1:ZD01bjUt1FQ9WJ0ClOL5vxgxOI/sVCNgX1YtKwcY0mU=
+golang.org/x/text v0.37.0 h1:Cqjiwd9eSg8e0QAkyCaQTNHFIIzWtidPahFWR83rTrc=
-golang.org/x/text v0.32.0/go.mod h1:o/rUWzghvpD5TXrTIBuJU77MTaN0ljMWE47kxGJQ7jY=
+golang.org/x/text v0.37.0/go.mod h1:a5sjxXGs9hsn/AJVwuElvCAo9v8QYLzvavO5z2PiM38=
-golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI=
+golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U=
-golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4=
+golang.org/x/time v0.15.0/go.mod h1:Y4YMaQmXwGQZoFaVFk4YpCt4FLQMYKZe9oeV/f4MSno=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
-gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk=
+gonum.org/v1/gonum v0.17.0 h1:VbpOemQlsSMrYmn7T2OUvQ4dqxQXU+ouZFQsZOx50z4=
-gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E=
+gonum.org/v1/gonum v0.17.0/go.mod h1:El3tOrEuMpv2UdMrbNlKEh9vd86bmQ6vqIcDwxEOc1E=
 google.golang.org/api v0.257.0 h1:8Y0lzvHlZps53PEaw+G29SsQIkuKrumGWs9puiexNAA=
 google.golang.org/api v0.257.0/go.mod h1:4eJrr+vbVaZSqs7vovFd1Jb/A6ml6iw2e6FBYf3GAO4=
 google.golang.org/genproto v0.0.0-20250603155806-513f23925822 h1:rHWScKit0gvAPuOnu87KpaYtjK5zBMLcULh7gxkCXu4=
 google.golang.org/genproto v0.0.0-20250603155806-513f23925822/go.mod h1:HubltRL7rMh0LfnQPkMH4NPDFEWp0jw3vixw7jEM53s=
-google.golang.org/genproto/googleapis/api v0.0.0-20251022142026-3a174f9686a8 h1:mepRgnBZa07I4TRuomDE4sTIYieg/osKmzIf4USdWS4=
+google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 h1:VPWxll4HlMw1Vs/qXtN7BvhZqsS9cdAittCNvVENElA=
-google.golang.org/genproto/googleapis/api v0.0.0-20251022142026-3a174f9686a8/go.mod h1:fDMmzKV90WSg1NbozdqrE64fkuTv6mlq2zxo9ad+3yo=
+google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:7QBABkRtR8z+TEnmXTqIqwJLlzrZKVfAUm7tY3yGv0M=
-google.golang.org/genproto/googleapis/rpc v0.0.0-20251124214823-79d6a2a48846 h1:Wgl1rcDNThT+Zn47YyCXOXyX/COgMTIdhJ717F0l4xk=
+google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 h1:m8qni9SQFH0tJc1X0vmnpw/0t+AImlSvp30sEupozUg=
-google.golang.org/genproto/googleapis/rpc v0.0.0-20251124214823-79d6a2a48846/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk=
+google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8=
-google.golang.org/grpc v1.77.0 h1:wVVY6/8cGA6vvffn+wWK5ToddbgdU3d8MNENr4evgXM=
+google.golang.org/grpc v1.80.0 h1:Xr6m2WmWZLETvUNvIUmeD5OAagMw3FiKmMlTdViWsHM=
-google.golang.org/grpc v1.77.0/go.mod h1:z0BY1iVj0q8E1uSQCjL9cppRj+gnZjzDnzV0dHhrNig=
+google.golang.org/grpc v1.80.0/go.mod h1:ho/dLnxwi3EDJA4Zghp7k2Ec1+c2jqup0bFkw07bwF4=
-google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE=
+google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
-google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
+google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
 gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
@@ -1,215 +1,30 @@
 // apple_social_auth_handler is a stub — the user_applesocialauth table was
 // dropped in the Ory Kratos migration (phase 2). Social sign-in is now
 // handled by Kratos.
 package handlers
 import (
 	"net/http"
 	"strconv"
 	"github.com/labstack/echo/v4"
 	"gorm.io/gorm"
 	"github.com/treytartt/honeydue-api/internal/admin/dto"
 	"github.com/treytartt/honeydue-api/internal/models"
 )
-// AdminAppleSocialAuthHandler handles admin Apple social auth management endpoints
+// AdminAppleSocialAuthHandler is a no-op stub.
 type AdminAppleSocialAuthHandler struct {
 	db *gorm.DB // database handle passed to NewAdminAppleSocialAuthHandler
 }
 // NewAdminAppleSocialAuthHandler builds an AdminAppleSocialAuthHandler that
 // keeps the given database handle.
 func NewAdminAppleSocialAuthHandler(db *gorm.DB) *AdminAppleSocialAuthHandler {
 	handler := new(AdminAppleSocialAuthHandler)
 	handler.db = db
 	return handler
 }
-// AppleSocialAuthResponse represents the response for an Apple social auth entry
+func (h *AdminAppleSocialAuthHandler) gone(c echo.Context) error {
-type AppleSocialAuthResponse struct {
+	return c.JSON(http.StatusGone, map[string]string{"message": "Apple social auth is managed by Ory Kratos"})
 	ID             uint   `json:"id"`
 	UserID         uint   `json:"user_id"`
 	Username       string `json:"username"`
 	UserEmail      string `json:"user_email"`
 	AppleID        string `json:"apple_id"`
 	Email          string `json:"email"`
 	IsPrivateEmail bool   `json:"is_private_email"`
 	CreatedAt      string `json:"created_at"`
 	UpdatedAt      string `json:"updated_at"`
 }
 // UpdateAppleSocialAuthRequest is the JSON body bound by Update. Both fields
 // are pointers: Update only overwrites the stored value when the
 // corresponding pointer is non-nil.
 type UpdateAppleSocialAuthRequest struct {
 	Email          *string `json:"email"`            // new email; nil leaves the stored value untouched
 	IsPrivateEmail *bool   `json:"is_private_email"` // new is_private_email value; nil leaves it untouched
 }
 // List handles GET /api/admin/apple-social-auth.
 //
 // It binds the pagination, search and sort parameters, optionally filters by
 // Apple ID, email or owning user, and responds with a paginated list.
 // Responds 400 when binding fails and 500 when the count or the fetch query
 // fails.
 func (h *AdminAppleSocialAuthHandler) List(c echo.Context) error {
 	var filters dto.PaginationParams
 	if err := c.Bind(&filters); err != nil {
 		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid request body"})
 	}
 	var entries []models.AppleSocialAuth
 	var total int64
 	query := h.db.Model(&models.AppleSocialAuth{}).Preload("User")
 	// Apply search
 	if filters.Search != "" {
 		search := "%" + filters.Search + "%"
 		query = query.Joins("JOIN auth_user ON auth_user.id = user_applesocialauth.user_id").
 			Where("user_applesocialauth.apple_id ILIKE ? OR user_applesocialauth.email ILIKE ? OR auth_user.username ILIKE ? OR auth_user.email ILIKE ?",
 				search, search, search, search)
 	}
 	// Get total count; check its error here instead of relying on a later query to surface it.
 	if err := query.Count(&total).Error; err != nil {
 		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch Apple social auth entries"})
 	}
 	// Apply sorting (allowlist prevents SQL injection via sort_by parameter)
 	// NOTE(review): when the search JOIN is active, unqualified "id"/"email"
 	// look ambiguous between auth_user and user_applesocialauth — confirm.
 	sortBy := filters.GetSafeSortBy([]string{
 		"id", "user_id", "apple_id", "email", "is_private_email",
 		"created_at", "updated_at",
 	}, "created_at")
 	query = query.Order(sortBy + " " + filters.GetSortDir())
 	// Apply pagination
 	query = query.Offset(filters.GetOffset()).Limit(filters.GetPerPage())
 	if err := query.Find(&entries).Error; err != nil {
 		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch Apple social auth entries"})
 	}
 	// Build response; index the slice so no per-iteration copy is taken by address.
 	responses := make([]AppleSocialAuthResponse, len(entries))
 	for i := range entries {
 		responses[i] = h.toResponse(&entries[i])
 	}
 	return c.JSON(http.StatusOK, dto.NewPaginatedResponse(responses, total, filters.GetPage(), filters.GetPerPage()))
 }
 // Get handles GET /api/admin/apple-social-auth/:id.
 // It answers 400 when the id does not parse as an unsigned 32-bit integer,
 // 404 when no row matches, 500 on any other lookup error, and 200 with the
 // serialized entry otherwise.
 func (h *AdminAppleSocialAuthHandler) Get(c echo.Context) error {
 	id, parseErr := strconv.ParseUint(c.Param("id"), 10, 32)
 	if parseErr != nil {
 		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid ID"})
 	}
 	var entry models.AppleSocialAuth
 	switch lookupErr := h.db.Preload("User").First(&entry, id).Error; lookupErr {
 	case nil:
 		return c.JSON(http.StatusOK, h.toResponse(&entry))
 	case gorm.ErrRecordNotFound:
 		return c.JSON(http.StatusNotFound, map[string]interface{}{"error": "Apple social auth entry not found"})
 	default:
 		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch Apple social auth entry"})
 	}
 }
 // GetByUser handles GET /api/admin/apple-social-auth/user/:user_id.
 // It looks the entry up by its user_id column and answers 400 for an
 // unparsable user_id, 404 when the user has no entry, 500 on other errors.
 func (h *AdminAppleSocialAuthHandler) GetByUser(c echo.Context) error {
 	userID, parseErr := strconv.ParseUint(c.Param("user_id"), 10, 32)
 	if parseErr != nil {
 		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid user ID"})
 	}
 	var entry models.AppleSocialAuth
 	lookup := h.db.Preload("User").Where("user_id = ?", userID).First(&entry)
 	switch lookup.Error {
 	case nil:
 		return c.JSON(http.StatusOK, h.toResponse(&entry))
 	case gorm.ErrRecordNotFound:
 		return c.JSON(http.StatusNotFound, map[string]interface{}{"error": "Apple social auth entry not found for user"})
 	default:
 		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch Apple social auth entry"})
 	}
 }
 // Update handles PUT /api/admin/apple-social-auth/:id.
 // It loads the entry, applies whichever request fields are non-nil, saves
 // it, then reloads it with its User for the response body.
 func (h *AdminAppleSocialAuthHandler) Update(c echo.Context) error {
 	id, parseErr := strconv.ParseUint(c.Param("id"), 10, 32)
 	if parseErr != nil {
 		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid ID"})
 	}
 	var entry models.AppleSocialAuth
 	switch loadErr := h.db.First(&entry, id).Error; loadErr {
 	case nil:
 		// found; continue below
 	case gorm.ErrRecordNotFound:
 		return c.JSON(http.StatusNotFound, map[string]interface{}{"error": "Apple social auth entry not found"})
 	default:
 		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch Apple social auth entry"})
 	}
 	var req UpdateAppleSocialAuthRequest
 	if bindErr := c.Bind(&req); bindErr != nil {
 		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid request body"})
 	}
 	// Only fields present in the request overwrite the stored values.
 	if email := req.Email; email != nil {
 		entry.Email = *email
 	}
 	if private := req.IsPrivateEmail; private != nil {
 		entry.IsPrivateEmail = *private
 	}
 	if saveErr := h.db.Save(&entry).Error; saveErr != nil {
 		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to update Apple social auth entry"})
 	}
 	// Reload with the User association for the response; the result of this
 	// query is not checked.
 	h.db.Preload("User").First(&entry, id)
 	return c.JSON(http.StatusOK, h.toResponse(&entry))
 }
 // Delete handles DELETE /api/admin/apple-social-auth/:id.
 // The entry is fetched first so a missing row yields 404 before the delete
 // is attempted.
 func (h *AdminAppleSocialAuthHandler) Delete(c echo.Context) error {
 	id, parseErr := strconv.ParseUint(c.Param("id"), 10, 32)
 	if parseErr != nil {
 		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid ID"})
 	}
 	var entry models.AppleSocialAuth
 	switch loadErr := h.db.First(&entry, id).Error; loadErr {
 	case nil:
 		// found; continue below
 	case gorm.ErrRecordNotFound:
 		return c.JSON(http.StatusNotFound, map[string]interface{}{"error": "Apple social auth entry not found"})
 	default:
 		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch Apple social auth entry"})
 	}
 	if deleteErr := h.db.Delete(&entry).Error; deleteErr != nil {
 		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to delete Apple social auth entry"})
 	}
 	return c.JSON(http.StatusOK, map[string]interface{}{"message": "Apple social auth entry deleted successfully"})
 }
 // BulkDelete handles DELETE /api/admin/apple-social-auth/bulk.
 // It deletes every entry whose id is listed in the request and reports the
 // number of rows affected.
 func (h *AdminAppleSocialAuthHandler) BulkDelete(c echo.Context) error {
 	var req dto.BulkDeleteRequest
 	if bindErr := c.Bind(&req); bindErr != nil {
 		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid request body"})
 	}
 	deleted := h.db.Where("id IN ?", req.IDs).Delete(&models.AppleSocialAuth{})
 	if deleted.Error == nil {
 		return c.JSON(http.StatusOK, map[string]interface{}{"message": "Apple social auth entries deleted successfully", "count": deleted.RowsAffected})
 	}
 	return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to delete Apple social auth entries"})
 }
 // toResponse converts an AppleSocialAuth model to AppleSocialAuthResponse.
 //
 // Timestamps are converted to UTC before formatting because the layout ends
 // in a literal "Z"; formatting a non-UTC time with it would mislabel the
 // zone. Username and UserEmail are filled only when the User association is
 // loaded (non-zero User.ID).
 func (h *AdminAppleSocialAuthHandler) toResponse(entry *models.AppleSocialAuth) AppleSocialAuthResponse {
 	const timeLayout = "2006-01-02T15:04:05Z"
 	response := AppleSocialAuthResponse{
 		ID:             entry.ID,
 		UserID:         entry.UserID,
 		AppleID:        entry.AppleID,
 		Email:          entry.Email,
 		IsPrivateEmail: entry.IsPrivateEmail,
 		CreatedAt:      entry.CreatedAt.UTC().Format(timeLayout),
 		UpdatedAt:      entry.UpdatedAt.UTC().Format(timeLayout),
 	}
 	if entry.User.ID != 0 {
 		response.Username = entry.User.Username
 		response.UserEmail = entry.User.Email
 	}
 	return response
 }
 // List delegates to gone.
 func (h *AdminAppleSocialAuthHandler) List(c echo.Context) error {
 	return h.gone(c)
 }
 // Get delegates to gone.
 func (h *AdminAppleSocialAuthHandler) Get(c echo.Context) error {
 	return h.gone(c)
 }
 // Delete delegates to gone.
 func (h *AdminAppleSocialAuthHandler) Delete(c echo.Context) error {
 	return h.gone(c)
 }
 // BulkDelete delegates to gone.
 func (h *AdminAppleSocialAuthHandler) BulkDelete(c echo.Context) error {
 	return h.gone(c)
 }
 // Update delegates to gone.
 func (h *AdminAppleSocialAuthHandler) Update(c echo.Context) error {
 	return h.gone(c)
 }
 // GetByUser delegates to gone.
 func (h *AdminAppleSocialAuthHandler) GetByUser(c echo.Context) error {
 	return h.gone(c)
 }
@@ -1,144 +1,27 @@
 // auth_token_handler is a stub — the user_authtoken table was dropped in the
 // Ory Kratos migration (phase 2). Auth tokens are now Kratos sessions.
 package handlers
 import (
 	"net/http"
 	"strconv"
 	"github.com/labstack/echo/v4"
 	"gorm.io/gorm"
 	"github.com/treytartt/honeydue-api/internal/admin/dto"
 	"github.com/treytartt/honeydue-api/internal/models"
 )
-// AdminAuthTokenHandler handles admin auth token management endpoints
+// AdminAuthTokenHandler is a no-op stub.
 type AdminAuthTokenHandler struct {
 	db *gorm.DB // database handle passed to NewAdminAuthTokenHandler
 }
 // NewAdminAuthTokenHandler builds an AdminAuthTokenHandler that keeps the
 // given database handle.
 func NewAdminAuthTokenHandler(db *gorm.DB) *AdminAuthTokenHandler {
 	handler := new(AdminAuthTokenHandler)
 	handler.db = db
 	return handler
 }
-// AuthTokenResponse represents an auth token in API responses
+func (h *AdminAuthTokenHandler) gone(c echo.Context) error {
-type AuthTokenResponse struct {
+	return c.JSON(http.StatusGone, map[string]string{"message": "auth tokens are managed by Ory Kratos"})
 	Key      string `json:"key"`
 	UserID   uint   `json:"user_id"`
 	Username string `json:"username"`
 	Email    string `json:"email"`
 	Created  string `json:"created"`
 }
 // List handles GET /api/admin/auth-tokens.
 //
 // It binds the pagination, search and sort parameters, optionally filters by
 // username, user email or token key, and responds with a paginated list.
 // Responds 400 when binding fails and 500 when the count or the fetch query
 // fails. Created is rendered in UTC to match the literal "Z" in the layout.
 func (h *AdminAuthTokenHandler) List(c echo.Context) error {
 	const timeLayout = "2006-01-02T15:04:05Z"
 	var filters dto.PaginationParams
 	if err := c.Bind(&filters); err != nil {
 		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid request body"})
 	}
 	var tokens []models.AuthToken
 	var total int64
 	query := h.db.Model(&models.AuthToken{}).Preload("User")
 	// Apply search (search by user info)
 	if filters.Search != "" {
 		search := "%" + filters.Search + "%"
 		query = query.Joins("JOIN auth_user ON auth_user.id = user_authtoken.user_id").
 			Where(
 				"auth_user.username ILIKE ? OR auth_user.email ILIKE ? OR user_authtoken.key ILIKE ?",
 				search, search, search,
 			)
 	}
 	// Get total count; check its error here instead of relying on a later query to surface it.
 	if err := query.Count(&total).Error; err != nil {
 		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch auth tokens"})
 	}
 	// Apply sorting (allowlist prevents SQL injection via sort_by parameter)
 	sortBy := filters.GetSafeSortBy([]string{
 		"created", "user_id",
 	}, "created")
 	query = query.Order(sortBy + " " + filters.GetSortDir())
 	// Apply pagination
 	query = query.Offset(filters.GetOffset()).Limit(filters.GetPerPage())
 	if err := query.Find(&tokens).Error; err != nil {
 		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch auth tokens"})
 	}
 	// Build response; index the slice to avoid copying each element.
 	responses := make([]AuthTokenResponse, len(tokens))
 	for i := range tokens {
 		token := &tokens[i]
 		responses[i] = AuthTokenResponse{
 			Key:      token.Key,
 			UserID:   token.UserID,
 			Username: token.User.Username,
 			Email:    token.User.Email,
 			Created:  token.Created.UTC().Format(timeLayout),
 		}
 	}
 	return c.JSON(http.StatusOK, dto.NewPaginatedResponse(responses, total, filters.GetPage(), filters.GetPerPage()))
 }
 // Get handles GET /api/admin/auth-tokens/:id (id is actually user_id).
 //
 // The token is looked up by its user_id column. Responds 400 for an
 // unparsable id, 404 when the user has no token, 500 on other lookup
 // errors. Created is rendered in UTC to match the literal "Z" in the layout.
 func (h *AdminAuthTokenHandler) Get(c echo.Context) error {
 	id, err := strconv.ParseUint(c.Param("id"), 10, 32)
 	if err != nil {
 		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid user ID"})
 	}
 	var token models.AuthToken
 	if err := h.db.Preload("User").Where("user_id = ?", id).First(&token).Error; err != nil {
 		if err == gorm.ErrRecordNotFound {
 			return c.JSON(http.StatusNotFound, map[string]interface{}{"error": "Auth token not found"})
 		}
 		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch auth token"})
 	}
 	response := AuthTokenResponse{
 		Key:      token.Key,
 		UserID:   token.UserID,
 		Username: token.User.Username,
 		Email:    token.User.Email,
 		Created:  token.Created.UTC().Format("2006-01-02T15:04:05Z"),
 	}
 	return c.JSON(http.StatusOK, response)
 }
 // Delete handles DELETE /api/admin/auth-tokens/:id (revoke token).
 // Rows are matched on user_id; zero affected rows is reported as 404.
 func (h *AdminAuthTokenHandler) Delete(c echo.Context) error {
 	id, parseErr := strconv.ParseUint(c.Param("id"), 10, 32)
 	if parseErr != nil {
 		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid user ID"})
 	}
 	revoked := h.db.Where("user_id = ?", id).Delete(&models.AuthToken{})
 	switch {
 	case revoked.Error != nil:
 		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to revoke token"})
 	case revoked.RowsAffected == 0:
 		return c.JSON(http.StatusNotFound, map[string]interface{}{"error": "Auth token not found"})
 	default:
 		return c.JSON(http.StatusOK, map[string]interface{}{"message": "Auth token revoked successfully"})
 	}
 }
 // BulkDelete handles DELETE /api/admin/auth-tokens/bulk.
 // The request IDs are matched against user_id; the response carries the
 // number of rows affected.
 func (h *AdminAuthTokenHandler) BulkDelete(c echo.Context) error {
 	var req dto.BulkDeleteRequest
 	if bindErr := c.Bind(&req); bindErr != nil {
 		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid request body"})
 	}
 	revoked := h.db.Where("user_id IN ?", req.IDs).Delete(&models.AuthToken{})
 	if revoked.Error == nil {
 		return c.JSON(http.StatusOK, map[string]interface{}{"message": "Auth tokens revoked successfully", "count": revoked.RowsAffected})
 	}
 	return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to revoke tokens"})
 }
 // List delegates to gone.
 func (h *AdminAuthTokenHandler) List(c echo.Context) error {
 	return h.gone(c)
 }
 // Get delegates to gone.
 func (h *AdminAuthTokenHandler) Get(c echo.Context) error {
 	return h.gone(c)
 }
 // Delete delegates to gone.
 func (h *AdminAuthTokenHandler) Delete(c echo.Context) error {
 	return h.gone(c)
 }
 // BulkDelete delegates to gone.
 func (h *AdminAuthTokenHandler) BulkDelete(c echo.Context) error {
 	return h.gone(c)
 }
@@ -1,162 +1,28 @@
 // confirmation_code_handler is a stub — the user_confirmationcode table was
 // dropped in the Ory Kratos migration (phase 2). Email verification is now
 // handled by Kratos.
 package handlers
 import (
 	"net/http"
 	"strconv"
 	"strings"
 	"github.com/labstack/echo/v4"
 	"gorm.io/gorm"
 	"github.com/treytartt/honeydue-api/internal/admin/dto"
 	"github.com/treytartt/honeydue-api/internal/models"
 )
-// maskCode masks a confirmation code, showing only the last 4 characters.
+// AdminConfirmationCodeHandler is a no-op stub.
 // maskCode masks a confirmation code, showing only the last 4 characters.
 // Codes of 4 characters or fewer are masked entirely. Length and slicing
 // are rune-based so a multi-byte character is never split and counts as a
 // single masked position.
 func maskCode(code string) string {
 	const visible = 4
 	runes := []rune(code)
 	if len(runes) <= visible {
 		return strings.Repeat("*", len(runes))
 	}
 	hidden := len(runes) - visible
 	return strings.Repeat("*", hidden) + string(runes[hidden:])
 }
 // AdminConfirmationCodeHandler handles admin confirmation code management endpoints
 type AdminConfirmationCodeHandler struct {
 	db *gorm.DB // database handle passed to NewAdminConfirmationCodeHandler
 }
 // NewAdminConfirmationCodeHandler builds an AdminConfirmationCodeHandler
 // that keeps the given database handle.
 func NewAdminConfirmationCodeHandler(db *gorm.DB) *AdminConfirmationCodeHandler {
 	handler := new(AdminConfirmationCodeHandler)
 	handler.db = db
 	return handler
 }
-// ConfirmationCodeResponse represents a confirmation code in API responses
+func (h *AdminConfirmationCodeHandler) gone(c echo.Context) error {
-type ConfirmationCodeResponse struct {
+	return c.JSON(http.StatusGone, map[string]string{"message": "confirmation codes are managed by Ory Kratos"})
 	ID        uint   `json:"id"`
 	UserID    uint   `json:"user_id"`
 	Username  string `json:"username"`
 	Email     string `json:"email"`
 	Code      string `json:"code"`
 	ExpiresAt string `json:"expires_at"`
 	IsUsed    bool   `json:"is_used"`
 	CreatedAt string `json:"created_at"`
 }
 // List handles GET /api/admin/confirmation-codes.
 //
 // It binds the pagination, search and sort parameters, optionally filters by
 // username, user email or code, and responds with a paginated list in which
 // every code is passed through maskCode. Responds 400 when binding fails and
 // 500 when the count or the fetch query fails. Timestamps are rendered in
 // UTC to match the literal "Z" in the layout.
 func (h *AdminConfirmationCodeHandler) List(c echo.Context) error {
 	const timeLayout = "2006-01-02T15:04:05Z"
 	var filters dto.PaginationParams
 	if err := c.Bind(&filters); err != nil {
 		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid request body"})
 	}
 	var codes []models.ConfirmationCode
 	var total int64
 	query := h.db.Model(&models.ConfirmationCode{}).Preload("User")
 	// Apply search (search by user info or code)
 	if filters.Search != "" {
 		search := "%" + filters.Search + "%"
 		query = query.Joins("JOIN auth_user ON auth_user.id = user_confirmationcode.user_id").
 			Where(
 				"auth_user.username ILIKE ? OR auth_user.email ILIKE ? OR user_confirmationcode.code ILIKE ?",
 				search, search, search,
 			)
 	}
 	// Get total count; check its error here instead of relying on a later query to surface it.
 	if err := query.Count(&total).Error; err != nil {
 		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch confirmation codes"})
 	}
 	// Apply sorting (allowlist prevents SQL injection via sort_by parameter)
 	// NOTE(review): when the search JOIN is active, unqualified "id" looks
 	// ambiguous between auth_user and user_confirmationcode — confirm.
 	sortBy := filters.GetSafeSortBy([]string{
 		"id", "user_id", "created_at", "expires_at", "is_used",
 	}, "created_at")
 	query = query.Order(sortBy + " " + filters.GetSortDir())
 	// Apply pagination
 	query = query.Offset(filters.GetOffset()).Limit(filters.GetPerPage())
 	if err := query.Find(&codes).Error; err != nil {
 		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch confirmation codes"})
 	}
 	// Build response; index the slice to avoid copying each element.
 	responses := make([]ConfirmationCodeResponse, len(codes))
 	for i := range codes {
 		code := &codes[i]
 		responses[i] = ConfirmationCodeResponse{
 			ID:        code.ID,
 			UserID:    code.UserID,
 			Username:  code.User.Username,
 			Email:     code.User.Email,
 			Code:      maskCode(code.Code),
 			ExpiresAt: code.ExpiresAt.UTC().Format(timeLayout),
 			IsUsed:    code.IsUsed,
 			CreatedAt: code.CreatedAt.UTC().Format(timeLayout),
 		}
 	}
 	return c.JSON(http.StatusOK, dto.NewPaginatedResponse(responses, total, filters.GetPage(), filters.GetPerPage()))
 }
 // Get handles GET /api/admin/confirmation-codes/:id.
 //
 // Responds 400 for an unparsable id, 404 when no row matches and 500 on
 // other lookup errors. The code is passed through maskCode, and timestamps
 // are rendered in UTC to match the literal "Z" in the layout.
 func (h *AdminConfirmationCodeHandler) Get(c echo.Context) error {
 	const timeLayout = "2006-01-02T15:04:05Z"
 	id, err := strconv.ParseUint(c.Param("id"), 10, 32)
 	if err != nil {
 		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid ID"})
 	}
 	var code models.ConfirmationCode
 	if err := h.db.Preload("User").First(&code, id).Error; err != nil {
 		if err == gorm.ErrRecordNotFound {
 			return c.JSON(http.StatusNotFound, map[string]interface{}{"error": "Confirmation code not found"})
 		}
 		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch confirmation code"})
 	}
 	response := ConfirmationCodeResponse{
 		ID:        code.ID,
 		UserID:    code.UserID,
 		Username:  code.User.Username,
 		Email:     code.User.Email,
 		Code:      maskCode(code.Code),
 		ExpiresAt: code.ExpiresAt.UTC().Format(timeLayout),
 		IsUsed:    code.IsUsed,
 		CreatedAt: code.CreatedAt.UTC().Format(timeLayout),
 	}
 	return c.JSON(http.StatusOK, response)
 }
 // Delete handles DELETE /api/admin/confirmation-codes/:id.
 // Zero affected rows is reported as 404.
 func (h *AdminConfirmationCodeHandler) Delete(c echo.Context) error {
 	id, parseErr := strconv.ParseUint(c.Param("id"), 10, 32)
 	if parseErr != nil {
 		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid ID"})
 	}
 	removed := h.db.Delete(&models.ConfirmationCode{}, id)
 	switch {
 	case removed.Error != nil:
 		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to delete confirmation code"})
 	case removed.RowsAffected == 0:
 		return c.JSON(http.StatusNotFound, map[string]interface{}{"error": "Confirmation code not found"})
 	default:
 		return c.JSON(http.StatusOK, map[string]interface{}{"message": "Confirmation code deleted successfully"})
 	}
 }
 // BulkDelete handles DELETE /api/admin/confirmation-codes/bulk.
 // It deletes every code whose id is listed in the request and reports the
 // number of rows affected.
 func (h *AdminConfirmationCodeHandler) BulkDelete(c echo.Context) error {
 	var req dto.BulkDeleteRequest
 	if bindErr := c.Bind(&req); bindErr != nil {
 		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid request body"})
 	}
 	removed := h.db.Where("id IN ?", req.IDs).Delete(&models.ConfirmationCode{})
 	if removed.Error == nil {
 		return c.JSON(http.StatusOK, map[string]interface{}{"message": "Confirmation codes deleted successfully", "count": removed.RowsAffected})
 	}
 	return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to delete confirmation codes"})
 }
 // List delegates to gone.
 func (h *AdminConfirmationCodeHandler) List(c echo.Context) error {
 	return h.gone(c)
 }
 // Get delegates to gone.
 func (h *AdminConfirmationCodeHandler) Get(c echo.Context) error {
 	return h.gone(c)
 }
 // Delete delegates to gone.
 func (h *AdminConfirmationCodeHandler) Delete(c echo.Context) error {
 	return h.gone(c)
 }
 // BulkDelete delegates to gone.
 func (h *AdminConfirmationCodeHandler) BulkDelete(c echo.Context) error {
 	return h.gone(c)
 }
@@ -8,16 +8,18 @@ import (
 	"gorm.io/gorm"
 	"github.com/treytartt/honeydue-api/internal/models"
 	"github.com/treytartt/honeydue-api/internal/services"
 )
 // AdminLimitationsHandler handles subscription limitations management
 type AdminLimitationsHandler struct {
 	db    *gorm.DB               // database handle used to read and write subscription settings
 	cache *services.CacheService // may be nil; the settings handlers check for nil before using it
 }
-// NewAdminLimitationsHandler creates a new handler
+// NewAdminLimitationsHandler creates a new handler. Cache is optional.
-func NewAdminLimitationsHandler(db *gorm.DB) *AdminLimitationsHandler {
+func NewAdminLimitationsHandler(db *gorm.DB, cache *services.CacheService) *AdminLimitationsHandler {
-	return &AdminLimitationsHandler{db: db}
+	return &AdminLimitationsHandler{db: db, cache: cache}
 }
 // === Settings (enable_limitations) ===
@@ -27,14 +29,25 @@ type LimitationsSettingsResponse struct {
 	EnableLimitations bool `json:"enable_limitations"`
 }
-// GetSettings handles GET /api/admin/limitations/settings
+// GetSettings handles GET /api/admin/limitations/settings.
 // Reads through Redis cache first; on miss falls through to DB.
 func (h *AdminLimitationsHandler) GetSettings(c echo.Context) error {
 	ctx := c.Request().Context()
 	if h.cache != nil {
 		var cached models.SubscriptionSettings
 		if err := h.cache.GetCachedSubscriptionSettings(ctx, &cached); err == nil {
 			return c.JSON(http.StatusOK, LimitationsSettingsResponse{
 				EnableLimitations: cached.EnableLimitations,
 			})
 		}
 	}
 	var settings models.SubscriptionSettings
-	if err := h.db.First(&settings, 1).Error; err != nil {
+	if err := h.db.WithContext(ctx).First(&settings, 1).Error; err != nil {
 		if err == gorm.ErrRecordNotFound {
 			// Create default settings
 			settings = models.SubscriptionSettings{ID: 1, EnableLimitations: false}
-			if err := h.db.Create(&settings).Error; err != nil {
+			if err := h.db.WithContext(ctx).Create(&settings).Error; err != nil {
 				return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to create default settings"})
 			}
 		} else {
@@ -42,6 +55,10 @@ func (h *AdminLimitationsHandler) GetSettings(c echo.Context) error {
 		}
 	}
 	if h.cache != nil {
 		_ = h.cache.CacheSubscriptionSettings(ctx, &settings)
 	}
 	return c.JSON(http.StatusOK, LimitationsSettingsResponse{
 		EnableLimitations: settings.EnableLimitations,
 	})
@@ -60,7 +77,8 @@ func (h *AdminLimitationsHandler) UpdateSettings(c echo.Context) error {
 	}
 	var settings models.SubscriptionSettings
-	if err := h.db.First(&settings, 1).Error; err != nil {
+	ctx := c.Request().Context()
 	if err := h.db.WithContext(ctx).First(&settings, 1).Error; err != nil {
 		if err == gorm.ErrRecordNotFound {
 			settings = models.SubscriptionSettings{ID: 1}
 		} else {
@@ -72,10 +90,15 @@ func (h *AdminLimitationsHandler) UpdateSettings(c echo.Context) error {
 		settings.EnableLimitations = *req.EnableLimitations
 	}
-	if err := h.db.Save(&settings).Error; err != nil {
+	if err := h.db.WithContext(ctx).Save(&settings).Error; err != nil {
 		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to update settings"})
 	}
 	// Invalidate the cache so the new value is visible to all pods.
 	if h.cache != nil {
 		_ = h.cache.InvalidateSubscriptionSettings(ctx)
 	}
 	return c.JSON(http.StatusOK, LimitationsSettingsResponse{
 		EnableLimitations: settings.EnableLimitations,
 	})
@@ -1,159 +1,28 @@
 // password_reset_code_handler is a stub — the user_passwordresetcode table
 // was dropped in the Ory Kratos migration (phase 2). Password resets are now
 // handled by Kratos.
 package handlers
 import (
 	"net/http"
 	"strconv"
 	"github.com/labstack/echo/v4"
 	"gorm.io/gorm"
 	"github.com/treytartt/honeydue-api/internal/admin/dto"
 	"github.com/treytartt/honeydue-api/internal/models"
 )
-// AdminPasswordResetCodeHandler handles admin password reset code management endpoints
+// AdminPasswordResetCodeHandler is a no-op stub.
 type AdminPasswordResetCodeHandler struct {
 	db *gorm.DB
 }
 // NewAdminPasswordResetCodeHandler creates a new admin password reset code handler
 func NewAdminPasswordResetCodeHandler(db *gorm.DB) *AdminPasswordResetCodeHandler {
 	return &AdminPasswordResetCodeHandler{db: db}
 }
-// PasswordResetCodeResponse represents a password reset code in API responses
+func (h *AdminPasswordResetCodeHandler) gone(c echo.Context) error {
-type PasswordResetCodeResponse struct {
+	return c.JSON(http.StatusGone, map[string]string{"message": "password reset codes are managed by Ory Kratos"})
 	ID          uint   `json:"id"`
 	UserID      uint   `json:"user_id"`
 	Username    string `json:"username"`
 	Email       string `json:"email"`
 	ResetToken  string `json:"reset_token"`
 	ExpiresAt   string `json:"expires_at"`
 	Used        bool   `json:"used"`
 	Attempts    int    `json:"attempts"`
 	MaxAttempts int    `json:"max_attempts"`
 	CreatedAt   string `json:"created_at"`
 }
 // List handles GET /api/admin/password-reset-codes
 func (h *AdminPasswordResetCodeHandler) List(c echo.Context) error {
 	var filters dto.PaginationParams
 	if err := c.Bind(&filters); err != nil {
 		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid request body"})
 	}
 	var codes []models.PasswordResetCode
 	var total int64
 	query := h.db.Model(&models.PasswordResetCode{}).Preload("User")
 	// Apply search (search by user info or token)
 	if filters.Search != "" {
 		search := "%" + filters.Search + "%"
 		query = query.Joins("JOIN auth_user ON auth_user.id = user_passwordresetcode.user_id").
 			Where(
 				"auth_user.username ILIKE ? OR auth_user.email ILIKE ? OR user_passwordresetcode.reset_token ILIKE ?",
 				search, search, search,
 			)
 	}
 	// Get total count
 	query.Count(&total)
 	// Apply sorting (allowlist prevents SQL injection via sort_by parameter)
 	sortBy := filters.GetSafeSortBy([]string{
 		"id", "user_id", "created_at", "expires_at", "used",
 	}, "created_at")
 	query = query.Order(sortBy + " " + filters.GetSortDir())
 	// Apply pagination
 	query = query.Offset(filters.GetOffset()).Limit(filters.GetPerPage())
 	if err := query.Find(&codes).Error; err != nil {
 		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch password reset codes"})
 	}
 	// Build response
 	responses := make([]PasswordResetCodeResponse, len(codes))
 	for i, code := range codes {
 		responses[i] = PasswordResetCodeResponse{
 			ID:          code.ID,
 			UserID:      code.UserID,
 			Username:    code.User.Username,
 			Email:       code.User.Email,
 			ResetToken:  code.ResetToken[:8] + "..." + code.ResetToken[len(code.ResetToken)-4:], // Truncate for display
 			ExpiresAt:   code.ExpiresAt.Format("2006-01-02T15:04:05Z"),
 			Used:        code.Used,
 			Attempts:    code.Attempts,
 			MaxAttempts: code.MaxAttempts,
 			CreatedAt:   code.CreatedAt.Format("2006-01-02T15:04:05Z"),
 		}
 	}
 	return c.JSON(http.StatusOK, dto.NewPaginatedResponse(responses, total, filters.GetPage(), filters.GetPerPage()))
 }
 // Get handles GET /api/admin/password-reset-codes/:id
 func (h *AdminPasswordResetCodeHandler) Get(c echo.Context) error {
 	id, err := strconv.ParseUint(c.Param("id"), 10, 32)
 	if err != nil {
 		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid ID"})
 	}
 	var code models.PasswordResetCode
 	if err := h.db.Preload("User").First(&code, id).Error; err != nil {
 		if err == gorm.ErrRecordNotFound {
 			return c.JSON(http.StatusNotFound, map[string]interface{}{"error": "Password reset code not found"})
 		}
 		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch password reset code"})
 	}
 	response := PasswordResetCodeResponse{
 		ID:          code.ID,
 		UserID:      code.UserID,
 		Username:    code.User.Username,
 		Email:       code.User.Email,
 		ResetToken:  code.ResetToken[:8] + "..." + code.ResetToken[len(code.ResetToken)-4:],
 		ExpiresAt:   code.ExpiresAt.Format("2006-01-02T15:04:05Z"),
 		Used:        code.Used,
 		Attempts:    code.Attempts,
 		MaxAttempts: code.MaxAttempts,
 		CreatedAt:   code.CreatedAt.Format("2006-01-02T15:04:05Z"),
 	}
 	return c.JSON(http.StatusOK, response)
 }
 // Delete handles DELETE /api/admin/password-reset-codes/:id
 func (h *AdminPasswordResetCodeHandler) Delete(c echo.Context) error {
 	id, err := strconv.ParseUint(c.Param("id"), 10, 32)
 	if err != nil {
 		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid ID"})
 	}
 	result := h.db.Delete(&models.PasswordResetCode{}, id)
 	if result.Error != nil {
 		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to delete password reset code"})
 	}
 	if result.RowsAffected == 0 {
 		return c.JSON(http.StatusNotFound, map[string]interface{}{"error": "Password reset code not found"})
 	}
 	return c.JSON(http.StatusOK, map[string]interface{}{"message": "Password reset code deleted successfully"})
 }
 // BulkDelete handles DELETE /api/admin/password-reset-codes/bulk
 func (h *AdminPasswordResetCodeHandler) BulkDelete(c echo.Context) error {
 	var req dto.BulkDeleteRequest
 	if err := c.Bind(&req); err != nil {
 		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid request body"})
 	}
 	result := h.db.Where("id IN ?", req.IDs).Delete(&models.PasswordResetCode{})
 	if result.Error != nil {
 		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to delete password reset codes"})
 	}
 	return c.JSON(http.StatusOK, map[string]interface{}{"message": "Password reset codes deleted successfully", "count": result.RowsAffected})
 }
 // List, Get, Delete and BulkDelete are the former password-reset-code CRUD
 // endpoints. Each one now delegates to gone, so every call answers with
 // HTTP 410 Gone and a message pointing at Ory Kratos.
 func (h *AdminPasswordResetCodeHandler) List(c echo.Context) error       { return h.gone(c) }
 func (h *AdminPasswordResetCodeHandler) Get(c echo.Context) error        { return h.gone(c) }
 func (h *AdminPasswordResetCodeHandler) Delete(c echo.Context) error     { return h.gone(c) }
 func (h *AdminPasswordResetCodeHandler) BulkDelete(c echo.Context) error { return h.gone(c) }
@@ -19,11 +19,13 @@ import (
 // AdminSettingsHandler handles system settings management
 type AdminSettingsHandler struct {
 	db    *gorm.DB
 	cache *services.CacheService
 }
-// NewAdminSettingsHandler creates a new handler
+// NewAdminSettingsHandler creates a new handler. The cache may be nil; the
-func NewAdminSettingsHandler(db *gorm.DB) *AdminSettingsHandler {
+// handler falls through to direct DB reads in that case.
-	return &AdminSettingsHandler{db: db}
+func NewAdminSettingsHandler(db *gorm.DB, cache *services.CacheService) *AdminSettingsHandler {
 	return &AdminSettingsHandler{db: db, cache: cache}
 }
 // SettingsResponse represents the settings response
@@ -34,10 +36,29 @@ type SettingsResponse struct {
 	TrialDurationDays int  `json:"trial_duration_days"`
 }
-// GetSettings handles GET /api/admin/settings
+// GetSettings handles GET /api/admin/settings.
 //
 // Reads through Redis (30-min TTL) before hitting Postgres so the same
 // row that's checked on every authed request and every monitoring poll
 // stays hot. Cache miss / first boot creates and caches the default row.
 func (h *AdminSettingsHandler) GetSettings(c echo.Context) error {
 	ctx := c.Request().Context()
 	// Try cache first.
 	if h.cache != nil {
 		var cached models.SubscriptionSettings
 		if err := h.cache.GetCachedSubscriptionSettings(ctx, &cached); err == nil {
 			return c.JSON(http.StatusOK, SettingsResponse{
 				EnableLimitations: cached.EnableLimitations,
 				EnableMonitoring:  cached.EnableMonitoring,
 				TrialEnabled:      cached.TrialEnabled,
 				TrialDurationDays: cached.TrialDurationDays,
 			})
 		}
 	}
 	var settings models.SubscriptionSettings
-	if err := h.db.First(&settings, 1).Error; err != nil {
+	if err := h.db.WithContext(ctx).First(&settings, 1).Error; err != nil {
 		if err == gorm.ErrRecordNotFound {
 			// Create default settings
 			settings = models.SubscriptionSettings{
@@ -47,7 +68,7 @@ func (h *AdminSettingsHandler) GetSettings(c echo.Context) error {
 				TrialEnabled:      true,
 				TrialDurationDays: 14,
 			}
-			if err := h.db.Create(&settings).Error; err != nil {
+			if err := h.db.WithContext(ctx).Create(&settings).Error; err != nil {
 				return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to create default settings"})
 			}
 		} else {
@@ -55,6 +76,10 @@ func (h *AdminSettingsHandler) GetSettings(c echo.Context) error {
 		}
 	}
 	if h.cache != nil {
 		_ = h.cache.CacheSubscriptionSettings(ctx, &settings)
 	}
 	return c.JSON(http.StatusOK, SettingsResponse{
 		EnableLimitations: settings.EnableLimitations,
 		EnableMonitoring:  settings.EnableMonitoring,
@@ -79,7 +104,7 @@ func (h *AdminSettingsHandler) UpdateSettings(c echo.Context) error {
 	}
 	var settings models.SubscriptionSettings
-	if err := h.db.First(&settings, 1).Error; err != nil {
+	if err := h.db.WithContext(c.Request().Context()).First(&settings, 1).Error; err != nil {
 		if err == gorm.ErrRecordNotFound {
 			settings = models.SubscriptionSettings{
 				ID:                1,
@@ -108,10 +133,16 @@ func (h *AdminSettingsHandler) UpdateSettings(c echo.Context) error {
 		settings.TrialDurationDays = *req.TrialDurationDays
 	}
-	if err := h.db.Save(&settings).Error; err != nil {
+	if err := h.db.WithContext(c.Request().Context()).Save(&settings).Error; err != nil {
 		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to update settings"})
 	}
 	// Invalidate the cache so all pods pick up the new value on their
 	// next read (instead of waiting for the 30-min TTL).
 	if h.cache != nil {
 		_ = h.cache.InvalidateSubscriptionSettings(c.Request().Context())
 	}
 	return c.JSON(http.StatusOK, SettingsResponse{
 		EnableLimitations: settings.EnableLimitations,
 		EnableMonitoring:  settings.EnableMonitoring,
@@ -217,137 +248,20 @@ func (h *AdminSettingsHandler) cacheAllLookups(ctx context.Context) (bool, error
 	}
 	log.Debug().Int("count", len(taskTemplates)).Msg("Cached task templates")
-	// Build and cache the unified seeded data response
+	// Invalidate the unified seeded-data cache for every locale. The combined
-	// Import the grouped response type
+	// response is localized (lookup display_name + home-profile options) and is
-	seededData := map[string]interface{}{
+	// rebuilt per-locale on demand by the static_data handler, so the correct
-		"residence_types":        residenceTypes,
+	// action after a lookup change is to clear all language variants rather than
-		"task_categories":        categories,
+	// pre-warm a single (non-localized) blob.
-		"task_priorities":        priorities,
+	if err := cache.InvalidateSeededData(ctx); err != nil {
-		"task_frequencies":       frequencies,
+		return false, fmt.Errorf("failed to invalidate seeded data: %w", err)
 		"contractor_specialties": specialties,
 		"task_templates":         buildGroupedTemplates(taskTemplates),
 	}
-
+	log.Debug().Msg("Invalidated per-locale seeded data cache")
 	etag, err := cache.CacheSeededData(ctx, seededData)
 	if err != nil {
 		return false, fmt.Errorf("failed to cache seeded data: %w", err)
 	}
 	log.Debug().Str("etag", etag).Msg("Cached unified seeded data")
 	log.Info().Msg("All lookup data cached in Redis successfully")
 	return true, nil
 }
 // buildGroupedTemplates groups task templates by category for the seeded data response
 func buildGroupedTemplates(templates []models.TaskTemplate) map[string]interface{} {
 	type templateResponse struct {
 		ID           uint                   `json:"id"`
 		Title        string                 `json:"title"`
 		Description  string                 `json:"description"`
 		CategoryID   *uint                  `json:"category_id"`
 		Category     map[string]interface{} `json:"category,omitempty"`
 		FrequencyID  *uint                  `json:"frequency_id"`
 		Frequency    map[string]interface{} `json:"frequency,omitempty"`
 		IconIOS      string                 `json:"icon_ios"`
 		IconAndroid  string                 `json:"icon_android"`
 		Tags         []string               `json:"tags"`
 		DisplayOrder int                    `json:"display_order"`
 		IsActive     bool                   `json:"is_active"`
 	}
 	type categoryGroup struct {
 		CategoryName string             `json:"category_name"`
 		CategoryID   *uint              `json:"category_id"`
 		Templates    []templateResponse `json:"templates"`
 		Count        int                `json:"count"`
 	}
 	categoryMap := make(map[string]*categoryGroup)
 	categoryOrder := []string{}
 	for _, t := range templates {
 		categoryName := "Uncategorized"
 		var categoryID *uint
 		if t.Category != nil {
 			categoryName = t.Category.Name
 			categoryID = &t.Category.ID
 		}
 		if _, exists := categoryMap[categoryName]; !exists {
 			categoryMap[categoryName] = &categoryGroup{
 				CategoryName: categoryName,
 				CategoryID:   categoryID,
 				Templates:    []templateResponse{},
 			}
 			categoryOrder = append(categoryOrder, categoryName)
 		}
 		resp := templateResponse{
 			ID:           t.ID,
 			Title:        t.Title,
 			Description:  t.Description,
 			CategoryID:   t.CategoryID,
 			FrequencyID:  t.FrequencyID,
 			IconIOS:      t.IconIOS,
 			IconAndroid:  t.IconAndroid,
 			Tags:         parseTags(t.Tags),
 			DisplayOrder: t.DisplayOrder,
 			IsActive:     t.IsActive,
 		}
 		if t.Category != nil {
 			resp.Category = map[string]interface{}{
 				"id":            t.Category.ID,
 				"name":          t.Category.Name,
 				"description":   t.Category.Description,
 				"icon":          t.Category.Icon,
 				"color":         t.Category.Color,
 				"display_order": t.Category.DisplayOrder,
 			}
 		}
 		if t.Frequency != nil {
 			resp.Frequency = map[string]interface{}{
 				"id":            t.Frequency.ID,
 				"name":          t.Frequency.Name,
 				"days":          t.Frequency.Days,
 				"display_order": t.Frequency.DisplayOrder,
 			}
 		}
 		categoryMap[categoryName].Templates = append(categoryMap[categoryName].Templates, resp)
 	}
 	categories := make([]categoryGroup, len(categoryOrder))
 	totalCount := 0
 	for i, name := range categoryOrder {
 		group := categoryMap[name]
 		group.Count = len(group.Templates)
 		totalCount += group.Count
 		categories[i] = *group
 	}
 	return map[string]interface{}{
 		"categories":  categories,
 		"total_count": totalCount,
 	}
 }
 // parseTags splits a comma-separated tags string into a slice
 func parseTags(tags string) []string {
 	if tags == "" {
 		return []string{}
 	}
 	parts := strings.Split(tags, ",")
 	result := make([]string, 0, len(parts))
 	for _, p := range parts {
 		trimmed := strings.TrimSpace(p)
 		if trimmed != "" {
 			result = append(result, trimmed)
 		}
 	}
 	return result
 }
 // SeedTestData handles POST /api/admin/settings/seed-test-data
 func (h *AdminSettingsHandler) SeedTestData(c echo.Context) error {
 	if err := h.runSeedFile("002_test_data.sql"); err != nil {
@@ -207,9 +207,7 @@ func (h *AdminUserHandler) Create(c echo.Context) error {
 		user.IsSuperuser = *req.IsSuperuser
 	}
-	if err := user.SetPassword(req.Password); err != nil {
+	// Password management is handled by Ory Kratos; no local password hashing.
 		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to hash password"})
 	}
 	if err := h.db.Create(&user).Error; err != nil {
 		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to create user"})
@@ -284,10 +282,9 @@ func (h *AdminUserHandler) Update(c echo.Context) error {
 	if req.IsSuperuser != nil {
 		user.IsSuperuser = *req.IsSuperuser
 	}
 	// Password management is handled by Ory Kratos; local password update ignored.
 	if req.Password != nil {
-		if err := user.SetPassword(*req.Password); err != nil {
+		_ = req.Password // Password changes must go through Kratos admin API
 			return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to hash password"})
 		}
 	}
 	if err := h.db.Save(&user).Error; err != nil {
@@ -25,6 +25,7 @@ type Dependencies struct {
 	PushClient          *push.Client
 	OnboardingService   *services.OnboardingEmailService
 	MonitoringHandler   *monitoring.Handler
 	CacheService        *services.CacheService
 }
 // SetupRoutes configures all admin routes
@@ -380,7 +381,7 @@ func SetupRoutes(router *echo.Echo, db *gorm.DB, cfg *config.Config, deps *Depen
 			}
 			// System settings management (super admin only)
-			settingsHandler := handlers.NewAdminSettingsHandler(db)
+			settingsHandler := handlers.NewAdminSettingsHandler(db, deps.CacheService)
 			settings := protected.Group("/settings")
 			settings.Use(middleware.RequireSuperAdmin())
 			{
@@ -394,7 +395,7 @@ func SetupRoutes(router *echo.Echo, db *gorm.DB, cfg *config.Config, deps *Depen
 			}
 			// Limitations management (tier limits, upgrade triggers)
-			limitationsHandler := handlers.NewAdminLimitationsHandler(db)
+			limitationsHandler := handlers.NewAdminLimitationsHandler(db, deps.CacheService)
 			limitations := protected.Group("/limitations")
 			{
 				// Settings (enable_limitations toggle)
@@ -1,6 +1,7 @@
 package config
 import (
 	"crypto/rand"
 	"encoding/hex"
 	"fmt"
 	"net/url"
@@ -52,6 +53,7 @@ type DatabaseConfig struct {
 	MaxOpenConns int
 	MaxIdleConns int
 	MaxLifetime  time.Duration
 	MaxIdleTime  time.Duration
 }
 type RedisConfig struct {
@@ -88,8 +90,12 @@ type PushConfig struct {
 }
 type AppleAuthConfig struct {
-	ClientID string // Bundle ID (e.g., com.tt.honeyDue.honeyDueDev)
+	ClientID string // Bundle ID, used as the `aud` claim in Sign in with Apple identity tokens
-	TeamID   string // Apple Developer Team ID
+	// TeamID is currently unused — services/apple_auth.go validates identity tokens
 	// against ClientID + Apple's JWKS only, with no server-to-server REST calls.
 	// Wire this in if/when token revocation or refresh-token exchange is added,
 	// since both require signing a client_secret JWT with team_id + key_id.
 	TeamID string
 }
 type GoogleAuthConfig struct {
@@ -136,6 +142,13 @@ type SecurityConfig struct {
 	MaxPasswordResetRate int // per hour
 	TokenExpiryDays      int // Number of days before auth tokens expire (default 90)
 	TokenRefreshDays     int // Token must be at least this many days old before refresh (default 60)
 	// KratosPublicURL is the Ory Kratos public API base URL. The auth
 	// middleware validates sessions against {KratosPublicURL}/sessions/whoami.
 	KratosPublicURL string
 	// KratosAdminURL is the Ory Kratos admin API base URL. Account deletion
 	// removes the user's Kratos identity via
 	// {KratosAdminURL}/admin/identities/{id}.
 	KratosAdminURL string
 }
 // StorageConfig holds file storage settings.
@@ -178,7 +191,7 @@ type FeatureFlags struct {
 var (
 	cfg   *Config
-	cfgOnce sync.Once
+	cfgMu sync.Mutex
 )
 // knownWeakSecretKeys contains well-known default or placeholder secret keys
@@ -191,11 +204,19 @@ var knownWeakSecretKeys = map[string]bool{
 	"change-me-in-production-secret-key-12345": true,
 }
-// Load reads configuration from environment variables
+// Load reads configuration from environment variables.
 //
 // Caches the result so repeated calls are cheap. On validation failure, the
 // cache stays nil so a subsequent call (after env is corrected) can retry. The
 // previous implementation used sync.Once with an in-Do reset of the Once
 // itself, which races and panics with "sync: unlock of unlocked mutex".
 func Load() (*Config, error) {
-	var loadErr error
+	cfgMu.Lock()
 	defer cfgMu.Unlock()
 	if cfg != nil {
 		return cfg, nil
 	}
 	cfgOnce.Do(func() {
 	viper.SetEnvPrefix("")
 	viper.AutomaticEnv()
 	viper.SetEnvKeyReplacer(strings.NewReplacer(".", "_"))
@@ -203,6 +224,11 @@ func Load() (*Config, error) {
 	// Set defaults
 	setDefaults()
 	// Audit F8: overlay file-mounted secrets onto Viper. No-op when the
 	// directory is absent (local/dev), so this is safe to ship before the
 	// manifests mount honeydue-secrets as a volume.
 	loadFileSecrets()
 	// Parse DATABASE_URL if set (Dokku-style)
 	dbConfig := DatabaseConfig{
 		Host:         viper.GetString("DB_HOST"),
@@ -214,6 +240,7 @@ func Load() (*Config, error) {
 		MaxOpenConns: viper.GetInt("DB_MAX_OPEN_CONNS"),
 		MaxIdleConns: viper.GetInt("DB_MAX_IDLE_CONNS"),
 		MaxLifetime:  viper.GetDuration("DB_MAX_LIFETIME"),
 		MaxIdleTime:  viper.GetDuration("DB_MAX_IDLE_TIME"),
 	}
 	// Override with DATABASE_URL if present (F-16: log warning on parse failure)
@@ -234,7 +261,7 @@ func Load() (*Config, error) {
 		}
 	}
-		cfg = &Config{
+	c := &Config{
 		Server: ServerConfig{
 			Port:               viper.GetInt("PORT"),
 			Debug:              viper.GetBool("DEBUG"),
@@ -284,6 +311,8 @@ func Load() (*Config, error) {
 			MaxPasswordResetRate: 3,
 			TokenExpiryDays:      viper.GetInt("TOKEN_EXPIRY_DAYS"),
 			TokenRefreshDays:     viper.GetInt("TOKEN_REFRESH_DAYS"),
 			KratosPublicURL:      viper.GetString("KRATOS_PUBLIC_URL"),
 			KratosAdminURL:       viper.GetString("KRATOS_ADMIN_URL"),
 		},
 		Storage: StorageConfig{
 			UploadDir:     viper.GetString("STORAGE_UPLOAD_DIR"),
@@ -334,19 +363,11 @@ func Load() (*Config, error) {
 		},
 	}
-		// Validate required fields
+	if err := validate(c); err != nil {
-		if err := validate(cfg); err != nil {
+		// Leave cfg nil so the next Load() retries after env is corrected.
-			loadErr = err
+		return nil, err
 			// Reset so a subsequent call can retry after env is fixed
 			cfg = nil
 			cfgOnce = sync.Once{}
 	}
-	})
+	cfg = c
 	if loadErr != nil {
 		return nil, loadErr
 	}
 	return cfg, nil
 }
@@ -399,6 +420,8 @@ func setDefaults() {
 	// Token expiry defaults
 	viper.SetDefault("TOKEN_EXPIRY_DAYS", 90)  // Tokens expire after 90 days
 	viper.SetDefault("KRATOS_PUBLIC_URL", "http://kratos:4433") // Ory Kratos public API
 	viper.SetDefault("KRATOS_ADMIN_URL", "http://kratos:4434")  // Ory Kratos admin API
 	viper.SetDefault("TOKEN_REFRESH_DAYS", 60) // Tokens can be refreshed after 60 days
 	// Storage defaults
@@ -426,14 +449,67 @@ func isWeakSecretKey(key string) bool {
 	return knownWeakSecretKeys[strings.ToLower(strings.TrimSpace(key))]
 }
 // loadFileSecrets overlays file-mounted secrets onto Viper (audit F8).
 //
 // The secrets directory is taken from HONEYDUE_SECRETS_DIR and falls back to
 // /etc/honeydue/secrets. For each known secret key, a file of the same name
 // inside that directory is read; its whitespace-trimmed content, when
 // non-empty, is stored with viper.Set so it takes precedence over the value
 // Viper would otherwise resolve for that key. Reading from a file rather than
 // an env var keeps the secret out of the process environment.
 //
 // Files that are missing or unreadable are skipped without error, so when the
 // directory is absent (local/dev) this is a silent no-op and env vars are
 // used as before.
 func loadFileSecrets() {
 	secretsDir := os.Getenv("HONEYDUE_SECRETS_DIR")
 	if secretsDir == "" {
 		secretsDir = "/etc/honeydue/secrets"
 	}
 	secretKeys := []string{
 		"POSTGRES_PASSWORD", "SECRET_KEY", "EMAIL_HOST_PASSWORD", "FCM_SERVER_KEY",
 		"REDIS_PASSWORD", "B2_KEY_ID", "B2_APP_KEY", "OBS_INGEST_TOKEN", "OBS_TRACES_URL",
 	}
 	for _, key := range secretKeys {
 		raw, readErr := os.ReadFile(secretsDir + "/" + key)
 		if readErr != nil {
 			// No file for this key: leave whatever Viper already resolves.
 			continue
 		}
 		value := strings.TrimSpace(string(raw))
 		if value == "" {
 			// An empty file must not blank out an env-provided value.
 			continue
 		}
 		viper.Set(key, value)
 	}
 }
 // SecretValue looks up a configuration value that has no field on the typed
 // Config struct. The lookup goes through Viper, so a value provided by a
 // file-mounted secret (audit F8, see loadFileSecrets) is returned the same
 // way as one provided by an environment variable.
 //
 // Call it only after Load(). cmd/api and cmd/worker use it for the
 // observability endpoints, which they need before the full Config is wired;
 // reading those with os.Getenv would miss file-mounted secrets once F8
 // removes them from the process environment.
 func SecretValue(key string) string {
 	value := viper.GetString(key)
 	return value
 }
 // randomHexKey draws n bytes from the cryptographically secure random source
 // and returns them hex-encoded, i.e. a string of 2n lowercase hex characters.
 // If the random source fails, the error is returned with an empty string.
 func randomHexKey(n int) (string, error) {
 	raw := make([]byte, n)
 	_, err := rand.Read(raw)
 	if err != nil {
 		return "", err
 	}
 	encoded := hex.EncodeToString(raw)
 	return encoded, nil
 }
 func validate(cfg *Config) error {
-	// S-08: Validate SECRET_KEY against known weak defaults
+	// M8: SECRET_KEY validation — no static fallback secret in the binary.
 	if cfg.Security.SecretKey == "" {
 		if cfg.Server.Debug {
-			// In debug mode, use a default key with a warning for local development
+			// Debug only: generate a random key per boot. Tokens signed with
-			cfg.Security.SecretKey = "change-me-in-production-secret-key-12345"
+			// it do not survive a restart, which is acceptable for local dev
-			fmt.Println("WARNING: SECRET_KEY not set, using default (debug mode only)")
+			// and far safer than a well-known hardcoded fallback.
-			fmt.Println("WARNING: *** DO NOT USE THIS DEFAULT KEY IN PRODUCTION ***")
+			randomKey, err := randomHexKey(32)
 			if err != nil {
 				return fmt.Errorf("failed to generate ephemeral debug SECRET_KEY: %w", err)
 			}
 			cfg.Security.SecretKey = randomKey
 			fmt.Println("WARNING: SECRET_KEY not set, generated an ephemeral random key (debug mode only)")
 			fmt.Println("WARNING: tokens will not survive a restart — set SECRET_KEY for stable local sessions")
 		} else {
 			// In production, refuse to start without a proper secret key
 			return fmt.Errorf("FATAL: SECRET_KEY environment variable is required in production (DEBUG=false)")
@@ -446,6 +522,12 @@ func validate(cfg *Config) error {
 		}
 	}
 	// C4: fixed confirmation codes ("123456") must never be enabled outside
 	// debug — with DEBUG=false they are a full authentication bypass.
 	if cfg.Server.DebugFixedCodes && !cfg.Server.Debug {
 		return fmt.Errorf("FATAL: DEBUG_FIXED_CODES is enabled with DEBUG=false — fixed confirmation codes must never run in production")
 	}
 	// Database password might come from DATABASE_URL, don't require it separately
 	// The actual connection will fail if credentials are wrong
@@ -1,7 +1,6 @@
 package config
 import (
 	"sync"
 	"testing"
 	"github.com/spf13/viper"
@@ -11,8 +10,9 @@ import (
 // resetConfigState resets the package-level singleton so each test starts fresh.
 func resetConfigState() {
 	cfgMu.Lock()
 	cfg = nil
-	cfgOnce = sync.Once{}
+	cfgMu.Unlock()
 	viper.Reset()
 }
@@ -106,8 +106,10 @@ func TestLoad_Validation_MissingSecretKey_DebugMode(t *testing.T) {
 	c, err := Load()
 	require.NoError(t, err)
-	// In debug mode, a default key is assigned
+	// Audit M8: in debug mode an ephemeral random key is generated per boot
-	assert.Equal(t, "change-me-in-production-secret-key-12345", c.Security.SecretKey)
+	// (no static fallback). It must be a non-empty 64-char hex string.
 	assert.Len(t, c.Security.SecretKey, 64)
 	assert.NotEqual(t, "change-me-in-production-secret-key-12345", c.Security.SecretKey)
 }
 func TestLoad_Validation_WeakSecretKey_Production(t *testing.T) {
@@ -133,6 +135,33 @@ func TestLoad_Validation_WeakSecretKey_DebugMode(t *testing.T) {
 	assert.Equal(t, "secret", c.Security.SecretKey)
 }
 // Audit C4: DEBUG_FIXED_CODES makes confirmation codes a fixed "123456" — a
 // full authentication bypass. With DEBUG=false, validate() must refuse to boot
 // rather than ship that bypass to production.
 func TestLoad_Validation_DebugFixedCodes_Production(t *testing.T) {
 	// Exercise validate() directly on a hand-built Config: the rule under test
 	// depends only on these two Server flags, so no env vars or Viper state
 	// are needed.
 	cfg := &Config{
 		Server:   ServerConfig{Debug: false, DebugFixedCodes: true},
 		Security: SecurityConfig{SecretKey: "a-strong-secret-key-for-tests"},
 	}
 	err := validate(cfg)
 	require.Error(t, err)
 	assert.Contains(t, err.Error(), "DEBUG_FIXED_CODES")
 }
 // With DEBUG=true the fixed codes are an intended local-dev convenience, so
 // the same combination must NOT error.
 func TestLoad_Validation_DebugFixedCodes_DebugMode(t *testing.T) {
 	cfg := &Config{
 		Server:   ServerConfig{Debug: true, DebugFixedCodes: true},
 		Security: SecurityConfig{SecretKey: "a-strong-secret-key-for-tests"},
 	}
 	err := validate(cfg)
 	require.NoError(t, err)
 }
 func TestLoad_Validation_EncryptionKey_Valid(t *testing.T) {
 	resetConfigState()
 	t.Setenv("SECRET_KEY", "a-strong-secret-key-for-tests")
@@ -14,12 +14,10 @@ import (
 	"github.com/treytartt/honeydue-api/internal/config"
 	"github.com/treytartt/honeydue-api/internal/models"
-)
+	"github.com/treytartt/honeydue-api/internal/prom"
-// migrationAdvisoryLockKey is the pg_advisory_lock key that serializes
+	"github.com/uptrace/opentelemetry-go-extra/otelgorm"
-// Migrate() across API replicas booting in parallel. Value is arbitrary but
+)
 // stable ("hdmg" as bytes = honeydue migration).
 const migrationAdvisoryLockKey int64 = 0x68646d67
 // zerologGormWriter adapts zerolog for GORM's logger interface
 type zerologGormWriter struct{}
@@ -68,25 +66,84 @@ func Connect(cfg *config.DatabaseConfig, debug bool) (*gorm.DB, error) {
 		return nil, fmt.Errorf("failed to get underlying sql.DB: %w", err)
 	}
-	// Configure connection pool
+	// Configure connection pool. The Neon pooler endpoint keeps backend
 	// connections warm, so we keep our client-side pool warm too — that
 	// eliminates the ~440ms TCP+TLS+startup handshake on the first query
 	// after a cold pod / idle period.
 	sqlDB.SetMaxOpenConns(cfg.MaxOpenConns)
 	sqlDB.SetMaxIdleConns(cfg.MaxIdleConns)
 	sqlDB.SetConnMaxLifetime(cfg.MaxLifetime)
 	if cfg.MaxIdleTime > 0 {
 		sqlDB.SetConnMaxIdleTime(cfg.MaxIdleTime)
 	}
 	// MaxIdleTime=0 means "never close idle" — the pool fills up to
 	// MaxIdleConns and they stay alive until MaxLifetime expires.
 	// Test connection
 	if err := sqlDB.Ping(); err != nil {
 		return nil, fmt.Errorf("failed to ping database: %w", err)
 	}
 	// Eagerly warm the connection pool to MaxIdleConns. Without this, the
 	// first N user requests each pay the full handshake (~440ms over a
 	// transatlantic link). Pings are issued in parallel so warm-up is
 	// bounded by handshake time, not handshake-time × N.
 	warmUpPool(sqlDB, cfg.MaxIdleConns)
 	log.Info().
 		Str("host", cfg.Host).
 		Int("port", cfg.Port).
 		Str("database", cfg.Database).
 		Msg("Connected to PostgreSQL database")
 	// Register Prometheus GORM callbacks — emits gorm_query_duration_seconds
 	// for every SQL operation. Operates at the statement level, so does not
 	// require ctx to be threaded through repositories.
 	if err := prom.RegisterGORMCallbacks(db); err != nil {
 		log.Warn().Err(err).Msg("failed to register prometheus GORM callbacks; metrics will be partial")
 	}
 	// Register otelgorm plugin — emits a span per SQL statement, attached to
 	// whatever trace context is set via db.WithContext(ctx). Repositories that
 	// have been migrated to use WithContext (see internal/repositories/*.go)
 	// will produce nested SQL spans inside the request trace; pre-migration
 	// repositories silently emit untraced queries.
 	if err := db.Use(otelgorm.NewPlugin(otelgorm.WithDBName(cfg.Database))); err != nil {
 		log.Warn().Err(err).Msg("failed to register otelgorm plugin; SQL spans disabled")
 	}
 	return db, nil
 }
 // warmUpPool issues n parallel pings so the pool holds established
 // connections before the first user request lands. It is best-effort: ping
 // failures never abort startup, because the pool can still fill on demand
 // under traffic. The whole warm-up is bounded by a 5-second timeout.
 //
 // sqlDB only needs PingContext (*sql.DB satisfies it). n is the number of
 // pings to issue; n <= 0 makes the call a no-op.
 //
 // The outcome is logged as requested vs. warmed counts. When any ping fails,
 // a single warning carrying the first error and the failure count is also
 // emitted, so a cold pool at boot is diagnosable without producing n
 // near-identical log lines.
 func warmUpPool(sqlDB interface {
 	PingContext(context.Context) error
 }, n int) {
 	if n <= 0 {
 		return
 	}
 	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
 	defer cancel()
 	// Buffered to n so every pinger can deliver its result without blocking.
 	done := make(chan error, n)
 	for i := 0; i < n; i++ {
 		go func() { done <- sqlDB.PingContext(ctx) }()
 	}
 	successes := 0
 	var firstErr error
 	for i := 0; i < n; i++ {
 		err := <-done
 		switch {
 		case err == nil:
 			successes++
 		case firstErr == nil:
 			// Keep only the first failure; the rest usually share its cause.
 			firstErr = err
 		}
 	}
 	if firstErr != nil {
 		log.Warn().
 			Err(firstErr).
 			Int("failed", n-successes).
 			Msg("DB pool warm-up pings failed; pool will fill on demand")
 	}
 	log.Info().Int("requested", n).Int("warmed", successes).Msg("DB pool warm-up complete")
 }
 // Get returns the database instance
 func Get() *gorm.DB {
 	return db
@@ -127,52 +184,46 @@ func Paginate(page, pageSize int) func(db *gorm.DB) *gorm.DB {
 	}
 }
-// MigrateWithLock runs Migrate() under a Postgres session-level advisory lock
+// RequireSchemaApplied verifies that goose's version table exists and has
-// so that multiple API replicas booting in parallel don't race on AutoMigrate.
+// at least one applied entry. This is the fail-fast that runs at api/worker
-// On non-Postgres dialects (sqlite in tests) it falls through to Migrate().
+// boot: if the operator forgot to run the migrate Job, the pod refuses to
-func MigrateWithLock() error {
+// start with a clear error instead of throwing mysterious "relation does
 // not exist" errors deep in a request handler.
 //
 // On non-Postgres dialects (sqlite in tests) this is a no-op — tests use
 // AutoMigrate via testutil.SetupTestDB to create a fresh schema per run.
 // goose isn't involved in the test path.
 func RequireSchemaApplied() error {
 	if db == nil {
 		return fmt.Errorf("database not initialised")
 	}
 	if db.Dialector.Name() != "postgres" {
-		return Migrate()
+		return nil
 	}
-	sqlDB, err := db.DB()
+	// goose_db_version stores one row per applied migration, not a single
 	// "current version" row — so we look for the highest version_id with
 	// is_applied=true. ORDER BY id DESC LIMIT 1 also catches the case where
 	// the table exists but is empty (no rows returned, scan leaves VersionID
 	// at zero).
 	type migrationRow struct {
 		VersionID int64 `gorm:"column:version_id"`
 		IsApplied bool  `gorm:"column:is_applied"`
 	}
 	var row migrationRow
 	err := db.Raw(`SELECT version_id, is_applied FROM goose_db_version ORDER BY id DESC LIMIT 1`).Scan(&row).Error
 	if err != nil {
-		return fmt.Errorf("get underlying sql.DB: %w", err)
+		return fmt.Errorf("goose_db_version check failed (run the migrate Job to bootstrap): %w", err)
 	}
-
+	if !row.IsApplied {
-	// Give ourselves up to 5 min to acquire the lock — long enough for a
+		return fmt.Errorf("goose_db_version latest row is_applied=false at version=%d — last migration was rolled back or aborted; investigate before starting", row.VersionID)
 	// slow migration on a peer replica, short enough to fail fast if Postgres
 	// is hung.
 	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
 	defer cancel()
 	conn, err := sqlDB.Conn(ctx)
 	if err != nil {
 		return fmt.Errorf("acquire dedicated migration connection: %w", err)
 	}
-	defer conn.Close()
+	if row.VersionID < 1 {
-
+		return fmt.Errorf("goose_db_version is empty — run goose up (or seed a row marking version 1 as applied if the schema already exists)")
 	log.Info().Int64("lock_key", migrationAdvisoryLockKey).Msg("Acquiring migration advisory lock...")
 	if _, err := conn.ExecContext(ctx, "SELECT pg_advisory_lock($1)", migrationAdvisoryLockKey); err != nil {
 		return fmt.Errorf("pg_advisory_lock: %w", err)
 	}
-	log.Info().Msg("Migration advisory lock acquired")
+	log.Info().Int64("schema_version", row.VersionID).Msg("Schema precondition satisfied")
-
+	return nil
 	defer func() {
 		// Unlock with a fresh context — the outer ctx may have expired.
 		unlockCtx, unlockCancel := context.WithTimeout(context.Background(), 10*time.Second)
 		defer unlockCancel()
 		if _, err := conn.ExecContext(unlockCtx, "SELECT pg_advisory_unlock($1)", migrationAdvisoryLockKey); err != nil {
 			log.Warn().Err(err).Msg("Failed to release migration advisory lock (session close will also release)")
 		} else {
 			log.Info().Msg("Migration advisory lock released")
 		}
 	}()
 	return Migrate()
 }
 // Migrate runs database migrations for all models
@@ -193,12 +244,7 @@ func Migrate() error {
 		// User and auth tables
 		&models.User{},
 		&models.AuthToken{},
 		&models.UserProfile{},
 		&models.ConfirmationCode{},
 		&models.PasswordResetCode{},
 		&models.AppleSocialAuth{},
 		&models.GoogleSocialAuth{},
 		// Admin users (separate from app users)
 		&models.AdminUser{},
@@ -25,7 +25,12 @@ type CreateDocumentRequest struct {
 	SerialNumber  string               `json:"serial_number" validate:"max=100"`
 	ModelNumber   string               `json:"model_number" validate:"max=100"`
 	TaskID        *uint                `json:"task_id"`
-	ImageURLs     []string             `json:"image_urls" validate:"omitempty,max=20,dive,max=500"` // Multiple image URLs
+	// UploadIDs claims pending_uploads rows produced by the presigned-URL
 	// upload flow and turns them into document_image rows. UploadIDs of
 	// category "document_file" attach to the document's main FileURL +
 	// FileName fields instead — the service infers placement from the
 	// row's category.
 	UploadIDs     []uint               `json:"upload_ids" validate:"omitempty,max=20"`
 }
 // UpdateDocumentRequest represents the request to update a document
@@ -100,14 +100,20 @@ type UpdateTaskRequest struct {
 	ContractorID       *uint            `json:"contractor_id"`
 }
-// CreateTaskCompletionRequest represents the request to create a task completion
+// CreateTaskCompletionRequest represents the request to create a task completion.
 //
 // Image attachments arrive via the presigned-URL flow: the client uploads
 // each image directly to B2 (see /api/uploads/presign) and passes the
 // resulting pending_uploads.id values in UploadIDs. The service claims
 // those rows and creates the linked task_completion_image rows.
 type CreateTaskCompletionRequest struct {
 	TaskID      uint             `json:"task_id" validate:"required"`
 	CompletedAt *time.Time       `json:"completed_at"` // Defaults to now
 	Notes       string           `json:"notes" validate:"max=10000"`
 	ActualCost  *decimal.Decimal `json:"actual_cost"`
 	Rating      *int             `json:"rating" validate:"omitempty,min=1,max=5"` // 1-5 star rating
-	ImageURLs   []string         `json:"image_urls" validate:"omitempty,max=20,dive,max=500"` // Multiple image URLs
+
 	UploadIDs []uint `json:"upload_ids" validate:"omitempty,max=20"`
 }
 // UpdateTaskCompletionRequest represents the request to update a task completion
@@ -115,7 +121,6 @@ type UpdateTaskCompletionRequest struct {
 	Notes      *string          `json:"notes" validate:"omitempty,max=10000"`
 	ActualCost *decimal.Decimal `json:"actual_cost"`
 	Rating     *int             `json:"rating" validate:"omitempty,min=1,max=5"`
 	ImageURLs  []string         `json:"image_urls" validate:"omitempty,max=20,dive,max=500"`
 }
 // CompletionImageInput represents an image to add to a completion
@@ -0,0 +1,22 @@
 package requests
 // PresignUploadRequest is the body for POST /api/uploads/presign. The client
 // describes what it's about to upload; the server validates against quota,
 // rate limits, and per-category caps before returning a signed upload URL
 // (see PresignUploadResponse, which documents a presigned PUT rather than an
 // S3 POST policy).
 type PresignUploadRequest struct {
 	// Category gates allowed mime types and the size cap. One of:
 	//   "completion"     — task completion photos
 	//   "document_image" — image attached to a Document
 	//   "document_file"  — file (e.g. PDF) attached to a Document
 	Category string `json:"category" validate:"required,oneof=completion document_image document_file"`
 	// ContentType is the MIME type the client will upload (e.g. image/jpeg).
 	// Bound to the signature so the actual upload must match exactly.
 	ContentType string `json:"content_type" validate:"required,min=3,max=127"`
 	// ContentLength is the byte count the client intends to upload; must be
 	// at least 1.
 	// NOTE(review): this comment previously promised a small server-side
 	// slack window around the value, but PresignUploadResponse documents
 	// Content-Length as a signed header that must match — confirm which
 	// behavior the presign service actually implements.
 	ContentLength int64 `json:"content_length" validate:"required,min=1"`
 }
@@ -9,7 +9,10 @@ import (
 // ContractorSpecialtyResponse represents a contractor specialty
 type ContractorSpecialtyResponse struct {
 	ID uint `json:"id"`
 	// Name is the stable English identifier (clients match on this).
 	Name string `json:"name"`
 	// DisplayName is the localized label for the request's Accept-Language.
 	DisplayName  string `json:"display_name"`
 	Description  string `json:"description"`
 	Icon         string `json:"icon"`
 	DisplayOrder int    `json:"display_order"`
@@ -11,7 +11,10 @@ import (
 // ResidenceTypeResponse represents a residence type in the API response.
 // It carries both a stable identifier and a localized label so clients can
 // match on one and display the other.
 type ResidenceTypeResponse struct {
 	// ID is serialized as "id".
 	ID uint `json:"id"`
 	// Name is the stable English identifier (clients match on this).
 	Name string `json:"name"`
 	// DisplayName is the localized label for the request's Accept-Language.
 	DisplayName string `json:"display_name"`
 }
 // ResidenceUserResponse represents a user with access to a residence
@@ -14,7 +14,10 @@ import (
 // TaskCategoryResponse represents a task category
 type TaskCategoryResponse struct {
 	ID uint `json:"id"`
 	// Name is the stable English identifier (clients match on this).
 	Name string `json:"name"`
 	// DisplayName is the localized label for the request's Accept-Language.
 	DisplayName  string `json:"display_name"`
 	Description  string `json:"description"`
 	Icon         string `json:"icon"`
 	Color        string `json:"color"`
@@ -25,6 +28,7 @@ type TaskCategoryResponse struct {
 type TaskPriorityResponse struct {
 	ID           uint   `json:"id"`
 	Name         string `json:"name"`
 	DisplayName  string `json:"display_name"`
 	Level        int    `json:"level"`
 	Color        string `json:"color"`
 	DisplayOrder int    `json:"display_order"`
@@ -34,6 +38,7 @@ type TaskPriorityResponse struct {
 type TaskFrequencyResponse struct {
 	ID           uint   `json:"id"`
 	// Name / DisplayName: presumably the same split used by
 	// TaskCategoryResponse (stable English identifier vs. localized label
 	// for the request's Accept-Language) — confirm against the mapper.
 	Name         string `json:"name"`
 	DisplayName  string `json:"display_name"`
 	// Days is a pointer, so a nil value is emitted as JSON null.
 	// NOTE(review): presumably the recurrence interval in days, with nil
 	// for frequencies that have no fixed interval — confirm.
 	Days         *int   `json:"days"`
 	DisplayOrder int    `json:"display_order"`
 }
@@ -0,0 +1,38 @@
 package responses
 // PresignUploadResponse is what /api/uploads/presign returns to the client.
 //
 // Flow: the client makes one PUT request to URL with the raw object bytes
 // as the body and Headers as the request headers (verbatim — the signature
 // binds them). On success, the client passes ID back via upload_ids[] on
 // POST /api/task-completions/ or POST /api/documents/ to claim and attach
 // the object.
 //
 // We use PUT (not POST) because Backblaze B2's S3-compatible endpoint does
 // not implement the S3 POST Object form upload — it returns HTTP 501 on
 // every request style. PUT works against AWS S3, B2, and MinIO uniformly.
 type PresignUploadResponse struct {
 	// ID is the pending_uploads.id the client passes back via upload_ids[].
 	ID uint `json:"id"`
 	// URL is the signed PUT URL. Includes all auth as query parameters.
 	// Note the JSON key is "upload_url", not "url".
 	URL string `json:"upload_url"`
 	// Method is always "PUT" — emitted explicitly so clients don't have to
 	// hardcode it, and so the field already exists should an alternative
 	// upload mechanism ever be offered.
 	Method string `json:"method"`
 	// Headers must be sent verbatim on the PUT request. Currently includes
 	// Content-Type and Content-Length; both are signed, and B2 will reject
 	// any PUT whose headers don't match.
 	Headers map[string]string `json:"headers"`
 	// Key is the object key chosen by the server. Echoed for client logging
 	// and debugging; the canonical reference is via ID.
 	Key string `json:"key"`
 	// ExpiresAt is when the signed URL stops working. Clients should retry
 	// with a fresh presign rather than relying on long-lived URLs.
 	// NOTE(review): carried as a string; the timestamp format (presumably
 	// RFC 3339) is set by the producer — confirm before parsing.
 	ExpiresAt string `json:"expires_at"`
 }
@@ -1,7 +1,6 @@
 package handlers
 import (
 	"errors"
 	"net/http"
 	"github.com/labstack/echo/v4"
@@ -13,20 +12,22 @@ import (
 	"github.com/treytartt/honeydue-api/internal/middleware"
 	"github.com/treytartt/honeydue-api/internal/services"
 	"github.com/treytartt/honeydue-api/internal/validator"
 	"github.com/treytartt/honeydue-api/internal/worker"
 )
-// AuthHandler handles authentication endpoints
+// AuthHandler handles user profile and account management endpoints.
 // Session lifecycle (login, register, logout, password reset) is delegated
 // to Ory Kratos; this handler only deals with the honeyDue user record.
 type AuthHandler struct {
 	// Set by NewAuthHandler.
 	authService    *services.AuthService
 	// Optional: handlers nil-check emailService before sending mail.
 	emailService   *services.EmailService
 	// Optional: handlers nil-check cache before invalidating tokens.
 	cache          *services.CacheService
 	// Social sign-in verifiers, injected after construction via
 	// SetAppleAuthService / SetGoogleAuthService; the sign-in handlers
 	// return a "not configured" error while these are nil.
 	appleAuthService  *services.AppleAuthService
 	googleAuthService *services.GoogleAuthService
 	// Injected via SetStorageService; used for file deletion during
 	// account deletion.
 	storageService *services.StorageService
 	// Optional: injected via SetAuditService; handlers nil-check it before
 	// logging security events.
 	auditService   *services.AuditService
 	// Async task enqueuer; ExportData answers 503 while this is nil.
 	enqueuer       worker.Enqueuer
 }
-// NewAuthHandler creates a new auth handler
+// NewAuthHandler creates a new auth handler.
 func NewAuthHandler(authService *services.AuthService, emailService *services.EmailService, cache *services.CacheService) *AuthHandler {
 	return &AuthHandler{
 		authService:  authService,
@@ -35,139 +36,108 @@ func NewAuthHandler(authService *services.AuthService, emailService *services.Em
 	}
 }
-// SetAppleAuthService sets the Apple auth service (called after initialization)
+// SetStorageService sets the storage service for file deletion during account deletion.
 func (h *AuthHandler) SetAppleAuthService(appleAuth *services.AppleAuthService) {
 	// Read by AppleSignIn, which returns error.apple_signin_not_configured
 	// while this is nil.
 	h.appleAuthService = appleAuth
 }
 // SetGoogleAuthService sets the Google auth service (called after
 // initialization). GoogleSignIn returns error.google_signin_not_configured
 // while this is nil.
 func (h *AuthHandler) SetGoogleAuthService(googleAuth *services.GoogleAuthService) {
 	h.googleAuthService = googleAuth
 }
 // SetStorageService sets the storage service for file deletion during
 // account deletion. It only stores the reference on the handler.
 func (h *AuthHandler) SetStorageService(storageService *services.StorageService) {
 	h.storageService = storageService
 }
-// SetAuditService sets the audit service for logging security events
+// SetAuditService sets the audit service for logging security events.
 func (h *AuthHandler) SetAuditService(auditService *services.AuditService) {
 	// Optional dependency: handlers check h.auditService for nil before
 	// logging an event, so leaving it unset just disables audit logging.
 	h.auditService = auditService
 }
-// Login handles POST /api/auth/login/
+// SetEnqueuer sets the async task enqueuer (used by the GDPR data-export endpoint).
-func (h *AuthHandler) Login(c echo.Context) error {
+func (h *AuthHandler) SetEnqueuer(enqueuer worker.Enqueuer) {
-	var req requests.LoginRequest
+	h.enqueuer = enqueuer
 	if err := c.Bind(&req); err != nil {
 		return apperrors.BadRequest("error.invalid_request")
 	}
 	if err := c.Validate(&req); err != nil {
 		return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(err))
 	}
 	response, err := h.authService.Login(&req)
 	if err != nil {
 		log.Debug().Err(err).Str("identifier", req.Username).Msg("Login failed")
 		if h.auditService != nil {
 			h.auditService.LogEvent(c, nil, services.AuditEventLoginFailed, map[string]interface{}{
 				"identifier": req.Username,
 			})
 		}
 		return err
 	}
 	if h.auditService != nil {
 		userID := response.User.ID
 		h.auditService.LogEvent(c, &userID, services.AuditEventLogin, nil)
 	}
 	return c.JSON(http.StatusOK, response)
 }
-// Register handles POST /api/auth/register/
+// ExportData handles POST /api/auth/export/ — queues a GDPR data-export job that
 // emails the user a zip of all their data. Async (202) because gathering,
 // zipping, and emailing can take seconds; doing it inline would block the request.
 func (h *AuthHandler) ExportData(c echo.Context) error {
 	// Mark the response non-cacheable before anything can be written.
 	noStore(c)
 	authUser, authErr := middleware.MustGetAuthUser(c)
 	if authErr != nil {
 		return authErr
 	}
 	// Without an enqueuer there is nowhere to queue the job: answer 503.
 	queue := h.enqueuer
 	if queue == nil {
 		return echo.NewHTTPError(http.StatusServiceUnavailable, "data export is temporarily unavailable")
 	}
 	if enqueueErr := queue.EnqueueDataExport(authUser.ID); enqueueErr != nil {
 		log.Error().Err(enqueueErr).Uint("user_id", authUser.ID).Msg("Failed to enqueue data export")
 		return echo.NewHTTPError(http.StatusInternalServerError, "failed to queue data export")
 	}
 	// Audit only after the job was queued successfully.
 	if h.auditService != nil {
 		details := map[string]interface{}{
 			"user_id": authUser.ID,
 			"email":   authUser.Email,
 		}
 		h.auditService.LogEvent(c, &authUser.ID, services.AuditEventDataExport, details)
 	}
 	accepted := map[string]string{
 		"message": "Your data export has been queued. You'll receive an email with your data shortly.",
 	}
 	return c.JSON(http.StatusAccepted, accepted)
 }
 // noStore sets "Cache-Control: no-store" on the response headers so the
 // response is not cached.
 func noStore(c echo.Context) {
 	headers := c.Response().Header()
 	headers.Set("Cache-Control", "no-store")
 }
 // Register handles POST /api/auth/register/ — creates a new password account.
 //
 // The identity is admin-created in Kratos with an unverified email and no
 // auto-sent code (see services.AuthService.Register). The client logs in right
 // after to get a session, then completes email verification. Returns 201 with
 // no token; 409 if the email is taken; 400 on a weak password.
 func (h *AuthHandler) Register(c echo.Context) error {
 	var req requests.RegisterRequest
 	if err := c.Bind(&req); err != nil {
-		return apperrors.BadRequest("error.invalid_request")
+		return apperrors.BadRequest("error.invalid_request_body")
 	}
 	if err := c.Validate(&req); err != nil {
 		return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(err))
 	}
-
+	if err := h.authService.Register(c.Request().Context(), &req); err != nil {
 	response, confirmationCode, err := h.authService.Register(&req)
 	if err != nil {
 		log.Debug().Err(err).Msg("Registration failed")
 		return err
 	}
-
+	return c.JSON(http.StatusCreated, map[string]string{
-	if h.auditService != nil {
+		"message": "Account created. Please verify your email.",
 		userID := response.User.ID
 		h.auditService.LogEvent(c, &userID, services.AuditEventRegister, map[string]interface{}{
 			"username": req.Username,
 			"email":    req.Email,
 	})
 	}
 	// Send welcome email with confirmation code (async)
 	if h.emailService != nil && confirmationCode != "" {
 		go func() {
 			defer func() {
 				if r := recover(); r != nil {
 					log.Error().Interface("panic", r).Str("email", req.Email).Msg("Panic in welcome email goroutine")
 				}
 			}()
 			if err := h.emailService.SendWelcomeEmail(req.Email, req.FirstName, confirmationCode); err != nil {
 				log.Error().Err(err).Str("email", req.Email).Msg("Failed to send welcome email")
 			}
 		}()
 	}
 	return c.JSON(http.StatusCreated, response)
 }
 // Logout handles POST /api/auth/logout/.
 //
 // Responds 401 (error.not_authenticated) when the request carries no auth
 // token. Otherwise the token is removed from the database and, when a cache
 // is configured, from the cache; failures of either step are logged at warn
 // level and do not fail the request.
 func (h *AuthHandler) Logout(c echo.Context) error {
 	authToken := middleware.GetAuthToken(c)
 	if authToken == "" {
 		return apperrors.Unauthorized("error.not_authenticated")
 	}
 	// Record the audit event first, before the token is invalidated.
 	if h.auditService != nil {
 		if authUser := middleware.GetAuthUser(c); authUser != nil {
 			h.auditService.LogEvent(c, &authUser.ID, services.AuditEventLogout, nil)
 		}
 	}
 	// Database copy of the token.
 	if dbErr := h.authService.Logout(authToken); dbErr != nil {
 		log.Warn().Err(dbErr).Msg("Failed to delete token from database")
 	}
 	// Cached copy of the token.
 	if h.cache != nil {
 		cacheErr := h.cache.InvalidateAuthToken(c.Request().Context(), authToken)
 		if cacheErr != nil {
 			log.Warn().Err(cacheErr).Msg("Failed to invalidate token in cache")
 		}
 	}
 	reply := responses.MessageResponse{Message: "Logged out successfully"}
 	return c.JSON(http.StatusOK, reply)
 }
 // CurrentUser handles GET /api/auth/me/
 func (h *AuthHandler) CurrentUser(c echo.Context) error {
 	noStore(c)
 	user, err := middleware.MustGetAuthUser(c)
 	if err != nil {
 		return err
 	}
-	response, err := h.authService.GetCurrentUser(user.ID)
+	response, err := h.authService.GetCurrentUser(c.Request().Context(), user.ID)
 	if err != nil {
 		log.Error().Err(err).Uint("user_id", user.ID).Msg("Failed to get current user")
 		return err
 	}
 	// user_profile.verified is a one-time mirror set at provision time
 	// (see middleware/kratos_auth.go::provision). Kratos remains the source
 	// of truth for email-verification state — it can flip from false → true
 	// the instant the user completes the verification flow, and nothing
 	// updates the local column. Override the response with the live value
 	// the Kratos auth middleware already stashed in context so /auth/me
 	// reflects current reality. Also opportunistically sync the DB mirror
 	// (best-effort, ignore error) so background queries that read the
 	// column see the same answer.
 	if verified, ok := c.Get(middleware.AuthVerifiedKey).(bool); ok {
 		mirrorStale := response.Profile != nil && response.Profile.Verified != verified
 		if response.Profile != nil {
 			response.Profile.Verified = verified
 		}
 		if verified && mirrorStale {
 			_ = h.authService.MarkUserVerified(c.Request().Context(), user.ID)
 		}
 	}
 	return c.JSON(http.StatusOK, response)
 }
@@ -186,7 +156,7 @@ func (h *AuthHandler) UpdateProfile(c echo.Context) error {
 		return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(err))
 	}
-	response, err := h.authService.UpdateProfile(user.ID, &req)
+	response, err := h.authService.UpdateProfile(c.Request().Context(), user.ID, &req)
 	if err != nil {
 		log.Debug().Err(err).Uint("user_id", user.ID).Msg("Failed to update profile")
 		return err
@@ -195,296 +165,6 @@ func (h *AuthHandler) UpdateProfile(c echo.Context) error {
 	return c.JSON(http.StatusOK, response)
 }
 // VerifyEmail handles POST /api/auth/verify-email/.
 //
 // Binds and validates the request, asks the auth service to verify the
 // submitted code for the authenticated user, and on success fires off the
 // post-verification email in the background before returning 200.
 func (h *AuthHandler) VerifyEmail(c echo.Context) error {
 	user, err := middleware.MustGetAuthUser(c)
 	if err != nil {
 		return err
 	}
 	var payload requests.VerifyEmailRequest
 	if bindErr := c.Bind(&payload); bindErr != nil {
 		return apperrors.BadRequest("error.invalid_request")
 	}
 	if validationErr := c.Validate(&payload); validationErr != nil {
 		return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(validationErr))
 	}
 	if verifyErr := h.authService.VerifyEmail(user.ID, payload.Code); verifyErr != nil {
 		log.Debug().Err(verifyErr).Uint("user_id", user.ID).Msg("Email verification failed")
 		return verifyErr
 	}
 	// Best-effort tips email: sent off the request path, and a panic or send
 	// failure is only logged.
 	if h.emailService != nil {
 		go func() {
 			defer func() {
 				if recovered := recover(); recovered != nil {
 					log.Error().Interface("panic", recovered).Str("email", user.Email).Msg("Panic in post-verification email goroutine")
 				}
 			}()
 			sendErr := h.emailService.SendPostVerificationEmail(user.Email, user.FirstName)
 			if sendErr != nil {
 				log.Error().Err(sendErr).Str("email", user.Email).Msg("Failed to send post-verification email")
 			}
 		}()
 	}
 	result := responses.VerifyEmailResponse{
 		Message:  "Email verified successfully",
 		Verified: true,
 	}
 	return c.JSON(http.StatusOK, result)
 }
 // ResendVerification handles POST /api/auth/resend-verification/.
 //
 // Obtains a verification code for the authenticated user from the auth
 // service and emails it in the background; the HTTP response does not wait
 // for the email to be delivered.
 func (h *AuthHandler) ResendVerification(c echo.Context) error {
 	authUser, authErr := middleware.MustGetAuthUser(c)
 	if authErr != nil {
 		return authErr
 	}
 	verificationCode, codeErr := h.authService.ResendVerificationCode(authUser.ID)
 	if codeErr != nil {
 		log.Debug().Err(codeErr).Uint("user_id", authUser.ID).Msg("Failed to resend verification")
 		return codeErr
 	}
 	// Fire-and-forget delivery; a panic or send failure is only logged.
 	if h.emailService != nil {
 		go func() {
 			defer func() {
 				if recovered := recover(); recovered != nil {
 					log.Error().Interface("panic", recovered).Str("email", authUser.Email).Msg("Panic in verification email goroutine")
 				}
 			}()
 			sendErr := h.emailService.SendVerificationEmail(authUser.Email, authUser.FirstName, verificationCode)
 			if sendErr != nil {
 				log.Error().Err(sendErr).Str("email", authUser.Email).Msg("Failed to send verification email")
 			}
 		}()
 	}
 	reply := responses.MessageResponse{Message: "Verification email sent"}
 	return c.JSON(http.StatusOK, reply)
 }
 // ForgotPassword handles POST /api/auth/forgot-password/.
 //
 // To prevent email enumeration the handler answers 200 with the same body
 // whether or not the address is known. The one error passed through to the
 // caller is a 429 from the auth service; any other service error is logged
 // and swallowed. The reset email is sent in the background only when the
 // service returned both a code and a user.
 func (h *AuthHandler) ForgotPassword(c echo.Context) error {
 	var payload requests.ForgotPasswordRequest
 	if bindErr := c.Bind(&payload); bindErr != nil {
 		return apperrors.BadRequest("error.invalid_request")
 	}
 	if validationErr := c.Validate(&payload); validationErr != nil {
 		return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(validationErr))
 	}
 	resetCode, account, err := h.authService.ForgotPassword(payload.Email)
 	if err != nil {
 		var appErr *apperrors.AppError
 		rateLimited := errors.As(err, &appErr) && appErr.Code == http.StatusTooManyRequests
 		if rateLimited {
 			// Rate limiting is the only failure the client is told about.
 			return err
 		}
 		// Everything else stays server-side so responses don't leak which
 		// addresses exist.
 		log.Error().Err(err).Str("email", payload.Email).Msg("Forgot password failed")
 	}
 	// Deliver the code off the request path, and only for a known user.
 	if h.emailService != nil && resetCode != "" && account != nil {
 		go func() {
 			defer func() {
 				if recovered := recover(); recovered != nil {
 					log.Error().Interface("panic", recovered).Str("email", account.Email).Msg("Panic in password reset email goroutine")
 				}
 			}()
 			sendErr := h.emailService.SendPasswordResetEmail(account.Email, account.FirstName, resetCode)
 			if sendErr != nil {
 				log.Error().Err(sendErr).Str("email", account.Email).Msg("Failed to send password reset email")
 			}
 		}()
 	}
 	if h.auditService != nil {
 		details := map[string]interface{}{
 			"email": payload.Email,
 		}
 		h.auditService.LogEvent(c, nil, services.AuditEventPasswordReset, details)
 	}
 	// Same success body regardless of outcome.
 	reply := responses.ForgotPasswordResponse{
 		Message: "Password reset email sent",
 	}
 	return c.JSON(http.StatusOK, reply)
 }
 // VerifyResetCode handles POST /api/auth/verify-reset-code/.
 //
 // Exchanges an email + reset code pair for a reset token via the auth
 // service and returns that token to the client.
 func (h *AuthHandler) VerifyResetCode(c echo.Context) error {
 	var payload requests.VerifyResetCodeRequest
 	if bindErr := c.Bind(&payload); bindErr != nil {
 		return apperrors.BadRequest("error.invalid_request")
 	}
 	if validationErr := c.Validate(&payload); validationErr != nil {
 		return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(validationErr))
 	}
 	token, verifyErr := h.authService.VerifyResetCode(payload.Email, payload.Code)
 	if verifyErr != nil {
 		log.Debug().Err(verifyErr).Str("email", payload.Email).Msg("Verify reset code failed")
 		return verifyErr
 	}
 	reply := responses.VerifyResetCodeResponse{
 		Message:    "Reset code verified",
 		ResetToken: token,
 	}
 	return c.JSON(http.StatusOK, reply)
 }
 // ResetPassword handles POST /api/auth/reset-password/.
 //
 // Applies the new password through the auth service using the supplied
 // reset token. A successful change is recorded as a password-changed audit
 // event with method "reset_token" and no user ID attached.
 func (h *AuthHandler) ResetPassword(c echo.Context) error {
 	var payload requests.ResetPasswordRequest
 	if bindErr := c.Bind(&payload); bindErr != nil {
 		return apperrors.BadRequest("error.invalid_request")
 	}
 	if validationErr := c.Validate(&payload); validationErr != nil {
 		return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(validationErr))
 	}
 	if resetErr := h.authService.ResetPassword(payload.ResetToken, payload.NewPassword); resetErr != nil {
 		log.Debug().Err(resetErr).Msg("Password reset failed")
 		return resetErr
 	}
 	if h.auditService != nil {
 		details := map[string]interface{}{
 			"method": "reset_token",
 		}
 		h.auditService.LogEvent(c, nil, services.AuditEventPasswordChanged, details)
 	}
 	reply := responses.ResetPasswordResponse{
 		Message: "Password reset successful",
 	}
 	return c.JSON(http.StatusOK, reply)
 }
 // AppleSignIn handles POST /api/auth/apple-sign-in/.
 //
 // The request is bound and validated first; only then is the Apple auth
 // service checked, yielding a 500 (error.apple_signin_not_configured) when
 // it was never injected. services.ErrAppleSignInFailed is translated to a
 // 401; every other service error is returned unchanged. New users with a
 // non-empty email get a welcome email sent in the background.
 func (h *AuthHandler) AppleSignIn(c echo.Context) error {
 	var payload requests.AppleSignInRequest
 	if bindErr := c.Bind(&payload); bindErr != nil {
 		return apperrors.BadRequest("error.invalid_request")
 	}
 	if validationErr := c.Validate(&payload); validationErr != nil {
 		return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(validationErr))
 	}
 	if h.appleAuthService == nil {
 		log.Error().Msg("Apple auth service not configured")
 		return &apperrors.AppError{
 			Code:       http.StatusInternalServerError,
 			MessageKey: "error.apple_signin_not_configured",
 		}
 	}
 	response, err := h.authService.AppleSignIn(c.Request().Context(), h.appleAuthService, &payload)
 	switch {
 	case err == nil:
 		// Signed in; continue below.
 	case errors.Is(err, services.ErrAppleSignInFailed):
 		// Legacy sentinel (not yet migrated): map it to a 401.
 		log.Debug().Err(err).Msg("Apple Sign In failed (legacy error)")
 		return apperrors.Unauthorized("error.invalid_apple_token")
 	default:
 		log.Debug().Err(err).Msg("Apple Sign In failed")
 		return err
 	}
 	// Welcome email for first-time users, sent off the request path.
 	if response.IsNewUser && h.emailService != nil && response.User.Email != "" {
 		go func() {
 			defer func() {
 				if recovered := recover(); recovered != nil {
 					log.Error().Interface("panic", recovered).Str("email", response.User.Email).Msg("Panic in Apple welcome email goroutine")
 				}
 			}()
 			sendErr := h.emailService.SendAppleWelcomeEmail(response.User.Email, response.User.FirstName)
 			if sendErr != nil {
 				log.Error().Err(sendErr).Str("email", response.User.Email).Msg("Failed to send Apple welcome email")
 			}
 		}()
 	}
 	return c.JSON(http.StatusOK, response)
 }
 // GoogleSignIn handles POST /api/auth/google-sign-in/.
 //
 // The request is bound and validated first; only then is the Google auth
 // service checked, yielding a 500 (error.google_signin_not_configured) when
 // it was never injected. services.ErrGoogleSignInFailed is translated to a
 // 401; every other service error is returned unchanged. New users with a
 // non-empty email get a welcome email sent in the background.
 func (h *AuthHandler) GoogleSignIn(c echo.Context) error {
 	var payload requests.GoogleSignInRequest
 	if bindErr := c.Bind(&payload); bindErr != nil {
 		return apperrors.BadRequest("error.invalid_request")
 	}
 	if validationErr := c.Validate(&payload); validationErr != nil {
 		return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(validationErr))
 	}
 	if h.googleAuthService == nil {
 		log.Error().Msg("Google auth service not configured")
 		return &apperrors.AppError{
 			Code:       http.StatusInternalServerError,
 			MessageKey: "error.google_signin_not_configured",
 		}
 	}
 	response, err := h.authService.GoogleSignIn(c.Request().Context(), h.googleAuthService, &payload)
 	switch {
 	case err == nil:
 		// Signed in; continue below.
 	case errors.Is(err, services.ErrGoogleSignInFailed):
 		// Legacy sentinel (not yet migrated): map it to a 401.
 		log.Debug().Err(err).Msg("Google Sign In failed (legacy error)")
 		return apperrors.Unauthorized("error.invalid_google_token")
 	default:
 		log.Debug().Err(err).Msg("Google Sign In failed")
 		return err
 	}
 	// Welcome email for first-time users, sent off the request path.
 	if response.IsNewUser && h.emailService != nil && response.User.Email != "" {
 		go func() {
 			defer func() {
 				if recovered := recover(); recovered != nil {
 					log.Error().Interface("panic", recovered).Str("email", response.User.Email).Msg("Panic in Google welcome email goroutine")
 				}
 			}()
 			sendErr := h.emailService.SendGoogleWelcomeEmail(response.User.Email, response.User.FirstName)
 			if sendErr != nil {
 				log.Error().Err(sendErr).Str("email", response.User.Email).Msg("Failed to send Google welcome email")
 			}
 		}()
 	}
 	return c.JSON(http.StatusOK, response)
 }
 // RefreshToken handles POST /api/auth/refresh/.
 //
 // Requires both an authenticated user and the presented token. When the
 // auth service hands back a different token than the one presented, the old
 // token is dropped from the cache (best-effort; a failure is only logged).
 func (h *AuthHandler) RefreshToken(c echo.Context) error {
 	authUser, authErr := middleware.MustGetAuthUser(c)
 	if authErr != nil {
 		return authErr
 	}
 	presented := middleware.GetAuthToken(c)
 	if presented == "" {
 		return apperrors.Unauthorized("error.not_authenticated")
 	}
 	refreshed, refreshErr := h.authService.RefreshToken(presented, authUser.ID)
 	if refreshErr != nil {
 		log.Debug().Err(refreshErr).Uint("user_id", authUser.ID).Msg("Token refresh failed")
 		return refreshErr
 	}
 	// A changed token means the presented one is stale: evict it from cache.
 	rotated := refreshed.Token != presented
 	if rotated && h.cache != nil {
 		evictErr := h.cache.InvalidateAuthToken(c.Request().Context(), presented)
 		if evictErr != nil {
 			log.Warn().Err(evictErr).Msg("Failed to invalidate old token from cache during refresh")
 		}
 	}
 	return c.JSON(http.StatusOK, refreshed)
 }
 // DeleteAccount handles DELETE /api/auth/account/
 func (h *AuthHandler) DeleteAccount(c echo.Context) error {
 	user, err := middleware.MustGetAuthUser(c)
@@ -497,7 +177,7 @@ func (h *AuthHandler) DeleteAccount(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_request")
 	}
-	fileURLs, err := h.authService.DeleteAccount(user.ID, req.Password, req.Confirmation)
+	fileURLs, err := h.authService.DeleteAccount(c.Request().Context(), user.ID, req.Password, req.Confirmation)
 	if err != nil {
 		log.Debug().Err(err).Uint("user_id", user.ID).Msg("Account deletion failed")
 		return err
@@ -527,13 +207,5 @@ func (h *AuthHandler) DeleteAccount(c echo.Context) error {
 		}()
 	}
 	// Invalidate auth token from cache
 	token := middleware.GetAuthToken(c)
 	if h.cache != nil && token != "" {
 		if err := h.cache.InvalidateAuthToken(c.Request().Context(), token); err != nil {
 			log.Warn().Err(err).Msg("Failed to invalidate token in cache after account deletion")
 		}
 	}
 	return c.JSON(http.StatusOK, responses.MessageResponse{Message: "Account deleted successfully"})
 }
@@ -35,26 +35,25 @@ func setupDeleteAccountHandler(t *testing.T) (*AuthHandler, *echo.Echo, *gorm.DB
 	return handler, e, db
 }
-func TestAuthHandler_DeleteAccount_EmailUser(t *testing.T) {
+// TestAuthHandler_DeleteAccount_WithConfirmation verifies that DELETE /account/
 // succeeds when the user sends confirmation: "DELETE".
 // Post-Kratos: all users (regardless of provider) must confirm with "DELETE".
 func TestAuthHandler_DeleteAccount_WithConfirmation(t *testing.T) {
 	handler, e, db := setupDeleteAccountHandler(t)
-	user := testutil.CreateTestUser(t, db, "deletetest", "delete@test.com", "Password123")
+	user := testutil.CreateTestUser(t, db, "deletetest", "delete@test.com", "ignored")
 	// Create profile for the user
 	profile := &models.UserProfile{UserID: user.ID, Verified: true}
 	require.NoError(t, db.Create(profile).Error)
 	// Create auth token
 	testutil.CreateTestToken(t, db, user.ID)
 	authGroup := e.Group("/api/auth")
 	authGroup.Use(testutil.MockAuthMiddleware(user))
 	authGroup.DELETE("/account/", handler.DeleteAccount)
-	t.Run("successful deletion with correct password", func(t *testing.T) {
+	t.Run("successful deletion with DELETE confirmation", func(t *testing.T) {
 		password := "Password123"
 		req := map[string]interface{}{
-			"password": password,
+			"confirmation": "DELETE",
 		}
 		w := testutil.MakeRequest(e, "DELETE", "/api/auth/account/", req, "test-token")
@@ -74,106 +73,15 @@ func TestAuthHandler_DeleteAccount_EmailUser(t *testing.T) {
 		// Verify profile is deleted
 		db.Model(&models.UserProfile{}).Where("user_id = ?", user.ID).Count(&count)
 		assert.Equal(t, int64(0), count)
 		// Verify auth token is deleted
 		db.Model(&models.AuthToken{}).Where("user_id = ?", user.ID).Count(&count)
 		assert.Equal(t, int64(0), count)
 	})
 }
-func TestAuthHandler_DeleteAccount_WrongPassword(t *testing.T) {
+// TestAuthHandler_DeleteAccount_MissingConfirmation verifies that a missing
 // confirmation string is rejected with 400.
 func TestAuthHandler_DeleteAccount_MissingConfirmation(t *testing.T) {
 	handler, e, db := setupDeleteAccountHandler(t)
-	user := testutil.CreateTestUser(t, db, "wrongpw", "wrongpw@test.com", "Password123")
+	user := testutil.CreateTestUser(t, db, "nopw", "nopw@test.com", "ignored")
 	authGroup := e.Group("/api/auth")
 	authGroup.Use(testutil.MockAuthMiddleware(user))
 	authGroup.DELETE("/account/", handler.DeleteAccount)
 	t.Run("wrong password returns 401", func(t *testing.T) {
 		wrongPw := "wrongpassword"
 		req := map[string]interface{}{
 			"password": wrongPw,
 		}
 		w := testutil.MakeRequest(e, "DELETE", "/api/auth/account/", req, "test-token")
 		testutil.AssertStatusCode(t, w, http.StatusUnauthorized)
 	})
 }
 // TestAuthHandler_DeleteAccount_MissingPassword checks that DELETE
 // /api/auth/account/ with an empty JSON object as its body is answered
 // with 400.
 func TestAuthHandler_DeleteAccount_MissingPassword(t *testing.T) {
 	handler, router, db := setupDeleteAccountHandler(t)
 	account := testutil.CreateTestUser(t, db, "nopw", "nopw@test.com", "Password123")
 	group := router.Group("/api/auth")
 	group.Use(testutil.MockAuthMiddleware(account))
 	group.DELETE("/account/", handler.DeleteAccount)
 	t.Run("missing password returns 400", func(t *testing.T) {
 		emptyBody := map[string]interface{}{}
 		rec := testutil.MakeRequest(router, "DELETE", "/api/auth/account/", emptyBody, "test-token")
 		testutil.AssertStatusCode(t, rec, http.StatusBadRequest)
 	})
 }
 // TestAuthHandler_DeleteAccount_SocialAuthUser checks that a user with an
 // Apple social auth record can delete the account by sending
 // confirmation "DELETE", and that the user row and the Apple record are
 // both gone afterwards.
 func TestAuthHandler_DeleteAccount_SocialAuthUser(t *testing.T) {
 	handler, router, db := setupDeleteAccountHandler(t)
 	account := testutil.CreateTestUser(t, db, "appleuser", "apple@test.com", "randompassword")
 	// Link an Apple social auth record to the user.
 	appleLink := &models.AppleSocialAuth{
 		UserID:  account.ID,
 		AppleID: "apple_sub_123",
 		Email:   "apple@test.com",
 	}
 	require.NoError(t, db.Create(appleLink).Error)
 	// Give the user a verified profile.
 	require.NoError(t, db.Create(&models.UserProfile{UserID: account.ID, Verified: true}).Error)
 	group := router.Group("/api/auth")
 	group.Use(testutil.MockAuthMiddleware(account))
 	group.DELETE("/account/", handler.DeleteAccount)
 	t.Run("successful deletion with DELETE confirmation", func(t *testing.T) {
 		body := map[string]interface{}{"confirmation": "DELETE"}
 		rec := testutil.MakeRequest(router, "DELETE", "/api/auth/account/", body, "test-token")
 		testutil.AssertStatusCode(t, rec, http.StatusOK)
 		var remaining int64
 		// The user row must be gone.
 		db.Model(&models.User{}).Where("id = ?", account.ID).Count(&remaining)
 		assert.Equal(t, int64(0), remaining)
 		// The Apple social auth row must be gone as well.
 		db.Model(&models.AppleSocialAuth{}).Where("user_id = ?", account.ID).Count(&remaining)
 		assert.Equal(t, int64(0), remaining)
 	})
 }
 func TestAuthHandler_DeleteAccount_SocialAuthMissingConfirmation(t *testing.T) {
 	handler, e, db := setupDeleteAccountHandler(t)
 	user := testutil.CreateTestUser(t, db, "googleuser", "google@test.com", "randompassword")
 	// Create Google social auth record
 	googleAuth := &models.GoogleSocialAuth{
 		UserID:   user.ID,
 		GoogleID: "google_sub_456",
 		Email:    "google@test.com",
 	}
 	require.NoError(t, db.Create(googleAuth).Error)
 	authGroup := e.Group("/api/auth")
 	authGroup.Use(testutil.MockAuthMiddleware(user))
@@ -188,9 +96,8 @@ func TestAuthHandler_DeleteAccount_SocialAuthMissingConfirmation(t *testing.T) {
 	})
 	t.Run("wrong confirmation returns 400", func(t *testing.T) {
 		wrongConfirmation := "delete"
 		req := map[string]interface{}{
-			"confirmation": wrongConfirmation,
+			"confirmation": "delete", // lowercase — must be exact "DELETE"
 		}
 		w := testutil.MakeRequest(e, "DELETE", "/api/auth/account/", req, "test-token")
@@ -199,6 +106,8 @@ func TestAuthHandler_DeleteAccount_SocialAuthMissingConfirmation(t *testing.T) {
 	})
 }
 // TestAuthHandler_DeleteAccount_Unauthenticated verifies that 401 is returned
 // when no auth middleware is set.
 func TestAuthHandler_DeleteAccount_Unauthenticated(t *testing.T) {
 	handler, e, _ := setupDeleteAccountHandler(t)
@@ -207,7 +116,7 @@ func TestAuthHandler_DeleteAccount_Unauthenticated(t *testing.T) {
 	t.Run("unauthenticated request returns 401", func(t *testing.T) {
 		req := map[string]interface{}{
-			"password": "Password123",
+			"confirmation": "DELETE",
 		}
 		w := testutil.MakeRequest(e, "DELETE", "/api/auth/account/", req, "")
@@ -1,3 +1,7 @@
 // auth_handler_test.go tests the auth handler endpoints that survived the
 // Ory Kratos migration: GET /me/ and PUT/PATCH /profile/.
 // Login, register, logout, forgot-password, and social sign-in are now
 // handled by Kratos.
 package handlers
 import (
@@ -34,204 +38,32 @@ func setupAuthHandler(t *testing.T) (*AuthHandler, *echo.Echo, *repositories.Use
 	return handler, e, userRepo
 }
 // TestAuthHandler_Register exercises POST /api/auth/register/ against the
 // handler built by setupAuthHandler. It covers a successful registration,
 // a request with missing fields, a too-short password, and duplicate
 // username / email conflicts.
 //
 // All subtests share the same router and handler, so usernames and emails
 // are kept distinct between subtests; the duplicate cases register once and
 // then repeat the request with only the other unique field changed.
 func TestAuthHandler_Register(t *testing.T) {
 	handler, e, _ := setupAuthHandler(t)
 	e.POST("/api/auth/register/", handler.Register)
 	t.Run("successful registration", func(t *testing.T) {
 		req := requests.RegisterRequest{
 			Username:  "newuser",
 			Email:     "new@test.com",
 			Password:  "Password123",
 			FirstName: "New",
 			LastName:  "User",
 		}
 		w := testutil.MakeRequest(e, "POST", "/api/auth/register/", req, "")
 		testutil.AssertStatusCode(t, w, http.StatusCreated)
 		var response map[string]interface{}
 		err := json.Unmarshal(w.Body.Bytes(), &response)
 		require.NoError(t, err)
 		testutil.AssertJSONFieldExists(t, response, "token")
 		testutil.AssertJSONFieldExists(t, response, "user")
 		testutil.AssertJSONFieldExists(t, response, "message")
 		// The echoed user object must mirror what was submitted.
 		user := response["user"].(map[string]interface{})
 		assert.Equal(t, "newuser", user["username"])
 		assert.Equal(t, "new@test.com", user["email"])
 		assert.Equal(t, "New", user["first_name"])
 		assert.Equal(t, "User", user["last_name"])
 	})
 	t.Run("registration with missing fields", func(t *testing.T) {
 		req := map[string]string{
 			"username": "test",
 			// Missing email and password
 		}
 		w := testutil.MakeRequest(e, "POST", "/api/auth/register/", req, "")
 		testutil.AssertStatusCode(t, w, http.StatusBadRequest)
 		response := testutil.ParseJSON(t, w.Body.Bytes())
 		testutil.AssertJSONFieldExists(t, response, "error")
 	})
 	t.Run("registration with short password", func(t *testing.T) {
 		req := requests.RegisterRequest{
 			Username: "testuser",
 			Email:    "test@test.com",
 			Password: "short", // Less than 8 chars
 		}
 		w := testutil.MakeRequest(e, "POST", "/api/auth/register/", req, "")
 		testutil.AssertStatusCode(t, w, http.StatusBadRequest)
 	})
 	t.Run("registration with duplicate username", func(t *testing.T) {
 		// First registration
 		req := requests.RegisterRequest{
 			Username: "duplicate",
 			Email:    "unique1@test.com",
 			Password: "Password123",
 		}
 		w := testutil.MakeRequest(e, "POST", "/api/auth/register/", req, "")
 		testutil.AssertStatusCode(t, w, http.StatusCreated)
 		// Try to register again with same username
 		req.Email = "unique2@test.com"
 		w = testutil.MakeRequest(e, "POST", "/api/auth/register/", req, "")
 		testutil.AssertStatusCode(t, w, http.StatusConflict) // 409 for duplicate resource
 		response := testutil.ParseJSON(t, w.Body.Bytes())
 		assert.Contains(t, response["error"], "Username already taken")
 	})
 	t.Run("registration with duplicate email", func(t *testing.T) {
 		// First registration
 		req := requests.RegisterRequest{
 			Username: "user1",
 			Email:    "duplicate@test.com",
 			Password: "Password123",
 		}
 		w := testutil.MakeRequest(e, "POST", "/api/auth/register/", req, "")
 		testutil.AssertStatusCode(t, w, http.StatusCreated)
 		// Try to register again with same email
 		req.Username = "user2"
 		w = testutil.MakeRequest(e, "POST", "/api/auth/register/", req, "")
 		testutil.AssertStatusCode(t, w, http.StatusConflict) // 409 for duplicate resource
 		response := testutil.ParseJSON(t, w.Body.Bytes())
 		assert.Contains(t, response["error"], "Email already registered")
 	})
 }
 // TestAuthHandler_Login exercises POST /api/auth/login/. A user is first
 // registered through the register endpoint, and every subtest then logs in
 // (or fails to) against that same router: by username, by email, with a
 // wrong password, as an unknown user, and with the password field missing.
 func TestAuthHandler_Login(t *testing.T) {
 	handler, e, _ := setupAuthHandler(t)
 	e.POST("/api/auth/register/", handler.Register)
 	e.POST("/api/auth/login/", handler.Login)
 	// Create a test user
 	registerReq := requests.RegisterRequest{
 		Username:  "logintest",
 		Email:     "login@test.com",
 		Password:  "Password123",
 		FirstName: "Test",
 		LastName:  "User",
 	}
 	w := testutil.MakeRequest(e, "POST", "/api/auth/register/", registerReq, "")
 	testutil.AssertStatusCode(t, w, http.StatusCreated)
 	t.Run("successful login with username", func(t *testing.T) {
 		req := requests.LoginRequest{
 			Username: "logintest",
 			Password: "Password123",
 		}
 		w := testutil.MakeRequest(e, "POST", "/api/auth/login/", req, "")
 		testutil.AssertStatusCode(t, w, http.StatusOK)
 		var response map[string]interface{}
 		err := json.Unmarshal(w.Body.Bytes(), &response)
 		require.NoError(t, err)
 		testutil.AssertJSONFieldExists(t, response, "token")
 		testutil.AssertJSONFieldExists(t, response, "user")
 		// The returned user must be the one registered above.
 		user := response["user"].(map[string]interface{})
 		assert.Equal(t, "logintest", user["username"])
 		assert.Equal(t, "login@test.com", user["email"])
 	})
 	t.Run("successful login with email", func(t *testing.T) {
 		req := requests.LoginRequest{
 			Username: "login@test.com", // Using email as username
 			Password: "Password123",
 		}
 		w := testutil.MakeRequest(e, "POST", "/api/auth/login/", req, "")
 		testutil.AssertStatusCode(t, w, http.StatusOK)
 	})
 	t.Run("login with wrong password", func(t *testing.T) {
 		req := requests.LoginRequest{
 			Username: "logintest",
 			Password: "wrongpassword",
 		}
 		w := testutil.MakeRequest(e, "POST", "/api/auth/login/", req, "")
 		testutil.AssertStatusCode(t, w, http.StatusUnauthorized)
 		response := testutil.ParseJSON(t, w.Body.Bytes())
 		assert.Contains(t, response["error"], "Invalid credentials")
 	})
 	t.Run("login with non-existent user", func(t *testing.T) {
 		req := requests.LoginRequest{
 			Username: "nonexistent",
 			Password: "Password123",
 		}
 		w := testutil.MakeRequest(e, "POST", "/api/auth/login/", req, "")
 		testutil.AssertStatusCode(t, w, http.StatusUnauthorized)
 	})
 	t.Run("login with missing fields", func(t *testing.T) {
 		req := map[string]string{
 			"username": "logintest",
 			// Missing password
 		}
 		w := testutil.MakeRequest(e, "POST", "/api/auth/login/", req, "")
 		testutil.AssertStatusCode(t, w, http.StatusBadRequest)
 	})
 }
 func TestAuthHandler_CurrentUser(t *testing.T) {
-	handler, e, userRepo := setupAuthHandler(t)
+	handler, e, _ := setupAuthHandler(t)
 	db := testutil.SetupTestDB(t)
-	user := testutil.CreateTestUser(t, db, "metest", "me@test.com", "Password123")
+	user := testutil.CreateTestUser(t, db, "metest", "me@test.com", "")
 	user.FirstName = "Test"
 	user.LastName = "User"
-	userRepo.Update(user)
+	// Use the userRepo from setupAuthHandler's DB, but since we need the user
 	// in the same DB we re-create it there.
 	db2 := testutil.SetupTestDB(t)
 	user2 := testutil.CreateTestUser(t, db2, "metest2", "me2@test.com", "")
 	user2.FirstName = "Test"
 	user2.LastName = "User"
 	userRepo2 := repositories.NewUserRepository(db2)
 	require.NoError(t, userRepo2.Update(user2))
 	// Build handler against db2
 	cfg := &config.Config{}
 	authService2 := services.NewAuthService(userRepo2, cfg)
 	handler2 := NewAuthHandler(authService2, nil, nil)
 	// Set up route with mock auth middleware
 	authGroup := e.Group("/api/auth")
-	authGroup.Use(testutil.MockAuthMiddleware(user))
+	authGroup.Use(testutil.MockAuthMiddleware(user2))
-	authGroup.GET("/me/", handler.CurrentUser)
+	authGroup.GET("/me/", handler2.CurrentUser)
 	_ = handler // avoid unused
 	t.Run("get current user", func(t *testing.T) {
 		w := testutil.MakeRequest(e, "GET", "/api/auth/me/", nil, "test-token")
@@ -242,23 +74,26 @@ func TestAuthHandler_CurrentUser(t *testing.T) {
 		err := json.Unmarshal(w.Body.Bytes(), &response)
 		require.NoError(t, err)
-		assert.Equal(t, "metest", response["username"])
+		assert.Equal(t, "metest2", response["username"])
-		assert.Equal(t, "me@test.com", response["email"])
+		assert.Equal(t, "me2@test.com", response["email"])
 	})
 }
 func TestAuthHandler_UpdateProfile(t *testing.T) {
 	handler, e, userRepo := setupAuthHandler(t)
 	db := testutil.SetupTestDB(t)
-	user := testutil.CreateTestUser(t, db, "updatetest", "update@test.com", "Password123")
+	userRepo := repositories.NewUserRepository(db)
-	userRepo.Update(user)
+	cfg := &config.Config{}
 	authService := services.NewAuthService(userRepo, cfg)
 	handler := NewAuthHandler(authService, nil, nil)
 	e := testutil.SetupTestRouter()
 	user := testutil.CreateTestUser(t, db, "updatetest", "update@test.com", "")
 	authGroup := e.Group("/api/auth")
 	authGroup.Use(testutil.MockAuthMiddleware(user))
 	authGroup.PUT("/profile/", handler.UpdateProfile)
-	t.Run("update profile", func(t *testing.T) {
+	t.Run("update first and last name", func(t *testing.T) {
 		firstName := "Updated"
 		lastName := "Name"
 		req := requests.UpdateProfileRequest{
@@ -278,130 +113,3 @@ func TestAuthHandler_UpdateProfile(t *testing.T) {
 		assert.Equal(t, "Name", response["last_name"])
 	})
 }
 // TestAuthHandler_ForgotPassword checks that POST
 // /api/auth/forgot-password/ answers 200 both for a registered email and
 // for an unknown one, so the response does not reveal which emails exist.
 func TestAuthHandler_ForgotPassword(t *testing.T) {
 	handler, router, _ := setupAuthHandler(t)
 	const (
 		registerPath = "/api/auth/register/"
 		forgotPath   = "/api/auth/forgot-password/"
 	)
 	router.POST(registerPath, handler.Register)
 	router.POST(forgotPath, handler.ForgotPassword)
 	// Seed the account whose email the first subtest asks about.
 	seed := requests.RegisterRequest{
 		Username: "forgottest",
 		Email:    "forgot@test.com",
 		Password: "Password123",
 	}
 	testutil.MakeRequest(router, "POST", registerPath, seed, "")
 	t.Run("forgot password with valid email", func(t *testing.T) {
 		body := requests.ForgotPasswordRequest{Email: "forgot@test.com"}
 		rec := testutil.MakeRequest(router, "POST", forgotPath, body, "")
 		// Always returns 200 to prevent email enumeration
 		testutil.AssertStatusCode(t, rec, http.StatusOK)
 		payload := testutil.ParseJSON(t, rec.Body.Bytes())
 		testutil.AssertJSONFieldExists(t, payload, "message")
 	})
 	t.Run("forgot password with invalid email", func(t *testing.T) {
 		body := requests.ForgotPasswordRequest{Email: "nonexistent@test.com"}
 		rec := testutil.MakeRequest(router, "POST", forgotPath, body, "")
 		// Still returns 200 to prevent email enumeration
 		testutil.AssertStatusCode(t, rec, http.StatusOK)
 	})
 }
 // TestAuthHandler_Logout checks that POST /api/auth/logout/ for a
 // mock-authenticated user answers 200 with a "Logged out successfully"
 // message.
 func TestAuthHandler_Logout(t *testing.T) {
 	handler, router, repo := setupAuthHandler(t)
 	db := testutil.SetupTestDB(t)
 	account := testutil.CreateTestUser(t, db, "logouttest", "logout@test.com", "Password123")
 	// NOTE(review): the result of Update is discarded here.
 	repo.Update(account)
 	group := router.Group("/api/auth")
 	group.Use(testutil.MockAuthMiddleware(account))
 	group.POST("/logout/", handler.Logout)
 	t.Run("successful logout", func(t *testing.T) {
 		rec := testutil.MakeRequest(router, "POST", "/api/auth/logout/", nil, "test-token")
 		testutil.AssertStatusCode(t, rec, http.StatusOK)
 		payload := testutil.ParseJSON(t, rec.Body.Bytes())
 		assert.Contains(t, payload["message"], "Logged out successfully")
 	})
 }
 // TestAuthHandler_JSONResponses pins the JSON shape of the register
 // endpoint: the keys and value types of a successful response, and the
 // string "error" field of a failed one.
 func TestAuthHandler_JSONResponses(t *testing.T) {
 	handler, router, _ := setupAuthHandler(t)
 	router.POST("/api/auth/register/", handler.Register)
 	router.POST("/api/auth/login/", handler.Login)
 	t.Run("register response has correct JSON structure", func(t *testing.T) {
 		body := requests.RegisterRequest{
 			Username:  "jsontest",
 			Email:     "json@test.com",
 			Password:  "Password123",
 			FirstName: "JSON",
 			LastName:  "Test",
 		}
 		rec := testutil.MakeRequest(router, "POST", "/api/auth/register/", body, "")
 		testutil.AssertStatusCode(t, rec, http.StatusCreated)
 		var payload map[string]interface{}
 		require.NoError(t, json.Unmarshal(rec.Body.Bytes(), &payload))
 		// Top-level keys.
 		for _, key := range []string{"token", "user", "message"} {
 			assert.Contains(t, payload, key)
 		}
 		// The token must carry a value.
 		assert.NotEmpty(t, payload["token"])
 		// Keys of the nested user object.
 		user := payload["user"].(map[string]interface{})
 		for _, key := range []string{
 			"id", "username", "email", "first_name", "last_name", "is_active", "date_joined",
 		} {
 			assert.Contains(t, user, key)
 		}
 		// Value types after JSON decoding.
 		assert.IsType(t, float64(0), user["id"]) // JSON numbers are float64
 		assert.IsType(t, "", user["username"])
 		assert.IsType(t, "", user["email"])
 		assert.IsType(t, true, user["is_active"])
 	})
 	t.Run("error response has correct JSON structure", func(t *testing.T) {
 		body := map[string]string{"username": "test"}
 		rec := testutil.MakeRequest(router, "POST", "/api/auth/register/", body, "")
 		testutil.AssertStatusCode(t, rec, http.StatusBadRequest)
 		var payload map[string]interface{}
 		require.NoError(t, json.Unmarshal(rec.Body.Bytes(), &payload))
 		assert.Contains(t, payload, "error")
 		assert.IsType(t, "", payload["error"])
 	})
 }
@@ -30,7 +30,7 @@ func (h *ContractorHandler) ListContractors(c echo.Context) error {
 	if err != nil {
 		return err
 	}
-	response, err := h.contractorService.ListContractors(user.ID)
+	response, err := h.contractorService.ListContractors(c.Request().Context(), user.ID)
 	if err != nil {
 		return apperrors.Internal(err)
 	}
@@ -48,7 +48,7 @@ func (h *ContractorHandler) GetContractor(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_contractor_id")
 	}
-	response, err := h.contractorService.GetContractor(uint(contractorID), user.ID)
+	response, err := h.contractorService.GetContractor(c.Request().Context(), uint(contractorID), user.ID)
 	if err != nil {
 		return err
 	}
@@ -69,7 +69,7 @@ func (h *ContractorHandler) CreateContractor(c echo.Context) error {
 		return err
 	}
-	response, err := h.contractorService.CreateContractor(&req, user.ID)
+	response, err := h.contractorService.CreateContractor(c.Request().Context(), &req, user.ID)
 	if err != nil {
 		return err
 	}
@@ -95,7 +95,7 @@ func (h *ContractorHandler) UpdateContractor(c echo.Context) error {
 		return err
 	}
-	response, err := h.contractorService.UpdateContractor(uint(contractorID), user.ID, &req)
+	response, err := h.contractorService.UpdateContractor(c.Request().Context(), uint(contractorID), user.ID, &req)
 	if err != nil {
 		return err
 	}
@@ -113,7 +113,7 @@ func (h *ContractorHandler) DeleteContractor(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_contractor_id")
 	}
-	err = h.contractorService.DeleteContractor(uint(contractorID), user.ID)
+	err = h.contractorService.DeleteContractor(c.Request().Context(), uint(contractorID), user.ID)
 	if err != nil {
 		return err
 	}
@@ -131,7 +131,7 @@ func (h *ContractorHandler) ToggleFavorite(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_contractor_id")
 	}
-	response, err := h.contractorService.ToggleFavorite(uint(contractorID), user.ID)
+	response, err := h.contractorService.ToggleFavorite(c.Request().Context(), uint(contractorID), user.ID)
 	if err != nil {
 		return err
 	}
@@ -149,7 +149,7 @@ func (h *ContractorHandler) GetContractorTasks(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_contractor_id")
 	}
-	response, err := h.contractorService.GetContractorTasks(uint(contractorID), user.ID)
+	response, err := h.contractorService.GetContractorTasks(c.Request().Context(), uint(contractorID), user.ID)
 	if err != nil {
 		return err
 	}
@@ -167,7 +167,7 @@ func (h *ContractorHandler) ListContractorsByResidence(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_residence_id")
 	}
-	response, err := h.contractorService.ListContractorsByResidence(uint(residenceID), user.ID)
+	response, err := h.contractorService.ListContractorsByResidence(c.Request().Context(), uint(residenceID), user.ID)
 	if err != nil {
 		return err
 	}
@@ -176,7 +176,7 @@ func (h *ContractorHandler) ListContractorsByResidence(c echo.Context) error {
 // GetSpecialties handles GET /api/contractors/specialties/
 func (h *ContractorHandler) GetSpecialties(c echo.Context) error {
-	specialties, err := h.contractorService.GetSpecialties()
+	specialties, err := h.contractorService.GetSpecialties(c.Request().Context())
 	if err != nil {
 		return apperrors.Internal(err)
 	}
@@ -70,7 +70,7 @@ func (h *DocumentHandler) ListDocuments(c echo.Context) error {
 		}
 	}
-	response, err := h.documentService.ListDocuments(user.ID, filter)
+	response, err := h.documentService.ListDocuments(c.Request().Context(), user.ID, filter)
 	if err != nil {
 		return err
 	}
@@ -88,7 +88,7 @@ func (h *DocumentHandler) GetDocument(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_document_id")
 	}
-	response, err := h.documentService.GetDocument(uint(documentID), user.ID)
+	response, err := h.documentService.GetDocument(c.Request().Context(), uint(documentID), user.ID)
 	if err != nil {
 		return err
 	}
@@ -101,7 +101,7 @@ func (h *DocumentHandler) ListWarranties(c echo.Context) error {
 	if err != nil {
 		return err
 	}
-	response, err := h.documentService.ListWarranties(user.ID)
+	response, err := h.documentService.ListWarranties(c.Request().Context(), user.ID)
 	if err != nil {
 		return apperrors.Internal(err)
 	}
@@ -201,7 +201,7 @@ func (h *DocumentHandler) CreateDocument(c echo.Context) error {
 			if h.storageService == nil {
 				return apperrors.Internal(nil)
 			}
-			result, err := h.storageService.Upload(uploadedFile, "documents")
+			result, err := h.storageService.Upload(c.Request().Context(), uploadedFile, "documents")
 			if err != nil {
 				return apperrors.BadRequest("error.failed_to_upload_file")
 			}
@@ -222,7 +222,7 @@ func (h *DocumentHandler) CreateDocument(c echo.Context) error {
 		return err
 	}
-	response, err := h.documentService.CreateDocument(&req, user.ID)
+	response, err := h.documentService.CreateDocument(c.Request().Context(), &req, user.ID)
 	if err != nil {
 		return err
 	}
@@ -248,7 +248,7 @@ func (h *DocumentHandler) UpdateDocument(c echo.Context) error {
 		return err
 	}
-	response, err := h.documentService.UpdateDocument(uint(documentID), user.ID, &req)
+	response, err := h.documentService.UpdateDocument(c.Request().Context(), uint(documentID), user.ID, &req)
 	if err != nil {
 		return err
 	}
@@ -266,7 +266,7 @@ func (h *DocumentHandler) DeleteDocument(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_document_id")
 	}
-	err = h.documentService.DeleteDocument(uint(documentID), user.ID)
+	err = h.documentService.DeleteDocument(c.Request().Context(), uint(documentID), user.ID)
 	if err != nil {
 		return err
 	}
@@ -284,7 +284,7 @@ func (h *DocumentHandler) ActivateDocument(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_document_id")
 	}
-	response, err := h.documentService.ActivateDocument(uint(documentID), user.ID)
+	response, err := h.documentService.ActivateDocument(c.Request().Context(), uint(documentID), user.ID)
 	if err != nil {
 		return err
 	}
@@ -302,7 +302,7 @@ func (h *DocumentHandler) DeactivateDocument(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_document_id")
 	}
-	response, err := h.documentService.DeactivateDocument(uint(documentID), user.ID)
+	response, err := h.documentService.DeactivateDocument(c.Request().Context(), uint(documentID), user.ID)
 	if err != nil {
 		return err
 	}
@@ -342,14 +342,14 @@ func (h *DocumentHandler) UploadDocumentImage(c echo.Context) error {
 		return apperrors.Internal(nil)
 	}
-	result, err := h.storageService.Upload(uploadedFile, "images")
+	result, err := h.storageService.Upload(c.Request().Context(), uploadedFile, "images")
 	if err != nil {
 		return apperrors.BadRequest("error.failed_to_upload_file")
 	}
 	caption := c.FormValue("caption")
-	response, err := h.documentService.UploadDocumentImage(uint(documentID), user.ID, result.URL, caption)
+	response, err := h.documentService.UploadDocumentImage(c.Request().Context(), uint(documentID), user.ID, result.URL, caption)
 	if err != nil {
 		return err
 	}
@@ -372,7 +372,7 @@ func (h *DocumentHandler) DeleteDocumentImage(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_image_id")
 	}
-	response, err := h.documentService.DeleteDocumentImage(uint(documentID), uint(imageID), user.ID)
+	response, err := h.documentService.DeleteDocumentImage(c.Request().Context(), uint(documentID), uint(imageID), user.ID)
 	if err != nil {
 		return err
 	}
@@ -506,232 +506,6 @@ func TestTaskHandler_CreateCompletion_NoTaskID(t *testing.T) {
 	})
 }
 // =============================================================================
 // Auth Handler - Additional Coverage
 // =============================================================================
 // TestAuthHandler_AppleSignIn_NotConfigured exercises POST
 // /api/auth/apple-sign-in/ on the handler built by setupAuthHandler: a
 // request carrying id_token and user_id is expected to yield 500, and an
 // empty JSON object 400.
 func TestAuthHandler_AppleSignIn_NotConfigured(t *testing.T) {
 	handler, router, _ := setupAuthHandler(t)
 	const signInPath = "/api/auth/apple-sign-in/"
 	router.POST(signInPath, handler.AppleSignIn)
 	t.Run("returns 500 when apple auth not configured", func(t *testing.T) {
 		body := map[string]interface{}{
 			"id_token": "fake-token",
 			"user_id":  "fake-user-id",
 		}
 		rec := testutil.MakeRequest(router, "POST", signInPath, body, "")
 		testutil.AssertStatusCode(t, rec, http.StatusInternalServerError)
 	})
 	t.Run("missing identity_token returns 400", func(t *testing.T) {
 		emptyBody := map[string]interface{}{}
 		rec := testutil.MakeRequest(router, "POST", signInPath, emptyBody, "")
 		testutil.AssertStatusCode(t, rec, http.StatusBadRequest)
 	})
 }
 // TestAuthHandler_GoogleSignIn_NotConfigured exercises POST
 // /api/auth/google-sign-in/ on the handler built by setupAuthHandler: a
 // request carrying id_token is expected to yield 500, and an empty JSON
 // object 400.
 func TestAuthHandler_GoogleSignIn_NotConfigured(t *testing.T) {
 	handler, router, _ := setupAuthHandler(t)
 	const signInPath = "/api/auth/google-sign-in/"
 	router.POST(signInPath, handler.GoogleSignIn)
 	t.Run("returns 500 when google auth not configured", func(t *testing.T) {
 		body := map[string]interface{}{"id_token": "fake-token"}
 		rec := testutil.MakeRequest(router, "POST", signInPath, body, "")
 		testutil.AssertStatusCode(t, rec, http.StatusInternalServerError)
 	})
 	t.Run("missing id_token returns 400", func(t *testing.T) {
 		emptyBody := map[string]interface{}{}
 		rec := testutil.MakeRequest(router, "POST", signInPath, emptyBody, "")
 		testutil.AssertStatusCode(t, rec, http.StatusBadRequest)
 	})
 }
 // setupAuthHandlerWithDB builds an AuthHandler on top of a fresh test
 // database and returns the handler, a test router, and the *gorm.DB itself,
 // so that tests can insert rows such as ConfirmationCode directly.
 func setupAuthHandlerWithDB(t *testing.T) (*AuthHandler, *echo.Echo, *gorm.DB) {
 	db := testutil.SetupTestDB(t)
 	repo := repositories.NewUserRepository(db)
 	security := config.SecurityConfig{
 		SecretKey:            "test-secret-key",
 		PasswordResetExpiry:  15 * time.Minute,
 		ConfirmationExpiry:   24 * time.Hour,
 		MaxPasswordResetRate: 3,
 	}
 	svc := services.NewAuthService(repo, &config.Config{Security: security})
 	// No cache or other optional collaborators are wired in for these tests.
 	authHandler := NewAuthHandler(svc, nil, nil)
 	router := testutil.SetupTestRouter()
 	return authHandler, router, db
 }
 // TestAuthHandler_VerifyEmail exercises POST /api/auth/verify-email/ for a
 // mock-authenticated user that has one unused, unexpired confirmation code
 // ("123456") stored in the database.
 //
 // Subtest order matters: the first subtest submits the stored code, and the
 // following ones run against whatever state that leaves behind.
 func TestAuthHandler_VerifyEmail(t *testing.T) {
 	handler, e, db := setupAuthHandlerWithDB(t)
 	user := testutil.CreateTestUser(t, db, "verifytest", "verify@test.com", "Password123")
 	// Create confirmation code
 	confirmCode := &models.ConfirmationCode{
 		UserID:    user.ID,
 		Code:      "123456",
 		ExpiresAt: time.Now().Add(24 * time.Hour),
 		IsUsed:    false,
 	}
 	require.NoError(t, db.Create(confirmCode).Error)
 	authGroup := e.Group("/api/auth")
 	authGroup.Use(testutil.MockAuthMiddleware(user))
 	authGroup.POST("/verify-email/", handler.VerifyEmail)
 	t.Run("successful verification", func(t *testing.T) {
 		req := requests.VerifyEmailRequest{
 			Code: "123456",
 		}
 		w := testutil.MakeRequest(e, "POST", "/api/auth/verify-email/", req, "test-token")
 		testutil.AssertStatusCode(t, w, http.StatusOK)
 		var response map[string]interface{}
 		err := json.Unmarshal(w.Body.Bytes(), &response)
 		require.NoError(t, err)
 		assert.Equal(t, true, response["verified"])
 	})
 	t.Run("wrong code returns error", func(t *testing.T) {
 		req := requests.VerifyEmailRequest{
 			Code: "999999",
 		}
 		w := testutil.MakeRequest(e, "POST", "/api/auth/verify-email/", req, "test-token")
 		// Code already used or wrong code
 		// Either status is accepted; the test does not pin which one the handler picks.
 		assert.True(t, w.Code == http.StatusBadRequest || w.Code == http.StatusNotFound,
 			"expected 400 or 404, got %d", w.Code)
 	})
 	t.Run("missing code returns 400", func(t *testing.T) {
 		req := map[string]interface{}{}
 		w := testutil.MakeRequest(e, "POST", "/api/auth/verify-email/", req, "test-token")
 		testutil.AssertStatusCode(t, w, http.StatusBadRequest)
 	})
 }
 // TestAuthHandler_ResendVerification checks that POST
 // /api/auth/resend-verification/ for a mock-authenticated user answers 200
 // with a JSON body containing a "message" key.
 func TestAuthHandler_ResendVerification(t *testing.T) {
 	handler, router, db := setupAuthHandlerWithDB(t)
 	account := testutil.CreateTestUser(t, db, "resendtest", "resend@test.com", "Password123")
 	group := router.Group("/api/auth")
 	group.Use(testutil.MockAuthMiddleware(account))
 	group.POST("/resend-verification/", handler.ResendVerification)
 	t.Run("successful resend", func(t *testing.T) {
 		rec := testutil.MakeRequest(router, "POST", "/api/auth/resend-verification/", nil, "test-token")
 		testutil.AssertStatusCode(t, rec, http.StatusOK)
 		var payload map[string]interface{}
 		require.NoError(t, json.Unmarshal(rec.Body.Bytes(), &payload))
 		assert.Contains(t, payload, "message")
 	})
 }
 // TestAuthHandler_RefreshToken checks that POST /api/auth/refresh/ answers
 // 200 with a "token" key when the request context carries the user and the
 // key of a token row that exists in the database.
 func TestAuthHandler_RefreshToken(t *testing.T) {
 	handler, router, db := setupAuthHandlerWithDB(t)
 	account := testutil.CreateTestUser(t, db, "refreshtest", "refresh@test.com", "Password123")
 	// A stored token whose real key is injected into the request context below.
 	stored := testutil.CreateTestToken(t, db, account.ID)
 	injectAuth := func(next echo.HandlerFunc) echo.HandlerFunc {
 		return func(c echo.Context) error {
 			c.Set("auth_user", account)
 			c.Set("auth_token", stored.Key)
 			return next(c)
 		}
 	}
 	group := router.Group("/api/auth")
 	group.Use(injectAuth)
 	group.POST("/refresh/", handler.RefreshToken)
 	t.Run("successful refresh", func(t *testing.T) {
 		rec := testutil.MakeRequest(router, "POST", "/api/auth/refresh/", nil, stored.Key)
 		testutil.AssertStatusCode(t, rec, http.StatusOK)
 		var payload map[string]interface{}
 		require.NoError(t, json.Unmarshal(rec.Body.Bytes(), &payload))
 		assert.Contains(t, payload, "token")
 	})
 }
 // TestAuthHandler_VerifyResetCode checks that POST
 // /api/auth/verify-reset-code/ does not answer 200 for a code that was
 // never issued, and answers 400 for an empty JSON object.
 func TestAuthHandler_VerifyResetCode(t *testing.T) {
 	handler, router, _ := setupAuthHandler(t)
 	const verifyPath = "/api/auth/verify-reset-code/"
 	router.POST("/api/auth/register/", handler.Register)
 	router.POST(verifyPath, handler.VerifyResetCode)
 	t.Run("invalid code returns error", func(t *testing.T) {
 		body := requests.VerifyResetCodeRequest{
 			Email: "nonexistent@test.com",
 			Code:  "999999",
 		}
 		rec := testutil.MakeRequest(router, "POST", verifyPath, body, "")
 		// Should not be 200 since no valid code exists
 		assert.NotEqual(t, http.StatusOK, rec.Code)
 	})
 	t.Run("missing fields returns 400", func(t *testing.T) {
 		emptyBody := map[string]interface{}{}
 		rec := testutil.MakeRequest(router, "POST", verifyPath, emptyBody, "")
 		testutil.AssertStatusCode(t, rec, http.StatusBadRequest)
 	})
 }
 // TestAuthHandler_ResetPassword checks that POST /api/auth/reset-password/
 // does not answer 200 for an unknown reset token, and answers 400 both for
 // an empty JSON object and for a too-short new password.
 func TestAuthHandler_ResetPassword(t *testing.T) {
 	handler, router, _ := setupAuthHandler(t)
 	const resetPath = "/api/auth/reset-password/"
 	router.POST(resetPath, handler.ResetPassword)
 	t.Run("invalid reset token returns error", func(t *testing.T) {
 		body := requests.ResetPasswordRequest{
 			ResetToken:  "invalid-token",
 			NewPassword: "NewPassword123",
 		}
 		rec := testutil.MakeRequest(router, "POST", resetPath, body, "")
 		assert.NotEqual(t, http.StatusOK, rec.Code)
 	})
 	t.Run("missing fields returns 400", func(t *testing.T) {
 		emptyBody := map[string]interface{}{}
 		rec := testutil.MakeRequest(router, "POST", resetPath, emptyBody, "")
 		testutil.AssertStatusCode(t, rec, http.StatusBadRequest)
 	})
 	t.Run("short password returns 400", func(t *testing.T) {
 		body := requests.ResetPasswordRequest{
 			ResetToken:  "some-token",
 			NewPassword: "short",
 		}
 		rec := testutil.MakeRequest(router, "POST", resetPath, body, "")
 		testutil.AssertStatusCode(t, rec, http.StatusBadRequest)
 	})
 }
 // TestAuthHandler_ForgotPassword_MissingEmail checks that POST
 // /api/auth/forgot-password/ with an empty JSON object is answered with 400.
 func TestAuthHandler_ForgotPassword_MissingEmail(t *testing.T) {
 	handler, router, _ := setupAuthHandler(t)
 	const forgotPath = "/api/auth/forgot-password/"
 	router.POST(forgotPath, handler.ForgotPassword)
 	t.Run("missing email returns 400", func(t *testing.T) {
 		emptyBody := map[string]interface{}{}
 		rec := testutil.MakeRequest(router, "POST", forgotPath, emptyBody, "")
 		testutil.AssertStatusCode(t, rec, http.StatusBadRequest)
 	})
 }
 // =============================================================================
 // Residence Handler - Additional Error Paths
 // =============================================================================
@@ -1781,45 +1555,11 @@ func TestStaticDataHandler_RefreshStaticData(t *testing.T) {
 // =============================================================================
 // Upload Handler - Additional Error Paths
 // =============================================================================
-
+//
-func TestUploadHandler_UploadImage_NoFile(t *testing.T) {
+// Multipart upload handlers (UploadImage / UploadDocument / UploadCompletion)
-	storageSvc := newTestStorageService("/var/uploads")
+// were removed alongside the legacy /api/uploads/{image,document,completion}
-	handler := NewUploadHandler(storageSvc, nil)
+// routes. The presigned-URL flow (POST /api/uploads/presign) is exercised by
-	e := testutil.SetupTestRouter()
+// integration tests that hit the full pipeline.
 	e.POST("/api/uploads/image", handler.UploadImage)
 	t.Run("no file returns 400", func(t *testing.T) {
 		w := testutil.MakeRequest(e, "POST", "/api/uploads/image", nil, "")
 		testutil.AssertStatusCode(t, w, http.StatusBadRequest)
 	})
 }
 func TestUploadHandler_UploadDocument_NoFile(t *testing.T) {
 	storageSvc := newTestStorageService("/var/uploads")
 	handler := NewUploadHandler(storageSvc, nil)
 	e := testutil.SetupTestRouter()
 	e.POST("/api/uploads/document", handler.UploadDocument)
 	t.Run("no file returns 400", func(t *testing.T) {
 		w := testutil.MakeRequest(e, "POST", "/api/uploads/document", nil, "")
 		testutil.AssertStatusCode(t, w, http.StatusBadRequest)
 	})
 }
 func TestUploadHandler_UploadCompletion_NoFile(t *testing.T) {
 	storageSvc := newTestStorageService("/var/uploads")
 	handler := NewUploadHandler(storageSvc, nil)
 	e := testutil.SetupTestRouter()
 	e.POST("/api/uploads/completion", handler.UploadCompletion)
 	t.Run("no file returns 400", func(t *testing.T) {
 		w := testutil.MakeRequest(e, "POST", "/api/uploads/completion", nil, "")
 		testutil.AssertStatusCode(t, w, http.StatusBadRequest)
 	})
 }
 func TestUploadHandler_DeleteFile_OwnershipDenied(t *testing.T) {
 	storageSvc := newTestStorageService("/var/uploads")
@@ -37,6 +37,23 @@ func NewMediaHandler(
 	}
 }
 // safeContentDisposition returns the value of an inline Content-Disposition
 // header for the given filename after sanitizing it (audit M1). Runes below
 // 0x20 (which covers CR and LF), DEL (0x7f), double quotes and backslashes
 // are dropped, so an attacker-controlled upload filename can neither break
 // out of the quoted filename parameter nor inject additional response
 // headers (CWE-113). When nothing survives the filter, "download" is used
 // as the filename.
 func safeContentDisposition(filename string) string {
 	var kept strings.Builder
 	for _, r := range filename {
 		switch r {
 		case '"', '\\', 0x7f:
 			// Quote, backslash and DEL are never allowed through.
 			continue
 		}
 		if r < 0x20 {
 			// C0 control range, including CR and LF.
 			continue
 		}
 		kept.WriteRune(r)
 	}
 	cleaned := kept.String()
 	if cleaned == "" {
 		cleaned = "download"
 	}
 	return `inline; filename="` + cleaned + `"`
 }
 // ServeDocument serves a document file with access control
 // GET /api/media/document/:id
 func (h *MediaHandler) ServeDocument(c echo.Context) error {
@@ -71,7 +88,7 @@ func (h *MediaHandler) ServeDocument(c echo.Context) error {
 	// Set caching and disposition headers
 	c.Response().Header().Set("Cache-Control", "private, max-age=3600")
 	if doc.FileName != "" {
-		c.Response().Header().Set("Content-Disposition", "inline; filename=\""+doc.FileName+"\"")
+		c.Response().Header().Set("Content-Disposition", safeContentDisposition(doc.FileName))
 	}
 	return c.Blob(http.StatusOK, mimeType, data)
 }
@@ -114,7 +131,7 @@ func (h *MediaHandler) ServeDocumentImage(c echo.Context) error {
 	}
 	c.Response().Header().Set("Cache-Control", "private, max-age=3600")
-	c.Response().Header().Set("Content-Disposition", "inline; filename=\""+filepath.Base(img.ImageURL)+"\"")
+	c.Response().Header().Set("Content-Disposition", safeContentDisposition(filepath.Base(img.ImageURL)))
 	return c.Blob(http.StatusOK, mimeType, data)
 }
@@ -162,7 +179,7 @@ func (h *MediaHandler) ServeCompletionImage(c echo.Context) error {
 	}
 	c.Response().Header().Set("Cache-Control", "private, max-age=3600")
-	c.Response().Header().Set("Content-Disposition", "inline; filename=\""+filepath.Base(img.ImageURL)+"\"")
+	c.Response().Header().Set("Content-Disposition", safeContentDisposition(filepath.Base(img.ImageURL)))
 	return c.Blob(http.StatusOK, mimeType, data)
 }
@@ -46,7 +46,7 @@ func (h *NotificationHandler) ListNotifications(c echo.Context) error {
 		}
 	}
-	notifications, err := h.notificationService.GetNotifications(user.ID, limit, offset)
+	notifications, err := h.notificationService.GetNotifications(c.Request().Context(), user.ID, limit, offset)
 	if err != nil {
 		return err
 	}
@@ -64,7 +64,7 @@ func (h *NotificationHandler) GetUnreadCount(c echo.Context) error {
 		return err
 	}
-	count, err := h.notificationService.GetUnreadCount(user.ID)
+	count, err := h.notificationService.GetUnreadCount(c.Request().Context(), user.ID)
 	if err != nil {
 		return err
 	}
@@ -84,7 +84,7 @@ func (h *NotificationHandler) MarkAsRead(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_notification_id")
 	}
-	err = h.notificationService.MarkAsRead(uint(notificationID), user.ID)
+	err = h.notificationService.MarkAsRead(c.Request().Context(), uint(notificationID), user.ID)
 	if err != nil {
 		return err
 	}
@@ -99,7 +99,7 @@ func (h *NotificationHandler) MarkAllAsRead(c echo.Context) error {
 		return err
 	}
-	err = h.notificationService.MarkAllAsRead(user.ID)
+	err = h.notificationService.MarkAllAsRead(c.Request().Context(), user.ID)
 	if err != nil {
 		return err
 	}
@@ -114,7 +114,7 @@ func (h *NotificationHandler) GetPreferences(c echo.Context) error {
 		return err
 	}
-	prefs, err := h.notificationService.GetPreferences(user.ID)
+	prefs, err := h.notificationService.GetPreferences(c.Request().Context(), user.ID)
 	if err != nil {
 		return err
 	}
@@ -137,7 +137,7 @@ func (h *NotificationHandler) UpdatePreferences(c echo.Context) error {
 		return err
 	}
-	prefs, err := h.notificationService.UpdatePreferences(user.ID, &req)
+	prefs, err := h.notificationService.UpdatePreferences(c.Request().Context(), user.ID, &req)
 	if err != nil {
 		return err
 	}
@@ -160,7 +160,7 @@ func (h *NotificationHandler) RegisterDevice(c echo.Context) error {
 		return err
 	}
-	device, err := h.notificationService.RegisterDevice(user.ID, &req)
+	device, err := h.notificationService.RegisterDevice(c.Request().Context(), user.ID, &req)
 	if err != nil {
 		return err
 	}
@@ -175,7 +175,7 @@ func (h *NotificationHandler) ListDevices(c echo.Context) error {
 		return err
 	}
-	devices, err := h.notificationService.ListDevices(user.ID)
+	devices, err := h.notificationService.ListDevices(c.Request().Context(), user.ID)
 	if err != nil {
 		return err
 	}
@@ -208,7 +208,7 @@ func (h *NotificationHandler) UnregisterDevice(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_platform")
 	}
-	err = h.notificationService.UnregisterDevice(req.RegistrationID, req.Platform, user.ID)
+	err = h.notificationService.UnregisterDevice(c.Request().Context(), req.RegistrationID, req.Platform, user.ID)
 	if err != nil {
 		return err
 	}
@@ -236,7 +236,7 @@ func (h *NotificationHandler) DeleteDevice(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_platform")
 	}
-	err = h.notificationService.DeleteDevice(uint(deviceID), platform, user.ID)
+	err = h.notificationService.DeleteDevice(c.Request().Context(), uint(deviceID), platform, user.ID)
 	if err != nil {
 		return err
 	}
@@ -39,7 +39,7 @@ func (h *ResidenceHandler) ListResidences(c echo.Context) error {
 		return err
 	}
-	response, err := h.residenceService.ListResidences(user.ID)
+	response, err := h.residenceService.ListResidences(c.Request().Context(), user.ID)
 	if err != nil {
 		return err
 	}
@@ -55,7 +55,7 @@ func (h *ResidenceHandler) GetMyResidences(c echo.Context) error {
 	}
 	userNow := middleware.GetUserNow(c)
-	response, err := h.residenceService.GetMyResidences(user.ID, userNow)
+	response, err := h.residenceService.GetMyResidences(c.Request().Context(), user.ID, userNow)
 	if err != nil {
 		return err
 	}
@@ -72,7 +72,7 @@ func (h *ResidenceHandler) GetSummary(c echo.Context) error {
 	}
 	userNow := middleware.GetUserNow(c)
-	summary, err := h.residenceService.GetSummary(user.ID, userNow)
+	summary, err := h.residenceService.GetSummary(c.Request().Context(), user.ID, userNow)
 	if err != nil {
 		return err
 	}
@@ -93,7 +93,7 @@ func (h *ResidenceHandler) GetResidence(c echo.Context) error {
 	}
 	userNow := middleware.GetUserNow(c)
-	response, err := h.residenceService.GetResidence(uint(residenceID), user.ID, userNow)
+	response, err := h.residenceService.GetResidence(c.Request().Context(), uint(residenceID), user.ID, userNow)
 	if err != nil {
 		return err
 	}
@@ -116,7 +116,7 @@ func (h *ResidenceHandler) CreateResidence(c echo.Context) error {
 		return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(err))
 	}
-	response, err := h.residenceService.CreateResidence(&req, user.ID)
+	response, err := h.residenceService.CreateResidence(c.Request().Context(), &req, user.ID)
 	if err != nil {
 		return err
 	}
@@ -144,7 +144,7 @@ func (h *ResidenceHandler) UpdateResidence(c echo.Context) error {
 		return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(err))
 	}
-	response, err := h.residenceService.UpdateResidence(uint(residenceID), user.ID, &req)
+	response, err := h.residenceService.UpdateResidence(c.Request().Context(), uint(residenceID), user.ID, &req)
 	if err != nil {
 		return err
 	}
@@ -164,7 +164,7 @@ func (h *ResidenceHandler) DeleteResidence(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_residence_id")
 	}
-	response, err := h.residenceService.DeleteResidence(uint(residenceID), user.ID)
+	response, err := h.residenceService.DeleteResidence(c.Request().Context(), uint(residenceID), user.ID)
 	if err != nil {
 		return err
 	}
@@ -185,7 +185,7 @@ func (h *ResidenceHandler) GetShareCode(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_residence_id")
 	}
-	shareCode, err := h.residenceService.GetShareCode(uint(residenceID), user.ID)
+	shareCode, err := h.residenceService.GetShareCode(c.Request().Context(), uint(residenceID), user.ID)
 	if err != nil {
 		return err
 	}
@@ -213,7 +213,7 @@ func (h *ResidenceHandler) GenerateShareCode(c echo.Context) error {
 	// Request body is optional
 	c.Bind(&req)
-	response, err := h.residenceService.GenerateShareCode(uint(residenceID), user.ID, req.ExpiresInHours)
+	response, err := h.residenceService.GenerateShareCode(c.Request().Context(), uint(residenceID), user.ID, req.ExpiresInHours)
 	if err != nil {
 		return err
 	}
@@ -238,7 +238,7 @@ func (h *ResidenceHandler) GenerateSharePackage(c echo.Context) error {
 	// Request body is optional (for expires_in_hours)
 	c.Bind(&req)
-	response, err := h.residenceService.GenerateSharePackage(uint(residenceID), user.ID, req.ExpiresInHours)
+	response, err := h.residenceService.GenerateSharePackage(c.Request().Context(), uint(residenceID), user.ID, req.ExpiresInHours)
 	if err != nil {
 		return err
 	}
@@ -261,7 +261,7 @@ func (h *ResidenceHandler) JoinWithCode(c echo.Context) error {
 		return err
 	}
-	response, err := h.residenceService.JoinWithCode(req.Code, user.ID)
+	response, err := h.residenceService.JoinWithCode(c.Request().Context(), req.Code, user.ID)
 	if err != nil {
 		return err
 	}
@@ -281,7 +281,7 @@ func (h *ResidenceHandler) GetResidenceUsers(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_residence_id")
 	}
-	users, err := h.residenceService.GetResidenceUsers(uint(residenceID), user.ID)
+	users, err := h.residenceService.GetResidenceUsers(c.Request().Context(), uint(residenceID), user.ID)
 	if err != nil {
 		return err
 	}
@@ -306,7 +306,7 @@ func (h *ResidenceHandler) RemoveResidenceUser(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_user_id")
 	}
-	err = h.residenceService.RemoveUser(uint(residenceID), uint(userIDToRemove), user.ID)
+	err = h.residenceService.RemoveUser(c.Request().Context(), uint(residenceID), uint(userIDToRemove), user.ID)
 	if err != nil {
 		return err
 	}
@@ -316,7 +316,7 @@ func (h *ResidenceHandler) RemoveResidenceUser(c echo.Context) error {
 // GetResidenceTypes handles GET /api/residences/types/
 func (h *ResidenceHandler) GetResidenceTypes(c echo.Context) error {
-	types, err := h.residenceService.GetResidenceTypes()
+	types, err := h.residenceService.GetResidenceTypes(c.Request().Context())
 	if err != nil {
 		return err
 	}
@@ -348,7 +348,7 @@ func (h *ResidenceHandler) GenerateTasksReport(c echo.Context) error {
 	c.Bind(&req)
 	// Generate the report data
-	report, err := h.residenceService.GenerateTasksReport(uint(residenceID), user.ID)
+	report, err := h.residenceService.GenerateTasksReport(c.Request().Context(), uint(residenceID), user.ID)
 	if err != nil {
 		return err
 	}
--- a/Show More
+++ b/Show More