dev: add Kratos + Mailpit local-dev stack

docker-compose.dev.yml gains a Kratos identity service (public :4433 / admin :4434) and a Mailpit SMTP catcher for local onboarding email codes, plus a postgres-init mount. deploy/local/kratos/ holds the local Kratos config + identity schema (placeholder dev cookie secret only). Supports the local backend the XCUITest suite seeds against. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
backend: GDPR export + retention cleanups + worker metrics (BE-1/2/3)
2026-06-09 00:11:06 -05:00 · 2026-06-08 22:15:26 -05:00 · 2026-06-08 21:41:40 -05:00 · 2026-06-06 10:49:37 -05:00 · 2026-06-04 20:54:54 -05:00 · 2026-06-03 22:30:33 -05:00
248 changed files with 20040 additions and 11941 deletions
@@ -28,12 +28,22 @@ EMAIL_HOST_USER=your-email@gmail.com
 EMAIL_HOST_PASSWORD=your-app-password
 DEFAULT_FROM_EMAIL=honeyDue <noreply@honeyDue.treytartt.com>

+# Sign in with Apple
+# APPLE_CLIENT_ID must equal the iOS bundle ID of the build hitting this
+# backend. The Apple identity-token `aud` claim is checked against it
+# (see internal/services/apple_auth.go::verifyAudience). With DEBUG=false
+# an empty value rejects every Apple token.
+#   Release builds: com.myhoneydue.honeyDue
+#   Debug builds:   com.myhoneydue.honeyDue.dev
+APPLE_CLIENT_ID=com.myhoneydue.honeyDue.dev
+APPLE_TEAM_ID=X86BR9WTLD
+
 # APNs Settings (iOS Push Notifications)
 # Direct APNs integration - no external push server needed
 APNS_AUTH_KEY_PATH=/path/to/AuthKey_XXXXXX.p8
 APNS_AUTH_KEY_ID=XXXXXXXXXX
 APNS_TEAM_ID=XXXXXXXXXX
-APNS_TOPIC=com.tt.honeyDue
+APNS_TOPIC=com.myhoneydue.honeyDue.dev
 APNS_PRODUCTION=false  # Set to true for production APNs, false for sandbox

 # FCM Settings (Android Push Notifications)
@@ -8,6 +8,9 @@ bin/
 /api
 /worker
 /admin
+/admin-reset
+/notif-diag
+/send-test-push
 !admin/
 *.exe
 *.exe~
@@ -42,3 +45,4 @@ push_certs/

 # Vendor (if not using go modules)
 # vendor/
+/migrate
@@ -1,5 +1,5 @@
 # Admin panel build stage
-FROM node:20-alpine AS admin-builder
+FROM node:20-alpine@sha256:fb4cd12c85ee03686f6af5362a0b0d56d50c58a04632e6c0fb8363f609372293 AS admin-builder

 WORKDIR /app

@@ -49,6 +49,19 @@ RUN CGO_ENABLED=0 GOOS=linux GOARCH=${TARGETARCH} go build -ldflags="-w -s" -o /
 # Build the worker binary
 RUN CGO_ENABLED=0 GOOS=linux GOARCH=${TARGETARCH} go build -ldflags="-w -s" -o /app/worker ./cmd/worker

+# Install goose CLI for production migrations. Pinned to a specific version
+# so an upstream behavioural change can't break a deploy unannounced.
+# Bumping is a deliberate, reviewable diff. We `go build` rather than
+# `go install` so the output path is predictable across host platforms —
+# `go install` with cross-compile env vars drops the binary in
+# /go/bin/<goos>_<goarch>/, which is awkward to COPY from.
+RUN cd /tmp && \
+    git clone --depth=1 --branch=v3.22.1 https://github.com/pressly/goose.git goose-src && \
+    cd goose-src && \
+    CGO_ENABLED=0 GOOS=linux GOARCH=${TARGETARCH} \
+      go build -ldflags="-w -s" -o /app/goose ./cmd/goose && \
+    cd / && rm -rf /tmp/goose-src
+
 # Base runtime stage for Go services
 FROM alpine:3.19 AS go-base

@@ -64,6 +77,9 @@ WORKDIR /app
 # Copy all binaries from builder
 COPY --from=builder /app/api /app/api
 COPY --from=builder /app/worker /app/worker
+# goose is the migration runner — same image is reused as the migrate Job
+# entrypoint via `command: ["/usr/local/bin/goose", ...]`.
+COPY --from=builder /app/goose /usr/local/bin/goose

 # Copy templates directory
 COPY --from=builder /app/templates /app/templates
@@ -93,7 +109,7 @@ FROM go-base AS worker
 CMD ["/app/worker"]

 # Admin panel runtime stage
-FROM node:20-alpine AS admin
+FROM node:20-alpine@sha256:fb4cd12c85ee03686f6af5362a0b0d56d50c58a04632e6c0fb8363f609372293 AS admin

 WORKDIR /app

@@ -115,7 +131,7 @@ ENV HOSTNAME="0.0.0.0"
 CMD ["node", "server.js"]

 # Default production stage (for Dokku - runs API + Admin)
-FROM node:20-alpine AS production
+FROM node:20-alpine@sha256:fb4cd12c85ee03686f6af5362a0b0d56d50c58a04632e6c0fb8363f609372293 AS production

 # Install runtime dependencies
 RUN apk add --no-cache ca-certificates tzdata curl
@@ -89,15 +89,36 @@ docker-build-prod:
 	docker build --target worker -t $${REGISTRY:-ghcr.io/treytartt}/honeydue-worker:$${TAG:-latest} .
 	docker build --target admin -t $${REGISTRY:-ghcr.io/treytartt}/honeydue-admin:$${TAG:-latest} .

-# Database migrations
+# Database migrations (goose)
+#
+# DATABASE_URL must point at the *direct* (non-pooler) Neon endpoint —
+# goose's session-scoped advisory lock won't survive PgBouncer transaction
+# mode. Example:
+#   export DATABASE_URL='host=ep-floral-truth-amttbc5a.c-5.us-east-1.aws.neon.tech \
+#                        user=neondb_owner password=... dbname=honeyDue sslmode=require'
+#
+# Bootstrap (one-time, when adopting goose against an existing DB):
+#   make migrate-status                       # creates goose_db_version
+#   psql ... -c "INSERT INTO goose_db_version (version_id, is_applied, tstamp) VALUES (1, true, NOW());"
+#
+# Day-to-day:
+#   make migrate-status   # show what's pending
+#   make migrate-up       # apply pending migrations
+#   make migrate-down     # roll back the latest migration
+#   make migrate-new name=add_some_column   # scaffold a new SQL migration
+
 migrate-up:
-	migrate -path migrations -database "$(DATABASE_URL)" up
+	goose -dir migrations postgres "$(DATABASE_URL)" up

 migrate-down:
-	migrate -path migrations -database "$(DATABASE_URL)" down
+	goose -dir migrations postgres "$(DATABASE_URL)" down

-migrate-create:
-	migrate create -ext sql -dir migrations -seq $(name)
+migrate-status:
+	goose -dir migrations postgres "$(DATABASE_URL)" status
+
+migrate-new:
+	@if [ -z "$(name)" ]; then echo "usage: make migrate-new name=<short_name>"; exit 1; fi
+	goose -dir migrations create $(name) sql

 # Encrypt existing uploads at rest (run after setting STORAGE_ENCRYPTION_KEY)
 migrate-encrypt:
@@ -184,6 +184,15 @@ needed for local dev. For the complete production env var reference

 Leave all four `B2_*` empty in dev to fall back to a local `/app/uploads` volume.

+**Upload architecture (since `b7f8329`)**: Image and document uploads go
+**directly from the client to B2** via a presigned POST policy issued by
+`POST /api/uploads/presign`. Bytes never traverse the api server. B2
+enforces a 10 MB per-object cap at the protocol level. The worker reaps
+orphaned upload sessions hourly via the `maintenance:upload_cleanup`
+cron. See [`docs/deployment/09-storage.md`](./docs/deployment/09-storage.md)
+for the full flow, and [`docs/deployment/14-deployment-process.md`](./docs/deployment/14-deployment-process.md#one-time-b2-bucket-lifecycle-manual)
+for the one-time bucket lifecycle setup.
+
 ### Worker schedules (UTC hours)

 | Variable | Description | Default |
@@ -349,7 +358,11 @@ All protected endpoints require an `Authorization: Token <token>` header.

 Production runs on a **3-node K3s HA cluster** on Hetzner Cloud, fronted
 by Cloudflare, with Neon Postgres, Backblaze B2, and a self-hosted Gitea
-container registry. See the full deployment book for every detail:
+container registry. Live observability (VictoriaMetrics + Jaeger +
+Grafana) runs on a separate Linode VPS at
+[`grafana.88oakapps.com`](https://grafana.88oakapps.com) and is fed by a
+`vmagent` sidecar in-cluster. See the full deployment book for every
+detail:

 **→ [docs/deployment/](./docs/deployment/README.md) — The Deployment Book**

@@ -371,7 +384,10 @@ Quick links:

 - **Runbook** — [docs/deployment/17-runbook.md](./docs/deployment/17-runbook.md) — 22 common ops procedures
 - **kubectl cheat sheet** — [docs/deployment/appendices/b-commands.md](./docs/deployment/appendices/b-commands.md)
- **Deploy process** — [docs/deployment/14-deployment-process.md](./docs/deployment/14-deployment-process.md) — build → push → rollout
+- **Deploy process** — [docs/deployment/14-deployment-process.md](./docs/deployment/14-deployment-process.md) — `bash deploy-k3s/scripts/03-deploy.sh` builds → pushes → rolls out
+- **Observability** — [docs/deployment/15-observability.md](./docs/deployment/15-observability.md) — VictoriaMetrics + Jaeger + Grafana on `obs.88oakapps.com`
+- **Observability plan** — [docs/observability-plan.md](./docs/observability-plan.md) — design doc and rollout phases
+- **Database / pool tuning** — [docs/deployment/08-database.md](./docs/deployment/08-database.md) — Neon pooler endpoint, GORM pool, warm-up, RTT budget
 - **Failure modes** — [docs/deployment/16-failure-modes.md](./docs/deployment/16-failure-modes.md) — what happens when X dies
 - **Swarm postmortem** — [docs/deployment/19-postmortem-swarm.md](./docs/deployment/19-postmortem-swarm.md) — why we migrated

@@ -0,0 +1,257 @@
+// admin-reset is a one-off CLI for resetting an admin_users row's password.
+//
+// It reads DB connection settings from environment variables (the same names
+// the API uses), looks up the admin user by email, prompts for a new password
+// twice (no echo), bcrypts it, and updates the row. Safe to keep in the repo
+// — running it requires DB credentials.
+//
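+// Usage:
+//
+//	# load env (host, user, db, sslmode) and password from secrets file
+//	set -a && source deploy/prod.env && set +a
+//	go run ./cmd/admin-reset
+//
+//	# or with a non-default secrets path / different admin
+//	go run ./cmd/admin-reset --password-file path/to/postgres_password.txt
+//	go run ./cmd/admin-reset --email someone@example.com
+//
+//	# modes that do not change the password
+//	go run ./cmd/admin-reset --list
+//	go run ./cmd/admin-reset --verify
+//	go run ./cmd/admin-reset --email old@example.com --new-email new@example.com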
+package main
+
+import (
+	"bufio"
+	"errors"
+	"flag"
+	"fmt"
+	"os"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/rs/zerolog"
+	"github.com/rs/zerolog/log"
+	"golang.org/x/crypto/bcrypt"
+	"golang.org/x/term"
+	"gorm.io/driver/postgres"
+	"gorm.io/gorm"
+	"gorm.io/gorm/logger"
+
+	"github.com/treytartt/honeydue-api/internal/models"
+)
+
+const minPasswordLen = 12
+
+// main parses the command-line flags, connects to Postgres using the DSN from
+// buildDSN, and then runs exactly one mode, checked in this order: --list,
+// --new-email, --verify, and (when none of those is set) the interactive
+// password reset. Failures are reported through log.Fatal; a failed --verify
+// exits with status 1.
+func main() {
+	email := flag.String("email", "admin@myhoneydue.com", "Admin email to reset")
+	passwordFile := flag.String("password-file", "deploy/secrets/postgres_password.txt",
+		"Path to file containing POSTGRES_PASSWORD (used if env var is empty)")
+	list := flag.Bool("list", false, "List all rows in admin_users and exit (no changes)")
+	verify := flag.Bool("verify", false, "Prompt for a password and check it against the stored hash; no changes")
+	newEmail := flag.String("new-email", "", "If set: rename the matched admin's email to this value and exit (no password change)")
+	flag.Parse()
+
+	// Human-readable console logging on stderr.
+	log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr, TimeFormat: time.RFC3339})
+
+	dsn, host, err := buildDSN(*passwordFile)
+	if err != nil {
+		log.Fatal().Err(err).Msg("failed to build database DSN")
+	}
+
+	db, err := gorm.Open(postgres.Open(dsn), &gorm.Config{
+		Logger: logger.Default.LogMode(logger.Silent),
+	})
+	if err != nil {
+		log.Fatal().Err(err).Msg("failed to connect to database")
+	}
+
+	// --list: print every admin_users row and return before --email is looked up.
+	if *list {
+		var admins []models.AdminUser
+		if err := db.Order("id").Find(&admins).Error; err != nil {
+			log.Fatal().Err(err).Msg("failed to list admin users")
+		}
+		fmt.Fprintf(os.Stderr, "DB host: %s\n%d admin user(s):\n\n", host, len(admins))
+		fmt.Fprintf(os.Stderr, "%-4s  %-40s  %-12s  %-6s  %s\n", "ID", "EMAIL", "ROLE", "ACTIVE", "LAST_LOGIN")
+		for _, a := range admins {
+			last := "-"
+			if a.LastLogin != nil {
+				last = a.LastLogin.Format(time.RFC3339)
+			}
+			fmt.Fprintf(os.Stderr, "%-4d  %-40s  %-12s  %-6t  %s\n", a.ID, a.Email, a.Role, a.IsActive, last)
+		}
+		return
+	}
+
+	// Mirror the live API's case-insensitive lookup so --verify reflects what
+	// /api/admin/auth/login actually does. The reset path uses the same query
+	// for consistency.
+	var admin models.AdminUser
+	if err := db.Where("LOWER(email) = LOWER(?)", *email).First(&admin).Error; err != nil {
+		if errors.Is(err, gorm.ErrRecordNotFound) {
+			log.Fatal().Str("email", *email).Msg("admin user not found (try --list to see existing rows)")
+		}
+		log.Fatal().Err(err).Msg("failed to look up admin user")
+	}
+
+	// --new-email: rename the matched row and return; the password is untouched.
+	if *newEmail != "" {
+		target := strings.TrimSpace(*newEmail)
+		if target == "" || !strings.Contains(target, "@") {
+			log.Fatal().Str("new_email", *newEmail).Msg("--new-email must be a valid email address")
+		}
+		if strings.EqualFold(target, admin.Email) {
+			fmt.Fprintf(os.Stderr, "No change — current email already matches %q\n", target)
+			return
+		}
+		// Catch the unique-index conflict early with a clear message instead of a Postgres error.
+		var collisionCount int64
+		if err := db.Model(&models.AdminUser{}).
+			Where("LOWER(email) = LOWER(?) AND id <> ?", target, admin.ID).
+			Count(&collisionCount).Error; err != nil {
+			log.Fatal().Err(err).Msg("failed to check for email collision")
+		}
+		if collisionCount > 0 {
+			log.Fatal().Str("new_email", target).Msg("another admin row already uses this email — aborting")
+		}
+
+		fmt.Fprintf(os.Stderr, "Renaming admin email: %s → %s (id=%d)\n", admin.Email, target, admin.ID)
+		fmt.Fprintf(os.Stderr, "DB host: %s\n\n", host)
+		res := db.Model(&models.AdminUser{}).
+			Where("id = ?", admin.ID).
+			Updates(map[string]any{
+				"email":      target,
+				"updated_at": time.Now().UTC(),
+			})
+		if res.Error != nil {
+			log.Fatal().Err(res.Error).Msg("failed to rename admin email")
+		}
+		if res.RowsAffected != 1 {
+			log.Fatal().Int64("rows", res.RowsAffected).Msg("expected exactly 1 row updated")
+		}
+		fmt.Fprintf(os.Stderr, "OK — email is now %s\n", target)
+		return
+	}
+
+	// --verify: compare one prompted password against the stored hash; no writes.
+	if *verify {
+		fmt.Fprintf(os.Stderr, "Verifying password for: %s (id=%d, role=%s, active=%t)\n",
+			admin.Email, admin.ID, admin.Role, admin.IsActive)
+		fmt.Fprintf(os.Stderr, "DB host: %s\n\n", host)
+
+		pw, err := readPassword("Password: ")
+		if err != nil {
+			log.Fatal().Err(err).Msg("failed to read password")
+		}
+		if admin.CheckPassword(pw) {
+			fmt.Fprintln(os.Stderr, "PASS — bcrypt hash matches the supplied password")
+			if !admin.IsActive {
+				fmt.Fprintln(os.Stderr, "WARNING: is_active = false — login will still be rejected with \"Account is disabled\"")
+			}
+		} else {
+			fmt.Fprintln(os.Stderr, "FAIL — bcrypt hash does NOT match the supplied password")
+			os.Exit(1)
+		}
+		return
+	}
+
+	// Default mode: prompt twice, hash with bcrypt, and update the single row.
+	fmt.Fprintf(os.Stderr, "Resetting password for: %s (id=%d, role=%s, active=%t)\n",
+		admin.Email, admin.ID, admin.Role, admin.IsActive)
+	fmt.Fprintf(os.Stderr, "DB host: %s\n\n", host)
+
+	pw1, err := readPassword("New password: ")
+	if err != nil {
+		log.Fatal().Err(err).Msg("failed to read password")
+	}
+	// len counts bytes, so the minimum is minPasswordLen bytes, not characters.
+	if len(pw1) < minPasswordLen {
+		log.Fatal().Int("min", minPasswordLen).Msg("password too short")
+	}
+
+	pw2, err := readPassword("Confirm password: ")
+	if err != nil {
+		log.Fatal().Err(err).Msg("failed to read password")
+	}
+	if pw1 != pw2 {
+		log.Fatal().Msg("passwords do not match")
+	}
+
+	hash, err := bcrypt.GenerateFromPassword([]byte(pw1), bcrypt.DefaultCost)
+	if err != nil {
+		log.Fatal().Err(err).Msg("failed to hash password")
+	}
+
+	res := db.Model(&models.AdminUser{}).
+		Where("id = ?", admin.ID).
+		Updates(map[string]any{
+			"password":   string(hash),
+			"updated_at": time.Now().UTC(),
+		})
+	if res.Error != nil {
+		log.Fatal().Err(res.Error).Msg("failed to update admin user")
+	}
+	if res.RowsAffected != 1 {
+		log.Fatal().Int64("rows", res.RowsAffected).Msg("expected exactly 1 row updated")
+	}
+
+	fmt.Fprintf(os.Stderr, "\nOK — password reset for %s\n", admin.Email)
+}
+
+func buildDSN(passwordFile string) (dsn, host string, err error) {
+	host = os.Getenv("DB_HOST")
+	user := os.Getenv("POSTGRES_USER")
+	dbname := os.Getenv("POSTGRES_DB")
+	sslmode := os.Getenv("DB_SSLMODE")
+	if sslmode == "" {
+		sslmode = "require"
+	}
+
+	port := 5432
+	if s := os.Getenv("DB_PORT"); s != "" {
+		p, perr := strconv.Atoi(s)
+		if perr != nil {
+			return "", "", fmt.Errorf("invalid DB_PORT %q: %w", s, perr)
+		}
+		port = p
+	}
+
+	password := os.Getenv("POSTGRES_PASSWORD")
+	if password == "" && passwordFile != "" {
+		b, rerr := os.ReadFile(passwordFile)
+		if rerr != nil {
+			return "", "", fmt.Errorf("POSTGRES_PASSWORD not set and could not read %s: %w", passwordFile, rerr)
+		}
+		password = strings.TrimRight(string(b), "\r\n")
+	}
+
+	missing := []string{}
+	if host == "" {
+		missing = append(missing, "DB_HOST")
+	}
+	if user == "" {
+		missing = append(missing, "POSTGRES_USER")
+	}
+	if dbname == "" {
+		missing = append(missing, "POSTGRES_DB")
+	}
+	if password == "" {
+		missing = append(missing, "POSTGRES_PASSWORD")
+	}
+	if len(missing) > 0 {
+		return "", "", fmt.Errorf("missing required env vars: %s", strings.Join(missing, ", "))
+	}
+
+	dsn = fmt.Sprintf("host=%s port=%d user=%s password=%s dbname=%s sslmode=%s",
+		host, port, user, password, dbname, sslmode)
+	return dsn, host, nil
+}
+
+func readPassword(prompt string) (string, error) {
+	fmt.Fprint(os.Stderr, prompt)
+	if term.IsTerminal(int(os.Stdin.Fd())) {
+		b, err := term.ReadPassword(int(os.Stdin.Fd()))
+		fmt.Fprintln(os.Stderr)
+		if err != nil {
+			return "", err
+		}
+		return strings.TrimRight(string(b), "\r\n"), nil
+	}
+	s, err := bufio.NewReader(os.Stdin).ReadString('\n')
+	if err != nil {
+		return "", err
+	}
+	return strings.TrimRight(s, "\r\n"), nil
+}
@@ -9,6 +9,7 @@ import (
 	"syscall"
 	"time"

+	"github.com/hibiken/asynq"
 	"github.com/rs/zerolog/log"
 	"gorm.io/gorm"

@@ -19,6 +20,8 @@ import (
 	"github.com/treytartt/honeydue-api/internal/push"
 	"github.com/treytartt/honeydue-api/internal/router"
 	"github.com/treytartt/honeydue-api/internal/services"
+	"github.com/treytartt/honeydue-api/internal/tracing"
+	"github.com/treytartt/honeydue-api/internal/worker"
 	"github.com/treytartt/honeydue-api/pkg/utils"
 )

@@ -50,6 +53,29 @@ func main() {
 		Str("redis_url", config.MaskURLCredentials(cfg.Redis.URL)).
 		Msg("Starting HoneyDue API server")

+	// Initialize OpenTelemetry tracing — exports to obs.88oakapps.com
+	// (Jaeger via OTLP/HTTP) when OBS_TRACES_URL is set; otherwise installs
+	// a no-op tracer so call sites can use otel.Tracer() unconditionally.
+	// config.SecretValue (not os.Getenv) so file-mounted secrets resolve
+	// after audit F8 removed these from the process environment.
+	tracingShutdown, err := tracing.Init(context.Background(), tracing.Config{
+		ServiceName: "honeydue-api",
+		Environment: deploymentEnvironment(cfg.Server.Debug),
+		EndpointURL: config.SecretValue("OBS_TRACES_URL"),
+		BearerToken: config.SecretValue("OBS_INGEST_TOKEN"),
+		SampleRatio: tracing.SampleRatioFromEnv(),
+	})
+	if err != nil {
+		log.Error().Err(err).Msg("tracing init failed — continuing without traces")
+	}
+	defer func() {
+		shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+		defer cancel()
+		if err := tracingShutdown(shutdownCtx); err != nil {
+			log.Warn().Err(err).Msg("tracing shutdown error")
+		}
+	}()
+
 	// Connect to database (retry with backoff)
 	var db *gorm.DB
 	var dbErr error
@@ -65,11 +91,14 @@ func main() {
 		log.Error().Err(dbErr).Msg("Failed to connect to database - API will start but database operations will fail")
 	} else {
 		defer database.Close()
-		// Run database migrations only if connected.
-		// MigrateWithLock serialises parallel replica starts via a Postgres
-		// advisory lock so concurrent AutoMigrate calls don't race on DDL.
-		if err := database.MigrateWithLock(); err != nil {
-			log.Error().Err(err).Msg("Failed to run database migrations")
+		// Migrations are managed out-of-band by goose (see
+		// cmd/migrate and deploy-k3s/manifests/migrate/job.yaml) so the api
+		// no longer runs AutoMigrate at startup. Instead we verify the
+		// schema is at the expected version and refuse to start if not —
+		// this catches the "operator forgot to run migrate" footgun loudly,
+		// at boot, instead of with mysterious runtime errors.
+		if err := database.RequireSchemaApplied(); err != nil {
+			log.Fatal().Err(err).Msg("Schema precondition failed — run `kubectl -n honeydue create job --from=cronjob/honeydue-migrate` (or `make migrate-up` locally) and retry")
 		}
 	}

@@ -167,6 +196,28 @@ func main() {
 			Msg("Push notification client initialized")
 	}

+	// Initialize Asynq enqueuer (api-side). Used by services that move
+	// long-running work off the request path (currently: task-completion
+	// notification fan-out). Same Redis as cmd/worker — file-mounted password
+	// applied separately because cfg.Redis.URL does not embed it (audit HIGH-1).
+	var taskEnqueuer *worker.TaskClient
+	if redisOpt, parseErr := asynq.ParseRedisURI(cfg.Redis.URL); parseErr != nil {
+		log.Warn().Err(parseErr).Msg("Failed to parse Redis URL for Asynq enqueuer — completion notifications will run inline")
+	} else if clientOpt, ok := redisOpt.(asynq.RedisClientOpt); ok {
+		if cfg.Redis.Password != "" {
+			clientOpt.Password = cfg.Redis.Password
+		}
+		taskEnqueuer = worker.NewTaskClient(clientOpt)
+		defer func() {
+			if cerr := taskEnqueuer.Close(); cerr != nil {
+				log.Warn().Err(cerr).Msg("Failed to close Asynq enqueuer on shutdown")
+			}
+		}()
+		log.Info().Msg("Asynq enqueuer initialized")
+	} else {
+		log.Warn().Msg("Redis opt is not RedisClientOpt — Asynq enqueuer skipped; completion notifications will run inline")
+	}
+
 	// Setup router with dependencies (includes admin panel at /admin)
 	deps := &router.Dependencies{
 		DB:                db,
@@ -178,6 +229,12 @@ func main() {
 		StorageService:    storageService,
 		MonitoringService: monitoringService,
 	}
+	// Only assign the enqueuer when we actually constructed one. Assigning a
+	// nil *worker.TaskClient directly would create a typed-nil interface that
+	// fails the `if deps.TaskEnqueuer != nil` check in router.SetupRouter.
+	if taskEnqueuer != nil {
+		deps.TaskEnqueuer = taskEnqueuer
+	}
 	e := router.SetupRouter(deps)

 	// Create HTTP server
@@ -217,3 +274,15 @@ func main() {

 	log.Info().Msg("Server exited")
 }
+
+// deploymentEnvironment picks the environment label that spans are tagged
+// with. A non-empty DEPLOYMENT_ENVIRONMENT variable always wins; otherwise
+// the Debug flag maps to "dev" when true and "prod" when false.
+func deploymentEnvironment(debug bool) string {
+	override := os.Getenv("DEPLOYMENT_ENVIRONMENT")
+	switch {
+	case override != "":
+		return override
+	case debug:
+		return "dev"
+	default:
+		return "prod"
+	}
+}
@@ -0,0 +1,32 @@
+package main
+
+import "time"
+
+// shouldInitEmail reports whether both the email host and the email user are
+// non-empty.
+func shouldInitEmail(host, user string) bool {
+	if host == "" || user == "" {
+		return false
+	}
+	return true
+}
+
+// shouldInitStorage reports whether an upload directory has been configured,
+// i.e. uploadDir is not the empty string.
+func shouldInitStorage(uploadDir string) bool {
+	return len(uploadDir) > 0
+}
+
+// shouldInitEncryption reports whether an encryption key has been supplied,
+// i.e. encryptionKey is not the empty string.
+func shouldInitEncryption(encryptionKey string) bool {
+	return len(encryptionKey) > 0
+}
+
+// connectWithRetry calls connect until it succeeds or maxRetries attempts
+// have been made, pausing between attempts with a linearly growing delay
+// (1ms after the first failure, 2ms after the second, and so on).
+//
+// connect is always called at least once, so a maxRetries below 1 is treated
+// as a single attempt rather than reporting success without connecting.
+// There is no pause after the final failed attempt.
+//
+// It returns nil on the first success, or the error from the last attempt.
+func connectWithRetry(connect func() error, maxRetries int) error {
+	attempts := maxRetries
+	if attempts < 1 {
+		attempts = 1
+	}
+
+	var err error
+	for i := 0; i < attempts; i++ {
+		if err = connect(); err == nil {
+			return nil
+		}
+		// Back off only when another attempt will follow.
+		if i < attempts-1 {
+			time.Sleep(time.Duration(i+1) * time.Millisecond) // use ms in tests
+		}
+	}
+	return err
+}
@@ -0,0 +1,107 @@
+package main
+
+import (
+	"errors"
+	"testing"
+)
+
+// --- shouldInitEmail ---
+
+// Both host and user present: email should be initialised.
+func TestShouldInitEmail_BothSet_True(t *testing.T) {
+	if got := shouldInitEmail("smtp.example.com", "user@example.com"); !got {
+		t.Error("expected true when both set")
+	}
+}
+
+// Host missing: email must not be initialised.
+func TestShouldInitEmail_MissingHost_False(t *testing.T) {
+	if got := shouldInitEmail("", "user@example.com"); got {
+		t.Error("expected false when host empty")
+	}
+}
+
+// User missing: email must not be initialised.
+func TestShouldInitEmail_MissingUser_False(t *testing.T) {
+	if got := shouldInitEmail("smtp.example.com", ""); got {
+		t.Error("expected false when user empty")
+	}
+}
+
+// Neither value present: email must not be initialised.
+func TestShouldInitEmail_BothEmpty_False(t *testing.T) {
+	if got := shouldInitEmail("", ""); got {
+		t.Error("expected false when both empty")
+	}
+}
+
+// --- shouldInitStorage ---
+
+// A configured upload directory enables storage.
+func TestShouldInitStorage_Set_True(t *testing.T) {
+	if got := shouldInitStorage("/uploads"); !got {
+		t.Error("expected true")
+	}
+}
+
+// An empty upload directory disables storage.
+func TestShouldInitStorage_Empty_False(t *testing.T) {
+	if got := shouldInitStorage(""); got {
+		t.Error("expected false")
+	}
+}
+
+// --- shouldInitEncryption ---
+
+// A non-empty key enables encryption.
+func TestShouldInitEncryption_Set_True(t *testing.T) {
+	if got := shouldInitEncryption("secret-key-123"); !got {
+		t.Error("expected true")
+	}
+}
+
+// An empty key disables encryption.
+func TestShouldInitEncryption_Empty_False(t *testing.T) {
+	if got := shouldInitEncryption(""); got {
+		t.Error("expected false")
+	}
+}
+
+// --- connectWithRetry ---
+
+func TestConnectWithRetry_SucceedsFirst_NoRetry(t *testing.T) {
+	calls := 0
+	err := connectWithRetry(func() error {
+		calls++
+		return nil
+	}, 3)
+	if err != nil {
+		t.Errorf("unexpected error: %v", err)
+	}
+	if calls != 1 {
+		t.Errorf("calls = %d, want 1", calls)
+	}
+}
+
+func TestConnectWithRetry_SucceedsSecond_OneRetry(t *testing.T) {
+	calls := 0
+	err := connectWithRetry(func() error {
+		calls++
+		if calls == 1 {
+			return errors.New("fail")
+		}
+		return nil
+	}, 3)
+	if err != nil {
+		t.Errorf("unexpected error: %v", err)
+	}
+	if calls != 2 {
+		t.Errorf("calls = %d, want 2", calls)
+	}
+}
+
+func TestConnectWithRetry_AllFail_ReturnsError(t *testing.T) {
+	calls := 0
+	err := connectWithRetry(func() error {
+		calls++
+		return errors.New("fail")
+	}, 3)
+	if err == nil {
+		t.Error("expected error")
+	}
+	if calls != 3 {
+		t.Errorf("calls = %d, want 3", calls)
+	}
+}
@@ -0,0 +1,333 @@
+// notif-diag is a CLI for inspecting and (optionally) cleaning up stuck
+// notification rows. Default mode is read-only — runs SELECTs and prints a
+// summary. With --mark-failed-as-sent, marks pending rows that already have a
+// recorded error as sent (cosmetic — no retry, no resend).
+//
+// Usage:
+//
+//	set -a && source deploy/prod.env && set +a
+//	go run ./cmd/notif-diag                              # diagnose
+//	go run ./cmd/notif-diag --mark-failed-as-sent --yes  # clean up errored backlog
+package main
+
+import (
+	"bufio"
+	"fmt"
+	"os"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/rs/zerolog"
+	"github.com/rs/zerolog/log"
+	"gorm.io/driver/postgres"
+	"gorm.io/gorm"
+	"gorm.io/gorm/logger"
+)
+
+// main connects to Postgres using the DSN from buildDSN, prints the four
+// read-only diagnostic reports, and then, only when --mark-failed-as-sent is
+// present, runs the cleanup (with --yes skipping its confirmation prompt).
+func main() {
+	secretPath := stringFlag("password-file", "deploy/secrets/postgres_password.txt",
+		"Path to file containing POSTGRES_PASSWORD (used if env var is empty)")
+	doCleanup := boolFlag("mark-failed-as-sent",
+		"Mark every pending row with a non-empty error_message as sent. Cosmetic only — does not retry the push.")
+	assumeYes := boolFlag("yes", "Skip the interactive confirmation prompt for destructive actions.")
+
+	log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr, TimeFormat: time.RFC3339})
+
+	dsn, host, err := buildDSN(*secretPath)
+	if err != nil {
+		log.Fatal().Err(err).Msg("failed to build database DSN")
+	}
+
+	gormCfg := &gorm.Config{Logger: logger.Default.LogMode(logger.Silent)}
+	db, err := gorm.Open(postgres.Open(dsn), gormCfg)
+	if err != nil {
+		log.Fatal().Err(err).Msg("failed to connect to database")
+	}
+
+	fmt.Printf("DB host: %s\n", host)
+	fmt.Println(strings.Repeat("=", 80))
+
+	// Read-only reports, printed in a fixed order.
+	reports := []func(*gorm.DB){overallTotals, pendingByType, recentPending, deviceCounts}
+	for _, report := range reports {
+		report(db)
+	}
+
+	if !*doCleanup {
+		return
+	}
+	markFailedAsSent(db, *assumeYes)
+}
+
+// markFailedAsSent flips every pending notification row that carries a
+// non-empty error_message to sent=true, setting sent_at to updated_at (or
+// NOW() when updated_at is NULL). The change is cosmetic: it only takes the
+// rows out of the "pending" count; nothing is re-sent.
+//
+// It prints the candidate count first and returns early when there is
+// nothing to do. Unless skipPrompt is true, the operator must type "yes" on
+// stdin before the UPDATE runs. Database and input errors are fatal.
+func markFailedAsSent(db *gorm.DB, skipPrompt bool) {
+	var stuck int64
+	countQuery := db.Raw(`
+		SELECT COUNT(*) FROM notifications_notification
+		WHERE sent = false AND error_message IS NOT NULL AND error_message <> ''
+	`)
+	if err := countQuery.Scan(&stuck).Error; err != nil {
+		log.Fatal().Err(err).Msg("failed to count cleanup candidates")
+	}
+
+	fmt.Printf("\n# Cleanup candidate count: %d\n", stuck)
+	if stuck == 0 {
+		fmt.Println("  (nothing to clean up)")
+		return
+	}
+	for _, line := range []string{
+		"  These rows have a recorded send error and will never be retried.",
+		"  Marking them sent=true is cosmetic — it just prevents them from",
+		"  showing up as pending in admin dashboards going forward.",
+	} {
+		fmt.Println(line)
+	}
+
+	if !skipPrompt {
+		fmt.Printf("\nProceed? Type 'yes' to update %d rows: ", stuck)
+		answer, err := bufio.NewReader(os.Stdin).ReadString('\n')
+		if err != nil {
+			log.Fatal().Err(err).Msg("failed to read confirmation")
+		}
+		if confirmed := strings.TrimSpace(answer) == "yes"; !confirmed {
+			fmt.Println("Aborted.")
+			return
+		}
+	}
+
+	update := db.Exec(`
+		UPDATE notifications_notification
+		SET sent = true, sent_at = COALESCE(updated_at, NOW())
+		WHERE sent = false AND error_message IS NOT NULL AND error_message <> ''
+	`)
+	if err := update.Error; err != nil {
+		log.Fatal().Err(err).Msg("failed to update rows")
+	}
+	fmt.Printf("OK — updated %d rows.\n", update.RowsAffected)
+}
+
+// overallTotals prints the table-wide notification counts: total, sent,
+// pending (sent = false), read, and errored (non-empty error_message).
+//
+// A failed query is reported as an ERROR line under the section header
+// instead of being printed as a row of zeros.
+func overallTotals(db *gorm.DB) {
+	type row struct {
+		Total   int64
+		Sent    int64
+		Pending int64
+		Read    int64
+		Errored int64
+	}
+	var r row
+	err := db.Raw(`
+		SELECT
+			COUNT(*)                                                  AS total,
+			COUNT(*) FILTER (WHERE sent = true)                       AS sent,
+			COUNT(*) FILTER (WHERE sent = false)                      AS pending,
+			COUNT(*) FILTER (WHERE read = true)                       AS read,
+			COUNT(*) FILTER (WHERE error_message IS NOT NULL AND error_message <> '') AS errored
+		FROM notifications_notification
+	`).Scan(&r).Error
+
+	fmt.Println("\n# Overall notification counts")
+	if err != nil {
+		fmt.Printf("  ERROR: %v\n", err)
+		return
+	}
+	fmt.Printf("  total:   %d\n", r.Total)
+	fmt.Printf("  sent:    %d\n", r.Sent)
+	fmt.Printf("  pending: %d\n", r.Pending)
+	fmt.Printf("  read:    %d\n", r.Read)
+	fmt.Printf("  errored: %d  (rows with non-empty error_message)\n", r.Errored)
+}
+
+// pendingByType prints one line per notification_type for rows with
+// sent = false: the pending count, how many carry an error, how many were
+// created in the last 24 hours and 7 days, and the oldest/newest created_at.
+// Types are ordered by their newest pending row, most recent first.
+//
+// A failed query is reported as an ERROR line under the section header
+// instead of being shown as "(no pending notifications)".
+func pendingByType(db *gorm.DB) {
+	type row struct {
+		NotificationType string
+		PendingCount     int64
+		Oldest           *time.Time
+		Newest           *time.Time
+		WithErrors       int64
+		Last24h          int64
+		Last7d           int64
+	}
+	var rows []row
+	err := db.Raw(`
+		SELECT
+			notification_type,
+			COUNT(*)                                       AS pending_count,
+			MIN(created_at)                                AS oldest,
+			MAX(created_at)                                AS newest,
+			COUNT(*) FILTER (WHERE error_message IS NOT NULL AND error_message <> '') AS with_errors,
+			COUNT(*) FILTER (WHERE created_at > NOW() - INTERVAL '24 hours')          AS last_24h,
+			COUNT(*) FILTER (WHERE created_at > NOW() - INTERVAL '7 days')            AS last_7d
+		FROM notifications_notification
+		WHERE sent = false
+		GROUP BY notification_type
+		ORDER BY MAX(created_at) DESC NULLS LAST
+	`).Scan(&rows).Error
+
+	fmt.Println("\n# Pending rows by type")
+	if err != nil {
+		fmt.Printf("  ERROR: %v\n", err)
+		return
+	}
+	if len(rows) == 0 {
+		fmt.Println("  (no pending notifications)")
+		return
+	}
+	fmt.Printf("  %-22s  %7s  %7s  %7s  %7s  %-19s  %-19s\n",
+		"TYPE", "PENDING", "ERRORED", "LAST24H", "LAST7D", "OLDEST", "NEWEST")
+	for _, r := range rows {
+		fmt.Printf("  %-22s  %7d  %7d  %7d  %7d  %-19s  %-19s\n",
+			r.NotificationType, r.PendingCount, r.WithErrors, r.Last24h, r.Last7d,
+			fmtTime(r.Oldest), fmtTime(r.Newest))
+	}
+}
+
+// recentPending prints the five most recently created rows with
+// sent = false: id, user, creation time, type, any recorded error, and the
+// title and body truncated to 100 bytes each.
+//
+// A failed query is reported as an ERROR line under the section header
+// instead of being shown as "(none)".
+func recentPending(db *gorm.DB) {
+	type row struct {
+		ID               uint
+		UserID           uint
+		NotificationType string
+		Title            string
+		Body             string
+		ErrorMessage     string
+		CreatedAt        time.Time
+	}
+	var rows []row
+	err := db.Raw(`
+		SELECT id, user_id, notification_type, title, body, COALESCE(error_message, '') AS error_message, created_at
+		FROM notifications_notification
+		WHERE sent = false
+		ORDER BY created_at DESC
+		LIMIT 5
+	`).Scan(&rows).Error
+
+	fmt.Println("\n# 5 most recent pending notifications")
+	if err != nil {
+		fmt.Printf("  ERROR: %v\n", err)
+		return
+	}
+	if len(rows) == 0 {
+		fmt.Println("  (none)")
+		return
+	}
+	for _, r := range rows {
+		// The error, when present, goes on its own indented line after the type.
+		errPart := ""
+		if r.ErrorMessage != "" {
+			errPart = fmt.Sprintf("\n      error: %s", r.ErrorMessage)
+		}
+		fmt.Printf("  [%d] user=%d  %s  %s%s\n      title: %s\n      body:  %s\n",
+			r.ID, r.UserID, r.CreatedAt.Format("2006-01-02 15:04:05"), r.NotificationType, errPart,
+			truncate(r.Title, 100), truncate(r.Body, 100))
+	}
+}
+
+// deviceCounts prints, for the APNs and GCM device tables in turn, the total
+// row count, the active count, the count with a non-NULL user_id, and the
+// number of distinct users. A query error for one table is printed on that
+// table's line and the other table is still reported.
+func deviceCounts(db *gorm.DB) {
+	type counts struct {
+		Total         int64
+		Active        int64
+		WithUser      int64
+		DistinctUsers int64
+	}
+	// Table names come from this fixed list only, never from user input.
+	sources := []struct {
+		label string
+		table string
+	}{
+		{label: "APNs (iOS)", table: "push_notifications_apnsdevice"},
+		{label: "GCM (Android)", table: "push_notifications_gcmdevice"},
+	}
+
+	fmt.Println("\n# Registered push devices")
+	for _, src := range sources {
+		query := fmt.Sprintf(`
+			SELECT
+				COUNT(*)                                              AS total,
+				COUNT(*) FILTER (WHERE active = true)                 AS active,
+				COUNT(*) FILTER (WHERE user_id IS NOT NULL)           AS with_user,
+				COUNT(DISTINCT user_id)                               AS distinct_users
+			FROM %s
+		`, src.table)
+
+		var c counts
+		if err := db.Raw(query).Scan(&c).Error; err != nil {
+			fmt.Printf("  %-15s  ERROR: %v\n", src.label, err)
+			continue
+		}
+		fmt.Printf("  %-15s  total=%-5d  active=%-5d  with_user=%-5d  distinct_users=%d\n",
+			src.label, c.Total, c.Active, c.WithUser, c.DistinctUsers)
+	}
+}
+
+func buildDSN(passwordFile string) (dsn, host string, err error) {
+	host = os.Getenv("DB_HOST")
+	user := os.Getenv("POSTGRES_USER")
+	dbname := os.Getenv("POSTGRES_DB")
+	sslmode := os.Getenv("DB_SSLMODE")
+	if sslmode == "" {
+		sslmode = "require"
+	}
+
+	port := 5432
+	if s := os.Getenv("DB_PORT"); s != "" {
+		p, perr := strconv.Atoi(s)
+		if perr != nil {
+			return "", "", fmt.Errorf("invalid DB_PORT %q: %w", s, perr)
+		}
+		port = p
+	}
+
+	password := os.Getenv("POSTGRES_PASSWORD")
+	if password == "" && passwordFile != "" {
+		b, rerr := os.ReadFile(passwordFile)
+		if rerr != nil {
+			return "", "", fmt.Errorf("POSTGRES_PASSWORD not set and could not read %s: %w", passwordFile, rerr)
+		}
+		password = strings.TrimRight(string(b), "\r\n")
+	}
+
+	missing := []string{}
+	if host == "" {
+		missing = append(missing, "DB_HOST")
+	}
+	if user == "" {
+		missing = append(missing, "POSTGRES_USER")
+	}
+	if dbname == "" {
+		missing = append(missing, "POSTGRES_DB")
+	}
+	if password == "" {
+		missing = append(missing, "POSTGRES_PASSWORD")
+	}
+	if len(missing) > 0 {
+		return "", "", fmt.Errorf("missing required env vars: %s", strings.Join(missing, ", "))
+	}
+
+	dsn = fmt.Sprintf("host=%s port=%d user=%s password=%s dbname=%s sslmode=%s",
+		host, port, user, password, dbname, sslmode)
+	return dsn, host, nil
+}
+
+// stringFlag is a minimal stand-in for flag.String that scans os.Args
+// directly. It returns a pointer to the value given for --name, or to def
+// when the flag is absent.
+//
+// Both "--name=value" and "--name value" are accepted. In the two-argument
+// form the next argument is not consumed when it itself starts with "--", so
+// a flag that is missing its value cannot swallow the following flag. When
+// the flag appears more than once the last occurrence wins. _usage is
+// accepted for call-site documentation only and is not used.
+func stringFlag(name, def, _usage string) *string {
+	v := def
+	bare := "--" + name
+	prefix := bare + "="
+	args := os.Args[1:]
+	for i := 0; i < len(args); i++ {
+		switch a := args[i]; {
+		case strings.HasPrefix(a, prefix):
+			v = strings.TrimPrefix(a, prefix)
+		case a == bare && i+1 < len(args) && !strings.HasPrefix(args[i+1], "--"):
+			i++ // the value is the next argument; skip over it
+			v = args[i]
+		}
+	}
+	return &v
+}
+
+// boolFlag returns a pointer to true when the exact argument "--name"
+// appears anywhere after the program name in os.Args, and to false
+// otherwise. No "--name=value" form is recognised; _usage is unused.
+func boolFlag(name, _usage string) *bool {
+	present := false
+	for _, arg := range os.Args[1:] {
+		if arg == "--"+name {
+			present = true
+			break
+		}
+	}
+	return &present
+}
+
+func fmtTime(t *time.Time) string {
+	if t == nil {
+		return "-"
+	}
+	return t.Format("2006-01-02 15:04:05")
+}
+
+// truncate shortens s to at most n bytes and appends "…" when anything was
+// cut. Strings of n bytes or fewer are returned unchanged.
+//
+// The cut always lands on a UTF-8 rune boundary: when byte n falls inside a
+// multi-byte character, the whole character is dropped rather than leaving
+// a partial, invalid sequence in the output.
+func truncate(s string, n int) string {
+	if len(s) <= n {
+		return s
+	}
+	// Ranging over a string yields the byte offset at which each rune
+	// starts; keep the largest such offset that does not exceed n.
+	cut := 0
+	for i := range s {
+		if i > n {
+			break
+		}
+		cut = i
+	}
+	return s[:cut] + "…"
+}
@@ -0,0 +1,59 @@
+// send-test-push enqueues a one-shot Asynq push notification task. The worker
+// picks it up and routes it through internal/push/Client.SendToAll, which now
+// hits APNs production. Verifies end-to-end that push delivery is working
+// without waiting for the next cron tick.
+//
+// Usage:
+//
+//	# Port-forward Redis from the cluster first:
+//	kubectl --kubeconfig ~/.kube/honeydue-k3s.yaml -n honeydue port-forward svc/redis 6379:6379
+//
+//	# Then in another shell:
+//	go run ./cmd/send-test-push --user-id 6 --title "Test" --message "Hello from notif-diag"
+package main
+
+import (
+	"flag"
+	"fmt"
+	"os"
+	"strconv"
+
+	"github.com/hibiken/asynq"
+
+	"github.com/treytartt/honeydue-api/internal/worker/jobs"
+)
+
+// main parses the command-line flags, builds a single send-push task for the
+// target user, and enqueues it on the "default" Asynq queue through the Redis
+// instance at --redis.
+//
+// Exit codes: 2 when --user-id is missing or zero, 1 when the task cannot be
+// built or enqueued, 0 on success.
+func main() {
+	userID := flag.Uint("user-id", 0, "Target auth_user.id (required)")
+	title := flag.String("title", "Test push", "Notification title")
+	message := flag.String("message", "Hello from send-test-push", "Notification body")
+	redisAddr := flag.String("redis", "localhost:6379", "Redis host:port (use kubectl port-forward to reach the in-cluster redis)")
+	flag.Parse()
+
+	if *userID == 0 {
+		fmt.Fprintln(os.Stderr, "--user-id is required")
+		os.Exit(2)
+	}
+
+	task, err := jobs.NewSendPushTask(*userID, *title, *message, map[string]string{
+		"type":    "test",
+		"user_id": strconv.FormatUint(uint64(*userID), 10),
+	})
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "build task: %v\n", err)
+		os.Exit(1)
+	}
+
+	client := asynq.NewClient(asynq.RedisClientOpt{Addr: *redisAddr})
+	info, err := client.Enqueue(task, asynq.Queue("default"), asynq.MaxRetry(3))
+	// Close explicitly rather than via defer: os.Exit below terminates the
+	// process without running deferred calls, so a deferred Close would be
+	// skipped on the failure path. The close error is ignored on purpose; the
+	// enqueue result is what matters for this one-shot tool.
+	_ = client.Close()
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "enqueue: %v\n", err)
+		os.Exit(1)
+	}
+
+	fmt.Printf("Enqueued task: id=%s queue=%s type=%s\n", info.ID, info.Queue, info.Type)
+	fmt.Printf("Tail worker logs to see the result:\n")
+	// Space-separated --kubeconfig so the shell expands the leading "~"; in the
+	// "--kubeconfig=~/..." form the tilde is passed through literally.
+	fmt.Printf("  kubectl --kubeconfig ~/.kube/honeydue-k3s.yaml -n honeydue logs deploy/worker --tail=20 -f\n")
+}
@@ -11,13 +11,19 @@ import (
 	"github.com/hibiken/asynq"
 	"github.com/redis/go-redis/v9"
 	"github.com/rs/zerolog/log"
+	"go.opentelemetry.io/otel/attribute"
+	"go.opentelemetry.io/otel/codes"
+	"go.opentelemetry.io/otel/trace"

 	"github.com/treytartt/honeydue-api/internal/config"
 	"github.com/treytartt/honeydue-api/internal/database"
 	"github.com/treytartt/honeydue-api/internal/monitoring"
+	"github.com/treytartt/honeydue-api/internal/prom"
 	"github.com/treytartt/honeydue-api/internal/push"
 	"github.com/treytartt/honeydue-api/internal/repositories"
 	"github.com/treytartt/honeydue-api/internal/services"
+	"github.com/treytartt/honeydue-api/internal/tracing"
+	"github.com/treytartt/honeydue-api/internal/worker"
 	"github.com/treytartt/honeydue-api/internal/worker/jobs"
 	"github.com/treytartt/honeydue-api/pkg/utils"
 )
@@ -40,6 +46,29 @@ func main() {
 		os.Exit(0)
 	}

+	// Initialize OpenTelemetry tracing for the worker process. Same OTLP
+	// destination as the api; service.name distinguishes them in Jaeger.
+	// config.SecretValue (not os.Getenv) so file-mounted secrets resolve
+	// after audit F8 removed these from the process environment.
+	tracingShutdown, err := tracing.Init(context.Background(), tracing.Config{
+		ServiceName: "honeydue-worker",
+		Environment: workerDeploymentEnv(cfg.Server.Debug),
+		EndpointURL: config.SecretValue("OBS_TRACES_URL"),
+		BearerToken: config.SecretValue("OBS_INGEST_TOKEN"),
+		SampleRatio: tracing.SampleRatioFromEnv(),
+	})
+	if err != nil {
+		log.Error().Err(err).Msg("worker tracing init failed — continuing without traces")
+	}
+	defer func() {
+		shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+		defer cancel()
+		if err := tracingShutdown(shutdownCtx); err != nil {
+			log.Warn().Err(err).Msg("worker tracing shutdown error")
+		}
+	}()
+	asynqTracer := tracing.Tracer("honeydue/worker/asynq")
+
 	// Initialize database
 	db, err := database.Connect(&cfg.Database, cfg.Server.Debug)
 	if err != nil {
@@ -80,6 +109,17 @@ func main() {
 	if err != nil {
 		log.Fatal().Err(err).Msg("Failed to parse Redis URL")
 	}
+	// Audit HIGH-1: the Redis password is a file-mounted secret (REDIS_PASSWORD),
+	// not embedded in REDIS_URL — REDIS_URL travels in the honeydue-config
+	// ConfigMap. Apply the password onto the parsed opt so the Asynq server,
+	// inspector and monitoring client (all derived from redisOpt below)
+	// authenticate against a requirepass-protected Redis.
+	if cfg.Redis.Password != "" {
+		if clientOpt, ok := redisOpt.(asynq.RedisClientOpt); ok {
+			clientOpt.Password = cfg.Redis.Password
+			redisOpt = clientOpt
+		}
+	}

 	// Initialize monitoring service (if Redis is available)
 	var monitoringService *monitoring.Service
@@ -141,14 +181,62 @@ func main() {
 	// Create job handler
 	jobHandler := jobs.NewHandler(db, pushClient, emailService, notificationService, cfg)

+	// Wire upload service for the pending_uploads cleanup cron AND share the
+	// underlying storage service with the TaskService below so the worker can
+	// load completion images for email embedding. Storage may be local-disk
+	// (no S3 backend), in which case the upload service stays nil and the
+	// cleanup handler no-ops. Cache is optional — the cleanup path doesn't
+	// rate-limit and works fine with a nil cache.
+	var sharedStorageService *services.StorageService
+	if storageService, sErr := services.NewStorageService(&cfg.Storage); sErr == nil {
+		sharedStorageService = storageService
+		if s3 := storageService.S3Backend(); s3 != nil {
+			pendingUploadRepo := repositories.NewPendingUploadRepository(db)
+			uploadService := services.NewUploadService(pendingUploadRepo, s3, &cfg.Storage, nil)
+			jobHandler.SetUploadService(uploadService)
+		}
+	} else {
+		log.Warn().Err(sErr).Msg("Failed to initialize storage service for upload cleanup; cleanup cron will no-op")
+	}
+
+	// Wire a TaskService for the task-completed notification handler. The
+	// worker re-creates this (vs. importing the api's wired instance) because
+	// each binary owns its own dependency graph. The handler is fully nil-safe
+	// — if any of the wired services are absent, the corresponding side of
+	// notification delivery (push or email) is skipped.
+	taskRepo := repositories.NewTaskRepository(db)
+	residenceRepo := repositories.NewResidenceRepository(db)
+	workerTaskService := services.NewTaskService(taskRepo, residenceRepo)
+	if notificationService != nil {
+		workerTaskService.SetNotificationService(notificationService)
+	}
+	if emailService != nil {
+		workerTaskService.SetEmailService(emailService)
+	}
+	if sharedStorageService != nil {
+		workerTaskService.SetStorageService(sharedStorageService)
+	}
+	jobHandler.SetTaskService(workerTaskService)
+
 	// Create Asynq mux and register handlers
 	mux := asynq.NewServeMux()
+
+	// Tracing + metrics middleware: every job runs inside a span and emits
+	// asynq_job_duration_seconds{task_type,result}.
+	mux.Use(asynqTracingMiddleware(asynqTracer))
+
 	mux.HandleFunc(jobs.TypeSmartReminder, jobHandler.HandleSmartReminder)
 	mux.HandleFunc(jobs.TypeDailyDigest, jobHandler.HandleDailyDigest)
 	mux.HandleFunc(jobs.TypeSendEmail, jobHandler.HandleSendEmail)
 	mux.HandleFunc(jobs.TypeSendPush, jobHandler.HandleSendPush)
 	mux.HandleFunc(jobs.TypeOnboardingEmails, jobHandler.HandleOnboardingEmails)
 	mux.HandleFunc(jobs.TypeReminderLogCleanup, jobHandler.HandleReminderLogCleanup)
+	mux.HandleFunc(jobs.TypeUploadCleanup, jobHandler.HandleUploadCleanup)
+	mux.HandleFunc(jobs.TypeNotificationCleanup, jobHandler.HandleNotificationCleanup)
+	mux.HandleFunc(jobs.TypeWebhookLogCleanup, jobHandler.HandleWebhookLogCleanup)
+	mux.HandleFunc(jobs.TypeAuditLogCleanup, jobHandler.HandleAuditLogCleanup)
+	mux.HandleFunc(worker.TypeTaskCompletedNotification, jobHandler.HandleTaskCompletedNotification)
+	mux.HandleFunc(worker.TypeDataExport, jobHandler.HandleDataExport)

 	// Register email job handlers (welcome, verification, password reset, password changed)
 	if emailService != nil {
@@ -188,6 +276,32 @@ func main() {
 	}
 	log.Info().Str("cron", "0 3 * * *").Msg("Registered reminder log cleanup job (runs daily at 3:00 AM UTC)")

+	// Schedule pending_uploads cleanup (hourly at :30 to avoid colliding with
+	// the top-of-hour reminder + digest crons). Reaps unclaimed expired
+	// upload sessions; the B2 bucket lifecycle (7 days on uploads/ prefix)
+	// is the backstop if this worker is offline for an extended period.
+	if _, err := scheduler.Register("30 * * * *", asynq.NewTask(jobs.TypeUploadCleanup, nil)); err != nil {
+		log.Fatal().Err(err).Msg("Failed to register upload cleanup job")
+	}
+	log.Info().Str("cron", "30 * * * *").Msg("Registered pending_uploads cleanup job (runs hourly)")
+
+	// Data-retention cleanups (BE-2). Staggered off the 3:00 reminder cleanup to
+	// avoid piling DELETEs onto the same Neon connection window.
+	if _, err := scheduler.Register("0 2 * * *", asynq.NewTask(jobs.TypeNotificationCleanup, nil)); err != nil {
+		log.Fatal().Err(err).Msg("Failed to register notification cleanup job")
+	}
+	log.Info().Str("cron", "0 2 * * *").Msg("Registered notification cleanup job (daily 02:00 UTC, 90d retention)")
+
+	if _, err := scheduler.Register("30 2 * * 0", asynq.NewTask(jobs.TypeWebhookLogCleanup, nil)); err != nil {
+		log.Fatal().Err(err).Msg("Failed to register webhook log cleanup job")
+	}
+	log.Info().Str("cron", "30 2 * * 0").Msg("Registered webhook log cleanup job (weekly Sun 02:30 UTC, 180d retention)")
+
+	if _, err := scheduler.Register("30 3 * * 0", asynq.NewTask(jobs.TypeAuditLogCleanup, nil)); err != nil {
+		log.Fatal().Err(err).Msg("Failed to register audit log cleanup job")
+	}
+	log.Info().Str("cron", "30 3 * * 0").Msg("Registered audit log cleanup job (weekly Sun 03:30 UTC, 365d retention)")
+
 	// Handle graceful shutdown
 	quit := make(chan os.Signal, 1)
 	signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)
@@ -199,6 +313,12 @@ func main() {
 		w.WriteHeader(http.StatusOK)
 		_, _ = w.Write([]byte(`{"status":"ok"}`))
 	})
+	// Expose Prometheus metrics so vmagent can scrape the worker. The
+	// apns_send_*, fcm_send_*, asynq_job_* and cache_ops_* series have been
+	// recorded on this process all along — they were just never exposed, which
+	// is why those dashboard panels read empty. Same :6060 as health; in-cluster
+	// only (not externally published).
+	healthMux.Handle("/metrics", prom.HTTPHandler())
 	healthSrv := &http.Server{
 		Addr:              workerHealthAddr,
 		Handler:           healthMux,
@@ -238,3 +358,44 @@ func main() {

 	log.Info().Msg("Worker stopped")
 }
+
+// asynqTracingMiddleware returns an asynq.MiddlewareFunc that opens a span
+// per task execution and records asynq_job_duration_seconds via
+// prom.ObserveAsynqJob. Span attributes include the task type, payload size,
+// queue name and retry count (the latter two only when asynq exposes them on
+// the handler context), and the result outcome ("ok" or "error").
+//
+// The wrapped handler's error is returned unchanged, so asynq's own retry
+// handling is not affected by the middleware.
+func asynqTracingMiddleware(tracer trace.Tracer) asynq.MiddlewareFunc {
+	return func(next asynq.Handler) asynq.Handler {
+		return asynq.HandlerFunc(func(ctx context.Context, t *asynq.Task) error {
+			ctx, span := tracer.Start(ctx, "asynq.handle:"+t.Type(),
+				trace.WithAttributes(
+					attribute.String("asynq.task_type", t.Type()),
+					attribute.Int("asynq.payload_bytes", len(t.Payload())),
+				),
+			)
+			defer span.End()
+
+			// Queue and retry count live in the handler context; the ok flag
+			// is false when the context did not originate from an asynq
+			// server, in which case the attribute is simply omitted.
+			if queue, ok := asynq.GetQueueName(ctx); ok {
+				span.SetAttributes(attribute.String("asynq.queue", queue))
+			}
+			if retries, ok := asynq.GetRetryCount(ctx); ok {
+				span.SetAttributes(attribute.Int("asynq.retry_count", retries))
+			}
+
+			start := time.Now()
+			err := next.ProcessTask(ctx, t)
+			dur := time.Since(start)
+			result := "ok"
+			if err != nil {
+				result = "error"
+				span.SetStatus(codes.Error, err.Error())
+				span.RecordError(err)
+			}
+			span.SetAttributes(attribute.String("asynq.result", result))
+			prom.ObserveAsynqJob(t.Type(), result, dur)
+			return err
+		})
+	}
+}
+
+// workerDeploymentEnv picks the deployment environment label for the worker:
+// a non-empty DEPLOYMENT_ENVIRONMENT variable wins, otherwise "dev" when debug
+// is set and "prod" when it is not. It mirrors deploymentEnvironment in
+// cmd/api/main.go.
+func workerDeploymentEnv(debug bool) string {
+	explicit := os.Getenv("DEPLOYMENT_ENVIRONMENT")
+	switch {
+	case explicit != "":
+		return explicit
+	case debug:
+		return "dev"
+	default:
+		return "prod"
+	}
+}
@@ -42,7 +42,7 @@ email:
 push:
  apns_key_id: ""
  apns_team_id: ""
-  apns_topic: com.tt.honeyDue
+  apns_topic: com.myhoneydue.honeyDue.dev
  apns_production: false
  apns_use_sandbox: true                # Sandbox for dev

@@ -85,8 +85,9 @@ tls:
  # If mode=cloudflare, create secrets/cloudflare-origin.crt and .key

 # --- Apple Auth / IAP (optional) ---
+# client_id MUST equal the iOS Debug bundle ID for the dev backend.
 apple_auth:
-  client_id: ""
+  client_id: "com.myhoneydue.honeyDue.dev"
  team_id: ""
  iap_key_id: ""
  iap_issuer_id: ""
@@ -92,7 +92,7 @@ ADMIN_PW="$(openssl rand -base64 16)"

 EMAIL_USER="treytartt@fastmail.com"
 APNS_KEY_ID="9R5Q7ZX874"
-APNS_TEAM_ID="V3PF3M6B6U"
+APNS_TEAM_ID="X86BR9WTLD"

 log ""
 log "Pre-filled from existing dev server:"
@@ -147,7 +147,7 @@ email:
 push:
  apns_key_id: "${APNS_KEY_ID}"
  apns_team_id: "${APNS_TEAM_ID}"
-  apns_topic: com.tt.honeyDue
+  apns_topic: com.myhoneydue.honeyDue.dev
  apns_production: false
  apns_use_sandbox: true

@@ -189,7 +189,7 @@ tls:

 # --- Apple Auth / IAP ---
 apple_auth:
-  client_id: "com.tt.honeyDue"
+  client_id: "com.myhoneydue.honeyDue.dev"
  team_id: "${APNS_TEAM_ID}"
  iap_key_id: ""
  iap_issuer_id: ""
@@ -3,6 +3,7 @@ config.yaml

 # Generated files
 kubeconfig
+kubeconfig.*
 cluster-config.yaml
 prod.env

@@ -0,0 +1,966 @@
+# honeyDue k3s Cluster — Operations Runbook
+
+Living document for the honeyDue production cluster. Add entries when you hit
+something non-obvious so future-you (or your replacement) doesn't have to
+rediscover it.
+
+Last full revision: **2026-06-03** (Hetzner → OVH BHS cutover; cluster solo
+production from that date forward). For pre-OVH history, see
+`MIGRATION_NOTES.md` (Swarm → k3s migration on Hetzner, 2026-04-24).
+
+---
+
+## 1. Topology and inventory
+
+### Hosting
+
+| | |
+|---|---|
+| Provider | OVHcloud (us.ovhcloud.com) |
+| Datacenter | BHS — Beauharnois, Quebec, Canada |
+| Plan | VPS-1 × 3 (~$6.46/mo each, ~$19/mo total) |
+| Node spec | 4 vCPU (Intel Haswell, shared), 7.6 GB RAM, 75 GB NVMe |
+| Public bandwidth | 400 Mbps per node, unlimited traffic |
+| Private network | **None.** Nodes have public IPv4 + IPv6 only; inter-node traffic crosses the public internet (encrypted by flannel WireGuard backend — see §3) |
+
+### Nodes
+
+| SSH alias | Kubernetes node name | Public IPv4 | Public IPv6 | Roles |
+|---|---|---|---|---|
+| `ovhcloud1` | `vps-1624d691` | `51.81.83.33` | `2604:2dc0:101:200::5a9a` | control-plane, etcd, redis-pinned |
+| `ovhcloud2` | `vps-c0f51be2` | `51.81.87.86` | `2604:2dc0:101:200::30d4` | control-plane, etcd |
+| `ovhcloud3` | `vps-dbca24c7` | `51.81.85.248` | `2604:2dc0:101:200::450f` | control-plane, etcd |
+
+The cluster is **all-control-plane** (workloads schedule on the same nodes that
+run etcd and the API server). `vps-1624d691` carries the
+`honeydue/redis=true` label so the Redis Deployment's `nodeSelector` binds
+there; the Redis PVC (`local-path`, host-pinned) lives on that node's disk.
+
+### SSH access
+
+`~/.ssh/config` entries (operator workstation):
+
+```
+Host ovhcloud1
+    HostName 51.81.83.33
+    Port 22
+    User ubuntu
+    IdentityFile ~/.ssh/ovhcloud
+    IdentitiesOnly yes
+Host ovhcloud2
+    HostName 51.81.87.86
+    Port 22
+    User ubuntu
+    IdentityFile ~/.ssh/ovhcloud
+    IdentitiesOnly yes
+Host ovhcloud3
+    HostName 51.81.85.248
+    Port 22
+    User ubuntu
+    IdentityFile ~/.ssh/ovhcloud
+    IdentitiesOnly yes
+```
+
+`ubuntu` has passwordless sudo (`/etc/sudoers.d/90-cloud-init-users` from OVH's
+cloud-init).
+
+### kubectl access
+
+```bash
+export KUBECONFIG=/Users/treyt/Desktop/code/honeyDue/honeyDueAPI-go/deploy-k3s/kubeconfig
+kubectl get nodes
+```
+
+The `deploy-k3s/kubeconfig` file (mode 0600, gitignored) is the OVH cluster's
+admin kubeconfig with `server: https://51.81.83.33:6443`. A stale Hetzner copy
+lives next to it as `kubeconfig.hetzner.bak` for historical reference; the
+Hetzner cluster is powered off and that file's API server is unreachable.
+
+To refresh from the cluster (if the local copy is lost or rotated):
+
+```bash
+ssh ovhcloud1 'sudo cat /etc/rancher/k3s/k3s.yaml' \
+  | sed 's|server: https://127.0.0.1:6443|server: https://51.81.83.33:6443|' \
+  > deploy-k3s/kubeconfig
+chmod 600 deploy-k3s/kubeconfig
+```
+
+The k3s API at `:6443` is open to the public internet (token-protected).
+
+---
+
+## 2. Software
+
+### Kernel-level
+
+| | |
+|---|---|
+| OS | Ubuntu 26.04 LTS (set by OVH's VPS-1 image) |
+| Kernel | `7.0.0-14-generic` |
+| Init | systemd |
+| Container runtime | containerd 2.2.2 (bundled with k3s) |
+| Firewall | `ufw` (per-node, configured at install — see §3) |
+| Other host packages | `fail2ban` (SSH brute-force protection, default jail), `unattended-upgrades` (security updates), `open-iscsi` (k3s prereq for some storage backends), `curl` |
+
+### Kubernetes
+
+| | |
+|---|---|
+| Distribution | k3s |
+| Version | **`v1.34.6+k3s1`** (pinned in `config.yaml:cluster.k3s_version`) |
+| Control plane | 3-node HA, embedded etcd (no external Postgres backing store) |
+| CNI / networking | flannel with **WireGuard-native backend** (`--flannel-backend=wireguard-native`). Encrypts pod-to-pod and etcd peer traffic because nodes only have public IPs (no private network). ~3-5% CPU overhead under load. |
+| Service LB | klipper-lb (default k3s `servicelb`). The `svclb-traefik` DaemonSet binds host ports `:80` and `:443` on each node and forwards to the Traefik Service. **Not** the DaemonSet-w/-hostNetwork Traefik pattern used on the old Hetzner cluster — see §10 *Differences from MIGRATION_NOTES*. |
+| Ingress controller | Traefik (k3s default), single-replica Deployment, exposed via klipper-lb |
+| DNS | CoreDNS (k3s default) |
+| Secrets encryption | Enabled (`--secrets-encryption`); etcd values are AES-CBC encrypted at rest |
+| kubeconfig perms | `0600` (`--write-kubeconfig-mode=0600`) |
+| Cloud controller | Disabled (`--disable-cloud-controller`) — no provider integration on OVH |
+| Misc | `--node-ip` / `--node-external-ip` / `--advertise-address` all set to each node's public IPv4. TLS SANs cover all 3 IPs so any IP can serve the API. |
+
+### Application stack (in cluster, `honeydue` namespace)
+
+| Deployment | Replicas | Image (digest-pinned) | Notes |
+|---|---:|---|---|
+| `api` | 3 | `gitea.treytartt.com/admin/honeydue-api@sha256:34fde6...` | Go REST API on `:8000`, exposes `/metrics` |
+| `web` | 3 | `gitea.treytartt.com/admin/honeydue-web@sha256:8c62cf...` | Next.js, server-side proxy to api |
+| `admin` | 1 | `gitea.treytartt.com/admin/honeydue-admin@sha256:b81263...` | Next.js admin panel, gated behind Traefik basic-auth |
+| `worker` | 1 | `gitea.treytartt.com/admin/honeydue-worker@sha256:fe1f5e...` | Asynq scheduler + Redis-backed jobs (singleton — must not run as >1 replica or every cron fires N×) |
+| `redis` | 1 | `redis:7-alpine@sha256:6ab0b6...` | Pinned to `vps-1624d691` via `honeydue/redis=true`. PVC `redis-data` (local-path, 5 Gi). Password-auth required. |
+| `vmagent` | 1 | `victoriametrics/vmagent@sha256:...` (default tag) | Scrapes api `/metrics` + kube-state-metrics; remote-writes to obs.88oakapps.com |
+| `kube-state-metrics` | 1 | `kube-state-metrics@sha256:...` | In `kube-system`, scraped by vmagent for `kube_*` cluster-state metrics |
+| `alloy-logs` (DaemonSet) | 3 (1/node) | `grafana/alloy@sha256:...` | Tails `/var/log/pods/*` and ships to Loki at obs.88oakapps.com |
+
+The Asynq scheduler inside `worker` registers these cron jobs:
+
+| Cron | Job | Notes |
+|---|---|---|
+| `0 * * * *` | Smart reminder check (per-user hour) | Default user hour: 14:00 UTC |
+| `0 * * * *` | Daily digest check (per-user hour) | Default user hour: 03:00 UTC |
+| `0 10 * * *` | Onboarding emails | 10:00 UTC |
+| `0 3 * * *` | Reminder log cleanup | 03:00 UTC |
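+| `30 * * * *` | Pending uploads cleanup | xx:30 every hour |
+| `0 2 * * *` | Notification cleanup | 02:00 UTC, 90-day retention |
+| `30 2 * * 0` | Webhook log cleanup | Sundays 02:30 UTC, 180-day retention |
+| `30 3 * * 0` | Audit log cleanup | Sundays 03:30 UTC, 365-day retention |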
+
+### External dependencies
+
+| Service | Endpoint | Purpose | Failure mode |
+|---|---|---|---|
+| Neon Postgres | `ep-floral-truth-amttbc5a-pooler.c-5.us-east-1.aws.neon.tech:5432` | App data. Pooler endpoint (transaction-mode PgBouncer in front of Neon compute) so connections stay warm. | api / worker pods crash-loop with `dial tcp: connection refused`. Health endpoint returns `postgres: error`. |
+| Backblaze B2 (S3-compatible) | `s3.us-east-005.backblazeb2.com` (bucket `honeyDueProd`) | User uploads (photos, PDFs, completion attachments) | Upload routes return 5xx; reads of cached/static files still work. |
+| Cloudflare | `myhoneydue.com` zone | DNS + TLS termination + edge cache + DDoS | Traffic stops reaching origin. Direct `https://51.81.x.x` still works for diagnostics. |
+| obs.88oakapps.com | Operator-run Grafana + VictoriaMetrics + Loki | Metrics & logs | vmagent + alloy-logs back off and retry. No app-side impact. |
+| Apple APNs | `api.push.apple.com:443` (production) | iOS push notifications | Push fails; circuit breaker opens; failure logged. App functionality unaffected. |
+| Fastmail SMTP | `smtp.fastmail.com:587` | Transactional emails (verification, recovery, digests) | Email send fails in the worker; logged; user reset/digest flow degrades. |
+| Gitea registry | `gitea.treytartt.com` | Container image registry | Deploys can't pull. Existing pods keep running on cached images. |
+
+---
+
+## 3. Network and firewall
+
+### Per-node `ufw` configuration
+
+Applied during install (same on all 3 nodes):
+
+```
+default deny incoming
+default allow outgoing
+allow 22/tcp                  (SSH, world)
+allow 80/tcp                  (HTTP via Cloudflare, world — see GAP-1)
+allow 443/tcp                 (HTTPS, same — GAP-1)
+allow 6443/tcp                (k3s API, world, token-protected)
+allow 2379:2380/tcp from <other 2 OVH IPs>   (etcd client + peer)
+allow 10250/tcp from <other 2 OVH IPs>       (kubelet)
+allow 51820/udp from <other 2 OVH IPs>       (WireGuard tunnel)
+allow 8472/udp  from <other 2 OVH IPs>       (VXLAN, defense-in-depth fallback)
+```
+
+To inspect: `ssh ovhcloudN sudo ufw status numbered`.
+
+### Cluster networking
+
+- **Pod CIDR**: `10.42.0.0/16` (default k3s)
+- **Service CIDR**: `10.43.0.0/16` (default k3s)
+- **Flannel backend**: WireGuard-native. Each node hosts a `flannel-wg` interface on UDP 51820 and tunnels pod traffic to peers. Verify: `ssh ovhcloudN ip -d link show flannel-wg`.
+
+### Traefik ingress flow
+
+```
+Cloudflare → node:80/443 (public)
+  → klipper-lb svclb-traefik DaemonSet pod (hostPort:80/443)
+  → Traefik Service (ClusterIP 10.43.245.127:80/443)
+  → Traefik Deployment pod (single replica)
+  → matches Ingress host rule (api.myhoneydue.com etc.)
+  → routes to backend Service (api / web / admin)
+  → backend Pod
+```
+
+The Traefik default also lives in `kube-system` and is managed by k3s's
+HelmChart. **No HelmChartConfig override is applied on OVH** (unlike Hetzner
+— see §10).
+
+---
+
+## 4. DNS configuration (Cloudflare)
+
+The `myhoneydue.com` zone in Cloudflare has these public records. **All
+hostnames are proxied (orange cloud)** — required by the `cloudflare-only`
+Traefik middleware which 403s any non-CF source IP.
+
+| Host | Type | Values | Proxy |
+|---|---|---|---|
+| `api.myhoneydue.com` | A × 3 | `51.81.83.33`, `51.81.87.86`, `51.81.85.248` | Proxied |
+| `app.myhoneydue.com` | A × 3 | (same trio) | Proxied |
+| `admin.myhoneydue.com` | A × 3 | (same trio) | Proxied |
+| `myhoneydue.com` (apex `@`) | A × 3 | (same trio) | Proxied |
+
+Cloudflare round-robins among the 3 origins, klipper-lb on whichever node CF
+hits forwards to Traefik, and Traefik routes by Host header. Per-request,
+effectively load-balanced across the 3 nodes for ingress, with no central LB.
+
+**SSL/TLS mode**: Flexible (CF terminates TLS at the edge; origin is plain
+HTTP on `:80`). Upgrading to Full (strict) is on the deferred list — would
+need an origin certificate provisioned to `cloudflare-origin-cert` secret and
+Traefik configured for TLS termination.
+
+---
+
+## 5. Filesystem layout (`deploy-k3s/`)
+
+```
+deploy-k3s/
+├── config.yaml                 # Single config source (gitignored; contains tokens)
+├── config.yaml.example         # Template
+├── kubeconfig                  # OVH admin kubeconfig (gitignored, 0600)
+├── kubeconfig.hetzner.bak      # Old Hetzner kubeconfig (unreachable, kept for history)
+├── kubeconfig.tunnel           # Optional: localhost-pointing copy for SSH-tunnel use
+├── secrets/
+│   ├── README.md
+│   ├── postgres_password.txt   # Neon DB password
+│   ├── secret_key.txt          # 32+ char app-token signing secret
+│   ├── email_host_password.txt # Fastmail SMTP app password
+│   ├── fcm_server_key.txt      # FCM server key (currently unused — Android push disabled)
+│   ├── apns_auth_key.p8        # APNs auth key (binary)
+│   ├── cloudflare-origin.crt   # Origin certificate (currently unused — CF Flexible)
+│   └── cloudflare-origin.key
+│   (all gitignored except README.md)
+├── manifests/
+│   ├── namespace.yaml
+│   ├── network-policies.yaml   # default-deny + per-app egress/ingress (13 NetPols total)
+│   ├── rbac.yaml               # api/worker/admin/web/redis ServiceAccounts (NOT applied by 03-deploy.sh; manual once)
+│   ├── pod-disruption-budgets.yaml  # api-pdb, web-pdb, worker-pdb (NOT applied by 03-deploy.sh; manual once)
+│   ├── traefik-helmchartconfig.yaml # Hetzner-only DaemonSet+hostNetwork override (do NOT apply on OVH; we use default klipper-lb)
+│   ├── kyverno-verify-images.yaml   # Operator-gated policy (do NOT apply blindly — see file comment)
+│   ├── api/{deployment,service,hpa}.yaml
+│   ├── worker/deployment.yaml
+│   ├── admin/{deployment,service}.yaml
+│   ├── web/{deployment,service}.yaml
+│   ├── redis/{deployment,service,pvc}.yaml
+│   ├── ingress/{middleware,ingress-simple}.yaml
+│   ├── migrate/job.yaml        # goose migration Job (image-subbed at deploy time)
+│   ├── observability/{kube-state-metrics,vmagent,alloy-logs}.yaml
+│   └── kratos/                 # Ory Kratos identity service (NOT yet deployed; gated on operator OIDC setup)
+└── scripts/
+    ├── _config.sh              # Sourced by all scripts: cfg(), generate_env(), generate_cluster_config()
+    ├── 01-provision-cluster.sh # Hetzner-Cloud-specific (uses hetzner-k3s CLI) — DO NOT RUN ON OVH
+    ├── 02-setup-secrets.sh     # Creates honeydue-secrets etc. from secrets/ + config.yaml; kubeconfig-driven
+    ├── 03-deploy.sh            # Build + push + apply manifests + roll deployments; kubeconfig-driven
+    ├── 04-verify.sh            # Post-deploy health + security checks; kubeconfig-driven
+    └── rollback.sh             # `kubectl rollout undo` across all deployments
+```
+
+The `deploy/prod.env` file (sibling to `deploy-k3s/`, gitignored) holds
+observability + admin credentials that `02-setup-secrets.sh` and
+`03-deploy.sh` read but never display:
+
+```
+OBS_INGEST_URL       (https://obs.88oakapps.com/api/v1/write)
+OBS_TRACES_URL       (https://obs.88oakapps.com/v1/traces)
+OBS_INGEST_TOKEN     (bearer token for VM + Loki + traces — all use same token)
+GRAFANA_URL          (https://grafana.88oakapps.com)
+GRAFANA_ADMIN_USER   (admin)
+GRAFANA_ADMIN_PASSWORD
+ADMIN_EMAIL / ADMIN_PASSWORD (in-app admin login)
+```
+
+---
+
+## 6. Install from clean boxes — the truthful procedure
+
+This is what we ran on 2026-06-03 to stand up the live cluster, exactly. If
+you ever rebuild from zero this is the canonical sequence. Total wall-clock:
+~12 min for cluster bootstrap; ~10 min for workloads.
+
+### 6.1 Prerequisites
+
+- 3 fresh Ubuntu VPS instances (any provider with public IPv4, ≥4 GB RAM,
+  ≥40 GB disk)
+- `~/.ssh/config` entries (`ovhcloud1/2/3`) pointing at them, with
+  passwordless sudo
+- Local `kubectl` and `curl`
+- The repo's `deploy-k3s/secrets/` populated (or the ability to copy live
+  secrets from another running cluster — see §7.2)
+- `deploy/prod.env` populated with obs token + Grafana creds
+
+### 6.2 Per-node OS hardening + firewall (all 3 in parallel)
+
+For each `ovhcloudN`, over SSH:
+
+```sh
+export DEBIAN_FRONTEND=noninteractive
+sudo apt-get update -qq
+sudo apt-get install -y -qq fail2ban unattended-upgrades open-iscsi curl ufw
+sudo systemctl enable --now iscsid fail2ban
+sudo dpkg-reconfigure -f noninteractive -plow unattended-upgrades
+
+sudo ufw --force reset
+sudo ufw default deny incoming
+sudo ufw default allow outgoing
+sudo ufw allow 22/tcp
+sudo ufw allow 80/tcp
+sudo ufw allow 443/tcp
+sudo ufw allow 6443/tcp
+SELF=$(hostname -I | awk '{print $1}')
+for peer in 51.81.83.33 51.81.87.86 51.81.85.248; do
+  [ "$peer" = "$SELF" ] && continue
+  sudo ufw allow from "$peer" to any port 2379:2380 proto tcp
+  sudo ufw allow from "$peer" to any port 10250        proto tcp
+  sudo ufw allow from "$peer" to any port 51820        proto udp
+  sudo ufw allow from "$peer" to any port 8472         proto udp
+done
+sudo ufw --force enable
+```
+
+**Watch ordering:** `allow 22/tcp` MUST precede `ufw enable`. Existing SSH
+sessions survive (`ufw` only affects new connections), but a misordered script
+locks you out of fresh logins.
+
+### 6.3 Install k3s on `ovhcloud1` (the init node)
+
+```sh
+ssh ovhcloud1 'curl -sfL https://get.k3s.io | \
+  INSTALL_K3S_VERSION=v1.34.6+k3s1 \
+  sh -s - server \
+    --cluster-init \
+    --node-ip=51.81.83.33 \
+    --node-external-ip=51.81.83.33 \
+    --advertise-address=51.81.83.33 \
+    --flannel-backend=wireguard-native \
+    --flannel-external-ip \
+    --secrets-encryption \
+    --write-kubeconfig-mode=0600 \
+    --tls-san=51.81.83.33 \
+    --tls-san=51.81.87.86 \
+    --tls-san=51.81.85.248 \
+    --disable-cloud-controller'
+```
+
+Wait for `sudo k3s kubectl get nodes` to show this node Ready (~2-5 s).
+Read the cluster token:
+
+```sh
+ssh ovhcloud1 'sudo cat /var/lib/rancher/k3s/server/node-token'
+```
+
+### 6.4 Join `ovhcloud2`, then `ovhcloud3` (sequential)
+
+Joining etcd one node at a time avoids split-brain on slow networks.
+Replace `<TOKEN>` with the value from 6.3.
+
+For `ovhcloud2`:
+
+```sh
+ssh ovhcloud2 'curl -sfL https://get.k3s.io | \
+  INSTALL_K3S_VERSION=v1.34.6+k3s1 \
+  K3S_TOKEN=<TOKEN> \
+  sh -s - server \
+    --server=https://51.81.83.33:6443 \
+    --node-ip=51.81.87.86 \
+    --node-external-ip=51.81.87.86 \
+    --advertise-address=51.81.87.86 \
+    --flannel-backend=wireguard-native \
+    --flannel-external-ip \
+    --secrets-encryption \
+    --write-kubeconfig-mode=0600 \
+    --tls-san=51.81.83.33 --tls-san=51.81.87.86 --tls-san=51.81.85.248 \
+    --disable-cloud-controller'
+```
+
+Then identical for `ovhcloud3`, replacing `51.81.87.86` with `51.81.85.248` in
+all three of `--node-ip`, `--node-external-ip`, and `--advertise-address`
+(leave the `--tls-san` list unchanged). After each join, wait for
+`kubectl get nodes` to show the new node Ready before proceeding.
+
+### 6.5 Pull kubeconfig to the operator workstation
+
+```sh
+ssh ovhcloud1 'sudo cat /etc/rancher/k3s/k3s.yaml' \
+  | sed 's|server: https://127.0.0.1:6443|server: https://51.81.83.33:6443|' \
+  > deploy-k3s/kubeconfig
+chmod 600 deploy-k3s/kubeconfig
+export KUBECONFIG=$(pwd)/deploy-k3s/kubeconfig
+kubectl get nodes -o wide       # All 3 Ready, INTERNAL-IP = public IP
+```
+
+### 6.6 Label the redis node
+
+```sh
+kubectl label node vps-1624d691 honeydue/redis=true --overwrite
+```
+
+(Use whichever k8s node name corresponds to `ovhcloud1`. The Redis
+Deployment's `nodeSelector` binds to this label.)
+
+### 6.7 Bootstrap manifests NOT applied by `03-deploy.sh`
+
+These must be applied manually on a fresh cluster, **before** running
+`03-deploy.sh`, or workloads will fail to schedule:
+
+```sh
+kubectl apply -f deploy-k3s/manifests/rbac.yaml
+kubectl apply -f deploy-k3s/manifests/pod-disruption-budgets.yaml
+```
+
+`rbac.yaml` creates the 5 ServiceAccounts (`api`, `worker`, `admin`, `web`,
+`redis`) referenced by the Deployment manifests. Without these, ReplicaSets
+hang on `FailedCreate: error looking up service account` and pods never
+start. Symptom on first deploy: `kubectl get deploy` shows `0 up-to-date`
+across the board with no pod activity — see §9 *Gotchas*.
+
+**Do NOT apply** `traefik-helmchartconfig.yaml` (Hetzner-only — see §10) or
+`kyverno-verify-images.yaml` (gated on operator Kyverno install).
+
+### 6.8 Seed secrets
+
+Two paths; pick whichever fits your situation:
+
+**Path A — clean install from local files** (the original design):
+
+```sh
+KUBECONFIG=$(pwd)/deploy-k3s/kubeconfig ./deploy-k3s/scripts/02-setup-secrets.sh
+```
+
+Requires `deploy-k3s/secrets/` to contain real `postgres_password.txt`,
+`secret_key.txt`, `email_host_password.txt`, `fcm_server_key.txt`,
+`apns_auth_key.p8`, `cloudflare-origin.crt`, `cloudflare-origin.key`. The
+script reads `config.yaml` for `registry.*`, `redis.password`,
+`admin.basic_auth_*`, and `storage.b2_*`.
+
+**Path B — clone live secrets from another running cluster** (what we
+actually did during the migration; useful if `secrets/` is empty or you want
+exact-byte equivalence):
+
+```sh
+HETZNER=$(pwd)/deploy-k3s/kubeconfig.hetzner.bak   # or any kubeconfig with the secrets
+OVH=$(pwd)/deploy-k3s/kubeconfig
+kubectl --kubeconfig=$OVH apply -f deploy-k3s/manifests/namespace.yaml
+for S in honeydue-secrets honeydue-apns-key gitea-credentials cloudflare-origin-cert admin-basic-auth; do
+  kubectl --kubeconfig=$HETZNER -n honeydue get secret $S -o json \
+    | python3 -c "
+import json, sys
+d = json.load(sys.stdin)
+m = d['metadata']
+for k in ('uid','resourceVersion','creationTimestamp','generation','managedFields','ownerReferences','selfLink'):
+    m.pop(k, None)
+m.pop('annotations', None)
+print(json.dumps(d))" \
+    | kubectl --kubeconfig=$OVH apply -f -
+done
+```
+
+After either path, verify:
+
+```sh
+kubectl -n honeydue get secrets
+# Expect: admin-basic-auth, cloudflare-origin-cert, gitea-credentials,
+#         honeydue-apns-key, honeydue-secrets
+```
+
+### 6.9 Deploy workloads
+
+```sh
+KUBECONFIG=$(pwd)/deploy-k3s/kubeconfig \
+  ./deploy-k3s/scripts/03-deploy.sh --skip-build --tag latest
+```
+
+- `--skip-build` skips Docker build + push, deploys whatever's already in the
+  registry at the named tag. Use this when migrating between clusters to
+  guarantee both run identical bits.
+- Without flags it builds the api / worker / admin / web images from the
+  local repo HEAD and pushes to `gitea.treytartt.com` first.
+- The script applies (in order): namespace, network-policies (13 of them),
+  redis, ingress, then runs the goose migration Job (blocking on success),
+  then api / worker / admin / web Deployments, then observability
+  (kube-state-metrics, vmagent, alloy-logs).
+- It does NOT apply: `rbac.yaml`, `pod-disruption-budgets.yaml`,
+  `traefik-helmchartconfig.yaml`, `kyverno-verify-images.yaml`. The first
+  two must be applied manually (see §6.7); the latter two are Hetzner-only
+  or operator-gated.
+- It does NOT apply: anything under `kratos/` (skipped until
+  `kratos-secrets` exists, which requires real OIDC client IDs).
+
+### 6.10 Verify
+
+```sh
+KUBECONFIG=$(pwd)/deploy-k3s/kubeconfig ./deploy-k3s/scripts/04-verify.sh
+```
+
+Expect: all deployments `READY=desired`, 13 NetworkPolicies, 7 ServiceAccounts
+(api, worker, admin, web, redis, vmagent, alloy-logs), 3 PDBs, cloudflare-only
+middleware present, in-cluster `/api/health/` returns 200.
+
+External smoke test (DNS-aware, but the api `/api/health/` route is exempt
+from the cloudflare-only middleware, so direct-IP requests work for
+diagnostics):
+
+```sh
+for IP in 51.81.83.33 51.81.87.86 51.81.85.248; do
+  curl -s -o /dev/null -w "$IP -> %{http_code}\n" \
+    -H 'Host: api.myhoneydue.com' http://$IP/api/health/
+done
+# All three should return 200.
+```
+
+### 6.11 DNS cutover (if migrating)
+
+In the Cloudflare dashboard for `myhoneydue.com`, set the 4 hostnames in §4 to
+the OVH IPs and keep proxied. Effective propagation ~30 s to 5 min through
+the Cloudflare proxy.
+
+If you have a previous cluster, **scale its worker to 0 before flipping** to
+avoid scheduled-job double-fires:
+
+```sh
+KUBECONFIG=<previous>    kubectl -n honeydue scale deploy/worker --replicas=0
+# (cut DNS)
+KUBECONFIG=<new>         kubectl -n honeydue scale deploy/worker --replicas=1
+```
+
+Run the last two steps (the DNS cut, then the scale-up on the new cluster)
+back-to-back. Worker work is mostly scheduled (hourly+), so a brief gap is
+harmless; overlap would cause duplicate emails.
+
+---
+
+## 7. Day-to-day operations
+
+### Common kubectl one-liners
+
+```sh
+export KUBECONFIG=$(pwd)/deploy-k3s/kubeconfig
+
+# Cluster state
+kubectl get nodes -o wide
+kubectl -n honeydue get pods
+kubectl -n honeydue get deploy
+kubectl top nodes
+kubectl -n honeydue top pods
+
+# Tail logs
+kubectl -n honeydue logs deploy/api -f --tail=50
+kubectl -n honeydue logs -l app.kubernetes.io/name=api -f --tail=20
+stern -n honeydue api               # if stern is installed (multi-pod)
+
+# Restart a deployment (no image change, picks up ConfigMap changes)
+kubectl -n honeydue rollout restart deploy/api
+
+# Rollback one revision
+kubectl -n honeydue rollout undo deploy/api
+
+# Scale the api (the worker MUST stay at 0 or 1 replicas — see §9.7)
+kubectl -n honeydue scale deploy/api --replicas=4
+
+# Get into a pod
+kubectl -n honeydue exec -it deploy/api -- sh
+```
+
+### Redeploy after code changes
+
+```sh
+KUBECONFIG=$(pwd)/deploy-k3s/kubeconfig ./deploy-k3s/scripts/03-deploy.sh
+```
+
+Builds images from local HEAD, tags with the git short SHA, pushes to Gitea,
+runs `goose up` (idempotent), rolls api/worker/admin/web. Total: ~3-5 min
+when images change.
+
+To deploy without rebuilding (pin to a specific tag):
+
+```sh
+./deploy-k3s/scripts/03-deploy.sh --skip-build --tag <tag>   # e.g. a git short SHA, or latest
+```
+
+### Migrations
+
+Goose migrations live in `migrations/`. New file pattern:
+
+```
+make migrate-new name=add_foo_column     # generates migrations/YYYYMMDDHHMMSS_add_foo_column.sql
+# Edit the file with -- +goose Up / -- +goose Down sections
+```
+
+`03-deploy.sh` runs a one-shot Job (`manifests/migrate/job.yaml`) that
+executes `goose up` against Neon (direct compute endpoint, not pooler — see
+file comment). The Job blocks api/worker rollout and aborts the deploy on
+failure. No app pod runs `AutoMigrate`; api/worker startup verifies
+`goose_db_version` is current and refuses to boot on mismatch.
+
+### Grafana
+
+URL: https://grafana.88oakapps.com (creds in `deploy/prod.env`)
+
+Three dashboards in the `honeyDue` folder:
+
+| UID | Title | Use |
+|---|---|---|
+| `honeydue-eli5-overview` | honeyDue — Overview (ELI5) | Single-screen at-a-glance health: pods up, crashes, errors, RPS, latency, Postgres, memory, top endpoints, push failures, worker activity, recent error logs. Created 2026-06-03. |
+| `honeydue-red` | honeyDue API — RED | Rate/Errors/Duration cuts (legacy) |
+| `honeydue-logs` | honeyDue — Production Logs | Live log explorer |
+
+For the ELI5 dashboard's queries, **api-side metrics use `service="api"`,
+NOT `namespace="honeydue"`.** vmagent's scrape config drops the namespace
+label from api metrics — only `service`, `pod`, `node`, `job`, plus the
+metric's own labels (route, method, status, etc.) survive. Queries that
+filter on `namespace="honeydue"` for api metrics silently match nothing.
+
+### kubectl tunnel (if 6443 is firewalled to your IP)
+
+Currently `6443` is open WAN-side (matching the previous Hetzner posture).
+If you tighten that to operator-IPs-only and your IP changes, use an SSH
+tunnel:
+
+```sh
+ssh -fN -o ExitOnForwardFailure=yes -o ServerAliveInterval=30 \
+    -i ~/.ssh/ovhcloud \
+    -L 127.0.0.1:6443:127.0.0.1:6443 \
+    ubuntu@51.81.83.33
+
+cp deploy-k3s/kubeconfig deploy-k3s/kubeconfig.tunnel
+sed -i.bak 's|https://51.81.83.33:6443|https://127.0.0.1:6443|' deploy-k3s/kubeconfig.tunnel
+export KUBECONFIG="$(pwd)/deploy-k3s/kubeconfig.tunnel"
+```
+
+---
+
+## 8. Disaster recovery
+
+### "I lost the kubeconfig"
+
+```sh
+ssh ovhcloud1 'sudo cat /etc/rancher/k3s/k3s.yaml' \
+  | sed 's|server: https://127.0.0.1:6443|server: https://51.81.83.33:6443|' \
+  > deploy-k3s/kubeconfig
+chmod 600 deploy-k3s/kubeconfig
+```
+
+If `ovhcloud1` is down but `ovhcloud2` or `3` is up, swap host and IP — the
+TLS SAN covers all three.
+
+### "A node is unresponsive"
+
+```sh
+kubectl drain vps-XXX --ignore-daemonsets --delete-emptydir-data
+# Reboot via OVH manager or:
+ssh ovhcloudN sudo reboot
+# Wait for Ready, then:
+kubectl uncordon vps-XXX
+```
+
+The cluster tolerates 1 node down (etcd quorum 2/3). With 2 down, etcd
+loses quorum and the API server stops accepting writes.
+
+### "etcd quorum lost (2+ nodes dead)"
+
+Bring nodes back online if possible. If not:
+
+```sh
+ssh ovhcloud1 'sudo k3s server --cluster-reset --cluster-reset-restore-path=/var/lib/rancher/k3s/server/db/snapshots/<latest>'
+```
+
+k3s takes automatic etcd snapshots every 12h, keeping 5. List with:
+
+```sh
+ssh ovhcloud1 sudo ls -la /var/lib/rancher/k3s/server/db/snapshots/
+```
+
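+The cluster reset is destructive — workload state since the snapshot is
+lost, but Neon (actual app data) is unaffected. Stop the k3s service on the
+node (`sudo systemctl stop k3s`) before running the reset command.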
+
+### "I have to rebuild the whole cluster from scratch"
+
+Provision 3 fresh boxes, then follow exactly the sequence in §6. End-to-end
+is ~30 min. The dependencies that make this possible:
+
+| Stays put through rebuild | Where |
+|---|---|
+| Application data | Neon Postgres (managed) |
+| User uploads | Backblaze B2 (managed) |
+| Container images | `gitea.treytartt.com` (self-hosted, but not on the OVH cluster) |
+| Operator secrets | `deploy-k3s/secrets/` + `config.yaml` + `deploy/prod.env` on the operator workstation (gitignored) |
+| DNS | Cloudflare control panel |
+
+If `gitea.treytartt.com` were hosted on the same OVH cluster, you would have
+a circular dependency — rebuilding requires images you can't pull until the
+cluster is up. Currently Gitea is NOT in the honeyDue cluster (separate
+Hetzner-era host), so this isn't a problem today, but it is worth flagging
+if that ever changes.
+
+### "Cutover back to Hetzner / failover to a backup cluster"
+
+There is **no warm standby today.** Bringing up a second cluster is the
+same §6 procedure on different hardware, then a Cloudflare DNS swap. The
+worker-swap dance is critical:
+
+```sh
+KUBECONFIG=<current>  kubectl -n honeydue scale deploy/worker --replicas=0
+# (Update Cloudflare DNS to new cluster's IPs — proxied)
+KUBECONFIG=<new>      kubectl -n honeydue scale deploy/worker --replicas=1
+```
+
+---
+
+## 9. Known gotchas
+
+### 9.1 First-deploy "0 up-to-date" across all Deployments
+
+**Symptoms:** `kubectl get deploy` shows `READY 0/N, UP-TO-DATE 0` for
+api/worker/admin/web/redis. `kubectl get events` shows
+`FailedCreate: error looking up service account honeydue/<name>: serviceaccount "..." not found`.
+
+**Cause:** `rbac.yaml` (ServiceAccounts) is NOT applied by `03-deploy.sh`. On
+a fresh cluster the SAs don't exist; the ReplicaSet controller can't create
+pods.
+
+**Fix:**
+
+```sh
+kubectl apply -f deploy-k3s/manifests/rbac.yaml
+kubectl -n honeydue rollout restart deploy/api deploy/worker deploy/admin deploy/web deploy/redis
+```
+
+This was hit during the 2026-06-03 OVH bootstrap. The permanent fix is to add
+`kubectl apply -f rbac.yaml` to `03-deploy.sh` between the namespace and
+network-policies apply; until that lands, follow §6.7 on every fresh
+cluster.
+
+### 9.2 vmagent SD broken on fresh deploy ("0 pods up" in Grafana)
+
+**Symptoms:**
+- Grafana panels using `kube_*` metrics or `up{job=...}` show 0
+- vmagent logs: `dial tcp 10.43.0.1:443: connect: connection refused` every ~30 s
+- Direct test from a pod also refused
+
+**Cause:** k3s's NetworkPolicy controller evaluates egress rules *after*
+kube-proxy's DNAT (not before, contrary to spec). Pod-to-`kubernetes`-Service
+(`10.43.0.1:443`) gets DNAT'd to `<node_ip>:6443`, *then* the policy check
+runs. Without an explicit egress rule for `:6443`, the packet is rejected.
+
+The `allow-egress-from-vmagent` NetPol in `network-policies.yaml` includes
+both rules:
+
+```yaml
+- to:
+    - ipBlock: { cidr: 10.43.0.0/16 }
+  ports:
+    - { port: 443, protocol: TCP }
+- to:
+    - ipBlock:
+        cidr: 0.0.0.0/0
+        except: [10.42.0.0/16]
+  ports:
+    - { port: 6443, protocol: TCP }
+```
+
+**If this happens:** confirm `network-policies.yaml` was applied:
+
+```sh
+kubectl -n honeydue get netpol allow-egress-from-vmagent -o yaml | grep -A 5 6443
+```
+
+Corroborating evidence for this diagnosis: `kube-state-metrics` in
+`kube-system` works fine (no NetPols in that namespace).
+
+### 9.3 vmagent appears healthy but no data in Grafana
+
+vmagent's `/-/healthy` returns 200 as long as the process is alive and
+remote-write is TCP-functional. It doesn't check that scrapes are actually
+*succeeding*. The liveness probe in `vmagent.yaml` queries `/api/v1/targets`
+and fails the pod if no target is `up`. After ~3 failures (~3 min), kubelet
+recycles it.
+
+If vmagent runs for weeks but Grafana is empty, the probe was disabled or
+the exec command broke.
+
+### 9.4 vmagent bearer token destroyed by direct `kubectl apply`
+
+The committed `vmagent.yaml` has `bearer_token: TOKEN_PLACEHOLDER`. The real
+token is `sed`-substituted at deploy time by `03-deploy.sh`. Applying the
+file directly:
+
+```sh
+kubectl apply -f deploy-k3s/manifests/observability/vmagent.yaml   # WRONG
+```
+
+overwrites the Secret with the literal `TOKEN_PLACEHOLDER`, and every
+remote-write then fails with 401. To restore without a full redeploy:
+
+```sh
+OBS_TOKEN_B64=$(kubectl -n honeydue get secret honeydue-secrets \
+                  -o jsonpath='{.data.OBS_INGEST_TOKEN}')
+kubectl -n honeydue patch secret vmagent-remote-write --type=json \
+  -p="[{\"op\":\"replace\",\"path\":\"/data/bearer_token\",\"value\":\"${OBS_TOKEN_B64}\"}]"
+kubectl -n honeydue rollout restart deploy/vmagent
+```
+
+Or just re-run `./deploy-k3s/scripts/03-deploy.sh` — the sed handles it.
+
+### 9.5 Dashboard queries: api metrics need `service="api"` not `namespace="honeydue"`
+
+vmagent's scrape config (`vmagent-config` ConfigMap) explicitly chooses which
+Kubernetes pod-metadata labels to copy onto each scraped series. **Namespace
+isn't one of them.** Labels you can use on api-side metrics:
+
+- `service` (literal `"api"`)
+- `job` (literal `"api"`)
+- `pod` (the api pod name)
+- `node` (the k8s node name)
+- `cluster` (vmagent external_label, currently `"honeydue-k3s"`)
+- `environment` (vmagent external_label, currently `"prod"`)
+- Plus each metric's own labels (`method`, `route`, `status` for HTTP; etc.)
+
+`kube_*` metrics from kube-state-metrics DO carry `namespace` natively
+(KSM publishes it as a label, vmagent passes it through). Loki streams have
+`namespace` because alloy-logs explicitly relabels it. So the rule is:
+
+| Metric prefix | Use |
+|---|---|
+| `kube_*` | `namespace="honeydue"` |
+| `http_*`, `gorm_*`, `go_*`, `process_*` (api) | `service="api"` |
+| Loki logs `{...}` | `namespace="honeydue"` |
+
+### 9.6 Cluster-label collision when two clusters run together
+
+Both Hetzner and OVH vmagents push as `cluster=honeydue-k3s, environment=prod`
+(same external_labels). During the migration overlap this made dashboards
+sum both clusters' data. The simplest narrowing during overlap is by node
+name pattern (`node=~"vps-.*"` for OVH, `node=~"ubuntu-.*"` for Hetzner). If
+you ever bring up a backup cluster long-term, change one cluster's
+`external_labels.cluster` to something distinct (e.g. `honeydue-ovh`
+vs. `honeydue-backup`).
+
+### 9.7 Worker double-firing scheduled jobs
+
+If two `worker` Deployments run concurrently (e.g. two clusters both pointing
+at the same Neon DB), Asynq schedulers each fire crons independently — users
+get duplicate emails. Workaround: scale all-but-one worker to 0. This is the
+exact mechanic used during cutovers (§6.11).
+
+### 9.8 Node kubeconfig mode
+
+`/etc/rancher/k3s/k3s.yaml` on each node is mode `0600` because we install
+with `--write-kubeconfig-mode=0600`. Tightening from k3s default (0644) was
+intentional. Don't change without coordinating — any tooling on the node
+that expects to read it (none today) will break.
+
+---
+
+## 10. Differences from MIGRATION_NOTES.md (Hetzner-era)
+
+`MIGRATION_NOTES.md` documents the Swarm → k3s migration on Hetzner
+(2026-04-24). Most of it still applies, with these OVH-specific deltas:
+
+| What MIGRATION_NOTES says | What OVH actually has |
+|---|---|
+| `hetzner-k3s` provisioner | Manual k3s install (§6) |
+| Hetzner Load Balancer (not used) → Cloudflare round-robin | Same — Cloudflare round-robin (§4) |
+| Traefik as DaemonSet + hostNetwork via HelmChartConfig | Traefik default Deployment + klipper-lb svclb DaemonSet. The `traefik-helmchartconfig.yaml` file is **NOT applied** on OVH. |
+| `servicelb` disabled (`--disable=servicelb`) | `servicelb` enabled (we didn't pass `--disable=servicelb`). This is what makes klipper-lb work. |
+| sysctl `net.ipv4.ip_unprivileged_port_start=0` for hostNetwork Traefik | Not needed — klipper-lb proxies the port binding instead |
+| UFW rules between 3 Hetzner IPs | UFW rules between 3 OVH IPs (51.81.83.33, 51.81.87.86, 51.81.85.248) |
+| Kubeconfig at `~/.kube/honeydue-k3s.yaml` | Kubeconfig at `deploy-k3s/kubeconfig` |
+| TLS at origin: not configured (CF Flexible) | Same — CF Flexible. `cloudflare-origin-cert` Secret exists (carried over) but Ingress doesn't reference it. |
+
+---
+
+## 11. Outstanding follow-ups (deferred, not blocking)
+
+1. **No warm standby / rollback cluster.** OVH is solo production. An OVH
+   outage is a real outage; recovery time = §6 procedure (~30 min). User
+   plans to bring a second cluster up as a target.
+2. **UFW allows 80/443 from world.** Hetzner had a network-layer Cloudflare-IP
+   allowlist on these ports. OVH currently relies on the L7
+   `cloudflare-only` Traefik middleware, which protects admin but NOT api /
+   web / apex (those routes have to be reachable from anywhere, but they're
+   then trivially DDoSable bypassing Cloudflare). Fix: add ufw allow rules
+   restricting `80/tcp` and `443/tcp` to Cloudflare's published IP ranges
+   (~22 IPv4 prefixes from https://www.cloudflare.com/ips-v4/).
+3. **Cloudflare TLS Flexible → Full(strict).** Origin certs exist as Secret
+   but Ingress doesn't terminate TLS. Upgrading to Full(strict) requires
+   Traefik configured with the cert + an HTTPS entrypoint + Ingress
+   `tls:` block.
+4. **`rbac.yaml` + `pod-disruption-budgets.yaml` should be in `03-deploy.sh`.**
+   They're currently bootstrap-only. Adding them is idempotent and prevents
+   the §9.1 footgun.
+5. **Push notification metrics are log-derived, not counters.** Successes
+   aren't logged or counted. Proper Prometheus instrumentation (~15 lines in
+   `internal/push/client.go`) would give a real success/failure ratio.
+6. **Worker has no `/metrics` endpoint.** `cmd/worker/main.go` serves `:6060`
+   for healthz only. Adding Asynq's `metrics.NewPrometheusExporter()` + a
+   ServiceMonitor + uncommenting the `worker` job stanza in
+   `vmagent-config` ConfigMap would give real queue depth and job latency.
+7. **Ory Kratos.** Manifests exist (`manifests/kratos/`) but the deploy
+   is gated on operator-side prerequisites (Neon `kratos` database,
+   `auth.myhoneydue.com` DNS, real Apple+Google OIDC clients, Kratos image
+   tag pinned). Until `kratos-secrets` exists, `03-deploy.sh` silently
+   skips the Kratos apply.
+8. **Hetzner cluster fully retired?** `config.yaml` `nodes:` block describes
+   OVH; the bak kubeconfig is at `kubeconfig.hetzner.bak`. Boxes themselves
+   are operator-managed.
+
+### 11.1 Dashboard observability gaps (raised 2026-06-03 during dashboard build)
+
+Surfaced while building the `honeydue-eli5-overview` Grafana dashboard. Each
+needs code or infra changes to expose; none blocks today's operations.
+
+9. **node-exporter not deployed.** No node-level metrics today
+   (`node_filesystem_avail_bytes`, `node_memory_*`, `node_load1`, etc.).
+   The dashboard's pod-level memory/CPU panels are app-process only — a
+   node running out of disk would silently fail the cluster before any
+   dashboard signal showed it. Highest-priority Tier-3 item. Fix: deploy
+   `node-exporter` as a DaemonSet (~50 lines of YAML), add a scrape stanza
+   to `vmagent-config`, add a `Node disk free` stat panel.
+10. **Traefik metrics not enabled.** Traefik can expose `/metrics` with
+    `traefik_entrypoint_requests_total` + `traefik_service_request_duration_seconds`,
+    giving edge-level visibility into requests that never reached api
+    pods (404s, redirects, middleware blocks). Enable via a
+    HelmChartConfig override that sets `metrics.prometheus.entryPoint=metrics`
+    + adds a `:9100` entryPoint + a scrape stanza. Skipped today to avoid
+    Traefik restart risk; safe additive change when ready.
+11. **Push notification success/failure counters** (already #5). Add
+    `prometheus.NewCounterVec` in `internal/push/client.go` with labels
+    `platform={ios,android}, outcome={success,failed,breaker_open,disabled}`.
+    Increments at every Send/SendActionable branch. Replaces the
+    log-derived "Push failures" stat on the dashboard with a real success
+    rate.
+12. **Worker queue / job metrics** (already #6). Asynq has a built-in
+    Prometheus exporter (`asynq/x/metrics`). Wire it into the worker's
+    `:6060` health server (a single `healthMux.Handle` line) and
+    uncomment the worker scrape stanza in `vmagent-config`. Surfaces
+    queue depth, retry count, processing time per task type.
+13. **Cache hit / miss rate.** `internal/services/cache_service.go` has
+    no counters. Add a Counter with labels `{operation=get|set, result=hit|miss}`
+    around the cache wrapper. ~10 lines. Useful once real traffic flows
+    to verify the ETag and Redis caches are paying their keep.
+14. **APNs send-latency histogram.** Wrap `internal/push/apns.go::Send`
+    in a `prometheus.NewHistogramVec` keyed on outcome. Tells you when
+    Apple's gateway is slow (which correlates with their incident page).
+
+---
+
+## 12. Audit trail
+
+| Date | Change |
+|---|---|
+| 2026-04-24 | Initial k3s cluster on Hetzner (Swarm → k3s migration) — see MIGRATION_NOTES.md |
+| 2026-04-25 | `config.yaml` reconstructed from live ConfigMap (original file lost) |
+| 2026-05-15 | Audit fixes: Redis auth required, admin basic auth, secrets-encryption flag |
+| 2026-05-16 | `02-setup-secrets.sh` started carrying B2 credentials (was a manifest/script drift) |
+| 2026-06-02 | Kratos scaffolding committed (not deployed) |
+| 2026-06-03 | **Hetzner → OVH BHS cutover.** New 3-node cluster on 51.81.83.33, .87.86, .85.248. DNS cut on Cloudflare. Hetzner kubeconfig moved to `.bak`. Grafana `honeydue-eli5-overview` dashboard created. Hetzner cluster powered off later same day. |
+| 2026-06-03 | Dashboard build-out: extended `honeydue-eli5-overview` to 22 panels covering Tier-1 (HTTP status, CPU per pod, goroutines, top slow) and Tier-2 (GC, network I/O, pod uptime, top 5xx) signals. Surfaced Tier-3 instrumentation gaps in §11.1. |
@@ -30,6 +30,7 @@ load_balancer_ip: ""
 domains:
  api: api.myhoneydue.com
  admin: admin.myhoneydue.com
+  app: app.myhoneydue.com                   # web client host — added to CORS_ALLOWED_ORIGINS
  base: myhoneydue.com

 # --- Container Registry (GHCR) ---
@@ -62,7 +63,7 @@ email:
 push:
  apns_key_id: ""
  apns_team_id: ""
-  apns_topic: com.tt.honeyDue
+  apns_topic: com.myhoneydue.honeyDue
  apns_production: true
  apns_use_sandbox: false

@@ -72,8 +73,13 @@ storage:
  b2_app_key: ""
  b2_bucket: ""
  b2_endpoint: ""                           # e.g. s3.us-west-004.backblazeb2.com
+  b2_region: ""                             # e.g. us-east-005
+  b2_use_ssl: true
  max_file_size: 10485760
  allowed_types: "image/jpeg,image/png,image/gif,image/webp,application/pdf"
+  upload_dir: /app/uploads                  # filesystem path inside the api container
+  base_url: /uploads                        # public URL prefix served by the api
+  static_dir: /app/static                   # static asset path inside the api container

 # --- Worker Schedules (UTC hours) ---
 worker:
@@ -100,8 +106,10 @@ admin:
  basic_auth_password: ""                   # HTTP basic auth password for admin panel

 # --- Apple Auth / IAP (optional, leave empty if unused) ---
+# client_id MUST equal the iOS Release bundle ID — Apple identity tokens
+# are rejected if the `aud` claim doesn't match.
 apple_auth:
-  client_id: ""
+  client_id: "com.myhoneydue.honeyDue"
  team_id: ""
  iap_key_id: ""
  iap_issuer_id: ""
@@ -23,8 +23,11 @@ spec:
        app.kubernetes.io/part-of: honeydue
    spec:
      serviceAccountName: admin
+      # Explicit pod-level opt-out (audit F11) — defense-in-depth on top of
+      # the ServiceAccount-level setting in rbac.yaml.
+      automountServiceAccountToken: false
      imagePullSecrets:
-        - name: ghcr-credentials
+        - name: gitea-credentials
      securityContext:
        runAsNonRoot: true
        runAsUser: 1001
@@ -35,6 +38,7 @@ spec:
      containers:
        - name: admin
          image: IMAGE_PLACEHOLDER  # Replaced by 03-deploy.sh
+          imagePullPolicy: IfNotPresent  # audit CODE-L4 — explicit; images are SHA/digest-pinned
          ports:
            - containerPort: 3000
              protocol: TCP
@@ -23,8 +23,11 @@ spec:
        app.kubernetes.io/part-of: honeydue
    spec:
      serviceAccountName: api
+      # Explicit pod-level opt-out (audit F11) — defense-in-depth on top of
+      # the ServiceAccount-level setting in rbac.yaml.
+      automountServiceAccountToken: false
      imagePullSecrets:
-        - name: ghcr-credentials
+        - name: gitea-credentials
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
@@ -35,6 +38,7 @@ spec:
      containers:
        - name: api
          image: IMAGE_PLACEHOLDER  # Replaced by 03-deploy.sh
+          imagePullPolicy: IfNotPresent  # audit CODE-L4 — explicit; images are SHA/digest-pinned
          ports:
            - containerPort: 8000
              protocol: TCP
@@ -46,34 +50,16 @@ spec:
          envFrom:
            - configMapRef:
                name: honeydue-config
-          env:
-            - name: POSTGRES_PASSWORD
-              valueFrom:
-                secretKeyRef:
-                  name: honeydue-secrets
-                  key: POSTGRES_PASSWORD
-            - name: SECRET_KEY
-              valueFrom:
-                secretKeyRef:
-                  name: honeydue-secrets
-                  key: SECRET_KEY
-            - name: EMAIL_HOST_PASSWORD
-              valueFrom:
-                secretKeyRef:
-                  name: honeydue-secrets
-                  key: EMAIL_HOST_PASSWORD
-            - name: FCM_SERVER_KEY
-              valueFrom:
-                secretKeyRef:
-                  name: honeydue-secrets
-                  key: FCM_SERVER_KEY
-            - name: REDIS_PASSWORD
-              valueFrom:
-                secretKeyRef:
-                  name: honeydue-secrets
-                  key: REDIS_PASSWORD
-                  optional: true
+          # Audit CODE-F8: secrets are NOT injected as environment variables.
+          # Env vars are readable for the life of the pod via /proc/<pid>/environ
+          # and leak into crash dumps / child processes. honeydue-secrets is
+          # mounted read-only at /etc/honeydue/secrets (mode 0400) and the Go
+          # config layer (config.loadFileSecrets) reads each key from its file.
+          # Non-secret config still arrives via the configMapRef above.
          volumeMounts:
+            - name: app-secrets
+              mountPath: /etc/honeydue/secrets
+              readOnly: true
            - name: apns-key
              mountPath: /secrets/apns
              readOnly: true
@@ -90,11 +76,12 @@ spec:
            httpGet:
              path: /api/health/
              port: 8000
-            # MigrateWithLock in cmd/api/main.go runs pg_advisory_lock on
-            # every startup. On a cold boot with 3 replicas, the first does
-            # AutoMigrate (~90s) and the others wait on the lock, so real
-            # startup runs 90–240s. 48 × 5s = 240s grace absorbs it without
-            # healthcheck killing a still-starting replica.
+            # Schema migrations run separately in the honeydue-migrate Job
+            # *before* this Deployment rolls — the api itself does not migrate
+            # (it only verifies goose_db_version at boot). Cold start still
+            # pays the DB pool warm-up + Redis connect + APNs/FCM client init
+            # before /api/health/ goes green. 48 × 5s = 240s grace keeps the
+            # probe from killing a still-starting replica.
            failureThreshold: 48
            periodSeconds: 5
          readinessProbe:
@@ -112,6 +99,12 @@ spec:
            periodSeconds: 30
            timeoutSeconds: 10
      volumes:
+        # Audit CODE-F8: the whole honeydue-secrets Secret, projected as files.
+        # defaultMode 0400 → readable only by the container's runAsUser (1000).
+        - name: app-secrets
+          secret:
+            secretName: honeydue-secrets
+            defaultMode: 0400
        - name: apns-key
          secret:
            secretName: honeydue-apns-key
@@ -0,0 +1,57 @@
+# B2 bucket lifecycle — `uploads/` prefix
+
+The `pending_uploads` cleanup worker (cron `30 * * * *`, see
+`internal/worker/jobs/handler.go::HandleUploadCleanup`) reaps unclaimed
+upload sessions every hour, deleting both the row and the corresponding B2
+object. This bucket-level lifecycle rule is a **backstop** — it catches B2
+objects that survive the row deletion (e.g. worker crashed mid-loop, B2
+delete errored, manual DB tampering).
+
+## Rule
+
+Apply via the Backblaze web console: **Bucket → `honeyDueProd` → Lifecycle Settings → Custom**
+
+```json
+[
+  {
+    "fileNamePrefix": "uploads/",
+    "daysFromUploadingToHiding": 7,
+    "daysFromHidingToDeleting": 1
+  }
+]
+```
+
+Effect: any object under the `uploads/` prefix is hidden 7 days after
+upload, then permanently deleted 1 day after that. Total maximum lifetime
+of an orphaned object: 8 days.
+
+This rule does NOT affect:
+
+- `images/`, `documents/`, `completions/` — legacy multipart-uploaded
+  objects, which are managed by the existing `task_completion_image` /
+  `document_image` / `document.file_url` references.
+
+## Why a backstop, not the primary mechanism
+
+The application worker is the primary mechanism because:
+
+1. It can delete the **DB row** alongside the B2 object — lifecycle alone
+   would leave dangling `pending_uploads` rows.
+2. It runs hourly vs. lifecycle's once-per-day evaluation — much tighter
+   recovery window for the common case.
+3. It produces logs / metrics for orphan rate observability.
+
+## Verification
+
+After applying:
+
+```bash
+b2 bucket get-info honeyDueProd | jq '.lifecycleRules'
+```
+
+Should show the rule above. If you don't have the B2 CLI:
+
+```bash
+curl -u "$B2_KEY_ID:$B2_APP_KEY" https://api.backblazeb2.com/b2api/v3/b2_authorize_account
+# Then use the returned authorization_token + apiUrl to call b2_get_bucket
+```
@@ -53,7 +53,12 @@ metadata:
  labels:
    app.kubernetes.io/part-of: honeydue
  annotations:
-    traefik.ingress.kubernetes.io/router.middlewares: honeydue-security-headers@kubernetescrd,honeydue-rate-limit@kubernetescrd
+    # cloudflare-only + admin-auth wired in (audit F2/F3/CODE-L6). Order
+    # matters: reject non-Cloudflare IPs, then basic auth, then headers,
+    # then rate limit. The admin-basic-auth secret is created by
+    # 02-setup-secrets.sh from config.yaml admin.basic_auth_* — that runs
+    # before 03-deploy.sh, so the middleware always has its secret.
+    traefik.ingress.kubernetes.io/router.middlewares: honeydue-cloudflare-only@kubernetescrd,honeydue-admin-auth@kubernetescrd,honeydue-security-headers@kubernetescrd,honeydue-rate-limit@kubernetescrd
 spec:
  ingressClassName: traefik
  tls:
@@ -98,3 +103,98 @@ spec:
                name: web
                port:
                  number: 3000
+---
+# Auth-endpoint Ingress (audit F10 / LIVE-L12). A dedicated Ingress for the
+# auth paths so Traefik gives their longer path-prefix routers a higher
+# priority than honeydue-api's "/" router — these paths then get
+# auth-rate-limit (5/min) instead of the general rate-limit (100/min).
+# Anything not matched here falls through to honeydue-api unchanged.
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: honeydue-api-auth
+  namespace: honeydue
+  labels:
+    app.kubernetes.io/part-of: honeydue
+  annotations:
+    traefik.ingress.kubernetes.io/router.middlewares: honeydue-auth-rate-limit@kubernetescrd,honeydue-security-headers@kubernetescrd
+spec:
+  ingressClassName: traefik
+  tls:
+    - hosts:
+        - api.myhoneydue.com
+      secretName: cloudflare-origin-cert
+  rules:
+    - host: api.myhoneydue.com
+      http:
+        paths:
+          - path: /api/auth/login
+            pathType: Prefix
+            backend:
+              service:
+                name: api
+                port:
+                  number: 8000
+          - path: /api/auth/register
+            pathType: Prefix
+            backend:
+              service:
+                name: api
+                port:
+                  number: 8000
+          - path: /api/auth/forgot-password
+            pathType: Prefix
+            backend:
+              service:
+                name: api
+                port:
+                  number: 8000
+          - path: /api/auth/reset-password
+            pathType: Prefix
+            backend:
+              service:
+                name: api
+                port:
+                  number: 8000
+          - path: /api/residences/join-with-code
+            pathType: Prefix
+            backend:
+              service:
+                name: api
+                port:
+                  number: 8000
+          - path: /api/auth/verify-reset-code
+            pathType: Prefix
+            backend:
+              service:
+                name: api
+                port:
+                  number: 8000
+          - path: /api/auth/apple-sign-in
+            pathType: Prefix
+            backend:
+              service:
+                name: api
+                port:
+                  number: 8000
+          - path: /api/auth/google-sign-in
+            pathType: Prefix
+            backend:
+              service:
+                name: api
+                port:
+                  number: 8000
+          - path: /api/auth/refresh
+            pathType: Prefix
+            backend:
+              service:
+                name: api
+                port:
+                  number: 8000
+          - path: /api/auth/account
+            pathType: Prefix
+            backend:
+              service:
+                name: api
+                port:
+                  number: 8000
@@ -1,54 +0,0 @@
-# API Ingress — Cloudflare-only + security headers + rate limiting
-apiVersion: networking.k8s.io/v1
-kind: Ingress
-metadata:
-  name: honeydue-api
-  namespace: honeydue
-  labels:
-    app.kubernetes.io/part-of: honeydue
-  annotations:
-    traefik.ingress.kubernetes.io/router.middlewares: honeydue-cloudflare-only@kubernetescrd,honeydue-security-headers@kubernetescrd,honeydue-rate-limit@kubernetescrd
-spec:
-  tls:
-    - hosts:
-        - api.myhoneydue.com
-      secretName: cloudflare-origin-cert
-  rules:
-    - host: api.myhoneydue.com
-      http:
-        paths:
-          - path: /
-            pathType: Prefix
-            backend:
-              service:
-                name: api
-                port:
-                  number: 8000
-
---
-# Admin Ingress — Cloudflare-only + security headers + rate limiting + basic auth
-apiVersion: networking.k8s.io/v1
-kind: Ingress
-metadata:
-  name: honeydue-admin
-  namespace: honeydue
-  labels:
-    app.kubernetes.io/part-of: honeydue
-  annotations:
-    traefik.ingress.kubernetes.io/router.middlewares: honeydue-cloudflare-only@kubernetescrd,honeydue-security-headers@kubernetescrd,honeydue-rate-limit@kubernetescrd,honeydue-admin-auth@kubernetescrd
-spec:
-  tls:
-    - hosts:
-        - admin.myhoneydue.com
-      secretName: cloudflare-origin-cert
-  rules:
-    - host: admin.myhoneydue.com
-      http:
-        paths:
-          - path: /
-            pathType: Prefix
-            backend:
-              service:
-                name: admin
-                port:
-                  number: 3000
@@ -21,12 +21,20 @@ spec:
  headers:
    frameDeny: true
    contentTypeNosniff: true
-    browserXssFilter: true
+    # browserXssFilter removed (audit L7): it emits the deprecated
+    # X-XSS-Protection header, which can itself introduce XSS in legacy
+    # browsers. Modern browsers ignore it.
    referrerPolicy: "strict-origin-when-cross-origin"
    customResponseHeaders:
      X-Content-Type-Options: "nosniff"
      X-Frame-Options: "DENY"
-      Strict-Transport-Security: "max-age=31536000; includeSubDomains"
+      # HSTS: 2-year max-age + preload (audit L5/CODE-L3). After this is
+      # live on api/admin/app, submit myhoneydue.com to hstspreload.org.
+      Strict-Transport-Security: "max-age=63072000; includeSubDomains; preload"
+      # Cross-origin isolation (audit F9). COEP (require-corp) is omitted —
+      # it commonly breaks third-party embeds; add only after testing.
+      Cross-Origin-Opener-Policy: "same-origin"
+      Cross-Origin-Resource-Policy: "same-origin"
      # Content-Security-Policy is intentionally NOT set here — the Go API
      # sets a CSP in internal/router/router.go that permits Google Fonts
      # for the landing page. Two CSP headers would intersect and break it.
@@ -83,3 +91,24 @@ spec:
  basicAuth:
    secret: admin-basic-auth
    realm: "honeyDue Admin"
+
+---
+# Strict rate limit for auth endpoints (audit F10 / LIVE-L12).
+# Applied via the honeydue-api-auth Ingress to every path it lists (login,
+# register, password reset, social sign-in, refresh, account, join-with-code).
+# depth: 2 keys the limiter on the real client IP, not the Cloudflare edge IP
+# (request path: client -> Cloudflare -> Traefik). This is the edge half;
+# the per-account lockout in the Go app is the robust half.
+apiVersion: traefik.io/v1alpha1
+kind: Middleware
+metadata:
+  name: auth-rate-limit
+  namespace: honeydue
+spec:
+  rateLimit:
+    average: 5
+    burst: 10
+    period: 1m
+    sourceCriterion:
+      ipStrategy:
+        depth: 2
@@ -0,0 +1,92 @@
+# Ory Kratos — honeyDue identity service (Phase 1: infrastructure)
+
+This directory deploys [Ory Kratos](https://www.ory.sh/kratos/) into the
+`honeydue` namespace as the identity provider — replacing the hand-rolled auth
+in `internal/services/auth_service.go` etc.
+
+**Phase 1 is infrastructure only.** Once deployed, Kratos runs but nothing uses
+it yet — the honeyDue Go API still does its own auth. Phase 2 (backend swap)
+and Phase 3 (KMP/web clients) follow. Migrating onto Kratos can lose all
+existing user data — honeyDue is pre-production, so no user import is done.
+
+The deploy is **gated**: `03-deploy.sh` applies Kratos only when the
+`kratos-secrets` Secret exists, and `02-setup-secrets.sh` creates that Secret
+only when `config.yaml` has a `kratos:` block. Until then the existing stack
+deploys completely unaffected.
+
+## Files
+
+| File | What |
+|---|---|
+| `configmap.yaml` | `kratos.yml`, identity schema, Google/Apple OIDC claim mappers (no secrets) |
+| `migrate-job.yaml` | `kratos migrate sql` — schema migration, run before the Deployment |
+| `kratos.yaml` | Deployment (×2), Service, NetworkPolicies |
+| `ingress.yaml` | `auth.myhoneydue.com` → Kratos public API :4433 |
+
+## Operator prerequisites (must be done before deploying)
+
+1. **Kratos version** — Ory uses CalVer (`v25.x` / `v26.x`). `kratos.yaml` and
+   `migrate-job.yaml` pin `oryd/kratos:vXX.Y@sha256:<digest>` (currently
+   `v26.2.0`); bump both together. `version:` in `configmap.yaml` is the config
+   schema version — change it only when the Kratos release notes require it.
+
+2. **Kratos database** — create a separate Neon database named `kratos` (do not
+   share honeyDue's). Capture its connection string as the DSN.
+
+3. **DNS** — add `auth.myhoneydue.com` in Cloudflare (proxied), pointing at the
+   cluster ingress like the other honeyDue hosts. Confirm the
+   `cloudflare-origin-cert` TLS secret covers `auth.myhoneydue.com`.
+
+4. **Google OAuth client** (deferred: `configmap.yaml` is Apple-only today) — Google Cloud Console → create an OAuth 2.0 client.
+   Redirect URI: `https://auth.myhoneydue.com/self-service/methods/oidc/callback/google`.
+   Put the **client ID** into `configmap.yaml` (`GOOGLE_OAUTH_CLIENT_ID`); the
+   **client secret** goes in `config.yaml`.
+
+5. **Apple Sign In** — Apple Developer → a Services ID + a Sign in with Apple
+   key. Return URL: `https://auth.myhoneydue.com/self-service/methods/oidc/callback/apple`.
+   Put the **Services ID / Team ID / Key ID** into `configmap.yaml`
+   (`APPLE_SERVICES_ID` / `APPLE_TEAM_ID` / `APPLE_PRIVATE_KEY_ID`); the **.p8
+   private key** goes in `config.yaml`.
+
+6. **`config.yaml`** — add a `kratos:` block:
+   ```yaml
+   kratos:
+     dsn: "postgres://USER:PASS@HOST/kratos?sslmode=require"
+     secrets_cookie: "<openssl rand -hex 16>"   # generate ONCE, keep stable
+     secrets_cipher: "<openssl rand -hex 16>"   # must be exactly 32 chars
+     smtp_connection_uri: "smtps://USER:PASS@smtp.fastmail.com:465/"
+     google_client_secret: "<from Google Cloud Console>"
+     apple_private_key: |
+       -----BEGIN PRIVATE KEY-----
+       ...
+       -----END PRIVATE KEY-----
+   ```
+   `secrets_cookie` / `secrets_cipher` must stay stable forever — rotating them
+   invalidates every session and makes encrypted data unreadable.
+
+## Deploy
+
+```bash
+cd honeyDueAPI-go
+export KUBECONFIG="$(pwd)/deploy-k3s/kubeconfig"
+./deploy-k3s/scripts/02-setup-secrets.sh   # creates kratos-secrets from config.yaml
+./deploy-k3s/scripts/03-deploy.sh          # applies kratos manifests, runs migrate, rolls
+```
+
+`03-deploy.sh` applies `configmap.yaml` → runs `migrate-job.yaml` → waits →
+applies `kratos.yaml` + `ingress.yaml`.
+
+## Verify
+
+- `kubectl -n honeydue get pods -l app.kubernetes.io/name=kratos` — 2 Deployment pods, each `1/1 Running` (the `Completed` kratos-migrate pod shares the label and is listed too)
+- `kubectl -n honeydue logs job/kratos-migrate` — migration succeeded
+- `curl https://auth.myhoneydue.com/health/ready` — `{"status":"ok"}`
+- `curl https://auth.myhoneydue.com/self-service/registration/api` — returns a flow
+
+## Not yet done (later phases)
+
+- **Phase 2** — honeyDue Go backend: swap `middleware/auth.go` for Kratos
+  session validation, drop the hand-rolled auth code, rebuild the `users`
+  table keyed on the Kratos identity ID.
+- **Phase 3** — KMP mobile + Next.js web clients point at Kratos flows.
+- Admin-panel auth stays on its own JWT (out of scope).
@@ -0,0 +1,232 @@
+# Ory Kratos configuration for honeyDue.
+#
+# Secrets are NOT in this ConfigMap. The DSN, cookie/cipher secrets, SMTP URI
+# and OIDC client secrets are injected as environment variables from the
+# kratos-secrets Secret (see kratos.yaml). Kratos is configured natively via
+# env vars, so this is the idiomatic split — only non-secret config here.
+#
+# OIDC scope: Apple-only as of 2026-06-03. Google is intentionally absent;
+# adding it later is additive — append a `- id: google` block under
+# selfservice.methods.oidc.config.providers (it becomes index 1) and bind a
+# matching CLIENT_SECRET env in kratos.yaml.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: kratos-config
+  namespace: honeydue
+  labels:
+    app.kubernetes.io/name: kratos
+    app.kubernetes.io/part-of: honeydue
+data:
+  kratos.yml: |
+    # `version` is the config schema version, NOT the image tag. The image is pinned
+    # in kratos.yaml + migrate-job.yaml (oryd/kratos:v26.2.0, 2026-06-03). See kratos/README.md.
+    version: v1.3.0  # internal config schema version; do not change unless Kratos release notes require it
+
+    serve:
+      public:
+        base_url: https://auth.myhoneydue.com/
+        cors:
+          enabled: true
+          allowed_origins:
+            - https://myhoneydue.com
+            - https://app.myhoneydue.com
+            - https://admin.myhoneydue.com
+          allowed_methods: [GET, POST, PUT, PATCH, DELETE, OPTIONS]
+          allowed_headers: [Authorization, Content-Type, X-Session-Token, Cookie]
+          exposed_headers: [Content-Type, Set-Cookie]
+          # Required: the web clients call Kratos browser flows with
+          # credentials (the ory_kratos_session cookie). Safe here because
+          # allowed_origins is an explicit list, never a wildcard.
+          allow_credentials: true
+      admin:
+        base_url: http://kratos.honeydue.svc.cluster.local:4434/
+
+    selfservice:
+      default_browser_return_url: https://app.myhoneydue.com/
+      allowed_return_urls:
+        - https://app.myhoneydue.com
+        - https://myhoneydue.com
+        - honeydue://callback
+
+      methods:
+        password:
+          enabled: true
+        code:                       # email one-time codes (verify/recover)
+          enabled: true
+        oidc:
+          enabled: true
+          config:
+            providers:
+              # index 0 — Apple Sign In. apple_private_key (.p8 contents) is
+              # injected via env SELFSERVICE_METHODS_OIDC_CONFIG_PROVIDERS_0_APPLE_PRIVATE_KEY.
+              # client_id is the Apple Services ID (here: the bundle ID, which
+              # was configured as a Services ID with Sign In with Apple
+              # capability — see operator notes in README.md §5).
+              - id: apple
+                provider: apple
+                # Production bundle id. Apple issues id_tokens with
+                # `aud` = the requesting app's bundle id, so this is the
+                # primary audience Kratos verifies against.
+                client_id: com.myhoneydue.honeyDue
+                # Debug builds out of Xcode use a `.dev` bundle id (see
+                # iosApp/honeyDue.xcodeproj — Debug config). Their id_tokens
+                # therefore have `aud: com.myhoneydue.honeyDue.dev`, which
+                # the primary client_id check rejects. Whitelist the dev
+                # audience so Apple Sign In works from a non-Release Xcode
+                # build without per-build Kratos reconfiguration.
+                additional_id_token_audiences:
+                  - com.myhoneydue.honeyDue.dev
+                apple_team_id: X86BR9WTLD
+                apple_private_key_id: HQD3NCF99C
+                mapper_url: file:///etc/kratos/oidc.apple.jsonnet
+                scope: [openid, email, name]
+
+      flows:
+        error:
+          ui_url: https://app.myhoneydue.com/auth/error
+        login:
+          ui_url: https://app.myhoneydue.com/auth/login
+          lifespan: 10m
+        registration:
+          ui_url: https://app.myhoneydue.com/auth/registration
+          lifespan: 10m
+          after:
+            password:
+              hooks:
+                - hook: session     # auto-login after registration
+            oidc:
+              hooks:
+                - hook: session
+        verification:
+          enabled: true
+          ui_url: https://app.myhoneydue.com/auth/verification
+          use: code
+          after:
+            default_browser_return_url: https://app.myhoneydue.com/
+        recovery:
+          enabled: true
+          ui_url: https://app.myhoneydue.com/auth/recovery
+          use: code
+        settings:
+          ui_url: https://app.myhoneydue.com/auth/settings
+          privileged_session_max_age: 15m
+        logout:
+          after:
+            default_browser_return_url: https://app.myhoneydue.com/
+
+    log:
+      level: info
+      format: json
+      leak_sensitive_values: false
+
+    ciphers:
+      algorithm: xchacha20-poly1305
+
+    hashers:
+      algorithm: bcrypt
+      bcrypt:
+        cost: 12
+
+    identity:
+      default_schema_id: honeydue
+      schemas:
+        - id: honeydue
+          url: file:///etc/kratos/identity.schema.json
+
+    courier:
+      smtp:
+        from_address: noreply@myhoneydue.com
+        from_name: honeyDue
+        # connection_uri is injected via env COURIER_SMTP_CONNECTION_URI
+
+    session:
+      lifespan: 720h                # 30-day sessions (mobile)
+      cookie:
+        domain: myhoneydue.com
+        same_site: Lax
+
+  identity.schema.json: |
+    {
+      "$id": "https://honeydue.app/identity.schema.json",
+      "$schema": "http://json-schema.org/draft-07/schema#",
+      "title": "honeyDue user",
+      "type": "object",
+      "properties": {
+        "traits": {
+          "type": "object",
+          "properties": {
+            "email": {
+              "type": "string",
+              "format": "email",
+              "title": "Email",
+              "minLength": 3,
+              "maxLength": 320,
+              "ory.sh/kratos": {
+                "credentials": {
+                  "password": { "identifier": true },
+                  "code": { "identifier": true, "via": "email" },
+                  "totp": { "account_name": true }
+                },
+                "verification": { "via": "email" },
+                "recovery": { "via": "email" }
+              }
+            },
+            "name": {
+              "type": "object",
+              "title": "Name",
+              "properties": {
+                "first": { "type": "string", "title": "First name", "maxLength": 100 },
+                "last": { "type": "string", "title": "Last name", "maxLength": 100 }
+              }
+            }
+          },
+          "required": ["email"],
+          "additionalProperties": false
+        }
+      }
+    }
+
+  oidc.google.jsonnet: |
+    // Maps Google OIDC claims onto the honeyDue identity schema.
+    local claims = std.extVar('claims');
+    {
+      identity: {
+        traits: {
+          email: claims.email,
+          [if 'given_name' in claims || 'family_name' in claims then 'name']: {
+            first: if 'given_name' in claims then claims.given_name else '',
+            last: if 'family_name' in claims then claims.family_name else '',
+          },
+        },
+      },
+    }
+
+  oidc.apple.jsonnet: |
+    // Maps Apple OIDC claims onto the honeyDue identity schema. Apple only
+    // returns the name on the very first authorization and not in the ID
+    // token claims, so only email is mapped here.
+    //
+    // Sign in with Apple emails are marked verified UNCONDITIONALLY: completing
+    // SIWA cryptographically proves the user controls that Apple ID, and Apple
+    // owns/verifies the (relay or real) email, so a 6-digit code would be
+    // redundant. We deliberately do NOT gate this on Apple's `email_verified`
+    // claim — Apple omits that claim on many authorizations (only sends it on
+    // the first grant), which made auto-verification random: sometimes verified,
+    // sometimes a surprise code prompt (observed 2026-06-03). Marking it
+    // verified on every SIWA makes the behaviour consistent: Apple users never
+    // see a code; password sign-ups still verify via the honeyDue API flow.
+    local claims = std.extVar('claims');
+    {
+      identity: {
+        traits: {
+          email: claims.email,
+        },
+        verified_addresses: std.prune([
+          if 'email' in claims then {
+            via: 'email',
+            value: claims.email,
+          },
+        ]),
+      },
+    }
@@ -0,0 +1,44 @@
+# Public ingress for Ory Kratos — auth.myhoneydue.com → Kratos public API :4433.
+#
+# Middlewares match the honeyDue API ingress (security-headers + rate-limit).
+# The cloudflare-only middleware is intentionally NOT applied here: on this
+# cluster, klipper-lb SNATs the source IP before Traefik sees it, so
+# cloudflare-only's IP allowlist rejects every legitimate Cloudflare request
+# (verified 2026-06-03 — iOS Apple Sign In failed silently because Kratos
+# never received the request). The api ingress doesn't use cloudflare-only
+# for the same reason. DDoS protection still rides on Cloudflare's edge.
+#
+# Kratos's self-service flows are multi-request, so the strict auth-rate-limit
+# (5/min) is intentionally NOT used here — Kratos applies its own per-flow
+# protections.
+#
+# OPERATOR: confirm the cloudflare-origin-cert TLS secret covers
+# auth.myhoneydue.com (apex + wildcard origin cert), and add the
+# auth.myhoneydue.com DNS record in Cloudflare (proxied) → cluster ingress.
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: honeydue-auth
+  namespace: honeydue
+  labels:
+    app.kubernetes.io/name: kratos
+    app.kubernetes.io/part-of: honeydue
+  annotations:
+    traefik.ingress.kubernetes.io/router.middlewares: honeydue-security-headers@kubernetescrd,honeydue-rate-limit@kubernetescrd
+spec:
+  ingressClassName: traefik
+  tls:
+    - hosts:
+        - auth.myhoneydue.com
+      secretName: cloudflare-origin-cert
+  rules:
+    - host: auth.myhoneydue.com
+      http:
+        paths:
+          - path: /
+            pathType: Prefix
+            backend:
+              service:
+                name: kratos
+                port:
+                  number: 4433
@@ -0,0 +1,208 @@
+# Ory Kratos — identity service for honeyDue.
+#
+# Deployed once the operator has completed the prerequisites in kratos/README.md
+# (Neon `kratos` database, auth.myhoneydue.com DNS, Apple Sign In OIDC client,
+# and the kratos-secrets Secret). Until then 03-deploy.sh skips the Kratos
+# apply, so the existing stack is unaffected.
+#
+# IMAGE: pinned to oryd/kratos v26.2.0 (CalVer current stable as of 2026-06-03)
+# with the linux/amd64 digest. The schema-migration Job is in migrate-job.yaml
+# and runs before this Deployment rolls.
+#
+# OIDC: currently Apple-only (configmap.yaml providers[0]). Google was scoped
+# out at deploy time; adding it later is additive — append to providers[] in
+# configmap.yaml and add the matching CLIENT_SECRET env binding here.
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: kratos
+  namespace: honeydue
+  labels:
+    app.kubernetes.io/name: kratos
+    app.kubernetes.io/part-of: honeydue
+spec:
+  replicas: 2
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxUnavailable: 0
+      maxSurge: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: kratos
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: kratos
+        app.kubernetes.io/part-of: honeydue
+    spec:
+      automountServiceAccountToken: false
+      securityContext:
+        runAsNonRoot: true
+        seccompProfile:
+          type: RuntimeDefault
+      containers:
+        - name: kratos
+          image: oryd/kratos:v26.2.0@sha256:92eedc292ff8e1a918ac442c88ed0abe44610c75121700963114549908a45ac3
+          imagePullPolicy: IfNotPresent
+          args:
+            - serve
+            - --config
+            - /etc/kratos/kratos.yml
+            - --watch-courier      # send verification/recovery email in-process
+          ports:
+            - name: public
+              containerPort: 4433
+            - name: admin
+              containerPort: 4434
+          env:
+            # Kratos is configured natively via env vars; secrets come from
+            # the kratos-secrets Secret rather than the ConfigMap.
+            - name: DSN
+              valueFrom: { secretKeyRef: { name: kratos-secrets, key: dsn } }
+            - name: SECRETS_COOKIE
+              valueFrom: { secretKeyRef: { name: kratos-secrets, key: secrets_cookie } }
+            - name: SECRETS_CIPHER
+              valueFrom: { secretKeyRef: { name: kratos-secrets, key: secrets_cipher } }
+            - name: COURIER_SMTP_CONNECTION_URI
+              valueFrom: { secretKeyRef: { name: kratos-secrets, key: smtp_connection_uri } }
+            # OIDC provider secrets — index must match the providers list
+            # order in configmap.yaml. Apple-only for now (index 0).
+            - name: SELFSERVICE_METHODS_OIDC_CONFIG_PROVIDERS_0_APPLE_PRIVATE_KEY
+              valueFrom: { secretKeyRef: { name: kratos-secrets, key: apple_private_key } }
+          volumeMounts:
+            - name: config
+              mountPath: /etc/kratos
+              readOnly: true
+            - name: tmp
+              mountPath: /tmp
+          readinessProbe:
+            httpGet:
+              path: /health/ready
+              port: 4434
+            initialDelaySeconds: 5
+            periodSeconds: 10
+          livenessProbe:
+            httpGet:
+              path: /health/alive
+              port: 4434
+            initialDelaySeconds: 10
+            periodSeconds: 30
+          resources:
+            requests:
+              cpu: 100m
+              memory: 128Mi
+            limits:
+              cpu: "1"
+              memory: 512Mi
+          securityContext:
+            allowPrivilegeEscalation: false
+            readOnlyRootFilesystem: true
+            capabilities:
+              drop: ["ALL"]
+      volumes:
+        - name: config
+          configMap:
+            name: kratos-config
+        - name: tmp
+          emptyDir:
+            sizeLimit: 64Mi
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: kratos
+  namespace: honeydue
+  labels:
+    app.kubernetes.io/name: kratos
+    app.kubernetes.io/part-of: honeydue
+spec:
+  selector:
+    app.kubernetes.io/name: kratos
+  ports:
+    - name: public
+      port: 4433
+      targetPort: 4433
+    - name: admin
+      port: 4434
+      targetPort: 4434
+---
+# Ingress to Kratos. Traefik (serving the honeydue-auth Ingress for
+# auth.myhoneydue.com) reaches only the public API :4433. The honeyDue api pods
+# reach the public API :4433 (session whoami) AND the admin API :4434 (identity
+# deletion on account close). The admin API :4434 takes no other cluster ingress.
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: allow-ingress-to-kratos
+  namespace: honeydue
+spec:
+  podSelector:
+    matchLabels:
+      app.kubernetes.io/name: kratos
+  policyTypes:
+    - Ingress
+  ingress:
+    # Traefik ingress controller -> public API only. Matched by namespace, so any kube-system pod is admitted.
+    - from:
+        - namespaceSelector:
+            matchLabels:
+              kubernetes.io/metadata.name: kube-system
+      ports:
+        - port: 4433
+          protocol: TCP
+    # honeyDue api pods -> public API (whoami) + admin API (identity deletion).
+    - from:
+        - podSelector:
+            matchLabels:
+              app.kubernetes.io/name: api
+      ports:
+        - port: 4433
+          protocol: TCP
+        - port: 4434
+          protocol: TCP
+---
+# Kratos egress: DNS, the Neon Postgres database, SMTP, and HTTPS to the
+# OIDC providers (Apple/Google token + JWKS endpoints).
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: allow-egress-from-kratos
+  namespace: honeydue
+spec:
+  podSelector:
+    matchLabels:
+      app.kubernetes.io/name: kratos
+  policyTypes:
+    - Egress
+  egress:
+    - to:
+        - namespaceSelector: {}
+      ports:
+        - port: 53
+          protocol: UDP
+        - port: 53
+          protocol: TCP
+    # Neon Postgres (external)
+    - to:
+        - ipBlock:
+            cidr: 0.0.0.0/0
+            except:
+              - 10.42.0.0/16
+              - 10.43.0.0/16
+      ports:
+        - port: 5432
+          protocol: TCP
+    # SMTP (Fastmail) + HTTPS to Apple/Google OIDC endpoints (external)
+    - to:
+        - ipBlock:
+            cidr: 0.0.0.0/0
+            except:
+              - 10.42.0.0/16
+              - 10.43.0.0/16
+      ports:
+        - port: 465
+          protocol: TCP
+        - port: 443
+          protocol: TCP
@@ -0,0 +1,51 @@
+# Ory Kratos schema migration — runs `kratos migrate sql` against the Kratos
+# database before the Kratos Deployment rolls. 03-deploy.sh applies this,
+# waits for completion, then applies kratos.yaml.
+#
+# IMAGE: pinned to oryd/kratos v26.2.0 (CalVer current stable as of 2026-06-03)
+# with the linux/amd64 digest. Bump in sync with kratos.yaml's image.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: kratos-migrate
+  namespace: honeydue
+  labels:
+    app.kubernetes.io/name: kratos
+    app.kubernetes.io/part-of: honeydue
+spec:
+  backoffLimit: 0
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: kratos
+        app.kubernetes.io/part-of: honeydue
+    spec:
+      restartPolicy: Never
+      automountServiceAccountToken: false
+      securityContext:
+        runAsNonRoot: true
+        seccompProfile:
+          type: RuntimeDefault
+      containers:
+        - name: kratos-migrate
+          image: oryd/kratos:v26.2.0@sha256:92eedc292ff8e1a918ac442c88ed0abe44610c75121700963114549908a45ac3
+          imagePullPolicy: IfNotPresent
+          args: ["migrate", "sql", "-e", "--yes"]
+          env:
+            - name: DSN
+              valueFrom:
+                secretKeyRef:
+                  name: kratos-secrets
+                  key: dsn
+          securityContext:
+            allowPrivilegeEscalation: false
+            readOnlyRootFilesystem: true
+            capabilities:
+              drop: ["ALL"]
+          resources:
+            requests:
+              cpu: 50m
+              memory: 64Mi
+            limits:
+              cpu: 500m
+              memory: 256Mi
@@ -0,0 +1,61 @@
+# Kyverno image-signature verification policy (audit CODE-L5).
+#
+# ──────────────────────────────────────────────────────────────────────────
+# THIS MANIFEST IS NOT APPLIED BY 03-deploy.sh. It is intentionally outside
+# the script's apply set. Applying it before the prerequisites are in place
+# would block every honeydue Pod from scheduling. Operator steps:
+#
+#   1. Install Kyverno in the cluster (it is an admission controller):
+#        kubectl create -f https://github.com/kyverno/kyverno/releases/latest/download/install.yaml
+#   2. Generate a cosign key pair and keep the private key safe:
+#        cosign generate-key-pair                 # -> cosign.key (PRIVATE) + cosign.pub
+#      Set COSIGN_KEY=cosign.key in the deploy environment so 03-deploy.sh
+#      signs images after pushing them (the signing step is already wired,
+#      guarded, into 03-deploy.sh).
+#   3. Paste the contents of cosign.pub into the publicKeys block below.
+#   4. Apply this policy:  kubectl apply -f deploy-k3s/manifests/kyverno-verify-images.yaml
+#   5. After confirming honeydue Pods still schedule, flip
+#      validationFailureAction from Audit to Enforce.
+#
+# Until then it is a documented, ready-to-use template — not active config.
+# ──────────────────────────────────────────────────────────────────────────
+apiVersion: kyverno.io/v1
+kind: ClusterPolicy
+metadata:
+  name: verify-honeydue-images
+  annotations:
+    policies.kyverno.io/title: Verify honeyDue image signatures
+    policies.kyverno.io/description: >-
+      Requires that honeyDue application images pulled into the honeydue
+      namespace carry a valid cosign signature made with the operator's key.
+spec:
+  # Audit first — logs violations without blocking. Switch to Enforce once
+  # signing is confirmed working end to end.
+  validationFailureAction: Audit
+  background: false
+  webhookTimeoutSeconds: 30
+  rules:
+    - name: verify-gitea-image-signatures
+      match:
+        any:
+          - resources:
+              kinds:
+                - Pod
+              namespaces:
+                - honeydue
+      verifyImages:
+        # Only the images we build and sign. Public base images
+        # (redis, vmagent) are pinned by digest instead — see their manifests.
+        - imageReferences:
+            - "gitea.treytartt.com/admin/honeydue-api*"
+            - "gitea.treytartt.com/admin/honeydue-worker*"
+            - "gitea.treytartt.com/admin/honeydue-admin*"
+            - "gitea.treytartt.com/admin/honeydue-web*"
+          attestors:
+            - count: 1
+              entries:
+                - keys:
+                    publicKeys: |-
+                      -----BEGIN PUBLIC KEY-----
+                      REPLACE_WITH_CONTENTS_OF_cosign.pub
+                      -----END PUBLIC KEY-----
@@ -0,0 +1,78 @@
+# One-shot migration Job. Runs goose against Neon's *direct* (non-pooler)
+# endpoint, applies any pending migrations from /app/migrations (baked into
+# the api image), exits.
+#
+# 03-deploy.sh deletes any prior Job, applies this one, waits for completion
+# with `kubectl wait --for=condition=complete`, and rolls api/worker only
+# after the Job succeeds. A Job failure aborts the whole deploy.
+#
+# We reuse the api image rather than build a separate one — the api Dockerfile
+# already installs the goose CLI to /usr/local/bin/goose and copies the
+# migrations directory to /app/migrations.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: honeydue-migrate
+  namespace: honeydue
+  labels:
+    app.kubernetes.io/name: migrate
+    app.kubernetes.io/part-of: honeydue
+spec:
+  backoffLimit: 0                  # fail fast — no silent retries on a bad migration
+  ttlSecondsAfterFinished: 86400   # keep finished Job for 24h so logs are inspectable
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: migrate
+        app.kubernetes.io/part-of: honeydue
+    spec:
+      restartPolicy: Never
+      # The migrate Job never calls the k8s API (audit F11).
+      automountServiceAccountToken: false
+      imagePullSecrets:
+        - name: gitea-credentials
+      securityContext:
+        runAsNonRoot: true
+        runAsUser: 1000
+        runAsGroup: 1000
+        seccompProfile:
+          type: RuntimeDefault
+      containers:
+        - name: goose
+          image: IMAGE_PLACEHOLDER  # Replaced by 03-deploy.sh — same as api
+          imagePullPolicy: IfNotPresent  # audit CODE-L4 — explicit
+          command: ["/bin/sh", "-c"]
+          # DB_HOST in the ConfigMap points at the -pooler endpoint for runtime.
+          # goose's session-scoped advisory lock can't survive PgBouncer
+          # transaction-mode, so we strip the -pooler segment for migrations.
+          # `set -e` so any sub-command failure exits non-zero. The password is
+          # exported as PGPASSWORD, keeping it out of the DSN string and argv.
+          args:
+            - |
+              set -e
+              DIRECT_HOST=$(echo "$DB_HOST" | sed 's/-pooler\.\(.*\)$/.\1/')
+              export PGPASSWORD="$POSTGRES_PASSWORD"
+              echo "[migrate] running goose up against $DIRECT_HOST"
+              exec /usr/local/bin/goose -dir /app/migrations \
+                postgres "host=$DIRECT_HOST port=$DB_PORT user=$POSTGRES_USER dbname=$POSTGRES_DB sslmode=$DB_SSLMODE" up
+          securityContext:
+            allowPrivilegeEscalation: false
+            readOnlyRootFilesystem: true
+            capabilities:
+              drop: ["ALL"]
+          envFrom:
+            - configMapRef:
+                name: honeydue-config
+          env:
+            - name: POSTGRES_PASSWORD
+              valueFrom:
+                secretKeyRef:
+                  name: honeydue-secrets
+                  key: POSTGRES_PASSWORD
+          resources:
+            requests:
+              cpu: 100m
+              memory: 64Mi
+            limits:
+              cpu: 500m
+              memory: 256Mi
@@ -140,6 +140,20 @@ spec:
      ports:
        - protocol: TCP
          port: 6379
+    # Kratos (in-cluster). The auth middleware validates every session via
+    # http://kratos:4433/sessions/whoami; the AuthService also uses :4434
+    # for account deletion (DELETE /admin/identities/{id}). k3s evaluates
+    # egress rules AFTER kube-proxy DNAT (runbook §9.2), so this podSelector
+    # rule covers Service ClusterIP traffic correctly.
+    - to:
+        - podSelector:
+            matchLabels:
+              app.kubernetes.io/name: kratos
+      ports:
+        - protocol: TCP
+          port: 4433
+        - protocol: TCP
+          port: 4434
    # External services: Neon DB (5432), SMTP (587), HTTPS (443 — APNs, FCM, B2, PostHog)
    - to:
        - ipBlock:
@@ -275,3 +289,154 @@ spec:
      ports:
        - protocol: TCP
          port: 443
+
+---
+# vmagent egress.
+#
+# IMPORTANT (gotcha): k3s's built-in NetworkPolicy controller appears to
+# evaluate egress rules AFTER kube-proxy's DNAT, not before (contrary to
+# the k8s spec). So traffic from a pod to the kubernetes Service
+# (ClusterIP 10.43.0.1:443) is policy-checked as dst=<node_public_ip>:6443.
+# That's why we need an explicit rule for :6443 to public IPs, even though
+# we already allow :443 to the cluster service CIDR.
+#
+# Without the :6443 rule, vmagent's k8s service discovery silently fails
+# and zero pods get scraped. See deploy-k3s/RUNBOOK.md ("vmagent SD broken").
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: allow-egress-from-vmagent
+  namespace: honeydue
+spec:
+  podSelector:
+    matchLabels:
+      app.kubernetes.io/name: vmagent
+  policyTypes:
+    - Egress
+  egress:
+    # DNS (cluster-internal)
+    - to:
+        - namespaceSelector: {}
+      ports:
+        - port: 53
+          protocol: UDP
+        - port: 53
+          protocol: TCP
+    # k8s API server via ClusterIP (pre-DNAT view)
+    - to:
+        - ipBlock:
+            cidr: 10.43.0.0/16
+      ports:
+        - port: 443
+          protocol: TCP
+    # k8s API server post-DNAT (real path k3s NetPol enforcer sees) — REQUIRED
+    - to:
+        - ipBlock:
+            cidr: 0.0.0.0/0
+            except:
+              - 10.42.0.0/16
+      ports:
+        - port: 6443
+          protocol: TCP
+    # Scrape api Pods on :8000
+    - to:
+        - ipBlock:
+            cidr: 10.42.0.0/16
+      ports:
+        - port: 8000
+          protocol: TCP
+    # Scrape kube-state-metrics Pod on :8080 (pod CIDR)
+    - to:
+        - ipBlock:
+            cidr: 10.42.0.0/16
+      ports:
+        - port: 8080
+          protocol: TCP
+    # HTTPS to public (remote-write to obs.88oakapps.com via Cloudflare)
+    - to:
+        - ipBlock:
+            cidr: 0.0.0.0/0
+            except:
+              - 10.42.0.0/16
+              - 10.43.0.0/16
+      ports:
+        - port: 443
+          protocol: TCP
+
+---
+# Allow vmagent → api ingress on :8000 so api pods accept scrapes.
+# api Pods are otherwise locked down by default-deny-all + allow-ingress-to-api
+# (which only allows Traefik). This adds vmagent specifically.
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: allow-vmagent-to-api
+  namespace: honeydue
+spec:
+  podSelector:
+    matchLabels:
+      app.kubernetes.io/name: api
+  policyTypes:
+    - Ingress
+  ingress:
+    - from:
+        - podSelector:
+            matchLabels:
+              app.kubernetes.io/name: vmagent
+      ports:
+        - port: 8000
+          protocol: TCP
+
+---
+# alloy-logs egress — Grafana Alloy discovers honeydue pods via the k8s API
+# and pushes their logs to Loki at obs.88oakapps.com. Same k3s NetworkPolicy
+# DNAT gotcha as vmagent: API-server traffic is policy-checked as
+# dst=<node_public_ip>:6443, so an explicit :6443 rule is required.
+# Alloy reads log FILES from a hostPath, so it needs no ingress and no
+# egress to pod :8000/:8080 — only DNS, the API server, and obs HTTPS.
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: allow-egress-from-alloy-logs
+  namespace: honeydue
+spec:
+  podSelector:
+    matchLabels:
+      app.kubernetes.io/name: alloy-logs
+  policyTypes:
+    - Egress
+  egress:
+    # DNS (cluster-internal)
+    - to:
+        - namespaceSelector: {}
+      ports:
+        - port: 53
+          protocol: UDP
+        - port: 53
+          protocol: TCP
+    # k8s API server via ClusterIP (pre-DNAT view)
+    - to:
+        - ipBlock:
+            cidr: 10.43.0.0/16
+      ports:
+        - port: 443
+          protocol: TCP
+    # k8s API server post-DNAT (real path k3s NetPol enforcer sees) — REQUIRED
+    - to:
+        - ipBlock:
+            cidr: 0.0.0.0/0
+            except:
+              - 10.42.0.0/16
+      ports:
+        - port: 6443
+          protocol: TCP
+    # HTTPS to public (log push to obs.88oakapps.com via Cloudflare)
+    - to:
+        - ipBlock:
+            cidr: 0.0.0.0/0
+            except:
+              - 10.42.0.0/16
+              - 10.43.0.0/16
+      ports:
+        - port: 443
+          protocol: TCP
@@ -0,0 +1,278 @@
+# honeyDue log shipper — Grafana Alloy as a DaemonSet.
+#
+# Each node runs one Alloy pod that tails the honeydue-namespace pod logs in
+# /var/log/pods and pushes them to Loki at obs.88oakapps.com/loki/api/v1/push
+# (the same nginx ingest endpoint + bearer token vmagent uses for metrics).
+#
+# Runs as root: /var/log/pods is 0750 root:root on the k3s nodes, so a
+# non-root uid cannot even traverse it. The container is otherwise locked
+# down — all capabilities dropped, read-only root filesystem, seccomp
+# RuntimeDefault — and touches the host only through a read-only hostPath
+# mount of /var/log/pods plus a writable hostPath for its read offsets. This is
+# the one root-running workload in the namespace (standard for log collectors); see docs/deployment.
+#
+# 03-deploy.sh substitutes TOKEN_PLACEHOLDER with OBS_INGEST_TOKEN from
+# deploy/prod.env before applying — the token never lands in the repo.
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: alloy-logs
+  namespace: honeydue
+  labels:
+    app.kubernetes.io/name: alloy-logs
+    app.kubernetes.io/part-of: honeydue
+---
+# Least privilege: Alloy's discovery.kubernetes only lists/watches pods, and
+# only in the honeydue namespace — so a namespaced Role, not a ClusterRole.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: alloy-logs
+  namespace: honeydue
+  labels:
+    app.kubernetes.io/name: alloy-logs
+    app.kubernetes.io/part-of: honeydue
+rules:
+  - apiGroups: [""]
+    resources: ["pods"]
+    verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: alloy-logs
+  namespace: honeydue
+  labels:
+    app.kubernetes.io/name: alloy-logs
+    app.kubernetes.io/part-of: honeydue
+subjects:
+  - kind: ServiceAccount
+    name: alloy-logs
+    namespace: honeydue
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: alloy-logs
+---
+# Bearer token for the Loki push endpoint. TOKEN_PLACEHOLDER is replaced by
+# 03-deploy.sh with OBS_INGEST_TOKEN (same token vmagent uses).
+apiVersion: v1
+kind: Secret
+metadata:
+  name: alloy-logs-auth
+  namespace: honeydue
+  labels:
+    app.kubernetes.io/name: alloy-logs
+    app.kubernetes.io/part-of: honeydue
+type: Opaque
+stringData:
+  bearer_token: TOKEN_PLACEHOLDER
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: alloy-logs
+  namespace: honeydue
+  labels:
+    app.kubernetes.io/name: alloy-logs
+    app.kubernetes.io/part-of: honeydue
+data:
+  config.alloy: |
+    // honeyDue log shipper. Each DaemonSet instance discovers honeydue-namespace
+    // pods via the Kubernetes API, tails the container log files present on its
+    // own node (/var/log/pods), and pushes them to Loki at obs.88oakapps.com.
+
+    logging {
+      level  = "warn"
+      format = "logfmt"
+    }
+
+    discovery.kubernetes "pods" {
+      role = "pod"
+      namespaces {
+        names = ["honeydue"]
+      }
+    }
+
+    // Turn pod metadata into Loki labels and build the on-disk log path.
+    discovery.relabel "pod_logs" {
+      targets = discovery.kubernetes.pods.targets
+
+      rule {
+        source_labels = ["__meta_kubernetes_namespace"]
+        action        = "replace"
+        target_label  = "namespace"
+      }
+      rule {
+        source_labels = ["__meta_kubernetes_pod_name"]
+        action        = "replace"
+        target_label  = "pod"
+      }
+      rule {
+        source_labels = ["__meta_kubernetes_pod_container_name"]
+        action        = "replace"
+        target_label  = "container"
+      }
+      rule {
+        source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"]
+        action        = "replace"
+        target_label  = "app"
+      }
+      rule {
+        source_labels = ["__meta_kubernetes_pod_node_name"]
+        action        = "replace"
+        target_label  = "node"
+      }
+      // /var/log/pods/<namespace>_<pod>_<uid>/<container>/<n>.log
+      rule {
+        source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"]
+        separator     = "/"
+        action        = "replace"
+        replacement   = "/var/log/pods/*$1/*.log"
+        target_label  = "__path__"
+      }
+    }
+
+    local.file_match "pod_logs" {
+      path_targets = discovery.relabel.pod_logs.output
+    }
+
+    loki.source.file "pod_logs" {
+      targets       = local.file_match.pod_logs.targets
+      forward_to    = [loki.process.pod_logs.receiver]
+      // With no stored read offset (fresh node, or positions wiped), start
+      // at the END of each file instead of re-shipping history — otherwise
+      // Loki rejects the now-too-old entries ("entry too far behind") and
+      // shipping stalls. Offsets persist on a hostPath (see volumes), so a
+      // normal pod restart resumes exactly where it left off.
+      tail_from_end = true
+    }
+
+    // Parse the CRI log format (timestamp / stream / flags / message),
+    // then drop probe/scrape noise before shipping.
+    loki.process "pod_logs" {
+      forward_to = [loki.write.obs.receiver]
+
+      stage.cri {}
+
+      // Drop successful probe/scrape access logs. k8s liveness/readiness
+      // hits /api/health/ every few seconds and vmagent scrapes /metrics
+      // on a 15s interval — all 2xx, pure noise that drowns real logs.
+      // A non-2xx health check, or one logged above info level, does NOT match
+      // and is kept. The regex assumes key order level → path → status.
+      stage.drop {
+        expression          = "\"level\":\"info\".*\"path\":\"/(api/health/?|metrics)\".*\"status\":2[0-9][0-9]"
+        drop_counter_reason = "probe_access_ok"
+      }
+    }
+
+    loki.write "obs" {
+      endpoint {
+        url               = "https://obs.88oakapps.com/loki/api/v1/push"
+        bearer_token_file = "/etc/alloy-secrets/bearer_token"
+      }
+      external_labels = {
+        cluster     = "honeydue-k3s",
+        environment = "prod",
+      }
+    }
+---
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: alloy-logs
+  namespace: honeydue
+  labels:
+    app.kubernetes.io/name: alloy-logs
+    app.kubernetes.io/part-of: honeydue
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: alloy-logs
+  updateStrategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxUnavailable: 1
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: alloy-logs
+        app.kubernetes.io/part-of: honeydue
+    spec:
+      serviceAccountName: alloy-logs
+      # Alloy needs its SA token — discovery.kubernetes talks to the API server.
+      automountServiceAccountToken: true
+      # Root is required to traverse /var/log/pods (0750 root:root). The
+      # container is otherwise fully confined (see container securityContext).
+      securityContext:
+        runAsUser: 0
+        runAsGroup: 0
+        seccompProfile:
+          type: RuntimeDefault
+      tolerations:
+        # DaemonSet must run on every node, including nodes carrying the control-plane taint.
+        - key: node-role.kubernetes.io/control-plane
+          operator: Exists
+          effect: NoSchedule
+      containers:
+        - name: alloy
+          image: grafana/alloy:v1.5.1@sha256:01a63f4e032ce54ee94b22049bc27f597e74f85566478c377f4b5c7f020c1eb3
+          imagePullPolicy: IfNotPresent
+          args:
+            - run
+            - /etc/alloy/config.alloy
+            - --storage.path=/tmp/alloy
+            - --server.http.listen-addr=0.0.0.0:12345
+          ports:
+            - name: http
+              containerPort: 12345
+          securityContext:
+            allowPrivilegeEscalation: false
+            readOnlyRootFilesystem: true
+            capabilities:
+              drop: ["ALL"]
+          volumeMounts:
+            - name: config
+              mountPath: /etc/alloy
+              readOnly: true
+            - name: auth
+              mountPath: /etc/alloy-secrets
+              readOnly: true
+            - name: varlogpods
+              mountPath: /var/log/pods
+              readOnly: true
+            - name: tmp
+              mountPath: /tmp/alloy
+          readinessProbe:
+            httpGet:
+              path: /-/ready
+              port: 12345
+            initialDelaySeconds: 10
+            periodSeconds: 20
+          resources:
+            requests:
+              cpu: 25m
+              memory: 64Mi
+            limits:
+              cpu: 150m
+              memory: 256Mi
+      volumes:
+        - name: config
+          configMap:
+            name: alloy-logs
+        - name: auth
+          secret:
+            secretName: alloy-logs-auth
+            defaultMode: 0400
+        - name: varlogpods
+          hostPath:
+            path: /var/log/pods
+            type: Directory
+        # Alloy's positions/WAL store. A hostPath (not emptyDir) so file read
+        # offsets survive pod restarts — without them, tail_from_end makes Alloy
+        # resume at the end of each file and skip lines logged while it was down.
+        - name: tmp
+          hostPath:
+            path: /var/lib/honeydue-alloy-logs
+            type: DirectoryOrCreate
@@ -0,0 +1,223 @@
+# kube-state-metrics — exposes cluster object state (pods, deployments,
+# services, etc.) as Prometheus metrics. vmagent scrapes it via the
+# `kube-state-metrics` job defined in vmagent-config; Grafana panels that
+# count pods, replicas, etc. consume the `kube_*` metrics this produces.
+#
+# Lives in kube-system because it watches resources cluster-wide.
+# RBAC is cluster-scoped (ClusterRole + ClusterRoleBinding).
+#
+# Image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.13.0
+# (latest stable as of authoring; bump when a newer minor is released)
+
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: kube-state-metrics
+  namespace: kube-system
+  labels:
+    app.kubernetes.io/name: kube-state-metrics
+    app.kubernetes.io/part-of: honeydue-observability
+
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: kube-state-metrics
+  labels:
+    app.kubernetes.io/name: kube-state-metrics
+    app.kubernetes.io/part-of: honeydue-observability
+rules:
+  # Core resources
+  - apiGroups: [""]
+    resources:
+      - configmaps
+      - secrets
+      - nodes
+      - pods
+      - services
+      - serviceaccounts
+      - resourcequotas
+      - replicationcontrollers
+      - limitranges
+      - persistentvolumeclaims
+      - persistentvolumes
+      - namespaces
+      - endpoints
+    verbs: [list, watch]
+  # Apps
+  - apiGroups: ["apps"]
+    resources:
+      - statefulsets
+      - daemonsets
+      - deployments
+      - replicasets
+    verbs: [list, watch]
+  # Batch
+  - apiGroups: ["batch"]
+    resources:
+      - cronjobs
+      - jobs
+    verbs: [list, watch]
+  # Autoscaling
+  - apiGroups: ["autoscaling"]
+    resources:
+      - horizontalpodautoscalers
+    verbs: [list, watch]
+  # Authentication / authorization (used by some ksm collectors)
+  - apiGroups: ["authentication.k8s.io"]
+    resources: [tokenreviews]
+    verbs: [create]
+  - apiGroups: ["authorization.k8s.io"]
+    resources: [subjectaccessreviews]
+    verbs: [create]
+  # Policy
+  - apiGroups: ["policy"]
+    resources: [poddisruptionbudgets]
+    verbs: [list, watch]
+  # Certificate signing
+  - apiGroups: ["certificates.k8s.io"]
+    resources: [certificatesigningrequests]
+    verbs: [list, watch]
+  # Discovery
+  - apiGroups: ["discovery.k8s.io"]
+    resources: [endpointslices]
+    verbs: [list, watch]
+  # Storage
+  - apiGroups: ["storage.k8s.io"]
+    resources:
+      - storageclasses
+      - volumeattachments
+    verbs: [list, watch]
+  # Admission policy
+  - apiGroups: ["admissionregistration.k8s.io"]
+    resources:
+      - mutatingwebhookconfigurations
+      - validatingwebhookconfigurations
+    verbs: [list, watch]
+  # Networking
+  - apiGroups: ["networking.k8s.io"]
+    resources:
+      - networkpolicies
+      - ingressclasses
+      - ingresses
+    verbs: [list, watch]
+  # Coordination — Leases (read-only, for kube_lease_* metrics; list/watch alone cannot do leader election)
+  - apiGroups: ["coordination.k8s.io"]
+    resources: [leases]
+    verbs: [list, watch]
+  # RBAC
+  - apiGroups: ["rbac.authorization.k8s.io"]
+    resources:
+      - clusterrolebindings
+      - clusterroles
+      - rolebindings
+      - roles
+    verbs: [list, watch]
+
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: kube-state-metrics
+  labels:
+    app.kubernetes.io/name: kube-state-metrics
+    app.kubernetes.io/part-of: honeydue-observability
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: kube-state-metrics
+subjects:
+  - kind: ServiceAccount
+    name: kube-state-metrics
+    namespace: kube-system
+
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: kube-state-metrics
+  namespace: kube-system
+  labels:
+    app.kubernetes.io/name: kube-state-metrics
+    app.kubernetes.io/part-of: honeydue-observability
+spec:
+  type: ClusterIP
+  selector:
+    app.kubernetes.io/name: kube-state-metrics
+  ports:
+    - name: http-metrics
+      port: 8080
+      targetPort: http-metrics
+      protocol: TCP
+    - name: telemetry
+      port: 8081
+      targetPort: telemetry
+      protocol: TCP
+
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: kube-state-metrics
+  namespace: kube-system
+  labels:
+    app.kubernetes.io/name: kube-state-metrics
+    app.kubernetes.io/part-of: honeydue-observability
+spec:
+  replicas: 1
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: kube-state-metrics
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: kube-state-metrics
+        app.kubernetes.io/part-of: honeydue-observability
+    spec:
+      serviceAccountName: kube-state-metrics
+      automountServiceAccountToken: true
+      securityContext:
+        runAsNonRoot: true
+        runAsUser: 65534
+        fsGroup: 65534
+        seccompProfile:
+          type: RuntimeDefault
+      containers:
+        - name: kube-state-metrics
+          image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.13.0
+          imagePullPolicy: IfNotPresent
+          ports:
+            - containerPort: 8080
+              name: http-metrics
+            - containerPort: 8081
+              name: telemetry
+          args:
+            - --port=8080
+            - --telemetry-port=8081
+          resources:
+            requests:
+              cpu: 25m
+              memory: 64Mi
+            limits:
+              cpu: 200m
+              memory: 256Mi
+          securityContext:
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop: [ALL]
+            readOnlyRootFilesystem: true
+          livenessProbe:
+            httpGet:
+              path: /livez
+              port: http-metrics
+            initialDelaySeconds: 5
+            periodSeconds: 30
+          readinessProbe:
+            httpGet:
+              path: /readyz
+              port: http-metrics
+            initialDelaySeconds: 5
+            periodSeconds: 10
@@ -0,0 +1,126 @@
+# node-exporter — per-node host metrics (filesystem, memory, load, CPU).
+# Runs as a normal pod (NOT hostNetwork) so vmagent scrapes it pod-to-pod over
+# the cluster CIDR, avoiding any dependency on node public IPs (the netpol
+# node-IP list is OVH-stale). Host /proc, /sys and / are bind-mounted read-only
+# so the filesystem/memory/load collectors read the real host, not the pod ns.
+# Added 2026-06-08 to close RUNBOOK §11.1 gap #9 (node disk/mem were unmonitored).
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: node-exporter
+  namespace: honeydue
+  labels:
+    app.kubernetes.io/name: node-exporter
+    app.kubernetes.io/part-of: honeydue
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: node-exporter
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: node-exporter
+        app.kubernetes.io/part-of: honeydue
+    spec:
+      # Run on every node: a bare `operator: Exists` tolerates every taint, control-plane included.
+      tolerations:
+        - operator: Exists
+      securityContext:
+        runAsNonRoot: true
+        runAsUser: 65534
+        seccompProfile:
+          type: RuntimeDefault
+      containers:
+        - name: node-exporter
+          image: quay.io/prometheus/node-exporter:v1.8.2  # TODO digest-pin (audit K3S-F14)
+          imagePullPolicy: IfNotPresent
+          args:
+            - --path.procfs=/host/proc
+            - --path.sysfs=/host/sys
+            - --path.rootfs=/host/root
+            # Only report real host mounts; drop the kubelet/container churn.
+            - --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run|var/lib/kubelet/.+|var/lib/docker/.+|var/lib/containerd/.+)($|/)
+            - --collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$
+            - --no-collector.wifi
+            - --no-collector.hwmon
+            - --web.listen-address=:9100
+          ports:
+            - name: metrics
+              containerPort: 9100
+              protocol: TCP
+          securityContext:
+            allowPrivilegeEscalation: false
+            readOnlyRootFilesystem: true
+            capabilities:
+              drop: ["ALL"]
+          resources:
+            requests:
+              cpu: 30m
+              memory: 32Mi
+            limits:
+              cpu: 200m
+              memory: 128Mi
+          volumeMounts:
+            - name: proc
+              mountPath: /host/proc
+              readOnly: true
+            - name: sys
+              mountPath: /host/sys
+              readOnly: true
+            - name: root
+              mountPath: /host/root
+              mountPropagation: HostToContainer
+              readOnly: true
+      volumes:
+        - name: proc
+          hostPath:
+            path: /proc
+        - name: sys
+          hostPath:
+            path: /sys
+        - name: root
+          hostPath:
+            path: /
+---
+# default-deny-all blocks ingress; allow vmagent to scrape :9100.
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: allow-ingress-to-node-exporter
+  namespace: honeydue
+spec:
+  podSelector:
+    matchLabels:
+      app.kubernetes.io/name: node-exporter
+  policyTypes:
+    - Ingress
+  ingress:
+    - from:
+        - podSelector:
+            matchLabels:
+              app.kubernetes.io/name: vmagent
+      ports:
+        - port: 9100
+          protocol: TCP
+---
+# vmagent's existing egress policy only opens :8000/:8080 to the pod CIDR.
+# Additive policy (NetworkPolicies are OR'd) opening :9100 for the node-exporter
+# scrape — leaves the working allow-egress-from-vmagent policy untouched.
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: allow-egress-from-vmagent-to-node-exporter
+  namespace: honeydue
+spec:
+  podSelector:
+    matchLabels:
+      app.kubernetes.io/name: vmagent
+  policyTypes:
+    - Egress
+  egress:
+    - to:
+        - ipBlock:
+            cidr: 10.42.0.0/16
+      ports:
+        - port: 9100
+          protocol: TCP
@@ -0,0 +1,289 @@
+# vmagent — scrapes Prometheus /metrics from in-cluster services and
+# remote-writes them to https://obs.88oakapps.com/api/v1/write
+# (VictoriaMetrics on 88oakappsUpdate, fronted by Cloudflare + nginx
+# bearer-token auth). Single replica is fine — vmagent buffers locally
+# during transient remote outages.
+
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: vmagent-config
+  namespace: honeydue
+  labels:
+    app.kubernetes.io/name: vmagent
+    app.kubernetes.io/part-of: honeydue
+data:
+  scrape.yaml: |
+    global:
+      scrape_interval: 15s
+      external_labels:
+        cluster: honeydue-k3s
+        environment: prod
+
+    scrape_configs:
+      # honeyDue Go API — exposes /metrics on :8000
+      - job_name: api
+        kubernetes_sd_configs:
+          - role: pod
+            namespaces:
+              names: [honeydue]
+        relabel_configs:
+          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
+            action: keep
+            regex: api
+          - source_labels: [__meta_kubernetes_pod_container_port_number]
+            action: keep
+            regex: "8000"
+          - source_labels: [__meta_kubernetes_pod_name]
+            target_label: pod
+          - source_labels: [__meta_kubernetes_pod_node_name]
+            target_label: node
+          - target_label: service
+            replacement: api
+
+      # kube-state-metrics — cluster object state (kube_pod_*, kube_deployment_*,
+      # etc.) needed for Grafana panels that count pods/replicas/etc.
+      - job_name: kube-state-metrics
+        kubernetes_sd_configs:
+          - role: endpoints
+            namespaces:
+              names: [kube-system]
+        relabel_configs:
+          - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name]
+            action: keep
+            regex: kube-state-metrics
+          - source_labels: [__meta_kubernetes_endpoint_port_name]
+            action: keep
+            regex: http-metrics
+
+      # node-exporter — per-node host metrics (node_filesystem_*, node_memory_*,
+      # node_load*). Pod-networked DaemonSet scraped on :9100 over the pod CIDR.
+      - job_name: node-exporter
+        kubernetes_sd_configs:
+          - role: pod
+            namespaces:
+              names: [honeydue]
+        relabel_configs:
+          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
+            action: keep
+            regex: node-exporter
+          - source_labels: [__meta_kubernetes_pod_container_port_number]
+            action: keep
+            regex: "9100"
+          - source_labels: [__meta_kubernetes_pod_name]
+            target_label: pod
+          - source_labels: [__meta_kubernetes_pod_node_name]
+            target_label: node
+          - target_label: service
+            replacement: node-exporter
+
+      # honeyDue worker — exposes /metrics on :6060 (apns/fcm/asynq/cache series).
+      - job_name: worker
+        kubernetes_sd_configs:
+          - role: pod
+            namespaces:
+              names: [honeydue]
+        relabel_configs:
+          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
+            action: keep
+            regex: worker
+          - source_labels: [__meta_kubernetes_pod_container_port_number]
+            action: keep
+            regex: "6060"
+          - source_labels: [__meta_kubernetes_pod_name]
+            target_label: pod
+          - source_labels: [__meta_kubernetes_pod_node_name]
+            target_label: node
+          - target_label: service
+            replacement: worker
+
+---
+apiVersion: v1
+kind: Secret
+metadata:
+  name: vmagent-remote-write
+  namespace: honeydue
+  labels:
+    app.kubernetes.io/name: vmagent
+    app.kubernetes.io/part-of: honeydue
+type: Opaque
+stringData:
+  # Bearer token for obs.88oakapps.com. Provisioned at deploy time from
+  # deploy/prod.env (OBS_INGEST_TOKEN). The cluster-side token must match
+  # the token in /etc/honeydue-obs/ingest_token on 88oakappsUpdate.
+  bearer_token: TOKEN_PLACEHOLDER
+
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: vmagent
+  namespace: honeydue
+rules:
+  - apiGroups: [""]
+    resources: [pods, services, endpoints]
+    verbs: [get, list, watch]
+
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: vmagent
+  namespace: honeydue
+
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: vmagent
+  namespace: honeydue
+subjects:
+  - kind: ServiceAccount
+    name: vmagent
+    namespace: honeydue
+roleRef:
+  kind: Role
+  name: vmagent
+  apiGroup: rbac.authorization.k8s.io
+
+---
+# Allow vmagent to discover the kube-state-metrics Service/Endpoints in
+# kube-system so the kube-state-metrics scrape job can find its target.
+# Cross-namespace SD needs an explicit RoleBinding here.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: vmagent-kube-system
+  namespace: kube-system
+rules:
+  - apiGroups: [""]
+    resources: [services, endpoints, pods]
+    verbs: [get, list, watch]
+
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: vmagent-kube-system
+  namespace: kube-system
+subjects:
+  - kind: ServiceAccount
+    name: vmagent
+    namespace: honeydue
+roleRef:
+  kind: Role
+  name: vmagent-kube-system
+  apiGroup: rbac.authorization.k8s.io
+
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vmagent
+  namespace: honeydue
+  labels:
+    app.kubernetes.io/name: vmagent
+    app.kubernetes.io/part-of: honeydue
+spec:
+  replicas: 1
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: vmagent
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: vmagent
+        app.kubernetes.io/part-of: honeydue
+    spec:
+      serviceAccountName: vmagent
+      securityContext:
+        runAsNonRoot: true
+        runAsUser: 1000
+        fsGroup: 1000
+        seccompProfile:
+          type: RuntimeDefault
+      containers:
+        - name: vmagent
+          # Pinned by digest (audit K3S-F14).
+          image: victoriametrics/vmagent:v1.106.1@sha256:90208a667c0baf65f7536b92a84c40b6e35ffe8e88bda7e4447b97b06c6ba6b8
+          imagePullPolicy: IfNotPresent  # audit CODE-L4 — explicit
+          # Container-level hardening (audit F7) — matches the other 5
+          # workloads. vmagent only writes to the /tmp/vmagent emptyDir
+          # (its remoteWrite buffer), so a read-only root filesystem holds.
+          securityContext:
+            allowPrivilegeEscalation: false
+            readOnlyRootFilesystem: true
+            capabilities:
+              drop: ["ALL"]
+          args:
+            - "-promscrape.config=/etc/vmagent/scrape.yaml"
+            - "-remoteWrite.url=https://obs.88oakapps.com/api/v1/write"
+            - "-remoteWrite.bearerTokenFile=/etc/vmagent-secrets/bearer_token"
+            - "-remoteWrite.tmpDataPath=/tmp/vmagent"
+            - "-remoteWrite.maxDiskUsagePerURL=512MB"
+            - "-loggerLevel=INFO"
+          ports:
+            - containerPort: 8429
+              name: http
+          resources:
+            requests:
+              cpu: 25m
+              memory: 64Mi
+            limits:
+              cpu: 200m
+              memory: 256Mi
+          volumeMounts:
+            - name: config
+              mountPath: /etc/vmagent
+              readOnly: true
+            - name: secrets
+              mountPath: /etc/vmagent-secrets
+              readOnly: true
+            - name: buffer
+              mountPath: /tmp/vmagent
+          # Process startup gate. /-/healthy returns 200 once vmagent has
+          # parsed config — gives the agent up to 2 min to come up before
+          # liveness starts evaluating.
+          startupProbe:
+            httpGet:
+              path: /-/healthy
+              port: http
+            initialDelaySeconds: 5
+            periodSeconds: 5
+            failureThreshold: 24
+          # Real liveness check: are scrapes actually succeeding?
+          # /-/healthy was the old probe and returned 200 for 17 days even
+          # while vmagent had zero healthy targets (stale k8s SD watch).
+          # This exec probe queries vmagent's own targets API and fails if
+          # NO target is in state "up". Five consecutive failures (~10 min at
+          # periodSeconds: 120) → kubelet kills the pod → fresh SD watch.
+          livenessProbe:
+            exec:
+              command:
+                - sh
+                - -c
+                - 'n=$(wget -qO- -T 4 http://localhost:8429/api/v1/targets 2>/dev/null | grep -c ''"health":"up"''); [ "$n" -gt 0 ]'
+            initialDelaySeconds: 180
+            periodSeconds: 120
+            timeoutSeconds: 5
+            failureThreshold: 5
+          readinessProbe:
+            httpGet:
+              path: /-/healthy
+              port: http
+            initialDelaySeconds: 5
+            periodSeconds: 10
+      volumes:
+        - name: config
+          configMap:
+            name: vmagent-config
+        - name: secrets
+          secret:
+            secretName: vmagent-remote-write
+            defaultMode: 0400
+        - name: buffer
+          emptyDir:
+            sizeLimit: 512Mi
@@ -20,6 +20,9 @@ spec:
        app.kubernetes.io/part-of: honeydue
    spec:
      serviceAccountName: redis
+      # Explicit pod-level opt-out (audit F11) — defense-in-depth on top of
+      # the ServiceAccount-level setting in rbac.yaml.
+      automountServiceAccountToken: false
      nodeSelector:
        honeydue/redis: "true"
      securityContext:
@@ -31,12 +34,18 @@ spec:
          type: RuntimeDefault
      containers:
        - name: redis
-          image: redis:7-alpine
+          # Pinned by digest (audit K3S-F14) — redis:7-alpine is 7.4.9-alpine.
+          image: redis:7-alpine@sha256:6ab0b6e7381779332f97b8ca76193e45b0756f38d4c0dcda72dbb3c32061ab99
+          imagePullPolicy: IfNotPresent  # audit CODE-L4 — explicit
          command:
            - sh
            - -c
            - |
-              ARGS="--appendonly yes --appendfsync everysec --maxmemory 256mb --maxmemory-policy noeviction"
+              # allkeys-lru: under memory pressure, evict approximately-LRU keys.
+              # honeyDue uses Redis as a cache + asynq queue. Cache misses fall
+              # through to DB, so cache eviction is graceful. NOTE(review): this policy
+              # ignores TTLs — asynq keys are eviction candidates too; confirm headroom.
+              ARGS="--appendonly yes --appendfsync everysec --maxmemory 256mb --maxmemory-policy allkeys-lru"
              if [ -n "$REDIS_PASSWORD" ]; then
                ARGS="$ARGS --requirepass $REDIS_PASSWORD"
              fi
@@ -23,8 +23,11 @@ spec:
        app.kubernetes.io/part-of: honeydue
    spec:
      serviceAccountName: web
+      # Explicit pod-level opt-out (audit F11) — defense-in-depth on top of
+      # the ServiceAccount-level setting in rbac.yaml.
+      automountServiceAccountToken: false
      imagePullSecrets:
-        - name: ghcr-credentials
+        - name: gitea-credentials
      securityContext:
        runAsNonRoot: true
        runAsUser: 1001
@@ -43,6 +46,7 @@ spec:
      containers:
        - name: web
          image: IMAGE_PLACEHOLDER  # Replaced by 03-deploy.sh or manual sed
+          imagePullPolicy: IfNotPresent  # audit CODE-L4 — explicit; images are SHA/digest-pinned
          ports:
            - containerPort: 3000
              protocol: TCP
@@ -27,8 +27,11 @@ spec:
        app.kubernetes.io/part-of: honeydue
    spec:
      serviceAccountName: worker
+      # Explicit pod-level opt-out (audit F11) — defense-in-depth on top of
+      # the ServiceAccount-level setting in rbac.yaml.
+      automountServiceAccountToken: false
      imagePullSecrets:
-        - name: ghcr-credentials
+        - name: gitea-credentials
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
@@ -39,6 +42,12 @@ spec:
      containers:
        - name: worker
          image: IMAGE_PLACEHOLDER  # Replaced by 03-deploy.sh
+          imagePullPolicy: IfNotPresent  # audit CODE-L4 — explicit; images are SHA/digest-pinned
+          ports:
+            # health + Prometheus /metrics (in-cluster only; scraped by vmagent)
+            - name: metrics
+              containerPort: 6060
+              protocol: TCP
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
@@ -47,34 +56,16 @@ spec:
          envFrom:
            - configMapRef:
                name: honeydue-config
-          env:
-            - name: POSTGRES_PASSWORD
-              valueFrom:
-                secretKeyRef:
-                  name: honeydue-secrets
-                  key: POSTGRES_PASSWORD
-            - name: SECRET_KEY
-              valueFrom:
-                secretKeyRef:
-                  name: honeydue-secrets
-                  key: SECRET_KEY
-            - name: EMAIL_HOST_PASSWORD
-              valueFrom:
-                secretKeyRef:
-                  name: honeydue-secrets
-                  key: EMAIL_HOST_PASSWORD
-            - name: FCM_SERVER_KEY
-              valueFrom:
-                secretKeyRef:
-                  name: honeydue-secrets
-                  key: FCM_SERVER_KEY
-            - name: REDIS_PASSWORD
-              valueFrom:
-                secretKeyRef:
-                  name: honeydue-secrets
-                  key: REDIS_PASSWORD
-                  optional: true
+          # Audit CODE-F8: secrets are NOT injected as environment variables.
+          # Env vars are readable for the life of the pod via /proc/<pid>/environ
+          # and leak into crash dumps / child processes. honeydue-secrets is
+          # mounted read-only at /etc/honeydue/secrets (mode 0400) and the Go
+          # config layer (config.loadFileSecrets) reads each key from its file.
+          # Non-secret config still arrives via the configMapRef above.
          volumeMounts:
+            - name: app-secrets
+              mountPath: /etc/honeydue/secrets
+              readOnly: true
            - name: apns-key
              mountPath: /secrets/apns
              readOnly: true
@@ -94,6 +85,12 @@ spec:
            periodSeconds: 30
            timeoutSeconds: 5
      volumes:
+        # Audit CODE-F8: the whole honeydue-secrets Secret, projected as files.
+        # defaultMode 0400 → readable only by the container's runAsUser (1000).
+        - name: app-secrets
+          secret:
+            secretName: honeydue-secrets
+            defaultMode: 0400
        - name: apns-key
          secret:
            secretName: honeydue-apns-key
@@ -103,3 +100,46 @@ spec:
        - name: tmp
          emptyDir:
            sizeLimit: 64Mi
+---
+# Allow vmagent to scrape the worker's /metrics on :6060 (default-deny-all is in
+# force; the worker otherwise receives no ingress). Additive — see node-exporter.
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: allow-ingress-to-worker-metrics
+  namespace: honeydue
+spec:
+  podSelector:
+    matchLabels:
+      app.kubernetes.io/name: worker
+  policyTypes:
+    - Ingress
+  ingress:
+    - from:
+        - podSelector:
+            matchLabels:
+              app.kubernetes.io/name: vmagent
+      ports:
+        - port: 6060
+          protocol: TCP
+---
+# vmagent's base egress policy only opens :8000/:8080 to the pod CIDR; this
+# additive policy opens :6060 for the worker scrape (leaves the base untouched).
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: allow-egress-from-vmagent-to-worker
+  namespace: honeydue
+spec:
+  podSelector:
+    matchLabels:
+      app.kubernetes.io/name: vmagent
+  policyTypes:
+    - Egress
+  egress:
+    - to:
+        - ipBlock:
+            cidr: 10.42.0.0/16
+      ports:
+        - port: 6060
+          protocol: TCP
@@ -68,6 +68,43 @@ SECRET_ARGS=(
 if [[ -n "${REDIS_PASSWORD}" ]]; then
  log "  Including REDIS_PASSWORD in secrets"
  SECRET_ARGS+=(--from-literal="REDIS_PASSWORD=${REDIS_PASSWORD}")
+else
+  # Audit K3S-F1 (CRITICAL) / MEDIUM-4: refuse to deploy with an unauthenticated
+  # Redis. A previous version only warned here, which let a deploy from an
+  # unedited config.yaml silently bring Redis up with no password.
+  die "redis.password is empty in config.yaml — refusing to deploy: Redis would run with NO authentication (audit K3S-F1). Set a strong value, e.g.: openssl rand -base64 32"
+fi
+
+# B2 (Backblaze) object-storage credentials. The api/worker manifests
+# reference B2_KEY_ID / B2_APP_KEY as required secret keys, so honeydue-secrets
+# MUST carry them or those pods fail to start. Sourced from config.yaml so the
+# script and the manifests no longer drift (was a latent gap before 2026-05-16).
+B2_KEY_ID_VAL="$(cfg storage.b2_key_id 2>/dev/null || true)"
+B2_APP_KEY_VAL="$(cfg storage.b2_app_key 2>/dev/null || true)"
+if [[ -n "${B2_KEY_ID_VAL}" && -n "${B2_APP_KEY_VAL}" ]]; then
+  log "  Including B2_KEY_ID / B2_APP_KEY in secrets"
+  SECRET_ARGS+=(--from-literal="B2_KEY_ID=${B2_KEY_ID_VAL}")
+  SECRET_ARGS+=(--from-literal="B2_APP_KEY=${B2_APP_KEY_VAL}")
+else
+  warn "storage.b2_key_id / b2_app_key not set in config.yaml — B2 uploads will be disabled."
+fi
+
+# Observability ingest credentials live in deploy/prod.env (gitignored) so
+# the values aren't checked into config.yaml. Skipped silently when the
+# file or keys are absent — the api/worker manifests mark these env vars
+# optional, so the deployment still rolls without traces.
+PROD_ENV_FILE="${DEPLOY_DIR}/../deploy/prod.env"
+if [[ -f "${PROD_ENV_FILE}" ]]; then
+  OBS_TOKEN_VAL="$(grep -E '^OBS_INGEST_TOKEN=' "${PROD_ENV_FILE}" 2>/dev/null | cut -d= -f2- || true)"
+  OBS_URL_VAL="$(grep -E '^OBS_TRACES_URL=' "${PROD_ENV_FILE}" 2>/dev/null | cut -d= -f2- || true)"
+  if [[ -n "${OBS_TOKEN_VAL}" ]]; then
+    log "  Including OBS_INGEST_TOKEN in secrets"
+    SECRET_ARGS+=(--from-literal="OBS_INGEST_TOKEN=${OBS_TOKEN_VAL}")
+  fi
+  if [[ -n "${OBS_URL_VAL}" ]]; then
+    log "  Including OBS_TRACES_URL in secrets"
+    SECRET_ARGS+=(--from-literal="OBS_TRACES_URL=${OBS_URL_VAL}")
+  fi
 fi

 kubectl create secret generic honeydue-secrets \
@@ -82,22 +119,24 @@ kubectl create secret generic honeydue-apns-key \
  --from-file="apns_auth_key.p8=${SECRETS_DIR}/apns_auth_key.p8" \
  --dry-run=client -o yaml | kubectl apply -f -

-# --- Create GHCR registry credentials ---
+# --- Create container registry credentials ---
+# Secret name is gitea-credentials (audit F6): the registry is self-hosted
+# Gitea, not GHCR. Every deployment manifest references this same name.

 REGISTRY_SERVER="$(cfg registry.server)"
 REGISTRY_USER="$(cfg registry.username)"
 REGISTRY_TOKEN="$(cfg registry.token)"

 if [[ -n "${REGISTRY_SERVER}" && -n "${REGISTRY_USER}" && -n "${REGISTRY_TOKEN}" ]]; then
-  log "Creating ghcr-credentials..."
-  kubectl create secret docker-registry ghcr-credentials \
+  log "Creating gitea-credentials..."
+  kubectl create secret docker-registry gitea-credentials \
    --namespace="${NAMESPACE}" \
    --docker-server="${REGISTRY_SERVER}" \
    --docker-username="${REGISTRY_USER}" \
    --docker-password="${REGISTRY_TOKEN}" \
    --dry-run=client -o yaml | kubectl apply -f -
 else
-  warn "Registry credentials incomplete in config.yaml — skipping ghcr-credentials."
+  warn "Registry credentials incomplete in config.yaml — skipping gitea-credentials."
 fi

 # --- Create Cloudflare origin cert ---
@@ -114,7 +153,8 @@ kubectl create secret tls cloudflare-origin-cert \
 if [[ -n "${ADMIN_AUTH_USER}" && -n "${ADMIN_AUTH_PASSWORD}" ]]; then
  command -v htpasswd >/dev/null 2>&1 || die "Missing: htpasswd (install apache2-utils)"
  log "Creating admin-basic-auth secret..."
-  HTPASSWD="$(htpasswd -nb "${ADMIN_AUTH_USER}" "${ADMIN_AUTH_PASSWORD}")"
+  # -B forces bcrypt (Traefik BasicAuth supports it; avoids weak apr1-MD5).
+  HTPASSWD="$(htpasswd -nbB "${ADMIN_AUTH_USER}" "${ADMIN_AUTH_PASSWORD}")"
  kubectl create secret generic admin-basic-auth \
    --namespace="${NAMESPACE}" \
    --from-literal=users="${HTPASSWD}" \
@@ -124,6 +164,35 @@ else
  warn "Admin panel will NOT have basic auth protection."
 fi

+# --- Create Kratos secrets (Ory Kratos identity service) ---
+# Created only when config.yaml has a kratos.dsn. Until then 03-deploy.sh skips
+# the Kratos deploy entirely, so the existing stack is unaffected.
+
+KRATOS_DSN="$(cfg kratos.dsn 2>/dev/null || true)"
+if [[ -n "${KRATOS_DSN}" ]]; then
+  log "Creating kratos-secrets..."
+  KR_COOKIE="$(cfg kratos.secrets_cookie 2>/dev/null || true)"
+  KR_CIPHER="$(cfg kratos.secrets_cipher 2>/dev/null || true)"
+  KR_SMTP="$(cfg kratos.smtp_connection_uri 2>/dev/null || true)"
+  KR_GOOGLE="$(cfg kratos.google_client_secret 2>/dev/null || true)"
+  KR_APPLE="$(cfg kratos.apple_private_key 2>/dev/null || true)"
+  [[ -n "${KR_COOKIE}" && -n "${KR_CIPHER}" ]] \
+    || die "kratos.secrets_cookie / secrets_cipher must be set (generate once: openssl rand -hex 16)"
+  [[ ${#KR_CIPHER} -eq 32 ]] \
+    || die "kratos.secrets_cipher must be exactly 32 characters (openssl rand -hex 16)"
+  kubectl create secret generic kratos-secrets \
+    --namespace="${NAMESPACE}" \
+    --from-literal="dsn=${KRATOS_DSN}" \
+    --from-literal="secrets_cookie=${KR_COOKIE}" \
+    --from-literal="secrets_cipher=${KR_CIPHER}" \
+    --from-literal="smtp_connection_uri=${KR_SMTP}" \
+    --from-literal="google_client_secret=${KR_GOOGLE}" \
+    --from-literal="apple_private_key=${KR_APPLE}" \
+    --dry-run=client -o yaml | kubectl apply -f -
+else
+  warn "config.yaml has no kratos.dsn — skipping kratos-secrets (Kratos not yet configured)."
+fi
+
 # --- Done ---

 log ""
@@ -81,20 +81,24 @@ if [[ "${SKIP_BUILD}" == "false" ]]; then
  log "Logging in to ${REGISTRY_SERVER}..."
  printf '%s' "${REGISTRY_TOKEN}" | docker login "${REGISTRY_SERVER}" -u "${REGISTRY_USER}" --password-stdin >/dev/null

-  log "Building API image: ${API_IMAGE}"
-  docker build --target api -t "${API_IMAGE}" "${REPO_DIR}"
+  # k3s nodes are linux/amd64 (Hetzner CX). Force the build platform so
+  # local arm64 Macs don't push images that crash with "exec format error".
+  BUILD_PLATFORM="linux/amd64"

-  log "Building Worker image: ${WORKER_IMAGE}"
-  docker build --target worker -t "${WORKER_IMAGE}" "${REPO_DIR}"
+  log "Building API image: ${API_IMAGE} (${BUILD_PLATFORM})"
+  docker build --platform "${BUILD_PLATFORM}" --target api -t "${API_IMAGE}" "${REPO_DIR}"

-  log "Building Admin image: ${ADMIN_IMAGE} (NEXT_PUBLIC_API_URL=${ADMIN_API_URL})"
-  docker build --target admin \
+  log "Building Worker image: ${WORKER_IMAGE} (${BUILD_PLATFORM})"
+  docker build --platform "${BUILD_PLATFORM}" --target worker -t "${WORKER_IMAGE}" "${REPO_DIR}"
+
+  log "Building Admin image: ${ADMIN_IMAGE} (${BUILD_PLATFORM}, NEXT_PUBLIC_API_URL=${ADMIN_API_URL})"
+  docker build --platform "${BUILD_PLATFORM}" --target admin \
    --build-arg "NEXT_PUBLIC_API_URL=${ADMIN_API_URL}" \
    -t "${ADMIN_IMAGE}" "${REPO_DIR}"

  if [[ -n "${WEB_REPO_DIR}" && -f "${WEB_REPO_DIR}/Dockerfile" ]]; then
-    log "Building Web image: ${WEB_IMAGE} (NEXT_PUBLIC_API_URL=${WEB_API_URL})"
-    docker build \
+    log "Building Web image: ${WEB_IMAGE} (${BUILD_PLATFORM}, NEXT_PUBLIC_API_URL=${WEB_API_URL})"
+    docker build --platform "${BUILD_PLATFORM}" \
      --build-arg "NEXT_PUBLIC_API_URL=${WEB_API_URL}" \
      --build-arg "NEXT_PUBLIC_POSTHOG_KEY=${NEXT_PUBLIC_POSTHOG_KEY}" \
      --build-arg "NEXT_PUBLIC_POSTHOG_HOST=${NEXT_PUBLIC_POSTHOG_HOST}" \
@@ -124,6 +128,56 @@ else
  warn "Skipping build. Using images for tag: ${DEPLOY_TAG}"
 fi

+# --- Resolve immutable image digests (audit F5) ---
+# A short-SHA tag is mutable — anyone who can push to the registry can
+# overwrite it, and imagePullPolicy then pulls the new bits silently. We
+# deploy by @sha256: digest instead, pinning the exact image that was just
+# built and pushed. `docker push` populates RepoDigests; with --skip-build
+# (no local image) resolve_ref falls back to the tag.
+resolve_ref() {
+  local img="$1" digest
+  digest="$(docker inspect --format='{{range .RepoDigests}}{{println .}}{{end}}' "${img}" 2>/dev/null | grep -m1 '@sha256:' || true)"
+  if [[ -n "${digest}" ]]; then
+    printf '%s' "${digest}"
+    return 0
+  fi
+  warn "could not resolve a digest for ${img} — deploying by mutable tag" >&2  # stdout is the return value
+  printf '%s' "${img}"
+}
+API_REF="$(resolve_ref "${API_IMAGE}")"
+WORKER_REF="$(resolve_ref "${WORKER_IMAGE}")"
+ADMIN_REF="$(resolve_ref "${ADMIN_IMAGE}")"
+WEB_REF="$(resolve_ref "${WEB_IMAGE}")"
+log "Deploying by digest:"
+log "  API:    ${API_REF}"
+log "  Worker: ${WORKER_REF}"
+log "  Admin:  ${ADMIN_REF}"
+
+# --- Image scan + signing (audit CODE-L5) ---
+# Both steps are best-effort: the deploy does NOT fail if the tools are
+# absent, so an operator who has not set up cosign/trivy yet is not blocked.
+# Install trivy + cosign and export COSIGN_KEY to enforce. Cluster-side
+# admission verification (Kyverno/Connaisseur) is a separate operator step.
+if [[ "${SKIP_BUILD}" == "false" ]]; then
+  if command -v trivy >/dev/null 2>&1; then
+    log "Scanning images with Trivy (HIGH,CRITICAL)..."
+    # --exit-code 1: findings return non-zero so the warn fires; `||` keeps it non-fatal.
+    for img in "${API_IMAGE}" "${WORKER_IMAGE}" "${ADMIN_IMAGE}"; do
+      trivy image --severity HIGH,CRITICAL --exit-code 1 --quiet "${img}" || warn "Trivy reported findings (or failed to scan) for ${img}"
+    done
+  else
+    warn "trivy not installed — skipping image vulnerability scan (audit L5)"
+  fi
+  if command -v cosign >/dev/null 2>&1 && [[ -n "${COSIGN_KEY:-}" ]]; then
+    log "Signing images with cosign..."
+    for ref in "${API_REF}" "${WORKER_REF}" "${ADMIN_REF}"; do
+      cosign sign --yes --key "${COSIGN_KEY}" "${ref}" || warn "cosign sign failed for ${ref}"
+    done
+  else
+    warn "cosign not configured (need cosign + COSIGN_KEY) — skipping image signing (audit L5)"
+  fi
+fi
+
 # --- Generate and apply ConfigMap from config.yaml ---

 log "Generating env from config.yaml..."
@@ -142,24 +196,95 @@ kubectl create configmap honeydue-config \
 log "Applying manifests..."

 kubectl apply -f "${MANIFESTS}/namespace.yaml"
+
+# NetworkPolicies first — default-deny-all + per-app allow rules.
+# These MUST be applied; without them the cluster falls back to default-allow
+# (worse posture) AND the vmagent egress rule for :6443 (which fixes a k3s
+# post-DNAT enforcement quirk for k8s API discovery) is missing.
+# See deploy-k3s/RUNBOOK.md ("vmagent SD broken on fresh deploy").
+kubectl apply -f "${MANIFESTS}/network-policies.yaml"
+
 kubectl apply -f "${MANIFESTS}/redis/"
 kubectl apply -f "${MANIFESTS}/ingress/"

+# --- Run migrations BEFORE rolling api/worker ---
+#
+# goose-based migration Job. We delete any prior Job (Jobs are immutable —
+# applying a duplicate name otherwise fails), apply a fresh one with the new
+# api image (which includes /usr/local/bin/goose and /app/migrations), and
+# block until it succeeds. A failure aborts the deploy before any new app
+# pod sees a stale schema.
+log "Running database migrations (goose Job)..."
+kubectl delete job honeydue-migrate -n "${NAMESPACE}" --ignore-not-found --wait=true >/dev/null
+sed "s|image: IMAGE_PLACEHOLDER|image: ${API_REF}|" "${MANIFESTS}/migrate/job.yaml" | kubectl apply -f -
+if ! kubectl wait --namespace="${NAMESPACE}" --for=condition=complete --timeout=10m job/honeydue-migrate; then
+  warn "migration Job failed — see logs:"
+  kubectl logs -n "${NAMESPACE}" job/honeydue-migrate --tail=200 || true
+  die "migrations did not complete cleanly; aborting deploy"
+fi
+log "Migrations applied; proceeding with api/worker rollout"
+
 # Apply deployments with image substitution
-sed "s|image: IMAGE_PLACEHOLDER|image: ${API_IMAGE}|" "${MANIFESTS}/api/deployment.yaml" | kubectl apply -f -
+sed "s|image: IMAGE_PLACEHOLDER|image: ${API_REF}|" "${MANIFESTS}/api/deployment.yaml" | kubectl apply -f -
 kubectl apply -f "${MANIFESTS}/api/service.yaml"
 kubectl apply -f "${MANIFESTS}/api/hpa.yaml"

-sed "s|image: IMAGE_PLACEHOLDER|image: ${WORKER_IMAGE}|" "${MANIFESTS}/worker/deployment.yaml" | kubectl apply -f -
+sed "s|image: IMAGE_PLACEHOLDER|image: ${WORKER_REF}|" "${MANIFESTS}/worker/deployment.yaml" | kubectl apply -f -

-sed "s|image: IMAGE_PLACEHOLDER|image: ${ADMIN_IMAGE}|" "${MANIFESTS}/admin/deployment.yaml" | kubectl apply -f -
+sed "s|image: IMAGE_PLACEHOLDER|image: ${ADMIN_REF}|" "${MANIFESTS}/admin/deployment.yaml" | kubectl apply -f -
 kubectl apply -f "${MANIFESTS}/admin/service.yaml"

 if [[ -d "${MANIFESTS}/web" ]]; then
-  sed "s|image: IMAGE_PLACEHOLDER|image: ${WEB_IMAGE}|" "${MANIFESTS}/web/deployment.yaml" | kubectl apply -f -
+  sed "s|image: IMAGE_PLACEHOLDER|image: ${WEB_REF}|" "${MANIFESTS}/web/deployment.yaml" | kubectl apply -f -
  kubectl apply -f "${MANIFESTS}/web/service.yaml"
 fi

+# Observability — vmagent scrapes api :8000, worker :6060, node-exporter :9100
+# and kube-state-metrics :8080 (/metrics) and remote-writes everything to
+# obs.88oakapps.com. The bearer token comes from deploy/prod.env so it stays out
+# of the repo; the manifest holds TOKEN_PLACEHOLDER. kube-state-metrics provides
+# the kube_* metrics Grafana panels need to count pods, deployments, etc.
+if [[ -d "${MANIFESTS}/observability" ]]; then
+  # kube-state-metrics — no secrets, plain apply
+  kubectl apply -f "${MANIFESTS}/observability/kube-state-metrics.yaml"
+
+  # vmagent — needs the bearer-token substitution
+  # prod.env lives at the repo's deploy/ dir (sibling of deploy-k3s/), not
+  # under deploy-k3s/. It's gitignored — operator copies values there once.
+  OBS_TOKEN="$(grep -E '^OBS_INGEST_TOKEN=' "${REPO_DIR}/deploy/prod.env" 2>/dev/null | cut -d= -f2- || true)"
+  if [[ -z "${OBS_TOKEN}" ]]; then
+    warn "OBS_INGEST_TOKEN not found in deploy/prod.env — skipping vmagent + alloy-logs apply"
+  else
+    sed "s|TOKEN_PLACEHOLDER|${OBS_TOKEN}|" "${MANIFESTS}/observability/vmagent.yaml" | kubectl apply -f -
+    # alloy-logs — DaemonSet that tails honeydue pod logs and pushes them to
+    # Loki at obs.88oakapps.com. Same OBS_INGEST_TOKEN as vmagent.
+    if [[ -f "${MANIFESTS}/observability/alloy-logs.yaml" ]]; then
+      sed "s|TOKEN_PLACEHOLDER|${OBS_TOKEN}|" "${MANIFESTS}/observability/alloy-logs.yaml" | kubectl apply -f -
+    fi
+  fi
+fi
+
+# --- Ory Kratos (identity service) ---
+# Applied only when kratos-secrets exists — i.e. the operator has completed the
+# Kratos prerequisites in deploy-k3s/manifests/kratos/README.md. Otherwise
+# skipped, so the existing stack deploys unaffected.
+if kubectl -n "${NAMESPACE}" get secret kratos-secrets >/dev/null 2>&1; then
+  log "Deploying Ory Kratos..."
+  kubectl apply -f "${MANIFESTS}/kratos/configmap.yaml"
+  # The migrate Job is immutable — delete any prior run, then apply + wait.
+  kubectl delete job kratos-migrate -n "${NAMESPACE}" --ignore-not-found --wait=true >/dev/null
+  kubectl apply -f "${MANIFESTS}/kratos/migrate-job.yaml"
+  if ! kubectl wait --namespace="${NAMESPACE}" --for=condition=complete --timeout=5m job/kratos-migrate; then
+    warn "Kratos migration Job failed — logs:"
+    kubectl logs -n "${NAMESPACE}" job/kratos-migrate --tail=100 || true
+    die "aborting: Kratos schema migration failed"
+  fi
+  kubectl apply -f "${MANIFESTS}/kratos/kratos.yaml"
+  kubectl apply -f "${MANIFESTS}/kratos/ingress.yaml"
+else
+  log "kratos-secrets not present — skipping Kratos deploy (see manifests/kratos/README.md)."
+fi
+
 # --- Wait for rollouts ---

 log "Waiting for rollouts..."
@@ -171,6 +296,15 @@ kubectl rollout status deployment/admin -n "${NAMESPACE}" --timeout=300s
 if [[ -d "${MANIFESTS}/web" ]]; then
  kubectl rollout status deployment/web -n "${NAMESPACE}" --timeout=300s
 fi
+if kubectl -n "${NAMESPACE}" get deployment vmagent >/dev/null 2>&1; then
+  kubectl rollout status deployment/vmagent -n "${NAMESPACE}" --timeout=120s
+fi
+if kubectl -n "${NAMESPACE}" get daemonset alloy-logs >/dev/null 2>&1; then
+  kubectl rollout status daemonset/alloy-logs -n "${NAMESPACE}" --timeout=120s
+fi
+if kubectl -n "${NAMESPACE}" get deployment kratos >/dev/null 2>&1; then
+  kubectl rollout status deployment/kratos -n "${NAMESPACE}" --timeout=180s
+fi

 # --- Done ---

@@ -100,7 +100,7 @@ lines = [
    # API
    'DEBUG=false',
    f\"ALLOWED_HOSTS={d['api']},{d['base']}\",
-    f\"CORS_ALLOWED_ORIGINS=https://{d['base']},https://{d['admin']}\",
+    f\"CORS_ALLOWED_ORIGINS=https://{d['base']},https://{d['admin']},https://{d.get('app', 'app.' + d['base'])}\",
    'TIMEZONE=UTC',
    f\"BASE_URL=https://{d['base']}\",
    'PORT=8000',
@@ -118,8 +118,15 @@ lines = [
    f\"DB_MAX_OPEN_CONNS={db['max_open_conns']}\",
    f\"DB_MAX_IDLE_CONNS={db['max_idle_conns']}\",
    f\"DB_MAX_LIFETIME={db['max_lifetime']}\",
-    # Redis (K8s internal DNS — password injected if configured)
-    f\"REDIS_URL=redis://{':%s@' % val(rd.get('password')) if rd.get('password') else ''}redis.honeydue.svc.cluster.local:6379/0\",
+    f\"DB_MAX_IDLE_TIME={db.get('max_idle_time', '0s')}\",
+    # Redis — in-namespace DNS short form (works because pod /etc/resolv.conf
+    # searches honeydue.svc.cluster.local). Audit HIGH-1: the password is
+    # intentionally NOT embedded here. This URL is emitted into the
+    # honeydue-config ConfigMap, which is NOT encrypted at rest and is
+    # readable by anyone with `get configmap`. The Redis password travels
+    # only in honeydue-secrets as REDIS_PASSWORD (file-mounted, F8); the API
+    # applies it in cache_service.go, and the worker applies it to its Asynq options.
+    'REDIS_URL=redis://redis:6379/0',
    'REDIS_DB=0',
    # Email
    f\"EMAIL_HOST={em['host']}\",
@@ -139,12 +146,21 @@ lines = [
    f\"OVERDUE_REMINDER_HOUR={wk['overdue_reminder_hour']}\",
    f\"DAILY_DIGEST_HOUR={wk['daily_digest_hour']}\",
    # B2 Storage
-    f\"B2_KEY_ID={val(st['b2_key_id'])}\",
-    f\"B2_APP_KEY={val(st['b2_app_key'])}\",
+    # B2_KEY_ID and B2_APP_KEY are intentionally NOT emitted into the
+    # ConfigMap — they're credentials and belong in honeydue-secrets
+    # (set by 02-setup-secrets.sh). Wire them into the api/worker
+    # deployments via envFrom: secretRef when B2 uploads need to be
+    # active. Leaving them in cleartext here would leak via
+    # \"kubectl get cm\".
    f\"B2_BUCKET_NAME={val(st['b2_bucket'])}\",
    f\"B2_ENDPOINT={val(st['b2_endpoint'])}\",
+    f\"B2_REGION={val(st.get('b2_region'))}\",
+    f\"B2_USE_SSL={b(st.get('b2_use_ssl', True))}\",
    f\"STORAGE_MAX_FILE_SIZE={st['max_file_size']}\",
    f\"STORAGE_ALLOWED_TYPES={st['allowed_types']}\",
+    f\"STORAGE_UPLOAD_DIR={val(st.get('upload_dir', '/app/uploads'))}\",
+    f\"STORAGE_BASE_URL={val(st.get('base_url', '/uploads'))}\",
+    f\"STATIC_DIR={val(st.get('static_dir', '/app/static'))}\",
    # Features
    f\"FEATURE_PUSH_ENABLED={b(ft['push_enabled'])}\",
    f\"FEATURE_EMAIL_ENABLED={b(ft['email_enabled'])}\",
@@ -207,8 +223,18 @@ config = {
        'image': 'ubuntu-24.04',
    },
    'additional_packages': ['open-iscsi'],
-    'post_create_commands': ['sudo systemctl enable --now iscsid'],
-    'k3s_config_file': 'secrets-encryption: true\n',
+    # Audit K3S-CG2: harden the node OS at provision time — fail2ban for SSH
+    # brute-force, unattended-upgrades for automatic security patches.
+    'post_create_commands': [
+        'sudo systemctl enable --now iscsid',
+        'sudo apt-get update -qq',
+        'sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq fail2ban unattended-upgrades',
+        'sudo systemctl enable --now fail2ban',
+        'sudo dpkg-reconfigure -f noninteractive -plow unattended-upgrades',
+    ],
+    # Audit K3S-CG1 / K3S-F4: encrypt Secrets at rest in etcd, and write the
+    # node kubeconfig as mode 0600 (not world-readable).
+    'k3s_config_file': 'secrets-encryption: true\nwrite-kubeconfig-mode: \"0600\"\n',
 }

 print(yaml.dump(config, default_flow_style=False, sort_keys=False))
@@ -0,0 +1,39 @@
+{
+  "$id": "https://honeydue.app/identity.schema.json",
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "honeyDue user",
+  "type": "object",
+  "properties": {
+    "traits": {
+      "type": "object",
+      "properties": {
+        "email": {
+          "type": "string",
+          "format": "email",
+          "title": "Email",
+          "minLength": 3,
+          "maxLength": 320,
+          "ory.sh/kratos": {
+            "credentials": {
+              "password": { "identifier": true },
+              "code": { "identifier": true, "via": "email" },
+              "totp": { "account_name": true }
+            },
+            "verification": { "via": "email" },
+            "recovery": { "via": "email" }
+          }
+        },
+        "name": {
+          "type": "object",
+          "title": "Name",
+          "properties": {
+            "first": { "type": "string", "title": "First name", "maxLength": 100 },
+            "last": { "type": "string", "title": "Last name", "maxLength": 100 }
+          }
+        }
+      },
+      "required": ["email"],
+      "additionalProperties": false
+    }
+  }
+}
@@ -0,0 +1,101 @@
+version: v1.3.0
+
+serve:
+  public:
+    base_url: http://localhost:4433/
+    cors:
+      enabled: true
+      allowed_origins:
+        - http://localhost
+        - http://localhost:3000
+        - http://localhost:8000
+        - http://127.0.0.1
+      allowed_methods: [GET, POST, PUT, PATCH, DELETE, OPTIONS]
+      allowed_headers: [Authorization, Content-Type, X-Session-Token, Cookie]
+      exposed_headers: [Content-Type, Set-Cookie]
+      allow_credentials: true
+  admin:
+    base_url: http://kratos:4434/
+
+selfservice:
+  default_browser_return_url: http://localhost:8000/
+  allowed_return_urls:
+    - http://localhost:8000
+    - honeydue://callback
+
+  methods:
+    password:
+      enabled: true
+      config:
+        min_password_length: 8
+        identifier_similarity_check_enabled: false
+    code:
+      enabled: true
+    oidc:
+      enabled: false
+
+  flows:
+    error:
+      ui_url: http://localhost:8000/auth/error
+    login:
+      ui_url: http://localhost:8000/auth/login
+      lifespan: 10m
+    registration:
+      ui_url: http://localhost:8000/auth/registration
+      lifespan: 10m
+      after:
+        password:
+          hooks:
+            - hook: session
+    verification:
+      enabled: true
+      ui_url: http://localhost:8000/auth/verification
+      use: code
+      after:
+        default_browser_return_url: http://localhost:8000/
+    recovery:
+      enabled: true
+      ui_url: http://localhost:8000/auth/recovery
+      use: code
+    settings:
+      ui_url: http://localhost:8000/auth/settings
+      privileged_session_max_age: 15m
+    logout:
+      after:
+        default_browser_return_url: http://localhost:8000/
+
+log:
+  level: debug
+  format: text
+  leak_sensitive_values: true
+
+secrets:
+  cookie:
+    - local-dev-cookie-secret-please-change-this-32chars
+  cipher:
+    - 0123456789abcdef0123456789abcdef
+
+ciphers:
+  algorithm: xchacha20-poly1305
+
+hashers:
+  algorithm: bcrypt
+  bcrypt:
+    cost: 8
+
+identity:
+  default_schema_id: honeydue
+  schemas:
+    - id: honeydue
+      url: file:///etc/config/kratos/identity.schema.json
+
+courier:
+  smtp:
+    connection_uri: smtp://mailpit:1025/?disable_starttls=true
+    from_address: noreply@localhost
+    from_name: honeyDue Local
+
+session:
+  lifespan: 720h
+  cookie:
+    same_site: Lax
@@ -35,7 +35,7 @@ DEFAULT_FROM_EMAIL=honeyDue <noreply@honeyDue.treytartt.com>
 # APNS private key goes in deploy/secrets/apns_auth_key.p8
 APNS_AUTH_KEY_ID=CHANGEME_APNS_KEY_ID
 APNS_TEAM_ID=CHANGEME_APNS_TEAM_ID
-APNS_TOPIC=com.tt.honeyDue
+APNS_TOPIC=com.myhoneydue.honeyDue
 APNS_USE_SANDBOX=false
 APNS_PRODUCTION=true

@@ -80,7 +80,11 @@ FEATURE_PDF_REPORTS_ENABLED=true
 FEATURE_WORKER_ENABLED=true

 # Optional auth/iap values
-APPLE_CLIENT_ID=
+# APPLE_CLIENT_ID must equal the iOS Release bundle ID. The Apple
+# identity-token `aud` claim is verified against this value
+# (internal/services/apple_auth.go::verifyAudience). Leaving it empty
+# with DEBUG=false rejects every Apple token as invalid audience.
+APPLE_CLIENT_ID=com.myhoneydue.honeyDue
 APPLE_TEAM_ID=
 GOOGLE_CLIENT_ID=
 GOOGLE_ANDROID_CLIENT_ID=
@@ -1,6 +1,31 @@
 #!/usr/bin/env bash
 set -euo pipefail

+# DEPRECATED — production migrated from Docker Swarm to k3s on 2026-04-24.
+# This script targets the old Swarm manager + registry flow and will fail
+# at the SSH/Swarm validation step because hetzner1 no longer runs dockerd.
+#
+# Use the k3s deploy stack instead:
+#
+#   export KUBECONFIG="$(pwd)/deploy-k3s/kubeconfig"
+#   ./deploy-k3s/scripts/03-deploy.sh
+#
+# If you don't have deploy-k3s/kubeconfig locally, fetch it once:
+#   ssh -i ~/.ssh/hetzner deploy@hetzner1 'sudo cat /etc/rancher/k3s/k3s.yaml' \
+#     | sed 's|server: https://127.0.0.1:6443|server: https://178.104.247.152:6443|' \
+#     > deploy-k3s/kubeconfig
+#   chmod 600 deploy-k3s/kubeconfig
+#
+# To override and run anyway (do NOT do this casually), set:
+#   ALLOW_LEGACY_SWARM_DEPLOY=1 ./deploy/scripts/deploy_prod.sh
+if [[ "${ALLOW_LEGACY_SWARM_DEPLOY:-0}" != "1" ]]; then
+  printf '[deploy][error] %s\n' \
+    "deploy_prod.sh is the legacy Docker Swarm flow. Production now runs on k3s." \
+    "Use ./deploy-k3s/scripts/03-deploy.sh instead (see top of this script for setup)." \
+    "If you really need the old Swarm path, set ALLOW_LEGACY_SWARM_DEPLOY=1." >&2
+  exit 1
+fi
+
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 DEPLOY_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
 REPO_DIR="$(cd "${DEPLOY_DIR}/.." && pwd)"
@@ -14,6 +14,7 @@ services:
      POSTGRES_DB: ${POSTGRES_DB:-honeydue}
    volumes:
      - postgres_data:/var/lib/postgresql/data
+      - ./deploy/local/postgres-init:/docker-entrypoint-initdb.d:ro
    ports:
      - "${DB_PORT:-5433}:5432"  # 5433 externally to avoid conflicts with local postgres
    healthcheck:
@@ -85,12 +86,16 @@ services:
      APNS_AUTH_KEY_PATH: ${APNS_AUTH_KEY_PATH}
      APNS_AUTH_KEY_ID: ${APNS_AUTH_KEY_ID}
      APNS_TEAM_ID: ${APNS_TEAM_ID}
-      APNS_TOPIC: ${APNS_TOPIC:-com.tt.honeyDue}
+      APNS_TOPIC: ${APNS_TOPIC:-com.myhoneydue.honeyDue.dev}
      APNS_USE_SANDBOX: "true"
      FCM_SERVER_KEY: ${FCM_SERVER_KEY}

      # Storage encryption
      STORAGE_ENCRYPTION_KEY: ${STORAGE_ENCRYPTION_KEY}
+
+      # Kratos (identity service)
+      KRATOS_PUBLIC_URL: "http://kratos:4433"
+      KRATOS_ADMIN_URL: "http://kratos:4434"
    volumes:
      - ./push_certs:/certs:ro
      - ./uploads:/app/uploads
@@ -99,6 +104,8 @@ services:
        condition: service_healthy
      redis:
        condition: service_healthy
+      kratos:
+        condition: service_healthy
    healthcheck:
      test: ["CMD", "curl", "-f", "http://127.0.0.1:8000/api/health/"]
      interval: 30s
@@ -158,7 +165,7 @@ services:
      APNS_AUTH_KEY_PATH: "/certs/apns_key.p8"
      APNS_AUTH_KEY_ID: ${APNS_AUTH_KEY_ID}
      APNS_TEAM_ID: ${APNS_TEAM_ID}
-      APNS_TOPIC: ${APNS_TOPIC:-com.tt.honeyDue}
+      APNS_TOPIC: ${APNS_TOPIC:-com.myhoneydue.honeyDue.dev}
      APNS_USE_SANDBOX: "true"
      FCM_SERVER_KEY: ${FCM_SERVER_KEY}

@@ -184,6 +191,59 @@ services:
    networks:
      - honeydue-network

+  # Mailpit — local SMTP catcher (for Kratos email codes during onboarding)
+  mailpit:
+    image: axllent/mailpit:latest
+    container_name: honeydue-mailpit
+    restart: unless-stopped
+    ports:
+      - "${MAILPIT_SMTP_PORT:-1025}:1025"
+      - "${MAILPIT_HTTP_PORT:-8025}:8025"
+    networks:
+      - honeydue-network
+
+  # Kratos schema migration (one-shot, runs before kratos starts)
+  kratos-migrate:
+    image: oryd/kratos:v1.3.0
+    container_name: honeydue-kratos-migrate
+    command: ["migrate", "sql", "-e", "--yes"]
+    environment:
+      DSN: "postgres://${POSTGRES_USER:-honeydue}:${POSTGRES_PASSWORD:-honeydue_dev_password}@db:5432/kratos?sslmode=disable"
+    depends_on:
+      db:
+        condition: service_healthy
+    networks:
+      - honeydue-network
+    restart: "no"
+
+  # Ory Kratos — identity service
+  kratos:
+    image: oryd/kratos:v1.3.0
+    container_name: honeydue-kratos
+    restart: unless-stopped
+    command: ["serve", "--config", "/etc/config/kratos/kratos.yml", "--watch-courier", "--dev"]
+    ports:
+      - "${KRATOS_PUBLIC_PORT:-4433}:4433"
+      - "${KRATOS_ADMIN_PORT:-4434}:4434"
+    environment:
+      DSN: "postgres://${POSTGRES_USER:-honeydue}:${POSTGRES_PASSWORD:-honeydue_dev_password}@db:5432/kratos?sslmode=disable"
+      LOG_LEVEL: "debug"
+    volumes:
+      - ./deploy/local/kratos:/etc/config/kratos:ro
+    depends_on:
+      kratos-migrate:
+        condition: service_completed_successfully
+      mailpit:
+        condition: service_started
+    healthcheck:
+      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://127.0.0.1:4434/health/ready"]
+      interval: 10s
+      timeout: 5s
+      retries: 10
+      start_period: 10s
+    networks:
+      - honeydue-network
+
  # Dozzle — lightweight real-time log viewer
  dozzle:
    image: amir20/dozzle:latest
@@ -194,10 +194,17 @@ See [Chapter 8](./08-database.md), [9](./09-storage.md), and
  until we have Apple Developer / Google Play accounts. The env vars are
  set to sentinel values that let the Go app boot; `FEATURE_PUSH_ENABLED=false`
  gates all call sites.
- **External metrics/monitoring (Prometheus, Grafana, Betterstack).**
-  Right now we rely on `kubectl logs`, `kubectl top`, and Cloudflare's own
-  analytics. See [Chapter 15](./15-observability.md) for what's there and
-  what we'd add.
+- **In-cluster Prometheus / Grafana.** Self-hosted Prometheus-compatible
+  metrics + tracing + dashboards live **outside** the k3s cluster on
+  `88oakappsUpdate` (the same Linode VPS that hosts PostHog), reached
+  via `https://obs.88oakapps.com` (Cloudflare-fronted, bearer-gated).
+  A `vmagent` Deployment in the honeydue namespace scrapes the api Pods
+  and remote-writes out. This frees ~700 MB of cluster RAM and means
+  observability survives a k3s control-plane incident. See
+  [Chapter 15](./15-observability.md).
+- **Alerting.** No PagerDuty, Slack hooks, or pages-on-error wired up
+  yet. Histograms are flowing into Grafana — alert rules on top of them
+  is the next add. See [Chapter 15 — Future](./15-observability.md).
 - **Automated backups of Redis state.** Redis is configured with AOF
  (append-only file) persistence, but the PVC is only on one node. Redis
  holds only cache + Asynq queue state; losing it re-populates on first
@@ -8,6 +8,13 @@ long-haul components, and dedicated service accounts with dropped
 capabilities inside containers. This chapter documents each layer, the
 rationale, and what's currently missing (and why).

+> **Updated 2026-05-15 — security remediation.** The 2026-05 audits
+> (`live_scan_5_12.md`, `k3_audit_5_12.md`, `security_scan_5_12.md`) drove a
+> full remediation pass. **`deploy-k3s/SECURITY.md` is the authoritative,
+> per-finding current-state record.** This chapter is corrected for the
+> major items below; where any other detail conflicts with `SECURITY.md`,
+> `SECURITY.md` wins.
+
 ## Threat model

 Who we're defending against, in rough order of likelihood:
@@ -54,8 +61,8 @@ Cloudflare sits in front of every public request.
 - **Authorize requests** — that's the app's job
 - **Protect origin if origin IP leaks** — once someone knows a node IP
  they can bypass CF. Mitigation: keep origin firewall strict (Chapter 4).
- **Encrypt between CF and origin** — we're on SSL=Flexible, so CF↔origin
-  is HTTP. This is in our TODO (Chapter 20, upgrade to Full-strict).
+- **~~Encrypt between CF and origin~~** — done (2026-04-24): SSL mode is
+  Full (strict); CF↔origin is TLS with a Cloudflare Origin CA cert.

 ### The proxy-IP problem

@@ -75,8 +82,8 @@ This means a malicious request that bypasses CF (by hitting the node IP
 directly) can't spoof headers — Traefik ignores `X-Forwarded-*` unless
 the source IP is in CF's ranges.

-**TODO** (Chapter 20): Enforce at UFW level — allow 80/tcp only from
-CF IP ranges. Today any IP can reach the origin on port 80.
+**Done (2026-04-24):** the node UFW allowlist permits `:443` only from
+Cloudflare's IP ranges; the `Anywhere` rules on `:80`/`:443` were removed.

 ## Layer 2 — Node (OS, SSH, firewall)

@@ -297,15 +304,13 @@ The `deploy-k3s/manifests/network-policies.yaml` scaffold defines:
  reach api pods on port 8000
 - **allow-ingress-to-admin** — same, for admin:3000

-**These are not currently applied.** Without them, our pods can freely
-talk to anything — including, theoretically, malicious destinations if
-an attacker gets RCE inside a pod.
+**Applied.** `03-deploy.sh` applies
+`deploy-k3s/manifests/network-policies.yaml` on every deploy — default-deny
+plus the explicit per-app allows listed above. Traefik runs `hostNetwork`, so its
+traffic is matched by node-IP `ipBlock`s plus the pod CIDR `10.42.0.0/16`,
+not a `namespaceSelector`.

-**TODO** (Chapter 20): Apply network policies. The scaffold is there; we
-just need to `kubectl apply -f deploy-k3s/manifests/network-policies.yaml`
-and test that nothing breaks.
-
-### What network policies would prevent
+### What network policies prevent

 | Attack scenario | NetworkPolicy blocks |
 |---|---|
@@ -324,13 +329,10 @@ renewed Let's Encrypt or CF-managed cert for `*.myhoneydue.com`.

 ### CF ↔ origin

-**Plaintext HTTP** (SSL = Flexible). An attacker with access to the
-Cloudflare-to-Hetzner path could read traffic. In practice nobody who
-isn't Cloudflare or Hetzner sits on that path.
-
-**TODO** (Chapter 20): Upgrade to SSL = Full (strict) with a Cloudflare
-Origin CA certificate. This encrypts CF ↔ origin and verifies that
-origin's cert is the CF-issued one (prevents MitM if DNS is compromised).
+**TLS — SSL = Full (strict)** (since 2026-04-24). A Cloudflare Origin CA
+certificate (`cloudflare-origin-cert` secret) is installed on all three
+ingresses; Cloudflare validates it. Both user↔CF and CF↔origin are
+encrypted, and a DNS-hijack MitM is defeated by the origin-cert check.

 ### API ↔ Neon Postgres

@@ -454,11 +456,14 @@ Mitigations:
 - Gitea itself is behind login; PAT is scoped to read:packages +
  write:packages only
 - Gitea runs on the operator's infrastructure (same operator account)
- Image tags are SHA-pinned (`:237c6b8`) not `:latest` → attacker can't
-  replace an existing tag's image without us noticing the digest change
+- Workloads deploy by immutable `@sha256:` digest, not by mutable tag
+  (`03-deploy.sh` resolves the digest after push; the redis/vmagent/node
+  base images are digest-pinned too) — a swapped tag cannot reach the
+  cluster.

-**TODO** (Chapter 20): Add cosign signing at build time, verify at pull
-time.
+**TODO**: cosign signing is wired into `03-deploy.sh` (guarded — runs when
+`cosign` + `COSIGN_KEY` are present); cluster-side admission verification
+(Kyverno/Connaisseur) is still pending. See `deploy-k3s/SECURITY.md` → L5.

 ## Operator workstation security

@@ -1,5 +1,13 @@
 # 06 — Traefik Ingress

+> **Updated 2026-05-15 (security remediation):** the Traefik middleware set
+> changed — `cloudflare-only` + `admin-auth` are now attached to the admin
+> ingress, a strict `auth-rate-limit` middleware fronts the auth endpoints
+> (via a dedicated `honeydue-api-auth` Ingress), and `security-headers`
+> gained COOP/CORP + a 2-year preload HSTS and dropped the deprecated
+> `X-XSS-Protection`. `deploy-k3s/SECURITY.md` is the authoritative
+> current-state record.
+
 ## Summary

 Traefik is the reverse proxy that routes external HTTP requests to the
@@ -1,5 +1,11 @@
 # 07 — Services

+> **Updated 2026-05-15 (security remediation):** Redis now requires a
+> password (`config.yaml` `redis.password` → `honeydue-secrets`), all
+> workloads deploy by immutable `@sha256:` digest, and the redis/vmagent
+> base images are digest-pinned. `deploy-k3s/SECURITY.md` is the
+> authoritative current-state record.
+
 ## Summary

 Five workloads run in the `honeydue` namespace: **api** (Go REST API, 3
@@ -175,13 +181,15 @@ doesn't run as root.
 file writes to the image layer. Go binary doesn't need to write to `/`;
 only `/tmp` is mutable.

-**`startupProbe.failureThreshold: 48`** (= 48 × 5s = 240s grace) — this
-was bumped up from the scaffold default of 12. Reason: on first boot,
-the Go app runs `MigrateWithLock()` which acquires a Postgres advisory
-lock and runs AutoMigrate. First replica takes ~90s; subsequent
-replicas wait on the lock. With 3 replicas all starting simultaneously
-and the lock serializing them, 240s is the right grace. See
-[Chapter 19](./19-postmortem-swarm.md) for the detailed story.
+**`startupProbe.failureThreshold: 48`** (= 48 × 5s = 240s grace) —
+historically bumped from the scaffold default of 12 to absorb in-replica
+migration time. Now that migrations run out-of-band as a Kubernetes
+Job ([Chapter 8 §Schema management](./08-database.md)), pods boot in
+seconds and only need a few probe failures of grace, but the budget
+stays at 240s because cold pods on a fresh Hetzner node still pay
+~10s for image pull + startup. See
+[Chapter 19 §13](./19-postmortem-swarm.md) for the historical
+context (the in-replica advisory-lock approach this replaced).

 **`readinessProbe.initialDelaySeconds: 5`** — after the startupProbe
 passes, wait 5s before starting readiness checks. Prevents a racy
@@ -4,8 +4,10 @@

 Authoritative user data lives in a Neon-managed Postgres database in AWS
 us-east-1. Connections use TLS (`DB_SSLMODE=require`). Schema is managed
-via GORM AutoMigrate inside the api binary, coordinated across replicas
-by a Postgres advisory lock to prevent concurrent migration attempts.
+via [pressly/goose](https://github.com/pressly/goose) running as a
+one-shot Kubernetes Job before every api/worker rollout. See §Schema
+management below for the full shape; ch19 §13 documents the previous
+in-replica AutoMigrate approach this replaced.

 ## Why Neon

@@ -32,7 +34,7 @@ Neon Launch won on:

 | Field | Value |
 |---|---|
-| Hostname | `ep-floral-truth-amttbc5a.c-5.us-east-1.aws.neon.tech` |
+| Hostname | `ep-floral-truth-amttbc5a-pooler.c-5.us-east-1.aws.neon.tech` |
 | Port | 5432 |
 | Username | `neondb_owner` |
 | Database | `honeyDue` (case-sensitive!) |
@@ -58,9 +60,19 @@ paid tiers much higher.

 ### PgBouncer on Neon

-Neon provides a built-in PgBouncer at `-pooler` subdomain. Our hostname
-already includes `-pooler` handling in the route, so connections go
-through PgBouncer transparently.
+Neon provides a built-in PgBouncer at the `-pooler` subdomain. The
+non-pooler endpoint (`ep-floral-truth-amttbc5a.c-5.us-east-1...`) is
+the direct compute endpoint and connects straight to Postgres,
+paying the full TCP+TLS+startup handshake on every cold connection.
+The `-pooler` endpoint multiplexes through PgBouncer in Neon's
+infrastructure.
+
+**We use the `-pooler` endpoint** because the direct endpoint paid
+~440ms per cold handshake on a transatlantic link, visible as
+1500ms-tail spikes in /api/tasks/ traces. The pooler keeps backend
+Postgres connections warm in Neon's data center, so the only
+latency our Go pods see is one TCP+TLS to PgBouncer (already
+warm via our pool) plus one query round-trip.

 Modes PgBouncer supports:
 - **session** — one server connection held per client session (transparent)
@@ -68,26 +80,59 @@ Modes PgBouncer supports:
 - **statement** — per-statement (most aggressive; breaks many features)

 Neon's pooler runs in **transaction mode**. This is compatible with GORM
-out of the box (we don't use session-level features like prepared
-statements or session variables).
+runtime queries (we don't use session-level features like LISTEN/NOTIFY
+or session-scope advisory locks in the data path). The one place this
+matters is migrations: goose's session-scoped advisory lock can't
+survive PgBouncer transaction-mode pooling. The migrate Job
+(`deploy-k3s/manifests/migrate/job.yaml`) handles this by stripping
+the `-pooler` segment from `DB_HOST` before invoking goose — runtime
+keeps using the pooler, only migrations bypass it.

 ### Connection pool settings

-In `prod.env`:
+In `config.yaml` (rendered into ConfigMap → env vars):

-```
-DB_MAX_OPEN_CONNS=25
-DB_MAX_IDLE_CONNS=10
-DB_MAX_LIFETIME=600s
+```yaml
+database:
+  max_open_conns: 25
+  max_idle_conns: 20
+  max_lifetime: "1800s"
+  max_idle_time: "0s"
 ```

-These are the Go `database/sql` pool settings (GORM uses `database/sql`
-underneath):
+These map to Go `database/sql` pool settings:

- **MaxOpenConns: 25** — at most 25 concurrent connections per replica
- **MaxIdleConns: 10** — keep up to 10 warm connections ready to reuse
- **MaxLifetime: 600s** — recycle connections after 10 min (prevents
-  stale state in long-lived connections, good for Neon's idle timeout)
+- **MaxOpenConns: 25** — at most 25 concurrent connections per replica.
+- **MaxIdleConns: 20** — keep up to 20 warm connections per replica
+  ready to reuse. Bumped from 10 because the pooler tolerates many
+  client connections cheaply, and the cost of a cold handshake (~440ms
+  transatlantic) is far higher than the cost of holding an idle
+  connection.
+- **MaxLifetime: 1800s** — recycle connections after 30 min. Bumped
+  from 600s; with the pooler keeping things warm, longer lifetime
+  reduces churn.
+- **MaxIdleTime: 0s** — never close idle connections. Lifetime drives
+  recycling instead.
+
+### Pool warm-up at boot
+
+`database.Connect()` issues 20 parallel `PingContext` calls
+immediately after opening the pool. This pre-establishes
+`MaxIdleConns` connections to the pooler so the first user request
+doesn't pay any handshake.
+
+The warm-up is bounded by *one* cold handshake (~440ms), not one
+handshake per connection — pings run concurrently. Confirmed
+in pod logs at boot:
+
+```
+{"level":"info","requested":20,"warmed":20,"message":"DB pool warm-up complete"}
+```
+
+If warm-up partially fails (e.g., 18/20 succeed), the pod still
+starts; the pool fills the rest under traffic. Failure to ping at all
+would be caught by the synchronous `sqlDB.Ping()` immediately before,
+which is fatal.

 ### Worst-case connection count

@@ -107,66 +152,110 @@ the default 25/10. If we hit connection errors in prod, adjust.

 ## Schema management

-### GORM AutoMigrate
+### goose

-On startup, the Go API's `cmd/api/main.go` calls
-`database.MigrateWithLock()` which:
+We use [pressly/goose](https://github.com/pressly/goose) (pinned in the
+api `Dockerfile` to v3.22.1) for schema migrations. Why goose specifically:

-1. Opens a dedicated Postgres connection
-2. `SELECT pg_advisory_lock(1751412071)` — acquires a session-level
-   advisory lock on a hardcoded key
-3. Calls `db.AutoMigrate(&models.*{})` for every GORM model
-4. `SELECT pg_advisory_unlock(...)` via deferred function
-5. Close the connection
+- Each migration file runs inside its own transaction by default —
+  partial-failure recovery is built in (no "dirty" state to manually
+  unstick like golang-migrate).
+- Locking is opt-in. We *don't* opt in. Migrations run as a single
+  Kubernetes Job — that's the singleton process. No advisory-lock vs
+  PgBouncer-transaction-mode foot-gun.
+- Plain SQL files. No DSL, no library integration in our Go code.

-The advisory lock serializes migrations across replicas: when 3 api
-pods start simultaneously, one acquires the lock and migrates; the
-others block on the lock. Once the first finishes (≤2s for already-
-migrated schema, up to 90s on first cold boot), the next acquires and
-sees the schema is current (no-op migrate).
+See `docs/deployment/19-postmortem-swarm.md` (Schema Versioning section)
+for the AutoMigrate-with-advisory-lock approach this replaced and why.

-### Why an advisory lock
+### Migration files

-Without it, concurrent `CREATE TABLE IF NOT EXISTS ...` statements from
-multiple replicas would race — Postgres usually handles it, but GORM's
-AutoMigrate also alters tables (adds columns, indexes) which can deadlock
-under concurrency.
+Live under `migrations/`, named `<NNNNNN>_<short_name>.sql`. Each file
+has both the up and down migration in one file, separated by goose
+markers:

-The advisory lock pattern (also used by Rails + Django + Alembic) is the
-canonical solution.
+```sql
+-- +goose Up
+CREATE TABLE example (id bigint PRIMARY KEY);

-### The lock key
+-- +goose Down
+DROP TABLE example;
+```

-`1751412071` is a hardcoded integer in `internal/database/database.go`.
-Arbitrary but unique — as long as nothing else in the Postgres instance
-uses the same advisory lock key, no conflicts.
+Multi-statement constructs (`CREATE FUNCTION`, `DO $$ BEGIN ... END $$`)
+need `-- +goose StatementBegin` / `-- +goose StatementEnd` wrappers
+because goose splits on semicolons by default.

-### First-boot behavior
+`migrations/000001_init.sql` is the baseline — captures every
+table/index/sequence as it existed when goose was adopted, generated
+via `pg_dump --schema-only --no-owner --no-privileges`. The pre-goose
+hand-numbered migrations (002-022 in git history at commit
+58e6997) had their effects folded into this baseline; they're gone
+from the live tree but remain in git for archaeology.

-On a **fresh database** (new Neon project), the first api pod runs
-through every model's `CREATE TABLE` statement. This is ~50 tables for
-honeyDue and takes ~90 seconds.
+### Production migration flow

-On a **warm database** (tables already exist), AutoMigrate is fast —
-typically under 2 seconds. It still runs (GORM checks every model
-against the schema) but finds no work to do.
+`deploy-k3s/scripts/03-deploy.sh` runs migrations as part of every
+deploy, **before** the api/worker rollout starts:

-### Where this bit us
+```
+1. kubectl delete job honeydue-migrate (idempotent)
+2. kubectl apply -f manifests/migrate/job.yaml (with current api image)
+3. kubectl wait --for=condition=complete --timeout=10m job/honeydue-migrate
+4. (only if Job succeeded) kubectl apply -f manifests/api/...
+```

-With 3 api pods starting simultaneously and migrations taking 90s first
-time, the lock queue for the last replica is ~180s. We needed a
-startupProbe grace of 240s to cover this without false restart loops.
-See Chapter 7 §startupProbe and Chapter 19 §MigrateWithLock.
+The Job uses the api image — we install the goose CLI binary at
+`/usr/local/bin/goose` during the api Dockerfile build, so any pod that
+can run api can run goose. No separate image to build/push.

-### Downside: no schema versioning
+The Job's `command` runs `goose ... up` against the **direct**
+(non-pooler) Neon endpoint. Goose's session-scoped advisory lock can't
+survive PgBouncer transaction-mode pooling, so the Job script strips
+the `-pooler` segment from `DB_HOST` before connecting. The api/worker
+runtime continues to use the pooler endpoint for everything else; only
+this one Job needs the direct connection.

-AutoMigrate can only *add* — new tables, new columns, new indexes. It
-won't drop columns, rename them, or change types destructively. For
-those we'd need raw SQL migrations (a tool like `golang-migrate` or
-`dbmate`).
+### Schema-version precondition

-Today: we accept that schema changes are additive-only. When we need
-destructive changes, we'd hand-write them.
+`internal/database/database.go::RequireSchemaApplied()` runs at api and
+worker startup. It queries `goose_db_version` for the highest applied
+version and refuses to start if the table is missing or the latest row
+is `is_applied=false`. This catches "operator forgot to run migrate" as
+a clear boot error instead of a mysterious runtime "relation does not
+exist" later.
+
+### Local migration workflow
+
+```bash
+# Set the direct-endpoint DSN once
+export DATABASE_URL="host=ep-floral-truth-amttbc5a.c-5.us-east-1.aws.neon.tech \
+                     user=neondb_owner password=${PG_PASSWORD} dbname=honeyDue sslmode=require"
+
+make migrate-status                  # what's pending
+make migrate-up                      # apply
+make migrate-down                    # roll back the latest
+make migrate-new name=add_widget_col # scaffold a new SQL file
+```
+
+Each new migration file goes through code review like any other code
+change. The deploy-script Job applies it on the next deploy.
+
+### Bootstrap (one-time, when the prod DB already had a schema)
+
+Bootstrapping a goose-managed DB whose schema already exists requires
+seeding `goose_db_version` so goose treats version 1 as already-applied:
+
+```bash
+# Once. After this, future migrations append normally.
+goose -dir migrations postgres "$DATABASE_URL" version  # creates the table
+psql "$DATABASE_URL" -c \
+  "INSERT INTO goose_db_version (version_id, is_applied, tstamp) VALUES (1, true, NOW());"
+```
+
+This was done for honeyDue's prod Neon project at the time of goose
+adoption — no need to repeat unless we set up a fresh DB from a
+schema dump.

 ## What's in the database

@@ -229,17 +318,45 @@ value.
 ## Neon regions

 Neon's default region for new projects is `aws-us-east-1` (Virginia).
-Our DB is there. Latency from Nuremberg to us-east-1 is **~90-120ms
-round trip**.
+Our DB is there. Latency from Nuremberg to us-east-1 is **~108ms one-way**
+TCP-level (verified by `nc -z -w 5` from `hetzner1`), so **~220ms RTT
+through Neon's pooler stack**.

 This is the slowest hop in our data flow. Every api request that needs
-a DB query (most of them) pays this latency at least once.
+a DB query pays this latency at least once. Sub-millisecond Postgres
+execution time (verified via `EXPLAIN ANALYZE`: 0.04-0.34 ms on every
+hot path) means **wall-clock latency = network + Neon proxy overhead**.

-**When this matters**: When we start seeing ~200ms+ response times from
-complex endpoints, it's likely DB latency dominant. Options:
- Migrate Neon to `aws-eu-central-1` (Frankfurt) — shaves ~90ms off
- Add Redis caching for hot reads (Chapter 7)
- Read replicas (Neon supports them on paid tiers)
+### Optimizations layered on top to minimize round trips
+
+We don't move the DB region (yet) but we cut the *number* of RTTs per
+request via:
+
+1. **Auth caching** (Chapter 7 §Redis) — token + user lookups served
+   from Redis (1-hour TTL) and per-pod in-memory cache (5-min TTL).
+   On warm cache: 0 SQL round-trips for auth.
+2. **JOIN consolidation** — two-step
+   `find residence-IDs → find tasks IN ids` collapsed into a single
+   query with a Postgres subquery. One RTT instead of two.
+3. **Single-query auth** — token + user fetched in one INNER JOIN
+   instead of GORM's two-query Preload pattern.
+4. **Residence-IDs Redis cache** — cached per user with 5-min TTL,
+   invalidated on Create/Delete/Join/Remove. Saves 1 RTT per
+   `/api/documents/`, `/api/contractors/`, `/api/residences/summary/`
+   request.
+
+After these, a fully-warm `/api/tasks/` is **1 SQL round-trip total
+(~220ms wall-clock)**. Verified via Jaeger trace — see Chapter 15.
+
+### When this still matters
+
+- Any cold-cache request still pays 2-3 RTTs (~500-700ms).
+- Pod startup pays 1 RTT × 20 (warm-up), but that runs in parallel:
+  ~440ms one-shot.
+
+Long-term fix: migrate Neon to `aws-eu-central-1` (Frankfurt) — drops
+RTT to ~5ms and brings warm-cache requests under 50ms. Tracked in
+`docs/observability-plan.md` and Chapter 18 §migration triggers.

 ## Environment variables the app reads

@@ -247,14 +364,15 @@ From ConfigMap:

 | Var | Purpose |
 |---|---|
-| `DB_HOST` | Neon pooler hostname |
+| `DB_HOST` | Neon pooler hostname (`-pooler` suffix) |
 | `DB_PORT` | 5432 |
 | `POSTGRES_USER` | `neondb_owner` |
 | `POSTGRES_DB` | `honeyDue` |
 | `DB_SSLMODE` | `require` |
 | `DB_MAX_OPEN_CONNS` | 25 |
-| `DB_MAX_IDLE_CONNS` | 10 |
-| `DB_MAX_LIFETIME` | `600s` |
+| `DB_MAX_IDLE_CONNS` | 20 |
+| `DB_MAX_LIFETIME` | `1800s` |
+| `DB_MAX_IDLE_TIME` | `0s` (never close idle) |

 From Secret (`honeydue-secrets`):

@@ -288,11 +406,13 @@ GROUP BY usename, state, application_name;
 - [Neon docs][neon-docs]
 - [Neon pricing][neon-pricing]
 - [Postgres advisory locks][pg-locks]
- [GORM AutoMigrate][gorm-automigrate]
+- [pressly/goose][goose] — production migration tool
+- [GORM AutoMigrate][gorm-automigrate] (tests only)
 - [honeyDue task architecture][task-arch] (repo-local)

 [neon-docs]: https://neon.com/docs/introduction
 [neon-pricing]: https://neon.com/pricing
 [pg-locks]: https://www.postgresql.org/docs/current/explicit-locking.html#ADVISORY-LOCKS
+[goose]: https://github.com/pressly/goose
 [gorm-automigrate]: https://gorm.io/docs/migration.html
 [task-arch]: ../../docs/TASK_LOGIC_ARCHITECTURE.md
@@ -150,18 +150,64 @@ Allowed MIME types: `image/jpeg`, `image/png`, `image/gif`, `image/webp`,

 ## Access control

-### Upload flow
+### Upload flow (current — direct-to-B2 with presigned POST)

-1. Client POSTs to `/api/upload/`
-2. Go API validates the user is authenticated and authorized for the
-   target resource
-3. Go API streams the upload to B2 via minio-go's `PutObject`
-4. B2 returns a key
-5. Go API stores the key in Postgres
-6. Returns the key to the client
+Image and document uploads go **directly from the client to B2**. The
+api server only signs a short-lived POST policy; the bytes never
+traverse our cluster. This is the WhatsApp / Slack architecture and
+sidesteps the api as a proxy bottleneck.

-The B2 bucket is **private**. Clients can't GET directly; they always
-go through the Go API.
+1. Client `POST /api/uploads/presign` with `{category, content_type, content_length}`.
+2. api validates auth, per-user quota (10 concurrent in-flight,
+   50/hour rate limit), allowed mime, and the 10 MB cap. On success it
+   creates a `pending_uploads` row, signs a B2 POST policy with a
+   `content-length-range` condition bound to the claimed length ±256
+   bytes, and returns `{id, upload_url, fields, key, expires_at}`.
+3. Client multipart-POSTs the bytes directly to B2 using the returned
+   fields. **B2 enforces the size cap at the protocol level** — clients
+   can't bypass it by lying about Content-Length.
+4. Client POSTs to the entity-creation endpoint (`/api/task-completions/`,
+   `/api/documents/`) with `upload_ids: [id]`. The service `HEAD`s each
+   B2 object, verifies size matches `expected_bytes`, marks the
+   `pending_uploads.claimed_at`, and writes the `task_completion_image`
+   / `document_image` row referencing the upload.
+
+The signed URL is valid for 15 minutes; presigns are not reusable.
+
+The B2 bucket stays **private** — only our backend (the api, plus the
+worker for orphan cleanup) holds the key material. Clients can't list
+or GET objects directly; their only direct access is the presigned POST.
+
+```
+┌──────────┐   1) presign        ┌────────┐
+│  client  │ ──────────────────► │  api   │
+│          │ ◄────────────────── │        │  POST policy + key
+│          │                     └────────┘
+│          │                                   row in
+│          │                          pending_uploads
+│          │                          (claimed_at NULL)
+│          │   2) POST bytes     ┌────────┐
+│          │ ──────────────────► │   B2   │  enforces policy
+│          │ ◄────────────────── │        │
+│          │                     └────────┘
+│          │   3) attach         ┌────────┐
+│          │ ──────────────────► │  api   │  HEAD B2 object,
+│          │  upload_ids: [id]   │        │  mark claimed_at,
+│          │                     └────────┘  insert image row
+└──────────┘
+```
+
+Server-side enforcement summary:
+
+| Check | Where | Reject if |
+|---|---|---|
+| Auth | api middleware | unauthenticated |
+| Mime allowlist | `upload_service.go:allowedContentTypes` | not in list for category |
+| Size cap (10 MB) | api before signing + B2 policy | content_length > 10 MiB |
+| Concurrency cap (10) | `CountUnclaimedActiveForUser` | already 10 unclaimed in-flight |
+| Rate limit (50/hr) | Redis sliding window `upload:presign:<uid>:<bucket>` | 51st presign in the same hour |
+| Size at upload time | B2 (signed policy) | bytes outside content-length-range |
+| Ownership at attach | `FindUnclaimedForUser` | upload_id belongs to a different user |
+| Bytes match claim | `s3.Stat()` + bytes comparison | actual size differs from expected ±256 |

 ### Download flow (current)

@@ -170,34 +216,55 @@ go through the Go API.
 3. Go API fetches from B2 and streams back to the client

 This proxies every download through the api. For high-traffic media
-that's inefficient (api becomes an egress bottleneck).
-
-### Future: signed URLs
-
-We could generate time-limited signed URLs for B2 objects:
-
-```go
-url, err := s3Client.PresignedGetObject(ctx, bucket, key, 1*time.Hour, nil)
-```
-
-Returns a URL the client can GET directly from B2, scoped to a specific
-object, valid for 1h. Saves api bandwidth and latency.
-
-Not yet implemented. TODO (Chapter 20).
+that's inefficient (api becomes an egress bottleneck) — could be
+replaced with presigned GET URLs on the same bucket. Not yet shipped;
+download volume is low enough that the proxy is fine for now.

 ## Lifecycle and retention

-We have **no lifecycle rules** set on the bucket. Objects live forever
-unless the app deletes them.
+### Orphan cleanup (`pending_uploads`)

-When a user deletes their account, the app should delete their B2
-objects. This is currently not automated — a compliance gap for any
-"right to be forgotten" request.
+Every presign creates a row in `pending_uploads` with `expires_at =
+now + 15 min`. If the client never finishes the upload, or finishes
+but never calls the attach endpoint, the row stays unclaimed. An
+hourly cron in the worker reaps them:

-**TODO** (Chapter 20): Either:
- Implement explicit cleanup in the user deletion handler, or
- Add B2 lifecycle rule tied to object metadata (tag objects with
-  user ID; rule deletes tagged objects when user is soft-deleted)
+- **`maintenance:upload_cleanup`** — cron `30 * * * *`. Selects
+  unclaimed rows past `expires_at`, deletes the corresponding B2
+  object, deletes the row. Up to 500 per tick; the next tick picks up
+  any overflow. Worker logs include `reaped` count.
+
+The worker constructs a `StorageService` at startup; if storage init
+fails (e.g. `B2_KEY_ID` / `B2_APP_KEY` not wired into the worker
+deployment), the cleanup handler logs a warning and no-ops. See
+`deploy-k3s/manifests/worker/deployment.yaml` — both B2 secrets are
+required envs on this pod.
+
+### Bucket lifecycle (backstop)
+
+A B2 lifecycle rule on the `uploads/` prefix is the safety net if the
+worker is offline for an extended period:
+
+- Hide objects 7 days after upload.
+- Delete 1 day after hidden.
+
+This is configured manually via the Backblaze console (B2's S3
+lifecycle API isn't fully implemented). See
+`deploy-k3s/manifests/b2-lifecycle.md` for the exact rule and
+`b2 bucket get-info` verification command.
+
+### User-deletion cascade
+
+When a user deletes their account, the app deletes their `task_*` /
+`document` rows. The associated B2 objects survive — same compliance
+gap as before, not yet automated. Two approaches:
+
+- Walk the image rows on user delete and `RemoveObject` each (simple,
+  synchronous, slow for users with many uploads).
+- Tag objects with a `user_id` metadata header at upload time, then
+  use a B2 lifecycle rule scoped to a deleted-users prefix.
+
+The first approach (walking the image rows on user delete) is the next
+item in the upload roadmap.

 ## Backup of B2

@@ -1,5 +1,11 @@
 # 10 — Secrets & Config

+> **Updated 2026-05-15 (security remediation):** `honeydue-secrets` now
+> carries `REDIS_PASSWORD`; an `admin-basic-auth` Secret backs the admin
+> ingress; rotation is documented in `docs/runbooks/secret-rotation.md`;
+> and the Go config can read file-mounted secrets (`HONEYDUE_SECRETS_DIR`).
+> `deploy-k3s/SECURITY.md` is the authoritative current-state record.
+
 ## Summary

 Non-sensitive config (hostnames, ports, feature flags, etc.) lives in
@@ -55,7 +61,7 @@ APNS_AUTH_KEY_ID=DISABLED01
 APNS_AUTH_KEY_PATH=/secrets/apns/apns_auth_key.p8
 APNS_PRODUCTION=false
 APNS_TEAM_ID=DISABLED01
-APNS_TOPIC=com.tt.honeyDue
+APNS_TOPIC=com.myhoneydue.honeyDue
 APNS_USE_SANDBOX=false
 BASE_URL=https://myhoneydue.com
 B2_BUCKET_NAME=honeyDueProd
@@ -272,7 +272,7 @@ sequenceDiagram
    participant NewPod as api pod v2 (starting)

    Note over NewPod: kubelet starts new pod
-    Note over NewPod: pod connects to Postgres<br/>MigrateWithLock runs (no-op)<br/>HTTP server starts<br/>readinessProbe passes
+    Note over NewPod: pod connects to Postgres<br/>RequireSchemaApplied checks goose_db_version<br/>HTTP server starts<br/>readinessProbe passes
    Note over NewPod: kube-proxy updates endpoints<br/>NewPod added to Service pool
    CF->>Traefik: request 1
    Traefik->>OldPod: routed (old pod still in pool)
@@ -8,23 +8,62 @@ No downtime if the change is backward-compatible. Rollback is
 `kubectl rollout undo`. This chapter walks through the full process,
 plus alternate paths (config-only changes, manifest changes, hotfixes).

-## TL;DR for a code change
+## TL;DR using the unified deploy script
+
+The recommended path. `deploy-k3s/scripts/03-deploy.sh` builds all four
+images (api, worker, admin, web), pushes to Gitea, regenerates the
+ConfigMap from `config.yaml`, applies every manifest under
+`deploy-k3s/manifests/` (including the observability vmagent), and
+waits for all rollouts.
+
+```bash
+cd /Users/treyt/Desktop/code/honeyDue/honeyDueAPI-go
+git add . && git commit -m "..." && git push gitea master
+
+export KUBECONFIG=~/.kube/honeydue.yaml
+bash deploy-k3s/scripts/03-deploy.sh         # full build + push + rollout
+# or, to redeploy without rebuilding:
+bash deploy-k3s/scripts/03-deploy.sh --skip-build
+# or, to pin a specific tag:
+bash deploy-k3s/scripts/03-deploy.sh --tag d3708e6
+```
+
+What the script does, in order:
+
+1. Read registry creds from `deploy-k3s/config.yaml`.
+2. `docker login gitea.treytartt.com`.
+3. Build all four images with `--platform linux/amd64` (so arm64 Macs
+   don't push images that crash on Hetzner amd64 nodes with
+   "exec format error").
+4. Push to the gitea registry, plus tag and push `:latest`.
+5. Generate the env file from `config.yaml` and apply as ConfigMap
+   `honeydue-config` (uses dry-run + apply for diff-free idempotence).
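+6. Apply `manifests/namespace.yaml`, `redis/`, `ingress/`,
+   `api/{deployment,service,hpa}`, `worker/`, `admin/`, `web/`. The
+   `honeydue-migrate` Job (`goose up`) runs to completion before the
+   api/worker manifests are applied — see §"Migrations are gated, not
+   interleaved" below.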
+7. Apply `manifests/observability/vmagent.yaml`, substituting
+   `TOKEN_PLACEHOLDER` with `OBS_INGEST_TOKEN` from `deploy/prod.env`
+   (gitignored). Skipped with a warning if the token isn't present.
+8. `kubectl rollout status` for every Deployment, including vmagent.
+
+~7–10 minutes for a full rebuild. ~1–2 minutes with `--skip-build`.
+
+## TL;DR for a single-service code change (manual)

 ```bash
 # 1. Commit + get SHA
 cd /Users/treyt/Desktop/code/honeyDue/honeyDueAPI-go
 git add . && git commit -m "..." && SHA=$(git rev-parse --short HEAD)

-# 2. Login to Gitea registry
-set -a; source deploy/registry.env; set +a
-printf '%s' "$REGISTRY_TOKEN" | docker login "$REGISTRY" -u "$REGISTRY_USERNAME" --password-stdin
+# 2. Log in to Gitea registry (creds in config.yaml)
+docker login gitea.treytartt.com -u admin

 # 3. Build + push amd64 image
-docker buildx build --platform linux/amd64 --target api \
-  -t "gitea.treytartt.com/admin/honeydue-api:${SHA}" --push .
+docker build --platform linux/amd64 --target api \
+  -t "gitea.treytartt.com/admin/honeydue-api:${SHA}" .
+docker push "gitea.treytartt.com/admin/honeydue-api:${SHA}"

 # 4. Roll it in
-export KUBECONFIG=~/.kube/honeydue-k3s.yaml
+export KUBECONFIG=~/.kube/honeydue.yaml
 kubectl set image deployment/api -n honeydue \
  api="gitea.treytartt.com/admin/honeydue-api:${SHA}"

@@ -32,11 +71,18 @@ kubectl set image deployment/api -n honeydue \
 kubectl rollout status -n honeydue deployment/api

 # 6. Log out
-docker logout "$REGISTRY"
+docker logout gitea.treytartt.com
 ```

 ~3–5 minutes end to end for api.

+> **Gotcha:** Deployments default to `imagePullPolicy: IfNotPresent`,
+> which means kubelet won't re-fetch an image with a tag it already
+> has cached locally — even if the registry now has different bytes
+> at that tag. Always change tags (use the SHA), or temporarily flip
+> `imagePullPolicy: Always` and `kubectl rollout restart` if you need
+> to overwrite a tag.
+
 ## The build

 ### Step 1 — Prepare
@@ -201,6 +247,38 @@ kubectl patch secret honeydue-secrets -n honeydue \
 kubectl rollout restart -n honeydue deployment/api deployment/worker
 ```

+## One-time B2 bucket lifecycle (manual)
+
+The `pending_uploads` cleanup cron (`30 * * * *` on the worker) handles
+the common case of reaping orphaned uploads. The B2 bucket lifecycle
+rule on the `uploads/` prefix is the **backstop** if the worker is
+offline for >24 hours. It's configured once via the Backblaze web
+console — B2's S3 lifecycle API isn't fully implemented, so this can't
+be in the deploy script.
+
+One-time setup:
+
+1. Open https://secure.backblaze.com/b2_buckets.htm → bucket
+   `honeyDueProd` → **Lifecycle Settings** → **Custom**
+2. Add rule:
+   - File name prefix: `uploads/`
+   - Hide files older than: **7 days**
+   - Delete hidden files older than: **1 day**
+
+Maximum lifetime of an orphaned object under this rule: 8 days (7
+until hidden, plus 1 more until deleted). The worker normally reaps
+within an hour, so the rule should almost never trigger.
+
+Verify:
+
+```bash
+# Requires the b2 CLI: brew install b2-tools
+b2 bucket get-info honeyDueProd | jq '.lifecycleRules'
+```
+
+See `deploy-k3s/manifests/b2-lifecycle.md` for the canonical rule
+definition and a curl-based fallback if the b2 CLI isn't available.
+
 ## Manifest changes

 When you add/modify a deployment YAML:
@@ -271,10 +349,47 @@ Timeline (approximate, warm state):
 - t=60s: another old pod terminates
 - ...continues until all on new RS

-For cold-boot (e.g., first deploy on a rebuilt cluster), the
-MigrateWithLock advisory lock extends this to several minutes. But the
-rollout is serialized — only one pod starts per iteration, so the lock
-queue is small.
+Migrations run as a separate Kubernetes Job that completes before any
+api/worker pod is rolled. So the rollout above never includes migration
+work — pods that boot are guaranteed to find the schema already at the
+expected version. See §"Migrations are gated, not interleaved" below.
+
+## Migrations are gated, not interleaved
+
+`03-deploy.sh` runs `goose up` as a one-shot Job before applying any
+api/worker manifests:
+
+```
+1. kubectl delete job honeydue-migrate (idempotent, removes prior run)
+2. kubectl apply -f manifests/migrate/job.yaml (with current api image)
+3. kubectl wait --for=condition=complete --timeout=10m job/honeydue-migrate
+4. (only if Job succeeded) kubectl apply -f manifests/api/...
+```
+
+The Job uses the api image — `/usr/local/bin/goose` is baked in at
+Dockerfile build time. The Job script strips the `-pooler` segment
+from `DB_HOST` before connecting (goose's session-scoped advisory
+lock can't survive PgBouncer transaction-mode), runs `goose up`, exits.
+
+If the Job fails, the script aborts before any new app pod sees a
+stale schema. To debug:
+
+```bash
+kubectl -n honeydue logs job/honeydue-migrate --tail=200
+kubectl -n honeydue describe job honeydue-migrate
+```
+
+After investigating, fix the migration file and re-run `03-deploy.sh`.
+The Job is idempotent — successful migrations stay applied, only the
+new/failed file gets retried.
+
+api/worker pods run a `RequireSchemaApplied` check at startup that
+queries `goose_db_version` and refuses to boot if the table is missing
+or the latest row is `is_applied=false`. This is the fail-fast for
+"someone bypassed the deploy script and the schema isn't current."
+
+For full schema management background, see
+[Chapter 8 §Schema management](./08-database.md).

 ## Hotfix workflow

@@ -314,14 +429,10 @@ Contrast: `deploy/scripts/deploy_prod.sh` (Swarm-era) did:
 9. Healthcheck the final URL; auto-rollback on failure
 10. Log out of registries

-Our current k3s deploy is more manual but simpler. We'd write a similar
-script for k3s if deploys become frequent:
-
-```bash
-# deploy-k3s/scripts/04-deploy.sh (not yet updated for Gitea)
-```
-
-See the scaffold in `deploy-k3s/scripts/`.
+The current k3s replacement, `deploy-k3s/scripts/03-deploy.sh`, covers
+the same ground in fewer steps because Kubernetes does the
+versioning/rollout/health bookkeeping natively. See the TL;DR section
+at the top of this chapter.

 ## Common deploy failures

@@ -2,15 +2,119 @@

 ## Summary

-We have minimal observability today: `kubectl logs`, `kubectl top`,
-Cloudflare Analytics, and the Neon dashboard. No Prometheus, no Grafana,
-no centralized log aggregator, no APM. This is adequate for the
-current traffic volume (low) but is a known gap. This chapter documents
-what we *have* and what we'd add as traffic grows.
+Production has live metrics and tracing infrastructure as of 2026-04-25.
+A self-hosted **VictoriaMetrics + Jaeger + Grafana** stack runs on
+`88oakappsUpdate` (Linode VPS, also home to the self-hosted PostHog
+deployment). A `vmagent` Deployment in the honeyDue k3s namespace scrapes
+the api Pods' `/metrics` endpoint every 15 seconds and remote-writes to
+`https://obs.88oakapps.com/api/v1/write`. Grafana is at
+`https://grafana.88oakapps.com` with a pre-provisioned RED dashboard.
+
+What we still don't have: log aggregation (Dozzle and `kubectl logs`
+fill the niche for now), alerting (no PagerDuty/Slack on errors), and
+continuous profiling. Distributed tracing is live: the OTel SDK in the
+api and worker ships spans to Jaeger (details later in this chapter).
+
+The whole observability stack costs **$0** incremental and uses ~700 MB
+RAM on `88oakappsUpdate` (5% of its free RAM). It runs as a separate
+docker-compose project from PostHog so neither product's lifecycle
+touches the other.

 ## What we have

-### 1. `kubectl logs`
+### 1. Metrics — VictoriaMetrics + vmagent
+
+```
+honeyDue k3s (Hetzner)                   88oakappsUpdate (Linode)
+┌───────────────────────────┐            ┌──────────────────────────┐
+│ api Pods (3) :8000/metrics│            │ /opt/honeydue-obs/       │
+│   prometheus/client_golang│            │ ┌──────────────────┐     │
+│                           │            │ │ VictoriaMetrics  │     │
+│ vmagent ──── scrape 15s   │            │ │  30d retention   │     │
+│         remote_write ─────┼────────────┼─→ /api/v1/write   │     │
+│         (HTTPS, bearer)   │            │ │  (mem 256 MB)    │     │
+└───────────────────────────┘            │ └──────────────────┘     │
+                                          └──────────────────────────┘
+```
+
+The Go API exposes `/metrics` in Prometheus exposition format. Histograms
+are defined in `internal/prom/metrics.go` and registered globally:
+
+| Metric | Labels | Source |
+|---|---|---|
+| `http_request_duration_seconds` | `route, method, status` | Echo middleware around every handler |
+| `gorm_query_duration_seconds` | `table, operation` | GORM before/after callbacks (no ctx threading needed) |
+| `b2_upload_duration_seconds` | `bucket, result` | Wrapped `s.backend.Write` in `internal/services/storage_service.go` |
+| `b2_upload_bytes_total` | `bucket, result` | Counter alongside the duration histogram |
+| `apns_send_duration_seconds` | `result` (`ok`/`bad_token`/`error`) | Wrapped APNs `PushWithContext` in `internal/push/apns.go` |
+| `fcm_send_duration_seconds` | `result` | Wrapped FCM HTTP v1 send in `internal/push/fcm.go` |
+| `asynq_job_duration_seconds` | `task_type, result` | Histograms registered; middleware not yet attached (Step 3) |
+| `go_*`, `process_*` | (standard) | `prometheus/client_golang/prometheus/collectors` defaults |
+
+The previous custom monitoring at `/metrics` was renamed to
+`/metrics/legacy` so the canonical `/metrics` emits proper histograms
+suitable for `histogram_quantile()` rollups. The legacy endpoint stays
+because the GoAdmin dashboard reads it.
+
+#### vmagent in k3s
+
+Lives at `deploy-k3s/manifests/observability/vmagent.yaml`. One replica,
+256Mi memory limit, scrapes via Kubernetes pod discovery filtered to
+`app.kubernetes.io/name=api` and remote-writes to
+`https://obs.88oakapps.com/api/v1/write` with a bearer token from
+`OBS_INGEST_TOKEN` in `deploy/prod.env` (substituted into a Secret at
+deploy time).
+
+The agent buffers locally to `/tmp/vmagent` (emptyDir, 512 MB cap), so
+brief obs outages don't drop samples. Persistent queue replays on
+reconnect.
+
+NetworkPolicies in the honeydue namespace allow egress from vmagent to:
+- DNS (kube-dns / coredns)
+- Kubernetes API (`10.43.0.0/16:443`) for pod discovery
+- api Pods on `10.42.0.0/16:8000`
+- The public obs endpoint over `0.0.0.0/0:443`
+
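+These keep vmagent away from Postgres and Redis. Note that the
+`0.0.0.0/0:443` rule is port-scoped, not host-scoped: it still permits
+outbound HTTPS to any address, so it does not by itself block B2 or
+other external HTTPS services.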
+
+### 2. Tracing — Jaeger all-in-one
+
+Jaeger 1.62 with badger storage runs alongside VictoriaMetrics. The
+collector accepts:
+- OTLP/HTTP at `https://obs.88oakapps.com/v1/traces` (bearer-token gated)
+- OTLP/gRPC at `:4317` (localhost-only)
+- Native Jaeger protocols at `:14268` etc. (localhost-only)
+
+Retention: ~7 days at current scale before badger rotates. UI at
+`https://grafana.88oakapps.com` via the Jaeger datasource.
+
+**Status of app-side instrumentation**: live. The OTel SDK is wired in
+`cmd/api/main.go` and `cmd/worker/main.go` and exports to the OTLP/HTTP
+endpoint above, so requests produce nested traces with HTTP → SQL →
+B2 → APNs/FCM spans. The span inventory and sample traces are in the
+"Distributed tracing" section later in this chapter. Original tracking
+issue: gitea#3.
+
+### 3. Dashboards — Grafana
+
+`https://grafana.88oakapps.com` (Cloudflare-fronted, basic auth via
+Grafana itself, admin credentials in `deploy/prod.env`).
+
+Datasources auto-provisioned at container startup from
+`/opt/honeydue-obs/data/grafana-provisioning/datasources/datasources.yaml`:
+- VictoriaMetrics (Prometheus type, `http://victoriametrics:8428` in-network)
+- Jaeger (`http://jaeger:16686` in-network)
+
+Pre-provisioned dashboard: `honeyDue API — RED` at
+`/d/honeydue-red`. Top row uses the legacy custom metrics
+(`http_endpoint_requests_total`, `http_requests_total`) which started
+flowing the moment vmagent attached. Lower rows use the new histograms
+(`http_request_duration_seconds_bucket` p50/p95/p99 by route, GORM p95
+by table, B2 upload p95, APNs/FCM send p95, Go memory + goroutines).
+Lower rows populated immediately after the api rebuild that shipped
+`internal/prom`.
+
+### 4. `kubectl logs`

 Every container's stdout/stderr is captured by containerd and readable
 via kubectl:
@@ -33,9 +137,10 @@ kubectl get events -n honeydue --sort-by=.lastTimestamp
 Only the last ~20 MB of logs is retained per container, on-disk on the
 node. Once a pod is deleted, its logs are gone.

-For persistent log access we'd need aggregation (see §what we'd add).
+For persistent log access we'd need aggregation (see §What we still
+don't have).

-### 2. `kubectl top`
+### 5. `kubectl top`

 Pod and node resource usage via metrics-server:

@@ -43,43 +148,32 @@ Pod and node resource usage via metrics-server:
 kubectl top nodes
 # NAME                CPU(cores)   CPU(%)   MEMORY(bytes)   MEMORY(%)
 # ubuntu-8gb-nbg1-1   169m         4%       748Mi           9%
-# ubuntu-8gb-nbg1-2   229m         5%       1043Mi          13%
-# ubuntu-8gb-nbg1-3   124m         3%       770Mi           9%

 kubectl top pods -n honeydue
 ```

-**Retention**: In-memory only. Last few minutes of data. No
-historical view.
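+In-memory only; last few minutes of data. For historical trends use
+the Grafana dashboard: the `go_*` and `process_*` metrics give
+per-process memory/CPU history for the api pods. Per-container
+`container_*` (kubelet cAdvisor) metrics would need an additional
+scrape job — vmagent as described above only scrapes the api pods.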

-### 3. Cloudflare Analytics
+### 6. Cloudflare Analytics

-CF Dashboard → Analytics & Logs. Per-zone stats:
- Requests per second
- Bandwidth
- Cache hit ratio
- Top HTTP status codes
- Top request paths
- Bot traffic score
+CF Dashboard → Analytics & Logs. Per-zone aggregate stats:
+requests/sec, bandwidth, cache hit ratio, top status codes, top paths,
+bot traffic score. Good for spotting macro trends ("suddenly 10× more
+502s today") that wouldn't show up in a single-pod sample.

-All aggregated, no individual request traces. Good for spotting macro
-trends ("suddenly 10× more 502s today"), poor for debugging specific
-issues.
+Free tier retention: 7 days of aggregate stats.

-Free tier retention: 7 days of aggregate stats. Pro extends this.
+### 7. Neon dashboard

-### 4. Neon dashboard
+Neon console → project → Monitoring: compute utilization (CU-hours),
+slow queries, active connections, storage usage. Useful for "is the
+DB busy?" and free-tier limit watching. The new
+`gorm_query_duration_seconds` histogram covers the application side
+of the same question with much better latency tail visibility.

-Neon console → project → Monitoring:
- Compute utilization (CU-hours consumed)
- Query performance (slow queries)
- Active connections
- Storage usage
-
-Good for "is the DB busy?" and "am I close to my free tier limit?"
-Not real-time.
-
-### 5. Kubernetes events
+### 8. Kubernetes events

 `kubectl get events` shows cluster-level state changes: pod scheduling,
 failures, image pulls, probe failures. Useful for post-mortem on
@@ -87,7 +181,7 @@ deploys.

 Retention: events are stored in etcd but default to 1 hour.

-## What we don't have (the gap)
+## What we still don't have

 ### No log aggregation

@@ -98,64 +192,108 @@ all api pod logs for user X") we have to:
 # Query all at once with stern (if installed)
 stern -n honeydue api

-# Or for specific pod
+# Or per-pod
 kubectl logs -n honeydue <pod> | grep user_id=12345
 ```

-This works but doesn't scale. Grep across 3 pods for a specific
-user_id is OK. Across 30 pods, intractable.
+This works but doesn't scale across many pods.

-**What we'd add**: [Loki](https://grafana.com/oss/loki/) — a lightweight
-log aggregator designed for k8s. ~$0 to self-host; integrates with
-Grafana for queries. Or [Betterstack](https://betterstack.com/logs)
-($10/mo, hosted).
-
-### No metrics/dashboards
-
-`kubectl top` tells us "is this pod hot right now?" but not "has CPU
-been climbing over the past hour?" We'd need:
-
- **Prometheus** — scrapes metrics from kubelet and pods' `/metrics`
-  endpoints, stores time series
- **Grafana** — queries Prometheus, renders dashboards
-
-K3s can install these via Helm in ~10 minutes. Adds ~500MB RAM to the
-cluster. Stability and operational load: moderate.
-
-**Alternative**: [Kubernetes Dashboard](https://github.com/kubernetes/dashboard)
-bundled with k3s (disabled by default). Minimal UI over the existing
-metrics API. Cheaper than Prometheus but less queryable.
-
-### No distributed tracing
-
-"This request took 800ms — which hop was slow?" is currently unanswerable
-beyond "the DB query, probably." A real trace would show:
- TLS handshake time
- Traefik routing time
- Go handler time
- Postgres query time
- Redis call time
- Each B2 request time
-
-We'd add OpenTelemetry to the Go app and export to Jaeger/Tempo. Work
-is moderate; value kicks in when we have complex request flows.
+**What we'd add**: [Loki](https://grafana.com/oss/loki/) on
+`88oakappsUpdate` next to the existing obs stack. Adds ~512 MB RAM
+plus a Promtail (or Vector/Alloy) DaemonSet in k3s. Defer until log
+search becomes a recurring pain point — `stern` + `grep` is fine at
+current pod count.

 ### No alerting

 No PagerDuty, no Slack webhooks, no email on "api is returning 500s."
 The operator finds out when users complain.

-Cheapest fix: [Uptime Kuma](https://github.com/louislam/uptime-kuma)
-(self-hosted) or Better Stack Uptime (free for small teams). Ping
-`https://api.myhoneydue.com/api/health/` every minute; alert if it fails.
+Cheapest fix path:
+1. Grafana alerting (built into Grafana 11) — alert rules over the
+   existing histograms (e.g., `histogram_quantile(0.95, ...) > 1s`).
+   Routes to Slack via webhook. **Zero infra cost.**
+2. [Uptime Kuma](https://github.com/louislam/uptime-kuma) on
+   `88oakappsUpdate` — pings `/api/health/` from outside the cluster
+   every minute; complements the in-cluster view.
+
+We'd want both eventually. Grafana alerting first because the data is
+already there.
+
+### Distributed tracing — fully integrated
+
+The OTel SDK is wired in `cmd/api/main.go` and `cmd/worker/main.go` and
+ships traces to Jaeger via `obs.88oakapps.com/v1/traces`. Every public
+service method now takes `ctx context.Context` and routes its SQL through
+`repo.WithContext(ctx)`, which means **every authenticated API endpoint
+produces a fully-nested flame graph** in Jaeger.
+
+| Span source | Status |
+|---|---|
+| `otelecho.Middleware` — span per HTTP request | ✅ live |
+| Auth middleware DB lookups (`m.db.WithContext(ctx)`) | ✅ live |
+| All repos via `repo.WithContext(ctx)` (`otelgorm` plugin) | ✅ live |
+| Manual span around `storage_service.Upload` (B2 PutObject) | ✅ live |
+| Manual span around APNs `Send` / `SendWithCategory` | ✅ live |
+| Manual span around FCM `sendOne` | ✅ live |
+| Asynq middleware — span per task type with retry/payload attrs | ✅ live |
+
+Migrated services (every public method takes ctx):
+- `AuthService` — login, register, refresh, logout, me, verify-email,
+  forgot/reset-password, update-profile
+- `TaskService` — all 25+ task and completion methods
+- `ResidenceService` — all 15 methods including share-codes
+- `ContractorService` — all 9 methods
+- `DocumentService` — all 10 methods
+- `NotificationService` — all 12 methods
+- `SubscriptionService` — all 12 methods including Apple/Google IAP
+
+Sample trace for `GET /api/tasks/` (warm cache, post-optimization):
+
+```
+GET /api/tasks/                                              (229ms)
+└── service: SELECT * FROM task_task WHERE residence_id IN
+              (SELECT id FROM residence_residence WHERE...)   (227ms)
+```
+
+Two spans total. The auth path runs entirely from Redis + in-memory
+cache (zero SQL queries) thanks to the 1-hour token TTL and 5-min user
+TTL. The residence-ID lookup is folded into the tasks query as a
+Postgres subquery, so a single network round-trip to Neon services the
+whole request. See Chapter 8 §"Optimizations layered on top" for the
+optimization stack.
+
+Earlier trace, before the optimization stack landed (commit 88fb175):
+
+```
+GET /api/tasks/                                              (2473ms)
+├── auth: SELECT * FROM user_authtoken WHERE key=...           (1506ms)
+├── auth: SELECT * FROM auth_user WHERE id=7                    (333ms)
+├── service: SELECT id FROM residence_residence WHERE...        (736ms)
+└── service: SELECT * FROM task_task WHERE residence_id IN(...) (226ms)
+```
+
+10× improvement from 2,473ms to 229ms by cutting query count
+(4 SQL → 1 SQL on warm cache). The 227ms in the surviving query is
+**1 transatlantic round-trip** to Neon us-east-1 from Hetzner
+Nuremberg — the physical floor on the current setup. It would be
+eliminated by migrating Neon to an EU region; tracked in [Chapter 18 §migration
+triggers](./18-cost.md) and `docs/observability-plan.md`.
+
+**Migration pattern (for any future services or middleware):** add
+`ctx context.Context` as the first arg, change the handler call site
+to pass `c.Request().Context()`, and replace `s.repo.X(...)` with
+`s.repo.WithContext(ctx).X(...)`. Tests pass `context.Background()`.

 ### No APM (Application Performance Monitoring)

-No request-level profiling. We can't see "which endpoint has the highest
-p99 latency?" or "which SQL query is hot this week?"
+No continuous profiling. We can answer "which endpoint has the highest
+p99 latency?" from the histograms, but not "where in the call stack is
+the time going?" without ad-hoc `pprof` runs.

-Options: Datadog, New Relic, Honeycomb, self-hosted Tempo+Grafana.
-All are meaningful work to set up and cost $$$.
+If/when needed: Grafana Pyroscope is the OSS continuous profiler that
+fits our stack. Adds ~512 MB RAM. Defer until a CPU performance
+incident shows up.

 ## The app's logging conventions

@@ -172,28 +310,12 @@ The Go app uses zerolog and emits structured JSON:
 ```

 Log levels: `debug`, `info`, `warn`, `error`, `fatal`. Controlled by
-`DEBUG=true|false` in ConfigMap (true sets level to debug, false sets
-level to info).
+`DEBUG=true|false` in the ConfigMap (true sets level to debug, false
+sets level to info).

-Every request is logged with:
- Method, path, status code
- Request ID (for correlating logs across pods)
- User ID (if authenticated)
- Latency
-
-```json
-{
-  "level": "info",
-  "method": "GET",
-  "path": "/api/tasks/",
-  "status": 200,
-  "latency_ms": 42,
-  "user_id": 123,
-  "request_id": "a6b5db35-..."
-}
-```
-
-This is queryable by grep. Better with log aggregation.
+Every request is logged with method, path, status, request_id, user_id
+(if authenticated), and latency. Queryable by grep today; ready to ingest
+into Loki when we add it.

 ## Health endpoints

@@ -202,71 +324,58 @@ Each service exposes a health endpoint:
 | Service | Endpoint | What it checks |
 |---|---|---|
 | api | `/api/health/` | Process alive (doesn't verify DB) |
+| api | `/api/health/live` | Process alive |
 | admin | `/` | Next.js is up |
 | worker | (none public) | Internal Asynq status |
+| api | `/metrics` | Prometheus exposition (vmagent scrapes here) |
+| api | `/metrics/legacy` | Custom monitoring metrics for GoAdmin |

 Health endpoints are **shallow** — they return 200 if the process is
 running and listening. They don't try to reach Postgres/Redis/etc.
 Rationale: if Postgres is briefly down, we don't want all api pods to
 start failing liveness and cascade-restart.

-## Dozzle (deprecated)
+## obs.88oakapps.com — the ingest endpoint

-The Swarm era had [Dozzle](https://github.com/amir20/dozzle) — a
-lightweight web UI for Docker logs. Accessible via SSH tunnel to the
-manager node. Not deployed on k3s; `kubectl logs` + `stern` fills the
-niche.
+Public hostname for cross-cluster metric and trace ingest. Cloudflare
+in front, nginx on `88oakappsUpdate` enforces a bearer-token check
+before forwarding to the local VM/Jaeger containers.

-## Kubernetes metrics the k8s API exposes
+| Path | Forwards to | Purpose |
+|---|---|---|
+| `/api/v1/write` | `http://127.0.0.1:8428` | Prometheus remote-write (vmagent → VM) |
+| `/v1/traces` | `http://127.0.0.1:4318/v1/traces` | OTLP/HTTP traces (app → Jaeger) |
+| `/health` | (returns 200) | Reachability probe — also requires auth |
+| anything else | 404 | |

-Even without Prometheus, these are queryable:
+Token lives at `/etc/honeydue-obs/secrets.env` (mode 0600 on the box)
+and at `OBS_INGEST_TOKEN=` in `deploy/prod.env` (gitignored). To rotate:
+generate a new value, update both ends, restart vmagent.

 ```bash
-# Resource metrics (via metrics-server)
-kubectl get --raw /apis/metrics.k8s.io/v1beta1/nodes
-kubectl get --raw /apis/metrics.k8s.io/v1beta1/namespaces/honeydue/pods
-
-# Core API (k8s state)
-kubectl get --raw /api/v1/namespaces/honeydue/pods/<name>
-
-# Kubelet metrics (per-node; requires tunneling)
-kubectl get --raw /api/v1/nodes/<node>/proxy/metrics
+# Operator: rotate the bearer token
+NEW=$(openssl rand -hex 32)
+ssh 88oakappsUpdate "sudo sed -i 's|OBS_INGEST_TOKEN=.*|OBS_INGEST_TOKEN=$NEW|' /etc/honeydue-obs/secrets.env"
+ssh 88oakappsUpdate "sudo sed -i 's|Bearer [a-f0-9]\{64\}|Bearer $NEW|' /etc/nginx/sites-available/obs.88oakapps.com && sudo nginx -s reload"
+sed -i.bak "s|^OBS_INGEST_TOKEN=.*|OBS_INGEST_TOKEN=$NEW|" deploy/prod.env
+KUBECONFIG=~/.kube/honeydue.yaml kubectl -n honeydue create secret generic vmagent-remote-write \
+  --from-literal=bearer_token=$NEW --dry-run=client -o yaml | KUBECONFIG=~/.kube/honeydue.yaml kubectl apply -f -
+KUBECONFIG=~/.kube/honeydue.yaml kubectl -n honeydue rollout restart deploy/vmagent
 ```

-If we ever spin up Prometheus, these are the endpoints it would scrape.
+## Resource budget

-## Future: what to add and when
+| Service | mem_limit | Disk | Retention |
+|---|---|---|---|
+| VictoriaMetrics | 256 MB | 10 GB | 30 days |
+| Jaeger all-in-one (badger) | 256 MB | 10 GB | ~7 days |
+| Grafana OSS | 256 MB | 1 GB | — |
+| vmagent (in k3s) | 256 MB | 512 MB emptyDir | — |
+| **Total** | **~1 GB hard cap** | **~21 GB** | |

-| Trigger | Add |
-|---|---|
-| 10k+ daily users | Loki + Grafana for logs |
-| 100+ req/s sustained | Prometheus + Grafana for metrics |
-| Performance incidents | OpenTelemetry tracing |
-| Revenue > $5k/mo | Paid monitoring (Datadog or similar) |
-| First production outage | Alerting to phone/Slack |
-
-The overall philosophy: observability is an investment that compounds.
-Add it before you need it, not after. But also don't over-invest at
-idle.
-
-**Next quarter**: set up Uptime Kuma + Loki at minimum.
-
-## Checking what's installed
-
-```bash
-# In kube-system namespace
-kubectl get pods -n kube-system
-# Should see: coredns, metrics-server, traefik, local-path-provisioner,
-# and some k3s-related helm install jobs
-
-# In honeydue namespace
-kubectl get pods -n honeydue
-# api, admin, worker, redis
-
-# No monitoring namespace (yet)
-kubectl get namespaces
-# default, honeydue, kube-node-lease, kube-public, kube-system
-```
+Resident usage at idle is much lower (~90 MB on the obs side, ~30 MB
+for vmagent). Hard limits exist so a memory leak in any one component
+can't squeeze the cohabiting PostHog stack on `88oakappsUpdate`.

 ## Operator cheat sheet

@@ -274,32 +383,61 @@ kubectl get namespaces
 # Tail all logs in the namespace
 kubectl logs -n honeydue --all-containers=true --tail=50 -l app.kubernetes.io/part-of=honeydue

+# Scrape state from vmagent self-metrics
+kubectl -n honeydue exec deploy/vmagent -- wget -qO- http://127.0.0.1:8429/metrics \
+  | grep -E "scrapes_total|targets|remotewrite"
+
+# Force vmagent to reload scrape config
+kubectl -n honeydue rollout restart deploy/vmagent
+
+# Query VictoriaMetrics directly (PromQL)
+ssh 88oakappsUpdate 'curl -s "http://127.0.0.1:8428/api/v1/query?query=histogram_quantile(0.95,sum%20by%20(route,le)(rate(http_request_duration_seconds_bucket%5B5m%5D)))" | python3 -m json.tool'
+
+# Restart the obs stack on 88oakappsUpdate
+ssh 88oakappsUpdate 'cd /opt/honeydue-obs && sudo docker compose restart'
+
+# Live obs container memory
+ssh 88oakappsUpdate 'sudo docker stats --no-stream | grep honeydue-obs'
+
+# Pod resource usage (k3s side)
+kubectl top pods -n honeydue --sort-by=memory
+
 # With stern (if installed: brew install stern)
 stern -n honeydue .

-# Follow specific pod, including previous runs
-kubectl logs -n honeydue <pod> -f --previous=false
-
-# Pod resource usage
-kubectl top pods -n honeydue --sort-by=memory
-kubectl top pods -n honeydue --sort-by=cpu
-
-# Events (cluster-wide)
-kubectl get events -A --sort-by=.lastTimestamp | tail -20
-
 # Full state dump for a pod (debugging)
 kubectl describe pod -n honeydue <pod> > /tmp/pod-dump.txt
 kubectl logs -n honeydue <pod> > /tmp/pod-logs.txt
 ```

+## Future: what to add and when
+
+| Trigger | Add |
+|---|---|
+| First production incident | Grafana alerting (free, data already there) |
+| 10k+ daily users | Loki + Vector for log aggregation |
+| Performance incident the histograms can't explain | Wire OTel exporter → Jaeger from the Go app |
+| CPU pressure on api pods | Pyroscope continuous profiler |
+| Multi-product obs needs | Migrate obs stack to dedicated CX32 ($8/mo) |
+
+The overall philosophy: observability is an investment that compounds.
+Add it before you need it, not after. But also don't over-invest at
+idle.
+
 ## References

- [Kubernetes metrics-server][ms]
- [K3s metrics][k3s-metrics]
- [Loki][loki]
+- [VictoriaMetrics docs][vm]
+- [vmagent kubernetes_sd_configs][vmagent-k8s]
+- [Jaeger all-in-one with badger][jaeger]
+- [prometheus/client_golang][promclient]
+- [Grafana provisioning datasources][gf-prov]
+- [Loki][loki] (future)
 - [Stern (multi-pod log tail)][stern]

-[ms]: https://github.com/kubernetes-sigs/metrics-server
-[k3s-metrics]: https://docs.k3s.io/advanced#enabling-metrics-server
+[vm]: https://docs.victoriametrics.com/single-server-victoriametrics/
+[vmagent-k8s]: https://docs.victoriametrics.com/vmagent.html#kubernetes-monitoring-with-vmagent
+[jaeger]: https://www.jaegertracing.io/docs/1.62/getting-started/#all-in-one
+[promclient]: https://pkg.go.dev/github.com/prometheus/client_golang
+[gf-prov]: https://grafana.com/docs/grafana/latest/administration/provisioning/#datasources
 [loki]: https://grafana.com/oss/loki/
 [stern]: https://github.com/stern/stern
@@ -115,6 +115,41 @@ kubectl rollout restart deployment/coredns -n kube-system
 kubectl rollout restart deployment/metrics-server -n kube-system
 ```

+#### vmagent can't reach obs.88oakapps.com
+
+**Symptom**: dashboards stop updating; vmagent logs show 401 / TLS /
+network errors against `obs.88oakapps.com`. App is unaffected.
+**Recovery**: vmagent buffers up to 512 MB locally and replays on
+reconnect, so brief outages self-heal. If sustained:
+```bash
+# Is the obs endpoint up?
+curl -s -o /dev/null -w "%{http_code}\n" https://obs.88oakapps.com/health \
+  -H "Authorization: Bearer $(grep ^OBS_INGEST_TOKEN= deploy/prod.env | cut -d= -f2)"
+# 200 = ingest endpoint healthy.
+
+# Inspect vmagent's failure metric
+kubectl -n honeydue exec deploy/vmagent -- wget -qO- http://127.0.0.1:8429/metrics \
+  | grep -E "remotewrite_(packets|samples)_dropped|persistentqueue_blocks_dropped"
+
+# Restart vmagent (forces config reload + drains queue)
+kubectl -n honeydue rollout restart deploy/vmagent
+```
+**If 88oakappsUpdate itself is down** (PostHog runs there too):
+SSH and check `sudo docker compose -f /opt/honeydue-obs/docker-compose.yml ps`.
+**Non-critical**: nothing app-facing depends on the obs stack.
+
+#### Grafana dashboard shows "no data"
+
+**Possible causes, in order of frequency**:
+1. New histogram name — query targets a metric the api hasn't emitted
+   yet. Check `kubectl -n honeydue exec deploy/vmagent -- wget -qO- http://api:8000/metrics`
+   for the metric name.
+2. vmagent isn't scraping (see above).
+3. Time range is before the obs stack came up (2026-04-25). Adjust
+   the dashboard time picker.
+4. Cardinality blowup — VM rejected high-label-count series. Check
+   `vm_rows_inserted_total` vs `vm_rows_dropped_total` on the obs box.
+
 ### Networking failures

 #### UFW rule accidentally blocks essential traffic
@@ -210,12 +245,58 @@ finds an empty data directory (or can't mount at all).
 - If the original node is gone: Redis starts empty. Cache regenerates.
  Asynq queue state is lost; pending jobs re-queue on retry, cron
  fires re-schedule on next tick.
+- Auth caches (token + residence-IDs) regenerate on first user
+  request — first request per user pays full DB lookup, then warm
+  again. Visible as a brief latency spike in the Grafana RED
+  dashboard, not a functional failure.
 - Ensure the node label `honeydue/redis=true` is on a healthy node:
 ```bash
 kubectl label node <new-node> honeydue/redis=true --overwrite
 kubectl label node <dead-node> honeydue/redis- 2>/dev/null || true
 ```

+#### Stale residence-IDs cache (data freshness bug)
+
+**Symptom**: a user accepts a share-code or has a residence
+removed, but `/api/tasks/`, `/api/documents/`, `/api/contractors/`,
+or `/api/residences/summary/` continues to show the old
+membership for up to 5 minutes.
+**Cause**: a residence-membership-mutating code path landed
+without calling `cache.InvalidateResidenceIDsForUsers(...)`. The
+cache TTL is 5 min so the issue self-heals, but it's user-visible.
+**Recovery (immediate)**: flush the affected user's cache key
+manually. See [Chapter 17 §23 — Invalidate residence-IDs cache for a user](./17-runbook.md).
+**Prevention (permanent)**: every mutation that changes
+`residence_residence.owner_id`, `residence_residence_users.user_id`,
+or deletes a residence MUST invalidate. Existing call sites for
+reference: `CreateResidence` (owner), `DeleteResidence`
+(all members), `JoinWithCode` (joining user), `RemoveUser`
+(removed user). The pattern lives in
+`internal/services/residence_id_cache.go`.
+
+#### Redis at maxmemory limit
+
+**Symptom**: Redis logs `OOM command not allowed when used memory > 'maxmemory'`.
+Should be rare — current production usage is ~2.4 MB against a 256 MB
+limit and the policy is `allkeys-lru` (cache writes evict cold keys
+instead of erroring).
+**Recovery**: confirm the policy is still `allkeys-lru`:
+```bash
+kubectl -n honeydue exec deploy/redis -- redis-cli CONFIG GET maxmemory-policy
+```
+If it's somehow `noeviction`, set it live:
+```bash
+kubectl -n honeydue exec deploy/redis -- redis-cli CONFIG SET maxmemory-policy allkeys-lru
+```
+And re-apply the manifest at `deploy-k3s/manifests/redis/deployment.yaml`
+so the change survives a pod restart.
+
+If memory usage is genuinely climbing toward the cap, check for
+runaway keys without TTLs:
+```bash
+kubectl -n honeydue exec deploy/redis -- redis-cli --bigkeys
+```
+
 ### External service failures

 #### Neon Postgres outage
@@ -229,6 +310,72 @@ until Neon is back.
 Postgres-level failover.
 **Frequency**: Neon has had a handful of hours-scale outages since launch.

+#### Neon pooler endpoint unreachable but direct endpoint up
+
+**Symptom**: `dial tcp ep-floral-truth-amttbc5a-pooler.c-5...: i/o
+timeout` in api logs but the direct compute endpoint is reachable.
+Rare — Neon's pooler runs in their infra alongside compute — but
+possible during pooler maintenance.
+**Recovery (emergency)**: switch `DB_HOST` in `config.yaml` from the
+`-pooler` hostname to the direct hostname (drop the `-pooler` segment),
+re-apply ConfigMap, rolling-restart api and worker:
+```bash
+# Edit deploy-k3s/config.yaml: database.host: ep-floral-truth-amttbc5a.c-5...
+# Then:
+KUBECONFIG=~/.kube/honeydue.yaml bash deploy-k3s/scripts/03-deploy.sh --skip-build
+```
+Cold-handshake latency goes back up (~440ms first hit) but the API
+keeps serving. Switch back when the pooler recovers.
+
+#### Migrate Job fails during deploy
+
+**Symptom**: `03-deploy.sh` aborts at the migrations step:
+```
+[deploy][error] migrations did not complete cleanly; aborting deploy
+```
+api/worker pods are NOT updated — they keep running the previous
+revision. This is the intentional fail-fast.
+
+**Recovery**:
+```bash
+# 1. See the failure
+kubectl -n honeydue logs job/honeydue-migrate --tail=200
+
+# 2. Common cause: a SQL error in the migration file. Fix the file
+#    locally, commit, retry the deploy. The Job is idempotent —
+#    successful prior versions stay applied; only the failed file
+#    re-runs.
+git add migrations/000NNN_*.sql
+git commit -m "Fix migration NNN"
+git push gitea master
+bash deploy-k3s/scripts/03-deploy.sh
+
+# 3. Other cause: Neon down or auth changed. Test direct connection:
+DB_PASS=$(kubectl -n honeydue get secret honeydue-secrets \
+  -o jsonpath='{.data.POSTGRES_PASSWORD}' | base64 -d)
+docker run --rm -e PGPASSWORD="$DB_PASS" postgres:17-alpine \
+  psql "host=ep-floral-truth-amttbc5a.c-5.us-east-1.aws.neon.tech \
+        user=neondb_owner dbname=honeyDue sslmode=require" -c "SELECT 1;"
+```
+**Why no automatic retry**: `backoffLimit: 0` on the Job is deliberate.
+A failing migration almost never gets unstuck by retrying — it needs an
+operator to look. See [Chapter 17 §27](./17-runbook.md) for the recovery
+playbook.
+
+#### api refuses to start: "Schema precondition failed"
+
+**Symptom**: api pods log `Schema precondition failed` and exit
+immediately after DB connect.
+**Cause**: the `goose_db_version` table is missing or its latest row has
+`is_applied=false`. This means the migrate Job either was never run or
+ran and rolled back.
+**Recovery**: run the migrate Job manually (see
+[Chapter 17 §26](./17-runbook.md)). After it completes successfully,
+restart the api Deployment so its pods come up with a fresh schema check:
+```bash
+kubectl -n honeydue rollout restart deploy/api
+```
+
 #### Backblaze B2 outage

 **Symptom**: image uploads fail; image downloads fail unless cached by
@@ -358,6 +358,165 @@ Workaround: in each pod's logs, search for a unique user identifier:
 stern -n honeydue api | grep "user_id=12345"
 ```

+## 23. Invalidate residence-IDs cache for a user
+
+Used when a user reports stale data ("I joined a residence but my
+tasks list still shows the old one"). The cache is keyed on user ID
+with 5-min TTL — most issues self-heal — but you can flush manually.
+
+```bash
+# Single user
+kubectl -n honeydue exec deploy/redis -- redis-cli DEL "residence_ids_user:7"
+
+# All users (nuclear; everyone pays one DB lookup on next request)
+kubectl -n honeydue exec deploy/redis -- redis-cli --scan --pattern "residence_ids_user:*" \
+  | xargs -r -n 100 kubectl -n honeydue exec deploy/redis -- redis-cli DEL
+```
+
+Mutation paths that should invalidate this cache automatically (any
+new code that changes membership must call
+`cache.InvalidateResidenceIDsForUsers(ctx, userIDs...)`):
+
+- `ResidenceService.CreateResidence` → owner
+- `ResidenceService.DeleteResidence` → all members
+- `ResidenceService.JoinWithCode` → joining user
+- `ResidenceService.RemoveUser` → removed user
+
+If a user keeps reporting stale data, grep for missing invalidation:
+
+```bash
+grep -rn "residenceRepo.*Add\|RemoveUser\|residence_residence_users" internal/ \
+  | grep -v cache | grep -v _test
+```
+
+## 24. Verify DB pool warm-up is working
+
+After a deploy, check the api pod log for the warm-up confirmation:
+
+```bash
+kubectl -n honeydue logs -l app.kubernetes.io/name=api --tail=50 \
+  | grep "DB pool warm-up complete"
+```
+
+Expected output (per pod):
+
+```json
+{"level":"info","requested":20,"warmed":20,"message":"DB pool warm-up complete"}
+```
+
+If `warmed` < `requested`, the pool partially failed at boot — the pod
+still starts and the pool fills from there. If `warmed=0`, something's wrong with
+either Neon connectivity or auth — check the next log line for the
+specific error.
+
+To test impact: hit the api right after a rollout. With warm-up
+working, the first request should be ~250ms (1 RTT). Without warm-up,
+the first request is ~700ms (full handshake).
+
+## 25. Switch DB host between pooler and direct endpoints
+
+The pooler endpoint (`-pooler` suffix) is the default — it cuts
+cold-handshake latency by ~3 RTTs. The direct endpoint
+(`ep-floral-truth-amttbc5a.c-5...`) is the fallback.
+
+```bash
+# Edit deploy-k3s/config.yaml — change database.host
+# To pooler:   ep-floral-truth-amttbc5a-pooler.c-5.us-east-1.aws.neon.tech
+# To direct:   ep-floral-truth-amttbc5a.c-5.us-east-1.aws.neon.tech
+
+KUBECONFIG=~/.kube/honeydue.yaml bash deploy-k3s/scripts/03-deploy.sh --skip-build
+```
+
+The pooler runs in transaction mode so any session-scope feature
+(LISTEN/NOTIFY, session advisory locks) won't work over it. Migrations
+already handle this — the migrate Job script strips `-pooler` from
+`DB_HOST` before invoking goose. If you add new session-level features
+in the data path, they'll need the same workaround.
+
+## 26. Run migrations manually (rare)
+
+Day-to-day, migrations run as part of every `03-deploy.sh`. But
+sometimes you want to apply or inspect them outside a deploy:
+
+```bash
+# Direct-endpoint DSN (goose's advisory lock won't survive the pooler)
+DB_PASS=$(kubectl -n honeydue get secret honeydue-secrets \
+  -o jsonpath='{.data.POSTGRES_PASSWORD}' | base64 -d)
+export DATABASE_URL="host=ep-floral-truth-amttbc5a.c-5.us-east-1.aws.neon.tech \
+                     port=5432 user=neondb_owner password=$DB_PASS \
+                     dbname=honeyDue sslmode=require"
+
+# What's pending? (read-only; safe to run anytime)
+make migrate-status
+
+# Apply pending migrations (or `goose -dir migrations postgres "$DATABASE_URL" up`)
+make migrate-up
+
+# Roll back the most recent migration
+make migrate-down
+
+# Scaffold a new migration file
+make migrate-new name=add_widget_count_to_residences
+# → migrations/000002_add_widget_count_to_residences.sql
+# Edit, then `make migrate-up` to test, then commit.
+```
+
+To run goose from inside the cluster (e.g., to bypass a network policy
+that blocks Neon from your laptop), use the migrate Job manifest as a
+one-shot:
+
+```bash
+# Re-runs the latest migrate Job with whatever args you need
+kubectl -n honeydue delete job honeydue-migrate --ignore-not-found
+sed "s|image: IMAGE_PLACEHOLDER|image: $(kubectl -n honeydue get deploy api -o jsonpath='{.spec.template.spec.containers[0].image}')|" \
+  deploy-k3s/manifests/migrate/job.yaml | kubectl apply -f -
+kubectl -n honeydue wait --for=condition=complete --timeout=5m job/honeydue-migrate
+kubectl -n honeydue logs job/honeydue-migrate
+```
+
+## 27. Recover from a failed/dirty migration
+
+If `goose up` fails partway through, the migration file's transaction
+rolls back and `goose_db_version` reflects the last *complete*
+version. Goose marks no row as "dirty" — that's a golang-migrate
+concept. So recovery is just: fix the migration file, re-run.
+
+If you've genuinely corrupted state (dropped tables you shouldn't have,
+applied a destructive migration in error):
+
+```bash
+# See current goose state
+make migrate-status
+psql "$DATABASE_URL" -c \
+  "SELECT version_id, is_applied, tstamp FROM goose_db_version ORDER BY id DESC LIMIT 10;"
+
+# To force the version table back to a known-good number after
+# manually fixing the schema:
+psql "$DATABASE_URL" -c \
+  "INSERT INTO goose_db_version (version_id, is_applied, tstamp) VALUES (<N>, true, NOW());"
+```
+
+## 28. Bootstrap goose on a fresh clone of the schema
+
+If you create a new Neon branch / dev DB and need to bring it under
+goose management:
+
+```bash
+export DATABASE_URL="...<the new DB>..."
+
+# Option A: fresh DB, no schema → just run up
+make migrate-up
+
+# Option B: schema already populated (e.g., restored from a dump) →
+#          mark v1 as already-applied
+goose -dir migrations postgres "$DATABASE_URL" version  # creates table
+psql "$DATABASE_URL" -c \
+  "INSERT INTO goose_db_version (version_id, is_applied, tstamp) VALUES (1, true, NOW());"
+```
+
+This is also what was done for the live prod DB at goose-adoption time
+(commit `12b2f9d`).
+
 ## References

 - [kubectl cheat sheet][kubectl-cs]
@@ -58,6 +58,20 @@ honeyDue.
 |---|---:|
 | Gitea container registry | **$0** |

+### Observability (88oakappsUpdate)
+
+VictoriaMetrics + Jaeger + Grafana co-tenant on the existing Linode
+VPS that hosts PostHog. ~700 MB RAM, 21 GB disk — fits inside the
+existing instance. Not charged to honeyDue.
+
+| Item | Monthly |
+|---|---:|
+| Self-hosted obs stack on `88oakappsUpdate` | **$0** |
+
+Migration trigger: when the obs stack starts pressuring PostHog or
+needs hard isolation, move to a dedicated Hetzner CX32 (~$8/mo).
+See [Chapter 15 — When to move off](./15-observability.md).
+
 ### Total infrastructure

 | Category | Monthly |
@@ -67,6 +81,7 @@ honeyDue.
 | Storage | ~$0.30 |
 | Edge | $0 |
 | Registry | $0 |
+| Observability | $0 |
 | **Total** | **~$30** |

 ## External SaaS
@@ -397,6 +397,35 @@ should reflect reality, not be optimistic.
 **Moral**: Healthchecks should be realistic, not aspirational. Know
 what your app actually does at startup.

+#### Postscript (2026-04-26): the whole `MigrateWithLock` shape was wrong
+
+A few months after the Swarm migration, switching `DB_HOST` to Neon's
+`-pooler` endpoint for runtime perf wins broke this code completely:
+`pg_advisory_lock` is session-scoped, but PgBouncer transaction-mode
+multiplexes statements across backend Postgres sessions, so the lock
+appeared to be held but actually wasn't. Pods hung at
+"Acquiring migration advisory lock..." and the startup probe killed
+them in turn.
+
+After a brief band-aid (route migrations through the direct endpoint;
+bump probe to 600s to absorb 5-minute AutoMigrate runs over the slow
+direct connection — both reverted), we abandoned the runtime-side
+migration story entirely and adopted [pressly/goose](https://github.com/pressly/goose)
+in commit `12b2f9d`:
+
+- Migrations run as a one-shot Kubernetes Job before any api/worker
+  pod rolls. No more in-replica migration, no more advisory lock,
+  no more startup probe gymnastics.
+- `RequireSchemaApplied` checks `goose_db_version` at startup and
+  refuses to boot on a stale schema — fail-fast for "operator
+  forgot to run migrate," instead of mysterious runtime errors.
+- `failureThreshold` reverted to its pre-MigrateWithLock value.
+  Pods boot in seconds again.
+
+See [Chapter 8 §Schema management](./08-database.md) for the goose
+shape. This entire sub-section is preserved as historical context
+for why we walked the path we did.
+
 ## What we learned

 ### Docker Swarm is in a bad place in 2026
@@ -69,20 +69,22 @@ Flexible to Full (strict). Verified by:
 - CF edge continues to serve its own Let's Encrypt cert to browsers
 - both layers now TLS-encrypted

-### Migration Job for schema changes
+### ~~Migration Job for schema changes~~ — done (2026-04-26, commit 12b2f9d)

-**Why**: Currently every api pod runs `MigrateWithLock()` on startup,
-serializing on a Postgres advisory lock. Adds 90-240s to cold startup
-and caused bug #13 in Chapter 19.
+**What shipped**: pressly/goose as the migration tool, run as a one-shot
+Kubernetes Job from `deploy-k3s/manifests/migrate/job.yaml` before
+api/worker rollout. The Job uses the api image (goose CLI is baked in
+during the Dockerfile build), strips `-pooler` from `DB_HOST` for the
+direct-endpoint connection migrations need, and exits in seconds when
+there's nothing to apply. `RequireSchemaApplied` in the api/worker
+startup checks `goose_db_version` and fails fast on a stale schema.

-**How**: Create a Kubernetes `Job` resource that runs the api image
-with a `--migrate-only` flag. Job runs once per deploy, completes when
-schema is current. api pods get an initContainer that waits for the
-Job to complete.
+The Go-code-with-`--migrate-only` shape originally proposed here was
+rejected in favor of using the upstream goose binary directly — see
+[Chapter 8 §Schema management](./08-database.md) for the trade-offs.

-Requires Go code change to support `--migrate-only` flag.
-
-**Effort**: 3-4 hours (code + job manifest + testing).
+Pre-goose `MigrateWithLock` is gone; ch19 §13 has the historical
+postmortem context.

 ### Redis password

@@ -40,7 +40,7 @@ they do, and how to operate them.

 - [07 — Services](./07-services.md) — api, admin, worker, redis per-service deep dive
 - [08 — Database](./08-database.md) — Neon Postgres, advisory-lock migrations
- [09 — Storage](./09-storage.md) — Backblaze B2, minio-go client details
+- [09 — Storage](./09-storage.md) — Backblaze B2, minio-go, presigned-URL direct uploads
 - [10 — Secrets & Config](./10-secrets-config.md) — ConfigMap, Secret, env mapping
 - [11 — Registry](./11-registry.md) — Gitea container registry, multi-arch builds

@@ -48,7 +48,7 @@ they do, and how to operate them.

 - [12 — Data Flow](./12-data-flow.md) — end-to-end request lifecycle
 - [14 — Deployment Process](./14-deployment-process.md) — how to roll new code
- [15 — Observability](./15-observability.md) — logs, metrics, tracing
+- [15 — Observability](./15-observability.md) — VictoriaMetrics + Jaeger + Grafana on `obs.88oakapps.com`, vmagent in-cluster, Prometheus histograms in the Go API
 - [16 — Failure Modes](./16-failure-modes.md) — what happens when X dies
 - [17 — Runbook](./17-runbook.md) — common ops tasks

@@ -173,11 +173,21 @@ suffix. (Chapter 8)
 ## Go + Asynq

 **AutoMigrate**: GORM function that syncs DB schema to Go structs.
-(Chapter 8)
+We used this in production until 2026-04, when goose replaced it. Tests
+still use it via `testutil.SetupTestDB`. (Chapter 8)

 **Asynq**: Go library for background job queues. Redis-backed.
 (Chapter 7)

+**goose**: pressly/goose — the SQL migration tool we use in production
+(commit 12b2f9d onward). Migration files live in `migrations/`, one
+file per version with `-- +goose Up` / `-- +goose Down` markers.
+(Chapter 8)
+
+**goose_db_version**: goose's version-tracking table. One row per
+applied migration. `RequireSchemaApplied` reads the latest row at
+api/worker startup to fail fast on a stale schema. (Chapter 8)
+
 **GORM**: Go ORM we use. (Chapter 8)

 **pgx**: Go Postgres driver used by GORM. (Chapter 8)
@@ -278,6 +278,43 @@ ssh -i ~/.ssh/hetzner deploy@<node> 'sudo systemctl start k3s'
 # then re-join via the k3s install command
 ```

+## Observability
+
+```bash
+# Hit api /metrics from inside the cluster
+kubectl -n honeydue exec deploy/vmagent -- wget -qO- http://api:8000/metrics | head -30
+
+# vmagent self-stats: scrapes succeeded, samples shipped, queue health
+kubectl -n honeydue exec deploy/vmagent -- wget -qO- http://127.0.0.1:8429/metrics \
+  | grep -E "scrapes_total|targets|remotewrite_samples_dropped|persistentqueue_blocks_dropped"
+
+# Force vmagent to reload config (after editing the ConfigMap)
+kubectl -n honeydue rollout restart deploy/vmagent
+
+# Query VictoriaMetrics by SSH'ing to the obs box
+ssh 88oakappsUpdate 'curl -s "http://127.0.0.1:8428/api/v1/query?query=up"'
+
+# p95 latency by route, last 5m
+ssh 88oakappsUpdate 'curl -s "http://127.0.0.1:8428/api/v1/query?query=histogram_quantile(0.95,sum%20by%20(route,le)(rate(http_request_duration_seconds_bucket%5B5m%5D)))" | python3 -m json.tool'
+
+# All metric names landing in VM
+ssh 88oakappsUpdate 'curl -s http://127.0.0.1:8428/api/v1/label/__name__/values | python3 -m json.tool'
+
+# Restart the obs stack on 88oakappsUpdate (VM + Jaeger + Grafana)
+ssh 88oakappsUpdate 'cd /opt/honeydue-obs && sudo docker compose restart'
+
+# Live RAM usage of the obs containers
+ssh 88oakappsUpdate 'sudo docker stats --no-stream | grep honeydue-obs'
+
+# Test the obs ingest endpoint with auth
+TOKEN=$(grep ^OBS_INGEST_TOKEN= deploy/prod.env | cut -d= -f2)
+curl -s -o /dev/null -w "%{http_code}\n" https://obs.88oakapps.com/health \
+  -H "Authorization: Bearer $TOKEN"  # 200 = healthy
+```
+
+Dashboards live at `https://grafana.88oakapps.com/d/honeydue-red`.
+Admin credentials in `deploy/prod.env`.
+
 ## One-liners worth memorizing

 ```bash
@@ -65,7 +65,9 @@ Every external link cited anywhere in this book, grouped by topic.
 - [Neon usage-based pricing announcement][neon-blog]
 - [Neon connect from any app][neon-connect]
 - [Postgres advisory locks][pg-locks]
- [GORM AutoMigrate][gorm-automigrate]
+- [GORM AutoMigrate][gorm-automigrate] (tests only — production migrations use goose)
+- [pressly/goose — SQL migration tool][goose]
+- [Goose documentation][goose-docs]

 ## Backblaze B2

@@ -168,6 +170,8 @@ Every external link cited anywhere in this book, grouped by topic.
 [neon-connect]: https://neon.com/docs/connect/connect-from-any-app
 [pg-locks]: https://www.postgresql.org/docs/current/explicit-locking.html#ADVISORY-LOCKS
 [gorm-automigrate]: https://gorm.io/docs/migration.html
+[goose]: https://github.com/pressly/goose
+[goose-docs]: https://pressly.github.io/goose/

 <!-- B2 -->
 [b2-docs]: https://www.backblaze.com/docs/
@@ -0,0 +1,166 @@
+# Observability Plan — honeyDue (100% self-hosted)
+
+**Goal:** Live request-timing visibility (HTTP, DB, B2 uploads, APNs, asynq jobs) without paying any SaaS vendor.
+
+**Deployment target:** `88oakappsUpdate` (Linode VPS at `185.143.228.16`, Ubuntu 24.04, 8 vCPU / 32 GB RAM / 193 GB disk). This box already runs the self-hosted PostHog stack and has nginx + Let's Encrypt set up for `*.88oakapps.com`. Free RAM at rest ≈ 15 GB; the obs stack budget is ≈ 700 MB → ~5% of free RAM. Costs $0 incremental.
+
+**Why not in the honeyDue k3s cluster:** Frees ~700 MB across the 3 Hetzner nodes, no PVC plumbing, and no need to expose anything from k3s — everything is push-from-app to a public TLS endpoint.
+
+**Status:** Fully shipped. VictoriaMetrics + Jaeger + Grafana on `obs.88oakapps.com`, vmagent in-cluster, OTel SDK and otelgorm wired into the api+worker, every authed endpoint produces nested HTTP→service→SQL flame graphs in Jaeger.
+
+The first round of traces revealed every visible ms was network/proxy overhead — DB execution itself is sub-millisecond. The follow-up work (`internal/services/residence_id_cache.go`, GORM pool warm-up, auth-query JOIN consolidation, switching `DB_HOST` to Neon's `-pooler` endpoint, bumped cache TTLs) cut warm-cache `/api/tasks/` from 2,473 ms / 5 spans to **229 ms / 2 spans** — see commit `88fb175` and Chapter 8 §"Optimizations layered on top".
+
+---
+
+## Stack
+
+| Role | Choice | Why this vs. the obvious alternative |
+|---|---|---|
+| Metrics store | **VictoriaMetrics** (single-node) | Drop-in Prometheus-compatible. ~4× lower RAM (~200 MB vs ~500 MB) and ~7× better compression. Single binary. |
+| Tracing | **Jaeger all-in-one** | ~150 MB RAM with embedded badger storage. Tempo monolithic mode needs 1-2 GB minimum — overkill for honeyDue's scale. |
+| Dashboards | **Grafana OSS** | Connects to both VM (Prometheus protocol) and Jaeger natively. |
+| App instrumentation | **OpenTelemetry SDK** + `prometheus/client_golang` | OTel is vendor-neutral — backends are swappable without code change. |
+| Logs | **Keep Dozzle**; add Loki only when log search becomes painful | Loki adds ~512 MB RAM + a daemonset for log shipping. Not worth it until there's a concrete pain point. |
+
+### Why not the LGTM stack (Loki + Grafana + Tempo + Mimir)?
+
+- **Tempo** wants 1-2 GB RAM minimum in monolithic mode ([Grafana community report](https://community.grafana.com/t/tempo-ram-usage-for-6k-spans-per-hour/63801)). Stacking that on top of Loki + Mimir would consume ~3-4 GB RAM. On a 3×8 GB cluster that's 12-17% of capacity for observability infra.
+- **Mimir** is wonderful for multi-tenant Prometheus at scale — you have one tenant.
+- **Loki** is great if you live in `kubectl logs` and need full-text search across them. You currently use Dozzle and are not feeling that pain.
+
+VictoriaMetrics + Jaeger all-in-one gives you 90% of the value at 25% of the resource cost.
+
+---
+
+## Resource budget on `88oakappsUpdate`
+
+Three Docker containers in a separate compose project under `/opt/honeydue-obs/` — fully isolated from the existing PostHog compose stack so PostHog's lifecycle never touches the obs stack and vice versa.
+
+| Service | `mem_limit` | Disk (bind mount) | Retention |
+|---|---|---|---|
+| VictoriaMetrics single-node | 256 MB | 10 GB | 30 days metrics |
+| Jaeger all-in-one (badger storage) | 256 MB | 10 GB | 7 days traces |
+| Grafana OSS | 256 MB | 1 GB | — |
+| **Total** | **~768 MB hard cap** | **21 GB** | |
+
+**~5% of the box's free RAM and ~14% of free disk.** The hard `mem_limit` per container matters: ClickHouse on the same VM can spike under PostHog analytics load, so bounding the obs stack prevents it from competing in a memory pinch.
+
+**Don't reuse PostHog's ClickHouse / Kafka / Redis.** Tempting because they're sitting right there, but coupling honeyDue's observability to PostHog's storage means a PostHog incident takes honeyDue's incident-response telemetry down with it. Keep them fully separate.
+
+**Shared blast radius caveat:** A kernel panic on `88oakappsUpdate` loses both PostHog and honeyDue obs at once. At current scale, fine — call it out, don't fix.
+
+---
+
+## App-side instrumentation
+
+| Surface | Library / approach | Import path |
+|---|---|---|
+| Echo HTTP middleware | `otelecho` — span per request, tagged route/method/status | `go.opentelemetry.io/contrib/instrumentation/github.com/labstack/echo/otelecho` |
+| GORM queries | `uptrace/otelgorm` plugin — `db.Use(otelgorm.NewPlugin())`. Requires threading `ctx` through repositories so `db.WithContext(ctx)` works. | `github.com/uptrace/opentelemetry-go-extra/otelgorm` |
+| B2 / minio-go uploads | Manual span around `storage_service.Upload` with attributes for bucket, object size, MIME type | `go.opentelemetry.io/otel` |
+| APNs / FCM | Manual span in `internal/push/apns.go` and `fcm.go`; record a hash of the device token (never the raw token) and the response status code | `go.opentelemetry.io/otel` |
+| asynq jobs | Custom `asynq.MiddlewareFunc` (~20 lines) — span per task type, attached to ctx, records duration + retry count | `go.opentelemetry.io/otel` + `asynq.MiddlewareFunc` |
+| Prometheus `/metrics` endpoint | `prometheus/client_golang` direct — register histograms for HTTP duration / GORM op / B2 op / APNs send | `github.com/prometheus/client_golang/prometheus`, `.../prometheus/promhttp` |
+| OTLP exporter | OTLP/HTTP → `https://obs.88oakapps.com/v1/traces` with bearer token. 100% sample in dev, 10% in prod. | `go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp` |
+| Metrics push | `vmagent` sidecar in k3s scrapes the api Pod's `/metrics` and remote-writes to `https://obs.88oakapps.com/api/v1/write` with bearer token. Cleaner than exposing `/metrics` publicly. | `victoriametrics/vmagent` image |
+
+**Note on GORM context propagation:** the existing repository methods don't take `ctx context.Context`. Adding `otelgorm` requires plumbing ctx down from the Echo handler through the service layer to the repository call site. ~10 repository files, many call sites. Save for last because the diff is large.
+
+---
+
+## Implementation order (smallest first)
+
+### Step 1 — Metrics + dashboards (highest immediate ROI)
+
+**On `88oakappsUpdate`:**
+1. `mkdir -p /opt/honeydue-obs/{data/vm,data/jaeger,data/grafana}` and a `docker-compose.yml` defining the three services with `mem_limit: 256m`, bind mounts for persistence, and an isolated bridge network
+2. Add nginx vhosts (DNS A records first):
+   - `grafana.88oakapps.com` → `127.0.0.1:3000` (basic auth via htpasswd, Let's Encrypt)
+   - `obs.88oakapps.com` → routes by path:
+     - `/api/v1/write` → `127.0.0.1:8428` (VictoriaMetrics remote-write, bearer-token check)
+     - `/v1/traces`     → `127.0.0.1:4318` (OTLP/HTTP traces, bearer-token check)
+3. Generate a 32-byte token, store in `/etc/honeydue-obs/token` (mode 0600), reference from nginx as `auth_request` or simple `if ($http_authorization != ...)`
+4. Pre-provision Grafana with the VM datasource pointing at `http://victoriametrics:8428` (in-network)
+
+**On the honeyDue k3s cluster:**
+5. Add `prometheus/client_golang` to `honeyDueAPI-go/go.mod` and a `/metrics` endpoint to the Go API
+6. Register histograms:
+   - `http_request_duration_seconds{route,method,status}` via Echo middleware
+   - `gorm_query_duration_seconds{table,operation}` via a GORM `Plugin` callback (no ctx needed for this one — operates at the SQL string level)
+   - `b2_upload_duration_seconds{bucket,result}`
+   - `apns_send_duration_seconds{result}`
+7. Deploy a `vmagent` sidecar (or DaemonSet) in the `honeydue` namespace with:
+   - Scrape: each api Pod's `/metrics` every 15s (per-Pod targets rather than the load-balanced Service, so every replica is scraped)
+   - `remote_write.url`: `https://obs.88oakapps.com/api/v1/write`
+   - `remote_write.bearer_token`: from k8s Secret
+8. Build the RED dashboard in Grafana: rate, errors, duration p50/p95/p99 per route
+
+**ROI:** "Is the API healthy? Where is time being spent right now?" answered live, served from `grafana.88oakapps.com`.
+
+### Step 2 — Tracing baseline
+
+(Jaeger is already up from Step 1. This step adds the app-side wiring.)
+
+1. Add Grafana datasource for Jaeger pointing at `http://jaeger:16686` (in-network)
+2. Wire OTel SDK in `cmd/api/main.go`:
+   - `otel.SetTracerProvider(tracerProvider)`
+   - `otelecho.Middleware("honeydue-api")` on Echo
+   - OTLP/HTTP exporter pointing at `https://obs.88oakapps.com/v1/traces` with `Authorization: Bearer <token>` header (token from env)
+   - Sampling: `TraceIDRatioBased(0.1)` in prod, `AlwaysSample()` in dev
+3. Verify: a single `POST /api/auth/login/` produces a trace in Jaeger
+
+**ROI:** "Why is this one request slow?" — answered with a flame graph.
+
+### Step 3 — Manual spans for the work that actually matters
+
+Wrap each in `tracer.Start(ctx, ...)` with attributes:
+- `storage_service.Upload` → span "b2.PutObject" with `bucket`, `key`, `size_bytes`, result
+- `push/apns.go` → span "apns.send" with `device_token_hash`, `status_code`, `reason`
+- `asynq` middleware → span per task type with `task.type`, `retry_count`, `payload_size`
+
+**ROI:** Specific high-value debugging questions ("why did this upload take 30 seconds", "why did these 5 push notifications fail") answered without code archaeology.
+
+### Step 4 — Repository ctx + `otelgorm` (biggest diff, save for last)
+
+1. Refactor every repository method to accept `ctx context.Context` as first arg
+2. Update every call site to pass `c.Request().Context()` from handlers / propagate through services
+3. Add `db.Use(otelgorm.NewPlugin())` in `internal/database/database.go`
+4. Verify: a request now has nested spans `http → service → query → query → b2.PutObject → apns.send` with full SQL on the query spans
+
+**ROI:** Every DB query in every trace, with SQL + table + rows. The "find the N+1" tool you'd otherwise build by hand.
+
+---
+
+## Hard skips (revisit only when explicitly proven needed)
+
+| Tool | Why skip |
+|---|---|
+| Loki / Promtail | Dozzle covers the immediate need. Loki adds ~512 MB RAM + a daemonset; defer until log search becomes a hot pain point. |
+| Mimir / VM cluster mode | Single-node VM handles honeyDue scale for years. |
+| Pyroscope continuous profiling | Overkill at 3 small nodes. Use `pprof` endpoints ad-hoc when CPU pressure shows up. |
+| OTel Collector | Only worth running when 3+ services emit telemetry. App → Jaeger direct is fine for now. |
+| Any SaaS vendor (Datadog, NR, Honeycomb, Grafana Cloud, Sentry Performance) | User constraint: nothing paid. |
+
+---
+
+## When to move off `88oakappsUpdate`
+
+Triggers — any one is enough:
+- `88oakappsUpdate` available memory drops below ~3 GB sustained (PostHog growth squeezing it)
+- ClickHouse OOM events start showing up in `dmesg` (PostHog under load)
+- You want fully separate failure domains for honeyDue vs. 88oakapps
+
+Migration path: the obs stack is a single docker-compose project on a bind-mount, so moving it = `rsync /opt/honeydue-obs/` to a new box, recreate the nginx vhosts (with their Let's Encrypt certs) and the ingest token file (`/etc/honeydue-obs/token`) there, update DNS for `grafana.88oakapps.com` and `obs.88oakapps.com`, `docker compose up -d`. ~30 min of work. Until then: cohabiting on `88oakappsUpdate` is correct.
+
+---
+
+## Quick reference: what shows up where
+
+| Question | Where to look |
+|---|---|
+| Is the API up right now? Latency? Errors? | Grafana RED dashboard |
+| Why is this specific request slow? | Jaeger trace view |
+| What did the slow part of that request actually do (which SQL, which B2 PUT)? | Span details inside the trace |
+| Background job throughput / queue depth | VictoriaMetrics + asynq metrics |
+| What did the app print to stdout 5 minutes ago? | Dozzle |
+| What error did the app log? | Dozzle (search) — or Loki if/when added |
@@ -0,0 +1,146 @@
+# Runbook — Secret Rotation
+
+Closes audit finding `K3S-F12` (secrets unrotated since cluster bootstrap,
+no rotation cadence). See `deploy-k3s/SECURITY.md` Stage 2.
+
+**Cadence:** rotate every secret at least **annually**. Rotate
+**immediately** on suspected exposure, on an operator-device loss, or when
+anyone who has seen a secret leaves the project.
+
+**Record keeping:** after each rotation, annotate the secret so the age is
+visible:
+
+```bash
+kubectl -n honeydue annotate secret <name> \
+  honeydue.dev/last-rotated="$(date -u +%Y-%m-%d)" --overwrite
+```
+
+---
+
+## How rotation works
+
+Every secret has a **source of truth** on the operator workstation. The
+deploy scripts read those sources and (re)create the Kubernetes Secrets.
+Rotation is always: **update the source → re-run `02-setup-secrets.sh` →
+restart the pods that consume it → revoke the old credential at its
+provider.**
+
+`02-setup-secrets.sh` uses `kubectl apply` (via `--dry-run=client -o yaml`),
+so re-running it is idempotent and only changes what you changed.
+
+| Kubernetes Secret | Source of truth | Consumed by |
+|---|---|---|
+| `honeydue-secrets` → `POSTGRES_PASSWORD` | `deploy-k3s/secrets/postgres_password.txt` | api, worker |
+| `honeydue-secrets` → `SECRET_KEY` | `deploy-k3s/secrets/secret_key.txt` | api, worker |
+| `honeydue-secrets` → `EMAIL_HOST_PASSWORD` | `deploy-k3s/secrets/email_host_password.txt` | api, worker |
+| `honeydue-secrets` → `FCM_SERVER_KEY` | `deploy-k3s/secrets/fcm_server_key.txt` | api, worker |
+| `honeydue-secrets` → `REDIS_PASSWORD` | `config.yaml` key `redis.password` | api, worker, redis |
+| `honeydue-secrets` → `OBS_INGEST_TOKEN` | `deploy/prod.env` | api, worker |
+| `honeydue-apns-key` → `apns_auth_key.p8` | `deploy-k3s/secrets/apns_auth_key.p8` | api, worker |
+| `cloudflare-origin-cert` | `deploy-k3s/secrets/cloudflare-origin.{crt,key}` | Traefik ingress |
+| `ghcr-credentials` | `config.yaml` block `registry.*` | image pulls (all pods) |
+| `admin-basic-auth` | `config.yaml` keys `admin.basic_auth_user` / `..._password` | Traefik `admin-auth` middleware |
+
+The `deploy-k3s/secrets/` directory and `config.yaml` are **gitignored** —
+never commit them.
+
+---
+
+## Standard rotation procedure
+
+```bash
+cd honeyDueAPI-go
+export KUBECONFIG="$(pwd)/deploy-k3s/kubeconfig"
+
+# 1. Update the source (file under deploy-k3s/secrets/, a config.yaml key,
+#    or deploy/prod.env for OBS_INGEST_TOKEN)
+# 2. Recreate the Kubernetes Secrets from sources
+./deploy-k3s/scripts/02-setup-secrets.sh
+
+# 3. Restart the consumers (see per-secret notes below for which)
+kubectl -n honeydue rollout restart deploy/api deploy/worker
+
+# 4. Confirm health
+kubectl -n honeydue rollout status deploy/api
+kubectl -n honeydue rollout status deploy/worker
+
+# 5. Revoke the OLD credential at its provider (see per-secret notes)
+# 6. Annotate the rotated secret with today's date
+```
+
+---
+
+## Per-secret notes
+
+### `POSTGRES_PASSWORD`
+1. Rotate the role password in the Neon dashboard.
+2. Write the new value to `deploy-k3s/secrets/postgres_password.txt`.
+3. `02-setup-secrets.sh`, then `rollout restart deploy/api deploy/worker`.
+4. Watch logs for connection errors; the old password stops working the
+   moment Neon applies the change, so do steps 2–3 promptly.
+
+### `SECRET_KEY`  ⚠️ user-visible
+This signs auth tokens. **Rotating it logs every user out** — all existing
+tokens become invalid and every client must re-authenticate.
+1. Generate: `openssl rand -hex 32`.
+2. Write to `deploy-k3s/secrets/secret_key.txt` (must be ≥32 chars — the
+   script enforces this; the app refuses to start in production without it).
+3. `02-setup-secrets.sh`, then `rollout restart deploy/api deploy/worker`.
+- Only rotate on a schedule or on suspected compromise — not casually.
+- A future improvement (overlap window via a key-id header) would let old
+  tokens validate during the transition; not implemented today.
+
+### `EMAIL_HOST_PASSWORD`
+1. Generate a new app password in Fastmail; keep the old one alive briefly.
+2. Write to `deploy-k3s/secrets/email_host_password.txt`.
+3. `02-setup-secrets.sh`, `rollout restart deploy/api deploy/worker`.
+4. Delete the old Fastmail app password.
+
+### `FCM_SERVER_KEY`
+1. Rotate the key in the Firebase console.
+2. Write to `deploy-k3s/secrets/fcm_server_key.txt`.
+3. `02-setup-secrets.sh`, `rollout restart deploy/api deploy/worker`.
+
+### `REDIS_PASSWORD`
+Source is `config.yaml` key `redis.password` (hex only — it is embedded in
+the `REDIS_URL`, so non-hex characters would break URL parsing).
+1. Generate: `openssl rand -hex 32`.
+2. Set `redis.password` in `config.yaml`.
+3. `02-setup-secrets.sh`.
+4. Restart **redis as well as** api/worker so the new `--requirepass` and
+   the new `REDIS_URL` land together:
+   `kubectl -n honeydue rollout restart deploy/redis deploy/api deploy/worker`.
+   Expect a few seconds where api/worker reconnect.
+
+### `apns_auth_key.p8`
+1. Generate a new `.p8` key in the Apple Developer console; leave the old
+   key active until the new one is deployed.
+2. Replace `deploy-k3s/secrets/apns_auth_key.p8`. If the Key ID changed,
+   update `push.apns_key_id` in `config.yaml` too.
+3. `02-setup-secrets.sh`, `rollout restart deploy/api deploy/worker`.
+4. Revoke the old key in the Apple Developer console.
+
+### `cloudflare-origin-cert`
+1. Generate a new Origin CA certificate in the Cloudflare dashboard.
+2. Replace `deploy-k3s/secrets/cloudflare-origin.crt` and `.key`.
+3. `02-setup-secrets.sh`. Traefik picks up the new TLS secret; no app
+   restart needed. Verify the served cert with `openssl s_client`.
+
+### `ghcr-credentials` (Gitea registry)
+1. Generate a new PAT in Gitea (scope: `read:packages`).
+2. Update the `registry.token` value in `config.yaml`.
+3. `02-setup-secrets.sh`. No restart needed unless a pull is pending.
+4. Revoke the old PAT in Gitea.
+
+### `admin-basic-auth`
+Source is `config.yaml` keys `admin.basic_auth_user` / `admin.basic_auth_password`.
+1. Set a new password (e.g. `openssl rand -hex 24`).
+2. `02-setup-secrets.sh` regenerates the bcrypt htpasswd secret.
+3. No app restart needed — Traefik reloads the `admin-auth` middleware.
+4. Distribute the new credential to whoever uses the admin panel.
+
+---
+
+## After any rotation
+
+- Run `./deploy-k3s/scripts/04-verify.sh` and confirm no `✗` lines.
+- Annotate the rotated secret (see "Record keeping" above).
+- If the rotation was due to a compromise, also follow the relevant
+  playbook in `deploy-k3s/SECURITY.md` → Appendix (Incident response).
@@ -1,6 +1,6 @@
 module github.com/treytartt/honeydue-api

-go 1.25
+go 1.25.0

 require (
 	github.com/go-pdf/fpdf v0.9.0
@@ -9,9 +9,10 @@ require (
 	github.com/google/uuid v1.6.0
 	github.com/gorilla/websocket v1.5.3
 	github.com/hibiken/asynq v0.25.1
-	github.com/labstack/echo/v4 v4.11.4
+	github.com/labstack/echo/v4 v4.15.1
 	github.com/minio/minio-go/v7 v7.0.99
 	github.com/nicksnyder/go-i18n/v2 v2.6.0
+	github.com/prometheus/client_golang v1.23.2
 	github.com/redis/go-redis/v9 v9.17.1
 	github.com/rs/zerolog v1.34.0
 	github.com/shirou/gopsutil/v3 v3.24.5
@@ -20,11 +21,17 @@ require (
 	github.com/spf13/viper v1.20.1
 	github.com/stretchr/testify v1.11.1
 	github.com/stripe/stripe-go/v81 v81.4.0
+	github.com/uptrace/opentelemetry-go-extra/otelgorm v0.3.2
 	github.com/wneessen/go-mail v0.7.2
-	golang.org/x/crypto v0.46.0
-	golang.org/x/oauth2 v0.34.0
-	golang.org/x/text v0.32.0
-	golang.org/x/time v0.14.0
+	go.opentelemetry.io/contrib/instrumentation/github.com/labstack/echo/otelecho v0.68.0
+	go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0
+	go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0
+	go.opentelemetry.io/otel/sdk v1.43.0
+	golang.org/x/crypto v0.51.0
+	golang.org/x/oauth2 v0.35.0
+	golang.org/x/term v0.43.0
+	golang.org/x/text v0.37.0
+	golang.org/x/time v0.15.0
 	google.golang.org/api v0.257.0
 	gopkg.in/yaml.v3 v3.0.1
 	gorm.io/driver/postgres v1.6.0
@@ -33,17 +40,28 @@ require (
 )

 require (
+	github.com/beorn7/perks v1.0.1 // indirect
+	github.com/cenkalti/backoff/v5 v5.0.3 // indirect
 	github.com/dustin/go-humanize v1.0.1 // indirect
 	github.com/go-ini/ini v1.67.0 // indirect
+	github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 // indirect
 	github.com/klauspost/compress v1.18.2 // indirect
 	github.com/klauspost/cpuid/v2 v2.2.11 // indirect
 	github.com/klauspost/crc32 v1.3.0 // indirect
 	github.com/minio/crc64nvme v1.1.1 // indirect
 	github.com/minio/md5-simd v1.1.2 // indirect
+	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
 	github.com/philhofer/fwd v1.2.0 // indirect
+	github.com/prometheus/client_model v0.6.2 // indirect
+	github.com/prometheus/common v0.66.1 // indirect
+	github.com/prometheus/procfs v0.16.1 // indirect
 	github.com/rs/xid v1.6.0 // indirect
 	github.com/tinylib/msgp v1.6.1 // indirect
+	github.com/uptrace/opentelemetry-go-extra/otelsql v0.3.2 // indirect
+	go.opentelemetry.io/proto/otlp v1.10.0 // indirect
+	go.yaml.in/yaml/v2 v2.4.2 // indirect
 	go.yaml.in/yaml/v3 v3.0.4 // indirect
+	google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 // indirect
 )

 require (
@@ -51,7 +69,7 @@ require (
 	cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect
 	cloud.google.com/go/compute/metadata v0.9.0 // indirect
 	github.com/cespare/xxhash/v2 v2.3.0 // indirect
-	github.com/davecgh/go-spew v1.1.1 // indirect
+	github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
 	github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect
 	github.com/felixge/httpsnoop v1.0.4 // indirect
 	github.com/fsnotify/fsnotify v1.9.0 // indirect
@@ -62,7 +80,6 @@ require (
 	github.com/go-playground/locales v0.14.1 // indirect
 	github.com/go-playground/universal-translator v0.18.1 // indirect
 	github.com/go-viper/mapstructure/v2 v2.4.0 // indirect
-	github.com/golang-jwt/jwt v3.2.2+incompatible // indirect; TODO(S-19): Pulled by echo/v4 middleware — upgrade Echo to v4.12+ which removes built-in JWT middleware (uses echo-jwt/v4 with jwt/v5 instead), eliminating this vulnerable transitive dep
 	github.com/golang-jwt/jwt/v4 v4.5.2 // indirect
 	github.com/google/s2a-go v0.1.9 // indirect
 	github.com/googleapis/enterprise-certificate-proxy v0.3.7 // indirect
@@ -76,11 +93,11 @@ require (
 	github.com/labstack/gommon v0.4.2 // indirect
 	github.com/leodido/go-urn v1.4.0 // indirect
 	github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect
-	github.com/mattn/go-colorable v0.1.13 // indirect
+	github.com/mattn/go-colorable v0.1.14 // indirect
 	github.com/mattn/go-isatty v0.0.20 // indirect
 	github.com/mattn/go-sqlite3 v2.0.3+incompatible // indirect
 	github.com/pelletier/go-toml/v2 v2.2.4 // indirect
-	github.com/pmezard/go-difflib v1.0.0 // indirect
+	github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
 	github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect
 	github.com/robfig/cron/v3 v3.0.1 // indirect
 	github.com/sagikazarmark/locafero v0.9.0 // indirect
@@ -97,13 +114,13 @@ require (
 	github.com/yusufpapurcu/wmi v1.2.4 // indirect
 	go.opentelemetry.io/auto/sdk v1.2.1 // indirect
 	go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 // indirect
-	go.opentelemetry.io/otel v1.38.0 // indirect
-	go.opentelemetry.io/otel/metric v1.38.0 // indirect
-	go.opentelemetry.io/otel/trace v1.38.0 // indirect
-	golang.org/x/net v0.48.0 // indirect
-	golang.org/x/sync v0.19.0 // indirect
-	golang.org/x/sys v0.39.0 // indirect
-	google.golang.org/genproto/googleapis/rpc v0.0.0-20251124214823-79d6a2a48846 // indirect
-	google.golang.org/grpc v1.77.0 // indirect
-	google.golang.org/protobuf v1.36.10 // indirect
+	go.opentelemetry.io/otel v1.43.0
+	go.opentelemetry.io/otel/metric v1.43.0 // indirect
+	go.opentelemetry.io/otel/trace v1.43.0
+	golang.org/x/net v0.53.0 // indirect
+	golang.org/x/sync v0.20.0
+	golang.org/x/sys v0.44.0 // indirect
+	google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 // indirect
+	google.golang.org/grpc v1.80.0 // indirect
+	google.golang.org/protobuf v1.36.11 // indirect
 )
@@ -8,16 +8,20 @@ github.com/BurntSushi/toml v1.5.0 h1:W5quZX/G/csjUnuI8SUYlsHs9M38FC7znL0lIO+DvMg
 github.com/BurntSushi/toml v1.5.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho=
 github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
 github.com/alecthomas/units v0.0.0-20201120081800-1786d5ef83d4/go.mod h1:OMCwj8VM1Kc9e19TLln2VL61YJF0x1XFtfdL4JdbSyE=
+github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
+github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
 github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs=
 github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c=
 github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA=
 github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0=
+github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM=
+github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw=
 github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
 github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
 github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
-github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
-github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
+github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78=
 github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc=
 github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
@@ -52,8 +56,6 @@ github.com/go-playground/validator/v10 v10.23.0/go.mod h1:dbuPbCMFw/DrkbEynArYaC
 github.com/go-viper/mapstructure/v2 v2.4.0 h1:EBsztssimR/CONLSZZ04E8qAkxNYq4Qp9LvH92wZUgs=
 github.com/go-viper/mapstructure/v2 v2.4.0/go.mod h1:oJDH3BJKyqBA2TXFhDsKDGDTlndYOZ6rGS0BRZIxGhM=
 github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
-github.com/golang-jwt/jwt v3.2.2+incompatible h1:IfV12K8xAKAnZqdXVzCZ+TOjboZ2keLg81eXfW3O+oY=
-github.com/golang-jwt/jwt v3.2.2+incompatible/go.mod h1:8pz2t5EyA70fFQQSrl6XZXzqecmYZeUEB8OUGHkxJ+I=
 github.com/golang-jwt/jwt/v4 v4.4.1/go.mod h1:m21LjoU+eqJr34lmDMbreY2eSTRJ1cv77w39/MY0Ch0=
 github.com/golang-jwt/jwt/v4 v4.5.2 h1:YtQM7lnr8iZ+j5q71MGKkNw9Mn7AjHM68uc9g5fXeUI=
 github.com/golang-jwt/jwt/v4 v4.5.2/go.mod h1:m21LjoU+eqJr34lmDMbreY2eSTRJ1cv77w39/MY0Ch0=
@@ -74,6 +76,8 @@ github.com/googleapis/gax-go/v2 v2.15.0 h1:SyjDc1mGgZU5LncH8gimWo9lW1DtIfPibOG81
 github.com/googleapis/gax-go/v2 v2.15.0/go.mod h1:zVVkkxAQHa1RQpg9z2AUCMnKhi0Qld9rcmyfL1OZhoc=
 github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg=
 github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
+github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 h1:HWRh5R2+9EifMyIHV7ZV+MIZqgz+PMpZ14Jynv3O2Zs=
+github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0/go.mod h1:JfhWUomR1baixubs02l85lZYYOm7LV6om4ceouMv45c=
 github.com/hibiken/asynq v0.25.1 h1:phj028N0nm15n8O2ims+IvJ2gz4k2auvermngh9JhTw=
 github.com/hibiken/asynq v0.25.1/go.mod h1:pazWNOLBu0FEynQRBvHA26qdIKRSmfdIfUm4HdsLmXg=
 github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM=
@@ -99,16 +103,19 @@ github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
 github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
 github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
 github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
-github.com/labstack/echo/v4 v4.11.4 h1:vDZmA+qNeh1pd/cCkEicDMrjtrnMGQ1QFI9gWN1zGq8=
-github.com/labstack/echo/v4 v4.11.4/go.mod h1:noh7EvLwqDsmh/X/HWKPUl1AjzJrhyptRyEbQJfxen8=
+github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
+github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
+github.com/labstack/echo/v4 v4.15.1 h1:S9keusg26gZpjMmPqB5hOEvNKnmd1lNmcHrbbH2lnFs=
+github.com/labstack/echo/v4 v4.15.1/go.mod h1:xmw1clThob0BSVRX1CRQkGQ/vjwcpOMjQZSZa9fKA/c=
 github.com/labstack/gommon v0.4.2 h1:F8qTUNXgG1+6WQmqoUWnz8WiEU60mXVVw0P4ht1WRA0=
 github.com/labstack/gommon v0.4.2/go.mod h1:QlUFxVM+SNXhDL/Z7YhocGIBYOiwB0mXm1+1bAPHPyU=
 github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ=
 github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI=
 github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4=
 github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I=
-github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
 github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
+github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE=
+github.com/mattn/go-colorable v0.1.14/go.mod h1:6LmQG8QLFO4G5z1gPvYEzlUgJ2wF+stgPZH1UqBm1s8=
 github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
 github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
 github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
@@ -121,6 +128,8 @@ github.com/minio/md5-simd v1.1.2 h1:Gdi1DZK69+ZVMoNHRXJyNcxrMA4dSxoYHZSQbirFg34=
 github.com/minio/md5-simd v1.1.2/go.mod h1:MzdKDxYpY2BT9XQFocsiZf/NKVtR7nkE4RoEpN+20RM=
 github.com/minio/minio-go/v7 v7.0.99 h1:2vH/byrwUkIpFQFOilvTfaUpvAX3fEFhEzO+DR3DlCE=
 github.com/minio/minio-go/v7 v7.0.99/go.mod h1:EtGNKtlX20iL2yaYnxEigaIvj0G0GwSDnifnG8ClIdw=
+github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
+github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
 github.com/nicksnyder/go-i18n/v2 v2.6.0 h1:C/m2NNWNiTB6SK4Ao8df5EWm3JETSTIGNXBpMJTxzxQ=
 github.com/nicksnyder/go-i18n/v2 v2.6.0/go.mod h1:88sRqr0C6OPyJn0/KRNaEz1uWorjxIKP7rUUcvycecE=
 github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4=
@@ -128,10 +137,19 @@ github.com/pelletier/go-toml/v2 v2.2.4/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8
 github.com/philhofer/fwd v1.2.0 h1:e6DnBTl7vGY+Gz322/ASL4Gyp1FspeMvx1RNDoToZuM=
 github.com/philhofer/fwd v1.2.0/go.mod h1:RqIHx9QI14HlwKwm98g9Re5prTQ6LdeRQn+gXJFxsJM=
 github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
-github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
+github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c h1:ncq/mPwQF4JjgDlrVEn3C11VoGHZN7m8qihwgMEtzYw=
 github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE=
+github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o=
+github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg=
+github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
+github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
+github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs=
+github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA=
+github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg=
+github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is=
 github.com/redis/go-redis/v9 v9.17.1 h1:7tl732FjYPRT9H9aNfyTwKg9iTETjWjGKEJ2t/5iWTs=
 github.com/redis/go-redis/v9 v9.17.1/go.mod h1:u410H11HMLoB+TP67dz8rL9s6QW2j76l0//kSOd3370=
 github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs=
@@ -180,6 +198,10 @@ github.com/tklauser/go-sysconf v0.3.12 h1:0QaGUFOdQaIVdPgfITYzaTegZvdCjmYO52cSFA
 github.com/tklauser/go-sysconf v0.3.12/go.mod h1:Ho14jnntGE1fpdOqQEEaiKRpvIavV0hSfmBq8nJbHYI=
 github.com/tklauser/numcpus v0.6.1 h1:ng9scYS7az0Bk4OZLvrNXNSAO2Pxr1XXRAPyjhIx+Fk=
 github.com/tklauser/numcpus v0.6.1/go.mod h1:1XfjsgE2zo8GVw7POkMbHENHzVg3GzmoZ9fESEdAacY=
+github.com/uptrace/opentelemetry-go-extra/otelgorm v0.3.2 h1:Jjn3zoRz13f8b1bR6LrXWglx93Sbh4kYfwgmPju3E2k=
+github.com/uptrace/opentelemetry-go-extra/otelgorm v0.3.2/go.mod h1:wocb5pNrj/sjhWB9J5jctnC0K2eisSdz/nJJBNFHo+A=
+github.com/uptrace/opentelemetry-go-extra/otelsql v0.3.2 h1:ZjUj9BLYf9PEqBn8W/OapxhPjVRdC6CsXTdULHsyk5c=
+github.com/uptrace/opentelemetry-go-extra/otelsql v0.3.2/go.mod h1:O8bHQfyinKwTXKkiKNGmLQS7vRsqRxIQTFZpYpHK3IQ=
 github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
 github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
 github.com/valyala/fasttemplate v1.2.2 h1:lxLXG0uE3Qnshl9QyaK6XJxMXlQZELvChBOCmQD0Loo=
@@ -190,33 +212,45 @@ github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo
 github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0=
 go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64=
 go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y=
+go.opentelemetry.io/contrib/instrumentation/github.com/labstack/echo/otelecho v0.68.0 h1:7N94HrYgVc2tng6xEjmbycupxteYLll7lPlEi/UK5ok=
+go.opentelemetry.io/contrib/instrumentation/github.com/labstack/echo/otelecho v0.68.0/go.mod h1:1i+7wBOfx0kn7PSGRKZ8e7zIhs+AmvLCiCloySDUeck=
 go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 h1:F7Jx+6hwnZ41NSFTO5q4LYDtJRXBf2PD0rNBkeB/lus=
 go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0/go.mod h1:UHB22Z8QsdRDrnAtX4PntOl36ajSxcdUMt1sF7Y6E7Q=
-go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8=
-go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM=
-go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA=
-go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI=
-go.opentelemetry.io/otel/sdk v1.38.0 h1:l48sr5YbNf2hpCUj/FoGhW9yDkl+Ma+LrVl8qaM5b+E=
-go.opentelemetry.io/otel/sdk v1.38.0/go.mod h1:ghmNdGlVemJI3+ZB5iDEuk4bWA3GkTpW+DOoZMYBVVg=
-go.opentelemetry.io/otel/sdk/metric v1.38.0 h1:aSH66iL0aZqo//xXzQLYozmWrXxyFkBJ6qT5wthqPoM=
-go.opentelemetry.io/otel/sdk/metric v1.38.0/go.mod h1:dg9PBnW9XdQ1Hd6ZnRz689CbtrUp0wMMs9iPcgT9EZA=
-go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE=
-go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs=
+go.opentelemetry.io/contrib/propagators/b3 v1.43.0 h1:CETqV3QLLPTy5yNrqyMr41VnAOOD4lsRved7n4QG00A=
+go.opentelemetry.io/contrib/propagators/b3 v1.43.0/go.mod h1:Q4mCiCdziYzpNR0g+6UqVotAlCDZdzz6L8jwY4knOrw=
+go.opentelemetry.io/otel v1.43.0 h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I=
+go.opentelemetry.io/otel v1.43.0/go.mod h1:JuG+u74mvjvcm8vj8pI5XiHy1zDeoCS2LB1spIq7Ay0=
+go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 h1:88Y4s2C8oTui1LGM6bTWkw0ICGcOLCAI5l6zsD1j20k=
+go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0/go.mod h1:Vl1/iaggsuRlrHf/hfPJPvVag77kKyvrLeD10kpMl+A=
+go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0 h1:3iZJKlCZufyRzPzlQhUIWVmfltrXuGyfjREgGP3UUjc=
+go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0/go.mod h1:/G+nUPfhq2e+qiXMGxMwumDrP5jtzU+mWN7/sjT2rak=
+go.opentelemetry.io/otel/metric v1.43.0 h1:d7638QeInOnuwOONPp4JAOGfbCEpYb+K6DVWvdxGzgM=
+go.opentelemetry.io/otel/metric v1.43.0/go.mod h1:RDnPtIxvqlgO8GRW18W6Z/4P462ldprJtfxHxyKd2PY=
+go.opentelemetry.io/otel/sdk v1.43.0 h1:pi5mE86i5rTeLXqoF/hhiBtUNcrAGHLKQdhg4h4V9Dg=
+go.opentelemetry.io/otel/sdk v1.43.0/go.mod h1:P+IkVU3iWukmiit/Yf9AWvpyRDlUeBaRg6Y+C58QHzg=
+go.opentelemetry.io/otel/sdk/metric v1.43.0 h1:S88dyqXjJkuBNLeMcVPRFXpRw2fuwdvfCGLEo89fDkw=
+go.opentelemetry.io/otel/sdk/metric v1.43.0/go.mod h1:C/RJtwSEJ5hzTiUz5pXF1kILHStzb9zFlIEe85bhj6A=
+go.opentelemetry.io/otel/trace v1.43.0 h1:BkNrHpup+4k4w+ZZ86CZoHHEkohws8AY+WTX09nk+3A=
+go.opentelemetry.io/otel/trace v1.43.0/go.mod h1:/QJhyVBUUswCphDVxq+8mld+AvhXZLhe+8WVFxiFff0=
+go.opentelemetry.io/proto/otlp v1.10.0 h1:IQRWgT5srOCYfiWnpqUYz9CVmbO8bFmKcwYxpuCSL2g=
+go.opentelemetry.io/proto/otlp v1.10.0/go.mod h1:/CV4QoCR/S9yaPj8utp3lvQPoqMtxXdzn7ozvvozVqk=
 go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
 go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
+go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI=
+go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU=
 go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=
 go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg=
 golang.org/x/crypto v0.0.0-20170512130425-ab89591268e0/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
-golang.org/x/crypto v0.46.0 h1:cKRW/pmt1pKAfetfu+RCEvjvZkA9RimPbh7bhFjGVBU=
-golang.org/x/crypto v0.46.0/go.mod h1:Evb/oLKmMraqjZ2iQTwDwvCtJkczlDuTmdJXoZVzqU0=
+golang.org/x/crypto v0.51.0 h1:IBPXwPfKxY7cWQZ38ZCIRPI50YLeevDLlLnyC5wRGTI=
+golang.org/x/crypto v0.51.0/go.mod h1:8AdwkbraGNABw2kOX6YFPs3WM22XqI4EXEd8g+x7Oc8=
 golang.org/x/net v0.0.0-20210520170846-37e1c6afe023/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
 golang.org/x/net v0.0.0-20220403103023-749bd193bc2b/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk=
-golang.org/x/net v0.48.0 h1:zyQRTTrjc33Lhh0fBgT/H3oZq9WuvRR5gPC70xpDiQU=
-golang.org/x/net v0.48.0/go.mod h1:+ndRgGjkh8FGtu1w1FGbEC31if4VrNVMuKTgcAAnQRY=
-golang.org/x/oauth2 v0.34.0 h1:hqK/t4AKgbqWkdkcAeI8XLmbK+4m4G5YeQRrmiotGlw=
-golang.org/x/oauth2 v0.34.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
-golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
-golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
+golang.org/x/net v0.53.0 h1:d+qAbo5L0orcWAr0a9JweQpjXF19LMXJE8Ey7hwOdUA=
+golang.org/x/net v0.53.0/go.mod h1:JvMuJH7rrdiCfbeHoo3fCQU24Lf5JJwT9W3sJFulfgs=
+golang.org/x/oauth2 v0.35.0 h1:Mv2mzuHuZuY2+bkyWXIHMfhNdJAdwW3FuWeCPYN5GVQ=
+golang.org/x/oauth2 v0.35.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
+golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
+golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
 golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
@@ -228,32 +262,34 @@ golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.39.0 h1:CvCKL8MeisomCi6qNZ+wbb0DN9E5AATixKsvNtMoMFk=
-golang.org/x/sys v0.39.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
+golang.org/x/sys v0.44.0 h1:ildZl3J4uzeKP07r2F++Op7E9B29JRUy+a27EibtBTQ=
+golang.org/x/sys v0.44.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
 golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
 golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
+golang.org/x/term v0.43.0 h1:S4RLU2sB31O/NCl+zFN9Aru9A/Cq2aqKpTZJ6B+DwT4=
+golang.org/x/term v0.43.0/go.mod h1:lrhlHNdQJHO+1qVYiHfFKVuVioJIheAc3fBSMFYEIsk=
 golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
-golang.org/x/text v0.32.0 h1:ZD01bjUt1FQ9WJ0ClOL5vxgxOI/sVCNgX1YtKwcY0mU=
-golang.org/x/text v0.32.0/go.mod h1:o/rUWzghvpD5TXrTIBuJU77MTaN0ljMWE47kxGJQ7jY=
-golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI=
-golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4=
+golang.org/x/text v0.37.0 h1:Cqjiwd9eSg8e0QAkyCaQTNHFIIzWtidPahFWR83rTrc=
+golang.org/x/text v0.37.0/go.mod h1:a5sjxXGs9hsn/AJVwuElvCAo9v8QYLzvavO5z2PiM38=
+golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U=
+golang.org/x/time v0.15.0/go.mod h1:Y4YMaQmXwGQZoFaVFk4YpCt4FLQMYKZe9oeV/f4MSno=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
-gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk=
-gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E=
+gonum.org/v1/gonum v0.17.0 h1:VbpOemQlsSMrYmn7T2OUvQ4dqxQXU+ouZFQsZOx50z4=
+gonum.org/v1/gonum v0.17.0/go.mod h1:El3tOrEuMpv2UdMrbNlKEh9vd86bmQ6vqIcDwxEOc1E=
 google.golang.org/api v0.257.0 h1:8Y0lzvHlZps53PEaw+G29SsQIkuKrumGWs9puiexNAA=
 google.golang.org/api v0.257.0/go.mod h1:4eJrr+vbVaZSqs7vovFd1Jb/A6ml6iw2e6FBYf3GAO4=
 google.golang.org/genproto v0.0.0-20250603155806-513f23925822 h1:rHWScKit0gvAPuOnu87KpaYtjK5zBMLcULh7gxkCXu4=
 google.golang.org/genproto v0.0.0-20250603155806-513f23925822/go.mod h1:HubltRL7rMh0LfnQPkMH4NPDFEWp0jw3vixw7jEM53s=
-google.golang.org/genproto/googleapis/api v0.0.0-20251022142026-3a174f9686a8 h1:mepRgnBZa07I4TRuomDE4sTIYieg/osKmzIf4USdWS4=
-google.golang.org/genproto/googleapis/api v0.0.0-20251022142026-3a174f9686a8/go.mod h1:fDMmzKV90WSg1NbozdqrE64fkuTv6mlq2zxo9ad+3yo=
-google.golang.org/genproto/googleapis/rpc v0.0.0-20251124214823-79d6a2a48846 h1:Wgl1rcDNThT+Zn47YyCXOXyX/COgMTIdhJ717F0l4xk=
-google.golang.org/genproto/googleapis/rpc v0.0.0-20251124214823-79d6a2a48846/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk=
-google.golang.org/grpc v1.77.0 h1:wVVY6/8cGA6vvffn+wWK5ToddbgdU3d8MNENr4evgXM=
-google.golang.org/grpc v1.77.0/go.mod h1:z0BY1iVj0q8E1uSQCjL9cppRj+gnZjzDnzV0dHhrNig=
-google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE=
-google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
+google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 h1:VPWxll4HlMw1Vs/qXtN7BvhZqsS9cdAittCNvVENElA=
+google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:7QBABkRtR8z+TEnmXTqIqwJLlzrZKVfAUm7tY3yGv0M=
+google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 h1:m8qni9SQFH0tJc1X0vmnpw/0t+AImlSvp30sEupozUg=
+google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8=
+google.golang.org/grpc v1.80.0 h1:Xr6m2WmWZLETvUNvIUmeD5OAagMw3FiKmMlTdViWsHM=
+google.golang.org/grpc v1.80.0/go.mod h1:ho/dLnxwi3EDJA4Zghp7k2Ec1+c2jqup0bFkw07bwF4=
+google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
+google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
 gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
@@ -1,215 +1,30 @@
+// apple_social_auth_handler is a stub — the user_applesocialauth table was
+// dropped in the Ory Kratos migration (phase 2), so every endpoint here
+// responds with 410 Gone. Social sign-in is now handled by Kratos.
+
 package handlers

 import (
 	"net/http"
-	"strconv"

 	"github.com/labstack/echo/v4"
 	"gorm.io/gorm"
-
-	"github.com/treytartt/honeydue-api/internal/admin/dto"
-	"github.com/treytartt/honeydue-api/internal/models"
 )

-// AdminAppleSocialAuthHandler handles admin Apple social auth management endpoints
+// AdminAppleSocialAuthHandler is a stub left in place after the Ory Kratos
+// migration: every endpoint method on it answers with 410 Gone. The db handle
+// is stored by the constructor but is not read by any of those methods.
 type AdminAppleSocialAuthHandler struct {
 	db *gorm.DB
 }

-// NewAdminAppleSocialAuthHandler creates a new admin Apple social auth handler
+// NewAdminAppleSocialAuthHandler returns a stub handler that stores db.
 func NewAdminAppleSocialAuthHandler(db *gorm.DB) *AdminAppleSocialAuthHandler {
 	return &AdminAppleSocialAuthHandler{db: db}
 }

-// AppleSocialAuthResponse represents the response for an Apple social auth entry
-type AppleSocialAuthResponse struct {
-	ID             uint   `json:"id"`
-	UserID         uint   `json:"user_id"`
-	Username       string `json:"username"`
-	UserEmail      string `json:"user_email"`
-	AppleID        string `json:"apple_id"`
-	Email          string `json:"email"`
-	IsPrivateEmail bool   `json:"is_private_email"`
-	CreatedAt      string `json:"created_at"`
-	UpdatedAt      string `json:"updated_at"`
-}
-
-// UpdateAppleSocialAuthRequest represents the request to update an Apple social auth entry
-type UpdateAppleSocialAuthRequest struct {
-	Email          *string `json:"email"`
-	IsPrivateEmail *bool   `json:"is_private_email"`
-}
-
-// List handles GET /api/admin/apple-social-auth
-func (h *AdminAppleSocialAuthHandler) List(c echo.Context) error {
-	var filters dto.PaginationParams
-	if err := c.Bind(&filters); err != nil {
-		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid request body"})
-	}
-
-	var entries []models.AppleSocialAuth
-	var total int64
-
-	query := h.db.Model(&models.AppleSocialAuth{}).Preload("User")
-
-	// Apply search
-	if filters.Search != "" {
-		search := "%" + filters.Search + "%"
-		query = query.Joins("JOIN auth_user ON auth_user.id = user_applesocialauth.user_id").
-			Where("user_applesocialauth.apple_id ILIKE ? OR user_applesocialauth.email ILIKE ? OR auth_user.username ILIKE ? OR auth_user.email ILIKE ?",
-				search, search, search, search)
-	}
-
-	// Get total count
-	query.Count(&total)
-
-	// Apply sorting (allowlist prevents SQL injection via sort_by parameter)
-	sortBy := filters.GetSafeSortBy([]string{
-		"id", "user_id", "apple_id", "email", "is_private_email",
-		"created_at", "updated_at",
-	}, "created_at")
-	query = query.Order(sortBy + " " + filters.GetSortDir())
-
-	// Apply pagination
-	query = query.Offset(filters.GetOffset()).Limit(filters.GetPerPage())
-
-	if err := query.Find(&entries).Error; err != nil {
-		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch Apple social auth entries"})
-	}
-
-	// Build response
-	responses := make([]AppleSocialAuthResponse, len(entries))
-	for i, entry := range entries {
-		responses[i] = h.toResponse(&entry)
-	}
-
-	return c.JSON(http.StatusOK, dto.NewPaginatedResponse(responses, total, filters.GetPage(), filters.GetPerPage()))
-}
-
-// Get handles GET /api/admin/apple-social-auth/:id
-func (h *AdminAppleSocialAuthHandler) Get(c echo.Context) error {
-	id, err := strconv.ParseUint(c.Param("id"), 10, 32)
-	if err != nil {
-		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid ID"})
-	}
-
-	var entry models.AppleSocialAuth
-	if err := h.db.Preload("User").First(&entry, id).Error; err != nil {
-		if err == gorm.ErrRecordNotFound {
-			return c.JSON(http.StatusNotFound, map[string]interface{}{"error": "Apple social auth entry not found"})
-		}
-		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch Apple social auth entry"})
-	}
-
-	return c.JSON(http.StatusOK, h.toResponse(&entry))
-}
-
-// GetByUser handles GET /api/admin/apple-social-auth/user/:user_id
-func (h *AdminAppleSocialAuthHandler) GetByUser(c echo.Context) error {
-	userID, err := strconv.ParseUint(c.Param("user_id"), 10, 32)
-	if err != nil {
-		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid user ID"})
-	}
-
-	var entry models.AppleSocialAuth
-	if err := h.db.Preload("User").Where("user_id = ?", userID).First(&entry).Error; err != nil {
-		if err == gorm.ErrRecordNotFound {
-			return c.JSON(http.StatusNotFound, map[string]interface{}{"error": "Apple social auth entry not found for user"})
-		}
-		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch Apple social auth entry"})
-	}
-
-	return c.JSON(http.StatusOK, h.toResponse(&entry))
-}
-
-// Update handles PUT /api/admin/apple-social-auth/:id
-func (h *AdminAppleSocialAuthHandler) Update(c echo.Context) error {
-	id, err := strconv.ParseUint(c.Param("id"), 10, 32)
-	if err != nil {
-		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid ID"})
-	}
-
-	var entry models.AppleSocialAuth
-	if err := h.db.First(&entry, id).Error; err != nil {
-		if err == gorm.ErrRecordNotFound {
-			return c.JSON(http.StatusNotFound, map[string]interface{}{"error": "Apple social auth entry not found"})
-		}
-		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch Apple social auth entry"})
-	}
-
-	var req UpdateAppleSocialAuthRequest
-	if err := c.Bind(&req); err != nil {
-		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid request body"})
-	}
-
-	if req.Email != nil {
-		entry.Email = *req.Email
-	}
-	if req.IsPrivateEmail != nil {
-		entry.IsPrivateEmail = *req.IsPrivateEmail
-	}
-
-	if err := h.db.Save(&entry).Error; err != nil {
-		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to update Apple social auth entry"})
-	}
-
-	h.db.Preload("User").First(&entry, id)
-	return c.JSON(http.StatusOK, h.toResponse(&entry))
-}
-
-// Delete handles DELETE /api/admin/apple-social-auth/:id
-func (h *AdminAppleSocialAuthHandler) Delete(c echo.Context) error {
-	id, err := strconv.ParseUint(c.Param("id"), 10, 32)
-	if err != nil {
-		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid ID"})
-	}
-
-	var entry models.AppleSocialAuth
-	if err := h.db.First(&entry, id).Error; err != nil {
-		if err == gorm.ErrRecordNotFound {
-			return c.JSON(http.StatusNotFound, map[string]interface{}{"error": "Apple social auth entry not found"})
-		}
-		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch Apple social auth entry"})
-	}
-
-	if err := h.db.Delete(&entry).Error; err != nil {
-		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to delete Apple social auth entry"})
-	}
-
-	return c.JSON(http.StatusOK, map[string]interface{}{"message": "Apple social auth entry deleted successfully"})
-}
-
-// BulkDelete handles DELETE /api/admin/apple-social-auth/bulk
-func (h *AdminAppleSocialAuthHandler) BulkDelete(c echo.Context) error {
-	var req dto.BulkDeleteRequest
-	if err := c.Bind(&req); err != nil {
-		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid request body"})
-	}
-
-	result := h.db.Where("id IN ?", req.IDs).Delete(&models.AppleSocialAuth{})
-	if result.Error != nil {
-		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to delete Apple social auth entries"})
-	}
-
-	return c.JSON(http.StatusOK, map[string]interface{}{"message": "Apple social auth entries deleted successfully", "count": result.RowsAffected})
-}
-
-// toResponse converts an AppleSocialAuth model to AppleSocialAuthResponse
-func (h *AdminAppleSocialAuthHandler) toResponse(entry *models.AppleSocialAuth) AppleSocialAuthResponse {
-	response := AppleSocialAuthResponse{
-		ID:             entry.ID,
-		UserID:         entry.UserID,
-		AppleID:        entry.AppleID,
-		Email:          entry.Email,
-		IsPrivateEmail: entry.IsPrivateEmail,
-		CreatedAt:      entry.CreatedAt.Format("2006-01-02T15:04:05Z"),
-		UpdatedAt:      entry.UpdatedAt.Format("2006-01-02T15:04:05Z"),
-	}
-
-	if entry.User.ID != 0 {
-		response.Username = entry.User.Username
-		response.UserEmail = entry.User.Email
-	}
-
-	return response
+// gone writes the reply shared by every endpoint of this stub: HTTP 410 Gone
+// with a fixed JSON message pointing at Ory Kratos.
+func (h *AdminAppleSocialAuthHandler) gone(c echo.Context) error {
+	return c.JSON(http.StatusGone, map[string]string{"message": "Apple social auth is managed by Ory Kratos"})
 }
+
+// List responds with 410 Gone.
+func (h *AdminAppleSocialAuthHandler) List(c echo.Context) error { return h.gone(c) }
+
+// Get responds with 410 Gone.
+func (h *AdminAppleSocialAuthHandler) Get(c echo.Context) error { return h.gone(c) }
+
+// Delete responds with 410 Gone.
+func (h *AdminAppleSocialAuthHandler) Delete(c echo.Context) error { return h.gone(c) }
+
+// BulkDelete responds with 410 Gone.
+func (h *AdminAppleSocialAuthHandler) BulkDelete(c echo.Context) error { return h.gone(c) }
+
+// Update responds with 410 Gone.
+func (h *AdminAppleSocialAuthHandler) Update(c echo.Context) error { return h.gone(c) }
+
+// GetByUser responds with 410 Gone.
+func (h *AdminAppleSocialAuthHandler) GetByUser(c echo.Context) error { return h.gone(c) }
@@ -1,144 +1,27 @@
+// auth_token_handler is a stub — the user_authtoken table was dropped in the
+// Ory Kratos migration (phase 2), so every endpoint here responds with
+// 410 Gone. Auth tokens are now Kratos sessions.
+
 package handlers

 import (
 	"net/http"
-	"strconv"

 	"github.com/labstack/echo/v4"
 	"gorm.io/gorm"
-
-	"github.com/treytartt/honeydue-api/internal/admin/dto"
-	"github.com/treytartt/honeydue-api/internal/models"
 )

-// AdminAuthTokenHandler handles admin auth token management endpoints
+// AdminAuthTokenHandler is a stub left in place after the Ory Kratos
+// migration: every endpoint method on it answers with 410 Gone. The db handle
+// is stored by the constructor but is not read by any of those methods.
 type AdminAuthTokenHandler struct {
 	db *gorm.DB
 }

-// NewAdminAuthTokenHandler creates a new admin auth token handler
+// NewAdminAuthTokenHandler returns a stub handler that stores db.
 func NewAdminAuthTokenHandler(db *gorm.DB) *AdminAuthTokenHandler {
 	return &AdminAuthTokenHandler{db: db}
 }

-// AuthTokenResponse represents an auth token in API responses
-type AuthTokenResponse struct {
-	Key      string `json:"key"`
-	UserID   uint   `json:"user_id"`
-	Username string `json:"username"`
-	Email    string `json:"email"`
-	Created  string `json:"created"`
-}
-
-// List handles GET /api/admin/auth-tokens
-func (h *AdminAuthTokenHandler) List(c echo.Context) error {
-	var filters dto.PaginationParams
-	if err := c.Bind(&filters); err != nil {
-		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid request body"})
-	}
-
-	var tokens []models.AuthToken
-	var total int64
-
-	query := h.db.Model(&models.AuthToken{}).Preload("User")
-
-	// Apply search (search by user info)
-	if filters.Search != "" {
-		search := "%" + filters.Search + "%"
-		query = query.Joins("JOIN auth_user ON auth_user.id = user_authtoken.user_id").
-			Where(
-				"auth_user.username ILIKE ? OR auth_user.email ILIKE ? OR user_authtoken.key ILIKE ?",
-				search, search, search,
-			)
-	}
-
-	// Get total count
-	query.Count(&total)
-
-	// Apply sorting (allowlist prevents SQL injection via sort_by parameter)
-	sortBy := filters.GetSafeSortBy([]string{
-		"created", "user_id",
-	}, "created")
-	query = query.Order(sortBy + " " + filters.GetSortDir())
-
-	// Apply pagination
-	query = query.Offset(filters.GetOffset()).Limit(filters.GetPerPage())
-
-	if err := query.Find(&tokens).Error; err != nil {
-		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch auth tokens"})
-	}
-
-	// Build response
-	responses := make([]AuthTokenResponse, len(tokens))
-	for i, token := range tokens {
-		responses[i] = AuthTokenResponse{
-			Key:      token.Key,
-			UserID:   token.UserID,
-			Username: token.User.Username,
-			Email:    token.User.Email,
-			Created:  token.Created.Format("2006-01-02T15:04:05Z"),
-		}
-	}
-
-	return c.JSON(http.StatusOK, dto.NewPaginatedResponse(responses, total, filters.GetPage(), filters.GetPerPage()))
-}
-
-// Get handles GET /api/admin/auth-tokens/:id (id is actually user_id)
-func (h *AdminAuthTokenHandler) Get(c echo.Context) error {
-	id, err := strconv.ParseUint(c.Param("id"), 10, 32)
-	if err != nil {
-		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid user ID"})
-	}
-
-	var token models.AuthToken
-	if err := h.db.Preload("User").Where("user_id = ?", id).First(&token).Error; err != nil {
-		if err == gorm.ErrRecordNotFound {
-			return c.JSON(http.StatusNotFound, map[string]interface{}{"error": "Auth token not found"})
-		}
-		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch auth token"})
-	}
-
-	response := AuthTokenResponse{
-		Key:      token.Key,
-		UserID:   token.UserID,
-		Username: token.User.Username,
-		Email:    token.User.Email,
-		Created:  token.Created.Format("2006-01-02T15:04:05Z"),
-	}
-
-	return c.JSON(http.StatusOK, response)
-}
-
-// Delete handles DELETE /api/admin/auth-tokens/:id (revoke token)
-func (h *AdminAuthTokenHandler) Delete(c echo.Context) error {
-	id, err := strconv.ParseUint(c.Param("id"), 10, 32)
-	if err != nil {
-		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid user ID"})
-	}
-
-	result := h.db.Where("user_id = ?", id).Delete(&models.AuthToken{})
-	if result.Error != nil {
-		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to revoke token"})
-	}
-
-	if result.RowsAffected == 0 {
-		return c.JSON(http.StatusNotFound, map[string]interface{}{"error": "Auth token not found"})
-	}
-
-	return c.JSON(http.StatusOK, map[string]interface{}{"message": "Auth token revoked successfully"})
-}
-
-// BulkDelete handles DELETE /api/admin/auth-tokens/bulk
-func (h *AdminAuthTokenHandler) BulkDelete(c echo.Context) error {
-	var req dto.BulkDeleteRequest
-	if err := c.Bind(&req); err != nil {
-		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid request body"})
-	}
-
-	result := h.db.Where("user_id IN ?", req.IDs).Delete(&models.AuthToken{})
-	if result.Error != nil {
-		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to revoke tokens"})
-	}
-
-	return c.JSON(http.StatusOK, map[string]interface{}{"message": "Auth tokens revoked successfully", "count": result.RowsAffected})
+// gone is the shared reply for every auth-token endpoint: HTTP 410 Gone plus
+// a fixed JSON message pointing at Ory Kratos.
+func (h *AdminAuthTokenHandler) gone(c echo.Context) error {
+	payload := map[string]string{"message": "auth tokens are managed by Ory Kratos"}
+	return c.JSON(http.StatusGone, payload)
 }
+
+// List responds with 410 Gone.
+func (h *AdminAuthTokenHandler) List(c echo.Context) error {
+	return h.gone(c)
+}
+
+// Get responds with 410 Gone.
+func (h *AdminAuthTokenHandler) Get(c echo.Context) error {
+	return h.gone(c)
+}
+
+// Delete responds with 410 Gone.
+func (h *AdminAuthTokenHandler) Delete(c echo.Context) error {
+	return h.gone(c)
+}
+
+// BulkDelete responds with 410 Gone.
+func (h *AdminAuthTokenHandler) BulkDelete(c echo.Context) error {
+	return h.gone(c)
+}
@@ -1,162 +1,28 @@
+// confirmation_code_handler is a stub — the user_confirmationcode table was
+// dropped in the Ory Kratos migration (phase 2), so every endpoint here
+// responds with 410 Gone. Email verification is now handled by Kratos.
+
 package handlers

 import (
 	"net/http"
-	"strconv"
-	"strings"

 	"github.com/labstack/echo/v4"
 	"gorm.io/gorm"
-
-	"github.com/treytartt/honeydue-api/internal/admin/dto"
-	"github.com/treytartt/honeydue-api/internal/models"
 )

-// maskCode masks a confirmation code, showing only the last 4 characters.
-func maskCode(code string) string {
-	if len(code) <= 4 {
-		return strings.Repeat("*", len(code))
-	}
-	return strings.Repeat("*", len(code)-4) + code[len(code)-4:]
-}
-
-// AdminConfirmationCodeHandler handles admin confirmation code management endpoints
+// AdminConfirmationCodeHandler is a stub left in place after the Ory Kratos
+// migration: every endpoint method on it answers with 410 Gone. The db handle
+// is stored by the constructor but is not read by any of those methods.
 type AdminConfirmationCodeHandler struct {
 	db *gorm.DB
 }

-// NewAdminConfirmationCodeHandler creates a new admin confirmation code handler
+// NewAdminConfirmationCodeHandler returns a stub handler that stores db.
 func NewAdminConfirmationCodeHandler(db *gorm.DB) *AdminConfirmationCodeHandler {
 	return &AdminConfirmationCodeHandler{db: db}
 }

-// ConfirmationCodeResponse represents a confirmation code in API responses
-type ConfirmationCodeResponse struct {
-	ID        uint   `json:"id"`
-	UserID    uint   `json:"user_id"`
-	Username  string `json:"username"`
-	Email     string `json:"email"`
-	Code      string `json:"code"`
-	ExpiresAt string `json:"expires_at"`
-	IsUsed    bool   `json:"is_used"`
-	CreatedAt string `json:"created_at"`
-}
-
-// List handles GET /api/admin/confirmation-codes
-func (h *AdminConfirmationCodeHandler) List(c echo.Context) error {
-	var filters dto.PaginationParams
-	if err := c.Bind(&filters); err != nil {
-		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid request body"})
-	}
-
-	var codes []models.ConfirmationCode
-	var total int64
-
-	query := h.db.Model(&models.ConfirmationCode{}).Preload("User")
-
-	// Apply search (search by user info or code)
-	if filters.Search != "" {
-		search := "%" + filters.Search + "%"
-		query = query.Joins("JOIN auth_user ON auth_user.id = user_confirmationcode.user_id").
-			Where(
-				"auth_user.username ILIKE ? OR auth_user.email ILIKE ? OR user_confirmationcode.code ILIKE ?",
-				search, search, search,
-			)
-	}
-
-	// Get total count
-	query.Count(&total)
-
-	// Apply sorting (allowlist prevents SQL injection via sort_by parameter)
-	sortBy := filters.GetSafeSortBy([]string{
-		"id", "user_id", "created_at", "expires_at", "is_used",
-	}, "created_at")
-	query = query.Order(sortBy + " " + filters.GetSortDir())
-
-	// Apply pagination
-	query = query.Offset(filters.GetOffset()).Limit(filters.GetPerPage())
-
-	if err := query.Find(&codes).Error; err != nil {
-		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch confirmation codes"})
-	}
-
-	// Build response
-	responses := make([]ConfirmationCodeResponse, len(codes))
-	for i, code := range codes {
-		responses[i] = ConfirmationCodeResponse{
-			ID:        code.ID,
-			UserID:    code.UserID,
-			Username:  code.User.Username,
-			Email:     code.User.Email,
-			Code:      maskCode(code.Code),
-			ExpiresAt: code.ExpiresAt.Format("2006-01-02T15:04:05Z"),
-			IsUsed:    code.IsUsed,
-			CreatedAt: code.CreatedAt.Format("2006-01-02T15:04:05Z"),
-		}
-	}
-
-	return c.JSON(http.StatusOK, dto.NewPaginatedResponse(responses, total, filters.GetPage(), filters.GetPerPage()))
-}
-
-// Get handles GET /api/admin/confirmation-codes/:id
-func (h *AdminConfirmationCodeHandler) Get(c echo.Context) error {
-	id, err := strconv.ParseUint(c.Param("id"), 10, 32)
-	if err != nil {
-		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid ID"})
-	}
-
-	var code models.ConfirmationCode
-	if err := h.db.Preload("User").First(&code, id).Error; err != nil {
-		if err == gorm.ErrRecordNotFound {
-			return c.JSON(http.StatusNotFound, map[string]interface{}{"error": "Confirmation code not found"})
-		}
-		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch confirmation code"})
-	}
-
-	response := ConfirmationCodeResponse{
-		ID:        code.ID,
-		UserID:    code.UserID,
-		Username:  code.User.Username,
-		Email:     code.User.Email,
-		Code:      maskCode(code.Code),
-		ExpiresAt: code.ExpiresAt.Format("2006-01-02T15:04:05Z"),
-		IsUsed:    code.IsUsed,
-		CreatedAt: code.CreatedAt.Format("2006-01-02T15:04:05Z"),
-	}
-
-	return c.JSON(http.StatusOK, response)
-}
-
-// Delete handles DELETE /api/admin/confirmation-codes/:id
-func (h *AdminConfirmationCodeHandler) Delete(c echo.Context) error {
-	id, err := strconv.ParseUint(c.Param("id"), 10, 32)
-	if err != nil {
-		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid ID"})
-	}
-
-	result := h.db.Delete(&models.ConfirmationCode{}, id)
-	if result.Error != nil {
-		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to delete confirmation code"})
-	}
-
-	if result.RowsAffected == 0 {
-		return c.JSON(http.StatusNotFound, map[string]interface{}{"error": "Confirmation code not found"})
-	}
-
-	return c.JSON(http.StatusOK, map[string]interface{}{"message": "Confirmation code deleted successfully"})
-}
-
-// BulkDelete handles DELETE /api/admin/confirmation-codes/bulk
-func (h *AdminConfirmationCodeHandler) BulkDelete(c echo.Context) error {
-	var req dto.BulkDeleteRequest
-	if err := c.Bind(&req); err != nil {
-		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid request body"})
-	}
-
-	result := h.db.Where("id IN ?", req.IDs).Delete(&models.ConfirmationCode{})
-	if result.Error != nil {
-		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to delete confirmation codes"})
-	}
-
-	return c.JSON(http.StatusOK, map[string]interface{}{"message": "Confirmation codes deleted successfully", "count": result.RowsAffected})
+// gone is the shared reply for every confirmation-code endpoint: HTTP 410
+// Gone plus a fixed JSON message pointing at Ory Kratos.
+func (h *AdminConfirmationCodeHandler) gone(c echo.Context) error {
+	payload := map[string]string{"message": "confirmation codes are managed by Ory Kratos"}
+	return c.JSON(http.StatusGone, payload)
 }
+
+// List responds with 410 Gone.
+func (h *AdminConfirmationCodeHandler) List(c echo.Context) error {
+	return h.gone(c)
+}
+
+// Get responds with 410 Gone.
+func (h *AdminConfirmationCodeHandler) Get(c echo.Context) error {
+	return h.gone(c)
+}
+
+// Delete responds with 410 Gone.
+func (h *AdminConfirmationCodeHandler) Delete(c echo.Context) error {
+	return h.gone(c)
+}
+
+// BulkDelete responds with 410 Gone.
+func (h *AdminConfirmationCodeHandler) BulkDelete(c echo.Context) error {
+	return h.gone(c)
+}
@@ -8,16 +8,18 @@ import (
 	"gorm.io/gorm"

 	"github.com/treytartt/honeydue-api/internal/models"
+	"github.com/treytartt/honeydue-api/internal/services"
 )

 // AdminLimitationsHandler handles subscription limitations management
 type AdminLimitationsHandler struct {
-	db *gorm.DB
+	db    *gorm.DB
+	cache *services.CacheService
 }

-// NewAdminLimitationsHandler creates a new handler
-func NewAdminLimitationsHandler(db *gorm.DB) *AdminLimitationsHandler {
-	return &AdminLimitationsHandler{db: db}
+// NewAdminLimitationsHandler creates a new handler. cache may be nil, in which case the settings endpoints skip caching and use the database directly.
+func NewAdminLimitationsHandler(db *gorm.DB, cache *services.CacheService) *AdminLimitationsHandler {
+	return &AdminLimitationsHandler{db: db, cache: cache}
 }

 // === Settings (enable_limitations) ===
@@ -27,14 +29,25 @@ type LimitationsSettingsResponse struct {
 	EnableLimitations bool `json:"enable_limitations"`
 }

-// GetSettings handles GET /api/admin/limitations/settings
+// GetSettings handles GET /api/admin/limitations/settings.
+// Served from the Redis cache when one is configured and the lookup succeeds; any cache error (not only a miss) falls through to the DB.
 func (h *AdminLimitationsHandler) GetSettings(c echo.Context) error {
+	ctx := c.Request().Context()
+
+	if h.cache != nil {
+		var cached models.SubscriptionSettings
+		if err := h.cache.GetCachedSubscriptionSettings(ctx, &cached); err == nil {
+			return c.JSON(http.StatusOK, LimitationsSettingsResponse{
+				EnableLimitations: cached.EnableLimitations,
+			})
+		}
+	}
+
 	var settings models.SubscriptionSettings
-	if err := h.db.First(&settings, 1).Error; err != nil {
+	if err := h.db.WithContext(ctx).First(&settings, 1).Error; err != nil {
 		if err == gorm.ErrRecordNotFound {
-			// Create default settings
 			settings = models.SubscriptionSettings{ID: 1, EnableLimitations: false}
-			if err := h.db.Create(&settings).Error; err != nil {
+			if err := h.db.WithContext(ctx).Create(&settings).Error; err != nil {
 				return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to create default settings"})
 			}
 		} else {
@@ -42,6 +55,10 @@ func (h *AdminLimitationsHandler) GetSettings(c echo.Context) error {
 		}
 	}

+	if h.cache != nil {
+		_ = h.cache.CacheSubscriptionSettings(ctx, &settings)
+	}
+
 	return c.JSON(http.StatusOK, LimitationsSettingsResponse{
 		EnableLimitations: settings.EnableLimitations,
 	})
@@ -60,7 +77,8 @@ func (h *AdminLimitationsHandler) UpdateSettings(c echo.Context) error {
 	}

 	var settings models.SubscriptionSettings
-	if err := h.db.First(&settings, 1).Error; err != nil {
+	ctx := c.Request().Context()
+	if err := h.db.WithContext(ctx).First(&settings, 1).Error; err != nil {
 		if err == gorm.ErrRecordNotFound {
 			settings = models.SubscriptionSettings{ID: 1}
 		} else {
@@ -72,10 +90,15 @@ func (h *AdminLimitationsHandler) UpdateSettings(c echo.Context) error {
 		settings.EnableLimitations = *req.EnableLimitations
 	}

-	if err := h.db.Save(&settings).Error; err != nil {
+	if err := h.db.WithContext(ctx).Save(&settings).Error; err != nil {
 		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to update settings"})
 	}

+	// Invalidate the cache so the new value is visible to all pods.
+	if h.cache != nil {
+		_ = h.cache.InvalidateSubscriptionSettings(ctx)
+	}
+
 	return c.JSON(http.StatusOK, LimitationsSettingsResponse{
 		EnableLimitations: settings.EnableLimitations,
 	})
@@ -1,159 +1,28 @@
+// This file is a stub: the user_passwordresetcode table was dropped in the Ory
+// Kratos migration (phase 2), and password resets are now handled by Kratos.
+
 package handlers

 import (
 	"net/http"
-	"strconv"

 	"github.com/labstack/echo/v4"
 	"gorm.io/gorm"
-
-	"github.com/treytartt/honeydue-api/internal/admin/dto"
-	"github.com/treytartt/honeydue-api/internal/models"
 )

-// AdminPasswordResetCodeHandler handles admin password reset code management endpoints
+// AdminPasswordResetCodeHandler is a stub: each endpoint replies 410 Gone because Ory Kratos now manages password reset codes. db is stored but never read in this file.
 type AdminPasswordResetCodeHandler struct {
 	db *gorm.DB
 }

-// NewAdminPasswordResetCodeHandler creates a new admin password reset code handler
 func NewAdminPasswordResetCodeHandler(db *gorm.DB) *AdminPasswordResetCodeHandler {
 	return &AdminPasswordResetCodeHandler{db: db}
 }

-// PasswordResetCodeResponse represents a password reset code in API responses
-type PasswordResetCodeResponse struct {
-	ID          uint   `json:"id"`
-	UserID      uint   `json:"user_id"`
-	Username    string `json:"username"`
-	Email       string `json:"email"`
-	ResetToken  string `json:"reset_token"`
-	ExpiresAt   string `json:"expires_at"`
-	Used        bool   `json:"used"`
-	Attempts    int    `json:"attempts"`
-	MaxAttempts int    `json:"max_attempts"`
-	CreatedAt   string `json:"created_at"`
-}
-
-// List handles GET /api/admin/password-reset-codes
-func (h *AdminPasswordResetCodeHandler) List(c echo.Context) error {
-	var filters dto.PaginationParams
-	if err := c.Bind(&filters); err != nil {
-		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid request body"})
-	}
-
-	var codes []models.PasswordResetCode
-	var total int64
-
-	query := h.db.Model(&models.PasswordResetCode{}).Preload("User")
-
-	// Apply search (search by user info or token)
-	if filters.Search != "" {
-		search := "%" + filters.Search + "%"
-		query = query.Joins("JOIN auth_user ON auth_user.id = user_passwordresetcode.user_id").
-			Where(
-				"auth_user.username ILIKE ? OR auth_user.email ILIKE ? OR user_passwordresetcode.reset_token ILIKE ?",
-				search, search, search,
-			)
-	}
-
-	// Get total count
-	query.Count(&total)
-
-	// Apply sorting (allowlist prevents SQL injection via sort_by parameter)
-	sortBy := filters.GetSafeSortBy([]string{
-		"id", "user_id", "created_at", "expires_at", "used",
-	}, "created_at")
-	query = query.Order(sortBy + " " + filters.GetSortDir())
-
-	// Apply pagination
-	query = query.Offset(filters.GetOffset()).Limit(filters.GetPerPage())
-
-	if err := query.Find(&codes).Error; err != nil {
-		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch password reset codes"})
-	}
-
-	// Build response
-	responses := make([]PasswordResetCodeResponse, len(codes))
-	for i, code := range codes {
-		responses[i] = PasswordResetCodeResponse{
-			ID:          code.ID,
-			UserID:      code.UserID,
-			Username:    code.User.Username,
-			Email:       code.User.Email,
-			ResetToken:  code.ResetToken[:8] + "..." + code.ResetToken[len(code.ResetToken)-4:], // Truncate for display
-			ExpiresAt:   code.ExpiresAt.Format("2006-01-02T15:04:05Z"),
-			Used:        code.Used,
-			Attempts:    code.Attempts,
-			MaxAttempts: code.MaxAttempts,
-			CreatedAt:   code.CreatedAt.Format("2006-01-02T15:04:05Z"),
-		}
-	}
-
-	return c.JSON(http.StatusOK, dto.NewPaginatedResponse(responses, total, filters.GetPage(), filters.GetPerPage()))
-}
-
-// Get handles GET /api/admin/password-reset-codes/:id
-func (h *AdminPasswordResetCodeHandler) Get(c echo.Context) error {
-	id, err := strconv.ParseUint(c.Param("id"), 10, 32)
-	if err != nil {
-		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid ID"})
-	}
-
-	var code models.PasswordResetCode
-	if err := h.db.Preload("User").First(&code, id).Error; err != nil {
-		if err == gorm.ErrRecordNotFound {
-			return c.JSON(http.StatusNotFound, map[string]interface{}{"error": "Password reset code not found"})
-		}
-		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch password reset code"})
-	}
-
-	response := PasswordResetCodeResponse{
-		ID:          code.ID,
-		UserID:      code.UserID,
-		Username:    code.User.Username,
-		Email:       code.User.Email,
-		ResetToken:  code.ResetToken[:8] + "..." + code.ResetToken[len(code.ResetToken)-4:],
-		ExpiresAt:   code.ExpiresAt.Format("2006-01-02T15:04:05Z"),
-		Used:        code.Used,
-		Attempts:    code.Attempts,
-		MaxAttempts: code.MaxAttempts,
-		CreatedAt:   code.CreatedAt.Format("2006-01-02T15:04:05Z"),
-	}
-
-	return c.JSON(http.StatusOK, response)
-}
-
-// Delete handles DELETE /api/admin/password-reset-codes/:id
-func (h *AdminPasswordResetCodeHandler) Delete(c echo.Context) error {
-	id, err := strconv.ParseUint(c.Param("id"), 10, 32)
-	if err != nil {
-		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid ID"})
-	}
-
-	result := h.db.Delete(&models.PasswordResetCode{}, id)
-	if result.Error != nil {
-		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to delete password reset code"})
-	}
-
-	if result.RowsAffected == 0 {
-		return c.JSON(http.StatusNotFound, map[string]interface{}{"error": "Password reset code not found"})
-	}
-
-	return c.JSON(http.StatusOK, map[string]interface{}{"message": "Password reset code deleted successfully"})
-}
-
-// BulkDelete handles DELETE /api/admin/password-reset-codes/bulk
-func (h *AdminPasswordResetCodeHandler) BulkDelete(c echo.Context) error {
-	var req dto.BulkDeleteRequest
-	if err := c.Bind(&req); err != nil {
-		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid request body"})
-	}
-
-	result := h.db.Where("id IN ?", req.IDs).Delete(&models.PasswordResetCode{})
-	if result.Error != nil {
-		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to delete password reset codes"})
-	}
-
-	return c.JSON(http.StatusOK, map[string]interface{}{"message": "Password reset codes deleted successfully", "count": result.RowsAffected})
+func (h *AdminPasswordResetCodeHandler) gone(c echo.Context) error {
+	return c.JSON(http.StatusGone, map[string]string{"message": "password reset codes are managed by Ory Kratos"})
 }
+func (h *AdminPasswordResetCodeHandler) List(c echo.Context) error       { return h.gone(c) }
+func (h *AdminPasswordResetCodeHandler) Get(c echo.Context) error        { return h.gone(c) }
+func (h *AdminPasswordResetCodeHandler) Delete(c echo.Context) error     { return h.gone(c) }
+func (h *AdminPasswordResetCodeHandler) BulkDelete(c echo.Context) error { return h.gone(c) }
@@ -18,12 +18,14 @@ import (

 // AdminSettingsHandler handles system settings management
 type AdminSettingsHandler struct {
-	db *gorm.DB
+	db    *gorm.DB
+	cache *services.CacheService
 }

-// NewAdminSettingsHandler creates a new handler
-func NewAdminSettingsHandler(db *gorm.DB) *AdminSettingsHandler {
-	return &AdminSettingsHandler{db: db}
+// NewAdminSettingsHandler creates a new handler. The cache may be nil; the
+// handler falls through to direct DB reads in that case.
+func NewAdminSettingsHandler(db *gorm.DB, cache *services.CacheService) *AdminSettingsHandler {
+	return &AdminSettingsHandler{db: db, cache: cache}
 }

 // SettingsResponse represents the settings response
@@ -34,10 +36,29 @@ type SettingsResponse struct {
 	TrialDurationDays int  `json:"trial_duration_days"`
 }

-// GetSettings handles GET /api/admin/settings
+// GetSettings handles GET /api/admin/settings.
+//
+// Reads through Redis (30-min TTL) before hitting Postgres so the same
+// row that's checked on every authed request and every monitoring poll
+// stays hot. Cache miss / first boot creates and caches the default row.
 func (h *AdminSettingsHandler) GetSettings(c echo.Context) error {
+	ctx := c.Request().Context()
+
+	// Try cache first.
+	if h.cache != nil {
+		var cached models.SubscriptionSettings
+		if err := h.cache.GetCachedSubscriptionSettings(ctx, &cached); err == nil {
+			return c.JSON(http.StatusOK, SettingsResponse{
+				EnableLimitations: cached.EnableLimitations,
+				EnableMonitoring:  cached.EnableMonitoring,
+				TrialEnabled:      cached.TrialEnabled,
+				TrialDurationDays: cached.TrialDurationDays,
+			})
+		}
+	}
+
 	var settings models.SubscriptionSettings
-	if err := h.db.First(&settings, 1).Error; err != nil {
+	if err := h.db.WithContext(ctx).First(&settings, 1).Error; err != nil {
 		if err == gorm.ErrRecordNotFound {
 			// Create default settings
 			settings = models.SubscriptionSettings{
@@ -47,7 +68,7 @@ func (h *AdminSettingsHandler) GetSettings(c echo.Context) error {
 				TrialEnabled:      true,
 				TrialDurationDays: 14,
 			}
-			if err := h.db.Create(&settings).Error; err != nil {
+			if err := h.db.WithContext(ctx).Create(&settings).Error; err != nil {
 				return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to create default settings"})
 			}
 		} else {
@@ -55,6 +76,10 @@ func (h *AdminSettingsHandler) GetSettings(c echo.Context) error {
 		}
 	}

+	if h.cache != nil {
+		_ = h.cache.CacheSubscriptionSettings(ctx, &settings)
+	}
+
 	return c.JSON(http.StatusOK, SettingsResponse{
 		EnableLimitations: settings.EnableLimitations,
 		EnableMonitoring:  settings.EnableMonitoring,
@@ -79,7 +104,7 @@ func (h *AdminSettingsHandler) UpdateSettings(c echo.Context) error {
 	}

 	var settings models.SubscriptionSettings
-	if err := h.db.First(&settings, 1).Error; err != nil {
+	if err := h.db.WithContext(c.Request().Context()).First(&settings, 1).Error; err != nil {
 		if err == gorm.ErrRecordNotFound {
 			settings = models.SubscriptionSettings{
 				ID:                1,
@@ -108,10 +133,16 @@ func (h *AdminSettingsHandler) UpdateSettings(c echo.Context) error {
 		settings.TrialDurationDays = *req.TrialDurationDays
 	}

-	if err := h.db.Save(&settings).Error; err != nil {
+	if err := h.db.WithContext(c.Request().Context()).Save(&settings).Error; err != nil {
 		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to update settings"})
 	}

+	// Invalidate the cache so all pods pick up the new value on their
+	// next read (instead of waiting for the 30-min TTL).
+	if h.cache != nil {
+		_ = h.cache.InvalidateSubscriptionSettings(c.Request().Context())
+	}
+
 	return c.JSON(http.StatusOK, SettingsResponse{
 		EnableLimitations: settings.EnableLimitations,
 		EnableMonitoring:  settings.EnableMonitoring,
@@ -217,137 +248,20 @@ func (h *AdminSettingsHandler) cacheAllLookups(ctx context.Context) (bool, error
 	}
 	log.Debug().Int("count", len(taskTemplates)).Msg("Cached task templates")

-	// Build and cache the unified seeded data response
-	// Import the grouped response type
-	seededData := map[string]interface{}{
-		"residence_types":        residenceTypes,
-		"task_categories":        categories,
-		"task_priorities":        priorities,
-		"task_frequencies":       frequencies,
-		"contractor_specialties": specialties,
-		"task_templates":         buildGroupedTemplates(taskTemplates),
+	// Invalidate the unified seeded-data cache for every locale. The combined
+	// response is localized (lookup display_name + home-profile options) and is
+	// rebuilt per-locale on demand by the static_data handler, so the correct
+	// action after a lookup change is to clear all language variants rather than
+	// pre-warm a single (non-localized) blob.
+	if err := cache.InvalidateSeededData(ctx); err != nil {
+		return false, fmt.Errorf("failed to invalidate seeded data: %w", err)
 	}
-
-	etag, err := cache.CacheSeededData(ctx, seededData)
-	if err != nil {
-		return false, fmt.Errorf("failed to cache seeded data: %w", err)
-	}
-	log.Debug().Str("etag", etag).Msg("Cached unified seeded data")
+	log.Debug().Msg("Invalidated per-locale seeded data cache")

 	log.Info().Msg("All lookup data cached in Redis successfully")
 	return true, nil
 }

-// buildGroupedTemplates groups task templates by category for the seeded data response
-func buildGroupedTemplates(templates []models.TaskTemplate) map[string]interface{} {
-	type templateResponse struct {
-		ID           uint                   `json:"id"`
-		Title        string                 `json:"title"`
-		Description  string                 `json:"description"`
-		CategoryID   *uint                  `json:"category_id"`
-		Category     map[string]interface{} `json:"category,omitempty"`
-		FrequencyID  *uint                  `json:"frequency_id"`
-		Frequency    map[string]interface{} `json:"frequency,omitempty"`
-		IconIOS      string                 `json:"icon_ios"`
-		IconAndroid  string                 `json:"icon_android"`
-		Tags         []string               `json:"tags"`
-		DisplayOrder int                    `json:"display_order"`
-		IsActive     bool                   `json:"is_active"`
-	}
-
-	type categoryGroup struct {
-		CategoryName string             `json:"category_name"`
-		CategoryID   *uint              `json:"category_id"`
-		Templates    []templateResponse `json:"templates"`
-		Count        int                `json:"count"`
-	}
-
-	categoryMap := make(map[string]*categoryGroup)
-	categoryOrder := []string{}
-
-	for _, t := range templates {
-		categoryName := "Uncategorized"
-		var categoryID *uint
-		if t.Category != nil {
-			categoryName = t.Category.Name
-			categoryID = &t.Category.ID
-		}
-
-		if _, exists := categoryMap[categoryName]; !exists {
-			categoryMap[categoryName] = &categoryGroup{
-				CategoryName: categoryName,
-				CategoryID:   categoryID,
-				Templates:    []templateResponse{},
-			}
-			categoryOrder = append(categoryOrder, categoryName)
-		}
-
-		resp := templateResponse{
-			ID:           t.ID,
-			Title:        t.Title,
-			Description:  t.Description,
-			CategoryID:   t.CategoryID,
-			FrequencyID:  t.FrequencyID,
-			IconIOS:      t.IconIOS,
-			IconAndroid:  t.IconAndroid,
-			Tags:         parseTags(t.Tags),
-			DisplayOrder: t.DisplayOrder,
-			IsActive:     t.IsActive,
-		}
-
-		if t.Category != nil {
-			resp.Category = map[string]interface{}{
-				"id":            t.Category.ID,
-				"name":          t.Category.Name,
-				"description":   t.Category.Description,
-				"icon":          t.Category.Icon,
-				"color":         t.Category.Color,
-				"display_order": t.Category.DisplayOrder,
-			}
-		}
-		if t.Frequency != nil {
-			resp.Frequency = map[string]interface{}{
-				"id":            t.Frequency.ID,
-				"name":          t.Frequency.Name,
-				"days":          t.Frequency.Days,
-				"display_order": t.Frequency.DisplayOrder,
-			}
-		}
-
-		categoryMap[categoryName].Templates = append(categoryMap[categoryName].Templates, resp)
-	}
-
-	categories := make([]categoryGroup, len(categoryOrder))
-	totalCount := 0
-	for i, name := range categoryOrder {
-		group := categoryMap[name]
-		group.Count = len(group.Templates)
-		totalCount += group.Count
-		categories[i] = *group
-	}
-
-	return map[string]interface{}{
-		"categories":  categories,
-		"total_count": totalCount,
-	}
-}
-
-// parseTags splits a comma-separated tags string into a slice
-func parseTags(tags string) []string {
-	if tags == "" {
-		return []string{}
-	}
-	parts := strings.Split(tags, ",")
-	result := make([]string, 0, len(parts))
-	for _, p := range parts {
-		trimmed := strings.TrimSpace(p)
-		if trimmed != "" {
-			result = append(result, trimmed)
-		}
-	}
-	return result
-}
-
 // SeedTestData handles POST /api/admin/settings/seed-test-data
 func (h *AdminSettingsHandler) SeedTestData(c echo.Context) error {
 	if err := h.runSeedFile("002_test_data.sql"); err != nil {
@@ -487,9 +401,9 @@ type ClearAllDataResponse struct {

 // ClearStuckJobsResponse represents the response after clearing stuck Redis jobs
 type ClearStuckJobsResponse struct {
-	Message      string   `json:"message"`
-	KeysDeleted  int      `json:"keys_deleted"`
-	DeletedKeys  []string `json:"deleted_keys"`
+	Message     string   `json:"message"`
+	KeysDeleted int      `json:"keys_deleted"`
+	DeletedKeys []string `json:"deleted_keys"`
 }

 // ClearStuckJobs handles POST /api/admin/settings/clear-stuck-jobs
@@ -507,9 +421,9 @@ func (h *AdminSettingsHandler) ClearStuckJobs(c echo.Context) error {

 	// Patterns for asynq job keys that can get stuck
 	patterns := []string{
-		"asynq:{default}:retry",           // Retry queue
-		"asynq:{default}:archived",        // Archived/dead jobs
-		"asynq:{default}:t:*",             // Individual task metadata
+		"asynq:{default}:retry",    // Retry queue
+		"asynq:{default}:archived", // Archived/dead jobs
+		"asynq:{default}:t:*",      // Individual task metadata
 	}

 	for _, pattern := range patterns {
@@ -207,9 +207,7 @@ func (h *AdminUserHandler) Create(c echo.Context) error {
 		user.IsSuperuser = *req.IsSuperuser
 	}

-	if err := user.SetPassword(req.Password); err != nil {
-		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to hash password"})
-	}
+	// Password management is handled by Ory Kratos; no local password hashing.

 	if err := h.db.Create(&user).Error; err != nil {
 		return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to create user"})
@@ -284,10 +282,9 @@ func (h *AdminUserHandler) Update(c echo.Context) error {
 	if req.IsSuperuser != nil {
 		user.IsSuperuser = *req.IsSuperuser
 	}
-	if req.Password != nil {
-		if err := user.SetPassword(*req.Password); err != nil {
-			return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to hash password"})
-		}
+	// Passwords are owned by Ory Kratos: reject a non-empty password instead of dropping it and reporting success.
+	if req.Password != nil && *req.Password != "" {
+		return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Password changes must go through the Kratos admin API"})
 	}

 	if err := h.db.Save(&user).Error; err != nil {
@@ -25,6 +25,7 @@ type Dependencies struct {
 	PushClient          *push.Client
 	OnboardingService   *services.OnboardingEmailService
 	MonitoringHandler   *monitoring.Handler
+	CacheService        *services.CacheService
 }

 // SetupRoutes configures all admin routes
@@ -380,7 +381,7 @@ func SetupRoutes(router *echo.Echo, db *gorm.DB, cfg *config.Config, deps *Depen
 			}

 			// System settings management (super admin only)
-			settingsHandler := handlers.NewAdminSettingsHandler(db)
+			settingsHandler := handlers.NewAdminSettingsHandler(db, deps.CacheService)
 			settings := protected.Group("/settings")
 			settings.Use(middleware.RequireSuperAdmin())
 			{
@@ -394,7 +395,7 @@ func SetupRoutes(router *echo.Echo, db *gorm.DB, cfg *config.Config, deps *Depen
 			}

 			// Limitations management (tier limits, upgrade triggers)
-			limitationsHandler := handlers.NewAdminLimitationsHandler(db)
+			limitationsHandler := handlers.NewAdminLimitationsHandler(db, deps.CacheService)
 			limitations := protected.Group("/limitations")
 			{
 				// Settings (enable_limitations toggle)
@@ -1,6 +1,7 @@
 package config

 import (
+	"crypto/rand"
 	"encoding/hex"
 	"fmt"
 	"net/url"
@@ -52,6 +53,7 @@ type DatabaseConfig struct {
 	MaxOpenConns int
 	MaxIdleConns int
 	MaxLifetime  time.Duration
+	MaxIdleTime  time.Duration
 }

 type RedisConfig struct {
@@ -88,8 +90,12 @@ type PushConfig struct {
 }

 type AppleAuthConfig struct {
-	ClientID string // Bundle ID (e.g., com.tt.honeyDue.honeyDueDev)
-	TeamID   string // Apple Developer Team ID
+	ClientID string // Bundle ID, used as the `aud` claim in Sign in with Apple identity tokens
+	// TeamID is currently unused — services/apple_auth.go validates identity tokens
+	// against ClientID + Apple's JWKS only, with no server-to-server REST calls.
+	// Wire this in if/when token revocation or refresh-token exchange is added,
+	// since both require signing a client_secret JWT with team_id + key_id.
+	TeamID string
 }

 type GoogleAuthConfig struct {
@@ -136,6 +142,13 @@ type SecurityConfig struct {
 	MaxPasswordResetRate int // per hour
 	TokenExpiryDays      int // Number of days before auth tokens expire (default 90)
 	TokenRefreshDays     int // Token must be at least this many days old before refresh (default 60)
+	// KratosPublicURL is the Ory Kratos public API base URL. The auth
+	// middleware validates sessions against {KratosPublicURL}/sessions/whoami.
+	KratosPublicURL string
+	// KratosAdminURL is the Ory Kratos admin API base URL. Account deletion
+	// removes the user's Kratos identity via
+	// {KratosAdminURL}/admin/identities/{id}.
+	KratosAdminURL string
 }

 // StorageConfig holds file storage settings.
@@ -177,8 +190,8 @@ type FeatureFlags struct {
 }

 var (
-	cfg     *Config
-	cfgOnce sync.Once
+	cfg   *Config
+	cfgMu sync.Mutex
 )

 // knownWeakSecretKeys contains well-known default or placeholder secret keys
@@ -191,162 +204,170 @@ var knownWeakSecretKeys = map[string]bool{
 	"change-me-in-production-secret-key-12345": true,
 }

-// Load reads configuration from environment variables
+// Load reads configuration from environment variables.
+//
+// Caches the result so repeated calls are cheap. On validation failure, the
+// cache stays nil so a subsequent call (after env is corrected) can retry. The
+// previous implementation used sync.Once with an in-Do reset of the Once
+// itself, which races and panics with "sync: unlock of unlocked mutex".
 func Load() (*Config, error) {
-	var loadErr error
-
-	cfgOnce.Do(func() {
-		viper.SetEnvPrefix("")
-		viper.AutomaticEnv()
-		viper.SetEnvKeyReplacer(strings.NewReplacer(".", "_"))
-
-		// Set defaults
-		setDefaults()
-
-		// Parse DATABASE_URL if set (Dokku-style)
-		dbConfig := DatabaseConfig{
-			Host:         viper.GetString("DB_HOST"),
-			Port:         viper.GetInt("DB_PORT"),
-			User:         viper.GetString("POSTGRES_USER"),
-			Password:     viper.GetString("POSTGRES_PASSWORD"),
-			Database:     viper.GetString("POSTGRES_DB"),
-			SSLMode:      viper.GetString("DB_SSLMODE"),
-			MaxOpenConns: viper.GetInt("DB_MAX_OPEN_CONNS"),
-			MaxIdleConns: viper.GetInt("DB_MAX_IDLE_CONNS"),
-			MaxLifetime:  viper.GetDuration("DB_MAX_LIFETIME"),
-		}
-
-		// Override with DATABASE_URL if present (F-16: log warning on parse failure)
-		if databaseURL := viper.GetString("DATABASE_URL"); databaseURL != "" {
-			parsed, err := parseDatabaseURL(databaseURL)
-			if err != nil {
-				maskedURL := MaskURLCredentials(databaseURL)
-				fmt.Printf("WARNING: Failed to parse DATABASE_URL (%s): %v — falling back to individual DB_* env vars\n", maskedURL, err)
-			} else {
-				dbConfig.Host = parsed.Host
-				dbConfig.Port = parsed.Port
-				dbConfig.User = parsed.User
-				dbConfig.Password = parsed.Password
-				dbConfig.Database = parsed.Database
-				if parsed.SSLMode != "" {
-					dbConfig.SSLMode = parsed.SSLMode
-				}
-			}
-		}
-
-		cfg = &Config{
-			Server: ServerConfig{
-				Port:               viper.GetInt("PORT"),
-				Debug:              viper.GetBool("DEBUG"),
-				DebugFixedCodes:    viper.GetBool("DEBUG_FIXED_CODES"),
-				AllowedHosts:       strings.Split(viper.GetString("ALLOWED_HOSTS"), ","),
-				CorsAllowedOrigins: parseCorsOrigins(viper.GetString("CORS_ALLOWED_ORIGINS")),
-				Timezone:           viper.GetString("TIMEZONE"),
-				StaticDir:          viper.GetString("STATIC_DIR"),
-				BaseURL:            viper.GetString("BASE_URL"),
-			},
-			Database: dbConfig,
-			Redis: RedisConfig{
-				URL:      viper.GetString("REDIS_URL"),
-				Password: viper.GetString("REDIS_PASSWORD"),
-				DB:       viper.GetInt("REDIS_DB"),
-			},
-			Email: EmailConfig{
-				Host:     viper.GetString("EMAIL_HOST"),
-				Port:     viper.GetInt("EMAIL_PORT"),
-				User:     viper.GetString("EMAIL_HOST_USER"),
-				Password: viper.GetString("EMAIL_HOST_PASSWORD"),
-				From:     viper.GetString("DEFAULT_FROM_EMAIL"),
-				UseTLS:   viper.GetBool("EMAIL_USE_TLS"),
-			},
-			Push: PushConfig{
-				APNSKeyPath:           viper.GetString("APNS_AUTH_KEY_PATH"),
-				APNSKeyID:             viper.GetString("APNS_AUTH_KEY_ID"),
-				APNSTeamID:            viper.GetString("APNS_TEAM_ID"),
-				APNSTopic:             viper.GetString("APNS_TOPIC"),
-				APNSSandbox:           viper.GetBool("APNS_USE_SANDBOX"),
-				APNSProduction:        viper.GetBool("APNS_PRODUCTION"),
-				FCMProjectID:          viper.GetString("FCM_PROJECT_ID"),
-				FCMServiceAccountPath: viper.GetString("FCM_SERVICE_ACCOUNT_PATH"),
-				FCMServiceAccountJSON: viper.GetString("FCM_SERVICE_ACCOUNT_JSON"),
-				FCMServerKey:          viper.GetString("FCM_SERVER_KEY"),
-			},
-			Worker: WorkerConfig{
-				TaskReminderHour:    viper.GetInt("TASK_REMINDER_HOUR"),
-				OverdueReminderHour: viper.GetInt("OVERDUE_REMINDER_HOUR"),
-				DailyNotifHour:      viper.GetInt("DAILY_DIGEST_HOUR"),
-			},
-			Security: SecurityConfig{
-				SecretKey:            viper.GetString("SECRET_KEY"),
-				TokenCacheTTL:        5 * time.Minute,
-				PasswordResetExpiry:  15 * time.Minute,
-				ConfirmationExpiry:   24 * time.Hour,
-				MaxPasswordResetRate: 3,
-				TokenExpiryDays:      viper.GetInt("TOKEN_EXPIRY_DAYS"),
-				TokenRefreshDays:     viper.GetInt("TOKEN_REFRESH_DAYS"),
-			},
-			Storage: StorageConfig{
-				UploadDir:     viper.GetString("STORAGE_UPLOAD_DIR"),
-				BaseURL:       viper.GetString("STORAGE_BASE_URL"),
-				S3Endpoint:    viper.GetString("B2_ENDPOINT"),
-				S3KeyID:       viper.GetString("B2_KEY_ID"),
-				S3AppKey:      viper.GetString("B2_APP_KEY"),
-				S3Bucket:      viper.GetString("B2_BUCKET_NAME"),
-				S3UseSSL:      viper.GetString("STORAGE_USE_SSL") == "" || viper.GetBool("STORAGE_USE_SSL"),
-				S3Region:      viper.GetString("B2_REGION"),
-				MaxFileSize:   viper.GetInt64("STORAGE_MAX_FILE_SIZE"),
-				AllowedTypes:  viper.GetString("STORAGE_ALLOWED_TYPES"),
-				EncryptionKey: viper.GetString("STORAGE_ENCRYPTION_KEY"),
-			},
-			AppleAuth: AppleAuthConfig{
-				ClientID: viper.GetString("APPLE_CLIENT_ID"),
-				TeamID:   viper.GetString("APPLE_TEAM_ID"),
-			},
-			GoogleAuth: GoogleAuthConfig{
-				ClientID:        viper.GetString("GOOGLE_CLIENT_ID"),
-				AndroidClientID: viper.GetString("GOOGLE_ANDROID_CLIENT_ID"),
-				IOSClientID:     viper.GetString("GOOGLE_IOS_CLIENT_ID"),
-			},
-			AppleIAP: AppleIAPConfig{
-				KeyPath:  viper.GetString("APPLE_IAP_KEY_PATH"),
-				KeyID:    viper.GetString("APPLE_IAP_KEY_ID"),
-				IssuerID: viper.GetString("APPLE_IAP_ISSUER_ID"),
-				BundleID: viper.GetString("APPLE_IAP_BUNDLE_ID"),
-				Sandbox:  viper.GetBool("APPLE_IAP_SANDBOX"),
-			},
-			GoogleIAP: GoogleIAPConfig{
-				ServiceAccountPath: viper.GetString("GOOGLE_IAP_SERVICE_ACCOUNT_PATH"),
-				PackageName:        viper.GetString("GOOGLE_IAP_PACKAGE_NAME"),
-			},
-			Stripe: StripeConfig{
-				SecretKey:     viper.GetString("STRIPE_SECRET_KEY"),
-				WebhookSecret: viper.GetString("STRIPE_WEBHOOK_SECRET"),
-				PriceMonthly:  viper.GetString("STRIPE_PRICE_MONTHLY"),
-				PriceYearly:   viper.GetString("STRIPE_PRICE_YEARLY"),
-			},
-			Features: FeatureFlags{
-				PushEnabled:             viper.GetBool("FEATURE_PUSH_ENABLED"),
-				EmailEnabled:            viper.GetBool("FEATURE_EMAIL_ENABLED"),
-				WebhooksEnabled:         viper.GetBool("FEATURE_WEBHOOKS_ENABLED"),
-				OnboardingEmailsEnabled: viper.GetBool("FEATURE_ONBOARDING_EMAILS_ENABLED"),
-				PDFReportsEnabled:       viper.GetBool("FEATURE_PDF_REPORTS_ENABLED"),
-				WorkerEnabled:           viper.GetBool("FEATURE_WORKER_ENABLED"),
-			},
-		}
-
-		// Validate required fields
-		if err := validate(cfg); err != nil {
-			loadErr = err
-			// Reset so a subsequent call can retry after env is fixed
-			cfg = nil
-			cfgOnce = sync.Once{}
-		}
-	})
-
-	if loadErr != nil {
-		return nil, loadErr
+	cfgMu.Lock()
+	defer cfgMu.Unlock()
+	if cfg != nil {
+		return cfg, nil
 	}

+	viper.SetEnvPrefix("")
+	viper.AutomaticEnv()
+	viper.SetEnvKeyReplacer(strings.NewReplacer(".", "_"))
+
+	// Set defaults
+	setDefaults()
+
+	// Audit F8: overlay file-mounted secrets onto Viper. No-op when the
+	// directory is absent (local/dev), so this is safe to ship before the
+	// manifests mount honeydue-secrets as a volume.
+	loadFileSecrets()
+
+	// Parse DATABASE_URL if set (Dokku-style)
+	dbConfig := DatabaseConfig{
+		Host:         viper.GetString("DB_HOST"),
+		Port:         viper.GetInt("DB_PORT"),
+		User:         viper.GetString("POSTGRES_USER"),
+		Password:     viper.GetString("POSTGRES_PASSWORD"),
+		Database:     viper.GetString("POSTGRES_DB"),
+		SSLMode:      viper.GetString("DB_SSLMODE"),
+		MaxOpenConns: viper.GetInt("DB_MAX_OPEN_CONNS"),
+		MaxIdleConns: viper.GetInt("DB_MAX_IDLE_CONNS"),
+		MaxLifetime:  viper.GetDuration("DB_MAX_LIFETIME"),
+		MaxIdleTime:  viper.GetDuration("DB_MAX_IDLE_TIME"),
+	}
+
+	// Override with DATABASE_URL if present (F-16: log warning on parse failure)
+	if databaseURL := viper.GetString("DATABASE_URL"); databaseURL != "" {
+		parsed, err := parseDatabaseURL(databaseURL)
+		if err != nil {
+			maskedURL := MaskURLCredentials(databaseURL)
+			fmt.Printf("WARNING: Failed to parse DATABASE_URL (%s): %v — falling back to individual DB_* env vars\n", maskedURL, err)
+		} else {
+			dbConfig.Host = parsed.Host
+			dbConfig.Port = parsed.Port
+			dbConfig.User = parsed.User
+			dbConfig.Password = parsed.Password
+			dbConfig.Database = parsed.Database
+			if parsed.SSLMode != "" {
+				dbConfig.SSLMode = parsed.SSLMode
+			}
+		}
+	}
+
+	c := &Config{
+		Server: ServerConfig{
+			Port:               viper.GetInt("PORT"),
+			Debug:              viper.GetBool("DEBUG"),
+			DebugFixedCodes:    viper.GetBool("DEBUG_FIXED_CODES"),
+			AllowedHosts:       strings.Split(viper.GetString("ALLOWED_HOSTS"), ","),
+			CorsAllowedOrigins: parseCorsOrigins(viper.GetString("CORS_ALLOWED_ORIGINS")),
+			Timezone:           viper.GetString("TIMEZONE"),
+			StaticDir:          viper.GetString("STATIC_DIR"),
+			BaseURL:            viper.GetString("BASE_URL"),
+		},
+		Database: dbConfig,
+		Redis: RedisConfig{
+			URL:      viper.GetString("REDIS_URL"),
+			Password: viper.GetString("REDIS_PASSWORD"),
+			DB:       viper.GetInt("REDIS_DB"),
+		},
+		Email: EmailConfig{
+			Host:     viper.GetString("EMAIL_HOST"),
+			Port:     viper.GetInt("EMAIL_PORT"),
+			User:     viper.GetString("EMAIL_HOST_USER"),
+			Password: viper.GetString("EMAIL_HOST_PASSWORD"),
+			From:     viper.GetString("DEFAULT_FROM_EMAIL"),
+			UseTLS:   viper.GetBool("EMAIL_USE_TLS"),
+		},
+		Push: PushConfig{
+			APNSKeyPath:           viper.GetString("APNS_AUTH_KEY_PATH"),
+			APNSKeyID:             viper.GetString("APNS_AUTH_KEY_ID"),
+			APNSTeamID:            viper.GetString("APNS_TEAM_ID"),
+			APNSTopic:             viper.GetString("APNS_TOPIC"),
+			APNSSandbox:           viper.GetBool("APNS_USE_SANDBOX"),
+			APNSProduction:        viper.GetBool("APNS_PRODUCTION"),
+			FCMProjectID:          viper.GetString("FCM_PROJECT_ID"),
+			FCMServiceAccountPath: viper.GetString("FCM_SERVICE_ACCOUNT_PATH"),
+			FCMServiceAccountJSON: viper.GetString("FCM_SERVICE_ACCOUNT_JSON"),
+			FCMServerKey:          viper.GetString("FCM_SERVER_KEY"),
+		},
+		Worker: WorkerConfig{
+			TaskReminderHour:    viper.GetInt("TASK_REMINDER_HOUR"),
+			OverdueReminderHour: viper.GetInt("OVERDUE_REMINDER_HOUR"),
+			DailyNotifHour:      viper.GetInt("DAILY_DIGEST_HOUR"),
+		},
+		Security: SecurityConfig{
+			SecretKey:            viper.GetString("SECRET_KEY"),
+			TokenCacheTTL:        5 * time.Minute,
+			PasswordResetExpiry:  15 * time.Minute,
+			ConfirmationExpiry:   24 * time.Hour,
+			MaxPasswordResetRate: 3,
+			TokenExpiryDays:      viper.GetInt("TOKEN_EXPIRY_DAYS"),
+			TokenRefreshDays:     viper.GetInt("TOKEN_REFRESH_DAYS"),
+			KratosPublicURL:      viper.GetString("KRATOS_PUBLIC_URL"),
+			KratosAdminURL:       viper.GetString("KRATOS_ADMIN_URL"),
+		},
+		Storage: StorageConfig{
+			UploadDir:     viper.GetString("STORAGE_UPLOAD_DIR"),
+			BaseURL:       viper.GetString("STORAGE_BASE_URL"),
+			S3Endpoint:    viper.GetString("B2_ENDPOINT"),
+			S3KeyID:       viper.GetString("B2_KEY_ID"),
+			S3AppKey:      viper.GetString("B2_APP_KEY"),
+			S3Bucket:      viper.GetString("B2_BUCKET_NAME"),
+			S3UseSSL:      viper.GetString("STORAGE_USE_SSL") == "" || viper.GetBool("STORAGE_USE_SSL"),
+			S3Region:      viper.GetString("B2_REGION"),
+			MaxFileSize:   viper.GetInt64("STORAGE_MAX_FILE_SIZE"),
+			AllowedTypes:  viper.GetString("STORAGE_ALLOWED_TYPES"),
+			EncryptionKey: viper.GetString("STORAGE_ENCRYPTION_KEY"),
+		},
+		AppleAuth: AppleAuthConfig{
+			ClientID: viper.GetString("APPLE_CLIENT_ID"),
+			TeamID:   viper.GetString("APPLE_TEAM_ID"),
+		},
+		GoogleAuth: GoogleAuthConfig{
+			ClientID:        viper.GetString("GOOGLE_CLIENT_ID"),
+			AndroidClientID: viper.GetString("GOOGLE_ANDROID_CLIENT_ID"),
+			IOSClientID:     viper.GetString("GOOGLE_IOS_CLIENT_ID"),
+		},
+		AppleIAP: AppleIAPConfig{
+			KeyPath:  viper.GetString("APPLE_IAP_KEY_PATH"),
+			KeyID:    viper.GetString("APPLE_IAP_KEY_ID"),
+			IssuerID: viper.GetString("APPLE_IAP_ISSUER_ID"),
+			BundleID: viper.GetString("APPLE_IAP_BUNDLE_ID"),
+			Sandbox:  viper.GetBool("APPLE_IAP_SANDBOX"),
+		},
+		GoogleIAP: GoogleIAPConfig{
+			ServiceAccountPath: viper.GetString("GOOGLE_IAP_SERVICE_ACCOUNT_PATH"),
+			PackageName:        viper.GetString("GOOGLE_IAP_PACKAGE_NAME"),
+		},
+		Stripe: StripeConfig{
+			SecretKey:     viper.GetString("STRIPE_SECRET_KEY"),
+			WebhookSecret: viper.GetString("STRIPE_WEBHOOK_SECRET"),
+			PriceMonthly:  viper.GetString("STRIPE_PRICE_MONTHLY"),
+			PriceYearly:   viper.GetString("STRIPE_PRICE_YEARLY"),
+		},
+		Features: FeatureFlags{
+			PushEnabled:             viper.GetBool("FEATURE_PUSH_ENABLED"),
+			EmailEnabled:            viper.GetBool("FEATURE_EMAIL_ENABLED"),
+			WebhooksEnabled:         viper.GetBool("FEATURE_WEBHOOKS_ENABLED"),
+			OnboardingEmailsEnabled: viper.GetBool("FEATURE_ONBOARDING_EMAILS_ENABLED"),
+			PDFReportsEnabled:       viper.GetBool("FEATURE_PDF_REPORTS_ENABLED"),
+			WorkerEnabled:           viper.GetBool("FEATURE_WORKER_ENABLED"),
+		},
+	}
+
+	if err := validate(c); err != nil {
+		// Leave cfg nil so the next Load() retries after env is corrected.
+		return nil, err
+	}
+	cfg = c
 	return cfg, nil
 }

@@ -399,6 +420,8 @@ func setDefaults() {

 	// Token expiry defaults
 	viper.SetDefault("TOKEN_EXPIRY_DAYS", 90)  // Tokens expire after 90 days
+	viper.SetDefault("KRATOS_PUBLIC_URL", "http://kratos:4433") // Ory Kratos public API
+	viper.SetDefault("KRATOS_ADMIN_URL", "http://kratos:4434")  // Ory Kratos admin API
 	viper.SetDefault("TOKEN_REFRESH_DAYS", 60) // Tokens can be refreshed after 60 days

 	// Storage defaults
@@ -426,14 +449,67 @@ func isWeakSecretKey(key string) bool {
 	return knownWeakSecretKeys[strings.ToLower(strings.TrimSpace(key))]
 }

+// loadFileSecrets overlays file-mounted secrets onto Viper (audit F8). It
+// looks in $HONEYDUE_SECRETS_DIR, defaulting to /etc/honeydue/secrets (the
+// honeydue-secrets volume mount). Each listed key is read from a file of the
+// same name, trimmed, and applied with viper.Set (highest Viper precedence),
+// keeping it out of the process environment (/proc/<pid>/environ). Missing,
+// unreadable, or empty files are skipped silently and env vars still apply.
+func loadFileSecrets() {
+	dir := os.Getenv("HONEYDUE_SECRETS_DIR")
+	if dir == "" {
+		dir = "/etc/honeydue/secrets"
+	}
+	for _, k := range []string{
+		"POSTGRES_PASSWORD", "SECRET_KEY", "EMAIL_HOST_PASSWORD", "FCM_SERVER_KEY",
+		"REDIS_PASSWORD", "B2_KEY_ID", "B2_APP_KEY", "OBS_INGEST_TOKEN", "OBS_TRACES_URL",
+	} {
+		b, err := os.ReadFile(dir + "/" + k)
+		if err != nil {
+			continue
+		}
+		if v := strings.TrimSpace(string(b)); v != "" {
+			viper.Set(k, v)
+		}
+	}
+}
+
+// SecretValue resolves a configuration value that is not part of the typed
+// Config struct. It reads through Viper, so a value supplied via a file-mounted
+// secret (audit F8, loaded by loadFileSecrets) is found just like an env var.
+//
+// Must be called after Load(). Used by cmd/api and cmd/worker for the
+// observability endpoints, which are needed before the full Config is wired
+// and would otherwise be read with os.Getenv — which misses file-mounted
+// secrets entirely once F8 removes them from the process environment.
+func SecretValue(key string) string {
+	return viper.GetString(key)
+}
+
+// randomHexKey returns a cryptographically secure random hex string of 2n
+// characters (n random bytes), or the error from rand.Read if the read fails.
+func randomHexKey(n int) (string, error) {
+	b := make([]byte, n)
+	if _, err := rand.Read(b); err != nil {
+		return "", err
+	}
+	return hex.EncodeToString(b), nil
+}
+
 func validate(cfg *Config) error {
-	// S-08: Validate SECRET_KEY against known weak defaults
+	// M8: SECRET_KEY validation — no static fallback secret in the binary.
 	if cfg.Security.SecretKey == "" {
 		if cfg.Server.Debug {
-			// In debug mode, use a default key with a warning for local development
-			cfg.Security.SecretKey = "change-me-in-production-secret-key-12345"
-			fmt.Println("WARNING: SECRET_KEY not set, using default (debug mode only)")
-			fmt.Println("WARNING: *** DO NOT USE THIS DEFAULT KEY IN PRODUCTION ***")
+			// Debug only: generate a random key per boot. Tokens signed with
+			// it do not survive a restart, which is acceptable for local dev
+			// and far safer than a well-known hardcoded fallback.
+			randomKey, err := randomHexKey(32)
+			if err != nil {
+				return fmt.Errorf("failed to generate ephemeral debug SECRET_KEY: %w", err)
+			}
+			cfg.Security.SecretKey = randomKey
+			fmt.Println("WARNING: SECRET_KEY not set, generated an ephemeral random key (debug mode only)")
+			fmt.Println("WARNING: tokens will not survive a restart — set SECRET_KEY for stable local sessions")
 		} else {
 			// In production, refuse to start without a proper secret key
 			return fmt.Errorf("FATAL: SECRET_KEY environment variable is required in production (DEBUG=false)")
@@ -446,6 +522,12 @@ func validate(cfg *Config) error {
 		}
 	}

+	// C4: fixed confirmation codes ("123456") must never be enabled outside
+	// debug — with DEBUG=false they are a full authentication bypass.
+	if cfg.Server.DebugFixedCodes && !cfg.Server.Debug {
+		return fmt.Errorf("FATAL: DEBUG_FIXED_CODES is enabled with DEBUG=false — fixed confirmation codes must never run in production")
+	}
+
 	// Database password might come from DATABASE_URL, don't require it separately
 	// The actual connection will fail if credentials are wrong

@@ -1,7 +1,6 @@
 package config

 import (
-	"sync"
 	"testing"

 	"github.com/spf13/viper"
@@ -11,8 +10,9 @@ import (

 // resetConfigState resets the package-level singleton so each test starts fresh.
 func resetConfigState() {
+	cfgMu.Lock()
 	cfg = nil
-	cfgOnce = sync.Once{}
+	cfgMu.Unlock()
 	viper.Reset()
 }

@@ -106,8 +106,10 @@ func TestLoad_Validation_MissingSecretKey_DebugMode(t *testing.T) {

 	c, err := Load()
 	require.NoError(t, err)
-	// In debug mode, a default key is assigned
-	assert.Equal(t, "change-me-in-production-secret-key-12345", c.Security.SecretKey)
+	// Audit M8: in debug mode an ephemeral random key is generated per boot
+	// (no static fallback). It must be a non-empty 64-char hex string.
+	assert.Len(t, c.Security.SecretKey, 64)
+	assert.NotEqual(t, "change-me-in-production-secret-key-12345", c.Security.SecretKey)
 }

 func TestLoad_Validation_WeakSecretKey_Production(t *testing.T) {
@@ -133,6 +135,33 @@ func TestLoad_Validation_WeakSecretKey_DebugMode(t *testing.T) {
 	assert.Equal(t, "secret", c.Security.SecretKey)
 }

+// Audit C4: DEBUG_FIXED_CODES makes confirmation codes a fixed "123456" — a
+// full authentication bypass. With DEBUG=false, validate() must refuse to boot
+// rather than ship that bypass to production.
+func TestLoad_Validation_DebugFixedCodes_Production(t *testing.T) {
+	// Exercise validate() directly on a hand-built Config — no env vars or Load() singleton state needed.
+	cfg := &Config{
+		Server:   ServerConfig{Debug: false, DebugFixedCodes: true},
+		Security: SecurityConfig{SecretKey: "a-strong-secret-key-for-tests"},
+	}
+
+	err := validate(cfg)
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "DEBUG_FIXED_CODES")
+}
+
+// With DEBUG=true the fixed codes are an intended local-dev convenience, so
+// the same combination must NOT error.
+func TestLoad_Validation_DebugFixedCodes_DebugMode(t *testing.T) {
+	cfg := &Config{
+		Server:   ServerConfig{Debug: true, DebugFixedCodes: true},
+		Security: SecurityConfig{SecretKey: "a-strong-secret-key-for-tests"},
+	}
+
+	err := validate(cfg)
+	require.NoError(t, err)
+}
+
 func TestLoad_Validation_EncryptionKey_Valid(t *testing.T) {
 	resetConfigState()
 	t.Setenv("SECRET_KEY", "a-strong-secret-key-for-tests")
@@ -14,12 +14,10 @@ import (

 	"github.com/treytartt/honeydue-api/internal/config"
 	"github.com/treytartt/honeydue-api/internal/models"
-)
+	"github.com/treytartt/honeydue-api/internal/prom"

-// migrationAdvisoryLockKey is the pg_advisory_lock key that serializes
-// Migrate() across API replicas booting in parallel. Value is arbitrary but
-// stable ("hdmg" as bytes = honeydue migration).
-const migrationAdvisoryLockKey int64 = 0x68646d67
+	"github.com/uptrace/opentelemetry-go-extra/otelgorm"
+)

 // zerologGormWriter adapts zerolog for GORM's logger interface
 type zerologGormWriter struct{}
@@ -68,25 +66,84 @@ func Connect(cfg *config.DatabaseConfig, debug bool) (*gorm.DB, error) {
 		return nil, fmt.Errorf("failed to get underlying sql.DB: %w", err)
 	}

-	// Configure connection pool
+	// Configure connection pool. The Neon pooler endpoint keeps backend
+	// connections warm, so we keep our client-side pool warm too — that
+	// eliminates the ~440ms TCP+TLS+startup handshake on the first query
+	// after a cold pod / idle period.
 	sqlDB.SetMaxOpenConns(cfg.MaxOpenConns)
 	sqlDB.SetMaxIdleConns(cfg.MaxIdleConns)
 	sqlDB.SetConnMaxLifetime(cfg.MaxLifetime)
+	if cfg.MaxIdleTime > 0 {
+		sqlDB.SetConnMaxIdleTime(cfg.MaxIdleTime)
+	}
+	// MaxIdleTime=0 means "never close idle" — the pool fills up to
+	// MaxIdleConns and they stay alive until MaxLifetime expires.

 	// Test connection
 	if err := sqlDB.Ping(); err != nil {
 		return nil, fmt.Errorf("failed to ping database: %w", err)
 	}

+	// Eagerly warm the connection pool to MaxIdleConns. Without this, the
+	// first N user requests each pay the full handshake (~440ms over a
+	// transatlantic link). Pings are issued in parallel so warm-up is
+	// bounded by handshake time, not handshake-time × N.
+	warmUpPool(sqlDB, cfg.MaxIdleConns)
+
 	log.Info().
 		Str("host", cfg.Host).
 		Int("port", cfg.Port).
 		Str("database", cfg.Database).
 		Msg("Connected to PostgreSQL database")

+	// Register Prometheus GORM callbacks — emits gorm_query_duration_seconds
+	// for every SQL operation. Operates at the statement level, so does not
+	// require ctx to be threaded through repositories.
+	if err := prom.RegisterGORMCallbacks(db); err != nil {
+		log.Warn().Err(err).Msg("failed to register prometheus GORM callbacks; metrics will be partial")
+	}
+
+	// Register otelgorm plugin — emits a span per SQL statement, attached to
+	// whatever trace context is set via db.WithContext(ctx). Repositories that
+	// have been migrated to use WithContext (see internal/repositories/*.go)
+	// will produce nested SQL spans inside the request trace; pre-migration
+	// repositories silently emit untraced queries.
+	if err := db.Use(otelgorm.NewPlugin(otelgorm.WithDBName(cfg.Database))); err != nil {
+		log.Warn().Err(err).Msg("failed to register otelgorm plugin; SQL spans disabled")
+	}
+
 	return db, nil
 }

+// warmUpPool issues n parallel pings so the pool fills with established
+// connections before the first user request lands. Failed pings are not
+// fatal and their errors are dropped — only the success count is logged;
+// the pool will fill on demand under traffic if pre-warm fails. All pings
+// share one 5-second timeout. On a transatlantic link to Neon (~110ms RTT,
+// ~440ms cold handshake), this turns "first request pays the cold handshake"
+// into "first request finds a warm pool" — at ~440ms of pod startup cost.
+func warmUpPool(sqlDB interface {
+	PingContext(context.Context) error
+}, n int) {
+	if n <= 0 {
+		return
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	done := make(chan error, n)
+	for i := 0; i < n; i++ {
+		go func() { done <- sqlDB.PingContext(ctx) }()
+	}
+	successes := 0
+	for i := 0; i < n; i++ {
+		if err := <-done; err == nil {
+			successes++
+		}
+	}
+	log.Info().Int("requested", n).Int("warmed", successes).Msg("DB pool warm-up complete")
+}
+
 // Get returns the database instance
 func Get() *gorm.DB {
 	return db
@@ -127,52 +184,46 @@ func Paginate(page, pageSize int) func(db *gorm.DB) *gorm.DB {
 	}
 }

-// MigrateWithLock runs Migrate() under a Postgres session-level advisory lock
-// so that multiple API replicas booting in parallel don't race on AutoMigrate.
-// On non-Postgres dialects (sqlite in tests) it falls through to Migrate().
-func MigrateWithLock() error {
+// RequireSchemaApplied verifies that goose's version table exists and has
+// at least one applied entry. This is the fail-fast that runs at api/worker
+// boot: if the operator forgot to run the migrate Job, the pod refuses to
+// start with a clear error instead of throwing mysterious "relation does
+// not exist" errors deep in a request handler.
+//
+// On non-Postgres dialects (sqlite in tests) this is a no-op — tests use
+// AutoMigrate via testutil.SetupTestDB to create a fresh schema per run.
+// goose isn't involved in the test path.
+func RequireSchemaApplied() error {
 	if db == nil {
 		return fmt.Errorf("database not initialised")
 	}
 	if db.Dialector.Name() != "postgres" {
-		return Migrate()
+		return nil
 	}

-	sqlDB, err := db.DB()
+	// goose_db_version stores one row per applied migration, not a single
+	// "current version" row — so we read the most recently inserted row
+	// (ORDER BY id DESC LIMIT 1). NOTE(review): if the table exists but is
+	// empty, row stays zero-valued (VersionID=0, IsApplied=false), so the
+	// is_applied check below fires before the "is empty" check — confirm.
+	type migrationRow struct {
+		VersionID int64 `gorm:"column:version_id"`
+		IsApplied bool  `gorm:"column:is_applied"`
+	}
+
+	var row migrationRow
+	err := db.Raw(`SELECT version_id, is_applied FROM goose_db_version ORDER BY id DESC LIMIT 1`).Scan(&row).Error
 	if err != nil {
-		return fmt.Errorf("get underlying sql.DB: %w", err)
+		return fmt.Errorf("goose_db_version check failed (run the migrate Job to bootstrap): %w", err)
 	}
-
-	// Give ourselves up to 5 min to acquire the lock — long enough for a
-	// slow migration on a peer replica, short enough to fail fast if Postgres
-	// is hung.
-	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
-	defer cancel()
-
-	conn, err := sqlDB.Conn(ctx)
-	if err != nil {
-		return fmt.Errorf("acquire dedicated migration connection: %w", err)
+	if !row.IsApplied {
+		return fmt.Errorf("goose_db_version latest row is_applied=false at version=%d — last migration was rolled back or aborted; investigate before starting", row.VersionID)
 	}
-	defer conn.Close()
-
-	log.Info().Int64("lock_key", migrationAdvisoryLockKey).Msg("Acquiring migration advisory lock...")
-	if _, err := conn.ExecContext(ctx, "SELECT pg_advisory_lock($1)", migrationAdvisoryLockKey); err != nil {
-		return fmt.Errorf("pg_advisory_lock: %w", err)
+	if row.VersionID < 1 {
+		return fmt.Errorf("goose_db_version is empty — run goose up (or seed a row marking version 1 as applied if the schema already exists)")
 	}
-	log.Info().Msg("Migration advisory lock acquired")
-
-	defer func() {
-		// Unlock with a fresh context — the outer ctx may have expired.
-		unlockCtx, unlockCancel := context.WithTimeout(context.Background(), 10*time.Second)
-		defer unlockCancel()
-		if _, err := conn.ExecContext(unlockCtx, "SELECT pg_advisory_unlock($1)", migrationAdvisoryLockKey); err != nil {
-			log.Warn().Err(err).Msg("Failed to release migration advisory lock (session close will also release)")
-		} else {
-			log.Info().Msg("Migration advisory lock released")
-		}
-	}()
-
-	return Migrate()
+	log.Info().Int64("schema_version", row.VersionID).Msg("Schema precondition satisfied")
+	return nil
 }

 // Migrate runs database migrations for all models
@@ -193,12 +244,7 @@ func Migrate() error {

 		// User and auth tables
 		&models.User{},
-		&models.AuthToken{},
 		&models.UserProfile{},
-		&models.ConfirmationCode{},
-		&models.PasswordResetCode{},
-		&models.AppleSocialAuth{},
-		&models.GoogleSocialAuth{},

 		// Admin users (separate from app users)
 		&models.AdminUser{},
@@ -25,7 +25,12 @@ type CreateDocumentRequest struct {
 	SerialNumber  string               `json:"serial_number" validate:"max=100"`
 	ModelNumber   string               `json:"model_number" validate:"max=100"`
 	TaskID        *uint                `json:"task_id"`
-	ImageURLs     []string             `json:"image_urls" validate:"omitempty,max=20,dive,max=500"` // Multiple image URLs
+	// UploadIDs claims pending_uploads rows produced by the presigned-URL
+	// upload flow and turns them into document_image rows. UploadIDs of
+	// category "document_file" attach to the document's main FileURL +
+	// FileName fields instead — the service infers placement from the
+	// row's category.
+	UploadIDs     []uint               `json:"upload_ids" validate:"omitempty,max=20"`
 }

 // UpdateDocumentRequest represents the request to update a document
@@ -100,14 +100,20 @@ type UpdateTaskRequest struct {
 	ContractorID       *uint            `json:"contractor_id"`
 }

-// CreateTaskCompletionRequest represents the request to create a task completion
+// CreateTaskCompletionRequest represents the request to create a task completion.
+//
+// Image attachments arrive via the presigned-URL flow: the client uploads
+// each image directly to B2 (see /api/uploads/presign) and passes the
+// resulting pending_uploads.id values in UploadIDs. The service claims
+// those rows and creates the linked task_completion_image rows.
 type CreateTaskCompletionRequest struct {
 	TaskID      uint             `json:"task_id" validate:"required"`
 	CompletedAt *time.Time       `json:"completed_at"` // Defaults to now
 	Notes       string           `json:"notes" validate:"max=10000"`
 	ActualCost  *decimal.Decimal `json:"actual_cost"`
 	Rating      *int             `json:"rating" validate:"omitempty,min=1,max=5"` // 1-5 star rating
-	ImageURLs   []string         `json:"image_urls" validate:"omitempty,max=20,dive,max=500"` // Multiple image URLs
+
+	UploadIDs []uint `json:"upload_ids" validate:"omitempty,max=20"`
 }

 // UpdateTaskCompletionRequest represents the request to update a task completion
@@ -115,7 +121,6 @@ type UpdateTaskCompletionRequest struct {
 	Notes      *string          `json:"notes" validate:"omitempty,max=10000"`
 	ActualCost *decimal.Decimal `json:"actual_cost"`
 	Rating     *int             `json:"rating" validate:"omitempty,min=1,max=5"`
-	ImageURLs  []string         `json:"image_urls" validate:"omitempty,max=20,dive,max=500"`
 }

 // CompletionImageInput represents an image to add to a completion
@@ -0,0 +1,22 @@
+package requests
+
+// PresignUploadRequest is the body for POST /api/uploads/presign. The client
+// describes what it's about to upload; the server validates against quota,
+// rate limits, and per-category caps before returning a signed upload URL.
+type PresignUploadRequest struct {
+	// Category gates allowed mime types and the size cap. One of:
+	//   "completion"     — task completion photos
+	//   "document_image" — image attached to a Document
+	//   "document_file"  — file (e.g. PDF) attached to a Document
+	Category string `json:"category" validate:"required,oneof=completion document_image document_file"`
+
+	// ContentType is the MIME type the client will upload (e.g. image/jpeg).
+	// Bound to the signed request so the actual upload must match exactly.
+	ContentType string `json:"content_type" validate:"required,min=3,max=127"`
+
+	// ContentLength is the exact byte count the client intends to upload.
+	// NOTE(review): a slack window around this value is described for signed
+	// POST policies, but PresignUploadResponse documents a signed PUT whose
+	// Content-Length header must match exactly — confirm which applies.
+	ContentLength int64 `json:"content_length" validate:"required,min=1"`
+}
@@ -8,8 +8,11 @@ import (

 // ContractorSpecialtyResponse represents a contractor specialty
 type ContractorSpecialtyResponse struct {
-	ID           uint   `json:"id"`
-	Name         string `json:"name"`
+	ID uint `json:"id"`
+	// Name is the stable English identifier (clients match on this).
+	Name string `json:"name"`
+	// DisplayName is the localized label for the request's Accept-Language.
+	DisplayName  string `json:"display_name"`
 	Description  string `json:"description"`
 	Icon         string `json:"icon"`
 	DisplayOrder int    `json:"display_order"`
@@ -10,8 +10,11 @@ import (

 // ResidenceTypeResponse represents a residence type in the API response
 type ResidenceTypeResponse struct {
-	ID   uint   `json:"id"`
+	ID uint `json:"id"`
+	// Name is the stable English identifier (clients match on this).
 	Name string `json:"name"`
+	// DisplayName is the localized label for the request's Accept-Language.
+	DisplayName string `json:"display_name"`
 }

 // ResidenceUserResponse represents a user with access to a residence
@@ -13,8 +13,11 @@ import (

 // TaskCategoryResponse represents a task category
 type TaskCategoryResponse struct {
-	ID           uint   `json:"id"`
-	Name         string `json:"name"`
+	ID uint `json:"id"`
+	// Name is the stable English identifier (clients match on this).
+	Name string `json:"name"`
+	// DisplayName is the localized label for the request's Accept-Language.
+	DisplayName  string `json:"display_name"`
 	Description  string `json:"description"`
 	Icon         string `json:"icon"`
 	Color        string `json:"color"`
@@ -25,6 +28,7 @@ type TaskCategoryResponse struct {
 type TaskPriorityResponse struct {
 	ID           uint   `json:"id"`
 	Name         string `json:"name"`
+	DisplayName  string `json:"display_name"`
 	Level        int    `json:"level"`
 	Color        string `json:"color"`
 	DisplayOrder int    `json:"display_order"`
@@ -34,6 +38,7 @@ type TaskPriorityResponse struct {
 type TaskFrequencyResponse struct {
 	ID           uint   `json:"id"`
 	Name         string `json:"name"`
+	DisplayName  string `json:"display_name"`
 	Days         *int   `json:"days"`
 	DisplayOrder int    `json:"display_order"`
 }
@@ -71,35 +76,35 @@ type TaskCompletionResponse struct {

 // TaskResponse represents a task in the API response
 type TaskResponse struct {
-	ID              uint                     `json:"id"`
-	ResidenceID     uint                     `json:"residence_id"`
-	CreatedByID     uint                     `json:"created_by_id"`
-	CreatedBy       *TaskUserResponse        `json:"created_by,omitempty"`
-	AssignedToID    *uint                    `json:"assigned_to_id"`
-	AssignedTo      *TaskUserResponse        `json:"assigned_to,omitempty"`
-	Title           string                   `json:"title"`
-	Description     string                   `json:"description"`
-	CategoryID      *uint                    `json:"category_id"`
-	Category        *TaskCategoryResponse    `json:"category,omitempty"`
-	PriorityID      *uint                    `json:"priority_id"`
-	Priority        *TaskPriorityResponse    `json:"priority,omitempty"`
-	FrequencyID        *uint                    `json:"frequency_id"`
-	Frequency          *TaskFrequencyResponse   `json:"frequency,omitempty"`
-	CustomIntervalDays *int                     `json:"custom_interval_days"` // For "Custom" frequency, user-specified days
-	InProgress         bool                     `json:"in_progress"`
-	DueDate         *time.Time               `json:"due_date"`
-	NextDueDate     *time.Time               `json:"next_due_date"` // For recurring tasks, updated after each completion
-	EstimatedCost   *decimal.Decimal         `json:"estimated_cost"`
-	ActualCost      *decimal.Decimal         `json:"actual_cost"`
-	ContractorID    *uint                    `json:"contractor_id"`
-	IsCancelled     bool                     `json:"is_cancelled"`
-	IsArchived      bool                     `json:"is_archived"`
-	ParentTaskID    *uint        `json:"parent_task_id"`
-	TemplateID      *uint        `json:"template_id,omitempty"` // Backlink to the TaskTemplate this task was created from
-	CompletionCount int          `json:"completion_count"`
-	KanbanColumn    string       `json:"kanban_column,omitempty"` // Which kanban column this task belongs to
-	CreatedAt       time.Time    `json:"created_at"`
-	UpdatedAt       time.Time    `json:"updated_at"`
+	ID                 uint                   `json:"id"`
+	ResidenceID        uint                   `json:"residence_id"`
+	CreatedByID        uint                   `json:"created_by_id"`
+	CreatedBy          *TaskUserResponse      `json:"created_by,omitempty"`
+	AssignedToID       *uint                  `json:"assigned_to_id"`
+	AssignedTo         *TaskUserResponse      `json:"assigned_to,omitempty"`
+	Title              string                 `json:"title"`
+	Description        string                 `json:"description"`
+	CategoryID         *uint                  `json:"category_id"`
+	Category           *TaskCategoryResponse  `json:"category,omitempty"`
+	PriorityID         *uint                  `json:"priority_id"`
+	Priority           *TaskPriorityResponse  `json:"priority,omitempty"`
+	FrequencyID        *uint                  `json:"frequency_id"`
+	Frequency          *TaskFrequencyResponse `json:"frequency,omitempty"`
+	CustomIntervalDays *int                   `json:"custom_interval_days"` // For "Custom" frequency, user-specified days
+	InProgress         bool                   `json:"in_progress"`
+	DueDate            *time.Time             `json:"due_date"`
+	NextDueDate        *time.Time             `json:"next_due_date"` // For recurring tasks, updated after each completion
+	EstimatedCost      *decimal.Decimal       `json:"estimated_cost"`
+	ActualCost         *decimal.Decimal       `json:"actual_cost"`
+	ContractorID       *uint                  `json:"contractor_id"`
+	IsCancelled        bool                   `json:"is_cancelled"`
+	IsArchived         bool                   `json:"is_archived"`
+	ParentTaskID       *uint                  `json:"parent_task_id"`
+	TemplateID         *uint                  `json:"template_id,omitempty"` // Backlink to the TaskTemplate this task was created from
+	CompletionCount    int                    `json:"completion_count"`
+	KanbanColumn       string                 `json:"kanban_column,omitempty"` // Which kanban column this task belongs to
+	CreatedAt          time.Time              `json:"created_at"`
+	UpdatedAt          time.Time              `json:"updated_at"`
 }

 // BulkCreateTasksResponse is returned by POST /api/tasks/bulk/.
@@ -240,30 +245,30 @@ func NewTaskResponseWithTime(t *models.Task, daysThreshold int, now time.Time) T
 // newTaskResponseInternal is the internal implementation for creating task responses
 func newTaskResponseInternal(t *models.Task, daysThreshold int, now time.Time) TaskResponse {
 	resp := TaskResponse{
-		ID:              t.ID,
-		ResidenceID:     t.ResidenceID,
-		CreatedByID:     t.CreatedByID,
-		Title:           t.Title,
-		Description:     t.Description,
-		CategoryID:      t.CategoryID,
-		PriorityID:      t.PriorityID,
+		ID:                 t.ID,
+		ResidenceID:        t.ResidenceID,
+		CreatedByID:        t.CreatedByID,
+		Title:              t.Title,
+		Description:        t.Description,
+		CategoryID:         t.CategoryID,
+		PriorityID:         t.PriorityID,
 		FrequencyID:        t.FrequencyID,
 		CustomIntervalDays: t.CustomIntervalDays,
 		InProgress:         t.InProgress,
-		AssignedToID:    t.AssignedToID,
-		DueDate:         t.DueDate,
-		NextDueDate:     t.NextDueDate,
-		EstimatedCost:   t.EstimatedCost,
-		ActualCost:      t.ActualCost,
-		ContractorID:    t.ContractorID,
-		IsCancelled:     t.IsCancelled,
-		IsArchived:      t.IsArchived,
-		ParentTaskID:    t.ParentTaskID,
-		TemplateID:      t.TaskTemplateID,
-		CompletionCount: predicates.GetCompletionCount(t),
-		KanbanColumn:    DetermineKanbanColumnWithTime(t, daysThreshold, now),
-		CreatedAt:       t.CreatedAt,
-		UpdatedAt:       t.UpdatedAt,
+		AssignedToID:       t.AssignedToID,
+		DueDate:            t.DueDate,
+		NextDueDate:        t.NextDueDate,
+		EstimatedCost:      t.EstimatedCost,
+		ActualCost:         t.ActualCost,
+		ContractorID:       t.ContractorID,
+		IsCancelled:        t.IsCancelled,
+		IsArchived:         t.IsArchived,
+		ParentTaskID:       t.ParentTaskID,
+		TemplateID:         t.TaskTemplateID,
+		CompletionCount:    predicates.GetCompletionCount(t),
+		KanbanColumn:       DetermineKanbanColumnWithTime(t, daysThreshold, now),
+		CreatedAt:          t.CreatedAt,
+		UpdatedAt:          t.UpdatedAt,
 	}

 	if t.CreatedBy.ID != 0 {
@@ -0,0 +1,38 @@
+package responses
+
+// PresignUploadResponse is what /api/uploads/presign returns to the client.
+//
+// Flow: the client makes one PUT request to URL with the raw object bytes
+// as the body and Headers as the request headers (verbatim — the signature
+// binds them). On success, the client passes ID back via upload_ids[] on
+// POST /api/task-completions/ or POST /api/documents/ to claim and attach
+// the object.
+//
+// We use PUT (not POST) because Backblaze B2's S3-compatible endpoint does
+// not implement the S3 POST Object form upload — it returns HTTP 501 on
+// every request style. PUT works against AWS S3, B2, and MinIO uniformly.
+type PresignUploadResponse struct {
+	// ID is the pending_uploads.id the client passes back via upload_ids[].
+	ID uint `json:"id"`
+
+	// URL is the signed PUT URL. Includes all auth as query parameters.
+	URL string `json:"upload_url"`
+
+	// Method is always "PUT" today. It is emitted explicitly so clients don't
+	// have to hardcode it, which leaves room to offer alternative upload
+	// mechanisms later.
+	Method string `json:"method"`
+
+	// Headers must be sent verbatim on the PUT request. Currently includes
+	// Content-Type and Content-Length; both are signed, and B2 will reject
+	// any PUT whose headers don't match.
+	Headers map[string]string `json:"headers"`
+
+	// Key is the object key chosen by the server. Echoed for client logging
+	// and debugging; the canonical reference is via ID.
+	Key string `json:"key"`
+
+	// ExpiresAt is when the signed URL stops working (presumably an RFC 3339
+	// timestamp — confirm). Clients should re-presign rather than reuse old URLs.
+	ExpiresAt string `json:"expires_at"`
+}
@@ -1,7 +1,6 @@
 package handlers

 import (
-	"errors"
 	"net/http"

 	"github.com/labstack/echo/v4"
@@ -13,20 +12,22 @@ import (
 	"github.com/treytartt/honeydue-api/internal/middleware"
 	"github.com/treytartt/honeydue-api/internal/services"
 	"github.com/treytartt/honeydue-api/internal/validator"
+	"github.com/treytartt/honeydue-api/internal/worker"
 )

-// AuthHandler handles authentication endpoints
+// AuthHandler handles registration, user profile, and account management
+// endpoints. Session lifecycle (login, logout, password reset) is delegated
+// to Ory Kratos; Register only creates the identity there and issues no token.
 type AuthHandler struct {
-	authService       *services.AuthService
-	emailService      *services.EmailService
-	cache             *services.CacheService
-	appleAuthService  *services.AppleAuthService
-	googleAuthService *services.GoogleAuthService
-	storageService    *services.StorageService
-	auditService      *services.AuditService
+	authService    *services.AuthService
+	emailService   *services.EmailService
+	cache          *services.CacheService
+	storageService *services.StorageService
+	auditService   *services.AuditService
+	enqueuer       worker.Enqueuer
 }

-// NewAuthHandler creates a new auth handler
+// NewAuthHandler creates a new auth handler.
 func NewAuthHandler(authService *services.AuthService, emailService *services.EmailService, cache *services.CacheService) *AuthHandler {
 	return &AuthHandler{
 		authService:  authService,
@@ -35,139 +36,108 @@ func NewAuthHandler(authService *services.AuthService, emailService *services.Em
 	}
 }

-// SetAppleAuthService sets the Apple auth service (called after initialization)
-func (h *AuthHandler) SetAppleAuthService(appleAuth *services.AppleAuthService) {
-	h.appleAuthService = appleAuth
-}
-
-// SetGoogleAuthService sets the Google auth service (called after initialization)
-func (h *AuthHandler) SetGoogleAuthService(googleAuth *services.GoogleAuthService) {
-	h.googleAuthService = googleAuth
-}
-
-// SetStorageService sets the storage service for file deletion during account deletion
+// SetStorageService sets the storage service for file deletion during account deletion.
 func (h *AuthHandler) SetStorageService(storageService *services.StorageService) {
 	h.storageService = storageService
 }

-// SetAuditService sets the audit service for logging security events
+// SetAuditService sets the audit service for logging security events.
 func (h *AuthHandler) SetAuditService(auditService *services.AuditService) {
 	h.auditService = auditService
 }

-// Login handles POST /api/auth/login/
-func (h *AuthHandler) Login(c echo.Context) error {
-	var req requests.LoginRequest
-	if err := c.Bind(&req); err != nil {
-		return apperrors.BadRequest("error.invalid_request")
-	}
-	if err := c.Validate(&req); err != nil {
-		return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(err))
-	}
-
-	response, err := h.authService.Login(&req)
-	if err != nil {
-		log.Debug().Err(err).Str("identifier", req.Username).Msg("Login failed")
-		if h.auditService != nil {
-			h.auditService.LogEvent(c, nil, services.AuditEventLoginFailed, map[string]interface{}{
-				"identifier": req.Username,
-			})
-		}
-		return err
-	}
-
-	if h.auditService != nil {
-		userID := response.User.ID
-		h.auditService.LogEvent(c, &userID, services.AuditEventLogin, nil)
-	}
-
-	return c.JSON(http.StatusOK, response)
+// SetEnqueuer sets the async task enqueuer (used by the GDPR data-export endpoint).
+func (h *AuthHandler) SetEnqueuer(enqueuer worker.Enqueuer) {
+	h.enqueuer = enqueuer
 }

-// Register handles POST /api/auth/register/
+// ExportData handles POST /api/auth/export/ — queues a GDPR data-export job that
+// emails the user a zip of all their data. Async (202) because gathering,
+// zipping, and emailing can take seconds; doing it inline would block the request.
+func (h *AuthHandler) ExportData(c echo.Context) error {
+	noStore(c)
+	user, err := middleware.MustGetAuthUser(c)
+	if err != nil {
+		return err
+	}
+	if h.enqueuer == nil {
+		return echo.NewHTTPError(http.StatusServiceUnavailable, "data export is temporarily unavailable")
+	}
+	if err := h.enqueuer.EnqueueDataExport(user.ID); err != nil {
+		log.Error().Err(err).Uint("user_id", user.ID).Msg("Failed to enqueue data export")
+		return echo.NewHTTPError(http.StatusInternalServerError, "failed to queue data export")
+	}
+	if h.auditService != nil {
+		h.auditService.LogEvent(c, &user.ID, services.AuditEventDataExport, map[string]interface{}{
+			"user_id": user.ID,
+			"email":   user.Email,
+		})
+	}
+	return c.JSON(http.StatusAccepted, map[string]string{
+		"message": "Your data export has been queued. You'll receive an email with your data shortly.",
+	})
+}
+
+// noStore marks a response as non-cacheable.
+func noStore(c echo.Context) {
+	c.Response().Header().Set("Cache-Control", "no-store")
+}
+
+// Register handles POST /api/auth/register/ — creates a new password account.
+//
+// The identity is admin-created in Kratos with an unverified email and no
+// auto-sent code (see services.AuthService.Register). The client logs in right
+// after to get a session, then completes email verification. Returns 201 with
+// no token; 409 if the email is taken; 400 on a weak password.
 func (h *AuthHandler) Register(c echo.Context) error {
 	var req requests.RegisterRequest
 	if err := c.Bind(&req); err != nil {
-		return apperrors.BadRequest("error.invalid_request")
+		return apperrors.BadRequest("error.invalid_request_body")
 	}
 	if err := c.Validate(&req); err != nil {
 		return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(err))
 	}
-
-	response, confirmationCode, err := h.authService.Register(&req)
-	if err != nil {
-		log.Debug().Err(err).Msg("Registration failed")
+	if err := h.authService.Register(c.Request().Context(), &req); err != nil {
 		return err
 	}
-
-	if h.auditService != nil {
-		userID := response.User.ID
-		h.auditService.LogEvent(c, &userID, services.AuditEventRegister, map[string]interface{}{
-			"username": req.Username,
-			"email":    req.Email,
-		})
-	}
-
-	// Send welcome email with confirmation code (async)
-	if h.emailService != nil && confirmationCode != "" {
-		go func() {
-			defer func() {
-				if r := recover(); r != nil {
-					log.Error().Interface("panic", r).Str("email", req.Email).Msg("Panic in welcome email goroutine")
-				}
-			}()
-			if err := h.emailService.SendWelcomeEmail(req.Email, req.FirstName, confirmationCode); err != nil {
-				log.Error().Err(err).Str("email", req.Email).Msg("Failed to send welcome email")
-			}
-		}()
-	}
-
-	return c.JSON(http.StatusCreated, response)
-}
-
-// Logout handles POST /api/auth/logout/
-func (h *AuthHandler) Logout(c echo.Context) error {
-	token := middleware.GetAuthToken(c)
-	if token == "" {
-		return apperrors.Unauthorized("error.not_authenticated")
-	}
-
-	// Log audit event before invalidating the token
-	if h.auditService != nil {
-		user := middleware.GetAuthUser(c)
-		if user != nil {
-			h.auditService.LogEvent(c, &user.ID, services.AuditEventLogout, nil)
-		}
-	}
-
-	// Invalidate token in database
-	if err := h.authService.Logout(token); err != nil {
-		log.Warn().Err(err).Msg("Failed to delete token from database")
-	}
-
-	// Invalidate token in cache
-	if h.cache != nil {
-		if err := h.cache.InvalidateAuthToken(c.Request().Context(), token); err != nil {
-			log.Warn().Err(err).Msg("Failed to invalidate token in cache")
-		}
-	}
-
-	return c.JSON(http.StatusOK, responses.MessageResponse{Message: "Logged out successfully"})
+	return c.JSON(http.StatusCreated, map[string]string{
+		"message": "Account created. Please verify your email.",
+	})
 }

 // CurrentUser handles GET /api/auth/me/
 func (h *AuthHandler) CurrentUser(c echo.Context) error {
+	noStore(c)
 	user, err := middleware.MustGetAuthUser(c)
 	if err != nil {
 		return err
 	}

-	response, err := h.authService.GetCurrentUser(user.ID)
+	response, err := h.authService.GetCurrentUser(c.Request().Context(), user.ID)
 	if err != nil {
 		log.Error().Err(err).Uint("user_id", user.ID).Msg("Failed to get current user")
 		return err
 	}

+	// user_profile.verified is a one-time mirror set at provision time
+	// (see middleware/kratos_auth.go::provision). Kratos remains the source
+	// of truth for email-verification state — it can flip from false → true
+	// the instant the user completes the verification flow, and nothing else
+	// updates the local column. Override the response with the live value
+	// the Kratos auth middleware already stashed in context so /auth/me
+	// reflects current reality. When that live value is true and the mirror
+	// disagrees, also sync the DB mirror (best-effort, error ignored) so
+	// background queries that read the column see the same answer.
+	if verified, ok := c.Get(middleware.AuthVerifiedKey).(bool); ok {
+		mirrorStale := response.Profile != nil && response.Profile.Verified != verified
+		if response.Profile != nil {
+			response.Profile.Verified = verified
+		}
+		if verified && mirrorStale {
+			_ = h.authService.MarkUserVerified(c.Request().Context(), user.ID)
+		}
+	}
+
 	return c.JSON(http.StatusOK, response)
 }

@@ -186,7 +156,7 @@ func (h *AuthHandler) UpdateProfile(c echo.Context) error {
 		return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(err))
 	}

-	response, err := h.authService.UpdateProfile(user.ID, &req)
+	response, err := h.authService.UpdateProfile(c.Request().Context(), user.ID, &req)
 	if err != nil {
 		log.Debug().Err(err).Uint("user_id", user.ID).Msg("Failed to update profile")
 		return err
@@ -195,296 +165,6 @@ func (h *AuthHandler) UpdateProfile(c echo.Context) error {
 	return c.JSON(http.StatusOK, response)
 }

-// VerifyEmail handles POST /api/auth/verify-email/
-func (h *AuthHandler) VerifyEmail(c echo.Context) error {
-	user, err := middleware.MustGetAuthUser(c)
-	if err != nil {
-		return err
-	}
-
-	var req requests.VerifyEmailRequest
-	if err := c.Bind(&req); err != nil {
-		return apperrors.BadRequest("error.invalid_request")
-	}
-	if err := c.Validate(&req); err != nil {
-		return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(err))
-	}
-
-	err = h.authService.VerifyEmail(user.ID, req.Code)
-	if err != nil {
-		log.Debug().Err(err).Uint("user_id", user.ID).Msg("Email verification failed")
-		return err
-	}
-
-	// Send post-verification welcome email with tips (async)
-	if h.emailService != nil {
-		go func() {
-			defer func() {
-				if r := recover(); r != nil {
-					log.Error().Interface("panic", r).Str("email", user.Email).Msg("Panic in post-verification email goroutine")
-				}
-			}()
-			if err := h.emailService.SendPostVerificationEmail(user.Email, user.FirstName); err != nil {
-				log.Error().Err(err).Str("email", user.Email).Msg("Failed to send post-verification email")
-			}
-		}()
-	}
-
-	return c.JSON(http.StatusOK, responses.VerifyEmailResponse{
-		Message:  "Email verified successfully",
-		Verified: true,
-	})
-}
-
-// ResendVerification handles POST /api/auth/resend-verification/
-func (h *AuthHandler) ResendVerification(c echo.Context) error {
-	user, err := middleware.MustGetAuthUser(c)
-	if err != nil {
-		return err
-	}
-
-	code, err := h.authService.ResendVerificationCode(user.ID)
-	if err != nil {
-		log.Debug().Err(err).Uint("user_id", user.ID).Msg("Failed to resend verification")
-		return err
-	}
-
-	// Send verification email (async)
-	if h.emailService != nil {
-		go func() {
-			defer func() {
-				if r := recover(); r != nil {
-					log.Error().Interface("panic", r).Str("email", user.Email).Msg("Panic in verification email goroutine")
-				}
-			}()
-			if err := h.emailService.SendVerificationEmail(user.Email, user.FirstName, code); err != nil {
-				log.Error().Err(err).Str("email", user.Email).Msg("Failed to send verification email")
-			}
-		}()
-	}
-
-	return c.JSON(http.StatusOK, responses.MessageResponse{Message: "Verification email sent"})
-}
-
-// ForgotPassword handles POST /api/auth/forgot-password/
-func (h *AuthHandler) ForgotPassword(c echo.Context) error {
-	var req requests.ForgotPasswordRequest
-	if err := c.Bind(&req); err != nil {
-		return apperrors.BadRequest("error.invalid_request")
-	}
-	if err := c.Validate(&req); err != nil {
-		return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(err))
-	}
-
-	code, user, err := h.authService.ForgotPassword(req.Email)
-	if err != nil {
-		var appErr *apperrors.AppError
-		if errors.As(err, &appErr) && appErr.Code == http.StatusTooManyRequests {
-			// Only reveal rate limit errors
-			return err
-		}
-
-		log.Error().Err(err).Str("email", req.Email).Msg("Forgot password failed")
-		// Don't reveal other errors to prevent email enumeration
-	}
-
-	// Send password reset email (async) - only if user found
-	if h.emailService != nil && code != "" && user != nil {
-		go func() {
-			defer func() {
-				if r := recover(); r != nil {
-					log.Error().Interface("panic", r).Str("email", user.Email).Msg("Panic in password reset email goroutine")
-				}
-			}()
-			if err := h.emailService.SendPasswordResetEmail(user.Email, user.FirstName, code); err != nil {
-				log.Error().Err(err).Str("email", user.Email).Msg("Failed to send password reset email")
-			}
-		}()
-	}
-
-	if h.auditService != nil {
-		h.auditService.LogEvent(c, nil, services.AuditEventPasswordReset, map[string]interface{}{
-			"email": req.Email,
-		})
-	}
-
-	// Always return success to prevent email enumeration
-	return c.JSON(http.StatusOK, responses.ForgotPasswordResponse{
-		Message: "Password reset email sent",
-	})
-}
-
-// VerifyResetCode handles POST /api/auth/verify-reset-code/
-func (h *AuthHandler) VerifyResetCode(c echo.Context) error {
-	var req requests.VerifyResetCodeRequest
-	if err := c.Bind(&req); err != nil {
-		return apperrors.BadRequest("error.invalid_request")
-	}
-	if err := c.Validate(&req); err != nil {
-		return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(err))
-	}
-
-	resetToken, err := h.authService.VerifyResetCode(req.Email, req.Code)
-	if err != nil {
-		log.Debug().Err(err).Str("email", req.Email).Msg("Verify reset code failed")
-		return err
-	}
-
-	return c.JSON(http.StatusOK, responses.VerifyResetCodeResponse{
-		Message:    "Reset code verified",
-		ResetToken: resetToken,
-	})
-}
-
-// ResetPassword handles POST /api/auth/reset-password/
-func (h *AuthHandler) ResetPassword(c echo.Context) error {
-	var req requests.ResetPasswordRequest
-	if err := c.Bind(&req); err != nil {
-		return apperrors.BadRequest("error.invalid_request")
-	}
-	if err := c.Validate(&req); err != nil {
-		return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(err))
-	}
-
-	err := h.authService.ResetPassword(req.ResetToken, req.NewPassword)
-	if err != nil {
-		log.Debug().Err(err).Msg("Password reset failed")
-		return err
-	}
-
-	if h.auditService != nil {
-		h.auditService.LogEvent(c, nil, services.AuditEventPasswordChanged, map[string]interface{}{
-			"method": "reset_token",
-		})
-	}
-
-	return c.JSON(http.StatusOK, responses.ResetPasswordResponse{
-		Message: "Password reset successful",
-	})
-}
-
-// AppleSignIn handles POST /api/auth/apple-sign-in/
-func (h *AuthHandler) AppleSignIn(c echo.Context) error {
-	var req requests.AppleSignInRequest
-	if err := c.Bind(&req); err != nil {
-		return apperrors.BadRequest("error.invalid_request")
-	}
-	if err := c.Validate(&req); err != nil {
-		return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(err))
-	}
-
-	if h.appleAuthService == nil {
-		log.Error().Msg("Apple auth service not configured")
-		return &apperrors.AppError{
-			Code:       500,
-			MessageKey: "error.apple_signin_not_configured",
-		}
-	}
-
-	response, err := h.authService.AppleSignIn(c.Request().Context(), h.appleAuthService, &req)
-	if err != nil {
-		// Check for legacy Apple Sign In error (not yet migrated)
-		if errors.Is(err, services.ErrAppleSignInFailed) {
-			log.Debug().Err(err).Msg("Apple Sign In failed (legacy error)")
-			return apperrors.Unauthorized("error.invalid_apple_token")
-		}
-
-		log.Debug().Err(err).Msg("Apple Sign In failed")
-		return err
-	}
-
-	// Send welcome email for new users (async)
-	if response.IsNewUser && h.emailService != nil && response.User.Email != "" {
-		go func() {
-			defer func() {
-				if r := recover(); r != nil {
-					log.Error().Interface("panic", r).Str("email", response.User.Email).Msg("Panic in Apple welcome email goroutine")
-				}
-			}()
-			if err := h.emailService.SendAppleWelcomeEmail(response.User.Email, response.User.FirstName); err != nil {
-				log.Error().Err(err).Str("email", response.User.Email).Msg("Failed to send Apple welcome email")
-			}
-		}()
-	}
-
-	return c.JSON(http.StatusOK, response)
-}
-
-// GoogleSignIn handles POST /api/auth/google-sign-in/
-func (h *AuthHandler) GoogleSignIn(c echo.Context) error {
-	var req requests.GoogleSignInRequest
-	if err := c.Bind(&req); err != nil {
-		return apperrors.BadRequest("error.invalid_request")
-	}
-	if err := c.Validate(&req); err != nil {
-		return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(err))
-	}
-
-	if h.googleAuthService == nil {
-		log.Error().Msg("Google auth service not configured")
-		return &apperrors.AppError{
-			Code:       500,
-			MessageKey: "error.google_signin_not_configured",
-		}
-	}
-
-	response, err := h.authService.GoogleSignIn(c.Request().Context(), h.googleAuthService, &req)
-	if err != nil {
-		// Check for legacy Google Sign In error (not yet migrated)
-		if errors.Is(err, services.ErrGoogleSignInFailed) {
-			log.Debug().Err(err).Msg("Google Sign In failed (legacy error)")
-			return apperrors.Unauthorized("error.invalid_google_token")
-		}
-
-		log.Debug().Err(err).Msg("Google Sign In failed")
-		return err
-	}
-
-	// Send welcome email for new users (async)
-	if response.IsNewUser && h.emailService != nil && response.User.Email != "" {
-		go func() {
-			defer func() {
-				if r := recover(); r != nil {
-					log.Error().Interface("panic", r).Str("email", response.User.Email).Msg("Panic in Google welcome email goroutine")
-				}
-			}()
-			if err := h.emailService.SendGoogleWelcomeEmail(response.User.Email, response.User.FirstName); err != nil {
-				log.Error().Err(err).Str("email", response.User.Email).Msg("Failed to send Google welcome email")
-			}
-		}()
-	}
-
-	return c.JSON(http.StatusOK, response)
-}
-
-// RefreshToken handles POST /api/auth/refresh/
-func (h *AuthHandler) RefreshToken(c echo.Context) error {
-	user, err := middleware.MustGetAuthUser(c)
-	if err != nil {
-		return err
-	}
-
-	token := middleware.GetAuthToken(c)
-	if token == "" {
-		return apperrors.Unauthorized("error.not_authenticated")
-	}
-
-	response, err := h.authService.RefreshToken(token, user.ID)
-	if err != nil {
-		log.Debug().Err(err).Uint("user_id", user.ID).Msg("Token refresh failed")
-		return err
-	}
-
-	// If the token was refreshed (new token), invalidate the old one from cache
-	if response.Token != token && h.cache != nil {
-		if cacheErr := h.cache.InvalidateAuthToken(c.Request().Context(), token); cacheErr != nil {
-			log.Warn().Err(cacheErr).Msg("Failed to invalidate old token from cache during refresh")
-		}
-	}
-
-	return c.JSON(http.StatusOK, response)
-}
-
 // DeleteAccount handles DELETE /api/auth/account/
 func (h *AuthHandler) DeleteAccount(c echo.Context) error {
 	user, err := middleware.MustGetAuthUser(c)
@@ -497,7 +177,7 @@ func (h *AuthHandler) DeleteAccount(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_request")
 	}

-	fileURLs, err := h.authService.DeleteAccount(user.ID, req.Password, req.Confirmation)
+	fileURLs, err := h.authService.DeleteAccount(c.Request().Context(), user.ID, req.Password, req.Confirmation)
 	if err != nil {
 		log.Debug().Err(err).Uint("user_id", user.ID).Msg("Account deletion failed")
 		return err
@@ -527,13 +207,5 @@ func (h *AuthHandler) DeleteAccount(c echo.Context) error {
 		}()
 	}

-	// Invalidate auth token from cache
-	token := middleware.GetAuthToken(c)
-	if h.cache != nil && token != "" {
-		if err := h.cache.InvalidateAuthToken(c.Request().Context(), token); err != nil {
-			log.Warn().Err(err).Msg("Failed to invalidate token in cache after account deletion")
-		}
-	}
-
 	return c.JSON(http.StatusOK, responses.MessageResponse{Message: "Account deleted successfully"})
 }
@@ -35,26 +35,25 @@ func setupDeleteAccountHandler(t *testing.T) (*AuthHandler, *echo.Echo, *gorm.DB
 	return handler, e, db
 }

-func TestAuthHandler_DeleteAccount_EmailUser(t *testing.T) {
+// TestAuthHandler_DeleteAccount_WithConfirmation verifies that DELETE /account/
+// succeeds when the user sends confirmation: "DELETE".
+// Post-Kratos: all users (regardless of provider) must confirm with "DELETE".
+func TestAuthHandler_DeleteAccount_WithConfirmation(t *testing.T) {
 	handler, e, db := setupDeleteAccountHandler(t)

-	user := testutil.CreateTestUser(t, db, "deletetest", "delete@test.com", "Password123")
+	user := testutil.CreateTestUser(t, db, "deletetest", "delete@test.com", "ignored")

 	// Create profile for the user
 	profile := &models.UserProfile{UserID: user.ID, Verified: true}
 	require.NoError(t, db.Create(profile).Error)

-	// Create auth token
-	testutil.CreateTestToken(t, db, user.ID)
-
 	authGroup := e.Group("/api/auth")
 	authGroup.Use(testutil.MockAuthMiddleware(user))
 	authGroup.DELETE("/account/", handler.DeleteAccount)

-	t.Run("successful deletion with correct password", func(t *testing.T) {
-		password := "Password123"
+	t.Run("successful deletion with DELETE confirmation", func(t *testing.T) {
 		req := map[string]interface{}{
-			"password": password,
+			"confirmation": "DELETE",
 		}

 		w := testutil.MakeRequest(e, "DELETE", "/api/auth/account/", req, "test-token")
@@ -74,106 +73,15 @@ func TestAuthHandler_DeleteAccount_EmailUser(t *testing.T) {
 		// Verify profile is deleted
 		db.Model(&models.UserProfile{}).Where("user_id = ?", user.ID).Count(&count)
 		assert.Equal(t, int64(0), count)
-
-		// Verify auth token is deleted
-		db.Model(&models.AuthToken{}).Where("user_id = ?", user.ID).Count(&count)
-		assert.Equal(t, int64(0), count)
 	})
 }

-func TestAuthHandler_DeleteAccount_WrongPassword(t *testing.T) {
+// TestAuthHandler_DeleteAccount_MissingConfirmation verifies that a missing
+// confirmation string is rejected with 400.
+func TestAuthHandler_DeleteAccount_MissingConfirmation(t *testing.T) {
 	handler, e, db := setupDeleteAccountHandler(t)

-	user := testutil.CreateTestUser(t, db, "wrongpw", "wrongpw@test.com", "Password123")
-
-	authGroup := e.Group("/api/auth")
-	authGroup.Use(testutil.MockAuthMiddleware(user))
-	authGroup.DELETE("/account/", handler.DeleteAccount)
-
-	t.Run("wrong password returns 401", func(t *testing.T) {
-		wrongPw := "wrongpassword"
-		req := map[string]interface{}{
-			"password": wrongPw,
-		}
-
-		w := testutil.MakeRequest(e, "DELETE", "/api/auth/account/", req, "test-token")
-
-		testutil.AssertStatusCode(t, w, http.StatusUnauthorized)
-	})
-}
-
-func TestAuthHandler_DeleteAccount_MissingPassword(t *testing.T) {
-	handler, e, db := setupDeleteAccountHandler(t)
-
-	user := testutil.CreateTestUser(t, db, "nopw", "nopw@test.com", "Password123")
-
-	authGroup := e.Group("/api/auth")
-	authGroup.Use(testutil.MockAuthMiddleware(user))
-	authGroup.DELETE("/account/", handler.DeleteAccount)
-
-	t.Run("missing password returns 400", func(t *testing.T) {
-		req := map[string]interface{}{}
-
-		w := testutil.MakeRequest(e, "DELETE", "/api/auth/account/", req, "test-token")
-
-		testutil.AssertStatusCode(t, w, http.StatusBadRequest)
-	})
-}
-
-func TestAuthHandler_DeleteAccount_SocialAuthUser(t *testing.T) {
-	handler, e, db := setupDeleteAccountHandler(t)
-
-	user := testutil.CreateTestUser(t, db, "appleuser", "apple@test.com", "randompassword")
-
-	// Create Apple social auth record
-	appleAuth := &models.AppleSocialAuth{
-		UserID:  user.ID,
-		AppleID: "apple_sub_123",
-		Email:   "apple@test.com",
-	}
-	require.NoError(t, db.Create(appleAuth).Error)
-
-	// Create profile
-	profile := &models.UserProfile{UserID: user.ID, Verified: true}
-	require.NoError(t, db.Create(profile).Error)
-
-	authGroup := e.Group("/api/auth")
-	authGroup.Use(testutil.MockAuthMiddleware(user))
-	authGroup.DELETE("/account/", handler.DeleteAccount)
-
-	t.Run("successful deletion with DELETE confirmation", func(t *testing.T) {
-		confirmation := "DELETE"
-		req := map[string]interface{}{
-			"confirmation": confirmation,
-		}
-
-		w := testutil.MakeRequest(e, "DELETE", "/api/auth/account/", req, "test-token")
-
-		testutil.AssertStatusCode(t, w, http.StatusOK)
-
-		// Verify user is deleted
-		var count int64
-		db.Model(&models.User{}).Where("id = ?", user.ID).Count(&count)
-		assert.Equal(t, int64(0), count)
-
-		// Verify apple auth is deleted
-		db.Model(&models.AppleSocialAuth{}).Where("user_id = ?", user.ID).Count(&count)
-		assert.Equal(t, int64(0), count)
-	})
-}
-
-func TestAuthHandler_DeleteAccount_SocialAuthMissingConfirmation(t *testing.T) {
-	handler, e, db := setupDeleteAccountHandler(t)
-
-	user := testutil.CreateTestUser(t, db, "googleuser", "google@test.com", "randompassword")
-
-	// Create Google social auth record
-	googleAuth := &models.GoogleSocialAuth{
-		UserID:   user.ID,
-		GoogleID: "google_sub_456",
-		Email:    "google@test.com",
-	}
-	require.NoError(t, db.Create(googleAuth).Error)
+	user := testutil.CreateTestUser(t, db, "nopw", "nopw@test.com", "ignored")

 	authGroup := e.Group("/api/auth")
 	authGroup.Use(testutil.MockAuthMiddleware(user))
@@ -188,9 +96,8 @@ func TestAuthHandler_DeleteAccount_SocialAuthMissingConfirmation(t *testing.T) {
 	})

 	t.Run("wrong confirmation returns 400", func(t *testing.T) {
-		wrongConfirmation := "delete"
 		req := map[string]interface{}{
-			"confirmation": wrongConfirmation,
+			"confirmation": "delete", // lowercase — must be exact "DELETE"
 		}

 		w := testutil.MakeRequest(e, "DELETE", "/api/auth/account/", req, "test-token")
@@ -199,6 +106,8 @@ func TestAuthHandler_DeleteAccount_SocialAuthMissingConfirmation(t *testing.T) {
 	})
 }

+// TestAuthHandler_DeleteAccount_Unauthenticated verifies that 401 is returned
+// when no auth middleware is set.
 func TestAuthHandler_DeleteAccount_Unauthenticated(t *testing.T) {
 	handler, e, _ := setupDeleteAccountHandler(t)

@@ -207,7 +116,7 @@ func TestAuthHandler_DeleteAccount_Unauthenticated(t *testing.T) {

 	t.Run("unauthenticated request returns 401", func(t *testing.T) {
 		req := map[string]interface{}{
-			"password": "Password123",
+			"confirmation": "DELETE",
 		}

 		w := testutil.MakeRequest(e, "DELETE", "/api/auth/account/", req, "")
@@ -1,3 +1,7 @@
+// auth_handler_test.go tests the auth handler endpoints that survived the
+// Ory Kratos migration: GET /me/ and PUT/PATCH /profile/.
+// Login, register, logout, forgot-password, and social sign-in are now
+// handled by Kratos.
 package handlers

 import (
@@ -34,204 +38,32 @@ func setupAuthHandler(t *testing.T) (*AuthHandler, *echo.Echo, *repositories.Use
 	return handler, e, userRepo
 }

-func TestAuthHandler_Register(t *testing.T) {
-	handler, e, _ := setupAuthHandler(t)
-
-	e.POST("/api/auth/register/", handler.Register)
-
-	t.Run("successful registration", func(t *testing.T) {
-		req := requests.RegisterRequest{
-			Username:  "newuser",
-			Email:     "new@test.com",
-			Password:  "Password123",
-			FirstName: "New",
-			LastName:  "User",
-		}
-
-		w := testutil.MakeRequest(e, "POST", "/api/auth/register/", req, "")
-
-		testutil.AssertStatusCode(t, w, http.StatusCreated)
-
-		var response map[string]interface{}
-		err := json.Unmarshal(w.Body.Bytes(), &response)
-		require.NoError(t, err)
-
-		testutil.AssertJSONFieldExists(t, response, "token")
-		testutil.AssertJSONFieldExists(t, response, "user")
-		testutil.AssertJSONFieldExists(t, response, "message")
-
-		user := response["user"].(map[string]interface{})
-		assert.Equal(t, "newuser", user["username"])
-		assert.Equal(t, "new@test.com", user["email"])
-		assert.Equal(t, "New", user["first_name"])
-		assert.Equal(t, "User", user["last_name"])
-	})
-
-	t.Run("registration with missing fields", func(t *testing.T) {
-		req := map[string]string{
-			"username": "test",
-			// Missing email and password
-		}
-
-		w := testutil.MakeRequest(e, "POST", "/api/auth/register/", req, "")
-
-		testutil.AssertStatusCode(t, w, http.StatusBadRequest)
-
-		response := testutil.ParseJSON(t, w.Body.Bytes())
-		testutil.AssertJSONFieldExists(t, response, "error")
-	})
-
-	t.Run("registration with short password", func(t *testing.T) {
-		req := requests.RegisterRequest{
-			Username: "testuser",
-			Email:    "test@test.com",
-			Password: "short", // Less than 8 chars
-		}
-
-		w := testutil.MakeRequest(e, "POST", "/api/auth/register/", req, "")
-
-		testutil.AssertStatusCode(t, w, http.StatusBadRequest)
-	})
-
-	t.Run("registration with duplicate username", func(t *testing.T) {
-		// First registration
-		req := requests.RegisterRequest{
-			Username: "duplicate",
-			Email:    "unique1@test.com",
-			Password: "Password123",
-		}
-		w := testutil.MakeRequest(e, "POST", "/api/auth/register/", req, "")
-		testutil.AssertStatusCode(t, w, http.StatusCreated)
-
-		// Try to register again with same username
-		req.Email = "unique2@test.com"
-		w = testutil.MakeRequest(e, "POST", "/api/auth/register/", req, "")
-		testutil.AssertStatusCode(t, w, http.StatusConflict) // 409 for duplicate resource
-
-		response := testutil.ParseJSON(t, w.Body.Bytes())
-		assert.Contains(t, response["error"], "Username already taken")
-	})
-
-	t.Run("registration with duplicate email", func(t *testing.T) {
-		// First registration
-		req := requests.RegisterRequest{
-			Username: "user1",
-			Email:    "duplicate@test.com",
-			Password: "Password123",
-		}
-		w := testutil.MakeRequest(e, "POST", "/api/auth/register/", req, "")
-		testutil.AssertStatusCode(t, w, http.StatusCreated)
-
-		// Try to register again with same email
-		req.Username = "user2"
-		w = testutil.MakeRequest(e, "POST", "/api/auth/register/", req, "")
-		testutil.AssertStatusCode(t, w, http.StatusConflict) // 409 for duplicate resource
-
-		response := testutil.ParseJSON(t, w.Body.Bytes())
-		assert.Contains(t, response["error"], "Email already registered")
-	})
-}
-
-func TestAuthHandler_Login(t *testing.T) {
-	handler, e, _ := setupAuthHandler(t)
-
-	e.POST("/api/auth/register/", handler.Register)
-	e.POST("/api/auth/login/", handler.Login)
-
-	// Create a test user
-	registerReq := requests.RegisterRequest{
-		Username:  "logintest",
-		Email:     "login@test.com",
-		Password:  "Password123",
-		FirstName: "Test",
-		LastName:  "User",
-	}
-	w := testutil.MakeRequest(e, "POST", "/api/auth/register/", registerReq, "")
-	testutil.AssertStatusCode(t, w, http.StatusCreated)
-
-	t.Run("successful login with username", func(t *testing.T) {
-		req := requests.LoginRequest{
-			Username: "logintest",
-			Password: "Password123",
-		}
-
-		w := testutil.MakeRequest(e, "POST", "/api/auth/login/", req, "")
-
-		testutil.AssertStatusCode(t, w, http.StatusOK)
-
-		var response map[string]interface{}
-		err := json.Unmarshal(w.Body.Bytes(), &response)
-		require.NoError(t, err)
-
-		testutil.AssertJSONFieldExists(t, response, "token")
-		testutil.AssertJSONFieldExists(t, response, "user")
-
-		user := response["user"].(map[string]interface{})
-		assert.Equal(t, "logintest", user["username"])
-		assert.Equal(t, "login@test.com", user["email"])
-	})
-
-	t.Run("successful login with email", func(t *testing.T) {
-		req := requests.LoginRequest{
-			Username: "login@test.com", // Using email as username
-			Password: "Password123",
-		}
-
-		w := testutil.MakeRequest(e, "POST", "/api/auth/login/", req, "")
-
-		testutil.AssertStatusCode(t, w, http.StatusOK)
-	})
-
-	t.Run("login with wrong password", func(t *testing.T) {
-		req := requests.LoginRequest{
-			Username: "logintest",
-			Password: "wrongpassword",
-		}
-
-		w := testutil.MakeRequest(e, "POST", "/api/auth/login/", req, "")
-
-		testutil.AssertStatusCode(t, w, http.StatusUnauthorized)
-
-		response := testutil.ParseJSON(t, w.Body.Bytes())
-		assert.Contains(t, response["error"], "Invalid credentials")
-	})
-
-	t.Run("login with non-existent user", func(t *testing.T) {
-		req := requests.LoginRequest{
-			Username: "nonexistent",
-			Password: "Password123",
-		}
-
-		w := testutil.MakeRequest(e, "POST", "/api/auth/login/", req, "")
-
-		testutil.AssertStatusCode(t, w, http.StatusUnauthorized)
-	})
-
-	t.Run("login with missing fields", func(t *testing.T) {
-		req := map[string]string{
-			"username": "logintest",
-			// Missing password
-		}
-
-		w := testutil.MakeRequest(e, "POST", "/api/auth/login/", req, "")
-
-		testutil.AssertStatusCode(t, w, http.StatusBadRequest)
-	})
-}
-
 func TestAuthHandler_CurrentUser(t *testing.T) {
-	handler, e, userRepo := setupAuthHandler(t)
+	handler, e, _ := setupAuthHandler(t)

 	db := testutil.SetupTestDB(t)
-	user := testutil.CreateTestUser(t, db, "metest", "me@test.com", "Password123")
+	user := testutil.CreateTestUser(t, db, "metest", "me@test.com", "")
 	user.FirstName = "Test"
 	user.LastName = "User"
-	userRepo.Update(user)
+	// setupAuthHandler does not return its DB, so create a second DB, user and
+	// handler here; that way the user and the handler share one database.
+	db2 := testutil.SetupTestDB(t)
+	user2 := testutil.CreateTestUser(t, db2, "metest2", "me2@test.com", "")
+	user2.FirstName = "Test"
+	user2.LastName = "User"
+	userRepo2 := repositories.NewUserRepository(db2)
+	require.NoError(t, userRepo2.Update(user2))
+
+	// Build handler against db2
+	cfg := &config.Config{}
+	authService2 := services.NewAuthService(userRepo2, cfg)
+	handler2 := NewAuthHandler(authService2, nil, nil)

-	// Set up route with mock auth middleware
 	authGroup := e.Group("/api/auth")
-	authGroup.Use(testutil.MockAuthMiddleware(user))
-	authGroup.GET("/me/", handler.CurrentUser)
+	authGroup.Use(testutil.MockAuthMiddleware(user2))
+	authGroup.GET("/me/", handler2.CurrentUser)
+
+	_ = handler // avoid unused

 	t.Run("get current user", func(t *testing.T) {
 		w := testutil.MakeRequest(e, "GET", "/api/auth/me/", nil, "test-token")
@@ -242,23 +74,26 @@ func TestAuthHandler_CurrentUser(t *testing.T) {
 		err := json.Unmarshal(w.Body.Bytes(), &response)
 		require.NoError(t, err)

-		assert.Equal(t, "metest", response["username"])
-		assert.Equal(t, "me@test.com", response["email"])
+		assert.Equal(t, "metest2", response["username"])
+		assert.Equal(t, "me2@test.com", response["email"])
 	})
 }

 func TestAuthHandler_UpdateProfile(t *testing.T) {
-	handler, e, userRepo := setupAuthHandler(t)
-
 	db := testutil.SetupTestDB(t)
-	user := testutil.CreateTestUser(t, db, "updatetest", "update@test.com", "Password123")
-	userRepo.Update(user)
+	userRepo := repositories.NewUserRepository(db)
+	cfg := &config.Config{}
+	authService := services.NewAuthService(userRepo, cfg)
+	handler := NewAuthHandler(authService, nil, nil)
+	e := testutil.SetupTestRouter()
+
+	user := testutil.CreateTestUser(t, db, "updatetest", "update@test.com", "")

 	authGroup := e.Group("/api/auth")
 	authGroup.Use(testutil.MockAuthMiddleware(user))
 	authGroup.PUT("/profile/", handler.UpdateProfile)

-	t.Run("update profile", func(t *testing.T) {
+	t.Run("update first and last name", func(t *testing.T) {
 		firstName := "Updated"
 		lastName := "Name"
 		req := requests.UpdateProfileRequest{
@@ -278,130 +113,3 @@ func TestAuthHandler_UpdateProfile(t *testing.T) {
 		assert.Equal(t, "Name", response["last_name"])
 	})
 }
-
-func TestAuthHandler_ForgotPassword(t *testing.T) {
-	handler, e, _ := setupAuthHandler(t)
-
-	e.POST("/api/auth/register/", handler.Register)
-	e.POST("/api/auth/forgot-password/", handler.ForgotPassword)
-
-	// Create a test user
-	registerReq := requests.RegisterRequest{
-		Username: "forgottest",
-		Email:    "forgot@test.com",
-		Password: "Password123",
-	}
-	testutil.MakeRequest(e, "POST", "/api/auth/register/", registerReq, "")
-
-	t.Run("forgot password with valid email", func(t *testing.T) {
-		req := requests.ForgotPasswordRequest{
-			Email: "forgot@test.com",
-		}
-
-		w := testutil.MakeRequest(e, "POST", "/api/auth/forgot-password/", req, "")
-
-		// Always returns 200 to prevent email enumeration
-		testutil.AssertStatusCode(t, w, http.StatusOK)
-
-		response := testutil.ParseJSON(t, w.Body.Bytes())
-		testutil.AssertJSONFieldExists(t, response, "message")
-	})
-
-	t.Run("forgot password with invalid email", func(t *testing.T) {
-		req := requests.ForgotPasswordRequest{
-			Email: "nonexistent@test.com",
-		}
-
-		w := testutil.MakeRequest(e, "POST", "/api/auth/forgot-password/", req, "")
-
-		// Still returns 200 to prevent email enumeration
-		testutil.AssertStatusCode(t, w, http.StatusOK)
-	})
-}
-
-func TestAuthHandler_Logout(t *testing.T) {
-	handler, e, userRepo := setupAuthHandler(t)
-
-	db := testutil.SetupTestDB(t)
-	user := testutil.CreateTestUser(t, db, "logouttest", "logout@test.com", "Password123")
-	userRepo.Update(user)
-
-	authGroup := e.Group("/api/auth")
-	authGroup.Use(testutil.MockAuthMiddleware(user))
-	authGroup.POST("/logout/", handler.Logout)
-
-	t.Run("successful logout", func(t *testing.T) {
-		w := testutil.MakeRequest(e, "POST", "/api/auth/logout/", nil, "test-token")
-
-		testutil.AssertStatusCode(t, w, http.StatusOK)
-
-		response := testutil.ParseJSON(t, w.Body.Bytes())
-		assert.Contains(t, response["message"], "Logged out successfully")
-	})
-}
-
-func TestAuthHandler_JSONResponses(t *testing.T) {
-	handler, e, _ := setupAuthHandler(t)
-
-	e.POST("/api/auth/register/", handler.Register)
-	e.POST("/api/auth/login/", handler.Login)
-
-	t.Run("register response has correct JSON structure", func(t *testing.T) {
-		req := requests.RegisterRequest{
-			Username:  "jsontest",
-			Email:     "json@test.com",
-			Password:  "Password123",
-			FirstName: "JSON",
-			LastName:  "Test",
-		}
-
-		w := testutil.MakeRequest(e, "POST", "/api/auth/register/", req, "")
-
-		testutil.AssertStatusCode(t, w, http.StatusCreated)
-
-		var response map[string]interface{}
-		err := json.Unmarshal(w.Body.Bytes(), &response)
-		require.NoError(t, err)
-
-		// Verify top-level structure
-		assert.Contains(t, response, "token")
-		assert.Contains(t, response, "user")
-		assert.Contains(t, response, "message")
-
-		// Verify token is not empty
-		assert.NotEmpty(t, response["token"])
-
-		// Verify user structure
-		user := response["user"].(map[string]interface{})
-		assert.Contains(t, user, "id")
-		assert.Contains(t, user, "username")
-		assert.Contains(t, user, "email")
-		assert.Contains(t, user, "first_name")
-		assert.Contains(t, user, "last_name")
-		assert.Contains(t, user, "is_active")
-		assert.Contains(t, user, "date_joined")
-
-		// Verify types
-		assert.IsType(t, float64(0), user["id"]) // JSON numbers are float64
-		assert.IsType(t, "", user["username"])
-		assert.IsType(t, "", user["email"])
-		assert.IsType(t, true, user["is_active"])
-	})
-
-	t.Run("error response has correct JSON structure", func(t *testing.T) {
-		req := map[string]string{
-			"username": "test",
-		}
-
-		w := testutil.MakeRequest(e, "POST", "/api/auth/register/", req, "")
-
-		testutil.AssertStatusCode(t, w, http.StatusBadRequest)
-
-		var response map[string]interface{}
-		err := json.Unmarshal(w.Body.Bytes(), &response)
-		require.NoError(t, err)
-
-		assert.Contains(t, response, "error")
-		assert.IsType(t, "", response["error"])
-	})
-}
@@ -30,7 +30,7 @@ func (h *ContractorHandler) ListContractors(c echo.Context) error {
 	if err != nil {
 		return err
 	}
-	response, err := h.contractorService.ListContractors(user.ID)
+	response, err := h.contractorService.ListContractors(c.Request().Context(), user.ID)
 	if err != nil {
 		return apperrors.Internal(err)
 	}
@@ -48,7 +48,7 @@ func (h *ContractorHandler) GetContractor(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_contractor_id")
 	}

-	response, err := h.contractorService.GetContractor(uint(contractorID), user.ID)
+	response, err := h.contractorService.GetContractor(c.Request().Context(), uint(contractorID), user.ID)
 	if err != nil {
 		return err
 	}
@@ -69,7 +69,7 @@ func (h *ContractorHandler) CreateContractor(c echo.Context) error {
 		return err
 	}

-	response, err := h.contractorService.CreateContractor(&req, user.ID)
+	response, err := h.contractorService.CreateContractor(c.Request().Context(), &req, user.ID)
 	if err != nil {
 		return err
 	}
@@ -95,7 +95,7 @@ func (h *ContractorHandler) UpdateContractor(c echo.Context) error {
 		return err
 	}

-	response, err := h.contractorService.UpdateContractor(uint(contractorID), user.ID, &req)
+	response, err := h.contractorService.UpdateContractor(c.Request().Context(), uint(contractorID), user.ID, &req)
 	if err != nil {
 		return err
 	}
@@ -113,7 +113,7 @@ func (h *ContractorHandler) DeleteContractor(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_contractor_id")
 	}

-	err = h.contractorService.DeleteContractor(uint(contractorID), user.ID)
+	err = h.contractorService.DeleteContractor(c.Request().Context(), uint(contractorID), user.ID)
 	if err != nil {
 		return err
 	}
@@ -131,7 +131,7 @@ func (h *ContractorHandler) ToggleFavorite(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_contractor_id")
 	}

-	response, err := h.contractorService.ToggleFavorite(uint(contractorID), user.ID)
+	response, err := h.contractorService.ToggleFavorite(c.Request().Context(), uint(contractorID), user.ID)
 	if err != nil {
 		return err
 	}
@@ -149,7 +149,7 @@ func (h *ContractorHandler) GetContractorTasks(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_contractor_id")
 	}

-	response, err := h.contractorService.GetContractorTasks(uint(contractorID), user.ID)
+	response, err := h.contractorService.GetContractorTasks(c.Request().Context(), uint(contractorID), user.ID)
 	if err != nil {
 		return err
 	}
@@ -167,7 +167,7 @@ func (h *ContractorHandler) ListContractorsByResidence(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_residence_id")
 	}

-	response, err := h.contractorService.ListContractorsByResidence(uint(residenceID), user.ID)
+	response, err := h.contractorService.ListContractorsByResidence(c.Request().Context(), uint(residenceID), user.ID)
 	if err != nil {
 		return err
 	}
@@ -176,7 +176,7 @@ func (h *ContractorHandler) ListContractorsByResidence(c echo.Context) error {

 // GetSpecialties handles GET /api/contractors/specialties/
 func (h *ContractorHandler) GetSpecialties(c echo.Context) error {
-	specialties, err := h.contractorService.GetSpecialties()
+	specialties, err := h.contractorService.GetSpecialties(c.Request().Context())
 	if err != nil {
 		return apperrors.Internal(err)
 	}
@@ -70,7 +70,7 @@ func (h *DocumentHandler) ListDocuments(c echo.Context) error {
 		}
 	}

-	response, err := h.documentService.ListDocuments(user.ID, filter)
+	response, err := h.documentService.ListDocuments(c.Request().Context(), user.ID, filter)
 	if err != nil {
 		return err
 	}
@@ -88,7 +88,7 @@ func (h *DocumentHandler) GetDocument(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_document_id")
 	}

-	response, err := h.documentService.GetDocument(uint(documentID), user.ID)
+	response, err := h.documentService.GetDocument(c.Request().Context(), uint(documentID), user.ID)
 	if err != nil {
 		return err
 	}
@@ -101,7 +101,7 @@ func (h *DocumentHandler) ListWarranties(c echo.Context) error {
 	if err != nil {
 		return err
 	}
-	response, err := h.documentService.ListWarranties(user.ID)
+	response, err := h.documentService.ListWarranties(c.Request().Context(), user.ID)
 	if err != nil {
 		return apperrors.Internal(err)
 	}
@@ -201,7 +201,7 @@ func (h *DocumentHandler) CreateDocument(c echo.Context) error {
 			if h.storageService == nil {
 				return apperrors.Internal(nil)
 			}
-			result, err := h.storageService.Upload(uploadedFile, "documents")
+			result, err := h.storageService.Upload(c.Request().Context(), uploadedFile, "documents")
 			if err != nil {
 				return apperrors.BadRequest("error.failed_to_upload_file")
 			}
@@ -222,7 +222,7 @@ func (h *DocumentHandler) CreateDocument(c echo.Context) error {
 		return err
 	}

-	response, err := h.documentService.CreateDocument(&req, user.ID)
+	response, err := h.documentService.CreateDocument(c.Request().Context(), &req, user.ID)
 	if err != nil {
 		return err
 	}
@@ -248,7 +248,7 @@ func (h *DocumentHandler) UpdateDocument(c echo.Context) error {
 		return err
 	}

-	response, err := h.documentService.UpdateDocument(uint(documentID), user.ID, &req)
+	response, err := h.documentService.UpdateDocument(c.Request().Context(), uint(documentID), user.ID, &req)
 	if err != nil {
 		return err
 	}
@@ -266,7 +266,7 @@ func (h *DocumentHandler) DeleteDocument(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_document_id")
 	}

-	err = h.documentService.DeleteDocument(uint(documentID), user.ID)
+	err = h.documentService.DeleteDocument(c.Request().Context(), uint(documentID), user.ID)
 	if err != nil {
 		return err
 	}
@@ -284,7 +284,7 @@ func (h *DocumentHandler) ActivateDocument(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_document_id")
 	}

-	response, err := h.documentService.ActivateDocument(uint(documentID), user.ID)
+	response, err := h.documentService.ActivateDocument(c.Request().Context(), uint(documentID), user.ID)
 	if err != nil {
 		return err
 	}
@@ -302,7 +302,7 @@ func (h *DocumentHandler) DeactivateDocument(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_document_id")
 	}

-	response, err := h.documentService.DeactivateDocument(uint(documentID), user.ID)
+	response, err := h.documentService.DeactivateDocument(c.Request().Context(), uint(documentID), user.ID)
 	if err != nil {
 		return err
 	}
@@ -342,14 +342,14 @@ func (h *DocumentHandler) UploadDocumentImage(c echo.Context) error {
 		return apperrors.Internal(nil)
 	}

-	result, err := h.storageService.Upload(uploadedFile, "images")
+	result, err := h.storageService.Upload(c.Request().Context(), uploadedFile, "images")
 	if err != nil {
 		return apperrors.BadRequest("error.failed_to_upload_file")
 	}

 	caption := c.FormValue("caption")

-	response, err := h.documentService.UploadDocumentImage(uint(documentID), user.ID, result.URL, caption)
+	response, err := h.documentService.UploadDocumentImage(c.Request().Context(), uint(documentID), user.ID, result.URL, caption)
 	if err != nil {
 		return err
 	}
@@ -372,7 +372,7 @@ func (h *DocumentHandler) DeleteDocumentImage(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_image_id")
 	}

-	response, err := h.documentService.DeleteDocumentImage(uint(documentID), uint(imageID), user.ID)
+	response, err := h.documentService.DeleteDocumentImage(c.Request().Context(), uint(documentID), uint(imageID), user.ID)
 	if err != nil {
 		return err
 	}
@@ -506,232 +506,6 @@ func TestTaskHandler_CreateCompletion_NoTaskID(t *testing.T) {
 	})
 }

-// =============================================================================
-// Auth Handler - Additional Coverage
-// =============================================================================
-
-func TestAuthHandler_AppleSignIn_NotConfigured(t *testing.T) {
-	handler, e, _ := setupAuthHandler(t)
-
-	e.POST("/api/auth/apple-sign-in/", handler.AppleSignIn)
-
-	t.Run("returns 500 when apple auth not configured", func(t *testing.T) {
-		req := map[string]interface{}{
-			"id_token": "fake-token",
-			"user_id":  "fake-user-id",
-		}
-		w := testutil.MakeRequest(e, "POST", "/api/auth/apple-sign-in/", req, "")
-		testutil.AssertStatusCode(t, w, http.StatusInternalServerError)
-	})
-
-	t.Run("missing identity_token returns 400", func(t *testing.T) {
-		req := map[string]interface{}{}
-		w := testutil.MakeRequest(e, "POST", "/api/auth/apple-sign-in/", req, "")
-		testutil.AssertStatusCode(t, w, http.StatusBadRequest)
-	})
-}
-
-func TestAuthHandler_GoogleSignIn_NotConfigured(t *testing.T) {
-	handler, e, _ := setupAuthHandler(t)
-
-	e.POST("/api/auth/google-sign-in/", handler.GoogleSignIn)
-
-	t.Run("returns 500 when google auth not configured", func(t *testing.T) {
-		req := map[string]interface{}{
-			"id_token": "fake-token",
-		}
-		w := testutil.MakeRequest(e, "POST", "/api/auth/google-sign-in/", req, "")
-		testutil.AssertStatusCode(t, w, http.StatusInternalServerError)
-	})
-
-	t.Run("missing id_token returns 400", func(t *testing.T) {
-		req := map[string]interface{}{}
-		w := testutil.MakeRequest(e, "POST", "/api/auth/google-sign-in/", req, "")
-		testutil.AssertStatusCode(t, w, http.StatusBadRequest)
-	})
-}
-
-// setupAuthHandlerWithDB is like setupAuthHandler but also returns the underlying *gorm.DB
-// for tests that need to create records like ConfirmationCode directly.
-func setupAuthHandlerWithDB(t *testing.T) (*AuthHandler, *echo.Echo, *gorm.DB) {
-	db := testutil.SetupTestDB(t)
-	userRepo := repositories.NewUserRepository(db)
-	cfg := &config.Config{
-		Security: config.SecurityConfig{
-			SecretKey:            "test-secret-key",
-			PasswordResetExpiry:  15 * time.Minute,
-			ConfirmationExpiry:   24 * time.Hour,
-			MaxPasswordResetRate: 3,
-		},
-	}
-	authService := services.NewAuthService(userRepo, cfg)
-	handler := NewAuthHandler(authService, nil, nil)
-	e := testutil.SetupTestRouter()
-	return handler, e, db
-}
-
-func TestAuthHandler_VerifyEmail(t *testing.T) {
-	handler, e, db := setupAuthHandlerWithDB(t)
-
-	user := testutil.CreateTestUser(t, db, "verifytest", "verify@test.com", "Password123")
-
-	// Create confirmation code
-	confirmCode := &models.ConfirmationCode{
-		UserID:    user.ID,
-		Code:      "123456",
-		ExpiresAt: time.Now().Add(24 * time.Hour),
-		IsUsed:    false,
-	}
-	require.NoError(t, db.Create(confirmCode).Error)
-
-	authGroup := e.Group("/api/auth")
-	authGroup.Use(testutil.MockAuthMiddleware(user))
-	authGroup.POST("/verify-email/", handler.VerifyEmail)
-
-	t.Run("successful verification", func(t *testing.T) {
-		req := requests.VerifyEmailRequest{
-			Code: "123456",
-		}
-		w := testutil.MakeRequest(e, "POST", "/api/auth/verify-email/", req, "test-token")
-		testutil.AssertStatusCode(t, w, http.StatusOK)
-
-		var response map[string]interface{}
-		err := json.Unmarshal(w.Body.Bytes(), &response)
-		require.NoError(t, err)
-		assert.Equal(t, true, response["verified"])
-	})
-
-	t.Run("wrong code returns error", func(t *testing.T) {
-		req := requests.VerifyEmailRequest{
-			Code: "999999",
-		}
-		w := testutil.MakeRequest(e, "POST", "/api/auth/verify-email/", req, "test-token")
-		// Code already used or wrong code
-		assert.True(t, w.Code == http.StatusBadRequest || w.Code == http.StatusNotFound,
-			"expected 400 or 404, got %d", w.Code)
-	})
-
-	t.Run("missing code returns 400", func(t *testing.T) {
-		req := map[string]interface{}{}
-		w := testutil.MakeRequest(e, "POST", "/api/auth/verify-email/", req, "test-token")
-		testutil.AssertStatusCode(t, w, http.StatusBadRequest)
-	})
-}
-
-func TestAuthHandler_ResendVerification(t *testing.T) {
-	handler, e, db := setupAuthHandlerWithDB(t)
-
-	user := testutil.CreateTestUser(t, db, "resendtest", "resend@test.com", "Password123")
-
-	authGroup := e.Group("/api/auth")
-	authGroup.Use(testutil.MockAuthMiddleware(user))
-	authGroup.POST("/resend-verification/", handler.ResendVerification)
-
-	t.Run("successful resend", func(t *testing.T) {
-		w := testutil.MakeRequest(e, "POST", "/api/auth/resend-verification/", nil, "test-token")
-		testutil.AssertStatusCode(t, w, http.StatusOK)
-
-		var response map[string]interface{}
-		err := json.Unmarshal(w.Body.Bytes(), &response)
-		require.NoError(t, err)
-		assert.Contains(t, response, "message")
-	})
-}
-
-func TestAuthHandler_RefreshToken(t *testing.T) {
-	handler, e, db := setupAuthHandlerWithDB(t)
-
-	user := testutil.CreateTestUser(t, db, "refreshtest", "refresh@test.com", "Password123")
-
-	// Create auth token and use its actual key in the middleware
-	authToken := testutil.CreateTestToken(t, db, user.ID)
-
-	authGroup := e.Group("/api/auth")
-	authGroup.Use(func(next echo.HandlerFunc) echo.HandlerFunc {
-		return func(c echo.Context) error {
-			c.Set("auth_user", user)
-			c.Set("auth_token", authToken.Key)
-			return next(c)
-		}
-	})
-	authGroup.POST("/refresh/", handler.RefreshToken)
-
-	t.Run("successful refresh", func(t *testing.T) {
-		w := testutil.MakeRequest(e, "POST", "/api/auth/refresh/", nil, authToken.Key)
-		testutil.AssertStatusCode(t, w, http.StatusOK)
-
-		var response map[string]interface{}
-		err := json.Unmarshal(w.Body.Bytes(), &response)
-		require.NoError(t, err)
-		assert.Contains(t, response, "token")
-	})
-}
-
-func TestAuthHandler_VerifyResetCode(t *testing.T) {
-	handler, e, _ := setupAuthHandler(t)
-
-	e.POST("/api/auth/register/", handler.Register)
-	e.POST("/api/auth/verify-reset-code/", handler.VerifyResetCode)
-
-	t.Run("invalid code returns error", func(t *testing.T) {
-		req := requests.VerifyResetCodeRequest{
-			Email: "nonexistent@test.com",
-			Code:  "999999",
-		}
-		w := testutil.MakeRequest(e, "POST", "/api/auth/verify-reset-code/", req, "")
-		// Should not be 200 since no valid code exists
-		assert.NotEqual(t, http.StatusOK, w.Code)
-	})
-
-	t.Run("missing fields returns 400", func(t *testing.T) {
-		req := map[string]interface{}{}
-		w := testutil.MakeRequest(e, "POST", "/api/auth/verify-reset-code/", req, "")
-		testutil.AssertStatusCode(t, w, http.StatusBadRequest)
-	})
-}
-
-func TestAuthHandler_ResetPassword(t *testing.T) {
-	handler, e, _ := setupAuthHandler(t)
-
-	e.POST("/api/auth/reset-password/", handler.ResetPassword)
-
-	t.Run("invalid reset token returns error", func(t *testing.T) {
-		req := requests.ResetPasswordRequest{
-			ResetToken:  "invalid-token",
-			NewPassword: "NewPassword123",
-		}
-		w := testutil.MakeRequest(e, "POST", "/api/auth/reset-password/", req, "")
-		assert.NotEqual(t, http.StatusOK, w.Code)
-	})
-
-	t.Run("missing fields returns 400", func(t *testing.T) {
-		req := map[string]interface{}{}
-		w := testutil.MakeRequest(e, "POST", "/api/auth/reset-password/", req, "")
-		testutil.AssertStatusCode(t, w, http.StatusBadRequest)
-	})
-
-	t.Run("short password returns 400", func(t *testing.T) {
-		req := requests.ResetPasswordRequest{
-			ResetToken:  "some-token",
-			NewPassword: "short",
-		}
-		w := testutil.MakeRequest(e, "POST", "/api/auth/reset-password/", req, "")
-		testutil.AssertStatusCode(t, w, http.StatusBadRequest)
-	})
-}
-
-func TestAuthHandler_ForgotPassword_MissingEmail(t *testing.T) {
-	handler, e, _ := setupAuthHandler(t)
-
-	e.POST("/api/auth/forgot-password/", handler.ForgotPassword)
-
-	t.Run("missing email returns 400", func(t *testing.T) {
-		req := map[string]interface{}{}
-		w := testutil.MakeRequest(e, "POST", "/api/auth/forgot-password/", req, "")
-		testutil.AssertStatusCode(t, w, http.StatusBadRequest)
-	})
-}
-
 // =============================================================================
 // Residence Handler - Additional Error Paths
 // =============================================================================
@@ -1781,45 +1555,11 @@ func TestStaticDataHandler_RefreshStaticData(t *testing.T) {
 // =============================================================================
 // Upload Handler - Additional Error Paths
 // =============================================================================
-
-func TestUploadHandler_UploadImage_NoFile(t *testing.T) {
-	storageSvc := newTestStorageService("/var/uploads")
-	handler := NewUploadHandler(storageSvc, nil)
-	e := testutil.SetupTestRouter()
-
-	e.POST("/api/uploads/image", handler.UploadImage)
-
-	t.Run("no file returns 400", func(t *testing.T) {
-		w := testutil.MakeRequest(e, "POST", "/api/uploads/image", nil, "")
-		testutil.AssertStatusCode(t, w, http.StatusBadRequest)
-	})
-}
-
-func TestUploadHandler_UploadDocument_NoFile(t *testing.T) {
-	storageSvc := newTestStorageService("/var/uploads")
-	handler := NewUploadHandler(storageSvc, nil)
-	e := testutil.SetupTestRouter()
-
-	e.POST("/api/uploads/document", handler.UploadDocument)
-
-	t.Run("no file returns 400", func(t *testing.T) {
-		w := testutil.MakeRequest(e, "POST", "/api/uploads/document", nil, "")
-		testutil.AssertStatusCode(t, w, http.StatusBadRequest)
-	})
-}
-
-func TestUploadHandler_UploadCompletion_NoFile(t *testing.T) {
-	storageSvc := newTestStorageService("/var/uploads")
-	handler := NewUploadHandler(storageSvc, nil)
-	e := testutil.SetupTestRouter()
-
-	e.POST("/api/uploads/completion", handler.UploadCompletion)
-
-	t.Run("no file returns 400", func(t *testing.T) {
-		w := testutil.MakeRequest(e, "POST", "/api/uploads/completion", nil, "")
-		testutil.AssertStatusCode(t, w, http.StatusBadRequest)
-	})
-}
+//
+// Multipart upload handlers (UploadImage / UploadDocument / UploadCompletion)
+// were removed alongside the legacy /api/uploads/{image,document,completion}
+// routes. The presigned-URL flow (POST /api/uploads/presign) is exercised by
+// integration tests that hit the full pipeline.

 func TestUploadHandler_DeleteFile_OwnershipDenied(t *testing.T) {
 	storageSvc := newTestStorageService("/var/uploads")
@@ -37,6 +37,23 @@ func NewMediaHandler(
 	}
 }

+// safeContentDisposition returns an inline Content-Disposition value for
+// filename (audit M1). It strips control characters (incl. CR/LF and DEL),
+// '"' and '\' so an attacker-controlled name cannot inject response headers
+// (CWE-113); a name left empty after stripping becomes "download".
+func safeContentDisposition(filename string) string {
+	cleaned := strings.Map(func(r rune) rune {
+		if r < 0x20 || r == 0x7f || r == '"' || r == '\\' {
+			return -1
+		}
+		return r
+	}, filename)
+	if cleaned == "" {
+		cleaned = "download"
+	}
+	return `inline; filename="` + cleaned + `"`
+}
+
 // ServeDocument serves a document file with access control
 // GET /api/media/document/:id
 func (h *MediaHandler) ServeDocument(c echo.Context) error {
@@ -71,7 +88,7 @@ func (h *MediaHandler) ServeDocument(c echo.Context) error {
 	// Set caching and disposition headers
 	c.Response().Header().Set("Cache-Control", "private, max-age=3600")
 	if doc.FileName != "" {
-		c.Response().Header().Set("Content-Disposition", "inline; filename=\""+doc.FileName+"\"")
+		c.Response().Header().Set("Content-Disposition", safeContentDisposition(doc.FileName))
 	}
 	return c.Blob(http.StatusOK, mimeType, data)
 }
@@ -114,7 +131,7 @@ func (h *MediaHandler) ServeDocumentImage(c echo.Context) error {
 	}

 	c.Response().Header().Set("Cache-Control", "private, max-age=3600")
-	c.Response().Header().Set("Content-Disposition", "inline; filename=\""+filepath.Base(img.ImageURL)+"\"")
+	c.Response().Header().Set("Content-Disposition", safeContentDisposition(filepath.Base(img.ImageURL)))
 	return c.Blob(http.StatusOK, mimeType, data)
 }

@@ -162,7 +179,7 @@ func (h *MediaHandler) ServeCompletionImage(c echo.Context) error {
 	}

 	c.Response().Header().Set("Cache-Control", "private, max-age=3600")
-	c.Response().Header().Set("Content-Disposition", "inline; filename=\""+filepath.Base(img.ImageURL)+"\"")
+	c.Response().Header().Set("Content-Disposition", safeContentDisposition(filepath.Base(img.ImageURL)))
 	return c.Blob(http.StatusOK, mimeType, data)
 }

@@ -46,7 +46,7 @@ func (h *NotificationHandler) ListNotifications(c echo.Context) error {
 		}
 	}

-	notifications, err := h.notificationService.GetNotifications(user.ID, limit, offset)
+	notifications, err := h.notificationService.GetNotifications(c.Request().Context(), user.ID, limit, offset)
 	if err != nil {
 		return err
 	}
@@ -64,7 +64,7 @@ func (h *NotificationHandler) GetUnreadCount(c echo.Context) error {
 		return err
 	}

-	count, err := h.notificationService.GetUnreadCount(user.ID)
+	count, err := h.notificationService.GetUnreadCount(c.Request().Context(), user.ID)
 	if err != nil {
 		return err
 	}
@@ -84,7 +84,7 @@ func (h *NotificationHandler) MarkAsRead(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_notification_id")
 	}

-	err = h.notificationService.MarkAsRead(uint(notificationID), user.ID)
+	err = h.notificationService.MarkAsRead(c.Request().Context(), uint(notificationID), user.ID)
 	if err != nil {
 		return err
 	}
@@ -99,7 +99,7 @@ func (h *NotificationHandler) MarkAllAsRead(c echo.Context) error {
 		return err
 	}

-	err = h.notificationService.MarkAllAsRead(user.ID)
+	err = h.notificationService.MarkAllAsRead(c.Request().Context(), user.ID)
 	if err != nil {
 		return err
 	}
@@ -114,7 +114,7 @@ func (h *NotificationHandler) GetPreferences(c echo.Context) error {
 		return err
 	}

-	prefs, err := h.notificationService.GetPreferences(user.ID)
+	prefs, err := h.notificationService.GetPreferences(c.Request().Context(), user.ID)
 	if err != nil {
 		return err
 	}
@@ -137,7 +137,7 @@ func (h *NotificationHandler) UpdatePreferences(c echo.Context) error {
 		return err
 	}

-	prefs, err := h.notificationService.UpdatePreferences(user.ID, &req)
+	prefs, err := h.notificationService.UpdatePreferences(c.Request().Context(), user.ID, &req)
 	if err != nil {
 		return err
 	}
@@ -160,7 +160,7 @@ func (h *NotificationHandler) RegisterDevice(c echo.Context) error {
 		return err
 	}

-	device, err := h.notificationService.RegisterDevice(user.ID, &req)
+	device, err := h.notificationService.RegisterDevice(c.Request().Context(), user.ID, &req)
 	if err != nil {
 		return err
 	}
@@ -175,7 +175,7 @@ func (h *NotificationHandler) ListDevices(c echo.Context) error {
 		return err
 	}

-	devices, err := h.notificationService.ListDevices(user.ID)
+	devices, err := h.notificationService.ListDevices(c.Request().Context(), user.ID)
 	if err != nil {
 		return err
 	}
@@ -208,7 +208,7 @@ func (h *NotificationHandler) UnregisterDevice(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_platform")
 	}

-	err = h.notificationService.UnregisterDevice(req.RegistrationID, req.Platform, user.ID)
+	err = h.notificationService.UnregisterDevice(c.Request().Context(), req.RegistrationID, req.Platform, user.ID)
 	if err != nil {
 		return err
 	}
@@ -236,7 +236,7 @@ func (h *NotificationHandler) DeleteDevice(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_platform")
 	}

-	err = h.notificationService.DeleteDevice(uint(deviceID), platform, user.ID)
+	err = h.notificationService.DeleteDevice(c.Request().Context(), uint(deviceID), platform, user.ID)
 	if err != nil {
 		return err
 	}
@@ -39,7 +39,7 @@ func (h *ResidenceHandler) ListResidences(c echo.Context) error {
 		return err
 	}

-	response, err := h.residenceService.ListResidences(user.ID)
+	response, err := h.residenceService.ListResidences(c.Request().Context(), user.ID)
 	if err != nil {
 		return err
 	}
@@ -55,7 +55,7 @@ func (h *ResidenceHandler) GetMyResidences(c echo.Context) error {
 	}
 	userNow := middleware.GetUserNow(c)

-	response, err := h.residenceService.GetMyResidences(user.ID, userNow)
+	response, err := h.residenceService.GetMyResidences(c.Request().Context(), user.ID, userNow)
 	if err != nil {
 		return err
 	}
@@ -72,7 +72,7 @@ func (h *ResidenceHandler) GetSummary(c echo.Context) error {
 	}
 	userNow := middleware.GetUserNow(c)

-	summary, err := h.residenceService.GetSummary(user.ID, userNow)
+	summary, err := h.residenceService.GetSummary(c.Request().Context(), user.ID, userNow)
 	if err != nil {
 		return err
 	}
@@ -93,7 +93,7 @@ func (h *ResidenceHandler) GetResidence(c echo.Context) error {
 	}

 	userNow := middleware.GetUserNow(c)
-	response, err := h.residenceService.GetResidence(uint(residenceID), user.ID, userNow)
+	response, err := h.residenceService.GetResidence(c.Request().Context(), uint(residenceID), user.ID, userNow)
 	if err != nil {
 		return err
 	}
@@ -116,7 +116,7 @@ func (h *ResidenceHandler) CreateResidence(c echo.Context) error {
 		return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(err))
 	}

-	response, err := h.residenceService.CreateResidence(&req, user.ID)
+	response, err := h.residenceService.CreateResidence(c.Request().Context(), &req, user.ID)
 	if err != nil {
 		return err
 	}
@@ -144,7 +144,7 @@ func (h *ResidenceHandler) UpdateResidence(c echo.Context) error {
 		return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(err))
 	}

-	response, err := h.residenceService.UpdateResidence(uint(residenceID), user.ID, &req)
+	response, err := h.residenceService.UpdateResidence(c.Request().Context(), uint(residenceID), user.ID, &req)
 	if err != nil {
 		return err
 	}
@@ -164,7 +164,7 @@ func (h *ResidenceHandler) DeleteResidence(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_residence_id")
 	}

-	response, err := h.residenceService.DeleteResidence(uint(residenceID), user.ID)
+	response, err := h.residenceService.DeleteResidence(c.Request().Context(), uint(residenceID), user.ID)
 	if err != nil {
 		return err
 	}
@@ -185,7 +185,7 @@ func (h *ResidenceHandler) GetShareCode(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_residence_id")
 	}

-	shareCode, err := h.residenceService.GetShareCode(uint(residenceID), user.ID)
+	shareCode, err := h.residenceService.GetShareCode(c.Request().Context(), uint(residenceID), user.ID)
 	if err != nil {
 		return err
 	}
@@ -213,7 +213,7 @@ func (h *ResidenceHandler) GenerateShareCode(c echo.Context) error {
 	// Request body is optional
 	c.Bind(&req)

-	response, err := h.residenceService.GenerateShareCode(uint(residenceID), user.ID, req.ExpiresInHours)
+	response, err := h.residenceService.GenerateShareCode(c.Request().Context(), uint(residenceID), user.ID, req.ExpiresInHours)
 	if err != nil {
 		return err
 	}
@@ -238,7 +238,7 @@ func (h *ResidenceHandler) GenerateSharePackage(c echo.Context) error {
 	// Request body is optional (for expires_in_hours)
 	c.Bind(&req)

-	response, err := h.residenceService.GenerateSharePackage(uint(residenceID), user.ID, req.ExpiresInHours)
+	response, err := h.residenceService.GenerateSharePackage(c.Request().Context(), uint(residenceID), user.ID, req.ExpiresInHours)
 	if err != nil {
 		return err
 	}
@@ -261,7 +261,7 @@ func (h *ResidenceHandler) JoinWithCode(c echo.Context) error {
 		return err
 	}

-	response, err := h.residenceService.JoinWithCode(req.Code, user.ID)
+	response, err := h.residenceService.JoinWithCode(c.Request().Context(), req.Code, user.ID)
 	if err != nil {
 		return err
 	}
@@ -281,7 +281,7 @@ func (h *ResidenceHandler) GetResidenceUsers(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_residence_id")
 	}

-	users, err := h.residenceService.GetResidenceUsers(uint(residenceID), user.ID)
+	users, err := h.residenceService.GetResidenceUsers(c.Request().Context(), uint(residenceID), user.ID)
 	if err != nil {
 		return err
 	}
@@ -306,7 +306,7 @@ func (h *ResidenceHandler) RemoveResidenceUser(c echo.Context) error {
 		return apperrors.BadRequest("error.invalid_user_id")
 	}

-	err = h.residenceService.RemoveUser(uint(residenceID), uint(userIDToRemove), user.ID)
+	err = h.residenceService.RemoveUser(c.Request().Context(), uint(residenceID), uint(userIDToRemove), user.ID)
 	if err != nil {
 		return err
 	}
@@ -316,7 +316,7 @@ func (h *ResidenceHandler) RemoveResidenceUser(c echo.Context) error {

 // GetResidenceTypes handles GET /api/residences/types/
 func (h *ResidenceHandler) GetResidenceTypes(c echo.Context) error {
-	types, err := h.residenceService.GetResidenceTypes()
+	types, err := h.residenceService.GetResidenceTypes(c.Request().Context())
 	if err != nil {
 		return err
 	}
@@ -348,7 +348,7 @@ func (h *ResidenceHandler) GenerateTasksReport(c echo.Context) error {
 	c.Bind(&req)

 	// Generate the report data
-	report, err := h.residenceService.GenerateTasksReport(uint(residenceID), user.ID)
+	report, err := h.residenceService.GenerateTasksReport(c.Request().Context(), uint(residenceID), user.ID)
 	if err != nil {
 		return err
 	}
@@ -1,6 +1,7 @@
 package handlers

 import (
+	"context"
 	"encoding/json"
 	"fmt"
 	"net/http"
@@ -324,7 +325,7 @@ func TestResidenceHandler_JoinWithCode(t *testing.T) {
 	userRepo := repositories.NewUserRepository(db)
 	cfg := &config.Config{}
 	residenceService := services.NewResidenceService(residenceRepo, userRepo, cfg)
-	shareResp, _ := residenceService.GenerateShareCode(residence.ID, owner.ID, 24)
+	shareResp, _ := residenceService.GenerateShareCode(context.Background(), residence.ID, owner.ID, 24)

 	authGroup := e.Group("/api/residences")
 	authGroup.Use(testutil.MockAuthMiddleware(newUser))
@@ -357,7 +358,7 @@ func TestResidenceHandler_JoinWithCode(t *testing.T) {

 	t.Run("owner tries to join own residence", func(t *testing.T) {
 		// Generate new code
-		shareResp2, _ := residenceService.GenerateShareCode(residence.ID, owner.ID, 24)
+		shareResp2, _ := residenceService.GenerateShareCode(context.Background(), residence.ID, owner.ID, 24)

 		req := requests.JoinWithCodeRequest{
 			Code: shareResp2.ShareCode.Code,
@@ -15,12 +15,15 @@ import (

 // SeededDataResponse represents the unified seeded data response
 type SeededDataResponse struct {
-	ResidenceTypes        interface{}                            `json:"residence_types"`
-	TaskCategories        interface{}                            `json:"task_categories"`
-	TaskPriorities        interface{}                            `json:"task_priorities"`
-	TaskFrequencies       interface{}                            `json:"task_frequencies"`
-	ContractorSpecialties interface{}                            `json:"contractor_specialties"`
-	TaskTemplates         responses.TaskTemplatesGroupedResponse `json:"task_templates"`
+	ResidenceTypes        interface{}                             `json:"residence_types"`
+	TaskCategories        interface{}                             `json:"task_categories"`
+	TaskPriorities        interface{}                             `json:"task_priorities"`
+	TaskFrequencies       interface{}                             `json:"task_frequencies"`
+	ContractorSpecialties interface{}                             `json:"contractor_specialties"`
+	TaskTemplates         responses.TaskTemplatesGroupedResponse  `json:"task_templates"`
+	HomeProfileOptions    map[string][]services.HomeProfileOption `json:"home_profile_options"`
+	DocumentTypes         []services.HomeProfileOption            `json:"document_types"`
+	DocumentCategories    []services.HomeProfileOption            `json:"document_categories"`
 }

 // StaticDataHandler handles static/lookup data endpoints
@@ -54,13 +57,18 @@ func NewStaticDataHandler(
 func (h *StaticDataHandler) GetStaticData(c echo.Context) error {
 	ctx := c.Request().Context()

+	// Lookup display labels and home-profile options are localized for the
+	// request's language, so the cache + ETag are keyed by locale.
+	locale := i18n.GetLocale(c)
+	localizer := i18n.GetLocalizer(c)
+
 	// Check If-None-Match header for conditional request
 	// Strip W/ prefix if present (added by reverse proxy, but we store without it)
 	clientETag := strings.TrimPrefix(c.Request().Header.Get("If-None-Match"), "W/")

 	// Try to get cached ETag first (fast path for 304 responses)
 	if h.cache != nil && clientETag != "" {
-		cachedETag, err := h.cache.GetSeededDataETag(ctx)
+		cachedETag, err := h.cache.GetSeededDataETag(ctx, locale)
 		if err == nil && cachedETag == clientETag {
 			// Client has the latest data, return 304 Not Modified
 			return c.NoContent(http.StatusNotModified)
@@ -70,10 +78,10 @@ func (h *StaticDataHandler) GetStaticData(c echo.Context) error {
 	// Try to get cached seeded data
 	if h.cache != nil {
 		var cachedData SeededDataResponse
-		err := h.cache.GetCachedSeededData(ctx, &cachedData)
+		err := h.cache.GetCachedSeededData(ctx, locale, &cachedData)
 		if err == nil {
 			// Cache hit - get the ETag and return data
-			etag, etagErr := h.cache.GetSeededDataETag(ctx)
+			etag, etagErr := h.cache.GetSeededDataETag(ctx, locale)
 			if etagErr == nil {
 				c.Response().Header().Set("ETag", etag)
 				c.Response().Header().Set("Cache-Control", "private, max-age=3600")
@@ -86,27 +94,27 @@ func (h *StaticDataHandler) GetStaticData(c echo.Context) error {
 	}

 	// Cache miss - fetch all data from services
-	residenceTypes, err := h.residenceService.GetResidenceTypes()
+	residenceTypes, err := h.residenceService.GetResidenceTypes(c.Request().Context())
 	if err != nil {
 		return err
 	}

-	taskCategories, err := h.taskService.GetCategories()
+	taskCategories, err := h.taskService.GetCategories(c.Request().Context())
 	if err != nil {
 		return err
 	}

-	taskPriorities, err := h.taskService.GetPriorities()
+	taskPriorities, err := h.taskService.GetPriorities(c.Request().Context())
 	if err != nil {
 		return err
 	}

-	taskFrequencies, err := h.taskService.GetFrequencies()
+	taskFrequencies, err := h.taskService.GetFrequencies(c.Request().Context())
 	if err != nil {
 		return err
 	}

-	contractorSpecialties, err := h.contractorService.GetSpecialties()
+	contractorSpecialties, err := h.contractorService.GetSpecialties(c.Request().Context())
 	if err != nil {
 		return err
 	}
@@ -116,6 +124,9 @@ func (h *StaticDataHandler) GetStaticData(c echo.Context) error {
 		return err
 	}

+	// Localize the lookup display_name fields in place for this request's locale.
+	services.LocalizeLookups(localizer, residenceTypes, taskCategories, taskPriorities, taskFrequencies, contractorSpecialties)
+
 	// Build response
 	seededData := SeededDataResponse{
 		ResidenceTypes:        residenceTypes,
@@ -124,11 +135,14 @@ func (h *StaticDataHandler) GetStaticData(c echo.Context) error {
 		TaskFrequencies:       taskFrequencies,
 		ContractorSpecialties: contractorSpecialties,
 		TaskTemplates:         taskTemplates,
+		HomeProfileOptions:    services.BuildHomeProfileOptions(localizer),
+		DocumentTypes:         services.BuildDocumentTypes(localizer),
+		DocumentCategories:    services.BuildDocumentCategories(localizer),
 	}

-	// Cache the data and get ETag
+	// Cache the data and get ETag (per-locale)
 	if h.cache != nil {
-		etag, cacheErr := h.cache.CacheSeededData(ctx, seededData)
+		etag, cacheErr := h.cache.CacheSeededData(ctx, locale, seededData)
 		if cacheErr != nil {
 			log.Warn().Err(cacheErr).Msg("Failed to cache seeded data")
 		} else {
@@ -32,7 +32,7 @@ func (h *SubscriptionHandler) GetSubscription(c echo.Context) error {
 		return err
 	}

-	subscription, err := h.subscriptionService.GetSubscription(user.ID)
+	subscription, err := h.subscriptionService.GetSubscription(c.Request().Context(), user.ID)
 	if err != nil {
 		return err
 	}
@@ -47,7 +47,7 @@ func (h *SubscriptionHandler) GetSubscriptionStatus(c echo.Context) error {
 		return err
 	}

-	status, err := h.subscriptionService.GetSubscriptionStatus(user.ID)
+	status, err := h.subscriptionService.GetSubscriptionStatus(c.Request().Context(), user.ID)
 	if err != nil {
 		return err
 	}
@@ -59,7 +59,7 @@ func (h *SubscriptionHandler) GetSubscriptionStatus(c echo.Context) error {
 func (h *SubscriptionHandler) GetUpgradeTrigger(c echo.Context) error {
 	key := c.Param("key")

-	trigger, err := h.subscriptionService.GetUpgradeTrigger(key)
+	trigger, err := h.subscriptionService.GetUpgradeTrigger(c.Request().Context(), key)
 	if err != nil {
 		return err
 	}
@@ -69,7 +69,7 @@ func (h *SubscriptionHandler) GetUpgradeTrigger(c echo.Context) error {

 // GetAllUpgradeTriggers handles GET /api/subscription/upgrade-triggers/
 func (h *SubscriptionHandler) GetAllUpgradeTriggers(c echo.Context) error {
-	triggers, err := h.subscriptionService.GetAllUpgradeTriggers()
+	triggers, err := h.subscriptionService.GetAllUpgradeTriggers(c.Request().Context())
 	if err != nil {
 		return err
 	}
@@ -79,7 +79,7 @@ func (h *SubscriptionHandler) GetAllUpgradeTriggers(c echo.Context) error {

 // GetFeatureBenefits handles GET /api/subscription/features/
 func (h *SubscriptionHandler) GetFeatureBenefits(c echo.Context) error {
-	benefits, err := h.subscriptionService.GetFeatureBenefits()
+	benefits, err := h.subscriptionService.GetFeatureBenefits(c.Request().Context())
 	if err != nil {
 		return err
 	}
@@ -94,7 +94,7 @@ func (h *SubscriptionHandler) GetPromotions(c echo.Context) error {
 		return err
 	}

-	promotions, err := h.subscriptionService.GetActivePromotions(user.ID)
+	promotions, err := h.subscriptionService.GetActivePromotions(c.Request().Context(), user.ID)
 	if err != nil {
 		return err
 	}
@@ -125,12 +125,12 @@ func (h *SubscriptionHandler) ProcessPurchase(c echo.Context) error {
 		if req.TransactionID == "" && req.ReceiptData == "" {
 			return apperrors.BadRequest("error.receipt_data_required")
 		}
-		subscription, err = h.subscriptionService.ProcessApplePurchase(user.ID, req.ReceiptData, req.TransactionID)
+		subscription, err = h.subscriptionService.ProcessApplePurchase(c.Request().Context(), user.ID, req.ReceiptData, req.TransactionID)
 	case "android":
 		if req.PurchaseToken == "" {
 			return apperrors.BadRequest("error.purchase_token_required")
 		}
-		subscription, err = h.subscriptionService.ProcessGooglePurchase(user.ID, req.PurchaseToken, req.ProductID)
+		subscription, err = h.subscriptionService.ProcessGooglePurchase(c.Request().Context(), user.ID, req.PurchaseToken, req.ProductID)
 	default:
 		return apperrors.BadRequest("error.invalid_platform")
 	}
@@ -152,7 +152,7 @@ func (h *SubscriptionHandler) CancelSubscription(c echo.Context) error {
 		return err
 	}

-	subscription, err := h.subscriptionService.CancelSubscription(user.ID)
+	subscription, err := h.subscriptionService.CancelSubscription(c.Request().Context(), user.ID)
 	if err != nil {
 		return err
 	}
@@ -187,12 +187,12 @@ func (h *SubscriptionHandler) RestoreSubscription(c echo.Context) error {
 		if req.ReceiptData == "" && req.TransactionID == "" {
 			return apperrors.BadRequest("error.receipt_data_required")
 		}
-		subscription, err = h.subscriptionService.ProcessApplePurchase(user.ID, req.ReceiptData, req.TransactionID)
+		subscription, err = h.subscriptionService.ProcessApplePurchase(c.Request().Context(), user.ID, req.ReceiptData, req.TransactionID)
 	case "android":
 		if req.PurchaseToken == "" {
 			return apperrors.BadRequest("error.purchase_token_required")
 		}
-		subscription, err = h.subscriptionService.ProcessGooglePurchase(user.ID, req.PurchaseToken, req.ProductID)
+		subscription, err = h.subscriptionService.ProcessGooglePurchase(c.Request().Context(), user.ID, req.PurchaseToken, req.ProductID)
 	default:
 		return apperrors.BadRequest("error.invalid_platform")
 	}
@@ -220,7 +220,7 @@ func (h *SubscriptionHandler) CreateCheckoutSession(c echo.Context) error {
 	}

 	// Check if already Pro from another platform
-	alreadyPro, existingPlatform, err := h.subscriptionService.IsAlreadyProFromOtherPlatform(user.ID, "stripe")
+	alreadyPro, existingPlatform, err := h.subscriptionService.IsAlreadyProFromOtherPlatform(c.Request().Context(), user.ID, "stripe")
 	if err != nil {
 		return err
 	}
@@ -244,7 +244,7 @@ func (h *SubscriptionHandler) CreateCheckoutSession(c echo.Context) error {
 		return err
 	}

-	sessionURL, err := h.stripeService.CreateCheckoutSession(user.ID, req.PriceID, req.SuccessURL, req.CancelURL)
+	sessionURL, err := h.stripeService.CreateCheckoutSession(c.Request().Context(), user.ID, req.PriceID, req.SuccessURL, req.CancelURL)
 	if err != nil {
 		return err
 	}
--- a/Show More
+++ b/Show More