Compare commits
58 Commits
7e77e3bbab
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
| 225fb1306b | |||
| b54493f785 | |||
| 3b2ea9959a | |||
| cf054959bd | |||
| 12de5a230a | |||
| 25897e913e | |||
| 81e454d86d | |||
| 7b87f2e392 | |||
| 6de90acef7 | |||
| 64c656bde1 | |||
| d74cfeee62 | |||
| 52bf1ff3c7 | |||
| e448ec66dc | |||
| 3d3ba84df0 | |||
| 81578f6e27 | |||
| b66151ddd9 | |||
| c845771946 | |||
| 93fddc3769 | |||
| c77ff07ce9 | |||
| 2004f9c5b2 | |||
| 139a990ebc | |||
| 7cc5448a7c | |||
| 5d8559b495 | |||
| 191c9b08e0 | |||
| 4efc87559a | |||
| 1347ffadf5 | |||
| 14026251b7 | |||
| b7f83293b8 | |||
| 29c9014a33 | |||
| 9bee436e86 | |||
| 0798ae8d74 | |||
| ce4d49caef | |||
| cb1dc383b4 | |||
| 8fce568532 | |||
| 289a23f7e6 | |||
| 8d9ca2e6ed | |||
| 0f7450ada9 | |||
| 12b2f9d43b | |||
| d96f317d20 | |||
| 4049b704c3 | |||
| a94744061e | |||
| 30966c6f5e | |||
| b67f7f9e6b | |||
| c9ac273dbd | |||
| 88fb1751c7 | |||
| 9410da7497 | |||
| d9b5f85c3d | |||
| e881d37de0 | |||
| 65a9aae4e5 | |||
| 3f5bf21e09 | |||
| bc3da007db | |||
| 77cfcc0b27 | |||
| d3708e6c72 | |||
| 372d4d2d37 | |||
| df78d9ccd8 | |||
| 1cd6cafa9d | |||
| 57cef36379 | |||
| 9ea058347f |
+11
-1
@@ -28,12 +28,22 @@ EMAIL_HOST_USER=your-email@gmail.com
|
||||
EMAIL_HOST_PASSWORD=your-app-password
|
||||
DEFAULT_FROM_EMAIL=honeyDue <noreply@honeyDue.treytartt.com>
|
||||
|
||||
# Sign in with Apple
|
||||
# APPLE_CLIENT_ID must equal the iOS bundle ID of the build hitting this
|
||||
# backend. The Apple identity-token `aud` claim is checked against it
|
||||
# (see internal/services/apple_auth.go::verifyAudience). With DEBUG=false
|
||||
# an empty value rejects every Apple token.
|
||||
# Release builds: com.myhoneydue.honeyDue
|
||||
# Debug builds: com.myhoneydue.honeyDue.dev
|
||||
APPLE_CLIENT_ID=com.myhoneydue.honeyDue.dev
|
||||
APPLE_TEAM_ID=X86BR9WTLD
|
||||
|
||||
# APNs Settings (iOS Push Notifications)
|
||||
# Direct APNs integration - no external push server needed
|
||||
APNS_AUTH_KEY_PATH=/path/to/AuthKey_XXXXXX.p8
|
||||
APNS_AUTH_KEY_ID=XXXXXXXXXX
|
||||
APNS_TEAM_ID=XXXXXXXXXX
|
||||
APNS_TOPIC=com.tt.honeyDue
|
||||
APNS_TOPIC=com.myhoneydue.honeyDue.dev
|
||||
APNS_PRODUCTION=false # Set to true for production APNs, false for sandbox
|
||||
|
||||
# FCM Settings (Android Push Notifications)
|
||||
|
||||
@@ -8,6 +8,9 @@ bin/
|
||||
/api
|
||||
/worker
|
||||
/admin
|
||||
/admin-reset
|
||||
/notif-diag
|
||||
/send-test-push
|
||||
!admin/
|
||||
*.exe
|
||||
*.exe~
|
||||
@@ -42,3 +45,4 @@ push_certs/
|
||||
|
||||
# Vendor (if not using go modules)
|
||||
# vendor/
|
||||
/migrate
|
||||
|
||||
+19
-3
@@ -1,5 +1,5 @@
|
||||
# Admin panel build stage
|
||||
FROM node:20-alpine AS admin-builder
|
||||
FROM node:20-alpine@sha256:fb4cd12c85ee03686f6af5362a0b0d56d50c58a04632e6c0fb8363f609372293 AS admin-builder
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@@ -49,6 +49,19 @@ RUN CGO_ENABLED=0 GOOS=linux GOARCH=${TARGETARCH} go build -ldflags="-w -s" -o /
|
||||
# Build the worker binary
|
||||
RUN CGO_ENABLED=0 GOOS=linux GOARCH=${TARGETARCH} go build -ldflags="-w -s" -o /app/worker ./cmd/worker
|
||||
|
||||
# Install goose CLI for production migrations. Pinned to a specific version
|
||||
# so an upstream behavioural change can't break a deploy unannounced.
|
||||
# Bumping is a deliberate, reviewable diff. We `go build` rather than
|
||||
# `go install` so the output path is predictable across host platforms —
|
||||
# `go install` with cross-compile env vars drops the binary in
|
||||
# /go/bin/<goos>_<goarch>/, which is awkward to COPY from.
|
||||
RUN cd /tmp && \
|
||||
git clone --depth=1 --branch=v3.22.1 https://github.com/pressly/goose.git goose-src && \
|
||||
cd goose-src && \
|
||||
CGO_ENABLED=0 GOOS=linux GOARCH=${TARGETARCH} \
|
||||
go build -ldflags="-w -s" -o /app/goose ./cmd/goose && \
|
||||
cd / && rm -rf /tmp/goose-src
|
||||
|
||||
# Base runtime stage for Go services
|
||||
FROM alpine:3.19 AS go-base
|
||||
|
||||
@@ -64,6 +77,9 @@ WORKDIR /app
|
||||
# Copy all binaries from builder
|
||||
COPY --from=builder /app/api /app/api
|
||||
COPY --from=builder /app/worker /app/worker
|
||||
# goose is the migration runner — same image is reused as the migrate Job
|
||||
# entrypoint via `command: ["/usr/local/bin/goose", ...]`.
|
||||
COPY --from=builder /app/goose /usr/local/bin/goose
|
||||
|
||||
# Copy templates directory
|
||||
COPY --from=builder /app/templates /app/templates
|
||||
@@ -93,7 +109,7 @@ FROM go-base AS worker
|
||||
CMD ["/app/worker"]
|
||||
|
||||
# Admin panel runtime stage
|
||||
FROM node:20-alpine AS admin
|
||||
FROM node:20-alpine@sha256:fb4cd12c85ee03686f6af5362a0b0d56d50c58a04632e6c0fb8363f609372293 AS admin
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@@ -115,7 +131,7 @@ ENV HOSTNAME="0.0.0.0"
|
||||
CMD ["node", "server.js"]
|
||||
|
||||
# Default production stage (for Dokku - runs API + Admin)
|
||||
FROM node:20-alpine AS production
|
||||
FROM node:20-alpine@sha256:fb4cd12c85ee03686f6af5362a0b0d56d50c58a04632e6c0fb8363f609372293 AS production
|
||||
|
||||
# Install runtime dependencies
|
||||
RUN apk add --no-cache ca-certificates tzdata curl
|
||||
|
||||
@@ -89,15 +89,36 @@ docker-build-prod:
|
||||
docker build --target worker -t $${REGISTRY:-ghcr.io/treytartt}/honeydue-worker:$${TAG:-latest} .
|
||||
docker build --target admin -t $${REGISTRY:-ghcr.io/treytartt}/honeydue-admin:$${TAG:-latest} .
|
||||
|
||||
# Database migrations
|
||||
# Database migrations (goose)
|
||||
#
|
||||
# DATABASE_URL must point at the *direct* (non-pooler) Neon endpoint —
|
||||
# goose's session-scoped advisory lock won't survive PgBouncer transaction
|
||||
# mode. Example:
|
||||
# export DATABASE_URL='host=ep-floral-truth-amttbc5a.c-5.us-east-1.aws.neon.tech \
|
||||
# user=neondb_owner password=... dbname=honeyDue sslmode=require'
|
||||
#
|
||||
# Bootstrap (one-time, when adopting goose against an existing DB):
|
||||
# make migrate-status # creates goose_db_version
|
||||
# psql ... -c "INSERT INTO goose_db_version (version_id, is_applied, tstamp) VALUES (1, true, NOW());"
|
||||
#
|
||||
# Day-to-day:
|
||||
# make migrate-status # show what's pending
|
||||
# make migrate-up # apply pending migrations
|
||||
# make migrate-down # roll back the latest migration
|
||||
# make migrate-new name=add_some_column # scaffold a new SQL migration
|
||||
|
||||
migrate-up:
|
||||
migrate -path migrations -database "$(DATABASE_URL)" up
|
||||
goose -dir migrations postgres "$(DATABASE_URL)" up
|
||||
|
||||
migrate-down:
|
||||
migrate -path migrations -database "$(DATABASE_URL)" down
|
||||
goose -dir migrations postgres "$(DATABASE_URL)" down
|
||||
|
||||
migrate-create:
|
||||
migrate create -ext sql -dir migrations -seq $(name)
|
||||
migrate-status:
|
||||
goose -dir migrations postgres "$(DATABASE_URL)" status
|
||||
|
||||
migrate-new:
|
||||
@if [ -z "$(name)" ]; then echo "usage: make migrate-new name=<short_name>"; exit 1; fi
|
||||
goose -dir migrations create $(name) sql
|
||||
|
||||
# Encrypt existing uploads at rest (run after setting STORAGE_ENCRYPTION_KEY)
|
||||
migrate-encrypt:
|
||||
|
||||
@@ -184,6 +184,15 @@ needed for local dev. For the complete production env var reference
|
||||
|
||||
Leave all four `B2_*` empty in dev to fall back to a local `/app/uploads` volume.
|
||||
|
||||
**Upload architecture (since `b7f8329`)**: Image and document uploads go
|
||||
**directly from the client to B2** via a presigned POST policy issued by
|
||||
`POST /api/uploads/presign`. Bytes never traverse the api server. B2
|
||||
enforces a 10 MB per-object cap at the protocol level. The worker reaps
|
||||
orphaned upload sessions hourly via the `maintenance:upload_cleanup`
|
||||
cron. See [`docs/deployment/09-storage.md`](./docs/deployment/09-storage.md)
|
||||
for the full flow, and [`docs/deployment/14-deployment-process.md`](./docs/deployment/14-deployment-process.md#one-time-b2-bucket-lifecycle-manual)
|
||||
for the one-time bucket lifecycle setup.
|
||||
|
||||
### Worker schedules (UTC hours)
|
||||
|
||||
| Variable | Description | Default |
|
||||
@@ -349,7 +358,11 @@ All protected endpoints require an `Authorization: Token <token>` header.
|
||||
|
||||
Production runs on a **3-node K3s HA cluster** on Hetzner Cloud, fronted
|
||||
by Cloudflare, with Neon Postgres, Backblaze B2, and a self-hosted Gitea
|
||||
container registry. See the full deployment book for every detail:
|
||||
container registry. Live observability (VictoriaMetrics + Jaeger +
|
||||
Grafana) runs on a separate Linode VPS at
|
||||
[`grafana.88oakapps.com`](https://grafana.88oakapps.com) and is fed by a
|
||||
`vmagent` sidecar in-cluster. See the full deployment book for every
|
||||
detail:
|
||||
|
||||
**→ [docs/deployment/](./docs/deployment/README.md) — The Deployment Book**
|
||||
|
||||
@@ -371,7 +384,10 @@ Quick links:
|
||||
|
||||
- **Runbook** — [docs/deployment/17-runbook.md](./docs/deployment/17-runbook.md) — 22 common ops procedures
|
||||
- **kubectl cheat sheet** — [docs/deployment/appendices/b-commands.md](./docs/deployment/appendices/b-commands.md)
|
||||
- **Deploy process** — [docs/deployment/14-deployment-process.md](./docs/deployment/14-deployment-process.md) — build → push → rollout
|
||||
- **Deploy process** — [docs/deployment/14-deployment-process.md](./docs/deployment/14-deployment-process.md) — `bash deploy-k3s/scripts/03-deploy.sh` builds → pushes → rolls out
|
||||
- **Observability** — [docs/deployment/15-observability.md](./docs/deployment/15-observability.md) — VictoriaMetrics + Jaeger + Grafana on `obs.88oakapps.com`
|
||||
- **Observability plan** — [docs/observability-plan.md](./docs/observability-plan.md) — design doc and rollout phases
|
||||
- **Database / pool tuning** — [docs/deployment/08-database.md](./docs/deployment/08-database.md) — Neon pooler endpoint, GORM pool, warm-up, RTT budget
|
||||
- **Failure modes** — [docs/deployment/16-failure-modes.md](./docs/deployment/16-failure-modes.md) — what happens when X dies
|
||||
- **Swarm postmortem** — [docs/deployment/19-postmortem-swarm.md](./docs/deployment/19-postmortem-swarm.md) — why we migrated
|
||||
|
||||
|
||||
@@ -0,0 +1,257 @@
|
||||
// admin-reset is a one-off CLI for resetting an admin_users row's password.
|
||||
//
|
||||
// It reads DB connection settings from environment variables (the same names
|
||||
// the API uses), looks up the admin user by email, prompts for a new password
|
||||
// twice (no echo), bcrypts it, and updates the row. Safe to keep in the repo
|
||||
// — running it requires DB credentials.
|
||||
//
|
||||
// Usage:
|
||||
//
|
||||
// # load env (host, user, db, sslmode) and password from secrets file
|
||||
// set -a && source deploy/prod.env && set +a
|
||||
// go run ./cmd/admin-reset
|
||||
//
|
||||
// # or with a non-default secrets path / different admin
|
||||
// go run ./cmd/admin-reset --password-file path/to/postgres_password.txt
|
||||
// go run ./cmd/admin-reset --email someone@example.com
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/rs/zerolog"
|
||||
"github.com/rs/zerolog/log"
|
||||
"golang.org/x/crypto/bcrypt"
|
||||
"golang.org/x/term"
|
||||
"gorm.io/driver/postgres"
|
||||
"gorm.io/gorm"
|
||||
"gorm.io/gorm/logger"
|
||||
|
||||
"github.com/treytartt/honeydue-api/internal/models"
|
||||
)
|
||||
|
||||
const minPasswordLen = 12
|
||||
|
||||
func main() {
|
||||
email := flag.String("email", "admin@myhoneydue.com", "Admin email to reset")
|
||||
passwordFile := flag.String("password-file", "deploy/secrets/postgres_password.txt",
|
||||
"Path to file containing POSTGRES_PASSWORD (used if env var is empty)")
|
||||
list := flag.Bool("list", false, "List all rows in admin_users and exit (no changes)")
|
||||
verify := flag.Bool("verify", false, "Prompt for a password and check it against the stored hash; no changes")
|
||||
newEmail := flag.String("new-email", "", "If set: rename the matched admin's email to this value and exit (no password change)")
|
||||
flag.Parse()
|
||||
|
||||
log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr, TimeFormat: time.RFC3339})
|
||||
|
||||
dsn, host, err := buildDSN(*passwordFile)
|
||||
if err != nil {
|
||||
log.Fatal().Err(err).Msg("failed to build database DSN")
|
||||
}
|
||||
|
||||
db, err := gorm.Open(postgres.Open(dsn), &gorm.Config{
|
||||
Logger: logger.Default.LogMode(logger.Silent),
|
||||
})
|
||||
if err != nil {
|
||||
log.Fatal().Err(err).Msg("failed to connect to database")
|
||||
}
|
||||
|
||||
if *list {
|
||||
var admins []models.AdminUser
|
||||
if err := db.Order("id").Find(&admins).Error; err != nil {
|
||||
log.Fatal().Err(err).Msg("failed to list admin users")
|
||||
}
|
||||
fmt.Fprintf(os.Stderr, "DB host: %s\n%d admin user(s):\n\n", host, len(admins))
|
||||
fmt.Fprintf(os.Stderr, "%-4s %-40s %-12s %-6s %s\n", "ID", "EMAIL", "ROLE", "ACTIVE", "LAST_LOGIN")
|
||||
for _, a := range admins {
|
||||
last := "-"
|
||||
if a.LastLogin != nil {
|
||||
last = a.LastLogin.Format(time.RFC3339)
|
||||
}
|
||||
fmt.Fprintf(os.Stderr, "%-4d %-40s %-12s %-6t %s\n", a.ID, a.Email, a.Role, a.IsActive, last)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Mirror the live API's case-insensitive lookup so --verify reflects what
|
||||
// /api/admin/auth/login actually does. The reset path uses the same query
|
||||
// for consistency.
|
||||
var admin models.AdminUser
|
||||
if err := db.Where("LOWER(email) = LOWER(?)", *email).First(&admin).Error; err != nil {
|
||||
if errors.Is(err, gorm.ErrRecordNotFound) {
|
||||
log.Fatal().Str("email", *email).Msg("admin user not found (try --list to see existing rows)")
|
||||
}
|
||||
log.Fatal().Err(err).Msg("failed to look up admin user")
|
||||
}
|
||||
|
||||
if *newEmail != "" {
|
||||
target := strings.TrimSpace(*newEmail)
|
||||
if target == "" || !strings.Contains(target, "@") {
|
||||
log.Fatal().Str("new_email", *newEmail).Msg("--new-email must be a valid email address")
|
||||
}
|
||||
if strings.EqualFold(target, admin.Email) {
|
||||
fmt.Fprintf(os.Stderr, "No change — current email already matches %q\n", target)
|
||||
return
|
||||
}
|
||||
// Catch the unique-index conflict early with a clear message instead of a Postgres error.
|
||||
var collisionCount int64
|
||||
if err := db.Model(&models.AdminUser{}).
|
||||
Where("LOWER(email) = LOWER(?) AND id <> ?", target, admin.ID).
|
||||
Count(&collisionCount).Error; err != nil {
|
||||
log.Fatal().Err(err).Msg("failed to check for email collision")
|
||||
}
|
||||
if collisionCount > 0 {
|
||||
log.Fatal().Str("new_email", target).Msg("another admin row already uses this email — aborting")
|
||||
}
|
||||
|
||||
fmt.Fprintf(os.Stderr, "Renaming admin email: %s → %s (id=%d)\n", admin.Email, target, admin.ID)
|
||||
fmt.Fprintf(os.Stderr, "DB host: %s\n\n", host)
|
||||
res := db.Model(&models.AdminUser{}).
|
||||
Where("id = ?", admin.ID).
|
||||
Updates(map[string]any{
|
||||
"email": target,
|
||||
"updated_at": time.Now().UTC(),
|
||||
})
|
||||
if res.Error != nil {
|
||||
log.Fatal().Err(res.Error).Msg("failed to rename admin email")
|
||||
}
|
||||
if res.RowsAffected != 1 {
|
||||
log.Fatal().Int64("rows", res.RowsAffected).Msg("expected exactly 1 row updated")
|
||||
}
|
||||
fmt.Fprintf(os.Stderr, "OK — email is now %s\n", target)
|
||||
return
|
||||
}
|
||||
|
||||
if *verify {
|
||||
fmt.Fprintf(os.Stderr, "Verifying password for: %s (id=%d, role=%s, active=%t)\n",
|
||||
admin.Email, admin.ID, admin.Role, admin.IsActive)
|
||||
fmt.Fprintf(os.Stderr, "DB host: %s\n\n", host)
|
||||
|
||||
pw, err := readPassword("Password: ")
|
||||
if err != nil {
|
||||
log.Fatal().Err(err).Msg("failed to read password")
|
||||
}
|
||||
if admin.CheckPassword(pw) {
|
||||
fmt.Fprintln(os.Stderr, "PASS — bcrypt hash matches the supplied password")
|
||||
if !admin.IsActive {
|
||||
fmt.Fprintln(os.Stderr, "WARNING: is_active = false — login will still be rejected with \"Account is disabled\"")
|
||||
}
|
||||
} else {
|
||||
fmt.Fprintln(os.Stderr, "FAIL — bcrypt hash does NOT match the supplied password")
|
||||
os.Exit(1)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
fmt.Fprintf(os.Stderr, "Resetting password for: %s (id=%d, role=%s, active=%t)\n",
|
||||
admin.Email, admin.ID, admin.Role, admin.IsActive)
|
||||
fmt.Fprintf(os.Stderr, "DB host: %s\n\n", host)
|
||||
|
||||
pw1, err := readPassword("New password: ")
|
||||
if err != nil {
|
||||
log.Fatal().Err(err).Msg("failed to read password")
|
||||
}
|
||||
if len(pw1) < minPasswordLen {
|
||||
log.Fatal().Int("min", minPasswordLen).Msg("password too short")
|
||||
}
|
||||
|
||||
pw2, err := readPassword("Confirm password: ")
|
||||
if err != nil {
|
||||
log.Fatal().Err(err).Msg("failed to read password")
|
||||
}
|
||||
if pw1 != pw2 {
|
||||
log.Fatal().Msg("passwords do not match")
|
||||
}
|
||||
|
||||
hash, err := bcrypt.GenerateFromPassword([]byte(pw1), bcrypt.DefaultCost)
|
||||
if err != nil {
|
||||
log.Fatal().Err(err).Msg("failed to hash password")
|
||||
}
|
||||
|
||||
res := db.Model(&models.AdminUser{}).
|
||||
Where("id = ?", admin.ID).
|
||||
Updates(map[string]any{
|
||||
"password": string(hash),
|
||||
"updated_at": time.Now().UTC(),
|
||||
})
|
||||
if res.Error != nil {
|
||||
log.Fatal().Err(res.Error).Msg("failed to update admin user")
|
||||
}
|
||||
if res.RowsAffected != 1 {
|
||||
log.Fatal().Int64("rows", res.RowsAffected).Msg("expected exactly 1 row updated")
|
||||
}
|
||||
|
||||
fmt.Fprintf(os.Stderr, "\nOK — password reset for %s\n", admin.Email)
|
||||
}
|
||||
|
||||
// buildDSN assembles a libpq keyword/value connection string from the
// environment.
//
// It reads DB_HOST, POSTGRES_USER, POSTGRES_DB, DB_SSLMODE (default
// "require"), DB_PORT (default 5432) and POSTGRES_PASSWORD. When
// POSTGRES_PASSWORD is empty and passwordFile is non-empty, the password is
// read from that file with trailing CR/LF stripped.
//
// It returns the DSN together with the bare host so callers can show which
// database they are about to touch. An error is returned when DB_PORT is not
// an integer, when the password file cannot be read, or when any required
// setting is still empty.
func buildDSN(passwordFile string) (dsn, host string, err error) {
	host = os.Getenv("DB_HOST")
	user := os.Getenv("POSTGRES_USER")
	dbname := os.Getenv("POSTGRES_DB")
	sslmode := os.Getenv("DB_SSLMODE")
	if sslmode == "" {
		sslmode = "require"
	}

	port := 5432
	if s := os.Getenv("DB_PORT"); s != "" {
		p, perr := strconv.Atoi(s)
		if perr != nil {
			return "", "", fmt.Errorf("invalid DB_PORT %q: %w", s, perr)
		}
		port = p
	}

	password := os.Getenv("POSTGRES_PASSWORD")
	if password == "" && passwordFile != "" {
		b, rerr := os.ReadFile(passwordFile)
		if rerr != nil {
			return "", "", fmt.Errorf("POSTGRES_PASSWORD not set and could not read %s: %w", passwordFile, rerr)
		}
		password = strings.TrimRight(string(b), "\r\n")
	}

	// Report every missing setting at once, in a fixed order.
	var missing []string
	for _, req := range []struct{ name, value string }{
		{"DB_HOST", host},
		{"POSTGRES_USER", user},
		{"POSTGRES_DB", dbname},
		{"POSTGRES_PASSWORD", password},
	} {
		if req.value == "" {
			missing = append(missing, req.name)
		}
	}
	if len(missing) > 0 {
		return "", "", fmt.Errorf("missing required env vars: %s", strings.Join(missing, ", "))
	}

	// Values are quoted only when they need it, so ordinary settings produce
	// the same DSN as before while passwords containing spaces, quotes or
	// backslashes no longer corrupt the connection string.
	dsn = fmt.Sprintf("host=%s port=%d user=%s password=%s dbname=%s sslmode=%s",
		quoteDSNValue(host), port, quoteDSNValue(user), quoteDSNValue(password),
		quoteDSNValue(dbname), quoteDSNValue(sslmode))
	return dsn, host, nil
}

// quoteDSNValue renders v for use in a keyword/value connection string. Values
// that are empty or contain whitespace, a single quote or a backslash are
// wrapped in single quotes with `\` and `'` backslash-escaped; anything else is
// returned unchanged.
func quoteDSNValue(v string) string {
	if v != "" && !strings.ContainsAny(v, " \t\r\n'\\") {
		return v
	}
	escaped := strings.NewReplacer(`\`, `\\`, `'`, `\'`).Replace(v)
	return "'" + escaped + "'"
}
|
||||
|
||||
func readPassword(prompt string) (string, error) {
|
||||
fmt.Fprint(os.Stderr, prompt)
|
||||
if term.IsTerminal(int(os.Stdin.Fd())) {
|
||||
b, err := term.ReadPassword(int(os.Stdin.Fd()))
|
||||
fmt.Fprintln(os.Stderr)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return strings.TrimRight(string(b), "\r\n"), nil
|
||||
}
|
||||
s, err := bufio.NewReader(os.Stdin).ReadString('\n')
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return strings.TrimRight(s, "\r\n"), nil
|
||||
}
|
||||
+74
-5
@@ -9,6 +9,7 @@ import (
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/hibiken/asynq"
|
||||
"github.com/rs/zerolog/log"
|
||||
"gorm.io/gorm"
|
||||
|
||||
@@ -19,6 +20,8 @@ import (
|
||||
"github.com/treytartt/honeydue-api/internal/push"
|
||||
"github.com/treytartt/honeydue-api/internal/router"
|
||||
"github.com/treytartt/honeydue-api/internal/services"
|
||||
"github.com/treytartt/honeydue-api/internal/tracing"
|
||||
"github.com/treytartt/honeydue-api/internal/worker"
|
||||
"github.com/treytartt/honeydue-api/pkg/utils"
|
||||
)
|
||||
|
||||
@@ -50,6 +53,29 @@ func main() {
|
||||
Str("redis_url", config.MaskURLCredentials(cfg.Redis.URL)).
|
||||
Msg("Starting HoneyDue API server")
|
||||
|
||||
// Initialize OpenTelemetry tracing — exports to obs.88oakapps.com
|
||||
// (Jaeger via OTLP/HTTP) when OBS_TRACES_URL is set; otherwise installs
|
||||
// a no-op tracer so call sites can use otel.Tracer() unconditionally.
|
||||
// config.SecretValue (not os.Getenv) so file-mounted secrets resolve
|
||||
// after audit F8 removed these from the process environment.
|
||||
tracingShutdown, err := tracing.Init(context.Background(), tracing.Config{
|
||||
ServiceName: "honeydue-api",
|
||||
Environment: deploymentEnvironment(cfg.Server.Debug),
|
||||
EndpointURL: config.SecretValue("OBS_TRACES_URL"),
|
||||
BearerToken: config.SecretValue("OBS_INGEST_TOKEN"),
|
||||
SampleRatio: tracing.SampleRatioFromEnv(),
|
||||
})
|
||||
if err != nil {
|
||||
log.Error().Err(err).Msg("tracing init failed — continuing without traces")
|
||||
}
|
||||
defer func() {
|
||||
shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
if err := tracingShutdown(shutdownCtx); err != nil {
|
||||
log.Warn().Err(err).Msg("tracing shutdown error")
|
||||
}
|
||||
}()
|
||||
|
||||
// Connect to database (retry with backoff)
|
||||
var db *gorm.DB
|
||||
var dbErr error
|
||||
@@ -65,11 +91,14 @@ func main() {
|
||||
log.Error().Err(dbErr).Msg("Failed to connect to database - API will start but database operations will fail")
|
||||
} else {
|
||||
defer database.Close()
|
||||
// Run database migrations only if connected.
|
||||
// MigrateWithLock serialises parallel replica starts via a Postgres
|
||||
// advisory lock so concurrent AutoMigrate calls don't race on DDL.
|
||||
if err := database.MigrateWithLock(); err != nil {
|
||||
log.Error().Err(err).Msg("Failed to run database migrations")
|
||||
// Migrations are managed out-of-band by goose (see
|
||||
// cmd/migrate and deploy-k3s/manifests/migrate/job.yaml) so the api
|
||||
// no longer runs AutoMigrate at startup. Instead we verify the
|
||||
// schema is at the expected version and refuse to start if not —
|
||||
// this catches the "operator forgot to run migrate" footgun loudly,
|
||||
// at boot, instead of with mysterious runtime errors.
|
||||
if err := database.RequireSchemaApplied(); err != nil {
|
||||
log.Fatal().Err(err).Msg("Schema precondition failed — run `kubectl -n honeydue create job --from=cronjob/honeydue-migrate` (or `make migrate-up` locally) and retry")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -167,6 +196,28 @@ func main() {
|
||||
Msg("Push notification client initialized")
|
||||
}
|
||||
|
||||
// Initialize Asynq enqueuer (api-side). Used by services that move
|
||||
// long-running work off the request path (currently: task-completion
|
||||
// notification fan-out). Same Redis as cmd/worker — file-mounted password
|
||||
// applied separately because cfg.Redis.URL does not embed it (audit HIGH-1).
|
||||
var taskEnqueuer *worker.TaskClient
|
||||
if redisOpt, parseErr := asynq.ParseRedisURI(cfg.Redis.URL); parseErr != nil {
|
||||
log.Warn().Err(parseErr).Msg("Failed to parse Redis URL for Asynq enqueuer — completion notifications will run inline")
|
||||
} else if clientOpt, ok := redisOpt.(asynq.RedisClientOpt); ok {
|
||||
if cfg.Redis.Password != "" {
|
||||
clientOpt.Password = cfg.Redis.Password
|
||||
}
|
||||
taskEnqueuer = worker.NewTaskClient(clientOpt)
|
||||
defer func() {
|
||||
if cerr := taskEnqueuer.Close(); cerr != nil {
|
||||
log.Warn().Err(cerr).Msg("Failed to close Asynq enqueuer on shutdown")
|
||||
}
|
||||
}()
|
||||
log.Info().Msg("Asynq enqueuer initialized")
|
||||
} else {
|
||||
log.Warn().Msg("Redis opt is not RedisClientOpt — Asynq enqueuer skipped; completion notifications will run inline")
|
||||
}
|
||||
|
||||
// Setup router with dependencies (includes admin panel at /admin)
|
||||
deps := &router.Dependencies{
|
||||
DB: db,
|
||||
@@ -178,6 +229,12 @@ func main() {
|
||||
StorageService: storageService,
|
||||
MonitoringService: monitoringService,
|
||||
}
|
||||
// Only assign the enqueuer when we actually constructed one. Assigning a
|
||||
// nil *worker.TaskClient directly would create a typed-nil interface that
|
||||
// fails the `if deps.TaskEnqueuer != nil` check in router.SetupRouter.
|
||||
if taskEnqueuer != nil {
|
||||
deps.TaskEnqueuer = taskEnqueuer
|
||||
}
|
||||
e := router.SetupRouter(deps)
|
||||
|
||||
// Create HTTP server
|
||||
@@ -217,3 +274,15 @@ func main() {
|
||||
|
||||
log.Info().Msg("Server exited")
|
||||
}
|
||||
|
||||
// deploymentEnvironment picks the environment label that spans are tagged
// with. A non-empty DEPLOYMENT_ENVIRONMENT variable always wins; otherwise the
// Debug flag selects "dev" (true) or "prod" (false).
func deploymentEnvironment(debug bool) string {
	override := os.Getenv("DEPLOYMENT_ENVIRONMENT")
	switch {
	case override != "":
		return override
	case debug:
		return "dev"
	default:
		return "prod"
	}
}
|
||||
|
||||
@@ -0,0 +1,32 @@
|
||||
package main
|
||||
|
||||
import "time"
|
||||
|
||||
// shouldInitEmail reports whether both the email host and the email user are
// non-empty.
func shouldInitEmail(host, user string) bool {
	if host == "" || user == "" {
		return false
	}
	return true
}
|
||||
|
||||
// shouldInitStorage reports whether an upload directory has been supplied,
// i.e. uploadDir is not the empty string.
func shouldInitStorage(uploadDir string) bool {
	return len(uploadDir) > 0
}
|
||||
|
||||
// shouldInitEncryption reports whether an encryption key has been supplied,
// i.e. encryptionKey is not the empty string.
func shouldInitEncryption(encryptionKey string) bool {
	return len(encryptionKey) != 0
}
|
||||
|
||||
// connectWithRetry calls connect up to maxRetries times, pausing between
// failed attempts with a linearly growing delay (1ms after the first failure,
// 2ms after the second, and so on). The millisecond scale keeps tests fast.
//
// It returns nil as soon as an attempt succeeds, otherwise the error from the
// final attempt. When maxRetries is zero or negative, connect is never called
// and nil is returned.
func connectWithRetry(connect func() error, maxRetries int) error {
	var err error
	for attempt := 1; attempt <= maxRetries; attempt++ {
		if err = connect(); err == nil {
			return nil
		}
		// Nothing follows the last attempt, so sleeping there would only
		// delay reporting the failure.
		if attempt < maxRetries {
			time.Sleep(time.Duration(attempt) * time.Millisecond)
		}
	}
	return err
}
|
||||
@@ -0,0 +1,107 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// --- shouldInitEmail ---
|
||||
|
||||
func TestShouldInitEmail_BothSet_True(t *testing.T) {
|
||||
if !shouldInitEmail("smtp.example.com", "user@example.com") {
|
||||
t.Error("expected true when both set")
|
||||
}
|
||||
}
|
||||
|
||||
func TestShouldInitEmail_MissingHost_False(t *testing.T) {
|
||||
if shouldInitEmail("", "user@example.com") {
|
||||
t.Error("expected false when host empty")
|
||||
}
|
||||
}
|
||||
|
||||
func TestShouldInitEmail_MissingUser_False(t *testing.T) {
|
||||
if shouldInitEmail("smtp.example.com", "") {
|
||||
t.Error("expected false when user empty")
|
||||
}
|
||||
}
|
||||
|
||||
func TestShouldInitEmail_BothEmpty_False(t *testing.T) {
|
||||
if shouldInitEmail("", "") {
|
||||
t.Error("expected false when both empty")
|
||||
}
|
||||
}
|
||||
|
||||
// --- shouldInitStorage ---
|
||||
|
||||
func TestShouldInitStorage_Set_True(t *testing.T) {
|
||||
if !shouldInitStorage("/uploads") {
|
||||
t.Error("expected true")
|
||||
}
|
||||
}
|
||||
|
||||
func TestShouldInitStorage_Empty_False(t *testing.T) {
|
||||
if shouldInitStorage("") {
|
||||
t.Error("expected false")
|
||||
}
|
||||
}
|
||||
|
||||
// --- shouldInitEncryption ---
|
||||
|
||||
func TestShouldInitEncryption_Set_True(t *testing.T) {
|
||||
if !shouldInitEncryption("secret-key-123") {
|
||||
t.Error("expected true")
|
||||
}
|
||||
}
|
||||
|
||||
func TestShouldInitEncryption_Empty_False(t *testing.T) {
|
||||
if shouldInitEncryption("") {
|
||||
t.Error("expected false")
|
||||
}
|
||||
}
|
||||
|
||||
// --- connectWithRetry ---
|
||||
|
||||
func TestConnectWithRetry_SucceedsFirst_NoRetry(t *testing.T) {
|
||||
calls := 0
|
||||
err := connectWithRetry(func() error {
|
||||
calls++
|
||||
return nil
|
||||
}, 3)
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error: %v", err)
|
||||
}
|
||||
if calls != 1 {
|
||||
t.Errorf("calls = %d, want 1", calls)
|
||||
}
|
||||
}
|
||||
|
||||
func TestConnectWithRetry_SucceedsSecond_OneRetry(t *testing.T) {
|
||||
calls := 0
|
||||
err := connectWithRetry(func() error {
|
||||
calls++
|
||||
if calls == 1 {
|
||||
return errors.New("fail")
|
||||
}
|
||||
return nil
|
||||
}, 3)
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error: %v", err)
|
||||
}
|
||||
if calls != 2 {
|
||||
t.Errorf("calls = %d, want 2", calls)
|
||||
}
|
||||
}
|
||||
|
||||
func TestConnectWithRetry_AllFail_ReturnsError(t *testing.T) {
|
||||
calls := 0
|
||||
err := connectWithRetry(func() error {
|
||||
calls++
|
||||
return errors.New("fail")
|
||||
}, 3)
|
||||
if err == nil {
|
||||
t.Error("expected error")
|
||||
}
|
||||
if calls != 3 {
|
||||
t.Errorf("calls = %d, want 3", calls)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,333 @@
|
||||
// notif-diag is a CLI for inspecting and (optionally) cleaning up stuck
|
||||
// notification rows. Default mode is read-only — runs SELECTs and prints a
|
||||
// summary. With --mark-failed-as-sent, marks pending rows that already have a
|
||||
// recorded error as sent (cosmetic — no retry, no resend).
|
||||
//
|
||||
// Usage:
|
||||
//
|
||||
// set -a && source deploy/prod.env && set +a
|
||||
// go run ./cmd/notif-diag # diagnose
|
||||
// go run ./cmd/notif-diag --mark-failed-as-sent --yes # clean up errored backlog
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/rs/zerolog"
|
||||
"github.com/rs/zerolog/log"
|
||||
"gorm.io/driver/postgres"
|
||||
"gorm.io/gorm"
|
||||
"gorm.io/gorm/logger"
|
||||
)
|
||||
|
||||
func main() {
|
||||
passwordFile := stringFlag("password-file", "deploy/secrets/postgres_password.txt",
|
||||
"Path to file containing POSTGRES_PASSWORD (used if env var is empty)")
|
||||
markFailed := boolFlag("mark-failed-as-sent",
|
||||
"Mark every pending row with a non-empty error_message as sent. Cosmetic only — does not retry the push.")
|
||||
yes := boolFlag("yes", "Skip the interactive confirmation prompt for destructive actions.")
|
||||
|
||||
log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr, TimeFormat: time.RFC3339})
|
||||
|
||||
dsn, host, err := buildDSN(*passwordFile)
|
||||
if err != nil {
|
||||
log.Fatal().Err(err).Msg("failed to build database DSN")
|
||||
}
|
||||
|
||||
db, err := gorm.Open(postgres.Open(dsn), &gorm.Config{
|
||||
Logger: logger.Default.LogMode(logger.Silent),
|
||||
})
|
||||
if err != nil {
|
||||
log.Fatal().Err(err).Msg("failed to connect to database")
|
||||
}
|
||||
|
||||
fmt.Printf("DB host: %s\n", host)
|
||||
fmt.Println(strings.Repeat("=", 80))
|
||||
|
||||
overallTotals(db)
|
||||
pendingByType(db)
|
||||
recentPending(db)
|
||||
deviceCounts(db)
|
||||
|
||||
if *markFailed {
|
||||
markFailedAsSent(db, *yes)
|
||||
}
|
||||
}
|
||||
|
||||
// markFailedAsSent updates pending rows whose error_message is non-empty,
|
||||
// flipping them to sent=true with sent_at=updated_at. This is purely cosmetic:
|
||||
// it removes them from the "pending" count so dashboards and the diag tool
|
||||
// don't keep flagging an old, unfixable backlog. It does NOT re-send anything.
|
||||
func markFailedAsSent(db *gorm.DB, skipPrompt bool) {
|
||||
var candidate int64
|
||||
if err := db.Raw(`
|
||||
SELECT COUNT(*) FROM notifications_notification
|
||||
WHERE sent = false AND error_message IS NOT NULL AND error_message <> ''
|
||||
`).Scan(&candidate).Error; err != nil {
|
||||
log.Fatal().Err(err).Msg("failed to count cleanup candidates")
|
||||
}
|
||||
|
||||
fmt.Printf("\n# Cleanup candidate count: %d\n", candidate)
|
||||
if candidate == 0 {
|
||||
fmt.Println(" (nothing to clean up)")
|
||||
return
|
||||
}
|
||||
fmt.Println(" These rows have a recorded send error and will never be retried.")
|
||||
fmt.Println(" Marking them sent=true is cosmetic — it just prevents them from")
|
||||
fmt.Println(" showing up as pending in admin dashboards going forward.")
|
||||
|
||||
if !skipPrompt {
|
||||
fmt.Printf("\nProceed? Type 'yes' to update %d rows: ", candidate)
|
||||
s, err := bufio.NewReader(os.Stdin).ReadString('\n')
|
||||
if err != nil {
|
||||
log.Fatal().Err(err).Msg("failed to read confirmation")
|
||||
}
|
||||
if strings.TrimSpace(s) != "yes" {
|
||||
fmt.Println("Aborted.")
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
res := db.Exec(`
|
||||
UPDATE notifications_notification
|
||||
SET sent = true, sent_at = COALESCE(updated_at, NOW())
|
||||
WHERE sent = false AND error_message IS NOT NULL AND error_message <> ''
|
||||
`)
|
||||
if res.Error != nil {
|
||||
log.Fatal().Err(res.Error).Msg("failed to update rows")
|
||||
}
|
||||
fmt.Printf("OK — updated %d rows.\n", res.RowsAffected)
|
||||
}
|
||||
|
||||
// overallTotals shows the high-level sent/pending/read split.
|
||||
func overallTotals(db *gorm.DB) {
|
||||
type row struct {
|
||||
Total int64
|
||||
Sent int64
|
||||
Pending int64
|
||||
Read int64
|
||||
Errored int64
|
||||
}
|
||||
var r row
|
||||
db.Raw(`
|
||||
SELECT
|
||||
COUNT(*) AS total,
|
||||
COUNT(*) FILTER (WHERE sent = true) AS sent,
|
||||
COUNT(*) FILTER (WHERE sent = false) AS pending,
|
||||
COUNT(*) FILTER (WHERE read = true) AS read,
|
||||
COUNT(*) FILTER (WHERE error_message IS NOT NULL AND error_message <> '') AS errored
|
||||
FROM notifications_notification
|
||||
`).Scan(&r)
|
||||
|
||||
fmt.Println("\n# Overall notification counts")
|
||||
fmt.Printf(" total: %d\n", r.Total)
|
||||
fmt.Printf(" sent: %d\n", r.Sent)
|
||||
fmt.Printf(" pending: %d\n", r.Pending)
|
||||
fmt.Printf(" read: %d\n", r.Read)
|
||||
fmt.Printf(" errored: %d (rows with non-empty error_message)\n", r.Errored)
|
||||
}
|
||||
|
||||
// pendingByType breaks the pending rows down by type and age.
|
||||
func pendingByType(db *gorm.DB) {
|
||||
type row struct {
|
||||
NotificationType string
|
||||
PendingCount int64
|
||||
Oldest *time.Time
|
||||
Newest *time.Time
|
||||
WithErrors int64
|
||||
Last24h int64
|
||||
Last7d int64
|
||||
}
|
||||
var rows []row
|
||||
db.Raw(`
|
||||
SELECT
|
||||
notification_type,
|
||||
COUNT(*) AS pending_count,
|
||||
MIN(created_at) AS oldest,
|
||||
MAX(created_at) AS newest,
|
||||
COUNT(*) FILTER (WHERE error_message IS NOT NULL AND error_message <> '') AS with_errors,
|
||||
COUNT(*) FILTER (WHERE created_at > NOW() - INTERVAL '24 hours') AS last_24h,
|
||||
COUNT(*) FILTER (WHERE created_at > NOW() - INTERVAL '7 days') AS last_7d
|
||||
FROM notifications_notification
|
||||
WHERE sent = false
|
||||
GROUP BY notification_type
|
||||
ORDER BY MAX(created_at) DESC NULLS LAST
|
||||
`).Scan(&rows)
|
||||
|
||||
fmt.Println("\n# Pending rows by type")
|
||||
if len(rows) == 0 {
|
||||
fmt.Println(" (no pending notifications)")
|
||||
return
|
||||
}
|
||||
fmt.Printf(" %-22s %7s %7s %7s %7s %-19s %-19s\n",
|
||||
"TYPE", "PENDING", "ERRORED", "LAST24H", "LAST7D", "OLDEST", "NEWEST")
|
||||
for _, r := range rows {
|
||||
fmt.Printf(" %-22s %7d %7d %7d %7d %-19s %-19s\n",
|
||||
r.NotificationType, r.PendingCount, r.WithErrors, r.Last24h, r.Last7d,
|
||||
fmtTime(r.Oldest), fmtTime(r.Newest))
|
||||
}
|
||||
}
|
||||
|
||||
// recentPending shows the 5 most recent pending rows with full detail.
|
||||
func recentPending(db *gorm.DB) {
|
||||
type row struct {
|
||||
ID uint
|
||||
UserID uint
|
||||
NotificationType string
|
||||
Title string
|
||||
Body string
|
||||
ErrorMessage string
|
||||
CreatedAt time.Time
|
||||
}
|
||||
var rows []row
|
||||
db.Raw(`
|
||||
SELECT id, user_id, notification_type, title, body, COALESCE(error_message, '') AS error_message, created_at
|
||||
FROM notifications_notification
|
||||
WHERE sent = false
|
||||
ORDER BY created_at DESC
|
||||
LIMIT 5
|
||||
`).Scan(&rows)
|
||||
|
||||
fmt.Println("\n# 5 most recent pending notifications")
|
||||
if len(rows) == 0 {
|
||||
fmt.Println(" (none)")
|
||||
return
|
||||
}
|
||||
for _, r := range rows {
|
||||
errPart := ""
|
||||
if r.ErrorMessage != "" {
|
||||
errPart = fmt.Sprintf("\n error: %s", r.ErrorMessage)
|
||||
}
|
||||
fmt.Printf(" [%d] user=%d %s %s%s\n title: %s\n body: %s\n",
|
||||
r.ID, r.UserID, r.CreatedAt.Format("2006-01-02 15:04:05"), r.NotificationType, errPart,
|
||||
truncate(r.Title, 100), truncate(r.Body, 100))
|
||||
}
|
||||
}
|
||||
|
||||
// deviceCounts shows how many push devices are registered (active vs inactive).
|
||||
func deviceCounts(db *gorm.DB) {
|
||||
type row struct {
|
||||
Total int64
|
||||
Active int64
|
||||
WithUser int64
|
||||
DistinctUsers int64
|
||||
}
|
||||
|
||||
fmt.Println("\n# Registered push devices")
|
||||
for _, t := range []struct {
|
||||
label string
|
||||
table string
|
||||
}{
|
||||
{"APNs (iOS)", "push_notifications_apnsdevice"},
|
||||
{"GCM (Android)", "push_notifications_gcmdevice"},
|
||||
} {
|
||||
var r row
|
||||
err := db.Raw(fmt.Sprintf(`
|
||||
SELECT
|
||||
COUNT(*) AS total,
|
||||
COUNT(*) FILTER (WHERE active = true) AS active,
|
||||
COUNT(*) FILTER (WHERE user_id IS NOT NULL) AS with_user,
|
||||
COUNT(DISTINCT user_id) AS distinct_users
|
||||
FROM %s
|
||||
`, t.table)).Scan(&r).Error
|
||||
if err != nil {
|
||||
fmt.Printf(" %-15s ERROR: %v\n", t.label, err)
|
||||
continue
|
||||
}
|
||||
fmt.Printf(" %-15s total=%-5d active=%-5d with_user=%-5d distinct_users=%d\n",
|
||||
t.label, r.Total, r.Active, r.WithUser, r.DistinctUsers)
|
||||
}
|
||||
}
|
||||
|
||||
// buildDSN assembles a keyword/value Postgres connection string from the
// environment and returns it together with the host (for display).
//
// Inputs:
//   - DB_HOST, POSTGRES_USER, POSTGRES_DB: required.
//   - DB_PORT: optional, defaults to 5432; must parse as an integer.
//   - DB_SSLMODE: optional, defaults to "require".
//   - POSTGRES_PASSWORD: if empty and passwordFile is non-empty, the password is
//     read from that file with trailing CR/LF stripped.
//
// An error is returned for an unparsable DB_PORT, an unreadable password file,
// or any required value that ends up empty. Values containing whitespace,
// single quotes or backslashes are quoted and escaped so that, for example, a
// password with a space cannot corrupt the connection string.
func buildDSN(passwordFile string) (dsn, host string, err error) {
	host = os.Getenv("DB_HOST")
	user := os.Getenv("POSTGRES_USER")
	dbname := os.Getenv("POSTGRES_DB")
	sslmode := os.Getenv("DB_SSLMODE")
	if sslmode == "" {
		sslmode = "require"
	}

	port := 5432
	if s := os.Getenv("DB_PORT"); s != "" {
		p, perr := strconv.Atoi(s)
		if perr != nil {
			return "", "", fmt.Errorf("invalid DB_PORT %q: %w", s, perr)
		}
		port = p
	}

	password := os.Getenv("POSTGRES_PASSWORD")
	if password == "" && passwordFile != "" {
		b, rerr := os.ReadFile(passwordFile)
		if rerr != nil {
			return "", "", fmt.Errorf("POSTGRES_PASSWORD not set and could not read %s: %w", passwordFile, rerr)
		}
		// Only strip the line ending; other whitespace may be part of the secret.
		password = strings.TrimRight(string(b), "\r\n")
	}

	var missing []string
	if host == "" {
		missing = append(missing, "DB_HOST")
	}
	if user == "" {
		missing = append(missing, "POSTGRES_USER")
	}
	if dbname == "" {
		missing = append(missing, "POSTGRES_DB")
	}
	if password == "" {
		missing = append(missing, "POSTGRES_PASSWORD")
	}
	if len(missing) > 0 {
		return "", "", fmt.Errorf("missing required env vars: %s", strings.Join(missing, ", "))
	}

	dsn = fmt.Sprintf("host=%s port=%d user=%s password=%s dbname=%s sslmode=%s",
		quoteDSNValue(host), port, quoteDSNValue(user), quoteDSNValue(password),
		quoteDSNValue(dbname), quoteDSNValue(sslmode))
	return dsn, host, nil
}

// quoteDSNValue renders v as a keyword/value connection-string value. Plain
// values are returned untouched; empty values and values containing
// whitespace, a single quote or a backslash are wrapped in single quotes with
// backslash and quote characters backslash-escaped.
func quoteDSNValue(v string) string {
	if v != "" && !strings.ContainsAny(v, " \t\r\n'\\") {
		return v
	}
	escaper := strings.NewReplacer(`\`, `\\`, `'`, `\'`)
	return "'" + escaper.Replace(v) + "'"
}
|
||||
|
||||
// stringFlag is a minimal replacement for flag.String. It scans os.Args for
// arguments of the exact form --name=value and returns a pointer to the value
// of the last such argument, or to def when none is present. The usage text is
// accepted for call-site readability only and is not used.
func stringFlag(name, def, _usage string) *string {
	marker := "--" + name + "="
	value := def
	args := os.Args[1:]
	for i := range args {
		if !strings.HasPrefix(args[i], marker) {
			continue
		}
		// Later occurrences override earlier ones.
		value = args[i][len(marker):]
	}
	return &value
}
|
||||
|
||||
// boolFlag reports whether the bare switch --name appears anywhere in os.Args.
// Only the exact valueless form matches; --name=true does not. The usage text
// is accepted for call-site readability only and is not used.
func boolFlag(name, _usage string) *bool {
	found := false
	for _, arg := range os.Args[1:] {
		found = found || arg == "--"+name
	}
	return &found
}
|
||||
|
||||
// fmtTime renders t as "YYYY-MM-DD HH:MM:SS", or "-" when t is nil (for
// example an aggregate over zero rows).
func fmtTime(t *time.Time) string {
	const layout = "2006-01-02 15:04:05"
	if t != nil {
		return t.Format(layout)
	}
	return "-"
}
|
||||
|
||||
// truncate returns s unchanged when it is at most n bytes long; otherwise it
// returns a prefix of at most n bytes followed by "…". The cut is moved back to
// the nearest rune boundary so a multi-byte UTF-8 character is never split, and
// a negative n yields just "…" instead of panicking.
func truncate(s string, n int) string {
	if len(s) <= n {
		return s
	}
	// Ranging over a string yields the byte offset of each rune start; keep the
	// largest one that does not exceed n.
	cut := 0
	for i := range s {
		if i > n {
			break
		}
		cut = i
	}
	return s[:cut] + "…"
}
|
||||
@@ -0,0 +1,59 @@
|
||||
// send-test-push enqueues a one-shot Asynq push notification task. The worker
|
||||
// picks it up and routes it through internal/push/Client.SendToAll, which now
|
||||
// hits APNs production. Verifies end-to-end that push delivery is working
|
||||
// without waiting for the next cron tick.
|
||||
//
|
||||
// Usage:
|
||||
//
|
||||
// # Port-forward Redis from the cluster first:
|
||||
// kubectl --kubeconfig=~/.kube/honeydue-k3s.yaml -n honeydue port-forward svc/redis 6379:6379
|
||||
//
|
||||
// # Then in another shell:
|
||||
// go run ./cmd/send-test-push --user-id 6 --title "Test" --message "Hello from send-test-push"
|
||||
package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
|
||||
"github.com/hibiken/asynq"
|
||||
|
||||
"github.com/treytartt/honeydue-api/internal/worker/jobs"
|
||||
)
|
||||
|
||||
func main() {
|
||||
userID := flag.Uint("user-id", 0, "Target auth_user.id (required)")
|
||||
title := flag.String("title", "Test push", "Notification title")
|
||||
message := flag.String("message", "Hello from send-test-push", "Notification body")
|
||||
redisAddr := flag.String("redis", "localhost:6379", "Redis host:port (use kubectl port-forward to reach the in-cluster redis)")
|
||||
flag.Parse()
|
||||
|
||||
if *userID == 0 {
|
||||
fmt.Fprintln(os.Stderr, "--user-id is required")
|
||||
os.Exit(2)
|
||||
}
|
||||
|
||||
task, err := jobs.NewSendPushTask(*userID, *title, *message, map[string]string{
|
||||
"type": "test",
|
||||
"user_id": strconv.FormatUint(uint64(*userID), 10),
|
||||
})
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "build task: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
client := asynq.NewClient(asynq.RedisClientOpt{Addr: *redisAddr})
|
||||
defer func() { _ = client.Close() }()
|
||||
|
||||
info, err := client.Enqueue(task, asynq.Queue("default"), asynq.MaxRetry(3))
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "enqueue: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
fmt.Printf("Enqueued task: id=%s queue=%s type=%s\n", info.ID, info.Queue, info.Type)
|
||||
fmt.Printf("Tail worker logs to see the result:\n")
|
||||
fmt.Printf(" kubectl --kubeconfig=~/.kube/honeydue-k3s.yaml -n honeydue logs deploy/worker --tail=20 -f\n")
|
||||
}
|
||||
@@ -11,13 +11,19 @@ import (
|
||||
"github.com/hibiken/asynq"
|
||||
"github.com/redis/go-redis/v9"
|
||||
"github.com/rs/zerolog/log"
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
"go.opentelemetry.io/otel/codes"
|
||||
"go.opentelemetry.io/otel/trace"
|
||||
|
||||
"github.com/treytartt/honeydue-api/internal/config"
|
||||
"github.com/treytartt/honeydue-api/internal/database"
|
||||
"github.com/treytartt/honeydue-api/internal/monitoring"
|
||||
"github.com/treytartt/honeydue-api/internal/prom"
|
||||
"github.com/treytartt/honeydue-api/internal/push"
|
||||
"github.com/treytartt/honeydue-api/internal/repositories"
|
||||
"github.com/treytartt/honeydue-api/internal/services"
|
||||
"github.com/treytartt/honeydue-api/internal/tracing"
|
||||
"github.com/treytartt/honeydue-api/internal/worker"
|
||||
"github.com/treytartt/honeydue-api/internal/worker/jobs"
|
||||
"github.com/treytartt/honeydue-api/pkg/utils"
|
||||
)
|
||||
@@ -40,6 +46,29 @@ func main() {
|
||||
os.Exit(0)
|
||||
}
|
||||
|
||||
// Initialize OpenTelemetry tracing for the worker process. Same OTLP
|
||||
// destination as the api; service.name distinguishes them in Jaeger.
|
||||
// config.SecretValue (not os.Getenv) so file-mounted secrets resolve
|
||||
// after audit F8 removed these from the process environment.
|
||||
tracingShutdown, err := tracing.Init(context.Background(), tracing.Config{
|
||||
ServiceName: "honeydue-worker",
|
||||
Environment: workerDeploymentEnv(cfg.Server.Debug),
|
||||
EndpointURL: config.SecretValue("OBS_TRACES_URL"),
|
||||
BearerToken: config.SecretValue("OBS_INGEST_TOKEN"),
|
||||
SampleRatio: tracing.SampleRatioFromEnv(),
|
||||
})
|
||||
if err != nil {
|
||||
log.Error().Err(err).Msg("worker tracing init failed — continuing without traces")
|
||||
}
|
||||
defer func() {
|
||||
shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
if err := tracingShutdown(shutdownCtx); err != nil {
|
||||
log.Warn().Err(err).Msg("worker tracing shutdown error")
|
||||
}
|
||||
}()
|
||||
asynqTracer := tracing.Tracer("honeydue/worker/asynq")
|
||||
|
||||
// Initialize database
|
||||
db, err := database.Connect(&cfg.Database, cfg.Server.Debug)
|
||||
if err != nil {
|
||||
@@ -80,6 +109,17 @@ func main() {
|
||||
if err != nil {
|
||||
log.Fatal().Err(err).Msg("Failed to parse Redis URL")
|
||||
}
|
||||
// Audit HIGH-1: the Redis password is a file-mounted secret (REDIS_PASSWORD),
|
||||
// not embedded in REDIS_URL — REDIS_URL travels in the honeydue-config
|
||||
// ConfigMap. Apply the password onto the parsed opt so the Asynq server,
|
||||
// inspector and monitoring client (all derived from redisOpt below)
|
||||
// authenticate against a requirepass-protected Redis.
|
||||
if cfg.Redis.Password != "" {
|
||||
if clientOpt, ok := redisOpt.(asynq.RedisClientOpt); ok {
|
||||
clientOpt.Password = cfg.Redis.Password
|
||||
redisOpt = clientOpt
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize monitoring service (if Redis is available)
|
||||
var monitoringService *monitoring.Service
|
||||
@@ -141,14 +181,62 @@ func main() {
|
||||
// Create job handler
|
||||
jobHandler := jobs.NewHandler(db, pushClient, emailService, notificationService, cfg)
|
||||
|
||||
// Wire upload service for the pending_uploads cleanup cron AND share the
|
||||
// underlying storage service with the TaskService below so the worker can
|
||||
// load completion images for email embedding. Storage may be local-disk
|
||||
// (no S3 backend), in which case the upload service stays nil and the
|
||||
// cleanup handler no-ops. Cache is optional — the cleanup path doesn't
|
||||
// rate-limit and works fine with a nil cache.
|
||||
var sharedStorageService *services.StorageService
|
||||
if storageService, sErr := services.NewStorageService(&cfg.Storage); sErr == nil {
|
||||
sharedStorageService = storageService
|
||||
if s3 := storageService.S3Backend(); s3 != nil {
|
||||
pendingUploadRepo := repositories.NewPendingUploadRepository(db)
|
||||
uploadService := services.NewUploadService(pendingUploadRepo, s3, &cfg.Storage, nil)
|
||||
jobHandler.SetUploadService(uploadService)
|
||||
}
|
||||
} else {
|
||||
log.Warn().Err(sErr).Msg("Failed to initialize storage service for upload cleanup; cleanup cron will no-op")
|
||||
}
|
||||
|
||||
// Wire a TaskService for the task-completed notification handler. The
|
||||
// worker re-creates this (vs. importing the api's wired instance) because
|
||||
// each binary owns its own dependency graph. The handler is fully nil-safe
|
||||
// — if any of the wired services are absent, the corresponding side of
|
||||
// notification delivery (push or email) is skipped.
|
||||
taskRepo := repositories.NewTaskRepository(db)
|
||||
residenceRepo := repositories.NewResidenceRepository(db)
|
||||
workerTaskService := services.NewTaskService(taskRepo, residenceRepo)
|
||||
if notificationService != nil {
|
||||
workerTaskService.SetNotificationService(notificationService)
|
||||
}
|
||||
if emailService != nil {
|
||||
workerTaskService.SetEmailService(emailService)
|
||||
}
|
||||
if sharedStorageService != nil {
|
||||
workerTaskService.SetStorageService(sharedStorageService)
|
||||
}
|
||||
jobHandler.SetTaskService(workerTaskService)
|
||||
|
||||
// Create Asynq mux and register handlers
|
||||
mux := asynq.NewServeMux()
|
||||
|
||||
// Tracing + metrics middleware: every job runs inside a span and emits
|
||||
// asynq_job_duration_seconds{task_type,result}.
|
||||
mux.Use(asynqTracingMiddleware(asynqTracer))
|
||||
|
||||
mux.HandleFunc(jobs.TypeSmartReminder, jobHandler.HandleSmartReminder)
|
||||
mux.HandleFunc(jobs.TypeDailyDigest, jobHandler.HandleDailyDigest)
|
||||
mux.HandleFunc(jobs.TypeSendEmail, jobHandler.HandleSendEmail)
|
||||
mux.HandleFunc(jobs.TypeSendPush, jobHandler.HandleSendPush)
|
||||
mux.HandleFunc(jobs.TypeOnboardingEmails, jobHandler.HandleOnboardingEmails)
|
||||
mux.HandleFunc(jobs.TypeReminderLogCleanup, jobHandler.HandleReminderLogCleanup)
|
||||
mux.HandleFunc(jobs.TypeUploadCleanup, jobHandler.HandleUploadCleanup)
|
||||
mux.HandleFunc(jobs.TypeNotificationCleanup, jobHandler.HandleNotificationCleanup)
|
||||
mux.HandleFunc(jobs.TypeWebhookLogCleanup, jobHandler.HandleWebhookLogCleanup)
|
||||
mux.HandleFunc(jobs.TypeAuditLogCleanup, jobHandler.HandleAuditLogCleanup)
|
||||
mux.HandleFunc(worker.TypeTaskCompletedNotification, jobHandler.HandleTaskCompletedNotification)
|
||||
mux.HandleFunc(worker.TypeDataExport, jobHandler.HandleDataExport)
|
||||
|
||||
// Register email job handlers (welcome, verification, password reset, password changed)
|
||||
if emailService != nil {
|
||||
@@ -188,6 +276,32 @@ func main() {
|
||||
}
|
||||
log.Info().Str("cron", "0 3 * * *").Msg("Registered reminder log cleanup job (runs daily at 3:00 AM UTC)")
|
||||
|
||||
// Schedule pending_uploads cleanup (hourly at :30 to avoid colliding with
|
||||
// the top-of-hour reminder + digest crons). Reaps unclaimed expired
|
||||
// upload sessions; the B2 bucket lifecycle (7 days on uploads/ prefix)
|
||||
// is the backstop if this worker is offline for an extended period.
|
||||
if _, err := scheduler.Register("30 * * * *", asynq.NewTask(jobs.TypeUploadCleanup, nil)); err != nil {
|
||||
log.Fatal().Err(err).Msg("Failed to register upload cleanup job")
|
||||
}
|
||||
log.Info().Str("cron", "30 * * * *").Msg("Registered pending_uploads cleanup job (runs hourly)")
|
||||
|
||||
// Data-retention cleanups (BE-2). Staggered off the 3:00 reminder cleanup to
|
||||
// avoid piling DELETEs onto the same Neon connection window.
|
||||
if _, err := scheduler.Register("0 2 * * *", asynq.NewTask(jobs.TypeNotificationCleanup, nil)); err != nil {
|
||||
log.Fatal().Err(err).Msg("Failed to register notification cleanup job")
|
||||
}
|
||||
log.Info().Str("cron", "0 2 * * *").Msg("Registered notification cleanup job (daily 02:00 UTC, 90d retention)")
|
||||
|
||||
if _, err := scheduler.Register("30 2 * * 0", asynq.NewTask(jobs.TypeWebhookLogCleanup, nil)); err != nil {
|
||||
log.Fatal().Err(err).Msg("Failed to register webhook log cleanup job")
|
||||
}
|
||||
log.Info().Str("cron", "30 2 * * 0").Msg("Registered webhook log cleanup job (weekly Sun 02:30 UTC, 180d retention)")
|
||||
|
||||
if _, err := scheduler.Register("30 3 * * 0", asynq.NewTask(jobs.TypeAuditLogCleanup, nil)); err != nil {
|
||||
log.Fatal().Err(err).Msg("Failed to register audit log cleanup job")
|
||||
}
|
||||
log.Info().Str("cron", "30 3 * * 0").Msg("Registered audit log cleanup job (weekly Sun 03:30 UTC, 365d retention)")
|
||||
|
||||
// Handle graceful shutdown
|
||||
quit := make(chan os.Signal, 1)
|
||||
signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)
|
||||
@@ -199,6 +313,12 @@ func main() {
|
||||
w.WriteHeader(http.StatusOK)
|
||||
_, _ = w.Write([]byte(`{"status":"ok"}`))
|
||||
})
|
||||
// Expose Prometheus metrics so vmagent can scrape the worker. The
|
||||
// apns_send_*, fcm_send_*, asynq_job_* and cache_ops_* series have been
|
||||
// recorded on this process all along — they were just never exposed, which
|
||||
// is why those dashboard panels read empty. Same :6060 as health; in-cluster
|
||||
// only (not externally published).
|
||||
healthMux.Handle("/metrics", prom.HTTPHandler())
|
||||
healthSrv := &http.Server{
|
||||
Addr: workerHealthAddr,
|
||||
Handler: healthMux,
|
||||
@@ -238,3 +358,44 @@ func main() {
|
||||
|
||||
log.Info().Msg("Worker stopped")
|
||||
}
|
||||
|
||||
// asynqTracingMiddleware returns an asynq.MiddlewareFunc that opens a span
|
||||
// per task execution and records asynq_job_duration_seconds. Span attrs
|
||||
// include task type, queue, retry count, and the result outcome.
|
||||
func asynqTracingMiddleware(tracer trace.Tracer) asynq.MiddlewareFunc {
|
||||
return func(next asynq.Handler) asynq.Handler {
|
||||
return asynq.HandlerFunc(func(ctx context.Context, t *asynq.Task) error {
|
||||
ctx, span := tracer.Start(ctx, "asynq.handle:"+t.Type(),
|
||||
trace.WithAttributes(
|
||||
attribute.String("asynq.task_type", t.Type()),
|
||||
attribute.Int("asynq.payload_bytes", len(t.Payload())),
|
||||
),
|
||||
)
|
||||
defer span.End()
|
||||
|
||||
start := time.Now()
|
||||
err := next.ProcessTask(ctx, t)
|
||||
dur := time.Since(start)
|
||||
result := "ok"
|
||||
if err != nil {
|
||||
result = "error"
|
||||
span.SetStatus(codes.Error, err.Error())
|
||||
span.RecordError(err)
|
||||
}
|
||||
span.SetAttributes(attribute.String("asynq.result", result))
|
||||
prom.ObserveAsynqJob(t.Type(), result, dur)
|
||||
return err
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// workerDeploymentEnv mirrors deploymentEnvironment in cmd/api/main.go: a
// non-empty DEPLOYMENT_ENVIRONMENT variable wins; otherwise the result is
// "dev" when debug is true and "prod" when it is false.
func workerDeploymentEnv(debug bool) string {
	override := os.Getenv("DEPLOYMENT_ENVIRONMENT")
	switch {
	case override != "":
		return override
	case debug:
		return "dev"
	default:
		return "prod"
	}
}
|
||||
|
||||
@@ -42,7 +42,7 @@ email:
|
||||
push:
|
||||
apns_key_id: ""
|
||||
apns_team_id: ""
|
||||
apns_topic: com.tt.honeyDue
|
||||
apns_topic: com.myhoneydue.honeyDue.dev
|
||||
apns_production: false
|
||||
apns_use_sandbox: true # Sandbox for dev
|
||||
|
||||
@@ -85,8 +85,9 @@ tls:
|
||||
# If mode=cloudflare, create secrets/cloudflare-origin.crt and .key
|
||||
|
||||
# --- Apple Auth / IAP (optional) ---
|
||||
# client_id MUST equal the iOS Debug bundle ID for the dev backend.
|
||||
apple_auth:
|
||||
client_id: ""
|
||||
client_id: "com.myhoneydue.honeyDue.dev"
|
||||
team_id: ""
|
||||
iap_key_id: ""
|
||||
iap_issuer_id: ""
|
||||
|
||||
@@ -92,7 +92,7 @@ ADMIN_PW="$(openssl rand -base64 16)"
|
||||
|
||||
EMAIL_USER="treytartt@fastmail.com"
|
||||
APNS_KEY_ID="9R5Q7ZX874"
|
||||
APNS_TEAM_ID="V3PF3M6B6U"
|
||||
APNS_TEAM_ID="X86BR9WTLD"
|
||||
|
||||
log ""
|
||||
log "Pre-filled from existing dev server:"
|
||||
@@ -147,7 +147,7 @@ email:
|
||||
push:
|
||||
apns_key_id: "${APNS_KEY_ID}"
|
||||
apns_team_id: "${APNS_TEAM_ID}"
|
||||
apns_topic: com.tt.honeyDue
|
||||
apns_topic: com.myhoneydue.honeyDue.dev
|
||||
apns_production: false
|
||||
apns_use_sandbox: true
|
||||
|
||||
@@ -189,7 +189,7 @@ tls:
|
||||
|
||||
# --- Apple Auth / IAP ---
|
||||
apple_auth:
|
||||
client_id: "com.tt.honeyDue"
|
||||
client_id: "com.myhoneydue.honeyDue.dev"
|
||||
team_id: "${APNS_TEAM_ID}"
|
||||
iap_key_id: ""
|
||||
iap_issuer_id: ""
|
||||
|
||||
@@ -3,6 +3,7 @@ config.yaml
|
||||
|
||||
# Generated files
|
||||
kubeconfig
|
||||
kubeconfig.*
|
||||
cluster-config.yaml
|
||||
prod.env
|
||||
|
||||
|
||||
@@ -0,0 +1,966 @@
|
||||
# honeyDue k3s Cluster — Operations Runbook
|
||||
|
||||
Living document for the honeyDue production cluster. Add entries when you hit
|
||||
something non-obvious so future-you (or your replacement) doesn't have to
|
||||
rediscover it.
|
||||
|
||||
Last full revision: **2026-06-03** (Hetzner → OVH BHS cutover; cluster solo
|
||||
production from that date forward). For pre-OVH history, see
|
||||
`MIGRATION_NOTES.md` (Swarm → k3s migration on Hetzner, 2026-04-24).
|
||||
|
||||
---
|
||||
|
||||
## 1. Topology and inventory
|
||||
|
||||
### Hosting
|
||||
|
||||
| | |
|
||||
|---|---|
|
||||
| Provider | OVHcloud (us.ovhcloud.com) |
|
||||
| Datacenter | BHS — Beauharnois, Quebec, Canada |
|
||||
| Plan | VPS-1 × 3 (~$6.46/mo each, ~$19/mo total) |
|
||||
| Node spec | 4 vCPU (Intel Haswell, shared), 7.6 GB RAM, 75 GB NVMe |
|
||||
| Public bandwidth | 400 Mbps per node, unlimited traffic |
|
||||
| Private network | **None.** Nodes have public IPv4 + IPv6 only; inter-node traffic crosses the public internet (encrypted by flannel WireGuard backend — see §3) |
|
||||
|
||||
### Nodes
|
||||
|
||||
| SSH alias | Kubernetes node name | Public IPv4 | Public IPv6 | Roles |
|
||||
|---|---|---|---|---|
|
||||
| `ovhcloud1` | `vps-1624d691` | `51.81.83.33` | `2604:2dc0:101:200::5a9a` | control-plane, etcd, redis-pinned |
|
||||
| `ovhcloud2` | `vps-c0f51be2` | `51.81.87.86` | `2604:2dc0:101:200::30d4` | control-plane, etcd |
|
||||
| `ovhcloud3` | `vps-dbca24c7` | `51.81.85.248` | `2604:2dc0:101:200::450f` | control-plane, etcd |
|
||||
|
||||
The cluster is **all-control-plane** (workloads schedule on the same nodes that
|
||||
run etcd and the API server). `vps-1624d691` carries the
|
||||
`honeydue/redis=true` label so the Redis Deployment's `nodeSelector` binds
|
||||
there; the Redis PVC (`local-path`, host-pinned) lives on that node's disk.
|
||||
|
||||
### SSH access
|
||||
|
||||
`~/.ssh/config` entries (operator workstation):
|
||||
|
||||
```
|
||||
Host ovhcloud1
|
||||
HostName 51.81.83.33
|
||||
Port 22
|
||||
User ubuntu
|
||||
IdentityFile ~/.ssh/ovhcloud
|
||||
IdentitiesOnly yes
|
||||
Host ovhcloud2
|
||||
HostName 51.81.87.86
|
||||
Port 22
|
||||
User ubuntu
|
||||
IdentityFile ~/.ssh/ovhcloud
|
||||
IdentitiesOnly yes
|
||||
Host ovhcloud3
|
||||
HostName 51.81.85.248
|
||||
Port 22
|
||||
User ubuntu
|
||||
IdentityFile ~/.ssh/ovhcloud
|
||||
IdentitiesOnly yes
|
||||
```
|
||||
|
||||
`ubuntu` has passwordless sudo (`/etc/sudoers.d/90-cloud-init-users` from OVH's
|
||||
cloud-init).
|
||||
|
||||
### kubectl access
|
||||
|
||||
```bash
|
||||
export KUBECONFIG=/Users/treyt/Desktop/code/honeyDue/honeyDueAPI-go/deploy-k3s/kubeconfig
|
||||
kubectl get nodes
|
||||
```
|
||||
|
||||
The `deploy-k3s/kubeconfig` file (mode 0600, gitignored) is the OVH cluster's
|
||||
admin kubeconfig with `server: https://51.81.83.33:6443`. A stale Hetzner copy
|
||||
lives next to it as `kubeconfig.hetzner.bak` for historical reference; the
|
||||
Hetzner cluster is powered off and that file's API server is unreachable.
|
||||
|
||||
To refresh from the cluster (if the local copy is lost or rotated):
|
||||
|
||||
```bash
|
||||
ssh ovhcloud1 'sudo cat /etc/rancher/k3s/k3s.yaml' \
|
||||
| sed 's|server: https://127.0.0.1:6443|server: https://51.81.83.33:6443|' \
|
||||
> deploy-k3s/kubeconfig
|
||||
chmod 600 deploy-k3s/kubeconfig
|
||||
```
|
||||
|
||||
The k3s API at `:6443` is open to the public internet (token-protected).
|
||||
|
||||
---
|
||||
|
||||
## 2. Software
|
||||
|
||||
### Kernel-level
|
||||
|
||||
| | |
|
||||
|---|---|
|
||||
| OS | Ubuntu 26.04 LTS (set by OVH's VPS-1 image) |
|
||||
| Kernel | `7.0.0-14-generic` |
|
||||
| Init | systemd |
|
||||
| Container runtime | containerd 2.2.2 (bundled with k3s) |
|
||||
| Firewall | `ufw` (per-node, configured at install — see §3) |
|
||||
| Other host packages | `fail2ban` (SSH brute-force protection, default jail), `unattended-upgrades` (security updates), `open-iscsi` (k3s prereq for some storage backends), `curl` |
|
||||
|
||||
### Kubernetes
|
||||
|
||||
| | |
|
||||
|---|---|
|
||||
| Distribution | k3s |
|
||||
| Version | **`v1.34.6+k3s1`** (pinned in `config.yaml:cluster.k3s_version`) |
|
||||
| Control plane | 3-node HA, embedded etcd (no external Postgres backing store) |
|
||||
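| CNI / networking | flannel with **WireGuard-native backend** (`--flannel-backend=wireguard-native`). Encrypts pod-to-pod traffic between nodes because nodes only have public IPs (no private network). etcd peer traffic does not ride the tunnel — it goes node-IP to node-IP on `2379:2380` (see the ufw rules in §3) and is protected by k3s's own etcd TLS. ~3-5% CPU overhead under load. |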
|
||||
| Service LB | klipper-lb (default k3s `servicelb`). The `svclb-traefik` DaemonSet binds host ports `:80` and `:443` on each node and forwards to the Traefik Service. **Not** the DaemonSet-w/-hostNetwork Traefik pattern used on the old Hetzner cluster — see §10 *Differences from MIGRATION_NOTES*. |
|
||||
| Ingress controller | Traefik (k3s default), single-replica Deployment, exposed via klipper-lb |
|
||||
| DNS | CoreDNS (k3s default) |
|
||||
| Secrets encryption | Enabled (`--secrets-encryption`); etcd values are AES-CBC encrypted at rest |
|
||||
| kubeconfig perms | `0600` (`--write-kubeconfig-mode=0600`) |
|
||||
| Cloud controller | Disabled (`--disable-cloud-controller`) — no provider integration on OVH |
|
||||
| Misc | `--node-ip` / `--node-external-ip` / `--advertise-address` all set to each node's public IPv4. TLS SANs cover all 3 IPs so any IP can serve the API. |
|
||||
|
||||
### Application stack (in cluster, `honeydue` namespace)
|
||||
|
||||
| Deployment | Replicas | Image (digest-pinned) | Notes |
|
||||
|---|---:|---|---|
|
||||
| `api` | 3 | `gitea.treytartt.com/admin/honeydue-api@sha256:34fde6...` | Go REST API on `:8000`, exposes `/metrics` |
|
||||
| `web` | 3 | `gitea.treytartt.com/admin/honeydue-web@sha256:8c62cf...` | Next.js, server-side proxy to api |
|
||||
| `admin` | 1 | `gitea.treytartt.com/admin/honeydue-admin@sha256:b81263...` | Next.js admin panel, gated behind Traefik basic-auth |
|
||||
| `worker` | 1 | `gitea.treytartt.com/admin/honeydue-worker@sha256:fe1f5e...` | Asynq scheduler + Redis-backed jobs (singleton — must not run as >1 replica or every cron fires N×) |
|
||||
| `redis` | 1 | `redis:7-alpine@sha256:6ab0b6...` | Pinned to `vps-1624d691` via `honeydue/redis=true`. PVC `redis-data` (local-path, 5 Gi). Password-auth required. |
|
||||
| `vmagent` | 1 | `victoriametrics/vmagent@sha256:...` (default tag) | Scrapes api `/metrics` + kube-state-metrics; remote-writes to obs.88oakapps.com |
|
||||
| `kube-state-metrics` | 1 | `kube-state-metrics@sha256:...` | In `kube-system`, scraped by vmagent for `kube_*` cluster-state metrics |
|
||||
| `alloy-logs` (DaemonSet) | 3 (1/node) | `grafana/alloy@sha256:...` | Tails `/var/log/pods/*` and ships to Loki at obs.88oakapps.com |
|
||||
|
||||
The Asynq scheduler inside `worker` registers these cron jobs:
|
||||
|
||||
| Cron | Job | Notes |
|
||||
|---|---|---|
|
||||
| `0 * * * *` | Smart reminder check (per-user hour) | Default user hour: 14:00 UTC |
|
||||
| `0 * * * *` | Daily digest check (per-user hour) | Default user hour: 03:00 UTC |
|
||||
| `0 10 * * *` | Onboarding emails | 10:00 UTC |
|
||||
| `0 3 * * *` | Reminder log cleanup | 03:00 UTC |
|
||||
| `30 * * * *` | Pending uploads cleanup | xx:30 every hour |
|
||||
|
||||
### External dependencies
|
||||
|
||||
| Service | Endpoint | Purpose | Failure mode |
|
||||
|---|---|---|---|
|
||||
| Neon Postgres | `ep-floral-truth-amttbc5a-pooler.c-5.us-east-1.aws.neon.tech:5432` | App data. Pooler endpoint (transaction-mode PgBouncer in front of Neon compute) so connections stay warm. | api / worker pods crash-loop with `dial tcp: connection refused`. Health endpoint returns `postgres: error`. |
|
||||
| Backblaze B2 (S3-compatible) | `s3.us-east-005.backblazeb2.com` (bucket `honeyDueProd`) | User uploads (photos, PDFs, completion attachments) | Upload routes return 5xx; reads of cached/static files still work. |
|
||||
| Cloudflare | `myhoneydue.com` zone | DNS + TLS termination + edge cache + DDoS | Traffic stops reaching origin. Direct `https://51.81.x.x` still works for diagnostics. |
|
||||
| obs.88oakapps.com | Operator-run Grafana + VictoriaMetrics + Loki | Metrics & logs | vmagent + alloy-logs back off and retry. No app-side impact. |
|
||||
| Apple APNs | `api.push.apple.com:443` (production) | iOS push notifications | Push fails; circuit breaker opens; failure logged. App functionality unaffected. |
|
||||
| Fastmail SMTP | `smtp.fastmail.com:587` | Transactional emails (verification, recovery, digests) | Email send fails in the worker; logged; user reset/digest flow degrades. |
|
||||
| Gitea registry | `gitea.treytartt.com` | Container image registry | Deploys can't pull. Existing pods keep running on cached images. |
|
||||
|
||||
---
|
||||
|
||||
## 3. Network and firewall
|
||||
|
||||
### Per-node `ufw` configuration
|
||||
|
||||
Applied during install (same on all 3 nodes):
|
||||
|
||||
```
|
||||
default deny incoming
|
||||
default allow outgoing
|
||||
allow 22/tcp (SSH, world)
|
||||
allow 80/tcp (HTTP via Cloudflare, world — see GAP-1)
|
||||
allow 443/tcp (HTTPS, same — GAP-1)
|
||||
allow 6443/tcp (k3s API, world, token-protected)
|
||||
allow 2379:2380/tcp from <other 2 OVH IPs> (etcd client + peer)
|
||||
allow 10250/tcp from <other 2 OVH IPs> (kubelet)
|
||||
allow 51820/udp from <other 2 OVH IPs> (WireGuard tunnel)
|
||||
allow 8472/udp from <other 2 OVH IPs> (VXLAN, defense-in-depth fallback)
|
||||
```
|
||||
|
||||
To inspect: `ssh ovhcloudN sudo ufw status numbered`.
|
||||
|
||||
### Cluster networking
|
||||
|
||||
- **Pod CIDR**: `10.42.0.0/16` (default k3s)
|
||||
- **Service CIDR**: `10.43.0.0/16` (default k3s)
|
||||
- **Flannel backend**: WireGuard-native. Each node hosts a `flannel-wg` interface on UDP 51820 and tunnels pod traffic to peers. Verify: `ssh ovhcloudN ip -d link show flannel-wg`.
|
||||
|
||||
### Traefik ingress flow
|
||||
|
||||
```
|
||||
Cloudflare → node:80/443 (public)
|
||||
→ klipper-lb svclb-traefik DaemonSet pod (hostPort:80/443)
|
||||
→ Traefik Service (ClusterIP 10.43.245.127:80/443)
|
||||
→ Traefik Deployment pod (single replica)
|
||||
→ matches Ingress host rule (api.myhoneydue.com etc.)
|
||||
→ routes to backend Service (api / web / admin)
|
||||
→ backend Pod
|
||||
```
|
||||
|
||||
The default Traefik also lives in `kube-system` and is managed by k3s's
|
||||
HelmChart. **No HelmChartConfig override is applied on OVH** (unlike Hetzner
|
||||
— see §10).
|
||||
|
||||
---
|
||||
|
||||
## 4. DNS configuration (Cloudflare)
|
||||
|
||||
The `myhoneydue.com` zone in Cloudflare has these public records. **All
|
||||
hostnames are proxied (orange cloud)** — required by the `cloudflare-only`
|
||||
Traefik middleware which 403s any non-CF source IP.
|
||||
|
||||
| Host | Type | Values | Proxy |
|
||||
|---|---|---|---|
|
||||
| `api.myhoneydue.com` | A × 3 | `51.81.83.33`, `51.81.87.86`, `51.81.85.248` | Proxied |
|
||||
| `app.myhoneydue.com` | A × 3 | (same trio) | Proxied |
|
||||
| `admin.myhoneydue.com` | A × 3 | (same trio) | Proxied |
|
||||
| `myhoneydue.com` (apex `@`) | A × 3 | (same trio) | Proxied |
|
||||
|
||||
Cloudflare round-robins among the 3 origins, klipper-lb on whichever node CF
|
||||
hits forwards to Traefik, and Traefik routes by Host header. Per-request,
|
||||
traffic is effectively load-balanced across the 3 nodes for ingress, with no central LB.
|
||||
|
||||
**SSL/TLS mode**: Flexible (CF terminates TLS at the edge; origin is plain
|
||||
HTTP on `:80`). Upgrading to Full (strict) is on the deferred list — would
|
||||
need an origin certificate provisioned to `cloudflare-origin-cert` secret and
|
||||
Traefik configured for TLS termination.
|
||||
|
||||
---
|
||||
|
||||
## 5. Filesystem layout (`deploy-k3s/`)
|
||||
|
||||
```
|
||||
deploy-k3s/
|
||||
├── config.yaml # Single config source (gitignored; contains tokens)
|
||||
├── config.yaml.example # Template
|
||||
├── kubeconfig # OVH admin kubeconfig (gitignored, 0600)
|
||||
├── kubeconfig.hetzner.bak # Old Hetzner kubeconfig (unreachable, kept for history)
|
||||
├── kubeconfig.tunnel # Optional: localhost-pointing copy for SSH-tunnel use
|
||||
├── secrets/
|
||||
│ ├── README.md
|
||||
│ ├── postgres_password.txt # Neon DB password
|
||||
│ ├── secret_key.txt # 32+ char app-token signing secret
|
||||
│ ├── email_host_password.txt # Fastmail SMTP app password
|
||||
│ ├── fcm_server_key.txt # FCM server key (currently unused — Android push disabled)
|
||||
│ ├── apns_auth_key.p8 # APNs auth key (binary)
|
||||
│ ├── cloudflare-origin.crt # Origin certificate (currently unused — CF Flexible)
|
||||
│ └── cloudflare-origin.key
|
||||
│ (all gitignored except README.md)
|
||||
├── manifests/
|
||||
│ ├── namespace.yaml
|
||||
│ ├── network-policies.yaml # default-deny + per-app egress/ingress (13 NetPols total)
|
||||
│ ├── rbac.yaml # api/worker/admin/web/redis ServiceAccounts (NOT applied by 03-deploy.sh; manual once)
|
||||
│ ├── pod-disruption-budgets.yaml # api-pdb, web-pdb, worker-pdb (NOT applied by 03-deploy.sh; manual once)
|
||||
│ ├── traefik-helmchartconfig.yaml # Hetzner-only DaemonSet+hostNetwork override (do NOT apply on OVH; we use default klipper-lb)
|
||||
│ ├── kyverno-verify-images.yaml # Operator-gated policy (do NOT apply blindly — see file comment)
|
||||
│ ├── api/{deployment,service,hpa}.yaml
|
||||
│ ├── worker/deployment.yaml
|
||||
│ ├── admin/{deployment,service}.yaml
|
||||
│ ├── web/{deployment,service}.yaml
|
||||
│ ├── redis/{deployment,service,pvc}.yaml
|
||||
│ ├── ingress/{middleware,ingress-simple}.yaml
|
||||
│ ├── migrate/job.yaml # goose migration Job (image-subbed at deploy time)
|
||||
│ ├── observability/{kube-state-metrics,vmagent,alloy-logs}.yaml
|
||||
│ └── kratos/ # Ory Kratos identity service (NOT yet deployed; gated on operator OIDC setup)
|
||||
└── scripts/
|
||||
├── _config.sh # Sourced by all scripts: cfg(), generate_env(), generate_cluster_config()
|
||||
├── 01-provision-cluster.sh # Hetzner-Cloud-specific (uses hetzner-k3s CLI) — DO NOT RUN ON OVH
|
||||
├── 02-setup-secrets.sh # Creates honeydue-secrets etc. from secrets/ + config.yaml; kubeconfig-driven
|
||||
├── 03-deploy.sh # Build + push + apply manifests + roll deployments; kubeconfig-driven
|
||||
├── 04-verify.sh # Post-deploy health + security checks; kubeconfig-driven
|
||||
└── rollback.sh # `kubectl rollout undo` across all deployments
|
||||
```
|
||||
|
||||
The `deploy/prod.env` file (sibling to `deploy-k3s/`, gitignored) holds
|
||||
observability + admin credentials that `02-setup-secrets.sh` and `03-deploy.sh` read but never
|
||||
display:
|
||||
|
||||
```
|
||||
OBS_INGEST_URL (https://obs.88oakapps.com/api/v1/write)
|
||||
OBS_TRACES_URL (https://obs.88oakapps.com/v1/traces)
|
||||
OBS_INGEST_TOKEN (bearer token for VM + Loki + traces — all use same token)
|
||||
GRAFANA_URL (https://grafana.88oakapps.com)
|
||||
GRAFANA_ADMIN_USER (admin)
|
||||
GRAFANA_ADMIN_PASSWORD
|
||||
ADMIN_EMAIL / ADMIN_PASSWORD (in-app admin login)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 6. Install from clean boxes — the truthful procedure
|
||||
|
||||
This is what we ran on 2026-06-03 to stand up the live cluster, exactly. If
|
||||
you ever rebuild from zero this is the canonical sequence. Total wall-clock:
|
||||
~12 min for cluster bootstrap; ~10 min for workloads.
|
||||
|
||||
### 6.1 Prerequisites
|
||||
|
||||
- 3 fresh Ubuntu VPS instances (any provider with public IPv4, ≥4 GB RAM,
|
||||
≥40 GB disk)
|
||||
- `~/.ssh/config` entries (`ovhcloud1/2/3`) pointing at them, with
|
||||
passwordless sudo
|
||||
- Local `kubectl` and `curl`
|
||||
- The repo's `deploy-k3s/secrets/` populated (or the ability to copy live
|
||||
secrets from another running cluster — see §7.2)
|
||||
- `deploy/prod.env` populated with obs token + Grafana creds
|
||||
|
||||
### 6.2 Per-node OS hardening + firewall (all 3 in parallel)
|
||||
|
||||
For each `ovhcloudN`, over SSH:
|
||||
|
||||
```sh
|
||||
export DEBIAN_FRONTEND=noninteractive
|
||||
sudo apt-get update -qq
|
||||
sudo apt-get install -y -qq fail2ban unattended-upgrades open-iscsi curl ufw
|
||||
sudo systemctl enable --now iscsid fail2ban
|
||||
sudo dpkg-reconfigure -f noninteractive -plow unattended-upgrades
|
||||
|
||||
sudo ufw --force reset
|
||||
sudo ufw default deny incoming
|
||||
sudo ufw default allow outgoing
|
||||
sudo ufw allow 22/tcp
|
||||
sudo ufw allow 80/tcp
|
||||
sudo ufw allow 443/tcp
|
||||
sudo ufw allow 6443/tcp
|
||||
SELF=$(hostname -I | awk '{print $1}')
|
||||
for peer in 51.81.83.33 51.81.87.86 51.81.85.248; do
|
||||
[ "$peer" = "$SELF" ] && continue
|
||||
sudo ufw allow from "$peer" to any port 2379:2380 proto tcp
|
||||
sudo ufw allow from "$peer" to any port 10250 proto tcp
|
||||
sudo ufw allow from "$peer" to any port 51820 proto udp
|
||||
sudo ufw allow from "$peer" to any port 8472 proto udp
|
||||
done
|
||||
sudo ufw --force enable
|
||||
```
|
||||
|
||||
**Watch ordering:** `allow 22/tcp` MUST precede `ufw enable`. Existing SSH
|
||||
sessions survive (`ufw` only affects new connections), but a misordered script
|
||||
locks you out of fresh logins.
|
||||
|
||||
### 6.3 Install k3s on `ovhcloud1` (the init node)
|
||||
|
||||
```sh
|
||||
ssh ovhcloud1 'curl -sfL https://get.k3s.io | \
|
||||
INSTALL_K3S_VERSION=v1.34.6+k3s1 \
|
||||
sh -s - server \
|
||||
--cluster-init \
|
||||
--node-ip=51.81.83.33 \
|
||||
--node-external-ip=51.81.83.33 \
|
||||
--advertise-address=51.81.83.33 \
|
||||
--flannel-backend=wireguard-native \
|
||||
--flannel-external-ip \
|
||||
--secrets-encryption \
|
||||
--write-kubeconfig-mode=0600 \
|
||||
--tls-san=51.81.83.33 \
|
||||
--tls-san=51.81.87.86 \
|
||||
--tls-san=51.81.85.248 \
|
||||
--disable-cloud-controller'
|
||||
```
|
||||
|
||||
Wait for `sudo k3s kubectl get nodes` to show this node Ready (~2-5 s).
|
||||
Read the cluster token:
|
||||
|
||||
```sh
|
||||
ssh ovhcloud1 'sudo cat /var/lib/rancher/k3s/server/node-token'
|
||||
```
|
||||
|
||||
### 6.4 Join `ovhcloud2`, then `ovhcloud3` (sequential)
|
||||
|
||||
Joining etcd one node at a time avoids split-brain on slow networks.
|
||||
Replace `<TOKEN>` with the value from 6.3.
|
||||
|
||||
For `ovhcloud2`:
|
||||
|
||||
```sh
|
||||
ssh ovhcloud2 'curl -sfL https://get.k3s.io | \
|
||||
INSTALL_K3S_VERSION=v1.34.6+k3s1 \
|
||||
K3S_TOKEN=<TOKEN> \
|
||||
sh -s - server \
|
||||
--server=https://51.81.83.33:6443 \
|
||||
--node-ip=51.81.87.86 \
|
||||
--node-external-ip=51.81.87.86 \
|
||||
--advertise-address=51.81.87.86 \
|
||||
--flannel-backend=wireguard-native \
|
||||
--flannel-external-ip \
|
||||
--secrets-encryption \
|
||||
--write-kubeconfig-mode=0600 \
|
||||
--tls-san=51.81.83.33 --tls-san=51.81.87.86 --tls-san=51.81.85.248 \
|
||||
--disable-cloud-controller'
|
||||
```
|
||||
|
||||
Then identical for `ovhcloud3` with `--node-ip=51.81.85.248`, `--node-external-ip=51.81.85.248`, and
|
||||
`--advertise-address=51.81.85.248`. After each, wait for `kubectl get nodes`
|
||||
to show the new node Ready before proceeding.
|
||||
|
||||
### 6.5 Pull kubeconfig to the operator workstation
|
||||
|
||||
```sh
|
||||
ssh ovhcloud1 'sudo cat /etc/rancher/k3s/k3s.yaml' \
|
||||
| sed 's|server: https://127.0.0.1:6443|server: https://51.81.83.33:6443|' \
|
||||
> deploy-k3s/kubeconfig
|
||||
chmod 600 deploy-k3s/kubeconfig
|
||||
export KUBECONFIG=$(pwd)/deploy-k3s/kubeconfig
|
||||
kubectl get nodes -o wide # All 3 Ready, INTERNAL-IP = public IP
|
||||
```
|
||||
|
||||
### 6.6 Label the redis node
|
||||
|
||||
```sh
|
||||
kubectl label node vps-1624d691 honeydue/redis=true --overwrite
|
||||
```
|
||||
|
||||
(Use whichever k8s node name corresponds to `ovhcloud1`. The Redis
|
||||
Deployment's `nodeSelector` binds to this label.)
|
||||
|
||||
### 6.7 Bootstrap manifests NOT applied by `03-deploy.sh`
|
||||
|
||||
These must be applied manually on a fresh cluster, **before** running
|
||||
`03-deploy.sh`, or workload pods will never be created:
|
||||
|
||||
```sh
|
||||
kubectl apply -f deploy-k3s/manifests/rbac.yaml
|
||||
kubectl apply -f deploy-k3s/manifests/pod-disruption-budgets.yaml
|
||||
```
|
||||
|
||||
`rbac.yaml` creates the 5 ServiceAccounts (`api`, `worker`, `admin`, `web`,
|
||||
`redis`) referenced by the Deployment manifests. Without these, ReplicaSets
|
||||
hang on `FailedCreate: error looking up service account` and pods never
|
||||
start. Symptom on first deploy: `kubectl get deploy` shows `0 up-to-date`
|
||||
across the board with no pod activity — see §9 *Gotchas*.
|
||||
|
||||
**Do NOT apply** `traefik-helmchartconfig.yaml` (Hetzner-only — see §10) or
|
||||
`kyverno-verify-images.yaml` (gated on operator Kyverno install).
|
||||
|
||||
### 6.8 Seed secrets
|
||||
|
||||
Two paths; pick whichever fits your situation:
|
||||
|
||||
**Path A — clean install from local files** (the original design):
|
||||
|
||||
```sh
|
||||
KUBECONFIG=$(pwd)/deploy-k3s/kubeconfig ./deploy-k3s/scripts/02-setup-secrets.sh
|
||||
```
|
||||
|
||||
Requires `deploy-k3s/secrets/` to contain real `postgres_password.txt`,
|
||||
`secret_key.txt`, `email_host_password.txt`, `fcm_server_key.txt`,
|
||||
`apns_auth_key.p8`, `cloudflare-origin.crt`, `cloudflare-origin.key`. The
|
||||
script reads `config.yaml` for `registry.*`, `redis.password`,
|
||||
`admin.basic_auth_*`, and `storage.b2_*`.
|
||||
|
||||
**Path B — clone live secrets from another running cluster** (what we
|
||||
actually did during the migration; useful if `secrets/` is empty or you want
|
||||
exact-byte equivalence):
|
||||
|
||||
```sh
|
||||
HETZNER=$(pwd)/deploy-k3s/kubeconfig.hetzner.bak # or any kubeconfig with the secrets
|
||||
OVH=$(pwd)/deploy-k3s/kubeconfig
|
||||
kubectl --kubeconfig=$OVH apply -f deploy-k3s/manifests/namespace.yaml
|
||||
for S in honeydue-secrets honeydue-apns-key gitea-credentials cloudflare-origin-cert admin-basic-auth; do
|
||||
kubectl --kubeconfig=$HETZNER -n honeydue get secret $S -o json \
|
||||
| python3 -c "
|
||||
import json, sys
|
||||
d = json.load(sys.stdin)
|
||||
m = d['metadata']
|
||||
for k in ('uid','resourceVersion','creationTimestamp','generation','managedFields','ownerReferences','selfLink'):
|
||||
m.pop(k, None)
|
||||
m.pop('annotations', None)
|
||||
print(json.dumps(d))" \
|
||||
| kubectl --kubeconfig=$OVH apply -f -
|
||||
done
|
||||
```
|
||||
|
||||
After either path, verify:
|
||||
|
||||
```sh
|
||||
kubectl -n honeydue get secrets
|
||||
# Expect: admin-basic-auth, cloudflare-origin-cert, gitea-credentials,
|
||||
# honeydue-apns-key, honeydue-secrets
|
||||
```
|
||||
|
||||
### 6.9 Deploy workloads
|
||||
|
||||
```sh
|
||||
KUBECONFIG=$(pwd)/deploy-k3s/kubeconfig \
|
||||
./deploy-k3s/scripts/03-deploy.sh --skip-build --tag latest
|
||||
```
|
||||
|
||||
- `--skip-build` skips Docker build + push, deploys whatever's already in the
|
||||
registry at the named tag. Use this when migrating between clusters to
|
||||
guarantee both run identical bits.
|
||||
- Without flags it builds the api / worker / admin / web images from the
|
||||
local repo HEAD and pushes to `gitea.treytartt.com` first.
|
||||
- The script applies (in order): namespace, network-policies (13 of them),
|
||||
redis, ingress, then runs the goose migration Job (blocking on success),
|
||||
then api / worker / admin / web Deployments, then observability
|
||||
(kube-state-metrics, vmagent, alloy-logs).
|
||||
- It does NOT apply: `rbac.yaml`, `pod-disruption-budgets.yaml`,
|
||||
`traefik-helmchartconfig.yaml`, `kyverno-verify-images.yaml`. The first
|
||||
two must be applied manually (see §6.7); the latter two are Hetzner-only
|
||||
or operator-gated.
|
||||
- It does NOT apply: anything under `kratos/` (skipped until
|
||||
`kratos-secrets` exists, which requires real OIDC client IDs).
|
||||
|
||||
### 6.10 Verify
|
||||
|
||||
```sh
|
||||
KUBECONFIG=$(pwd)/deploy-k3s/kubeconfig ./deploy-k3s/scripts/04-verify.sh
|
||||
```
|
||||
|
||||
Expect: all deployments `READY=desired`, 13 NetworkPolicies, 7 ServiceAccounts
|
||||
(api, worker, admin, web, redis, vmagent, alloy-logs), 3 PDBs, cloudflare-only
|
||||
middleware present, in-cluster `/api/health/` returns 200.
|
||||
|
||||
External smoke test (DNS-aware, but the api `/api/health/` route is exempt from
|
||||
the cloudflare-only middleware so direct-IP works for diagnostics):
|
||||
|
||||
```sh
|
||||
for IP in 51.81.83.33 51.81.87.86 51.81.85.248; do
|
||||
curl -s -o /dev/null -w "$IP -> %{http_code}\n" \
|
||||
-H 'Host: api.myhoneydue.com' http://$IP/api/health/
|
||||
done
|
||||
# All three should return 200.
|
||||
```
|
||||
|
||||
### 6.11 DNS cutover (if migrating)
|
||||
|
||||
In the Cloudflare dashboard for `myhoneydue.com`, set the 4 hostnames in §4 to
|
||||
the OVH IPs and keep them proxied. Effective propagation ~30 s to 5 min through
|
||||
the Cloudflare proxy.
|
||||
|
||||
If you have a previous cluster, **scale its worker to 0 before flipping** to
|
||||
avoid scheduled-job double-fires:
|
||||
|
||||
```sh
|
||||
KUBECONFIG=<previous> kubectl -n honeydue scale deploy/worker --replicas=0
|
||||
# (cut DNS)
|
||||
KUBECONFIG=<new> kubectl -n honeydue scale deploy/worker --replicas=1
|
||||
```
|
||||
|
||||
Run the last two steps (the DNS cut, then the scale-up on the new cluster) back-to-back. Worker work is mostly scheduled
|
||||
(hourly+), so a brief gap is harmless; overlap would cause duplicate emails.
|
||||
|
||||
---
|
||||
|
||||
## 7. Day-to-day operations
|
||||
|
||||
### Common kubectl one-liners
|
||||
|
||||
```sh
|
||||
export KUBECONFIG=$(pwd)/deploy-k3s/kubeconfig
|
||||
|
||||
# Cluster state
|
||||
kubectl get nodes -o wide
|
||||
kubectl -n honeydue get pods
|
||||
kubectl -n honeydue get deploy
|
||||
kubectl top nodes
|
||||
kubectl -n honeydue top pods
|
||||
|
||||
# Tail logs
|
||||
kubectl -n honeydue logs deploy/api -f --tail=50
|
||||
kubectl -n honeydue logs -l app.kubernetes.io/name=api -f --tail=20
|
||||
stern -n honeydue api # if stern is installed (multi-pod)
|
||||
|
||||
# Restart a deployment (no image change, picks up ConfigMap changes)
|
||||
kubectl -n honeydue rollout restart deploy/api
|
||||
|
||||
# Rollback one revision
|
||||
kubectl -n honeydue rollout undo deploy/api
|
||||
|
||||
# Scale (worker MUST stay at 0 or 1)
|
||||
kubectl -n honeydue scale deploy/api --replicas=4
|
||||
|
||||
# Get into a pod
|
||||
kubectl -n honeydue exec -it deploy/api -- sh
|
||||
```
|
||||
|
||||
### Redeploy after code changes
|
||||
|
||||
```sh
|
||||
KUBECONFIG=$(pwd)/deploy-k3s/kubeconfig ./deploy-k3s/scripts/03-deploy.sh
|
||||
```
|
||||
|
||||
Builds images from local HEAD, tags with the git short SHA, pushes to Gitea,
|
||||
runs `goose up` (idempotent), rolls api/worker/admin/web. Total: ~3-5 min
|
||||
when images change.
|
||||
|
||||
To deploy without rebuilding (pin to a specific tag):
|
||||
|
||||
```sh
|
||||
./deploy-k3s/scripts/03-deploy.sh --skip-build --tag <tag-or-latest>
|
||||
```
|
||||
|
||||
### Migrations
|
||||
|
||||
Goose migrations live in `migrations/`. New file pattern:
|
||||
|
||||
```
|
||||
make migrate-new name=add_foo_column # generates migrations/YYYYMMDDHHMMSS_add_foo_column.sql
|
||||
# Edit the file with -- +goose Up / -- +goose Down sections
|
||||
```
|
||||
|
||||
`03-deploy.sh` runs a one-shot Job (`manifests/migrate/job.yaml`) that
|
||||
executes `goose up` against Neon (direct compute endpoint, not pooler — see
|
||||
file comment). The Job blocks api/worker rollout and aborts the deploy on
|
||||
failure. No app pod runs `AutoMigrate`; api/worker startup verifies
|
||||
`goose_db_version` is current and refuses to boot on mismatch.
|
||||
|
||||
### Grafana
|
||||
|
||||
URL: https://grafana.88oakapps.com (creds in `deploy/prod.env`)
|
||||
|
||||
Three dashboards in the `honeyDue` folder:
|
||||
|
||||
| UID | Title | Use |
|
||||
|---|---|---|
|
||||
| `honeydue-eli5-overview` | honeyDue — Overview (ELI5) | Single-screen at-a-glance health: pods up, crashes, errors, RPS, latency, Postgres, memory, top endpoints, push failures, worker activity, recent error logs. Created 2026-06-03. |
|
||||
| `honeydue-red` | honeyDue API — RED | Rate/Errors/Duration cuts (legacy) |
|
||||
| `honeydue-logs` | honeyDue — Production Logs | Live log explorer |
|
||||
|
||||
For the ELI5 dashboard's queries, **api-side metrics use `service="api"`,
|
||||
NOT `namespace="honeydue"`.** vmagent's scrape config drops the namespace
|
||||
label from api metrics — only `service`, `pod`, `node`, `job`, plus the
|
||||
metric's own labels (route, method, status, etc.) survive. Queries that
|
||||
filter on `namespace="honeydue"` for api metrics silently match nothing.
|
||||
|
||||
### kubectl tunnel (if 6443 is firewalled to your IP)
|
||||
|
||||
Currently `6443` is open WAN-side (matching the previous Hetzner posture).
|
||||
If you tighten that to operator-IPs-only and your IP changes, use an SSH
|
||||
tunnel:
|
||||
|
||||
```sh
|
||||
ssh -fN -o ExitOnForwardFailure=yes -o ServerAliveInterval=30 \
|
||||
-i ~/.ssh/ovhcloud \
|
||||
-L 127.0.0.1:6443:127.0.0.1:6443 \
|
||||
ubuntu@51.81.83.33
|
||||
|
||||
cp deploy-k3s/kubeconfig deploy-k3s/kubeconfig.tunnel
|
||||
sed -i.bak 's|https://51.81.83.33:6443|https://127.0.0.1:6443|' deploy-k3s/kubeconfig.tunnel
|
||||
export KUBECONFIG="$(pwd)/deploy-k3s/kubeconfig.tunnel"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 8. Disaster recovery
|
||||
|
||||
### "I lost the kubeconfig"
|
||||
|
||||
```sh
|
||||
ssh ovhcloud1 'sudo cat /etc/rancher/k3s/k3s.yaml' \
|
||||
| sed 's|server: https://127.0.0.1:6443|server: https://51.81.83.33:6443|' \
|
||||
> deploy-k3s/kubeconfig
|
||||
chmod 600 deploy-k3s/kubeconfig
|
||||
```
|
||||
|
||||
If `ovhcloud1` is down but `ovhcloud2` or `ovhcloud3` is up, substitute that host and its IP in the command above — the
|
||||
TLS SAN covers all three.
|
||||
|
||||
### "A node is unresponsive"
|
||||
|
||||
```sh
|
||||
kubectl drain vps-XXX --ignore-daemonsets --delete-emptydir-data
|
||||
# Reboot via OVH manager or:
|
||||
ssh ovhcloudN sudo reboot
|
||||
# Wait for Ready, then:
|
||||
kubectl uncordon vps-XXX
|
||||
```
|
||||
|
||||
The cluster tolerates 1 node down (etcd quorum 2/3). With 2 down, etcd
|
||||
loses quorum and the API server stops accepting writes.
|
||||
|
||||
### "etcd quorum lost (2+ nodes dead)"
|
||||
|
||||
Bring nodes back online if possible. If not, stop the `k3s` service on `ovhcloud1` and restore from the most recent etcd snapshot:
|
||||
|
||||
```sh
|
||||
ssh ovhcloud1 'sudo k3s server --cluster-reset --cluster-reset-restore-path=/var/lib/rancher/k3s/server/db/snapshots/<latest>'
|
||||
```
|
||||
|
||||
k3s takes automatic etcd snapshots every 12h, keeping 5. List with:
|
||||
|
||||
```sh
|
||||
ssh ovhcloud1 sudo ls -la /var/lib/rancher/k3s/server/db/snapshots/
|
||||
```
|
||||
|
||||
This is destructive — workload state since the snapshot is lost, but Neon
|
||||
(actual app data) is unaffected.
|
||||
|
||||
### "I have to rebuild the whole cluster from scratch"
|
||||
|
||||
Provision 3 fresh boxes, then follow exactly the sequence in §6. End-to-end is
|
||||
~30 min. The dependencies that make this possible:
|
||||
|
||||
| Stays put through rebuild | Where |
|
||||
|---|---|
|
||||
| Application data | Neon Postgres (managed) |
|
||||
| User uploads | Backblaze B2 (managed) |
|
||||
| Container images | `gitea.treytartt.com` (self-hosted, but not on the OVH cluster) |
|
||||
| Operator secrets | `deploy-k3s/secrets/` + `config.yaml` + `deploy/prod.env` on the operator workstation (gitignored) |
|
||||
| DNS | Cloudflare control panel |
|
||||
|
||||
If `gitea.treytartt.com` were hosted on this same OVH cluster, you would have a circular
|
||||
dependency — rebuilding requires images you can't pull until the cluster is
|
||||
up. Currently Gitea is NOT in the honeyDue cluster (separate Hetzner-era
|
||||
host), so this isn't a problem today, but worth flagging if that ever
|
||||
changes.
|
||||
|
||||
### "Cutover back to Hetzner / failover to a backup cluster"
|
||||
|
||||
There is **no warm standby today.** Bringing up a second cluster is the
|
||||
same §6 procedure on different hardware, then a Cloudflare DNS swap. The
|
||||
worker-swap dance is critical:
|
||||
|
||||
```sh
|
||||
KUBECONFIG=<current> kubectl -n honeydue scale deploy/worker --replicas=0
|
||||
# (Update Cloudflare DNS to new cluster's IPs — proxied)
|
||||
KUBECONFIG=<new> kubectl -n honeydue scale deploy/worker --replicas=1
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 9. Known gotchas
|
||||
|
||||
### 9.1 First-deploy "0 up-to-date" across all Deployments
|
||||
|
||||
**Symptoms:** `kubectl get deploy` shows `READY 0/N, UP-TO-DATE 0` for
|
||||
api/worker/admin/web/redis. `kubectl get events` shows
|
||||
`FailedCreate: error looking up service account honeydue/<name>: serviceaccount "..." not found`.
|
||||
|
||||
**Cause:** `rbac.yaml` (ServiceAccounts) is NOT applied by `03-deploy.sh`. On
|
||||
a fresh cluster the SAs don't exist; the ReplicaSet controller can't create
|
||||
pods.
|
||||
|
||||
**Fix:**
|
||||
|
||||
```sh
|
||||
kubectl apply -f deploy-k3s/manifests/rbac.yaml
|
||||
kubectl -n honeydue rollout restart deploy/api deploy/worker deploy/admin deploy/web deploy/redis
|
||||
```
|
||||
|
||||
This was hit during the 2026-06-03 OVH bootstrap. The permanent fix is to add
|
||||
`kubectl apply -f rbac.yaml` to `03-deploy.sh` between the namespace and
|
||||
network-policies apply, but until that lands, follow §6.7 on every fresh
|
||||
cluster.
|
||||
|
||||
### 9.2 vmagent SD broken on fresh deploy ("0 pods up" in Grafana)
|
||||
|
||||
**Symptoms:**
|
||||
- Grafana panels using `kube_*` metrics or `up{job=...}` show 0
|
||||
- vmagent logs: `dial tcp 10.43.0.1:443: connect: connection refused` every ~30 s
|
||||
- Direct test from a pod also refused
|
||||
|
||||
**Cause:** k3s's NetworkPolicy controller evaluates egress rules *after*
|
||||
kube-proxy's DNAT (not before, contrary to spec). Pod-to-`kubernetes`-Service
|
||||
(`10.43.0.1:443`) gets DNAT'd to `<node_ip>:6443`, *then* the policy check
|
||||
runs. Without an explicit egress rule for `:6443`, the packet is rejected.
|
||||
|
||||
The `allow-egress-from-vmagent` NetPol in `network-policies.yaml` includes
|
||||
both rules:
|
||||
|
||||
```yaml
|
||||
- to:
|
||||
- ipBlock: { cidr: 10.43.0.0/16 }
|
||||
ports:
|
||||
- { port: 443, protocol: TCP }
|
||||
- to:
|
||||
- ipBlock:
|
||||
cidr: 0.0.0.0/0
|
||||
except: [10.42.0.0/16]
|
||||
ports:
|
||||
- { port: 6443, protocol: TCP }
|
||||
```
|
||||
|
||||
**If this happens:** confirm `network-policies.yaml` was applied:
|
||||
|
||||
```sh
|
||||
kubectl -n honeydue get netpol allow-egress-from-vmagent -o yaml | grep -A 5 6443
|
||||
```
|
||||
|
||||
Counter-evidence that confirms diagnosis: `kube-state-metrics` in
|
||||
`kube-system` works fine (no NetPols in that namespace).
|
||||
|
||||
### 9.3 vmagent appears healthy but no data in Grafana
|
||||
|
||||
vmagent's `/-/healthy` returns 200 as long as the process is alive and
|
||||
remote-write is TCP-functional. It doesn't check that scrapes are actually
|
||||
*succeeding*. The liveness probe in `vmagent.yaml` queries `/api/v1/targets`
|
||||
and fails the pod if no target is `up`. After ~3 failures (~3 min), kubelet
|
||||
recycles it.
|
||||
|
||||
If vmagent runs for weeks but Grafana is empty, the probe was disabled or
|
||||
the exec command broke.
|
||||
|
||||
### 9.4 vmagent bearer token destroyed by direct `kubectl apply`
|
||||
|
||||
The committed `vmagent.yaml` has `bearer_token: TOKEN_PLACEHOLDER`. The real
|
||||
token is `sed`-substituted at deploy time by `03-deploy.sh`. Applying the
|
||||
file directly:
|
||||
|
||||
```sh
|
||||
kubectl apply -f deploy-k3s/manifests/observability/vmagent.yaml # WRONG
|
||||
```
|
||||
|
||||
overwrites the Secret with the literal `TOKEN_PLACEHOLDER`, after which every
|
||||
remote-write request fails with HTTP 401. To restore without a full redeploy:
|
||||
|
||||
```sh
|
||||
OBS_TOKEN_B64=$(kubectl -n honeydue get secret honeydue-secrets \
|
||||
-o jsonpath='{.data.OBS_INGEST_TOKEN}')
|
||||
kubectl -n honeydue patch secret vmagent-remote-write --type=json \
|
||||
-p="[{\"op\":\"replace\",\"path\":\"/data/bearer_token\",\"value\":\"${OBS_TOKEN_B64}\"}]"
|
||||
kubectl -n honeydue rollout restart deploy/vmagent
|
||||
```
|
||||
|
||||
Or just re-run `./deploy-k3s/scripts/03-deploy.sh` — the sed handles it.
|
||||
|
||||
### 9.5 Dashboard queries: api metrics need `service="api"` not `namespace="honeydue"`
|
||||
|
||||
vmagent's scrape config (`vmagent-config` ConfigMap) explicitly chooses which
|
||||
Kubernetes pod-metadata labels to copy onto each scraped series. **Namespace
|
||||
isn't one of them.** Labels you can use on api-side metrics:
|
||||
|
||||
- `service` (literal `"api"`)
|
||||
- `job` (literal `"api"`)
|
||||
- `pod` (the api pod name)
|
||||
- `node` (the k8s node name)
|
||||
- `cluster` (vmagent external_label, currently `"honeydue-k3s"`)
|
||||
- `environment` (vmagent external_label, currently `"prod"`)
|
||||
- Plus each metric's own labels (`method`, `route`, `status` for HTTP; etc.)
|
||||
|
||||
`kube_*` metrics from kube-state-metrics DO carry `namespace` natively
|
||||
(KSM publishes it as a label, vmagent passes it through). Loki streams have
|
||||
`namespace` because alloy-logs explicitly relabels it. So the rule is:
|
||||
|
||||
| Metric prefix | Use |
|
||||
|---|---|
|
||||
| `kube_*` | `namespace="honeydue"` |
|
||||
| `http_*`, `gorm_*`, `go_*`, `process_*` (api) | `service="api"` |
|
||||
| Loki logs `{...}` | `namespace="honeydue"` |
|
||||
|
||||
### 9.6 Cluster-label collision when two clusters run together
|
||||
|
||||
Both Hetzner and OVH vmagents push as `cluster=honeydue-k3s, environment=prod`
|
||||
(same external_labels). During the migration overlap this made dashboards
|
||||
sum both clusters' data. The simplest narrowing during overlap is by node
|
||||
name pattern (`node=~"vps-.*"` for OVH, `node=~"ubuntu-.*"` for Hetzner). If
|
||||
you ever bring up a backup cluster long-term, change one cluster's
|
||||
`external_labels.cluster` to something distinct (e.g. `honeydue-ovh`
|
||||
vs. `honeydue-backup`).
|
||||
|
||||
### 9.7 Worker double-firing scheduled jobs
|
||||
|
||||
If two `worker` Deployments run concurrently (e.g. two clusters both pointing
|
||||
at the same Neon DB), Asynq schedulers each fire crons independently — users
|
||||
get duplicate emails. Workaround: scale all-but-one worker to 0. This is the
|
||||
exact mechanic used during cutovers (§6.11).
|
||||
|
||||
### 9.8 Node kubeconfig mode
|
||||
|
||||
`/etc/rancher/k3s/k3s.yaml` on each node is mode `0600` because we install
|
||||
with `--write-kubeconfig-mode=0600`. Tightening from k3s default (0644) was
|
||||
intentional. Don't change without coordinating — any tooling on the node
|
||||
that expects to read it (none today) will break.
|
||||
|
||||
---
|
||||
|
||||
## 10. Differences from MIGRATION_NOTES.md (Hetzner-era)
|
||||
|
||||
`MIGRATION_NOTES.md` documents the Swarm → k3s migration on Hetzner
|
||||
(2026-04-24). Most of it still applies, with these OVH-specific deltas:
|
||||
|
||||
| What MIGRATION_NOTES says | What OVH actually has |
|
||||
|---|---|
|
||||
| `hetzner-k3s` provisioner | Manual k3s install (§6) |
|
||||
| Hetzner Load Balancer (not used) → Cloudflare round-robin | Same — Cloudflare round-robin (§4) |
|
||||
| Traefik as DaemonSet + hostNetwork via HelmChartConfig | Traefik default Deployment + klipper-lb svclb DaemonSet. The `traefik-helmchartconfig.yaml` file is **NOT applied** on OVH. |
|
||||
| `servicelb` disabled (`--disable=servicelb`) | `servicelb` enabled (we didn't pass `--disable=servicelb`). This is what makes klipper-lb work. |
|
||||
| sysctl `net.ipv4.ip_unprivileged_port_start=0` for hostNetwork Traefik | Not needed — klipper-lb proxies the port binding instead |
|
||||
| UFW rules between 3 Hetzner IPs | UFW rules between 3 OVH IPs (51.81.83.33, 51.81.87.86, 51.81.85.248) |
|
||||
| Kubeconfig at `~/.kube/honeydue-k3s.yaml` | Kubeconfig at `deploy-k3s/kubeconfig` |
|
||||
| TLS at origin: not configured (CF Flexible) | Same — CF Flexible. `cloudflare-origin-cert` Secret exists (carried over) but Ingress doesn't reference it. |
|
||||
|
||||
---
|
||||
|
||||
## 11. Outstanding follow-ups (deferred, not blocking)
|
||||
|
||||
1. **No warm standby / rollback cluster.** OVH is solo production. An OVH
|
||||
outage is a real outage; recovery time = §6 procedure (~30 min). User
|
||||
plans to bring a second cluster up as a target.
|
||||
2. **UFW allows 80/443 from world.** Hetzner had a network-layer Cloudflare-IP
|
||||
allowlist on these ports. OVH currently relies on the L7
|
||||
`cloudflare-only` Traefik middleware, which protects admin but NOT api /
|
||||
web / apex (those routes have to be reachable from anywhere, but they're
|
||||
then trivially DDoSable by hitting the origin IPs directly, bypassing Cloudflare). Fix: add ufw allow rules
|
||||
restricting `80/tcp` and `443/tcp` to Cloudflare's published IP ranges
|
||||
(~22 IPv4 prefixes from https://www.cloudflare.com/ips-v4/).
|
||||
3. **Cloudflare TLS Flexible → Full(strict).** Origin certs exist as Secret
|
||||
but Ingress doesn't terminate TLS. Upgrading to Full(strict) requires
|
||||
Traefik configured with the cert + an HTTPS entrypoint + Ingress
|
||||
`tls:` block.
|
||||
4. **`rbac.yaml` + `pod-disruption-budgets.yaml` should be in `03-deploy.sh`.**
|
||||
They're currently bootstrap-only. Adding them is idempotent and prevents
|
||||
the §9.1 footgun.
|
||||
5. **Push notification metrics are log-derived, not counters.** Successes
|
||||
aren't logged or counted. Proper Prometheus instrumentation (~15 lines in
|
||||
`internal/push/client.go`) would give a real success/failure ratio.
|
||||
6. **Worker has no `/metrics` endpoint.** `cmd/worker/main.go` serves `:6060`
|
||||
for healthz only. Adding Asynq's `metrics.NewPrometheusExporter()` + a
|
||||
ServiceMonitor + uncommenting the `worker` job stanza in
|
||||
`vmagent-config` ConfigMap would give real queue depth and job latency.
|
||||
7. **Ory Kratos.** Manifests exist (`manifests/kratos/`) but the deploy
|
||||
is gated on operator-side prerequisites (Neon `kratos` database,
|
||||
`auth.myhoneydue.com` DNS, real Apple+Google OIDC clients, Kratos image
|
||||
tag pinned). Until `kratos-secrets` exists, `03-deploy.sh` silently
|
||||
skips the Kratos apply.
|
||||
8. **Hetzner cluster fully retired?** `config.yaml` `nodes:` block describes
|
||||
OVH; the bak kubeconfig is at `kubeconfig.hetzner.bak`. Boxes themselves
|
||||
are operator-managed.
|
||||
|
||||
### 11.1 Dashboard observability gaps (raised 2026-06-03 during dashboard build)
|
||||
|
||||
Surfaced while building the `honeydue-eli5-overview` Grafana dashboard. Each
|
||||
needs code or infra changes to expose; none blocks today's operations.
|
||||
|
||||
9. **node-exporter not deployed.** No node-level metrics today
|
||||
(`node_filesystem_avail_bytes`, `node_memory_*`, `node_load1`, etc.).
|
||||
The dashboard's pod-level memory/CPU panels are app-process only — a
|
||||
node running out of disk would silently fail the cluster before any
|
||||
dashboard signal showed it. Highest-priority Tier-3 item. Fix: deploy
|
||||
`node-exporter` as a DaemonSet (~50 lines of YAML), add a scrape stanza
|
||||
to `vmagent-config`, add a `Node disk free` stat panel.
|
||||
10. **Traefik metrics not enabled.** Traefik can expose `/metrics` with
|
||||
`traefik_entrypoint_requests_total` + `traefik_service_request_duration_seconds`,
|
||||
giving edge-level visibility into requests that never reached api
|
||||
pods (404s, redirects, middleware blocks). Enable via a
|
||||
HelmChartConfig override that sets `metrics.prometheus.entryPoint=metrics`
|
||||
+ adds a `:9100` entryPoint + a scrape stanza. Skipped today to avoid
|
||||
Traefik restart risk; safe additive change when ready.
|
||||
11. **Push notification success/failure counters** (already #5). Add
|
||||
`prometheus.NewCounterVec` in `internal/push/client.go` with labels
|
||||
`platform={ios,android}, outcome={success,failed,breaker_open,disabled}`.
|
||||
Increments at every Send/SendActionable branch. Replaces the
|
||||
log-derived "Push failures" stat on the dashboard with a real success
|
||||
rate.
|
||||
12. **Worker queue / job metrics** (already #6). Asynq has a built-in
|
||||
Prometheus exporter (`asynq/x/metrics`). Wire it into the worker's
|
||||
`:6060` health server (a single `healthMux.Handle` line) and
|
||||
uncomment the worker scrape stanza in `vmagent-config`. Surfaces
|
||||
queue depth, retry count, processing time per task type.
|
||||
13. **Cache hit / miss rate.** `internal/services/cache_service.go` has
|
||||
no counters. Add a Counter with labels `{operation=get|set, result=hit|miss}`
|
||||
around the cache wrapper. ~10 lines. Useful once real traffic flows
|
||||
to verify the ETag and Redis caches are paying their keep.
|
||||
14. **APNs send-latency histogram.** Wrap `internal/push/apns.go::Send`
|
||||
in a `prometheus.NewHistogramVec` keyed on outcome. Tells you when
|
||||
Apple's gateway is slow (which correlates with their incident page).
|
||||
|
||||
---
|
||||
|
||||
## 12. Audit trail
|
||||
|
||||
| Date | Change |
|
||||
|---|---|
|
||||
| 2026-04-24 | Initial k3s cluster on Hetzner (Swarm → k3s migration) — see MIGRATION_NOTES.md |
|
||||
| 2026-04-25 | `config.yaml` reconstructed from live ConfigMap (original file lost) |
|
||||
| 2026-05-15 | Audit fixes: Redis auth required, admin basic auth, secrets-encryption flag |
|
||||
| 2026-05-16 | `02-setup-secrets.sh` started carrying B2 credentials (was a manifest/script drift) |
|
||||
| 2026-06-02 | Kratos scaffolding committed (not deployed) |
|
||||
| 2026-06-03 | **Hetzner → OVH BHS cutover.** New 3-node cluster on 51.81.83.33, .87.86, .85.248. DNS cut on Cloudflare. Hetzner kubeconfig moved to `.bak`. Grafana `honeydue-eli5-overview` dashboard created. Hetzner cluster powered off later same day. |
|
||||
| 2026-06-03 | Dashboard build-out: extended `honeydue-eli5-overview` to 22 panels covering Tier-1 (HTTP status, CPU per pod, goroutines, top slow) and Tier-2 (GC, network I/O, pod uptime, top 5xx) signals. Surfaced Tier-3 instrumentation gaps in §11.1. |
|
||||
+896
-676
File diff suppressed because it is too large
Load Diff
@@ -30,6 +30,7 @@ load_balancer_ip: ""
|
||||
domains:
|
||||
api: api.myhoneydue.com
|
||||
admin: admin.myhoneydue.com
|
||||
app: app.myhoneydue.com # web client host — added to CORS_ALLOWED_ORIGINS
|
||||
base: myhoneydue.com
|
||||
|
||||
# --- Container Registry (GHCR) ---
|
||||
@@ -62,7 +63,7 @@ email:
|
||||
push:
|
||||
apns_key_id: ""
|
||||
apns_team_id: ""
|
||||
apns_topic: com.tt.honeyDue
|
||||
apns_topic: com.myhoneydue.honeyDue
|
||||
apns_production: true
|
||||
apns_use_sandbox: false
|
||||
|
||||
@@ -72,8 +73,13 @@ storage:
|
||||
b2_app_key: ""
|
||||
b2_bucket: ""
|
||||
b2_endpoint: "" # e.g. s3.us-west-004.backblazeb2.com
|
||||
b2_region: "" # e.g. us-east-005
|
||||
b2_use_ssl: true
|
||||
max_file_size: 10485760
|
||||
allowed_types: "image/jpeg,image/png,image/gif,image/webp,application/pdf"
|
||||
upload_dir: /app/uploads # filesystem path inside the api container
|
||||
base_url: /uploads # public URL prefix served by the api
|
||||
static_dir: /app/static # static asset path inside the api container
|
||||
|
||||
# --- Worker Schedules (UTC hours) ---
|
||||
worker:
|
||||
@@ -100,8 +106,10 @@ admin:
|
||||
basic_auth_password: "" # HTTP basic auth password for admin panel
|
||||
|
||||
# --- Apple Auth / IAP (optional, leave empty if unused) ---
|
||||
# client_id MUST equal the iOS Release bundle ID — Apple identity tokens
|
||||
# are rejected if the `aud` claim doesn't match.
|
||||
apple_auth:
|
||||
client_id: ""
|
||||
client_id: "com.myhoneydue.honeyDue"
|
||||
team_id: ""
|
||||
iap_key_id: ""
|
||||
iap_issuer_id: ""
|
||||
|
||||
@@ -23,8 +23,11 @@ spec:
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
spec:
|
||||
serviceAccountName: admin
|
||||
# Explicit pod-level opt-out (audit F11) — defense-in-depth on top of
|
||||
# the ServiceAccount-level setting in rbac.yaml.
|
||||
automountServiceAccountToken: false
|
||||
imagePullSecrets:
|
||||
- name: ghcr-credentials
|
||||
- name: gitea-credentials
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 1001
|
||||
@@ -35,6 +38,7 @@ spec:
|
||||
containers:
|
||||
- name: admin
|
||||
image: IMAGE_PLACEHOLDER # Replaced by 03-deploy.sh
|
||||
imagePullPolicy: IfNotPresent # audit CODE-L4 — explicit; images are SHA/digest-pinned
|
||||
ports:
|
||||
- containerPort: 3000
|
||||
protocol: TCP
|
||||
|
||||
@@ -23,8 +23,11 @@ spec:
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
spec:
|
||||
serviceAccountName: api
|
||||
# Explicit pod-level opt-out (audit F11) — defense-in-depth on top of
|
||||
# the ServiceAccount-level setting in rbac.yaml.
|
||||
automountServiceAccountToken: false
|
||||
imagePullSecrets:
|
||||
- name: ghcr-credentials
|
||||
- name: gitea-credentials
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 1000
|
||||
@@ -35,6 +38,7 @@ spec:
|
||||
containers:
|
||||
- name: api
|
||||
image: IMAGE_PLACEHOLDER # Replaced by 03-deploy.sh
|
||||
imagePullPolicy: IfNotPresent # audit CODE-L4 — explicit; images are SHA/digest-pinned
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
protocol: TCP
|
||||
@@ -46,34 +50,16 @@ spec:
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: honeydue-config
|
||||
env:
|
||||
- name: POSTGRES_PASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: honeydue-secrets
|
||||
key: POSTGRES_PASSWORD
|
||||
- name: SECRET_KEY
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: honeydue-secrets
|
||||
key: SECRET_KEY
|
||||
- name: EMAIL_HOST_PASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: honeydue-secrets
|
||||
key: EMAIL_HOST_PASSWORD
|
||||
- name: FCM_SERVER_KEY
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: honeydue-secrets
|
||||
key: FCM_SERVER_KEY
|
||||
- name: REDIS_PASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: honeydue-secrets
|
||||
key: REDIS_PASSWORD
|
||||
optional: true
|
||||
# Audit CODE-F8: secrets are NOT injected as environment variables.
|
||||
# Env vars are readable for the life of the pod via /proc/<pid>/environ
|
||||
# and leak into crash dumps / child processes. honeydue-secrets is
|
||||
# mounted read-only at /etc/honeydue/secrets (mode 0400) and the Go
|
||||
# config layer (config.loadFileSecrets) reads each key from its file.
|
||||
# Non-secret config still arrives via the configMapRef above.
|
||||
volumeMounts:
|
||||
- name: app-secrets
|
||||
mountPath: /etc/honeydue/secrets
|
||||
readOnly: true
|
||||
- name: apns-key
|
||||
mountPath: /secrets/apns
|
||||
readOnly: true
|
||||
@@ -90,11 +76,12 @@ spec:
|
||||
httpGet:
|
||||
path: /api/health/
|
||||
port: 8000
|
||||
# MigrateWithLock in cmd/api/main.go runs pg_advisory_lock on
|
||||
# every startup. On a cold boot with 3 replicas, the first does
|
||||
# AutoMigrate (~90s) and the others wait on the lock, so real
|
||||
# startup runs 90–240s. 48 × 5s = 240s grace absorbs it without
|
||||
# healthcheck killing a still-starting replica.
|
||||
# Schema migrations run separately in the honeydue-migrate Job
|
||||
# *before* this Deployment rolls — the api itself does not migrate
|
||||
# (it only verifies goose_db_version at boot). Cold start still
|
||||
# pays the DB pool warm-up + Redis connect + APNs/FCM client init
|
||||
# before /api/health/ goes green. 48 × 5s = 240s grace keeps the
|
||||
# probe from killing a still-starting replica.
|
||||
failureThreshold: 48
|
||||
periodSeconds: 5
|
||||
readinessProbe:
|
||||
@@ -112,6 +99,12 @@ spec:
|
||||
periodSeconds: 30
|
||||
timeoutSeconds: 10
|
||||
volumes:
|
||||
# Audit CODE-F8: the whole honeydue-secrets Secret, projected as files.
|
||||
# defaultMode 0400 → readable only by the container's runAsUser (1000).
|
||||
- name: app-secrets
|
||||
secret:
|
||||
secretName: honeydue-secrets
|
||||
defaultMode: 0400
|
||||
- name: apns-key
|
||||
secret:
|
||||
secretName: honeydue-apns-key
|
||||
|
||||
@@ -0,0 +1,57 @@
|
||||
# B2 bucket lifecycle — `uploads/` prefix
|
||||
|
||||
The `pending_uploads` cleanup worker (cron `30 * * * *`, see
|
||||
`internal/worker/jobs/handler.go::HandleUploadCleanup`) reaps unclaimed
|
||||
upload sessions every hour, deleting both the row and the corresponding B2
|
||||
object. This bucket-level lifecycle rule is a **backstop** — it catches B2
|
||||
objects that survive the row deletion (e.g. worker crashed mid-loop, B2
|
||||
delete errored, manual DB tampering).
|
||||
|
||||
## Rule
|
||||
|
||||
Apply via the Backblaze web console: **Bucket → `honeyDueProd` → Lifecycle Settings → Custom**
|
||||
|
||||
```json
|
||||
[
|
||||
{
|
||||
"fileNamePrefix": "uploads/",
|
||||
"daysFromUploadingToHiding": 7,
|
||||
"daysFromHidingToDeleting": 1
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
Effect: any object under the `uploads/` prefix is hidden 7 days after
|
||||
upload, then permanently deleted 1 day after that. Total maximum lifetime
|
||||
of an orphaned object: 8 days.
|
||||
|
||||
This rule does NOT affect:
|
||||
|
||||
- `images/`, `documents/`, `completions/` — legacy multipart-uploaded
|
||||
objects, which are managed by the existing `task_completion_image` /
|
||||
`document_image` / `document.file_url` references.
|
||||
|
||||
## Why a backstop, not the primary mechanism
|
||||
|
||||
The application worker is the primary mechanism because:
|
||||
|
||||
1. It can delete the **DB row** alongside the B2 object — lifecycle alone
|
||||
would leave dangling `pending_uploads` rows.
|
||||
2. It runs hourly vs. lifecycle's once-per-day evaluation — much tighter
|
||||
recovery window for the common case.
|
||||
3. It produces logs / metrics for orphan rate observability.
|
||||
|
||||
## Verification
|
||||
|
||||
After applying:
|
||||
|
||||
```bash
|
||||
b2 bucket get-info honeyDueProd | jq '.lifecycleRules'
|
||||
```
|
||||
|
||||
Should show the rule above. If you don't have the B2 CLI:
|
||||
|
||||
```bash
|
||||
curl -u "$B2_KEY_ID:$B2_APP_KEY" https://api.backblazeb2.com/b2api/v3/b2_authorize_account
|
||||
# Then use the returned authorization_token + apiUrl to call b2_get_bucket
|
||||
```
|
||||
@@ -53,7 +53,12 @@ metadata:
|
||||
labels:
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
annotations:
|
||||
traefik.ingress.kubernetes.io/router.middlewares: honeydue-security-headers@kubernetescrd,honeydue-rate-limit@kubernetescrd
|
||||
# cloudflare-only + admin-auth wired in (audit F2/F3/CODE-L6). Order
|
||||
# matters: reject non-Cloudflare IPs, then basic auth, then headers,
|
||||
# then rate limit. The admin-basic-auth secret is created by
|
||||
# 02-setup-secrets.sh from config.yaml admin.basic_auth_* — that runs
|
||||
# before 03-deploy.sh, so the middleware always has its secret.
|
||||
traefik.ingress.kubernetes.io/router.middlewares: honeydue-cloudflare-only@kubernetescrd,honeydue-admin-auth@kubernetescrd,honeydue-security-headers@kubernetescrd,honeydue-rate-limit@kubernetescrd
|
||||
spec:
|
||||
ingressClassName: traefik
|
||||
tls:
|
||||
@@ -98,3 +103,98 @@ spec:
|
||||
name: web
|
||||
port:
|
||||
number: 3000
|
||||
---
|
||||
# Auth-endpoint Ingress (audit F10 / LIVE-L12). A dedicated Ingress for the
|
||||
# auth paths so Traefik gives their longer path-prefix routers a higher
|
||||
# priority than honeydue-api's "/" router — these paths then get
|
||||
# auth-rate-limit (5/min) instead of the general rate-limit (100/min).
|
||||
# Anything not matched here falls through to honeydue-api unchanged.
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: Ingress
|
||||
metadata:
|
||||
name: honeydue-api-auth
|
||||
namespace: honeydue
|
||||
labels:
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
annotations:
|
||||
traefik.ingress.kubernetes.io/router.middlewares: honeydue-auth-rate-limit@kubernetescrd,honeydue-security-headers@kubernetescrd
|
||||
spec:
|
||||
ingressClassName: traefik
|
||||
tls:
|
||||
- hosts:
|
||||
- api.myhoneydue.com
|
||||
secretName: cloudflare-origin-cert
|
||||
rules:
|
||||
- host: api.myhoneydue.com
|
||||
http:
|
||||
paths:
|
||||
- path: /api/auth/login
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: api
|
||||
port:
|
||||
number: 8000
|
||||
- path: /api/auth/register
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: api
|
||||
port:
|
||||
number: 8000
|
||||
- path: /api/auth/forgot-password
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: api
|
||||
port:
|
||||
number: 8000
|
||||
- path: /api/auth/reset-password
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: api
|
||||
port:
|
||||
number: 8000
|
||||
- path: /api/residences/join-with-code
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: api
|
||||
port:
|
||||
number: 8000
|
||||
- path: /api/auth/verify-reset-code
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: api
|
||||
port:
|
||||
number: 8000
|
||||
- path: /api/auth/apple-sign-in
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: api
|
||||
port:
|
||||
number: 8000
|
||||
- path: /api/auth/google-sign-in
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: api
|
||||
port:
|
||||
number: 8000
|
||||
- path: /api/auth/refresh
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: api
|
||||
port:
|
||||
number: 8000
|
||||
- path: /api/auth/account
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: api
|
||||
port:
|
||||
number: 8000
|
||||
|
||||
@@ -1,54 +0,0 @@
|
||||
# API Ingress — Cloudflare-only + security headers + rate limiting
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: Ingress
|
||||
metadata:
|
||||
name: honeydue-api
|
||||
namespace: honeydue
|
||||
labels:
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
annotations:
|
||||
traefik.ingress.kubernetes.io/router.middlewares: honeydue-cloudflare-only@kubernetescrd,honeydue-security-headers@kubernetescrd,honeydue-rate-limit@kubernetescrd
|
||||
spec:
|
||||
tls:
|
||||
- hosts:
|
||||
- api.myhoneydue.com
|
||||
secretName: cloudflare-origin-cert
|
||||
rules:
|
||||
- host: api.myhoneydue.com
|
||||
http:
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: api
|
||||
port:
|
||||
number: 8000
|
||||
|
||||
---
|
||||
# Admin Ingress — Cloudflare-only + security headers + rate limiting + basic auth
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: Ingress
|
||||
metadata:
|
||||
name: honeydue-admin
|
||||
namespace: honeydue
|
||||
labels:
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
annotations:
|
||||
traefik.ingress.kubernetes.io/router.middlewares: honeydue-cloudflare-only@kubernetescrd,honeydue-security-headers@kubernetescrd,honeydue-rate-limit@kubernetescrd,honeydue-admin-auth@kubernetescrd
|
||||
spec:
|
||||
tls:
|
||||
- hosts:
|
||||
- admin.myhoneydue.com
|
||||
secretName: cloudflare-origin-cert
|
||||
rules:
|
||||
- host: admin.myhoneydue.com
|
||||
http:
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: admin
|
||||
port:
|
||||
number: 3000
|
||||
@@ -21,12 +21,20 @@ spec:
|
||||
headers:
|
||||
frameDeny: true
|
||||
contentTypeNosniff: true
|
||||
browserXssFilter: true
|
||||
# browserXssFilter removed (audit L7): it emits the deprecated
|
||||
# X-XSS-Protection header, which can itself introduce XSS in legacy
|
||||
# browsers. Modern browsers ignore it.
|
||||
referrerPolicy: "strict-origin-when-cross-origin"
|
||||
customResponseHeaders:
|
||||
X-Content-Type-Options: "nosniff"
|
||||
X-Frame-Options: "DENY"
|
||||
Strict-Transport-Security: "max-age=31536000; includeSubDomains"
|
||||
# HSTS: 2-year max-age + preload (audit L5/CODE-L3). After this is
|
||||
# live on api/admin/app, submit myhoneydue.com to hstspreload.org.
|
||||
Strict-Transport-Security: "max-age=63072000; includeSubDomains; preload"
|
||||
# Cross-origin isolation (audit F9). COEP (require-corp) is omitted —
|
||||
# it commonly breaks third-party embeds; add only after testing.
|
||||
Cross-Origin-Opener-Policy: "same-origin"
|
||||
Cross-Origin-Resource-Policy: "same-origin"
|
||||
# Content-Security-Policy is intentionally NOT set here — the Go API
|
||||
# sets a CSP in internal/router/router.go that permits Google Fonts
|
||||
# for the landing page. Two CSP headers would intersect and break it.
|
||||
@@ -83,3 +91,24 @@ spec:
|
||||
basicAuth:
|
||||
secret: admin-basic-auth
|
||||
realm: "honeyDue Admin"
|
||||
|
||||
---
|
||||
# Strict rate limit for auth endpoints (audit F10 / LIVE-L12).
|
||||
# Applied via the honeydue-api-auth Ingress to login / register /
|
||||
# forgot-password / reset-password / join-with-code (and the other paths listed in that Ingress). depth: 2 makes the
|
||||
# limiter key on the real client IP rather than the Cloudflare edge IP
|
||||
# (request path: client -> Cloudflare -> Traefik). This is the edge half;
|
||||
# the per-account lockout in the Go app is the robust half.
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: Middleware
|
||||
metadata:
|
||||
name: auth-rate-limit
|
||||
namespace: honeydue
|
||||
spec:
|
||||
rateLimit:
|
||||
average: 5
|
||||
burst: 10
|
||||
period: 1m
|
||||
sourceCriterion:
|
||||
ipStrategy:
|
||||
depth: 2
|
||||
|
||||
@@ -0,0 +1,92 @@
|
||||
# Ory Kratos — honeyDue identity service (Phase 1: infrastructure)
|
||||
|
||||
This directory deploys [Ory Kratos](https://www.ory.sh/kratos/) into the
|
||||
`honeydue` namespace as the identity provider — replacing the hand-rolled auth
|
||||
in `internal/services/auth_service.go` etc.
|
||||
|
||||
**Phase 1 is infrastructure only.** Once deployed, Kratos runs but nothing uses
|
||||
it yet — the honeyDue Go API still does its own auth. Phase 2 (backend swap)
|
||||
and Phase 3 (KMP/web clients) follow. Migrating onto Kratos discards all
|
||||
existing user data — honeyDue is pre-production, so no user import is done.
|
||||
|
||||
The deploy is **gated**: `03-deploy.sh` applies Kratos only when the
|
||||
`kratos-secrets` Secret exists, and `02-setup-secrets.sh` creates that Secret
|
||||
only when `config.yaml` has a `kratos:` block. Until then the existing stack
|
||||
deploys completely unaffected.
|
||||
|
||||
## Files
|
||||
|
||||
| File | What |
|
||||
|---|---|
|
||||
| `configmap.yaml` | `kratos.yml`, identity schema, Google/Apple OIDC claim mappers (no secrets) |
|
||||
| `migrate-job.yaml` | `kratos migrate sql` — schema migration, run before the Deployment |
|
||||
| `kratos.yaml` | Deployment (×2), Service, NetworkPolicies |
|
||||
| `ingress.yaml` | `auth.myhoneydue.com` → Kratos public API :4433 |
|
||||
|
||||
## Operator prerequisites (must be done before deploying)
|
||||
|
||||
1. **Kratos version** — Ory uses CalVer (`v25.x` / `v26.x`). `kratos.yaml` and
|
||||
`migrate-job.yaml` are pinned to `oryd/kratos:v26.2.0@sha256:<digest>`. To
|
||||
upgrade, bump both image references together, and check the Kratos release
|
||||
notes for whether the config `version:` in `configmap.yaml` must change.
|
||||
|
||||
2. **Kratos database** — create a separate Neon database named `kratos` (do not
|
||||
share honeyDue's). Capture its connection string as the DSN.
|
||||
|
||||
3. **DNS** — add `auth.myhoneydue.com` in Cloudflare (proxied), pointing at the
|
||||
cluster ingress like the other honeyDue hosts. Confirm the
|
||||
`cloudflare-origin-cert` TLS secret covers `auth.myhoneydue.com`.
|
||||
|
||||
4. **Google OAuth client** (only when adding Google — the shipped `configmap.yaml` is Apple-only) — Google Cloud Console → create an OAuth 2.0 client.
|
||||
Redirect URI: `https://auth.myhoneydue.com/self-service/methods/oidc/callback/google`.
|
||||
Put the **client ID** into `configmap.yaml` (`GOOGLE_OAUTH_CLIENT_ID`); the
|
||||
**client secret** goes in `config.yaml`.
|
||||
|
||||
5. **Apple Sign In** — Apple Developer → a Services ID + a Sign in with Apple
|
||||
key. Return URL: `https://auth.myhoneydue.com/self-service/methods/oidc/callback/apple`.
|
||||
Put the **Services ID / Team ID / Key ID** into `configmap.yaml`
|
||||
(`client_id` / `apple_team_id` / `apple_private_key_id`); the **.p8
|
||||
private key** goes in `config.yaml`.
|
||||
|
||||
6. **`config.yaml`** — add a `kratos:` block:
|
||||
```yaml
|
||||
kratos:
|
||||
dsn: "postgres://USER:PASS@HOST/kratos?sslmode=require"
|
||||
secrets_cookie: "<openssl rand -hex 16>" # generate ONCE, keep stable
|
||||
secrets_cipher: "<openssl rand -hex 16>" # must be exactly 32 chars
|
||||
smtp_connection_uri: "smtps://USER:PASS@smtp.fastmail.com:465/"
|
||||
google_client_secret: "<from Google Cloud Console>"
|
||||
apple_private_key: |
|
||||
-----BEGIN PRIVATE KEY-----
|
||||
...
|
||||
-----END PRIVATE KEY-----
|
||||
```
|
||||
`secrets_cookie` / `secrets_cipher` must stay stable forever — rotating them
|
||||
invalidates every session and makes encrypted data unreadable.
|
||||
|
||||
## Deploy
|
||||
|
||||
```bash
|
||||
cd honeyDueAPI-go
|
||||
export KUBECONFIG="$(pwd)/deploy-k3s/kubeconfig"
|
||||
./deploy-k3s/scripts/02-setup-secrets.sh # creates kratos-secrets from config.yaml
|
||||
./deploy-k3s/scripts/03-deploy.sh # applies kratos manifests, runs migrate, rolls
|
||||
```
|
||||
|
||||
`03-deploy.sh` applies `configmap.yaml` → runs `migrate-job.yaml` → waits →
|
||||
applies `kratos.yaml` + `ingress.yaml`.
|
||||
|
||||
## Verify
|
||||
|
||||
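- `kubectl -n honeydue get pods -l app.kubernetes.io/name=kratos` — two Deployment pods, each `1/1 Running` (the finished `kratos-migrate` pod carries the same label and may also be listed as `Completed`)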
|
||||
- `kubectl -n honeydue logs job/kratos-migrate` — migration succeeded
|
||||
- `curl https://auth.myhoneydue.com/health/ready` — `{"status":"ok"}`
|
||||
- `curl https://auth.myhoneydue.com/self-service/registration/api` — returns a flow
|
||||
|
||||
## Not yet done (later phases)
|
||||
|
||||
- **Phase 2** — honeyDue Go backend: swap `middleware/auth.go` for Kratos
|
||||
session validation, drop the hand-rolled auth code, rebuild the `users`
|
||||
table keyed on the Kratos identity ID.
|
||||
- **Phase 3** — KMP mobile + Next.js web clients point at Kratos flows.
|
||||
- Admin-panel auth stays on its own JWT (out of scope).
|
||||
@@ -0,0 +1,232 @@
|
||||
# Ory Kratos configuration for honeyDue.
|
||||
#
|
||||
# Secrets are NOT in this ConfigMap. The DSN, cookie/cipher secrets, SMTP URI
|
||||
# and OIDC client secrets are injected as environment variables from the
|
||||
# kratos-secrets Secret (see kratos.yaml). Kratos is configured natively via
|
||||
# env vars, so this is the idiomatic split — only non-secret config here.
|
||||
#
|
||||
# OIDC scope: Apple-only as of 2026-06-03. Google is intentionally absent;
|
||||
# adding it later is additive — append a `- id: google` block under
|
||||
# selfservice.methods.oidc.config.providers (it becomes index 1) and bind a
|
||||
# matching CLIENT_SECRET env in kratos.yaml.
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: kratos-config
|
||||
namespace: honeydue
|
||||
labels:
|
||||
app.kubernetes.io/name: kratos
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
data:
|
||||
kratos.yml: |
|
||||
# `version` below is the config-schema version, not the image tag — kratos.yaml + migrate-job.yaml
|
||||
# both pin oryd/kratos:v26.2.0 (2026-06-03). See kratos/README.md.
|
||||
version: v1.3.0 # internal config schema version; do not change unless Kratos release notes require it
|
||||
|
||||
serve:
|
||||
public:
|
||||
base_url: https://auth.myhoneydue.com/
|
||||
cors:
|
||||
enabled: true
|
||||
allowed_origins:
|
||||
- https://myhoneydue.com
|
||||
- https://app.myhoneydue.com
|
||||
- https://admin.myhoneydue.com
|
||||
allowed_methods: [GET, POST, PUT, PATCH, DELETE, OPTIONS]
|
||||
allowed_headers: [Authorization, Content-Type, X-Session-Token, Cookie]
|
||||
exposed_headers: [Content-Type, Set-Cookie]
|
||||
# Required: the web clients call Kratos browser flows with
|
||||
# credentials (the ory_kratos_session cookie). Safe here because
|
||||
# allowed_origins is an explicit list, never a wildcard.
|
||||
allow_credentials: true
|
||||
admin:
|
||||
base_url: http://kratos.honeydue.svc.cluster.local:4434/
|
||||
|
||||
selfservice:
|
||||
default_browser_return_url: https://app.myhoneydue.com/
|
||||
allowed_return_urls:
|
||||
- https://app.myhoneydue.com
|
||||
- https://myhoneydue.com
|
||||
- honeydue://callback
|
||||
|
||||
methods:
|
||||
password:
|
||||
enabled: true
|
||||
code: # email one-time codes (verify/recover)
|
||||
enabled: true
|
||||
oidc:
|
||||
enabled: true
|
||||
config:
|
||||
providers:
|
||||
# index 0 — Apple Sign In. apple_private_key (.p8 contents) is
|
||||
# injected via env SELFSERVICE_METHODS_OIDC_CONFIG_PROVIDERS_0_APPLE_PRIVATE_KEY.
|
||||
# client_id is the Apple Services ID (here: the bundle ID, which
|
||||
# was configured as a Services ID with Sign In with Apple
|
||||
# capability — see operator notes in README.md §5).
|
||||
- id: apple
|
||||
provider: apple
|
||||
# Production bundle id. Apple issues id_tokens with
|
||||
# `aud` = the requesting app's bundle id, so this is the
|
||||
# primary audience Kratos verifies against.
|
||||
client_id: com.myhoneydue.honeyDue
|
||||
# Debug builds out of Xcode use a `.dev` bundle id (see
|
||||
# iosApp/honeyDue.xcodeproj — Debug config). Their id_tokens
|
||||
# therefore have `aud: com.myhoneydue.honeyDue.dev`, which
|
||||
# the primary client_id check rejects. Whitelist the dev
|
||||
# audience so Apple Sign In works from a non-Release Xcode
|
||||
# build without per-build Kratos reconfiguration.
|
||||
additional_id_token_audiences:
|
||||
- com.myhoneydue.honeyDue.dev
|
||||
apple_team_id: X86BR9WTLD
|
||||
apple_private_key_id: HQD3NCF99C
|
||||
mapper_url: file:///etc/kratos/oidc.apple.jsonnet
|
||||
scope: [openid, email, name]
|
||||
|
||||
flows:
|
||||
error:
|
||||
ui_url: https://app.myhoneydue.com/auth/error
|
||||
login:
|
||||
ui_url: https://app.myhoneydue.com/auth/login
|
||||
lifespan: 10m
|
||||
registration:
|
||||
ui_url: https://app.myhoneydue.com/auth/registration
|
||||
lifespan: 10m
|
||||
after:
|
||||
password:
|
||||
hooks:
|
||||
- hook: session # auto-login after registration
|
||||
oidc:
|
||||
hooks:
|
||||
- hook: session
|
||||
verification:
|
||||
enabled: true
|
||||
ui_url: https://app.myhoneydue.com/auth/verification
|
||||
use: code
|
||||
after:
|
||||
default_browser_return_url: https://app.myhoneydue.com/
|
||||
recovery:
|
||||
enabled: true
|
||||
ui_url: https://app.myhoneydue.com/auth/recovery
|
||||
use: code
|
||||
settings:
|
||||
ui_url: https://app.myhoneydue.com/auth/settings
|
||||
privileged_session_max_age: 15m
|
||||
logout:
|
||||
after:
|
||||
default_browser_return_url: https://app.myhoneydue.com/
|
||||
|
||||
log:
|
||||
level: info
|
||||
format: json
|
||||
leak_sensitive_values: false
|
||||
|
||||
ciphers:
|
||||
algorithm: xchacha20-poly1305
|
||||
|
||||
hashers:
|
||||
algorithm: bcrypt
|
||||
bcrypt:
|
||||
cost: 12
|
||||
|
||||
identity:
|
||||
default_schema_id: honeydue
|
||||
schemas:
|
||||
- id: honeydue
|
||||
url: file:///etc/kratos/identity.schema.json
|
||||
|
||||
courier:
|
||||
smtp:
|
||||
from_address: noreply@myhoneydue.com
|
||||
from_name: honeyDue
|
||||
# connection_uri is injected via env COURIER_SMTP_CONNECTION_URI
|
||||
|
||||
session:
|
||||
lifespan: 720h # 30-day sessions (mobile)
|
||||
cookie:
|
||||
domain: myhoneydue.com
|
||||
same_site: Lax
|
||||
|
||||
identity.schema.json: |
|
||||
{
|
||||
"$id": "https://honeydue.app/identity.schema.json",
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"title": "honeyDue user",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"traits": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"email": {
|
||||
"type": "string",
|
||||
"format": "email",
|
||||
"title": "Email",
|
||||
"minLength": 3,
|
||||
"maxLength": 320,
|
||||
"ory.sh/kratos": {
|
||||
"credentials": {
|
||||
"password": { "identifier": true },
|
||||
"code": { "identifier": true, "via": "email" },
|
||||
"totp": { "account_name": true }
|
||||
},
|
||||
"verification": { "via": "email" },
|
||||
"recovery": { "via": "email" }
|
||||
}
|
||||
},
|
||||
"name": {
|
||||
"type": "object",
|
||||
"title": "Name",
|
||||
"properties": {
|
||||
"first": { "type": "string", "title": "First name", "maxLength": 100 },
|
||||
"last": { "type": "string", "title": "Last name", "maxLength": 100 }
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["email"],
|
||||
"additionalProperties": false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
oidc.google.jsonnet: |
|
||||
// Maps Google OIDC claims onto the honeyDue identity schema.
|
||||
local claims = std.extVar('claims');
|
||||
{
|
||||
identity: {
|
||||
traits: {
|
||||
email: claims.email,
|
||||
[if 'given_name' in claims || 'family_name' in claims then 'name']: {
|
||||
first: if 'given_name' in claims then claims.given_name else '',
|
||||
last: if 'family_name' in claims then claims.family_name else '',
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
oidc.apple.jsonnet: |
|
||||
// Maps Apple OIDC claims onto the honeyDue identity schema. Apple only
|
||||
// returns the name on the very first authorization and not in the ID
|
||||
// token claims, so only email is mapped here.
|
||||
//
|
||||
// Sign in with Apple emails are marked verified UNCONDITIONALLY: completing
|
||||
// SIWA cryptographically proves the user controls that Apple ID, and Apple
|
||||
// owns/verifies the (relay or real) email, so a 6-digit code would be
|
||||
// redundant. We deliberately do NOT gate this on Apple's `email_verified`
|
||||
// claim — Apple omits that claim on many authorizations (only sends it on
|
||||
// the first grant), which made auto-verification random: sometimes verified,
|
||||
// sometimes a surprise code prompt (observed 2026-06-03). Marking it
|
||||
// verified on every SIWA makes the behaviour consistent: Apple users never
|
||||
// see a code; password sign-ups still verify via the honeyDue API flow.
|
||||
local claims = std.extVar('claims');
|
||||
{
|
||||
identity: {
|
||||
traits: {
|
||||
email: claims.email,
|
||||
},
|
||||
verified_addresses: std.prune([
|
||||
if 'email' in claims then {
|
||||
via: 'email',
|
||||
value: claims.email,
|
||||
},
|
||||
]),
|
||||
},
|
||||
}
|
||||
@@ -0,0 +1,44 @@
|
||||
# Public ingress for Ory Kratos — auth.myhoneydue.com → Kratos public API :4433.
|
||||
#
|
||||
# Middlewares match the honeyDue API ingress (security-headers + rate-limit).
|
||||
# The cloudflare-only middleware is intentionally NOT applied here: on this
|
||||
# cluster, klipper-lb SNATs the source IP before Traefik sees it, so
|
||||
# cloudflare-only's IP allowlist rejects every legitimate Cloudflare request
|
||||
# (verified 2026-06-03 — iOS Apple Sign In failed silently because Kratos
|
||||
# never received the request). The api ingress doesn't use cloudflare-only
|
||||
# for the same reason. DDoS protection still rides on Cloudflare's edge.
|
||||
#
|
||||
# Kratos's self-service flows are multi-request, so the strict auth-rate-limit
|
||||
# (5/min) is intentionally NOT used here — Kratos applies its own per-flow
|
||||
# protections.
|
||||
#
|
||||
# OPERATOR: confirm the cloudflare-origin-cert TLS secret covers
|
||||
# auth.myhoneydue.com (apex + wildcard origin cert), and add the
|
||||
# auth.myhoneydue.com DNS record in Cloudflare (proxied) → cluster ingress.
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: Ingress
|
||||
metadata:
|
||||
name: honeydue-auth
|
||||
namespace: honeydue
|
||||
labels:
|
||||
app.kubernetes.io/name: kratos
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
annotations:
|
||||
traefik.ingress.kubernetes.io/router.middlewares: honeydue-security-headers@kubernetescrd,honeydue-rate-limit@kubernetescrd
|
||||
spec:
|
||||
ingressClassName: traefik
|
||||
tls:
|
||||
- hosts:
|
||||
- auth.myhoneydue.com
|
||||
secretName: cloudflare-origin-cert
|
||||
rules:
|
||||
- host: auth.myhoneydue.com
|
||||
http:
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: kratos
|
||||
port:
|
||||
number: 4433
|
||||
@@ -0,0 +1,208 @@
|
||||
# Ory Kratos — identity service for honeyDue.
|
||||
#
|
||||
# Deployed once the operator has completed the prerequisites in kratos/README.md
|
||||
# (Neon `kratos` database, auth.myhoneydue.com DNS, Apple Sign In OIDC client,
|
||||
# and the kratos-secrets Secret). Until then 03-deploy.sh skips the Kratos
|
||||
# apply, so the existing stack is unaffected.
|
||||
#
|
||||
# IMAGE: pinned to oryd/kratos v26.2.0 (CalVer current stable as of 2026-06-03)
|
||||
# with the linux/amd64 digest. The schema-migration Job is in migrate-job.yaml
|
||||
# and runs before this Deployment rolls.
|
||||
#
|
||||
# OIDC: currently Apple-only (configmap.yaml providers[0]). Google was scoped
|
||||
# out at deploy time; adding it later is additive — append to providers[] in
|
||||
# configmap.yaml and add the matching CLIENT_SECRET env binding here.
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: kratos
|
||||
namespace: honeydue
|
||||
labels:
|
||||
app.kubernetes.io/name: kratos
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
spec:
|
||||
replicas: 2
|
||||
strategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
maxUnavailable: 0
|
||||
maxSurge: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: kratos
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: kratos
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
spec:
|
||||
automountServiceAccountToken: false
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
seccompProfile:
|
||||
type: RuntimeDefault
|
||||
containers:
|
||||
- name: kratos
|
||||
image: oryd/kratos:v26.2.0@sha256:92eedc292ff8e1a918ac442c88ed0abe44610c75121700963114549908a45ac3
|
||||
imagePullPolicy: IfNotPresent
|
||||
args:
|
||||
- serve
|
||||
- --config
|
||||
- /etc/kratos/kratos.yml
|
||||
- --watch-courier # send verification/recovery email in-process
|
||||
ports:
|
||||
- name: public
|
||||
containerPort: 4433
|
||||
- name: admin
|
||||
containerPort: 4434
|
||||
env:
|
||||
# Kratos is configured natively via env vars; secrets come from
|
||||
# the kratos-secrets Secret rather than the ConfigMap.
|
||||
- name: DSN
|
||||
valueFrom: { secretKeyRef: { name: kratos-secrets, key: dsn } }
|
||||
- name: SECRETS_COOKIE
|
||||
valueFrom: { secretKeyRef: { name: kratos-secrets, key: secrets_cookie } }
|
||||
- name: SECRETS_CIPHER
|
||||
valueFrom: { secretKeyRef: { name: kratos-secrets, key: secrets_cipher } }
|
||||
- name: COURIER_SMTP_CONNECTION_URI
|
||||
valueFrom: { secretKeyRef: { name: kratos-secrets, key: smtp_connection_uri } }
|
||||
# OIDC provider secrets — index must match the providers list
|
||||
# order in configmap.yaml. Apple-only for now (index 0).
|
||||
- name: SELFSERVICE_METHODS_OIDC_CONFIG_PROVIDERS_0_APPLE_PRIVATE_KEY
|
||||
valueFrom: { secretKeyRef: { name: kratos-secrets, key: apple_private_key } }
|
||||
volumeMounts:
|
||||
- name: config
|
||||
mountPath: /etc/kratos
|
||||
readOnly: true
|
||||
- name: tmp
|
||||
mountPath: /tmp
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /health/ready
|
||||
port: 4434
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 10
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /health/alive
|
||||
port: 4434
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 30
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 128Mi
|
||||
limits:
|
||||
cpu: "1"
|
||||
memory: 512Mi
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
readOnlyRootFilesystem: true
|
||||
capabilities:
|
||||
drop: ["ALL"]
|
||||
volumes:
|
||||
- name: config
|
||||
configMap:
|
||||
name: kratos-config
|
||||
- name: tmp
|
||||
emptyDir:
|
||||
sizeLimit: 64Mi
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: kratos
|
||||
namespace: honeydue
|
||||
labels:
|
||||
app.kubernetes.io/name: kratos
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
spec:
|
||||
selector:
|
||||
app.kubernetes.io/name: kratos
|
||||
ports:
|
||||
- name: public
|
||||
port: 4433
|
||||
targetPort: 4433
|
||||
- name: admin
|
||||
port: 4434
|
||||
targetPort: 4434
|
||||
---
|
||||
# Ingress to Kratos. Traefik (the auth.myhoneydue.com Ingress) reaches
|
||||
# only the public API :4433. The honeyDue api pods reach the public API :4433
|
||||
# (session whoami) AND the admin API :4434 (identity deletion on account
|
||||
# close). The admin API :4434 takes no other cluster ingress.
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: NetworkPolicy
|
||||
metadata:
|
||||
name: allow-ingress-to-kratos
|
||||
namespace: honeydue
|
||||
spec:
|
||||
podSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: kratos
|
||||
policyTypes:
|
||||
- Ingress
|
||||
ingress:
|
||||
# Traefik ingress controller -> public API only.
|
||||
- from:
|
||||
- namespaceSelector:
|
||||
matchLabels:
|
||||
kubernetes.io/metadata.name: kube-system
|
||||
ports:
|
||||
- port: 4433
|
||||
protocol: TCP
|
||||
# honeyDue api pods -> public API (whoami) + admin API (identity deletion).
|
||||
- from:
|
||||
- podSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: api
|
||||
ports:
|
||||
- port: 4433
|
||||
protocol: TCP
|
||||
- port: 4434
|
||||
protocol: TCP
|
||||
---
|
||||
# Kratos egress: DNS, the Neon Postgres database, SMTP, and HTTPS to the
|
||||
# OIDC providers (Apple/Google token + JWKS endpoints).
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: NetworkPolicy
|
||||
metadata:
|
||||
name: allow-egress-from-kratos
|
||||
namespace: honeydue
|
||||
spec:
|
||||
podSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: kratos
|
||||
policyTypes:
|
||||
- Egress
|
||||
egress:
|
||||
- to:
|
||||
- namespaceSelector: {}
|
||||
ports:
|
||||
- port: 53
|
||||
protocol: UDP
|
||||
- port: 53
|
||||
protocol: TCP
|
||||
# Neon Postgres (external)
|
||||
- to:
|
||||
- ipBlock:
|
||||
cidr: 0.0.0.0/0
|
||||
except:
|
||||
- 10.42.0.0/16
|
||||
- 10.43.0.0/16
|
||||
ports:
|
||||
- port: 5432
|
||||
protocol: TCP
|
||||
# SMTP (Fastmail) + HTTPS to Apple/Google OIDC endpoints (external)
|
||||
- to:
|
||||
- ipBlock:
|
||||
cidr: 0.0.0.0/0
|
||||
except:
|
||||
- 10.42.0.0/16
|
||||
- 10.43.0.0/16
|
||||
ports:
|
||||
- port: 465
|
||||
protocol: TCP
|
||||
- port: 443
|
||||
protocol: TCP
|
||||
@@ -0,0 +1,51 @@
|
||||
# Ory Kratos schema migration — runs `kratos migrate sql` against the Kratos
|
||||
# database before the Kratos Deployment rolls. 03-deploy.sh applies this,
|
||||
# waits for completion, then applies kratos.yaml.
|
||||
#
|
||||
# IMAGE: pinned to oryd/kratos v26.2.0 (CalVer current stable as of 2026-06-03)
|
||||
# with the linux/amd64 digest. Bump in sync with kratos.yaml's image.
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: kratos-migrate
|
||||
namespace: honeydue
|
||||
labels:
|
||||
app.kubernetes.io/name: kratos
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
spec:
|
||||
backoffLimit: 0
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: kratos
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
spec:
|
||||
restartPolicy: Never
|
||||
automountServiceAccountToken: false
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
seccompProfile:
|
||||
type: RuntimeDefault
|
||||
containers:
|
||||
- name: kratos-migrate
|
||||
image: oryd/kratos:v26.2.0@sha256:92eedc292ff8e1a918ac442c88ed0abe44610c75121700963114549908a45ac3
|
||||
imagePullPolicy: IfNotPresent
|
||||
args: ["migrate", "sql", "-e", "--yes"]
|
||||
env:
|
||||
- name: DSN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: kratos-secrets
|
||||
key: dsn
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
readOnlyRootFilesystem: true
|
||||
capabilities:
|
||||
drop: ["ALL"]
|
||||
resources:
|
||||
requests:
|
||||
cpu: 50m
|
||||
memory: 64Mi
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 256Mi
|
||||
@@ -0,0 +1,61 @@
|
||||
# Kyverno image-signature verification policy (audit CODE-L5).
|
||||
#
|
||||
# ──────────────────────────────────────────────────────────────────────────
|
||||
# THIS MANIFEST IS NOT APPLIED BY 03-deploy.sh. It is intentionally outside
|
||||
# the script's apply set. Applying it before the prerequisites are in place
|
||||
# would block every honeydue Pod from scheduling. Operator steps:
|
||||
#
|
||||
# 1. Install Kyverno in the cluster (it is an admission controller):
|
||||
# kubectl create -f https://github.com/kyverno/kyverno/releases/latest/download/install.yaml
|
||||
# 2. Generate a cosign key pair and keep the private key safe:
|
||||
# cosign generate-key-pair # -> cosign.key (PRIVATE) + cosign.pub
|
||||
# Set COSIGN_KEY=cosign.key in the deploy environment so 03-deploy.sh
|
||||
# signs images after pushing them (the signing step is already wired,
|
||||
# guarded, into 03-deploy.sh).
|
||||
# 3. Paste the contents of cosign.pub into the publicKeys block below.
|
||||
# 4. Apply this policy: kubectl apply -f deploy-k3s/manifests/kyverno-verify-images.yaml
|
||||
# 5. After confirming honeydue Pods still schedule, flip
|
||||
# validationFailureAction from Audit to Enforce.
|
||||
#
|
||||
# Until then it is a documented, ready-to-use template — not active config.
|
||||
# ──────────────────────────────────────────────────────────────────────────
|
||||
apiVersion: kyverno.io/v1
|
||||
kind: ClusterPolicy
|
||||
metadata:
|
||||
name: verify-honeydue-images
|
||||
annotations:
|
||||
policies.kyverno.io/title: Verify honeyDue image signatures
|
||||
policies.kyverno.io/description: >-
|
||||
Requires that honeyDue application images pulled into the honeydue
|
||||
namespace carry a valid cosign signature made with the operator's key.
|
||||
spec:
|
||||
# Audit first — logs violations without blocking. Switch to Enforce once
|
||||
# signing is confirmed working end to end.
|
||||
validationFailureAction: Audit
|
||||
background: false
|
||||
webhookTimeoutSeconds: 30
|
||||
rules:
|
||||
- name: verify-gitea-image-signatures
|
||||
match:
|
||||
any:
|
||||
- resources:
|
||||
kinds:
|
||||
- Pod
|
||||
namespaces:
|
||||
- honeydue
|
||||
verifyImages:
|
||||
# Only the images we build and sign. Public base images
|
||||
# (redis, vmagent) are pinned by digest instead — see their manifests.
|
||||
- imageReferences:
|
||||
- "gitea.treytartt.com/admin/honeydue-api*"
|
||||
- "gitea.treytartt.com/admin/honeydue-worker*"
|
||||
- "gitea.treytartt.com/admin/honeydue-admin*"
|
||||
- "gitea.treytartt.com/admin/honeydue-web*"
|
||||
attestors:
|
||||
- count: 1
|
||||
entries:
|
||||
- keys:
|
||||
publicKeys: |-
|
||||
-----BEGIN PUBLIC KEY-----
|
||||
REPLACE_WITH_CONTENTS_OF_cosign.pub
|
||||
-----END PUBLIC KEY-----
|
||||
@@ -0,0 +1,78 @@
|
||||
# One-shot migration Job. Runs goose against Neon's *direct* (non-pooler)
|
||||
# endpoint, applies any pending migrations from /app/migrations (baked into
|
||||
# the api image), exits.
|
||||
#
|
||||
# 03-deploy.sh deletes any prior Job, applies this one, waits for completion
|
||||
# with `kubectl wait --for=condition=complete`, and rolls api/worker only
|
||||
# after the Job succeeds. A Job failure aborts the whole deploy.
|
||||
#
|
||||
# We reuse the api image rather than build a separate one — the api Dockerfile
|
||||
# already installs the goose CLI to /usr/local/bin/goose and copies the
|
||||
# migrations directory to /app/migrations.
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: honeydue-migrate
|
||||
namespace: honeydue
|
||||
labels:
|
||||
app.kubernetes.io/name: migrate
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
spec:
|
||||
backoffLimit: 0 # fail fast — no silent retries on a bad migration
|
||||
ttlSecondsAfterFinished: 86400 # keep finished Job for 24h so logs are inspectable
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: migrate
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
spec:
|
||||
restartPolicy: Never
|
||||
# The migrate Job never calls the k8s API (audit F11).
|
||||
automountServiceAccountToken: false
|
||||
imagePullSecrets:
|
||||
- name: gitea-credentials
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 1000
|
||||
runAsGroup: 1000
|
||||
seccompProfile:
|
||||
type: RuntimeDefault
|
||||
containers:
|
||||
- name: goose
|
||||
image: IMAGE_PLACEHOLDER # Replaced by 03-deploy.sh — same as api
|
||||
imagePullPolicy: IfNotPresent # audit CODE-L4 — explicit
|
||||
command: ["/bin/sh", "-c"]
|
||||
# DB_HOST in the ConfigMap points at the -pooler endpoint for runtime.
|
||||
# goose's session-scoped advisory lock can't survive PgBouncer
|
||||
# transaction-mode, so we strip the -pooler segment for migrations.
|
||||
# `set -e` so any sub-command failure exits non-zero.
|
||||
args:
|
||||
- |
|
||||
set -e
|
||||
DIRECT_HOST=$(echo "$DB_HOST" | sed 's/-pooler\.\(.*\)$/.\1/')
|
||||
echo "[migrate] running goose up against $DIRECT_HOST"
|
||||
exec /usr/local/bin/goose \
|
||||
-dir /app/migrations \
|
||||
postgres "host=$DIRECT_HOST port=$DB_PORT user=$POSTGRES_USER password=$POSTGRES_PASSWORD dbname=$POSTGRES_DB sslmode=$DB_SSLMODE" \
|
||||
up
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
readOnlyRootFilesystem: true
|
||||
capabilities:
|
||||
drop: ["ALL"]
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: honeydue-config
|
||||
env:
|
||||
- name: POSTGRES_PASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: honeydue-secrets
|
||||
key: POSTGRES_PASSWORD
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 64Mi
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 256Mi
|
||||
@@ -140,6 +140,20 @@ spec:
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 6379
|
||||
# Kratos (in-cluster). The auth middleware validates every session via
|
||||
# http://kratos:4433/sessions/whoami; the AuthService also uses :4434
|
||||
# for account deletion (DELETE /admin/identities/{id}). k3s evaluates
|
||||
# egress rules AFTER kube-proxy DNAT (runbook §9.2), so this podSelector
|
||||
# rule covers Service ClusterIP traffic correctly.
|
||||
- to:
|
||||
- podSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: kratos
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 4433
|
||||
- protocol: TCP
|
||||
port: 4434
|
||||
# External services: Neon DB (5432), SMTP (587), HTTPS (443 — APNs, FCM, B2, PostHog)
|
||||
- to:
|
||||
- ipBlock:
|
||||
@@ -275,3 +289,154 @@ spec:
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 443
|
||||
|
||||
---
|
||||
# vmagent egress.
|
||||
#
|
||||
# IMPORTANT (gotcha): k3s's built-in NetworkPolicy controller appears to
|
||||
# evaluate egress rules AFTER kube-proxy's DNAT, not before (contrary to
|
||||
# the k8s spec). So traffic from a pod to the kubernetes Service
|
||||
# (ClusterIP 10.43.0.1:443) is policy-checked as dst=<node_public_ip>:6443.
|
||||
# That's why we need an explicit rule for :6443 to public IPs, even though
|
||||
# we already allow :443 to the cluster service CIDR.
|
||||
#
|
||||
# Without the :6443 rule, vmagent's k8s service discovery silently fails
|
||||
# and zero pods get scraped. See deploy-k3s/RUNBOOK.md ("vmagent SD broken").
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: NetworkPolicy
|
||||
metadata:
|
||||
name: allow-egress-from-vmagent
|
||||
namespace: honeydue
|
||||
spec:
|
||||
podSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: vmagent
|
||||
policyTypes:
|
||||
- Egress
|
||||
egress:
|
||||
# DNS (cluster-internal)
|
||||
- to:
|
||||
- namespaceSelector: {}
|
||||
ports:
|
||||
- port: 53
|
||||
protocol: UDP
|
||||
- port: 53
|
||||
protocol: TCP
|
||||
# k8s API server via ClusterIP (pre-DNAT view)
|
||||
- to:
|
||||
- ipBlock:
|
||||
cidr: 10.43.0.0/16
|
||||
ports:
|
||||
- port: 443
|
||||
protocol: TCP
|
||||
# k8s API server post-DNAT (real path k3s NetPol enforcer sees) — REQUIRED
|
||||
- to:
|
||||
- ipBlock:
|
||||
cidr: 0.0.0.0/0
|
||||
except:
|
||||
- 10.42.0.0/16
|
||||
ports:
|
||||
- port: 6443
|
||||
protocol: TCP
|
||||
# Scrape api Pods on :8000
|
||||
- to:
|
||||
- ipBlock:
|
||||
cidr: 10.42.0.0/16
|
||||
ports:
|
||||
- port: 8000
|
||||
protocol: TCP
|
||||
# Scrape kube-state-metrics Pod on :8080 (pod CIDR)
|
||||
- to:
|
||||
- ipBlock:
|
||||
cidr: 10.42.0.0/16
|
||||
ports:
|
||||
- port: 8080
|
||||
protocol: TCP
|
||||
# HTTPS to public (remote-write to obs.88oakapps.com via Cloudflare)
|
||||
- to:
|
||||
- ipBlock:
|
||||
cidr: 0.0.0.0/0
|
||||
except:
|
||||
- 10.42.0.0/16
|
||||
- 10.43.0.0/16
|
||||
ports:
|
||||
- port: 443
|
||||
protocol: TCP
|
||||
|
||||
---
|
||||
# Allow vmagent → api ingress on :8000 so api pods accept scrapes.
|
||||
# api Pods are otherwise locked down by default-deny-all + allow-ingress-to-api
|
||||
# (which only allows Traefik). This adds vmagent specifically.
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: NetworkPolicy
|
||||
metadata:
|
||||
name: allow-vmagent-to-api
|
||||
namespace: honeydue
|
||||
spec:
|
||||
podSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: api
|
||||
policyTypes:
|
||||
- Ingress
|
||||
ingress:
|
||||
- from:
|
||||
- podSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: vmagent
|
||||
ports:
|
||||
- port: 8000
|
||||
protocol: TCP
|
||||
|
||||
---
|
||||
# alloy-logs egress — Grafana Alloy discovers honeydue pods via the k8s API
|
||||
# and pushes their logs to Loki at obs.88oakapps.com. Same k3s NetworkPolicy
|
||||
# DNAT gotcha as vmagent: API-server traffic is policy-checked as
|
||||
# dst=<node_public_ip>:6443, so an explicit :6443 rule is required.
|
||||
# Alloy reads log FILES from a hostPath, so it needs no ingress and no
|
||||
# egress to pod :8000/:8080 — only DNS, the API server, and obs HTTPS.
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: NetworkPolicy
|
||||
metadata:
|
||||
name: allow-egress-from-alloy-logs
|
||||
namespace: honeydue
|
||||
spec:
|
||||
podSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: alloy-logs
|
||||
policyTypes:
|
||||
- Egress
|
||||
egress:
|
||||
# DNS (cluster-internal)
|
||||
- to:
|
||||
- namespaceSelector: {}
|
||||
ports:
|
||||
- port: 53
|
||||
protocol: UDP
|
||||
- port: 53
|
||||
protocol: TCP
|
||||
# k8s API server via ClusterIP (pre-DNAT view)
|
||||
- to:
|
||||
- ipBlock:
|
||||
cidr: 10.43.0.0/16
|
||||
ports:
|
||||
- port: 443
|
||||
protocol: TCP
|
||||
# k8s API server post-DNAT (real path k3s NetPol enforcer sees) — REQUIRED
|
||||
- to:
|
||||
- ipBlock:
|
||||
cidr: 0.0.0.0/0
|
||||
except:
|
||||
- 10.42.0.0/16
|
||||
ports:
|
||||
- port: 6443
|
||||
protocol: TCP
|
||||
# HTTPS to public (log push to obs.88oakapps.com via Cloudflare)
|
||||
- to:
|
||||
- ipBlock:
|
||||
cidr: 0.0.0.0/0
|
||||
except:
|
||||
- 10.42.0.0/16
|
||||
- 10.43.0.0/16
|
||||
ports:
|
||||
- port: 443
|
||||
protocol: TCP
|
||||
|
||||
@@ -0,0 +1,278 @@
|
||||
# honeyDue log shipper — Grafana Alloy as a DaemonSet.
|
||||
#
|
||||
# Each node runs one Alloy pod that tails the honeydue-namespace pod logs in
|
||||
# /var/log/pods and pushes them to Loki at obs.88oakapps.com/loki/api/v1/push
|
||||
# (the same nginx ingest endpoint + bearer token vmagent uses for metrics).
|
||||
#
|
||||
# Runs as root: /var/log/pods is 0750 root:root on the k3s nodes, so a
|
||||
# non-root uid cannot even traverse it. The container is otherwise locked
|
||||
# down — all capabilities dropped, read-only root filesystem, seccomp
|
||||
# RuntimeDefault — and root inside the container reads only a read-only
|
||||
# hostPath mount of /var/log/pods. This is the one root-running workload in
|
||||
# the namespace (standard for log collectors); see docs/deployment.
|
||||
#
|
||||
# 03-deploy.sh substitutes TOKEN_PLACEHOLDER with OBS_INGEST_TOKEN from
|
||||
# deploy/prod.env before applying — the token never lands in the repo.
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: alloy-logs
|
||||
namespace: honeydue
|
||||
labels:
|
||||
app.kubernetes.io/name: alloy-logs
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
---
|
||||
# Least privilege: Alloy's discovery.kubernetes only lists/watches pods, and
|
||||
# only in the honeydue namespace — so a namespaced Role, not a ClusterRole.
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: Role
|
||||
metadata:
|
||||
name: alloy-logs
|
||||
namespace: honeydue
|
||||
labels:
|
||||
app.kubernetes.io/name: alloy-logs
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["pods"]
|
||||
verbs: ["get", "list", "watch"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: RoleBinding
|
||||
metadata:
|
||||
name: alloy-logs
|
||||
namespace: honeydue
|
||||
labels:
|
||||
app.kubernetes.io/name: alloy-logs
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: alloy-logs
|
||||
namespace: honeydue
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: Role
|
||||
name: alloy-logs
|
||||
---
|
||||
# Bearer token for the Loki push endpoint. TOKEN_PLACEHOLDER is replaced by
|
||||
# 03-deploy.sh with OBS_INGEST_TOKEN (same token vmagent uses).
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: alloy-logs-auth
|
||||
namespace: honeydue
|
||||
labels:
|
||||
app.kubernetes.io/name: alloy-logs
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
type: Opaque
|
||||
stringData:
|
||||
bearer_token: TOKEN_PLACEHOLDER
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: alloy-logs
|
||||
namespace: honeydue
|
||||
labels:
|
||||
app.kubernetes.io/name: alloy-logs
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
data:
|
||||
config.alloy: |
|
||||
// honeyDue log shipper. Each DaemonSet instance discovers honeydue-namespace
|
||||
// pods via the Kubernetes API, tails the container log files present on its
|
||||
// own node (/var/log/pods), and pushes them to Loki at obs.88oakapps.com.
|
||||
|
||||
logging {
|
||||
level = "warn"
|
||||
format = "logfmt"
|
||||
}
|
||||
|
||||
discovery.kubernetes "pods" {
|
||||
role = "pod"
|
||||
namespaces {
|
||||
names = ["honeydue"]
|
||||
}
|
||||
}
|
||||
|
||||
// Turn pod metadata into Loki labels and build the on-disk log path.
|
||||
discovery.relabel "pod_logs" {
|
||||
targets = discovery.kubernetes.pods.targets
|
||||
|
||||
rule {
|
||||
source_labels = ["__meta_kubernetes_namespace"]
|
||||
action = "replace"
|
||||
target_label = "namespace"
|
||||
}
|
||||
rule {
|
||||
source_labels = ["__meta_kubernetes_pod_name"]
|
||||
action = "replace"
|
||||
target_label = "pod"
|
||||
}
|
||||
rule {
|
||||
source_labels = ["__meta_kubernetes_pod_container_name"]
|
||||
action = "replace"
|
||||
target_label = "container"
|
||||
}
|
||||
rule {
|
||||
source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"]
|
||||
action = "replace"
|
||||
target_label = "app"
|
||||
}
|
||||
rule {
|
||||
source_labels = ["__meta_kubernetes_pod_node_name"]
|
||||
action = "replace"
|
||||
target_label = "node"
|
||||
}
|
||||
// /var/log/pods/<namespace>_<pod>_<uid>/<container>/<n>.log
|
||||
rule {
|
||||
source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"]
|
||||
separator = "/"
|
||||
action = "replace"
|
||||
replacement = "/var/log/pods/*$1/*.log"
|
||||
target_label = "__path__"
|
||||
}
|
||||
}
|
||||
|
||||
local.file_match "pod_logs" {
|
||||
path_targets = discovery.relabel.pod_logs.output
|
||||
}
|
||||
|
||||
loki.source.file "pod_logs" {
|
||||
targets = local.file_match.pod_logs.targets
|
||||
forward_to = [loki.process.pod_logs.receiver]
|
||||
// With no stored read offset (fresh node, or positions wiped), start
|
||||
// at the END of each file instead of re-shipping history — otherwise
|
||||
// Loki rejects the now-too-old entries ("entry too far behind") and
|
||||
// shipping stalls. Offsets persist on a hostPath (see volumes), so a
|
||||
// normal pod restart resumes exactly where it left off.
|
||||
tail_from_end = true
|
||||
}
|
||||
|
||||
// Parse the CRI log format (timestamp / stream / flags / message),
|
||||
// then drop probe/scrape noise before shipping.
|
||||
loki.process "pod_logs" {
|
||||
forward_to = [loki.write.obs.receiver]
|
||||
|
||||
stage.cri {}
|
||||
|
||||
// Drop successful probe/scrape access logs. k8s liveness/readiness
|
||||
// hits /api/health/ every few seconds and vmagent scrapes /metrics
|
||||
// on a 15s interval — all 2xx, pure noise that drowns real logs.
|
||||
// A non-2xx health check, or one logged above info level, does NOT
|
||||
// match this regex and is kept.
|
||||
stage.drop {
|
||||
expression = "\"level\":\"info\".*\"path\":\"/(api/health/?|metrics)\".*\"status\":2[0-9][0-9]"
|
||||
drop_counter_reason = "probe_access_ok"
|
||||
}
|
||||
}
|
||||
|
||||
loki.write "obs" {
|
||||
endpoint {
|
||||
url = "https://obs.88oakapps.com/loki/api/v1/push"
|
||||
bearer_token_file = "/etc/alloy-secrets/bearer_token"
|
||||
}
|
||||
external_labels = {
|
||||
cluster = "honeydue-k3s",
|
||||
environment = "prod",
|
||||
}
|
||||
}
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: alloy-logs
|
||||
namespace: honeydue
|
||||
labels:
|
||||
app.kubernetes.io/name: alloy-logs
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: alloy-logs
|
||||
updateStrategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
maxUnavailable: 1
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: alloy-logs
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
spec:
|
||||
serviceAccountName: alloy-logs
|
||||
# Alloy needs its SA token — discovery.kubernetes talks to the API server.
|
||||
automountServiceAccountToken: true
|
||||
# Root is required to traverse /var/log/pods (0750 root:root). The
|
||||
# container is otherwise fully confined (see container securityContext).
|
||||
securityContext:
|
||||
runAsUser: 0
|
||||
runAsGroup: 0
|
||||
seccompProfile:
|
||||
type: RuntimeDefault
|
||||
tolerations:
|
||||
# DaemonSet must run on every node, including nodes that carry the control-plane taint.
|
||||
- key: node-role.kubernetes.io/control-plane
|
||||
operator: Exists
|
||||
effect: NoSchedule
|
||||
containers:
|
||||
- name: alloy
|
||||
image: grafana/alloy:v1.5.1@sha256:01a63f4e032ce54ee94b22049bc27f597e74f85566478c377f4b5c7f020c1eb3
|
||||
imagePullPolicy: IfNotPresent
|
||||
args:
|
||||
- run
|
||||
- /etc/alloy/config.alloy
|
||||
- --storage.path=/tmp/alloy
|
||||
- --server.http.listen-addr=0.0.0.0:12345
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 12345
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
readOnlyRootFilesystem: true
|
||||
capabilities:
|
||||
drop: ["ALL"]
|
||||
volumeMounts:
|
||||
- name: config
|
||||
mountPath: /etc/alloy
|
||||
readOnly: true
|
||||
- name: auth
|
||||
mountPath: /etc/alloy-secrets
|
||||
readOnly: true
|
||||
- name: varlogpods
|
||||
mountPath: /var/log/pods
|
||||
readOnly: true
|
||||
- name: tmp
|
||||
mountPath: /tmp/alloy
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /-/ready
|
||||
port: 12345
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 20
|
||||
resources:
|
||||
requests:
|
||||
cpu: 25m
|
||||
memory: 64Mi
|
||||
limits:
|
||||
cpu: 150m
|
||||
memory: 256Mi
|
||||
volumes:
|
||||
- name: config
|
||||
configMap:
|
||||
name: alloy-logs
|
||||
- name: auth
|
||||
secret:
|
||||
secretName: alloy-logs-auth
|
||||
defaultMode: 0400
|
||||
- name: varlogpods
|
||||
hostPath:
|
||||
path: /var/log/pods
|
||||
type: Directory
|
||||
# Alloy's positions/WAL store. A hostPath (not emptyDir) so file read
|
||||
# offsets survive pod restarts — otherwise every restart re-reads log
|
||||
# files from the start and Loki rejects the now-too-old entries.
|
||||
- name: tmp
|
||||
hostPath:
|
||||
path: /var/lib/honeydue-alloy-logs
|
||||
type: DirectoryOrCreate
|
||||
@@ -0,0 +1,223 @@
|
||||
# kube-state-metrics — exposes cluster object state (pods, deployments,
|
||||
# services, etc.) as Prometheus metrics. vmagent scrapes it via the
|
||||
# kube-state-metrics job defined in vmagent-config; Grafana panels that count pods,
|
||||
# replicas, etc. consume the `kube_*` metrics this produces.
|
||||
#
|
||||
# Lives in kube-system because it watches resources cluster-wide.
|
||||
# RBAC is cluster-scoped (ClusterRole + ClusterRoleBinding).
|
||||
#
|
||||
# Image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.13.0
|
||||
# (latest stable as of authoring; bump when a newer minor is released)
|
||||
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: kube-state-metrics
|
||||
namespace: kube-system
|
||||
labels:
|
||||
app.kubernetes.io/name: kube-state-metrics
|
||||
app.kubernetes.io/part-of: honeydue-observability
|
||||
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: kube-state-metrics
|
||||
labels:
|
||||
app.kubernetes.io/name: kube-state-metrics
|
||||
app.kubernetes.io/part-of: honeydue-observability
|
||||
rules:
|
||||
# Core resources
|
||||
- apiGroups: [""]
|
||||
resources:
|
||||
- configmaps
|
||||
- secrets
|
||||
- nodes
|
||||
- pods
|
||||
- services
|
||||
- serviceaccounts
|
||||
- resourcequotas
|
||||
- replicationcontrollers
|
||||
- limitranges
|
||||
- persistentvolumeclaims
|
||||
- persistentvolumes
|
||||
- namespaces
|
||||
- endpoints
|
||||
verbs: [list, watch]
|
||||
# Apps
|
||||
- apiGroups: ["apps"]
|
||||
resources:
|
||||
- statefulsets
|
||||
- daemonsets
|
||||
- deployments
|
||||
- replicasets
|
||||
verbs: [list, watch]
|
||||
# Batch
|
||||
- apiGroups: ["batch"]
|
||||
resources:
|
||||
- cronjobs
|
||||
- jobs
|
||||
verbs: [list, watch]
|
||||
# Autoscaling
|
||||
- apiGroups: ["autoscaling"]
|
||||
resources:
|
||||
- horizontalpodautoscalers
|
||||
verbs: [list, watch]
|
||||
# Authentication / authorization (used by some ksm collectors)
|
||||
- apiGroups: ["authentication.k8s.io"]
|
||||
resources: [tokenreviews]
|
||||
verbs: [create]
|
||||
- apiGroups: ["authorization.k8s.io"]
|
||||
resources: [subjectaccessreviews]
|
||||
verbs: [create]
|
||||
# Policy
|
||||
- apiGroups: ["policy"]
|
||||
resources: [poddisruptionbudgets]
|
||||
verbs: [list, watch]
|
||||
# Certificate signing
|
||||
- apiGroups: ["certificates.k8s.io"]
|
||||
resources: [certificatesigningrequests]
|
||||
verbs: [list, watch]
|
||||
# Discovery
|
||||
- apiGroups: ["discovery.k8s.io"]
|
||||
resources: [endpointslices]
|
||||
verbs: [list, watch]
|
||||
# Storage
|
||||
- apiGroups: ["storage.k8s.io"]
|
||||
resources:
|
||||
- storageclasses
|
||||
- volumeattachments
|
||||
verbs: [list, watch]
|
||||
# Admission policy
|
||||
- apiGroups: ["admissionregistration.k8s.io"]
|
||||
resources:
|
||||
- mutatingwebhookconfigurations
|
||||
- validatingwebhookconfigurations
|
||||
verbs: [list, watch]
|
||||
# Networking
|
||||
- apiGroups: ["networking.k8s.io"]
|
||||
resources:
|
||||
- networkpolicies
|
||||
- ingressclasses
|
||||
- ingresses
|
||||
verbs: [list, watch]
|
||||
# Coordination (leader election)
|
||||
- apiGroups: ["coordination.k8s.io"]
|
||||
resources: [leases]
|
||||
verbs: [list, watch]
|
||||
# RBAC
|
||||
- apiGroups: ["rbac.authorization.k8s.io"]
|
||||
resources:
|
||||
- clusterrolebindings
|
||||
- clusterroles
|
||||
- rolebindings
|
||||
- roles
|
||||
verbs: [list, watch]
|
||||
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: kube-state-metrics
|
||||
labels:
|
||||
app.kubernetes.io/name: kube-state-metrics
|
||||
app.kubernetes.io/part-of: honeydue-observability
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: kube-state-metrics
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: kube-state-metrics
|
||||
namespace: kube-system
|
||||
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: kube-state-metrics
|
||||
namespace: kube-system
|
||||
labels:
|
||||
app.kubernetes.io/name: kube-state-metrics
|
||||
app.kubernetes.io/part-of: honeydue-observability
|
||||
spec:
|
||||
type: ClusterIP
|
||||
selector:
|
||||
app.kubernetes.io/name: kube-state-metrics
|
||||
ports:
|
||||
- name: http-metrics
|
||||
port: 8080
|
||||
targetPort: http-metrics
|
||||
protocol: TCP
|
||||
- name: telemetry
|
||||
port: 8081
|
||||
targetPort: telemetry
|
||||
protocol: TCP
|
||||
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: kube-state-metrics
|
||||
namespace: kube-system
|
||||
labels:
|
||||
app.kubernetes.io/name: kube-state-metrics
|
||||
app.kubernetes.io/part-of: honeydue-observability
|
||||
spec:
|
||||
replicas: 1
|
||||
strategy:
|
||||
type: Recreate
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: kube-state-metrics
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: kube-state-metrics
|
||||
app.kubernetes.io/part-of: honeydue-observability
|
||||
spec:
|
||||
serviceAccountName: kube-state-metrics
|
||||
automountServiceAccountToken: true
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 65534
|
||||
fsGroup: 65534
|
||||
seccompProfile:
|
||||
type: RuntimeDefault
|
||||
containers:
|
||||
- name: kube-state-metrics
|
||||
image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.13.0
|
||||
imagePullPolicy: IfNotPresent
|
||||
ports:
|
||||
- containerPort: 8080
|
||||
name: http-metrics
|
||||
- containerPort: 8081
|
||||
name: telemetry
|
||||
args:
|
||||
- --port=8080
|
||||
- --telemetry-port=8081
|
||||
resources:
|
||||
requests:
|
||||
cpu: 25m
|
||||
memory: 64Mi
|
||||
limits:
|
||||
cpu: 200m
|
||||
memory: 256Mi
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
capabilities:
|
||||
drop: [ALL]
|
||||
readOnlyRootFilesystem: true
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /livez
|
||||
port: http-metrics
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 30
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /readyz
|
||||
port: http-metrics
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 10
|
||||
@@ -0,0 +1,126 @@
|
||||
# node-exporter — per-node host metrics (filesystem, memory, load, CPU).
|
||||
# Runs as a normal pod (NOT hostNetwork) so vmagent scrapes it pod-to-pod over
|
||||
# the cluster CIDR, avoiding any dependency on node public IPs (the netpol
|
||||
# node-IP list is OVH-stale). Host /proc, /sys and / are bind-mounted read-only
|
||||
# so the filesystem/memory/load collectors read the real host, not the pod ns.
|
||||
# Added 2026-06-08 to close RUNBOOK §11.1 gap #9 (node disk/mem were unmonitored).
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: node-exporter
|
||||
namespace: honeydue
|
||||
labels:
|
||||
app.kubernetes.io/name: node-exporter
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: node-exporter
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: node-exporter
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
spec:
|
||||
# Run on every node, including any tainted control-plane nodes.
|
||||
tolerations:
|
||||
- operator: Exists
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 65534
|
||||
seccompProfile:
|
||||
type: RuntimeDefault
|
||||
containers:
|
||||
- name: node-exporter
|
||||
image: quay.io/prometheus/node-exporter:v1.8.2 # TODO digest-pin (audit K3S-F14)
|
||||
imagePullPolicy: IfNotPresent
|
||||
args:
|
||||
- --path.procfs=/host/proc
|
||||
- --path.sysfs=/host/sys
|
||||
- --path.rootfs=/host/root
|
||||
# Only report real host mounts; drop the kubelet/container churn.
|
||||
- --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run|var/lib/kubelet/.+|var/lib/docker/.+|var/lib/containerd/.+)($|/)
|
||||
- --collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$
|
||||
- --no-collector.wifi
|
||||
- --no-collector.hwmon
|
||||
- --web.listen-address=:9100
|
||||
ports:
|
||||
- name: metrics
|
||||
containerPort: 9100
|
||||
protocol: TCP
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
readOnlyRootFilesystem: true
|
||||
capabilities:
|
||||
drop: ["ALL"]
|
||||
resources:
|
||||
requests:
|
||||
cpu: 30m
|
||||
memory: 32Mi
|
||||
limits:
|
||||
cpu: 200m
|
||||
memory: 128Mi
|
||||
volumeMounts:
|
||||
- name: proc
|
||||
mountPath: /host/proc
|
||||
readOnly: true
|
||||
- name: sys
|
||||
mountPath: /host/sys
|
||||
readOnly: true
|
||||
- name: root
|
||||
mountPath: /host/root
|
||||
mountPropagation: HostToContainer
|
||||
readOnly: true
|
||||
volumes:
|
||||
- name: proc
|
||||
hostPath:
|
||||
path: /proc
|
||||
- name: sys
|
||||
hostPath:
|
||||
path: /sys
|
||||
- name: root
|
||||
hostPath:
|
||||
path: /
|
||||
---
|
||||
# default-deny-all blocks ingress; allow vmagent to scrape :9100.
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: NetworkPolicy
|
||||
metadata:
|
||||
name: allow-ingress-to-node-exporter
|
||||
namespace: honeydue
|
||||
spec:
|
||||
podSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: node-exporter
|
||||
policyTypes:
|
||||
- Ingress
|
||||
ingress:
|
||||
- from:
|
||||
- podSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: vmagent
|
||||
ports:
|
||||
- port: 9100
|
||||
protocol: TCP
|
||||
---
|
||||
# vmagent's existing egress policy only opens :8000/:8080 to the pod CIDR.
|
||||
# Additive policy (NetworkPolicies are OR'd) opening :9100 for the node-exporter
|
||||
# scrape — leaves the working allow-egress-from-vmagent policy untouched.
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: NetworkPolicy
|
||||
metadata:
|
||||
name: allow-egress-from-vmagent-to-node-exporter
|
||||
namespace: honeydue
|
||||
spec:
|
||||
podSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: vmagent
|
||||
policyTypes:
|
||||
- Egress
|
||||
egress:
|
||||
- to:
|
||||
- ipBlock:
|
||||
cidr: 10.42.0.0/16
|
||||
ports:
|
||||
- port: 9100
|
||||
protocol: TCP
|
||||
@@ -0,0 +1,289 @@
|
||||
# vmagent — scrapes Prometheus /metrics from in-cluster services and
|
||||
# remote-writes them to https://obs.88oakapps.com/api/v1/write
|
||||
# (VictoriaMetrics on 88oakappsUpdate, fronted by Cloudflare + nginx
|
||||
# bearer-token auth). Single replica is fine — vmagent buffers locally
|
||||
# during transient remote outages.
|
||||
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: vmagent-config
|
||||
namespace: honeydue
|
||||
labels:
|
||||
app.kubernetes.io/name: vmagent
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
data:
|
||||
scrape.yaml: |
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
external_labels:
|
||||
cluster: honeydue-k3s
|
||||
environment: prod
|
||||
|
||||
scrape_configs:
|
||||
# honeyDue Go API — exposes /metrics on :8000
|
||||
- job_name: api
|
||||
kubernetes_sd_configs:
|
||||
- role: pod
|
||||
namespaces:
|
||||
names: [honeydue]
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
|
||||
action: keep
|
||||
regex: api
|
||||
- source_labels: [__meta_kubernetes_pod_container_port_number]
|
||||
action: keep
|
||||
regex: "8000"
|
||||
- source_labels: [__meta_kubernetes_pod_name]
|
||||
target_label: pod
|
||||
- source_labels: [__meta_kubernetes_pod_node_name]
|
||||
target_label: node
|
||||
- target_label: service
|
||||
replacement: api
|
||||
|
||||
# kube-state-metrics — cluster object state (kube_pod_*, kube_deployment_*,
|
||||
# etc.) needed for Grafana panels that count pods/replicas/etc.
|
||||
- job_name: kube-state-metrics
|
||||
kubernetes_sd_configs:
|
||||
- role: endpoints
|
||||
namespaces:
|
||||
names: [kube-system]
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name]
|
||||
action: keep
|
||||
regex: kube-state-metrics
|
||||
- source_labels: [__meta_kubernetes_endpoint_port_name]
|
||||
action: keep
|
||||
regex: http-metrics
|
||||
|
||||
# node-exporter — per-node host metrics (node_filesystem_*, node_memory_*,
|
||||
# node_load*). Pod-networked DaemonSet scraped on :9100 over the pod CIDR.
|
||||
- job_name: node-exporter
|
||||
kubernetes_sd_configs:
|
||||
- role: pod
|
||||
namespaces:
|
||||
names: [honeydue]
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
|
||||
action: keep
|
||||
regex: node-exporter
|
||||
- source_labels: [__meta_kubernetes_pod_container_port_number]
|
||||
action: keep
|
||||
regex: "9100"
|
||||
- source_labels: [__meta_kubernetes_pod_name]
|
||||
target_label: pod
|
||||
- source_labels: [__meta_kubernetes_pod_node_name]
|
||||
target_label: node
|
||||
- target_label: service
|
||||
replacement: node-exporter
|
||||
|
||||
# honeyDue worker — exposes /metrics on :6060 (apns/fcm/asynq/cache series).
|
||||
- job_name: worker
|
||||
kubernetes_sd_configs:
|
||||
- role: pod
|
||||
namespaces:
|
||||
names: [honeydue]
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
|
||||
action: keep
|
||||
regex: worker
|
||||
- source_labels: [__meta_kubernetes_pod_container_port_number]
|
||||
action: keep
|
||||
regex: "6060"
|
||||
- source_labels: [__meta_kubernetes_pod_name]
|
||||
target_label: pod
|
||||
- source_labels: [__meta_kubernetes_pod_node_name]
|
||||
target_label: node
|
||||
- target_label: service
|
||||
replacement: worker
|
||||
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: vmagent-remote-write
|
||||
namespace: honeydue
|
||||
labels:
|
||||
app.kubernetes.io/name: vmagent
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
type: Opaque
|
||||
stringData:
|
||||
# Bearer token for obs.88oakapps.com. Provisioned at deploy time from
|
||||
# deploy/prod.env (OBS_INGEST_TOKEN). The cluster-side token must match
|
||||
# the token in /etc/honeydue-obs/ingest_token on 88oakappsUpdate.
|
||||
bearer_token: TOKEN_PLACEHOLDER
|
||||
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: Role
|
||||
metadata:
|
||||
name: vmagent
|
||||
namespace: honeydue
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: [pods, services, endpoints]
|
||||
verbs: [get, list, watch]
|
||||
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: vmagent
|
||||
namespace: honeydue
|
||||
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: RoleBinding
|
||||
metadata:
|
||||
name: vmagent
|
||||
namespace: honeydue
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: vmagent
|
||||
namespace: honeydue
|
||||
roleRef:
|
||||
kind: Role
|
||||
name: vmagent
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
|
||||
---
|
||||
# Allow vmagent to discover the kube-state-metrics Service/Endpoints in
|
||||
# kube-system so the kube-state-metrics scrape job can find its target.
|
||||
# Cross-namespace SD needs an explicit RoleBinding here.
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: Role
|
||||
metadata:
|
||||
name: vmagent-kube-system
|
||||
namespace: kube-system
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: [services, endpoints, pods]
|
||||
verbs: [get, list, watch]
|
||||
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: RoleBinding
|
||||
metadata:
|
||||
name: vmagent-kube-system
|
||||
namespace: kube-system
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: vmagent
|
||||
namespace: honeydue
|
||||
roleRef:
|
||||
kind: Role
|
||||
name: vmagent-kube-system
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: vmagent
|
||||
namespace: honeydue
|
||||
labels:
|
||||
app.kubernetes.io/name: vmagent
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
spec:
|
||||
replicas: 1
|
||||
strategy:
|
||||
type: Recreate
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: vmagent
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: vmagent
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
spec:
|
||||
serviceAccountName: vmagent
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 1000
|
||||
fsGroup: 1000
|
||||
seccompProfile:
|
||||
type: RuntimeDefault
|
||||
containers:
|
||||
- name: vmagent
|
||||
# Pinned by digest (audit K3S-F14).
|
||||
image: victoriametrics/vmagent:v1.106.1@sha256:90208a667c0baf65f7536b92a84c40b6e35ffe8e88bda7e4447b97b06c6ba6b8
|
||||
imagePullPolicy: IfNotPresent # audit CODE-L4 — explicit
|
||||
# Container-level hardening (audit F7) — matches the other 5
|
||||
# workloads. vmagent only writes to the /tmp/vmagent emptyDir
|
||||
# (its remoteWrite buffer), so a read-only root filesystem holds.
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
readOnlyRootFilesystem: true
|
||||
capabilities:
|
||||
drop: ["ALL"]
|
||||
args:
|
||||
- "-promscrape.config=/etc/vmagent/scrape.yaml"
|
||||
- "-remoteWrite.url=https://obs.88oakapps.com/api/v1/write"
|
||||
- "-remoteWrite.bearerTokenFile=/etc/vmagent-secrets/bearer_token"
|
||||
- "-remoteWrite.tmpDataPath=/tmp/vmagent"
|
||||
- "-remoteWrite.maxDiskUsagePerURL=512MB"
|
||||
- "-loggerLevel=INFO"
|
||||
ports:
|
||||
- containerPort: 8429
|
||||
name: http
|
||||
resources:
|
||||
requests:
|
||||
cpu: 25m
|
||||
memory: 64Mi
|
||||
limits:
|
||||
cpu: 200m
|
||||
memory: 256Mi
|
||||
volumeMounts:
|
||||
- name: config
|
||||
mountPath: /etc/vmagent
|
||||
readOnly: true
|
||||
- name: secrets
|
||||
mountPath: /etc/vmagent-secrets
|
||||
readOnly: true
|
||||
- name: buffer
|
||||
mountPath: /tmp/vmagent
|
||||
# Process startup gate. /-/healthy returns 200 once vmagent has
|
||||
# parsed config — gives the agent up to 2 min to come up before
|
||||
# liveness starts evaluating.
|
||||
startupProbe:
|
||||
httpGet:
|
||||
path: /-/healthy
|
||||
port: http
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
failureThreshold: 24
|
||||
# Real liveness check: are scrapes actually succeeding?
|
||||
# /-/healthy was the old probe and returned 200 for 17 days even
|
||||
# while vmagent had zero healthy targets (stale k8s SD watch).
|
||||
# This exec probe queries vmagent's own targets API and fails if
|
||||
# NO target is in state "up". Five consecutive failures (5 × 120s ≈ 10 min)
|
||||
# → kubelet kills the pod → fresh SD watch.
|
||||
livenessProbe:
|
||||
exec:
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- 'n=$(wget -qO- -T 4 http://localhost:8429/api/v1/targets 2>/dev/null | grep -c ''"health":"up"''); [ "$n" -gt 0 ]'
|
||||
initialDelaySeconds: 180
|
||||
periodSeconds: 120
|
||||
timeoutSeconds: 5
|
||||
failureThreshold: 5
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /-/healthy
|
||||
port: http
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 10
|
||||
volumes:
|
||||
- name: config
|
||||
configMap:
|
||||
name: vmagent-config
|
||||
- name: secrets
|
||||
secret:
|
||||
secretName: vmagent-remote-write
|
||||
defaultMode: 0400
|
||||
- name: buffer
|
||||
emptyDir:
|
||||
sizeLimit: 512Mi
|
||||
@@ -20,6 +20,9 @@ spec:
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
spec:
|
||||
serviceAccountName: redis
|
||||
# Explicit pod-level opt-out (audit F11) — defense-in-depth on top of
|
||||
# the ServiceAccount-level setting in rbac.yaml.
|
||||
automountServiceAccountToken: false
|
||||
nodeSelector:
|
||||
honeydue/redis: "true"
|
||||
securityContext:
|
||||
@@ -31,12 +34,18 @@ spec:
|
||||
type: RuntimeDefault
|
||||
containers:
|
||||
- name: redis
|
||||
image: redis:7-alpine
|
||||
# Pinned by digest (audit K3S-F14) — redis:7-alpine is 7.4.9-alpine.
|
||||
image: redis:7-alpine@sha256:6ab0b6e7381779332f97b8ca76193e45b0756f38d4c0dcda72dbb3c32061ab99
|
||||
imagePullPolicy: IfNotPresent # audit CODE-L4 — explicit
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- |
|
||||
ARGS="--appendonly yes --appendfsync everysec --maxmemory 256mb --maxmemory-policy noeviction"
|
||||
# allkeys-lru: under memory pressure, evict the least-recently-used key.
|
||||
# honeyDue uses Redis as a cache + asynq queue. The cache layer falls
|
||||
# through to DB on miss, so cache eviction is graceful. asynq keys are usually
|
||||
# touched more recently than stale cache entries, so they tend to be evicted last — but allkeys-lru ignores TTLs and does not guarantee this.
|
||||
ARGS="--appendonly yes --appendfsync everysec --maxmemory 256mb --maxmemory-policy allkeys-lru"
|
||||
if [ -n "$REDIS_PASSWORD" ]; then
|
||||
ARGS="$ARGS --requirepass $REDIS_PASSWORD"
|
||||
fi
|
||||
|
||||
@@ -23,8 +23,11 @@ spec:
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
spec:
|
||||
serviceAccountName: web
|
||||
# Explicit pod-level opt-out (audit F11) — defense-in-depth on top of
|
||||
# the ServiceAccount-level setting in rbac.yaml.
|
||||
automountServiceAccountToken: false
|
||||
imagePullSecrets:
|
||||
- name: ghcr-credentials
|
||||
- name: gitea-credentials
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 1001
|
||||
@@ -43,6 +46,7 @@ spec:
|
||||
containers:
|
||||
- name: web
|
||||
image: IMAGE_PLACEHOLDER # Replaced by 03-deploy.sh or manual sed
|
||||
imagePullPolicy: IfNotPresent # audit CODE-L4 — explicit; images are SHA/digest-pinned
|
||||
ports:
|
||||
- containerPort: 3000
|
||||
protocol: TCP
|
||||
|
||||
@@ -27,8 +27,11 @@ spec:
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
spec:
|
||||
serviceAccountName: worker
|
||||
# Explicit pod-level opt-out (audit F11) — defense-in-depth on top of
|
||||
# the ServiceAccount-level setting in rbac.yaml.
|
||||
automountServiceAccountToken: false
|
||||
imagePullSecrets:
|
||||
- name: ghcr-credentials
|
||||
- name: gitea-credentials
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 1000
|
||||
@@ -39,6 +42,12 @@ spec:
|
||||
containers:
|
||||
- name: worker
|
||||
image: IMAGE_PLACEHOLDER # Replaced by 03-deploy.sh
|
||||
imagePullPolicy: IfNotPresent # audit CODE-L4 — explicit; images are SHA/digest-pinned
|
||||
ports:
|
||||
# health + Prometheus /metrics (in-cluster only; scraped by vmagent)
|
||||
- name: metrics
|
||||
containerPort: 6060
|
||||
protocol: TCP
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
readOnlyRootFilesystem: true
|
||||
@@ -47,34 +56,16 @@ spec:
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: honeydue-config
|
||||
env:
|
||||
- name: POSTGRES_PASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: honeydue-secrets
|
||||
key: POSTGRES_PASSWORD
|
||||
- name: SECRET_KEY
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: honeydue-secrets
|
||||
key: SECRET_KEY
|
||||
- name: EMAIL_HOST_PASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: honeydue-secrets
|
||||
key: EMAIL_HOST_PASSWORD
|
||||
- name: FCM_SERVER_KEY
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: honeydue-secrets
|
||||
key: FCM_SERVER_KEY
|
||||
- name: REDIS_PASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: honeydue-secrets
|
||||
key: REDIS_PASSWORD
|
||||
optional: true
|
||||
# Audit CODE-F8: secrets are NOT injected as environment variables.
|
||||
# Env vars are readable for the life of the pod via /proc/<pid>/environ
|
||||
# and leak into crash dumps / child processes. honeydue-secrets is
|
||||
# mounted read-only at /etc/honeydue/secrets (mode 0400) and the Go
|
||||
# config layer (config.loadFileSecrets) reads each key from its file.
|
||||
# Non-secret config still arrives via the configMapRef above.
|
||||
volumeMounts:
|
||||
- name: app-secrets
|
||||
mountPath: /etc/honeydue/secrets
|
||||
readOnly: true
|
||||
- name: apns-key
|
||||
mountPath: /secrets/apns
|
||||
readOnly: true
|
||||
@@ -94,6 +85,12 @@ spec:
|
||||
periodSeconds: 30
|
||||
timeoutSeconds: 5
|
||||
volumes:
|
||||
# Audit CODE-F8: the whole honeydue-secrets Secret, projected as files.
|
||||
# defaultMode 0400 → owner-read only. NOTE(review): Secret volume files are owned by root, so a non-root runAsUser (1000) typically needs fsGroup to read them — confirm.
|
||||
- name: app-secrets
|
||||
secret:
|
||||
secretName: honeydue-secrets
|
||||
defaultMode: 0400
|
||||
- name: apns-key
|
||||
secret:
|
||||
secretName: honeydue-apns-key
|
||||
@@ -103,3 +100,46 @@ spec:
|
||||
- name: tmp
|
||||
emptyDir:
|
||||
sizeLimit: 64Mi
|
||||
---
|
||||
# Allow vmagent to scrape the worker's /metrics on :6060 (default-deny-all is in
|
||||
# force; the worker otherwise receives no ingress). Additive — see node-exporter.
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: NetworkPolicy
|
||||
metadata:
|
||||
name: allow-ingress-to-worker-metrics
|
||||
namespace: honeydue
|
||||
spec:
|
||||
podSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: worker
|
||||
policyTypes:
|
||||
- Ingress
|
||||
ingress:
|
||||
- from:
|
||||
- podSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: vmagent
|
||||
ports:
|
||||
- port: 6060
|
||||
protocol: TCP
|
||||
---
|
||||
# vmagent's base egress policy only opens :8000/:8080 to the pod CIDR; this
|
||||
# additive policy opens :6060 for the worker scrape (leaves the base untouched).
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: NetworkPolicy
|
||||
metadata:
|
||||
name: allow-egress-from-vmagent-to-worker
|
||||
namespace: honeydue
|
||||
spec:
|
||||
podSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: vmagent
|
||||
policyTypes:
|
||||
- Egress
|
||||
egress:
|
||||
- to:
|
||||
- ipBlock:
|
||||
cidr: 10.42.0.0/16
|
||||
ports:
|
||||
- port: 6060
|
||||
protocol: TCP
|
||||
|
||||
@@ -68,6 +68,43 @@ SECRET_ARGS=(
|
||||
if [[ -n "${REDIS_PASSWORD}" ]]; then
|
||||
log " Including REDIS_PASSWORD in secrets"
|
||||
SECRET_ARGS+=(--from-literal="REDIS_PASSWORD=${REDIS_PASSWORD}")
|
||||
else
|
||||
# Audit K3S-F1 (CRITICAL) / MEDIUM-4: refuse to deploy with an unauthenticated
|
||||
# Redis. A previous version only warned here, which let a deploy from an
|
||||
# unedited config.yaml silently bring Redis up with no password.
|
||||
die "redis.password is empty in config.yaml — refusing to deploy: Redis would run with NO authentication (audit K3S-F1). Set a strong value, e.g.: openssl rand -base64 32"
|
||||
fi
|
||||
|
||||
# B2 (Backblaze) object-storage credentials. The api/worker manifests
|
||||
# reference B2_KEY_ID / B2_APP_KEY as required secret keys, so honeydue-secrets
|
||||
# MUST carry them or those pods fail to start. Sourced from config.yaml so the
|
||||
# script and the manifests no longer drift (was a latent gap before 2026-05-16).
|
||||
B2_KEY_ID_VAL="$(cfg storage.b2_key_id 2>/dev/null || true)"
|
||||
B2_APP_KEY_VAL="$(cfg storage.b2_app_key 2>/dev/null || true)"
|
||||
if [[ -n "${B2_KEY_ID_VAL}" && -n "${B2_APP_KEY_VAL}" ]]; then
|
||||
log " Including B2_KEY_ID / B2_APP_KEY in secrets"
|
||||
SECRET_ARGS+=(--from-literal="B2_KEY_ID=${B2_KEY_ID_VAL}")
|
||||
SECRET_ARGS+=(--from-literal="B2_APP_KEY=${B2_APP_KEY_VAL}")
|
||||
else
|
||||
warn "storage.b2_key_id / b2_app_key not set in config.yaml — B2 uploads will be disabled."
|
||||
fi
|
||||
|
||||
# Observability ingest credentials live in deploy/prod.env (gitignored) so
|
||||
# the values aren't checked into config.yaml. Skipped silently when the
|
||||
# file or keys are absent — the api/worker manifests mark these env vars
|
||||
# optional, so the deployment still rolls without traces.
|
||||
PROD_ENV_FILE="${DEPLOY_DIR}/../deploy/prod.env"
|
||||
if [[ -f "${PROD_ENV_FILE}" ]]; then
|
||||
OBS_TOKEN_VAL="$(grep -E '^OBS_INGEST_TOKEN=' "${PROD_ENV_FILE}" 2>/dev/null | cut -d= -f2- || true)"
|
||||
OBS_URL_VAL="$(grep -E '^OBS_TRACES_URL=' "${PROD_ENV_FILE}" 2>/dev/null | cut -d= -f2- || true)"
|
||||
if [[ -n "${OBS_TOKEN_VAL}" ]]; then
|
||||
log " Including OBS_INGEST_TOKEN in secrets"
|
||||
SECRET_ARGS+=(--from-literal="OBS_INGEST_TOKEN=${OBS_TOKEN_VAL}")
|
||||
fi
|
||||
if [[ -n "${OBS_URL_VAL}" ]]; then
|
||||
log " Including OBS_TRACES_URL in secrets"
|
||||
SECRET_ARGS+=(--from-literal="OBS_TRACES_URL=${OBS_URL_VAL}")
|
||||
fi
|
||||
fi
|
||||
|
||||
kubectl create secret generic honeydue-secrets \
|
||||
@@ -82,22 +119,24 @@ kubectl create secret generic honeydue-apns-key \
|
||||
--from-file="apns_auth_key.p8=${SECRETS_DIR}/apns_auth_key.p8" \
|
||||
--dry-run=client -o yaml | kubectl apply -f -
|
||||
|
||||
# --- Create GHCR registry credentials ---
|
||||
# --- Create container registry credentials ---
|
||||
# Secret name is gitea-credentials (audit F6): the registry is self-hosted
|
||||
# Gitea, not GHCR. Every deployment manifest references this same name.
|
||||
|
||||
REGISTRY_SERVER="$(cfg registry.server)"
|
||||
REGISTRY_USER="$(cfg registry.username)"
|
||||
REGISTRY_TOKEN="$(cfg registry.token)"
|
||||
|
||||
if [[ -n "${REGISTRY_SERVER}" && -n "${REGISTRY_USER}" && -n "${REGISTRY_TOKEN}" ]]; then
|
||||
log "Creating ghcr-credentials..."
|
||||
kubectl create secret docker-registry ghcr-credentials \
|
||||
log "Creating gitea-credentials..."
|
||||
kubectl create secret docker-registry gitea-credentials \
|
||||
--namespace="${NAMESPACE}" \
|
||||
--docker-server="${REGISTRY_SERVER}" \
|
||||
--docker-username="${REGISTRY_USER}" \
|
||||
--docker-password="${REGISTRY_TOKEN}" \
|
||||
--dry-run=client -o yaml | kubectl apply -f -
|
||||
else
|
||||
warn "Registry credentials incomplete in config.yaml — skipping ghcr-credentials."
|
||||
warn "Registry credentials incomplete in config.yaml — skipping gitea-credentials."
|
||||
fi
|
||||
|
||||
# --- Create Cloudflare origin cert ---
|
||||
@@ -114,7 +153,8 @@ kubectl create secret tls cloudflare-origin-cert \
|
||||
if [[ -n "${ADMIN_AUTH_USER}" && -n "${ADMIN_AUTH_PASSWORD}" ]]; then
|
||||
command -v htpasswd >/dev/null 2>&1 || die "Missing: htpasswd (install apache2-utils)"
|
||||
log "Creating admin-basic-auth secret..."
|
||||
HTPASSWD="$(htpasswd -nb "${ADMIN_AUTH_USER}" "${ADMIN_AUTH_PASSWORD}")"
|
||||
# -B forces bcrypt (Traefik BasicAuth supports it; avoids weak apr1-MD5).
|
||||
HTPASSWD="$(htpasswd -nbB "${ADMIN_AUTH_USER}" "${ADMIN_AUTH_PASSWORD}")"
|
||||
kubectl create secret generic admin-basic-auth \
|
||||
--namespace="${NAMESPACE}" \
|
||||
--from-literal=users="${HTPASSWD}" \
|
||||
@@ -124,6 +164,35 @@ else
|
||||
warn "Admin panel will NOT have basic auth protection."
|
||||
fi
|
||||
|
||||
# --- Create Kratos secrets (Ory Kratos identity service) ---
|
||||
# Created only when config.yaml has a kratos.dsn. Until then 03-deploy.sh skips
|
||||
# the Kratos deploy entirely, so the existing stack is unaffected.
|
||||
|
||||
KRATOS_DSN="$(cfg kratos.dsn 2>/dev/null || true)"
|
||||
if [[ -n "${KRATOS_DSN}" ]]; then
|
||||
log "Creating kratos-secrets..."
|
||||
KR_COOKIE="$(cfg kratos.secrets_cookie 2>/dev/null || true)"
|
||||
KR_CIPHER="$(cfg kratos.secrets_cipher 2>/dev/null || true)"
|
||||
KR_SMTP="$(cfg kratos.smtp_connection_uri 2>/dev/null || true)"
|
||||
KR_GOOGLE="$(cfg kratos.google_client_secret 2>/dev/null || true)"
|
||||
KR_APPLE="$(cfg kratos.apple_private_key 2>/dev/null || true)"
|
||||
[[ -n "${KR_COOKIE}" && -n "${KR_CIPHER}" ]] \
|
||||
|| die "kratos.secrets_cookie / secrets_cipher must be set (generate once: openssl rand -hex 16)"
|
||||
[[ ${#KR_CIPHER} -eq 32 ]] \
|
||||
|| die "kratos.secrets_cipher must be exactly 32 characters (openssl rand -hex 16)"
|
||||
kubectl create secret generic kratos-secrets \
|
||||
--namespace="${NAMESPACE}" \
|
||||
--from-literal="dsn=${KRATOS_DSN}" \
|
||||
--from-literal="secrets_cookie=${KR_COOKIE}" \
|
||||
--from-literal="secrets_cipher=${KR_CIPHER}" \
|
||||
--from-literal="smtp_connection_uri=${KR_SMTP}" \
|
||||
--from-literal="google_client_secret=${KR_GOOGLE}" \
|
||||
--from-literal="apple_private_key=${KR_APPLE}" \
|
||||
--dry-run=client -o yaml | kubectl apply -f -
|
||||
else
|
||||
warn "config.yaml has no kratos.dsn — skipping kratos-secrets (Kratos not yet configured)."
|
||||
fi
|
||||
|
||||
# --- Done ---
|
||||
|
||||
log ""
|
||||
|
||||
+146
-12
@@ -81,20 +81,24 @@ if [[ "${SKIP_BUILD}" == "false" ]]; then
|
||||
log "Logging in to ${REGISTRY_SERVER}..."
|
||||
printf '%s' "${REGISTRY_TOKEN}" | docker login "${REGISTRY_SERVER}" -u "${REGISTRY_USER}" --password-stdin >/dev/null
|
||||
|
||||
log "Building API image: ${API_IMAGE}"
|
||||
docker build --target api -t "${API_IMAGE}" "${REPO_DIR}"
|
||||
# k3s nodes are linux/amd64 (Hetzner CX). Force the build platform so
|
||||
# local arm64 Macs don't push images that crash with "exec format error".
|
||||
BUILD_PLATFORM="linux/amd64"
|
||||
|
||||
log "Building Worker image: ${WORKER_IMAGE}"
|
||||
docker build --target worker -t "${WORKER_IMAGE}" "${REPO_DIR}"
|
||||
log "Building API image: ${API_IMAGE} (${BUILD_PLATFORM})"
|
||||
docker build --platform "${BUILD_PLATFORM}" --target api -t "${API_IMAGE}" "${REPO_DIR}"
|
||||
|
||||
log "Building Admin image: ${ADMIN_IMAGE} (NEXT_PUBLIC_API_URL=${ADMIN_API_URL})"
|
||||
docker build --target admin \
|
||||
log "Building Worker image: ${WORKER_IMAGE} (${BUILD_PLATFORM})"
|
||||
docker build --platform "${BUILD_PLATFORM}" --target worker -t "${WORKER_IMAGE}" "${REPO_DIR}"
|
||||
|
||||
log "Building Admin image: ${ADMIN_IMAGE} (${BUILD_PLATFORM}, NEXT_PUBLIC_API_URL=${ADMIN_API_URL})"
|
||||
docker build --platform "${BUILD_PLATFORM}" --target admin \
|
||||
--build-arg "NEXT_PUBLIC_API_URL=${ADMIN_API_URL}" \
|
||||
-t "${ADMIN_IMAGE}" "${REPO_DIR}"
|
||||
|
||||
if [[ -n "${WEB_REPO_DIR}" && -f "${WEB_REPO_DIR}/Dockerfile" ]]; then
|
||||
log "Building Web image: ${WEB_IMAGE} (NEXT_PUBLIC_API_URL=${WEB_API_URL})"
|
||||
docker build \
|
||||
log "Building Web image: ${WEB_IMAGE} (${BUILD_PLATFORM}, NEXT_PUBLIC_API_URL=${WEB_API_URL})"
|
||||
docker build --platform "${BUILD_PLATFORM}" \
|
||||
--build-arg "NEXT_PUBLIC_API_URL=${WEB_API_URL}" \
|
||||
--build-arg "NEXT_PUBLIC_POSTHOG_KEY=${NEXT_PUBLIC_POSTHOG_KEY}" \
|
||||
--build-arg "NEXT_PUBLIC_POSTHOG_HOST=${NEXT_PUBLIC_POSTHOG_HOST}" \
|
||||
@@ -124,6 +128,56 @@ else
|
||||
warn "Skipping build. Using images for tag: ${DEPLOY_TAG}"
|
||||
fi
|
||||
|
||||
# --- Resolve immutable image digests (audit F5) ---
|
||||
# A short-SHA tag is mutable — anyone who can push to the registry can
|
||||
# overwrite it, and imagePullPolicy then pulls the new bits silently. We
|
||||
# deploy by @sha256: digest instead, pinning the exact image that was just
|
||||
# built and pushed. `docker push` populates RepoDigests; with --skip-build
|
||||
# (no local image) resolve_ref falls back to the tag.
|
||||
# resolve_ref IMAGE — print the reference to deploy for IMAGE on stdout.
#   $1: image name as tagged locally (e.g. the value of API_IMAGE).
# Prints the first "@sha256:" entry from the image's RepoDigests when docker
# can inspect it; otherwise warns and prints IMAGE unchanged. The output has
# no trailing newline.
# NOTE(review): callers capture stdout via $(...), so warn presumably writes
# to stderr — confirm, or the warning text would end up inside the reference.
resolve_ref() {
|
||||
local img="$1" digest
|
||||
# 2>/dev/null and `|| true`: a failed inspect or no matching line leaves
# digest empty instead of producing a non-zero status.
digest="$(docker inspect --format='{{range .RepoDigests}}{{println .}}{{end}}' "${img}" 2>/dev/null | grep -m1 '@sha256:' || true)"
|
||||
if [[ -n "${digest}" ]]; then
|
||||
printf '%s' "${digest}"
|
||||
else
|
||||
# No digest available: fall back to the tag the caller passed in.
warn "could not resolve a digest for ${img} — deploying by mutable tag"
|
||||
printf '%s' "${img}"
|
||||
fi
|
||||
}
|
||||
API_REF="$(resolve_ref "${API_IMAGE}")"
|
||||
WORKER_REF="$(resolve_ref "${WORKER_IMAGE}")"
|
||||
ADMIN_REF="$(resolve_ref "${ADMIN_IMAGE}")"
|
||||
WEB_REF="$(resolve_ref "${WEB_IMAGE}")"
|
||||
log "Deploying by digest:"
|
||||
log " API: ${API_REF}"
|
||||
log " Worker: ${WORKER_REF}"
|
||||
log " Admin: ${ADMIN_REF}"
|
||||
|
||||
# --- Image scan + signing (audit CODE-L5) ---
|
||||
# Both steps are best-effort: the deploy does NOT fail if the tools are
|
||||
# absent, so an operator who has not set up cosign/trivy yet is not blocked.
|
||||
# Install trivy + cosign and export COSIGN_KEY to enable them (failures only warn). Cluster-side
|
||||
# admission verification (Kyverno/Connaisseur) is a separate operator step.
|
||||
if [[ "${SKIP_BUILD}" == "false" ]]; then
|
||||
# Best-effort Trivy scan of the API, worker and admin images. Findings are
# reported as warnings and never abort the deploy; a missing trivy binary
# only skips the scan.
if command -v trivy >/dev/null 2>&1; then
|
||||
log "Scanning images with Trivy (HIGH,CRITICAL)..."
|
||||
for img in "${API_IMAGE}" "${WORKER_IMAGE}" "${ADMIN_IMAGE}"; do
|
||||
# --exit-code 1 makes trivy exit non-zero when HIGH/CRITICAL findings
# exist, so the warning below actually fires; with --exit-code 0 trivy
# exits 0 even when it finds vulnerabilities. The `||` keeps this non-fatal.
trivy image --severity HIGH,CRITICAL --exit-code 1 --quiet "${img}" \
|
||||
|| warn "Trivy reported findings for ${img}"
|
||||
done
|
||||
else
|
||||
warn "trivy not installed — skipping image vulnerability scan (audit L5)"
|
||||
fi
|
||||
if command -v cosign >/dev/null 2>&1 && [[ -n "${COSIGN_KEY:-}" ]]; then
|
||||
log "Signing images with cosign..."
|
||||
for ref in "${API_REF}" "${WORKER_REF}" "${ADMIN_REF}"; do
|
||||
cosign sign --yes --key "${COSIGN_KEY}" "${ref}" || warn "cosign sign failed for ${ref}"
|
||||
done
|
||||
else
|
||||
warn "cosign not configured (need cosign + COSIGN_KEY) — skipping image signing (audit L5)"
|
||||
fi
|
||||
fi
|
||||
|
||||
# --- Generate and apply ConfigMap from config.yaml ---
|
||||
|
||||
log "Generating env from config.yaml..."
|
||||
@@ -142,24 +196,95 @@ kubectl create configmap honeydue-config \
|
||||
log "Applying manifests..."
|
||||
|
||||
kubectl apply -f "${MANIFESTS}/namespace.yaml"
|
||||
|
||||
# NetworkPolicies first — default-deny-all + per-app allow rules.
|
||||
# These MUST be applied; without them the cluster falls back to default-allow
|
||||
# (worse posture) AND the vmagent egress rule for :6443 (which fixes a k3s
|
||||
# post-DNAT enforcement quirk for k8s API discovery) is missing.
|
||||
# See deploy-k3s/RUNBOOK.md ("vmagent SD broken on fresh deploy").
|
||||
kubectl apply -f "${MANIFESTS}/network-policies.yaml"
|
||||
|
||||
kubectl apply -f "${MANIFESTS}/redis/"
|
||||
kubectl apply -f "${MANIFESTS}/ingress/"
|
||||
|
||||
# --- Run migrations BEFORE rolling api/worker ---
|
||||
#
|
||||
# goose-based migration Job. We delete any prior Job (Jobs are immutable —
|
||||
# applying a duplicate name otherwise fails), apply a fresh one with the new
|
||||
# api image (which includes /usr/local/bin/goose and /app/migrations), and
|
||||
# block until it succeeds. A failure aborts the deploy before any new app
|
||||
# pod sees a stale schema.
|
||||
log "Running database migrations (goose Job)..."
|
||||
kubectl delete job honeydue-migrate -n "${NAMESPACE}" --ignore-not-found --wait=true >/dev/null
|
||||
sed "s|image: IMAGE_PLACEHOLDER|image: ${API_REF}|" "${MANIFESTS}/migrate/job.yaml" | kubectl apply -f -
|
||||
if ! kubectl wait --namespace="${NAMESPACE}" --for=condition=complete --timeout=10m job/honeydue-migrate; then
|
||||
warn "migration Job failed — see logs:"
|
||||
kubectl logs -n "${NAMESPACE}" job/honeydue-migrate --tail=200 || true
|
||||
die "migrations did not complete cleanly; aborting deploy"
|
||||
fi
|
||||
log "Migrations applied; proceeding with api/worker rollout"
|
||||
|
||||
# Apply deployments with image substitution
|
||||
sed "s|image: IMAGE_PLACEHOLDER|image: ${API_IMAGE}|" "${MANIFESTS}/api/deployment.yaml" | kubectl apply -f -
|
||||
sed "s|image: IMAGE_PLACEHOLDER|image: ${API_REF}|" "${MANIFESTS}/api/deployment.yaml" | kubectl apply -f -
|
||||
kubectl apply -f "${MANIFESTS}/api/service.yaml"
|
||||
kubectl apply -f "${MANIFESTS}/api/hpa.yaml"
|
||||
|
||||
sed "s|image: IMAGE_PLACEHOLDER|image: ${WORKER_IMAGE}|" "${MANIFESTS}/worker/deployment.yaml" | kubectl apply -f -
|
||||
sed "s|image: IMAGE_PLACEHOLDER|image: ${WORKER_REF}|" "${MANIFESTS}/worker/deployment.yaml" | kubectl apply -f -
|
||||
|
||||
sed "s|image: IMAGE_PLACEHOLDER|image: ${ADMIN_IMAGE}|" "${MANIFESTS}/admin/deployment.yaml" | kubectl apply -f -
|
||||
sed "s|image: IMAGE_PLACEHOLDER|image: ${ADMIN_REF}|" "${MANIFESTS}/admin/deployment.yaml" | kubectl apply -f -
|
||||
kubectl apply -f "${MANIFESTS}/admin/service.yaml"
|
||||
|
||||
if [[ -d "${MANIFESTS}/web" ]]; then
|
||||
sed "s|image: IMAGE_PLACEHOLDER|image: ${WEB_IMAGE}|" "${MANIFESTS}/web/deployment.yaml" | kubectl apply -f -
|
||||
sed "s|image: IMAGE_PLACEHOLDER|image: ${WEB_REF}|" "${MANIFESTS}/web/deployment.yaml" | kubectl apply -f -
|
||||
kubectl apply -f "${MANIFESTS}/web/service.yaml"
|
||||
fi
|
||||
|
||||
# Observability — vmagent scrapes api Pods :8000/metrics + kube-state-metrics
|
||||
# :8080/metrics and remote-writes everything to obs.88oakapps.com. The bearer
|
||||
# token comes from deploy/prod.env so it stays out of the repo; the manifest
|
||||
# holds TOKEN_PLACEHOLDER. kube-state-metrics provides the kube_* metrics
|
||||
# Grafana panels need to count pods, deployments, etc.
|
||||
if [[ -d "${MANIFESTS}/observability" ]]; then
|
||||
# kube-state-metrics — no secrets, plain apply
|
||||
kubectl apply -f "${MANIFESTS}/observability/kube-state-metrics.yaml"
|
||||
|
||||
# vmagent — needs the bearer-token substitution
|
||||
# prod.env lives at the repo's deploy/ dir (sibling of deploy-k3s/), not
|
||||
# under deploy-k3s/. It's gitignored — operator copies values there once.
|
||||
OBS_TOKEN="$(grep -E '^OBS_INGEST_TOKEN=' "${REPO_DIR}/deploy/prod.env" 2>/dev/null | cut -d= -f2- || true)"
|
||||
if [[ -z "${OBS_TOKEN}" ]]; then
|
||||
warn "OBS_INGEST_TOKEN not found in deploy/prod.env — skipping vmagent + alloy-logs apply"
|
||||
else
|
||||
sed "s|TOKEN_PLACEHOLDER|${OBS_TOKEN}|" "${MANIFESTS}/observability/vmagent.yaml" | kubectl apply -f -
|
||||
# alloy-logs — DaemonSet that tails honeydue pod logs and pushes them to
|
||||
# Loki at obs.88oakapps.com. Same OBS_INGEST_TOKEN as vmagent.
|
||||
if [[ -f "${MANIFESTS}/observability/alloy-logs.yaml" ]]; then
|
||||
sed "s|TOKEN_PLACEHOLDER|${OBS_TOKEN}|" "${MANIFESTS}/observability/alloy-logs.yaml" | kubectl apply -f -
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
# --- Ory Kratos (identity service) ---
|
||||
# Applied only when kratos-secrets exists — i.e. the operator has completed the
|
||||
# Kratos prerequisites in deploy-k3s/manifests/kratos/README.md. Otherwise
|
||||
# skipped, so the existing stack deploys unaffected.
|
||||
if kubectl -n "${NAMESPACE}" get secret kratos-secrets >/dev/null 2>&1; then
|
||||
log "Deploying Ory Kratos..."
|
||||
kubectl apply -f "${MANIFESTS}/kratos/configmap.yaml"
|
||||
# The migrate Job is immutable — delete any prior run, then apply + wait.
|
||||
kubectl delete job kratos-migrate -n "${NAMESPACE}" --ignore-not-found --wait=true >/dev/null
|
||||
kubectl apply -f "${MANIFESTS}/kratos/migrate-job.yaml"
|
||||
if ! kubectl wait --namespace="${NAMESPACE}" --for=condition=complete --timeout=5m job/kratos-migrate; then
|
||||
warn "Kratos migration Job failed — logs:"
|
||||
kubectl logs -n "${NAMESPACE}" job/kratos-migrate --tail=100 || true
|
||||
die "aborting: Kratos schema migration failed"
|
||||
fi
|
||||
kubectl apply -f "${MANIFESTS}/kratos/kratos.yaml"
|
||||
kubectl apply -f "${MANIFESTS}/kratos/ingress.yaml"
|
||||
else
|
||||
log "kratos-secrets not present — skipping Kratos deploy (see manifests/kratos/README.md)."
|
||||
fi
|
||||
|
||||
# --- Wait for rollouts ---
|
||||
|
||||
log "Waiting for rollouts..."
|
||||
@@ -171,6 +296,15 @@ kubectl rollout status deployment/admin -n "${NAMESPACE}" --timeout=300s
|
||||
if [[ -d "${MANIFESTS}/web" ]]; then
|
||||
kubectl rollout status deployment/web -n "${NAMESPACE}" --timeout=300s
|
||||
fi
|
||||
if kubectl -n "${NAMESPACE}" get deployment vmagent >/dev/null 2>&1; then
|
||||
kubectl rollout status deployment/vmagent -n "${NAMESPACE}" --timeout=120s
|
||||
fi
|
||||
if kubectl -n "${NAMESPACE}" get daemonset alloy-logs >/dev/null 2>&1; then
|
||||
kubectl rollout status daemonset/alloy-logs -n "${NAMESPACE}" --timeout=120s
|
||||
fi
|
||||
if kubectl -n "${NAMESPACE}" get deployment kratos >/dev/null 2>&1; then
|
||||
kubectl rollout status deployment/kratos -n "${NAMESPACE}" --timeout=180s
|
||||
fi
|
||||
|
||||
# --- Done ---
|
||||
|
||||
|
||||
@@ -100,7 +100,7 @@ lines = [
|
||||
# API
|
||||
'DEBUG=false',
|
||||
f\"ALLOWED_HOSTS={d['api']},{d['base']}\",
|
||||
f\"CORS_ALLOWED_ORIGINS=https://{d['base']},https://{d['admin']}\",
|
||||
f\"CORS_ALLOWED_ORIGINS=https://{d['base']},https://{d['admin']},https://{d.get('app', 'app.' + d['base'])}\",
|
||||
'TIMEZONE=UTC',
|
||||
f\"BASE_URL=https://{d['base']}\",
|
||||
'PORT=8000',
|
||||
@@ -118,8 +118,15 @@ lines = [
|
||||
f\"DB_MAX_OPEN_CONNS={db['max_open_conns']}\",
|
||||
f\"DB_MAX_IDLE_CONNS={db['max_idle_conns']}\",
|
||||
f\"DB_MAX_LIFETIME={db['max_lifetime']}\",
|
||||
# Redis (K8s internal DNS — password injected if configured)
|
||||
f\"REDIS_URL=redis://{':%s@' % val(rd.get('password')) if rd.get('password') else ''}redis.honeydue.svc.cluster.local:6379/0\",
|
||||
f\"DB_MAX_IDLE_TIME={db.get('max_idle_time', '0s')}\",
|
||||
# Redis — in-namespace DNS short form (works because pod /etc/resolv.conf
|
||||
# searches honeydue.svc.cluster.local). Audit HIGH-1: the password is
|
||||
# intentionally NOT embedded here. This URL is emitted into the
|
||||
# honeydue-config ConfigMap, which is NOT encrypted at rest and is
|
||||
# readable by anyone with `get configmap`. The Redis password travels
|
||||
# only in honeydue-secrets as REDIS_PASSWORD (file-mounted, F8); the API
|
||||
# applies it in cache_service.go and the worker onto its Asynq opt.
|
||||
'REDIS_URL=redis://redis:6379/0',
|
||||
'REDIS_DB=0',
|
||||
# Email
|
||||
f\"EMAIL_HOST={em['host']}\",
|
||||
@@ -139,12 +146,21 @@ lines = [
|
||||
f\"OVERDUE_REMINDER_HOUR={wk['overdue_reminder_hour']}\",
|
||||
f\"DAILY_DIGEST_HOUR={wk['daily_digest_hour']}\",
|
||||
# B2 Storage
|
||||
f\"B2_KEY_ID={val(st['b2_key_id'])}\",
|
||||
f\"B2_APP_KEY={val(st['b2_app_key'])}\",
|
||||
# B2_KEY_ID and B2_APP_KEY are intentionally NOT emitted into the
|
||||
# ConfigMap — they're credentials and belong in honeydue-secrets
|
||||
# (set by 02-setup-secrets.sh). Wire them into the api/worker
|
||||
# deployments via envFrom: secretRef when B2 uploads need to be
|
||||
# active. Leaving them in cleartext here would leak via
|
||||
# \"kubectl get cm\".
|
||||
f\"B2_BUCKET_NAME={val(st['b2_bucket'])}\",
|
||||
f\"B2_ENDPOINT={val(st['b2_endpoint'])}\",
|
||||
f\"B2_REGION={val(st.get('b2_region'))}\",
|
||||
f\"B2_USE_SSL={b(st.get('b2_use_ssl', True))}\",
|
||||
f\"STORAGE_MAX_FILE_SIZE={st['max_file_size']}\",
|
||||
f\"STORAGE_ALLOWED_TYPES={st['allowed_types']}\",
|
||||
f\"STORAGE_UPLOAD_DIR={val(st.get('upload_dir', '/app/uploads'))}\",
|
||||
f\"STORAGE_BASE_URL={val(st.get('base_url', '/uploads'))}\",
|
||||
f\"STATIC_DIR={val(st.get('static_dir', '/app/static'))}\",
|
||||
# Features
|
||||
f\"FEATURE_PUSH_ENABLED={b(ft['push_enabled'])}\",
|
||||
f\"FEATURE_EMAIL_ENABLED={b(ft['email_enabled'])}\",
|
||||
@@ -207,8 +223,18 @@ config = {
|
||||
'image': 'ubuntu-24.04',
|
||||
},
|
||||
'additional_packages': ['open-iscsi'],
|
||||
'post_create_commands': ['sudo systemctl enable --now iscsid'],
|
||||
'k3s_config_file': 'secrets-encryption: true\n',
|
||||
# Audit K3S-CG2: harden the node OS at provision time — fail2ban for SSH
|
||||
# brute-force, unattended-upgrades for automatic security patches.
|
||||
'post_create_commands': [
|
||||
'sudo systemctl enable --now iscsid',
|
||||
'sudo apt-get update -qq',
|
||||
'sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq fail2ban unattended-upgrades',
|
||||
'sudo systemctl enable --now fail2ban',
|
||||
'sudo dpkg-reconfigure -f noninteractive -plow unattended-upgrades',
|
||||
],
|
||||
# Audit K3S-CG1 / K3S-F4: encrypt Secrets at rest in etcd, and write the
|
||||
# node kubeconfig as mode 0600 (not world-readable).
|
||||
'k3s_config_file': 'secrets-encryption: true\nwrite-kubeconfig-mode: \"0600\"\n',
|
||||
}
|
||||
|
||||
print(yaml.dump(config, default_flow_style=False, sort_keys=False))
|
||||
|
||||
@@ -0,0 +1,39 @@
|
||||
{
|
||||
"$id": "https://honeydue.app/identity.schema.json",
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"title": "honeyDue user",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"traits": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"email": {
|
||||
"type": "string",
|
||||
"format": "email",
|
||||
"title": "Email",
|
||||
"minLength": 3,
|
||||
"maxLength": 320,
|
||||
"ory.sh/kratos": {
|
||||
"credentials": {
|
||||
"password": { "identifier": true },
|
||||
"code": { "identifier": true, "via": "email" },
|
||||
"totp": { "account_name": true }
|
||||
},
|
||||
"verification": { "via": "email" },
|
||||
"recovery": { "via": "email" }
|
||||
}
|
||||
},
|
||||
"name": {
|
||||
"type": "object",
|
||||
"title": "Name",
|
||||
"properties": {
|
||||
"first": { "type": "string", "title": "First name", "maxLength": 100 },
|
||||
"last": { "type": "string", "title": "Last name", "maxLength": 100 }
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["email"],
|
||||
"additionalProperties": false
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,101 @@
|
||||
version: v1.3.0
|
||||
|
||||
serve:
|
||||
public:
|
||||
base_url: http://localhost:4433/
|
||||
cors:
|
||||
enabled: true
|
||||
allowed_origins:
|
||||
- http://localhost
|
||||
- http://localhost:3000
|
||||
- http://localhost:8000
|
||||
- http://127.0.0.1
|
||||
allowed_methods: [GET, POST, PUT, PATCH, DELETE, OPTIONS]
|
||||
allowed_headers: [Authorization, Content-Type, X-Session-Token, Cookie]
|
||||
exposed_headers: [Content-Type, Set-Cookie]
|
||||
allow_credentials: true
|
||||
admin:
|
||||
base_url: http://kratos:4434/
|
||||
|
||||
selfservice:
|
||||
default_browser_return_url: http://localhost:8000/
|
||||
allowed_return_urls:
|
||||
- http://localhost:8000
|
||||
- honeydue://callback
|
||||
|
||||
methods:
|
||||
password:
|
||||
enabled: true
|
||||
config:
|
||||
min_password_length: 8
|
||||
identifier_similarity_check_enabled: false
|
||||
code:
|
||||
enabled: true
|
||||
oidc:
|
||||
enabled: false
|
||||
|
||||
flows:
|
||||
error:
|
||||
ui_url: http://localhost:8000/auth/error
|
||||
login:
|
||||
ui_url: http://localhost:8000/auth/login
|
||||
lifespan: 10m
|
||||
registration:
|
||||
ui_url: http://localhost:8000/auth/registration
|
||||
lifespan: 10m
|
||||
after:
|
||||
password:
|
||||
hooks:
|
||||
- hook: session
|
||||
verification:
|
||||
enabled: true
|
||||
ui_url: http://localhost:8000/auth/verification
|
||||
use: code
|
||||
after:
|
||||
default_browser_return_url: http://localhost:8000/
|
||||
recovery:
|
||||
enabled: true
|
||||
ui_url: http://localhost:8000/auth/recovery
|
||||
use: code
|
||||
settings:
|
||||
ui_url: http://localhost:8000/auth/settings
|
||||
privileged_session_max_age: 15m
|
||||
logout:
|
||||
after:
|
||||
default_browser_return_url: http://localhost:8000/
|
||||
|
||||
log:
|
||||
level: debug
|
||||
format: text
|
||||
leak_sensitive_values: true
|
||||
|
||||
secrets:
|
||||
cookie:
|
||||
- local-dev-cookie-secret-please-change-this-32chars
|
||||
cipher:
|
||||
- 0123456789abcdef0123456789abcdef
|
||||
|
||||
ciphers:
|
||||
algorithm: xchacha20-poly1305
|
||||
|
||||
hashers:
|
||||
algorithm: bcrypt
|
||||
bcrypt:
|
||||
cost: 8
|
||||
|
||||
identity:
|
||||
default_schema_id: honeydue
|
||||
schemas:
|
||||
- id: honeydue
|
||||
url: file:///etc/config/kratos/identity.schema.json
|
||||
|
||||
courier:
|
||||
smtp:
|
||||
connection_uri: smtp://mailpit:1025/?disable_starttls=true
|
||||
from_address: noreply@localhost
|
||||
from_name: honeyDue Local
|
||||
|
||||
session:
|
||||
lifespan: 720h
|
||||
cookie:
|
||||
same_site: Lax
|
||||
@@ -35,7 +35,7 @@ DEFAULT_FROM_EMAIL=honeyDue <noreply@honeyDue.treytartt.com>
|
||||
# APNS private key goes in deploy/secrets/apns_auth_key.p8
|
||||
APNS_AUTH_KEY_ID=CHANGEME_APNS_KEY_ID
|
||||
APNS_TEAM_ID=CHANGEME_APNS_TEAM_ID
|
||||
APNS_TOPIC=com.tt.honeyDue
|
||||
APNS_TOPIC=com.myhoneydue.honeyDue
|
||||
APNS_USE_SANDBOX=false
|
||||
APNS_PRODUCTION=true
|
||||
|
||||
@@ -80,7 +80,11 @@ FEATURE_PDF_REPORTS_ENABLED=true
|
||||
FEATURE_WORKER_ENABLED=true
|
||||
|
||||
# Optional auth/iap values
|
||||
APPLE_CLIENT_ID=
|
||||
# APPLE_CLIENT_ID must equal the iOS Release bundle ID. The Apple
|
||||
# identity-token `aud` claim is verified against this value
|
||||
# (internal/services/apple_auth.go::verifyAudience). Leaving it empty
|
||||
# with DEBUG=false rejects every Apple token as invalid audience.
|
||||
APPLE_CLIENT_ID=com.myhoneydue.honeyDue
|
||||
APPLE_TEAM_ID=
|
||||
GOOGLE_CLIENT_ID=
|
||||
GOOGLE_ANDROID_CLIENT_ID=
|
||||
|
||||
@@ -1,6 +1,31 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# DEPRECATED — production migrated from Docker Swarm to k3s on 2026-04-24.
|
||||
# This script targets the old Swarm manager + registry flow and will fail
|
||||
# at the SSH/Swarm validation step because hetzner1 no longer runs dockerd.
|
||||
#
|
||||
# Use the k3s deploy stack instead:
|
||||
#
|
||||
# export KUBECONFIG="$(pwd)/deploy-k3s/kubeconfig"
|
||||
# ./deploy-k3s/scripts/03-deploy.sh
|
||||
#
|
||||
# If you don't have deploy-k3s/kubeconfig locally, fetch it once:
|
||||
# ssh -i ~/.ssh/hetzner deploy@hetzner1 'sudo cat /etc/rancher/k3s/k3s.yaml' \
|
||||
# | sed 's|server: https://127.0.0.1:6443|server: https://178.104.247.152:6443|' \
|
||||
# > deploy-k3s/kubeconfig
|
||||
# chmod 600 deploy-k3s/kubeconfig
|
||||
#
|
||||
# To override and run anyway (do NOT do this casually), set:
|
||||
# ALLOW_LEGACY_SWARM_DEPLOY=1 ./deploy/scripts/deploy_prod.sh
|
||||
if [[ "${ALLOW_LEGACY_SWARM_DEPLOY:-0}" != "1" ]]; then
|
||||
printf '[deploy][error] %s\n' \
|
||||
"deploy_prod.sh is the legacy Docker Swarm flow. Production now runs on k3s." \
|
||||
"Use ./deploy-k3s/scripts/03-deploy.sh instead (see top of this script for setup)." \
|
||||
"If you really need the old Swarm path, set ALLOW_LEGACY_SWARM_DEPLOY=1." >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
DEPLOY_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
|
||||
REPO_DIR="$(cd "${DEPLOY_DIR}/.." && pwd)"
|
||||
|
||||
+62
-2
@@ -14,6 +14,7 @@ services:
|
||||
POSTGRES_DB: ${POSTGRES_DB:-honeydue}
|
||||
volumes:
|
||||
- postgres_data:/var/lib/postgresql/data
|
||||
- ./deploy/local/postgres-init:/docker-entrypoint-initdb.d:ro
|
||||
ports:
|
||||
- "${DB_PORT:-5433}:5432" # 5433 externally to avoid conflicts with local postgres
|
||||
healthcheck:
|
||||
@@ -85,12 +86,16 @@ services:
|
||||
APNS_AUTH_KEY_PATH: ${APNS_AUTH_KEY_PATH}
|
||||
APNS_AUTH_KEY_ID: ${APNS_AUTH_KEY_ID}
|
||||
APNS_TEAM_ID: ${APNS_TEAM_ID}
|
||||
APNS_TOPIC: ${APNS_TOPIC:-com.tt.honeyDue}
|
||||
APNS_TOPIC: ${APNS_TOPIC:-com.myhoneydue.honeyDue.dev}
|
||||
APNS_USE_SANDBOX: "true"
|
||||
FCM_SERVER_KEY: ${FCM_SERVER_KEY}
|
||||
|
||||
# Storage encryption
|
||||
STORAGE_ENCRYPTION_KEY: ${STORAGE_ENCRYPTION_KEY}
|
||||
|
||||
# Kratos (identity service)
|
||||
KRATOS_PUBLIC_URL: "http://kratos:4433"
|
||||
KRATOS_ADMIN_URL: "http://kratos:4434"
|
||||
volumes:
|
||||
- ./push_certs:/certs:ro
|
||||
- ./uploads:/app/uploads
|
||||
@@ -99,6 +104,8 @@ services:
|
||||
condition: service_healthy
|
||||
redis:
|
||||
condition: service_healthy
|
||||
kratos:
|
||||
condition: service_healthy
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://127.0.0.1:8000/api/health/"]
|
||||
interval: 30s
|
||||
@@ -158,7 +165,7 @@ services:
|
||||
APNS_AUTH_KEY_PATH: "/certs/apns_key.p8"
|
||||
APNS_AUTH_KEY_ID: ${APNS_AUTH_KEY_ID}
|
||||
APNS_TEAM_ID: ${APNS_TEAM_ID}
|
||||
APNS_TOPIC: ${APNS_TOPIC:-com.tt.honeyDue}
|
||||
APNS_TOPIC: ${APNS_TOPIC:-com.myhoneydue.honeyDue.dev}
|
||||
APNS_USE_SANDBOX: "true"
|
||||
FCM_SERVER_KEY: ${FCM_SERVER_KEY}
|
||||
|
||||
@@ -184,6 +191,59 @@ services:
|
||||
networks:
|
||||
- honeydue-network
|
||||
|
||||
# Mailpit — local SMTP catcher (for Kratos email codes during onboarding)
|
||||
mailpit:
|
||||
image: axllent/mailpit:latest
|
||||
container_name: honeydue-mailpit
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "${MAILPIT_SMTP_PORT:-1025}:1025"
|
||||
- "${MAILPIT_HTTP_PORT:-8025}:8025"
|
||||
networks:
|
||||
- honeydue-network
|
||||
|
||||
# Kratos schema migration (one-shot, runs before kratos starts)
|
||||
kratos-migrate:
|
||||
image: oryd/kratos:v1.3.0
|
||||
container_name: honeydue-kratos-migrate
|
||||
command: ["migrate", "sql", "-e", "--yes"]
|
||||
environment:
|
||||
DSN: "postgres://${POSTGRES_USER:-honeydue}:${POSTGRES_PASSWORD:-honeydue_dev_password}@db:5432/kratos?sslmode=disable"
|
||||
depends_on:
|
||||
db:
|
||||
condition: service_healthy
|
||||
networks:
|
||||
- honeydue-network
|
||||
restart: "no"
|
||||
|
||||
# Ory Kratos — identity service
|
||||
kratos:
|
||||
image: oryd/kratos:v1.3.0
|
||||
container_name: honeydue-kratos
|
||||
restart: unless-stopped
|
||||
command: ["serve", "--config", "/etc/config/kratos/kratos.yml", "--watch-courier", "--dev"]
|
||||
ports:
|
||||
- "${KRATOS_PUBLIC_PORT:-4433}:4433"
|
||||
- "${KRATOS_ADMIN_PORT:-4434}:4434"
|
||||
environment:
|
||||
DSN: "postgres://${POSTGRES_USER:-honeydue}:${POSTGRES_PASSWORD:-honeydue_dev_password}@db:5432/kratos?sslmode=disable"
|
||||
LOG_LEVEL: "debug"
|
||||
volumes:
|
||||
- ./deploy/local/kratos:/etc/config/kratos:ro
|
||||
depends_on:
|
||||
kratos-migrate:
|
||||
condition: service_completed_successfully
|
||||
mailpit:
|
||||
condition: service_started
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://127.0.0.1:4434/health/ready"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 10
|
||||
start_period: 10s
|
||||
networks:
|
||||
- honeydue-network
|
||||
|
||||
# Dozzle — lightweight real-time log viewer
|
||||
dozzle:
|
||||
image: amir20/dozzle:latest
|
||||
|
||||
@@ -194,10 +194,17 @@ See [Chapter 8](./08-database.md), [9](./09-storage.md), and
|
||||
until we have Apple Developer / Google Play accounts. The env vars are
|
||||
set to sentinel values that let the Go app boot; `FEATURE_PUSH_ENABLED=false`
|
||||
gates all call sites.
|
||||
- **External metrics/monitoring (Prometheus, Grafana, Betterstack).**
|
||||
Right now we rely on `kubectl logs`, `kubectl top`, and Cloudflare's own
|
||||
analytics. See [Chapter 15](./15-observability.md) for what's there and
|
||||
what we'd add.
|
||||
- **In-cluster Prometheus / Grafana.** Self-hosted Prometheus-compatible
|
||||
metrics + tracing + dashboards live **outside** the k3s cluster on
|
||||
`88oakappsUpdate` (the same Linode VPS that hosts PostHog), reached
|
||||
via `https://obs.88oakapps.com` (Cloudflare-fronted, bearer-gated).
|
||||
A `vmagent` sidecar in the honeydue namespace scrapes the api Pods
|
||||
and remote-writes out. This frees ~700 MB of cluster RAM and means
|
||||
observability survives a k3s control-plane incident. See
|
||||
[Chapter 15](./15-observability.md).
|
||||
- **Alerting.** No PagerDuty, Slack hooks, or pages-on-error wired up
|
||||
yet. Histograms are flowing into Grafana — alert rules on top of them
|
||||
are the next addition. See [Chapter 15 — Future](./15-observability.md).
|
||||
- **Automated backups of Redis state.** Redis is configured with AOF
|
||||
(append-only file) persistence, but the PVC is only on one node. Redis
|
||||
holds only cache + Asynq queue state; losing it re-populates on first
|
||||
|
||||
@@ -8,6 +8,13 @@ long-haul components, and dedicated service accounts with dropped
|
||||
capabilities inside containers. This chapter documents each layer, the
|
||||
rationale, and what's currently missing (and why).
|
||||
|
||||
> **Updated 2026-05-15 — security remediation.** The 2026-05 audits
|
||||
> (`live_scan_5_12.md`, `k3_audit_5_12.md`, `security_scan_5_12.md`) drove a
|
||||
> full remediation pass. **`deploy-k3s/SECURITY.md` is the authoritative,
|
||||
> per-finding current-state record.** This chapter is corrected for the
|
||||
> major items below; where any other detail conflicts with `SECURITY.md`,
|
||||
> `SECURITY.md` wins.
|
||||
|
||||
## Threat model
|
||||
|
||||
Who we're defending against, in rough order of likelihood:
|
||||
@@ -54,8 +61,8 @@ Cloudflare sits in front of every public request.
|
||||
- **Authorize requests** — that's the app's job
|
||||
- **Protect origin if origin IP leaks** — once someone knows a node IP
|
||||
they can bypass CF. Mitigation: keep origin firewall strict (Chapter 4).
|
||||
- **Encrypt between CF and origin** — we're on SSL=Flexible, so CF↔origin
|
||||
is HTTP. This is in our TODO (Chapter 20, upgrade to Full-strict).
|
||||
- **~~Encrypt between CF and origin~~** — done (2026-04-24): SSL mode is
|
||||
Full (strict); CF↔origin is TLS with a Cloudflare Origin CA cert.
|
||||
|
||||
### The proxy-IP problem
|
||||
|
||||
@@ -75,8 +82,8 @@ This means a malicious request that bypasses CF (by hitting the node IP
|
||||
directly) can't spoof headers — Traefik ignores `X-Forwarded-*` unless
|
||||
the source IP is in CF's ranges.
|
||||
|
||||
**TODO** (Chapter 20): Enforce at UFW level — allow 80/tcp only from
|
||||
CF IP ranges. Today any IP can reach the origin on port 80.
|
||||
**Done (2026-04-24):** the node UFW allowlist permits `:443` only from
|
||||
Cloudflare's IP ranges; the `Anywhere` rules on `:80`/`:443` were removed.
|
||||
|
||||
## Layer 2 — Node (OS, SSH, firewall)
|
||||
|
||||
@@ -297,15 +304,13 @@ The `deploy-k3s/manifests/network-policies.yaml` scaffold defines:
|
||||
reach api pods on port 8000
|
||||
- **allow-ingress-to-admin** — same, for admin:3000
|
||||
|
||||
**These are not currently applied.** Without them, our pods can freely
|
||||
talk to anything — including, theoretically, malicious destinations if
|
||||
an attacker gets RCE inside a pod.
|
||||
**Applied.** `03-deploy.sh` applies
|
||||
`deploy-k3s/manifests/network-policies.yaml` on every deploy — default-deny
|
||||
plus the explicit per-app allows below. Traefik runs `hostNetwork`, so its
|
||||
traffic is matched by node-IP `ipBlock`s plus the pod CIDR `10.42.0.0/16`,
|
||||
not a `namespaceSelector`.
|
||||
|
||||
**TODO** (Chapter 20): Apply network policies. The scaffold is there; we
|
||||
just need to `kubectl apply -f deploy-k3s/manifests/network-policies.yaml`
|
||||
and test that nothing breaks.
|
||||
|
||||
### What network policies would prevent
|
||||
### What network policies prevent
|
||||
|
||||
| Attack scenario | NetworkPolicy blocks |
|
||||
|---|---|
|
||||
@@ -324,13 +329,10 @@ renewed Let's Encrypt or CF-managed cert for `*.myhoneydue.com`.
|
||||
|
||||
### CF ↔ origin
|
||||
|
||||
**Plaintext HTTP** (SSL = Flexible). An attacker with access to the
|
||||
Cloudflare-to-Hetzner path could read traffic. In practice nobody who
|
||||
isn't Cloudflare or Hetzner sits on that path.
|
||||
|
||||
**TODO** (Chapter 20): Upgrade to SSL = Full (strict) with a Cloudflare
|
||||
Origin CA certificate. This encrypts CF ↔ origin and verifies that
|
||||
origin's cert is the CF-issued one (prevents MitM if DNS is compromised).
|
||||
**TLS — SSL = Full (strict)** (since 2026-04-24). A Cloudflare Origin CA
|
||||
certificate (`cloudflare-origin-cert` secret) is installed on all three
|
||||
ingresses; Cloudflare validates it. Both user↔CF and CF↔origin are
|
||||
encrypted, and a DNS-hijack MitM is defeated by the origin-cert check.
|
||||
|
||||
### API ↔ Neon Postgres
|
||||
|
||||
@@ -454,11 +456,14 @@ Mitigations:
|
||||
- Gitea itself is behind login; PAT is scoped to read:packages +
|
||||
write:packages only
|
||||
- Gitea runs on the operator's infrastructure (same operator account)
|
||||
- Image tags are SHA-pinned (`:237c6b8`) not `:latest` → attacker can't
|
||||
replace an existing tag's image without us noticing the digest change
|
||||
- Workloads deploy by immutable `@sha256:` digest, not by mutable tag
|
||||
(`03-deploy.sh` resolves the digest after push; the redis/vmagent/node
|
||||
base images are digest-pinned too) — a swapped tag cannot reach the
|
||||
cluster.
|
||||
|
||||
**TODO** (Chapter 20): Add cosign signing at build time, verify at pull
|
||||
time.
|
||||
**TODO**: cosign signing is wired into `03-deploy.sh` (guarded — runs when
|
||||
`cosign` + `COSIGN_KEY` are present); cluster-side admission verification
|
||||
(Kyverno/Connaisseur) is still pending. See `deploy-k3s/SECURITY.md` → L5.
|
||||
|
||||
## Operator workstation security
|
||||
|
||||
|
||||
@@ -1,5 +1,13 @@
|
||||
# 06 — Traefik Ingress
|
||||
|
||||
> **Updated 2026-05-15 (security remediation):** the Traefik middleware set
|
||||
> changed — `cloudflare-only` + `admin-auth` are now attached to the admin
|
||||
> ingress, a strict `auth-rate-limit` middleware fronts the auth endpoints
|
||||
> (via a dedicated `honeydue-api-auth` Ingress), and `security-headers`
|
||||
> gained COOP/CORP + a 2-year preload HSTS and dropped the deprecated
|
||||
> `X-XSS-Protection`. `deploy-k3s/SECURITY.md` is the authoritative
|
||||
> current-state record.
|
||||
|
||||
## Summary
|
||||
|
||||
Traefik is the reverse proxy that routes external HTTP requests to the
|
||||
|
||||
@@ -1,5 +1,11 @@
|
||||
# 07 — Services
|
||||
|
||||
> **Updated 2026-05-15 (security remediation):** Redis now requires a
|
||||
> password (`config.yaml` `redis.password` → `honeydue-secrets`), all
|
||||
> workloads deploy by immutable `@sha256:` digest, and the redis/vmagent
|
||||
> base images are digest-pinned. `deploy-k3s/SECURITY.md` is the
|
||||
> authoritative current-state record.
|
||||
|
||||
## Summary
|
||||
|
||||
Five workloads run in the `honeydue` namespace: **api** (Go REST API, 3
|
||||
@@ -175,13 +181,15 @@ doesn't run as root.
|
||||
file writes to the image layer. Go binary doesn't need to write to `/`;
|
||||
only `/tmp` is mutable.
|
||||
|
||||
**`startupProbe.failureThreshold: 48`** (= 48 × 5s = 240s grace) — this
|
||||
was bumped up from the scaffold default of 12. Reason: on first boot,
|
||||
the Go app runs `MigrateWithLock()` which acquires a Postgres advisory
|
||||
lock and runs AutoMigrate. First replica takes ~90s; subsequent
|
||||
replicas wait on the lock. With 3 replicas all starting simultaneously
|
||||
and the lock serializing them, 240s is the right grace. See
|
||||
[Chapter 19](./19-postmortem-swarm.md) for the detailed story.
|
||||
**`startupProbe.failureThreshold: 48`** (= 48 × 5s = 240s grace) —
|
||||
historically bumped from the scaffold default of 12 to absorb in-replica
|
||||
migration time. Now that migrations run out-of-band as a Kubernetes
|
||||
Job ([Chapter 8 §Schema management](./08-database.md)), pods boot in
|
||||
seconds and only need a few probe failures of grace, but the budget
|
||||
stays at 240s because cold pods on a fresh Hetzner node still pay
|
||||
~10s for image pull + startup. See
|
||||
[Chapter 19 §13](./19-postmortem-swarm.md) for the historical
|
||||
context (the in-replica advisory-lock approach this replaced).
|
||||
|
||||
**`readinessProbe.initialDelaySeconds: 5`** — after the startupProbe
|
||||
passes, wait 5s before starting readiness checks. Prevents a racy
|
||||
|
||||
+195
-75
@@ -4,8 +4,10 @@
|
||||
|
||||
Authoritative user data lives in a Neon-managed Postgres database in AWS
|
||||
us-east-1. Connections use TLS (`DB_SSLMODE=require`). Schema is managed
|
||||
via GORM AutoMigrate inside the api binary, coordinated across replicas
|
||||
by a Postgres advisory lock to prevent concurrent migration attempts.
|
||||
via [pressly/goose](https://github.com/pressly/goose) running as a
|
||||
one-shot Kubernetes Job before every api/worker rollout. See §Schema
|
||||
management below for the full shape; ch19 §13 documents the previous
|
||||
in-replica AutoMigrate approach this replaced.
|
||||
|
||||
## Why Neon
|
||||
|
||||
@@ -32,7 +34,7 @@ Neon Launch won on:
|
||||
|
||||
| Field | Value |
|
||||
|---|---|
|
||||
| Hostname | `ep-floral-truth-amttbc5a.c-5.us-east-1.aws.neon.tech` |
|
||||
| Hostname | `ep-floral-truth-amttbc5a-pooler.c-5.us-east-1.aws.neon.tech` |
|
||||
| Port | 5432 |
|
||||
| Username | `neondb_owner` |
|
||||
| Database | `honeyDue` (case-sensitive!) |
|
||||
@@ -58,9 +60,19 @@ paid tiers much higher.
|
||||
|
||||
### PgBouncer on Neon
|
||||
|
||||
Neon provides a built-in PgBouncer at `-pooler` subdomain. Our hostname
|
||||
already includes `-pooler` handling in the route, so connections go
|
||||
through PgBouncer transparently.
|
||||
Neon provides a built-in PgBouncer at the `-pooler` subdomain. The
|
||||
non-pooler endpoint (`ep-floral-truth-amttbc5a.c-5.us-east-1...`) is
|
||||
the direct compute endpoint and connects straight to Postgres,
|
||||
paying the full TCP+TLS+startup handshake on every cold connection.
|
||||
The `-pooler` endpoint multiplexes through PgBouncer in Neon's
|
||||
infrastructure.
|
||||
|
||||
**We use the `-pooler` endpoint** because the direct endpoint paid
|
||||
~440ms per cold handshake on a transatlantic link, visible as
|
||||
1500ms-tail spikes in /api/tasks/ traces. The pooler keeps backend
|
||||
Postgres connections warm in Neon's data center, so the only
|
||||
latency our Go pods see is one TCP+TLS to PgBouncer (already
|
||||
warm via our pool) plus one query round-trip.
|
||||
|
||||
Modes PgBouncer supports:
|
||||
- **session** — one server connection held per client session (transparent)
|
||||
@@ -68,26 +80,59 @@ Modes PgBouncer supports:
|
||||
- **statement** — per-statement (most aggressive; breaks many features)
|
||||
|
||||
Neon's pooler runs in **transaction mode**. This is compatible with GORM
|
||||
out of the box (we don't use session-level features like prepared
|
||||
statements or session variables).
|
||||
runtime queries (we don't use session-level features like LISTEN/NOTIFY
|
||||
or session-scope advisory locks in the data path). The one place this
|
||||
matters is migrations: goose's session-scoped advisory lock can't
|
||||
survive PgBouncer transaction-mode pooling. The migrate Job
|
||||
(`deploy-k3s/manifests/migrate/job.yaml`) handles this by stripping
|
||||
the `-pooler` segment from `DB_HOST` before invoking goose — runtime
|
||||
keeps using the pooler, only migrations bypass it.
|
||||
|
||||
### Connection pool settings
|
||||
|
||||
In `prod.env`:
|
||||
In `config.yaml` (rendered into ConfigMap → env vars):
|
||||
|
||||
```
|
||||
DB_MAX_OPEN_CONNS=25
|
||||
DB_MAX_IDLE_CONNS=10
|
||||
DB_MAX_LIFETIME=600s
|
||||
```yaml
|
||||
database:
|
||||
max_open_conns: 25
|
||||
max_idle_conns: 20
|
||||
max_lifetime: "1800s"
|
||||
max_idle_time: "0s"
|
||||
```
|
||||
|
||||
These are the Go `database/sql` pool settings (GORM uses `database/sql`
|
||||
underneath):
|
||||
These map to Go `database/sql` pool settings:
|
||||
|
||||
- **MaxOpenConns: 25** — at most 25 concurrent connections per replica
|
||||
- **MaxIdleConns: 10** — keep up to 10 warm connections ready to reuse
|
||||
- **MaxLifetime: 600s** — recycle connections after 10 min (prevents
|
||||
stale state in long-lived connections, good for Neon's idle timeout)
|
||||
- **MaxOpenConns: 25** — at most 25 concurrent connections per replica.
|
||||
- **MaxIdleConns: 20** — keep up to 20 warm connections per replica
|
||||
ready to reuse. Bumped from 10 because the pooler tolerates many
|
||||
client connections cheaply, and the cost of a cold handshake (~440ms
|
||||
transatlantic) is far higher than the cost of holding an idle
|
||||
connection.
|
||||
- **MaxLifetime: 1800s** — recycle connections after 30 min. Bumped
|
||||
from 600s; with the pooler keeping things warm, longer lifetime
|
||||
reduces churn.
|
||||
- **MaxIdleTime: 0s** — never close idle connections. Lifetime drives
|
||||
recycling instead.
|
||||
|
||||
### Pool warm-up at boot
|
||||
|
||||
`database.Connect()` issues 20 parallel `PingContext` calls
|
||||
immediately after opening the pool. This pre-establishes
|
||||
`MaxIdleConns` connections to the pooler so the first user request
|
||||
doesn't pay any handshake.
|
||||
|
||||
The warm-up is bounded by *one* cold handshake (~440ms), not
|
||||
one handshake per connection — pings run concurrently. Confirmed
|
||||
in pod logs at boot:
|
||||
|
||||
```
|
||||
{"level":"info","requested":20,"warmed":20,"message":"DB pool warm-up complete"}
|
||||
```
|
||||
|
||||
If warm-up partially fails (e.g., 18/20 succeed), the pod still
|
||||
starts; the pool fills the rest under traffic. Failure to ping at all
|
||||
would be caught by the synchronous `sqlDB.Ping()` immediately before,
|
||||
which is fatal.
|
||||
|
||||
### Worst-case connection count
|
||||
|
||||
@@ -107,66 +152,110 @@ the default 25/10. If we hit connection errors in prod, adjust.
|
||||
|
||||
## Schema management
|
||||
|
||||
### GORM AutoMigrate
|
||||
### goose
|
||||
|
||||
On startup, the Go API's `cmd/api/main.go` calls
|
||||
`database.MigrateWithLock()` which:
|
||||
We use [pressly/goose](https://github.com/pressly/goose) (pinned in the
|
||||
api `Dockerfile` to v3.22.1) for schema migrations. Why goose specifically:
|
||||
|
||||
1. Opens a dedicated Postgres connection
|
||||
2. `SELECT pg_advisory_lock(1751412071)` — acquires a session-level
|
||||
advisory lock on a hardcoded key
|
||||
3. Calls `db.AutoMigrate(&models.*{})` for every GORM model
|
||||
4. `SELECT pg_advisory_unlock(...)` via deferred function
|
||||
5. Close the connection
|
||||
- Each migration file runs inside its own transaction by default —
|
||||
partial-failure recovery is built in (no "dirty" state to manually
|
||||
unstick like golang-migrate).
|
||||
- Locking is opt-in. We *don't* opt in. Migrations run as a single
|
||||
Kubernetes Job — that's the singleton process. No advisory-lock vs
|
||||
PgBouncer-transaction-mode foot-gun.
|
||||
- Plain SQL files. No DSL, no library integration in our Go code.
|
||||
|
||||
The advisory lock serializes migrations across replicas: when 3 api
|
||||
pods start simultaneously, one acquires the lock and migrates; the
|
||||
others block on the lock. Once the first finishes (≤2s for already-
|
||||
migrated schema, up to 90s on first cold boot), the next acquires and
|
||||
sees the schema is current (no-op migrate).
|
||||
See `docs/deployment/19-postmortem-swarm.md` (Schema Versioning section)
|
||||
for the AutoMigrate-with-advisory-lock approach this replaced and why.
|
||||
|
||||
### Why an advisory lock
|
||||
### Migration files
|
||||
|
||||
Without it, concurrent `CREATE TABLE IF NOT EXISTS ...` statements from
|
||||
multiple replicas would race — Postgres usually handles it, but GORM's
|
||||
AutoMigrate also alters tables (adds columns, indexes) which can deadlock
|
||||
under concurrency.
|
||||
Live under `migrations/`, named `<NNNNNN>_<short_name>.sql`. Each file
|
||||
contains both the up and down migrations, separated by goose
|
||||
markers:
|
||||
|
||||
The advisory lock pattern (also used by Rails + Django + Alembic) is the
|
||||
canonical solution.
|
||||
```sql
|
||||
-- +goose Up
|
||||
CREATE TABLE example (id bigint PRIMARY KEY);
|
||||
|
||||
### The lock key
|
||||
-- +goose Down
|
||||
DROP TABLE example;
|
||||
```
|
||||
|
||||
`1751412071` is a hardcoded integer in `internal/database/database.go`.
|
||||
Arbitrary but unique — as long as nothing else in the Postgres instance
|
||||
uses the same advisory lock key, no conflicts.
|
||||
Multi-statement constructs (`CREATE FUNCTION`, `DO $$ BEGIN ... END $$`)
|
||||
need `-- +goose StatementBegin` / `-- +goose StatementEnd` wrappers
|
||||
because goose splits on semicolons by default.
|
||||
|
||||
### First-boot behavior
|
||||
`migrations/000001_init.sql` is the baseline — captures every
|
||||
table/index/sequence as it existed when goose was adopted, generated
|
||||
via `pg_dump --schema-only --no-owner --no-privileges`. The pre-goose
|
||||
hand-numbered migrations (002-022 in git history at commit
|
||||
58e6997) had their effects folded into this baseline; they're gone
|
||||
from the live tree but remain in git for archaeology.
|
||||
|
||||
On a **fresh database** (new Neon project), the first api pod runs
|
||||
through every model's `CREATE TABLE` statement. This is ~50 tables for
|
||||
honeyDue and takes ~90 seconds.
|
||||
### Production migration flow
|
||||
|
||||
On a **warm database** (tables already exist), AutoMigrate is fast —
|
||||
typically under 2 seconds. It still runs (GORM checks every model
|
||||
against the schema) but finds no work to do.
|
||||
`deploy-k3s/scripts/03-deploy.sh` runs migrations as part of every
|
||||
deploy, **before** the api/worker rollout starts:
|
||||
|
||||
### Where this bit us
|
||||
```
|
||||
1. kubectl delete job honeydue-migrate (idempotent)
|
||||
2. kubectl apply -f manifests/migrate/job.yaml (with current api image)
|
||||
3. kubectl wait --for=condition=complete --timeout=10m job/honeydue-migrate
|
||||
4. (only if Job succeeded) kubectl apply -f manifests/api/...
|
||||
```
|
||||
|
||||
With 3 api pods starting simultaneously and migrations taking 90s first
|
||||
time, the lock queue for the last replica is ~180s. We needed a
|
||||
startupProbe grace of 240s to cover this without false restart loops.
|
||||
See Chapter 7 §startupProbe and Chapter 19 §MigrateWithLock.
|
||||
The Job uses the api image — we install the goose CLI binary at
|
||||
`/usr/local/bin/goose` during the api Dockerfile build, so any pod that
|
||||
can run api can run goose. No separate image to build/push.
|
||||
|
||||
### Downside: no schema versioning
|
||||
The Job's `command` runs `goose ... up` against the **direct**
|
||||
(non-pooler) Neon endpoint. Goose's session-scoped advisory lock can't
|
||||
survive PgBouncer transaction-mode pooling, so the Job script strips
|
||||
the `-pooler` segment from `DB_HOST` before connecting. The api/worker
|
||||
runtime continues to use the pooler endpoint for everything else; only
|
||||
this one Job needs the direct connection.
|
||||
|
||||
AutoMigrate can only *add* — new tables, new columns, new indexes. It
|
||||
won't drop columns, rename them, or change types destructively. For
|
||||
those we'd need raw SQL migrations (a tool like `golang-migrate` or
|
||||
`dbmate`).
|
||||
### Schema-version precondition
|
||||
|
||||
Today: we accept that schema changes are additive-only. When we need
|
||||
destructive changes, we'd hand-write them.
|
||||
`internal/database/database.go::RequireSchemaApplied()` runs at api and
|
||||
worker startup. It queries `goose_db_version` for the highest applied
|
||||
version and refuses to start if the table is missing or the latest row
|
||||
is `is_applied=false`. This catches "operator forgot to run migrate" as
|
||||
a clear boot error instead of a mysterious runtime "relation does not
|
||||
exist" later.
|
||||
|
||||
### Local migration workflow
|
||||
|
||||
```bash
|
||||
# Set the direct-endpoint DSN once
|
||||
export DATABASE_URL="host=ep-floral-truth-amttbc5a.c-5.us-east-1.aws.neon.tech \
|
||||
user=neondb_owner password=$PG_PASSWORD dbname=honeyDue sslmode=require"
|
||||
|
||||
make migrate-status # what's pending
|
||||
make migrate-up # apply
|
||||
make migrate-down # roll back the latest
|
||||
make migrate-new name=add_widget_col # scaffold a new SQL file
|
||||
```
|
||||
|
||||
Each new migration file goes through code review like any other code
|
||||
change. The deploy-script Job applies it on the next deploy.
|
||||
|
||||
### Bootstrap (one-time, when the prod DB already had a schema)
|
||||
|
||||
Bootstrapping a goose-managed DB whose schema already exists requires
|
||||
seeding `goose_db_version` so goose treats version 1 as already-applied:
|
||||
|
||||
```bash
|
||||
# Once. After this, future migrations append normally.
|
||||
goose -dir migrations postgres "$DATABASE_URL" version # creates the table
|
||||
psql "$DATABASE_URL" -c \
|
||||
"INSERT INTO goose_db_version (version_id, is_applied, tstamp) VALUES (1, true, NOW());"
|
||||
```
|
||||
|
||||
This was done for honeyDue's prod Neon project at the time of goose
|
||||
adoption — no need to repeat unless we set up a fresh DB from a
|
||||
schema dump.
|
||||
|
||||
## What's in the database
|
||||
|
||||
@@ -229,17 +318,45 @@ value.
|
||||
## Neon regions
|
||||
|
||||
Neon's default region for new projects is `aws-us-east-1` (Virginia).
|
||||
Our DB is there. Latency from Nuremberg to us-east-1 is **~90-120ms
|
||||
round trip**.
|
||||
Our DB is there. Latency from Nuremberg to us-east-1 is **~108ms one-way**
|
||||
TCP-level (verified by `nc -z -w 5` from `hetzner1`), so **~220ms RTT
|
||||
through Neon's pooler stack**.
|
||||
|
||||
This is the slowest hop in our data flow. Every api request that needs
|
||||
a DB query (most of them) pays this latency at least once.
|
||||
a DB query pays this latency at least once. Sub-millisecond Postgres
|
||||
execution time (verified via `EXPLAIN ANALYZE`: 0.04-0.34 ms on every
|
||||
hot path) means **wall-clock latency = network + Neon proxy overhead**.
|
||||
|
||||
**When this matters**: When we start seeing ~200ms+ response times from
|
||||
complex endpoints, it's likely DB latency dominant. Options:
|
||||
- Migrate Neon to `aws-eu-central-1` (Frankfurt) — shaves ~90ms off
|
||||
- Add Redis caching for hot reads (Chapter 7)
|
||||
- Read replicas (Neon supports them on paid tiers)
|
||||
### Optimizations layered on top to minimize round trips
|
||||
|
||||
We don't move the DB region (yet) but we cut the *number* of RTTs per
|
||||
request via:
|
||||
|
||||
1. **Auth caching** (Chapter 7 §Redis) — token + user lookups served
|
||||
from Redis (1-hour TTL) and per-pod in-memory cache (5-min TTL).
|
||||
On warm cache: 0 SQL round-trips for auth.
|
||||
2. **JOIN consolidation** — two-step
|
||||
`find residence-IDs → find tasks IN ids` collapsed into a single
|
||||
query with a Postgres subquery. One RTT instead of two.
|
||||
3. **Single-query auth** — token + user fetched in one INNER JOIN
|
||||
instead of GORM's two-query Preload pattern.
|
||||
4. **Residence-IDs Redis cache** — cached per user with 5-min TTL,
|
||||
invalidated on Create/Delete/Join/Remove. Saves 1 RTT per
|
||||
`/api/documents/`, `/api/contractors/`, `/api/residences/summary/`
|
||||
request.
|
||||
|
||||
After these, a fully-warm `/api/tasks/` is **1 SQL round-trip total
|
||||
(~220ms wall-clock)**. Verified via Jaeger trace — see Chapter 15.
|
||||
|
||||
### When this still matters
|
||||
|
||||
- Any cold-cache request still pays 2-3 RTTs (~500-700ms).
|
||||
- Pod startup pays 1 RTT × 20 (warm-up), but that runs in parallel:
|
||||
~440ms one-shot.
|
||||
|
||||
Long-term fix: migrate Neon to `aws-eu-central-1` (Frankfurt) — drops
|
||||
RTT to ~5ms and brings warm-cache requests under 50ms. Tracked in
|
||||
`docs/observability-plan.md` and Chapter 18 §migration triggers.
|
||||
|
||||
## Environment variables the app reads
|
||||
|
||||
@@ -247,14 +364,15 @@ From ConfigMap:
|
||||
|
||||
| Var | Purpose |
|
||||
|---|---|
|
||||
| `DB_HOST` | Neon pooler hostname |
|
||||
| `DB_HOST` | Neon pooler hostname (`-pooler` suffix) |
|
||||
| `DB_PORT` | 5432 |
|
||||
| `POSTGRES_USER` | `neondb_owner` |
|
||||
| `POSTGRES_DB` | `honeyDue` |
|
||||
| `DB_SSLMODE` | `require` |
|
||||
| `DB_MAX_OPEN_CONNS` | 25 |
|
||||
| `DB_MAX_IDLE_CONNS` | 10 |
|
||||
| `DB_MAX_LIFETIME` | `600s` |
|
||||
| `DB_MAX_IDLE_CONNS` | 20 |
|
||||
| `DB_MAX_LIFETIME` | `1800s` |
|
||||
| `DB_MAX_IDLE_TIME` | `0s` (never close idle) |
|
||||
|
||||
From Secret (`honeydue-secrets`):
|
||||
|
||||
@@ -288,11 +406,13 @@ GROUP BY usename, state, application_name;
|
||||
- [Neon docs][neon-docs]
|
||||
- [Neon pricing][neon-pricing]
|
||||
- [Postgres advisory locks][pg-locks]
|
||||
- [GORM AutoMigrate][gorm-automigrate]
|
||||
- [pressly/goose][goose] — production migration tool
|
||||
- [GORM AutoMigrate][gorm-automigrate] (tests only)
|
||||
- [honeyDue task architecture][task-arch] (repo-local)
|
||||
|
||||
[neon-docs]: https://neon.com/docs/introduction
|
||||
[neon-pricing]: https://neon.com/pricing
|
||||
[pg-locks]: https://www.postgresql.org/docs/current/explicit-locking.html#ADVISORY-LOCKS
|
||||
[goose]: https://github.com/pressly/goose
|
||||
[gorm-automigrate]: https://gorm.io/docs/migration.html
|
||||
[task-arch]: ../../docs/TASK_LOGIC_ARCHITECTURE.md
|
||||
|
||||
+100
-33
@@ -150,18 +150,64 @@ Allowed MIME types: `image/jpeg`, `image/png`, `image/gif`, `image/webp`,
|
||||
|
||||
## Access control
|
||||
|
||||
### Upload flow
|
||||
### Upload flow (current — direct-to-B2 with presigned POST)
|
||||
|
||||
1. Client POSTs to `/api/upload/`
|
||||
2. Go API validates the user is authenticated and authorized for the
|
||||
target resource
|
||||
3. Go API streams the upload to B2 via minio-go's `PutObject`
|
||||
4. B2 returns a key
|
||||
5. Go API stores the key in Postgres
|
||||
6. Returns the key to the client
|
||||
Image and document uploads go **directly from the client to B2**. The
|
||||
api server only signs a short-lived POST policy; the bytes never
|
||||
traverse our cluster. This is the WhatsApp / Slack architecture and
|
||||
sidesteps the api as a proxy bottleneck.
|
||||
|
||||
The B2 bucket is **private**. Clients can't GET directly; they always
|
||||
go through the Go API.
|
||||
1. Client `POST /api/uploads/presign` with `{category, content_type, content_length}`.
|
||||
2. api validates auth, per-user quota (10 concurrent in-flight,
|
||||
50/hour rate limit), allowed mime, and the 10 MB cap. On success it
|
||||
creates a `pending_uploads` row, signs a B2 POST policy with a
|
||||
`content-length-range` condition bound to the claimed length ±256
|
||||
bytes, and returns `{id, upload_url, fields, key, expires_at}`.
|
||||
3. Client multipart-POSTs the bytes directly to B2 using the returned
|
||||
fields. **B2 enforces the size cap at the protocol level** — clients
|
||||
can't bypass it by lying about Content-Length.
|
||||
4. Client POSTs to the entity-creation endpoint (`/api/task-completions/`,
|
||||
`/api/documents/`) with `upload_ids: [id]`. The service `HEAD`s each
|
||||
B2 object, verifies size matches `expected_bytes`, marks the
|
||||
`pending_uploads.claimed_at`, and writes the `task_completion_image`
|
||||
/ `document_image` row referencing the upload.
|
||||
|
||||
The signed URL is valid for 15 minutes; presigns are not reusable.
|
||||
|
||||
The B2 bucket stays **private** — only the api ever holds the key
|
||||
material. Clients can't list or GET directly without a presign.
|
||||
|
||||
```
|
||||
┌──────────┐ 1) presign ┌────────┐
|
||||
│ client │ ──────────────────► │ api │
|
||||
│ │ ◄────────────────── │ │ POST policy + key
|
||||
│ │ └────────┘
|
||||
│ │ row in
|
||||
│ │ pending_uploads
|
||||
│ │ (claimed_at NULL)
|
||||
│ │ 2) POST bytes ┌────────┐
|
||||
│ │ ──────────────────► │ B2 │ enforces policy
|
||||
│ │ ◄────────────────── │ │
|
||||
│ │ └────────┘
|
||||
│ │ 3) attach ┌────────┐
|
||||
│ │ ──────────────────► │ api │ HEAD B2 object,
|
||||
│ │ upload_ids: [id] │ │ mark claimed_at,
|
||||
│ │ └────────┘ insert image row
|
||||
└──────────┘
|
||||
```
|
||||
|
||||
Server-side enforcement summary:
|
||||
|
||||
| Check | Where | Reject if |
|
||||
|---|---|---|
|
||||
| Auth | api middleware | unauthenticated |
|
||||
| Mime allowlist | `upload_service.go:allowedContentTypes` | not in list for category |
|
||||
| Size cap (10 MB) | api before signing + B2 policy | content_length > 10 MiB |
|
||||
| Concurrency cap (10) | `CountUnclaimedActiveForUser` | already 10 unclaimed in-flight |
|
||||
| Rate limit (50/hr) | Redis sliding window `upload:presign:<uid>:<bucket>` | 51st presign in the same hour |
|
||||
| Size at upload time | B2 (signed policy) | bytes outside content-length-range |
|
||||
| Ownership at attach | `FindUnclaimedForUser` | upload_id belongs to a different user |
|
||||
| Bytes match claim | `s3.Stat()` + bytes comparison | actual size differs from expected by more than 256 bytes |
|
||||
|
||||
### Download flow (current)
|
||||
|
||||
@@ -170,34 +216,55 @@ go through the Go API.
|
||||
3. Go API fetches from B2 and streams back to the client
|
||||
|
||||
This proxies every download through the api. For high-traffic media
|
||||
that's inefficient (api becomes an egress bottleneck).
|
||||
|
||||
### Future: signed URLs
|
||||
|
||||
We could generate time-limited signed URLs for B2 objects:
|
||||
|
||||
```go
|
||||
url, err := s3Client.PresignedGetObject(ctx, bucket, key, 1*time.Hour, nil)
|
||||
```
|
||||
|
||||
Returns a URL the client can GET directly from B2, scoped to a specific
|
||||
object, valid for 1h. Saves api bandwidth and latency.
|
||||
|
||||
Not yet implemented. TODO (Chapter 20).
|
||||
that's inefficient (api becomes an egress bottleneck) — could be
|
||||
replaced with presigned GET URLs on the same bucket. Not yet shipped;
|
||||
download volume is low enough that the proxy is fine for now.
|
||||
|
||||
## Lifecycle and retention
|
||||
|
||||
We have **no lifecycle rules** set on the bucket. Objects live forever
|
||||
unless the app deletes them.
|
||||
### Orphan cleanup (`pending_uploads`)
|
||||
|
||||
When a user deletes their account, the app should delete their B2
|
||||
objects. This is currently not automated — a compliance gap for any
|
||||
"right to be forgotten" request.
|
||||
Every presign creates a row in `pending_uploads` with `expires_at =
|
||||
now + 15 min`. If the client never finishes the upload, or finishes
|
||||
but never calls the attach endpoint, the row stays unclaimed. An
|
||||
hourly cron in the worker reaps them:
|
||||
|
||||
**TODO** (Chapter 20): Either:
|
||||
- Implement explicit cleanup in the user deletion handler, or
|
||||
- Add B2 lifecycle rule tied to object metadata (tag objects with
|
||||
user ID; rule deletes tagged objects when user is soft-deleted)
|
||||
- **`maintenance:upload_cleanup`** — cron `30 * * * *`. Selects
|
||||
unclaimed rows past `expires_at`, deletes the corresponding B2
|
||||
object, deletes the row. Up to 500 per tick; the next tick picks up
|
||||
any overflow. Worker logs include `reaped` count.
|
||||
|
||||
The worker constructs a `StorageService` at startup; if storage init
|
||||
fails (e.g. `B2_KEY_ID` / `B2_APP_KEY` not wired into the worker
|
||||
deployment), the cleanup handler logs a warning and no-ops. See
|
||||
`deploy-k3s/manifests/worker/deployment.yaml` — both B2 secrets are
|
||||
required envs on this pod.
|
||||
|
||||
### Bucket lifecycle (backstop)
|
||||
|
||||
A B2 lifecycle rule on the `uploads/` prefix is the safety net if the
|
||||
worker is offline for an extended period:
|
||||
|
||||
- Hide objects 7 days after upload.
|
||||
- Delete 1 day after hidden.
|
||||
|
||||
This is configured manually via the Backblaze console (B2's S3
|
||||
lifecycle API isn't fully implemented). See
|
||||
`deploy-k3s/manifests/b2-lifecycle.md` for the exact rule and
|
||||
`b2 bucket get-info` verification command.
|
||||
|
||||
### User-deletion cascade
|
||||
|
||||
When a user deletes their account, the app deletes their `task_*` /
|
||||
`document` rows. The associated B2 objects survive — same compliance
|
||||
gap as before, not yet automated. Two approaches:
|
||||
|
||||
- Walk the image rows on user delete and `RemoveObject` each (simple,
|
||||
synchronous, slow for users with many uploads).
|
||||
- Tag objects with a `user_id` metadata header at upload time, then
|
||||
use a B2 lifecycle rule scoped to a deleted-users prefix.
|
||||
|
||||
The first approach (walking the image rows) is the next item in the upload roadmap.
|
||||
|
||||
## Backup of B2
|
||||
|
||||
|
||||
@@ -1,5 +1,11 @@
|
||||
# 10 — Secrets & Config
|
||||
|
||||
> **Updated 2026-05-15 (security remediation):** `honeydue-secrets` now
|
||||
> carries `REDIS_PASSWORD`; an `admin-basic-auth` Secret backs the admin
|
||||
> ingress; rotation is documented in `docs/runbooks/secret-rotation.md`;
|
||||
> and the Go config can read file-mounted secrets (`HONEYDUE_SECRETS_DIR`).
|
||||
> `deploy-k3s/SECURITY.md` is the authoritative current-state record.
|
||||
|
||||
## Summary
|
||||
|
||||
Non-sensitive config (hostnames, ports, feature flags, etc.) lives in
|
||||
@@ -55,7 +61,7 @@ APNS_AUTH_KEY_ID=DISABLED01
|
||||
APNS_AUTH_KEY_PATH=/secrets/apns/apns_auth_key.p8
|
||||
APNS_PRODUCTION=false
|
||||
APNS_TEAM_ID=DISABLED01
|
||||
APNS_TOPIC=com.tt.honeyDue
|
||||
APNS_TOPIC=com.myhoneydue.honeyDue
|
||||
APNS_USE_SANDBOX=false
|
||||
BASE_URL=https://myhoneydue.com
|
||||
B2_BUCKET_NAME=honeyDueProd
|
||||
|
||||
@@ -272,7 +272,7 @@ sequenceDiagram
|
||||
participant NewPod as api pod v2 (starting)
|
||||
|
||||
Note over NewPod: kubelet starts new pod
|
||||
Note over NewPod: pod connects to Postgres<br/>MigrateWithLock runs (no-op)<br/>HTTP server starts<br/>readinessProbe passes
|
||||
Note over NewPod: pod connects to Postgres<br/>RequireSchemaApplied checks goose_db_version<br/>HTTP server starts<br/>readinessProbe passes
|
||||
Note over NewPod: kube-proxy updates endpoints<br/>NewPod added to Service pool
|
||||
CF->>Traefik: request 1
|
||||
Traefik->>OldPod: routed (old pod still in pool)
|
||||
|
||||
@@ -8,23 +8,62 @@ No downtime if the change is backward-compatible. Rollback is
|
||||
`kubectl rollout undo`. This chapter walks through the full process,
|
||||
plus alternate paths (config-only changes, manifest changes, hotfixes).
|
||||
|
||||
## TL;DR for a code change
|
||||
## TL;DR using the unified deploy script
|
||||
|
||||
The recommended path. `deploy-k3s/scripts/03-deploy.sh` builds all four
|
||||
images (api, worker, admin, web), pushes to Gitea, regenerates the
|
||||
ConfigMap from `config.yaml`, applies every manifest under
|
||||
`deploy-k3s/manifests/` (including the observability vmagent), and
|
||||
waits for all rollouts.
|
||||
|
||||
```bash
|
||||
cd /Users/treyt/Desktop/code/honeyDue/honeyDueAPI-go
|
||||
git add . && git commit -m "..." && git push gitea master
|
||||
|
||||
export KUBECONFIG=~/.kube/honeydue.yaml
|
||||
bash deploy-k3s/scripts/03-deploy.sh # full build + push + rollout
|
||||
# or, to redeploy without rebuilding:
|
||||
bash deploy-k3s/scripts/03-deploy.sh --skip-build
|
||||
# or, to pin a specific tag:
|
||||
bash deploy-k3s/scripts/03-deploy.sh --tag d3708e6
|
||||
```
|
||||
|
||||
What the script does, in order:
|
||||
|
||||
1. Read registry creds from `deploy-k3s/config.yaml`.
|
||||
2. `docker login gitea.treytartt.com`.
|
||||
3. Build all four images with `--platform linux/amd64` (so arm64 Macs
|
||||
don't push images that crash on Hetzner amd64 nodes with
|
||||
"exec format error").
|
||||
4. Push to the gitea registry, plus tag and push `:latest`.
|
||||
5. Generate the env file from `config.yaml` and apply as ConfigMap
|
||||
`honeydue-config` (uses dry-run + apply for diff-free idempotence).
|
||||
6. Apply `manifests/namespace.yaml`, `redis/`, `ingress/`,
|
||||
`api/{deployment,service,hpa}`, `worker/`, `admin/`, `web/`.
|
||||
7. Apply `manifests/observability/vmagent.yaml`, substituting
|
||||
`TOKEN_PLACEHOLDER` with `OBS_INGEST_TOKEN` from `deploy/prod.env`
|
||||
(gitignored). Skipped with a warning if the token isn't present.
|
||||
8. `kubectl rollout status` for every Deployment, including vmagent.
|
||||
|
||||
~7–10 minutes for a full rebuild. ~1–2 minutes with `--skip-build`.
|
||||
|
||||
## TL;DR for a single-service code change (manual)
|
||||
|
||||
```bash
|
||||
# 1. Commit + get SHA
|
||||
cd /Users/treyt/Desktop/code/honeyDue/honeyDueAPI-go
|
||||
git add . && git commit -m "..." && SHA=$(git rev-parse --short HEAD)
|
||||
|
||||
# 2. Login to Gitea registry
|
||||
set -a; source deploy/registry.env; set +a
|
||||
printf '%s' "$REGISTRY_TOKEN" | docker login "$REGISTRY" -u "$REGISTRY_USERNAME" --password-stdin
|
||||
# 2. Login to Gitea registry (creds in config.yaml)
|
||||
docker login gitea.treytartt.com -u admin
|
||||
|
||||
# 3. Build + push amd64 image
|
||||
docker buildx build --platform linux/amd64 --target api \
|
||||
-t "gitea.treytartt.com/admin/honeydue-api:${SHA}" --push .
|
||||
docker build --platform linux/amd64 --target api \
|
||||
-t "gitea.treytartt.com/admin/honeydue-api:${SHA}" .
|
||||
docker push "gitea.treytartt.com/admin/honeydue-api:${SHA}"
|
||||
|
||||
# 4. Roll it in
|
||||
export KUBECONFIG=~/.kube/honeydue-k3s.yaml
|
||||
export KUBECONFIG=~/.kube/honeydue.yaml
|
||||
kubectl set image deployment/api -n honeydue \
|
||||
api="gitea.treytartt.com/admin/honeydue-api:${SHA}"
|
||||
|
||||
@@ -32,11 +71,18 @@ kubectl set image deployment/api -n honeydue \
|
||||
kubectl rollout status -n honeydue deployment/api
|
||||
|
||||
# 6. Log out
|
||||
docker logout "$REGISTRY"
|
||||
docker logout gitea.treytartt.com
|
||||
```
|
||||
|
||||
~3–5 minutes end to end for api.
|
||||
|
||||
> **Gotcha:** Deployments default to `imagePullPolicy: IfNotPresent`,
|
||||
> which means kubelet won't re-fetch an image with a tag it already
|
||||
> has cached locally — even if the registry now has different bytes
|
||||
> at that tag. Always change tags (use the SHA), or temporarily flip
|
||||
> `imagePullPolicy: Always` and `kubectl rollout restart` if you need
|
||||
> to overwrite a tag.
|
||||
|
||||
## The build
|
||||
|
||||
### Step 1 — Prepare
|
||||
@@ -201,6 +247,38 @@ kubectl patch secret honeydue-secrets -n honeydue \
|
||||
kubectl rollout restart -n honeydue deployment/api deployment/worker
|
||||
```
|
||||
|
||||
## One-time B2 bucket lifecycle (manual)
|
||||
|
||||
The `pending_uploads` cleanup cron (`30 * * * *` on the worker) handles
|
||||
the common case of reaping orphaned uploads. The B2 bucket lifecycle
|
||||
rule on the `uploads/` prefix is the **backstop** if the worker is
|
||||
offline for >24 hours. It's configured once via the Backblaze web
|
||||
console — B2's S3 lifecycle API isn't fully implemented, so this can't
|
||||
be in the deploy script.
|
||||
|
||||
One-time setup:
|
||||
|
||||
1. Open https://secure.backblaze.com/b2_buckets.htm → bucket
|
||||
`honeyDueProd` → **Lifecycle Settings** → **Custom**
|
||||
2. Add rule:
|
||||
- File name prefix: `uploads/`
|
||||
- Hide files older than: **7 days**
|
||||
- Delete hidden files older than: **1 day**
|
||||
|
||||
Maximum lifetime of an orphaned object under this rule alone (7 + 1): 8
|
||||
days. The worker normally reaps within an hour, so the rule should
|
||||
almost never trigger.
|
||||
|
||||
Verify:
|
||||
|
||||
```bash
|
||||
# Requires the b2 CLI: brew install b2-tools
|
||||
b2 bucket get-info honeyDueProd | jq '.lifecycleRules'
|
||||
```
|
||||
|
||||
See `deploy-k3s/manifests/b2-lifecycle.md` for the canonical rule
|
||||
definition and a curl-based fallback if the b2 CLI isn't available.
|
||||
|
||||
## Manifest changes
|
||||
|
||||
When you add/modify a deployment YAML:
|
||||
@@ -271,10 +349,47 @@ Timeline (approximate, warm state):
|
||||
- t=60s: another old pod terminates
|
||||
- ...continues until all on new RS
|
||||
|
||||
For cold-boot (e.g., first deploy on a rebuilt cluster), the
|
||||
MigrateWithLock advisory lock extends this to several minutes. But the
|
||||
rollout is serialized — only one pod starts per iteration, so the lock
|
||||
queue is small.
|
||||
Migrations run as a separate Kubernetes Job that completes before any
|
||||
api/worker pod is rolled. So the rollout above never includes migration
|
||||
work — pods that boot are guaranteed to find the schema already at the
|
||||
expected version. See §"Migrations are gated, not interleaved" below.
|
||||
|
||||
## Migrations are gated, not interleaved
|
||||
|
||||
`03-deploy.sh` runs `goose up` as a one-shot Job before applying any
|
||||
api/worker manifests:
|
||||
|
||||
```
|
||||
1. kubectl delete job honeydue-migrate (idempotent, removes prior run)
|
||||
2. kubectl apply -f manifests/migrate/job.yaml (with current api image)
|
||||
3. kubectl wait --for=condition=complete --timeout=10m job/honeydue-migrate
|
||||
4. (only if Job succeeded) kubectl apply -f manifests/api/...
|
||||
```
|
||||
|
||||
The Job uses the api image — `/usr/local/bin/goose` is baked in at
|
||||
Dockerfile build time. The Job script strips the `-pooler` segment
|
||||
from `DB_HOST` before connecting (goose's session-scoped advisory
|
||||
lock can't survive PgBouncer transaction-mode), runs `goose up`, exits.
|
||||
|
||||
If the Job fails, the script aborts before any new app pod sees a
|
||||
stale schema. To debug:
|
||||
|
||||
```bash
|
||||
kubectl -n honeydue logs job/honeydue-migrate --tail=200
|
||||
kubectl -n honeydue describe job honeydue-migrate
|
||||
```
|
||||
|
||||
After investigating, fix the migration file and re-run `03-deploy.sh`.
|
||||
The Job is idempotent — successful migrations stay applied, only the
|
||||
new/failed file gets retried.
|
||||
|
||||
api/worker pods run a `RequireSchemaApplied` check at startup that
|
||||
queries `goose_db_version` and refuses to boot if the table is missing
|
||||
or the latest row is `is_applied=false`. This is the fail-fast for
|
||||
"someone bypassed the deploy script and the schema isn't current."
|
||||
|
||||
For full schema management background, see
|
||||
[Chapter 8 §Schema management](./08-database.md).
|
||||
|
||||
## Hotfix workflow
|
||||
|
||||
@@ -314,14 +429,10 @@ Contrast: `deploy/scripts/deploy_prod.sh` (Swarm-era) did:
|
||||
9. Healthcheck the final URL; auto-rollback on failure
|
||||
10. Log out of registries
|
||||
|
||||
Our current k3s deploy is more manual but simpler. We'd write a similar
|
||||
script for k3s if deploys become frequent:
|
||||
|
||||
```bash
|
||||
# deploy-k3s/scripts/04-deploy.sh (not yet updated for Gitea)
|
||||
```
|
||||
|
||||
See the scaffold in `deploy-k3s/scripts/`.
|
||||
The current k3s replacement, `deploy-k3s/scripts/03-deploy.sh`, covers
|
||||
the same ground in fewer steps because Kubernetes does the
|
||||
versioning/rollout/health bookkeeping natively. See the TL;DR section
|
||||
at the top of this chapter.
|
||||
|
||||
## Common deploy failures
|
||||
|
||||
|
||||
+302
-164
@@ -2,15 +2,119 @@
|
||||
|
||||
## Summary
|
||||
|
||||
We have minimal observability today: `kubectl logs`, `kubectl top`,
|
||||
Cloudflare Analytics, and the Neon dashboard. No Prometheus, no Grafana,
|
||||
no centralized log aggregator, no APM. This is adequate for the
|
||||
current traffic volume (low) but is a known gap. This chapter documents
|
||||
what we *have* and what we'd add as traffic grows.
|
||||
Production has live metrics and tracing infrastructure as of 2026-04-25.
|
||||
A self-hosted **VictoriaMetrics + Jaeger + Grafana** stack runs on
|
||||
`88oakappsUpdate` (Linode VPS, also home to the self-hosted PostHog
|
||||
deployment). A `vmagent` Deployment in the honeyDue k3s namespace scrapes
|
||||
the api Pods' `/metrics` endpoint every 15 seconds and remote-writes to
|
||||
`https://obs.88oakapps.com/api/v1/write`. Grafana is at
|
||||
`https://grafana.88oakapps.com` with a pre-provisioned RED dashboard.
|
||||
|
||||
What we still don't have: log aggregation (Dozzle and `kubectl logs`
|
||||
fill the niche for now), alerting (no PagerDuty/Slack on errors), and
|
||||
full distributed tracing (OTel SDK is wired in app code but app-side
|
||||
instrumentation beyond HTTP routes hasn't shipped yet).
|
||||
|
||||
The whole observability stack costs **$0** incremental and uses ~700 MB
|
||||
RAM on `88oakappsUpdate` (5% of its free RAM). It runs as a separate
|
||||
docker-compose project from PostHog so neither product's lifecycle
|
||||
touches the other.
|
||||
|
||||
## What we have
|
||||
|
||||
### 1. `kubectl logs`
|
||||
### 1. Metrics — VictoriaMetrics + vmagent
|
||||
|
||||
```
|
||||
honeyDue k3s (Hetzner) 88oakappsUpdate (Linode)
|
||||
┌───────────────────────────┐ ┌──────────────────────────┐
|
||||
│ api Pods (3) :8000/metrics│ │ /opt/honeydue-obs/ │
|
||||
│ prometheus/client_golang│ │ ┌──────────────────┐ │
|
||||
│ │ │ │ VictoriaMetrics │ │
|
||||
│ vmagent ──── scrape 15s │ │ │ 30d retention │ │
|
||||
│ remote_write ─────┼────────────┼─→ /api/v1/write │ │
|
||||
│ (HTTPS, bearer) │ │ │ (mem 256 MB) │ │
|
||||
└───────────────────────────┘ │ └──────────────────┘ │
|
||||
└──────────────────────────┘
|
||||
```
|
||||
|
||||
The Go API exposes `/metrics` in Prometheus exposition format. Histograms
|
||||
are defined in `internal/prom/metrics.go` and registered globally:
|
||||
|
||||
| Metric | Labels | Source |
|
||||
|---|---|---|
|
||||
| `http_request_duration_seconds` | `route, method, status` | Echo middleware around every handler |
|
||||
| `gorm_query_duration_seconds` | `table, operation` | GORM before/after callbacks (no ctx threading needed) |
|
||||
| `b2_upload_duration_seconds` | `bucket, result` | Wrapped `s.backend.Write` in `internal/services/storage_service.go` |
|
||||
| `b2_upload_bytes_total` | `bucket, result` | Counter alongside the duration histogram |
|
||||
| `apns_send_duration_seconds` | `result` (`ok`/`bad_token`/`error`) | Wrapped APNs `PushWithContext` in `internal/push/apns.go` |
|
||||
| `fcm_send_duration_seconds` | `result` | Wrapped FCM HTTP v1 send in `internal/push/fcm.go` |
|
||||
| `asynq_job_duration_seconds` | `task_type, result` | Histograms registered; middleware not yet attached (Step 3) |
|
||||
| `go_*`, `process_*` | (standard) | `prometheus/client_golang/prometheus/collectors` defaults |
|
||||
|
||||
The previous custom monitoring at `/metrics` was renamed to
|
||||
`/metrics/legacy` so the canonical `/metrics` emits proper histograms
|
||||
suitable for `histogram_quantile()` rollups. The legacy endpoint stays
|
||||
because the GoAdmin dashboard reads it.
|
||||
|
||||
#### vmagent in k3s
|
||||
|
||||
Lives at `deploy-k3s/manifests/observability/vmagent.yaml`. One replica,
|
||||
`mem_limit: 256Mi`, scrapes by Kubernetes pod-discovery filtered to
|
||||
`app.kubernetes.io/name=api` and remote-writes to
|
||||
`https://obs.88oakapps.com/api/v1/write` with a bearer token from
|
||||
`OBS_INGEST_TOKEN` in `deploy/prod.env` (substituted into a Secret at
|
||||
deploy time).
|
||||
|
||||
The agent buffers locally to `/tmp/vmagent` (emptyDir, 512 MB cap), so
|
||||
brief obs outages don't drop samples. Persistent queue replays on
|
||||
reconnect.
|
||||
|
||||
NetworkPolicies in the honeydue namespace allow egress from vmagent to:
|
||||
- DNS (kube-dns / coredns)
|
||||
- Kubernetes API (`10.43.0.0/16:443`) for pod discovery
|
||||
- api Pods on `10.42.0.0/16:8000`
|
||||
- The public obs endpoint over `0.0.0.0/0:443`
|
||||
|
||||
These are scoped tight — vmagent can't reach Postgres or Redis. The one
|
||||
broad rule is `0.0.0.0/0:443`, which permits HTTPS to any external host (B2 included), not only the obs endpoint.
|
||||
|
||||
### 2. Tracing — Jaeger all-in-one
|
||||
|
||||
Jaeger 1.62 with badger storage runs alongside VictoriaMetrics. The
|
||||
collector accepts:
|
||||
- OTLP/HTTP at `https://obs.88oakapps.com/v1/traces` (bearer-token gated)
|
||||
- OTLP/gRPC at `:4317` (localhost-only)
|
||||
- Native Jaeger protocols at `:14268` etc. (localhost-only)
|
||||
|
||||
Retention: ~7 days at current scale before badger rotates. UI at
|
||||
`https://grafana.88oakapps.com` via the Jaeger datasource.
|
||||
|
||||
**Status of app-side instrumentation**: the histograms are populating
|
||||
metrics. The OTel exporter wiring in `cmd/api/main.go` is **not yet
|
||||
shipped**. When it does ship, every `POST /api/auth/login/` will produce
|
||||
a flame-graph trace with HTTP → handler → SQL → B2 → APNs spans.
|
||||
Tracking issue: gitea#3.
|
||||
|
||||
### 3. Dashboards — Grafana
|
||||
|
||||
`https://grafana.88oakapps.com` (Cloudflare-fronted, basic auth via
|
||||
Grafana itself, admin credentials in `deploy/prod.env`).
|
||||
|
||||
Datasources auto-provisioned at container startup from
|
||||
`/opt/honeydue-obs/data/grafana-provisioning/datasources/datasources.yaml`:
|
||||
- VictoriaMetrics (Prometheus type, `http://victoriametrics:8428` in-network)
|
||||
- Jaeger (`http://jaeger:16686` in-network)
|
||||
|
||||
Pre-provisioned dashboard: `honeyDue API — RED` at
|
||||
`/d/honeydue-red`. Top row uses the legacy custom metrics
|
||||
(`http_endpoint_requests_total`, `http_requests_total`) which started
|
||||
flowing the moment vmagent attached. Lower rows use the new histograms
|
||||
(`http_request_duration_seconds_bucket` p50/p95/p99 by route, GORM p95
|
||||
by table, B2 upload p95, APNs/FCM send p95, Go memory + goroutines).
|
||||
Lower rows populated immediately after the api rebuild that shipped
|
||||
`internal/prom`.
|
||||
|
||||
### 4. `kubectl logs`
|
||||
|
||||
Every container's stdout/stderr is captured by containerd and readable
|
||||
via kubectl:
|
||||
@@ -33,9 +137,10 @@ kubectl get events -n honeydue --sort-by=.lastTimestamp
|
||||
Only the last ~20 MB of logs is retained per container, on-disk on the
|
||||
node. Once a pod is deleted, its logs are gone.
|
||||
|
||||
For persistent log access we'd need aggregation (see §what we'd add).
|
||||
For persistent log access we'd need aggregation (see §What we still
|
||||
don't have).
|
||||
|
||||
### 2. `kubectl top`
|
||||
### 5. `kubectl top`
|
||||
|
||||
Pod and node resource usage via metrics-server:
|
||||
|
||||
@@ -43,43 +148,32 @@ Pod and node resource usage via metrics-server:
|
||||
kubectl top nodes
|
||||
# NAME CPU(cores) CPU(%) MEMORY(bytes) MEMORY(%)
|
||||
# ubuntu-8gb-nbg1-1 169m 4% 748Mi 9%
|
||||
# ubuntu-8gb-nbg1-2 229m 5% 1043Mi 13%
|
||||
# ubuntu-8gb-nbg1-3 124m 3% 770Mi 9%
|
||||
|
||||
kubectl top pods -n honeydue
|
||||
```
|
||||
|
||||
**Retention**: In-memory only. Last few minutes of data. No
|
||||
historical view.
|
||||
In-memory only; last few minutes of data. For historical trends use
|
||||
the Grafana dashboard, which exposes the same data via the `go_*` and
|
||||
`container_*` (kubelet cAdvisor) metrics.
|
||||
|
||||
### 3. Cloudflare Analytics
|
||||
### 6. Cloudflare Analytics
|
||||
|
||||
CF Dashboard → Analytics & Logs. Per-zone stats:
|
||||
- Requests per second
|
||||
- Bandwidth
|
||||
- Cache hit ratio
|
||||
- Top HTTP status codes
|
||||
- Top request paths
|
||||
- Bot traffic score
|
||||
CF Dashboard → Analytics & Logs. Per-zone aggregate stats:
|
||||
requests/sec, bandwidth, cache hit ratio, top status codes, top paths,
|
||||
bot traffic score. Good for spotting macro trends ("suddenly 10× more
|
||||
502s today") that wouldn't show up in a single-pod sample.
|
||||
|
||||
All aggregated, no individual request traces. Good for spotting macro
|
||||
trends ("suddenly 10× more 502s today"), poor for debugging specific
|
||||
issues.
|
||||
Free tier retention: 7 days of aggregate stats.
|
||||
|
||||
Free tier retention: 7 days of aggregate stats. Pro extends this.
|
||||
### 7. Neon dashboard
|
||||
|
||||
### 4. Neon dashboard
|
||||
Neon console → project → Monitoring: compute utilization (CU-hours),
|
||||
slow queries, active connections, storage usage. Useful for "is the
|
||||
DB busy?" and free-tier limit watching. The new
|
||||
`gorm_query_duration_seconds` histogram covers the application side
|
||||
of the same question with much better latency tail visibility.
|
||||
|
||||
Neon console → project → Monitoring:
|
||||
- Compute utilization (CU-hours consumed)
|
||||
- Query performance (slow queries)
|
||||
- Active connections
|
||||
- Storage usage
|
||||
|
||||
Good for "is the DB busy?" and "am I close to my free tier limit?"
|
||||
Not real-time.
|
||||
|
||||
### 5. Kubernetes events
|
||||
### 8. Kubernetes events
|
||||
|
||||
`kubectl get events` shows cluster-level state changes: pod scheduling,
|
||||
failures, image pulls, probe failures. Useful for post-mortem on
|
||||
@@ -87,7 +181,7 @@ deploys.
|
||||
|
||||
Retention: events are stored in etcd but are kept for only 1 hour by default.
|
||||
|
||||
## What we don't have (the gap)
|
||||
## What we still don't have
|
||||
|
||||
### No log aggregation
|
||||
|
||||
@@ -98,64 +192,108 @@ all api pod logs for user X") we have to:
|
||||
# Query all at once with stern (if installed)
|
||||
stern -n honeydue api
|
||||
|
||||
# Or for specific pod
|
||||
# Or per-pod
|
||||
kubectl logs -n honeydue <pod> | grep user_id=12345
|
||||
```
|
||||
|
||||
This works but doesn't scale. Grep across 3 pods for a specific
|
||||
user_id is OK. Across 30 pods, intractable.
|
||||
This works but doesn't scale across many pods.
|
||||
|
||||
**What we'd add**: [Loki](https://grafana.com/oss/loki/) — a lightweight
|
||||
log aggregator designed for k8s. ~$0 to self-host; integrates with
|
||||
Grafana for queries. Or [Betterstack](https://betterstack.com/logs)
|
||||
($10/mo, hosted).
|
||||
|
||||
### No metrics/dashboards
|
||||
|
||||
`kubectl top` tells us "is this pod hot right now?" but not "has CPU
|
||||
been climbing over the past hour?" We'd need:
|
||||
|
||||
- **Prometheus** — scrapes metrics from kubelet and pods' `/metrics`
|
||||
endpoints, stores time series
|
||||
- **Grafana** — queries Prometheus, renders dashboards
|
||||
|
||||
K3s can install these via Helm in ~10 minutes. Adds ~500MB RAM to the
|
||||
cluster. Stability and operational load: moderate.
|
||||
|
||||
**Alternative**: [Kubernetes Dashboard](https://github.com/kubernetes/dashboard)
|
||||
bundled with k3s (disabled by default). Minimal UI over the existing
|
||||
metrics API. Cheaper than Prometheus but less queryable.
|
||||
|
||||
### No distributed tracing
|
||||
|
||||
"This request took 800ms — which hop was slow?" is currently unanswerable
|
||||
beyond "the DB query, probably." A real trace would show:
|
||||
- TLS handshake time
|
||||
- Traefik routing time
|
||||
- Go handler time
|
||||
- Postgres query time
|
||||
- Redis call time
|
||||
- Each B2 request time
|
||||
|
||||
We'd add OpenTelemetry to the Go app and export to Jaeger/Tempo. Work
|
||||
is moderate; value kicks in when we have complex request flows.
|
||||
**What we'd add**: [Loki](https://grafana.com/oss/loki/) on
|
||||
`88oakappsUpdate` next to the existing obs stack. Adds ~512 MB RAM
|
||||
plus a Promtail (or Vector/Alloy) DaemonSet in k3s. Defer until log
|
||||
search becomes a recurring pain point — `stern` + `grep` is fine at
|
||||
current pod count.
|
||||
|
||||
### No alerting
|
||||
|
||||
No PagerDuty, no Slack webhooks, no email on "api is returning 500s."
|
||||
The operator finds out when users complain.
|
||||
|
||||
Cheapest fix: [Uptime Kuma](https://github.com/louislam/uptime-kuma)
|
||||
(self-hosted) or Better Stack Uptime (free for small teams). Ping
|
||||
`https://api.myhoneydue.com/api/health/` every minute; alert if it fails.
|
||||
Cheapest fix path:
|
||||
1. Grafana alerting (built into Grafana 11) — alert rules over the
|
||||
existing histograms (e.g., `histogram_quantile(0.95, ...) > 1s`).
|
||||
Routes to Slack via webhook. **Zero infra cost.**
|
||||
2. [Uptime Kuma](https://github.com/louislam/uptime-kuma) on
|
||||
`88oakappsUpdate` — pings `/api/health/` from outside the cluster
|
||||
every minute; complements the in-cluster view.
|
||||
|
||||
We'd want both eventually. Grafana alerting first because the data is
|
||||
already there.
|
||||
|
||||
### Distributed tracing — fully integrated
|
||||
|
||||
The OTel SDK is wired in `cmd/api/main.go` and `cmd/worker/main.go` and
|
||||
ships traces to Jaeger via `obs.88oakapps.com/v1/traces`. Every public
|
||||
service method now takes `ctx context.Context` and routes its SQL through
|
||||
`repo.WithContext(ctx)`, which means **every authenticated API endpoint
|
||||
produces a fully-nested flame graph** in Jaeger.
|
||||
|
||||
| Span source | Status |
|
||||
|---|---|
|
||||
| `otelecho.Middleware` — span per HTTP request | ✅ live |
|
||||
| Auth middleware DB lookups (`m.db.WithContext(ctx)`) | ✅ live |
|
||||
| All repos via `repo.WithContext(ctx)` (`otelgorm` plugin) | ✅ live |
|
||||
| Manual span around `storage_service.Upload` (B2 PutObject) | ✅ live |
|
||||
| Manual span around APNs `Send` / `SendWithCategory` | ✅ live |
|
||||
| Manual span around FCM `sendOne` | ✅ live |
|
||||
| Asynq middleware — span per task type with retry/payload attrs | ✅ live |
|
||||
|
||||
Migrated services (every public method takes ctx):
|
||||
- `AuthService` — login, register, refresh, logout, me, verify-email,
|
||||
forgot/reset-password, update-profile
|
||||
- `TaskService` — all 25+ task and completion methods
|
||||
- `ResidenceService` — all 15 methods including share-codes
|
||||
- `ContractorService` — all 9 methods
|
||||
- `DocumentService` — all 10 methods
|
||||
- `NotificationService` — all 12 methods
|
||||
- `SubscriptionService` — all 12 methods including Apple/Google IAP
|
||||
|
||||
Sample trace for `GET /api/tasks/` (warm cache, post-optimization):
|
||||
|
||||
```
|
||||
GET /api/tasks/ (229ms)
|
||||
└── service: SELECT * FROM task_task WHERE residence_id IN
|
||||
(SELECT id FROM residence_residence WHERE...) (227ms)
|
||||
```
|
||||
|
||||
Two spans total. The auth path runs entirely from Redis + in-memory
|
||||
cache (zero SQL queries) thanks to the 1-hour token TTL and 5-min user
|
||||
TTL. The residence-ID lookup is folded into the tasks query as a
|
||||
Postgres subquery, so a single network round-trip to Neon services the
|
||||
whole request. See Chapter 8 §"Optimizations layered on top" for the
|
||||
optimization stack.
|
||||
|
||||
Earlier trace, before the optimization stack landed (commit 88fb175):
|
||||
|
||||
```
|
||||
GET /api/tasks/ (2473ms)
|
||||
├── auth: SELECT * FROM user_authtoken WHERE key=... (1506ms)
|
||||
├── auth: SELECT * FROM auth_user WHERE id=7 (333ms)
|
||||
├── service: SELECT id FROM residence_residence WHERE... (736ms)
|
||||
└── service: SELECT * FROM task_task WHERE residence_id IN(...) (226ms)
|
||||
```
|
||||
|
||||
10× improvement from 2,473ms to 229ms by cutting query count
|
||||
(5 SQL → 1 SQL on warm cache). The 227ms in the surviving query is
|
||||
**1 transatlantic round-trip** to Neon us-east-1 from Hetzner
|
||||
Nuremberg — the physical floor on the current setup. It can be eliminated by
|
||||
migrating Neon to an EU region; tracked in [Chapter 18 §migration
|
||||
triggers](./18-cost.md) and `docs/observability-plan.md`.
|
||||
|
||||
**Migration pattern (for any future services or middleware):** add
|
||||
`ctx context.Context` as the first arg, change the handler call site
|
||||
to pass `c.Request().Context()`, and replace `s.repo.X(...)` with
|
||||
`s.repo.WithContext(ctx).X(...)`. Tests pass `context.Background()`.
|
||||
|
||||
### No APM (Application Performance Monitoring)
|
||||
|
||||
No request-level profiling. We can't see "which endpoint has the highest
|
||||
p99 latency?" or "which SQL query is hot this week?"
|
||||
No continuous profiling. We can answer "which endpoint has the highest
|
||||
p99 latency?" from the histograms, but not "where in the call stack is
|
||||
the time going?" without ad-hoc `pprof` runs.
|
||||
|
||||
Options: Datadog, New Relic, Honeycomb, self-hosted Tempo+Grafana.
|
||||
All are meaningful work to set up and cost $$$.
|
||||
If/when needed: Grafana Pyroscope is the OSS continuous profiler that
|
||||
fits our stack. Adds ~512 MB RAM. Defer until a CPU performance
|
||||
incident shows up.
|
||||
|
||||
## The app's logging conventions
|
||||
|
||||
@@ -172,28 +310,12 @@ The Go app uses zerolog and emits structured JSON:
|
||||
```
|
||||
|
||||
Log levels: `debug`, `info`, `warn`, `error`, `fatal`. Controlled by
|
||||
`DEBUG=true|false` in ConfigMap (true sets level to debug, false sets
|
||||
level to info).
|
||||
`DEBUG=true|false` in the ConfigMap (true sets level to debug, false
|
||||
sets level to info).
|
||||
|
||||
Every request is logged with:
|
||||
- Method, path, status code
|
||||
- Request ID (for correlating logs across pods)
|
||||
- User ID (if authenticated)
|
||||
- Latency
|
||||
|
||||
```json
|
||||
{
|
||||
"level": "info",
|
||||
"method": "GET",
|
||||
"path": "/api/tasks/",
|
||||
"status": 200,
|
||||
"latency_ms": 42,
|
||||
"user_id": 123,
|
||||
"request_id": "a6b5db35-..."
|
||||
}
|
||||
```
|
||||
|
||||
This is queryable by grep. Better with log aggregation.
|
||||
Every request is logged with method, path, status, request_id, user_id
|
||||
(if authenticated), latency. Queryable by grep today; ready to ingest
|
||||
into Loki when we add it.
|
||||
|
||||
## Health endpoints
|
||||
|
||||
@@ -202,71 +324,58 @@ Each service exposes a health endpoint:
|
||||
| Service | Endpoint | What it checks |
|
||||
|---|---|---|
|
||||
| api | `/api/health/` | Process alive (doesn't verify DB) |
|
||||
| api | `/api/health/live` | Process alive |
|
||||
| admin | `/` | Next.js is up |
|
||||
| worker | (none public) | Internal Asynq status |
|
||||
| api | `/metrics` | Prometheus exposition (vmagent scrapes here) |
|
||||
| api | `/metrics/legacy` | Custom monitoring metrics for GoAdmin |
|
||||
|
||||
Health endpoints are **shallow** — they return 200 if the process is
|
||||
running and listening. They don't try to reach Postgres/Redis/etc.
|
||||
Rationale: if Postgres is briefly down, we don't want all api pods to
|
||||
start failing liveness and cascade-restart.
|
||||
|
||||
## Dozzle (deprecated)
|
||||
## obs.88oakapps.com — the ingest endpoint
|
||||
|
||||
The Swarm era had [Dozzle](https://github.com/amir20/dozzle) — a
|
||||
lightweight web UI for Docker logs. Accessible via SSH tunnel to the
|
||||
manager node. Not deployed on k3s; `kubectl logs` + `stern` fills the
|
||||
niche.
|
||||
Public hostname for cross-cluster metric and trace ingest. Cloudflare
|
||||
in front, nginx on `88oakappsUpdate` enforces a bearer-token check
|
||||
before forwarding to the local VM/Jaeger containers.
|
||||
|
||||
## Kubernetes metrics the k8s API exposes
|
||||
| Path | Forwards to | Purpose |
|
||||
|---|---|---|
|
||||
| `/api/v1/write` | `http://127.0.0.1:8428` | Prometheus remote-write (vmagent → VM) |
|
||||
| `/v1/traces` | `http://127.0.0.1:4318/v1/traces` | OTLP/HTTP traces (app → Jaeger) |
|
||||
| `/health` | (returns 200) | Reachability probe — also requires auth |
|
||||
| anything else | 404 | |
|
||||
|
||||
Even without Prometheus, these are queryable:
|
||||
Token lives at `/etc/honeydue-obs/secrets.env` (mode 0600 on the box)
|
||||
and at `OBS_INGEST_TOKEN=` in `deploy/prod.env` (gitignored). To rotate:
|
||||
generate a new value, update both ends, restart vmagent.
|
||||
|
||||
```bash
|
||||
# Resource metrics (via metrics-server)
|
||||
kubectl get --raw /apis/metrics.k8s.io/v1beta1/nodes
|
||||
kubectl get --raw /apis/metrics.k8s.io/v1beta1/namespaces/honeydue/pods
|
||||
|
||||
# Core API (k8s state)
|
||||
kubectl get --raw /api/v1/namespaces/honeydue/pods/<name>
|
||||
|
||||
# Kubelet metrics (per-node; requires tunneling)
|
||||
kubectl get --raw /api/v1/nodes/<node>/proxy/metrics
|
||||
# Operator: rotate the bearer token
|
||||
NEW=$(openssl rand -hex 32)
|
||||
ssh 88oakappsUpdate "sudo sed -i 's|OBS_INGEST_TOKEN=.*|OBS_INGEST_TOKEN=$NEW|' /etc/honeydue-obs/secrets.env"
|
||||
ssh 88oakappsUpdate "sudo sed -i 's|Bearer [a-f0-9]\{64\}|Bearer $NEW|' /etc/nginx/sites-available/obs.88oakapps.com && sudo nginx -s reload"
|
||||
sed -i.bak "s|^OBS_INGEST_TOKEN=.*|OBS_INGEST_TOKEN=$NEW|" deploy/prod.env
|
||||
KUBECONFIG=~/.kube/honeydue.yaml kubectl -n honeydue create secret generic vmagent-remote-write \
|
||||
--from-literal=bearer_token=$NEW --dry-run=client -o yaml | KUBECONFIG=~/.kube/honeydue.yaml kubectl apply -f -
|
||||
KUBECONFIG=~/.kube/honeydue.yaml kubectl -n honeydue rollout restart deploy/vmagent
|
||||
```
|
||||
|
||||
If we ever spin up Prometheus, these are the endpoints it would scrape.
|
||||
## Resource budget
|
||||
|
||||
## Future: what to add and when
|
||||
| Service | mem_limit | Disk | Retention |
|
||||
|---|---|---|---|
|
||||
| VictoriaMetrics | 256 MB | 10 GB | 30 days |
|
||||
| Jaeger all-in-one (badger) | 256 MB | 10 GB | ~7 days |
|
||||
| Grafana OSS | 256 MB | 1 GB | — |
|
||||
| vmagent (in k3s) | 256 MB | 512 MB emptyDir | — |
|
||||
| **Total** | **~1 GB hard cap** | **~21 GB** | |
|
||||
|
||||
| Trigger | Add |
|
||||
|---|---|
|
||||
| 10k+ daily users | Loki + Grafana for logs |
|
||||
| 100+ req/s sustained | Prometheus + Grafana for metrics |
|
||||
| Performance incidents | OpenTelemetry tracing |
|
||||
| Revenue > $5k/mo | Paid monitoring (Datadog or similar) |
|
||||
| First production outage | Alerting to phone/Slack |
|
||||
|
||||
The overall philosophy: observability is an investment that compounds.
|
||||
Add it before you need it, not after. But also don't over-invest at
|
||||
idle.
|
||||
|
||||
**Next quarter**: set up Uptime Kuma + Loki at minimum.
|
||||
|
||||
## Checking what's installed
|
||||
|
||||
```bash
|
||||
# In kube-system namespace
|
||||
kubectl get pods -n kube-system
|
||||
# Should see: coredns, metrics-server, traefik, local-path-provisioner,
|
||||
# and some k3s-related helm install jobs
|
||||
|
||||
# In honeydue namespace
|
||||
kubectl get pods -n honeydue
|
||||
# api, admin, worker, redis
|
||||
|
||||
# No monitoring namespace (yet)
|
||||
kubectl get namespaces
|
||||
# default, honeydue, kube-node-lease, kube-public, kube-system
|
||||
```
|
||||
Resident usage at idle is much lower (~90 MB on the obs side, ~30 MB
|
||||
for vmagent). Hard limits exist so a memory leak in any one component
|
||||
can't squeeze the cohabiting PostHog stack on `88oakappsUpdate`.
|
||||
|
||||
## Operator cheat sheet
|
||||
|
||||
@@ -274,32 +383,61 @@ kubectl get namespaces
|
||||
# Tail all logs in the namespace
|
||||
kubectl logs -n honeydue --all-containers=true --tail=50 -l app.kubernetes.io/part-of=honeydue
|
||||
|
||||
# Scrape state from vmagent self-metrics
|
||||
kubectl -n honeydue exec deploy/vmagent -- wget -qO- http://127.0.0.1:8429/metrics \
|
||||
| grep -E "scrapes_total|targets|remotewrite"
|
||||
|
||||
# Force vmagent to reload scrape config
|
||||
kubectl -n honeydue rollout restart deploy/vmagent
|
||||
|
||||
# Query VictoriaMetrics directly (PromQL)
|
||||
ssh 88oakappsUpdate 'curl -s "http://127.0.0.1:8428/api/v1/query?query=histogram_quantile(0.95,sum%20by%20(route,le)(rate(http_request_duration_seconds_bucket%5B5m%5D)))" | python3 -m json.tool'
|
||||
|
||||
# Restart the obs stack on 88oakappsUpdate
|
||||
ssh 88oakappsUpdate 'cd /opt/honeydue-obs && sudo docker compose restart'
|
||||
|
||||
# Live obs container memory
|
||||
ssh 88oakappsUpdate 'sudo docker stats --no-stream | grep honeydue-obs'
|
||||
|
||||
# Pod resource usage (k3s side)
|
||||
kubectl top pods -n honeydue --sort-by=memory
|
||||
|
||||
# With stern (if installed: brew install stern)
|
||||
stern -n honeydue .
|
||||
|
||||
# Follow a specific pod's current container (pass --previous, without -f, to read the last terminated run)
|
||||
kubectl logs -n honeydue <pod> -f --previous=false
|
||||
|
||||
# Pod resource usage
|
||||
kubectl top pods -n honeydue --sort-by=memory
|
||||
kubectl top pods -n honeydue --sort-by=cpu
|
||||
|
||||
# Events (cluster-wide)
|
||||
kubectl get events -A --sort-by=.lastTimestamp | tail -20
|
||||
|
||||
# Full state dump for a pod (debugging)
|
||||
kubectl describe pod -n honeydue <pod> > /tmp/pod-dump.txt
|
||||
kubectl logs -n honeydue <pod> > /tmp/pod-logs.txt
|
||||
```
|
||||
|
||||
## Future: what to add and when
|
||||
|
||||
| Trigger | Add |
|
||||
|---|---|
|
||||
| First production incident | Grafana alerting (free, data already there) |
|
||||
| 10k+ daily users | Loki + Vector for log aggregation |
|
||||
| Performance incident the histograms can't explain | Wire OTel exporter → Jaeger from the Go app |
|
||||
| CPU pressure on api pods | Pyroscope continuous profiler |
|
||||
| Multi-product obs needs | Migrate obs stack to dedicated CX32 ($8/mo) |
|
||||
|
||||
The overall philosophy: observability is an investment that compounds.
|
||||
Add it before you need it, not after. But also don't over-invest at
|
||||
idle.
|
||||
|
||||
## References
|
||||
|
||||
- [Kubernetes metrics-server][ms]
|
||||
- [K3s metrics][k3s-metrics]
|
||||
- [Loki][loki]
|
||||
- [VictoriaMetrics docs][vm]
|
||||
- [vmagent kubernetes_sd_configs][vmagent-k8s]
|
||||
- [Jaeger all-in-one with badger][jaeger]
|
||||
- [prometheus/client_golang][promclient]
|
||||
- [Grafana provisioning datasources][gf-prov]
|
||||
- [Loki][loki] (future)
|
||||
- [Stern (multi-pod log tail)][stern]
|
||||
|
||||
[ms]: https://github.com/kubernetes-sigs/metrics-server
|
||||
[k3s-metrics]: https://docs.k3s.io/advanced#enabling-metrics-server
|
||||
[vm]: https://docs.victoriametrics.com/single-server-victoriametrics/
|
||||
[vmagent-k8s]: https://docs.victoriametrics.com/vmagent.html#kubernetes-monitoring-with-vmagent
|
||||
[jaeger]: https://www.jaegertracing.io/docs/1.62/getting-started/#all-in-one
|
||||
[promclient]: https://pkg.go.dev/github.com/prometheus/client_golang
|
||||
[gf-prov]: https://grafana.com/docs/grafana/latest/administration/provisioning/#datasources
|
||||
[loki]: https://grafana.com/oss/loki/
|
||||
[stern]: https://github.com/stern/stern
|
||||
|
||||
@@ -115,6 +115,41 @@ kubectl rollout restart deployment/coredns -n kube-system
|
||||
kubectl rollout restart deployment/metrics-server -n kube-system
|
||||
```
|
||||
|
||||
#### vmagent can't reach obs.88oakapps.com
|
||||
|
||||
**Symptom**: dashboards stop updating; vmagent logs show 401 / TLS /
|
||||
network errors against `obs.88oakapps.com`. App is unaffected.
|
||||
**Recovery**: vmagent buffers up to 512 MB locally and replays on
|
||||
reconnect, so brief outages self-heal. If sustained:
|
||||
```bash
|
||||
# Is the obs endpoint up?
|
||||
curl -s -o /dev/null -w "%{http_code}\n" https://obs.88oakapps.com/health \
|
||||
-H "Authorization: Bearer $(grep ^OBS_INGEST_TOKEN= deploy/prod.env | cut -d= -f2)"
|
||||
# 200 = ingest endpoint healthy.
|
||||
|
||||
# Inspect vmagent's failure metric
|
||||
kubectl -n honeydue exec deploy/vmagent -- wget -qO- http://127.0.0.1:8429/metrics \
|
||||
| grep -E "remotewrite_(packets|samples)_dropped|persistentqueue_blocks_dropped"
|
||||
|
||||
# Restart vmagent (forces config reload + drains queue)
|
||||
kubectl -n honeydue rollout restart deploy/vmagent
|
||||
```
|
||||
**If 88oakappsUpdate itself is down** (PostHog runs there too):
|
||||
SSH and check `sudo docker compose -f /opt/honeydue-obs/docker-compose.yml ps`.
|
||||
**Non-critical**: nothing app-facing depends on the obs stack.
|
||||
|
||||
#### Grafana dashboard shows "no data"
|
||||
|
||||
**Possible causes, in order of frequency**:
|
||||
1. New histogram name — query targets a metric the api hasn't emitted
|
||||
yet. Check `kubectl -n honeydue exec deploy/vmagent -- wget -qO- http://api:8000/metrics`
|
||||
for the metric name.
|
||||
2. vmagent isn't scraping (see above).
|
||||
3. Time range is before the obs stack came up (2026-04-25). Adjust
|
||||
the dashboard time picker.
|
||||
4. Cardinality blowup — VM rejected high-label-count series. Check
|
||||
`vm_rows_inserted_total` vs `vm_rows_dropped_total` on the obs box.
|
||||
|
||||
### Networking failures
|
||||
|
||||
#### UFW rule accidentally blocks essential traffic
|
||||
@@ -210,12 +245,58 @@ finds an empty data directory (or can't mount at all).
|
||||
- If the original node is gone: Redis starts empty. Cache regenerates.
|
||||
Asynq queue state is lost; pending jobs re-queue on retry, cron
|
||||
fires re-schedule on next tick.
|
||||
- Auth caches (token + residence-IDs) regenerate on first user
|
||||
request — first request per user pays full DB lookup, then warm
|
||||
again. Visible as a brief latency spike in the Grafana RED
|
||||
dashboard, not a functional failure.
|
||||
- Ensure the node label `honeydue/redis=true` is on a healthy node:
|
||||
```bash
|
||||
kubectl label node <new-node> honeydue/redis=true --overwrite
|
||||
kubectl label node <dead-node> honeydue/redis- 2>/dev/null || true
|
||||
```
|
||||
|
||||
#### Stale residence-IDs cache (data freshness bug)
|
||||
|
||||
**Symptom**: a user accepts a share-code or has a residence
|
||||
removed, but `/api/tasks/`, `/api/documents/`, `/api/contractors/`,
|
||||
or `/api/residences/summary/` continues to show the old
|
||||
membership for up to 5 minutes.
|
||||
**Cause**: a residence-membership-mutating code path landed
|
||||
without calling `cache.InvalidateResidenceIDsForUsers(...)`. The
|
||||
cache TTL is 5 min so the issue self-heals, but it's user-visible.
|
||||
**Recovery (immediate)**: flush the affected user's cache key
|
||||
manually. See [Chapter 17 §residence-IDs cache invalidation](./17-runbook.md).
|
||||
**Prevention (permanent)**: every mutation that changes
|
||||
`residence_residence.owner_id`, `residence_residence_users.user_id`,
|
||||
or deletes a residence MUST invalidate. Existing call sites for
|
||||
reference: `CreateResidence` (owner), `DeleteResidence`
|
||||
(all members), `JoinWithCode` (joining user), `RemoveUser`
|
||||
(removed user). The pattern lives in
|
||||
`internal/services/residence_id_cache.go`.
|
||||
|
||||
#### Redis at maxmemory limit
|
||||
|
||||
**Symptom**: Redis logs `OOM command not allowed when used memory > 'maxmemory'`.
|
||||
Should be rare — current production usage is ~2.4 MB against a 256 MB
|
||||
limit and the policy is `allkeys-lru` (cache writes evict cold keys
|
||||
instead of erroring).
|
||||
**Recovery**: confirm the policy is still `allkeys-lru`:
|
||||
```bash
|
||||
kubectl -n honeydue exec deploy/redis -- redis-cli CONFIG GET maxmemory-policy
|
||||
```
|
||||
If it's somehow `noeviction`, set it live:
|
||||
```bash
|
||||
kubectl -n honeydue exec deploy/redis -- redis-cli CONFIG SET maxmemory-policy allkeys-lru
|
||||
```
|
||||
And re-apply the manifest at `deploy-k3s/manifests/redis/deployment.yaml`
|
||||
so the change survives a pod restart.
|
||||
|
||||
If memory usage is genuinely climbing toward the cap, check for
|
||||
runaway keys without TTLs:
|
||||
```bash
|
||||
kubectl -n honeydue exec deploy/redis -- redis-cli --bigkeys
|
||||
```
|
||||
|
||||
### External service failures
|
||||
|
||||
#### Neon Postgres outage
|
||||
@@ -229,6 +310,72 @@ until Neon is back.
|
||||
Postgres-level failover.
|
||||
**Frequency**: Neon has had a handful of hours-scale outages since launch.
|
||||
|
||||
#### Neon pooler endpoint unreachable but direct endpoint up
|
||||
|
||||
**Symptom**: `dial tcp ep-floral-truth-amttbc5a-pooler.c-5...: i/o
|
||||
timeout` in api logs but the direct compute endpoint is reachable.
|
||||
Rare — Neon's pooler runs in their infra alongside compute — but
|
||||
possible during pooler maintenance.
|
||||
**Recovery (emergency)**: switch `DB_HOST` in `config.yaml` from the
|
||||
`-pooler` to the direct hostname (drop the `-pooler` segment),
|
||||
re-apply ConfigMap, rolling-restart api and worker:
|
||||
```bash
|
||||
# Edit deploy-k3s/config.yaml: database.host: ep-floral-truth-amttbc5a.c-5...
|
||||
# Then:
|
||||
KUBECONFIG=~/.kube/honeydue.yaml bash deploy-k3s/scripts/03-deploy.sh --skip-build
|
||||
```
|
||||
Cold-handshake latency goes back up (~440ms first hit) but the API
|
||||
keeps serving. Switch back when the pooler recovers.
|
||||
|
||||
#### Migrate Job fails during deploy
|
||||
|
||||
**Symptom**: `03-deploy.sh` aborts at the migrations step:
|
||||
```
|
||||
[deploy][error] migrations did not complete cleanly; aborting deploy
|
||||
```
|
||||
api/worker pods are NOT updated — they keep running the previous
|
||||
revision. This is the intentional fail-fast.
|
||||
|
||||
**Recovery**:
|
||||
```bash
|
||||
# 1. See the failure
|
||||
kubectl -n honeydue logs job/honeydue-migrate --tail=200
|
||||
|
||||
# 2. Common cause: a SQL error in the migration file. Fix the file
|
||||
# locally, commit, retry the deploy. The Job is idempotent —
|
||||
# successful prior versions stay applied; only the failed file
|
||||
# re-runs.
|
||||
git add migrations/000NNN_*.sql
|
||||
git commit -m "Fix migration NNN"
|
||||
git push gitea master
|
||||
bash deploy-k3s/scripts/03-deploy.sh
|
||||
|
||||
# 3. Other cause: Neon down or auth changed. Test direct connection:
|
||||
DB_PASS=$(kubectl -n honeydue get secret honeydue-secrets \
|
||||
-o jsonpath='{.data.POSTGRES_PASSWORD}' | base64 -d)
|
||||
docker run --rm -e PGPASSWORD="$DB_PASS" postgres:17-alpine \
|
||||
psql "host=ep-floral-truth-amttbc5a.c-5.us-east-1.aws.neon.tech \
|
||||
user=neondb_owner dbname=honeyDue sslmode=require" -c "SELECT 1;"
|
||||
```
|
||||
**Why no automatic retry**: `backoffLimit: 0` on the Job is deliberate.
|
||||
A failing migration almost never gets unstuck by retrying — it needs an
|
||||
operator to look. See [Chapter 17 §27](./17-runbook.md) for recovery
|
||||
playbook.
|
||||
|
||||
#### api refuses to start: "Schema precondition failed"
|
||||
|
||||
**Symptom**: api pods log `Schema precondition failed` and exit
|
||||
immediately after DB connect.
|
||||
**Cause**: `goose_db_version` table is missing or its latest row has
|
||||
`is_applied=false`. This means the migrate Job either was never run or
|
||||
ran and rolled back.
|
||||
**Recovery**: run the migrate Job manually (see
|
||||
[Chapter 17 §26](./17-runbook.md)). After it completes successfully,
|
||||
restart the api Deployment so the failing pods are replaced and re-run the schema check:
|
||||
```bash
|
||||
kubectl -n honeydue rollout restart deploy/api
|
||||
```
|
||||
|
||||
#### Backblaze B2 outage
|
||||
|
||||
**Symptom**: image uploads fail; image downloads fail unless cached by
|
||||
|
||||
@@ -358,6 +358,165 @@ Workaround: in each pod's logs, search for a unique user identifier:
|
||||
stern -n honeydue api | grep "user_id=12345"
|
||||
```
|
||||
|
||||
## 23. Invalidate residence-IDs cache for a user
|
||||
|
||||
Used when a user reports stale data ("I joined a residence but my
|
||||
tasks list still shows the old one"). The cache is keyed on user ID
|
||||
with 5-min TTL — most issues self-heal — but you can flush manually.
|
||||
|
||||
```bash
|
||||
# Single user
|
||||
kubectl -n honeydue exec deploy/redis -- redis-cli DEL "residence_ids_user:7"
|
||||
|
||||
# All users (nuclear; everyone pays one DB lookup on next request)
|
||||
kubectl -n honeydue exec deploy/redis -- redis-cli --scan --pattern "residence_ids_user:*" \
|
||||
| xargs -r -n 100 kubectl -n honeydue exec deploy/redis -- redis-cli DEL
|
||||
```
|
||||
|
||||
Mutation paths that should invalidate this cache automatically (any
|
||||
new code that changes membership must call
|
||||
`cache.InvalidateResidenceIDsForUsers(ctx, userIDs...)`):
|
||||
|
||||
- `ResidenceService.CreateResidence` → owner
|
||||
- `ResidenceService.DeleteResidence` → all members
|
||||
- `ResidenceService.JoinWithCode` → joining user
|
||||
- `ResidenceService.RemoveUser` → removed user
|
||||
|
||||
If a user keeps reporting stale data, grep for missing invalidation:
|
||||
|
||||
```bash
|
||||
grep -rn "residenceRepo.*Add\|RemoveUser\|residence_residence_users" internal/ \
|
||||
| grep -v cache | grep -v _test
|
||||
```
|
||||
|
||||
## 24. Verify DB pool warm-up is working
|
||||
|
||||
After a deploy, check the api pod log for the warm-up confirmation:
|
||||
|
||||
```bash
|
||||
kubectl -n honeydue logs -l app.kubernetes.io/name=api --tail=50 \
|
||||
| grep "DB pool warm-up complete"
|
||||
```
|
||||
|
||||
Expected output (per pod):
|
||||
|
||||
```json
|
||||
{"level":"info","requested":20,"warmed":20,"message":"DB pool warm-up complete"}
|
||||
```
|
||||
|
||||
If `warmed` < `requested`, the pool partially failed at boot — pod
|
||||
still starts, fills from there. If `warmed=0`, something's wrong with
|
||||
either Neon connectivity or auth — check the next log line for the
|
||||
specific error.
|
||||
|
||||
To test impact: hit the api right after a rollout. With warm-up
|
||||
working, the first request should be ~250ms (1 RTT). Without warm-up,
|
||||
the first request is ~700ms (full handshake).
|
||||
|
||||
## 25. Switch DB host between pooler and direct endpoints
|
||||
|
||||
The pooler endpoint (`-pooler` suffix) is the default — it cuts
|
||||
cold-handshake latency by ~3 RTTs. The direct endpoint
|
||||
(`ep-floral-truth-amttbc5a.c-5...`) is the fallback.
|
||||
|
||||
```bash
|
||||
# Edit deploy-k3s/config.yaml — change database.host
|
||||
# To pooler: ep-floral-truth-amttbc5a-pooler.c-5.us-east-1.aws.neon.tech
|
||||
# To direct: ep-floral-truth-amttbc5a.c-5.us-east-1.aws.neon.tech
|
||||
|
||||
KUBECONFIG=~/.kube/honeydue.yaml bash deploy-k3s/scripts/03-deploy.sh --skip-build
|
||||
```
|
||||
|
||||
The pooler runs in transaction mode so any session-scope feature
|
||||
(LISTEN/NOTIFY, session advisory locks) won't work over it. Migrations
|
||||
already handle this — the migrate Job script strips `-pooler` from
|
||||
`DB_HOST` before invoking goose. If you add new session-level features
|
||||
in the data path, they'll need the same workaround.
|
||||
|
||||
## 26. Run migrations manually (rare)
|
||||
|
||||
Day-to-day, migrations run as part of every `03-deploy.sh`. But
|
||||
sometimes you want to apply or inspect them outside a deploy:
|
||||
|
||||
```bash
|
||||
# Direct-endpoint DSN (goose's advisory lock won't survive the pooler)
|
||||
DB_PASS=$(kubectl -n honeydue get secret honeydue-secrets \
|
||||
-o jsonpath='{.data.POSTGRES_PASSWORD}' | base64 -d)
|
||||
export DATABASE_URL="host=ep-floral-truth-amttbc5a.c-5.us-east-1.aws.neon.tech \
|
||||
port=5432 user=neondb_owner password=$DB_PASS \
|
||||
dbname=honeyDue sslmode=require"
|
||||
|
||||
# What's pending? (read-only; safe to run anytime)
|
||||
make migrate-status
|
||||
|
||||
# Apply pending migrations (or `goose -dir migrations postgres "$DATABASE_URL" up`)
|
||||
make migrate-up
|
||||
|
||||
# Roll back the most recent migration
|
||||
make migrate-down
|
||||
|
||||
# Scaffold a new migration file
|
||||
make migrate-new name=add_widget_count_to_residences
|
||||
# → migrations/000002_add_widget_count_to_residences.sql
|
||||
# Edit, then `make migrate-up` to test, then commit.
|
||||
```
|
||||
|
||||
To run goose from inside the cluster (e.g., to bypass a network policy
|
||||
that blocks Neon from your laptop), use the migrate Job manifest as a
|
||||
one-shot:
|
||||
|
||||
```bash
|
||||
# Re-runs the latest migrate Job with whatever args you need
|
||||
kubectl -n honeydue delete job honeydue-migrate --ignore-not-found
|
||||
sed "s|image: IMAGE_PLACEHOLDER|image: $(kubectl -n honeydue get deploy api -o jsonpath='{.spec.template.spec.containers[0].image}')|" \
|
||||
deploy-k3s/manifests/migrate/job.yaml | kubectl apply -f -
|
||||
kubectl -n honeydue wait --for=condition=complete --timeout=5m job/honeydue-migrate
|
||||
kubectl -n honeydue logs job/honeydue-migrate
|
||||
```
|
||||
|
||||
## 27. Recover from a failed/dirty migration
|
||||
|
||||
If `goose up` fails partway through, the migration file's transaction
|
||||
rolls back and `goose_db_version` reflects the last *complete*
|
||||
version. Goose marks no row as "dirty" — that's a golang-migrate
|
||||
concept. So recovery is just: fix the migration file, re-run.
|
||||
|
||||
If you've genuinely corrupted state (dropped tables you shouldn't have,
|
||||
applied a destructive migration in error):
|
||||
|
||||
```bash
|
||||
# See current goose state
|
||||
make migrate-status
|
||||
psql "$DATABASE_URL" -c \
|
||||
"SELECT version_id, is_applied, tstamp FROM goose_db_version ORDER BY id DESC LIMIT 10;"
|
||||
|
||||
# To force the version table back to a known-good number after
|
||||
# manually fixing the schema:
|
||||
psql "$DATABASE_URL" -c \
|
||||
"INSERT INTO goose_db_version (version_id, is_applied, tstamp) VALUES (<N>, true, NOW());"
|
||||
```
|
||||
|
||||
## 28. Bootstrap goose on a fresh clone of the schema
|
||||
|
||||
If you create a new Neon branch / dev DB and need to bring it under
|
||||
goose management:
|
||||
|
||||
```bash
|
||||
export DATABASE_URL="...<the new DB>..."
|
||||
|
||||
# Option A: fresh DB, no schema → just run up
|
||||
make migrate-up
|
||||
|
||||
# Option B: schema already populated (e.g., restored from a dump) →
|
||||
# mark v1 as already-applied
|
||||
goose -dir migrations postgres "$DATABASE_URL" version # creates table
|
||||
psql "$DATABASE_URL" -c \
|
||||
"INSERT INTO goose_db_version (version_id, is_applied, tstamp) VALUES (1, true, NOW());"
|
||||
```
|
||||
|
||||
This is also what was done for the live prod DB at goose-adoption time
|
||||
(commit `12b2f9d`).
|
||||
|
||||
## References
|
||||
|
||||
- [kubectl cheat sheet][kubectl-cs]
|
||||
|
||||
@@ -58,6 +58,20 @@ honeyDue.
|
||||
|---|---:|
|
||||
| Gitea container registry | **$0** |
|
||||
|
||||
### Observability (88oakappsUpdate)
|
||||
|
||||
VictoriaMetrics + Jaeger + Grafana co-tenant on the existing Linode
|
||||
VPS that hosts PostHog. ~700 MB RAM, 21 GB disk — fits inside the
|
||||
existing instance. Not charged to honeyDue.
|
||||
|
||||
| Item | Monthly |
|
||||
|---|---:|
|
||||
| Self-hosted obs stack on `88oakappsUpdate` | **$0** |
|
||||
|
||||
Migration trigger: when the obs stack starts pressuring PostHog or
|
||||
needs hard isolation, move to a dedicated Hetzner CX32 (~$8/mo).
|
||||
See [Chapter 15 — When to move off](./15-observability.md).
|
||||
|
||||
### Total infrastructure
|
||||
|
||||
| Category | Monthly |
|
||||
@@ -67,6 +81,7 @@ honeyDue.
|
||||
| Storage | ~$0.30 |
|
||||
| Edge | $0 |
|
||||
| Registry | $0 |
|
||||
| Observability | $0 |
|
||||
| **Total** | **~$30** |
|
||||
|
||||
## External SaaS
|
||||
|
||||
@@ -397,6 +397,35 @@ should reflect reality, not be optimistic.
|
||||
**Moral**: Healthchecks should be realistic, not aspirational. Know
|
||||
what your app actually does at startup.
|
||||
|
||||
#### Postscript (2026-04-26): the whole `MigrateWithLock` shape was wrong
|
||||
|
||||
A few months after the Swarm migration, switching `DB_HOST` to Neon's
|
||||
`-pooler` endpoint for runtime perf wins broke this code completely:
|
||||
`pg_advisory_lock` is session-scoped, but PgBouncer transaction-mode
|
||||
multiplexes statements across backend Postgres sessions, so the lock
|
||||
appeared to be held but actually wasn't. Pods hung at
|
||||
"Acquiring migration advisory lock..." and the startup probe killed
|
||||
them in turn.
|
||||
|
||||
After a brief band-aid (route migrations through the direct endpoint;
|
||||
bump probe to 600s to absorb 5-minute AutoMigrate runs over the slow
|
||||
direct connection — both reverted), we abandoned the runtime-side
|
||||
migration story entirely and adopted [pressly/goose](https://github.com/pressly/goose)
|
||||
in commit `12b2f9d`:
|
||||
|
||||
- Migrations run as a one-shot Kubernetes Job before any api/worker
|
||||
pod rolls. No more in-replica migration, no more advisory lock,
|
||||
no more startup probe gymnastics.
|
||||
- `RequireSchemaApplied` checks `goose_db_version` at startup and
|
||||
refuses to boot on a stale schema — fail-fast for "operator
|
||||
forgot to run migrate," instead of mysterious runtime errors.
|
||||
- `failureThreshold` reverted to its pre-MigrateWithLock value.
|
||||
Pods boot in seconds again.
|
||||
|
||||
See [Chapter 8 §Schema management](./08-database.md) for the goose
|
||||
shape. This entire sub-section is preserved as historical context
|
||||
for why we walked the path we did.
|
||||
|
||||
## What we learned
|
||||
|
||||
### Docker Swarm is in a bad place in 2026
|
||||
|
||||
@@ -69,20 +69,22 @@ Flexible to Full (strict). Verified by:
|
||||
- CF edge continues to serve its own Let's Encrypt cert to browsers
|
||||
- both layers now TLS-encrypted
|
||||
|
||||
### Migration Job for schema changes
|
||||
### ~~Migration Job for schema changes~~ — done (2026-04-26, commit 12b2f9d)
|
||||
|
||||
**Why**: Currently every api pod runs `MigrateWithLock()` on startup,
|
||||
serializing on a Postgres advisory lock. Adds 90-240s to cold startup
|
||||
and caused bug #13 in Chapter 19.
|
||||
**What shipped**: pressly/goose as the migration tool, run as a one-shot
|
||||
Kubernetes Job from `deploy-k3s/manifests/migrate/job.yaml` before
|
||||
api/worker rollout. The Job uses the api image (goose CLI is baked in
|
||||
during the Dockerfile build), strips `-pooler` from `DB_HOST` for the
|
||||
direct-endpoint connection migrations need, and exits in seconds when
|
||||
there's nothing to apply. `RequireSchemaApplied` in the api/worker
|
||||
startup checks `goose_db_version` and fails fast on a stale schema.
|
||||
|
||||
**How**: Create a Kubernetes `Job` resource that runs the api image
|
||||
with a `--migrate-only` flag. Job runs once per deploy, completes when
|
||||
schema is current. api pods get an initContainer that waits for the
|
||||
Job to complete.
|
||||
The Go-code-with-`--migrate-only` shape originally proposed here was
|
||||
rejected in favor of using the upstream goose binary directly — see
|
||||
[Chapter 8 §Schema management](./08-database.md) for the trade-offs.
|
||||
|
||||
Requires Go code change to support `--migrate-only` flag.
|
||||
|
||||
**Effort**: 3-4 hours (code + job manifest + testing).
|
||||
Pre-goose `MigrateWithLock` is gone; ch19 §13 has the historical
|
||||
postmortem context.
|
||||
|
||||
### Redis password
|
||||
|
||||
|
||||
@@ -40,7 +40,7 @@ they do, and how to operate them.
|
||||
|
||||
- [07 — Services](./07-services.md) — api, admin, worker, redis per-service deep dive
|
||||
- [08 — Database](./08-database.md) — Neon Postgres, goose migrations
|
||||
- [09 — Storage](./09-storage.md) — Backblaze B2, minio-go client details
|
||||
- [09 — Storage](./09-storage.md) — Backblaze B2, minio-go, presigned-URL direct uploads
|
||||
- [10 — Secrets & Config](./10-secrets-config.md) — ConfigMap, Secret, env mapping
|
||||
- [11 — Registry](./11-registry.md) — Gitea container registry, multi-arch builds
|
||||
|
||||
@@ -48,7 +48,7 @@ they do, and how to operate them.
|
||||
|
||||
- [12 — Data Flow](./12-data-flow.md) — end-to-end request lifecycle
|
||||
- [14 — Deployment Process](./14-deployment-process.md) — how to roll new code
|
||||
- [15 — Observability](./15-observability.md) — logs, metrics, tracing
|
||||
- [15 — Observability](./15-observability.md) — VictoriaMetrics + Jaeger + Grafana on `obs.88oakapps.com`, vmagent in-cluster, Prometheus histograms in the Go API
|
||||
- [16 — Failure Modes](./16-failure-modes.md) — what happens when X dies
|
||||
- [17 — Runbook](./17-runbook.md) — common ops tasks
|
||||
|
||||
|
||||
@@ -173,11 +173,21 @@ suffix. (Chapter 8)
|
||||
## Go + Asynq
|
||||
|
||||
**AutoMigrate**: GORM function that syncs DB schema to Go structs.
|
||||
(Chapter 8)
|
||||
We used this in production until 2026-04, replaced by goose. Tests
|
||||
still use it via `testutil.SetupTestDB`. (Chapter 8)
|
||||
|
||||
**Asynq**: Go library for background job queues. Redis-backed.
|
||||
(Chapter 7)
|
||||
|
||||
**goose**: pressly/goose — the SQL migration tool we use in production
|
||||
(commit 12b2f9d onward). Migration files live in `migrations/`, one
|
||||
file per version with `-- +goose Up` / `-- +goose Down` markers.
|
||||
(Chapter 8)
|
||||
|
||||
**goose_db_version**: goose's version-tracking table. One row per
|
||||
applied migration. `RequireSchemaApplied` reads the latest row at
|
||||
api/worker startup to fail fast on a stale schema. (Chapter 8)
|
||||
|
||||
**GORM**: Go ORM we use. (Chapter 8)
|
||||
|
||||
**pgx**: Go Postgres driver used by GORM. (Chapter 8)
|
||||
|
||||
@@ -278,6 +278,43 @@ ssh -i ~/.ssh/hetzner deploy@<node> 'sudo systemctl start k3s'
|
||||
# then re-join via the k3s install command
|
||||
```
|
||||
|
||||
## Observability
|
||||
|
||||
```bash
|
||||
# Hit api /metrics from inside the cluster
|
||||
kubectl -n honeydue exec deploy/vmagent -- wget -qO- http://api:8000/metrics | head -30
|
||||
|
||||
# vmagent self-stats: scrapes succeeded, samples shipped, queue health
|
||||
kubectl -n honeydue exec deploy/vmagent -- wget -qO- http://127.0.0.1:8429/metrics \
|
||||
| grep -E "scrapes_total|targets|remotewrite_samples_dropped|persistentqueue_blocks_dropped"
|
||||
|
||||
# Force vmagent to reload config (after editing the ConfigMap)
|
||||
kubectl -n honeydue rollout restart deploy/vmagent
|
||||
|
||||
# Query VictoriaMetrics by SSH'ing to the obs box
|
||||
ssh 88oakappsUpdate 'curl -s "http://127.0.0.1:8428/api/v1/query?query=up"'
|
||||
|
||||
# p95 latency by route, last 5m
|
||||
ssh 88oakappsUpdate 'curl -s "http://127.0.0.1:8428/api/v1/query?query=histogram_quantile(0.95,sum%20by%20(route,le)(rate(http_request_duration_seconds_bucket%5B5m%5D)))" | python3 -m json.tool'
|
||||
|
||||
# All metric names landing in VM
|
||||
ssh 88oakappsUpdate 'curl -s http://127.0.0.1:8428/api/v1/label/__name__/values | python3 -m json.tool'
|
||||
|
||||
# Restart the obs stack on 88oakappsUpdate (VM + Jaeger + Grafana)
|
||||
ssh 88oakappsUpdate 'cd /opt/honeydue-obs && sudo docker compose restart'
|
||||
|
||||
# Live RAM usage of the obs containers
|
||||
ssh 88oakappsUpdate 'sudo docker stats --no-stream | grep honeydue-obs'
|
||||
|
||||
# Test the obs ingest endpoint with auth
|
||||
TOKEN=$(grep ^OBS_INGEST_TOKEN= deploy/prod.env | cut -d= -f2-)
|
||||
curl -s -o /dev/null -w "%{http_code}\n" https://obs.88oakapps.com/health \
|
||||
-H "Authorization: Bearer $TOKEN" # 200 = healthy
|
||||
```
|
||||
|
||||
Dashboards live at `https://grafana.88oakapps.com/d/honeydue-red`.
|
||||
Admin credentials in `deploy/prod.env`.
|
||||
|
||||
## One-liners worth memorizing
|
||||
|
||||
```bash
|
||||
|
||||
@@ -65,7 +65,9 @@ Every external link cited anywhere in this book, grouped by topic.
|
||||
- [Neon usage-based pricing announcement][neon-blog]
|
||||
- [Neon connect from any app][neon-connect]
|
||||
- [Postgres advisory locks][pg-locks]
|
||||
- [GORM AutoMigrate][gorm-automigrate]
|
||||
- [GORM AutoMigrate][gorm-automigrate] (tests only — production migrations use goose)
|
||||
- [pressly/goose — SQL migration tool][goose]
|
||||
- [Goose documentation][goose-docs]
|
||||
|
||||
## Backblaze B2
|
||||
|
||||
@@ -168,6 +170,8 @@ Every external link cited anywhere in this book, grouped by topic.
|
||||
[neon-connect]: https://neon.com/docs/connect/connect-from-any-app
|
||||
[pg-locks]: https://www.postgresql.org/docs/current/explicit-locking.html#ADVISORY-LOCKS
|
||||
[gorm-automigrate]: https://gorm.io/docs/migration.html
|
||||
[goose]: https://github.com/pressly/goose
|
||||
[goose-docs]: https://pressly.github.io/goose/
|
||||
|
||||
<!-- B2 -->
|
||||
[b2-docs]: https://www.backblaze.com/docs/
|
||||
|
||||
@@ -0,0 +1,166 @@
|
||||
# Observability Plan — honeyDue (100% self-hosted)
|
||||
|
||||
**Goal:** Live request-timing visibility (HTTP, DB, B2 uploads, APNs, asynq jobs) without paying any SaaS vendor.
|
||||
|
||||
**Deployment target:** `88oakappsUpdate` (Linode VPS at `185.143.228.16`, Ubuntu 24.04, 8 vCPU / 32 GB RAM / 193 GB disk). This box already runs the self-hosted PostHog stack and has nginx + Let's Encrypt set up for `*.88oakapps.com`. Free RAM at rest ≈ 15 GB; the obs stack budget is ≈ 700 MB → ~5% of free RAM. Costs $0 incremental.
|
||||
|
||||
**Why not in the honeyDue k3s cluster:** Frees ~700 MB across the 3 Hetzner nodes, no PVC plumbing, and no need to expose anything from k3s — everything is push-from-app to a public TLS endpoint.
|
||||
|
||||
**Status:** Fully shipped. VictoriaMetrics + Jaeger + Grafana run on `obs.88oakapps.com`, vmagent runs in-cluster, and the OTel SDK and otelgorm are wired into the api+worker. Every authed endpoint produces nested HTTP→service→SQL flame graphs in Jaeger.
|
||||
|
||||
The first round of traces revealed every visible ms was network/proxy overhead — DB execution itself is sub-millisecond. The follow-up work (`internal/services/residence_id_cache.go`, GORM pool warm-up, auth-query JOIN consolidation, switching `DB_HOST` to Neon's `-pooler` endpoint, bumped cache TTLs) cut warm-cache `/api/tasks/` from 2,473 ms / 5 spans to **229 ms / 2 spans** — see commit `88fb175` and Chapter 8 §"Optimizations layered on top".
|
||||
|
||||
---
|
||||
|
||||
## Stack
|
||||
|
||||
| Role | Choice | Why this vs. the obvious alternative |
|
||||
|---|---|---|
|
||||
| Metrics store | **VictoriaMetrics** (single-node) | Drop-in Prometheus-compatible. ~2.5× lower RAM (~200 MB vs ~500 MB) and ~7× better compression. Single binary. |
|
||||
| Tracing | **Jaeger all-in-one** | ~150 MB RAM with embedded badger storage. Tempo monolithic mode needs 1-2 GB minimum — overkill for honeyDue's scale. |
|
||||
| Dashboards | **Grafana OSS** | Connects to both VM (Prometheus protocol) and Jaeger natively. |
|
||||
| App instrumentation | **OpenTelemetry SDK** + `prometheus/client_golang` | OTel is vendor-neutral — backends are swappable without code change. |
|
||||
| Logs | **Keep Dozzle**; add Loki only when log search becomes painful | Loki adds ~512 MB RAM + a daemonset for log shipping. Not worth it until there's a concrete pain point. |
|
||||
|
||||
### Why not the LGTM stack (Loki + Grafana + Tempo + Mimir)?
|
||||
|
||||
- **Tempo** wants 1-2 GB RAM minimum in monolithic mode ([Grafana community report](https://community.grafana.com/t/tempo-ram-usage-for-6k-spans-per-hour/63801)). Stacking that on top of Loki + Mimir would consume ~3-4 GB RAM. On a 3×8 GB cluster that's 12-17% of capacity for observability infra.
|
||||
- **Mimir** is wonderful for multi-tenant Prometheus at scale — you have one tenant.
|
||||
- **Loki** is great if you live in `kubectl logs` and need full-text search across them. You currently use Dozzle and are not feeling that pain.
|
||||
|
||||
VictoriaMetrics + Jaeger all-in-one gives you 90% of the value at 25% of the resource cost.
|
||||
|
||||
---
|
||||
|
||||
## Resource budget on `88oakappsUpdate`
|
||||
|
||||
Three Docker containers in a separate compose project under `/opt/honeydue-obs/` — fully isolated from the existing PostHog compose stack so PostHog's lifecycle never touches the obs stack and vice versa.
|
||||
|
||||
| Service | `mem_limit` | Disk (bind mount) | Retention |
|
||||
|---|---|---|---|
|
||||
| VictoriaMetrics single-node | 256 MB | 10 GB | 30 days metrics |
|
||||
| Jaeger all-in-one (badger storage) | 256 MB | 10 GB | 7 days traces |
|
||||
| Grafana OSS | 256 MB | 1 GB | — |
|
||||
| **Total** | **~768 MB hard cap** | **21 GB** | |
|
||||
|
||||
**~5% of the box's free RAM and ~14% of free disk.** The hard `mem_limit` per container matters: ClickHouse on the same VM can spike under PostHog analytics load, so bounding the obs stack prevents it from competing in a memory pinch.
|
||||
|
||||
**Don't reuse PostHog's ClickHouse / Kafka / Redis.** Tempting because they're sitting right there, but coupling honeyDue's observability to PostHog's storage means a PostHog incident takes honeyDue's incident-response telemetry down with it. Keep them fully separate.
|
||||
|
||||
**Shared blast radius caveat:** A kernel panic on `88oakappsUpdate` loses both PostHog and honeyDue obs at once. At current scale, fine — call it out, don't fix.
|
||||
|
||||
---
|
||||
|
||||
## App-side instrumentation
|
||||
|
||||
| Surface | Library / approach | Import path |
|
||||
|---|---|---|
|
||||
| Echo HTTP middleware | `otelecho` — span per request, tagged route/method/status | `go.opentelemetry.io/contrib/instrumentation/github.com/labstack/echo/otelecho` |
|
||||
| GORM queries | `uptrace/otelgorm` plugin — `db.Use(otelgorm.NewPlugin())`. Requires threading `ctx` through repositories so `db.WithContext(ctx)` works. | `github.com/uptrace/opentelemetry-go-extra/otelgorm` |
|
||||
| B2 / minio-go uploads | Manual span around `storage_service.Upload` with attributes for bucket, object size, MIME type | `go.opentelemetry.io/otel` |
|
||||
| APNs / FCM | Manual span in `internal/push/apns.go` and `fcm.go`; record a hash of the device token (never the raw token) and the response status code | `go.opentelemetry.io/otel` |
|
||||
| asynq jobs | Custom `asynq.MiddlewareFunc` (~20 lines) — span per task type, attached to ctx, records duration + retry count | `go.opentelemetry.io/otel` + `asynq.MiddlewareFunc` |
|
||||
| Prometheus `/metrics` endpoint | `prometheus/client_golang` direct — register histograms for HTTP duration / GORM op / B2 op / APNs send | `github.com/prometheus/client_golang/prometheus`, `.../prometheus/promhttp` |
|
||||
| OTLP exporter | OTLP/HTTP → `https://obs.88oakapps.com/v1/traces` with bearer token. 100% sample in dev, 10% in prod. | `go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp` |
|
||||
| Metrics push | `vmagent` sidecar in k3s scrapes the api Pod's `/metrics` and remote-writes to `https://obs.88oakapps.com/api/v1/write` with bearer token. Cleaner than exposing `/metrics` publicly. | `victoriametrics/vmagent` image |
|
||||
|
||||
**Note on GORM context propagation:** the existing repository methods don't take `ctx context.Context`. Adding `otelgorm` requires plumbing ctx down from the Echo handler through the service layer to the repository call site. ~10 repository files, many call sites. Save for last because the diff is large.
|
||||
|
||||
---
|
||||
|
||||
## Implementation order (smallest first)
|
||||
|
||||
### Step 1 — Metrics + dashboards (highest immediate ROI)
|
||||
|
||||
**On `88oakappsUpdate`:**
|
||||
1. Run `mkdir -p /opt/honeydue-obs/{data/vm,data/jaeger,data/grafana}` and add a `docker-compose.yml` defining the three services with `mem_limit: 256m`, bind mounts for persistence, and an isolated bridge network
|
||||
2. Add nginx vhosts (DNS A records first):
|
||||
- `grafana.88oakapps.com` → `127.0.0.1:3000` (basic auth via htpasswd, Let's Encrypt)
|
||||
- `obs.88oakapps.com` → routes by path:
|
||||
- `/api/v1/write` → `127.0.0.1:8428` (VictoriaMetrics remote-write, bearer-token check)
|
||||
- `/v1/traces` → `127.0.0.1:4318` (OTLP/HTTP traces, bearer-token check)
|
||||
3. Generate a 32-byte token, store in `/etc/honeydue-obs/token` (mode 0600), reference from nginx as `auth_request` or simple `if ($http_authorization != ...)`
|
||||
4. Pre-provision Grafana with the VM datasource pointing at `http://victoriametrics:8428` (in-network)
|
||||
|
||||
**On the honeyDue k3s cluster:**
|
||||
5. Add `prometheus/client_golang` to `honeyDueAPI-go/go.mod` and a `/metrics` endpoint to the Go API
|
||||
6. Register histograms:
|
||||
- `http_request_duration_seconds{route,method,status}` via Echo middleware
|
||||
- `gorm_query_duration_seconds{table,operation}` via a GORM `Plugin` callback (no ctx needed for this one — operates at the SQL string level)
|
||||
- `b2_upload_duration_seconds{bucket,result}`
|
||||
- `apns_send_duration_seconds{result}`
|
||||
7. Deploy a `vmagent` sidecar (or DaemonSet) in the `honeydue` namespace with:
|
||||
- Scrape: api Service `/metrics` every 15s
|
||||
- `remote_write.url`: `https://obs.88oakapps.com/api/v1/write`
|
||||
- `remote_write.bearer_token`: from k8s Secret
|
||||
8. Build the RED dashboard in Grafana: rate, errors, duration p50/p95/p99 per route
|
||||
|
||||
**ROI:** "Is the API healthy? Where is time being spent right now?" answered live, served from `grafana.88oakapps.com`.
|
||||
|
||||
### Step 2 — Tracing baseline
|
||||
|
||||
(Jaeger is already up from Step 1. This step adds the app-side wiring.)
|
||||
|
||||
1. Add Grafana datasource for Jaeger pointing at `http://jaeger:16686` (in-network)
|
||||
2. Wire OTel SDK in `cmd/api/main.go`:
|
||||
- `otel.SetTracerProvider(tracerProvider)`
|
||||
- `otelecho.Middleware("honeydue-api")` on Echo
|
||||
- OTLP/HTTP exporter pointing at `https://obs.88oakapps.com/v1/traces` with `Authorization: Bearer <token>` header (token from env)
|
||||
- Sampling: `TraceIDRatioBased(0.1)` in prod, `AlwaysSample()` in dev
|
||||
3. Verify: a single `POST /api/auth/login/` produces a trace in Jaeger
|
||||
|
||||
**ROI:** "Why is this one request slow?" — answered with a flame graph.
|
||||
|
||||
### Step 3 — Manual spans for the work that actually matters
|
||||
|
||||
Wrap each in `tracer.Start(ctx, ...)` with attributes:
|
||||
- `storage_service.Upload` → span "b2.PutObject" with `bucket`, `key`, `size_bytes`, result
|
||||
- `push/apns.go` → span "apns.send" with `device_token_hash`, `status_code`, `reason`
|
||||
- `asynq` middleware → span per task type with `task.type`, `retry_count`, `payload_size`
|
||||
|
||||
**ROI:** Specific high-value debugging questions ("why did this upload take 30 seconds", "why did these 5 push notifications fail") answered without code archaeology.
|
||||
|
||||
### Step 4 — Repository ctx + `otelgorm` (biggest diff, save for last)
|
||||
|
||||
1. Refactor every repository method to accept `ctx context.Context` as first arg
|
||||
2. Update every call site to pass `c.Request().Context()` from handlers / propagate through services
|
||||
3. Add `db.Use(otelgorm.NewPlugin())` in `internal/database/database.go`
|
||||
4. Verify: a request now has nested spans `http → service → query → query → b2.PutObject → apns.send` with full SQL on the query spans
|
||||
|
||||
**ROI:** Every DB query in every trace, with SQL + table + rows. The "find the N+1" tool you'd otherwise build by hand.
|
||||
|
||||
---
|
||||
|
||||
## Hard skips (revisit only when explicitly proven needed)
|
||||
|
||||
| Tool | Why skip |
|
||||
|---|---|
|
||||
| Loki / Promtail | Dozzle covers the immediate need. Loki adds ~512 MB RAM + a daemonset; defer until log search becomes a hot pain point. |
|
||||
| Mimir / VM cluster mode | Single-node VM handles honeyDue scale for years. |
|
||||
| Pyroscope continuous profiling | Overkill at 3 small nodes. Use `pprof` endpoints ad-hoc when CPU pressure shows up. |
|
||||
| OTel Collector | Only worth running when 3+ services emit telemetry. App → Jaeger direct is fine for now. |
|
||||
| Any SaaS vendor (Datadog, NR, Honeycomb, Grafana Cloud, Sentry Performance) | User constraint: nothing paid. |
|
||||
|
||||
---
|
||||
|
||||
## When to move off `88oakappsUpdate`
|
||||
|
||||
Triggers — any one is enough:
|
||||
- `88oakappsUpdate` available memory drops below ~3 GB sustained (PostHog growth squeezing it)
|
||||
- ClickHouse OOM events start showing up in `dmesg` (PostHog under load)
|
||||
- You want fully separate failure domains for honeyDue vs. 88oakapps
|
||||
|
||||
Migration path: the obs stack is a single docker-compose project on a bind-mount, so moving it = `rsync /opt/honeydue-obs/` to a new box, update DNS for `grafana.88oakapps.com` and `obs.88oakapps.com`, `docker compose up -d`. ~30 min of work. Until then: cohabiting on `88oakappsUpdate` is correct.
|
||||
|
||||
---
|
||||
|
||||
## Quick reference: what shows up where
|
||||
|
||||
| Question | Where to look |
|
||||
|---|---|
|
||||
| Is the API up right now? Latency? Errors? | Grafana RED dashboard |
|
||||
| Why is this specific request slow? | Jaeger trace view |
|
||||
| What did the slow part of that request actually do (which SQL, which B2 PUT)? | Span details inside the trace |
|
||||
| Background job throughput / queue depth | VictoriaMetrics + asynq metrics |
|
||||
| What did the app print to stdout 5 minutes ago? | Dozzle |
|
||||
| What error did the app log? | Dozzle (search) — or Loki if/when added |
|
||||
@@ -0,0 +1,146 @@
|
||||
# Runbook — Secret Rotation
|
||||
|
||||
Closes audit finding `K3S-F12` (secrets unrotated since cluster bootstrap,
|
||||
no rotation cadence). See `deploy-k3s/SECURITY.md` Stage 2.
|
||||
|
||||
**Cadence:** rotate every secret at least **annually**. Rotate
|
||||
**immediately** on suspected exposure, on loss of an operator device, or when
|
||||
anyone who has seen a secret leaves the project.
|
||||
|
||||
**Record keeping:** after each rotation, annotate the secret so the age is
|
||||
visible:
|
||||
|
||||
```bash
|
||||
kubectl -n honeydue annotate secret <name> \
|
||||
honeydue.dev/last-rotated="$(date -u +%Y-%m-%d)" --overwrite
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## How rotation works
|
||||
|
||||
Every secret has a **source of truth** on the operator workstation. The
|
||||
deploy scripts read those sources and (re)create the Kubernetes Secrets.
|
||||
Rotation is always: **update the source → re-run `02-setup-secrets.sh` →
|
||||
restart the pods that consume it → revoke the old credential at its
|
||||
provider.**
|
||||
|
||||
`02-setup-secrets.sh` uses `kubectl apply` (via `--dry-run=client -o yaml`),
|
||||
so re-running it is idempotent and only changes what you changed.
|
||||
|
||||
| Kubernetes Secret | Source of truth | Consumed by |
|
||||
|---|---|---|
|
||||
| `honeydue-secrets` → `POSTGRES_PASSWORD` | `deploy-k3s/secrets/postgres_password.txt` | api, worker |
|
||||
| `honeydue-secrets` → `SECRET_KEY` | `deploy-k3s/secrets/secret_key.txt` | api, worker |
|
||||
| `honeydue-secrets` → `EMAIL_HOST_PASSWORD` | `deploy-k3s/secrets/email_host_password.txt` | api, worker |
|
||||
| `honeydue-secrets` → `FCM_SERVER_KEY` | `deploy-k3s/secrets/fcm_server_key.txt` | api, worker |
|
||||
| `honeydue-secrets` → `REDIS_PASSWORD` | `config.yaml` key `redis.password` | api, worker, redis |
|
||||
| `honeydue-secrets` → `OBS_INGEST_TOKEN` | `deploy/prod.env` | api, worker |
|
||||
| `honeydue-apns-key` → `apns_auth_key.p8` | `deploy-k3s/secrets/apns_auth_key.p8` | api, worker |
|
||||
| `cloudflare-origin-cert` | `deploy-k3s/secrets/cloudflare-origin.{crt,key}` | Traefik ingress |
|
||||
| `ghcr-credentials` | `config.yaml` block `registry.*` | image pulls (all pods) |
|
||||
| `admin-basic-auth` | `config.yaml` keys `admin.basic_auth_user` / `..._password` | Traefik `admin-auth` middleware |
|
||||
|
||||
The `deploy-k3s/secrets/` directory and `config.yaml` are **gitignored** —
|
||||
never commit them.
|
||||
|
||||
---
|
||||
|
||||
## Standard rotation procedure
|
||||
|
||||
```bash
|
||||
cd honeyDueAPI-go
|
||||
export KUBECONFIG="$(pwd)/deploy-k3s/kubeconfig"
|
||||
|
||||
# 1. Update the source (file under deploy-k3s/secrets/ or a config.yaml key)
|
||||
# 2. Recreate the Kubernetes Secrets from sources
|
||||
./deploy-k3s/scripts/02-setup-secrets.sh
|
||||
|
||||
# 3. Restart the consumers (see per-secret notes below for which)
|
||||
kubectl -n honeydue rollout restart deploy/api deploy/worker
|
||||
|
||||
# 4. Confirm health
|
||||
kubectl -n honeydue rollout status deploy/api
|
||||
kubectl -n honeydue rollout status deploy/worker
|
||||
|
||||
# 5. Revoke the OLD credential at its provider (see per-secret notes)
|
||||
# 6. Annotate the rotated secret with today's date
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Per-secret notes
|
||||
|
||||
### `POSTGRES_PASSWORD`
|
||||
1. Rotate the role password in the Neon dashboard.
|
||||
2. Write the new value to `deploy-k3s/secrets/postgres_password.txt`.
|
||||
3. `02-setup-secrets.sh`, then `rollout restart deploy/api deploy/worker`.
|
||||
4. Watch logs for connection errors; the old password stops working the
|
||||
moment Neon applies the change, so do steps 2–3 promptly.
|
||||
|
||||
### `SECRET_KEY` ⚠️ user-visible
|
||||
This signs auth tokens. **Rotating it logs every user out** — all existing
|
||||
tokens become invalid and every client must re-authenticate.
|
||||
1. Generate: `openssl rand -hex 32`.
|
||||
2. Write to `deploy-k3s/secrets/secret_key.txt` (must be ≥32 chars — the
|
||||
script enforces this; the app refuses to start in production without it).
|
||||
3. `02-setup-secrets.sh`, then `rollout restart deploy/api deploy/worker`.
|
||||
- Only rotate on a schedule or on suspected compromise — not casually.
|
||||
- A future improvement (overlap window via a key-id header) would let old
|
||||
tokens validate during the transition; not implemented today.
|
||||
|
||||
### `EMAIL_HOST_PASSWORD`
|
||||
1. Generate a new app password in Fastmail; keep the old one alive briefly.
|
||||
2. Write to `deploy-k3s/secrets/email_host_password.txt`.
|
||||
3. `02-setup-secrets.sh`, `rollout restart deploy/api deploy/worker`.
|
||||
4. Delete the old Fastmail app password.
|
||||
|
||||
### `FCM_SERVER_KEY`
|
||||
1. Rotate the key in the Firebase console.
|
||||
2. Write to `deploy-k3s/secrets/fcm_server_key.txt`.
|
||||
3. `02-setup-secrets.sh`, `rollout restart deploy/api deploy/worker`.
|
||||
|
||||
### `REDIS_PASSWORD`
|
||||
Source is `config.yaml` key `redis.password` (hex only — it is embedded in
|
||||
the `REDIS_URL`, so URL-reserved characters such as `@`, `:` or `/` would break URL parsing).
|
||||
1. Generate: `openssl rand -hex 32`.
|
||||
2. Set `redis.password` in `config.yaml`.
|
||||
3. `02-setup-secrets.sh`.
|
||||
4. Restart **redis as well as** api/worker so the new `--requirepass` and
|
||||
the new `REDIS_URL` land together:
|
||||
`kubectl -n honeydue rollout restart deploy/redis deploy/api deploy/worker`.
|
||||
Expect a few seconds where api/worker reconnect.
|
||||
|
||||
### `apns_auth_key.p8`
|
||||
1. Generate a new `.p8` in the Apple Developer console. Revoke the old key only after step 3 has rolled out, so push delivery is not interrupted.
|
||||
2. Replace `deploy-k3s/secrets/apns_auth_key.p8`.
|
||||
3. `02-setup-secrets.sh`, `rollout restart deploy/api deploy/worker`.
|
||||
4. If the Key ID changed, update `push.apns_key_id` in `config.yaml` too.
|
||||
|
||||
### `cloudflare-origin-cert`
|
||||
1. Generate a new Origin CA certificate in the Cloudflare dashboard.
|
||||
2. Replace `deploy-k3s/secrets/cloudflare-origin.crt` and `.key`.
|
||||
3. `02-setup-secrets.sh`. Traefik picks up the new TLS secret; no app
|
||||
restart needed. Verify the served cert with `openssl s_client`.
|
||||
|
||||
### `ghcr-credentials` (Gitea registry)
|
||||
1. Generate a new PAT in Gitea (scope: `read:packages`).
|
||||
2. Update the `registry.token` value in `config.yaml`.
|
||||
3. `02-setup-secrets.sh`. No restart needed unless a pull is pending.
|
||||
4. Revoke the old PAT in Gitea.
|
||||
|
||||
### `admin-basic-auth`
|
||||
Source is `config.yaml` keys `admin.basic_auth_user` / `admin.basic_auth_password`.
|
||||
1. Set a new password (e.g. `openssl rand -hex 24`).
|
||||
2. `02-setup-secrets.sh` regenerates the bcrypt htpasswd secret.
|
||||
3. No app restart needed — Traefik reloads the `admin-auth` middleware.
|
||||
4. Distribute the new credential to whoever uses the admin panel.
|
||||
|
||||
---
|
||||
|
||||
## After any rotation
|
||||
|
||||
- Run `./deploy-k3s/scripts/04-verify.sh` and confirm no `✗` lines.
|
||||
- Annotate the rotated secret (see "Record keeping" above).
|
||||
- If the rotation was due to a compromise, also follow the relevant
|
||||
playbook in `deploy-k3s/SECURITY.md` → Appendix (Incident response).
|
||||
@@ -1,6 +1,6 @@
|
||||
module github.com/treytartt/honeydue-api
|
||||
|
||||
go 1.25
|
||||
go 1.25.0
|
||||
|
||||
require (
|
||||
github.com/go-pdf/fpdf v0.9.0
|
||||
@@ -9,9 +9,10 @@ require (
|
||||
github.com/google/uuid v1.6.0
|
||||
github.com/gorilla/websocket v1.5.3
|
||||
github.com/hibiken/asynq v0.25.1
|
||||
github.com/labstack/echo/v4 v4.11.4
|
||||
github.com/labstack/echo/v4 v4.15.1
|
||||
github.com/minio/minio-go/v7 v7.0.99
|
||||
github.com/nicksnyder/go-i18n/v2 v2.6.0
|
||||
github.com/prometheus/client_golang v1.23.2
|
||||
github.com/redis/go-redis/v9 v9.17.1
|
||||
github.com/rs/zerolog v1.34.0
|
||||
github.com/shirou/gopsutil/v3 v3.24.5
|
||||
@@ -20,11 +21,17 @@ require (
|
||||
github.com/spf13/viper v1.20.1
|
||||
github.com/stretchr/testify v1.11.1
|
||||
github.com/stripe/stripe-go/v81 v81.4.0
|
||||
github.com/uptrace/opentelemetry-go-extra/otelgorm v0.3.2
|
||||
github.com/wneessen/go-mail v0.7.2
|
||||
golang.org/x/crypto v0.46.0
|
||||
golang.org/x/oauth2 v0.34.0
|
||||
golang.org/x/text v0.32.0
|
||||
golang.org/x/time v0.14.0
|
||||
go.opentelemetry.io/contrib/instrumentation/github.com/labstack/echo/otelecho v0.68.0
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0
|
||||
go.opentelemetry.io/otel/sdk v1.43.0
|
||||
golang.org/x/crypto v0.51.0
|
||||
golang.org/x/oauth2 v0.35.0
|
||||
golang.org/x/term v0.43.0
|
||||
golang.org/x/text v0.37.0
|
||||
golang.org/x/time v0.15.0
|
||||
google.golang.org/api v0.257.0
|
||||
gopkg.in/yaml.v3 v3.0.1
|
||||
gorm.io/driver/postgres v1.6.0
|
||||
@@ -33,17 +40,28 @@ require (
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/beorn7/perks v1.0.1 // indirect
|
||||
github.com/cenkalti/backoff/v5 v5.0.3 // indirect
|
||||
github.com/dustin/go-humanize v1.0.1 // indirect
|
||||
github.com/go-ini/ini v1.67.0 // indirect
|
||||
github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 // indirect
|
||||
github.com/klauspost/compress v1.18.2 // indirect
|
||||
github.com/klauspost/cpuid/v2 v2.2.11 // indirect
|
||||
github.com/klauspost/crc32 v1.3.0 // indirect
|
||||
github.com/minio/crc64nvme v1.1.1 // indirect
|
||||
github.com/minio/md5-simd v1.1.2 // indirect
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
|
||||
github.com/philhofer/fwd v1.2.0 // indirect
|
||||
github.com/prometheus/client_model v0.6.2 // indirect
|
||||
github.com/prometheus/common v0.66.1 // indirect
|
||||
github.com/prometheus/procfs v0.16.1 // indirect
|
||||
github.com/rs/xid v1.6.0 // indirect
|
||||
github.com/tinylib/msgp v1.6.1 // indirect
|
||||
github.com/uptrace/opentelemetry-go-extra/otelsql v0.3.2 // indirect
|
||||
go.opentelemetry.io/proto/otlp v1.10.0 // indirect
|
||||
go.yaml.in/yaml/v2 v2.4.2 // indirect
|
||||
go.yaml.in/yaml/v3 v3.0.4 // indirect
|
||||
google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 // indirect
|
||||
)
|
||||
|
||||
require (
|
||||
@@ -51,7 +69,7 @@ require (
|
||||
cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect
|
||||
cloud.google.com/go/compute/metadata v0.9.0 // indirect
|
||||
github.com/cespare/xxhash/v2 v2.3.0 // indirect
|
||||
github.com/davecgh/go-spew v1.1.1 // indirect
|
||||
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
|
||||
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect
|
||||
github.com/felixge/httpsnoop v1.0.4 // indirect
|
||||
github.com/fsnotify/fsnotify v1.9.0 // indirect
|
||||
@@ -62,7 +80,6 @@ require (
|
||||
github.com/go-playground/locales v0.14.1 // indirect
|
||||
github.com/go-playground/universal-translator v0.18.1 // indirect
|
||||
github.com/go-viper/mapstructure/v2 v2.4.0 // indirect
|
||||
github.com/golang-jwt/jwt v3.2.2+incompatible // indirect; TODO(S-19): Pulled by echo/v4 middleware — upgrade Echo to v4.12+ which removes built-in JWT middleware (uses echo-jwt/v4 with jwt/v5 instead), eliminating this vulnerable transitive dep
|
||||
github.com/golang-jwt/jwt/v4 v4.5.2 // indirect
|
||||
github.com/google/s2a-go v0.1.9 // indirect
|
||||
github.com/googleapis/enterprise-certificate-proxy v0.3.7 // indirect
|
||||
@@ -76,11 +93,11 @@ require (
|
||||
github.com/labstack/gommon v0.4.2 // indirect
|
||||
github.com/leodido/go-urn v1.4.0 // indirect
|
||||
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect
|
||||
github.com/mattn/go-colorable v0.1.13 // indirect
|
||||
github.com/mattn/go-colorable v0.1.14 // indirect
|
||||
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||
github.com/mattn/go-sqlite3 v2.0.3+incompatible // indirect
|
||||
github.com/pelletier/go-toml/v2 v2.2.4 // indirect
|
||||
github.com/pmezard/go-difflib v1.0.0 // indirect
|
||||
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
|
||||
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect
|
||||
github.com/robfig/cron/v3 v3.0.1 // indirect
|
||||
github.com/sagikazarmark/locafero v0.9.0 // indirect
|
||||
@@ -97,13 +114,13 @@ require (
|
||||
github.com/yusufpapurcu/wmi v1.2.4 // indirect
|
||||
go.opentelemetry.io/auto/sdk v1.2.1 // indirect
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 // indirect
|
||||
go.opentelemetry.io/otel v1.38.0 // indirect
|
||||
go.opentelemetry.io/otel/metric v1.38.0 // indirect
|
||||
go.opentelemetry.io/otel/trace v1.38.0 // indirect
|
||||
golang.org/x/net v0.48.0 // indirect
|
||||
golang.org/x/sync v0.19.0 // indirect
|
||||
golang.org/x/sys v0.39.0 // indirect
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20251124214823-79d6a2a48846 // indirect
|
||||
google.golang.org/grpc v1.77.0 // indirect
|
||||
google.golang.org/protobuf v1.36.10 // indirect
|
||||
go.opentelemetry.io/otel v1.43.0
|
||||
go.opentelemetry.io/otel/metric v1.43.0 // indirect
|
||||
go.opentelemetry.io/otel/trace v1.43.0
|
||||
golang.org/x/net v0.53.0 // indirect
|
||||
golang.org/x/sync v0.20.0
|
||||
golang.org/x/sys v0.44.0 // indirect
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 // indirect
|
||||
google.golang.org/grpc v1.80.0 // indirect
|
||||
google.golang.org/protobuf v1.36.11 // indirect
|
||||
)
|
||||
|
||||
@@ -8,16 +8,20 @@ github.com/BurntSushi/toml v1.5.0 h1:W5quZX/G/csjUnuI8SUYlsHs9M38FC7znL0lIO+DvMg
|
||||
github.com/BurntSushi/toml v1.5.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho=
|
||||
github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
|
||||
github.com/alecthomas/units v0.0.0-20201120081800-1786d5ef83d4/go.mod h1:OMCwj8VM1Kc9e19TLln2VL61YJF0x1XFtfdL4JdbSyE=
|
||||
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
|
||||
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
|
||||
github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs=
|
||||
github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c=
|
||||
github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA=
|
||||
github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0=
|
||||
github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM=
|
||||
github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw=
|
||||
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
|
||||
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
|
||||
github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
|
||||
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78=
|
||||
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc=
|
||||
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
|
||||
@@ -52,8 +56,6 @@ github.com/go-playground/validator/v10 v10.23.0/go.mod h1:dbuPbCMFw/DrkbEynArYaC
|
||||
github.com/go-viper/mapstructure/v2 v2.4.0 h1:EBsztssimR/CONLSZZ04E8qAkxNYq4Qp9LvH92wZUgs=
|
||||
github.com/go-viper/mapstructure/v2 v2.4.0/go.mod h1:oJDH3BJKyqBA2TXFhDsKDGDTlndYOZ6rGS0BRZIxGhM=
|
||||
github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
|
||||
github.com/golang-jwt/jwt v3.2.2+incompatible h1:IfV12K8xAKAnZqdXVzCZ+TOjboZ2keLg81eXfW3O+oY=
|
||||
github.com/golang-jwt/jwt v3.2.2+incompatible/go.mod h1:8pz2t5EyA70fFQQSrl6XZXzqecmYZeUEB8OUGHkxJ+I=
|
||||
github.com/golang-jwt/jwt/v4 v4.4.1/go.mod h1:m21LjoU+eqJr34lmDMbreY2eSTRJ1cv77w39/MY0Ch0=
|
||||
github.com/golang-jwt/jwt/v4 v4.5.2 h1:YtQM7lnr8iZ+j5q71MGKkNw9Mn7AjHM68uc9g5fXeUI=
|
||||
github.com/golang-jwt/jwt/v4 v4.5.2/go.mod h1:m21LjoU+eqJr34lmDMbreY2eSTRJ1cv77w39/MY0Ch0=
|
||||
@@ -74,6 +76,8 @@ github.com/googleapis/gax-go/v2 v2.15.0 h1:SyjDc1mGgZU5LncH8gimWo9lW1DtIfPibOG81
|
||||
github.com/googleapis/gax-go/v2 v2.15.0/go.mod h1:zVVkkxAQHa1RQpg9z2AUCMnKhi0Qld9rcmyfL1OZhoc=
|
||||
github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg=
|
||||
github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
|
||||
github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 h1:HWRh5R2+9EifMyIHV7ZV+MIZqgz+PMpZ14Jynv3O2Zs=
|
||||
github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0/go.mod h1:JfhWUomR1baixubs02l85lZYYOm7LV6om4ceouMv45c=
|
||||
github.com/hibiken/asynq v0.25.1 h1:phj028N0nm15n8O2ims+IvJ2gz4k2auvermngh9JhTw=
|
||||
github.com/hibiken/asynq v0.25.1/go.mod h1:pazWNOLBu0FEynQRBvHA26qdIKRSmfdIfUm4HdsLmXg=
|
||||
github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM=
|
||||
@@ -99,16 +103,19 @@ github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
|
||||
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
|
||||
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
|
||||
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
|
||||
github.com/labstack/echo/v4 v4.11.4 h1:vDZmA+qNeh1pd/cCkEicDMrjtrnMGQ1QFI9gWN1zGq8=
|
||||
github.com/labstack/echo/v4 v4.11.4/go.mod h1:noh7EvLwqDsmh/X/HWKPUl1AjzJrhyptRyEbQJfxen8=
|
||||
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
|
||||
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
|
||||
github.com/labstack/echo/v4 v4.15.1 h1:S9keusg26gZpjMmPqB5hOEvNKnmd1lNmcHrbbH2lnFs=
|
||||
github.com/labstack/echo/v4 v4.15.1/go.mod h1:xmw1clThob0BSVRX1CRQkGQ/vjwcpOMjQZSZa9fKA/c=
|
||||
github.com/labstack/gommon v0.4.2 h1:F8qTUNXgG1+6WQmqoUWnz8WiEU60mXVVw0P4ht1WRA0=
|
||||
github.com/labstack/gommon v0.4.2/go.mod h1:QlUFxVM+SNXhDL/Z7YhocGIBYOiwB0mXm1+1bAPHPyU=
|
||||
github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ=
|
||||
github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI=
|
||||
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4=
|
||||
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I=
|
||||
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
|
||||
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
|
||||
github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE=
|
||||
github.com/mattn/go-colorable v0.1.14/go.mod h1:6LmQG8QLFO4G5z1gPvYEzlUgJ2wF+stgPZH1UqBm1s8=
|
||||
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
|
||||
github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||
@@ -121,6 +128,8 @@ github.com/minio/md5-simd v1.1.2 h1:Gdi1DZK69+ZVMoNHRXJyNcxrMA4dSxoYHZSQbirFg34=
|
||||
github.com/minio/md5-simd v1.1.2/go.mod h1:MzdKDxYpY2BT9XQFocsiZf/NKVtR7nkE4RoEpN+20RM=
|
||||
github.com/minio/minio-go/v7 v7.0.99 h1:2vH/byrwUkIpFQFOilvTfaUpvAX3fEFhEzO+DR3DlCE=
|
||||
github.com/minio/minio-go/v7 v7.0.99/go.mod h1:EtGNKtlX20iL2yaYnxEigaIvj0G0GwSDnifnG8ClIdw=
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
|
||||
github.com/nicksnyder/go-i18n/v2 v2.6.0 h1:C/m2NNWNiTB6SK4Ao8df5EWm3JETSTIGNXBpMJTxzxQ=
|
||||
github.com/nicksnyder/go-i18n/v2 v2.6.0/go.mod h1:88sRqr0C6OPyJn0/KRNaEz1uWorjxIKP7rUUcvycecE=
|
||||
github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4=
|
||||
@@ -128,10 +137,19 @@ github.com/pelletier/go-toml/v2 v2.2.4/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8
|
||||
github.com/philhofer/fwd v1.2.0 h1:e6DnBTl7vGY+Gz322/ASL4Gyp1FspeMvx1RNDoToZuM=
|
||||
github.com/philhofer/fwd v1.2.0/go.mod h1:RqIHx9QI14HlwKwm98g9Re5prTQ6LdeRQn+gXJFxsJM=
|
||||
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
|
||||
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c h1:ncq/mPwQF4JjgDlrVEn3C11VoGHZN7m8qihwgMEtzYw=
|
||||
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE=
|
||||
github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o=
|
||||
github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg=
|
||||
github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
|
||||
github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
|
||||
github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs=
|
||||
github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA=
|
||||
github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg=
|
||||
github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is=
|
||||
github.com/redis/go-redis/v9 v9.17.1 h1:7tl732FjYPRT9H9aNfyTwKg9iTETjWjGKEJ2t/5iWTs=
|
||||
github.com/redis/go-redis/v9 v9.17.1/go.mod h1:u410H11HMLoB+TP67dz8rL9s6QW2j76l0//kSOd3370=
|
||||
github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs=
|
||||
@@ -180,6 +198,10 @@ github.com/tklauser/go-sysconf v0.3.12 h1:0QaGUFOdQaIVdPgfITYzaTegZvdCjmYO52cSFA
|
||||
github.com/tklauser/go-sysconf v0.3.12/go.mod h1:Ho14jnntGE1fpdOqQEEaiKRpvIavV0hSfmBq8nJbHYI=
|
||||
github.com/tklauser/numcpus v0.6.1 h1:ng9scYS7az0Bk4OZLvrNXNSAO2Pxr1XXRAPyjhIx+Fk=
|
||||
github.com/tklauser/numcpus v0.6.1/go.mod h1:1XfjsgE2zo8GVw7POkMbHENHzVg3GzmoZ9fESEdAacY=
|
||||
github.com/uptrace/opentelemetry-go-extra/otelgorm v0.3.2 h1:Jjn3zoRz13f8b1bR6LrXWglx93Sbh4kYfwgmPju3E2k=
|
||||
github.com/uptrace/opentelemetry-go-extra/otelgorm v0.3.2/go.mod h1:wocb5pNrj/sjhWB9J5jctnC0K2eisSdz/nJJBNFHo+A=
|
||||
github.com/uptrace/opentelemetry-go-extra/otelsql v0.3.2 h1:ZjUj9BLYf9PEqBn8W/OapxhPjVRdC6CsXTdULHsyk5c=
|
||||
github.com/uptrace/opentelemetry-go-extra/otelsql v0.3.2/go.mod h1:O8bHQfyinKwTXKkiKNGmLQS7vRsqRxIQTFZpYpHK3IQ=
|
||||
github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
|
||||
github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
|
||||
github.com/valyala/fasttemplate v1.2.2 h1:lxLXG0uE3Qnshl9QyaK6XJxMXlQZELvChBOCmQD0Loo=
|
||||
@@ -190,33 +212,45 @@ github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo
|
||||
github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0=
|
||||
go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64=
|
||||
go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y=
|
||||
go.opentelemetry.io/contrib/instrumentation/github.com/labstack/echo/otelecho v0.68.0 h1:7N94HrYgVc2tng6xEjmbycupxteYLll7lPlEi/UK5ok=
|
||||
go.opentelemetry.io/contrib/instrumentation/github.com/labstack/echo/otelecho v0.68.0/go.mod h1:1i+7wBOfx0kn7PSGRKZ8e7zIhs+AmvLCiCloySDUeck=
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 h1:F7Jx+6hwnZ41NSFTO5q4LYDtJRXBf2PD0rNBkeB/lus=
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0/go.mod h1:UHB22Z8QsdRDrnAtX4PntOl36ajSxcdUMt1sF7Y6E7Q=
|
||||
go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8=
|
||||
go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM=
|
||||
go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA=
|
||||
go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI=
|
||||
go.opentelemetry.io/otel/sdk v1.38.0 h1:l48sr5YbNf2hpCUj/FoGhW9yDkl+Ma+LrVl8qaM5b+E=
|
||||
go.opentelemetry.io/otel/sdk v1.38.0/go.mod h1:ghmNdGlVemJI3+ZB5iDEuk4bWA3GkTpW+DOoZMYBVVg=
|
||||
go.opentelemetry.io/otel/sdk/metric v1.38.0 h1:aSH66iL0aZqo//xXzQLYozmWrXxyFkBJ6qT5wthqPoM=
|
||||
go.opentelemetry.io/otel/sdk/metric v1.38.0/go.mod h1:dg9PBnW9XdQ1Hd6ZnRz689CbtrUp0wMMs9iPcgT9EZA=
|
||||
go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE=
|
||||
go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs=
|
||||
go.opentelemetry.io/contrib/propagators/b3 v1.43.0 h1:CETqV3QLLPTy5yNrqyMr41VnAOOD4lsRved7n4QG00A=
|
||||
go.opentelemetry.io/contrib/propagators/b3 v1.43.0/go.mod h1:Q4mCiCdziYzpNR0g+6UqVotAlCDZdzz6L8jwY4knOrw=
|
||||
go.opentelemetry.io/otel v1.43.0 h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I=
|
||||
go.opentelemetry.io/otel v1.43.0/go.mod h1:JuG+u74mvjvcm8vj8pI5XiHy1zDeoCS2LB1spIq7Ay0=
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 h1:88Y4s2C8oTui1LGM6bTWkw0ICGcOLCAI5l6zsD1j20k=
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0/go.mod h1:Vl1/iaggsuRlrHf/hfPJPvVag77kKyvrLeD10kpMl+A=
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0 h1:3iZJKlCZufyRzPzlQhUIWVmfltrXuGyfjREgGP3UUjc=
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0/go.mod h1:/G+nUPfhq2e+qiXMGxMwumDrP5jtzU+mWN7/sjT2rak=
|
||||
go.opentelemetry.io/otel/metric v1.43.0 h1:d7638QeInOnuwOONPp4JAOGfbCEpYb+K6DVWvdxGzgM=
|
||||
go.opentelemetry.io/otel/metric v1.43.0/go.mod h1:RDnPtIxvqlgO8GRW18W6Z/4P462ldprJtfxHxyKd2PY=
|
||||
go.opentelemetry.io/otel/sdk v1.43.0 h1:pi5mE86i5rTeLXqoF/hhiBtUNcrAGHLKQdhg4h4V9Dg=
|
||||
go.opentelemetry.io/otel/sdk v1.43.0/go.mod h1:P+IkVU3iWukmiit/Yf9AWvpyRDlUeBaRg6Y+C58QHzg=
|
||||
go.opentelemetry.io/otel/sdk/metric v1.43.0 h1:S88dyqXjJkuBNLeMcVPRFXpRw2fuwdvfCGLEo89fDkw=
|
||||
go.opentelemetry.io/otel/sdk/metric v1.43.0/go.mod h1:C/RJtwSEJ5hzTiUz5pXF1kILHStzb9zFlIEe85bhj6A=
|
||||
go.opentelemetry.io/otel/trace v1.43.0 h1:BkNrHpup+4k4w+ZZ86CZoHHEkohws8AY+WTX09nk+3A=
|
||||
go.opentelemetry.io/otel/trace v1.43.0/go.mod h1:/QJhyVBUUswCphDVxq+8mld+AvhXZLhe+8WVFxiFff0=
|
||||
go.opentelemetry.io/proto/otlp v1.10.0 h1:IQRWgT5srOCYfiWnpqUYz9CVmbO8bFmKcwYxpuCSL2g=
|
||||
go.opentelemetry.io/proto/otlp v1.10.0/go.mod h1:/CV4QoCR/S9yaPj8utp3lvQPoqMtxXdzn7ozvvozVqk=
|
||||
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
|
||||
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
|
||||
go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI=
|
||||
go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU=
|
||||
go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=
|
||||
go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg=
|
||||
golang.org/x/crypto v0.0.0-20170512130425-ab89591268e0/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
|
||||
golang.org/x/crypto v0.46.0 h1:cKRW/pmt1pKAfetfu+RCEvjvZkA9RimPbh7bhFjGVBU=
|
||||
golang.org/x/crypto v0.46.0/go.mod h1:Evb/oLKmMraqjZ2iQTwDwvCtJkczlDuTmdJXoZVzqU0=
|
||||
golang.org/x/crypto v0.51.0 h1:IBPXwPfKxY7cWQZ38ZCIRPI50YLeevDLlLnyC5wRGTI=
|
||||
golang.org/x/crypto v0.51.0/go.mod h1:8AdwkbraGNABw2kOX6YFPs3WM22XqI4EXEd8g+x7Oc8=
|
||||
golang.org/x/net v0.0.0-20210520170846-37e1c6afe023/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
|
||||
golang.org/x/net v0.0.0-20220403103023-749bd193bc2b/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk=
|
||||
golang.org/x/net v0.48.0 h1:zyQRTTrjc33Lhh0fBgT/H3oZq9WuvRR5gPC70xpDiQU=
|
||||
golang.org/x/net v0.48.0/go.mod h1:+ndRgGjkh8FGtu1w1FGbEC31if4VrNVMuKTgcAAnQRY=
|
||||
golang.org/x/oauth2 v0.34.0 h1:hqK/t4AKgbqWkdkcAeI8XLmbK+4m4G5YeQRrmiotGlw=
|
||||
golang.org/x/oauth2 v0.34.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
|
||||
golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
|
||||
golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
|
||||
golang.org/x/net v0.53.0 h1:d+qAbo5L0orcWAr0a9JweQpjXF19LMXJE8Ey7hwOdUA=
|
||||
golang.org/x/net v0.53.0/go.mod h1:JvMuJH7rrdiCfbeHoo3fCQU24Lf5JJwT9W3sJFulfgs=
|
||||
golang.org/x/oauth2 v0.35.0 h1:Mv2mzuHuZuY2+bkyWXIHMfhNdJAdwW3FuWeCPYN5GVQ=
|
||||
golang.org/x/oauth2 v0.35.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
|
||||
golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
|
||||
golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
|
||||
golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
@@ -228,32 +262,34 @@ golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.39.0 h1:CvCKL8MeisomCi6qNZ+wbb0DN9E5AATixKsvNtMoMFk=
|
||||
golang.org/x/sys v0.39.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
|
||||
golang.org/x/sys v0.44.0 h1:ildZl3J4uzeKP07r2F++Op7E9B29JRUy+a27EibtBTQ=
|
||||
golang.org/x/sys v0.44.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
||||
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
|
||||
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
|
||||
golang.org/x/term v0.43.0 h1:S4RLU2sB31O/NCl+zFN9Aru9A/Cq2aqKpTZJ6B+DwT4=
|
||||
golang.org/x/term v0.43.0/go.mod h1:lrhlHNdQJHO+1qVYiHfFKVuVioJIheAc3fBSMFYEIsk=
|
||||
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
||||
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
|
||||
golang.org/x/text v0.32.0 h1:ZD01bjUt1FQ9WJ0ClOL5vxgxOI/sVCNgX1YtKwcY0mU=
|
||||
golang.org/x/text v0.32.0/go.mod h1:o/rUWzghvpD5TXrTIBuJU77MTaN0ljMWE47kxGJQ7jY=
|
||||
golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI=
|
||||
golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4=
|
||||
golang.org/x/text v0.37.0 h1:Cqjiwd9eSg8e0QAkyCaQTNHFIIzWtidPahFWR83rTrc=
|
||||
golang.org/x/text v0.37.0/go.mod h1:a5sjxXGs9hsn/AJVwuElvCAo9v8QYLzvavO5z2PiM38=
|
||||
golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U=
|
||||
golang.org/x/time v0.15.0/go.mod h1:Y4YMaQmXwGQZoFaVFk4YpCt4FLQMYKZe9oeV/f4MSno=
|
||||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk=
|
||||
gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E=
|
||||
gonum.org/v1/gonum v0.17.0 h1:VbpOemQlsSMrYmn7T2OUvQ4dqxQXU+ouZFQsZOx50z4=
|
||||
gonum.org/v1/gonum v0.17.0/go.mod h1:El3tOrEuMpv2UdMrbNlKEh9vd86bmQ6vqIcDwxEOc1E=
|
||||
google.golang.org/api v0.257.0 h1:8Y0lzvHlZps53PEaw+G29SsQIkuKrumGWs9puiexNAA=
|
||||
google.golang.org/api v0.257.0/go.mod h1:4eJrr+vbVaZSqs7vovFd1Jb/A6ml6iw2e6FBYf3GAO4=
|
||||
google.golang.org/genproto v0.0.0-20250603155806-513f23925822 h1:rHWScKit0gvAPuOnu87KpaYtjK5zBMLcULh7gxkCXu4=
|
||||
google.golang.org/genproto v0.0.0-20250603155806-513f23925822/go.mod h1:HubltRL7rMh0LfnQPkMH4NPDFEWp0jw3vixw7jEM53s=
|
||||
google.golang.org/genproto/googleapis/api v0.0.0-20251022142026-3a174f9686a8 h1:mepRgnBZa07I4TRuomDE4sTIYieg/osKmzIf4USdWS4=
|
||||
google.golang.org/genproto/googleapis/api v0.0.0-20251022142026-3a174f9686a8/go.mod h1:fDMmzKV90WSg1NbozdqrE64fkuTv6mlq2zxo9ad+3yo=
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20251124214823-79d6a2a48846 h1:Wgl1rcDNThT+Zn47YyCXOXyX/COgMTIdhJ717F0l4xk=
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20251124214823-79d6a2a48846/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk=
|
||||
google.golang.org/grpc v1.77.0 h1:wVVY6/8cGA6vvffn+wWK5ToddbgdU3d8MNENr4evgXM=
|
||||
google.golang.org/grpc v1.77.0/go.mod h1:z0BY1iVj0q8E1uSQCjL9cppRj+gnZjzDnzV0dHhrNig=
|
||||
google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE=
|
||||
google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
|
||||
google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 h1:VPWxll4HlMw1Vs/qXtN7BvhZqsS9cdAittCNvVENElA=
|
||||
google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:7QBABkRtR8z+TEnmXTqIqwJLlzrZKVfAUm7tY3yGv0M=
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 h1:m8qni9SQFH0tJc1X0vmnpw/0t+AImlSvp30sEupozUg=
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8=
|
||||
google.golang.org/grpc v1.80.0 h1:Xr6m2WmWZLETvUNvIUmeD5OAagMw3FiKmMlTdViWsHM=
|
||||
google.golang.org/grpc v1.80.0/go.mod h1:ho/dLnxwi3EDJA4Zghp7k2Ec1+c2jqup0bFkw07bwF4=
|
||||
google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
|
||||
google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
|
||||
gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
|
||||
|
||||
@@ -1,215 +1,30 @@
|
||||
// apple_social_auth_handler is a stub — the user_applesocialauth table was
|
||||
// dropped in the Ory Kratos migration (phase 2). Social sign-in is now
|
||||
// handled by Kratos.
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"strconv"
|
||||
|
||||
"github.com/labstack/echo/v4"
|
||||
"gorm.io/gorm"
|
||||
|
||||
"github.com/treytartt/honeydue-api/internal/admin/dto"
|
||||
"github.com/treytartt/honeydue-api/internal/models"
|
||||
)
|
||||
|
||||
// AdminAppleSocialAuthHandler handles admin Apple social auth management endpoints
|
||||
// AdminAppleSocialAuthHandler is a no-op stub.
|
||||
type AdminAppleSocialAuthHandler struct {
|
||||
db *gorm.DB
|
||||
}
|
||||
|
||||
// NewAdminAppleSocialAuthHandler creates a new admin Apple social auth handler
|
||||
func NewAdminAppleSocialAuthHandler(db *gorm.DB) *AdminAppleSocialAuthHandler {
|
||||
return &AdminAppleSocialAuthHandler{db: db}
|
||||
}
|
||||
|
||||
// AppleSocialAuthResponse represents the response for an Apple social auth entry
|
||||
type AppleSocialAuthResponse struct {
|
||||
ID uint `json:"id"`
|
||||
UserID uint `json:"user_id"`
|
||||
Username string `json:"username"`
|
||||
UserEmail string `json:"user_email"`
|
||||
AppleID string `json:"apple_id"`
|
||||
Email string `json:"email"`
|
||||
IsPrivateEmail bool `json:"is_private_email"`
|
||||
CreatedAt string `json:"created_at"`
|
||||
UpdatedAt string `json:"updated_at"`
|
||||
}
|
||||
|
||||
// UpdateAppleSocialAuthRequest represents the request to update an Apple social auth entry
|
||||
type UpdateAppleSocialAuthRequest struct {
|
||||
Email *string `json:"email"`
|
||||
IsPrivateEmail *bool `json:"is_private_email"`
|
||||
}
|
||||
|
||||
// List handles GET /api/admin/apple-social-auth
|
||||
func (h *AdminAppleSocialAuthHandler) List(c echo.Context) error {
|
||||
var filters dto.PaginationParams
|
||||
if err := c.Bind(&filters); err != nil {
|
||||
return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid request body"})
|
||||
}
|
||||
|
||||
var entries []models.AppleSocialAuth
|
||||
var total int64
|
||||
|
||||
query := h.db.Model(&models.AppleSocialAuth{}).Preload("User")
|
||||
|
||||
// Apply search
|
||||
if filters.Search != "" {
|
||||
search := "%" + filters.Search + "%"
|
||||
query = query.Joins("JOIN auth_user ON auth_user.id = user_applesocialauth.user_id").
|
||||
Where("user_applesocialauth.apple_id ILIKE ? OR user_applesocialauth.email ILIKE ? OR auth_user.username ILIKE ? OR auth_user.email ILIKE ?",
|
||||
search, search, search, search)
|
||||
}
|
||||
|
||||
// Get total count
|
||||
query.Count(&total)
|
||||
|
||||
// Apply sorting (allowlist prevents SQL injection via sort_by parameter)
|
||||
sortBy := filters.GetSafeSortBy([]string{
|
||||
"id", "user_id", "apple_id", "email", "is_private_email",
|
||||
"created_at", "updated_at",
|
||||
}, "created_at")
|
||||
query = query.Order(sortBy + " " + filters.GetSortDir())
|
||||
|
||||
// Apply pagination
|
||||
query = query.Offset(filters.GetOffset()).Limit(filters.GetPerPage())
|
||||
|
||||
if err := query.Find(&entries).Error; err != nil {
|
||||
return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch Apple social auth entries"})
|
||||
}
|
||||
|
||||
// Build response
|
||||
responses := make([]AppleSocialAuthResponse, len(entries))
|
||||
for i, entry := range entries {
|
||||
responses[i] = h.toResponse(&entry)
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, dto.NewPaginatedResponse(responses, total, filters.GetPage(), filters.GetPerPage()))
|
||||
}
|
||||
|
||||
// Get handles GET /api/admin/apple-social-auth/:id
|
||||
func (h *AdminAppleSocialAuthHandler) Get(c echo.Context) error {
|
||||
id, err := strconv.ParseUint(c.Param("id"), 10, 32)
|
||||
if err != nil {
|
||||
return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid ID"})
|
||||
}
|
||||
|
||||
var entry models.AppleSocialAuth
|
||||
if err := h.db.Preload("User").First(&entry, id).Error; err != nil {
|
||||
if err == gorm.ErrRecordNotFound {
|
||||
return c.JSON(http.StatusNotFound, map[string]interface{}{"error": "Apple social auth entry not found"})
|
||||
}
|
||||
return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch Apple social auth entry"})
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, h.toResponse(&entry))
|
||||
}
|
||||
|
||||
// GetByUser handles GET /api/admin/apple-social-auth/user/:user_id
|
||||
func (h *AdminAppleSocialAuthHandler) GetByUser(c echo.Context) error {
|
||||
userID, err := strconv.ParseUint(c.Param("user_id"), 10, 32)
|
||||
if err != nil {
|
||||
return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid user ID"})
|
||||
}
|
||||
|
||||
var entry models.AppleSocialAuth
|
||||
if err := h.db.Preload("User").Where("user_id = ?", userID).First(&entry).Error; err != nil {
|
||||
if err == gorm.ErrRecordNotFound {
|
||||
return c.JSON(http.StatusNotFound, map[string]interface{}{"error": "Apple social auth entry not found for user"})
|
||||
}
|
||||
return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch Apple social auth entry"})
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, h.toResponse(&entry))
|
||||
}
|
||||
|
||||
// Update handles PUT /api/admin/apple-social-auth/:id
|
||||
func (h *AdminAppleSocialAuthHandler) Update(c echo.Context) error {
|
||||
id, err := strconv.ParseUint(c.Param("id"), 10, 32)
|
||||
if err != nil {
|
||||
return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid ID"})
|
||||
}
|
||||
|
||||
var entry models.AppleSocialAuth
|
||||
if err := h.db.First(&entry, id).Error; err != nil {
|
||||
if err == gorm.ErrRecordNotFound {
|
||||
return c.JSON(http.StatusNotFound, map[string]interface{}{"error": "Apple social auth entry not found"})
|
||||
}
|
||||
return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch Apple social auth entry"})
|
||||
}
|
||||
|
||||
var req UpdateAppleSocialAuthRequest
|
||||
if err := c.Bind(&req); err != nil {
|
||||
return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid request body"})
|
||||
}
|
||||
|
||||
if req.Email != nil {
|
||||
entry.Email = *req.Email
|
||||
}
|
||||
if req.IsPrivateEmail != nil {
|
||||
entry.IsPrivateEmail = *req.IsPrivateEmail
|
||||
}
|
||||
|
||||
if err := h.db.Save(&entry).Error; err != nil {
|
||||
return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to update Apple social auth entry"})
|
||||
}
|
||||
|
||||
h.db.Preload("User").First(&entry, id)
|
||||
return c.JSON(http.StatusOK, h.toResponse(&entry))
|
||||
}
|
||||
|
||||
// Delete handles DELETE /api/admin/apple-social-auth/:id
|
||||
func (h *AdminAppleSocialAuthHandler) Delete(c echo.Context) error {
|
||||
id, err := strconv.ParseUint(c.Param("id"), 10, 32)
|
||||
if err != nil {
|
||||
return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid ID"})
|
||||
}
|
||||
|
||||
var entry models.AppleSocialAuth
|
||||
if err := h.db.First(&entry, id).Error; err != nil {
|
||||
if err == gorm.ErrRecordNotFound {
|
||||
return c.JSON(http.StatusNotFound, map[string]interface{}{"error": "Apple social auth entry not found"})
|
||||
}
|
||||
return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch Apple social auth entry"})
|
||||
}
|
||||
|
||||
if err := h.db.Delete(&entry).Error; err != nil {
|
||||
return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to delete Apple social auth entry"})
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, map[string]interface{}{"message": "Apple social auth entry deleted successfully"})
|
||||
}
|
||||
|
||||
// BulkDelete handles DELETE /api/admin/apple-social-auth/bulk
|
||||
func (h *AdminAppleSocialAuthHandler) BulkDelete(c echo.Context) error {
|
||||
var req dto.BulkDeleteRequest
|
||||
if err := c.Bind(&req); err != nil {
|
||||
return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid request body"})
|
||||
}
|
||||
|
||||
result := h.db.Where("id IN ?", req.IDs).Delete(&models.AppleSocialAuth{})
|
||||
if result.Error != nil {
|
||||
return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to delete Apple social auth entries"})
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, map[string]interface{}{"message": "Apple social auth entries deleted successfully", "count": result.RowsAffected})
|
||||
}
|
||||
|
||||
// toResponse converts an AppleSocialAuth model to AppleSocialAuthResponse
|
||||
func (h *AdminAppleSocialAuthHandler) toResponse(entry *models.AppleSocialAuth) AppleSocialAuthResponse {
|
||||
response := AppleSocialAuthResponse{
|
||||
ID: entry.ID,
|
||||
UserID: entry.UserID,
|
||||
AppleID: entry.AppleID,
|
||||
Email: entry.Email,
|
||||
IsPrivateEmail: entry.IsPrivateEmail,
|
||||
CreatedAt: entry.CreatedAt.Format("2006-01-02T15:04:05Z"),
|
||||
UpdatedAt: entry.UpdatedAt.Format("2006-01-02T15:04:05Z"),
|
||||
}
|
||||
|
||||
if entry.User.ID != 0 {
|
||||
response.Username = entry.User.Username
|
||||
response.UserEmail = entry.User.Email
|
||||
}
|
||||
|
||||
return response
|
||||
func (h *AdminAppleSocialAuthHandler) gone(c echo.Context) error {
|
||||
return c.JSON(http.StatusGone, map[string]string{"message": "Apple social auth is managed by Ory Kratos"})
|
||||
}
|
||||
func (h *AdminAppleSocialAuthHandler) List(c echo.Context) error { return h.gone(c) }
|
||||
func (h *AdminAppleSocialAuthHandler) Get(c echo.Context) error { return h.gone(c) }
|
||||
func (h *AdminAppleSocialAuthHandler) Delete(c echo.Context) error { return h.gone(c) }
|
||||
func (h *AdminAppleSocialAuthHandler) BulkDelete(c echo.Context) error { return h.gone(c) }
|
||||
func (h *AdminAppleSocialAuthHandler) Update(c echo.Context) error { return h.gone(c) }
|
||||
func (h *AdminAppleSocialAuthHandler) GetByUser(c echo.Context) error { return h.gone(c) }
|
||||
|
||||
@@ -1,144 +1,27 @@
|
||||
// auth_token_handler is a stub — the user_authtoken table was dropped in the
|
||||
// Ory Kratos migration (phase 2). Auth tokens are now Kratos sessions.
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"strconv"
|
||||
|
||||
"github.com/labstack/echo/v4"
|
||||
"gorm.io/gorm"
|
||||
|
||||
"github.com/treytartt/honeydue-api/internal/admin/dto"
|
||||
"github.com/treytartt/honeydue-api/internal/models"
|
||||
)
|
||||
|
||||
// AdminAuthTokenHandler handles admin auth token management endpoints
|
||||
// AdminAuthTokenHandler is a no-op stub.
|
||||
type AdminAuthTokenHandler struct {
|
||||
db *gorm.DB
|
||||
}
|
||||
|
||||
// NewAdminAuthTokenHandler creates a new admin auth token handler
|
||||
func NewAdminAuthTokenHandler(db *gorm.DB) *AdminAuthTokenHandler {
|
||||
return &AdminAuthTokenHandler{db: db}
|
||||
}
|
||||
|
||||
// AuthTokenResponse represents an auth token in API responses
|
||||
type AuthTokenResponse struct {
|
||||
Key string `json:"key"`
|
||||
UserID uint `json:"user_id"`
|
||||
Username string `json:"username"`
|
||||
Email string `json:"email"`
|
||||
Created string `json:"created"`
|
||||
}
|
||||
|
||||
// List handles GET /api/admin/auth-tokens
|
||||
func (h *AdminAuthTokenHandler) List(c echo.Context) error {
|
||||
var filters dto.PaginationParams
|
||||
if err := c.Bind(&filters); err != nil {
|
||||
return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid request body"})
|
||||
}
|
||||
|
||||
var tokens []models.AuthToken
|
||||
var total int64
|
||||
|
||||
query := h.db.Model(&models.AuthToken{}).Preload("User")
|
||||
|
||||
// Apply search (search by user info)
|
||||
if filters.Search != "" {
|
||||
search := "%" + filters.Search + "%"
|
||||
query = query.Joins("JOIN auth_user ON auth_user.id = user_authtoken.user_id").
|
||||
Where(
|
||||
"auth_user.username ILIKE ? OR auth_user.email ILIKE ? OR user_authtoken.key ILIKE ?",
|
||||
search, search, search,
|
||||
)
|
||||
}
|
||||
|
||||
// Get total count
|
||||
query.Count(&total)
|
||||
|
||||
// Apply sorting (allowlist prevents SQL injection via sort_by parameter)
|
||||
sortBy := filters.GetSafeSortBy([]string{
|
||||
"created", "user_id",
|
||||
}, "created")
|
||||
query = query.Order(sortBy + " " + filters.GetSortDir())
|
||||
|
||||
// Apply pagination
|
||||
query = query.Offset(filters.GetOffset()).Limit(filters.GetPerPage())
|
||||
|
||||
if err := query.Find(&tokens).Error; err != nil {
|
||||
return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch auth tokens"})
|
||||
}
|
||||
|
||||
// Build response
|
||||
responses := make([]AuthTokenResponse, len(tokens))
|
||||
for i, token := range tokens {
|
||||
responses[i] = AuthTokenResponse{
|
||||
Key: token.Key,
|
||||
UserID: token.UserID,
|
||||
Username: token.User.Username,
|
||||
Email: token.User.Email,
|
||||
Created: token.Created.Format("2006-01-02T15:04:05Z"),
|
||||
}
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, dto.NewPaginatedResponse(responses, total, filters.GetPage(), filters.GetPerPage()))
|
||||
}
|
||||
|
||||
// Get handles GET /api/admin/auth-tokens/:id (id is actually user_id)
|
||||
func (h *AdminAuthTokenHandler) Get(c echo.Context) error {
|
||||
id, err := strconv.ParseUint(c.Param("id"), 10, 32)
|
||||
if err != nil {
|
||||
return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid user ID"})
|
||||
}
|
||||
|
||||
var token models.AuthToken
|
||||
if err := h.db.Preload("User").Where("user_id = ?", id).First(&token).Error; err != nil {
|
||||
if err == gorm.ErrRecordNotFound {
|
||||
return c.JSON(http.StatusNotFound, map[string]interface{}{"error": "Auth token not found"})
|
||||
}
|
||||
return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch auth token"})
|
||||
}
|
||||
|
||||
response := AuthTokenResponse{
|
||||
Key: token.Key,
|
||||
UserID: token.UserID,
|
||||
Username: token.User.Username,
|
||||
Email: token.User.Email,
|
||||
Created: token.Created.Format("2006-01-02T15:04:05Z"),
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, response)
|
||||
}
|
||||
|
||||
// Delete handles DELETE /api/admin/auth-tokens/:id (revoke token)
|
||||
func (h *AdminAuthTokenHandler) Delete(c echo.Context) error {
|
||||
id, err := strconv.ParseUint(c.Param("id"), 10, 32)
|
||||
if err != nil {
|
||||
return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid user ID"})
|
||||
}
|
||||
|
||||
result := h.db.Where("user_id = ?", id).Delete(&models.AuthToken{})
|
||||
if result.Error != nil {
|
||||
return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to revoke token"})
|
||||
}
|
||||
|
||||
if result.RowsAffected == 0 {
|
||||
return c.JSON(http.StatusNotFound, map[string]interface{}{"error": "Auth token not found"})
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, map[string]interface{}{"message": "Auth token revoked successfully"})
|
||||
}
|
||||
|
||||
// BulkDelete handles DELETE /api/admin/auth-tokens/bulk
|
||||
func (h *AdminAuthTokenHandler) BulkDelete(c echo.Context) error {
|
||||
var req dto.BulkDeleteRequest
|
||||
if err := c.Bind(&req); err != nil {
|
||||
return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid request body"})
|
||||
}
|
||||
|
||||
result := h.db.Where("user_id IN ?", req.IDs).Delete(&models.AuthToken{})
|
||||
if result.Error != nil {
|
||||
return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to revoke tokens"})
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, map[string]interface{}{"message": "Auth tokens revoked successfully", "count": result.RowsAffected})
|
||||
func (h *AdminAuthTokenHandler) gone(c echo.Context) error {
|
||||
return c.JSON(http.StatusGone, map[string]string{"message": "auth tokens are managed by Ory Kratos"})
|
||||
}
|
||||
func (h *AdminAuthTokenHandler) List(c echo.Context) error { return h.gone(c) }
|
||||
func (h *AdminAuthTokenHandler) Get(c echo.Context) error { return h.gone(c) }
|
||||
func (h *AdminAuthTokenHandler) Delete(c echo.Context) error { return h.gone(c) }
|
||||
func (h *AdminAuthTokenHandler) BulkDelete(c echo.Context) error { return h.gone(c) }
|
||||
|
||||
@@ -1,162 +1,28 @@
|
||||
// confirmation_code_handler is a stub — the user_confirmationcode table was
|
||||
// dropped in the Ory Kratos migration (phase 2). Email verification is now
|
||||
// handled by Kratos.
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/labstack/echo/v4"
|
||||
"gorm.io/gorm"
|
||||
|
||||
"github.com/treytartt/honeydue-api/internal/admin/dto"
|
||||
"github.com/treytartt/honeydue-api/internal/models"
|
||||
)
|
||||
|
||||
// maskCode hides a confirmation code for display, leaving only its last four
// characters readable and replacing every other character with '*'. Codes of
// four characters or fewer are masked completely.
//
// Characters are counted as runes, not bytes, so a multi-byte character is
// never split and the number of '*' matches the number of hidden characters.
func maskCode(code string) string {
	const visible = 4

	runes := []rune(code)
	if len(runes) <= visible {
		return strings.Repeat("*", len(runes))
	}

	hidden := len(runes) - visible
	return strings.Repeat("*", hidden) + string(runes[hidden:])
}
|
||||
|
||||
// AdminConfirmationCodeHandler handles admin confirmation code management endpoints
|
||||
// AdminConfirmationCodeHandler is a no-op stub.
|
||||
type AdminConfirmationCodeHandler struct {
|
||||
db *gorm.DB
|
||||
}
|
||||
|
||||
// NewAdminConfirmationCodeHandler creates a new admin confirmation code handler
|
||||
func NewAdminConfirmationCodeHandler(db *gorm.DB) *AdminConfirmationCodeHandler {
|
||||
return &AdminConfirmationCodeHandler{db: db}
|
||||
}
|
||||
|
||||
// ConfirmationCodeResponse is the JSON shape used for one confirmation code in
// admin API responses. Timestamps are carried as preformatted strings.
type ConfirmationCodeResponse struct {
	ID        uint   `json:"id"`
	UserID    uint   `json:"user_id"`
	Username  string `json:"username"`
	Email     string `json:"email"`
	Code      string `json:"code"`
	ExpiresAt string `json:"expires_at"`
	IsUsed    bool   `json:"is_used"`
	CreatedAt string `json:"created_at"`
}
|
||||
|
||||
// List handles GET /api/admin/confirmation-codes
|
||||
func (h *AdminConfirmationCodeHandler) List(c echo.Context) error {
|
||||
var filters dto.PaginationParams
|
||||
if err := c.Bind(&filters); err != nil {
|
||||
return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid request body"})
|
||||
}
|
||||
|
||||
var codes []models.ConfirmationCode
|
||||
var total int64
|
||||
|
||||
query := h.db.Model(&models.ConfirmationCode{}).Preload("User")
|
||||
|
||||
// Apply search (search by user info or code)
|
||||
if filters.Search != "" {
|
||||
search := "%" + filters.Search + "%"
|
||||
query = query.Joins("JOIN auth_user ON auth_user.id = user_confirmationcode.user_id").
|
||||
Where(
|
||||
"auth_user.username ILIKE ? OR auth_user.email ILIKE ? OR user_confirmationcode.code ILIKE ?",
|
||||
search, search, search,
|
||||
)
|
||||
}
|
||||
|
||||
// Get total count
|
||||
query.Count(&total)
|
||||
|
||||
// Apply sorting (allowlist prevents SQL injection via sort_by parameter)
|
||||
sortBy := filters.GetSafeSortBy([]string{
|
||||
"id", "user_id", "created_at", "expires_at", "is_used",
|
||||
}, "created_at")
|
||||
query = query.Order(sortBy + " " + filters.GetSortDir())
|
||||
|
||||
// Apply pagination
|
||||
query = query.Offset(filters.GetOffset()).Limit(filters.GetPerPage())
|
||||
|
||||
if err := query.Find(&codes).Error; err != nil {
|
||||
return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch confirmation codes"})
|
||||
}
|
||||
|
||||
// Build response
|
||||
responses := make([]ConfirmationCodeResponse, len(codes))
|
||||
for i, code := range codes {
|
||||
responses[i] = ConfirmationCodeResponse{
|
||||
ID: code.ID,
|
||||
UserID: code.UserID,
|
||||
Username: code.User.Username,
|
||||
Email: code.User.Email,
|
||||
Code: maskCode(code.Code),
|
||||
ExpiresAt: code.ExpiresAt.Format("2006-01-02T15:04:05Z"),
|
||||
IsUsed: code.IsUsed,
|
||||
CreatedAt: code.CreatedAt.Format("2006-01-02T15:04:05Z"),
|
||||
}
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, dto.NewPaginatedResponse(responses, total, filters.GetPage(), filters.GetPerPage()))
|
||||
}
|
||||
|
||||
// Get handles GET /api/admin/confirmation-codes/:id
|
||||
func (h *AdminConfirmationCodeHandler) Get(c echo.Context) error {
|
||||
id, err := strconv.ParseUint(c.Param("id"), 10, 32)
|
||||
if err != nil {
|
||||
return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid ID"})
|
||||
}
|
||||
|
||||
var code models.ConfirmationCode
|
||||
if err := h.db.Preload("User").First(&code, id).Error; err != nil {
|
||||
if err == gorm.ErrRecordNotFound {
|
||||
return c.JSON(http.StatusNotFound, map[string]interface{}{"error": "Confirmation code not found"})
|
||||
}
|
||||
return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch confirmation code"})
|
||||
}
|
||||
|
||||
response := ConfirmationCodeResponse{
|
||||
ID: code.ID,
|
||||
UserID: code.UserID,
|
||||
Username: code.User.Username,
|
||||
Email: code.User.Email,
|
||||
Code: maskCode(code.Code),
|
||||
ExpiresAt: code.ExpiresAt.Format("2006-01-02T15:04:05Z"),
|
||||
IsUsed: code.IsUsed,
|
||||
CreatedAt: code.CreatedAt.Format("2006-01-02T15:04:05Z"),
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, response)
|
||||
}
|
||||
|
||||
// Delete handles DELETE /api/admin/confirmation-codes/:id
|
||||
func (h *AdminConfirmationCodeHandler) Delete(c echo.Context) error {
|
||||
id, err := strconv.ParseUint(c.Param("id"), 10, 32)
|
||||
if err != nil {
|
||||
return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid ID"})
|
||||
}
|
||||
|
||||
result := h.db.Delete(&models.ConfirmationCode{}, id)
|
||||
if result.Error != nil {
|
||||
return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to delete confirmation code"})
|
||||
}
|
||||
|
||||
if result.RowsAffected == 0 {
|
||||
return c.JSON(http.StatusNotFound, map[string]interface{}{"error": "Confirmation code not found"})
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, map[string]interface{}{"message": "Confirmation code deleted successfully"})
|
||||
}
|
||||
|
||||
// BulkDelete handles DELETE /api/admin/confirmation-codes/bulk
|
||||
func (h *AdminConfirmationCodeHandler) BulkDelete(c echo.Context) error {
|
||||
var req dto.BulkDeleteRequest
|
||||
if err := c.Bind(&req); err != nil {
|
||||
return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid request body"})
|
||||
}
|
||||
|
||||
result := h.db.Where("id IN ?", req.IDs).Delete(&models.ConfirmationCode{})
|
||||
if result.Error != nil {
|
||||
return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to delete confirmation codes"})
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, map[string]interface{}{"message": "Confirmation codes deleted successfully", "count": result.RowsAffected})
|
||||
func (h *AdminConfirmationCodeHandler) gone(c echo.Context) error {
|
||||
return c.JSON(http.StatusGone, map[string]string{"message": "confirmation codes are managed by Ory Kratos"})
|
||||
}
|
||||
func (h *AdminConfirmationCodeHandler) List(c echo.Context) error { return h.gone(c) }
|
||||
func (h *AdminConfirmationCodeHandler) Get(c echo.Context) error { return h.gone(c) }
|
||||
func (h *AdminConfirmationCodeHandler) Delete(c echo.Context) error { return h.gone(c) }
|
||||
func (h *AdminConfirmationCodeHandler) BulkDelete(c echo.Context) error { return h.gone(c) }
|
||||
|
||||
@@ -8,16 +8,18 @@ import (
|
||||
"gorm.io/gorm"
|
||||
|
||||
"github.com/treytartt/honeydue-api/internal/models"
|
||||
"github.com/treytartt/honeydue-api/internal/services"
|
||||
)
|
||||
|
||||
// AdminLimitationsHandler handles subscription limitations management
|
||||
type AdminLimitationsHandler struct {
|
||||
db *gorm.DB
|
||||
db *gorm.DB
|
||||
cache *services.CacheService
|
||||
}
|
||||
|
||||
// NewAdminLimitationsHandler creates a new handler
|
||||
func NewAdminLimitationsHandler(db *gorm.DB) *AdminLimitationsHandler {
|
||||
return &AdminLimitationsHandler{db: db}
|
||||
// NewAdminLimitationsHandler creates a new handler. Cache is optional.
|
||||
func NewAdminLimitationsHandler(db *gorm.DB, cache *services.CacheService) *AdminLimitationsHandler {
|
||||
return &AdminLimitationsHandler{db: db, cache: cache}
|
||||
}
|
||||
|
||||
// === Settings (enable_limitations) ===
|
||||
@@ -27,14 +29,25 @@ type LimitationsSettingsResponse struct {
|
||||
EnableLimitations bool `json:"enable_limitations"`
|
||||
}
|
||||
|
||||
// GetSettings handles GET /api/admin/limitations/settings
|
||||
// GetSettings handles GET /api/admin/limitations/settings.
|
||||
// Reads through Redis cache first; on miss falls through to DB.
|
||||
func (h *AdminLimitationsHandler) GetSettings(c echo.Context) error {
|
||||
ctx := c.Request().Context()
|
||||
|
||||
if h.cache != nil {
|
||||
var cached models.SubscriptionSettings
|
||||
if err := h.cache.GetCachedSubscriptionSettings(ctx, &cached); err == nil {
|
||||
return c.JSON(http.StatusOK, LimitationsSettingsResponse{
|
||||
EnableLimitations: cached.EnableLimitations,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
var settings models.SubscriptionSettings
|
||||
if err := h.db.First(&settings, 1).Error; err != nil {
|
||||
if err := h.db.WithContext(ctx).First(&settings, 1).Error; err != nil {
|
||||
if err == gorm.ErrRecordNotFound {
|
||||
// Create default settings
|
||||
settings = models.SubscriptionSettings{ID: 1, EnableLimitations: false}
|
||||
if err := h.db.Create(&settings).Error; err != nil {
|
||||
if err := h.db.WithContext(ctx).Create(&settings).Error; err != nil {
|
||||
return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to create default settings"})
|
||||
}
|
||||
} else {
|
||||
@@ -42,6 +55,10 @@ func (h *AdminLimitationsHandler) GetSettings(c echo.Context) error {
|
||||
}
|
||||
}
|
||||
|
||||
if h.cache != nil {
|
||||
_ = h.cache.CacheSubscriptionSettings(ctx, &settings)
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, LimitationsSettingsResponse{
|
||||
EnableLimitations: settings.EnableLimitations,
|
||||
})
|
||||
@@ -60,7 +77,8 @@ func (h *AdminLimitationsHandler) UpdateSettings(c echo.Context) error {
|
||||
}
|
||||
|
||||
var settings models.SubscriptionSettings
|
||||
if err := h.db.First(&settings, 1).Error; err != nil {
|
||||
ctx := c.Request().Context()
|
||||
if err := h.db.WithContext(ctx).First(&settings, 1).Error; err != nil {
|
||||
if err == gorm.ErrRecordNotFound {
|
||||
settings = models.SubscriptionSettings{ID: 1}
|
||||
} else {
|
||||
@@ -72,10 +90,15 @@ func (h *AdminLimitationsHandler) UpdateSettings(c echo.Context) error {
|
||||
settings.EnableLimitations = *req.EnableLimitations
|
||||
}
|
||||
|
||||
if err := h.db.Save(&settings).Error; err != nil {
|
||||
if err := h.db.WithContext(ctx).Save(&settings).Error; err != nil {
|
||||
return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to update settings"})
|
||||
}
|
||||
|
||||
// Invalidate the cache so the new value is visible to all pods.
|
||||
if h.cache != nil {
|
||||
_ = h.cache.InvalidateSubscriptionSettings(ctx)
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, LimitationsSettingsResponse{
|
||||
EnableLimitations: settings.EnableLimitations,
|
||||
})
|
||||
|
||||
@@ -1,159 +1,28 @@
|
||||
// password_reset_code_handler is a stub — the user_passwordresetcode table
|
||||
// was dropped in the Ory Kratos migration (phase 2). Password resets are now
|
||||
// handled by Kratos.
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"strconv"
|
||||
|
||||
"github.com/labstack/echo/v4"
|
||||
"gorm.io/gorm"
|
||||
|
||||
"github.com/treytartt/honeydue-api/internal/admin/dto"
|
||||
"github.com/treytartt/honeydue-api/internal/models"
|
||||
)
|
||||
|
||||
// AdminPasswordResetCodeHandler handles admin password reset code management endpoints
|
||||
// AdminPasswordResetCodeHandler is a no-op stub.
|
||||
type AdminPasswordResetCodeHandler struct {
|
||||
db *gorm.DB
|
||||
}
|
||||
|
||||
// NewAdminPasswordResetCodeHandler creates a new admin password reset code handler
|
||||
func NewAdminPasswordResetCodeHandler(db *gorm.DB) *AdminPasswordResetCodeHandler {
|
||||
return &AdminPasswordResetCodeHandler{db: db}
|
||||
}
|
||||
|
||||
// PasswordResetCodeResponse is the JSON shape used for one password reset code
// in admin API responses. Timestamps are carried as preformatted strings.
type PasswordResetCodeResponse struct {
	ID          uint   `json:"id"`
	UserID      uint   `json:"user_id"`
	Username    string `json:"username"`
	Email       string `json:"email"`
	ResetToken  string `json:"reset_token"`
	ExpiresAt   string `json:"expires_at"`
	Used        bool   `json:"used"`
	Attempts    int    `json:"attempts"`
	MaxAttempts int    `json:"max_attempts"`
	CreatedAt   string `json:"created_at"`
}
|
||||
|
||||
// List handles GET /api/admin/password-reset-codes
|
||||
func (h *AdminPasswordResetCodeHandler) List(c echo.Context) error {
|
||||
var filters dto.PaginationParams
|
||||
if err := c.Bind(&filters); err != nil {
|
||||
return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid request body"})
|
||||
}
|
||||
|
||||
var codes []models.PasswordResetCode
|
||||
var total int64
|
||||
|
||||
query := h.db.Model(&models.PasswordResetCode{}).Preload("User")
|
||||
|
||||
// Apply search (search by user info or token)
|
||||
if filters.Search != "" {
|
||||
search := "%" + filters.Search + "%"
|
||||
query = query.Joins("JOIN auth_user ON auth_user.id = user_passwordresetcode.user_id").
|
||||
Where(
|
||||
"auth_user.username ILIKE ? OR auth_user.email ILIKE ? OR user_passwordresetcode.reset_token ILIKE ?",
|
||||
search, search, search,
|
||||
)
|
||||
}
|
||||
|
||||
// Get total count
|
||||
query.Count(&total)
|
||||
|
||||
// Apply sorting (allowlist prevents SQL injection via sort_by parameter)
|
||||
sortBy := filters.GetSafeSortBy([]string{
|
||||
"id", "user_id", "created_at", "expires_at", "used",
|
||||
}, "created_at")
|
||||
query = query.Order(sortBy + " " + filters.GetSortDir())
|
||||
|
||||
// Apply pagination
|
||||
query = query.Offset(filters.GetOffset()).Limit(filters.GetPerPage())
|
||||
|
||||
if err := query.Find(&codes).Error; err != nil {
|
||||
return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch password reset codes"})
|
||||
}
|
||||
|
||||
// Build response
|
||||
responses := make([]PasswordResetCodeResponse, len(codes))
|
||||
for i, code := range codes {
|
||||
responses[i] = PasswordResetCodeResponse{
|
||||
ID: code.ID,
|
||||
UserID: code.UserID,
|
||||
Username: code.User.Username,
|
||||
Email: code.User.Email,
|
||||
ResetToken: code.ResetToken[:8] + "..." + code.ResetToken[len(code.ResetToken)-4:], // Truncate for display
|
||||
ExpiresAt: code.ExpiresAt.Format("2006-01-02T15:04:05Z"),
|
||||
Used: code.Used,
|
||||
Attempts: code.Attempts,
|
||||
MaxAttempts: code.MaxAttempts,
|
||||
CreatedAt: code.CreatedAt.Format("2006-01-02T15:04:05Z"),
|
||||
}
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, dto.NewPaginatedResponse(responses, total, filters.GetPage(), filters.GetPerPage()))
|
||||
}
|
||||
|
||||
// Get handles GET /api/admin/password-reset-codes/:id
|
||||
func (h *AdminPasswordResetCodeHandler) Get(c echo.Context) error {
|
||||
id, err := strconv.ParseUint(c.Param("id"), 10, 32)
|
||||
if err != nil {
|
||||
return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid ID"})
|
||||
}
|
||||
|
||||
var code models.PasswordResetCode
|
||||
if err := h.db.Preload("User").First(&code, id).Error; err != nil {
|
||||
if err == gorm.ErrRecordNotFound {
|
||||
return c.JSON(http.StatusNotFound, map[string]interface{}{"error": "Password reset code not found"})
|
||||
}
|
||||
return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to fetch password reset code"})
|
||||
}
|
||||
|
||||
response := PasswordResetCodeResponse{
|
||||
ID: code.ID,
|
||||
UserID: code.UserID,
|
||||
Username: code.User.Username,
|
||||
Email: code.User.Email,
|
||||
ResetToken: code.ResetToken[:8] + "..." + code.ResetToken[len(code.ResetToken)-4:],
|
||||
ExpiresAt: code.ExpiresAt.Format("2006-01-02T15:04:05Z"),
|
||||
Used: code.Used,
|
||||
Attempts: code.Attempts,
|
||||
MaxAttempts: code.MaxAttempts,
|
||||
CreatedAt: code.CreatedAt.Format("2006-01-02T15:04:05Z"),
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, response)
|
||||
}
|
||||
|
||||
// Delete handles DELETE /api/admin/password-reset-codes/:id
|
||||
func (h *AdminPasswordResetCodeHandler) Delete(c echo.Context) error {
|
||||
id, err := strconv.ParseUint(c.Param("id"), 10, 32)
|
||||
if err != nil {
|
||||
return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid ID"})
|
||||
}
|
||||
|
||||
result := h.db.Delete(&models.PasswordResetCode{}, id)
|
||||
if result.Error != nil {
|
||||
return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to delete password reset code"})
|
||||
}
|
||||
|
||||
if result.RowsAffected == 0 {
|
||||
return c.JSON(http.StatusNotFound, map[string]interface{}{"error": "Password reset code not found"})
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, map[string]interface{}{"message": "Password reset code deleted successfully"})
|
||||
}
|
||||
|
||||
// BulkDelete handles DELETE /api/admin/password-reset-codes/bulk
|
||||
func (h *AdminPasswordResetCodeHandler) BulkDelete(c echo.Context) error {
|
||||
var req dto.BulkDeleteRequest
|
||||
if err := c.Bind(&req); err != nil {
|
||||
return c.JSON(http.StatusBadRequest, map[string]interface{}{"error": "Invalid request body"})
|
||||
}
|
||||
|
||||
result := h.db.Where("id IN ?", req.IDs).Delete(&models.PasswordResetCode{})
|
||||
if result.Error != nil {
|
||||
return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to delete password reset codes"})
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, map[string]interface{}{"message": "Password reset codes deleted successfully", "count": result.RowsAffected})
|
||||
func (h *AdminPasswordResetCodeHandler) gone(c echo.Context) error {
|
||||
return c.JSON(http.StatusGone, map[string]string{"message": "password reset codes are managed by Ory Kratos"})
|
||||
}
|
||||
func (h *AdminPasswordResetCodeHandler) List(c echo.Context) error { return h.gone(c) }
|
||||
func (h *AdminPasswordResetCodeHandler) Get(c echo.Context) error { return h.gone(c) }
|
||||
func (h *AdminPasswordResetCodeHandler) Delete(c echo.Context) error { return h.gone(c) }
|
||||
func (h *AdminPasswordResetCodeHandler) BulkDelete(c echo.Context) error { return h.gone(c) }
|
||||
|
||||
@@ -18,12 +18,14 @@ import (
|
||||
|
||||
// AdminSettingsHandler handles system settings management
|
||||
type AdminSettingsHandler struct {
|
||||
db *gorm.DB
|
||||
db *gorm.DB
|
||||
cache *services.CacheService
|
||||
}
|
||||
|
||||
// NewAdminSettingsHandler creates a new handler
|
||||
func NewAdminSettingsHandler(db *gorm.DB) *AdminSettingsHandler {
|
||||
return &AdminSettingsHandler{db: db}
|
||||
// NewAdminSettingsHandler creates a new handler. The cache may be nil; the
|
||||
// handler falls through to direct DB reads in that case.
|
||||
func NewAdminSettingsHandler(db *gorm.DB, cache *services.CacheService) *AdminSettingsHandler {
|
||||
return &AdminSettingsHandler{db: db, cache: cache}
|
||||
}
|
||||
|
||||
// SettingsResponse represents the settings response
|
||||
@@ -34,10 +36,29 @@ type SettingsResponse struct {
|
||||
TrialDurationDays int `json:"trial_duration_days"`
|
||||
}
|
||||
|
||||
// GetSettings handles GET /api/admin/settings
|
||||
// GetSettings handles GET /api/admin/settings.
|
||||
//
|
||||
// Reads through Redis (30-min TTL) before hitting Postgres so the same
|
||||
// row that's checked on every authed request and every monitoring poll
|
||||
// stays hot. Cache miss / first boot creates and caches the default row.
|
||||
func (h *AdminSettingsHandler) GetSettings(c echo.Context) error {
|
||||
ctx := c.Request().Context()
|
||||
|
||||
// Try cache first.
|
||||
if h.cache != nil {
|
||||
var cached models.SubscriptionSettings
|
||||
if err := h.cache.GetCachedSubscriptionSettings(ctx, &cached); err == nil {
|
||||
return c.JSON(http.StatusOK, SettingsResponse{
|
||||
EnableLimitations: cached.EnableLimitations,
|
||||
EnableMonitoring: cached.EnableMonitoring,
|
||||
TrialEnabled: cached.TrialEnabled,
|
||||
TrialDurationDays: cached.TrialDurationDays,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
var settings models.SubscriptionSettings
|
||||
if err := h.db.First(&settings, 1).Error; err != nil {
|
||||
if err := h.db.WithContext(ctx).First(&settings, 1).Error; err != nil {
|
||||
if err == gorm.ErrRecordNotFound {
|
||||
// Create default settings
|
||||
settings = models.SubscriptionSettings{
|
||||
@@ -47,7 +68,7 @@ func (h *AdminSettingsHandler) GetSettings(c echo.Context) error {
|
||||
TrialEnabled: true,
|
||||
TrialDurationDays: 14,
|
||||
}
|
||||
if err := h.db.Create(&settings).Error; err != nil {
|
||||
if err := h.db.WithContext(ctx).Create(&settings).Error; err != nil {
|
||||
return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to create default settings"})
|
||||
}
|
||||
} else {
|
||||
@@ -55,6 +76,10 @@ func (h *AdminSettingsHandler) GetSettings(c echo.Context) error {
|
||||
}
|
||||
}
|
||||
|
||||
if h.cache != nil {
|
||||
_ = h.cache.CacheSubscriptionSettings(ctx, &settings)
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, SettingsResponse{
|
||||
EnableLimitations: settings.EnableLimitations,
|
||||
EnableMonitoring: settings.EnableMonitoring,
|
||||
@@ -79,7 +104,7 @@ func (h *AdminSettingsHandler) UpdateSettings(c echo.Context) error {
|
||||
}
|
||||
|
||||
var settings models.SubscriptionSettings
|
||||
if err := h.db.First(&settings, 1).Error; err != nil {
|
||||
if err := h.db.WithContext(c.Request().Context()).First(&settings, 1).Error; err != nil {
|
||||
if err == gorm.ErrRecordNotFound {
|
||||
settings = models.SubscriptionSettings{
|
||||
ID: 1,
|
||||
@@ -108,10 +133,16 @@ func (h *AdminSettingsHandler) UpdateSettings(c echo.Context) error {
|
||||
settings.TrialDurationDays = *req.TrialDurationDays
|
||||
}
|
||||
|
||||
if err := h.db.Save(&settings).Error; err != nil {
|
||||
if err := h.db.WithContext(c.Request().Context()).Save(&settings).Error; err != nil {
|
||||
return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to update settings"})
|
||||
}
|
||||
|
||||
// Invalidate the cache so all pods pick up the new value on their
|
||||
// next read (instead of waiting for the 30-min TTL).
|
||||
if h.cache != nil {
|
||||
_ = h.cache.InvalidateSubscriptionSettings(c.Request().Context())
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, SettingsResponse{
|
||||
EnableLimitations: settings.EnableLimitations,
|
||||
EnableMonitoring: settings.EnableMonitoring,
|
||||
@@ -217,137 +248,20 @@ func (h *AdminSettingsHandler) cacheAllLookups(ctx context.Context) (bool, error
|
||||
}
|
||||
log.Debug().Int("count", len(taskTemplates)).Msg("Cached task templates")
|
||||
|
||||
// Build and cache the unified seeded data response
|
||||
// Import the grouped response type
|
||||
seededData := map[string]interface{}{
|
||||
"residence_types": residenceTypes,
|
||||
"task_categories": categories,
|
||||
"task_priorities": priorities,
|
||||
"task_frequencies": frequencies,
|
||||
"contractor_specialties": specialties,
|
||||
"task_templates": buildGroupedTemplates(taskTemplates),
|
||||
// Invalidate the unified seeded-data cache for every locale. The combined
|
||||
// response is localized (lookup display_name + home-profile options) and is
|
||||
// rebuilt per-locale on demand by the static_data handler, so the correct
|
||||
// action after a lookup change is to clear all language variants rather than
|
||||
// pre-warm a single (non-localized) blob.
|
||||
if err := cache.InvalidateSeededData(ctx); err != nil {
|
||||
return false, fmt.Errorf("failed to invalidate seeded data: %w", err)
|
||||
}
|
||||
|
||||
etag, err := cache.CacheSeededData(ctx, seededData)
|
||||
if err != nil {
|
||||
return false, fmt.Errorf("failed to cache seeded data: %w", err)
|
||||
}
|
||||
log.Debug().Str("etag", etag).Msg("Cached unified seeded data")
|
||||
log.Debug().Msg("Invalidated per-locale seeded data cache")
|
||||
|
||||
log.Info().Msg("All lookup data cached in Redis successfully")
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// buildGroupedTemplates groups task templates by category for the seeded data response
|
||||
func buildGroupedTemplates(templates []models.TaskTemplate) map[string]interface{} {
|
||||
type templateResponse struct {
|
||||
ID uint `json:"id"`
|
||||
Title string `json:"title"`
|
||||
Description string `json:"description"`
|
||||
CategoryID *uint `json:"category_id"`
|
||||
Category map[string]interface{} `json:"category,omitempty"`
|
||||
FrequencyID *uint `json:"frequency_id"`
|
||||
Frequency map[string]interface{} `json:"frequency,omitempty"`
|
||||
IconIOS string `json:"icon_ios"`
|
||||
IconAndroid string `json:"icon_android"`
|
||||
Tags []string `json:"tags"`
|
||||
DisplayOrder int `json:"display_order"`
|
||||
IsActive bool `json:"is_active"`
|
||||
}
|
||||
|
||||
type categoryGroup struct {
|
||||
CategoryName string `json:"category_name"`
|
||||
CategoryID *uint `json:"category_id"`
|
||||
Templates []templateResponse `json:"templates"`
|
||||
Count int `json:"count"`
|
||||
}
|
||||
|
||||
categoryMap := make(map[string]*categoryGroup)
|
||||
categoryOrder := []string{}
|
||||
|
||||
for _, t := range templates {
|
||||
categoryName := "Uncategorized"
|
||||
var categoryID *uint
|
||||
if t.Category != nil {
|
||||
categoryName = t.Category.Name
|
||||
categoryID = &t.Category.ID
|
||||
}
|
||||
|
||||
if _, exists := categoryMap[categoryName]; !exists {
|
||||
categoryMap[categoryName] = &categoryGroup{
|
||||
CategoryName: categoryName,
|
||||
CategoryID: categoryID,
|
||||
Templates: []templateResponse{},
|
||||
}
|
||||
categoryOrder = append(categoryOrder, categoryName)
|
||||
}
|
||||
|
||||
resp := templateResponse{
|
||||
ID: t.ID,
|
||||
Title: t.Title,
|
||||
Description: t.Description,
|
||||
CategoryID: t.CategoryID,
|
||||
FrequencyID: t.FrequencyID,
|
||||
IconIOS: t.IconIOS,
|
||||
IconAndroid: t.IconAndroid,
|
||||
Tags: parseTags(t.Tags),
|
||||
DisplayOrder: t.DisplayOrder,
|
||||
IsActive: t.IsActive,
|
||||
}
|
||||
|
||||
if t.Category != nil {
|
||||
resp.Category = map[string]interface{}{
|
||||
"id": t.Category.ID,
|
||||
"name": t.Category.Name,
|
||||
"description": t.Category.Description,
|
||||
"icon": t.Category.Icon,
|
||||
"color": t.Category.Color,
|
||||
"display_order": t.Category.DisplayOrder,
|
||||
}
|
||||
}
|
||||
if t.Frequency != nil {
|
||||
resp.Frequency = map[string]interface{}{
|
||||
"id": t.Frequency.ID,
|
||||
"name": t.Frequency.Name,
|
||||
"days": t.Frequency.Days,
|
||||
"display_order": t.Frequency.DisplayOrder,
|
||||
}
|
||||
}
|
||||
|
||||
categoryMap[categoryName].Templates = append(categoryMap[categoryName].Templates, resp)
|
||||
}
|
||||
|
||||
categories := make([]categoryGroup, len(categoryOrder))
|
||||
totalCount := 0
|
||||
for i, name := range categoryOrder {
|
||||
group := categoryMap[name]
|
||||
group.Count = len(group.Templates)
|
||||
totalCount += group.Count
|
||||
categories[i] = *group
|
||||
}
|
||||
|
||||
return map[string]interface{}{
|
||||
"categories": categories,
|
||||
"total_count": totalCount,
|
||||
}
|
||||
}
|
||||
|
||||
// parseTags turns a comma-separated tag string into a slice. Each piece is
// trimmed of surrounding whitespace and pieces that are empty after trimming
// are dropped. The result is never nil; an empty input yields an empty slice.
func parseTags(tags string) []string {
	if len(tags) == 0 {
		return []string{}
	}

	pieces := strings.Split(tags, ",")
	cleaned := make([]string, 0, len(pieces))
	for i := range pieces {
		tag := strings.TrimSpace(pieces[i])
		if tag == "" {
			continue
		}
		cleaned = append(cleaned, tag)
	}
	return cleaned
}
|
||||
|
||||
// SeedTestData handles POST /api/admin/settings/seed-test-data
|
||||
func (h *AdminSettingsHandler) SeedTestData(c echo.Context) error {
|
||||
if err := h.runSeedFile("002_test_data.sql"); err != nil {
|
||||
@@ -487,9 +401,9 @@ type ClearAllDataResponse struct {
|
||||
|
||||
// ClearStuckJobsResponse represents the response after clearing stuck Redis jobs
|
||||
type ClearStuckJobsResponse struct {
|
||||
Message string `json:"message"`
|
||||
KeysDeleted int `json:"keys_deleted"`
|
||||
DeletedKeys []string `json:"deleted_keys"`
|
||||
Message string `json:"message"`
|
||||
KeysDeleted int `json:"keys_deleted"`
|
||||
DeletedKeys []string `json:"deleted_keys"`
|
||||
}
|
||||
|
||||
// ClearStuckJobs handles POST /api/admin/settings/clear-stuck-jobs
|
||||
@@ -507,9 +421,9 @@ func (h *AdminSettingsHandler) ClearStuckJobs(c echo.Context) error {
|
||||
|
||||
// Patterns for asynq job keys that can get stuck
|
||||
patterns := []string{
|
||||
"asynq:{default}:retry", // Retry queue
|
||||
"asynq:{default}:archived", // Archived/dead jobs
|
||||
"asynq:{default}:t:*", // Individual task metadata
|
||||
"asynq:{default}:retry", // Retry queue
|
||||
"asynq:{default}:archived", // Archived/dead jobs
|
||||
"asynq:{default}:t:*", // Individual task metadata
|
||||
}
|
||||
|
||||
for _, pattern := range patterns {
|
||||
|
||||
@@ -207,9 +207,7 @@ func (h *AdminUserHandler) Create(c echo.Context) error {
|
||||
user.IsSuperuser = *req.IsSuperuser
|
||||
}
|
||||
|
||||
if err := user.SetPassword(req.Password); err != nil {
|
||||
return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to hash password"})
|
||||
}
|
||||
// Password management is handled by Ory Kratos; no local password hashing.
|
||||
|
||||
if err := h.db.Create(&user).Error; err != nil {
|
||||
return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to create user"})
|
||||
@@ -284,10 +282,9 @@ func (h *AdminUserHandler) Update(c echo.Context) error {
|
||||
if req.IsSuperuser != nil {
|
||||
user.IsSuperuser = *req.IsSuperuser
|
||||
}
|
||||
// Password management is handled by Ory Kratos; local password update ignored.
|
||||
if req.Password != nil {
|
||||
if err := user.SetPassword(*req.Password); err != nil {
|
||||
return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to hash password"})
|
||||
}
|
||||
_ = req.Password // Password changes must go through Kratos admin API
|
||||
}
|
||||
|
||||
if err := h.db.Save(&user).Error; err != nil {
|
||||
|
||||
@@ -25,6 +25,7 @@ type Dependencies struct {
|
||||
PushClient *push.Client
|
||||
OnboardingService *services.OnboardingEmailService
|
||||
MonitoringHandler *monitoring.Handler
|
||||
CacheService *services.CacheService
|
||||
}
|
||||
|
||||
// SetupRoutes configures all admin routes
|
||||
@@ -380,7 +381,7 @@ func SetupRoutes(router *echo.Echo, db *gorm.DB, cfg *config.Config, deps *Depen
|
||||
}
|
||||
|
||||
// System settings management (super admin only)
|
||||
settingsHandler := handlers.NewAdminSettingsHandler(db)
|
||||
settingsHandler := handlers.NewAdminSettingsHandler(db, deps.CacheService)
|
||||
settings := protected.Group("/settings")
|
||||
settings.Use(middleware.RequireSuperAdmin())
|
||||
{
|
||||
@@ -394,7 +395,7 @@ func SetupRoutes(router *echo.Echo, db *gorm.DB, cfg *config.Config, deps *Depen
|
||||
}
|
||||
|
||||
// Limitations management (tier limits, upgrade triggers)
|
||||
limitationsHandler := handlers.NewAdminLimitationsHandler(db)
|
||||
limitationsHandler := handlers.NewAdminLimitationsHandler(db, deps.CacheService)
|
||||
limitations := protected.Group("/limitations")
|
||||
{
|
||||
// Settings (enable_limitations toggle)
|
||||
|
||||
+244
-162
@@ -1,6 +1,7 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"crypto/rand"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"net/url"
|
||||
@@ -52,6 +53,7 @@ type DatabaseConfig struct {
|
||||
MaxOpenConns int
|
||||
MaxIdleConns int
|
||||
MaxLifetime time.Duration
|
||||
MaxIdleTime time.Duration
|
||||
}
|
||||
|
||||
type RedisConfig struct {
|
||||
@@ -88,8 +90,12 @@ type PushConfig struct {
|
||||
}
|
||||
|
||||
type AppleAuthConfig struct {
|
||||
ClientID string // Bundle ID (e.g., com.tt.honeyDue.honeyDueDev)
|
||||
TeamID string // Apple Developer Team ID
|
||||
ClientID string // Bundle ID, used as the `aud` claim in Sign in with Apple identity tokens
|
||||
// TeamID is currently unused — services/apple_auth.go validates identity tokens
|
||||
// against ClientID + Apple's JWKS only, with no server-to-server REST calls.
|
||||
// Wire this in if/when token revocation or refresh-token exchange is added,
|
||||
// since both require signing a client_secret JWT with team_id + key_id.
|
||||
TeamID string
|
||||
}
|
||||
|
||||
type GoogleAuthConfig struct {
|
||||
@@ -136,6 +142,13 @@ type SecurityConfig struct {
|
||||
MaxPasswordResetRate int // per hour
|
||||
TokenExpiryDays int // Number of days before auth tokens expire (default 90)
|
||||
TokenRefreshDays int // Token must be at least this many days old before refresh (default 60)
|
||||
// KratosPublicURL is the Ory Kratos public API base URL. The auth
|
||||
// middleware validates sessions against {KratosPublicURL}/sessions/whoami.
|
||||
KratosPublicURL string
|
||||
// KratosAdminURL is the Ory Kratos admin API base URL. Account deletion
|
||||
// removes the user's Kratos identity via
|
||||
// {KratosAdminURL}/admin/identities/{id}.
|
||||
KratosAdminURL string
|
||||
}
|
||||
|
||||
// StorageConfig holds file storage settings.
|
||||
@@ -177,8 +190,8 @@ type FeatureFlags struct {
|
||||
}
|
||||
|
||||
var (
|
||||
cfg *Config
|
||||
cfgOnce sync.Once
|
||||
cfg *Config
|
||||
cfgMu sync.Mutex
|
||||
)
|
||||
|
||||
// knownWeakSecretKeys contains well-known default or placeholder secret keys
|
||||
@@ -191,162 +204,170 @@ var knownWeakSecretKeys = map[string]bool{
|
||||
"change-me-in-production-secret-key-12345": true,
|
||||
}
|
||||
|
||||
// Load reads configuration from environment variables
|
||||
// Load reads configuration from environment variables.
|
||||
//
|
||||
// Caches the result so repeated calls are cheap. On validation failure, the
|
||||
// cache stays nil so a subsequent call (after env is corrected) can retry. The
|
||||
// previous implementation used sync.Once with an in-Do reset of the Once
|
||||
// itself, which races and panics with "sync: unlock of unlocked mutex".
|
||||
func Load() (*Config, error) {
|
||||
var loadErr error
|
||||
|
||||
cfgOnce.Do(func() {
|
||||
viper.SetEnvPrefix("")
|
||||
viper.AutomaticEnv()
|
||||
viper.SetEnvKeyReplacer(strings.NewReplacer(".", "_"))
|
||||
|
||||
// Set defaults
|
||||
setDefaults()
|
||||
|
||||
// Parse DATABASE_URL if set (Dokku-style)
|
||||
dbConfig := DatabaseConfig{
|
||||
Host: viper.GetString("DB_HOST"),
|
||||
Port: viper.GetInt("DB_PORT"),
|
||||
User: viper.GetString("POSTGRES_USER"),
|
||||
Password: viper.GetString("POSTGRES_PASSWORD"),
|
||||
Database: viper.GetString("POSTGRES_DB"),
|
||||
SSLMode: viper.GetString("DB_SSLMODE"),
|
||||
MaxOpenConns: viper.GetInt("DB_MAX_OPEN_CONNS"),
|
||||
MaxIdleConns: viper.GetInt("DB_MAX_IDLE_CONNS"),
|
||||
MaxLifetime: viper.GetDuration("DB_MAX_LIFETIME"),
|
||||
}
|
||||
|
||||
// Override with DATABASE_URL if present (F-16: log warning on parse failure)
|
||||
if databaseURL := viper.GetString("DATABASE_URL"); databaseURL != "" {
|
||||
parsed, err := parseDatabaseURL(databaseURL)
|
||||
if err != nil {
|
||||
maskedURL := MaskURLCredentials(databaseURL)
|
||||
fmt.Printf("WARNING: Failed to parse DATABASE_URL (%s): %v — falling back to individual DB_* env vars\n", maskedURL, err)
|
||||
} else {
|
||||
dbConfig.Host = parsed.Host
|
||||
dbConfig.Port = parsed.Port
|
||||
dbConfig.User = parsed.User
|
||||
dbConfig.Password = parsed.Password
|
||||
dbConfig.Database = parsed.Database
|
||||
if parsed.SSLMode != "" {
|
||||
dbConfig.SSLMode = parsed.SSLMode
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cfg = &Config{
|
||||
Server: ServerConfig{
|
||||
Port: viper.GetInt("PORT"),
|
||||
Debug: viper.GetBool("DEBUG"),
|
||||
DebugFixedCodes: viper.GetBool("DEBUG_FIXED_CODES"),
|
||||
AllowedHosts: strings.Split(viper.GetString("ALLOWED_HOSTS"), ","),
|
||||
CorsAllowedOrigins: parseCorsOrigins(viper.GetString("CORS_ALLOWED_ORIGINS")),
|
||||
Timezone: viper.GetString("TIMEZONE"),
|
||||
StaticDir: viper.GetString("STATIC_DIR"),
|
||||
BaseURL: viper.GetString("BASE_URL"),
|
||||
},
|
||||
Database: dbConfig,
|
||||
Redis: RedisConfig{
|
||||
URL: viper.GetString("REDIS_URL"),
|
||||
Password: viper.GetString("REDIS_PASSWORD"),
|
||||
DB: viper.GetInt("REDIS_DB"),
|
||||
},
|
||||
Email: EmailConfig{
|
||||
Host: viper.GetString("EMAIL_HOST"),
|
||||
Port: viper.GetInt("EMAIL_PORT"),
|
||||
User: viper.GetString("EMAIL_HOST_USER"),
|
||||
Password: viper.GetString("EMAIL_HOST_PASSWORD"),
|
||||
From: viper.GetString("DEFAULT_FROM_EMAIL"),
|
||||
UseTLS: viper.GetBool("EMAIL_USE_TLS"),
|
||||
},
|
||||
Push: PushConfig{
|
||||
APNSKeyPath: viper.GetString("APNS_AUTH_KEY_PATH"),
|
||||
APNSKeyID: viper.GetString("APNS_AUTH_KEY_ID"),
|
||||
APNSTeamID: viper.GetString("APNS_TEAM_ID"),
|
||||
APNSTopic: viper.GetString("APNS_TOPIC"),
|
||||
APNSSandbox: viper.GetBool("APNS_USE_SANDBOX"),
|
||||
APNSProduction: viper.GetBool("APNS_PRODUCTION"),
|
||||
FCMProjectID: viper.GetString("FCM_PROJECT_ID"),
|
||||
FCMServiceAccountPath: viper.GetString("FCM_SERVICE_ACCOUNT_PATH"),
|
||||
FCMServiceAccountJSON: viper.GetString("FCM_SERVICE_ACCOUNT_JSON"),
|
||||
FCMServerKey: viper.GetString("FCM_SERVER_KEY"),
|
||||
},
|
||||
Worker: WorkerConfig{
|
||||
TaskReminderHour: viper.GetInt("TASK_REMINDER_HOUR"),
|
||||
OverdueReminderHour: viper.GetInt("OVERDUE_REMINDER_HOUR"),
|
||||
DailyNotifHour: viper.GetInt("DAILY_DIGEST_HOUR"),
|
||||
},
|
||||
Security: SecurityConfig{
|
||||
SecretKey: viper.GetString("SECRET_KEY"),
|
||||
TokenCacheTTL: 5 * time.Minute,
|
||||
PasswordResetExpiry: 15 * time.Minute,
|
||||
ConfirmationExpiry: 24 * time.Hour,
|
||||
MaxPasswordResetRate: 3,
|
||||
TokenExpiryDays: viper.GetInt("TOKEN_EXPIRY_DAYS"),
|
||||
TokenRefreshDays: viper.GetInt("TOKEN_REFRESH_DAYS"),
|
||||
},
|
||||
Storage: StorageConfig{
|
||||
UploadDir: viper.GetString("STORAGE_UPLOAD_DIR"),
|
||||
BaseURL: viper.GetString("STORAGE_BASE_URL"),
|
||||
S3Endpoint: viper.GetString("B2_ENDPOINT"),
|
||||
S3KeyID: viper.GetString("B2_KEY_ID"),
|
||||
S3AppKey: viper.GetString("B2_APP_KEY"),
|
||||
S3Bucket: viper.GetString("B2_BUCKET_NAME"),
|
||||
S3UseSSL: viper.GetString("STORAGE_USE_SSL") == "" || viper.GetBool("STORAGE_USE_SSL"),
|
||||
S3Region: viper.GetString("B2_REGION"),
|
||||
MaxFileSize: viper.GetInt64("STORAGE_MAX_FILE_SIZE"),
|
||||
AllowedTypes: viper.GetString("STORAGE_ALLOWED_TYPES"),
|
||||
EncryptionKey: viper.GetString("STORAGE_ENCRYPTION_KEY"),
|
||||
},
|
||||
AppleAuth: AppleAuthConfig{
|
||||
ClientID: viper.GetString("APPLE_CLIENT_ID"),
|
||||
TeamID: viper.GetString("APPLE_TEAM_ID"),
|
||||
},
|
||||
GoogleAuth: GoogleAuthConfig{
|
||||
ClientID: viper.GetString("GOOGLE_CLIENT_ID"),
|
||||
AndroidClientID: viper.GetString("GOOGLE_ANDROID_CLIENT_ID"),
|
||||
IOSClientID: viper.GetString("GOOGLE_IOS_CLIENT_ID"),
|
||||
},
|
||||
AppleIAP: AppleIAPConfig{
|
||||
KeyPath: viper.GetString("APPLE_IAP_KEY_PATH"),
|
||||
KeyID: viper.GetString("APPLE_IAP_KEY_ID"),
|
||||
IssuerID: viper.GetString("APPLE_IAP_ISSUER_ID"),
|
||||
BundleID: viper.GetString("APPLE_IAP_BUNDLE_ID"),
|
||||
Sandbox: viper.GetBool("APPLE_IAP_SANDBOX"),
|
||||
},
|
||||
GoogleIAP: GoogleIAPConfig{
|
||||
ServiceAccountPath: viper.GetString("GOOGLE_IAP_SERVICE_ACCOUNT_PATH"),
|
||||
PackageName: viper.GetString("GOOGLE_IAP_PACKAGE_NAME"),
|
||||
},
|
||||
Stripe: StripeConfig{
|
||||
SecretKey: viper.GetString("STRIPE_SECRET_KEY"),
|
||||
WebhookSecret: viper.GetString("STRIPE_WEBHOOK_SECRET"),
|
||||
PriceMonthly: viper.GetString("STRIPE_PRICE_MONTHLY"),
|
||||
PriceYearly: viper.GetString("STRIPE_PRICE_YEARLY"),
|
||||
},
|
||||
Features: FeatureFlags{
|
||||
PushEnabled: viper.GetBool("FEATURE_PUSH_ENABLED"),
|
||||
EmailEnabled: viper.GetBool("FEATURE_EMAIL_ENABLED"),
|
||||
WebhooksEnabled: viper.GetBool("FEATURE_WEBHOOKS_ENABLED"),
|
||||
OnboardingEmailsEnabled: viper.GetBool("FEATURE_ONBOARDING_EMAILS_ENABLED"),
|
||||
PDFReportsEnabled: viper.GetBool("FEATURE_PDF_REPORTS_ENABLED"),
|
||||
WorkerEnabled: viper.GetBool("FEATURE_WORKER_ENABLED"),
|
||||
},
|
||||
}
|
||||
|
||||
// Validate required fields
|
||||
if err := validate(cfg); err != nil {
|
||||
loadErr = err
|
||||
// Reset so a subsequent call can retry after env is fixed
|
||||
cfg = nil
|
||||
cfgOnce = sync.Once{}
|
||||
}
|
||||
})
|
||||
|
||||
if loadErr != nil {
|
||||
return nil, loadErr
|
||||
cfgMu.Lock()
|
||||
defer cfgMu.Unlock()
|
||||
if cfg != nil {
|
||||
return cfg, nil
|
||||
}
|
||||
|
||||
viper.SetEnvPrefix("")
|
||||
viper.AutomaticEnv()
|
||||
viper.SetEnvKeyReplacer(strings.NewReplacer(".", "_"))
|
||||
|
||||
// Set defaults
|
||||
setDefaults()
|
||||
|
||||
// Audit F8: overlay file-mounted secrets onto Viper. No-op when the
|
||||
// directory is absent (local/dev), so this is safe to ship before the
|
||||
// manifests mount honeydue-secrets as a volume.
|
||||
loadFileSecrets()
|
||||
|
||||
// Parse DATABASE_URL if set (Dokku-style)
|
||||
dbConfig := DatabaseConfig{
|
||||
Host: viper.GetString("DB_HOST"),
|
||||
Port: viper.GetInt("DB_PORT"),
|
||||
User: viper.GetString("POSTGRES_USER"),
|
||||
Password: viper.GetString("POSTGRES_PASSWORD"),
|
||||
Database: viper.GetString("POSTGRES_DB"),
|
||||
SSLMode: viper.GetString("DB_SSLMODE"),
|
||||
MaxOpenConns: viper.GetInt("DB_MAX_OPEN_CONNS"),
|
||||
MaxIdleConns: viper.GetInt("DB_MAX_IDLE_CONNS"),
|
||||
MaxLifetime: viper.GetDuration("DB_MAX_LIFETIME"),
|
||||
MaxIdleTime: viper.GetDuration("DB_MAX_IDLE_TIME"),
|
||||
}
|
||||
|
||||
// Override with DATABASE_URL if present (F-16: log warning on parse failure)
|
||||
if databaseURL := viper.GetString("DATABASE_URL"); databaseURL != "" {
|
||||
parsed, err := parseDatabaseURL(databaseURL)
|
||||
if err != nil {
|
||||
maskedURL := MaskURLCredentials(databaseURL)
|
||||
fmt.Printf("WARNING: Failed to parse DATABASE_URL (%s): %v — falling back to individual DB_* env vars\n", maskedURL, err)
|
||||
} else {
|
||||
dbConfig.Host = parsed.Host
|
||||
dbConfig.Port = parsed.Port
|
||||
dbConfig.User = parsed.User
|
||||
dbConfig.Password = parsed.Password
|
||||
dbConfig.Database = parsed.Database
|
||||
if parsed.SSLMode != "" {
|
||||
dbConfig.SSLMode = parsed.SSLMode
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
c := &Config{
|
||||
Server: ServerConfig{
|
||||
Port: viper.GetInt("PORT"),
|
||||
Debug: viper.GetBool("DEBUG"),
|
||||
DebugFixedCodes: viper.GetBool("DEBUG_FIXED_CODES"),
|
||||
AllowedHosts: strings.Split(viper.GetString("ALLOWED_HOSTS"), ","),
|
||||
CorsAllowedOrigins: parseCorsOrigins(viper.GetString("CORS_ALLOWED_ORIGINS")),
|
||||
Timezone: viper.GetString("TIMEZONE"),
|
||||
StaticDir: viper.GetString("STATIC_DIR"),
|
||||
BaseURL: viper.GetString("BASE_URL"),
|
||||
},
|
||||
Database: dbConfig,
|
||||
Redis: RedisConfig{
|
||||
URL: viper.GetString("REDIS_URL"),
|
||||
Password: viper.GetString("REDIS_PASSWORD"),
|
||||
DB: viper.GetInt("REDIS_DB"),
|
||||
},
|
||||
Email: EmailConfig{
|
||||
Host: viper.GetString("EMAIL_HOST"),
|
||||
Port: viper.GetInt("EMAIL_PORT"),
|
||||
User: viper.GetString("EMAIL_HOST_USER"),
|
||||
Password: viper.GetString("EMAIL_HOST_PASSWORD"),
|
||||
From: viper.GetString("DEFAULT_FROM_EMAIL"),
|
||||
UseTLS: viper.GetBool("EMAIL_USE_TLS"),
|
||||
},
|
||||
Push: PushConfig{
|
||||
APNSKeyPath: viper.GetString("APNS_AUTH_KEY_PATH"),
|
||||
APNSKeyID: viper.GetString("APNS_AUTH_KEY_ID"),
|
||||
APNSTeamID: viper.GetString("APNS_TEAM_ID"),
|
||||
APNSTopic: viper.GetString("APNS_TOPIC"),
|
||||
APNSSandbox: viper.GetBool("APNS_USE_SANDBOX"),
|
||||
APNSProduction: viper.GetBool("APNS_PRODUCTION"),
|
||||
FCMProjectID: viper.GetString("FCM_PROJECT_ID"),
|
||||
FCMServiceAccountPath: viper.GetString("FCM_SERVICE_ACCOUNT_PATH"),
|
||||
FCMServiceAccountJSON: viper.GetString("FCM_SERVICE_ACCOUNT_JSON"),
|
||||
FCMServerKey: viper.GetString("FCM_SERVER_KEY"),
|
||||
},
|
||||
Worker: WorkerConfig{
|
||||
TaskReminderHour: viper.GetInt("TASK_REMINDER_HOUR"),
|
||||
OverdueReminderHour: viper.GetInt("OVERDUE_REMINDER_HOUR"),
|
||||
DailyNotifHour: viper.GetInt("DAILY_DIGEST_HOUR"),
|
||||
},
|
||||
Security: SecurityConfig{
|
||||
SecretKey: viper.GetString("SECRET_KEY"),
|
||||
TokenCacheTTL: 5 * time.Minute,
|
||||
PasswordResetExpiry: 15 * time.Minute,
|
||||
ConfirmationExpiry: 24 * time.Hour,
|
||||
MaxPasswordResetRate: 3,
|
||||
TokenExpiryDays: viper.GetInt("TOKEN_EXPIRY_DAYS"),
|
||||
TokenRefreshDays: viper.GetInt("TOKEN_REFRESH_DAYS"),
|
||||
KratosPublicURL: viper.GetString("KRATOS_PUBLIC_URL"),
|
||||
KratosAdminURL: viper.GetString("KRATOS_ADMIN_URL"),
|
||||
},
|
||||
Storage: StorageConfig{
|
||||
UploadDir: viper.GetString("STORAGE_UPLOAD_DIR"),
|
||||
BaseURL: viper.GetString("STORAGE_BASE_URL"),
|
||||
S3Endpoint: viper.GetString("B2_ENDPOINT"),
|
||||
S3KeyID: viper.GetString("B2_KEY_ID"),
|
||||
S3AppKey: viper.GetString("B2_APP_KEY"),
|
||||
S3Bucket: viper.GetString("B2_BUCKET_NAME"),
|
||||
S3UseSSL: viper.GetString("STORAGE_USE_SSL") == "" || viper.GetBool("STORAGE_USE_SSL"),
|
||||
S3Region: viper.GetString("B2_REGION"),
|
||||
MaxFileSize: viper.GetInt64("STORAGE_MAX_FILE_SIZE"),
|
||||
AllowedTypes: viper.GetString("STORAGE_ALLOWED_TYPES"),
|
||||
EncryptionKey: viper.GetString("STORAGE_ENCRYPTION_KEY"),
|
||||
},
|
||||
AppleAuth: AppleAuthConfig{
|
||||
ClientID: viper.GetString("APPLE_CLIENT_ID"),
|
||||
TeamID: viper.GetString("APPLE_TEAM_ID"),
|
||||
},
|
||||
GoogleAuth: GoogleAuthConfig{
|
||||
ClientID: viper.GetString("GOOGLE_CLIENT_ID"),
|
||||
AndroidClientID: viper.GetString("GOOGLE_ANDROID_CLIENT_ID"),
|
||||
IOSClientID: viper.GetString("GOOGLE_IOS_CLIENT_ID"),
|
||||
},
|
||||
AppleIAP: AppleIAPConfig{
|
||||
KeyPath: viper.GetString("APPLE_IAP_KEY_PATH"),
|
||||
KeyID: viper.GetString("APPLE_IAP_KEY_ID"),
|
||||
IssuerID: viper.GetString("APPLE_IAP_ISSUER_ID"),
|
||||
BundleID: viper.GetString("APPLE_IAP_BUNDLE_ID"),
|
||||
Sandbox: viper.GetBool("APPLE_IAP_SANDBOX"),
|
||||
},
|
||||
GoogleIAP: GoogleIAPConfig{
|
||||
ServiceAccountPath: viper.GetString("GOOGLE_IAP_SERVICE_ACCOUNT_PATH"),
|
||||
PackageName: viper.GetString("GOOGLE_IAP_PACKAGE_NAME"),
|
||||
},
|
||||
Stripe: StripeConfig{
|
||||
SecretKey: viper.GetString("STRIPE_SECRET_KEY"),
|
||||
WebhookSecret: viper.GetString("STRIPE_WEBHOOK_SECRET"),
|
||||
PriceMonthly: viper.GetString("STRIPE_PRICE_MONTHLY"),
|
||||
PriceYearly: viper.GetString("STRIPE_PRICE_YEARLY"),
|
||||
},
|
||||
Features: FeatureFlags{
|
||||
PushEnabled: viper.GetBool("FEATURE_PUSH_ENABLED"),
|
||||
EmailEnabled: viper.GetBool("FEATURE_EMAIL_ENABLED"),
|
||||
WebhooksEnabled: viper.GetBool("FEATURE_WEBHOOKS_ENABLED"),
|
||||
OnboardingEmailsEnabled: viper.GetBool("FEATURE_ONBOARDING_EMAILS_ENABLED"),
|
||||
PDFReportsEnabled: viper.GetBool("FEATURE_PDF_REPORTS_ENABLED"),
|
||||
WorkerEnabled: viper.GetBool("FEATURE_WORKER_ENABLED"),
|
||||
},
|
||||
}
|
||||
|
||||
if err := validate(c); err != nil {
|
||||
// Leave cfg nil so the next Load() retries after env is corrected.
|
||||
return nil, err
|
||||
}
|
||||
cfg = c
|
||||
return cfg, nil
|
||||
}
|
||||
|
||||
@@ -399,6 +420,8 @@ func setDefaults() {
|
||||
|
||||
// Token expiry defaults
|
||||
viper.SetDefault("TOKEN_EXPIRY_DAYS", 90) // Tokens expire after 90 days
|
||||
viper.SetDefault("KRATOS_PUBLIC_URL", "http://kratos:4433") // Ory Kratos public API
|
||||
viper.SetDefault("KRATOS_ADMIN_URL", "http://kratos:4434") // Ory Kratos admin API
|
||||
viper.SetDefault("TOKEN_REFRESH_DAYS", 60) // Tokens can be refreshed after 60 days
|
||||
|
||||
// Storage defaults
|
||||
@@ -426,14 +449,67 @@ func isWeakSecretKey(key string) bool {
|
||||
return knownWeakSecretKeys[strings.ToLower(strings.TrimSpace(key))]
|
||||
}
|
||||
|
||||
// loadFileSecrets overlays file-mounted secrets onto Viper (audit F8). When
|
||||
// the honeydue-secrets Secret is mounted as a volume at /etc/honeydue/secrets
|
||||
// each key is a file; reading the value here and viper.Set-ing it (highest
|
||||
// Viper precedence) keeps the secret out of the process environment
|
||||
// (/proc/<pid>/environ), which plain env-var injection cannot. When the
|
||||
// directory is absent it is a silent no-op and env vars are used as before.
|
||||
func loadFileSecrets() {
|
||||
dir := os.Getenv("HONEYDUE_SECRETS_DIR")
|
||||
if dir == "" {
|
||||
dir = "/etc/honeydue/secrets"
|
||||
}
|
||||
for _, k := range []string{
|
||||
"POSTGRES_PASSWORD", "SECRET_KEY", "EMAIL_HOST_PASSWORD", "FCM_SERVER_KEY",
|
||||
"REDIS_PASSWORD", "B2_KEY_ID", "B2_APP_KEY", "OBS_INGEST_TOKEN", "OBS_TRACES_URL",
|
||||
} {
|
||||
b, err := os.ReadFile(dir + "/" + k)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if v := strings.TrimSpace(string(b)); v != "" {
|
||||
viper.Set(k, v)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// SecretValue resolves a configuration value that is not part of the typed
|
||||
// Config struct. It reads through Viper, so a value supplied via a file-mounted
|
||||
// secret (audit F8, loaded by loadFileSecrets) is found just like an env var.
|
||||
//
|
||||
// Must be called after Load(). Used by cmd/api and cmd/worker for the
|
||||
// observability endpoints, which are needed before the full Config is wired
|
||||
// and would otherwise be read with os.Getenv — which misses file-mounted
|
||||
// secrets entirely once F8 removes them from the process environment.
|
||||
func SecretValue(key string) string {
|
||||
return viper.GetString(key)
|
||||
}
|
||||
|
||||
// randomHexKey reads n bytes from the cryptographically secure random source
// and returns them hex-encoded, i.e. a string of 2n lowercase hex characters.
// If the random source fails, the empty string and that error are returned.
func randomHexKey(n int) (string, error) {
	raw := make([]byte, n)
	_, err := rand.Read(raw)
	if err != nil {
		return "", err
	}

	// Encode into a buffer sized up front for the 2:1 hex expansion.
	encoded := make([]byte, hex.EncodedLen(len(raw)))
	hex.Encode(encoded, raw)
	return string(encoded), nil
}
|
||||
|
||||
func validate(cfg *Config) error {
|
||||
// S-08: Validate SECRET_KEY against known weak defaults
|
||||
// M8: SECRET_KEY validation — no static fallback secret in the binary.
|
||||
if cfg.Security.SecretKey == "" {
|
||||
if cfg.Server.Debug {
|
||||
// In debug mode, use a default key with a warning for local development
|
||||
cfg.Security.SecretKey = "change-me-in-production-secret-key-12345"
|
||||
fmt.Println("WARNING: SECRET_KEY not set, using default (debug mode only)")
|
||||
fmt.Println("WARNING: *** DO NOT USE THIS DEFAULT KEY IN PRODUCTION ***")
|
||||
// Debug only: generate a random key per boot. Tokens signed with
|
||||
// it do not survive a restart, which is acceptable for local dev
|
||||
// and far safer than a well-known hardcoded fallback.
|
||||
randomKey, err := randomHexKey(32)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to generate ephemeral debug SECRET_KEY: %w", err)
|
||||
}
|
||||
cfg.Security.SecretKey = randomKey
|
||||
fmt.Println("WARNING: SECRET_KEY not set, generated an ephemeral random key (debug mode only)")
|
||||
fmt.Println("WARNING: tokens will not survive a restart — set SECRET_KEY for stable local sessions")
|
||||
} else {
|
||||
// In production, refuse to start without a proper secret key
|
||||
return fmt.Errorf("FATAL: SECRET_KEY environment variable is required in production (DEBUG=false)")
|
||||
@@ -446,6 +522,12 @@ func validate(cfg *Config) error {
|
||||
}
|
||||
}
|
||||
|
||||
// C4: fixed confirmation codes ("123456") must never be enabled outside
|
||||
// debug — with DEBUG=false they are a full authentication bypass.
|
||||
if cfg.Server.DebugFixedCodes && !cfg.Server.Debug {
|
||||
return fmt.Errorf("FATAL: DEBUG_FIXED_CODES is enabled with DEBUG=false — fixed confirmation codes must never run in production")
|
||||
}
|
||||
|
||||
// Database password might come from DATABASE_URL, don't require it separately
|
||||
// The actual connection will fail if credentials are wrong
|
||||
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"testing"
|
||||
|
||||
"github.com/spf13/viper"
|
||||
@@ -11,8 +10,9 @@ import (
|
||||
|
||||
// resetConfigState resets the package-level singleton so each test starts fresh.
|
||||
func resetConfigState() {
|
||||
cfgMu.Lock()
|
||||
cfg = nil
|
||||
cfgOnce = sync.Once{}
|
||||
cfgMu.Unlock()
|
||||
viper.Reset()
|
||||
}
|
||||
|
||||
@@ -106,8 +106,10 @@ func TestLoad_Validation_MissingSecretKey_DebugMode(t *testing.T) {
|
||||
|
||||
c, err := Load()
|
||||
require.NoError(t, err)
|
||||
// In debug mode, a default key is assigned
|
||||
assert.Equal(t, "change-me-in-production-secret-key-12345", c.Security.SecretKey)
|
||||
// Audit M8: in debug mode an ephemeral random key is generated per boot
|
||||
// (no static fallback). It must be a non-empty 64-char hex string.
|
||||
assert.Len(t, c.Security.SecretKey, 64)
|
||||
assert.NotEqual(t, "change-me-in-production-secret-key-12345", c.Security.SecretKey)
|
||||
}
|
||||
|
||||
func TestLoad_Validation_WeakSecretKey_Production(t *testing.T) {
|
||||
@@ -133,6 +135,33 @@ func TestLoad_Validation_WeakSecretKey_DebugMode(t *testing.T) {
|
||||
assert.Equal(t, "secret", c.Security.SecretKey)
|
||||
}
|
||||
|
||||
// Audit C4: DEBUG_FIXED_CODES makes confirmation codes a fixed "123456" — a
|
||||
// full authentication bypass. With DEBUG=false, validate() must refuse to boot
|
||||
// rather than ship that bypass to production.
|
||||
func TestLoad_Validation_DebugFixedCodes_Production(t *testing.T) {
|
||||
// validate() directly — avoids the sync.Once issue Load() has on failure.
|
||||
cfg := &Config{
|
||||
Server: ServerConfig{Debug: false, DebugFixedCodes: true},
|
||||
Security: SecurityConfig{SecretKey: "a-strong-secret-key-for-tests"},
|
||||
}
|
||||
|
||||
err := validate(cfg)
|
||||
require.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "DEBUG_FIXED_CODES")
|
||||
}
|
||||
|
||||
// With DEBUG=true the fixed codes are an intended local-dev convenience, so
|
||||
// the same combination must NOT error.
|
||||
func TestLoad_Validation_DebugFixedCodes_DebugMode(t *testing.T) {
|
||||
cfg := &Config{
|
||||
Server: ServerConfig{Debug: true, DebugFixedCodes: true},
|
||||
Security: SecurityConfig{SecretKey: "a-strong-secret-key-for-tests"},
|
||||
}
|
||||
|
||||
err := validate(cfg)
|
||||
require.NoError(t, err)
|
||||
}
|
||||
|
||||
func TestLoad_Validation_EncryptionKey_Valid(t *testing.T) {
|
||||
resetConfigState()
|
||||
t.Setenv("SECRET_KEY", "a-strong-secret-key-for-tests")
|
||||
|
||||
@@ -14,12 +14,10 @@ import (
|
||||
|
||||
"github.com/treytartt/honeydue-api/internal/config"
|
||||
"github.com/treytartt/honeydue-api/internal/models"
|
||||
)
|
||||
"github.com/treytartt/honeydue-api/internal/prom"
|
||||
|
||||
// migrationAdvisoryLockKey is the pg_advisory_lock key that serializes
|
||||
// Migrate() across API replicas booting in parallel. Value is arbitrary but
|
||||
// stable ("hdmg" as bytes = honeydue migration).
|
||||
const migrationAdvisoryLockKey int64 = 0x68646d67
|
||||
"github.com/uptrace/opentelemetry-go-extra/otelgorm"
|
||||
)
|
||||
|
||||
// zerologGormWriter adapts zerolog for GORM's logger interface
|
||||
type zerologGormWriter struct{}
|
||||
@@ -68,25 +66,84 @@ func Connect(cfg *config.DatabaseConfig, debug bool) (*gorm.DB, error) {
|
||||
return nil, fmt.Errorf("failed to get underlying sql.DB: %w", err)
|
||||
}
|
||||
|
||||
// Configure connection pool
|
||||
// Configure connection pool. The Neon pooler endpoint keeps backend
|
||||
// connections warm, so we keep our client-side pool warm too — that
|
||||
// eliminates the ~440ms TCP+TLS+startup handshake on the first query
|
||||
// after a cold pod / idle period.
|
||||
sqlDB.SetMaxOpenConns(cfg.MaxOpenConns)
|
||||
sqlDB.SetMaxIdleConns(cfg.MaxIdleConns)
|
||||
sqlDB.SetConnMaxLifetime(cfg.MaxLifetime)
|
||||
if cfg.MaxIdleTime > 0 {
|
||||
sqlDB.SetConnMaxIdleTime(cfg.MaxIdleTime)
|
||||
}
|
||||
// MaxIdleTime=0 means "never close idle" — the pool fills up to
|
||||
// MaxIdleConns and they stay alive until MaxLifetime expires.
|
||||
|
||||
// Test connection
|
||||
if err := sqlDB.Ping(); err != nil {
|
||||
return nil, fmt.Errorf("failed to ping database: %w", err)
|
||||
}
|
||||
|
||||
// Eagerly warm the connection pool to MaxIdleConns. Without this, the
|
||||
// first N user requests each pay the full handshake (~440ms over a
|
||||
// transatlantic link). Pings are issued in parallel so warm-up is
|
||||
// bounded by handshake time, not handshake-time × N.
|
||||
warmUpPool(sqlDB, cfg.MaxIdleConns)
|
||||
|
||||
log.Info().
|
||||
Str("host", cfg.Host).
|
||||
Int("port", cfg.Port).
|
||||
Str("database", cfg.Database).
|
||||
Msg("Connected to PostgreSQL database")
|
||||
|
||||
// Register Prometheus GORM callbacks — emits gorm_query_duration_seconds
|
||||
// for every SQL operation. Operates at the statement level, so does not
|
||||
// require ctx to be threaded through repositories.
|
||||
if err := prom.RegisterGORMCallbacks(db); err != nil {
|
||||
log.Warn().Err(err).Msg("failed to register prometheus GORM callbacks; metrics will be partial")
|
||||
}
|
||||
|
||||
// Register otelgorm plugin — emits a span per SQL statement, attached to
|
||||
// whatever trace context is set via db.WithContext(ctx). Repositories that
|
||||
// have been migrated to use WithContext (see internal/repositories/*.go)
|
||||
// will produce nested SQL spans inside the request trace; pre-migration
|
||||
// repositories silently emit untraced queries.
|
||||
if err := db.Use(otelgorm.NewPlugin(otelgorm.WithDBName(cfg.Database))); err != nil {
|
||||
log.Warn().Err(err).Msg("failed to register otelgorm plugin; SQL spans disabled")
|
||||
}
|
||||
|
||||
return db, nil
|
||||
}
|
||||
|
||||
// warmUpPool issues N parallel pings so the pool fills with established
|
||||
// connections before the first user request lands. Failures are logged but
|
||||
// not fatal — the pool will fill on demand under traffic if pre-warm fails.
|
||||
//
|
||||
// On a transatlantic link to Neon (~110ms RTT, ~440ms cold handshake), this
|
||||
// turns "first request pays the cold handshake" into "first request finds a
|
||||
// warm pool" — at the cost of ~440ms during pod startup.
|
||||
func warmUpPool(sqlDB interface {
|
||||
PingContext(context.Context) error
|
||||
}, n int) {
|
||||
if n <= 0 {
|
||||
return
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
|
||||
done := make(chan error, n)
|
||||
for i := 0; i < n; i++ {
|
||||
go func() { done <- sqlDB.PingContext(ctx) }()
|
||||
}
|
||||
successes := 0
|
||||
for i := 0; i < n; i++ {
|
||||
if err := <-done; err == nil {
|
||||
successes++
|
||||
}
|
||||
}
|
||||
log.Info().Int("requested", n).Int("warmed", successes).Msg("DB pool warm-up complete")
|
||||
}
|
||||
|
||||
// Get returns the database instance
|
||||
func Get() *gorm.DB {
|
||||
return db
|
||||
@@ -127,52 +184,46 @@ func Paginate(page, pageSize int) func(db *gorm.DB) *gorm.DB {
|
||||
}
|
||||
}
|
||||
|
||||
// MigrateWithLock runs Migrate() under a Postgres session-level advisory lock
|
||||
// so that multiple API replicas booting in parallel don't race on AutoMigrate.
|
||||
// On non-Postgres dialects (sqlite in tests) it falls through to Migrate().
|
||||
func MigrateWithLock() error {
|
||||
// RequireSchemaApplied verifies that goose's version table exists and has
|
||||
// at least one applied entry. This is the fail-fast that runs at api/worker
|
||||
// boot: if the operator forgot to run the migrate Job, the pod refuses to
|
||||
// start with a clear error instead of throwing mysterious "relation does
|
||||
// not exist" errors deep in a request handler.
|
||||
//
|
||||
// On non-Postgres dialects (sqlite in tests) this is a no-op — tests use
|
||||
// AutoMigrate via testutil.SetupTestDB to create a fresh schema per run.
|
||||
// goose isn't involved in the test path.
|
||||
func RequireSchemaApplied() error {
|
||||
if db == nil {
|
||||
return fmt.Errorf("database not initialised")
|
||||
}
|
||||
if db.Dialector.Name() != "postgres" {
|
||||
return Migrate()
|
||||
return nil
|
||||
}
|
||||
|
||||
sqlDB, err := db.DB()
|
||||
// goose_db_version stores one row per applied migration, not a single
|
||||
// "current version" row — so we look for the highest version_id with
|
||||
// is_applied=true. ORDER BY id DESC LIMIT 1 also catches the case where
|
||||
// the table exists but is empty (no rows returned, scan leaves Version
|
||||
// at zero).
|
||||
type migrationRow struct {
|
||||
VersionID int64 `gorm:"column:version_id"`
|
||||
IsApplied bool `gorm:"column:is_applied"`
|
||||
}
|
||||
|
||||
var row migrationRow
|
||||
err := db.Raw(`SELECT version_id, is_applied FROM goose_db_version ORDER BY id DESC LIMIT 1`).Scan(&row).Error
|
||||
if err != nil {
|
||||
return fmt.Errorf("get underlying sql.DB: %w", err)
|
||||
return fmt.Errorf("goose_db_version check failed (run the migrate Job to bootstrap): %w", err)
|
||||
}
|
||||
|
||||
// Give ourselves up to 5 min to acquire the lock — long enough for a
|
||||
// slow migration on a peer replica, short enough to fail fast if Postgres
|
||||
// is hung.
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
conn, err := sqlDB.Conn(ctx)
|
||||
if err != nil {
|
||||
return fmt.Errorf("acquire dedicated migration connection: %w", err)
|
||||
if !row.IsApplied {
|
||||
return fmt.Errorf("goose_db_version latest row is_applied=false at version=%d — last migration was rolled back or aborted; investigate before starting", row.VersionID)
|
||||
}
|
||||
defer conn.Close()
|
||||
|
||||
log.Info().Int64("lock_key", migrationAdvisoryLockKey).Msg("Acquiring migration advisory lock...")
|
||||
if _, err := conn.ExecContext(ctx, "SELECT pg_advisory_lock($1)", migrationAdvisoryLockKey); err != nil {
|
||||
return fmt.Errorf("pg_advisory_lock: %w", err)
|
||||
if row.VersionID < 1 {
|
||||
return fmt.Errorf("goose_db_version is empty — run goose up (or seed a row marking version 1 as applied if the schema already exists)")
|
||||
}
|
||||
log.Info().Msg("Migration advisory lock acquired")
|
||||
|
||||
defer func() {
|
||||
// Unlock with a fresh context — the outer ctx may have expired.
|
||||
unlockCtx, unlockCancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer unlockCancel()
|
||||
if _, err := conn.ExecContext(unlockCtx, "SELECT pg_advisory_unlock($1)", migrationAdvisoryLockKey); err != nil {
|
||||
log.Warn().Err(err).Msg("Failed to release migration advisory lock (session close will also release)")
|
||||
} else {
|
||||
log.Info().Msg("Migration advisory lock released")
|
||||
}
|
||||
}()
|
||||
|
||||
return Migrate()
|
||||
log.Info().Int64("schema_version", row.VersionID).Msg("Schema precondition satisfied")
|
||||
return nil
|
||||
}
|
||||
|
||||
// Migrate runs database migrations for all models
|
||||
@@ -193,12 +244,7 @@ func Migrate() error {
|
||||
|
||||
// User and auth tables
|
||||
&models.User{},
|
||||
&models.AuthToken{},
|
||||
&models.UserProfile{},
|
||||
&models.ConfirmationCode{},
|
||||
&models.PasswordResetCode{},
|
||||
&models.AppleSocialAuth{},
|
||||
&models.GoogleSocialAuth{},
|
||||
|
||||
// Admin users (separate from app users)
|
||||
&models.AdminUser{},
|
||||
|
||||
@@ -25,7 +25,12 @@ type CreateDocumentRequest struct {
|
||||
SerialNumber string `json:"serial_number" validate:"max=100"`
|
||||
ModelNumber string `json:"model_number" validate:"max=100"`
|
||||
TaskID *uint `json:"task_id"`
|
||||
ImageURLs []string `json:"image_urls" validate:"omitempty,max=20,dive,max=500"` // Multiple image URLs
|
||||
// UploadIDs claims pending_uploads rows produced by the presigned-URL
|
||||
// upload flow and turns them into document_image rows. UploadIDs of
|
||||
// category "document_file" attach to the document's main FileURL +
|
||||
// FileName fields instead — the service infers placement from the
|
||||
// row's category.
|
||||
UploadIDs []uint `json:"upload_ids" validate:"omitempty,max=20"`
|
||||
}
|
||||
|
||||
// UpdateDocumentRequest represents the request to update a document
|
||||
|
||||
@@ -100,14 +100,20 @@ type UpdateTaskRequest struct {
|
||||
ContractorID *uint `json:"contractor_id"`
|
||||
}
|
||||
|
||||
// CreateTaskCompletionRequest represents the request to create a task completion
|
||||
// CreateTaskCompletionRequest represents the request to create a task completion.
|
||||
//
|
||||
// Image attachments arrive via the presigned-URL flow: the client uploads
|
||||
// each image directly to B2 (see /api/uploads/presign) and passes the
|
||||
// resulting pending_uploads.id values in UploadIDs. The service claims
|
||||
// those rows and creates the linked task_completion_image rows.
|
||||
type CreateTaskCompletionRequest struct {
|
||||
TaskID uint `json:"task_id" validate:"required"`
|
||||
CompletedAt *time.Time `json:"completed_at"` // Defaults to now
|
||||
Notes string `json:"notes" validate:"max=10000"`
|
||||
ActualCost *decimal.Decimal `json:"actual_cost"`
|
||||
Rating *int `json:"rating" validate:"omitempty,min=1,max=5"` // 1-5 star rating
|
||||
ImageURLs []string `json:"image_urls" validate:"omitempty,max=20,dive,max=500"` // Multiple image URLs
|
||||
|
||||
UploadIDs []uint `json:"upload_ids" validate:"omitempty,max=20"`
|
||||
}
|
||||
|
||||
// UpdateTaskCompletionRequest represents the request to update a task completion
|
||||
@@ -115,7 +121,6 @@ type UpdateTaskCompletionRequest struct {
|
||||
Notes *string `json:"notes" validate:"omitempty,max=10000"`
|
||||
ActualCost *decimal.Decimal `json:"actual_cost"`
|
||||
Rating *int `json:"rating" validate:"omitempty,min=1,max=5"`
|
||||
ImageURLs []string `json:"image_urls" validate:"omitempty,max=20,dive,max=500"`
|
||||
}
|
||||
|
||||
// CompletionImageInput represents an image to add to a completion
|
||||
|
||||
@@ -0,0 +1,22 @@
|
||||
package requests
|
||||
|
||||
// PresignUploadRequest is the JSON body accepted by POST /api/uploads/presign.
// The client announces what it is about to upload; the server checks that
// against quota, rate limits, and per-category caps before handing back the
// signed upload details.
type PresignUploadRequest struct {
	// Category selects the allowed MIME types and the size cap. Must be
	// one of:
	//   "completion"     — task completion photos
	//   "document_image" — image attached to a Document
	//   "document_file"  — file (e.g. PDF) attached to a Document
	Category string `json:"category" validate:"required,oneof=completion document_image document_file"`

	// ContentType is the MIME type of the upcoming upload (e.g. image/jpeg).
	// It is bound into the signature, so the real upload has to use exactly
	// this value.
	ContentType string `json:"content_type" validate:"required,min=3,max=127"`

	// ContentLength is the number of bytes the client intends to upload.
	// It is required and must be at least 1.
	// NOTE(review): this field was documented as allowing a small
	// server-side slack window around the value, while PresignUploadResponse
	// describes Content-Length as a signed header that must match — confirm
	// which applies.
	ContentLength int64 `json:"content_length" validate:"required,min=1"`
}
|
||||
@@ -8,8 +8,11 @@ import (
|
||||
|
||||
// ContractorSpecialtyResponse represents a contractor specialty
|
||||
type ContractorSpecialtyResponse struct {
|
||||
ID uint `json:"id"`
|
||||
Name string `json:"name"`
|
||||
ID uint `json:"id"`
|
||||
// Name is the stable English identifier (clients match on this).
|
||||
Name string `json:"name"`
|
||||
// DisplayName is the localized label for the request's Accept-Language.
|
||||
DisplayName string `json:"display_name"`
|
||||
Description string `json:"description"`
|
||||
Icon string `json:"icon"`
|
||||
DisplayOrder int `json:"display_order"`
|
||||
|
||||
@@ -10,8 +10,11 @@ import (
|
||||
|
||||
// ResidenceTypeResponse represents a residence type in the API response
|
||||
type ResidenceTypeResponse struct {
|
||||
ID uint `json:"id"`
|
||||
ID uint `json:"id"`
|
||||
// Name is the stable English identifier (clients match on this).
|
||||
Name string `json:"name"`
|
||||
// DisplayName is the localized label for the request's Accept-Language.
|
||||
DisplayName string `json:"display_name"`
|
||||
}
|
||||
|
||||
// ResidenceUserResponse represents a user with access to a residence
|
||||
|
||||
@@ -13,8 +13,11 @@ import (
|
||||
|
||||
// TaskCategoryResponse represents a task category
|
||||
type TaskCategoryResponse struct {
|
||||
ID uint `json:"id"`
|
||||
Name string `json:"name"`
|
||||
ID uint `json:"id"`
|
||||
// Name is the stable English identifier (clients match on this).
|
||||
Name string `json:"name"`
|
||||
// DisplayName is the localized label for the request's Accept-Language.
|
||||
DisplayName string `json:"display_name"`
|
||||
Description string `json:"description"`
|
||||
Icon string `json:"icon"`
|
||||
Color string `json:"color"`
|
||||
@@ -25,6 +28,7 @@ type TaskCategoryResponse struct {
|
||||
type TaskPriorityResponse struct {
|
||||
ID uint `json:"id"`
|
||||
Name string `json:"name"`
|
||||
DisplayName string `json:"display_name"`
|
||||
Level int `json:"level"`
|
||||
Color string `json:"color"`
|
||||
DisplayOrder int `json:"display_order"`
|
||||
@@ -34,6 +38,7 @@ type TaskPriorityResponse struct {
|
||||
// TaskFrequencyResponse is the API representation of a task frequency.
type TaskFrequencyResponse struct {
	ID uint `json:"id"`
	// Name is presumably the stable English identifier, as documented on
	// the sibling lookup responses — confirm.
	Name string `json:"name"`
	// DisplayName is presumably the localized label, as documented on the
	// sibling lookup responses — confirm.
	DisplayName string `json:"display_name"`
	// Days is optional; a nil pointer is serialized as JSON null.
	Days         *int `json:"days"`
	DisplayOrder int  `json:"display_order"`
}
|
||||
@@ -71,35 +76,35 @@ type TaskCompletionResponse struct {
|
||||
|
||||
// TaskResponse represents a task in the API response
|
||||
type TaskResponse struct {
|
||||
ID uint `json:"id"`
|
||||
ResidenceID uint `json:"residence_id"`
|
||||
CreatedByID uint `json:"created_by_id"`
|
||||
CreatedBy *TaskUserResponse `json:"created_by,omitempty"`
|
||||
AssignedToID *uint `json:"assigned_to_id"`
|
||||
AssignedTo *TaskUserResponse `json:"assigned_to,omitempty"`
|
||||
Title string `json:"title"`
|
||||
Description string `json:"description"`
|
||||
CategoryID *uint `json:"category_id"`
|
||||
Category *TaskCategoryResponse `json:"category,omitempty"`
|
||||
PriorityID *uint `json:"priority_id"`
|
||||
Priority *TaskPriorityResponse `json:"priority,omitempty"`
|
||||
FrequencyID *uint `json:"frequency_id"`
|
||||
Frequency *TaskFrequencyResponse `json:"frequency,omitempty"`
|
||||
CustomIntervalDays *int `json:"custom_interval_days"` // For "Custom" frequency, user-specified days
|
||||
InProgress bool `json:"in_progress"`
|
||||
DueDate *time.Time `json:"due_date"`
|
||||
NextDueDate *time.Time `json:"next_due_date"` // For recurring tasks, updated after each completion
|
||||
EstimatedCost *decimal.Decimal `json:"estimated_cost"`
|
||||
ActualCost *decimal.Decimal `json:"actual_cost"`
|
||||
ContractorID *uint `json:"contractor_id"`
|
||||
IsCancelled bool `json:"is_cancelled"`
|
||||
IsArchived bool `json:"is_archived"`
|
||||
ParentTaskID *uint `json:"parent_task_id"`
|
||||
TemplateID *uint `json:"template_id,omitempty"` // Backlink to the TaskTemplate this task was created from
|
||||
CompletionCount int `json:"completion_count"`
|
||||
KanbanColumn string `json:"kanban_column,omitempty"` // Which kanban column this task belongs to
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
UpdatedAt time.Time `json:"updated_at"`
|
||||
ID uint `json:"id"`
|
||||
ResidenceID uint `json:"residence_id"`
|
||||
CreatedByID uint `json:"created_by_id"`
|
||||
CreatedBy *TaskUserResponse `json:"created_by,omitempty"`
|
||||
AssignedToID *uint `json:"assigned_to_id"`
|
||||
AssignedTo *TaskUserResponse `json:"assigned_to,omitempty"`
|
||||
Title string `json:"title"`
|
||||
Description string `json:"description"`
|
||||
CategoryID *uint `json:"category_id"`
|
||||
Category *TaskCategoryResponse `json:"category,omitempty"`
|
||||
PriorityID *uint `json:"priority_id"`
|
||||
Priority *TaskPriorityResponse `json:"priority,omitempty"`
|
||||
FrequencyID *uint `json:"frequency_id"`
|
||||
Frequency *TaskFrequencyResponse `json:"frequency,omitempty"`
|
||||
CustomIntervalDays *int `json:"custom_interval_days"` // For "Custom" frequency, user-specified days
|
||||
InProgress bool `json:"in_progress"`
|
||||
DueDate *time.Time `json:"due_date"`
|
||||
NextDueDate *time.Time `json:"next_due_date"` // For recurring tasks, updated after each completion
|
||||
EstimatedCost *decimal.Decimal `json:"estimated_cost"`
|
||||
ActualCost *decimal.Decimal `json:"actual_cost"`
|
||||
ContractorID *uint `json:"contractor_id"`
|
||||
IsCancelled bool `json:"is_cancelled"`
|
||||
IsArchived bool `json:"is_archived"`
|
||||
ParentTaskID *uint `json:"parent_task_id"`
|
||||
TemplateID *uint `json:"template_id,omitempty"` // Backlink to the TaskTemplate this task was created from
|
||||
CompletionCount int `json:"completion_count"`
|
||||
KanbanColumn string `json:"kanban_column,omitempty"` // Which kanban column this task belongs to
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
UpdatedAt time.Time `json:"updated_at"`
|
||||
}
|
||||
|
||||
// BulkCreateTasksResponse is returned by POST /api/tasks/bulk/.
|
||||
@@ -240,30 +245,30 @@ func NewTaskResponseWithTime(t *models.Task, daysThreshold int, now time.Time) T
|
||||
// newTaskResponseInternal is the internal implementation for creating task responses
|
||||
func newTaskResponseInternal(t *models.Task, daysThreshold int, now time.Time) TaskResponse {
|
||||
resp := TaskResponse{
|
||||
ID: t.ID,
|
||||
ResidenceID: t.ResidenceID,
|
||||
CreatedByID: t.CreatedByID,
|
||||
Title: t.Title,
|
||||
Description: t.Description,
|
||||
CategoryID: t.CategoryID,
|
||||
PriorityID: t.PriorityID,
|
||||
ID: t.ID,
|
||||
ResidenceID: t.ResidenceID,
|
||||
CreatedByID: t.CreatedByID,
|
||||
Title: t.Title,
|
||||
Description: t.Description,
|
||||
CategoryID: t.CategoryID,
|
||||
PriorityID: t.PriorityID,
|
||||
FrequencyID: t.FrequencyID,
|
||||
CustomIntervalDays: t.CustomIntervalDays,
|
||||
InProgress: t.InProgress,
|
||||
AssignedToID: t.AssignedToID,
|
||||
DueDate: t.DueDate,
|
||||
NextDueDate: t.NextDueDate,
|
||||
EstimatedCost: t.EstimatedCost,
|
||||
ActualCost: t.ActualCost,
|
||||
ContractorID: t.ContractorID,
|
||||
IsCancelled: t.IsCancelled,
|
||||
IsArchived: t.IsArchived,
|
||||
ParentTaskID: t.ParentTaskID,
|
||||
TemplateID: t.TaskTemplateID,
|
||||
CompletionCount: predicates.GetCompletionCount(t),
|
||||
KanbanColumn: DetermineKanbanColumnWithTime(t, daysThreshold, now),
|
||||
CreatedAt: t.CreatedAt,
|
||||
UpdatedAt: t.UpdatedAt,
|
||||
AssignedToID: t.AssignedToID,
|
||||
DueDate: t.DueDate,
|
||||
NextDueDate: t.NextDueDate,
|
||||
EstimatedCost: t.EstimatedCost,
|
||||
ActualCost: t.ActualCost,
|
||||
ContractorID: t.ContractorID,
|
||||
IsCancelled: t.IsCancelled,
|
||||
IsArchived: t.IsArchived,
|
||||
ParentTaskID: t.ParentTaskID,
|
||||
TemplateID: t.TaskTemplateID,
|
||||
CompletionCount: predicates.GetCompletionCount(t),
|
||||
KanbanColumn: DetermineKanbanColumnWithTime(t, daysThreshold, now),
|
||||
CreatedAt: t.CreatedAt,
|
||||
UpdatedAt: t.UpdatedAt,
|
||||
}
|
||||
|
||||
if t.CreatedBy.ID != 0 {
|
||||
|
||||
@@ -0,0 +1,38 @@
|
||||
package responses
|
||||
|
||||
// PresignUploadResponse is the payload /api/uploads/presign hands back.
//
// Usage: the client issues a single PUT to URL, sending the raw object bytes
// as the body and Headers as the request headers exactly as given (they are
// part of the signature). Once that succeeds, the client supplies ID in
// upload_ids[] on POST /api/task-completions/ or POST /api/documents/ to
// claim the object and attach it.
//
// PUT is used instead of POST because Backblaze B2's S3-compatible endpoint
// does not implement the S3 POST Object form upload — it answers HTTP 501 to
// every request style. PUT behaves the same on AWS S3, B2, and MinIO.
type PresignUploadResponse struct {
	// ID is the pending_uploads.id to send back in upload_ids[].
	ID uint `json:"id"`

	// URL is the signed PUT URL; all authentication travels in its query
	// parameters.
	URL string `json:"upload_url"`

	// Method is always "PUT". It is spelled out so clients need not
	// hardcode it, and leaves room for other upload mechanisms should any
	// ever be offered.
	Method string `json:"method"`

	// Headers has to be sent unchanged on the PUT request. At present it
	// carries Content-Type and Content-Length; both are signed, and B2
	// rejects a PUT whose headers differ.
	Headers map[string]string `json:"headers"`

	// Key is the server-chosen object key, returned for client-side logging
	// and debugging only; ID remains the canonical reference.
	Key string `json:"key"`

	// ExpiresAt is the moment the signed URL stops working. Clients should
	// request a fresh presign rather than depend on long-lived URLs.
	ExpiresAt string `json:"expires_at"`
}
|
||||
@@ -1,7 +1,6 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"net/http"
|
||||
|
||||
"github.com/labstack/echo/v4"
|
||||
@@ -13,20 +12,22 @@ import (
|
||||
"github.com/treytartt/honeydue-api/internal/middleware"
|
||||
"github.com/treytartt/honeydue-api/internal/services"
|
||||
"github.com/treytartt/honeydue-api/internal/validator"
|
||||
"github.com/treytartt/honeydue-api/internal/worker"
|
||||
)
|
||||
|
||||
// AuthHandler handles authentication endpoints
|
||||
// AuthHandler handles user profile and account management endpoints.
|
||||
// Login, logout, and password reset are delegated to Ory Kratos. Register
|
||||
// remains here (it creates the Kratos identity); the rest deals with the honeyDue user record.
|
||||
type AuthHandler struct {
|
||||
authService *services.AuthService
|
||||
emailService *services.EmailService
|
||||
cache *services.CacheService
|
||||
appleAuthService *services.AppleAuthService
|
||||
googleAuthService *services.GoogleAuthService
|
||||
storageService *services.StorageService
|
||||
auditService *services.AuditService
|
||||
authService *services.AuthService
|
||||
emailService *services.EmailService
|
||||
cache *services.CacheService
|
||||
storageService *services.StorageService
|
||||
auditService *services.AuditService
|
||||
enqueuer worker.Enqueuer
|
||||
}
|
||||
|
||||
// NewAuthHandler creates a new auth handler
|
||||
// NewAuthHandler creates a new auth handler.
|
||||
func NewAuthHandler(authService *services.AuthService, emailService *services.EmailService, cache *services.CacheService) *AuthHandler {
|
||||
return &AuthHandler{
|
||||
authService: authService,
|
||||
@@ -35,139 +36,108 @@ func NewAuthHandler(authService *services.AuthService, emailService *services.Em
|
||||
}
|
||||
}
|
||||
|
||||
// SetAppleAuthService sets the Apple auth service (called after initialization)
|
||||
func (h *AuthHandler) SetAppleAuthService(appleAuth *services.AppleAuthService) {
|
||||
h.appleAuthService = appleAuth
|
||||
}
|
||||
|
||||
// SetGoogleAuthService sets the Google auth service (called after initialization)
|
||||
func (h *AuthHandler) SetGoogleAuthService(googleAuth *services.GoogleAuthService) {
|
||||
h.googleAuthService = googleAuth
|
||||
}
|
||||
|
||||
// SetStorageService sets the storage service for file deletion during account deletion
|
||||
// SetStorageService sets the storage service for file deletion during account deletion.
|
||||
func (h *AuthHandler) SetStorageService(storageService *services.StorageService) {
|
||||
h.storageService = storageService
|
||||
}
|
||||
|
||||
// SetAuditService sets the audit service for logging security events
|
||||
// SetAuditService sets the audit service for logging security events.
|
||||
func (h *AuthHandler) SetAuditService(auditService *services.AuditService) {
|
||||
h.auditService = auditService
|
||||
}
|
||||
|
||||
// Login handles POST /api/auth/login/
|
||||
func (h *AuthHandler) Login(c echo.Context) error {
|
||||
var req requests.LoginRequest
|
||||
if err := c.Bind(&req); err != nil {
|
||||
return apperrors.BadRequest("error.invalid_request")
|
||||
}
|
||||
if err := c.Validate(&req); err != nil {
|
||||
return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(err))
|
||||
}
|
||||
|
||||
response, err := h.authService.Login(&req)
|
||||
if err != nil {
|
||||
log.Debug().Err(err).Str("identifier", req.Username).Msg("Login failed")
|
||||
if h.auditService != nil {
|
||||
h.auditService.LogEvent(c, nil, services.AuditEventLoginFailed, map[string]interface{}{
|
||||
"identifier": req.Username,
|
||||
})
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
if h.auditService != nil {
|
||||
userID := response.User.ID
|
||||
h.auditService.LogEvent(c, &userID, services.AuditEventLogin, nil)
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, response)
|
||||
// SetEnqueuer sets the async task enqueuer (used by the GDPR data-export endpoint).
|
||||
func (h *AuthHandler) SetEnqueuer(enqueuer worker.Enqueuer) {
|
||||
h.enqueuer = enqueuer
|
||||
}
|
||||
|
||||
// Register handles POST /api/auth/register/
|
||||
// ExportData handles POST /api/auth/export/ — queues a GDPR data-export job that
|
||||
// emails the user a zip of all their data. Async (202) because gathering,
|
||||
// zipping, and emailing can take seconds; doing it inline would block the request.
|
||||
func (h *AuthHandler) ExportData(c echo.Context) error {
|
||||
noStore(c)
|
||||
user, err := middleware.MustGetAuthUser(c)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if h.enqueuer == nil {
|
||||
return echo.NewHTTPError(http.StatusServiceUnavailable, "data export is temporarily unavailable")
|
||||
}
|
||||
if err := h.enqueuer.EnqueueDataExport(user.ID); err != nil {
|
||||
log.Error().Err(err).Uint("user_id", user.ID).Msg("Failed to enqueue data export")
|
||||
return echo.NewHTTPError(http.StatusInternalServerError, "failed to queue data export")
|
||||
}
|
||||
if h.auditService != nil {
|
||||
h.auditService.LogEvent(c, &user.ID, services.AuditEventDataExport, map[string]interface{}{
|
||||
"user_id": user.ID,
|
||||
"email": user.Email,
|
||||
})
|
||||
}
|
||||
return c.JSON(http.StatusAccepted, map[string]string{
|
||||
"message": "Your data export has been queued. You'll receive an email with your data shortly.",
|
||||
})
|
||||
}
|
||||
|
||||
// noStore marks a response as non-cacheable.
|
||||
func noStore(c echo.Context) {
|
||||
c.Response().Header().Set("Cache-Control", "no-store")
|
||||
}
|
||||
|
||||
// Register handles POST /api/auth/register/ — creates a new password account.
|
||||
//
|
||||
// The identity is admin-created in Kratos with an unverified email and no
|
||||
// auto-sent code (see services.AuthService.Register). The client logs in right
|
||||
// after to get a session, then completes email verification. Returns 201 with
|
||||
// no token; 409 if the email is taken; 400 on a weak password.
|
||||
func (h *AuthHandler) Register(c echo.Context) error {
|
||||
var req requests.RegisterRequest
|
||||
if err := c.Bind(&req); err != nil {
|
||||
return apperrors.BadRequest("error.invalid_request")
|
||||
return apperrors.BadRequest("error.invalid_request_body")
|
||||
}
|
||||
if err := c.Validate(&req); err != nil {
|
||||
return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(err))
|
||||
}
|
||||
|
||||
response, confirmationCode, err := h.authService.Register(&req)
|
||||
if err != nil {
|
||||
log.Debug().Err(err).Msg("Registration failed")
|
||||
if err := h.authService.Register(c.Request().Context(), &req); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if h.auditService != nil {
|
||||
userID := response.User.ID
|
||||
h.auditService.LogEvent(c, &userID, services.AuditEventRegister, map[string]interface{}{
|
||||
"username": req.Username,
|
||||
"email": req.Email,
|
||||
})
|
||||
}
|
||||
|
||||
// Send welcome email with confirmation code (async)
|
||||
if h.emailService != nil && confirmationCode != "" {
|
||||
go func() {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
log.Error().Interface("panic", r).Str("email", req.Email).Msg("Panic in welcome email goroutine")
|
||||
}
|
||||
}()
|
||||
if err := h.emailService.SendWelcomeEmail(req.Email, req.FirstName, confirmationCode); err != nil {
|
||||
log.Error().Err(err).Str("email", req.Email).Msg("Failed to send welcome email")
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusCreated, response)
|
||||
}
|
||||
|
||||
// Logout handles POST /api/auth/logout/
|
||||
func (h *AuthHandler) Logout(c echo.Context) error {
|
||||
token := middleware.GetAuthToken(c)
|
||||
if token == "" {
|
||||
return apperrors.Unauthorized("error.not_authenticated")
|
||||
}
|
||||
|
||||
// Log audit event before invalidating the token
|
||||
if h.auditService != nil {
|
||||
user := middleware.GetAuthUser(c)
|
||||
if user != nil {
|
||||
h.auditService.LogEvent(c, &user.ID, services.AuditEventLogout, nil)
|
||||
}
|
||||
}
|
||||
|
||||
// Invalidate token in database
|
||||
if err := h.authService.Logout(token); err != nil {
|
||||
log.Warn().Err(err).Msg("Failed to delete token from database")
|
||||
}
|
||||
|
||||
// Invalidate token in cache
|
||||
if h.cache != nil {
|
||||
if err := h.cache.InvalidateAuthToken(c.Request().Context(), token); err != nil {
|
||||
log.Warn().Err(err).Msg("Failed to invalidate token in cache")
|
||||
}
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, responses.MessageResponse{Message: "Logged out successfully"})
|
||||
return c.JSON(http.StatusCreated, map[string]string{
|
||||
"message": "Account created. Please verify your email.",
|
||||
})
|
||||
}
|
||||
|
||||
// CurrentUser handles GET /api/auth/me/
|
||||
func (h *AuthHandler) CurrentUser(c echo.Context) error {
|
||||
noStore(c)
|
||||
user, err := middleware.MustGetAuthUser(c)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
response, err := h.authService.GetCurrentUser(user.ID)
|
||||
response, err := h.authService.GetCurrentUser(c.Request().Context(), user.ID)
|
||||
if err != nil {
|
||||
log.Error().Err(err).Uint("user_id", user.ID).Msg("Failed to get current user")
|
||||
return err
|
||||
}
|
||||
|
||||
// user_profile.verified is a one-time mirror set at provision time
|
||||
// (see middleware/kratos_auth.go::provision). Kratos remains the source
|
||||
// of truth for email-verification state — it can flip from false → true
|
||||
// the instant the user completes the verification flow, and nothing
|
||||
// updates the local column. Override the response with the live value
|
||||
// the Kratos auth middleware already stashed in context so /auth/me
|
||||
// reflects current reality. Also opportunistically sync the DB mirror
|
||||
// (best-effort, ignore error) so background queries that read the
|
||||
// column see the same answer.
|
||||
if verified, ok := c.Get(middleware.AuthVerifiedKey).(bool); ok {
|
||||
mirrorStale := response.Profile != nil && response.Profile.Verified != verified
|
||||
if response.Profile != nil {
|
||||
response.Profile.Verified = verified
|
||||
}
|
||||
if verified && mirrorStale {
|
||||
_ = h.authService.MarkUserVerified(c.Request().Context(), user.ID)
|
||||
}
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, response)
|
||||
}
|
||||
|
||||
@@ -186,7 +156,7 @@ func (h *AuthHandler) UpdateProfile(c echo.Context) error {
|
||||
return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(err))
|
||||
}
|
||||
|
||||
response, err := h.authService.UpdateProfile(user.ID, &req)
|
||||
response, err := h.authService.UpdateProfile(c.Request().Context(), user.ID, &req)
|
||||
if err != nil {
|
||||
log.Debug().Err(err).Uint("user_id", user.ID).Msg("Failed to update profile")
|
||||
return err
|
||||
@@ -195,296 +165,6 @@ func (h *AuthHandler) UpdateProfile(c echo.Context) error {
|
||||
return c.JSON(http.StatusOK, response)
|
||||
}
|
||||
|
||||
// VerifyEmail handles POST /api/auth/verify-email/
|
||||
func (h *AuthHandler) VerifyEmail(c echo.Context) error {
|
||||
user, err := middleware.MustGetAuthUser(c)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var req requests.VerifyEmailRequest
|
||||
if err := c.Bind(&req); err != nil {
|
||||
return apperrors.BadRequest("error.invalid_request")
|
||||
}
|
||||
if err := c.Validate(&req); err != nil {
|
||||
return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(err))
|
||||
}
|
||||
|
||||
err = h.authService.VerifyEmail(user.ID, req.Code)
|
||||
if err != nil {
|
||||
log.Debug().Err(err).Uint("user_id", user.ID).Msg("Email verification failed")
|
||||
return err
|
||||
}
|
||||
|
||||
// Send post-verification welcome email with tips (async)
|
||||
if h.emailService != nil {
|
||||
go func() {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
log.Error().Interface("panic", r).Str("email", user.Email).Msg("Panic in post-verification email goroutine")
|
||||
}
|
||||
}()
|
||||
if err := h.emailService.SendPostVerificationEmail(user.Email, user.FirstName); err != nil {
|
||||
log.Error().Err(err).Str("email", user.Email).Msg("Failed to send post-verification email")
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, responses.VerifyEmailResponse{
|
||||
Message: "Email verified successfully",
|
||||
Verified: true,
|
||||
})
|
||||
}
|
||||
|
||||
// ResendVerification handles POST /api/auth/resend-verification/
|
||||
func (h *AuthHandler) ResendVerification(c echo.Context) error {
|
||||
user, err := middleware.MustGetAuthUser(c)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
code, err := h.authService.ResendVerificationCode(user.ID)
|
||||
if err != nil {
|
||||
log.Debug().Err(err).Uint("user_id", user.ID).Msg("Failed to resend verification")
|
||||
return err
|
||||
}
|
||||
|
||||
// Send verification email (async)
|
||||
if h.emailService != nil {
|
||||
go func() {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
log.Error().Interface("panic", r).Str("email", user.Email).Msg("Panic in verification email goroutine")
|
||||
}
|
||||
}()
|
||||
if err := h.emailService.SendVerificationEmail(user.Email, user.FirstName, code); err != nil {
|
||||
log.Error().Err(err).Str("email", user.Email).Msg("Failed to send verification email")
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, responses.MessageResponse{Message: "Verification email sent"})
|
||||
}
|
||||
|
||||
// ForgotPassword handles POST /api/auth/forgot-password/
|
||||
func (h *AuthHandler) ForgotPassword(c echo.Context) error {
|
||||
var req requests.ForgotPasswordRequest
|
||||
if err := c.Bind(&req); err != nil {
|
||||
return apperrors.BadRequest("error.invalid_request")
|
||||
}
|
||||
if err := c.Validate(&req); err != nil {
|
||||
return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(err))
|
||||
}
|
||||
|
||||
code, user, err := h.authService.ForgotPassword(req.Email)
|
||||
if err != nil {
|
||||
var appErr *apperrors.AppError
|
||||
if errors.As(err, &appErr) && appErr.Code == http.StatusTooManyRequests {
|
||||
// Only reveal rate limit errors
|
||||
return err
|
||||
}
|
||||
|
||||
log.Error().Err(err).Str("email", req.Email).Msg("Forgot password failed")
|
||||
// Don't reveal other errors to prevent email enumeration
|
||||
}
|
||||
|
||||
// Send password reset email (async) - only if user found
|
||||
if h.emailService != nil && code != "" && user != nil {
|
||||
go func() {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
log.Error().Interface("panic", r).Str("email", user.Email).Msg("Panic in password reset email goroutine")
|
||||
}
|
||||
}()
|
||||
if err := h.emailService.SendPasswordResetEmail(user.Email, user.FirstName, code); err != nil {
|
||||
log.Error().Err(err).Str("email", user.Email).Msg("Failed to send password reset email")
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
if h.auditService != nil {
|
||||
h.auditService.LogEvent(c, nil, services.AuditEventPasswordReset, map[string]interface{}{
|
||||
"email": req.Email,
|
||||
})
|
||||
}
|
||||
|
||||
// Always return success to prevent email enumeration
|
||||
return c.JSON(http.StatusOK, responses.ForgotPasswordResponse{
|
||||
Message: "Password reset email sent",
|
||||
})
|
||||
}
|
||||
|
||||
// VerifyResetCode handles POST /api/auth/verify-reset-code/
|
||||
func (h *AuthHandler) VerifyResetCode(c echo.Context) error {
|
||||
var req requests.VerifyResetCodeRequest
|
||||
if err := c.Bind(&req); err != nil {
|
||||
return apperrors.BadRequest("error.invalid_request")
|
||||
}
|
||||
if err := c.Validate(&req); err != nil {
|
||||
return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(err))
|
||||
}
|
||||
|
||||
resetToken, err := h.authService.VerifyResetCode(req.Email, req.Code)
|
||||
if err != nil {
|
||||
log.Debug().Err(err).Str("email", req.Email).Msg("Verify reset code failed")
|
||||
return err
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, responses.VerifyResetCodeResponse{
|
||||
Message: "Reset code verified",
|
||||
ResetToken: resetToken,
|
||||
})
|
||||
}
|
||||
|
||||
// ResetPassword handles POST /api/auth/reset-password/
|
||||
func (h *AuthHandler) ResetPassword(c echo.Context) error {
|
||||
var req requests.ResetPasswordRequest
|
||||
if err := c.Bind(&req); err != nil {
|
||||
return apperrors.BadRequest("error.invalid_request")
|
||||
}
|
||||
if err := c.Validate(&req); err != nil {
|
||||
return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(err))
|
||||
}
|
||||
|
||||
err := h.authService.ResetPassword(req.ResetToken, req.NewPassword)
|
||||
if err != nil {
|
||||
log.Debug().Err(err).Msg("Password reset failed")
|
||||
return err
|
||||
}
|
||||
|
||||
if h.auditService != nil {
|
||||
h.auditService.LogEvent(c, nil, services.AuditEventPasswordChanged, map[string]interface{}{
|
||||
"method": "reset_token",
|
||||
})
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, responses.ResetPasswordResponse{
|
||||
Message: "Password reset successful",
|
||||
})
|
||||
}
|
||||
|
||||
// AppleSignIn handles POST /api/auth/apple-sign-in/
|
||||
func (h *AuthHandler) AppleSignIn(c echo.Context) error {
|
||||
var req requests.AppleSignInRequest
|
||||
if err := c.Bind(&req); err != nil {
|
||||
return apperrors.BadRequest("error.invalid_request")
|
||||
}
|
||||
if err := c.Validate(&req); err != nil {
|
||||
return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(err))
|
||||
}
|
||||
|
||||
if h.appleAuthService == nil {
|
||||
log.Error().Msg("Apple auth service not configured")
|
||||
return &apperrors.AppError{
|
||||
Code: 500,
|
||||
MessageKey: "error.apple_signin_not_configured",
|
||||
}
|
||||
}
|
||||
|
||||
response, err := h.authService.AppleSignIn(c.Request().Context(), h.appleAuthService, &req)
|
||||
if err != nil {
|
||||
// Check for legacy Apple Sign In error (not yet migrated)
|
||||
if errors.Is(err, services.ErrAppleSignInFailed) {
|
||||
log.Debug().Err(err).Msg("Apple Sign In failed (legacy error)")
|
||||
return apperrors.Unauthorized("error.invalid_apple_token")
|
||||
}
|
||||
|
||||
log.Debug().Err(err).Msg("Apple Sign In failed")
|
||||
return err
|
||||
}
|
||||
|
||||
// Send welcome email for new users (async)
|
||||
if response.IsNewUser && h.emailService != nil && response.User.Email != "" {
|
||||
go func() {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
log.Error().Interface("panic", r).Str("email", response.User.Email).Msg("Panic in Apple welcome email goroutine")
|
||||
}
|
||||
}()
|
||||
if err := h.emailService.SendAppleWelcomeEmail(response.User.Email, response.User.FirstName); err != nil {
|
||||
log.Error().Err(err).Str("email", response.User.Email).Msg("Failed to send Apple welcome email")
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, response)
|
||||
}
|
||||
|
||||
// GoogleSignIn handles POST /api/auth/google-sign-in/
|
||||
func (h *AuthHandler) GoogleSignIn(c echo.Context) error {
|
||||
var req requests.GoogleSignInRequest
|
||||
if err := c.Bind(&req); err != nil {
|
||||
return apperrors.BadRequest("error.invalid_request")
|
||||
}
|
||||
if err := c.Validate(&req); err != nil {
|
||||
return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(err))
|
||||
}
|
||||
|
||||
if h.googleAuthService == nil {
|
||||
log.Error().Msg("Google auth service not configured")
|
||||
return &apperrors.AppError{
|
||||
Code: 500,
|
||||
MessageKey: "error.google_signin_not_configured",
|
||||
}
|
||||
}
|
||||
|
||||
response, err := h.authService.GoogleSignIn(c.Request().Context(), h.googleAuthService, &req)
|
||||
if err != nil {
|
||||
// Check for legacy Google Sign In error (not yet migrated)
|
||||
if errors.Is(err, services.ErrGoogleSignInFailed) {
|
||||
log.Debug().Err(err).Msg("Google Sign In failed (legacy error)")
|
||||
return apperrors.Unauthorized("error.invalid_google_token")
|
||||
}
|
||||
|
||||
log.Debug().Err(err).Msg("Google Sign In failed")
|
||||
return err
|
||||
}
|
||||
|
||||
// Send welcome email for new users (async)
|
||||
if response.IsNewUser && h.emailService != nil && response.User.Email != "" {
|
||||
go func() {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
log.Error().Interface("panic", r).Str("email", response.User.Email).Msg("Panic in Google welcome email goroutine")
|
||||
}
|
||||
}()
|
||||
if err := h.emailService.SendGoogleWelcomeEmail(response.User.Email, response.User.FirstName); err != nil {
|
||||
log.Error().Err(err).Str("email", response.User.Email).Msg("Failed to send Google welcome email")
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, response)
|
||||
}
|
||||
|
||||
// RefreshToken handles POST /api/auth/refresh/
|
||||
func (h *AuthHandler) RefreshToken(c echo.Context) error {
|
||||
user, err := middleware.MustGetAuthUser(c)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
token := middleware.GetAuthToken(c)
|
||||
if token == "" {
|
||||
return apperrors.Unauthorized("error.not_authenticated")
|
||||
}
|
||||
|
||||
response, err := h.authService.RefreshToken(token, user.ID)
|
||||
if err != nil {
|
||||
log.Debug().Err(err).Uint("user_id", user.ID).Msg("Token refresh failed")
|
||||
return err
|
||||
}
|
||||
|
||||
// If the token was refreshed (new token), invalidate the old one from cache
|
||||
if response.Token != token && h.cache != nil {
|
||||
if cacheErr := h.cache.InvalidateAuthToken(c.Request().Context(), token); cacheErr != nil {
|
||||
log.Warn().Err(cacheErr).Msg("Failed to invalidate old token from cache during refresh")
|
||||
}
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, response)
|
||||
}
|
||||
|
||||
// DeleteAccount handles DELETE /api/auth/account/
|
||||
func (h *AuthHandler) DeleteAccount(c echo.Context) error {
|
||||
user, err := middleware.MustGetAuthUser(c)
|
||||
@@ -497,7 +177,7 @@ func (h *AuthHandler) DeleteAccount(c echo.Context) error {
|
||||
return apperrors.BadRequest("error.invalid_request")
|
||||
}
|
||||
|
||||
fileURLs, err := h.authService.DeleteAccount(user.ID, req.Password, req.Confirmation)
|
||||
fileURLs, err := h.authService.DeleteAccount(c.Request().Context(), user.ID, req.Password, req.Confirmation)
|
||||
if err != nil {
|
||||
log.Debug().Err(err).Uint("user_id", user.ID).Msg("Account deletion failed")
|
||||
return err
|
||||
@@ -527,13 +207,5 @@ func (h *AuthHandler) DeleteAccount(c echo.Context) error {
|
||||
}()
|
||||
}
|
||||
|
||||
// Invalidate auth token from cache
|
||||
token := middleware.GetAuthToken(c)
|
||||
if h.cache != nil && token != "" {
|
||||
if err := h.cache.InvalidateAuthToken(c.Request().Context(), token); err != nil {
|
||||
log.Warn().Err(err).Msg("Failed to invalidate token in cache after account deletion")
|
||||
}
|
||||
}
|
||||
|
||||
return c.JSON(http.StatusOK, responses.MessageResponse{Message: "Account deleted successfully"})
|
||||
}
|
||||
|
||||
@@ -35,26 +35,25 @@ func setupDeleteAccountHandler(t *testing.T) (*AuthHandler, *echo.Echo, *gorm.DB
|
||||
return handler, e, db
|
||||
}
|
||||
|
||||
func TestAuthHandler_DeleteAccount_EmailUser(t *testing.T) {
|
||||
// TestAuthHandler_DeleteAccount_WithConfirmation verifies that DELETE /account/
|
||||
// succeeds when the user sends confirmation: "DELETE".
|
||||
// Post-Kratos: all users (regardless of provider) must confirm with "DELETE".
|
||||
func TestAuthHandler_DeleteAccount_WithConfirmation(t *testing.T) {
|
||||
handler, e, db := setupDeleteAccountHandler(t)
|
||||
|
||||
user := testutil.CreateTestUser(t, db, "deletetest", "delete@test.com", "Password123")
|
||||
user := testutil.CreateTestUser(t, db, "deletetest", "delete@test.com", "ignored")
|
||||
|
||||
// Create profile for the user
|
||||
profile := &models.UserProfile{UserID: user.ID, Verified: true}
|
||||
require.NoError(t, db.Create(profile).Error)
|
||||
|
||||
// Create auth token
|
||||
testutil.CreateTestToken(t, db, user.ID)
|
||||
|
||||
authGroup := e.Group("/api/auth")
|
||||
authGroup.Use(testutil.MockAuthMiddleware(user))
|
||||
authGroup.DELETE("/account/", handler.DeleteAccount)
|
||||
|
||||
t.Run("successful deletion with correct password", func(t *testing.T) {
|
||||
password := "Password123"
|
||||
t.Run("successful deletion with DELETE confirmation", func(t *testing.T) {
|
||||
req := map[string]interface{}{
|
||||
"password": password,
|
||||
"confirmation": "DELETE",
|
||||
}
|
||||
|
||||
w := testutil.MakeRequest(e, "DELETE", "/api/auth/account/", req, "test-token")
|
||||
@@ -74,106 +73,15 @@ func TestAuthHandler_DeleteAccount_EmailUser(t *testing.T) {
|
||||
// Verify profile is deleted
|
||||
db.Model(&models.UserProfile{}).Where("user_id = ?", user.ID).Count(&count)
|
||||
assert.Equal(t, int64(0), count)
|
||||
|
||||
// Verify auth token is deleted
|
||||
db.Model(&models.AuthToken{}).Where("user_id = ?", user.ID).Count(&count)
|
||||
assert.Equal(t, int64(0), count)
|
||||
})
|
||||
}
|
||||
|
||||
func TestAuthHandler_DeleteAccount_WrongPassword(t *testing.T) {
|
||||
// TestAuthHandler_DeleteAccount_MissingConfirmation verifies that a missing
|
||||
// confirmation string is rejected with 400.
|
||||
func TestAuthHandler_DeleteAccount_MissingConfirmation(t *testing.T) {
|
||||
handler, e, db := setupDeleteAccountHandler(t)
|
||||
|
||||
user := testutil.CreateTestUser(t, db, "wrongpw", "wrongpw@test.com", "Password123")
|
||||
|
||||
authGroup := e.Group("/api/auth")
|
||||
authGroup.Use(testutil.MockAuthMiddleware(user))
|
||||
authGroup.DELETE("/account/", handler.DeleteAccount)
|
||||
|
||||
t.Run("wrong password returns 401", func(t *testing.T) {
|
||||
wrongPw := "wrongpassword"
|
||||
req := map[string]interface{}{
|
||||
"password": wrongPw,
|
||||
}
|
||||
|
||||
w := testutil.MakeRequest(e, "DELETE", "/api/auth/account/", req, "test-token")
|
||||
|
||||
testutil.AssertStatusCode(t, w, http.StatusUnauthorized)
|
||||
})
|
||||
}
|
||||
|
||||
func TestAuthHandler_DeleteAccount_MissingPassword(t *testing.T) {
|
||||
handler, e, db := setupDeleteAccountHandler(t)
|
||||
|
||||
user := testutil.CreateTestUser(t, db, "nopw", "nopw@test.com", "Password123")
|
||||
|
||||
authGroup := e.Group("/api/auth")
|
||||
authGroup.Use(testutil.MockAuthMiddleware(user))
|
||||
authGroup.DELETE("/account/", handler.DeleteAccount)
|
||||
|
||||
t.Run("missing password returns 400", func(t *testing.T) {
|
||||
req := map[string]interface{}{}
|
||||
|
||||
w := testutil.MakeRequest(e, "DELETE", "/api/auth/account/", req, "test-token")
|
||||
|
||||
testutil.AssertStatusCode(t, w, http.StatusBadRequest)
|
||||
})
|
||||
}
|
||||
|
||||
func TestAuthHandler_DeleteAccount_SocialAuthUser(t *testing.T) {
|
||||
handler, e, db := setupDeleteAccountHandler(t)
|
||||
|
||||
user := testutil.CreateTestUser(t, db, "appleuser", "apple@test.com", "randompassword")
|
||||
|
||||
// Create Apple social auth record
|
||||
appleAuth := &models.AppleSocialAuth{
|
||||
UserID: user.ID,
|
||||
AppleID: "apple_sub_123",
|
||||
Email: "apple@test.com",
|
||||
}
|
||||
require.NoError(t, db.Create(appleAuth).Error)
|
||||
|
||||
// Create profile
|
||||
profile := &models.UserProfile{UserID: user.ID, Verified: true}
|
||||
require.NoError(t, db.Create(profile).Error)
|
||||
|
||||
authGroup := e.Group("/api/auth")
|
||||
authGroup.Use(testutil.MockAuthMiddleware(user))
|
||||
authGroup.DELETE("/account/", handler.DeleteAccount)
|
||||
|
||||
t.Run("successful deletion with DELETE confirmation", func(t *testing.T) {
|
||||
confirmation := "DELETE"
|
||||
req := map[string]interface{}{
|
||||
"confirmation": confirmation,
|
||||
}
|
||||
|
||||
w := testutil.MakeRequest(e, "DELETE", "/api/auth/account/", req, "test-token")
|
||||
|
||||
testutil.AssertStatusCode(t, w, http.StatusOK)
|
||||
|
||||
// Verify user is deleted
|
||||
var count int64
|
||||
db.Model(&models.User{}).Where("id = ?", user.ID).Count(&count)
|
||||
assert.Equal(t, int64(0), count)
|
||||
|
||||
// Verify apple auth is deleted
|
||||
db.Model(&models.AppleSocialAuth{}).Where("user_id = ?", user.ID).Count(&count)
|
||||
assert.Equal(t, int64(0), count)
|
||||
})
|
||||
}
|
||||
|
||||
func TestAuthHandler_DeleteAccount_SocialAuthMissingConfirmation(t *testing.T) {
|
||||
handler, e, db := setupDeleteAccountHandler(t)
|
||||
|
||||
user := testutil.CreateTestUser(t, db, "googleuser", "google@test.com", "randompassword")
|
||||
|
||||
// Create Google social auth record
|
||||
googleAuth := &models.GoogleSocialAuth{
|
||||
UserID: user.ID,
|
||||
GoogleID: "google_sub_456",
|
||||
Email: "google@test.com",
|
||||
}
|
||||
require.NoError(t, db.Create(googleAuth).Error)
|
||||
user := testutil.CreateTestUser(t, db, "nopw", "nopw@test.com", "ignored")
|
||||
|
||||
authGroup := e.Group("/api/auth")
|
||||
authGroup.Use(testutil.MockAuthMiddleware(user))
|
||||
@@ -188,9 +96,8 @@ func TestAuthHandler_DeleteAccount_SocialAuthMissingConfirmation(t *testing.T) {
|
||||
})
|
||||
|
||||
t.Run("wrong confirmation returns 400", func(t *testing.T) {
|
||||
wrongConfirmation := "delete"
|
||||
req := map[string]interface{}{
|
||||
"confirmation": wrongConfirmation,
|
||||
"confirmation": "delete", // lowercase — must be exact "DELETE"
|
||||
}
|
||||
|
||||
w := testutil.MakeRequest(e, "DELETE", "/api/auth/account/", req, "test-token")
|
||||
@@ -199,6 +106,8 @@ func TestAuthHandler_DeleteAccount_SocialAuthMissingConfirmation(t *testing.T) {
|
||||
})
|
||||
}
|
||||
|
||||
// TestAuthHandler_DeleteAccount_Unauthenticated verifies that 401 is returned
|
||||
// when no auth middleware is set.
|
||||
func TestAuthHandler_DeleteAccount_Unauthenticated(t *testing.T) {
|
||||
handler, e, _ := setupDeleteAccountHandler(t)
|
||||
|
||||
@@ -207,7 +116,7 @@ func TestAuthHandler_DeleteAccount_Unauthenticated(t *testing.T) {
|
||||
|
||||
t.Run("unauthenticated request returns 401", func(t *testing.T) {
|
||||
req := map[string]interface{}{
|
||||
"password": "Password123",
|
||||
"confirmation": "DELETE",
|
||||
}
|
||||
|
||||
w := testutil.MakeRequest(e, "DELETE", "/api/auth/account/", req, "")
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
// auth_handler_test.go tests the auth handler endpoints that survived the
|
||||
// Ory Kratos migration: GET /me/ and PUT/PATCH /profile/.
|
||||
// Login, register, logout, forgot-password, and social sign-in are now
|
||||
// handled by Kratos.
|
||||
package handlers
|
||||
|
||||
import (
|
||||
@@ -34,204 +38,32 @@ func setupAuthHandler(t *testing.T) (*AuthHandler, *echo.Echo, *repositories.Use
|
||||
return handler, e, userRepo
|
||||
}
|
||||
|
||||
func TestAuthHandler_Register(t *testing.T) {
|
||||
handler, e, _ := setupAuthHandler(t)
|
||||
|
||||
e.POST("/api/auth/register/", handler.Register)
|
||||
|
||||
t.Run("successful registration", func(t *testing.T) {
|
||||
req := requests.RegisterRequest{
|
||||
Username: "newuser",
|
||||
Email: "new@test.com",
|
||||
Password: "Password123",
|
||||
FirstName: "New",
|
||||
LastName: "User",
|
||||
}
|
||||
|
||||
w := testutil.MakeRequest(e, "POST", "/api/auth/register/", req, "")
|
||||
|
||||
testutil.AssertStatusCode(t, w, http.StatusCreated)
|
||||
|
||||
var response map[string]interface{}
|
||||
err := json.Unmarshal(w.Body.Bytes(), &response)
|
||||
require.NoError(t, err)
|
||||
|
||||
testutil.AssertJSONFieldExists(t, response, "token")
|
||||
testutil.AssertJSONFieldExists(t, response, "user")
|
||||
testutil.AssertJSONFieldExists(t, response, "message")
|
||||
|
||||
user := response["user"].(map[string]interface{})
|
||||
assert.Equal(t, "newuser", user["username"])
|
||||
assert.Equal(t, "new@test.com", user["email"])
|
||||
assert.Equal(t, "New", user["first_name"])
|
||||
assert.Equal(t, "User", user["last_name"])
|
||||
})
|
||||
|
||||
t.Run("registration with missing fields", func(t *testing.T) {
|
||||
req := map[string]string{
|
||||
"username": "test",
|
||||
// Missing email and password
|
||||
}
|
||||
|
||||
w := testutil.MakeRequest(e, "POST", "/api/auth/register/", req, "")
|
||||
|
||||
testutil.AssertStatusCode(t, w, http.StatusBadRequest)
|
||||
|
||||
response := testutil.ParseJSON(t, w.Body.Bytes())
|
||||
testutil.AssertJSONFieldExists(t, response, "error")
|
||||
})
|
||||
|
||||
t.Run("registration with short password", func(t *testing.T) {
|
||||
req := requests.RegisterRequest{
|
||||
Username: "testuser",
|
||||
Email: "test@test.com",
|
||||
Password: "short", // Less than 8 chars
|
||||
}
|
||||
|
||||
w := testutil.MakeRequest(e, "POST", "/api/auth/register/", req, "")
|
||||
|
||||
testutil.AssertStatusCode(t, w, http.StatusBadRequest)
|
||||
})
|
||||
|
||||
t.Run("registration with duplicate username", func(t *testing.T) {
|
||||
// First registration
|
||||
req := requests.RegisterRequest{
|
||||
Username: "duplicate",
|
||||
Email: "unique1@test.com",
|
||||
Password: "Password123",
|
||||
}
|
||||
w := testutil.MakeRequest(e, "POST", "/api/auth/register/", req, "")
|
||||
testutil.AssertStatusCode(t, w, http.StatusCreated)
|
||||
|
||||
// Try to register again with same username
|
||||
req.Email = "unique2@test.com"
|
||||
w = testutil.MakeRequest(e, "POST", "/api/auth/register/", req, "")
|
||||
testutil.AssertStatusCode(t, w, http.StatusConflict) // 409 for duplicate resource
|
||||
|
||||
response := testutil.ParseJSON(t, w.Body.Bytes())
|
||||
assert.Contains(t, response["error"], "Username already taken")
|
||||
})
|
||||
|
||||
t.Run("registration with duplicate email", func(t *testing.T) {
|
||||
// First registration
|
||||
req := requests.RegisterRequest{
|
||||
Username: "user1",
|
||||
Email: "duplicate@test.com",
|
||||
Password: "Password123",
|
||||
}
|
||||
w := testutil.MakeRequest(e, "POST", "/api/auth/register/", req, "")
|
||||
testutil.AssertStatusCode(t, w, http.StatusCreated)
|
||||
|
||||
// Try to register again with same email
|
||||
req.Username = "user2"
|
||||
w = testutil.MakeRequest(e, "POST", "/api/auth/register/", req, "")
|
||||
testutil.AssertStatusCode(t, w, http.StatusConflict) // 409 for duplicate resource
|
||||
|
||||
response := testutil.ParseJSON(t, w.Body.Bytes())
|
||||
assert.Contains(t, response["error"], "Email already registered")
|
||||
})
|
||||
}
|
||||
|
||||
func TestAuthHandler_Login(t *testing.T) {
|
||||
handler, e, _ := setupAuthHandler(t)
|
||||
|
||||
e.POST("/api/auth/register/", handler.Register)
|
||||
e.POST("/api/auth/login/", handler.Login)
|
||||
|
||||
// Create a test user
|
||||
registerReq := requests.RegisterRequest{
|
||||
Username: "logintest",
|
||||
Email: "login@test.com",
|
||||
Password: "Password123",
|
||||
FirstName: "Test",
|
||||
LastName: "User",
|
||||
}
|
||||
w := testutil.MakeRequest(e, "POST", "/api/auth/register/", registerReq, "")
|
||||
testutil.AssertStatusCode(t, w, http.StatusCreated)
|
||||
|
||||
t.Run("successful login with username", func(t *testing.T) {
|
||||
req := requests.LoginRequest{
|
||||
Username: "logintest",
|
||||
Password: "Password123",
|
||||
}
|
||||
|
||||
w := testutil.MakeRequest(e, "POST", "/api/auth/login/", req, "")
|
||||
|
||||
testutil.AssertStatusCode(t, w, http.StatusOK)
|
||||
|
||||
var response map[string]interface{}
|
||||
err := json.Unmarshal(w.Body.Bytes(), &response)
|
||||
require.NoError(t, err)
|
||||
|
||||
testutil.AssertJSONFieldExists(t, response, "token")
|
||||
testutil.AssertJSONFieldExists(t, response, "user")
|
||||
|
||||
user := response["user"].(map[string]interface{})
|
||||
assert.Equal(t, "logintest", user["username"])
|
||||
assert.Equal(t, "login@test.com", user["email"])
|
||||
})
|
||||
|
||||
t.Run("successful login with email", func(t *testing.T) {
|
||||
req := requests.LoginRequest{
|
||||
Username: "login@test.com", // Using email as username
|
||||
Password: "Password123",
|
||||
}
|
||||
|
||||
w := testutil.MakeRequest(e, "POST", "/api/auth/login/", req, "")
|
||||
|
||||
testutil.AssertStatusCode(t, w, http.StatusOK)
|
||||
})
|
||||
|
||||
t.Run("login with wrong password", func(t *testing.T) {
|
||||
req := requests.LoginRequest{
|
||||
Username: "logintest",
|
||||
Password: "wrongpassword",
|
||||
}
|
||||
|
||||
w := testutil.MakeRequest(e, "POST", "/api/auth/login/", req, "")
|
||||
|
||||
testutil.AssertStatusCode(t, w, http.StatusUnauthorized)
|
||||
|
||||
response := testutil.ParseJSON(t, w.Body.Bytes())
|
||||
assert.Contains(t, response["error"], "Invalid credentials")
|
||||
})
|
||||
|
||||
t.Run("login with non-existent user", func(t *testing.T) {
|
||||
req := requests.LoginRequest{
|
||||
Username: "nonexistent",
|
||||
Password: "Password123",
|
||||
}
|
||||
|
||||
w := testutil.MakeRequest(e, "POST", "/api/auth/login/", req, "")
|
||||
|
||||
testutil.AssertStatusCode(t, w, http.StatusUnauthorized)
|
||||
})
|
||||
|
||||
t.Run("login with missing fields", func(t *testing.T) {
|
||||
req := map[string]string{
|
||||
"username": "logintest",
|
||||
// Missing password
|
||||
}
|
||||
|
||||
w := testutil.MakeRequest(e, "POST", "/api/auth/login/", req, "")
|
||||
|
||||
testutil.AssertStatusCode(t, w, http.StatusBadRequest)
|
||||
})
|
||||
}
|
||||
|
||||
func TestAuthHandler_CurrentUser(t *testing.T) {
|
||||
handler, e, userRepo := setupAuthHandler(t)
|
||||
handler, e, _ := setupAuthHandler(t)
|
||||
|
||||
db := testutil.SetupTestDB(t)
|
||||
user := testutil.CreateTestUser(t, db, "metest", "me@test.com", "Password123")
|
||||
user := testutil.CreateTestUser(t, db, "metest", "me@test.com", "")
|
||||
user.FirstName = "Test"
|
||||
user.LastName = "User"
|
||||
userRepo.Update(user)
|
||||
// Use the userRepo from setupAuthHandler's DB, but since we need the user
|
||||
// in the same DB we re-create it there.
|
||||
db2 := testutil.SetupTestDB(t)
|
||||
user2 := testutil.CreateTestUser(t, db2, "metest2", "me2@test.com", "")
|
||||
user2.FirstName = "Test"
|
||||
user2.LastName = "User"
|
||||
userRepo2 := repositories.NewUserRepository(db2)
|
||||
require.NoError(t, userRepo2.Update(user2))
|
||||
|
||||
// Build handler against db2
|
||||
cfg := &config.Config{}
|
||||
authService2 := services.NewAuthService(userRepo2, cfg)
|
||||
handler2 := NewAuthHandler(authService2, nil, nil)
|
||||
|
||||
// Set up route with mock auth middleware
|
||||
authGroup := e.Group("/api/auth")
|
||||
authGroup.Use(testutil.MockAuthMiddleware(user))
|
||||
authGroup.GET("/me/", handler.CurrentUser)
|
||||
authGroup.Use(testutil.MockAuthMiddleware(user2))
|
||||
authGroup.GET("/me/", handler2.CurrentUser)
|
||||
|
||||
_ = handler // avoid unused
|
||||
|
||||
t.Run("get current user", func(t *testing.T) {
|
||||
w := testutil.MakeRequest(e, "GET", "/api/auth/me/", nil, "test-token")
|
||||
@@ -242,23 +74,26 @@ func TestAuthHandler_CurrentUser(t *testing.T) {
|
||||
err := json.Unmarshal(w.Body.Bytes(), &response)
|
||||
require.NoError(t, err)
|
||||
|
||||
assert.Equal(t, "metest", response["username"])
|
||||
assert.Equal(t, "me@test.com", response["email"])
|
||||
assert.Equal(t, "metest2", response["username"])
|
||||
assert.Equal(t, "me2@test.com", response["email"])
|
||||
})
|
||||
}
|
||||
|
||||
func TestAuthHandler_UpdateProfile(t *testing.T) {
|
||||
handler, e, userRepo := setupAuthHandler(t)
|
||||
|
||||
db := testutil.SetupTestDB(t)
|
||||
user := testutil.CreateTestUser(t, db, "updatetest", "update@test.com", "Password123")
|
||||
userRepo.Update(user)
|
||||
userRepo := repositories.NewUserRepository(db)
|
||||
cfg := &config.Config{}
|
||||
authService := services.NewAuthService(userRepo, cfg)
|
||||
handler := NewAuthHandler(authService, nil, nil)
|
||||
e := testutil.SetupTestRouter()
|
||||
|
||||
user := testutil.CreateTestUser(t, db, "updatetest", "update@test.com", "")
|
||||
|
||||
authGroup := e.Group("/api/auth")
|
||||
authGroup.Use(testutil.MockAuthMiddleware(user))
|
||||
authGroup.PUT("/profile/", handler.UpdateProfile)
|
||||
|
||||
t.Run("update profile", func(t *testing.T) {
|
||||
t.Run("update first and last name", func(t *testing.T) {
|
||||
firstName := "Updated"
|
||||
lastName := "Name"
|
||||
req := requests.UpdateProfileRequest{
|
||||
@@ -278,130 +113,3 @@ func TestAuthHandler_UpdateProfile(t *testing.T) {
|
||||
assert.Equal(t, "Name", response["last_name"])
|
||||
})
|
||||
}
|
||||
|
||||
func TestAuthHandler_ForgotPassword(t *testing.T) {
|
||||
handler, e, _ := setupAuthHandler(t)
|
||||
|
||||
e.POST("/api/auth/register/", handler.Register)
|
||||
e.POST("/api/auth/forgot-password/", handler.ForgotPassword)
|
||||
|
||||
// Create a test user
|
||||
registerReq := requests.RegisterRequest{
|
||||
Username: "forgottest",
|
||||
Email: "forgot@test.com",
|
||||
Password: "Password123",
|
||||
}
|
||||
testutil.MakeRequest(e, "POST", "/api/auth/register/", registerReq, "")
|
||||
|
||||
t.Run("forgot password with valid email", func(t *testing.T) {
|
||||
req := requests.ForgotPasswordRequest{
|
||||
Email: "forgot@test.com",
|
||||
}
|
||||
|
||||
w := testutil.MakeRequest(e, "POST", "/api/auth/forgot-password/", req, "")
|
||||
|
||||
// Always returns 200 to prevent email enumeration
|
||||
testutil.AssertStatusCode(t, w, http.StatusOK)
|
||||
|
||||
response := testutil.ParseJSON(t, w.Body.Bytes())
|
||||
testutil.AssertJSONFieldExists(t, response, "message")
|
||||
})
|
||||
|
||||
t.Run("forgot password with invalid email", func(t *testing.T) {
|
||||
req := requests.ForgotPasswordRequest{
|
||||
Email: "nonexistent@test.com",
|
||||
}
|
||||
|
||||
w := testutil.MakeRequest(e, "POST", "/api/auth/forgot-password/", req, "")
|
||||
|
||||
// Still returns 200 to prevent email enumeration
|
||||
testutil.AssertStatusCode(t, w, http.StatusOK)
|
||||
})
|
||||
}
|
||||
|
||||
func TestAuthHandler_Logout(t *testing.T) {
|
||||
handler, e, userRepo := setupAuthHandler(t)
|
||||
|
||||
db := testutil.SetupTestDB(t)
|
||||
user := testutil.CreateTestUser(t, db, "logouttest", "logout@test.com", "Password123")
|
||||
userRepo.Update(user)
|
||||
|
||||
authGroup := e.Group("/api/auth")
|
||||
authGroup.Use(testutil.MockAuthMiddleware(user))
|
||||
authGroup.POST("/logout/", handler.Logout)
|
||||
|
||||
t.Run("successful logout", func(t *testing.T) {
|
||||
w := testutil.MakeRequest(e, "POST", "/api/auth/logout/", nil, "test-token")
|
||||
|
||||
testutil.AssertStatusCode(t, w, http.StatusOK)
|
||||
|
||||
response := testutil.ParseJSON(t, w.Body.Bytes())
|
||||
assert.Contains(t, response["message"], "Logged out successfully")
|
||||
})
|
||||
}
|
||||
|
||||
func TestAuthHandler_JSONResponses(t *testing.T) {
|
||||
handler, e, _ := setupAuthHandler(t)
|
||||
|
||||
e.POST("/api/auth/register/", handler.Register)
|
||||
e.POST("/api/auth/login/", handler.Login)
|
||||
|
||||
t.Run("register response has correct JSON structure", func(t *testing.T) {
|
||||
req := requests.RegisterRequest{
|
||||
Username: "jsontest",
|
||||
Email: "json@test.com",
|
||||
Password: "Password123",
|
||||
FirstName: "JSON",
|
||||
LastName: "Test",
|
||||
}
|
||||
|
||||
w := testutil.MakeRequest(e, "POST", "/api/auth/register/", req, "")
|
||||
|
||||
testutil.AssertStatusCode(t, w, http.StatusCreated)
|
||||
|
||||
var response map[string]interface{}
|
||||
err := json.Unmarshal(w.Body.Bytes(), &response)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Verify top-level structure
|
||||
assert.Contains(t, response, "token")
|
||||
assert.Contains(t, response, "user")
|
||||
assert.Contains(t, response, "message")
|
||||
|
||||
// Verify token is not empty
|
||||
assert.NotEmpty(t, response["token"])
|
||||
|
||||
// Verify user structure
|
||||
user := response["user"].(map[string]interface{})
|
||||
assert.Contains(t, user, "id")
|
||||
assert.Contains(t, user, "username")
|
||||
assert.Contains(t, user, "email")
|
||||
assert.Contains(t, user, "first_name")
|
||||
assert.Contains(t, user, "last_name")
|
||||
assert.Contains(t, user, "is_active")
|
||||
assert.Contains(t, user, "date_joined")
|
||||
|
||||
// Verify types
|
||||
assert.IsType(t, float64(0), user["id"]) // JSON numbers are float64
|
||||
assert.IsType(t, "", user["username"])
|
||||
assert.IsType(t, "", user["email"])
|
||||
assert.IsType(t, true, user["is_active"])
|
||||
})
|
||||
|
||||
t.Run("error response has correct JSON structure", func(t *testing.T) {
|
||||
req := map[string]string{
|
||||
"username": "test",
|
||||
}
|
||||
|
||||
w := testutil.MakeRequest(e, "POST", "/api/auth/register/", req, "")
|
||||
|
||||
testutil.AssertStatusCode(t, w, http.StatusBadRequest)
|
||||
|
||||
var response map[string]interface{}
|
||||
err := json.Unmarshal(w.Body.Bytes(), &response)
|
||||
require.NoError(t, err)
|
||||
|
||||
assert.Contains(t, response, "error")
|
||||
assert.IsType(t, "", response["error"])
|
||||
})
|
||||
}
|
||||
|
||||
@@ -30,7 +30,7 @@ func (h *ContractorHandler) ListContractors(c echo.Context) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
response, err := h.contractorService.ListContractors(user.ID)
|
||||
response, err := h.contractorService.ListContractors(c.Request().Context(), user.ID)
|
||||
if err != nil {
|
||||
return apperrors.Internal(err)
|
||||
}
|
||||
@@ -48,7 +48,7 @@ func (h *ContractorHandler) GetContractor(c echo.Context) error {
|
||||
return apperrors.BadRequest("error.invalid_contractor_id")
|
||||
}
|
||||
|
||||
response, err := h.contractorService.GetContractor(uint(contractorID), user.ID)
|
||||
response, err := h.contractorService.GetContractor(c.Request().Context(), uint(contractorID), user.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -69,7 +69,7 @@ func (h *ContractorHandler) CreateContractor(c echo.Context) error {
|
||||
return err
|
||||
}
|
||||
|
||||
response, err := h.contractorService.CreateContractor(&req, user.ID)
|
||||
response, err := h.contractorService.CreateContractor(c.Request().Context(), &req, user.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -95,7 +95,7 @@ func (h *ContractorHandler) UpdateContractor(c echo.Context) error {
|
||||
return err
|
||||
}
|
||||
|
||||
response, err := h.contractorService.UpdateContractor(uint(contractorID), user.ID, &req)
|
||||
response, err := h.contractorService.UpdateContractor(c.Request().Context(), uint(contractorID), user.ID, &req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -113,7 +113,7 @@ func (h *ContractorHandler) DeleteContractor(c echo.Context) error {
|
||||
return apperrors.BadRequest("error.invalid_contractor_id")
|
||||
}
|
||||
|
||||
err = h.contractorService.DeleteContractor(uint(contractorID), user.ID)
|
||||
err = h.contractorService.DeleteContractor(c.Request().Context(), uint(contractorID), user.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -131,7 +131,7 @@ func (h *ContractorHandler) ToggleFavorite(c echo.Context) error {
|
||||
return apperrors.BadRequest("error.invalid_contractor_id")
|
||||
}
|
||||
|
||||
response, err := h.contractorService.ToggleFavorite(uint(contractorID), user.ID)
|
||||
response, err := h.contractorService.ToggleFavorite(c.Request().Context(), uint(contractorID), user.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -149,7 +149,7 @@ func (h *ContractorHandler) GetContractorTasks(c echo.Context) error {
|
||||
return apperrors.BadRequest("error.invalid_contractor_id")
|
||||
}
|
||||
|
||||
response, err := h.contractorService.GetContractorTasks(uint(contractorID), user.ID)
|
||||
response, err := h.contractorService.GetContractorTasks(c.Request().Context(), uint(contractorID), user.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -167,7 +167,7 @@ func (h *ContractorHandler) ListContractorsByResidence(c echo.Context) error {
|
||||
return apperrors.BadRequest("error.invalid_residence_id")
|
||||
}
|
||||
|
||||
response, err := h.contractorService.ListContractorsByResidence(uint(residenceID), user.ID)
|
||||
response, err := h.contractorService.ListContractorsByResidence(c.Request().Context(), uint(residenceID), user.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -176,7 +176,7 @@ func (h *ContractorHandler) ListContractorsByResidence(c echo.Context) error {
|
||||
|
||||
// GetSpecialties handles GET /api/contractors/specialties/
|
||||
func (h *ContractorHandler) GetSpecialties(c echo.Context) error {
|
||||
specialties, err := h.contractorService.GetSpecialties()
|
||||
specialties, err := h.contractorService.GetSpecialties(c.Request().Context())
|
||||
if err != nil {
|
||||
return apperrors.Internal(err)
|
||||
}
|
||||
|
||||
@@ -70,7 +70,7 @@ func (h *DocumentHandler) ListDocuments(c echo.Context) error {
|
||||
}
|
||||
}
|
||||
|
||||
response, err := h.documentService.ListDocuments(user.ID, filter)
|
||||
response, err := h.documentService.ListDocuments(c.Request().Context(), user.ID, filter)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -88,7 +88,7 @@ func (h *DocumentHandler) GetDocument(c echo.Context) error {
|
||||
return apperrors.BadRequest("error.invalid_document_id")
|
||||
}
|
||||
|
||||
response, err := h.documentService.GetDocument(uint(documentID), user.ID)
|
||||
response, err := h.documentService.GetDocument(c.Request().Context(), uint(documentID), user.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -101,7 +101,7 @@ func (h *DocumentHandler) ListWarranties(c echo.Context) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
response, err := h.documentService.ListWarranties(user.ID)
|
||||
response, err := h.documentService.ListWarranties(c.Request().Context(), user.ID)
|
||||
if err != nil {
|
||||
return apperrors.Internal(err)
|
||||
}
|
||||
@@ -201,7 +201,7 @@ func (h *DocumentHandler) CreateDocument(c echo.Context) error {
|
||||
if h.storageService == nil {
|
||||
return apperrors.Internal(nil)
|
||||
}
|
||||
result, err := h.storageService.Upload(uploadedFile, "documents")
|
||||
result, err := h.storageService.Upload(c.Request().Context(), uploadedFile, "documents")
|
||||
if err != nil {
|
||||
return apperrors.BadRequest("error.failed_to_upload_file")
|
||||
}
|
||||
@@ -222,7 +222,7 @@ func (h *DocumentHandler) CreateDocument(c echo.Context) error {
|
||||
return err
|
||||
}
|
||||
|
||||
response, err := h.documentService.CreateDocument(&req, user.ID)
|
||||
response, err := h.documentService.CreateDocument(c.Request().Context(), &req, user.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -248,7 +248,7 @@ func (h *DocumentHandler) UpdateDocument(c echo.Context) error {
|
||||
return err
|
||||
}
|
||||
|
||||
response, err := h.documentService.UpdateDocument(uint(documentID), user.ID, &req)
|
||||
response, err := h.documentService.UpdateDocument(c.Request().Context(), uint(documentID), user.ID, &req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -266,7 +266,7 @@ func (h *DocumentHandler) DeleteDocument(c echo.Context) error {
|
||||
return apperrors.BadRequest("error.invalid_document_id")
|
||||
}
|
||||
|
||||
err = h.documentService.DeleteDocument(uint(documentID), user.ID)
|
||||
err = h.documentService.DeleteDocument(c.Request().Context(), uint(documentID), user.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -284,7 +284,7 @@ func (h *DocumentHandler) ActivateDocument(c echo.Context) error {
|
||||
return apperrors.BadRequest("error.invalid_document_id")
|
||||
}
|
||||
|
||||
response, err := h.documentService.ActivateDocument(uint(documentID), user.ID)
|
||||
response, err := h.documentService.ActivateDocument(c.Request().Context(), uint(documentID), user.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -302,7 +302,7 @@ func (h *DocumentHandler) DeactivateDocument(c echo.Context) error {
|
||||
return apperrors.BadRequest("error.invalid_document_id")
|
||||
}
|
||||
|
||||
response, err := h.documentService.DeactivateDocument(uint(documentID), user.ID)
|
||||
response, err := h.documentService.DeactivateDocument(c.Request().Context(), uint(documentID), user.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -342,14 +342,14 @@ func (h *DocumentHandler) UploadDocumentImage(c echo.Context) error {
|
||||
return apperrors.Internal(nil)
|
||||
}
|
||||
|
||||
result, err := h.storageService.Upload(uploadedFile, "images")
|
||||
result, err := h.storageService.Upload(c.Request().Context(), uploadedFile, "images")
|
||||
if err != nil {
|
||||
return apperrors.BadRequest("error.failed_to_upload_file")
|
||||
}
|
||||
|
||||
caption := c.FormValue("caption")
|
||||
|
||||
response, err := h.documentService.UploadDocumentImage(uint(documentID), user.ID, result.URL, caption)
|
||||
response, err := h.documentService.UploadDocumentImage(c.Request().Context(), uint(documentID), user.ID, result.URL, caption)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -372,7 +372,7 @@ func (h *DocumentHandler) DeleteDocumentImage(c echo.Context) error {
|
||||
return apperrors.BadRequest("error.invalid_image_id")
|
||||
}
|
||||
|
||||
response, err := h.documentService.DeleteDocumentImage(uint(documentID), uint(imageID), user.ID)
|
||||
response, err := h.documentService.DeleteDocumentImage(c.Request().Context(), uint(documentID), uint(imageID), user.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -506,232 +506,6 @@ func TestTaskHandler_CreateCompletion_NoTaskID(t *testing.T) {
|
||||
})
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// Auth Handler - Additional Coverage
|
||||
// =============================================================================
|
||||
|
||||
func TestAuthHandler_AppleSignIn_NotConfigured(t *testing.T) {
|
||||
handler, e, _ := setupAuthHandler(t)
|
||||
|
||||
e.POST("/api/auth/apple-sign-in/", handler.AppleSignIn)
|
||||
|
||||
t.Run("returns 500 when apple auth not configured", func(t *testing.T) {
|
||||
req := map[string]interface{}{
|
||||
"id_token": "fake-token",
|
||||
"user_id": "fake-user-id",
|
||||
}
|
||||
w := testutil.MakeRequest(e, "POST", "/api/auth/apple-sign-in/", req, "")
|
||||
testutil.AssertStatusCode(t, w, http.StatusInternalServerError)
|
||||
})
|
||||
|
||||
t.Run("missing identity_token returns 400", func(t *testing.T) {
|
||||
req := map[string]interface{}{}
|
||||
w := testutil.MakeRequest(e, "POST", "/api/auth/apple-sign-in/", req, "")
|
||||
testutil.AssertStatusCode(t, w, http.StatusBadRequest)
|
||||
})
|
||||
}
|
||||
|
||||
func TestAuthHandler_GoogleSignIn_NotConfigured(t *testing.T) {
|
||||
handler, e, _ := setupAuthHandler(t)
|
||||
|
||||
e.POST("/api/auth/google-sign-in/", handler.GoogleSignIn)
|
||||
|
||||
t.Run("returns 500 when google auth not configured", func(t *testing.T) {
|
||||
req := map[string]interface{}{
|
||||
"id_token": "fake-token",
|
||||
}
|
||||
w := testutil.MakeRequest(e, "POST", "/api/auth/google-sign-in/", req, "")
|
||||
testutil.AssertStatusCode(t, w, http.StatusInternalServerError)
|
||||
})
|
||||
|
||||
t.Run("missing id_token returns 400", func(t *testing.T) {
|
||||
req := map[string]interface{}{}
|
||||
w := testutil.MakeRequest(e, "POST", "/api/auth/google-sign-in/", req, "")
|
||||
testutil.AssertStatusCode(t, w, http.StatusBadRequest)
|
||||
})
|
||||
}
|
||||
|
||||
// setupAuthHandlerWithDB is like setupAuthHandler but also returns the underlying *gorm.DB
|
||||
// for tests that need to create records like ConfirmationCode directly.
|
||||
func setupAuthHandlerWithDB(t *testing.T) (*AuthHandler, *echo.Echo, *gorm.DB) {
|
||||
db := testutil.SetupTestDB(t)
|
||||
userRepo := repositories.NewUserRepository(db)
|
||||
cfg := &config.Config{
|
||||
Security: config.SecurityConfig{
|
||||
SecretKey: "test-secret-key",
|
||||
PasswordResetExpiry: 15 * time.Minute,
|
||||
ConfirmationExpiry: 24 * time.Hour,
|
||||
MaxPasswordResetRate: 3,
|
||||
},
|
||||
}
|
||||
authService := services.NewAuthService(userRepo, cfg)
|
||||
handler := NewAuthHandler(authService, nil, nil)
|
||||
e := testutil.SetupTestRouter()
|
||||
return handler, e, db
|
||||
}
|
||||
|
||||
func TestAuthHandler_VerifyEmail(t *testing.T) {
|
||||
handler, e, db := setupAuthHandlerWithDB(t)
|
||||
|
||||
user := testutil.CreateTestUser(t, db, "verifytest", "verify@test.com", "Password123")
|
||||
|
||||
// Create confirmation code
|
||||
confirmCode := &models.ConfirmationCode{
|
||||
UserID: user.ID,
|
||||
Code: "123456",
|
||||
ExpiresAt: time.Now().Add(24 * time.Hour),
|
||||
IsUsed: false,
|
||||
}
|
||||
require.NoError(t, db.Create(confirmCode).Error)
|
||||
|
||||
authGroup := e.Group("/api/auth")
|
||||
authGroup.Use(testutil.MockAuthMiddleware(user))
|
||||
authGroup.POST("/verify-email/", handler.VerifyEmail)
|
||||
|
||||
t.Run("successful verification", func(t *testing.T) {
|
||||
req := requests.VerifyEmailRequest{
|
||||
Code: "123456",
|
||||
}
|
||||
w := testutil.MakeRequest(e, "POST", "/api/auth/verify-email/", req, "test-token")
|
||||
testutil.AssertStatusCode(t, w, http.StatusOK)
|
||||
|
||||
var response map[string]interface{}
|
||||
err := json.Unmarshal(w.Body.Bytes(), &response)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, true, response["verified"])
|
||||
})
|
||||
|
||||
t.Run("wrong code returns error", func(t *testing.T) {
|
||||
req := requests.VerifyEmailRequest{
|
||||
Code: "999999",
|
||||
}
|
||||
w := testutil.MakeRequest(e, "POST", "/api/auth/verify-email/", req, "test-token")
|
||||
// Code already used or wrong code
|
||||
assert.True(t, w.Code == http.StatusBadRequest || w.Code == http.StatusNotFound,
|
||||
"expected 400 or 404, got %d", w.Code)
|
||||
})
|
||||
|
||||
t.Run("missing code returns 400", func(t *testing.T) {
|
||||
req := map[string]interface{}{}
|
||||
w := testutil.MakeRequest(e, "POST", "/api/auth/verify-email/", req, "test-token")
|
||||
testutil.AssertStatusCode(t, w, http.StatusBadRequest)
|
||||
})
|
||||
}
|
||||
|
||||
func TestAuthHandler_ResendVerification(t *testing.T) {
|
||||
handler, e, db := setupAuthHandlerWithDB(t)
|
||||
|
||||
user := testutil.CreateTestUser(t, db, "resendtest", "resend@test.com", "Password123")
|
||||
|
||||
authGroup := e.Group("/api/auth")
|
||||
authGroup.Use(testutil.MockAuthMiddleware(user))
|
||||
authGroup.POST("/resend-verification/", handler.ResendVerification)
|
||||
|
||||
t.Run("successful resend", func(t *testing.T) {
|
||||
w := testutil.MakeRequest(e, "POST", "/api/auth/resend-verification/", nil, "test-token")
|
||||
testutil.AssertStatusCode(t, w, http.StatusOK)
|
||||
|
||||
var response map[string]interface{}
|
||||
err := json.Unmarshal(w.Body.Bytes(), &response)
|
||||
require.NoError(t, err)
|
||||
assert.Contains(t, response, "message")
|
||||
})
|
||||
}
|
||||
|
||||
func TestAuthHandler_RefreshToken(t *testing.T) {
|
||||
handler, e, db := setupAuthHandlerWithDB(t)
|
||||
|
||||
user := testutil.CreateTestUser(t, db, "refreshtest", "refresh@test.com", "Password123")
|
||||
|
||||
// Create auth token and use its actual key in the middleware
|
||||
authToken := testutil.CreateTestToken(t, db, user.ID)
|
||||
|
||||
authGroup := e.Group("/api/auth")
|
||||
authGroup.Use(func(next echo.HandlerFunc) echo.HandlerFunc {
|
||||
return func(c echo.Context) error {
|
||||
c.Set("auth_user", user)
|
||||
c.Set("auth_token", authToken.Key)
|
||||
return next(c)
|
||||
}
|
||||
})
|
||||
authGroup.POST("/refresh/", handler.RefreshToken)
|
||||
|
||||
t.Run("successful refresh", func(t *testing.T) {
|
||||
w := testutil.MakeRequest(e, "POST", "/api/auth/refresh/", nil, authToken.Key)
|
||||
testutil.AssertStatusCode(t, w, http.StatusOK)
|
||||
|
||||
var response map[string]interface{}
|
||||
err := json.Unmarshal(w.Body.Bytes(), &response)
|
||||
require.NoError(t, err)
|
||||
assert.Contains(t, response, "token")
|
||||
})
|
||||
}
|
||||
|
||||
func TestAuthHandler_VerifyResetCode(t *testing.T) {
|
||||
handler, e, _ := setupAuthHandler(t)
|
||||
|
||||
e.POST("/api/auth/register/", handler.Register)
|
||||
e.POST("/api/auth/verify-reset-code/", handler.VerifyResetCode)
|
||||
|
||||
t.Run("invalid code returns error", func(t *testing.T) {
|
||||
req := requests.VerifyResetCodeRequest{
|
||||
Email: "nonexistent@test.com",
|
||||
Code: "999999",
|
||||
}
|
||||
w := testutil.MakeRequest(e, "POST", "/api/auth/verify-reset-code/", req, "")
|
||||
// Should not be 200 since no valid code exists
|
||||
assert.NotEqual(t, http.StatusOK, w.Code)
|
||||
})
|
||||
|
||||
t.Run("missing fields returns 400", func(t *testing.T) {
|
||||
req := map[string]interface{}{}
|
||||
w := testutil.MakeRequest(e, "POST", "/api/auth/verify-reset-code/", req, "")
|
||||
testutil.AssertStatusCode(t, w, http.StatusBadRequest)
|
||||
})
|
||||
}
|
||||
|
||||
func TestAuthHandler_ResetPassword(t *testing.T) {
|
||||
handler, e, _ := setupAuthHandler(t)
|
||||
|
||||
e.POST("/api/auth/reset-password/", handler.ResetPassword)
|
||||
|
||||
t.Run("invalid reset token returns error", func(t *testing.T) {
|
||||
req := requests.ResetPasswordRequest{
|
||||
ResetToken: "invalid-token",
|
||||
NewPassword: "NewPassword123",
|
||||
}
|
||||
w := testutil.MakeRequest(e, "POST", "/api/auth/reset-password/", req, "")
|
||||
assert.NotEqual(t, http.StatusOK, w.Code)
|
||||
})
|
||||
|
||||
t.Run("missing fields returns 400", func(t *testing.T) {
|
||||
req := map[string]interface{}{}
|
||||
w := testutil.MakeRequest(e, "POST", "/api/auth/reset-password/", req, "")
|
||||
testutil.AssertStatusCode(t, w, http.StatusBadRequest)
|
||||
})
|
||||
|
||||
t.Run("short password returns 400", func(t *testing.T) {
|
||||
req := requests.ResetPasswordRequest{
|
||||
ResetToken: "some-token",
|
||||
NewPassword: "short",
|
||||
}
|
||||
w := testutil.MakeRequest(e, "POST", "/api/auth/reset-password/", req, "")
|
||||
testutil.AssertStatusCode(t, w, http.StatusBadRequest)
|
||||
})
|
||||
}
|
||||
|
||||
func TestAuthHandler_ForgotPassword_MissingEmail(t *testing.T) {
|
||||
handler, e, _ := setupAuthHandler(t)
|
||||
|
||||
e.POST("/api/auth/forgot-password/", handler.ForgotPassword)
|
||||
|
||||
t.Run("missing email returns 400", func(t *testing.T) {
|
||||
req := map[string]interface{}{}
|
||||
w := testutil.MakeRequest(e, "POST", "/api/auth/forgot-password/", req, "")
|
||||
testutil.AssertStatusCode(t, w, http.StatusBadRequest)
|
||||
})
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// Residence Handler - Additional Error Paths
|
||||
// =============================================================================
|
||||
@@ -1781,45 +1555,11 @@ func TestStaticDataHandler_RefreshStaticData(t *testing.T) {
|
||||
// =============================================================================
|
||||
// Upload Handler - Additional Error Paths
|
||||
// =============================================================================
|
||||
|
||||
func TestUploadHandler_UploadImage_NoFile(t *testing.T) {
|
||||
storageSvc := newTestStorageService("/var/uploads")
|
||||
handler := NewUploadHandler(storageSvc, nil)
|
||||
e := testutil.SetupTestRouter()
|
||||
|
||||
e.POST("/api/uploads/image", handler.UploadImage)
|
||||
|
||||
t.Run("no file returns 400", func(t *testing.T) {
|
||||
w := testutil.MakeRequest(e, "POST", "/api/uploads/image", nil, "")
|
||||
testutil.AssertStatusCode(t, w, http.StatusBadRequest)
|
||||
})
|
||||
}
|
||||
|
||||
func TestUploadHandler_UploadDocument_NoFile(t *testing.T) {
|
||||
storageSvc := newTestStorageService("/var/uploads")
|
||||
handler := NewUploadHandler(storageSvc, nil)
|
||||
e := testutil.SetupTestRouter()
|
||||
|
||||
e.POST("/api/uploads/document", handler.UploadDocument)
|
||||
|
||||
t.Run("no file returns 400", func(t *testing.T) {
|
||||
w := testutil.MakeRequest(e, "POST", "/api/uploads/document", nil, "")
|
||||
testutil.AssertStatusCode(t, w, http.StatusBadRequest)
|
||||
})
|
||||
}
|
||||
|
||||
func TestUploadHandler_UploadCompletion_NoFile(t *testing.T) {
|
||||
storageSvc := newTestStorageService("/var/uploads")
|
||||
handler := NewUploadHandler(storageSvc, nil)
|
||||
e := testutil.SetupTestRouter()
|
||||
|
||||
e.POST("/api/uploads/completion", handler.UploadCompletion)
|
||||
|
||||
t.Run("no file returns 400", func(t *testing.T) {
|
||||
w := testutil.MakeRequest(e, "POST", "/api/uploads/completion", nil, "")
|
||||
testutil.AssertStatusCode(t, w, http.StatusBadRequest)
|
||||
})
|
||||
}
|
||||
//
|
||||
// Multipart upload handlers (UploadImage / UploadDocument / UploadCompletion)
|
||||
// were removed alongside the legacy /api/uploads/{image,document,completion}
|
||||
// routes. The presigned-URL flow (POST /api/uploads/presign) is exercised by
|
||||
// integration tests that hit the full pipeline.
|
||||
|
||||
func TestUploadHandler_DeleteFile_OwnershipDenied(t *testing.T) {
|
||||
storageSvc := newTestStorageService("/var/uploads")
|
||||
|
||||
@@ -37,6 +37,23 @@ func NewMediaHandler(
|
||||
}
|
||||
}
|
||||
|
||||
// safeContentDisposition builds an inline Content-Disposition header value
// with a sanitized filename (audit M1).
//
// C0 and C1 control characters (including CR/LF and DEL), double-quote and
// backslash are stripped so an attacker-controlled upload filename cannot
// break out of the quoted-string or inject additional response headers
// (CWE-113). If nothing is left after stripping, the name "download" is used.
//
// For names made only of ASCII the result is `inline; filename="<name>"`.
// When the sanitized name contains non-ASCII characters, an RFC 8187
// `filename*=UTF-8''<percent-encoded>` parameter is appended after the quoted
// fallback so clients that support it decode the name unambiguously.
func safeContentDisposition(filename string) string {
	hasNonASCII := false
	cleaned := strings.Map(func(r rune) rune {
		switch {
		case r < 0x20, r >= 0x7f && r <= 0x9f, r == '"', r == '\\':
			// Controls (C0, DEL, C1) and quoted-string metacharacters are dropped.
			return -1
		case r > 0x9f:
			hasNonASCII = true
		}
		return r
	}, filename)
	if cleaned == "" {
		cleaned = "download"
	}

	value := `inline; filename="` + cleaned + `"`
	if !hasNonASCII {
		return value
	}

	// RFC 8187 ext-value: bytes outside attr-char are percent-encoded.
	const (
		extPrefix   = "; filename*=UTF-8''"
		attrSymbols = "!#$&+-.^_`|~"
		hexDigits   = "0123456789ABCDEF"
	)
	var b strings.Builder
	b.Grow(len(value) + len(extPrefix) + 3*len(cleaned))
	b.WriteString(value)
	b.WriteString(extPrefix)
	for i := 0; i < len(cleaned); i++ {
		c := cleaned[i]
		switch {
		case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z', '0' <= c && c <= '9',
			strings.IndexByte(attrSymbols, c) >= 0:
			b.WriteByte(c)
		default:
			b.WriteByte('%')
			b.WriteByte(hexDigits[c>>4])
			b.WriteByte(hexDigits[c&0x0f])
		}
	}
	return b.String()
}
|
||||
|
||||
// ServeDocument serves a document file with access control
|
||||
// GET /api/media/document/:id
|
||||
func (h *MediaHandler) ServeDocument(c echo.Context) error {
|
||||
@@ -71,7 +88,7 @@ func (h *MediaHandler) ServeDocument(c echo.Context) error {
|
||||
// Set caching and disposition headers
|
||||
c.Response().Header().Set("Cache-Control", "private, max-age=3600")
|
||||
if doc.FileName != "" {
|
||||
c.Response().Header().Set("Content-Disposition", "inline; filename=\""+doc.FileName+"\"")
|
||||
c.Response().Header().Set("Content-Disposition", safeContentDisposition(doc.FileName))
|
||||
}
|
||||
return c.Blob(http.StatusOK, mimeType, data)
|
||||
}
|
||||
@@ -114,7 +131,7 @@ func (h *MediaHandler) ServeDocumentImage(c echo.Context) error {
|
||||
}
|
||||
|
||||
c.Response().Header().Set("Cache-Control", "private, max-age=3600")
|
||||
c.Response().Header().Set("Content-Disposition", "inline; filename=\""+filepath.Base(img.ImageURL)+"\"")
|
||||
c.Response().Header().Set("Content-Disposition", safeContentDisposition(filepath.Base(img.ImageURL)))
|
||||
return c.Blob(http.StatusOK, mimeType, data)
|
||||
}
|
||||
|
||||
@@ -162,7 +179,7 @@ func (h *MediaHandler) ServeCompletionImage(c echo.Context) error {
|
||||
}
|
||||
|
||||
c.Response().Header().Set("Cache-Control", "private, max-age=3600")
|
||||
c.Response().Header().Set("Content-Disposition", "inline; filename=\""+filepath.Base(img.ImageURL)+"\"")
|
||||
c.Response().Header().Set("Content-Disposition", safeContentDisposition(filepath.Base(img.ImageURL)))
|
||||
return c.Blob(http.StatusOK, mimeType, data)
|
||||
}
|
||||
|
||||
|
||||
@@ -46,7 +46,7 @@ func (h *NotificationHandler) ListNotifications(c echo.Context) error {
|
||||
}
|
||||
}
|
||||
|
||||
notifications, err := h.notificationService.GetNotifications(user.ID, limit, offset)
|
||||
notifications, err := h.notificationService.GetNotifications(c.Request().Context(), user.ID, limit, offset)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -64,7 +64,7 @@ func (h *NotificationHandler) GetUnreadCount(c echo.Context) error {
|
||||
return err
|
||||
}
|
||||
|
||||
count, err := h.notificationService.GetUnreadCount(user.ID)
|
||||
count, err := h.notificationService.GetUnreadCount(c.Request().Context(), user.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -84,7 +84,7 @@ func (h *NotificationHandler) MarkAsRead(c echo.Context) error {
|
||||
return apperrors.BadRequest("error.invalid_notification_id")
|
||||
}
|
||||
|
||||
err = h.notificationService.MarkAsRead(uint(notificationID), user.ID)
|
||||
err = h.notificationService.MarkAsRead(c.Request().Context(), uint(notificationID), user.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -99,7 +99,7 @@ func (h *NotificationHandler) MarkAllAsRead(c echo.Context) error {
|
||||
return err
|
||||
}
|
||||
|
||||
err = h.notificationService.MarkAllAsRead(user.ID)
|
||||
err = h.notificationService.MarkAllAsRead(c.Request().Context(), user.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -114,7 +114,7 @@ func (h *NotificationHandler) GetPreferences(c echo.Context) error {
|
||||
return err
|
||||
}
|
||||
|
||||
prefs, err := h.notificationService.GetPreferences(user.ID)
|
||||
prefs, err := h.notificationService.GetPreferences(c.Request().Context(), user.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -137,7 +137,7 @@ func (h *NotificationHandler) UpdatePreferences(c echo.Context) error {
|
||||
return err
|
||||
}
|
||||
|
||||
prefs, err := h.notificationService.UpdatePreferences(user.ID, &req)
|
||||
prefs, err := h.notificationService.UpdatePreferences(c.Request().Context(), user.ID, &req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -160,7 +160,7 @@ func (h *NotificationHandler) RegisterDevice(c echo.Context) error {
|
||||
return err
|
||||
}
|
||||
|
||||
device, err := h.notificationService.RegisterDevice(user.ID, &req)
|
||||
device, err := h.notificationService.RegisterDevice(c.Request().Context(), user.ID, &req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -175,7 +175,7 @@ func (h *NotificationHandler) ListDevices(c echo.Context) error {
|
||||
return err
|
||||
}
|
||||
|
||||
devices, err := h.notificationService.ListDevices(user.ID)
|
||||
devices, err := h.notificationService.ListDevices(c.Request().Context(), user.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -208,7 +208,7 @@ func (h *NotificationHandler) UnregisterDevice(c echo.Context) error {
|
||||
return apperrors.BadRequest("error.invalid_platform")
|
||||
}
|
||||
|
||||
err = h.notificationService.UnregisterDevice(req.RegistrationID, req.Platform, user.ID)
|
||||
err = h.notificationService.UnregisterDevice(c.Request().Context(), req.RegistrationID, req.Platform, user.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -236,7 +236,7 @@ func (h *NotificationHandler) DeleteDevice(c echo.Context) error {
|
||||
return apperrors.BadRequest("error.invalid_platform")
|
||||
}
|
||||
|
||||
err = h.notificationService.DeleteDevice(uint(deviceID), platform, user.ID)
|
||||
err = h.notificationService.DeleteDevice(c.Request().Context(), uint(deviceID), platform, user.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -39,7 +39,7 @@ func (h *ResidenceHandler) ListResidences(c echo.Context) error {
|
||||
return err
|
||||
}
|
||||
|
||||
response, err := h.residenceService.ListResidences(user.ID)
|
||||
response, err := h.residenceService.ListResidences(c.Request().Context(), user.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -55,7 +55,7 @@ func (h *ResidenceHandler) GetMyResidences(c echo.Context) error {
|
||||
}
|
||||
userNow := middleware.GetUserNow(c)
|
||||
|
||||
response, err := h.residenceService.GetMyResidences(user.ID, userNow)
|
||||
response, err := h.residenceService.GetMyResidences(c.Request().Context(), user.ID, userNow)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -72,7 +72,7 @@ func (h *ResidenceHandler) GetSummary(c echo.Context) error {
|
||||
}
|
||||
userNow := middleware.GetUserNow(c)
|
||||
|
||||
summary, err := h.residenceService.GetSummary(user.ID, userNow)
|
||||
summary, err := h.residenceService.GetSummary(c.Request().Context(), user.ID, userNow)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -93,7 +93,7 @@ func (h *ResidenceHandler) GetResidence(c echo.Context) error {
|
||||
}
|
||||
|
||||
userNow := middleware.GetUserNow(c)
|
||||
response, err := h.residenceService.GetResidence(uint(residenceID), user.ID, userNow)
|
||||
response, err := h.residenceService.GetResidence(c.Request().Context(), uint(residenceID), user.ID, userNow)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -116,7 +116,7 @@ func (h *ResidenceHandler) CreateResidence(c echo.Context) error {
|
||||
return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(err))
|
||||
}
|
||||
|
||||
response, err := h.residenceService.CreateResidence(&req, user.ID)
|
||||
response, err := h.residenceService.CreateResidence(c.Request().Context(), &req, user.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -144,7 +144,7 @@ func (h *ResidenceHandler) UpdateResidence(c echo.Context) error {
|
||||
return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(err))
|
||||
}
|
||||
|
||||
response, err := h.residenceService.UpdateResidence(uint(residenceID), user.ID, &req)
|
||||
response, err := h.residenceService.UpdateResidence(c.Request().Context(), uint(residenceID), user.ID, &req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -164,7 +164,7 @@ func (h *ResidenceHandler) DeleteResidence(c echo.Context) error {
|
||||
return apperrors.BadRequest("error.invalid_residence_id")
|
||||
}
|
||||
|
||||
response, err := h.residenceService.DeleteResidence(uint(residenceID), user.ID)
|
||||
response, err := h.residenceService.DeleteResidence(c.Request().Context(), uint(residenceID), user.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -185,7 +185,7 @@ func (h *ResidenceHandler) GetShareCode(c echo.Context) error {
|
||||
return apperrors.BadRequest("error.invalid_residence_id")
|
||||
}
|
||||
|
||||
shareCode, err := h.residenceService.GetShareCode(uint(residenceID), user.ID)
|
||||
shareCode, err := h.residenceService.GetShareCode(c.Request().Context(), uint(residenceID), user.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -213,7 +213,7 @@ func (h *ResidenceHandler) GenerateShareCode(c echo.Context) error {
|
||||
// Request body is optional
|
||||
c.Bind(&req)
|
||||
|
||||
response, err := h.residenceService.GenerateShareCode(uint(residenceID), user.ID, req.ExpiresInHours)
|
||||
response, err := h.residenceService.GenerateShareCode(c.Request().Context(), uint(residenceID), user.ID, req.ExpiresInHours)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -238,7 +238,7 @@ func (h *ResidenceHandler) GenerateSharePackage(c echo.Context) error {
|
||||
// Request body is optional (for expires_in_hours)
|
||||
c.Bind(&req)
|
||||
|
||||
response, err := h.residenceService.GenerateSharePackage(uint(residenceID), user.ID, req.ExpiresInHours)
|
||||
response, err := h.residenceService.GenerateSharePackage(c.Request().Context(), uint(residenceID), user.ID, req.ExpiresInHours)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -261,7 +261,7 @@ func (h *ResidenceHandler) JoinWithCode(c echo.Context) error {
|
||||
return err
|
||||
}
|
||||
|
||||
response, err := h.residenceService.JoinWithCode(req.Code, user.ID)
|
||||
response, err := h.residenceService.JoinWithCode(c.Request().Context(), req.Code, user.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -281,7 +281,7 @@ func (h *ResidenceHandler) GetResidenceUsers(c echo.Context) error {
|
||||
return apperrors.BadRequest("error.invalid_residence_id")
|
||||
}
|
||||
|
||||
users, err := h.residenceService.GetResidenceUsers(uint(residenceID), user.ID)
|
||||
users, err := h.residenceService.GetResidenceUsers(c.Request().Context(), uint(residenceID), user.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -306,7 +306,7 @@ func (h *ResidenceHandler) RemoveResidenceUser(c echo.Context) error {
|
||||
return apperrors.BadRequest("error.invalid_user_id")
|
||||
}
|
||||
|
||||
err = h.residenceService.RemoveUser(uint(residenceID), uint(userIDToRemove), user.ID)
|
||||
err = h.residenceService.RemoveUser(c.Request().Context(), uint(residenceID), uint(userIDToRemove), user.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -316,7 +316,7 @@ func (h *ResidenceHandler) RemoveResidenceUser(c echo.Context) error {
|
||||
|
||||
// GetResidenceTypes handles GET /api/residences/types/
|
||||
func (h *ResidenceHandler) GetResidenceTypes(c echo.Context) error {
|
||||
types, err := h.residenceService.GetResidenceTypes()
|
||||
types, err := h.residenceService.GetResidenceTypes(c.Request().Context())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -348,7 +348,7 @@ func (h *ResidenceHandler) GenerateTasksReport(c echo.Context) error {
|
||||
c.Bind(&req)
|
||||
|
||||
// Generate the report data
|
||||
report, err := h.residenceService.GenerateTasksReport(uint(residenceID), user.ID)
|
||||
report, err := h.residenceService.GenerateTasksReport(c.Request().Context(), uint(residenceID), user.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
@@ -324,7 +325,7 @@ func TestResidenceHandler_JoinWithCode(t *testing.T) {
|
||||
userRepo := repositories.NewUserRepository(db)
|
||||
cfg := &config.Config{}
|
||||
residenceService := services.NewResidenceService(residenceRepo, userRepo, cfg)
|
||||
shareResp, _ := residenceService.GenerateShareCode(residence.ID, owner.ID, 24)
|
||||
shareResp, _ := residenceService.GenerateShareCode(context.Background(), residence.ID, owner.ID, 24)
|
||||
|
||||
authGroup := e.Group("/api/residences")
|
||||
authGroup.Use(testutil.MockAuthMiddleware(newUser))
|
||||
@@ -357,7 +358,7 @@ func TestResidenceHandler_JoinWithCode(t *testing.T) {
|
||||
|
||||
t.Run("owner tries to join own residence", func(t *testing.T) {
|
||||
// Generate new code
|
||||
shareResp2, _ := residenceService.GenerateShareCode(residence.ID, owner.ID, 24)
|
||||
shareResp2, _ := residenceService.GenerateShareCode(context.Background(), residence.ID, owner.ID, 24)
|
||||
|
||||
req := requests.JoinWithCodeRequest{
|
||||
Code: shareResp2.ShareCode.Code,
|
||||
|
||||
@@ -15,12 +15,15 @@ import (
|
||||
|
||||
// SeededDataResponse represents the unified seeded data response
|
||||
type SeededDataResponse struct {
|
||||
ResidenceTypes interface{} `json:"residence_types"`
|
||||
TaskCategories interface{} `json:"task_categories"`
|
||||
TaskPriorities interface{} `json:"task_priorities"`
|
||||
TaskFrequencies interface{} `json:"task_frequencies"`
|
||||
ContractorSpecialties interface{} `json:"contractor_specialties"`
|
||||
TaskTemplates responses.TaskTemplatesGroupedResponse `json:"task_templates"`
|
||||
ResidenceTypes interface{} `json:"residence_types"`
|
||||
TaskCategories interface{} `json:"task_categories"`
|
||||
TaskPriorities interface{} `json:"task_priorities"`
|
||||
TaskFrequencies interface{} `json:"task_frequencies"`
|
||||
ContractorSpecialties interface{} `json:"contractor_specialties"`
|
||||
TaskTemplates responses.TaskTemplatesGroupedResponse `json:"task_templates"`
|
||||
HomeProfileOptions map[string][]services.HomeProfileOption `json:"home_profile_options"`
|
||||
DocumentTypes []services.HomeProfileOption `json:"document_types"`
|
||||
DocumentCategories []services.HomeProfileOption `json:"document_categories"`
|
||||
}
|
||||
|
||||
// StaticDataHandler handles static/lookup data endpoints
|
||||
@@ -54,13 +57,18 @@ func NewStaticDataHandler(
|
||||
func (h *StaticDataHandler) GetStaticData(c echo.Context) error {
|
||||
ctx := c.Request().Context()
|
||||
|
||||
// Lookup display labels and home-profile options are localized for the
|
||||
// request's language, so the cache + ETag are keyed by locale.
|
||||
locale := i18n.GetLocale(c)
|
||||
localizer := i18n.GetLocalizer(c)
|
||||
|
||||
// Check If-None-Match header for conditional request
|
||||
// Strip W/ prefix if present (added by reverse proxy, but we store without it)
|
||||
clientETag := strings.TrimPrefix(c.Request().Header.Get("If-None-Match"), "W/")
|
||||
|
||||
// Try to get cached ETag first (fast path for 304 responses)
|
||||
if h.cache != nil && clientETag != "" {
|
||||
cachedETag, err := h.cache.GetSeededDataETag(ctx)
|
||||
cachedETag, err := h.cache.GetSeededDataETag(ctx, locale)
|
||||
if err == nil && cachedETag == clientETag {
|
||||
// Client has the latest data, return 304 Not Modified
|
||||
return c.NoContent(http.StatusNotModified)
|
||||
@@ -70,10 +78,10 @@ func (h *StaticDataHandler) GetStaticData(c echo.Context) error {
|
||||
// Try to get cached seeded data
|
||||
if h.cache != nil {
|
||||
var cachedData SeededDataResponse
|
||||
err := h.cache.GetCachedSeededData(ctx, &cachedData)
|
||||
err := h.cache.GetCachedSeededData(ctx, locale, &cachedData)
|
||||
if err == nil {
|
||||
// Cache hit - get the ETag and return data
|
||||
etag, etagErr := h.cache.GetSeededDataETag(ctx)
|
||||
etag, etagErr := h.cache.GetSeededDataETag(ctx, locale)
|
||||
if etagErr == nil {
|
||||
c.Response().Header().Set("ETag", etag)
|
||||
c.Response().Header().Set("Cache-Control", "private, max-age=3600")
|
||||
@@ -86,27 +94,27 @@ func (h *StaticDataHandler) GetStaticData(c echo.Context) error {
|
||||
}
|
||||
|
||||
// Cache miss - fetch all data from services
|
||||
residenceTypes, err := h.residenceService.GetResidenceTypes()
|
||||
residenceTypes, err := h.residenceService.GetResidenceTypes(c.Request().Context())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
taskCategories, err := h.taskService.GetCategories()
|
||||
taskCategories, err := h.taskService.GetCategories(c.Request().Context())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
taskPriorities, err := h.taskService.GetPriorities()
|
||||
taskPriorities, err := h.taskService.GetPriorities(c.Request().Context())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
taskFrequencies, err := h.taskService.GetFrequencies()
|
||||
taskFrequencies, err := h.taskService.GetFrequencies(c.Request().Context())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
contractorSpecialties, err := h.contractorService.GetSpecialties()
|
||||
contractorSpecialties, err := h.contractorService.GetSpecialties(c.Request().Context())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -116,6 +124,9 @@ func (h *StaticDataHandler) GetStaticData(c echo.Context) error {
|
||||
return err
|
||||
}
|
||||
|
||||
// Localize the lookup display_name fields in place for this request's locale.
|
||||
services.LocalizeLookups(localizer, residenceTypes, taskCategories, taskPriorities, taskFrequencies, contractorSpecialties)
|
||||
|
||||
// Build response
|
||||
seededData := SeededDataResponse{
|
||||
ResidenceTypes: residenceTypes,
|
||||
@@ -124,11 +135,14 @@ func (h *StaticDataHandler) GetStaticData(c echo.Context) error {
|
||||
TaskFrequencies: taskFrequencies,
|
||||
ContractorSpecialties: contractorSpecialties,
|
||||
TaskTemplates: taskTemplates,
|
||||
HomeProfileOptions: services.BuildHomeProfileOptions(localizer),
|
||||
DocumentTypes: services.BuildDocumentTypes(localizer),
|
||||
DocumentCategories: services.BuildDocumentCategories(localizer),
|
||||
}
|
||||
|
||||
// Cache the data and get ETag
|
||||
// Cache the data and get ETag (per-locale)
|
||||
if h.cache != nil {
|
||||
etag, cacheErr := h.cache.CacheSeededData(ctx, seededData)
|
||||
etag, cacheErr := h.cache.CacheSeededData(ctx, locale, seededData)
|
||||
if cacheErr != nil {
|
||||
log.Warn().Err(cacheErr).Msg("Failed to cache seeded data")
|
||||
} else {
|
||||
|
||||
@@ -32,7 +32,7 @@ func (h *SubscriptionHandler) GetSubscription(c echo.Context) error {
|
||||
return err
|
||||
}
|
||||
|
||||
subscription, err := h.subscriptionService.GetSubscription(user.ID)
|
||||
subscription, err := h.subscriptionService.GetSubscription(c.Request().Context(), user.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -47,7 +47,7 @@ func (h *SubscriptionHandler) GetSubscriptionStatus(c echo.Context) error {
|
||||
return err
|
||||
}
|
||||
|
||||
status, err := h.subscriptionService.GetSubscriptionStatus(user.ID)
|
||||
status, err := h.subscriptionService.GetSubscriptionStatus(c.Request().Context(), user.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -59,7 +59,7 @@ func (h *SubscriptionHandler) GetSubscriptionStatus(c echo.Context) error {
|
||||
func (h *SubscriptionHandler) GetUpgradeTrigger(c echo.Context) error {
|
||||
key := c.Param("key")
|
||||
|
||||
trigger, err := h.subscriptionService.GetUpgradeTrigger(key)
|
||||
trigger, err := h.subscriptionService.GetUpgradeTrigger(c.Request().Context(), key)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -69,7 +69,7 @@ func (h *SubscriptionHandler) GetUpgradeTrigger(c echo.Context) error {
|
||||
|
||||
// GetAllUpgradeTriggers handles GET /api/subscription/upgrade-triggers/
|
||||
func (h *SubscriptionHandler) GetAllUpgradeTriggers(c echo.Context) error {
|
||||
triggers, err := h.subscriptionService.GetAllUpgradeTriggers()
|
||||
triggers, err := h.subscriptionService.GetAllUpgradeTriggers(c.Request().Context())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -79,7 +79,7 @@ func (h *SubscriptionHandler) GetAllUpgradeTriggers(c echo.Context) error {
|
||||
|
||||
// GetFeatureBenefits handles GET /api/subscription/features/
|
||||
func (h *SubscriptionHandler) GetFeatureBenefits(c echo.Context) error {
|
||||
benefits, err := h.subscriptionService.GetFeatureBenefits()
|
||||
benefits, err := h.subscriptionService.GetFeatureBenefits(c.Request().Context())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -94,7 +94,7 @@ func (h *SubscriptionHandler) GetPromotions(c echo.Context) error {
|
||||
return err
|
||||
}
|
||||
|
||||
promotions, err := h.subscriptionService.GetActivePromotions(user.ID)
|
||||
promotions, err := h.subscriptionService.GetActivePromotions(c.Request().Context(), user.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -125,12 +125,12 @@ func (h *SubscriptionHandler) ProcessPurchase(c echo.Context) error {
|
||||
if req.TransactionID == "" && req.ReceiptData == "" {
|
||||
return apperrors.BadRequest("error.receipt_data_required")
|
||||
}
|
||||
subscription, err = h.subscriptionService.ProcessApplePurchase(user.ID, req.ReceiptData, req.TransactionID)
|
||||
subscription, err = h.subscriptionService.ProcessApplePurchase(c.Request().Context(), user.ID, req.ReceiptData, req.TransactionID)
|
||||
case "android":
|
||||
if req.PurchaseToken == "" {
|
||||
return apperrors.BadRequest("error.purchase_token_required")
|
||||
}
|
||||
subscription, err = h.subscriptionService.ProcessGooglePurchase(user.ID, req.PurchaseToken, req.ProductID)
|
||||
subscription, err = h.subscriptionService.ProcessGooglePurchase(c.Request().Context(), user.ID, req.PurchaseToken, req.ProductID)
|
||||
default:
|
||||
return apperrors.BadRequest("error.invalid_platform")
|
||||
}
|
||||
@@ -152,7 +152,7 @@ func (h *SubscriptionHandler) CancelSubscription(c echo.Context) error {
|
||||
return err
|
||||
}
|
||||
|
||||
subscription, err := h.subscriptionService.CancelSubscription(user.ID)
|
||||
subscription, err := h.subscriptionService.CancelSubscription(c.Request().Context(), user.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -187,12 +187,12 @@ func (h *SubscriptionHandler) RestoreSubscription(c echo.Context) error {
|
||||
if req.ReceiptData == "" && req.TransactionID == "" {
|
||||
return apperrors.BadRequest("error.receipt_data_required")
|
||||
}
|
||||
subscription, err = h.subscriptionService.ProcessApplePurchase(user.ID, req.ReceiptData, req.TransactionID)
|
||||
subscription, err = h.subscriptionService.ProcessApplePurchase(c.Request().Context(), user.ID, req.ReceiptData, req.TransactionID)
|
||||
case "android":
|
||||
if req.PurchaseToken == "" {
|
||||
return apperrors.BadRequest("error.purchase_token_required")
|
||||
}
|
||||
subscription, err = h.subscriptionService.ProcessGooglePurchase(user.ID, req.PurchaseToken, req.ProductID)
|
||||
subscription, err = h.subscriptionService.ProcessGooglePurchase(c.Request().Context(), user.ID, req.PurchaseToken, req.ProductID)
|
||||
default:
|
||||
return apperrors.BadRequest("error.invalid_platform")
|
||||
}
|
||||
@@ -220,7 +220,7 @@ func (h *SubscriptionHandler) CreateCheckoutSession(c echo.Context) error {
|
||||
}
|
||||
|
||||
// Check if already Pro from another platform
|
||||
alreadyPro, existingPlatform, err := h.subscriptionService.IsAlreadyProFromOtherPlatform(user.ID, "stripe")
|
||||
alreadyPro, existingPlatform, err := h.subscriptionService.IsAlreadyProFromOtherPlatform(c.Request().Context(), user.ID, "stripe")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -244,7 +244,7 @@ func (h *SubscriptionHandler) CreateCheckoutSession(c echo.Context) error {
|
||||
return err
|
||||
}
|
||||
|
||||
sessionURL, err := h.stripeService.CreateCheckoutSession(user.ID, req.PriceID, req.SuccessURL, req.CancelURL)
|
||||
sessionURL, err := h.stripeService.CreateCheckoutSession(c.Request().Context(), user.ID, req.PriceID, req.SuccessURL, req.CancelURL)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user