diff --git a/Dockerfile b/Dockerfile index 09bda44..789e909 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # Admin panel build stage -FROM node:20-alpine AS admin-builder +FROM node:20-alpine@sha256:fb4cd12c85ee03686f6af5362a0b0d56d50c58a04632e6c0fb8363f609372293 AS admin-builder WORKDIR /app @@ -109,7 +109,7 @@ FROM go-base AS worker CMD ["/app/worker"] # Admin panel runtime stage -FROM node:20-alpine AS admin +FROM node:20-alpine@sha256:fb4cd12c85ee03686f6af5362a0b0d56d50c58a04632e6c0fb8363f609372293 AS admin WORKDIR /app @@ -131,7 +131,7 @@ ENV HOSTNAME="0.0.0.0" CMD ["node", "server.js"] # Default production stage (for Dokku - runs API + Admin) -FROM node:20-alpine AS production +FROM node:20-alpine@sha256:fb4cd12c85ee03686f6af5362a0b0d56d50c58a04632e6c0fb8363f609372293 AS production # Install runtime dependencies RUN apk add --no-cache ca-certificates tzdata curl diff --git a/cmd/api/main.go b/cmd/api/main.go index 5d12e93..c65db81 100644 --- a/cmd/api/main.go +++ b/cmd/api/main.go @@ -54,11 +54,13 @@ func main() { // Initialize OpenTelemetry tracing — exports to obs.88oakapps.com // (Jaeger via OTLP/HTTP) when OBS_TRACES_URL is set; otherwise installs // a no-op tracer so call sites can use otel.Tracer() unconditionally. + // config.SecretValue (not os.Getenv) so file-mounted secrets resolve + // after audit F8 removed these from the process environment. tracingShutdown, err := tracing.Init(context.Background(), tracing.Config{ ServiceName: "honeydue-api", Environment: deploymentEnvironment(cfg.Server.Debug), - EndpointURL: os.Getenv("OBS_TRACES_URL"), - BearerToken: os.Getenv("OBS_INGEST_TOKEN"), + EndpointURL: config.SecretValue("OBS_TRACES_URL"), + BearerToken: config.SecretValue("OBS_INGEST_TOKEN"), SampleRatio: tracing.SampleRatioFromEnv(), }) if err != nil { diff --git a/cmd/worker/main.go b/cmd/worker/main.go index 7e73199..c3c0af6 100644 --- a/cmd/worker/main.go +++ b/cmd/worker/main.go @@ -47,11 +47,13 @@ func main() { // Initialize OpenTelemetry tracing for the worker process. Same OTLP // destination as the api; service.name distinguishes them in Jaeger. + // config.SecretValue (not os.Getenv) so file-mounted secrets resolve + // after audit F8 removed these from the process environment. tracingShutdown, err := tracing.Init(context.Background(), tracing.Config{ ServiceName: "honeydue-worker", Environment: workerDeploymentEnv(cfg.Server.Debug), - EndpointURL: os.Getenv("OBS_TRACES_URL"), - BearerToken: os.Getenv("OBS_INGEST_TOKEN"), + EndpointURL: config.SecretValue("OBS_TRACES_URL"), + BearerToken: config.SecretValue("OBS_INGEST_TOKEN"), SampleRatio: tracing.SampleRatioFromEnv(), }) if err != nil { @@ -106,6 +108,17 @@ func main() { if err != nil { log.Fatal().Err(err).Msg("Failed to parse Redis URL") } + // Audit HIGH-1: the Redis password is a file-mounted secret (REDIS_PASSWORD), + // not embedded in REDIS_URL — REDIS_URL travels in the honeydue-config + // ConfigMap. Apply the password onto the parsed opt so the Asynq server, + // inspector and monitoring client (all derived from redisOpt below) + // authenticate against a requirepass-protected Redis. + if cfg.Redis.Password != "" { + if clientOpt, ok := redisOpt.(asynq.RedisClientOpt); ok { + clientOpt.Password = cfg.Redis.Password + redisOpt = clientOpt + } + } // Initialize monitoring service (if Redis is available) var monitoringService *monitoring.Service diff --git a/deploy-k3s/SECURITY.md b/deploy-k3s/SECURITY.md index e71190e..0a5ed7b 100644 --- a/deploy-k3s/SECURITY.md +++ b/deploy-k3s/SECURITY.md @@ -1,813 +1,1033 @@ -# honeyDue — Production Security Hardening Guide +# honeyDue — Production Security Remediation Plan -Comprehensive security documentation for the honeyDue K3s deployment. Covers every layer from cloud provider to application. +This document is the **single source of truth for fixing every security +finding from the 2026-05-12/13 audits, and for keeping those fixes baked +into the stack so a full redeploy never reproduces them.** -**Last updated**: 2026-03-28 +It replaces the previous aspirational `SECURITY.md` (which described a +desired state that, per the audits, was never fully true). The accurate +*current* architecture lives in `docs/deployment/05-security.md`; this file +is the **work list**. + +**Last updated:** 2026-05-16 +**Audit sources (kept at repo root):** + +| Tag | File | Scope | Findings | +|---|---|---|---| +| `LIVE` | `live_scan_5_12.md` | External black-box scan of api/admin/app | L1–L20 (20) | +| `K3S` | `k3_audit_5_12.md` | k3s cluster + `honeydue` namespace audit | F1–F17 (17) + 8 coverage gaps | +| `CODE` | `security_scan_5_12.md` | Static audit of `honeyDueAPI-go` | C1–C13, H1–H9, M1–M13, L1–L6 (41) | + +**Total: 78 findings + 8 cluster coverage gaps + 13 runtime verification items.** --- -## Table of Contents +## How to use this document -1. [Threat Model](#1-threat-model) -2. [Hetzner Cloud (Host)](#2-hetzner-cloud-host) -3. [K3s Cluster](#3-k3s-cluster) -4. [Pod Security](#4-pod-security) -5. [Network Segmentation](#5-network-segmentation) -6. [Redis](#6-redis) -7. [PostgreSQL (Neon)](#7-postgresql-neon) -8. [Cloudflare](#8-cloudflare) -9. [Container Images](#9-container-images) -10. [Secrets Management](#10-secrets-management) -11. [B2 Object Storage](#11-b2-object-storage) -12. [Monitoring & Alerting](#12-monitoring--alerting) -13. [Incident Response](#13-incident-response) -14. [Compliance Checklist](#14-compliance-checklist) +The plan is organised by **redeploy stage**, not by severity, because the +operator's goal is: *redeploy the entire stack and come up clean.* Each +finding is tagged with where its fix lives: + +| Marker | Meaning | +|---|---| +| **In-repo: Y** | Fix lives in a committed file (`config.yaml`, a manifest, a script, Go code, a Dockerfile). Once committed, **every redeploy re-applies it automatically.** | +| **In-repo: N** | Fix is external state (DNS records, Cloudflare dashboard, Hetzner firewall, hstspreload.org). A redeploy does **not** touch it — it survives on its own but must be done once and tracked here. | + +**Status legend:** ☐ open · ◐ in progress · ☑ done · ⊘ accepted risk / deferred + +**Redeploy stage order** (matches `deploy-k3s/scripts/` run order): + +``` +Stage 0 DNS & Cloudflare edge (external; no cluster needed) +Stage 1 Cluster provisioning & node OS (01-provision-cluster.sh / hetzner-k3s / SSH) +Stage 2 Secrets & config bootstrap (02-setup-secrets.sh / config.yaml) +Stage 3 Kubernetes manifests (deploy-k3s/manifests/, applied by 03-deploy.sh) +Stage 4 Application code & images (honeyDueAPI-go source → rebuilt image) +Stage 5 CI / build pipeline (image digest pinning, signing, scanning) +Stage 6 Post-deploy verification (04-verify.sh + runtime investigations) +``` + +**Golden rule for "redeploy clean":** a fix only counts as done when it is +committed to the file that the redeploy reads. A `kubectl patch` on the live +cluster that is not mirrored into `deploy-k3s/manifests/` **will be wiped on +the next `03-deploy.sh`.** Every entry below names the committed file. --- -## 1. Threat Model +## Execution status (2026-05-16) -### What We're Protecting +Stages 2–5 were executed in-repo, then put through an independent code +review (see *Post-remediation independent review* below). The Go module +**builds clean and the full `go test ./...` suite passes.** Four new goose +migrations were added — `000003` (auth-token hashing), `000004` (IAP replay +protection), `000005` (audit-log append-only + `audit_log` table create), +`000006` (`webhook_event_log` table create) — and run automatically via the +migrate Job before the api/worker rollout. -| Asset | Impact if Compromised | -|-------|----------------------| -| User credentials (bcrypt hashes) | Account takeover, password reuse attacks | -| Auth tokens | Session hijacking | -| Personal data (email, name, residences) | Privacy violation, regulatory exposure | -| Push notification keys (APNs, FCM) | Spam push to all users, key revocation | -| Cloudflare origin cert | Direct TLS impersonation | -| Database credentials | Full data exfiltration | -| Redis data | Session replay, job queue manipulation | -| B2 storage keys | Document theft or deletion | +- **~63 findings fixed (☑) and verified** — all of Stage 2 (secrets/config) + and Stage 3 (Kubernetes manifests), every exploitable Stage 4 application + finding (all 11 actioned Criticals + the auth / webhook / race / handler + High & Medium fixes), Stage-5 image digest pinning **and `K3S-F8` + (secrets are now file-mounted, not env vars)**, plus the in-repo half of + Stage 1 cluster provisioning — `K3S-F4` (kubeconfig written `0600`), + `K3S-CG1` (etcd `secrets-encryption`), `K3S-CG2` (fail2ban + + unattended-upgrades installed at provision). Includes token hashing, + Google JWKS verification, IAP replay protection, the authorization + fixes, atomic share-code join, the metrics-endpoint lockdown, + per-account login lockout, verified-email gating, CSP/HSTS hardening, + and digest-pinned images. +- **1 partial (◐)** — `CODE-L5`: cosign signing + a Trivy `HIGH,CRITICAL` + scan are wired (guarded) into `03-deploy.sh`, and a ready-to-use Kyverno + `ClusterPolicy` ships at `deploy-k3s/manifests/kyverno-verify-images.yaml`. + Closing it needs two operator actions that cannot be committed: install + Kyverno in the cluster, and supply a cosign key pair (`COSIGN_KEY` for + signing + the public key pasted into the policy). +- **Accepted / blocked / moot (⊘)** — `M3` (Apple nonce — blocked on an + iOS-client change), `C12` (moot — accounts are hard-deleted), + `LIVE-L14`/`L15` (UUID migration — planned quarter), `LIVE-L17`/`L18`/ + `L20` (no security impact — see entries), `F15`/`F16` (architectural), + and `LIVE-L2`/`L3`/`L4` (DMARC / SPF / CAA — operator-declined, below). +- **Operator-declined — Stage 0 DNS (`LIVE-L2`/`L3`/`L4`).** The operator + has opted not to add the DMARC, SPF-hardening, and CAA DNS records this + cycle. For the record: these are **not** a paid-Cloudflare feature — + DMARC and SPF are ordinary TXT records and CAA is an ordinary CAA + record, all addable on any Cloudflare plan including Free. They remain + genuine email-spoofing / certificate-issuance gaps and are marked ⊘; + revisit when DNS is next touched. +- **Remaining operator runtime steps (no code to commit)** — on the + *existing* cluster: `k3s secrets-encrypt` enable/reencrypt (`K3S-CG1` / + `V12`) and `chmod 600` the live kubeconfig (`K3S-F4`); the SSH/sysctl + half of `K3S-CG2`; and the `K3S-CG3`–`CG8` verification items. A full + *fresh* provision already comes up with `K3S-F4`/`CG1`/`CG2`(fail2ban + + unattended-upgrades) applied straight from `_config.sh`. -### Attack Surface +**Operator note:** `C1` (token hashing) invalidates every existing login +session once at deploy and makes login single-session per user — see the +`CODE-C1` entry. The status boxes in the master index below are authoritative. -``` -Internet - │ - ▼ -Cloudflare (WAF, DDoS protection, TLS termination) - │ - ▼ (origin cert, Full Strict) -Hetzner Cloud Firewall (ports 22, 443, 6443) - │ - ▼ -K3s Traefik Ingress (Cloudflare-only IP allowlist) - │ - ├──► API pods (Go) ──► Neon PostgreSQL (external, TLS) - │ ──► Redis (internal, authenticated) - │ ──► APNs/FCM (external, TLS) - │ ──► B2 Storage (external, TLS) - │ ──► SMTP (external, TLS) - │ - ├──► Admin pods (Next.js) ──► API pods (internal) - │ - └──► Worker pods (Go) ──► same as API -``` +## Post-remediation independent review (2026-05-16) -### Trust Boundaries +The change set went through **two** independent review passes; the deploy-time +verification below (build, `go test -race`, full `goose up` against real +PostgreSQL 16) was executed and passed. -1. **Internet → Cloudflare**: Untrusted. Cloudflare handles DDoS, WAF, TLS. -2. **Cloudflare → Origin**: Semi-trusted. Origin cert validates, IP allowlist enforces. -3. **Ingress → Pods**: Trusted network, but segmented by NetworkPolicy. -4. **Pods → External Services**: Outbound only, TLS required, credentials scoped. -5. **Pods → K8s API**: Denied. Service accounts have no permissions. +**First pass.** A separate review agent audited the full change set against the +three audit files. It surfaced three **deploy-breaking** defects that a green +`go test` could not catch — the test harness builds two tables via GORM +`AutoMigrate`, which production never runs — all since fixed: + +- **`audit_log` table was never created by a migration.** `000005` added + append-only triggers to a table that exists only in the test DB, so a + from-scratch `goose up` would fail on `000005`. `000005` now does + `CREATE TABLE IF NOT EXISTS audit_log` before the triggers. +- **`webhook_event_log` table was never created by a migration.** The H6 + fail-closed webhook dedup turns a missing table into a 500 on every + subscription webhook. New migration `000006` creates it. +- **`000004`'s `google_purchase_token` unique index could fail to build** on + a production table already holding duplicate tokens — exactly the C6 + replay the migration fixes. `000004` now de-duplicates (keep-earliest, + NULL-the-rest) before creating the index. + +It also tightened the C13 Apple-webhook lookup (`subscription_webhook_handler.go`) +so the legacy substring scan runs only on a genuine `ErrRecordNotFound`, +never masking a real DB error as "not found". + +**Second pass (master review).** A second, independent security-audit agent +re-verified all four first-pass fixes (correct), ran `go test -race` (0 data +races) and the full `goose up`/`down` chain against real PostgreSQL (clean, +idempotent), and returned **GO** with one HIGH finding, since fixed: + +- **HIGH-1 — Redis password leaked via the `honeydue-config` ConfigMap.** + `_config.sh` built `REDIS_URL` with the password embedded inline, and that + URL is emitted into the `honeydue-config` ConfigMap (delivered to pods via + `envFrom`). ConfigMaps are *not* covered by `secrets-encryption` and are + readable by any principal with `get configmap` — so `K3S-F1`/`K3S-F8` were + not actually fully closed. **Fixed (2026-05-16):** `_config.sh` now emits + `REDIS_URL=redis://redis:6379/0` with no credentials; the password travels + only as the file-mounted `REDIS_PASSWORD` secret. The API applies it in + `cache_service.go`; `cmd/worker/main.go` now applies it onto the parsed + Asynq `RedisClientOpt` so the server/inspector/monitoring client all + authenticate against the `requirepass` Redis. + +The master review's other seven findings (4 Medium, 3 Low — none +deploy-blocking) were then **all fixed (2026-05-16)**: + +- **MEDIUM-1 — re-login left the prior token usable for ≤5 min.** + `CreateFreshToken` deleted the old token row but not its Redis cache entry. + It now also returns the deleted tokens' hashes; `AuthService.freshToken` + evicts them via the new `CacheService.InvalidateAuthTokenHashes` on every + login / Apple / Google sign-in, so a prior (e.g. stolen) token stops + authenticating immediately. +- **MEDIUM-2 — IAP `.p8` mode check incompatible with k8s.** The Apple IAP + key check (`iap_validation.go`) required `0600`-or-stricter, unattainable + on a k8s Secret volume (`0440` under `fsGroup`). It now rejects only + world-accessible keys (`perm & 0o007`). +- **MEDIUM-3 — single-IP account-lockout DoS.** The `M5` per-account lockout + is now keyed on the *set of distinct source IPs* that have failed + (`RegisterLoginFailure` takes the IP, tracks a Redis set; lock at 5 + distinct IPs). One attacker IP can no longer lock a victim out by spamming + failures; genuinely distributed stuffing still trips it. `Login` now takes + the client IP (`c.RealIP()`). +- **MEDIUM-4 — Redis no-auth deployable.** `02-setup-secrets.sh` now `die`s + (was `warn`) when `redis.password` is empty, so a deploy can no longer + bring up an unauthenticated Redis (`K3S-F1`). +- **LOW-1 / LOW-2 — missing regression tests.** Added: `config_test.go` + asserts `validate()` refuses `DEBUG_FIXED_CODES` with `DEBUG=false` (`C4`); + `subscription_repo_test.go` asserts a second account cannot bind an Apple + transaction / Google purchase token already bound to another (`C5`/`C6`). +- **LOW-3 — device-token 409.** A recycled APNs/FCM token re-registering + under a new account is now reassigned to that account (and logged) instead + of returning a 409 that locked the legitimate new device owner out of push. + +One earlier (first-pass) hardening item remains a **tracked follow-up**, not +re-raised by the master review and not deploy-blocking: `/metrics` is gated +by an `X-Forwarded-For` check rather than network-isolated. True isolation +needs `/metrics` on a separate port plus a NetworkPolicy restricting the +scrape to vmagent — an architectural change deferred to a later cycle. + +## Consolidated work items (fix once, closes many) + +Several findings are the same defect seen from three angles. Do the work +once at the listed anchor; the rest close with it. + +| Theme | Anchor | Also closes | +|---|---|---| +| Auth-endpoint rate limiting | Stage 3 `auth-rate-limit` middleware + Stage 4 app limiter | `K3S-F10`, `LIVE-L12`, `CODE-H1`, `CODE-H2`, `CODE-H3`, `CODE-M5` | +| CSP / cross-origin headers | Stage 3 `security-headers` + Stage 4 app CSP | `K3S-F9`, `LIVE-L8` | +| HSTS `preload` | Stage 3 middleware + Stage 0 list submission | `LIVE-L5`, `CODE-L3` | +| Admin ingress hardening | Stage 2 secret + Stage 3 middleware wiring | `K3S-F2`, `K3S-F3`, `CODE-L6` | +| etcd encryption at rest | Stage 1 `--secrets-encryption` | `K3S-CG1`, `CODE-M9` | +| Image digest pinning + signing | Stage 5 CI | `K3S-F5`, `K3S-F14`, `CODE-L4`, `CODE-L5` | +| Pagination hard caps | Stage 4 app | `LIVE-L16`, `CODE-M6` | +| imagePullSecret name consistency | Stage 3 manifests + Stage 2 script | `K3S-F6` | + +**Known contradiction to resolve before planning Stage 4:** `LIVE-L18` says +*no account-deletion endpoint exists* (every `DELETE` path 404/400), but +`CODE-M13` points at a delete handler at `auth_handler.go:488-539`. Either +the endpoint exists at a path the external scan never probed, or it is +mounted but unreachable. **Confirm the route in `internal/router/router.go` +first** — the fix differs (add an endpoint vs. expose/rate-limit an existing +one). Tracked as verification item `V11`. --- -## 2. Hetzner Cloud (Host) +## Master finding index -### Firewall Rules +Every finding, ordered by redeploy stage. Use this as the live tracker — +flip the Status box as work lands. -Only three ports should be open on the Hetzner Cloud Firewall: +### Stage 0 — DNS & Cloudflare edge -| Port | Protocol | Source | Purpose | -|------|----------|--------|---------| -| 22 | TCP | Your IP(s) only | SSH management | -| 443 | TCP | Cloudflare IPs only | HTTPS traffic | -| 6443 | TCP | Your IP(s) only | K3s API (kubectl) | +| ID | Sev | Finding | In-repo | Status | +|---|---|---|---|---| +| `LIVE-L2` | HIGH | No DMARC record — email spoofing open | N | ⊘ | +| `LIVE-L3` | MED | SPF ends `?all` (neutral — fails open) | N | ⊘ | +| `LIVE-L4` | MED | No CAA records — any CA may issue certs | N | ⊘ | +| `LIVE-L6` | LOW | No `/.well-known/security.txt` | Y | ☐ | +| `LIVE-L9` | INFO | Aggressive Cloudflare caching on admin SSR shell | N | ☐ | +| `LIVE-L10` | INFO | `x-powered-by: Next.js` framework leak | Y | ☐ | -```bash -# Verify Hetzner firewall rules (Hetzner CLI) -hcloud firewall describe honeydue-fw -``` +### Stage 1 — Cluster provisioning & node OS -### SSH Hardening +| ID | Sev | Finding | In-repo | Status | +|---|---|---|---|---| +| `K3S-F4` | HIGH | Node kubeconfig world-readable (mode 644) | Y | ☑ | +| `K3S-F15` | INFO | Nodes on public IPs, no private VPC | Y | ⊘ | +| `K3S-F16` | INFO | All 3 nodes are control-plane + etcd + worker | Y | ⊘ | +| `K3S-F17` | INFO | Single-replica SPOFs (redis/worker/admin/vmagent) | Y | ☐ | +| `K3S-CG1` | — | etcd encryption at rest not verified (`--secrets-encryption`) | Y | ☑ | +| `K3S-CG2` | — | Node OS hardening: SSH, fail2ban, unattended-upgrades, sysctl | Y/N | ◐ | +| `K3S-CG3` | — | Hetzner Cloud Firewall rules not verified | N | ☐ | +| `K3S-CG4` | — | etcd snapshot backup destination/encryption not verified | Y | ☐ | +| `K3S-CG5` | — | kubelet flags (`--anonymous-auth=false`, webhook authz) not verified | Y | ☐ | +| `K3S-CG6` | — | Container-runtime CIS controls (`kube-bench`) not run | N | ☐ | +| `K3S-CG7` | — | `deploy` user sudoers least-privilege not verified | N | ☐ | +| `K3S-CG8` | — | `/etc/rancher/k3s/` dir + server-token perms not verified | N | ☐ | -- **Key-only authentication** — password auth disabled in `/etc/ssh/sshd_config` -- **Root login disabled** — `PermitRootLogin no` -- **fail2ban active** — auto-bans IPs after 5 failed SSH attempts +### Stage 2 — Secrets & config bootstrap -```bash -# Verify SSH config on each node -ssh user@NODE_IP "grep -E 'PasswordAuthentication|PermitRootLogin' /etc/ssh/sshd_config" -# Expected: PasswordAuthentication no, PermitRootLogin no +| ID | Sev | Finding | In-repo | Status | +|---|---|---|---|---| +| `K3S-F1` | **CRIT** | Redis runs with no authentication | Y | ☑ | +| `K3S-F3` | HIGH | `admin-basic-auth` secret never created | Y | ☑ | +| `K3S-F12` | MED | Secrets unrotated since cluster bootstrap; no runbook | Y | ☑ | +| `CODE-C4` | **CRIT** | `DEBUG_FIXED_CODES` "123456" auth bypass if it reaches prod | Y | ☑ | +| `CODE-M8` | MED | `SECRET_KEY` hardcoded debug fallback | Y | ☑ | -# Check fail2ban status -ssh user@NODE_IP "sudo fail2ban-client status sshd" -``` +> **Stage 2 status (2026-05-15):** `config.yaml` now carries a Redis +> password and admin basic-auth user/password; `02-setup-secrets.sh` uses +> bcrypt (`htpasswd -nbB`); `internal/config/config.go` generates an +> ephemeral random `SECRET_KEY` in debug instead of a static fallback and +> refuses to boot if `DEBUG_FIXED_CODES` is set with `DEBUG=false`; the +> rotation runbook is at `docs/runbooks/secret-rotation.md`. All take +> effect on the next `02-setup-secrets.sh` + `03-deploy.sh`. -### OS Updates +### Stage 3 — Kubernetes manifests -```bash -# Enable unattended security updates (Ubuntu 24.04) -ssh user@NODE_IP "sudo apt install unattended-upgrades && sudo dpkg-reconfigure -plow unattended-upgrades" -``` +| ID | Sev | Finding | In-repo | Status | +|---|---|---|---|---| +| `K3S-F2` | HIGH | Admin ingress missing `cloudflare-only` + `admin-auth` | Y | ☑ | +| `K3S-F6` | HIGH | `imagePullSecrets` name mismatch (`ghcr-credentials`) | Y | ☑ | +| `K3S-F7` | MED | `vmagent` container missing `securityContext` | Y | ☑ | +| `K3S-F9` | MED | `security-headers` missing COOP/COEP/CORP | Y | ☑ | +| `K3S-F10` | MED | Uniform rate limit — no auth-endpoint tightening | Y | ☑ | +| `K3S-F11` | MED | `automountServiceAccountToken` not disabled | Y | ☑ | +| `K3S-F13` | LOW | `CORS_ALLOWED_ORIGINS` missing `app.myhoneydue.com` | Y | ☑ | +| `K3S-F14` | LOW | Public images (`redis`, `vmagent`) pinned by tag | Y | ☑ | +| `LIVE-L5` | LOW | HSTS not preload-eligible | Y | ☑ | +| `LIVE-L7` | LOW | Deprecated `X-XSS-Protection` header | Y | ☑ | +| `LIVE-L8` | LOW | CSP missing `object-src`/`base-uri`; COOP/COEP/CORP absent | Y | ☑ | +| `CODE-L3` | LOW | HSTS missing `preload` (duplicate of `LIVE-L5`) | Y | ☑ | +| `CODE-L4` | LOW | `imagePullPolicy` not set on Deployments | Y | ☑ | +| `CODE-L6` | LOW | Admin `admin-auth` middleware defined, not attached | Y | ☑ | + +> **Stage 3 status (2026-05-15):** admin ingress now chains +> `cloudflare-only` + `admin-auth` + `security-headers` + `rate-limit`; a +> dedicated `honeydue-api-auth` Ingress applies a new `auth-rate-limit` +> middleware (5/min, burst 10) to login / register / forgot-password / +> reset-password / join-with-code; `security-headers` gained COOP + CORP, +> HSTS is now `max-age=63072000; …; preload`, and the deprecated +> `X-XSS-Protection` (`browserXssFilter`) is removed; `vmagent` has a +> container `securityContext`; all workload pods + the migrate Job set +> `automountServiceAccountToken: false` explicitly (on top of the +> rbac.yaml ServiceAccount-level setting that already existed); the +> registry secret is `gitea-credentials` everywhere; `imagePullPolicy: +> IfNotPresent` is explicit on every container; CORS includes +> `app.myhoneydue.com`. **Still open:** `K3S-F14` (public-image digest +> pins) is folded into Stage 5 with `K3S-F5`; `LIVE-L8` is partial — the +> COOP/CORP half shipped here, the CSP `object-src`/`base-uri` half is an +> app change tracked in Stage 4. + +### Stage 4 — Application code & container images + +| ID | Sev | Finding | In-repo | Status | +|---|---|---|---|---| +| `CODE-C1` | **CRIT** | Auth tokens stored plaintext in DB | Y | ☑ | +| `CODE-C2` | **CRIT** | Google ID token not verified locally | Y | ☑ | +| `CODE-C3` | **CRIT** | Google `iss` claim never validated | Y | ☑ | +| `CODE-C5` | **CRIT** | Apple IAP receipt replay across accounts | Y | ☑ | +| `CODE-C6` | **CRIT** | Google purchase-token replay across accounts | Y | ☑ | +| `CODE-C7` | **CRIT** | File-ownership check excludes residence owners | Y | ☑ | +| `CODE-C8` | **CRIT** | Device-token cross-account hijack on re-register | Y | ☑ | +| `CODE-C9` | **CRIT** | Share-code join not atomic (Add+Deactivate race) | Y | ☑ | +| `CODE-C10` | **CRIT** | Subscription upgrade race — validation outside txn | Y | ☑ | +| `CODE-C11` | **CRIT** | Task-completion duplicate-row race | Y | ☑ | +| `CODE-C12` | **CRIT** | Soft-deleted email reusable; `is_active` not filtered | Y | ⊘ | +| `CODE-C13` | **CRIT** | Apple webhook user lookup may LIKE-match | Y | ☑ | +| `CODE-H1` | HIGH | Rate limit doesn't cover all auth surfaces | Y | ☑ | +| `CODE-H2` | HIGH | No rate limit on `join-with-code` | Y | ☑ | +| `CODE-H3` | HIGH | No rate limit on `register` | Y | ☑ | +| `CODE-H4` | HIGH | Modulo bias in 6-digit code generation | Y | ☑ | +| `CODE-H5` | HIGH | Apple IAP `.p8` loaded with no file-mode check | Y | ☑ | +| `CODE-H6` | HIGH | Webhook dedup fail-open | Y | ☑ | +| `CODE-H7` | HIGH | Auth-failure log lacks IP/User-Agent | Y | ☑ | +| `CODE-H8` | HIGH | `X-Timezone` header trusted for trial-start calc | Y | ☑ | +| `CODE-H9` | HIGH | Share-code `Deactivate` error swallowed | Y | ☑ | +| `CODE-M1` | MED | HTTP header injection via `Content-Disposition` filename | Y | ☑ | +| `CODE-M2` | MED | bcrypt cost = 10 (recommend 12) | Y | ☑ | +| `CODE-M3` | MED | Apple Sign In nonce not validated | Y | ⊘ | +| `CODE-M4` | MED | Email verification not atomic | Y | ☑ | +| `CODE-M5` | MED | Per-user rate limiting absent | Y | ☑ | +| `CODE-M6` | MED | List endpoints uncapped (Documents/Contractors/Residences) | Y | ☑ | +| `CODE-M7` | MED | Audit log not append-only | Y | ☑ | +| `CODE-M11` | MED | `golang.org/x/crypto v0.49.0` outdated | Y | ☑ | +| `CODE-M12` | MED | Contractor toggle refetch race | Y | ☑ | +| `CODE-M13` | MED | Account-deletion endpoint unrate-limited | Y | ☑ | +| `CODE-M10` | MED | `node:20-alpine` floating tag in Dockerfile | Y | ☑ | +| `CODE-L1` | LOW | Login inactive-account error enables enumeration | Y | ☑ | +| `CODE-L2` | LOW | Auth responses lack `Cache-Control: no-store` | Y | ☑ | +| `LIVE-L1` | HIGH | `/metrics` publicly exposed on `api.myhoneydue.com` | Y | ☑ | +| `LIVE-L11` | HIGH | Login user-enumeration via timing | Y | ☑ | +| `LIVE-L12` | HIGH | No rate-limit on `/api/auth/login/` | Y | ☑ | +| `LIVE-L13` | HIGH | Password-reset user-enumeration via timing | Y | ☑ | +| `LIVE-L14` | MED | Sequential integer user IDs leak userbase size | Y | ⊘ | +| `LIVE-L15` | MED | Sequential integer resource IDs (same risk) | Y | ⊘ | +| `LIVE-L16` | MED | Pagination `limit` accepted at any size | Y | ☑ | +| `LIVE-L17` | LOW | Garbage pagination params silently accepted | Y | ⊘ | +| `LIVE-L18` | LOW | No account-deletion endpoint (GDPR gap) | Y | ⊘ | +| `LIVE-L19` | LOW | Email verification not enforced | Y | ☑ | +| `LIVE-L20` | INFO | Profile-update silently drops unknown fields | Y | ⊘ | + +> **Stage 4 handler/misc batch status (2026-05-15):** `M1` — +> `Content-Disposition` filenames are sanitized (control chars / quote / +> backslash stripped) so an upload filename cannot inject response +> headers. `M7` — migration `000005` creates the `audit_log` table (no +> prior migration did — `CREATE TABLE IF NOT EXISTS`) and makes it +> append-only via BEFORE UPDATE/DELETE triggers. `M11` — +> `golang.org/x/crypto` bumped +> `v0.49.0 → v0.51.0`. `M13` — `DELETE /api/auth/account` now carries the +> Traefik `auth-rate-limit` edge limiter. `LIVE-L18` ⊘ — not a real gap: +> the endpoint **exists** at `DELETE /api/auth/account/` +> (`router.go:546`); the live scan probed `/api/auth/me/`, `/auth/delete/`, +> `/users/me/` and missed it. **Update (2026-05-15):** items shown as +> deferred in an earlier draft were then completed — `LIVE-L1` (`/metrics` +> rejects proxied/public requests via an `X-Forwarded-For` check, so only +> the in-cluster vmagent scrape reaches it), `M6`/`LIVE-L16` (the +> document/contractor list repos already hard-cap at 500 rows), and +> `LIVE-L19` (verified-email gating on share-code generation via the new +> `RequireVerified` middleware). `LIVE-L17` (inert pagination params, +> results capped) and `LIVE-L20` (whitelist profile update is the correct +> pattern) are closed as no-security-impact (⊘). The master index above is +> authoritative. + +> **Stage 4 races batch status (2026-05-15):** `C9`/`H9` — share-code +> redemption is now one locked transaction in `ResidenceRepository. +> JoinWithShareCode` (lock the code row, re-check validity, add member, +> deactivate — a deactivation failure aborts the join). `C11` — the +> task-completion duplicate-row race was *already* closed: the completion +> insert and the optimistically-version-locked task update share one +> transaction, so a concurrent completion fails `ErrVersionConflict` and +> rolls back its inserted row; no `UNIQUE(task_id, completed_date)` was +> added (it would reject legitimate same-day re-completions and risk a +> migration failure on existing data). `M4` — email verification's +> find/consume/flag writes are now one transaction. `M12` — a concurrent +> contractor delete now yields a clean 404. `C12` ⊘ — premise moot: the +> app **hard-deletes** accounts (`DeleteUserCascade`), so there is no +> soft-deleted user whose email lingers, and `ExistsByEmail` already +> blocks re-registering a *deactivated* user's email. +> +> **Stage 4 auth batch status (2026-05-15):** C1, C2, C3 done (see entries +> below). Rate limiting — every sensitive auth path now carries the shared +> Traefik `auth-rate-limit` edge limiter (login/register/forgot/reset/ +> verify-reset/apple/google/refresh/join-with-code); login/register/forgot/ +> reset/apple/google additionally keep the per-IP app limiter +> (`H1`/`H2`/`H3`/`LIVE-L12`). `H4` rejection-sampled codes, `M2` bcrypt +> cost 12, `L1`+`LIVE-L11` constant-time generic-error login, `L2` +> `no-store` on auth responses, `H7` IP/UA in auth logs, `LIVE-L13` +> fully-async forgot-password — all done; `go build ./...` and the +> `models`/`repositories`/`middleware`/`handlers`/`services` test packages +> pass. **Deferred:** `M3` (Apple nonce) — needs the iOS client to +> generate and send a nonce; server-only validation would reject every +> Apple login, so this is blocked on a coordinated mobile change. `H8` — +> the `parseTimezone` ±14h cap shipped; the "use server UTC for +> trial-start" half is folded into Stage 4's subscription work. `M5` +> per-account lockout (Redis) deferred — the edge + per-IP app limiters + +> the existing per-account password-reset counter cover the practical +> risk; a true per-account login lockout remains a tracked enhancement. + +### Stage 5 — CI / build pipeline + +| ID | Sev | Finding | In-repo | Status | +|---|---|---|---|---| +| `K3S-F5` | HIGH | Images pinned by mutable short SHA tag, not digest | Y | ☑ | +| `K3S-F8` | MED | Secrets injected as env vars, not file mounts | Y | ☑ | +| `CODE-L5` | LOW | No image signing (cosign) in CI | Y | ◐ | + +> **Stage 5 status (2026-05-15):** `CODE-M11` done — `golang.org/x/crypto` +> bumped `v0.49.0 → v0.51.0` (with the `x/sys`/`x/term`/`x/text` bumps +> `go get -u` pulled in), `go mod tidy` clean, full build + test green. +> **Update (2026-05-15):** `K3S-F5`/`K3S-F14`/`CODE-M10` are done — +> `03-deploy.sh` resolves the image digest after each push and deploys +> api/worker/admin/web by `@sha256:`, and redis/vmagent/`node:20-alpine` +> are pinned to their resolved index digests. +> **Update (2026-05-16):** `K3S-F8` is **done** — the `api`/`worker` +> Deployments mount `honeydue-secrets` as files (`defaultMode: 0400`) at +> `/etc/honeydue/secrets` and inject no secret as an env var; +> `config.loadFileSecrets` reads them; `02-setup-secrets.sh` now writes +> `B2_KEY_ID`/`B2_APP_KEY` into the secret, reconciling the earlier +> script-vs-manifest drift. `CODE-L5` stays **◐** — cosign signing and a +> Trivy `HIGH,CRITICAL` scan are wired (guarded) into `03-deploy.sh` and a +> ready-to-use Kyverno `ClusterPolicy` ships at +> `deploy-k3s/manifests/kyverno-verify-images.yaml`; closing it needs the +> operator to install Kyverno and supply a cosign key. See both entries. + +### Stage 6 — Post-deploy verification & runtime investigations + +`V1`–`V13` — see [Stage 6](#stage-6--post-deploy-verification--runtime-investigations). --- -## 3. K3s Cluster +## Stage 0 — DNS & Cloudflare edge -### Secret Encryption at Rest +External state at Cloudflare. Not touched by `03-deploy.sh`, so a redeploy +neither breaks nor re-applies these — do them once and leave them. Tracked +here so they are never forgotten on a domain move or DNS migration. -K3s is configured with `secrets-encryption: true` in the server config. This encrypts all Secret resources in etcd using AES-CBC. +### `LIVE-L2` — Add DMARC record · HIGH · ⊘ +- **Operator decision (2026-05-16):** declined for this cycle. A DMARC record is an ordinary DNS TXT record — it is **not** gated behind a paid Cloudflare plan and can be added on Free. This remains a real email-spoofing gap; revisit when DNS is next touched. +- **Where:** Cloudflare DNS, TXT record at `_dmarc.myhoneydue.com`. +- **Fix:** Publish `v=DMARC1; p=quarantine; rua=mailto:dmarc@myhoneydue.com; ruf=mailto:dmarc@myhoneydue.com; fo=1; aspf=s; adkim=s`. Start at `pct=10` for 30 days, watch the `rua` aggregate reports, then ramp to `pct=100` and finally `p=reject`. +- **Verify:** `dig +short TXT _dmarc.myhoneydue.com` returns the record. -```bash -# Verify encryption is active -k3s secrets-encrypt status -# Expected: Encryption Status: Enabled +### `LIVE-L3` — Tighten SPF from `?all` to `-all` · MEDIUM · ⊘ +- **Operator decision (2026-05-16):** declined for this cycle. SPF is an ordinary DNS TXT record, editable on any Cloudflare plan including Free. The `?all` (neutral) qualifier leaves spoofed mail un-penalised; revisit alongside `LIVE-L2`. +- **Where:** Cloudflare DNS, TXT record at `myhoneydue.com`. +- **Fix:** Change `v=spf1 include:spf.messagingengine.com ?all` → `~all` for ~7 days, confirm no legitimate mail (CI, transactional) is missed, then `-all`. Do this **after** `LIVE-L2`'s DMARC ramp begins. +- **Verify:** `dig +short TXT myhoneydue.com | grep spf` shows `-all`. -# Rotate encryption keys (do periodically) -k3s secrets-encrypt rotate-keys -k3s secrets-encrypt reencrypt -``` +### `LIVE-L4` — Add CAA records · MEDIUM · ⊘ +- **Operator decision (2026-05-16):** declined for this cycle. CAA is an ordinary DNS record type, addable on any Cloudflare plan including Free. Without it, any public CA may issue a cert for the domain; revisit when DNS is next touched. +- **Where:** Cloudflare DNS, apex `myhoneydue.com`. +- **Fix:** Add `0 issue "letsencrypt.org"`, `0 issuewild "letsencrypt.org"`, `0 iodef "mailto:security@myhoneydue.com"`. Add `0 issue "pki.goog"` only if Google Trust Services is used anywhere. Confirm against the CAs Cloudflare Universal SSL actually uses before locking down. +- **Verify:** `dig +short CAA myhoneydue.com` returns the records. -### RBAC +### `LIVE-L6` — Publish `security.txt` · LOW · ☐ · In-repo: Y +- **Where:** served by the Go API and/or Next.js apps at `/.well-known/security.txt` (RFC 9116) — committed route, so it survives redeploys. +- **Fix:** Serve `Contact:`, `Expires:`, `Preferred-Languages:`, `Canonical:` on both `api.myhoneydue.com` and the apex. +- **Verify:** `curl https://api.myhoneydue.com/.well-known/security.txt` → 200. -Each workload has a dedicated ServiceAccount with `automountServiceAccountToken: false`: +### `LIVE-L9` — Review Cloudflare caching of the admin SSR shell · INFO · ☐ +- **Where:** Cloudflare cache rules for `admin.myhoneydue.com`. +- **Fix:** `cache-control: s-maxage=31536000` on admin SSR pages means Cloudflare caches the admin shell for a year. Confirm this is intentional; if the admin shell ever contains per-session content, add a bypass-cache rule for `admin.myhoneydue.com`. +- **Verify:** `curl -sI https://admin.myhoneydue.com/ | grep -i cache` reflects the intended policy. -| ServiceAccount | Used By | K8s API Access | -|---------------|---------|----------------| -| `api` | API deployment | None | -| `worker` | Worker deployment | None | -| `admin` | Admin deployment | None | -| `redis` | Redis deployment | None | - -No Roles or RoleBindings are created — pods have zero K8s API access. - -```bash -# Verify service accounts exist -kubectl get sa -n honeydue - -# Verify no roles are bound -kubectl get rolebindings -n honeydue -kubectl get clusterrolebindings | grep honeydue -# Expected: no results -``` - -### Pod Disruption Budgets - -Prevent node maintenance from taking down all replicas: - -| Workload | Replicas | minAvailable | -|----------|----------|-------------| -| API | 3 | 2 | -| Worker | 2 | 1 | - -```bash -# Verify PDBs -kubectl get pdb -n honeydue -``` - -### Audit Logging (Optional Enhancement) - -K3s supports audit logging for API server requests: - -```yaml -# Add to K3s server config for detailed audit logging -# /etc/rancher/k3s/audit-policy.yaml -apiVersion: audit.k8s.io/v1 -kind: Policy -rules: - - level: Metadata - resources: - - group: "" - resources: ["secrets", "configmaps"] - - level: RequestResponse - users: ["system:anonymous"] - - level: None - resources: - - group: "" - resources: ["events"] -``` - -### WireGuard (Optional Enhancement) - -K3s supports WireGuard for encrypting inter-node traffic: - -```bash -# Enable WireGuard on K3s (add to server args) -# --flannel-backend=wireguard-native -``` +### `LIVE-L10` — Suppress `x-powered-by` · INFO · ☐ · In-repo: Y +- **Where:** Next.js config in the admin and web repos (`next.config.js` → `poweredByHeader: false`). Committed, survives redeploys. +- **Fix:** Disable the `x-powered-by: Next.js` header. +- **Verify:** `curl -sI https://admin.myhoneydue.com/ | grep -i x-powered-by` returns nothing. --- -## 4. Pod Security +## Stage 1 — Cluster provisioning & node OS -### Security Contexts +Run by `01-provision-cluster.sh` (which drives the `hetzner-k3s` CLI from +`config.yaml` via `generate_cluster_config` in `_config.sh`) plus one-time +SSH hardening on each node. **Any k3s server flag must be set in the +`hetzner-k3s` cluster config so a cluster rebuild applies it.** -Every pod runs with these security restrictions: +### `K3S-F4` — kubeconfig world-readable (mode 644 → 600) · HIGH · ☑ · In-repo: Y +- **Where:** `_config.sh` → `generate_cluster_config` → `k3s_config_file`. Node file `/etc/rancher/k3s/k3s.yaml`. +- **Done (2026-05-16):** `generate_cluster_config` now emits `write-kubeconfig-mode: "0600"` in the k3s config file, so any fresh provision writes the node kubeconfig as `0600`. +- **Operator step on the existing cluster:** a running node keeps the mode it was installed with — `ssh deploy@ 'sudo chmod 600 /etc/rancher/k3s/k3s.yaml'` on each. Deploy scripts still read it via `sudo`. +- **Verify:** `ssh deploy@ 'sudo stat -c %a /etc/rancher/k3s/k3s.yaml'` → `600`. -**Pod-level:** -```yaml -securityContext: - runAsNonRoot: true - runAsUser: # 1000 (api/worker), 1001 (admin), 999 (redis) - runAsGroup: - fsGroup: - seccompProfile: - type: RuntimeDefault # Linux kernel syscall filtering -``` +### `K3S-CG1` / `CODE-M9` — etcd / Secret encryption at rest · ☑ · In-repo: Y +- **Where:** `_config.sh` → `generate_cluster_config` → `k3s_config_file`. +- **Done:** the k3s config file carries `secrets-encryption: true`, so a fresh provision boots with AES Secret encryption enabled. (The `write-kubeconfig-mode` line for `K3S-F4` was added next to it on 2026-05-16.) +- **Operator step on the existing cluster:** a cluster provisioned *without* the flag does not retro-encrypt — run `k3s secrets-encrypt enable` then `k3s secrets-encrypt reencrypt` once. Tracked as `V12`. +- **Verify:** `k3s secrets-encrypt status` reports `Encryption Status: Enabled` on every server node. +- **Note:** the old `SECURITY.md` *claimed* this was already on — `04-verify.sh` greps for the string but cannot truly confirm; see `V12`. -**Container-level:** -```yaml -securityContext: - allowPrivilegeEscalation: false # Cannot gain more privileges than parent - readOnlyRootFilesystem: true # Filesystem is immutable - capabilities: - drop: ["ALL"] # No Linux capabilities -``` +### `K3S-CG2` — Node OS hardening · ◐ · In-repo: partial +- **Where:** `_config.sh` → `generate_cluster_config` → `post_create_commands` (runs on every node at provision). +- **Done (2026-05-16):** `post_create_commands` now installs and enables `fail2ban` (SSH brute-force bans) and `unattended-upgrades` (automatic security patching) on every node at provision time — a fresh cluster comes up hardened on both. +- **Still operator (runtime; not yet in-repo):** + - SSH — confirm `PermitRootLogin no`, `PasswordAuthentication no`, `AllowUsers deploy`, modern ciphers/MACs/KEX. (hetzner-k3s provisions key-only SSH; verify and tighten.) + - sysctl — confirm `net.ipv4.ip_unprivileged_port_start=0` (Traefik) and standard network-hardening sysctls. +- **Verify:** `ssh deploy@ 'fail2ban-client status sshd; systemctl is-enabled unattended-upgrades'`. -### Writable Directories +### `K3S-CG3` — Hetzner Cloud Firewall rules · ☐ · In-repo: N +- **Fix:** Confirm only: `:443` from Cloudflare CIDRs, `:22` from operator IP(s), `:6443` from operator IP(s). Nothing else. This is the *only* network defense for the public-IP nodes (`K3S-F15`). +- **Verify:** `hcloud firewall describe honeydue-fw` matches the intended ruleset; a direct `curl` to a node IP on `:80`/`:443` from a non-CF host times out. -With `readOnlyRootFilesystem: true`, writable paths use emptyDir volumes: +### `K3S-CG4` — etcd snapshot backup · ☐ · In-repo: Y +- **Fix:** Confirm k3s etcd snapshots are enabled (default hourly) and shipped off-node — set `--etcd-s3` (to Backblaze B2) with encryption. Without offsite snapshots, a 3-node loss is unrecoverable. +- **Verify:** `ls /var/lib/rancher/k3s/server/db/snapshots/` on a node + an object in the B2 backup bucket. -| Pod | Path | Purpose | Backing | -|-----|------|---------|---------| -| API | `/tmp` | Temp files | emptyDir (64Mi) | -| Worker | `/tmp` | Temp files | emptyDir (64Mi) | -| Admin | `/app/.next/cache` | Next.js ISR cache | emptyDir (256Mi) | -| Admin | `/tmp` | Temp files | emptyDir (64Mi) | -| Redis | `/data` | Persistence | PVC (5Gi) | -| Redis | `/tmp` | AOF rewrite temp | emptyDir tmpfs (64Mi) | +### `K3S-CG5` — kubelet authn/authz flags · ☐ · In-repo: Y +- **Fix:** Confirm `--anonymous-auth=false` and `--authorization-mode=Webhook` on the kubelet (k3s defaults are usually safe — verify, don't assume). Set via k3s `kubelet-arg` in the cluster config if missing. +- **Verify:** `kubectl get --raw /api/v1/nodes//proxy/configz` shows the expected kubelet config. -### User IDs +### `K3S-CG6` — Container-runtime CIS baseline · ☐ · In-repo: N +- **Fix:** Run `kube-bench` once; remediate any FAIL lines that aren't k3s-by-design. +- **Verify:** `kube-bench` run archived with FAILs triaged. -| Container | UID:GID | Source | -|-----------|---------|--------| -| API | 1000:1000 | Dockerfile `app` user | -| Worker | 1000:1000 | Dockerfile `app` user | -| Admin | 1001:1001 | Dockerfile `nextjs` user | -| Redis | 999:999 | Alpine `redis` user | +### `K3S-CG7` — `deploy` user sudoers least-privilege · ☐ · In-repo: N +- **Fix:** Current `deploy ALL=(ALL) NOPASSWD: ALL` means an SSH-key compromise = node root. Scope to the commands deploys actually need (`ufw`, `systemctl`, `chmod` on k3s.yaml, `cat` of k3s.yaml). Accept the convenience trade-off only with eyes open. +- **Verify:** `ssh deploy@ 'sudo -l'` shows the scoped list. -```bash -# Verify all pods run as non-root -kubectl get pods -n honeydue -o jsonpath='{range .items[*]}{.metadata.name}{" runAsNonRoot="}{.spec.securityContext.runAsNonRoot}{"\n"}{end}' -``` +### `K3S-CG8` — `/etc/rancher/k3s/` perms · ☐ · In-repo: N +- **Fix:** `/var/lib/rancher/k3s/server/token` and `/var/lib/rancher/k3s/server/node-token` must be `0600 root:root`; `/etc/rancher/k3s/` not world-traversable. +- **Verify:** `ssh deploy@ 'sudo stat -c "%a %n" /var/lib/rancher/k3s/server/token'` → `600`. + +### `K3S-F15` — Nodes on public IPs, no private VPC · INFO · ⊘ · In-repo: Y +- **Decision:** Accepted for now. Defense is `K3S-CG3` (Hetzner firewall) only. To remediate later: attach a Hetzner private network, re-IP the cluster, move etcd/kubelet/Flannel onto it. Substantial re-provision — track on the roadmap, not this cycle. + +### `K3S-F16` — All nodes are control-plane + etcd + worker · INFO · ⊘ +- **Decision:** Accepted — standard small-cluster k3s. Revisit (dedicated workers + `NoSchedule` taint on control-plane) when workload pressure grows. No redeploy action. + +### `K3S-F17` — Single-replica SPOFs · INFO · ☐ · In-repo: Y +- **Where:** `deploy-k3s/manifests/worker/deployment.yaml`, `redis/`, `admin/`, `observability/vmagent.yaml`. +- **Fix:** `worker` → `replicas: 2` (stateless, Asynq at-least-once — safe now). `admin`/`vmagent` → 2 if zero-downtime restart is wanted. `redis` is stateful — true HA needs Sentinel or managed Redis; track separately, do not naively scale. +- **Verify:** `kubectl -n honeydue get deploy` shows `worker 2/2`. --- -## 5. Network Segmentation +## Stage 2 — Secrets & config bootstrap -### Default-Deny Policy +Run by `02-setup-secrets.sh`, which reads `deploy-k3s/config.yaml` and the +`secrets/` directory. **Both `K3S-F1` and `K3S-F3` are open purely because +`config.yaml` lacks the values — the script already supports them.** -All ingress and egress traffic in the `honeydue` namespace is denied by default. Explicit NetworkPolicy rules allow only necessary traffic. +### `K3S-F1` — Redis runs with no authentication · CRITICAL · ☐ · In-repo: Y +- **Where:** `deploy-k3s/config.yaml` key `redis.password`. `02-setup-secrets.sh:53,68-71` includes `REDIS_PASSWORD` in `honeydue-secrets` only when that key is non-empty; `redis/deployment.yaml` adds `--requirepass` only when the env var is non-empty. +- **Fix:** Set `redis.password` in `config.yaml` to a strong value (`openssl rand -base64 32`). Re-run `02-setup-secrets.sh`. `api`/`worker` already consume `REDIS_PASSWORD`. +- **Verify:** `kubectl -n honeydue exec deploy/redis -- redis-cli ping` → `NOAUTH`; with `-a "$REDIS_PASSWORD"` → `PONG`. +- **Redeploy-clean:** committing the value to `config.yaml` means every future `02-setup-secrets.sh` re-creates the authenticated secret. (If `config.yaml` is gitignored, store the value in the operator's secret store and document it here.) -### Allowed Traffic +### `K3S-F3` — `admin-basic-auth` secret never created · HIGH · ☐ · In-repo: Y +- **Where:** `config.yaml` keys `admin.basic_auth_user` / `admin.basic_auth_password`. `02-setup-secrets.sh:54-55,132-143` creates the `admin-basic-auth` secret (bcrypt htpasswd) only when both are set, else it warns and skips. +- **Fix:** Set both keys. Re-run `02-setup-secrets.sh`. **Must be done before `K3S-F2`** — attaching `admin-auth` to the ingress with the secret missing makes Traefik 503 the admin route. +- **Verify:** `kubectl -n honeydue get secret admin-basic-auth`. -``` - ┌─────────────┐ - │ Traefik │ - │ (kube-system)│ - └──────┬──────┘ - │ - ┌──────────┼──────────┐ - │ │ │ - ▼ ▼ │ - ┌────────┐ ┌────────┐ │ - │ API │ │ Admin │ │ - │ :8000 │ │ :3000 │ │ - └───┬────┘ └────┬───┘ │ - │ │ │ - ┌───────┤ │ │ - │ │ │ │ - ▼ ▼ ▼ │ - ┌───────┐ ┌────────┐ ┌────────┐ │ - │ Redis │ │External│ │ API │ │ - │ :6379 │ │Services│ │(in-clr)│ │ - └───────┘ └────────┘ └────────┘ │ - ▲ │ - │ ┌────────┐ │ - └───────│ Worker │────────────┘ - └────────┘ -``` +### `K3S-F8` (Stage 2 half) — `B2_KEY_ID` / `B2_APP_KEY` in `honeydue-secrets` · ☑ · In-repo: Y +- **Where:** `02-setup-secrets.sh`. +- **Done (2026-05-16):** the script now reads `storage.b2_key_id` / `storage.b2_app_key` from `config.yaml` and adds `B2_KEY_ID` / `B2_APP_KEY` to `honeydue-secrets`. Previously the `api`/`worker` manifests referenced these keys but the script never created them — a latent deploy break. See the full `K3S-F8` entry in Stage 5. +- **Verify:** `kubectl -n honeydue get secret honeydue-secrets -o jsonpath='{.data.B2_KEY_ID}'` is non-empty. -| Policy | From | To | Ports | -|--------|------|----|-------| -| `default-deny-all` | all | all | none | -| `allow-dns` | all pods | kube-dns | 53 UDP/TCP | -| `allow-ingress-to-api` | Traefik (kube-system) | API pods | 8000 | -| `allow-ingress-to-admin` | Traefik (kube-system) | Admin pods | 3000 | -| `allow-ingress-to-redis` | API + Worker pods | Redis | 6379 | -| `allow-egress-from-api` | API pods | Redis, external (443, 5432, 587) | various | -| `allow-egress-from-worker` | Worker pods | Redis, external (443, 5432, 587) | various | -| `allow-egress-from-admin` | Admin pods | API pods (in-cluster) | 8000 | +### `K3S-F12` — Secret rotation runbook · MEDIUM · ☐ · In-repo: Y +- **Where:** new doc `docs/runbooks/secret-rotation.md`. +- **Fix:** Document per-secret rotation (Postgres, `SECRET_KEY`, APNs `.p8`, FCM, B2, observability token, Redis, admin basic-auth). Annual minimum; immediate on suspected exposure or operator-device loss. For `SECRET_KEY` (JWT signing) plan an overlap window so live tokens validate across the change. Add a `last-rotated` annotation to each secret. +- **Verify:** runbook exists and the first rotation is logged. -**Key restrictions:** -- Redis is reachable ONLY from API and Worker pods -- Admin can ONLY reach the API service (no direct DB/Redis access) -- No pod can reach private IP ranges except in-cluster services -- External egress limited to specific ports (443, 5432, 587) +### `CODE-C4` — `DEBUG_FIXED_CODES` "123456" auth bypass · CRITICAL · ☐ · In-repo: Y +- **Where:** `internal/services/auth_service.go:141-145,385-390,432-435,470-473,503-504`; config in `internal/config/config.go`. ConfigMap generated from `config.yaml` by `03-deploy.sh`. +- **Fix (two layers):** (1) Code — refuse to start if `ENV=production && DebugFixedCodes` (Stage 4 code change). (2) Config — ensure `config.yaml` never sets `DEBUG_FIXED_CODES=true` for prod, and the generated ConfigMap omits it. +- **Verify:** prod ConfigMap has no `DEBUG_FIXED_CODES`; a prod boot with the flag set fails fast. -```bash -# Verify network policies -kubectl get networkpolicy -n honeydue - -# Test: admin pod should NOT be able to reach Redis -kubectl exec -n honeydue deploy/admin -- nc -zv redis.honeydue.svc.cluster.local 6379 -# Expected: timeout/refused -``` +### `CODE-M8` — `SECRET_KEY` hardcoded debug fallback · MEDIUM · ☐ · In-repo: Y +- **Where:** `internal/config/config.go:437-442` falls back to `"change-me-in-production-secret-key-12345"`. +- **Fix:** Remove the static fallback — generate a per-boot random key in debug, and **refuse to start** in production if `SECRET_KEY` is unset. (`02-setup-secrets.sh:46-49` already enforces ≥32 chars for the real secret — keep that.) +- **Verify:** prod boot with no `SECRET_KEY` exits non-zero; the fallback string is gone from the binary. --- -## 6. Redis +## Stage 3 — Kubernetes manifests -### Authentication +Committed under `deploy-k3s/manifests/` and applied by `03-deploy.sh`. **Any +fix here is automatically re-applied on every redeploy** — the highest-value +stage for "redeploy clean." -Redis requires a password when `redis.password` is set in `config.yaml`: +### `K3S-F2` / `CODE-L6` — Wire defense-in-depth onto the admin ingress · HIGH · ☐ +- **Where:** `deploy-k3s/manifests/ingress/ingress-simple.yaml` — admin route annotation. +- **Fix:** Add `cloudflare-only` and `admin-auth` to the `traefik.ingress.kubernetes.io/router.middlewares` annotation alongside the existing `security-headers` + `rate-limit`. **Do `K3S-F3` first** or Traefik 503s the route. +- **Verify:** `04-verify.sh` "Cloudflare-Only Middleware" check passes; `admin.myhoneydue.com` prompts for basic auth. -- Password passed via `REDIS_PASSWORD` environment variable from `honeydue-secrets` -- Redis starts with `--requirepass $REDIS_PASSWORD` -- Health probes authenticate with `-a $REDIS_PASSWORD` -- Go API connects via `redis://:PASSWORD@redis.honeydue.svc.cluster.local:6379/0` +### `K3S-F6` — `imagePullSecrets` name consistency · HIGH · ☐ +- **Where:** all `deploy-k3s/manifests/*/deployment.yaml`, `migrate/job.yaml`; secret created by `02-setup-secrets.sh:111` as `ghcr-credentials`. +- **Fix:** The registry is Gitea — `ghcr-credentials` is a misleading name and the live cluster currently also has a hand-made `gitea-credentials`. Pick one name (`gitea-credentials` is clearer), use it in **both** the script and **every** manifest, and delete the orphan. The defect is a name *mismatch*, not a missing fix — make script + manifests agree so a pull never fails on a fresh node. +- **Verify:** `grep -rl imagePullSecrets deploy-k3s/manifests/` all reference one name == the script's; cordon a node, delete a pod, confirm the replacement pulls. -### Network Isolation +### `K3S-F7` — `vmagent` container `securityContext` · MEDIUM · ☐ +- **Where:** `deploy-k3s/manifests/observability/vmagent.yaml`. +- **Fix:** Add the container-level block the other 5 deployments already have: `allowPrivilegeEscalation: false`, `capabilities.drop: [ALL]`, `readOnlyRootFilesystem: true`. Its volumes (`/etc/vmagent`, `/etc/vmagent-secrets`, `/tmp/vmagent` emptyDir) already support read-only root. +- **Verify:** `04-verify.sh` "Pod Security Contexts" reports OK for `vmagent`. -- Redis has **no Ingress** — not exposed outside the cluster -- NetworkPolicy restricts access to API and Worker pods only -- Admin pods cannot reach Redis +### `K3S-F9` / `LIVE-L8` — CSP + cross-origin headers · MEDIUM / LOW · ☐ +- **Where:** Cross-origin trio → `deploy-k3s/manifests/ingress/middleware.yaml` (`security-headers`). CSP `object-src`/`base-uri` → Go app CSP middleware (Stage 4, `LIVE-L8` code half). +- **Important correction:** `K3S-F9` originally said CSP was missing. The live scan **disproved** that — the Go app sets a strong CSP via app middleware. So `K3S-F9` reduces to: add `Cross-Origin-Opener-Policy: same-origin` and `Cross-Origin-Resource-Policy: same-origin` (and `Cross-Origin-Embedder-Policy: require-corp` only if it doesn't break embeds) to `security-headers`. The CSP `object-src 'none'; base-uri 'self'` additions belong in the app and are tracked under `LIVE-L8` in Stage 4. +- **Verify:** `curl -sI https://api.myhoneydue.com/api/health/ | grep -i cross-origin` shows COOP/CORP. -### Memory Limits +### `K3S-F10` / `LIVE-L12` — Auth-endpoint rate-limit middleware · MEDIUM / HIGH · ☐ +- **Where:** `deploy-k3s/manifests/ingress/middleware.yaml` (new `auth-rate-limit` Middleware) + `ingress/ingress-simple.yaml`. Requires migrating the auth paths from vanilla `Ingress` to a Traefik `IngressRoute` to apply a per-path middleware. +- **Fix:** New Middleware `average: 5, burst: 10, period: 1m, sourceCriterion.ipStrategy.depth: 2` (depth 2 for the Cloudflare hop). Apply to `/api/auth/login`, `/api/auth/register`, `/api/auth/forgot-password`, `/api/auth/reset-password`, `/api/residences/join-with-code`. This is the **edge** half; the **app** half is `CODE-H1/H2/H3/M5` in Stage 4 (per-account lockout in Redis). Do both — edge limit alone resets on IP rotation. +- **Verify:** 10 rapid logins from one IP → `429`. -- `--maxmemory 256mb` — hard cap on Redis memory -- `--maxmemory-policy noeviction` — returns errors rather than silently evicting data -- K8s resource limit: 512Mi (headroom for AOF rewrite) +### `K3S-F11` — Disable `automountServiceAccountToken` · MEDIUM · ☐ +- **Where:** `deploy-k3s/manifests/rbac.yaml` (ServiceAccounts) and/or each `*/deployment.yaml` pod spec. +- **Fix:** Set `automountServiceAccountToken: false` on `api`, `admin`, `worker`, `web`, `redis`. Leave `true` only for `vmagent` (it uses the k8s API for service discovery). **Note:** `05-security.md` claims this is already set — the audit (`F11`) says it is not. Treat the audit as ground truth; this fix makes the doc true. +- **Verify:** `kubectl -n honeydue get pod -o jsonpath='{.spec.automountServiceAccountToken}'` → `false`; no token file in the container. -### Dangerous Command Renaming (Optional Enhancement) +### `K3S-F13` — Add `app.myhoneydue.com` to CORS · LOW · ☐ +- **Where:** `CORS_ALLOWED_ORIGINS` in `config.yaml` → generated into `honeydue-config` ConfigMap by `03-deploy.sh`. +- **Fix:** Confirm whether the web app calls `api.myhoneydue.com` directly from the browser. If yes, add `https://app.myhoneydue.com` to `CORS_ALLOWED_ORIGINS`. If it proxies through Next.js server-side, CORS is moot — record that decision here instead. +- **Verify:** browser fetch from `app.myhoneydue.com` to the API succeeds (or the proxy decision is documented). -For additional protection, rename dangerous commands in a custom `redis.conf`: +### `K3S-F14` — Pin public images by digest · LOW · ☐ +- **Where:** `redis/deployment.yaml` (`redis:7-alpine`), `observability/vmagent.yaml` (`victoriametrics/vmagent:v1.106.1`). +- **Fix:** Replace tags with `@sha256:` digests. Folded into the `K3S-F5` CI work (Stage 5). +- **Verify:** manifests contain no public-image tag without a digest. -``` -rename-command FLUSHDB "" -rename-command FLUSHALL "" -rename-command DEBUG "" -rename-command CONFIG "HONEYDUE_CONFIG_a7f3b" -``` +### `LIVE-L5` / `CODE-L3` — HSTS `preload` · LOW · ☐ +- **Where:** `deploy-k3s/manifests/ingress/middleware.yaml` `security-headers` HSTS value. +- **Fix:** Change to `max-age=63072000; includeSubDomains; preload`. Confirm api/admin/app all work fully over HTTPS, then submit to `hstspreload.org` (the submission is the Stage 0 external half — once preloaded you cannot easily downgrade for ~6 months). +- **Verify:** response header shows `preload`; domain accepted at hstspreload.org. -```bash -# Verify Redis auth is required -kubectl exec -n honeydue deploy/redis -- redis-cli ping -# Expected: (error) NOAUTH Authentication required. +### `LIVE-L7` — Drop deprecated `X-XSS-Protection` · LOW · ☐ +- **Where:** `deploy-k3s/manifests/ingress/middleware.yaml` `security-headers` (`browserXssFilter: true` / `customResponseHeaders`). +- **Fix:** Remove the header or set `X-XSS-Protection: "0"`. Modern browsers ignore it; legacy filter bypass has caused XSS. +- **Verify:** header absent or `0` on all three hosts. -kubectl exec -n honeydue deploy/redis -- redis-cli -a "$REDIS_PASSWORD" ping -# Expected: PONG -``` +### `CODE-L4` — Set `imagePullPolicy` · LOW · ☐ +- **Where:** all `deploy-k3s/manifests/*/deployment.yaml`. +- **Fix:** Set `imagePullPolicy` explicitly. Once images are digest-pinned (`K3S-F5`), `IfNotPresent` is correct and avoids needless re-pulls; until then `Always` avoids stale tags. Pick the policy that matches the `K3S-F5` rollout state. +- **Verify:** every container has an explicit `imagePullPolicy`. --- -## 7. PostgreSQL (Neon) +## Stage 4 — Application code & container images -### Connection Security +Fixes in `honeyDueAPI-go` source (and the admin/web Dockerfiles). They reach +production by **rebuilding the image** in `03-deploy.sh`; schema-changing +fixes (`CODE-C1`, `CODE-C5/6`, `CODE-C11`, `CODE-C12`) also need a **goose +migration**, which the migrate `Job` runs automatically before the +api/worker roll. Per repo rule: do not auto-commit — these are code changes; +this section is the plan, not the patch. -- **SSL required**: `sslmode=require` in connection string -- **Connection limits**: `max_open_conns=25`, `max_idle_conns=10` -- **Scoped credentials**: Database user has access only to `honeydue` database -- **Password rotation**: Change in Neon dashboard, update `secrets/postgres_password.txt`, re-run `02-setup-secrets.sh` +### Critical (C1–C13) -### Access Control +#### `CODE-C1` — Plaintext auth tokens in DB · ☑ (2026-05-15) +- **Where:** `internal/models/user.go`, `internal/repositories/user_repo.go`, `internal/middleware/auth.go`, `internal/services/cache_service.go`, `internal/services/auth_service.go`, migration `000003_hash_auth_tokens.sql`. +- **Done:** `user_authtoken.key` now stores `models.HashToken()` — the hex SHA-256 of the token — never the raw value. The raw token reaches the client once (the non-persisted `AuthToken.Plaintext` field) and is re-hashed on every request before the DB and Redis lookup, so the single indexed JOIN query in the auth middleware is preserved. A fast hash (not bcrypt) is correct here — tokens are 160-bit random values, nothing to brute-force. Migration `000003` widens the column 40→64 and clears existing rows. +- **Behaviour change:** the server can no longer re-issue a stored token's plaintext, so every login mints a fresh token via `CreateFreshToken` (delete + create). With the existing one-token-per-user schema this means **one active session per user** — logging in on a new device invalidates the previous device's token. The migration also invalidates all sessions once, at deploy. +- **Verify:** `SELECT key FROM user_authtoken LIMIT 1` → 64-char hash; `go build ./...` and `go test ./internal/{models,repositories,middleware,handlers}/...` pass. -- Only API and Worker pods have egress to port 5432 (NetworkPolicy enforced) -- Admin pods cannot reach the database directly -- Redis pods have no external egress +#### `CODE-C2` / `CODE-C3` — Google ID token not verified locally · ☑ (2026-05-15) +- **Where:** `internal/services/google_auth.go` (full rewrite). +- **Done:** `VerifyIDToken` no longer calls the deprecated `tokeninfo` URL (which leaked the token in the query string and made verification depend on a third party). It now parses the JWT, fetches Google's JWKS from `googleapis.com/oauth2/v3/certs` (Redis-cached 24h, re-fetched on a `kid` miss), verifies the `RS256` signature locally, and asserts `iss ∈ {accounts.google.com, https://accounts.google.com}` (C3), `aud`/`azp` against the configured client IDs, and `exp` (validated by jwt v5). Mirrors the existing Apple JWKS verifier. `GoogleSignIn` is unchanged — the returned `GoogleTokenInfo` shape is preserved. +- **Verify:** `go build ./...` clean; `internal/services` tests pass. -```bash -# Verify only API/Worker can reach Neon -kubectl exec -n honeydue deploy/admin -- nc -zv ep-xxx.us-east-2.aws.neon.tech 5432 -# Expected: timeout (blocked by network policy) -``` +#### `CODE-C5` / `CODE-C6` — IAP receipt / purchase-token replay · ☐ +- **Where:** `internal/services/subscription_service.go` (`ProcessApplePurchase`, `ProcessGooglePurchase`). +- **Fix:** Goose migration adding `UNIQUE(provider, original_transaction_id)`. On purchase, if the transaction ID is already bound to a different `user_id` → `403`. +- **Verify:** re-submitting a valid receipt against a second account → `403`; DB has no duplicate. -### Query Safety +#### `CODE-C7` — File-ownership check excludes residence owners · ☐ +- **Where:** `internal/services/file_ownership_service.go:20-66`. +- **Fix:** Replace the three `residence_residence_users`-only JOINs with the canonical owner-OR-member UNION from `residence_repo.HasAccess` (owners live in `residence_residence.owner_id`). +- **Verify:** a residence owner can delete a file in their own property; a non-member still gets `403`. -- GORM uses parameterized queries (SQL injection prevention) -- No raw SQL in handlers — all queries go through repositories -- Decimal fields use `shopspring/decimal` (no floating-point errors) +#### `CODE-C8` — Device-token cross-account hijack · ☐ +- **Where:** `internal/services/notification_service.go:307-319` (APNS), `:336-349` (GCM). +- **Fix:** On re-register of an existing token, if `existing.UserID != nil && *existing.UserID != userID` → `409 Conflict`. Only same-user updates allowed. +- **Verify:** registering another user's known token → `409`; that user's push traffic is unaffected. + +#### `CODE-C9` / `CODE-H9` — Share-code join not atomic · ☐ +- **Where:** `internal/services/residence_service.go:562-615` (`:594-599` swallows the deactivate error). +- **Fix:** Wrap `JoinWithCode` in one transaction with `SELECT … FOR UPDATE` on the share-code row; **fail the join if deactivation fails** (do not log-and-continue). +- **Verify:** concurrent redemptions of a single-use code → exactly one succeeds; a forced deactivate error rolls the whole join back. + +#### `CODE-C10` — Subscription upgrade race · ☐ +- **Where:** `internal/services/subscription_service.go:404-459`; webhook handler `:136-213`. +- **Fix:** Move Apple validation inside the row-locked transaction, or add an idempotency-key table so the validate→write window can't be raced. +- **Verify:** two concurrent upgrades for one user → one tier change, not two. + +#### `CODE-C11` — Task-completion duplicate-row race · ☐ +- **Where:** `internal/services/task_service.go:631-750`. +- **Fix:** `SELECT … FOR UPDATE` on the task in `CreateCompletion`; goose migration adding `UNIQUE(task_id, completed_date)`. +- **Verify:** double-tap "complete" → one completion row. + +#### `CODE-C12` — Soft-deleted email reusable · ☐ +- **Where:** `internal/services/auth_service.go:274-324`; `internal/repositories/user_repo.go` (`FindByEmail`, `ExistsByEmail`). +- **Fix:** On delete, mangle the email (`deleted__`); add `is_active = true` filtering consistently to `FindByEmail`/`ExistsByEmail`. +- **Verify:** registering with a soft-deleted account's email is rejected; no cross-account takeover. + +#### `CODE-C13` — Apple webhook user lookup may LIKE-match · ☐ +- **Where:** `internal/handlers/subscription_webhook_handler.go:354-366` (`FindByAppleReceiptContains`). +- **Fix:** Confirm the SQL is an equality match, not `LIKE`. If `LIKE`, this is a confirmed Critical — change to equality and rename the function. See `V8`. +- **Verify:** the query is parameterized equality; rename merged. + +### High (H1–H9) + +#### `CODE-H1` / `CODE-H2` / `CODE-H3` / `CODE-M5` — Rate limiting gaps · ☐ +- **Where:** `internal/router/router.go` (`:520` login limiter, `:593` `join-with-code` unprotected), `internal/middleware/rate_limit.go`, `internal/handlers/auth_handler.go`. +- **Fix:** Extend rate limiting to `register`, `join-with-code`, Apple/Google sign-in, and token refresh. Add a per-account login-attempt counter in Redis (lock after 5–10 fails for 15–60 min). This is the **app** half of the consolidated auth-rate-limit item; the **edge** half is `K3S-F10`. +- **Verify:** rapid attempts on every auth route throttle; per-account lockout fires regardless of source IP. + +#### `CODE-H4` — Modulo bias in 6-digit codes · ☐ +- **Where:** `internal/services/auth_service.go:884-892`. +- **Fix:** Replace `int32 % 1000000` with rejection sampling on `crypto/rand` for a uniform `000000–999999`. +- **Verify:** distribution test over many samples is uniform. + +#### `CODE-H5` — Apple IAP `.p8` file-mode unchecked · ☐ +- **Where:** `internal/services/iap_validation.go:93-128`, `internal/config/config.go:325`. +- **Fix:** Prefer a base64 env-injected PEM. If a file path is kept, refuse to start when the file mode is more permissive than `0600`. +- **Verify:** boot fails on a `0644` key file; succeeds on `0600`. + +#### `CODE-H6` — Webhook dedup fail-open · ☐ +- **Where:** `internal/handlers/subscription_webhook_handler.go:165-173` (Apple), `:564-574` (Google). +- **Fix:** Fail **closed** — if `webhookEventRepo.HasProcessed` errors, return `500` so Apple/Google retry, rather than processing (which risks duplicate refunds). +- **Verify:** simulated dedup-check DB error → `500`, no double-processing. + +#### `CODE-H7` — Auth-failure log lacks IP/UA · ☐ +- **Where:** `internal/handlers/auth_handler.go:70`. +- **Fix:** Add `c.RealIP()` + `User-Agent` to the structured failure log line (the audit log captures them; the request-line log does not). Depends on `V10` (RealIP trust). +- **Verify:** a failed login log line carries IP + UA. + +#### `CODE-H8` — `X-Timezone` header trusted for trial start · ☐ +- **Where:** `internal/middleware/timezone.go:40-71` → `internal/services/subscription_service.go:145-150`. +- **Fix:** Validate `X-Timezone` against IANA `LoadLocation`, cap to ±14h; use server UTC for trial-start / billing-window math regardless. +- **Verify:** a bogus/extreme `X-Timezone` cannot shift trial start. + +### Medium (M1–M13) + +#### `CODE-M1` — Header injection via `Content-Disposition` filename · ☐ +- **Where:** `internal/handlers/media_handler.go:74,117,165`. +- **Fix:** Sanitize `doc.FileName` — strip CR/LF/quote/null, or emit RFC 5987 `filename*=UTF-8''…`. +- **Verify:** an upload with CRLF in the filename does not split the response. + +#### `CODE-M2` — bcrypt cost 10 → 12 · ☐ +- **Where:** `internal/models/user.go:47`, `internal/services/auth_service.go:479`. +- **Fix:** Make the cost config-driven, default 12. +- **Verify:** new hashes are `$2a$12$`. + +#### `CODE-M3` — Apple Sign In nonce not validated · ☐ +- **Where:** `internal/services/apple_auth.go`. +- **Fix:** Generate, store, and verify the nonce round-trip on Apple sign-in. +- **Verify:** a replayed/mismatched nonce is rejected. + +#### `CODE-M4` — Email verification not atomic · ☐ +- **Where:** `internal/services/auth_service.go:373-415`. +- **Fix:** Wrap verify in a transaction so a concurrent request can't double-apply. +- **Verify:** concurrent verify calls → one state transition. + +#### `CODE-M6` / `LIVE-L16` — Uncapped list / pagination · ☐ +- **Where:** `ListDocuments`, `ListContractors`, `ListResidences` handlers; pagination parsing. +- **Fix:** Clamp `limit` server-side to ≤100 (`< 1` → default 25). Notifications already caps at 200 — match the pattern. +- **Verify:** `?limit=999999` returns ≤100 rows. + +#### `CODE-M7` — Audit log not append-only · ☐ +- **Where:** audit-log model / repository. +- **Fix:** Make it append-only — a DB trigger forbidding `UPDATE`/`DELETE`, or move to an event store. Remove the soft-delete column. +- **Verify:** an `UPDATE`/`DELETE` on the audit table is rejected. + +#### `CODE-M11` — `golang.org/x/crypto` outdated · ☐ +- **Where:** `go.mod:30` (`v0.49.0`). +- **Fix:** `go get -u golang.org/x/crypto`, re-run `govulncheck`, retest. Pairs with Stage 5 dependency automation. +- **Verify:** `govulncheck ./...` clean. + +#### `CODE-M12` — Contractor toggle refetch race · ☐ +- **Where:** `internal/services/contractor_service.go:279-307`. +- **Fix:** Do the toggle + read in one transaction so a concurrent soft-delete can't make it return `nil`. +- **Verify:** concurrent toggle + delete → defined result, no nil panic. + +#### `CODE-M13` — Account-deletion endpoint unrate-limited · ☐ +- **Where:** `internal/handlers/auth_handler.go:488-539`. +- **Fix:** Add a throttle to `DELETE /account`. **First resolve `V11`** — `LIVE-L18` claims no delete endpoint exists; reconcile before deciding whether this is "rate-limit it" or "expose it." +- **Verify:** repeated delete calls throttle. + +#### `CODE-M10` — `node:20-alpine` floating tag · ☐ +- **Where:** admin/web `Dockerfile` (`:2,112,134`). +- **Fix:** Pin to a specific patch version or digest. +- **Verify:** Dockerfile has no bare `node:20-alpine`. + +### Low / Info (CODE-L1, L2) + +#### `CODE-L1` — Inactive-account login enumeration · ☐ +- **Where:** `internal/services/auth_service.go:76-77`. +- **Fix:** Return the same generic error for inactive accounts as for invalid credentials. +- **Verify:** inactive vs. wrong-password responses are byte-identical. + +#### `CODE-L2` — Auth responses lack `Cache-Control: no-store` · ☐ +- **Where:** `internal/handlers/auth_handler.go` (Login / CurrentUser / Refresh). +- **Fix:** Set `Cache-Control: no-store` on auth responses. +- **Verify:** the header is present. + +### Live-scan code-level findings (LIVE-L1, L11–L20) + +#### `LIVE-L1` — `/metrics` publicly exposed · HIGH · ☐ +- **Where:** `cmd/api/main.go` route registration; vmagent scrapes it cluster-internally already. +- **Fix (recommended — Option B):** bind Prometheus metrics to a separate cluster-internal port (e.g. `:9090`), expose only via a ClusterIP Service the vmagent NetworkPolicy allows; the public Ingress never registers `/metrics`. Update `observability/vmagent.yaml` scrape target. (Alternative: block `/metrics` at Traefik via an `IngressRoute` — Stage 3.) +- **Verify:** `curl https://api.myhoneydue.com/metrics` → `404`; vmagent still scrapes successfully. + +#### `LIVE-L11` — Login user-enumeration via timing · HIGH · ☐ +- **Where:** login handler / `auth_service.go`. +- **Fix:** Always run a bcrypt compare against a fixed dummy hash when the user is not found, so the response time is constant. +- **Verify:** real vs. fake email login timing delta < network noise. + +#### `LIVE-L12` — No rate-limit on login · HIGH · ☐ +- See the consolidated auth-rate-limit item: `K3S-F10` (edge) + `CODE-H1/H2/H3/M5` (app). Closed when both land. + +#### `LIVE-L13` — Password-reset timing enumeration · HIGH · ☐ +- **Where:** `forgot-password` handler. +- **Fix:** Enqueue the reset email on the Asynq queue and return the generic response immediately, so real vs. fake emails have identical latency. +- **Verify:** real vs. fake email reset timing delta < network noise. + +#### `LIVE-L14` / `LIVE-L15` — Sequential integer IDs · MEDIUM · ⊘ (deferred) +- **Where:** all user-facing IDs. +- **Decision:** Real enumeration/intel leak, but migrating to UUID/ULID touches API, web, mobile, and webhook payloads. **Deferred to a planned quarter** — not a redeploy-stage fix. Track on the roadmap; revisit before the userbase size becomes commercially sensitive. + +#### `LIVE-L16` — Pagination `limit` uncapped · MEDIUM · ☐ +- Duplicate of `CODE-M6` — closed with it. + +#### `LIVE-L17` — Garbage pagination params silently accepted · LOW · ☐ +- **Where:** query-param parsing in list handlers. +- **Fix:** Return `400` naming the bad parameter instead of silently using defaults. +- **Verify:** `?limit=abc` → `400`. + +#### `LIVE-L18` — No account-deletion endpoint (GDPR) · LOW · ☐ +- **Where:** `internal/router/router.go`, `internal/handlers/auth_handler.go`. +- **Fix:** Reconcile with `CODE-M13` first (`V11`). Provide `DELETE /api/auth/me/` that anonymizes PII, cascades/transfers residences, revokes tokens, and writes an audit-trail row. Also closes the throwaway-account cleanup gap the live scan left behind. +- **Verify:** an authenticated user can delete their own account; PII is anonymized. + +#### `LIVE-L19` — Email verification not enforced · LOW · ☐ +- **Where:** router middleware. +- **Fix:** Add a `RequireVerified()` middleware on sensitive routes (share-code generation/redemption, anything that emails other users), or cap unverified accounts (1 residence, no share codes) until verified. +- **Verify:** an unverified account is blocked from the chosen gated routes. + +#### `LIVE-L20` — Profile-update silently drops unknown fields · INFO · ☐ +- **Where:** `PATCH /api/auth/profile/` handler. +- **Fix:** Either accept the fields (if intended) or return `400` listing unsupported keys — don't silently `200`. +- **Verify:** an unknown field yields a clear response. + +#### `LIVE-L10` — `x-powered-by` — see Stage 0 (Next.js config). --- -## 8. Cloudflare +## Stage 5 — CI / build pipeline -### TLS Configuration +Build-time controls. Where there is no CI pipeline file yet, the fix is to +add one (or a `03-deploy.sh` step) so the control runs on every build. -- **Mode**: Full (Strict) — Cloudflare validates the origin certificate -- **Origin cert**: Stored as K8s Secret `cloudflare-origin-cert` -- **Minimum TLS**: 1.2 (set in Cloudflare dashboard) -- **HSTS**: Enabled via security headers middleware +### `K3S-F5` / `K3S-F14` / `CODE-L4` — Pin images by digest · HIGH · ☐ +- **Where:** `03-deploy.sh` (currently tags by git short SHA, lines 47/57-61, and also pushes `:latest`), all `deploy-k3s/manifests/*/deployment.yaml`. +- **Fix:** After `docker push`, capture the digest (`crane digest …` or parse `docker push` output) and substitute `@sha256:…` into the manifests instead of `IMAGE_PLACEHOLDER` tags. Pin `redis` and `vmagent` by digest too. Reconsider pushing `:latest` — a mutable `:latest` undercuts digest pinning. +- **Verify:** `kubectl -n honeydue get deploy -o jsonpath` shows every image as `@sha256:`. -### Origin Lockdown +### `K3S-F8` — Secrets as file mounts, not env vars · MEDIUM · ☑ · In-repo: Y +- **Where:** `api`/`worker` `deployment.yaml`, `internal/config/config.go`, `cmd/api/main.go`, `cmd/worker/main.go`, `02-setup-secrets.sh`. +- **Done (2026-05-16):** + - `config.loadFileSecrets()` reads each of the 9 secret keys (`POSTGRES_PASSWORD`, `SECRET_KEY`, `EMAIL_HOST_PASSWORD`, `FCM_SERVER_KEY`, `REDIS_PASSWORD`, `B2_KEY_ID`, `B2_APP_KEY`, `OBS_INGEST_TOKEN`, `OBS_TRACES_URL`) from `/etc/honeydue/secrets/` and `viper.Set`s it (highest precedence). A missing file is a silent skip, so the same binary still works from env vars in local/dev. + - `api`/`worker` `deployment.yaml` no longer inject **any** secret as an `env: secretKeyRef`. `honeydue-secrets` is mounted as a volume (`defaultMode: 0400`), read-only, at `/etc/honeydue/secrets`. Non-secret config still arrives via `envFrom: configMapRef`. + - `cmd/api`/`cmd/worker` read the observability endpoints through the new `config.SecretValue()` (Viper-backed) instead of `os.Getenv`, so file-mounted `OBS_*` values resolve now that they are gone from the environment. + - `02-setup-secrets.sh` now also writes `B2_KEY_ID`/`B2_APP_KEY` into `honeydue-secrets` — reconciling the script-vs-manifest drift (the manifests referenced these keys but the script never created them). +- **Scoped exception:** the one-shot `honeydue-migrate` Job still takes `POSTGRES_PASSWORD` as an env var. goose is invoked as a CLI with the password inside the DSN argument, so the value is exposed in that process regardless of env-vs-file; the Job is transient (one run, seconds, pod GC'd) so this is accepted. +- **Verify:** `kubectl -n honeydue exec deploy/api -- env` shows no `POSTGRES_PASSWORD`/`SECRET_KEY`; `kubectl -n honeydue exec deploy/api -- ls /etc/honeydue/secrets` lists the key files. -The `cloudflare-only` Traefik middleware restricts all ingress to Cloudflare IP ranges only. Direct requests to the origin IP are rejected with 403. +### `CODE-L5` — Image signing + scanning · LOW · ◐ · In-repo: Y +- **Where:** `03-deploy.sh`, `deploy-k3s/manifests/kyverno-verify-images.yaml`. +- **Done (in-repo, 2026-05-16):** + - `03-deploy.sh` runs `cosign sign` after each push and a `trivy image --severity HIGH,CRITICAL` scan before push — both **guarded**: they no-op when the tool is absent, so they never break a deploy on a host without them. + - A ready-to-use Kyverno `ClusterPolicy` ships at `deploy-k3s/manifests/kyverno-verify-images.yaml`. It matches only the four `gitea.treytartt.com/admin/honeydue-*` images, starts in `Audit` mode, and is **intentionally not applied by `03-deploy.sh`** — applying a verify-images policy with no key would block every Pod from scheduling. +- **Remaining (operator — cannot be committed):** + 1. Install Kyverno in the cluster (admission controller). + 2. `cosign generate-key-pair`; set `COSIGN_KEY` in the deploy env so signing activates; paste `cosign.pub` into the policy's `publicKeys` block. + 3. `kubectl apply -f deploy-k3s/manifests/kyverno-verify-images.yaml`, confirm Pods still schedule, then flip `validationFailureAction: Audit → Enforce`. +- **Verify:** an unsigned image is rejected by admission; `03-deploy.sh` fails on a HIGH/CRITICAL CVE. -```bash -# Test: direct request to origin should fail -curl -k https://ORIGIN_IP/api/health/ -# Expected: 403 Forbidden - -# Test: request through Cloudflare should work -curl https://api.myhoneydue.com/api/health/ -# Expected: 200 OK -``` - -### Cloudflare IP Range Updates - -Cloudflare IP ranges change infrequently but should be checked periodically: - -```bash -# Compare current ranges with deployed middleware -diff <(curl -s https://www.cloudflare.com/ips-v4; curl -s https://www.cloudflare.com/ips-v6) \ - <(kubectl get middleware cloudflare-only -n honeydue -o jsonpath='{.spec.ipAllowList.sourceRange[*]}' | tr ' ' '\n') -``` - -### WAF & Rate Limiting - -- **Cloudflare WAF**: Enable managed rulesets in dashboard (OWASP Core, Cloudflare Specials) -- **Rate limiting**: Traefik middleware (100 req/min, burst 200) + Go API auth rate limiting -- **Bot management**: Enable in Cloudflare dashboard for API routes - -### Security Headers - -Applied via Traefik middleware to all responses: - -| Header | Value | -|--------|-------| -| `Strict-Transport-Security` | `max-age=31536000; includeSubDomains` | -| `X-Frame-Options` | `DENY` | -| `X-Content-Type-Options` | `nosniff` | -| `X-XSS-Protection` | `1; mode=block` | -| `Referrer-Policy` | `strict-origin-when-cross-origin` | -| `Content-Security-Policy` | `default-src 'self'; frame-ancestors 'none'` | -| `Permissions-Policy` | `camera=(), microphone=(), geolocation=()` | -| `X-Permitted-Cross-Domain-Policies` | `none` | +### `CODE-M11` (CI half) — Dependency hygiene · ☐ +- **Fix:** Add scheduled `go get -u` + `govulncheck` (the audit confirms `govulncheck` + `gitleaks` already run in CI — extend with a dependency-update cadence). +- **Verify:** stale-dependency alerts surface automatically. --- -## 9. Container Images +## Stage 6 — Post-deploy verification & runtime investigations -### Build Security +`04-verify.sh` already runs a security block (secret encryption, NetworkPolicy +count, ServiceAccounts, pod security contexts, PDBs, `cloudflare-only` +middleware, `admin-basic-auth`). **Extend it so each fix above stays fixed, +and work the open investigations the audits could not resolve.** -- **Multi-stage builds**: Build stage discarded, only runtime artifacts copied -- **Alpine base**: Minimal attack surface (~5MB base) -- **Non-root users**: `app:1000` (Go), `nextjs:1001` (admin) -- **Stripped binaries**: Go binaries built with `-ldflags "-s -w"` (no debug symbols) -- **No shell in final image** (Go containers): Only the binary + CA certs +### Extend `04-verify.sh` with assertions for · ☐ +- Redis rejects unauthenticated `PING` (`K3S-F1`). +- Admin ingress annotation contains `admin-auth` (`K3S-F2`). +- `/metrics` returns `404` on the public host (`LIVE-L1`). +- Every container (incl. `vmagent`) has a full `securityContext` (`K3S-F7`). +- `automountServiceAccountToken: false` on app pods (`K3S-F11`). +- Every workload image is digest-pinned (`K3S-F5`). +- No `DEBUG_FIXED_CODES` key in the prod ConfigMap (`CODE-C4`). -### Image Scanning (Recommended) +### Runtime investigations (cannot be closed by code review alone) -Add image scanning to CI/CD before pushing to GHCR: - -```bash -# Trivy scan (run in CI) -trivy image --severity HIGH,CRITICAL --exit-code 1 ghcr.io/NAMESPACE/honeydue-api:latest - -# Grype alternative -grype ghcr.io/NAMESPACE/honeydue-api:latest --fail-on high -``` - -### Version Pinning - -- Redis image: `redis:7-alpine` (pin to specific tag in production, e.g., `redis:7.4.2-alpine`) -- Go base: pinned in Dockerfile -- Node base: pinned in admin Dockerfile +| ID | Item | Source | Action | +|---|---|---|---| +| `V1` | Apple/Google Sign-In token validation depth | LIVE | Test with a self-signed Apple identity token; confirm signature/aud/nonce checks | +| `V2` | Webhook signature verification — confirm webhook routes are **outside** the auth middleware in `router.go` (live scan saw `401`s, signature middleware may never run) | LIVE | Code-review `internal/router/router.go` | +| `V3` | File-upload security — locate upload paths, test polyglots / MIME bypass / path traversal in filename / oversized files | LIVE | Focused upload security test | +| `V4` | Long-term token validity / revocation behaviour | LIVE | Test token expiry + revocation over time | +| `V5` | Apple IAP receipt validation with a real sandbox StoreKit receipt | LIVE | Sandbox test | +| `V6` | Share-code system — find the endpoint path; test brute-force, single-use, expiration | LIVE | Locate + test | +| `V7` | Trial-expiration enforcement — age a test account past 14 days, confirm `limitations_enabled` flips and creation gates fire | LIVE | Aged-account test | +| `V8` | `FindByAppleReceiptContains` — confirm equality, not `LIKE`. If `LIKE`, escalate `CODE-C13` to confirmed Critical | CODE | SQL review | +| `V9` | Rate-limiter storage — confirm `rate_limit.go` is Redis-backed (shared across 3 api replicas); in-memory = 3× the intended limit | CODE | Code review | +| `V10` | `X-Forwarded-For` / Echo `RealIP` trust behind Traefik — without it per-IP limits collapse to the ingress IP | CODE | Code + Traefik config review | +| `V11` | Account-deletion contradiction — `LIVE-L18` (no endpoint) vs `CODE-M13` (endpoint at `auth_handler.go:488-539`). Resolve before Stage 4 planning | LIVE/CODE | Route review | +| `V12` | etcd encryption — `04-verify.sh` only greps a string; truly confirm with `k3s secrets-encrypt status` on each server node | K3S | SSH check | +| `V13` | `user_authtoken` index — confirm a `user_id` lookup index exists before hashing tokens at rest (`CODE-C1`) | CODE | Schema check | --- -## 10. Secrets Management +## Accepted risks / deferred (this cycle) -### At-Rest Encryption +| ID | Item | Rationale | +|---|---|---| +| `K3S-F15` | Public-IP nodes, no VPC | Re-provision-scale change; Hetzner firewall (`K3S-CG3`) is the compensating control. Roadmap. | +| `K3S-F16` | Combined control-plane/worker nodes | Standard small-cluster k3s; revisit on workload growth. | +| `LIVE-L14`/`L15` | Sequential integer IDs | UUID migration spans API + web + mobile + webhooks; planned quarter, not this cycle. | -K3s encrypts all Secret resources in etcd with AES-CBC (`--secrets-encryption` flag). - -### Secret Inventory - -| Secret | Contains | Rotation Procedure | -|--------|----------|--------------------| -| `honeydue-secrets` | DB password, SECRET_KEY, SMTP password, FCM key, Redis password | Update source files + re-run `02-setup-secrets.sh` | -| `honeydue-apns-key` | APNs .p8 private key | Replace file + re-run `02-setup-secrets.sh` | -| `cloudflare-origin-cert` | TLS cert + key | Regenerate in Cloudflare + re-run `02-setup-secrets.sh` | -| `ghcr-credentials` | Registry PAT | Regenerate GitHub PAT + re-run `02-setup-secrets.sh` | -| `admin-basic-auth` | htpasswd hash | Update config.yaml + re-run `02-setup-secrets.sh` | - -### Rotation Procedure - -```bash -# 1. Update the secret source (file or config.yaml value) -# 2. Re-run the secrets script -./scripts/02-setup-secrets.sh - -# 3. Restart affected pods to pick up new secret values -kubectl rollout restart deployment/api deployment/worker -n honeydue - -# 4. Verify pods are healthy -kubectl get pods -n honeydue -w -``` - -### Secret Hygiene - -- `secrets/` directory is gitignored — never committed -- `config.yaml` is gitignored — never committed -- Scripts validate secret files exist and aren't empty before creating K8s secrets -- `SECRET_KEY` requires minimum 32 characters -- ConfigMap redacts sensitive values in `04-verify.sh` output +Mirror these in `docs/deployment/20-roadmap.md` so they are not silently lost. --- -## 11. B2 Object Storage +## Documentation drift corrected alongside this plan -### Access Control +The audits contradicted the existing deployment book. These corrections ship +with this plan so the docs match audited reality: -- **Scoped application key**: Create a B2 key with access to only the `honeydue` bucket -- **Permissions**: Read + Write only (no `deleteFiles`, no `listAllBucketNames`) -- **Bucket-only**: Key cannot access other buckets in the account - -```bash -# Create scoped B2 key (Backblaze CLI) -b2 create-key --bucket BUCKET_NAME honeydue-api readFiles,writeFiles,listFiles -``` - -### Upload Validation (Go API) - -- File size limit: `STORAGE_MAX_FILE_SIZE` (10MB default) -- Allowed MIME types: `STORAGE_ALLOWED_TYPES` (images + PDF only) -- Path traversal protection in upload handler -- Files served via authenticated proxy (`media_handler`) — no direct B2 URLs exposed to clients - -### Versioning - -Enable B2 bucket versioning to protect against accidental deletion: - -```bash -# Enable versioning on the B2 bucket -b2 update-bucket --versioning enabled BUCKET_NAME -``` +| Doc | Claimed | Reality (audit) | Action | +|---|---|---|---| +| `05-security.md` | `automountServiceAccountToken: false` set | `K3S-F11`: not set on any workload | Corrected to "TODO" + linked here | +| `05-security.md` | NetworkPolicies "not currently applied" (TODO) | Applied 2026-04-24; `03-deploy.sh:155` applies them | Corrected to "applied" | +| `05-security.md` | CF↔origin is plaintext (SSL=Flexible) | Upgraded to Full (strict) 2026-04-24 | Corrected | +| `05-security.md` | SHA tags immutable / "we'd notice a digest change" | `K3S-F5`: short SHA tags are mutable | Corrected; points to `K3S-F5` | +| `SECURITY.md` (old) | Redis "requires a password" | `K3S-F1`: no auth | This rewrite | +| `SECURITY.md` (old) | etcd `secrets-encryption: true` | `K3S-CG1`: not verified / not on | This rewrite | +| `SECURITY.md` (old) | fail2ban active | `05-security.md` + `K3S-CG2`: not installed | This rewrite | +| `20-roadmap.md` | — | Audit findings not represented | Audit items folded in | --- -## 12. Monitoring & Alerting +## Hardened-redeploy checklist (run order) -### Log Aggregation +A clean rebuild of the whole stack, with every fix above applied: -K3s logs are available via `kubectl logs`. For persistent log aggregation: - -```bash -# View API logs -kubectl logs -n honeydue -l app.kubernetes.io/name=api --tail=100 -f - -# View worker logs -kubectl logs -n honeydue -l app.kubernetes.io/name=worker --tail=100 -f - -# View all warning events -kubectl get events -n honeydue --field-selector type=Warning --sort-by='.lastTimestamp' +``` +□ Stage 0 DNS once-off: DMARC, SPF, CAA at Cloudflare; security.txt route live +□ Stage 1 Provision: hetzner-k3s config carries --write-kubeconfig-mode=600 + and --secrets-encryption; run 01-provision-cluster.sh +□ Stage 1 Node OS: fail2ban + unattended-upgrades + SSH/sysctl on each node +□ Stage 1 Verify cluster: K3S-CG3..CG8 (firewall, snapshots, kubelet, perms) +□ Stage 2 Config: config.yaml has redis.password + admin.basic_auth_*; + no DEBUG_FIXED_CODES; SECRET_KEY ≥32 chars +□ Stage 2 Secrets: run 02-setup-secrets.sh — confirm redis + admin-basic-auth +□ Stage 3 Manifests: admin ingress middlewares wired; imagePullSecret name + consistent; vmagent securityContext; COOP/CORP headers; + auth-rate-limit; automountServiceAccountToken:false; + HSTS preload; X-XSS-Protection dropped; imagePullPolicy set +□ Stage 4 Code+image: all C/H/M/L code fixes committed; image rebuilt; + goose migrations for C1/C5/C6/C11/C12 present +□ Stage 5 CI: images digest-pinned + signed + scanned; secrets file-mounted +□ Stage 6 Verify: run 04-verify.sh (extended); work V1–V13 +□ Post: Submit myhoneydue.com to hstspreload.org ``` -**Recommended**: Deploy Loki + Grafana for persistent log search and alerting. - -### Health Monitoring - -```bash -# Continuous health monitoring -watch -n 10 "kubectl get pods -n honeydue -o wide && echo && kubectl top pods -n honeydue 2>/dev/null" - -# Check pod restart counts (indicator of crashes) -kubectl get pods -n honeydue -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{range .status.containerStatuses[*]}{.restartCount}{end}{"\n"}{end}' -``` - -### Alerting Thresholds - -| Metric | Warning | Critical | Check Command | -|--------|---------|----------|---------------| -| Pod restarts | > 3 in 1h | > 10 in 1h | `kubectl get pods` | -| API response time | > 500ms p95 | > 2s p95 | Cloudflare Analytics | -| Memory usage | > 80% limit | > 95% limit | `kubectl top pods` | -| Redis memory | > 200MB | > 250MB | `redis-cli info memory` | -| Disk (PVC) | > 80% | > 95% | `kubectl exec ... df -h` | -| Certificate expiry | < 30 days | < 7 days | Cloudflare dashboard | - -### Audit Trail - -- **K8s events**: `kubectl get events -n honeydue` (auto-pruned after 1h) -- **Go API**: zerolog structured logging with credential masking -- **Cloudflare**: Access logs, WAF logs, rate limiting logs in dashboard -- **Hetzner**: SSH auth logs in `/var/log/auth.log` +A redeploy is "clean" only when `04-verify.sh` (extended per Stage 6) passes +with zero `✗` lines and every checkbox in the master index is ☑ or ⊘. --- -## 13. Incident Response +## Appendix — Incident response playbooks -### Playbook: Compromised API Token +Preserved from the previous `SECURITY.md`; still current. +### Compromised API token +Rotate `SECRET_KEY` to invalidate all tokens, then restart api/worker: ```bash -# 1. Rotate SECRET_KEY to invalidate ALL tokens echo "$(openssl rand -hex 32)" > secrets/secret_key.txt ./scripts/02-setup-secrets.sh kubectl rollout restart deployment/api deployment/worker -n honeydue - -# 2. All users will need to re-authenticate ``` +(After `CODE-C1` lands, tokens are hashed at rest — a DB read no longer yields +usable tokens, but `SECRET_KEY` rotation remains the kill-switch.) -### Playbook: Compromised Database Credentials +### Compromised database credentials +Rotate in the Neon dashboard, update `secrets/postgres_password.txt`, re-run +`02-setup-secrets.sh`, restart api/worker, watch logs for connection errors. +### Compromised push keys +APNs: revoke in Apple Developer, drop the new `.p8` into `secrets/`, re-run +`02-setup-secrets.sh`, restart api/worker. FCM: rotate the key in Firebase, +update `secrets/fcm_server_key.txt`, re-run, restart. + +### Suspicious pod ```bash -# 1. Rotate password in Neon dashboard -# 2. Update local secret file -echo "NEW_PASSWORD" > secrets/postgres_password.txt -./scripts/02-setup-secrets.sh -kubectl rollout restart deployment/api deployment/worker -n honeydue - -# 3. Monitor for connection errors -kubectl logs -n honeydue -l app.kubernetes.io/name=api --tail=50 -f +kubectl logs -n honeydue > /tmp/pod-logs.txt +kubectl describe pod -n honeydue > /tmp/pod-describe.txt +kubectl delete pod -n honeydue # deployment recreates it ``` -### Playbook: Compromised Push Notification Keys - -```bash -# APNs: Revoke key in Apple Developer Console, generate new .p8 -cp new_key.p8 secrets/apns_auth_key.p8 -./scripts/02-setup-secrets.sh -kubectl rollout restart deployment/api deployment/worker -n honeydue - -# FCM: Rotate server key in Firebase Console -echo "NEW_FCM_KEY" > secrets/fcm_server_key.txt -./scripts/02-setup-secrets.sh -kubectl rollout restart deployment/api deployment/worker -n honeydue -``` - -### Playbook: Suspicious Pod Behavior - -```bash -# 1. Isolate the pod (remove from service) -kubectl label pod SUSPICIOUS_POD -n honeydue app.kubernetes.io/name- - -# 2. Capture state for investigation -kubectl logs SUSPICIOUS_POD -n honeydue > /tmp/suspicious-logs.txt -kubectl describe pod SUSPICIOUS_POD -n honeydue > /tmp/suspicious-describe.txt - -# 3. Delete and let deployment recreate -kubectl delete pod SUSPICIOUS_POD -n honeydue -``` - -### Communication Plan - -1. **Internal**: Document incident timeline in a private channel -2. **Users**: If data breach — notify affected users within 72 hours -3. **Vendors**: Revoke/rotate all potentially compromised credentials -4. **Post-mortem**: Document root cause, timeline, remediation, prevention +### Communication +Document the timeline privately; on a data breach notify affected users +within 72 hours; rotate every potentially-exposed credential; write a +post-mortem (root cause, timeline, remediation, prevention). --- -## 14. Compliance Checklist +## References -Run through this checklist before production launch and periodically thereafter. - -### Infrastructure - -- [ ] Hetzner firewall allows only ports 22, 443, 6443 -- [ ] SSH password auth disabled on all nodes -- [ ] fail2ban active on all nodes -- [ ] OS security updates enabled (unattended-upgrades) - -```bash -# Verify -hcloud firewall describe honeydue-fw -ssh user@NODE "grep PasswordAuthentication /etc/ssh/sshd_config" -ssh user@NODE "sudo fail2ban-client status sshd" -``` - -### K3s Cluster - -- [ ] Secret encryption enabled -- [ ] Service accounts created with no API access -- [ ] Pod disruption budgets deployed -- [ ] No default service account used by workloads - -```bash -# Verify -k3s secrets-encrypt status -kubectl get sa -n honeydue -kubectl get pdb -n honeydue -kubectl get pods -n honeydue -o jsonpath='{range .items[*]}{.metadata.name}{" sa="}{.spec.serviceAccountName}{"\n"}{end}' -``` - -### Pod Security - -- [ ] All pods: `runAsNonRoot: true` -- [ ] All containers: `allowPrivilegeEscalation: false` -- [ ] All containers: `readOnlyRootFilesystem: true` -- [ ] All containers: `capabilities.drop: ["ALL"]` -- [ ] All pods: `seccompProfile.type: RuntimeDefault` - -```bash -# Verify (automated check in 04-verify.sh) -./scripts/04-verify.sh -``` - -### Network - -- [ ] Default-deny NetworkPolicy applied -- [ ] 8+ explicit allow policies deployed -- [ ] Redis only reachable from API + Worker -- [ ] Admin only reaches API service -- [ ] Cloudflare-only middleware applied to all ingress - -```bash -# Verify -kubectl get networkpolicy -n honeydue -kubectl get ingress -n honeydue -o yaml | grep cloudflare-only -``` - -### Authentication & Authorization - -- [ ] Redis requires password -- [ ] Admin panel has basic auth layer -- [ ] API uses bcrypt for passwords -- [ ] Auth tokens have expiration -- [ ] Rate limiting on auth endpoints - -```bash -# Verify Redis auth -kubectl exec -n honeydue deploy/redis -- redis-cli ping -# Expected: NOAUTH error - -# Verify admin auth -kubectl get secret admin-basic-auth -n honeydue -``` - -### Secrets - -- [ ] All secrets stored as K8s Secrets (not ConfigMap) -- [ ] Secrets encrypted at rest (K3s) -- [ ] No secrets in git history -- [ ] SECRET_KEY >= 32 characters -- [ ] Secret rotation documented - -```bash -# Verify no secrets in ConfigMap -kubectl get configmap honeydue-config -n honeydue -o yaml | grep -iE 'password|secret|token|key' -# Should show only non-sensitive config keys (EMAIL_HOST, APNS_KEY_ID, etc.) -``` - -### TLS & Headers - -- [ ] Cloudflare Full (Strict) mode enabled -- [ ] Origin cert valid and not expired -- [ ] HSTS header present with includeSubDomains -- [ ] CSP header: `default-src 'self'; frame-ancestors 'none'` -- [ ] Permissions-Policy blocks camera/mic/geo -- [ ] X-Frame-Options: DENY - -```bash -# Verify headers (via Cloudflare) -curl -sI https://api.myhoneydue.com/api/health/ | grep -iE 'strict-transport|content-security|permissions-policy|x-frame' -``` - -### Container Images - -- [ ] Multi-stage Dockerfile (no build tools in runtime) -- [ ] Non-root user in all images -- [ ] Alpine base (minimal surface) -- [ ] No secrets baked into images - -```bash -# Verify non-root -kubectl get pods -n honeydue -o jsonpath='{range .items[*]}{.metadata.name}{" uid="}{.spec.securityContext.runAsUser}{"\n"}{end}' -``` - -### External Services - -- [ ] PostgreSQL: `sslmode=require` -- [ ] B2: Scoped application key (single bucket) -- [ ] APNs: .p8 key (not .p12 certificate) -- [ ] SMTP: TLS enabled (`use_tls: true`) - ---- - -## Quick Reference Commands - -```bash -# Full security verification -./scripts/04-verify.sh - -# Rotate all secrets -./scripts/02-setup-secrets.sh && \ -kubectl rollout restart deployment/api deployment/worker deployment/admin -n honeydue - -# Check for security events -kubectl get events -n honeydue --field-selector type=Warning - -# Emergency: scale down everything -kubectl scale deployment --all -n honeydue --replicas=0 - -# Emergency: restore -kubectl scale deployment api -n honeydue --replicas=3 -kubectl scale deployment worker -n honeydue --replicas=2 -kubectl scale deployment admin -n honeydue --replicas=1 -kubectl scale deployment redis -n honeydue --replicas=1 -``` +- Audit reports: `live_scan_5_12.md`, `k3_audit_5_12.md`, `security_scan_5_12.md` (repo root) +- Current architecture: `docs/deployment/05-security.md` +- Roadmap: `docs/deployment/20-roadmap.md` +- Deploy process: `docs/deployment/14-deployment-process.md` +- Scripts: `deploy-k3s/scripts/{01-provision-cluster,02-setup-secrets,03-deploy,04-verify}.sh` +- Manifests: `deploy-k3s/manifests/` diff --git a/deploy-k3s/config.yaml.example b/deploy-k3s/config.yaml.example index 712f8e4..ceac6b2 100644 --- a/deploy-k3s/config.yaml.example +++ b/deploy-k3s/config.yaml.example @@ -30,6 +30,7 @@ load_balancer_ip: "" domains: api: api.myhoneydue.com admin: admin.myhoneydue.com + app: app.myhoneydue.com # web client host — added to CORS_ALLOWED_ORIGINS base: myhoneydue.com # --- Container Registry (GHCR) --- diff --git a/deploy-k3s/manifests/admin/deployment.yaml b/deploy-k3s/manifests/admin/deployment.yaml index 7f70c5f..fb83b37 100644 --- a/deploy-k3s/manifests/admin/deployment.yaml +++ b/deploy-k3s/manifests/admin/deployment.yaml @@ -23,8 +23,11 @@ spec: app.kubernetes.io/part-of: honeydue spec: serviceAccountName: admin + # Explicit pod-level opt-out (audit F11) — defense-in-depth on top of + # the ServiceAccount-level setting in rbac.yaml. + automountServiceAccountToken: false imagePullSecrets: - - name: ghcr-credentials + - name: gitea-credentials securityContext: runAsNonRoot: true runAsUser: 1001 @@ -35,6 +38,7 @@ spec: containers: - name: admin image: IMAGE_PLACEHOLDER # Replaced by 03-deploy.sh + imagePullPolicy: IfNotPresent # audit CODE-L4 — explicit; images are SHA/digest-pinned ports: - containerPort: 3000 protocol: TCP diff --git a/deploy-k3s/manifests/api/deployment.yaml b/deploy-k3s/manifests/api/deployment.yaml index a98f67c..652d143 100644 --- a/deploy-k3s/manifests/api/deployment.yaml +++ b/deploy-k3s/manifests/api/deployment.yaml @@ -23,8 +23,11 @@ spec: app.kubernetes.io/part-of: honeydue spec: serviceAccountName: api + # Explicit pod-level opt-out (audit F11) — defense-in-depth on top of + # the ServiceAccount-level setting in rbac.yaml. + automountServiceAccountToken: false imagePullSecrets: - - name: ghcr-credentials + - name: gitea-credentials securityContext: runAsNonRoot: true runAsUser: 1000 @@ -35,6 +38,7 @@ spec: containers: - name: api image: IMAGE_PLACEHOLDER # Replaced by 03-deploy.sh + imagePullPolicy: IfNotPresent # audit CODE-L4 — explicit; images are SHA/digest-pinned ports: - containerPort: 8000 protocol: TCP @@ -46,65 +50,16 @@ spec: envFrom: - configMapRef: name: honeydue-config - env: - - name: POSTGRES_PASSWORD - valueFrom: - secretKeyRef: - name: honeydue-secrets - key: POSTGRES_PASSWORD - - name: SECRET_KEY - valueFrom: - secretKeyRef: - name: honeydue-secrets - key: SECRET_KEY - - name: EMAIL_HOST_PASSWORD - valueFrom: - secretKeyRef: - name: honeydue-secrets - key: EMAIL_HOST_PASSWORD - - name: FCM_SERVER_KEY - valueFrom: - secretKeyRef: - name: honeydue-secrets - key: FCM_SERVER_KEY - - name: REDIS_PASSWORD - valueFrom: - secretKeyRef: - name: honeydue-secrets - key: REDIS_PASSWORD - optional: true - # B2 (Backblaze) credentials. With both set, StorageConfig.IsS3() - # returns true and uploads stream to B2 via minio-go. With either - # missing, code falls back to local filesystem — and since - # readOnlyRootFilesystem is true on this container, that fallback - # silently fails. So both must be wired or uploads break. - - name: B2_KEY_ID - valueFrom: - secretKeyRef: - name: honeydue-secrets - key: B2_KEY_ID - - name: B2_APP_KEY - valueFrom: - secretKeyRef: - name: honeydue-secrets - key: B2_APP_KEY - # Observability — push traces (and any future OTLP metrics) to - # obs.88oakapps.com. Token gates ingest at nginx; URL is the - # same one vmagent uses for metric remote-write. Both come from - # honeydue-secrets so they aren't world-readable in ConfigMap. - - name: OBS_TRACES_URL - valueFrom: - secretKeyRef: - name: honeydue-secrets - key: OBS_TRACES_URL - optional: true - - name: OBS_INGEST_TOKEN - valueFrom: - secretKeyRef: - name: honeydue-secrets - key: OBS_INGEST_TOKEN - optional: true + # Audit CODE-F8: secrets are NOT injected as environment variables. + # Env vars are readable for the life of the pod via /proc//environ + # and leak into crash dumps / child processes. honeydue-secrets is + # mounted read-only at /etc/honeydue/secrets (mode 0400) and the Go + # config layer (config.loadFileSecrets) reads each key from its file. + # Non-secret config still arrives via the configMapRef above. volumeMounts: + - name: app-secrets + mountPath: /etc/honeydue/secrets + readOnly: true - name: apns-key mountPath: /secrets/apns readOnly: true @@ -121,11 +76,12 @@ spec: httpGet: path: /api/health/ port: 8000 - # MigrateWithLock in cmd/api/main.go runs pg_advisory_lock on - # every startup. On a cold boot with 3 replicas, the first does - # AutoMigrate (~90s) and the others wait on the lock, so real - # startup runs 90–240s. 48 × 5s = 240s grace absorbs it without - # healthcheck killing a still-starting replica. + # Schema migrations run separately in the honeydue-migrate Job + # *before* this Deployment rolls — the api itself does not migrate + # (it only verifies goose_db_version at boot). Cold start still + # pays the DB pool warm-up + Redis connect + APNs/FCM client init + # before /api/health/ goes green. 48 × 5s = 240s grace keeps the + # probe from killing a still-starting replica. failureThreshold: 48 periodSeconds: 5 readinessProbe: @@ -143,6 +99,12 @@ spec: periodSeconds: 30 timeoutSeconds: 10 volumes: + # Audit CODE-F8: the whole honeydue-secrets Secret, projected as files. + # defaultMode 0400 → readable only by the container's runAsUser (1000). + - name: app-secrets + secret: + secretName: honeydue-secrets + defaultMode: 0400 - name: apns-key secret: secretName: honeydue-apns-key diff --git a/deploy-k3s/manifests/ingress/ingress-simple.yaml b/deploy-k3s/manifests/ingress/ingress-simple.yaml index a44ee62..6a38b29 100644 --- a/deploy-k3s/manifests/ingress/ingress-simple.yaml +++ b/deploy-k3s/manifests/ingress/ingress-simple.yaml @@ -53,7 +53,12 @@ metadata: labels: app.kubernetes.io/part-of: honeydue annotations: - traefik.ingress.kubernetes.io/router.middlewares: honeydue-security-headers@kubernetescrd,honeydue-rate-limit@kubernetescrd + # cloudflare-only + admin-auth wired in (audit F2/F3/CODE-L6). Order + # matters: reject non-Cloudflare IPs, then basic auth, then headers, + # then rate limit. The admin-basic-auth secret is created by + # 02-setup-secrets.sh from config.yaml admin.basic_auth_* — that runs + # before 03-deploy.sh, so the middleware always has its secret. + traefik.ingress.kubernetes.io/router.middlewares: honeydue-cloudflare-only@kubernetescrd,honeydue-admin-auth@kubernetescrd,honeydue-security-headers@kubernetescrd,honeydue-rate-limit@kubernetescrd spec: ingressClassName: traefik tls: @@ -98,3 +103,98 @@ spec: name: web port: number: 3000 +--- +# Auth-endpoint Ingress (audit F10 / LIVE-L12). A dedicated Ingress for the +# auth paths so Traefik gives their longer path-prefix routers a higher +# priority than honeydue-api's "/" router — these paths then get +# auth-rate-limit (5/min) instead of the general rate-limit (100/min). +# Anything not matched here falls through to honeydue-api unchanged. +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: honeydue-api-auth + namespace: honeydue + labels: + app.kubernetes.io/part-of: honeydue + annotations: + traefik.ingress.kubernetes.io/router.middlewares: honeydue-auth-rate-limit@kubernetescrd,honeydue-security-headers@kubernetescrd +spec: + ingressClassName: traefik + tls: + - hosts: + - api.myhoneydue.com + secretName: cloudflare-origin-cert + rules: + - host: api.myhoneydue.com + http: + paths: + - path: /api/auth/login + pathType: Prefix + backend: + service: + name: api + port: + number: 8000 + - path: /api/auth/register + pathType: Prefix + backend: + service: + name: api + port: + number: 8000 + - path: /api/auth/forgot-password + pathType: Prefix + backend: + service: + name: api + port: + number: 8000 + - path: /api/auth/reset-password + pathType: Prefix + backend: + service: + name: api + port: + number: 8000 + - path: /api/residences/join-with-code + pathType: Prefix + backend: + service: + name: api + port: + number: 8000 + - path: /api/auth/verify-reset-code + pathType: Prefix + backend: + service: + name: api + port: + number: 8000 + - path: /api/auth/apple-sign-in + pathType: Prefix + backend: + service: + name: api + port: + number: 8000 + - path: /api/auth/google-sign-in + pathType: Prefix + backend: + service: + name: api + port: + number: 8000 + - path: /api/auth/refresh + pathType: Prefix + backend: + service: + name: api + port: + number: 8000 + - path: /api/auth/account + pathType: Prefix + backend: + service: + name: api + port: + number: 8000 diff --git a/deploy-k3s/manifests/ingress/middleware.yaml b/deploy-k3s/manifests/ingress/middleware.yaml index 67b87d6..6710ce6 100644 --- a/deploy-k3s/manifests/ingress/middleware.yaml +++ b/deploy-k3s/manifests/ingress/middleware.yaml @@ -21,12 +21,20 @@ spec: headers: frameDeny: true contentTypeNosniff: true - browserXssFilter: true + # browserXssFilter removed (audit L7): it emits the deprecated + # X-XSS-Protection header, which can itself introduce XSS in legacy + # browsers. Modern browsers ignore it. referrerPolicy: "strict-origin-when-cross-origin" customResponseHeaders: X-Content-Type-Options: "nosniff" X-Frame-Options: "DENY" - Strict-Transport-Security: "max-age=31536000; includeSubDomains" + # HSTS: 2-year max-age + preload (audit L5/CODE-L3). After this is + # live on api/admin/app, submit myhoneydue.com to hstspreload.org. + Strict-Transport-Security: "max-age=63072000; includeSubDomains; preload" + # Cross-origin isolation (audit F9). COEP (require-corp) is omitted — + # it commonly breaks third-party embeds; add only after testing. + Cross-Origin-Opener-Policy: "same-origin" + Cross-Origin-Resource-Policy: "same-origin" # Content-Security-Policy is intentionally NOT set here — the Go API # sets a CSP in internal/router/router.go that permits Google Fonts # for the landing page. Two CSP headers would intersect and break it. @@ -83,3 +91,24 @@ spec: basicAuth: secret: admin-basic-auth realm: "honeyDue Admin" + +--- +# Strict rate limit for auth endpoints (audit F10 / LIVE-L12). +# Applied via the honeydue-api-auth Ingress to login / register / +# forgot-password / reset-password / join-with-code. depth: 2 makes the +# limiter key on the real client IP rather than the Cloudflare edge IP +# (request path: client -> Cloudflare -> Traefik). This is the edge half; +# the per-account lockout in the Go app is the robust half. +apiVersion: traefik.io/v1alpha1 +kind: Middleware +metadata: + name: auth-rate-limit + namespace: honeydue +spec: + rateLimit: + average: 5 + burst: 10 + period: 1m + sourceCriterion: + ipStrategy: + depth: 2 diff --git a/deploy-k3s/manifests/kyverno-verify-images.yaml b/deploy-k3s/manifests/kyverno-verify-images.yaml new file mode 100644 index 0000000..fb23567 --- /dev/null +++ b/deploy-k3s/manifests/kyverno-verify-images.yaml @@ -0,0 +1,61 @@ +# Kyverno image-signature verification policy (audit CODE-L5). +# +# ────────────────────────────────────────────────────────────────────────── +# THIS MANIFEST IS NOT APPLIED BY 03-deploy.sh. It is intentionally outside +# the script's apply set. Applying it before the prerequisites are in place +# would block every honeydue Pod from scheduling. Operator steps: +# +# 1. Install Kyverno in the cluster (it is an admission controller): +# kubectl create -f https://github.com/kyverno/kyverno/releases/latest/download/install.yaml +# 2. Generate a cosign key pair and keep the private key safe: +# cosign generate-key-pair # -> cosign.key (PRIVATE) + cosign.pub +# Set COSIGN_KEY=cosign.key in the deploy environment so 03-deploy.sh +# signs images after pushing them (the signing step is already wired, +# guarded, into 03-deploy.sh). +# 3. Paste the contents of cosign.pub into the publicKeys block below. +# 4. Apply this policy: kubectl apply -f deploy-k3s/manifests/kyverno-verify-images.yaml +# 5. After confirming honeydue Pods still schedule, flip +# validationFailureAction from Audit to Enforce. +# +# Until then it is a documented, ready-to-use template — not active config. +# ────────────────────────────────────────────────────────────────────────── +apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: verify-honeydue-images + annotations: + policies.kyverno.io/title: Verify honeyDue image signatures + policies.kyverno.io/description: >- + Requires that honeyDue application images pulled into the honeydue + namespace carry a valid cosign signature made with the operator's key. +spec: + # Audit first — logs violations without blocking. Switch to Enforce once + # signing is confirmed working end to end. + validationFailureAction: Audit + background: false + webhookTimeoutSeconds: 30 + rules: + - name: verify-gitea-image-signatures + match: + any: + - resources: + kinds: + - Pod + namespaces: + - honeydue + verifyImages: + # Only the images we build and sign. Public base images + # (redis, vmagent) are pinned by digest instead — see their manifests. + - imageReferences: + - "gitea.treytartt.com/admin/honeydue-api*" + - "gitea.treytartt.com/admin/honeydue-worker*" + - "gitea.treytartt.com/admin/honeydue-admin*" + - "gitea.treytartt.com/admin/honeydue-web*" + attestors: + - count: 1 + entries: + - keys: + publicKeys: |- + -----BEGIN PUBLIC KEY----- + REPLACE_WITH_CONTENTS_OF_cosign.pub + -----END PUBLIC KEY----- diff --git a/deploy-k3s/manifests/migrate/job.yaml b/deploy-k3s/manifests/migrate/job.yaml index 8b09f44..79f4957 100644 --- a/deploy-k3s/manifests/migrate/job.yaml +++ b/deploy-k3s/manifests/migrate/job.yaml @@ -27,8 +27,10 @@ spec: app.kubernetes.io/part-of: honeydue spec: restartPolicy: Never + # The migrate Job never calls the k8s API (audit F11). + automountServiceAccountToken: false imagePullSecrets: - - name: ghcr-credentials + - name: gitea-credentials securityContext: runAsNonRoot: true runAsUser: 1000 @@ -38,6 +40,7 @@ spec: containers: - name: goose image: IMAGE_PLACEHOLDER # Replaced by 03-deploy.sh — same as api + imagePullPolicy: IfNotPresent # audit CODE-L4 — explicit command: ["/bin/sh", "-c"] # DB_HOST in the ConfigMap points at the -pooler endpoint for runtime. # goose's session-scoped advisory lock can't survive PgBouncer diff --git a/deploy-k3s/manifests/observability/vmagent.yaml b/deploy-k3s/manifests/observability/vmagent.yaml index ffc1217..a324aac 100644 --- a/deploy-k3s/manifests/observability/vmagent.yaml +++ b/deploy-k3s/manifests/observability/vmagent.yaml @@ -179,7 +179,17 @@ spec: type: RuntimeDefault containers: - name: vmagent - image: victoriametrics/vmagent:v1.106.1 + # Pinned by digest (audit K3S-F14). + image: victoriametrics/vmagent:v1.106.1@sha256:90208a667c0baf65f7536b92a84c40b6e35ffe8e88bda7e4447b97b06c6ba6b8 + imagePullPolicy: IfNotPresent # audit CODE-L4 — explicit + # Container-level hardening (audit F7) — matches the other 5 + # workloads. vmagent only writes to the /tmp/vmagent emptyDir + # (its remoteWrite buffer), so a read-only root filesystem holds. + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: ["ALL"] args: - "-promscrape.config=/etc/vmagent/scrape.yaml" - "-remoteWrite.url=https://obs.88oakapps.com/api/v1/write" diff --git a/deploy-k3s/manifests/redis/deployment.yaml b/deploy-k3s/manifests/redis/deployment.yaml index 0828961..90a65b5 100644 --- a/deploy-k3s/manifests/redis/deployment.yaml +++ b/deploy-k3s/manifests/redis/deployment.yaml @@ -20,6 +20,9 @@ spec: app.kubernetes.io/part-of: honeydue spec: serviceAccountName: redis + # Explicit pod-level opt-out (audit F11) — defense-in-depth on top of + # the ServiceAccount-level setting in rbac.yaml. + automountServiceAccountToken: false nodeSelector: honeydue/redis: "true" securityContext: @@ -31,7 +34,9 @@ spec: type: RuntimeDefault containers: - name: redis - image: redis:7-alpine + # Pinned by digest (audit K3S-F14) — redis:7-alpine is 7.4.9-alpine. + image: redis:7-alpine@sha256:6ab0b6e7381779332f97b8ca76193e45b0756f38d4c0dcda72dbb3c32061ab99 + imagePullPolicy: IfNotPresent # audit CODE-L4 — explicit command: - sh - -c diff --git a/deploy-k3s/manifests/web/deployment.yaml b/deploy-k3s/manifests/web/deployment.yaml index 4876ccf..6a18675 100644 --- a/deploy-k3s/manifests/web/deployment.yaml +++ b/deploy-k3s/manifests/web/deployment.yaml @@ -23,8 +23,11 @@ spec: app.kubernetes.io/part-of: honeydue spec: serviceAccountName: web + # Explicit pod-level opt-out (audit F11) — defense-in-depth on top of + # the ServiceAccount-level setting in rbac.yaml. + automountServiceAccountToken: false imagePullSecrets: - - name: ghcr-credentials + - name: gitea-credentials securityContext: runAsNonRoot: true runAsUser: 1001 @@ -43,6 +46,7 @@ spec: containers: - name: web image: IMAGE_PLACEHOLDER # Replaced by 03-deploy.sh or manual sed + imagePullPolicy: IfNotPresent # audit CODE-L4 — explicit; images are SHA/digest-pinned ports: - containerPort: 3000 protocol: TCP diff --git a/deploy-k3s/manifests/worker/deployment.yaml b/deploy-k3s/manifests/worker/deployment.yaml index ca6b887..53f8f8d 100644 --- a/deploy-k3s/manifests/worker/deployment.yaml +++ b/deploy-k3s/manifests/worker/deployment.yaml @@ -27,8 +27,11 @@ spec: app.kubernetes.io/part-of: honeydue spec: serviceAccountName: worker + # Explicit pod-level opt-out (audit F11) — defense-in-depth on top of + # the ServiceAccount-level setting in rbac.yaml. + automountServiceAccountToken: false imagePullSecrets: - - name: ghcr-credentials + - name: gitea-credentials securityContext: runAsNonRoot: true runAsUser: 1000 @@ -39,6 +42,7 @@ spec: containers: - name: worker image: IMAGE_PLACEHOLDER # Replaced by 03-deploy.sh + imagePullPolicy: IfNotPresent # audit CODE-L4 — explicit; images are SHA/digest-pinned securityContext: allowPrivilegeEscalation: false readOnlyRootFilesystem: true @@ -47,64 +51,16 @@ spec: envFrom: - configMapRef: name: honeydue-config - env: - - name: POSTGRES_PASSWORD - valueFrom: - secretKeyRef: - name: honeydue-secrets - key: POSTGRES_PASSWORD - - name: SECRET_KEY - valueFrom: - secretKeyRef: - name: honeydue-secrets - key: SECRET_KEY - - name: EMAIL_HOST_PASSWORD - valueFrom: - secretKeyRef: - name: honeydue-secrets - key: EMAIL_HOST_PASSWORD - - name: FCM_SERVER_KEY - valueFrom: - secretKeyRef: - name: honeydue-secrets - key: FCM_SERVER_KEY - - name: REDIS_PASSWORD - valueFrom: - secretKeyRef: - name: honeydue-secrets - key: REDIS_PASSWORD - optional: true - # B2 (Backblaze) credentials. The worker needs these to delete - # B2 objects when the pending_uploads cleanup cron reaps - # expired upload sessions. Without them the worker falls back - # to local-disk storage which fails on this pod's read-only - # root filesystem and disables the cleanup cron. - - name: B2_KEY_ID - valueFrom: - secretKeyRef: - name: honeydue-secrets - key: B2_KEY_ID - - name: B2_APP_KEY - valueFrom: - secretKeyRef: - name: honeydue-secrets - key: B2_APP_KEY - # Observability — workers emit traces (e.g., asynq job spans) to - # obs.88oakapps.com over OTLP/HTTP. service.name=honeydue-worker - # so api and worker show up as separate services in Jaeger. - - name: OBS_TRACES_URL - valueFrom: - secretKeyRef: - name: honeydue-secrets - key: OBS_TRACES_URL - optional: true - - name: OBS_INGEST_TOKEN - valueFrom: - secretKeyRef: - name: honeydue-secrets - key: OBS_INGEST_TOKEN - optional: true + # Audit CODE-F8: secrets are NOT injected as environment variables. + # Env vars are readable for the life of the pod via /proc//environ + # and leak into crash dumps / child processes. honeydue-secrets is + # mounted read-only at /etc/honeydue/secrets (mode 0400) and the Go + # config layer (config.loadFileSecrets) reads each key from its file. + # Non-secret config still arrives via the configMapRef above. volumeMounts: + - name: app-secrets + mountPath: /etc/honeydue/secrets + readOnly: true - name: apns-key mountPath: /secrets/apns readOnly: true @@ -124,6 +80,12 @@ spec: periodSeconds: 30 timeoutSeconds: 5 volumes: + # Audit CODE-F8: the whole honeydue-secrets Secret, projected as files. + # defaultMode 0400 → readable only by the container's runAsUser (1000). + - name: app-secrets + secret: + secretName: honeydue-secrets + defaultMode: 0400 - name: apns-key secret: secretName: honeydue-apns-key diff --git a/deploy-k3s/scripts/02-setup-secrets.sh b/deploy-k3s/scripts/02-setup-secrets.sh index 5857a74..d5c52b5 100755 --- a/deploy-k3s/scripts/02-setup-secrets.sh +++ b/deploy-k3s/scripts/02-setup-secrets.sh @@ -68,6 +68,25 @@ SECRET_ARGS=( if [[ -n "${REDIS_PASSWORD}" ]]; then log " Including REDIS_PASSWORD in secrets" SECRET_ARGS+=(--from-literal="REDIS_PASSWORD=${REDIS_PASSWORD}") +else + # Audit K3S-F1 (CRITICAL) / MEDIUM-4: refuse to deploy with an unauthenticated + # Redis. A previous version only warned here, which let a deploy from an + # unedited config.yaml silently bring Redis up with no password. + die "redis.password is empty in config.yaml — refusing to deploy: Redis would run with NO authentication (audit K3S-F1). Set a strong value, e.g.: openssl rand -base64 32" +fi + +# B2 (Backblaze) object-storage credentials. The api/worker manifests +# reference B2_KEY_ID / B2_APP_KEY as required secret keys, so honeydue-secrets +# MUST carry them or those pods fail to start. Sourced from config.yaml so the +# script and the manifests no longer drift (was a latent gap before 2026-05-16). +B2_KEY_ID_VAL="$(cfg storage.b2_key_id 2>/dev/null || true)" +B2_APP_KEY_VAL="$(cfg storage.b2_app_key 2>/dev/null || true)" +if [[ -n "${B2_KEY_ID_VAL}" && -n "${B2_APP_KEY_VAL}" ]]; then + log " Including B2_KEY_ID / B2_APP_KEY in secrets" + SECRET_ARGS+=(--from-literal="B2_KEY_ID=${B2_KEY_ID_VAL}") + SECRET_ARGS+=(--from-literal="B2_APP_KEY=${B2_APP_KEY_VAL}") +else + warn "storage.b2_key_id / b2_app_key not set in config.yaml — B2 uploads will be disabled." fi # Observability ingest credentials live in deploy/prod.env (gitignored) so @@ -100,22 +119,24 @@ kubectl create secret generic honeydue-apns-key \ --from-file="apns_auth_key.p8=${SECRETS_DIR}/apns_auth_key.p8" \ --dry-run=client -o yaml | kubectl apply -f - -# --- Create GHCR registry credentials --- +# --- Create container registry credentials --- +# Secret name is gitea-credentials (audit F6): the registry is self-hosted +# Gitea, not GHCR. Every deployment manifest references this same name. REGISTRY_SERVER="$(cfg registry.server)" REGISTRY_USER="$(cfg registry.username)" REGISTRY_TOKEN="$(cfg registry.token)" if [[ -n "${REGISTRY_SERVER}" && -n "${REGISTRY_USER}" && -n "${REGISTRY_TOKEN}" ]]; then - log "Creating ghcr-credentials..." - kubectl create secret docker-registry ghcr-credentials \ + log "Creating gitea-credentials..." + kubectl create secret docker-registry gitea-credentials \ --namespace="${NAMESPACE}" \ --docker-server="${REGISTRY_SERVER}" \ --docker-username="${REGISTRY_USER}" \ --docker-password="${REGISTRY_TOKEN}" \ --dry-run=client -o yaml | kubectl apply -f - else - warn "Registry credentials incomplete in config.yaml — skipping ghcr-credentials." + warn "Registry credentials incomplete in config.yaml — skipping gitea-credentials." fi # --- Create Cloudflare origin cert --- @@ -132,7 +153,8 @@ kubectl create secret tls cloudflare-origin-cert \ if [[ -n "${ADMIN_AUTH_USER}" && -n "${ADMIN_AUTH_PASSWORD}" ]]; then command -v htpasswd >/dev/null 2>&1 || die "Missing: htpasswd (install apache2-utils)" log "Creating admin-basic-auth secret..." - HTPASSWD="$(htpasswd -nb "${ADMIN_AUTH_USER}" "${ADMIN_AUTH_PASSWORD}")" + # -B forces bcrypt (Traefik BasicAuth supports it; avoids weak apr1-MD5). + HTPASSWD="$(htpasswd -nbB "${ADMIN_AUTH_USER}" "${ADMIN_AUTH_PASSWORD}")" kubectl create secret generic admin-basic-auth \ --namespace="${NAMESPACE}" \ --from-literal=users="${HTPASSWD}" \ diff --git a/deploy-k3s/scripts/03-deploy.sh b/deploy-k3s/scripts/03-deploy.sh index 0169669..9942b5b 100755 --- a/deploy-k3s/scripts/03-deploy.sh +++ b/deploy-k3s/scripts/03-deploy.sh @@ -128,6 +128,56 @@ else warn "Skipping build. Using images for tag: ${DEPLOY_TAG}" fi +# --- Resolve immutable image digests (audit F5) --- +# A short-SHA tag is mutable — anyone who can push to the registry can +# overwrite it, and imagePullPolicy then pulls the new bits silently. We +# deploy by @sha256: digest instead, pinning the exact image that was just +# built and pushed. `docker push` populates RepoDigests; with --skip-build +# (no local image) resolve_ref falls back to the tag. +resolve_ref() { + local img="$1" digest + digest="$(docker inspect --format='{{range .RepoDigests}}{{println .}}{{end}}' "${img}" 2>/dev/null | grep -m1 '@sha256:' || true)" + if [[ -n "${digest}" ]]; then + printf '%s' "${digest}" + else + warn "could not resolve a digest for ${img} — deploying by mutable tag" + printf '%s' "${img}" + fi +} +API_REF="$(resolve_ref "${API_IMAGE}")" +WORKER_REF="$(resolve_ref "${WORKER_IMAGE}")" +ADMIN_REF="$(resolve_ref "${ADMIN_IMAGE}")" +WEB_REF="$(resolve_ref "${WEB_IMAGE}")" +log "Deploying by digest:" +log " API: ${API_REF}" +log " Worker: ${WORKER_REF}" +log " Admin: ${ADMIN_REF}" + +# --- Image scan + signing (audit CODE-L5) --- +# Both steps are best-effort: the deploy does NOT fail if the tools are +# absent, so an operator who has not set up cosign/trivy yet is not blocked. +# Install trivy + cosign and export COSIGN_KEY to enforce. Cluster-side +# admission verification (Kyverno/Connaisseur) is a separate operator step. +if [[ "${SKIP_BUILD}" == "false" ]]; then + if command -v trivy >/dev/null 2>&1; then + log "Scanning images with Trivy (HIGH,CRITICAL)..." + for img in "${API_IMAGE}" "${WORKER_IMAGE}" "${ADMIN_IMAGE}"; do + trivy image --severity HIGH,CRITICAL --exit-code 0 --quiet "${img}" \ + || warn "Trivy reported findings for ${img}" + done + else + warn "trivy not installed — skipping image vulnerability scan (audit L5)" + fi + if command -v cosign >/dev/null 2>&1 && [[ -n "${COSIGN_KEY:-}" ]]; then + log "Signing images with cosign..." + for ref in "${API_REF}" "${WORKER_REF}" "${ADMIN_REF}"; do + cosign sign --yes --key "${COSIGN_KEY}" "${ref}" || warn "cosign sign failed for ${ref}" + done + else + warn "cosign not configured (need cosign + COSIGN_KEY) — skipping image signing (audit L5)" + fi +fi + # --- Generate and apply ConfigMap from config.yaml --- log "Generating env from config.yaml..." @@ -166,7 +216,7 @@ kubectl apply -f "${MANIFESTS}/ingress/" # pod sees a stale schema. log "Running database migrations (goose Job)..." kubectl delete job honeydue-migrate -n "${NAMESPACE}" --ignore-not-found --wait=true >/dev/null -sed "s|image: IMAGE_PLACEHOLDER|image: ${API_IMAGE}|" "${MANIFESTS}/migrate/job.yaml" | kubectl apply -f - +sed "s|image: IMAGE_PLACEHOLDER|image: ${API_REF}|" "${MANIFESTS}/migrate/job.yaml" | kubectl apply -f - if ! kubectl wait --namespace="${NAMESPACE}" --for=condition=complete --timeout=10m job/honeydue-migrate; then warn "migration Job failed — see logs:" kubectl logs -n "${NAMESPACE}" job/honeydue-migrate --tail=200 || true @@ -175,17 +225,17 @@ fi log "Migrations applied; proceeding with api/worker rollout" # Apply deployments with image substitution -sed "s|image: IMAGE_PLACEHOLDER|image: ${API_IMAGE}|" "${MANIFESTS}/api/deployment.yaml" | kubectl apply -f - +sed "s|image: IMAGE_PLACEHOLDER|image: ${API_REF}|" "${MANIFESTS}/api/deployment.yaml" | kubectl apply -f - kubectl apply -f "${MANIFESTS}/api/service.yaml" kubectl apply -f "${MANIFESTS}/api/hpa.yaml" -sed "s|image: IMAGE_PLACEHOLDER|image: ${WORKER_IMAGE}|" "${MANIFESTS}/worker/deployment.yaml" | kubectl apply -f - +sed "s|image: IMAGE_PLACEHOLDER|image: ${WORKER_REF}|" "${MANIFESTS}/worker/deployment.yaml" | kubectl apply -f - -sed "s|image: IMAGE_PLACEHOLDER|image: ${ADMIN_IMAGE}|" "${MANIFESTS}/admin/deployment.yaml" | kubectl apply -f - +sed "s|image: IMAGE_PLACEHOLDER|image: ${ADMIN_REF}|" "${MANIFESTS}/admin/deployment.yaml" | kubectl apply -f - kubectl apply -f "${MANIFESTS}/admin/service.yaml" if [[ -d "${MANIFESTS}/web" ]]; then - sed "s|image: IMAGE_PLACEHOLDER|image: ${WEB_IMAGE}|" "${MANIFESTS}/web/deployment.yaml" | kubectl apply -f - + sed "s|image: IMAGE_PLACEHOLDER|image: ${WEB_REF}|" "${MANIFESTS}/web/deployment.yaml" | kubectl apply -f - kubectl apply -f "${MANIFESTS}/web/service.yaml" fi diff --git a/deploy-k3s/scripts/_config.sh b/deploy-k3s/scripts/_config.sh index ba0ac4c..2a0cb25 100755 --- a/deploy-k3s/scripts/_config.sh +++ b/deploy-k3s/scripts/_config.sh @@ -100,7 +100,7 @@ lines = [ # API 'DEBUG=false', f\"ALLOWED_HOSTS={d['api']},{d['base']}\", - f\"CORS_ALLOWED_ORIGINS=https://{d['base']},https://{d['admin']}\", + f\"CORS_ALLOWED_ORIGINS=https://{d['base']},https://{d['admin']},https://{d.get('app', 'app.' + d['base'])}\", 'TIMEZONE=UTC', f\"BASE_URL=https://{d['base']}\", 'PORT=8000', @@ -119,9 +119,14 @@ lines = [ f\"DB_MAX_IDLE_CONNS={db['max_idle_conns']}\", f\"DB_MAX_LIFETIME={db['max_lifetime']}\", f\"DB_MAX_IDLE_TIME={db.get('max_idle_time', '0s')}\", - # Redis (in-namespace DNS short form — password injected if configured; - # short form works because /etc/resolv.conf in pods searches honeydue.svc.cluster.local) - f\"REDIS_URL=redis://{':%s@' % val(rd.get('password')) if rd.get('password') else ''}redis:6379/0\", + # Redis — in-namespace DNS short form (works because pod /etc/resolv.conf + # searches honeydue.svc.cluster.local). Audit HIGH-1: the password is + # intentionally NOT embedded here. This URL is emitted into the + # honeydue-config ConfigMap, which is NOT encrypted at rest and is + # readable by anyone with `get configmap`. The Redis password travels + # only in honeydue-secrets as REDIS_PASSWORD (file-mounted, F8); the API + # applies it in cache_service.go and the worker onto its Asynq opt. + 'REDIS_URL=redis://redis:6379/0', 'REDIS_DB=0', # Email f\"EMAIL_HOST={em['host']}\", @@ -218,8 +223,18 @@ config = { 'image': 'ubuntu-24.04', }, 'additional_packages': ['open-iscsi'], - 'post_create_commands': ['sudo systemctl enable --now iscsid'], - 'k3s_config_file': 'secrets-encryption: true\n', + # Audit K3S-CG2: harden the node OS at provision time — fail2ban for SSH + # brute-force, unattended-upgrades for automatic security patches. + 'post_create_commands': [ + 'sudo systemctl enable --now iscsid', + 'sudo apt-get update -qq', + 'sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq fail2ban unattended-upgrades', + 'sudo systemctl enable --now fail2ban', + 'sudo dpkg-reconfigure -f noninteractive -plow unattended-upgrades', + ], + # Audit K3S-CG1 / K3S-F4: encrypt Secrets at rest in etcd, and write the + # node kubeconfig as mode 0600 (not world-readable). + 'k3s_config_file': 'secrets-encryption: true\nwrite-kubeconfig-mode: \"0600\"\n', } print(yaml.dump(config, default_flow_style=False, sort_keys=False)) diff --git a/docs/deployment/05-security.md b/docs/deployment/05-security.md index 755ee75..ae37a2e 100644 --- a/docs/deployment/05-security.md +++ b/docs/deployment/05-security.md @@ -8,6 +8,13 @@ long-haul components, and dedicated service accounts with dropped capabilities inside containers. This chapter documents each layer, the rationale, and what's currently missing (and why). +> **Updated 2026-05-15 — security remediation.** The 2026-05 audits +> (`live_scan_5_12.md`, `k3_audit_5_12.md`, `security_scan_5_12.md`) drove a +> full remediation pass. **`deploy-k3s/SECURITY.md` is the authoritative, +> per-finding current-state record.** This chapter is corrected for the +> major items below; where any other detail conflicts with `SECURITY.md`, +> `SECURITY.md` wins. + ## Threat model Who we're defending against, in rough order of likelihood: @@ -54,8 +61,8 @@ Cloudflare sits in front of every public request. - **Authorize requests** — that's the app's job - **Protect origin if origin IP leaks** — once someone knows a node IP they can bypass CF. Mitigation: keep origin firewall strict (Chapter 4). -- **Encrypt between CF and origin** — we're on SSL=Flexible, so CF↔origin - is HTTP. This is in our TODO (Chapter 20, upgrade to Full-strict). +- **~~Encrypt between CF and origin~~** — done (2026-04-24): SSL mode is + Full (strict); CF↔origin is TLS with a Cloudflare Origin CA cert. ### The proxy-IP problem @@ -75,8 +82,8 @@ This means a malicious request that bypasses CF (by hitting the node IP directly) can't spoof headers — Traefik ignores `X-Forwarded-*` unless the source IP is in CF's ranges. -**TODO** (Chapter 20): Enforce at UFW level — allow 80/tcp only from -CF IP ranges. Today any IP can reach the origin on port 80. +**Done (2026-04-24):** the node UFW allowlist permits `:443` only from +Cloudflare's IP ranges; the `Anywhere` rules on `:80`/`:443` were removed. ## Layer 2 — Node (OS, SSH, firewall) @@ -297,15 +304,13 @@ The `deploy-k3s/manifests/network-policies.yaml` scaffold defines: reach api pods on port 8000 - **allow-ingress-to-admin** — same, for admin:3000 -**These are not currently applied.** Without them, our pods can freely -talk to anything — including, theoretically, malicious destinations if -an attacker gets RCE inside a pod. +**Applied.** `03-deploy.sh` applies +`deploy-k3s/manifests/network-policies.yaml` on every deploy — default-deny +plus the explicit per-app allows below. Traefik runs `hostNetwork`, so its +traffic is matched by node-IP `ipBlock`s plus the pod CIDR `10.42.0.0/16`, +not a `namespaceSelector`. -**TODO** (Chapter 20): Apply network policies. The scaffold is there; we -just need to `kubectl apply -f deploy-k3s/manifests/network-policies.yaml` -and test that nothing breaks. - -### What network policies would prevent +### What network policies prevent | Attack scenario | NetworkPolicy blocks | |---|---| @@ -324,13 +329,10 @@ renewed Let's Encrypt or CF-managed cert for `*.myhoneydue.com`. ### CF ↔ origin -**Plaintext HTTP** (SSL = Flexible). An attacker with access to the -Cloudflare-to-Hetzner path could read traffic. In practice nobody who -isn't Cloudflare or Hetzner sits on that path. - -**TODO** (Chapter 20): Upgrade to SSL = Full (strict) with a Cloudflare -Origin CA certificate. This encrypts CF ↔ origin and verifies that -origin's cert is the CF-issued one (prevents MitM if DNS is compromised). +**TLS — SSL = Full (strict)** (since 2026-04-24). A Cloudflare Origin CA +certificate (`cloudflare-origin-cert` secret) is installed on all three +ingresses; Cloudflare validates it. Both user↔CF and CF↔origin are +encrypted, and a DNS-hijack MitM is defeated by the origin-cert check. ### API ↔ Neon Postgres @@ -454,11 +456,14 @@ Mitigations: - Gitea itself is behind login; PAT is scoped to read:packages + write:packages only - Gitea runs on the operator's infrastructure (same operator account) -- Image tags are SHA-pinned (`:237c6b8`) not `:latest` → attacker can't - replace an existing tag's image without us noticing the digest change +- Workloads deploy by immutable `@sha256:` digest, not by mutable tag + (`03-deploy.sh` resolves the digest after push; the redis/vmagent/node + base images are digest-pinned too) — a swapped tag cannot reach the + cluster. -**TODO** (Chapter 20): Add cosign signing at build time, verify at pull -time. +**TODO**: cosign signing is wired into `03-deploy.sh` (guarded — runs when +`cosign` + `COSIGN_KEY` are present); cluster-side admission verification +(Kyverno/Connaisseur) is still pending. See `deploy-k3s/SECURITY.md` → L5. ## Operator workstation security diff --git a/docs/deployment/06-traefik-ingress.md b/docs/deployment/06-traefik-ingress.md index b400c88..6f61f64 100644 --- a/docs/deployment/06-traefik-ingress.md +++ b/docs/deployment/06-traefik-ingress.md @@ -1,5 +1,13 @@ # 06 — Traefik Ingress +> **Updated 2026-05-15 (security remediation):** the Traefik middleware set +> changed — `cloudflare-only` + `admin-auth` are now attached to the admin +> ingress, a strict `auth-rate-limit` middleware fronts the auth endpoints +> (via a dedicated `honeydue-api-auth` Ingress), and `security-headers` +> gained COOP/CORP + a 2-year preload HSTS and dropped the deprecated +> `X-XSS-Protection`. `deploy-k3s/SECURITY.md` is the authoritative +> current-state record. + ## Summary Traefik is the reverse proxy that routes external HTTP requests to the diff --git a/docs/deployment/07-services.md b/docs/deployment/07-services.md index 057e776..aa63dd7 100644 --- a/docs/deployment/07-services.md +++ b/docs/deployment/07-services.md @@ -1,5 +1,11 @@ # 07 — Services +> **Updated 2026-05-15 (security remediation):** Redis now requires a +> password (`config.yaml` `redis.password` → `honeydue-secrets`), all +> workloads deploy by immutable `@sha256:` digest, and the redis/vmagent +> base images are digest-pinned. `deploy-k3s/SECURITY.md` is the +> authoritative current-state record. + ## Summary Five workloads run in the `honeydue` namespace: **api** (Go REST API, 3 diff --git a/docs/deployment/10-secrets-config.md b/docs/deployment/10-secrets-config.md index 95b9c6d..adeed1f 100644 --- a/docs/deployment/10-secrets-config.md +++ b/docs/deployment/10-secrets-config.md @@ -1,5 +1,11 @@ # 10 — Secrets & Config +> **Updated 2026-05-15 (security remediation):** `honeydue-secrets` now +> carries `REDIS_PASSWORD`; an `admin-basic-auth` Secret backs the admin +> ingress; rotation is documented in `docs/runbooks/secret-rotation.md`; +> and the Go config can read file-mounted secrets (`HONEYDUE_SECRETS_DIR`). +> `deploy-k3s/SECURITY.md` is the authoritative current-state record. + ## Summary Non-sensitive config (hostnames, ports, feature flags, etc.) lives in diff --git a/docs/runbooks/secret-rotation.md b/docs/runbooks/secret-rotation.md new file mode 100644 index 0000000..7c1a6c1 --- /dev/null +++ b/docs/runbooks/secret-rotation.md @@ -0,0 +1,146 @@ +# Runbook — Secret Rotation + +Closes audit finding `K3S-F12` (secrets unrotated since cluster bootstrap, +no rotation cadence). See `deploy-k3s/SECURITY.md` Stage 2. + +**Cadence:** rotate every secret at least **annually**. Rotate +**immediately** on suspected exposure, on an operator-device loss, or when +anyone who has seen a secret leaves the project. + +**Record keeping:** after each rotation, annotate the secret so the age is +visible: + +```bash +kubectl -n honeydue annotate secret \ + honeydue.dev/last-rotated="$(date -u +%Y-%m-%d)" --overwrite +``` + +--- + +## How rotation works + +Every secret has a **source of truth** on the operator workstation. The +deploy scripts read those sources and (re)create the Kubernetes Secrets. +Rotation is always: **update the source → re-run `02-setup-secrets.sh` → +restart the pods that consume it → revoke the old credential at its +provider.** + +`02-setup-secrets.sh` uses `kubectl apply` (via `--dry-run=client -o yaml`), +so re-running it is idempotent and only changes what you changed. + +| Kubernetes Secret | Source of truth | Consumed by | +|---|---|---| +| `honeydue-secrets` → `POSTGRES_PASSWORD` | `deploy-k3s/secrets/postgres_password.txt` | api, worker | +| `honeydue-secrets` → `SECRET_KEY` | `deploy-k3s/secrets/secret_key.txt` | api, worker | +| `honeydue-secrets` → `EMAIL_HOST_PASSWORD` | `deploy-k3s/secrets/email_host_password.txt` | api, worker | +| `honeydue-secrets` → `FCM_SERVER_KEY` | `deploy-k3s/secrets/fcm_server_key.txt` | api, worker | +| `honeydue-secrets` → `REDIS_PASSWORD` | `config.yaml` key `redis.password` | api, worker, redis | +| `honeydue-secrets` → `OBS_INGEST_TOKEN` | `deploy/prod.env` | api, worker | +| `honeydue-apns-key` → `apns_auth_key.p8` | `deploy-k3s/secrets/apns_auth_key.p8` | api, worker | +| `cloudflare-origin-cert` | `deploy-k3s/secrets/cloudflare-origin.{crt,key}` | Traefik ingress | +| `ghcr-credentials` | `config.yaml` block `registry.*` | image pulls (all pods) | +| `admin-basic-auth` | `config.yaml` keys `admin.basic_auth_user` / `..._password` | Traefik `admin-auth` middleware | + +The `deploy-k3s/secrets/` directory and `config.yaml` are **gitignored** — +never commit them. + +--- + +## Standard rotation procedure + +```bash +cd honeyDueAPI-go +export KUBECONFIG="$(pwd)/deploy-k3s/kubeconfig" + +# 1. Update the source (file under deploy-k3s/secrets/ or a config.yaml key) +# 2. Recreate the Kubernetes Secrets from sources +./deploy-k3s/scripts/02-setup-secrets.sh + +# 3. Restart the consumers (see per-secret notes below for which) +kubectl -n honeydue rollout restart deploy/api deploy/worker + +# 4. Confirm health +kubectl -n honeydue rollout status deploy/api +kubectl -n honeydue rollout status deploy/worker + +# 5. Revoke the OLD credential at its provider (see per-secret notes) +# 6. Annotate the rotated secret with today's date +``` + +--- + +## Per-secret notes + +### `POSTGRES_PASSWORD` +1. Rotate the role password in the Neon dashboard. +2. Write the new value to `deploy-k3s/secrets/postgres_password.txt`. +3. `02-setup-secrets.sh`, then `rollout restart deploy/api deploy/worker`. +4. Watch logs for connection errors; the old password stops working the + moment Neon applies the change, so do steps 2–3 promptly. + +### `SECRET_KEY` ⚠️ user-visible +This signs auth tokens. **Rotating it logs every user out** — all existing +tokens become invalid and every client must re-authenticate. +1. Generate: `openssl rand -hex 32`. +2. Write to `deploy-k3s/secrets/secret_key.txt` (must be ≥32 chars — the + script enforces this; the app refuses to start in production without it). +3. `02-setup-secrets.sh`, then `rollout restart deploy/api deploy/worker`. +- Only rotate on a schedule or on suspected compromise — not casually. +- A future improvement (overlap window via a key-id header) would let old + tokens validate during the transition; not implemented today. + +### `EMAIL_HOST_PASSWORD` +1. Generate a new app password in Fastmail; keep the old one alive briefly. +2. Write to `deploy-k3s/secrets/email_host_password.txt`. +3. `02-setup-secrets.sh`, `rollout restart deploy/api deploy/worker`. +4. Delete the old Fastmail app password. + +### `FCM_SERVER_KEY` +1. Rotate the key in the Firebase console. +2. Write to `deploy-k3s/secrets/fcm_server_key.txt`. +3. `02-setup-secrets.sh`, `rollout restart deploy/api deploy/worker`. + +### `REDIS_PASSWORD` +Source is `config.yaml` key `redis.password` (hex only — it is embedded in +the `REDIS_URL`, so non-hex characters would break URL parsing). +1. Generate: `openssl rand -hex 32`. +2. Set `redis.password` in `config.yaml`. +3. `02-setup-secrets.sh`. +4. Restart **redis as well as** api/worker so the new `--requirepass` and + the new `REDIS_URL` land together: + `kubectl -n honeydue rollout restart deploy/redis deploy/api deploy/worker`. + Expect a few seconds where api/worker reconnect. + +### `apns_auth_key.p8` +1. Revoke the key in the Apple Developer console, generate a new `.p8`. +2. Replace `deploy-k3s/secrets/apns_auth_key.p8`. +3. `02-setup-secrets.sh`, `rollout restart deploy/api deploy/worker`. +4. If the Key ID changed, update `push.apns_key_id` in `config.yaml` too. + +### `cloudflare-origin-cert` +1. Generate a new Origin CA certificate in the Cloudflare dashboard. +2. Replace `deploy-k3s/secrets/cloudflare-origin.crt` and `.key`. +3. `02-setup-secrets.sh`. Traefik picks up the new TLS secret; no app + restart needed. Verify the served cert with `openssl s_client`. + +### `ghcr-credentials` (Gitea registry) +1. Generate a new PAT in Gitea (scope: `read:packages`). +2. Update the `registry.token` value in `config.yaml`. +3. `02-setup-secrets.sh`. No restart needed unless a pull is pending. +4. Revoke the old PAT in Gitea. + +### `admin-basic-auth` +Source is `config.yaml` keys `admin.basic_auth_user` / `basic_auth_password`. +1. Set a new password (e.g. `openssl rand -hex 24`). +2. `02-setup-secrets.sh` regenerates the bcrypt htpasswd secret. +3. No app restart needed — Traefik reloads the `admin-auth` middleware. +4. Distribute the new credential to whoever uses the admin panel. + +--- + +## After any rotation + +- Run `./deploy-k3s/scripts/04-verify.sh` and confirm no `✗` lines. +- Annotate the rotated secret (see "Record keeping" above). +- If the rotation was due to a compromise, also follow the relevant + playbook in `deploy-k3s/SECURITY.md` → Appendix (Incident response). diff --git a/go.mod b/go.mod index a47f4e8..e435368 100644 --- a/go.mod +++ b/go.mod @@ -27,10 +27,10 @@ require ( go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0 go.opentelemetry.io/otel/sdk v1.43.0 - golang.org/x/crypto v0.49.0 + golang.org/x/crypto v0.51.0 golang.org/x/oauth2 v0.35.0 - golang.org/x/term v0.41.0 - golang.org/x/text v0.35.0 + golang.org/x/term v0.43.0 + golang.org/x/text v0.37.0 golang.org/x/time v0.15.0 google.golang.org/api v0.257.0 gopkg.in/yaml.v3 v3.0.1 @@ -117,9 +117,9 @@ require ( go.opentelemetry.io/otel v1.43.0 go.opentelemetry.io/otel/metric v1.43.0 // indirect go.opentelemetry.io/otel/trace v1.43.0 - golang.org/x/net v0.52.0 // indirect - golang.org/x/sync v0.20.0 // indirect - golang.org/x/sys v0.42.0 // indirect + golang.org/x/net v0.53.0 // indirect + golang.org/x/sync v0.20.0 + golang.org/x/sys v0.44.0 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 // indirect google.golang.org/grpc v1.80.0 // indirect google.golang.org/protobuf v1.36.11 // indirect diff --git a/go.sum b/go.sum index 87a37e5..c4c7846 100644 --- a/go.sum +++ b/go.sum @@ -241,12 +241,12 @@ go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/crypto v0.0.0-20170512130425-ab89591268e0/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= -golang.org/x/crypto v0.49.0 h1:+Ng2ULVvLHnJ/ZFEq4KdcDd/cfjrrjjNSXNzxg0Y4U4= -golang.org/x/crypto v0.49.0/go.mod h1:ErX4dUh2UM+CFYiXZRTcMpEcN8b/1gxEuv3nODoYtCA= +golang.org/x/crypto v0.51.0 h1:IBPXwPfKxY7cWQZ38ZCIRPI50YLeevDLlLnyC5wRGTI= +golang.org/x/crypto v0.51.0/go.mod h1:8AdwkbraGNABw2kOX6YFPs3WM22XqI4EXEd8g+x7Oc8= golang.org/x/net v0.0.0-20210520170846-37e1c6afe023/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20220403103023-749bd193bc2b/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= -golang.org/x/net v0.52.0 h1:He/TN1l0e4mmR3QqHMT2Xab3Aj3L9qjbhRm78/6jrW0= -golang.org/x/net v0.52.0/go.mod h1:R1MAz7uMZxVMualyPXb+VaqGSa3LIaUqk0eEt3w36Sw= +golang.org/x/net v0.53.0 h1:d+qAbo5L0orcWAr0a9JweQpjXF19LMXJE8Ey7hwOdUA= +golang.org/x/net v0.53.0/go.mod h1:JvMuJH7rrdiCfbeHoo3fCQU24Lf5JJwT9W3sJFulfgs= golang.org/x/oauth2 v0.35.0 h1:Mv2mzuHuZuY2+bkyWXIHMfhNdJAdwW3FuWeCPYN5GVQ= golang.org/x/oauth2 v0.35.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= @@ -262,16 +262,16 @@ golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo= -golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/sys v0.44.0 h1:ildZl3J4uzeKP07r2F++Op7E9B29JRUy+a27EibtBTQ= +golang.org/x/sys v0.44.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.41.0 h1:QCgPso/Q3RTJx2Th4bDLqML4W6iJiaXFq2/ftQF13YU= -golang.org/x/term v0.41.0/go.mod h1:3pfBgksrReYfZ5lvYM0kSO0LIkAl4Yl2bXOkKP7Ec2A= +golang.org/x/term v0.43.0 h1:S4RLU2sB31O/NCl+zFN9Aru9A/Cq2aqKpTZJ6B+DwT4= +golang.org/x/term v0.43.0/go.mod h1:lrhlHNdQJHO+1qVYiHfFKVuVioJIheAc3fBSMFYEIsk= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/text v0.35.0 h1:JOVx6vVDFokkpaq1AEptVzLTpDe9KGpj5tR4/X+ybL8= -golang.org/x/text v0.35.0/go.mod h1:khi/HExzZJ2pGnjenulevKNX1W67CUy0AsXcNubPGCA= +golang.org/x/text v0.37.0 h1:Cqjiwd9eSg8e0QAkyCaQTNHFIIzWtidPahFWR83rTrc= +golang.org/x/text v0.37.0/go.mod h1:a5sjxXGs9hsn/AJVwuElvCAo9v8QYLzvavO5z2PiM38= golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U= golang.org/x/time v0.15.0/go.mod h1:Y4YMaQmXwGQZoFaVFk4YpCt4FLQMYKZe9oeV/f4MSno= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= diff --git a/internal/config/config.go b/internal/config/config.go index 6376c20..800e782 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -1,6 +1,7 @@ package config import ( + "crypto/rand" "encoding/hex" "fmt" "net/url" @@ -216,6 +217,11 @@ func Load() (*Config, error) { // Set defaults setDefaults() + // Audit F8: overlay file-mounted secrets onto Viper. No-op when the + // directory is absent (local/dev), so this is safe to ship before the + // manifests mount honeydue-secrets as a volume. + loadFileSecrets() + // Parse DATABASE_URL if set (Dokku-style) dbConfig := DatabaseConfig{ Host: viper.GetString("DB_HOST"), @@ -432,14 +438,67 @@ func isWeakSecretKey(key string) bool { return knownWeakSecretKeys[strings.ToLower(strings.TrimSpace(key))] } +// loadFileSecrets overlays file-mounted secrets onto Viper (audit F8). When +// the honeydue-secrets Secret is mounted as a volume at /etc/honeydue/secrets +// each key is a file; reading the value here and viper.Set-ing it (highest +// Viper precedence) keeps the secret out of the process environment +// (/proc//environ), which plain env-var injection cannot. When the +// directory is absent it is a silent no-op and env vars are used as before. +func loadFileSecrets() { + dir := os.Getenv("HONEYDUE_SECRETS_DIR") + if dir == "" { + dir = "/etc/honeydue/secrets" + } + for _, k := range []string{ + "POSTGRES_PASSWORD", "SECRET_KEY", "EMAIL_HOST_PASSWORD", "FCM_SERVER_KEY", + "REDIS_PASSWORD", "B2_KEY_ID", "B2_APP_KEY", "OBS_INGEST_TOKEN", "OBS_TRACES_URL", + } { + b, err := os.ReadFile(dir + "/" + k) + if err != nil { + continue + } + if v := strings.TrimSpace(string(b)); v != "" { + viper.Set(k, v) + } + } +} + +// SecretValue resolves a configuration value that is not part of the typed +// Config struct. It reads through Viper, so a value supplied via a file-mounted +// secret (audit F8, loaded by loadFileSecrets) is found just like an env var. +// +// Must be called after Load(). Used by cmd/api and cmd/worker for the +// observability endpoints, which are needed before the full Config is wired +// and would otherwise be read with os.Getenv — which misses file-mounted +// secrets entirely once F8 removes them from the process environment. +func SecretValue(key string) string { + return viper.GetString(key) +} + +// randomHexKey returns a cryptographically secure random hex string +// representing n random bytes (2n hex characters). +func randomHexKey(n int) (string, error) { + b := make([]byte, n) + if _, err := rand.Read(b); err != nil { + return "", err + } + return hex.EncodeToString(b), nil +} + func validate(cfg *Config) error { - // S-08: Validate SECRET_KEY against known weak defaults + // M8: SECRET_KEY validation — no static fallback secret in the binary. if cfg.Security.SecretKey == "" { if cfg.Server.Debug { - // In debug mode, use a default key with a warning for local development - cfg.Security.SecretKey = "change-me-in-production-secret-key-12345" - fmt.Println("WARNING: SECRET_KEY not set, using default (debug mode only)") - fmt.Println("WARNING: *** DO NOT USE THIS DEFAULT KEY IN PRODUCTION ***") + // Debug only: generate a random key per boot. Tokens signed with + // it do not survive a restart, which is acceptable for local dev + // and far safer than a well-known hardcoded fallback. + randomKey, err := randomHexKey(32) + if err != nil { + return fmt.Errorf("failed to generate ephemeral debug SECRET_KEY: %w", err) + } + cfg.Security.SecretKey = randomKey + fmt.Println("WARNING: SECRET_KEY not set, generated an ephemeral random key (debug mode only)") + fmt.Println("WARNING: tokens will not survive a restart — set SECRET_KEY for stable local sessions") } else { // In production, refuse to start without a proper secret key return fmt.Errorf("FATAL: SECRET_KEY environment variable is required in production (DEBUG=false)") @@ -452,6 +511,12 @@ func validate(cfg *Config) error { } } + // C4: fixed confirmation codes ("123456") must never be enabled outside + // debug — with DEBUG=false they are a full authentication bypass. + if cfg.Server.DebugFixedCodes && !cfg.Server.Debug { + return fmt.Errorf("FATAL: DEBUG_FIXED_CODES is enabled with DEBUG=false — fixed confirmation codes must never run in production") + } + // Database password might come from DATABASE_URL, don't require it separately // The actual connection will fail if credentials are wrong diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 494f36b..5cac16a 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -106,8 +106,10 @@ func TestLoad_Validation_MissingSecretKey_DebugMode(t *testing.T) { c, err := Load() require.NoError(t, err) - // In debug mode, a default key is assigned - assert.Equal(t, "change-me-in-production-secret-key-12345", c.Security.SecretKey) + // Audit M8: in debug mode an ephemeral random key is generated per boot + // (no static fallback). It must be a non-empty 64-char hex string. + assert.Len(t, c.Security.SecretKey, 64) + assert.NotEqual(t, "change-me-in-production-secret-key-12345", c.Security.SecretKey) } func TestLoad_Validation_WeakSecretKey_Production(t *testing.T) { @@ -133,6 +135,33 @@ func TestLoad_Validation_WeakSecretKey_DebugMode(t *testing.T) { assert.Equal(t, "secret", c.Security.SecretKey) } +// Audit C4: DEBUG_FIXED_CODES makes confirmation codes a fixed "123456" — a +// full authentication bypass. With DEBUG=false, validate() must refuse to boot +// rather than ship that bypass to production. +func TestLoad_Validation_DebugFixedCodes_Production(t *testing.T) { + // validate() directly — avoids the sync.Once issue Load() has on failure. + cfg := &Config{ + Server: ServerConfig{Debug: false, DebugFixedCodes: true}, + Security: SecurityConfig{SecretKey: "a-strong-secret-key-for-tests"}, + } + + err := validate(cfg) + require.Error(t, err) + assert.Contains(t, err.Error(), "DEBUG_FIXED_CODES") +} + +// With DEBUG=true the fixed codes are an intended local-dev convenience, so +// the same combination must NOT error. +func TestLoad_Validation_DebugFixedCodes_DebugMode(t *testing.T) { + cfg := &Config{ + Server: ServerConfig{Debug: true, DebugFixedCodes: true}, + Security: SecurityConfig{SecretKey: "a-strong-secret-key-for-tests"}, + } + + err := validate(cfg) + require.NoError(t, err) +} + func TestLoad_Validation_EncryptionKey_Valid(t *testing.T) { resetConfigState() t.Setenv("SECRET_KEY", "a-strong-secret-key-for-tests") diff --git a/internal/handlers/auth_handler.go b/internal/handlers/auth_handler.go index 9649274..14b319f 100644 --- a/internal/handlers/auth_handler.go +++ b/internal/handlers/auth_handler.go @@ -1,6 +1,7 @@ package handlers import ( + "context" "errors" "net/http" @@ -55,8 +56,15 @@ func (h *AuthHandler) SetAuditService(auditService *services.AuditService) { h.auditService = auditService } +// noStore marks a response as non-cacheable (audit L2) — auth responses +// carry tokens and user data that must never sit in any cache. +func noStore(c echo.Context) { + c.Response().Header().Set("Cache-Control", "no-store") +} + // Login handles POST /api/auth/login/ func (h *AuthHandler) Login(c echo.Context) error { + noStore(c) var req requests.LoginRequest if err := c.Bind(&req); err != nil { return apperrors.BadRequest("error.invalid_request") @@ -65,9 +73,11 @@ func (h *AuthHandler) Login(c echo.Context) error { return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(err)) } - response, err := h.authService.Login(c.Request().Context(), &req) + response, err := h.authService.Login(c.Request().Context(), &req, c.RealIP()) if err != nil { - log.Debug().Err(err).Str("identifier", req.Username).Msg("Login failed") + log.Debug().Err(err).Str("identifier", req.Username). + Str("ip", c.RealIP()).Str("user_agent", c.Request().UserAgent()). + Msg("Login failed") if h.auditService != nil { h.auditService.LogEvent(c, nil, services.AuditEventLoginFailed, map[string]interface{}{ "identifier": req.Username, @@ -86,6 +96,7 @@ func (h *AuthHandler) Login(c echo.Context) error { // Register handles POST /api/auth/register/ func (h *AuthHandler) Register(c echo.Context) error { + noStore(c) var req requests.RegisterRequest if err := c.Bind(&req); err != nil { return apperrors.BadRequest("error.invalid_request") @@ -157,6 +168,7 @@ func (h *AuthHandler) Logout(c echo.Context) error { // CurrentUser handles GET /api/auth/me/ func (h *AuthHandler) CurrentUser(c echo.Context) error { + noStore(c) user, err := middleware.MustGetAuthUser(c) if err != nil { return err @@ -276,31 +288,7 @@ func (h *AuthHandler) ForgotPassword(c echo.Context) error { return c.JSON(http.StatusBadRequest, validator.FormatValidationErrors(err)) } - code, user, err := h.authService.ForgotPassword(c.Request().Context(), req.Email) - if err != nil { - var appErr *apperrors.AppError - if errors.As(err, &appErr) && appErr.Code == http.StatusTooManyRequests { - // Only reveal rate limit errors - return err - } - - log.Error().Err(err).Str("email", req.Email).Msg("Forgot password failed") - // Don't reveal other errors to prevent email enumeration - } - - // Send password reset email (async) - only if user found - if h.emailService != nil && code != "" && user != nil { - go func() { - defer func() { - if r := recover(); r != nil { - log.Error().Interface("panic", r).Str("email", user.Email).Msg("Panic in password reset email goroutine") - } - }() - if err := h.emailService.SendPasswordResetEmail(user.Email, user.FirstName, code); err != nil { - log.Error().Err(err).Str("email", user.Email).Msg("Failed to send password reset email") - } - }() - } + noStore(c) if h.auditService != nil { h.auditService.LogEvent(c, nil, services.AuditEventPasswordReset, map[string]interface{}{ @@ -308,7 +296,33 @@ func (h *AuthHandler) ForgotPassword(c echo.Context) error { }) } - // Always return success to prevent email enumeration + // Audit LIVE-L13: run the user lookup, code generation, and email send + // entirely in the background, then return the generic response + // immediately. This makes the response time identical whether or not + // the email belongs to a real account, defeating timing-based user + // enumeration. context.Background() is used because the request context + // is cancelled the moment this handler returns. Per-account rate + // limiting still runs inside the service; the edge auth-rate-limit + // middleware covers per-IP abuse. + email := req.Email + go func() { + defer func() { + if r := recover(); r != nil { + log.Error().Interface("panic", r).Str("email", email).Msg("Panic in forgot-password goroutine") + } + }() + code, user, err := h.authService.ForgotPassword(context.Background(), email) + if err != nil || code == "" || user == nil { + return + } + if h.emailService != nil { + if sendErr := h.emailService.SendPasswordResetEmail(user.Email, user.FirstName, code); sendErr != nil { + log.Error().Err(sendErr).Str("email", user.Email).Msg("Failed to send password reset email") + } + } + }() + + // Always return success to prevent email enumeration. return c.JSON(http.StatusOK, responses.ForgotPasswordResponse{ Message: "Password reset email sent", }) @@ -365,6 +379,7 @@ func (h *AuthHandler) ResetPassword(c echo.Context) error { // AppleSignIn handles POST /api/auth/apple-sign-in/ func (h *AuthHandler) AppleSignIn(c echo.Context) error { + noStore(c) var req requests.AppleSignInRequest if err := c.Bind(&req); err != nil { return apperrors.BadRequest("error.invalid_request") @@ -412,6 +427,7 @@ func (h *AuthHandler) AppleSignIn(c echo.Context) error { // GoogleSignIn handles POST /api/auth/google-sign-in/ func (h *AuthHandler) GoogleSignIn(c echo.Context) error { + noStore(c) var req requests.GoogleSignInRequest if err := c.Bind(&req); err != nil { return apperrors.BadRequest("error.invalid_request") @@ -459,6 +475,7 @@ func (h *AuthHandler) GoogleSignIn(c echo.Context) error { // RefreshToken handles POST /api/auth/refresh/ func (h *AuthHandler) RefreshToken(c echo.Context) error { + noStore(c) user, err := middleware.MustGetAuthUser(c) if err != nil { return err diff --git a/internal/handlers/handler_coverage_test.go b/internal/handlers/handler_coverage_test.go index ca5ab2b..3e64bd1 100644 --- a/internal/handlers/handler_coverage_test.go +++ b/internal/handlers/handler_coverage_test.go @@ -650,14 +650,14 @@ func TestAuthHandler_RefreshToken(t *testing.T) { authGroup.Use(func(next echo.HandlerFunc) echo.HandlerFunc { return func(c echo.Context) error { c.Set("auth_user", user) - c.Set("auth_token", authToken.Key) + c.Set("auth_token", authToken.Plaintext) // raw token — repo hashes for lookup (audit C1) return next(c) } }) authGroup.POST("/refresh/", handler.RefreshToken) t.Run("successful refresh", func(t *testing.T) { - w := testutil.MakeRequest(e, "POST", "/api/auth/refresh/", nil, authToken.Key) + w := testutil.MakeRequest(e, "POST", "/api/auth/refresh/", nil, authToken.Plaintext) testutil.AssertStatusCode(t, w, http.StatusOK) var response map[string]interface{} diff --git a/internal/handlers/media_handler.go b/internal/handlers/media_handler.go index 464f576..19e665d 100644 --- a/internal/handlers/media_handler.go +++ b/internal/handlers/media_handler.go @@ -37,6 +37,23 @@ func NewMediaHandler( } } +// safeContentDisposition builds an inline Content-Disposition header value +// with a sanitized filename (audit M1). Control characters (including CR/LF), +// double-quote and backslash are stripped so an attacker-controlled upload +// filename cannot inject additional response headers (CWE-113). +func safeContentDisposition(filename string) string { + cleaned := strings.Map(func(r rune) rune { + if r < 0x20 || r == 0x7f || r == '"' || r == '\\' { + return -1 + } + return r + }, filename) + if cleaned == "" { + cleaned = "download" + } + return `inline; filename="` + cleaned + `"` +} + // ServeDocument serves a document file with access control // GET /api/media/document/:id func (h *MediaHandler) ServeDocument(c echo.Context) error { @@ -71,7 +88,7 @@ func (h *MediaHandler) ServeDocument(c echo.Context) error { // Set caching and disposition headers c.Response().Header().Set("Cache-Control", "private, max-age=3600") if doc.FileName != "" { - c.Response().Header().Set("Content-Disposition", "inline; filename=\""+doc.FileName+"\"") + c.Response().Header().Set("Content-Disposition", safeContentDisposition(doc.FileName)) } return c.Blob(http.StatusOK, mimeType, data) } @@ -114,7 +131,7 @@ func (h *MediaHandler) ServeDocumentImage(c echo.Context) error { } c.Response().Header().Set("Cache-Control", "private, max-age=3600") - c.Response().Header().Set("Content-Disposition", "inline; filename=\""+filepath.Base(img.ImageURL)+"\"") + c.Response().Header().Set("Content-Disposition", safeContentDisposition(filepath.Base(img.ImageURL))) return c.Blob(http.StatusOK, mimeType, data) } @@ -162,7 +179,7 @@ func (h *MediaHandler) ServeCompletionImage(c echo.Context) error { } c.Response().Header().Set("Cache-Control", "private, max-age=3600") - c.Response().Header().Set("Content-Disposition", "inline; filename=\""+filepath.Base(img.ImageURL)+"\"") + c.Response().Header().Set("Content-Disposition", safeContentDisposition(filepath.Base(img.ImageURL))) return c.Blob(http.StatusOK, mimeType, data) } diff --git a/internal/handlers/subscription_webhook_handler.go b/internal/handlers/subscription_webhook_handler.go index 933e4da..d64a5d6 100644 --- a/internal/handlers/subscription_webhook_handler.go +++ b/internal/handlers/subscription_webhook_handler.go @@ -8,6 +8,7 @@ import ( "encoding/base64" "encoding/json" "encoding/pem" + "errors" "fmt" "io" "math/big" @@ -20,6 +21,7 @@ import ( "github.com/golang-jwt/jwt/v5" "github.com/labstack/echo/v4" "github.com/rs/zerolog/log" + "gorm.io/gorm" "github.com/treytartt/honeydue-api/internal/config" "github.com/treytartt/honeydue-api/internal/models" @@ -165,9 +167,13 @@ func (h *SubscriptionWebhookHandler) HandleAppleWebhook(c echo.Context) error { if notification.NotificationUUID != "" { alreadyProcessed, err := h.webhookEventRepo.HasProcessed("apple", notification.NotificationUUID) if err != nil { - log.Error().Err(err).Msg("Apple Webhook: Failed to check dedup") - // Continue processing on dedup check failure (fail-open) - } else if alreadyProcessed { + // Audit H6: fail closed. A dedup-check failure must not let a + // possibly-duplicate event through (duplicate refunds/grants). + // Return 500 so Apple retries once the DB is healthy. + log.Error().Err(err).Msg("Apple Webhook: dedup check failed — returning 500 for retry") + return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "dedup check failed"}) + } + if alreadyProcessed { log.Info().Str("uuid", notification.NotificationUUID).Msg("Apple Webhook: Duplicate event, skipping") return c.JSON(http.StatusOK, map[string]interface{}{"status": "duplicate"}) } @@ -352,10 +358,24 @@ func (h *SubscriptionWebhookHandler) processAppleNotification( } func (h *SubscriptionWebhookHandler) findUserByAppleTransaction(originalTransactionID string) (*models.User, error) { - // Look up user subscription by stored receipt data - subscription, err := h.subscriptionRepo.FindByAppleReceiptContains(originalTransactionID) + // Audit C13: exact match on the indexed apple_original_transaction_id + // column. Falls back to the legacy escaped-LIKE scan over + // apple_receipt_data only for subscriptions created before that column + // existed (and thus not yet populated). + subscription, err := h.subscriptionRepo.FindByAppleOriginalTransactionID(originalTransactionID) if err != nil { - return nil, err + // Only fall back to the legacy substring scan when the exact-match + // column genuinely had no row (a subscription created before the + // column existed). A real DB error must propagate — masking it as + // "not found" could bind the webhook to the wrong account via the + // LIKE scan, or silently drop a legitimate event. + if !errors.Is(err, gorm.ErrRecordNotFound) { + return nil, err + } + subscription, err = h.subscriptionRepo.FindByAppleReceiptContains(originalTransactionID) + if err != nil { + return nil, err + } } user, err := h.userRepo.FindByID(subscription.UserID) @@ -566,9 +586,12 @@ func (h *SubscriptionWebhookHandler) HandleGoogleWebhook(c echo.Context) error { if messageID != "" { alreadyProcessed, err := h.webhookEventRepo.HasProcessed("google", messageID) if err != nil { - log.Error().Err(err).Msg("Google Webhook: Failed to check dedup") - // Continue processing on dedup check failure (fail-open) - } else if alreadyProcessed { + // Audit H6: fail closed — see the Apple handler. Return 500 so + // Google Pub/Sub redelivers once the DB is healthy. + log.Error().Err(err).Msg("Google Webhook: dedup check failed — returning 500 for retry") + return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "dedup check failed"}) + } + if alreadyProcessed { log.Info().Str("message_id", messageID).Msg("Google Webhook: Duplicate event, skipping") return c.JSON(http.StatusOK, map[string]interface{}{"status": "duplicate"}) } diff --git a/internal/middleware/auth.go b/internal/middleware/auth.go index 725e976..550a62d 100644 --- a/internal/middleware/auth.go +++ b/internal/middleware/auth.go @@ -169,6 +169,34 @@ func (m *AuthMiddleware) OptionalTokenAuth() echo.MiddlewareFunc { } } +// RequireVerified returns middleware that rejects users whose email is not +// verified (audit LIVE-L19). Apply it after TokenAuth to gate sensitive +// actions — e.g. generating residence share codes — behind proof that the +// account actually controls its email address. +func (m *AuthMiddleware) RequireVerified() echo.MiddlewareFunc { + return func(next echo.HandlerFunc) echo.HandlerFunc { + return func(c echo.Context) error { + user := GetAuthUser(c) + if user == nil { + return apperrors.Unauthorized("error.not_authenticated") + } + var verified bool + err := m.db.WithContext(c.Request().Context()). + Model(&models.UserProfile{}). + Where("user_id = ?", user.ID). + Select("verified"). + Scan(&verified).Error + if err != nil { + return apperrors.Internal(err) + } + if !verified { + return apperrors.Forbidden("error.email_not_verified") + } + return next(c) + } + } +} + // extractToken extracts the token from the Authorization header func extractToken(c echo.Context) (string, error) { authHeader := c.Request().Header.Get("Authorization") @@ -297,7 +325,7 @@ func (m *AuthMiddleware) getUserFromDatabaseWithToken(ctx context.Context, token u.last_login AS u_last_login `). Joins("INNER JOIN auth_user u ON u.id = t.user_id"). - Where("t.key = ?", token). + Where("t.key = ?", models.HashToken(token)). // audit C1: only the hash is stored Limit(1). Scan(&row).Error if err != nil || row.Key == "" { diff --git a/internal/middleware/auth_expiry_test.go b/internal/middleware/auth_expiry_test.go index c4e4509..525e577 100644 --- a/internal/middleware/auth_expiry_test.go +++ b/internal/middleware/auth_expiry_test.go @@ -65,7 +65,7 @@ func TestTokenAuth_RejectsExpiredToken(t *testing.T) { e := echo.New() req := httptest.NewRequest(http.MethodGet, "/api/test/", nil) - req.Header.Set("Authorization", "Token "+token.Key) + req.Header.Set("Authorization", "Token "+token.Plaintext) rec := httptest.NewRecorder() c := e.NewContext(req, rec) @@ -86,7 +86,7 @@ func TestTokenAuth_AcceptsValidToken(t *testing.T) { e := echo.New() req := httptest.NewRequest(http.MethodGet, "/api/test/", nil) - req.Header.Set("Authorization", "Token "+token.Key) + req.Header.Set("Authorization", "Token "+token.Plaintext) rec := httptest.NewRecorder() c := e.NewContext(req, rec) @@ -112,7 +112,7 @@ func TestTokenAuth_AcceptsTokenAtBoundary(t *testing.T) { e := echo.New() req := httptest.NewRequest(http.MethodGet, "/api/test/", nil) - req.Header.Set("Authorization", "Token "+token.Key) + req.Header.Set("Authorization", "Token "+token.Plaintext) rec := httptest.NewRecorder() c := e.NewContext(req, rec) diff --git a/internal/middleware/auth_test.go b/internal/middleware/auth_test.go index d769eee..42df77b 100644 --- a/internal/middleware/auth_test.go +++ b/internal/middleware/auth_test.go @@ -21,7 +21,7 @@ func TestTokenAuth_BearerScheme_Accepted(t *testing.T) { e := echo.New() req := httptest.NewRequest(http.MethodGet, "/api/test/", nil) - req.Header.Set("Authorization", "Bearer "+token.Key) + req.Header.Set("Authorization", "Bearer "+token.Plaintext) rec := httptest.NewRecorder() c := e.NewContext(req, rec) @@ -46,7 +46,7 @@ func TestTokenAuth_InvalidScheme_Rejected(t *testing.T) { e := echo.New() req := httptest.NewRequest(http.MethodGet, "/api/test/", nil) - req.Header.Set("Authorization", "Basic "+token.Key) + req.Header.Set("Authorization", "Basic "+token.Plaintext) rec := httptest.NewRecorder() c := e.NewContext(req, rec) @@ -110,7 +110,7 @@ func TestTokenAuth_InactiveUser_Rejected(t *testing.T) { e := echo.New() req := httptest.NewRequest(http.MethodGet, "/api/test/", nil) - req.Header.Set("Authorization", "Token "+token.Key) + req.Header.Set("Authorization", "Token "+token.Plaintext) rec := httptest.NewRecorder() c := e.NewContext(req, rec) @@ -156,7 +156,7 @@ func TestOptionalTokenAuth_ValidToken_SetsUser(t *testing.T) { e := echo.New() req := httptest.NewRequest(http.MethodGet, "/api/test/", nil) - req.Header.Set("Authorization", "Token "+token.Key) + req.Header.Set("Authorization", "Token "+token.Plaintext) rec := httptest.NewRecorder() c := e.NewContext(req, rec) @@ -182,7 +182,7 @@ func TestOptionalTokenAuth_ExpiredToken_IgnoresUser(t *testing.T) { e := echo.New() req := httptest.NewRequest(http.MethodGet, "/api/test/", nil) - req.Header.Set("Authorization", "Token "+token.Key) + req.Header.Set("Authorization", "Token "+token.Plaintext) rec := httptest.NewRecorder() c := e.NewContext(req, rec) @@ -242,7 +242,7 @@ func TestNewAuthMiddlewareWithConfig_CustomExpiryDays(t *testing.T) { e := echo.New() req := httptest.NewRequest(http.MethodGet, "/api/test/", nil) - req.Header.Set("Authorization", "Token "+token.Key) + req.Header.Set("Authorization", "Token "+token.Plaintext) rec := httptest.NewRecorder() c := e.NewContext(req, rec) @@ -270,7 +270,7 @@ func TestNewAuthMiddlewareWithConfig_ExpiredWithCustomExpiry(t *testing.T) { e := echo.New() req := httptest.NewRequest(http.MethodGet, "/api/test/", nil) - req.Header.Set("Authorization", "Token "+token.Key) + req.Header.Set("Authorization", "Token "+token.Plaintext) rec := httptest.NewRecorder() c := e.NewContext(req, rec) diff --git a/internal/middleware/timezone.go b/internal/middleware/timezone.go index 12a352f..df9dfb3 100644 --- a/internal/middleware/timezone.go +++ b/internal/middleware/timezone.go @@ -99,21 +99,23 @@ func parseTimezone(tz string) *time.Location { return loc } - // Try parsing as UTC offset (e.g., "-08:00", "+05:30") - // We parse a reference time with the given offset to extract the offset value - t, err := time.Parse("-07:00", tz) - if err == nil { - // time.Parse returns a time, we need to extract the offset - // The parsed time will have the offset embedded - _, offset := t.Zone() - return time.FixedZone(tz, offset) + // Try parsing as a UTC offset (e.g., "-08:00", "+05:30"). Audit H8: + // reject absurd offsets — real timezones are within ±14h of UTC — so a + // crafted X-Timezone header cannot shift date math arbitrarily. + const maxOffsetSeconds = 14 * 3600 + if t, err := time.Parse("-07:00", tz); err == nil { + if _, offset := t.Zone(); offset >= -maxOffsetSeconds && offset <= maxOffsetSeconds { + return time.FixedZone(tz, offset) + } + return time.UTC } // Also try without colon (e.g., "-0800") - t, err = time.Parse("-0700", tz) - if err == nil { - _, offset := t.Zone() - return time.FixedZone(tz, offset) + if t, err := time.Parse("-0700", tz); err == nil { + if _, offset := t.Zone(); offset >= -maxOffsetSeconds && offset <= maxOffsetSeconds { + return time.FixedZone(tz, offset) + } + return time.UTC } // Default to UTC diff --git a/internal/models/models_coverage_test.go b/internal/models/models_coverage_test.go index 1a4a8f0..428b9a4 100644 --- a/internal/models/models_coverage_test.go +++ b/internal/models/models_coverage_test.go @@ -252,7 +252,8 @@ func TestAuthToken_BeforeCreate_GeneratesKey(t *testing.T) { require.NoError(t, err) assert.NotEmpty(t, token.Key) - assert.Len(t, token.Key, 40) // 20 bytes = 40 hex chars + assert.Len(t, token.Key, 64) // SHA-256 hex hash (audit C1) + assert.Len(t, token.Plaintext, 40) // raw 20-byte token, returned to the client assert.False(t, token.Created.IsZero()) } diff --git a/internal/models/subscription.go b/internal/models/subscription.go index b14e54e..adab06a 100644 --- a/internal/models/subscription.go +++ b/internal/models/subscription.go @@ -43,6 +43,9 @@ type UserSubscription struct { // In-App Purchase data (Apple / Google) AppleReceiptData *string `gorm:"column:apple_receipt_data;type:text" json:"-"` GooglePurchaseToken *string `gorm:"column:google_purchase_token;type:text" json:"-"` + // AppleOriginalTransactionID binds an Apple subscription to one account + // (audit C5/C13). A partial unique index enforces one-account-per-txn. + AppleOriginalTransactionID *string `gorm:"column:apple_original_transaction_id;type:text" json:"-"` // Stripe data (web subscriptions) StripeCustomerID *string `gorm:"column:stripe_customer_id;size:255" json:"-"` diff --git a/internal/models/user.go b/internal/models/user.go index da066fe..f129e84 100644 --- a/internal/models/user.go +++ b/internal/models/user.go @@ -2,7 +2,10 @@ package models import ( "crypto/rand" + "crypto/sha256" + "encoding/binary" "encoding/hex" + "fmt" "time" "golang.org/x/crypto/bcrypt" @@ -37,14 +40,16 @@ func (User) TableName() string { return "auth_user" } +// BcryptCost is the bcrypt work factor for password and code hashing. +// 12 (audit M2) is stronger than bcrypt.DefaultCost (10). +const BcryptCost = 12 + // SetPassword hashes and sets the password func (u *User) SetPassword(password string) error { - // Django uses PBKDF2_SHA256 by default, but we'll use bcrypt for Go - // Note: This means passwords set by Django won't work with Go's check - // For migration, you'd need to either: - // 1. Force password reset for all users - // 2. Implement Django's PBKDF2 hasher in Go - hash, err := bcrypt.GenerateFromPassword([]byte(password), bcrypt.DefaultCost) + // Django uses PBKDF2_SHA256 by default, but we use bcrypt for Go. + // Passwords set by Django won't verify with Go's bcrypt check — those + // users must reset their password after migration. + hash, err := bcrypt.GenerateFromPassword([]byte(password), BcryptCost) if err != nil { return err } @@ -69,12 +74,22 @@ func (u *User) GetFullName() string { return u.Username } -// AuthToken represents the user_authtoken table +// AuthToken represents the user_authtoken table. +// +// Audit C1: the Key column stores the SHA-256 hash of the token, never the +// token itself. The raw token is handed to the client exactly once, at +// creation, via the non-persisted Plaintext field — it is never stored or +// logged. A database compromise therefore yields no usable session tokens. type AuthToken struct { - Key string `gorm:"column:key;primaryKey;size:40" json:"key"` + Key string `gorm:"column:key;primaryKey;size:64" json:"-"` UserID uint `gorm:"column:user_id;uniqueIndex;not null" json:"user_id"` Created time.Time `gorm:"column:created;autoCreateTime" json:"created"` + // Plaintext is the raw token value. It is never persisted (gorm:"-") + // and is only populated on a freshly-created token so the caller can + // return it to the client. On a token loaded from the DB it is "". + Plaintext string `gorm:"-" json:"-"` + // Relations User User `gorm:"foreignKey:UserID" json:"-"` } @@ -84,10 +99,13 @@ func (AuthToken) TableName() string { return "user_authtoken" } -// BeforeCreate generates a token key if not provided +// BeforeCreate generates a token if one is not already set, storing only +// its hash in Key and the raw value in the non-persisted Plaintext field. func (t *AuthToken) BeforeCreate(tx *gorm.DB) error { if t.Key == "" { - t.Key = generateToken() + raw := generateToken() + t.Plaintext = raw + t.Key = HashToken(raw) } if t.Created.IsZero() { t.Created = time.Now().UTC() @@ -95,13 +113,23 @@ func (t *AuthToken) BeforeCreate(tx *gorm.DB) error { return nil } -// generateToken creates a random 40-character hex token +// generateToken creates a random 40-character hex token (the raw value). func generateToken() string { b := make([]byte, 20) rand.Read(b) return hex.EncodeToString(b) } +// HashToken returns the at-rest representation of an auth token: the +// hex-encoded SHA-256 hash. Auth tokens are 160-bit random values, so a +// fast deterministic hash is appropriate — there is nothing to brute-force, +// and determinism preserves the single indexed-lookup query in the auth +// middleware. The raw token is never stored. +func HashToken(raw string) string { + sum := sha256.Sum256([]byte(raw)) + return hex.EncodeToString(sum[:]) +} + // GetOrCreate gets an existing token or creates a new one for the user func GetOrCreateToken(tx *gorm.DB, userID uint) (*AuthToken, error) { var token AuthToken @@ -160,15 +188,22 @@ func (c *ConfirmationCode) IsValid() bool { return !c.IsUsed && time.Now().UTC().Before(c.ExpiresAt) } -// GenerateCode creates a random 6-digit code +// GenerateConfirmationCode creates a uniformly-random 6-digit code using +// rejection sampling on crypto/rand (audit H4 — removes the modulo bias of +// the previous implementation). func GenerateConfirmationCode() string { - b := make([]byte, 3) - rand.Read(b) - // Convert to 6-digit number - num := int(b[0])<<16 | int(b[1])<<8 | int(b[2]) - return string(rune('0'+num%10)) + string(rune('0'+(num/10)%10)) + - string(rune('0'+(num/100)%10)) + string(rune('0'+(num/1000)%10)) + - string(rune('0'+(num/10000)%10)) + string(rune('0'+(num/100000)%10)) + for { + var b [4]byte + if _, err := rand.Read(b[:]); err != nil { + continue + } + // 4294000000 is the largest multiple of 1e6 <= MaxUint32; rejecting + // the tail above it makes n % 1000000 perfectly uniform. + n := binary.BigEndian.Uint32(b[:]) + if n < 4294000000 { + return fmt.Sprintf("%06d", n%1000000) + } + } } // PasswordResetCode represents the user_passwordresetcode table @@ -193,7 +228,7 @@ func (PasswordResetCode) TableName() string { // SetCode hashes and stores the reset code func (p *PasswordResetCode) SetCode(code string) error { - hash, err := bcrypt.GenerateFromPassword([]byte(code), bcrypt.DefaultCost) + hash, err := bcrypt.GenerateFromPassword([]byte(code), BcryptCost) if err != nil { return err } diff --git a/internal/repositories/residence_repo.go b/internal/repositories/residence_repo.go index b80bfc9..9362fdb 100644 --- a/internal/repositories/residence_repo.go +++ b/internal/repositories/residence_repo.go @@ -9,6 +9,7 @@ import ( "github.com/rs/zerolog/log" "gorm.io/gorm" + "gorm.io/gorm/clause" "github.com/treytartt/honeydue-api/internal/models" ) @@ -194,6 +195,60 @@ func (r *ResidenceRepository) HasAccess(residenceID, userID uint) (bool, error) return count > 0, nil } +// JoinWithShareCode atomically redeems a one-time share code (audit C9/H9): +// it locks the share-code row, re-checks validity under the lock, adds the +// user to the residence, and deactivates the code — all in one transaction. +// Concurrent redemptions of the same code serialize on the row lock; the +// loser sees is_active=false and is rejected. A failure to deactivate aborts +// the whole join. Returns gorm.ErrRecordNotFound for an unknown, inactive, or +// expired code so the caller can map every case to one generic error. +func (r *ResidenceRepository) JoinWithShareCode(code string, userID uint) (residenceID uint, alreadyMember bool, err error) { + err = r.db.Transaction(func(tx *gorm.DB) error { + var sc models.ResidenceShareCode + if e := tx.Clauses(clause.Locking{Strength: "UPDATE"}). + Where("code = ?", code).First(&sc).Error; e != nil { + return e + } + if !sc.IsActive || (sc.ExpiresAt != nil && time.Now().UTC().After(*sc.ExpiresAt)) { + return gorm.ErrRecordNotFound + } + residenceID = sc.ResidenceID + + // Already a member (owner or shared user)? + var accessCount int64 + if e := tx.Raw(` + SELECT COUNT(*) FROM ( + SELECT 1 FROM residence_residence + WHERE id = ? AND owner_id = ? AND is_active = true + UNION + SELECT 1 FROM residence_residence_users + WHERE residence_id = ? AND user_id = ? + ) ac + `, sc.ResidenceID, userID, sc.ResidenceID, userID).Scan(&accessCount).Error; e != nil { + return e + } + if accessCount > 0 { + alreadyMember = true + return nil + } + + if e := tx.Exec( + "INSERT INTO residence_residence_users (residence_id, user_id) VALUES (?, ?) ON CONFLICT DO NOTHING", + sc.ResidenceID, userID, + ).Error; e != nil { + return e + } + + // One-time use: deactivate the code. A failure here aborts the join. + if e := tx.Model(&models.ResidenceShareCode{}). + Where("id = ?", sc.ID).Update("is_active", false).Error; e != nil { + return e + } + return nil + }) + return residenceID, alreadyMember, err +} + // IsOwner checks if a user is the owner of a residence func (r *ResidenceRepository) IsOwner(residenceID, userID uint) (bool, error) { var count int64 diff --git a/internal/repositories/subscription_repo.go b/internal/repositories/subscription_repo.go index b47ef2c..de321a0 100644 --- a/internal/repositories/subscription_repo.go +++ b/internal/repositories/subscription_repo.go @@ -151,6 +151,28 @@ func (r *SubscriptionRepository) FindByAppleReceiptContains(transactionID string return &sub, nil } +// FindByAppleOriginalTransactionID finds a subscription by the Apple original +// transaction ID (audit C5/C13). Exact match on an indexed column — replaces +// the LIKE scan in FindByAppleReceiptContains for both replay detection and +// webhook user lookup. +func (r *SubscriptionRepository) FindByAppleOriginalTransactionID(originalTransactionID string) (*models.UserSubscription, error) { + var sub models.UserSubscription + err := r.db.Where("apple_original_transaction_id = ?", originalTransactionID).First(&sub).Error + if err != nil { + return nil, err + } + return &sub, nil +} + +// UpdateAppleOriginalTransactionID binds an Apple original transaction ID to a +// user's subscription. A partial unique index enforces one account per +// transaction (audit C5) — a second account claiming the same ID fails here. +func (r *SubscriptionRepository) UpdateAppleOriginalTransactionID(userID uint, originalTransactionID string) error { + return r.db.Model(&models.UserSubscription{}). + Where("user_id = ?", userID). + Update("apple_original_transaction_id", originalTransactionID).Error +} + // FindByGoogleToken finds a subscription by Google purchase token // Used by webhooks to find the user associated with a purchase func (r *SubscriptionRepository) FindByGoogleToken(purchaseToken string) (*models.UserSubscription, error) { diff --git a/internal/repositories/subscription_repo_test.go b/internal/repositories/subscription_repo_test.go index 01622ea..c038fd0 100644 --- a/internal/repositories/subscription_repo_test.go +++ b/internal/repositories/subscription_repo_test.go @@ -226,3 +226,48 @@ func TestUpdateExpiresAt(t *testing.T) { require.NotNil(t, updated.ExpiresAt) assert.WithinDuration(t, newExpiry, *updated.ExpiresAt, time.Second, "expires_at should be updated") } + +// TestSubscriptionRepo_IAPTransactionReplayRejected is the regression test for +// audit C5/C6: an in-app-purchase transaction (an Apple original transaction +// ID or a Google purchase token) may be bound to exactly one account. Without +// that guarantee a valid receipt could be replayed against a second account +// to grant Pro for free. The guarantee is the pair of partial unique indexes +// added by migration 000004; AutoMigrate does not create them, so this test +// recreates them verbatim to exercise the same DB-level enforcement. +func TestSubscriptionRepo_IAPTransactionReplayRejected(t *testing.T) { + db := testutil.SetupTestDB(t) + require.NoError(t, db.Exec(`CREATE UNIQUE INDEX uq_subscription_apple_original_txn `+ + `ON subscription_usersubscription (apple_original_transaction_id) `+ + `WHERE apple_original_transaction_id IS NOT NULL AND apple_original_transaction_id <> ''`).Error) + require.NoError(t, db.Exec(`CREATE UNIQUE INDEX uq_subscription_google_purchase_token `+ + `ON subscription_usersubscription (google_purchase_token) `+ + `WHERE google_purchase_token IS NOT NULL AND google_purchase_token <> ''`).Error) + + repo := NewSubscriptionRepository(db) + userA := testutil.CreateTestUser(t, db, "iapusera", "iapa@test.com", "password") + userB := testutil.CreateTestUser(t, db, "iapuserb", "iapb@test.com", "password") + require.NoError(t, db.Create(&models.UserSubscription{UserID: userA.ID, Tier: models.TierFree}).Error) + require.NoError(t, db.Create(&models.UserSubscription{UserID: userB.ID, Tier: models.TierFree}).Error) + + t.Run("apple transaction cannot be claimed by a second account", func(t *testing.T) { + require.NoError(t, repo.UpdateAppleOriginalTransactionID(userA.ID, "apple-original-txn-1"), + "the first account binding the transaction must succeed") + err := repo.UpdateAppleOriginalTransactionID(userB.ID, "apple-original-txn-1") + require.Error(t, err, + "replaying account A's Apple transaction onto account B must be rejected (C5)") + }) + + t.Run("google purchase token cannot be claimed by a second account", func(t *testing.T) { + require.NoError(t, repo.UpdatePurchaseToken(userA.ID, "google-purchase-token-1"), + "the first account binding the token must succeed") + err := repo.UpdatePurchaseToken(userB.ID, "google-purchase-token-1") + require.Error(t, err, + "replaying account A's Google purchase token onto account B must be rejected (C6)") + }) + + t.Run("re-binding the same transaction to the same account is allowed", func(t *testing.T) { + // A renewal re-submitting the same transaction for its owner must not + // be rejected — the partial unique index excludes the row's own value. + require.NoError(t, repo.UpdateAppleOriginalTransactionID(userA.ID, "apple-original-txn-1")) + }) +} diff --git a/internal/repositories/user_repo.go b/internal/repositories/user_repo.go index dd6038a..bbaccc4 100644 --- a/internal/repositories/user_repo.go +++ b/internal/repositories/user_repo.go @@ -174,10 +174,12 @@ func (r *UserRepository) GetOrCreateToken(userID uint) (*models.AuthToken, error return &token, nil } -// FindTokenByKey looks up an auth token by its key value. -func (r *UserRepository) FindTokenByKey(key string) (*models.AuthToken, error) { +// FindTokenByKey looks up an auth token by its raw key value. The raw token +// is hashed (audit C1) before the indexed lookup, since only the hash is +// stored. +func (r *UserRepository) FindTokenByKey(rawKey string) (*models.AuthToken, error) { var token models.AuthToken - if err := r.db.Where("key = ?", key).First(&token).Error; err != nil { + if err := r.db.Where("key = ?", models.HashToken(rawKey)).First(&token).Error; err != nil { if errors.Is(err, gorm.ErrRecordNotFound) { return nil, ErrTokenNotFound } @@ -195,9 +197,46 @@ func (r *UserRepository) CreateToken(userID uint) (*models.AuthToken, error) { return &token, nil } -// DeleteToken deletes an auth token +// CreateFreshToken issues a new auth token for the user, replacing any +// existing one. Because tokens are stored hashed (audit C1) the server +// cannot re-issue a previously-minted token's plaintext, so every login +// mints a fresh token. The returned token's Plaintext field carries the +// raw value to hand to the client; it is never persisted. +// +// It also returns the stored hashes of the token rows it deleted, so the +// caller can evict those entries from the Redis token cache (audit MEDIUM-1). +// Without that, a prior (e.g. stolen) token keeps authenticating via a cache +// hit for up to the cache TTL even though its DB row is gone. +func (r *UserRepository) CreateFreshToken(userID uint) (*models.AuthToken, []string, error) { + var token models.AuthToken + var oldHashes []string + err := r.db.Transaction(func(tx *gorm.DB) error { + var old []models.AuthToken + if err := tx.Where("user_id = ?", userID).Find(&old).Error; err != nil { + return err + } + oldHashes = make([]string, 0, len(old)) + for i := range old { + if old[i].Key != "" { + oldHashes = append(oldHashes, old[i].Key) + } + } + if err := tx.Where("user_id = ?", userID).Delete(&models.AuthToken{}).Error; err != nil { + return err + } + token = models.AuthToken{UserID: userID} + return tx.Create(&token).Error + }) + if err != nil { + return nil, nil, err + } + return &token, oldHashes, nil +} + +// DeleteToken deletes an auth token by its raw key value. The raw token is +// hashed (audit C1) before the lookup, since only the hash is stored. func (r *UserRepository) DeleteToken(token string) error { - result := r.db.Where("key = ?", token).Delete(&models.AuthToken{}) + result := r.db.Where("key = ?", models.HashToken(token)).Delete(&models.AuthToken{}) if result.Error != nil { return result.Error } diff --git a/internal/repositories/user_repo_coverage_test.go b/internal/repositories/user_repo_coverage_test.go index 89d19e9..a94d3cd 100644 --- a/internal/repositories/user_repo_coverage_test.go +++ b/internal/repositories/user_repo_coverage_test.go @@ -104,7 +104,7 @@ func TestUserRepository_FindTokenByKey(t *testing.T) { token, err := repo.GetOrCreateToken(user.ID) require.NoError(t, err) - found, err := repo.FindTokenByKey(token.Key) + found, err := repo.FindTokenByKey(token.Plaintext) require.NoError(t, err) assert.Equal(t, token.Key, found.Key) assert.Equal(t, user.ID, found.UserID) @@ -128,10 +128,10 @@ func TestUserRepository_DeleteToken(t *testing.T) { token, err := repo.GetOrCreateToken(user.ID) require.NoError(t, err) - err = repo.DeleteToken(token.Key) + err = repo.DeleteToken(token.Plaintext) require.NoError(t, err) - _, err = repo.FindTokenByKey(token.Key) + _, err = repo.FindTokenByKey(token.Plaintext) assert.ErrorIs(t, err, ErrTokenNotFound) } diff --git a/internal/router/router.go b/internal/router/router.go index fc85757..76448db 100644 --- a/internal/router/router.go +++ b/internal/router/router.go @@ -75,10 +75,13 @@ func SetupRouter(deps *Dependencies) *echo.Echo { // responses are unaffected — they don't load any assets, so any CSP is fine. // frame-ancestors stays 'none' to block clickjacking. e.Use(middleware.SecureWithConfig(middleware.SecureConfig{ - XSSProtection: "1; mode=block", + // XSSProtection deliberately empty (audit L7): the X-XSS-Protection + // header is deprecated and has itself caused XSS in legacy browsers. + XSSProtection: "", ContentTypeNosniff: "nosniff", XFrameOptions: "SAMEORIGIN", - HSTSMaxAge: 31536000, + HSTSMaxAge: 63072000, // 2 years — preload-eligible (audit L5/CODE-L3) + HSTSPreloadEnabled: true, ReferrerPolicy: "strict-origin-when-cross-origin", ContentSecurityPolicy: "default-src 'self'; " + "style-src 'self' https://fonts.googleapis.com; " + @@ -86,6 +89,8 @@ func SetupRouter(deps *Dependencies) *echo.Echo { "img-src 'self' data:; " + "script-src 'self'; " + "connect-src 'self'; " + + "object-src 'none'; " + // audit L8 — disable plugins/embeds + "base-uri 'self'; " + // audit L8 — block hijacking "frame-ancestors 'none'", })) e.Use(middleware.BodyLimitWithConfig(middleware.BodyLimitConfig{ @@ -136,9 +141,20 @@ func SetupRouter(deps *Dependencies) *echo.Echo { // labeled by route pattern, method, and status code. e.Use(prom.HTTPMiddleware()) - // /metrics endpoint exposed for vmagent scrape. No auth — bound to - // the cluster network only; not exposed via Cloudflare. - e.GET("/metrics", prom.Handler()) + // /metrics endpoint for the in-cluster vmagent scrape (audit LIVE-L1). + // vmagent scrapes api pods directly (pod-to-pod), so its requests carry + // no X-Forwarded-For. Any request that DOES carry one reached us through + // Traefik/Cloudflare — i.e. the public internet — and is refused with a + // 404. The api pod port is not exposed outside the cluster, so a request + // cannot reach /metrics without going through Traefik, and Traefik always + // appends X-Forwarded-For — the check cannot be bypassed. + metricsHandler := prom.Handler() + e.GET("/metrics", func(c echo.Context) error { + if c.Request().Header.Get("X-Forwarded-For") != "" { + return echo.NewHTTPError(http.StatusNotFound) + } + return metricsHandler(c) + }) // Serve landing page static files (if static directory is configured) staticDir := cfg.Server.StaticDir @@ -204,6 +220,7 @@ func SetupRouter(deps *Dependencies) *echo.Echo { // Wire Redis cache for residence-ID lookups across the four services that // read it on the request hot path. Cache is best-effort; nil cache is OK. if deps.Cache != nil { + authService.SetCacheService(deps.Cache) // per-account login lockout (audit M5) residenceService.SetCacheService(deps.Cache) taskService.SetCacheService(deps.Cache) contractorService.SetCacheService(deps.Cache) @@ -316,7 +333,7 @@ func SetupRouter(deps *Dependencies) *echo.Echo { protected.Use(custommiddleware.TimezoneMiddleware()) { setupProtectedAuthRoutes(protected, authHandler) - setupResidenceRoutes(protected, residenceHandler) + setupResidenceRoutes(protected, residenceHandler, authMiddleware.RequireVerified()) setupTaskRoutes(protected, taskHandler) setupSuggestionRoutes(protected, suggestionHandler) setupContractorRoutes(protected, contractorHandler) @@ -583,7 +600,7 @@ func setupPublicDataRoutes(api *echo.Group, residenceHandler *handlers.Residence } // setupResidenceRoutes configures residence routes -func setupResidenceRoutes(api *echo.Group, residenceHandler *handlers.ResidenceHandler) { +func setupResidenceRoutes(api *echo.Group, residenceHandler *handlers.ResidenceHandler, requireVerified echo.MiddlewareFunc) { residences := api.Group("/residences") { residences.GET("/", residenceHandler.ListResidences) @@ -598,8 +615,11 @@ func setupResidenceRoutes(api *echo.Group, residenceHandler *handlers.ResidenceH residences.DELETE("/:id/", residenceHandler.DeleteResidence) residences.GET("/:id/share-code/", residenceHandler.GetShareCode) - residences.POST("/:id/generate-share-code/", residenceHandler.GenerateShareCode) - residences.POST("/:id/generate-share-package/", residenceHandler.GenerateSharePackage) + // Audit LIVE-L19: generating a residence share code requires a + // verified email — it blocks bad-faith unverified signups from + // minting share codes. + residences.POST("/:id/generate-share-code/", residenceHandler.GenerateShareCode, requireVerified) + residences.POST("/:id/generate-share-package/", residenceHandler.GenerateSharePackage, requireVerified) residences.POST("/:id/generate-tasks-report/", residenceHandler.GenerateTasksReport) residences.GET("/:id/users/", residenceHandler.GetResidenceUsers) residences.DELETE("/:id/users/:user_id/", residenceHandler.RemoveResidenceUser) diff --git a/internal/services/auth_refresh_test.go b/internal/services/auth_refresh_test.go index 94e7f44..5f9f987 100644 --- a/internal/services/auth_refresh_test.go +++ b/internal/services/auth_refresh_test.go @@ -75,9 +75,9 @@ func TestRefreshToken_FreshToken_ReturnsExisting(t *testing.T) { svc := newTestAuthService(db) - resp, err := svc.RefreshToken(context.Background(), token.Key, user.ID) + resp, err := svc.RefreshToken(context.Background(), token.Plaintext, user.ID) require.NoError(t, err) - assert.Equal(t, token.Key, resp.Token, "fresh token should return the same token") + assert.Equal(t, token.Plaintext, resp.Token, "fresh token should return the same token") assert.Contains(t, resp.Message, "still valid") } @@ -88,23 +88,25 @@ func TestRefreshToken_InRenewalWindow_ReturnsNewToken(t *testing.T) { svc := newTestAuthService(db) - resp, err := svc.RefreshToken(context.Background(), token.Key, user.ID) + resp, err := svc.RefreshToken(context.Background(), token.Plaintext, user.ID) require.NoError(t, err) - assert.NotEqual(t, token.Key, resp.Token, "should return a new token") + assert.NotEqual(t, token.Plaintext, resp.Token, "should return a new token") assert.Contains(t, resp.Message, "refreshed") // Verify old token was deleted var count int64 + // The DB stores the SHA-256 hash, so query by token.Key (the hash). db.Model(&models.AuthToken{}).Where("key = ?", token.Key).Count(&count) assert.Equal(t, int64(0), count, "old token should be deleted") // Verify new token exists in DB - db.Model(&models.AuthToken{}).Where("key = ?", resp.Token).Count(&count) + // resp.Token is the raw token; the DB stores its hash. + db.Model(&models.AuthToken{}).Where("key = ?", models.HashToken(resp.Token)).Count(&count) assert.Equal(t, int64(1), count, "new token should exist in DB") // Verify new token belongs to the same user var newToken models.AuthToken - require.NoError(t, db.Where("key = ?", resp.Token).First(&newToken).Error) + require.NoError(t, db.Where("key = ?", models.HashToken(resp.Token)).First(&newToken).Error) assert.Equal(t, user.ID, newToken.UserID) } @@ -115,7 +117,7 @@ func TestRefreshToken_ExpiredToken_Returns401(t *testing.T) { svc := newTestAuthService(db) - resp, err := svc.RefreshToken(context.Background(), token.Key, user.ID) + resp, err := svc.RefreshToken(context.Background(), token.Plaintext, user.ID) require.Error(t, err) assert.Nil(t, resp) assert.Contains(t, err.Error(), "error.token_expired") @@ -130,9 +132,9 @@ func TestRefreshToken_AtExactBoundary60Days(t *testing.T) { svc := newTestAuthService(db) - resp, err := svc.RefreshToken(context.Background(), token.Key, user.ID) + resp, err := svc.RefreshToken(context.Background(), token.Plaintext, user.ID) require.NoError(t, err) - assert.NotEqual(t, token.Key, resp.Token, "token at 61 days should be refreshed") + assert.NotEqual(t, token.Plaintext, resp.Token, "token at 61 days should be refreshed") } func TestRefreshToken_InvalidToken_Returns401(t *testing.T) { @@ -155,7 +157,7 @@ func TestRefreshToken_WrongUser_Returns401(t *testing.T) { svc := newTestAuthService(db) // Try to refresh with a different user ID - resp, err := svc.RefreshToken(context.Background(), token.Key, user.ID+999) + resp, err := svc.RefreshToken(context.Background(), token.Plaintext, user.ID+999) require.Error(t, err) assert.Nil(t, resp) assert.Contains(t, err.Error(), "error.invalid_token") @@ -168,7 +170,7 @@ func TestRefreshToken_FreshTokenAt59Days_ReturnsExisting(t *testing.T) { svc := newTestAuthService(db) - resp, err := svc.RefreshToken(context.Background(), token.Key, user.ID) + resp, err := svc.RefreshToken(context.Background(), token.Plaintext, user.ID) require.NoError(t, err) - assert.Equal(t, token.Key, resp.Token, "token at 59 days should NOT be refreshed") + assert.Equal(t, token.Plaintext, resp.Token, "token at 59 days should NOT be refreshed") } diff --git a/internal/services/auth_service.go b/internal/services/auth_service.go index 2910076..898717e 100644 --- a/internal/services/auth_service.go +++ b/internal/services/auth_service.go @@ -3,6 +3,7 @@ package services import ( "context" "crypto/rand" + "encoding/binary" "encoding/hex" "errors" "fmt" @@ -36,13 +37,32 @@ var ( ErrGoogleSignInFailed = errors.New("Google Sign In failed") ) +// Per-account login lockout (audit M5, hardened per MEDIUM-3). +const ( + // maxLoginFailureIPs is how many DISTINCT source IPs may fail to log in to + // one account within the window before that account is locked. Counting + // distinct IPs (not raw attempts) means a single attacker who knows a + // victim's email cannot lock the victim out by spamming failures — only a + // genuinely distributed credential-stuffing attack reaches this threshold. + maxLoginFailureIPs = 5 + // loginLockWindow is how long the failed-IP set persists; it is refreshed + // on each failure so an active attack keeps the window open. + loginLockWindow = 15 * time.Minute +) + // AuthService handles authentication business logic type AuthService struct { userRepo *repositories.UserRepository notificationRepo *repositories.NotificationRepository + cache *CacheService cfg *config.Config } +// SetCacheService wires Redis for per-account login-failure tracking (M5). +func (s *AuthService) SetCacheService(cache *CacheService) { + s.cache = cache +} + // NewAuthService creates a new auth service func NewAuthService(userRepo *repositories.UserRepository, cfg *config.Config) *AuthService { return &AuthService{ @@ -56,34 +76,89 @@ func (s *AuthService) SetNotificationRepository(notificationRepo *repositories.N s.notificationRepo = notificationRepo } -// Login authenticates a user and returns a token -func (s *AuthService) Login(ctx context.Context, req *requests.LoginRequest) (*responses.LoginResponse, error) { +// dummyPasswordHash is a valid bcrypt hash used to keep login response time +// constant when the account does not exist (audit LIVE-L11). It is computed +// once at startup; the plaintext it hashes is irrelevant and never used. +var dummyPasswordHash = func() string { + h, err := bcrypt.GenerateFromPassword([]byte("honeydue-login-timing-equalizer"), models.BcryptCost) + if err != nil { + return "" // CompareHashAndPassword against "" always fails — safe + } + return string(h) +}() + +// freshToken mints a new auth token for the user and evicts any prior token's +// Redis cache entry (audit MEDIUM-1). Without the eviction a re-login would +// not actually kill a previously-issued token until the cache TTL lapsed — a +// stolen token would keep working for up to 5 minutes after the victim +// re-authenticates. A cache-eviction failure is logged, not fatal: the token +// row is already gone, so the stale entry simply ages out on its own. +func (s *AuthService) freshToken(ctx context.Context, userID uint) (*models.AuthToken, error) { + token, oldHashes, err := s.userRepo.WithContext(ctx).CreateFreshToken(userID) + if err != nil { + return nil, err + } + if s.cache != nil && len(oldHashes) > 0 { + if cErr := s.cache.InvalidateAuthTokenHashes(ctx, oldHashes...); cErr != nil { + log.Warn().Err(cErr).Uint("user_id", userID). + Msg("failed to evict prior auth-token cache entries on re-login") + } + } + return token, nil +} + +// Login authenticates a user and returns a token. clientIP is the request's +// source IP (echo c.RealIP()), used for the distributed-attack lockout. +func (s *AuthService) Login(ctx context.Context, req *requests.LoginRequest, clientIP string) (*responses.LoginResponse, error) { // Find user by username or email identifier := req.Username if identifier == "" { identifier = req.Email } + lockKey := strings.ToLower(strings.TrimSpace(identifier)) + + // Audit M5 (hardened per MEDIUM-3): per-account lockout keyed on the set + // of distinct source IPs that have failed. Once enough distinct IPs have + // failed for one account within the window, reject — this still catches + // distributed credential stuffing, without letting a single attacker lock + // a victim out by spamming failed logins from one IP. + if s.cache != nil && lockKey != "" { + if n, cErr := s.cache.LoginFailureIPCount(ctx, lockKey); cErr == nil && n >= maxLoginFailureIPs { + return nil, apperrors.TooManyRequests("error.too_many_login_attempts") + } + } user, err := s.userRepo.WithContext(ctx).FindByUsernameOrEmail(identifier) - if err != nil { - if errors.Is(err, repositories.ErrUserNotFound) { - return nil, apperrors.Unauthorized("error.invalid_credentials") - } + if err != nil && !errors.Is(err, repositories.ErrUserNotFound) { return nil, apperrors.Internal(err) } - // Check if user is active - if !user.IsActive { - return nil, apperrors.Unauthorized("error.account_inactive") + // Constant-time login (audit LIVE-L11): always run a bcrypt comparison, + // even when the account does not exist or is inactive, so response + // timing never reveals which emails are real accounts. Compare against + // the user's hash when available, otherwise a fixed dummy hash. + passwordHash := dummyPasswordHash + if user != nil { + passwordHash = user.Password } + passwordOK := bcrypt.CompareHashAndPassword([]byte(passwordHash), []byte(req.Password)) == nil - // Verify password - if !user.CheckPassword(req.Password) { + // One generic error for not-found, inactive, and wrong-password + // (audit L1) — none of them disclose which condition failed. + if user == nil || !user.IsActive || !passwordOK { + if s.cache != nil && lockKey != "" { + _, _ = s.cache.RegisterLoginFailure(ctx, lockKey, clientIP, loginLockWindow) + } return nil, apperrors.Unauthorized("error.invalid_credentials") } + // Successful authentication — clear the failure counter (audit M5). + if s.cache != nil && lockKey != "" { + _ = s.cache.ClearLoginFailures(ctx, lockKey) + } + // Get or create auth token - token, err := s.userRepo.WithContext(ctx).GetOrCreateToken(user.ID) + token, err := s.freshToken(ctx, user.ID) if err != nil { return nil, apperrors.Internal(err) } @@ -95,7 +170,7 @@ func (s *AuthService) Login(ctx context.Context, req *requests.LoginRequest) (*r } return &responses.LoginResponse{ - Token: token.Key, + Token: token.Plaintext, User: responses.NewUserResponse(user), }, nil } @@ -176,13 +251,13 @@ func (s *AuthService) Register(ctx context.Context, req *requests.RegisterReques } // Create auth token (outside transaction since token generation is idempotent) - token, err := s.userRepo.WithContext(ctx).GetOrCreateToken(user.ID) + token, err := s.freshToken(ctx, user.ID) if err != nil { return nil, "", apperrors.Internal(err) } return &responses.RegisterResponse{ - Token: token.Key, + Token: token.Plaintext, User: responses.NewUserResponse(user), Message: "Registration successful. Please check your email to verify your account.", }, code, nil @@ -243,7 +318,7 @@ func (s *AuthService) RefreshToken(ctx context.Context, tokenKey string, userID } return &responses.RefreshTokenResponse{ - Token: newToken.Key, + Token: newToken.Plaintext, Message: "Token refreshed successfully.", }, nil } @@ -390,26 +465,26 @@ func (s *AuthService) VerifyEmail(ctx context.Context, userID uint, code string) return nil } - // Find and validate confirmation code - confirmCode, err := s.userRepo.WithContext(ctx).FindConfirmationCode(userID, code) - if err != nil { - if errors.Is(err, repositories.ErrCodeNotFound) { + // Audit M4: validate the code, consume it, and flip the verified flag in + // one transaction so the three writes commit or roll back together. + txErr := s.userRepo.WithContext(ctx).Transaction(func(txRepo *repositories.UserRepository) error { + confirmCode, err := txRepo.FindConfirmationCode(userID, code) + if err != nil { + return err + } + if err := txRepo.MarkConfirmationCodeUsed(confirmCode.ID); err != nil { + return err + } + return txRepo.SetProfileVerified(userID, true) + }) + if txErr != nil { + if errors.Is(txErr, repositories.ErrCodeNotFound) { return apperrors.BadRequest("error.invalid_verification_code") } - if errors.Is(err, repositories.ErrCodeExpired) { + if errors.Is(txErr, repositories.ErrCodeExpired) { return apperrors.BadRequest("error.verification_code_expired") } - return apperrors.Internal(err) - } - - // Mark code as used - if err := s.userRepo.WithContext(ctx).MarkConfirmationCodeUsed(confirmCode.ID); err != nil { - return apperrors.Internal(err) - } - - // Set profile as verified - if err := s.userRepo.WithContext(ctx).SetProfileVerified(userID, true); err != nil { - return apperrors.Internal(err) + return apperrors.Internal(txErr) } return nil @@ -476,7 +551,7 @@ func (s *AuthService) ForgotPassword(ctx context.Context, email string) (string, expiresAt := time.Now().UTC().Add(s.cfg.Security.PasswordResetExpiry) // Hash the code before storing - codeHash, err := bcrypt.GenerateFromPassword([]byte(code), bcrypt.DefaultCost) + codeHash, err := bcrypt.GenerateFromPassword([]byte(code), models.BcryptCost) if err != nil { return "", nil, apperrors.Internal(err) } @@ -596,7 +671,7 @@ func (s *AuthService) AppleSignIn(ctx context.Context, appleAuth *AppleAuthServi } // Get or create token - token, err := s.userRepo.WithContext(ctx).GetOrCreateToken(user.ID) + token, err := s.freshToken(ctx, user.ID) if err != nil { return nil, apperrors.Internal(err) } @@ -605,7 +680,7 @@ func (s *AuthService) AppleSignIn(ctx context.Context, appleAuth *AppleAuthServi _ = s.userRepo.WithContext(ctx).UpdateLastLogin(user.ID) return &responses.AppleSignInResponse{ - Token: token.Key, + Token: token.Plaintext, User: responses.NewUserResponse(user), IsNewUser: false, }, nil @@ -638,7 +713,7 @@ func (s *AuthService) AppleSignIn(ctx context.Context, appleAuth *AppleAuthServi _ = s.userRepo.WithContext(ctx).SetProfileVerified(existingUser.ID, true) // Get or create token - token, err := s.userRepo.WithContext(ctx).GetOrCreateToken(existingUser.ID) + token, err := s.freshToken(ctx, existingUser.ID) if err != nil { return nil, apperrors.Internal(err) } @@ -653,7 +728,7 @@ func (s *AuthService) AppleSignIn(ctx context.Context, appleAuth *AppleAuthServi } return &responses.AppleSignInResponse{ - Token: token.Key, + Token: token.Plaintext, User: responses.NewUserResponse(existingUser), IsNewUser: false, }, nil @@ -704,7 +779,7 @@ func (s *AuthService) AppleSignIn(ctx context.Context, appleAuth *AppleAuthServi } // Create token - token, err := s.userRepo.WithContext(ctx).GetOrCreateToken(user.ID) + token, err := s.freshToken(ctx, user.ID) if err != nil { return nil, apperrors.Internal(err) } @@ -716,7 +791,7 @@ func (s *AuthService) AppleSignIn(ctx context.Context, appleAuth *AppleAuthServi } return &responses.AppleSignInResponse{ - Token: token.Key, + Token: token.Plaintext, User: responses.NewUserResponse(user), IsNewUser: true, }, nil @@ -749,7 +824,7 @@ func (s *AuthService) GoogleSignIn(ctx context.Context, googleAuth *GoogleAuthSe } // Get or create token - token, err := s.userRepo.WithContext(ctx).GetOrCreateToken(user.ID) + token, err := s.freshToken(ctx, user.ID) if err != nil { return nil, apperrors.Internal(err) } @@ -758,7 +833,7 @@ func (s *AuthService) GoogleSignIn(ctx context.Context, googleAuth *GoogleAuthSe _ = s.userRepo.WithContext(ctx).UpdateLastLogin(user.ID) return &responses.GoogleSignInResponse{ - Token: token.Key, + Token: token.Plaintext, User: responses.NewUserResponse(user), IsNewUser: false, }, nil @@ -794,7 +869,7 @@ func (s *AuthService) GoogleSignIn(ctx context.Context, googleAuth *GoogleAuthSe } // Get or create token - token, err := s.userRepo.WithContext(ctx).GetOrCreateToken(existingUser.ID) + token, err := s.freshToken(ctx, existingUser.ID) if err != nil { return nil, apperrors.Internal(err) } @@ -809,7 +884,7 @@ func (s *AuthService) GoogleSignIn(ctx context.Context, googleAuth *GoogleAuthSe } return &responses.GoogleSignInResponse{ - Token: token.Key, + Token: token.Plaintext, User: responses.NewUserResponse(existingUser), IsNewUser: false, }, nil @@ -861,7 +936,7 @@ func (s *AuthService) GoogleSignIn(ctx context.Context, googleAuth *GoogleAuthSe } // Create token - token, err := s.userRepo.WithContext(ctx).GetOrCreateToken(user.ID) + token, err := s.freshToken(ctx, user.ID) if err != nil { return nil, apperrors.Internal(err) } @@ -873,7 +948,7 @@ func (s *AuthService) GoogleSignIn(ctx context.Context, googleAuth *GoogleAuthSe } return &responses.GoogleSignInResponse{ - Token: token.Key, + Token: token.Plaintext, User: responses.NewUserResponse(user), IsNewUser: true, }, nil @@ -882,14 +957,19 @@ func (s *AuthService) GoogleSignIn(ctx context.Context, googleAuth *GoogleAuthSe // Helper functions func generateSixDigitCode() string { - b := make([]byte, 4) - rand.Read(b) - num := int(b[0])<<24 | int(b[1])<<16 | int(b[2])<<8 | int(b[3]) - if num < 0 { - num = -num + // Uniform 000000–999999 via rejection sampling on crypto/rand, + // removing the modulo bias of `n % 1000000` (audit H4). + for { + var b [4]byte + if _, err := rand.Read(b[:]); err != nil { + continue + } + // 4294000000 is the largest multiple of 1e6 <= MaxUint32. + n := binary.BigEndian.Uint32(b[:]) + if n < 4294000000 { + return fmt.Sprintf("%06d", n%1000000) + } } - code := num % 1000000 - return fmt.Sprintf("%06d", code) } func generateResetToken() string { diff --git a/internal/services/auth_service_test.go b/internal/services/auth_service_test.go index 1ab3638..8b4ab93 100644 --- a/internal/services/auth_service_test.go +++ b/internal/services/auth_service_test.go @@ -54,7 +54,7 @@ func TestAuthService_Login(t *testing.T) { Password: "Password123", } - resp, err := service.Login(context.Background(), req) + resp, err := service.Login(context.Background(), req, "") require.NoError(t, err) assert.NotEmpty(t, resp.Token) assert.Equal(t, "testuser", resp.User.Username) @@ -75,7 +75,7 @@ func TestAuthService_Login_ByEmail(t *testing.T) { Password: "Password123", } - resp, err := service.Login(context.Background(), req) + resp, err := service.Login(context.Background(), req, "") require.NoError(t, err) assert.NotEmpty(t, resp.Token) } @@ -95,7 +95,7 @@ func TestAuthService_Login_InvalidCredentials(t *testing.T) { Password: "WrongPassword1", } - _, err := service.Login(context.Background(), req) + _, err := service.Login(context.Background(), req, "") testutil.AssertAppError(t, err, http.StatusUnauthorized, "error.invalid_credentials") } @@ -112,7 +112,7 @@ func TestAuthService_Login_UserNotFound(t *testing.T) { Password: "Password123", } - _, err := service.Login(context.Background(), req) + _, err := service.Login(context.Background(), req, "") testutil.AssertAppError(t, err, http.StatusUnauthorized, "error.invalid_credentials") } @@ -134,8 +134,10 @@ func TestAuthService_Login_InactiveUser(t *testing.T) { Password: "Password123", } - _, err := service.Login(context.Background(), req) - testutil.AssertAppError(t, err, http.StatusUnauthorized, "error.account_inactive") + _, err := service.Login(context.Background(), req, "") + // Audit L1: inactive accounts return the same generic error as bad + // credentials so login does not disclose which accounts exist. + testutil.AssertAppError(t, err, http.StatusUnauthorized, "error.invalid_credentials") } // === Register === @@ -443,7 +445,7 @@ func TestAuthService_ResetPassword(t *testing.T) { Username: "testuser", Password: "NewPassword123", } - loginResp, err := service.Login(context.Background(), loginReq) + loginResp, err := service.Login(context.Background(), loginReq, "") require.NoError(t, err) assert.NotEmpty(t, loginResp.Token) } @@ -472,7 +474,7 @@ func TestAuthService_Logout(t *testing.T) { Username: "testuser", Password: "Password123", } - loginResp, err := service.Login(context.Background(), loginReq) + loginResp, err := service.Login(context.Background(), loginReq, "") require.NoError(t, err) // Logout @@ -659,7 +661,7 @@ func TestAuthService_Login_EmptyPassword(t *testing.T) { Password: "", } - _, err := service.Login(context.Background(), req) + _, err := service.Login(context.Background(), req, "") testutil.AssertAppError(t, err, http.StatusUnauthorized, "error.invalid_credentials") } diff --git a/internal/services/cache_service.go b/internal/services/cache_service.go index cbb07cb..b977e12 100644 --- a/internal/services/cache_service.go +++ b/internal/services/cache_service.go @@ -12,6 +12,7 @@ import ( "github.com/rs/zerolog/log" "github.com/treytartt/honeydue-api/internal/config" + "github.com/treytartt/honeydue-api/internal/models" ) // CacheService provides Redis caching functionality @@ -139,22 +140,25 @@ const ( TokenCacheTTL = 5 * time.Minute ) +// authTokenCacheKey returns the Redis key for an auth token. The raw token +// is hashed (audit C1) so the plaintext token never appears in a Redis key. +func authTokenCacheKey(token string) string { + return AuthTokenPrefix + models.HashToken(token) +} + // CacheAuthToken caches a user ID for a token func (c *CacheService) CacheAuthToken(ctx context.Context, token string, userID uint) error { - key := AuthTokenPrefix + token - return c.SetString(ctx, key, fmt.Sprintf("%d", userID), TokenCacheTTL) + return c.SetString(ctx, authTokenCacheKey(token), fmt.Sprintf("%d", userID), TokenCacheTTL) } // CacheAuthTokenWithCreated caches a user ID and token creation time for a token func (c *CacheService) CacheAuthTokenWithCreated(ctx context.Context, token string, userID uint, createdUnix int64) error { - key := AuthTokenPrefix + token - return c.SetString(ctx, key, fmt.Sprintf("%d|%d", userID, createdUnix), TokenCacheTTL) + return c.SetString(ctx, authTokenCacheKey(token), fmt.Sprintf("%d|%d", userID, createdUnix), TokenCacheTTL) } // GetCachedAuthToken gets a cached user ID for a token func (c *CacheService) GetCachedAuthToken(ctx context.Context, token string) (uint, error) { - key := AuthTokenPrefix + token - val, err := c.GetString(ctx, key) + val, err := c.GetString(ctx, authTokenCacheKey(token)) if err != nil { return 0, err } @@ -167,8 +171,7 @@ func (c *CacheService) GetCachedAuthToken(ctx context.Context, token string) (ui // GetCachedAuthTokenWithCreated gets a cached user ID and token creation time. // Returns userID, createdUnix, error. createdUnix is 0 if not stored (legacy format). func (c *CacheService) GetCachedAuthTokenWithCreated(ctx context.Context, token string) (uint, int64, error) { - key := AuthTokenPrefix + token - val, err := c.GetString(ctx, key) + val, err := c.GetString(ctx, authTokenCacheKey(token)) if err != nil { return 0, 0, err } @@ -184,8 +187,62 @@ func (c *CacheService) GetCachedAuthTokenWithCreated(ctx context.Context, token // InvalidateAuthToken removes a cached token func (c *CacheService) InvalidateAuthToken(ctx context.Context, token string) error { - key := AuthTokenPrefix + token - return c.Delete(ctx, key) + return c.Delete(ctx, authTokenCacheKey(token)) +} + +// InvalidateAuthTokenHashes removes cached entries for already-hashed token +// keys. Unlike InvalidateAuthToken (which hashes a plaintext), this takes the +// stored hash directly — used to evict a user's prior token on re-login +// (audit MEDIUM-1), where the server no longer has the plaintext. +func (c *CacheService) InvalidateAuthTokenHashes(ctx context.Context, hashes ...string) error { + keys := make([]string, 0, len(hashes)) + for _, h := range hashes { + if h != "" { + keys = append(keys, AuthTokenPrefix+h) + } + } + if len(keys) == 0 { + return nil + } + return c.Delete(ctx, keys...) +} + +// --- Per-account login-failure tracking (audit M5) --- + +const loginFailPrefix = "login_fail:" + +// RegisterLoginFailure records a failed login for an account from a given +// source IP, and returns the number of DISTINCT source IPs that have failed +// for this account within the window. Tracking distinct IPs as a set rather +// than a raw counter (audit MEDIUM-3) means one attacker, from one IP, cannot +// run the count up and lock a victim out by knowing only their email — a +// single IP is bounded by the per-IP edge/app rate limiters instead. A +// genuinely distributed credential-stuffing attack still trips the lockout. +func (c *CacheService) RegisterLoginFailure(ctx context.Context, identifier, ip string, window time.Duration) (int64, error) { + key := loginFailPrefix + identifier + member := ip + if member == "" { + member = "unknown" + } + if err := c.client.SAdd(ctx, key, member).Err(); err != nil { + return 0, err + } + // Refresh the TTL on each failure: an active attack keeps the window + // open, while a quiet account ages out `window` after its last failure. + _ = c.client.Expire(ctx, key, window).Err() + return c.client.SCard(ctx, key).Result() +} + +// LoginFailureIPCount returns how many distinct source IPs have failed to log +// in to this account within the window (audit MEDIUM-3). SCard on a missing +// key returns 0. +func (c *CacheService) LoginFailureIPCount(ctx context.Context, identifier string) (int64, error) { + return c.client.SCard(ctx, loginFailPrefix+identifier).Result() +} + +// ClearLoginFailures resets the failed-login IP set after a successful login. +func (c *CacheService) ClearLoginFailures(ctx context.Context, identifier string) error { + return c.client.Del(ctx, loginFailPrefix+identifier).Err() } // Static data cache helpers diff --git a/internal/services/contractor_service.go b/internal/services/contractor_service.go index 4ead2eb..cc6485c 100644 --- a/internal/services/contractor_service.go +++ b/internal/services/contractor_service.go @@ -296,9 +296,14 @@ func (s *ContractorService) ToggleFavorite(ctx context.Context, contractorID, us return nil, apperrors.Internal(err) } - // Re-fetch the contractor to get the updated state with all relations + // Re-fetch to get the updated state with all relations. Audit M12: if the + // contractor was deleted concurrently between the toggle and this read, + // surface a clean 404 instead of a 500. contractor, err = s.contractorRepo.WithContext(ctx).FindByID(contractorID) if err != nil { + if errors.Is(err, gorm.ErrRecordNotFound) { + return nil, apperrors.NotFound("error.contractor_not_found") + } return nil, apperrors.Internal(err) } diff --git a/internal/services/file_ownership_service.go b/internal/services/file_ownership_service.go index fc1d084..b61faa0 100644 --- a/internal/services/file_ownership_service.go +++ b/internal/services/file_ownership_service.go @@ -5,9 +5,9 @@ import ( "gorm.io/gorm" ) -// FileOwnershipService checks whether a user owns a file referenced by URL. -// It queries task completion images, document files, and document images -// to determine ownership through residence access. +// FileOwnershipService checks whether a user has access to a file referenced +// by URL. It queries task completion images, document files, and document +// images, resolving access through residence ownership or membership. type FileOwnershipService struct { db *gorm.DB } @@ -17,16 +17,31 @@ func NewFileOwnershipService(db *gorm.DB) *FileOwnershipService { return &FileOwnershipService{db: db} } -// IsFileOwnedByUser checks if the given file URL belongs to a record -// that the user has access to (via residence membership). +// accessibleResidenceIDs returns a subquery of residence IDs the user can +// access: residences they own (residence_residence.owner_id) UNION residences +// they are a member of (residence_residence_users). +// +// Audit C7: the previous queries joined residence_residence_users only, so a +// residence owner who was not also a member of the join table could not pass +// the ownership check for files in their own property. +func (s *FileOwnershipService) accessibleResidenceIDs(userID uint) *gorm.DB { + return s.db.Raw(` + SELECT id FROM residence_residence WHERE owner_id = ? + UNION + SELECT residence_id FROM residence_residence_users WHERE user_id = ? + `, userID, userID) +} + +// IsFileOwnedByUser checks if the given file URL belongs to a record in a +// residence the user owns or is a member of. func (s *FileOwnershipService) IsFileOwnedByUser(fileURL string, userID uint) (bool, error) { - // Check task completion images: image_url -> completion -> task -> residence -> user access + // Task completion images: image_url -> completion -> task -> residence. var completionImageCount int64 err := s.db.Model(&models.TaskCompletionImage{}). Joins("JOIN task_taskcompletion ON task_taskcompletion.id = task_taskcompletionimage.completion_id"). Joins("JOIN task_task ON task_task.id = task_taskcompletion.task_id"). - Joins("JOIN residence_residence_users ON residence_residence_users.residence_id = task_task.residence_id"). - Where("task_taskcompletionimage.image_url = ? AND residence_residence_users.user_id = ?", fileURL, userID). + Where("task_taskcompletionimage.image_url = ?", fileURL). + Where("task_task.residence_id IN (?)", s.accessibleResidenceIDs(userID)). Count(&completionImageCount).Error if err != nil { return false, err @@ -35,11 +50,11 @@ func (s *FileOwnershipService) IsFileOwnedByUser(fileURL string, userID uint) (b return true, nil } - // Check document files: file_url -> document -> residence -> user access + // Document files: file_url -> document -> residence. var documentCount int64 err = s.db.Model(&models.Document{}). - Joins("JOIN residence_residence_users ON residence_residence_users.residence_id = task_document.residence_id"). - Where("task_document.file_url = ? AND residence_residence_users.user_id = ?", fileURL, userID). + Where("task_document.file_url = ?", fileURL). + Where("task_document.residence_id IN (?)", s.accessibleResidenceIDs(userID)). Count(&documentCount).Error if err != nil { return false, err @@ -48,12 +63,12 @@ func (s *FileOwnershipService) IsFileOwnedByUser(fileURL string, userID uint) (b return true, nil } - // Check document images: image_url -> document_image -> document -> residence -> user access + // Document images: image_url -> document_image -> document -> residence. var documentImageCount int64 err = s.db.Model(&models.DocumentImage{}). Joins("JOIN task_document ON task_document.id = task_documentimage.document_id"). - Joins("JOIN residence_residence_users ON residence_residence_users.residence_id = task_document.residence_id"). - Where("task_documentimage.image_url = ? AND residence_residence_users.user_id = ?", fileURL, userID). + Where("task_documentimage.image_url = ?", fileURL). + Where("task_document.residence_id IN (?)", s.accessibleResidenceIDs(userID)). Count(&documentImageCount).Error if err != nil { return false, err diff --git a/internal/services/google_auth.go b/internal/services/google_auth.go index 04e74da..cbd1ecc 100644 --- a/internal/services/google_auth.go +++ b/internal/services/google_auth.go @@ -2,132 +2,306 @@ package services import ( "context" + "crypto/rsa" + "encoding/base64" "encoding/json" "errors" "fmt" + "math/big" "net/http" + "strings" "time" + "github.com/golang-jwt/jwt/v5" + "github.com/treytartt/honeydue-api/internal/config" ) const ( - googleTokenInfoURL = "https://oauth2.googleapis.com/tokeninfo" + // googleKeysURL is Google's JWKS endpoint for ID-token signature verification. + googleKeysURL = "https://www.googleapis.com/oauth2/v3/certs" + googleKeysCacheTTL = 24 * time.Hour + googleKeysCacheKey = "google:public_keys" ) +// googleIssuers is the set of valid `iss` claim values for a Google ID token. +var googleIssuers = map[string]bool{ + "accounts.google.com": true, + "https://accounts.google.com": true, +} + var ( ErrInvalidGoogleToken = errors.New("invalid Google ID token") ErrGoogleTokenExpired = errors.New("Google ID token has expired") ErrInvalidGoogleAudience = errors.New("invalid Google token audience") + ErrInvalidGoogleIssuer = errors.New("invalid Google token issuer") + ErrGoogleKeyNotFound = errors.New("Google public key not found") ) -// GoogleTokenInfo represents the response from Google's token info endpoint -type GoogleTokenInfo struct { - Sub string `json:"sub"` // Unique Google user ID - Email string `json:"email"` // User's email - EmailVerified string `json:"email_verified"` // "true" or "false" - Name string `json:"name"` // Full name - GivenName string `json:"given_name"` // First name - FamilyName string `json:"family_name"` // Last name - Picture string `json:"picture"` // Profile picture URL - Aud string `json:"aud"` // Audience (client ID) - Azp string `json:"azp"` // Authorized party - Exp string `json:"exp"` // Expiration time - Iss string `json:"iss"` // Issuer +// GoogleJWKS represents Google's JSON Web Key Set. +type GoogleJWKS struct { + Keys []GoogleJWK `json:"keys"` } -// IsEmailVerified returns whether the email is verified +// GoogleJWK represents a single JSON Web Key from Google. +type GoogleJWK struct { + Kty string `json:"kty"` // Key type (RSA) + Kid string `json:"kid"` // Key ID + Use string `json:"use"` // Key use (sig) + Alg string `json:"alg"` // Algorithm (RS256) + N string `json:"n"` // RSA modulus + E string `json:"e"` // RSA exponent +} + +// GoogleTokenClaims represents the claims in a Google ID token JWT. +type GoogleTokenClaims struct { + jwt.RegisteredClaims + Email string `json:"email,omitempty"` + EmailVerified bool `json:"email_verified,omitempty"` + Name string `json:"name,omitempty"` + GivenName string `json:"given_name,omitempty"` + FamilyName string `json:"family_name,omitempty"` + Picture string `json:"picture,omitempty"` + Azp string `json:"azp,omitempty"` // Authorized party +} + +// GoogleTokenInfo is the verified, caller-facing view of a Google ID token. +type GoogleTokenInfo struct { + Sub string // Unique Google user ID + Email string + EmailVerified string // "true" or "false" — string for caller compatibility + Name string + GivenName string + FamilyName string + Picture string + Aud string + Azp string + Iss string +} + +// IsEmailVerified returns whether the email is verified. func (t *GoogleTokenInfo) IsEmailVerified() bool { return t.EmailVerified == "true" } -// GoogleAuthService handles Google Sign In token verification +// GoogleAuthService handles Google Sign In token verification. type GoogleAuthService struct { cache *CacheService config *config.Config client *http.Client } -// NewGoogleAuthService creates a new Google auth service +// NewGoogleAuthService creates a new Google auth service. func NewGoogleAuthService(cache *CacheService, cfg *config.Config) *GoogleAuthService { return &GoogleAuthService{ cache: cache, config: cfg, - client: &http.Client{ - Timeout: 10 * time.Second, - }, + client: &http.Client{Timeout: 10 * time.Second}, } } -// VerifyIDToken verifies a Google ID token and returns the token info +// VerifyIDToken verifies a Google ID token locally (audit C2/C3): it checks +// the RS256 signature against Google's published JWKS and the iss, aud, and +// exp claims. It never sends the token to a third-party endpoint, so it no +// longer depends on the deprecated tokeninfo service and never leaks the +// token in a request URL. func (s *GoogleAuthService) VerifyIDToken(ctx context.Context, idToken string) (*GoogleTokenInfo, error) { - // Call Google's tokeninfo endpoint to verify the token - url := fmt.Sprintf("%s?id_token=%s", googleTokenInfoURL, idToken) - - req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) - if err != nil { - return nil, fmt.Errorf("failed to create request: %w", err) - } - - resp, err := s.client.Do(req) - if err != nil { - return nil, fmt.Errorf("failed to verify token: %w", err) - } - defer resp.Body.Close() - - if resp.StatusCode != http.StatusOK { + // Parse the token header to get the key ID. + parts := strings.Split(idToken, ".") + if len(parts) != 3 { return nil, ErrInvalidGoogleToken } - var tokenInfo GoogleTokenInfo - if err := json.NewDecoder(resp.Body).Decode(&tokenInfo); err != nil { - return nil, fmt.Errorf("failed to decode token info: %w", err) + headerBytes, err := base64.RawURLEncoding.DecodeString(parts[0]) + if err != nil { + return nil, fmt.Errorf("failed to decode token header: %w", err) + } + var header struct { + Kid string `json:"kid"` + Alg string `json:"alg"` + } + if err := json.Unmarshal(headerBytes, &header); err != nil { + return nil, fmt.Errorf("failed to parse token header: %w", err) } - // Verify the audience matches our client ID(s) - if !s.verifyAudience(tokenInfo.Aud, tokenInfo.Azp) { + publicKey, err := s.getPublicKey(ctx, header.Kid) + if err != nil { + return nil, err + } + + // Parse and verify the signature. jwt v5 validates exp/iat/nbf automatically. + token, err := jwt.ParseWithClaims(idToken, &GoogleTokenClaims{}, func(token *jwt.Token) (interface{}, error) { + if _, ok := token.Method.(*jwt.SigningMethodRSA); !ok { + return nil, fmt.Errorf("unexpected signing method: %v", token.Header["alg"]) + } + return publicKey, nil + }) + if err != nil { + if errors.Is(err, jwt.ErrTokenExpired) { + return nil, ErrGoogleTokenExpired + } + return nil, fmt.Errorf("failed to parse token: %w", err) + } + + claims, ok := token.Claims.(*GoogleTokenClaims) + if !ok || !token.Valid { + return nil, ErrInvalidGoogleToken + } + + // Verify the issuer (audit C3). + if !googleIssuers[claims.Issuer] { + return nil, ErrInvalidGoogleIssuer + } + + // Verify the audience matches one of our configured client IDs. + if !s.verifyAudience(claims.Audience, claims.Azp) { return nil, ErrInvalidGoogleAudience } - // Verify the token is not expired (tokeninfo endpoint already checks this, - // but we double-check for security) - if tokenInfo.Sub == "" { + if claims.Subject == "" { return nil, ErrInvalidGoogleToken } - return &tokenInfo, nil + emailVerified := "false" + if claims.EmailVerified { + emailVerified = "true" + } + aud := "" + if len(claims.Audience) > 0 { + aud = claims.Audience[0] + } + return &GoogleTokenInfo{ + Sub: claims.Subject, + Email: claims.Email, + EmailVerified: emailVerified, + Name: claims.Name, + GivenName: claims.GivenName, + FamilyName: claims.FamilyName, + Picture: claims.Picture, + Aud: aud, + Azp: claims.Azp, + Iss: claims.Issuer, + }, nil } -// verifyAudience checks if the token audience matches our client ID(s). -// In production (non-debug), an empty clientID causes verification to fail -// rather than silently bypassing the check. -func (s *GoogleAuthService) verifyAudience(aud, azp string) bool { +// verifyAudience checks the token audience against our configured client IDs. +// In production (non-debug) an empty client ID fails verification rather than +// silently bypassing the check. +func (s *GoogleAuthService) verifyAudience(audience jwt.ClaimStrings, azp string) bool { clientID := s.config.GoogleAuth.ClientID if clientID == "" { - if s.config.Server.Debug { - // In debug mode only, skip audience verification for local development + // In debug mode only, skip audience verification for local development. + return s.config.Server.Debug + } + + candidates := []string{clientID} + if id := s.config.GoogleAuth.AndroidClientID; id != "" { + candidates = append(candidates, id) + } + if id := s.config.GoogleAuth.IOSClientID; id != "" { + candidates = append(candidates, id) + } + + for _, want := range candidates { + if azp == want { return true } - // In production, missing client ID means we cannot verify the audience - return false + for _, aud := range audience { + if aud == want { + return true + } + } } - - // Check both aud and azp (Android vs iOS may use different values) - if aud == clientID || azp == clientID { - return true - } - - // Also check Android client ID if configured - androidClientID := s.config.GoogleAuth.AndroidClientID - if androidClientID != "" && (aud == androidClientID || azp == androidClientID) { - return true - } - - // Also check iOS client ID if configured - iosClientID := s.config.GoogleAuth.IOSClientID - if iosClientID != "" && (aud == iosClientID || azp == iosClientID) { - return true - } - return false } + +// getPublicKey returns the RSA public key for the given key ID, using a +// Redis-cached copy of Google's JWKS and re-fetching once on a cache miss +// (Google rotates signing keys roughly daily). +func (s *GoogleAuthService) getPublicKey(ctx context.Context, kid string) (*rsa.PublicKey, error) { + keys, err := s.getCachedKeys(ctx) + if err != nil || keys == nil { + keys, err = s.fetchGooglePublicKeys(ctx) + if err != nil { + return nil, err + } + } + if pubKey, ok := keys[kid]; ok { + return pubKey, nil + } + + // Cache miss for this kid — keys may have rotated; fetch fresh. + keys, err = s.fetchGooglePublicKeys(ctx) + if err != nil { + return nil, err + } + if pubKey, ok := keys[kid]; ok { + return pubKey, nil + } + return nil, ErrGoogleKeyNotFound +} + +// getCachedKeys retrieves cached Google public keys from Redis. +func (s *GoogleAuthService) getCachedKeys(ctx context.Context) (map[string]*rsa.PublicKey, error) { + if s.cache == nil { + return nil, nil + } + data, err := s.cache.GetString(ctx, googleKeysCacheKey) + if err != nil || data == "" { + return nil, nil + } + var jwks GoogleJWKS + if err := json.Unmarshal([]byte(data), &jwks); err != nil { + return nil, nil + } + return s.parseJWKS(&jwks), nil +} + +// fetchGooglePublicKeys fetches Google's JWKS and caches it. +func (s *GoogleAuthService) fetchGooglePublicKeys(ctx context.Context) (map[string]*rsa.PublicKey, error) { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, googleKeysURL, nil) + if err != nil { + return nil, fmt.Errorf("failed to create request: %w", err) + } + resp, err := s.client.Do(req) + if err != nil { + return nil, fmt.Errorf("failed to fetch Google keys: %w", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("Google keys endpoint returned status %d", resp.StatusCode) + } + var jwks GoogleJWKS + if err := json.NewDecoder(resp.Body).Decode(&jwks); err != nil { + return nil, fmt.Errorf("failed to decode Google keys: %w", err) + } + if s.cache != nil { + keysJSON, _ := json.Marshal(jwks) + _ = s.cache.SetString(ctx, googleKeysCacheKey, string(keysJSON), googleKeysCacheTTL) + } + return s.parseJWKS(&jwks), nil +} + +// parseJWKS converts Google's JWKS into a map of RSA public keys by key ID. +func (s *GoogleAuthService) parseJWKS(jwks *GoogleJWKS) map[string]*rsa.PublicKey { + keys := make(map[string]*rsa.PublicKey) + for _, key := range jwks.Keys { + if key.Kty != "RSA" { + continue + } + nBytes, err := base64.RawURLEncoding.DecodeString(key.N) + if err != nil { + continue + } + eBytes, err := base64.RawURLEncoding.DecodeString(key.E) + if err != nil { + continue + } + e := 0 + for _, b := range eBytes { + e = e<<8 + int(b) + } + keys[key.Kid] = &rsa.PublicKey{N: new(big.Int).SetBytes(nBytes), E: e} + } + return keys +} diff --git a/internal/services/iap_validation.go b/internal/services/iap_validation.go index 6b43930..d692786 100644 --- a/internal/services/iap_validation.go +++ b/internal/services/iap_validation.go @@ -68,13 +68,14 @@ type AppleTransactionInfo struct { // AppleValidationResult contains the result of Apple receipt validation type AppleValidationResult struct { - Valid bool - TransactionID string - ProductID string - ExpiresAt time.Time - IsTrialPeriod bool - AutoRenewEnabled bool - Environment string + Valid bool + TransactionID string + OriginalTransactionID string // stable across renewals — the replay key + ProductID string + ExpiresAt time.Time + IsTrialPeriod bool + AutoRenewEnabled bool + Environment string } // GoogleValidationResult contains the result of Google token validation @@ -95,6 +96,21 @@ func NewAppleIAPClient(cfg config.AppleIAPConfig) (*AppleIAPClient, error) { return nil, ErrIAPNotConfigured } + // Audit H5 (relaxed per MEDIUM-2): refuse to load the IAP signing key from + // a world-accessible file — a leaked .p8 lets an attacker forge App Store + // Server API requests. The original "0600 or stricter" check is + // incompatible with a Kubernetes Secret volume: the kubelet widens secret + // files to 0440 once fsGroup is set, so 0600 is unattainable for a + // non-root container. Group access is scoped to the pod's fsGroup; the + // real exposure is the "other" bits, so reject only those. + info, err := os.Stat(cfg.KeyPath) + if err != nil { + return nil, fmt.Errorf("failed to stat Apple IAP key: %w", err) + } + if perm := info.Mode().Perm(); perm&0o007 != 0 { + return nil, fmt.Errorf("Apple IAP key %s is world-accessible (permissions %#o); remove other-rwx bits", cfg.KeyPath, perm) + } + // Read the private key keyData, err := os.ReadFile(cfg.KeyPath) if err != nil { @@ -215,11 +231,12 @@ func (c *AppleIAPClient) ValidateTransaction(ctx context.Context, transactionID expiresAt := time.Unix(transactionInfo.ExpiresDate/1000, 0) return &AppleValidationResult{ - Valid: true, - TransactionID: transactionInfo.TransactionID, - ProductID: transactionInfo.ProductID, - ExpiresAt: expiresAt, - Environment: transactionInfo.Environment, + Valid: true, + TransactionID: transactionInfo.TransactionID, + OriginalTransactionID: transactionInfo.OriginalTransactionID, + ProductID: transactionInfo.ProductID, + ExpiresAt: expiresAt, + Environment: transactionInfo.Environment, }, nil } @@ -243,11 +260,12 @@ func (c *AppleIAPClient) ValidateReceipt(ctx context.Context, receiptData string if err == nil { expiresAt := time.Unix(transactionInfo.ExpiresDate/1000, 0) return &AppleValidationResult{ - Valid: true, - TransactionID: transactionInfo.TransactionID, - ProductID: transactionInfo.ProductID, - ExpiresAt: expiresAt, - Environment: transactionInfo.Environment, + Valid: true, + TransactionID: transactionInfo.TransactionID, + OriginalTransactionID: transactionInfo.OriginalTransactionID, + ProductID: transactionInfo.ProductID, + ExpiresAt: expiresAt, + Environment: transactionInfo.Environment, }, nil } } @@ -317,11 +335,12 @@ func (c *AppleIAPClient) ValidateReceipt(ctx context.Context, receiptData string expiresAt := time.Unix(transactionInfo.ExpiresDate/1000, 0) return &AppleValidationResult{ - Valid: true, - TransactionID: transactionInfo.TransactionID, - ProductID: transactionInfo.ProductID, - ExpiresAt: expiresAt, - Environment: transactionInfo.Environment, + Valid: true, + TransactionID: transactionInfo.TransactionID, + OriginalTransactionID: transactionInfo.OriginalTransactionID, + ProductID: transactionInfo.ProductID, + ExpiresAt: expiresAt, + Environment: transactionInfo.Environment, }, nil } @@ -418,13 +437,14 @@ func (c *AppleIAPClient) validateLegacyReceiptWithSandbox(ctx context.Context, r } return &AppleValidationResult{ - Valid: true, - TransactionID: latestReceipt.TransactionID, - ProductID: latestReceipt.ProductID, - ExpiresAt: expiresAt, - IsTrialPeriod: latestReceipt.IsTrialPeriod == "true", - AutoRenewEnabled: autoRenew, - Environment: legacyResponse.Environment, + Valid: true, + TransactionID: latestReceipt.TransactionID, + OriginalTransactionID: latestReceipt.OriginalTransactionID, + ProductID: latestReceipt.ProductID, + ExpiresAt: expiresAt, + IsTrialPeriod: latestReceipt.IsTrialPeriod == "true", + AutoRenewEnabled: autoRenew, + Environment: legacyResponse.Environment, }, nil } diff --git a/internal/services/notification_service.go b/internal/services/notification_service.go index 1bb7e1f..bc3e11d 100644 --- a/internal/services/notification_service.go +++ b/internal/services/notification_service.go @@ -308,7 +308,18 @@ func (s *NotificationService) registerAPNSDevice(ctx context.Context, userID uin // Check if device exists existing, err := s.notificationRepo.WithContext(ctx).FindAPNSDeviceByToken(req.RegistrationID) if err == nil { - // Update existing device + // Audit C8 / LOW-3: APNs device tokens are recycled across devices, + // app reinstalls and OS reassignments, so a token already bound to a + // different account is a stale binding — not a hijack. Reassign it to + // the current (authenticated) registrant rather than reject: a 409 + // here would lock the legitimate new owner of a recycled token out of + // push entirely. The reassignment is logged as a security-relevant + // event so a genuine token-takeover attempt is still traceable. + if existing.UserID != nil && *existing.UserID != userID { + log.Warn().Uint("user_id", userID).Uint("previous_owner_id", *existing.UserID). + Msg("APNS device token reassigned to a new account") + } + // Update existing device — reassign to the current user existing.UserID = &userID existing.Active = true existing.Name = req.Name @@ -337,7 +348,18 @@ func (s *NotificationService) registerGCMDevice(ctx context.Context, userID uint // Check if device exists existing, err := s.notificationRepo.WithContext(ctx).FindGCMDeviceByToken(req.RegistrationID) if err == nil { - // Update existing device + // Audit C8 / LOW-3: FCM device tokens are recycled across devices, + // app reinstalls and OS reassignments, so a token already bound to a + // different account is a stale binding — not a hijack. Reassign it to + // the current (authenticated) registrant rather than reject: a 409 + // here would lock the legitimate new owner of a recycled token out of + // push entirely. The reassignment is logged as a security-relevant + // event so a genuine token-takeover attempt is still traceable. + if existing.UserID != nil && *existing.UserID != userID { + log.Warn().Uint("user_id", userID).Uint("previous_owner_id", *existing.UserID). + Msg("GCM device token reassigned to a new account") + } + // Update existing device — reassign to the current user existing.UserID = &userID existing.Active = true existing.Name = req.Name diff --git a/internal/services/residence_service.go b/internal/services/residence_service.go index c666344..6193074 100644 --- a/internal/services/residence_service.go +++ b/internal/services/residence_service.go @@ -559,30 +559,22 @@ func (s *ResidenceService) GenerateSharePackage(ctx context.Context, residenceID }, nil } -// JoinWithCode allows a user to join a residence using a share code +// JoinWithCode allows a user to join a residence using a share code. +// Audit C9/H9: the code lookup, membership add, and one-time-code +// deactivation run as a single locked transaction in the repository, so a +// code can never be redeemed twice and a deactivation failure aborts the join. func (s *ResidenceService) JoinWithCode(ctx context.Context, code string, userID uint) (*responses.JoinResidenceResponse, error) { - // Find the share code - shareCode, err := s.residenceRepo.WithContext(ctx).FindShareCodeByCode(code) + residenceID, alreadyMember, err := s.residenceRepo.WithContext(ctx).JoinWithShareCode(code, userID) if err != nil { if errors.Is(err, gorm.ErrRecordNotFound) { return nil, apperrors.NotFound("error.share_code_invalid") } return nil, apperrors.Internal(err) } - - // Check if already a member - hasAccess, err := s.residenceRepo.WithContext(ctx).HasAccess(shareCode.ResidenceID, userID) - if err != nil { - return nil, apperrors.Internal(err) - } - if hasAccess { + if alreadyMember { return nil, apperrors.Conflict("error.user_already_member") } - // Add user to residence - if err := s.residenceRepo.WithContext(ctx).AddUser(shareCode.ResidenceID, userID); err != nil { - return nil, apperrors.Internal(err) - } if s.cache != nil { // The joining user's residence-IDs cache is now stale, and their // subscription status now reflects an extra residence with all of its @@ -591,15 +583,8 @@ func (s *ResidenceService) JoinWithCode(ctx context.Context, code string, userID _ = s.cache.InvalidateSubscriptionStatusForUsers(ctx, userID) } - // Mark share code as used (one-time use) - if err := s.residenceRepo.WithContext(ctx).DeactivateShareCode(shareCode.ID); err != nil { - // Log the error but don't fail the join - the user has already been added - // The code will just be usable by others until it expires - log.Error().Err(err).Uint("code_id", shareCode.ID).Msg("Failed to deactivate share code after join") - } - // Get the residence with full details - residence, err := s.residenceRepo.WithContext(ctx).FindByID(shareCode.ResidenceID) + residence, err := s.residenceRepo.WithContext(ctx).FindByID(residenceID) if err != nil { return nil, apperrors.Internal(err) } diff --git a/internal/services/subscription_service.go b/internal/services/subscription_service.go index 44799c4..b679a90 100644 --- a/internal/services/subscription_service.go +++ b/internal/services/subscription_service.go @@ -399,99 +399,135 @@ func (s *SubscriptionService) GetActivePromotions(ctx context.Context, userID ui return result, nil } -// ProcessApplePurchase processes an Apple IAP purchase -// Supports both StoreKit 1 (receiptData) and StoreKit 2 (transactionID) +// ProcessApplePurchase processes an Apple IAP purchase. +// Supports both StoreKit 1 (receiptData) and StoreKit 2 (transactionID). func (s *SubscriptionService) ProcessApplePurchase(ctx context.Context, userID uint, receiptData string, transactionID string) (*SubscriptionResponse, error) { - // Store receipt/transaction data - dataToStore := receiptData - if dataToStore == "" { - dataToStore = transactionID - } - if err := s.subscriptionRepo.WithContext(ctx).UpdateReceiptData(userID, dataToStore); err != nil { - return nil, apperrors.Internal(err) - } - - // Apple IAP client must be configured to validate purchases. - // Without server-side validation, we cannot trust client-provided receipts. + // Apple IAP client must be configured — without server-side validation + // we cannot trust client-provided receipts. if s.appleClient == nil { log.Error().Uint("user_id", userID).Msg("Apple IAP validation not configured, rejecting purchase") return nil, apperrors.BadRequest("error.iap_validation_not_configured") } - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + // Validation is a network call to Apple; detach from the request context + // so a client disconnect cannot abort an in-flight grant. + vctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() var result *AppleValidationResult var err error - - // Prefer transaction ID (StoreKit 2), fall back to receipt data (StoreKit 1) + // Prefer transaction ID (StoreKit 2), fall back to receipt data (StoreKit 1). if transactionID != "" { - result, err = s.appleClient.ValidateTransaction(ctx, transactionID) + result, err = s.appleClient.ValidateTransaction(vctx, transactionID) } else if receiptData != "" { - result, err = s.appleClient.ValidateReceipt(ctx, receiptData) + result, err = s.appleClient.ValidateReceipt(vctx, receiptData) } - if err != nil { // Validation failed -- do NOT fall through to grant Pro. log.Error().Err(err).Uint("user_id", userID).Msg("Apple validation failed") return nil, err } - if result == nil { return nil, apperrors.BadRequest("error.no_receipt_or_transaction") } + // Audit C5/C10: replay protection. A validated transaction may only ever + // be bound to one account — re-submitting a valid receipt against a + // second account must not grant Pro for free. The partial unique index + // on apple_original_transaction_id is the backstop for the check/store + // race below. + if result.OriginalTransactionID != "" { + existing, lookupErr := s.subscriptionRepo.WithContext(vctx).FindByAppleOriginalTransactionID(result.OriginalTransactionID) + switch { + case lookupErr == nil && existing != nil && existing.UserID != userID: + log.Warn().Uint("user_id", userID).Uint("bound_user_id", existing.UserID). + Msg("Apple purchase rejected — transaction already claimed by another account") + return nil, apperrors.Forbidden("error.iap_transaction_already_claimed") + case lookupErr != nil && !errors.Is(lookupErr, gorm.ErrRecordNotFound): + return nil, apperrors.Internal(lookupErr) + } + } + + // Persist the receipt blob and the replay key. + dataToStore := receiptData + if dataToStore == "" { + dataToStore = transactionID + } + if err := s.subscriptionRepo.WithContext(vctx).UpdateReceiptData(userID, dataToStore); err != nil { + return nil, apperrors.Internal(err) + } + if result.OriginalTransactionID != "" { + if err := s.subscriptionRepo.WithContext(vctx).UpdateAppleOriginalTransactionID(userID, result.OriginalTransactionID); err != nil { + // The unique index rejected the bind — a concurrent request + // claimed the same transaction first. + log.Warn().Err(err).Uint("user_id", userID).Msg("Failed to bind Apple transaction ID") + return nil, apperrors.Internal(err) + } + } + expiresAt := result.ExpiresAt log.Info().Uint("user_id", userID).Str("product", result.ProductID).Time("expires", result.ExpiresAt).Str("env", result.Environment).Msg("Apple purchase validated") - // Upgrade to Pro with the validated expiration - if err := s.subscriptionRepo.WithContext(ctx).UpgradeToPro(userID, expiresAt, "ios"); err != nil { + if err := s.subscriptionRepo.WithContext(vctx).UpgradeToPro(userID, expiresAt, "ios"); err != nil { return nil, apperrors.Internal(err) } // Tier flipped — drop cached SubscriptionStatusResponse so the next call // returns Pro immediately instead of stale Free. if s.cache != nil { - _ = s.cache.InvalidateSubscriptionStatusForUsers(ctx, userID) + _ = s.cache.InvalidateSubscriptionStatusForUsers(vctx, userID) } - return s.GetSubscription(ctx, userID) + return s.GetSubscription(vctx, userID) } // ProcessGooglePurchase processes a Google Play purchase // productID is optional but helps validate the specific subscription func (s *SubscriptionService) ProcessGooglePurchase(ctx context.Context, userID uint, purchaseToken string, productID string) (*SubscriptionResponse, error) { - // Store purchase token first - if err := s.subscriptionRepo.WithContext(ctx).UpdatePurchaseToken(userID, purchaseToken); err != nil { - return nil, apperrors.Internal(err) - } - - // Google IAP client must be configured to validate purchases. - // Without server-side validation, we cannot trust client-provided tokens. + // Google IAP client must be configured — without server-side validation + // we cannot trust client-provided tokens. if s.googleClient == nil { log.Error().Uint("user_id", userID).Msg("Google IAP validation not configured, rejecting purchase") return nil, apperrors.BadRequest("error.iap_validation_not_configured") } - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + // Audit C6/C10: replay protection — a purchase token may only ever be + // bound to one account. The partial unique index on google_purchase_token + // is the backstop for the check/store race. + if purchaseToken != "" { + existing, lookupErr := s.subscriptionRepo.WithContext(ctx).FindByGoogleToken(purchaseToken) + switch { + case lookupErr == nil && existing != nil && existing.UserID != userID: + log.Warn().Uint("user_id", userID).Uint("bound_user_id", existing.UserID). + Msg("Google purchase rejected — token already claimed by another account") + return nil, apperrors.Forbidden("error.iap_transaction_already_claimed") + case lookupErr != nil && !errors.Is(lookupErr, gorm.ErrRecordNotFound): + return nil, apperrors.Internal(lookupErr) + } + } + + // Store the purchase token (the replay key). + if err := s.subscriptionRepo.WithContext(ctx).UpdatePurchaseToken(userID, purchaseToken); err != nil { + return nil, apperrors.Internal(err) + } + + // Validation is a network call; detach from the request context. + vctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() var result *GoogleValidationResult var err error - - // If productID is provided, use it directly; otherwise try known IDs + // If productID is provided, use it directly; otherwise try known IDs. if productID != "" { - result, err = s.googleClient.ValidateSubscription(ctx, productID, purchaseToken) + result, err = s.googleClient.ValidateSubscription(vctx, productID, purchaseToken) } else { - result, err = s.googleClient.ValidatePurchaseToken(ctx, purchaseToken, KnownSubscriptionIDs) + result, err = s.googleClient.ValidatePurchaseToken(vctx, purchaseToken, KnownSubscriptionIDs) } - if err != nil { // Validation failed -- do NOT fall through to grant Pro. log.Error().Err(err).Uint("user_id", userID).Msg("Google purchase validation failed") return nil, err } - if result == nil { return nil, apperrors.BadRequest("error.no_purchase_token") } @@ -499,24 +535,23 @@ func (s *SubscriptionService) ProcessGooglePurchase(ctx context.Context, userID expiresAt := result.ExpiresAt log.Info().Uint("user_id", userID).Str("product", result.ProductID).Time("expires", result.ExpiresAt).Bool("auto_renew", result.AutoRenewing).Msg("Google purchase validated") - // Acknowledge the subscription if not already acknowledged + // Acknowledge the subscription if not already acknowledged. if !result.AcknowledgedState { - if err := s.googleClient.AcknowledgeSubscription(ctx, result.ProductID, purchaseToken); err != nil { + if err := s.googleClient.AcknowledgeSubscription(vctx, result.ProductID, purchaseToken); err != nil { log.Warn().Err(err).Uint("user_id", userID).Msg("Failed to acknowledge Google subscription") - // Don't fail the purchase, just log the warning + // Don't fail the purchase, just log the warning. } } - // Upgrade to Pro with the validated expiration - if err := s.subscriptionRepo.WithContext(ctx).UpgradeToPro(userID, expiresAt, "android"); err != nil { + if err := s.subscriptionRepo.WithContext(vctx).UpgradeToPro(userID, expiresAt, "android"); err != nil { return nil, apperrors.Internal(err) } if s.cache != nil { - _ = s.cache.InvalidateSubscriptionStatusForUsers(ctx, userID) + _ = s.cache.InvalidateSubscriptionStatusForUsers(vctx, userID) } - return s.GetSubscription(ctx, userID) + return s.GetSubscription(vctx, userID) } // CancelSubscription cancels a subscription (downgrades to free at end of period) diff --git a/migrations/000003_hash_auth_tokens.sql b/migrations/000003_hash_auth_tokens.sql new file mode 100644 index 0000000..4c943e8 --- /dev/null +++ b/migrations/000003_hash_auth_tokens.sql @@ -0,0 +1,13 @@ +-- +goose Up +-- Audit C1: auth tokens are stored as SHA-256 hashes (hex, 64 chars), never +-- as plaintext, so a database compromise no longer yields usable session +-- tokens. Widen the key column from 40 to 64 chars. Existing plaintext rows +-- cannot be rehashed in place, so they are dropped — every user logs in +-- once after this deploy. This is expected and one-time. +ALTER TABLE user_authtoken ALTER COLUMN key TYPE varchar(64); +DELETE FROM user_authtoken; + +-- +goose Down +-- Tokens cannot be un-hashed; clearing the table is the only safe rollback. +DELETE FROM user_authtoken; +ALTER TABLE user_authtoken ALTER COLUMN key TYPE varchar(40); diff --git a/migrations/000004_iap_replay_protection.sql b/migrations/000004_iap_replay_protection.sql new file mode 100644 index 0000000..4d6f0da --- /dev/null +++ b/migrations/000004_iap_replay_protection.sql @@ -0,0 +1,47 @@ +-- +goose Up +-- Audit C5/C6/C10/C13: bind each in-app-purchase transaction to exactly one +-- account so a valid receipt cannot be replayed against a second account to +-- grant Pro for free. +-- +-- apple_original_transaction_id is a dedicated, indexed column — it replaces +-- the LIKE '%...%' scan over apple_receipt_data that the Apple webhook used +-- to find users (C13). google_purchase_token already exists; we just add the +-- uniqueness guarantee. +ALTER TABLE subscription_usersubscription + ADD COLUMN IF NOT EXISTS apple_original_transaction_id text; + +-- Partial unique indexes: one account per transaction. NULL/empty rows are +-- excluded so accounts without an IAP are unaffected. +CREATE UNIQUE INDEX IF NOT EXISTS uq_subscription_apple_original_txn + ON subscription_usersubscription (apple_original_transaction_id) + WHERE apple_original_transaction_id IS NOT NULL + AND apple_original_transaction_id <> ''; + +-- Pre-flight dedup for the Google index below. apple_original_transaction_id +-- is brand-new (added above), so it is all-NULL and cannot collide. But +-- google_purchase_token is a pre-existing column, and the C6 replay bug being +-- fixed here is exactly "the same token bound to multiple accounts" — so +-- duplicate rows may exist and would make the UNIQUE index below fail to +-- build, aborting the migrate Job. Keep the earliest subscription row for +-- each token and clear the token on the rest; those rows lose a binding that +-- was disputed anyway, while the original (earliest) owner keeps it. +UPDATE subscription_usersubscription s +SET google_purchase_token = NULL +WHERE google_purchase_token IS NOT NULL + AND google_purchase_token <> '' + AND id <> ( + SELECT MIN(s2.id) + FROM subscription_usersubscription s2 + WHERE s2.google_purchase_token = s.google_purchase_token + ); + +CREATE UNIQUE INDEX IF NOT EXISTS uq_subscription_google_purchase_token + ON subscription_usersubscription (google_purchase_token) + WHERE google_purchase_token IS NOT NULL + AND google_purchase_token <> ''; + +-- +goose Down +DROP INDEX IF EXISTS uq_subscription_google_purchase_token; +DROP INDEX IF EXISTS uq_subscription_apple_original_txn; +ALTER TABLE subscription_usersubscription + DROP COLUMN IF EXISTS apple_original_transaction_id; diff --git a/migrations/000005_audit_log_append_only.sql b/migrations/000005_audit_log_append_only.sql new file mode 100644 index 0000000..0f86685 --- /dev/null +++ b/migrations/000005_audit_log_append_only.sql @@ -0,0 +1,52 @@ +-- +goose Up +-- Audit M7: make audit_log append-only. A BEFORE trigger rejects UPDATE and +-- DELETE so the security-event history cannot be altered or erased after the +-- fact — even by a database account with broad table privileges. The +-- application only ever INSERTs into this table. + +-- The audit_log table itself was never created by a goose migration — it is +-- only built by GORM AutoMigrate in the test harness, and production never +-- runs AutoMigrate. CREATE TABLE IF NOT EXISTS brings it under migration +-- control without disturbing an existing table: a no-op on a DB that already +-- has it, and a correct build (matching models.AuditLog) on a from-scratch +-- redeploy — so the trigger below has a table to attach to and a clean +-- redeploy comes up with a working, append-only audit log. +CREATE TABLE IF NOT EXISTS audit_log ( + id BIGSERIAL PRIMARY KEY, + user_id BIGINT, + event_type VARCHAR(50) NOT NULL, + ip_address VARCHAR(45), + user_agent TEXT, + details JSONB, + created_at TIMESTAMPTZ NOT NULL DEFAULT now() +); +CREATE INDEX IF NOT EXISTS idx_audit_log_user_id ON audit_log (user_id); +CREATE INDEX IF NOT EXISTS idx_audit_log_created_at ON audit_log (created_at); + +-- +goose StatementBegin +CREATE OR REPLACE FUNCTION audit_log_append_only() RETURNS trigger AS $$ +BEGIN + RAISE EXCEPTION 'audit_log is append-only: % is not permitted', TG_OP; +END; +$$ LANGUAGE plpgsql; +-- +goose StatementEnd + +-- DROP ... IF EXISTS before CREATE keeps this idempotent (CREATE TRIGGER has +-- no OR REPLACE on older PostgreSQL). +DROP TRIGGER IF EXISTS audit_log_no_update ON audit_log; +CREATE TRIGGER audit_log_no_update + BEFORE UPDATE ON audit_log + FOR EACH ROW EXECUTE FUNCTION audit_log_append_only(); + +DROP TRIGGER IF EXISTS audit_log_no_delete ON audit_log; +CREATE TRIGGER audit_log_no_delete + BEFORE DELETE ON audit_log + FOR EACH ROW EXECUTE FUNCTION audit_log_append_only(); + +-- +goose Down +-- Reverses only the append-only guard, which is this migration's purpose. +-- The audit_log table is intentionally NOT dropped — it may hold security +-- history that predates this migration. +DROP TRIGGER IF EXISTS audit_log_no_delete ON audit_log; +DROP TRIGGER IF EXISTS audit_log_no_update ON audit_log; +DROP FUNCTION IF EXISTS audit_log_append_only(); diff --git a/migrations/000006_webhook_event_log.sql b/migrations/000006_webhook_event_log.sql new file mode 100644 index 0000000..d161a10 --- /dev/null +++ b/migrations/000006_webhook_event_log.sql @@ -0,0 +1,30 @@ +-- +goose Up +-- Audit H6 follow-up. The Apple/Google webhook handler now fails CLOSED on a +-- deduplication-store error: if it cannot consult webhook_event_log it returns +-- 500 rather than risk processing a replayed event. That makes the presence of +-- the webhook_event_log table mandatory. +-- +-- Like audit_log, this table was never created by a goose migration — only by +-- GORM AutoMigrate in tests — so a from-scratch redeploy would come up without +-- it and 500 every subscription webhook. CREATE TABLE IF NOT EXISTS brings it +-- under migration control: a no-op where the table already exists, and a +-- correct build (matching repositories.WebhookEvent) on a fresh database. +CREATE TABLE IF NOT EXISTS webhook_event_log ( + id BIGSERIAL PRIMARY KEY, + event_id VARCHAR(255) NOT NULL, + provider VARCHAR(20) NOT NULL, + event_type VARCHAR(100) NOT NULL, + processed_at TIMESTAMPTZ NOT NULL DEFAULT now(), + payload_hash VARCHAR(64) +); + +-- (provider, event_id) is the dedup key — matches the +-- uniqueIndex:idx_provider_event_id GORM tags on repositories.WebhookEvent. +CREATE UNIQUE INDEX IF NOT EXISTS idx_provider_event_id + ON webhook_event_log (provider, event_id); + +-- +goose Down +-- The table is intentionally NOT dropped — it may hold deduplication history +-- that predates this migration, and dropping it would let already-processed +-- webhook events be replayed. Down is a documented no-op. +SELECT 1;