From 6f303dbbaa8f6476ee30f16cf0abfd03edb003a5 Mon Sep 17 00:00:00 2001 From: Trey t Date: Fri, 24 Apr 2026 07:20:21 -0500 Subject: [PATCH] Migrate prod deploy from Swarm to K3s; add full deployment book MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Infrastructure: - Stack now runs on K3s v1.34.6 HA (3 Hetzner CX33 nodes as managers) - Traefik DaemonSet + hostNetwork replaces Caddy + ingress mesh - All manifests in deploy-k3s/manifests/; Swarm config (deploy/) kept temporarily for reference Bug fixes surfaced during migration: - Dockerfile: golang:1.24-alpine -> 1.25-alpine (go.mod requires 1.25) - cache_service.go: remove sync.Once reassignment from inside Do() callback (was causing 'unlock of unlocked mutex' fatal after Redis Ping failure) - router.go: relax CSP from 'default-src none' to 'default-src self' + allowlist fonts.googleapis.com so the marketing landing page CSS actually loads in browsers - deploy/scripts/deploy_prod.sh: use docker buildx with --platform linux/amd64 so arm64 (Apple Silicon) dev machines produce images runnable on x86_64 Hetzner nodes; fix array expansion under set -u - deploy/swarm-stack.prod.yml: fix secret source references to use top-level aliases (the '\${X_SECRET}' form never actually resolved); dozzle ports: long-form host_ip is rejected by Swarm, switched to short-form (bound to 0.0.0.0 with UFW-based loopback restriction); worker replicas 2 -> 1 (Asynq scheduler singleton) - deploy-k3s/manifests/admin/deployment.yaml: probe path '/admin/' -> '/' (Next.js serves at root; /admin/ returned 404 and killed pods); startupProbe failureThreshold 12 -> 24 - deploy-k3s/manifests/pod-disruption-budgets.yaml: worker minAvailable 1 -> 0 (singleton) - deploy-k3s/manifests/api/deployment.yaml: startupProbe failureThreshold 12 -> 48 (MigrateWithLock serializes across 3 replicas on first-boot; real startup takes up to 240s) - .gitignore: tighten 'api' -> '/api' (was matching deploy-k3s/manifests/api/ 
and admin/src/app/api/*, hiding legitimate files) New files: - deploy-k3s/manifests/traefik-helmchartconfig.yaml: DaemonSet + hostNetwork override for k3s-bundled Traefik - deploy-k3s/manifests/ingress/ingress-simple.yaml: plain Ingress without TLS (CF Flexible SSL) and without middleware - deploy-k3s/MIGRATION_NOTES.md: operator-facing migration log Documentation: - docs/deployment/ — full deployment book, 26 files, ~42k words: - Part I Overview, infrastructure, orchestrator choice (Ch 0-2) - Part II Networking, firewall, Cloudflare (Ch 3-4, 13) - Part III Security, Traefik ingress (Ch 5-6) - Part IV Services, DB, storage, secrets, registry (Ch 7-11) - Part V Data flow, deploy process, observability, failures, runbook (Ch 12, 14-17) - Part VI Cost, Swarm postmortem, roadmap (Ch 18-20) - Appendices: glossary, kubectl cheat sheet, file locations, consolidated citations - README.md: Production Deployment section replaced with pointer to the book; Go version bumped to 1.25 Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 2 +- README.md | 51 +- admin/src/app/api/health/route.ts | 12 + deploy-k3s-dev/manifests/api/deployment.yaml | 118 ++++ deploy-k3s-dev/manifests/api/service.yaml | 16 + .../manifests/worker/deployment.yaml | 101 +++ deploy-k3s/MIGRATION_NOTES.md | 170 ++++++ deploy-k3s/manifests/admin/deployment.yaml | 8 +- deploy-k3s/manifests/api/deployment.yaml | 123 ++++ deploy-k3s/manifests/api/hpa.yaml | 41 ++ deploy-k3s/manifests/api/service.yaml | 16 + .../manifests/ingress/ingress-simple.yaml | 61 ++ .../manifests/pod-disruption-budgets.yaml | 4 +- .../manifests/traefik-helmchartconfig.yaml | 53 ++ deploy-k3s/manifests/worker/deployment.yaml | 105 ++++ deploy/Caddyfile | 52 ++ deploy/scripts/deploy_prod.sh | 126 +++- deploy/swarm-stack.prod.yml | 117 +++- docs/deployment/00-overview.md | 240 ++++++++ docs/deployment/01-infrastructure.md | 294 +++++++++ docs/deployment/02-orchestrator-choice.md | 323 ++++++++++ docs/deployment/03-networking.md | 
465 ++++++++++++++ docs/deployment/04-firewall.md | 357 +++++++++++ docs/deployment/05-security.md | 526 ++++++++++++++++ docs/deployment/06-traefik-ingress.md | 419 +++++++++++++ docs/deployment/07-services.md | 575 ++++++++++++++++++ docs/deployment/08-database.md | 298 +++++++++ docs/deployment/09-storage.md | 265 ++++++++ docs/deployment/10-secrets-config.md | 369 +++++++++++ docs/deployment/11-registry.md | 329 ++++++++++ docs/deployment/12-data-flow.md | 317 ++++++++++ docs/deployment/13-cloudflare.md | 344 +++++++++++ docs/deployment/14-deployment-process.md | 433 +++++++++++++ docs/deployment/15-observability.md | 305 ++++++++++ docs/deployment/16-failure-modes.md | 360 +++++++++++ docs/deployment/17-runbook.md | 369 +++++++++++ docs/deployment/18-cost.md | 243 ++++++++ docs/deployment/19-postmortem-swarm.md | 480 +++++++++++++++ docs/deployment/20-roadmap.md | 318 ++++++++++ docs/deployment/README.md | 112 ++++ docs/deployment/appendices/a-glossary.md | 207 +++++++ docs/deployment/appendices/b-commands.md | 305 ++++++++++ .../deployment/appendices/c-file-locations.md | 216 +++++++ docs/deployment/appendices/d-references.md | 202 ++++++ internal/router/router.go | 23 +- internal/services/cache_service.go | 8 +- 46 files changed, 9785 insertions(+), 93 deletions(-) create mode 100644 admin/src/app/api/health/route.ts create mode 100644 deploy-k3s-dev/manifests/api/deployment.yaml create mode 100644 deploy-k3s-dev/manifests/api/service.yaml create mode 100644 deploy-k3s-dev/manifests/worker/deployment.yaml create mode 100644 deploy-k3s/MIGRATION_NOTES.md create mode 100644 deploy-k3s/manifests/api/deployment.yaml create mode 100644 deploy-k3s/manifests/api/hpa.yaml create mode 100644 deploy-k3s/manifests/api/service.yaml create mode 100644 deploy-k3s/manifests/ingress/ingress-simple.yaml create mode 100644 deploy-k3s/manifests/traefik-helmchartconfig.yaml create mode 100644 deploy-k3s/manifests/worker/deployment.yaml create mode 100644 deploy/Caddyfile create 
mode 100644 docs/deployment/00-overview.md create mode 100644 docs/deployment/01-infrastructure.md create mode 100644 docs/deployment/02-orchestrator-choice.md create mode 100644 docs/deployment/03-networking.md create mode 100644 docs/deployment/04-firewall.md create mode 100644 docs/deployment/05-security.md create mode 100644 docs/deployment/06-traefik-ingress.md create mode 100644 docs/deployment/07-services.md create mode 100644 docs/deployment/08-database.md create mode 100644 docs/deployment/09-storage.md create mode 100644 docs/deployment/10-secrets-config.md create mode 100644 docs/deployment/11-registry.md create mode 100644 docs/deployment/12-data-flow.md create mode 100644 docs/deployment/13-cloudflare.md create mode 100644 docs/deployment/14-deployment-process.md create mode 100644 docs/deployment/15-observability.md create mode 100644 docs/deployment/16-failure-modes.md create mode 100644 docs/deployment/17-runbook.md create mode 100644 docs/deployment/18-cost.md create mode 100644 docs/deployment/19-postmortem-swarm.md create mode 100644 docs/deployment/20-roadmap.md create mode 100644 docs/deployment/README.md create mode 100644 docs/deployment/appendices/a-glossary.md create mode 100644 docs/deployment/appendices/b-commands.md create mode 100644 docs/deployment/appendices/c-file-locations.md create mode 100644 docs/deployment/appendices/d-references.md diff --git a/.gitignore b/.gitignore index b09bb45..a7ac383 100644 --- a/.gitignore +++ b/.gitignore @@ -5,7 +5,7 @@ # Binaries bin/ -api +/api /worker /admin !admin/ diff --git a/README.md b/README.md index 1a9506f..98c31ba 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ Go REST API for the honeyDue property management platform. 
Powers iOS and Androi ## Tech Stack -- **Language**: Go 1.24 +- **Language**: Go 1.25 - **HTTP Framework**: [Echo v4](https://github.com/labstack/echo) - **ORM**: [GORM](https://gorm.io/) with PostgreSQL - **Background Jobs**: [Asynq](https://github.com/hibiken/asynq) (Redis-backed) @@ -16,7 +16,7 @@ Go REST API for the honeyDue property management platform. Powers iOS and Androi ## Prerequisites -- **Go 1.24+** — [install](https://go.dev/dl/) +- **Go 1.25+** — [install](https://go.dev/dl/) - **PostgreSQL 16+** — via Docker (recommended) or [native install](https://www.postgresql.org/download/) - **Redis 7+** — via Docker (recommended) or [native install](https://redis.io/docs/getting-started/) - **Docker & Docker Compose** — [install](https://docs.docker.com/get-docker/) (recommended for local development) @@ -259,34 +259,43 @@ All protected endpoints require an `Authorization: Token ` header. ## Production Deployment -### Dokku +Production runs on a **3-node K3s HA cluster** on Hetzner Cloud, fronted +by Cloudflare, with Neon Postgres, Backblaze B2, and a self-hosted Gitea +container registry. 
See the full deployment book for every detail: -```bash -# Push to Dokku -git push dokku main +**→ [docs/deployment/](./docs/deployment/README.md) — The Deployment Book** -# Seed lookup data -cat seeds/001_lookups.sql | dokku postgres:connect honeydue-db +26 files (21 chapters plus appendices and index) and ~42,000 words covering: -# Check logs -dokku logs honeydue-api -t -``` +- **Part I — The System**: overview, Hetzner infrastructure, why K3s + (and not Swarm, full Kubernetes, or Nomad) +- **Part II — Networking**: Flannel VXLAN, CoreDNS, kube-proxy, every + UFW rule on every node, Cloudflare DNS setup +- **Part III — Security**: RBAC, Pod Security, secrets, TLS chain +- **Part IV — Workloads**: api, admin, worker, redis per-service deep + dives; Neon Postgres config; Backblaze B2 storage; Gitea registry +- **Part V — Operation**: end-to-end data flow, deploy process, + observability, failure modes, operator runbook +- **Part VI — Context**: cost breakdown, postmortem of the bugs from + the Swarm→K3s migration, roadmap -### Docker Swarm +Quick links: -```bash -# Build and push production images -make docker-build-prod -docker push ${REGISTRY}/honeydue-api:${TAG} -docker push ${REGISTRY}/honeydue-worker:${TAG} -docker push ${REGISTRY}/honeydue-admin:${TAG} +- **Runbook** — [docs/deployment/17-runbook.md](./docs/deployment/17-runbook.md) — 22 common ops procedures +- **kubectl cheat sheet** — [docs/deployment/appendices/b-commands.md](./docs/deployment/appendices/b-commands.md) +- **Deploy process** — [docs/deployment/14-deployment-process.md](./docs/deployment/14-deployment-process.md) — build → push → rollout +- **Failure modes** — [docs/deployment/16-failure-modes.md](./docs/deployment/16-failure-modes.md) — what happens when X dies +- **Swarm postmortem** — [docs/deployment/19-postmortem-swarm.md](./docs/deployment/19-postmortem-swarm.md) — why we migrated -# Deploy the stack (all env vars must be set in .env or environment) -docker stack deploy -c docker-compose.yml honeydue -``` +Operational 
state lives under: + +- `deploy-k3s/manifests/` — Kubernetes manifests (apply with `kubectl`) +- `deploy-k3s/MIGRATION_NOTES.md` — notes from the Swarm → K3s migration +- `deploy/` — legacy Swarm config (retained temporarily; to be removed) ## Related Projects +- **Deployment Book**: [`docs/deployment/`](./docs/deployment/README.md) — full production operations reference - **Mobile App (KMM)**: `../HoneyDueKMM` — Kotlin Multiplatform iOS/Android client - **Task Logic Docs**: `docs/TASK_LOGIC_ARCHITECTURE.md` — required reading before task-related work - **Push Notification Docs**: `docs/PUSH_NOTIFICATIONS.md` diff --git a/admin/src/app/api/health/route.ts b/admin/src/app/api/health/route.ts new file mode 100644 index 0000000..5338afa --- /dev/null +++ b/admin/src/app/api/health/route.ts @@ -0,0 +1,12 @@ +import { NextResponse } from 'next/server' + +export const dynamic = 'force-dynamic' +export const revalidate = 0 + +export async function GET() { + return NextResponse.json({ status: 'ok' }, { status: 200 }) +} + +export async function HEAD() { + return new NextResponse(null, { status: 200 }) +} diff --git a/deploy-k3s-dev/manifests/api/deployment.yaml b/deploy-k3s-dev/manifests/api/deployment.yaml new file mode 100644 index 0000000..dd59814 --- /dev/null +++ b/deploy-k3s-dev/manifests/api/deployment.yaml @@ -0,0 +1,118 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: api + namespace: honeydue + labels: + app.kubernetes.io/name: api + app.kubernetes.io/part-of: honeydue +spec: + replicas: 1 + strategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 0 + maxSurge: 1 + selector: + matchLabels: + app.kubernetes.io/name: api + template: + metadata: + labels: + app.kubernetes.io/name: api + app.kubernetes.io/part-of: honeydue + spec: + serviceAccountName: api + imagePullSecrets: + - name: ghcr-credentials + securityContext: + runAsNonRoot: true + runAsUser: 1000 + runAsGroup: 1000 + fsGroup: 1000 + seccompProfile: + type: RuntimeDefault + 
containers: + - name: api + image: IMAGE_PLACEHOLDER # Replaced by 03-deploy.sh + ports: + - containerPort: 8000 + protocol: TCP + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: ["ALL"] + envFrom: + - configMapRef: + name: honeydue-config + env: + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: honeydue-secrets + key: POSTGRES_PASSWORD + - name: SECRET_KEY + valueFrom: + secretKeyRef: + name: honeydue-secrets + key: SECRET_KEY + - name: EMAIL_HOST_PASSWORD + valueFrom: + secretKeyRef: + name: honeydue-secrets + key: EMAIL_HOST_PASSWORD + - name: FCM_SERVER_KEY + valueFrom: + secretKeyRef: + name: honeydue-secrets + key: FCM_SERVER_KEY + - name: REDIS_PASSWORD + valueFrom: + secretKeyRef: + name: honeydue-secrets + key: REDIS_PASSWORD + optional: true + volumeMounts: + - name: apns-key + mountPath: /secrets/apns + readOnly: true + - name: tmp + mountPath: /tmp + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: "1" + memory: 512Mi + startupProbe: + httpGet: + path: /api/health/ + port: 8000 + failureThreshold: 12 + periodSeconds: 5 + readinessProbe: + httpGet: + path: /api/health/ + port: 8000 + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + livenessProbe: + httpGet: + path: /api/health/ + port: 8000 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 10 + volumes: + - name: apns-key + secret: + secretName: honeydue-apns-key + items: + - key: apns_auth_key.p8 + path: apns_auth_key.p8 + - name: tmp + emptyDir: + sizeLimit: 64Mi diff --git a/deploy-k3s-dev/manifests/api/service.yaml b/deploy-k3s-dev/manifests/api/service.yaml new file mode 100644 index 0000000..6a7594a --- /dev/null +++ b/deploy-k3s-dev/manifests/api/service.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + name: api + namespace: honeydue + labels: + app.kubernetes.io/name: api + app.kubernetes.io/part-of: honeydue +spec: + type: ClusterIP + selector: + 
app.kubernetes.io/name: api + ports: + - port: 8000 + targetPort: 8000 + protocol: TCP diff --git a/deploy-k3s-dev/manifests/worker/deployment.yaml b/deploy-k3s-dev/manifests/worker/deployment.yaml new file mode 100644 index 0000000..efd571c --- /dev/null +++ b/deploy-k3s-dev/manifests/worker/deployment.yaml @@ -0,0 +1,101 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: worker + namespace: honeydue + labels: + app.kubernetes.io/name: worker + app.kubernetes.io/part-of: honeydue +spec: + replicas: 1 + strategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 0 + maxSurge: 1 + selector: + matchLabels: + app.kubernetes.io/name: worker + template: + metadata: + labels: + app.kubernetes.io/name: worker + app.kubernetes.io/part-of: honeydue + spec: + serviceAccountName: worker + imagePullSecrets: + - name: ghcr-credentials + securityContext: + runAsNonRoot: true + runAsUser: 1000 + runAsGroup: 1000 + fsGroup: 1000 + seccompProfile: + type: RuntimeDefault + containers: + - name: worker + image: IMAGE_PLACEHOLDER # Replaced by 03-deploy.sh + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: ["ALL"] + envFrom: + - configMapRef: + name: honeydue-config + env: + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: honeydue-secrets + key: POSTGRES_PASSWORD + - name: SECRET_KEY + valueFrom: + secretKeyRef: + name: honeydue-secrets + key: SECRET_KEY + - name: EMAIL_HOST_PASSWORD + valueFrom: + secretKeyRef: + name: honeydue-secrets + key: EMAIL_HOST_PASSWORD + - name: FCM_SERVER_KEY + valueFrom: + secretKeyRef: + name: honeydue-secrets + key: FCM_SERVER_KEY + - name: REDIS_PASSWORD + valueFrom: + secretKeyRef: + name: honeydue-secrets + key: REDIS_PASSWORD + optional: true + volumeMounts: + - name: apns-key + mountPath: /secrets/apns + readOnly: true + - name: tmp + mountPath: /tmp + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 512Mi + livenessProbe: + 
exec: + command: ["pgrep", "-f", "/app/worker"] + initialDelaySeconds: 15 + periodSeconds: 30 + timeoutSeconds: 5 + volumes: + - name: apns-key + secret: + secretName: honeydue-apns-key + items: + - key: apns_auth_key.p8 + path: apns_auth_key.p8 + - name: tmp + emptyDir: + sizeLimit: 64Mi diff --git a/deploy-k3s/MIGRATION_NOTES.md b/deploy-k3s/MIGRATION_NOTES.md new file mode 100644 index 0000000..d701a7e --- /dev/null +++ b/deploy-k3s/MIGRATION_NOTES.md @@ -0,0 +1,170 @@ +# K3s Migration Notes — 2026-04-24 + +honeyDue is running on a 3-node K3s HA cluster on the existing Hetzner nodes +(hetzner1/2/3), replacing the previous Docker Swarm deployment. + +## Why we migrated + +Docker Swarm's libnetwork has a known stale-DNS bug on 29.x +([moby/moby#52265](https://github.com/moby/moby/issues/52265)) that leaves +ghost A-records when tasks migrate between nodes. Single-replica services +(like the admin panel) landed on a ghost IP ~50% of the time → connection +refused → 502. Full stack recreate cleared it, but the bug recurs on every +node-to-node task migration. + +K3s uses CoreDNS + containerd with no libnetwork history → the bug class +doesn't exist there. See `docs/deployment/19-postmortem-swarm.md` for the +full postmortem. + +## Differences from the original `deploy-k3s/` scaffold + +The original scaffold assumes a greenfield provision via `hetzner-k3s`, +GHCR for images, Cloudflare origin certs, and a Hetzner Load Balancer. 
+We reused existing nodes and kept Cloudflare Flexible SSL: + +| Setting | Scaffold default | What we did | +|---|---|---| +| Provisioning | `hetzner-k3s` tool creates boxes | Manual k3s install on existing Hetzner boxes | +| Registry | GHCR (`ghcr-credentials`) | Gitea (`gitea-credentials`) via `kubectl create secret docker-registry` | +| Ingress TLS | `cloudflare-origin-cert` Secret | No TLS at origin (CF Flexible) | +| Load balancer | Hetzner LB → nodes | Cloudflare round-robin across 3 node IPs | +| Admin basic auth | `admin-auth` Traefik middleware | Not applied — in-app auth only | +| CF-only IP allowlist | `cloudflare-only` middleware | Not applied — UFW restricts some ports, 80/443 open to anyone who knows node IPs | +| Traefik | LoadBalancer via servicelb | DaemonSet w/ hostNetwork (servicelb disabled); see `manifests/traefik-helmchartconfig.yaml` and the override section below | +| Worker replicas | 2 | 1 (Asynq scheduler is singleton) | +| API startupProbe grace | 12×5s = 60s | 48×5s = 240s (covers migrate + lock queue on first boot) | +| Admin probe path | `/admin/` | `/` (Next.js serves at root) | + +## Manifest fixes applied in-repo (already committed) + +- `manifests/api/deployment.yaml` — `startupProbe.failureThreshold: 12 → 48` +- `manifests/admin/deployment.yaml` — probe path `/admin/ → /`, threshold `12 → 24` +- `manifests/worker/deployment.yaml` — `replicas: 2 → 1` +- `manifests/pod-disruption-budgets.yaml` — worker `minAvailable: 1 → 0` + +## Traefik override (applied as HelmChartConfig) + +K3s ships Traefik as a single-replica Deployment with a LoadBalancer service. +With servicelb disabled (to avoid binding a random port), we reconfigure it +to a DaemonSet binding directly on each node's public :80/:443 via +`hostNetwork: true`. 
The HelmChartConfig: + +```yaml +apiVersion: helm.cattle.io/v1 +kind: HelmChartConfig +metadata: + name: traefik + namespace: kube-system +spec: + valuesContent: |- + deployment: + kind: DaemonSet + hostNetwork: true + service: + enabled: false + ports: + web: + port: 80 + hostPort: 80 + websecure: + port: 443 + hostPort: 443 + updateStrategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 1 + maxSurge: 0 + securityContext: + capabilities: + drop: [ALL] + add: [NET_BIND_SERVICE] + readOnlyRootFilesystem: true + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 + additionalArguments: + - "--entrypoints.web.forwardedHeaders.trustedIPs=173.245.48.0/20,103.21.244.0/22,103.22.200.0/22,103.31.4.0/22,141.101.64.0/18,108.162.192.0/18,190.93.240.0/20,188.114.96.0/20,197.234.240.0/22,198.41.128.0/17,162.158.0.0/15,104.16.0.0/13,104.24.0.0/14,172.64.0.0/13,131.0.72.0/22" +``` + +Apply with `kubectl apply -f manifests/traefik-helmchartconfig.yaml`, then bump the helm job +(`kubectl delete job -n kube-system helm-install-traefik`) to trigger reinstall. + +## Required node-level sysctl + +Non-root hostNetwork pods can't actually use an added NET_BIND_SERVICE capability +to bind :80/:443 in the host netns on modern containerd. Set on each node: + +```bash +echo 'net.ipv4.ip_unprivileged_port_start=0' | sudo tee /etc/sysctl.d/99-unprivileged-ports.conf +sudo sysctl --system +``` + +## UFW rules added for k3s (per node) + +All between the 3 node IPs (178.104.247.152, 178.105.32.198, 178.104.249.189): + +- `6443/tcp` — kube API +- `2379/tcp`, `2380/tcp` — embedded etcd client + peer +- `10250/tcp` — kubelet +- `8472/udp` — flannel VXLAN overlay + +Plus from your workstation IP to each node's `6443/tcp` for `kubectl`. 
+ +## Ingress + +Minimal hostname-only routing (applied from `/tmp/honeydue-ingress.yaml` at +deploy time; now committed as `manifests/ingress/ingress-simple.yaml`): + +```yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: honeydue-api + namespace: honeydue +spec: + ingressClassName: traefik + rules: + - host: api.myhoneydue.com + http: + paths: + - {path: /, pathType: Prefix, backend: {service: {name: api, port: {number: 8000}}}} + - host: myhoneydue.com + http: + paths: + - {path: /, pathType: Prefix, backend: {service: {name: api, port: {number: 8000}}}} +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: honeydue-admin + namespace: honeydue +spec: + ingressClassName: traefik + rules: + - host: admin.myhoneydue.com + http: + paths: + - {path: /, pathType: Prefix, backend: {service: {name: admin, port: {number: 3000}}}} +``` + +## Operator access + +Kubeconfig lives at `~/.kube/honeydue-k3s.yaml`. + +```bash +export KUBECONFIG=~/.kube/honeydue-k3s.yaml +kubectl get pods -n honeydue +``` + +## Remaining TODOs (not blocking) + +- Apply `manifests/ingress/middleware.yaml` for security headers + rate limiting + (CF-only allowlist + basic auth deliberately skipped until you want them) +- Apply `manifests/network-policies.yaml` for default-deny + explicit allows +- Apply `manifests/api/hpa.yaml` if you want autoscaling (metrics-server is + already running, so just `kubectl apply` it) +- Upgrade to CF Full (strict) SSL: generate origin cert, create + `cloudflare-origin-cert` Secret, add `tls:` block back to Ingress +- Set up a proper migration Job so `api` replicas don't each run `MigrateWithLock` + on startup — lets you drop the 240s startupProbe grace +- Remove `deploy/` (the Swarm-era config) once you're confident in k3s diff --git a/deploy-k3s/manifests/admin/deployment.yaml b/deploy-k3s/manifests/admin/deployment.yaml index 4a33296..53f18ec 100644 --- a/deploy-k3s/manifests/admin/deployment.yaml +++ 
b/deploy-k3s/manifests/admin/deployment.yaml @@ -65,15 +65,17 @@ spec: limits: cpu: 500m memory: 256Mi + # Admin Next.js app serves at `/`, not `/admin/`. `/admin/` returns + # 404 and kills the pod via the probe. startupProbe: httpGet: - path: /admin/ + path: / port: 3000 - failureThreshold: 12 + failureThreshold: 24 periodSeconds: 5 readinessProbe: httpGet: - path: /admin/ + path: / port: 3000 initialDelaySeconds: 5 periodSeconds: 10 diff --git a/deploy-k3s/manifests/api/deployment.yaml b/deploy-k3s/manifests/api/deployment.yaml new file mode 100644 index 0000000..ec1a8f7 --- /dev/null +++ b/deploy-k3s/manifests/api/deployment.yaml @@ -0,0 +1,123 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: api + namespace: honeydue + labels: + app.kubernetes.io/name: api + app.kubernetes.io/part-of: honeydue +spec: + replicas: 3 + strategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 0 + maxSurge: 1 + selector: + matchLabels: + app.kubernetes.io/name: api + template: + metadata: + labels: + app.kubernetes.io/name: api + app.kubernetes.io/part-of: honeydue + spec: + serviceAccountName: api + imagePullSecrets: + - name: ghcr-credentials + securityContext: + runAsNonRoot: true + runAsUser: 1000 + runAsGroup: 1000 + fsGroup: 1000 + seccompProfile: + type: RuntimeDefault + containers: + - name: api + image: IMAGE_PLACEHOLDER # Replaced by 03-deploy.sh + ports: + - containerPort: 8000 + protocol: TCP + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: ["ALL"] + envFrom: + - configMapRef: + name: honeydue-config + env: + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: honeydue-secrets + key: POSTGRES_PASSWORD + - name: SECRET_KEY + valueFrom: + secretKeyRef: + name: honeydue-secrets + key: SECRET_KEY + - name: EMAIL_HOST_PASSWORD + valueFrom: + secretKeyRef: + name: honeydue-secrets + key: EMAIL_HOST_PASSWORD + - name: FCM_SERVER_KEY + valueFrom: + secretKeyRef: + name: honeydue-secrets 
+ key: FCM_SERVER_KEY + - name: REDIS_PASSWORD + valueFrom: + secretKeyRef: + name: honeydue-secrets + key: REDIS_PASSWORD + optional: true + volumeMounts: + - name: apns-key + mountPath: /secrets/apns + readOnly: true + - name: tmp + mountPath: /tmp + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: "1" + memory: 512Mi + startupProbe: + httpGet: + path: /api/health/ + port: 8000 + # MigrateWithLock in cmd/api/main.go runs pg_advisory_lock on + # every startup. On a cold boot with 3 replicas, the first does + # AutoMigrate (~90s) and the others wait on the lock, so real + # startup runs 90–240s. 48 × 5s = 240s grace absorbs it without + # healthcheck killing a still-starting replica. + failureThreshold: 48 + periodSeconds: 5 + readinessProbe: + httpGet: + path: /api/health/ + port: 8000 + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + livenessProbe: + httpGet: + path: /api/health/ + port: 8000 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 10 + volumes: + - name: apns-key + secret: + secretName: honeydue-apns-key + items: + - key: apns_auth_key.p8 + path: apns_auth_key.p8 + - name: tmp + emptyDir: + sizeLimit: 64Mi diff --git a/deploy-k3s/manifests/api/hpa.yaml b/deploy-k3s/manifests/api/hpa.yaml new file mode 100644 index 0000000..05be9df --- /dev/null +++ b/deploy-k3s/manifests/api/hpa.yaml @@ -0,0 +1,41 @@ +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: api + namespace: honeydue + labels: + app.kubernetes.io/name: api + app.kubernetes.io/part-of: honeydue +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: api + minReplicas: 3 + maxReplicas: 6 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 70 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 + behavior: + scaleUp: + stabilizationWindowSeconds: 60 + policies: + - type: Pods + value: 1 + periodSeconds: 
60 + scaleDown: + stabilizationWindowSeconds: 300 + policies: + - type: Pods + value: 1 + periodSeconds: 120 diff --git a/deploy-k3s/manifests/api/service.yaml b/deploy-k3s/manifests/api/service.yaml new file mode 100644 index 0000000..6a7594a --- /dev/null +++ b/deploy-k3s/manifests/api/service.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + name: api + namespace: honeydue + labels: + app.kubernetes.io/name: api + app.kubernetes.io/part-of: honeydue +spec: + type: ClusterIP + selector: + app.kubernetes.io/name: api + ports: + - port: 8000 + targetPort: 8000 + protocol: TCP diff --git a/deploy-k3s/manifests/ingress/ingress-simple.yaml b/deploy-k3s/manifests/ingress/ingress-simple.yaml new file mode 100644 index 0000000..bdfbe88 --- /dev/null +++ b/deploy-k3s/manifests/ingress/ingress-simple.yaml @@ -0,0 +1,61 @@ +# Simple hostname-based Ingress — no TLS (Cloudflare Flexible handles edge +# TLS, CF→origin is plain HTTP on 80). Upgrade to Full (strict) by +# adding back a `tls:` block with a Cloudflare Origin CA cert stored in +# secret/cloudflare-origin-cert. +# +# Middleware chain (security headers, rate limit, CF-only allowlist, admin +# basic auth) is defined in `middleware.yaml` but NOT attached here — +# annotate this ingress to turn any of them on. +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: honeydue-api + namespace: honeydue + labels: + app.kubernetes.io/part-of: honeydue +spec: + ingressClassName: traefik + rules: + - host: api.myhoneydue.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: api + port: + number: 8000 + # Root domain serves the marketing landing page from the Go API's + # STATIC_DIR. ALLOWED_HOSTS in honeydue-config includes myhoneydue.com. 
+ - host: myhoneydue.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: api + port: + number: 8000 +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: honeydue-admin + namespace: honeydue + labels: + app.kubernetes.io/part-of: honeydue +spec: + ingressClassName: traefik + rules: + - host: admin.myhoneydue.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: admin + port: + number: 3000 diff --git a/deploy-k3s/manifests/pod-disruption-budgets.yaml b/deploy-k3s/manifests/pod-disruption-budgets.yaml index e1abea8..f4b60e4 100644 --- a/deploy-k3s/manifests/pod-disruption-budgets.yaml +++ b/deploy-k3s/manifests/pod-disruption-budgets.yaml @@ -1,6 +1,6 @@ # Pod Disruption Budgets — prevent node maintenance from killing all replicas # API: at least 2 of 3 replicas must stay up during voluntary disruptions -# Worker: at least 1 of 2 replicas must stay up +# Worker: singleton (Asynq scheduler) — must allow drain, minAvailable: 0 apiVersion: policy/v1 kind: PodDisruptionBudget @@ -26,7 +26,7 @@ metadata: app.kubernetes.io/name: worker app.kubernetes.io/part-of: honeydue spec: - minAvailable: 1 + minAvailable: 0 selector: matchLabels: app.kubernetes.io/name: worker diff --git a/deploy-k3s/manifests/traefik-helmchartconfig.yaml b/deploy-k3s/manifests/traefik-helmchartconfig.yaml new file mode 100644 index 0000000..cc020ad --- /dev/null +++ b/deploy-k3s/manifests/traefik-helmchartconfig.yaml @@ -0,0 +1,53 @@ +# Traefik reconfiguration for this deployment. +# +# K3s defaults: Traefik as single-replica Deployment, LoadBalancer service. +# We disabled servicelb (--disable=servicelb on k3s install), so LoadBalancer +# doesn't get an external IP. This config makes Traefik a DaemonSet binding +# directly on each node's public :80/:443 via hostNetwork, matching our +# Cloudflare DNS round-robin across 3 node IPs. 
+# +# Apply: kubectl apply -f traefik-helmchartconfig.yaml +# Then bump Helm reconcile: kubectl delete job -n kube-system helm-install-traefik +apiVersion: helm.cattle.io/v1 +kind: HelmChartConfig +metadata: + name: traefik + namespace: kube-system +spec: + valuesContent: |- + deployment: + kind: DaemonSet + hostNetwork: true + service: + enabled: false + ports: + web: + port: 80 + hostPort: 80 + websecure: + port: 443 + hostPort: 443 + # hostNetwork with port 80/443 requires RollingUpdate maxUnavailable > 0 + # (each node's port is held by one pod; can't surge). + updateStrategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 1 + maxSurge: 0 + securityContext: + capabilities: + drop: [ALL] + add: [NET_BIND_SERVICE] + readOnlyRootFilesystem: true + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 + # NOTE: The host-level sysctl `net.ipv4.ip_unprivileged_port_start=0` + # must be set on each node. Without it, hostNetwork pods can't actually + # use NET_BIND_SERVICE to bind :80/:443. Persisted at + # /etc/sysctl.d/99-unprivileged-ports.conf on each node. + additionalArguments: + # Trust Cloudflare's forwarded proto header so the Go app sees the + # original https scheme even though CF→origin is plain HTTP. + # IP ranges from https://www.cloudflare.com/ips-v4/ (as of 2026-04). 
+ - "--entrypoints.web.forwardedHeaders.trustedIPs=173.245.48.0/20,103.21.244.0/22,103.22.200.0/22,103.31.4.0/22,141.101.64.0/18,108.162.192.0/18,190.93.240.0/20,188.114.96.0/20,197.234.240.0/22,198.41.128.0/17,162.158.0.0/15,104.16.0.0/13,104.24.0.0/14,172.64.0.0/13,131.0.72.0/22" diff --git a/deploy-k3s/manifests/worker/deployment.yaml b/deploy-k3s/manifests/worker/deployment.yaml new file mode 100644 index 0000000..82e4d48 --- /dev/null +++ b/deploy-k3s/manifests/worker/deployment.yaml @@ -0,0 +1,105 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: worker + namespace: honeydue + labels: + app.kubernetes.io/name: worker + app.kubernetes.io/part-of: honeydue +spec: + # Asynq's Scheduler is a singleton — running >1 replica fires every cron + # task once per replica (duplicate daily digests, onboarding emails, etc.). + # Keep at 1 until asynq.PeriodicTaskManager with Redis leader election is + # wired in cmd/worker/main.go. + replicas: 1 + strategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 0 + maxSurge: 1 + selector: + matchLabels: + app.kubernetes.io/name: worker + template: + metadata: + labels: + app.kubernetes.io/name: worker + app.kubernetes.io/part-of: honeydue + spec: + serviceAccountName: worker + imagePullSecrets: + - name: ghcr-credentials + securityContext: + runAsNonRoot: true + runAsUser: 1000 + runAsGroup: 1000 + fsGroup: 1000 + seccompProfile: + type: RuntimeDefault + containers: + - name: worker + image: IMAGE_PLACEHOLDER # Replaced by 03-deploy.sh + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: ["ALL"] + envFrom: + - configMapRef: + name: honeydue-config + env: + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: honeydue-secrets + key: POSTGRES_PASSWORD + - name: SECRET_KEY + valueFrom: + secretKeyRef: + name: honeydue-secrets + key: SECRET_KEY + - name: EMAIL_HOST_PASSWORD + valueFrom: + secretKeyRef: + name: honeydue-secrets + key: 
EMAIL_HOST_PASSWORD + - name: FCM_SERVER_KEY + valueFrom: + secretKeyRef: + name: honeydue-secrets + key: FCM_SERVER_KEY + - name: REDIS_PASSWORD + valueFrom: + secretKeyRef: + name: honeydue-secrets + key: REDIS_PASSWORD + optional: true + volumeMounts: + - name: apns-key + mountPath: /secrets/apns + readOnly: true + - name: tmp + mountPath: /tmp + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 512Mi + livenessProbe: + exec: + command: ["pgrep", "-f", "/app/worker"] + initialDelaySeconds: 15 + periodSeconds: 30 + timeoutSeconds: 5 + volumes: + - name: apns-key + secret: + secretName: honeydue-apns-key + items: + - key: apns_auth_key.p8 + path: apns_auth_key.p8 + - name: tmp + emptyDir: + sizeLimit: 64Mi diff --git a/deploy/Caddyfile b/deploy/Caddyfile new file mode 100644 index 0000000..624885e --- /dev/null +++ b/deploy/Caddyfile @@ -0,0 +1,52 @@ +# honeyDue edge proxy — terminates HTTP from Cloudflare, routes by Host header. +# +# Cloudflare is in front, SSL mode "Flexible" — CF terminates TLS at the edge +# and talks to this origin over plain HTTP on port 80. No LE certs needed here +# for now. Later, to go "Full (strict)", remove `auto_https off`, add `tls` blocks +# that use the ACME HTTP-01 challenge, and open 443 on the node. + +{ + admin off + auto_https off +} + +# api.myhoneydue.com → Go REST API +# `dynamic a` re-resolves the Swarm service DNS every 30s instead of caching +# the IP forever at config parse. This is critical on Swarm with endpoint_mode: +# dnsrr — when a task restarts, its overlay IP changes, and static DNS caching +# leaves Caddy dialing dead IPs. 
+api.myhoneydue.com:80 { + reverse_proxy { + dynamic a { + name api + port 8000 + refresh 30s + } + header_up X-Forwarded-Proto {http.request.header.X-Forwarded-Proto} + } +} + +# admin.myhoneydue.com → Next.js admin panel via overlay DNS (VIP endpoint) +# +# This relies on Swarm's embedded resolver, which has a known libnetwork +# stale-record bug (moby/moby#52265, affects 29.x). We work around it by +# (a) using default VIP endpoint_mode — a stable service IP — and +# (b) running a clean overlay from scratch (see Phase 1 stack recreate). +# +# If ghosts come back, the long-term fix is Traefik w/ Swarm provider that +# reads task IPs from Docker API, bypassing libnetwork DNS entirely. See +# deploy/MIGRATION_NOTES.md for the Traefik migration plan. +admin.myhoneydue.com:80 { + reverse_proxy admin:3000 { + lb_try_duration 3s + lb_try_interval 250ms + header_up X-Forwarded-Proto {http.request.header.X-Forwarded-Proto} + } +} + +# Catch-all for root/unknown hostnames hitting our IPs directly. +# Cloudflare SSL=Flexible will still hit us on :80 for myhoneydue.com; return +# a placeholder until you wire a real marketing site. 
+:80 { + respond "honeyDue" 200 +} diff --git a/deploy/scripts/deploy_prod.sh b/deploy/scripts/deploy_prod.sh index 52504a1..7621695 100755 --- a/deploy/scripts/deploy_prod.sh +++ b/deploy/scripts/deploy_prod.sh @@ -248,13 +248,15 @@ Images that would be built and pushed: ${ADMIN_IMAGE} Replicas: + caddy: 3 (one per node) api: ${API_REPLICAS:-3} - worker: ${WORKER_REPLICAS:-2} + worker: ${WORKER_REPLICAS:-1} admin: ${ADMIN_REPLICAS:-1} Published ports: - api: ${API_PORT:-8000} (ingress) - admin: ${ADMIN_PORT:-3000} (ingress) + caddy: 80, 443 (ingress — public) + api: internal only (proxied by caddy) + admin: internal only (proxied by caddy) dozzle: ${DOZZLE_PORT:-9999} (manager loopback only — SSH tunnel required) Versioned secrets that would be created on this deploy: @@ -264,6 +266,9 @@ Versioned secrets that would be created on this deploy: ${DEPLOY_STACK_NAME}_fcm_server_key_ ${DEPLOY_STACK_NAME}_apns_auth_key_ +Versioned configs that would be created on this deploy: + ${DEPLOY_STACK_NAME}_caddyfile_ + No changes made. Re-run without DRY_RUN=1 to deploy. ================================================= @@ -289,27 +294,54 @@ if [[ "${SKIP_BUILD}" != "1" ]]; then log "Logging in to ${REGISTRY}" printf '%s' "${REGISTRY_TOKEN}" | docker login "${REGISTRY}" -u "${REGISTRY_USERNAME}" --password-stdin >/dev/null - log "Building API image ${API_IMAGE}" - docker build --target api -t "${API_IMAGE}" "${REPO_DIR}" - log "Building Worker image ${WORKER_IMAGE}" - docker build --target worker -t "${WORKER_IMAGE}" "${REPO_DIR}" - log "Building Admin image ${ADMIN_IMAGE}" - docker build --target admin -t "${ADMIN_IMAGE}" "${REPO_DIR}" + # Target platform for Swarm nodes. Hetzner CX is x86_64; override via + # BUILD_PLATFORM=linux/arm64 if you move to ARM (Ampere) hosts. 
+ BUILD_PLATFORM="${BUILD_PLATFORM:-linux/amd64}" + log "Build platform: ${BUILD_PLATFORM} (dev host may differ; buildx cross-compiles)" - log "Pushing deploy images" - docker push "${API_IMAGE}" - docker push "${WORKER_IMAGE}" - docker push "${ADMIN_IMAGE}" - - if [[ "${PUSH_LATEST_TAG}" == "true" ]]; then - log "Updating :latest tags" - docker tag "${API_IMAGE}" "${REGISTRY_PREFIX}/honeydue-api:latest" - docker tag "${WORKER_IMAGE}" "${REGISTRY_PREFIX}/honeydue-worker:latest" - docker tag "${ADMIN_IMAGE}" "${REGISTRY_PREFIX}/honeydue-admin:latest" - docker push "${REGISTRY_PREFIX}/honeydue-api:latest" - docker push "${REGISTRY_PREFIX}/honeydue-worker:latest" - docker push "${REGISTRY_PREFIX}/honeydue-admin:latest" + # Ensure a buildx builder exists and is usable + if ! docker buildx inspect honeydue-builder >/dev/null 2>&1; then + log "Creating buildx builder 'honeydue-builder'" + docker buildx create --name honeydue-builder --use >/dev/null + else + docker buildx use honeydue-builder >/dev/null fi + docker buildx inspect --bootstrap >/dev/null + + build_and_push() { + local target="$1" + local image="$2" + shift 2 + + local tag_args=(-t "${image}") + while (( $# > 0 )); do + tag_args+=(-t "$1") + shift + done + + log "Building + pushing ${target} image for ${BUILD_PLATFORM}: ${image}" + docker buildx build \ + --platform "${BUILD_PLATFORM}" \ + --target "${target}" \ + "${tag_args[@]}" \ + --push \ + "${REPO_DIR}" + } + + api_extra=() + worker_extra=() + admin_extra=() + if [[ "${PUSH_LATEST_TAG}" == "true" ]]; then + api_extra=("${REGISTRY_PREFIX}/honeydue-api:latest") + worker_extra=("${REGISTRY_PREFIX}/honeydue-worker:latest") + admin_extra=("${REGISTRY_PREFIX}/honeydue-admin:latest") + fi + + # ${arr[@]+"${arr[@]}"} safely expands to nothing when the array is empty + # under `set -u` — avoids "unbound variable" on bash arrays. 
+ build_and_push api "${API_IMAGE}" ${api_extra[@]+"${api_extra[@]}"} + build_and_push worker "${WORKER_IMAGE}" ${worker_extra[@]+"${worker_extra[@]}"} + build_and_push admin "${ADMIN_IMAGE}" ${admin_extra[@]+"${admin_extra[@]}"} else warn "SKIP_BUILD=1 set. Using prebuilt images for tag: ${DEPLOY_TAG}" fi @@ -322,6 +354,12 @@ SECRET_KEY_SECRET="${DEPLOY_STACK_NAME}_secret_key_${DEPLOY_ID}" EMAIL_HOST_PASSWORD_SECRET="${DEPLOY_STACK_NAME}_email_host_password_${DEPLOY_ID}" FCM_SERVER_KEY_SECRET="${DEPLOY_STACK_NAME}_fcm_server_key_${DEPLOY_ID}" APNS_AUTH_KEY_SECRET="${DEPLOY_STACK_NAME}_apns_auth_key_${DEPLOY_ID}" +CADDYFILE_CONFIG="${DEPLOY_STACK_NAME}_caddyfile_${DEPLOY_ID}" + +CADDYFILE_SRC="${DEPLOY_DIR}/Caddyfile" +if [[ ! -f "${CADDYFILE_SRC}" ]]; then + die "Missing required file: ${CADDYFILE_SRC}" +fi TMP_DIR="$(mktemp -d)" cleanup() { @@ -332,6 +370,7 @@ trap cleanup EXIT cp "${STACK_TEMPLATE}" "${TMP_DIR}/swarm-stack.prod.yml" cp "${PROD_ENV}" "${TMP_DIR}/prod.env" cp "${REGISTRY_ENV}" "${TMP_DIR}/registry.env" +cp "${CADDYFILE_SRC}" "${TMP_DIR}/Caddyfile" mkdir -p "${TMP_DIR}/secrets" cp "${SECRET_POSTGRES}" "${TMP_DIR}/secrets/postgres_password.txt" cp "${SECRET_APP_KEY}" "${TMP_DIR}/secrets/secret_key.txt" @@ -356,6 +395,7 @@ SECRET_KEY_SECRET=${SECRET_KEY_SECRET} EMAIL_HOST_PASSWORD_SECRET=${EMAIL_HOST_PASSWORD_SECRET} FCM_SERVER_KEY_SECRET=${FCM_SERVER_KEY_SECRET} APNS_AUTH_KEY_SECRET=${APNS_AUTH_KEY_SECRET} +CADDYFILE_CONFIG=${CADDYFILE_CONFIG} EOF log "Uploading deploy bundle to ${SSH_TARGET}:${DEPLOY_REMOTE_DIR}" @@ -364,6 +404,7 @@ scp "${SCP_OPTS[@]}" "${TMP_DIR}/swarm-stack.prod.yml" "${SSH_TARGET}:${DEPLOY_R scp "${SCP_OPTS[@]}" "${TMP_DIR}/prod.env" "${SSH_TARGET}:${DEPLOY_REMOTE_DIR}/prod.env" scp "${SCP_OPTS[@]}" "${TMP_DIR}/registry.env" "${SSH_TARGET}:${DEPLOY_REMOTE_DIR}/registry.env" scp "${SCP_OPTS[@]}" "${TMP_DIR}/runtime.env" "${SSH_TARGET}:${DEPLOY_REMOTE_DIR}/runtime.env" +scp "${SCP_OPTS[@]}" "${TMP_DIR}/Caddyfile" 
"${SSH_TARGET}:${DEPLOY_REMOTE_DIR}/Caddyfile" scp "${SCP_OPTS[@]}" "${TMP_DIR}/secrets/postgres_password.txt" "${SSH_TARGET}:${DEPLOY_REMOTE_DIR}/secrets/postgres_password.txt" scp "${SCP_OPTS[@]}" "${TMP_DIR}/secrets/secret_key.txt" "${SSH_TARGET}:${DEPLOY_REMOTE_DIR}/secrets/secret_key.txt" scp "${SCP_OPTS[@]}" "${TMP_DIR}/secrets/email_host_password.txt" "${SSH_TARGET}:${DEPLOY_REMOTE_DIR}/secrets/email_host_password.txt" @@ -397,6 +438,17 @@ create_secret() { fi } +create_config() { + local name="$1" + local src="$2" + if docker config inspect "${name}" >/dev/null 2>&1; then + echo "[remote] config exists: ${name}" + else + docker config create "${name}" "${src}" >/dev/null + echo "[remote] created config: ${name}" + fi +} + printf '%s' "${REGISTRY_TOKEN}" | docker login "${REGISTRY}" -u "${REGISTRY_USERNAME}" --password-stdin >/dev/null rm -f "${REMOTE_DIR}/registry.env" @@ -406,6 +458,8 @@ create_secret "${EMAIL_HOST_PASSWORD_SECRET}" "${REMOTE_DIR}/secrets/email_host_ create_secret "${FCM_SERVER_KEY_SECRET}" "${REMOTE_DIR}/secrets/fcm_server_key.txt" create_secret "${APNS_AUTH_KEY_SECRET}" "${REMOTE_DIR}/secrets/apns_auth_key.p8" +create_config "${CADDYFILE_CONFIG}" "${REMOTE_DIR}/Caddyfile" + rm -f "${REMOTE_DIR}/secrets/postgres_password.txt" rm -f "${REMOTE_DIR}/secrets/secret_key.txt" rm -f "${REMOTE_DIR}/secrets/email_host_password.txt" @@ -455,18 +509,22 @@ while true; do sleep 10 done -log "Pruning old secret versions (keeping last ${SECRET_KEEP_VERSIONS})" -ssh "${SSH_OPTS[@]}" "${SSH_TARGET}" "bash -s -- '${DEPLOY_STACK_NAME}' '${SECRET_KEEP_VERSIONS}'" <<'EOF' || warn "Secret pruning reported errors (non-fatal)" +log "Pruning old secret + config versions (keeping last ${SECRET_KEEP_VERSIONS})" +ssh "${SSH_OPTS[@]}" "${SSH_TARGET}" "bash -s -- '${DEPLOY_STACK_NAME}' '${SECRET_KEEP_VERSIONS}'" <<'EOF' || warn "Pruning reported errors (non-fatal)" set -euo pipefail STACK_NAME="$1" KEEP="$2" prune_prefix() { - local prefix="$1" - # List matching 
secrets with creation time, sorted newest-first. + local kind="$1" # "secret" or "config" + local prefix="$2" + local ls_cmd rm_cmd + ls_cmd="docker ${kind} ls --format '{{.CreatedAt}}|{{.Name}}'" + rm_cmd="docker ${kind} rm" + local all - all="$(docker secret ls --format '{{.CreatedAt}}|{{.Name}}' 2>/dev/null \ + all="$(eval "${ls_cmd}" 2>/dev/null \ | grep "|${prefix}_" \ | sort -r \ || true)" @@ -477,7 +535,7 @@ prune_prefix() { local total total="$(printf '%s\n' "${all}" | wc -l | tr -d ' ')" if (( total <= KEEP )); then - echo "[cleanup] ${prefix}: ${total} version(s) — nothing to prune" + echo "[cleanup] ${kind}/${prefix}: ${total} version(s) — nothing to prune" return 0 fi @@ -486,16 +544,20 @@ prune_prefix() { while IFS= read -r name; do [[ -z "${name}" ]] && continue - if docker secret rm "${name}" >/dev/null 2>&1; then - echo "[cleanup] removed: ${name}" + if ${rm_cmd} "${name}" >/dev/null 2>&1; then + echo "[cleanup] removed ${kind}: ${name}" else - echo "[cleanup] in-use (kept): ${name}" + echo "[cleanup] in-use ${kind} (kept): ${name}" fi done <<< "${to_remove}" } for base in postgres_password secret_key email_host_password fcm_server_key apns_auth_key; do - prune_prefix "${STACK_NAME}_${base}" + prune_prefix secret "${STACK_NAME}_${base}" +done + +for base in caddyfile; do + prune_prefix config "${STACK_NAME}_${base}" done EOF diff --git a/deploy/swarm-stack.prod.yml b/deploy/swarm-stack.prod.yml index 7f41a12..506ef85 100644 --- a/deploy/swarm-stack.prod.yml +++ b/deploy/swarm-stack.prod.yml @@ -1,6 +1,59 @@ version: "3.8" services: + # Edge reverse proxy — the only service publishing :80/:443 publicly. + # Routes by Host header to internal `api` and `admin` services over the + # overlay network. Runs one replica per node via ingress mesh, so any node + # can terminate incoming traffic. 
+ caddy: + image: caddy:2-alpine + ports: + - target: 80 + published: 80 + protocol: tcp + mode: ingress + - target: 443 + published: 443 + protocol: tcp + mode: ingress + configs: + - source: caddyfile + target: /etc/caddy/Caddyfile + mode: 0444 + volumes: + - caddy_data:/data + - caddy_config:/config + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1/"] + interval: 30s + timeout: 5s + retries: 3 + start_period: 10s + deploy: + replicas: 3 + restart_policy: + condition: any + delay: 5s + update_config: + parallelism: 1 + delay: 10s + order: start-first + rollback_config: + parallelism: 1 + delay: 5s + order: stop-first + placement: + max_replicas_per_node: 1 + resources: + limits: + cpus: "0.25" + memory: 128M + reservations: + cpus: "0.05" + memory: 32M + networks: + - honeydue-network + redis: image: redis:7-alpine command: redis-server --appendonly yes --appendfsync everysec --maxmemory 200mb --maxmemory-policy allkeys-lru @@ -30,11 +83,8 @@ services: api: image: ${API_IMAGE} - ports: - - target: 8000 - published: ${API_PORT} - protocol: tcp - mode: ingress + # No `ports:` block — Caddy edge service proxies to api:8000 over the + # overlay network. Port 8000 is never publicly exposed. 
environment: PORT: "8000" DEBUG: "${DEBUG}" @@ -104,6 +154,10 @@ services: APPLE_IAP_SANDBOX: "${APPLE_IAP_SANDBOX}" GOOGLE_IAP_SERVICE_ACCOUNT_PATH: "${GOOGLE_IAP_SERVICE_ACCOUNT_PATH}" GOOGLE_IAP_PACKAGE_NAME: "${GOOGLE_IAP_PACKAGE_NAME}" + + # Seeded on first migration (idempotent — skipped if admin_users row exists) + ADMIN_EMAIL: "${ADMIN_EMAIL}" + ADMIN_PASSWORD: "${ADMIN_PASSWORD}" stop_grace_period: 60s command: - /bin/sh @@ -116,15 +170,15 @@ services: export FCM_SERVER_KEY="$$(cat /run/secrets/fcm_server_key)" exec /app/api secrets: - - source: ${POSTGRES_PASSWORD_SECRET} + - source: postgres_password target: postgres_password - - source: ${SECRET_KEY_SECRET} + - source: secret_key target: secret_key - - source: ${EMAIL_HOST_PASSWORD_SECRET} + - source: email_host_password target: email_host_password - - source: ${FCM_SERVER_KEY_SECRET} + - source: fcm_server_key target: fcm_server_key - - source: ${APNS_AUTH_KEY_SECRET} + - source: apns_auth_key target: apns_auth_key volumes: - uploads:/app/uploads @@ -132,10 +186,18 @@ services: test: ["CMD", "curl", "-f", "http://127.0.0.1:8000/api/health/"] interval: 30s timeout: 10s - start_period: 15s + # Single-replica AutoMigrate on a fresh DB takes ~90s; subsequent + # replicas are ~2s (idempotent). 180s gives honest headroom for the + # first replica to finish, without masking cascade failures. + start_period: 180s retries: 3 deploy: replicas: ${API_REPLICAS} + # DNS round-robin instead of VIP. VIP's kernel IPVS state can go stale + # during replica churn (rolling updates, task restarts), causing + # intermittent i/o timeouts from clients on the overlay network (Caddy). + # dnsrr resolves to live task IPs directly and bypasses IPVS. 
+ endpoint_mode: dnsrr restart_policy: condition: any delay: 5s @@ -159,11 +221,8 @@ services: admin: image: ${ADMIN_IMAGE} - ports: - - target: 3000 - published: ${ADMIN_PORT} - protocol: tcp - mode: ingress + # No `ports:` block — reached via Caddy on admin.myhoneydue.com using + # Swarm's embedded DNS and default VIP endpoint_mode. environment: PORT: "3000" HOSTNAME: "0.0.0.0" @@ -248,15 +307,15 @@ services: export FCM_SERVER_KEY="$$(cat /run/secrets/fcm_server_key)" exec /app/worker secrets: - - source: ${POSTGRES_PASSWORD_SECRET} + - source: postgres_password target: postgres_password - - source: ${SECRET_KEY_SECRET} + - source: secret_key target: secret_key - - source: ${EMAIL_HOST_PASSWORD_SECRET} + - source: email_host_password target: email_host_password - - source: ${FCM_SERVER_KEY_SECRET} + - source: fcm_server_key target: fcm_server_key - - source: ${APNS_AUTH_KEY_SECRET} + - source: apns_auth_key target: apns_auth_key healthcheck: test: ["CMD", "curl", "-f", "http://127.0.0.1:6060/health"] @@ -293,12 +352,11 @@ services: # ssh -L ${DOZZLE_PORT}:127.0.0.1:${DOZZLE_PORT} # Then browse http://localhost:${DOZZLE_PORT} image: amir20/dozzle:latest + # Bind to loopback only on the manager. Swarm's long-form port spec + # rejects `host_ip`, so we use the short form — 127.0.0.1:${DOZZLE_PORT}:8080.
+ # Access via SSH tunnel: ssh -L ${DOZZLE_PORT}:127.0.0.1:${DOZZLE_PORT} ports: - - target: 8080 - published: ${DOZZLE_PORT} - protocol: tcp - mode: host - host_ip: 127.0.0.1 + - "127.0.0.1:${DOZZLE_PORT}:8080" environment: DOZZLE_NO_ANALYTICS: "true" volumes: @@ -324,6 +382,8 @@ services: volumes: redis_data: uploads: + caddy_data: + caddy_config: networks: honeydue-network: @@ -331,6 +391,11 @@ networks: driver_opts: encrypted: "true" +configs: + caddyfile: + external: true + name: ${CADDYFILE_CONFIG} + secrets: postgres_password: external: true diff --git a/docs/deployment/00-overview.md b/docs/deployment/00-overview.md new file mode 100644 index 0000000..2448e8d --- /dev/null +++ b/docs/deployment/00-overview.md @@ -0,0 +1,240 @@ +# 00 — Overview + +## Summary + +honeyDue runs on a three-node Kubernetes cluster managed by K3s, fronted by +Cloudflare, and backed by a managed Postgres (Neon), S3-compatible object +storage (Backblaze B2), and a self-hosted container registry (Gitea). The +application consists of a Go REST API, a Next.js admin panel, and a +background worker process using Redis-backed queues. Traefik handles HTTP +ingress and host-based routing. The whole stack fits in about 1 GB of RAM +across the three nodes with plenty of headroom. + +This chapter is the map. Everything here is expanded in a later chapter. + +## Architecture at a glance + +```mermaid +flowchart TB + subgraph Internet + Browser[End-user browser / mobile client] + end + + subgraph CF[Cloudflare] + CFEdge[Edge POP
TLS terminates here] + end + + Browser -- HTTPS :443 --> CFEdge + + subgraph Hetzner[Hetzner Cloud — Nuremberg nbg1] + direction LR + subgraph H1[hetzner1
178.104.247.152] + T1[Traefik
:80/:443 hostNet] + A1[api pod] + W1[worker pod] + end + subgraph H2[hetzner2
178.105.32.198] + T2[Traefik
:80/:443 hostNet] + A2[api pod] + R1[redis pod
PVC] + end + subgraph H3[hetzner3
178.104.249.189] + T3[Traefik
:80/:443 hostNet] + A3[api pod] + AD1[admin pod] + end + end + + CFEdge -- HTTP :80
DNS round-robin --> T1 + CFEdge -- HTTP :80 --> T2 + CFEdge -- HTTP :80 --> T3 + + T1 & T2 & T3 -.Ingress routes by
Host header.-> A1 + T1 & T2 & T3 -.-> AD1 + A1 & A2 & A3 -.-> R1 + + subgraph External[Managed services] + Neon[(Neon Postgres
AWS us-east-1)] + B2[(Backblaze B2
us-east-005)] + FM[Fastmail SMTP] + Gitea[Gitea Registry
gitea.treytartt.com] + end + + A1 & A2 & A3 -- SSL --> Neon + W1 -- SSL --> Neon + A1 & A2 & A3 -- HTTPS --> B2 + W1 -- SMTP :587 --> FM + H1 & H2 & H3 -. image pull .-> Gitea +``` + +### ASCII fallback + +``` + ┌─────────────────────┐ + │ End user │ + └──────────┬──────────┘ + │ HTTPS :443 + ▼ + ┌─────────────────────┐ + │ Cloudflare edge │ TLS terminates here + │ (SSL = Flexible) │ + └──────────┬──────────┘ + HTTP :80 round-robin + ┌─────────────┼─────────────┐ + ▼ ▼ ▼ + ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ + │ hetzner1 │ │ hetzner2 │ │ hetzner3 │ + │ 178.104.247.152 │ │ 178.105.32.198 │ │ 178.104.249.189 │ + │ Traefik :80/443 │ │ Traefik :80/443 │ │ Traefik :80/443 │ + │ api worker │ │ api redis │ │ api admin │ + └─────────┬───────┘ └─────────┬───────┘ └─────────┬───────┘ + │ │ │ + └──────── Kubernetes overlay ───────────┘ + │ + ┌─────────────────────────────┴──────────────────────────────┐ + │ │ + ▼ ▼ ▼ ▼ +┌─────────┐ ┌─────────────┐ ┌──────────┐ ┌───────────────┐ +│ Neon │ │ Backblaze B2│ │ Fastmail │ │ Gitea Registry│ +│Postgres │ │ uploads │ │ SMTP │ │ image pull │ +└─────────┘ └─────────────┘ └──────────┘ └───────────────┘ +``` + +## The stack, one layer at a time + +### Layer 0 — Hardware + +Three Hetzner Cloud CX33 instances (4 vCPU, 8 GB RAM, 80 GB NVMe SSD) in +Hetzner's Nuremberg (nbg1) datacenter. Each node is $7.99/mo (April 2026 +pricing), totaling ~$24/mo. See [Chapter 1](./01-infrastructure.md). + +### Layer 1 — Operating system + +Ubuntu 24.04.3 LTS. Each node has: +- SSH on port 22, key-only auth, `deploy` user with NOPASSWD sudo +- `ufw` firewall with strict default-deny-incoming; specific ports allowed + per Chapter 4 +- Sysctl override `net.ipv4.ip_unprivileged_port_start=0` so non-root + containers can bind privileged ports (needed for Traefik to serve :80/:443) + +### Layer 2 — Container runtime + +`containerd` v2.2.2 (bundled with K3s). Docker was previously installed from +the Swarm era but is now disabled. 
containerd is Kubernetes' reference +runtime and has a smaller footprint than Docker's full stack. + +### Layer 3 — Orchestrator + +K3s v1.34.6 in HA mode. All 3 nodes are `control-plane,etcd` (a 3-member Raft +cluster with a quorum of 2 — can tolerate one node failure). K3s is a minimal Kubernetes +distribution from Rancher Labs (now part of SUSE): single-binary, embedded etcd +instead of a separate etcd cluster, sane defaults for small installations. +See [Chapter 2](./02-orchestrator-choice.md) for why K3s over full Kubernetes +or Docker Swarm. + +### Layer 4 — Cluster networking + +- **Flannel VXLAN** for pod-to-pod overlay (default on K3s). VXLAN tunnels + pod traffic over UDP port 8472 between nodes. +- **CoreDNS** for service discovery (pods reach each other by service name, + e.g. `api` or `redis`). +- **kube-proxy** in IPVS mode for ClusterIP → pod routing. + +[Chapter 3](./03-networking.md) walks through a single request to show +every hop. + +### Layer 5 — Ingress + +**Traefik v3** as a DaemonSet with `hostNetwork: true`. Each node has a +Traefik pod that binds directly to the node's public :80 and :443. No +`servicelb`, no Hetzner Load Balancer — Cloudflare round-robins the three +node IPs in DNS and any node can serve any request. See +[Chapter 6](./06-traefik-ingress.md). + +### Layer 6 — Edge / CDN + +Cloudflare Free plan. Proxied A records for `api.myhoneydue.com`, +`admin.myhoneydue.com`, and `myhoneydue.com` each point at all three node +IPs. Edge handles TLS termination (SSL=Flexible), DDoS protection, caching +for static assets, and traffic failover if a node becomes unreachable. +See [Chapter 13](./13-cloudflare.md).
+ +### Layer 7 — Application services + +| Service | Type | Replicas | Image | +|---|---|---|---| +| `api` | Go (Echo, GORM) | 3 | `gitea.treytartt.com/admin/honeydue-api:<git-short-sha>` | +| `admin` | Next.js 16 | 1 | `gitea.treytartt.com/admin/honeydue-admin:<git-short-sha>` | +| `worker` | Go (Asynq) | 1 | `gitea.treytartt.com/admin/honeydue-worker:<git-short-sha>` | +| `redis` | Redis 7 | 1 | `redis:7-alpine` (Docker Hub) | + +See [Chapter 7](./07-services.md). + +### Layer 8 — External dependencies + +- **Neon Postgres** (Launch plan) — `honeyDue` database +- **Backblaze B2** — `honeyDueProd` bucket for user uploads +- **Fastmail SMTP** — transactional email +- **Gitea** (self-hosted at `gitea.treytartt.com`) — container registry +- **Cloudflare** — DNS, TLS, CDN + +See [Chapter 8](./08-database.md), [9](./09-storage.md), and +[11](./11-registry.md). + +## What's deliberately absent + +- **TLS at origin.** Cloudflare terminates TLS at the edge and talks HTTP + on port 80 to the nodes. This is "Flexible SSL" in Cloudflare terminology. + It's the simplest setup; we have a TODO to upgrade to "Full (strict)" with + Cloudflare Origin CA certs ([Chapter 13](./13-cloudflare.md), §Future). +- **Hetzner Load Balancer.** We save the $8.49/mo by having Cloudflare + round-robin across node IPs directly. If any node is unresponsive, + Cloudflare's own origin health checks will route around it within 30s. +- **Push notifications.** APNs (iOS) and FCM (Android) are *configured off* + until we have Apple Developer / Google Play accounts. The env vars are + set to sentinel values that let the Go app boot; `FEATURE_PUSH_ENABLED=false` + gates all call sites. +- **External metrics/monitoring (Prometheus, Grafana, Betterstack).** + Right now we rely on `kubectl logs`, `kubectl top`, and Cloudflare's own + analytics. See [Chapter 15](./15-observability.md) for what's there and + what we'd add. +- **Automated backups of Redis state.** Redis is configured with AOF + (append-only file) persistence, but the PVC is only on one node.
Redis + holds only cache + Asynq queue state; losing it re-populates on first + request / next cron tick. Not critical. +- **Admin panel basic auth (Traefik middleware).** In-app admin login is + enabled; the extra Traefik-layer basic auth the scaffold supports is not + currently attached. + +## The deployment pipeline in one paragraph + +Changes to application code are built on your workstation by +`docker buildx build --platform linux/amd64 --push`, which cross-compiles +from arm64 (Apple Silicon) to amd64 (Hetzner nodes) and pushes directly to +`gitea.treytartt.com`. Manifests live in `deploy-k3s/manifests/`; they +reference image tags by git short SHA. `kubectl apply -f` rolls the new +image in with `maxUnavailable: 0, maxSurge: 1` — one new pod at a time, +old one stays up until new is healthy. Service discovery by Kubernetes +DNS means `api` and `admin` hostnames always resolve to live backing pods; +traffic shifts the moment a new pod passes its readiness probe. +[Chapter 14](./14-deployment-process.md) walks through a complete deploy. + +## What we *used* to have (the short version) + +Up until 2026-04-24 this stack ran on **Docker Swarm** on the same three +Hetzner boxes. It worked, but the Docker libnetwork service-discovery +layer has a bug in the 29.x line ([moby/moby#52265][moby-52265]) that +leaves stale DNS A-records behind when tasks migrate between nodes. We +hit it: the admin panel returned 502s for ~50% of requests through +Cloudflare because Caddy (our previous reverse proxy) was dialing a ghost +IP that had since been recycled to the Dozzle log viewer. We spent four +hours trying increasingly clever workarounds (dnsrr vs VIP, +`dynamic a` DNS refresh, global mode, host-mode ports, host.docker.internal, +hardcoded node IPs) before concluding that libnetwork state corruption +survives every non-nuclear fix. + +The full autopsy is in [Chapter 19 — Swarm Postmortem](./19-postmortem-swarm.md). 
+K3s uses CoreDNS and has no libnetwork history; the bug class doesn't +exist there. + +[moby-52265]: https://github.com/moby/moby/issues/52265 diff --git a/docs/deployment/01-infrastructure.md b/docs/deployment/01-infrastructure.md new file mode 100644 index 0000000..72a3444 --- /dev/null +++ b/docs/deployment/01-infrastructure.md @@ -0,0 +1,294 @@ +# 01 — Infrastructure + +## Summary + +Three Hetzner Cloud CX33 virtual machines in the Nuremberg (nbg1) datacenter +form the compute foundation. Each is a 4 vCPU / 8 GB RAM / 80 GB NVMe SSD +instance on Hetzner's shared-CPU "Cloud" line. Total compute cost is +$23.97/mo. This chapter explains each node spec in detail, why we picked +Hetzner and this tier specifically, and the rejected alternatives. + +## Node specifications + +All three nodes are identical. Specs per node: + +| Spec | Value | +|---|---| +| Provider | Hetzner Cloud (`www.hetzner.com/cloud`) | +| Instance type | CX33 (shared-CPU line) | +| vCPU | 4 | +| RAM | 8 GB | +| Disk | 80 GB NVMe SSD | +| Network | 20 TB/mo outbound included | +| IPv4 address | Public dedicated | +| IPv6 address | /64 subnet | +| Region | `nbg1` (Nuremberg, Germany) | +| OS | Ubuntu 24.04.3 LTS (HWE kernel 6.8.0-90-generic) | +| Price | **$7.99/mo** (April 2026) ⁽¹⁾ | + +⁽¹⁾ Hetzner applied a price adjustment on 2026-04-01 — CX33 went from +~$6.59 to $7.99. See [Hetzner price adjustment announcement][hetzner-prices]. + +### The three nodes + +| SSH alias | Public IPv4 | IPv6 | k3s hostname | +|---|---|---|---| +| `hetzner1` | 178.104.247.152 | `2a01:4f8:1c18:79c7::1` | `ubuntu-8gb-nbg1-2` | +| `hetzner2` | 178.105.32.198 | `2a01:4f8:1c18:5ecf::1` | `ubuntu-8gb-nbg1-1` | +| `hetzner3` | 178.104.249.189 | `2a01:4f8:1c18:241a::1` | `ubuntu-8gb-nbg1-3` | + +**Naming quirk.** The SSH-alias numbers and the Hetzner-assigned hostname +numbers do not match (`hetzner1` is `nbg1-2`, `hetzner2` is `nbg1-1`). 
This +is because the Hetzner hostnames are assigned in server-creation order; the +SSH aliases were set up later in the order we wanted to refer to them. We +chose not to rename the hosts — renaming `hostname` on a Kubernetes node +after it joins the cluster causes problems (node certificates, etcd +identity, etc. tie to the hostname). Living with the quirk is easier than +rebuilding. See the mapping table in [the README](./README.md). + +## Why Hetzner + +### Decision matrix + +Compared at the time of purchase (~2026-04-23): + +| Provider | Instance | vCPU / RAM / SSD | Price/mo | Traffic/mo | +|---|---|---|---:|---| +| **Hetzner** | **CX33** | **4 / 8 GB / 80 GB** | **$7.99** | **20 TB** | +| DigitalOcean | General-purpose | 2 / 8 GB / 25 GB | $63 | 4 TB | +| DigitalOcean | Basic | 4 / 8 GB / 160 GB | $48 | 5 TB | +| Vultr | High Perf | 4 / 8 GB / 180 GB | $48 | 5 TB | +| Linode (Akamai) | Shared | 4 / 8 GB / 160 GB | $48 | 5 TB | +| OVHcloud | VPS 2026 4vC | 4 / 8 GB / 75 GB | ~$13 | unlimited | +| Contabo | Cloud VPS 2 | 4 / 8 GB / 200 GB | $8 | 32 TB | +| Netcup | VPS 1000 G11 | 4 / 8 GB / 256 GB | ~$6 | unlimited | +| Oracle Always Free | ARM Ampere | up to 4 / 24 GB / 200 GB | $0 | 10 TB (*availability lottery*) | + +**Why Hetzner won:** + +1. **Price/performance at this tier is best-in-class among mainstream hosts.** + Similar specs at DigitalOcean/Vultr/Linode cost 6× as much. You're paying + the "American managed cloud" premium there for UX polish we don't need. +2. **Dedicated IPv4 + /64 IPv6 + 20 TB traffic included.** No overage anxiety + at this scale; 20 TB is multiple months of anticipated traffic for a + bootstrapped app. +3. **European datacenter, GDPR-native.** honeyDue serves users in + multiple regions; if EU users dominate, Nuremberg is fast. US users pay + about +100 ms over a US-East host, which is well within Cloudflare-cached + tolerances for most app traffic. +4. **Mature API + `hcloud` CLI** for automation if we ever need it. +5.
**Hetzner Cloud Firewall is free** and rule-for-rule equivalent to AWS + Security Groups / DO Cloud Firewall. We use UFW on the nodes instead + (Chapter 4) because our rule set evolved ad-hoc and moving it to the + provider's firewall is a small cleanup project. + +**Why not the cheaper options:** + +- **Netcup** is ~$1/mo cheaper per node with more disk, but its API is + barebones, the account/billing UX is more fiddly, and their network + routing in the US (where the operator is based) has more hops than + Hetzner's. +- **Contabo** is the cheapest, but the company has a reputation for + oversubscribed nodes. For a production service, unpredictable CPU steal + and disk I/O variance is not worth saving $0/node. Contabo is fine for + non-critical workloads; it's a poor fit for prod. +- **Oracle Cloud Always Free** is genuinely free (4 ARM cores + 24 GB RAM) + but: + - Requires ARM64 builds (we build on ARM but would need to not need + cross-compile — see Chapter 11 for why amd64 matters) + - Capacity for free accounts is a lottery; instance creation fails + "out of capacity" more often than it succeeds + - Oracle has reclaimed idle free-tier instances in the past + +### Why not the premium options + +DigitalOcean, Vultr, and Linode are excellent products with better UX than +Hetzner. They were rejected because at honeyDue's current scale the 3–6× +price multiplier doesn't buy anything we'd use: + +- We don't need managed databases, object storage, or load balancers from + the same provider — those are Neon, Backblaze, and Cloudflare +- We don't need their monitoring dashboards — Cloudflare Analytics + + `kubectl top` + future Prometheus cover it +- The UI polish matters mostly for day-1 setup; ongoing operations are + `kubectl` and `ssh` + +When honeyDue has enough revenue that an engineer's time is worth more than +$40/mo, we'd consider moving for the better tooling. Not yet. 
+ +## Why Nuremberg (`nbg1`) + +Hetzner has datacenters in Nuremberg (nbg1), Falkenstein (fsn1), Helsinki +(hel1), Ashburn (ash), and Hillsboro (hil). Nuremberg was picked because: + +- The operator's primary user base is expected to be mixed US/EU +- Within the EU, Nuremberg is the most central from a peering perspective + (well-connected to DE-CIX, Europe's largest internet exchange) +- Falkenstein is Hetzner's main datacenter and tends to have longer + provisioning queues during capacity crunches; Nuremberg is smaller and + more available + +For a US-only userbase, Ashburn (ash) or Hillsboro (hil) would be better +picks — US users would see ~20 ms instead of ~120 ms. + +Cloudflare's edge caches most assets, so the origin location matters mostly +for first-request / uncached / POST traffic. + +## Why three nodes + +**Raft quorum and fault tolerance.** K3s in HA mode uses Raft consensus +(via embedded etcd) for cluster state. Raft requires a majority of nodes +to agree on every write. Quorum formulas: + +| Total managers | Quorum | Max failures tolerated | +|---|---|---| +| 1 | 1 | 0 | +| 2 | 2 | 0 | +| 3 | 2 | 1 | +| 4 | 3 | 1 | +| 5 | 3 | 2 | + +Three is the smallest odd number that tolerates a failure, and three is +where price/resilience is sweetest. Five nodes doesn't help until you need +to tolerate *two* simultaneous failures — a scale concern that doesn't +apply at our traffic volume. + +Two nodes is worse than one: you still have single-failure intolerance +(one down = no quorum), but you've doubled your cost and failure surface. +Avoid even-node clusters for consensus systems. + +## Node hardening + +Each node was bootstrapped with: + +1. **Docker installed** from `download.docker.com` using the stable repo + (this was the original Swarm setup; still installed but disabled — k3s + bundles its own containerd). +2. 
**`deploy` user created** with: + - Home directory + - Bash as login shell + - Member of `docker` group (historical, when Swarm was the orchestrator) + - Member of `sudo` group with `NOPASSWD: ALL` in `/etc/sudoers.d/deploy` +3. **SSH key installed** at `/home/deploy/.ssh/authorized_keys` + - The key is the public half of `~/.ssh/hetzner` on the operator + workstation (`ssh-ed25519`, 256 bits) +4. **`/opt/honeydue/deploy`** directory created, owned by `deploy` + (originally for Swarm deploy bundle drop zone; unused now) +5. **Sysctl** `net.ipv4.ip_unprivileged_port_start=0` persisted to + `/etc/sysctl.d/99-unprivileged-ports.conf`. Required so Traefik (running + as UID 65532) can bind `:80` and `:443` in the host network namespace. + +The full bootstrap script is at `/tmp/honeydue_bootstrap.sh` on the +operator workstation (used during the initial Swarm setup — see +[Chapter 19](./19-postmortem-swarm.md) for context). + +## Cost breakdown + +``` +3 × Hetzner CX33 $23.97/mo +Hetzner network traffic $0 (20 TB/mo included per node, nowhere near it) +Neon Postgres (Launch) $5-15/mo (usage-based, ~$5 min) +Backblaze B2 <$1/mo (tiny upload volume currently) +Cloudflare Free $0 +Gitea (self-hosted) $0 (the operator's existing Gitea) +───────────────────────────────── +Total infra ~$30-40/mo +``` + +See [Chapter 18 — Cost](./18-cost.md) for a full breakdown including +external SaaS (Fastmail, Apple Developer, etc.) and at-scale projections. + +## Provisioning workflow + +Nodes were provisioned manually through Hetzner Cloud Console. This is +fine for a three-node cluster; for larger clusters we'd switch to the +[`hetzner-k3s`][hetzner-k3s] Ruby tool that the `deploy-k3s/` scaffold +expects. The manual steps were: + +1. Create project in Hetzner Cloud Console. +2. Upload SSH key (`hetzner.pub`). +3. Create 3× CX33 servers in `nbg1` with Ubuntu 24.04. +4. SSH in as `root`, run bootstrap to create `deploy` user and install + Docker / later k3s. +5. 
Apply Hetzner Cloud Firewall rules at the network edge *optional* (we + use UFW per Chapter 4 instead). + +A future greenfield deployment would run `deploy-k3s/scripts/01-provision-cluster.sh`, +which does all of this in one shot via the `hetzner-k3s` CLI. + +## Upgrade / replacement plan + +**Node failure.** If a node becomes unreachable, the other two retain +Raft quorum and the cluster continues accepting writes. Pods from the +failed node get rescheduled to the survivors (so long as the survivors +have spare capacity — see Chapter 16). To replace the dead node: + +1. Delete it from the cluster: `kubectl delete node <node-name>` +2. Create a replacement CX33 in Hetzner console +3. Install k3s on it with `--server=https://<existing-server-ip>:6443` +4. Verify `kubectl get nodes` shows it as Ready + +**Scaling up.** To add a fourth node, same procedure without deleting +anything. Consider whether you want it as a server (adds to Raft quorum; +must also add up to an odd total) or an agent (worker-only). K3s agents +join with `INSTALL_K3S_EXEC=agent` instead of `server`. + +**Upgrading K3s.** K3s has a minor release every ~3 months. Upgrade by +running the install script with the new version on each node, one at a +time, verifying cluster health between each. See +[Chapter 17](./17-runbook.md) for the detailed procedure. + +**Upgrading the OS.** Ubuntu 24.04 LTS is supported until 2029. +`unattended-upgrades` is *not* currently installed, so OS patches require +manual `apt upgrade`. Install `unattended-upgrades` when time permits — +security patches are important and automation reduces the risk of +falling behind. + +## Physical location & regulatory + +- **Sovereignty**: Hetzner is headquartered in Gunzenhausen, Germany. + All data at rest in `nbg1` is subject to German law and the GDPR. +- **User data**: Most user data actually lives in + **Neon Postgres (AWS us-east-1, Virginia)** and **Backblaze B2 + (us-east-005, South Carolina)** — both US-hosted.
EU users' data + therefore *exits* the EU in the API path. If strict EU data residency + is ever a requirement, Neon has a EU region (Frankfurt) and Backblaze + has EU endpoints; switching is a configuration change, not an + architectural one. +- **Encryption at rest**: Hetzner encrypts node-local disks at the + hypervisor layer. Neon encrypts at the AWS EBS layer. B2 encrypts + objects server-side. None of our application code or config holds + secrets at rest that aren't already in Kubernetes Secrets (which + are stored in etcd; etcd on disk is unencrypted by default in k3s + but see Chapter 5 for hardening). + +## Operator cheat sheet + +```bash +# SSH to any node +ssh -i ~/.ssh/hetzner deploy@hetzner1 + +# Check node health +kubectl get nodes -o wide + +# Per-node resource usage +kubectl top nodes + +# See what's on each node +kubectl get pods -A -o wide | sort -k 8 + +# Hetzner console (in browser) +# https://console.hetzner.cloud/ +``` + +## References + +- [Hetzner Cloud product page][hetzner-cloud] +- [Hetzner price adjustment April 2026][hetzner-prices] +- [hetzner-k3s tool][hetzner-k3s] +- [K3s architecture docs][k3s-arch] + +[hetzner-cloud]: https://www.hetzner.com/cloud/ +[hetzner-prices]: https://docs.hetzner.com/general/infrastructure-and-availability/price-adjustment/ +[hetzner-k3s]: https://github.com/vitobotta/hetzner-k3s +[k3s-arch]: https://docs.k3s.io/architecture diff --git a/docs/deployment/02-orchestrator-choice.md b/docs/deployment/02-orchestrator-choice.md new file mode 100644 index 0000000..0a61c30 --- /dev/null +++ b/docs/deployment/02-orchestrator-choice.md @@ -0,0 +1,323 @@ +# 02 — Orchestrator Choice + +## Summary + +We run K3s — a lightweight Kubernetes distribution from SUSE/Rancher Labs. +This wasn't our first choice. We originally deployed on Docker Swarm and +spent a long afternoon hitting a libnetwork bug before migrating. 
This +chapter walks through the comparison of the three realistic orchestrators +(Docker Swarm, full Kubernetes, and K3s) and a fourth (Nomad) we +considered and rejected. The story of the Swarm→k3s migration is in +[Chapter 19](./19-postmortem-swarm.md); this chapter is about the decision +framework. + +## The decision + +**K3s v1.34.6+k3s1**, HA mode, three control-plane nodes with embedded etcd. + +## Candidates considered + +| | Docker Swarm | K3s | Full Kubernetes (kubeadm) | Hashicorp Nomad | +|---|---|---|---|---| +| Learning curve | Easiest | Medium | Hardest | Easy | +| Install on 3 nodes | `docker swarm init/join` | `curl \| sh` per node | Many steps | `nomad server/agent` | +| Memory footprint (control plane) | ~200 MB per node | ~500 MB per node | ~1 GB per node | ~200 MB per node | +| Service discovery | libnetwork (buggy) | CoreDNS | CoreDNS | Consul | +| HA quorum | Raft (3+ managers) | Raft via embedded etcd (3+ servers) | etcd cluster (3+ nodes) | Raft (3+ servers) | +| Secrets management | Swarm secrets | k8s Secrets | k8s Secrets | Vault or file-backed | +| Rolling updates | Swarm update_config | Deployments | Deployments | job update stanza | +| Ingress | None (third-party) | Traefik bundled | None (install yourself) | None (install yourself) | +| Active development | Maintenance mode | Active | Active | Active | +| Industry momentum | Declining | Growing | Dominant | Niche | + +## Why K3s + +### Against Docker Swarm + +Swarm was our first pick because it's the simplest "production-like" +option. `docker swarm init` gives you a working cluster in seconds. It's +built into the Docker daemon you already have. + +What killed it: + +1. **libnetwork state bugs.** Swarm's service discovery relies on + libnetwork's gossip-backed service registry. When a service's task + migrates between nodes, the old endpoint record isn't always removed + cleanly — especially on encrypted overlays or during transient network + partitions. 
The result: stale DNS A-records that persist indefinitely, + survive service removal, survive containerd restarts, survive pretty + much everything except recreating the overlay network. Multiple open + issues track this: [moby/moby#52265][moby-52265], + [moby/moby#51491][moby-51491], [Dokploy#3480][dokploy-3480]. + +2. **It's in maintenance mode.** Mirantis [committed to supporting + Swarm through 2030][mirantis-swarm] as part of Mirantis Kubernetes + Engine 3, but nothing is being actively developed. The libnetwork code + has no champion; bug fixes land slowly and often incompletely (the + 29.0.0 partial fix for #50236, the 29.3.0 regression, the pending + follow-up in #52289 — months apart). + +3. **Industry signal.** Every 2026 write-up of "should I pick Swarm" + reaches the same conclusion: run what works; don't bet new workload on + it. [Better Stack][bstack-swarm] and [VirtualizationHowTo][vht-swarm] + are representative. + +The [Chapter 19 postmortem](./19-postmortem-swarm.md) details the specific +bug we hit, the workarounds we tried, and why each failed. + +### Against full Kubernetes (kubeadm) + +Full Kubernetes is the de-facto standard. It has the biggest ecosystem, the +most documentation, the most mindshare. Against it: + +1. **Operational overhead.** A kubeadm-built cluster has ~6 control-plane + processes (kube-apiserver, etcd, kube-scheduler, kube-controller-manager, + kube-proxy, kubelet) each of which needs monitoring, upgrading, and + understanding. K3s bundles them into a single binary with sensible + defaults. + +2. **Memory.** A kubeadm control plane wants ~1 GB RAM baseline per master + node. On an 8 GB node that's 12% gone before any workload runs. K3s is + ~500 MB per master. + +3. **Etcd.** Full Kubernetes expects a separate 3+ node etcd cluster for + HA, typically on the same masters but as an independent process. K3s + embeds etcd in the server binary; still Raft, still HA, but one less + thing to install/upgrade/monitor. + +4. 
**Cluster creation UX.** `kubeadm init` + certificate distribution + CNI + install + storage class setup is a multi-step dance. K3s `curl -sfL + https://get.k3s.io | sh -s - server --cluster-init` plus two joins is a + 10-minute cluster. + +**What we'd lose by not using full Kubernetes:** nothing that matters at +our scale. K3s is 100% Kubernetes API-compatible. Every `kubectl` command, +every Helm chart, every manifest works identically. If we ever need to +migrate to full Kubernetes, `kubectl get all -A -o yaml` gives us the +entire state and we re-apply it on the new cluster. + +### Against Hashicorp Nomad + +Nomad is very good at what it does — simpler than Kubernetes, more robust +than Swarm, has real load balancing (via Consul Connect), and the +`nomad agent` binary is ~80 MB vs k3s' ~200 MB. + +Against it: + +1. **Ecosystem is smaller.** Far fewer community Helm charts, operators, + tutorials. Every new component needs bespoke integration. +2. **Service discovery requires Consul.** Two products to operate, not one. +3. **Ingress requires a separate tool** (Traefik, HAProxy, Fabio). K3s + bundles Traefik by default. +4. **Secrets management** requires Vault or relies on Nomad's template + stanza. Not bad, but more moving parts. +5. **The operator hasn't used Nomad in production before.** Learning curve + on a new platform during a prod migration is a bad trade. + +Nomad would be a defensible choice. K3s won primarily on ecosystem +maturity and the operator's familiarity with Kubernetes primitives. + +## What K3s actually is + +K3s is a CNCF Sandbox project (now graduated to Rancher/SUSE-backed) +originally designed for edge and IoT. 
Its design goals: + +- Single ~200 MB static binary +- Works on ARM64 and AMD64 +- Bundles everything needed for a working cluster: containerd, Flannel, + CoreDNS, Traefik, metrics-server, local-path storage provisioner, and + (optionally) servicelb (klipper-lb) load balancer +- Replaces the kubeadm setup dance with `curl | sh` +- Replaces etcd-in-its-own-cluster with embedded etcd (or SQLite for + single-node) +- Replaces Docker with containerd (though you can opt back into Docker) + +It is **not** a fork of Kubernetes. K3s is Kubernetes, packaged differently. +The Kubernetes Go code it wraps is unmodified (aside from build-time +stripping of cloud provider integrations you don't need). `kubectl`, +the API, CRDs, operators — all identical. + +## HA architecture we chose + +```mermaid +flowchart TB + subgraph Cluster[k3s HA cluster] + subgraph N1[hetzner1] + K1[k3s server] + E1[etcd] + KUB1[kubelet] + TR1[Traefik pod
hostNet :80/:443] + P1[app pods] + end + subgraph N2[hetzner2] + K2[k3s server] + E2[etcd] + KUB2[kubelet] + TR2[Traefik pod<br/>hostNet :80/:443] + P2[app pods] + end + subgraph N3[hetzner3] + K3[k3s server] + E3[etcd] + KUB3[kubelet] + TR3[Traefik pod<br/>hostNet :80/:443] + P3[app pods] + end + end + + E1 <--Raft--> E2 <--Raft--> E3 + E1 <--Raft--> E3 + + K1 & K2 & K3 --- API[kube-apiserver
port 6443] +``` + +### ASCII fallback + +``` + hetzner1 hetzner2 hetzner3 + ┌──────────┐ ┌──────────┐ ┌──────────┐ + │ k3s srv │ │ k3s srv │ │ k3s srv │ + │ ├ etcd ─┼──────┼ ├ etcd ──┼──────┼─ etcd │ │ + │ │ :6443│ │ │ :6443│ │ :6443│ │ + │ ├ kubelet │ ├ kubelet │ kubelet│ + │ └ pods │ │ └ pods │ │ pods │ │ + └──────────┘ └──────────┘ └──────────┘ + │ ▲ │ ▲ │ ▲ + │ └─── Raft ────┤ └─── Raft ────┘ │ + └────────── Raft ─┴─────────────────────┘ +``` + +All three nodes are **server** nodes (in k3s terminology) — they all run +`kube-apiserver`, `kube-scheduler`, `kube-controller-manager`, and +participate in etcd Raft consensus. A fourth "agent" node could be added +as worker-only; we don't need that capacity yet. + +**Quorum**: 2 out of 3 nodes must agree on writes. The cluster stays +operational if any one node dies. Two dying nodes = cluster loses quorum +(Raft halts) until a majority returns. + +## What we disabled + +We ran k3s install with `--disable=servicelb`. `servicelb` (a.k.a. +`klipper-lb`) is a trick where k3s spawns a daemonset that listens on a +node's host ports and proxies to `LoadBalancer`-typed services. Fine for +dev; we don't need it because we handle ingress with Traefik in +DaemonSet+hostNetwork mode (Chapter 6). + +We did **not** disable: +- **traefik** — we reconfigured it via HelmChartConfig rather than + disable-and-replace. See Chapter 6. +- **local-path-provisioner** — provides the default `StorageClass` we use + for Redis PVC (Chapter 7). +- **metrics-server** — required for `kubectl top` and HorizontalPodAutoscaler. +- **coredns** — the cluster DNS. Essential for service discovery. + +## Version choices + +### K3s v1.34.6+k3s1 + +This was the latest stable K3s release as of 2026-04-24. K3s follows +upstream Kubernetes' release cadence — `1.34` matches Kubernetes 1.34.x. +The `+k3s1` suffix is the K3s build number within that upstream version. + +**Upgrade policy**: K3s supports one minor version per quarter. 
We'd +upgrade in place to 1.35 when it's been out ~30 days and has no open +critical bugs in the release notes. See Chapter 17 for the procedure. + +### containerd v2.2.2 + +Bundled with K3s. containerd 2.x brought full support for the +`cri-dockerd` replacement API and performance improvements over 1.x. +We don't pin containerd separately — we take whatever K3s ships. + +### Flannel (VXLAN backend) + +Bundled with K3s as the default CNI. Flannel's VXLAN backend is +straightforward, performant enough, and has worked reliably in every K3s +install we've seen. Alternatives (Calico, Cilium) are more featureful but +add operational complexity. + +See [Chapter 3](./03-networking.md) for a deep dive on the networking +layer. + +## What we did NOT choose from K3s' ecosystem + +- **servicelb / klipper-lb** — off. Reason above. +- **embedded SQLite** — on single-node k3s, SQLite replaces etcd. We're + multi-node, so this doesn't apply. +- **`--flannel-backend=wireguard-native`** — WireGuard-encrypted overlay. + We didn't enable it because (a) VXLAN already works, (b) our node-to-node + traffic stays within Hetzner's internal network anyway, and (c) we haven't + proven we need it. Encryption is a TODO (Chapter 20). + +## Raft and split-brain behavior + +If the 3 nodes become network-partitioned such that two nodes can still +reach each other but neither can reach the third (a "2-1 split"): + +- **Majority partition (2 nodes)** — retains quorum, cluster keeps + accepting writes. Pods on those 2 nodes keep running. The + isolated node is eventually marked `NotReady` after + `node-monitor-grace-period` (default 40s), and after + `pod-eviction-timeout` (default 5 min) its pods are marked for + eviction and rescheduled onto the surviving nodes. +- **Minority partition (1 node)** — loses quorum. API server on that + node refuses writes; existing pods keep running (kubelet doesn't need + the API server for already-scheduled pods), but nothing new can deploy, + scale, or reschedule.
+ +When the partition heals, Raft reconciles automatically. The minority +node catches up on etcd state via snapshot+replay. + +**Worst case** (all 3 isolated from each other): no quorum, no node is +authoritative. Pods keep running from existing state; nothing can be +updated. This requires all three nodes losing network to each other +simultaneously, which implies Hetzner's entire internal switching is +broken — at that point, the whole region is likely down anyway. + +## Our decision in one sentence + +K3s gave us the Kubernetes API (enormous ecosystem, known primitives, our +existing scaffold in `deploy-k3s/manifests/`) without the operational +overhead of kubeadm; and unlike Swarm, its service-discovery layer is +rock-solid. + +## Operator cheat sheet + +```bash +# On any k3s server node, root commands use k3s-wrapped kubectl: +sudo k3s kubectl get nodes + +# From workstation, use the copied kubeconfig: +export KUBECONFIG=~/.kube/honeydue-k3s.yaml +kubectl get nodes + +# Check k3s service: +ssh deploy@hetzner1 "sudo systemctl status k3s" + +# Watch cluster events live: +kubectl get events -A --watch + +# See what's on each node: +kubectl get pods -A -o wide | sort -k 8 +``` + +## References + +- [K3s architecture][k3s-arch] +- [K3s requirements][k3s-reqs] +- [Mirantis Swarm support announcement][mirantis-swarm] +- [moby/moby#52265 — libnetwork stale records][moby-52265] +- [moby/moby#51491 — DNS broken after swarm init][moby-51491] +- [Dokploy #3480 — Traefik stale VIP on Swarm][dokploy-3480] +- [Better Stack: Hetzner Cloud Review 2026][bstack-swarm] +- [VirtualizationHowTo: Is Docker Swarm Still Safe in 2026?][vht-swarm] + +[k3s-arch]: https://docs.k3s.io/architecture +[k3s-reqs]: https://docs.k3s.io/installation/requirements +[mirantis-swarm]: https://www.mirantis.com/blog/mirantis-guarantees-long-term-support-for-swarm/ +[moby-52265]: https://github.com/moby/moby/issues/52265 +[moby-51491]: https://github.com/moby/moby/issues/51491 +[dokploy-3480]: 
https://github.com/Dokploy/dokploy/issues/3480 +[bstack-swarm]: https://betterstack.com/community/guides/web-servers/hetzner-cloud-review/ +[vht-swarm]: https://www.virtualizationhowto.com/2026/03/is-docker-swarm-still-safe-in-2026/ diff --git a/docs/deployment/03-networking.md b/docs/deployment/03-networking.md new file mode 100644 index 0000000..5c0de8c --- /dev/null +++ b/docs/deployment/03-networking.md @@ -0,0 +1,465 @@ +# 03 — Networking + +## Summary + +The network stack has five layers: the physical/internet layer (Hetzner's +public network), the node layer (Ubuntu with UFW), the Kubernetes overlay +(Flannel VXLAN), the service layer (kube-proxy IPVS + CoreDNS), and the +ingress layer (Traefik). This chapter walks through each, explains how +they compose, and traces a single HTTP request from browser to Go API +response showing every hop. + +## The five layers + +```mermaid +flowchart TB + subgraph L5[Layer 5 — Ingress] + Traefik + end + subgraph L4[Layer 4 — Service discovery] + KubeProxy[kube-proxy IPVS] + CoreDNS + end + subgraph L3[Layer 3 — Pod overlay] + Flannel[Flannel VXLAN
UDP 8472] + end + subgraph L2[Layer 2 — Node network] + UFW + Kernel[Linux kernel<br/>netfilter/iptables] + end + subgraph L1[Layer 1 — Physical] + Hetzner[Hetzner network
public v4 + v6] + end + + L5 --> L4 --> L3 --> L2 --> L1 +``` + +### ASCII fallback + +``` + ┌──────────────────────────────────────┐ + │ L5 Traefik (host network, :80/:443)│ + ├──────────────────────────────────────┤ + │ L4 kube-proxy (IPVS) + CoreDNS │ + ├──────────────────────────────────────┤ + │ L3 Flannel VXLAN overlay │ + │ 10.42.0.0/16 pod CIDR │ + ├──────────────────────────────────────┤ + │ L2 Ubuntu + UFW + kernel iptables │ + ├──────────────────────────────────────┤ + │ L1 Hetzner public IPv4/IPv6 │ + └──────────────────────────────────────┘ +``` + +## Layer 1 — Physical network + +Each Hetzner CX33 has: +- A **public IPv4** address on the internet +- A **public IPv6** /64 subnet (one address used, the rest unused) +- **20 TB/mo** outbound traffic included; inbound is free +- **~1 Gbps** network bandwidth per node + +All inter-node traffic goes over the **public network**. Hetzner Cloud +offers a private-network feature (vswitch), but we didn't attach one — +adding it now would require reconfiguring Flannel's advertise-addr. A +future improvement: attach a private vSwitch to all three nodes, +reconfigure Flannel to use it, shrink our public-interface attack surface. + +## Layer 2 — Node network + +Each node runs Ubuntu 24.04.3 LTS with: + +- **Default routing** via the Hetzner-provided gateway +- **UFW** as the iptables frontend (Chapter 4 lists every rule) +- **IP forwarding** enabled (`net.ipv4.ip_forward=1`) — required for + Kubernetes pod routing +- **Bridge netfilter** enabled (`net.bridge.bridge-nf-call-iptables=1`) + — required so iptables can see bridged traffic + +K3s configures the latter two automatically at install time via +`/etc/sysctl.d/90-kubelet.conf` (or similar; exact file varies by distro). + +Two additional sysctls we set manually: + +``` +# /etc/sysctl.d/99-unprivileged-ports.conf +net.ipv4.ip_unprivileged_port_start=0 +``` + +**Why**: Traefik runs as UID 65532 (non-root) in host network mode to bind +:80 and :443. 
Without this sysctl, even with `CAP_NET_BIND_SERVICE`, it +can't bind privileged ports in the host namespace. Ubuntu 24.04's default +is 1024 (so ports 1–1023 are "privileged"). Setting it to 0 lets any +user bind any port. + +**Security implication**: Minimal. The ports Traefik binds are still +controlled by the container runtime — other pods on the node can't +accidentally grab 80/443 because kubelet won't schedule conflicting host +ports. And the UFW rules still gate what's reachable externally. + +## Layer 3 — Pod overlay (Flannel VXLAN) + +### What Flannel is + +Flannel is a CNI (Container Network Interface) plugin. Its job: give every +pod in the cluster a routable IP address, and make those IPs reachable +from any other pod regardless of which node they're on. + +### The pod CIDR + +K3s assigns **10.42.0.0/16** as the cluster-wide pod CIDR by default. Each +node gets a /24 slice: + +| Node | Pod CIDR | +|---|---| +| ubuntu-8gb-nbg1-1 | 10.42.1.0/24 | +| ubuntu-8gb-nbg1-2 | 10.42.0.0/24 | +| ubuntu-8gb-nbg1-3 | 10.42.2.0/24 | + +Each pod gets an IP from its node's slice. So a pod on hetzner2 +(`nbg1-1`) might be `10.42.1.6`; a pod on hetzner3 (`nbg1-3`) might be +`10.42.2.10`. + +### How VXLAN works + +VXLAN ("Virtual Extensible LAN") tunnels Layer-2 frames over UDP. Flannel +wraps every inter-node packet like so: + +``` + Original pod → pod packet: + ┌──────────────────────────────────────────────────┐ + │ Ethernet │ IP src=10.42.0.5 → dst=10.42.2.10 │ … │ + └──────────────────────────────────────────────────┘ + + Flannel VXLAN-encapsulates it: + ┌──────────────────────────────────────────────────────────────────┐ + │ Eth │ IP src=178.104.247.152 → dst=178.104.249.189 │ UDP 8472 │ │ + │ VXLAN header │ │ │ + └──────────────────────────────────────────────────────────────────┘ +``` + +The outer IP/UDP carries the packet between nodes over Hetzner's public +network. 
On arrival, the destination node unwraps the VXLAN header and +delivers the inner packet to the target pod. + +**UDP port 8472** is the Linux kernel's default VXLAN port, which Flannel +uses (the IANA-assigned VXLAN port is 4789). It must be open +node-to-node in UFW (see Chapter 4). + +**MTU note**: VXLAN encapsulation adds 50 bytes of overhead (8 VXLAN + +8 UDP + 20 IP + 14 Ethernet). Hetzner's network uses standard 1500-byte +MTU, so Flannel's overlay MTU is 1450. Mismatches cause silent packet +drops. K3s sets this correctly by default. + +### Flannel config + +`/var/lib/rancher/k3s/agent/etc/flannel/net-conf.json` on each node: + +```json +{ + "Network": "10.42.0.0/16", + "EnableIPv6": false, + "EnableIPv4": true, + "IPv6Network": "::/0", + "Backend": { "Type": "vxlan" } +} +``` + +We did not enable IPv6 in the cluster — an unnecessary complexity for our +scale, and CoreDNS + kube-proxy + node controllers all work fine in v4-only +mode. + +### No encryption (yet) + +Flannel VXLAN traffic over Hetzner's public network is **not encrypted**. +This means pod-to-pod traffic between nodes is visible to any attacker +with packet capture on the path — in practice, nobody between our three +nodes at Hetzner Nuremberg, but it's still plaintext on the wire. + +**Mitigation today**: All sensitive inter-pod traffic already uses TLS: +- api ↔ Neon Postgres: TLS 1.3 (`DB_SSLMODE=require`) +- api/worker ↔ Backblaze B2: HTTPS +- api ↔ Fastmail: STARTTLS +- api ↔ Redis: plaintext but Redis only holds cache + Asynq queue state, + no user credentials + +**TODO** (Chapter 20): Switch Flannel to `wireguard-native` backend. K3s +supports this with a flag at install time; enabling on an existing +cluster requires a config edit and rolling kubelet restart. + +## Layer 4 — Service discovery + +Pods don't talk to each other by IP — IPs are ephemeral, assigned on pod +creation. They use **service names** resolved by DNS. + +### CoreDNS + +K3s runs **CoreDNS** as the cluster DNS server.
A pod in the `honeydue` +namespace resolves `redis` to the Redis Service's ClusterIP: + +``` +redis → 10.43.7.10 (Service ClusterIP) +redis.honeydue → 10.43.7.10 +redis.honeydue.svc.cluster.local → 10.43.7.10 +``` + +When an app resolves `redis:6379`: + +1. The pod's `/etc/resolv.conf` points to `10.43.0.10` (the CoreDNS + Service). +2. CoreDNS receives the query, checks its known Services, returns + `10.43.7.10`. +3. The pod sends TCP to `10.43.7.10:6379`. +4. kube-proxy (Layer 4, below) intercepts and routes to the actual pod IP. + +### The service CIDR + +K3s assigns **10.43.0.0/16** as the service CIDR. ClusterIPs live here. +Currently: + +| Service | ClusterIP | +|---|---| +| `api.honeydue` | 10.43.167.83 | +| `admin.honeydue` | 10.43.136.168 | +| `redis.honeydue` | 10.43.7.10 | +| `kubernetes.default` | 10.43.0.1 | +| `kube-dns.kube-system` | 10.43.0.10 | + +ClusterIPs are **stable** for the life of the Service — they don't change +when pods come and go. + +### kube-proxy (IPVS mode) + +`kube-proxy` is the dataplane component that makes Services work. It runs +as a DaemonSet (one per node), watches the k3s API for Service and +Endpoint changes, and programs the kernel to route traffic. + +K3s defaults to **IPVS mode** on modern kernels. IPVS is a Linux kernel +feature for in-kernel L4 load balancing — essentially connection-tracking +NAT with round-robin or other scheduling. + +When a pod dials `10.43.7.10:6379`: + +1. The first packet hits the node's kernel +2. IPVS sees the destination is a ClusterIP +3. IPVS picks an endpoint from the Service's endpoint set (e.g., + `10.42.0.10:6379` on hetzner1) +4. IPVS rewrites the destination and forwards +5. Flannel tunnels it to the destination node (if remote) or delivers + locally (if the endpoint is on the same node) + +This happens per-TCP-connection, not per-packet, thanks to conntrack. + +### Why IPVS over iptables + +K3s' default kube-proxy mode is IPVS.
The alternative (iptables mode) is +older and slower — for every Service, iptables mode adds a chain of rules +that grow linearly with Service count. IPVS uses a hash table and scales +to thousands of Services without performance degradation. At our scale +either works, but IPVS is the better default. + +### Headless Services + +A Service can also be created *without* a ClusterIP — a "headless" Service +(`clusterIP: None`). Our setup doesn't currently use any, but it's worth +knowing the distinction: headless Services return all endpoint IPs +directly via DNS, no kube-proxy involvement. Useful for stateful sets +where clients need to talk to a specific replica. + +## Layer 5 — Ingress (Traefik) + +External traffic arrives on the node's public :80 or :443. Traefik +handles the first mile of routing. See [Chapter 6](./06-traefik-ingress.md) +for Traefik-specific details; this section just shows how it fits in the +networking stack. + +Traefik runs as a **DaemonSet** with `hostNetwork: true`. That means: +- One Traefik pod per node +- Each pod is in the **host's network namespace**, not a pod netns +- Each pod can bind directly to `0.0.0.0:80` and `0.0.0.0:443` on the node + +When Cloudflare sends a request to `178.104.247.152:80`: + +1. Packet arrives at hetzner1's NIC +2. UFW accepts (80/tcp is open from anywhere) +3. Linux kernel routes to localhost:80 because something's listening +4. Traefik (running in host namespace) accepts the connection +5. Traefik reads the `Host:` header +6. Traefik matches an Ingress rule (api.myhoneydue.com → api Service) +7. Traefik dials `10.43.167.83:8000` (Service ClusterIP) +8. Kube-proxy IPVS rewrites to a live api pod endpoint +9. Flannel VXLAN tunnels if the endpoint is on a remote node +10. The api pod receives the request, processes, responds +11. Response flows back the reverse path + +Full trace in the [end-to-end section](#end-to-end-request-trace) below.
+ +## IPs we care about + +| What | CIDR / IP | Used for | +|---|---|---| +| Pod CIDR | 10.42.0.0/16 | All pod IPs cluster-wide | +| Service CIDR | 10.43.0.0/16 | All ClusterIPs | +| Flannel VXLAN | UDP 8472 | Pod-to-pod traffic (inter-node) | +| CoreDNS Service | 10.43.0.10:53 | Cluster DNS | +| Kubernetes Service | 10.43.0.1:443 | Internal kube-apiserver | +| Node IPs | See README | External + flannel source/dst | +| Traefik | host network | Listens on node's :80, :443 | + +## End-to-end request trace + +A user in Texas hits `https://api.myhoneydue.com/api/tasks/`. Here's every +hop: + +```mermaid +sequenceDiagram + autonumber + participant U as User (Austin, TX) + participant CF as Cloudflare edge (DFW POP) + participant H as hetzner2 (picked by CF)
178.105.32.198 + participant TR as Traefik pod
(hostNetwork) + participant API as api pod on hetzner3
10.42.2.6:8000 + participant DB as Neon Postgres
(AWS us-east-1) + + U->>CF: HTTPS :443 GET /api/tasks/ + Note over CF: TLS handshake terminates here + CF->>H: HTTP :80 (with original Host header) + H->>TR: Accepted by kernel, delivered to Traefik + Note over TR: Matches Ingress rule
host: api.myhoneydue.com + TR->>TR: Resolve api.honeydue → 10.43.167.83 + TR->>H: dial 10.43.167.83:8000 + H->>H: kube-proxy IPVS rewrites
dst → 10.42.2.6:8000 + H->>API: Flannel VXLAN encapsulate
UDP 8472 → hetzner3 + Note over API: Pod receives packet + API->>DB: SELECT … FROM tasks WHERE user_id = …
TLS :5432 + DB-->>API: Result rows + API-->>TR: HTTP 200 JSON + TR-->>CF: HTTP 200 + CF-->>U: HTTPS 200 +``` + +### Timing budget for a cache-miss read + +| Hop | Typical latency | +|---|---| +| User → CF edge (DFW) | 5–15 ms | +| CF edge → hetzner2 (origin HTTP :80) | 90–120 ms (cross-Atlantic) | +| UFW + kernel accept | <1 ms | +| Traefik accept + route | 1–2 ms | +| kube-proxy + Flannel (same node) | <1 ms | +| kube-proxy + Flannel (remote node, VXLAN) | 1–3 ms | +| Go API request handling | 1–5 ms | +| Neon Postgres query (TLS + SQL) | 20–60 ms (AWS us-east-1) | +| Return path (reverse) | similar | + +**Total typical**: ~200–300 ms for a user in North America, dominated by +the cross-Atlantic CF→origin hop. Cached responses at Cloudflare skip the +origin hop entirely. + +## Inter-node routing concretely + +Here's what `ip route` shows on hetzner2 (not run live, reconstructed from +typical k3s+flannel+vxlan setup): + +``` +default via 172.31.1.1 dev eth0 # Hetzner gateway +10.42.0.0/24 via 10.42.0.0 dev flannel.1 # to hetzner1 pods (via VXLAN iface) +10.42.1.0/24 dev cni0 # local pods on hetzner2 +10.42.2.0/24 via 10.42.2.0 dev flannel.1 # to hetzner3 pods (via VXLAN iface) +10.43.0.0/16 via 10.42.1.1 dev cni0 # services via kube-proxy +``` + +The `flannel.1` interface is the VXLAN tunnel endpoint. Traffic written +to it gets encapsulated in UDP 8472 and sent to the peer node's public IP. + +Flannel learns about peer nodes via the Kubernetes API (it watches Node +resources). When hetzner3 joins, Flannel on hetzner1 and hetzner2 both +learn its public IP and pod CIDR, update their routes and ARP tables, +and traffic just works. + +## Network performance + +### Within a node (pod to pod, same host) + +Packets go through `cni0` bridge, never leave the node. Sub-millisecond +latency, bounded by kernel + veth performance. Easily >10 Gbps. + +### Between nodes (pod to pod, different host) + +Packets go through Flannel VXLAN. 
Added overhead: encap/decap in the +kernel (~5–10 μs), plus the actual network hop between hetzner nodes +(~0.5 ms within the same Hetzner datacenter). Throughput is bounded by +Hetzner's NIC (≈1 Gbps sustained per node). + +In practice this is fine for everything we do. The slowest link in our +application is Neon (AWS us-east-1), which is ~100 ms round-trip. + +## DNS resolution path + +A pod resolves `redis`: + +1. App does `getaddrinfo("redis")`. +2. glibc reads `/etc/resolv.conf`, finds nameserver `10.43.0.10`. +3. Sends UDP 53 to `10.43.0.10`. +4. Destination is CoreDNS Service ClusterIP. +5. kube-proxy IPVS load-balances across CoreDNS pods (there's usually 1). +6. The packet arrives at the CoreDNS pod. +7. CoreDNS checks its Kubernetes plugin cache for `redis.<namespace>.svc.cluster.local`. +8. Returns `10.43.7.10` (redis Service ClusterIP) with a low TTL. + +CoreDNS is stateless — if it restarts, pods re-query on their next lookup. + +**DNS caching in pods**: The Go API uses `net.Resolver` which does not +cache by default. Each new connection triggers a fresh DNS lookup. This +is correct behavior for Kubernetes (where Service IPs are stable but +Endpoints change), but it means a CoreDNS outage breaks new connections +immediately. + +Next.js (admin) also uses Node's default resolver, similar behavior. 
+ +## What breaks if X fails + +| Failure | Symptom | +|---|---| +| Flannel daemon on one node crashes | Pods on that node can't reach other nodes' pods; kube-proxy Services sometimes work (kernel conntrack) | +| CoreDNS pod crashes (only 1) | New connection DNS lookups fail; existing connections continue | +| kube-proxy daemon on one node crashes | Pods on that node can't resolve Service ClusterIPs; direct pod IPs still work | +| UFW misconfigured (port 8472 UDP blocked) | Pods on that node can't reach remote pods over overlay | +| Node's NIC fails | Node unreachable; Raft loses it; its pods get rescheduled elsewhere | +| Hetzner datacenter outage | Entire cluster offline | + +## Operator cheat sheet + +```bash +# See all IPs in the cluster +kubectl get pods -A -o wide # pod IPs + nodes +kubectl get svc -A # Service ClusterIPs + +# Test pod-to-pod DNS from inside a pod +kubectl exec -n honeydue deploy/api -- nslookup redis +kubectl exec -n honeydue deploy/api -- getent hosts redis + +# Test pod-to-pod TCP connectivity +kubectl exec -n honeydue deploy/api -- nc -zv redis 6379 +kubectl exec -n honeydue deploy/api -- wget -q -O- http://admin:3000/ + +# See the node's iptables/IPVS rules (run on a node) +ssh deploy@hetzner1 "sudo ipvsadm -Ln" +ssh deploy@hetzner1 "sudo iptables -L -n -t nat | head -50" + +# See the cluster's flannel state +kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{" "}{.status.addresses[?(@.type=="InternalIP")].address}{" "}{.spec.podCIDR}{"\n"}{end}' +``` + +## References + +- [Kubernetes networking concepts][k8s-net] +- [Flannel VXLAN backend][flannel-vxlan] +- [CoreDNS k8s plugin][coredns-k8s] +- [IPVS mode for kube-proxy][ipvs] +- [VXLAN RFC 7348][vxlan-rfc] + +[k8s-net]: https://kubernetes.io/docs/concepts/services-networking/ +[flannel-vxlan]: https://github.com/flannel-io/flannel/blob/master/Documentation/backends.md#vxlan +[coredns-k8s]: https://coredns.io/plugins/kubernetes/ +[ipvs]: 
https://kubernetes.io/blog/2018/07/09/ipvs-based-in-cluster-load-balancing-deep-dive/ +[vxlan-rfc]: https://datatracker.ietf.org/doc/html/rfc7348 diff --git a/docs/deployment/04-firewall.md b/docs/deployment/04-firewall.md new file mode 100644 index 0000000..7c96724 --- /dev/null +++ b/docs/deployment/04-firewall.md @@ -0,0 +1,357 @@ +# 04 — Firewall + +## Summary + +Every node runs UFW (Uncomplicated Firewall, a frontend for iptables) with +a default-deny-incoming policy. Specific ports are allowed from specific +sources only. This chapter lists every rule on every node, why each rule +exists, and what breaks without it. It also traces what happens to an +inbound packet as it goes through iptables, UFW, and the kernel. + +## Policy + +All three nodes have the same UFW config. The policy: + +| Direction | Default | +|---|---| +| **Incoming** | **deny** | +| Outgoing | allow | +| Routed | disabled (we don't NAT) | + +Default deny is a white-list model: unless a rule explicitly allows a +packet, it's dropped. This is more secure than default-allow but requires +that every legitimate port be enumerated in a rule. + +## Current ruleset per node + +Run `sudo ufw status verbose` on any node to see the live ruleset. The +canonical ruleset below, grouped by purpose. + +### Public-facing (anywhere) + +| Port | Protocol | From | Purpose | Comment | +|---|---|---|---|---| +| 22 | TCP | Anywhere | SSH | | +| 80 | TCP | Anywhere | HTTP (Cloudflare → Traefik) | | +| 443 | TCP | Anywhere | HTTPS (future, currently unused at origin) | | + +**Why 443 is open but unused**: We're on Cloudflare SSL=Flexible, so +Cloudflare talks to origin over plain HTTP:80. Port 443 on origin is +only hit by misconfigured clients (who bypass CF DNS and hit node IPs +directly). Traefik's config accepts it but we don't require it. Keeping +it open smooths a future switch to Full (strict) SSL mode. 
+ +**Future hardening**: Restrict 80 and 443 to Cloudflare's published IP +ranges (15 IPv4 CIDRs, 7 IPv6 CIDRs). See [Chapter 13](./13-cloudflare.md) +for the ranges and the UFW rule format. Today they're open to anyone. + +### SSH (operator access) + +| Port | Protocol | From | Purpose | +|---|---|---|---| +| 22 | TCP | Anywhere | SSH login (key-only) | + +SSH is open to the internet but hardened: key-only auth, no root login, +`AllowUsers deploy` configured (the stock distribution still allows root; +we hardened in bootstrap). See [Chapter 5](./05-security.md) for the full +SSH config. + +**TODO** (Chapter 20): Move SSH off :22 to :2222 or similar, tighten to +the operator's current IP. Current state is acceptable given key-only auth; +fail2ban is not installed yet (see Chapter 5). + +### Kubernetes API (kubectl from operator) + +| Port | Protocol | From | Purpose | +|---|---|---|---| +| 6443 | TCP | 47.185.183.191 (operator IP) | kubectl to kube-apiserver | + +When the operator's public IP changes (moves, new ISP), this rule needs +updating on all 3 nodes. Ugly but necessary. A better long-term fix is +**Cloudflare Access** or **Tailscale** to avoid pinning operator IPs. + +### Inter-node cluster traffic + +These rules allow the three nodes to talk to each other for cluster state. +Each node has an allow rule for each of the **three node IPs** (including +its own — the "allow from self" rule exists so local flows are explicit). + +| Port | Protocol | From | Purpose | +|---|---|---|---| +| 6443 | TCP | other nodes | kube-apiserver (server nodes talk to each other) | +| 2379 | TCP | other nodes | etcd client (Raft state reads) | +| 2380 | TCP | other nodes | etcd peer (Raft state writes between server nodes) | +| 10250 | TCP | other nodes | kubelet (metrics, exec, logs from API server) | +| 8472 | UDP | other nodes | Flannel VXLAN overlay | + +### Application-specific (legacy, mostly superfluous on k3s) + +These rules were added during the Swarm era and still exist on the nodes. 
+None of them hurt anything; most are unused on k3s. + +| Port | Protocol | From | Purpose (original) | Status on k3s | +|---|---|---|---|---| +| 2377 | TCP | node IPs | Swarm cluster management | unused (Swarm gone) | +| 7946 | TCP + UDP | node IPs | Swarm gossip | unused | +| 4789 | UDP | node IPs | Swarm VXLAN | unused (k3s uses 8472) | +| (ESP, proto 50) | — | node IPs | IPSec encrypted overlay | unused | +| 500 | UDP | node IPs | IKE key exchange | unused | +| 3000 | TCP | node IPs | admin Next.js, when we tried node-IP hardcoding | unused | + +These can be removed in a cleanup pass. They don't affect security because +no process listens on those ports anymore. + +## Why each required rule exists + +### Port 22 — SSH (public) + +Obviously needed for operator access. Without it we'd have no way to +reach the nodes. Hetzner console's "rescue" mode is an emergency fallback. + +### Port 80 — HTTP (public) + +Cloudflare talks HTTP to origin on port 80 (SSL=Flexible mode). Without +this rule, Cloudflare gets connection-refused and returns 521 to users. + +### Port 443 — HTTPS (public) + +Currently unused in SSL=Flexible mode. Open to smooth the future +Full-strict migration. No process listens on 443 yet; the kernel would +reject connections. Rule is harmless. + +### Port 6443 — kube-apiserver (operator + inter-node) + +**From operator IP**: so `kubectl` works. Without this, `kubectl get pods` +times out. + +**From other nodes**: server nodes check each other's apiservers for +Raft elections and cross-node controller operations. Without this, +nodes can still run pods but can't participate in cluster state changes. + +### Ports 2379/2380 — embedded etcd (inter-node) + +K3s runs etcd as an embedded library inside the server binary. The etcd +client port (2379) and peer port (2380) carry Raft protocol messages +between the three servers. 
**Without these rules, Raft cannot replicate +state and the cluster loses quorum.** + +This bit us during the k3s install — initially the joins failed because +2379/2380 were blocked. + +### Port 10250 — kubelet (inter-node) + +The kubelet on each node exposes an authenticated API for the kube-apiserver +to call — `kubectl logs`, `kubectl exec`, kubelet metrics scraping. +Without this rule, operator commands like `kubectl logs -n honeydue +deploy/api` fail with "Error from server: unable to upgrade connection". + +### Port 8472 UDP — Flannel VXLAN (inter-node) + +Pod-to-pod traffic between nodes flows through VXLAN tunnels on UDP 8472. +**Without this rule, cross-node pod communication silently fails** — which +looks like "admin can't reach api" or "worker can't reach Redis" depending +on where pods land. + +This rule is load-bearing. It is the single most important inter-node +rule. + +## Inbound packet's journey through UFW/iptables + +When a packet arrives at hetzner1's network interface on port 80: + +```mermaid +sequenceDiagram + participant NIC as hetzner1 NIC + participant PRE as iptables
raw + mangle + nat PREROUTING + participant FIL as iptables filter INPUT
(UFW lives here) + participant SOCK as Traefik pod socket
(host network) + + NIC->>PRE: Packet: SYN :80 from CF + PRE->>PRE: conntrack state: NEW + PRE->>FIL: handoff to INPUT chain + FIL->>FIL: UFW rules evaluated + Note over FIL: Rule: allow 80/tcp from anywhere
→ ACCEPT + FIL->>SOCK: delivered to listening socket + SOCK->>SOCK: Traefik accepts connection +``` + +UFW is really a set of wrapper chains on top of iptables. `sudo iptables +-L INPUT -n --line-numbers` on any node shows the actual rules; UFW just +makes editing them easier. + +## Rule syntax we used + +UFW commands we ran during setup (for reference): + +```bash +# Reset to default +sudo ufw --force reset + +# Default deny incoming +sudo ufw default deny incoming +sudo ufw default allow outgoing + +# SSH + web (public) +sudo ufw allow 22/tcp comment 'SSH' +sudo ufw allow 80/tcp comment 'HTTP' +sudo ufw allow 443/tcp comment 'HTTPS' + +# Kubernetes inter-node (repeat for each peer IP) +for ip in 178.104.247.152 178.105.32.198 178.104.249.189; do + sudo ufw allow from "$ip" to any port 6443 proto tcp comment "k3s-api $ip" + sudo ufw allow from "$ip" to any port 2379 proto tcp comment "k3s-etcd-client $ip" + sudo ufw allow from "$ip" to any port 2380 proto tcp comment "k3s-etcd-peer $ip" + sudo ufw allow from "$ip" to any port 10250 proto tcp comment "k3s-kubelet $ip" + sudo ufw allow from "$ip" to any port 8472 proto udp comment "k3s-flannel-vxlan $ip" +done + +# Kubectl from operator +sudo ufw allow from 47.185.183.191 to any port 6443 proto tcp comment 'kubectl from dev' + +# Enable +sudo ufw --force enable +``` + +Rules persist across reboots via `/etc/ufw/user.rules`. + +## What if we used Hetzner Cloud Firewall instead? + +Hetzner Cloud has a provider-level firewall feature — rule-for-rule +equivalent but configured in the Hetzner console (or via API), not on the +nodes. 
Tradeoffs: + +| | Hetzner Cloud Firewall | UFW (current) | +|---|---|---| +| Cost | Free | Free | +| Config location | Hetzner console / API | Per-node `/etc/ufw/` | +| Applies to | All traffic to NIC | All traffic to kernel | +| Failure mode | Provider-side issue = rules gone | Node-side issue = rules gone | +| Inter-node traffic | Same rules for all nodes | Same rules on each node | +| Visible to attacker | Yes (provider fingerprints) | Yes (iptables probe) | +| Rule ordering | UI-based | `iptables -L` | + +Either works. A future improvement: move the stable rules to Hetzner +Cloud Firewall (one source of truth) and leave only the dynamic rules +(operator IP, ad-hoc debug) on the nodes. + +## Why we don't use iptables directly + +UFW is a frontend. `iptables` works, but the rules are harder to read and +edit. `sudo ufw allow from X to any port Y proto Z comment 'Z-rule'` is +clearer than writing the equivalent `-A INPUT ...` rule directly. + +Also, UFW's `comment` field lets us explain each rule, which becomes +critical when the ruleset grows past ~10 rules. 
+ +## Testing the firewall + +From the operator workstation (47.185.183.191): + +```bash +# Should work (22/tcp open) +ssh deploy@hetzner1 exit + +# Should work (80/tcp open) +curl -I -H "Host: api.myhoneydue.com" http://hetzner1/api/health/ + +# Should work (443/tcp open; TLS handshake will fail because nothing listens) +curl -kI https://178.104.247.152/ + +# Should work (6443 allowed from operator IP) +export KUBECONFIG=~/.kube/honeydue-k3s.yaml +kubectl get nodes + +# Should time out (default-deny from arbitrary ports) +curl http://178.104.247.152:3000/ # not open to operator +curl http://178.104.247.152:6379/ # Redis not exposed publicly +``` + +From another peer node (hetzner2 trying to reach hetzner1): + +```bash +# Should work (k3s API allowed from peer node IPs) +curl -k https://178.104.247.152:6443/healthz + +# Should work (etcd client from peer) +nc -zv 178.104.247.152 2379 +``` + +## The hidden dependency: kubelet/containerd also need ports + +Beyond the UFW rules, the kubelet also listens on: +- **10255/tcp** — kubelet read-only port (no auth, deprecated; disabled by default in k3s) +- **10256/tcp** — kube-proxy health +- **10257/tcp** — kube-controller-manager health +- **10259/tcp** — kube-scheduler health + +These are bound to `localhost` only, so they don't need UFW rules. But +they're important to know about when debugging — if one of these health +endpoints isn't responding, the relevant component is broken. + +## Legacy rules to clean up + +The following rules are on the nodes from the Swarm era and can be +removed in a future cleanup pass: + +```bash +# On each node, list Swarm-era rules +sudo ufw status numbered | grep -E "2377|7946|4789|500|3000|esp" + +# Remove by number (highest-to-lowest to avoid renumbering) +# Example: +sudo ufw --force delete 15 +sudo ufw --force delete 14 +# ... etc. 
+``` + +We left them in because they don't affect security (no process listens on +those ports), and removing them requires careful testing that nothing in +k3s secretly relies on 4789/udp or similar. + +## Operator cheat sheet + +```bash +# Show the ruleset, with comments, numbered +sudo ufw status numbered verbose + +# Add a new rule +sudo ufw allow from <source-ip> to any port <port> proto <tcp|udp> comment '<description>' + +# Remove a rule by number +sudo ufw status numbered +sudo ufw --force delete <rule-number> + +# Temporarily disable all rules (emergency) +sudo ufw disable + +# Re-enable +sudo ufw enable + +# Reload after editing /etc/ufw/ files directly +sudo ufw reload +``` + +## What to do if the firewall locks you out + +Worst case: you apply a rule that blocks your own SSH, UFW applies it +immediately, and you can't log back in. Recovery: + +1. Hetzner Cloud Console → Server → Rescue mode +2. Boot into rescue, mount the disk +3. Edit `/etc/ufw/user.rules` to remove the bad rule +4. Reboot back into normal mode + +This has never happened to us but it's the escape hatch. The Console is +always a TLS login away. + +## References + +- [UFW man page][ufw-man] +- [K3s networking requirements][k3s-reqs] +- [Kubernetes ports and protocols][k8s-ports] +- [Cloudflare IP ranges][cf-ips] + +[ufw-man]: https://manpages.ubuntu.com/manpages/noble/en/man8/ufw.8.html +[k3s-reqs]: https://docs.k3s.io/installation/requirements#networking +[k8s-ports]: https://kubernetes.io/docs/reference/networking/ports-and-protocols/ +[cf-ips]: https://www.cloudflare.com/ips/ diff --git a/docs/deployment/05-security.md b/docs/deployment/05-security.md new file mode 100644 index 0000000..755ee75 --- /dev/null +++ b/docs/deployment/05-security.md @@ -0,0 +1,526 @@ +# 05 — Security + +## Summary + +Security on this deployment is layered: Cloudflare at the edge, UFW at +the node, k3s RBAC + Pod Security at the orchestrator, TLS between +long-haul components, and dedicated service accounts with dropped +capabilities inside containers. 
This chapter documents each layer, the +rationale, and what's currently missing (and why). + +## Threat model + +Who we're defending against, in rough order of likelihood: + +1. **Opportunistic scanners** — bots scanning random IPv4 ranges for + known vulnerabilities. Mitigated by the firewall. +2. **Credential stuffing / brute-force** — especially against SSH and + admin login. Mitigated by key-only SSH, strong passwords, rate limits. +3. **Compromised external service** — if Neon, Backblaze, or Cloudflare + were breached, attacker would have access to whatever we store there. + Mitigated by scoped credentials, least-privilege API keys. +4. **Compromised container image** — if Gitea or our build pipeline + were compromised, malicious code could reach prod. Mitigated by + (a) Gitea is behind authentication, (b) image pull secrets scoped, + (c) containers run non-root with minimal capabilities. +5. **Insider threat** — not really a threat for a solo operator. +6. **State actor** — not in threat model. At our scale this is + effectively unaddressable without becoming a security company. + +Explicitly **not** in threat model: +- DDoS at a scale that saturates Cloudflare. We pay $0 for CF; their + DDoS mitigation is included but not unlimited. If we got hit with a + large attack, we'd move to a paid plan. +- Physical access to Hetzner datacenters. That's their problem. + +## Layer 1 — Cloudflare edge + +Cloudflare sits in front of every public request. 
+ +### What Cloudflare does for us + +| Protection | How it works | +|---|---| +| TLS termination | CF presents a cert for `*.myhoneydue.com`; clients encrypt to CF | +| DDoS mitigation | Automatic on all plans including Free | +| Bot filtering | "Under Attack" mode + bot score based blocking | +| IP concealment | Origin IPs not in DNS; attackers can't directly scan | +| WAF rules | CF Free includes managed ruleset for common exploits | +| Rate limiting | Free tier: 10k requests/10min; more on paid plans | + +### What Cloudflare does **not** do + +- **Authenticate users** — that's the app's job +- **Authorize requests** — that's the app's job +- **Protect origin if origin IP leaks** — once someone knows a node IP + they can bypass CF. Mitigation: keep origin firewall strict (Chapter 4). +- **Encrypt between CF and origin** — we're on SSL=Flexible, so CF↔origin + is HTTP. This is in our TODO (Chapter 20, upgrade to Full-strict). + +### The proxy-IP problem + +Cloudflare publishes its IP ranges +([cloudflare.com/ips](https://www.cloudflare.com/ips/)). Any client can +verify a request came from a CF IP by checking the remote address. Our +Traefik is configured to trust `X-Forwarded-Proto` (so the Go API sees +`https` even though origin received HTTP) only from CF IP ranges: + +```yaml +# deploy-k3s/manifests/traefik-helmchartconfig.yaml +additionalArguments: + - "--entrypoints.web.forwardedHeaders.trustedIPs=173.245.48.0/20,..." +``` + +This means a malicious request that bypasses CF (by hitting the node IP +directly) can't spoof headers — Traefik ignores `X-Forwarded-*` unless +the source IP is in CF's ranges. + +**TODO** (Chapter 20): Enforce at UFW level — allow 80/tcp only from +CF IP ranges. Today any IP can reach the origin on port 80. 
+ +## Layer 2 — Node (OS, SSH, firewall) + +Each node runs Ubuntu 24.04.3 LTS with: + +### SSH hardening + +`/etc/ssh/sshd_config` on each node: + +``` +Port 22 +PermitRootLogin no +PasswordAuthentication no +PubkeyAuthentication yes +AllowUsers deploy +``` + +Result: +- Only the `deploy` user can log in +- Only with a public key (no password) +- Root cannot log in remotely + +The public key authorized for `deploy`: + +``` +ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIBU9xTTBD78tYUqHijgyU9PDqtmS4NuM/6uy8XgDzva+ hetzner2@myhoneydue.com +``` + +(Note: the comment field says "hetzner2" but it's the key for all three +nodes — the comment is the key's identifier, not a restriction.) + +Private key is at `~/.ssh/hetzner` on the operator workstation. + +### Sudo + +The `deploy` user has unrestricted sudo with no password +(`/etc/sudoers.d/deploy`): + +``` +deploy ALL=(ALL) NOPASSWD: ALL +``` + +This is convenient but broad. A compromise of the `deploy` SSH key = +root on the node. Mitigations: +- Key is stored only on the operator workstation, not checked into git +- Operator workstation has disk encryption (macOS FileVault) +- Operator workstation has a passphrase for the key (ssh-agent cache) + +Future hardening: scope sudo to specific commands that deploy workflows +need (e.g., `/usr/sbin/ufw`, `/usr/bin/systemctl`), but this requires +enumerating every command we might run, which breaks ad-hoc debugging. + +### fail2ban + +**Not installed.** fail2ban would ban IPs that fail SSH auth repeatedly. +Because we disable password auth entirely, the attack surface is tiny (an +attacker with the private key wins; failed-public-key attempts are +functionally DDoS, not credential-stuffing). Installing fail2ban is on +the TODO list anyway because it buys us rate-limiting on SSH bot noise. + +### unattended-upgrades + +**Not installed.** Security patches require manual `apt upgrade`. This is +a gap. Install and configure for security-only updates as soon as time +permits. 
+ +### UFW firewall + +See [Chapter 4](./04-firewall.md) for the complete ruleset. Summary: +default-deny incoming, specific allows for SSH (22), HTTP (80), HTTPS +(443), k3s API from operator IP (6443), and inter-node cluster ports. + +## Layer 3 — Kubernetes RBAC + +K3s inherits full Kubernetes RBAC. Every component that talks to the API +server has a ServiceAccount with only the permissions it needs. + +### System accounts + +K3s creates these by default: +- `kube-system:admin` — cluster admin, used by `kubectl` +- `kube-system:coredns` — for CoreDNS +- `kube-system:traefik` — for Traefik ingress controller +- `kube-system:helm-install-traefik` — for the Helm chart installer + +We don't touch these. + +### Application service accounts + +Our `rbac.yaml` creates four ServiceAccounts in the `honeydue` namespace: + +```yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: api + namespace: honeydue +automountServiceAccountToken: false # ← important +``` + +Same for `admin`, `worker`, `redis`. + +**`automountServiceAccountToken: false`** means pods don't get a k8s +API token mounted in `/var/run/secrets/kubernetes.io/serviceaccount/`. +Without that token, a compromised pod cannot query the Kubernetes API even if +the default service account has broad permissions. + +### What the app pods CAN'T do + +Our app service accounts have **no RoleBindings or ClusterRoleBindings**. 
+They cannot: +- List, get, create, update, delete any Kubernetes resource +- Read other namespaces' secrets +- Schedule workloads +- View cluster state + +If the api container were fully compromised (RCE), the attacker would +have: +- Network access to other pods in the `honeydue` namespace (Chapter 16) +- Read access to our ConfigMap + Secrets (mounted into the container) +- No ability to pivot to other parts of the cluster via the k8s API + +## Layer 4 — Pod Security + +Every pod runs with restrictive security context: + +```yaml +securityContext: + runAsNonRoot: true + runAsUser: 1000 # api; different per service + runAsGroup: 1000 + fsGroup: 1000 + seccompProfile: + type: RuntimeDefault + +containers: + - securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: ["ALL"] +``` + +### What each setting does + +| Setting | Effect | +|---|---| +| `runAsNonRoot: true` | Pod refuses to start if the image's default user is root | +| `runAsUser: 1000` | Override to UID 1000 (app user) | +| `allowPrivilegeEscalation: false` | Process cannot become root via setuid, ptrace, etc. | +| `readOnlyRootFilesystem: true` | `/` is read-only; writes require explicit volumes | +| `capabilities: drop: [ALL]` | No Linux capabilities (NET_ADMIN, SYS_TIME, etc.) | +| `seccompProfile: RuntimeDefault` | Restrict syscalls to containerd's default seccomp allowlist | + +Read-only root means our app images must declare writable volumes for +anything mutable: + +```yaml +volumeMounts: + - name: tmp + mountPath: /tmp +volumes: + - name: tmp + emptyDir: + sizeLimit: 64Mi +``` + +If the app needs to write somewhere else (e.g., Next.js cache), we mount +an emptyDir there explicitly. + +### Traefik exception + +Traefik needs `CAP_NET_BIND_SERVICE` to bind ports 80/443 on the host +network. 
Its security context adds just that one capability back: + +```yaml +securityContext: + capabilities: + drop: [ALL] + add: [NET_BIND_SERVICE] + readOnlyRootFilesystem: true + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 +``` + +The `net.ipv4.ip_unprivileged_port_start=0` sysctl on the nodes +complements this — on older kernels NET_BIND_SERVICE alone isn't enough +in the host netns. + +### Pod Security Admission (PSA) + +Kubernetes has a built-in admission controller for enforcing Pod Security +Standards at the namespace level: + +```yaml +apiVersion: v1 +kind: Namespace +metadata: + name: honeydue + labels: + pod-security.kubernetes.io/enforce: restricted + pod-security.kubernetes.io/enforce-version: latest +``` + +We **don't currently set this**. We get the equivalent effect from +the explicit securityContext on each pod, but namespace-level enforcement +would catch new workloads that forget to set it. **TODO** (Chapter 20). + +## Layer 5 — Network Policies + +The `deploy-k3s/manifests/network-policies.yaml` scaffold defines: + +- **default-deny-all** — deny all ingress and egress by default in the + `honeydue` namespace +- **allow-dns** — allow egress UDP/TCP 53 to CoreDNS +- **allow-ingress-to-api** — allow Traefik (`kube-system` namespace) to + reach api pods on port 8000 +- **allow-ingress-to-admin** — same, for admin:3000 + +**These are not currently applied.** Without them, our pods can freely +talk to anything — including, theoretically, malicious destinations if +an attacker gets RCE inside a pod. + +**TODO** (Chapter 20): Apply network policies. The scaffold is there; we +just need to `kubectl apply -f deploy-k3s/manifests/network-policies.yaml` +and test that nothing breaks. 
+ +### What network policies would prevent + +| Attack scenario | NetworkPolicy blocks | +|---|---| +| Pod A compromised, attacker SSHs sideways to pod B | Yes (explicit allow needed) | +| Pod RCE → scan internal networks | Yes (default deny egress) | +| Pod RCE → exfil to attacker's C2 | Yes (outbound to internet needs egress rule) | + +Without policies, all of these work. + +## TLS and encryption + +### CF ↔ user + +Always TLS 1.2+ (CF doesn't support older). CF presents an automatically- +renewed Let's Encrypt or CF-managed cert for `*.myhoneydue.com`. + +### CF ↔ origin + +**Plaintext HTTP** (SSL = Flexible). An attacker with access to the +Cloudflare-to-Hetzner path could read traffic. In practice nobody who +isn't Cloudflare or Hetzner sits on that path. + +**TODO** (Chapter 20): Upgrade to SSL = Full (strict) with a Cloudflare +Origin CA certificate. This encrypts CF ↔ origin and verifies that +origin's cert is the CF-issued one (prevents MitM if DNS is compromised). + +### API ↔ Neon Postgres + +**TLS 1.3** via `DB_SSLMODE=require`. The Go app's postgres driver (pgx) +negotiates TLS; note that `require` encrypts the connection but does not verify Neon's cert (that needs `verify-full`). +Connection fails if TLS can't be established. + +### API ↔ Backblaze B2 + +**HTTPS** (B2 doesn't support HTTP). `B2_USE_SSL=true` in our ConfigMap +(though actually the app reads `STORAGE_USE_SSL` — see Chapter 9 for this +vestigial variable's story). + +### Worker ↔ Fastmail SMTP + +**STARTTLS** on port 587. The Go `wneessen/go-mail` library uses +`TLSOpportunistic` mode — which means it connects plain then upgrades via +STARTTLS. Fastmail always supports STARTTLS, so in practice every +connection is encrypted. + +### API/worker ↔ Redis + +**Plaintext** inside the cluster. Redis 7 supports TLS (redis-tls.conf, +`redis-server --tls-port`), but we haven't enabled it because Redis is +on the overlay network, not exposed externally, and only holds cache + +queue state. 
+ +### Pod-to-pod (Flannel overlay) + +**Plaintext VXLAN** over Hetzner's public network. See +[Chapter 3 §Layer 3](./03-networking.md#layer-3--pod-overlay-flannel-vxlan). +TODO to switch to WireGuard backend. + +## Secrets management + +### Kubernetes Secrets + +Our k8s Secrets are stored in etcd. etcd-at-rest encryption is **not +currently enabled** — a compromise of the etcd data directory would +expose Secret values. Given: +- Nodes have disk encryption at the Hetzner hypervisor layer +- Attacker needs root on the node to read etcd +- Our operator access is already root-via-sudo + +This is an accepted risk. **TODO** (Chapter 20): enable encryption at rest +for etcd. K3s supports it via `--secrets-encryption` flag on the server. + +### What Secrets we have + +``` +$ kubectl get secrets -n honeydue +NAME TYPE DATA AGE +gitea-credentials kubernetes.io/dockerconfigjson 1 ... +honeydue-apns-key Opaque 1 ... +honeydue-secrets Opaque 9 ... +``` + +Contents: + +| Secret | Key | Source | +|---|---|---| +| `gitea-credentials` | `.dockerconfigjson` | PAT for Gitea registry (image pulls) | +| `honeydue-apns-key` | `apns_auth_key.p8` | Placeholder p8 file (push off) | +| `honeydue-secrets` | `POSTGRES_PASSWORD` | Neon DB password | +| `honeydue-secrets` | `SECRET_KEY` | 64-char random, app signing key | +| `honeydue-secrets` | `EMAIL_HOST_PASSWORD` | Fastmail app password | +| `honeydue-secrets` | `FCM_SERVER_KEY` | "disabled-no-push-accounts-yet" placeholder | +| `honeydue-secrets` | `REDIS_PASSWORD` | Empty (no auth on internal Redis) | +| `honeydue-secrets` | `B2_KEY_ID` | B2 app key ID | +| `honeydue-secrets` | `B2_APP_KEY` | B2 app key secret | +| `honeydue-secrets` | `ADMIN_EMAIL` | `admin@myhoneydue.com` | +| `honeydue-secrets` | `ADMIN_PASSWORD` | Generated 24-char initial admin password | + +### Source of truth + +The Secret values came from: +- `deploy/secrets/*.txt` files on the operator workstation (gitignored) +- `deploy/prod.env` (gitignored) +- 
`deploy/registry.env` (gitignored) + +These Swarm-era files are still the canonical source. If you need to +recreate Secrets in a new cluster: + +```bash +cd honeyDueAPI-go +kubectl create secret generic honeydue-secrets -n honeydue \ + --from-literal=POSTGRES_PASSWORD="$(cat deploy/secrets/postgres_password.txt)" \ + --from-literal=SECRET_KEY="$(cat deploy/secrets/secret_key.txt)" \ + --from-literal=EMAIL_HOST_PASSWORD="$(cat deploy/secrets/email_host_password.txt)" \ + ... +``` + +The full recreation script is in Chapter 17 (Runbook). + +### Secret rotation + +Not automated. To rotate (e.g., after a compromise): + +1. Generate new value: `openssl rand -base64 32` +2. Update only the rotated key (a merge patch leaves the other keys intact): + ```bash + kubectl patch secret honeydue-secrets -n honeydue \ + --type merge \ + -p '{"stringData":{"SECRET_KEY":"new-value"}}' + ``` +3. Restart dependent pods: + ```bash + kubectl rollout restart -n honeydue deploy/api deploy/worker + ``` +4. Update `deploy/secrets/secret_key.txt` to match +5. Revoke the old credential at the source (Neon, Fastmail, etc.) + +## Container image provenance + +Images come from `gitea.treytartt.com/admin/*`. We have **no image +signing or verification** (cosign/sigstore) in place. A compromise of +the Gitea registry = the ability to push malicious images that would be +pulled into prod on the next rollout. + +Mitigations: +- Gitea itself is behind login; PAT is scoped to read:packages + + write:packages only +- Gitea runs on the operator's infrastructure (same operator account) +- Image tags are SHA-pinned (`:237c6b8`) not `:latest` → attacker can't + replace an existing tag's image without us noticing the digest change + +**TODO** (Chapter 20): Add cosign signing at build time, verify at pull +time.
+ +## Operator workstation security + +The operator workstation has: +- macOS with FileVault (full disk encryption) +- Login password required +- Private keys in `~/.ssh/` (mode 0600) +- Kubeconfig at `~/.kube/honeydue-k3s.yaml` (mode 0600) — contains the + cluster-admin client certificate and key + +**Losing the laptop would require immediate credential rotation:** +- New SSH key; replace the old public key on all 3 nodes +- New kubeconfig: run `sudo cat /etc/rancher/k3s/k3s.yaml` on hetzner1, + copy to workstation, update `KUBECONFIG` env +- Rotate operator-access PATs on Gitea, Neon, Cloudflare, Backblaze + +## Compliance notes + +This stack is **not currently certified** for: +- HIPAA — we transmit and store health-related data but haven't signed + any BAA +- SOC 2 — no auditing, no documented controls beyond this document +- PCI-DSS — we don't handle card data; Apple/Google IAP handles payments +- GDPR — we follow GDPR best practices (data minimization, user deletion) + but haven't had a formal assessment + +If honeyDue ever needs any of these, the infrastructure is compatible +but the operational processes around it would need formal work.
+ +## Operator cheat sheet + +```bash +# See all RBAC-related resources in a namespace +kubectl get sa,role,rolebinding -n honeydue + +# Check what a ServiceAccount can do +kubectl auth can-i --list --as=system:serviceaccount:honeydue:api -n honeydue + +# Verify pods are running with the expected security context (all pods in the namespace) +kubectl get pod -n honeydue -o jsonpath='{.items[*].spec.securityContext}' +kubectl get pod -n honeydue -o jsonpath='{.items[*].spec.containers[0].securityContext}' + +# List all Secrets (without revealing content) +kubectl get secret -n honeydue +kubectl describe secret honeydue-secrets -n honeydue # shows keys, not values + +# Decode a secret (CAREFUL: prints plaintext) +kubectl get secret honeydue-secrets -n honeydue -o jsonpath='{.data.SECRET_KEY}' | base64 -d +``` + +## References + +- [Kubernetes Pod Security Standards][psa] +- [Kubernetes RBAC][rbac] +- [Kubernetes NetworkPolicy][netpol] +- [Cloudflare IP ranges][cf-ips] +- [K3s secrets encryption][k3s-secrets] +- [SSH hardening guide][ssh-guide] + +[psa]: https://kubernetes.io/docs/concepts/security/pod-security-standards/ +[rbac]: https://kubernetes.io/docs/reference/access-authn-authz/rbac/ +[netpol]: https://kubernetes.io/docs/concepts/services-networking/network-policies/ +[cf-ips]: https://www.cloudflare.com/ips/ +[k3s-secrets]: https://docs.k3s.io/security/secrets-encryption +[ssh-guide]: https://linux-audit.com/audit-and-harden-your-ssh-configuration/ diff --git a/docs/deployment/06-traefik-ingress.md b/docs/deployment/06-traefik-ingress.md new file mode 100644 index 0000000..922d549 --- /dev/null +++ b/docs/deployment/06-traefik-ingress.md @@ -0,0 +1,419 @@ +# 06 — Traefik Ingress + +## Summary + +Traefik is the reverse proxy that routes external HTTP requests to the +right application pod based on the `Host:` header. We run Traefik v3 as a +Kubernetes DaemonSet with `hostNetwork: true` — each of the three nodes +has its own Traefik pod listening directly on the node's `:80`/`:443`.
+Cloudflare round-robins DNS across the three node IPs, so any node can +serve any request. No external load balancer. + +## Why Traefik + +K3s bundles Traefik by default. The alternatives: + +| Option | Pros | Cons | +|---|---|---| +| **Traefik v3 (bundled)** | Zero install, excellent k8s integration, middleware system, active development | Helm-driven config is indirect | +| NGINX Ingress | Most popular, battle-tested | Another thing to install, more config surface | +| HAProxy Ingress | Extremely performant | More hands-on, older docs | +| Caddy | Simple config, auto-HTTPS | `caddy-docker-proxy` / Ingress integration is less mature | +| Envoy / Istio | Most featureful | Massive overkill at our scale | + +Traefik came "free" with K3s, does the job, and its +[Swarm provider][traefik-swarm] is what we would have used if we'd +fixed our Swarm architecture. Using it on k3s keeps the mental model +consistent. + +## Deployment model + +```mermaid +flowchart TB + subgraph CF[Cloudflare edge] + DNS[DNS A records:<br/>api.myhoneydue.com → 3 node IPs<br/>admin.myhoneydue.com → 3 node IPs] + end + + subgraph N1[hetzner1] + T1[Traefik pod<br/>hostNetwork:true<br/>:80/:443] + kernel1[Linux kernel<br/>net.ipv4.ip_unprivileged_port_start=0] + end + subgraph N2[hetzner2] + T2[Traefik pod<br/>hostNetwork:true<br/>:80/:443] + kernel2[Linux kernel] + end + subgraph N3[hetzner3] + T3[Traefik pod<br/>hostNetwork:true<br/>:80/:443] + kernel3[Linux kernel] + end + + subgraph Cluster[k3s cluster services] + APISvc[api Service :8000] + AdminSvc[admin Service :3000] + end + + DNS -. HTTP :80 .-> T1 & T2 & T3 + T1 & T2 & T3 -- reverse_proxy --> APISvc & AdminSvc +``` + +### ASCII fallback + +``` + Cloudflare DNS + ┌───────────────────┐ + │ api → 3 IPs │ + │ admin→ 3 IPs │ + └─────────┬─────────┘ + │ HTTP :80 + ┌───────────────────┼───────────────────┐ + ▼ ▼ ▼ + ┌──────────┐ ┌──────────┐ ┌──────────┐ + │ hetzner1 │ │ hetzner2 │ │ hetzner3 │ + │ Traefik │ │ Traefik │ │ Traefik │ + │ :80/443 │ │ :80/443 │ │ :80/443 │ + │(hostNet) │ │(hostNet) │ │(hostNet) │ + └────┬─────┘ └────┬─────┘ └────┬─────┘ + │ │ │ + └── ClusterIP ──────┼── ClusterIP ──────┘ + ▼ + ┌────────────────────────┐ + │ api Service :8000 │ + │ admin Service :3000 │ + └────────────────────────┘ +``` + +## Why DaemonSet + hostNetwork + +**What we're trying to achieve**: Any public-facing node should answer +:80/:443. Cloudflare round-robins DNS; whichever node it picks, that +node must serve. + +**The default k3s Traefik deployment** is a single-replica Deployment +exposed via a LoadBalancer Service. That requires either: +- Hetzner Load Balancer (+ $8.49/mo, another thing to manage), **or** +- K3s' built-in `servicelb` (klipper-lb) which binds node ports + dynamically to proxy to the Service + +Neither was quite what we wanted. With three replicas of the stock Traefik +behind klipper-lb, each Traefik pod is reachable but there's an extra hop +through klipper's proxy daemon. + +**DaemonSet + hostNetwork** is cleaner: each Traefik pod *is* the host's +:80/:443. No proxy daemon, no LB Service, no VIP. Cloudflare DNS → +node IP → kernel → Traefik, one hop.
+ +### Trade-offs of hostNetwork + +**Pro:** +- One fewer layer of indirection; lower latency +- No Service needed; no kube-proxy in the ingress path +- Standard Cloudflare round-robin DNS is the failover mechanism + +**Con:** +- Traefik is in the host netns; it sees the node's interfaces, not + the cluster overlay +- Traefik still joins the cluster-DNS resolution (via `hostNetwork`'s + default DNS policy) so it can resolve Service names like `api` +- Port conflicts possible if anything else wants :80/:443 on the node + (nothing else does in our setup) + +### Trade-offs of DaemonSet + +**Pro:** +- One Traefik per node; matches our Cloudflare 3-IP round-robin + exactly +- Any node down = Cloudflare's origin health checks route around it + +**Con:** +- Updates require `maxUnavailable > 0` (host ports conflict during + surge) → brief moment where one node is down during rollout +- 3× the memory usage vs. 1-replica Deployment (but Traefik is tiny + — ~128 MB total across all three) + +## Our Traefik configuration + +We reconfigure the bundled K3s Traefik via a `HelmChartConfig`. K3s +uses the `helm-controller` to manage bundled addons; `HelmChartConfig` +lets us override values without disabling-and-replacing the chart. + +Full config at +`deploy-k3s/manifests/traefik-helmchartconfig.yaml`. 
Key settings: + +```yaml +apiVersion: helm.cattle.io/v1 +kind: HelmChartConfig +metadata: + name: traefik + namespace: kube-system +spec: + valuesContent: |- + deployment: + kind: DaemonSet # was Deployment + hostNetwork: true + service: + enabled: false # no LoadBalancer Service + ports: + web: + port: 80 + hostPort: 80 + websecure: + port: 443 + hostPort: 443 + updateStrategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 1 + maxSurge: 0 + securityContext: + capabilities: + drop: [ALL] + add: [NET_BIND_SERVICE] + readOnlyRootFilesystem: true + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 + additionalArguments: + - "--entrypoints.web.forwardedHeaders.trustedIPs=<comma-separated Cloudflare CIDRs>" +``` + +### Why each setting + +- **`kind: DaemonSet`** — one Traefik per node. Default is a Deployment + with 1 replica. +- **`hostNetwork: true`** — Traefik runs in the host's network namespace + so it can bind real :80/:443 on the node. +- **`service.enabled: false`** — no LoadBalancer Service is created. + With `hostNetwork`, we don't need one. +- **`ports.*.hostPort`** — explicit host port binding. Matches the + container port (DaemonSet semantics with `hostPort: 80` ensure the + kubelet schedules at most one Traefik per node). +- **`updateStrategy.maxUnavailable: 1, maxSurge: 0`** — we accept one + node being down during a Traefik update (host port can't be shared). + The Traefik Helm chart rejects `hostNetwork` combined with + `maxSurge > 0` — that rejection is why this was the second config iteration. +- **Security context** — non-root (UID 65532), read-only root filesystem, + only `NET_BIND_SERVICE` capability. See Chapter 5. +- **`forwardedHeaders.trustedIPs`** — Cloudflare's IP ranges. Traefik + trusts `X-Forwarded-Proto` et al. only from these ranges, so a + bypassing client can't spoof the proto header. + +### Forwarded-headers trustedIPs + +The full list of trusted CF ranges is in our `additionalArguments`. It's +the union of CF's published IPv4 and IPv6 ranges.
When Cloudflare passes +a request to origin, it adds `X-Forwarded-For` and `X-Forwarded-Proto` +headers; Traefik only honors these if the request came from one of these +IPs. Every other client's headers are ignored. + +If CF publishes new IP ranges (rare but possible), the +`trustedIPs` list needs updating. It's a raw string in our +HelmChartConfig — we'd need to edit, apply, and bump the helm job. + +## Traefik v3 vs v2 + +K3s ships Traefik v3 (currently `3.6.10`). The v2 → v3 migration +changed a few things: +- `swarmMode` removed (replaced by a `swarm` provider, but we don't + use Swarm anyway) +- Encoded-character handling changed (v3 warns about RFC 3986 handling; + we ignore the warning) +- Middleware CRD apiVersion is `traefik.io/v1alpha1` (was `traefik.containo.us/v1alpha1`) + +Our deployment handles all of this automatically via the bundled +chart. + +## Ingress resources + +We define two standard k8s `Ingress` resources in +`deploy-k3s/manifests/ingress/ingress-simple.yaml`: + +```yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: honeydue-api + namespace: honeydue +spec: + ingressClassName: traefik + rules: + - host: api.myhoneydue.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: {name: api, port: {number: 8000}} + - host: myhoneydue.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: {name: api, port: {number: 8000}} +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: honeydue-admin + namespace: honeydue +spec: + ingressClassName: traefik + rules: + - host: admin.myhoneydue.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: {name: admin, port: {number: 3000}} +``` + +Traefik watches for Ingress resources with `ingressClassName: traefik` +and programs its router table accordingly. Changes are applied within +seconds — no restart needed. + +### What pathType: Prefix means + +Every request starting with `/` matches (which is everything).
Alternative +is `Exact` (matches only the literal path). `Prefix` is the default for +most Ingress controllers and matches how users think about URL routing. + +## How requests flow + +1. **Cloudflare DNS** resolves `api.myhoneydue.com` to one of three IPs + (round-robin). Say it picks `178.105.32.198` (hetzner2). +2. **Cloudflare edge** establishes TCP to `178.105.32.198:80` (plain HTTP, + SSL=Flexible). Original HTTPS terminated at CF. +3. **UFW on hetzner2** accepts the SYN (80/tcp open from anywhere). +4. **Linux kernel** sees a listener on 0.0.0.0:80 (the Traefik pod). + Hands off the SYN. +5. **Traefik accepts** the connection. Reads the HTTP request. +6. **Traefik matches** the `Host:` header against its router table. + `Host: api.myhoneydue.com` → `honeydue-api` Ingress → `api` Service. +7. **Traefik dials** `10.43.167.83:8000` (api Service ClusterIP). This + goes through the cluster DNS (CoreDNS) and kube-proxy (IPVS). +8. **kube-proxy IPVS** rewrites the destination to a live api pod endpoint + — say `10.42.2.6:8000` (api pod on hetzner3). +9. **Flannel VXLAN** encapsulates the packet and sends to hetzner3 + (UDP :8472 between node IPs). +10. **hetzner3's kernel** decapsulates, delivers to the api pod. +11. **api pod** processes, returns response. +12. **Response flows back** the reverse path. + +Cloudflare caches 200 responses at the edge (default TTL varies; for +HTML/JSON usually 0 unless we set `Cache-Control` headers). So the +second request for the same URL might not reach the origin at all. + +## Middleware (mostly unused) + +Traefik supports middleware — small functions run before/after the proxy. +The `deploy-k3s/manifests/ingress/middleware.yaml` scaffold defines: + +- **`rate-limit`** — 100 req/min average, 200 burst +- **`security-headers`** — HSTS, X-Frame-Options, CSP, etc. 
+- **`cloudflare-only`** — IP allowlist restricting origin to CF ranges +- **`admin-auth`** — HTTP basic auth for admin panel + +**None of these are currently attached to our Ingresses.** To enable, +add the `traefik.ingress.kubernetes.io/router.middlewares` annotation to +the Ingress: + +```yaml +metadata: + annotations: + traefik.ingress.kubernetes.io/router.middlewares: honeydue-security-headers@kubernetescrd,honeydue-rate-limit@kubernetescrd +``` + +We left them off to minimize surface area for the first week of the new +cluster. Enabling is TODO in Chapter 20. + +## Traefik dashboard + +Disabled. The Traefik dashboard (`/dashboard/` and `/api/`) exposes +runtime state and is potentially information leaky. The bundled k3s +Traefik disables it by default, and we haven't re-enabled it. + +If needed for debugging: + +```bash +# Port-forward to a Traefik pod +kubectl port-forward -n kube-system daemonset/traefik 9000:9000 +# (the chart exposes the dashboard on :9000 when enabled) +# Then visit http://localhost:9000/dashboard/ +``` + +This requires kubectl access and isn't exposed publicly. + +## Version pinning + +We take whatever Traefik version is bundled with K3s (currently 3.6.10). +The bundled chart is pinned to a specific version in K3s' release notes; +when we upgrade K3s the Traefik version can change. If that ever breaks +something, we can pin a specific version via the HelmChartConfig's +`version` field: + +```yaml +spec: + version: 39.0.501+up39.0.5 # specific chart version +``` + +## Limitations we accept + +- **No sticky sessions.** Every request to `api.myhoneydue.com` can go + to a different pod. Our Go API is stateless — this is fine. +- **No canary deployments** (yet). Traefik supports weighted routing + via its CRDs (`TraefikService`) but we don't use them. TODO if/when + we do gradual rollouts. +- **No mTLS.** Traefik supports mutual TLS client auth for sensitive + endpoints. We don't use it. 
+- **Single ingress class.** Everything goes through the same Traefik. + For multi-tenant setups we'd want separate ingress classes with + separate policies. + +## Troubleshooting + +| Symptom | Likely cause | Fix | +|---|---|---| +| 404 from Traefik | Ingress doesn't match `Host:` | Check Ingress host field, DNS | +| 502 from Traefik | Backend Service has no endpoints | `kubectl get endpoints -n honeydue` | +| 503 from Traefik | Circuit breaker / backend unhealthy | Check pod logs, readiness probe | +| 504 from Traefik | Backend slow | Check pod CPU/memory, DB connections | +| Connection refused at 80 | Traefik pod not running or kernel not listening | `kubectl get pods -n kube-system -l app.kubernetes.io/name=traefik`; `ssh deploy@node 'ss -lntp \| grep :80'` | +| Mixed content error in browser | `X-Forwarded-Proto` not honored by app | Check `trustedIPs` includes CF; check app reads the header | + +## Operator cheat sheet + +```bash +# Traefik pods per node +kubectl get pods -n kube-system -l app.kubernetes.io/name=traefik -o wide + +# Traefik logs (all pods) +kubectl logs -n kube-system -l app.kubernetes.io/name=traefik --tail=50 --prefix + +# Ingress status +kubectl get ingress -n honeydue + +# Traefik self-health check via the ping endpoint (listing routers requires the dashboard or API) +kubectl exec -n kube-system daemonset/traefik -- traefik healthcheck + +# Re-apply config +kubectl apply -f deploy-k3s/manifests/traefik-helmchartconfig.yaml +kubectl delete job -n kube-system helm-install-traefik # triggers reinstall + +# Restart all Traefik pods +kubectl rollout restart daemonset/traefik -n kube-system +``` + +## References + +- [Traefik v3 docs][traefik] +- [Traefik Swarm provider][traefik-swarm] +- [K3s Traefik customization][k3s-traefik] +- [HelmChartConfig docs][k3s-helm] +- [Cloudflare IP ranges][cf-ips] + +[traefik]: https://doc.traefik.io/traefik/v3.6/ +[traefik-swarm]: https://doc.traefik.io/traefik/providers/swarm/ +[k3s-traefik]:
https://docs.k3s.io/networking/networking-services#traefik-ingress-controller +[k3s-helm]: https://docs.k3s.io/helm#customizing-packaged-components-with-helmchartconfig +[cf-ips]: https://www.cloudflare.com/ips/ diff --git a/docs/deployment/07-services.md b/docs/deployment/07-services.md new file mode 100644 index 0000000..8fbb2fb --- /dev/null +++ b/docs/deployment/07-services.md @@ -0,0 +1,575 @@ +# 07 — Services + +## Summary + +Four workloads run in the `honeydue` namespace: **api** (Go REST API, 3 +replicas), **admin** (Next.js panel, 1 replica), **worker** (Go background +jobs, 1 replica), and **redis** (cache + job queue, 1 replica, PVC-backed). +This chapter deep-dives each: container image, resource limits, probes, +volumes, and why each knob is set the way it is. + +## Overview + +| Service | Image | Replicas | Ports | Role | +|---|---|---|---|---| +| `api` | `gitea.treytartt.com/admin/honeydue-api:<sha>` | 3 | 8000 | HTTP REST API | +| `admin` | `gitea.treytartt.com/admin/honeydue-admin:<sha>` | 1 | 3000 | Next.js admin panel | +| `worker` | `gitea.treytartt.com/admin/honeydue-worker:<sha>` | 1 | — | Background job processor | +| `redis` | `redis:7-alpine` | 1 | 6379 | Cache + Asynq queue | + +All four are Kubernetes `Deployment` workloads (not StatefulSets, not +DaemonSets). They share: +- ServiceAccount with `automountServiceAccountToken: false` (Chapter 5) +- `imagePullSecrets: [gitea-credentials]` (Chapter 11) +- `envFrom: configMapRef: honeydue-config` (Chapter 10) +- Individual env vars wired to `honeydue-secrets` keys +- Read-only root filesystem with `tmp` emptyDir mounted at `/tmp` + +## Service 1 — api (Go REST API) + +### What it does + +The Go HTTP API — the heart of the app. Handlers for user auth, +residences, tasks, contractors, documents, subscriptions, notifications, +etc. Reads/writes to Neon Postgres, reads/writes to Redis cache, reads +from Backblaze B2. + +Also serves a marketing landing page at `/` (static HTML + CSS from +`/app/static/`).
This is why the `myhoneydue.com` apex domain routes to +the api service (Chapter 6). + +### Deployment spec highlights + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: api +spec: + replicas: 3 + strategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 0 + maxSurge: 1 + template: + spec: + serviceAccountName: api + imagePullSecrets: [name: gitea-credentials] + securityContext: + runAsNonRoot: true + runAsUser: 1000 + runAsGroup: 1000 + fsGroup: 1000 + seccompProfile: { type: RuntimeDefault } + containers: + - name: api + image: gitea.treytartt.com/admin/honeydue-api:237c6b8 + ports: [containerPort: 8000] + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: { drop: [ALL] } + envFrom: [configMapRef: {name: honeydue-config}] + env: + - name: POSTGRES_PASSWORD + valueFrom: { secretKeyRef: {name: honeydue-secrets, key: POSTGRES_PASSWORD} } + - name: SECRET_KEY + valueFrom: { secretKeyRef: {name: honeydue-secrets, key: SECRET_KEY} } + # ... all other secrets + volumeMounts: + - { name: apns-key, mountPath: /secrets/apns, readOnly: true } + - { name: tmp, mountPath: /tmp } + resources: + requests: { cpu: 100m, memory: 128Mi } + limits: { cpu: 1000m, memory: 512Mi } + startupProbe: { httpGet: {path: /api/health/, port: 8000}, failureThreshold: 48, periodSeconds: 5 } + readinessProbe: { httpGet: {path: /api/health/, port: 8000}, initialDelaySeconds: 5, periodSeconds: 10, timeoutSeconds: 5 } + livenessProbe: { httpGet: {path: /api/health/, port: 8000}, initialDelaySeconds: 30, periodSeconds: 30, timeoutSeconds: 10 } + volumes: + - name: apns-key + secret: + secretName: honeydue-apns-key + items: [{key: apns_auth_key.p8, path: apns_auth_key.p8}] + - name: tmp + emptyDir: {sizeLimit: 64Mi} +``` + +### Why each setting + +**`replicas: 3`** — one per node via anti-affinity rules (not strictly +required but helpful).
Three gives us HA (one pod down = two still +serve traffic) and headroom for rolling updates. + +**`maxUnavailable: 0, maxSurge: 1`** — during a rollout, start a 4th +pod before killing any old one. Ensures the service stays at 3 live +pods throughout. `maxUnavailable: 0` means zero downtime updates — but +depends on readinessProbe being accurate. + +**`runAsUser: 1000`** — the `app` user created in the Dockerfile. Image +doesn't run as root. + +**`readOnlyRootFilesystem: true`** — prevents any attacker-introduced +file writes to the image layer. Go binary doesn't need to write to `/`; +only `/tmp` is mutable. + +**`startupProbe.failureThreshold: 48`** (= 48 × 5s = 240s grace) — this +was bumped up from the scaffold default of 12. Reason: on first boot, +the Go app runs `MigrateWithLock()` which acquires a Postgres advisory +lock and runs AutoMigrate. First replica takes ~90s; subsequent +replicas wait on the lock. With 3 replicas all starting simultaneously +and the lock serializing them, 240s is the right grace. See +[Chapter 19](./19-postmortem-swarm.md) for the detailed story. + +**`readinessProbe.initialDelaySeconds: 5`** — after the startupProbe +passes, wait 5s before starting readiness checks. Prevents a racy +initial failure. + +**`livenessProbe.initialDelaySeconds: 30`** — don't start liveness +checks until 30s after the startupProbe passes. Avoids cascading +failures from false-negative liveness checks. + +**`resources.requests/limits`** — Kubernetes uses `requests` for +scheduling (how much a pod "reserves") and `limits` for enforcement +(max it can use before throttling/OOM). Our api is CPU-bursty for +complex query handling, so we give it 100m baseline with a 1000m ceiling. +512Mi memory ceiling is comfortable — in practice api uses ~100-200Mi. + +**`volumes.apns-key`** — mounts the `honeydue-apns-key` Secret as a file +at `/secrets/apns/apns_auth_key.p8`. The `APNS_AUTH_KEY_PATH` env var +points to this path.
Even though push is currently disabled, the file +must exist because the Go app may try to stat it on startup. + +**`volumes.tmp`** — `emptyDir` with `sizeLimit: 64Mi`. Bounded so a +runaway process can't fill the node's disk. + +### The Service + +```yaml +apiVersion: v1 +kind: Service +metadata: + name: api + namespace: honeydue +spec: + type: ClusterIP + selector: {app.kubernetes.io/name: api} + ports: + - port: 8000 + targetPort: 8000 + protocol: TCP +``` + +ClusterIP `10.43.167.83`. Reachable as `api.honeydue.svc.cluster.local` or +just `api` from inside the namespace. + +### HorizontalPodAutoscaler (not yet enabled) + +`deploy-k3s/manifests/api/hpa.yaml` defines an HPA that would scale api +between 3 and 6 replicas based on CPU (70% util) and memory (80% util). + +**Not currently applied.** `metrics-server` runs but we haven't run +`kubectl apply -f api/hpa.yaml`. TODO in Chapter 20. + +## Service 2 — admin (Next.js panel) + +### What it does + +Server-rendered admin UI. Authenticates admin users against a +separate `admin_users` table in Postgres (seeded with `ADMIN_EMAIL` + +`ADMIN_PASSWORD` on first migration). Lets operators view/manage +users, residences, tasks, subscriptions, etc. + +Built as a Next.js 16 standalone server. + +### Why 1 replica + +Low traffic. It's an internal tool. One pod suffices. If it crashes, +Kubernetes restarts it in ~10s. If the hosting node dies, Kubernetes +reschedules to another node. + +The cost of running 3 replicas is tiny (Next.js is ~128MB per pod) but +has no operational benefit. When the admin panel becomes user-facing, +revisit. 
+ +### Deployment highlights + +```yaml +replicas: 1 +strategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 0 + maxSurge: 1 + +securityContext: + runAsNonRoot: true + runAsUser: 1001 # different from api (1000) for isolation + runAsGroup: 1001 + fsGroup: 1001 + +containers: + - image: gitea.treytartt.com/admin/honeydue-admin:<sha> + ports: [containerPort: 3000] + env: + - name: PORT + value: "3000" + - name: HOSTNAME + value: "0.0.0.0" + - name: NEXT_PUBLIC_API_URL + valueFrom: {configMapKeyRef: {name: honeydue-config, key: NEXT_PUBLIC_API_URL}} + volumeMounts: + - {name: nextjs-cache, mountPath: /app/.next/cache} + - {name: tmp, mountPath: /tmp} + resources: + requests: {cpu: 50m, memory: 64Mi} + limits: {cpu: 500m, memory: 256Mi} + startupProbe: + httpGet: {path: /, port: 3000} # was /admin/ — wrong for this app (Chapter 19) + failureThreshold: 24 + periodSeconds: 5 + readinessProbe: + httpGet: {path: /, port: 3000} + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 +``` + +**Probe path `/`** — Next.js serves at root. `/admin/` (scaffold default) +returns 404 and killed the pod repeatedly during initial bring-up. +See Chapter 19 §Admin probe path for the story. + +**`runAsUser: 1001`** — different from api's 1000 so that if one +service were compromised, the stolen UID would at least be distinct +from other services' (minor defense-in-depth). + +**`nextjs-cache`** — emptyDir mount for Next.js's server-side cache. +Without it, the read-only rootfs would prevent Next from caching +server-rendered pages. Not a persistent volume because cache is +regenerable on restart. + +### The Service + +```yaml +apiVersion: v1 +kind: Service +metadata: + name: admin +spec: + type: ClusterIP + selector: {app.kubernetes.io/name: admin} + ports: [{port: 3000, targetPort: 3000}] +``` + +ClusterIP `10.43.136.168`.
+ +## Service 3 — worker (Go + Asynq) + +### What it does + +Runs scheduled background jobs via [Asynq](https://github.com/hibiken/asynq) +(a Redis-backed job queue for Go): + +- **Task reminders** (14:00 UTC daily) — notify users of upcoming tasks +- **Overdue reminders** (15:00 UTC daily) — notify users of overdue tasks +- **Daily digest** (03:00 UTC daily) — summary email per user +- **Onboarding emails** — multi-step drip campaign for new users +- **Cleanup jobs** — expired tokens, stale data + +### Why 1 replica (hard requirement) + +Asynq uses a `Scheduler` component that does cron-like scheduling. The +Scheduler is **not leader-elected** by default — if you run two, both +fire every cron task. Users get duplicate emails. + +The asynq docs cover this: to scale scheduling, migrate to +`PeriodicTaskManager` + `PeriodicTaskConfigProvider` which coordinate +via Redis. Not yet done in our codebase. + +Until then: `replicas: 1` is a hard constraint. See the comment in the +deployment manifest: + +```yaml +spec: + # Asynq's Scheduler is a singleton — running >1 replica fires every cron + # task once per replica (duplicate daily digests, onboarding emails, etc.). + # Keep at 1 until asynq.PeriodicTaskManager with Redis leader election is + # wired in cmd/worker/main.go. + replicas: 1 +``` + +### What happens if the worker pod dies? + +- Asynq schedule state is in Redis (which has AOF persistence) +- When a new worker pod starts, it re-registers the scheduler and picks up + where it left off +- Any job that was in-flight (dequeued but not acknowledged) gets retried + by Asynq's automatic retry logic (see the `worker.RetryOptions` in the + Go code) +- Cron jobs that were supposed to fire during the downtime: fire on the + next tick + +A 5-minute worker outage = 5 minutes of delayed jobs. Not great but +acceptable. 
+ +### PodDisruptionBudget + +```yaml +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: worker-pdb +spec: + minAvailable: 0 + selector: {matchLabels: {app.kubernetes.io/name: worker}} +``` + +`minAvailable: 0` means voluntary disruptions (`kubectl drain`) can take +the worker down. This matches the singleton constraint: there's only one, +it's OK to drain. + +### No Service + +worker doesn't listen on any HTTP port for application traffic — it's a +queue consumer, not a web server. So there's **no Kubernetes Service** +for it. + +(On Swarm we had the worker expose a health endpoint at `:6060/health`; +the k3s scaffold doesn't replicate this. Future work.) + +## Service 4 — redis + +### What it does + +- Caching layer (ETag-based lookups, user session cache) +- Asynq queue backend (job state, scheduled tasks, retry state) + +### Why 1 replica + +Single-instance Redis with AOF persistence. Not replicated, not +clustered. Downsides: +- Node outage = Redis outage (cache regenerates, queue state is preserved + by AOF on the PVC) +- No failover — the pod is pinned to one node (see Node affinity below) and + the PVC is local-path (per-node). If that node dies, Redis stays Pending + until the node returns; rescheduling it onto another node means starting + with an empty volume, so the data is gone + +For our scale this is acceptable. Redis holds no authoritative state +(everything that matters is in Postgres). Cache regenerates on first +request; Asynq retries enqueue on failure. + +### PVC + +```yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: redis-data +spec: + accessModes: [ReadWriteOnce] + storageClassName: local-path + resources: {requests: {storage: 5Gi}} +``` + +Uses k3s' built-in `local-path-provisioner`. The PVC binds to a local +directory on the node where the Redis pod lands (`/var/lib/rancher/k3s/storage/`). +`ReadWriteOnce` means only one pod at a time. + +### Node affinity + +```yaml +nodeSelector: + honeydue/redis: "true" +``` + +We labeled `ubuntu-8gb-nbg1-2` (hetzner1) with `honeydue/redis=true` so +Redis always lands there. 
This ensures the PVC finds its backing +storage (since PVCs with `local-path` are per-node). + +```bash +kubectl label node ubuntu-8gb-nbg1-2 honeydue/redis=true --overwrite +``` + +### Why not Redis Sentinel / Cluster + +Complexity. At our scale (~a few req/s, kilobytes of cache), a single +Redis does fine. If Redis becomes critical-path for availability, we'd: +- Use a managed Redis (Upstash, Dragonfly Cloud) — $5-15/mo, their problem +- Or run Redis Sentinel with 3 replicas — manageable but operational work + +Neither is needed yet. + +### Redis config + +From the deployment: + +```yaml +command: + - sh + - -c + - | + ARGS="--appendonly yes --appendfsync everysec --maxmemory 256mb --maxmemory-policy noeviction" + if [ -n "$REDIS_PASSWORD" ]; then + ARGS="$ARGS --requirepass $REDIS_PASSWORD" + fi + exec redis-server $ARGS +``` + +Settings: +- **`--appendonly yes --appendfsync everysec`** — AOF persistence, + fsync every second. Survives restarts with up to 1 second of data + loss. +- **`--maxmemory 256mb`** — Redis will refuse new data if it grows past + 256 MB. Gives us a safety cap. +- **`--maxmemory-policy noeviction`** — we'd rather get errors than + silently drop data. This is the right choice when Redis holds queue + state (losing a queue item silently = missed job). + +The `REDIS_PASSWORD` env var is optional. Currently empty (no auth). The +Redis pod is only reachable from inside the overlay network, and our +NetworkPolicies (once enabled) would restrict egress further. 
+ +## Resource summary + +Combined requests and limits across all services: + +| Service | CPU requests | CPU limits | Memory requests | Memory limits | Replicas | +|---|---|---|---|---|---| +| api | 100m | 1000m | 128Mi | 512Mi | 3 | +| admin | 50m | 500m | 64Mi | 256Mi | 1 | +| worker | 50m | 500m | 64Mi | 256Mi | 1 | +| redis | 100m | 500m | 128Mi | 512Mi | 1 | +| traefik (kube-system) | ~100m | unlimited | ~50Mi | unlimited | 3 | +| **Total requests** | **~750m** | | **~550Mi** | | | + +Each node has 4000m CPU + 8192Mi memory. Total cluster capacity is +12000m + 24576Mi. We're using roughly 6% CPU and 2% memory for requests +— tons of headroom. + +## Health check semantics + +Kubernetes distinguishes three probe types: + +- **startupProbe** — is the container done starting? Runs until it passes + once, then stops. While running, the other probes are disabled. + Failing startupProbe = container killed and restarted. +- **readinessProbe** — is the container ready to serve traffic? A failing + pod is removed from Service endpoints (traffic stops flowing to it) + but the pod keeps running. +- **livenessProbe** — is the container healthy? A failing pod is killed + and restarted. + +### Why we tuned startupProbe separately + +The api's first-boot migration takes 90–240s. If we only had a +livenessProbe with a typical initialDelay of 5s + failureThreshold of 3, +the pod would be killed before migration finishes (a readinessProbe alone +never kills the pod — it only withholds traffic). startupProbe lets us +give generous first-boot grace (240s) without affecting the sharper +ongoing readiness/liveness checks. + +### Probe path design + +Each service's `/health` endpoint should be: +- Cheap (no DB query, no external call) +- Fast (< 100ms) +- Honest (returns 200 iff the process can serve) + +Our api's `/api/health/` does a trivial check. It does NOT verify Postgres +connectivity (to avoid cascading DB failures tearing down all api pods). 
+If Postgres is down, api pods stay "ready" and return 5xx for actual +endpoints — that's the right behavior. + +## Log routing + +All container logs go to stdout/stderr. containerd captures them to +`/var/log/containers/` on the node. `kubectl logs` fetches them through the +API server's `/api/v1/namespaces/<namespace>/pods/<pod>/log` endpoint, which +proxies the request to the kubelet on that node. + +We have **no log aggregation** in the cluster (no Loki, no ELK, no +Datadog). For debugging we use: + +```bash +kubectl logs -n honeydue deploy/api -f --prefix +kubectl logs -n honeydue deploy/api --previous # previous pod's logs +``` + +See [Chapter 15](./15-observability.md). + +## Rolling update semantics + +When you push a new image and `kubectl set image` or `kubectl apply` with +a new image tag: + +1. Kubernetes creates a new ReplicaSet with the new image +2. Starts 1 new pod (per `maxSurge: 1`) +3. Waits for it to pass readinessProbe +4. Removes 1 pod from the old ReplicaSet +5. Repeats until all N pods are on the new ReplicaSet +6. Old ReplicaSet stays around (for rollback) with 0 replicas + +For api (3 replicas): total rollout time is roughly +`3 × (pod_startup_time + small_buffer)` = ~15 minutes in the cold-boot +case, seconds for warm updates where migrations are no-op. + +During the rollout: +- Service endpoint set updates as pods become ready +- kube-proxy IPVS is reprogrammed on each node +- Traefik's connection pool to the Service invalidates gradually + +Users see no downtime if the new image is compatible. If it's broken: + +```bash +kubectl rollout undo deployment/api -n honeydue +``` + +Reverts to the previous ReplicaSet. Typically takes 30 seconds to +stabilize. + +## Why no StatefulSet + +For Redis (the only stateful thing we run), we use a Deployment + PVC. +StatefulSet is designed for: +- Ordered startup (pod-0 before pod-1) +- Stable hostnames (pod-0 gets DNS name `redis-0.redis`) +- Per-replica PVCs + +We have one Redis replica. None of those features matter for a +singleton. 
Deployment + PVC + nodeSelector is simpler and equivalent. + +If we ever run Redis Sentinel or Cluster, we'd migrate to StatefulSet. + +## Operator cheat sheet + +```bash +# See all pods in honeydue namespace +kubectl get pods -n honeydue -o wide + +# Per-service rollout status +kubectl rollout status deployment/api -n honeydue + +# Scale a service +kubectl scale deployment/api -n honeydue --replicas=5 + +# Restart all pods (e.g., to re-read a configmap) +kubectl rollout restart deployment/api -n honeydue + +# Exec into a pod +kubectl exec -it -n honeydue deploy/admin -- /bin/sh + +# Describe a pod (shows events, probe state, restarts) +kubectl describe pod -n honeydue + +# Resource usage +kubectl top pods -n honeydue +``` + +## References + +- [Kubernetes Deployments][deploy] +- [Pod lifecycle + probes][probes] +- [Asynq scheduler limitations][asynq-sched] +- [K3s local-path provisioner][k3s-lp] + +[deploy]: https://kubernetes.io/docs/concepts/workloads/controllers/deployment/ +[probes]: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-lifecycle +[asynq-sched]: https://github.com/hibiken/asynq/wiki/Periodic-Tasks +[k3s-lp]: https://docs.k3s.io/storage#setting-up-the-local-storage-provider diff --git a/docs/deployment/08-database.md b/docs/deployment/08-database.md new file mode 100644 index 0000000..865ade7 --- /dev/null +++ b/docs/deployment/08-database.md @@ -0,0 +1,298 @@ +# 08 — Database (Neon Postgres) + +## Summary + +Authoritative user data lives in a Neon-managed Postgres database in AWS +us-east-1. Connections use TLS (`DB_SSLMODE=require`). Schema is managed +via GORM AutoMigrate inside the api binary, coordinated across replicas +by a Postgres advisory lock to prevent concurrent migration attempts. 
+ +## Why Neon + +### Decision matrix + +At deploy time we considered: + +| Option | Setup effort | Monthly cost | Backup/PITR | Scale ceiling | Notes | +|---|---|---|---|---|---| +| **Neon Launch** | Zero (managed) | $5-15 | Included | Large | **Picked** | +| Postgres on a Hetzner VPS | High | $8 (VPS) | Manual | Medium | More ops | +| AWS RDS | Medium | $30+ | Included | Huge | Overkill, expensive | +| Supabase Free | Zero | $0 | Limited | Small | Free tier has quota limits | +| CNPG on our k3s | High (Helm) | $0 (using cluster) | Self-rolled | Medium | Operational burden | + +Neon Launch won on: +- **Serverless**: scales compute to zero when idle (cheap) +- **Branch databases**: we can create dev/staging branches from prod in seconds +- **Connection pooling built-in**: PgBouncer on the hostname suffix `-pooler` +- **Point-in-time recovery** included (paid tier) +- **Pay-as-you-go** with a $5 minimum — fits a bootstrapped app + +### Connection details + +| Field | Value | +|---|---| +| Hostname | `ep-floral-truth-amttbc5a.c-5.us-east-1.aws.neon.tech` | +| Port | 5432 | +| Username | `neondb_owner` | +| Database | `honeyDue` (case-sensitive!) | +| TLS mode | `require` (enforced by Neon; app pg driver verifies) | +| Branch | production (Neon's concept — isolated DB within the project) | + +### The database name is case-sensitive + +Postgres identifiers are lowercase unless quoted. Neon's UI created the +database as `"honeyDue"` (quoted, camelCase preserved). In `prod.env` / +ConfigMap we must use exactly `POSTGRES_DB=honeyDue` — lowercase +`honeydue` gets a `database "honeydue" does not exist` error. This bit +us during the initial Swarm deploy (Chapter 19 §Neon DB name). + +## Connection pooling + +### Why it matters + +Postgres is memory-hungry per connection (~5-10 MB each). 3 api replicas +× `DB_MAX_OPEN_CONNS=25` = up to 75 direct Postgres connections. Add +the worker's 25. Neon's free tier caps at 100 concurrent connections; +paid tiers much higher. 
+ +### PgBouncer on Neon + +Neon provides a built-in PgBouncer, reached by adding `-pooler` to the +endpoint ID in the hostname (`ep-<endpoint-id>-pooler.<region>.aws.neon.tech`). +Connections made to that hostname go through PgBouncer transparently. +Note: the `DB_HOST` value shown in this chapter has no `-pooler` suffix — +confirm against the live ConfigMap which hostname production actually uses. + +Modes PgBouncer supports: +- **session** — one server connection held per client session (transparent) +- **transaction** — server connection released after each transaction (high-throughput) +- **statement** — per-statement (most aggressive; breaks many features) + +Neon's pooler runs in **transaction mode**. This is compatible with GORM +out of the box (we don't use session-level features like prepared +statements or session variables). One exception to keep in mind: +`MigrateWithLock()` (see Schema management) holds a session-level advisory +lock, which transaction-mode pooling does not support, so migrations +should run over a direct (non-pooler) connection. + +### Connection pool settings + +In `prod.env`: + +``` +DB_MAX_OPEN_CONNS=25 +DB_MAX_IDLE_CONNS=10 +DB_MAX_LIFETIME=600s +``` + +These are the Go `database/sql` pool settings (GORM uses `database/sql` +underneath): + +- **MaxOpenConns: 25** — at most 25 concurrent connections per replica +- **MaxIdleConns: 10** — keep up to 10 warm connections ready to reuse +- **MaxLifetime: 600s** — recycle connections after 10 min (prevents + stale state in long-lived connections, good for Neon's idle timeout) + +### Worst-case connection count + +3 api + 1 worker replicas × 25 conns = 100 peak. Right at Neon free +tier's ceiling, with zero margin. **This is a real risk** — a spike that +saturates the pool on all replicas simultaneously would exhaust Neon's +limit. + +Mitigations to consider: +- Drop `DB_MAX_OPEN_CONNS` to 15 → 60 peak. Safe on free tier. +- Upgrade to Neon Scale plan (1000+ connections). +- Rely on Neon's PgBouncer to multiplex — the raw backend connections + to Postgres-proper are pooled, not our TCP connections to Neon. + +Currently we trust Neon's pooler to handle the multiplexing and run with +the default 25/10. If we hit connection errors in prod, adjust. + +## Schema management + +### GORM AutoMigrate + +On startup, the Go API's `cmd/api/main.go` calls +`database.MigrateWithLock()` which: + +1. 
Opens a dedicated Postgres connection +2. `SELECT pg_advisory_lock(1751412071)` — acquires a session-level + advisory lock on a hardcoded key +3. Calls `db.AutoMigrate(&models.*{})` for every GORM model +4. `SELECT pg_advisory_unlock(...)` via deferred function +5. Close the connection + +The advisory lock serializes migrations across replicas: when 3 api +pods start simultaneously, one acquires the lock and migrates; the +others block on the lock. Once the first finishes (≤2s for already- +migrated schema, up to 90s on first cold boot), the next acquires and +sees the schema is current (no-op migrate). + +### Why an advisory lock + +Without it, concurrent `CREATE TABLE IF NOT EXISTS ...` statements from +multiple replicas would race — Postgres usually handles it, but GORM's +AutoMigrate also alters tables (adds columns, indexes) which can deadlock +under concurrency. + +The advisory lock pattern (also used by Rails + Django + Alembic) is the +canonical solution. + +### The lock key + +`1751412071` is a hardcoded integer in `internal/database/database.go`. +Arbitrary but unique — as long as nothing else in the Postgres instance +uses the same advisory lock key, no conflicts. + +### First-boot behavior + +On a **fresh database** (new Neon project), the first api pod runs +through every model's `CREATE TABLE` statement. This is ~50 tables for +honeyDue and takes ~90 seconds. + +On a **warm database** (tables already exist), AutoMigrate is fast — +typically under 2 seconds. It still runs (GORM checks every model +against the schema) but finds no work to do. + +### Where this bit us + +With 3 api pods starting simultaneously and migrations taking 90s first +time, the lock queue for the last replica is ~180s. We needed a +startupProbe grace of 240s to cover this without false restart loops. +See Chapter 7 §startupProbe and Chapter 19 §MigrateWithLock. + +### Downside: no schema versioning + +AutoMigrate can only *add* — new tables, new columns, new indexes. 
It +won't drop columns, rename them, or change types destructively. For +those we'd need raw SQL migrations (a tool like `golang-migrate` or +`dbmate`). + +Today: we accept that schema changes are additive-only. When we need +destructive changes, we'd hand-write them. + +## What's in the database + +Major tables (see `honeyDueAPI-go/internal/models/`): + +| Table | Purpose | +|---|---| +| `auth_user` | Users (Django legacy name kept for compatibility) | +| `user_userprofile` | Profile data | +| `authtoken_token` | API auth tokens | +| `residence_residence` | Properties users manage | +| `task_task` | Maintenance tasks | +| `task_taskcompletion` | Task completion history | +| `contractor_contractor` | Contractor contacts | +| `documents_document` | Document records (files in B2) | +| `notification_notification` | In-app notifications | +| `subscription_usersubscription` | IAP subscriptions | +| `admin_users` | Next.js admin panel users | + +See `honeyDueAPI-go/docs/TASK_LOGIC_ARCHITECTURE.md` for the task logic +model details. + +## Backup and recovery + +### Neon's built-in + +Neon Launch includes **point-in-time recovery** within the last 24h +(longer on Scale plan). To restore: + +1. Go to Neon console → project → Backups +2. Create a branch from a timestamp +3. Point the app at the new branch (change `DB_HOST` in our ConfigMap) + +Done. No tape-wrangling. + +### What we don't have + +- Off-site backup (if Neon itself is compromised, we have no exfil). A + nightly `pg_dump` to Backblaze B2 would close this gap. **TODO** + (Chapter 20). +- Tested DR drills. We've never actually restored from a Neon backup + into a new branch and pointed the app at it. Should be routine; hasn't + been exercised. + +## Migrations from old MyCrib/Casera data + +honeyDue originally ran on a Django codebase (MyCrib / Casera-era). The +schema inherits Django's naming (`app_model` table names, `_id` suffix +foreign keys). 
The Go app's GORM models have `TableName()` methods that +preserve this: + +```go +func (Task) TableName() string { return "task_task" } +``` + +This isn't ideal (GORM's default `tasks` would be cleaner), but changing +would require a migration that renames every table — more risk than +value. + +## Neon regions + +Neon's default region for new projects is `aws-us-east-1` (Virginia). +Our DB is there. Latency from Nuremberg to us-east-1 is **~90-120ms +round trip**. + +This is the slowest hop in our data flow. Every api request that needs +a DB query (most of them) pays this latency at least once. + +**When this matters**: When we start seeing ~200ms+ response times from +complex endpoints, it's likely DB latency dominant. Options: +- Migrate Neon to `aws-eu-central-1` (Frankfurt) — shaves ~90ms off +- Add Redis caching for hot reads (Chapter 7) +- Read replicas (Neon supports them on paid tiers) + +## Environment variables the app reads + +From ConfigMap: + +| Var | Purpose | +|---|---| +| `DB_HOST` | Neon pooler hostname | +| `DB_PORT` | 5432 | +| `POSTGRES_USER` | `neondb_owner` | +| `POSTGRES_DB` | `honeyDue` | +| `DB_SSLMODE` | `require` | +| `DB_MAX_OPEN_CONNS` | 25 | +| `DB_MAX_IDLE_CONNS` | 10 | +| `DB_MAX_LIFETIME` | `600s` | + +From Secret (`honeydue-secrets`): + +| Var | Purpose | +|---|---| +| `POSTGRES_PASSWORD` | Neon DB password | + +## Operator cheat sheet + +```bash +# Connect to Neon from workstation (requires psql + the password) +PGPASSWORD="" psql -h ep-floral-truth-amttbc5a.c-5.us-east-1.aws.neon.tech \ + -U neondb_owner -d honeyDue + +# From a pod (lets you debug against the actual in-cluster network path) +kubectl exec -n honeydue -it deploy/api -- sh +# inside the pod (no psql by default, but wget + JSON API works) +wget -qO- http://127.0.0.1:8000/api/health/ + +# See current migration state (no direct CLI, but the api logs show it) +kubectl logs -n honeydue deploy/api | grep -i migration + +# See active connections (run against Neon) 
+SELECT count(*), usename, state, application_name +FROM pg_stat_activity +GROUP BY usename, state, application_name; +``` + +## References + +- [Neon docs][neon-docs] +- [Neon pricing][neon-pricing] +- [Postgres advisory locks][pg-locks] +- [GORM AutoMigrate][gorm-automigrate] +- [honeyDue task architecture][task-arch] (repo-local) + +[neon-docs]: https://neon.com/docs/introduction +[neon-pricing]: https://neon.com/pricing +[pg-locks]: https://www.postgresql.org/docs/current/explicit-locking.html#ADVISORY-LOCKS +[gorm-automigrate]: https://gorm.io/docs/migration.html +[task-arch]: ../../docs/TASK_LOGIC_ARCHITECTURE.md diff --git a/docs/deployment/09-storage.md b/docs/deployment/09-storage.md new file mode 100644 index 0000000..c7cf26b --- /dev/null +++ b/docs/deployment/09-storage.md @@ -0,0 +1,265 @@ +# 09 — Object Storage (Backblaze B2) + +## Summary + +User-uploaded files (photos, documents, task completion attachments) go +to Backblaze B2 via its S3-compatible API. The Go API uses `minio-go/v7` +as the client. This works around a Swarm-era problem where named volumes +are per-node — uploads on node A were invisible to replicas on B and C. +With k3s we could use a shared PVC instead, but B2 is cheaper, offsite, +and already set up. + +## Why Backblaze B2 + +### Decision matrix + +| Option | Price per TB stored | Egress | Pros | Cons | +|---|---|---|---|---| +| **Backblaze B2** | **$6/mo** | $0.01/GB, free via CF | Cheap, hard spending caps, S3-compatible | US-West/East regions only (not EU) | +| AWS S3 Standard | $23/mo | $0.09/GB | Most ubiquitous | Expensive | +| Cloudflare R2 | $15/mo | Free (!) | Zero egress, CF-native | Newer, fewer features | +| DigitalOcean Spaces | $5/mo for 250GB + $0.01/GB | Free 1TB, $0.01/GB after | Simple | Less reliable than AWS | +| Local PVC on k3s | $0 | $0 | Already in cluster | Per-node, no HA, no offsite | + +B2 won because: +1. **Hard spending cap** — unique in the industry. No surprise AWS bill. +2. 
**Cheapest at rest** — 3–4× cheaper than S3. +3. **Free egress through Cloudflare** — we already use CF; when we + eventually serve upload URLs through CF, egress is free. +4. **Mature S3-compatible API** — minio-go talks to it natively. + +Rejected: +- **R2** was the close second. Zero egress is amazing. Rejected + primarily for inertia (B2 already set up in the MyCrib era). A future + migration to R2 would be reasonable. +- **Local PVC** doesn't work for our setup because we want uploads + durable and accessible from any node/replica. + +## Configuration + +Bucket: `honeyDueProd` (mixed case; B2 allows this, minio-go handles it +via path-style addressing — see §path-style below). + +Region: `us-east-005` (B2's South Carolina region — closer to our +Neon DB in AWS us-east-1 than the West Coast options). + +Endpoint: `s3.us-east-005.backblazeb2.com` + +### Environment variables + +From ConfigMap: + +| Var | Value | +|---|---| +| `B2_ENDPOINT` | `s3.us-east-005.backblazeb2.com` | +| `B2_BUCKET_NAME` | `honeyDueProd` | +| `B2_REGION` | `us-east-005` | +| `B2_USE_SSL` | `true` (but see §vestigial var below) | + +From Secret: + +| Var | Value | +|---|---| +| `B2_KEY_ID` | App key ID (B2-specific identifier) | +| `B2_APP_KEY` | App key secret | + +### App key scope + +The B2 app key is **bucket-scoped**, not account-scoped. Can only +read/write the `honeyDueProd` bucket. Cannot: +- List other buckets +- Delete the bucket +- Create new buckets +- Touch account settings + +This is the B2 equivalent of an IAM role with least privilege. If the +key leaks, the damage is limited to the `honeyDueProd` bucket. + +## The minio-go client + +The Go app uses `github.com/minio/minio-go/v7` — a Go SDK compatible +with any S3-flavored API. 
Relevant code at +`internal/services/storage_backend_s3.go`: + +```go +client, err := minio.New(endpoint, &minio.Options{ + Creds: credentials.NewStaticV4(keyID, appKey, ""), + Secure: useSSL, + Region: region, +}) +``` + +### Path-style vs virtual-hosted addressing + +S3's URL scheme has two flavors: + +- **Virtual-hosted**: `https://mybucket.s3.amazonaws.com/mykey` +- **Path-style**: `https://s3.amazonaws.com/mybucket/mykey` + +With virtual-hosted style, the bucket name must be DNS-compatible — +lowercase, no uppercase letters. `honeyDueProd` fails this. + +With path-style, the bucket name is just a URL path segment — any valid +string works. + +minio-go auto-detects: for AWS S3 it prefers virtual-hosted; for +non-AWS endpoints (like B2) it defaults to path-style. So +`honeyDueProd` with capital letters works transparently. + +## The `B2_USE_SSL` vestigial variable + +`prod.env` has `B2_USE_SSL=true`. But the Go app's +`internal/config/config.go:295` reads the env var +`STORAGE_USE_SSL`, not `B2_USE_SSL`: + +```go +S3UseSSL: viper.GetString("STORAGE_USE_SSL") == "" || viper.GetBool("STORAGE_USE_SSL"), +``` + +Whoever wrote the original config used `B2_USE_SSL` in `prod.env` and +`STORAGE_USE_SSL` in the code. They don't match. + +**Net effect**: The app reads `STORAGE_USE_SSL`, which is unset, so the +`== ""` check is true and the whole expression short-circuits to `true`. +SSL is therefore always on, regardless of what `B2_USE_SSL` is set to. + +This is a dormant bug. Anyone setting `B2_USE_SSL=false` expecting to +disable TLS would be surprised it stays on. Fortunately that's the +right default for production B2 (which only accepts HTTPS anyway). + +**TODO**: Rename `STORAGE_USE_SSL` → `B2_USE_SSL` in the Go code to +match the config. Documented in Chapter 19 §Vestigial config. 
+ +## What we store there + +Today (limited rollout): +- User profile photos +- Task completion photos +- Document uploads (PDFs, images attached to records) + +File keys follow a hierarchy like: +``` +users//profile/.jpg +residences//documents/.pdf +tasks//completions/.jpg +``` + +Max file size is **10 MB** per upload (`STORAGE_MAX_FILE_SIZE=10485760`). +Allowed MIME types: `image/jpeg`, `image/png`, `image/gif`, `image/webp`, +`application/pdf` (`STORAGE_ALLOWED_TYPES`). + +## Access control + +### Upload flow + +1. Client POSTs to `/api/upload/` +2. Go API validates the user is authenticated and authorized for the + target resource +3. Go API streams the upload to B2 via minio-go's `PutObject` +4. B2 returns a key +5. Go API stores the key in Postgres +6. Returns the key to the client + +The B2 bucket is **private**. Clients can't GET directly; they always +go through the Go API. + +### Download flow (current) + +1. Client requests `/api/media/` +2. Go API checks the user can access this key +3. Go API fetches from B2 and streams back to the client + +This proxies every download through the api. For high-traffic media +that's inefficient (api becomes an egress bottleneck). + +### Future: signed URLs + +We could generate time-limited signed URLs for B2 objects: + +```go +url, err := s3Client.PresignedGetObject(ctx, bucket, key, 1*time.Hour, nil) +``` + +Returns a URL the client can GET directly from B2, scoped to a specific +object, valid for 1h. Saves api bandwidth and latency. + +Not yet implemented. TODO (Chapter 20). + +## Lifecycle and retention + +We have **no lifecycle rules** set on the bucket. Objects live forever +unless the app deletes them. + +When a user deletes their account, the app should delete their B2 +objects. This is currently not automated — a compliance gap for any +"right to be forgotten" request. 
+ +**TODO** (Chapter 20): Either: +- Implement explicit cleanup in the user deletion handler, or +- Add B2 lifecycle rule tied to object metadata (tag objects with + user ID; rule deletes tagged objects when user is soft-deleted) + +## Backup of B2 + +We have no backup of B2 objects. B2 itself replicates within the region, +but: +- Accidental deletion via our app = data gone +- B2 itself being compromised = data gone + +B2 offers **Object Lock** (WORM — write once read many) which prevents +deletion for a retention period. Not enabled; revisit if/when user data +sensitivity justifies it. + +## Cost projection + +Current usage is **small** — estimated <50 GB stored. + +``` +50 GB × $0.006/GB = $0.30/mo storage +1 GB/mo egress (mostly uncached media served via api) → $0.01 (first + 3× of stored amount is free anyway, so effectively $0) +``` + +Total B2 cost: **< $1/mo**. Hard spending cap set to $20/mo in B2 +console — if we ever breach that, something's wrong and we want to +know immediately. 
+ +At 100k users each uploading ~10 MB average: +- 1 TB stored = $6/mo +- Egress depends on access patterns; with signed URLs served through CF + the egress could still be ~free + +## Operator cheat sheet + +```bash +# List bucket contents (requires mc or aws CLI configured with B2 creds) +mc alias set b2 https://s3.us-east-005.backblazeb2.com +mc ls b2/honeyDueProd/ + +# Count objects +mc find b2/honeyDueProd/ --type f | wc -l + +# Download an object +mc cp b2/honeyDueProd/ ./ + +# Check B2 console for usage graphs: +# https://secure.backblaze.com/b2_buckets.htm +``` + +From inside a Go api pod: +```bash +# Check the in-cluster client config +kubectl exec -n honeydue deploy/api -- env | grep B2_ +``` + +## References + +- [Backblaze B2 docs][b2-docs] +- [B2 S3-compatible API][b2-s3] +- [minio-go/v7][minio-go] +- [S3 path-style vs virtual-hosted][s3-style] + +[b2-docs]: https://www.backblaze.com/docs/ +[b2-s3]: https://www.backblaze.com/docs/cloud-storage-s3-compatible-api +[minio-go]: https://github.com/minio/minio-go +[s3-style]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/VirtualHosting.html diff --git a/docs/deployment/10-secrets-config.md b/docs/deployment/10-secrets-config.md new file mode 100644 index 0000000..a11d798 --- /dev/null +++ b/docs/deployment/10-secrets-config.md @@ -0,0 +1,369 @@ +# 10 — Secrets & Config + +## Summary + +Non-sensitive config (hostnames, ports, feature flags, etc.) lives in +`honeydue-config` ConfigMap. Sensitive values (DB password, signing +keys, API keys) live in `honeydue-secrets` and `honeydue-apns-key` +Secrets. Container registry auth lives in `gitea-credentials` (type +`kubernetes.io/dockerconfigjson`). This chapter maps every env var to +its source and explains what's stored where. 
+ +## Structure + +```mermaid +flowchart LR + subgraph SourceWorkstation[Operator workstation] + ProdEnv[deploy/prod.env] + Secrets[deploy/secrets/*.txt] + Registry[deploy/registry.env] + end + + subgraph K8s[Kubernetes cluster] + CM[honeydue-config<br/>ConfigMap] + S1[honeydue-secrets<br/>Secret] + S2[honeydue-apns-key<br/>Secret] + S3[gitea-credentials<br/>Secret] + end + + subgraph Pods + Api[api pod] + Admin[admin pod] + Worker[worker pod] + end + + ProdEnv -. kubectl create configmap<br/>--from-env-file .-> CM + Secrets -. kubectl create secret<br/>--from-file/--from-literal .-> S1 + Secrets -. --from-file .-> S2 + Registry -. kubectl create secret docker-registry .-> S3 + + CM -- envFrom --> Api & Admin & Worker + S1 -- env: secretKeyRef --> Api & Worker + S2 -- volumeMounts --> Api & Worker + S3 -- imagePullSecrets --> Api & Admin & Worker +``` + +## ConfigMap: honeydue-config + +Built from `deploy/prod.env` (minus sensitive keys). Contents (58 keys, +abbreviated): + +``` +ADMIN_PANEL_URL=https://admin.myhoneydue.com +ALLOWED_HOSTS=api.myhoneydue.com,myhoneydue.com +APNS_AUTH_KEY_ID=DISABLED01 +APNS_AUTH_KEY_PATH=/secrets/apns/apns_auth_key.p8 +APNS_PRODUCTION=false +APNS_TEAM_ID=DISABLED01 +APNS_TOPIC=com.tt.honeyDue +APNS_USE_SANDBOX=false +BASE_URL=https://myhoneydue.com +B2_BUCKET_NAME=honeyDueProd +B2_ENDPOINT=s3.us-east-005.backblazeb2.com +B2_REGION=us-east-005 +B2_USE_SSL=true +CORS_ALLOWED_ORIGINS=https://myhoneydue.com,https://admin.myhoneydue.com +DAILY_DIGEST_HOUR=3 +DB_HOST=ep-floral-truth-amttbc5a.c-5.us-east-1.aws.neon.tech +DB_MAX_IDLE_CONNS=10 +DB_MAX_LIFETIME=600s +DB_MAX_OPEN_CONNS=25 +DB_PORT=5432 +DB_SSLMODE=require +DEBUG=false +DEFAULT_FROM_EMAIL=noreply@myhoneydue.com +EMAIL_HOST=smtp.fastmail.com +EMAIL_HOST_USER=treytartt@fastmail.com +EMAIL_PORT=587 +EMAIL_USE_TLS=true +FEATURE_EMAIL_ENABLED=true +FEATURE_ONBOARDING_EMAILS_ENABLED=true +FEATURE_PDF_REPORTS_ENABLED=true +FEATURE_PUSH_ENABLED=false +FEATURE_WEBHOOKS_ENABLED=true +FEATURE_WORKER_ENABLED=true +NEXT_PUBLIC_API_URL=https://api.myhoneydue.com +OVERDUE_REMINDER_HOUR=15 +PORT=8000 +POSTGRES_DB=honeyDue +POSTGRES_USER=neondb_owner +REDIS_DB=0 +REDIS_URL=redis://redis:6379/0 +STATIC_DIR=/app/static +STORAGE_ALLOWED_TYPES=image/jpeg,image/png,image/gif,image/webp,application/pdf +STORAGE_BASE_URL=/uploads +STORAGE_MAX_FILE_SIZE=10485760 +STORAGE_UPLOAD_DIR=/app/uploads +TASK_REMINDER_HOUR=14 +TIMEZONE=UTC +``` + +Plus empty-but-declared keys for optional integrations (Apple/Google +auth + IAP). 
+ +### How pods use it + +```yaml +envFrom: + - configMapRef: + name: honeydue-config +``` + +Every key in the ConfigMap becomes an env var in the container. +`envFrom` is bulk — no need to enumerate each one. + +### Changing config + +Edit `deploy/prod.env` locally, regenerate the ConfigMap: + +```bash +# Simplified; see scripts for the full version +kubectl create configmap honeydue-config -n honeydue \ + --from-env-file=deploy/prod.env \ + --dry-run=client -o yaml | kubectl apply -f - + +# Pods don't auto-reload env vars. Restart to pick up changes: +kubectl rollout restart -n honeydue deploy/api deploy/admin deploy/worker +``` + +## Secret: honeydue-secrets (Opaque) + +9 keys: + +| Key | Purpose | +|---|---| +| `POSTGRES_PASSWORD` | Neon DB password | +| `SECRET_KEY` | Django-compat signing key (64 chars, base64) | +| `EMAIL_HOST_PASSWORD` | Fastmail app password | +| `FCM_SERVER_KEY` | FCM push key (currently placeholder, push disabled) | +| `REDIS_PASSWORD` | Empty (no auth on in-cluster Redis) | +| `B2_KEY_ID` | Backblaze B2 app key ID | +| `B2_APP_KEY` | Backblaze B2 app key secret | +| `ADMIN_EMAIL` | Next.js admin panel initial admin email | +| `ADMIN_PASSWORD` | Next.js admin panel initial admin password | + +### How pods use it + +Individual `env:` entries wire specific Secret keys to env vars: + +```yaml +env: + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: honeydue-secrets + key: POSTGRES_PASSWORD + - name: SECRET_KEY + valueFrom: + secretKeyRef: + name: honeydue-secrets + key: SECRET_KEY + # ... etc +``` + +This pattern (vs. `envFrom: secretRef:`) is more explicit — you know +exactly which secret keys a pod uses by reading the manifest. + +### ADMIN_PASSWORD — one-time use + +The Go app's `internal/database/database.go:519-538` reads +`ADMIN_EMAIL` + `ADMIN_PASSWORD` at startup. If the `admin_users` table +doesn't have a row for that email, it inserts one with a bcrypt hash of +the password. Already-existing rows are **not** updated. 
+ +So: +- First deploy: admin user created +- Subsequent deploys: no-op +- If you want to rotate the initial admin password: do it in the admin + panel UI, not by changing `ADMIN_PASSWORD` + +After first deploy you can technically blank `ADMIN_PASSWORD` in the +Secret. Leaving it set is harmless but slightly messy. + +## Secret: honeydue-apns-key (Opaque) + +One file: `apns_auth_key.p8`. Mounted as a volume into api and worker +pods at `/secrets/apns/apns_auth_key.p8` (read-only). + +Push is currently **disabled** (`FEATURE_PUSH_ENABLED=false`), so +this `.p8` is a throwaway EC P-256 private key generated by +`openssl genpkey`. It passes the Go app's "does this file contain +`BEGIN PRIVATE KEY`" validation but cannot authenticate against Apple. + +When push is enabled: + +1. Generate a real APNs auth key in Apple Developer console +2. Replace `deploy/secrets/apns_auth_key.p8` +3. Update `APNS_AUTH_KEY_ID`, `APNS_TEAM_ID`, `APNS_TOPIC` in ConfigMap +4. `kubectl create secret generic honeydue-apns-key ... --dry-run=client -o yaml | kubectl apply -f -` +5. Set `FEATURE_PUSH_ENABLED=true` +6. `kubectl rollout restart` api and worker + +## Secret: gitea-credentials (docker-registry) + +Type `kubernetes.io/dockerconfigjson`. Contains a base64-encoded Docker +config for Gitea registry auth. + +Created via: + +```bash +kubectl create secret docker-registry gitea-credentials \ + --namespace=honeydue \ + --docker-server=gitea.treytartt.com \ + --docker-username=admin \ + --docker-password= \ + --dry-run=client -o yaml | kubectl apply -f - +``` + +Referenced in every deployment that pulls from Gitea: + +```yaml +spec: + imagePullSecrets: + - name: gitea-credentials +``` + +When a pod needs to pull an image, the kubelet reads this secret and +uses it for the registry authentication. + +## Source files — what's canonical + +The Swarm-era files are still the **source of truth** for secrets: + +| File | Contents | Canonical? 
| +|---|---|---| +| `deploy/prod.env` | All non-sensitive config | Yes | +| `deploy/secrets/postgres_password.txt` | Neon DB password | Yes | +| `deploy/secrets/secret_key.txt` | App signing key | Yes | +| `deploy/secrets/email_host_password.txt` | Fastmail password | Yes | +| `deploy/secrets/fcm_server_key.txt` | FCM key (placeholder) | Yes | +| `deploy/secrets/apns_auth_key.p8` | APNs key (placeholder) | Yes | +| `deploy/registry.env` | Gitea registry auth | Yes | +| `deploy-k3s/manifests/secrets.yaml.example` | Template only (never committed with real values) | No — template | +| In-cluster Secrets | Live state | Derived | + +### Why canonical lives in `deploy/` not `deploy-k3s/` + +Historical. We migrated from Swarm to k3s but kept the source files +untouched. Rather than move them now (and break any remaining Swarm-era +tooling), we use them from the k3s setup scripts as-is. + +Future cleanup: move to `deploy-k3s/secrets/` for better provenance. + +## Recreating the cluster secrets + +If the k3s cluster is rebuilt, the Secrets need to be recreated from the +local source files. 
Rough procedure: + +```bash +export KUBECONFIG=~/.kube/honeydue-k3s.yaml + +# Namespace first +kubectl create namespace honeydue + +# Docker config secret for Gitea +set -a; source deploy/registry.env; set +a +kubectl create secret docker-registry gitea-credentials \ + -n honeydue \ + --docker-server="$REGISTRY" \ + --docker-username="$REGISTRY_USERNAME" \ + --docker-password="$REGISTRY_TOKEN" + +# Main secrets bundle +set -a; source deploy/prod.env; set +a +kubectl create secret generic honeydue-secrets -n honeydue \ + --from-literal=POSTGRES_PASSWORD="$(tr -d '\n' < deploy/secrets/postgres_password.txt)" \ + --from-literal=SECRET_KEY="$(tr -d '\n' < deploy/secrets/secret_key.txt)" \ + --from-literal=EMAIL_HOST_PASSWORD="$(tr -d '\n' < deploy/secrets/email_host_password.txt)" \ + --from-literal=FCM_SERVER_KEY="$(tr -d '\n' < deploy/secrets/fcm_server_key.txt)" \ + --from-literal=REDIS_PASSWORD="" \ + --from-literal=B2_KEY_ID="$B2_KEY_ID" \ + --from-literal=B2_APP_KEY="$B2_APP_KEY" \ + --from-literal=ADMIN_EMAIL="$ADMIN_EMAIL" \ + --from-literal=ADMIN_PASSWORD="$ADMIN_PASSWORD" + +# APNS key Secret +kubectl create secret generic honeydue-apns-key -n honeydue \ + --from-file=apns_auth_key.p8=deploy/secrets/apns_auth_key.p8 + +# ConfigMap from prod.env (minus secret keys) +# See deploy-k3s/scripts/02-setup-secrets.sh for the full version +# Simplified: +declare -a args +secret_keys="POSTGRES_PASSWORD SECRET_KEY EMAIL_HOST_PASSWORD FCM_SERVER_KEY REDIS_PASSWORD B2_KEY_ID B2_APP_KEY ADMIN_EMAIL ADMIN_PASSWORD" +while IFS='=' read -r k v; do + [[ -z "$k" || "$k" =~ ^# ]] && continue + for sk in $secret_keys; do [[ "$k" == "$sk" ]] && continue 2; done + args+=(--from-literal="$k=$v") +done < deploy/prod.env +kubectl create configmap honeydue-config -n honeydue "${args[@]}" +``` + +The full version with all edge cases is in +`deploy-k3s/scripts/02-setup-secrets.sh` (which was written for the +GHCR-era assumption; adapt for Gitea). 
+ +## Pitfalls + +### Trailing newlines in secret files + +Secret files created by text editors typically end with a newline. If +we pass the content directly, the newline becomes part of the secret +— a mismatch to what the app expects. + +We strip trailing newlines with `tr -d '\n'` before creating Secrets. +If you forget, your DB password will be silently wrong. + +### Case sensitivity on POSTGRES_DB + +`POSTGRES_DB=honeyDue` must be exactly `honeyDue`. `honeydue` (lowercase) +fails with `database "honeydue" does not exist`. Postgres identifiers +are case-sensitive if originally quoted at CREATE time. + +### Placeholder detection + +The Swarm-era deploy script rejected values containing `CHANGEME`, +`your-`, `paste_here`, etc. When setting up the k3s cluster we had to +strip those from `prod.env` first. If you ever see a pod error about +"invalid host" or "invalid key id", check if a placeholder leaked +through. + +### B2_USE_SSL vs STORAGE_USE_SSL + +The config has `B2_USE_SSL` but the Go code reads `STORAGE_USE_SSL`. +See Chapter 9 §Vestigial variable. Setting `B2_USE_SSL=false` in the +ConfigMap does nothing; SSL stays on. 
+ +## Operator cheat sheet + +```bash +# Print a ConfigMap as env-file format +kubectl get cm honeydue-config -n honeydue -o jsonpath='{range .data}{"\n"}{end}' + +# Edit a ConfigMap interactively (DOES NOT restart pods) +kubectl edit cm honeydue-config -n honeydue + +# After editing a ConfigMap, restart pods to pick up +kubectl rollout restart -n honeydue deploy/api deploy/admin deploy/worker + +# View a Secret (prints base64 — decode with base64 -d) +kubectl get secret honeydue-secrets -n honeydue -o yaml + +# Reveal a specific secret value (DANGER: plaintext to stdout) +kubectl get secret honeydue-secrets -n honeydue \ + -o jsonpath='{.data.POSTGRES_PASSWORD}' | base64 -d + +# Update a single secret key +kubectl patch secret honeydue-secrets -n honeydue \ + --type=merge -p "{\"data\":{\"SECRET_KEY\":\"$(echo -n 'newvalue' | base64)\"}}" +``` + +## References + +- [Kubernetes ConfigMaps][cm] +- [Kubernetes Secrets][secret] +- [Secret types][secret-types] + +[cm]: https://kubernetes.io/docs/concepts/configuration/configmap/ +[secret]: https://kubernetes.io/docs/concepts/configuration/secret/ +[secret-types]: https://kubernetes.io/docs/concepts/configuration/secret/#secret-types diff --git a/docs/deployment/11-registry.md b/docs/deployment/11-registry.md new file mode 100644 index 0000000..3ca10b9 --- /dev/null +++ b/docs/deployment/11-registry.md @@ -0,0 +1,329 @@ +# 11 — Container Registry (Gitea) + +## Summary + +We host our own container registry on Gitea at `gitea.treytartt.com`. +Every image push and pull goes there, not Docker Hub or GHCR. The Gitea +instance runs outside this k3s cluster (on its own VPS) and is available +at `https://gitea.treytartt.com` with public HTTPS. Image pulls are +authenticated via a Personal Access Token stored as a Kubernetes +`dockerconfigjson` Secret. 
+ +## Why Gitea + +### Decision matrix + +| Option | Cost | Auth model | Pros | Cons | +|---|---|---|---|---| +| **Gitea built-in registry** | $0 (already running Gitea) | Gitea PAT | Self-hosted, integrated with code | Another service to maintain | +| GHCR (GitHub Container Registry) | Free for public, $0 for private with paid plan | GitHub PAT | Popular, reliable | Uses GitHub; vendor dependency | +| Docker Hub | Free tier limited; paid $5-7/mo | Docker Hub account | Ubiquitous | Rate limits on anonymous pulls | +| AWS ECR | ~$1/mo for small use | IAM | Integrates with AWS workloads | AWS account required | +| Harbor (self-hosted) | $0 | Many options | Best enterprise features | Heavy to operate | + +Gitea won primarily because **the operator was already running Gitea for +code hosting**. Container registry is built into Gitea 1.17+ as a free +feature. One fewer service to set up. + +Side benefits: +- Code and images live together (one backup policy, one access model) +- PATs are scoped and rotatable via the same UI +- No external vendor to worry about for this critical piece of the + deploy pipeline + +Rejected alternatives: +- **Docker Hub** — rate limits on unauthenticated pulls would bite us if + nodes pull the same image repeatedly during rolling updates +- **GHCR** — fine but adds GitHub dependency we don't otherwise have +- **Harbor** — massive overkill; we're not a 100-team enterprise + +## Layout + +Images live under the authenticated user's namespace: + +``` +gitea.treytartt.com/admin/honeydue-api:237c6b8 +gitea.treytartt.com/admin/honeydue-worker:237c6b8 +gitea.treytartt.com/admin/honeydue-admin:237c6b8 +``` + +`admin` is the Gitea user that owns the images. Images are **private** +by default. + +### Image tagging strategy + +Tags are git short SHAs (e.g., `237c6b8`). Not `:latest`. Not semantic +version. + +Rationale: +- `:latest` is ambiguous — which build? Rolling updates should roll a + *specific* tag so rollbacks are deterministic. 
+- `:v1.2.3` works for released libraries but our app rolls forward + continuously; versioning per deploy is unnecessary overhead. +- Git SHAs are unique, immutable, and tie each image to the exact + commit that built it. + +`PUSH_LATEST_TAG=false` is set in `deploy/cluster.env`. When we rebuild +and push, only the SHA tag gets pushed. The `latest` tag is never +created by our deploy pipeline. + +## Authentication + +### Creating the PAT + +At `https://gitea.treytartt.com/user/settings/applications`, we created +a token with scopes: + +- `read:package` +- `write:package` + +No other scopes. This token can only interact with the package registry; it +can't read repo contents, create issues, or touch account settings. + +### PAT on the operator workstation + +Stored in `deploy/registry.env`: + +``` +REGISTRY=gitea.treytartt.com +REGISTRY_NAMESPACE=admin +REGISTRY_USERNAME=admin +REGISTRY_TOKEN=<PAT> +``` + +This file is `.gitignore`d in `deploy/.gitignore`. If it ever gets +committed accidentally, rotate the PAT immediately. + +### PAT in the cluster + +Stored as the `gitea-credentials` Secret (type `dockerconfigjson`) in +the `honeydue` namespace. See Chapter 10. + +Kubelet reads this Secret when a pod needs to pull from the Gitea +registry. + +## The build pipeline + +### Dockerfile multi-stage + +`honeyDueAPI-go/Dockerfile` has three target stages: + +- `api` — compiled Go binary + static assets for the HTTP API +- `worker` — compiled Go binary for the background worker +- `admin` — Next.js standalone build of the admin panel + +A single Dockerfile keeps build-cache sharing efficient (the Go builder +stage produces binaries for both api and worker; admin reuses its own +Node builder stage). + +### Multi-arch cross-compilation + +The operator workstation is **arm64** (Apple Silicon). The Hetzner nodes +are **x86_64**. A naive `docker build` on arm64 produces arm64 images +that won't run on the nodes (`exec format error`).
+ +The deploy pipeline uses `docker buildx`: + +```bash +docker buildx build \ + --platform linux/amd64 \ + --target api \ + -t gitea.treytartt.com/admin/honeydue-api:$SHA \ + --push \ + /Users/treyt/Desktop/code/honeyDue/honeyDueAPI-go +``` + +- **`--platform linux/amd64`** — cross-compile to x86_64 +- **`--target api`** — which Dockerfile stage to build +- **`--push`** — push directly to the registry (skip local image cache) + +The Go stages use the `TARGETARCH` build arg to produce the right +architecture binary. Node stages use QEMU emulation (which is slower but +acceptable for our ~1 min admin build). + +### Buildx builder + +We use a named buildx builder to keep state out of Docker's default +environment: + +```bash +docker buildx create --name honeydue-builder --use +docker buildx inspect --bootstrap +``` + +The `honeydue-builder` is a docker-container driver — spawns a +BuildKit container when building, tears it down when idle. Supports +multi-platform and caches layers across builds. + +## From local file to cluster — the full path + +```mermaid +flowchart LR + subgraph dev[Operator workstation] + Code[Source code] + Dockerfile + Buildx[docker buildx] + end + subgraph Gitea[gitea.treytartt.com] + Reg[Package registry] + end + subgraph K8s[k3s cluster] + Kubelet + Containerd + Pod + end + + Code --> Dockerfile + Dockerfile --> Buildx + Buildx -- push --> Reg + Reg -- pull --> Kubelet + Kubelet --> Containerd + Containerd --> Pod +``` + +### End-to-end + +1. **Operator pushes code**: commits to `main` locally +2. **Operator builds + pushes image**: `docker buildx build --push ...` + from the repo root. Build takes 1–3 minutes first time, seconds on + warm cache. +3. **Image lands in Gitea**: visible at + `https://gitea.treytartt.com/admin/-/packages/container/honeydue-api` +4. **Operator updates Deployment**: `kubectl set image deployment/api + api=gitea.treytartt.com/admin/honeydue-api:$NEW_SHA -n honeydue` +5. 
**K8s begins rolling update**: creates new ReplicaSet with new image +6. **Kubelet on target node** sees a pod with an image it doesn't have +7. **Kubelet calls containerd**: "pull this image using these creds" +8. **Containerd authenticates** to Gitea registry using the PAT from + `gitea-credentials` Secret, downloads the image +9. **Containerd starts the container** with the new image +10. **Readiness probe passes**: new pod joins the Service endpoints +11. **Kubelet tears down** an old pod + +## Pushing manually + +If you need to push a one-off image (e.g., testing a fix): + +```bash +# Login (once per session) +set -a; source deploy/registry.env; set +a +printf '%s' "$REGISTRY_TOKEN" | docker login "$REGISTRY" -u "$REGISTRY_USERNAME" --password-stdin + +# Build + push +cd honeyDueAPI-go +SHA=$(git rev-parse --short HEAD) +docker buildx build \ + --platform linux/amd64 \ + --target api \ + -t "gitea.treytartt.com/admin/honeydue-api:${SHA}" \ + --push . + +# Logout (don't leave creds in ~/.docker/config.json) +docker logout gitea.treytartt.com +``` + +## Image sizes + +Current images: + +| Image | Size | Layers | +|---|---|---| +| `honeydue-api` | ~53 MB | Alpine base + Go binary | +| `honeydue-worker` | ~50 MB | Alpine base + Go binary | +| `honeydue-admin` | ~150 MB | Node 20 alpine + Next.js standalone | + +The Go binaries are statically compiled, CGO_ENABLED=0. Alpine is the +base for smallest footprint. + +## Image retention + +Gitea does **not auto-prune** images. Every `:<sha>` tag accumulates +forever. The package page at +`https://gitea.treytartt.com/admin/-/packages/container/honeydue-api` +lists them all. + +At the current pace (a few deploys per week, images ~50-150 MB each), this grows +~10 GB/year. Not critical; an 80 GB node disk would take years to fill. + +**TODO**: Add a periodic cleanup: delete all but the last 30 tags per image. +Can be a monthly cron job or a manual quarterly cleanup. + +## Image verification — not yet + +We do not sign images or verify signatures.
An attacker who compromised +Gitea could push a malicious image under an existing tag (though Gitea +should prevent tag reuse if immutable tags are configured). + +**TODO** (Chapter 20): Add [cosign](https://github.com/sigstore/cosign) +for signing at build time + `Kyverno` or `Connaisseur` policy to verify +at pull time. + +## Gitea registry itself + +The Gitea instance runs outside this k3s cluster on its own VPS +(operator's existing infrastructure). It's **not** part of the honeyDue +deployment — it's adjacent infrastructure. + +If the Gitea host goes down: +- Currently-running pods keep working (they already pulled their images) +- New deployments/scale-ups fail at the image-pull step +- No impact on existing user traffic + +This is an acceptable external dependency. Gitea host has its own +uptime story. + +## Cost + +**$0/mo.** Gitea registry is included in the Gitea install we already +pay the VPS for (not accounted to honeyDue's cost). + +If we ever switched to GHCR, cost would still be $0 for public images +or bundled with our (nonexistent) GitHub Team subscription. + +## What we don't have + +- **Image scanning** (Trivy, Snyk) — scan images for known CVEs on push +- **Image signing** (cosign) +- **Multi-region replication** — only hosted in one place +- **High availability** — Gitea is single-instance + +For our scale, none of these are needed. TODO (Chapter 20) if the +operator appetite increases. + +## Operator cheat sheet + +```bash +# List packages via API +curl -sS "https://gitea.treytartt.com/api/v1/packages/admin?type=container" \ + -H "Accept: application/json" | jq . 
+ +# Browse in UI +# https://gitea.treytartt.com/admin/-/packages + +# Delete a specific tag via API +curl -X DELETE \ + -H "Authorization: token $GITEA_PAT" \ + "https://gitea.treytartt.com/api/v1/packages/admin/container/honeydue-api/237c6b8" + +# Login from kubectl side (refresh the Secret) +kubectl create secret docker-registry gitea-credentials -n honeydue \ + --docker-server=gitea.treytartt.com \ + --docker-username=admin \ + --docker-password= \ + --dry-run=client -o yaml | kubectl apply -f - + +# After rotating PAT, restart pods that use it for pulls +kubectl rollout restart -n honeydue deploy/api deploy/admin deploy/worker +``` + +## References + +- [Gitea Container Registry][gitea-cr] +- [Docker buildx multi-platform][buildx] +- [Kubernetes image pull secrets][pull-secrets] +- [cosign][cosign] + +[gitea-cr]: https://docs.gitea.com/usage/packages/container +[buildx]: https://docs.docker.com/build/buildx/ +[pull-secrets]: https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/ +[cosign]: https://github.com/sigstore/cosign diff --git a/docs/deployment/12-data-flow.md b/docs/deployment/12-data-flow.md new file mode 100644 index 0000000..6648dfe --- /dev/null +++ b/docs/deployment/12-data-flow.md @@ -0,0 +1,317 @@ +# 12 — Data Flow + +## Summary + +This chapter follows a user's request end to end, hop by hop. It's the +consolidated picture of Chapters 3, 6, 7, 8, 9 working together. Use +this chapter to answer "when X doesn't work, which layer failed?" + +## Scenario: User creates a task + +A user in Austin opens the mobile app and adds a new task for their +property. The client sends `POST https://api.myhoneydue.com/api/tasks/` +with a JSON body and an auth token. We trace every hop. 
+ +## Hop 1 — Mobile client → Cloudflare edge + +```mermaid +sequenceDiagram + participant App as iOS client + participant DNS as Local DNS + participant CFE as Cloudflare edge (DFW) + + App->>DNS: Resolve api.myhoneydue.com + DNS->>App: 104.21.13.7 (Cloudflare edge IP) + App->>CFE: TCP SYN :443 + CFE-->>App: TCP SYN+ACK + App->>CFE: TLS ClientHello + CFE->>App: TLS ServerHello + cert + Note over App,CFE: TLS 1.3 handshake
~1 RTT + App->>CFE: HTTP/2 stream
POST /api/tasks/
Authorization: Token +``` + +- Client resolves `api.myhoneydue.com` via OS resolver, gets Cloudflare + edge IP (not our origin IP) +- Client establishes TLS 1.3 to CF's nearest POP (Dallas for Austin) +- Cert presented by CF is `sni.cloudflaressl.com` or a CF-issued + `*.myhoneydue.com` — our origin cert is never seen by the client +- Latency: ~5–15 ms Austin → DFW + +## Hop 2 — Cloudflare edge → Origin (hetzner) + +```mermaid +sequenceDiagram + participant CFE as Cloudflare DFW POP + participant DNS as CF internal DNS + participant HN as hetzner node (random of 3) + participant Traefik as Traefik pod
(host network) + + CFE->>DNS: Which origin for api.myhoneydue.com? + DNS->>CFE: One of 178.104.247.152, 178.105.32.198, 178.104.249.189 + CFE->>HN: TCP SYN :80 + HN-->>CFE: SYN+ACK + CFE->>HN: HTTP/1.1 POST /api/tasks/
Host: api.myhoneydue.com
X-Forwarded-For:
X-Forwarded-Proto: https
CF-Connecting-IP: + Note over HN: UFW: allow 80/tcp from
anywhere (unrestricted for now) + HN->>Traefik: delivered to listener +``` + +- CF picks one of the 3 node IPs via DNS round-robin. This is per-connection, not per-request. +- Protocol between CF and origin: **HTTP/1.1 plaintext** (SSL=Flexible). + A future Full-strict upgrade would make this HTTPS. +- Latency: ~90–120 ms DFW → Nuremberg +- CF adds headers: `CF-Connecting-IP`, `X-Forwarded-For`, `X-Forwarded-Proto` + +## Hop 3 — Traefik → api Service + +```mermaid +sequenceDiagram + participant Traefik as Traefik pod + participant CoreDNS as CoreDNS (10.43.0.10) + participant KP as kube-proxy IPVS
(kernel) + participant APIPod as api pod
(some node) + + Note over Traefik: Match Host: api.myhoneydue.com
→ honeydue-api Ingress
→ backend: api Service :8000 + Traefik->>CoreDNS: Resolve "api" + CoreDNS->>Traefik: 10.43.167.83 (Service ClusterIP) + Traefik->>KP: TCP SYN to 10.43.167.83:8000 + KP->>KP: IPVS: pick endpoint
from Service endpoint set + KP->>APIPod: Rewrite destination
to 10.42.2.6:8000
(Flannel VXLAN if remote node) +``` + +- Traefik resolves `api` via CoreDNS → gets the Service ClusterIP +- Traefik sends to `10.43.167.83:8000` +- kube-proxy IPVS (running in-kernel on the node where Traefik lives) + intercepts, picks a live endpoint, rewrites +- Destination might be local (same node) or remote (VXLAN tunnel to + another node) +- Latency: <3 ms even cross-node + +## Hop 4 — api → Postgres (Neon) + +```mermaid +sequenceDiagram + participant API as api pod (Go) + participant Resolv as Pod resolv.conf + participant Neon as Neon pooler
AWS us-east-1 + + API->>Resolv: Resolve ep-floral-truth-...-pooler.us-east-1.aws.neon.tech + Note over Resolv: Goes to CoreDNS
which forwards to upstream
(Hetzner's DNS, then public root) + Resolv->>API: Neon pooler IP (e.g., 34.206.177.121) + API->>Neon: TCP :5432 + API->>Neon: TLS 1.3 handshake (DB_SSLMODE=require) + API->>Neon: Postgres startup (user, database) + API->>Neon: BEGIN
SELECT ... FROM task_task WHERE residence_id = ?
INSERT INTO task_task (...) VALUES (...)
COMMIT + Neon-->>API: Query results +``` + +- Go's database/sql pool may already have an idle connection. If so, + skip handshake. +- If new connection: ~50 ms TLS handshake + Postgres startup +- Query itself: typically ~5–20 ms (single-row read/write on indexed + columns) +- Total for this hop: often <10 ms on a warm connection, ~80 ms cold + +## Hop 5 — api → Redis (cache invalidation) + +```mermaid +sequenceDiagram + participant API as api pod + participant CoreDNS + participant KP as kube-proxy + participant Redis as redis pod + + API->>CoreDNS: Resolve "redis" + CoreDNS->>API: 10.43.7.10 + API->>KP: TCP :6379 + KP->>Redis: rewritten to 10.42.x.y:6379 + API->>Redis: DEL tasks:user:<id> (invalidate cached list) + Redis-->>API: OK +``` + +- Redis connection is usually kept alive in the api's pool +- Latency: <1 ms (Redis is on hetzner2, usually a short hop) + +## Hop 6 — api → worker (enqueue side effect) + +For some task creation events, api enqueues a background job +(send-notification, update-lookup-table, etc.): + +```mermaid +sequenceDiagram + participant API as api pod + participant Redis as redis pod (acting as Asynq queue) + participant Worker as worker pod + + API->>Redis: RPUSH asynq:queue:default <job> + Redis-->>API: OK + Note over API,Worker: (Async, no response blocking) + Worker->>Redis: BLPOP asynq:queue:default + Redis-->>Worker: <job> + Worker->>Worker: Process job
(send email, push, etc.) +``` + +api returns to the caller without waiting for the job. + +## Hop 7 — Response back to user + +Reverse the path: + +1. api returns JSON response to Traefik +2. Traefik returns to Cloudflare +3. Cloudflare re-encrypts TLS to user +4. User receives response + +## End-to-end latency budget + +For a typical "create task" operation: + +| Hop | Latency | +|---|---| +| User → CF (Austin → DFW) | 5–15 ms | +| CF → hetzner (cross-Atlantic) | 90–120 ms | +| UFW + kernel + Traefik accept | <1 ms | +| Traefik → api (same or cross-node) | 1–3 ms | +| api request parsing, auth validation | 1–3 ms | +| api → Postgres (query) | 20–60 ms | +| api → Redis (invalidate) | <1 ms | +| api response generation | 1–5 ms | +| Return path | same as forward, reversed | + +**Total**: ~220–310 ms typical. Dominated by the cross-Atlantic CF→origin +hop and the Postgres query round trip. + +## Read path (GET /api/tasks/) + +Similar but simpler: + +```mermaid +sequenceDiagram + participant App as iOS client + participant CF as Cloudflare + participant Traefik + participant API as api pod + participant Redis + participant Neon + + App->>CF: GET /api/tasks/ + CF->>Traefik: (no cache hit) + Traefik->>API: Route via Service + API->>Redis: GET tasks:user: + alt Cache hit + Redis-->>API: cached JSON + else Cache miss + API->>Neon: SELECT ... + Neon-->>API: rows + API->>Redis: SET tasks:user: EX 300 + end + API-->>Traefik: 200 JSON + Traefik-->>CF: 200 + CF-->>App: 200 (may cache per response headers) +``` + +## Admin panel data flow + +A different dance because the admin is Next.js: + +```mermaid +sequenceDiagram + participant Browser + participant CF + participant Traefik + participant Admin as admin pod (Next.js) + participant AdminAPI as api pod
(via public URL) + participant Neon + + Browser->>CF: GET admin.myhoneydue.com/users + CF->>Traefik: HTTP :80 + Traefik->>Admin: Service /users + Note over Admin: Next.js SSR:
fetch from NEXT_PUBLIC_API_URL + Admin->>CF: GET api.myhoneydue.com/api/admin/users/ + CF->>Traefik: (api ingress) + Traefik->>AdminAPI: Service + AdminAPI->>Neon: SELECT ... FROM auth_user + Neon-->>AdminAPI: rows + AdminAPI-->>Admin: JSON + Admin->>Admin: Render HTML + Admin-->>Traefik: HTML + Traefik-->>CF: HTML + CF-->>Browser: HTML +``` + +Notably, the admin pod's calls to api go **back out to Cloudflare** and +in through the public URL. Not the in-cluster Service IP. This is +because `NEXT_PUBLIC_API_URL=https://api.myhoneydue.com` — Next.js builds +use the same URL for browser-side and server-side fetches. + +This is **suboptimal** — server-side (SSR) calls could use the internal +`api.honeydue.svc:8000` URL and skip the CF round-trip. Future +optimization: separate `NEXT_PUBLIC_API_URL` (browser) from `API_URL` +(server-side). + +## Static asset flow + +For the marketing landing page at `https://myhoneydue.com/`: + +1. CF caches HTML per `Cache-Control` (the Go app sets short TTLs) +2. CF caches CSS / JS / images aggressively (via default CF rules) +3. First request hits origin, subsequent requests served from CF edge + +The static assets live inside the api container at `/app/static/`. +Served by Echo's static file handler at routes `/css`, `/js`, `/images`. + +## Request flow during a rolling update + +When a new api image is deployed, some requests will hit old pods and +some will hit new pods for a few minutes: + +```mermaid +sequenceDiagram + participant CF + participant Traefik + participant OldPod as api pod v1 + participant NewPod as api pod v2 (starting) + + Note over NewPod: kubelet starts new pod + Note over NewPod: pod connects to Postgres
MigrateWithLock runs (no-op)
HTTP server starts
readinessProbe passes + Note over NewPod: kube-proxy updates endpoints
NewPod added to Service pool + CF->>Traefik: request 1 + Traefik->>OldPod: routed (old pod still in pool) + CF->>Traefik: request 2 + Traefik->>NewPod: routed (new pod now in pool) + Note over OldPod: Kubelet terminates old pod
(graceful SIGTERM, then SIGKILL after grace) + CF->>Traefik: request 3 + Traefik->>NewPod: routed (OldPod gone from pool) +``` + +Both old and new handle traffic simultaneously until the rolling update +completes. As long as the new code is API-compatible, users don't +notice. + +## Failure modes in the data path + +See [Chapter 16 — Failure Modes](./16-failure-modes.md) for a full +catalog. + +Quick summary: + +| Layer fails | User sees | Recovery | +|---|---|---| +| Cloudflare DNS down | Can't resolve api.myhoneydue.com | Manual DNS fallback; extremely rare | +| Cloudflare edge down (single POP) | Slow, CF routes to another POP | Automatic | +| Node NIC fails | Some requests time out (CF routes away) | Cluster reschedules pods | +| UFW misconfig blocks :80 | 521 errors at CF | Re-add rule | +| Traefik pod down on one node | CF routes to other nodes | Automatic | +| kube-proxy broken on one node | Pods on that node can't reach Services | Restart kubelet | +| CoreDNS down | New connections fail DNS | Restart CoreDNS | +| Flannel broken between nodes | Cross-node pod communication fails | Restart flannel or node | +| api pod OOM | 502 to user briefly | kubelet restarts pod | +| Postgres down | 500 errors from api | Neon-side issue; outage | +| Redis down | api serves without cache (degraded) | Restart Redis pod | +| B2 down | Uploads fail, existing content served if cached | Backblaze-side outage | + +## References + +- [Chapter 3 — Networking](./03-networking.md) for the overlay mechanics +- [Chapter 6 — Traefik](./06-traefik-ingress.md) for routing details +- [Chapter 7 — Services](./07-services.md) for per-service specifics +- [Chapter 16 — Failure Modes](./16-failure-modes.md) for what-if scenarios diff --git a/docs/deployment/13-cloudflare.md b/docs/deployment/13-cloudflare.md new file mode 100644 index 0000000..38e77cb --- /dev/null +++ b/docs/deployment/13-cloudflare.md @@ -0,0 +1,344 @@ +# 13 — Cloudflare + +## Summary + +Cloudflare sits in front of every 
public request. It provides DNS +(authoritative nameservers for `myhoneydue.com`), TLS termination at +the edge, DDoS mitigation, caching, and the round-robin fan-out across +our three node IPs. We use the Free plan. TLS mode is "Flexible" +(HTTP between CF and origin). This chapter documents every Cloudflare +setting that matters. + +## DNS + +### Zone + +`myhoneydue.com`, managed by Cloudflare. Authoritative nameservers: + +``` +carol.ns.cloudflare.com +ishaan.ns.cloudflare.com +``` + +### Records that matter + +| Type | Name | Content | Proxy | Notes | +|---|---|---|---|---| +| A | `api` | 178.104.247.152 | 🟠 Proxied | hetzner1 | +| A | `api` | 178.105.32.198 | 🟠 Proxied | hetzner2 | +| A | `api` | 178.104.249.189 | 🟠 Proxied | hetzner3 | +| A | `admin` | 178.104.247.152 | 🟠 Proxied | same 3 IPs | +| A | `admin` | 178.105.32.198 | 🟠 Proxied | | +| A | `admin` | 178.104.249.189 | 🟠 Proxied | | +| A | `@` | 178.104.247.152 | 🟠 Proxied | same 3 IPs | +| A | `@` | 178.105.32.198 | 🟠 Proxied | | +| A | `@` | 178.104.249.189 | 🟠 Proxied | | + +Three A records per name → Cloudflare selects one per request. With +proxying on (orange cloud), **the client never sees these IPs** — it +sees a Cloudflare edge IP. CF internally picks which of the three +origin IPs to connect to; if one fails the connection, CF retries the +next. + +**TXT records for email** (Fastmail sending domain): SPF, DKIM, DMARC. +Not our immediate concern; configured by the Fastmail custom-domain +setup. + +### Why three A records per name, not one + +With one record pointing at hetzner1: +- Only hetzner1 sees traffic +- If hetzner1 is unreachable, everything breaks until we change DNS + +With three records: +- CF chooses one origin per connection +- If one node's port :80 stops responding, CF tries the others +- Node upgrades can be done one at a time with no user impact + +This is poor-man's load balancing. 
A Hetzner Load Balancer or Cloudflare +Load Balancer (paid) would be more sophisticated — with active health +checks and automatic failover on sub-second latency. Our DNS approach +is "good enough" for the traffic volume. + +### Cloudflare's origin health checks + +On Free plan, CF doesn't actively probe origins. It reacts to real +connection failures: if an origin returns 5xx repeatedly or connection +times out, CF marks it unhealthy for that edge POP for some time. + +Upgrading to **Cloudflare Load Balancing** ($5/mo add-on) would enable +active health checks — explicit probes independent of traffic. Useful +when you want sub-second failover. + +## TLS + +### Mode: Flexible + +CF Dashboard → SSL/TLS → Overview → **Flexible**. + +**What this means:** +- User ↔ Cloudflare: **TLS** (HTTPS) +- Cloudflare ↔ Origin: **plaintext HTTP** (port 80) + +**Why we chose it:** +- No origin cert required on the Hetzner nodes +- Zero Traefik cert-management complexity +- Fine for a site where CF terminates all user-facing TLS + +**Downsides:** +- An attacker with network access between CF and Hetzner could read + traffic. Realistically: nobody between CF's POPs and Hetzner's + Nuremberg DC, but it's theoretically plaintext on the wire. +- MitM risk if DNS gets hijacked and traffic is routed through an + unintended origin. + +### Future: Full (strict) + +The next step up is **Full (strict)**: CF verifies origin's TLS cert +and connects over HTTPS. Cloudflare provides free **Origin CA +certificates** for this: they're issued by a CF-internal CA that only +CF's own edge accepts. An attacker without a CF-signed cert can't +impersonate our origin. + +Path to enable: +1. Generate Origin CA cert in CF dashboard → SSL/TLS → Origin Server +2. Download as PEM +3. Create k8s Secret `cloudflare-origin-cert`: + ```bash + kubectl create secret tls cloudflare-origin-cert -n honeydue \ + --cert=origin.crt --key=origin.key + ``` +4. 
Add `tls:` block to our Ingress: + ```yaml + spec: + tls: + - hosts: [api.myhoneydue.com] + secretName: cloudflare-origin-cert + ``` +5. Switch CF SSL mode to Full (strict) + +Trade-off: the `cloudflare-origin-cert` eventually expires and must be rotated, but the default +validity is 15 years, so maintenance is low. **TODO** (Chapter 20). + +### Edge certificate + +CF provides a free edge certificate for `*.myhoneydue.com` and +`myhoneydue.com`. Auto-renewed by Cloudflare. We don't touch it. + +### Always Use HTTPS + +SSL/TLS → Edge Certificates → **Always Use HTTPS: On** (default). + +Redirects any HTTP → HTTPS at the CF edge. Clients that hit +`http://api.myhoneydue.com/*` get 301'd to `https://...`. Origin never +sees the HTTP request. + +### HSTS + +**Not currently enabled.** HSTS (HTTP Strict Transport Security) sends +a header telling browsers "always use HTTPS for this domain." Once set +with long `max-age`, it's **permanent** until it expires — if we later +misconfigure TLS, HSTS-enabled browsers refuse to connect at all. + +Enabling HSTS is a TODO but requires confidence in our TLS stability. +Not tonight. + +## DDoS mitigation + +CF's Free plan includes basic DDoS protection: +- Volumetric attacks absorbed at the edge +- Obvious bot patterns blocked (known-bad user agents, headless browsers + doing suspicious things) + +Under a large attack, CF might: +- Insert a "checking your browser" JavaScript challenge (the ~5-second + "Cloudflare is checking your browser" page) +- Rate-limit by IP + +Under a sustained, sophisticated attack we might need: +- CF Pro plan ($20/mo) for more rule customization +- Enterprise plan for negotiated protection +- Extra measures like Cloudflare Magic Transit + +So far, not needed.
+ +## Caching + +Default CF caching: +- Static assets (CSS, JS, images) cached aggressively based on extension +- HTML pages honored per `Cache-Control` headers from origin +- JSON API responses typically not cached (no `Cache-Control: public`) + +Our Go API doesn't set `Cache-Control: public` on any endpoint, so CF +treats them as uncacheable. Every API call reaches origin. + +If we wanted to cache certain endpoints (e.g., public lookup tables): +```go +c.Response().Header().Set("Cache-Control", "public, max-age=300") +``` +And CF will cache for 5 minutes. + +## Firewall rules at CF + +CF Dashboard → Security → WAF. On Free tier: +- Managed rules: a small free ruleset that blocks "obvious-attack" patterns +- Custom rules: limited (5 on Free, 20 on Pro) + +We have **no custom rules defined** currently. The managed ruleset +covers: +- SQL injection attempts in query strings +- Known-malicious bot User-Agents +- XSS attempts in common parameters + +## Rate limiting + +CF Free: **10,000 requests per 10 minutes per IP for free rules** (we +haven't configured any). The API itself should have rate limits for +sensitive endpoints; we don't rely on CF for that. + +## What CF does NOT do for us + +- **Authenticate users** — our app does +- **Authorize requests** — our app does +- **Encrypt pod-to-pod traffic** — nothing Cloudflare can help with +- **Backup origin data** — CF caches but doesn't store copies + persistently + +## Turnstile / bot management + +Not enabled. If we start seeing account-creation spam, Cloudflare +Turnstile (free) would be a good addition — a CAPTCHA replacement that +doesn't require user interaction for most traffic. + +## Origin IP protection + +CF proxying (orange cloud) is the primary protection of our origin IPs.
+When proxying is on: +- DNS queries return CF edge IPs, never origin +- HTTP/HTTPS traffic goes through CF + +However, our origin IPs **can leak** via: +- Email sending (if the app ever sent email directly from the origin IP) + — we use Fastmail so this isn't an issue +- Outbound connections (our pods connect out to Neon, B2, Fastmail from + the nodes' public IPs; those IPs appear in external logs) +- Historical DNS records (services like SecurityTrails log historical + DNS; if we ever had unproxied A records, attackers can look them up) + +**If origin IPs leak**, attackers can bypass CF's protection by +connecting directly to node IPs. Current mitigation: +- UFW exposes only :80/:443 to the public internet (open to any source IP) +- Our app has no ports bound to the public IP + +**Future** (Chapter 20): UFW rule to allow :80/:443 only from CF IP +ranges. Prevents direct-connect bypass entirely. + +## Cloudflare IP ranges (used in Traefik trustedIPs) + +From [cloudflare.com/ips](https://www.cloudflare.com/ips/): + +IPv4 ranges: +``` +173.245.48.0/20 +103.21.244.0/22 +103.22.200.0/22 +103.31.4.0/22 +141.101.64.0/18 +108.162.192.0/18 +190.93.240.0/20 +188.114.96.0/20 +197.234.240.0/22 +198.41.128.0/17 +162.158.0.0/15 +104.16.0.0/13 +104.24.0.0/14 +172.64.0.0/13 +131.0.72.0/22 +``` + +IPv6 ranges: +``` +2400:cb00::/32 +2606:4700::/32 +2803:f800::/32 +2405:b500::/32 +2405:8100::/32 +2a06:98c0::/29 +2c0f:f248::/32 +``` + +These are used in two places: +1. **Traefik `forwardedHeaders.trustedIPs`** — we already have this + configured (Chapter 6) +2. **UFW `allow 80/tcp from <cf-range>`** — NOT configured (TODO) + +CF occasionally adds new ranges. If a future CF range isn't in our +list, we'd either trust unknown IPs (if lax) or reject legitimate CF +traffic (if strict).
The canonical source is the public API: + +```bash +curl -sS https://www.cloudflare.com/ips-v4 +curl -sS https://www.cloudflare.com/ips-v6 +``` + +## API token for programmatic changes + +If we automate DNS changes (e.g., adding new subdomain on deploy), +we'd need a CF API token with `Zone:DNS:Edit` scope for the +`myhoneydue.com` zone. + +Currently not automated; DNS is managed in the CF dashboard by hand. + +## Cost + +**$0/mo**. Free plan covers everything we use. Paid plans add features +we don't need yet: + +| Feature | Free | Pro ($20) | Business ($200) | +|---|---|---|---| +| DNS + proxying | ✓ | ✓ | ✓ | +| Basic DDoS | ✓ | ✓ | ✓ | +| SSL (edge + Flexible + Full + Full strict) | ✓ | ✓ | ✓ | +| WAF managed rules | ✓ (limited) | ✓ (more) | ✓ (all) | +| Custom firewall rules | 5 | 20 | 100 | +| Page Rules | 3 | 20 | 50 | +| Image Resizing | no | no | ✓ | +| Load Balancing | no | $5/mo add-on | ✓ | + +We'd consider Pro ($20/mo) if: +- We needed a custom WAF rule beyond the 5-rule limit +- We wanted Image Resizing for user-uploaded photos + +Neither is needed today. 
+ +## Operator cheat sheet + +```bash +# Query current CF-served DNS +dig +short @1.1.1.1 api.myhoneydue.com # returns CF edge IPs when proxied + +# Query our origin directly (bypass CF) +curl -sS -H "Host: api.myhoneydue.com" http://178.104.247.152/api/health/ + +# Check CF headers (confirm you're going through CF) +curl -sS -I https://api.myhoneydue.com/api/health/ | grep -i cf- + +# Purge CF cache (requires API token) +curl -X POST \ + -H "Authorization: Bearer $CF_TOKEN" \ + -H "Content-Type: application/json" \ + "https://api.cloudflare.com/client/v4/zones/<ZONE_ID>/purge_cache" \ + -d '{"purge_everything":true}' +``` + +## References + +- [Cloudflare IP ranges][cf-ips] +- [Cloudflare SSL modes explained][cf-ssl] +- [Origin CA certificates][cf-origin-ca] +- [Cloudflare DNS best practices][cf-dns] + +[cf-ips]: https://www.cloudflare.com/ips/ +[cf-ssl]: https://developers.cloudflare.com/ssl/origin-configuration/ssl-modes/ +[cf-origin-ca]: https://developers.cloudflare.com/ssl/origin-configuration/origin-ca/ +[cf-dns]: https://developers.cloudflare.com/dns/ diff --git a/docs/deployment/14-deployment-process.md b/docs/deployment/14-deployment-process.md new file mode 100644 index 0000000..f2021ef --- /dev/null +++ b/docs/deployment/14-deployment-process.md @@ -0,0 +1,433 @@ +# 14 — Deployment Process + +## Summary + +A production deploy is: build a new image, push to Gitea, update the +Deployment's image field with the new SHA, Kubernetes rolls new pods in. +No downtime if the change is backward-compatible. Rollback is +`kubectl rollout undo`. This chapter walks through the full process, +plus alternate paths (config-only changes, manifest changes, hotfixes). + +## TL;DR for a code change + +```bash +# 1. Commit + get SHA +cd /Users/treyt/Desktop/code/honeyDue/honeyDueAPI-go +git add . && git commit -m "..." && SHA=$(git rev-parse --short HEAD) + +# 2.
Login to Gitea registry +set -a; source deploy/registry.env; set +a +printf '%s' "$REGISTRY_TOKEN" | docker login "$REGISTRY" -u "$REGISTRY_USERNAME" --password-stdin + +# 3. Build + push amd64 image +docker buildx build --platform linux/amd64 --target api \ + -t "gitea.treytartt.com/admin/honeydue-api:${SHA}" --push . + +# 4. Roll it in +export KUBECONFIG=~/.kube/honeydue-k3s.yaml +kubectl set image deployment/api -n honeydue \ + api="gitea.treytartt.com/admin/honeydue-api:${SHA}" + +# 5. Watch +kubectl rollout status -n honeydue deployment/api + +# 6. Log out +docker logout "$REGISTRY" +``` + +~3–5 minutes end to end for api. + +## The build + +### Step 1 — Prepare + +```bash +cd /Users/treyt/Desktop/code/honeyDue/honeyDueAPI-go +git status # clean working tree? +git log -1 --oneline # this is the SHA that'll ship +``` + +### Step 2 — Login to Gitea + +```bash +set -a; source deploy/registry.env; set +a +printf '%s' "$REGISTRY_TOKEN" | \ + docker login "$REGISTRY" -u "$REGISTRY_USERNAME" --password-stdin +``` + +**Note**: `docker login` without `--password-stdin` writes the token to +shell history. Don't skip the `printf` trick. + +### Step 3 — Build + push + +```bash +SHA=$(git rev-parse --short HEAD) + +# For API +docker buildx build \ + --platform linux/amd64 \ + --target api \ + -t "gitea.treytartt.com/admin/honeydue-api:${SHA}" \ + --push . + +# For Worker +docker buildx build \ + --platform linux/amd64 \ + --target worker \ + -t "gitea.treytartt.com/admin/honeydue-worker:${SHA}" \ + --push . + +# For Admin (Next.js) +docker buildx build \ + --platform linux/amd64 \ + --target admin \ + -t "gitea.treytartt.com/admin/honeydue-admin:${SHA}" \ + --push . +``` + +- `--platform linux/amd64` — cross-compile from operator's arm64 to + Hetzner nodes' amd64 +- `--target X` — select a stage from the multi-stage Dockerfile +- `--push` — push to registry in one step; don't leave image in local + Docker + +First build is slow (~3–5 min cold). 
Subsequent builds hit BuildKit +layer cache and complete in ~30–60s if only app code changed. + +### Build platform note + +If `docker buildx` isn't configured: + +```bash +docker buildx create --name honeydue-builder --use +docker buildx inspect --bootstrap +``` + +This creates a BuildKit container that supports cross-platform builds. +The `--bootstrap` line spins it up immediately so errors surface now +instead of on first build. + +## The deploy + +### For a single service + +```bash +export KUBECONFIG=~/.kube/honeydue-k3s.yaml + +kubectl set image deployment/api -n honeydue \ + api="gitea.treytartt.com/admin/honeydue-api:${SHA}" +``` + +This updates the Deployment's image field. Kubernetes: +1. Creates a new ReplicaSet with the new image (annotation records + rev) +2. Starts a new pod (per `maxSurge: 1`) +3. Waits for readinessProbe to pass on the new pod (up to 240s for + cold api boot) +4. Once ready, removes a pod from the old ReplicaSet +5. Repeats until all pods are on the new ReplicaSet +6. Marks rollout complete + +### Watching the rollout + +```bash +kubectl rollout status -n honeydue deployment/api +``` + +Outputs progress; returns when the rollout completes or fails. The command has no timeout of its own unless `--timeout` is set; +it reports failure once the Deployment's `progressDeadlineSeconds` (default 600s, i.e. 10 minutes) passes without progress. + +More detailed: + +```bash +# Watch pods transition +kubectl get pods -n honeydue -l app.kubernetes.io/name=api -w + +# Watch events +kubectl get events -n honeydue --sort-by=.lastTimestamp -w +``` + +### For all three services + +```bash +for svc in api worker admin; do + kubectl set image deployment/$svc -n honeydue \ + $svc="gitea.treytartt.com/admin/honeydue-${svc}:${SHA}" +done + +# Watch all rollouts +for svc in api worker admin; do + kubectl rollout status -n honeydue deployment/$svc +done +``` + +## Config-only changes (no new image) + +When you change `prod.env` but code is unchanged: + +```bash +# 1. Update prod.env locally +# 2.
Regenerate ConfigMap +kubectl create configmap honeydue-config -n honeydue \ + --from-env-file=deploy/prod.env \ + --dry-run=client -o yaml | kubectl apply -f - + +# 3. Pods do NOT auto-reload env vars. Restart them. +kubectl rollout restart -n honeydue deployment/api deployment/admin deployment/worker +``` + +`rollout restart` triggers a rolling update with the *same* image but +forces pod recreation. New pods pick up the updated ConfigMap. + +### Why not auto-reload? + +Kubernetes has no built-in mechanism to restart pods on ConfigMap change. +There's no `envFromWatch` equivalent. Third-party operators like +Reloader can do it, but we don't run one. + +For sensitive config (like the `SECRET_KEY`), this is actually good — +pods don't cycle unexpectedly when someone tweaks the ConfigMap. + +## Secret changes + +Same flow as config: + +```bash +# Rotate a value +kubectl patch secret honeydue-secrets -n honeydue \ + --type=merge -p "{\"data\":{\"SECRET_KEY\":\"$(echo -n 'newvalue' | base64)\"}}" + +# Restart pods +kubectl rollout restart -n honeydue deployment/api deployment/worker +``` + +## Manifest changes + +When you add/modify a deployment YAML: + +```bash +kubectl apply -f deploy-k3s/manifests/api/deployment.yaml +``` + +If the change is a spec field that Kubernetes considers a new pod +template (e.g., changing resource limits, env, volumes), pods roll. +If the change is a scalar like replicas, no pod churn — just new pods +added/removed. + +## Rollback + +### Last-known-good rollback + +```bash +kubectl rollout undo deployment/api -n honeydue +``` + +Reverts to the previous ReplicaSet (the one with the previous image). +Takes ~30s to stabilize. 
+ +### Rollback to a specific revision + +```bash +# See revision history +kubectl rollout history deployment/api -n honeydue + +# Revert to specific revision number +kubectl rollout undo deployment/api -n honeydue --to-revision=3 +``` + +Kubernetes keeps up to 10 ReplicaSet revisions by default +(`spec.revisionHistoryLimit`). + +### Hard rollback (deploy an older image) + +```bash +kubectl set image deployment/api -n honeydue \ + api="gitea.treytartt.com/admin/honeydue-api:<older-sha>" +``` + +Useful when you want to go back further than the revision history, or +to a specific known-good SHA. + +## Rolling update semantics + +```yaml +strategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 0 + maxSurge: 1 +``` + +For api (3 replicas): +- `maxUnavailable: 0` — no pod is removed until replacement is ready +- `maxSurge: 1` — up to 4 pods exist simultaneously during rollout + +Timeline (approximate, warm state): +- t=0: kubectl set image +- t=0: k8s creates new RS with 1 pod +- t=30s (or so): new pod readiness probe passes +- t=30s: k8s terminates 1 old pod +- t=60s: next new pod ready +- t=60s: another old pod terminates +- ...continues until all on new RS + +For cold-boot (e.g., first deploy on a rebuilt cluster), the +MigrateWithLock advisory lock extends this to several minutes. But the +rollout is serialized — only one pod starts per iteration, so the lock +queue is small. + +## Hotfix workflow + +When we need to ship a fix fast and skip the usual steps: + +1. Fix in code +2. Build + push +3. `kubectl set image` on the affected service only +4. Monitor with `kubectl logs -f` + +Don't skip CI/tests in a real org; for a solo operator, this is the trade-off. + +## Integration with Gitea + +Currently no CI/CD. The operator builds from the workstation and pushes +manually.
Future: + +- Gitea Actions (Drone-like CI) could trigger on push to `main` +- Build + push step could run in a GitHub Actions-compatible workflow +- Auto-deploy on tag push, manual promote to prod + +**TODO** (Chapter 20). + +## What the old Swarm deploy script did + +Contrast: `deploy/scripts/deploy_prod.sh` (Swarm-era) did: + +1. Validate every config file (placeholder detection, APNS key format, + B2 all-or-none) +2. Buildx to amd64 +3. Push to Gitea (we retrofitted this from GHCR) +4. SCP bundle to manager node +5. `docker secret create` + `docker config create` with versioned names +6. `docker stack deploy --with-registry-auth` +7. Poll stack services until convergence (420s timeout) +8. Prune old secret/config versions +9. Healthcheck the final URL; auto-rollback on failure +10. Log out of registries + +Our current k3s deploy is more manual but simpler. We'd write a similar +script for k3s if deploys become frequent: + +```bash +# deploy-k3s/scripts/04-deploy.sh (not yet updated for Gitea) +``` + +See the scaffold in `deploy-k3s/scripts/`. 
+ +## Common deploy failures + +| Symptom | Likely cause | +|---|---| +| `ImagePullBackOff` | Image not in registry, or pull secret expired | +| Stuck at "Progressing" | Readiness probe not passing; check pod logs | +| `CrashLoopBackOff` immediately | App won't start; check pod logs for panic/exit reason | +| `CrashLoopBackOff` after migration | Cache service, Redis connection, or post-init code issue | +| Old pods never terminate | New pods not ready; rollout doesn't progress | +| Rollout succeeds but app is broken | Readiness probe is too lenient; passes on broken app | + +### Debugging commands + +```bash +# Describe the deployment (shows events, conditions) +kubectl describe deployment api -n honeydue + +# Describe all api pods (selected by label) +kubectl describe pod -n honeydue -l app.kubernetes.io/name=api + +# Logs from currently-running pods +kubectl logs -n honeydue -l app.kubernetes.io/name=api --tail=100 --prefix + +# Logs from a pod's previous (crashed/restarted) container +kubectl logs -n honeydue <pod-name> --previous + +# Events in the namespace (sorted oldest first; newest at the bottom) +kubectl get events -n honeydue --sort-by=.lastTimestamp + +# Pause a rollout (stops new pods from being created) +kubectl rollout pause deployment/api -n honeydue + +# Resume +kubectl rollout resume deployment/api -n honeydue +``` + +## Zero-downtime considerations + +For zero-downtime deploys, the new image must be: + +1. **Backward-compatible** with the current database schema (schema + migrations run before new code) +2. **Backward-compatible** with in-flight API requests (don't remove + endpoints mid-deploy; deprecate first) +3. **Backward-compatible** with Redis data structures (don't change + cache key formats abruptly) + +For breaking changes: +1. Deploy intermediate version that handles both old and new +2. Once rolled out everywhere, deploy breaking-change version +3. Two deploys, same day or different days + +We don't have this discipline yet; our API has too few clients to +worry about.
As mobile clients proliferate, this becomes more important. + +## Blue-green / canary (not yet) + +Kubernetes supports advanced rollout strategies: +- **Canary**: route 5% of traffic to new version, scale up gradually +- **Blue-green**: run new version alongside old, flip traffic all at + once + +These require Traefik's TraefikService CRD with weighted routing, or +a service mesh. **TODO** if traffic scale justifies. + +## Cleanup: the old Swarm config + +`deploy/` directory contains the Swarm-era config. It's still there but +unused. After we're confident in k3s (a few weeks? month?), remove it: + +```bash +rm -rf deploy/ +``` + +Keep the useful files in `deploy-k3s/` only. + +## Operator cheat sheet + +```bash +# Full build + deploy +cd /Users/treyt/Desktop/code/honeyDue/honeyDueAPI-go +SHA=$(git rev-parse --short HEAD) +set -a; source deploy/registry.env; set +a +printf '%s' "$REGISTRY_TOKEN" | docker login "$REGISTRY" -u admin --password-stdin +docker buildx build --platform linux/amd64 --target api -t "gitea.treytartt.com/admin/honeydue-api:${SHA}" --push . +docker buildx build --platform linux/amd64 --target worker -t "gitea.treytartt.com/admin/honeydue-worker:${SHA}" --push . +docker buildx build --platform linux/amd64 --target admin -t "gitea.treytartt.com/admin/honeydue-admin:${SHA}" --push . 
+docker logout gitea.treytartt.com + +export KUBECONFIG=~/.kube/honeydue-k3s.yaml +for svc in api worker admin; do + kubectl set image deployment/$svc -n honeydue "$svc=gitea.treytartt.com/admin/honeydue-${svc}:${SHA}" +done + +for svc in api worker admin; do + kubectl rollout status -n honeydue deployment/$svc +done +``` + +## References + +- [Kubernetes Deployment rolling update][rolling] +- [kubectl rollout][rollout] +- [Docker buildx][buildx] + +[rolling]: https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#rolling-update-deployment +[rollout]: https://kubernetes.io/docs/reference/generated/kubectl/kubectl-commands#rollout +[buildx]: https://docs.docker.com/build/buildx/ diff --git a/docs/deployment/15-observability.md b/docs/deployment/15-observability.md new file mode 100644 index 0000000..84bc5e7 --- /dev/null +++ b/docs/deployment/15-observability.md @@ -0,0 +1,305 @@ +# 15 — Observability + +## Summary + +We have minimal observability today: `kubectl logs`, `kubectl top`, +Cloudflare Analytics, and the Neon dashboard. No Prometheus, no Grafana, +no centralized log aggregator, no APM. This is adequate for the +current traffic volume (low) but is a known gap. This chapter documents +what we *have* and what we'd add as traffic grows. + +## What we have + +### 1. `kubectl logs` + +Every container's stdout/stderr is captured by containerd and readable +via kubectl: + +```bash +# Live tail from all api pods +kubectl logs -n honeydue -l app.kubernetes.io/name=api -f --prefix + +# Last 100 lines +kubectl logs -n honeydue -l app.kubernetes.io/name=api --tail=100 + +# Previous container's logs for one pod (before the most recent restart) +kubectl logs -n honeydue <pod-name> --previous + +# Events (not logs — k8s-level state changes) +kubectl get events -n honeydue --sort-by=.lastTimestamp +``` + +**Retention**: containerd rotates logs when they exceed 10 MB (default). +Only the last ~20 MB of logs is retained per container, on-disk on the +node.
Once a pod is deleted, its logs are gone. + +For persistent log access we'd need aggregation (see §what we'd add). + +### 2. `kubectl top` + +Pod and node resource usage via metrics-server: + +```bash +kubectl top nodes +# NAME CPU(cores) CPU(%) MEMORY(bytes) MEMORY(%) +# ubuntu-8gb-nbg1-1 169m 4% 748Mi 9% +# ubuntu-8gb-nbg1-2 229m 5% 1043Mi 13% +# ubuntu-8gb-nbg1-3 124m 3% 770Mi 9% + +kubectl top pods -n honeydue +``` + +**Retention**: In-memory only. Last few minutes of data. No +historical view. + +### 3. Cloudflare Analytics + +CF Dashboard → Analytics & Logs. Per-zone stats: +- Requests per second +- Bandwidth +- Cache hit ratio +- Top HTTP status codes +- Top request paths +- Bot traffic score + +All aggregated, no individual request traces. Good for spotting macro +trends ("suddenly 10× more 502s today"), poor for debugging specific +issues. + +Free tier retention: 7 days of aggregate stats. Pro extends this. + +### 4. Neon dashboard + +Neon console → project → Monitoring: +- Compute utilization (CU-hours consumed) +- Query performance (slow queries) +- Active connections +- Storage usage + +Good for "is the DB busy?" and "am I close to my free tier limit?" +Not real-time. + +### 5. Kubernetes events + +`kubectl get events` shows cluster-level state changes: pod scheduling, +failures, image pulls, probe failures. Useful for post-mortem on +deploys. + +Retention: events are stored in etcd and expire after 1 hour by default. + +## What we don't have (the gap) + +### No log aggregation + +Individual pod logs are on the node. For multi-pod debugging ("show me +all api pod logs for user X") we have to: + +```bash +# Query all at once with stern (if installed) +stern -n honeydue api + +# Or for specific pod +kubectl logs -n honeydue <pod-name> | grep user_id=12345 +``` + +This works but doesn't scale. Grep across 3 pods for a specific +user_id is OK. Across 30 pods, intractable. + +**What we'd add**: [Loki](https://grafana.com/oss/loki/) — a lightweight +log aggregator designed for k8s.
~$0 to self-host; integrates with +Grafana for queries. Or [Better Stack](https://betterstack.com/logs) +($10/mo, hosted). + +### No metrics/dashboards + +`kubectl top` tells us "is this pod hot right now?" but not "has CPU +been climbing over the past hour?" We'd need: + +- **Prometheus** — scrapes metrics from kubelet and pods' `/metrics` + endpoints, stores time series +- **Grafana** — queries Prometheus, renders dashboards + +K3s can install these via Helm in ~10 minutes. Adds ~500MB RAM to the +cluster. Stability and operational load: moderate. + +**Alternative**: [Kubernetes Dashboard](https://github.com/kubernetes/dashboard) +(not bundled with k3s; installed separately). Minimal UI over the existing +metrics API. Cheaper than Prometheus but less queryable. + +### No distributed tracing + +"This request took 800ms — which hop was slow?" is currently unanswerable +beyond "the DB query, probably." A real trace would show: +- TLS handshake time +- Traefik routing time +- Go handler time +- Postgres query time +- Redis call time +- Each B2 request time + +We'd add OpenTelemetry to the Go app and export to Jaeger/Tempo. Work +is moderate; value kicks in when we have complex request flows. + +### No alerting + +No PagerDuty, no Slack webhooks, no email on "api is returning 500s." +The operator finds out when users complain. + +Cheapest fix: [Uptime Kuma](https://github.com/louislam/uptime-kuma) +(self-hosted) or Better Stack Uptime (free for small teams). Ping +`https://api.myhoneydue.com/api/health/` every minute; alert if it fails. + +### No APM (Application Performance Monitoring) + +No request-level profiling. We can't see "which endpoint has the highest +p99 latency?" or "which SQL query is hot this week?" + +Options: Datadog, New Relic, Honeycomb, self-hosted Tempo+Grafana. +All are meaningful work to set up and cost $$$.
+ +## The app's logging conventions + +The Go app uses zerolog and emits structured JSON: + +```json +{ + "level": "info", + "time": "2026-04-24T05:29:40Z", + "caller": "/app/cmd/api/main.go:189", + "addr": ":8000", + "message": "HTTP server listening" +} +``` + +Log levels: `debug`, `info`, `warn`, `error`, `fatal`. Controlled by +`DEBUG=true|false` in ConfigMap (true sets level to debug, false sets +level to info). + +Every request is logged with: +- Method, path, status code +- Request ID (for correlating logs across pods) +- User ID (if authenticated) +- Latency + +```json +{ + "level": "info", + "method": "GET", + "path": "/api/tasks/", + "status": 200, + "latency_ms": 42, + "user_id": 123, + "request_id": "a6b5db35-..." +} +``` + +This is queryable by grep. Better with log aggregation. + +## Health endpoints + +Each service exposes a health endpoint: + +| Service | Endpoint | What it checks | +|---|---|---| +| api | `/api/health/` | Process alive (doesn't verify DB) | +| admin | `/` | Next.js is up | +| worker | (none public) | Internal Asynq status | + +Health endpoints are **shallow** — they return 200 if the process is +running and listening. They don't try to reach Postgres/Redis/etc. +Rationale: if Postgres is briefly down, we don't want all api pods to +start failing liveness and cascade-restart. + +## Dozzle (deprecated) + +The Swarm era had [Dozzle](https://github.com/amir20/dozzle) — a +lightweight web UI for Docker logs. Accessible via SSH tunnel to the +manager node. Not deployed on k3s; `kubectl logs` + `stern` fills the +niche. 
+ +## Kubernetes metrics the k8s API exposes + +Even without Prometheus, these are queryable: + +```bash +# Resource metrics (via metrics-server) +kubectl get --raw /apis/metrics.k8s.io/v1beta1/nodes +kubectl get --raw /apis/metrics.k8s.io/v1beta1/namespaces/honeydue/pods + +# Core API (k8s state) +kubectl get --raw /api/v1/namespaces/honeydue/pods/ + +# Kubelet metrics (per-node; proxied through the API server) +kubectl get --raw /api/v1/nodes/<node-name>/proxy/metrics +``` + +If we ever spin up Prometheus, these are the endpoints it would scrape. + +## Future: what to add and when + +| Trigger | Add | +|---|---| +| 10k+ daily users | Loki + Grafana for logs | +| 100+ req/s sustained | Prometheus + Grafana for metrics | +| Performance incidents | OpenTelemetry tracing | +| Revenue > $5k/mo | Paid monitoring (Datadog or similar) | +| First production outage | Alerting to phone/Slack | + +The overall philosophy: observability is an investment that compounds. +Add it before you need it, not after. But also don't over-invest at +idle. + +**Next quarter**: set up Uptime Kuma + Loki at minimum. + +## Checking what's installed + +```bash +# In kube-system namespace +kubectl get pods -n kube-system +# Should see: coredns, metrics-server, traefik, local-path-provisioner, +# and some k3s-related helm install jobs + +# In honeydue namespace +kubectl get pods -n honeydue +# api, admin, worker, redis + +# No monitoring namespace (yet) +kubectl get namespaces +# default, honeydue, kube-node-lease, kube-public, kube-system +``` + +## Operator cheat sheet + +```bash +# Tail all logs in the namespace +kubectl logs -n honeydue --all-containers=true --tail=50 -l app.kubernetes.io/part-of=honeydue + +# With stern (if installed: brew install stern) +stern -n honeydue .
+ +# Follow a specific pod's current container (for the last crashed run, drop -f and use --previous) +kubectl logs -n honeydue <pod-name> -f --previous=false + +# Pod resource usage +kubectl top pods -n honeydue --sort-by=memory +kubectl top pods -n honeydue --sort-by=cpu + +# Events (cluster-wide) +kubectl get events -A --sort-by=.lastTimestamp | tail -20 + +# Full state dump for a pod (debugging) +kubectl describe pod -n honeydue <pod-name> > /tmp/pod-dump.txt +kubectl logs -n honeydue <pod-name> > /tmp/pod-logs.txt +``` + +## References + +- [Kubernetes metrics-server][ms] +- [K3s metrics][k3s-metrics] +- [Loki][loki] +- [Stern (multi-pod log tail)][stern] + +[ms]: https://github.com/kubernetes-sigs/metrics-server +[k3s-metrics]: https://docs.k3s.io/advanced#enabling-metrics-server +[loki]: https://grafana.com/oss/loki/ +[stern]: https://github.com/stern/stern diff --git a/docs/deployment/16-failure-modes.md b/docs/deployment/16-failure-modes.md new file mode 100644 index 0000000..ef5585c --- /dev/null +++ b/docs/deployment/16-failure-modes.md @@ -0,0 +1,360 @@ +# 16 — Failure Modes + +## Summary + +Every component in the system has a failure mode, a user-visible +symptom, and a recovery story. This chapter enumerates them from the +edge inward. Use this as a reference when debugging or when planning +resilience improvements. + +## Failure catalog + +### Cloudflare-level + +#### CF edge POP outage + +**Symptom**: users in one geographic region see errors; other regions +fine. +**Recovery**: automatic — CF routes traffic to next-nearest POP. +**Our action**: none; wait for CF. +**Frequency**: rare, usually resolved in minutes. + +#### CF global outage (rare but has happened) + +**Symptom**: the whole site unreachable via CF. +**Recovery**: manual — disable CF proxy (grey cloud DNS records), users +hit origins directly. +**Our action**: in Cloudflare dashboard, flip each A record's proxy off. +Users then resolve to our node IPs directly; UFW allows :80/:443 from +anywhere so they reach Traefik. 
TLS breaks (origin has no cert in SSL +Flexible mode), but HTTP works. +**Frequency**: extremely rare (hours-long event happens ~annually). + +#### DNS hijacking + +**Symptom**: users' DNS queries return attacker IPs; all traffic +compromised. +**Mitigation**: unlikely at CF; users who use DoH/DoT are protected. +No mitigation at our level. +**Recovery**: requires CF incident response. + +### Node-level + +#### One node's NIC fails + +**Symptom**: Cloudflare's retry logic routes around it within seconds. +Users see a brief spike in latency as CF learns the IP is unhealthy. +Pods on that node get rescheduled to surviving nodes by Kubernetes +after `node-monitor-grace-period` (40s). +**Recovery**: +- Automatic pod rescheduling takes ~5 min (grace period + pod eviction) +- Dead node's Raft vote is missing; cluster stays up (2 of 3 quorum) +- Replace the node via Hetzner console when convenient +**Our action**: verify `kubectl get nodes` shows NotReady; check +Hetzner console to confirm the node's status; recreate if needed. + +#### Two nodes fail simultaneously + +**Symptom**: Raft loses quorum. Kubernetes API server rejects writes. +Existing pods keep running but nothing new can be scheduled/updated. +Single surviving node's pods continue serving traffic. +**Recovery**: +- If a failed node comes back within Raft's leader-election timeout + (seconds to minutes), quorum restores +- If failed nodes are truly gone, the cluster is broken — need to + rebuild +**Rebuild procedure**: from the surviving node, `k3s-killall.sh`, then +bootstrap a new 3-node cluster from scratch. Data in Neon/B2 is safe; +Redis state is lost. + +#### All three nodes fail simultaneously + +**Symptom**: full site outage. +**Recovery**: rebuild the cluster from scratch. +**Frequency**: Hetzner-region-wide outage, extremely rare. + +#### Node disk fills up + +**Symptom**: pods get evicted ("node is disk-pressure"). Containers +can't be scheduled on that node. 
+**Common cause**: container log buildup (containerd rotates at 10 MB +per container, but across dozens of pod churn cycles the total adds up), +local-path PVC fills up, apt cache. +**Recovery**: +```bash +ssh deploy@<ssh-alias> "sudo df -h; sudo du -sh /var/lib/rancher/* | sort -h" +# Then clean up +``` + +### k3s control plane failures + +#### etcd corruption on one node + +**Symptom**: Raft detects divergence; that node stops serving writes. +**Recovery**: remove the node from the cluster, rejoin. Etcd snapshot +is pulled from surviving peers automatically. + +#### CoreDNS down + +**Symptom**: pods can't resolve Service names. New TCP connections +fail; existing connections continue (they already resolved). +Typical manifestation: "DB connection failed — no such host" errors. +**Recovery**: k3s automatically restarts CoreDNS pod. If it +keeps crashing: +```bash +kubectl logs -n kube-system deploy/coredns --previous +kubectl rollout restart deployment/coredns -n kube-system +``` +**Frequency**: rare. + +#### metrics-server down + +**Symptom**: `kubectl top` returns an error; HPAs can't scale. +**Recovery**: restart metrics-server pod. Non-critical; service stays up. +```bash +kubectl rollout restart deployment/metrics-server -n kube-system +``` + +### Networking failures + +#### UFW rule accidentally blocks essential traffic + +**Symptom**: Some specific thing stops working (e.g., api can't reach +Postgres, cross-node pod traffic fails, kubectl times out). +**Recovery**: log in via SSH (if that still works), `sudo ufw status +numbered`, `sudo ufw --force delete <rule-number>` to remove offending rule. +**If SSH is blocked too**: Hetzner console → Rescue mode → mount disk +→ edit `/etc/ufw/user.rules`. + +#### Flannel broken on one node + +**Symptom**: pods on that node can't reach remote pods via overlay. +ClusterIP Services involving cross-node endpoints fail. 
+**Recovery**: restart k3s on that node: +```bash +ssh deploy@<ssh-alias> "sudo systemctl restart k3s" +``` + +#### Kube-proxy broken on one node + +**Symptom**: pods on that node can't reach ClusterIPs. Symptoms look +like DNS resolution succeeded but connection refused or timed out. +**Recovery**: same as Flannel — restart k3s on the node. + +### Application-level + +#### api pod OOM + +**Symptom**: pod gets killed, kubelet restarts it. User's request +returns 502 briefly; subsequent requests routed to healthy pods. +Readiness probe removes the OOMing pod from Service endpoints. +**Recovery**: automatic (pod restarts). If it keeps OOMing: +- Increase `resources.limits.memory` in the deployment +- Or debug the memory leak +**Check**: +```bash +kubectl describe pod -n honeydue <pod-name> | grep -i oom +kubectl logs -n honeydue <pod-name> --previous +``` + +#### api pod panics + +**Symptom**: goroutine panic kills the process. Kubelet restarts. +Similar user impact to OOM. +**Recovery**: automatic restart. But if the panic is deterministic +(same input → panic), the pod crashloops. +**Action**: read the logs, find the panic stack trace, fix the code, +deploy. +**Circuit-breaker scenario**: if all 3 api pods crashloop on startup +because of bad code, run `kubectl rollout undo deployment/api -n honeydue` to return to the previous revision. + +#### api deadlocks + +**Symptom**: all 3 pods are up, readiness passes (shallow probe), but +real requests time out or hang. +**Recovery**: liveness probe is the same endpoint as readiness, so it +won't help. You'll see gradually increasing 504s at the edge. Manual +intervention: +```bash +kubectl rollout restart deployment/api -n honeydue +``` + +#### admin pod crashes + +**Symptom**: 502 at Cloudflare when accessing admin.myhoneydue.com. +**Recovery**: k8s auto-restarts. Usually within 10-30s. +**Impact**: only admins lose access; user-facing api is unaffected. + +#### worker stops processing jobs + +**Symptom**: emails stop being sent, cron jobs stop firing. 
+**Detection**: no direct alert; need to notice via user feedback or +missing daily-digest emails. Or check Redis for queue backlog. +**Recovery**: +```bash +kubectl rollout restart deployment/worker -n honeydue +``` +**If persistent**: check logs for specific error: +```bash +kubectl logs -n honeydue deploy/worker --tail=100 +``` + +#### redis pod dies + node is different + +**Symptom**: Redis schedules to a new node, but the PVC is on the +original node (local-path is per-node). New Redis pod comes up but +finds an empty data directory (or can't mount at all). +**Recovery**: +- If the original node is still alive but Redis pod died: pod comes + back up on same node with data intact +- If the original node is gone: Redis starts empty. Cache regenerates. + Asynq queue state is lost; pending jobs re-queue on retry, cron + fires re-schedule on next tick. +- Ensure the node label `honeydue/redis=true` is on a healthy node: +```bash +kubectl label node <healthy-node-hostname> honeydue/redis=true --overwrite +kubectl label node <old-node-hostname> honeydue/redis- 2>/dev/null || true +``` + +### External service failures + +#### Neon Postgres outage + +**Symptom**: api logs fill with "failed to connect to database." All +mutating API calls fail. Reads from cache continue (via Redis) but +eventually cache expires. +**Recovery**: no action from us; Neon's problem. Users will see 5xx +until Neon is back. +**Mitigation for future**: multi-region Neon read replica, or +Postgres-level failover. +**Frequency**: Neon has had a handful of hours-scale outages since launch. + +#### Backblaze B2 outage + +**Symptom**: image uploads fail; image downloads fail unless cached by +CF. +**Recovery**: wait. B2 rarely goes down. +**Mitigation**: serve downloads via CF with long cache TTL — most +users won't notice brief B2 outages for read traffic. + +#### Fastmail SMTP unreachable + +**Symptom**: `worker` can't send transactional emails. Jobs retry per +Asynq's retry policy, eventually giving up and logging an error. 
+**Recovery**: automatic retry; wait for Fastmail to come back. +**Manual intervention**: re-enqueue jobs from the Asynq UI (we don't +expose it yet — future). + +#### Gitea registry unreachable + +**Symptom**: `kubectl rollout` stuck at "Pulling image" for new pods. +Existing pods continue running with their already-pulled images. +**Recovery**: wait for Gitea to come back. +**Mitigation**: K8s has `imagePullPolicy: IfNotPresent` by default on +SHA-tagged images, so images aren't re-pulled on every restart if +the node already has them cached. + +#### Cloudflare DNS failure + +See §CF failures above. + +## Combined failures + +### "Everything is slow" + +Most often = Neon is being hammered by our load + someone else's noisy +neighbor. +- Check `kubectl top pods` (are we CPU-bound?) +- Check Neon console for query performance +- Check CF analytics for traffic spikes + +### "Some users see 502, others don't" + +Usually one node has an unhealthy Traefik or api. Cloudflare routes +some connections to it, others to healthy nodes. +- `kubectl get pods -n kube-system -l app.kubernetes.io/name=traefik` +- `kubectl get pods -n honeydue -l app.kubernetes.io/name=api` +- Check per-pod logs + +### "It worked 5 minutes ago, now it doesn't" + +Something recent changed. 
Check: +- Recent deploys: `kubectl rollout history deployment/api -n honeydue` +- Recent manifest changes: `kubectl get events -A --sort-by=.lastTimestamp | tail -30` +- External: Cloudflare Status page, Neon Status page, Backblaze Status page + +## Planned outages + +### Node upgrades (OS patches) + +```bash +# Drain the node (evict pods, block scheduling) +kubectl drain ubuntu-8gb-nbg1-1 --ignore-daemonsets --delete-emptydir-data + +# SSH in, upgrade, reboot +ssh deploy@hetzner2 "sudo apt update && sudo apt upgrade -y && sudo reboot" + +# Wait for node to come back +watch kubectl get nodes + +# Uncordon +kubectl uncordon ubuntu-8gb-nbg1-1 +``` + +During the drain, pods from that node reschedule to the survivors. +With current workload (api: 3 replicas, everything else: 1), rescheduling +1 api pod is fine. Traffic loss: zero. + +Worker pod or Redis pod scheduled on the drained node would be +briefly unavailable during reschedule. Acceptable for planned windows. + +### k3s upgrades + +Same per-node drain + upgrade pattern, but with k3s-specific install: + +```bash +# On the node +curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION=v1.35.x+k3s1 sh -s - server + +# k3s detects existing install and upgrades in place +``` + +Do one node at a time. Verify cluster health between each. + +## Disaster recovery + +### Complete cluster loss + +Procedure: +1. Provision 3 new Hetzner CX33 nodes (or use existing if healthy) +2. Follow bootstrap procedure (Chapter 1 §node hardening) +3. Install k3s on each (Chapter 2 §HA architecture) +4. Configure kubeconfig +5. Apply all manifests: + ```bash + kubectl apply -f deploy-k3s/manifests/namespace.yaml + kubectl apply -f deploy-k3s/manifests/rbac.yaml + kubectl apply -f deploy-k3s/manifests/traefik-helmchartconfig.yaml + # Wait for Traefik to redeploy + # ... recreate secrets (see Chapter 10) ... + # ... apply rest of manifests ... + ``` +6. Update DNS if node IPs changed +7. 
Verify: curl https://api.myhoneydue.com/api/health/ + +Estimated time: **1-2 hours** if you've done it before. A lot of +context-switching between Hetzner console, SSH, kubectl, and CF. + +Neon data is untouched by any of this. B2 data is untouched. Only +state that's lost: Redis cache (regenerates) and any in-flight Asynq +jobs that were mid-processing. + +## References + +- [Kubernetes pod lifecycle][lifecycle] +- [K3s HA recovery][k3s-ha-recovery] +- [Hetzner rescue system][hetzner-rescue] + +[lifecycle]: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/ +[k3s-ha-recovery]: https://docs.k3s.io/datastore/ha-embedded#new-cluster-with-embedded-db +[hetzner-rescue]: https://docs.hetzner.com/cloud/servers/getting-started/enabling-rescue-system/ diff --git a/docs/deployment/17-runbook.md b/docs/deployment/17-runbook.md new file mode 100644 index 0000000..44df38f --- /dev/null +++ b/docs/deployment/17-runbook.md @@ -0,0 +1,369 @@ +# 17 — Operator Runbook + +## Summary + +Common procedures the operator runs. Each is a numbered sequence of +exact commands. If a step is unclear, add a comment; if a procedure +fails in an unexpected way, add the symptom + fix to this document. + +## Environment setup + +Every command assumes: + +```bash +export KUBECONFIG=~/.kube/honeydue-k3s.yaml +cd /Users/treyt/Desktop/code/honeyDue/honeyDueAPI-go +``` + +If you see "Unable to connect to the server," the kubeconfig isn't set. + +## 1. Check cluster health + +```bash +kubectl get nodes # all 3 Ready? +kubectl get pods -A | grep -vE 'Running|Completed' # anything not running? +kubectl top nodes # resource usage +kubectl get events -A --sort-by=.lastTimestamp | tail -20 +``` + +## 2. 
Deploy new code + +### Full deploy (all three services) + +```bash +SHA=$(git rev-parse --short HEAD) + +# Login +set -a; source deploy/registry.env; set +a +printf '%s' "$REGISTRY_TOKEN" | \ + docker login "$REGISTRY" -u "$REGISTRY_USERNAME" --password-stdin + +# Build +docker buildx build --platform linux/amd64 --target api \ + -t "gitea.treytartt.com/admin/honeydue-api:${SHA}" --push . +docker buildx build --platform linux/amd64 --target worker \ + -t "gitea.treytartt.com/admin/honeydue-worker:${SHA}" --push . +docker buildx build --platform linux/amd64 --target admin \ + -t "gitea.treytartt.com/admin/honeydue-admin:${SHA}" --push . + +# Apply +for svc in api worker admin; do + kubectl set image deployment/$svc -n honeydue \ + "$svc=gitea.treytartt.com/admin/honeydue-${svc}:${SHA}" +done + +# Watch +for svc in api worker admin; do + kubectl rollout status -n honeydue deployment/$svc +done + +# Logout +docker logout gitea.treytartt.com +``` + +### Single service + +```bash +SHA=$(git rev-parse --short HEAD) +set -a; source deploy/registry.env; set +a +printf '%s' "$REGISTRY_TOKEN" | docker login "$REGISTRY" -u "$REGISTRY_USERNAME" --password-stdin +docker buildx build --platform linux/amd64 --target api \ + -t "gitea.treytartt.com/admin/honeydue-api:${SHA}" --push . +kubectl set image deployment/api -n honeydue \ + api="gitea.treytartt.com/admin/honeydue-api:${SHA}" +kubectl rollout status -n honeydue deployment/api +docker logout "$REGISTRY" +``` + +## 3. Rollback + +### Last good + +```bash +kubectl rollout undo deployment/api -n honeydue +kubectl rollout status -n honeydue deployment/api +``` + +### Specific SHA + +```bash +kubectl set image deployment/api -n honeydue \ + api="gitea.treytartt.com/admin/honeydue-api:" +``` + +## 4. 
Read logs + +```bash +# Follow all api pod logs +kubectl logs -n honeydue -l app.kubernetes.io/name=api -f --prefix + +# Errors only +kubectl logs -n honeydue -l app.kubernetes.io/name=api --tail=1000 | grep -i error + +# Previous pod (before crash/restart) +kubectl logs -n honeydue <pod-name> --previous +``` + +## 5. Exec into a pod + +```bash +kubectl exec -n honeydue -it deploy/api -- /bin/sh +# inside: +# wget -qO- http://127.0.0.1:8000/api/health/ +# env | grep DB_ +# exit +``` + +## 6. Rotate a secret + +```bash +# For honeydue-secrets keys +kubectl patch secret honeydue-secrets -n honeydue \ + --type=merge \ + -p "{\"data\":{\"SECRET_KEY\":\"$(printf '%s' 'new-value' | base64 | tr -d '\n')\"}}" + +# Update local file to match (keep in sync) +printf '%s' 'new-value' > deploy/secrets/secret_key.txt + +# Restart pods so they pick up the new secret +kubectl rollout restart -n honeydue deploy/api deploy/worker +``` + +## 7. Change a ConfigMap value + +```bash +# Edit deploy/prod.env locally +# Regenerate the configmap +kubectl create configmap honeydue-config -n honeydue \ + --from-env-file=deploy/prod.env \ + --dry-run=client -o yaml | kubectl apply -f - + +# Restart to pick up +kubectl rollout restart -n honeydue deploy/api deploy/admin deploy/worker +``` + +## 8. Scale a service + +```bash +kubectl scale deployment/api -n honeydue --replicas=5 +# Then wait (note: the api HPA may later adjust this replica count) +kubectl rollout status -n honeydue deployment/api +``` + +**DO NOT** scale worker above 1 until Asynq PeriodicTaskManager is wired. + +## 9. Drain a node for maintenance + +```bash +# Prevent new pods, evict existing +kubectl drain <node-hostname> --ignore-daemonsets --delete-emptydir-data + +# Do maintenance (apt upgrade, reboot, etc.) 
+ssh deploy@<ssh-alias> "sudo apt update && sudo apt upgrade -y && sudo reboot" + +# Wait for node to come back +watch kubectl get nodes + +# Allow scheduling again +kubectl uncordon <node-hostname> +``` + +Node hostnames (not SSH aliases!): +- `ubuntu-8gb-nbg1-1` (hetzner2) +- `ubuntu-8gb-nbg1-2` (hetzner1) +- `ubuntu-8gb-nbg1-3` (hetzner3) + +## 10. Add a new node + +```bash +# 1. Provision CX33 in Hetzner console +# 2. SSH in as root, create deploy user + key +# 3. Install k3s (the command below joins the cluster as a server) +NODE_TOKEN=$(ssh -i ~/.ssh/hetzner deploy@hetzner1 'sudo cat /var/lib/rancher/k3s/server/node-token') +ssh -i ~/.ssh/hetzner root@<new-node-ip> "curl -sfL https://get.k3s.io | K3S_TOKEN=\"$NODE_TOKEN\" INSTALL_K3S_EXEC=\"server --server=https://178.104.247.152:6443 --disable=servicelb --write-kubeconfig-mode=644\" sh -" + +# 4. Add UFW rules for inter-node traffic +# (see deploy-k3s/scripts/ for the script) + +# 5. Verify +kubectl get nodes +``` + +## 11. Remove a node + +```bash +# Drain first +kubectl drain <node-hostname> --ignore-daemonsets --delete-emptydir-data + +# Tell k3s to leave +ssh -i ~/.ssh/hetzner deploy@<ssh-alias> "sudo systemctl stop k3s && sudo /usr/local/bin/k3s-uninstall.sh" + +# Remove from cluster +kubectl delete node <node-hostname> +``` + +## 12. Force-restart all pods + +```bash +kubectl rollout restart -n honeydue deploy/api deploy/admin deploy/worker deploy/redis +``` + +Use sparingly. Causes brief downtime per pod. + +## 13. Migrate to a new Neon DB + +```bash +# 1. Create a new branch or project on Neon +# 2. Update prod.env with new DB_HOST +# 3. Apply new ConfigMap +kubectl create configmap honeydue-config -n honeydue \ + --from-env-file=deploy/prod.env \ + --dry-run=client -o yaml | kubectl apply -f - + +# 4. Rolling restart +kubectl rollout restart -n honeydue deploy/api deploy/worker +``` + +## 14. Rotate Gitea registry PAT + +```bash +# 1. Create new PAT in Gitea UI +# 2. Update deploy/registry.env locally +# 3. 
Update in-cluster Secret +kubectl create secret docker-registry gitea-credentials -n honeydue \ + --docker-server=gitea.treytartt.com \ + --docker-username=admin \ + --docker-password= \ + --dry-run=client -o yaml | kubectl apply -f - + +# 4. Delete old PAT from Gitea UI + +# 5. Pods don't re-auth with existing images (already pulled), but +# new pulls will use new PAT. Test by rolling a pod: +kubectl rollout restart -n honeydue deployment/api +``` + +## 15. Clean up old images in Gitea + +Manual, via Gitea UI: +https://gitea.treytartt.com/admin/-/packages + +Keep ~last 30 tags per image; delete older. + +Or via API: +```bash +GITEA_PAT="$(grep REGISTRY_TOKEN deploy/registry.env | cut -d= -f2)" +# List tags +curl -sS -H "Authorization: token $GITEA_PAT" \ + "https://gitea.treytartt.com/api/v1/packages/admin/container/honeydue-api/versions" | jq . +# Delete specific tag +curl -X DELETE -H "Authorization: token $GITEA_PAT" \ + "https://gitea.treytartt.com/api/v1/packages/admin/container/honeydue-api/" +``` + +## 16. Recreate the cluster from scratch + +See [Chapter 16 §Disaster recovery](./16-failure-modes.md#disaster-recovery). + +## 17. Connect to Neon directly + +```bash +# Get password +PW=$(cat deploy/secrets/postgres_password.txt) + +# Connect +PGPASSWORD="$PW" psql \ + -h ep-floral-truth-amttbc5a.c-5.us-east-1.aws.neon.tech \ + -U neondb_owner \ + -d honeyDue +``` + +## 18. 
Check admin user credentials + +```bash +# ADMIN_EMAIL is in the honeydue-secrets Secret +kubectl get secret honeydue-secrets -n honeydue \ + -o jsonpath='{.data.ADMIN_EMAIL}' | base64 -d + +# ADMIN_PASSWORD (ONLY VALID FOR FIRST DEPLOY; may have been changed in UI) +kubectl get secret honeydue-secrets -n honeydue \ + -o jsonpath='{.data.ADMIN_PASSWORD}' | base64 -d +``` + +If you need to reset admin password because nobody remembers it: + +```bash +# Generate a new bcrypt hash +NEW_PASSWORD='newpassword' +HASH=$(htpasswd -bnBC 10 "" "$NEW_PASSWORD" | tr -d ':\n') + +# Update directly in Postgres +PGPASSWORD="$(cat deploy/secrets/postgres_password.txt)" psql \ + -h ep-floral-truth-amttbc5a.c-5.us-east-1.aws.neon.tech \ + -U neondb_owner -d honeyDue \ + -c "UPDATE admin_users SET password='$HASH' WHERE email='admin@myhoneydue.com'" +``` + +## 19. Trigger a Helm chart re-run (Traefik etc.) + +If the Traefik HelmChartConfig was updated but chart didn't reconcile: + +```bash +kubectl delete job -n kube-system helm-install-traefik +# Helm operator re-runs automatically within ~30 seconds +kubectl get pods -n kube-system -l app.kubernetes.io/name=traefik -w +``` + +## 20. Smoke test after any change + +```bash +# Through Cloudflare +for url in "https://api.myhoneydue.com/api/health/" \ + "https://admin.myhoneydue.com/" \ + "https://myhoneydue.com/"; do + ok=0 + for i in $(seq 1 20); do + [[ "$(curl -sS -o /dev/null -w '%{http_code}' --max-time 10 "$url")" == "200" ]] && ok=$((ok+1)) + done + printf "%-45s %d/20 ok\n" "$url" "$ok" +done +``` + +Expect 20/20 on all three. + +## 21. 
Kill everything (emergency rollback) + +If the cluster is so broken you need to reset the app layer: + +```bash +# Scale everything to 0 +kubectl scale -n honeydue deploy/api deploy/admin deploy/worker deploy/redis --replicas=0 + +# When ready, scale back up +kubectl scale -n honeydue deploy/api --replicas=3 +kubectl scale -n honeydue deploy/admin deploy/worker deploy/redis --replicas=1 +``` + +During the scale-down, CF returns errors to users because no pod is +serving. Scaling back up takes ~5 min. + +## 22. Find which pod a user's request hit + +Not directly supported (we don't log node/pod name in requests). When +we add request logging that includes these, a grep through logs works. + +Workaround: in each pod's logs, search for a unique user identifier: + +```bash +stern -n honeydue api | grep -E '"user_id": ?12345[,}]' +``` + +## References + +- [kubectl cheat sheet][kubectl-cs] +- [K3s docs][k3s-docs] +- [Neon connect][neon-connect] + +[kubectl-cs]: https://kubernetes.io/docs/reference/kubectl/cheatsheet/ +[k3s-docs]: https://docs.k3s.io/ +[neon-connect]: https://neon.com/docs/connect/connect-from-any-app diff --git a/docs/deployment/18-cost.md b/docs/deployment/18-cost.md new file mode 100644 index 0000000..7015ddf --- /dev/null +++ b/docs/deployment/18-cost.md @@ -0,0 +1,243 @@ +# 18 — Cost + +## Summary + +Current monthly infrastructure cost is ~$30-40. External SaaS (Fastmail, +Apple Developer, Google Play) adds ~$8-17/mo depending on push-enable +status. This chapter itemizes every line, projects costs at scale +(10k, 100k, 1M users), and shows what dials to turn when we need to +save or spend. 
+ +## Current monthly cost + +### Compute (Hetzner) + +| Item | Unit cost | Count | Monthly | +|---|---:|---|---:| +| CX33 (4 vCPU, 8 GB RAM, 80 GB SSD) | $7.99 | 3 | **$23.97** | +| Traffic | $0 (20 TB/mo included per node, well below) | — | $0 | +| Hetzner Cloud Firewall | $0 | — | $0 | +| IPv4 public address | $0 (included) | 3 | $0 | +| **Subtotal** | | | **$23.97** | + +### Database (Neon) + +Neon Launch plan: $0.106/CU-hour + $0.35/GB-month storage, $5 minimum. + +At current usage (low traffic, small schema): +- ~10 CU-hours/month × $0.106 ≈ $1 +- ~1 GB storage × $0.35 ≈ $0.35 +- Hits the $5 minimum + +| Item | Monthly | +|---|---:| +| Neon Launch ($5 min + usage) | **~$5** | + +### Object storage (Backblaze B2) + +At current usage (~50 GB stored): + +| Item | Monthly | +|---|---:| +| Storage ($0.006/GB × 50 GB) | $0.30 | +| Egress (effectively $0 — mostly served through CF) | $0 | +| **Subtotal** | **~$0.30** | + +### Edge (Cloudflare) + +| Item | Monthly | +|---|---:| +| Cloudflare Free plan (DNS, TLS, CDN, basic DDoS) | **$0** | + +### Registry (Gitea) + +Self-hosted on the operator's existing Gitea VPS. Not charged to +honeyDue. + +| Item | Monthly | +|---|---:| +| Gitea container registry | **$0** | + +### Total infrastructure + +| Category | Monthly | +|---|---:| +| Compute | $23.97 | +| Database | ~$5 | +| Storage | ~$0.30 | +| Edge | $0 | +| Registry | $0 | +| **Total** | **~$30** | + +## External SaaS + +Things not part of the deploy but required for the product: + +| Item | Cost | Notes | +|---|---:|---| +| Fastmail (SMTP for transactional email) | Part of operator's existing plan | — | +| Apple Developer Program | $99/year = $8.25/mo | Required for iOS app + APNs | +| Google Play Developer | $25 one-time + $0/mo ongoing | — | +| Hetzner Cloud Firewall | $0 | Free; we use UFW instead | + +At push-enabled state, total monthly run rate is **~$38-42**. + +## Hidden / untracked costs + +- **Operator time**: The biggest cost for a bootstrapped project. 
+ Treating ops time at $100/hr, a 4-hour incident = $400. +- **Electricity for operator workstation during builds**: trivial. +- **Domain registration (myhoneydue.com)**: ~$12/year = $1/mo. + +## Cost drivers + +### 1. Compute (scales with traffic) + +If api gets >70% CPU utilization, HPA will scale from 3 to 6 replicas. +Memory at 3 replicas × 512Mi limit = 1.5 GB; nodes have 8 GB each. +Plenty of room before needing more nodes. + +Tipping points: +- >6 api replicas needed on a sustained basis = bigger CX43 (8 vCPU, 16 GB, + ~$16/mo each) or more CX33s +- Heavy worker throughput = need Asynq PeriodicTaskManager (code + change, not infra) + +### 2. Database (scales with query volume + data) + +Neon Launch: pay per CU-hour of compute. If idle time ≫ active time, +we stay near $5 min. If the app is busy, CU-hours grow. + +Tipping points: +- Consistently >$30/mo at Launch → evaluate Neon Scale plan +- DB storage >50 GB → $15+/mo just for storage +- Active query load → consider read replicas (paid feature) + +### 3. Storage (scales with user uploads) + +B2 at $0.006/GB is cheap. 1 TB = $6/mo. + +Tipping points: +- >5 TB stored = consider R2 (free egress) if egress becomes a factor +- Very high egress = evaluate moving B2 behind CF Workers + +### 4. Edge + +Cloudflare Free is generous. We move to Pro ($20/mo) if: +- We need custom WAF rules beyond 5 +- We need Image Resizing for user uploads +- We need custom Page Rules beyond 3 + +## Projections + +### 10,000 daily active users + +Assume 50 API requests per user per day = 500k req/day = ~6 req/s avg. +Peaks maybe 3-5× = ~25 req/s. + +Bottleneck: probably Neon CU-hours (billed per use on the Launch plan). At 25 req/s with DB calls, +we'd burn through CU-hours fast. Neon bill: $15-30/mo. + +Compute: 3 CX33s still handle this comfortably. 
+ +| Category | Projected monthly | +|---|---:| +| Compute | $24 | +| Neon | ~$20 | +| Storage | ~$2 | +| Cloudflare | $0 | +| **Total** | **~$46** | + +### 100,000 daily active users + +Same 50 req/user/day = 5M req/day = ~58 req/s avg; peaks maybe 3-5× = ~250 req/s = multi-node api scaling. HPA kicks in. + +| Category | Projected monthly | +|---|---:| +| Compute (3x CX33) | $24 | +| Plus Hetzner LB | $8.49 | +| Neon Scale (pay-as-you-go, higher baseline) | $40-60 | +| B2 (200 GB stored, some egress) | $2 | +| Cloudflare Pro | $20 | +| **Total** | **~$95-115** | + +At this scale, operator time becomes the bigger cost. Adding paid +monitoring (Betterstack ~$15/mo) and uptime (Betterstack Uptime $5/mo) +becomes reasonable. + +### 1,000,000 daily active users + +Bigger question. We'd be re-evaluating: +- More Hetzner nodes or bigger instances +- Neon at scale vs. self-hosted Postgres +- Maybe Cloudflare Workers to offload traffic + +Ballpark: $300-500/mo. At this scale, the company has revenue to +justify an ops hire, and this chapter's assumptions break down. + +## Dials to save money + +### Immediate (reduce $) + +| Lever | Savings | Trade-off | +|---|---|---| +| Switch 3 CX33 → 3 Netcup VPS1000G11 | ~$4/mo | Less polished provider, slightly worse UX | +| Disable Neon Launch, use Supabase free tier | ~$5/mo | Supabase free tier limits | +| 2 nodes instead of 3 | ~$8/mo | Lose HA, two-node Raft is worse than one | +| 1 CX23 (2 vCPU, 4 GB) for admin + worker; 2 CX33 for api | ~$5/mo | Complexity; node roles | + +None of these are compelling. Current cost is in the "don't optimize" +zone. 
+ +### Dials to spend when it becomes worth it + +| Spend | Return | +|---|---| +| Upgrade Neon to Scale ($20+) | More CU-hours, connection count room | +| Add Hetzner LB ($8.49) | Real active health checks, sub-second failover | +| Add monitoring (Betterstack $15) | Proactive detection of issues | +| Add uptime monitoring ($5) | Alerts when site is down | +| CF Pro ($20) | Better WAF, Image Resizing | +| CF Load Balancing ($5) | Multi-region failover, active checks on origins | + +Cumulatively **~$70/mo** takes us to a fully-monitored, fully-alerted, +multi-region-failing-over setup. At 100k users, worth it. + +## Historical spend + +**April 2026 MTD**: ~$35 (Hetzner + Neon prorated). + +**April 2026 (projected)**: $30-40. + +**March 2026**: Pre-launch; no user traffic yet. Just node rentals. +~$25. + +## Hetzner April 2026 price adjustment + +CX33 went from ~$6.59 → $7.99/mo on 2026-04-01. Our monthly compute +cost rose by $4.20 overnight. This is on our budget radar but isn't a +forcing function to switch providers. + +If Hetzner keeps raising prices (which they've historically resisted; +the 2026 adjustment was their first in several years), reconsider. + +## Budget alerts + +- **B2**: hard-capped via B2 console at $20/mo. If we breach, something + is wrong and B2 rejects further writes. +- **Neon**: soft limits via Neon alerts. Set threshold at $20 to get + email when approaching. +- **Hetzner**: no variable cost at our scale, no alerts needed. +- **Cloudflare**: Free plan has hard quotas; no surprise bills possible. 
+ +## References + +- [Hetzner Cloud pricing][hetzner-cloud] +- [Neon pricing][neon-pricing] +- [Backblaze B2 pricing][b2-pricing] +- [Cloudflare Free plan][cf-free] + +[hetzner-cloud]: https://www.hetzner.com/cloud/ +[neon-pricing]: https://neon.com/pricing +[b2-pricing]: https://www.backblaze.com/cloud-storage/pricing +[cf-free]: https://www.cloudflare.com/plans/free/ diff --git a/docs/deployment/19-postmortem-swarm.md b/docs/deployment/19-postmortem-swarm.md new file mode 100644 index 0000000..5ac12b4 --- /dev/null +++ b/docs/deployment/19-postmortem-swarm.md @@ -0,0 +1,480 @@ +# 19 — Postmortem: The Swarm Era + +## Summary + +honeyDue launched on Docker Swarm on 2026-04-23. Over the course of a +single afternoon we hit **thirteen distinct bugs** before declaring +Swarm unfit and migrating to k3s. This chapter is the forensic record: +the symptom of each bug, the root cause, the specific fix, and citations +where relevant. It's preserved because these lessons are expensive and +future-us should not pay them again. + +**TL;DR**: Twelve of the thirteen bugs were recoverable. The thirteenth +was a Docker libnetwork ghost-DNS defect ([moby/moby#52265][moby-52265]) +that is fundamentally incompatible with single-replica services. No +amount of clever config fixed it; we had to change orchestrators. + +## Timeline + +**~18:00** — Infrastructure stood up. Docker Swarm initialized. First +build + push to Gitea. + +**~19:30** — First deploy runs. Immediate failures. + +**~22:00** — api + admin returning 200 through Cloudflare. Flaky but +working. + +**~23:00** — Admin flapping 50%+ through Cloudflare. Ghost DNS record +identified. Workarounds begin. + +**~00:30 (next day)** — Ghost DNS survives every non-nuclear +intervention. Research confirms it's a known libnetwork bug. Decision +to migrate to k3s. + +**~04:30** — k3s cluster up, all services healthy, 150/150 requests +green. Postmortem begins. + +The session ran ~10 hours. The migration itself took ~1 hour. 
+ +## The thirteen bugs + +### 1 — Deploy script array expansion under `set -u` + +**File**: `deploy/scripts/deploy_prod.sh` + +**Symptom**: +``` +./deploy/scripts/deploy_prod.sh: line 339: api_extra[@]: unbound variable +``` + +**Root cause**: Bash arrays expanded with `"${arr[@]}"` under `set -u` +fail when the array is empty. Our deploy script initialized empty +arrays conditionally but expanded them unconditionally. + +**Fix**: Use the `${arr[@]+"${arr[@]}"}` safe-expansion idiom, or +restructure to avoid passing empty arrays: + +```bash +build_and_push api "${API_IMAGE}" ${api_extra[@]+"${api_extra[@]}"} +``` + +Inside the function, same treatment — use `shift` instead of array +slicing. + +**Moral**: `set -u` with bash arrays is a known pitfall. The +`"${arr[@]}"` expansion isn't safe under strict mode if arrays can be +empty. + +### 2 — Dockerfile Go version mismatch + +**File**: `Dockerfile` + +**Symptom**: +``` +go: go.mod requires go >= 1.25 (running go 1.24.13; GOTOOLCHAIN=local) +ERROR: failed to build: failed to solve: process "/bin/sh -c go mod download" did not complete successfully: exit code: 1 +``` + +**Root cause**: `go.mod` specifies `go 1.25`, but the Dockerfile's +builder stage used `golang:1.24-alpine`. + +**Fix**: Bumped to `golang:1.25-alpine`. One-character change. + +**Moral**: Keep the Dockerfile base image in sync with `go.mod`'s +go directive. CI would catch this; we had none. + +### 3 — dev machine arm64 vs node amd64 + +**Symptom**: Would have been `exec format error` on the nodes if we'd +deployed without fixing. Caught at build config stage. + +**Root cause**: Operator on Apple Silicon (arm64). Hetzner nodes are +amd64. Plain `docker build` produces arm64 images. + +**Fix**: Switched deploy script to use `docker buildx build --platform +linux/amd64 --push`. This cross-compiles the Go stages (they honor +`TARGETARCH`) and uses QEMU emulation for the Node stages. 
+ +**Moral**: Cross-platform builds are routine for Apple Silicon +developers. Document it up front, bake it into the deploy script. + +### 4 — Swarm stack `host_ip` rejected + +**File**: `deploy/swarm-stack.prod.yml` (dozzle service) + +**Symptom**: +``` +services.dozzle.ports.0 Additional property host_ip is not allowed +``` + +**Root cause**: Docker Compose v3.8 schema allows `host_ip` in long-form +port spec. Swarm's `docker stack deploy` parser doesn't. + +**Fix**: Use the short form: +```yaml +ports: + - "127.0.0.1:${DOZZLE_PORT}:8080" +``` + +But then: Swarm's ingress mesh mode silently ignores the `127.0.0.1` +binding and listens on `0.0.0.0` anyway. Only way to get true +loopback-only binding is `mode: host`, which changes port-publishing +semantics. + +**Moral**: Compose-file compatibility between plain Docker and Swarm +is imperfect. Check the [Swarm-specific compose reference][swarm-compose] +when in doubt. + +### 5 — Stack file secret references + +**Symptom**: +``` +service worker: undefined secret "honeydue_postgres_password_237c6b8-20260423195810" +``` + +**Root cause**: The original stack file template used +`source: ${POSTGRES_PASSWORD_SECRET}` (which expanded to the versioned +secret name like `honeydue_postgres_password_<deploy-id>`) under each service's +`secrets:` list. + +Swarm expects `source:` to match the **alias** in the top-level +`secrets:` block (`postgres_password`), not the actual secret `name:`. + +**Fix**: Changed every `source:` to the alias form: + +```yaml +# Was: +- source: ${POSTGRES_PASSWORD_SECRET} + target: postgres_password + +# Now: +- source: postgres_password + target: postgres_password +``` + +**Moral**: The original template was clever but subtly wrong. It had +never successfully deployed — the earlier Dokku setup used a different +secret model. Bugs in never-exercised template code only surface the first time you actually run it.
+ +### 6 — API pod crash: `sync.Once` double-unlock + +**File**: `internal/services/cache_service.go:54` + +**Symptom**: api pods completed migrations, started HTTP server, then +fataled with: +``` +fatal error: sync: unlock of unlocked mutex +goroutine 1 [running]: +internal/sync.fatal(...) +sync.(*Once).doSlow(...) +github.com/treytartt/honeydue-api/internal/services.NewCacheService + /app/internal/services/cache_service.go:31 +``` + +**Root cause**: Inside a `sync.Once.Do(func() { ... })` callback, the +code did: + +```go +cacheOnce.Do(func() { + // ... + if err := client.Ping(ctx).Err(); err != nil { + initErr = fmt.Errorf(...) + cacheOnce = sync.Once{} // ← THIS LINE + return + } +}) +``` + +The intent: "if Redis ping fails, reset the Once so a retry can happen." +The reality: the Once's internal mutex is held while `Do` is running the +callback. Reassigning `cacheOnce = sync.Once{}` overwrites it with a NEW zero- +valued Once, zeroing the mutex that `Do` still holds. When `Do` tries to release the +mutex afterward, the mutex is the new-zero-valued one — which isn't +locked. The runtime aborts with a fatal error, which (unlike a panic) cannot be caught with `recover()`. + +**Fix**: Removed the reset. `main.go` already handles the error +gracefully (`cache = nil`, continues without caching). Retries happen +via pod restart, not in-process. + +```go +if err := client.Ping(ctx).Err(); err != nil { + initErr = fmt.Errorf(...) + // Don't reassign cacheOnce here — mutating it from inside Do() + // is a fatal error. Let main.go handle the error. + return +} +``` + +**Moral**: `sync.Once` is simpler than it looks. Never reassign an +active sync primitive from within its own callback. + +### 7 — Worker replicas set to 2 with a singleton Asynq scheduler + +**Symptom**: We noticed `WORKER_REPLICAS=2` in `cluster.env` despite +the Asynq scheduler being a singleton. + +**Root cause**: Asynq's `Scheduler` is not leader-elected by default. +Running >1 replica causes duplicate cron firings — duplicate daily +digests, double-welcome emails. + +**Fix**: `WORKER_REPLICAS=1`.
Added a comment in `cluster.env.example` +explaining why. + +**Moral**: Defaults can be dangerous. Even when a default seems +reasonable ("2 replicas for HA"), check against the app's semantics. + +### 8 — `PUSH_LATEST_TAG=true` for prod + +**Symptom**: During a test, we saw `honeydue-api:latest` updating, +which would make rollbacks harder. + +**Root cause**: The cluster.env had `PUSH_LATEST_TAG=true` when the +design intent was SHA-pinned deploys only. + +**Fix**: `PUSH_LATEST_TAG=false`. SHA tags only. + +**Moral**: Tag-mutable images make rollbacks non-deterministic. +Prefer immutable SHA tags. + +### 9 — Neon DB name case sensitivity + +**Symptom**: +``` +server error: ERROR: database "honeydue" does not exist (SQLSTATE 3D000) +``` + +**Root cause**: Neon's UI created the database as `"honeyDue"` (quoted, +camelCase). Postgres treats quoted identifiers case-sensitively at +create time. Our `prod.env` had `POSTGRES_DB=honeydue` (lowercase). + +**Fix**: `POSTGRES_DB=honeyDue`. + +**Moral**: Respect Postgres's identifier quoting rules. If something +was created with quotes, refer to it with exact case. + +### 10 — Admin DNS ghost A-record (the big one) + +**Symptom**: Through Cloudflare, `admin.myhoneydue.com` returned 502 on +~50% of requests. The other 50% succeeded. The pattern was stable over +hours. + +**Investigation**: + +The admin service had 1 replica, alive on one of three Swarm nodes. +Caddy (reverse proxy at the time) resolved `admin` via Swarm's +embedded DNS at `127.0.0.11`. `nslookup admin` returned: + +``` +Name: admin Address: 10.0.1.36 (current task IP) +Name: admin Address: 10.0.1.17 (GHOST — what is this?) +``` + +Two A records for one-replica service, both returned randomly. + +`10.0.1.17` was checked: that IP now belonged to the **dozzle** +container on hetzner3. Nothing listens on dozzle's 3000 port → +connection refused → 502. + +The old admin task had run on hetzner3 with IP 10.0.1.17. 
When it +migrated to hetzner1 with IP 10.0.1.36, libnetwork's DNS registration +for admin was supposed to update. On hetzner2 and hetzner3, the old +10.0.1.17 record never got removed. + +**Things tried, none worked**: + +| Attempt | Result | +|---|---| +| `endpoint_mode: dnsrr` on admin | DNS still returns both IPs | +| Kill + restart Caddy container | DNS still returns both IPs | +| Scale admin to 0 and back to 1 | Ghost 10.0.1.17 still in DNS with 0 replicas | +| `docker service rm honeydue_admin` | Ghost 10.0.1.17 still in DNS (orphaned) | +| Change admin to `mode: global` | Different IPs but ghost remains | +| `mode: host` on admin ports + `extra_hosts: host.docker.internal:host-gateway` | `host.docker.internal` resolved to docker0 (172.17.0.1), not reachable from overlay | +| Hardcoded 3 node IPs in Caddy + UFW port 3000 node-to-node | ~90% reliable, NAT hairpin issues when Caddy dials its own node | + +**Root cause**: [moby/moby#52265][moby-52265] — Docker libnetwork's +overlay network state store doesn't reliably deregister service +endpoints when tasks migrate between nodes. Known bug in the 29.x +line. Partial fixes in #50236 (29.0) were incomplete; 29.3 still +leaks; #52289 is the pending follow-up. + +**Why it hurts single-replica services most**: With 3 replicas, +Caddy's DNS query returns 4 IPs (3 real + 1 ghost). Round-robin +succeeds 75% of the time. With 1 replica, 1 real + 1 ghost = 50% +failure. More replicas dilute the ghost — the bug is masked, not gone. + +**Final fix**: None at the libnetwork level. The ghost survives every +non-cluster-recreating operation. The only clean purge is +`docker stack rm` + `docker network rm` + full redeploy. Even then, +the bug recurs on the next task migration. + +**Decision**: Migrate to k3s. CoreDNS has none of libnetwork's state- +store semantics and the bug class doesn't exist. 4 hours of fighting +Swarm → 1-hour k3s migration that just worked.
+ +**Citations**: +- [moby/moby#52265 — Overlay ARP stale entries on 29.3.0][moby-52265] +- [moby/moby#51491 — DNS broken after swarm init][moby-51491] +- [Dokploy#3480 — Traefik stale VIP on Swarm][dokploy-3480] + +### 11 — IPSec ESP + UDP 500 blocked + +**Symptom**: Earlier in the Swarm setup, api 3/3 was working but +cross-node overlay traffic was intermittently failing. This turned out +to be a separate bug masking #10 earlier in the session. + +**Root cause**: We had encrypted overlay enabled +(`driver_opts: encrypted: "true"`). Swarm's encrypted mode uses IPSec +ESP (IP protocol 50) + UDP 500 (IKE). Our UFW only allowed UDP 4789 +(VXLAN) and 7946 (gossip). ESP was blocked by default-deny. Encrypted +packets dropped silently on some flows. + +**Fix**: Added UFW rules for each peer node IP: +```bash +sudo ufw allow from <peer-node-ip> to any proto esp +sudo ufw allow from <peer-node-ip> to any port 500 proto udp +``` + +Once applied, cross-node overlay data path became stable. + +**Moral**: Encrypted Swarm overlay requires more than VXLAN to be open: +ESP (protocol 50) and UDP 500 (IKE) must also be allowed for IPSec. Official Docker docs +mention this but it's easy to miss. + +### 12 — Admin startupProbe path + +**Symptom**: Admin pod kept restarting with startup probe failures. +Kubelet reported: +``` +Startup probe failed: HTTP probe failed with statuscode: 404 +``` + +**Root cause**: The k3s scaffold's `admin/deployment.yaml` had: +```yaml +startupProbe: + httpGet: + path: /admin/ + port: 3000 +``` + +But our admin Next.js app serves at `/`, not `/admin/`. Requests to +`/admin/` return 404. K8s considered the pod unhealthy and restart- +looped. + +**Fix**: Change probe path to `/`. Also bumped `failureThreshold` from +12 to 24 (120s grace) for Next.js's slower-than-expected cold boot +when the node's already busy.
+ +### 13 — MigrateWithLock startup probe grace + +**Symptom**: API pods were getting killed by k8s during migration. +First replica was OK (fast migration); replicas 2 and 3 waited on +the advisory lock too long and healthchecks tripped. + +**Root cause**: Go app's `MigrateWithLock()` uses +`pg_advisory_lock()` to serialize migrations across replicas. First +replica does real AutoMigrate (~90s cold); subsequent replicas wait +on the lock, then run no-op migrations. Total time for 3rd replica +can be 3+ minutes. + +K3s scaffold's `api/deployment.yaml` had: +```yaml +startupProbe: + failureThreshold: 12 + periodSeconds: 5 +``` + += 60s grace. Not enough. + +**Fix**: Bumped `failureThreshold` to 48 (= 240s grace). Comment in +the manifest explains why. This is *not* a band-aid — the real startup +time genuinely is 90-240s depending on lock queue position. The probe +should reflect reality, not be optimistic. + +**Moral**: Healthchecks should be realistic, not aspirational. Know +what your app actually does at startup. + +## What we learned + +### Docker Swarm is in a bad place in 2026 + +Not dead — Mirantis supports it through 2030 — but **nobody is +modernizing libnetwork**. When you hit a DNS or networking bug, you're +on your own. The fix churn on #52265 (incomplete 29.0 fix → 29.3 +regression → pending #52289) is a tell: the code has no champion. + +For new deployments, **don't pick Swarm** unless you're doing something +Swarm-shaped (tiny, single-replica, no inter-service traffic). K3s is +a strictly better choice for anything approximating what we're doing. + +### Investigate before you work around + +We spent a lot of time on clever workarounds for bug #10 (host-mode +ports, host.docker.internal, hardcoded node IPs, UFW routing) before +doing the 20-minute research task that revealed the bug was a known +libnetwork defect. If we'd searched "Swarm DNS stale record 2026" first, +we'd have saved ~3 hours. 
+ +### Scaffolds are starting points, not finishing points + +The k3s scaffold in `deploy-k3s/` was excellent — production-grade +RBAC, PDBs, security contexts, network policies, Traefik middleware. +But its image references (GHCR), TLS assumptions (CF Full strict), and +probe paths (admin's `/admin/`) didn't match our actual setup. Every +scaffold needs a read-through against your environment before you +`kubectl apply -f`. + +### Keep the old config until the new config is proven + +We kept `deploy/` (Swarm) intact during the k3s migration. That meant +if k3s failed, we could `git stash` the k3s work and do a fast Swarm +redeploy. `deploy/` is still in the repo; we'll delete it only after k3s has +run cleanly for 30 days (see Chapter 20). + +## Files affected by tonight's work + +All in `honeyDueAPI-go`: + +- `Dockerfile` — Go 1.24 → 1.25 (bug #2) +- `deploy/scripts/deploy_prod.sh` — buildx refactor, array expansion fixes (bugs #1, #3) +- `deploy/swarm-stack.prod.yml` — dozzle host_ip, secret source references, multiple iterations trying to fix #10 (bugs #4, #5, #10) +- `deploy/prod.env` — admin seed env vars, `POSTGRES_DB` case, B2 values, push-disabled placeholders (bug #9) +- `deploy/cluster.env` — WORKER_REPLICAS 2 → 1, PUSH_LATEST_TAG (bugs #7, #8) +- `deploy/Caddyfile` — multiple iterations (ultimately superseded by Traefik when we moved to k3s) +- `internal/services/cache_service.go` — removed sync.Once reset (bug #6) +- `internal/database/database.go` — (no change, MigrateWithLock semantics investigated) +- `deploy-k3s/manifests/api/deployment.yaml` — startupProbe grace (bug #13) +- `deploy-k3s/manifests/admin/deployment.yaml` — probe path (bug #12) +- `deploy-k3s/manifests/worker/deployment.yaml` — replicas 2 → 1 (bug #7) +- `deploy-k3s/manifests/pod-disruption-budgets.yaml` — worker minAvailable 1 → 0 +- `deploy-k3s/manifests/traefik-helmchartconfig.yaml` — NEW (DaemonSet + hostNetwork for Traefik) +- `deploy-k3s/manifests/ingress/ingress-simple.yaml` — NEW (simple host routing, no TLS) +- `deploy-k3s/MIGRATION_NOTES.md` —
NEW + +## What was thrown away + +- Swarm stack definitions (still in `deploy/`, planned for removal) +- Caddy Caddyfile (k3s uses Traefik instead) +- Several hours of work on Caddy `dynamic a` upstream refresh, host- + mode ports, and NAT-hairpin workarounds for bug #10 — all moot + once we migrated + +## References + +- [moby/moby#52265 — Overlay ARP stale entries][moby-52265] +- [moby/moby#51491 — DNS broken after swarm init][moby-51491] +- [Dokploy#3480 — Traefik stale VIP][dokploy-3480] +- [Mirantis Swarm LTS commitment][mirantis-swarm] +- [Kubernetes probe best practices][k8s-probes] +- [Asynq scheduler limitations][asynq-sched] + +[moby-52265]: https://github.com/moby/moby/issues/52265 +[moby-51491]: https://github.com/moby/moby/issues/51491 +[dokploy-3480]: https://github.com/Dokploy/dokploy/issues/3480 +[mirantis-swarm]: https://www.mirantis.com/blog/mirantis-guarantees-long-term-support-for-swarm/ +[k8s-probes]: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/ +[asynq-sched]: https://github.com/hibiken/asynq/wiki/Periodic-Tasks +[swarm-compose]: https://docs.docker.com/reference/compose-file/legacy-versions/ diff --git a/docs/deployment/20-roadmap.md b/docs/deployment/20-roadmap.md new file mode 100644 index 0000000..099aab3 --- /dev/null +++ b/docs/deployment/20-roadmap.md @@ -0,0 +1,318 @@ +# 20 — Roadmap + +## Summary + +A consolidated list of known gaps, improvements, and scaling triggers. +Items are grouped by category and roughly ordered by priority. This is +the "if we had more time" list referenced throughout the book. + +## High priority (do soon) + +### Uptime monitoring + +**Why**: Right now we find out the site is down when users complain. + +**How**: Set up Uptime Kuma (self-hosted) or Better Stack Uptime +(free tier) to ping `https://api.myhoneydue.com/api/health/` every +minute, with Slack/email alerts on failure. 
+ +**Effort**: ~30 min for Uptime Kuma deploy, ~10 min for Better Stack +signup. + +### Cloudflare origin IP restriction + +**Why**: UFW allows :80 from anywhere. If node IPs leak, direct-connect +attackers bypass CF's WAF/DDoS protection. + +**How**: Replace the anywhere-80 UFW rule with 15 IPv4 + 7 IPv6 CF +ranges. See [Chapter 13 §CF IP ranges](./13-cloudflare.md#cloudflare-ip-ranges-used-in-traefik-trustedips). + +Automation: a small script that refreshes the CF IP list monthly and +re-applies UFW rules. + +**Effort**: 1 hour. + +### Enable network policies in k3s + +**Why**: Currently pods can freely egress anywhere. A compromised pod +could exfiltrate data or attack lateral services. + +**How**: `kubectl apply -f deploy-k3s/manifests/network-policies.yaml`. +The scaffold defines default-deny + explicit allows for: +- DNS egress for all pods +- Traefik → api (port 8000) +- Traefik → admin (port 3000) +- api/worker → Redis +- api/worker → external services (Postgres, B2, Fastmail) + +Then test that nothing breaks (might need to adjust allow rules). + +**Effort**: 1-2 hours including testing. + +### Apply Traefik security middleware + +**Why**: Our current Ingress has no rate limiting or security headers +beyond what Traefik adds by default. + +**How**: Apply `deploy-k3s/manifests/ingress/middleware.yaml`, annotate +Ingresses to use them: + +```yaml +metadata: + annotations: + traefik.ingress.kubernetes.io/router.middlewares: honeydue-security-headers@kubernetescrd,honeydue-rate-limit@kubernetescrd +``` + +**Effort**: 15 min. + +## Medium priority + +### Upgrade to CF Full (strict) SSL + +**Why**: Currently CF↔origin is plain HTTP. An attacker between CF and +Hetzner could read traffic. Full (strict) mode encrypts this leg with +a CF-issued origin cert. + +**How**: +1. Generate Origin CA cert in CF dashboard → SSL/TLS → Origin Server +2. Create `cloudflare-origin-cert` Secret in k8s +3. Add `tls:` block to Ingresses +4. 
Switch CF SSL mode to Full (strict) + +**Effort**: 30 min. + +**Citations**: [Cloudflare Origin CA docs](https://developers.cloudflare.com/ssl/origin-configuration/origin-ca/) + +### Migration Job for schema changes + +**Why**: Currently every api pod runs `MigrateWithLock()` on startup, +serializing on a Postgres advisory lock. Adds 90-240s to cold startup +and caused bug #13 in Chapter 19. + +**How**: Create a Kubernetes `Job` resource that runs the api image +with a `--migrate-only` flag. Job runs once per deploy, completes when +schema is current. api pods get an initContainer that waits for the +Job to complete. + +Requires Go code change to support `--migrate-only` flag. + +**Effort**: 3-4 hours (code + job manifest + testing). + +### Redis password + +**Why**: Redis runs in the cluster with no auth. Any compromised pod +could read cache or queue state. + +**How**: Set `REDIS_PASSWORD` in `honeydue-secrets`, update api/worker +env, update Redis command to include `--requirepass`. Already partially +wired up in the manifests. + +**Effort**: 20 min. + +### Image signing with cosign + +**Why**: No guarantee that an image pulled from Gitea is the one we +built. Gitea compromise = arbitrary code execution in cluster. + +**How**: +1. Install cosign on build machine +2. Sign images as part of deploy: `cosign sign gitea.treytartt.com/admin/honeydue-api:<git-sha>` +3. Deploy Kyverno (or Connaisseur) to cluster +4. Apply cluster policy requiring all images have valid cosign signatures + +**Effort**: 4-6 hours. + +### etcd encryption at rest + +**Why**: Kubernetes Secrets are stored in etcd unencrypted by default. +Node disk compromise = plaintext secrets. + +**How**: K3s supports `--secrets-encryption` flag at server install. +Need to recreate cluster or re-install k3s server on each node. + +**Effort**: 1 hour. + +### Automated unattended-upgrades + +**Why**: Currently OS patches require manual `apt upgrade`. Security +patches can be delayed.
+ +**How**: +```bash +sudo apt install unattended-upgrades +# Configure /etc/apt/apt.conf.d/50unattended-upgrades for security-only +sudo dpkg-reconfigure -plow unattended-upgrades +``` + +**Effort**: 30 min per node. + +### fail2ban + +**Why**: SSH is open to the world. No rate limiting on failed attempts. +Bot noise is constant. + +**How**: `sudo apt install fail2ban; sudo systemctl enable --now fail2ban`. +Default config bans IPs after 5 failed attempts for 10 min. + +**Effort**: 15 min per node. + +### Move SSH off port 22 + +**Why**: Port 22 attracts constant scanner noise. Moving to a +non-default port cuts >90% of attempts. + +**How**: +1. Edit `/etc/ssh/sshd_config` on each node: `Port 2222` +2. UFW rule: `sudo ufw allow 2222/tcp` +3. Update `~/.ssh/config` on operator: `Port 2222` +4. Restart sshd: `sudo systemctl restart ssh` +5. Remove UFW rule for port 22 after verifying + +**Effort**: 30 min (and pray). + +## Lower priority + +### Prometheus + Grafana + +**Why**: Historical metrics, dashboards, alerting. + +**How**: `kube-prometheus-stack` Helm chart. Adds ~500 MB RAM across +cluster. + +**Effort**: 4-6 hours including dashboard setup. + +### Loki log aggregation + +**Why**: Cross-pod log queries, longer retention. + +**How**: `grafana/loki` + `promtail` DaemonSet. Integrates with existing +Grafana. + +**Effort**: 2-3 hours. + +### OpenTelemetry tracing + +**Why**: Request-level profiling. Show which hop dominates p99 latency. + +**How**: Add OpenTelemetry SDK to Go app; export to Jaeger/Tempo. + +**Effort**: 8-12 hours including tuning. + +### Hetzner private network + +**Why**: Currently all inter-node traffic (including Flannel overlay) +goes over public network. Private network = less attack surface, no +bandwidth costs (if metered in future). + +**How**: Attach Hetzner vswitch to the 3 nodes, reconfigure Flannel to +advertise private IPs, update UFW rules to allow from private IP range +instead of specific public IPs. 
+ +**Effort**: 2-3 hours including testing Flannel reconfig. + +### Move secrets to Vault + +**Why**: Kubernetes Secrets are base64-encoded etcd values. Vault is +purpose-built for secret management with audit logs, dynamic secrets, +rotation policies. + +**How**: Deploy Vault in the cluster (or external), migrate secret +values, use Vault Agent Injector or External Secrets Operator. + +**Effort**: 6-8 hours. + +Not high priority until we have multiple engineers who shouldn't see +every secret, or compliance requirements. + +### Automated backups to B2 + +**Why**: Neon's backup is Neon's problem. If Neon-as-a-company +disappeared, we'd lose everything. + +**How**: Nightly `pg_dump | gzip`, uploaded to B2 through its S3-compatible API (`aws s3 cp` with B2's `--endpoint-url`, or `s3cmd put`), as a +CronJob in the cluster. + +**Effort**: 2 hours. + +### Multi-region + +**Why**: ~100 ms CF→origin hop could be reduced by having origins in +multiple regions. Not needed at current scale. + +**How**: Add 2 more Hetzner nodes in ash (Ashburn, US). Separate k3s +cluster (or one stretched cluster — painful). Cloudflare Load Balancing +for geo-based routing. + +**Effort**: Days of work, doubling cost. Don't do it until traffic justifies it. + +### CF Workers for static + caching + +**Why**: Certain endpoints (the marketing landing page, public API +lookups) could serve from CF Workers with near-zero origin load. + +**How**: Move static pages to Cloudflare Pages; cache API responses +with `Cache-Control: public, max-age=300`. + +**Effort**: 4-6 hours. + +### WireGuard-encrypted overlay + +**Why**: Current Flannel VXLAN is plaintext between nodes. An attacker +with Hetzner-internal network access could read pod-to-pod traffic. + +**How**: K3s supports `--flannel-backend=wireguard-native`. Reinstall +k3s server on each node with the new backend. + +**Effort**: 2-3 hours (requires brief downtime).
+ +## Scaling triggers + +| Trigger | Action | +|---|---| +| p99 latency > 500ms sustained | Investigate with tracing; consider CF Workers for cached paths | +| API CPU > 70% sustained | HPA already configured; may need more nodes | +| DB connections at Neon limit | Upgrade Neon Scale or reduce `DB_MAX_OPEN_CONNS` | +| Redis memory > 80% | Scale Redis memory; consider cache sharding | +| B2 storage > 500 GB | Evaluate if R2 (free egress) is cheaper overall | +| Active users > 100k | Evaluate multi-region, CF Pro, paid monitoring | +| Revenue > $5k/mo | Hire ops help; this document assumes solo operator | + +## Known gaps we accept + +- **No canary deploys**: all-or-nothing rollouts via `kubectl set image` +- **No feature flags** (app-level): code is deployed as-is. Can't toggle + features without re-deploying +- **No A/B testing infra**: out of scope for current product stage +- **No Windows/tablet-specific CDN rules**: CF serves everyone the same + responses +- **No explicit blue-green**: rolling updates only + +## Stuff to delete when brave + +- `deploy/` (the Swarm era) — once we've been on k3s 30 days +- Legacy UFW rules from the Swarm era (2377, 7946, 4789, ESP, 500, 3000) + — they don't hurt but they're confusing +- `deploy-k3s/manifests/secrets.yaml.example` — we don't use this + pattern, we create secrets imperatively + +## Stuff that could go wrong and we should plan for + +- **Hetzner price hike**: 2026-04-01 already happened. If another one + comes, we could migrate to Netcup or OVH for savings. +- **Neon EOL free tier**: Neon could change pricing policy. Fallback: + self-host Postgres on a Hetzner box or migrate to Supabase. +- **Cloudflare Free plan changes**: CF could restrict Free features. + Fallback: BunnyCDN, or raw nodes without CDN. +- **Gitea host outage**: If Gitea is down, deploys can't pull new + images. Existing pods continue. For long outages, we'd cache images + locally or temporarily push to Docker Hub. 
+ +## Progress tracker + +As items are done, mark them here. Think of this as a running changelog. + +- [x] k3s migration from Swarm (2026-04-24) +- [x] Traefik DaemonSet + hostNetwork +- [x] Admin seed via ADMIN_EMAIL + ADMIN_PASSWORD +- [x] Documentation book (this doc set) +- [ ] All other items above diff --git a/docs/deployment/README.md b/docs/deployment/README.md new file mode 100644 index 0000000..a84f056 --- /dev/null +++ b/docs/deployment/README.md @@ -0,0 +1,112 @@ +# honeyDue Production Deployment — The Book + +This is the complete reference for the honeyDue production deployment as it +exists on **2026-04-24**. It serves two audiences: + +1. **A new engineer** learning the system for the first time. Start at + Chapter 0 (Overview) and read in order. Concepts are built up; nothing is + assumed beyond "you've deployed web apps before." +2. **The operator** (future-you) needing a specific fact fast. Every chapter + opens with a one-paragraph summary and has an operator runbook at its end. + The appendices are a cheat sheet. + +The deployment is non-trivial. It's a 3-node HA Kubernetes cluster running +a Go API, a Next.js admin panel, a background worker, Redis, and Traefik — +all fronted by Cloudflare, integrated with Neon Postgres, Backblaze B2, and +a self-hosted Gitea registry. This book explains **why each of those pieces +was chosen** (often over two or three alternatives we tried first), what +they do, and how to operate them. 
+ +## Table of Contents + +### Part I — The System + +- [00 — Overview](./00-overview.md) — what's running, at a glance +- [01 — Infrastructure](./01-infrastructure.md) — Hetzner nodes, specs, cost, region +- [02 — Orchestrator Choice](./02-orchestrator-choice.md) — why k3s (and not Swarm, full k8s, or Nomad) + +### Part II — Networking + +- [03 — Networking](./03-networking.md) — flannel, CoreDNS, kube-proxy, the overlay story +- [04 — Firewall](./04-firewall.md) — every UFW rule on every node, rationale +- [13 — Cloudflare](./13-cloudflare.md) — DNS, SSL modes, round-robin origin pool + +### Part III — Security + +- [05 — Security](./05-security.md) — RBAC, Pod Security, secrets, TLS chain +- [06 — Traefik Ingress](./06-traefik-ingress.md) — host-network DaemonSet, cert plan + +### Part IV — Workloads + +- [07 — Services](./07-services.md) — api, admin, worker, redis per-service deep dive +- [08 — Database](./08-database.md) — Neon Postgres, advisory-lock migrations +- [09 — Storage](./09-storage.md) — Backblaze B2, minio-go client details +- [10 — Secrets & Config](./10-secrets-config.md) — ConfigMap, Secret, env mapping +- [11 — Registry](./11-registry.md) — Gitea container registry, multi-arch builds + +### Part V — Operation + +- [12 — Data Flow](./12-data-flow.md) — end-to-end request lifecycle +- [14 — Deployment Process](./14-deployment-process.md) — how to roll new code +- [15 — Observability](./15-observability.md) — logs, metrics, tracing +- [16 — Failure Modes](./16-failure-modes.md) — what happens when X dies +- [17 — Runbook](./17-runbook.md) — common ops tasks + +### Part VI — Context + +- [18 — Cost](./18-cost.md) — what this costs to run, per service +- [19 — Swarm Postmortem](./19-postmortem-swarm.md) — the story of why we migrated from Docker Swarm +- [20 — Roadmap](./20-roadmap.md) — known TODOs and scaling triggers + +### Appendices + +- [A — Glossary](./appendices/a-glossary.md) +- [B — kubectl Cheat Sheet](./appendices/b-commands.md) +- [C — 
File Locations](./appendices/c-file-locations.md) +- [D — References & Citations](./appendices/d-references.md) + +## Quick Facts + +| Field | Value | +|---|---| +| Orchestrator | K3s v1.34.6+k3s1 (3 nodes, HA control plane) | +| Ingress | Traefik v3 (DaemonSet, hostNetwork) | +| Nodes | 3× Hetzner Cloud CX33 (4 vCPU, 8 GB RAM, 80 GB SSD) in `nbg1` (Nuremberg) | +| DNS & Edge | Cloudflare (Free plan), SSL=Flexible, round-robin 3 node A records | +| Database | Neon Postgres, `ep-floral-truth-amttbc5a.c-5.us-east-1.aws.neon.tech` | +| Cache + Queue | Redis 7-alpine, in-cluster, 1 replica, PVC-backed, pinned to `nbg1-2` | +| Object Storage | Backblaze B2, `honeyDueProd` bucket, `us-east-005` region | +| Image Registry | Self-hosted Gitea v1.25.5 at `gitea.treytartt.com` | +| Transactional Email | Fastmail SMTP (`smtp.fastmail.com:587`) | +| Domains | `api.myhoneydue.com`, `admin.myhoneydue.com`, `myhoneydue.com` | +| Monthly Cost (current) | ~$30–40 (3× Hetzner + Neon Launch + B2 + Cloudflare Free + Gitea free) | +| kubeconfig | `~/.kube/honeydue-k3s.yaml` on operator workstation | +| Repo | `honeyDueAPI-go/deploy-k3s/` for manifests, `deploy/` is the legacy Swarm config | + +## How to Read This Book + +- **"Why did we…?"** answers are in the chapter covering that component. Every + major design choice has an explicit rejection of 1–3 alternatives. +- **Historical bugs** are in Chapter 19. The rest of the book describes the + current (fixed) state; 19 is the forensic record of what was broken and + how we figured it out. +- **Operator commands** you'll run regularly are in Appendix B. Chapter 17 + has longer procedures (cert rotation, DB migration, etc.). +- **Citations** throughout use footnote-style links to the canonical source + (k3s docs, moby issues, Cloudflare docs, etc.). Appendix D collects them. + +## Conventions + +- Kubernetes namespace for the app is `honeydue`. +- SSH aliases are `hetzner1`, `hetzner2`, `hetzner3` in your `~/.ssh/config`. 
+- Node hostnames in the cluster are `ubuntu-8gb-nbg1-{1,2,3}` (Hetzner-assigned). +- The mapping is non-obvious because the Hetzner hostname suffix order does + not match SSH alias order: + +| SSH alias | Public IP | Hostname in k3s | +|---|---|---| +| hetzner1 | 178.104.247.152 | `ubuntu-8gb-nbg1-2` | +| hetzner2 | 178.105.32.198 | `ubuntu-8gb-nbg1-1` | +| hetzner3 | 178.104.249.189 | `ubuntu-8gb-nbg1-3` | + +When a chapter refers to "hetzner1" it means the box at 178.104.247.152 / `nbg1-2`. diff --git a/docs/deployment/appendices/a-glossary.md b/docs/deployment/appendices/a-glossary.md new file mode 100644 index 0000000..badee6f --- /dev/null +++ b/docs/deployment/appendices/a-glossary.md @@ -0,0 +1,207 @@ +# Appendix A — Glossary + +Alphabetical. Cross-referenced to chapters where each term is used in +detail. + +## Kubernetes / k3s + +**ClusterIP**: Internal IP of a Kubernetes Service. Stable; +load-balances to backing pods. (Chapter 3) + +**containerd**: Container runtime bundled with k3s. Replaces Docker for +the runtime layer. (Chapter 2) + +**ConfigMap**: Kubernetes resource holding non-sensitive config (env +vars). Injected into pods as environment variables via `envFrom`. (Chapter 10) + +**CoreDNS**: Cluster-internal DNS resolver. Every pod's +`/etc/resolv.conf` points to the CoreDNS Service. (Chapter 3) + +**CRD (Custom Resource Definition)**: Kubernetes extension mechanism +for third-party resource types. Traefik's `IngressRoute` and +`Middleware` are CRDs. (Chapter 6) + +**DaemonSet**: Workload that runs exactly one pod per node. We use it +for Traefik so each node has its own ingress pod. (Chapter 6) + +**Deployment**: Kubernetes workload for stateless pods. Supports rolling +updates. Most of our services are Deployments. (Chapter 7) + +**Endpoints**: The actual pod IPs backing a Service's ClusterIP. +Dynamically updated as pods come and go. (Chapter 3) + +**etcd**: Distributed key-value store holding cluster state. K3s +embeds it. Raft-replicated across server nodes.
(Chapter 2) + +**Flannel**: Kubernetes CNI (Container Network Interface) plugin for +pod-to-pod networking. Uses VXLAN tunneling. (Chapter 3) + +**HPA (HorizontalPodAutoscaler)**: K8s resource that scales Deployment +replicas based on CPU/memory usage. Not currently enabled for us. +(Chapter 7) + +**Ingress**: K8s resource describing external-to-internal routing rules. +Traefik watches Ingresses and programs itself accordingly. (Chapter 6) + +**IPVS**: Linux kernel feature for in-kernel L4 load balancing. Our +kube-proxy uses it. (Chapter 3) + +**k3s**: Lightweight Kubernetes distribution by Rancher/SUSE. What we +run. (Chapter 2) + +**kubectl**: Kubernetes CLI tool. Runs on operator workstation. +(Chapter 17) + +**kubelet**: Agent running on each node, responsible for pod lifecycle. +(Chapter 2) + +**kube-proxy**: Service-to-pod routing component. Runs on each node in +IPVS mode. (Chapter 3) + +**Namespace**: Kubernetes logical grouping. Our app lives in `honeydue`. +System services in `kube-system`. (Chapter 7) + +**NetworkPolicy**: K8s resource defining allowed traffic between pods. +Not currently applied. (Chapter 5) + +**Node**: A physical or virtual machine running Kubernetes. We have 3. +(Chapter 1) + +**PDB (PodDisruptionBudget)**: Constraint on voluntary pod disruptions +(drain, upgrade). Keeps N replicas available. (Chapter 7) + +**Pod**: Smallest Kubernetes unit — one or more containers sharing +network and storage. Our pods are usually one-container. (Chapter 7) + +**PVC (PersistentVolumeClaim)**: Request for persistent storage. Redis +uses one. (Chapter 7) + +**RBAC**: Role-Based Access Control. Governs who/what can do what via +the Kubernetes API. (Chapter 5) + +**ReplicaSet**: Managed by a Deployment; ensures N pods of a template +are running. Each deploy creates a new ReplicaSet. (Chapter 14) + +**Secret**: K8s resource holding sensitive values. Base64-encoded; +stored in etcd (unencrypted by default). 
(Chapter 10) + +**Service**: K8s resource providing a stable endpoint (ClusterIP) for +a set of pods. (Chapter 3) + +**ServiceAccount**: Identity used by pods to authenticate to the +Kubernetes API. We disable token mounting for our app pods. +(Chapter 5) + +**Taint / Toleration**: Mechanism to prevent pods from being scheduled +on certain nodes. Not used in our setup. (Chapter 7) + +## Docker / Swarm + +**libnetwork**: Docker's networking library. Provides overlay +networking for Swarm. Source of the DNS ghost bug (Chapter 19). + +**mode: global**: Swarm deploy mode for services running one task per +node. (Chapter 19) + +**mode: host**: Port publishing mode that binds to the node's real +interface, bypassing the ingress mesh. (Chapter 4) + +**Overlay network**: Encrypted or unencrypted virtual network spanning +Swarm nodes. (Chapter 19) + +**Swarm**: Docker's built-in orchestrator. What we used to run. +(Chapter 19) + +**VXLAN**: Virtual Extensible LAN. Layer-2 over Layer-3 tunneling. +Used by both Swarm overlay and Kubernetes Flannel. (Chapter 3) + +## Cloudflare + +**Flexible SSL**: CF SSL mode where CF↔origin is HTTP. Our current +setup. (Chapter 13) + +**Full (strict) SSL**: CF SSL mode where CF↔origin is HTTPS with cert +verification. Our target. (Chapter 13) + +**Origin CA**: CF-internal certificate authority that issues certs CF's +edge trusts. Used for Full strict mode. (Chapter 13) + +**POP (Point of Presence)**: A CF edge location. ~300 globally. +(Chapter 13) + +**Proxied (orange cloud)**: DNS record with CF proxying on. Traffic +goes through CF. (Chapter 13) + +**Workers**: CF's serverless compute at the edge. We don't use them yet. +(Chapter 20) + +## Hetzner + +**CX33**: Hetzner Cloud instance type. 4 vCPU, 8 GB RAM, 80 GB SSD. +(Chapter 1) + +**Cloud Firewall**: Hetzner's provider-level firewall feature. We use +UFW on nodes instead. (Chapter 4) + +**nbg1**: Nuremberg datacenter code. Our region.
(Chapter 1) + +## Neon + +**Branch**: Neon's isolation primitive. Each project can have multiple +branches (prod, staging, dev). (Chapter 8) + +**CU (Compute Unit)**: Neon's pricing unit for compute. +(Chapter 8) + +**Launch plan**: Neon's entry-level paid plan. $5 min + usage. +(Chapter 8) + +**Pooler**: Neon's built-in PgBouncer instance at the `-pooler` hostname +suffix. (Chapter 8) + +## Backblaze B2 + +**B2**: Backblaze's object storage. What we use for uploads. +(Chapter 9) + +**App key**: B2's bucket-scoped credential. Not an IAM-flavored role. +(Chapter 9) + +**S3-compatible**: API that speaks AWS S3 protocol. B2 supports it. +(Chapter 9) + +## Go + Asynq + +**AutoMigrate**: GORM function that syncs DB schema to Go structs. +(Chapter 8) + +**Asynq**: Go library for background job queues. Redis-backed. +(Chapter 7) + +**GORM**: Go ORM we use. (Chapter 8) + +**pgx**: Go Postgres driver used by GORM. (Chapter 8) + +**sync.Once**: Go stdlib primitive for "run this exactly once." Source +of bug #6 (Chapter 19). + +## Other + +**advisory lock**: A Postgres lock that doesn't block rows but lets +apps coordinate voluntarily. We use for migration serialization. +(Chapter 8) + +**AOF (Append-Only File)**: Redis persistence mode that logs every +write. (Chapter 7) + +**MTU**: Maximum Transmission Unit. Packet size limit. VXLAN reduces +effective MTU by 50 bytes. (Chapter 3) + +**Raft**: Consensus algorithm. Used by etcd. (Chapter 2) + +**STARTTLS**: SMTP upgrade from plain to TLS. Used for Fastmail. +(Chapter 5) + +**UFW**: Uncomplicated Firewall. Frontend for iptables. (Chapter 4) + +**VXLAN**: See Docker/Swarm section. diff --git a/docs/deployment/appendices/b-commands.md b/docs/deployment/appendices/b-commands.md new file mode 100644 index 0000000..baec7b9 --- /dev/null +++ b/docs/deployment/appendices/b-commands.md @@ -0,0 +1,305 @@ +# Appendix B — kubectl Cheat Sheet + +Specific to this deployment. 
Assumes: + +```bash +export KUBECONFIG=~/.kube/honeydue-k3s.yaml +``` + +## Viewing state + +```bash +# All pods in our namespace +kubectl get pods -n honeydue + +# With node placement + IPs +kubectl get pods -n honeydue -o wide + +# All resources in our namespace +kubectl get all -n honeydue + +# Cluster-wide pod overview +kubectl get pods -A + +# Node health +kubectl get nodes +kubectl top nodes + +# What's using RAM +kubectl top pods -n honeydue --sort-by=memory + +# What's using CPU +kubectl top pods -n honeydue --sort-by=cpu +``` + +## Logs + +```bash +# Follow all api pod logs +kubectl logs -n honeydue -l app.kubernetes.io/name=api -f --prefix + +# One specific pod (substitute the pod name) +kubectl logs -n honeydue POD_NAME + +# Previous pod's logs (after crash) +kubectl logs -n honeydue POD_NAME --previous + +# Filtered +kubectl logs -n honeydue deploy/api | grep -i error +kubectl logs -n honeydue deploy/api --since=1h + +# stern is nicer for multi-pod (if installed) +stern -n honeydue api +``` + +## Deploying new code + +```bash +SHA=$(git rev-parse --short HEAD) + +# Build + push (requires docker login to Gitea first) +docker buildx build --platform linux/amd64 --target api \ + -t "gitea.treytartt.com/admin/honeydue-api:${SHA}" --push .
+ +# Roll it in +kubectl set image deployment/api -n honeydue \ + api="gitea.treytartt.com/admin/honeydue-api:${SHA}" + +# Watch +kubectl rollout status -n honeydue deployment/api +``` + +## Rolling update controls + +```bash +# Pause a rollout in progress (new pods stop being created) +kubectl rollout pause deployment/api -n honeydue + +# Resume +kubectl rollout resume deployment/api -n honeydue + +# Rollback to previous version +kubectl rollout undo deployment/api -n honeydue + +# Rollback to specific revision +kubectl rollout history deployment/api -n honeydue +kubectl rollout undo deployment/api -n honeydue --to-revision=3 + +# Force restart (re-pulls image if digest changed; reloads ConfigMap) +kubectl rollout restart deployment/api -n honeydue +``` + +## Scaling + +```bash +# Scale up +kubectl scale deployment/api -n honeydue --replicas=5 + +# Scale down +kubectl scale deployment/api -n honeydue --replicas=3 + +# Kill everything (emergency) +kubectl scale deployment -n honeydue --all --replicas=0 + +# Bring back +kubectl scale deployment/api -n honeydue --replicas=3 +kubectl scale deployment/admin deployment/worker deployment/redis -n honeydue --replicas=1 +``` + +## Debugging a pod + +```bash +# Describe = events + state + restart history +kubectl describe pod -n honeydue + +# Shell in +kubectl exec -it -n honeydue deploy/api -- /bin/sh + +# Inside: +# Test HTTP locally (bypasses Traefik, Service, overlay) +wget -qO- http://127.0.0.1:8000/api/health/ + +# Test cross-Service DNS +getent hosts redis +getent hosts admin +getent hosts postgres + +# Run arbitrary command (one-shot) +kubectl exec -n honeydue deploy/api -- env | grep POSTGRES +``` + +## Networking checks + +```bash +# Resolve a Service from a pod +kubectl exec -n honeydue deploy/api -- nslookup redis + +# Check Service endpoints (the actual IPs behind a ClusterIP) +kubectl get endpoints -n honeydue api + +# Traffic test via Service +kubectl run test --rm -it --image=alpine/curl -- sh +# curl 
http://api.honeydue.svc:8000/api/health/ + +# List all Ingresses +kubectl get ingress -A +``` + +## Secret / Config + +```bash +# List +kubectl get secrets -n honeydue +kubectl get configmap -n honeydue + +# Describe (shows keys, not values) +kubectl describe secret honeydue-secrets -n honeydue + +# Read a value (DANGER: plaintext to stdout) +kubectl get secret honeydue-secrets -n honeydue \ + -o jsonpath='{.data.POSTGRES_PASSWORD}' | base64 -d; echo + +# Update a single secret key +kubectl patch secret honeydue-secrets -n honeydue \ + --type=merge -p "{\"data\":{\"SECRET_KEY\":\"$(echo -n 'new-val' | base64)\"}}" + +# Regenerate ConfigMap from prod.env +kubectl create configmap honeydue-config -n honeydue \ + --from-env-file=deploy/prod.env \ + --dry-run=client -o yaml | kubectl apply -f - + +# Edit a ConfigMap interactively (does NOT restart pods) +kubectl edit configmap honeydue-config -n honeydue +``` + +## Node management + +```bash +# Prevent scheduling on a node (NODE_NAME = k3s hostname, e.g. ubuntu-8gb-nbg1-2) +kubectl cordon NODE_NAME + +# Prevent scheduling + evict existing pods +kubectl drain NODE_NAME --ignore-daemonsets --delete-emptydir-data + +# Allow scheduling again +kubectl uncordon NODE_NAME + +# Label a node +kubectl label node NODE_NAME honeydue/redis=true --overwrite + +# Remove a label +kubectl label node NODE_NAME honeydue/redis- +``` + +## Events (the timeline) + +```bash +# All events, newest last +kubectl get events -A --sort-by=.lastTimestamp + +# Watch live +kubectl get events -A --sort-by=.lastTimestamp -w + +# Only warnings +kubectl get events -A --field-selector type=Warning + +# Events for a specific pod +kubectl describe pod -n honeydue POD_NAME | awk '/Events:/,0' +``` + +## Traefik-specific + +```bash +# All Traefik pods (DaemonSet, so one per node) +kubectl get pods -n kube-system -l app.kubernetes.io/name=traefik -o wide + +# Restart Traefik across all nodes +kubectl rollout restart daemonset/traefik -n kube-system + +# View Traefik config (via ConfigMap) +kubectl get cm -n kube-system traefik -o yaml | less + +# See the
HelmChartConfig we applied +kubectl get helmchartconfig -n kube-system traefik -o yaml + +# Force Helm re-reconcile +kubectl delete job -n kube-system helm-install-traefik +``` + +## Cluster-wide operations + +```bash +# API server health +kubectl cluster-info + +# All namespaces +kubectl get namespaces + +# All k3s-system pods +kubectl get pods -n kube-system + +# All ServiceAccounts in our namespace +kubectl get sa -n honeydue + +# Check what an SA can do +kubectl auth can-i --list --as=system:serviceaccount:honeydue:api +``` + +## Hetzner SSH (not kubectl but often needed) + +```bash +# SSH in +ssh -i ~/.ssh/hetzner deploy@hetzner1 + +# Check k3s service +ssh -i ~/.ssh/hetzner deploy@hetzner1 'sudo systemctl status k3s' + +# Per-node commands, one node at a time (e.g., apt upgrade) +for h in hetzner1 hetzner2 hetzner3; do + ssh -i ~/.ssh/hetzner "deploy@$h" 'sudo apt update && sudo apt upgrade -y' +done +``` + +## Emergency: cluster is wedged + +```bash +# Check all nodes Ready +kubectl get nodes + +# If one is NotReady (replace hetznerN with that node's SSH alias) +ssh -i ~/.ssh/hetzner deploy@hetznerN 'sudo systemctl restart k3s' + +# If still bad, kill k3s on that node and check +ssh -i ~/.ssh/hetzner deploy@hetznerN 'sudo /usr/local/bin/k3s-killall.sh' +ssh -i ~/.ssh/hetzner deploy@hetznerN 'sudo systemctl start k3s' + +# Last resort: uninstall + rejoin +# ssh -i ~/.ssh/hetzner deploy@hetznerN 'sudo /usr/local/bin/k3s-uninstall.sh' +# then re-join via the k3s install command +``` + +## One-liners worth memorizing + +```bash +# Heavy smoke test through CF +for url in https://api.myhoneydue.com/api/health/ https://admin.myhoneydue.com/ https://myhoneydue.com/; do + ok=0 + for i in $(seq 1 20); do + [[ "$(curl -sS -o /dev/null -w '%{http_code}' --max-time 10 "$url")" == "200" ]] && ok=$((ok+1)) + done + printf "%-45s %d/20\n" "$url" "$ok" +done + +# Pods not ready (with -A the STATUS column is $4; $1 is NAMESPACE) +kubectl get pods -A | awk '$4!="Running" && $4!="Completed" && $4!="STATUS"' + +# Restart everything in our namespace +for d in api admin worker redis; do + kubectl rollout restart
deploy/$d -n honeydue +done + +# Watch all rollouts simultaneously +for d in api admin worker redis; do + kubectl rollout status deploy/$d -n honeydue & +done; wait +``` diff --git a/docs/deployment/appendices/c-file-locations.md b/docs/deployment/appendices/c-file-locations.md new file mode 100644 index 0000000..cedccff --- /dev/null +++ b/docs/deployment/appendices/c-file-locations.md @@ -0,0 +1,216 @@ +# Appendix C — File Locations + +Complete map of where every significant file lives — on the operator +workstation, in the git repo, and on the Hetzner nodes. + +## Operator workstation + +### Kubernetes + +| Path | Purpose | +|---|---| +| `~/.kube/honeydue-k3s.yaml` | kubeconfig for the k3s cluster. Contains an admin bearer token. Mode 0600. | +| `~/.kube/config` | Default kubeconfig (points elsewhere, not our cluster). | + +Set `KUBECONFIG=~/.kube/honeydue-k3s.yaml` before any `kubectl` command. + +### SSH + +| Path | Purpose | +|---|---| +| `~/.ssh/hetzner` | Private key for node SSH (ed25519). Mode 0600. | +| `~/.ssh/hetzner.pub` | Public key corresponding to above. | +| `~/.ssh/config` | Host aliases for hetzner1/hetzner2/hetzner3 → node IPs. | + +Public key content: +``` +ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIBU9xTTBD78tYUqHijgyU9PDqtmS4NuM/6uy8XgDzva+ hetzner2@myhoneydue.com +``` + +### Docker + +| Path | Purpose | +|---|---| +| `~/.docker/config.json` | Docker CLI config. After `docker login` to Gitea, contains creds. **Log out after each deploy** to not leave PATs on disk. | +| `~/Library/Containers/com.docker.docker/` | Docker Desktop state (macOS). | + +## Git repo (`/Users/treyt/Desktop/code/honeyDue/honeyDueAPI-go/`) + +### Top-level + +| Path | Purpose | +|---|---| +| `CLAUDE.md` | Project-wide instructions for Claude assistant. Never commit secrets here. | +| `Dockerfile` | Multi-stage Docker build: api, worker, admin targets. | +| `go.mod`, `go.sum` | Go module definition. | +| `package.json` (admin-ui/) | Next.js dependencies. 
| + +### Application code + +| Path | Purpose | +|---|---| +| `cmd/api/main.go` | API server entry point. | +| `cmd/worker/main.go` | Background worker entry point. | +| `cmd/admin/main.go` | (may or may not exist for Go admin variant) | +| `internal/config/` | Viper configuration loading. | +| `internal/database/` | Postgres connection, migrations. | +| `internal/handlers/` | HTTP handlers (one file per domain). | +| `internal/services/` | Business logic. `cache_service.go` is where the sync.Once bug was (Chapter 19). | +| `internal/repositories/` | GORM repositories. | +| `internal/router/router.go` | Echo routes, including static file serving. CSP is set here. | +| `internal/middleware/` | Echo middleware (auth, logging, etc.). | +| `internal/task/` | Task predicates/scopes/categorization. See `docs/TASK_LOGIC_ARCHITECTURE.md`. | + +### Deploy config (Swarm era — mostly unused; `prod.env` and `cluster.env` still feed k3s) + +| Path | Purpose | +|---|---| +| `deploy/` | Legacy Swarm deploy root. | +| `deploy/prod.env` | Non-secret config (ConfigMap source). **Gitignored.** | +| `deploy/registry.env` | Gitea PAT + registry URL. **Gitignored.** | +| `deploy/cluster.env` | Swarm cluster settings. Partly used for k3s too (manager host). **Gitignored.** | +| `deploy/secrets/postgres_password.txt` | Neon password. **Gitignored.** | +| `deploy/secrets/secret_key.txt` | App signing key (≥32 chars). **Gitignored.** | +| `deploy/secrets/email_host_password.txt` | Fastmail password. **Gitignored.** | +| `deploy/secrets/fcm_server_key.txt` | FCM key (placeholder, push off). **Gitignored.** | +| `deploy/secrets/apns_auth_key.p8` | APNs key (placeholder, push off). **Gitignored.** | +| `deploy/swarm-stack.prod.yml` | Swarm stack definition. Unused after migration. | +| `deploy/Caddyfile` | Caddy config. Unused after migration. | +| `deploy/scripts/deploy_prod.sh` | Swarm deploy script. Unused. | +| `deploy/DEPLOYING.md`, `deploy/README.md`, `deploy/shit_deploy_cant_do.md` | Swarm-era docs. Historical reference.
| + +### Deploy config (k3s) + +| Path | Purpose | +|---|---| +| `deploy-k3s/README.md` | k3s deployment README (scaffold version). | +| `deploy-k3s/MIGRATION_NOTES.md` | Notes from Swarm → k3s migration. | +| `deploy-k3s/SECURITY.md` | Security posture doc (scaffold). | +| `deploy-k3s/config.yaml.example` | Template for a unified config.yaml (unused — we kept Swarm's file layout). | +| `deploy-k3s/manifests/namespace.yaml` | Creates `honeydue` namespace. | +| `deploy-k3s/manifests/rbac.yaml` | ServiceAccounts + `automountServiceAccountToken: false`. | +| `deploy-k3s/manifests/pod-disruption-budgets.yaml` | PDBs for api (2/3) and worker (0/1). | +| `deploy-k3s/manifests/network-policies.yaml` | Default-deny + allows. NOT currently applied. | +| `deploy-k3s/manifests/api/deployment.yaml` | api Deployment. | +| `deploy-k3s/manifests/api/service.yaml` | api ClusterIP Service. | +| `deploy-k3s/manifests/api/hpa.yaml` | api HorizontalPodAutoscaler. NOT currently applied. | +| `deploy-k3s/manifests/admin/deployment.yaml` | admin Deployment. | +| `deploy-k3s/manifests/admin/service.yaml` | admin Service. | +| `deploy-k3s/manifests/worker/deployment.yaml` | worker Deployment. | +| `deploy-k3s/manifests/redis/deployment.yaml` | Redis Deployment. | +| `deploy-k3s/manifests/redis/service.yaml` | Redis Service. | +| `deploy-k3s/manifests/redis/pvc.yaml` | Redis PersistentVolumeClaim. | +| `deploy-k3s/manifests/ingress/ingress.yaml` | Full Ingress with TLS + middleware (scaffold; needs CF origin cert). | +| `deploy-k3s/manifests/ingress/ingress-simple.yaml` | Simple Ingress without TLS (what we actually apply). | +| `deploy-k3s/manifests/ingress/middleware.yaml` | Traefik middleware CRDs. Not currently applied. | +| `deploy-k3s/manifests/traefik-helmchartconfig.yaml` | Our DaemonSet + hostNetwork override for Traefik. | +| `deploy-k3s/manifests/secrets.yaml.example` | Template (never deployed). 
| +| `deploy-k3s/scripts/01-provision-cluster.sh` | hetzner-k3s provisioning (we didn't use it; existing nodes). | +| `deploy-k3s/scripts/02-setup-secrets.sh` | Creates Secrets + ConfigMap (scaffold version; we ran commands manually). | +| `deploy-k3s/scripts/03-deploy.sh` | Applies manifests (unused; we ran kubectl manually). | +| `deploy-k3s/scripts/04-verify.sh` | Post-deploy verification. | +| `deploy-k3s/scripts/rollback.sh` | Rollback helper. | + +### Documentation + +| Path | Purpose | +|---|---| +| `docs/deployment/` | **This book.** | +| `docs/TASK_LOGIC_ARCHITECTURE.md` | Task logic internals. | +| `docs/PUSH_NOTIFICATIONS.md` | Push notifications setup (for future). | +| `docs/SUBSCRIPTION_WEBHOOKS.md` | Apple/Google subscription webhooks. | +| `docs/Dokku_notes` | Pre-Swarm era deployment notes. Historical. | +| `docs/server_2026_2_24.md` | Earlier architecture doc (predates k3s migration). | + +## On the Hetzner nodes + +### System + +| Path | Purpose | +|---|---| +| `/etc/ssh/sshd_config` | SSH config — `PermitRootLogin no`, `PasswordAuthentication no`, `AllowUsers deploy`. | +| `/etc/sudoers.d/deploy` | `deploy ALL=(ALL) NOPASSWD: ALL`. | +| `/etc/ufw/` | UFW configuration. See Chapter 4 for rule inventory. | +| `/etc/sysctl.d/99-unprivileged-ports.conf` | `net.ipv4.ip_unprivileged_port_start=0` for Traefik. | +| `/home/deploy/.ssh/authorized_keys` | Our hetzner.pub. | + +### K3s + +| Path | Purpose | +|---|---| +| `/etc/rancher/k3s/k3s.yaml` | Kubeconfig (localhost-scoped; we copied to workstation). | +| `/etc/systemd/system/k3s.service` | systemd service file. | +| `/etc/systemd/system/k3s.service.env` | K3s install args (INSTALL_K3S_EXEC). | +| `/var/lib/rancher/k3s/` | K3s state root (etcd, containerd, PVC storage). | +| `/var/lib/rancher/k3s/server/node-token` | Token for joining additional nodes. | +| `/var/lib/rancher/k3s/storage/` | local-path PVC storage. Redis data lives here. 
| +| `/var/lib/rancher/k3s/agent/containerd/` | containerd state. | +| `/var/log/containers/` | Container log files. | + +### Commands installed + +| Path | Purpose | +|---|---| +| `/usr/local/bin/k3s` | The k3s binary. | +| `/usr/local/bin/kubectl` | Symlink to k3s (CLI for this cluster). | +| `/usr/local/bin/crictl` | containerd CLI. | +| `/usr/local/bin/k3s-killall.sh` | Emergency kill-all-k3s script. | +| `/usr/local/bin/k3s-uninstall.sh` | Clean uninstall script. | + +### Docker (legacy; disabled) + +| Path | Purpose | +|---|---| +| `/etc/systemd/system/docker.service` | systemd unit (stopped + disabled). | +| `/var/lib/docker/` | Docker state (unused on current cluster). | + +## On Cloudflare + +Not a filesystem, but worth noting the dashboard hierarchy: + +``` +Websites → myhoneydue.com +├── DNS → Records (A records for api, admin, @) +├── SSL/TLS → Overview (SSL mode: Flexible) +├── SSL/TLS → Edge Certificates (Always Use HTTPS: On) +├── SSL/TLS → Origin Server (where the Origin CA cert would live if we enabled it) +├── Rules → Overview (where Origin Rules would live if we had them) +├── Rules → Page Rules (none) +├── Security → WAF (managed rules only) +├── Speed → Optimization (default) +└── Analytics & Logs (read-only stats) +``` + +## On Gitea (`gitea.treytartt.com`) + +The image registry lives at: + +``` +gitea.treytartt.com/admin/-/packages # UI listing of all packages +gitea.treytartt.com/admin/-/packages/container/honeydue-api # API image +gitea.treytartt.com/admin/-/packages/container/honeydue-worker # Worker image +gitea.treytartt.com/admin/-/packages/container/honeydue-admin # Admin image +``` + +Per-version tags visible in the UI with `docker pull` commands. + +PATs at `gitea.treytartt.com/-/user/settings/applications`.
+ +## On Neon + +``` +console.neon.tech → project → Branches (production branch default) +console.neon.tech → project → Monitoring (CU-hour usage, slow queries) +console.neon.tech → project → Operations (history of schema changes) +``` + +Connection strings at `console.neon.tech → project → Connection Details`. + +## On Backblaze B2 + +``` +secure.backblaze.com/b2_buckets.htm # Buckets list +secure.backblaze.com/b2_app_keys.htm # App keys +``` + +`honeyDueProd` bucket → Files tab for browsing contents. diff --git a/docs/deployment/appendices/d-references.md b/docs/deployment/appendices/d-references.md new file mode 100644 index 0000000..31f3ea4 --- /dev/null +++ b/docs/deployment/appendices/d-references.md @@ -0,0 +1,202 @@ +# Appendix D — References & Citations + +Every external link cited anywhere in this book, grouped by topic. + +## Docker / Moby + +- [moby/moby#52265 — Overlay ARP stale entries on 29.3.0 regression][moby-52265] (Chapter 19, primary root-cause citation) +- [moby/moby#51491 — DNS broken after `docker swarm init` on 29.0.0][moby-51491] +- [Dokploy#3480 — Traefik routes intermittently timeout due to stale VIP][dokploy-3480] +- [Mirantis: Commits to Long-Term Support for Swarm Through 2030][mirantis-swarm] +- [Better Stack: Hetzner Cloud Review 2026][bstack-swarm] +- [VirtualizationHowTo: Is Docker Swarm Still Safe in 2026?][vht-swarm] +- [bleevht: Where Docker Swarm Still Fits in 2026][bleevht-swarm] +- [Docker buildx multi-platform builds][buildx] +- [Compose specification][compose-spec] + +## Kubernetes / k3s + +- [K3s documentation home][k3s-docs] +- [K3s architecture][k3s-arch] +- [K3s requirements (networking ports)][k3s-reqs] +- [K3s advanced config — metrics server][k3s-metrics] +- [K3s HA datastore recovery][k3s-ha-recovery] +- [K3s storage — local-path provisioner][k3s-lp] +- [K3s Helm integration — HelmChartConfig][k3s-helm] +- [K3s Traefik customization][k3s-traefik] +- [K3s secrets encryption][k3s-secrets] +- [Kubernetes concepts — 
Services & Networking][k8s-net] +- [Kubernetes Ingress][k8s-ingress] +- [Kubernetes Deployments — rolling updates][rolling] +- [kubectl rollout][rollout] +- [kubectl cheat sheet][kubectl-cs] +- [Pod lifecycle + probes][probes] +- [Pod Security Standards][psa] +- [Kubernetes RBAC][rbac] +- [NetworkPolicy][netpol] +- [Ports and Protocols reference][k8s-ports] +- [metrics-server][ms] + +## Traefik + +- [Traefik v3 documentation][traefik] +- [Traefik Swarm provider][traefik-swarm] +- [Traefik migrate v2 → v3][traefik-v3] + +## Cloudflare + +- [IP ranges][cf-ips] +- [SSL modes explained][cf-ssl] +- [Origin CA certificates][cf-origin-ca] +- [DNS best practices][cf-dns] +- [Free plan][cf-free] + +## Hetzner + +- [Hetzner Cloud][hetzner-cloud] +- [Hetzner price adjustment 2026-04-01][hetzner-prices] +- [Hetzner rescue system][hetzner-rescue] +- [hetzner-k3s tool][hetzner-k3s] + +## Neon / Postgres + +- [Neon docs][neon-docs] +- [Neon pricing][neon-pricing] +- [Neon usage-based pricing announcement][neon-blog] +- [Neon connect from any app][neon-connect] +- [Postgres advisory locks][pg-locks] +- [GORM AutoMigrate][gorm-automigrate] + +## Backblaze B2 + +- [B2 documentation][b2-docs] +- [B2 S3-compatible API][b2-s3] +- [B2 pricing][b2-pricing] +- [minio-go SDK][minio-go] +- [S3 path-style vs virtual-hosted addressing][s3-style] + +## Gitea + +- [Gitea container registry docs][gitea-cr] + +## CNI / Networking + +- [Flannel VXLAN backend][flannel-vxlan] +- [CoreDNS Kubernetes plugin][coredns-k8s] +- [IPVS mode for kube-proxy deep dive][ipvs] +- [VXLAN RFC 7348][vxlan-rfc] +- [Kubernetes NetworkPolicy][netpol] + +## Security tools + +- [cosign (image signing)][cosign] +- [Loki (logs)][loki] +- [Stern (multi-pod log tailing)][stern] +- [fail2ban][fail2ban] + +## Asynq + +- [Asynq documentation][asynq] +- [Asynq periodic tasks (scheduler limitations)][asynq-sched] + +## Miscellaneous + +- [Let's Encrypt][le] +- [UFW man page][ufw-man] +- [SSH hardening guide][ssh-guide] +- 
[pg_dump][pg-dump] + +--- + +## Link definitions + + +[moby-52265]: https://github.com/moby/moby/issues/52265 +[moby-51491]: https://github.com/moby/moby/issues/51491 +[dokploy-3480]: https://github.com/Dokploy/dokploy/issues/3480 +[mirantis-swarm]: https://www.mirantis.com/blog/mirantis-guarantees-long-term-support-for-swarm/ +[bstack-swarm]: https://betterstack.com/community/guides/web-servers/hetzner-cloud-review/ +[vht-swarm]: https://www.virtualizationhowto.com/2026/03/is-docker-swarm-still-safe-in-2026/ +[bleevht-swarm]: https://bleevht.substack.com/p/where-docker-swarm-still-fits-in +[buildx]: https://docs.docker.com/build/buildx/ +[compose-spec]: https://docs.docker.com/reference/compose-file/ + + +[k3s-docs]: https://docs.k3s.io/ +[k3s-arch]: https://docs.k3s.io/architecture +[k3s-reqs]: https://docs.k3s.io/installation/requirements#networking +[k3s-metrics]: https://docs.k3s.io/advanced#enabling-metrics-server +[k3s-ha-recovery]: https://docs.k3s.io/datastore/ha-embedded#new-cluster-with-embedded-db +[k3s-lp]: https://docs.k3s.io/storage#setting-up-the-local-storage-provider +[k3s-helm]: https://docs.k3s.io/helm#customizing-packaged-components-with-helmchartconfig +[k3s-traefik]: https://docs.k3s.io/networking/networking-services#traefik-ingress-controller +[k3s-secrets]: https://docs.k3s.io/security/secrets-encryption +[k8s-net]: https://kubernetes.io/docs/concepts/services-networking/ +[k8s-ingress]: https://kubernetes.io/docs/concepts/services-networking/ingress/ +[rolling]: https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#rolling-update-deployment +[rollout]: https://kubernetes.io/docs/reference/generated/kubectl/kubectl-commands#rollout +[kubectl-cs]: https://kubernetes.io/docs/reference/kubectl/cheatsheet/ +[probes]: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-lifecycle +[psa]: https://kubernetes.io/docs/concepts/security/pod-security-standards/ +[rbac]: 
https://kubernetes.io/docs/reference/access-authn-authz/rbac/ +[netpol]: https://kubernetes.io/docs/concepts/services-networking/network-policies/ +[k8s-ports]: https://kubernetes.io/docs/reference/networking/ports-and-protocols/ +[ms]: https://github.com/kubernetes-sigs/metrics-server + + +[traefik]: https://doc.traefik.io/traefik/v3.6/ +[traefik-swarm]: https://doc.traefik.io/traefik/providers/swarm/ +[traefik-v3]: https://doc.traefik.io/traefik/migrate/v2-to-v3-details/ + + +[cf-ips]: https://www.cloudflare.com/ips/ +[cf-ssl]: https://developers.cloudflare.com/ssl/origin-configuration/ssl-modes/ +[cf-origin-ca]: https://developers.cloudflare.com/ssl/origin-configuration/origin-ca/ +[cf-dns]: https://developers.cloudflare.com/dns/ +[cf-free]: https://www.cloudflare.com/plans/free/ + + +[hetzner-cloud]: https://www.hetzner.com/cloud/ +[hetzner-prices]: https://docs.hetzner.com/general/infrastructure-and-availability/price-adjustment/ +[hetzner-rescue]: https://docs.hetzner.com/cloud/servers/getting-started/enabling-rescue-system/ +[hetzner-k3s]: https://github.com/vitobotta/hetzner-k3s + + +[neon-docs]: https://neon.com/docs/introduction +[neon-pricing]: https://neon.com/pricing +[neon-blog]: https://neon.com/blog/new-usage-based-pricing +[neon-connect]: https://neon.com/docs/connect/connect-from-any-app +[pg-locks]: https://www.postgresql.org/docs/current/explicit-locking.html#ADVISORY-LOCKS +[gorm-automigrate]: https://gorm.io/docs/migration.html + + +[b2-docs]: https://www.backblaze.com/docs/ +[b2-s3]: https://www.backblaze.com/docs/cloud-storage-s3-compatible-api +[b2-pricing]: https://www.backblaze.com/cloud-storage/pricing +[minio-go]: https://github.com/minio/minio-go +[s3-style]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/VirtualHosting.html + + +[gitea-cr]: https://docs.gitea.com/usage/packages/container + + +[flannel-vxlan]: https://github.com/flannel-io/flannel/blob/master/Documentation/backends.md#vxlan +[coredns-k8s]: 
https://coredns.io/plugins/kubernetes/ +[ipvs]: https://kubernetes.io/blog/2018/07/09/ipvs-based-in-cluster-load-balancing-deep-dive/ +[vxlan-rfc]: https://datatracker.ietf.org/doc/html/rfc7348 + + +[cosign]: https://github.com/sigstore/cosign +[loki]: https://grafana.com/oss/loki/ +[stern]: https://github.com/stern/stern +[fail2ban]: https://www.fail2ban.org/ + + +[asynq]: https://github.com/hibiken/asynq +[asynq-sched]: https://github.com/hibiken/asynq/wiki/Periodic-Tasks + + +[le]: https://letsencrypt.org/ +[ufw-man]: https://manpages.ubuntu.com/manpages/noble/en/man8/ufw.8.html +[ssh-guide]: https://linux-audit.com/audit-and-harden-your-ssh-configuration/ +[pg-dump]: https://www.postgresql.org/docs/current/app-pgdump.html diff --git a/internal/router/router.go b/internal/router/router.go index 0ea6986..acc53e8 100644 --- a/internal/router/router.go +++ b/internal/router/router.go @@ -62,13 +62,24 @@ func SetupRouter(deps *Dependencies) *echo.Echo { e.Use(custommiddleware.StructuredLogger()) // Security headers (X-Frame-Options, X-Content-Type-Options, X-XSS-Protection, etc.) + // + // CSP is permissive enough to serve the marketing landing page at / (which + // loads same-origin CSS/JS/images and Google Fonts over https). JSON API + // responses are unaffected — they don't load any assets, so any CSP is fine. + // frame-ancestors stays 'none' to block clickjacking. 
e.Use(middleware.SecureWithConfig(middleware.SecureConfig{ - XSSProtection: "1; mode=block", - ContentTypeNosniff: "nosniff", - XFrameOptions: "SAMEORIGIN", - HSTSMaxAge: 31536000, // 1 year in seconds - ReferrerPolicy: "strict-origin-when-cross-origin", - ContentSecurityPolicy: "default-src 'none'; frame-ancestors 'none'", + XSSProtection: "1; mode=block", + ContentTypeNosniff: "nosniff", + XFrameOptions: "SAMEORIGIN", + HSTSMaxAge: 31536000, + ReferrerPolicy: "strict-origin-when-cross-origin", + ContentSecurityPolicy: "default-src 'self'; " + + "style-src 'self' https://fonts.googleapis.com; " + + "font-src 'self' https://fonts.gstatic.com data:; " + + "img-src 'self' data:; " + + "script-src 'self'; " + + "connect-src 'self'; " + + "frame-ancestors 'none'", })) e.Use(middleware.BodyLimitWithConfig(middleware.BodyLimitConfig{ Limit: "1M", // 1MB default for JSON payloads diff --git a/internal/services/cache_service.go b/internal/services/cache_service.go index 7b05c20..1bbc6ba 100644 --- a/internal/services/cache_service.go +++ b/internal/services/cache_service.go @@ -50,8 +50,12 @@ func NewCacheService(cfg *config.RedisConfig) (*CacheService, error) { if err := client.Ping(ctx).Err(); err != nil { initErr = fmt.Errorf("failed to connect to Redis: %w", err) - // Reset Once so a retry is possible after transient failures - cacheOnce = sync.Once{} + // NOTE: Don't reassign `cacheOnce = sync.Once{}` here. Mutating the + // Once from within its own Do() callback fatals with "unlock of + // unlocked mutex" because Do is holding the inner lock while we + // zero it. main.go handles the error (caching disabled, keep running); + // a pod restart is the right "retry" path for a transient Redis + // outage, not in-process. return }