Harden prod deploy: versioned secrets, healthchecks, migration lock, dry-run

Swarm stack
- Resource limits on all services, stop_grace_period 60s on api/worker/admin
- Dozzle bound to manager loopback only (ssh -L required for access)
- Worker health server on :6060, admin /api/health endpoint
- Redis 200M LRU cap, B2/S3 env vars wired through to api service

Deploy script
- DRY_RUN=1 prints plan + exits
- Auto-rollback on failed healthcheck, docker logout at end
- Versioned-secret pruning keeps last SECRET_KEEP_VERSIONS (default 3)
- PUSH_LATEST_TAG default flipped to false
- B2 all-or-none validation before deploy

Code
- cmd/api takes pg_advisory_lock on a dedicated connection before
  AutoMigrate, serialising boot-time migrations across replicas
- cmd/worker exposes an HTTP /health endpoint with graceful shutdown

Docs
- deploy/DEPLOYING.md: step-by-step walkthrough for a real deploy
- deploy/shit_deploy_cant_do.md: manual prerequisites + recurring ops
- deploy/README.md updated with storage toggle, worker-replica caveat,
  multi-arch recipe, connection-pool tuning, renumbered sections

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-04-14 15:22:43 -05:00
parent ca818e8478
commit 33eee812b6
11 changed files with 908 additions and 30 deletions

54
.dockerignore Normal file
View File

@@ -0,0 +1,54 @@
# Git
.git
.gitignore
.gitattributes
.github
.gitea
# Deploy inputs (never bake into images)
deploy/*.env
deploy/secrets/*.txt
deploy/secrets/*.p8
deploy/scripts/
# Local env files
.env
.env.*
!.env.example
# Node (admin)
admin/node_modules
admin/.next
admin/out
admin/.turbo
admin/.vercel
admin/npm-debug.log*
# Go build artifacts
bin/
dist/
tmp/
*.test
*.out
coverage.out
coverage.html
# Tooling / editor
.vscode
.idea
*.swp
*.swo
.DS_Store
# Logs
*.log
logs/
# Tests / docs (not needed at runtime)
docs/
*.md
!README.md
# CI/compose locals (not needed for swarm image build)
docker-compose*.yml
Makefile

View File

@@ -65,8 +65,10 @@ func main() {
log.Error().Err(dbErr).Msg("Failed to connect to database - API will start but database operations will fail") log.Error().Err(dbErr).Msg("Failed to connect to database - API will start but database operations will fail")
} else { } else {
defer database.Close() defer database.Close()
// Run database migrations only if connected // Run database migrations only if connected.
if err := database.Migrate(); err != nil { // MigrateWithLock serialises parallel replica starts via a Postgres
// advisory lock so concurrent AutoMigrate calls don't race on DDL.
if err := database.MigrateWithLock(); err != nil {
log.Error().Err(err).Msg("Failed to run database migrations") log.Error().Err(err).Msg("Failed to run database migrations")
} }
} }

View File

@@ -2,9 +2,11 @@ package main
import ( import (
"context" "context"
"net/http"
"os" "os"
"os/signal" "os/signal"
"syscall" "syscall"
"time"
"github.com/hibiken/asynq" "github.com/hibiken/asynq"
"github.com/redis/go-redis/v9" "github.com/redis/go-redis/v9"
@@ -20,6 +22,8 @@ import (
"github.com/treytartt/honeydue-api/pkg/utils" "github.com/treytartt/honeydue-api/pkg/utils"
) )
const workerHealthAddr = ":6060"
func main() { func main() {
// Initialize logger // Initialize logger
utils.InitLogger(true) utils.InitLogger(true)
@@ -188,6 +192,25 @@ func main() {
quit := make(chan os.Signal, 1) quit := make(chan os.Signal, 1)
signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM) signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)
// Health server (for container healthchecks; not externally published)
healthMux := http.NewServeMux()
healthMux.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusOK)
_, _ = w.Write([]byte(`{"status":"ok"}`))
})
healthSrv := &http.Server{
Addr: workerHealthAddr,
Handler: healthMux,
ReadHeaderTimeout: 5 * time.Second,
}
go func() {
log.Info().Str("addr", workerHealthAddr).Msg("Health server listening")
if err := healthSrv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
log.Warn().Err(err).Msg("Health server terminated")
}
}()
// Start scheduler in goroutine // Start scheduler in goroutine
go func() { go func() {
if err := scheduler.Run(); err != nil { if err := scheduler.Run(); err != nil {
@@ -207,6 +230,9 @@ func main() {
log.Info().Msg("Shutting down worker...") log.Info().Msg("Shutting down worker...")
// Graceful shutdown // Graceful shutdown
shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 5*time.Second)
defer shutdownCancel()
_ = healthSrv.Shutdown(shutdownCtx)
srv.Shutdown() srv.Shutdown()
scheduler.Shutdown() scheduler.Shutdown()

126
deploy/DEPLOYING.md Normal file
View File

@@ -0,0 +1,126 @@
# Deploying Right Now
Practical walkthrough for a prod deploy against the current Swarm stack.
Assumes infrastructure and cloud services already exist — if not, work
through [`shit_deploy_cant_do.md`](./shit_deploy_cant_do.md) first.
See [`README.md`](./README.md) for the reference docs that back each step.
---
## 0. Pre-flight — check local state
```bash
cd honeyDueAPI-go
git status # clean working tree?
git log -1 --oneline # deploying this SHA
ls deploy/cluster.env deploy/registry.env deploy/prod.env
ls deploy/secrets/*.txt deploy/secrets/*.p8
```
## 1. Reconcile your envs with current defaults
These two values **must** be right — the script does not enforce them:
```bash
# deploy/cluster.env
WORKER_REPLICAS=1 # >1 → duplicate cron jobs (Asynq scheduler is a singleton)
PUSH_LATEST_TAG=false # keeps prod images SHA-pinned
SECRET_KEEP_VERSIONS=3 # optional; 3 is the default
```
Decide storage backend in `deploy/prod.env`:
- **Multi-replica safe (recommended):** set all four of `B2_ENDPOINT`,
`B2_KEY_ID`, `B2_APP_KEY`, `B2_BUCKET_NAME`. Uploads go to B2.
- **Single-node ok:** leave all four empty. Script will warn. In this
mode you must also set `API_REPLICAS=1` — otherwise uploads are
invisible from 2/3 of requests.
## 2. Dry run
```bash
DRY_RUN=1 ./.deploy_prod
```
Confirm in the output:
- `Storage backend: S3 (...)` OR the `LOCAL VOLUME` warning matches intent
- `Replicas: api=3, worker=1, admin=1` (or `api=1` if local storage)
- Image SHA matches `git rev-parse --short HEAD`
- `Manager:` host is correct
- `Secret retention: 3 versions`
Fix envs and re-run until the plan looks right. Nothing touches the cluster yet.
## 3. Real deploy
```bash
./.deploy_prod
```
Do **not** pass `SKIP_BUILD=1` after code changes — the worker's health
server and `MigrateWithLock` both require a fresh build.
End-to-end: ~38 minutes. The script prints each phase.
## 4. Post-deploy verification
```bash
# Stack health (replicas X/X = desired)
ssh <manager> docker stack services honeydue
# API smoke
curl -fsS https://api.<domain>/api/health/ && echo OK
# Logs via Dozzle (loopback-bound, needs SSH tunnel)
ssh -p <port> -L 9999:127.0.0.1:9999 <user>@<manager>
# Then browse http://localhost:9999
```
What the logs should show on a healthy boot:
- `api`: one replica logs `Migration advisory lock acquired` immediately;
  each remaining replica logs the same line only after waiting its turn
  (its migration is then a no-op), followed by `released`.
- `worker`: `Health server listening addr=:6060`, `Starting worker server...`,
four `Registered ... job` lines.
- No `Failed to connect to Redis` / `Failed to connect to database`.
## 5. If it goes wrong
Auto-rollback triggers when `DEPLOY_HEALTHCHECK_URL` fails — every service
is rolled back to its previous spec, script exits non-zero.
Triage:
```bash
ssh <manager> docker service logs --tail 200 honeydue_api
ssh <manager> docker service ps honeydue_api --no-trunc
```
Manual rollback (if auto didn't catch it):
```bash
ssh <manager> bash -c '
for svc in $(docker stack services honeydue --format "{{.Name}}"); do
docker service rollback "$svc"
done'
```
Redeploy a known-good SHA:
```bash
DEPLOY_TAG=<older-sha> SKIP_BUILD=1 ./.deploy_prod
# Only valid if that image was previously pushed to the registry.
```
## 6. Pre-deploy honesty check
Before pulling the trigger:
- [ ] Tested Neon PITR restore (not just "backups exist")?
- [ ] `WORKER_REPLICAS=1` — otherwise duplicate push notifications next cron tick
- [ ] Cloudflare-only firewall rule on 80/443 — otherwise origin IP is on the public internet
- [ ] If storage is LOCAL, `API_REPLICAS=1` too
- [ ] Last deploy's secrets still valid (rotation hasn't expired any creds)

View File

@@ -2,13 +2,18 @@
This folder is the full production deploy toolkit for `honeyDueAPI-go`. This folder is the full production deploy toolkit for `honeyDueAPI-go`.
Run deploy with: **Recommended flow — always dry-run first:**
```bash ```bash
./.deploy_prod DRY_RUN=1 ./.deploy_prod # validates everything, prints the plan, no changes
./.deploy_prod # then the real deploy
``` ```
The script will refuse to run until all required values are set. The script refuses to run until all required values are set.
- Step-by-step walkthrough for a real deploy: [`DEPLOYING.md`](./DEPLOYING.md)
- Manual prerequisites the script cannot automate (Swarm init, firewall,
Cloudflare, Neon, APNS, etc.): [`shit_deploy_cant_do.md`](./shit_deploy_cant_do.md)
## First-Time Prerequisite: Create The Swarm Cluster ## First-Time Prerequisite: Create The Swarm Cluster
@@ -84,16 +89,159 @@ AllowUsers deploy
### 6) Dozzle Hardening ### 6) Dozzle Hardening
- Keep Dozzle private (no public DNS/ingress). Dozzle exposes the full Docker log stream with no built-in auth — logs contain
secrets, tokens, and user data. The stack binds Dozzle to `127.0.0.1` on the
manager node only (`mode: host`, `host_ip: 127.0.0.1`), so it is **not
reachable from the public internet or from other Swarm nodes**.
To view logs, open an SSH tunnel from your workstation:
```bash
ssh -p "${DEPLOY_MANAGER_SSH_PORT}" \
-L "${DOZZLE_PORT}:127.0.0.1:${DOZZLE_PORT}" \
"${DEPLOY_MANAGER_USER}@${DEPLOY_MANAGER_HOST}"
# Then browse http://localhost:${DOZZLE_PORT}
```
Additional hardening if you ever need to expose Dozzle over a network:
- Put auth/SSO in front (Cloudflare Access or equivalent). - Put auth/SSO in front (Cloudflare Access or equivalent).
- Prefer a Docker socket proxy with restricted read-only scope. - Replace the raw `/var/run/docker.sock` mount with a Docker socket proxy
limited to read-only log endpoints.
- Prefer a persistent log aggregator (Loki, Datadog, CloudWatch) for prod —
Dozzle is ephemeral and not a substitute for audit trails.
### 7) Backup + Restore Readiness ### 7) Backup + Restore Readiness
- Postgres PITR path tested in staging. Treat this as a pre-launch checklist. Nothing below is automated by
- Redis persistence enabled and restore path tested. `./.deploy_prod`.
- Written runbook for restore and secret rotation.
- Named owner for incident response. - [ ] Postgres PITR path tested in staging (restore a real dump, validate app boots).
- [x] Redis AOF persistence enabled (`appendonly yes --appendfsync everysec` in stack).
- [ ] Redis restore path tested (verify AOF replays on a fresh node).
- [ ] Written runbook for restore + secret rotation (see §4 and `shit_deploy_cant_do.md`).
- [ ] Named owner for incident response.
- [ ] Uploads bucket (Backblaze B2) lifecycle / versioning reviewed — deletes are
handled by the app, not by retention rules.
### 8) Storage Backend (Uploads)
The stack supports two storage backends. The choice is **runtime-only** — the
same image runs in both modes, selected by env vars in `prod.env`:
| Mode | When to use | Config |
|---|---|---|
| **Local volume** | Dev / single-node prod | Leave all `B2_*` empty. Files land on `/app/uploads` via the named volume. |
| **S3-compatible** (B2, MinIO) | Multi-replica prod | Set all four of `B2_ENDPOINT`, `B2_KEY_ID`, `B2_APP_KEY`, `B2_BUCKET_NAME`. |
The deploy script enforces **all-or-none** for the B2 vars — a partial config
fails fast rather than silently falling back to the local volume.
**Why this matters:** Docker Swarm named volumes are **per-node**. With 3 API
replicas spread across nodes, an upload written on node A is invisible to
replicas on nodes B and C (the client sees a random 404 two-thirds of the
time). In multi-replica prod you **must** use S3-compatible storage.
The `uploads:` volume is still declared as a harmless fallback: when B2 is
configured, nothing writes to it. `./.deploy_prod` prints the selected
backend at the start of each run.
### 9) Worker Replicas & Scheduler
Keep `WORKER_REPLICAS=1` in `cluster.env` until Asynq `PeriodicTaskManager`
is wired up. The current `asynq.Scheduler` in `cmd/worker/main.go` has no
Redis-based leader election, so each replica independently enqueues the
same cron task — users see duplicate daily digests / onboarding emails.
Asynq workers (task consumers) are already safe to scale horizontally; it's
only the scheduler singleton that is constrained. Future work: migrate to
`asynq.NewPeriodicTaskManager(...)` with `PeriodicTaskConfigProvider` so
multiple scheduler replicas coordinate via Redis.
### 10) Database Migrations
`cmd/api/main.go` runs `database.MigrateWithLock()` on startup, which takes a
Postgres session-level `pg_advisory_lock` on a dedicated connection before
calling `AutoMigrate`. This serialises boot-time migrations across all API
replicas — the first replica migrates, the rest wait, then each sees an
already-current schema and `AutoMigrate` is a no-op.
The lock is released on connection close, so a crashed replica can't leave
a stale lock behind.
For very large schema changes, run migrations as a separate pre-deploy
step (there is no dedicated `cmd/migrate` binary today — this is a future
improvement).
### 11) Redis Redundancy
Redis runs as a **single replica** with an AOF-persisted named volume. If
the node running Redis dies, Swarm reschedules the container but the named
volume is per-node — the new Redis boots **empty**.
Impact:
- **Cache** (ETag lookups, static data): regenerates on first request.
- **Asynq queue**: in-flight jobs at the moment of the crash are lost; Asynq
retry semantics cover most re-enqueues. Scheduled-but-not-yet-fired cron
events are re-triggered on the next cron tick.
- **Sessions / auth tokens**: not stored in Redis, so unaffected.
This is an accepted limitation today. Options to harden later: Redis
Sentinel, a managed Redis (Upstash, Dragonfly Cloud), or restoring from the
AOF on a pinned node.
### 12) Multi-Arch Builds
`./.deploy_prod` builds images for the **host** architecture of the machine
running the script. If your Swarm nodes are a different arch (e.g. ARM64
Ampere VMs), use `docker buildx` explicitly:
```bash
docker buildx create --use
docker buildx build --platform linux/arm64 --target api -t <image> --push .
# repeat for worker, admin
SKIP_BUILD=1 ./.deploy_prod # then deploy the already-pushed images
```
The Go stages cross-compile cleanly (`TARGETARCH` is already honoured).
The Node/admin stages require QEMU emulation (`docker run --privileged --rm
tonistiigi/binfmt --install all` on the build host) since native deps may
need to be rebuilt for the target arch.
### 13) Connection Pool & TLS Tuning
Because Postgres is external (Neon/RDS), each replica opens its own pool.
Sizing matters: total open connections across the cluster must stay under
the database's configured limit. Defaults in `prod.env.example`:
| Setting | Default | Notes |
|---|---|---|
| `DB_SSLMODE` | `require` | Never set to `disable` in prod. For Neon use `require`. |
| `DB_MAX_OPEN_CONNS` | `25` | Per-replica cap. Worst case: 25 × (API+worker replicas). |
| `DB_MAX_IDLE_CONNS` | `10` | Keep warm connections ready without exhausting the pool. |
| `DB_MAX_LIFETIME` | `600s` | Recycle before Neon's idle disconnect (typically 5 min). |
Worked example with default replicas (3 API + 1 worker — see §9 for why
worker is pinned to 1):
```
3 × 25 + 1 × 25 = 100 peak open connections
```
That lands exactly on Neon's free-tier ceiling (100 concurrent connections),
which is risky with even one transient spike. For Neon free tier drop
`DB_MAX_OPEN_CONNS=15` (→ 60 peak). Paid tiers (Neon Scale, 1000+
connections) can keep the default or raise it.
Operational checklist:
- Confirm Neon IP allowlist includes every Swarm node IP.
- After changing pool sizes, redeploy and watch `pg_stat_activity` /
Neon metrics for saturation.
- Keep `DB_MAX_LIFETIME` ≤ Neon idle timeout to avoid "terminating
connection due to administrator command" errors in the API logs.
- For read-heavy workloads, consider a Neon read replica and split
query traffic at the application layer.
## Files You Fill In ## Files You Fill In
@@ -113,20 +261,51 @@ If one is missing, the deploy script auto-copies it from its `.example` template
## What `./.deploy_prod` Does ## What `./.deploy_prod` Does
1. Validates all required config files and credentials. 1. Validates all required config files and credentials.
2. Builds and pushes `api`, `worker`, and `admin` images. 2. Validates the storage-backend toggle (all-or-none for `B2_*`). Prints
3. Uploads deploy bundle to your Swarm manager over SSH. the selected backend (S3 or local volume) before continuing.
4. Creates versioned Docker secrets on the manager. 3. Builds and pushes `api`, `worker`, and `admin` images (skip with
5. Deploys the stack with `docker stack deploy --with-registry-auth`. `SKIP_BUILD=1`).
6. Waits until service replicas converge. 4. Uploads deploy bundle to your Swarm manager over SSH.
7. Runs an HTTP health check (if `DEPLOY_HEALTHCHECK_URL` is set). 5. Creates versioned Docker secrets on the manager.
6. Deploys the stack with `docker stack deploy --with-registry-auth`.
7. Waits until service replicas converge.
8. Prunes old secret versions, keeping the last `SECRET_KEEP_VERSIONS`
(default 3).
9. Runs an HTTP health check (if `DEPLOY_HEALTHCHECK_URL` is set). **On
failure, automatically runs `docker service rollback` for every service
in the stack and exits non-zero.**
10. Logs out of the registry on both the dev host and the manager so the
token doesn't linger in `~/.docker/config.json`.
## Useful Flags ## Useful Flags
Environment flags: Environment flags:
- `SKIP_BUILD=1 ./.deploy_prod` to deploy already-pushed images. - `DRY_RUN=1 ./.deploy_prod` — validate config and print the deploy plan
- `SKIP_HEALTHCHECK=1 ./.deploy_prod` to skip final URL check. without building, pushing, or touching the cluster. Use this before every
- `DEPLOY_TAG=<tag> ./.deploy_prod` to deploy a specific image tag. production deploy to review images, replicas, and secret names.
- `SKIP_BUILD=1 ./.deploy_prod` — deploy already-pushed images.
- `SKIP_HEALTHCHECK=1 ./.deploy_prod` — skip final URL check.
- `DEPLOY_TAG=<tag> ./.deploy_prod` — deploy a specific image tag.
- `PUSH_LATEST_TAG=true ./.deploy_prod` — also push `:latest` to the registry
(default is `false` so prod pins to the SHA tag and stays reproducible).
- `SECRET_KEEP_VERSIONS=<n> ./.deploy_prod` — how many versions of each
Swarm secret to retain after deploy (default: 3). Older unused versions
are pruned automatically once the stack converges.
## Secret Versioning & Pruning
Each deploy creates a fresh set of Swarm secrets named
`<stack>_<secret>_<deploy_id>` (for example
`honeydue_secret_key_abc1234_20260413120000`). The stack file references the
current names via `${POSTGRES_PASSWORD_SECRET}` etc., so rolling updates never
reuse a secret that a running task still holds open.
After the new stack converges, `./.deploy_prod` SSHes to the manager and
prunes old versions per base name, keeping the most recent
`SECRET_KEEP_VERSIONS` (default 3). Anything still referenced by a running
task is left alone (Docker refuses to delete in-use secrets) and will be
pruned on the next deploy.
## Important ## Important

View File

@@ -12,11 +12,21 @@ DEPLOY_HEALTHCHECK_URL=https://api.honeyDue.treytartt.com/api/health/
# Replicas and published ports # Replicas and published ports
API_REPLICAS=3 API_REPLICAS=3
WORKER_REPLICAS=2 # IMPORTANT: keep WORKER_REPLICAS=1 until Asynq PeriodicTaskManager is wired.
# The current asynq.Scheduler in cmd/worker/main.go has no Redis-based
# leader election, so running >1 replica fires every cron task once per
# replica → duplicate daily digests / onboarding emails / etc.
WORKER_REPLICAS=1
ADMIN_REPLICAS=1 ADMIN_REPLICAS=1
API_PORT=8000 API_PORT=8000
ADMIN_PORT=3000 ADMIN_PORT=3000
DOZZLE_PORT=9999 DOZZLE_PORT=9999
# Build behavior # Build behavior
PUSH_LATEST_TAG=true # PUSH_LATEST_TAG=true also tags and pushes :latest on the registry.
# Leave false in production to keep image tags immutable (SHA-pinned only).
PUSH_LATEST_TAG=false
# Secret retention: number of versioned Swarm secrets to keep per name after each deploy.
# Older unused versions are pruned post-convergence. Default: 3.
SECRET_KEEP_VERSIONS=3

View File

@@ -50,6 +50,27 @@ STORAGE_BASE_URL=/uploads
STORAGE_MAX_FILE_SIZE=10485760 STORAGE_MAX_FILE_SIZE=10485760
STORAGE_ALLOWED_TYPES=image/jpeg,image/png,image/gif,image/webp,application/pdf STORAGE_ALLOWED_TYPES=image/jpeg,image/png,image/gif,image/webp,application/pdf
# Storage backend (S3-compatible: Backblaze B2 or MinIO)
#
# Leave all B2_* vars empty to use the local filesystem at STORAGE_UPLOAD_DIR.
# - Safe for single-node setups (dev / single-VPS prod).
# - NOT SAFE for multi-replica prod: named volumes are per-node in Swarm,
# so uploads written on one node are invisible to the other replicas.
#
# Set ALL FOUR of B2_ENDPOINT, B2_KEY_ID, B2_APP_KEY, B2_BUCKET_NAME to
# switch to S3-compatible storage. The deploy script enforces all-or-none.
#
# Example for Backblaze B2 (us-west-004):
# B2_ENDPOINT=s3.us-west-004.backblazeb2.com
# B2_USE_SSL=true
# B2_REGION=us-west-004
B2_ENDPOINT=
B2_KEY_ID=
B2_APP_KEY=
B2_BUCKET_NAME=
B2_USE_SSL=true
B2_REGION=us-east-1
# Feature flags # Feature flags
FEATURE_PUSH_ENABLED=true FEATURE_PUSH_ENABLED=true
FEATURE_EMAIL_ENABLED=true FEATURE_EMAIL_ENABLED=true

View File

@@ -18,6 +18,8 @@ SECRET_APNS_KEY="${DEPLOY_DIR}/secrets/apns_auth_key.p8"
SKIP_BUILD="${SKIP_BUILD:-0}" SKIP_BUILD="${SKIP_BUILD:-0}"
SKIP_HEALTHCHECK="${SKIP_HEALTHCHECK:-0}" SKIP_HEALTHCHECK="${SKIP_HEALTHCHECK:-0}"
DRY_RUN="${DRY_RUN:-0}"
SECRET_KEEP_VERSIONS="${SECRET_KEEP_VERSIONS:-3}"
log() { log() {
printf '[deploy] %s\n' "$*" printf '[deploy] %s\n' "$*"
@@ -91,9 +93,13 @@ Usage:
./.deploy_prod ./.deploy_prod
Optional environment flags: Optional environment flags:
SKIP_BUILD=1 Deploy existing image tags without rebuilding/pushing. DRY_RUN=1 Print the deployment plan and exit without changes.
SKIP_HEALTHCHECK=1 Skip final HTTP health check. SKIP_BUILD=1 Deploy existing image tags without rebuilding/pushing.
DEPLOY_TAG=<tag> Override image tag (default: git short sha). SKIP_HEALTHCHECK=1 Skip final HTTP health check.
DEPLOY_TAG=<tag> Override image tag (default: git short sha).
PUSH_LATEST_TAG=true|false Also tag/push :latest (default: false — SHA only).
SECRET_KEEP_VERSIONS=<n> How many versions of each Swarm secret to retain
(default: 3). Older unused versions are pruned.
EOF EOF
} }
@@ -144,7 +150,7 @@ DEPLOY_STACK_NAME="${DEPLOY_STACK_NAME:-honeydue}"
DEPLOY_REMOTE_DIR="${DEPLOY_REMOTE_DIR:-/opt/honeydue/deploy}" DEPLOY_REMOTE_DIR="${DEPLOY_REMOTE_DIR:-/opt/honeydue/deploy}"
DEPLOY_WAIT_SECONDS="${DEPLOY_WAIT_SECONDS:-420}" DEPLOY_WAIT_SECONDS="${DEPLOY_WAIT_SECONDS:-420}"
DEPLOY_TAG="${DEPLOY_TAG:-$(git -C "${REPO_DIR}" rev-parse --short HEAD)}" DEPLOY_TAG="${DEPLOY_TAG:-$(git -C "${REPO_DIR}" rev-parse --short HEAD)}"
PUSH_LATEST_TAG="${PUSH_LATEST_TAG:-true}" PUSH_LATEST_TAG="${PUSH_LATEST_TAG:-false}"
require_var DEPLOY_MANAGER_HOST require_var DEPLOY_MANAGER_HOST
require_var DEPLOY_MANAGER_USER require_var DEPLOY_MANAGER_USER
@@ -173,6 +179,27 @@ require_var APNS_AUTH_KEY_ID
require_var APNS_TEAM_ID require_var APNS_TEAM_ID
require_var APNS_TOPIC require_var APNS_TOPIC
# Storage backend validation: the four B2_* vars are all-or-none. Any real
# (non-placeholder) value present switches the stack to S3 mode and requires
# all four; all empty means "use the local uploads volume".
b2_any_set=0
b2_all_set=1
for b2_var in B2_ENDPOINT B2_KEY_ID B2_APP_KEY B2_BUCKET_NAME; do
  val="${!b2_var:-}"
  # A var counts as "set" only when non-empty AND not a template placeholder.
  if [[ -z "${val}" ]] || contains_placeholder "${val}"; then
    b2_all_set=0
  else
    b2_any_set=1
  fi
done
# Partial config (some set, some not) is the dangerous case: fail fast
# instead of silently falling back to the per-node local volume.
if (( b2_any_set && ! b2_all_set )); then
  die "Partial B2 configuration detected. Set all four of B2_ENDPOINT, B2_KEY_ID, B2_APP_KEY, B2_BUCKET_NAME, or leave all four empty to use the local volume."
fi
if (( b2_all_set )); then
  log "Storage backend: S3 (${B2_ENDPOINT} / bucket=${B2_BUCKET_NAME})"
else
  warn "Storage backend: LOCAL VOLUME. This is not safe for multi-replica prod — uploads will only exist on one node. Set B2_* in prod.env to use object storage."
fi
if [[ ! "$(tr -d '\r\n' < "${SECRET_APNS_KEY}")" =~ BEGIN[[:space:]]+PRIVATE[[:space:]]+KEY ]]; then if [[ ! "$(tr -d '\r\n' < "${SECRET_APNS_KEY}")" =~ BEGIN[[:space:]]+PRIVATE[[:space:]]+KEY ]]; then
die "APNS key file does not look like a private key: ${SECRET_APNS_KEY}" die "APNS key file does not look like a private key: ${SECRET_APNS_KEY}"
fi fi
@@ -200,6 +227,50 @@ if [[ -n "${SSH_KEY_PATH}" ]]; then
SCP_OPTS+=(-i "${SSH_KEY_PATH}") SCP_OPTS+=(-i "${SSH_KEY_PATH}")
fi fi
if [[ "${DRY_RUN}" == "1" ]]; then
cat <<EOF
==================== DRY RUN ====================
Validation passed. Would deploy:
Stack name: ${DEPLOY_STACK_NAME}
Manager: ${SSH_TARGET}:${DEPLOY_MANAGER_SSH_PORT}
Remote dir: ${DEPLOY_REMOTE_DIR}
Deploy tag: ${DEPLOY_TAG}
Push :latest: ${PUSH_LATEST_TAG}
Skip build: ${SKIP_BUILD}
Skip healthcheck: ${SKIP_HEALTHCHECK}
Secret retention: ${SECRET_KEEP_VERSIONS} versions per name
Images that would be built and pushed:
${API_IMAGE}
${WORKER_IMAGE}
${ADMIN_IMAGE}
Replicas:
api: ${API_REPLICAS:-3}
worker: ${WORKER_REPLICAS:-2}
admin: ${ADMIN_REPLICAS:-1}
Published ports:
api: ${API_PORT:-8000} (ingress)
admin: ${ADMIN_PORT:-3000} (ingress)
dozzle: ${DOZZLE_PORT:-9999} (manager loopback only — SSH tunnel required)
Versioned secrets that would be created on this deploy:
${DEPLOY_STACK_NAME}_postgres_password_<deploy_id>
${DEPLOY_STACK_NAME}_secret_key_<deploy_id>
${DEPLOY_STACK_NAME}_email_host_password_<deploy_id>
${DEPLOY_STACK_NAME}_fcm_server_key_<deploy_id>
${DEPLOY_STACK_NAME}_apns_auth_key_<deploy_id>
No changes made. Re-run without DRY_RUN=1 to deploy.
=================================================
EOF
exit 0
fi
log "Validating SSH access to ${SSH_TARGET}" log "Validating SSH access to ${SSH_TARGET}"
if ! ssh "${SSH_OPTS[@]}" "${SSH_TARGET}" "echo ok" >/dev/null 2>&1; then if ! ssh "${SSH_OPTS[@]}" "${SSH_TARGET}" "echo ok" >/dev/null 2>&1; then
die "SSH connection failed to ${SSH_TARGET}" die "SSH connection failed to ${SSH_TARGET}"
@@ -384,11 +455,77 @@ while true; do
sleep 10 sleep 10
done done
log "Pruning old secret versions (keeping last ${SECRET_KEEP_VERSIONS})"
ssh "${SSH_OPTS[@]}" "${SSH_TARGET}" "bash -s -- '${DEPLOY_STACK_NAME}' '${SECRET_KEEP_VERSIONS}'" <<'EOF' || warn "Secret pruning reported errors (non-fatal)"
set -euo pipefail
STACK_NAME="$1"
KEEP="$2"
# Prune old versions of one secret base name, keeping the newest $KEEP.
#
# BUG FIX: `docker secret ls --format '{{.CreatedAt}}'` prints human-relative
# durations ("2 hours ago", "3 days ago"), which do NOT sort chronologically
# — lexically "2 hours ago" < "3 minutes ago", so the old `sort -r` could
# prune NEWER versions and keep stale ones. `docker secret inspect` returns
# an absolute timestamp (RFC3339-style, fixed zone) whose lexical order IS
# chronological, so we sort on that instead.
prune_prefix() {
  local prefix="$1"
  local names name created all
  # All secret names for this base, e.g. <stack>_secret_key_<deploy_id>.
  names="$(docker secret ls --format '{{.Name}}' 2>/dev/null | grep "^${prefix}_" || true)"
  if [[ -z "${names}" ]]; then
    return 0
  fi
  # Build "timestamp|name" lines (names never contain '|'), newest first.
  all=""
  while IFS= read -r name; do
    [[ -z "${name}" ]] && continue
    created="$(docker secret inspect --format '{{.CreatedAt}}' "${name}" 2>/dev/null || true)"
    # Secret may have been removed between ls and inspect; skip quietly.
    [[ -z "${created}" ]] && continue
    all+="${created}|${name}"$'\n'
  done <<< "${names}"
  all="$(printf '%s' "${all}" | sort -r)"
  if [[ -z "${all}" ]]; then
    return 0
  fi
  local total
  total="$(printf '%s\n' "${all}" | wc -l | tr -d ' ')"
  if (( total <= KEEP )); then
    echo "[cleanup] ${prefix}: ${total} version(s) — nothing to prune"
    return 0
  fi
  # Everything past the first $KEEP entries is a prune candidate.
  local to_remove
  to_remove="$(printf '%s\n' "${all}" | tail -n +$((KEEP + 1)) | awk -F'|' '{print $2}')"
  while IFS= read -r name; do
    [[ -z "${name}" ]] && continue
    # Docker refuses to delete in-use secrets; treat that as "kept".
    if docker secret rm "${name}" >/dev/null 2>&1; then
      echo "[cleanup] removed: ${name}"
    else
      echo "[cleanup] in-use (kept): ${name}"
    fi
  done <<< "${to_remove}"
}
for base in postgres_password secret_key email_host_password fcm_server_key apns_auth_key; do
  prune_prefix "${STACK_NAME}_${base}"
done
EOF
# rollback_stack: best-effort rollback of every service in the stack to its
# previous spec. Called when the post-deploy health check fails. The trailing
# `|| true` on the ssh call keeps a rollback failure from masking the
# original deploy error (the caller still dies with its own message).
rollback_stack() {
warn "Rolling back stack ${DEPLOY_STACK_NAME} on ${SSH_TARGET}"
# Heredoc delimiter is quoted ('EOF'), so $STACK/$svc below expand on the
# manager, not on this host; only the stack name is passed as an argument.
ssh "${SSH_OPTS[@]}" "${SSH_TARGET}" "bash -s -- '${DEPLOY_STACK_NAME}'" <<'EOF' || true
# Remote script: disable errexit so one failed rollback doesn't stop the rest.
set +e
STACK="$1"
for svc in $(docker stack services "${STACK}" --format '{{.Name}}'); do
echo "[rollback] ${svc}"
docker service rollback "${svc}" || echo "[rollback] ${svc}: nothing to roll back"
done
EOF
}
if [[ "${SKIP_HEALTHCHECK}" != "1" && -n "${DEPLOY_HEALTHCHECK_URL:-}" ]]; then if [[ "${SKIP_HEALTHCHECK}" != "1" && -n "${DEPLOY_HEALTHCHECK_URL:-}" ]]; then
log "Running health check: ${DEPLOY_HEALTHCHECK_URL}" log "Running health check: ${DEPLOY_HEALTHCHECK_URL}"
curl -fsS --max-time 20 "${DEPLOY_HEALTHCHECK_URL}" >/dev/null if ! curl -fsS --max-time 20 "${DEPLOY_HEALTHCHECK_URL}" >/dev/null; then
warn "Health check FAILED for ${DEPLOY_HEALTHCHECK_URL}"
rollback_stack
die "Deploy rolled back due to failed health check."
fi
fi fi
# Best-effort registry logout — the token should not linger in
# ~/.docker/config.json after deploy completes. Failures are non-fatal.
log "Logging out of registry (local + remote)"
docker logout "${REGISTRY}" >/dev/null 2>&1 || true
ssh "${SSH_OPTS[@]}" "${SSH_TARGET}" "docker logout '${REGISTRY}' >/dev/null 2>&1 || true"
log "Deploy completed successfully." log "Deploy completed successfully."
log "Stack: ${DEPLOY_STACK_NAME}" log "Stack: ${DEPLOY_STACK_NAME}"
log "Images:" log "Images:"

View File

@@ -0,0 +1,208 @@
# Shit `./.deploy_prod` Can't Do
Everything listed here is **manual**. The deploy script orchestrates builds,
secrets, and the stack — it does not provision infrastructure, touch DNS,
configure Cloudflare, or rotate external credentials. Work through this list
once before your first prod deploy, then revisit after every cloud-side
change.
See [`README.md`](./README.md) for the security checklist that complements
this file.
---
## One-Time: Infrastructure
### Swarm Cluster
- [ ] Provision manager + worker VMs (Hetzner, DO, etc.).
- [ ] `docker swarm init --advertise-addr <manager-private-ip>` on manager #1.
- [ ] `docker swarm join-token {manager,worker}` → join additional nodes.
- [ ] `docker node ls` to verify — all nodes `Ready` and `Active`.
- [ ] Label nodes if you want placement constraints beyond the defaults.
### Node Hardening (every node)
- [ ] SSH: non-default port, key-only auth, no root login — see README §2.
- [ ] Firewall: allow 22 (or 2222), 80, 443 from CF IPs only; 2377/tcp,
7946/tcp+udp, 4789/udp Swarm-nodes only; block the rest — see README §1.
- [ ] Install unattended-upgrades (or equivalent) for security patches.
- [ ] Disable password auth in `/etc/ssh/sshd_config`.
- [ ] Create the `deploy` user (`AllowUsers deploy` in sshd_config).
### DNS + Cloudflare
- [ ] Add A records for `api.<domain>`, `admin.<domain>` pointing to the LB
or manager IPs. Keep them **proxied** (orange cloud).
- [ ] Create a Cloudflare tunnel or enable "Authenticated Origin Pulls" if
you want to lock the origin to CF only.
- [ ] Firewall rule on the nodes: only accept 80/443 from Cloudflare IP ranges
(<https://www.cloudflare.com/ips/>).
- [ ] Configure CF Access (or equivalent SSO) in front of admin panel if
exposing it publicly.
---
## One-Time: External Services
### Postgres (Neon)
- [ ] Create project + database (`honeydue`).
- [ ] Create a dedicated DB user with least privilege — not the project owner.
- [ ] Enable IP allowlist, add every Swarm node's egress IP.
- [ ] Verify `DB_SSLMODE=require` works end-to-end.
- [ ] Turn on PITR (paid tier) or schedule automated `pg_dump` backups.
- [ ] Do one restore drill — boot a staging stack from a real backup. If you
haven't done this, you do not have backups.
### Redis
- Redis runs **inside** the stack on a named volume. No external setup
needed today. See README §11 — this is an accepted SPOF.
- [ ] If you move Redis external (Upstash, Dragonfly Cloud): update
`REDIS_URL` in `prod.env`, remove the `redis` service + volume from
the stack.
### Backblaze B2 (or MinIO)
Skip this section if you're running a single-node prod and are OK with
uploads on a local volume. Required for multi-replica prod — see README §8.
- [ ] Create B2 account + bucket (private).
- [ ] Create a **scoped** application key bound to that single bucket —
not the master key.
- [ ] Set lifecycle rules: keep only the current version of each file,
or whatever matches your policy.
- [ ] Populate `B2_ENDPOINT`, `B2_KEY_ID`, `B2_APP_KEY`, `B2_BUCKET_NAME`
in `deploy/prod.env`. Optionally set `B2_USE_SSL` and `B2_REGION`.
- [ ] Verify uploads round-trip across replicas after the first deploy
(upload a file via client A → fetch via client B in a different session).
### APNS (Apple Push)
- [ ] Create an APNS auth key (`.p8`) in the Apple Developer portal.
- [ ] Save to `deploy/secrets/apns_auth_key.p8` — the script enforces it
contains a real `-----BEGIN PRIVATE KEY-----` block.
- [ ] Fill `APNS_AUTH_KEY_ID`, `APNS_TEAM_ID`, `APNS_TOPIC` (bundle ID) in
`deploy/prod.env`.
- [ ] Decide `APNS_USE_SANDBOX` / `APNS_PRODUCTION` based on build target.
### FCM (Android Push)
- [ ] Create Firebase project + legacy server key (or migrate to HTTP v1 —
the code currently uses the legacy server key).
- [ ] Save to `deploy/secrets/fcm_server_key.txt`.
### SMTP (Email)
- [ ] Provision SMTP credentials (Gmail app password, SES, Postmark, etc.).
- [ ] Fill `EMAIL_HOST`, `EMAIL_PORT`, `EMAIL_HOST_USER`,
`DEFAULT_FROM_EMAIL`, `EMAIL_USE_TLS` in `deploy/prod.env`.
- [ ] Save the password to `deploy/secrets/email_host_password.txt`.
- [ ] Verify SPF, DKIM, DMARC on the sending domain if you care about
deliverability.
### Registry (GHCR / other)
- [ ] Create a personal access token with `write:packages` + `read:packages`.
- [ ] Fill `REGISTRY`, `REGISTRY_NAMESPACE`, `REGISTRY_USERNAME`,
`REGISTRY_TOKEN` in `deploy/registry.env`.
- [ ] Rotate the token on a schedule (quarterly at minimum).
### Apple / Google IAP (optional)
- [ ] Apple: create App Store Connect API key, fill the `APPLE_IAP_*` vars.
- [ ] Google: create a service account with Play Developer API access,
store JSON at a path referenced by `GOOGLE_IAP_SERVICE_ACCOUNT_PATH`.
---
## Recurring Operations
### Secret Rotation
After any compromise, annually at minimum, and when a team member leaves:
1. Generate the new value (e.g. `openssl rand -base64 32 > deploy/secrets/secret_key.txt`).
2. `./.deploy_prod` — creates a new versioned Swarm secret and redeploys
services to pick it up.
3. The old secret lingers until `SECRET_KEEP_VERSIONS` bumps it out (see
README "Secret Versioning & Pruning").
4. For external creds (Neon, B2, APNS, etc.) rotate at the provider first,
update the local secret file, then redeploy.
### Backup Drills
- [ ] Quarterly: pull a Neon backup, restore to a scratch project, boot a
staging stack against it, verify login + basic reads.
- [ ] Monthly: spot-check that B2 objects are actually present and the
app key still works.
- [ ] After any schema change: confirm PITR coverage includes the new
columns before relying on it.
### Certificate Management
- TLS is terminated by Cloudflare today, so there are no origin certs to
renew. If you ever move TLS on-origin (Traefik, Caddy), automate renewal
— don't add it to this list and expect it to happen.
### Multi-Arch Builds
`./.deploy_prod` builds for the host arch. If target ≠ host:
- [ ] Enable buildx: `docker buildx create --use`.
- [ ] Install QEMU: `docker run --privileged --rm tonistiigi/binfmt --install all`.
- [ ] Build + push images manually per target platform.
- [ ] Run `SKIP_BUILD=1 ./.deploy_prod` so the script just deploys.
### Node Maintenance / Rolling Upgrades
- [ ] `docker node update --availability drain <node>` before OS upgrades.
- [ ] Reboot, verify, then `docker node update --availability active <node>`.
- [ ] Re-converge with `docker stack deploy -c swarm-stack.prod.yml honeydue`.
---
## Incident Response
### Redis Node Dies
Named volume is per-node and doesn't follow. Accept the loss:
1. Let Swarm reschedule Redis on a new node.
2. In-flight Asynq jobs are lost; retry semantics cover most of them.
3. Scheduled cron events fire again on the next tick (hourly for smart
reminders and daily digest; daily for onboarding + cleanup).
4. Cache repopulates on first request.
### Deploy Rolled Back Automatically
`./.deploy_prod` triggers `docker service rollback` on every service in the
stack when the post-deploy health check against `DEPLOY_HEALTHCHECK_URL` fails. Diagnose with:
```bash
ssh <manager> docker stack services honeydue
ssh <manager> docker service logs --tail 200 honeydue_api
# Or open an SSH tunnel to Dozzle: ssh -L 9999:127.0.0.1:9999 <manager>
```
### Lost Ability to Deploy
- Registry token revoked → regenerate, update `deploy/registry.env`, re-run.
- Manager host key changed → verify legitimacy, update `~/.ssh/known_hosts`.
- All secrets accidentally pruned → restore the `deploy/secrets/*` files
locally and redeploy; new Swarm secret versions will be created.
---
## Known Gaps (Future Work)
- No dedicated `cmd/migrate` binary — migrations run at API boot (see
README §10). Large schema changes still need manual coordination.
- `asynq.Scheduler` has no leader election; `WORKER_REPLICAS` must stay 1
until we migrate to `asynq.PeriodicTaskManager` (README §9).
- No Prometheus / Grafana / alerting in the stack. `/metrics` is exposed
on the API but nothing scrapes it.
- No automated TLS renewal on-origin — add if you ever move off Cloudflare.
- No staging environment wired to the deploy script — `DEPLOY_TAG=<sha>`
is the closest thing. A proper staging flow is future work.

View File

@@ -3,7 +3,7 @@ version: "3.8"
services: services:
redis: redis:
image: redis:7-alpine image: redis:7-alpine
command: redis-server --appendonly yes --appendfsync everysec command: redis-server --appendonly yes --appendfsync everysec --maxmemory 200mb --maxmemory-policy allkeys-lru
volumes: volumes:
- redis_data:/data - redis_data:/data
healthcheck: healthcheck:
@@ -18,6 +18,13 @@ services:
delay: 5s delay: 5s
placement: placement:
max_replicas_per_node: 1 max_replicas_per_node: 1
resources:
limits:
cpus: "0.50"
memory: 256M
reservations:
cpus: "0.10"
memory: 64M
networks: networks:
- honeydue-network - honeydue-network
@@ -67,6 +74,17 @@ services:
STORAGE_MAX_FILE_SIZE: "${STORAGE_MAX_FILE_SIZE}" STORAGE_MAX_FILE_SIZE: "${STORAGE_MAX_FILE_SIZE}"
STORAGE_ALLOWED_TYPES: "${STORAGE_ALLOWED_TYPES}" STORAGE_ALLOWED_TYPES: "${STORAGE_ALLOWED_TYPES}"
# S3-compatible object storage (Backblaze B2, MinIO). When all B2_* vars
# are set, uploads/media are stored in the bucket and the local volume
# mount becomes a no-op fallback. Required for multi-replica prod —
# without it uploads only exist on one node.
B2_ENDPOINT: "${B2_ENDPOINT}"
B2_KEY_ID: "${B2_KEY_ID}"
B2_APP_KEY: "${B2_APP_KEY}"
B2_BUCKET_NAME: "${B2_BUCKET_NAME}"
B2_USE_SSL: "${B2_USE_SSL}"
B2_REGION: "${B2_REGION}"
FEATURE_PUSH_ENABLED: "${FEATURE_PUSH_ENABLED}" FEATURE_PUSH_ENABLED: "${FEATURE_PUSH_ENABLED}"
FEATURE_EMAIL_ENABLED: "${FEATURE_EMAIL_ENABLED}" FEATURE_EMAIL_ENABLED: "${FEATURE_EMAIL_ENABLED}"
FEATURE_WEBHOOKS_ENABLED: "${FEATURE_WEBHOOKS_ENABLED}" FEATURE_WEBHOOKS_ENABLED: "${FEATURE_WEBHOOKS_ENABLED}"
@@ -86,6 +104,7 @@ services:
APPLE_IAP_SANDBOX: "${APPLE_IAP_SANDBOX}" APPLE_IAP_SANDBOX: "${APPLE_IAP_SANDBOX}"
GOOGLE_IAP_SERVICE_ACCOUNT_PATH: "${GOOGLE_IAP_SERVICE_ACCOUNT_PATH}" GOOGLE_IAP_SERVICE_ACCOUNT_PATH: "${GOOGLE_IAP_SERVICE_ACCOUNT_PATH}"
GOOGLE_IAP_PACKAGE_NAME: "${GOOGLE_IAP_PACKAGE_NAME}" GOOGLE_IAP_PACKAGE_NAME: "${GOOGLE_IAP_PACKAGE_NAME}"
stop_grace_period: 60s
command: command:
- /bin/sh - /bin/sh
- -lc - -lc
@@ -128,6 +147,13 @@ services:
parallelism: 1 parallelism: 1
delay: 5s delay: 5s
order: stop-first order: stop-first
resources:
limits:
cpus: "1.00"
memory: 512M
reservations:
cpus: "0.25"
memory: 128M
networks: networks:
- honeydue-network - honeydue-network
@@ -142,10 +168,12 @@ services:
PORT: "3000" PORT: "3000"
HOSTNAME: "0.0.0.0" HOSTNAME: "0.0.0.0"
NEXT_PUBLIC_API_URL: "${NEXT_PUBLIC_API_URL}" NEXT_PUBLIC_API_URL: "${NEXT_PUBLIC_API_URL}"
stop_grace_period: 60s
healthcheck: healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:3000/admin/"] test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:3000/api/health"]
interval: 30s interval: 30s
timeout: 10s timeout: 10s
start_period: 20s
retries: 3 retries: 3
deploy: deploy:
replicas: ${ADMIN_REPLICAS} replicas: ${ADMIN_REPLICAS}
@@ -160,6 +188,13 @@ services:
parallelism: 1 parallelism: 1
delay: 5s delay: 5s
order: stop-first order: stop-first
resources:
limits:
cpus: "0.50"
memory: 384M
reservations:
cpus: "0.10"
memory: 128M
networks: networks:
- honeydue-network - honeydue-network
@@ -201,6 +236,7 @@ services:
FEATURE_ONBOARDING_EMAILS_ENABLED: "${FEATURE_ONBOARDING_EMAILS_ENABLED}" FEATURE_ONBOARDING_EMAILS_ENABLED: "${FEATURE_ONBOARDING_EMAILS_ENABLED}"
FEATURE_PDF_REPORTS_ENABLED: "${FEATURE_PDF_REPORTS_ENABLED}" FEATURE_PDF_REPORTS_ENABLED: "${FEATURE_PDF_REPORTS_ENABLED}"
FEATURE_WORKER_ENABLED: "${FEATURE_WORKER_ENABLED}" FEATURE_WORKER_ENABLED: "${FEATURE_WORKER_ENABLED}"
stop_grace_period: 60s
command: command:
- /bin/sh - /bin/sh
- -lc - -lc
@@ -222,6 +258,12 @@ services:
target: fcm_server_key target: fcm_server_key
- source: ${APNS_AUTH_KEY_SECRET} - source: ${APNS_AUTH_KEY_SECRET}
target: apns_auth_key target: apns_auth_key
healthcheck:
test: ["CMD", "curl", "-f", "http://127.0.0.1:6060/health"]
interval: 30s
timeout: 10s
start_period: 15s
retries: 3
deploy: deploy:
replicas: ${WORKER_REPLICAS} replicas: ${WORKER_REPLICAS}
restart_policy: restart_policy:
@@ -235,16 +277,28 @@ services:
parallelism: 1 parallelism: 1
delay: 5s delay: 5s
order: stop-first order: stop-first
resources:
limits:
cpus: "1.00"
memory: 512M
reservations:
cpus: "0.25"
memory: 128M
networks: networks:
- honeydue-network - honeydue-network
dozzle: dozzle:
# NOTE: Dozzle exposes the full Docker log stream with no built-in auth.
# Bound to manager loopback only — access via SSH tunnel:
# ssh -L ${DOZZLE_PORT}:127.0.0.1:${DOZZLE_PORT} <manager>
# Then browse http://localhost:${DOZZLE_PORT}
image: amir20/dozzle:latest image: amir20/dozzle:latest
ports: ports:
- target: 8080 - target: 8080
published: ${DOZZLE_PORT} published: ${DOZZLE_PORT}
protocol: tcp protocol: tcp
mode: ingress mode: host
host_ip: 127.0.0.1
environment: environment:
DOZZLE_NO_ANALYTICS: "true" DOZZLE_NO_ANALYTICS: "true"
volumes: volumes:
@@ -257,6 +311,13 @@ services:
placement: placement:
constraints: constraints:
- node.role == manager - node.role == manager
resources:
limits:
cpus: "0.25"
memory: 128M
reservations:
cpus: "0.05"
memory: 32M
networks: networks:
- honeydue-network - honeydue-network

View File

@@ -1,6 +1,7 @@
package database package database
import ( import (
"context"
"fmt" "fmt"
"time" "time"
@@ -15,6 +16,11 @@ import (
"github.com/treytartt/honeydue-api/internal/models" "github.com/treytartt/honeydue-api/internal/models"
) )
// migrationAdvisoryLockKey is the pg_advisory_lock key that serializes
// Migrate() across API replicas booting in parallel. Value is arbitrary but
// stable ("hdmg" as bytes = honeydue migration).
// NOTE: keep this value stable — old and new replicas must agree on the key
// during a rolling deploy, or they would migrate concurrently.
const migrationAdvisoryLockKey int64 = 0x68646d67
// zerologGormWriter adapts zerolog for GORM's logger interface // zerologGormWriter adapts zerolog for GORM's logger interface
type zerologGormWriter struct{} type zerologGormWriter struct{}
@@ -121,6 +127,54 @@ func Paginate(page, pageSize int) func(db *gorm.DB) *gorm.DB {
} }
} }
// MigrateWithLock runs Migrate() under a Postgres session-level advisory lock
// so that multiple API replicas booting in parallel don't race on AutoMigrate.
// On non-Postgres dialects (sqlite in tests) it falls through to Migrate().
func MigrateWithLock() error {
	if db == nil {
		return fmt.Errorf("database not initialised")
	}
	// Only Postgres supports advisory locks; other dialects migrate directly.
	if db.Dialector.Name() != "postgres" {
		return Migrate()
	}

	pool, err := db.DB()
	if err != nil {
		return fmt.Errorf("get underlying sql.DB: %w", err)
	}

	// Allow up to 5 minutes to obtain the lock: generous enough for a slow
	// migration running on a peer replica, bounded so a hung Postgres fails
	// fast instead of blocking boot forever.
	acquireCtx, cancelAcquire := context.WithTimeout(context.Background(), 5*time.Minute)
	defer cancelAcquire()

	// A dedicated connection pins the session-level lock to a single session
	// for the whole migration (pooled connections would not guarantee that).
	lockConn, err := pool.Conn(acquireCtx)
	if err != nil {
		return fmt.Errorf("acquire dedicated migration connection: %w", err)
	}
	defer lockConn.Close()

	log.Info().Int64("lock_key", migrationAdvisoryLockKey).Msg("Acquiring migration advisory lock...")
	if _, err := lockConn.ExecContext(acquireCtx, "SELECT pg_advisory_lock($1)", migrationAdvisoryLockKey); err != nil {
		return fmt.Errorf("pg_advisory_lock: %w", err)
	}
	log.Info().Msg("Migration advisory lock acquired")

	defer func() {
		// Release with a fresh context — acquireCtx may already be expired.
		// Runs before lockConn.Close (LIFO), so the session is still open.
		unlockCtx, cancelUnlock := context.WithTimeout(context.Background(), 10*time.Second)
		defer cancelUnlock()
		if _, err := lockConn.ExecContext(unlockCtx, "SELECT pg_advisory_unlock($1)", migrationAdvisoryLockKey); err != nil {
			log.Warn().Err(err).Msg("Failed to release migration advisory lock (session close will also release)")
		} else {
			log.Info().Msg("Migration advisory lock released")
		}
	}()

	return Migrate()
}
// Migrate runs database migrations for all models // Migrate runs database migrations for all models
func Migrate() error { func Migrate() error {
log.Info().Msg("Running database migrations...") log.Info().Msg("Running database migrations...")