Harden prod deploy: versioned secrets, healthchecks, migration lock, dry-run
Swarm stack - Resource limits on all services, stop_grace_period 60s on api/worker/admin - Dozzle bound to manager loopback only (ssh -L required for access) - Worker health server on :6060, admin /api/health endpoint - Redis 200M LRU cap, B2/S3 env vars wired through to api service Deploy script - DRY_RUN=1 prints plan + exits - Auto-rollback on failed healthcheck, docker logout at end - Versioned-secret pruning keeps last SECRET_KEEP_VERSIONS (default 3) - PUSH_LATEST_TAG default flipped to false - B2 all-or-none validation before deploy Code - cmd/api takes pg_advisory_lock on a dedicated connection before AutoMigrate, serialising boot-time migrations across replicas - cmd/worker exposes an HTTP /health endpoint with graceful shutdown Docs - deploy/DEPLOYING.md: step-by-step walkthrough for a real deploy - deploy/shit_deploy_cant_do.md: manual prerequisites + recurring ops - deploy/README.md updated with storage toggle, worker-replica caveat, multi-arch recipe, connection-pool tuning, renumbered sections Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
54
.dockerignore
Normal file
54
.dockerignore
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
# Git
|
||||||
|
.git
|
||||||
|
.gitignore
|
||||||
|
.gitattributes
|
||||||
|
.github
|
||||||
|
.gitea
|
||||||
|
|
||||||
|
# Deploy inputs (never bake into images)
|
||||||
|
deploy/*.env
|
||||||
|
deploy/secrets/*.txt
|
||||||
|
deploy/secrets/*.p8
|
||||||
|
deploy/scripts/
|
||||||
|
|
||||||
|
# Local env files
|
||||||
|
.env
|
||||||
|
.env.*
|
||||||
|
!.env.example
|
||||||
|
|
||||||
|
# Node (admin)
|
||||||
|
admin/node_modules
|
||||||
|
admin/.next
|
||||||
|
admin/out
|
||||||
|
admin/.turbo
|
||||||
|
admin/.vercel
|
||||||
|
admin/npm-debug.log*
|
||||||
|
|
||||||
|
# Go build artifacts
|
||||||
|
bin/
|
||||||
|
dist/
|
||||||
|
tmp/
|
||||||
|
*.test
|
||||||
|
*.out
|
||||||
|
coverage.out
|
||||||
|
coverage.html
|
||||||
|
|
||||||
|
# Tooling / editor
|
||||||
|
.vscode
|
||||||
|
.idea
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
.DS_Store
|
||||||
|
|
||||||
|
# Logs
|
||||||
|
*.log
|
||||||
|
logs/
|
||||||
|
|
||||||
|
# Tests / docs (not needed at runtime)
|
||||||
|
docs/
|
||||||
|
*.md
|
||||||
|
!README.md
|
||||||
|
|
||||||
|
# CI/compose locals (not needed for swarm image build)
|
||||||
|
docker-compose*.yml
|
||||||
|
Makefile
|
||||||
@@ -65,8 +65,10 @@ func main() {
|
|||||||
log.Error().Err(dbErr).Msg("Failed to connect to database - API will start but database operations will fail")
|
log.Error().Err(dbErr).Msg("Failed to connect to database - API will start but database operations will fail")
|
||||||
} else {
|
} else {
|
||||||
defer database.Close()
|
defer database.Close()
|
||||||
// Run database migrations only if connected
|
// Run database migrations only if connected.
|
||||||
if err := database.Migrate(); err != nil {
|
// MigrateWithLock serialises parallel replica starts via a Postgres
|
||||||
|
// advisory lock so concurrent AutoMigrate calls don't race on DDL.
|
||||||
|
if err := database.MigrateWithLock(); err != nil {
|
||||||
log.Error().Err(err).Msg("Failed to run database migrations")
|
log.Error().Err(err).Msg("Failed to run database migrations")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,9 +2,11 @@ package main
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
"os/signal"
|
"os/signal"
|
||||||
"syscall"
|
"syscall"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/hibiken/asynq"
|
"github.com/hibiken/asynq"
|
||||||
"github.com/redis/go-redis/v9"
|
"github.com/redis/go-redis/v9"
|
||||||
@@ -20,6 +22,8 @@ import (
|
|||||||
"github.com/treytartt/honeydue-api/pkg/utils"
|
"github.com/treytartt/honeydue-api/pkg/utils"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// workerHealthAddr is the listen address of the worker's HTTP health server
// (all interfaces, port 6060).
const workerHealthAddr = ":6060"
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
// Initialize logger
|
// Initialize logger
|
||||||
utils.InitLogger(true)
|
utils.InitLogger(true)
|
||||||
@@ -188,6 +192,25 @@ func main() {
|
|||||||
quit := make(chan os.Signal, 1)
|
quit := make(chan os.Signal, 1)
|
||||||
signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)
|
signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)
|
||||||
|
|
||||||
|
// Health server (for container healthchecks; not externally published)
|
||||||
|
healthMux := http.NewServeMux()
|
||||||
|
healthMux.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
w.Header().Set("Content-Type", "application/json")
|
||||||
|
w.WriteHeader(http.StatusOK)
|
||||||
|
_, _ = w.Write([]byte(`{"status":"ok"}`))
|
||||||
|
})
|
||||||
|
healthSrv := &http.Server{
|
||||||
|
Addr: workerHealthAddr,
|
||||||
|
Handler: healthMux,
|
||||||
|
ReadHeaderTimeout: 5 * time.Second,
|
||||||
|
}
|
||||||
|
go func() {
|
||||||
|
log.Info().Str("addr", workerHealthAddr).Msg("Health server listening")
|
||||||
|
if err := healthSrv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
|
||||||
|
log.Warn().Err(err).Msg("Health server terminated")
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
// Start scheduler in goroutine
|
// Start scheduler in goroutine
|
||||||
go func() {
|
go func() {
|
||||||
if err := scheduler.Run(); err != nil {
|
if err := scheduler.Run(); err != nil {
|
||||||
@@ -207,6 +230,9 @@ func main() {
|
|||||||
log.Info().Msg("Shutting down worker...")
|
log.Info().Msg("Shutting down worker...")
|
||||||
|
|
||||||
// Graceful shutdown
|
// Graceful shutdown
|
||||||
|
shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||||
|
defer shutdownCancel()
|
||||||
|
_ = healthSrv.Shutdown(shutdownCtx)
|
||||||
srv.Shutdown()
|
srv.Shutdown()
|
||||||
scheduler.Shutdown()
|
scheduler.Shutdown()
|
||||||
|
|
||||||
|
|||||||
126
deploy/DEPLOYING.md
Normal file
126
deploy/DEPLOYING.md
Normal file
@@ -0,0 +1,126 @@
|
|||||||
|
# Deploying Right Now
|
||||||
|
|
||||||
|
Practical walkthrough for a prod deploy against the current Swarm stack.
|
||||||
|
Assumes infrastructure and cloud services already exist — if not, work
|
||||||
|
through [`shit_deploy_cant_do.md`](./shit_deploy_cant_do.md) first.
|
||||||
|
|
||||||
|
See [`README.md`](./README.md) for the reference docs that back each step.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 0. Pre-flight — check local state
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd honeyDueAPI-go
|
||||||
|
|
||||||
|
git status # clean working tree?
|
||||||
|
git log -1 --oneline # deploying this SHA
|
||||||
|
|
||||||
|
ls deploy/cluster.env deploy/registry.env deploy/prod.env
|
||||||
|
ls deploy/secrets/*.txt deploy/secrets/*.p8
|
||||||
|
```
|
||||||
|
|
||||||
|
## 1. Reconcile your envs with current defaults
|
||||||
|
|
||||||
|
`WORKER_REPLICAS` and `PUSH_LATEST_TAG` **must** be right — the script does not enforce them:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# deploy/cluster.env
|
||||||
|
WORKER_REPLICAS=1 # >1 → duplicate cron jobs (Asynq scheduler is a singleton)
|
||||||
|
PUSH_LATEST_TAG=false # keeps prod images SHA-pinned
|
||||||
|
SECRET_KEEP_VERSIONS=3 # optional; 3 is the default
|
||||||
|
```
|
||||||
|
|
||||||
|
Decide storage backend in `deploy/prod.env`:
|
||||||
|
|
||||||
|
- **Multi-replica safe (recommended):** set all four of `B2_ENDPOINT`,
|
||||||
|
`B2_KEY_ID`, `B2_APP_KEY`, `B2_BUCKET_NAME`. Uploads go to B2.
|
||||||
|
- **Single-node ok:** leave all four empty. Script will warn. In this
|
||||||
|
mode you must also set `API_REPLICAS=1` — otherwise uploads are
|
||||||
|
invisible to roughly 2/3 of requests (with the default 3 API replicas).
|
||||||
|
|
||||||
|
## 2. Dry run
|
||||||
|
|
||||||
|
```bash
|
||||||
|
DRY_RUN=1 ./.deploy_prod
|
||||||
|
```
|
||||||
|
|
||||||
|
Confirm in the output:
|
||||||
|
- `Storage backend: S3 (...)` OR the `LOCAL VOLUME` warning matches intent
|
||||||
|
- `Replicas: api=3, worker=1, admin=1` (or `api=1` if local storage)
|
||||||
|
- Image SHA matches `git rev-parse --short HEAD`
|
||||||
|
- `Manager:` host is correct
|
||||||
|
- `Secret retention: 3 versions`
|
||||||
|
|
||||||
|
Fix envs and re-run until the plan looks right. Nothing touches the cluster yet.
|
||||||
|
|
||||||
|
## 3. Real deploy
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./.deploy_prod
|
||||||
|
```
|
||||||
|
|
||||||
|
Do **not** pass `SKIP_BUILD=1` after code changes — the worker's health
|
||||||
|
server and `MigrateWithLock` both require a fresh build.
|
||||||
|
|
||||||
|
End-to-end: ~3–8 minutes. The script prints each phase.
|
||||||
|
|
||||||
|
## 4. Post-deploy verification
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Stack health (replicas X/X = desired)
|
||||||
|
ssh <manager> docker stack services honeydue
|
||||||
|
|
||||||
|
# API smoke
|
||||||
|
curl -fsS https://api.<domain>/api/health/ && echo OK
|
||||||
|
|
||||||
|
# Logs via Dozzle (loopback-bound, needs SSH tunnel)
|
||||||
|
ssh -p <port> -L 9999:127.0.0.1:9999 <user>@<manager>
|
||||||
|
# Then browse http://localhost:9999
|
||||||
|
```
|
||||||
|
|
||||||
|
What the logs should show on a healthy boot:
|
||||||
|
- `api`: the first replica logs `Migration advisory lock acquired` immediately;
|
||||||
|
the others log `Migration advisory lock acquired` after waiting, then
|
||||||
|
`released`.
|
||||||
|
- `worker`: `Health server listening addr=:6060`, `Starting worker server...`,
|
||||||
|
four `Registered ... job` lines.
|
||||||
|
- No `Failed to connect to Redis` / `Failed to connect to database`.
|
||||||
|
|
||||||
|
## 5. If it goes wrong
|
||||||
|
|
||||||
|
Auto-rollback triggers when the health check against `DEPLOY_HEALTHCHECK_URL` fails — every service
|
||||||
|
is rolled back to its previous spec, script exits non-zero.
|
||||||
|
|
||||||
|
Triage:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ssh <manager> docker service logs --tail 200 honeydue_api
|
||||||
|
ssh <manager> docker service ps honeydue_api --no-trunc
|
||||||
|
```
|
||||||
|
|
||||||
|
Manual rollback (if auto didn't catch it):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ssh <manager> bash -c '
|
||||||
|
for svc in $(docker stack services honeydue --format "{{.Name}}"); do
|
||||||
|
docker service rollback "$svc"
|
||||||
|
done'
|
||||||
|
```
|
||||||
|
|
||||||
|
Redeploy a known-good SHA:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
DEPLOY_TAG=<older-sha> SKIP_BUILD=1 ./.deploy_prod
|
||||||
|
# Only valid if that image was previously pushed to the registry.
|
||||||
|
```
|
||||||
|
|
||||||
|
## 6. Pre-deploy honesty check
|
||||||
|
|
||||||
|
Before pulling the trigger:
|
||||||
|
|
||||||
|
- [ ] Tested Neon PITR restore (not just "backups exist")?
|
||||||
|
- [ ] `WORKER_REPLICAS=1` — otherwise duplicate push notifications next cron tick
|
||||||
|
- [ ] Cloudflare-only firewall rule on 80/443 — otherwise origin IP is on the public internet
|
||||||
|
- [ ] If storage is LOCAL, `API_REPLICAS=1` too
|
||||||
|
- [ ] Last deploy's secrets still valid (rotation hasn't expired any creds)
|
||||||
215
deploy/README.md
215
deploy/README.md
@@ -2,13 +2,18 @@
|
|||||||
|
|
||||||
This folder is the full production deploy toolkit for `honeyDueAPI-go`.
|
This folder is the full production deploy toolkit for `honeyDueAPI-go`.
|
||||||
|
|
||||||
Run deploy with:
|
**Recommended flow — always dry-run first:**
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./.deploy_prod
|
DRY_RUN=1 ./.deploy_prod # validates everything, prints the plan, no changes
|
||||||
|
./.deploy_prod # then the real deploy
|
||||||
```
|
```
|
||||||
|
|
||||||
The script will refuse to run until all required values are set.
|
The script refuses to run until all required values are set.
|
||||||
|
|
||||||
|
- Step-by-step walkthrough for a real deploy: [`DEPLOYING.md`](./DEPLOYING.md)
|
||||||
|
- Manual prerequisites the script cannot automate (Swarm init, firewall,
|
||||||
|
Cloudflare, Neon, APNS, etc.): [`shit_deploy_cant_do.md`](./shit_deploy_cant_do.md)
|
||||||
|
|
||||||
## First-Time Prerequisite: Create The Swarm Cluster
|
## First-Time Prerequisite: Create The Swarm Cluster
|
||||||
|
|
||||||
@@ -84,16 +89,159 @@ AllowUsers deploy
|
|||||||
|
|
||||||
### 6) Dozzle Hardening
|
### 6) Dozzle Hardening
|
||||||
|
|
||||||
- Keep Dozzle private (no public DNS/ingress).
|
Dozzle exposes the full Docker log stream with no built-in auth — logs contain
|
||||||
|
secrets, tokens, and user data. The stack binds Dozzle to `127.0.0.1` on the
|
||||||
|
manager node only (`mode: host`, `host_ip: 127.0.0.1`), so it is **not
|
||||||
|
reachable from the public internet or from other Swarm nodes**.
|
||||||
|
|
||||||
|
To view logs, open an SSH tunnel from your workstation:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ssh -p "${DEPLOY_MANAGER_SSH_PORT}" \
|
||||||
|
-L "${DOZZLE_PORT}:127.0.0.1:${DOZZLE_PORT}" \
|
||||||
|
"${DEPLOY_MANAGER_USER}@${DEPLOY_MANAGER_HOST}"
|
||||||
|
# Then browse http://localhost:${DOZZLE_PORT}
|
||||||
|
```
|
||||||
|
|
||||||
|
Additional hardening if you ever need to expose Dozzle over a network:
|
||||||
|
|
||||||
- Put auth/SSO in front (Cloudflare Access or equivalent).
|
- Put auth/SSO in front (Cloudflare Access or equivalent).
|
||||||
- Prefer a Docker socket proxy with restricted read-only scope.
|
- Replace the raw `/var/run/docker.sock` mount with a Docker socket proxy
|
||||||
|
limited to read-only log endpoints.
|
||||||
|
- Prefer a persistent log aggregator (Loki, Datadog, CloudWatch) for prod —
|
||||||
|
Dozzle is ephemeral and not a substitute for audit trails.
|
||||||
|
|
||||||
### 7) Backup + Restore Readiness
|
### 7) Backup + Restore Readiness
|
||||||
|
|
||||||
- Postgres PITR path tested in staging.
|
Treat this as a pre-launch checklist. Nothing below is automated by
|
||||||
- Redis persistence enabled and restore path tested.
|
`./.deploy_prod`.
|
||||||
- Written runbook for restore and secret rotation.
|
|
||||||
- Named owner for incident response.
|
- [ ] Postgres PITR path tested in staging (restore a real dump, validate app boots).
|
||||||
|
- [x] Redis AOF persistence enabled (`appendonly yes --appendfsync everysec` in stack).
|
||||||
|
- [ ] Redis restore path tested (verify AOF replays on a fresh node).
|
||||||
|
- [ ] Written runbook for restore + secret rotation (see §4 and `shit_deploy_cant_do.md`).
|
||||||
|
- [ ] Named owner for incident response.
|
||||||
|
- [ ] Uploads bucket (Backblaze B2) lifecycle / versioning reviewed — deletes are
|
||||||
|
handled by the app, not by retention rules.
|
||||||
|
|
||||||
|
### 8) Storage Backend (Uploads)
|
||||||
|
|
||||||
|
The stack supports two storage backends. The choice is **runtime-only** — the
|
||||||
|
same image runs in both modes, selected by env vars in `prod.env`:
|
||||||
|
|
||||||
|
| Mode | When to use | Config |
|
||||||
|
|---|---|---|
|
||||||
|
| **Local volume** | Dev / single-node prod | Leave all `B2_*` empty. Files land on `/app/uploads` via the named volume. |
|
||||||
|
| **S3-compatible** (B2, MinIO) | Multi-replica prod | Set all four of `B2_ENDPOINT`, `B2_KEY_ID`, `B2_APP_KEY`, `B2_BUCKET_NAME`. |
|
||||||
|
|
||||||
|
The deploy script enforces **all-or-none** for the B2 vars — a partial config
|
||||||
|
fails fast rather than silently falling back to the local volume.
|
||||||
|
|
||||||
|
**Why this matters:** Docker Swarm named volumes are **per-node**. With 3 API
|
||||||
|
replicas spread across nodes, an upload written on node A is invisible to
|
||||||
|
replicas on nodes B and C (the client sees a random 404 two-thirds of the
|
||||||
|
time). In multi-replica prod you **must** use S3-compatible storage.
|
||||||
|
|
||||||
|
The `uploads:` volume is still declared as a harmless fallback: when B2 is
|
||||||
|
configured, nothing writes to it. `./.deploy_prod` prints the selected
|
||||||
|
backend at the start of each run.
|
||||||
|
|
||||||
|
### 9) Worker Replicas & Scheduler
|
||||||
|
|
||||||
|
Keep `WORKER_REPLICAS=1` in `cluster.env` until Asynq `PeriodicTaskManager`
|
||||||
|
is wired up. The current `asynq.Scheduler` in `cmd/worker/main.go` has no
|
||||||
|
Redis-based leader election, so each replica independently enqueues the
|
||||||
|
same cron task — users see duplicate daily digests / onboarding emails.
|
||||||
|
|
||||||
|
Asynq workers (task consumers) are already safe to scale horizontally; it's
|
||||||
|
only the scheduler singleton that is constrained. Future work: migrate to
|
||||||
|
`asynq.NewPeriodicTaskManager(...)` with `PeriodicTaskConfigProvider` so
|
||||||
|
multiple scheduler replicas coordinate via Redis.
|
||||||
|
|
||||||
|
### 10) Database Migrations
|
||||||
|
|
||||||
|
`cmd/api/main.go` runs `database.MigrateWithLock()` on startup, which takes a
|
||||||
|
Postgres session-level `pg_advisory_lock` on a dedicated connection before
|
||||||
|
calling `AutoMigrate`. This serialises boot-time migrations across all API
|
||||||
|
replicas — the first replica migrates, the rest wait, then each sees an
|
||||||
|
already-current schema and `AutoMigrate` is a no-op.
|
||||||
|
|
||||||
|
The lock is released on connection close, so a crashed replica can't leave
|
||||||
|
a stale lock behind.
|
||||||
|
|
||||||
|
For very large schema changes, run migrations as a separate pre-deploy
|
||||||
|
step (there is no dedicated `cmd/migrate` binary today — this is a future
|
||||||
|
improvement).
|
||||||
|
|
||||||
|
### 11) Redis Redundancy
|
||||||
|
|
||||||
|
Redis runs as a **single replica** with an AOF-persisted named volume. If
|
||||||
|
the node running Redis dies, Swarm reschedules the container but the named
|
||||||
|
volume is per-node — the new Redis boots **empty**.
|
||||||
|
|
||||||
|
Impact:
|
||||||
|
- **Cache** (ETag lookups, static data): regenerates on first request.
|
||||||
|
- **Asynq queue**: in-flight jobs at the moment of the crash are lost; Asynq
|
||||||
|
retry semantics cover most re-enqueues. Scheduled-but-not-yet-fired cron
|
||||||
|
events are re-triggered on the next cron tick.
|
||||||
|
- **Sessions / auth tokens**: not stored in Redis, so unaffected.
|
||||||
|
|
||||||
|
This is an accepted limitation today. Options to harden later: Redis
|
||||||
|
Sentinel, a managed Redis (Upstash, Dragonfly Cloud), or restoring from the
|
||||||
|
AOF on a pinned node.
|
||||||
|
|
||||||
|
### 12) Multi-Arch Builds
|
||||||
|
|
||||||
|
`./.deploy_prod` builds images for the **host** architecture of the machine
|
||||||
|
running the script. If your Swarm nodes are a different arch (e.g. ARM64
|
||||||
|
Ampere VMs), use `docker buildx` explicitly:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker buildx create --use
|
||||||
|
docker buildx build --platform linux/arm64 --target api -t <image> --push .
|
||||||
|
# repeat for worker, admin
|
||||||
|
SKIP_BUILD=1 ./.deploy_prod # then deploy the already-pushed images
|
||||||
|
```
|
||||||
|
|
||||||
|
The Go stages cross-compile cleanly (`TARGETARCH` is already honoured).
|
||||||
|
The Node/admin stages require QEMU emulation (`docker run --privileged --rm
|
||||||
|
tonistiigi/binfmt --install all` on the build host) since native deps may
|
||||||
|
need to be rebuilt for the target arch.
|
||||||
|
|
||||||
|
### 13) Connection Pool & TLS Tuning
|
||||||
|
|
||||||
|
Because Postgres is external (Neon/RDS), each replica opens its own pool.
|
||||||
|
Sizing matters: total open connections across the cluster must stay under
|
||||||
|
the database's configured limit. Defaults in `prod.env.example`:
|
||||||
|
|
||||||
|
| Setting | Default | Notes |
|
||||||
|
|---|---|---|
|
||||||
|
| `DB_SSLMODE` | `require` | Never set to `disable` in prod. For Neon use `require`. |
|
||||||
|
| `DB_MAX_OPEN_CONNS` | `25` | Per-replica cap. Worst case: 25 × (API+worker replicas). |
|
||||||
|
| `DB_MAX_IDLE_CONNS` | `10` | Keep warm connections ready without exhausting the pool. |
|
||||||
|
| `DB_MAX_LIFETIME` | `600s` | Recycle before Neon's idle disconnect (typically 5 min). Note: 600s is 10 min, so lower this value if it must stay within that window. |
|
||||||
|
|
||||||
|
Worked example with default replicas (3 API + 1 worker — see §9 for why
|
||||||
|
worker is pinned to 1):
|
||||||
|
|
||||||
|
```
|
||||||
|
3 × 25 + 1 × 25 = 100 peak open connections
|
||||||
|
```
|
||||||
|
|
||||||
|
That lands exactly on Neon's free-tier ceiling (100 concurrent connections),
|
||||||
|
which is risky with even one transient spike. For Neon free tier, set
|
||||||
|
`DB_MAX_OPEN_CONNS=15` (→ 60 peak). Paid tiers (Neon Scale, 1000+
|
||||||
|
connections) can keep the default or raise it.
|
||||||
|
|
||||||
|
Operational checklist:
|
||||||
|
|
||||||
|
- Confirm Neon IP allowlist includes every Swarm node IP.
|
||||||
|
- After changing pool sizes, redeploy and watch `pg_stat_activity` /
|
||||||
|
Neon metrics for saturation.
|
||||||
|
- Keep `DB_MAX_LIFETIME` ≤ Neon idle timeout to avoid "terminating
|
||||||
|
connection due to administrator command" errors in the API logs.
|
||||||
|
- For read-heavy workloads, consider a Neon read replica and split
|
||||||
|
query traffic at the application layer.
|
||||||
|
|
||||||
## Files You Fill In
|
## Files You Fill In
|
||||||
|
|
||||||
@@ -113,20 +261,51 @@ If one is missing, the deploy script auto-copies it from its `.example` template
|
|||||||
## What `./.deploy_prod` Does
|
## What `./.deploy_prod` Does
|
||||||
|
|
||||||
1. Validates all required config files and credentials.
|
1. Validates all required config files and credentials.
|
||||||
2. Builds and pushes `api`, `worker`, and `admin` images.
|
2. Validates the storage-backend toggle (all-or-none for `B2_*`). Prints
|
||||||
3. Uploads deploy bundle to your Swarm manager over SSH.
|
the selected backend (S3 or local volume) before continuing.
|
||||||
4. Creates versioned Docker secrets on the manager.
|
3. Builds and pushes `api`, `worker`, and `admin` images (skip with
|
||||||
5. Deploys the stack with `docker stack deploy --with-registry-auth`.
|
`SKIP_BUILD=1`).
|
||||||
6. Waits until service replicas converge.
|
4. Uploads deploy bundle to your Swarm manager over SSH.
|
||||||
7. Runs an HTTP health check (if `DEPLOY_HEALTHCHECK_URL` is set).
|
5. Creates versioned Docker secrets on the manager.
|
||||||
|
6. Deploys the stack with `docker stack deploy --with-registry-auth`.
|
||||||
|
7. Waits until service replicas converge.
|
||||||
|
8. Prunes old secret versions, keeping the last `SECRET_KEEP_VERSIONS`
|
||||||
|
(default 3).
|
||||||
|
9. Runs an HTTP health check (if `DEPLOY_HEALTHCHECK_URL` is set). **On
|
||||||
|
failure, automatically runs `docker service rollback` for every service
|
||||||
|
in the stack and exits non-zero.**
|
||||||
|
10. Logs out of the registry on both the dev host and the manager so the
|
||||||
|
token doesn't linger in `~/.docker/config.json`.
|
||||||
|
|
||||||
## Useful Flags
|
## Useful Flags
|
||||||
|
|
||||||
Environment flags:
|
Environment flags:
|
||||||
|
|
||||||
- `SKIP_BUILD=1 ./.deploy_prod` to deploy already-pushed images.
|
- `DRY_RUN=1 ./.deploy_prod` — validate config and print the deploy plan
|
||||||
- `SKIP_HEALTHCHECK=1 ./.deploy_prod` to skip final URL check.
|
without building, pushing, or touching the cluster. Use this before every
|
||||||
- `DEPLOY_TAG=<tag> ./.deploy_prod` to deploy a specific image tag.
|
production deploy to review images, replicas, and secret names.
|
||||||
|
- `SKIP_BUILD=1 ./.deploy_prod` — deploy already-pushed images.
|
||||||
|
- `SKIP_HEALTHCHECK=1 ./.deploy_prod` — skip final URL check.
|
||||||
|
- `DEPLOY_TAG=<tag> ./.deploy_prod` — deploy a specific image tag.
|
||||||
|
- `PUSH_LATEST_TAG=true ./.deploy_prod` — also push `:latest` to the registry
|
||||||
|
(default is `false` so prod pins to the SHA tag and stays reproducible).
|
||||||
|
- `SECRET_KEEP_VERSIONS=<n> ./.deploy_prod` — how many versions of each
|
||||||
|
Swarm secret to retain after deploy (default: 3). Older unused versions
|
||||||
|
are pruned automatically once the stack converges.
|
||||||
|
|
||||||
|
## Secret Versioning & Pruning
|
||||||
|
|
||||||
|
Each deploy creates a fresh set of Swarm secrets named
|
||||||
|
`<stack>_<secret>_<deploy_id>` (for example
|
||||||
|
`honeydue_secret_key_abc1234_20260413120000`). The stack file references the
|
||||||
|
current names via `${POSTGRES_PASSWORD_SECRET}` etc., so rolling updates never
|
||||||
|
reuse a secret that a running task still holds open.
|
||||||
|
|
||||||
|
After the new stack converges, `./.deploy_prod` SSHes to the manager and
|
||||||
|
prunes old versions per base name, keeping the most recent
|
||||||
|
`SECRET_KEEP_VERSIONS` (default 3). Anything still referenced by a running
|
||||||
|
task is left alone (Docker refuses to delete in-use secrets) and will be
|
||||||
|
pruned on the next deploy.
|
||||||
|
|
||||||
## Important
|
## Important
|
||||||
|
|
||||||
|
|||||||
@@ -12,11 +12,21 @@ DEPLOY_HEALTHCHECK_URL=https://api.honeyDue.treytartt.com/api/health/
|
|||||||
|
|
||||||
# Replicas and published ports
|
# Replicas and published ports
|
||||||
API_REPLICAS=3
|
API_REPLICAS=3
|
||||||
WORKER_REPLICAS=2
|
# IMPORTANT: keep WORKER_REPLICAS=1 until Asynq PeriodicTaskManager is wired.
|
||||||
|
# The current asynq.Scheduler in cmd/worker/main.go has no Redis-based
|
||||||
|
# leader election, so running >1 replica fires every cron task once per
|
||||||
|
# replica → duplicate daily digests / onboarding emails / etc.
|
||||||
|
WORKER_REPLICAS=1
|
||||||
ADMIN_REPLICAS=1
|
ADMIN_REPLICAS=1
|
||||||
API_PORT=8000
|
API_PORT=8000
|
||||||
ADMIN_PORT=3000
|
ADMIN_PORT=3000
|
||||||
DOZZLE_PORT=9999
|
DOZZLE_PORT=9999
|
||||||
|
|
||||||
# Build behavior
|
# Build behavior
|
||||||
PUSH_LATEST_TAG=true
|
# PUSH_LATEST_TAG=true also tags and pushes :latest on the registry.
|
||||||
|
# Leave false in production to keep image tags immutable (SHA-pinned only).
|
||||||
|
PUSH_LATEST_TAG=false
|
||||||
|
|
||||||
|
# Secret retention: number of versioned Swarm secrets to keep per name after each deploy.
|
||||||
|
# Older unused versions are pruned post-convergence. Default: 3.
|
||||||
|
SECRET_KEEP_VERSIONS=3
|
||||||
|
|||||||
@@ -50,6 +50,27 @@ STORAGE_BASE_URL=/uploads
|
|||||||
STORAGE_MAX_FILE_SIZE=10485760
|
STORAGE_MAX_FILE_SIZE=10485760
|
||||||
STORAGE_ALLOWED_TYPES=image/jpeg,image/png,image/gif,image/webp,application/pdf
|
STORAGE_ALLOWED_TYPES=image/jpeg,image/png,image/gif,image/webp,application/pdf
|
||||||
|
|
||||||
|
# Storage backend (S3-compatible: Backblaze B2 or MinIO)
|
||||||
|
#
|
||||||
|
# Leave all B2_* vars empty to use the local filesystem at STORAGE_UPLOAD_DIR.
|
||||||
|
# - Safe for single-node setups (dev / single-VPS prod).
|
||||||
|
# - NOT SAFE for multi-replica prod: named volumes are per-node in Swarm,
|
||||||
|
# so uploads written on one node are invisible to the other replicas.
|
||||||
|
#
|
||||||
|
# Set ALL FOUR of B2_ENDPOINT, B2_KEY_ID, B2_APP_KEY, B2_BUCKET_NAME to
|
||||||
|
# switch to S3-compatible storage. The deploy script enforces all-or-none.
|
||||||
|
#
|
||||||
|
# Example for Backblaze B2 (us-west-004):
|
||||||
|
# B2_ENDPOINT=s3.us-west-004.backblazeb2.com
|
||||||
|
# B2_USE_SSL=true
|
||||||
|
# B2_REGION=us-west-004
|
||||||
|
B2_ENDPOINT=
|
||||||
|
B2_KEY_ID=
|
||||||
|
B2_APP_KEY=
|
||||||
|
B2_BUCKET_NAME=
|
||||||
|
B2_USE_SSL=true
|
||||||
|
B2_REGION=us-east-1
|
||||||
|
|
||||||
# Feature flags
|
# Feature flags
|
||||||
FEATURE_PUSH_ENABLED=true
|
FEATURE_PUSH_ENABLED=true
|
||||||
FEATURE_EMAIL_ENABLED=true
|
FEATURE_EMAIL_ENABLED=true
|
||||||
|
|||||||
@@ -18,6 +18,8 @@ SECRET_APNS_KEY="${DEPLOY_DIR}/secrets/apns_auth_key.p8"
|
|||||||
|
|
||||||
SKIP_BUILD="${SKIP_BUILD:-0}"
|
SKIP_BUILD="${SKIP_BUILD:-0}"
|
||||||
SKIP_HEALTHCHECK="${SKIP_HEALTHCHECK:-0}"
|
SKIP_HEALTHCHECK="${SKIP_HEALTHCHECK:-0}"
|
||||||
|
DRY_RUN="${DRY_RUN:-0}"
|
||||||
|
SECRET_KEEP_VERSIONS="${SECRET_KEEP_VERSIONS:-3}"
|
||||||
|
|
||||||
log() {
|
log() {
|
||||||
printf '[deploy] %s\n' "$*"
|
printf '[deploy] %s\n' "$*"
|
||||||
@@ -91,9 +93,13 @@ Usage:
|
|||||||
./.deploy_prod
|
./.deploy_prod
|
||||||
|
|
||||||
Optional environment flags:
|
Optional environment flags:
|
||||||
|
DRY_RUN=1 Print the deployment plan and exit without changes.
|
||||||
SKIP_BUILD=1 Deploy existing image tags without rebuilding/pushing.
|
SKIP_BUILD=1 Deploy existing image tags without rebuilding/pushing.
|
||||||
SKIP_HEALTHCHECK=1 Skip final HTTP health check.
|
SKIP_HEALTHCHECK=1 Skip final HTTP health check.
|
||||||
DEPLOY_TAG=<tag> Override image tag (default: git short sha).
|
DEPLOY_TAG=<tag> Override image tag (default: git short sha).
|
||||||
|
PUSH_LATEST_TAG=true|false Also tag/push :latest (default: false — SHA only).
|
||||||
|
SECRET_KEEP_VERSIONS=<n> How many versions of each Swarm secret to retain
|
||||||
|
(default: 3). Older unused versions are pruned.
|
||||||
EOF
|
EOF
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -144,7 +150,7 @@ DEPLOY_STACK_NAME="${DEPLOY_STACK_NAME:-honeydue}"
|
|||||||
DEPLOY_REMOTE_DIR="${DEPLOY_REMOTE_DIR:-/opt/honeydue/deploy}"
|
DEPLOY_REMOTE_DIR="${DEPLOY_REMOTE_DIR:-/opt/honeydue/deploy}"
|
||||||
DEPLOY_WAIT_SECONDS="${DEPLOY_WAIT_SECONDS:-420}"
|
DEPLOY_WAIT_SECONDS="${DEPLOY_WAIT_SECONDS:-420}"
|
||||||
DEPLOY_TAG="${DEPLOY_TAG:-$(git -C "${REPO_DIR}" rev-parse --short HEAD)}"
|
DEPLOY_TAG="${DEPLOY_TAG:-$(git -C "${REPO_DIR}" rev-parse --short HEAD)}"
|
||||||
PUSH_LATEST_TAG="${PUSH_LATEST_TAG:-true}"
|
PUSH_LATEST_TAG="${PUSH_LATEST_TAG:-false}"
|
||||||
|
|
||||||
require_var DEPLOY_MANAGER_HOST
|
require_var DEPLOY_MANAGER_HOST
|
||||||
require_var DEPLOY_MANAGER_USER
|
require_var DEPLOY_MANAGER_USER
|
||||||
@@ -173,6 +179,27 @@ require_var APNS_AUTH_KEY_ID
|
|||||||
require_var APNS_TEAM_ID
|
require_var APNS_TEAM_ID
|
||||||
require_var APNS_TOPIC
|
require_var APNS_TOPIC
|
||||||
|
|
||||||
|
# Storage backend validation: B2 is all-or-none. If any var is filled with
|
||||||
|
# a real value, require all four core vars. Empty means "use local volume".
|
||||||
|
b2_any_set=0
|
||||||
|
b2_all_set=1
|
||||||
|
for b2_var in B2_ENDPOINT B2_KEY_ID B2_APP_KEY B2_BUCKET_NAME; do
|
||||||
|
val="${!b2_var:-}"
|
||||||
|
if [[ -n "${val}" ]] && ! contains_placeholder "${val}"; then
|
||||||
|
b2_any_set=1
|
||||||
|
else
|
||||||
|
b2_all_set=0
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
if (( b2_any_set == 1 && b2_all_set == 0 )); then
|
||||||
|
die "Partial B2 configuration detected. Set all four of B2_ENDPOINT, B2_KEY_ID, B2_APP_KEY, B2_BUCKET_NAME, or leave all four empty to use the local volume."
|
||||||
|
fi
|
||||||
|
if (( b2_all_set == 1 )); then
|
||||||
|
log "Storage backend: S3 (${B2_ENDPOINT} / bucket=${B2_BUCKET_NAME})"
|
||||||
|
else
|
||||||
|
warn "Storage backend: LOCAL VOLUME. This is not safe for multi-replica prod — uploads will only exist on one node. Set B2_* in prod.env to use object storage."
|
||||||
|
fi
|
||||||
|
|
||||||
if [[ ! "$(tr -d '\r\n' < "${SECRET_APNS_KEY}")" =~ BEGIN[[:space:]]+PRIVATE[[:space:]]+KEY ]]; then
|
if [[ ! "$(tr -d '\r\n' < "${SECRET_APNS_KEY}")" =~ BEGIN[[:space:]]+PRIVATE[[:space:]]+KEY ]]; then
|
||||||
die "APNS key file does not look like a private key: ${SECRET_APNS_KEY}"
|
die "APNS key file does not look like a private key: ${SECRET_APNS_KEY}"
|
||||||
fi
|
fi
|
||||||
@@ -200,6 +227,50 @@ if [[ -n "${SSH_KEY_PATH}" ]]; then
|
|||||||
SCP_OPTS+=(-i "${SSH_KEY_PATH}")
|
SCP_OPTS+=(-i "${SSH_KEY_PATH}")
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [[ "${DRY_RUN}" == "1" ]]; then
|
||||||
|
cat <<EOF
|
||||||
|
|
||||||
|
==================== DRY RUN ====================
|
||||||
|
Validation passed. Would deploy:
|
||||||
|
|
||||||
|
Stack name: ${DEPLOY_STACK_NAME}
|
||||||
|
Manager: ${SSH_TARGET}:${DEPLOY_MANAGER_SSH_PORT}
|
||||||
|
Remote dir: ${DEPLOY_REMOTE_DIR}
|
||||||
|
Deploy tag: ${DEPLOY_TAG}
|
||||||
|
Push :latest: ${PUSH_LATEST_TAG}
|
||||||
|
Skip build: ${SKIP_BUILD}
|
||||||
|
Skip healthcheck: ${SKIP_HEALTHCHECK}
|
||||||
|
Secret retention: ${SECRET_KEEP_VERSIONS} versions per name
|
||||||
|
|
||||||
|
Images that would be built and pushed:
|
||||||
|
${API_IMAGE}
|
||||||
|
${WORKER_IMAGE}
|
||||||
|
${ADMIN_IMAGE}
|
||||||
|
|
||||||
|
Replicas:
|
||||||
|
api: ${API_REPLICAS:-3}
|
||||||
|
worker: ${WORKER_REPLICAS:-2}
|
||||||
|
admin: ${ADMIN_REPLICAS:-1}
|
||||||
|
|
||||||
|
Published ports:
|
||||||
|
api: ${API_PORT:-8000} (ingress)
|
||||||
|
admin: ${ADMIN_PORT:-3000} (ingress)
|
||||||
|
dozzle: ${DOZZLE_PORT:-9999} (manager loopback only — SSH tunnel required)
|
||||||
|
|
||||||
|
Versioned secrets that would be created on this deploy:
|
||||||
|
${DEPLOY_STACK_NAME}_postgres_password_<deploy_id>
|
||||||
|
${DEPLOY_STACK_NAME}_secret_key_<deploy_id>
|
||||||
|
${DEPLOY_STACK_NAME}_email_host_password_<deploy_id>
|
||||||
|
${DEPLOY_STACK_NAME}_fcm_server_key_<deploy_id>
|
||||||
|
${DEPLOY_STACK_NAME}_apns_auth_key_<deploy_id>
|
||||||
|
|
||||||
|
No changes made. Re-run without DRY_RUN=1 to deploy.
|
||||||
|
=================================================
|
||||||
|
|
||||||
|
EOF
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
log "Validating SSH access to ${SSH_TARGET}"
|
log "Validating SSH access to ${SSH_TARGET}"
|
||||||
if ! ssh "${SSH_OPTS[@]}" "${SSH_TARGET}" "echo ok" >/dev/null 2>&1; then
|
if ! ssh "${SSH_OPTS[@]}" "${SSH_TARGET}" "echo ok" >/dev/null 2>&1; then
|
||||||
die "SSH connection failed to ${SSH_TARGET}"
|
die "SSH connection failed to ${SSH_TARGET}"
|
||||||
@@ -384,10 +455,76 @@ while true; do
|
|||||||
sleep 10
|
sleep 10
|
||||||
done
|
done
|
||||||
|
|
||||||
|
# Remove superseded versioned secrets on the manager, keeping the newest
# SECRET_KEEP_VERSIONS per secret name. The remote script receives the stack
# name as $1 and the retention count as $2. Any failure is downgraded to a
# warning so pruning never fails the deploy.
log "Pruning old secret versions (keeping last ${SECRET_KEEP_VERSIONS})"
ssh "${SSH_OPTS[@]}" "${SSH_TARGET}" "bash -s -- '${DEPLOY_STACK_NAME}' '${SECRET_KEEP_VERSIONS}'" <<'EOF' || warn "Secret pruning reported errors (non-fatal)"
set -euo pipefail

STACK_NAME="$1"
KEEP="$2"

# prune_prefix PREFIX
#   Deletes every secret named "<PREFIX>_*" except the KEEP most recently
#   created ones. Secrets that docker refuses to remove (still referenced by
#   a service) are reported and left in place.
prune_prefix() {
  local prefix="$1"

  # Collect candidate names. awk's index() does a literal match anchored at
  # the start of the name, so regex metacharacters in the stack name cannot
  # widen the match.
  local names
  names="$(docker secret ls --format '{{.Name}}' 2>/dev/null \
    | awk -v p="${prefix}_" 'index($0, p) == 1' \
    || true)"
  if [[ -z "${names}" ]]; then
    return 0
  fi

  # `docker secret ls` renders CreatedAt as a relative age ("2 weeks ago"),
  # which does not sort chronologically. `docker secret inspect` exposes the
  # absolute creation timestamp, which sorts correctly as plain text.
  local all="" name created
  while IFS= read -r name; do
    [[ -z "${name}" ]] && continue
    created="$(docker secret inspect --format '{{.CreatedAt}}' "${name}" 2>/dev/null)" || continue
    all+="${created}|${name}"$'\n'
  done <<< "${names}"

  # Newest first.
  all="$(printf '%s' "${all}" | sort -r)"
  if [[ -z "${all}" ]]; then
    return 0
  fi

  local total
  total="$(printf '%s\n' "${all}" | wc -l | tr -d ' ')"
  if (( total <= KEEP )); then
    echo "[cleanup] ${prefix}: ${total} version(s) — nothing to prune"
    return 0
  fi

  # Everything after the first KEEP entries is eligible for removal.
  local to_remove
  to_remove="$(printf '%s\n' "${all}" | tail -n +$((KEEP + 1)) | awk -F'|' '{print $2}')"

  while IFS= read -r name; do
    [[ -z "${name}" ]] && continue
    if docker secret rm "${name}" >/dev/null 2>&1; then
      echo "[cleanup] removed: ${name}"
    else
      echo "[cleanup] in-use (kept): ${name}"
    fi
  done <<< "${to_remove}"
}

for base in postgres_password secret_key email_host_password fcm_server_key apns_auth_key; do
  prune_prefix "${STACK_NAME}_${base}"
done
EOF
|
||||||
|
|
||||||
|
# rollback_stack
#   Asks the manager to run `docker service rollback` for every service that
#   `docker stack services` lists for DEPLOY_STACK_NAME. Best effort: the
#   remote script runs with `set +e`, and an SSH failure is ignored.
rollback_stack() {
  warn "Rolling back stack ${DEPLOY_STACK_NAME} on ${SSH_TARGET}"
  ssh "${SSH_OPTS[@]}" "${SSH_TARGET}" "bash -s -- '${DEPLOY_STACK_NAME}'" <<'EOF' || true
# Keep going after individual failures so every service gets an attempt.
set +e
target_stack="$1"
for service_name in $(docker stack services "${target_stack}" --format '{{.Name}}'); do
  echo "[rollback] ${service_name}"
  if ! docker service rollback "${service_name}"; then
    echo "[rollback] ${service_name}: nothing to roll back"
  fi
done
EOF
}
|
||||||
|
|
||||||
if [[ "${SKIP_HEALTHCHECK}" != "1" && -n "${DEPLOY_HEALTHCHECK_URL:-}" ]]; then
|
if [[ "${SKIP_HEALTHCHECK}" != "1" && -n "${DEPLOY_HEALTHCHECK_URL:-}" ]]; then
|
||||||
log "Running health check: ${DEPLOY_HEALTHCHECK_URL}"
|
log "Running health check: ${DEPLOY_HEALTHCHECK_URL}"
|
||||||
curl -fsS --max-time 20 "${DEPLOY_HEALTHCHECK_URL}" >/dev/null
|
if ! curl -fsS --max-time 20 "${DEPLOY_HEALTHCHECK_URL}" >/dev/null; then
|
||||||
|
warn "Health check FAILED for ${DEPLOY_HEALTHCHECK_URL}"
|
||||||
|
rollback_stack
|
||||||
|
die "Deploy rolled back due to failed health check."
|
||||||
fi
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Best-effort registry logout — the token should not linger in
# ~/.docker/config.json after deploy completes. Failures are non-fatal:
# both the local logout and the SSH call itself are allowed to fail, so a
# dropped connection at this point cannot fail an otherwise finished deploy.
log "Logging out of registry (local + remote)"
docker logout "${REGISTRY}" >/dev/null 2>&1 || true
ssh "${SSH_OPTS[@]}" "${SSH_TARGET}" "docker logout '${REGISTRY}' >/dev/null 2>&1 || true" || true
|
||||||
|
|
||||||
log "Deploy completed successfully."
|
log "Deploy completed successfully."
|
||||||
log "Stack: ${DEPLOY_STACK_NAME}"
|
log "Stack: ${DEPLOY_STACK_NAME}"
|
||||||
|
|||||||
208
deploy/shit_deploy_cant_do.md
Normal file
208
deploy/shit_deploy_cant_do.md
Normal file
@@ -0,0 +1,208 @@
|
|||||||
|
# Shit `./.deploy_prod` Can't Do
|
||||||
|
|
||||||
|
Everything listed here is **manual**. The deploy script orchestrates builds,
|
||||||
|
secrets, and the stack — it does not provision infrastructure, touch DNS,
|
||||||
|
configure Cloudflare, or rotate external credentials. Work through this list
|
||||||
|
once before your first prod deploy, then revisit after every cloud-side
|
||||||
|
change.
|
||||||
|
|
||||||
|
See [`README.md`](./README.md) for the security checklist that complements
|
||||||
|
this file.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## One-Time: Infrastructure
|
||||||
|
|
||||||
|
### Swarm Cluster
|
||||||
|
|
||||||
|
- [ ] Provision manager + worker VMs (Hetzner, DO, etc.).
|
||||||
|
- [ ] `docker swarm init --advertise-addr <manager-private-ip>` on manager #1.
|
||||||
|
- [ ] `docker swarm join-token {manager,worker}` → join additional nodes.
|
||||||
|
- [ ] `docker node ls` to verify — all nodes `Ready` and `Active`.
|
||||||
|
- [ ] Label nodes if you want placement constraints beyond the defaults.
|
||||||
|
|
||||||
|
### Node Hardening (every node)
|
||||||
|
|
||||||
|
- [ ] SSH: non-default port, key-only auth, no root login — see README §2.
|
||||||
|
- [ ] Firewall: allow SSH on 22 (or 2222); allow 80 and 443 from CF IPs only; allow 2377/tcp,
|
||||||
|
7946/tcp+udp, and 4789/udp from other Swarm nodes only; block the rest — see README §1.
|
||||||
|
- [ ] Install unattended-upgrades (or equivalent) for security patches.
|
||||||
|
- [ ] Disable password auth in `/etc/ssh/sshd_config`.
|
||||||
|
- [ ] Create the `deploy` user (`AllowUsers deploy` in sshd_config).
|
||||||
|
|
||||||
|
### DNS + Cloudflare
|
||||||
|
|
||||||
|
- [ ] Add A records for `api.<domain>`, `admin.<domain>` pointing to the LB
|
||||||
|
or manager IPs. Keep them **proxied** (orange cloud).
|
||||||
|
- [ ] Create a Cloudflare tunnel or enable "Authenticated Origin Pulls" if
|
||||||
|
you want to lock the origin to CF only.
|
||||||
|
- [ ] Firewall rule on the nodes: only accept 80/443 from Cloudflare IP ranges
|
||||||
|
(<https://www.cloudflare.com/ips/>).
|
||||||
|
- [ ] Configure CF Access (or equivalent SSO) in front of admin panel if
|
||||||
|
exposing it publicly.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## One-Time: External Services
|
||||||
|
|
||||||
|
### Postgres (Neon)
|
||||||
|
|
||||||
|
- [ ] Create project + database (`honeydue`).
|
||||||
|
- [ ] Create a dedicated DB user with least privilege — not the project owner.
|
||||||
|
- [ ] Enable IP allowlist, add every Swarm node's egress IP.
|
||||||
|
- [ ] Verify `DB_SSLMODE=require` works end-to-end.
|
||||||
|
- [ ] Turn on PITR (paid tier) or schedule automated `pg_dump` backups.
|
||||||
|
- [ ] Do one restore drill — boot a staging stack from a real backup. If you
|
||||||
|
haven't done this, you do not have backups.
|
||||||
|
|
||||||
|
### Redis
|
||||||
|
|
||||||
|
- Redis runs **inside** the stack on a named volume. No external setup
|
||||||
|
needed today. See README §11 — this is an accepted SPOF.
|
||||||
|
- [ ] If you move Redis external (Upstash, Dragonfly Cloud): update
|
||||||
|
`REDIS_URL` in `prod.env`, remove the `redis` service + volume from
|
||||||
|
the stack.
|
||||||
|
|
||||||
|
### Backblaze B2 (or MinIO)
|
||||||
|
|
||||||
|
Skip this section if you're running a single-node prod and are OK with
|
||||||
|
uploads on a local volume. Required for multi-replica prod — see README §8.
|
||||||
|
|
||||||
|
- [ ] Create B2 account + bucket (private).
|
||||||
|
- [ ] Create a **scoped** application key bound to that single bucket —
|
||||||
|
not the master key.
|
||||||
|
- [ ] Set lifecycle rules: keep only the current version of each file,
|
||||||
|
or whatever matches your policy.
|
||||||
|
- [ ] Populate `B2_ENDPOINT`, `B2_KEY_ID`, `B2_APP_KEY`, `B2_BUCKET_NAME`
|
||||||
|
in `deploy/prod.env`. Optionally set `B2_USE_SSL` and `B2_REGION`.
|
||||||
|
- [ ] Verify uploads round-trip across replicas after the first deploy
|
||||||
|
(upload a file via client A → fetch via client B in a different session).
|
||||||
|
|
||||||
|
### APNS (Apple Push)
|
||||||
|
|
||||||
|
- [ ] Create an APNS auth key (`.p8`) in the Apple Developer portal.
|
||||||
|
- [ ] Save to `deploy/secrets/apns_auth_key.p8` — the script enforces it
|
||||||
|
contains a real `-----BEGIN PRIVATE KEY-----` block.
|
||||||
|
- [ ] Fill `APNS_AUTH_KEY_ID`, `APNS_TEAM_ID`, `APNS_TOPIC` (bundle ID) in
|
||||||
|
`deploy/prod.env`.
|
||||||
|
- [ ] Decide `APNS_USE_SANDBOX` / `APNS_PRODUCTION` based on build target.
|
||||||
|
|
||||||
|
### FCM (Android Push)
|
||||||
|
|
||||||
|
- [ ] Create Firebase project + legacy server key (or migrate to HTTP v1 —
|
||||||
|
the code currently uses the legacy server key).
|
||||||
|
- [ ] Save to `deploy/secrets/fcm_server_key.txt`.
|
||||||
|
|
||||||
|
### SMTP (Email)
|
||||||
|
|
||||||
|
- [ ] Provision SMTP credentials (Gmail app password, SES, Postmark, etc.).
|
||||||
|
- [ ] Fill `EMAIL_HOST`, `EMAIL_PORT`, `EMAIL_HOST_USER`,
|
||||||
|
`DEFAULT_FROM_EMAIL`, `EMAIL_USE_TLS` in `deploy/prod.env`.
|
||||||
|
- [ ] Save the password to `deploy/secrets/email_host_password.txt`.
|
||||||
|
- [ ] Verify SPF, DKIM, DMARC on the sending domain if you care about
|
||||||
|
deliverability.
|
||||||
|
|
||||||
|
### Registry (GHCR / other)
|
||||||
|
|
||||||
|
- [ ] Create a personal access token with `write:packages` + `read:packages`.
|
||||||
|
- [ ] Fill `REGISTRY`, `REGISTRY_NAMESPACE`, `REGISTRY_USERNAME`,
|
||||||
|
`REGISTRY_TOKEN` in `deploy/registry.env`.
|
||||||
|
- [ ] Rotate the token on a schedule (quarterly at minimum).
|
||||||
|
|
||||||
|
### Apple / Google IAP (optional)
|
||||||
|
|
||||||
|
- [ ] Apple: create App Store Connect API key, fill the `APPLE_IAP_*` vars.
|
||||||
|
- [ ] Google: create a service account with Play Developer API access,
|
||||||
|
store JSON at a path referenced by `GOOGLE_IAP_SERVICE_ACCOUNT_PATH`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Recurring Operations
|
||||||
|
|
||||||
|
### Secret Rotation
|
||||||
|
|
||||||
|
Rotate secrets after any compromise, whenever a team member leaves, and at least once a year:
|
||||||
|
|
||||||
|
1. Generate the new value (e.g. `openssl rand -base64 32 > deploy/secrets/secret_key.txt`).
|
||||||
|
2. `./.deploy_prod` — creates a new versioned Swarm secret and redeploys
|
||||||
|
services to pick it up.
|
||||||
|
3. The old secret version stays in Swarm until it falls outside the newest `SECRET_KEEP_VERSIONS` versions and is pruned (see
|
||||||
|
README "Secret Versioning & Pruning").
|
||||||
|
4. For external creds (Neon, B2, APNS, etc.) rotate at the provider first,
|
||||||
|
update the local secret file, then redeploy.
|
||||||
|
|
||||||
|
### Backup Drills
|
||||||
|
|
||||||
|
- [ ] Quarterly: pull a Neon backup, restore to a scratch project, boot a
|
||||||
|
staging stack against it, verify login + basic reads.
|
||||||
|
- [ ] Monthly: spot-check that B2 objects are actually present and the
|
||||||
|
app key still works.
|
||||||
|
- [ ] After any schema change: confirm PITR coverage includes the new
|
||||||
|
columns before relying on it.
|
||||||
|
|
||||||
|
### Certificate Management
|
||||||
|
|
||||||
|
- TLS is terminated by Cloudflare today, so there are no origin certs to
|
||||||
|
renew. If you ever move TLS on-origin (Traefik, Caddy), automate renewal
|
||||||
|
— don't add it to this list and expect it to happen.
|
||||||
|
|
||||||
|
### Multi-Arch Builds
|
||||||
|
|
||||||
|
`./.deploy_prod` builds for the host arch. If target ≠ host:
|
||||||
|
|
||||||
|
- [ ] Enable buildx: `docker buildx create --use`.
|
||||||
|
- [ ] Install QEMU: `docker run --privileged --rm tonistiigi/binfmt --install all`.
|
||||||
|
- [ ] Build + push images manually per target platform.
|
||||||
|
- [ ] Run `SKIP_BUILD=1 ./.deploy_prod` so the script just deploys.
|
||||||
|
|
||||||
|
### Node Maintenance / Rolling Upgrades
|
||||||
|
|
||||||
|
- [ ] `docker node update --availability drain <node>` before OS upgrades.
|
||||||
|
- [ ] Reboot, verify, then `docker node update --availability active <node>`.
|
||||||
|
- [ ] Re-converge with `docker stack deploy -c swarm-stack.prod.yml honeydue`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Incident Response
|
||||||
|
|
||||||
|
### Redis Node Dies
|
||||||
|
|
||||||
|
The named volume is local to one node and does not follow Redis when it is rescheduled elsewhere. Accept the loss:
|
||||||
|
|
||||||
|
1. Let Swarm reschedule Redis on a new node.
|
||||||
|
2. In-flight Asynq jobs are lost; retry semantics cover most of them.
|
||||||
|
3. Scheduled cron events fire again on the next tick (hourly for smart
|
||||||
|
reminders and daily digest; daily for onboarding + cleanup).
|
||||||
|
4. Cache repopulates on first request.
|
||||||
|
|
||||||
|
### Deploy Rolled Back Automatically
|
||||||
|
|
||||||
|
`./.deploy_prod` triggers `docker service rollback` on every service if
|
||||||
|
`DEPLOY_HEALTHCHECK_URL` fails. Diagnose with:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ssh <manager> docker stack services honeydue
|
||||||
|
ssh <manager> docker service logs --tail 200 honeydue_api
|
||||||
|
# Or open an SSH tunnel to Dozzle: ssh -L 9999:127.0.0.1:9999 <manager>
|
||||||
|
```
|
||||||
|
|
||||||
|
### Lost Ability to Deploy
|
||||||
|
|
||||||
|
- Registry token revoked → regenerate, update `deploy/registry.env`, re-run.
|
||||||
|
- Manager host key changed → verify legitimacy, update `~/.ssh/known_hosts`.
|
||||||
|
- All secrets accidentally pruned → restore the `deploy/secrets/*` files
|
||||||
|
locally and redeploy; new Swarm secret versions will be created.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Known Gaps (Future Work)
|
||||||
|
|
||||||
|
- No dedicated `cmd/migrate` binary — migrations run at API boot (see
|
||||||
|
README §10). Large schema changes still need manual coordination.
|
||||||
|
- `asynq.Scheduler` has no leader election; `WORKER_REPLICAS` must stay 1
|
||||||
|
until we migrate to `asynq.PeriodicTaskManager` (README §9).
|
||||||
|
- No Prometheus / Grafana / alerting in the stack. `/metrics` is exposed
|
||||||
|
on the API but nothing scrapes it.
|
||||||
|
- No automated TLS renewal on-origin — add if you ever move off Cloudflare.
|
||||||
|
- No staging environment wired to the deploy script — `DEPLOY_TAG=<sha>`
|
||||||
|
is the closest thing. A proper staging flow is future work.
|
||||||
@@ -3,7 +3,7 @@ version: "3.8"
|
|||||||
services:
|
services:
|
||||||
redis:
|
redis:
|
||||||
image: redis:7-alpine
|
image: redis:7-alpine
|
||||||
command: redis-server --appendonly yes --appendfsync everysec
|
command: redis-server --appendonly yes --appendfsync everysec --maxmemory 200mb --maxmemory-policy allkeys-lru
|
||||||
volumes:
|
volumes:
|
||||||
- redis_data:/data
|
- redis_data:/data
|
||||||
healthcheck:
|
healthcheck:
|
||||||
@@ -18,6 +18,13 @@ services:
|
|||||||
delay: 5s
|
delay: 5s
|
||||||
placement:
|
placement:
|
||||||
max_replicas_per_node: 1
|
max_replicas_per_node: 1
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
cpus: "0.50"
|
||||||
|
memory: 256M
|
||||||
|
reservations:
|
||||||
|
cpus: "0.10"
|
||||||
|
memory: 64M
|
||||||
networks:
|
networks:
|
||||||
- honeydue-network
|
- honeydue-network
|
||||||
|
|
||||||
@@ -67,6 +74,17 @@ services:
|
|||||||
STORAGE_MAX_FILE_SIZE: "${STORAGE_MAX_FILE_SIZE}"
|
STORAGE_MAX_FILE_SIZE: "${STORAGE_MAX_FILE_SIZE}"
|
||||||
STORAGE_ALLOWED_TYPES: "${STORAGE_ALLOWED_TYPES}"
|
STORAGE_ALLOWED_TYPES: "${STORAGE_ALLOWED_TYPES}"
|
||||||
|
|
||||||
|
# S3-compatible object storage (Backblaze B2, MinIO). When all B2_* vars
|
||||||
|
# are set, uploads/media are stored in the bucket and the local volume
|
||||||
|
# mount becomes a no-op fallback. Required for multi-replica prod —
|
||||||
|
# without it uploads only exist on one node.
|
||||||
|
B2_ENDPOINT: "${B2_ENDPOINT}"
|
||||||
|
B2_KEY_ID: "${B2_KEY_ID}"
|
||||||
|
B2_APP_KEY: "${B2_APP_KEY}"
|
||||||
|
B2_BUCKET_NAME: "${B2_BUCKET_NAME}"
|
||||||
|
B2_USE_SSL: "${B2_USE_SSL}"
|
||||||
|
B2_REGION: "${B2_REGION}"
|
||||||
|
|
||||||
FEATURE_PUSH_ENABLED: "${FEATURE_PUSH_ENABLED}"
|
FEATURE_PUSH_ENABLED: "${FEATURE_PUSH_ENABLED}"
|
||||||
FEATURE_EMAIL_ENABLED: "${FEATURE_EMAIL_ENABLED}"
|
FEATURE_EMAIL_ENABLED: "${FEATURE_EMAIL_ENABLED}"
|
||||||
FEATURE_WEBHOOKS_ENABLED: "${FEATURE_WEBHOOKS_ENABLED}"
|
FEATURE_WEBHOOKS_ENABLED: "${FEATURE_WEBHOOKS_ENABLED}"
|
||||||
@@ -86,6 +104,7 @@ services:
|
|||||||
APPLE_IAP_SANDBOX: "${APPLE_IAP_SANDBOX}"
|
APPLE_IAP_SANDBOX: "${APPLE_IAP_SANDBOX}"
|
||||||
GOOGLE_IAP_SERVICE_ACCOUNT_PATH: "${GOOGLE_IAP_SERVICE_ACCOUNT_PATH}"
|
GOOGLE_IAP_SERVICE_ACCOUNT_PATH: "${GOOGLE_IAP_SERVICE_ACCOUNT_PATH}"
|
||||||
GOOGLE_IAP_PACKAGE_NAME: "${GOOGLE_IAP_PACKAGE_NAME}"
|
GOOGLE_IAP_PACKAGE_NAME: "${GOOGLE_IAP_PACKAGE_NAME}"
|
||||||
|
stop_grace_period: 60s
|
||||||
command:
|
command:
|
||||||
- /bin/sh
|
- /bin/sh
|
||||||
- -lc
|
- -lc
|
||||||
@@ -128,6 +147,13 @@ services:
|
|||||||
parallelism: 1
|
parallelism: 1
|
||||||
delay: 5s
|
delay: 5s
|
||||||
order: stop-first
|
order: stop-first
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
cpus: "1.00"
|
||||||
|
memory: 512M
|
||||||
|
reservations:
|
||||||
|
cpus: "0.25"
|
||||||
|
memory: 128M
|
||||||
networks:
|
networks:
|
||||||
- honeydue-network
|
- honeydue-network
|
||||||
|
|
||||||
@@ -142,10 +168,12 @@ services:
|
|||||||
PORT: "3000"
|
PORT: "3000"
|
||||||
HOSTNAME: "0.0.0.0"
|
HOSTNAME: "0.0.0.0"
|
||||||
NEXT_PUBLIC_API_URL: "${NEXT_PUBLIC_API_URL}"
|
NEXT_PUBLIC_API_URL: "${NEXT_PUBLIC_API_URL}"
|
||||||
|
stop_grace_period: 60s
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:3000/admin/"]
|
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://127.0.0.1:3000/api/health"]
|
||||||
interval: 30s
|
interval: 30s
|
||||||
timeout: 10s
|
timeout: 10s
|
||||||
|
start_period: 20s
|
||||||
retries: 3
|
retries: 3
|
||||||
deploy:
|
deploy:
|
||||||
replicas: ${ADMIN_REPLICAS}
|
replicas: ${ADMIN_REPLICAS}
|
||||||
@@ -160,6 +188,13 @@ services:
|
|||||||
parallelism: 1
|
parallelism: 1
|
||||||
delay: 5s
|
delay: 5s
|
||||||
order: stop-first
|
order: stop-first
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
cpus: "0.50"
|
||||||
|
memory: 384M
|
||||||
|
reservations:
|
||||||
|
cpus: "0.10"
|
||||||
|
memory: 128M
|
||||||
networks:
|
networks:
|
||||||
- honeydue-network
|
- honeydue-network
|
||||||
|
|
||||||
@@ -201,6 +236,7 @@ services:
|
|||||||
FEATURE_ONBOARDING_EMAILS_ENABLED: "${FEATURE_ONBOARDING_EMAILS_ENABLED}"
|
FEATURE_ONBOARDING_EMAILS_ENABLED: "${FEATURE_ONBOARDING_EMAILS_ENABLED}"
|
||||||
FEATURE_PDF_REPORTS_ENABLED: "${FEATURE_PDF_REPORTS_ENABLED}"
|
FEATURE_PDF_REPORTS_ENABLED: "${FEATURE_PDF_REPORTS_ENABLED}"
|
||||||
FEATURE_WORKER_ENABLED: "${FEATURE_WORKER_ENABLED}"
|
FEATURE_WORKER_ENABLED: "${FEATURE_WORKER_ENABLED}"
|
||||||
|
stop_grace_period: 60s
|
||||||
command:
|
command:
|
||||||
- /bin/sh
|
- /bin/sh
|
||||||
- -lc
|
- -lc
|
||||||
@@ -222,6 +258,12 @@ services:
|
|||||||
target: fcm_server_key
|
target: fcm_server_key
|
||||||
- source: ${APNS_AUTH_KEY_SECRET}
|
- source: ${APNS_AUTH_KEY_SECRET}
|
||||||
target: apns_auth_key
|
target: apns_auth_key
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "curl", "-f", "http://127.0.0.1:6060/health"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
start_period: 15s
|
||||||
|
retries: 3
|
||||||
deploy:
|
deploy:
|
||||||
replicas: ${WORKER_REPLICAS}
|
replicas: ${WORKER_REPLICAS}
|
||||||
restart_policy:
|
restart_policy:
|
||||||
@@ -235,16 +277,28 @@ services:
|
|||||||
parallelism: 1
|
parallelism: 1
|
||||||
delay: 5s
|
delay: 5s
|
||||||
order: stop-first
|
order: stop-first
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
cpus: "1.00"
|
||||||
|
memory: 512M
|
||||||
|
reservations:
|
||||||
|
cpus: "0.25"
|
||||||
|
memory: 128M
|
||||||
networks:
|
networks:
|
||||||
- honeydue-network
|
- honeydue-network
|
||||||
|
|
||||||
dozzle:
|
dozzle:
|
||||||
|
# NOTE: Dozzle exposes the full Docker log stream with no built-in auth.
|
||||||
|
# Bound to manager loopback only — access via SSH tunnel:
|
||||||
|
# ssh -L ${DOZZLE_PORT}:127.0.0.1:${DOZZLE_PORT} <manager>
|
||||||
|
# Then browse http://localhost:${DOZZLE_PORT}
|
||||||
image: amir20/dozzle:latest
|
image: amir20/dozzle:latest
|
||||||
ports:
|
ports:
|
||||||
- target: 8080
|
- target: 8080
|
||||||
published: ${DOZZLE_PORT}
|
published: ${DOZZLE_PORT}
|
||||||
protocol: tcp
|
protocol: tcp
|
||||||
mode: ingress
|
mode: host
|
||||||
|
host_ip: 127.0.0.1
|
||||||
environment:
|
environment:
|
||||||
DOZZLE_NO_ANALYTICS: "true"
|
DOZZLE_NO_ANALYTICS: "true"
|
||||||
volumes:
|
volumes:
|
||||||
@@ -257,6 +311,13 @@ services:
|
|||||||
placement:
|
placement:
|
||||||
constraints:
|
constraints:
|
||||||
- node.role == manager
|
- node.role == manager
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
cpus: "0.25"
|
||||||
|
memory: 128M
|
||||||
|
reservations:
|
||||||
|
cpus: "0.05"
|
||||||
|
memory: 32M
|
||||||
networks:
|
networks:
|
||||||
- honeydue-network
|
- honeydue-network
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package database
|
package database
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@@ -15,6 +16,11 @@ import (
|
|||||||
"github.com/treytartt/honeydue-api/internal/models"
|
"github.com/treytartt/honeydue-api/internal/models"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// migrationAdvisoryLockKey is the pg_advisory_lock key that serializes
|
||||||
|
// Migrate() across API replicas booting in parallel. Value is arbitrary but
|
||||||
|
// stable ("hdmg" as bytes = honeydue migration).
|
||||||
|
const migrationAdvisoryLockKey int64 = 0x68646d67
|
||||||
|
|
||||||
// zerologGormWriter adapts zerolog for GORM's logger interface
|
// zerologGormWriter adapts zerolog for GORM's logger interface
|
||||||
type zerologGormWriter struct{}
|
type zerologGormWriter struct{}
|
||||||
|
|
||||||
@@ -121,6 +127,54 @@ func Paginate(page, pageSize int) func(db *gorm.DB) *gorm.DB {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// MigrateWithLock runs Migrate() under a Postgres session-level advisory lock
|
||||||
|
// so that multiple API replicas booting in parallel don't race on AutoMigrate.
|
||||||
|
// On non-Postgres dialects (sqlite in tests) it falls through to Migrate().
|
||||||
|
func MigrateWithLock() error {
|
||||||
|
if db == nil {
|
||||||
|
return fmt.Errorf("database not initialised")
|
||||||
|
}
|
||||||
|
if db.Dialector.Name() != "postgres" {
|
||||||
|
return Migrate()
|
||||||
|
}
|
||||||
|
|
||||||
|
sqlDB, err := db.DB()
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("get underlying sql.DB: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Give ourselves up to 5 min to acquire the lock — long enough for a
|
||||||
|
// slow migration on a peer replica, short enough to fail fast if Postgres
|
||||||
|
// is hung.
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
conn, err := sqlDB.Conn(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("acquire dedicated migration connection: %w", err)
|
||||||
|
}
|
||||||
|
defer conn.Close()
|
||||||
|
|
||||||
|
log.Info().Int64("lock_key", migrationAdvisoryLockKey).Msg("Acquiring migration advisory lock...")
|
||||||
|
if _, err := conn.ExecContext(ctx, "SELECT pg_advisory_lock($1)", migrationAdvisoryLockKey); err != nil {
|
||||||
|
return fmt.Errorf("pg_advisory_lock: %w", err)
|
||||||
|
}
|
||||||
|
log.Info().Msg("Migration advisory lock acquired")
|
||||||
|
|
||||||
|
defer func() {
|
||||||
|
// Unlock with a fresh context — the outer ctx may have expired.
|
||||||
|
unlockCtx, unlockCancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||||
|
defer unlockCancel()
|
||||||
|
if _, err := conn.ExecContext(unlockCtx, "SELECT pg_advisory_unlock($1)", migrationAdvisoryLockKey); err != nil {
|
||||||
|
log.Warn().Err(err).Msg("Failed to release migration advisory lock (session close will also release)")
|
||||||
|
} else {
|
||||||
|
log.Info().Msg("Migration advisory lock released")
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
return Migrate()
|
||||||
|
}
|
||||||
|
|
||||||
// Migrate runs database migrations for all models
|
// Migrate runs database migrations for all models
|
||||||
func Migrate() error {
|
func Migrate() error {
|
||||||
log.Info().Msg("Running database migrations...")
|
log.Info().Msg("Running database migrations...")
|
||||||
|
|||||||
Reference in New Issue
Block a user