Harden prod deploy: versioned secrets, healthchecks, migration lock, dry-run
Swarm stack
- Resource limits on all services, stop_grace_period 60s on api/worker/admin
- Dozzle bound to manager loopback only (ssh -L required for access)
- Worker health server on :6060, admin /api/health endpoint
- Redis 200M LRU cap, B2/S3 env vars wired through to api service

Deploy script
- DRY_RUN=1 prints plan + exits
- Auto-rollback on failed healthcheck, docker logout at end
- Versioned-secret pruning keeps last SECRET_KEEP_VERSIONS (default 3)
- PUSH_LATEST_TAG default flipped to false
- B2 all-or-none validation before deploy

Code
- cmd/api takes pg_advisory_lock on a dedicated connection before AutoMigrate, serialising boot-time migrations across replicas
- cmd/worker exposes an HTTP /health endpoint with graceful shutdown

Docs
- deploy/DEPLOYING.md: step-by-step walkthrough for a real deploy
- deploy/shit_deploy_cant_do.md: manual prerequisites + recurring ops
- deploy/README.md updated with storage toggle, worker-replica caveat, multi-arch recipe, connection-pool tuning, renumbered sections

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -18,6 +18,8 @@ SECRET_APNS_KEY="${DEPLOY_DIR}/secrets/apns_auth_key.p8"
|
||||
|
||||
# Optional behaviour flags. Each one can be overridden from the environment;
# an unset or empty value falls back to the default shown here.
SKIP_BUILD="${SKIP_BUILD:-0}"
SKIP_HEALTHCHECK="${SKIP_HEALTHCHECK:-0}"
DRY_RUN="${DRY_RUN:-0}"
SECRET_KEEP_VERSIONS="${SECRET_KEEP_VERSIONS:-3}"

# SECRET_KEEP_VERSIONS is forwarded to the remote pruning script, where it is
# used in shell arithmetic. Reject anything that is not a plain non-negative
# integer up front instead of failing (or misbehaving) on the manager node.
if [[ ! "${SECRET_KEEP_VERSIONS}" =~ ^[0-9]+$ ]]; then
  printf '[deploy] SECRET_KEEP_VERSIONS must be a non-negative integer, got: %s\n' \
    "${SECRET_KEEP_VERSIONS}" >&2
  exit 1
fi
|
||||
|
||||
log() {
|
||||
printf '[deploy] %s\n' "$*"
|
||||
@@ -91,9 +93,13 @@ Usage:
|
||||
./.deploy_prod
|
||||
|
||||
Optional environment flags:
|
||||
SKIP_BUILD=1 Deploy existing image tags without rebuilding/pushing.
|
||||
SKIP_HEALTHCHECK=1 Skip final HTTP health check.
|
||||
DEPLOY_TAG=<tag> Override image tag (default: git short sha).
|
||||
DRY_RUN=1 Print the deployment plan and exit without changes.
|
||||
SKIP_BUILD=1 Deploy existing image tags without rebuilding/pushing.
|
||||
SKIP_HEALTHCHECK=1 Skip final HTTP health check.
|
||||
DEPLOY_TAG=<tag> Override image tag (default: git short sha).
|
||||
PUSH_LATEST_TAG=true|false Also tag/push :latest (default: false — SHA only).
|
||||
SECRET_KEEP_VERSIONS=<n> How many versions of each Swarm secret to retain
|
||||
(default: 3). Older unused versions are pruned.
|
||||
EOF
|
||||
}
|
||||
|
||||
@@ -144,7 +150,7 @@ DEPLOY_STACK_NAME="${DEPLOY_STACK_NAME:-honeydue}"
|
||||
DEPLOY_REMOTE_DIR="${DEPLOY_REMOTE_DIR:-/opt/honeydue/deploy}"
|
||||
DEPLOY_WAIT_SECONDS="${DEPLOY_WAIT_SECONDS:-420}"
|
||||
DEPLOY_TAG="${DEPLOY_TAG:-$(git -C "${REPO_DIR}" rev-parse --short HEAD)}"
|
||||
# Only the immutable SHA tag is pushed by default. Set PUSH_LATEST_TAG=true to
# also tag/push :latest. (A leftover ':-true' default ahead of this line would
# win and silently keep the old behaviour, so there must be exactly one.)
PUSH_LATEST_TAG="${PUSH_LATEST_TAG:-false}"
|
||||
|
||||
require_var DEPLOY_MANAGER_HOST
|
||||
require_var DEPLOY_MANAGER_USER
|
||||
@@ -173,6 +179,27 @@ require_var APNS_AUTH_KEY_ID
|
||||
require_var APNS_TEAM_ID
|
||||
require_var APNS_TOPIC
|
||||
|
||||
# Storage backend selection. The B2/S3 settings are all-or-none: a variable
# counts as configured only when it is non-empty and is not a placeholder.
# Zero configured variables selects the local volume, all four select object
# storage, and anything in between aborts the deploy.
b2_required_vars=(B2_ENDPOINT B2_KEY_ID B2_APP_KEY B2_BUCKET_NAME)
b2_real_values=0
for b2_var in "${b2_required_vars[@]}"; do
  val="${!b2_var:-}"
  if [[ -z "${val}" ]] || contains_placeholder "${val}"; then
    continue
  fi
  b2_real_values=$((b2_real_values + 1))
done

# Keep the two 0/1 flags that the rest of the script may consult.
b2_any_set=$(( b2_real_values > 0 ))
b2_all_set=$(( b2_real_values == ${#b2_required_vars[@]} ))

if (( b2_any_set && ! b2_all_set )); then
  die "Partial B2 configuration detected. Set all four of B2_ENDPOINT, B2_KEY_ID, B2_APP_KEY, B2_BUCKET_NAME, or leave all four empty to use the local volume."
fi

if (( b2_all_set )); then
  log "Storage backend: S3 (${B2_ENDPOINT} / bucket=${B2_BUCKET_NAME})"
else
  warn "Storage backend: LOCAL VOLUME. This is not safe for multi-replica prod — uploads will only exist on one node. Set B2_* in prod.env to use object storage."
fi
|
||||
|
||||
if [[ ! "$(tr -d '\r\n' < "${SECRET_APNS_KEY}")" =~ BEGIN[[:space:]]+PRIVATE[[:space:]]+KEY ]]; then
|
||||
die "APNS key file does not look like a private key: ${SECRET_APNS_KEY}"
|
||||
fi
|
||||
@@ -200,6 +227,50 @@ if [[ -n "${SSH_KEY_PATH}" ]]; then
|
||||
SCP_OPTS+=(-i "${SSH_KEY_PATH}")
|
||||
fi
|
||||
|
||||
# Writes the deployment plan to stdout. Purely informational: it only reads
# the already-validated configuration and performs no remote or local changes.
print_dry_run_plan() {
  cat <<EOF

==================== DRY RUN ====================
Validation passed. Would deploy:

Stack name: ${DEPLOY_STACK_NAME}
Manager: ${SSH_TARGET}:${DEPLOY_MANAGER_SSH_PORT}
Remote dir: ${DEPLOY_REMOTE_DIR}
Deploy tag: ${DEPLOY_TAG}
Push :latest: ${PUSH_LATEST_TAG}
Skip build: ${SKIP_BUILD}
Skip healthcheck: ${SKIP_HEALTHCHECK}
Secret retention: ${SECRET_KEEP_VERSIONS} versions per name

Images that would be built and pushed:
${API_IMAGE}
${WORKER_IMAGE}
${ADMIN_IMAGE}

Replicas:
api: ${API_REPLICAS:-3}
worker: ${WORKER_REPLICAS:-2}
admin: ${ADMIN_REPLICAS:-1}

Published ports:
api: ${API_PORT:-8000} (ingress)
admin: ${ADMIN_PORT:-3000} (ingress)
dozzle: ${DOZZLE_PORT:-9999} (manager loopback only — SSH tunnel required)

Versioned secrets that would be created on this deploy:
${DEPLOY_STACK_NAME}_postgres_password_<deploy_id>
${DEPLOY_STACK_NAME}_secret_key_<deploy_id>
${DEPLOY_STACK_NAME}_email_host_password_<deploy_id>
${DEPLOY_STACK_NAME}_fcm_server_key_<deploy_id>
${DEPLOY_STACK_NAME}_apns_auth_key_<deploy_id>

No changes made. Re-run without DRY_RUN=1 to deploy.
=================================================

EOF
}

# DRY_RUN=1 stops here, before the first SSH connection is attempted.
if [[ "${DRY_RUN}" == "1" ]]; then
  print_dry_run_plan
  exit 0
fi
|
||||
|
||||
log "Validating SSH access to ${SSH_TARGET}"
|
||||
if ! ssh "${SSH_OPTS[@]}" "${SSH_TARGET}" "echo ok" >/dev/null 2>&1; then
|
||||
die "SSH connection failed to ${SSH_TARGET}"
|
||||
@@ -384,11 +455,77 @@ while true; do
|
||||
sleep 10
|
||||
done
|
||||
|
||||
log "Pruning old secret versions (keeping last ${SECRET_KEEP_VERSIONS})"
|
||||
ssh "${SSH_OPTS[@]}" "${SSH_TARGET}" "bash -s -- '${DEPLOY_STACK_NAME}' '${SECRET_KEEP_VERSIONS}'" <<'EOF' || warn "Secret pruning reported errors (non-fatal)"
|
||||
set -euo pipefail
|
||||
|
||||
STACK_NAME="$1"
|
||||
KEEP="$2"
|
||||
|
||||
#######################################
# Prune old versions of one versioned Swarm secret.
#
# Secrets named "<prefix>_<suffix>" are treated as versions of the same
# secret. The newest KEEP versions are retained; removal is attempted for the
# rest, and `docker secret rm` failures (e.g. secret still in use) are
# reported and skipped.
#
# Ordering uses the absolute creation timestamp from `docker secret inspect`.
# `docker secret ls --format '{{.CreatedAt}}'` prints a relative, human-readable
# age ("2 weeks ago"), which does not sort chronologically.
#
# Globals:   KEEP (read) - number of newest versions to retain
# Arguments: $1 - secret name prefix, without the trailing underscore
# Outputs:   one "[cleanup] ..." line per decision, on stdout
# Returns:   0 always; pruning is best-effort
#######################################
prune_prefix() {
  local prefix="$1"
  local name
  local -a versions=()

  # Collect candidates by literal name prefix (no regex interpretation).
  while IFS= read -r name; do
    if [[ "${name}" == "${prefix}_"* ]]; then
      versions+=("${name}")
    fi
  done < <(docker secret ls --format '{{.Name}}' 2>/dev/null || true)

  local total="${#versions[@]}"
  if (( total == 0 )); then
    return 0
  fi
  if (( total <= KEEP )); then
    echo "[cleanup] ${prefix}: ${total} version(s) — nothing to prune"
    return 0
  fi

  # "<timestamp>|<name>" per version. If the timestamps cannot be read, prune
  # nothing rather than guess which versions are the old ones.
  local stamped
  if ! stamped="$(docker secret inspect \
      --format '{{.CreatedAt}}|{{.Spec.Name}}' "${versions[@]}" 2>/dev/null)"; then
    echo "[cleanup] ${prefix}: could not read creation times — skipping"
    return 0
  fi

  # Newest first (byte-wise sort keeps the ordering locale-independent), then
  # drop the first KEEP lines; whatever remains is eligible for removal.
  local to_remove
  to_remove="$(printf '%s\n' "${stamped}" \
    | LC_ALL=C sort -r \
    | tail -n +$((KEEP + 1)) \
    | awk -F'|' '{print $2}')"

  while IFS= read -r name; do
    [[ -z "${name}" ]] && continue
    if docker secret rm "${name}" >/dev/null 2>&1; then
      echo "[cleanup] removed: ${name}"
    else
      echo "[cleanup] in-use (kept): ${name}"
    fi
  done <<< "${to_remove}"
}
|
||||
|
||||
for base in postgres_password secret_key email_host_password fcm_server_key apns_auth_key; do
|
||||
prune_prefix "${STACK_NAME}_${base}"
|
||||
done
|
||||
EOF
|
||||
|
||||
#######################################
# Ask the Swarm manager to roll back every service in the deployed stack.
#
# Runs `docker service rollback` for each service reported by
# `docker stack services`. A service whose rollback command fails is reported
# and skipped, so one failure does not stop the others.
#
# Globals:   DEPLOY_STACK_NAME, SSH_TARGET, SSH_OPTS (read)
# Outputs:   "[rollback] ..." progress lines from the manager
# Returns:   0 always; if the SSH command itself fails a warning is emitted so
#            the operator knows the rollback did not actually run
#######################################
rollback_stack() {
  warn "Rolling back stack ${DEPLOY_STACK_NAME} on ${SSH_TARGET}"
  if ! ssh "${SSH_OPTS[@]}" "${SSH_TARGET}" "bash -s -- '${DEPLOY_STACK_NAME}'" <<'EOF'
set +e
STACK="$1"
# Read one service name per line instead of word-splitting the listing.
while IFS= read -r svc; do
  [[ -z "${svc}" ]] && continue
  echo "[rollback] ${svc}"
  # stdin is detached so the command can never consume the remaining names.
  docker service rollback "${svc}" </dev/null || echo "[rollback] ${svc}: nothing to roll back"
done < <(docker stack services "${STACK}" --format '{{.Name}}')
EOF
  then
    warn "Rollback command could not be completed on ${SSH_TARGET}; verify the stack state manually."
  fi
}
|
||||
|
||||
# Final HTTP health check. Skipped when SKIP_HEALTHCHECK=1 or when no
# DEPLOY_HEALTHCHECK_URL is configured. The probe must run only inside the
# `if !` test: a bare curl ahead of it would abort the script on failure when
# errexit is active, before rollback_stack is ever reached.
# NOTE(review): secret pruning runs before this check; with
# SECRET_KEEP_VERSIONS below 2 the previous secret versions may already have
# been removed by the time a rollback is attempted — confirm this is intended.
if [[ "${SKIP_HEALTHCHECK}" != "1" && -n "${DEPLOY_HEALTHCHECK_URL:-}" ]]; then
  log "Running health check: ${DEPLOY_HEALTHCHECK_URL}"
  if ! curl -fsS --max-time 20 "${DEPLOY_HEALTHCHECK_URL}" >/dev/null; then
    warn "Health check FAILED for ${DEPLOY_HEALTHCHECK_URL}"
    rollback_stack
    die "Deploy rolled back due to failed health check."
  fi
fi
|
||||
|
||||
# Best-effort registry logout — the token should not linger in
# ~/.docker/config.json after deploy completes. Failures are non-fatal, which
# has to include the SSH transport itself: an unreachable manager at this
# point must not turn an otherwise successful deploy into a failure.
log "Logging out of registry (local + remote)"
docker logout "${REGISTRY}" >/dev/null 2>&1 || true
ssh "${SSH_OPTS[@]}" "${SSH_TARGET}" "docker logout '${REGISTRY}' >/dev/null 2>&1 || true" \
  || warn "Remote registry logout failed (non-fatal)"
|
||||
|
||||
log "Deploy completed successfully."
|
||||
log "Stack: ${DEPLOY_STACK_NAME}"
|
||||
log "Images:"
|
||||
|
||||
Reference in New Issue
Block a user