c77ff07ce9
Remediation of the 2026-05-12/13 audits (78 findings + cluster gaps), tracked in deploy-k3s/SECURITY.md, plus fixes from two independent post-remediation reviews. Auth & sessions: - SHA-256 hashed auth-token storage (C1); prior-token cache eviction on re-login (MEDIUM-1) - local Google JWKS verification, iss/aud/exp checks (C2/C3) - constant-time login + generic errors (L1/LIVE-L11/LIVE-L13) - per-account login lockout keyed on distinct source IPs (M5/MEDIUM-3) - verified-email gating, login rate limiting (LIVE-L19, H1-H3) IAP & webhooks: - Apple/Google cross-account replay protection (C5/C6/C10/C13, H5/H6) - migrations 000003-000006 (token hashing, IAP replay, audit_log + webhook_event_log table creation, append-only audit log) Authorization & races: - file-ownership owner-OR-member fix (C7), atomic share-code join (C9/H9), device-token reassignment (C8/LOW-3) Secrets & deploy: - secrets file-mounted at /etc/honeydue/secrets, not env (F8); Redis password out of the ConfigMap (HIGH-1); B2 keys reconciled - digest-pinned images, admin ingress hardening, CSP/HSTS, /metrics lockdown; kubeconfig 0600, etcd secrets-encryption, fail2ban + unattended-upgrades at provision; secret-rotation runbook Build, vet, and the full test suite (incl. -race) pass; the goose migration chain is verified against PostgreSQL 16. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
289 lines
12 KiB
Bash
Executable File
289 lines
12 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# Deploy script: optionally builds and pushes the honeydue container images,
# then applies the Kubernetes manifests and waits for the rollouts.
#
# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Absolute directory of this script, so the shared config can be sourced no
# matter where the caller's working directory is.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# shellcheck source=_config.sh
. "${SCRIPT_DIR}/_config.sh"

# NOTE(review): DEPLOY_DIR is not assigned in this file; presumably it is
# defined by _config.sh — confirm.
MANIFESTS="${DEPLOY_DIR}/manifests"
NAMESPACE="honeydue"
REPO_DIR="$(cd "${DEPLOY_DIR}/.." && pwd)"
|
|
|
|
# Print an informational message on stdout, prefixed with "[deploy]".
# All arguments are joined into a single line.
log() {
  printf '%s %s\n' '[deploy]' "$*"
}
|
|
# Print a warning on stderr, prefixed with "[deploy][warn]".
# Does not affect control flow; the caller keeps running.
warn() {
  printf '%s %s\n' '[deploy][warn]' "$*" >&2
}
|
|
# Print an error on stderr, prefixed with "[deploy][error]", then terminate
# the script with exit status 1.
die() {
  printf '%s %s\n' '[deploy][error]' "$*" >&2
  exit 1
}
|
|
|
|
# --- Parse arguments ---

# Defaults; both are overwritten by _deploy_parse_args when the matching
# option is present on the command line.
SKIP_BUILD=false
DEPLOY_TAG=""

# Print the command-line help text to stdout.
_deploy_usage() {
  cat <<'EOF'
Usage: ./scripts/03-deploy.sh [OPTIONS]

Options:
--skip-build Skip Docker build/push, use existing images
--tag <tag> Image tag (default: git short SHA)
-h, --help Show this help
EOF
}

#######################################
# Parse the script's command-line options.
# Globals:   SKIP_BUILD (written), DEPLOY_TAG (written)
# Arguments: the script's positional parameters
# Outputs:   help text on stdout for -h/--help
# Returns:   0 on success; exits 0 after printing help; calls die (exit 1)
#            on an unknown option, a missing --tag value, or an invalid tag
#######################################
_deploy_parse_args() {
  # Docker's tag grammar: ASCII letters, digits, '_', '.', '-'; must not
  # start with '.' or '-'; at most 128 characters. Validating here stops
  # "--tag --skip-build" from swallowing the next flag, and keeps
  # characters such as '|' and '&' out of the sed image substitutions
  # performed later in this script.
  local tag_re='^[A-Za-z0-9_][A-Za-z0-9_.-]{0,127}$'

  while (( $# > 0 )); do
    case "$1" in
      --skip-build)
        SKIP_BUILD=true
        shift
        ;;
      --tag)
        [[ -n "${2:-}" ]] || die "--tag requires a value"
        [[ "$2" =~ ${tag_re} ]] || die "Invalid --tag value: $2"
        DEPLOY_TAG="$2"
        shift 2
        ;;
      -h|--help)
        _deploy_usage
        exit 0
        ;;
      *)
        die "Unknown argument: $1"
        ;;
    esac
  done
}

_deploy_parse_args "$@"
|
|
|
|
# --- Prerequisites ---

# Both CLIs are needed below; stop early with a clear message if one is
# not on PATH.
for required_tool in kubectl docker; do
  command -v "${required_tool}" >/dev/null 2>&1 || die "Missing: ${required_tool}"
done

# Without an explicit --tag, use the repo's short commit SHA; if git cannot
# provide one, fall back to the literal tag "latest".
[[ -n "${DEPLOY_TAG}" ]] \
  || DEPLOY_TAG="$(git -C "${REPO_DIR}" rev-parse --short HEAD 2>/dev/null || echo "latest")"
|
|
|
|
# --- Read registry config ---

# NOTE(review): cfg_require is not defined in this file; presumably it comes
# from _config.sh and fails when the key is missing — confirm.
REGISTRY_SERVER="$(cfg_require registry.server "Container registry server")"
REGISTRY_NS="$(cfg_require registry.namespace "Registry namespace")"
REGISTRY_USER="$(cfg_require registry.username "Registry username")"
REGISTRY_TOKEN="$(cfg_require registry.token "Registry token")"

# Join server and namespace with exactly one slash: drop one trailing slash
# from the server and one leading slash from the namespace.
REGISTRY_PREFIX="${REGISTRY_SERVER%/}/${REGISTRY_NS#/}"

# Fully-qualified, tag-addressed image names for each component.
printf -v API_IMAGE '%s/honeydue-api:%s' "${REGISTRY_PREFIX}" "${DEPLOY_TAG}"
printf -v WORKER_IMAGE '%s/honeydue-worker:%s' "${REGISTRY_PREFIX}" "${DEPLOY_TAG}"
printf -v ADMIN_IMAGE '%s/honeydue-admin:%s' "${REGISTRY_PREFIX}" "${DEPLOY_TAG}"
printf -v WEB_IMAGE '%s/honeydue-web:%s' "${REGISTRY_PREFIX}" "${DEPLOY_TAG}"
|
|
|
|
# The web client lives in a sibling repo. Resolve its absolute path once;
# the variable is left empty when that directory cannot be entered.
WEB_REPO_DIR="$(cd "${REPO_DIR}/../honeyDueAPI-Web" 2>/dev/null && pwd)" || WEB_REPO_DIR=""
|
|
|
|
# NEXT_PUBLIC_* values are baked into the client bundles at build time, so
# the API/PostHog URLs have to be supplied as build-args — setting them on
# the running pod has no effect on JS that is already bundled.
API_DOMAIN="$(cfg_require domains.api "API domain")"
WEB_API_URL="https://${API_DOMAIN}/api"
ADMIN_API_URL="https://${API_DOMAIN}"

# PostHog keys for the web client are optional and are taken from the
# operator's shell environment so they never land in a committed file.
# Default both to empty (analytics disabled) when unset.
NEXT_PUBLIC_POSTHOG_KEY="${NEXT_PUBLIC_POSTHOG_KEY:-}"
NEXT_PUBLIC_POSTHOG_HOST="${NEXT_PUBLIC_POSTHOG_HOST:-}"
|
|
|
|
# --- Build and push ---
#
# Unless --skip-build was given: log in to the registry, build every image,
# push the tag-addressed images, then re-tag and push them as :latest.

if [[ "${SKIP_BUILD}" != "false" ]]; then
  warn "Skipping build. Using images for tag: ${DEPLOY_TAG}"
else
  log "Logging in to ${REGISTRY_SERVER}..."
  # The token is fed on stdin so it never appears in the process list.
  printf '%s' "${REGISTRY_TOKEN}" \
    | docker login "${REGISTRY_SERVER}" -u "${REGISTRY_USER}" --password-stdin >/dev/null

  # k3s nodes are linux/amd64 (Hetzner CX). Force the build platform so
  # local arm64 Macs don't push images that crash with "exec format error".
  BUILD_PLATFORM="linux/amd64"

  log "Building API image: ${API_IMAGE} (${BUILD_PLATFORM})"
  docker build \
    --platform "${BUILD_PLATFORM}" \
    --target api \
    -t "${API_IMAGE}" \
    "${REPO_DIR}"

  log "Building Worker image: ${WORKER_IMAGE} (${BUILD_PLATFORM})"
  docker build \
    --platform "${BUILD_PLATFORM}" \
    --target worker \
    -t "${WORKER_IMAGE}" \
    "${REPO_DIR}"

  log "Building Admin image: ${ADMIN_IMAGE} (${BUILD_PLATFORM}, NEXT_PUBLIC_API_URL=${ADMIN_API_URL})"
  docker build \
    --platform "${BUILD_PLATFORM}" \
    --target admin \
    --build-arg "NEXT_PUBLIC_API_URL=${ADMIN_API_URL}" \
    -t "${ADMIN_IMAGE}" \
    "${REPO_DIR}"

  # The web image is built only when the sibling repo with a Dockerfile
  # is present.
  if [[ -z "${WEB_REPO_DIR}" || ! -f "${WEB_REPO_DIR}/Dockerfile" ]]; then
    warn "honeyDueAPI-Web sibling repo not found at ${WEB_REPO_DIR:-<unset>}; skipping web build"
  else
    log "Building Web image: ${WEB_IMAGE} (${BUILD_PLATFORM}, NEXT_PUBLIC_API_URL=${WEB_API_URL})"
    docker build \
      --platform "${BUILD_PLATFORM}" \
      --build-arg "NEXT_PUBLIC_API_URL=${WEB_API_URL}" \
      --build-arg "NEXT_PUBLIC_POSTHOG_KEY=${NEXT_PUBLIC_POSTHOG_KEY}" \
      --build-arg "NEXT_PUBLIC_POSTHOG_HOST=${NEXT_PUBLIC_POSTHOG_HOST}" \
      -t "${WEB_IMAGE}" \
      "${WEB_REPO_DIR}"
  fi

  log "Pushing images..."
  for built_image in "${API_IMAGE}" "${WORKER_IMAGE}" "${ADMIN_IMAGE}"; do
    docker push "${built_image}"
  done
  if [[ -n "${WEB_REPO_DIR}" && -f "${WEB_REPO_DIR}/Dockerfile" ]]; then
    docker push "${WEB_IMAGE}"
  fi

  # Also tag and push :latest
  docker tag "${API_IMAGE}" "${REGISTRY_PREFIX}/honeydue-api:latest"
  docker tag "${WORKER_IMAGE}" "${REGISTRY_PREFIX}/honeydue-worker:latest"
  docker tag "${ADMIN_IMAGE}" "${REGISTRY_PREFIX}/honeydue-admin:latest"
  for component in api worker admin; do
    docker push "${REGISTRY_PREFIX}/honeydue-${component}:latest"
  done
  if [[ -n "${WEB_REPO_DIR}" && -f "${WEB_REPO_DIR}/Dockerfile" ]]; then
    docker tag "${WEB_IMAGE}" "${REGISTRY_PREFIX}/honeydue-web:latest"
    docker push "${REGISTRY_PREFIX}/honeydue-web:latest"
  fi
fi
|
|
|
|
# --- Resolve immutable image digests (audit F5) ---
# A short-SHA tag is mutable — anyone who can push to the registry can
# overwrite it, and imagePullPolicy then pulls the new bits silently. We
# deploy by @sha256: digest instead, pinning the exact image that was just
# built and pushed. `docker push` populates RepoDigests; with --skip-build
# (no local image) resolve_ref falls back to the tag.

#######################################
# Resolve an image reference to its repo@sha256 digest form.
# Arguments: $1 - image reference (repo[:tag])
# Outputs:   the digest reference on stdout; when none is known locally,
#            the input reference unchanged plus a warning on stderr
# Returns:   0 in both cases (resolution is best-effort)
#######################################
resolve_ref() {
  local img="$1" repo digests line digest="" any_digest=""

  # Repository part of the reference: drop an "@digest" suffix, then drop
  # ":tag" only when the colon sits in the last path component, so a
  # registry port such as "localhost:5000/app" is left intact.
  repo="${img%%@*}"
  if [[ "${repo##*/}" == *:* ]]; then
    repo="${repo%:*}"
  fi

  # A failing inspect (image not present locally) yields an empty list.
  digests="$(docker inspect --format='{{range .RepoDigests}}{{println .}}{{end}}' "${img}" 2>/dev/null || true)"

  # RepoDigests can list several repositories for the same image ID.
  # Prefer the entry for the requested repository; otherwise keep the
  # first digest seen, which is what the previous implementation returned.
  while IFS= read -r line; do
    [[ "${line}" == *@sha256:* ]] || continue
    if [[ "${line%%@*}" == "${repo}" ]]; then
      digest="${line}"
      break
    fi
    [[ -n "${any_digest}" ]] || any_digest="${line}"
  done <<< "${digests}"
  digest="${digest:-${any_digest}}"

  if [[ -n "${digest}" ]]; then
    printf '%s' "${digest}"
  else
    warn "could not resolve a digest for ${img} — deploying by mutable tag"
    printf '%s' "${img}"
  fi
}
|
|
# Resolve the reference each workload will be deployed with.
API_REF="$(resolve_ref "${API_IMAGE}")"
WORKER_REF="$(resolve_ref "${WORKER_IMAGE}")"
ADMIN_REF="$(resolve_ref "${ADMIN_IMAGE}")"

# WEB_REF is only consumed by the web deployment, which is applied only when
# manifests/web exists. Skip the lookup otherwise so a cluster without the
# web app does not get a misleading "deploying by mutable tag" warning.
# The variable is still assigned so `set -u` never trips on it.
if [[ -d "${MANIFESTS}/web" ]]; then
  WEB_REF="$(resolve_ref "${WEB_IMAGE}")"
else
  WEB_REF="${WEB_IMAGE}"
fi

log "Deploying by digest:"
log " API: ${API_REF}"
log " Worker: ${WORKER_REF}"
log " Admin: ${ADMIN_REF}"
if [[ -d "${MANIFESTS}/web" ]]; then
  log " Web: ${WEB_REF}"
fi
|
|
|
|
# --- Image scan + signing (audit CODE-L5) ---
# Both steps are best-effort: the deploy does NOT fail if the tools are
# absent or report problems, so an operator who has not set up cosign/trivy
# yet is not blocked. Install trivy + cosign and export COSIGN_KEY to get
# scanning and signing. Cluster-side admission verification
# (Kyverno/Connaisseur) is a separate operator step.
if [[ "${SKIP_BUILD}" == "false" ]]; then
  if command -v trivy >/dev/null 2>&1; then
    log "Scanning images with Trivy (HIGH,CRITICAL)..."
    # With --exit-code 0 Trivy returns 0 even when it finds issues, so a
    # "findings" warning could never fire. Ask for a distinctive exit code
    # so findings can be told apart from scanner errors.
    TRIVY_FINDINGS_RC=10
    for img in "${API_IMAGE}" "${WORKER_IMAGE}" "${ADMIN_IMAGE}"; do
      scan_rc=0
      trivy image --severity HIGH,CRITICAL --exit-code "${TRIVY_FINDINGS_RC}" --quiet "${img}" \
        || scan_rc=$?
      if (( scan_rc == TRIVY_FINDINGS_RC )); then
        warn "Trivy reported findings for ${img}"
      elif (( scan_rc != 0 )); then
        warn "Trivy scan failed for ${img} (exit ${scan_rc})"
      fi
    done
  else
    warn "trivy not installed — skipping image vulnerability scan (audit L5)"
  fi

  if command -v cosign >/dev/null 2>&1 && [[ -n "${COSIGN_KEY:-}" ]]; then
    log "Signing images with cosign..."
    for ref in "${API_REF}" "${WORKER_REF}" "${ADMIN_REF}"; do
      # A signing failure is reported but does not abort the deploy.
      cosign sign --yes --key "${COSIGN_KEY}" "${ref}" || warn "cosign sign failed for ${ref}"
    done
  else
    warn "cosign not configured (need cosign + COSIGN_KEY) — skipping image signing (audit L5)"
  fi
fi
|
|
|
|
# --- Generate and apply ConfigMap from config.yaml ---

log "Generating env from config.yaml..."
# Temporary env file, removed again on every exit path.
ENV_FILE="$(mktemp)"
trap 'rm -f "${ENV_FILE}"' EXIT
# NOTE(review): generate_env is not defined in this file; presumably it is
# provided by _config.sh — confirm.
generate_env > "${ENV_FILE}"

log "Creating ConfigMap..."
# Render the ConfigMap client-side and pipe it through `kubectl apply`, so
# an already-existing ConfigMap is updated instead of causing an error.
configmap_args=(
  --namespace="${NAMESPACE}"
  --from-env-file="${ENV_FILE}"
  --dry-run=client
  -o yaml
)
kubectl create configmap honeydue-config "${configmap_args[@]}" \
  | kubectl apply -f -
|
|
|
|
# --- Apply manifests ---

log "Applying manifests..."

# Applied in this order: namespace, NetworkPolicies, redis, ingress.
#
# NetworkPolicies come right after the namespace — default-deny-all plus
# per-app allow rules. These MUST be applied; without them the cluster falls
# back to default-allow (worse posture) AND the vmagent egress rule for :6443
# (which fixes a k3s post-DNAT enforcement quirk for k8s API discovery) is
# missing. See deploy-k3s/RUNBOOK.md ("vmagent SD broken on fresh deploy").
for base_manifest in namespace.yaml network-policies.yaml redis/ ingress/; do
  kubectl apply -f "${MANIFESTS}/${base_manifest}"
done
|
|
|
|
# --- Run migrations BEFORE rolling api/worker ---
#
# goose-based migration Job. Any prior Job is deleted first (Jobs are
# immutable — applying a duplicate name otherwise fails), then a fresh one
# is applied with the new api image (which includes /usr/local/bin/goose and
# /app/migrations), and the script blocks until it succeeds. A failure
# aborts the deploy before any new app pod sees a stale schema.
log "Running database migrations (goose Job)..."
kubectl delete job honeydue-migrate -n "${NAMESPACE}" --ignore-not-found --wait=true >/dev/null

# Substitute the image placeholder in the Job manifest on the fly.
sed "s|image: IMAGE_PLACEHOLDER|image: ${API_REF}|" "${MANIFESTS}/migrate/job.yaml" \
  | kubectl apply -f -

if kubectl wait --namespace="${NAMESPACE}" --for=condition=complete --timeout=10m job/honeydue-migrate; then
  log "Migrations applied; proceeding with api/worker rollout"
else
  warn "migration Job failed — see logs:"
  # Show the tail of the Job's log; a failure to fetch it must not hide
  # the real error below.
  kubectl logs -n "${NAMESPACE}" job/honeydue-migrate --tail=200 || true
  die "migrations did not complete cleanly; aborting deploy"
fi
|
|
|
|
# Apply deployments with image substitution

# Replace the image placeholder in a manifest and apply the result.
# Arguments: $1 - image reference to deploy, $2 - path to the manifest
_apply_with_image() {
  local image_ref="$1" manifest_path="$2"
  sed "s|image: IMAGE_PLACEHOLDER|image: ${image_ref}|" "${manifest_path}" \
    | kubectl apply -f -
}

_apply_with_image "${API_REF}" "${MANIFESTS}/api/deployment.yaml"
for api_manifest in service.yaml hpa.yaml; do
  kubectl apply -f "${MANIFESTS}/api/${api_manifest}"
done

_apply_with_image "${WORKER_REF}" "${MANIFESTS}/worker/deployment.yaml"

_apply_with_image "${ADMIN_REF}" "${MANIFESTS}/admin/deployment.yaml"
kubectl apply -f "${MANIFESTS}/admin/service.yaml"

# The web app is optional: deploy it only when its manifests are present.
if [[ -d "${MANIFESTS}/web" ]]; then
  _apply_with_image "${WEB_REF}" "${MANIFESTS}/web/deployment.yaml"
  kubectl apply -f "${MANIFESTS}/web/service.yaml"
fi
|
|
|
|
# Observability — vmagent scrapes api Pods :8000/metrics + kube-state-metrics
# :8080/metrics and remote-writes everything to obs.88oakapps.com. The bearer
# token comes from deploy/prod.env so it stays out of the repo; the manifest
# holds TOKEN_PLACEHOLDER. kube-state-metrics provides the kube_* metrics
# Grafana panels need to count pods, deployments, etc.

# Escape a string for literal use as the replacement of a sed
# `s|pattern|replacement|` command: backslash, '&' (whole match) and the '|'
# delimiter each get a leading backslash.
# Arguments: $1 - raw string
# Outputs:   escaped string on stdout
_sed_escape_replacement() {
  printf '%s' "$1" | sed -e 's/[\\&|]/\\&/g'
}

if [[ -d "${MANIFESTS}/observability" ]]; then
  # kube-state-metrics — no secrets, plain apply
  kubectl apply -f "${MANIFESTS}/observability/kube-state-metrics.yaml"

  # vmagent — needs the bearer-token substitution
  # prod.env lives at the repo's deploy/ dir (sibling of deploy-k3s/), not
  # under deploy-k3s/. It's gitignored — operator copies values there once.
  # A missing file or key yields an empty token rather than an error.
  OBS_TOKEN="$(grep -E '^OBS_INGEST_TOKEN=' "${REPO_DIR}/deploy/prod.env" 2>/dev/null | cut -d= -f2- || true)"
  if [[ -z "${OBS_TOKEN}" ]]; then
    warn "OBS_INGEST_TOKEN not found in deploy/prod.env — skipping vmagent apply"
  else
    # Escape the token first: a raw '&', '|' or '\' in it would otherwise
    # be interpreted by sed and corrupt the rendered manifest.
    sed "s|TOKEN_PLACEHOLDER|$(_sed_escape_replacement "${OBS_TOKEN}")|" \
      "${MANIFESTS}/observability/vmagent.yaml" | kubectl apply -f -
  fi
fi
|
|
|
|
# --- Wait for rollouts ---

log "Waiting for rollouts..."

# redis gets a shorter timeout than the application workloads.
kubectl rollout status deployment/redis -n "${NAMESPACE}" --timeout=120s
for workload in api worker admin; do
  kubectl rollout status "deployment/${workload}" -n "${NAMESPACE}" --timeout=300s
done

# web is waited on only when its manifests directory exists.
if [[ -d "${MANIFESTS}/web" ]]; then
  kubectl rollout status deployment/web -n "${NAMESPACE}" --timeout=300s
fi

# vmagent is waited on only when the Deployment exists in the cluster.
if kubectl -n "${NAMESPACE}" get deployment vmagent >/dev/null 2>&1; then
  kubectl rollout status deployment/vmagent -n "${NAMESPACE}" --timeout=120s
fi
|
|
|
|
# --- Done ---

# Collect the summary lines first, then emit them through log in order.
summary_lines=(
  ""
  "Deploy completed successfully."
  "Tag: ${DEPLOY_TAG}"
  "Images:"
  " API: ${API_IMAGE}"
  " Worker: ${WORKER_IMAGE}"
  " Admin: ${ADMIN_IMAGE}"
)
# The web image is listed only when the web manifests exist.
if [[ -d "${MANIFESTS}/web" ]]; then
  summary_lines+=(" Web: ${WEB_IMAGE}")
fi
summary_lines+=(
  ""
  "Run ./scripts/04-verify.sh to check cluster health."
)

for summary_line in "${summary_lines[@]}"; do
  log "${summary_line}"
done
|