#!/usr/bin/env bash
#
# Build, push and roll out the honeydue images to the cluster.
#
# This first section enables strict mode, loads the shared config helpers,
# defines the logging helpers, parses the command line, checks required
# tools and derives the fully-qualified image names used by later steps.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=_config.sh
source "${SCRIPT_DIR}/_config.sh"

# NOTE(review): DEPLOY_DIR and cfg_require are not defined in this file;
# presumably they are provided by _config.sh — confirm.
REPO_DIR="$(cd "${DEPLOY_DIR}/.." && pwd)"
NAMESPACE="honeydue"
MANIFESTS="${DEPLOY_DIR}/manifests"

# Logging helpers: log -> stdout, warn -> stderr, die -> stderr + exit 1.
log()  { printf '[deploy] %s\n' "$*"; }
warn() { printf '[deploy][warn] %s\n' "$*" >&2; }
die()  { printf '[deploy][error] %s\n' "$*" >&2; exit 1; }

# Print command-line help to stdout.
usage() {
  cat <<'EOF'
Usage: ./scripts/03-deploy.sh [OPTIONS]

Options:
  --skip-build   Skip Docker build/push, use existing images
  --tag TAG      Image tag (default: git short SHA)
  -h, --help     Show this help
EOF
}

# --- Parse arguments ---

SKIP_BUILD=false
DEPLOY_TAG=""

while (( $# > 0 )); do
  case "$1" in
    --skip-build)
      SKIP_BUILD=true
      shift
      ;;
    --tag)
      [[ -n "${2:-}" ]] || die "--tag requires a value"
      DEPLOY_TAG="$2"
      shift 2
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      die "Unknown argument: $1"
      ;;
  esac
done

# --- Prerequisites ---

command -v kubectl >/dev/null 2>&1 || die "Missing: kubectl"
command -v docker >/dev/null 2>&1 || die "Missing: docker"

# Default tag: short SHA of the repo HEAD, or "latest" when git cannot
# provide one.
if [[ -z "${DEPLOY_TAG}" ]]; then
  DEPLOY_TAG="$(git -C "${REPO_DIR}" rev-parse --short HEAD 2>/dev/null || echo "latest")"
fi

# The tag is spliced into image references and into sed programs further
# down, so reject anything outside Docker's tag grammar up front. This also
# catches a swallowed flag such as `--tag --skip-build`.
tag_pattern='^[A-Za-z0-9_][A-Za-z0-9_.-]{0,127}$'
[[ "${DEPLOY_TAG}" =~ ${tag_pattern} ]] \
  || die "Invalid image tag: ${DEPLOY_TAG}"

# --- Read registry config ---

REGISTRY_SERVER="$(cfg_require registry.server "Container registry server")"
REGISTRY_NS="$(cfg_require registry.namespace "Registry namespace")"
REGISTRY_USER="$(cfg_require registry.username "Registry username")"
REGISTRY_TOKEN="$(cfg_require registry.token "Registry token")"

# Join server and namespace with exactly one slash, tolerating a trailing
# slash on the server and a leading slash on the namespace.
REGISTRY_PREFIX="${REGISTRY_SERVER%/}/${REGISTRY_NS#/}"
API_IMAGE="${REGISTRY_PREFIX}/honeydue-api:${DEPLOY_TAG}"
WORKER_IMAGE="${REGISTRY_PREFIX}/honeydue-worker:${DEPLOY_TAG}"
ADMIN_IMAGE="${REGISTRY_PREFIX}/honeydue-admin:${DEPLOY_TAG}"
WEB_IMAGE="${REGISTRY_PREFIX}/honeydue-web:${DEPLOY_TAG}"

# The web client lives in a sibling repo. Resolve its path once.
# WEB_REPO_PATH is where the checkout is expected; WEB_REPO_DIR is its
# canonical path, or the empty string when the directory cannot be entered.
WEB_REPO_PATH="${REPO_DIR}/../honeyDueAPI-Web"
WEB_REPO_DIR="$(cd "${WEB_REPO_PATH}" 2>/dev/null && pwd)" || WEB_REPO_DIR=""

# Decide once whether a web image can be built, so the build, push and
# :latest steps below cannot drift apart.
BUILD_WEB=false
if [[ -n "${WEB_REPO_DIR}" && -f "${WEB_REPO_DIR}/Dockerfile" ]]; then
  BUILD_WEB=true
fi

# NEXT_PUBLIC_* is baked into client bundles at build time, so API/PostHog
# URLs must be passed as build-args — setting them at pod runtime has no
# effect on already-bundled JS.
API_DOMAIN="$(cfg_require domains.api "API domain")"
ADMIN_API_URL="https://${API_DOMAIN}"
WEB_API_URL="https://${API_DOMAIN}/api"

# PostHog keys for the web client are optional — read from operator shell
# env so they never land in a committed file. Empty disables analytics.
: "${NEXT_PUBLIC_POSTHOG_KEY:=}"
: "${NEXT_PUBLIC_POSTHOG_HOST:=}"

# --- Build and push ---

if [[ "${SKIP_BUILD}" == "false" ]]; then
  log "Logging in to ${REGISTRY_SERVER}..."
  # The token goes in on stdin so it never appears in the process list.
  printf '%s' "${REGISTRY_TOKEN}" \
    | docker login "${REGISTRY_SERVER}" -u "${REGISTRY_USER}" --password-stdin >/dev/null

  # k3s nodes are linux/amd64 (Hetzner CX). Force the build platform so
  # local arm64 Macs don't push images that crash with "exec format error".
  BUILD_PLATFORM="linux/amd64"

  log "Building API image: ${API_IMAGE} (${BUILD_PLATFORM})"
  docker build --platform "${BUILD_PLATFORM}" --target api -t "${API_IMAGE}" "${REPO_DIR}"

  log "Building Worker image: ${WORKER_IMAGE} (${BUILD_PLATFORM})"
  docker build --platform "${BUILD_PLATFORM}" --target worker -t "${WORKER_IMAGE}" "${REPO_DIR}"

  log "Building Admin image: ${ADMIN_IMAGE} (${BUILD_PLATFORM}, NEXT_PUBLIC_API_URL=${ADMIN_API_URL})"
  docker build --platform "${BUILD_PLATFORM}" --target admin \
    --build-arg "NEXT_PUBLIC_API_URL=${ADMIN_API_URL}" \
    -t "${ADMIN_IMAGE}" "${REPO_DIR}"

  if [[ "${BUILD_WEB}" == "true" ]]; then
    log "Building Web image: ${WEB_IMAGE} (${BUILD_PLATFORM}, NEXT_PUBLIC_API_URL=${WEB_API_URL})"
    docker build --platform "${BUILD_PLATFORM}" \
      --build-arg "NEXT_PUBLIC_API_URL=${WEB_API_URL}" \
      --build-arg "NEXT_PUBLIC_POSTHOG_KEY=${NEXT_PUBLIC_POSTHOG_KEY}" \
      --build-arg "NEXT_PUBLIC_POSTHOG_HOST=${NEXT_PUBLIC_POSTHOG_HOST}" \
      -t "${WEB_IMAGE}" "${WEB_REPO_DIR}"
  elif [[ -z "${WEB_REPO_DIR}" ]]; then
    # WEB_REPO_DIR is empty in this branch, so report the path that was tried.
    warn "honeyDueAPI-Web sibling repo not found at ${WEB_REPO_PATH}; skipping web build"
  else
    warn "no Dockerfile found in ${WEB_REPO_DIR}; skipping web build"
  fi

  log "Pushing images..."
  docker push "${API_IMAGE}"
  docker push "${WORKER_IMAGE}"
  docker push "${ADMIN_IMAGE}"
  if [[ "${BUILD_WEB}" == "true" ]]; then
    docker push "${WEB_IMAGE}"
  fi

  # Also tag and push :latest
  docker tag "${API_IMAGE}" "${REGISTRY_PREFIX}/honeydue-api:latest"
  docker tag "${WORKER_IMAGE}" "${REGISTRY_PREFIX}/honeydue-worker:latest"
  docker tag "${ADMIN_IMAGE}" "${REGISTRY_PREFIX}/honeydue-admin:latest"
  docker push "${REGISTRY_PREFIX}/honeydue-api:latest"
  docker push "${REGISTRY_PREFIX}/honeydue-worker:latest"
  docker push "${REGISTRY_PREFIX}/honeydue-admin:latest"
  if [[ "${BUILD_WEB}" == "true" ]]; then
    docker tag "${WEB_IMAGE}" "${REGISTRY_PREFIX}/honeydue-web:latest"
    docker push "${REGISTRY_PREFIX}/honeydue-web:latest"
  fi
else
  warn "Skipping build. Using images for tag: ${DEPLOY_TAG}"
fi

# --- Resolve immutable image digests (audit F5) ---
# A short-SHA tag is mutable — anyone who can push to the registry can
# overwrite it, and imagePullPolicy then pulls the new bits silently. We
# deploy by @sha256: digest instead, pinning the exact image that was just
# built and pushed. `docker push` populates RepoDigests; with --skip-build
# (no local image) resolve_ref falls back to the tag.
#######################################
# Resolve an image reference to an immutable repo@sha256:… digest.
# Arguments: $1 - image reference, normally registry/ns/name:tag
# Outputs:   stdout - the RepoDigest for the image's own repository if the
#            local docker daemon knows one; otherwise the first RepoDigest
#            listed; otherwise $1 unchanged (after a warning on stderr).
# Returns:   0 in all of the above cases.
#######################################
resolve_ref() {
  local img="$1" repo last_component entry fallback=""

  # Strip any @digest, then strip the :tag — but only when the colon sits in
  # the last path component, so a registry port (host:5000/…) is kept.
  repo="${img%@*}"
  last_component="${repo##*/}"
  if [[ "${last_component}" == *:* ]]; then
    repo="${repo%:*}"
  fi

  # RepoDigests holds one entry per repository the image has been pushed to
  # or pulled from. Prefer the entry for the repository we deploy from;
  # keep the first entry of any repository as a fallback.
  while IFS= read -r entry; do
    [[ "${entry}" == *@sha256:* ]] || continue
    if [[ "${entry}" == "${repo}@sha256:"* ]]; then
      printf '%s' "${entry}"
      return 0
    fi
    [[ -n "${fallback}" ]] || fallback="${entry}"
  done < <(docker inspect --format='{{range .RepoDigests}}{{println .}}{{end}}' "${img}" 2>/dev/null)

  if [[ -n "${fallback}" ]]; then
    printf '%s' "${fallback}"
  else
    warn "could not resolve a digest for ${img} — deploying by mutable tag"
    printf '%s' "${img}"
  fi
}

API_REF="$(resolve_ref "${API_IMAGE}")"
WORKER_REF="$(resolve_ref "${WORKER_IMAGE}")"
ADMIN_REF="$(resolve_ref "${ADMIN_IMAGE}")"
WEB_REF="$(resolve_ref "${WEB_IMAGE}")"

log "Deploying by digest:"
log " API: ${API_REF}"
log " Worker: ${WORKER_REF}"
log " Admin: ${ADMIN_REF}"

# --- Image scan + signing (audit CODE-L5) ---
# Both steps are best-effort: the deploy does NOT fail if the tools are
# absent, so an operator who has not set up cosign/trivy yet is not blocked.
# Install trivy + cosign and export COSIGN_KEY to enforce. Cluster-side
# admission verification (Kyverno/Connaisseur) is a separate operator step.
if [[ "${SKIP_BUILD}" == "false" ]]; then
  if command -v trivy >/dev/null 2>&1; then
    log "Scanning images with Trivy (HIGH,CRITICAL)..."
    for img in "${API_IMAGE}" "${WORKER_IMAGE}" "${ADMIN_IMAGE}"; do
      # --exit-code 1 makes findings surface as a non-zero status so the
      # warning below actually fires; `|| warn` keeps the scan non-fatal.
      trivy image --severity HIGH,CRITICAL --exit-code 1 --quiet "${img}" \
        || warn "Trivy reported findings for ${img}"
    done
  else
    warn "trivy not installed — skipping image vulnerability scan (audit L5)"
  fi

  if command -v cosign >/dev/null 2>&1 && [[ -n "${COSIGN_KEY:-}" ]]; then
    log "Signing images with cosign..."
    for ref in "${API_REF}" "${WORKER_REF}" "${ADMIN_REF}"; do
      cosign sign --yes --key "${COSIGN_KEY}" "${ref}" \
        || warn "cosign sign failed for ${ref}"
    done
  else
    warn "cosign not configured (need cosign + COSIGN_KEY) — skipping image signing (audit L5)"
  fi
fi

# --- Generate and apply ConfigMap from config.yaml ---
log "Generating env from config.yaml..."
# The env file is removed on every exit path by the EXIT trap.
# NOTE(review): generate_env is not defined in this file; presumably it
# comes from _config.sh — confirm.
ENV_FILE="$(mktemp)"
trap 'rm -f "${ENV_FILE}"' EXIT
generate_env > "${ENV_FILE}"

log "Creating ConfigMap..."
#######################################
# Escape a value for literal use as the replacement half of a sed
# `s|pattern|replacement|` command: backslash, `&` and the `|` delimiter
# each get a leading backslash.
# Arguments: $1 - raw value
# Outputs:   stdout - escaped value
#######################################
sed_escape_replacement() {
  printf '%s' "$1" | sed -e 's/[\\&|]/\\&/g'
}

#######################################
# Print a manifest with the first occurrence of a placeholder on each line
# replaced by a literal value.
# Arguments: $1 - placeholder text (used as the sed pattern, so it must not
#                 contain regex metacharacters or `|`)
#            $2 - replacement value, inserted literally
#            $3 - manifest file path
# Outputs:   stdout - the rendered manifest
#######################################
render_manifest() {
  local placeholder="$1" value="$2" file="$3" escaped
  escaped="$(sed_escape_replacement "${value}")"
  # The sed program is passed via -f rather than argv so that a secret value
  # is not visible in the process list.
  sed -f <(printf 's|%s|%s|\n' "${placeholder}" "${escaped}") "${file}"
}

# Namespace first: the ConfigMap below is namespaced, so applying it before
# the namespace exists fails on a fresh cluster.
kubectl apply -f "${MANIFESTS}/namespace.yaml"

kubectl create configmap honeydue-config \
  --namespace="${NAMESPACE}" \
  --from-env-file="${ENV_FILE}" \
  --dry-run=client -o yaml \
  | kubectl apply -f -

# --- Apply manifests ---
log "Applying manifests..."

# NetworkPolicies first — default-deny-all + per-app allow rules.
# These MUST be applied; without them the cluster falls back to default-allow
# (worse posture) AND the vmagent egress rule for :6443 (which fixes a k3s
# post-DNAT enforcement quirk for k8s API discovery) is missing.
# See deploy-k3s/RUNBOOK.md ("vmagent SD broken on fresh deploy").
kubectl apply -f "${MANIFESTS}/network-policies.yaml"

kubectl apply -f "${MANIFESTS}/redis/"
kubectl apply -f "${MANIFESTS}/ingress/"

# --- Run migrations BEFORE rolling api/worker ---
#
# goose-based migration Job. We delete any prior Job (Jobs are immutable —
# applying a duplicate name otherwise fails), apply a fresh one with the new
# api image (which includes /usr/local/bin/goose and /app/migrations), and
# block until it succeeds. A failure aborts the deploy before any new app
# pod sees a stale schema.
log "Running database migrations (goose Job)..."
kubectl delete job honeydue-migrate -n "${NAMESPACE}" --ignore-not-found --wait=true >/dev/null
render_manifest "image: IMAGE_PLACEHOLDER" "image: ${API_REF}" "${MANIFESTS}/migrate/job.yaml" \
  | kubectl apply -f -
if ! kubectl wait --namespace="${NAMESPACE}" --for=condition=complete --timeout=10m job/honeydue-migrate; then
  warn "migration Job failed — see logs:"
  kubectl logs -n "${NAMESPACE}" job/honeydue-migrate --tail=200 || true
  die "migrations did not complete cleanly; aborting deploy"
fi
log "Migrations applied; proceeding with api/worker rollout"

# Apply deployments with image substitution
render_manifest "image: IMAGE_PLACEHOLDER" "image: ${API_REF}" "${MANIFESTS}/api/deployment.yaml" \
  | kubectl apply -f -
kubectl apply -f "${MANIFESTS}/api/service.yaml"
kubectl apply -f "${MANIFESTS}/api/hpa.yaml"

render_manifest "image: IMAGE_PLACEHOLDER" "image: ${WORKER_REF}" "${MANIFESTS}/worker/deployment.yaml" \
  | kubectl apply -f -

render_manifest "image: IMAGE_PLACEHOLDER" "image: ${ADMIN_REF}" "${MANIFESTS}/admin/deployment.yaml" \
  | kubectl apply -f -
kubectl apply -f "${MANIFESTS}/admin/service.yaml"

if [[ -d "${MANIFESTS}/web" ]]; then
  render_manifest "image: IMAGE_PLACEHOLDER" "image: ${WEB_REF}" "${MANIFESTS}/web/deployment.yaml" \
    | kubectl apply -f -
  kubectl apply -f "${MANIFESTS}/web/service.yaml"
fi

# Observability — vmagent scrapes api Pods :8000/metrics + kube-state-metrics
# :8080/metrics and remote-writes everything to obs.88oakapps.com. The bearer
# token comes from deploy/prod.env so it stays out of the repo; the manifest
# holds TOKEN_PLACEHOLDER. kube-state-metrics provides the kube_* metrics
# Grafana panels need to count pods, deployments, etc.
if [[ -d "${MANIFESTS}/observability" ]]; then
  # kube-state-metrics — no secrets, plain apply
  kubectl apply -f "${MANIFESTS}/observability/kube-state-metrics.yaml"

  # vmagent — needs the bearer-token substitution
  # prod.env lives at the repo's deploy/ dir (sibling of deploy-k3s/), not
  # under deploy-k3s/. It's gitignored — operator copies values there once.
  # A missing file or missing key yields an empty token (hence `|| true`
  # under pipefail) and is handled by the check below.
  OBS_TOKEN="$(grep -E '^OBS_INGEST_TOKEN=' "${REPO_DIR}/deploy/prod.env" 2>/dev/null | cut -d= -f2- || true)"
  if [[ -z "${OBS_TOKEN}" ]]; then
    warn "OBS_INGEST_TOKEN not found in deploy/prod.env — skipping vmagent + alloy-logs apply"
  else
    render_manifest "TOKEN_PLACEHOLDER" "${OBS_TOKEN}" "${MANIFESTS}/observability/vmagent.yaml" \
      | kubectl apply -f -
    # alloy-logs — DaemonSet that tails honeydue pod logs and pushes them to
    # Loki at obs.88oakapps.com. Same OBS_INGEST_TOKEN as vmagent.
    if [[ -f "${MANIFESTS}/observability/alloy-logs.yaml" ]]; then
      render_manifest "TOKEN_PLACEHOLDER" "${OBS_TOKEN}" "${MANIFESTS}/observability/alloy-logs.yaml" \
        | kubectl apply -f -
    fi
  fi
fi

# --- Ory Kratos (identity service) ---
# Applied only when kratos-secrets exists — i.e. the operator has completed the
# Kratos prerequisites in deploy-k3s/manifests/kratos/README.md. Otherwise
# skipped, so the existing stack deploys unaffected.
if kubectl -n "${NAMESPACE}" get secret kratos-secrets >/dev/null 2>&1; then
  log "Deploying Ory Kratos..."
  kubectl apply -f "${MANIFESTS}/kratos/configmap.yaml"
  # The migrate Job is immutable — delete any prior run, then apply + wait.
  kubectl delete job kratos-migrate -n "${NAMESPACE}" --ignore-not-found --wait=true >/dev/null
  kubectl apply -f "${MANIFESTS}/kratos/migrate-job.yaml"
  if ! kubectl wait --namespace="${NAMESPACE}" --for=condition=complete --timeout=5m job/kratos-migrate; then
    warn "Kratos migration Job failed — logs:"
    kubectl logs -n "${NAMESPACE}" job/kratos-migrate --tail=100 || true
    die "aborting: Kratos schema migration failed"
  fi
  kubectl apply -f "${MANIFESTS}/kratos/kratos.yaml"
  kubectl apply -f "${MANIFESTS}/kratos/ingress.yaml"
else
  log "kratos-secrets not present — skipping Kratos deploy (see manifests/kratos/README.md)."
fi

# --- Wait for rollouts ---
log "Waiting for rollouts..."
# Block until the given workload (kind/name) has rolled out, or the timeout
# elapses; a non-zero status aborts the script under `set -e`.
await_rollout() {
  kubectl rollout status "$1" -n "${NAMESPACE}" --timeout="$2"
}

# Same as await_rollout, but only when the workload exists in the namespace.
await_rollout_if_present() {
  local kind="$1" name="$2" timeout="$3"
  if kubectl -n "${NAMESPACE}" get "${kind}" "${name}" >/dev/null 2>&1; then
    await_rollout "${kind}/${name}" "${timeout}"
  fi
}

# Core workloads are always waited on, in this order (target:timeout).
for rollout_spec in \
  deployment/redis:120s \
  deployment/api:300s \
  deployment/worker:300s \
  deployment/admin:300s; do
  await_rollout "${rollout_spec%%:*}" "${rollout_spec##*:}"
done

# Web is waited on whenever its manifests directory exists.
if [[ -d "${MANIFESTS}/web" ]]; then
  await_rollout deployment/web 300s
fi

# Optional workloads: only waited on when present in the cluster.
await_rollout_if_present deployment vmagent 120s
await_rollout_if_present daemonset alloy-logs 120s
await_rollout_if_present deployment kratos 180s

# --- Done ---
summary_lines=(
  ""
  "Deploy completed successfully."
  "Tag: ${DEPLOY_TAG}"
  "Images:"
  " API: ${API_IMAGE}"
  " Worker: ${WORKER_IMAGE}"
  " Admin: ${ADMIN_IMAGE}"
)
if [[ -d "${MANIFESTS}/web" ]]; then
  summary_lines+=(" Web: ${WEB_IMAGE}")
fi
summary_lines+=(
  ""
  "Run ./scripts/04-verify.sh to check cluster health."
)

for summary_line in "${summary_lines[@]}"; do
  log "${summary_line}"
done