feat(observability): ship pod logs to Loki via Grafana Alloy
Backend CI / Test (push) Has been cancelled
Backend CI / Contract Tests (push) Has been cancelled
Backend CI / Lint (push) Has been cancelled
Backend CI / Secret Scanning (push) Has been cancelled
Backend CI / Build (push) Has been cancelled

Adds a Grafana Alloy DaemonSet that tails honeydue-namespace pod logs
from /var/log/pods and pushes them to Loki at obs.88oakapps.com,
reusing the existing OBS_INGEST_TOKEN (14-day retention).

- deploy-k3s/manifests/observability/alloy-logs.yaml — DaemonSet + RBAC
  + token Secret + Alloy config. Runs as root (/var/log/pods is 0750
  root:root) but otherwise locked down: all caps dropped, read-only
  root filesystem, seccomp RuntimeDefault, read-only hostPath mount.
- network-policies.yaml — allow-egress-from-alloy-logs (DNS + k8s API
  + obs HTTPS), mirroring the vmagent egress policy.
- 03-deploy.sh — applies alloy-logs with the OBS_INGEST_TOKEN
  substitution and waits for the DaemonSet rollout.

The Loki container, nginx /loki/api/v1/push route, and Grafana Loki
datasource live on the obs server and are not repo-managed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-05-17 20:04:09 -05:00
parent c77ff07ce9
commit 93fddc3769
3 changed files with 320 additions and 1 deletions
@@ -372,3 +372,57 @@ spec:
ports: ports:
- port: 8000 - port: 8000
protocol: TCP protocol: TCP
---
# Egress policy for the alloy-logs pods.
#
# Grafana Alloy finds honeydue pods through the Kubernetes API and ships their
# logs to Loki at obs.88oakapps.com. k3s has the same NetworkPolicy DNAT
# gotcha here as for vmagent: API-server traffic is policy-checked as
# dst=<node_public_ip>:6443, so an explicit :6443 rule is mandatory.
# Logs are read as FILES from a hostPath, so the pods need no ingress and no
# egress to pod :8000/:8080 — only DNS, the API server, and HTTPS to obs.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: allow-egress-from-alloy-logs
  namespace: honeydue
spec:
  podSelector:
    matchLabels:
      app.kubernetes.io/name: alloy-logs
  policyTypes:
    - Egress
  egress:
    # 1. Cluster-internal DNS, over both UDP and TCP.
    - to:
        - namespaceSelector: {}
      ports:
        - protocol: UDP
          port: 53
        - protocol: TCP
          port: 53
    # 2. Kubernetes API server reached through its ClusterIP (pre-DNAT view).
    - to:
        - ipBlock:
            cidr: 10.43.0.0/16
      ports:
        - protocol: TCP
          port: 443
    # 3. Kubernetes API server after DNAT — the path the k3s NetworkPolicy
    #    enforcer actually sees. REQUIRED.
    - to:
        - ipBlock:
            cidr: 0.0.0.0/0
            except:
              - 10.42.0.0/16
      ports:
        - protocol: TCP
          port: 6443
    # 4. Public HTTPS: the log push to obs.88oakapps.com (via Cloudflare).
    - to:
        - ipBlock:
            cidr: 0.0.0.0/0
            except:
              - 10.42.0.0/16
              - 10.43.0.0/16
      ports:
        - protocol: TCP
          port: 443
@@ -0,0 +1,257 @@
# honeyDue log shipper — Grafana Alloy as a DaemonSet.
#
# Each node runs one Alloy pod that tails the honeydue-namespace pod logs in
# /var/log/pods and pushes them to Loki at obs.88oakapps.com/loki/api/v1/push
# (the same nginx ingest endpoint + bearer token vmagent uses for metrics).
#
# Runs as root: /var/log/pods is 0750 root:root on the k3s nodes, so a
# non-root uid cannot even traverse it. The container is otherwise locked
# down — all capabilities dropped, read-only root filesystem, seccomp
# RuntimeDefault — and root inside the container reads only a read-only
# hostPath mount of /var/log/pods. This is the one root-running workload in
# the namespace (standard for log collectors); see docs/deployment.
#
# 03-deploy.sh substitutes TOKEN_PLACEHOLDER with OBS_INGEST_TOKEN from
# deploy/prod.env before applying — the token never lands in the repo.
---
# Identity the alloy-logs DaemonSet pods run under
# (referenced by serviceAccountName in the pod template).
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: honeydue
  name: alloy-logs
  labels:
    app.kubernetes.io/part-of: honeydue
    app.kubernetes.io/name: alloy-logs
---
# Least privilege: discovery.kubernetes in Alloy only needs to list/watch
# pods, and only inside the honeydue namespace — hence a namespaced Role
# rather than a ClusterRole.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  namespace: honeydue
  name: alloy-logs
  labels:
    app.kubernetes.io/part-of: honeydue
    app.kubernetes.io/name: alloy-logs
rules:
  # Read-only access to pods in the core ("") API group.
  - apiGroups:
      - ""
    resources:
      - pods
    verbs:
      - get
      - list
      - watch
---
# Grants the alloy-logs Role to the alloy-logs ServiceAccount (honeydue only).
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  namespace: honeydue
  name: alloy-logs
  labels:
    app.kubernetes.io/part-of: honeydue
    app.kubernetes.io/name: alloy-logs
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: alloy-logs
subjects:
  - kind: ServiceAccount
    namespace: honeydue
    name: alloy-logs
---
# Bearer token for the Loki push endpoint. 03-deploy.sh swaps the placeholder
# below for OBS_INGEST_TOKEN (the same token vmagent uses) before applying,
# so the real token never lands in the repo.
apiVersion: v1
kind: Secret
metadata:
  name: alloy-logs-auth
  namespace: honeydue
  labels:
    app.kubernetes.io/name: alloy-logs
    app.kubernetes.io/part-of: honeydue
type: Opaque
stringData:
  # Keep the quotes: the substituted token must always parse as a YAML string.
  # Unquoted, an all-digit or float-looking token (e.g. 123e4567) would be
  # typed as a number and rejected for stringData, and a token containing
  # " #" or ": " would be truncated or mis-parsed.
  bearer_token: 'TOKEN_PLACEHOLDER'
---
# Alloy pipeline definition for the log shipper. Component chain:
#   discovery.kubernetes -> discovery.relabel -> local.file_match
#   -> loki.source.file -> loki.process (CRI parsing) -> loki.write
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: honeydue
  name: alloy-logs
  labels:
    app.kubernetes.io/part-of: honeydue
    app.kubernetes.io/name: alloy-logs
data:
  config.alloy: |
    // honeyDue log shipper. Each DaemonSet instance discovers honeydue-namespace
    // pods via the Kubernetes API, tails the container log files present on its
    // own node (/var/log/pods), and pushes them to Loki at obs.88oakapps.com.
    logging {
      level = "warn"
      format = "logfmt"
    }
    discovery.kubernetes "pods" {
      role = "pod"
      namespaces {
        names = ["honeydue"]
      }
    }
    // Turn pod metadata into Loki labels and build the on-disk log path.
    discovery.relabel "pod_logs" {
      targets = discovery.kubernetes.pods.targets
      rule {
        source_labels = ["__meta_kubernetes_namespace"]
        action = "replace"
        target_label = "namespace"
      }
      rule {
        source_labels = ["__meta_kubernetes_pod_name"]
        action = "replace"
        target_label = "pod"
      }
      rule {
        source_labels = ["__meta_kubernetes_pod_container_name"]
        action = "replace"
        target_label = "container"
      }
      rule {
        source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"]
        action = "replace"
        target_label = "app"
      }
      rule {
        source_labels = ["__meta_kubernetes_pod_node_name"]
        action = "replace"
        target_label = "node"
      }
      // /var/log/pods/<namespace>_<pod>_<uid>/<container>/<n>.log
      rule {
        source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"]
        separator = "/"
        action = "replace"
        replacement = "/var/log/pods/*$1/*.log"
        target_label = "__path__"
      }
    }
    local.file_match "pod_logs" {
      path_targets = discovery.relabel.pod_logs.output
    }
    loki.source.file "pod_logs" {
      targets = local.file_match.pod_logs.targets
      forward_to = [loki.process.pod_logs.receiver]
    }
    // Parse the CRI log format (timestamp / stream / flags / message).
    loki.process "pod_logs" {
      forward_to = [loki.write.obs.receiver]
      stage.cri {}
    }
    loki.write "obs" {
      endpoint {
        url = "https://obs.88oakapps.com/loki/api/v1/push"
        bearer_token_file = "/etc/alloy-secrets/bearer_token"
      }
      external_labels = {
        cluster = "honeydue-k3s",
        environment = "prod",
      }
    }
---
# One Alloy pod per node: tails /var/log/pods (read-only hostPath) and ships
# the honeydue pod logs using the config and token mounted below.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  namespace: honeydue
  name: alloy-logs
  labels:
    app.kubernetes.io/part-of: honeydue
    app.kubernetes.io/name: alloy-logs
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: alloy-logs
  updateStrategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 1
  template:
    metadata:
      labels:
        app.kubernetes.io/part-of: honeydue
        app.kubernetes.io/name: alloy-logs
    spec:
      serviceAccountName: alloy-logs
      # The SA token must be mounted: discovery.kubernetes calls the API server.
      automountServiceAccountToken: true
      # uid/gid 0 because /var/log/pods is 0750 root:root and cannot be
      # traversed otherwise. Everything else is confined by the container
      # securityContext further down.
      securityContext:
        runAsUser: 0
        runAsGroup: 0
        seccompProfile:
          type: RuntimeDefault
      tolerations:
        # Schedule onto every node, control-plane-tainted ones included.
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule
      containers:
        - name: alloy
          image: grafana/alloy:v1.5.1@sha256:01a63f4e032ce54ee94b22049bc27f597e74f85566478c377f4b5c7f020c1eb3
          imagePullPolicy: IfNotPresent
          args:
            - run
            - /etc/alloy/config.alloy
            - --storage.path=/tmp/alloy
            - --server.http.listen-addr=0.0.0.0:12345
          ports:
            - name: http
              containerPort: 12345
          # Confinement: no privilege escalation, immutable root filesystem,
          # and every Linux capability dropped.
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - ALL
          volumeMounts:
            - name: config
              mountPath: /etc/alloy
              readOnly: true
            - name: auth
              mountPath: /etc/alloy-secrets
              readOnly: true
            - name: varlogpods
              mountPath: /var/log/pods
              readOnly: true
            # The only writable path: Alloy's --storage.path.
            - name: tmp
              mountPath: /tmp/alloy
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 12345
            initialDelaySeconds: 10
            periodSeconds: 20
          resources:
            requests:
              cpu: 25m
              memory: 64Mi
            limits:
              cpu: 150m
              memory: 256Mi
      volumes:
        - name: config
          configMap:
            name: alloy-logs
        - name: auth
          secret:
            secretName: alloy-logs-auth
            # Octal file mode (owner read-only); must remain an unquoted integer.
            defaultMode: 0400
        - name: varlogpods
          hostPath:
            path: /var/log/pods
            type: Directory
        - name: tmp
          emptyDir:
            sizeLimit: 256Mi
+9 -1
View File
@@ -253,9 +253,14 @@ if [[ -d "${MANIFESTS}/observability" ]]; then
# under deploy-k3s/. It's gitignored — operator copies values there once. # under deploy-k3s/. It's gitignored — operator copies values there once.
OBS_TOKEN="$(grep -E '^OBS_INGEST_TOKEN=' "${REPO_DIR}/deploy/prod.env" 2>/dev/null | cut -d= -f2- || true)" OBS_TOKEN="$(grep -E '^OBS_INGEST_TOKEN=' "${REPO_DIR}/deploy/prod.env" 2>/dev/null | cut -d= -f2- || true)"
if [[ -z "${OBS_TOKEN}" ]]; then if [[ -z "${OBS_TOKEN}" ]]; then
warn "OBS_INGEST_TOKEN not found in deploy/prod.env — skipping vmagent apply" warn "OBS_INGEST_TOKEN not found in deploy/prod.env — skipping vmagent + alloy-logs apply"
else else
sed "s|TOKEN_PLACEHOLDER|${OBS_TOKEN}|" "${MANIFESTS}/observability/vmagent.yaml" | kubectl apply -f - sed "s|TOKEN_PLACEHOLDER|${OBS_TOKEN}|" "${MANIFESTS}/observability/vmagent.yaml" | kubectl apply -f -
# alloy-logs — DaemonSet that tails honeydue pod logs and pushes them to
# Loki at obs.88oakapps.com. Same OBS_INGEST_TOKEN as vmagent.
if [[ -f "${MANIFESTS}/observability/alloy-logs.yaml" ]]; then
  # Escape the characters sed treats specially in a replacement: backslash,
  # '&' (which would re-insert the matched placeholder text), and the '|'
  # delimiter. A token containing any of them is then substituted verbatim
  # instead of corrupting the manifest or breaking the sed expression.
  OBS_TOKEN_ESCAPED="$(printf '%s' "${OBS_TOKEN}" | sed -e 's/[\\&|]/\\&/g')"
  sed "s|TOKEN_PLACEHOLDER|${OBS_TOKEN_ESCAPED}|" "${MANIFESTS}/observability/alloy-logs.yaml" | kubectl apply -f -
fi
fi fi
fi fi
@@ -273,6 +278,9 @@ fi
if kubectl -n "${NAMESPACE}" get deployment vmagent >/dev/null 2>&1; then if kubectl -n "${NAMESPACE}" get deployment vmagent >/dev/null 2>&1; then
kubectl rollout status deployment/vmagent -n "${NAMESPACE}" --timeout=120s kubectl rollout status deployment/vmagent -n "${NAMESPACE}" --timeout=120s
fi fi
# Block until the alloy-logs DaemonSet has rolled out (up to 120s), but only
# when that DaemonSet is actually present in the namespace.
if kubectl get daemonset alloy-logs --namespace "${NAMESPACE}" &>/dev/null; then
  kubectl rollout status daemonset/alloy-logs --namespace "${NAMESPACE}" --timeout=120s
fi
# --- Done --- # --- Done ---