Files
Trey t c845771946
Backend CI / Test (push) Has been cancelled
Backend CI / Contract Tests (push) Has been cancelled
Backend CI / Lint (push) Has been cancelled
Backend CI / Secret Scanning (push) Has been cancelled
Backend CI / Build (push) Has been cancelled
feat(observability): drop health/metrics probe noise from shipped logs
The api logs every request, so k8s liveness/readiness probes on
/api/health/ and vmagent's /metrics scrape drowned Loki in 2xx access
logs. Alloy now drops successful probe/scrape access lines at ingest
(loki.process stage.drop) — a non-2xx health check, or one logged
above info level, still matches nothing and is kept.

Also hardens Alloy's read-offset store: moved /tmp/alloy from an
emptyDir to a hostPath, so a pod restart resumes from the saved offset
instead of re-reading log files from the start — which made Loki
400-reject the now-too-old entries ("entry too far behind") and stalled
shipping. Additionally set loki.source.file tail_from_end=true, so a file
with no saved offset is tailed from its end rather than replayed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-17 21:29:15 -05:00

279 lines
8.7 KiB
YAML

# honeyDue log shipper — Grafana Alloy as a DaemonSet.
#
# Each node runs one Alloy pod that tails the honeydue-namespace pod logs in
# /var/log/pods and pushes them to Loki at obs.88oakapps.com/loki/api/v1/push
# (the same nginx ingest endpoint + bearer token vmagent uses for metrics).
#
# Runs as root: /var/log/pods is 0750 root:root on the k3s nodes, so a
# non-root uid cannot even traverse it. The container is otherwise locked
# down — all capabilities dropped, read-only root filesystem, seccomp
# RuntimeDefault. Root inside the container sees two hostPath mounts:
# /var/log/pods (read-only) and /var/lib/honeydue-alloy-logs (read-write,
# mounted at /tmp/alloy as Alloy's read-offset store). This is the one
# root-running workload in the namespace (standard for log collectors); see
# docs/deployment.
#
# 03-deploy.sh substitutes TOKEN_PLACEHOLDER with OBS_INGEST_TOKEN from
# deploy/prod.env before applying — the token never lands in the repo.
---
# Identity used by the Alloy DaemonSet pods (referenced there through
# serviceAccountName).
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: honeydue
  name: alloy-logs
  labels:
    app.kubernetes.io/part-of: honeydue
    app.kubernetes.io/name: alloy-logs
---
# Read-only access to pods, scoped to the honeydue namespace. Alloy's
# discovery.kubernetes component only needs to read pods there, so a
# namespaced Role is sufficient — no ClusterRole.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  namespace: honeydue
  name: alloy-logs
  labels:
    app.kubernetes.io/part-of: honeydue
    app.kubernetes.io/name: alloy-logs
rules:
  - apiGroups:
      - ""          # the core API group
    resources:
      - pods
    verbs:
      - get
      - list
      - watch
---
# Binds the alloy-logs Role to the alloy-logs ServiceAccount in honeydue.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  namespace: honeydue
  name: alloy-logs
  labels:
    app.kubernetes.io/part-of: honeydue
    app.kubernetes.io/name: alloy-logs
roleRef:
  kind: Role
  name: alloy-logs
  apiGroup: rbac.authorization.k8s.io
subjects:
  - kind: ServiceAccount
    namespace: honeydue
    name: alloy-logs
---
# Credential Alloy presents to the Loki push endpoint. The placeholder value
# below is swapped for OBS_INGEST_TOKEN by 03-deploy.sh at deploy time (the
# same token vmagent uses), so the real token is never committed.
apiVersion: v1
kind: Secret
type: Opaque
metadata:
  namespace: honeydue
  name: alloy-logs-auth
  labels:
    app.kubernetes.io/part-of: honeydue
    app.kubernetes.io/name: alloy-logs
stringData:
  bearer_token: TOKEN_PLACEHOLDER
---
# Alloy pipeline configuration. The DaemonSet mounts this ConfigMap at
# /etc/alloy and starts Alloy with /etc/alloy/config.alloy.
apiVersion: v1
kind: ConfigMap
metadata:
  name: alloy-logs
  namespace: honeydue
  labels:
    app.kubernetes.io/name: alloy-logs
    app.kubernetes.io/part-of: honeydue
data:
  # Pipeline, in order:
  #   discovery.kubernetes  -> pods in the honeydue namespace
  #   discovery.relabel     -> Loki labels + __path__ glob under /var/log/pods
  #   local.file_match      -> expands the glob to files present on this node
  #   loki.source.file      -> tails those files
  #   loki.process          -> CRI parsing, then drops probe/scrape access lines
  #   loki.write            -> pushes to the Loki endpoint with the bearer token
  #
  # Everything below the "|" is a literal string handed to Alloy; "//" lines
  # are Alloy comments, not YAML comments.
  #
  # NOTE(review): tail_from_end applies to any file without a stored offset.
  # A log file created after Alloy is already running (new pod, restarted
  # container) also has none, so lines written before Alloy first opens it
  # may be skipped — confirm this trade-off is acceptable for short-lived or
  # crash-looping containers.
  #
  # NOTE(review): the stage.drop regex requires "level", "path" and "status"
  # to appear in that order within the log line — confirm against the api's
  # JSON log encoder.
  config.alloy: |
    // honeyDue log shipper. Each DaemonSet instance discovers honeydue-namespace
    // pods via the Kubernetes API, tails the container log files present on its
    // own node (/var/log/pods), and pushes them to Loki at obs.88oakapps.com.
    logging {
      level = "warn"
      format = "logfmt"
    }
    discovery.kubernetes "pods" {
      role = "pod"
      namespaces {
        names = ["honeydue"]
      }
    }
    // Turn pod metadata into Loki labels and build the on-disk log path.
    discovery.relabel "pod_logs" {
      targets = discovery.kubernetes.pods.targets
      rule {
        source_labels = ["__meta_kubernetes_namespace"]
        action = "replace"
        target_label = "namespace"
      }
      rule {
        source_labels = ["__meta_kubernetes_pod_name"]
        action = "replace"
        target_label = "pod"
      }
      rule {
        source_labels = ["__meta_kubernetes_pod_container_name"]
        action = "replace"
        target_label = "container"
      }
      rule {
        source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"]
        action = "replace"
        target_label = "app"
      }
      rule {
        source_labels = ["__meta_kubernetes_pod_node_name"]
        action = "replace"
        target_label = "node"
      }
      // /var/log/pods/<namespace>_<pod>_<uid>/<container>/<n>.log
      rule {
        source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"]
        separator = "/"
        action = "replace"
        replacement = "/var/log/pods/*$1/*.log"
        target_label = "__path__"
      }
    }
    local.file_match "pod_logs" {
      path_targets = discovery.relabel.pod_logs.output
    }
    loki.source.file "pod_logs" {
      targets = local.file_match.pod_logs.targets
      forward_to = [loki.process.pod_logs.receiver]
      // With no stored read offset (fresh node, or positions wiped), start
      // at the END of each file instead of re-shipping history — otherwise
      // Loki rejects the now-too-old entries ("entry too far behind") and
      // shipping stalls. Offsets persist on a hostPath (see volumes), so a
      // normal pod restart resumes exactly where it left off.
      tail_from_end = true
    }
    // Parse the CRI log format (timestamp / stream / flags / message),
    // then drop probe/scrape noise before shipping.
    loki.process "pod_logs" {
      forward_to = [loki.write.obs.receiver]
      stage.cri {}
      // Drop successful probe/scrape access logs. k8s liveness/readiness
      // hits /api/health/ every few seconds and vmagent scrapes /metrics
      // on a 15s interval — all 2xx, pure noise that drowns real logs.
      // A non-2xx health check, or one logged above info level, does NOT
      // match this regex and is kept.
      stage.drop {
        expression = "\"level\":\"info\".*\"path\":\"/(api/health/?|metrics)\".*\"status\":2[0-9][0-9]"
        drop_counter_reason = "probe_access_ok"
      }
    }
    loki.write "obs" {
      endpoint {
        url = "https://obs.88oakapps.com/loki/api/v1/push"
        bearer_token_file = "/etc/alloy-secrets/bearer_token"
      }
      external_labels = {
        cluster = "honeydue-k3s",
        environment = "prod",
      }
    }
---
# The Alloy log shipper itself, run as a DaemonSet under the alloy-logs
# ServiceAccount.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  namespace: honeydue
  name: alloy-logs
  labels:
    app.kubernetes.io/part-of: honeydue
    app.kubernetes.io/name: alloy-logs
spec:
  updateStrategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: alloy-logs
  template:
    metadata:
      labels:
        app.kubernetes.io/part-of: honeydue
        app.kubernetes.io/name: alloy-logs
    spec:
      serviceAccountName: alloy-logs
      # discovery.kubernetes queries the API server, so the ServiceAccount
      # token has to be mounted.
      automountServiceAccountToken: true
      # uid/gid 0 because /var/log/pods is 0750 root:root and cannot be
      # traversed otherwise. Further restrictions are applied in the
      # container-level securityContext.
      securityContext:
        seccompProfile:
          type: RuntimeDefault
        runAsUser: 0
        runAsGroup: 0
      tolerations:
        # Tolerate the control-plane NoSchedule taint so the DaemonSet is
        # scheduled onto control-plane nodes as well.
        - effect: NoSchedule
          operator: Exists
          key: node-role.kubernetes.io/control-plane
      containers:
        - name: alloy
          image: grafana/alloy:v1.5.1@sha256:01a63f4e032ce54ee94b22049bc27f597e74f85566478c377f4b5c7f020c1eb3
          imagePullPolicy: IfNotPresent
          args:
            - "run"
            - "/etc/alloy/config.alloy"
            - "--storage.path=/tmp/alloy"
            - "--server.http.listen-addr=0.0.0.0:12345"
          ports:
            - containerPort: 12345
              name: http
          securityContext:
            readOnlyRootFilesystem: true
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - "ALL"
          resources:
            requests:
              memory: 64Mi
              cpu: 25m
            limits:
              memory: 256Mi
              cpu: 150m
          readinessProbe:
            initialDelaySeconds: 10
            periodSeconds: 20
            httpGet:
              port: 12345
              path: /-/ready
          volumeMounts:
            # Pipeline definition from the alloy-logs ConfigMap.
            - name: config
              mountPath: /etc/alloy
              readOnly: true
            # Bearer token file from the alloy-logs-auth Secret.
            - name: auth
              mountPath: /etc/alloy-secrets
              readOnly: true
            # Node's pod log directory, read-only.
            - name: varlogpods
              mountPath: /var/log/pods
              readOnly: true
            # Writable: this is the --storage.path passed in args.
            - name: tmp
              mountPath: /tmp/alloy
      volumes:
        - name: config
          configMap:
            name: alloy-logs
        - name: auth
          secret:
            defaultMode: 0400
            secretName: alloy-logs-auth
        - name: varlogpods
          hostPath:
            type: Directory
            path: /var/log/pods
        # Alloy's storage directory (read offsets). Backed by a hostPath
        # rather than an emptyDir so the saved offsets outlive the pod; with
        # an emptyDir each restart would re-read log files from the beginning
        # and Loki would reject the stale entries.
        - name: tmp
          hostPath:
            type: DirectoryOrCreate
            path: /var/lib/honeydue-alloy-logs