c845771946
The api logs every request, so k8s liveness/readiness probes on
/api/health/ and vmagent's /metrics scrape drowned Loki in 2xx access
logs. Alloy now drops successful probe/scrape access lines at ingest
(loki.process stage.drop). A non-2xx health check, or one logged at any
level other than info, does not match the drop rule and is still shipped.
Also hardens Alloy's read-offset store: /tmp/alloy moved from an
emptyDir to a hostPath, so a pod restart resumes from the saved offset
instead of re-reading log files from the start — which made Loki
400-reject the now-too-old entries ("entry too far behind") and stalled
shipping. loki.source.file tail_from_end=true covers the remaining case
where no offset is stored (fresh node, or positions wiped): tailing then
starts at the end of each file rather than re-shipping its history.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
279 lines
8.7 KiB
YAML
279 lines
8.7 KiB
YAML
# honeyDue log shipper — Grafana Alloy as a DaemonSet.
|
|
#
|
|
# Each node runs one Alloy pod that tails the honeydue-namespace pod logs in
|
|
# /var/log/pods and pushes them to Loki at obs.88oakapps.com/loki/api/v1/push
|
|
# (the same nginx ingest endpoint + bearer token vmagent uses for metrics).
|
|
#
|
|
# Runs as root: /var/log/pods is 0750 root:root on the k3s nodes, so a
|
|
# non-root uid cannot even traverse it. The container is otherwise locked
|
|
# down — all capabilities dropped, read-only root filesystem, seccomp
|
|
# RuntimeDefault — and root inside the container touches the host through
# only two hostPath mounts: /var/log/pods (read-only) and
# /var/lib/honeydue-alloy-logs (read-write, Alloy's read-offset store).
# This is the one root-running workload in the namespace (standard for log
# collectors); see docs/deployment.
|
|
#
|
|
# 03-deploy.sh substitutes TOKEN_PLACEHOLDER with OBS_INGEST_TOKEN from
|
|
# deploy/prod.env before applying — the token never lands in the repo.
|
|
---
|
|
# Identity the Alloy DaemonSet pods run as (serviceAccountName: alloy-logs).
# The Role and RoleBinding of the same name give it read access to pods.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: alloy-logs
  namespace: honeydue
  labels:
    app.kubernetes.io/part-of: honeydue
    app.kubernetes.io/name: alloy-logs
|
|
---
|
|
# A namespaced Role rather than a ClusterRole: Alloy's discovery.kubernetes
# component only lists/watches pods, and only inside the honeydue namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: alloy-logs
  namespace: honeydue
  labels:
    app.kubernetes.io/part-of: honeydue
    app.kubernetes.io/name: alloy-logs
rules:
  # Core API group (""): read-only verbs on pods, nothing else.
  - apiGroups:
      - ""
    resources:
      - pods
    verbs:
      - get
      - list
      - watch
|
|
---
|
|
# Binds the alloy-logs Role to the alloy-logs ServiceAccount, both in the
# honeydue namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: alloy-logs
  namespace: honeydue
  labels:
    app.kubernetes.io/part-of: honeydue
    app.kubernetes.io/name: alloy-logs
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: alloy-logs
subjects:
  - kind: ServiceAccount
    name: alloy-logs
    namespace: honeydue
|
|
---
|
|
# Credential for the Loki push endpoint. 03-deploy.sh swaps TOKEN_PLACEHOLDER
# for OBS_INGEST_TOKEN (the same token vmagent uses) before applying.
apiVersion: v1
kind: Secret
type: Opaque
metadata:
  name: alloy-logs-auth
  namespace: honeydue
  labels:
    app.kubernetes.io/part-of: honeydue
    app.kubernetes.io/name: alloy-logs
stringData:
  # Read by loki.write as /etc/alloy-secrets/bearer_token.
  bearer_token: TOKEN_PLACEHOLDER
|
|
---
|
|
# Alloy pipeline definition. The DaemonSet mounts this ConfigMap at
# /etc/alloy and runs Alloy against /etc/alloy/config.alloy.
apiVersion: v1
kind: ConfigMap
metadata:
  name: alloy-logs
  namespace: honeydue
  labels:
    app.kubernetes.io/part-of: honeydue
    app.kubernetes.io/name: alloy-logs
data:
  config.alloy: |
    // honeyDue log shipper. Pipeline, in order:
    //   discovery.kubernetes -> discovery.relabel -> local.file_match
    //     -> loki.source.file -> loki.process -> loki.write
    // Every DaemonSet instance discovers the honeydue-namespace pods through
    // the Kubernetes API, tails whichever of their container log files are
    // present on its own node (/var/log/pods), and pushes them to Loki at
    // obs.88oakapps.com.

    // Alloy's own logging: level "warn", logfmt-encoded.
    logging {
      format = "logfmt"
      level  = "warn"
    }

    // Pod targets, restricted to the honeydue namespace.
    discovery.kubernetes "pods" {
      role = "pod"

      namespaces {
        names = ["honeydue"]
      }
    }

    // Map pod metadata onto Loki labels and derive the on-disk log path.
    discovery.relabel "pod_logs" {
      targets = discovery.kubernetes.pods.targets

      // Plain copies of discovery metadata into stream labels:
      // namespace, pod, container, app, node.
      rule {
        action        = "replace"
        source_labels = ["__meta_kubernetes_namespace"]
        target_label  = "namespace"
      }
      rule {
        action        = "replace"
        source_labels = ["__meta_kubernetes_pod_name"]
        target_label  = "pod"
      }
      rule {
        action        = "replace"
        source_labels = ["__meta_kubernetes_pod_container_name"]
        target_label  = "container"
      }
      rule {
        action        = "replace"
        source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"]
        target_label  = "app"
      }
      rule {
        action        = "replace"
        source_labels = ["__meta_kubernetes_pod_node_name"]
        target_label  = "node"
      }

      // Log files live at
      //   /var/log/pods/<namespace>_<pod>_<uid>/<container>/<n>.log
      // The pod uid and container name are joined with "/" and spliced in
      // as $1; the leading "*" globs over the "<namespace>_<pod>_" prefix.
      rule {
        action        = "replace"
        source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"]
        separator     = "/"
        target_label  = "__path__"
        replacement   = "/var/log/pods/*$1/*.log"
      }
    }

    // Expand each target's __path__ glob into the files that actually exist
    // on this node.
    local.file_match "pod_logs" {
      path_targets = discovery.relabel.pod_logs.output
    }

    loki.source.file "pod_logs" {
      forward_to = [loki.process.pod_logs.receiver]
      targets    = local.file_match.pod_logs.targets

      // Applies only when no read offset is stored for a file (fresh node,
      // or positions wiped): begin at the END of the file rather than
      // re-shipping its history. Re-shipped history is what Loki rejects as
      // too old ("entry too far behind"), which stalls shipping. Offsets are
      // persisted on a hostPath (see the DaemonSet volumes), so an ordinary
      // pod restart resumes exactly where it left off.
      // NOTE(review): presumably this also applies to a log file that first
      // appears while Alloy is running (new pod, restarted container), in
      // which case lines written before the file is discovered are skipped —
      // confirm against the loki.source.file documentation.
      tail_from_end = true
    }

    // Decode the CRI log format (timestamp / stream / flags / message), then
    // discard probe/scrape noise before anything is shipped.
    loki.process "pod_logs" {
      forward_to = [loki.write.obs.receiver]

      stage.cri {}

      // Successful probe/scrape access logs are pure noise that drowns real
      // logs: k8s liveness/readiness checks hit /api/health/ every few
      // seconds and vmagent scrapes /metrics on a 15s interval, all 2xx.
      // The regex needs level "info", one of those two paths, and a 2xx
      // status, in that order within the line. A non-2xx health check, or
      // one logged above info level, does NOT match and is kept.
      stage.drop {
        drop_counter_reason = "probe_access_ok"
        expression          = "\"level\":\"info\".*\"path\":\"/(api/health/?|metrics)\".*\"status\":2[0-9][0-9]"
      }
    }

    // Push to the obs Loki endpoint, authenticating with the bearer token
    // file mounted from the alloy-logs-auth Secret.
    loki.write "obs" {
      endpoint {
        url               = "https://obs.88oakapps.com/loki/api/v1/push"
        bearer_token_file = "/etc/alloy-secrets/bearer_token"
      }

      // Attached to every stream this shipper writes.
      external_labels = {
        cluster     = "honeydue-k3s",
        environment = "prod",
      }
    }
|
|
---
|
|
# One Alloy log-shipper pod per node. Runs as root but otherwise confined:
# no capabilities, read-only root filesystem, seccomp RuntimeDefault.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: alloy-logs
  namespace: honeydue
  labels:
    app.kubernetes.io/part-of: honeydue
    app.kubernetes.io/name: alloy-logs
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: alloy-logs
  # Replace pods one node at a time.
  updateStrategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 1
  template:
    metadata:
      labels:
        app.kubernetes.io/part-of: honeydue
        app.kubernetes.io/name: alloy-logs
    spec:
      # discovery.kubernetes talks to the API server, so the ServiceAccount
      # token must be mounted.
      serviceAccountName: alloy-logs
      automountServiceAccountToken: true
      # uid/gid 0 is needed to traverse /var/log/pods (0750 root:root). All
      # other confinement is in the container securityContext below.
      securityContext:
        runAsUser: 0
        runAsGroup: 0
        seccompProfile:
          type: RuntimeDefault
      tolerations:
        # Tolerate the control-plane NoSchedule taint so those nodes get a
        # shipper too — the DaemonSet must run on every node.
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule
      volumes:
        # Alloy pipeline config (ConfigMap alloy-logs).
        - name: config
          configMap:
            name: alloy-logs
        # Loki bearer token (Secret alloy-logs-auth).
        - name: auth
          secret:
            secretName: alloy-logs-auth
            defaultMode: 0400
        # The node's pod log tree; mounted read-only in the container.
        - name: varlogpods
          hostPath:
            path: /var/log/pods
            type: Directory
        # Alloy's positions/WAL store. Kept on a hostPath (not an emptyDir)
        # so file read offsets outlive the pod. Without that, every restart
        # re-reads log files from the start and Loki rejects the
        # now-too-old entries.
        - name: tmp
          hostPath:
            path: /var/lib/honeydue-alloy-logs
            type: DirectoryOrCreate
      containers:
        - name: alloy
          # Tag plus digest: the digest pins the exact image.
          image: grafana/alloy:v1.5.1@sha256:01a63f4e032ce54ee94b22049bc27f597e74f85566478c377f4b5c7f020c1eb3
          imagePullPolicy: IfNotPresent
          args:
            - run
            - /etc/alloy/config.alloy
            # Backed by the "tmp" hostPath volume; the root filesystem is
            # read-only.
            - --storage.path=/tmp/alloy
            - --server.http.listen-addr=0.0.0.0:12345
          ports:
            - name: http
              containerPort: 12345
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - ALL
          resources:
            requests:
              cpu: 25m
              memory: 64Mi
            limits:
              cpu: 150m
              memory: 256Mi
          # Alloy's HTTP server (port 12345) answers /-/ready.
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 12345
            initialDelaySeconds: 10
            periodSeconds: 20
          volumeMounts:
            - name: config
              mountPath: /etc/alloy
              readOnly: true
            - name: auth
              mountPath: /etc/alloy-secrets
              readOnly: true
            - name: varlogpods
              mountPath: /var/log/pods
              readOnly: true
            # The only writable mount: Alloy's --storage.path.
            - name: tmp
              mountPath: /tmp/alloy
|