3b2ea9959a
Per-node host metrics (node_filesystem_*, node_memory_*, node_load*) were missing — a node running out of disk would silently fail the cluster before any dashboard signal (RUNBOOK §11.1 gap #9). Adds: - node-exporter DaemonSet (pod-networked, :9100; host /proc,/sys,/ ro) so vmagent scrapes it pod-to-pod over the cluster CIDR, independent of node public IPs (the netpol node-IP list is OVH-stale). - two additive NetworkPolicies (default-deny-all is in force): ingress to node-exporter from vmagent, and vmagent egress to the pod CIDR on :9100. - a node-exporter scrape job in the vmagent-config ConfigMap. Feeds the new "Node host health" row (disk/mem/load) on the eli5 dashboard. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
127 lines
4.0 KiB
YAML
127 lines
4.0 KiB
YAML
# node-exporter — per-node host metrics (filesystem, memory, load, CPU).
# Runs as a normal pod (NOT hostNetwork) so vmagent scrapes it pod-to-pod over
# the cluster CIDR, avoiding any dependency on node public IPs (the netpol
# node-IP list is OVH-stale). Host /proc, /sys and / are bind-mounted read-only
# so the filesystem/memory/load collectors read the real host, not the pod ns.
# Added 2026-06-08 to close RUNBOOK §11.1 gap #9 (node disk/mem were unmonitored).
#
# NOTE(review): because the pod is not on the host network, network-namespace
# scoped collectors (e.g. netdev) presumably describe the pod's namespace, not
# the node's — confirm before building panels on node_network_* series.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-exporter
  namespace: honeydue
  labels:
    app.kubernetes.io/name: node-exporter
    app.kubernetes.io/part-of: honeydue
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: node-exporter
  template:
    metadata:
      # The name label is what the DaemonSet selector and the NetworkPolicy
      # podSelectors match on.
      labels:
        app.kubernetes.io/name: node-exporter
        app.kubernetes.io/part-of: honeydue
    spec:
      # node-exporter only serves /metrics; it has no need for Kubernetes API
      # credentials, so do not mount a service-account token into a pod that
      # also sees the host root filesystem.
      automountServiceAccountToken: false
      # Run on every node, including any tainted control-plane nodes.
      tolerations:
        - operator: Exists
      # Pod-level hardening: unprivileged UID and the runtime's default
      # seccomp filter.
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: node-exporter
          image: quay.io/prometheus/node-exporter:v1.8.2  # TODO digest-pin (audit K3S-F14)
          imagePullPolicy: IfNotPresent
          args:
            # Point the collectors at the host bind mounts declared below.
            - --path.procfs=/host/proc
            - --path.sysfs=/host/sys
            - --path.rootfs=/host/root
            # Only report real host mounts; drop the kubelet/container churn.
            # (Regexes are single-quoted so YAML never reinterprets them.)
            - '--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run|var/lib/kubelet/.+|var/lib/docker/.+|var/lib/containerd/.+)($|/)'
            - '--collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$'
            - --no-collector.wifi
            - --no-collector.hwmon
            - --web.listen-address=:9100
          ports:
            # Scrape port; the NetworkPolicies for this workload open TCP 9100.
            - name: metrics
              containerPort: 9100
              protocol: TCP
          # Container-level hardening: no privilege escalation, immutable
          # root filesystem, no Linux capabilities.
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            capabilities:
              drop: ["ALL"]
          resources:
            requests:
              cpu: 30m
              memory: 32Mi
            limits:
              cpu: 200m
              memory: 128Mi
          volumeMounts:
            - name: proc
              mountPath: /host/proc
              readOnly: true
            - name: sys
              mountPath: /host/sys
              readOnly: true
            - name: root
              mountPath: /host/root
              # Propagate mounts made on the host after pod start into the
              # container's view of the root filesystem.
              mountPropagation: HostToContainer
              readOnly: true
      volumes:
        - name: proc
          hostPath:
            path: /proc
        - name: sys
          hostPath:
            path: /sys
        - name: root
          hostPath:
            path: /
---
# default-deny-all blocks ingress; allow vmagent to scrape :9100.
# Selects the node-exporter pods and admits TCP 9100 only from pods labelled
# app.kubernetes.io/name=vmagent in the same namespace (no namespaceSelector
# is set, so the peer selector is namespace-local).
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: allow-ingress-to-node-exporter
  namespace: honeydue
spec:
  podSelector:
    matchLabels:
      app.kubernetes.io/name: node-exporter
  policyTypes:
    - Ingress
  ingress:
    - from:
        - podSelector:
            matchLabels:
              app.kubernetes.io/name: vmagent
      ports:
        - port: 9100
          protocol: TCP
---
# vmagent's existing egress policy only opens :8000/:8080 to the pod CIDR.
# Additive policy (NetworkPolicies are OR'd) opening :9100 for the node-exporter
# scrape — leaves the working allow-egress-from-vmagent policy untouched.
#
# NOTE(review): the peer is the whole pod CIDR rather than a podSelector on
# node-exporter, so vmagent may reach any pod listening on :9100. A podSelector
# peer would be tighter — confirm the CNI enforces it before switching.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: allow-egress-from-vmagent-to-node-exporter
  namespace: honeydue
spec:
  podSelector:
    matchLabels:
      app.kubernetes.io/name: vmagent
  policyTypes:
    - Egress
  egress:
    - to:
        # Cluster pod CIDR; must stay in sync with the cluster's actual
        # pod network range.
        - ipBlock:
            cidr: 10.42.0.0/16
      ports:
        - port: 9100
          protocol: TCP