deploy: add node-exporter DaemonSet + vmagent scrape job
Per-node host metrics (node_filesystem_*, node_memory_*, node_load*) were missing — a node running out of disk would silently fail the cluster before any dashboard signal (RUNBOOK §11.1 gap #9). Adds: - node-exporter DaemonSet (pod-networked, :9100; host /proc,/sys,/ ro) so vmagent scrapes it pod-to-pod over the cluster CIDR, independent of node public IPs (the netpol node-IP list is OVH-stale). - two additive NetworkPolicies (default-deny-all is in force): ingress to node-exporter from vmagent, and vmagent egress to the pod CIDR on :9100. - a node-exporter scrape job in the vmagent-config ConfigMap. Feeds the new "Node host health" row (disk/mem/load) on the eli5 dashboard. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,126 @@
|
||||
# node-exporter — per-node host metrics (filesystem, memory, load, CPU).
|
||||
# Runs as a normal pod (NOT hostNetwork) so vmagent scrapes it pod-to-pod over
|
||||
# the cluster CIDR, avoiding any dependency on node public IPs (the netpol
|
||||
# node-IP list is OVH-stale). Host /proc, /sys and / are bind-mounted read-only
|
||||
# so the filesystem/memory/load collectors read the real host, not the pod ns.
|
||||
# Added 2026-06-08 to close RUNBOOK §11.1 gap #9 (node disk/mem were unmonitored).
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: node-exporter
|
||||
namespace: honeydue
|
||||
labels:
|
||||
app.kubernetes.io/name: node-exporter
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: node-exporter
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: node-exporter
|
||||
app.kubernetes.io/part-of: honeydue
|
||||
spec:
|
||||
# Run on every node, including any tainted control-plane nodes.
|
||||
tolerations:
|
||||
- operator: Exists
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 65534
|
||||
seccompProfile:
|
||||
type: RuntimeDefault
|
||||
containers:
|
||||
- name: node-exporter
|
||||
image: quay.io/prometheus/node-exporter:v1.8.2 # TODO digest-pin (audit K3S-F14)
|
||||
imagePullPolicy: IfNotPresent
|
||||
args:
|
||||
- --path.procfs=/host/proc
|
||||
- --path.sysfs=/host/sys
|
||||
- --path.rootfs=/host/root
|
||||
# Only report real host mounts; drop the kubelet/container churn.
|
||||
- --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run|var/lib/kubelet/.+|var/lib/docker/.+|var/lib/containerd/.+)($|/)
|
||||
- --collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$
|
||||
- --no-collector.wifi
|
||||
- --no-collector.hwmon
|
||||
- --web.listen-address=:9100
|
||||
ports:
|
||||
- name: metrics
|
||||
containerPort: 9100
|
||||
protocol: TCP
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
readOnlyRootFilesystem: true
|
||||
capabilities:
|
||||
drop: ["ALL"]
|
||||
resources:
|
||||
requests:
|
||||
cpu: 30m
|
||||
memory: 32Mi
|
||||
limits:
|
||||
cpu: 200m
|
||||
memory: 128Mi
|
||||
volumeMounts:
|
||||
- name: proc
|
||||
mountPath: /host/proc
|
||||
readOnly: true
|
||||
- name: sys
|
||||
mountPath: /host/sys
|
||||
readOnly: true
|
||||
- name: root
|
||||
mountPath: /host/root
|
||||
mountPropagation: HostToContainer
|
||||
readOnly: true
|
||||
volumes:
|
||||
- name: proc
|
||||
hostPath:
|
||||
path: /proc
|
||||
- name: sys
|
||||
hostPath:
|
||||
path: /sys
|
||||
- name: root
|
||||
hostPath:
|
||||
path: /
|
||||
---
|
||||
# default-deny-all blocks ingress; allow vmagent to scrape :9100.
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: NetworkPolicy
|
||||
metadata:
|
||||
name: allow-ingress-to-node-exporter
|
||||
namespace: honeydue
|
||||
spec:
|
||||
podSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: node-exporter
|
||||
policyTypes:
|
||||
- Ingress
|
||||
ingress:
|
||||
- from:
|
||||
- podSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: vmagent
|
||||
ports:
|
||||
- port: 9100
|
||||
protocol: TCP
|
||||
---
|
||||
# vmagent's existing egress policy only opens :8000/:8080 to the pod CIDR.
|
||||
# Additive policy (NetworkPolicies are OR'd) opening :9100 for the node-exporter
|
||||
# scrape — leaves the working allow-egress-from-vmagent policy untouched.
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: NetworkPolicy
|
||||
metadata:
|
||||
name: allow-egress-from-vmagent-to-node-exporter
|
||||
namespace: honeydue
|
||||
spec:
|
||||
podSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: vmagent
|
||||
policyTypes:
|
||||
- Egress
|
||||
egress:
|
||||
- to:
|
||||
- ipBlock:
|
||||
cidr: 10.42.0.0/16
|
||||
ports:
|
||||
- port: 9100
|
||||
protocol: TCP
|
||||
@@ -57,6 +57,27 @@ data:
|
||||
action: keep
|
||||
regex: http-metrics
|
||||
|
||||
# node-exporter — per-node host metrics (node_filesystem_*, node_memory_*,
|
||||
# node_load*). Pod-networked DaemonSet scraped on :9100 over the pod CIDR.
|
||||
- job_name: node-exporter
|
||||
kubernetes_sd_configs:
|
||||
- role: pod
|
||||
namespaces:
|
||||
names: [honeydue]
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
|
||||
action: keep
|
||||
regex: node-exporter
|
||||
- source_labels: [__meta_kubernetes_pod_container_port_number]
|
||||
action: keep
|
||||
regex: "9100"
|
||||
- source_labels: [__meta_kubernetes_pod_name]
|
||||
target_label: pod
|
||||
- source_labels: [__meta_kubernetes_pod_node_name]
|
||||
target_label: node
|
||||
- target_label: service
|
||||
replacement: node-exporter
|
||||
|
||||
# honeyDue worker — also exposes /metrics if/when we add it.
|
||||
# Keep this stanza commented until the worker has a /metrics endpoint;
|
||||
# uncommented form drops scrapes silently.
|
||||
|
||||
Reference in New Issue
Block a user