deploy: add node-exporter DaemonSet + vmagent scrape job
Per-node host metrics (node_filesystem_*, node_memory_*, node_load*) were
missing — a node running out of disk would silently fail the cluster before
any dashboard signal (RUNBOOK §11.1 gap #9).

Adds:
- node-exporter DaemonSet (pod-networked, :9100; host /proc, /sys, / ro) so
  vmagent scrapes it pod-to-pod over the cluster CIDR, independent of node
  public IPs (the netpol node-IP list is OVH-stale).
- two additive NetworkPolicies (default-deny-all is in force): ingress to
  node-exporter from vmagent, and vmagent egress to the pod CIDR on :9100.
- a node-exporter scrape job in the vmagent-config ConfigMap.

Feeds the new "Node host health" row (disk/mem/load) on the eli5 dashboard.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,126 @@
|
|||||||
|
# node-exporter — per-node host metrics (filesystem, memory, load, CPU).
# Runs as a normal pod (NOT hostNetwork) so vmagent scrapes it pod-to-pod over
# the cluster CIDR, avoiding any dependency on node public IPs (the netpol
# node-IP list is OVH-stale). Host /proc, /sys and / are bind-mounted read-only
# so the filesystem/memory/load collectors read the real host, not the pod ns.
# Added 2026-06-08 to close RUNBOOK §11.1 gap #9 (node disk/mem were unmonitored).
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-exporter
  namespace: honeydue
  labels:
    app.kubernetes.io/name: node-exporter
    app.kubernetes.io/part-of: honeydue
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: node-exporter
  template:
    metadata:
      # The name label is what the selector above, the ingress NetworkPolicy
      # and the vmagent scrape job all match on — keep it in sync with them.
      labels:
        app.kubernetes.io/name: node-exporter
        app.kubernetes.io/part-of: honeydue
    spec:
      # The exporter is pointed only at the host bind mounts below and is
      # given no reason to talk to the Kubernetes API, so do not hand a
      # service-account token to a pod that can read the host filesystem.
      automountServiceAccountToken: false
      # Run on every node, including any tainted control-plane nodes.
      tolerations:
        - operator: Exists
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: node-exporter
          image: quay.io/prometheus/node-exporter:v1.8.2  # TODO digest-pin (audit K3S-F14)
          imagePullPolicy: IfNotPresent
          args:
            # Point the collectors at the host bind mounts declared under
            # volumeMounts instead of the container's own /proc, /sys and /.
            - --path.procfs=/host/proc
            - --path.sysfs=/host/sys
            - --path.rootfs=/host/root
            # Only report real host mounts; drop the kubelet/container churn.
            - --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run|var/lib/kubelet/.+|var/lib/docker/.+|var/lib/containerd/.+)($|/)
            - --collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$
            # Collectors switched off explicitly for this deployment.
            - --no-collector.wifi
            - --no-collector.hwmon
            # Must stay in step with containerPort, both NetworkPolicies and
            # the port filter in the vmagent scrape job.
            - --web.listen-address=:9100
          ports:
            - name: metrics
              containerPort: 9100
              protocol: TCP
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            capabilities:
              drop: ["ALL"]
          resources:
            requests:
              cpu: 30m
              memory: 32Mi
            limits:
              cpu: 200m
              memory: 128Mi
          volumeMounts:
            - name: proc
              mountPath: /host/proc
              readOnly: true
            - name: sys
              mountPath: /host/sys
              readOnly: true
            - name: root
              mountPath: /host/root
              # Lets mounts made on the host after the pod starts show up
              # under /host/root.
              mountPropagation: HostToContainer
              readOnly: true
      volumes:
        - name: proc
          hostPath:
            path: /proc
        - name: sys
          hostPath:
            path: /sys
        - name: root
          hostPath:
            path: /
|
||||||
|
---
|
||||||
|
# The namespace is under default-deny-all, so scrape traffic has to be admitted
# explicitly. This policy lets vmagent pods reach node-exporter on TCP 9100.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: allow-ingress-to-node-exporter
  namespace: honeydue
spec:
  policyTypes: [Ingress]
  # Target: the node-exporter DaemonSet pods.
  podSelector:
    matchLabels:
      app.kubernetes.io/name: node-exporter
  ingress:
    - ports:
        - protocol: TCP
          port: 9100
      # Source: vmagent pods; with no namespaceSelector the match is limited
      # to this policy's own namespace.
      from:
        - podSelector:
            matchLabels:
              app.kubernetes.io/name: vmagent
|
||||||
|
---
|
||||||
|
# vmagent's existing egress policy only opens :8000/:8080 to the pod CIDR.
# Additive policy (NetworkPolicies are OR'd) opening :9100 for the node-exporter
# scrape — leaves the working allow-egress-from-vmagent policy untouched.
#
# The destination is selected by pod label rather than by an ipBlock on the pod
# CIDR: that keeps :9100 closed towards every other pod, avoids hard-coding the
# cluster's pod range, and follows the Kubernetes guidance that ipBlock is meant
# for cluster-external addresses. vmagent scrapes pod IPs directly, so a
# podSelector peer matches the scrape targets.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: allow-egress-from-vmagent-to-node-exporter
  namespace: honeydue
spec:
  podSelector:
    matchLabels:
      app.kubernetes.io/name: vmagent
  policyTypes:
    - Egress
  egress:
    - to:
        # No namespaceSelector: matches node-exporter pods in this namespace.
        - podSelector:
            matchLabels:
              app.kubernetes.io/name: node-exporter
      ports:
        - port: 9100
          protocol: TCP
|
||||||
@@ -57,6 +57,27 @@ data:
|
|||||||
action: keep
|
action: keep
|
||||||
regex: http-metrics
|
regex: http-metrics
|
||||||
|
|
||||||
|
# node-exporter: host-level series per node (node_filesystem_*, node_memory_*,
# node_load*). Targets are the pod-networked DaemonSet pods, scraped on :9100
# over the pod CIDR.
- job_name: node-exporter
  kubernetes_sd_configs:
    - role: pod
      namespaces:
        names:
          - honeydue
  relabel_configs:
    # Keep only pods labelled app.kubernetes.io/name=node-exporter.
    - action: keep
      source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
      regex: node-exporter
    # Keep only the declared container port 9100.
    - action: keep
      source_labels: [__meta_kubernetes_pod_container_port_number]
      regex: "9100"
    # Copy pod and node names onto the scraped series.
    - target_label: pod
      source_labels: [__meta_kubernetes_pod_name]
    - target_label: node
      source_labels: [__meta_kubernetes_pod_node_name]
    # Constant service label for this job.
    - target_label: service
      replacement: node-exporter
|
||||||
|
|
||||||
# honeyDue worker — also exposes /metrics if/when we add it.
|
# honeyDue worker — also exposes /metrics if/when we add it.
|
||||||
# Keep this stanza commented until the worker has a /metrics endpoint;
|
# Keep this stanza commented until the worker has a /metrics endpoint;
|
||||||
# uncommented form drops scrapes silently.
|
# uncommented form drops scrapes silently.
|
||||||
|
|||||||
Reference in New Issue
Block a user