diff --git a/deploy-k3s/manifests/observability/node-exporter.yaml b/deploy-k3s/manifests/observability/node-exporter.yaml
new file mode 100644
index 0000000..d07e92f
--- /dev/null
+++ b/deploy-k3s/manifests/observability/node-exporter.yaml
@@ -0,0 +1,131 @@
+# node-exporter — per-node host metrics (filesystem, memory, load, CPU).
+# Runs as a normal pod (NOT hostNetwork) so vmagent scrapes it pod-to-pod over
+# the cluster CIDR, avoiding any dependency on node public IPs (the netpol
+# node-IP list is OVH-stale). Host /proc, /sys and / are bind-mounted read-only
+# so the filesystem/memory/load collectors read the real host, not the pod ns.
+# Added 2026-06-08 to close RUNBOOK §11.1 gap #9 (node disk/mem were unmonitored).
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: node-exporter
+  namespace: honeydue
+  labels:
+    app.kubernetes.io/name: node-exporter
+    app.kubernetes.io/part-of: honeydue
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: node-exporter
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: node-exporter
+        app.kubernetes.io/part-of: honeydue
+    spec:
+      # Run on every node, including any tainted control-plane nodes.
+      tolerations:
+        - operator: Exists
+      # node-exporter never calls the Kubernetes API, so do not mount a token.
+      automountServiceAccountToken: false
+      securityContext:
+        runAsNonRoot: true
+        runAsUser: 65534
+        seccompProfile:
+          type: RuntimeDefault
+      containers:
+        - name: node-exporter
+          image: quay.io/prometheus/node-exporter:v1.8.2  # TODO digest-pin (audit K3S-F14)
+          imagePullPolicy: IfNotPresent
+          args:
+            - --path.procfs=/host/proc
+            - --path.sysfs=/host/sys
+            - --path.rootfs=/host/root
+            # Only report real host mounts; drop the kubelet/container churn.
+            - --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run|var/lib/kubelet/.+|var/lib/docker/.+|var/lib/containerd/.+)($|/)
+            - --collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$
+            - --no-collector.wifi
+            - --no-collector.hwmon
+            - --web.listen-address=:9100
+          ports:
+            - name: metrics
+              containerPort: 9100
+              protocol: TCP
+          securityContext:
+            allowPrivilegeEscalation: false
+            readOnlyRootFilesystem: true
+            capabilities:
+              drop: ["ALL"]
+          resources:
+            requests:
+              cpu: 30m
+              memory: 32Mi
+            limits:
+              cpu: 200m
+              memory: 128Mi
+          volumeMounts:
+            - name: proc
+              mountPath: /host/proc
+              readOnly: true
+            - name: sys
+              mountPath: /host/sys
+              readOnly: true
+            - name: root
+              mountPath: /host/root
+              mountPropagation: HostToContainer
+              readOnly: true
+      volumes:
+        - name: proc
+          hostPath:
+            path: /proc
+        - name: sys
+          hostPath:
+            path: /sys
+        - name: root
+          hostPath:
+            path: /
+---
+# default-deny-all blocks ingress; allow vmagent to scrape :9100.
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: allow-ingress-to-node-exporter
+  namespace: honeydue
+spec:
+  podSelector:
+    matchLabels:
+      app.kubernetes.io/name: node-exporter
+  policyTypes:
+    - Ingress
+  ingress:
+    - from:
+        - podSelector:
+            matchLabels:
+              app.kubernetes.io/name: vmagent
+      ports:
+        - port: 9100
+          protocol: TCP
+---
+# vmagent's existing egress policy only opens :8000/:8080 to the pod CIDR.
+# Additive policy (NetworkPolicies are OR'd) opening :9100 for the node-exporter
+# scrape — leaves the working allow-egress-from-vmagent policy untouched.
+# Scoped by podSelector rather than a hard-coded pod CIDR, so vmagent can reach
+# :9100 only on node-exporter pods and the rule survives a cluster-cidr change.
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: allow-egress-from-vmagent-to-node-exporter
+  namespace: honeydue
+spec:
+  podSelector:
+    matchLabels:
+      app.kubernetes.io/name: vmagent
+  policyTypes:
+    - Egress
+  egress:
+    - to:
+        - podSelector:
+            matchLabels:
+              app.kubernetes.io/name: node-exporter
+      ports:
+        - port: 9100
+          protocol: TCP
diff --git a/deploy-k3s/manifests/observability/vmagent.yaml b/deploy-k3s/manifests/observability/vmagent.yaml
index a324aac..5a88cb9 100644
--- a/deploy-k3s/manifests/observability/vmagent.yaml
+++ b/deploy-k3s/manifests/observability/vmagent.yaml
@@ -57,6 +57,27 @@ data:
             action: keep
             regex: http-metrics
 
+      # node-exporter — per-node host metrics (node_filesystem_*, node_memory_*,
+      # node_load*). Pod-networked DaemonSet scraped on :9100 over the pod CIDR.
+      - job_name: node-exporter
+        kubernetes_sd_configs:
+          - role: pod
+            namespaces:
+              names: [honeydue]
+        relabel_configs:
+          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
+            action: keep
+            regex: node-exporter
+          - source_labels: [__meta_kubernetes_pod_container_port_number]
+            action: keep
+            regex: "9100"
+          - source_labels: [__meta_kubernetes_pod_name]
+            target_label: pod
+          - source_labels: [__meta_kubernetes_pod_node_name]
+            target_label: node
+          - target_label: service
+            replacement: node-exporter
+
       # honeyDue worker — also exposes /metrics if/when we add it.
       # Keep this stanza commented until the worker has a /metrics endpoint;
       # uncommented form drops scrapes silently.