Files
honeyDueAPI/deploy-k3s/manifests/observability/vmagent.yaml
T
Trey T 3b2ea9959a
Backend CI / Test (push) Has been cancelled
Backend CI / Contract Tests (push) Has been cancelled
Backend CI / Lint (push) Has been cancelled
Backend CI / Secret Scanning (push) Has been cancelled
Backend CI / Build (push) Has been cancelled
deploy: add node-exporter DaemonSet + vmagent scrape job
Per-node host metrics (node_filesystem_*, node_memory_*, node_load*) were
missing — a node running out of disk would silently fail the cluster before
any dashboard signal (RUNBOOK §11.1 gap #9). Adds:
- node-exporter DaemonSet (pod-networked, :9100; host /proc,/sys,/ ro) so
  vmagent scrapes it pod-to-pod over the cluster CIDR, independent of node
  public IPs (the netpol node-IP list is OVH-stale).
- two additive NetworkPolicies (default-deny-all is in force): ingress to
  node-exporter from vmagent, and vmagent egress to the pod CIDR on :9100.
- a node-exporter scrape job in the vmagent-config ConfigMap.

Feeds the new "Node host health" row (disk/mem/load) on the eli5 dashboard.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-08 21:41:40 -05:00

283 lines
8.6 KiB
YAML

# vmagent — scrapes Prometheus /metrics from in-cluster services and
# remote-writes them to https://obs.88oakapps.com/api/v1/write
# (VictoriaMetrics on 88oakappsUpdate, fronted by Cloudflare + nginx
# bearer-token auth). Single replica is fine — vmagent buffers locally
# during transient remote outages.
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: vmagent-config
  namespace: honeydue
  labels:
    app.kubernetes.io/name: vmagent
    app.kubernetes.io/part-of: honeydue
data:
  # Scrape configuration for vmagent. The Deployment in this file mounts this
  # ConfigMap at /etc/vmagent and passes the file via -promscrape.config.
  scrape.yaml: |
    global:
      scrape_interval: 15s
      # Attached to every series this agent ships out.
      external_labels:
        cluster: honeydue-k3s
        environment: prod

    scrape_configs:
      # honeyDue Go API — exposes /metrics on :8000
      - job_name: api
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - honeydue
        relabel_configs:
          # Keep only pods labelled app.kubernetes.io/name=api ...
          - action: keep
            source_labels:
              - __meta_kubernetes_pod_label_app_kubernetes_io_name
            regex: api
          # ... and, of their declared container ports, only :8000.
          - action: keep
            source_labels:
              - __meta_kubernetes_pod_container_port_number
            regex: '8000'
          # Copy pod and node names onto the series; stamp a fixed service.
          - source_labels:
              - __meta_kubernetes_pod_name
            target_label: pod
          - source_labels:
              - __meta_kubernetes_pod_node_name
            target_label: node
          - target_label: service
            replacement: api

      # kube-state-metrics — cluster object state (kube_pod_*,
      # kube_deployment_*, etc.) needed for Grafana panels that count
      # pods/replicas/etc.
      - job_name: kube-state-metrics
        kubernetes_sd_configs:
          - role: endpoints
            namespaces:
              names:
                - kube-system
        relabel_configs:
          # Keep only endpoints behind the kube-state-metrics Service ...
          - action: keep
            source_labels:
              - __meta_kubernetes_service_label_app_kubernetes_io_name
            regex: kube-state-metrics
          # ... and only the port named http-metrics.
          - action: keep
            source_labels:
              - __meta_kubernetes_endpoint_port_name
            regex: http-metrics

      # node-exporter — per-node host metrics (node_filesystem_*,
      # node_memory_*, node_load*). Pod-networked DaemonSet scraped on :9100
      # over the pod CIDR.
      - job_name: node-exporter
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - honeydue
        relabel_configs:
          # Keep only pods labelled app.kubernetes.io/name=node-exporter ...
          - action: keep
            source_labels:
              - __meta_kubernetes_pod_label_app_kubernetes_io_name
            regex: node-exporter
          # ... and only their :9100 container port.
          - action: keep
            source_labels:
              - __meta_kubernetes_pod_container_port_number
            regex: '9100'
          # Same pod/node/service labelling scheme as the api job.
          - source_labels:
              - __meta_kubernetes_pod_name
            target_label: pod
          - source_labels:
              - __meta_kubernetes_pod_node_name
            target_label: node
          - target_label: service
            replacement: node-exporter

      # honeyDue worker — also exposes /metrics if/when we add it.
      # Keep this stanza commented until the worker has a /metrics endpoint;
      # the uncommented form drops scrapes silently.
      # - job_name: worker
      #   kubernetes_sd_configs:
      #     - role: pod
      #       namespaces:
      #         names: [honeydue]
      #   relabel_configs:
      #     - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
      #       action: keep
      #       regex: worker
---
apiVersion: v1
kind: Secret
type: Opaque
metadata:
  name: vmagent-remote-write
  namespace: honeydue
  labels:
    app.kubernetes.io/part-of: honeydue
    app.kubernetes.io/name: vmagent
stringData:
  # Bearer token vmagent presents to obs.88oakapps.com on remote-write.
  # The value below is only a placeholder: the real token is provisioned at
  # deploy time from deploy/prod.env (OBS_INGEST_TOKEN) and has to be the same
  # token stored in /etc/honeydue-obs/ingest_token on 88oakappsUpdate.
  bearer_token: TOKEN_PLACEHOLDER
---
# Read-only access to core-group pods, services and endpoints inside the
# honeydue namespace, bound to the vmagent ServiceAccount by the RoleBinding
# of the same name.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: vmagent
  namespace: honeydue
rules:
  - apiGroups:
      - ""
    resources:
      - pods
      - services
      - endpoints
    verbs:
      - get
      - list
      - watch
---
# Identity the vmagent Deployment runs under (serviceAccountName: vmagent).
# The RoleBindings in this file name it as their subject.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: honeydue
  name: vmagent
---
# Grants the honeydue-namespace Role "vmagent" to the vmagent ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: vmagent
  namespace: honeydue
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: vmagent
subjects:
  - kind: ServiceAccount
    namespace: honeydue
    name: vmagent
---
# The kube-state-metrics scrape job discovers its target through the
# Service/Endpoints objects in kube-system. vmagent's ServiceAccount lives in
# honeydue, so that cross-namespace service discovery needs its own Role in
# kube-system (this object) plus a RoleBinding to it.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: vmagent-kube-system
  namespace: kube-system
rules:
  - apiGroups:
      - ""
    resources:
      - services
      - endpoints
      - pods
    verbs:
      - get
      - list
      - watch
---
# Binds the kube-system Role "vmagent-kube-system" to the vmagent
# ServiceAccount, which is defined in the honeydue namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: vmagent-kube-system
  namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: vmagent-kube-system
subjects:
  - kind: ServiceAccount
    namespace: honeydue
    name: vmagent
---
# vmagent workload: one pod that scrapes the targets listed in the
# vmagent-config ConfigMap and remote-writes them to obs.88oakapps.com using
# the bearer token from the vmagent-remote-write Secret.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vmagent
  namespace: honeydue
  labels:
    app.kubernetes.io/name: vmagent
    app.kubernetes.io/part-of: honeydue
spec:
  replicas: 1
  # Recreate: the old pod is terminated before its replacement is created.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app.kubernetes.io/name: vmagent
  template:
    metadata:
      labels:
        app.kubernetes.io/name: vmagent
        app.kubernetes.io/part-of: honeydue
    spec:
      # ServiceAccount that the RoleBindings in this file grant read access to.
      serviceAccountName: vmagent
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        fsGroup: 1000
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: vmagent
          # Pinned by digest (audit K3S-F14).
          image: victoriametrics/vmagent:v1.106.1@sha256:90208a667c0baf65f7536b92a84c40b6e35ffe8e88bda7e4447b97b06c6ba6b8
          imagePullPolicy: IfNotPresent  # audit CODE-L4 — explicit
          # Container-level hardening (audit F7) — matches the other 5
          # workloads. vmagent only writes to the /tmp/vmagent emptyDir
          # (its remoteWrite buffer), so a read-only root filesystem holds.
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            capabilities:
              drop: ["ALL"]
          args:
            # Scrape config comes from the "config" volume (ConfigMap).
            - "-promscrape.config=/etc/vmagent/scrape.yaml"
            - "-remoteWrite.url=https://obs.88oakapps.com/api/v1/write"
            # Token file comes from the "secrets" volume (Secret).
            - "-remoteWrite.bearerTokenFile=/etc/vmagent-secrets/bearer_token"
            # On-disk remote-write buffer lives on the "buffer" emptyDir.
            - "-remoteWrite.tmpDataPath=/tmp/vmagent"
            - "-remoteWrite.maxDiskUsagePerURL=512MB"
            - "-loggerLevel=INFO"
          ports:
            - containerPort: 8429
              name: http
          resources:
            requests:
              cpu: 25m
              memory: 64Mi
            limits:
              cpu: 200m
              memory: 256Mi
          volumeMounts:
            - name: config
              mountPath: /etc/vmagent
              readOnly: true
            - name: secrets
              mountPath: /etc/vmagent-secrets
              readOnly: true
            - name: buffer
              mountPath: /tmp/vmagent
          # Process startup gate. /-/healthy returns 200 once vmagent has
          # parsed config — 24 attempts x 5 s gives the agent up to 2 min to
          # come up before liveness starts evaluating.
          startupProbe:
            httpGet:
              path: /-/healthy
              port: http
            initialDelaySeconds: 5
            periodSeconds: 5
            failureThreshold: 24
          # Real liveness check: are scrapes actually succeeding?
          # /-/healthy was the old probe and returned 200 for 17 days even
          # while vmagent had zero healthy targets (stale k8s SD watch).
          # This exec probe queries vmagent's own targets API and fails if
          # NO target is in state "up". Five consecutive failures at the
          # 120 s period (~10 min) → kubelet kills the pod → fresh SD watch.
          livenessProbe:
            exec:
              command:
                - sh
                - -c
                - 'n=$(wget -qO- -T 4 http://localhost:8429/api/v1/targets 2>/dev/null | grep -c ''"health":"up"''); [ "$n" -gt 0 ]'
            initialDelaySeconds: 180
            periodSeconds: 120
            # Slightly above the 4 s wget timeout (-T 4) used in the command.
            timeoutSeconds: 5
            failureThreshold: 5
          # Readiness only needs the process to be serving, so the cheap
          # /-/healthy endpoint is used here.
          readinessProbe:
            httpGet:
              path: /-/healthy
              port: http
            initialDelaySeconds: 5
            periodSeconds: 10
      volumes:
        - name: config
          configMap:
            name: vmagent-config
        - name: secrets
          secret:
            secretName: vmagent-remote-write
            # Octal file mode: read-only for the owner.
            defaultMode: 0400
        # Backing store for -remoteWrite.tmpDataPath; capped so the buffer
        # cannot grow without bound on the node.
        - name: buffer
          emptyDir:
            sizeLimit: 512Mi