# honeyDueAPI/deploy-k3s/manifests/observability/vmagent.yaml

# vmagent — scrapes Prometheus /metrics from in-cluster services and
# remote-writes them to https://obs.88oakapps.com/api/v1/write
# (VictoriaMetrics on 88oakappsUpdate, fronted by Cloudflare + nginx
# bearer-token auth). Single replica is fine — vmagent buffers locally
# during transient remote outages.

---
# ConfigMap holding vmagent's scrape configuration. The vmagent Deployment
# in this file mounts it at /etc/vmagent and starts vmagent with
# -promscrape.config=/etc/vmagent/scrape.yaml.
apiVersion: v1
kind: ConfigMap
metadata:
  name: vmagent-config
  namespace: honeydue
  labels:
    app.kubernetes.io/name: vmagent
    app.kubernetes.io/part-of: honeydue
data:
  # Literal block scalar: everything indented below (comments included) is
  # stored verbatim as the content of the scrape.yaml file. Three jobs are
  # active (api, kube-state-metrics, node-exporter); the worker job at the
  # end is intentionally left commented out.
  scrape.yaml: |
    global:
      scrape_interval: 15s
      external_labels:
        cluster: honeydue-k3s
        environment: prod

    scrape_configs:
      # honeyDue Go API — exposes /metrics on :8000
      - job_name: api
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names: [honeydue]
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
            action: keep
            regex: api
          - source_labels: [__meta_kubernetes_pod_container_port_number]
            action: keep
            regex: "8000"
          - source_labels: [__meta_kubernetes_pod_name]
            target_label: pod
          - source_labels: [__meta_kubernetes_pod_node_name]
            target_label: node
          - target_label: service
            replacement: api

      # kube-state-metrics — cluster object state (kube_pod_*, kube_deployment_*,
      # etc.) needed for Grafana panels that count pods/replicas/etc.
      - job_name: kube-state-metrics
        kubernetes_sd_configs:
          - role: endpoints
            namespaces:
              names: [kube-system]
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name]
            action: keep
            regex: kube-state-metrics
          - source_labels: [__meta_kubernetes_endpoint_port_name]
            action: keep
            regex: http-metrics

      # node-exporter — per-node host metrics (node_filesystem_*, node_memory_*,
      # node_load*). Pod-networked DaemonSet scraped on :9100 over the pod CIDR.
      - job_name: node-exporter
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names: [honeydue]
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
            action: keep
            regex: node-exporter
          - source_labels: [__meta_kubernetes_pod_container_port_number]
            action: keep
            regex: "9100"
          - source_labels: [__meta_kubernetes_pod_name]
            target_label: pod
          - source_labels: [__meta_kubernetes_pod_node_name]
            target_label: node
          - target_label: service
            replacement: node-exporter

      # honeyDue worker — also exposes /metrics if/when we add it.
      # Keep this stanza commented until the worker has a /metrics endpoint;
      # uncommented form drops scrapes silently.
      # - job_name: worker
      #   kubernetes_sd_configs:
      #     - role: pod
      #       namespaces:
      #         names: [honeydue]
      #   relabel_configs:
      #     - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
      #       action: keep
      #       regex: worker

---
# Opaque Secret carrying the bearer token vmagent uses for remote write.
# The vmagent Deployment mounts it at /etc/vmagent-secrets and reads it via
# -remoteWrite.bearerTokenFile.
apiVersion: v1
kind: Secret
type: Opaque
metadata:
  namespace: honeydue
  name: vmagent-remote-write
  labels:
    app.kubernetes.io/part-of: honeydue
    app.kubernetes.io/name: vmagent
stringData:
  # Token for obs.88oakapps.com. The placeholder below is filled in at
  # deploy time from OBS_INGEST_TOKEN in deploy/prod.env, and the value
  # must match /etc/honeydue-obs/ingest_token on 88oakappsUpdate.
  bearer_token: TOKEN_PLACEHOLDER

---
# Read-only access to the core-API objects vmagent's Kubernetes service
# discovery looks at inside the honeydue namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  namespace: honeydue
  name: vmagent
rules:
  - apiGroups:
      - ""
    resources:
      - pods
      - services
      - endpoints
    verbs:
      - get
      - list
      - watch

---
# Identity the vmagent pod runs under (serviceAccountName: vmagent in the
# Deployment); the RoleBindings in this file attach its permissions.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: honeydue
  name: vmagent

---
# Binds the honeydue-scoped vmagent Role to the vmagent ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  namespace: honeydue
  name: vmagent
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: vmagent
subjects:
  - kind: ServiceAccount
    namespace: honeydue
    name: vmagent

---
# The kube-state-metrics scrape job runs service discovery against the
# kube-system namespace, which the honeydue-scoped Role does not reach.
# This Role, together with its RoleBinding, lets vmagent read the
# Service/Endpoints/Pod objects there so the job can find its target.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  namespace: kube-system
  name: vmagent-kube-system
rules:
  - apiGroups:
      - ""
    resources:
      - services
      - endpoints
      - pods
    verbs:
      - get
      - list
      - watch

---
# Binds the kube-system Role to the vmagent ServiceAccount, which lives in
# the honeydue namespace (hence the explicit subject namespace).
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  namespace: kube-system
  name: vmagent-kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: vmagent-kube-system
subjects:
  - kind: ServiceAccount
    namespace: honeydue
    name: vmagent

---
# Single-replica vmagent Deployment. Reads its scrape config from the
# vmagent-config ConfigMap, its remote-write bearer token from the
# vmagent-remote-write Secret, and buffers unsent data in an emptyDir.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vmagent
  namespace: honeydue
  labels:
    app.kubernetes.io/name: vmagent
    app.kubernetes.io/part-of: honeydue
spec:
  replicas: 1
  # Recreate: the old pod is stopped before the new one starts, so two
  # vmagent instances never run side by side during a rollout.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app.kubernetes.io/name: vmagent
  template:
    metadata:
      labels:
        app.kubernetes.io/name: vmagent
        app.kubernetes.io/part-of: honeydue
    spec:
      serviceAccountName: vmagent
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        fsGroup: 1000
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: vmagent
          # Pinned by digest (audit K3S-F14).
          image: victoriametrics/vmagent:v1.106.1@sha256:90208a667c0baf65f7536b92a84c40b6e35ffe8e88bda7e4447b97b06c6ba6b8
          imagePullPolicy: IfNotPresent  # audit CODE-L4 — explicit
          # Container-level hardening (audit F7) — matches the other 5
          # workloads. vmagent only writes to the /tmp/vmagent emptyDir
          # (its remoteWrite buffer), so a read-only root filesystem holds.
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            capabilities:
              drop: ["ALL"]
          args:
            - "-promscrape.config=/etc/vmagent/scrape.yaml"
            - "-remoteWrite.url=https://obs.88oakapps.com/api/v1/write"
            - "-remoteWrite.bearerTokenFile=/etc/vmagent-secrets/bearer_token"
            - "-remoteWrite.tmpDataPath=/tmp/vmagent"
            # NOTE(review): this cap (512MB) sits only slightly below the
            # buffer emptyDir sizeLimit (512Mi) — confirm the headroom is
            # enough to avoid eviction when the buffer is full.
            - "-remoteWrite.maxDiskUsagePerURL=512MB"
            - "-loggerLevel=INFO"
          ports:
            - containerPort: 8429
              name: http
          resources:
            requests:
              cpu: 25m
              memory: 64Mi
            limits:
              cpu: 200m
              memory: 256Mi
          volumeMounts:
            - name: config
              mountPath: /etc/vmagent
              readOnly: true
            - name: secrets
              mountPath: /etc/vmagent-secrets
              readOnly: true
            - name: buffer
              mountPath: /tmp/vmagent
          # Process startup gate. /-/healthy returns 200 once vmagent has
          # parsed config — gives the agent up to 2 min to come up before
          # liveness starts evaluating.
          startupProbe:
            httpGet:
              path: /-/healthy
              port: http
            initialDelaySeconds: 5
            periodSeconds: 5
            failureThreshold: 24
          # Real liveness check: are scrapes actually succeeding?
          # /-/healthy was the old probe and returned 200 for 17 days even
          # while vmagent had zero healthy targets (stale k8s SD watch).
          # This exec probe queries vmagent's own targets API and fails if
          # NO target is in state "up". With periodSeconds: 120 and
          # failureThreshold: 5, five consecutive failures (~10 min without
          # a healthy target) → kubelet restarts the container → fresh SD
          # watch.
          livenessProbe:
            exec:
              command:
                - sh
                - -c
                - 'n=$(wget -qO- -T 4 http://localhost:8429/api/v1/targets 2>/dev/null | grep -c ''"health":"up"''); [ "$n" -gt 0 ]'
            initialDelaySeconds: 180
            periodSeconds: 120
            timeoutSeconds: 5
            failureThreshold: 5
          readinessProbe:
            httpGet:
              path: /-/healthy
              port: http
            initialDelaySeconds: 5
            periodSeconds: 10
      volumes:
        - name: config
          configMap:
            name: vmagent-config
        - name: secrets
          secret:
            secretName: vmagent-remote-write
            defaultMode: 0400
        - name: buffer
          emptyDir:
            sizeLimit: 512Mi