# vmagent — scrapes Prometheus /metrics from in-cluster services and
# remote-writes the samples to https://obs.88oakapps.com/api/v1/write
# (VictoriaMetrics on 88oakappsUpdate, fronted by Cloudflare + nginx
# bearer-token auth). A single replica is sufficient: vmagent buffers
# locally while the remote endpoint is temporarily unreachable.
---
# Scrape configuration, mounted into the vmagent container as scrape.yaml.
apiVersion: v1
kind: ConfigMap
metadata:
  name: vmagent-config
  namespace: honeydue
  labels:
    app.kubernetes.io/name: vmagent
    app.kubernetes.io/part-of: honeydue
data:
  scrape.yaml: |
    global:
      scrape_interval: 15s
      external_labels:
        cluster: honeydue-k3s
        environment: prod

    scrape_configs:
      # honeyDue Go API — exposes /metrics on :8000
      - job_name: api
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names: [honeydue]
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
            action: keep
            regex: api
          - source_labels: [__meta_kubernetes_pod_container_port_number]
            action: keep
            regex: "8000"
          - source_labels: [__meta_kubernetes_pod_name]
            target_label: pod
          - source_labels: [__meta_kubernetes_pod_node_name]
            target_label: node
          - target_label: service
            replacement: api

      # kube-state-metrics — cluster object state (kube_pod_*, kube_deployment_*,
      # etc.) needed for Grafana panels that count pods/replicas/etc.
      - job_name: kube-state-metrics
        kubernetes_sd_configs:
          - role: endpoints
            namespaces:
              names: [kube-system]
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name]
            action: keep
            regex: kube-state-metrics
          - source_labels: [__meta_kubernetes_endpoint_port_name]
            action: keep
            regex: http-metrics

      # honeyDue worker — also exposes /metrics if/when we add it.
      # Keep this stanza commented until the worker has a /metrics endpoint;
      # uncommented form drops scrapes silently.
      # - job_name: worker
      #   kubernetes_sd_configs:
      #     - role: pod
      #       namespaces:
      #         names: [honeydue]
      #   relabel_configs:
      #     - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
      #       action: keep
      #       regex: worker
---
# Credentials vmagent presents to the remote-write endpoint.
apiVersion: v1
kind: Secret
metadata:
  name: vmagent-remote-write
  namespace: honeydue
  labels:
    app.kubernetes.io/name: vmagent
    app.kubernetes.io/part-of: honeydue
type: Opaque
stringData:
  # Bearer token for obs.88oakapps.com. The placeholder below is replaced at
  # deploy time with OBS_INGEST_TOKEN from deploy/prod.env. The value on the
  # cluster side must be identical to the token stored in
  # /etc/honeydue-obs/ingest_token on 88oakappsUpdate.
  bearer_token: TOKEN_PLACEHOLDER
---
# Read-only access to the objects vmagent's Kubernetes service discovery
# watches inside the honeydue namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: vmagent
  namespace: honeydue
rules:
  - apiGroups:
      - ""
    resources:
      - pods
      - services
      - endpoints
    verbs:
      - get
      - list
      - watch
---
# Identity the vmagent pod runs as; both RoleBindings in this file target it.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: vmagent
  namespace: honeydue
---
# Grants the honeydue-scoped Role above to the vmagent ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: vmagent
  namespace: honeydue
subjects:
  - kind: ServiceAccount
    name: vmagent
    namespace: honeydue
roleRef:
  kind: Role
  name: vmagent
  apiGroup: rbac.authorization.k8s.io
---
# Allow vmagent to discover the kube-state-metrics Service/Endpoints in
# kube-system so the kube-state-metrics scrape job can find its target.
# Cross-namespace SD needs an explicit RoleBinding here.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: vmagent-kube-system
  namespace: kube-system
rules:
  # Read-only access to the objects the kube-system service-discovery job
  # watches.
  - apiGroups:
      - ""
    resources:
      - services
      - endpoints
      - pods
    verbs:
      - get
      - list
      - watch
---
# Binds the kube-system Role to the vmagent ServiceAccount, which lives in
# the honeydue namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: vmagent-kube-system
  namespace: kube-system
subjects:
  - kind: ServiceAccount
    name: vmagent
    namespace: honeydue
roleRef:
  kind: Role
  name: vmagent-kube-system
  apiGroup: rbac.authorization.k8s.io
---
# The vmagent workload itself: one replica, replaced (not rolled) on update.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vmagent
  namespace: honeydue
  labels:
    app.kubernetes.io/name: vmagent
    app.kubernetes.io/part-of: honeydue
spec:
  replicas: 1
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app.kubernetes.io/name: vmagent
  template:
    metadata:
      labels:
        app.kubernetes.io/name: vmagent
        app.kubernetes.io/part-of: honeydue
    spec:
      serviceAccountName: vmagent
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        fsGroup: 1000
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: vmagent
          image: victoriametrics/vmagent:v1.106.1
          args:
            - "-promscrape.config=/etc/vmagent/scrape.yaml"
            - "-remoteWrite.url=https://obs.88oakapps.com/api/v1/write"
            - "-remoteWrite.bearerTokenFile=/etc/vmagent-secrets/bearer_token"
            - "-remoteWrite.tmpDataPath=/tmp/vmagent"
            - "-remoteWrite.maxDiskUsagePerURL=512MB"
            - "-loggerLevel=INFO"
          ports:
            - containerPort: 8429
              name: http
          # Container-level hardening to go with the pod-level settings
          # above (non-root, RuntimeDefault seccomp). vmagent listens on an
          # unprivileged port, so it needs no capabilities.
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
          resources:
            requests:
              cpu: 25m
              memory: 64Mi
            limits:
              cpu: 200m
              memory: 256Mi
          volumeMounts:
            - name: config
              mountPath: /etc/vmagent
              readOnly: true
            - name: secrets
              mountPath: /etc/vmagent-secrets
              readOnly: true
            - name: buffer
              mountPath: /tmp/vmagent
          # Process startup gate. /-/healthy returns 200 once vmagent has
          # parsed config — gives the agent up to 2 min to come up
          # (5 s initial delay + 24 attempts x 5 s) before liveness starts
          # evaluating.
          startupProbe:
            httpGet:
              path: /-/healthy
              port: http
            initialDelaySeconds: 5
            periodSeconds: 5
            failureThreshold: 24
          # Real liveness check: are scrapes actually succeeding?
          # /-/healthy was the old probe and returned 200 for 17 days even
          # while vmagent had zero healthy targets (stale k8s SD watch).
          # This exec probe queries vmagent's own targets API and fails if
          # NO target is in state "up". Five consecutive failures at a
          # 120 s period (~10 min) → kubelet restarts the container →
          # fresh SD watch.
          livenessProbe:
            exec:
              command:
                - sh
                - -c
                # grep -c prints 0 when wget fails or nothing matches, so
                # the final test fails in both cases.
                - |-
                  n=$(wget -qO- -T 4 http://localhost:8429/api/v1/targets 2>/dev/null | grep -c '"health":"up"')
                  [ "$n" -gt 0 ]
            initialDelaySeconds: 180
            periodSeconds: 120
            # Kept above the 4 s wget timeout so wget gives up first.
            timeoutSeconds: 5
            failureThreshold: 5
          readinessProbe:
            httpGet:
              path: /-/healthy
              port: http
            initialDelaySeconds: 5
            periodSeconds: 10
      volumes:
        - name: config
          configMap:
            name: vmagent-config
        - name: secrets
          secret:
            secretName: vmagent-remote-write
            defaultMode: 0400
        - name: buffer
          emptyDir:
            # Headroom over -remoteWrite.maxDiskUsagePerURL=512MB. The old
            # 512Mi limit left only ~25 MB of slack, and exceeding an
            # emptyDir sizeLimit gets the pod evicted, discarding the
            # buffered samples.
            sizeLimit: 1Gi