From c8457719464293ac390ab96c127db14154eb16bf Mon Sep 17 00:00:00 2001 From: Trey t Date: Sun, 17 May 2026 21:29:15 -0500 Subject: [PATCH] feat(observability): drop health/metrics probe noise from shipped logs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The api logs every request, so k8s liveness/readiness probes on /api/health/ and vmagent's /metrics scrape drowned Loki in 2xx access logs. Alloy now drops successful probe/scrape access lines at ingest (loki.process stage.drop) — a non-2xx health check, or one logged at any level other than info, does not match the drop expression and is kept. Also hardens Alloy's read-offset store: moved /tmp/alloy from an emptyDir to a hostPath, so a pod restart resumes from the saved offset instead of re-reading log files from the start — which made Loki 400-reject the now-too-old entries ("entry too far behind") and stalled shipping. Also set loki.source.file tail_from_end=true, so that when no saved offset exists (fresh node, or positions wiped) Alloy starts at the end of each file instead of re-shipping history. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../manifests/observability/alloy-logs.yaml | 31 ++++++++++++++++--- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/deploy-k3s/manifests/observability/alloy-logs.yaml b/deploy-k3s/manifests/observability/alloy-logs.yaml index 09ec02f..2d1a49f 100644 --- a/deploy-k3s/manifests/observability/alloy-logs.yaml +++ b/deploy-k3s/manifests/observability/alloy-logs.yaml @@ -139,15 +139,32 @@ data: } loki.source.file "pod_logs" { - targets = local.file_match.pod_logs.targets - forward_to = [loki.process.pod_logs.receiver] + targets = local.file_match.pod_logs.targets + forward_to = [loki.process.pod_logs.receiver] + // With no stored read offset (fresh node, or positions wiped), start + // at the END of each file instead of re-shipping history — otherwise + // Loki rejects the now-too-old entries ("entry too far behind") and + // shipping stalls. Offsets persist on a hostPath (see volumes), so a + // normal pod restart resumes exactly where it left off. 
+ tail_from_end = true } - // Parse the CRI log format (timestamp / stream / flags / message). + // Parse the CRI log format (timestamp / stream / flags / message), + // then drop probe/scrape noise before shipping. loki.process "pod_logs" { forward_to = [loki.write.obs.receiver] stage.cri {} + + // Drop successful probe/scrape access logs. k8s liveness/readiness + // hits /api/health/ every few seconds and vmagent scrapes /metrics + // on a 15s interval — all 2xx, pure noise that drowns real logs. + // A non-2xx health check, or one not logged at info level, does NOT match + // and is kept. NOTE(review): regex assumes key order level, path, status — confirm against the api logger. + stage.drop { + expression = "\"level\":\"info\".*\"path\":\"/(api/health/?|metrics)\".*\"status\":2[0-9][0-9]" + drop_counter_reason = "probe_access_ok" + } } loki.write "obs" { @@ -252,6 +269,10 @@ spec: hostPath: path: /var/log/pods type: Directory + # Alloy's positions/WAL store. A hostPath (not emptyDir) so file read + # offsets survive pod restarts — otherwise every restart re-reads log + # files from the start and Loki rejects the now-too-old entries. NOTE(review): DirectoryOrCreate creates the dir 0755 with kubelet's ownership — confirm Alloy's user can write to it. - name: tmp - emptyDir: - sizeLimit: 256Mi + hostPath: + path: /var/lib/honeydue-alloy-logs + type: DirectoryOrCreate