diff --git a/deploy-k3s/manifests/observability/vmagent.yaml b/deploy-k3s/manifests/observability/vmagent.yaml new file mode 100644 index 0000000..b36d545 --- /dev/null +++ b/deploy-k3s/manifests/observability/vmagent.yaml @@ -0,0 +1,187 @@ +# vmagent — scrapes Prometheus /metrics from in-cluster services and +# remote-writes them to https://obs.88oakapps.com/api/v1/write +# (VictoriaMetrics on 88oakappsUpdate, fronted by Cloudflare + nginx +# bearer-token auth). Single replica is fine — vmagent buffers locally +# during transient remote outages. + +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: vmagent-config + namespace: honeydue + labels: + app.kubernetes.io/name: vmagent + app.kubernetes.io/part-of: honeydue +data: + scrape.yaml: | + global: + scrape_interval: 15s + external_labels: + cluster: honeydue-k3s + environment: prod + + scrape_configs: + # honeyDue Go API — exposes /metrics on :8000 + - job_name: api + kubernetes_sd_configs: + - role: pod + namespaces: + names: [honeydue] + relabel_configs: + - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name] + action: keep + regex: api + - source_labels: [__meta_kubernetes_pod_container_port_number] + action: keep + regex: "8000" + - source_labels: [__meta_kubernetes_pod_name] + target_label: pod + - source_labels: [__meta_kubernetes_pod_node_name] + target_label: node + - target_label: service + replacement: api + + # honeyDue worker — also exposes /metrics if/when we add it. + # Keep this stanza commented until the worker has a /metrics endpoint; + # uncommented form drops scrapes silently. 
+ # - job_name: worker + # kubernetes_sd_configs: + # - role: pod + # namespaces: + # names: [honeydue] + # relabel_configs: + # - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name] + # action: keep + # regex: worker + +--- +apiVersion: v1 +kind: Secret +metadata: + name: vmagent-remote-write + namespace: honeydue + labels: + app.kubernetes.io/name: vmagent + app.kubernetes.io/part-of: honeydue +type: Opaque +stringData: + # Bearer token for obs.88oakapps.com. Provisioned at deploy time from + # deploy/prod.env (OBS_INGEST_TOKEN). The cluster-side token must match + # the token in /etc/honeydue-obs/ingest_token on 88oakappsUpdate. + bearer_token: TOKEN_PLACEHOLDER + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: vmagent + namespace: honeydue +rules: + - apiGroups: [""] + resources: [pods, services, endpoints] + verbs: [get, list, watch] + +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: vmagent + namespace: honeydue + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: vmagent + namespace: honeydue +subjects: + - kind: ServiceAccount + name: vmagent + namespace: honeydue +roleRef: + kind: Role + name: vmagent + apiGroup: rbac.authorization.k8s.io + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vmagent + namespace: honeydue + labels: + app.kubernetes.io/name: vmagent + app.kubernetes.io/part-of: honeydue +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + app.kubernetes.io/name: vmagent + template: + metadata: + labels: + app.kubernetes.io/name: vmagent + app.kubernetes.io/part-of: honeydue + spec: + serviceAccountName: vmagent + securityContext: + runAsNonRoot: true + runAsUser: 1000 + fsGroup: 1000 + seccompProfile: + type: RuntimeDefault + containers: + - name: vmagent + image: victoriametrics/vmagent:v1.106.1 + args: + - "-promscrape.config=/etc/vmagent/scrape.yaml" + - "-remoteWrite.url=https://obs.88oakapps.com/api/v1/write" 
+ - "-remoteWrite.bearerTokenFile=/etc/vmagent-secrets/bearer_token" + - "-remoteWrite.tmpDataPath=/tmp/vmagent" + - "-remoteWrite.maxDiskUsagePerURL=512MB" + - "-loggerLevel=INFO" + ports: + - containerPort: 8429 + name: http + resources: + requests: + cpu: 25m + memory: 64Mi + limits: + cpu: 200m + memory: 256Mi + volumeMounts: + - name: config + mountPath: /etc/vmagent + readOnly: true + - name: secrets + mountPath: /etc/vmagent-secrets + readOnly: true + - name: buffer + mountPath: /tmp/vmagent + livenessProbe: + httpGet: + path: /-/healthy + port: http + initialDelaySeconds: 10 + periodSeconds: 30 + readinessProbe: + httpGet: + path: /-/healthy + port: http + initialDelaySeconds: 5 + periodSeconds: 10 + volumes: + - name: config + configMap: + name: vmagent-config + - name: secrets + secret: + secretName: vmagent-remote-write + defaultMode: 0400 + - name: buffer + emptyDir: + sizeLimit: 512Mi diff --git a/docs/observability-plan.md b/docs/observability-plan.md new file mode 100644 index 0000000..2eefe53 --- /dev/null +++ b/docs/observability-plan.md @@ -0,0 +1,164 @@ +# Observability Plan — honeyDue (100% self-hosted) + +**Goal:** Live request-timing visibility (HTTP, DB, B2 uploads, APNs, asynq jobs) without paying any SaaS vendor. + +**Deployment target:** `88oakappsUpdate` (Linode VPS at `185.143.228.16`, Ubuntu 24.04, 8 vCPU / 32 GB RAM / 193 GB disk). This box already runs the self-hosted PostHog stack and has nginx + Let's Encrypt set up for `*.88oakapps.com`. Free RAM at rest ≈ 15 GB; the obs stack budget is ≈ 700 MB → ~5% of free RAM. Costs $0 incremental. + +**Why not in the honeyDue k3s cluster:** Frees ~700 MB across the 3 Hetzner nodes, no PVC plumbing, and no need to expose anything from k3s — everything is push-from-app to a public TLS endpoint. + +**Status:** Plan only — nothing implemented yet. + +--- + +## Stack + +| Role | Choice | Why this vs. 
the obvious alternative | +|---|---|---| +| Metrics store | **VictoriaMetrics** (single-node) | Drop-in Prometheus-compatible. ~4× lower RAM (~200 MB vs ~500 MB) and ~7× better compression. Single binary. | +| Tracing | **Jaeger all-in-one** | ~150 MB RAM with embedded badger storage. Tempo monolithic mode needs 1-2 GB minimum — overkill for honeyDue's scale. | +| Dashboards | **Grafana OSS** | Connects to both VM (Prometheus protocol) and Jaeger natively. | +| App instrumentation | **OpenTelemetry SDK** + `prometheus/client_golang` | OTel is vendor-neutral — backends are swappable without code change. | +| Logs | **Keep Dozzle**; add Loki only when log search becomes painful | Loki adds ~512 MB RAM + a daemonset for log shipping. Not worth it until there's a concrete pain point. | + +### Why not the LGTM stack (Loki + Grafana + Tempo + Mimir)? + +- **Tempo** wants 1-2 GB RAM minimum in monolithic mode ([Grafana community report](https://community.grafana.com/t/tempo-ram-usage-for-6k-spans-per-hour/63801)). Stacking that on top of Loki + Mimir would consume ~3-4 GB RAM. On a 3×8 GB cluster that's 12-17% of capacity for observability infra. +- **Mimir** is wonderful for multi-tenant Prometheus at scale — you have one tenant. +- **Loki** is great if you live in `kubectl logs` and need full-text search across them. You currently use Dozzle and are not feeling that pain. + +VictoriaMetrics + Jaeger all-in-one gives you 90% of the value at 25% of the resource cost. + +--- + +## Resource budget on `88oakappsUpdate` + +Three Docker containers in a separate compose project under `/opt/honeydue-obs/` — fully isolated from the existing PostHog compose stack so PostHog's lifecycle never touches the obs stack and vice versa. 
+ +| Service | `mem_limit` | Disk (bind mount) | Retention | +|---|---|---|---| +| VictoriaMetrics single-node | 256 MB | 10 GB | 30 days metrics | +| Jaeger all-in-one (badger storage) | 256 MB | 10 GB | 7 days traces | +| Grafana OSS | 256 MB | 1 GB | — | +| **Total** | **~768 MB hard cap** | **21 GB** | | + +**~5% of the box's free RAM and ~14% of free disk.** The hard `mem_limit` per container matters: ClickHouse on the same VM can spike under PostHog analytics load, so bounding the obs stack prevents it from competing in a memory pinch. + +**Don't reuse PostHog's ClickHouse / Kafka / Redis.** Tempting because they're sitting right there, but coupling honeyDue's observability to PostHog's storage means a PostHog incident takes honeyDue's incident-response telemetry down with it. Keep them fully separate. + +**Shared blast radius caveat:** A kernel panic on `88oakappsUpdate` loses both PostHog and honeyDue obs at once. At current scale, fine — call it out, don't fix. + +--- + +## App-side instrumentation + +| Surface | Library / approach | Import path | +|---|---|---| +| Echo HTTP middleware | `otelecho` — span per request, tagged route/method/status | `go.opentelemetry.io/contrib/instrumentation/github.com/labstack/echo/otelecho` | +| GORM queries | `uptrace/otelgorm` plugin — `db.Use(otelgorm.NewPlugin())`. Requires threading `ctx` through repositories so `db.WithContext(ctx)` works. 
| `github.com/uptrace/opentelemetry-go-extra/otelgorm` | +| B2 / minio-go uploads | Manual span around `storage_service.Upload` with attributes for bucket, object size, MIME type | `go.opentelemetry.io/otel` | +| APNs / FCM | Manual span in `internal/push/apns.go` and `fcm.go`; record a hash of the device token (never the raw token) and the response status code | `go.opentelemetry.io/otel` | +| asynq jobs | Custom `asynq.MiddlewareFunc` (~20 lines) — span per task type, attached to ctx, records duration + retry count | `go.opentelemetry.io/otel` + `asynq.MiddlewareFunc` | +| Prometheus `/metrics` endpoint | `prometheus/client_golang` direct — register histograms for HTTP duration / GORM op / B2 op / APNs send | `github.com/prometheus/client_golang/prometheus`, `.../prometheus/promhttp` | +| OTLP exporter | OTLP/HTTP → `https://obs.88oakapps.com/v1/traces` with bearer token. 100% sample in dev, 10% in prod. | `go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp` | +| Metrics push | `vmagent` Deployment in k3s scrapes the api Pod's `/metrics` and remote-writes to `https://obs.88oakapps.com/api/v1/write` with bearer token. Cleaner than exposing `/metrics` publicly. | `victoriametrics/vmagent` image | + +**Note on GORM context propagation:** the existing repository methods don't take `ctx context.Context`. Adding `otelgorm` requires plumbing ctx down from the Echo handler through the service layer to the repository call site. ~10 repository files, many call sites. Save for last because the diff is large. + +--- + +## Implementation order (smallest first) + +### Step 1 — Metrics + dashboards (highest immediate ROI) + +**On `88oakappsUpdate`:** +1. `mkdir -p /opt/honeydue-obs/{data/vm,data/jaeger,data/grafana}` and a `docker-compose.yml` defining the three services with `mem_limit: 256m`, bind mounts for persistence, and an isolated bridge network +2. 
Add nginx vhosts (DNS A records first): + - `grafana.88oakapps.com` → `127.0.0.1:3000` (basic auth via htpasswd, Let's Encrypt) + - `obs.88oakapps.com` → routes by path: + - `/api/v1/write` → `127.0.0.1:8428` (VictoriaMetrics remote-write, bearer-token check) + - `/v1/traces` → `127.0.0.1:4318` (OTLP/HTTP traces, bearer-token check) +3. Generate a 32-byte token, store in `/etc/honeydue-obs/ingest_token` (mode 0600), reference from nginx as `auth_request` or simple `if ($http_authorization != ...)` +4. Pre-provision Grafana with the VM datasource pointing at `http://victoriametrics:8428` (in-network) + +**On the honeyDue k3s cluster:** +5. Add `prometheus/client_golang` to `honeyDueAPI-go/go.mod` and a `/metrics` endpoint to the Go API +6. Register histograms: + - `http_request_duration_seconds{route,method,status}` via Echo middleware + - `gorm_query_duration_seconds{table,operation}` via a GORM `Plugin` callback (no ctx needed for this one — operates at the SQL string level) + - `b2_upload_duration_seconds{bucket,result}` + - `apns_send_duration_seconds{result}` +7. Deploy a single-replica `vmagent` Deployment in the `honeydue` namespace with: + - Scrape: api Pods' `/metrics` (Kubernetes pod discovery, port 8000) every 15s + - `-remoteWrite.url`: `https://obs.88oakapps.com/api/v1/write` + - `-remoteWrite.bearerTokenFile`: token file mounted from a k8s Secret +8. Build the RED dashboard in Grafana: rate, errors, duration p50/p95/p99 per route + +**ROI:** "Is the API healthy? Where is time being spent right now?" answered live, served from `grafana.88oakapps.com`. + +### Step 2 — Tracing baseline + +(Jaeger is already up from Step 1. This step adds the app-side wiring.) + +1. Add Grafana datasource for Jaeger pointing at `http://jaeger:16686` (in-network) +2. 
Wire OTel SDK in `cmd/api/main.go`: + - `otel.SetTracerProvider(tracerProvider)` + - `otelecho.Middleware("honeydue-api")` on Echo + - OTLP/HTTP exporter pointing at `https://obs.88oakapps.com/v1/traces` with `Authorization: Bearer ` header (token from env) + - Sampling: `TraceIDRatioBased(0.1)` in prod, `AlwaysSample()` in dev +3. Verify: a single `POST /api/auth/login/` produces a trace in Jaeger + +**ROI:** "Why is this one request slow?" — answered with a flame graph. + +### Step 3 — Manual spans for the work that actually matters + +Wrap each in `tracer.Start(ctx, ...)` with attributes: +- `storage_service.Upload` → span "b2.PutObject" with `bucket`, `key`, `size_bytes`, result +- `push/apns.go` → span "apns.send" with `device_token_hash`, `status_code`, `reason` +- `asynq` middleware → span per task type with `task.type`, `retry_count`, `payload_size` + +**ROI:** Specific high-value debugging questions ("why did this upload take 30 seconds", "why did these 5 push notifications fail") answered without code archaeology. + +### Step 4 — Repository ctx + `otelgorm` (biggest diff, save for last) + +1. Refactor every repository method to accept `ctx context.Context` as first arg +2. Update every call site to pass `c.Request().Context()` from handlers / propagate through services +3. Add `db.Use(otelgorm.NewPlugin())` in `internal/database/database.go` +4. Verify: a request now has nested spans `http → service → query → query → b2.PutObject → apns.send` with full SQL on the query spans + +**ROI:** Every DB query in every trace, with SQL + table + rows. The "find the N+1" tool you'd otherwise build by hand. + +--- + +## Hard skips (revisit only when explicitly proven needed) + +| Tool | Why skip | +|---|---| +| Loki / Promtail | Dozzle covers the immediate need. Loki adds 512 Mi RAM + a daemonset; defer until log search becomes a hot pain point. | +| Mimir / VM cluster mode | Single-node VM handles honeyDue scale for years. 
| +| Pyroscope continuous profiling | Overkill at 3 small nodes. Use `pprof` endpoints ad-hoc when CPU pressure shows up. | +| OTel Collector | Only worth running when 3+ services emit telemetry. App → Jaeger direct is fine for now. | +| Any SaaS vendor (Datadog, NR, Honeycomb, Grafana Cloud, Sentry Performance) | User constraint: nothing paid. | + +--- + +## When to move off `88oakappsUpdate` + +Triggers — any one is enough: +- `88oakappsUpdate` available memory drops below ~3 GB sustained (PostHog growth squeezing it) +- ClickHouse OOM events start showing up in `dmesg` (PostHog under load) +- You want fully separate failure domains for honeyDue vs. 88oakapps + +Migration path: the obs stack is a single docker-compose project on a bind-mount, so moving it = `rsync /opt/honeydue-obs/` to a new box, update DNS for `grafana.88oakapps.com` and `obs.88oakapps.com`, `docker compose up -d`. ~30 min of work. Until then: cohabiting on `88oakappsUpdate` is correct. + +--- + +## Quick reference: what shows up where + +| Question | Where to look | +|---|---| +| Is the API up right now? Latency? Errors? | Grafana RED dashboard | +| Why is this specific request slow? | Jaeger trace view | +| What did the slow part of that request actually do (which SQL, which B2 PUT)? | Span details inside the trace | +| Background job throughput / queue depth | VictoriaMetrics + asynq metrics | +| What did the app print to stdout 5 minutes ago? | Dozzle | +| What error did the app log? 
| Dozzle (search) — or Loki if/when added | diff --git a/go.mod b/go.mod index 798f110..88c028d 100644 --- a/go.mod +++ b/go.mod @@ -33,6 +33,7 @@ require ( ) require ( + github.com/beorn7/perks v1.0.1 // indirect github.com/dustin/go-humanize v1.0.1 // indirect github.com/go-ini/ini v1.67.0 // indirect github.com/klauspost/compress v1.18.2 // indirect @@ -40,9 +41,15 @@ require ( github.com/klauspost/crc32 v1.3.0 // indirect github.com/minio/crc64nvme v1.1.1 // indirect github.com/minio/md5-simd v1.1.2 // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/philhofer/fwd v1.2.0 // indirect + github.com/prometheus/client_golang v1.23.2 // indirect + github.com/prometheus/client_model v0.6.2 // indirect + github.com/prometheus/common v0.66.1 // indirect + github.com/prometheus/procfs v0.16.1 // indirect github.com/rs/xid v1.6.0 // indirect github.com/tinylib/msgp v1.6.1 // indirect + go.yaml.in/yaml/v2 v2.4.2 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect ) diff --git a/go.sum b/go.sum index 7a073b0..9c2e55f 100644 --- a/go.sum +++ b/go.sum @@ -8,6 +8,8 @@ github.com/BurntSushi/toml v1.5.0 h1:W5quZX/G/csjUnuI8SUYlsHs9M38FC7znL0lIO+DvMg github.com/BurntSushi/toml v1.5.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho= github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/units v0.0.0-20201120081800-1786d5ef83d4/go.mod h1:OMCwj8VM1Kc9e19TLln2VL61YJF0x1XFtfdL4JdbSyE= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs= github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c= github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA= @@ -121,6 +123,8 @@ github.com/minio/md5-simd v1.1.2 
h1:Gdi1DZK69+ZVMoNHRXJyNcxrMA4dSxoYHZSQbirFg34= github.com/minio/md5-simd v1.1.2/go.mod h1:MzdKDxYpY2BT9XQFocsiZf/NKVtR7nkE4RoEpN+20RM= github.com/minio/minio-go/v7 v7.0.99 h1:2vH/byrwUkIpFQFOilvTfaUpvAX3fEFhEzO+DR3DlCE= github.com/minio/minio-go/v7 v7.0.99/go.mod h1:EtGNKtlX20iL2yaYnxEigaIvj0G0GwSDnifnG8ClIdw= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/nicksnyder/go-i18n/v2 v2.6.0 h1:C/m2NNWNiTB6SK4Ao8df5EWm3JETSTIGNXBpMJTxzxQ= github.com/nicksnyder/go-i18n/v2 v2.6.0/go.mod h1:88sRqr0C6OPyJn0/KRNaEz1uWorjxIKP7rUUcvycecE= github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4= @@ -132,6 +136,14 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c h1:ncq/mPwQF4JjgDlrVEn3C11VoGHZN7m8qihwgMEtzYw= github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= +github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= +github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= +github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= +github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= +github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= +github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= +github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= +github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= 
github.com/redis/go-redis/v9 v9.17.1 h1:7tl732FjYPRT9H9aNfyTwKg9iTETjWjGKEJ2t/5iWTs= github.com/redis/go-redis/v9 v9.17.1/go.mod h1:u410H11HMLoB+TP67dz8rL9s6QW2j76l0//kSOd3370= github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs= @@ -204,6 +216,8 @@ go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJr go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= +go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/crypto v0.0.0-20170512130425-ab89591268e0/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= diff --git a/internal/database/database.go b/internal/database/database.go index 52fb7af..c08d0e1 100644 --- a/internal/database/database.go +++ b/internal/database/database.go @@ -14,6 +14,7 @@ import ( "github.com/treytartt/honeydue-api/internal/config" "github.com/treytartt/honeydue-api/internal/models" + "github.com/treytartt/honeydue-api/internal/prom" ) // migrationAdvisoryLockKey is the pg_advisory_lock key that serializes @@ -84,6 +85,13 @@ func Connect(cfg *config.DatabaseConfig, debug bool) (*gorm.DB, error) { Str("database", cfg.Database). Msg("Connected to PostgreSQL database") + // Register Prometheus GORM callbacks — emits gorm_query_duration_seconds + // for every SQL operation. Operates at the statement level, so does not + // require ctx to be threaded through repositories. 
+ if err := prom.RegisterGORMCallbacks(db); err != nil { + log.Warn().Err(err).Msg("failed to register prometheus GORM callbacks; metrics will be partial") + } + return db, nil } diff --git a/internal/prom/metrics.go b/internal/prom/metrics.go new file mode 100644 index 0000000..3b4daba --- /dev/null +++ b/internal/prom/metrics.go @@ -0,0 +1,199 @@ +package prom + +import ( + "strconv" + "time" + + "github.com/labstack/echo/v4" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/collectors" + "github.com/prometheus/client_golang/prometheus/promhttp" + "gorm.io/gorm" +) + +var ( + Registry = prometheus.NewRegistry() + + httpRequestDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Name: "http_request_duration_seconds", + Help: "Duration of HTTP requests in seconds.", + Buckets: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10}, + }, []string{"route", "method", "status"}) + + gormQueryDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Name: "gorm_query_duration_seconds", + Help: "Duration of GORM database queries in seconds.", + Buckets: []float64{0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5}, + }, []string{"table", "operation"}) + + b2UploadDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Name: "b2_upload_duration_seconds", + Help: "Duration of B2/S3 upload operations in seconds.", + Buckets: []float64{0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60}, + }, []string{"bucket", "result"}) + + b2UploadBytes = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "b2_upload_bytes_total", + Help: "Total bytes uploaded to B2/S3.", + }, []string{"bucket", "result"}) + + apnsSendDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Name: "apns_send_duration_seconds", + Help: "Duration of APNs push notification sends in seconds.", + Buckets: []float64{0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5}, + }, []string{"result"}) + + fcmSendDuration = 
prometheus.NewHistogramVec(prometheus.HistogramOpts{
+		Name:    "fcm_send_duration_seconds",
+		Help:    "Duration of FCM push notification sends in seconds.",
+		Buckets: []float64{0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5},
+	}, []string{"result"})
+
+	asynqJobDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
+		Name:    "asynq_job_duration_seconds",
+		Help:    "Duration of asynq background job execution in seconds.",
+		Buckets: []float64{0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30, 60, 300},
+	}, []string{"task_type", "result"})
+)
+
+func init() {
+	Registry.MustRegister(
+		collectors.NewGoCollector(),
+		collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}),
+		httpRequestDuration,
+		gormQueryDuration,
+		b2UploadDuration,
+		b2UploadBytes,
+		apnsSendDuration,
+		fcmSendDuration,
+		asynqJobDuration,
+	)
+}
+
+// Handler returns a promhttp Handler bound to the package Registry, suitable for
+// mounting at GET /metrics on Echo.
+func Handler() echo.HandlerFunc {
+	h := promhttp.HandlerFor(Registry, promhttp.HandlerOpts{Registry: Registry})
+	return echo.WrapHandler(h)
+}
+
+// HTTPMiddleware records http_request_duration_seconds for every request,
+// labeled by Echo route pattern, method, and status code.
+//
+// When installed with e.Use, Echo calls its HTTPErrorHandler only after this
+// middleware has returned. A handler that returns an error without writing a
+// response therefore still shows the response's initial status here, which
+// would mislabel failures as successes. For an uncommitted response with a
+// non-nil error the label is taken from the error instead: the Code of an
+// *echo.HTTPError, otherwise 500. A custom HTTPErrorHandler that maps errors
+// to other status codes is not reflected in the label.
+func HTTPMiddleware() echo.MiddlewareFunc {
+	return func(next echo.HandlerFunc) echo.HandlerFunc {
+		return func(c echo.Context) error {
+			start := time.Now()
+			err := next(c)
+
+			// c.Path() is the registered route pattern, which keeps label
+			// cardinality bounded; it is empty when no route was attached.
+			route := c.Path()
+			if route == "" {
+				route = "unknown"
+			}
+
+			status := c.Response().Status
+			if err != nil && !c.Response().Committed {
+				// Nothing has been written yet, so Status is not the code the
+				// client will receive. Plain type assertion (not errors.As)
+				// keeps this file's imports unchanged.
+				status = echo.ErrInternalServerError.Code
+				if httpErr, ok := err.(*echo.HTTPError); ok {
+					status = httpErr.Code
+				}
+			}
+
+			httpRequestDuration.WithLabelValues(
+				route,
+				c.Request().Method,
+				strconv.Itoa(status),
+			).Observe(time.Since(start).Seconds())
+			return err
+		}
+	}
+}
+
+// RegisterGORMCallbacks attaches before/after callbacks on a *gorm.DB so every
+// SQL operation records gorm_query_duration_seconds{table,operation}.
+//
+// Operates at the SQL/statement level — does NOT require ctx to be threaded
+// through repositories (that comes later when otelgorm lands).
+//
+// One before/after hook pair is registered around the core callback of each
+// GORM processor (create, query, update, delete, row, raw). The first
+// registration error is returned as-is; hooks registered before the failure
+// stay in place.
+func RegisterGORMCallbacks(db *gorm.DB) error {
+	// Instance-scoped key under which the before-hook stores the start time.
+	const startKey = "honeydue:prom_start"
+
+	// hook is the one method needed from the value GORM returns from
+	// Before(...)/After(...); that concrete type is unexported.
+	type hook interface {
+		Register(name string, fn func(*gorm.DB)) error
+	}
+
+	// markStart stamps the statement with the time just before the core
+	// callback runs.
+	markStart := func(tx *gorm.DB) {
+		tx.InstanceSet(startKey, time.Now())
+	}
+
+	// observe builds the after-hook for one operation label. It records
+	// nothing when no valid start time was stored for the statement.
+	observe := func(operation string) func(*gorm.DB) {
+		return func(tx *gorm.DB) {
+			stored, found := tx.InstanceGet(startKey)
+			if !found {
+				return
+			}
+			began, isTime := stored.(time.Time)
+			if !isTime {
+				return
+			}
+			tableLabel := tx.Statement.Table
+			if tableLabel == "" {
+				tableLabel = "unknown"
+			}
+			gormQueryDuration.WithLabelValues(tableLabel, operation).Observe(time.Since(began).Seconds())
+		}
+	}
+
+	callbacks := db.Callback()
+	points := []struct {
+		operation     string
+		before, after hook
+	}{
+		{"create", callbacks.Create().Before("gorm:create"), callbacks.Create().After("gorm:create")},
+		{"query", callbacks.Query().Before("gorm:query"), callbacks.Query().After("gorm:query")},
+		{"update", callbacks.Update().Before("gorm:update"), callbacks.Update().After("gorm:update")},
+		{"delete", callbacks.Delete().Before("gorm:delete"), callbacks.Delete().After("gorm:delete")},
+		{"row", callbacks.Row().Before("gorm:row"), callbacks.Row().After("gorm:row")},
+		{"raw", callbacks.Raw().Before("gorm:raw"), callbacks.Raw().After("gorm:raw")},
+	}
+
+	for _, p := range points {
+		if err := p.before.Register("prom:before_"+p.operation, markStart); err != nil {
+			return err
+		}
+		if err := p.after.Register("prom:after_"+p.operation, observe(p.operation)); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// ObserveB2Upload records duration + bytes for a B2/S3 upload. result is "ok"
+// or "error".
+func ObserveB2Upload(bucket, result string, dur time.Duration, bytes int64) { + b2UploadDuration.WithLabelValues(bucket, result).Observe(dur.Seconds()) + if bytes > 0 { + b2UploadBytes.WithLabelValues(bucket, result).Add(float64(bytes)) + } +} + +// ObserveAPNsSend records duration of a single APNs send. result is "ok", +// "bad_token", or "error". +func ObserveAPNsSend(result string, dur time.Duration) { + apnsSendDuration.WithLabelValues(result).Observe(dur.Seconds()) +} + +// ObserveFCMSend records duration of a single FCM send. result is "ok", +// "bad_token", or "error". +func ObserveFCMSend(result string, dur time.Duration) { + fcmSendDuration.WithLabelValues(result).Observe(dur.Seconds()) +} + +// ObserveAsynqJob records duration of a single asynq job execution. result is +// "ok", "retry", or "error". +func ObserveAsynqJob(taskType, result string, dur time.Duration) { + asynqJobDuration.WithLabelValues(taskType, result).Observe(dur.Seconds()) +} diff --git a/internal/push/apns.go b/internal/push/apns.go index 4544ea3..cf863e8 100644 --- a/internal/push/apns.go +++ b/internal/push/apns.go @@ -3,6 +3,7 @@ package push import ( "context" "fmt" + "time" "github.com/rs/zerolog/log" "github.com/sideshow/apns2" @@ -10,6 +11,7 @@ import ( "github.com/sideshow/apns2/token" "github.com/treytartt/honeydue-api/internal/config" + "github.com/treytartt/honeydue-api/internal/prom" ) // APNsClient handles direct communication with Apple Push Notification service @@ -84,8 +86,10 @@ func (c *APNsClient) Send(ctx context.Context, tokens []string, title, message s Priority: apns2.PriorityHigh, } + sendStart := time.Now() res, err := c.client.PushWithContext(ctx, notification) if err != nil { + prom.ObserveAPNsSend("error", time.Since(sendStart)) log.Error(). Err(err). Str("token", truncateToken(deviceToken)). 
@@ -95,6 +99,7 @@ func (c *APNsClient) Send(ctx context.Context, tokens []string, title, message s } if !res.Sent() { + prom.ObserveAPNsSend("bad_token", time.Since(sendStart)) log.Error(). Str("token", truncateToken(deviceToken)). Str("reason", res.Reason). @@ -104,6 +109,7 @@ func (c *APNsClient) Send(ctx context.Context, tokens []string, title, message s continue } + prom.ObserveAPNsSend("ok", time.Since(sendStart)) successCount++ log.Debug(). Str("token", truncateToken(deviceToken)). @@ -154,8 +160,10 @@ func (c *APNsClient) SendWithCategory(ctx context.Context, tokens []string, titl Priority: apns2.PriorityHigh, } + sendStart := time.Now() res, err := c.client.PushWithContext(ctx, notification) if err != nil { + prom.ObserveAPNsSend("error", time.Since(sendStart)) log.Error(). Err(err). Str("token", truncateToken(deviceToken)). @@ -166,6 +174,7 @@ func (c *APNsClient) SendWithCategory(ctx context.Context, tokens []string, titl } if !res.Sent() { + prom.ObserveAPNsSend("bad_token", time.Since(sendStart)) log.Error(). Str("token", truncateToken(deviceToken)). Str("reason", res.Reason). @@ -176,6 +185,7 @@ func (c *APNsClient) SendWithCategory(ctx context.Context, tokens []string, titl continue } + prom.ObserveAPNsSend("ok", time.Since(sendStart)) successCount++ log.Debug(). Str("token", truncateToken(deviceToken)). 
diff --git a/internal/push/fcm.go b/internal/push/fcm.go index f7cfff2..420691a 100644 --- a/internal/push/fcm.go +++ b/internal/push/fcm.go @@ -4,6 +4,7 @@ import ( "bytes" "context" "encoding/json" + "errors" "fmt" "io" "net/http" @@ -14,6 +15,7 @@ import ( "golang.org/x/oauth2/google" "github.com/treytartt/honeydue-api/internal/config" + "github.com/treytartt/honeydue-api/internal/prom" ) const ( @@ -213,8 +215,15 @@ func (c *FCMClient) Send(ctx context.Context, tokens []string, title, message st successCount := 0 for _, token := range tokens { + sendStart := time.Now() err := c.sendOne(ctx, token, title, message, data) if err != nil { + result := "error" + var fcmErr *FCMSendError + if errors.As(err, &fcmErr) && fcmErr.IsUnregistered() { + result = "bad_token" + } + prom.ObserveFCMSend(result, time.Since(sendStart)) log.Error(). Err(err). Str("token", truncateToken(token)). @@ -223,6 +232,7 @@ func (c *FCMClient) Send(ctx context.Context, tokens []string, title, message st continue } + prom.ObserveFCMSend("ok", time.Since(sendStart)) successCount++ log.Debug(). Str("token", truncateToken(token)). diff --git a/internal/router/router.go b/internal/router/router.go index acc53e8..dbaf8aa 100644 --- a/internal/router/router.go +++ b/internal/router/router.go @@ -23,6 +23,7 @@ import ( "github.com/treytartt/honeydue-api/internal/i18n" custommiddleware "github.com/treytartt/honeydue-api/internal/middleware" "github.com/treytartt/honeydue-api/internal/monitoring" + "github.com/treytartt/honeydue-api/internal/prom" "github.com/treytartt/honeydue-api/internal/push" "github.com/treytartt/honeydue-api/internal/repositories" "github.com/treytartt/honeydue-api/internal/services" @@ -121,6 +122,15 @@ func SetupRouter(deps *Dependencies) *echo.Echo { } } + // Prometheus metrics middleware — feeds VictoriaMetrics on + // obs.88oakapps.com via vmagent. Records http_request_duration_seconds + // labeled by route pattern, method, and status code. 
+ e.Use(prom.HTTPMiddleware()) + + // /metrics endpoint exposed for vmagent scrape. No auth — bound to + // the cluster network only; not exposed via Cloudflare. + e.GET("/metrics", prom.Handler()) + // Serve landing page static files (if static directory is configured) staticDir := cfg.Server.StaticDir if staticDir != "" { @@ -229,9 +239,11 @@ func SetupRouter(deps *Dependencies) *echo.Echo { mediaHandler = handlers.NewMediaHandler(documentRepo, taskRepo, residenceRepo, deps.StorageService) } - // Prometheus metrics endpoint (no auth required, for scraping) + // Legacy Prometheus-shaped metrics from internal/monitoring (consumed by + // GoAdmin dashboard). Now lives at /metrics/legacy so the canonical /metrics + // route (registered above) emits proper Prometheus histograms with labels. if deps.MonitoringService != nil { - e.GET("/metrics", prometheusMetrics(deps.MonitoringService)) + e.GET("/metrics/legacy", prometheusMetrics(deps.MonitoringService)) } // Set up admin routes with monitoring handler (if available) diff --git a/internal/services/storage_service.go b/internal/services/storage_service.go index b6f1fc6..ea07ae2 100644 --- a/internal/services/storage_service.go +++ b/internal/services/storage_service.go @@ -13,6 +13,7 @@ import ( "github.com/rs/zerolog/log" "github.com/treytartt/honeydue-api/internal/config" + "github.com/treytartt/honeydue-api/internal/prom" ) // StorageService handles file uploads, validation, encryption, and URL generation. 
@@ -149,11 +150,18 @@ func (s *StorageService) Upload(file *multipart.FileHeader, category string) (*U } } - // Write to backend + // Write to backend (B2/S3 round trip — instrumented for Prometheus) + bucket := s.cfg.S3Bucket + if bucket == "" { + bucket = "local" + } + uploadStart := time.Now() if err := s.backend.Write(key, fileData); err != nil { + prom.ObserveB2Upload(bucket, "error", time.Since(uploadStart), 0) return nil, fmt.Errorf("failed to save file: %w", err) } written := int64(len(fileData)) + prom.ObserveB2Upload(bucket, "ok", time.Since(uploadStart), written) // Generate URL (always uses the original filename without .enc suffix) url := fmt.Sprintf("%s/%s/%s", s.cfg.BaseURL, subdir, newFilename)