Add Prometheus metrics + vmagent push to obs.88oakapps.com
Adds internal/prom package with histograms for HTTP, GORM, B2, APNs, and FCM, wired into the Echo router (HTTPMiddleware + /metrics) and GORM via statement-level callbacks (no ctx plumbing needed). Storage and push clients call ObserveB2Upload / ObserveAPNsSend / ObserveFCMSend at the network round-trip points. Existing internal/monitoring metrics move to /metrics/legacy so the canonical /metrics emits proper histogram buckets for p50/p95/p99 rollups. deploy-k3s/manifests/observability/vmagent.yaml deploys a single-replica vmagent in the honeydue namespace that scrapes api Pods on :8000/metrics every 15s and remote-writes to https://obs.88oakapps.com/api/v1/write with a bearer token (substituted at deploy time from OBS_INGEST_TOKEN in deploy/prod.env). NetworkPolicies allow vmagent egress to api Pods and to the public obs endpoint over :443; the obs side runs VictoriaMetrics + Jaeger + Grafana on 88oakappsUpdate. docs/observability-plan.md captures the full plan including resource budget, instrumentation table, 4-step rollout, and migration triggers. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,187 @@
|
|||||||
|
# vmagent — scrapes Prometheus /metrics from in-cluster services and
|
||||||
|
# remote-writes them to https://obs.88oakapps.com/api/v1/write
|
||||||
|
# (VictoriaMetrics on 88oakappsUpdate, fronted by Cloudflare + nginx
|
||||||
|
# bearer-token auth). Single replica is fine — vmagent buffers locally
|
||||||
|
# during transient remote outages.
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ConfigMap
|
||||||
|
metadata:
|
||||||
|
name: vmagent-config
|
||||||
|
namespace: honeydue
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: vmagent
|
||||||
|
app.kubernetes.io/part-of: honeydue
|
||||||
|
data:
|
||||||
|
scrape.yaml: |
|
||||||
|
global:
|
||||||
|
scrape_interval: 15s
|
||||||
|
external_labels:
|
||||||
|
cluster: honeydue-k3s
|
||||||
|
environment: prod
|
||||||
|
|
||||||
|
scrape_configs:
|
||||||
|
# honeyDue Go API — exposes /metrics on :8000
|
||||||
|
- job_name: api
|
||||||
|
kubernetes_sd_configs:
|
||||||
|
- role: pod
|
||||||
|
namespaces:
|
||||||
|
names: [honeydue]
|
||||||
|
relabel_configs:
|
||||||
|
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
|
||||||
|
action: keep
|
||||||
|
regex: api
|
||||||
|
- source_labels: [__meta_kubernetes_pod_container_port_number]
|
||||||
|
action: keep
|
||||||
|
regex: "8000"
|
||||||
|
- source_labels: [__meta_kubernetes_pod_name]
|
||||||
|
target_label: pod
|
||||||
|
- source_labels: [__meta_kubernetes_pod_node_name]
|
||||||
|
target_label: node
|
||||||
|
- target_label: service
|
||||||
|
replacement: api
|
||||||
|
|
||||||
|
# honeyDue worker — will expose /metrics once that endpoint is added.
# Keep this stanza commented out until then; if it is enabled early,
# the worker scrapes fail silently.
|
||||||
|
# - job_name: worker
|
||||||
|
# kubernetes_sd_configs:
|
||||||
|
# - role: pod
|
||||||
|
# namespaces:
|
||||||
|
# names: [honeydue]
|
||||||
|
# relabel_configs:
|
||||||
|
# - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
|
||||||
|
# action: keep
|
||||||
|
# regex: worker
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Secret
|
||||||
|
metadata:
|
||||||
|
name: vmagent-remote-write
|
||||||
|
namespace: honeydue
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: vmagent
|
||||||
|
app.kubernetes.io/part-of: honeydue
|
||||||
|
type: Opaque
|
||||||
|
stringData:
|
||||||
|
# Bearer token for obs.88oakapps.com. Provisioned at deploy time from
|
||||||
|
# deploy/prod.env (OBS_INGEST_TOKEN). The cluster-side token must match
|
||||||
|
# the token in /etc/honeydue-obs/ingest_token on 88oakappsUpdate.
|
||||||
|
bearer_token: TOKEN_PLACEHOLDER
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: Role
|
||||||
|
metadata:
|
||||||
|
name: vmagent
|
||||||
|
namespace: honeydue
|
||||||
|
rules:
|
||||||
|
- apiGroups: [""]
|
||||||
|
resources: [pods, services, endpoints]
|
||||||
|
verbs: [get, list, watch]
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ServiceAccount
|
||||||
|
metadata:
|
||||||
|
name: vmagent
|
||||||
|
namespace: honeydue
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: RoleBinding
|
||||||
|
metadata:
|
||||||
|
name: vmagent
|
||||||
|
namespace: honeydue
|
||||||
|
subjects:
|
||||||
|
- kind: ServiceAccount
|
||||||
|
name: vmagent
|
||||||
|
namespace: honeydue
|
||||||
|
roleRef:
|
||||||
|
kind: Role
|
||||||
|
name: vmagent
|
||||||
|
apiGroup: rbac.authorization.k8s.io
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: vmagent
|
||||||
|
namespace: honeydue
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: vmagent
|
||||||
|
app.kubernetes.io/part-of: honeydue
|
||||||
|
spec:
|
||||||
|
replicas: 1
|
||||||
|
strategy:
|
||||||
|
type: Recreate
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app.kubernetes.io/name: vmagent
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: vmagent
|
||||||
|
app.kubernetes.io/part-of: honeydue
|
||||||
|
spec:
|
||||||
|
serviceAccountName: vmagent
|
||||||
|
securityContext:
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 1000
|
||||||
|
fsGroup: 1000
|
||||||
|
seccompProfile:
|
||||||
|
type: RuntimeDefault
|
||||||
|
containers:
|
||||||
|
- name: vmagent
|
||||||
|
image: victoriametrics/vmagent:v1.106.1
|
||||||
|
args:
|
||||||
|
- "-promscrape.config=/etc/vmagent/scrape.yaml"
|
||||||
|
- "-remoteWrite.url=https://obs.88oakapps.com/api/v1/write"
|
||||||
|
- "-remoteWrite.bearerTokenFile=/etc/vmagent-secrets/bearer_token"
|
||||||
|
- "-remoteWrite.tmpDataPath=/tmp/vmagent"
|
||||||
|
- "-remoteWrite.maxDiskUsagePerURL=512MB"
|
||||||
|
- "-loggerLevel=INFO"
|
||||||
|
ports:
|
||||||
|
- containerPort: 8429
|
||||||
|
name: http
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: 25m
|
||||||
|
memory: 64Mi
|
||||||
|
limits:
|
||||||
|
cpu: 200m
|
||||||
|
memory: 256Mi
|
||||||
|
volumeMounts:
|
||||||
|
- name: config
|
||||||
|
mountPath: /etc/vmagent
|
||||||
|
readOnly: true
|
||||||
|
- name: secrets
|
||||||
|
mountPath: /etc/vmagent-secrets
|
||||||
|
readOnly: true
|
||||||
|
- name: buffer
|
||||||
|
mountPath: /tmp/vmagent
|
||||||
|
livenessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /-/healthy
|
||||||
|
port: http
|
||||||
|
initialDelaySeconds: 10
|
||||||
|
periodSeconds: 30
|
||||||
|
readinessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /-/healthy
|
||||||
|
port: http
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 10
|
||||||
|
volumes:
|
||||||
|
- name: config
|
||||||
|
configMap:
|
||||||
|
name: vmagent-config
|
||||||
|
- name: secrets
|
||||||
|
secret:
|
||||||
|
secretName: vmagent-remote-write
|
||||||
|
defaultMode: 0400
|
||||||
|
- name: buffer
|
||||||
|
emptyDir:
|
||||||
|
sizeLimit: 512Mi
|
||||||
@@ -0,0 +1,164 @@
|
|||||||
|
# Observability Plan — honeyDue (100% self-hosted)
|
||||||
|
|
||||||
|
**Goal:** Live request-timing visibility (HTTP, DB, B2 uploads, APNs, asynq jobs) without paying any SaaS vendor.
|
||||||
|
|
||||||
|
**Deployment target:** `88oakappsUpdate` (Linode VPS at `185.143.228.16`, Ubuntu 24.04, 8 vCPU / 32 GB RAM / 193 GB disk). This box already runs the self-hosted PostHog stack and has nginx + Let's Encrypt set up for `*.88oakapps.com`. Free RAM at rest ≈ 15 GB; the obs stack budget is ≈ 700 MB → ~5% of free RAM. Costs $0 incremental.
|
||||||
|
|
||||||
|
**Why not in the honeyDue k3s cluster:** Frees ~700 MB across the 3 Hetzner nodes, no PVC plumbing, and no need to expose anything from k3s — everything is push-from-app to a public TLS endpoint.
|
||||||
|
|
||||||
|
**Status:** Step 1 (Prometheus metrics in the API + vmagent remote-write) is implemented; Steps 2–4 (tracing) are still plan only.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Stack
|
||||||
|
|
||||||
|
| Role | Choice | Why this vs. the obvious alternative |
|
||||||
|
|---|---|---|
|
||||||
|
| Metrics store | **VictoriaMetrics** (single-node) | Drop-in Prometheus-compatible. ~2.5× lower RAM (~200 MB vs ~500 MB) and ~7× better compression. Single binary. |
|
||||||
|
| Tracing | **Jaeger all-in-one** | ~150 MB RAM with embedded badger storage. Tempo monolithic mode needs 1-2 GB minimum — overkill for honeyDue's scale. |
|
||||||
|
| Dashboards | **Grafana OSS** | Connects to both VM (Prometheus protocol) and Jaeger natively. |
|
||||||
|
| App instrumentation | **OpenTelemetry SDK** + `prometheus/client_golang` | OTel is vendor-neutral — backends are swappable without code change. |
|
||||||
|
| Logs | **Keep Dozzle**; add Loki only when log search becomes painful | Loki adds ~512 MB RAM + a daemonset for log shipping. Not worth it until there's a concrete pain point. |
|
||||||
|
|
||||||
|
### Why not the LGTM stack (Loki + Grafana + Tempo + Mimir)?
|
||||||
|
|
||||||
|
- **Tempo** wants 1-2 GB RAM minimum in monolithic mode ([Grafana community report](https://community.grafana.com/t/tempo-ram-usage-for-6k-spans-per-hour/63801)). Stacking that on top of Loki + Mimir would consume ~3-4 GB RAM. On a 3×8 GB cluster that's 12-17% of capacity for observability infra.
|
||||||
|
- **Mimir** is wonderful for multi-tenant Prometheus at scale — you have one tenant.
|
||||||
|
- **Loki** is great if you live in `kubectl logs` and need full-text search across them. You currently use Dozzle and are not feeling that pain.
|
||||||
|
|
||||||
|
VictoriaMetrics + Jaeger all-in-one gives you 90% of the value at 25% of the resource cost.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Resource budget on `88oakappsUpdate`
|
||||||
|
|
||||||
|
Three Docker containers in a separate compose project under `/opt/honeydue-obs/` — fully isolated from the existing PostHog compose stack so PostHog's lifecycle never touches the obs stack and vice versa.
|
||||||
|
|
||||||
|
| Service | `mem_limit` | Disk (bind mount) | Retention |
|
||||||
|
|---|---|---|---|
|
||||||
|
| VictoriaMetrics single-node | 256 MB | 10 GB | 30 days metrics |
|
||||||
|
| Jaeger all-in-one (badger storage) | 256 MB | 10 GB | 7 days traces |
|
||||||
|
| Grafana OSS | 256 MB | 1 GB | — |
|
||||||
|
| **Total** | **~768 MB hard cap** | **21 GB** | |
|
||||||
|
|
||||||
|
**~5% of the box's free RAM and ~14% of free disk.** The hard `mem_limit` per container matters: ClickHouse on the same VM can spike under PostHog analytics load, so bounding the obs stack prevents it from competing in a memory pinch.
|
||||||
|
|
||||||
|
**Don't reuse PostHog's ClickHouse / Kafka / Redis.** Tempting because they're sitting right there, but coupling honeyDue's observability to PostHog's storage means a PostHog incident takes honeyDue's incident-response telemetry down with it. Keep them fully separate.
|
||||||
|
|
||||||
|
**Shared blast radius caveat:** A kernel panic on `88oakappsUpdate` loses both PostHog and honeyDue obs at once. At current scale, fine — call it out, don't fix.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## App-side instrumentation
|
||||||
|
|
||||||
|
| Surface | Library / approach | Import path |
|
||||||
|
|---|---|---|
|
||||||
|
| Echo HTTP middleware | `otelecho` — span per request, tagged route/method/status | `go.opentelemetry.io/contrib/instrumentation/github.com/labstack/echo/otelecho` |
|
||||||
|
| GORM queries | `uptrace/otelgorm` plugin — `db.Use(otelgorm.NewPlugin())`. Requires threading `ctx` through repositories so `db.WithContext(ctx)` works. | `github.com/uptrace/opentelemetry-go-extra/otelgorm` |
|
||||||
|
| B2 / minio-go uploads | Manual span around `storage_service.Upload` with attributes for bucket, object size, MIME type | `go.opentelemetry.io/otel` |
|
||||||
|
| APNs / FCM | Manual span in `internal/push/apns.go` and `fcm.go`; record a hash of the device token (never the raw token) and the response status code | `go.opentelemetry.io/otel` |
|
||||||
|
| asynq jobs | Custom `asynq.MiddlewareFunc` (~20 lines) — span per task type, attached to ctx, records duration + retry count | `go.opentelemetry.io/otel` + `asynq.MiddlewareFunc` |
|
||||||
|
| Prometheus `/metrics` endpoint | `prometheus/client_golang` direct — register histograms for HTTP duration / GORM op / B2 op / APNs send | `github.com/prometheus/client_golang/prometheus`, `.../prometheus/promhttp` |
|
||||||
|
| OTLP exporter | OTLP/HTTP → `https://obs.88oakapps.com/v1/traces` with bearer token. 100% sample in dev, 10% in prod. | `go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp` |
|
||||||
|
| Metrics push | Single-replica `vmagent` Deployment in k3s scrapes the api Pods' `/metrics` and remote-writes to `https://obs.88oakapps.com/api/v1/write` with bearer token. Cleaner than exposing `/metrics` publicly. | `victoriametrics/vmagent` image |
|
||||||
|
|
||||||
|
**Note on GORM context propagation:** the existing repository methods don't take `ctx context.Context`. Adding `otelgorm` requires plumbing ctx down from the Echo handler through the service layer to the repository call site. ~10 repository files, many call sites. Save for last because the diff is large.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Implementation order (smallest first)
|
||||||
|
|
||||||
|
### Step 1 — Metrics + dashboards (highest immediate ROI)
|
||||||
|
|
||||||
|
**On `88oakappsUpdate`:**
|
||||||
|
1. `mkdir -p /opt/honeydue-obs/{data/vm,data/jaeger,data/grafana}` and a `docker-compose.yml` defining the three services with `mem_limit: 256m`, bind mounts for persistence, and an isolated bridge network
|
||||||
|
2. Add nginx vhosts (DNS A records first):
|
||||||
|
- `grafana.88oakapps.com` → `127.0.0.1:3000` (basic auth via htpasswd, Let's Encrypt)
|
||||||
|
- `obs.88oakapps.com` → routes by path:
|
||||||
|
- `/api/v1/write` → `127.0.0.1:8428` (VictoriaMetrics remote-write, bearer-token check)
|
||||||
|
- `/v1/traces` → `127.0.0.1:4318` (OTLP/HTTP traces, bearer-token check)
|
||||||
|
3. Generate a 32-byte token, store it in `/etc/honeydue-obs/ingest_token` (mode 0600), and reference it from nginx via `auth_request` or a simple `if ($http_authorization != ...)` check
|
||||||
|
4. Pre-provision Grafana with the VM datasource pointing at `http://victoriametrics:8428` (in-network)
|
||||||
|
|
||||||
|
**On the honeyDue k3s cluster:**
|
||||||
|
5. Add `prometheus/client_golang` to `honeyDueAPI-go/go.mod` and a `/metrics` endpoint to the Go API
|
||||||
|
6. Register histograms:
|
||||||
|
- `http_request_duration_seconds{route,method,status}` via Echo middleware
|
||||||
|
- `gorm_query_duration_seconds{table,operation}` via a GORM `Plugin` callback (no ctx needed for this one — operates at the SQL string level)
|
||||||
|
- `b2_upload_duration_seconds{bucket,result}`
|
||||||
|
- `apns_send_duration_seconds{result}`
|
||||||
|
7. Deploy a single-replica `vmagent` Deployment in the `honeydue` namespace with:
|
||||||
|
- Scrape: api Pods' `/metrics` on `:8000` every 15s (discovered via Kubernetes pod service discovery)
|
||||||
|
- `remote_write.url`: `https://obs.88oakapps.com/api/v1/write`
|
||||||
|
- `remote_write.bearer_token`: from k8s Secret
|
||||||
|
8. Build the RED dashboard in Grafana: rate, errors, duration p50/p95/p99 per route
|
||||||
|
|
||||||
|
**ROI:** "Is the API healthy? Where is time being spent right now?" answered live, served from `grafana.88oakapps.com`.
|
||||||
|
|
||||||
|
### Step 2 — Tracing baseline
|
||||||
|
|
||||||
|
(Jaeger is already up from Step 1. This step adds the app-side wiring.)
|
||||||
|
|
||||||
|
1. Add Grafana datasource for Jaeger pointing at `http://jaeger:16686` (in-network)
|
||||||
|
2. Wire OTel SDK in `cmd/api/main.go`:
|
||||||
|
- `otel.SetTracerProvider(tracerProvider)`
|
||||||
|
- `otelecho.Middleware("honeydue-api")` on Echo
|
||||||
|
- OTLP/HTTP exporter pointing at `https://obs.88oakapps.com/v1/traces` with `Authorization: Bearer <token>` header (token from env)
|
||||||
|
- Sampling: `TraceIDRatioBased(0.1)` in prod, `AlwaysSample()` in dev
|
||||||
|
3. Verify: a single `POST /api/auth/login/` produces a trace in Jaeger
|
||||||
|
|
||||||
|
**ROI:** "Why is this one request slow?" — answered with a flame graph.
|
||||||
|
|
||||||
|
### Step 3 — Manual spans for the work that actually matters
|
||||||
|
|
||||||
|
Wrap each in `tracer.Start(ctx, ...)` with attributes:
|
||||||
|
- `storage_service.Upload` → span "b2.PutObject" with `bucket`, `key`, `size_bytes`, result
|
||||||
|
- `push/apns.go` → span "apns.send" with `device_token_hash`, `status_code`, `reason`
|
||||||
|
- `asynq` middleware → span per task type with `task.type`, `retry_count`, `payload_size`
|
||||||
|
|
||||||
|
**ROI:** Specific high-value debugging questions ("why did this upload take 30 seconds", "why did these 5 push notifications fail") answered without code archaeology.
|
||||||
|
|
||||||
|
### Step 4 — Repository ctx + `otelgorm` (biggest diff, save for last)
|
||||||
|
|
||||||
|
1. Refactor every repository method to accept `ctx context.Context` as first arg
|
||||||
|
2. Update every call site to pass `c.Request().Context()` from handlers / propagate through services
|
||||||
|
3. Add `db.Use(otelgorm.NewPlugin())` in `internal/database/database.go`
|
||||||
|
4. Verify: a request now has nested spans `http → service → query → query → b2.PutObject → apns.send` with full SQL on the query spans
|
||||||
|
|
||||||
|
**ROI:** Every DB query in every trace, with SQL + table + rows. The "find the N+1" tool you'd otherwise build by hand.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Hard skips (revisit only when explicitly proven needed)
|
||||||
|
|
||||||
|
| Tool | Why skip |
|
||||||
|
|---|---|
|
||||||
|
| Loki / Promtail | Dozzle covers the immediate need. Loki adds 512 Mi RAM + a daemonset; defer until log search becomes a hot pain point. |
|
||||||
|
| Mimir / VM cluster mode | Single-node VM handles honeyDue scale for years. |
|
||||||
|
| Pyroscope continuous profiling | Overkill at 3 small nodes. Use `pprof` endpoints ad-hoc when CPU pressure shows up. |
|
||||||
|
| OTel Collector | Only worth running when 3+ services emit telemetry. App → Jaeger direct is fine for now. |
|
||||||
|
| Any SaaS vendor (Datadog, NR, Honeycomb, Grafana Cloud, Sentry Performance) | User constraint: nothing paid. |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## When to move off `88oakappsUpdate`
|
||||||
|
|
||||||
|
Triggers — any one is enough:
|
||||||
|
- `88oakappsUpdate` available memory drops below ~3 GB sustained (PostHog growth squeezing it)
|
||||||
|
- ClickHouse OOM events start showing up in `dmesg` (PostHog under load)
|
||||||
|
- You want fully separate failure domains for honeyDue vs. 88oakapps
|
||||||
|
|
||||||
|
Migration path: the obs stack is a single docker-compose project on a bind-mount, so moving it = `rsync /opt/honeydue-obs/` to a new box, update DNS for `grafana.88oakapps.com` and `obs.88oakapps.com`, `docker compose up -d`. ~30 min of work. Until then: cohabiting on `88oakappsUpdate` is correct.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Quick reference: what shows up where
|
||||||
|
|
||||||
|
| Question | Where to look |
|
||||||
|
|---|---|
|
||||||
|
| Is the API up right now? Latency? Errors? | Grafana RED dashboard |
|
||||||
|
| Why is this specific request slow? | Jaeger trace view |
|
||||||
|
| What did the slow part of that request actually do (which SQL, which B2 PUT)? | Span details inside the trace |
|
||||||
|
| Background job throughput / queue depth | VictoriaMetrics + asynq metrics |
|
||||||
|
| What did the app print to stdout 5 minutes ago? | Dozzle |
|
||||||
|
| What error did the app log? | Dozzle (search) — or Loki if/when added |
|
||||||
@@ -33,6 +33,7 @@ require (
|
|||||||
)
|
)
|
||||||
|
|
||||||
require (
|
require (
|
||||||
|
github.com/beorn7/perks v1.0.1 // indirect
|
||||||
github.com/dustin/go-humanize v1.0.1 // indirect
|
github.com/dustin/go-humanize v1.0.1 // indirect
|
||||||
github.com/go-ini/ini v1.67.0 // indirect
|
github.com/go-ini/ini v1.67.0 // indirect
|
||||||
github.com/klauspost/compress v1.18.2 // indirect
|
github.com/klauspost/compress v1.18.2 // indirect
|
||||||
@@ -40,9 +41,15 @@ require (
|
|||||||
github.com/klauspost/crc32 v1.3.0 // indirect
|
github.com/klauspost/crc32 v1.3.0 // indirect
|
||||||
github.com/minio/crc64nvme v1.1.1 // indirect
|
github.com/minio/crc64nvme v1.1.1 // indirect
|
||||||
github.com/minio/md5-simd v1.1.2 // indirect
|
github.com/minio/md5-simd v1.1.2 // indirect
|
||||||
|
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
|
||||||
github.com/philhofer/fwd v1.2.0 // indirect
|
github.com/philhofer/fwd v1.2.0 // indirect
|
||||||
|
github.com/prometheus/client_golang v1.23.2
|
||||||
|
github.com/prometheus/client_model v0.6.2 // indirect
|
||||||
|
github.com/prometheus/common v0.66.1 // indirect
|
||||||
|
github.com/prometheus/procfs v0.16.1 // indirect
|
||||||
github.com/rs/xid v1.6.0 // indirect
|
github.com/rs/xid v1.6.0 // indirect
|
||||||
github.com/tinylib/msgp v1.6.1 // indirect
|
github.com/tinylib/msgp v1.6.1 // indirect
|
||||||
|
go.yaml.in/yaml/v2 v2.4.2 // indirect
|
||||||
go.yaml.in/yaml/v3 v3.0.4 // indirect
|
go.yaml.in/yaml/v3 v3.0.4 // indirect
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -8,6 +8,8 @@ github.com/BurntSushi/toml v1.5.0 h1:W5quZX/G/csjUnuI8SUYlsHs9M38FC7znL0lIO+DvMg
|
|||||||
github.com/BurntSushi/toml v1.5.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho=
|
github.com/BurntSushi/toml v1.5.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho=
|
||||||
github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
|
github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
|
||||||
github.com/alecthomas/units v0.0.0-20201120081800-1786d5ef83d4/go.mod h1:OMCwj8VM1Kc9e19TLln2VL61YJF0x1XFtfdL4JdbSyE=
|
github.com/alecthomas/units v0.0.0-20201120081800-1786d5ef83d4/go.mod h1:OMCwj8VM1Kc9e19TLln2VL61YJF0x1XFtfdL4JdbSyE=
|
||||||
|
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
|
||||||
|
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
|
||||||
github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs=
|
github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs=
|
||||||
github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c=
|
github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c=
|
||||||
github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA=
|
github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA=
|
||||||
@@ -121,6 +123,8 @@ github.com/minio/md5-simd v1.1.2 h1:Gdi1DZK69+ZVMoNHRXJyNcxrMA4dSxoYHZSQbirFg34=
|
|||||||
github.com/minio/md5-simd v1.1.2/go.mod h1:MzdKDxYpY2BT9XQFocsiZf/NKVtR7nkE4RoEpN+20RM=
|
github.com/minio/md5-simd v1.1.2/go.mod h1:MzdKDxYpY2BT9XQFocsiZf/NKVtR7nkE4RoEpN+20RM=
|
||||||
github.com/minio/minio-go/v7 v7.0.99 h1:2vH/byrwUkIpFQFOilvTfaUpvAX3fEFhEzO+DR3DlCE=
|
github.com/minio/minio-go/v7 v7.0.99 h1:2vH/byrwUkIpFQFOilvTfaUpvAX3fEFhEzO+DR3DlCE=
|
||||||
github.com/minio/minio-go/v7 v7.0.99/go.mod h1:EtGNKtlX20iL2yaYnxEigaIvj0G0GwSDnifnG8ClIdw=
|
github.com/minio/minio-go/v7 v7.0.99/go.mod h1:EtGNKtlX20iL2yaYnxEigaIvj0G0GwSDnifnG8ClIdw=
|
||||||
|
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
|
||||||
|
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
|
||||||
github.com/nicksnyder/go-i18n/v2 v2.6.0 h1:C/m2NNWNiTB6SK4Ao8df5EWm3JETSTIGNXBpMJTxzxQ=
|
github.com/nicksnyder/go-i18n/v2 v2.6.0 h1:C/m2NNWNiTB6SK4Ao8df5EWm3JETSTIGNXBpMJTxzxQ=
|
||||||
github.com/nicksnyder/go-i18n/v2 v2.6.0/go.mod h1:88sRqr0C6OPyJn0/KRNaEz1uWorjxIKP7rUUcvycecE=
|
github.com/nicksnyder/go-i18n/v2 v2.6.0/go.mod h1:88sRqr0C6OPyJn0/KRNaEz1uWorjxIKP7rUUcvycecE=
|
||||||
github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4=
|
github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4=
|
||||||
@@ -132,6 +136,14 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
|
|||||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||||
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c h1:ncq/mPwQF4JjgDlrVEn3C11VoGHZN7m8qihwgMEtzYw=
|
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c h1:ncq/mPwQF4JjgDlrVEn3C11VoGHZN7m8qihwgMEtzYw=
|
||||||
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE=
|
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE=
|
||||||
|
github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o=
|
||||||
|
github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg=
|
||||||
|
github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
|
||||||
|
github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
|
||||||
|
github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs=
|
||||||
|
github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA=
|
||||||
|
github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg=
|
||||||
|
github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is=
|
||||||
github.com/redis/go-redis/v9 v9.17.1 h1:7tl732FjYPRT9H9aNfyTwKg9iTETjWjGKEJ2t/5iWTs=
|
github.com/redis/go-redis/v9 v9.17.1 h1:7tl732FjYPRT9H9aNfyTwKg9iTETjWjGKEJ2t/5iWTs=
|
||||||
github.com/redis/go-redis/v9 v9.17.1/go.mod h1:u410H11HMLoB+TP67dz8rL9s6QW2j76l0//kSOd3370=
|
github.com/redis/go-redis/v9 v9.17.1/go.mod h1:u410H11HMLoB+TP67dz8rL9s6QW2j76l0//kSOd3370=
|
||||||
github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs=
|
github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs=
|
||||||
@@ -204,6 +216,8 @@ go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJr
|
|||||||
go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs=
|
go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs=
|
||||||
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
|
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
|
||||||
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
|
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
|
||||||
|
go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI=
|
||||||
|
go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU=
|
||||||
go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=
|
go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=
|
||||||
go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg=
|
go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg=
|
||||||
golang.org/x/crypto v0.0.0-20170512130425-ab89591268e0/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
|
golang.org/x/crypto v0.0.0-20170512130425-ab89591268e0/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ import (
|
|||||||
|
|
||||||
"github.com/treytartt/honeydue-api/internal/config"
|
"github.com/treytartt/honeydue-api/internal/config"
|
||||||
"github.com/treytartt/honeydue-api/internal/models"
|
"github.com/treytartt/honeydue-api/internal/models"
|
||||||
|
"github.com/treytartt/honeydue-api/internal/prom"
|
||||||
)
|
)
|
||||||
|
|
||||||
// migrationAdvisoryLockKey is the pg_advisory_lock key that serializes
|
// migrationAdvisoryLockKey is the pg_advisory_lock key that serializes
|
||||||
@@ -84,6 +85,13 @@ func Connect(cfg *config.DatabaseConfig, debug bool) (*gorm.DB, error) {
|
|||||||
Str("database", cfg.Database).
|
Str("database", cfg.Database).
|
||||||
Msg("Connected to PostgreSQL database")
|
Msg("Connected to PostgreSQL database")
|
||||||
|
|
||||||
|
// Register Prometheus GORM callbacks — emits gorm_query_duration_seconds
|
||||||
|
// for every SQL operation. Operates at the statement level, so does not
|
||||||
|
// require ctx to be threaded through repositories.
|
||||||
|
if err := prom.RegisterGORMCallbacks(db); err != nil {
|
||||||
|
log.Warn().Err(err).Msg("failed to register prometheus GORM callbacks; metrics will be partial")
|
||||||
|
}
|
||||||
|
|
||||||
return db, nil
|
return db, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,199 @@
|
|||||||
|
package prom
|
||||||
|
|
||||||
|
import (
	"errors"
	"net/http"
	"strconv"
	"time"

	"github.com/labstack/echo/v4"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/collectors"
	"github.com/prometheus/client_golang/prometheus/promhttp"
	"gorm.io/gorm"
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
Registry = prometheus.NewRegistry()
|
||||||
|
|
||||||
|
httpRequestDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
||||||
|
Name: "http_request_duration_seconds",
|
||||||
|
Help: "Duration of HTTP requests in seconds.",
|
||||||
|
Buckets: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10},
|
||||||
|
}, []string{"route", "method", "status"})
|
||||||
|
|
||||||
|
gormQueryDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
||||||
|
Name: "gorm_query_duration_seconds",
|
||||||
|
Help: "Duration of GORM database queries in seconds.",
|
||||||
|
Buckets: []float64{0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5},
|
||||||
|
}, []string{"table", "operation"})
|
||||||
|
|
||||||
|
b2UploadDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
||||||
|
Name: "b2_upload_duration_seconds",
|
||||||
|
Help: "Duration of B2/S3 upload operations in seconds.",
|
||||||
|
Buckets: []float64{0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60},
|
||||||
|
}, []string{"bucket", "result"})
|
||||||
|
|
||||||
|
b2UploadBytes = prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||||
|
Name: "b2_upload_bytes_total",
|
||||||
|
Help: "Total bytes uploaded to B2/S3.",
|
||||||
|
}, []string{"bucket", "result"})
|
||||||
|
|
||||||
|
apnsSendDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
||||||
|
Name: "apns_send_duration_seconds",
|
||||||
|
Help: "Duration of APNs push notification sends in seconds.",
|
||||||
|
Buckets: []float64{0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5},
|
||||||
|
}, []string{"result"})
|
||||||
|
|
||||||
|
fcmSendDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
||||||
|
Name: "fcm_send_duration_seconds",
|
||||||
|
Help: "Duration of FCM push notification sends in seconds.",
|
||||||
|
Buckets: []float64{0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5},
|
||||||
|
}, []string{"result"})
|
||||||
|
|
||||||
|
asynqJobDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
||||||
|
Name: "asynq_job_duration_seconds",
|
||||||
|
Help: "Duration of asynq background job execution in seconds.",
|
||||||
|
Buckets: []float64{0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30, 60, 300},
|
||||||
|
}, []string{"task_type", "result"})
|
||||||
|
)
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
Registry.MustRegister(
|
||||||
|
collectors.NewGoCollector(),
|
||||||
|
collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}),
|
||||||
|
httpRequestDuration,
|
||||||
|
gormQueryDuration,
|
||||||
|
b2UploadDuration,
|
||||||
|
b2UploadBytes,
|
||||||
|
apnsSendDuration,
|
||||||
|
fcmSendDuration,
|
||||||
|
asynqJobDuration,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Handler returns a promhttp Handler bound to the package Registry, suitable for
|
||||||
|
// mounting at GET /metrics on Echo.
|
||||||
|
func Handler() echo.HandlerFunc {
|
||||||
|
h := promhttp.HandlerFor(Registry, promhttp.HandlerOpts{Registry: Registry})
|
||||||
|
return echo.WrapHandler(h)
|
||||||
|
}
|
||||||
|
|
||||||
|
// HTTPMiddleware records http_request_duration_seconds for every request,
|
||||||
|
// labeled by Echo route pattern, method, and status code.
|
||||||
|
func HTTPMiddleware() echo.MiddlewareFunc {
|
||||||
|
return func(next echo.HandlerFunc) echo.HandlerFunc {
|
||||||
|
return func(c echo.Context) error {
|
||||||
|
start := time.Now()
|
||||||
|
err := next(c)
|
||||||
|
route := c.Path()
|
||||||
|
if route == "" {
|
||||||
|
route = "unknown"
|
||||||
|
}
|
||||||
|
httpRequestDuration.WithLabelValues(
|
||||||
|
route,
|
||||||
|
c.Request().Method,
|
||||||
|
strconv.Itoa(c.Response().Status),
|
||||||
|
).Observe(time.Since(start).Seconds())
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// RegisterGORMCallbacks attaches before/after callbacks on a *gorm.DB so every
|
||||||
|
// SQL operation records gorm_query_duration_seconds{table,operation}.
|
||||||
|
//
|
||||||
|
// Operates at the SQL/statement level — does NOT require ctx to be threaded
|
||||||
|
// through repositories (that comes later when otelgorm lands).
|
||||||
|
func RegisterGORMCallbacks(db *gorm.DB) error {
|
||||||
|
const startKey = "honeydue:prom_start"
|
||||||
|
|
||||||
|
registerBefore := func(name string) error {
|
||||||
|
cb := db.Callback().Create().Before("gorm:create")
|
||||||
|
switch name {
|
||||||
|
case "create":
|
||||||
|
cb = db.Callback().Create().Before("gorm:create")
|
||||||
|
case "query":
|
||||||
|
cb = db.Callback().Query().Before("gorm:query")
|
||||||
|
case "update":
|
||||||
|
cb = db.Callback().Update().Before("gorm:update")
|
||||||
|
case "delete":
|
||||||
|
cb = db.Callback().Delete().Before("gorm:delete")
|
||||||
|
case "row":
|
||||||
|
cb = db.Callback().Row().Before("gorm:row")
|
||||||
|
case "raw":
|
||||||
|
cb = db.Callback().Raw().Before("gorm:raw")
|
||||||
|
}
|
||||||
|
return cb.Register("prom:before_"+name, func(tx *gorm.DB) {
|
||||||
|
tx.InstanceSet(startKey, time.Now())
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
registerAfter := func(name string) error {
|
||||||
|
cb := db.Callback().Create().After("gorm:create")
|
||||||
|
switch name {
|
||||||
|
case "create":
|
||||||
|
cb = db.Callback().Create().After("gorm:create")
|
||||||
|
case "query":
|
||||||
|
cb = db.Callback().Query().After("gorm:query")
|
||||||
|
case "update":
|
||||||
|
cb = db.Callback().Update().After("gorm:update")
|
||||||
|
case "delete":
|
||||||
|
cb = db.Callback().Delete().After("gorm:delete")
|
||||||
|
case "row":
|
||||||
|
cb = db.Callback().Row().After("gorm:row")
|
||||||
|
case "raw":
|
||||||
|
cb = db.Callback().Raw().After("gorm:raw")
|
||||||
|
}
|
||||||
|
return cb.Register("prom:after_"+name, func(tx *gorm.DB) {
|
||||||
|
startVal, ok := tx.InstanceGet(startKey)
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
start, ok := startVal.(time.Time)
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
table := tx.Statement.Table
|
||||||
|
if table == "" {
|
||||||
|
table = "unknown"
|
||||||
|
}
|
||||||
|
gormQueryDuration.WithLabelValues(table, name).Observe(time.Since(start).Seconds())
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, name := range []string{"create", "query", "update", "delete", "row", "raw"} {
|
||||||
|
if err := registerBefore(name); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := registerAfter(name); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// ObserveB2Upload records duration + bytes for a B2/S3 upload. result is "ok"
|
||||||
|
// or "error".
|
||||||
|
func ObserveB2Upload(bucket, result string, dur time.Duration, bytes int64) {
|
||||||
|
b2UploadDuration.WithLabelValues(bucket, result).Observe(dur.Seconds())
|
||||||
|
if bytes > 0 {
|
||||||
|
b2UploadBytes.WithLabelValues(bucket, result).Add(float64(bytes))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ObserveAPNsSend records duration of a single APNs send. result is "ok",
|
||||||
|
// "bad_token", or "error".
|
||||||
|
func ObserveAPNsSend(result string, dur time.Duration) {
|
||||||
|
apnsSendDuration.WithLabelValues(result).Observe(dur.Seconds())
|
||||||
|
}
|
||||||
|
|
||||||
|
// ObserveFCMSend records duration of a single FCM send. result is "ok",
|
||||||
|
// "bad_token", or "error".
|
||||||
|
func ObserveFCMSend(result string, dur time.Duration) {
|
||||||
|
fcmSendDuration.WithLabelValues(result).Observe(dur.Seconds())
|
||||||
|
}
|
||||||
|
|
||||||
|
// ObserveAsynqJob records duration of a single asynq job execution. result is
|
||||||
|
// "ok", "retry", or "error".
|
||||||
|
func ObserveAsynqJob(taskType, result string, dur time.Duration) {
|
||||||
|
asynqJobDuration.WithLabelValues(taskType, result).Observe(dur.Seconds())
|
||||||
|
}
|
||||||
@@ -3,6 +3,7 @@ package push
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/rs/zerolog/log"
|
"github.com/rs/zerolog/log"
|
||||||
"github.com/sideshow/apns2"
|
"github.com/sideshow/apns2"
|
||||||
@@ -10,6 +11,7 @@ import (
|
|||||||
"github.com/sideshow/apns2/token"
|
"github.com/sideshow/apns2/token"
|
||||||
|
|
||||||
"github.com/treytartt/honeydue-api/internal/config"
|
"github.com/treytartt/honeydue-api/internal/config"
|
||||||
|
"github.com/treytartt/honeydue-api/internal/prom"
|
||||||
)
|
)
|
||||||
|
|
||||||
// APNsClient handles direct communication with Apple Push Notification service
|
// APNsClient handles direct communication with Apple Push Notification service
|
||||||
@@ -84,8 +86,10 @@ func (c *APNsClient) Send(ctx context.Context, tokens []string, title, message s
|
|||||||
Priority: apns2.PriorityHigh,
|
Priority: apns2.PriorityHigh,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
sendStart := time.Now()
|
||||||
res, err := c.client.PushWithContext(ctx, notification)
|
res, err := c.client.PushWithContext(ctx, notification)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
prom.ObserveAPNsSend("error", time.Since(sendStart))
|
||||||
log.Error().
|
log.Error().
|
||||||
Err(err).
|
Err(err).
|
||||||
Str("token", truncateToken(deviceToken)).
|
Str("token", truncateToken(deviceToken)).
|
||||||
@@ -95,6 +99,7 @@ func (c *APNsClient) Send(ctx context.Context, tokens []string, title, message s
|
|||||||
}
|
}
|
||||||
|
|
||||||
if !res.Sent() {
|
if !res.Sent() {
|
||||||
|
prom.ObserveAPNsSend("bad_token", time.Since(sendStart))
|
||||||
log.Error().
|
log.Error().
|
||||||
Str("token", truncateToken(deviceToken)).
|
Str("token", truncateToken(deviceToken)).
|
||||||
Str("reason", res.Reason).
|
Str("reason", res.Reason).
|
||||||
@@ -104,6 +109,7 @@ func (c *APNsClient) Send(ctx context.Context, tokens []string, title, message s
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
prom.ObserveAPNsSend("ok", time.Since(sendStart))
|
||||||
successCount++
|
successCount++
|
||||||
log.Debug().
|
log.Debug().
|
||||||
Str("token", truncateToken(deviceToken)).
|
Str("token", truncateToken(deviceToken)).
|
||||||
@@ -154,8 +160,10 @@ func (c *APNsClient) SendWithCategory(ctx context.Context, tokens []string, titl
|
|||||||
Priority: apns2.PriorityHigh,
|
Priority: apns2.PriorityHigh,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
sendStart := time.Now()
|
||||||
res, err := c.client.PushWithContext(ctx, notification)
|
res, err := c.client.PushWithContext(ctx, notification)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
prom.ObserveAPNsSend("error", time.Since(sendStart))
|
||||||
log.Error().
|
log.Error().
|
||||||
Err(err).
|
Err(err).
|
||||||
Str("token", truncateToken(deviceToken)).
|
Str("token", truncateToken(deviceToken)).
|
||||||
@@ -166,6 +174,7 @@ func (c *APNsClient) SendWithCategory(ctx context.Context, tokens []string, titl
|
|||||||
}
|
}
|
||||||
|
|
||||||
if !res.Sent() {
|
if !res.Sent() {
|
||||||
|
prom.ObserveAPNsSend("bad_token", time.Since(sendStart))
|
||||||
log.Error().
|
log.Error().
|
||||||
Str("token", truncateToken(deviceToken)).
|
Str("token", truncateToken(deviceToken)).
|
||||||
Str("reason", res.Reason).
|
Str("reason", res.Reason).
|
||||||
@@ -176,6 +185,7 @@ func (c *APNsClient) SendWithCategory(ctx context.Context, tokens []string, titl
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
prom.ObserveAPNsSend("ok", time.Since(sendStart))
|
||||||
successCount++
|
successCount++
|
||||||
log.Debug().
|
log.Debug().
|
||||||
Str("token", truncateToken(deviceToken)).
|
Str("token", truncateToken(deviceToken)).
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import (
|
|||||||
"bytes"
|
"bytes"
|
||||||
"context"
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"net/http"
|
"net/http"
|
||||||
@@ -14,6 +15,7 @@ import (
|
|||||||
"golang.org/x/oauth2/google"
|
"golang.org/x/oauth2/google"
|
||||||
|
|
||||||
"github.com/treytartt/honeydue-api/internal/config"
|
"github.com/treytartt/honeydue-api/internal/config"
|
||||||
|
"github.com/treytartt/honeydue-api/internal/prom"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
@@ -213,8 +215,15 @@ func (c *FCMClient) Send(ctx context.Context, tokens []string, title, message st
|
|||||||
successCount := 0
|
successCount := 0
|
||||||
|
|
||||||
for _, token := range tokens {
|
for _, token := range tokens {
|
||||||
|
sendStart := time.Now()
|
||||||
err := c.sendOne(ctx, token, title, message, data)
|
err := c.sendOne(ctx, token, title, message, data)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
result := "error"
|
||||||
|
var fcmErr *FCMSendError
|
||||||
|
if errors.As(err, &fcmErr) && fcmErr.IsUnregistered() {
|
||||||
|
result = "bad_token"
|
||||||
|
}
|
||||||
|
prom.ObserveFCMSend(result, time.Since(sendStart))
|
||||||
log.Error().
|
log.Error().
|
||||||
Err(err).
|
Err(err).
|
||||||
Str("token", truncateToken(token)).
|
Str("token", truncateToken(token)).
|
||||||
@@ -223,6 +232,7 @@ func (c *FCMClient) Send(ctx context.Context, tokens []string, title, message st
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
prom.ObserveFCMSend("ok", time.Since(sendStart))
|
||||||
successCount++
|
successCount++
|
||||||
log.Debug().
|
log.Debug().
|
||||||
Str("token", truncateToken(token)).
|
Str("token", truncateToken(token)).
|
||||||
|
|||||||
@@ -23,6 +23,7 @@ import (
|
|||||||
"github.com/treytartt/honeydue-api/internal/i18n"
|
"github.com/treytartt/honeydue-api/internal/i18n"
|
||||||
custommiddleware "github.com/treytartt/honeydue-api/internal/middleware"
|
custommiddleware "github.com/treytartt/honeydue-api/internal/middleware"
|
||||||
"github.com/treytartt/honeydue-api/internal/monitoring"
|
"github.com/treytartt/honeydue-api/internal/monitoring"
|
||||||
|
"github.com/treytartt/honeydue-api/internal/prom"
|
||||||
"github.com/treytartt/honeydue-api/internal/push"
|
"github.com/treytartt/honeydue-api/internal/push"
|
||||||
"github.com/treytartt/honeydue-api/internal/repositories"
|
"github.com/treytartt/honeydue-api/internal/repositories"
|
||||||
"github.com/treytartt/honeydue-api/internal/services"
|
"github.com/treytartt/honeydue-api/internal/services"
|
||||||
@@ -121,6 +122,15 @@ func SetupRouter(deps *Dependencies) *echo.Echo {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Prometheus metrics middleware — feeds VictoriaMetrics on
|
||||||
|
// obs.88oakapps.com via vmagent. Records http_request_duration_seconds
|
||||||
|
// labeled by route pattern, method, and status code.
|
||||||
|
e.Use(prom.HTTPMiddleware())
|
||||||
|
|
||||||
|
// /metrics endpoint exposed for vmagent scrape. No auth — bound to
|
||||||
|
// the cluster network only; not exposed via Cloudflare.
|
||||||
|
e.GET("/metrics", prom.Handler())
|
||||||
|
|
||||||
// Serve landing page static files (if static directory is configured)
|
// Serve landing page static files (if static directory is configured)
|
||||||
staticDir := cfg.Server.StaticDir
|
staticDir := cfg.Server.StaticDir
|
||||||
if staticDir != "" {
|
if staticDir != "" {
|
||||||
@@ -229,9 +239,11 @@ func SetupRouter(deps *Dependencies) *echo.Echo {
|
|||||||
mediaHandler = handlers.NewMediaHandler(documentRepo, taskRepo, residenceRepo, deps.StorageService)
|
mediaHandler = handlers.NewMediaHandler(documentRepo, taskRepo, residenceRepo, deps.StorageService)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Prometheus metrics endpoint (no auth required, for scraping)
|
// Legacy Prometheus-shaped metrics from internal/monitoring (consumed by
|
||||||
|
// GoAdmin dashboard). Now lives at /metrics/legacy so the canonical /metrics
|
||||||
|
// route (registered above) emits proper Prometheus histograms with labels.
|
||||||
if deps.MonitoringService != nil {
|
if deps.MonitoringService != nil {
|
||||||
e.GET("/metrics", prometheusMetrics(deps.MonitoringService))
|
e.GET("/metrics/legacy", prometheusMetrics(deps.MonitoringService))
|
||||||
}
|
}
|
||||||
|
|
||||||
// Set up admin routes with monitoring handler (if available)
|
// Set up admin routes with monitoring handler (if available)
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ import (
|
|||||||
"github.com/rs/zerolog/log"
|
"github.com/rs/zerolog/log"
|
||||||
|
|
||||||
"github.com/treytartt/honeydue-api/internal/config"
|
"github.com/treytartt/honeydue-api/internal/config"
|
||||||
|
"github.com/treytartt/honeydue-api/internal/prom"
|
||||||
)
|
)
|
||||||
|
|
||||||
// StorageService handles file uploads, validation, encryption, and URL generation.
|
// StorageService handles file uploads, validation, encryption, and URL generation.
|
||||||
@@ -149,11 +150,18 @@ func (s *StorageService) Upload(file *multipart.FileHeader, category string) (*U
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Write to backend
|
// Write to backend (B2/S3 round trip — instrumented for Prometheus)
|
||||||
|
bucket := s.cfg.S3Bucket
|
||||||
|
if bucket == "" {
|
||||||
|
bucket = "local"
|
||||||
|
}
|
||||||
|
uploadStart := time.Now()
|
||||||
if err := s.backend.Write(key, fileData); err != nil {
|
if err := s.backend.Write(key, fileData); err != nil {
|
||||||
|
prom.ObserveB2Upload(bucket, "error", time.Since(uploadStart), 0)
|
||||||
return nil, fmt.Errorf("failed to save file: %w", err)
|
return nil, fmt.Errorf("failed to save file: %w", err)
|
||||||
}
|
}
|
||||||
written := int64(len(fileData))
|
written := int64(len(fileData))
|
||||||
|
prom.ObserveB2Upload(bucket, "ok", time.Since(uploadStart), written)
|
||||||
|
|
||||||
// Generate URL (always uses the original filename without .enc suffix)
|
// Generate URL (always uses the original filename without .enc suffix)
|
||||||
url := fmt.Sprintf("%s/%s/%s", s.cfg.BaseURL, subdir, newFilename)
|
url := fmt.Sprintf("%s/%s/%s", s.cfg.BaseURL, subdir, newFilename)
|
||||||
|
|||||||
Reference in New Issue
Block a user