Add Prometheus metrics + vmagent push to obs.88oakapps.com
Backend CI / Test (push) Has been cancelled
Backend CI / Contract Tests (push) Has been cancelled
Backend CI / Build (push) Has been cancelled
Backend CI / Lint (push) Has been cancelled
Backend CI / Secret Scanning (push) Has been cancelled

Adds internal/prom package with histograms for HTTP, GORM, B2, APNs, and
FCM, wired into the Echo router (HTTPMiddleware + /metrics) and GORM via
statement-level callbacks (no ctx plumbing needed). Storage and push
clients call ObserveB2Upload / ObserveAPNsSend / ObserveFCMSend at the
network round-trip points.

Existing internal/monitoring metrics move to /metrics/legacy so the
canonical /metrics emits proper histogram buckets for p50/p95/p99 rollups.

deploy-k3s/manifests/observability/vmagent.yaml deploys a single-replica
vmagent in the honeydue namespace that scrapes api Pods on :8000/metrics
every 15s and remote-writes to https://obs.88oakapps.com/api/v1/write
with a bearer token (substituted at deploy time from OBS_INGEST_TOKEN
in deploy/prod.env). NetworkPolicies allow vmagent egress to api Pods
and to the public obs endpoint over :443; the obs side runs
VictoriaMetrics + Jaeger + Grafana on 88oakappsUpdate.

docs/observability-plan.md captures the full plan including resource
budget, instrumentation table, 4-step rollout, and migration triggers.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-04-25 14:16:17 -05:00
parent 1cd6cafa9d
commit df78d9ccd8
10 changed files with 622 additions and 3 deletions
+199
View File
@@ -0,0 +1,199 @@
package prom
import (
	"errors"
	"net/http"
	"strconv"
	"time"

	"github.com/labstack/echo/v4"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/collectors"
	"github.com/prometheus/client_golang/prometheus/promhttp"
	"gorm.io/gorm"
)
// Package-level Prometheus collectors. Every collector declared here is
// registered on Registry by init. The label slices are positional: their order
// must match the WithLabelValues calls made elsewhere in this file.
var (
// Registry is a dedicated Prometheus registry (not the global default one)
// that holds all collectors of this package; Handler serves it.
Registry = prometheus.NewRegistry()
// httpRequestDuration is observed by HTTPMiddleware.
// Labels: route, method, status. Buckets span 5ms to 10s.
httpRequestDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Name: "http_request_duration_seconds",
Help: "Duration of HTTP requests in seconds.",
Buckets: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10},
}, []string{"route", "method", "status"})
// gormQueryDuration is observed by the callbacks installed in
// RegisterGORMCallbacks. Labels: table, operation. Buckets span 1ms to 5s.
gormQueryDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Name: "gorm_query_duration_seconds",
Help: "Duration of GORM database queries in seconds.",
Buckets: []float64{0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5},
}, []string{"table", "operation"})
// b2UploadDuration is observed by ObserveB2Upload.
// Labels: bucket, result. Buckets span 50ms to 60s.
b2UploadDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Name: "b2_upload_duration_seconds",
Help: "Duration of B2/S3 upload operations in seconds.",
Buckets: []float64{0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60},
}, []string{"bucket", "result"})
// b2UploadBytes is incremented by ObserveB2Upload with the uploaded byte
// count. Labels: bucket, result (same label set as b2UploadDuration).
b2UploadBytes = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "b2_upload_bytes_total",
Help: "Total bytes uploaded to B2/S3.",
}, []string{"bucket", "result"})
// apnsSendDuration is observed by ObserveAPNsSend.
// Label: result. Buckets span 10ms to 5s.
apnsSendDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Name: "apns_send_duration_seconds",
Help: "Duration of APNs push notification sends in seconds.",
Buckets: []float64{0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5},
}, []string{"result"})
// fcmSendDuration is observed by ObserveFCMSend.
// Label: result. Uses the same bucket layout as apnsSendDuration.
fcmSendDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Name: "fcm_send_duration_seconds",
Help: "Duration of FCM push notification sends in seconds.",
Buckets: []float64{0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5},
}, []string{"result"})
// asynqJobDuration is observed by ObserveAsynqJob.
// Labels: task_type, result. Buckets span 10ms to 300s.
asynqJobDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Name: "asynq_job_duration_seconds",
Help: "Duration of asynq background job execution in seconds.",
Buckets: []float64{0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30, 60, 300},
}, []string{"task_type", "result"})
)
func init() {
Registry.MustRegister(
collectors.NewGoCollector(),
collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}),
httpRequestDuration,
gormQueryDuration,
b2UploadDuration,
b2UploadBytes,
apnsSendDuration,
fcmSendDuration,
asynqJobDuration,
)
}
// Handler exposes the package Registry through promhttp, wrapped as an Echo
// handler. Mount the result at GET /metrics.
func Handler() echo.HandlerFunc {
	opts := promhttp.HandlerOpts{Registry: Registry}
	return echo.WrapHandler(promhttp.HandlerFor(Registry, opts))
}
// HTTPMiddleware records http_request_duration_seconds for every request,
// labeled by Echo route pattern, method, and status code.
//
// The route label is c.Path() (the registered pattern, not the raw URL), or
// "unknown" when Echo reports an empty path.
//
// When the wrapped handler returns an error, Echo's HTTPErrorHandler has not
// run yet at the point this middleware regains control, so the response status
// still holds its initial value and the failure would be recorded as a 200.
// In that case the status label is taken from the returned *echo.HTTPError,
// falling back to 500 for any other error. If the response was already
// committed (something further down the chain wrote it), the written status is
// used as-is.
//
// The handler's error is always returned unchanged to the caller.
func HTTPMiddleware() echo.MiddlewareFunc {
	return func(next echo.HandlerFunc) echo.HandlerFunc {
		return func(c echo.Context) error {
			start := time.Now()
			err := next(c)

			route := c.Path()
			if route == "" {
				route = "unknown"
			}

			resp := c.Response()
			status := resp.Status
			if err != nil && !resp.Committed {
				// Nothing has been written yet: predict what the error
				// handler is going to send for this error.
				var httpErr *echo.HTTPError
				if errors.As(err, &httpErr) {
					status = httpErr.Code
				} else {
					status = http.StatusInternalServerError
				}
			}

			httpRequestDuration.WithLabelValues(
				route,
				c.Request().Method,
				strconv.Itoa(status),
			).Observe(time.Since(start).Seconds())
			return err
		}
	}
}
// RegisterGORMCallbacks instruments a *gorm.DB so that each create, query,
// update, delete, row and raw operation is timed and reported as
// gorm_query_duration_seconds{table,operation}.
//
// For every operation a "prom:before_<op>" callback is placed before the
// built-in "gorm:<op>" callback and a "prom:after_<op>" callback after it. The
// start time travels between the two via the statement's instance settings, so
// no context needs to be threaded through repositories (that comes later when
// otelgorm lands).
//
// The first registration error aborts the process and is returned.
func RegisterGORMCallbacks(db *gorm.DB) error {
	const startKey = "honeydue:prom_start"

	// markStart stamps the statement with the moment execution begins.
	markStart := func(tx *gorm.DB) {
		tx.InstanceSet(startKey, time.Now())
	}

	// instrument installs the before/after pair for a single operation. The
	// operation is a parameter (not a loop variable) so each after-callback
	// closes over its own label value.
	instrument := func(op string) error {
		// Anything unrecognised falls back to the create processor.
		proc, hook := db.Callback().Create(), "gorm:create"
		switch op {
		case "query":
			proc, hook = db.Callback().Query(), "gorm:query"
		case "update":
			proc, hook = db.Callback().Update(), "gorm:update"
		case "delete":
			proc, hook = db.Callback().Delete(), "gorm:delete"
		case "row":
			proc, hook = db.Callback().Row(), "gorm:row"
		case "raw":
			proc, hook = db.Callback().Raw(), "gorm:raw"
		}

		if err := proc.Before(hook).Register("prom:before_"+op, markStart); err != nil {
			return err
		}

		return proc.After(hook).Register("prom:after_"+op, func(tx *gorm.DB) {
			stored, found := tx.InstanceGet(startKey)
			if !found {
				return
			}
			began, isTime := stored.(time.Time)
			if !isTime {
				return
			}

			tableLabel := tx.Statement.Table
			if tableLabel == "" {
				tableLabel = "unknown"
			}
			elapsed := time.Since(began).Seconds()
			gormQueryDuration.WithLabelValues(tableLabel, op).Observe(elapsed)
		})
	}

	operations := []string{"create", "query", "update", "delete", "row", "raw"}
	for i := range operations {
		if err := instrument(operations[i]); err != nil {
			return err
		}
	}
	return nil
}
// ObserveB2Upload reports one B2/S3 upload: its duration always goes into
// b2_upload_duration_seconds, and its size is added to b2_upload_bytes_total
// only when bytes is positive. result is "ok" or "error".
func ObserveB2Upload(bucket, result string, dur time.Duration, bytes int64) {
	b2UploadDuration.WithLabelValues(bucket, result).Observe(dur.Seconds())

	// Nothing to count for empty or unknown sizes.
	if bytes <= 0 {
		return
	}
	b2UploadBytes.WithLabelValues(bucket, result).Add(float64(bytes))
}
// ObserveAPNsSend reports how long one APNs send took, in seconds, under the
// given result label. result is "ok", "bad_token", or "error".
func ObserveAPNsSend(result string, dur time.Duration) {
	histogram := apnsSendDuration.WithLabelValues(result)
	histogram.Observe(dur.Seconds())
}
// ObserveFCMSend reports how long one FCM send took, in seconds, under the
// given result label. result is "ok", "bad_token", or "error".
func ObserveFCMSend(result string, dur time.Duration) {
	histogram := fcmSendDuration.WithLabelValues(result)
	histogram.Observe(dur.Seconds())
}
// ObserveAsynqJob reports how long one asynq job execution took, in seconds,
// labeled by task type and outcome. result is "ok", "retry", or "error".
func ObserveAsynqJob(taskType, result string, dur time.Duration) {
	histogram := asynqJobDuration.WithLabelValues(taskType, result)
	histogram.Observe(dur.Seconds())
}