b67f7f9e6b
Trace data revealed subscription_subscriptionsettings was consuming
1,983s of cumulative DB time per day (180× more than the next-largest
table) for a 32-byte singleton row of admin-toggleable global flags.
Root cause was a 30-second poll loop in monitoring.Service per pod
plus uncached reads on every authed status check / CreateResidence /
Stripe webhook. Fix is layered:
1. Redis cache for SubscriptionSettings — same shape as the
residence-IDs cache. 30-min TTL, explicit invalidation on admin
write. New CacheService.{Cache,GetCached,Invalidate}SubscriptionSettings
plus a cachedSubscriptionSettings helper in services/.
2. SubscriptionService, StripeService, and both admin handlers
(settings + limitations) now read through the cache. Admin write
handlers invalidate so toggles propagate cluster-wide within ms
instead of waiting for the TTL.
3. monitoring.Service.syncSettingsFromDB also reads from Redis first
(raw redis.Client to avoid a services→monitoring import cycle).
Polling interval bumped 30s → 5min. Combined with Redis-shared
cache, cluster-wide DB hits from this poll go from ~480/hour to
~2/hour — a 240× reduction.
4. StripeService.CreateCheckoutSession now takes ctx so the cached
settings span (and the Stripe webhook trace) stay attached to the
request. Handler call site updated.
5. Admin handlers' direct h.db.First calls switched to
db.WithContext(ctx) so the resulting orphan SQL spans nest under
the admin request span in Jaeger.
Net DB query rate for subscription_subscriptionsettings should drop
from 0.101/sec to ~0/sec with occasional invalidation-driven refills,
and the table's cumulative DB time from 1,983s/day to ~10s/day.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
247 lines
6.9 KiB
Go
247 lines
6.9 KiB
Go
package monitoring
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"io"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/hibiken/asynq"
|
|
"github.com/labstack/echo/v4"
|
|
"github.com/redis/go-redis/v9"
|
|
"github.com/rs/zerolog/log"
|
|
"gorm.io/gorm"
|
|
|
|
"github.com/treytartt/honeydue-api/internal/models"
|
|
)
|
|
|
|
const (
|
|
// DefaultStatsInterval is the default interval for collecting/publishing stats
|
|
DefaultStatsInterval = 5 * time.Second
|
|
// SettingsSyncInterval is how often to check the database for the
|
|
// enable_monitoring setting. Was 30s; bumped to 5min after the
|
|
// observability rollout revealed this poll was generating ~9k DB queries
|
|
// per day across all pods just to read a 32-byte admin toggle. The flag
|
|
// is admin-only and changes essentially never; 5-minute freshness is
|
|
// strictly more than enough.
|
|
SettingsSyncInterval = 5 * time.Minute
|
|
)
|
|
|
|
// Service orchestrates all monitoring components
|
|
type Service struct {
|
|
process string
|
|
logBuffer *LogBuffer
|
|
statsStore *StatsStore
|
|
collector *Collector
|
|
httpCollector *HTTPStatsCollector
|
|
handler *Handler
|
|
logWriter *RedisLogWriter
|
|
db *gorm.DB
|
|
redis *redis.Client
|
|
settingsStopCh chan struct{}
|
|
stopOnce sync.Once
|
|
statsInterval time.Duration
|
|
}
|
|
|
|
// settingsCacheKey is the same key services.CacheService uses for the
|
|
// SubscriptionSettings singleton. Duplicated here because the monitoring
|
|
// package doesn't import services (avoids the import cycle that would
|
|
// arise if services ever imports monitoring).
|
|
const settingsCacheKey = "subscription_settings:1"
|
|
|
|
// Config holds configuration for the monitoring service
|
|
type Config struct {
|
|
Process string // "api" or "worker"
|
|
RedisClient *redis.Client // Redis client for log buffer
|
|
StatsInterval time.Duration // Interval for stats collection (default 5s)
|
|
DB *gorm.DB // Database for checking enable_monitoring setting (optional)
|
|
}
|
|
|
|
// NewService creates a new monitoring service
|
|
func NewService(cfg Config) *Service {
|
|
if cfg.StatsInterval == 0 {
|
|
cfg.StatsInterval = DefaultStatsInterval
|
|
}
|
|
|
|
// Create components
|
|
logBuffer := NewLogBuffer(cfg.RedisClient)
|
|
statsStore := NewStatsStore(cfg.RedisClient)
|
|
collector := NewCollector(cfg.Process, statsStore)
|
|
handler := NewHandler(logBuffer, statsStore)
|
|
logWriter := NewRedisLogWriter(logBuffer, cfg.Process)
|
|
|
|
// For API server, create HTTP stats collector
|
|
var httpCollector *HTTPStatsCollector
|
|
if cfg.Process == "api" {
|
|
httpCollector = NewHTTPStatsCollector()
|
|
collector.SetHTTPCollector(httpCollector)
|
|
}
|
|
|
|
svc := &Service{
|
|
process: cfg.Process,
|
|
logBuffer: logBuffer,
|
|
statsStore: statsStore,
|
|
collector: collector,
|
|
httpCollector: httpCollector,
|
|
handler: handler,
|
|
logWriter: logWriter,
|
|
db: cfg.DB,
|
|
redis: cfg.RedisClient,
|
|
settingsStopCh: make(chan struct{}),
|
|
statsInterval: cfg.StatsInterval,
|
|
}
|
|
|
|
// Check initial setting from database
|
|
if cfg.DB != nil {
|
|
svc.syncSettingsFromDB()
|
|
}
|
|
|
|
return svc
|
|
}
|
|
|
|
// SetAsynqInspector sets the Asynq inspector for worker stats
|
|
func (s *Service) SetAsynqInspector(inspector *asynq.Inspector) {
|
|
s.collector.SetAsynqInspector(inspector)
|
|
}
|
|
|
|
// Start begins collecting and publishing stats
|
|
func (s *Service) Start() {
|
|
log.Info().
|
|
Str("process", s.process).
|
|
Dur("interval", s.statsInterval).
|
|
Bool("enabled", s.logWriter.IsEnabled()).
|
|
Msg("Starting monitoring service")
|
|
|
|
s.collector.StartPublishing(s.statsInterval)
|
|
|
|
// Start settings sync if database is available
|
|
if s.db != nil {
|
|
go s.startSettingsSync()
|
|
}
|
|
}
|
|
|
|
// Stop stops the monitoring service. It is safe to call multiple times.
|
|
func (s *Service) Stop() {
|
|
s.stopOnce.Do(func() {
|
|
// Stop settings sync
|
|
close(s.settingsStopCh)
|
|
|
|
s.collector.Stop()
|
|
|
|
// Flush and close the log writer's background goroutine
|
|
s.logWriter.Close()
|
|
|
|
log.Info().Str("process", s.process).Msg("Monitoring service stopped")
|
|
})
|
|
}
|
|
|
|
// syncSettingsFromDB checks for the enable_monitoring setting, going through
|
|
// Redis first and falling back to Postgres on cache miss.
|
|
//
|
|
// In steady state across the cluster: with 4 pods polling every 5 min and a
|
|
// 30-min cache TTL, only ~1 of every 6 polls actually reaches Postgres,
|
|
// and only one pod hits the DB per 30-min window thanks to the shared
|
|
// Redis cache. ~9k DB queries/day → ~50/day.
|
|
func (s *Service) syncSettingsFromDB() {
|
|
if s.db == nil {
|
|
return
|
|
}
|
|
|
|
// Try Redis cache first (same key services.CacheService writes to).
|
|
if s.redis != nil {
|
|
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
|
|
defer cancel()
|
|
|
|
if data, err := s.redis.Get(ctx, settingsCacheKey).Bytes(); err == nil {
|
|
var cached models.SubscriptionSettings
|
|
if err := json.Unmarshal(data, &cached); err == nil {
|
|
s.applySettings(&cached)
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
var settings models.SubscriptionSettings
|
|
err := s.db.First(&settings, 1).Error
|
|
if err != nil {
|
|
if err == gorm.ErrRecordNotFound {
|
|
// No settings record, default to enabled
|
|
s.logWriter.SetEnabled(true)
|
|
}
|
|
// On other errors, keep current state
|
|
return
|
|
}
|
|
|
|
s.applySettings(&settings)
|
|
}
|
|
|
|
// applySettings reflects a SubscriptionSettings record onto the log writer
|
|
// and logs the change if it actually flipped.
|
|
func (s *Service) applySettings(settings *models.SubscriptionSettings) {
|
|
wasEnabled := s.logWriter.IsEnabled()
|
|
s.logWriter.SetEnabled(settings.EnableMonitoring)
|
|
|
|
if wasEnabled != settings.EnableMonitoring {
|
|
log.Info().
|
|
Str("process", s.process).
|
|
Bool("enabled", settings.EnableMonitoring).
|
|
Msg("Monitoring log capture setting changed")
|
|
}
|
|
}
|
|
|
|
// startSettingsSync periodically checks the database for settings changes
|
|
func (s *Service) startSettingsSync() {
|
|
ticker := time.NewTicker(SettingsSyncInterval)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-s.settingsStopCh:
|
|
return
|
|
case <-ticker.C:
|
|
s.syncSettingsFromDB()
|
|
}
|
|
}
|
|
}
|
|
|
|
// SetEnabled manually enables or disables log capture
|
|
func (s *Service) SetEnabled(enabled bool) {
|
|
s.logWriter.SetEnabled(enabled)
|
|
}
|
|
|
|
// IsEnabled returns whether log capture is enabled
|
|
func (s *Service) IsEnabled() bool {
|
|
return s.logWriter.IsEnabled()
|
|
}
|
|
|
|
// SetDB sets the database connection for settings sync
|
|
// This can be called after NewService if DB wasn't available during initialization
|
|
func (s *Service) SetDB(db *gorm.DB) {
|
|
s.db = db
|
|
s.syncSettingsFromDB()
|
|
}
|
|
|
|
// LogWriter returns an io.Writer for zerolog that captures logs to Redis
|
|
func (s *Service) LogWriter() io.Writer {
|
|
return s.logWriter
|
|
}
|
|
|
|
// Handler returns the HTTP handler for monitoring endpoints
|
|
func (s *Service) Handler() *Handler {
|
|
return s.handler
|
|
}
|
|
|
|
// HTTPCollector returns the HTTP stats collector (nil for worker)
|
|
func (s *Service) HTTPCollector() *HTTPStatsCollector {
|
|
return s.httpCollector
|
|
}
|
|
|
|
// MetricsMiddleware returns the Echo middleware for HTTP metrics (API server only)
|
|
func (s *Service) MetricsMiddleware() echo.MiddlewareFunc {
|
|
if s.httpCollector == nil {
|
|
return nil
|
|
}
|
|
return MetricsMiddleware(s.httpCollector)
|
|
}
|