From b67f7f9e6b738ae4aa12837e276757937b95a712 Mon Sep 17 00:00:00 2001 From: Trey t Date: Sun, 26 Apr 2026 21:29:30 -0500 Subject: [PATCH] Cache SubscriptionSettings + cut monitoring poll noise MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Trace data revealed subscription_subscriptionsettings was consuming 1,983s of cumulative DB time per day (180× more than the next-largest table) for a 32-byte singleton row of admin-toggleable global flags. Root cause was a 30-second poll loop in monitoring.Service per pod plus uncached reads on every authed status check / CreateResidence / Stripe webhook. Fix is layered: 1. Redis cache for SubscriptionSettings — same shape as the residence-IDs cache. 30-min TTL, explicit invalidation on admin write. New CacheService.{Cache,GetCached,Invalidate}SubscriptionSettings plus a cachedSubscriptionSettings helper in services/. 2. SubscriptionService, StripeService, and both admin handlers (settings + limitations) now read through the cache. Admin write handlers invalidate so toggles propagate cluster-wide within ms instead of waiting for the TTL. 3. monitoring.Service.syncSettingsFromDB also reads from Redis first (raw redis.Client to avoid a services→monitoring import cycle). Polling interval bumped 30s → 5min. Combined with Redis-shared cache, cluster-wide DB hits from this poll go from ~480/hour to ~2/hour — a 240× reduction. 4. StripeService.CreateCheckoutSession now takes ctx so the cached settings span (and the Stripe webhook trace) stay attached to the request. Handler call site updated. 5. Admin handlers' direct h.db.First calls switched to db.WithContext(ctx) so the resulting orphan SQL spans nest under the admin request span in Jaeger. Net DB query rate for subscription_subscriptionsettings should drop from 0.101/sec to ~0/sec with occasional invalidation-driven refills, and the table's cumulative DB time from 1,983s/day to ~10s/day. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../admin/handlers/limitations_handler.go | 43 ++++++++++++---- internal/admin/handlers/settings_handler.go | 49 +++++++++++++++---- internal/admin/routes.go | 5 +- internal/handlers/subscription_handler.go | 2 +- internal/monitoring/service.go | 47 ++++++++++++++++-- internal/router/router.go | 5 ++ internal/services/cache_service.go | 49 +++++++++++++++++++ internal/services/stripe_service.go | 19 +++++-- internal/services/subscription_service.go | 10 +++- .../services/subscription_settings_cache.go | 43 ++++++++++++++++ 10 files changed, 240 insertions(+), 32 deletions(-) create mode 100644 internal/services/subscription_settings_cache.go diff --git a/internal/admin/handlers/limitations_handler.go b/internal/admin/handlers/limitations_handler.go index 9e3ec0a..922418e 100644 --- a/internal/admin/handlers/limitations_handler.go +++ b/internal/admin/handlers/limitations_handler.go @@ -8,16 +8,18 @@ import ( "gorm.io/gorm" "github.com/treytartt/honeydue-api/internal/models" + "github.com/treytartt/honeydue-api/internal/services" ) // AdminLimitationsHandler handles subscription limitations management type AdminLimitationsHandler struct { - db *gorm.DB + db *gorm.DB + cache *services.CacheService } -// NewAdminLimitationsHandler creates a new handler -func NewAdminLimitationsHandler(db *gorm.DB) *AdminLimitationsHandler { - return &AdminLimitationsHandler{db: db} +// NewAdminLimitationsHandler creates a new handler. Cache is optional. +func NewAdminLimitationsHandler(db *gorm.DB, cache *services.CacheService) *AdminLimitationsHandler { + return &AdminLimitationsHandler{db: db, cache: cache} } // === Settings (enable_limitations) === @@ -27,14 +29,25 @@ type LimitationsSettingsResponse struct { EnableLimitations bool `json:"enable_limitations"` } -// GetSettings handles GET /api/admin/limitations/settings +// GetSettings handles GET /api/admin/limitations/settings. +// Reads through Redis cache first; on miss falls through to DB. func (h *AdminLimitationsHandler) GetSettings(c echo.Context) error { + ctx := c.Request().Context() + + if h.cache != nil { + var cached models.SubscriptionSettings + if err := h.cache.GetCachedSubscriptionSettings(ctx, &cached); err == nil { + return c.JSON(http.StatusOK, LimitationsSettingsResponse{ + EnableLimitations: cached.EnableLimitations, + }) + } + } + var settings models.SubscriptionSettings - if err := h.db.First(&settings, 1).Error; err != nil { + if err := h.db.WithContext(ctx).First(&settings, 1).Error; err != nil { if err == gorm.ErrRecordNotFound { - // Create default settings settings = models.SubscriptionSettings{ID: 1, EnableLimitations: false} - if err := h.db.Create(&settings).Error; err != nil { + if err := h.db.WithContext(ctx).Create(&settings).Error; err != nil { return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to create default settings"}) } } else { @@ -42,6 +55,10 @@ func (h *AdminLimitationsHandler) GetSettings(c echo.Context) error { } } + if h.cache != nil { + _ = h.cache.CacheSubscriptionSettings(ctx, &settings) + } + return c.JSON(http.StatusOK, LimitationsSettingsResponse{ EnableLimitations: settings.EnableLimitations, }) @@ -60,7 +77,8 @@ func (h *AdminLimitationsHandler) UpdateSettings(c echo.Context) error { } var settings models.SubscriptionSettings - if err := h.db.First(&settings, 1).Error; err != nil { + ctx := c.Request().Context() + if err := h.db.WithContext(ctx).First(&settings, 1).Error; err != nil { if err == gorm.ErrRecordNotFound { settings = models.SubscriptionSettings{ID: 1} } else { @@ -72,10 +90,15 @@ func (h *AdminLimitationsHandler) UpdateSettings(c echo.Context) error { settings.EnableLimitations = *req.EnableLimitations } - if err := h.db.Save(&settings).Error; err != nil { + if err := h.db.WithContext(ctx).Save(&settings).Error; err != nil { return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to update settings"}) } + // Invalidate the cache so the new value is visible to all pods. + if h.cache != nil { + _ = h.cache.InvalidateSubscriptionSettings(ctx) + } + return c.JSON(http.StatusOK, LimitationsSettingsResponse{ EnableLimitations: settings.EnableLimitations, }) diff --git a/internal/admin/handlers/settings_handler.go b/internal/admin/handlers/settings_handler.go index 350723c..ea8c859 100644 --- a/internal/admin/handlers/settings_handler.go +++ b/internal/admin/handlers/settings_handler.go @@ -18,12 +18,14 @@ import ( // AdminSettingsHandler handles system settings management type AdminSettingsHandler struct { - db *gorm.DB + db *gorm.DB + cache *services.CacheService } -// NewAdminSettingsHandler creates a new handler -func NewAdminSettingsHandler(db *gorm.DB) *AdminSettingsHandler { - return &AdminSettingsHandler{db: db} +// NewAdminSettingsHandler creates a new handler. The cache may be nil; the +// handler falls through to direct DB reads in that case. +func NewAdminSettingsHandler(db *gorm.DB, cache *services.CacheService) *AdminSettingsHandler { + return &AdminSettingsHandler{db: db, cache: cache} } // SettingsResponse represents the settings response @@ -34,10 +36,29 @@ type SettingsResponse struct { TrialDurationDays int `json:"trial_duration_days"` } -// GetSettings handles GET /api/admin/settings +// GetSettings handles GET /api/admin/settings. +// +// Reads through Redis (30-min TTL) before hitting Postgres so the same +// row that's checked on every authed request and every monitoring poll +// stays hot. Cache miss / first boot creates and caches the default row. func (h *AdminSettingsHandler) GetSettings(c echo.Context) error { + ctx := c.Request().Context() + + // Try cache first. + if h.cache != nil { + var cached models.SubscriptionSettings + if err := h.cache.GetCachedSubscriptionSettings(ctx, &cached); err == nil { + return c.JSON(http.StatusOK, SettingsResponse{ + EnableLimitations: cached.EnableLimitations, + EnableMonitoring: cached.EnableMonitoring, + TrialEnabled: cached.TrialEnabled, + TrialDurationDays: cached.TrialDurationDays, + }) + } + } + var settings models.SubscriptionSettings - if err := h.db.First(&settings, 1).Error; err != nil { + if err := h.db.WithContext(ctx).First(&settings, 1).Error; err != nil { if err == gorm.ErrRecordNotFound { // Create default settings settings = models.SubscriptionSettings{ @@ -47,7 +68,7 @@ func (h *AdminSettingsHandler) GetSettings(c echo.Context) error { TrialEnabled: true, TrialDurationDays: 14, } - if err := h.db.Create(&settings).Error; err != nil { + if err := h.db.WithContext(ctx).Create(&settings).Error; err != nil { return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to create default settings"}) } } else { @@ -55,6 +76,10 @@ func (h *AdminSettingsHandler) GetSettings(c echo.Context) error { } } + if h.cache != nil { + _ = h.cache.CacheSubscriptionSettings(ctx, &settings) + } + return c.JSON(http.StatusOK, SettingsResponse{ EnableLimitations: settings.EnableLimitations, EnableMonitoring: settings.EnableMonitoring, @@ -79,7 +104,7 @@ func (h *AdminSettingsHandler) UpdateSettings(c echo.Context) error { } var settings models.SubscriptionSettings - if err := h.db.First(&settings, 1).Error; err != nil { + if err := h.db.WithContext(c.Request().Context()).First(&settings, 1).Error; err != nil { if err == gorm.ErrRecordNotFound { settings = models.SubscriptionSettings{ ID: 1, @@ -108,10 +133,16 @@ func (h *AdminSettingsHandler) UpdateSettings(c echo.Context) error { settings.TrialDurationDays = *req.TrialDurationDays } - if err := h.db.Save(&settings).Error; err != nil { + if err := h.db.WithContext(c.Request().Context()).Save(&settings).Error; err != nil { return c.JSON(http.StatusInternalServerError, map[string]interface{}{"error": "Failed to update settings"}) } + // Invalidate the cache so all pods pick up the new value on their + // next read (instead of waiting for the 30-min TTL). + if h.cache != nil { + _ = h.cache.InvalidateSubscriptionSettings(c.Request().Context()) + } + return c.JSON(http.StatusOK, SettingsResponse{ EnableLimitations: settings.EnableLimitations, EnableMonitoring: settings.EnableMonitoring, diff --git a/internal/admin/routes.go b/internal/admin/routes.go index c2d6141..42cbd98 100644 --- a/internal/admin/routes.go +++ b/internal/admin/routes.go @@ -25,6 +25,7 @@ type Dependencies struct { PushClient *push.Client OnboardingService *services.OnboardingEmailService MonitoringHandler *monitoring.Handler + CacheService *services.CacheService } // SetupRoutes configures all admin routes @@ -380,7 +381,7 @@ func SetupRoutes(router *echo.Echo, db *gorm.DB, cfg *config.Config, deps *Depen } // System settings management (super admin only) - settingsHandler := handlers.NewAdminSettingsHandler(db) + settingsHandler := handlers.NewAdminSettingsHandler(db, deps.CacheService) settings := protected.Group("/settings") settings.Use(middleware.RequireSuperAdmin()) { @@ -394,7 +395,7 @@ func SetupRoutes(router *echo.Echo, db *gorm.DB, cfg *config.Config, deps *Depen } // Limitations management (tier limits, upgrade triggers) - limitationsHandler := handlers.NewAdminLimitationsHandler(db) + limitationsHandler := handlers.NewAdminLimitationsHandler(db, deps.CacheService) limitations := protected.Group("/limitations") { // Settings (enable_limitations toggle) diff --git a/internal/handlers/subscription_handler.go b/internal/handlers/subscription_handler.go index faa5a48..3a37596 100644 --- a/internal/handlers/subscription_handler.go +++ b/internal/handlers/subscription_handler.go @@ -244,7 +244,7 @@ func (h *SubscriptionHandler) CreateCheckoutSession(c echo.Context) error { return err } - sessionURL, err := h.stripeService.CreateCheckoutSession(user.ID, req.PriceID, req.SuccessURL, req.CancelURL) + sessionURL, err := h.stripeService.CreateCheckoutSession(c.Request().Context(), user.ID, req.PriceID, req.SuccessURL, req.CancelURL) if err != nil { return err } diff --git a/internal/monitoring/service.go b/internal/monitoring/service.go index 31dd2e0..f63292f 100644 --- a/internal/monitoring/service.go +++ b/internal/monitoring/service.go @@ -1,6 +1,8 @@ package monitoring import ( + "context" + "encoding/json" "io" "sync" "time" @@ -17,8 +19,13 @@ import ( const ( // DefaultStatsInterval is the default interval for collecting/publishing stats DefaultStatsInterval = 5 * time.Second - // SettingsSyncInterval is how often to check the database for enable_monitoring setting - SettingsSyncInterval = 30 * time.Second + // SettingsSyncInterval is how often to check the database for the + // enable_monitoring setting. Was 30s; bumped to 5min after the + // observability rollout revealed this poll was generating ~9k DB queries + // per day across all pods just to read a 32-byte admin toggle. The flag + // is admin-only and changes essentially never; 5-minute freshness is + // strictly more than enough. + SettingsSyncInterval = 5 * time.Minute ) // Service orchestrates all monitoring components @@ -31,11 +38,18 @@ type Service struct { handler *Handler logWriter *RedisLogWriter db *gorm.DB + redis *redis.Client settingsStopCh chan struct{} stopOnce sync.Once statsInterval time.Duration } +// settingsCacheKey is the same key services.CacheService uses for the +// SubscriptionSettings singleton. Duplicated here because the monitoring +// package doesn't import services (avoids the import cycle that would +// arise if services ever imports monitoring). +const settingsCacheKey = "subscription_settings:1" + // Config holds configuration for the monitoring service type Config struct { Process string // "api" or "worker" @@ -73,6 +87,7 @@ func NewService(cfg Config) *Service { handler: handler, logWriter: logWriter, db: cfg.DB, + redis: cfg.RedisClient, settingsStopCh: make(chan struct{}), statsInterval: cfg.StatsInterval, } @@ -121,12 +136,32 @@ func (s *Service) Stop() { }) } -// syncSettingsFromDB checks the database for the enable_monitoring setting +// syncSettingsFromDB checks for the enable_monitoring setting, going through +// Redis first and falling back to Postgres on cache miss. +// +// In steady state across the cluster: with 4 pods polling every 5 min and a +// 30-min cache TTL, only ~1 of every 6 polls actually reaches Postgres, +// and only one pod hits the DB per 30-min window thanks to the shared +// Redis cache. ~9k DB queries/day → ~50/day. func (s *Service) syncSettingsFromDB() { if s.db == nil { return } + // Try Redis cache first (same key services.CacheService writes to). + if s.redis != nil { + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) + defer cancel() + + if data, err := s.redis.Get(ctx, settingsCacheKey).Bytes(); err == nil { + var cached models.SubscriptionSettings + if err := json.Unmarshal(data, &cached); err == nil { + s.applySettings(&cached) + return + } + } + } + var settings models.SubscriptionSettings err := s.db.First(&settings, 1).Error if err != nil { @@ -138,6 +173,12 @@ func (s *Service) syncSettingsFromDB() { return } + s.applySettings(&settings) +} + +// applySettings reflects a SubscriptionSettings record onto the log writer +// and logs the change if it actually flipped. +func (s *Service) applySettings(settings *models.SubscriptionSettings) { wasEnabled := s.logWriter.IsEnabled() s.logWriter.SetEnabled(settings.EnableMonitoring) diff --git a/internal/router/router.go b/internal/router/router.go index f7e6b0f..9119426 100644 --- a/internal/router/router.go +++ b/internal/router/router.go @@ -208,12 +208,16 @@ func SetupRouter(deps *Dependencies) *echo.Echo { taskService.SetCacheService(deps.Cache) contractorService.SetCacheService(deps.Cache) documentService.SetCacheService(deps.Cache) + subscriptionService.SetCacheService(deps.Cache) } taskTemplateService := services.NewTaskTemplateService(taskTemplateRepo) suggestionService := services.NewSuggestionService(deps.DB, residenceRepo) // Initialize Stripe service stripeService := services.NewStripeService(subscriptionRepo, userRepo) + if deps.Cache != nil { + stripeService.SetCacheService(deps.Cache) + } // Initialize webhook event repo for deduplication webhookEventRepo := repositories.NewWebhookEventRepository(deps.DB) @@ -275,6 +279,7 @@ func SetupRouter(deps *Dependencies) *echo.Echo { PushClient: deps.PushClient, OnboardingService: onboardingService, MonitoringHandler: monitoringHandler, + CacheService: deps.Cache, } admin.SetupRoutes(e, deps.DB, cfg, adminDeps) diff --git a/internal/services/cache_service.go b/internal/services/cache_service.go index fcfd70a..24beb0a 100644 --- a/internal/services/cache_service.go +++ b/internal/services/cache_service.go @@ -446,3 +446,52 @@ func (c *CacheService) InvalidateResidenceIDsForUsers(ctx context.Context, userI } return c.Delete(ctx, keys...) } + +// === SubscriptionSettings cache === +// +// SubscriptionSettings is a 32-byte singleton row of admin-toggleable global +// flags (EnableLimitations, EnableMonitoring, TrialEnabled, TrialDurationDays). +// Read on every authed status check, every CreateResidence, and once per +// 30s by every monitoring goroutine. Cached forever-ish here; admin writes +// invalidate explicitly. +// +// 30-minute TTL is belt-and-suspenders against an admin update that somehow +// bypasses the invalidation path (e.g., a manual SQL UPDATE). The flag value +// converging within 30 min is fine for any real use case. + +const ( + subscriptionSettingsKey = "subscription_settings:1" + subscriptionSettingsTTL = 30 * time.Minute +) + +// CacheSubscriptionSettings stores the singleton settings row. Caller passes +// any encodable value — typically *models.SubscriptionSettings. Best-effort. +func (c *CacheService) CacheSubscriptionSettings(ctx context.Context, settings interface{}) error { + if c == nil { + return nil + } + data, err := json.Marshal(settings) + if err != nil { + return err + } + return c.client.Set(ctx, subscriptionSettingsKey, data, subscriptionSettingsTTL).Err() +} + +// GetCachedSubscriptionSettings unmarshals into the supplied destination. +// Returns redis.Nil on cache miss so callers can distinguish from genuine errors. +func (c *CacheService) GetCachedSubscriptionSettings(ctx context.Context, dest interface{}) error { + if c == nil { + return fmt.Errorf("cache not available") + } + return c.Get(ctx, subscriptionSettingsKey, dest) +} + +// InvalidateSubscriptionSettings drops the singleton-settings cache. Called +// from admin handlers that update the row so the new values are visible +// immediately to all pods (instead of waiting for the 30-min TTL). +func (c *CacheService) InvalidateSubscriptionSettings(ctx context.Context) error { + if c == nil { + return nil + } + return c.Delete(ctx, subscriptionSettingsKey) +} diff --git a/internal/services/stripe_service.go b/internal/services/stripe_service.go index 709208c..085471a 100644 --- a/internal/services/stripe_service.go +++ b/internal/services/stripe_service.go @@ -1,6 +1,7 @@ package services import ( + "context" "encoding/json" "fmt" "time" @@ -24,6 +25,12 @@ type StripeService struct { subscriptionRepo *repositories.SubscriptionRepository userRepo *repositories.UserRepository webhookSecret string + cache *CacheService +} + +// SetCacheService wires Redis caching for SubscriptionSettings reads. +func (s *StripeService) SetCacheService(cache *CacheService) { + s.cache = cache } // NewStripeService creates a new Stripe service. It initializes the global @@ -58,7 +65,7 @@ func NewStripeService( // CreateCheckoutSession creates a Stripe Checkout Session for a web subscription purchase. // It ensures the user has a Stripe customer record and configures the session with a trial // period if the user has not used their trial yet. -func (s *StripeService) CreateCheckoutSession(userID uint, priceID string, successURL string, cancelURL string) (string, error) { +func (s *StripeService) CreateCheckoutSession(ctx context.Context, userID uint, priceID string, successURL string, cancelURL string) (string, error) { // Get or create the user's subscription record sub, err := s.subscriptionRepo.GetOrCreate(userID) if err != nil { @@ -94,7 +101,7 @@ func (s *StripeService) CreateCheckoutSession(userID uint, priceID string, succe // Offer a trial period if the user has not used their trial yet if !sub.TrialUsed { - trialDays, err := s.getTrialDays() + trialDays, err := s.getTrialDays(ctx) if err != nil { log.Warn().Err(err).Msg("Failed to get trial duration from settings, skipping trial") } else if trialDays > 0 { @@ -444,9 +451,11 @@ func (s *StripeService) findSubscriptionByStripeID(stripeSubID string) (*models. return sub, nil } -// getTrialDays reads the trial duration from SubscriptionSettings. -func (s *StripeService) getTrialDays() (int, error) { - settings, err := s.subscriptionRepo.GetSettings() +// getTrialDays reads the trial duration from SubscriptionSettings via the +// shared cache. ctx threads through so the SQL span (on cache miss) attaches +// to the parent webhook trace. +func (s *StripeService) getTrialDays(ctx context.Context) (int, error) { + settings, err := cachedSubscriptionSettings(ctx, s.cache, s.subscriptionRepo) if err != nil { return 0, err } diff --git a/internal/services/subscription_service.go b/internal/services/subscription_service.go index 84ff5fe..a340a4f 100644 --- a/internal/services/subscription_service.go +++ b/internal/services/subscription_service.go @@ -50,6 +50,12 @@ type SubscriptionService struct { documentRepo *repositories.DocumentRepository appleClient *AppleIAPClient googleClient *GoogleIAPClient + cache *CacheService +} + +// SetCacheService wires Redis caching for SubscriptionSettings reads. +func (s *SubscriptionService) SetCacheService(cache *CacheService) { + s.cache = cache } // NewSubscriptionService creates a new subscription service @@ -113,7 +119,7 @@ func (s *SubscriptionService) GetSubscriptionStatus(ctx context.Context, userID return nil, apperrors.Internal(err) } - settings, err := s.subscriptionRepo.WithContext(ctx).GetSettings() + settings, err := cachedSubscriptionSettings(ctx, s.cache, s.subscriptionRepo) if err != nil { return nil, apperrors.Internal(err) } @@ -243,7 +249,7 @@ func (s *SubscriptionService) getUserUsage(ctx context.Context, userID uint) (*U // CheckLimit checks if a user has exceeded a specific limit func (s *SubscriptionService) CheckLimit(ctx context.Context, userID uint, limitType string) error { - settings, err := s.subscriptionRepo.WithContext(ctx).GetSettings() + settings, err := cachedSubscriptionSettings(ctx, s.cache, s.subscriptionRepo) if err != nil { return apperrors.Internal(err) } diff --git a/internal/services/subscription_settings_cache.go b/internal/services/subscription_settings_cache.go new file mode 100644 index 0000000..9e3d912 --- /dev/null +++ b/internal/services/subscription_settings_cache.go @@ -0,0 +1,43 @@ +package services + +import ( + "context" + + "github.com/treytartt/honeydue-api/internal/models" + "github.com/treytartt/honeydue-api/internal/repositories" +) + +// cachedSubscriptionSettings fetches the singleton settings row, going +// through Redis (30-min TTL) before falling back to Postgres. +// +// Hot read — touched on every CheckLimit, every GetSubscriptionStatus, +// and every Stripe webhook. The row is admin-toggleable but writes are +// rare; the cache cuts the per-request cost from ~250ms (transatlantic +// Postgres roundtrip) to ~1ms (cluster-internal Redis). +// +// On a nil cache (tests, Redis-down), falls through to the repo directly +// so the caller never sees a hard failure from caching. +// +// Admin writes invalidate via cache.InvalidateSubscriptionSettings. +func cachedSubscriptionSettings( + ctx context.Context, + cache *CacheService, + subRepo *repositories.SubscriptionRepository, +) (*models.SubscriptionSettings, error) { + if cache != nil { + var settings models.SubscriptionSettings + if err := cache.GetCachedSubscriptionSettings(ctx, &settings); err == nil { + return &settings, nil + } + } + + settings, err := subRepo.WithContext(ctx).GetSettings() + if err != nil { + return nil, err + } + + if cache != nil { + _ = cache.CacheSubscriptionSettings(ctx, settings) + } + return settings, nil +}