perf(subscription-status): cache + parallelize + invalidate on mutations
Backend CI / Test (push) Has been cancelled
Backend CI / Contract Tests (push) Has been cancelled
Backend CI / Build (push) Has been cancelled
Backend CI / Lint (push) Has been cancelled
Backend CI / Secret Scanning (push) Has been cancelled

GET /api/subscription/status/ was the slowest endpoint in the API at
p50≈1750ms / p95≈2425ms — about 12× the floor for our cluster→Neon
geography. Jaeger traces showed seven sequential SQL queries each
costing roughly one transatlantic RTT (~110ms), with the actual queries
running in 0.073ms at the database. Pure network serialization, not slow
SQL.

Three changes, in order of leverage:

1. Cache the assembled SubscriptionStatusResponse per-user in Redis with
   a 5-minute TTL. Hot path collapses to a single Redis GET (~5ms) on
   warm reads; the TTL is a safety net against missed invalidations.

2. Parallelize the three independent COUNT queries in getUserUsage
   (task_task / task_contractor / task_document) via golang.org/x/sync
   errgroup. Three RTTs collapse to one. Also dropped the redundant
   residence_residence COUNT — len(residenceIDs) from FindResidenceIDsByOwner
   is the same number, no need to re-query.

3. Wire explicit invalidation into every mutation that could change a
   user's response — residence/task/contractor/document CRUD,
   residence membership changes (JoinWithCode, RemoveUser, DeleteResidence),
   and every subscription tier flip across the IAP/Stripe/webhook surface.
   Residence-scoped invalidations fan out to every user with access via a
   new ResidenceRepository.FindUserIDsByResidence helper, so members of a
   shared residence don't see stale `usage` numbers when another member
   adds a task.

Net effect: warm path goes from ~1350ms to ~5ms (Redis hit). Cold path
goes from ~1350ms to ~250-450ms (getUserUsage's 5 sequential queries → 2
phases: residence IDs lookup, then parallel task/contractor/document counts).

Also fixed a pre-existing CheckLimit signature drift in
internal/integration/subscription_is_free_test.go that was blocking the
package build.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-05-01 11:00:23 -07:00
parent 0798ae8d74
commit 9bee436e86
11 changed files with 286 additions and 34 deletions
+95 -23
View File
@@ -6,6 +6,7 @@ import (
"time"
"github.com/rs/zerolog/log"
"golang.org/x/sync/errgroup"
"gorm.io/gorm"
"github.com/treytartt/honeydue-api/internal/apperrors"
@@ -112,8 +113,24 @@ func (s *SubscriptionService) GetSubscription(ctx context.Context, userID uint)
return NewSubscriptionResponse(sub), nil
}
// GetSubscriptionStatus gets detailed subscription status including limits
// GetSubscriptionStatus gets detailed subscription status including limits.
//
// Hot path on the iOS launch screen — runs 7+ sequential SQL queries against
// transatlantic Neon Postgres at ~110ms RTT each (~800ms floor before
// optimization). The assembled response is cached per-user in Redis with a
// 5-minute TTL; mutation paths (residence/task/contractor/document/sub CRUD)
// invalidate via cache.InvalidateSubscriptionStatusForUsers, fanning out to
// every member of a shared residence.
func (s *SubscriptionService) GetSubscriptionStatus(ctx context.Context, userID uint) (*SubscriptionStatusResponse, error) {
// Cache fast path — only used on warm reads. Cold reads, trial-start
// branch, and the actual mutation paths below all populate fresh.
if s.cache != nil {
var cached SubscriptionStatusResponse
if err := s.cache.GetCachedSubscriptionStatus(ctx, userID, &cached); err == nil {
return &cached, nil
}
}
sub, err := s.subscriptionRepo.WithContext(ctx).GetOrCreate(userID)
if err != nil {
return nil, apperrors.Internal(err)
@@ -204,43 +221,59 @@ func (s *SubscriptionService) GetSubscriptionStatus(ctx context.Context, userID
resp.TrialActive = sub.IsTrialActive()
resp.SubscriptionSource = sub.SubscriptionSource()
// Best-effort cache write. Errors are logged at the cache layer, not fatal.
if s.cache != nil {
_ = s.cache.CacheSubscriptionStatus(ctx, userID, resp)
}
return resp, nil
}
// getUserUsage calculates current usage for a user.
// P-10: Uses CountByOwner for properties count instead of loading all owned residences.
// Uses batch COUNT queries (O(1) queries) instead of per-residence queries (O(N)).
//
// Performance: residence ID lookup is one query (we use len() for the
// properties count instead of a redundant COUNT). The three IN-clause counts
// against task_task / task_contractor / task_document don't depend on each
// other and run concurrently via errgroup, collapsing 3 transatlantic RTTs
// into 1. With residence IDs that's 2 RTT total instead of the prior 5.
func (s *SubscriptionService) getUserUsage(ctx context.Context, userID uint) (*UsageResponse, error) {
// P-10: Use CountByOwner for an efficient COUNT query instead of loading all records
propertiesCount, err := s.residenceRepo.WithContext(ctx).CountByOwner(userID)
if err != nil {
return nil, apperrors.Internal(err)
}
// Still need residence IDs for batch counting tasks/contractors/documents
// One query — used both for the properties count (len) and as the IN-list
// for the three downstream counts. Replaces the prior CountByOwner +
// FindResidenceIDsByOwner pair, which queried residence_residence twice
// with the same predicate.
residenceIDs, err := s.residenceRepo.WithContext(ctx).FindResidenceIDsByOwner(userID)
if err != nil {
return nil, apperrors.Internal(err)
}
// Count tasks, contractors, and documents across all residences with single queries each
tasksCount, err := s.taskRepo.WithContext(ctx).CountByResidenceIDs(residenceIDs)
if err != nil {
return nil, apperrors.Internal(err)
}
var (
tasksCount int64
contractorsCount int64
documentsCount int64
)
contractorsCount, err := s.contractorRepo.WithContext(ctx).CountByResidenceIDs(residenceIDs)
if err != nil {
return nil, apperrors.Internal(err)
}
documentsCount, err := s.documentRepo.WithContext(ctx).CountByResidenceIDs(residenceIDs)
if err != nil {
g, gCtx := errgroup.WithContext(ctx)
g.Go(func() error {
c, err := s.taskRepo.WithContext(gCtx).CountByResidenceIDs(residenceIDs)
tasksCount = c
return err
})
g.Go(func() error {
c, err := s.contractorRepo.WithContext(gCtx).CountByResidenceIDs(residenceIDs)
contractorsCount = c
return err
})
g.Go(func() error {
c, err := s.documentRepo.WithContext(gCtx).CountByResidenceIDs(residenceIDs)
documentsCount = c
return err
})
if err := g.Wait(); err != nil {
return nil, apperrors.Internal(err)
}
return &UsageResponse{
PropertiesCount: propertiesCount,
PropertiesCount: int64(len(residenceIDs)),
TasksCount: tasksCount,
ContractorsCount: contractorsCount,
DocumentsCount: documentsCount,
@@ -416,6 +449,12 @@ func (s *SubscriptionService) ProcessApplePurchase(ctx context.Context, userID u
return nil, apperrors.Internal(err)
}
// Tier flipped — drop cached SubscriptionStatusResponse so the next call
// returns Pro immediately instead of stale Free.
if s.cache != nil {
_ = s.cache.InvalidateSubscriptionStatusForUsers(ctx, userID)
}
return s.GetSubscription(ctx, userID)
}
@@ -473,6 +512,10 @@ func (s *SubscriptionService) ProcessGooglePurchase(ctx context.Context, userID
return nil, apperrors.Internal(err)
}
if s.cache != nil {
_ = s.cache.InvalidateSubscriptionStatusForUsers(ctx, userID)
}
return s.GetSubscription(ctx, userID)
}
@@ -481,6 +524,10 @@ func (s *SubscriptionService) CancelSubscription(ctx context.Context, userID uin
if err := s.subscriptionRepo.WithContext(ctx).SetAutoRenew(userID, false); err != nil {
return nil, apperrors.Internal(err)
}
// auto_renew flips a field surfaced in SubscriptionStatusResponse.
if s.cache != nil {
_ = s.cache.InvalidateSubscriptionStatusForUsers(ctx, userID)
}
return s.GetSubscription(ctx, userID)
}
@@ -657,6 +704,31 @@ func NewUpgradeTriggerDataResponse(t *models.UpgradeTrigger) *UpgradeTriggerData
}
}
// invalidateSubStatusForResidence drops the per-user subscription_status cache
// for every user with access to a residence (owner + members from
// residence_residence_users). Used by every mutation that changes shared data
// counts — tasks, contractors, documents — so members of a shared residence
// don't see stale `usage` numbers.
//
// Parameters:
//   - ctx: request context; passed to both the residence lookup and the cache
//     delete so neither runs detached from the caller.
//   - cache: cache service to invalidate against; nil makes this a no-op.
//   - residenceRepo: repository used to resolve the residence's user IDs; nil
//     makes this a no-op.
//   - residenceID: residence whose users should have their cache dropped.
//
// Best-effort: failures are logged but never returned. The 5-min cache TTL is
// the safety net if this ever silently fails.
func invalidateSubStatusForResidence(ctx context.Context, cache *CacheService, residenceRepo *repositories.ResidenceRepository, residenceID uint) {
	// Nothing to invalidate without a cache, and nothing to look up without a
	// repository. A best-effort helper must never panic on a missing dependency.
	if cache == nil || residenceRepo == nil {
		return
	}

	// Bind the lookup to ctx, matching every other repository call in this
	// file, instead of querying outside the caller's context.
	userIDs, err := residenceRepo.WithContext(ctx).FindUserIDsByResidence(residenceID)
	if err != nil {
		log.Warn().Err(err).Uint("residence_id", residenceID).Msg("sub_status invalidation: residence lookup failed")
		return
	}
	if len(userIDs) == 0 {
		// No users resolved; skip the cache round trip entirely.
		return
	}

	if err := cache.InvalidateSubscriptionStatusForUsers(ctx, userIDs...); err != nil {
		log.Warn().Err(err).Uint("residence_id", residenceID).Msg("sub_status invalidation: redis delete failed")
	}
}
// FeatureBenefitResponse represents a feature benefit
type FeatureBenefitResponse struct {
FeatureName string `json:"feature_name"`