Cut /api/tasks/ p99 from ~2500ms toward ~150-300ms
Backend CI / Test (push) Has been cancelled
Backend CI / Contract Tests (push) Has been cancelled
Backend CI / Build (push) Has been cancelled
Backend CI / Lint (push) Has been cancelled
Backend CI / Secret Scanning (push) Has been cancelled

Stack of optimizations against the same Hetzner→Neon transatlantic link.
The trace revealed every visible ms was network/proxy overhead — DB
execution itself is sub-millisecond per query (verified via EXPLAIN
ANALYZE: index scans on every hot path).

Connection layer:
- DB_HOST → Neon pooler endpoint (-pooler suffix). PgBouncer
  transaction-mode keeps backend Postgres connections warm so we no
  longer pay the ~110ms Postgres-startup RTT on cold queries.
- GORM pool tuned: MaxIdleConns 10→20, MaxLifetime 600s→1800s, and a
  MaxIdleTime setting added (defaults to 0, i.e. idle connections are
  never closed).
- Eager pool warm-up at boot via parallel pings — first user request
  no longer pays the ~440ms TCP+TLS+startup handshake.
- Redis maxmemory-policy noeviction → allkeys-lru. Cache writes will
  evict cold keys instead of erroring at the 256MB limit.

Auth layer:
- TokenCacheTTL 5min → 1 hour (Redis token cache).
- UserCacheTTL 30s → 5min (in-memory User cache, per pod).
- UserCache gains a 5,000-entry LRU cap so a flood of unique users
  can't blow up pod RSS. ~5MB worst-case per pod.
- Token + user lookup collapsed from 2 GORM Preload queries into a
  single INNER JOIN. Saves 1 RTT per cold-cache request.
- Auth middleware's m.db.* calls now go through db.WithContext(ctx) so
  the SQL spans nest under the parent HTTP request in Jaeger.

Service layer:
- TaskService.ListTasks: replaced two-step
  FindResidenceIDsByUser → GetKanbanDataForMultipleResidences
  with a single GetKanbanDataForUser that uses a Postgres subquery
  for residence-access. One round-trip instead of two.
- New CacheService residence-IDs cache: "residence_ids_user:<id>"
  with 5-min TTL. Wired into Task/Residence/Contractor/Document
  services for the four hot read paths that need this list.
- Cache invalidation on every relevant mutation: CreateResidence,
  DeleteResidence, JoinWithCode, RemoveUser. DeleteResidence
  invalidates every member of the residence, not just the owner.

What this stacks up to (Hetzner→Neon, before US migration):
  Path                                 Before        After (target)
  Cache-warm authed read               ~800ms        ~100-200ms
  Cache-cold authed read (1st in 1hr)  ~2500ms       ~500-700ms
  First request after deploy           ~2500ms       ~700-900ms

The endgame US-region migration on top of this gets us to ~30-50ms
warm-cache, but we're shippable at ~150ms warm right now.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-04-25 17:13:50 -05:00
parent 9410da7497
commit 88fb1751c7
15 changed files with 443 additions and 59 deletions
+38 -1
View File
@@ -37,9 +37,16 @@ type ResidenceService struct {
userRepo *repositories.UserRepository
taskRepo *repositories.TaskRepository
subscriptionService *SubscriptionService
cache *CacheService
config *config.Config
}
// SetCacheService attaches the Redis-backed CacheService used for per-user
// residence-ID lookups and for invalidating those entries after mutations.
//
// The cache is optional: passing nil (or never calling this method) leaves
// the service uncached, in which case lookups go straight to the database
// and the nil-guarded invalidation steps are skipped.
func (s *ResidenceService) SetCacheService(cache *CacheService) {
	s.cache = cache
}
// NewResidenceService creates a new residence service
func NewResidenceService(residenceRepo *repositories.ResidenceRepository, userRepo *repositories.UserRepository, cfg *config.Config) *ResidenceService {
return &ResidenceService{
@@ -160,7 +167,7 @@ func (s *ResidenceService) GetMyResidences(ctx context.Context, userID uint, now
// Clients should use calculateSummaryFromKanban() instead.
func (s *ResidenceService) GetSummary(ctx context.Context, userID uint, now time.Time) (*responses.TotalSummary, error) {
// Get residence IDs (lightweight - no preloads)
residenceIDs, err := s.residenceRepo.WithContext(ctx).FindResidenceIDsByUser(userID)
residenceIDs, err := cachedResidenceIDsForUser(ctx, s.cache, s.residenceRepo, userID)
if err != nil {
return nil, apperrors.Internal(err)
}
@@ -257,6 +264,11 @@ func (s *ResidenceService) CreateResidence(ctx context.Context, req *requests.Cr
if err := s.residenceRepo.WithContext(ctx).Create(residence); err != nil {
return nil, apperrors.Internal(err)
}
if s.cache != nil {
// Owner now has a new residence — drop cached IDs so the next
// list-residences call doesn't omit it.
_ = s.cache.InvalidateResidenceIDsForUsers(ctx, ownerID)
}
// Reload with relations
residence, err := s.residenceRepo.WithContext(ctx).FindByID(residence.ID)
@@ -419,9 +431,26 @@ func (s *ResidenceService) DeleteResidence(ctx context.Context, residenceID, use
return nil, apperrors.Forbidden("error.not_residence_owner")
}
// Capture all member IDs before delete so we can invalidate their caches.
var affectedUserIDs []uint
if s.cache != nil {
if members, _ := s.residenceRepo.WithContext(ctx).GetResidenceUsers(residenceID); members != nil {
affectedUserIDs = make([]uint, 0, len(members)+1)
affectedUserIDs = append(affectedUserIDs, userID) // owner
for _, m := range members {
if m.ID != userID {
affectedUserIDs = append(affectedUserIDs, m.ID)
}
}
}
}
if err := s.residenceRepo.WithContext(ctx).Delete(residenceID); err != nil {
return nil, apperrors.Internal(err)
}
if s.cache != nil && len(affectedUserIDs) > 0 {
_ = s.cache.InvalidateResidenceIDsForUsers(ctx, affectedUserIDs...)
}
// Get updated summary
summary := s.getSummaryForUser(userID)
@@ -548,6 +577,10 @@ func (s *ResidenceService) JoinWithCode(ctx context.Context, code string, userID
if err := s.residenceRepo.WithContext(ctx).AddUser(shareCode.ResidenceID, userID); err != nil {
return nil, apperrors.Internal(err)
}
if s.cache != nil {
// The joining user's residence-IDs cache is now stale.
_ = s.cache.InvalidateResidenceIDsForUsers(ctx, userID)
}
// Mark share code as used (one-time use)
if err := s.residenceRepo.WithContext(ctx).DeactivateShareCode(shareCode.ID); err != nil {
@@ -629,6 +662,10 @@ func (s *ResidenceService) RemoveUser(ctx context.Context, residenceID, userIDTo
if err := s.residenceRepo.WithContext(ctx).RemoveUser(residenceID, userIDToRemove); err != nil {
return apperrors.Internal(err)
}
if s.cache != nil {
// The removed user's residence-IDs cache is now stale.
_ = s.cache.InvalidateResidenceIDsForUsers(ctx, userIDToRemove)
}
return nil
}