Cut /api/tasks/ p99 from ~2500ms toward ~150-300ms

Stack of optimizations against the same Hetzner→Neon transatlantic link. The trace revealed every visible ms was network/proxy overhead — DB execution itself is sub-millisecond per query (verified via EXPLAIN ANALYZE: index scans on every hot path).

Connection layer:
- DB_HOST → Neon pooler endpoint (-pooler suffix). PgBouncer transaction-mode keeps backend Postgres connections warm so we no longer pay the ~110ms Postgres-startup RTT on cold queries.
- GORM pool tuned: MaxIdleConns 10→20, MaxLifetime 600s→1800s, MaxIdleTime added (default 0 = never close idle).
- Eager pool warm-up at boot via parallel pings — first user request no longer pays the ~440ms TCP+TLS+startup handshake.
- Redis maxmemory-policy noeviction → allkeys-lru. Cache writes will evict cold keys instead of erroring at the 256MB limit.

Auth layer:
- TokenCacheTTL 5min → 1 hour (Redis token cache).
- UserCacheTTL 30s → 5min (in-memory User cache, per pod).
- UserCache gains a 5,000-entry size cap (expired entries, then those nearest expiry, are evicted first) so a flood of unique users can't blow up pod RSS. ~5MB worst-case per pod.
- Token + user lookup collapsed from 2 GORM Preload queries into a single INNER JOIN. Saves 1 RTT per cold-cache request.
- Auth middleware's m.db.* calls now use db.WithContext(ctx) so the SQL spans nest under the parent HTTP request in Jaeger.

Service layer:
- TaskService.ListTasks: replaced the two-step FindResidenceIDsByUser → GetKanbanDataForMultipleResidences with a single GetKanbanDataForUser that uses a Postgres subquery for residence access. One round-trip instead of two.
- New CacheService residence-IDs cache: "residence_ids_user:<id>" with 5-min TTL. Wired into the Task/Residence/Contractor/Document services for the four hot read paths that need this list.
- Cache invalidation on every relevant mutation: CreateResidence, DeleteResidence, JoinWithCode, RemoveUser. DeleteResidence invalidates every member of the residence, not just the owner.

What this stacks up to (Hetzner→Neon, before US migration):

| Path                                | Before  | After (target) |
|-------------------------------------|---------|----------------|
| Cache-warm authed read              | ~800ms  | ~100-200ms     |
| Cache-cold authed read (1st in 1hr) | ~2500ms | ~500-700ms     |
| First request after deploy          | ~2500ms | ~700-900ms     |

The endgame US-region migration on top of this gets us to ~30-50ms warm-cache, but we're shippable at ~150ms warm right now.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-25 17:13:50 -05:00
parent 9410da7497
commit 88fb1751c7
15 changed files with 443 additions and 59 deletions
@@ -22,13 +22,22 @@ const (
 	AuthUserKey = "auth_user"
 	// AuthTokenKey is the key used to store the token in the context
 	AuthTokenKey = "auth_token"
-	// TokenCacheTTL is the duration to cache tokens in Redis
-	TokenCacheTTL = 5 * time.Minute
+	// TokenCacheTTL is the duration to cache tokens in Redis. Tokens are
+	// valid for DefaultTokenExpiryDays (90), and explicit logout invalidates
+	// the cache, so a long TTL here just means most authed requests skip the
+	// auth-token SQL query entirely.
+	TokenCacheTTL = 1 * time.Hour
 	// TokenCachePrefix is the prefix for token cache keys
 	TokenCachePrefix = "auth_token_"
 	// UserCacheTTL is how long full user records are cached in memory to
-	// avoid hitting the database on every authenticated request.
-	UserCacheTTL = 30 * time.Second
+	// avoid hitting the database on every authenticated request. Bumped from
+	// 30s — at 30s the trace showed a SELECT auth_user query on most warm
+	// requests because users aren't in cache long enough to hit twice.
+	UserCacheTTL = 5 * time.Minute
+	// UserCacheMaxSize bounds the per-pod in-memory user cache. With ~1KB
+	// per User struct, 5000 entries = ~5MB per pod. Once a Set pushes the
+	// cache past this size, entries nearest expiry are evicted (not LRU).
+	UserCacheMaxSize = 5000

 	// DefaultTokenExpiryDays is the default number of days before a token expires.
 	DefaultTokenExpiryDays = 90
@@ -47,7 +56,7 @@ func NewAuthMiddleware(db *gorm.DB, cache *services.CacheService) *AuthMiddlewar
 	return &AuthMiddleware{
 		db:              db,
 		cache:           cache,
-		userCache:       NewUserCache(UserCacheTTL),
+		userCache:       NewUserCache(UserCacheTTL, UserCacheMaxSize),
 		tokenExpiryDays: DefaultTokenExpiryDays,
 	}
 }
@@ -61,7 +70,7 @@ func NewAuthMiddlewareWithConfig(db *gorm.DB, cache *services.CacheService, cfg
 	return &AuthMiddleware{
 		db:              db,
 		cache:           cache,
-		userCache:       NewUserCache(UserCacheTTL),
+		userCache:       NewUserCache(UserCacheTTL, UserCacheMaxSize),
 		tokenExpiryDays: expiryDays,
 	}
 }
@@ -244,20 +253,83 @@ func (m *AuthMiddleware) getUserFromCache(ctx context.Context, token string) (*m
 // getUserFromDatabaseWithToken looks up the token in the database and returns
 // both the user and the auth token record (for expiry checking). The ctx is
 // threaded into the GORM session so the SQL span attaches to the request trace.
+//
+// Uses a single JOIN query instead of GORM's Preload (which issues 2 SELECTs).
+// Over a transatlantic link this saves ~110ms RTT per cache miss.
 func (m *AuthMiddleware) getUserFromDatabaseWithToken(ctx context.Context, token string) (*models.User, *models.AuthToken, error) {
-	var authToken models.AuthToken
-	if err := m.db.WithContext(ctx).Preload("User").Where("key = ?", token).First(&authToken).Error; err != nil {
+	// Flat result row: every column from auth_user prefixed `u_`, every
+	// column from user_authtoken left in its native shape. Mapping to two
+	// structs is mechanical so we don't need a struct tag soup.
+	type joinedRow struct {
+		// AuthToken columns
+		Key     string    `gorm:"column:key"`
+		Created time.Time `gorm:"column:created"`
+		UserID  uint      `gorm:"column:user_id"`
+		// User columns (prefixed to avoid collision with UserID)
+		UID         uint   `gorm:"column:u_id"`
+		UUsername   string `gorm:"column:u_username"`
+		UEmail      string `gorm:"column:u_email"`
+		UFirstName  string `gorm:"column:u_first_name"`
+		ULastName   string `gorm:"column:u_last_name"`
+		UPassword   string `gorm:"column:u_password"`
+		UIsActive   bool   `gorm:"column:u_is_active"`
+		UIsStaff    bool   `gorm:"column:u_is_staff"`
+		UIsSuper    bool   `gorm:"column:u_is_superuser"`
+		UDateJoined time.Time `gorm:"column:u_date_joined"`
+		ULastLogin  *time.Time `gorm:"column:u_last_login"`
+	}
+
+	var row joinedRow
+	err := m.db.WithContext(ctx).
+		Table("user_authtoken AS t").
+		Select(`
+			t.key, t.created, t.user_id,
+			u.id          AS u_id,
+			u.username    AS u_username,
+			u.email       AS u_email,
+			u.first_name  AS u_first_name,
+			u.last_name   AS u_last_name,
+			u.password    AS u_password,
+			u.is_active   AS u_is_active,
+			u.is_staff    AS u_is_staff,
+			u.is_superuser AS u_is_superuser,
+			u.date_joined AS u_date_joined,
+			u.last_login  AS u_last_login
+		`).
+		Joins("INNER JOIN auth_user u ON u.id = t.user_id").
+		Where("t.key = ?", token).
+		Limit(1).
+		Scan(&row).Error
+	if err != nil || row.Key == "" {
 		return nil, nil, fmt.Errorf("token not found")
 	}

-	// Check if user is active
-	if !authToken.User.IsActive {
+	user := models.User{
+		ID:         row.UID,
+		Username:   row.UUsername,
+		Email:      row.UEmail,
+		FirstName:  row.UFirstName,
+		LastName:   row.ULastName,
+		Password:   row.UPassword,
+		IsActive:   row.UIsActive,
+		IsStaff:    row.UIsStaff,
+		IsSuperuser: row.UIsSuper,
+		DateJoined: row.UDateJoined,
+		LastLogin:  row.ULastLogin,
+	}
+	authToken := models.AuthToken{
+		Key:     row.Key,
+		Created: row.Created,
+		UserID:  row.UserID,
+		User:    user,
+	}
+
+	if !user.IsActive {
 		return nil, nil, fmt.Errorf("user is inactive")
 	}

-	// Store in in-memory cache for subsequent requests
-	m.userCache.Set(&authToken.User)
-	return &authToken.User, &authToken, nil
+	m.userCache.Set(&user)
+	return &user, &authToken, nil
 }

 // getUserFromDatabase looks up the token in the database and caches the
@@ -2,6 +2,7 @@ package middleware

 import (
 	"sync"
+	"sync/atomic"
 	"time"

 	"github.com/treytartt/honeydue-api/internal/models"
@@ -16,22 +17,29 @@ type userCacheEntry struct {
 // UserCache is a concurrency-safe in-memory cache for User records, keyed by
 // user ID. Entries expire after a configurable TTL. The cache uses a sync.Map
 // for lock-free reads on the hot path, with periodic lazy eviction of stale
-// entries during Set operations.
+// entries during Set operations and a soft size cap to bound memory.
 type UserCache struct {
-	store   sync.Map
-	ttl     time.Duration
-	lastGC  time.Time
-	gcMu    sync.Mutex
-	gcEvery time.Duration
+	store    sync.Map
+	ttl      time.Duration
+	lastGC   time.Time
+	gcMu     sync.Mutex
+	gcEvery  time.Duration
+	size     atomic.Int64 // approximate count; sync.Map has no Len()
+	maxSize  int64
 }

 // NewUserCache creates a UserCache with the given TTL for entries.
-func NewUserCache(ttl time.Duration) *UserCache {
-	return &UserCache{
+// maxSize is the soft upper bound on the number of cached users; a Set
+// that pushes the count past it runs an eviction sweep after storing.
+// Pass <=0 for no size cap.
+func NewUserCache(ttl time.Duration, maxSize int) *UserCache {
+	c := &UserCache{
 		ttl:     ttl,
 		lastGC:  time.Now(),
 		gcEvery: 2 * time.Minute,
+		maxSize: int64(maxSize),
 	}
+	return c
 }

 // Get returns a cached user by ID, or nil if not found or expired.
@@ -42,7 +50,9 @@ func (c *UserCache) Get(userID uint) *models.User {
 	}
 	entry := val.(*userCacheEntry)
 	if time.Now().After(entry.expiresAt) {
-		c.store.Delete(userID)
+		if _, loaded := c.store.LoadAndDelete(userID); loaded {
+			c.size.Add(-1)
+		}
 		return nil
 	}
 	// Return a shallow copy so callers cannot mutate the cached value.
@@ -51,20 +61,71 @@ func (c *UserCache) Get(userID uint) *models.User {
 }

 // Set stores a user in the cache. It also triggers a background garbage-
-// collection sweep if enough time has elapsed since the last one.
+// collection sweep if enough time has elapsed since the last one or if the
+// cache has grown past maxSize.
 func (c *UserCache) Set(user *models.User) {
 	// Store a copy to prevent external mutation of the cached object.
 	copied := *user
-	c.store.Store(user.ID, &userCacheEntry{
+	if _, loaded := c.store.Swap(user.ID, &userCacheEntry{
 		user:      &copied,
 		expiresAt: time.Now().Add(c.ttl),
-	})
+	}); !loaded {
+		c.size.Add(1)
+	}
+	if c.maxSize > 0 && c.size.Load() > c.maxSize {
+		c.evictUntilUnderCap()
+	}
 	c.maybeGC()
 }

+// evictUntilUnderCap drops expired entries, then (if still over maxSize) the
+// entries nearest expiry. Pass one is O(n); pass two insertion-sorts all
+// entries, O(n²) worst case. Runs only on cap breach, which should be rare.
+func (c *UserCache) evictUntilUnderCap() {
+	now := time.Now()
+	// First pass: drop expired entries.
+	c.store.Range(func(key, value any) bool {
+		entry := value.(*userCacheEntry)
+		if now.After(entry.expiresAt) {
+			if _, loaded := c.store.LoadAndDelete(key); loaded {
+				c.size.Add(-1)
+			}
+		}
+		return c.size.Load() > c.maxSize
+	})
+	// Second pass: if still over cap, drop entries closest to expiry.
+	if c.size.Load() <= c.maxSize {
+		return
+	}
+	type scored struct {
+		key       any
+		expiresAt time.Time
+	}
+	candidates := make([]scored, 0, 64)
+	c.store.Range(func(key, value any) bool {
+		entry := value.(*userCacheEntry)
+		candidates = append(candidates, scored{key, entry.expiresAt})
+		return true
+	})
+	// Sort by expiry ascending — drop closest-to-expiry first.
+	for i := 1; i < len(candidates); i++ {
+		for j := i; j > 0 && candidates[j-1].expiresAt.After(candidates[j].expiresAt); j-- {
+			candidates[j-1], candidates[j] = candidates[j], candidates[j-1]
+		}
+	}
+	overshoot := int(c.size.Load() - c.maxSize)
+	for i := 0; i < overshoot && i < len(candidates); i++ {
+		if _, loaded := c.store.LoadAndDelete(candidates[i].key); loaded {
+			c.size.Add(-1)
+		}
+	}
+}
+
 // Invalidate removes a user from the cache by ID.
 func (c *UserCache) Invalidate(userID uint) {
-	c.store.Delete(userID)
+	if _, loaded := c.store.LoadAndDelete(userID); loaded {
+		c.size.Add(-1)
+	}
 }

 // maybeGC lazily sweeps expired entries at most once per gcEvery interval.
@@ -81,7 +142,9 @@ func (c *UserCache) maybeGC() {
 	c.store.Range(func(key, value any) bool {
 		entry := value.(*userCacheEntry)
 		if now.After(entry.expiresAt) {
-			c.store.Delete(key)
+			if _, loaded := c.store.LoadAndDelete(key); loaded {
+				c.size.Add(-1)
+			}
 		}
 		return true
 	})
@@ -11,7 +11,7 @@ import (
 )

 func TestUserCache_SetAndGet(t *testing.T) {
-	cache := NewUserCache(1 * time.Minute)
+	cache := NewUserCache(1 * time.Minute, 0)

 	user := &models.User{Username: "testuser", Email: "test@test.com"}
 	user.ID = 1
@@ -25,7 +25,7 @@ func TestUserCache_SetAndGet(t *testing.T) {
 }

 func TestUserCache_GetNonExistent_ReturnsNil(t *testing.T) {
-	cache := NewUserCache(1 * time.Minute)
+	cache := NewUserCache(1 * time.Minute, 0)

 	cached := cache.Get(999)
 	assert.Nil(t, cached)
@@ -33,7 +33,7 @@ func TestUserCache_GetNonExistent_ReturnsNil(t *testing.T) {

 func TestUserCache_Expired_ReturnsNil(t *testing.T) {
 	// Very short TTL
-	cache := NewUserCache(1 * time.Millisecond)
+	cache := NewUserCache(1 * time.Millisecond, 0)

 	user := &models.User{Username: "expiring_user"}
 	user.ID = 1
@@ -48,7 +48,7 @@ func TestUserCache_Expired_ReturnsNil(t *testing.T) {
 }

 func TestUserCache_Invalidate(t *testing.T) {
-	cache := NewUserCache(1 * time.Minute)
+	cache := NewUserCache(1 * time.Minute, 0)

 	user := &models.User{Username: "to_invalidate"}
 	user.ID = 1
@@ -66,7 +66,7 @@ func TestUserCache_Invalidate(t *testing.T) {
 }

 func TestUserCache_ReturnsCopy_NotOriginal(t *testing.T) {
-	cache := NewUserCache(1 * time.Minute)
+	cache := NewUserCache(1 * time.Minute, 0)

 	user := &models.User{Username: "original"}
 	user.ID = 1
@@ -85,7 +85,7 @@ func TestUserCache_ReturnsCopy_NotOriginal(t *testing.T) {
 }

 func TestUserCache_SetCopiesInput(t *testing.T) {
-	cache := NewUserCache(1 * time.Minute)
+	cache := NewUserCache(1 * time.Minute, 0)

 	user := &models.User{Username: "original"}
 	user.ID = 1
@@ -102,7 +102,7 @@ func TestUserCache_SetCopiesInput(t *testing.T) {
 }

 func TestUserCache_MultipleUsers(t *testing.T) {
-	cache := NewUserCache(1 * time.Minute)
+	cache := NewUserCache(1 * time.Minute, 0)

 	user1 := &models.User{Username: "user1"}
 	user1.ID = 1
@@ -122,7 +122,7 @@ func TestUserCache_MultipleUsers(t *testing.T) {
 }

 func TestUserCache_OverwriteEntry(t *testing.T) {
-	cache := NewUserCache(1 * time.Minute)
+	cache := NewUserCache(1 * time.Minute, 0)

 	user := &models.User{Username: "original"}
 	user.ID = 1