Cut /api/tasks/ p99 from ~2500ms toward ~150-300ms

Stack of optimizations against the same Hetzner→Neon transatlantic link. The trace revealed every visible ms was network/proxy overhead — DB execution itself is sub-millisecond per query (verified via EXPLAIN ANALYZE: index scans on every hot path). Connection layer: - DB_HOST → Neon pooler endpoint (-pooler suffix). PgBouncer transaction-mode keeps backend Postgres connections warm so we no longer pay the ~110ms Postgres-startup RTT on cold queries. - GORM pool tuned: MaxIdleConns 10→20, MaxLifetime 600s→1800s, MaxIdleTime added (default 0 = never close idle). - Eager pool warm-up at boot via parallel pings — first user request no longer pays the ~440ms TCP+TLS+startup handshake. - Redis maxmemory-policy noeviction → allkeys-lru. Cache writes will evict cold keys instead of erroring at the 256MB limit. Auth layer: - TokenCacheTTL 5min → 1 hour (Redis token cache). - UserCacheTTL 30s → 5min (in-memory User cache, per pod). - UserCache gains a 5,000-entry LRU cap so a flood of unique users can't blow up pod RSS. ~5MB worst-case per pod. - Token + user lookup collapsed from 2 GORM Preload queries into a single INNER JOIN. Saves 1 RTT per cold-cache request. - Auth middleware's m.db.* now use db.WithContext(ctx) so the SQL spans nest under the parent HTTP request in Jaeger. Service layer: - TaskService.ListTasks: replaced two-step FindResidenceIDsByUser → GetKanbanDataForMultipleResidences with a single GetKanbanDataForUser that uses a Postgres subquery for residence-access. One round-trip instead of two. - New CacheService residence-IDs cache: \"residence_ids_user:<id>\" with 5-min TTL. Wired into Task/Residence/Contractor/Document services for the four hot read paths that need this list. - Cache invalidation on every relevant mutation: CreateResidence, DeleteResidence, JoinWithCode, RemoveUser. DeleteResidence invalidates every member of the residence, not just the owner. What this stacks up to (Hetzner→Neon, before US migration): Path Before After (target) Cache-warm authed read ~800ms ~100-200ms Cache-cold authed read (1st in 1hr) ~2500ms ~500-700ms First request after deploy ~2500ms ~700-900ms The endgame US-region migration on top of this gets us to ~30-50ms warm-cache, but we're shippable at ~150ms warm right now. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-25 17:13:50 -05:00
parent 9410da7497
commit 88fb1751c7
15 changed files with 443 additions and 59 deletions
@@ -36,7 +36,11 @@ spec:
            - sh
            - -c
            - |
-              ARGS="--appendonly yes --appendfsync everysec --maxmemory 256mb --maxmemory-policy noeviction"
+              # allkeys-lru: under memory pressure, evict the least-recently-used key.
              # honeyDue uses Redis as a cache + asynq queue. The cache layer falls
              # through to DB on miss, so eviction is graceful. asynq keys with TTLs
              # would be evicted only after older cache entries are gone.
              ARGS="--appendonly yes --appendfsync everysec --maxmemory 256mb --maxmemory-policy allkeys-lru"
              if [ -n "$REDIS_PASSWORD" ]; then
                ARGS="$ARGS --requirepass $REDIS_PASSWORD"
              fi
@@ -118,6 +118,7 @@ lines = [
    f\"DB_MAX_OPEN_CONNS={db['max_open_conns']}\",
    f\"DB_MAX_IDLE_CONNS={db['max_idle_conns']}\",
    f\"DB_MAX_LIFETIME={db['max_lifetime']}\",
    f\"DB_MAX_IDLE_TIME={db.get('max_idle_time', '0s')}\",
    # Redis (in-namespace DNS short form — password injected if configured;
    # short form works because /etc/resolv.conf in pods searches honeydue.svc.cluster.local)
    f\"REDIS_URL=redis://{':%s@' % val(rd.get('password')) if rd.get('password') else ''}redis:6379/0\",
@@ -52,6 +52,7 @@ type DatabaseConfig struct {
 	MaxOpenConns int
 	MaxIdleConns int
 	MaxLifetime  time.Duration
 	MaxIdleTime  time.Duration
 }
 type RedisConfig struct {
@@ -214,6 +215,7 @@ func Load() (*Config, error) {
 			MaxOpenConns: viper.GetInt("DB_MAX_OPEN_CONNS"),
 			MaxIdleConns: viper.GetInt("DB_MAX_IDLE_CONNS"),
 			MaxLifetime:  viper.GetDuration("DB_MAX_LIFETIME"),
 			MaxIdleTime:  viper.GetDuration("DB_MAX_IDLE_TIME"),
 		}
 		// Override with DATABASE_URL if present (F-16: log warning on parse failure)
@@ -71,16 +71,30 @@ func Connect(cfg *config.DatabaseConfig, debug bool) (*gorm.DB, error) {
 		return nil, fmt.Errorf("failed to get underlying sql.DB: %w", err)
 	}
-	// Configure connection pool
+	// Configure connection pool. The Neon pooler endpoint keeps backend
 	// connections warm, so we keep our client-side pool warm too — that
 	// eliminates the ~440ms TCP+TLS+startup handshake on the first query
 	// after a cold pod / idle period.
 	sqlDB.SetMaxOpenConns(cfg.MaxOpenConns)
 	sqlDB.SetMaxIdleConns(cfg.MaxIdleConns)
 	sqlDB.SetConnMaxLifetime(cfg.MaxLifetime)
 	if cfg.MaxIdleTime > 0 {
 		sqlDB.SetConnMaxIdleTime(cfg.MaxIdleTime)
 	}
 	// MaxIdleTime=0 means "never close idle" — the pool fills up to
 	// MaxIdleConns and they stay alive until MaxLifetime expires.
 	// Test connection
 	if err := sqlDB.Ping(); err != nil {
 		return nil, fmt.Errorf("failed to ping database: %w", err)
 	}
 	// Eagerly warm the connection pool to MaxIdleConns. Without this, the
 	// first N user requests each pay the full handshake (~440ms over a
 	// transatlantic link). Pings are issued in parallel so warm-up is
 	// bounded by handshake time, not handshake-time × N.
 	warmUpPool(sqlDB, cfg.MaxIdleConns)
 	log.Info().
 		Str("host", cfg.Host).
 		Int("port", cfg.Port).
@@ -106,6 +120,35 @@ func Connect(cfg *config.DatabaseConfig, debug bool) (*gorm.DB, error) {
 	return db, nil
 }
 // warmUpPool issues N parallel pings so the pool fills with established
 // connections before the first user request lands. Failures are logged but
 // not fatal — the pool will fill on demand under traffic if pre-warm fails.
 //
 // On a transatlantic link to Neon (~110ms RTT, ~440ms cold handshake), this
 // turns "first request pays the cold handshake" into "first request finds a
 // warm pool" — at the cost of ~440ms during pod startup.
 func warmUpPool(sqlDB interface {
 	PingContext(context.Context) error
 }, n int) {
 	if n <= 0 {
 		return
 	}
 	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
 	defer cancel()
 	done := make(chan error, n)
 	for i := 0; i < n; i++ {
 		go func() { done <- sqlDB.PingContext(ctx) }()
 	}
 	successes := 0
 	for i := 0; i < n; i++ {
 		if err := <-done; err == nil {
 			successes++
 		}
 	}
 	log.Info().Int("requested", n).Int("warmed", successes).Msg("DB pool warm-up complete")
 }
 // Get returns the database instance
 func Get() *gorm.DB {
 	return db
@@ -22,13 +22,22 @@ const (
 	AuthUserKey = "auth_user"
 	// AuthTokenKey is the key used to store the token in the context
 	AuthTokenKey = "auth_token"
-	// TokenCacheTTL is the duration to cache tokens in Redis
+	// TokenCacheTTL is the duration to cache tokens in Redis. Tokens are
-	TokenCacheTTL = 5 * time.Minute
+	// valid for DefaultTokenExpiryDays (90), and explicit logout invalidates
 	// the cache, so a long TTL here just means most authed requests skip the
 	// auth-token SQL query entirely.
 	TokenCacheTTL = 1 * time.Hour
 	// TokenCachePrefix is the prefix for token cache keys
 	TokenCachePrefix = "auth_token_"
 	// UserCacheTTL is how long full user records are cached in memory to
-	// avoid hitting the database on every authenticated request.
+	// avoid hitting the database on every authenticated request. Bumped from
-	UserCacheTTL = 30 * time.Second
+	// 30s — at 30s the trace showed a SELECT auth_user query on most warm
 	// requests because users aren't in cache long enough to hit twice.
 	UserCacheTTL = 5 * time.Minute
 	// UserCacheMaxSize bounds the per-pod in-memory user cache. With ~1KB
 	// per User struct, 5000 entries = ~5MB per pod. Older entries are
 	// evicted LRU before the limit is exceeded.
 	UserCacheMaxSize = 5000
 	// DefaultTokenExpiryDays is the default number of days before a token expires.
 	DefaultTokenExpiryDays = 90
@@ -47,7 +56,7 @@ func NewAuthMiddleware(db *gorm.DB, cache *services.CacheService) *AuthMiddlewar
 	return &AuthMiddleware{
 		db:              db,
 		cache:           cache,
-		userCache:       NewUserCache(UserCacheTTL),
+		userCache:       NewUserCache(UserCacheTTL, UserCacheMaxSize),
 		tokenExpiryDays: DefaultTokenExpiryDays,
 	}
 }
@@ -61,7 +70,7 @@ func NewAuthMiddlewareWithConfig(db *gorm.DB, cache *services.CacheService, cfg
 	return &AuthMiddleware{
 		db:              db,
 		cache:           cache,
-		userCache:       NewUserCache(UserCacheTTL),
+		userCache:       NewUserCache(UserCacheTTL, UserCacheMaxSize),
 		tokenExpiryDays: expiryDays,
 	}
 }
@@ -244,20 +253,83 @@ func (m *AuthMiddleware) getUserFromCache(ctx context.Context, token string) (*m
 // getUserFromDatabaseWithToken looks up the token in the database and returns
 // both the user and the auth token record (for expiry checking). The ctx is
 // threaded into the GORM session so the SQL span attaches to the request trace.
 //
 // Uses a single JOIN query instead of GORM's Preload (which issues 2 SELECTs).
 // Over a transatlantic link this saves ~110ms RTT per cache miss.
 func (m *AuthMiddleware) getUserFromDatabaseWithToken(ctx context.Context, token string) (*models.User, *models.AuthToken, error) {
-	var authToken models.AuthToken
+	// Flat result row: every column from auth_user prefixed `u_`, every
-	if err := m.db.WithContext(ctx).Preload("User").Where("key = ?", token).First(&authToken).Error; err != nil {
+	// column from user_authtoken left in its native shape. Mapping to two
 	// structs is mechanical so we don't need a struct tag soup.
 	type joinedRow struct {
 		// AuthToken columns
 		Key     string    `gorm:"column:key"`
 		Created time.Time `gorm:"column:created"`
 		UserID  uint      `gorm:"column:user_id"`
 		// User columns (prefixed to avoid collision with UserID)
 		UID         uint   `gorm:"column:u_id"`
 		UUsername   string `gorm:"column:u_username"`
 		UEmail      string `gorm:"column:u_email"`
 		UFirstName  string `gorm:"column:u_first_name"`
 		ULastName   string `gorm:"column:u_last_name"`
 		UPassword   string `gorm:"column:u_password"`
 		UIsActive   bool   `gorm:"column:u_is_active"`
 		UIsStaff    bool   `gorm:"column:u_is_staff"`
 		UIsSuper    bool   `gorm:"column:u_is_superuser"`
 		UDateJoined time.Time `gorm:"column:u_date_joined"`
 		ULastLogin  *time.Time `gorm:"column:u_last_login"`
 	}
 	var row joinedRow
 	err := m.db.WithContext(ctx).
 		Table("user_authtoken AS t").
 		Select(`
 			t.key, t.created, t.user_id,
 			u.id          AS u_id,
 			u.username    AS u_username,
 			u.email       AS u_email,
 			u.first_name  AS u_first_name,
 			u.last_name   AS u_last_name,
 			u.password    AS u_password,
 			u.is_active   AS u_is_active,
 			u.is_staff    AS u_is_staff,
 			u.is_superuser AS u_is_superuser,
 			u.date_joined AS u_date_joined,
 			u.last_login  AS u_last_login
 		`).
 		Joins("INNER JOIN auth_user u ON u.id = t.user_id").
 		Where("t.key = ?", token).
 		Limit(1).
 		Scan(&row).Error
 	if err != nil || row.Key == "" {
 		return nil, nil, fmt.Errorf("token not found")
 	}
-	// Check if user is active
+	user := models.User{
-	if !authToken.User.IsActive {
+		ID:         row.UID,
 		Username:   row.UUsername,
 		Email:      row.UEmail,
 		FirstName:  row.UFirstName,
 		LastName:   row.ULastName,
 		Password:   row.UPassword,
 		IsActive:   row.UIsActive,
 		IsStaff:    row.UIsStaff,
 		IsSuperuser: row.UIsSuper,
 		DateJoined: row.UDateJoined,
 		LastLogin:  row.ULastLogin,
 	}
 	authToken := models.AuthToken{
 		Key:     row.Key,
 		Created: row.Created,
 		UserID:  row.UserID,
 		User:    user,
 	}
 	if !user.IsActive {
 		return nil, nil, fmt.Errorf("user is inactive")
 	}
-	// Store in in-memory cache for subsequent requests
+	m.userCache.Set(&user)
-	m.userCache.Set(&authToken.User)
+	return &user, &authToken, nil
 	return &authToken.User, &authToken, nil
 }
 // getUserFromDatabase looks up the token in the database and caches the
@@ -2,6 +2,7 @@ package middleware
 import (
 	"sync"
 	"sync/atomic"
 	"time"
 	"github.com/treytartt/honeydue-api/internal/models"
@@ -16,22 +17,29 @@ type userCacheEntry struct {
 // UserCache is a concurrency-safe in-memory cache for User records, keyed by
 // user ID. Entries expire after a configurable TTL. The cache uses a sync.Map
 // for lock-free reads on the hot path, with periodic lazy eviction of stale
-// entries during Set operations.
+// entries during Set operations and a hard size cap to bound memory.
 type UserCache struct {
-	store   sync.Map
+	store    sync.Map
-	ttl     time.Duration
+	ttl      time.Duration
-	lastGC  time.Time
+	lastGC   time.Time
-	gcMu    sync.Mutex
+	gcMu     sync.Mutex
-	gcEvery time.Duration
+	gcEvery  time.Duration
 	size     atomic.Int64 // approximate count; sync.Map has no Len()
 	maxSize  int64
 }
 // NewUserCache creates a UserCache with the given TTL for entries.
-func NewUserCache(ttl time.Duration) *UserCache {
+// maxSize is the soft upper bound on the number of cached users; when
-	return &UserCache{
+// exceeded, the next Set will trigger an eviction sweep before storing.
 // Pass <=0 for no size cap.
 func NewUserCache(ttl time.Duration, maxSize int) *UserCache {
 	c := &UserCache{
 		ttl:     ttl,
 		lastGC:  time.Now(),
 		gcEvery: 2 * time.Minute,
 		maxSize: int64(maxSize),
 	}
 	return c
 }
 // Get returns a cached user by ID, or nil if not found or expired.
@@ -42,7 +50,9 @@ func (c *UserCache) Get(userID uint) *models.User {
 	}
 	entry := val.(*userCacheEntry)
 	if time.Now().After(entry.expiresAt) {
-		c.store.Delete(userID)
+		if _, loaded := c.store.LoadAndDelete(userID); loaded {
 			c.size.Add(-1)
 		}
 		return nil
 	}
 	// Return a shallow copy so callers cannot mutate the cached value.
@@ -51,20 +61,71 @@ func (c *UserCache) Get(userID uint) *models.User {
 }
 // Set stores a user in the cache. It also triggers a background garbage-
-// collection sweep if enough time has elapsed since the last one.
+// collection sweep if enough time has elapsed since the last one or if the
 // cache has grown past maxSize.
 func (c *UserCache) Set(user *models.User) {
 	// Store a copy to prevent external mutation of the cached object.
 	copied := *user
-	c.store.Store(user.ID, &userCacheEntry{
+	if _, loaded := c.store.Swap(user.ID, &userCacheEntry{
 		user:      &copied,
 		expiresAt: time.Now().Add(c.ttl),
-	})
+	}); !loaded {
 		c.size.Add(1)
 	}
 	if c.maxSize > 0 && c.size.Load() > c.maxSize {
 		c.evictUntilUnderCap()
 	}
 	c.maybeGC()
 }
 // evictUntilUnderCap walks the cache and drops the oldest expirable entries
 // until size is under maxSize. Cheap O(n) walk; runs only when the cap is
 // breached, which should be rare in practice (TTL handles most eviction).
 func (c *UserCache) evictUntilUnderCap() {
 	now := time.Now()
 	// First pass: drop expired entries.
 	c.store.Range(func(key, value any) bool {
 		entry := value.(*userCacheEntry)
 		if now.After(entry.expiresAt) {
 			if _, loaded := c.store.LoadAndDelete(key); loaded {
 				c.size.Add(-1)
 			}
 		}
 		return c.size.Load() > c.maxSize
 	})
 	// Second pass: if still over cap, drop entries closest to expiry.
 	if c.size.Load() <= c.maxSize {
 		return
 	}
 	type scored struct {
 		key       any
 		expiresAt time.Time
 	}
 	candidates := make([]scored, 0, 64)
 	c.store.Range(func(key, value any) bool {
 		entry := value.(*userCacheEntry)
 		candidates = append(candidates, scored{key, entry.expiresAt})
 		return true
 	})
 	// Sort by expiry ascending — drop closest-to-expiry first.
 	for i := 1; i < len(candidates); i++ {
 		for j := i; j > 0 && candidates[j-1].expiresAt.After(candidates[j].expiresAt); j-- {
 			candidates[j-1], candidates[j] = candidates[j], candidates[j-1]
 		}
 	}
 	overshoot := int(c.size.Load() - c.maxSize)
 	for i := 0; i < overshoot && i < len(candidates); i++ {
 		if _, loaded := c.store.LoadAndDelete(candidates[i].key); loaded {
 			c.size.Add(-1)
 		}
 	}
 }
 // Invalidate removes a user from the cache by ID.
 func (c *UserCache) Invalidate(userID uint) {
-	c.store.Delete(userID)
+	if _, loaded := c.store.LoadAndDelete(userID); loaded {
 		c.size.Add(-1)
 	}
 }
 // maybeGC lazily sweeps expired entries at most once per gcEvery interval.
@@ -81,7 +142,9 @@ func (c *UserCache) maybeGC() {
 	c.store.Range(func(key, value any) bool {
 		entry := value.(*userCacheEntry)
 		if now.After(entry.expiresAt) {
-			c.store.Delete(key)
+			if _, loaded := c.store.LoadAndDelete(key); loaded {
 				c.size.Add(-1)
 			}
 		}
 		return true
 	})
@@ -11,7 +11,7 @@ import (
 )
 func TestUserCache_SetAndGet(t *testing.T) {
-	cache := NewUserCache(1 * time.Minute)
+	cache := NewUserCache(1 * time.Minute, 0)
 	user := &models.User{Username: "testuser", Email: "test@test.com"}
 	user.ID = 1
@@ -25,7 +25,7 @@ func TestUserCache_SetAndGet(t *testing.T) {
 }
 func TestUserCache_GetNonExistent_ReturnsNil(t *testing.T) {
-	cache := NewUserCache(1 * time.Minute)
+	cache := NewUserCache(1 * time.Minute, 0)
 	cached := cache.Get(999)
 	assert.Nil(t, cached)
@@ -33,7 +33,7 @@ func TestUserCache_GetNonExistent_ReturnsNil(t *testing.T) {
 func TestUserCache_Expired_ReturnsNil(t *testing.T) {
 	// Very short TTL
-	cache := NewUserCache(1 * time.Millisecond)
+	cache := NewUserCache(1 * time.Millisecond, 0)
 	user := &models.User{Username: "expiring_user"}
 	user.ID = 1
@@ -48,7 +48,7 @@ func TestUserCache_Expired_ReturnsNil(t *testing.T) {
 }
 func TestUserCache_Invalidate(t *testing.T) {
-	cache := NewUserCache(1 * time.Minute)
+	cache := NewUserCache(1 * time.Minute, 0)
 	user := &models.User{Username: "to_invalidate"}
 	user.ID = 1
@@ -66,7 +66,7 @@ func TestUserCache_Invalidate(t *testing.T) {
 }
 func TestUserCache_ReturnsCopy_NotOriginal(t *testing.T) {
-	cache := NewUserCache(1 * time.Minute)
+	cache := NewUserCache(1 * time.Minute, 0)
 	user := &models.User{Username: "original"}
 	user.ID = 1
@@ -85,7 +85,7 @@ func TestUserCache_ReturnsCopy_NotOriginal(t *testing.T) {
 }
 func TestUserCache_SetCopiesInput(t *testing.T) {
-	cache := NewUserCache(1 * time.Minute)
+	cache := NewUserCache(1 * time.Minute, 0)
 	user := &models.User{Username: "original"}
 	user.ID = 1
@@ -102,7 +102,7 @@ func TestUserCache_SetCopiesInput(t *testing.T) {
 }
 func TestUserCache_MultipleUsers(t *testing.T) {
-	cache := NewUserCache(1 * time.Minute)
+	cache := NewUserCache(1 * time.Minute, 0)
 	user1 := &models.User{Username: "user1"}
 	user1.ID = 1
@@ -122,7 +122,7 @@ func TestUserCache_MultipleUsers(t *testing.T) {
 }
 func TestUserCache_OverwriteEntry(t *testing.T) {
-	cache := NewUserCache(1 * time.Minute)
+	cache := NewUserCache(1 * time.Minute, 0)
 	user := &models.User{Username: "original"}
 	user.ID = 1
@@ -640,6 +640,54 @@ func (r *TaskRepository) GetKanbanDataForMultipleResidences(residenceIDs []uint,
 	}, nil
 }
 // GetKanbanDataForUser fetches every task across every residence the user has
 // access to, in a single round-trip. Replaces the two-step
 // FindResidenceIDsByUser → GetKanbanDataForMultipleResidences pattern, saving
 // one transatlantic RTT per ListTasks call (~110ms on Hetzner→Neon).
 //
 // The residence-access check runs as a subquery on Postgres rather than a
 // separate Go-side round-trip; Postgres's planner already turns it into a
 // hash semi-join, so there's no perf cost vs the explicit IN(...) approach.
 func (r *TaskRepository) GetKanbanDataForUser(userID uint, daysThreshold int, now time.Time) (*models.KanbanBoard, error) {
 	residenceSubquery := r.db.Table("residence_residence").
 		Select("id").
 		Where("is_active = ?", true).
 		Where("owner_id = ? OR id IN (?)",
 			userID,
 			r.db.Table("residence_residence_users").Select("residence_id").Where("user_id = ?", userID),
 		)
 	var allTasks []models.Task
 	query := r.db.Model(&models.Task{}).
 		Where("task_task.residence_id IN (?)", residenceSubquery).
 		Preload("CreatedBy").
 		Preload("AssignedTo").
 		Preload("Residence").
 		Preload("Completions", func(db *gorm.DB) *gorm.DB {
 			return db.Select("id", "task_id", "completed_at")
 		}).
 		Scopes(task.ScopeKanbanOrder)
 	if err := query.Find(&allTasks).Error; err != nil {
 		return nil, fmt.Errorf("get tasks for kanban: %w", err)
 	}
 	columnMap := categorization.CategorizeTasksIntoColumnsWithTime(allTasks, daysThreshold, now)
 	columns := buildKanbanColumns(
 		columnMap[categorization.ColumnOverdue],
 		columnMap[categorization.ColumnInProgress],
 		columnMap[categorization.ColumnDueSoon],
 		columnMap[categorization.ColumnUpcoming],
 		columnMap[categorization.ColumnCompleted],
 	)
 	return &models.KanbanBoard{
 		Columns:       columns,
 		DaysThreshold: daysThreshold,
 		ResidenceID:   "all",
 	}, nil
 }
 // === Lookup Operations ===
 // GetAllCategories returns all task categories
@@ -200,6 +200,15 @@ func SetupRouter(deps *Dependencies) *echo.Echo {
 	taskService.SetStorageService(deps.StorageService) // For reading completion images for email
 	subscriptionService := services.NewSubscriptionService(subscriptionRepo, residenceRepo, taskRepo, contractorRepo, documentRepo)
 	residenceService.SetSubscriptionService(subscriptionService) // Wire up subscription service for tier limit enforcement
 	// Wire Redis cache for residence-ID lookups across the four services that
 	// read it on the request hot path. Cache is best-effort; nil cache is OK.
 	if deps.Cache != nil {
 		residenceService.SetCacheService(deps.Cache)
 		taskService.SetCacheService(deps.Cache)
 		contractorService.SetCacheService(deps.Cache)
 		documentService.SetCacheService(deps.Cache)
 	}
 	taskTemplateService := services.NewTaskTemplateService(taskTemplateRepo)
 	suggestionService := services.NewSuggestionService(deps.DB, residenceRepo)
@@ -388,3 +388,61 @@ func (c *CacheService) GetSeededDataETag(ctx context.Context) (string, error) {
 func (c *CacheService) InvalidateSeededData(ctx context.Context) error {
 	return c.Delete(ctx, SeededDataKey, SeededDataETagKey)
 }
 // === User → Residence-IDs cache ===
 //
 // Caches the set of residence IDs each user has access to. Hot read on
 // every authenticated API call (auth + tasks + residences + contractors +
 // documents all need it). Mutations on residences/share-codes invalidate
 // only the affected user(s); see Invalidate*ResidenceIDsForUsers.
 const (
 	residenceIDsKeyPrefix = "residence_ids_user:"
 	residenceIDsTTL       = 5 * time.Minute
 )
 // CacheResidenceIDsForUser stores the residence-ID list for a user with a
 // 5-minute TTL. Membership rarely changes (only on share-code accept,
 // remove-user, delete-residence) so a 5-minute window catches the vast
 // majority of repeat reads while keeping staleness bounded.
 func (c *CacheService) CacheResidenceIDsForUser(ctx context.Context, userID uint, ids []uint) error {
 	if c == nil {
 		return nil
 	}
 	key := fmt.Sprintf("%s%d", residenceIDsKeyPrefix, userID)
 	data, err := json.Marshal(ids)
 	if err != nil {
 		return err
 	}
 	return c.client.Set(ctx, key, data, residenceIDsTTL).Err()
 }
 // GetCachedResidenceIDsForUser fetches the cached residence-ID list. Returns
 // (nil, redis.Nil) when not cached so callers can distinguish from "user has
 // zero residences" (empty slice) — though for practical purposes both result
 // in an empty kanban response, so most callers can ignore the distinction.
 func (c *CacheService) GetCachedResidenceIDsForUser(ctx context.Context, userID uint) ([]uint, error) {
 	if c == nil {
 		return nil, fmt.Errorf("cache not available")
 	}
 	key := fmt.Sprintf("%s%d", residenceIDsKeyPrefix, userID)
 	var ids []uint
 	if err := c.Get(ctx, key, &ids); err != nil {
 		return nil, err
 	}
 	return ids, nil
 }
 // InvalidateResidenceIDsForUsers drops the cache for one or more users.
 // Called from JoinWithCode (the joining user) and RemoveUser /
 // DeleteResidence (every affected user). Cheap — single Redis DEL per user.
 func (c *CacheService) InvalidateResidenceIDsForUsers(ctx context.Context, userIDs ...uint) error {
 	if c == nil || len(userIDs) == 0 {
 		return nil
 	}
 	keys := make([]string, len(userIDs))
 	for i, id := range userIDs {
 		keys[i] = fmt.Sprintf("%s%d", residenceIDsKeyPrefix, id)
 	}
 	return c.Delete(ctx, keys...)
 }
@@ -23,6 +23,7 @@ import (
 type ContractorService struct {
 	contractorRepo *repositories.ContractorRepository
 	residenceRepo  *repositories.ResidenceRepository
 	cache          *CacheService
 }
 // NewContractorService creates a new contractor service
@@ -33,6 +34,11 @@ func NewContractorService(contractorRepo *repositories.ContractorRepository, res
 	}
 }
 // SetCacheService wires Redis caching for residence-ID lookups.
 func (s *ContractorService) SetCacheService(cache *CacheService) {
 	s.cache = cache
 }
 // GetContractor gets a contractor by ID with access check
 func (s *ContractorService) GetContractor(ctx context.Context, contractorID, userID uint) (*responses.ContractorResponse, error) {
 	contractor, err := s.contractorRepo.WithContext(ctx).FindByID(contractorID)
@@ -73,7 +79,7 @@ func (s *ContractorService) hasContractorAccess(ctx context.Context, contractor
 // ListContractors lists all contractors accessible to a user
 func (s *ContractorService) ListContractors(ctx context.Context, userID uint) ([]responses.ContractorResponse, error) {
 	// Get residence IDs (lightweight - no preloads)
-	residenceIDs, err := s.residenceRepo.WithContext(ctx).FindResidenceIDsByUser(userID)
+	residenceIDs, err := cachedResidenceIDsForUser(ctx, s.cache, s.residenceRepo, userID)
 	if err != nil {
 		return nil, apperrors.Internal(err)
 	}
@@ -24,6 +24,7 @@ import (
 type DocumentService struct {
 	documentRepo  *repositories.DocumentRepository
 	residenceRepo *repositories.ResidenceRepository
 	cache         *CacheService
 }
 // NewDocumentService creates a new document service
@@ -34,6 +35,11 @@ func NewDocumentService(documentRepo *repositories.DocumentRepository, residence
 	}
 }
 // SetCacheService wires Redis caching for residence-ID lookups.
 func (s *DocumentService) SetCacheService(cache *CacheService) {
 	s.cache = cache
 }
 // GetDocument gets a document by ID with access check
 func (s *DocumentService) GetDocument(ctx context.Context, documentID, userID uint) (*responses.DocumentResponse, error) {
 	document, err := s.documentRepo.WithContext(ctx).FindByID(documentID)
@@ -60,7 +66,7 @@ func (s *DocumentService) GetDocument(ctx context.Context, documentID, userID ui
 // ListDocuments lists all documents accessible to a user, with optional filters.
 func (s *DocumentService) ListDocuments(ctx context.Context, userID uint, filter *repositories.DocumentFilter) ([]responses.DocumentResponse, error) {
 	// Get residence IDs (lightweight - no preloads)
-	residenceIDs, err := s.residenceRepo.WithContext(ctx).FindResidenceIDsByUser(userID)
+	residenceIDs, err := cachedResidenceIDsForUser(ctx, s.cache, s.residenceRepo, userID)
 	if err != nil {
 		return nil, apperrors.Internal(err)
 	}
@@ -95,7 +101,7 @@ func (s *DocumentService) ListDocuments(ctx context.Context, userID uint, filter
 // ListWarranties lists all warranty documents
 func (s *DocumentService) ListWarranties(ctx context.Context, userID uint) ([]responses.DocumentResponse, error) {
 	// Get residence IDs (lightweight - no preloads)
-	residenceIDs, err := s.residenceRepo.WithContext(ctx).FindResidenceIDsByUser(userID)
+	residenceIDs, err := cachedResidenceIDsForUser(ctx, s.cache, s.residenceRepo, userID)
 	if err != nil {
 		return nil, apperrors.Internal(err)
 	}
@@ -0,0 +1,41 @@
 package services
 import (
 	"context"
 	"github.com/treytartt/honeydue-api/internal/repositories"
 )
 // cachedResidenceIDsForUser fetches the residence-ID list for a user, going
 // through Redis (5-min TTL) before falling back to Postgres.
 //
 // Used on every authed read path (tasks, documents, contractors, summary)
 // because the list rarely changes — only on share-code accept, member
 // removal, or residence delete. Callers must invalidate after mutations
 // via cache.InvalidateResidenceIDsForUsers.
 //
 // A nil cache is permitted — the function falls through to the repo
 // directly, so this works in tests and in failure modes.
 func cachedResidenceIDsForUser(
 	ctx context.Context,
 	cache *CacheService,
 	residenceRepo *repositories.ResidenceRepository,
 	userID uint,
 ) ([]uint, error) {
 	if cache != nil {
 		if ids, err := cache.GetCachedResidenceIDsForUser(ctx, userID); err == nil {
 			return ids, nil
 		}
 	}
 	ids, err := residenceRepo.WithContext(ctx).FindResidenceIDsByUser(userID)
 	if err != nil {
 		return nil, err
 	}
 	if cache != nil {
 		// Best-effort cache fill; don't fail the request on Redis hiccup.
 		_ = cache.CacheResidenceIDsForUser(ctx, userID, ids)
 	}
 	return ids, nil
 }
@@ -37,9 +37,16 @@ type ResidenceService struct {
 	userRepo            *repositories.UserRepository
 	taskRepo            *repositories.TaskRepository
 	subscriptionService *SubscriptionService
 	cache               *CacheService
 	config              *config.Config
 }
 // SetCacheService wires a Redis-backed cache for residence-ID lookups. May
 // be nil — service falls through to direct DB queries when unset.
 func (s *ResidenceService) SetCacheService(cache *CacheService) {
 	s.cache = cache
 }
 // NewResidenceService creates a new residence service
 func NewResidenceService(residenceRepo *repositories.ResidenceRepository, userRepo *repositories.UserRepository, cfg *config.Config) *ResidenceService {
 	return &ResidenceService{
@@ -160,7 +167,7 @@ func (s *ResidenceService) GetMyResidences(ctx context.Context, userID uint, now
 // Clients should use calculateSummaryFromKanban() instead.
 func (s *ResidenceService) GetSummary(ctx context.Context, userID uint, now time.Time) (*responses.TotalSummary, error) {
 	// Get residence IDs (lightweight - no preloads)
-	residenceIDs, err := s.residenceRepo.WithContext(ctx).FindResidenceIDsByUser(userID)
+	residenceIDs, err := cachedResidenceIDsForUser(ctx, s.cache, s.residenceRepo, userID)
 	if err != nil {
 		return nil, apperrors.Internal(err)
 	}
@@ -257,6 +264,11 @@ func (s *ResidenceService) CreateResidence(ctx context.Context, req *requests.Cr
 	if err := s.residenceRepo.WithContext(ctx).Create(residence); err != nil {
 		return nil, apperrors.Internal(err)
 	}
 	if s.cache != nil {
 		// Owner now has a new residence — drop cached IDs so the next
 		// list-residences call doesn't omit it.
 		_ = s.cache.InvalidateResidenceIDsForUsers(ctx, ownerID)
 	}
 	// Reload with relations
 	residence, err := s.residenceRepo.WithContext(ctx).FindByID(residence.ID)
@@ -419,9 +431,26 @@ func (s *ResidenceService) DeleteResidence(ctx context.Context, residenceID, use
 		return nil, apperrors.Forbidden("error.not_residence_owner")
 	}
 	// Capture all member IDs before delete so we can invalidate their caches.
 	var affectedUserIDs []uint
 	if s.cache != nil {
 		if members, _ := s.residenceRepo.WithContext(ctx).GetResidenceUsers(residenceID); members != nil {
 			affectedUserIDs = make([]uint, 0, len(members)+1)
 			affectedUserIDs = append(affectedUserIDs, userID) // owner
 			for _, m := range members {
 				if m.ID != userID {
 					affectedUserIDs = append(affectedUserIDs, m.ID)
 				}
 			}
 		}
 	}
 	if err := s.residenceRepo.WithContext(ctx).Delete(residenceID); err != nil {
 		return nil, apperrors.Internal(err)
 	}
 	if s.cache != nil && len(affectedUserIDs) > 0 {
 		_ = s.cache.InvalidateResidenceIDsForUsers(ctx, affectedUserIDs...)
 	}
 	// Get updated summary
 	summary := s.getSummaryForUser(userID)
@@ -548,6 +577,10 @@ func (s *ResidenceService) JoinWithCode(ctx context.Context, code string, userID
 	if err := s.residenceRepo.WithContext(ctx).AddUser(shareCode.ResidenceID, userID); err != nil {
 		return nil, apperrors.Internal(err)
 	}
 	if s.cache != nil {
 		// The joining user's residence-IDs cache is now stale.
 		_ = s.cache.InvalidateResidenceIDsForUsers(ctx, userID)
 	}
 	// Mark share code as used (one-time use)
 	if err := s.residenceRepo.WithContext(ctx).DeactivateShareCode(shareCode.ID); err != nil {
@@ -629,6 +662,10 @@ func (s *ResidenceService) RemoveUser(ctx context.Context, residenceID, userIDTo
 	if err := s.residenceRepo.WithContext(ctx).RemoveUser(residenceID, userIDToRemove); err != nil {
 		return apperrors.Internal(err)
 	}
 	if s.cache != nil {
 		// The removed user's residence-IDs cache is now stale.
 		_ = s.cache.InvalidateResidenceIDsForUsers(ctx, userIDToRemove)
 	}
 	return nil
 }
@@ -37,6 +37,12 @@ type TaskService struct {
 	notificationService *NotificationService
 	emailService        *EmailService
 	storageService      *StorageService
 	cache               *CacheService
 }
 // SetCacheService wires Redis caching for residence-ID lookups.
 func (s *TaskService) SetCacheService(cache *CacheService) {
 	s.cache = cache
 }
 // NewTaskService creates a new task service
@@ -108,23 +114,11 @@ func (s *TaskService) ListTasks(ctx context.Context, userID uint, daysThreshold
 		daysThreshold = 30 // Default
 	}
-	// Get all residence IDs accessible to user (lightweight - no preloads)
+	// Single-round-trip variant: residence-access subquery is folded into the
-	residenceIDs, err := s.residenceRepo.WithContext(ctx).FindResidenceIDsByUser(userID)
+	// task fetch on the Postgres side instead of a separate Go-side round-trip.
-	if err != nil {
+	// Saves ~110ms on Hetzner→Neon. Empty result naturally handles the
-		return nil, apperrors.Internal(err)
+	// "user has no residences" case as an empty board.
-	}
+	board, err := s.taskRepo.WithContext(ctx).GetKanbanDataForUser(userID, daysThreshold, now)
 	if len(residenceIDs) == 0 {
 		// Return empty kanban board
 		return &responses.KanbanBoardResponse{
 			Columns:       []responses.KanbanColumnResponse{},
 			DaysThreshold: daysThreshold,
 			ResidenceID:   "all",
 		}, nil
 	}
 	// Get kanban data aggregated across all residences using user's timezone-aware time
 	board, err := s.taskRepo.WithContext(ctx).GetKanbanDataForMultipleResidences(residenceIDs, daysThreshold, now)
 	if err != nil {
 		return nil, apperrors.Internal(err)
 	}
@@ -1025,7 +1019,7 @@ func (s *TaskService) GetCompletion(ctx context.Context, completionID, userID ui
 // ListCompletions lists all task completions for a user
 func (s *TaskService) ListCompletions(ctx context.Context, userID uint) ([]responses.TaskCompletionResponse, error) {
 	// Get all residence IDs (lightweight - no preloads)
-	residenceIDs, err := s.residenceRepo.WithContext(ctx).FindResidenceIDsByUser(userID)
+	residenceIDs, err := cachedResidenceIDsForUser(ctx, s.cache, s.residenceRepo, userID)
 	if err != nil {
 		return nil, apperrors.Internal(err)
 	}