Cut /api/tasks/ p99 from ~2500ms toward ~150-300ms
Backend CI / Test (push) Has been cancelled
Backend CI / Contract Tests (push) Has been cancelled
Backend CI / Build (push) Has been cancelled
Backend CI / Lint (push) Has been cancelled
Backend CI / Secret Scanning (push) Has been cancelled

Stack of optimizations against the same Hetzner→Neon transatlantic link.
The trace revealed every visible ms was network/proxy overhead — DB
execution itself is sub-millisecond per query (verified via EXPLAIN
ANALYZE: index scans on every hot path).

Connection layer:
- DB_HOST → Neon pooler endpoint (-pooler suffix). PgBouncer
  transaction-mode keeps backend Postgres connections warm so we no
  longer pay the ~110ms Postgres-startup RTT on cold queries.
- GORM pool tuned: MaxIdleConns 10→20, MaxLifetime 600s→1800s, and a
  MaxIdleTime setting added (defaults to 0, i.e. idle connections are
  never closed for being idle).
- Eager pool warm-up at boot via parallel pings — first user request
  no longer pays the ~440ms TCP+TLS+startup handshake.
- Redis maxmemory-policy noeviction → allkeys-lru. Cache writes will
  evict cold keys instead of erroring at the 256MB limit.

Auth layer:
- TokenCacheTTL 5min → 1 hour (Redis token cache).
- UserCacheTTL 30s → 5min (in-memory User cache, per pod).
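- UserCache gains a 5,000-entry size cap so a flood of unique users
  can't blow up pod RSS. ~5MB worst-case per pod. When the cap is
  exceeded, expired entries are dropped first, then the entries
  closest to expiry (an approximation of LRU, not strict LRU).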
- Token + user lookup collapsed from 2 GORM Preload queries into a
  single INNER JOIN. Saves 1 RTT per cold-cache request.
- The auth middleware's m.db.* calls now go through
  m.db.WithContext(ctx), so the SQL spans nest under the parent HTTP
  request in Jaeger.

Service layer:
- TaskService.ListTasks: replaced two-step
  FindResidenceIDsByUser → GetKanbanDataForMultipleResidences
  with a single GetKanbanDataForUser that uses a Postgres subquery
  for residence-access. One round-trip instead of two.
- New CacheService residence-IDs cache: "residence_ids_user:<id>"
  with 5-min TTL. Wired into Task/Residence/Contractor/Document
  services for the four hot read paths that need this list.
- Cache invalidation on every relevant mutation: CreateResidence,
  DeleteResidence, JoinWithCode, RemoveUser. DeleteResidence
  invalidates every member of the residence, not just the owner.

What this stacks up to (Hetzner→Neon, before US migration):
  Path                                 Before        After (target)
  Cache-warm authed read               ~800ms        ~100-200ms
  Cache-cold authed read (1st in 1hr)  ~2500ms       ~500-700ms
  First request after deploy           ~2500ms       ~700-900ms

The endgame US-region migration on top of this gets us to ~30-50ms
warm-cache, but we're shippable at ~150ms warm right now.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Trey t
2026-04-25 17:13:50 -05:00
parent 9410da7497
commit 88fb1751c7
15 changed files with 443 additions and 59 deletions
+85 -13
View File
@@ -22,13 +22,22 @@ const (
AuthUserKey = "auth_user"
// AuthTokenKey is the key used to store the token in the context
AuthTokenKey = "auth_token"
// TokenCacheTTL is the duration to cache tokens in Redis
TokenCacheTTL = 5 * time.Minute
// TokenCacheTTL is the duration to cache tokens in Redis. Tokens are
// valid for DefaultTokenExpiryDays (90), and explicit logout invalidates
// the cache, so a long TTL here just means most authed requests skip the
// auth-token SQL query entirely.
TokenCacheTTL = 1 * time.Hour
// TokenCachePrefix is the prefix for token cache keys
TokenCachePrefix = "auth_token_"
// UserCacheTTL is how long full user records are cached in memory to
// avoid hitting the database on every authenticated request.
UserCacheTTL = 30 * time.Second
// avoid hitting the database on every authenticated request. Bumped from
// 30s — at 30s the trace showed a SELECT auth_user query on most warm
// requests because users aren't in cache long enough to hit twice.
UserCacheTTL = 5 * time.Minute
// UserCacheMaxSize bounds the per-pod in-memory user cache. With ~1KB
// per User struct, 5000 entries = ~5MB per pod. When a Set pushes the
// cache past this limit, expired entries are evicted first, then the
// entries closest to expiry.
UserCacheMaxSize = 5000
// DefaultTokenExpiryDays is the default number of days before a token expires.
DefaultTokenExpiryDays = 90
@@ -47,7 +56,7 @@ func NewAuthMiddleware(db *gorm.DB, cache *services.CacheService) *AuthMiddlewar
return &AuthMiddleware{
db: db,
cache: cache,
userCache: NewUserCache(UserCacheTTL),
userCache: NewUserCache(UserCacheTTL, UserCacheMaxSize),
tokenExpiryDays: DefaultTokenExpiryDays,
}
}
@@ -61,7 +70,7 @@ func NewAuthMiddlewareWithConfig(db *gorm.DB, cache *services.CacheService, cfg
return &AuthMiddleware{
db: db,
cache: cache,
userCache: NewUserCache(UserCacheTTL),
userCache: NewUserCache(UserCacheTTL, UserCacheMaxSize),
tokenExpiryDays: expiryDays,
}
}
@@ -244,20 +253,83 @@ func (m *AuthMiddleware) getUserFromCache(ctx context.Context, token string) (*m
// getUserFromDatabaseWithToken looks up the token in the database and returns
// both the user and the auth token record (for expiry checking). The ctx is
// threaded into the GORM session so the SQL span attaches to the request trace.
//
// Uses a single JOIN query instead of GORM's Preload (which issues 2 SELECTs).
// Over a transatlantic link this saves ~110ms RTT per cache miss.
//
// Returns (user, token, nil) on success. Returns a "token not found" error
// when the query fails or yields no row, and a "user is inactive" error when
// the joined user has is_active = false. On success the user is also stored
// in the in-memory user cache.
func (m *AuthMiddleware) getUserFromDatabaseWithToken(ctx context.Context, token string) (*models.User, *models.AuthToken, error) {
// NOTE(review): the next two lines are the pre-change Preload-based lookup
// as shown by the diff; the JOIN query further down supersedes them.
var authToken models.AuthToken
if err := m.db.WithContext(ctx).Preload("User").Where("key = ?", token).First(&authToken).Error; err != nil {
// Flat result row: every column from auth_user is aliased with a `u_`
// prefix in the SELECT below, while user_authtoken columns keep their
// native names. The gorm column tags bind each field to its alias so the
// row can be copied field-by-field into models.User and models.AuthToken.
type joinedRow struct {
// AuthToken columns
Key string `gorm:"column:key"`
Created time.Time `gorm:"column:created"`
UserID uint `gorm:"column:user_id"`
// User columns (prefixed to avoid collision with UserID)
UID uint `gorm:"column:u_id"`
UUsername string `gorm:"column:u_username"`
UEmail string `gorm:"column:u_email"`
UFirstName string `gorm:"column:u_first_name"`
ULastName string `gorm:"column:u_last_name"`
UPassword string `gorm:"column:u_password"`
UIsActive bool `gorm:"column:u_is_active"`
UIsStaff bool `gorm:"column:u_is_staff"`
UIsSuper bool `gorm:"column:u_is_superuser"`
UDateJoined time.Time `gorm:"column:u_date_joined"`
ULastLogin *time.Time `gorm:"column:u_last_login"`
}
var row joinedRow
// One round-trip: the token row and its owning user are fetched together.
// The token value is bound as a query parameter, not interpolated.
err := m.db.WithContext(ctx).
Table("user_authtoken AS t").
Select(`
t.key, t.created, t.user_id,
u.id AS u_id,
u.username AS u_username,
u.email AS u_email,
u.first_name AS u_first_name,
u.last_name AS u_last_name,
u.password AS u_password,
u.is_active AS u_is_active,
u.is_staff AS u_is_staff,
u.is_superuser AS u_is_superuser,
u.date_joined AS u_date_joined,
u.last_login AS u_last_login
`).
Joins("INNER JOIN auth_user u ON u.id = t.user_id").
Where("t.key = ?", token).
Limit(1).
Scan(&row).Error
// An empty Key is treated as "no matching row" — presumably because Scan
// does not report a missing row as an error (confirm against GORM docs).
// Query errors are collapsed into the same "token not found" message.
if err != nil || row.Key == "" {
return nil, nil, fmt.Errorf("token not found")
}
// Check if user is active
// NOTE(review): the next line is the pre-change check from the diff; the
// check on the locally built user further down supersedes it.
if !authToken.User.IsActive {
// Rebuild the two model structs from the flat row. Only the columns
// selected above are populated; any other fields on models.User keep
// their zero values.
user := models.User{
ID: row.UID,
Username: row.UUsername,
Email: row.UEmail,
FirstName: row.UFirstName,
LastName: row.ULastName,
Password: row.UPassword,
IsActive: row.UIsActive,
IsStaff: row.UIsStaff,
IsSuperuser: row.UIsSuper,
DateJoined: row.UDateJoined,
LastLogin: row.ULastLogin,
}
// The token carries its own copy of the user value in its User field.
authToken := models.AuthToken{
Key: row.Key,
Created: row.Created,
UserID: row.UserID,
User: user,
}
// Inactive users are rejected before anything is cached.
if !user.IsActive {
return nil, nil, fmt.Errorf("user is inactive")
}
// Store in in-memory cache for subsequent requests
// NOTE(review): the next two lines are the pre-change cache/return pair
// from the diff; the two lines after them are the post-change versions.
m.userCache.Set(&authToken.User)
return &authToken.User, &authToken, nil
m.userCache.Set(&user)
return &user, &authToken, nil
}
// getUserFromDatabase looks up the token in the database and caches the
+77 -14
View File
@@ -2,6 +2,7 @@ package middleware
import (
"sort"
"sync"
"sync/atomic"
"time"
"github.com/treytartt/honeydue-api/internal/models"
@@ -16,22 +17,29 @@ type userCacheEntry struct {
// UserCache is a concurrency-safe in-memory cache for User records, keyed by
// user ID. Entries expire after a configurable TTL. The cache uses a sync.Map
// for lock-free reads on the hot path, with periodic lazy eviction of stale
// entries during Set operations and a size cap to bound memory.
type UserCache struct {
	// store maps a user ID to its *userCacheEntry.
	store sync.Map
	// ttl is the lifetime given to an entry each time it is Set.
	ttl time.Duration
	// lastGC is when the last time-based expiry sweep ran.
	lastGC time.Time
	// gcMu presumably serializes the time-based sweep in maybeGC — confirm
	// against maybeGC's body.
	gcMu sync.Mutex
	// gcEvery is the minimum interval between time-based expiry sweeps.
	gcEvery time.Duration
	// size is an approximate entry count; sync.Map has no Len(). It is
	// adjusted only by the caller that actually inserted or removed a key.
	size atomic.Int64
	// maxSize is the entry cap checked by Set; a value <= 0 disables it.
	maxSize int64
}
// NewUserCache creates a UserCache with the given TTL for entries.
// NOTE(review): the next two lines are the pre-change one-argument signature
// and literal opener as shown by the diff; the two-argument constructor
// below supersedes them.
func NewUserCache(ttl time.Duration) *UserCache {
return &UserCache{
// maxSize is the soft upper bound on the number of cached users; the Set
// that pushes the cache past it runs an eviction sweep right after storing.
// Pass <=0 for no size cap.
func NewUserCache(ttl time.Duration, maxSize int) *UserCache {
c := &UserCache{
ttl: ttl,
// Start the sweep clock now so the first time-based sweep is not due
// until gcEvery has elapsed.
lastGC: time.Now(),
gcEvery: 2 * time.Minute,
maxSize: int64(maxSize),
}
return c
}
// Get returns a cached user by ID, or nil if not found or expired.
@@ -42,7 +50,9 @@ func (c *UserCache) Get(userID uint) *models.User {
}
entry := val.(*userCacheEntry)
if time.Now().After(entry.expiresAt) {
c.store.Delete(userID)
if _, loaded := c.store.LoadAndDelete(userID); loaded {
c.size.Add(-1)
}
return nil
}
// Return a shallow copy so callers cannot mutate the cached value.
@@ -51,20 +61,71 @@ func (c *UserCache) Get(userID uint) *models.User {
}
// Set stores a copy of user in the cache under user.ID, with its expiry set
// to now + ttl. If the key was not already present the size counter is
// incremented. When a positive maxSize is configured and the counter exceeds
// it, evictUntilUnderCap runs inline; afterwards maybeGC is called for the
// time-based expiry sweep.
func (c *UserCache) Set(user *models.User) {
// Store a copy to prevent external mutation of the cached object.
copied := *user
// NOTE(review): the next line is the pre-change Store call as shown by the
// diff; the Swap call below supersedes it (as does the lone `})` further
// down, which closed the old call).
c.store.Store(user.ID, &userCacheEntry{
// Swap reports whether the key already existed, so the counter only grows
// when a new key is added and overwrites leave it unchanged.
if _, loaded := c.store.Swap(user.ID, &userCacheEntry{
user: &copied,
expiresAt: time.Now().Add(c.ttl),
})
}); !loaded {
c.size.Add(1)
}
// The cap is checked after the store, so the cache can briefly hold more
// than maxSize entries.
if c.maxSize > 0 && c.size.Load() > c.maxSize {
c.evictUntilUnderCap()
}
c.maybeGC()
}
// evictUntilUnderCap brings the cache back to at most maxSize entries.
//
// It works in two passes. The first pass deletes entries whose TTL has
// already elapsed and stops as soon as the size counter is back under the
// cap. If the cache is still over the cap, the second pass snapshots every
// remaining entry, orders the snapshot by expiry time, and deletes the
// entries closest to expiring until the overshoot is gone.
//
// Cost is O(n) for the walks plus O(n log n) for the sort. It runs only when
// the cap is breached. Each deletion goes through LoadAndDelete and only the
// caller that actually removed the key decrements the counter, so a removal
// is never counted twice. Concurrent callers may each evict their own
// overshoot, so the cache can briefly dip below the cap.
func (c *UserCache) evictUntilUnderCap() {
	now := time.Now()

	// First pass: drop expired entries, stopping once under the cap.
	c.store.Range(func(key, value any) bool {
		entry := value.(*userCacheEntry)
		if now.After(entry.expiresAt) {
			if _, loaded := c.store.LoadAndDelete(key); loaded {
				c.size.Add(-1)
			}
		}
		return c.size.Load() > c.maxSize
	})
	if c.size.Load() <= c.maxSize {
		return
	}

	// Second pass: still over the cap, so drop the entries closest to expiry.
	type scored struct {
		key       any
		expiresAt time.Time
	}
	// Pre-size from the (approximate) counter to avoid repeated growth.
	sizeHint := int(c.size.Load())
	if sizeHint < 0 {
		sizeHint = 0
	}
	candidates := make([]scored, 0, sizeHint)
	c.store.Range(func(key, value any) bool {
		entry := value.(*userCacheEntry)
		candidates = append(candidates, scored{key: key, expiresAt: entry.expiresAt})
		return true
	})

	// Sort by expiry ascending so the soonest-to-expire entries come first.
	sort.Slice(candidates, func(i, j int) bool {
		return candidates[i].expiresAt.Before(candidates[j].expiresAt)
	})

	// Re-read the counter: other goroutines may have changed it since the
	// first pass.
	overshoot := int(c.size.Load() - c.maxSize)
	for i := 0; i < overshoot && i < len(candidates); i++ {
		if _, loaded := c.store.LoadAndDelete(candidates[i].key); loaded {
			c.size.Add(-1)
		}
	}
}
// Invalidate removes a user from the cache by ID. The size counter is
// decremented only when an entry was actually present, so invalidating an
// unknown or already-removed ID leaves the count unchanged.
func (c *UserCache) Invalidate(userID uint) {
	// LoadAndDelete (rather than a plain Delete) reports whether this call
	// removed the key, which keeps the counter accurate under concurrency.
	if _, loaded := c.store.LoadAndDelete(userID); loaded {
		c.size.Add(-1)
	}
}
// maybeGC lazily sweeps expired entries at most once per gcEvery interval.
@@ -81,7 +142,9 @@ func (c *UserCache) maybeGC() {
c.store.Range(func(key, value any) bool {
entry := value.(*userCacheEntry)
if now.After(entry.expiresAt) {
c.store.Delete(key)
if _, loaded := c.store.LoadAndDelete(key); loaded {
c.size.Add(-1)
}
}
return true
})
+8 -8
View File
@@ -11,7 +11,7 @@ import (
)
func TestUserCache_SetAndGet(t *testing.T) {
cache := NewUserCache(1 * time.Minute)
cache := NewUserCache(1 * time.Minute, 0)
user := &models.User{Username: "testuser", Email: "test@test.com"}
user.ID = 1
@@ -25,7 +25,7 @@ func TestUserCache_SetAndGet(t *testing.T) {
}
func TestUserCache_GetNonExistent_ReturnsNil(t *testing.T) {
cache := NewUserCache(1 * time.Minute)
cache := NewUserCache(1 * time.Minute, 0)
cached := cache.Get(999)
assert.Nil(t, cached)
@@ -33,7 +33,7 @@ func TestUserCache_GetNonExistent_ReturnsNil(t *testing.T) {
func TestUserCache_Expired_ReturnsNil(t *testing.T) {
// Very short TTL
cache := NewUserCache(1 * time.Millisecond)
cache := NewUserCache(1 * time.Millisecond, 0)
user := &models.User{Username: "expiring_user"}
user.ID = 1
@@ -48,7 +48,7 @@ func TestUserCache_Expired_ReturnsNil(t *testing.T) {
}
func TestUserCache_Invalidate(t *testing.T) {
cache := NewUserCache(1 * time.Minute)
cache := NewUserCache(1 * time.Minute, 0)
user := &models.User{Username: "to_invalidate"}
user.ID = 1
@@ -66,7 +66,7 @@ func TestUserCache_Invalidate(t *testing.T) {
}
func TestUserCache_ReturnsCopy_NotOriginal(t *testing.T) {
cache := NewUserCache(1 * time.Minute)
cache := NewUserCache(1 * time.Minute, 0)
user := &models.User{Username: "original"}
user.ID = 1
@@ -85,7 +85,7 @@ func TestUserCache_ReturnsCopy_NotOriginal(t *testing.T) {
}
func TestUserCache_SetCopiesInput(t *testing.T) {
cache := NewUserCache(1 * time.Minute)
cache := NewUserCache(1 * time.Minute, 0)
user := &models.User{Username: "original"}
user.ID = 1
@@ -102,7 +102,7 @@ func TestUserCache_SetCopiesInput(t *testing.T) {
}
func TestUserCache_MultipleUsers(t *testing.T) {
cache := NewUserCache(1 * time.Minute)
cache := NewUserCache(1 * time.Minute, 0)
user1 := &models.User{Username: "user1"}
user1.ID = 1
@@ -122,7 +122,7 @@ func TestUserCache_MultipleUsers(t *testing.T) {
}
func TestUserCache_OverwriteEntry(t *testing.T) {
cache := NewUserCache(1 * time.Minute)
cache := NewUserCache(1 * time.Minute, 0)
user := &models.User{Username: "original"}
user.ID = 1