Files
honeyDueAPI/internal/monitoring/middleware.go
Trey t 42a5533a56 Fix 113 hardening issues across entire Go backend
Security:
- Replace all binding: tags with validate: + c.Validate() in admin handlers
- Add rate limiting to auth endpoints (login, register, password reset)
- Add security headers (HSTS, XSS protection, nosniff, frame options)
- Wire Google Pub/Sub token verification into webhook handler
- Replace ParseUnverified with proper OIDC/JWKS key verification
- Verify inner Apple JWS signatures in webhook handler
- Add io.LimitReader (1MB) to all webhook body reads
- Add ownership verification to file deletion
- Move hardcoded admin credentials to env vars
- Add uniqueIndex to User.Email
- Hide ConfirmationCode from JSON serialization
- Mask confirmation codes in admin responses
- Use http.DetectContentType for upload validation
- Fix path traversal in storage service
- Replace os.Getenv with Viper in stripe service
- Sanitize Redis URLs before logging
- Separate DEBUG_FIXED_CODES from DEBUG flag
- Reject weak SECRET_KEY in production
- Add host check on /_next/* proxy routes
- Use explicit localhost CORS origins in debug mode
- Replace err.Error() with generic messages in all admin error responses

Critical fixes:
- Rewrite FCM to HTTP v1 API with OAuth 2.0 service account auth
- Fix user_customuser -> auth_user table names in raw SQL
- Fix dashboard verified query to use UserProfile model
- Add escapeLikeWildcards() to prevent SQL wildcard injection

Bug fixes:
- Add bounds checks for days/expiring_soon query params (1-3650)
- Add receipt_data/transaction_id empty-check to RestoreSubscription
- Change Active bool -> *bool in device handler
- Check all unchecked GORM/FindByIDWithProfile errors
- Add validation for notification hour fields (0-23)
- Add max=10000 validation on task description updates

Transactions & data integrity:
- Wrap registration flow in transaction
- Wrap QuickComplete in transaction
- Move image creation inside completion transaction
- Wrap SetSpecialties in transaction
- Wrap GetOrCreateToken in transaction
- Wrap completion+image deletion in transaction

Performance:
- Batch completion summaries (2 queries vs 2N)
- Reuse single http.Client in IAP validation
- Cache dashboard counts (30s TTL)
- Batch COUNT queries in admin user list
- Add Limit(500) to document queries
- Add reminder_stage+due_date filters to reminder queries
- Parse AllowedTypes once at init
- In-memory user cache in auth middleware (30s TTL)
- Timezone change detection cache
- Optimize P95 with per-endpoint sorted buffers
- Replace crypto/md5 with hash/fnv for ETags

Code quality:
- Add sync.Once to all monitoring Stop()/Close() methods
- Replace 8 fmt.Printf with zerolog in auth service
- Log previously discarded errors
- Standardize delete response shapes
- Route hardcoded English through i18n
- Remove FileURL from DocumentResponse (keep MediaURL only)
- Thread user timezone through kanban board responses
- Initialize empty slices to prevent null JSON
- Extract shared field map for task Update/UpdateTx
- Delete unused SoftDeleteModel, min(), formatCron, legacy handlers

Worker & jobs:
- Wire Asynq email infrastructure into worker
- Register HandleReminderLogCleanup with daily 3AM cron
- Use per-user timezone in HandleSmartReminder
- Replace direct DB queries with repository calls
- Delete legacy reminder handlers (~200 lines)
- Delete unused task type constants

Dependencies:
- Replace archived jung-kurt/gofpdf with go-pdf/fpdf
- Replace unmaintained gomail.v2 with wneessen/go-mail
- Add TODO for Echo jwt v3 transitive dep removal

Test infrastructure:
- Fix MakeRequest/SeedLookupData error handling
- Replace os.Exit(0) with t.Skip() in scope/consistency tests
- Add 11 new FCM v1 tests

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-18 23:14:13 -05:00

209 lines
5.8 KiB
Go

package monitoring
import (
	"errors"
	"net/http"
	"sort"
	"sync"
	"time"

	"github.com/labstack/echo/v4"
)
// HTTPStatsCollector collects HTTP request metrics.
//
// All maps are keyed by an endpoint label of the form "METHOD /route/path"
// (built in MetricsMiddleware) and are guarded by mu. Memory is bounded by
// maxEndpoints, maxLatencySamplesPerEndpoint, and the periodic reset
// performed inside Record.
type HTTPStatsCollector struct {
	mu                sync.RWMutex
	requests          map[string]int64           // endpoint -> count
	totalLatency      map[string]time.Duration   // endpoint -> total latency
	errors            map[string]int64           // endpoint -> error count (status >= 400)
	byStatus          map[int]int64              // status code -> count
	endpointLatencies map[string][]time.Duration // per-endpoint sorted latency buffers for P95
	startTime         time.Time                  // collector start; preserved across periodic resets for uptime
	lastReset         time.Time                  // last time counters were cleared (see statsResetPeriod)
}
// Bounds that keep the collector's memory usage capped.
const (
	maxLatencySamplesPerEndpoint = 200           // Max latency samples kept per endpoint
	maxEndpoints                 = 200           // Cap unique endpoints tracked
	statsResetPeriod             = 1 * time.Hour // Reset stats periodically to prevent unbounded growth
)
// NewHTTPStatsCollector returns a ready-to-use collector with every
// tracking map initialized and both clocks set to the current time.
func NewHTTPStatsCollector() *HTTPStatsCollector {
	ts := time.Now()
	collector := &HTTPStatsCollector{
		requests:          map[string]int64{},
		totalLatency:      map[string]time.Duration{},
		errors:            map[string]int64{},
		byStatus:          map[int]int64{},
		endpointLatencies: map[string][]time.Duration{},
		startTime:         ts,
		lastReset:         ts,
	}
	return collector
}
// Record folds one completed HTTP request into the collector's counters
// and the endpoint's sorted latency buffer.
func (c *HTTPStatsCollector) Record(endpoint string, latency time.Duration, status int) {
	c.mu.Lock()
	defer c.mu.Unlock()

	// Wipe everything on a fixed cadence so memory cannot grow forever.
	if time.Since(c.lastReset) > statsResetPeriod {
		c.resetLocked()
	}

	// Endpoints beyond the cap are funneled into a catch-all bucket.
	if _, known := c.requests[endpoint]; !known && len(c.requests) >= maxEndpoints {
		endpoint = "OTHER"
	}

	c.requests[endpoint]++
	c.totalLatency[endpoint] += latency
	c.byStatus[status]++
	if status >= 400 {
		c.errors[endpoint]++
	}

	// Keep the per-endpoint buffer sorted: binary-search the insertion
	// point, shift the tail right by one, and drop the new sample in.
	samples := c.endpointLatencies[endpoint]
	pos := sort.Search(len(samples), func(i int) bool { return samples[i] >= latency })
	samples = append(samples, 0)
	copy(samples[pos+1:], samples[pos:])
	samples[pos] = latency

	// Over capacity: evict the median rather than an extreme value, so the
	// distribution tails that drive P95 are preserved.
	if len(samples) > maxLatencySamplesPerEndpoint {
		half := len(samples) / 2
		samples = append(samples[:half], samples[half+1:]...)
	}
	c.endpointLatencies[endpoint] = samples
}
// resetLocked clears every counter and latency buffer; the caller must
// hold c.mu. startTime is deliberately left alone so uptime (and thus
// requests-per-minute) keeps accumulating across periodic resets.
func (c *HTTPStatsCollector) resetLocked() {
	c.requests = map[string]int64{}
	c.totalLatency = map[string]time.Duration{}
	c.errors = map[string]int64{}
	c.byStatus = map[int]int64{}
	c.endpointLatencies = map[string][]time.Duration{}
	c.lastReset = time.Now()
}
// GetStats returns a snapshot of the current HTTP statistics.
//
// Average latencies are computed in floating-point milliseconds rather
// than via Duration.Milliseconds(), which truncates to whole ms and would
// report 0 for endpoints whose average latency is under a millisecond.
func (c *HTTPStatsCollector) GetStats() HTTPStats {
	c.mu.RLock()
	defer c.mu.RUnlock()

	stats := HTTPStats{
		ByEndpoint:   make(map[string]EndpointStats),
		ByStatusCode: make(map[int]int64),
	}

	var totalRequests int64
	var totalErrors int64
	var totalLatency time.Duration

	for endpoint, count := range c.requests {
		if count <= 0 {
			// Defensive: Record always increments before use, but never
			// divide by zero on a corrupt entry.
			continue
		}
		sum := c.totalLatency[endpoint]
		errCount := c.errors[endpoint]
		stats.ByEndpoint[endpoint] = EndpointStats{
			Count: count,
			// Full precision: nanosecond sum / count, scaled to ms.
			AvgLatencyMs: float64(sum) / float64(count) / float64(time.Millisecond),
			ErrorRate:    float64(errCount) / float64(count),
			P95LatencyMs: c.calculateP95(endpoint),
		}
		totalRequests += count
		totalErrors += errCount
		totalLatency += sum
	}

	// Copy status-code counts so callers cannot mutate internal state.
	for status, count := range c.byStatus {
		stats.ByStatusCode[status] = count
	}

	stats.RequestsTotal = totalRequests
	if totalRequests > 0 {
		stats.AvgLatencyMs = float64(totalLatency) / float64(totalRequests) / float64(time.Millisecond)
		stats.ErrorRate = float64(totalErrors) / float64(totalRequests)
	}

	uptime := time.Since(c.startTime).Minutes()
	if uptime > 0 {
		stats.RequestsPerMinute = float64(totalRequests) / uptime
	}
	return stats
}
// calculateP95 returns the 95th-percentile latency (whole milliseconds)
// for one endpoint. Record keeps each buffer sorted on insertion, so this
// reduces to a single index lookup. Must be called with the read lock held.
func (c *HTTPStatsCollector) calculateP95(endpoint string) float64 {
	samples := c.endpointLatencies[endpoint]
	n := len(samples)
	if n == 0 {
		return 0
	}
	// Sorted buffer: jump straight to the 95% position, clamped in range.
	idx := int(float64(n) * 0.95)
	if idx > n-1 {
		idx = n - 1
	}
	return float64(samples[idx].Milliseconds())
}
// Reset clears all collected stats and restarts the uptime clock.
//
// Unlike the periodic reset inside Record (which preserves startTime),
// a manual Reset begins a fresh measurement window. Delegating to
// resetLocked also refreshes lastReset, so the next Record call does not
// immediately trigger a redundant periodic reset of the just-cleared maps.
func (c *HTTPStatsCollector) Reset() {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.resetLocked() // clears all maps and sets lastReset = now
	c.startTime = time.Now()
}
// MetricsMiddleware returns an Echo middleware that records per-request
// metrics (endpoint, latency, status code) into collector.
//
// The endpoint label is "METHOD /route/path", preferring the registered
// route pattern so that /users/1 and /users/2 share a single bucket.
//
// Status handling: when a handler returns an error, Echo's HTTPErrorHandler
// has not yet run as the middleware unwinds, so c.Response().Status still
// holds the default 200 unless the response was already committed. In that
// case the status is derived from the returned *echo.HTTPError (falling
// back to 500), so errored requests are counted as errors rather than
// successes.
func MetricsMiddleware(collector *HTTPStatsCollector) echo.MiddlewareFunc {
	return func(next echo.HandlerFunc) echo.HandlerFunc {
		return func(c echo.Context) error {
			start := time.Now()

			// Process request
			err := next(c)

			latency := time.Since(start)

			// Prefer the route pattern; fall back to the raw URL path.
			endpoint := c.Path()
			if endpoint == "" {
				endpoint = c.Request().URL.Path
			}
			// Combine method with path for unique endpoint identification
			endpoint = c.Request().Method + " " + endpoint

			status := c.Response().Status
			if err != nil && !c.Response().Committed {
				status = http.StatusInternalServerError
				var httpErr *echo.HTTPError
				if errors.As(err, &httpErr) {
					status = httpErr.Code
				}
			}

			collector.Record(endpoint, latency, status)
			return err
		}
	}
}