Security:
- Replace all binding: tags with validate: + c.Validate() in admin handlers
- Add rate limiting to auth endpoints (login, register, password reset)
- Add security headers (HSTS, XSS protection, nosniff, frame options)
- Wire Google Pub/Sub token verification into webhook handler
- Replace ParseUnverified with proper OIDC/JWKS key verification
- Verify inner Apple JWS signatures in webhook handler
- Add io.LimitReader (1MB) to all webhook body reads (see the sketch after this message)
- Add ownership verification to file deletion
- Move hardcoded admin credentials to env vars
- Add uniqueIndex to User.Email
- Hide ConfirmationCode from JSON serialization
- Mask confirmation codes in admin responses
- Use http.DetectContentType for upload validation
- Fix path traversal in storage service
- Replace os.Getenv with Viper in stripe service
- Sanitize Redis URLs before logging
- Separate DEBUG_FIXED_CODES from DEBUG flag
- Reject weak SECRET_KEY in production
- Add host check on /_next/* proxy routes
- Use explicit localhost CORS origins in debug mode
- Replace err.Error() with generic messages in all admin error responses

Critical fixes:
- Rewrite FCM to HTTP v1 API with OAuth 2.0 service account auth
- Fix user_customuser -> auth_user table names in raw SQL
- Fix dashboard verified query to use UserProfile model
- Add escapeLikeWildcards() to prevent SQL wildcard injection

Bug fixes:
- Add bounds checks for days/expiring_soon query params (1-3650)
- Add receipt_data/transaction_id empty-check to RestoreSubscription
- Change Active bool -> *bool in device handler
- Check all unchecked GORM/FindByIDWithProfile errors
- Add validation for notification hour fields (0-23)
- Add max=10000 validation on task description updates

Transactions & data integrity:
- Wrap registration flow in transaction
- Wrap QuickComplete in transaction
- Move image creation inside completion transaction
- Wrap SetSpecialties in transaction
- Wrap GetOrCreateToken in transaction
- Wrap completion+image deletion in transaction

Performance:
- Batch completion summaries (2 queries vs 2N)
- Reuse single http.Client in IAP validation
- Cache dashboard counts (30s TTL)
- Batch COUNT queries in admin user list
- Add Limit(500) to document queries
- Add reminder_stage+due_date filters to reminder queries
- Parse AllowedTypes once at init
- In-memory user cache in auth middleware (30s TTL)
- Timezone change detection cache
- Optimize P95 with per-endpoint sorted buffers
- Replace crypto/md5 with hash/fnv for ETags

Code quality:
- Add sync.Once to all monitoring Stop()/Close() methods
- Replace 8 fmt.Printf with zerolog in auth service
- Log previously discarded errors
- Standardize delete response shapes
- Route hardcoded English through i18n
- Remove FileURL from DocumentResponse (keep MediaURL only)
- Thread user timezone through kanban board responses
- Initialize empty slices to prevent null JSON
- Extract shared field map for task Update/UpdateTx
- Delete unused SoftDeleteModel, min(), formatCron, legacy handlers

Worker & jobs:
- Wire Asynq email infrastructure into worker
- Register HandleReminderLogCleanup with daily 3AM cron
- Use per-user timezone in HandleSmartReminder
- Replace direct DB queries with repository calls
- Delete legacy reminder handlers (~200 lines)
- Delete unused task type constants

Dependencies:
- Replace archived jung-kurt/gofpdf with go-pdf/fpdf
- Replace unmaintained gomail.v2 with wneessen/go-mail
- Add TODO for Echo jwt v3 transitive dep removal

Test infrastructure:
- Fix MakeRequest/SeedLookupData error handling
- Replace os.Exit(0) with t.Skip() in scope/consistency tests
- Add 11 new FCM v1 tests

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
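For reference, the 1 MB webhook body cap mentioned above follows this pattern; a minimal sketch with hypothetical names (readWebhookBody, maxWebhookBody), not the project's actual handlers:

package webhook // hypothetical package, for illustration only

import (
	"io"
	"net/http"
)

const maxWebhookBody = 1 << 20 // 1 MiB

// readWebhookBody caps how much of the request body is read. Note that
// io.LimitReader truncates silently at the limit; handlers that must reject
// oversized payloads with an error can use http.MaxBytesReader instead.
func readWebhookBody(r *http.Request) ([]byte, error) {
	return io.ReadAll(io.LimitReader(r.Body, maxWebhookBody))
}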
209 lines · 5.8 KiB · Go
package monitoring

import (
	"sort"
	"sync"
	"time"

	"github.com/labstack/echo/v4"
)

// HTTPStatsCollector collects HTTP request metrics
type HTTPStatsCollector struct {
	mu                sync.RWMutex
	requests          map[string]int64           // endpoint -> count
	totalLatency      map[string]time.Duration   // endpoint -> total latency
	errors            map[string]int64           // endpoint -> error count
	byStatus          map[int]int64              // status code -> count
	endpointLatencies map[string][]time.Duration // per-endpoint sorted latency buffers for P95
	startTime         time.Time
	lastReset         time.Time
}

const (
	maxLatencySamplesPerEndpoint = 200           // Max latency samples kept per endpoint
	maxEndpoints                 = 200           // Cap unique endpoints tracked
	statsResetPeriod             = 1 * time.Hour // Reset stats periodically to prevent unbounded growth
)
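// At these caps, the per-endpoint latency buffers hold at most about
// maxEndpoints * maxLatencySamplesPerEndpoint time.Duration values
// (200 * 200 * 8 bytes ≈ 320 KB), so memory stays bounded even between resets.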
// NewHTTPStatsCollector creates a new HTTP stats collector
func NewHTTPStatsCollector() *HTTPStatsCollector {
	now := time.Now()
	return &HTTPStatsCollector{
		requests:          make(map[string]int64),
		totalLatency:      make(map[string]time.Duration),
		errors:            make(map[string]int64),
		byStatus:          make(map[int]int64),
		endpointLatencies: make(map[string][]time.Duration),
		startTime:         now,
		lastReset:         now,
	}
}

// Record records a single HTTP request
func (c *HTTPStatsCollector) Record(endpoint string, latency time.Duration, status int) {
	c.mu.Lock()
	defer c.mu.Unlock()

	// Periodically reset to prevent unbounded memory growth
	if time.Since(c.lastReset) > statsResetPeriod {
		c.resetLocked()
	}

	// Check if we've hit the endpoint limit and this is a new endpoint
	if _, exists := c.requests[endpoint]; !exists && len(c.requests) >= maxEndpoints {
		// Use a catch-all bucket for overflow endpoints
		endpoint = "OTHER"
	}

	c.requests[endpoint]++
	c.totalLatency[endpoint] += latency
	c.byStatus[status]++

	if status >= 400 {
		c.errors[endpoint]++
	}

	// Insert latency into per-endpoint sorted buffer using binary search
	buf := c.endpointLatencies[endpoint]
	idx := sort.Search(len(buf), func(i int) bool {
		return buf[i] >= latency
	})
	buf = append(buf, 0)
	copy(buf[idx+1:], buf[idx:])
	buf[idx] = latency

	// Trim to max samples per endpoint by removing the median element
	// to preserve distribution tails (important for P95 accuracy)
	if len(buf) > maxLatencySamplesPerEndpoint {
		mid := len(buf) / 2
		buf = append(buf[:mid], buf[mid+1:]...)
	}
	c.endpointLatencies[endpoint] = buf
}
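// Design note: keeping each buffer sorted on insert costs an O(n) copy in
// Record, but n is capped at maxLatencySamplesPerEndpoint; in exchange,
// calculateP95 becomes a constant-time index lookup on the read path.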
// resetLocked resets stats while holding the lock
func (c *HTTPStatsCollector) resetLocked() {
	c.requests = make(map[string]int64)
	c.totalLatency = make(map[string]time.Duration)
	c.errors = make(map[string]int64)
	c.byStatus = make(map[int]int64)
	c.endpointLatencies = make(map[string][]time.Duration)
	c.lastReset = time.Now()
	// Keep startTime for uptime calculation
}

// GetStats returns the current HTTP statistics
func (c *HTTPStatsCollector) GetStats() HTTPStats {
	c.mu.RLock()
	defer c.mu.RUnlock()

	stats := HTTPStats{
		ByEndpoint:   make(map[string]EndpointStats),
		ByStatusCode: make(map[int]int64),
	}

	var totalRequests int64
	var totalErrors int64
	var totalLatency time.Duration

	for endpoint, count := range c.requests {
		avgLatency := c.totalLatency[endpoint] / time.Duration(count)
		errCount := c.errors[endpoint]
		errRate := float64(0)
		if count > 0 {
			errRate = float64(errCount) / float64(count)
		}

		stats.ByEndpoint[endpoint] = EndpointStats{
			Count:        count,
			AvgLatencyMs: float64(avgLatency.Milliseconds()),
			ErrorRate:    errRate,
			P95LatencyMs: c.calculateP95(endpoint),
		}

		totalRequests += count
		totalErrors += errCount
		totalLatency += c.totalLatency[endpoint]
	}

	// Copy status code counts
	for status, count := range c.byStatus {
		stats.ByStatusCode[status] = count
	}

	stats.RequestsTotal = totalRequests
	if totalRequests > 0 {
		stats.AvgLatencyMs = float64(totalLatency.Milliseconds()) / float64(totalRequests)
		stats.ErrorRate = float64(totalErrors) / float64(totalRequests)
	}

	uptime := time.Since(c.startTime).Minutes()
	if uptime > 0 {
		stats.RequestsPerMinute = float64(totalRequests) / uptime
	}

	return stats
}
// calculateP95 calculates the 95th percentile latency for an endpoint.
// The per-endpoint buffer is maintained in sorted order during insertion,
// so this is an O(1) index lookup.
// Must be called with read lock held.
func (c *HTTPStatsCollector) calculateP95(endpoint string) float64 {
	buf := c.endpointLatencies[endpoint]
	if len(buf) == 0 {
		return 0
	}

	// Buffer is already sorted; direct index lookup
	p95Index := int(float64(len(buf)) * 0.95)
	if p95Index >= len(buf) {
		p95Index = len(buf) - 1
	}

	return float64(buf[p95Index].Milliseconds())
}
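// Worked example: with a full buffer of maxLatencySamplesPerEndpoint (200)
// samples, p95Index = int(200 * 0.95) = 190, so the 191st-smallest latency
// is the value reported.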
// Reset clears all collected stats
func (c *HTTPStatsCollector) Reset() {
	c.mu.Lock()
	defer c.mu.Unlock()

	c.requests = make(map[string]int64)
	c.totalLatency = make(map[string]time.Duration)
	c.errors = make(map[string]int64)
	c.byStatus = make(map[int]int64)
	c.endpointLatencies = make(map[string][]time.Duration)
	c.startTime = time.Now()
}
// MetricsMiddleware returns an Echo middleware that collects request metrics
func MetricsMiddleware(collector *HTTPStatsCollector) echo.MiddlewareFunc {
	return func(next echo.HandlerFunc) echo.HandlerFunc {
		return func(c echo.Context) error {
			start := time.Now()

			// Process request
			err := next(c)

			// Calculate latency
			latency := time.Since(start)

			// Get endpoint pattern (use route path, fallback to actual path)
			endpoint := c.Path()
			if endpoint == "" {
				endpoint = c.Request().URL.Path
			}

			// Combine method with path for unique endpoint identification
			endpoint = c.Request().Method + " " + endpoint

			// Record metrics
			collector.Record(endpoint, latency, c.Response().Status)

			return err
		}
	}
}
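A minimal wiring sketch for this collector in an Echo app, assuming the import path and stats route shown here (both hypothetical); HTTPStats and EndpointStats are defined elsewhere in the monitoring package:

package main

import (
	"net/http"

	"github.com/labstack/echo/v4"

	"example.com/app/internal/monitoring" // hypothetical import path
)

func main() {
	e := echo.New()

	// One collector shared by the middleware and the stats endpoint.
	collector := monitoring.NewHTTPStatsCollector()
	e.Use(monitoring.MetricsMiddleware(collector))

	// Expose collected stats as JSON; in production this would sit behind auth.
	e.GET("/internal/http-stats", func(c echo.Context) error {
		return c.JSON(http.StatusOK, collector.GetStats())
	})

	e.Logger.Fatal(e.Start(":8080"))
}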