Fix 113 hardening issues across the entire Go backend

Security:
- Replace all binding: tags with validate: + c.Validate() in admin handlers
- Add rate limiting to auth endpoints (login, register, password reset)
- Add security headers (HSTS, XSS protection, nosniff, frame options)
- Wire Google Pub/Sub token verification into webhook handler
- Replace ParseUnverified with proper OIDC/JWKS key verification
- Verify inner Apple JWS signatures in webhook handler
- Add io.LimitReader (1MB) to all webhook body reads (see the first sketch after this list)
- Add ownership verification to file deletion
- Move hardcoded admin credentials to env vars
- Add uniqueIndex to User.Email
- Hide ConfirmationCode from JSON serialization
- Mask confirmation codes in admin responses
- Use http.DetectContentType for upload validation (see the second sketch after this list)
- Fix path traversal in storage service
- Replace os.Getenv with Viper in stripe service
- Sanitize Redis URLs before logging
- Separate DEBUG_FIXED_CODES from DEBUG flag
- Reject weak SECRET_KEY in production
- Add host check on /_next/* proxy routes
- Use explicit localhost CORS origins in debug mode
- Replace err.Error() with generic messages in all admin error responses
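
A minimal sketch of the io.LimitReader cap on webhook bodies referenced above; the package name, helper name, and Echo handler shape are illustrative, not the actual code:

package webhooks

import (
    "io"

    "github.com/labstack/echo/v4"
)

const maxWebhookBody = 1 << 20 // 1MB

// readWebhookBody caps how much of the request body is read, so a
// malicious or misbehaving caller cannot stream an unbounded payload
// into memory before signature verification even starts.
func readWebhookBody(c echo.Context) ([]byte, error) {
    return io.ReadAll(io.LimitReader(c.Request().Body, maxWebhookBody))
}

Note that io.LimitReader truncates silently at the cap; if oversized bodies should be rejected outright instead, http.MaxBytesReader is the usual alternative.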
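
And a sketch of the content-sniffing upload validation; the allowed type set here is an assumption:

package uploads

import (
    "fmt"
    "net/http"
)

// validateUpload inspects the first 512 bytes of the file itself via
// http.DetectContentType, so the client-supplied Content-Type header is
// never trusted.
func validateUpload(data []byte) error {
    switch ct := http.DetectContentType(data); ct {
    case "image/jpeg", "image/png", "image/webp":
        return nil
    default:
        return fmt.Errorf("unsupported content type %q", ct)
    }
}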

Critical fixes:
- Rewrite FCM to HTTP v1 API with OAuth 2.0 service account auth
- Fix user_customuser -> auth_user table names in raw SQL
- Fix dashboard verified query to use UserProfile model
- Add escapeLikeWildcards() to prevent SQL wildcard injection (sketch below)
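
The commit does not show the body of escapeLikeWildcards(); a plausible minimal version escapes the two LIKE metacharacters plus the escape character itself before user input reaches a pattern:

package repository

import "strings"

// escapeLikeWildcards neutralizes % and _ so user input cannot act as a
// wildcard inside a LIKE pattern.
func escapeLikeWildcards(s string) string {
    return strings.NewReplacer(`\`, `\\`, `%`, `\%`, `_`, `\_`).Replace(s)
}

// Illustrative usage with a parameterized query:
//   db.Where("title LIKE ?", "%"+escapeLikeWildcards(q)+"%")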

Bug fixes:
- Add bounds checks for days/expiring_soon query params (1-3650)
- Add receipt_data/transaction_id empty-check to RestoreSubscription
- Change Active bool -> *bool in device handler (see the sketch after this list)
- Check all unchecked GORM/FindByIDWithProfile errors
- Add validation for notification hour fields (0-23)
- Add max=10000 validation on task description updates
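
The bool -> *bool change matters because of how encoding/json handles absent fields; the request and model names below are assumptions:

// With a plain bool, {"active": false} and an omitted field both decode
// to false, so a client could never explicitly deactivate a device. A
// pointer keeps the tri-state: nil = not sent, true/false = explicit.
type Device struct {
    Active bool
}

type UpdateDeviceRequest struct {
    Active *bool `json:"active"`
}

func applyUpdate(req UpdateDeviceRequest, device *Device) {
    if req.Active != nil {
        device.Active = *req.Active
    }
}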

Transactions & data integrity:
- Wrap registration flow in transaction (pattern sketched after this list)
- Wrap QuickComplete in transaction
- Move image creation inside completion transaction
- Wrap SetSpecialties in transaction
- Wrap GetOrCreateToken in transaction
- Wrap completion+image deletion in transaction
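
All of these follow the same GORM shape; the registration flow is sketched below with simplified models (the field sets are assumptions):

import "gorm.io/gorm"

type User struct {
    ID    uint
    Email string
}

type UserProfile struct {
    ID     uint
    UserID uint
}

func register(db *gorm.DB, email string) error {
    // Everything inside the closure commits together; any returned error
    // rolls the whole flow back, so a user row can never persist without
    // its profile row.
    return db.Transaction(func(tx *gorm.DB) error {
        user := User{Email: email}
        if err := tx.Create(&user).Error; err != nil {
            return err
        }
        return tx.Create(&UserProfile{UserID: user.ID}).Error
    })
}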

Performance:
- Batch completion summaries (2 queries vs 2N)
- Reuse single http.Client in IAP validation
- Cache dashboard counts (30s TTL)
- Batch COUNT queries in admin user list
- Add Limit(500) to document queries
- Add reminder_stage+due_date filters to reminder queries
- Parse AllowedTypes once at init
- In-memory user cache in auth middleware (30s TTL)
- Timezone change detection cache
- Optimize P95 with per-endpoint sorted buffers (diff shown below)
- Replace crypto/md5 with hash/fnv for ETags (see the sketch after this list)
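
The md5 -> fnv swap trades cryptographic strength, which an ETag does not need, for speed; a sketch, with the helper name assumed:

import (
    "fmt"
    "hash/fnv"
)

// etagFor derives a cheap validator from the response body. FNV-1a is
// not collision-resistant, but an ETag only needs fast change detection,
// not cryptographic integrity.
func etagFor(body []byte) string {
    h := fnv.New64a()
    h.Write(body) // hash.Hash writes never return an error
    return fmt.Sprintf(`"%x"`, h.Sum64())
}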

Code quality:
- Add sync.Once to all monitoring Stop()/Close() methods (diffs shown below)
- Replace 8 fmt.Printf calls with zerolog in auth service
- Log previously discarded errors
- Standardize delete response shapes
- Route hardcoded English through i18n
- Remove FileURL from DocumentResponse (keep MediaURL only)
- Thread user timezone through kanban board responses
- Initialize empty slices to prevent null JSON (see the sketch after this list)
- Extract shared field map for task Update/UpdateTx
- Delete unused SoftDeleteModel, min(), formatCron, legacy handlers
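
The null-JSON fix comes down to encoding/json's nil-vs-empty distinction, which is easy to verify:

import (
    "encoding/json"
    "fmt"
)

func main() {
    var nilTasks []int           // nil slice
    emptyTasks := make([]int, 0) // empty, non-nil slice
    a, _ := json.Marshal(nilTasks)
    b, _ := json.Marshal(emptyTasks)
    fmt.Println(string(a), string(b)) // prints: null []
}

Clients that expect an array break on null, so response slices are initialized with make(..., 0) rather than left nil.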

Worker & jobs:
- Wire Asynq email infrastructure into worker
- Register HandleReminderLogCleanup with daily 3AM cron
- Use per-user timezone in HandleSmartReminder (see the sketch after this list)
- Replace direct DB queries with repository calls
- Delete legacy reminder handlers (~200 lines)
- Delete unused task type constants
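
A sketch of the per-user timezone logic in HandleSmartReminder; the field names and the UTC fallback are assumptions:

import "time"

type User struct {
    Timezone         string // IANA name, e.g. "America/Chicago"
    NotificationHour int    // 0-23, validated per the bug fixes above
}

func reminderDueNow(u User, now time.Time) bool {
    loc, err := time.LoadLocation(u.Timezone)
    if err != nil {
        loc = time.UTC // fall back rather than fail the whole job
    }
    // Compare against the user's clock, not the server's: 09:00 in
    // America/Chicago is not 09:00 UTC.
    return now.In(loc).Hour() == u.NotificationHour
}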

Dependencies:
- Replace archived jung-kurt/gofpdf with go-pdf/fpdf (see the sketch after this list)
- Replace unmaintained gomail.v2 with wneessen/go-mail
- Add TODO for Echo jwt v3 transitive dep removal
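
go-pdf/fpdf is an API-compatible fork of the archived library, so the migration is essentially an import-path swap; the document content below is illustrative:

import "github.com/go-pdf/fpdf" // was: github.com/jung-kurt/gofpdf

func buildSamplePDF(path string) error {
    pdf := fpdf.New("P", "mm", "A4", "")
    pdf.AddPage()
    pdf.SetFont("Helvetica", "", 12)
    pdf.Cell(40, 10, "hello")
    return pdf.OutputFileAndClose(path)
}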

Test infrastructure:
- Fix MakeRequest/SeedLookupData error handling
- Replace os.Exit(0) with t.Skip() in scope/consistency tests (see the sketch after this list)
- Add 11 new FCM v1 tests
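
On the os.Exit(0) -> t.Skip() change: os.Exit aborts the entire test binary, silently dropping every remaining test and bypassing deferred cleanup, while t.Skip marks only that test as skipped. A sketch with an assumed gate condition:

import (
    "os"
    "testing"
)

func TestScopeConsistency(t *testing.T) {
    if os.Getenv("RUN_SCOPE_TESTS") == "" { // gate is illustrative
        t.Skip("scope consistency tests disabled in this environment")
    }
    // ...
}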

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Author: Trey t
Date:   2026-03-18 23:14:13 -05:00
Parent: 3b86d0aae1
Commit: 42a5533a56
95 changed files with 2892 additions and 1783 deletions

View File

@@ -2,6 +2,7 @@ package monitoring
 import (
     "runtime"
+    "sync"
     "time"

     "github.com/hibiken/asynq"
@@ -20,6 +21,7 @@ type Collector struct {
     httpCollector *HTTPStatsCollector // nil for worker
     asynqClient   *asynq.Inspector    // nil for api
     stopChan      chan struct{}
+    stopOnce      sync.Once
 }

 // NewCollector creates a new stats collector
@@ -193,7 +195,9 @@ func (c *Collector) publishStats() {
     }
 }

-// Stop stops the stats publishing
+// Stop stops the stats publishing. It is safe to call multiple times.
 func (c *Collector) Stop() {
-    close(c.stopChan)
+    c.stopOnce.Do(func() {
+        close(c.stopChan)
+    })
 }

View File

@@ -150,7 +150,10 @@ func (h *Handler) WebSocket(c echo.Context) error {
     defer statsTicker.Stop()

     // Send initial stats
-    h.sendStats(conn, &wsMu)
+    if err := h.sendStats(conn, &wsMu); err != nil {
+        cancel()
+        return nil
+    }

     for {
         select {
@@ -173,11 +176,16 @@ func (h *Handler) WebSocket(c echo.Context) error {
             if err != nil {
                 log.Debug().Err(err).Msg("WebSocket write error")
                 cancel()
+                return nil
             }
         case <-statsTicker.C:
             // Send periodic stats update
-            h.sendStats(conn, &wsMu)
+            if err := h.sendStats(conn, &wsMu); err != nil {
+                cancel()
+                return nil
+            }
         case <-ctx.Done():
             return nil
@@ -185,9 +193,11 @@ func (h *Handler) WebSocket(c echo.Context) error {
         }
     }
 }

-func (h *Handler) sendStats(conn *websocket.Conn, mu *sync.Mutex) {
+func (h *Handler) sendStats(conn *websocket.Conn, mu *sync.Mutex) error {
     allStats, err := h.statsStore.GetAllStats()
     if err != nil {
         log.Error().Err(err).Msg("failed to send stats")
+        return err
     }

     wsMsg := WSMessage{
@@ -196,6 +206,12 @@ func (h *Handler) sendStats(conn *websocket.Conn, mu *sync.Mutex) {
     }

     mu.Lock()
-    conn.WriteJSON(wsMsg)
+    err = conn.WriteJSON(wsMsg)
     mu.Unlock()
+
+    if err != nil {
+        log.Debug().Err(err).Msg("WebSocket write error sending stats")
+    }
+    return err
 }

View File

@@ -10,39 +10,33 @@ import (
 // HTTPStatsCollector collects HTTP request metrics
 type HTTPStatsCollector struct {
-    mu           sync.RWMutex
-    requests     map[string]int64         // endpoint -> count
-    totalLatency map[string]time.Duration // endpoint -> total latency
-    errors       map[string]int64         // endpoint -> error count
-    byStatus     map[int]int64            // status code -> count
-    latencies    []latencySample          // recent latency samples for P95
-    startTime    time.Time
-    lastReset    time.Time
-}
-
-type latencySample struct {
-    endpoint  string
-    latency   time.Duration
-    timestamp time.Time
+    mu                sync.RWMutex
+    requests          map[string]int64           // endpoint -> count
+    totalLatency      map[string]time.Duration   // endpoint -> total latency
+    errors            map[string]int64           // endpoint -> error count
+    byStatus          map[int]int64              // status code -> count
+    endpointLatencies map[string][]time.Duration // per-endpoint sorted latency buffers for P95
+    startTime         time.Time
+    lastReset         time.Time
 }

 const (
-    maxLatencySamples = 1000
-    maxEndpoints      = 200           // Cap unique endpoints tracked
-    statsResetPeriod  = 1 * time.Hour // Reset stats periodically to prevent unbounded growth
+    maxLatencySamplesPerEndpoint = 200           // Max latency samples kept per endpoint
+    maxEndpoints                 = 200           // Cap unique endpoints tracked
+    statsResetPeriod             = 1 * time.Hour // Reset stats periodically to prevent unbounded growth
 )

 // NewHTTPStatsCollector creates a new HTTP stats collector
 func NewHTTPStatsCollector() *HTTPStatsCollector {
     now := time.Now()
     return &HTTPStatsCollector{
-        requests:     make(map[string]int64),
-        totalLatency: make(map[string]time.Duration),
-        errors:       make(map[string]int64),
-        byStatus:     make(map[int]int64),
-        latencies:    make([]latencySample, 0, maxLatencySamples),
-        startTime:    now,
-        lastReset:    now,
+        requests:          make(map[string]int64),
+        totalLatency:      make(map[string]time.Duration),
+        errors:            make(map[string]int64),
+        byStatus:          make(map[int]int64),
+        endpointLatencies: make(map[string][]time.Duration),
+        startTime:         now,
+        lastReset:         now,
     }
 }
@@ -70,17 +64,22 @@ func (c *HTTPStatsCollector) Record(endpoint string, latency time.Duration, stat
         c.errors[endpoint]++
     }

-    // Store latency sample
-    c.latencies = append(c.latencies, latencySample{
-        endpoint:  endpoint,
-        latency:   latency,
-        timestamp: time.Now(),
-    })
+    // Insert latency into per-endpoint sorted buffer using binary search
+    buf := c.endpointLatencies[endpoint]
+    idx := sort.Search(len(buf), func(i int) bool {
+        return buf[i] >= latency
+    })
+    buf = append(buf, 0)
+    copy(buf[idx+1:], buf[idx:])
+    buf[idx] = latency

-    // Keep only recent samples
-    if len(c.latencies) > maxLatencySamples {
-        c.latencies = c.latencies[len(c.latencies)-maxLatencySamples:]
+    // Trim to max samples per endpoint by removing the median element
+    // to preserve distribution tails (important for P95 accuracy)
+    if len(buf) > maxLatencySamplesPerEndpoint {
+        mid := len(buf) / 2
+        buf = append(buf[:mid], buf[mid+1:]...)
     }
+    c.endpointLatencies[endpoint] = buf
 }

 // resetLocked resets stats while holding the lock
@@ -89,7 +88,7 @@ func (c *HTTPStatsCollector) resetLocked() {
     c.totalLatency = make(map[string]time.Duration)
     c.errors = make(map[string]int64)
     c.byStatus = make(map[int]int64)
-    c.latencies = make([]latencySample, 0, maxLatencySamples)
+    c.endpointLatencies = make(map[string][]time.Duration)
     c.lastReset = time.Now()
     // Keep startTime for uptime calculation
 }
@@ -147,33 +146,23 @@ func (c *HTTPStatsCollector) GetStats() HTTPStats {
     return stats
 }

-// calculateP95 calculates the 95th percentile latency for an endpoint
-// Must be called with read lock held
+// calculateP95 calculates the 95th percentile latency for an endpoint.
+// The per-endpoint buffer is maintained in sorted order during insertion,
+// so this is an O(1) index lookup.
+// Must be called with read lock held.
 func (c *HTTPStatsCollector) calculateP95(endpoint string) float64 {
-    var endpointLatencies []time.Duration
-    for _, sample := range c.latencies {
-        if sample.endpoint == endpoint {
-            endpointLatencies = append(endpointLatencies, sample.latency)
-        }
-    }
-
-    if len(endpointLatencies) == 0 {
+    buf := c.endpointLatencies[endpoint]
+    if len(buf) == 0 {
         return 0
     }

-    // Sort latencies
-    sort.Slice(endpointLatencies, func(i, j int) bool {
-        return endpointLatencies[i] < endpointLatencies[j]
-    })
-
-    // Calculate P95 index
-    p95Index := int(float64(len(endpointLatencies)) * 0.95)
-    if p95Index >= len(endpointLatencies) {
-        p95Index = len(endpointLatencies) - 1
+    // Buffer is already sorted; direct index lookup
+    p95Index := int(float64(len(buf)) * 0.95)
+    if p95Index >= len(buf) {
+        p95Index = len(buf) - 1
     }

-    return float64(endpointLatencies[p95Index].Milliseconds())
+    return float64(buf[p95Index].Milliseconds())
 }

 // Reset clears all collected stats
@@ -185,7 +174,7 @@ func (c *HTTPStatsCollector) Reset() {
     c.totalLatency = make(map[string]time.Duration)
     c.errors = make(map[string]int64)
     c.byStatus = make(map[int]int64)
-    c.latencies = make([]latencySample, 0, maxLatencySamples)
+    c.endpointLatencies = make(map[string][]time.Duration)
     c.startTime = time.Now()
 }

View File

@@ -2,6 +2,7 @@ package monitoring
 import (
     "io"
+    "sync"
     "time"

     "github.com/hibiken/asynq"
@@ -31,6 +32,8 @@ type Service struct {
     logWriter      *RedisLogWriter
     db             *gorm.DB
     settingsStopCh chan struct{}
+    stopOnce       sync.Once
+    statsInterval  time.Duration
 }

 // Config holds configuration for the monitoring service
@@ -71,6 +74,7 @@ func NewService(cfg Config) *Service {
         logWriter:      logWriter,
         db:             cfg.DB,
         settingsStopCh: make(chan struct{}),
+        statsInterval:  cfg.StatsInterval,
     }

     // Check initial setting from database
@@ -90,11 +94,11 @@ func (s *Service) SetAsynqInspector(inspector *asynq.Inspector) {
 func (s *Service) Start() {
     log.Info().
         Str("process", s.process).
-        Dur("interval", DefaultStatsInterval).
+        Dur("interval", s.statsInterval).
         Bool("enabled", s.logWriter.IsEnabled()).
         Msg("Starting monitoring service")

-    s.collector.StartPublishing(DefaultStatsInterval)
+    s.collector.StartPublishing(s.statsInterval)

     // Start settings sync if database is available
     if s.db != nil {
@@ -102,17 +106,19 @@
     }
 }

-// Stop stops the monitoring service
+// Stop stops the monitoring service. It is safe to call multiple times.
 func (s *Service) Stop() {
-    // Stop settings sync
-    close(s.settingsStopCh)
-
-    s.collector.Stop()
-
-    // Flush and close the log writer's background goroutine
-    s.logWriter.Close()
-
-    log.Info().Str("process", s.process).Msg("Monitoring service stopped")
+    s.stopOnce.Do(func() {
+        // Stop settings sync
+        close(s.settingsStopCh)
+
+        s.collector.Stop()
+
+        // Flush and close the log writer's background goroutine
+        s.logWriter.Close()
+
+        log.Info().Str("process", s.process).Msg("Monitoring service stopped")
+    })
 }

 // syncSettingsFromDB checks the database for the enable_monitoring setting
// syncSettingsFromDB checks the database for the enable_monitoring setting

View File

@@ -2,6 +2,7 @@ package monitoring
 import (
     "encoding/json"
+    "sync"
     "sync/atomic"
     "time"
@@ -18,11 +19,12 @@
 // It uses a single background goroutine with a buffered channel instead of
 // spawning a new goroutine per log line, preventing unbounded goroutine growth.
 type RedisLogWriter struct {
-    buffer  *LogBuffer
-    process string
-    enabled atomic.Bool
-    ch      chan LogEntry
-    done    chan struct{}
+    buffer    *LogBuffer
+    process   string
+    enabled   atomic.Bool
+    ch        chan LogEntry
+    done      chan struct{}
+    closeOnce sync.Once
 }

 // NewRedisLogWriter creates a new writer that captures logs to Redis.
@@ -53,9 +55,12 @@ func (w *RedisLogWriter) drainLoop() {
 // Close shuts down the background goroutine. It should be called during
 // graceful shutdown to ensure all buffered entries are flushed.
+// It is safe to call multiple times.
 func (w *RedisLogWriter) Close() {
-    close(w.ch)
-    <-w.done // Wait for drain to finish
+    w.closeOnce.Do(func() {
+        close(w.ch)
+        <-w.done // Wait for drain to finish
+    })
 }

 // SetEnabled enables or disables log capture to Redis