Add real-time log monitoring and system stats dashboard
Implements a comprehensive monitoring system for the admin interface: Backend: - New monitoring package with Redis ring buffer for log storage - Zerolog MultiWriter to capture logs to Redis - System stats collection (CPU, memory, disk, goroutines, GC) - HTTP metrics middleware (request counts, latency, error rates) - Asynq queue stats for worker process - WebSocket endpoint for real-time log streaming - Admin auth middleware now accepts token in query params (for WebSocket) Frontend: - New monitoring page with tabs (Overview, Logs, API Stats, Worker Stats) - Real-time log viewer with level filtering and search - System stats cards showing CPU, memory, goroutines, uptime - HTTP endpoint statistics table - Asynq queue depth visualization - Enable/disable monitoring toggle in settings Memory safeguards: - Max 200 unique endpoints tracked - Hourly stats reset to prevent unbounded growth - Max 1000 log entries in ring buffer - Max 1000 latency samples for P95 calculation 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
215
internal/monitoring/middleware.go
Normal file
215
internal/monitoring/middleware.go
Normal file
@@ -0,0 +1,215 @@
|
||||
package monitoring
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
// HTTPStatsCollector collects HTTP request metrics.
//
// All fields are guarded by mu. The per-endpoint maps and the latency
// sample buffer are wiped every statsResetPeriod (see Record/resetLocked)
// so memory stays bounded regardless of traffic shape. Contains a mutex,
// so the struct must not be copied once in use — always pass *HTTPStatsCollector.
type HTTPStatsCollector struct {
	mu           sync.RWMutex
	requests     map[string]int64         // endpoint -> count
	totalLatency map[string]time.Duration // endpoint -> total latency
	errors       map[string]int64         // endpoint -> error count (status >= 400)
	byStatus     map[int]int64            // status code -> count
	latencies    []latencySample          // recent latency samples for P95
	startTime    time.Time                // creation time; survives periodic resets (uptime)
	lastReset    time.Time                // time of the most recent stats reset
}
|
||||
|
||||
// latencySample is a single observed request latency, tagged with its
// endpoint key so per-endpoint percentiles can be computed from the
// shared buffer of recent samples.
type latencySample struct {
	endpoint  string        // "METHOD path" key, possibly the "OTHER" overflow bucket
	latency   time.Duration // observed request duration
	timestamp time.Time     // when the sample was recorded
}
|
||||
|
||||
// Memory safeguards: these caps keep the collector's footprint bounded
// no matter how long the process runs or how varied the traffic is.
const (
	maxLatencySamples = 1000          // size of the recent-sample buffer used for P95
	maxEndpoints      = 200           // cap unique endpoints tracked; overflow folds into "OTHER"
	statsResetPeriod  = 1 * time.Hour // reset stats periodically to prevent unbounded growth
)
|
||||
|
||||
// NewHTTPStatsCollector creates a new HTTP stats collector
|
||||
func NewHTTPStatsCollector() *HTTPStatsCollector {
|
||||
now := time.Now()
|
||||
return &HTTPStatsCollector{
|
||||
requests: make(map[string]int64),
|
||||
totalLatency: make(map[string]time.Duration),
|
||||
errors: make(map[string]int64),
|
||||
byStatus: make(map[int]int64),
|
||||
latencies: make([]latencySample, 0, maxLatencySamples),
|
||||
startTime: now,
|
||||
lastReset: now,
|
||||
}
|
||||
}
|
||||
|
||||
// Record records a single HTTP request
|
||||
func (c *HTTPStatsCollector) Record(endpoint string, latency time.Duration, status int) {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
|
||||
// Periodically reset to prevent unbounded memory growth
|
||||
if time.Since(c.lastReset) > statsResetPeriod {
|
||||
c.resetLocked()
|
||||
}
|
||||
|
||||
// Check if we've hit the endpoint limit and this is a new endpoint
|
||||
if _, exists := c.requests[endpoint]; !exists && len(c.requests) >= maxEndpoints {
|
||||
// Use a catch-all bucket for overflow endpoints
|
||||
endpoint = "OTHER"
|
||||
}
|
||||
|
||||
c.requests[endpoint]++
|
||||
c.totalLatency[endpoint] += latency
|
||||
c.byStatus[status]++
|
||||
|
||||
if status >= 400 {
|
||||
c.errors[endpoint]++
|
||||
}
|
||||
|
||||
// Store latency sample
|
||||
c.latencies = append(c.latencies, latencySample{
|
||||
endpoint: endpoint,
|
||||
latency: latency,
|
||||
timestamp: time.Now(),
|
||||
})
|
||||
|
||||
// Keep only recent samples
|
||||
if len(c.latencies) > maxLatencySamples {
|
||||
c.latencies = c.latencies[len(c.latencies)-maxLatencySamples:]
|
||||
}
|
||||
}
|
||||
|
||||
// resetLocked resets stats while holding the lock
|
||||
func (c *HTTPStatsCollector) resetLocked() {
|
||||
c.requests = make(map[string]int64)
|
||||
c.totalLatency = make(map[string]time.Duration)
|
||||
c.errors = make(map[string]int64)
|
||||
c.byStatus = make(map[int]int64)
|
||||
c.latencies = make([]latencySample, 0, maxLatencySamples)
|
||||
c.lastReset = time.Now()
|
||||
// Keep startTime for uptime calculation
|
||||
}
|
||||
|
||||
// GetStats returns the current HTTP statistics
|
||||
func (c *HTTPStatsCollector) GetStats() HTTPStats {
|
||||
c.mu.RLock()
|
||||
defer c.mu.RUnlock()
|
||||
|
||||
stats := HTTPStats{
|
||||
ByEndpoint: make(map[string]EndpointStats),
|
||||
ByStatusCode: make(map[int]int64),
|
||||
}
|
||||
|
||||
var totalRequests int64
|
||||
var totalErrors int64
|
||||
var totalLatency time.Duration
|
||||
|
||||
for endpoint, count := range c.requests {
|
||||
avgLatency := c.totalLatency[endpoint] / time.Duration(count)
|
||||
errCount := c.errors[endpoint]
|
||||
errRate := float64(0)
|
||||
if count > 0 {
|
||||
errRate = float64(errCount) / float64(count)
|
||||
}
|
||||
|
||||
stats.ByEndpoint[endpoint] = EndpointStats{
|
||||
Count: count,
|
||||
AvgLatencyMs: float64(avgLatency.Milliseconds()),
|
||||
ErrorRate: errRate,
|
||||
P95LatencyMs: c.calculateP95(endpoint),
|
||||
}
|
||||
|
||||
totalRequests += count
|
||||
totalErrors += errCount
|
||||
totalLatency += c.totalLatency[endpoint]
|
||||
}
|
||||
|
||||
// Copy status code counts
|
||||
for status, count := range c.byStatus {
|
||||
stats.ByStatusCode[status] = count
|
||||
}
|
||||
|
||||
stats.RequestsTotal = totalRequests
|
||||
if totalRequests > 0 {
|
||||
stats.AvgLatencyMs = float64(totalLatency.Milliseconds()) / float64(totalRequests)
|
||||
stats.ErrorRate = float64(totalErrors) / float64(totalRequests)
|
||||
}
|
||||
|
||||
uptime := time.Since(c.startTime).Minutes()
|
||||
if uptime > 0 {
|
||||
stats.RequestsPerMinute = float64(totalRequests) / uptime
|
||||
}
|
||||
|
||||
return stats
|
||||
}
|
||||
|
||||
// calculateP95 calculates the 95th percentile latency for an endpoint
|
||||
// Must be called with read lock held
|
||||
func (c *HTTPStatsCollector) calculateP95(endpoint string) float64 {
|
||||
var endpointLatencies []time.Duration
|
||||
|
||||
for _, sample := range c.latencies {
|
||||
if sample.endpoint == endpoint {
|
||||
endpointLatencies = append(endpointLatencies, sample.latency)
|
||||
}
|
||||
}
|
||||
|
||||
if len(endpointLatencies) == 0 {
|
||||
return 0
|
||||
}
|
||||
|
||||
// Sort latencies
|
||||
sort.Slice(endpointLatencies, func(i, j int) bool {
|
||||
return endpointLatencies[i] < endpointLatencies[j]
|
||||
})
|
||||
|
||||
// Calculate P95 index
|
||||
p95Index := int(float64(len(endpointLatencies)) * 0.95)
|
||||
if p95Index >= len(endpointLatencies) {
|
||||
p95Index = len(endpointLatencies) - 1
|
||||
}
|
||||
|
||||
return float64(endpointLatencies[p95Index].Milliseconds())
|
||||
}
|
||||
|
||||
// Reset clears all collected stats
|
||||
func (c *HTTPStatsCollector) Reset() {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
|
||||
c.requests = make(map[string]int64)
|
||||
c.totalLatency = make(map[string]time.Duration)
|
||||
c.errors = make(map[string]int64)
|
||||
c.byStatus = make(map[int]int64)
|
||||
c.latencies = make([]latencySample, 0, maxLatencySamples)
|
||||
c.startTime = time.Now()
|
||||
}
|
||||
|
||||
// MetricsMiddleware returns a Gin middleware that collects request metrics
|
||||
func MetricsMiddleware(collector *HTTPStatsCollector) gin.HandlerFunc {
|
||||
return func(c *gin.Context) {
|
||||
start := time.Now()
|
||||
|
||||
// Process request
|
||||
c.Next()
|
||||
|
||||
// Calculate latency
|
||||
latency := time.Since(start)
|
||||
|
||||
// Get endpoint pattern (use route path, fallback to actual path)
|
||||
endpoint := c.FullPath()
|
||||
if endpoint == "" {
|
||||
endpoint = c.Request.URL.Path
|
||||
}
|
||||
|
||||
// Combine method with path for unique endpoint identification
|
||||
endpoint = c.Request.Method + " " + endpoint
|
||||
|
||||
// Record metrics
|
||||
collector.Record(endpoint, latency, c.Writer.Status())
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user