Add real-time log monitoring and system stats dashboard
Implements a comprehensive monitoring system for the admin interface: Backend: - New monitoring package with Redis ring buffer for log storage - Zerolog MultiWriter to capture logs to Redis - System stats collection (CPU, memory, disk, goroutines, GC) - HTTP metrics middleware (request counts, latency, error rates) - Asynq queue stats for worker process - WebSocket endpoint for real-time log streaming - Admin auth middleware now accepts token in query params (for WebSocket) Frontend: - New monitoring page with tabs (Overview, Logs, API Stats, Worker Stats) - Real-time log viewer with level filtering and search - System stats cards showing CPU, memory, goroutines, uptime - HTTP endpoint statistics table - Asynq queue depth visualization - Enable/disable monitoring toggle in settings Memory safeguards: - Max 200 unique endpoints tracked - Hourly stats reset to prevent unbounded growth - Max 1000 log entries in ring buffer - Max 1000 latency samples for P95 calculation 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
215
internal/monitoring/middleware.go
Normal file
215
internal/monitoring/middleware.go
Normal file
@@ -0,0 +1,215 @@
|
||||
package monitoring
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
// HTTPStatsCollector collects HTTP request metrics.
//
// All fields are guarded by mu. The per-endpoint maps and the latency
// sample buffer are wiped every statsResetPeriod (see Record/resetLocked)
// so memory stays bounded regardless of traffic shape. Contains a mutex,
// so the struct must not be copied once in use — always pass *HTTPStatsCollector.
type HTTPStatsCollector struct {
	mu           sync.RWMutex
	requests     map[string]int64         // endpoint -> count
	totalLatency map[string]time.Duration // endpoint -> total latency
	errors       map[string]int64         // endpoint -> error count (status >= 400)
	byStatus     map[int]int64            // status code -> count
	latencies    []latencySample          // recent latency samples for P95
	startTime    time.Time                // creation time; survives periodic resets (uptime)
	lastReset    time.Time                // time of the most recent stats reset
}
|
||||
|
||||
// latencySample is a single observed request latency, tagged with its
// endpoint key so per-endpoint percentiles can be computed from the
// shared buffer of recent samples.
type latencySample struct {
	endpoint  string        // "METHOD path" key, possibly the "OTHER" overflow bucket
	latency   time.Duration // observed request duration
	timestamp time.Time     // when the sample was recorded
}
|
||||
|
||||
// Memory safeguards: these caps keep the collector's footprint bounded
// no matter how long the process runs or how varied the traffic is.
const (
	maxLatencySamples = 1000          // size of the recent-sample buffer used for P95
	maxEndpoints      = 200           // cap unique endpoints tracked; overflow folds into "OTHER"
	statsResetPeriod  = 1 * time.Hour // reset stats periodically to prevent unbounded growth
)
|
||||
|
||||
// NewHTTPStatsCollector creates a new HTTP stats collector
|
||||
func NewHTTPStatsCollector() *HTTPStatsCollector {
|
||||
now := time.Now()
|
||||
return &HTTPStatsCollector{
|
||||
requests: make(map[string]int64),
|
||||
totalLatency: make(map[string]time.Duration),
|
||||
errors: make(map[string]int64),
|
||||
byStatus: make(map[int]int64),
|
||||
latencies: make([]latencySample, 0, maxLatencySamples),
|
||||
startTime: now,
|
||||
lastReset: now,
|
||||
}
|
||||
}
|
||||
|
||||
// Record records a single HTTP request
|
||||
func (c *HTTPStatsCollector) Record(endpoint string, latency time.Duration, status int) {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
|
||||
// Periodically reset to prevent unbounded memory growth
|
||||
if time.Since(c.lastReset) > statsResetPeriod {
|
||||
c.resetLocked()
|
||||
}
|
||||
|
||||
// Check if we've hit the endpoint limit and this is a new endpoint
|
||||
if _, exists := c.requests[endpoint]; !exists && len(c.requests) >= maxEndpoints {
|
||||
// Use a catch-all bucket for overflow endpoints
|
||||
endpoint = "OTHER"
|
||||
}
|
||||
|
||||
c.requests[endpoint]++
|
||||
c.totalLatency[endpoint] += latency
|
||||
c.byStatus[status]++
|
||||
|
||||
if status >= 400 {
|
||||
c.errors[endpoint]++
|
||||
}
|
||||
|
||||
// Store latency sample
|
||||
c.latencies = append(c.latencies, latencySample{
|
||||
endpoint: endpoint,
|
||||
latency: latency,
|
||||
timestamp: time.Now(),
|
||||
})
|
||||
|
||||
// Keep only recent samples
|
||||
if len(c.latencies) > maxLatencySamples {
|
||||
c.latencies = c.latencies[len(c.latencies)-maxLatencySamples:]
|
||||
}
|
||||
}
|
||||
|
||||
// resetLocked resets stats while holding the lock
|
||||
func (c *HTTPStatsCollector) resetLocked() {
|
||||
c.requests = make(map[string]int64)
|
||||
c.totalLatency = make(map[string]time.Duration)
|
||||
c.errors = make(map[string]int64)
|
||||
c.byStatus = make(map[int]int64)
|
||||
c.latencies = make([]latencySample, 0, maxLatencySamples)
|
||||
c.lastReset = time.Now()
|
||||
// Keep startTime for uptime calculation
|
||||
}
|
||||
|
||||
// GetStats returns the current HTTP statistics
|
||||
func (c *HTTPStatsCollector) GetStats() HTTPStats {
|
||||
c.mu.RLock()
|
||||
defer c.mu.RUnlock()
|
||||
|
||||
stats := HTTPStats{
|
||||
ByEndpoint: make(map[string]EndpointStats),
|
||||
ByStatusCode: make(map[int]int64),
|
||||
}
|
||||
|
||||
var totalRequests int64
|
||||
var totalErrors int64
|
||||
var totalLatency time.Duration
|
||||
|
||||
for endpoint, count := range c.requests {
|
||||
avgLatency := c.totalLatency[endpoint] / time.Duration(count)
|
||||
errCount := c.errors[endpoint]
|
||||
errRate := float64(0)
|
||||
if count > 0 {
|
||||
errRate = float64(errCount) / float64(count)
|
||||
}
|
||||
|
||||
stats.ByEndpoint[endpoint] = EndpointStats{
|
||||
Count: count,
|
||||
AvgLatencyMs: float64(avgLatency.Milliseconds()),
|
||||
ErrorRate: errRate,
|
||||
P95LatencyMs: c.calculateP95(endpoint),
|
||||
}
|
||||
|
||||
totalRequests += count
|
||||
totalErrors += errCount
|
||||
totalLatency += c.totalLatency[endpoint]
|
||||
}
|
||||
|
||||
// Copy status code counts
|
||||
for status, count := range c.byStatus {
|
||||
stats.ByStatusCode[status] = count
|
||||
}
|
||||
|
||||
stats.RequestsTotal = totalRequests
|
||||
if totalRequests > 0 {
|
||||
stats.AvgLatencyMs = float64(totalLatency.Milliseconds()) / float64(totalRequests)
|
||||
stats.ErrorRate = float64(totalErrors) / float64(totalRequests)
|
||||
}
|
||||
|
||||
uptime := time.Since(c.startTime).Minutes()
|
||||
if uptime > 0 {
|
||||
stats.RequestsPerMinute = float64(totalRequests) / uptime
|
||||
}
|
||||
|
||||
return stats
|
||||
}
|
||||
|
||||
// calculateP95 calculates the 95th percentile latency for an endpoint
|
||||
// Must be called with read lock held
|
||||
func (c *HTTPStatsCollector) calculateP95(endpoint string) float64 {
|
||||
var endpointLatencies []time.Duration
|
||||
|
||||
for _, sample := range c.latencies {
|
||||
if sample.endpoint == endpoint {
|
||||
endpointLatencies = append(endpointLatencies, sample.latency)
|
||||
}
|
||||
}
|
||||
|
||||
if len(endpointLatencies) == 0 {
|
||||
return 0
|
||||
}
|
||||
|
||||
// Sort latencies
|
||||
sort.Slice(endpointLatencies, func(i, j int) bool {
|
||||
return endpointLatencies[i] < endpointLatencies[j]
|
||||
})
|
||||
|
||||
// Calculate P95 index
|
||||
p95Index := int(float64(len(endpointLatencies)) * 0.95)
|
||||
if p95Index >= len(endpointLatencies) {
|
||||
p95Index = len(endpointLatencies) - 1
|
||||
}
|
||||
|
||||
return float64(endpointLatencies[p95Index].Milliseconds())
|
||||
}
|
||||
|
||||
// Reset clears all collected stats
|
||||
func (c *HTTPStatsCollector) Reset() {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
|
||||
c.requests = make(map[string]int64)
|
||||
c.totalLatency = make(map[string]time.Duration)
|
||||
c.errors = make(map[string]int64)
|
||||
c.byStatus = make(map[int]int64)
|
||||
c.latencies = make([]latencySample, 0, maxLatencySamples)
|
||||
c.startTime = time.Now()
|
||||
}
|
||||
|
||||
// MetricsMiddleware returns a Gin middleware that collects request metrics
|
||||
func MetricsMiddleware(collector *HTTPStatsCollector) gin.HandlerFunc {
|
||||
return func(c *gin.Context) {
|
||||
start := time.Now()
|
||||
|
||||
// Process request
|
||||
c.Next()
|
||||
|
||||
// Calculate latency
|
||||
latency := time.Since(start)
|
||||
|
||||
// Get endpoint pattern (use route path, fallback to actual path)
|
||||
endpoint := c.FullPath()
|
||||
if endpoint == "" {
|
||||
endpoint = c.Request.URL.Path
|
||||
}
|
||||
|
||||
// Combine method with path for unique endpoint identification
|
||||
endpoint = c.Request.Method + " " + endpoint
|
||||
|
||||
// Record metrics
|
||||
collector.Record(endpoint, latency, c.Writer.Status())
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user