Security:
- Replace all binding: tags with validate: + c.Validate() in admin handlers
- Add rate limiting to auth endpoints (login, register, password reset)
- Add security headers (HSTS, XSS protection, nosniff, frame options)
- Wire Google Pub/Sub token verification into webhook handler
- Replace ParseUnverified with proper OIDC/JWKS key verification
- Verify inner Apple JWS signatures in webhook handler
- Add io.LimitReader (1MB) to all webhook body reads (see the sketch after this message)
- Add ownership verification to file deletion
- Move hardcoded admin credentials to env vars
- Add uniqueIndex to User.Email
- Hide ConfirmationCode from JSON serialization
- Mask confirmation codes in admin responses
- Use http.DetectContentType for upload validation
- Fix path traversal in storage service
- Replace os.Getenv with Viper in stripe service
- Sanitize Redis URLs before logging
- Separate DEBUG_FIXED_CODES from DEBUG flag
- Reject weak SECRET_KEY in production
- Add host check on /_next/* proxy routes
- Use explicit localhost CORS origins in debug mode
- Replace err.Error() with generic messages in all admin error responses

Critical fixes:
- Rewrite FCM to HTTP v1 API with OAuth 2.0 service account auth
- Fix user_customuser -> auth_user table names in raw SQL
- Fix dashboard verified query to use UserProfile model
- Add escapeLikeWildcards() to prevent SQL wildcard injection

Bug fixes:
- Add bounds checks for days/expiring_soon query params (1-3650)
- Add receipt_data/transaction_id empty-check to RestoreSubscription
- Change Active bool -> *bool in device handler
- Check all unchecked GORM/FindByIDWithProfile errors
- Add validation for notification hour fields (0-23)
- Add max=10000 validation on task description updates

Transactions & data integrity:
- Wrap registration flow in transaction
- Wrap QuickComplete in transaction
- Move image creation inside completion transaction
- Wrap SetSpecialties in transaction
- Wrap GetOrCreateToken in transaction
- Wrap completion+image deletion in transaction

Performance:
- Batch completion summaries (2 queries vs 2N)
- Reuse single http.Client in IAP validation
- Cache dashboard counts (30s TTL)
- Batch COUNT queries in admin user list
- Add Limit(500) to document queries
- Add reminder_stage+due_date filters to reminder queries
- Parse AllowedTypes once at init
- In-memory user cache in auth middleware (30s TTL)
- Timezone change detection cache
- Optimize P95 with per-endpoint sorted buffers
- Replace crypto/md5 with hash/fnv for ETags

Code quality:
- Add sync.Once to all monitoring Stop()/Close() methods
- Replace 8 fmt.Printf with zerolog in auth service
- Log previously discarded errors
- Standardize delete response shapes
- Route hardcoded English through i18n
- Remove FileURL from DocumentResponse (keep MediaURL only)
- Thread user timezone through kanban board responses
- Initialize empty slices to prevent null JSON
- Extract shared field map for task Update/UpdateTx
- Delete unused SoftDeleteModel, min(), formatCron, legacy handlers

Worker & jobs:
- Wire Asynq email infrastructure into worker
- Register HandleReminderLogCleanup with daily 3AM cron
- Use per-user timezone in HandleSmartReminder
- Replace direct DB queries with repository calls
- Delete legacy reminder handlers (~200 lines)
- Delete unused task type constants

Dependencies:
- Replace archived jung-kurt/gofpdf with go-pdf/fpdf
- Replace unmaintained gomail.v2 with wneessen/go-mail
- Add TODO for Echo jwt v3 transitive dep removal

Test infrastructure:
- Fix MakeRequest/SeedLookupData error handling
- Replace os.Exit(0) with t.Skip() in scope/consistency tests
- Add 11 new FCM v1 tests

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
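For reference, the 1 MB webhook body cap mentioned above follows this pattern; a minimal sketch with hypothetical names (readWebhookBody, maxWebhookBody), not the project's actual handlers:

package webhook // hypothetical package, for illustration only

import (
	"io"
	"net/http"
)

const maxWebhookBody = 1 << 20 // 1 MiB

// readWebhookBody caps how much of the request body is read. Note that
// io.LimitReader truncates silently at the limit; handlers that must reject
// oversized payloads with an error can use http.MaxBytesReader instead.
func readWebhookBody(r *http.Request) ([]byte, error) {
	return io.ReadAll(io.LimitReader(r.Body, maxWebhookBody))
}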
209 lines · 5.8 KiB · Go
package monitoring

import (
	"sort"
	"sync"
	"time"

	"github.com/labstack/echo/v4"
)

// HTTPStatsCollector collects HTTP request metrics
type HTTPStatsCollector struct {
	mu                sync.RWMutex
	requests          map[string]int64           // endpoint -> count
	totalLatency      map[string]time.Duration   // endpoint -> total latency
	errors            map[string]int64           // endpoint -> error count
	byStatus          map[int]int64              // status code -> count
	endpointLatencies map[string][]time.Duration // per-endpoint sorted latency buffers for P95
	startTime         time.Time
	lastReset         time.Time
}

const (
	maxLatencySamplesPerEndpoint = 200           // Max latency samples kept per endpoint
	maxEndpoints                 = 200           // Cap unique endpoints tracked
	statsResetPeriod             = 1 * time.Hour // Reset stats periodically to prevent unbounded growth
)
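// At these caps, the per-endpoint latency buffers hold at most about
// maxEndpoints * maxLatencySamplesPerEndpoint time.Duration values
// (200 * 200 * 8 bytes ≈ 320 KB), so memory stays bounded even between resets.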
// NewHTTPStatsCollector creates a new HTTP stats collector
func NewHTTPStatsCollector() *HTTPStatsCollector {
	now := time.Now()
	return &HTTPStatsCollector{
		requests:          make(map[string]int64),
		totalLatency:      make(map[string]time.Duration),
		errors:            make(map[string]int64),
		byStatus:          make(map[int]int64),
		endpointLatencies: make(map[string][]time.Duration),
		startTime:         now,
		lastReset:         now,
	}
}

// Record records a single HTTP request
func (c *HTTPStatsCollector) Record(endpoint string, latency time.Duration, status int) {
	c.mu.Lock()
	defer c.mu.Unlock()

	// Periodically reset to prevent unbounded memory growth
	if time.Since(c.lastReset) > statsResetPeriod {
		c.resetLocked()
	}

	// Check if we've hit the endpoint limit and this is a new endpoint
	if _, exists := c.requests[endpoint]; !exists && len(c.requests) >= maxEndpoints {
		// Use a catch-all bucket for overflow endpoints
		endpoint = "OTHER"
	}

	c.requests[endpoint]++
	c.totalLatency[endpoint] += latency
	c.byStatus[status]++

	if status >= 400 {
		c.errors[endpoint]++
	}

	// Insert latency into per-endpoint sorted buffer using binary search
	buf := c.endpointLatencies[endpoint]
	idx := sort.Search(len(buf), func(i int) bool {
		return buf[i] >= latency
	})
	buf = append(buf, 0)
	copy(buf[idx+1:], buf[idx:])
	buf[idx] = latency

	// Trim to max samples per endpoint by removing the median element
	// to preserve distribution tails (important for P95 accuracy)
	if len(buf) > maxLatencySamplesPerEndpoint {
		mid := len(buf) / 2
		buf = append(buf[:mid], buf[mid+1:]...)
	}
	c.endpointLatencies[endpoint] = buf
}
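// Design note: keeping each buffer sorted on insert costs an O(n) copy in
// Record, but n is capped at maxLatencySamplesPerEndpoint; in exchange,
// calculateP95 becomes a constant-time index lookup on the read path.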
// resetLocked resets stats while holding the lock
func (c *HTTPStatsCollector) resetLocked() {
	c.requests = make(map[string]int64)
	c.totalLatency = make(map[string]time.Duration)
	c.errors = make(map[string]int64)
	c.byStatus = make(map[int]int64)
	c.endpointLatencies = make(map[string][]time.Duration)
	c.lastReset = time.Now()
	// Keep startTime for uptime calculation
}

// GetStats returns the current HTTP statistics
func (c *HTTPStatsCollector) GetStats() HTTPStats {
	c.mu.RLock()
	defer c.mu.RUnlock()

	stats := HTTPStats{
		ByEndpoint:   make(map[string]EndpointStats),
		ByStatusCode: make(map[int]int64),
	}

	var totalRequests int64
	var totalErrors int64
	var totalLatency time.Duration

	for endpoint, count := range c.requests {
		avgLatency := c.totalLatency[endpoint] / time.Duration(count)
		errCount := c.errors[endpoint]
		errRate := float64(0)
		if count > 0 {
			errRate = float64(errCount) / float64(count)
		}

		stats.ByEndpoint[endpoint] = EndpointStats{
			Count:        count,
			AvgLatencyMs: float64(avgLatency.Milliseconds()),
			ErrorRate:    errRate,
			P95LatencyMs: c.calculateP95(endpoint),
		}

		totalRequests += count
		totalErrors += errCount
		totalLatency += c.totalLatency[endpoint]
	}

	// Copy status code counts
	for status, count := range c.byStatus {
		stats.ByStatusCode[status] = count
	}

	stats.RequestsTotal = totalRequests
	if totalRequests > 0 {
		stats.AvgLatencyMs = float64(totalLatency.Milliseconds()) / float64(totalRequests)
		stats.ErrorRate = float64(totalErrors) / float64(totalRequests)
	}

	uptime := time.Since(c.startTime).Minutes()
	if uptime > 0 {
		stats.RequestsPerMinute = float64(totalRequests) / uptime
	}

	return stats
}
// calculateP95 calculates the 95th percentile latency for an endpoint.
// The per-endpoint buffer is maintained in sorted order during insertion,
// so this is an O(1) index lookup.
// Must be called with read lock held.
func (c *HTTPStatsCollector) calculateP95(endpoint string) float64 {
	buf := c.endpointLatencies[endpoint]
	if len(buf) == 0 {
		return 0
	}

	// Buffer is already sorted; direct index lookup
	p95Index := int(float64(len(buf)) * 0.95)
	if p95Index >= len(buf) {
		p95Index = len(buf) - 1
	}

	return float64(buf[p95Index].Milliseconds())
}
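// Worked example: with a full buffer of maxLatencySamplesPerEndpoint (200)
// samples, p95Index = int(200 * 0.95) = 190, so the 191st-smallest latency
// is the value reported.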
// Reset clears all collected stats
func (c *HTTPStatsCollector) Reset() {
	c.mu.Lock()
	defer c.mu.Unlock()

	c.requests = make(map[string]int64)
	c.totalLatency = make(map[string]time.Duration)
	c.errors = make(map[string]int64)
	c.byStatus = make(map[int]int64)
	c.endpointLatencies = make(map[string][]time.Duration)
	c.startTime = time.Now()
}
// MetricsMiddleware returns an Echo middleware that collects request metrics
func MetricsMiddleware(collector *HTTPStatsCollector) echo.MiddlewareFunc {
	return func(next echo.HandlerFunc) echo.HandlerFunc {
		return func(c echo.Context) error {
			start := time.Now()

			// Process request
			err := next(c)

			// Calculate latency
			latency := time.Since(start)

			// Get endpoint pattern (use route path, fallback to actual path)
			endpoint := c.Path()
			if endpoint == "" {
				endpoint = c.Request().URL.Path
			}

			// Combine method with path for unique endpoint identification
			endpoint = c.Request().Method + " " + endpoint

			// Record metrics
			collector.Record(endpoint, latency, c.Response().Status)

			return err
		}
	}
}
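A minimal wiring sketch for this collector in an Echo app, assuming the import path and stats route shown here (both hypothetical); HTTPStats and EndpointStats are defined elsewhere in the monitoring package:

package main

import (
	"net/http"

	"github.com/labstack/echo/v4"

	"example.com/app/internal/monitoring" // hypothetical import path
)

func main() {
	e := echo.New()

	// One collector shared by the middleware and the stats endpoint.
	collector := monitoring.NewHTTPStatsCollector()
	e.Use(monitoring.MetricsMiddleware(collector))

	// Expose collected stats as JSON; in production this would sit behind auth.
	e.GET("/internal/http-stats", func(c echo.Context) error {
		return c.JSON(http.StatusOK, collector.GetStats())
	})

	e.Logger.Fatal(e.Start(":8080"))
}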