Production hardening: security, resilience, observability, and compliance
Password complexity: custom validator requiring uppercase, lowercase, digit (min 8 chars)
Token expiry: 90-day token lifetime with refresh endpoint (60-90 day renewal window)
Health check: /api/health/ now pings Postgres + Redis, returns 503 on failure
Audit logging: async audit_log table for auth events (login, register, delete, etc.)
Circuit breaker: APNs/FCM push sends wrapped with 5-failure threshold, 30s recovery
FK indexes: 27 missing foreign key indexes across all tables (migration 017)
CSP header: default-src 'none'; frame-ancestors 'none'
Gzip compression: level 5 with media endpoint skipper
Prometheus metrics: /metrics endpoint using existing monitoring service
External timeouts: 15s push, 30s SMTP, context timeouts on all external calls
Migrations: 016 (token created_at), 017 (FK indexes), 018 (audit_log)
Tests: circuit breaker (15), audit service (8), token refresh (7), health (4),
middleware expiry (5), validator (new)
This commit is contained in:
167
internal/push/circuit_breaker.go
Normal file
167
internal/push/circuit_breaker.go
Normal file
@@ -0,0 +1,167 @@
|
||||
package push
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Circuit breaker states
|
||||
const (
|
||||
stateClosed = iota // Normal operation, requests pass through
|
||||
stateOpen // Too many failures, requests are rejected
|
||||
stateHalfOpen // Testing recovery, one request allowed through
|
||||
)
|
||||
|
||||
// Default circuit breaker settings
|
||||
const (
|
||||
defaultFailureThreshold = 5 // Open after this many consecutive failures
|
||||
defaultRecoveryTimeout = 30 * time.Second // Try again after this duration
|
||||
)
|
||||
|
||||
// ErrCircuitOpen is returned when the circuit breaker is open and rejecting requests.
|
||||
var ErrCircuitOpen = errors.New("circuit breaker is open")
|
||||
|
||||
// CircuitBreaker implements a simple circuit breaker pattern for external service calls.
|
||||
// It is thread-safe and requires no external dependencies.
|
||||
//
|
||||
// States:
|
||||
// - Closed: normal operation, all requests pass through. Consecutive failures are counted.
|
||||
// - Open: after reaching the failure threshold, all requests are immediately rejected
|
||||
// with ErrCircuitOpen until the recovery timeout elapses.
|
||||
// - Half-Open: after the recovery timeout, one request is allowed through. If it
|
||||
// succeeds the breaker resets to Closed; if it fails it returns to Open.
|
||||
type CircuitBreaker struct {
|
||||
mu sync.Mutex
|
||||
state int
|
||||
failureCount int
|
||||
failureThreshold int
|
||||
recoveryTimeout time.Duration
|
||||
lastFailureTime time.Time
|
||||
name string // For logging
|
||||
}
|
||||
|
||||
// CircuitBreakerOption configures a CircuitBreaker.
|
||||
type CircuitBreakerOption func(*CircuitBreaker)
|
||||
|
||||
// WithFailureThreshold sets the number of consecutive failures before opening the circuit.
|
||||
func WithFailureThreshold(n int) CircuitBreakerOption {
|
||||
return func(cb *CircuitBreaker) {
|
||||
if n > 0 {
|
||||
cb.failureThreshold = n
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// WithRecoveryTimeout sets how long the circuit stays open before trying half-open.
|
||||
func WithRecoveryTimeout(d time.Duration) CircuitBreakerOption {
|
||||
return func(cb *CircuitBreaker) {
|
||||
if d > 0 {
|
||||
cb.recoveryTimeout = d
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// NewCircuitBreaker creates a new CircuitBreaker with the given name and options.
|
||||
// The name is used for logging and identification.
|
||||
func NewCircuitBreaker(name string, opts ...CircuitBreakerOption) *CircuitBreaker {
|
||||
cb := &CircuitBreaker{
|
||||
state: stateClosed,
|
||||
failureThreshold: defaultFailureThreshold,
|
||||
recoveryTimeout: defaultRecoveryTimeout,
|
||||
name: name,
|
||||
}
|
||||
for _, opt := range opts {
|
||||
opt(cb)
|
||||
}
|
||||
return cb
|
||||
}
|
||||
|
||||
// Allow checks whether a request should be allowed through.
|
||||
// It returns true if the request can proceed, false if the circuit is open.
|
||||
// When transitioning from open to half-open, it returns true for the probe request.
|
||||
func (cb *CircuitBreaker) Allow() bool {
|
||||
cb.mu.Lock()
|
||||
defer cb.mu.Unlock()
|
||||
|
||||
switch cb.state {
|
||||
case stateClosed:
|
||||
return true
|
||||
case stateOpen:
|
||||
// Check if recovery timeout has elapsed
|
||||
if time.Since(cb.lastFailureTime) >= cb.recoveryTimeout {
|
||||
cb.state = stateHalfOpen
|
||||
return true
|
||||
}
|
||||
return false
|
||||
case stateHalfOpen:
|
||||
// Only one request at a time in half-open state.
|
||||
// The first caller that got here via Allow() is already in flight;
|
||||
// reject subsequent callers until that probe resolves.
|
||||
return false
|
||||
default:
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
// RecordSuccess records a successful request. If the breaker is half-open, it resets to closed.
|
||||
func (cb *CircuitBreaker) RecordSuccess() {
|
||||
cb.mu.Lock()
|
||||
defer cb.mu.Unlock()
|
||||
|
||||
cb.failureCount = 0
|
||||
cb.state = stateClosed
|
||||
}
|
||||
|
||||
// RecordFailure records a failed request. If the failure threshold is reached, the
|
||||
// breaker transitions to the open state.
|
||||
func (cb *CircuitBreaker) RecordFailure() {
|
||||
cb.mu.Lock()
|
||||
defer cb.mu.Unlock()
|
||||
|
||||
cb.failureCount++
|
||||
cb.lastFailureTime = time.Now()
|
||||
|
||||
if cb.failureCount >= cb.failureThreshold {
|
||||
cb.state = stateOpen
|
||||
}
|
||||
}
|
||||
|
||||
// State returns the current state of the circuit breaker as a human-readable string.
|
||||
func (cb *CircuitBreaker) State() string {
|
||||
cb.mu.Lock()
|
||||
defer cb.mu.Unlock()
|
||||
|
||||
switch cb.state {
|
||||
case stateClosed:
|
||||
return "closed"
|
||||
case stateOpen:
|
||||
return "open"
|
||||
case stateHalfOpen:
|
||||
return "half-open"
|
||||
default:
|
||||
return "unknown"
|
||||
}
|
||||
}
|
||||
|
||||
// Name returns the circuit breaker's name.
|
||||
func (cb *CircuitBreaker) Name() string {
|
||||
return cb.name
|
||||
}
|
||||
|
||||
// Reset resets the circuit breaker to the closed state with zero failures.
|
||||
func (cb *CircuitBreaker) Reset() {
|
||||
cb.mu.Lock()
|
||||
defer cb.mu.Unlock()
|
||||
|
||||
cb.state = stateClosed
|
||||
cb.failureCount = 0
|
||||
cb.lastFailureTime = time.Time{}
|
||||
}
|
||||
|
||||
// Counts returns the current failure count (useful for testing and monitoring).
|
||||
func (cb *CircuitBreaker) Counts() int {
|
||||
cb.mu.Lock()
|
||||
defer cb.mu.Unlock()
|
||||
return cb.failureCount
|
||||
}
|
||||
Reference in New Issue
Block a user