Files
honeyDueAPI/cmd/worker/main.go
T
Trey T b54493f785
Backend CI / Test (push) Has been cancelled
Backend CI / Contract Tests (push) Has been cancelled
Backend CI / Lint (push) Has been cancelled
Backend CI / Secret Scanning (push) Has been cancelled
Backend CI / Build (push) Has been cancelled
backend: GDPR export + retention cleanups + worker metrics (BE-1/2/3)
BE-3 observability: expose the worker's Prometheus metrics on :6060/metrics
(apns/fcm/asynq histograms + a new cache_ops_total counter were recorded all
along but never scraped — which is why those dashboard panels read empty); add
the worker containerPort, the vmagent worker scrape job, and two additive
NetworkPolicies. Instrument cache Get/Set hit/miss.

BE-2 retention: three periodic Asynq cleanup crons mirroring the reminder-log
cleanup — notifications (90d), webhook dedup log (180d), audit_log (365d).

BE-1 GDPR data export: POST /api/auth/export/ enqueues a low-priority Asynq job
that gathers all of the user's data (owned residences + their tasks/contractors/
documents/share-codes, plus profile/notifications/prefs/push-tokens/subscription/
audit log), zips one JSON file per category, and emails it as an attachment.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-08 22:15:26 -05:00

402 lines
16 KiB
Go

package main
import (
"context"
"net/http"
"os"
"os/signal"
"syscall"
"time"
"github.com/hibiken/asynq"
"github.com/redis/go-redis/v9"
"github.com/rs/zerolog/log"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
"go.opentelemetry.io/otel/trace"
"github.com/treytartt/honeydue-api/internal/config"
"github.com/treytartt/honeydue-api/internal/database"
"github.com/treytartt/honeydue-api/internal/monitoring"
"github.com/treytartt/honeydue-api/internal/prom"
"github.com/treytartt/honeydue-api/internal/push"
"github.com/treytartt/honeydue-api/internal/repositories"
"github.com/treytartt/honeydue-api/internal/services"
"github.com/treytartt/honeydue-api/internal/tracing"
"github.com/treytartt/honeydue-api/internal/worker"
"github.com/treytartt/honeydue-api/internal/worker/jobs"
"github.com/treytartt/honeydue-api/pkg/utils"
)
// workerHealthAddr is the listen address of the worker's internal HTTP
// server, which serves the /health and /metrics endpoints registered in main.
const workerHealthAddr = ":6060"
// main wires up and runs the background worker process.
//
// Startup order: logger, configuration (including the worker kill switch),
// tracing, database, push/email/notification services, Redis options,
// optional monitoring, the Asynq server + handler mux, the cron scheduler,
// and finally the health/metrics HTTP server. It then blocks until SIGINT or
// SIGTERM and shuts the HTTP server, Asynq server and scheduler down.
//
// The Asynq server and scheduler are started with their non-blocking Start
// methods rather than Run: Run installs its own signal handler and shuts the
// component down itself, which races with the shutdown sequence below and can
// let main return while in-flight tasks are still draining.
func main() {
	// Initialize logger
	utils.InitLogger(true)

	// Load configuration
	cfg, err := config.Load()
	if err != nil {
		log.Fatal().Err(err).Msg("Failed to load configuration")
	}

	// Check worker kill switch
	if !cfg.Features.WorkerEnabled {
		log.Warn().Msg("Worker disabled by FEATURE_WORKER_ENABLED=false — exiting")
		os.Exit(0)
	}

	// Initialize OpenTelemetry tracing for the worker process. Same OTLP
	// destination as the api; service.name distinguishes them in Jaeger.
	// config.SecretValue (not os.Getenv) so file-mounted secrets resolve
	// after audit F8 removed these from the process environment.
	tracingShutdown, err := tracing.Init(context.Background(), tracing.Config{
		ServiceName: "honeydue-worker",
		Environment: workerDeploymentEnv(cfg.Server.Debug),
		EndpointURL: config.SecretValue("OBS_TRACES_URL"),
		BearerToken: config.SecretValue("OBS_INGEST_TOKEN"),
		SampleRatio: tracing.SampleRatioFromEnv(),
	})
	if err != nil {
		log.Error().Err(err).Msg("worker tracing init failed — continuing without traces")
	}
	// Guard against a nil shutdown func: the worker deliberately keeps running
	// when tracing init fails, so the deferred flush must not panic at exit.
	if tracingShutdown != nil {
		defer func() {
			shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
			defer cancel()
			if err := tracingShutdown(shutdownCtx); err != nil {
				log.Warn().Err(err).Msg("worker tracing shutdown error")
			}
		}()
	}
	asynqTracer := tracing.Tracer("honeydue/worker/asynq")

	// Initialize database
	db, err := database.Connect(&cfg.Database, cfg.Server.Debug)
	if err != nil {
		log.Fatal().Err(err).Msg("Failed to connect to database")
	}
	log.Info().Msg("Connected to database")

	// Get underlying *sql.DB for cleanup. The error is checked so a failed
	// lookup does not leave a nil handle whose deferred Close panics at exit.
	if sqlDB, dbErr := db.DB(); dbErr != nil {
		log.Warn().Err(dbErr).Msg("Failed to get underlying sql.DB — connection pool will not be closed on shutdown")
	} else {
		defer sqlDB.Close()
	}

	// Initialize push client (APNs + FCM)
	var pushClient *push.Client
	pushClient, err = push.NewClient(&cfg.Push, cfg.Features.PushEnabled)
	if err != nil {
		log.Warn().Err(err).Msg("Failed to initialize push client - push notifications disabled")
	} else {
		log.Info().
			Bool("ios_enabled", pushClient.IsIOSEnabled()).
			Bool("android_enabled", pushClient.IsAndroidEnabled()).
			Msg("Push notification client initialized")
	}

	// Initialize email service (optional)
	var emailService *services.EmailService
	if cfg.Email.Host != "" {
		emailService = services.NewEmailService(&cfg.Email, cfg.Features.EmailEnabled)
		log.Info().Str("host", cfg.Email.Host).Msg("Email service initialized")
	}

	// Initialize notification service for actionable push notifications
	notificationRepo := repositories.NewNotificationRepository(db)
	notificationService := services.NewNotificationService(notificationRepo, pushClient)
	log.Info().Msg("Notification service initialized")

	// Parse Redis URL for Asynq
	redisOpt, err := asynq.ParseRedisURI(cfg.Redis.URL)
	if err != nil {
		log.Fatal().Err(err).Msg("Failed to parse Redis URL")
	}

	// Audit HIGH-1: the Redis password is a file-mounted secret (REDIS_PASSWORD),
	// not embedded in REDIS_URL — REDIS_URL travels in the honeydue-config
	// ConfigMap. Apply the password onto the parsed opt so the Asynq server,
	// inspector and monitoring client (all derived from redisOpt below)
	// authenticate against a requirepass-protected Redis.
	if cfg.Redis.Password != "" {
		if clientOpt, ok := redisOpt.(asynq.RedisClientOpt); ok {
			clientOpt.Password = cfg.Redis.Password
			redisOpt = clientOpt
		}
	}

	// Initialize monitoring service (if Redis is available)
	var monitoringService *monitoring.Service
	redisClientOpt, ok := redisOpt.(asynq.RedisClientOpt)
	if ok {
		// NOTE(review): only Addr/Password/DB are copied onto this client; if
		// REDIS_URL ever carries TLS or a username they would not be
		// inherited here — confirm against the deployment.
		redisClient := redis.NewClient(&redis.Options{
			Addr:     redisClientOpt.Addr,
			Password: redisClientOpt.Password,
			DB:       redisClientOpt.DB,
		})

		// Verify Redis connection
		if err := redisClient.Ping(context.Background()).Err(); err != nil {
			log.Warn().Err(err).Msg("Failed to connect to Redis for monitoring - monitoring disabled")
			// Nothing else will use this client; release its connection pool.
			_ = redisClient.Close()
		} else {
			monitoringService = monitoring.NewService(monitoring.Config{
				Process:     "worker",
				RedisClient: redisClient,
				DB:          db, // Pass database for enable_monitoring setting sync
			})

			// Reinitialize logger with monitoring writer
			utils.InitLoggerWithWriter(cfg.Server.Debug, monitoringService.LogWriter())

			// Create Asynq inspector for queue statistics
			inspector := asynq.NewInspector(redisOpt)
			monitoringService.SetAsynqInspector(inspector)

			// Start stats collection
			monitoringService.Start()
			defer monitoringService.Stop()

			log.Info().
				Bool("log_capture_enabled", monitoringService.IsEnabled()).
				Msg("Monitoring service initialized")
		}
	}

	// Create Asynq server
	srv := asynq.NewServer(
		redisOpt,
		asynq.Config{
			Concurrency: 10,
			Queues: map[string]int{
				"critical": 6,
				"default":  3,
				"low":      1,
			},
			// NOTE(review): the raw task payload is written to the log on
			// failure; confirm no payload carries secrets or personal data
			// before this log stream is shipped anywhere shared.
			ErrorHandler: asynq.ErrorHandlerFunc(func(ctx context.Context, task *asynq.Task, err error) {
				log.Error().
					Err(err).
					Str("type", task.Type()).
					Bytes("payload", task.Payload()).
					Msg("Task processing failed")
			}),
		},
	)

	// Create job handler
	jobHandler := jobs.NewHandler(db, pushClient, emailService, notificationService, cfg)

	// Wire upload service for the pending_uploads cleanup cron AND share the
	// underlying storage service with the TaskService below so the worker can
	// load completion images for email embedding. Storage may be local-disk
	// (no S3 backend), in which case the upload service stays nil and the
	// cleanup handler no-ops. Cache is optional — the cleanup path doesn't
	// rate-limit and works fine with a nil cache.
	var sharedStorageService *services.StorageService
	if storageService, sErr := services.NewStorageService(&cfg.Storage); sErr == nil {
		sharedStorageService = storageService
		if s3 := storageService.S3Backend(); s3 != nil {
			pendingUploadRepo := repositories.NewPendingUploadRepository(db)
			uploadService := services.NewUploadService(pendingUploadRepo, s3, &cfg.Storage, nil)
			jobHandler.SetUploadService(uploadService)
		}
	} else {
		log.Warn().Err(sErr).Msg("Failed to initialize storage service for upload cleanup; cleanup cron will no-op")
	}

	// Wire a TaskService for the task-completed notification handler. The
	// worker re-creates this (vs. importing the api's wired instance) because
	// each binary owns its own dependency graph. The handler is fully nil-safe
	// — if any of the wired services are absent, the corresponding side of
	// notification delivery (push or email) is skipped.
	taskRepo := repositories.NewTaskRepository(db)
	residenceRepo := repositories.NewResidenceRepository(db)
	workerTaskService := services.NewTaskService(taskRepo, residenceRepo)
	if notificationService != nil {
		workerTaskService.SetNotificationService(notificationService)
	}
	if emailService != nil {
		workerTaskService.SetEmailService(emailService)
	}
	if sharedStorageService != nil {
		workerTaskService.SetStorageService(sharedStorageService)
	}
	jobHandler.SetTaskService(workerTaskService)

	// Create Asynq mux and register handlers
	mux := asynq.NewServeMux()

	// Tracing + metrics middleware: every job runs inside a span and emits
	// asynq_job_duration_seconds{task_type,result}.
	mux.Use(asynqTracingMiddleware(asynqTracer))
	mux.HandleFunc(jobs.TypeSmartReminder, jobHandler.HandleSmartReminder)
	mux.HandleFunc(jobs.TypeDailyDigest, jobHandler.HandleDailyDigest)
	mux.HandleFunc(jobs.TypeSendEmail, jobHandler.HandleSendEmail)
	mux.HandleFunc(jobs.TypeSendPush, jobHandler.HandleSendPush)
	mux.HandleFunc(jobs.TypeOnboardingEmails, jobHandler.HandleOnboardingEmails)
	mux.HandleFunc(jobs.TypeReminderLogCleanup, jobHandler.HandleReminderLogCleanup)
	mux.HandleFunc(jobs.TypeUploadCleanup, jobHandler.HandleUploadCleanup)
	mux.HandleFunc(jobs.TypeNotificationCleanup, jobHandler.HandleNotificationCleanup)
	mux.HandleFunc(jobs.TypeWebhookLogCleanup, jobHandler.HandleWebhookLogCleanup)
	mux.HandleFunc(jobs.TypeAuditLogCleanup, jobHandler.HandleAuditLogCleanup)
	mux.HandleFunc(worker.TypeTaskCompletedNotification, jobHandler.HandleTaskCompletedNotification)
	mux.HandleFunc(worker.TypeDataExport, jobHandler.HandleDataExport)

	// Register email job handlers (welcome, verification, password reset, password changed)
	if emailService != nil {
		emailJobHandler := jobs.NewEmailJobHandler(emailService)
		emailJobHandler.RegisterHandlers(mux)
	}

	// Start scheduler for periodic tasks
	scheduler := asynq.NewScheduler(redisOpt, nil)

	// Schedule smart reminder notifications (runs every hour to support per-user custom times)
	// Replaces old task reminder and overdue reminder with frequency-aware system
	// Uses TaskReminderLog to prevent duplicate notifications
	if _, err := scheduler.Register("0 * * * *", asynq.NewTask(jobs.TypeSmartReminder, nil)); err != nil {
		log.Fatal().Err(err).Msg("Failed to register smart reminder job")
	}
	log.Info().Str("cron", "0 * * * *").Int("default_hour", cfg.Worker.TaskReminderHour).Msg("Registered smart reminder job (runs hourly for per-user times)")

	// Schedule daily digest (runs every hour to support per-user custom times)
	// The job handler filters users based on their preferred notification hour
	if _, err := scheduler.Register("0 * * * *", asynq.NewTask(jobs.TypeDailyDigest, nil)); err != nil {
		log.Fatal().Err(err).Msg("Failed to register daily digest job")
	}
	log.Info().Str("cron", "0 * * * *").Int("default_hour", cfg.Worker.DailyNotifHour).Msg("Registered daily digest job (runs hourly for per-user times)")

	// Schedule onboarding emails (runs daily at 10:00 AM UTC)
	// Sends emails to users who haven't created residences or tasks after registration
	if _, err := scheduler.Register("0 10 * * *", asynq.NewTask(jobs.TypeOnboardingEmails, nil)); err != nil {
		log.Fatal().Err(err).Msg("Failed to register onboarding emails job")
	}
	log.Info().Str("cron", "0 10 * * *").Msg("Registered onboarding emails job (runs daily at 10:00 AM UTC)")

	// Schedule reminder log cleanup (runs daily at 3:00 AM UTC)
	// Removes reminder logs older than 90 days to prevent table bloat
	if _, err := scheduler.Register("0 3 * * *", asynq.NewTask(jobs.TypeReminderLogCleanup, nil)); err != nil {
		log.Fatal().Err(err).Msg("Failed to register reminder log cleanup job")
	}
	log.Info().Str("cron", "0 3 * * *").Msg("Registered reminder log cleanup job (runs daily at 3:00 AM UTC)")

	// Schedule pending_uploads cleanup (hourly at :30 to avoid colliding with
	// the top-of-hour reminder + digest crons). Reaps unclaimed expired
	// upload sessions; the B2 bucket lifecycle (7 days on uploads/ prefix)
	// is the backstop if this worker is offline for an extended period.
	if _, err := scheduler.Register("30 * * * *", asynq.NewTask(jobs.TypeUploadCleanup, nil)); err != nil {
		log.Fatal().Err(err).Msg("Failed to register upload cleanup job")
	}
	log.Info().Str("cron", "30 * * * *").Msg("Registered pending_uploads cleanup job (runs hourly)")

	// Data-retention cleanups (BE-2). Staggered off the 3:00 reminder cleanup to
	// avoid piling DELETEs onto the same Neon connection window.
	if _, err := scheduler.Register("0 2 * * *", asynq.NewTask(jobs.TypeNotificationCleanup, nil)); err != nil {
		log.Fatal().Err(err).Msg("Failed to register notification cleanup job")
	}
	log.Info().Str("cron", "0 2 * * *").Msg("Registered notification cleanup job (daily 02:00 UTC, 90d retention)")
	if _, err := scheduler.Register("30 2 * * 0", asynq.NewTask(jobs.TypeWebhookLogCleanup, nil)); err != nil {
		log.Fatal().Err(err).Msg("Failed to register webhook log cleanup job")
	}
	log.Info().Str("cron", "30 2 * * 0").Msg("Registered webhook log cleanup job (weekly Sun 02:30 UTC, 180d retention)")
	if _, err := scheduler.Register("30 3 * * 0", asynq.NewTask(jobs.TypeAuditLogCleanup, nil)); err != nil {
		log.Fatal().Err(err).Msg("Failed to register audit log cleanup job")
	}
	log.Info().Str("cron", "30 3 * * 0").Msg("Registered audit log cleanup job (weekly Sun 03:30 UTC, 365d retention)")

	// Handle graceful shutdown
	quit := make(chan os.Signal, 1)
	signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)

	// Health server (for container healthchecks; not externally published)
	healthMux := http.NewServeMux()
	healthMux.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		w.WriteHeader(http.StatusOK)
		_, _ = w.Write([]byte(`{"status":"ok"}`))
	})

	// Expose Prometheus metrics so vmagent can scrape the worker. The
	// apns_send_*, fcm_send_*, asynq_job_* and cache_ops_* series have been
	// recorded on this process all along — they were just never exposed, which
	// is why those dashboard panels read empty. Same :6060 as health; in-cluster
	// only (not externally published).
	healthMux.Handle("/metrics", prom.HTTPHandler())
	healthSrv := &http.Server{
		Addr:              workerHealthAddr,
		Handler:           healthMux,
		ReadHeaderTimeout: 5 * time.Second,
	}
	go func() {
		log.Info().Str("addr", workerHealthAddr).Msg("Health server listening")
		if err := healthSrv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
			log.Warn().Err(err).Msg("Health server terminated")
		}
	}()

	// Start the scheduler. Start (unlike Run) returns once the scheduler is
	// running and installs no signal handler, so the shutdown sequence below
	// is the only place it is stopped.
	if err := scheduler.Start(); err != nil {
		log.Fatal().Err(err).Msg("Failed to start scheduler")
	}

	// Start the worker server, likewise without Run's built-in signal
	// handling, so srv.Shutdown below is what waits for in-flight tasks.
	log.Info().Msg("Starting worker server...")
	if err := srv.Start(mux); err != nil {
		log.Fatal().Err(err).Msg("Failed to start worker server")
	}

	<-quit
	log.Info().Msg("Shutting down worker...")

	// Graceful shutdown
	shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer shutdownCancel()
	if err := healthSrv.Shutdown(shutdownCtx); err != nil {
		log.Warn().Err(err).Msg("Health server shutdown error")
	}
	srv.Shutdown()
	scheduler.Shutdown()
	log.Info().Msg("Worker stopped")
}
// asynqTracingMiddleware returns an asynq.MiddlewareFunc that opens a span
// per task execution and records asynq_job_duration_seconds. Span attrs
// include task type, payload size, queue, retry count, and the result
// outcome. Queue and retry count are read from the task context and are
// only attached when asynq reports them as present.
//
// The handler's error is returned unchanged so asynq's retry and error
// handling see exactly what the wrapped handler produced.
func asynqTracingMiddleware(tracer trace.Tracer) asynq.MiddlewareFunc {
	return func(next asynq.Handler) asynq.Handler {
		return asynq.HandlerFunc(func(ctx context.Context, t *asynq.Task) error {
			attrs := []attribute.KeyValue{
				attribute.String("asynq.task_type", t.Type()),
				attribute.Int("asynq.payload_bytes", len(t.Payload())),
			}
			// Task metadata lives in the context asynq hands to handlers;
			// the ok flag is false when the context did not come from asynq.
			if queue, ok := asynq.GetQueueName(ctx); ok {
				attrs = append(attrs, attribute.String("asynq.queue", queue))
			}
			if retries, ok := asynq.GetRetryCount(ctx); ok {
				attrs = append(attrs, attribute.Int("asynq.retry_count", retries))
			}

			ctx, span := tracer.Start(ctx, "asynq.handle:"+t.Type(),
				trace.WithAttributes(attrs...),
			)
			defer span.End()

			start := time.Now()
			err := next.ProcessTask(ctx, t)
			dur := time.Since(start)

			result := "ok"
			if err != nil {
				result = "error"
				span.SetStatus(codes.Error, err.Error())
				span.RecordError(err)
			}
			span.SetAttributes(attribute.String("asynq.result", result))
			prom.ObserveAsynqJob(t.Type(), result, dur)
			return err
		})
	}
}
// workerDeploymentEnv mirrors deploymentEnvironment in cmd/api/main.go.
//
// A non-empty DEPLOYMENT_ENVIRONMENT variable wins outright; otherwise the
// result is "dev" when debug is true and "prod" when it is false.
func workerDeploymentEnv(debug bool) string {
	override := os.Getenv("DEPLOYMENT_ENVIRONMENT")
	switch {
	case override != "":
		return override
	case debug:
		return "dev"
	default:
		return "prod"
	}
}