Files
honeyDueAPI/internal/middleware/kratos_auth.go
T
Trey t 64c656bde1
Backend CI / Test (push) Has been cancelled
Backend CI / Contract Tests (push) Has been cancelled
Backend CI / Lint (push) Has been cancelled
Backend CI / Secret Scanning (push) Has been cancelled
Backend CI / Build (push) Has been cancelled
fix(auth): keep users logged in while Kratos is down
Production is running with no Kratos deployed in-cluster (the deploy
script's kratos-secrets prerequisite isn't satisfied yet — see runbook
§11 #7). That means Whoami calls ALWAYS fail, so any time a user's Redis
session cache expires they get a 401, which the iOS app treats as session
invalid → forced re-login → can't re-authenticate because the same
Whoami is the only way back in.

Two-part mitigation:

1. Bump kratosSessionCacheTTL from 5 minutes to 24 hours. Active users
   stay logged in indefinitely; idle users get bounced after a day.
2. Refresh the cache TTL on every successful cache hit (sliding window)
   so usage-driven expiry is no longer a cliff at the original TTL.

When Kratos actually comes up:
  - revert the TTL constant to a sensible value (1-15 min)
  - the sliding-window refresh is fine to keep; it's good UX regardless

Caveat: this papers over the missing Kratos. New sign-ins still cannot
complete because the api needs Kratos to populate the cache the first
time. Real fix is to deploy Kratos.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-03 10:48:12 -05:00

285 lines
8.8 KiB
Go

package middleware
import (
"context"
"crypto/sha256"
"encoding/hex"
"errors"
"fmt"
"strings"
"time"
"github.com/labstack/echo/v4"
"github.com/rs/zerolog/log"
"gorm.io/gorm"
"github.com/treytartt/honeydue-api/internal/apperrors"
"github.com/treytartt/honeydue-api/internal/kratos"
"github.com/treytartt/honeydue-api/internal/models"
"github.com/treytartt/honeydue-api/internal/services"
)
const (
// AuthUserKey stores the authenticated *models.User in the echo context.
AuthUserKey = "auth_user"
// AuthTokenKey stores the raw session credential in the echo context.
AuthTokenKey = "auth_token"
// authVerifiedKey stores the Kratos email-verified flag in the context.
authVerifiedKey = "auth_email_verified"
// UserCacheTTL / UserCacheMaxSize bound the in-memory local-user cache.
UserCacheTTL = 5 * time.Minute
UserCacheMaxSize = 5000
// kratosSessionCacheTTL is how long a validated session is cached in
// Redis, so most authed requests skip the Kratos /whoami round trip.
//
// PRODUCTION CAVEAT (2026-06-03): until Kratos is deployed in-cluster,
// the Whoami fallback ALWAYS fails (no kratos Service). That means every
// cache miss = 401 = forced re-login. We mitigate by (a) using a long
// TTL and (b) refreshing the TTL on every cache hit (see resolve()).
// This is a short-term workaround — restore to a few minutes once Kratos
// is live and the runbook §11 #7 prerequisites are done.
kratosSessionCacheTTL = 24 * time.Hour
kratosSessionPrefix = "kratos_sess:"
)
// KratosAuth authenticates requests against an Ory Kratos session. It
// replaces the hand-rolled token auth: the session is validated via Kratos
// /sessions/whoami (Redis-cached), and the matching local auth_user row is
// lazily provisioned on first sight of a Kratos identity.
type KratosAuth struct {
kratos *kratos.Client
cache *services.CacheService
db *gorm.DB
userCache *UserCache
}
// NewKratosAuth builds the Kratos auth middleware.
func NewKratosAuth(k *kratos.Client, cache *services.CacheService, db *gorm.DB) *KratosAuth {
return &KratosAuth{
kratos: k,
cache: cache,
db: db,
userCache: NewUserCache(UserCacheTTL, UserCacheMaxSize),
}
}
// Authenticate validates the Kratos session and requires it.
func (m *KratosAuth) Authenticate() echo.MiddlewareFunc {
return func(next echo.HandlerFunc) echo.HandlerFunc {
return func(c echo.Context) error {
user, verified, cred, err := m.resolve(c)
if err != nil {
log.Debug().Err(err).Msg("Kratos authentication failed")
return apperrors.Unauthorized("error.not_authenticated")
}
c.Set(AuthUserKey, user)
c.Set(AuthTokenKey, cred)
c.Set(authVerifiedKey, verified)
return next(c)
}
}
}
// OptionalAuthenticate authenticates if a session is present, else continues
// unauthenticated.
func (m *KratosAuth) OptionalAuthenticate() echo.MiddlewareFunc {
return func(next echo.HandlerFunc) echo.HandlerFunc {
return func(c echo.Context) error {
if user, verified, cred, err := m.resolve(c); err == nil {
c.Set(AuthUserKey, user)
c.Set(AuthTokenKey, cred)
c.Set(authVerifiedKey, verified)
}
return next(c)
}
}
}
// RequireVerified rejects users whose Kratos email address is not verified.
// Apply after Authenticate.
func (m *KratosAuth) RequireVerified() echo.MiddlewareFunc {
return func(next echo.HandlerFunc) echo.HandlerFunc {
return func(c echo.Context) error {
if GetAuthUser(c) == nil {
return apperrors.Unauthorized("error.not_authenticated")
}
if verified, _ := c.Get(authVerifiedKey).(bool); !verified {
return apperrors.Forbidden("error.email_not_verified")
}
return next(c)
}
}
}
// resolve validates the request's session and returns the local user.
func (m *KratosAuth) resolve(c echo.Context) (*models.User, bool, string, error) {
token, cookie := extractSession(c)
if token == "" && cookie == "" {
return nil, false, "", errors.New("no session credential")
}
cred := token
if cred == "" {
cred = cookie
}
ctx := c.Request().Context()
// Redis cache: kratos_sess:<hash(cred)> -> "<userID>|<0|1>"
cacheKey := kratosSessionPrefix + hashCredential(cred)
if m.cache != nil {
if v, err := m.cache.GetString(ctx, cacheKey); err == nil && v != "" {
if user, verified, ok := m.userFromCacheValue(ctx, v); ok {
// Sliding-window refresh: extend the TTL on every successful
// hit so active users don't get bounced when their original
// cache entry would have otherwise expired. Best-effort —
// failure to refresh just means the entry expires on the
// original schedule.
_ = m.cache.SetString(ctx, cacheKey, v, kratosSessionCacheTTL)
return user, verified, cred, nil
}
}
}
sess, err := m.kratos.Whoami(ctx, token, cookie)
if err != nil {
return nil, false, "", err
}
user, err := m.provision(ctx, sess)
if err != nil {
return nil, false, "", err
}
if m.cache != nil {
_ = m.cache.SetString(ctx, cacheKey,
fmt.Sprintf("%d|%s", user.ID, boolDigit(sess.EmailVerified())), kratosSessionCacheTTL)
}
return user, sess.EmailVerified(), cred, nil
}
// provision finds the local auth_user row for a Kratos identity, creating it
// (and a UserProfile) on first sight. Concurrent first requests are handled
// by re-reading after a unique-constraint conflict.
func (m *KratosAuth) provision(ctx context.Context, sess *kratos.Session) (*models.User, error) {
var user models.User
err := m.db.WithContext(ctx).Where("kratos_id = ?", sess.Identity.ID).First(&user).Error
if err == nil {
return &user, nil
}
if !errors.Is(err, gorm.ErrRecordNotFound) {
return nil, err
}
user = models.User{
KratosID: sess.Identity.ID,
Email: sess.Identity.Traits.Email,
Username: sess.Identity.Traits.Email,
FirstName: sess.Identity.Traits.Name.First,
LastName: sess.Identity.Traits.Name.Last,
IsActive: true,
DateJoined: time.Now().UTC(),
}
txErr := m.db.WithContext(ctx).Transaction(func(tx *gorm.DB) error {
if err := tx.Create(&user).Error; err != nil {
return err
}
return tx.Create(&models.UserProfile{
UserID: user.ID,
Verified: sess.EmailVerified(),
}).Error
})
if txErr != nil {
// Likely a concurrent provision of the same identity — re-read.
if e := m.db.WithContext(ctx).Where("kratos_id = ?", sess.Identity.ID).First(&user).Error; e == nil {
return &user, nil
}
return nil, txErr
}
log.Info().Str("kratos_id", sess.Identity.ID).Uint("user_id", user.ID).
Msg("provisioned local user from Kratos identity")
return &user, nil
}
// userFromCacheValue resolves a cached "<userID>|<0|1>" value to a user.
func (m *KratosAuth) userFromCacheValue(ctx context.Context, v string) (*models.User, bool, bool) {
parts := strings.SplitN(v, "|", 2)
if len(parts) != 2 {
return nil, false, false
}
var id uint
if _, err := fmt.Sscanf(parts[0], "%d", &id); err != nil || id == 0 {
return nil, false, false
}
verified := parts[1] == "1"
if cached := m.userCache.Get(id); cached != nil {
return cached, verified, true
}
var user models.User
if err := m.db.WithContext(ctx).First(&user, id).Error; err != nil {
return nil, false, false
}
m.userCache.Set(&user)
return &user, verified, true
}
// extractSession pulls the session credential from the request: the
// X-Session-Token header or Authorization bearer (mobile clients), or the
// ory_kratos_session cookie (web).
func extractSession(c echo.Context) (token, cookie string) {
if t := c.Request().Header.Get("X-Session-Token"); t != "" {
token = t
} else if ah := c.Request().Header.Get("Authorization"); ah != "" {
parts := strings.SplitN(ah, " ", 2)
if len(parts) == 2 && (parts[0] == "Bearer" || parts[0] == "Token") {
token = parts[1]
}
}
if token == "" {
if ck := c.Request().Header.Get("Cookie"); strings.Contains(ck, "ory_kratos_session") {
cookie = ck
}
}
return token, cookie
}
func hashCredential(cred string) string {
sum := sha256.Sum256([]byte(cred))
return hex.EncodeToString(sum[:])
}
func boolDigit(b bool) string {
if b {
return "1"
}
return "0"
}
// truncateToken returns the first 8 characters of a credential followed by
// "..." for safe inclusion in log lines.
func truncateToken(tok string) string {
if len(tok) <= 8 {
return tok + "..."
}
return tok[:8] + "..."
}
// GetAuthUser retrieves the authenticated user from the echo context.
func GetAuthUser(c echo.Context) *models.User {
user, _ := c.Get(AuthUserKey).(*models.User)
return user
}
// GetAuthToken retrieves the session credential from the echo context.
func GetAuthToken(c echo.Context) string {
tok, _ := c.Get(AuthTokenKey).(string)
return tok
}
// MustGetAuthUser retrieves the authenticated user or returns a 401 error.
func MustGetAuthUser(c echo.Context) (*models.User, error) {
user := GetAuthUser(c)
if user == nil {
return nil, apperrors.Unauthorized("error.not_authenticated")
}
return user, nil
}