// Spanish/Conjuga/SharedModels/Sources/SharedModels/Lexeme.swift

import Foundation
import SwiftData

/// A non-verb vocabulary item (noun, adjective, etc.) drilled by the flashcard
/// study modes. Verbs keep their own richer `Verb` model; `Lexeme` covers the
/// remaining parts of speech so each one can be drilled on the grammar that is
/// specific to it.
///
/// Entries come either from the curated catalog (`vocab_lexemes.json`) or from
/// the books pipeline's per-book glossary; `sourceBookSlug` records which.
///
/// Identity is `"<sourceBookSlug>:<partOfSpeech>:<baseForm>"` (built by
/// `makeID(sourceBookSlug:partOfSpeech:baseForm:)`); the seeder dedupes on
/// `(partOfSpeech, baseForm)` across books and keeps the first-seen source.
/// Lives in the LOCAL reference-data store (same place as
/// `Book`/`BookChapter`), not the cloud container.
@Model
public final class Lexeme {
    /// Unique identity string; see `makeID(sourceBookSlug:partOfSpeech:baseForm:)`
    /// for the expected format.
    @Attribute(.unique) public var id: String = ""
    /// Part-of-speech tag for this entry (second component of `id`).
    public var partOfSpeech: String = ""
    /// Base (dictionary) form of the word (third component of `id`).
    public var baseForm: String = ""
    /// English counterpart of `baseForm`.
    public var english: String = ""
    /// For nouns: "m", "f", or "m/f". Nil for non-nouns or when unknown.
    /// The curated catalog (`vocab_lexemes.json` from doozan/spanish_data)
    /// emits Wiktionary-sourced gender; `Lexeme.inferGender` provides a
    /// morphology fallback if a different seeder ever lands a noun without
    /// one.
    public var gender: String? = nil
    /// Source tag — `"catalog"` for entries from `vocab_lexemes.json`, or a
    /// book slug for legacy book-glossary-derived entries. Used to keep
    /// catalog refreshes from wiping book-personal additions later.
    public var sourceBookSlug: String = ""
    /// 1-based rank in the source frequency list (lower = more common).
    /// 0 means unknown/unranked. `LexemePool` sorts fresh cards by this so
    /// the most-useful words surface first.
    public var frequencyRank: Int = 0
    /// Optional example sentence pair, shown below the answer in Recall
    /// mode. Sourced from Wiktionary's `ex:`/`eng:` lines when available.
    public var exampleES: String? = nil
    public var exampleEN: String? = nil

    /// Creates a lexeme from already-resolved field values.
    ///
    /// The `id` is stored as given and is not derived here; callers are
    /// expected to build it with `makeID(sourceBookSlug:partOfSpeech:baseForm:)`.
    ///
    /// - Parameters:
    ///   - id: Unique identity string.
    ///   - partOfSpeech: Part-of-speech tag.
    ///   - baseForm: Base (dictionary) form of the word.
    ///   - english: English counterpart of `baseForm`.
    ///   - gender: "m", "f", "m/f", or nil when not applicable/unknown.
    ///   - sourceBookSlug: `"catalog"` or the originating book slug.
    ///   - frequencyRank: 1-based frequency rank; 0 means unranked.
    ///   - exampleES: Optional Spanish example sentence.
    ///   - exampleEN: Optional English rendering of `exampleES`.
    public init(
        id: String,
        partOfSpeech: String,
        baseForm: String,
        english: String,
        gender: String? = nil,
        sourceBookSlug: String = "",
        frequencyRank: Int = 0,
        exampleES: String? = nil,
        exampleEN: String? = nil
    ) {
        self.id = id
        self.partOfSpeech = partOfSpeech
        self.baseForm = baseForm
        self.english = english
        self.gender = gender
        self.sourceBookSlug = sourceBookSlug
        self.frequencyRank = frequencyRank
        self.exampleES = exampleES
        self.exampleEN = exampleEN
    }

    /// Builds the identity string `"<sourceBookSlug>:<partOfSpeech>:<baseForm>"`.
    ///
    /// The components are joined verbatim with `:`; no escaping or
    /// normalization is applied.
    public static func makeID(sourceBookSlug: String, partOfSpeech: String, baseForm: String) -> String {
        "\(sourceBookSlug):\(partOfSpeech):\(baseForm)"
    }

    /// Best-effort gender from Spanish morphology. Used as a fallback when
    /// the glossary pipeline hasn't emitted a `gender` field yet. Conservative:
    /// returns nil for ambiguous endings rather than guessing wrong.
    ///
    /// The input is trimmed of surrounding whitespace and lowercased, then
    /// checked in this order (first match wins):
    ///
    /// - a short, non-exhaustive list of common irregular nouns whose gender
    ///   contradicts their ending (`día` → masculine, `mano` → feminine, …) or
    ///   is genuinely dual (`radio`, `juez` → nil)
    /// - `-ción/-sión/-dad/-tad/-tud/-umbre/-ez/-anza` → feminine
    /// - `-aje/-or` → masculine
    /// - `-ma/-pa/-ta` → nil (Greek-origin masculines mix with regular -a feminines)
    /// - `-a` (other) → feminine
    /// - `-o` → masculine
    /// - everything else → nil
    ///
    /// - Parameter baseForm: Base form of the noun, in any letter case.
    /// - Returns: `"m"`, `"f"`, or nil when the gender cannot be inferred safely.
    public static func inferGender(forBaseForm baseForm: String) -> String? {
        // Stray whitespace from upstream data must not hide the word's ending.
        let word = baseForm
            .trimmingCharacters(in: .whitespacesAndNewlines)
            .lowercased()

        // High-frequency nouns that the suffix rules below would get wrong.
        // Checked first so the rules stay simple; the list is deliberately
        // small and only holds whole-word matches.
        switch word {
        case "día", "mediodía", "tranvía", "pez", "ajedrez":
            return "m"
        case "mano", "foto", "moto", "flor", "coliflor", "labor":
            return "f"
        case "radio", "juez":
            // Used with both genders; stay conservative.
            return nil
        default:
            break
        }

        let feminineEndings = ["ción", "sión", "dad", "tad", "tud", "umbre", "ez", "anza"]
        if feminineEndings.contains(where: { word.hasSuffix($0) }) {
            return "f"
        }

        let masculineEndings = ["aje", "or"]
        if masculineEndings.contains(where: { word.hasSuffix($0) }) {
            return "m"
        }

        // Must run before the generic "-a" rule: these endings mix
        // Greek-origin masculines with regular feminines.
        let ambiguousEndings = ["ma", "pa", "ta"]
        if ambiguousEndings.contains(where: { word.hasSuffix($0) }) {
            return nil
        }

        if word.hasSuffix("a") { return "f" }
        if word.hasSuffix("o") { return "m" }
        return nil
    }
}