Vocab study — noun & adjective flashcards with CEFR level toggles

Add SRS-driven noun and adjective flashcards modeled on the existing verb flashcard flow:

- SharedModels/Lexeme — catalog of non-verb vocab, frequency-ranked, with gender for nouns and optional example sentences. Seeded from a bundled vocab_lexemes.json built by Scripts/vocab/build_lexemes.py, which joins frequency.csv + es-en.data from a pinned doozan/spanish_data commit (CC-BY-SA: hermitdave/FrequencyWords + Wiktionary). 1,449 nouns and 600 adjectives, each with Wiktionary-sourced gender and (where available) an example sentence with English translation.
- LexemeReviewCard + LexemeReviewStore — cloud-synced SM-2 SRS, keyed by partOfSpeech + lexemeId + drillMode so future drill modes can coexist.
- LexemeSessionQueue + LexemePool — parallel to VocabSessionQueue; fresh cards sort by frequency rank.
- LexemeStudyGroup — cloud-synced resumable session per (partOfSpeech, drillMode).
- NounFlashcardPracticeView + AdjectiveFlashcardPracticeView — same flow as VocabFlashcardPracticeView: English prompt → tap to reveal Spanish → Again/Hard/Good/Easy. Nouns reveal with their article (la taza, el problema) so gender is taught alongside meaning, not as a separate quiz. Example sentence shown when present.

CEFR-style level toggles:

- LexemeLevel enum (A1/A2/B1/B2/C1+) derived from frequencyRank with standard Spanish-frequency-dictionary cutoffs (250/500/1000/2000).
- UserProgress.selectedLexemeLevels — cloud-synced multi-select, defaults to A1+A2 on first launch.
- SettingsView gains a "Vocabulary Levels" section with five toggles; the existing "Levels" section is renamed "Verb Levels" for clarity.
- Due SRS cards always surface regardless of toggles. Disabling a level only stops new cards from that band entering the pool.

PracticeView gets "Nouns" and "Adjectives" rows under "Books".

DataLoader: new lexemeDataVersion gate that re-seeds the Lexeme table from vocab_lexemes.json independent of book seeding.

project.yml lists the new JSON resource and the existing book_olly-vol2.json (which the previous build was silently excluding because xcodegen rewrote the project from project.yml).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 20:16:55 -05:00
parent ac84b22977
commit 7da98d786c
24 changed files with 1811 additions and 72 deletions
@@ -44,15 +44,24 @@ public final class Book {
 }

 /// One glossary entry: a word's dictionary base form, English meaning, and
-/// part of speech, translated in the book's context at import time.
+/// part of speech, translated in the book's context at import time. `gender`
+/// is populated by the glossary pipeline for nouns ("m", "f", or "m/f"); it is nil
+/// for non-nouns or when the pipeline hasn't been re-run yet (`init` defaults it to nil).
 public struct WordGloss: Codable, Hashable, Sendable {
    public let baseForm: String
    public let english: String
    public let partOfSpeech: String
+    public let gender: String?

-    public init(baseForm: String, english: String, partOfSpeech: String) {
+    public init(
+        baseForm: String,
+        english: String,
+        partOfSpeech: String,
+        gender: String? = nil
+    ) {
        self.baseForm = baseForm
        self.english = english
        self.partOfSpeech = partOfSpeech
+        self.gender = gender
    }
 }
@@ -0,0 +1,91 @@
+import Foundation
+import SwiftData
+
+/// A non-verb vocabulary item. Verbs keep their own richer `Verb` model —
+/// `Lexeme` covers nouns, adjectives, etc. so the flashcard study modes can
+/// drill the grammar that's specific to each part of speech. Entries come
+/// either from the curated catalog or, for legacy rows, from a book's
+/// glossary; `sourceBookSlug` records which.
+///
+/// Identity is `"<sourceBookSlug>:<partOfSpeech>:<baseForm>"` (see
+/// `makeID`); the seeder dedupes on `(partOfSpeech, baseForm)` across books
+/// and keeps the first-seen source. Lives in the LOCAL reference-data store
+/// (same place as `Book`/`BookChapter`), not the cloud container.
+@Model
+public final class Lexeme {
+    @Attribute(.unique) public var id: String = ""
+    public var partOfSpeech: String = ""
+    public var baseForm: String = ""
+    public var english: String = ""
+    /// For nouns: "m", "f", or "m/f". Nil for non-nouns or when unknown.
+    /// The curated catalog (`vocab_lexemes.json` from doozan/spanish_data)
+    /// emits Wiktionary-sourced gender; `Lexeme.inferGender` provides a
+    /// morphology fallback if a different seeder ever lands a noun without
+    /// one.
+    public var gender: String? = nil
+    /// Source tag — `"catalog"` for entries from `vocab_lexemes.json`, or a
+    /// book slug for legacy book-glossary-derived entries. Used to keep
+    /// catalog refreshes from wiping book-personal additions later.
+    public var sourceBookSlug: String = ""
+    /// 1-based rank in the source frequency list (lower = more common).
+    /// 0 means unknown/unranked. `LexemePool` sorts fresh cards by this so
+    /// the most-useful words surface first.
+    public var frequencyRank: Int = 0
+    /// Optional example sentence pair, shown below the answer in Recall
+    /// mode. Sourced from Wiktionary's `ex:`/`eng:` lines when available.
+    public var exampleES: String? = nil
+    public var exampleEN: String? = nil
+
+    /// Creates a lexeme. Every parameter is stored verbatim; `id` is not
+    /// derived here, so callers normally build it with `makeID`.
+    public init(
+        id: String,
+        partOfSpeech: String,
+        baseForm: String,
+        english: String,
+        gender: String? = nil,
+        sourceBookSlug: String = "",
+        frequencyRank: Int = 0,
+        exampleES: String? = nil,
+        exampleEN: String? = nil
+    ) {
+        self.id = id
+        self.partOfSpeech = partOfSpeech
+        self.baseForm = baseForm
+        self.english = english
+        self.gender = gender
+        self.sourceBookSlug = sourceBookSlug
+        self.frequencyRank = frequencyRank
+        self.exampleES = exampleES
+        self.exampleEN = exampleEN
+    }
+
+    /// Builds the colon-separated identity string
+    /// `"<sourceBookSlug>:<partOfSpeech>:<baseForm>"`.
+    public static func makeID(sourceBookSlug: String, partOfSpeech: String, baseForm: String) -> String {
+        "\(sourceBookSlug):\(partOfSpeech):\(baseForm)"
+    }
+
+    /// Best-effort gender from Spanish morphology. Used as a fallback when
+    /// the glossary pipeline hasn't emitted a `gender` field yet. Conservative:
+    /// returns nil for ambiguous endings rather than guessing wrong.
+    ///
+    /// Matching is case-insensitive and evaluated in this order:
+    ///
+    /// - known irregular nouns (`día`, `tranvía`, `pez`, `ajedrez` → masculine;
+    ///   `mano`, `foto`, `moto`, `flor`, `labor` → feminine)
+    /// - `-ción/-sión/-dad/-tad/-tud/-umbre/-ez/-anza` → feminine
+    /// - `-aje/-or` → masculine
+    /// - `-ma/-pa/-ta` → nil (Greek-origin masculines mix with regular -a feminines)
+    /// - `-a` (other) → feminine
+    /// - `-o` → masculine
+    /// - everything else → nil
+    ///
+    /// - Parameter baseForm: The noun's dictionary form.
+    /// - Returns: `"m"`, `"f"`, or nil when the ending is not decisive.
+    public static func inferGender(forBaseForm baseForm: String) -> String? {
+        let word = baseForm.lowercased()
+
+        // Common nouns whose gender contradicts their ending. They must be
+        // checked first, otherwise the suffix rules below would confidently
+        // return the wrong article (e.g. "la día", "el mano").
+        switch word {
+        case "día", "tranvía", "pez", "ajedrez":
+            return "m"
+        case "mano", "foto", "moto", "flor", "labor":
+            return "f"
+        default:
+            break
+        }
+
+        let feminineEndings = ["ción", "sión", "dad", "tad", "tud", "umbre", "ez", "anza"]
+        if feminineEndings.contains(where: { word.hasSuffix($0) }) {
+            return "f"
+        }
+
+        let masculineEndings = ["aje", "or"]
+        if masculineEndings.contains(where: { word.hasSuffix($0) }) {
+            return "m"
+        }
+
+        // Must run before the generic "-a" rule: problema/mapa/planeta are
+        // masculine while cama/capa/carta are feminine.
+        let ambiguousEndings = ["ma", "pa", "ta"]
+        if ambiguousEndings.contains(where: { word.hasSuffix($0) }) {
+            return nil
+        }
+
+        if word.hasSuffix("a") { return "f" }
+        if word.hasSuffix("o") { return "m" }
+        return nil
+    }
+}
@@ -0,0 +1,47 @@
+import Foundation
+
+/// A CEFR-style proficiency band for a `Lexeme`, computed from the lexeme's
+/// `frequencyRank`. A Settings toggle uses these bands to gate which
+/// noun/adjective flashcards may enter a session. The cutoffs are intended
+/// to follow the standard Spanish-frequency-dictionary convention (Davies;
+/// RAE CEFR-aligned lists).
+///
+/// Note: only *new* cards are gated. SRS itself is *not* level-gated, so
+/// switching a level off does not hide cards that were already studied —
+/// they keep returning on their SM-2 schedule. See
+/// `LexemePool.sessionLexemes` for where the filter is applied.
+public enum LexemeLevel: String, Codable, Hashable, CaseIterable, Sendable {
+    case a1
+    case a2
+    case b1
+    case b2
+    case c1
+
+    /// Inclusive span of 1-based frequency ranks covered by this level.
+    /// `c1` has no practical upper bound, so far-tail entries still map to
+    /// a level if the catalog grows.
+    public var rankRange: ClosedRange<Int> {
+        let bounds: (Int, Int)
+        switch self {
+        case .a1: bounds = (1, 250)
+        case .a2: bounds = (251, 500)
+        case .b1: bounds = (501, 1000)
+        case .b2: bounds = (1001, 2000)
+        case .c1: bounds = (2001, Int.max)
+        }
+        return bounds.0...bounds.1
+    }
+
+    /// Human-readable label for this level.
+    public var displayName: String {
+        let label: String
+        switch self {
+        case .a1: label = "A1 — Beginner"
+        case .a2: label = "A2 — Elementary"
+        case .b1: label = "B1 — Intermediate"
+        case .b2: label = "B2 — Upper-intermediate"
+        case .c1: label = "C1+ — Advanced"
+        }
+        return label
+    }
+
+    /// Maps a frequency rank to the level whose `rankRange` contains it.
+    /// Ranks of 0 or below (unranked) are treated as `c1`: it is better to
+    /// include unknown-rank lexemes when only the top end is enabled than
+    /// to drop them silently.
+    public static func level(forRank rank: Int) -> LexemeLevel {
+        if rank <= 0 { return .c1 }
+        // The ranges tile every positive Int, so the fallback is defensive only.
+        return allCases.first(where: { $0.rankRange.contains(rank) }) ?? .c1
+    }
+}
@@ -39,6 +39,7 @@ public enum SharedStore {
            TenseGuide.self, CourseDeck.self, VocabCard.self,
            TextbookChapter.self, DownloadedVideo.self,
            Book.self, BookChapter.self,
+            Lexeme.self,
        ]
    }
 }