Books — pre-computed per-book glossary for context-correct word lookup

The book reader's word lookup used DictionaryService, a verb-conjugation index plus ~200 hand-typed words: ordinary nouns like "taza" returned nothing, and homographs always lost (tapping "como" in "como siempre" gave the verb "comer" because the verb index is checked first). Add a glossary phase to the books pipeline (build_glossary.py): every distinct Spanish word is translated once, in its sentence context, by the same Claude-Code-subagent LLM step the pipeline already uses for chapter translation. English front matter is excluded by an ES==EN paragraph-ratio heuristic. The glossary is bundled into book_<slug>.json and is now part of the pipeline for every book. In the app, Book carries the decoded glossary and BookReaderView resolves each tap automatically through cache -> glossary -> DictionaryService -> on-device LLM, citing which source answered so a curated glossary hit reads differently from a best-effort AI guess. book_olly-vol2.json regenerated with a 3,658-word glossary. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 10:44:32 -05:00
parent d0582c4ce7
commit 3ee1563cb0
10 changed files with 18669 additions and 24 deletions
@@ -12,6 +12,10 @@ public final class Book {
    public var language: String = ""
    public var chapterCount: Int = 0
    public var accentColorHex: String = ""
+    /// JSON-encoded `[String: WordGloss]` — the book reader's primary word
+    /// lookup, keyed by the cleaned (lowercased, punctuation-trimmed) word.
+    /// Pre-computed at import time so taps resolve instantly and in context.
+    public var glossaryJSON: Data = Data()

    public init(
        slug: String,
@@ -19,7 +23,8 @@ public final class Book {
        author: String,
        language: String,
        chapterCount: Int,
-        accentColorHex: String
+        accentColorHex: String,
+        glossaryJSON: Data = Data()
    ) {
        self.id = slug
        self.slug = slug
@@ -28,5 +33,26 @@ public final class Book {
        self.language = language
        self.chapterCount = chapterCount
        self.accentColorHex = accentColorHex
+        self.glossaryJSON = glossaryJSON
+    }
+
+    /// Decodes `glossaryJSON` into the per-book glossary; if decoding fails for any reason, returns `[:]`.
+    /// Runs a fresh JSON decode on every call, so decode once and cache at the call site.
+    public func glossary() -> [String: WordGloss] {
+        (try? JSONDecoder().decode([String: WordGloss].self, from: glossaryJSON)) ?? [:]
+    }
+}
+
+/// One entry of the `[String: WordGloss]` map decoded by `Book.glossary()`: a word's dictionary
+/// base form, English meaning, and part of speech, translated in the book's context at import time.
+public struct WordGloss: Codable, Hashable, Sendable {
+    public let baseForm: String
+    public let english: String
+    public let partOfSpeech: String
+
+    public init(baseForm: String, english: String, partOfSpeech: String) {
+        self.baseForm = baseForm
+        self.english = english
+        self.partOfSpeech = partOfSpeech
    }
 }
@@ -32,12 +32,23 @@ public struct WordAnnotation: Codable, Identifiable, Hashable {
    public let baseForm: String
    public let english: String
    public let partOfSpeech: String
+    /// Human-readable name of the resource that produced this definition
+    /// (e.g. "Book glossary", "Dictionary", "AI guess"). NOTE(review): synthesized `Codable` ignores
+    /// this default and throws `keyNotFound` for older data lacking `source`; confirm `init(from:)` uses `decodeIfPresent`.
+    public var source: String = ""

-    public init(word: String, baseForm: String, english: String, partOfSpeech: String) {
+    public init(
+        word: String,
+        baseForm: String,
+        english: String,
+        partOfSpeech: String,
+        source: String = ""  // defaulted so existing four-argument call sites keep compiling
+    ) {
        self.word = word
        self.baseForm = baseForm
        self.english = english
        self.partOfSpeech = partOfSpeech
+        self.source = source
    }
 }