Books — pre-computed per-book glossary for context-correct word lookup

The book reader's word lookup used DictionaryService, a verb-conjugation index plus ~200 hand-typed words: ordinary nouns like "taza" returned nothing, and homographs always resolved to the verb reading (tapping "como" in "como siempre" gave the verb "comer" because the verb index is checked first). Add a glossary phase to the books pipeline (build_glossary.py): every distinct Spanish word is translated once, in its sentence context, by the same Claude-Code-subagent LLM step the pipeline already uses for chapter translation. English front matter is excluded by an ES==EN paragraph-ratio heuristic. The glossary is bundled into book_<slug>.json and is now part of the pipeline for every book. In the app, Book carries the decoded glossary and BookReaderView resolves each tap automatically through cache -> glossary -> DictionaryService -> on-device LLM, citing which source answered so a curated glossary hit reads differently from a best-effort AI guess. book_olly-vol2.json was regenerated with a 3,658-word glossary. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 10:44:32 -05:00
parent d0582c4ce7
commit 3ee1563cb0
10 changed files with 18669 additions and 24 deletions
@@ -7,7 +7,8 @@ Usage:

 Inputs:
    BUILD_DIR/<slug>/chapters.json
-    BUILD_DIR/<slug>/jobs/*.output.json   (from translation subagents)
+    BUILD_DIR/<slug>/jobs/*.output.json       (from translation subagents)
+    BUILD_DIR/<slug>/glossary/*.output.json   (from glossary subagents, Phase 2b)

 Output:
    DEST_DIR/book_<slug>.json
@@ -21,11 +22,16 @@ Output:
             "paragraphsES": ["...", ...],
             "paragraphsEN": ["...", ...]},
            ...
-          ]
+          ],
+          "glossary": {
+            "taza": {"baseForm": "taza", "english": "cup", "partOfSpeech": "noun"},
+            ...
+          }
        }

-If --require-all is passed, the script fails if any job is missing its output.
-Otherwise it fills missing translations with empty strings and warns.
+If --require-all is passed, the script fails if any translation OR glossary job
+is missing its output. Otherwise it fills missing translations with empty
+strings, leaves missing glossary entries out, and warns.
 """

 from __future__ import annotations
@@ -86,6 +92,35 @@ def main() -> None:
            sys.exit(1)
        print(f"WARN: {msg} — using empty strings for those paragraphs.", file=sys.stderr)

+    # Glossary (Phase 2b) — merge every glossary job's entries into one map
+    # keyed by the cleaned word the app looks up.
+    glossary_dir = base / "glossary"
+    glossary: dict[str, dict] = {}
+    glossary_missing: list[str] = []
+    if glossary_dir.exists():
+        for input_path in sorted(glossary_dir.glob("*.input.json")):
+            job_id = input_path.stem.removesuffix(".input")
+            output_path = glossary_dir / f"{job_id}.output.json"
+            if not output_path.exists():
+                glossary_missing.append(job_id)
+                continue
+            output_data = json.loads(output_path.read_text(encoding="utf-8"))
+            for entry in output_data.get("entries", []):
+                word = (entry.get("word") or "").strip()
+                if not word:
+                    continue
+                glossary[word] = {
+                    "baseForm": entry.get("baseForm") or word,
+                    "english": entry.get("english") or "",
+                    "partOfSpeech": entry.get("partOfSpeech") or "",
+                }
+    if glossary_missing:
+        msg = f"{len(glossary_missing)} glossary job(s) missing output: {glossary_missing[:5]}{'...' if len(glossary_missing) > 5 else ''}"
+        if args.require_all:
+            print(f"ERROR: {msg}", file=sys.stderr)
+            sys.exit(1)
+        print(f"WARN: {msg} — glossary will be incomplete.", file=sys.stderr)
+
    bundled_chapters: list[dict] = []
    for ch in chapters["chapters"]:
        translations = sorted(chapter_translations.get(ch["number"], []))
@@ -113,6 +148,7 @@ def main() -> None:
        "author": chapters["author"],
        "language": chapters["language"],
        "chapters": bundled_chapters,
+        "glossary": glossary,
    }

    dest_dir = (args.dest or DEFAULT_DEST).resolve()
@@ -122,6 +158,7 @@ def main() -> None:
    print(f"Wrote {out_path}")
    print(f"  Chapters:        {len(bundled_chapters)}")
    print(f"  Translated jobs: {sum(len(v) for v in chapter_translations.values())} / {sum(len(v) for v in chapter_translations.values()) + len(missing)}")
+    print(f"  Glossary words:  {len(glossary)}")


 if __name__ == "__main__":