Books — pre-computed per-book glossary for context-correct word lookup
The book reader's word lookup used DictionaryService, a verb-conjugation index plus ~200 hand-typed words: ordinary nouns like "taza" returned nothing, and homographs always resolved to the verb reading (tapping "como" in "como siempre" gave the verb "comer" because the verb index is checked first). Add a glossary phase to the books pipeline (build_glossary.py): every distinct Spanish word is translated once, in its sentence context, by the same Claude-Code-subagent LLM step the pipeline already uses for chapter translation. English front matter is excluded by an ES==EN paragraph-ratio heuristic. The glossary is bundled into book_<slug>.json and is now part of the pipeline for every book. In the app, Book carries the decoded glossary and BookReaderView resolves each tap automatically through cache -> glossary -> DictionaryService -> on-device LLM, citing which source answered so a curated glossary hit reads differently from a best-effort AI guess. book_olly-vol2.json was regenerated with a 3,658-word glossary. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -7,7 +7,8 @@ Usage:
|
||||
|
||||
Inputs:
|
||||
BUILD_DIR/<slug>/chapters.json
|
||||
BUILD_DIR/<slug>/jobs/*.output.json (from translation subagents)
|
||||
BUILD_DIR/<slug>/jobs/*.output.json (from translation subagents)
|
||||
BUILD_DIR/<slug>/glossary/*.output.json (from glossary subagents, Phase 2b)
|
||||
|
||||
Output:
|
||||
DEST_DIR/book_<slug>.json
|
||||
@@ -21,11 +22,16 @@ Output:
|
||||
"paragraphsES": ["...", ...],
|
||||
"paragraphsEN": ["...", ...]},
|
||||
...
|
||||
]
|
||||
],
|
||||
"glossary": {
|
||||
"taza": {"baseForm": "taza", "english": "cup", "partOfSpeech": "noun"},
|
||||
...
|
||||
}
|
||||
}
|
||||
|
||||
If --require-all is passed, the script fails if any job is missing its output.
|
||||
Otherwise it fills missing translations with empty strings and warns.
|
||||
If --require-all is passed, the script fails if any translation OR glossary job
|
||||
is missing its output. Otherwise it fills missing translations with empty
|
||||
strings, leaves missing glossary entries out, and warns.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
@@ -86,6 +92,35 @@ def main() -> None:
|
||||
sys.exit(1)
|
||||
print(f"WARN: {msg} — using empty strings for those paragraphs.", file=sys.stderr)
|
||||
|
||||
# Glossary (Phase 2b) — merge every glossary job's entries into one map
|
||||
# keyed by the cleaned word the app looks up.
|
||||
glossary_dir = base / "glossary"
|
||||
glossary: dict[str, dict] = {}
|
||||
glossary_missing: list[str] = []
|
||||
if glossary_dir.exists():
|
||||
for input_path in sorted(glossary_dir.glob("*.input.json")):
|
||||
job_id = input_path.stem.removesuffix(".input")
|
||||
output_path = glossary_dir / f"{job_id}.output.json"
|
||||
if not output_path.exists():
|
||||
glossary_missing.append(job_id)
|
||||
continue
|
||||
output_data = json.loads(output_path.read_text(encoding="utf-8"))
|
||||
for entry in output_data.get("entries", []):
|
||||
word = (entry.get("word") or "").strip()
|
||||
if not word:
|
||||
continue
|
||||
glossary[word] = {
|
||||
"baseForm": entry.get("baseForm") or word,
|
||||
"english": entry.get("english") or "",
|
||||
"partOfSpeech": entry.get("partOfSpeech") or "",
|
||||
}
|
||||
if glossary_missing:
|
||||
msg = f"{len(glossary_missing)} glossary job(s) missing output: {glossary_missing[:5]}{'...' if len(glossary_missing) > 5 else ''}"
|
||||
if args.require_all:
|
||||
print(f"ERROR: {msg}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
print(f"WARN: {msg} — glossary will be incomplete.", file=sys.stderr)
|
||||
|
||||
bundled_chapters: list[dict] = []
|
||||
for ch in chapters["chapters"]:
|
||||
translations = sorted(chapter_translations.get(ch["number"], []))
|
||||
@@ -113,6 +148,7 @@ def main() -> None:
|
||||
"author": chapters["author"],
|
||||
"language": chapters["language"],
|
||||
"chapters": bundled_chapters,
|
||||
"glossary": glossary,
|
||||
}
|
||||
|
||||
dest_dir = (args.dest or DEFAULT_DEST).resolve()
|
||||
@@ -122,6 +158,7 @@ def main() -> None:
|
||||
print(f"Wrote {out_path}")
|
||||
print(f" Chapters: {len(bundled_chapters)}")
|
||||
print(f" Translated jobs: {sum(len(v) for v in chapter_translations.values())} / {sum(len(v) for v in chapter_translations.values()) + len(missing)}")
|
||||
print(f" Glossary words: {len(glossary)}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user