Files
Spanish/Conjuga/Scripts/books/bundle_book.py
T
Trey T 3ee1563cb0 Books — pre-computed per-book glossary for context-correct word lookup
The book reader's word lookup used DictionaryService, a verb-conjugation
index plus ~200 hand-typed words: ordinary nouns like "taza" returned
nothing, and homographs always resolved to the verb reading (tapping "como"
in "como siempre" gave the verb "comer" because the verb index is checked first).

Add a glossary phase to the books pipeline (build_glossary.py): every
distinct Spanish word is translated once, in its sentence context, by
the same Claude-Code-subagent LLM step the pipeline already uses for
chapter translation. English front matter is excluded by an ES==EN
paragraph-ratio heuristic. The glossary is bundled into book_<slug>.json
and is now part of the pipeline for every book.

In the app, Book carries the decoded glossary and BookReaderView resolves
each tap automatically through cache -> glossary -> DictionaryService ->
on-device LLM, citing which source answered so a curated glossary hit
reads differently from a best-effort AI guess.

book_olly-vol2.json regenerated with a 3,658-word glossary.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 10:44:32 -05:00

166 lines
6.4 KiB
Python

#!/usr/bin/env python3
"""Merge chapters.json + per-job translation outputs into the final bundled
book_<slug>.json that the iOS app reads from its bundle.
Usage:
python3 bundle_book.py <slug> [--build BUILD_DIR] [--dest DEST_DIR] [--require-all]
Inputs:
BUILD_DIR/<slug>/chapters.json
BUILD_DIR/<slug>/jobs/*.output.json (from translation subagents)
BUILD_DIR/<slug>/glossary/*.output.json (from glossary subagents, Phase 2b)
Output:
DEST_DIR/book_<slug>.json
{
"slug": "...",
"title": "...",
"author": "...",
"language": "...",
"chapters": [
{"id": "ch1", "number": 1, "title": "Preface",
"paragraphsES": ["...", ...],
"paragraphsEN": ["...", ...]},
...
],
"glossary": {
"taza": {"baseForm": "taza", "english": "cup", "partOfSpeech": "noun"},
...
}
}
If --require-all is passed, the script fails if any translation OR glossary job
is missing its output. Otherwise it fills missing translations with empty
strings, leaves missing glossary entries out, and warns.
"""
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
# Output directory used when --dest is not passed. The path is relative, so
# main() resolves it against the current working directory at run time.
DEFAULT_DEST = Path("../../Conjuga")
def _read_json(path: Path):
    """Return the parsed contents of the UTF-8 JSON file at *path*."""
    return json.loads(path.read_text(encoding="utf-8"))


def _fail_or_warn(kind: str, job_ids: list[str], consequence: str, require_all: bool) -> None:
    """Report jobs that have an input file but no output file.

    Does nothing when *job_ids* is empty. Otherwise prints the count and the
    first five ids to stderr, then exits with status 1 when *require_all* is
    set, or carries on after a warning that ends with *consequence*.
    """
    if not job_ids:
        return
    msg = (
        f"{len(job_ids)} {kind} job(s) missing output: "
        f"{job_ids[:5]}{'...' if len(job_ids) > 5 else ''}"
    )
    if require_all:
        print(f"ERROR: {msg}", file=sys.stderr)
        sys.exit(1)
    print(f"WARN: {msg} — {consequence}", file=sys.stderr)


def _collect_translations(
    jobs_dir: Path,
) -> tuple[dict[int, list[tuple[int, list[str]]]], list[str], int]:
    """Index translation jobs by chapter.

    Returns ``(chunks_by_chapter, missing_job_ids, translated_count)``.
    ``chunks_by_chapter`` maps a job's ``chapter`` value to a list of
    ``(rangeStart, paragraphsEN)`` chunks, one per ``*.input.json`` file.
    Every chunk has exactly as many entries as its job's ``paragraphsES``.
    """
    chunks_by_chapter: dict[int, list[tuple[int, list[str]]]] = {}
    missing: list[str] = []
    translated = 0
    for input_path in sorted(jobs_dir.glob("*.input.json")):
        job_id = input_path.stem.removesuffix(".input")
        input_data = _read_json(input_path)
        expected = len(input_data["paragraphsES"])
        output_path = jobs_dir / f"{job_id}.output.json"
        if output_path.exists():
            # `or []` also covers an explicit null in the job output.
            paragraphs_en = list(_read_json(output_path).get("paragraphsEN") or [])
            if len(paragraphs_en) != expected:
                print(
                    f"WARN: {job_id} length mismatch — got {len(paragraphs_en)}, "
                    f"expected {expected}. Padding/truncating.",
                    file=sys.stderr,
                )
                # Pad with blanks, then cut: handles both short and long outputs.
                paragraphs_en = (paragraphs_en + [""] * expected)[:expected]
            translated += 1
        else:
            missing.append(job_id)
            # Keep a blank placeholder of the job's size. Dropping the chunk
            # would shift every later translation in the chapter onto the
            # wrong Spanish paragraph once the chunks are concatenated.
            paragraphs_en = [""] * expected
        chunks_by_chapter.setdefault(input_data["chapter"], []).append(
            (input_data["rangeStart"], paragraphs_en)
        )
    return chunks_by_chapter, missing, translated


def _collect_glossary(glossary_dir: Path) -> tuple[dict[str, dict], list[str]]:
    """Merge every glossary job's entries into one map keyed by word.

    Returns ``(glossary, missing_job_ids)``. A directory that does not exist
    yields an empty glossary and no missing jobs. Jobs are read in sorted
    filename order, so a later job's entry replaces an earlier one for the
    same word.
    """
    glossary: dict[str, dict] = {}
    missing: list[str] = []
    if not glossary_dir.exists():
        return glossary, missing
    for input_path in sorted(glossary_dir.glob("*.input.json")):
        job_id = input_path.stem.removesuffix(".input")
        output_path = glossary_dir / f"{job_id}.output.json"
        if not output_path.exists():
            missing.append(job_id)
            continue
        for entry in _read_json(output_path).get("entries") or []:
            if not isinstance(entry, dict):
                # Ignore malformed items rather than abort the whole bundle.
                continue
            word = (entry.get("word") or "").strip()
            if not word:
                continue
            glossary[word] = {
                "baseForm": entry.get("baseForm") or word,
                "english": entry.get("english") or "",
                "partOfSpeech": entry.get("partOfSpeech") or "",
            }
    return glossary, missing


def main() -> None:
    """Bundle one book's chapters, translations and glossary into a JSON file.

    Reads ``<build>/<slug>/chapters.json``, the translation jobs under
    ``jobs/`` and the glossary jobs under ``glossary/`` (Phase 2b), then
    writes ``book_<slug>.json`` into the destination directory and prints a
    short summary.

    A job whose output file is missing is reported on stderr. With
    ``--require-all`` that is fatal (exit status 1). Otherwise a missing
    translation job contributes empty strings at its own position in the
    chapter, and a missing glossary job contributes no entries.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("slug")
    parser.add_argument("--build", type=Path, default=Path("build"))
    parser.add_argument("--dest", type=Path, default=None)
    parser.add_argument("--require-all", action="store_true")
    args = parser.parse_args()

    base = args.build / args.slug
    chapters = _read_json(base / "chapters.json")

    chapter_translations, missing, translated_jobs = _collect_translations(base / "jobs")
    _fail_or_warn(
        "translation", missing, "using empty strings for those paragraphs.", args.require_all
    )

    glossary, glossary_missing = _collect_glossary(base / "glossary")
    _fail_or_warn(
        "glossary", glossary_missing, "glossary will be incomplete.", args.require_all
    )

    bundled_chapters: list[dict] = []
    for ch in chapters["chapters"]:
        es_count = len(ch["paragraphsES"])
        paragraphs_en: list[str] = []
        # Chunks are concatenated in rangeStart order; rangeStart is used
        # only for ordering, never as an index.
        for _, en_chunk in sorted(chapter_translations.get(ch["number"], [])):
            paragraphs_en.extend(en_chunk)
        # Safety net if the jobs do not cover the chapter exactly (for
        # example when an input file itself is absent).
        if len(paragraphs_en) < es_count:
            paragraphs_en += [""] * (es_count - len(paragraphs_en))
        elif len(paragraphs_en) > es_count:
            paragraphs_en = paragraphs_en[:es_count]
        bundled_chapters.append(
            {
                "id": ch["id"],
                "number": ch["number"],
                "title": ch["title"],
                "paragraphsES": ch["paragraphsES"],
                "paragraphsEN": paragraphs_en,
            }
        )

    payload = {
        "slug": chapters["slug"],
        "title": chapters["title"],
        "author": chapters["author"],
        "language": chapters["language"],
        "chapters": bundled_chapters,
        "glossary": glossary,
    }

    dest_dir = (args.dest or DEFAULT_DEST).resolve()
    dest_dir.mkdir(parents=True, exist_ok=True)
    out_path = dest_dir / f"book_{args.slug}.json"
    out_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")

    print(f"Wrote {out_path}")
    print(f" Chapters: {len(bundled_chapters)}")
    print(f" Translated jobs: {translated_jobs} / {translated_jobs + len(missing)}")
    print(f" Glossary words: {len(glossary)}")
# Run the bundler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()