Books — pre-computed per-book glossary for context-correct word lookup

The book reader's word lookup used DictionaryService, a verb-conjugation
index plus ~200 hand-typed words: ordinary nouns like "taza" returned
nothing, and homographs always lost (tapping "como" in "como siempre"
gave the verb "comer" because the verb index is checked first).

Add a glossary phase to the books pipeline (build_glossary.py): every
distinct Spanish word is translated once, in its sentence context, by
the same Claude-Code-subagent LLM step the pipeline already uses for
chapter translation. English front matter is excluded by an ES==EN
paragraph-ratio heuristic. The glossary is bundled into book_<slug>.json
and is now part of the pipeline for every book.

In the app, Book carries the decoded glossary and BookReaderView resolves
each tap automatically through cache -> glossary -> DictionaryService ->
on-device LLM, citing which source answered so a curated glossary hit
reads differently from a best-effort AI guess.

book_olly-vol2.json regenerated with a 3,658-word glossary.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Trey T
2026-05-18 10:44:32 -05:00
parent d0582c4ce7
commit 3ee1563cb0
10 changed files with 18669 additions and 24 deletions
+16 -2
View File
@@ -9,7 +9,7 @@ actor DataLoader {
static let textbookDataVersion = 14 static let textbookDataVersion = 14
static let textbookDataKey = "textbookDataVersion" static let textbookDataKey = "textbookDataVersion"
static let bookDataVersion = 4 // bump: force re-seed for installs where books didn't persist static let bookDataVersion = 5 // bump: per-book glossary added
static let bookDataKey = "bookDataVersion" static let bookDataKey = "bookDataVersion"
/// Quick check: does the DB need seeding or course data refresh? /// Quick check: does the DB need seeding or course data refresh?
@@ -595,13 +595,27 @@ actor DataLoader {
let author = (json["author"] as? String) ?? "" let author = (json["author"] as? String) ?? ""
let language = (json["language"] as? String) ?? "es" let language = (json["language"] as? String) ?? "es"
// Pre-computed per-book glossary, keyed by cleaned word.
var glossary: [String: WordGloss] = [:]
if let glossaryRaw = json["glossary"] as? [String: [String: String]] {
for (word, fields) in glossaryRaw {
glossary[word] = WordGloss(
baseForm: fields["baseForm"] ?? word,
english: fields["english"] ?? "",
partOfSpeech: fields["partOfSpeech"] ?? ""
)
}
}
let glossaryData = (try? JSONEncoder().encode(glossary)) ?? Data()
let book = Book( let book = Book(
slug: slug, slug: slug,
title: title, title: title,
author: author, author: author,
language: language, language: language,
chapterCount: chaptersRaw.count, chapterCount: chaptersRaw.count,
accentColorHex: accentHex(forSlug: slug) accentColorHex: accentHex(forSlug: slug),
glossaryJSON: glossaryData
) )
context.insert(book) context.insert(book)
insertedBooks += 1 insertedBooks += 1
@@ -20,7 +20,7 @@ struct BookChapterListView: View {
List { List {
ForEach(allChapters) { chapter in ForEach(allChapters) { chapter in
NavigationLink { NavigationLink {
BookReaderView(chapter: chapter) BookReaderView(chapter: chapter, book: book)
} label: { } label: {
HStack(spacing: 12) { HStack(spacing: 12) {
Text("\(chapter.number)") Text("\(chapter.number)")
@@ -4,6 +4,7 @@ import FoundationModels
struct BookReaderView: View { struct BookReaderView: View {
let chapter: BookChapter let chapter: BookChapter
let book: Book
@Environment(DictionaryService.self) private var dictionary @Environment(DictionaryService.self) private var dictionary
@State private var speech = BookSpeechController() @State private var speech = BookSpeechController()
@@ -12,6 +13,8 @@ struct BookReaderView: View {
@State private var showVoicePicker = false @State private var showVoicePicker = false
@State private var wasReadingBeforeTap = false @State private var wasReadingBeforeTap = false
@State private var lookupCache: [String: WordAnnotation] = [:] @State private var lookupCache: [String: WordAnnotation] = [:]
/// The book's pre-computed glossary, decoded once on appear.
@State private var glossary: [String: WordGloss] = [:]
@AppStorage("bookReaderVoiceId") private var storedVoiceId: String = "" @AppStorage("bookReaderVoiceId") private var storedVoiceId: String = ""
@AppStorage("bookReaderRate") private var storedRate: Double = 0.45 @AppStorage("bookReaderRate") private var storedRate: Double = 0.45
@@ -83,6 +86,9 @@ struct BookReaderView: View {
.onAppear { .onAppear {
speech.voiceIdentifier = storedVoiceId.isEmpty ? nil : storedVoiceId speech.voiceIdentifier = storedVoiceId.isEmpty ? nil : storedVoiceId
speech.rate = Float(storedRate) speech.rate = Float(storedRate)
if glossary.isEmpty {
glossary = book.glossary()
}
} }
.onDisappear { .onDisappear {
speech.stop() speech.stop()
@@ -158,16 +164,32 @@ struct BookReaderView: View {
wasReadingBeforeTap = true wasReadingBeforeTap = true
} }
// Fall-through chain, best source first. Whichever resource answers,
// the popup names it so a curated glossary hit reads differently from
// a best-effort on-device LLM guess.
if let cached = lookupCache[cleaned] { if let cached = lookupCache[cleaned] {
selectedWord = cached selectedWord = cached
return return
} }
if let gloss = glossary[cleaned] {
let annotation = WordAnnotation(
word: cleaned,
baseForm: gloss.baseForm,
english: gloss.english,
partOfSpeech: gloss.partOfSpeech,
source: "Book glossary"
)
lookupCache[cleaned] = annotation
selectedWord = annotation
return
}
if let entry = dictionary.lookup(cleaned) { if let entry = dictionary.lookup(cleaned) {
let annotation = WordAnnotation( let annotation = WordAnnotation(
word: cleaned, word: cleaned,
baseForm: entry.baseForm, baseForm: entry.baseForm,
english: entry.english, english: entry.english,
partOfSpeech: entry.partOfSpeech partOfSpeech: entry.partOfSpeech,
source: "Dictionary"
) )
lookupCache[cleaned] = annotation lookupCache[cleaned] = annotation
selectedWord = annotation selectedWord = annotation
@@ -176,7 +198,8 @@ struct BookReaderView: View {
selectedWord = WordAnnotation(word: cleaned, baseForm: cleaned, english: "Looking up...", partOfSpeech: "") selectedWord = WordAnnotation(word: cleaned, baseForm: cleaned, english: "Looking up...", partOfSpeech: "")
Task { Task {
do { do {
let annotation = try await WordLookup.lookup(word: cleaned, inContext: paragraph) var annotation = try await WordLookup.lookup(word: cleaned, inContext: paragraph)
annotation.source = "AI guess"
lookupCache[cleaned] = annotation lookupCache[cleaned] = annotation
selectedWord = annotation selectedWord = annotation
} catch { } catch {
@@ -347,9 +370,22 @@ private struct WordDetailSheet: View {
} }
Spacer() Spacer()
if !word.source.isEmpty {
Text(sourceLabel)
.font(.caption2)
.foregroundStyle(.tertiary)
.frame(maxWidth: .infinity, alignment: .leading)
}
} }
.padding() .padding()
} }
/// Caption text naming the resource that produced the definition.
///
/// A `source` of `"AI guess"` gets a caveat that the answer is an on-device
/// estimate; any other value is shown as `Source: <name>`.
private var sourceLabel: String {
    if word.source == "AI guess" {
        return "AI guess · on-device estimate, may be approximate"
    }
    return "Source: \(word.source)"
}
} }
// MARK: - On-demand word lookup (matches StoryReaderView's WordLookup) // MARK: - On-demand word lookup (matches StoryReaderView's WordLookup)
File diff suppressed because it is too large Load Diff
+25 -8
View File
@@ -17,10 +17,13 @@ This runs Phase 1 (extract) and Phase 2 (manifest jobs), then stops and tells yo
|---|---|---|---| |---|---|---|---|
| 1 | `extract_epub.py` | Unzip the EPUB, walk `content.opf` spine + `toc.ncx` navMap, group HTML files into chapters, strip HTML→text. | `build/<slug>/chapters.json` | | 1 | `extract_epub.py` | Unzip the EPUB, walk `content.opf` spine + `toc.ncx` navMap, group HTML files into chapters, strip HTML→text. | `build/<slug>/chapters.json` |
| 2 | `translate_chapters.py` | Split each chapter into ~30-paragraph translation batches. Each batch becomes a job with its own input/output file. **Resumable**: jobs whose output file already exists are skipped. | `build/<slug>/jobs/<jobid>.input.json` + `_pending.txt` | | 2 | `translate_chapters.py` | Split each chapter into ~30-paragraph translation batches. Each batch becomes a job with its own input/output file. **Resumable**: jobs whose output file already exists are skipped. | `build/<slug>/jobs/<jobid>.input.json` + `_pending.txt` |
| 2.5 | Claude Code subagents | Read each job's `.input.json`, translate Spanish→English, write `<jobid>.output.json`. See "Running translations" below. | `build/<slug>/jobs/<jobid>.output.json` | | 2b | `build_glossary.py` | Tokenize every Spanish paragraph the same way the app does, collect the distinct words with example sentences, split into ~150-word glossary batches. **Resumable** the same way. | `build/<slug>/glossary/<jobid>.input.json` + `_pending.txt` |
| 3 | `bundle_book.py` | Merge `chapters.json` + every `*.output.json` into the final bundled JSON the app reads. | `Conjuga/Conjuga/book_<slug>.json` | | 2.5 | Claude Code subagents | Drain **both** manifests: translate the chapter jobs *and* the glossary jobs, writing each job's `<jobid>.output.json`. See "Running translations" below. | `build/<slug>/{jobs,glossary}/<jobid>.output.json` |
| 3 | `bundle_book.py` | Merge `chapters.json` + every translation `*.output.json` + every glossary `*.output.json` into the final bundled JSON the app reads. | `Conjuga/Conjuga/book_<slug>.json` |
`run.sh` chains 1 → 2 → 3. If Phase 2 produces pending jobs, Phase 3 still runs but bundles with empty `paragraphsEN` placeholders so you can preview app structure before translation completes. Re-running `run.sh` after subagents fill in the outputs gives you the real bundled file. `run.sh` chains 1 → 2 → 2b → 3. If Phase 2 or 2b produces pending jobs, Phase 3 still runs but bundles with placeholders so you can preview app structure before the LLM passes complete. Re-running `run.sh` after subagents fill in the outputs gives you the real bundled file.
The glossary is the book reader's primary word-lookup source: every distinct word translated once, in context, so taps are instant, cover the whole book, and don't mis-resolve homographs (e.g. "como" as the conjunction vs. the verb *comer*). This phase is a permanent part of the pipeline — every book imported this way gets a glossary.
## Adding a new book ## Adding a new book
@@ -34,7 +37,11 @@ This runs Phase 1 (extract) and Phase 2 (manifest jobs), then stops and tells yo
3. **Run translations** (Phase 2.5). The default approach is to spawn Claude Code subagents from inside a Claude Code session pointed at this repo: 3. **Run translations** (Phase 2.5). The default approach is to spawn Claude Code subagents from inside a Claude Code session pointed at this repo:
For each pending job ID listed in `build/<slug>/jobs/_pending.txt`, hand a subagent the prompt at `build/<slug>/jobs/_prompt_template.md` with `<JOB_INPUT_PATH>` / `<JOB_OUTPUT_PATH>` filled in. The subagent reads the input, translates, and writes the output. Resumable — interrupted runs just leave the missing job IDs in `_pending.txt`. There are **two** manifests to drain — translation and glossary:
- `build/<slug>/jobs/_pending.txt` with prompt `build/<slug>/jobs/_prompt_template.md`
- `build/<slug>/glossary/_pending.txt` with prompt `build/<slug>/glossary/_prompt_template.md`
For each pending job ID, hand a subagent the matching prompt with `<JOB_INPUT_PATH>` / `<JOB_OUTPUT_PATH>` filled in. The subagent reads the input, produces the translation/glossary, and writes the output. Resumable — interrupted runs just leave the missing job IDs in `_pending.txt`.
Cluster jobs into agent batches of ~5–10 jobs each to keep per-agent context manageable. ~5 parallel agents is a good throughput target. Cluster jobs into agent batches of ~5–10 jobs each to keep per-agent context manageable. ~5 parallel agents is a good throughput target.
@@ -56,16 +63,23 @@ This runs Phase 1 (extract) and Phase 2 (manifest jobs), then stops and tells yo
Conjuga/Scripts/books/ Conjuga/Scripts/books/
├── extract_epub.py # Phase 1 ├── extract_epub.py # Phase 1
├── translate_chapters.py # Phase 2 ├── translate_chapters.py # Phase 2
├── build_glossary.py # Phase 2b
├── bundle_book.py # Phase 3 ├── bundle_book.py # Phase 3
├── run.sh # Orchestrator ├── run.sh # Orchestrator
└── build/ # gitignored └── build/ # gitignored
└── <slug>/ └── <slug>/
├── chapters.json ├── chapters.json
── jobs/ ── jobs/ # translation jobs
│ ├── _pending.txt
│ ├── _prompt_template.md
│ ├── ch01_b00.input.json
│ ├── ch01_b00.output.json
│ └── ...
└── glossary/ # glossary jobs (Phase 2b)
├── _pending.txt ├── _pending.txt
├── _prompt_template.md ├── _prompt_template.md
├── ch01_b00.input.json ├── gloss_b00.input.json
├── ch01_b00.output.json ├── gloss_b00.output.json
└── ... └── ...
``` ```
@@ -81,5 +95,8 @@ The final output (`book_<slug>.json`) lives at `Conjuga/Conjuga/book_<slug>.json
- OCR of vocab image tables (use `Scripts/textbook/` if your book is image-heavy). - OCR of vocab image tables (use `Scripts/textbook/` if your book is image-heavy).
- Exercise extraction (textbook pipeline). - Exercise extraction (textbook pipeline).
- Pre-computed per-word annotations (the app uses `DictionaryService.lookup()` at runtime). - Per-occurrence word sense disambiguation. The glossary has one entry per
distinct word, translated in context; a word genuinely used in two senses in
the same book gets its dominant sense. The runtime `DictionaryService` + the
on-device LLM remain as fallbacks for anything the glossary misses.
- Cover image extraction (covers are derived from a color hash in the app for now). - Cover image extraction (covers are derived from a color hash in the app for now).
+200
View File
@@ -0,0 +1,200 @@
#!/usr/bin/env python3
"""Phase 2b — build a per-book glossary job manifest.
Scans chapters.json, tokenizes every Spanish paragraph the SAME way the iOS app
does (whitespace split, lowercase, strip leading/trailing punctuation), collects
the distinct words with a few example sentences each, and writes batched
glossary jobs that Claude Code subagents can translate in parallel. Resumable:
jobs whose output file already exists are skipped.
Usage:
python3 build_glossary.py <slug> [--batch-size N] [--max-examples N]
[--build BUILD_DIR]
Inputs:
BUILD_DIR/<slug>/chapters.json (from extract_epub.py)
Outputs:
BUILD_DIR/<slug>/glossary/<jobid>.input.json (one per batch — read by subagents)
BUILD_DIR/<slug>/glossary/_pending.txt (job IDs still missing output)
BUILD_DIR/<slug>/glossary/_prompt_template.md (prompt for each subagent)
Job input shape (.input.json):
{"jobId": "gloss_b00",
"words": [{"word": "taza", "examples": ["...", "..."]}, ...]}
Subagents must write <jobid>.output.json with shape:
{"jobId": "gloss_b00",
"entries": [{"word": "taza", "baseForm": "taza",
"english": "cup", "partOfSpeech": "noun"}, ...]}
`entries` must contain exactly one object per input word.
"""
from __future__ import annotations
import argparse
import json
import re
import unicodedata
from pathlib import Path
# Prompt handed to each glossary subagent. main() renders it with str.format,
# so {input_path} / {output_path} are substitution fields and the literal
# braces of the JSON example are doubled ({{ }}) to survive formatting.
PROMPT_TEMPLATE = """\
You are building a Spanish->English glossary for a language-learning app.
Input file: {input_path}
Output file: {output_path}
Read the input file. It contains a JSON object with a `words` array; each item
has a `word` (a lowercase Spanish word exactly as it appears in a book) and
`examples` (sentences from the book that use that word).
For EACH word, produce one entry:
- baseForm: the dictionary base form -- infinitive for verbs, masculine
singular for nouns/adjectives, the word itself for invariant words.
- english: a concise English translation (1-4 words). Use the sense the word
carries in the example sentences. Many Spanish words are both a verb form
AND a function word -- e.g. "como" is "I eat" (verb) and "as/like"
(conjunction). Choose the meaning shown in the examples, not the most common
dictionary sense.
- partOfSpeech: one of verb, noun, adjective, adverb, pronoun, preposition,
conjunction, article, interjection, numeral, proper noun, other.
Write the output file as JSON with this exact shape:
{{"jobId": "<the jobId from the input>", "entries": [
{{"word": "...", "baseForm": "...", "english": "...", "partOfSpeech": "..."}}
]}}
`entries` MUST contain exactly one object per input word, cover every word, and
echo each `word` back verbatim. Write nothing else to disk and produce no other
output.
"""
# Sentence boundary: a run of whitespace that directly follows ., !, ? or ….
# The lookbehind consumes nothing, so the terminator stays on the sentence
# that precedes the split.
SENTENCE_SPLIT = re.compile(r"(?<=[.!?…])\s+")
def is_punct(ch: str) -> bool:
    """Report whether a single character is Unicode punctuation.

    A character counts as punctuation when its Unicode general category is
    one of the ``P*`` classes; this is intended to mirror Swift's
    ``.punctuationCharacters`` set used by the app.
    """
    major_class = unicodedata.category(ch)[0]
    return major_class == "P"
def clean_word(token: str) -> str:
    """Normalize a raw token to the key form used for glossary lookups.

    Meant to mirror BookReaderView.cleanWord: the token is lowercased,
    punctuation is removed from both ends only (interior punctuation is kept),
    and surrounding whitespace is trimmed. Accented letters are left as-is.
    Returns an empty string when the token is nothing but punctuation.
    """
    lowered = token.lower()
    # Positions of every character that is NOT punctuation; the first and last
    # of these bound the part of the token worth keeping.
    kept = [idx for idx, ch in enumerate(lowered) if not is_punct(ch)]
    if not kept:
        return ""
    core = lowered[kept[0] : kept[-1] + 1]
    return core.strip()
def has_letter(s: str) -> bool:
    """Return True when at least one character of ``s`` is alphabetic."""
    for ch in s:
        if ch.isalpha():
            return True
    return False
def split_sentences(paragraph: str) -> list[str]:
    """Break a paragraph into trimmed, non-empty sentences.

    The paragraph is stripped, cut at every SENTENCE_SPLIT match, and each
    resulting piece is stripped again; pieces that end up empty are dropped.
    """
    sentences: list[str] = []
    for piece in SENTENCE_SPLIT.split(paragraph.strip()):
        trimmed = piece.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences
def is_english_front_matter(chapter: dict, threshold: float = 0.5) -> bool:
    """Decide whether a chapter is untranslated (English) front matter.

    Compares ``paragraphsES`` with ``paragraphsEN`` position by position after
    stripping whitespace. The chapter is treated as front matter when the
    share of identical pairs is strictly greater than ``threshold``. Story
    chapters also contain some identical lines (verbatim ``word = meaning``
    vocab entries), which is why a majority cut-off is used; the original
    author observed roughly 70-100% identical for front matter versus 25-35%
    for stories.

    Returns False whenever the comparison cannot be made: either list is
    empty or missing, or the two lists differ in length. A freshly extracted
    chapter with no ``paragraphsEN`` is therefore never skipped.
    """
    spanish = [text.strip() for text in chapter.get("paragraphsES", [])]
    english = [text.strip() for text in chapter.get("paragraphsEN", [])]
    if len(spanish) == 0 or len(english) == 0:
        return False
    if len(spanish) != len(english):
        return False
    unchanged = 0
    for source_text, translated_text in zip(spanish, english):
        if source_text == translated_text:
            unchanged += 1
    return unchanged / len(spanish) > threshold
def main() -> None:
    """Build the glossary job manifest for one book.

    Reads ``<build>/<slug>/chapters.json`` and collects every distinct cleaned
    word from the Spanish paragraphs, skipping chapters that
    ``is_english_front_matter`` flags, together with up to ``--max-examples``
    sentences that use each word. It then writes, under
    ``<build>/<slug>/glossary/``:

    * one ``<jobid>.input.json`` per ``--batch-size`` words,
    * ``_pending.txt`` listing the jobs that have no ``<jobid>.output.json``,
    * ``_prompt_template.md`` with path placeholders for the subagent.

    A summary is printed to stdout. Exits with an argparse usage error when
    ``--batch-size`` is less than 1.

    Words are batched in the order they first appear in the text. That order
    must be the same on every run, because a job counts as completed purely
    because its output file exists: if batch contents shifted between runs,
    an old output would be matched to a different word list and some words
    would never be translated.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("slug")
    parser.add_argument("--batch-size", type=int, default=150)
    parser.add_argument("--max-examples", type=int, default=3)
    parser.add_argument("--build", type=Path, default=Path("build"))
    args = parser.parse_args()
    if args.batch_size < 1:
        # range() below raises on a step of 0 and yields nothing for a
        # negative step, which would silently produce an empty manifest.
        parser.error("--batch-size must be at least 1")

    base = args.build / args.slug
    chapters = json.loads((base / "chapters.json").read_text(encoding="utf-8"))
    gloss_dir = base / "glossary"
    gloss_dir.mkdir(parents=True, exist_ok=True)

    # word -> example sentences. Keys are inserted the first time a word is
    # met, so the dict's own order is the first-seen order used for batching.
    examples: dict[str, list[str]] = {}
    skipped_front_matter = 0
    for ch in chapters["chapters"]:
        if is_english_front_matter(ch):
            skipped_front_matter += 1
            continue
        for paragraph in ch.get("paragraphsES", []):
            for sentence in split_sentences(paragraph):
                # De-duplicate within the sentence while keeping reading
                # order. A set would iterate in str-hash order, which Python
                # randomizes per process, making batch membership differ
                # from one run to the next.
                cleaned = dict.fromkeys(clean_word(tok) for tok in sentence.split())
                for w in cleaned:
                    if not w or not has_letter(w):
                        continue
                    bucket = examples.setdefault(w, [])
                    if len(bucket) < args.max_examples and sentence not in bucket:
                        bucket.append(sentence)

    words = list(examples)

    pending: list[str] = []
    completed: list[str] = []
    total_jobs = 0
    for offset in range(0, len(words), args.batch_size):
        chunk = words[offset : offset + args.batch_size]
        job_id = f"gloss_b{offset // args.batch_size:02d}"
        input_path = gloss_dir / f"{job_id}.input.json"
        output_path = gloss_dir / f"{job_id}.output.json"
        input_path.write_text(
            json.dumps(
                {
                    "jobId": job_id,
                    "words": [{"word": w, "examples": examples[w]} for w in chunk],
                },
                ensure_ascii=False,
                indent=2,
            ),
            encoding="utf-8",
        )
        total_jobs += 1
        # Resumable: an existing output file marks the job as done.
        (completed if output_path.exists() else pending).append(job_id)

    (gloss_dir / "_pending.txt").write_text(
        "\n".join(pending) + ("\n" if pending else ""), encoding="utf-8"
    )
    (gloss_dir / "_prompt_template.md").write_text(
        PROMPT_TEMPLATE.format(
            input_path="<JOB_INPUT_PATH>", output_path="<JOB_OUTPUT_PATH>"
        ),
        encoding="utf-8",
    )

    print(f"Skipped front matter: {skipped_front_matter} chapter(s)")
    print(f"Distinct words: {len(words)}")
    print(f"Total glossary jobs: {total_jobs}")
    print(f" Completed: {len(completed)}")
    print(f" Pending: {len(pending)}")
    print(f"Manifest at: {gloss_dir / '_pending.txt'}")
    print(f"Prompt template at: {gloss_dir / '_prompt_template.md'}")
if __name__ == "__main__":
main()
+41 -4
View File
@@ -7,7 +7,8 @@ Usage:
Inputs: Inputs:
BUILD_DIR/<slug>/chapters.json BUILD_DIR/<slug>/chapters.json
BUILD_DIR/<slug>/jobs/*.output.json (from translation subagents) BUILD_DIR/<slug>/jobs/*.output.json (from translation subagents)
BUILD_DIR/<slug>/glossary/*.output.json (from glossary subagents, Phase 2b)
Output: Output:
DEST_DIR/book_<slug>.json DEST_DIR/book_<slug>.json
@@ -21,11 +22,16 @@ Output:
"paragraphsES": ["...", ...], "paragraphsES": ["...", ...],
"paragraphsEN": ["...", ...]}, "paragraphsEN": ["...", ...]},
... ...
] ],
"glossary": {
"taza": {"baseForm": "taza", "english": "cup", "partOfSpeech": "noun"},
...
}
} }
If --require-all is passed, the script fails if any job is missing its output. If --require-all is passed, the script fails if any translation OR glossary job
Otherwise it fills missing translations with empty strings and warns. is missing its output. Otherwise it fills missing translations with empty
strings, leaves missing glossary entries out, and warns.
""" """
from __future__ import annotations from __future__ import annotations
@@ -86,6 +92,35 @@ def main() -> None:
sys.exit(1) sys.exit(1)
print(f"WARN: {msg} — using empty strings for those paragraphs.", file=sys.stderr) print(f"WARN: {msg} — using empty strings for those paragraphs.", file=sys.stderr)
# Glossary (Phase 2b) — merge every glossary job's entries into one map
# keyed by the cleaned word the app looks up.
glossary_dir = base / "glossary"
glossary: dict[str, dict] = {}
glossary_missing: list[str] = []
if glossary_dir.exists():
for input_path in sorted(glossary_dir.glob("*.input.json")):
job_id = input_path.stem.removesuffix(".input")
output_path = glossary_dir / f"{job_id}.output.json"
if not output_path.exists():
glossary_missing.append(job_id)
continue
output_data = json.loads(output_path.read_text(encoding="utf-8"))
for entry in output_data.get("entries", []):
word = (entry.get("word") or "").strip()
if not word:
continue
glossary[word] = {
"baseForm": entry.get("baseForm") or word,
"english": entry.get("english") or "",
"partOfSpeech": entry.get("partOfSpeech") or "",
}
if glossary_missing:
msg = f"{len(glossary_missing)} glossary job(s) missing output: {glossary_missing[:5]}{'...' if len(glossary_missing) > 5 else ''}"
if args.require_all:
print(f"ERROR: {msg}", file=sys.stderr)
sys.exit(1)
print(f"WARN: {msg} — glossary will be incomplete.", file=sys.stderr)
bundled_chapters: list[dict] = [] bundled_chapters: list[dict] = []
for ch in chapters["chapters"]: for ch in chapters["chapters"]:
translations = sorted(chapter_translations.get(ch["number"], [])) translations = sorted(chapter_translations.get(ch["number"], []))
@@ -113,6 +148,7 @@ def main() -> None:
"author": chapters["author"], "author": chapters["author"],
"language": chapters["language"], "language": chapters["language"],
"chapters": bundled_chapters, "chapters": bundled_chapters,
"glossary": glossary,
} }
dest_dir = (args.dest or DEFAULT_DEST).resolve() dest_dir = (args.dest or DEFAULT_DEST).resolve()
@@ -122,6 +158,7 @@ def main() -> None:
print(f"Wrote {out_path}") print(f"Wrote {out_path}")
print(f" Chapters: {len(bundled_chapters)}") print(f" Chapters: {len(bundled_chapters)}")
print(f" Translated jobs: {sum(len(v) for v in chapter_translations.values())} / {sum(len(v) for v in chapter_translations.values()) + len(missing)}") print(f" Translated jobs: {sum(len(v) for v in chapter_translations.values())} / {sum(len(v) for v in chapter_translations.values()) + len(missing)}")
print(f" Glossary words: {len(glossary)}")
if __name__ == "__main__": if __name__ == "__main__":
+16 -4
View File
@@ -23,11 +23,13 @@ fi
EPUB="$1"; shift EPUB="$1"; shift
SLUG="" SLUG=""
BATCH_SIZE="30" BATCH_SIZE="30"
GLOSSARY_BATCH_SIZE="150"
while [[ $# -gt 0 ]]; do while [[ $# -gt 0 ]]; do
case "$1" in case "$1" in
--slug) SLUG="$2"; shift 2 ;; --slug) SLUG="$2"; shift 2 ;;
--batch-size) BATCH_SIZE="$2"; shift 2 ;; --batch-size) BATCH_SIZE="$2"; shift 2 ;;
--glossary-batch-size) GLOSSARY_BATCH_SIZE="$2"; shift 2 ;;
*) echo "unknown option: $1" >&2; exit 2 ;; *) echo "unknown option: $1" >&2; exit 2 ;;
esac esac
done done
@@ -53,12 +55,22 @@ python3 translate_chapters.py "$SLUG" --batch-size "$BATCH_SIZE"
PENDING_FILE="build/$SLUG/jobs/_pending.txt" PENDING_FILE="build/$SLUG/jobs/_pending.txt"
PENDING_COUNT=$(wc -l < "$PENDING_FILE" | tr -d ' ') PENDING_COUNT=$(wc -l < "$PENDING_FILE" | tr -d ' ')
echo
echo "=== Phase 2b: build_glossary.py ==="
python3 build_glossary.py "$SLUG" --batch-size "$GLOSSARY_BATCH_SIZE"
GLOSS_PENDING_FILE="build/$SLUG/glossary/_pending.txt"
GLOSS_PENDING_COUNT=$(wc -l < "$GLOSS_PENDING_FILE" | tr -d ' ')
TOTAL_PENDING=$((PENDING_COUNT + GLOSS_PENDING_COUNT))
echo echo
echo "=== Phase 3: bundle_book.py ===" echo "=== Phase 3: bundle_book.py ==="
if [[ "$PENDING_COUNT" -gt 0 ]]; then if [[ "$TOTAL_PENDING" -gt 0 ]]; then
echo " $PENDING_COUNT translation job(s) still pending." echo " $PENDING_COUNT translation job(s) and $GLOSS_PENDING_COUNT glossary job(s) still pending."
echo " Run the Claude Code subagent translation step (see README.md), then re-run this script." echo " Run the Claude Code subagent step (see README.md) for BOTH manifests:"
echo " Bundling with empty placeholders so you can preview app structure now." echo " build/$SLUG/jobs/_pending.txt (translation)"
echo " build/$SLUG/glossary/_pending.txt (glossary)"
echo " then re-run this script. Bundling with placeholders so you can preview now."
python3 bundle_book.py "$SLUG" python3 bundle_book.py "$SLUG"
else else
python3 bundle_book.py "$SLUG" --require-all python3 bundle_book.py "$SLUG" --require-all
@@ -12,6 +12,10 @@ public final class Book {
public var language: String = "" public var language: String = ""
public var chapterCount: Int = 0 public var chapterCount: Int = 0
public var accentColorHex: String = "" public var accentColorHex: String = ""
/// JSON-encoded `[String: WordGloss]` — the book reader's primary word
/// lookup, keyed by the cleaned (lowercased, punctuation-trimmed) word.
/// Pre-computed at import time so taps resolve instantly and in context.
public var glossaryJSON: Data = Data()
public init( public init(
slug: String, slug: String,
@@ -19,7 +23,8 @@ public final class Book {
author: String, author: String,
language: String, language: String,
chapterCount: Int, chapterCount: Int,
accentColorHex: String accentColorHex: String,
glossaryJSON: Data = Data()
) { ) {
self.id = slug self.id = slug
self.slug = slug self.slug = slug
@@ -28,5 +33,26 @@ public final class Book {
self.language = language self.language = language
self.chapterCount = chapterCount self.chapterCount = chapterCount
self.accentColorHex = accentColorHex self.accentColorHex = accentColorHex
self.glossaryJSON = glossaryJSON
}
/// Decodes and returns the per-book glossary stored in `glossaryJSON`.
///
/// Every call runs a fresh JSON decode, so decode once and cache the result
/// at the call site rather than calling this per lookup.
///
/// - Returns: The glossary keyed by cleaned word, or an empty dictionary when
///   `glossaryJSON` is empty or cannot be decoded as `[String: WordGloss]`.
public func glossary() -> [String: WordGloss] {
    // A book stored without a glossary carries empty data; decoding that
    // would only throw, so answer directly.
    guard !glossaryJSON.isEmpty else { return [:] }
    return (try? JSONDecoder().decode([String: WordGloss].self, from: glossaryJSON)) ?? [:]
}
}
/// One glossary entry: a word's dictionary base form, English meaning, and
/// part of speech, translated in the book's context at import time.
public struct WordGloss: Codable, Hashable, Sendable {
/// Dictionary base form of the word.
public let baseForm: String
/// English meaning of the word.
public let english: String
/// Part-of-speech label for the word.
public let partOfSpeech: String
/// Creates an entry from its three fields. Declared explicitly because a
/// synthesized memberwise initializer would not be `public`.
public init(baseForm: String, english: String, partOfSpeech: String) {
self.baseForm = baseForm
self.english = english
self.partOfSpeech = partOfSpeech
}
} }
@@ -32,12 +32,23 @@ public struct WordAnnotation: Codable, Identifiable, Hashable {
public let baseForm: String public let baseForm: String
public let english: String public let english: String
public let partOfSpeech: String public let partOfSpeech: String
/// Human-readable name of the resource that produced this definition
/// (e.g. "Book glossary", "Dictionary", "AI guess"). Defaulted so older
/// persisted annotations without the field still decode.
public var source: String = ""
public init(word: String, baseForm: String, english: String, partOfSpeech: String) { public init(
word: String,
baseForm: String,
english: String,
partOfSpeech: String,
source: String = ""
) {
self.word = word self.word = word
self.baseForm = baseForm self.baseForm = baseForm
self.english = english self.english = english
self.partOfSpeech = partOfSpeech self.partOfSpeech = partOfSpeech
self.source = source
} }
} }