Books — pre-computed per-book glossary for context-correct word lookup
The book reader's word lookup used DictionaryService, a verb-conjugation index plus ~200 hand-typed words: ordinary nouns like "taza" returned nothing, and homographs always lost (tapping "como" in "como siempre" gave the verb "comer" because the verb index is checked first). Add a glossary phase to the books pipeline (build_glossary.py): every distinct Spanish word is translated once, in its sentence context, by the same Claude-Code-subagent LLM step the pipeline already uses for chapter translation. English front matter is excluded by an ES==EN paragraph-ratio heuristic. The glossary is bundled into book_<slug>.json and is now part of the pipeline for every book. In the app, Book carries the decoded glossary and BookReaderView resolves each tap automatically through cache -> glossary -> DictionaryService -> on-device LLM, citing which source answered so a curated glossary hit reads differently from a best-effort AI guess. book_olly-vol2.json regenerated with a 3,658-word glossary. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -9,7 +9,7 @@ actor DataLoader {
|
|||||||
static let textbookDataVersion = 14
|
static let textbookDataVersion = 14
|
||||||
static let textbookDataKey = "textbookDataVersion"
|
static let textbookDataKey = "textbookDataVersion"
|
||||||
|
|
||||||
static let bookDataVersion = 4 // bump: force re-seed for installs where books didn't persist
|
static let bookDataVersion = 5 // bump: per-book glossary added
|
||||||
static let bookDataKey = "bookDataVersion"
|
static let bookDataKey = "bookDataVersion"
|
||||||
|
|
||||||
/// Quick check: does the DB need seeding or course data refresh?
|
/// Quick check: does the DB need seeding or course data refresh?
|
||||||
@@ -595,13 +595,27 @@ actor DataLoader {
|
|||||||
let author = (json["author"] as? String) ?? ""
|
let author = (json["author"] as? String) ?? ""
|
||||||
let language = (json["language"] as? String) ?? "es"
|
let language = (json["language"] as? String) ?? "es"
|
||||||
|
|
||||||
|
// Pre-computed per-book glossary, keyed by cleaned word.
|
||||||
|
var glossary: [String: WordGloss] = [:]
|
||||||
|
if let glossaryRaw = json["glossary"] as? [String: [String: String]] {
|
||||||
|
for (word, fields) in glossaryRaw {
|
||||||
|
glossary[word] = WordGloss(
|
||||||
|
baseForm: fields["baseForm"] ?? word,
|
||||||
|
english: fields["english"] ?? "",
|
||||||
|
partOfSpeech: fields["partOfSpeech"] ?? ""
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let glossaryData = (try? JSONEncoder().encode(glossary)) ?? Data()
|
||||||
|
|
||||||
let book = Book(
|
let book = Book(
|
||||||
slug: slug,
|
slug: slug,
|
||||||
title: title,
|
title: title,
|
||||||
author: author,
|
author: author,
|
||||||
language: language,
|
language: language,
|
||||||
chapterCount: chaptersRaw.count,
|
chapterCount: chaptersRaw.count,
|
||||||
accentColorHex: accentHex(forSlug: slug)
|
accentColorHex: accentHex(forSlug: slug),
|
||||||
|
glossaryJSON: glossaryData
|
||||||
)
|
)
|
||||||
context.insert(book)
|
context.insert(book)
|
||||||
insertedBooks += 1
|
insertedBooks += 1
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ struct BookChapterListView: View {
|
|||||||
List {
|
List {
|
||||||
ForEach(allChapters) { chapter in
|
ForEach(allChapters) { chapter in
|
||||||
NavigationLink {
|
NavigationLink {
|
||||||
BookReaderView(chapter: chapter)
|
BookReaderView(chapter: chapter, book: book)
|
||||||
} label: {
|
} label: {
|
||||||
HStack(spacing: 12) {
|
HStack(spacing: 12) {
|
||||||
Text("\(chapter.number)")
|
Text("\(chapter.number)")
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import FoundationModels
|
|||||||
|
|
||||||
struct BookReaderView: View {
|
struct BookReaderView: View {
|
||||||
let chapter: BookChapter
|
let chapter: BookChapter
|
||||||
|
let book: Book
|
||||||
|
|
||||||
@Environment(DictionaryService.self) private var dictionary
|
@Environment(DictionaryService.self) private var dictionary
|
||||||
@State private var speech = BookSpeechController()
|
@State private var speech = BookSpeechController()
|
||||||
@@ -12,6 +13,8 @@ struct BookReaderView: View {
|
|||||||
@State private var showVoicePicker = false
|
@State private var showVoicePicker = false
|
||||||
@State private var wasReadingBeforeTap = false
|
@State private var wasReadingBeforeTap = false
|
||||||
@State private var lookupCache: [String: WordAnnotation] = [:]
|
@State private var lookupCache: [String: WordAnnotation] = [:]
|
||||||
|
/// The book's pre-computed glossary, decoded once on appear.
|
||||||
|
@State private var glossary: [String: WordGloss] = [:]
|
||||||
|
|
||||||
@AppStorage("bookReaderVoiceId") private var storedVoiceId: String = ""
|
@AppStorage("bookReaderVoiceId") private var storedVoiceId: String = ""
|
||||||
@AppStorage("bookReaderRate") private var storedRate: Double = 0.45
|
@AppStorage("bookReaderRate") private var storedRate: Double = 0.45
|
||||||
@@ -83,6 +86,9 @@ struct BookReaderView: View {
|
|||||||
.onAppear {
|
.onAppear {
|
||||||
speech.voiceIdentifier = storedVoiceId.isEmpty ? nil : storedVoiceId
|
speech.voiceIdentifier = storedVoiceId.isEmpty ? nil : storedVoiceId
|
||||||
speech.rate = Float(storedRate)
|
speech.rate = Float(storedRate)
|
||||||
|
if glossary.isEmpty {
|
||||||
|
glossary = book.glossary()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
.onDisappear {
|
.onDisappear {
|
||||||
speech.stop()
|
speech.stop()
|
||||||
@@ -158,16 +164,32 @@ struct BookReaderView: View {
|
|||||||
wasReadingBeforeTap = true
|
wasReadingBeforeTap = true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Fall-through chain, best source first. Whichever resource answers,
|
||||||
|
// the popup names it so a curated glossary hit reads differently from
|
||||||
|
// a best-effort on-device LLM guess.
|
||||||
if let cached = lookupCache[cleaned] {
|
if let cached = lookupCache[cleaned] {
|
||||||
selectedWord = cached
|
selectedWord = cached
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
if let gloss = glossary[cleaned] {
|
||||||
|
let annotation = WordAnnotation(
|
||||||
|
word: cleaned,
|
||||||
|
baseForm: gloss.baseForm,
|
||||||
|
english: gloss.english,
|
||||||
|
partOfSpeech: gloss.partOfSpeech,
|
||||||
|
source: "Book glossary"
|
||||||
|
)
|
||||||
|
lookupCache[cleaned] = annotation
|
||||||
|
selectedWord = annotation
|
||||||
|
return
|
||||||
|
}
|
||||||
if let entry = dictionary.lookup(cleaned) {
|
if let entry = dictionary.lookup(cleaned) {
|
||||||
let annotation = WordAnnotation(
|
let annotation = WordAnnotation(
|
||||||
word: cleaned,
|
word: cleaned,
|
||||||
baseForm: entry.baseForm,
|
baseForm: entry.baseForm,
|
||||||
english: entry.english,
|
english: entry.english,
|
||||||
partOfSpeech: entry.partOfSpeech
|
partOfSpeech: entry.partOfSpeech,
|
||||||
|
source: "Dictionary"
|
||||||
)
|
)
|
||||||
lookupCache[cleaned] = annotation
|
lookupCache[cleaned] = annotation
|
||||||
selectedWord = annotation
|
selectedWord = annotation
|
||||||
@@ -176,7 +198,8 @@ struct BookReaderView: View {
|
|||||||
selectedWord = WordAnnotation(word: cleaned, baseForm: cleaned, english: "Looking up...", partOfSpeech: "")
|
selectedWord = WordAnnotation(word: cleaned, baseForm: cleaned, english: "Looking up...", partOfSpeech: "")
|
||||||
Task {
|
Task {
|
||||||
do {
|
do {
|
||||||
let annotation = try await WordLookup.lookup(word: cleaned, inContext: paragraph)
|
var annotation = try await WordLookup.lookup(word: cleaned, inContext: paragraph)
|
||||||
|
annotation.source = "AI guess"
|
||||||
lookupCache[cleaned] = annotation
|
lookupCache[cleaned] = annotation
|
||||||
selectedWord = annotation
|
selectedWord = annotation
|
||||||
} catch {
|
} catch {
|
||||||
@@ -347,9 +370,22 @@ private struct WordDetailSheet: View {
|
|||||||
}
|
}
|
||||||
|
|
||||||
Spacer()
|
Spacer()
|
||||||
|
|
||||||
|
if !word.source.isEmpty {
|
||||||
|
Text(sourceLabel)
|
||||||
|
.font(.caption2)
|
||||||
|
.foregroundStyle(.tertiary)
|
||||||
|
.frame(maxWidth: .infinity, alignment: .leading)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
.padding()
|
.padding()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Caption text naming the resource that produced the definition.
///
/// An on-device LLM answer (`source == "AI guess"`) gets an explicit
/// caveat; any other source is shown as "Source: <name>".
private var sourceLabel: String {
    if word.source == "AI guess" {
        return "AI guess · on-device estimate, may be approximate"
    }
    return "Source: \(word.source)"
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// MARK: - On-demand word lookup (matches StoryReaderView's WordLookup)
|
// MARK: - On-demand word lookup (matches StoryReaderView's WordLookup)
|
||||||
|
|||||||
+18293
-1
File diff suppressed because it is too large
Load Diff
@@ -17,10 +17,13 @@ This runs Phase 1 (extract) and Phase 2 (manifest jobs), then stops and tells yo
|
|||||||
|---|---|---|---|
|
|---|---|---|---|
|
||||||
| 1 | `extract_epub.py` | Unzip the EPUB, walk `content.opf` spine + `toc.ncx` navMap, group HTML files into chapters, strip HTML→text. | `build/<slug>/chapters.json` |
|
| 1 | `extract_epub.py` | Unzip the EPUB, walk `content.opf` spine + `toc.ncx` navMap, group HTML files into chapters, strip HTML→text. | `build/<slug>/chapters.json` |
|
||||||
| 2 | `translate_chapters.py` | Split each chapter into ~30-paragraph translation batches. Each batch becomes a job with its own input/output file. **Resumable**: jobs whose output file already exists are skipped. | `build/<slug>/jobs/<jobid>.input.json` + `_pending.txt` |
|
| 2 | `translate_chapters.py` | Split each chapter into ~30-paragraph translation batches. Each batch becomes a job with its own input/output file. **Resumable**: jobs whose output file already exists are skipped. | `build/<slug>/jobs/<jobid>.input.json` + `_pending.txt` |
|
||||||
| 2.5 | Claude Code subagents | Read each job's `.input.json`, translate Spanish→English, write `<jobid>.output.json`. See "Running translations" below. | `build/<slug>/jobs/<jobid>.output.json` |
|
| 2b | `build_glossary.py` | Tokenize every Spanish paragraph the same way the app does, collect the distinct words with example sentences, split into ~150-word glossary batches. **Resumable**: as in Phase 2, jobs whose output file already exists are skipped. | `build/<slug>/glossary/<jobid>.input.json` + `_pending.txt` |
|
||||||
| 3 | `bundle_book.py` | Merge `chapters.json` + every `*.output.json` into the final bundled JSON the app reads. | `Conjuga/Conjuga/book_<slug>.json` |
|
| 2.5 | Claude Code subagents | Drain **both** manifests: translate the chapter jobs *and* the glossary jobs, writing each job's `<jobid>.output.json`. See "Running translations" below. | `build/<slug>/{jobs,glossary}/<jobid>.output.json` |
|
||||||
|
| 3 | `bundle_book.py` | Merge `chapters.json` + every translation `*.output.json` + every glossary `*.output.json` into the final bundled JSON the app reads. | `Conjuga/Conjuga/book_<slug>.json` |
|
||||||
|
|
||||||
`run.sh` chains 1 → 2 → 3. If Phase 2 produces pending jobs, Phase 3 still runs but bundles with empty `paragraphsEN` placeholders so you can preview app structure before translation completes. Re-running `run.sh` after subagents fill in the outputs gives you the real bundled file.
|
`run.sh` chains 1 → 2 → 2b → 3. If Phase 2 or 2b produces pending jobs, Phase 3 still runs but bundles with placeholders so you can preview app structure before the LLM passes complete. Re-running `run.sh` after subagents fill in the outputs gives you the real bundled file.
|
||||||
|
|
||||||
|
The glossary is the book reader's primary word-lookup source: every distinct word translated once, in context, so taps are instant, cover the whole book, and don't mis-resolve homographs (e.g. "como" as the conjunction vs. the verb *comer*). This phase is a permanent part of the pipeline — every book imported this way gets a glossary.
|
||||||
|
|
||||||
## Adding a new book
|
## Adding a new book
|
||||||
|
|
||||||
@@ -34,7 +37,11 @@ This runs Phase 1 (extract) and Phase 2 (manifest jobs), then stops and tells yo
|
|||||||
|
|
||||||
3. **Run translations** (Phase 2.5). The default approach is to spawn Claude Code subagents from inside a Claude Code session pointed at this repo:
|
3. **Run translations** (Phase 2.5). The default approach is to spawn Claude Code subagents from inside a Claude Code session pointed at this repo:
|
||||||
|
|
||||||
For each pending job ID listed in `build/<slug>/jobs/_pending.txt`, hand a subagent the prompt at `build/<slug>/jobs/_prompt_template.md` with `<JOB_INPUT_PATH>` / `<JOB_OUTPUT_PATH>` filled in. The subagent reads the input, translates, and writes the output. Resumable — interrupted runs just leave the missing job IDs in `_pending.txt`.
|
There are **two** manifests to drain — translation and glossary:
|
||||||
|
- `build/<slug>/jobs/_pending.txt` with prompt `build/<slug>/jobs/_prompt_template.md`
|
||||||
|
- `build/<slug>/glossary/_pending.txt` with prompt `build/<slug>/glossary/_prompt_template.md`
|
||||||
|
|
||||||
|
For each pending job ID, hand a subagent the matching prompt with `<JOB_INPUT_PATH>` / `<JOB_OUTPUT_PATH>` filled in. The subagent reads the input, produces the translation/glossary, and writes the output. Resumable — interrupted runs just leave the missing job IDs in `_pending.txt`.
|
||||||
|
|
||||||
Cluster jobs into agent batches of ~5–10 jobs each to keep per-agent context manageable. ~5 parallel agents is a good throughput target.
|
Cluster jobs into agent batches of ~5–10 jobs each to keep per-agent context manageable. ~5 parallel agents is a good throughput target.
|
||||||
|
|
||||||
@@ -56,16 +63,23 @@ This runs Phase 1 (extract) and Phase 2 (manifest jobs), then stops and tells yo
|
|||||||
Conjuga/Scripts/books/
|
Conjuga/Scripts/books/
|
||||||
├── extract_epub.py # Phase 1
|
├── extract_epub.py # Phase 1
|
||||||
├── translate_chapters.py # Phase 2
|
├── translate_chapters.py # Phase 2
|
||||||
|
├── build_glossary.py # Phase 2b
|
||||||
├── bundle_book.py # Phase 3
|
├── bundle_book.py # Phase 3
|
||||||
├── run.sh # Orchestrator
|
├── run.sh # Orchestrator
|
||||||
└── build/ # gitignored
|
└── build/ # gitignored
|
||||||
└── <slug>/
|
└── <slug>/
|
||||||
├── chapters.json
|
├── chapters.json
|
||||||
└── jobs/
|
├── jobs/ # translation jobs
|
||||||
|
│ ├── _pending.txt
|
||||||
|
│ ├── _prompt_template.md
|
||||||
|
│ ├── ch01_b00.input.json
|
||||||
|
│ ├── ch01_b00.output.json
|
||||||
|
│ └── ...
|
||||||
|
└── glossary/ # glossary jobs (Phase 2b)
|
||||||
├── _pending.txt
|
├── _pending.txt
|
||||||
├── _prompt_template.md
|
├── _prompt_template.md
|
||||||
├── ch01_b00.input.json
|
├── gloss_b00.input.json
|
||||||
├── ch01_b00.output.json
|
├── gloss_b00.output.json
|
||||||
└── ...
|
└── ...
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -81,5 +95,8 @@ The final output (`book_<slug>.json`) lives at `Conjuga/Conjuga/book_<slug>.json
|
|||||||
|
|
||||||
- OCR of vocab image tables (use `Scripts/textbook/` if your book is image-heavy).
|
- OCR of vocab image tables (use `Scripts/textbook/` if your book is image-heavy).
|
||||||
- Exercise extraction (textbook pipeline).
|
- Exercise extraction (textbook pipeline).
|
||||||
- Pre-computed per-word annotations (the app uses `DictionaryService.lookup()` at runtime).
|
- Per-occurrence word sense disambiguation. The glossary has one entry per
|
||||||
|
distinct word, translated in context; a word genuinely used in two senses in
|
||||||
|
the same book gets its dominant sense. The runtime `DictionaryService` + the
|
||||||
|
on-device LLM remain as fallbacks for anything the glossary misses.
|
||||||
- Cover image extraction (covers are derived from a color hash in the app for now).
|
- Cover image extraction (covers are derived from a color hash in the app for now).
|
||||||
|
|||||||
@@ -0,0 +1,200 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Phase 2b — build a per-book glossary job manifest.
|
||||||
|
|
||||||
|
Scans chapters.json, tokenizes every Spanish paragraph the SAME way the iOS app
|
||||||
|
does (whitespace split, lowercase, strip leading/trailing punctuation), collects
|
||||||
|
the distinct words with a few example sentences each, and writes batched
|
||||||
|
glossary jobs that Claude Code subagents can translate in parallel. Resumable:
|
||||||
|
jobs whose output file already exists are skipped.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 build_glossary.py <slug> [--batch-size N] [--max-examples N]
|
||||||
|
[--build BUILD_DIR]
|
||||||
|
|
||||||
|
Inputs:
|
||||||
|
BUILD_DIR/<slug>/chapters.json (from extract_epub.py)
|
||||||
|
|
||||||
|
Outputs:
|
||||||
|
BUILD_DIR/<slug>/glossary/<jobid>.input.json (one per batch — read by subagents)
|
||||||
|
BUILD_DIR/<slug>/glossary/_pending.txt (job IDs still missing output)
|
||||||
|
BUILD_DIR/<slug>/glossary/_prompt_template.md (prompt for each subagent)
|
||||||
|
|
||||||
|
Job input shape (.input.json):
|
||||||
|
{"jobId": "gloss_b00",
|
||||||
|
"words": [{"word": "taza", "examples": ["...", "..."]}, ...]}
|
||||||
|
|
||||||
|
Subagents must write <jobid>.output.json with shape:
|
||||||
|
{"jobId": "gloss_b00",
|
||||||
|
"entries": [{"word": "taza", "baseForm": "taza",
|
||||||
|
"english": "cup", "partOfSpeech": "noun"}, ...]}
|
||||||
|
|
||||||
|
`entries` must contain exactly one object per input word.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import unicodedata
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
# Prompt handed to each glossary subagent. Rendered with str.format(), so the
# only live placeholders are {input_path} and {output_path}; the doubled braces
# ({{ }}) are escapes that come out as literal JSON braces.
#
# The baseForm rule asks for the singular of a noun in the noun's own gender
# (e.g. "tazas" -> "taza") and reserves "masculine singular" for adjectives: a
# feminine noun has no masculine form, and the job-shape example in this
# script's docstring already maps "taza" to "taza".
PROMPT_TEMPLATE = """\
You are building a Spanish->English glossary for a language-learning app.

Input file: {input_path}
Output file: {output_path}

Read the input file. It contains a JSON object with a `words` array; each item
has a `word` (a lowercase Spanish word exactly as it appears in a book) and
`examples` (sentences from the book that use that word).

For EACH word, produce one entry:
- baseForm: the dictionary base form -- infinitive for verbs, singular for
  nouns (keep the noun's own gender, e.g. "tazas" -> "taza"), masculine
  singular for adjectives, the word itself for invariant words.
- english: a concise English translation (1-4 words). Use the sense the word
  carries in the example sentences. Many Spanish words are both a verb form
  AND a function word -- e.g. "como" is "I eat" (verb) and "as/like"
  (conjunction). Choose the meaning shown in the examples, not the most common
  dictionary sense.
- partOfSpeech: one of verb, noun, adjective, adverb, pronoun, preposition,
  conjunction, article, interjection, numeral, proper noun, other.

Write the output file as JSON with this exact shape:
{{"jobId": "<the jobId from the input>", "entries": [
  {{"word": "...", "baseForm": "...", "english": "...", "partOfSpeech": "..."}}
]}}

`entries` MUST contain exactly one object per input word, cover every word, and
echo each `word` back verbatim. Write nothing else to disk and produce no other
output.
"""
|
||||||
|
|
||||||
|
# Sentence boundary: a run of whitespace that directly follows ".", "!", "?"
# or "…". The lookbehind is zero-width, so the terminator stays attached to
# the sentence it ends and only the whitespace is consumed by the split.
SENTENCE_SPLIT = re.compile(r"(?<=[.!?…])\s+")
|
||||||
|
|
||||||
|
|
||||||
|
def is_punct(ch: str) -> bool:
    """Report whether a single character is Unicode punctuation.

    A character counts as punctuation when its Unicode general category is
    one of the "P*" classes (Pc, Pd, Ps, Pe, Pi, Pf, Po). This is intended to
    line up with Swift's `.punctuationCharacters` set used by the app.
    """
    general_category = unicodedata.category(ch)
    return general_category[:1] == "P"
|
||||||
|
|
||||||
|
|
||||||
|
def clean_word(token: str) -> str:
    """Normalize a raw whitespace-delimited token into a glossary key.

    The token is lowercased, punctuation is peeled off both ends (interior
    punctuation is kept), and surrounding whitespace is trimmed from what is
    left. Accented characters are not folded. Intended to mirror
    BookReaderView.cleanWord in the app, which is not visible from this
    script -- confirm the two stay in sync.
    """
    lowered = token.lower()

    # Walk forward past leading punctuation.
    first = 0
    for ch in lowered:
        if not is_punct(ch):
            break
        first += 1

    # Walk backward past trailing punctuation, never crossing `first`.
    last = len(lowered)
    while last > first and is_punct(lowered[last - 1]):
        last -= 1

    return lowered[first:last].strip()
|
||||||
|
|
||||||
|
|
||||||
|
def has_letter(s: str) -> bool:
    """Return True when at least one character of `s` is alphabetic.

    Used to discard tokens that are only digits or symbols; an empty string
    yields False.
    """
    for ch in s:
        if ch.isalpha():
            return True
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def split_sentences(paragraph: str) -> list[str]:
    """Break a paragraph into trimmed, non-empty sentences.

    Splitting happens at SENTENCE_SPLIT boundaries after the paragraph itself
    has been stripped; pieces that are empty once trimmed are dropped.
    """
    sentences = []
    for piece in SENTENCE_SPLIT.split(paragraph.strip()):
        trimmed = piece.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences
|
||||||
|
|
||||||
|
|
||||||
|
def is_english_front_matter(chapter: dict, threshold: float = 0.5) -> bool:
    """Guess whether a chapter is untranslated English front matter.

    Compares `paragraphsES` and `paragraphsEN` pairwise after stripping
    whitespace. The chapter is treated as front matter when the share of
    identical pairs is strictly greater than `threshold`.

    Returns False whenever the comparison cannot be made: either list is
    missing or empty, or the two lists differ in length. In particular a
    chapter that carries no `paragraphsEN` yet is never flagged.

    Rationale carried over from the original author's note: story chapters
    still contain some verbatim lines (e.g. `word = meaning` vocab entries),
    so a majority threshold is used rather than "any identical line".
    """
    spanish = [text.strip() for text in chapter.get("paragraphsES", [])]
    english = [text.strip() for text in chapter.get("paragraphsEN", [])]

    if not spanish or not english:
        return False
    if len(spanish) != len(english):
        return False

    matches = 0
    for es_text, en_text in zip(spanish, english):
        if es_text == en_text:
            matches += 1
    return matches / len(spanish) > threshold
|
||||||
|
|
||||||
|
|
||||||
|
def _collect_examples(
    chapters: list[dict], max_examples: int
) -> tuple[dict[str, list[str]], int]:
    """Gather every distinct cleaned word with up to `max_examples` sentences.

    Returns `(examples, skipped)`, where `examples` maps each word to its
    example sentences and `skipped` counts chapters ignored as English front
    matter. The key order of `examples` is first-appearance order in the book.

    Tokens of a sentence are de-duplicated with `dict.fromkeys`, which keeps
    their order. A `set` must not be used here: string hashing is randomized
    per Python process, so set iteration order -- and with it the word order
    and the contents of every batch -- would change from run to run and break
    resumability (an existing `<jobid>.output.json` would no longer match the
    rewritten `<jobid>.input.json`).
    """
    examples: dict[str, list[str]] = {}
    skipped = 0
    for ch in chapters:
        if is_english_front_matter(ch):
            skipped += 1
            continue
        for paragraph in ch.get("paragraphsES", []):
            for sentence in split_sentences(paragraph):
                tokens = dict.fromkeys(clean_word(tok) for tok in sentence.split())
                for w in tokens:
                    if not w or not has_letter(w):
                        continue
                    # setdefault registers the word on first sight, so dict
                    # insertion order doubles as first-seen order.
                    bucket = examples.setdefault(w, [])
                    if len(bucket) < max_examples and sentence not in bucket:
                        bucket.append(sentence)
    return examples, skipped


def main() -> None:
    """Build the glossary job manifest for one book.

    Reads `BUILD_DIR/<slug>/chapters.json`, collects the distinct words, and
    writes under `BUILD_DIR/<slug>/glossary/`:
      * one `<jobid>.input.json` per batch of `--batch-size` words,
      * `_pending.txt` listing job IDs whose `<jobid>.output.json` is absent,
      * `_prompt_template.md` with the subagent prompt.
    Input files are always rewritten; a job counts as completed purely on the
    existence of its output file, which is why batch contents must be
    reproducible between runs.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("slug")
    parser.add_argument("--batch-size", type=int, default=150)
    parser.add_argument("--max-examples", type=int, default=3)
    parser.add_argument("--build", type=Path, default=Path("build"))
    args = parser.parse_args()
    if args.batch_size < 1:
        # range() below would otherwise fail with an opaque ValueError (step 0)
        # or silently produce no jobs (negative step).
        parser.error("--batch-size must be at least 1")

    base = args.build / args.slug
    chapters = json.loads((base / "chapters.json").read_text(encoding="utf-8"))
    gloss_dir = base / "glossary"
    gloss_dir.mkdir(parents=True, exist_ok=True)

    examples, skipped_front_matter = _collect_examples(
        chapters["chapters"], args.max_examples
    )
    # Dict insertion order == order of first appearance in the book.
    words = list(examples)

    pending: list[str] = []
    completed: list[str] = []
    total_jobs = 0

    for offset in range(0, len(words), args.batch_size):
        chunk = words[offset : offset + args.batch_size]
        job_id = f"gloss_b{offset // args.batch_size:02d}"
        input_path = gloss_dir / f"{job_id}.input.json"
        output_path = gloss_dir / f"{job_id}.output.json"
        input_path.write_text(
            json.dumps(
                {
                    "jobId": job_id,
                    "words": [{"word": w, "examples": examples[w]} for w in chunk],
                },
                ensure_ascii=False,
                indent=2,
            ),
            encoding="utf-8",
        )
        total_jobs += 1
        (completed if output_path.exists() else pending).append(job_id)

    (gloss_dir / "_pending.txt").write_text(
        "\n".join(pending) + ("\n" if pending else ""), encoding="utf-8"
    )
    (gloss_dir / "_prompt_template.md").write_text(
        PROMPT_TEMPLATE.format(
            input_path="<JOB_INPUT_PATH>", output_path="<JOB_OUTPUT_PATH>"
        ),
        encoding="utf-8",
    )

    print(f"Skipped front matter: {skipped_front_matter} chapter(s)")
    print(f"Distinct words: {len(words)}")
    print(f"Total glossary jobs: {total_jobs}")
    print(f" Completed: {len(completed)}")
    print(f" Pending: {len(pending)}")
    print(f"Manifest at: {gloss_dir / '_pending.txt'}")
    print(f"Prompt template at: {gloss_dir / '_prompt_template.md'}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -8,6 +8,7 @@ Usage:
|
|||||||
Inputs:
|
Inputs:
|
||||||
BUILD_DIR/<slug>/chapters.json
|
BUILD_DIR/<slug>/chapters.json
|
||||||
BUILD_DIR/<slug>/jobs/*.output.json (from translation subagents)
|
BUILD_DIR/<slug>/jobs/*.output.json (from translation subagents)
|
||||||
|
BUILD_DIR/<slug>/glossary/*.output.json (from glossary subagents, Phase 2b)
|
||||||
|
|
||||||
Output:
|
Output:
|
||||||
DEST_DIR/book_<slug>.json
|
DEST_DIR/book_<slug>.json
|
||||||
@@ -21,11 +22,16 @@ Output:
|
|||||||
"paragraphsES": ["...", ...],
|
"paragraphsES": ["...", ...],
|
||||||
"paragraphsEN": ["...", ...]},
|
"paragraphsEN": ["...", ...]},
|
||||||
...
|
...
|
||||||
]
|
],
|
||||||
|
"glossary": {
|
||||||
|
"taza": {"baseForm": "taza", "english": "cup", "partOfSpeech": "noun"},
|
||||||
|
...
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
If --require-all is passed, the script fails if any job is missing its output.
|
If --require-all is passed, the script fails if any translation OR glossary job
|
||||||
Otherwise it fills missing translations with empty strings and warns.
|
is missing its output. Otherwise it fills missing translations with empty
|
||||||
|
strings, leaves missing glossary entries out, and warns.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
@@ -86,6 +92,35 @@ def main() -> None:
|
|||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
print(f"WARN: {msg} — using empty strings for those paragraphs.", file=sys.stderr)
|
print(f"WARN: {msg} — using empty strings for those paragraphs.", file=sys.stderr)
|
||||||
|
|
||||||
|
# Glossary (Phase 2b) — merge every glossary job's entries into one map
|
||||||
|
# keyed by the cleaned word the app looks up.
|
||||||
|
glossary_dir = base / "glossary"
|
||||||
|
glossary: dict[str, dict] = {}
|
||||||
|
glossary_missing: list[str] = []
|
||||||
|
if glossary_dir.exists():
|
||||||
|
for input_path in sorted(glossary_dir.glob("*.input.json")):
|
||||||
|
job_id = input_path.stem.removesuffix(".input")
|
||||||
|
output_path = glossary_dir / f"{job_id}.output.json"
|
||||||
|
if not output_path.exists():
|
||||||
|
glossary_missing.append(job_id)
|
||||||
|
continue
|
||||||
|
output_data = json.loads(output_path.read_text(encoding="utf-8"))
|
||||||
|
for entry in output_data.get("entries", []):
|
||||||
|
word = (entry.get("word") or "").strip()
|
||||||
|
if not word:
|
||||||
|
continue
|
||||||
|
glossary[word] = {
|
||||||
|
"baseForm": entry.get("baseForm") or word,
|
||||||
|
"english": entry.get("english") or "",
|
||||||
|
"partOfSpeech": entry.get("partOfSpeech") or "",
|
||||||
|
}
|
||||||
|
if glossary_missing:
|
||||||
|
msg = f"{len(glossary_missing)} glossary job(s) missing output: {glossary_missing[:5]}{'...' if len(glossary_missing) > 5 else ''}"
|
||||||
|
if args.require_all:
|
||||||
|
print(f"ERROR: {msg}", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
print(f"WARN: {msg} — glossary will be incomplete.", file=sys.stderr)
|
||||||
|
|
||||||
bundled_chapters: list[dict] = []
|
bundled_chapters: list[dict] = []
|
||||||
for ch in chapters["chapters"]:
|
for ch in chapters["chapters"]:
|
||||||
translations = sorted(chapter_translations.get(ch["number"], []))
|
translations = sorted(chapter_translations.get(ch["number"], []))
|
||||||
@@ -113,6 +148,7 @@ def main() -> None:
|
|||||||
"author": chapters["author"],
|
"author": chapters["author"],
|
||||||
"language": chapters["language"],
|
"language": chapters["language"],
|
||||||
"chapters": bundled_chapters,
|
"chapters": bundled_chapters,
|
||||||
|
"glossary": glossary,
|
||||||
}
|
}
|
||||||
|
|
||||||
dest_dir = (args.dest or DEFAULT_DEST).resolve()
|
dest_dir = (args.dest or DEFAULT_DEST).resolve()
|
||||||
@@ -122,6 +158,7 @@ def main() -> None:
|
|||||||
print(f"Wrote {out_path}")
|
print(f"Wrote {out_path}")
|
||||||
print(f" Chapters: {len(bundled_chapters)}")
|
print(f" Chapters: {len(bundled_chapters)}")
|
||||||
print(f" Translated jobs: {sum(len(v) for v in chapter_translations.values())} / {sum(len(v) for v in chapter_translations.values()) + len(missing)}")
|
print(f" Translated jobs: {sum(len(v) for v in chapter_translations.values())} / {sum(len(v) for v in chapter_translations.values()) + len(missing)}")
|
||||||
|
print(f" Glossary words: {len(glossary)}")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@@ -23,11 +23,13 @@ fi
|
|||||||
EPUB="$1"; shift
|
EPUB="$1"; shift
|
||||||
SLUG=""
|
SLUG=""
|
||||||
BATCH_SIZE="30"
|
BATCH_SIZE="30"
|
||||||
|
GLOSSARY_BATCH_SIZE="150"
|
||||||
|
|
||||||
while [[ $# -gt 0 ]]; do
|
while [[ $# -gt 0 ]]; do
|
||||||
case "$1" in
|
case "$1" in
|
||||||
--slug) SLUG="$2"; shift 2 ;;
|
--slug) SLUG="$2"; shift 2 ;;
|
||||||
--batch-size) BATCH_SIZE="$2"; shift 2 ;;
|
--batch-size) BATCH_SIZE="$2"; shift 2 ;;
|
||||||
|
--glossary-batch-size) GLOSSARY_BATCH_SIZE="$2"; shift 2 ;;
|
||||||
*) echo "unknown option: $1" >&2; exit 2 ;;
|
*) echo "unknown option: $1" >&2; exit 2 ;;
|
||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
@@ -53,12 +55,22 @@ python3 translate_chapters.py "$SLUG" --batch-size "$BATCH_SIZE"
|
|||||||
PENDING_FILE="build/$SLUG/jobs/_pending.txt"
|
PENDING_FILE="build/$SLUG/jobs/_pending.txt"
|
||||||
PENDING_COUNT=$(wc -l < "$PENDING_FILE" | tr -d ' ')
|
PENDING_COUNT=$(wc -l < "$PENDING_FILE" | tr -d ' ')
|
||||||
|
|
||||||
|
echo
|
||||||
|
echo "=== Phase 2b: build_glossary.py ==="
|
||||||
|
python3 build_glossary.py "$SLUG" --batch-size "$GLOSSARY_BATCH_SIZE"
|
||||||
|
|
||||||
|
GLOSS_PENDING_FILE="build/$SLUG/glossary/_pending.txt"
|
||||||
|
GLOSS_PENDING_COUNT=$(wc -l < "$GLOSS_PENDING_FILE" | tr -d ' ')
|
||||||
|
TOTAL_PENDING=$((PENDING_COUNT + GLOSS_PENDING_COUNT))
|
||||||
|
|
||||||
echo
|
echo
|
||||||
echo "=== Phase 3: bundle_book.py ==="
|
echo "=== Phase 3: bundle_book.py ==="
|
||||||
if [[ "$PENDING_COUNT" -gt 0 ]]; then
|
if [[ "$TOTAL_PENDING" -gt 0 ]]; then
|
||||||
echo " $PENDING_COUNT translation job(s) still pending."
|
echo " $PENDING_COUNT translation job(s) and $GLOSS_PENDING_COUNT glossary job(s) still pending."
|
||||||
echo " Run the Claude Code subagent translation step (see README.md), then re-run this script."
|
echo " Run the Claude Code subagent step (see README.md) for BOTH manifests:"
|
||||||
echo " Bundling with empty placeholders so you can preview app structure now."
|
echo " build/$SLUG/jobs/_pending.txt (translation)"
|
||||||
|
echo " build/$SLUG/glossary/_pending.txt (glossary)"
|
||||||
|
echo " then re-run this script. Bundling with placeholders so you can preview now."
|
||||||
python3 bundle_book.py "$SLUG"
|
python3 bundle_book.py "$SLUG"
|
||||||
else
|
else
|
||||||
python3 bundle_book.py "$SLUG" --require-all
|
python3 bundle_book.py "$SLUG" --require-all
|
||||||
|
|||||||
@@ -12,6 +12,10 @@ public final class Book {
|
|||||||
public var language: String = ""
|
public var language: String = ""
|
||||||
public var chapterCount: Int = 0
|
public var chapterCount: Int = 0
|
||||||
public var accentColorHex: String = ""
|
public var accentColorHex: String = ""
|
||||||
|
/// JSON-encoded `[String: WordGloss]` — the book reader's primary word
|
||||||
|
/// lookup, keyed by the cleaned (lowercased, punctuation-trimmed) word.
|
||||||
|
/// Pre-computed at import time so taps resolve instantly and in context.
|
||||||
|
public var glossaryJSON: Data = Data()
|
||||||
|
|
||||||
public init(
|
public init(
|
||||||
slug: String,
|
slug: String,
|
||||||
@@ -19,7 +23,8 @@ public final class Book {
|
|||||||
author: String,
|
author: String,
|
||||||
language: String,
|
language: String,
|
||||||
chapterCount: Int,
|
chapterCount: Int,
|
||||||
accentColorHex: String
|
accentColorHex: String,
|
||||||
|
glossaryJSON: Data = Data()
|
||||||
) {
|
) {
|
||||||
self.id = slug
|
self.id = slug
|
||||||
self.slug = slug
|
self.slug = slug
|
||||||
@@ -28,5 +33,26 @@ public final class Book {
|
|||||||
self.language = language
|
self.language = language
|
||||||
self.chapterCount = chapterCount
|
self.chapterCount = chapterCount
|
||||||
self.accentColorHex = accentColorHex
|
self.accentColorHex = accentColorHex
|
||||||
|
self.glossaryJSON = glossaryJSON
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decodes `glossaryJSON` into the per-book glossary map.
///
/// Nothing is cached here: every call runs the JSON decoder again, so
/// callers should decode once and keep the result.
///
/// - Returns: The decoded `[String: WordGloss]`, or an empty dictionary when
///   `glossaryJSON` cannot be decoded as that type.
public func glossary() -> [String: WordGloss] {
    guard let decoded = try? JSONDecoder().decode([String: WordGloss].self, from: glossaryJSON) else {
        return [:]
    }
    return decoded
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// One glossary entry: a word's dictionary base form, English meaning, and
|
||||||
|
/// part of speech, translated in the book's context at import time.
|
||||||
|
public struct WordGloss: Codable, Hashable, Sendable {
|
||||||
|
public let baseForm: String
|
||||||
|
public let english: String
|
||||||
|
public let partOfSpeech: String
|
||||||
|
|
||||||
|
public init(baseForm: String, english: String, partOfSpeech: String) {
|
||||||
|
self.baseForm = baseForm
|
||||||
|
self.english = english
|
||||||
|
self.partOfSpeech = partOfSpeech
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -32,12 +32,23 @@ public struct WordAnnotation: Codable, Identifiable, Hashable {
|
|||||||
public let baseForm: String
|
public let baseForm: String
|
||||||
public let english: String
|
public let english: String
|
||||||
public let partOfSpeech: String
|
public let partOfSpeech: String
|
||||||
|
/// Human-readable name of the resource that produced this definition
|
||||||
|
/// (e.g. "Book glossary", "Dictionary", "AI guess"). Defaulted so older
|
||||||
|
/// persisted annotations without the field still decode.
|
||||||
|
public var source: String = ""
|
||||||
|
|
||||||
public init(word: String, baseForm: String, english: String, partOfSpeech: String) {
|
public init(
|
||||||
|
word: String,
|
||||||
|
baseForm: String,
|
||||||
|
english: String,
|
||||||
|
partOfSpeech: String,
|
||||||
|
source: String = ""
|
||||||
|
) {
|
||||||
self.word = word
|
self.word = word
|
||||||
self.baseForm = baseForm
|
self.baseForm = baseForm
|
||||||
self.english = english
|
self.english = english
|
||||||
self.partOfSpeech = partOfSpeech
|
self.partOfSpeech = partOfSpeech
|
||||||
|
self.source = source
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user