05a367fdbe
extract_epub.py was walking <p> only, but every "Vocabulario" section in the Olly Richards EPUB lives inside <ul><li>...</li></ul>. That meant the heading made it through but the entries didn't — 680 vocab lines across 24 sections in this book were missing from the bundled JSON. Audit (text-node owner by closest block ancestor) confirmed <li> is the only silent drop: 5,260 nodes in <p>, 1,960 in <li>, 0 anywhere else. No <h1>-<h6>, tables, or blockquotes in this EPUB at all. Fix: walk find_all(["p", "li"]) in document order so bullet entries slot in right after their "Vocabulario" / list heading. Re-extracted (2,646 → 3,326 paragraphs), re-translated all 118 jobs in parallel Claude Code subagents. translate_chapters.py prompt template now tells subagents to keep bilingual `palabra = meaning` lines verbatim — both sides already coexist on the line. Bumped bookDataVersion to 2 so refreshBooksDataIfNeeded re-seeds. Verified in simulator: all 13 chapter row sizes grew (e.g. ch6 18,295→20,951 chars). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
141 lines
4.8 KiB
Python
141 lines
4.8 KiB
Python
#!/usr/bin/env python3
"""Split chapters.json into translation jobs that Claude Code subagents can
process in parallel. Resumable: jobs whose output file already exists are
skipped (they are left out of _pending.txt).

Usage:
    python3 translate_chapters.py <slug> [--batch-size N] [--build BUILD_DIR]

Inputs:
    BUILD_DIR/<slug>/chapters.json              (from extract_epub.py)

Outputs:
    BUILD_DIR/<slug>/jobs/<jobid>.input.json    (one per batch — read by subagents)
    BUILD_DIR/<slug>/jobs/_pending.txt          (list of job IDs still missing output)
    BUILD_DIR/<slug>/jobs/_prompt_template.md   (prompt the orchestrator hands each subagent)

Job layout (.input.json):
    {
      "jobId": "ch06_b00",
      "chapter": 6,
      "chapterTitle": "1. El Castillo",
      "rangeStart": 0,
      "rangeEnd": 30,
      "paragraphsES": ["...", "..."]
    }

`rangeStart` / `rangeEnd` are half-open indices into the chapter's paragraph
list: the batch holds paragraphs rangeStart .. rangeEnd - 1.

Subagents must write `<jobid>.output.json` with shape:
    {"jobId": "ch06_b00", "paragraphsEN": ["...", "..."]}

The output array MUST have the same length as paragraphsES, in the same order.
"""
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import json
|
||
from pathlib import Path
|
||
|
||
|
||
# Prompt handed to each translation subagent. It is rendered with
# ``str.format``: ``{input_path}`` and ``{output_path}`` are the only
# placeholders, and the doubled braces around the JSON example are literal
# braces escaped for ``str.format``.
PROMPT_TEMPLATE = """\
You are translating a chunk of a Spanish-language book into English for a
language-learning app.

Input file: {input_path}
Output file: {output_path}

Read the input file. It contains a JSON object with a `paragraphsES` array.
Translate each paragraph into natural English. Preserve meaning, tone, and
dialogue markers (—, –, ¡, ¿) as appropriate for the English output. Keep
the same number of paragraphs in the same order.

Notes for translation quality:
- This is a beginner Spanish reader, so prefer plain natural English over
literary flourish.
- Preserve proper nouns (character names, place names) verbatim.
- Convert Spanish dialogue dashes (–, —) to English-style quotation marks
ONLY if it reads more naturally; otherwise keep them as em-dashes.
- Do NOT add explanatory parentheticals; the in-app dictionary handles
per-word lookup.
- Some paragraphs are vocabulary entries shaped like `palabra = meaning`
(e.g. `alto = tall`, `el dueño = owner`). Keep these verbatim — both the
Spanish word and its English gloss already coexist on the line, and the
bilingual reader UI shows the same line in both views.

Write the output as JSON with shape:
{{"jobId": "<the jobId from the input>", "paragraphsEN": [...]}}

The `paragraphsEN` array MUST be the same length and order as `paragraphsES`
in the input. Write nothing else to disk and produce no other output.
"""
|
||
|
||
def main(argv: list[str] | None = None) -> None:
    """Split ``chapters.json`` into per-batch translation job files.

    Reads ``BUILD_DIR/<slug>/chapters.json`` and, for every chapter, writes one
    ``<jobid>.input.json`` per batch of ``--batch-size`` paragraphs into
    ``BUILD_DIR/<slug>/jobs/``. It then writes ``_pending.txt`` (the IDs of
    jobs whose ``<jobid>.output.json`` does not exist yet) and
    ``_prompt_template.md``, and prints a short summary.

    Args:
        argv: Command-line arguments without the program name. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``.

    Raises:
        SystemExit: On unparsable arguments, a non-positive ``--batch-size``,
            or a missing ``chapters.json`` (argparse usage error, status 2).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("slug")
    parser.add_argument("--batch-size", type=int, default=30)
    parser.add_argument("--build", type=Path, default=Path("build"))
    args = parser.parse_args(argv)

    # A step of 0 makes range() raise ValueError and a negative step silently
    # produces no jobs at all, so reject both up front with a usage error.
    if args.batch_size < 1:
        parser.error("--batch-size must be a positive integer")

    base = args.build / args.slug
    chapters_path = base / "chapters.json"
    if not chapters_path.is_file():
        parser.error(f"chapters file not found: {chapters_path}")
    data = json.loads(chapters_path.read_text(encoding="utf-8"))

    # Create the jobs directory only once the input is known to be readable,
    # so a mistyped slug does not leave an empty directory tree behind.
    jobs_dir = base / "jobs"
    jobs_dir.mkdir(parents=True, exist_ok=True)

    pending: list[str] = []
    completed_count = 0

    for ch in data["chapters"]:
        paragraphs = ch["paragraphsES"]
        # An empty chapter yields an empty range and therefore no jobs.
        starts = range(0, len(paragraphs), args.batch_size)
        for batch_index, offset in enumerate(starts):
            chunk = paragraphs[offset : offset + args.batch_size]
            job_id = f"ch{ch['number']:02d}_b{batch_index:02d}"
            input_path = jobs_dir / f"{job_id}.input.json"
            output_path = jobs_dir / f"{job_id}.output.json"

            # The input file is (re)written on every run, including for jobs
            # that already have an output.
            job = {
                "jobId": job_id,
                "chapter": ch["number"],
                "chapterTitle": ch["title"],
                "rangeStart": offset,
                "rangeEnd": offset + len(chunk),
                "paragraphsES": chunk,
            }
            input_path.write_text(
                json.dumps(job, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )

            # Resumability: an existing output file marks the job as done.
            if output_path.exists():
                completed_count += 1
            else:
                pending.append(job_id)

    # One job ID per line; the file is empty when nothing is pending.
    (jobs_dir / "_pending.txt").write_text(
        "".join(f"{job_id}\n" for job_id in pending),
        encoding="utf-8",
    )

    # The paths are left as literal markers in the written template.
    (jobs_dir / "_prompt_template.md").write_text(
        PROMPT_TEMPLATE.format(
            input_path="<JOB_INPUT_PATH>",
            output_path="<JOB_OUTPUT_PATH>",
        ),
        encoding="utf-8",
    )

    print(f"Total translation jobs: {completed_count + len(pending)}")
    print(f" Completed: {completed_count}")
    print(f" Pending: {len(pending)}")
    print(f"Manifest at: {jobs_dir / '_pending.txt'}")
    print(f"Prompt template at: {jobs_dir / '_prompt_template.md'}")
|
||
|
||
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()