#!/usr/bin/env python3
"""Split chapters.json into translation jobs for parallel Claude Code subagents.

Resumable: a job whose output file already exists is counted as completed and
left out of the pending manifest. Input files are rewritten on every run.

Usage:
    python3 translate_chapters.py SLUG [--batch-size N] [--build BUILD_DIR]

Inputs:
    BUILD_DIR/SLUG/chapters.json          (from extract_epub.py)

Outputs:
    BUILD_DIR/SLUG/jobs/JOB_ID.input.json    (one per batch, read by subagents)
    BUILD_DIR/SLUG/jobs/_pending.txt         (job IDs still missing output)
    BUILD_DIR/SLUG/jobs/_prompt_template.md  (prompt handed to each subagent)

Job layout (JOB_ID.input.json):
    {
      "jobId": "ch06_b00",
      "chapter": 6,
      "chapterTitle": "1. El Castillo",
      "rangeStart": 0,
      "rangeEnd": 30,
      "paragraphsES": ["...", "..."]
    }

Subagents must write `JOB_ID.output.json` with shape:
    {"jobId": "ch06_b00", "paragraphsEN": ["...", "..."]}

The output array MUST have the same length as paragraphsES, in the same order.
"""
from __future__ import annotations

import argparse
import json
from pathlib import Path

# NOTE(review): main() formats this template with empty strings for both
# {input_path} and {output_path}, and the example output shape carries an empty
# "jobId". That looks like placeholder markers were lost at some point; confirm
# with whatever consumes _prompt_template.md before changing the text.
PROMPT_TEMPLATE = """\
You are translating a chunk of a Spanish-language book into English for a language-learning app.

Input file: {input_path}
Output file: {output_path}

Read the input file. It contains a JSON object with a `paragraphsES` array. Translate each paragraph into natural English. Preserve meaning, tone, and dialogue markers (—, –, ¡, ¿) as appropriate for the English output. Keep the same number of paragraphs in the same order.

Notes for translation quality:
- This is a beginner Spanish reader, so prefer plain natural English over literary flourish.
- Preserve proper nouns (character names, place names) verbatim.
- Convert Spanish dialogue dashes (–, —) to English-style quotation marks ONLY if it reads more naturally; otherwise keep them as em-dashes.
- Do NOT add explanatory parentheticals; the in-app dictionary handles per-word lookup.
- Some paragraphs are vocabulary entries shaped like `palabra = meaning` (e.g. `alto = tall`, `el dueño = owner`). Keep these verbatim — both the Spanish word and its English gloss already coexist on the line, and the bilingual reader UI shows the same line in both views.

Write the output as JSON with shape:
  {{"jobId": "", "paragraphsEN": [...]}}

The `paragraphsEN` array MUST be the same length and order as `paragraphsES` in the input. Write nothing else to disk and produce no other output.
"""


def _positive_int(value: str) -> int:
    """Parse *value* as an integer >= 1 for use as an argparse ``type=``.

    Raises:
        argparse.ArgumentTypeError: if *value* is not an integer or is < 1.
    """
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number < 1:
        # range(0, n, 0) raises and a negative step yields no batches at all,
        # so anything below 1 is rejected up front.
        raise argparse.ArgumentTypeError(
            f"must be a positive integer, got {number}"
        )
    return number


def _write_job_inputs(
    chapters: list[dict], jobs_dir: Path, batch_size: int
) -> tuple[list[str], int]:
    """Write one ``<job_id>.input.json`` per batch of paragraphs.

    Args:
        chapters: chapter objects, each with ``number``, ``title`` and
            ``paragraphsES`` keys.
        jobs_dir: existing directory that receives the input files.
        batch_size: maximum number of paragraphs per job (>= 1).

    Returns:
        ``(pending, total)``: the IDs of jobs with no ``.output.json`` next to
        them, in creation order, and the number of jobs written.
    """
    pending: list[str] = []
    total = 0
    for chapter in chapters:
        paragraphs = chapter["paragraphsES"]
        # Chapters without paragraphs produce no jobs (the range is empty).
        for batch_index, start in enumerate(range(0, len(paragraphs), batch_size)):
            chunk = paragraphs[start : start + batch_size]
            job_id = f"ch{chapter['number']:02d}_b{batch_index:02d}"
            payload = {
                "jobId": job_id,
                "chapter": chapter["number"],
                "chapterTitle": chapter["title"],
                "rangeStart": start,
                "rangeEnd": start + len(chunk),
                "paragraphsES": chunk,
            }
            (jobs_dir / f"{job_id}.input.json").write_text(
                json.dumps(payload, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )
            total += 1
            # An existing output file marks the job as done (resumability).
            if not (jobs_dir / f"{job_id}.output.json").exists():
                pending.append(job_id)
    return pending, total


def main(argv: list[str] | None = None) -> None:
    """Create translation job files, the pending manifest and the prompt template.

    Args:
        argv: command-line arguments without the program name. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``.

    Raises:
        SystemExit: on invalid arguments or when ``chapters.json`` is missing
            (argparse's standard error exit, status 2).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("slug")
    parser.add_argument("--batch-size", type=_positive_int, default=30)
    parser.add_argument("--build", type=Path, default=Path("build"))
    args = parser.parse_args(argv)

    base = args.build / args.slug
    chapters_path = base / "chapters.json"
    jobs_dir = base / "jobs"

    # Read the input before creating any directory, so a mistyped slug does
    # not leave an empty BUILD_DIR/SLUG/jobs tree behind.
    try:
        data = json.loads(chapters_path.read_text(encoding="utf-8"))
    except FileNotFoundError:
        parser.error(f"chapters file not found: {chapters_path}")

    jobs_dir.mkdir(parents=True, exist_ok=True)
    pending, total_jobs = _write_job_inputs(
        data["chapters"], jobs_dir, args.batch_size
    )

    # One job ID per line; an empty manifest is a zero-byte file.
    (jobs_dir / "_pending.txt").write_text(
        "".join(f"{job_id}\n" for job_id in pending),
        encoding="utf-8",
    )
    (jobs_dir / "_prompt_template.md").write_text(
        PROMPT_TEMPLATE.format(
            input_path="",
            output_path="",
        ),
        encoding="utf-8",
    )

    print(f"Total translation jobs: {total_jobs}")
    print(f" Completed: {total_jobs - len(pending)}")
    print(f" Pending: {len(pending)}")
    print(f"Manifest at: {jobs_dir / '_pending.txt'}")
    print(f"Prompt template at: {jobs_dir / '_prompt_template.md'}")


if __name__ == "__main__":
    main()