Spanish/Conjuga/Scripts/books/translate_chapters.py

#!/usr/bin/env python3
"""Split chapters.json into translation jobs that Claude Code subagents can
process in parallel. Resumable: jobs whose output file already exists are
omitted from _pending.txt (their input files are still rewritten on each run).

Usage:
    python3 translate_chapters.py <slug> [--batch-size N] [--build BUILD_DIR]

Inputs:
    BUILD_DIR/<slug>/chapters.json  (from extract_epub.py)

Outputs:
    BUILD_DIR/<slug>/jobs/<jobid>.input.json    (one per batch — read by subagents)
    BUILD_DIR/<slug>/jobs/_pending.txt           (list of job IDs still missing output)
    BUILD_DIR/<slug>/jobs/_prompt_template.md    (prompt the orchestrator hands each subagent)

Job layout (.input.json):
    {
      "jobId": "ch06_b00",
      "chapter": 6,
      "chapterTitle": "1. El Castillo",
      "rangeStart": 0,
      "rangeEnd": 30,
      "paragraphsES": ["...", "..."]
    }

Subagents must write `<jobid>.output.json` with shape:
    {"jobId": "ch06_b00", "paragraphsEN": ["...", "..."]}

The output array MUST have the same length as paragraphsES, in the same order.
"""

from __future__ import annotations

import argparse
import json
from pathlib import Path


# Prompt text handed to each translation subagent. It is rendered with
# str.format(), so `{input_path}` and `{output_path}` are substitution fields
# and the literal braces of the JSON example are doubled (`{{` / `}}`).
# main() renders it once with the placeholder tokens <JOB_INPUT_PATH> and
# <JOB_OUTPUT_PATH> and writes the result to jobs/_prompt_template.md.
PROMPT_TEMPLATE = """\
You are translating a chunk of a Spanish-language book into English for a
language-learning app.

Input file: {input_path}
Output file: {output_path}

Read the input file. It contains a JSON object with a `paragraphsES` array.
Translate each paragraph into natural English. Preserve meaning, tone, and
dialogue markers (—, –, ¡, ¿) as appropriate for the English output. Keep
the same number of paragraphs in the same order.

Notes for translation quality:
- This is a beginner Spanish reader, so prefer plain natural English over
  literary flourish.
- Preserve proper nouns (character names, place names) verbatim.
- Convert Spanish dialogue dashes (–, —) to English-style quotation marks
  ONLY if it reads more naturally; otherwise keep them as em-dashes.
- Do NOT add explanatory parentheticals; the in-app dictionary handles
  per-word lookup.
- Some paragraphs are vocabulary entries shaped like `palabra = meaning`
  (e.g. `alto = tall`, `el dueño = owner`). Keep these verbatim — both the
  Spanish word and its English gloss already coexist on the line, and the
  bilingual reader UI shows the same line in both views.

Write the output as JSON with shape:
    {{"jobId": "<the jobId from the input>", "paragraphsEN": [...]}}

The `paragraphsEN` array MUST be the same length and order as `paragraphsES`
in the input. Write nothing else to disk and produce no other output.
"""


def _positive_int(value: str) -> int:
    """Parse *value* as an integer >= 1, for use as an argparse ``type=``.

    Raises:
        argparse.ArgumentTypeError: if *value* is not an integer or is < 1.
    """
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(f"must be >= 1 (got {number})")
    return number


def main() -> None:
    """Write the translation job inputs, pending manifest and prompt template.

    Reads ``<build>/<slug>/chapters.json``, cuts every non-empty chapter into
    batches of ``--batch-size`` paragraphs and writes each batch to
    ``jobs/<jobid>.input.json``. Job IDs whose ``<jobid>.output.json`` is
    missing are listed in ``jobs/_pending.txt``; the rendered prompt goes to
    ``jobs/_prompt_template.md``. A summary is printed to stdout.

    Exits through ``parser.error`` (status 2) when ``--batch-size`` is not a
    positive integer or when ``chapters.json`` does not exist.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("slug")
    # A batch size of 0 would make range() raise and a negative one would
    # silently yield no jobs at all, so both are rejected at parse time.
    parser.add_argument("--batch-size", type=_positive_int, default=30)
    parser.add_argument("--build", type=Path, default=Path("build"))
    args = parser.parse_args()

    base = args.build / args.slug
    chapters_path = base / "chapters.json"
    jobs_dir = base / "jobs"

    # Read the input before creating anything, so a wrong slug or build dir
    # does not leave an empty jobs/ directory behind.
    try:
        data = json.loads(chapters_path.read_text(encoding="utf-8"))
    except FileNotFoundError:
        parser.error(f"chapters file not found: {chapters_path}")
    jobs_dir.mkdir(parents=True, exist_ok=True)

    pending: list[str] = []
    completed: list[str] = []

    for ch in data["chapters"]:
        paragraphs = ch["paragraphsES"]
        if not paragraphs:
            continue
        starts = range(0, len(paragraphs), args.batch_size)
        for batch_index, offset in enumerate(starts):
            chunk = paragraphs[offset : offset + args.batch_size]
            job_id = f"ch{ch['number']:02d}_b{batch_index:02d}"
            input_path = jobs_dir / f"{job_id}.input.json"
            output_path = jobs_dir / f"{job_id}.output.json"

            # The input file is (re)written on every run, even for jobs that
            # already have an output; only the manifest reflects completion.
            job = {
                "jobId": job_id,
                "chapter": ch["number"],
                "chapterTitle": ch["title"],
                "rangeStart": offset,
                "rangeEnd": offset + len(chunk),
                "paragraphsES": chunk,
            }
            input_path.write_text(
                json.dumps(job, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )
            if output_path.exists():
                completed.append(job_id)
            else:
                pending.append(job_id)

    # One job ID per line; the file is left empty when nothing is pending.
    (jobs_dir / "_pending.txt").write_text(
        "\n".join(pending) + ("\n" if pending else ""),
        encoding="utf-8",
    )

    (jobs_dir / "_prompt_template.md").write_text(
        PROMPT_TEMPLATE.format(
            input_path="<JOB_INPUT_PATH>",
            output_path="<JOB_OUTPUT_PATH>",
        ),
        encoding="utf-8",
    )

    print(f"Total translation jobs: {len(completed) + len(pending)}")
    print(f"  Completed:            {len(completed)}")
    print(f"  Pending:              {len(pending)}")
    print(f"Manifest at:            {jobs_dir / '_pending.txt'}")
    print(f"Prompt template at:     {jobs_dir / '_prompt_template.md'}")

# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()