Spanish/Conjuga/Scripts/books/bundle_book.py

#!/usr/bin/env python3
"""Merge chapters.json + per-job translation outputs into the final bundled
book_<slug>.json that the iOS app reads from its bundle.

Usage:
    python3 bundle_book.py <slug> [--build BUILD_DIR] [--dest DEST_DIR] [--require-all]

Inputs:
    BUILD_DIR/<slug>/chapters.json
    BUILD_DIR/<slug>/jobs/*.output.json       (from translation subagents)
    BUILD_DIR/<slug>/glossary/*.output.json   (from glossary subagents, Phase 2b)

Output:
    DEST_DIR/book_<slug>.json
        {
          "slug": "...",
          "title": "...",
          "author": "...",
          "language": "...",
          "chapters": [
            {"id": "ch1", "number": 1, "title": "Preface",
             "paragraphsES": ["...", ...],
             "paragraphsEN": ["...", ...]},
            ...
          ],
          "glossary": {
            "taza": {"baseForm": "taza", "english": "cup", "partOfSpeech": "noun"},
            ...
          }
        }

If --require-all is passed, the script fails if any translation OR glossary job
is missing its output. Otherwise it fills missing translations with empty
strings, leaves missing glossary entries out, and warns.
"""

from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path


# Fallback output directory used when --dest is not supplied. It is a relative
# path, so it is resolved against the current working directory when used.
DEFAULT_DEST = Path("..") / ".." / "Conjuga"


def _read_json(path: Path):
    """Return the parsed contents of the UTF-8 JSON file at *path*."""
    return json.loads(path.read_text(encoding="utf-8"))


def _fit_length(items: list[str], expected: int) -> list[str]:
    """Return *items* truncated, or padded with "", to exactly *expected* entries."""
    return items[:expected] + [""] * max(0, expected - len(items))


def _report_missing(
    kind: str, missing: list[str], consequence: str, require_all: bool
) -> None:
    """Warn about jobs without output, or exit with status 1 if *require_all*.

    Does nothing when *missing* is empty. At most the first five job ids are
    listed to keep the message short.
    """
    if not missing:
        return
    msg = f"{len(missing)} {kind} job(s) missing output: {missing[:5]}{'...' if len(missing) > 5 else ''}"
    if require_all:
        print(f"ERROR: {msg}", file=sys.stderr)
        sys.exit(1)
    print(f"WARN: {msg} — {consequence}", file=sys.stderr)


def _collect_translations(
    jobs_dir: Path,
) -> tuple[dict[int, list[tuple[int, list[str]]]], int, list[str]]:
    """Read every translation job in *jobs_dir*.

    Returns a triple ``(chunks_by_chapter, translated_count, missing_job_ids)``.
    ``chunks_by_chapter`` maps the job's ``chapter`` value to a list of
    ``(rangeStart, paragraphsEN)`` chunks, one per job input file. Each chunk
    has exactly as many entries as the job's ``paragraphsES``.
    """
    chunks_by_chapter: dict[int, list[tuple[int, list[str]]]] = {}
    missing: list[str] = []
    translated = 0

    for input_path in sorted(jobs_dir.glob("*.input.json")):
        job_id = input_path.stem.removesuffix(".input")
        input_data = _read_json(input_path)
        expected = len(input_data["paragraphsES"])
        output_path = jobs_dir / f"{job_id}.output.json"

        if output_path.exists():
            # `or []` also covers an explicit JSON null for the key.
            paragraphs_en = _read_json(output_path).get("paragraphsEN") or []
            if len(paragraphs_en) != expected:
                print(
                    f"WARN: {job_id} length mismatch — got {len(paragraphs_en)}, "
                    f"expected {expected}. Padding/truncating.",
                    file=sys.stderr,
                )
                paragraphs_en = _fit_length(paragraphs_en, expected)
            translated += 1
        else:
            missing.append(job_id)
            # Keep a same-sized blank chunk so that chunks from later jobs in
            # this chapter stay lined up with their Spanish paragraphs instead
            # of sliding forward into the gap.
            paragraphs_en = [""] * expected

        chunks_by_chapter.setdefault(input_data["chapter"], []).append(
            (input_data["rangeStart"], paragraphs_en)
        )

    return chunks_by_chapter, translated, missing


def _collect_glossary(glossary_dir: Path) -> tuple[dict[str, dict], list[str]]:
    """Merge all glossary job outputs in *glossary_dir* into one mapping.

    Returns ``(glossary, missing_job_ids)``. Entries are keyed by their
    stripped ``word``; entries with an empty word are skipped and a later
    entry for the same word overwrites an earlier one. A non-existent
    directory yields an empty glossary and no missing jobs.
    """
    glossary: dict[str, dict] = {}
    missing: list[str] = []
    if not glossary_dir.exists():
        return glossary, missing

    for input_path in sorted(glossary_dir.glob("*.input.json")):
        job_id = input_path.stem.removesuffix(".input")
        output_path = glossary_dir / f"{job_id}.output.json"
        if not output_path.exists():
            missing.append(job_id)
            continue
        for entry in _read_json(output_path).get("entries") or []:
            word = (entry.get("word") or "").strip()
            if not word:
                continue
            glossary[word] = {
                "baseForm": entry.get("baseForm") or word,
                "english": entry.get("english") or "",
                "partOfSpeech": entry.get("partOfSpeech") or "",
            }
    return glossary, missing


def main() -> None:
    """Bundle chapters, translations and glossary into ``book_<slug>.json``.

    Command-line arguments:
        slug            Book identifier; selects ``BUILD_DIR/<slug>``.
        --build         Build directory (default ``build``).
        --dest          Output directory (default ``DEFAULT_DEST``).
        --require-all   Exit with status 1 if any translation or glossary
                        job has no output file.

    Without ``--require-all``, paragraphs belonging to a missing translation
    job are emitted as empty strings at their own positions, and missing
    glossary jobs are simply left out; both cases print a warning to stderr.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("slug")
    parser.add_argument("--build", type=Path, default=Path("build"))
    parser.add_argument("--dest", type=Path, default=None)
    parser.add_argument("--require-all", action="store_true")
    args = parser.parse_args()

    base = args.build / args.slug
    chapters = _read_json(base / "chapters.json")

    chapter_translations, translated_jobs, missing = _collect_translations(base / "jobs")
    _report_missing(
        "translation",
        missing,
        "using empty strings for those paragraphs.",
        args.require_all,
    )

    # Glossary (Phase 2b) — merge every glossary job's entries into one map
    # keyed by the cleaned word the app looks up.
    glossary, glossary_missing = _collect_glossary(base / "glossary")
    _report_missing(
        "glossary",
        glossary_missing,
        "glossary will be incomplete.",
        args.require_all,
    )

    bundled_chapters: list[dict] = []
    for ch in chapters["chapters"]:
        # Order chunks by their start offset only; comparing whole tuples
        # would fall back to comparing paragraph lists on equal offsets.
        chunks = sorted(
            chapter_translations.get(ch["number"], []),
            key=lambda chunk: chunk[0],
        )
        paragraphs_en: list[str] = []
        for _, en_chunk in chunks:
            paragraphs_en.extend(en_chunk)
        # Final safety net: force the EN list to the ES length in case the
        # jobs do not cover the chapter exactly.
        paragraphs_en = _fit_length(paragraphs_en, len(ch["paragraphsES"]))
        bundled_chapters.append(
            {
                "id": ch["id"],
                "number": ch["number"],
                "title": ch["title"],
                "paragraphsES": ch["paragraphsES"],
                "paragraphsEN": paragraphs_en,
            }
        )

    payload = {
        "slug": chapters["slug"],
        "title": chapters["title"],
        "author": chapters["author"],
        "language": chapters["language"],
        "chapters": bundled_chapters,
        "glossary": glossary,
    }

    dest_dir = (args.dest or DEFAULT_DEST).resolve()
    dest_dir.mkdir(parents=True, exist_ok=True)
    out_path = dest_dir / f"book_{args.slug}.json"
    out_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"Wrote {out_path}")
    print(f"  Chapters:        {len(bundled_chapters)}")
    print(f"  Translated jobs: {translated_jobs} / {translated_jobs + len(missing)}")
    print(f"  Glossary words:  {len(glossary)}")


# Run the bundler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()