Spanish/Conjuga/Scripts/books/build_glossary.py

#!/usr/bin/env python3
"""Phase 2b — build a per-book glossary job manifest.

Scans chapters.json, tokenizes every Spanish paragraph the SAME way the iOS app
does (whitespace split, lowercase, strip leading/trailing punctuation), collects
the distinct words with a few example sentences each, and writes batched
glossary jobs that Claude Code subagents can translate in parallel. Resumable:
jobs whose output file already exists are counted as completed and left out of
_pending.txt (their input files are still rewritten on every run).

Usage:
    python3 build_glossary.py <slug> [--batch-size N] [--max-examples N]
                                     [--build BUILD_DIR]

Inputs:
    BUILD_DIR/<slug>/chapters.json            (from extract_epub.py)

Outputs:
    BUILD_DIR/<slug>/glossary/<jobid>.input.json   (one per batch — read by subagents)
    BUILD_DIR/<slug>/glossary/_pending.txt          (job IDs still missing output)
    BUILD_DIR/<slug>/glossary/_prompt_template.md   (prompt for each subagent)

Job input shape (.input.json):
    {"jobId": "gloss_b00",
     "words": [{"word": "taza", "examples": ["...", "..."]}, ...]}

Subagents must write <jobid>.output.json with shape:
    {"jobId": "gloss_b00",
     "entries": [{"word": "taza", "baseForm": "taza",
                  "english": "cup", "partOfSpeech": "noun"}, ...]}

`entries` must contain exactly one object per input word.
"""

from __future__ import annotations

import argparse
import json
import re
import unicodedata
from pathlib import Path


# Prompt text written to _prompt_template.md for the glossary subagents.
# It is filled in with str.format, so {input_path} and {output_path} are
# substitution fields and the literal JSON braces in the example output shape
# are doubled ({{ }}) to survive formatting.
PROMPT_TEMPLATE = """\
You are building a Spanish->English glossary for a language-learning app.

Input file: {input_path}
Output file: {output_path}

Read the input file. It contains a JSON object with a `words` array; each item
has a `word` (a lowercase Spanish word exactly as it appears in a book) and
`examples` (sentences from the book that use that word).

For EACH word, produce one entry:
- baseForm: the dictionary base form -- infinitive for verbs, masculine
  singular for nouns/adjectives, the word itself for invariant words.
- english: a concise English translation (1-4 words). Use the sense the word
  carries in the example sentences. Many Spanish words are both a verb form
  AND a function word -- e.g. "como" is "I eat" (verb) and "as/like"
  (conjunction). Choose the meaning shown in the examples, not the most common
  dictionary sense.
- partOfSpeech: one of verb, noun, adjective, adverb, pronoun, preposition,
  conjunction, article, interjection, numeral, proper noun, other.

Write the output file as JSON with this exact shape:
    {{"jobId": "<the jobId from the input>", "entries": [
        {{"word": "...", "baseForm": "...", "english": "...", "partOfSpeech": "..."}}
    ]}}

`entries` MUST contain exactly one object per input word, cover every word, and
echo each `word` back verbatim. Write nothing else to disk and produce no other
output.
"""

# Sentence boundary: a run of whitespace that directly follows '.', '!', '?'
# or '…'. The lookbehind is zero-width, so the terminating punctuation stays
# attached to the sentence it ends and only the whitespace is consumed.
SENTENCE_SPLIT = re.compile(r"(?<=[.!?…])\s+")


def is_punct(ch: str) -> bool:
    """Report whether a single character is Unicode punctuation.

    A character counts as punctuation when its Unicode general category is one
    of the P* classes (Pc, Pd, Ps, Pe, Pi, Pf, Po), which is intended to line
    up with Swift's .punctuationCharacters set.
    """
    # General categories are two-letter codes; the first letter is the major class.
    major_class = unicodedata.category(ch)[0]
    return major_class == "P"


def clean_word(token: str) -> str:
    """Normalise one whitespace-delimited token into a glossary key.

    Intended to mirror BookReaderView.cleanWord: the token is lowercased, any
    run of punctuation at either end is dropped, and surrounding whitespace is
    trimmed. Punctuation inside the word and accents are kept as-is (no
    folding). A token made up solely of punctuation yields "".
    """
    lowered = token.lower()

    # Index of the first character that is not punctuation, if there is one.
    first_kept = next(
        (idx for idx, ch in enumerate(lowered) if not is_punct(ch)), None
    )
    if first_kept is None:
        return ""

    # A non-punctuation character exists, so the backwards scan always finds one.
    last_kept = next(
        idx for idx in range(len(lowered) - 1, -1, -1) if not is_punct(lowered[idx])
    )
    return lowered[first_kept : last_kept + 1].strip()


def has_letter(s: str) -> bool:
    """Return True when at least one character of *s* is alphabetic."""
    for ch in s:
        if ch.isalpha():
            return True
    return False


def split_sentences(paragraph: str) -> list[str]:
    """Break a paragraph into trimmed, non-empty sentences.

    The paragraph is split at SENTENCE_SPLIT boundaries; pieces that are empty
    after trimming are discarded.
    """
    trimmed = (piece.strip() for piece in SENTENCE_SPLIT.split(paragraph.strip()))
    return [piece for piece in trimmed if piece]


def is_english_front_matter(chapter: dict, threshold: float = 0.5) -> bool:
    """Decide whether a chapter is untranslated English front matter.

    A chapter qualifies when the share of paragraphs whose Spanish and English
    text are identical (after trimming) is strictly greater than *threshold*.
    Story chapters still carry some identical lines (verbatim `word = meaning`
    vocab entries), which is why a majority threshold is used: front matter
    runs ~70-100% identical, stories ~25-35%.

    The check needs a populated paragraphsEN of the same length as
    paragraphsES; otherwise the answer is False. Raw extracted chapters have
    no paragraphsEN, so nothing is skipped on a fresh book's first pass.
    """
    spanish = [text.strip() for text in chapter.get("paragraphsES", [])]
    english = [text.strip() for text in chapter.get("paragraphsEN", [])]

    # Without a complete, aligned translation there is nothing to compare.
    if not (spanish and english):
        return False
    if len(spanish) != len(english):
        return False

    untranslated = sum(src == dst for src, dst in zip(spanish, english))
    return untranslated / len(spanish) > threshold


def main() -> None:
    """Build the glossary job manifest for one book.

    Command line: ``<slug> [--batch-size N] [--max-examples N] [--build DIR]``.

    Reads ``<build>/<slug>/chapters.json``, skips chapters detected as English
    front matter, and collects every distinct cleaned word from the remaining
    Spanish paragraphs together with up to ``--max-examples`` distinct
    sentences that contain it. Words are kept in first-seen order and cut into
    batches of ``--batch-size``; each batch is written to
    ``glossary/<jobid>.input.json``. Jobs that have no ``<jobid>.output.json``
    yet are listed in ``glossary/_pending.txt``, and the subagent prompt is
    written to ``glossary/_prompt_template.md``. A summary is printed to
    stdout.

    Exits through ``parser.error`` (usage message, status 2) when
    ``--batch-size`` is not a positive integer.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("slug")
    parser.add_argument("--batch-size", type=int, default=150)
    parser.add_argument("--max-examples", type=int, default=3)
    parser.add_argument("--build", type=Path, default=Path("build"))
    args = parser.parse_args()
    # A zero step would make range() raise and a negative one would silently
    # produce no jobs at all, so reject both up front with a clear message.
    if args.batch_size < 1:
        parser.error("--batch-size must be a positive integer")

    base = args.build / args.slug
    chapters = json.loads((base / "chapters.json").read_text(encoding="utf-8"))
    gloss_dir = base / "glossary"
    gloss_dir.mkdir(parents=True, exist_ok=True)

    # word -> example sentences. Dicts keep insertion order, so the keys are
    # already in first-seen order and no separate ordering index is needed.
    examples: dict[str, list[str]] = {}

    skipped_front_matter = 0
    for ch in chapters["chapters"]:
        if is_english_front_matter(ch):
            skipped_front_matter += 1
            continue
        for paragraph in ch.get("paragraphsES", []):
            for sentence in split_sentences(paragraph):
                # Dedupe the sentence's words while keeping token order.
                # A set would iterate in an order that depends on per-process
                # string hash randomisation, so batch membership could change
                # between runs and stale output files would no longer match
                # their regenerated inputs.
                cleaned = dict.fromkeys(clean_word(tok) for tok in sentence.split())
                for w in cleaned:
                    if not w or not has_letter(w):
                        continue
                    bucket = examples.setdefault(w, [])
                    if len(bucket) < args.max_examples and sentence not in bucket:
                        bucket.append(sentence)

    words = list(examples)

    pending: list[str] = []
    completed: list[str] = []

    for offset in range(0, len(words), args.batch_size):
        chunk = words[offset : offset + args.batch_size]
        job_id = f"gloss_b{offset // args.batch_size:02d}"
        input_path = gloss_dir / f"{job_id}.input.json"
        output_path = gloss_dir / f"{job_id}.output.json"
        # The input file is (re)written on every run; only the presence of the
        # output file decides whether the job still counts as pending.
        input_path.write_text(
            json.dumps(
                {
                    "jobId": job_id,
                    "words": [{"word": w, "examples": examples[w]} for w in chunk],
                },
                ensure_ascii=False,
                indent=2,
            ),
            encoding="utf-8",
        )
        (completed if output_path.exists() else pending).append(job_id)

    total_jobs = len(completed) + len(pending)

    (gloss_dir / "_pending.txt").write_text(
        "\n".join(pending) + ("\n" if pending else ""), encoding="utf-8"
    )
    (gloss_dir / "_prompt_template.md").write_text(
        PROMPT_TEMPLATE.format(
            input_path="<JOB_INPUT_PATH>", output_path="<JOB_OUTPUT_PATH>"
        ),
        encoding="utf-8",
    )

    print(f"Skipped front matter:  {skipped_front_matter} chapter(s)")
    print(f"Distinct words:        {len(words)}")
    print(f"Total glossary jobs:   {total_jobs}")
    print(f"  Completed:           {len(completed)}")
    print(f"  Pending:             {len(pending)}")
    print(f"Manifest at:           {gloss_dir / '_pending.txt'}")
    print(f"Prompt template at:    {gloss_dir / '_prompt_template.md'}")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()