#!/usr/bin/env python3
"""Phase 2b — build a per-book glossary job manifest.

Scans chapters.json, tokenizes every Spanish paragraph the SAME way the iOS
app does (whitespace split, lowercase, strip leading/trailing punctuation),
collects the distinct words with a few example sentences each, and writes
batched glossary jobs that Claude Code subagents can translate in parallel.

Resumable: jobs whose output file already exists are left out of the pending
manifest. Word order (and therefore batch membership) is deterministic for a
given chapters.json and batch size, so a job ID always refers to the same
words across runs.

Usage:
    python3 build_glossary.py SLUG [--batch-size N] [--max-examples N] [--build BUILD_DIR]

Inputs:
    BUILD_DIR/SLUG/chapters.json                 (from extract_epub.py)

Outputs:
    BUILD_DIR/SLUG/glossary/JOB_ID.input.json    (one per batch — read by subagents)
    BUILD_DIR/SLUG/glossary/_pending.txt         (job IDs still missing output)
    BUILD_DIR/SLUG/glossary/_prompt_template.md  (prompt for each subagent)

Job input shape (JOB_ID.input.json):
    {"jobId": "gloss_b00",
     "words": [{"word": "taza", "examples": ["...", "..."]}, ...]}

Subagents must write JOB_ID.output.json with shape:
    {"jobId": "gloss_b00",
     "entries": [{"word": "taza", "baseForm": "taza",
                  "english": "cup", "partOfSpeech": "noun"}, ...]}

`entries` must contain exactly one object per input word.
"""
from __future__ import annotations

import argparse
import json
import re
import unicodedata
from pathlib import Path

PROMPT_TEMPLATE = """\
You are building a Spanish->English glossary for a language-learning app.

Input file: {input_path}
Output file: {output_path}

Read the input file. It contains a JSON object with a `words` array; each item
has a `word` (a lowercase Spanish word exactly as it appears in a book) and
`examples` (sentences from the book that use that word).

For EACH word, produce one entry:
- baseForm: the dictionary base form -- infinitive for verbs, masculine
  singular for nouns/adjectives, the word itself for invariant words.
- english: a concise English translation (1-4 words). Use the sense the word
  carries in the example sentences. Many Spanish words are both a verb form
  AND a function word -- e.g. "como" is "I eat" (verb) and "as/like"
  (conjunction). Choose the meaning shown in the examples, not the most
  common dictionary sense.
- partOfSpeech: one of verb, noun, adjective, adverb, pronoun, preposition,
  conjunction, article, interjection, numeral, proper noun, other.

Write the output file as JSON with this exact shape:
{{"jobId": "", "entries": [
  {{"word": "...", "baseForm": "...", "english": "...", "partOfSpeech": "..."}}
]}}

`entries` MUST contain exactly one object per input word, cover every word,
and echo each `word` back verbatim. Write nothing else to disk and produce no
other output.
"""

# A sentence ends at ., !, ? or … followed by whitespace; the terminator stays
# attached to the sentence because it is matched by a lookbehind.
SENTENCE_SPLIT = re.compile(r"(?<=[.!?…])\s+")


def is_punct(ch: str) -> bool:
    """True for any Unicode punctuation — matches Swift's .punctuationCharacters."""
    return unicodedata.category(ch).startswith("P")


def clean_word(token: str) -> str:
    """Mirror BookReaderView.cleanWord.

    Lowercases the token, strips leading/trailing punctuation, then trims
    whitespace. Accents are preserved (no folding). Punctuation inside the
    word is left alone. Returns "" when the token is all punctuation.
    """
    t = token.lower()
    start, end = 0, len(t)
    while start < end and is_punct(t[start]):
        start += 1
    while end > start and is_punct(t[end - 1]):
        end -= 1
    return t[start:end].strip()


def has_letter(s: str) -> bool:
    """True when `s` contains at least one alphabetic character."""
    return any(c.isalpha() for c in s)


def split_sentences(paragraph: str) -> list[str]:
    """Split a paragraph into trimmed, non-empty sentences."""
    parts = SENTENCE_SPLIT.split(paragraph.strip())
    return [p.strip() for p in parts if p.strip()]


def is_english_front_matter(chapter: dict, threshold: float = 0.5) -> bool:
    """True when most of a chapter's paragraphs are untranslated.

    Such a chapter is English front matter (Preface, reading guide, …) rather
    than Spanish story content. Story chapters still have *some* identical
    lines (verbatim `word = meaning` vocab entries), so a majority threshold
    separates them: front matter runs ~70-100% identical, stories ~25-35%.

    Only detectable once paragraphsEN is populated; raw extracted chapters
    carry none, so nothing is skipped on a fresh book's first pass. A chapter
    whose ES and EN paragraph counts differ is never treated as front matter.
    """
    es = [p.strip() for p in chapter.get("paragraphsES", [])]
    en = [p.strip() for p in chapter.get("paragraphsEN", [])]
    if not en or len(en) != len(es) or not es:
        return False
    identical = sum(1 for a, b in zip(es, en) if a == b)
    return identical / len(es) > threshold


def collect_words(
    chapters: list[dict], max_examples: int = 3
) -> tuple[dict[str, list[str]], int]:
    """Gather distinct words and example sentences from story chapters.

    Args:
        chapters: chapter dicts; `paragraphsES` (and optionally
            `paragraphsEN`) are read from each.
        max_examples: maximum number of example sentences kept per word.

    Returns:
        `(examples, skipped_front_matter)`. `examples` maps each cleaned word
        to its example sentences, and its key order is the order in which
        words were first seen in the text. `skipped_front_matter` counts the
        chapters ignored as English front matter.
    """
    examples: dict[str, list[str]] = {}
    skipped_front_matter = 0
    for chapter in chapters:
        if is_english_front_matter(chapter):
            skipped_front_matter += 1
            continue
        for paragraph in chapter.get("paragraphsES", []):
            for sentence in split_sentences(paragraph):
                # De-duplicate with dict.fromkeys, NOT a set: set iteration
                # order for strings changes between processes (hash
                # randomization), which would reshuffle first-seen order and
                # move words between batches from one run to the next.
                tokens = dict.fromkeys(clean_word(tok) for tok in sentence.split())
                for word in tokens:
                    # Also rejects "" — an empty string has no letters.
                    if not has_letter(word):
                        continue
                    bucket = examples.setdefault(word, [])
                    if len(bucket) < max_examples and sentence not in bucket:
                        bucket.append(sentence)
    return examples, skipped_front_matter


def build_jobs(
    words: list[str], examples: dict[str, list[str]], batch_size: int
) -> list[dict]:
    """Chunk `words` into job payloads of at most `batch_size` words each.

    Job IDs are `gloss_bNN`, numbered from 00 in batch order.

    Raises:
        ValueError: if `batch_size` is less than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    return [
        {
            "jobId": f"gloss_b{index:02d}",
            "words": [
                {"word": w, "examples": examples[w]}
                for w in words[start : start + batch_size]
            ],
        }
        for index, start in enumerate(range(0, len(words), batch_size))
    ]


def main(argv: list[str] | None = None) -> None:
    """CLI entry point: write job inputs, the pending manifest and the prompt.

    Args:
        argv: argument list to parse; None means use the process arguments.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("slug")
    parser.add_argument("--batch-size", type=int, default=150)
    parser.add_argument("--max-examples", type=int, default=3)
    parser.add_argument("--build", type=Path, default=Path("build"))
    args = parser.parse_args(argv)
    if args.batch_size < 1:
        # 0 would crash range(); a negative step would silently yield no jobs.
        parser.error("--batch-size must be at least 1")

    base = args.build / args.slug
    chapters = json.loads((base / "chapters.json").read_text(encoding="utf-8"))
    gloss_dir = base / "glossary"
    gloss_dir.mkdir(parents=True, exist_ok=True)

    examples, skipped_front_matter = collect_words(
        chapters["chapters"], args.max_examples
    )
    words = list(examples)  # insertion order == first-seen order

    pending: list[str] = []
    completed: list[str] = []
    for job in build_jobs(words, examples, args.batch_size):
        job_id = job["jobId"]
        input_path = gloss_dir / f"{job_id}.input.json"
        output_path = gloss_dir / f"{job_id}.output.json"
        # Inputs are always rewritten; only the presence of an output file
        # decides whether the job still needs doing.
        input_path.write_text(
            json.dumps(job, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        (completed if output_path.exists() else pending).append(job_id)
    total_jobs = len(pending) + len(completed)

    (gloss_dir / "_pending.txt").write_text(
        "\n".join(pending) + ("\n" if pending else ""), encoding="utf-8"
    )
    # NOTE(review): the template is written with empty path slots; presumably
    # whatever dispatches the subagents fills them in — confirm.
    (gloss_dir / "_prompt_template.md").write_text(
        PROMPT_TEMPLATE.format(input_path="", output_path=""),
        encoding="utf-8",
    )

    print(f"Skipped front matter: {skipped_front_matter} chapter(s)")
    print(f"Distinct words: {len(words)}")
    print(f"Total glossary jobs: {total_jobs}")
    print(f" Completed: {len(completed)}")
    print(f" Pending: {len(pending)}")
    print(f"Manifest at: {gloss_dir / '_pending.txt'}")
    print(f"Prompt template at: {gloss_dir / '_prompt_template.md'}")


if __name__ == "__main__":
    main()