Files
Spanish/Conjuga/Scripts/textbook/build_book.py
Trey T 63dfc5e41a Add textbook reader, exercise grading, stem-change toggle, extraction pipeline
Major changes:
- Textbook UI: chapter list, reader, and interactive exercise view (keyboard
  + Apple Pencil) surfaced under the Course tab. 30 chapters, 251 exercises.
- Stem-change conjugation toggle on Week 4 flashcard decks (E-IE, E-I, O-UE).
  Uses existing VerbForm + IrregularSpan data to render highlighted present
  tense conjugations inline.
- Deterministic on-device answer grader with partial credit (correct / close
  for accent-stripped or single-char-typo / wrong). 11 unit tests cover it.
- SharedModels: TextbookChapter (local), TextbookExerciseAttempt (cloud-
  synced), AnswerGrader helpers. Bumped schema.
- DataLoader: textbook seeder (version 8) + refresh helpers that preserve
  LanGo course decks when textbook data is re-seeded.
- Local extraction pipeline in Conjuga/Scripts/textbook/ — XHTML chapter
  parser, answer-key parser, macOS Vision image OCR + PDF page OCR, merger,
  NSSpellChecker validator, language-aware auto-fixer, and repair pass that
  re-pairs quarantined vocab rows using bounding-box coordinates.
- UI test target (ConjugaUITests) with three tests: end-to-end textbook
  flow, all-chapters screenshot audit, and stem-change toggle verification.

Generated textbook content (textbook_data.json, textbook_vocab.json) and
third-party source files are gitignored — re-run Scripts/textbook/run_pipeline.sh
locally to regenerate.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 15:12:55 -05:00

375 lines
13 KiB
Python

#!/usr/bin/env python3
"""Merge chapters.json + answers.json + ocr.json → book.json (single source).
Also emits vocab_cards.json: flashcards derived from vocab_image blocks where
OCR text parses as a clean two-column (Spanish ↔ English) table.
"""
import json
import re
import sys
from pathlib import Path
# Input/output paths, all relative to this script's directory.
HERE = Path(__file__).resolve().parent
CHAPTERS_JSON = HERE / "chapters.json"
ANSWERS_JSON = HERE / "answers.json"
OCR_JSON = HERE / "ocr.json"
OUT_BOOK = HERE / "book.json"
OUT_VOCAB = HERE / "vocab_cards.json"
COURSE_NAME = "Complete Spanish Step-by-Step"
# Heuristic: parseable "Spanish | English" vocab rows.
# OCR usually produces "word — translation" or "word translation" separated
# by 2+ spaces. We detect rows that contain both Spanish and English words.
SPANISH_ACCENT_RE = re.compile(r"[áéíóúñüÁÉÍÓÚÑÜ¿¡]")
SPANISH_ARTICLES = {"el", "la", "los", "las", "un", "una", "unos", "unas"}
ENGLISH_STARTERS = {"the", "a", "an", "to", "my", "his", "her", "our", "their", "your", "some"}
# English-only words that would never appear as Spanish.
# BUGFIX: the pronoun was stored as uppercase "I", but every consumer
# (classify_line, looks_english) lowercases the word before the membership
# test, so it could never match. Store it lowercase like the rest.
ENGLISH_ONLY_WORDS = {"the", "he", "she", "it", "we", "they", "i", "is", "are", "was", "were",
                      "been", "have", "has", "had", "will", "would", "should", "could"}
# Column separator: a run of 2+ spaces/tabs, or a dash variant (em/en dash,
# minus, hyphen) surrounded by whitespace.
SEP_RE = re.compile(r"[ \t]{2,}|\s[—–−-]\s")
def classify_line(line: str) -> str:
    """Return 'es', 'en', or 'unknown' for the dominant language of a vocab line."""
    stripped = line.strip()
    if not stripped:
        return "unknown"
    # Any Spanish accent/punctuation mark settles it immediately.
    if SPANISH_ACCENT_RE.search(stripped):
        return "es"
    lead = stripped.split()[0].lower().strip(",.;:")
    if lead in SPANISH_ARTICLES:
        return "es"
    # Lines opening with an English determiner/pronoun/function word are English.
    if lead in ENGLISH_STARTERS or lead in ENGLISH_ONLY_WORDS:
        return "en"
    return "unknown"
def looks_english(word: str) -> bool:
    """Legacy helper — kept for try_split_row below."""
    w = word.lower().strip()
    # Empty, accented, or article-leading words are definitely not English.
    if not w or SPANISH_ACCENT_RE.search(w) or w in SPANISH_ARTICLES:
        return False
    if w in ENGLISH_STARTERS or w in ENGLISH_ONLY_WORDS:
        return True
    # Fall back to "plausibly English-looking" characters only.
    return re.match(r"^[a-z][a-z\s'/()\-,.]*$", w) is not None
def try_split_row(line: str) -> "tuple[str, str] | None":
    """Split a line into (spanish, english) if it looks like a vocab entry."""
    text = line.strip()
    if len(text) < 3:
        return None
    # Split on explicit separators (2+ spaces, or a spaced dash variant) and
    # keep only non-empty cells.
    cells = [c for c in (p.strip() for p in SEP_RE.split(text)) if c]
    if len(cells) != 2:
        return None
    spanish, english = cells
    # Accept only when the right side reads as English and the left does not.
    if looks_english(english) and not looks_english(spanish.split()[0]):
        return (spanish, english)
    return None
def load(p: Path) -> dict:
    """Parse the UTF-8 JSON file at *p* and return the decoded object."""
    with p.open(encoding="utf-8") as fh:
        return json.load(fh)
def build_vocab_cards_for_block(block: dict, ocr_entry: "dict | None", chapter: dict, context_title: str, idx: int) -> list:
    """Given a vocab_image block + its OCR lines, derive flashcards.

    Two strategies, tried in order:
      1. Inline separators — every line splits as "spanish — english".
      2. Block-alternating layout — Vision OCR reads a two-column table
         top-to-bottom per column, producing runs like
         [ES...ES][EN...EN][ES...ES][EN...EN]; classify each line, smooth
         "unknown" lines using neighbors, then pair equal-length consecutive
         ES/EN runs.

    block          -- source vocab_image block; only "src" is read here.
    ocr_entry      -- OCR result for block["src"], or None/empty when missing.
    chapter        -- chapter dict; "number" and "title" are stored per card.
    context_title  -- nearest section heading, stored per card.
    idx            -- block index within the chapter; currently unused, kept
                      for interface stability.
    Returns a (possibly empty) list of flashcard dicts.
    """
    cards = []
    if not ocr_entry:
        return cards
    # Normalize: strip whitespace and drop blank OCR lines up front.
    lines = [l.strip() for l in ocr_entry.get("lines", []) if l.strip()]
    if not lines:
        return cards

    def card(front: str, back: str) -> dict:
        # One flashcard, tagged with enough provenance (chapter, section,
        # source image) to trace it back to the textbook page.
        return {
            "front": front,
            "back": back,
            "chapter": chapter["number"],
            "chapterTitle": chapter["title"],
            "section": context_title,
            "sourceImage": block["src"],
        }

    # Attempt 1: explicit inline separator (e.g. "la casa — the house").
    # All-or-nothing: a single unsplittable line abandons this strategy.
    inline = []
    all_inline = True
    for line in lines:
        pair = try_split_row(line)
        if pair:
            inline.append(pair)
        else:
            all_inline = False
            break
    if all_inline and inline:
        for es, en in inline:
            cards.append(card(es, en))
        return cards

    # Attempt 2: block-alternating layout (see docstring).
    classes = [classify_line(l) for l in lines]
    # Pass 1 (forward): fill unknowns with the nearest preceding known class.
    last_known = "unknown"
    forward = []
    for c in classes:
        if c != "unknown":
            last_known = c
        forward.append(last_known)
    # Pass 2 (backward): fill unknowns with the nearest following known class
    # — needed for unknowns that precede the first classified line.
    last_known = "unknown"
    backward = [""] * len(classes)
    for i in range(len(classes) - 1, -1, -1):
        if classes[i] != "unknown":
            last_known = classes[i]
        backward[i] = last_known
    # Merge the two passes: prefer the forward fill unless it is still unknown.
    resolved = []
    for f, b in zip(forward, backward):
        if f != "unknown":
            resolved.append(f)
        elif b != "unknown":
            resolved.append(b)
        else:
            resolved.append("unknown")
    # Group consecutive same-language lines into (lang, [lines]) runs.
    blocks: list = []
    cur_lang: "str | None" = None
    cur_block: list = []
    for line, lang in zip(lines, resolved):
        if lang != cur_lang:
            if cur_block and cur_lang is not None:
                blocks.append((cur_lang, cur_block))
            cur_block = [line]
            cur_lang = lang
        else:
            cur_block.append(line)
    if cur_block and cur_lang is not None:
        blocks.append((cur_lang, cur_block))
    # Walk adjacent runs, pairing ES↔EN only when the runs are the same
    # length (row i of one column matches row i of the other).
    i = 0
    while i < len(blocks) - 1:
        lang_a, lines_a = blocks[i]
        lang_b, lines_b = blocks[i + 1]
        if lang_a == "es" and lang_b == "en" and len(lines_a) == len(lines_b):
            for es, en in zip(lines_a, lines_b):
                cards.append(card(es, en))
            i += 2
            continue
        # If reversed order (some pages have EN column on left), try that too.
        if lang_a == "en" and lang_b == "es" and len(lines_a) == len(lines_b):
            for es, en in zip(lines_b, lines_a):
                cards.append(card(es, en))
            i += 2
            continue
        # Unpairable run (mismatched lengths or unknown): skip one and retry.
        i += 1
    return cards
def clean_instruction(text: str) -> str:
    """Strip leading/trailing emphasis markers from a parsed instruction."""
    # The XHTML parser emitted * and ** for emphasis; removing every
    # asterisk flattens both, then trim surrounding whitespace.
    return text.replace("*", "").strip()
def merge() -> None:
    """Build book.json and vocab_cards.json from the three input files.

    Reads CHAPTERS_JSON and ANSWERS_JSON (required) plus OCR_JSON (optional),
    cross-references exercise blocks with their answer-key entries, converts
    vocab_image blocks into vocab_table blocks with derived flashcards, and
    writes both outputs as UTF-8. Prints a summary and validation counts.

    Raises KeyError/FileNotFoundError on malformed or missing required input.
    """
    chapters_data = load(CHAPTERS_JSON)
    answers_data = load(ANSWERS_JSON)
    try:
        ocr_data = load(OCR_JSON)
    except FileNotFoundError:
        # OCR is optional: vocab tables come out empty and exercise prompts
        # fall back to XHTML-parsed text only.
        print("ocr.json not found — proceeding with empty OCR data")
        ocr_data = {}
    answers = answers_data["answers"]
    chapters = chapters_data["chapters"]
    parts = chapters_data.get("part_memberships", {})
    book_chapters = []
    all_vocab_cards = []
    missing_ocr = set()
    for ch in chapters:
        out_blocks = []
        # Track the nearest heading so derived cards carry a section title;
        # falls back to the chapter title until the first heading appears.
        current_section_title = ch["title"]
        for bi, block in enumerate(ch["blocks"]):
            k = block["kind"]
            if k == "heading":
                current_section_title = block["text"]
                out_blocks.append(block)
                continue
            if k == "paragraph":
                out_blocks.append(block)
                continue
            if k == "key_vocab_header":
                out_blocks.append(block)
                continue
            if k == "vocab_image":
                ocr_entry = ocr_data.get(block["src"])
                if ocr_entry is None:
                    missing_ocr.add(block["src"])
                derived = build_vocab_cards_for_block(
                    block, ocr_entry, ch, current_section_title, bi
                )
                all_vocab_cards.extend(derived)
                out_blocks.append({
                    "kind": "vocab_table",
                    "sourceImage": block["src"],
                    "ocrLines": ocr_entry.get("lines", []) if ocr_entry else [],
                    "ocrConfidence": ocr_entry.get("confidence", 0.0) if ocr_entry else 0.0,
                    "cardCount": len(derived),
                })
                continue
            if k == "exercise":
                ans = answers.get(block["id"])
                image_ocr_lines = []
                for src in block.get("image_refs", []):
                    e = ocr_data.get(src)
                    if e is None:
                        missing_ocr.add(src)
                        continue
                    image_ocr_lines.extend(e.get("lines", []))
                # Build the final prompt list. Prefer text prompts parsed from
                # XHTML; otherwise fall back to OCR lines that look numbered
                # ("1. ..." or "1) ...").
                prompts = [p for p in block.get("prompts", []) if p.strip()]
                extras = [e for e in block.get("extra", []) if e.strip()]
                if not prompts and image_ocr_lines:
                    for line in image_ocr_lines:
                        m = re.match(r"^(\d+)[.)]\s*(.+)", line.strip())
                        if m:
                            prompts.append(f"{m.group(1)}. {m.group(2)}")
                # Flatten the answer key's labeled subparts into one item list.
                sub = ans["subparts"] if ans else []
                answer_items = []
                for sp in sub:
                    for it in sp["items"]:
                        answer_items.append({
                            "label": sp["label"],
                            "number": it["number"],
                            "answer": it["answer"],
                            "alternates": it["alternates"],
                        })
                out_blocks.append({
                    "kind": "exercise",
                    "id": block["id"],
                    "ansAnchor": block.get("ans_anchor", ""),
                    "instruction": clean_instruction(block.get("instruction", "")),
                    "extra": extras,
                    "prompts": prompts,
                    "ocrLines": image_ocr_lines,
                    "freeform": ans["freeform"] if ans else False,
                    "answerItems": answer_items,
                    "answerRaw": ans["raw"] if ans else "",
                    "answerSubparts": sub,
                })
                continue
            # Unknown block kinds pass through untouched.
            out_blocks.append(block)
        book_chapters.append({
            "id": ch["id"],
            "number": ch["number"],
            "title": ch["title"],
            "part": ch.get("part"),
            "blocks": out_blocks,
        })
    book = {
        "courseName": COURSE_NAME,
        "totalChapters": len(book_chapters),
        "totalExercises": sum(
            1 for ch in book_chapters for b in ch["blocks"] if b["kind"] == "exercise"
        ),
        "totalVocabTables": sum(
            1 for ch in book_chapters for b in ch["blocks"] if b["kind"] == "vocab_table"
        ),
        "totalVocabCards": len(all_vocab_cards),
        "parts": parts,
        "chapters": book_chapters,
    }
    # BUGFIX: write_text defaults to the locale's preferred encoding; with
    # ensure_ascii=False the payload contains accented Spanish characters,
    # so an ASCII/latin-1 locale would raise or corrupt output. Force UTF-8.
    OUT_BOOK.write_text(json.dumps(book, ensure_ascii=False), encoding="utf-8")
    # Vocab cards as a separate file (grouped per chapter so they can be seeded
    # as CourseDecks in the existing schema).
    vocab_by_chapter: dict = {}
    for card in all_vocab_cards:
        vocab_by_chapter.setdefault(card["chapter"], []).append(card)
    OUT_VOCAB.write_text(json.dumps({
        "courseName": COURSE_NAME,
        "chapters": [
            {
                "chapter": ch_num,
                "cards": cards,
            }
            for ch_num, cards in sorted(vocab_by_chapter.items())
        ],
    }, ensure_ascii=False, indent=2), encoding="utf-8")
    # Summary
    print(f"Wrote {OUT_BOOK}")
    print(f"Wrote {OUT_VOCAB}")
    print(f"Chapters: {book['totalChapters']}")
    print(f"Exercises: {book['totalExercises']}")
    print(f"Vocab tables: {book['totalVocabTables']}")
    print(f"Vocab cards (auto): {book['totalVocabCards']}")
    if missing_ocr:
        print(f"Missing OCR for {len(missing_ocr)} images (first 5): {sorted(missing_ocr)[:5]}")
    # Validation: how many exercises actually carry usable content.
    total_exercises = book["totalExercises"]
    exercises_with_prompts = sum(
        1 for ch in book_chapters for b in ch["blocks"]
        if b["kind"] == "exercise" and (b["prompts"] or b["extra"])
    )
    exercises_with_answers = sum(
        1 for ch in book_chapters for b in ch["blocks"]
        if b["kind"] == "exercise" and b["answerItems"]
    )
    exercises_freeform = sum(
        1 for ch in book_chapters for b in ch["blocks"]
        if b["kind"] == "exercise" and b["freeform"]
    )
    print(f"Exercises with prompts: {exercises_with_prompts}/{total_exercises}")
    print(f"Exercises with answers: {exercises_with_answers}/{total_exercises}")
    print(f"Freeform exercises: {exercises_freeform}")


if __name__ == "__main__":
    merge()