Add textbook reader, exercise grading, stem-change toggle, extraction pipeline
Major changes: - Textbook UI: chapter list, reader, and interactive exercise view (keyboard + Apple Pencil) surfaced under the Course tab. 30 chapters, 251 exercises. - Stem-change conjugation toggle on Week 4 flashcard decks (E-IE, E-I, O-UE). Uses existing VerbForm + IrregularSpan data to render highlighted present tense conjugations inline. - Deterministic on-device answer grader with partial credit (correct / close for accent-stripped or single-char-typo / wrong). 11 unit tests cover it. - SharedModels: TextbookChapter (local), TextbookExerciseAttempt (cloud- synced), AnswerGrader helpers. Bumped schema. - DataLoader: textbook seeder (version 8) + refresh helpers that preserve LanGo course decks when textbook data is re-seeded. - Local extraction pipeline in Conjuga/Scripts/textbook/ — XHTML chapter parser, answer-key parser, macOS Vision image OCR + PDF page OCR, merger, NSSpellChecker validator, language-aware auto-fixer, and repair pass that re-pairs quarantined vocab rows using bounding-box coordinates. - UI test target (ConjugaUITests) with three tests: end-to-end textbook flow, all-chapters screenshot audit, and stem-change toggle verification. Generated textbook content (textbook_data.json, textbook_vocab.json) and third-party source files are gitignored — re-run Scripts/textbook/run_pipeline.sh locally to regenerate. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
374
Conjuga/Scripts/textbook/build_book.py
Normal file
374
Conjuga/Scripts/textbook/build_book.py
Normal file
@@ -0,0 +1,374 @@
|
||||
#!/usr/bin/env python3
"""Merge chapters.json + answers.json + ocr.json → book.json (single source).

Also emits vocab_cards.json: flashcards derived from vocab_image blocks where
OCR text parses as a clean two-column (Spanish ↔ English) table.
"""

import json
import re
import sys
from pathlib import Path

# All inputs/outputs live next to this script.
HERE = Path(__file__).resolve().parent
CHAPTERS_JSON = HERE / "chapters.json"
ANSWERS_JSON = HERE / "answers.json"
OCR_JSON = HERE / "ocr.json"
OUT_BOOK = HERE / "book.json"
OUT_VOCAB = HERE / "vocab_cards.json"

COURSE_NAME = "Complete Spanish Step-by-Step"

# Heuristic: parseable "Spanish | English" vocab rows.
# OCR usually produces "word — translation" or "word translation" separated
# by 2+ spaces. We detect rows that contain both Spanish and English words.
SPANISH_ACCENT_RE = re.compile(r"[áéíóúñüÁÉÍÓÚÑÜ¿¡]")
SPANISH_ARTICLES = {"el", "la", "los", "las", "un", "una", "unos", "unas"}
ENGLISH_STARTERS = {"the", "a", "an", "to", "my", "his", "her", "our", "their", "your", "some"}
# English-only words that would never appear as Spanish.
# NOTE: entries must be lowercase — every consumer (classify_line,
# looks_english) lowercases the word before testing membership, so the
# previous uppercase "I" entry could never match. Fixed to "i".
ENGLISH_ONLY_WORDS = {"the", "he", "she", "it", "we", "they", "i", "is", "are", "was", "were",
                      "been", "have", "has", "had", "will", "would", "should", "could"}
# Column/pair separators: a run of 2+ spaces/tabs, or a spaced dash variant.
SEP_RE = re.compile(r"[ \t]{2,}|\s[—–−-]\s")
|
||||
|
||||
|
||||
def classify_line(line: str) -> str:
    """Return 'es', 'en', or 'unknown' for the dominant language of a vocab line."""
    stripped = line.strip()
    if not stripped:
        return "unknown"
    # Any Spanish accent/punctuation mark settles it immediately.
    if SPANISH_ACCENT_RE.search(stripped):
        return "es"
    lead = stripped.split()[0].lower().strip(",.;:")
    if lead in SPANISH_ARTICLES:
        return "es"
    # English sentence starters and English-only function words both mean 'en'.
    if lead in ENGLISH_STARTERS or lead in ENGLISH_ONLY_WORDS:
        return "en"
    return "unknown"
|
||||
|
||||
|
||||
def looks_english(word: str) -> bool:
    """Legacy helper — kept for try_split_row below."""
    w = word.lower().strip()
    # Empty, accented, or Spanish-article words are definitely not English.
    if not w or SPANISH_ACCENT_RE.search(w) or w in SPANISH_ARTICLES:
        return False
    # Known English function words are definitely English.
    if w in ENGLISH_STARTERS or w in ENGLISH_ONLY_WORDS:
        return True
    # Fallback: plain unaccented latin text with common punctuation.
    return re.match(r"^[a-z][a-z\s'/()\-,.]*$", w) is not None
|
||||
|
||||
|
||||
def try_split_row(line: str) -> "tuple[str, str] | None":
    """Split a line into (spanish, english) if it looks like a vocab entry."""
    text = line.strip()
    if len(text) < 3:
        # Too short to hold two words plus a separator (also covers empty).
        return None
    # Try explicit separators first.
    pieces = [part.strip() for part in SEP_RE.split(text) if part.strip()]
    if len(pieces) != 2:
        return None
    es, en = pieces
    # Accept only when the right side looks English and the left side doesn't.
    if looks_english(en) and not looks_english(es.split()[0]):
        return (es, en)
    return None
|
||||
|
||||
|
||||
def load(p: Path) -> dict:
    """Read *p* as UTF-8 text and return the parsed JSON object."""
    with p.open(encoding="utf-8") as fh:
        return json.load(fh)
|
||||
|
||||
|
||||
def build_vocab_cards_for_block(block: dict, ocr_entry: dict, chapter: dict, context_title: str, idx: int) -> list:
    """Given a vocab_image block + its OCR lines, derive flashcards.

    Vision OCR reads top-to-bottom, left-to-right; a two-column vocab table
    produces Spanish lines first, then English lines. We split the list in
    half when one side is predominantly Spanish and the other English.
    Per-line '—' separators are also supported as a fallback.

    Args:
        block: a "vocab_image" block dict; only ``block["src"]`` is read here.
        ocr_entry: OCR result dict for the image (``lines`` key), or None/empty
            when OCR is missing — then no cards are produced.
        chapter: chapter dict; ``number`` and ``title`` go onto each card.
        context_title: nearest section heading, recorded on each card.
        idx: block index within the chapter — currently unused (kept for the
            caller's signature; NOTE(review): could be dropped or used for
            card ordering).

    Returns:
        A list of card dicts (front/back/chapter/chapterTitle/section/
        sourceImage). Empty when OCR is absent or no pairing strategy applies.
    """
    cards = []
    if not ocr_entry:
        return cards
    lines = [l.strip() for l in ocr_entry.get("lines", []) if l.strip()]
    if not lines:
        return cards

    def card(front: str, back: str) -> dict:
        # Closure over block/chapter/context_title: one card record.
        return {
            "front": front,
            "back": back,
            "chapter": chapter["number"],
            "chapterTitle": chapter["title"],
            "section": context_title,
            "sourceImage": block["src"],
        }

    # Attempt 1: explicit inline separator (e.g. "la casa — the house").
    # All-or-nothing: a single unsplittable line abandons this strategy,
    # so mixed layouts fall through to the column heuristic below.
    inline = []
    all_inline = True
    for line in lines:
        pair = try_split_row(line)
        if pair:
            inline.append(pair)
        else:
            all_inline = False
            break
    if all_inline and inline:
        for es, en in inline:
            cards.append(card(es, en))
        return cards

    # Attempt 2: block-alternating layout.
    # Vision OCR reads columns top-to-bottom, so a 2-col table rendered across
    # 2 visual columns produces runs like: [ES...ES][EN...EN][ES...ES][EN...EN]
    # We classify each line, smooth "unknown" using neighbors, then pair
    # same-sized consecutive ES/EN blocks.
    classes = [classify_line(l) for l in lines]

    # Pass 1: fill unknowns using nearest non-unknown neighbor (forward)
    last_known = "unknown"
    forward = []
    for c in classes:
        if c != "unknown":
            last_known = c
        forward.append(last_known)
    # Pass 2: backfill leading unknowns (backward)
    last_known = "unknown"
    backward = [""] * len(classes)
    for i in range(len(classes) - 1, -1, -1):
        if classes[i] != "unknown":
            last_known = classes[i]
        backward[i] = last_known
    # Merge: prefer forward unless still unknown
    resolved = []
    for f, b in zip(forward, backward):
        if f != "unknown":
            resolved.append(f)
        elif b != "unknown":
            resolved.append(b)
        else:
            resolved.append("unknown")

    # Group consecutive same-lang lines into (lang, [lines...]) runs.
    blocks: list = []
    cur_lang: "str | None" = None
    cur_block: list = []
    for line, lang in zip(lines, resolved):
        if lang != cur_lang:
            if cur_block and cur_lang is not None:
                blocks.append((cur_lang, cur_block))
            cur_block = [line]
            cur_lang = lang
        else:
            cur_block.append(line)
    if cur_block and cur_lang is not None:
        blocks.append((cur_lang, cur_block))

    # Walk blocks pairing ES then EN of equal length. Unequal-length or
    # same-language adjacent runs advance by one and are left unpaired.
    i = 0
    while i < len(blocks) - 1:
        lang_a, lines_a = blocks[i]
        lang_b, lines_b = blocks[i + 1]
        if lang_a == "es" and lang_b == "en" and len(lines_a) == len(lines_b):
            for es, en in zip(lines_a, lines_b):
                cards.append(card(es, en))
            i += 2
            continue
        # If reversed order (some pages have EN column on left), try that too
        if lang_a == "en" and lang_b == "es" and len(lines_a) == len(lines_b):
            for es, en in zip(lines_b, lines_a):
                cards.append(card(es, en))
            i += 2
            continue
        i += 1

    return cards
|
||||
|
||||
|
||||
def clean_instruction(text: str) -> str:
    """Strip emphasis markers from a parsed instruction and trim whitespace."""
    # The XHTML parser emitted * and ** for emphasis; flatten every run.
    return re.sub(r"\*+", "", text).strip()
|
||||
|
||||
|
||||
def merge() -> None:
    """Merge parsed chapters, answer key, and OCR output into book.json.

    Reads CHAPTERS_JSON and ANSWERS_JSON (both required), plus OCR_JSON
    (optional — falls back to empty OCR data when missing). Writes OUT_BOOK
    (the full book structure) and OUT_VOCAB (auto-derived vocab flashcards
    grouped per chapter), then prints a summary and validation counts.
    """
    chapters_data = load(CHAPTERS_JSON)
    answers_data = load(ANSWERS_JSON)
    try:
        ocr_data = load(OCR_JSON)
    except FileNotFoundError:
        # OCR is optional: exercises/vocab tables then carry empty OCR lines.
        print("ocr.json not found — proceeding with empty OCR data")
        ocr_data = {}

    answers = answers_data["answers"]
    chapters = chapters_data["chapters"]
    parts = chapters_data.get("part_memberships", {})

    book_chapters = []
    all_vocab_cards = []
    # Image paths referenced by blocks but absent from ocr.json, for reporting.
    missing_ocr = set()
    current_section_title = ""

    for ch in chapters:
        out_blocks = []
        # Section context starts as the chapter title until a heading appears.
        current_section_title = ch["title"]

        for bi, block in enumerate(ch["blocks"]):
            k = block["kind"]

            if k == "heading":
                # Headings update the section context used on vocab cards.
                current_section_title = block["text"]
                out_blocks.append(block)
                continue

            if k == "paragraph":
                out_blocks.append(block)
                continue

            if k == "key_vocab_header":
                out_blocks.append(block)
                continue

            if k == "vocab_image":
                # Convert the image block into a vocab_table block, deriving
                # flashcards from its OCR text when available.
                ocr_entry = ocr_data.get(block["src"])
                if ocr_entry is None:
                    missing_ocr.add(block["src"])
                derived = build_vocab_cards_for_block(
                    block, ocr_entry, ch, current_section_title, bi
                )
                all_vocab_cards.extend(derived)
                out_blocks.append({
                    "kind": "vocab_table",
                    "sourceImage": block["src"],
                    "ocrLines": ocr_entry.get("lines", []) if ocr_entry else [],
                    "ocrConfidence": ocr_entry.get("confidence", 0.0) if ocr_entry else 0.0,
                    "cardCount": len(derived),
                })
                continue

            if k == "exercise":
                # Join the exercise with its answer-key entry (by id) and any
                # OCR text from images the exercise references.
                ans = answers.get(block["id"])
                image_ocr_lines = []
                for src in block.get("image_refs", []):
                    e = ocr_data.get(src)
                    if e is None:
                        missing_ocr.add(src)
                        continue
                    image_ocr_lines.extend(e.get("lines", []))

                # Build the final prompt list. If we have text prompts from
                # XHTML, prefer them. Otherwise, attempt to use OCR lines.
                prompts = [p for p in block.get("prompts", []) if p.strip()]
                extras = [e for e in block.get("extra", []) if e.strip()]
                if not prompts and image_ocr_lines:
                    # Extract numbered lines from OCR (look for "1. ..." pattern)
                    for line in image_ocr_lines:
                        m = re.match(r"^(\d+)[.)]\s*(.+)", line.strip())
                        if m:
                            prompts.append(f"{m.group(1)}. {m.group(2)}")

                # Cross-reference prompts with answers: flatten the answer
                # key's subparts into a single labeled item list.
                sub = ans["subparts"] if ans else []
                answer_items = []
                for sp in sub:
                    for it in sp["items"]:
                        answer_items.append({
                            "label": sp["label"],
                            "number": it["number"],
                            "answer": it["answer"],
                            "alternates": it["alternates"],
                        })

                out_blocks.append({
                    "kind": "exercise",
                    "id": block["id"],
                    "ansAnchor": block.get("ans_anchor", ""),
                    "instruction": clean_instruction(block.get("instruction", "")),
                    "extra": extras,
                    "prompts": prompts,
                    "ocrLines": image_ocr_lines,
                    "freeform": ans["freeform"] if ans else False,
                    "answerItems": answer_items,
                    "answerRaw": ans["raw"] if ans else "",
                    "answerSubparts": sub,
                })
                continue

            # Unknown block kinds pass through unchanged.
            out_blocks.append(block)

        book_chapters.append({
            "id": ch["id"],
            "number": ch["number"],
            "title": ch["title"],
            "part": ch.get("part"),
            "blocks": out_blocks,
        })

    book = {
        "courseName": COURSE_NAME,
        "totalChapters": len(book_chapters),
        "totalExercises": sum(
            1 for ch in book_chapters for b in ch["blocks"] if b["kind"] == "exercise"
        ),
        "totalVocabTables": sum(
            1 for ch in book_chapters for b in ch["blocks"] if b["kind"] == "vocab_table"
        ),
        "totalVocabCards": len(all_vocab_cards),
        "parts": parts,
        "chapters": book_chapters,
    }
    # NOTE(review): write_text uses the platform default encoding here;
    # consider encoding="utf-8" since ensure_ascii=False emits non-ASCII.
    OUT_BOOK.write_text(json.dumps(book, ensure_ascii=False))

    # Vocab cards as a separate file (grouped per chapter so they can be seeded
    # as CourseDecks in the existing schema).
    vocab_by_chapter: dict = {}
    for card in all_vocab_cards:
        vocab_by_chapter.setdefault(card["chapter"], []).append(card)
    OUT_VOCAB.write_text(json.dumps({
        "courseName": COURSE_NAME,
        "chapters": [
            {
                "chapter": ch_num,
                "cards": cards,
            }
            for ch_num, cards in sorted(vocab_by_chapter.items())
        ],
    }, ensure_ascii=False, indent=2))

    # Summary
    print(f"Wrote {OUT_BOOK}")
    print(f"Wrote {OUT_VOCAB}")
    print(f"Chapters: {book['totalChapters']}")
    print(f"Exercises: {book['totalExercises']}")
    print(f"Vocab tables: {book['totalVocabTables']}")
    print(f"Vocab cards (auto): {book['totalVocabCards']}")
    if missing_ocr:
        print(f"Missing OCR for {len(missing_ocr)} images (first 5): {sorted(list(missing_ocr))[:5]}")

    # Validation
    total_exercises = book["totalExercises"]
    exercises_with_prompts = sum(
        1 for ch in book_chapters for b in ch["blocks"]
        if b["kind"] == "exercise" and (b["prompts"] or b["extra"])
    )
    exercises_with_answers = sum(
        1 for ch in book_chapters for b in ch["blocks"]
        if b["kind"] == "exercise" and b["answerItems"]
    )
    exercises_freeform = sum(
        1 for ch in book_chapters for b in ch["blocks"]
        if b["kind"] == "exercise" and b["freeform"]
    )
    print(f"Exercises with prompts: {exercises_with_prompts}/{total_exercises}")
    print(f"Exercises with answers: {exercises_with_answers}/{total_exercises}")
    print(f"Freeform exercises: {exercises_freeform}")
|
||||
|
||||
|
||||
# Script entry point: run the merge only when executed directly, so the
# module can be imported (e.g. by tests) without side effects.
if __name__ == "__main__":
    merge()
|
||||
Reference in New Issue
Block a user