Add textbook reader, exercise grading, stem-change toggle, extraction pipeline

Major changes:
- Textbook UI: chapter list, reader, and interactive exercise view (keyboard
  + Apple Pencil) surfaced under the Course tab. 30 chapters, 251 exercises.
- Stem-change conjugation toggle on Week 4 flashcard decks (E-IE, E-I, O-UE).
  Uses existing VerbForm + IrregularSpan data to render highlighted present
  tense conjugations inline.
- Deterministic on-device answer grader with partial credit (correct / close
  for accent-stripped or single-char-typo / wrong). 11 unit tests cover it.
- SharedModels: TextbookChapter (local), TextbookExerciseAttempt (cloud-
  synced), AnswerGrader helpers. Bumped schema.
- DataLoader: textbook seeder (version 8) + refresh helpers that preserve
  LanGo course decks when textbook data is re-seeded.
- Local extraction pipeline in Conjuga/Scripts/textbook/ — XHTML chapter
  parser, answer-key parser, macOS Vision image OCR + PDF page OCR, merger,
  NSSpellChecker validator, language-aware auto-fixer, and repair pass that
  re-pairs quarantined vocab rows using bounding-box coordinates.
- UI test target (ConjugaUITests) with three tests: end-to-end textbook
  flow, all-chapters screenshot audit, and stem-change toggle verification.

Generated textbook content (textbook_data.json, textbook_vocab.json) and
third-party source files are gitignored — re-run Scripts/textbook/run_pipeline.sh
locally to regenerate.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Trey T
2026-04-19 15:12:55 -05:00
parent 5ba76a947b
commit 63dfc5e41a
34 changed files with 4516 additions and 61 deletions

View File

@@ -0,0 +1,369 @@
#!/usr/bin/env python3
"""Parse all chapter XHTMLs + appendix into structured chapters.json.
Output schema:
{
"chapters": [
{
"id": "ch1",
"number": 1,
"title": "Nouns, Articles, and Adjectives",
"part": 1, # part 1/2/3 or null
"blocks": [ # ordered content
{"kind": "heading", "level": 3, "text": "..."},
{"kind": "paragraph", "text": "...", "hasItalic": false},
{"kind": "key_vocab_header", "title": "Los colores (The colors)"},
{"kind": "vocab_image", "src": "f0010-03.jpg"},
{
"kind": "exercise",
"id": "1.1",
"ans_anchor": "ch1ans1",
"instruction": "Write the appropriate...",
"image_refs": ["f0005-02.jpg"]
},
{"kind": "image", "src": "...", "alt": "..."}
]
}
]
}
"""
import json
import re
from pathlib import Path
from bs4 import BeautifulSoup
# Root of the unpacked EPUB content; output lands next to this script.
ROOT = Path(__file__).resolve().parents[3] / "epub_extract" / "OEBPS"
OUT = Path(__file__).resolve().parent / "chapters.json"
# Common icon images embedded in headings — ignore when collecting content images
ICON_IMAGES = {"Common01.jpg", "Common02.jpg", "Common03.jpg", "Common04.jpg", "Common05.jpg"}
# Matches heading text like "Exercise 1.2", capturing the dotted exercise id.
EXERCISE_ID_RE = re.compile(r"Exercise\s+([0-9]+\.[0-9]+)")
# Matches answer-key anchors like "ch1ans3" (chapter number, answer number).
ANS_REF_RE = re.compile(r"ch(\d+)ans(\d+)")
def clean_text(el) -> str:
    """Return the plain text of *el*, whitespace- and punctuation-normalized.

    All inline emphasis (<em>/<i>/<strong>/<b>) is dropped entirely: the
    source has nested/sibling emphasis patterns that CommonMark can't
    reliably parse, causing markers to leak into the UI. Plain text renders
    cleanly everywhere.

    Returns "" when *el* is None.
    """
    if el is None:
        return ""
    html = str(el)
    soup = BeautifulSoup(html, "lxml")
    # Unwrap every inline emphasis tag in one pass. (An earlier revision
    # first "flattened" nested <strong><em> pairs, but that was redundant:
    # unwrap preserves the child text nodes, so unconditionally unwrapping
    # all four tag names yields the same text regardless of nesting.)
    for tag in soup.find_all(["em", "i", "strong", "b"]):
        tag.unwrap()
    # Drop pagebreak spans entirely — they carry no reader-visible text.
    for tag in soup.find_all("span", attrs={"epub:type": "pagebreak"}):
        tag.decompose()
    # Replace <br/> with newline; the whitespace collapse below turns it
    # into a single space, keeping the adjacent words separated.
    for br in soup.find_all("br"):
        br.replace_with("\n")
    # Use a separator so adjacent inline tags don't concatenate without spaces
    # (e.g. "<strong><em>Ir</em></strong> and" would otherwise become "Irand").
    text = soup.get_text(separator=" ", strip=False)
    # Collapse runs of whitespace first.
    text = re.sub(r"\s+", " ", text).strip()
    # Strip any stray asterisks that sneak through (e.g. author's literal *).
    text = text.replace("*", "")
    # De-space punctuation
    text = re.sub(r"\s+([,.;:!?])", r"\1", text)
    # Tighten brackets that picked up separator-spaces: "( foo )" -> "(foo)"
    text = re.sub(r"([(\[])\s+", r"\1", text)
    text = re.sub(r"\s+([)\]])", r"\1", text)
    # Collapse any double-spaces
    text = re.sub(r" +", " ", text).strip()
    return text
def is_exercise_header(h) -> bool:
    """True for a heading that links into the answer key (an exercise).

    Such headings contain an <a href='ans.xhtml#...'>Exercise N.N</a> link.
    Chapters 1-16 use h3.h3k; chapters 17+ use h4.h4.
    """
    if h.name not in ("h3", "h4"):
        return False
    anchor = h.find("a", href=True)
    return anchor is not None and "ans.xhtml" in anchor["href"]
def is_key_vocab_header(h) -> bool:
    """True for a 'Key Vocabulary' heading with no answer-key anchor link."""
    if h.name not in ("h3", "h4"):
        return False
    heading_text = h.get_text(strip=True)
    answer_link = h.find("a", href=lambda v: v and "ans.xhtml" in v)
    # Keep bs4 truthiness on the found tag (an empty anchor is falsy).
    return "Key Vocabulary" in heading_text and not answer_link
def extract_image_srcs(parent) -> list:
    """Collect <img> src attributes under *parent*, skipping icon images."""
    return [
        src
        for img in parent.find_all("img")
        if (src := img.get("src", "")) and Path(src).name not in ICON_IMAGES
    ]
def parse_chapter(path: Path) -> "dict | None":
    """Parse one chapter file into structured blocks.

    Returns {"id", "number", "title", "blocks"} per the module-docstring
    schema, or None when the file has no <body>. Blocks are emitted in
    document order; paragraphs following an exercise header are folded into
    that exercise until a non-exercise heading resets the state.
    """
    html = path.read_text(encoding="utf-8")
    soup = BeautifulSoup(html, "lxml")
    body = soup.find("body")
    if body is None:
        return None
    # Chapter number + title — both come from h2 elements near the top.
    number = None
    title = ""
    h2s = body.find_all("h2")
    for h2 in h2s:
        classes = h2.get("class") or []
        # Use a separator so consecutive inline tags don't concatenate
        # (e.g. "<strong><em>Ir</em></strong> and the Future" → "Ir and the Future")
        text_with_sep = re.sub(r"\s+", " ", h2.get_text(" ", strip=True))
        # Strip spaces that were inserted before punctuation
        text_with_sep = re.sub(r"\s+([,.;:!?])", r"\1", text_with_sep).strip()
        if "h2c" in classes and text_with_sep.isdigit():
            number = int(text_with_sep)
        # Chapters 1-16 use h2c1; chapters 17+ use h2-c
        elif ("h2c1" in classes or "h2-c" in classes) and not title:
            title = text_with_sep
    if number is None:
        # Fallback: try id on chapter header (ch1 → 1)
        for h2 in h2s:
            id_ = h2.get("id", "")
            m = re.match(r"ch(\d+)", id_)
            if m:
                number = int(m.group(1))
                break
    chapter_id = path.stem  # ch1, ch2, ...
    # Walk section content in document order
    section = body.find("section") or body
    blocks: list = []
    # State: the most recently opened exercise block. While set, subsequent
    # paragraphs are routed into it (instruction, prompts, images, word
    # banks) instead of becoming standalone blocks.
    pending_instruction = None  # holds italic paragraph following an exercise header
    for el in section.descendants:
        # descendants yields text nodes too; only Tags have a name.
        if el.name is None:
            continue
        classes = el.get("class") or []
        # Skip nested tags already captured via parent processing
        # We operate only on direct h2/h3/h4/h5/p elements
        if el.name not in ("h2", "h3", "h4", "h5", "p"):
            continue
        # Exercise header detection (h3 in ch1-16, h4 in ch17+)
        if is_exercise_header(el):
            a = el.find("a", href=True)
            href = a["href"] if a else ""
            m = EXERCISE_ID_RE.search(el.get_text())
            ex_id = m.group(1) if m else ""
            anchor_m = ANS_REF_RE.search(href)
            ans_anchor = anchor_m.group(0) if anchor_m else ""
            blocks.append({
                "kind": "exercise",
                "id": ex_id,
                "ans_anchor": ans_anchor,
                "instruction": "",
                "image_refs": [],
                "prompts": []
            })
            # Open the pending-exercise state; following <p>s attach here.
            pending_instruction = blocks[-1]
            continue
        # Key Vocabulary header — also closes any pending exercise.
        if is_key_vocab_header(el):
            blocks.append({"kind": "key_vocab_header", "title": "Key Vocabulary"})
            pending_instruction = None
            continue
        # Other headings
        if el.name in ("h2", "h3", "h4", "h5"):
            if el.name == "h2":
                # Skip the chapter-number/chapter-title h2s we already captured
                continue
            txt = clean_text(el)
            if txt:
                blocks.append({
                    "kind": "heading",
                    "level": int(el.name[1]),
                    "text": txt,
                })
            # Any heading ends the pending-exercise state.
            pending_instruction = None
            continue
        # Paragraphs
        if el.name == "p":
            imgs = extract_image_srcs(el)
            text = clean_text(el)
            p_classes = set(classes)
            # Skip pure blank-line class ("nump" = underscore lines under number prompts)
            if p_classes & {"nump", "numpa"} and not text:
                continue
            # Exercise prompt: <p class="number">1. Prompt text</p>
            # Also number1, number2 (continuation numbering), numbera, numbert
            if pending_instruction is not None and p_classes & {"number", "number1", "number2", "numbera", "numbert"}:
                if text:
                    pending_instruction["prompts"].append(text)
                continue
            # Image container for a pending exercise
            if pending_instruction is not None and imgs and not text:
                pending_instruction["image_refs"].extend(imgs)
                continue
            # Instruction line right after the exercise header
            if pending_instruction is not None and text and not imgs and not pending_instruction["instruction"]:
                pending_instruction["instruction"] = text
                continue
            # While in pending-exercise state, extra text paragraphs are word
            # banks / context ("from the following list:" etc) — keep pending alive.
            if pending_instruction is not None and text and not imgs:
                pending_instruction.setdefault("extra", []).append(text)
                continue
            # Paragraphs that contain an image belong to vocab/key-vocab callouts
            if imgs and not text:
                for src in imgs:
                    blocks.append({"kind": "vocab_image", "src": src})
                continue
            # Mixed paragraph: image with caption
            if imgs and text:
                for src in imgs:
                    blocks.append({"kind": "vocab_image", "src": src})
                blocks.append({"kind": "paragraph", "text": text})
                continue
            # Plain paragraph — outside any exercise
            if text:
                blocks.append({"kind": "paragraph", "text": text})
    return {
        "id": chapter_id,
        "number": number,
        "title": title,
        "blocks": blocks,
    }
def assign_parts(chapters: list, part_files: "dict[int, list[int]]") -> None:
    """Annotate each chapter dict with its part number (in place).

    Chapters whose number appears in *part_files* get that part; the rest
    get "part": None unless they already carry a "part" key.
    """
    # Invert part -> [chapter numbers] into chapter number -> part.
    part_of: dict = {}
    for part_num, chapter_nums in part_files.items():
        for num in chapter_nums:
            part_of[num] = part_num
    for ch in chapters:
        if ch["number"] in part_of:
            ch["part"] = part_of[ch["number"]]
        else:
            ch.setdefault("part", None)
def _memberships_from_hrefs(hrefs: "list[str]") -> "dict[int, list[int]]":
    """Group chN.xhtml numbers under the most recent partN.xhtml in *hrefs*."""
    memberships: dict = {}
    current_part: "int | None" = None
    for href in hrefs:
        m_part = re.match(r"part(\d+)\.xhtml", href)
        m_ch = re.match(r"ch(\d+)\.xhtml", href)
        if m_part:
            current_part = int(m_part.group(1))
            memberships.setdefault(current_part, [])
        elif m_ch and current_part is not None:
            memberships[current_part].append(int(m_ch.group(1)))
    return memberships


def read_part_memberships() -> "dict[int, list[int]]":
    """Derive part→chapter grouping from the OPF package document.

    Prefers <spine> itemref order (the authoritative reading order); falls
    back to manifest <item> order only when no spine is present. Returns {}
    when no .opf file exists under ROOT.

    NOTE(review): the original computed the manifest-order grouping first
    and then discarded it whenever a spine existed; the grouping logic is
    now shared and run once on whichever href sequence applies.
    """
    opf = next(ROOT.glob("*.opf"), None)
    if opf is None:
        return {}
    soup = BeautifulSoup(opf.read_text(encoding="utf-8"), "xml")
    spine = soup.find("spine")
    if spine is not None:
        # Resolve each itemref's idref to its manifest item's href so the
        # sequence reflects true reading order.
        hrefs = []
        for ref in spine.find_all("itemref"):
            idref = ref.get("idref")
            item = soup.find("item", attrs={"id": idref})
            if item is not None:
                hrefs.append(item.get("href", ""))
    else:
        # Manifest order tends to match spine order for this book.
        hrefs = [item.get("href", "") for item in soup.find_all("item")]
    return _memberships_from_hrefs(hrefs)
def main() -> None:
    """Parse every chapter file, write chapters.json, and print a summary."""
    # Keep only files whose stem is actually "ch<digits>" — a stray
    # ch*-prefixed file (e.g. "chapters.xhtml") would previously crash the
    # sort key with AttributeError on the failed re.match.
    chapter_files = sorted(
        (p for p in ROOT.glob("ch*.xhtml") if re.match(r"ch(\d+)", p.stem)),
        key=lambda p: int(re.match(r"ch(\d+)", p.stem).group(1))
    )
    chapters = []
    for path in chapter_files:
        ch = parse_chapter(path)
        if ch:
            chapters.append(ch)
    part_memberships = read_part_memberships()
    assign_parts(chapters, part_memberships)
    out = {
        "chapters": chapters,
        "part_memberships": part_memberships,
    }
    OUT.write_text(json.dumps(out, ensure_ascii=False, indent=2))

    # Summary
    def count(pred) -> int:
        # Number of blocks across all chapters satisfying *pred*.
        return sum(1 for ch in chapters for b in ch["blocks"] if pred(b))

    ex_total = count(lambda b: b["kind"] == "exercise")
    ex_with_prompts = count(lambda b: b["kind"] == "exercise" and b["prompts"])
    ex_with_images = count(lambda b: b["kind"] == "exercise" and b["image_refs"])
    ex_empty = count(
        lambda b: b["kind"] == "exercise" and not b["prompts"] and not b["image_refs"]
    )
    para_total = count(lambda b: b["kind"] == "paragraph")
    vocab_img_total = count(lambda b: b["kind"] == "vocab_image")
    print(f"Chapters: {len(chapters)}")
    print(f"Exercises total: {ex_total}")
    print(f" with text prompts: {ex_with_prompts}")
    print(f" with image prompts: {ex_with_images}")
    print(f" empty: {ex_empty}")
    print(f"Paragraphs: {para_total}")
    print(f"Vocab images: {vocab_img_total}")
    print(f"Parts: {part_memberships}")
    print(f"Wrote {OUT}")


if __name__ == "__main__":
    main()