Add textbook reader, exercise grading, stem-change toggle, extraction pipeline
Major changes: - Textbook UI: chapter list, reader, and interactive exercise view (keyboard + Apple Pencil) surfaced under the Course tab. 30 chapters, 251 exercises. - Stem-change conjugation toggle on Week 4 flashcard decks (E-IE, E-I, O-UE). Uses existing VerbForm + IrregularSpan data to render highlighted present tense conjugations inline. - Deterministic on-device answer grader with partial credit (correct / close for accent-stripped or single-char-typo / wrong). 11 unit tests cover it. - SharedModels: TextbookChapter (local), TextbookExerciseAttempt (cloud- synced), AnswerGrader helpers. Bumped schema. - DataLoader: textbook seeder (version 8) + refresh helpers that preserve LanGo course decks when textbook data is re-seeded. - Local extraction pipeline in Conjuga/Scripts/textbook/ — XHTML chapter parser, answer-key parser, macOS Vision image OCR + PDF page OCR, merger, NSSpellChecker validator, language-aware auto-fixer, and repair pass that re-pairs quarantined vocab rows using bounding-box coordinates. - UI test target (ConjugaUITests) with three tests: end-to-end textbook flow, all-chapters screenshot audit, and stem-change toggle verification. Generated textbook content (textbook_data.json, textbook_vocab.json) and third-party source files are gitignored — re-run Scripts/textbook/run_pipeline.sh locally to regenerate. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
94
Conjuga/Scripts/textbook/extract_pdf_text.py
Normal file
94
Conjuga/Scripts/textbook/extract_pdf_text.py
Normal file
@@ -0,0 +1,94 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Extract clean text from the PDF source and map each PDF page to the
|
||||
book's printed page number.
|
||||
|
||||
Output: pdf_text.json
|
||||
{
|
||||
"pdfPageCount": 806,
|
||||
"bookPages": {
|
||||
"3": { "text": "...", "pdfIndex": 29 },
|
||||
"4": { ... },
|
||||
...
|
||||
},
|
||||
"unmapped": [list of pdfIndex values with no detectable book page number]
|
||||
}
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
import pypdf
|
||||
|
||||
# Directory containing this script; outputs are written next to it.
HERE = Path(__file__).resolve().parent

# Locate the source PDF in the project root (three directory levels up).
# `next(..., None)` yields the first glob match, or None when the PDF is absent
# (it is gitignored, so a fresh checkout will not have it).
PDF = next(
    Path(__file__).resolve().parents[3].glob("Complete Spanish Step-By-Step*.pdf"),
    None,
)

# Destination for the extracted page-text JSON (schema in the module docstring).
OUT = HERE / "pdf_text.json"

# Roman-numeral page labels (front matter).
# NOTE(review): appears unused — detect_book_page() only matches Arabic
# numerals, so Roman-numbered front-matter pages land in "unmapped". Confirm
# whether front-matter mapping is intended, or drop this constant.
ROMAN_RE = re.compile(r"^[ivxlcdmIVXLCDM]+$")
# Match a page number on its own line at top/bottom of the page.
# The book uses Arabic numerals for main chapters (e.g., "3") and Roman for front matter.
# NOTE(review): also appears unused — detect_book_page() and main() use inline
# regexes with the same digit pattern instead of this compiled one.
PAGE_NUM_LINE_RE = re.compile(r"^\s*(\d{1,4})\s*$", re.MULTILINE)
|
||||
|
||||
|
||||
def detect_book_page(text: str) -> "int | None":
    """Find the printed (book) page number on one PDF page.

    Scans the first two and last two non-blank lines of the extracted text
    for a line that consists of nothing but a 1-4 digit Arabic numeral —
    the position and shape of the book's printed folio.

    Args:
        text: Raw text extracted from a single PDF page.

    Returns:
        The printed page number as an int, or None when no standalone
        numeral line is found (e.g. Roman-numbered front matter).
    """
    candidates = [line.strip() for line in text.splitlines() if line.strip()]
    # Folios sit at the very top or bottom of the page, so only the first
    # and last two non-blank lines are plausible candidates.
    for candidate in candidates[:2] + candidates[-2:]:
        # fullmatch on an already-stripped line == the original ^(\d{1,4})$.
        match = re.fullmatch(r"\d{1,4}", candidate)
        if match:
            return int(match.group(0))
    return None
|
||||
|
||||
|
||||
def main() -> None:
    """Extract text from every PDF page and write the book-page map.

    Reads the source PDF located by ``PDF``, detects each page's printed
    (book) page number via :func:`detect_book_page`, and writes the result
    to ``pdf_text.json`` keyed by book page. Pages with no detectable
    number are inferred as previous-page + 1 once an anchor exists;
    pages before the first anchor are recorded in ``unmapped``.
    """
    if PDF is None:
        # The source PDF is gitignored; nothing to do on a fresh checkout.
        print("No PDF found in project root")
        return

    print(f"Reading {PDF.name}")
    reader = pypdf.PdfReader(str(PDF))
    pages = reader.pages
    print(f"PDF has {len(pages)} pages")

    by_book_page: "dict[str, dict]" = {}
    unmapped: "list[int]" = []
    last_seen: "int | None" = None
    missed_count = 0

    # Compiled once, outside the per-page loop: strips any standalone
    # 1-4 digit line (the printed folio) from the page body.
    folio_line = re.compile(r"(?m)^\s*\d{1,4}\s*$")

    for i, page in enumerate(pages):
        text = page.extract_text() or ""
        book_page = detect_book_page(text)

        if book_page is None:
            # Carry forward sequence: if we saw page N last, assume N+1.
            if last_seen is not None:
                book_page = last_seen + 1
                missed_count += 1
            else:
                # No anchor yet (front matter before the first numbered page).
                unmapped.append(i)
                continue
        last_seen = book_page
        # Strip the detected page number from text to clean the output.
        cleaned = folio_line.sub("", text).strip()
        by_book_page[str(book_page)] = {
            "text": cleaned,
            "pdfIndex": i,
        }

    out = {
        "pdfPageCount": len(pages),
        "bookPages": by_book_page,
        "unmapped": unmapped,
        "inferredPages": missed_count,
    }
    # encoding="utf-8" is required: ensure_ascii=False emits raw non-ASCII
    # characters (Spanish accents), which would raise UnicodeEncodeError
    # under a non-UTF-8 locale default (e.g. Windows cp1252).
    OUT.write_text(json.dumps(out, ensure_ascii=False), encoding="utf-8")
    print(f"Mapped {len(by_book_page)} book pages; {missed_count} inferred; {len(unmapped)} unmapped")
    print(f"Wrote {OUT}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user