Add textbook reader, exercise grading, stem-change toggle, extraction pipeline
Major changes: - Textbook UI: chapter list, reader, and interactive exercise view (keyboard + Apple Pencil) surfaced under the Course tab. 30 chapters, 251 exercises. - Stem-change conjugation toggle on Week 4 flashcard decks (E-IE, E-I, O-UE). Uses existing VerbForm + IrregularSpan data to render highlighted present tense conjugations inline. - Deterministic on-device answer grader with partial credit (correct / close for accent-stripped or single-char-typo / wrong). 11 unit tests cover it. - SharedModels: TextbookChapter (local), TextbookExerciseAttempt (cloud- synced), AnswerGrader helpers. Bumped schema. - DataLoader: textbook seeder (version 8) + refresh helpers that preserve LanGo course decks when textbook data is re-seeded. - Local extraction pipeline in Conjuga/Scripts/textbook/ — XHTML chapter parser, answer-key parser, macOS Vision image OCR + PDF page OCR, merger, NSSpellChecker validator, language-aware auto-fixer, and repair pass that re-pairs quarantined vocab rows using bounding-box coordinates. - UI test target (ConjugaUITests) with three tests: end-to-end textbook flow, all-chapters screenshot audit, and stem-change toggle verification. Generated textbook content (textbook_data.json, textbook_vocab.json) and third-party source files are gitignored — re-run Scripts/textbook/run_pipeline.sh locally to regenerate. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
249
Conjuga/Scripts/textbook/fix_vocab.py
Normal file
249
Conjuga/Scripts/textbook/fix_vocab.py
Normal file
@@ -0,0 +1,249 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Apply high-confidence auto-fixes from vocab_validation.json to vocab_cards.json.
|
||||
|
||||
Auto-fix rules (conservative):
|
||||
1. If a flagged word has exactly one suggestion AND that suggestion differs by
|
||||
<= 2 characters AND has the same starting letter (high-confidence character swap).
|
||||
2. If a card is detected as reversed (Spanish on EN side, English on ES side),
|
||||
swap front/back.
|
||||
|
||||
Cards that aren't auto-fixable end up in manual_review.json.
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
import unicodedata
|
||||
from pathlib import Path
|
||||
|
||||
# Pipeline file locations, all resolved relative to this script's directory.
HERE = Path(__file__).resolve().parent
VOCAB = HERE / "vocab_cards.json"            # input: extracted vocab cards
VALIDATION = HERE / "vocab_validation.json"  # input: spell-check flags per card
# NOTE: OUT_VOCAB deliberately equals VOCAB — the fixed cards overwrite the
# input file in place.
OUT_VOCAB = HERE / "vocab_cards.json"
OUT_REVIEW = HERE / "manual_review.json"          # output: cards needing a human pass
OUT_QUARANTINE = HERE / "quarantined_cards.json"  # output: mis-paired cards removed from the deck
|
||||
|
||||
|
||||
def _strip_accents(s: str) -> str:
|
||||
return "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")
|
||||
|
||||
|
||||
def _levenshtein(a: str, b: str) -> int:
|
||||
if a == b: return 0
|
||||
if not a: return len(b)
|
||||
if not b: return len(a)
|
||||
prev = list(range(len(b) + 1))
|
||||
for i, ca in enumerate(a, 1):
|
||||
curr = [i]
|
||||
for j, cb in enumerate(b, 1):
|
||||
cost = 0 if ca == cb else 1
|
||||
curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + cost))
|
||||
prev = curr
|
||||
return prev[-1]
|
||||
|
||||
|
||||
SPANISH_ACCENT_RE = re.compile(r"[áéíóúñüÁÉÍÓÚÑÜ¿¡]")
SPANISH_ARTICLES = {"el", "la", "los", "las", "un", "una", "unos", "unas"}
ENGLISH_STARTERS = {"the", "a", "an", "to", "my", "his", "her", "our", "their"}

# Word endings that are characteristic of each language.
_ES_SUFFIXES = ("ción", "sión", "dad", "tud")
_EN_SUFFIXES = ("ing", "tion", "ness", "ment", "able", "ly")


def language_score(s: str) -> "tuple[int, int]":
    """Return a heuristic (es_score, en_score) pair for a string.

    Signals, strongest first: Spanish-only characters (+3 es), a leading
    article/starter word (+2 for its language), and per-word suffixes (+1).
    """
    es_score = 0
    en_score = 0

    # Accented characters / inverted punctuation only occur in Spanish.
    if SPANISH_ACCENT_RE.search(s):
        es_score += 3

    tokens = s.lower().split()
    if not tokens:
        return (es_score, en_score)

    leading = tokens[0].strip(",.;:")
    if leading in SPANISH_ARTICLES:
        es_score += 2
    if leading in ENGLISH_STARTERS:
        en_score += 2

    # Language-typical endings anywhere in the string.
    for token in tokens:
        token = token.strip(",.;:")
        if not token:
            continue
        if token.endswith(_ES_SUFFIXES):
            es_score += 1
        if token.endswith(_EN_SUFFIXES):
            en_score += 1
    return (es_score, en_score)
|
||||
|
||||
|
||||
def is_reversed(front: str, back: str) -> bool:
    """True when front looks like English and back looks like Spanish (i.e. swapped)."""
    front_es, front_en = language_score(front)
    back_es, back_en = language_score(back)
    # Require BOTH sides to lean the wrong way before declaring a swap.
    front_looks_english = front_en > front_es
    back_looks_spanish = back_es > back_en
    return front_looks_english and back_looks_spanish
|
||||
|
||||
|
||||
def best_replacement(word: str, suggestions: list) -> "str | None":
    """Pick the one safe spelling correction for *word*, or None to leave it alone.

    Conservative policy: prefer suggestions sharing the word's first letter,
    and only accept an edit distance of 1 or 2 (case-insensitive).
    """
    if not suggestions:
        return None

    # A shared initial is a strong hint the suggestion is a minor typo repair
    # rather than an unrelated word.
    matching_initial = [
        s for s in suggestions
        if s and word and s[0].lower() == word[0].lower()
    ]
    pool = matching_initial if matching_initial else suggestions

    winner = None
    winner_distance = 99
    for candidate in pool:
        distance = _levenshtein(word.lower(), candidate.lower())
        # distance 0: nothing to fix; distance > 2: too risky to apply.
        if distance == 0 or distance > 2:
            continue
        if distance < winner_distance:
            winner = candidate
            winner_distance = distance
    return winner
|
||||
|
||||
|
||||
def side_language_match(text: str, expected_side: str) -> bool:
    """Return True when `text` looks like the expected language ("es" or "en").

    Guards against applying a Spanish spell-fix to English words on a
    mis-paired card (and vice versa).
    """
    es_score, en_score = language_score(text)
    if expected_side == "es":
        # Demand a clear Spanish signal before trusting the Spanish dictionary.
        return es_score > en_score
    if expected_side == "en":
        # English text often carries no strong markers, so a tie is acceptable.
        return en_score >= es_score
    return False
|
||||
|
||||
|
||||
def apply_word_fixes(text: str, bad_words: list, expected_side: str) -> "tuple[str, list]":
    """Apply word-level spelling corrections inside a string.

    Skips all fixes when the side's actual language doesn't match the
    dictionary used (``expected_side`` is "es" or "en"), to avoid corrupting
    mis-paired cards.

    Returns ``(new_text, applied)`` where *applied* is a list of
    ``{"from": original_word, "to": replacement}`` records, one per fix.
    """
    if not side_language_match(text, expected_side):
        return (text, [])

    new_text = text
    applied = []
    for bw in bad_words:
        word = bw["word"]
        replacement = best_replacement(word, bw["suggestions"])
        if replacement is None:
            continue
        # Match the standalone word plus an optional trailing period so that
        # e.g. `Uds` adjacent to `.` is replaced by `Uds.` without duplicating
        # the dot. NOTE(review): the boundary class [A-Za-zÁ-ú] also spans
        # U+00D7 (×) and U+00F7 (÷); harmless for vocab text, kept as-is.
        escaped = re.escape(word)
        pattern = re.compile(rf"(?<![A-Za-zÁ-ú]){escaped}\.?(?![A-Za-zÁ-ú])")
        if pattern.search(new_text):
            # BUGFIX: use a callable replacement so backslashes or group
            # references inside `replacement` are inserted literally instead
            # of being parsed as regex escape sequences by re.sub.
            new_text = pattern.sub(lambda _m: replacement, new_text, count=1)
            applied.append({"from": word, "to": replacement})
    return (new_text, applied)
|
||||
|
||||
|
||||
def main() -> None:
    """Run the auto-fix pass: swap reversed cards, quarantine mis-pairs,
    apply high-confidence word fixes, and write the three output JSON files.

    Reads VOCAB and VALIDATION, rewrites OUT_VOCAB in place (same path as
    VOCAB), and emits OUT_REVIEW / OUT_QUARANTINE plus a summary to stdout.
    """
    vocab_data = json.loads(VOCAB.read_text(encoding="utf-8"))
    val_data = json.loads(VALIDATION.read_text(encoding="utf-8"))

    # Index validation by (chapter, front, back, sourceImage) for lookup
    val_index: dict = {}
    for f in val_data["flags"]:
        key = (f["chapter"], f["front"], f["back"], f["sourceImage"])
        val_index[key] = f

    # Walk the cards in place
    auto_fixed_word = 0       # count of individual word replacements applied
    auto_swapped = 0          # count of front/back reversals corrected
    quarantined = 0           # count of cards removed as mis-paired
    manual_review_cards = []  # cards kept, but with unresolved flagged words
    quarantined_cards = []    # cards dropped from the active deck

    for ch in vocab_data["chapters"]:
        kept_cards = []
        for card in ch["cards"]:
            # Key must be computed BEFORE any swap: the validation file was
            # generated against the original (pre-swap) front/back values.
            key = (ch["chapter"], card["front"], card["back"], card.get("sourceImage", ""))
            flag = val_index.get(key)

            # 1) Reversal swap (apply even when not flagged)
            if is_reversed(card["front"], card["back"]):
                card["front"], card["back"] = card["back"], card["front"]
                auto_swapped += 1
                # Re-key for any further validation lookup (no-op here)

            # Unflagged cards pass through untouched (possibly swapped above).
            if flag is None:
                kept_cards.append(card)
                continue

            # Quarantine obvious mis-pairs: both sides same language OR language mismatch
            fes, fen = language_score(card["front"])
            bes, ben = language_score(card["back"])
            front_lang = "es" if fes > fen else ("en" if fen > fes else "unknown")
            back_lang = "es" if bes > ben else ("en" if ben > bes else "unknown")
            # A good card has front=es, back=en. Anything else when the card is
            # flagged is almost always a column-pairing error.
            if front_lang != "es" or back_lang != "en":
                quarantined_cards.append({
                    "chapter": ch["chapter"],
                    "front": card["front"],
                    "back": card["back"],
                    "sourceImage": card.get("sourceImage", ""),
                    "reason": f"language-mismatch front={front_lang} back={back_lang}",
                })
                quarantined += 1
                continue

            # 2) Word-level fixes (language-aware)
            new_front, applied_front = apply_word_fixes(card["front"], flag["badFront"], "es")
            new_back, applied_back = apply_word_fixes(card["back"], flag["badBack"], "en")
            card["front"] = new_front
            card["back"] = new_back
            auto_fixed_word += len(applied_front) + len(applied_back)

            # If after auto-fix there are STILL flagged words with no
            # confident replacement, flag for manual review.
            unresolved_front = [
                bw for bw in flag["badFront"]
                if not any(a["from"] == bw["word"] for a in applied_front)
                and best_replacement(bw["word"], bw["suggestions"]) is None
            ]
            unresolved_back = [
                bw for bw in flag["badBack"]
                if not any(a["from"] == bw["word"] for a in applied_back)
                and best_replacement(bw["word"], bw["suggestions"]) is None
            ]
            if unresolved_front or unresolved_back:
                manual_review_cards.append({
                    "chapter": ch["chapter"],
                    "front": card["front"],
                    "back": card["back"],
                    "sourceImage": card.get("sourceImage", ""),
                    "unresolvedFront": unresolved_front,
                    "unresolvedBack": unresolved_back,
                })
            # Flagged-but-fixable cards stay in the deck (manual review is
            # advisory, not a removal).
            kept_cards.append(card)

        ch["cards"] = kept_cards

    # Persist results: fixed deck (overwrites input), review list, quarantine.
    OUT_VOCAB.write_text(json.dumps(vocab_data, ensure_ascii=False, indent=2))
    OUT_REVIEW.write_text(json.dumps({
        "totalManualReview": len(manual_review_cards),
        "cards": manual_review_cards,
    }, ensure_ascii=False, indent=2))

    OUT_QUARANTINE.write_text(json.dumps({
        "totalQuarantined": len(quarantined_cards),
        "cards": quarantined_cards,
    }, ensure_ascii=False, indent=2))

    # Summary for the pipeline log.
    total_cards = sum(len(c["cards"]) for c in vocab_data["chapters"])
    print(f"Active cards (after quarantine): {total_cards}")
    print(f"Auto-swapped (reversed): {auto_swapped}")
    print(f"Auto-fixed words: {auto_fixed_word}")
    print(f"Quarantined (mis-paired): {quarantined}")
    print(f"Cards needing manual review: {len(manual_review_cards)}")
    print(f"Wrote {OUT_VOCAB}")
    print(f"Wrote {OUT_REVIEW}")
    print(f"Wrote {OUT_QUARANTINE}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user