Add textbook reader, exercise grading, stem-change toggle, extraction pipeline
Major changes: - Textbook UI: chapter list, reader, and interactive exercise view (keyboard + Apple Pencil) surfaced under the Course tab. 30 chapters, 251 exercises. - Stem-change conjugation toggle on Week 4 flashcard decks (E-IE, E-I, O-UE). Uses existing VerbForm + IrregularSpan data to render highlighted present tense conjugations inline. - Deterministic on-device answer grader with partial credit (correct / close for accent-stripped or single-char-typo / wrong). 11 unit tests cover it. - SharedModels: TextbookChapter (local), TextbookExerciseAttempt (cloud- synced), AnswerGrader helpers. Bumped schema. - DataLoader: textbook seeder (version 8) + refresh helpers that preserve LanGo course decks when textbook data is re-seeded. - Local extraction pipeline in Conjuga/Scripts/textbook/ — XHTML chapter parser, answer-key parser, macOS Vision image OCR + PDF page OCR, merger, NSSpellChecker validator, language-aware auto-fixer, and repair pass that re-pairs quarantined vocab rows using bounding-box coordinates. - UI test target (ConjugaUITests) with three tests: end-to-end textbook flow, all-chapters screenshot audit, and stem-change toggle verification. Generated textbook content (textbook_data.json, textbook_vocab.json) and third-party source files are gitignored — re-run Scripts/textbook/run_pipeline.sh locally to regenerate. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
249
Conjuga/Scripts/textbook/fix_vocab.py
Normal file
249
Conjuga/Scripts/textbook/fix_vocab.py
Normal file
@@ -0,0 +1,249 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Apply high-confidence auto-fixes from vocab_validation.json to vocab_cards.json.
|
||||
|
||||
Auto-fix rules (conservative):
|
||||
1. If a flagged word has exactly one suggestion AND that suggestion differs by
|
||||
<= 2 characters AND has the same starting letter (high-confidence character swap).
|
||||
2. If a card is detected as reversed (Spanish on EN side, English on ES side),
|
||||
swap front/back.
|
||||
|
||||
Cards that aren't auto-fixable end up in manual_review.json.
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
import unicodedata
|
||||
from pathlib import Path
|
||||
|
||||
# Pipeline file locations, all resolved relative to this script's directory.
HERE = Path(__file__).resolve().parent
VOCAB = HERE / "vocab_cards.json"            # input: extracted vocab cards
VALIDATION = HERE / "vocab_validation.json"  # input: spell-check flags per card
# NOTE: OUT_VOCAB deliberately equals VOCAB — the fixed cards overwrite the
# input file in place.
OUT_VOCAB = HERE / "vocab_cards.json"
OUT_REVIEW = HERE / "manual_review.json"          # output: cards needing a human pass
OUT_QUARANTINE = HERE / "quarantined_cards.json"  # output: mis-paired cards removed from the deck
|
||||
|
||||
|
||||
def _strip_accents(s: str) -> str:
|
||||
return "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")
|
||||
|
||||
|
||||
def _levenshtein(a: str, b: str) -> int:
|
||||
if a == b: return 0
|
||||
if not a: return len(b)
|
||||
if not b: return len(a)
|
||||
prev = list(range(len(b) + 1))
|
||||
for i, ca in enumerate(a, 1):
|
||||
curr = [i]
|
||||
for j, cb in enumerate(b, 1):
|
||||
cost = 0 if ca == cb else 1
|
||||
curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + cost))
|
||||
prev = curr
|
||||
return prev[-1]
|
||||
|
||||
|
||||
SPANISH_ACCENT_RE = re.compile(r"[áéíóúñüÁÉÍÓÚÑÜ¿¡]")
SPANISH_ARTICLES = {"el", "la", "los", "las", "un", "una", "unos", "unas"}
ENGLISH_STARTERS = {"the", "a", "an", "to", "my", "his", "her", "our", "their"}

# Word endings that are characteristic of each language.
_ES_SUFFIXES = ("ción", "sión", "dad", "tud")
_EN_SUFFIXES = ("ing", "tion", "ness", "ment", "able", "ly")


def language_score(s: str) -> "tuple[int, int]":
    """Return a heuristic (es_score, en_score) pair for a string.

    Signals, strongest first: Spanish-only characters (+3 es), a leading
    article/starter word (+2 for its language), and per-word suffixes (+1).
    """
    es_score = 0
    en_score = 0

    # Accented characters / inverted punctuation only occur in Spanish.
    if SPANISH_ACCENT_RE.search(s):
        es_score += 3

    tokens = s.lower().split()
    if not tokens:
        return (es_score, en_score)

    leading = tokens[0].strip(",.;:")
    if leading in SPANISH_ARTICLES:
        es_score += 2
    if leading in ENGLISH_STARTERS:
        en_score += 2

    # Language-typical endings anywhere in the string.
    for token in tokens:
        token = token.strip(",.;:")
        if not token:
            continue
        if token.endswith(_ES_SUFFIXES):
            es_score += 1
        if token.endswith(_EN_SUFFIXES):
            en_score += 1
    return (es_score, en_score)
|
||||
|
||||
|
||||
def is_reversed(front: str, back: str) -> bool:
    """True when front looks like English and back looks like Spanish (i.e. swapped)."""
    front_es, front_en = language_score(front)
    back_es, back_en = language_score(back)
    # Require BOTH sides to lean the wrong way before declaring a swap.
    front_looks_english = front_en > front_es
    back_looks_spanish = back_es > back_en
    return front_looks_english and back_looks_spanish
|
||||
|
||||
|
||||
def best_replacement(word: str, suggestions: list) -> "str | None":
    """Pick the one safe spelling correction for *word*, or None to leave it alone.

    Conservative policy: prefer suggestions sharing the word's first letter,
    and only accept an edit distance of 1 or 2 (case-insensitive).
    """
    if not suggestions:
        return None

    # A shared initial is a strong hint the suggestion is a minor typo repair
    # rather than an unrelated word.
    matching_initial = [
        s for s in suggestions
        if s and word and s[0].lower() == word[0].lower()
    ]
    pool = matching_initial if matching_initial else suggestions

    winner = None
    winner_distance = 99
    for candidate in pool:
        distance = _levenshtein(word.lower(), candidate.lower())
        # distance 0: nothing to fix; distance > 2: too risky to apply.
        if distance == 0 or distance > 2:
            continue
        if distance < winner_distance:
            winner = candidate
            winner_distance = distance
    return winner
|
||||
|
||||
|
||||
def side_language_match(text: str, expected_side: str) -> bool:
    """Return True when `text` looks like the expected language ("es" or "en").

    Guards against applying a Spanish spell-fix to English words on a
    mis-paired card (and vice versa).
    """
    es_score, en_score = language_score(text)
    if expected_side == "es":
        # Demand a clear Spanish signal before trusting the Spanish dictionary.
        return es_score > en_score
    if expected_side == "en":
        # English text often carries no strong markers, so a tie is acceptable.
        return en_score >= es_score
    return False
|
||||
|
||||
|
||||
def apply_word_fixes(text: str, bad_words: list, expected_side: str) -> "tuple[str, list]":
    """Apply word-level spelling corrections inside a string.

    Skips all fixes when the side's actual language doesn't match the
    dictionary used (``expected_side`` is "es" or "en"), to avoid corrupting
    mis-paired cards.

    Returns ``(new_text, applied)`` where *applied* is a list of
    ``{"from": original_word, "to": replacement}`` records, one per fix.
    """
    if not side_language_match(text, expected_side):
        return (text, [])

    new_text = text
    applied = []
    for bw in bad_words:
        word = bw["word"]
        replacement = best_replacement(word, bw["suggestions"])
        if replacement is None:
            continue
        # Match the standalone word plus an optional trailing period so that
        # e.g. `Uds` adjacent to `.` is replaced by `Uds.` without duplicating
        # the dot. NOTE(review): the boundary class [A-Za-zÁ-ú] also spans
        # U+00D7 (×) and U+00F7 (÷); harmless for vocab text, kept as-is.
        escaped = re.escape(word)
        pattern = re.compile(rf"(?<![A-Za-zÁ-ú]){escaped}\.?(?![A-Za-zÁ-ú])")
        if pattern.search(new_text):
            # BUGFIX: use a callable replacement so backslashes or group
            # references inside `replacement` are inserted literally instead
            # of being parsed as regex escape sequences by re.sub.
            new_text = pattern.sub(lambda _m: replacement, new_text, count=1)
            applied.append({"from": word, "to": replacement})
    return (new_text, applied)
|
||||
|
||||
|
||||
def main() -> None:
    """Run the auto-fix pass: swap reversed cards, quarantine mis-pairs,
    apply high-confidence word fixes, and write the three output JSON files.

    Reads VOCAB and VALIDATION, rewrites OUT_VOCAB in place (same path as
    VOCAB), and emits OUT_REVIEW / OUT_QUARANTINE plus a summary to stdout.
    """
    vocab_data = json.loads(VOCAB.read_text(encoding="utf-8"))
    val_data = json.loads(VALIDATION.read_text(encoding="utf-8"))

    # Index validation by (chapter, front, back, sourceImage) for lookup
    val_index: dict = {}
    for f in val_data["flags"]:
        key = (f["chapter"], f["front"], f["back"], f["sourceImage"])
        val_index[key] = f

    # Walk the cards in place
    auto_fixed_word = 0       # count of individual word replacements applied
    auto_swapped = 0          # count of front/back reversals corrected
    quarantined = 0           # count of cards removed as mis-paired
    manual_review_cards = []  # cards kept, but with unresolved flagged words
    quarantined_cards = []    # cards dropped from the active deck

    for ch in vocab_data["chapters"]:
        kept_cards = []
        for card in ch["cards"]:
            # Key must be computed BEFORE any swap: the validation file was
            # generated against the original (pre-swap) front/back values.
            key = (ch["chapter"], card["front"], card["back"], card.get("sourceImage", ""))
            flag = val_index.get(key)

            # 1) Reversal swap (apply even when not flagged)
            if is_reversed(card["front"], card["back"]):
                card["front"], card["back"] = card["back"], card["front"]
                auto_swapped += 1
                # Re-key for any further validation lookup (no-op here)

            # Unflagged cards pass through untouched (possibly swapped above).
            if flag is None:
                kept_cards.append(card)
                continue

            # Quarantine obvious mis-pairs: both sides same language OR language mismatch
            fes, fen = language_score(card["front"])
            bes, ben = language_score(card["back"])
            front_lang = "es" if fes > fen else ("en" if fen > fes else "unknown")
            back_lang = "es" if bes > ben else ("en" if ben > bes else "unknown")
            # A good card has front=es, back=en. Anything else when the card is
            # flagged is almost always a column-pairing error.
            if front_lang != "es" or back_lang != "en":
                quarantined_cards.append({
                    "chapter": ch["chapter"],
                    "front": card["front"],
                    "back": card["back"],
                    "sourceImage": card.get("sourceImage", ""),
                    "reason": f"language-mismatch front={front_lang} back={back_lang}",
                })
                quarantined += 1
                continue

            # 2) Word-level fixes (language-aware)
            new_front, applied_front = apply_word_fixes(card["front"], flag["badFront"], "es")
            new_back, applied_back = apply_word_fixes(card["back"], flag["badBack"], "en")
            card["front"] = new_front
            card["back"] = new_back
            auto_fixed_word += len(applied_front) + len(applied_back)

            # If after auto-fix there are STILL flagged words with no
            # confident replacement, flag for manual review.
            unresolved_front = [
                bw for bw in flag["badFront"]
                if not any(a["from"] == bw["word"] for a in applied_front)
                and best_replacement(bw["word"], bw["suggestions"]) is None
            ]
            unresolved_back = [
                bw for bw in flag["badBack"]
                if not any(a["from"] == bw["word"] for a in applied_back)
                and best_replacement(bw["word"], bw["suggestions"]) is None
            ]
            if unresolved_front or unresolved_back:
                manual_review_cards.append({
                    "chapter": ch["chapter"],
                    "front": card["front"],
                    "back": card["back"],
                    "sourceImage": card.get("sourceImage", ""),
                    "unresolvedFront": unresolved_front,
                    "unresolvedBack": unresolved_back,
                })
            # Flagged-but-fixable cards stay in the deck (manual review is
            # advisory, not a removal).
            kept_cards.append(card)

        ch["cards"] = kept_cards

    # Persist results: fixed deck (overwrites input), review list, quarantine.
    OUT_VOCAB.write_text(json.dumps(vocab_data, ensure_ascii=False, indent=2))
    OUT_REVIEW.write_text(json.dumps({
        "totalManualReview": len(manual_review_cards),
        "cards": manual_review_cards,
    }, ensure_ascii=False, indent=2))

    OUT_QUARANTINE.write_text(json.dumps({
        "totalQuarantined": len(quarantined_cards),
        "cards": quarantined_cards,
    }, ensure_ascii=False, indent=2))

    # Summary for the pipeline log.
    total_cards = sum(len(c["cards"]) for c in vocab_data["chapters"])
    print(f"Active cards (after quarantine): {total_cards}")
    print(f"Auto-swapped (reversed): {auto_swapped}")
    print(f"Auto-fixed words: {auto_fixed_word}")
    print(f"Quarantined (mis-paired): {quarantined}")
    print(f"Cards needing manual review: {len(manual_review_cards)}")
    print(f"Wrote {OUT_VOCAB}")
    print(f"Wrote {OUT_REVIEW}")
    print(f"Wrote {OUT_QUARANTINE}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user