Files
Spanish/Conjuga/Scripts/textbook/fix_vocab.py
Trey T 63dfc5e41a Add textbook reader, exercise grading, stem-change toggle, extraction pipeline
Major changes:
- Textbook UI: chapter list, reader, and interactive exercise view (keyboard
  + Apple Pencil) surfaced under the Course tab. 30 chapters, 251 exercises.
- Stem-change conjugation toggle on Week 4 flashcard decks (E-IE, E-I, O-UE).
  Uses existing VerbForm + IrregularSpan data to render highlighted present
  tense conjugations inline.
- Deterministic on-device answer grader with partial credit (correct / close
  for accent-stripped or single-char-typo / wrong). 11 unit tests cover it.
- SharedModels: TextbookChapter (local), TextbookExerciseAttempt (cloud-
  synced), AnswerGrader helpers. Bumped schema.
- DataLoader: textbook seeder (version 8) + refresh helpers that preserve
  LanGo course decks when textbook data is re-seeded.
- Local extraction pipeline in Conjuga/Scripts/textbook/ — XHTML chapter
  parser, answer-key parser, macOS Vision image OCR + PDF page OCR, merger,
  NSSpellChecker validator, language-aware auto-fixer, and repair pass that
  re-pairs quarantined vocab rows using bounding-box coordinates.
- UI test target (ConjugaUITests) with three tests: end-to-end textbook
  flow, all-chapters screenshot audit, and stem-change toggle verification.

Generated textbook content (textbook_data.json, textbook_vocab.json) and
third-party source files are gitignored — re-run Scripts/textbook/run_pipeline.sh
locally to regenerate.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 15:12:55 -05:00

250 lines
9.3 KiB
Python

#!/usr/bin/env python3
"""Apply high-confidence auto-fixes from vocab_validation.json to vocab_cards.json.
Auto-fix rules (conservative):
1. If a flagged word has exactly one suggestion AND that suggestion differs by
<= 2 characters AND has the same starting letter (high-confidence character swap).
2. If a card is detected as reversed (Spanish on EN side, English on ES side),
swap front/back.
Cards that aren't auto-fixable end up in manual_review.json.
"""
import json
import re
import unicodedata
from pathlib import Path
# All inputs and outputs live next to this script.
HERE = Path(__file__).resolve().parent
VOCAB = HERE / "vocab_cards.json"  # input: extracted vocab cards
VALIDATION = HERE / "vocab_validation.json"  # input: spell-check flag report
# NOTE: OUT_VOCAB is the same path as VOCAB — fixes are written back in place.
OUT_VOCAB = HERE / "vocab_cards.json"
OUT_REVIEW = HERE / "manual_review.json"  # cards with unresolved flagged words
OUT_QUARANTINE = HERE / "quarantined_cards.json"  # mis-paired cards removed from the deck
def _strip_accents(s: str) -> str:
return "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")
def _levenshtein(a: str, b: str) -> int:
if a == b: return 0
if not a: return len(b)
if not b: return len(a)
prev = list(range(len(b) + 1))
for i, ca in enumerate(a, 1):
curr = [i]
for j, cb in enumerate(b, 1):
cost = 0 if ca == cb else 1
curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + cost))
prev = curr
return prev[-1]
# Heuristic signals used to guess which language a card side is written in.
SPANISH_ACCENT_RE = re.compile(r"[áéíóúñüÁÉÍÓÚÑÜ¿¡]")
SPANISH_ARTICLES = {"el", "la", "los", "las", "un", "una", "unos", "unas"}
ENGLISH_STARTERS = {"the", "a", "an", "to", "my", "his", "her", "our", "their"}


def language_score(s: str) -> "tuple[int, int]":
    """Return (es_score, en_score) for a string."""
    es_score = 0
    en_score = 0
    # Characters exclusive to Spanish are the strongest single signal.
    if SPANISH_ACCENT_RE.search(s):
        es_score += 3
    words = s.lower().split()
    if not words:
        return (es_score, en_score)
    # The leading word (article / determiner) hints at the language.
    leading = words[0].strip(",.;:")
    if leading in SPANISH_ARTICLES:
        es_score += 2
    if leading in ENGLISH_STARTERS:
        en_score += 2
    # Language-typical suffixes anywhere in the string add weaker evidence.
    for raw in words:
        token = raw.strip(",.;:")
        if not token:
            continue
        es_score += token.endswith(("ción", "sión", "dad", "tud"))
        en_score += token.endswith(("ing", "tion", "ness", "ment", "able", "ly"))
    return (es_score, en_score)
def is_reversed(front: str, back: str) -> bool:
    """True when front looks like English and back looks like Spanish (i.e. swapped)."""
    front_es, front_en = language_score(front)
    back_es, back_en = language_score(back)
    # Require BOTH sides to lean the wrong way before calling it a swap.
    front_leans_english = front_en > front_es
    back_leans_spanish = back_es > back_en
    return front_leans_english and back_leans_spanish
def best_replacement(word: str, suggestions: list) -> "str | None":
    """Pick the one safe correction, or None to leave it alone."""
    if not suggestions:
        return None
    # Suggestions that keep the word's initial letter are far more trustworthy
    # (typical OCR errors hit the middle of a word, not its start).
    initial_matches = [
        s for s in suggestions
        if s and word and s[0].lower() == word[0].lower()
    ]
    pool = initial_matches if initial_matches else suggestions
    best_candidate = None
    best_distance = 99  # sentinel larger than any accepted distance
    for candidate in pool:
        distance = _levenshtein(word.lower(), candidate.lower())
        # Skip no-op suggestions and ones that change too much to trust.
        if distance == 0 or distance > 2:
            continue
        if distance < best_distance:
            best_candidate = candidate
            best_distance = distance
    return best_candidate
def side_language_match(text: str, expected_side: str) -> bool:
    """Return True when `text` looks like the expected language (es/en).
    Guards against applying Spanish spell-fix to English words on a mis-paired card.
    """
    es_score, en_score = language_score(text)
    if expected_side == "es":
        # Demand a clear Spanish signal before touching the Spanish side.
        return es_score > en_score
    if expected_side == "en":
        # English text often carries no strong signal, so a tie is acceptable.
        return en_score >= es_score
    # Unknown side label: never match.
    return False
def apply_word_fixes(text: str, bad_words: list, expected_side: str) -> "tuple[str, list]":
    """Apply word-level corrections inside a string. Skips fixes entirely when
    the side's actual language doesn't match the dictionary used, to avoid
    corrupting mis-paired cards.

    Returns (new_text, applied) where `applied` is a list of
    {"from": original_word, "to": replacement} records, one per fix made.
    """
    if not side_language_match(text, expected_side):
        return (text, [])
    new_text = text
    applied = []
    for bw in bad_words:
        word = bw["word"]
        replacement = best_replacement(word, bw["suggestions"])
        if replacement is None:
            continue
        escaped = re.escape(word)
        # Match the standalone word (letter boundaries on both sides). Only
        # when the replacement itself ends with a period (e.g. `Uds` -> `Uds.`)
        # do we also consume an existing adjacent `.` so we don't emit `Uds..`.
        # BUG FIX: the old unconditional `\.?` swallowed sentence-ending
        # periods for every other replacement, silently deleting them.
        trailing = r"\.?" if replacement.endswith(".") else ""
        pattern = re.compile(rf"(?<![A-Za-zÁ-ú]){escaped}{trailing}(?![A-Za-zÁ-ú])")
        if pattern.search(new_text):
            # Use a callable so any backslashes / group references inside the
            # suggestion are inserted literally, not interpreted by re.sub.
            new_text = pattern.sub(lambda _m: replacement, new_text, count=1)
            applied.append({"from": word, "to": replacement})
    return (new_text, applied)
def main() -> None:
    """Run the auto-fix pass: swap reversed cards, quarantine mis-paired ones,
    apply word-level spell fixes, and write the three output JSON files."""
    # Load the current card deck and the validator's flag report.
    vocab_data = json.loads(VOCAB.read_text(encoding="utf-8"))
    val_data = json.loads(VALIDATION.read_text(encoding="utf-8"))
    # Index validation by (chapter, front, back, sourceImage) for lookup
    val_index: dict = {}
    for f in val_data["flags"]:
        key = (f["chapter"], f["front"], f["back"], f["sourceImage"])
        val_index[key] = f
    # Walk the cards in place
    auto_fixed_word = 0  # total word-level corrections applied
    auto_swapped = 0  # cards whose front/back were reversed and swapped
    quarantined = 0  # flagged cards removed as mis-paired
    manual_review_cards = []
    quarantined_cards = []
    for ch in vocab_data["chapters"]:
        kept_cards = []
        for card in ch["cards"]:
            # Key uses the card's sides BEFORE any swap below, matching how
            # the validator originally saw the card.
            key = (ch["chapter"], card["front"], card["back"], card.get("sourceImage", ""))
            flag = val_index.get(key)
            # 1) Reversal swap (apply even when not flagged)
            if is_reversed(card["front"], card["back"]):
                card["front"], card["back"] = card["back"], card["front"]
                auto_swapped += 1
                # Re-key for any further validation lookup (no-op here)
            if flag is None:
                # Unflagged cards pass straight through (possibly swapped above).
                kept_cards.append(card)
                continue
            # Quarantine obvious mis-pairs: both sides same language OR language mismatch
            fes, fen = language_score(card["front"])
            bes, ben = language_score(card["back"])
            front_lang = "es" if fes > fen else ("en" if fen > fes else "unknown")
            back_lang = "es" if bes > ben else ("en" if ben > bes else "unknown")
            # A good card has front=es, back=en. Anything else when the card is
            # flagged is almost always a column-pairing error.
            if front_lang != "es" or back_lang != "en":
                quarantined_cards.append({
                    "chapter": ch["chapter"],
                    "front": card["front"],
                    "back": card["back"],
                    "sourceImage": card.get("sourceImage", ""),
                    "reason": f"language-mismatch front={front_lang} back={back_lang}",
                })
                quarantined += 1
                # Drop the card from the active deck entirely.
                continue
            # 2) Word-level fixes (language-aware)
            new_front, applied_front = apply_word_fixes(card["front"], flag["badFront"], "es")
            new_back, applied_back = apply_word_fixes(card["back"], flag["badBack"], "en")
            card["front"] = new_front
            card["back"] = new_back
            auto_fixed_word += len(applied_front) + len(applied_back)
            # If after auto-fix there are STILL flagged words with no
            # confident replacement, flag for manual review.
            unresolved_front = [
                bw for bw in flag["badFront"]
                if not any(a["from"] == bw["word"] for a in applied_front)
                and best_replacement(bw["word"], bw["suggestions"]) is None
            ]
            unresolved_back = [
                bw for bw in flag["badBack"]
                if not any(a["from"] == bw["word"] for a in applied_back)
                and best_replacement(bw["word"], bw["suggestions"]) is None
            ]
            if unresolved_front or unresolved_back:
                # Card stays in the deck, but is also listed for a human pass.
                manual_review_cards.append({
                    "chapter": ch["chapter"],
                    "front": card["front"],
                    "back": card["back"],
                    "sourceImage": card.get("sourceImage", ""),
                    "unresolvedFront": unresolved_front,
                    "unresolvedBack": unresolved_back,
                })
            kept_cards.append(card)
        ch["cards"] = kept_cards
    # Overwrite vocab_cards.json in place, then emit the two report files.
    OUT_VOCAB.write_text(json.dumps(vocab_data, ensure_ascii=False, indent=2))
    OUT_REVIEW.write_text(json.dumps({
        "totalManualReview": len(manual_review_cards),
        "cards": manual_review_cards,
    }, ensure_ascii=False, indent=2))
    OUT_QUARANTINE.write_text(json.dumps({
        "totalQuarantined": len(quarantined_cards),
        "cards": quarantined_cards,
    }, ensure_ascii=False, indent=2))
    # Summary for the pipeline log.
    total_cards = sum(len(c["cards"]) for c in vocab_data["chapters"])
    print(f"Active cards (after quarantine): {total_cards}")
    print(f"Auto-swapped (reversed): {auto_swapped}")
    print(f"Auto-fixed words: {auto_fixed_word}")
    print(f"Quarantined (mis-paired): {quarantined}")
    print(f"Cards needing manual review: {len(manual_review_cards)}")
    print(f"Wrote {OUT_VOCAB}")
    print(f"Wrote {OUT_REVIEW}")
    print(f"Wrote {OUT_QUARANTINE}")
if __name__ == "__main__":
    main()