#!/usr/bin/env python3
"""Apply high-confidence auto-fixes from vocab_validation.json to vocab_cards.json.

Auto-fix rules (conservative):

1. If a flagged word has exactly one suggestion AND that suggestion differs
   by <= 2 characters AND has the same starting letter (high-confidence
   character swap).
2. If a card is detected as reversed (Spanish on EN side, English on ES
   side), swap front/back.

Cards that aren't auto-fixable end up in manual_review.json.
"""

import json
import re
import unicodedata
from pathlib import Path

HERE = Path(__file__).resolve().parent
VOCAB = HERE / "vocab_cards.json"
VALIDATION = HERE / "vocab_validation.json"
OUT_VOCAB = HERE / "vocab_cards.json"
OUT_REVIEW = HERE / "manual_review.json"
OUT_QUARANTINE = HERE / "quarantined_cards.json"


def _strip_accents(s: str) -> str:
    """Return *s* with combining accent marks removed (via NFD decomposition)."""
    return "".join(
        c for c in unicodedata.normalize("NFD", s)
        if unicodedata.category(c) != "Mn"
    )


def _levenshtein(a: str, b: str) -> int:
    """Return the edit distance between *a* and *b* (two-row DP)."""
    if a == b:
        return 0
    if not a:
        return len(b)
    if not b:
        return len(a)
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            cost = 0 if ca == cb else 1
            curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + cost))
        prev = curr
    return prev[-1]


SPANISH_ACCENT_RE = re.compile(r"[áéíóúñüÁÉÍÓÚÑÜ¿¡]")
SPANISH_ARTICLES = {"el", "la", "los", "las", "un", "una", "unos", "unas"}
ENGLISH_STARTERS = {"the", "a", "an", "to", "my", "his", "her", "our", "their"}


def language_score(s: str) -> "tuple[int, int]":
    """Return (es_score, en_score) heuristic language scores for a string."""
    es = 0
    en = 0
    # Accented chars / inverted punctuation are a strong Spanish signal.
    if SPANISH_ACCENT_RE.search(s):
        es += 3
    words = s.lower().split()
    if not words:
        return (es, en)
    first = words[0].strip(",.;:")
    if first in SPANISH_ARTICLES:
        es += 2
    if first in ENGLISH_STARTERS:
        en += 2
    # Language-typical suffixes anywhere in the phrase.
    for w in words:
        w = w.strip(",.;:")
        if not w:
            continue
        if w.endswith(("ción", "sión", "dad", "tud")):
            es += 1
        if w.endswith(("ing", "tion", "ness", "ment", "able", "ly")):
            en += 1
    return (es, en)


def is_reversed(front: str, back: str) -> bool:
    """True when front looks like English and back looks like Spanish (i.e. swapped)."""
    fes, fen = language_score(front)
    bes, ben = language_score(back)
    # Front English-leaning AND back Spanish-leaning
    return fen > fes and bes > ben


def best_replacement(word: str, suggestions: list) -> "str | None":
    """Pick the one safe correction, or None to leave it alone."""
    if not suggestions:
        return None
    # Prefer suggestions that share the same first letter.
    same_initial = [
        s for s in suggestions
        if s and word and s[0].lower() == word[0].lower()
    ]
    candidates = same_initial or suggestions
    # Single best: shortest non-zero edit distance, capped at 2.
    best = None
    best_d = 99
    for s in candidates:
        d = _levenshtein(word.lower(), s.lower())
        if d == 0:
            # Identical modulo case — nothing to fix.
            continue
        if d > 2:
            # Don't apply if the "fix" changes too much.
            continue
        if d < best_d:
            best = s
            best_d = d
    return best


def side_language_match(text: str, expected_side: str) -> bool:
    """Return True when `text` looks like the expected language (es/en).

    Guards against applying Spanish spell-fix to English words on a
    mis-paired card.
    """
    es, en = language_score(text)
    if expected_side == "es":
        return es > en  # require clear Spanish signal
    if expected_side == "en":
        # Allow equal when text has no strong signal (common for English).
        return en >= es
    return False


def apply_word_fixes(text: str, bad_words: list, expected_side: str) -> "tuple[str, list]":
    """Apply word-level corrections inside a string.

    Skips fixes entirely when the side's actual language doesn't match the
    dictionary used, to avoid corrupting mis-paired cards.

    Returns (new_text, applied) where `applied` is a list of
    ``{"from": original_word, "to": replacement}`` records.
    """
    if not side_language_match(text, expected_side):
        return (text, [])
    new_text = text
    applied = []
    for bw in bad_words:
        word = bw["word"]
        sugg = bw["suggestions"]
        replacement = best_replacement(word, sugg)
        if replacement is None:
            continue
        escaped = re.escape(word)
        # Match the standalone word including a (possibly-omitted) trailing
        # period: `Uds` in the text should become `Uds.` even when already
        # adjacent to `.`. Only consume an existing period when the
        # replacement itself ends with one, so we neither duplicate nor
        # drop sentence punctuation.
        trailing = r"\.?" if replacement.endswith(".") else ""
        pattern = re.compile(rf"(?<!\w){escaped}{trailing}(?!\w)")
        if pattern.search(new_text):
            # Callable repl: backslashes in `replacement` stay literal.
            new_text = pattern.sub(lambda _m, _r=replacement: _r, new_text)
            applied.append({"from": word, "to": replacement})
    return (new_text, applied)


def main() -> None:
    """Load cards + validation flags, auto-fix, quarantine, and report."""
    vocab_data = json.loads(VOCAB.read_text(encoding="utf-8"))
    val_data = json.loads(VALIDATION.read_text(encoding="utf-8"))

    # Index validation by (chapter, front, back, sourceImage) for lookup.
    val_index: dict = {}
    for f in val_data["flags"]:
        key = (f["chapter"], f["front"], f["back"], f["sourceImage"])
        val_index[key] = f

    # Walk the cards in place.
    auto_fixed_word = 0
    auto_swapped = 0
    quarantined = 0
    manual_review_cards = []
    quarantined_cards = []

    for ch in vocab_data["chapters"]:
        kept_cards = []
        for card in ch["cards"]:
            key = (ch["chapter"], card["front"], card["back"],
                   card.get("sourceImage", ""))
            flag = val_index.get(key)

            # 1) Reversal swap (apply even when not flagged).
            if is_reversed(card["front"], card["back"]):
                card["front"], card["back"] = card["back"], card["front"]
                auto_swapped += 1
                # Re-key for any further validation lookup (no-op here).

            if flag is None:
                kept_cards.append(card)
                continue

            # Quarantine obvious mis-pairs: both sides same language OR
            # language mismatch.
            fes, fen = language_score(card["front"])
            bes, ben = language_score(card["back"])
            front_lang = "es" if fes > fen else ("en" if fen > fes else "unknown")
            back_lang = "es" if bes > ben else ("en" if ben > bes else "unknown")
            # A good card has front=es, back=en. Anything else when the
            # card is flagged is almost always a column-pairing error.
            if front_lang != "es" or back_lang != "en":
                quarantined_cards.append({
                    "chapter": ch["chapter"],
                    "front": card["front"],
                    "back": card["back"],
                    "sourceImage": card.get("sourceImage", ""),
                    "reason": f"language-mismatch front={front_lang} back={back_lang}",
                })
                quarantined += 1
                continue

            # 2) Word-level fixes (language-aware).
            new_front, applied_front = apply_word_fixes(
                card["front"], flag["badFront"], "es")
            new_back, applied_back = apply_word_fixes(
                card["back"], flag["badBack"], "en")
            card["front"] = new_front
            card["back"] = new_back
            auto_fixed_word += len(applied_front) + len(applied_back)

            # If after auto-fix there are STILL flagged words with no
            # confident replacement, flag for manual review.
            unresolved_front = [
                bw for bw in flag["badFront"]
                if not any(a["from"] == bw["word"] for a in applied_front)
                and best_replacement(bw["word"], bw["suggestions"]) is None
            ]
            unresolved_back = [
                bw for bw in flag["badBack"]
                if not any(a["from"] == bw["word"] for a in applied_back)
                and best_replacement(bw["word"], bw["suggestions"]) is None
            ]
            if unresolved_front or unresolved_back:
                manual_review_cards.append({
                    "chapter": ch["chapter"],
                    "front": card["front"],
                    "back": card["back"],
                    "sourceImage": card.get("sourceImage", ""),
                    "unresolvedFront": unresolved_front,
                    "unresolvedBack": unresolved_back,
                })
            kept_cards.append(card)
        ch["cards"] = kept_cards

    # Pin UTF-8 explicitly: output is non-ASCII (ensure_ascii=False) and the
    # default locale encoding may not handle it.
    OUT_VOCAB.write_text(
        json.dumps(vocab_data, ensure_ascii=False, indent=2),
        encoding="utf-8")
    OUT_REVIEW.write_text(json.dumps({
        "totalManualReview": len(manual_review_cards),
        "cards": manual_review_cards,
    }, ensure_ascii=False, indent=2), encoding="utf-8")
    OUT_QUARANTINE.write_text(json.dumps({
        "totalQuarantined": len(quarantined_cards),
        "cards": quarantined_cards,
    }, ensure_ascii=False, indent=2), encoding="utf-8")

    total_cards = sum(len(c["cards"]) for c in vocab_data["chapters"])
    print(f"Active cards (after quarantine): {total_cards}")
    print(f"Auto-swapped (reversed): {auto_swapped}")
    print(f"Auto-fixed words: {auto_fixed_word}")
    print(f"Quarantined (mis-paired): {quarantined}")
    print(f"Cards needing manual review: {len(manual_review_cards)}")
    print(f"Wrote {OUT_VOCAB}")
    print(f"Wrote {OUT_REVIEW}")
    print(f"Wrote {OUT_QUARANTINE}")


if __name__ == "__main__":
    main()