Previously the chapter reader showed vocab tables as a flat list of OCR lines — because Vision reads columns top-to-bottom, the Spanish column appeared as one block followed by the English column, making pairings illegible. Now every vocab table renders as a 2-column grid with Spanish on the left and English on the right. Supporting changes: - New ocr_all_vocab.swift: bounding-box OCR over all 931 vocab images, cluster lines into rows by Y-coordinate, split rows by largest X-gap, detect 2- / 3- / 4-column layouts automatically. ~2800 pairs extracted this pass vs ~1100 from the old block-alternation heuristic. - merge_pdf_into_book.py now prefers bounding-box pairs when present, falls back to the heuristic, embeds the resulting pairs as vocab_table.cards in book.json. - DataLoader passes cards through to TextbookBlock on seed. - TextbookChapterView renders cards via SwiftUI Grid (2 cols). - fix_vocab.py quarantine rule relaxed — only mis-pairs where both sides are clearly the same language are removed. "unknown" sides stay (bbox pipeline already oriented them correctly). Textbook card count jumps from 1044 → 3118 active pairs. textbookDataVersion bumped to 9. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
253 lines
9.5 KiB
Python
253 lines
9.5 KiB
Python
#!/usr/bin/env python3
|
|
"""Apply high-confidence auto-fixes from vocab_validation.json to vocab_cards.json.
|
|
|
|
Auto-fix rules (conservative):
|
|
1. If a flagged word has exactly one suggestion AND that suggestion differs by
|
|
<= 2 characters AND has the same starting letter (high-confidence character swap).
|
|
2. If a card is detected as reversed (Spanish on EN side, English on ES side),
|
|
swap front/back.
|
|
|
|
Cards that aren't auto-fixable end up in manual_review.json.
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
import unicodedata
|
|
from pathlib import Path
|
|
|
|
# Directory containing this script; all JSON data files live alongside it.
HERE = Path(__file__).resolve().parent
# Input: OCR-extracted vocab cards, grouped by chapter.
VOCAB = HERE / "vocab_cards.json"
# Input: validator output listing flagged cards and per-word suggestions.
VALIDATION = HERE / "vocab_validation.json"
# Output: fixed cards are written back over the input file in place.
OUT_VOCAB = HERE / "vocab_cards.json"
# Output: cards with unresolved flagged words for a human to review.
OUT_REVIEW = HERE / "manual_review.json"
# Output: mis-paired cards removed from the active set.
OUT_QUARANTINE = HERE / "quarantined_cards.json"
|
|
|
|
|
|
def _strip_accents(s: str) -> str:
|
|
return "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")
|
|
|
|
|
|
def _levenshtein(a: str, b: str) -> int:
|
|
if a == b: return 0
|
|
if not a: return len(b)
|
|
if not b: return len(a)
|
|
prev = list(range(len(b) + 1))
|
|
for i, ca in enumerate(a, 1):
|
|
curr = [i]
|
|
for j, cb in enumerate(b, 1):
|
|
cost = 0 if ca == cb else 1
|
|
curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + cost))
|
|
prev = curr
|
|
return prev[-1]
|
|
|
|
|
|
# Characters that only occur in Spanish text (accents, ñ/ü, inverted punctuation).
SPANISH_ACCENT_RE = re.compile(r"[áéíóúñüÁÉÍÓÚÑÜ¿¡]")
# Articles that mark the start of a Spanish gloss.
SPANISH_ARTICLES = {"el", "la", "los", "las", "un", "una", "unos", "unas"}
# Function words that commonly start an English gloss.
ENGLISH_STARTERS = {"the", "a", "an", "to", "my", "his", "her", "our", "their"}


def language_score(s: str) -> "tuple[int, int]":
    """Return (es_score, en_score) for a string."""
    es_score, en_score = 0, 0
    # Accented / Spanish-only characters are the strongest signal.
    if SPANISH_ACCENT_RE.search(s):
        es_score += 3
    tokens = s.lower().split()
    if not tokens:
        return (es_score, en_score)
    leading = tokens[0].strip(",.;:")
    if leading in SPANISH_ARTICLES:
        es_score += 2
    if leading in ENGLISH_STARTERS:
        en_score += 2
    # Characteristic suffixes on any word nudge the score either way.
    for token in tokens:
        token = token.strip(",.;:")
        if not token:
            continue
        if token.endswith(("ción", "sión", "dad", "tud")):
            es_score += 1
        if token.endswith(("ing", "tion", "ness", "ment", "able", "ly")):
            en_score += 1
    return (es_score, en_score)
|
|
|
|
|
|
def is_reversed(front: str, back: str) -> bool:
    """True when front looks like English and back looks like Spanish (i.e. swapped)."""
    front_es, front_en = language_score(front)
    back_es, back_en = language_score(back)
    # Only call it swapped when BOTH sides lean the wrong way.
    if front_en <= front_es:
        return False
    return back_es > back_en
|
|
|
|
|
|
def best_replacement(word: str, suggestions: list) -> "str | None":
    """Pick the one safe correction, or None to leave it alone."""
    if not suggestions:
        return None
    # Prefer suggestions that keep the word's initial letter (likely OCR slips).
    initial = word[0].lower() if word else ""
    same_initial = [s for s in suggestions if s and initial and s[0].lower() == initial]
    pool = same_initial if same_initial else suggestions
    # Keep the first candidate with the smallest nonzero edit distance <= 2;
    # a distance over 2 means the "fix" changes too much to trust.
    winner = None
    winner_dist = 99
    for candidate in pool:
        dist = _levenshtein(word.lower(), candidate.lower())
        if dist == 0 or dist > 2:
            continue
        if dist < winner_dist:
            winner, winner_dist = candidate, dist
    return winner
|
|
|
|
|
|
def side_language_match(text: str, expected_side: str) -> bool:
    """Return True when `text` looks like the expected language (es/en).

    Guards against applying Spanish spell-fix to English words on a
    mis-paired card.
    """
    es_score, en_score = language_score(text)
    if expected_side == "es":
        # Require an unambiguous Spanish signal before touching the ES side.
        return es_score > en_score
    if expected_side == "en":
        # English text often carries no marker at all, so ties pass here.
        return en_score >= es_score
    return False
|
|
|
|
|
|
def apply_word_fixes(text: str, bad_words: list, expected_side: str) -> "tuple[str, list]":
    """Apply word-level corrections inside a string.

    Skips fixes entirely when the side's actual language doesn't match the
    dictionary used, to avoid corrupting mis-paired cards.

    Args:
        text: The card side being corrected.
        bad_words: Validator entries, each ``{"word": ..., "suggestions": [...]}``.
        expected_side: ``"es"`` or ``"en"`` — which dictionary flagged the words.

    Returns:
        ``(new_text, applied)`` where ``applied`` is a list of
        ``{"from": ..., "to": ...}`` records for each substitution made.
    """
    if not side_language_match(text, expected_side):
        return (text, [])

    new_text = text
    applied = []
    for bw in bad_words:
        word = bw["word"]
        sugg = bw["suggestions"]
        replacement = best_replacement(word, sugg)
        if replacement is None:
            continue
        escaped = re.escape(word)
        # Match the standalone word. BUGFIX: only consume an adjacent period
        # when the replacement itself ends with one (e.g. `Uds` -> `Uds.`,
        # which would otherwise produce a doubled `Uds..`). The old pattern
        # always swallowed `\.?`, so a sentence-ending period next to a
        # corrected word was silently deleted.
        if replacement.endswith("."):
            pattern = re.compile(rf"(?<![A-Za-zÁ-ú]){escaped}\.?(?![A-Za-zÁ-ú])")
        else:
            pattern = re.compile(rf"(?<![A-Za-zÁ-ú]){escaped}(?![A-Za-zÁ-ú])")
        if pattern.search(new_text):
            # count=1: fix only the first occurrence, matching the validator flag.
            new_text = pattern.sub(replacement, new_text, count=1)
            applied.append({"from": word, "to": replacement})
    return (new_text, applied)
|
|
|
|
|
|
def main() -> None:
    """Run the full auto-fix pipeline.

    Reads vocab_cards.json and vocab_validation.json, then per card:
    swaps reversed pairs, quarantines clear language mis-pairs, applies
    high-confidence word fixes, and collects unresolved cards for manual
    review. Writes the updated cards plus the review and quarantine
    reports, and prints a summary.
    """
    vocab_data = json.loads(VOCAB.read_text(encoding="utf-8"))
    val_data = json.loads(VALIDATION.read_text(encoding="utf-8"))

    # Index validation by (chapter, front, back, sourceImage) for lookup
    val_index: dict = {}
    for f in val_data["flags"]:
        key = (f["chapter"], f["front"], f["back"], f["sourceImage"])
        val_index[key] = f

    # Walk the cards in place
    auto_fixed_word = 0        # individual word substitutions applied
    auto_swapped = 0           # cards whose front/back were swapped
    quarantined = 0            # flagged cards dropped as mis-paired
    manual_review_cards = []
    quarantined_cards = []

    for ch in vocab_data["chapters"]:
        kept_cards = []
        for card in ch["cards"]:
            # NOTE: the key uses the card's current (pre-swap) front/back,
            # matching how the validator recorded it.
            key = (ch["chapter"], card["front"], card["back"], card.get("sourceImage", ""))
            flag = val_index.get(key)

            # 1) Reversal swap (apply even when not flagged)
            if is_reversed(card["front"], card["back"]):
                card["front"], card["back"] = card["back"], card["front"]
                auto_swapped += 1
                # Re-key for any further validation lookup (no-op here)

            # Unflagged cards pass through untouched.
            if flag is None:
                kept_cards.append(card)
                continue

            # Quarantine only clear mis-pairs: both sides EXPLICITLY the wrong
            # language (both Spanish or both English). "unknown" sides stay —
            # the bounding-box pipeline already handled orientation correctly
            # and many valid pairs lack the article/accent markers we classify on.
            fes, fen = language_score(card["front"])
            bes, ben = language_score(card["back"])
            front_lang = "es" if fes > fen else ("en" if fen > fes else "unknown")
            back_lang = "es" if bes > ben else ("en" if ben > bes else "unknown")
            bothSameLang = (front_lang == "es" and back_lang == "es") or (front_lang == "en" and back_lang == "en")
            reversed_pair = front_lang == "en" and back_lang == "es"
            if bothSameLang or reversed_pair:
                quarantined_cards.append({
                    "chapter": ch["chapter"],
                    "front": card["front"],
                    "back": card["back"],
                    "sourceImage": card.get("sourceImage", ""),
                    "reason": f"language-mismatch front={front_lang} back={back_lang}",
                })
                quarantined += 1
                continue

            # 2) Word-level fixes (language-aware)
            new_front, applied_front = apply_word_fixes(card["front"], flag["badFront"], "es")
            new_back, applied_back = apply_word_fixes(card["back"], flag["badBack"], "en")
            card["front"] = new_front
            card["back"] = new_back
            auto_fixed_word += len(applied_front) + len(applied_back)

            # If after auto-fix there are STILL flagged words with no
            # confident replacement, flag for manual review.
            unresolved_front = [
                bw for bw in flag["badFront"]
                if not any(a["from"] == bw["word"] for a in applied_front)
                and best_replacement(bw["word"], bw["suggestions"]) is None
            ]
            unresolved_back = [
                bw for bw in flag["badBack"]
                if not any(a["from"] == bw["word"] for a in applied_back)
                and best_replacement(bw["word"], bw["suggestions"]) is None
            ]
            if unresolved_front or unresolved_back:
                manual_review_cards.append({
                    "chapter": ch["chapter"],
                    "front": card["front"],
                    "back": card["back"],
                    "sourceImage": card.get("sourceImage", ""),
                    "unresolvedFront": unresolved_front,
                    "unresolvedBack": unresolved_back,
                })
            # Reviewed cards stay active even while awaiting manual review.
            kept_cards.append(card)

        ch["cards"] = kept_cards

    # Persist results. OUT_VOCAB overwrites the input file in place.
    OUT_VOCAB.write_text(json.dumps(vocab_data, ensure_ascii=False, indent=2))
    OUT_REVIEW.write_text(json.dumps({
        "totalManualReview": len(manual_review_cards),
        "cards": manual_review_cards,
    }, ensure_ascii=False, indent=2))

    OUT_QUARANTINE.write_text(json.dumps({
        "totalQuarantined": len(quarantined_cards),
        "cards": quarantined_cards,
    }, ensure_ascii=False, indent=2))

    # Operator-facing summary.
    total_cards = sum(len(c["cards"]) for c in vocab_data["chapters"])
    print(f"Active cards (after quarantine): {total_cards}")
    print(f"Auto-swapped (reversed): {auto_swapped}")
    print(f"Auto-fixed words: {auto_fixed_word}")
    print(f"Quarantined (mis-paired): {quarantined}")
    print(f"Cards needing manual review: {len(manual_review_cards)}")
    print(f"Wrote {OUT_VOCAB}")
    print(f"Wrote {OUT_REVIEW}")
    print(f"Wrote {OUT_QUARANTINE}")
|
|
|
|
|
|
# Script entry point: run the pipeline only when executed directly.
if __name__ == "__main__":
    main()
|