Previously the chapter reader showed vocab tables as a flat list of OCR lines — because Vision reads columns top-to-bottom, the Spanish column appeared as one block followed by the English column, making pairings illegible. Now every vocab table renders as a 2-column grid with Spanish on the left and English on the right. Supporting changes: - New ocr_all_vocab.swift: bounding-box OCR over all 931 vocab images, cluster lines into rows by Y-coordinate, split rows by largest X-gap, detect 2- / 3- / 4-column layouts automatically. ~2800 pairs extracted this pass vs ~1100 from the old block-alternation heuristic. - merge_pdf_into_book.py now prefers bounding-box pairs when present, falls back to the heuristic, embeds the resulting pairs as vocab_table.cards in book.json. - DataLoader passes cards through to TextbookBlock on seed. - TextbookChapterView renders cards via SwiftUI Grid (2 cols). - fix_vocab.py quarantine rule relaxed — only mis-pairs where both sides are clearly the same language are removed. "unknown" sides stay (bbox pipeline already oriented them correctly). Textbook card count jumps from 1044 → 3118 active pairs. textbookDataVersion bumped to 9. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
253 lines
9.5 KiB
Python
253 lines
9.5 KiB
Python
#!/usr/bin/env python3
|
|
"""Apply high-confidence auto-fixes from vocab_validation.json to vocab_cards.json.
|
|
|
|
Auto-fix rules (conservative):
|
|
1. If a flagged word has exactly one suggestion AND that suggestion differs by
|
|
<= 2 characters AND has the same starting letter (high-confidence character swap).
|
|
2. If a card is detected as reversed (Spanish on EN side, English on ES side),
|
|
swap front/back.
|
|
|
|
Cards that aren't auto-fixable end up in manual_review.json.
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
import unicodedata
|
|
from pathlib import Path
|
|
|
|
# Directory containing this script; all JSON data files live alongside it.
HERE = Path(__file__).resolve().parent
# Input: OCR-extracted vocab cards, grouped by chapter.
VOCAB = HERE / "vocab_cards.json"
# Input: validator output listing flagged cards and per-word suggestions.
VALIDATION = HERE / "vocab_validation.json"
# Output: fixed cards are written back over the input file in place.
OUT_VOCAB = HERE / "vocab_cards.json"
# Output: cards with unresolved flagged words for a human to review.
OUT_REVIEW = HERE / "manual_review.json"
# Output: mis-paired cards removed from the active set.
OUT_QUARANTINE = HERE / "quarantined_cards.json"
|
|
|
|
|
|
def _strip_accents(s: str) -> str:
|
|
return "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")
|
|
|
|
|
|
def _levenshtein(a: str, b: str) -> int:
|
|
if a == b: return 0
|
|
if not a: return len(b)
|
|
if not b: return len(a)
|
|
prev = list(range(len(b) + 1))
|
|
for i, ca in enumerate(a, 1):
|
|
curr = [i]
|
|
for j, cb in enumerate(b, 1):
|
|
cost = 0 if ca == cb else 1
|
|
curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + cost))
|
|
prev = curr
|
|
return prev[-1]
|
|
|
|
|
|
# Characters that only occur in Spanish text (accents, ñ/ü, inverted punctuation).
SPANISH_ACCENT_RE = re.compile(r"[áéíóúñüÁÉÍÓÚÑÜ¿¡]")
# Articles that mark the start of a Spanish gloss.
SPANISH_ARTICLES = {"el", "la", "los", "las", "un", "una", "unos", "unas"}
# Function words that commonly start an English gloss.
ENGLISH_STARTERS = {"the", "a", "an", "to", "my", "his", "her", "our", "their"}


def language_score(s: str) -> "tuple[int, int]":
    """Return (es_score, en_score) for a string."""
    es_score, en_score = 0, 0
    # Accented / Spanish-only characters are the strongest signal.
    if SPANISH_ACCENT_RE.search(s):
        es_score += 3
    tokens = s.lower().split()
    if not tokens:
        return (es_score, en_score)
    leading = tokens[0].strip(",.;:")
    if leading in SPANISH_ARTICLES:
        es_score += 2
    if leading in ENGLISH_STARTERS:
        en_score += 2
    # Characteristic suffixes on any word nudge the score either way.
    for token in tokens:
        token = token.strip(",.;:")
        if not token:
            continue
        if token.endswith(("ción", "sión", "dad", "tud")):
            es_score += 1
        if token.endswith(("ing", "tion", "ness", "ment", "able", "ly")):
            en_score += 1
    return (es_score, en_score)
|
|
|
|
|
|
def is_reversed(front: str, back: str) -> bool:
    """True when front looks like English and back looks like Spanish (i.e. swapped)."""
    front_es, front_en = language_score(front)
    back_es, back_en = language_score(back)
    # Only call it swapped when BOTH sides lean the wrong way.
    if front_en <= front_es:
        return False
    return back_es > back_en
|
|
|
|
|
|
def best_replacement(word: str, suggestions: list) -> "str | None":
    """Pick the one safe correction, or None to leave it alone."""
    if not suggestions:
        return None
    # Prefer suggestions that keep the word's initial letter (likely OCR slips).
    initial = word[0].lower() if word else ""
    same_initial = [s for s in suggestions if s and initial and s[0].lower() == initial]
    pool = same_initial if same_initial else suggestions
    # Keep the first candidate with the smallest nonzero edit distance <= 2;
    # a distance over 2 means the "fix" changes too much to trust.
    winner = None
    winner_dist = 99
    for candidate in pool:
        dist = _levenshtein(word.lower(), candidate.lower())
        if dist == 0 or dist > 2:
            continue
        if dist < winner_dist:
            winner, winner_dist = candidate, dist
    return winner
|
|
|
|
|
|
def side_language_match(text: str, expected_side: str) -> bool:
    """Return True when `text` looks like the expected language (es/en).

    Guards against applying Spanish spell-fix to English words on a
    mis-paired card.
    """
    es_score, en_score = language_score(text)
    if expected_side == "es":
        # Require an unambiguous Spanish signal before touching the ES side.
        return es_score > en_score
    if expected_side == "en":
        # English text often carries no marker at all, so ties pass here.
        return en_score >= es_score
    return False
|
|
|
|
|
|
def apply_word_fixes(text: str, bad_words: list, expected_side: str) -> "tuple[str, list]":
    """Apply word-level corrections inside a string.

    Skips fixes entirely when the side's actual language doesn't match the
    dictionary used, to avoid corrupting mis-paired cards.

    Args:
        text: The card side being corrected.
        bad_words: Validator entries, each ``{"word": ..., "suggestions": [...]}``.
        expected_side: ``"es"`` or ``"en"`` — which dictionary flagged the words.

    Returns:
        ``(new_text, applied)`` where ``applied`` is a list of
        ``{"from": ..., "to": ...}`` records for each substitution made.
    """
    if not side_language_match(text, expected_side):
        return (text, [])

    new_text = text
    applied = []
    for bw in bad_words:
        word = bw["word"]
        sugg = bw["suggestions"]
        replacement = best_replacement(word, sugg)
        if replacement is None:
            continue
        escaped = re.escape(word)
        # Match the standalone word. BUGFIX: only consume an adjacent period
        # when the replacement itself ends with one (e.g. `Uds` -> `Uds.`,
        # which would otherwise produce a doubled `Uds..`). The old pattern
        # always swallowed `\.?`, so a sentence-ending period next to a
        # corrected word was silently deleted.
        if replacement.endswith("."):
            pattern = re.compile(rf"(?<![A-Za-zÁ-ú]){escaped}\.?(?![A-Za-zÁ-ú])")
        else:
            pattern = re.compile(rf"(?<![A-Za-zÁ-ú]){escaped}(?![A-Za-zÁ-ú])")
        if pattern.search(new_text):
            # count=1: fix only the first occurrence, matching the validator flag.
            new_text = pattern.sub(replacement, new_text, count=1)
            applied.append({"from": word, "to": replacement})
    return (new_text, applied)
|
|
|
|
|
|
def main() -> None:
    """Run the full auto-fix pipeline.

    Reads vocab_cards.json and vocab_validation.json, then per card:
    swaps reversed pairs, quarantines clear language mis-pairs, applies
    high-confidence word fixes, and collects unresolved cards for manual
    review. Writes the updated cards plus the review and quarantine
    reports, and prints a summary.
    """
    vocab_data = json.loads(VOCAB.read_text(encoding="utf-8"))
    val_data = json.loads(VALIDATION.read_text(encoding="utf-8"))

    # Index validation by (chapter, front, back, sourceImage) for lookup
    val_index: dict = {}
    for f in val_data["flags"]:
        key = (f["chapter"], f["front"], f["back"], f["sourceImage"])
        val_index[key] = f

    # Walk the cards in place
    auto_fixed_word = 0        # individual word substitutions applied
    auto_swapped = 0           # cards whose front/back were swapped
    quarantined = 0            # flagged cards dropped as mis-paired
    manual_review_cards = []
    quarantined_cards = []

    for ch in vocab_data["chapters"]:
        kept_cards = []
        for card in ch["cards"]:
            # NOTE: the key uses the card's current (pre-swap) front/back,
            # matching how the validator recorded it.
            key = (ch["chapter"], card["front"], card["back"], card.get("sourceImage", ""))
            flag = val_index.get(key)

            # 1) Reversal swap (apply even when not flagged)
            if is_reversed(card["front"], card["back"]):
                card["front"], card["back"] = card["back"], card["front"]
                auto_swapped += 1
                # Re-key for any further validation lookup (no-op here)

            # Unflagged cards pass through untouched.
            if flag is None:
                kept_cards.append(card)
                continue

            # Quarantine only clear mis-pairs: both sides EXPLICITLY the wrong
            # language (both Spanish or both English). "unknown" sides stay —
            # the bounding-box pipeline already handled orientation correctly
            # and many valid pairs lack the article/accent markers we classify on.
            fes, fen = language_score(card["front"])
            bes, ben = language_score(card["back"])
            front_lang = "es" if fes > fen else ("en" if fen > fes else "unknown")
            back_lang = "es" if bes > ben else ("en" if ben > bes else "unknown")
            bothSameLang = (front_lang == "es" and back_lang == "es") or (front_lang == "en" and back_lang == "en")
            reversed_pair = front_lang == "en" and back_lang == "es"
            if bothSameLang or reversed_pair:
                quarantined_cards.append({
                    "chapter": ch["chapter"],
                    "front": card["front"],
                    "back": card["back"],
                    "sourceImage": card.get("sourceImage", ""),
                    "reason": f"language-mismatch front={front_lang} back={back_lang}",
                })
                quarantined += 1
                continue

            # 2) Word-level fixes (language-aware)
            new_front, applied_front = apply_word_fixes(card["front"], flag["badFront"], "es")
            new_back, applied_back = apply_word_fixes(card["back"], flag["badBack"], "en")
            card["front"] = new_front
            card["back"] = new_back
            auto_fixed_word += len(applied_front) + len(applied_back)

            # If after auto-fix there are STILL flagged words with no
            # confident replacement, flag for manual review.
            unresolved_front = [
                bw for bw in flag["badFront"]
                if not any(a["from"] == bw["word"] for a in applied_front)
                and best_replacement(bw["word"], bw["suggestions"]) is None
            ]
            unresolved_back = [
                bw for bw in flag["badBack"]
                if not any(a["from"] == bw["word"] for a in applied_back)
                and best_replacement(bw["word"], bw["suggestions"]) is None
            ]
            if unresolved_front or unresolved_back:
                manual_review_cards.append({
                    "chapter": ch["chapter"],
                    "front": card["front"],
                    "back": card["back"],
                    "sourceImage": card.get("sourceImage", ""),
                    "unresolvedFront": unresolved_front,
                    "unresolvedBack": unresolved_back,
                })
            # Reviewed cards stay active even while awaiting manual review.
            kept_cards.append(card)

        ch["cards"] = kept_cards

    # Persist results. OUT_VOCAB overwrites the input file in place.
    OUT_VOCAB.write_text(json.dumps(vocab_data, ensure_ascii=False, indent=2))
    OUT_REVIEW.write_text(json.dumps({
        "totalManualReview": len(manual_review_cards),
        "cards": manual_review_cards,
    }, ensure_ascii=False, indent=2))

    OUT_QUARANTINE.write_text(json.dumps({
        "totalQuarantined": len(quarantined_cards),
        "cards": quarantined_cards,
    }, ensure_ascii=False, indent=2))

    # Operator-facing summary.
    total_cards = sum(len(c["cards"]) for c in vocab_data["chapters"])
    print(f"Active cards (after quarantine): {total_cards}")
    print(f"Auto-swapped (reversed): {auto_swapped}")
    print(f"Auto-fixed words: {auto_fixed_word}")
    print(f"Quarantined (mis-paired): {quarantined}")
    print(f"Cards needing manual review: {len(manual_review_cards)}")
    print(f"Wrote {OUT_VOCAB}")
    print(f"Wrote {OUT_REVIEW}")
    print(f"Wrote {OUT_QUARANTINE}")
|
|
|
|
|
|
# Script entry point: run the pipeline only when executed directly.
if __name__ == "__main__":
    main()
|