#!/usr/bin/env python3
"""Merge repaired_cards.json into vocab_cards.json.

Rules:
1. New pairs are added to their chapter's deck if they don't duplicate an
   existing pair.
2. Duplicate detection uses (chapter, normalize(front), normalize(back)).
3. Pairs whose orientation looks swapped (English text on the "es" side and
   Spanish on the "en" side — the pairer got orientation wrong) are flipped
   before insertion; pairs that still fail the sanity filter are dropped.
4. Emits integrate_report.json with counts.
"""

import json
import re
import unicodedata
from pathlib import Path

HERE = Path(__file__).resolve().parent
VOCAB = HERE / "vocab_cards.json"
REPAIRED = HERE / "repaired_cards.json"
QUARANTINED = HERE / "quarantined_cards.json"
OUT = HERE / "vocab_cards.json"  # written back in place
REPORT = HERE / "integrate_report.json"


def _strip_accents(s: str) -> str:
    """Return *s* with combining marks removed (NFD-decompose, drop Mn)."""
    return "".join(
        c for c in unicodedata.normalize("NFD", s)
        if unicodedata.category(c) != "Mn"
    )


def norm(s: str) -> str:
    """Lowercase, de-accent and trim *s* for duplicate-key comparison."""
    return _strip_accents(s.lower()).strip()


SPANISH_ACCENT_RE = re.compile(r"[áéíóúñüÁÉÍÓÚÑÜ¿¡]")
SPANISH_ARTICLES = {"el", "la", "los", "las", "un", "una", "unos", "unas"}
ENGLISH_STARTERS = {"the", "a", "an", "to", "my", "his", "her", "our", "their"}


def looks_swapped(front: str, back: str) -> bool:
    """True if front looks English and back looks Spanish (pair should be swapped)."""
    fl = front.lower().split()
    bl = back.lower().split()
    if not fl or not bl:
        return False
    f_first = fl[0].strip(",.;:")
    b_first = bl[0].strip(",.;:")
    front_is_en = f_first in ENGLISH_STARTERS
    back_is_es = (
        SPANISH_ACCENT_RE.search(back) is not None
        or b_first in SPANISH_ARTICLES
    )
    return front_is_en and back_is_es


def looks_good(pair: dict) -> bool:
    """Basic sanity filter on a repaired pair before it enters the deck.

    Rejects empty/too-short sides, and pairs where only the "en" side
    carries Spanish accents (orientation is likely still wrong even after
    the swap heuristic, so drop rather than guess).
    """
    es = pair["es"].strip()
    en = pair["en"].strip()
    if not es or not en:
        return False
    if len(es) < 2 or len(en) < 2:
        return False
    es_has_accent = SPANISH_ACCENT_RE.search(es) is not None
    en_has_accent = SPANISH_ACCENT_RE.search(en) is not None
    if en_has_accent and not es_has_accent:
        # The "en" side has accents — likely swapped.
        return False
    return True


def main() -> None:
    """Merge repaired pairs into the vocab deck and write the report."""
    vocab = json.loads(VOCAB.read_text(encoding="utf-8"))
    repaired = json.loads(REPAIRED.read_text(encoding="utf-8"))
    quarantined = json.loads(QUARANTINED.read_text(encoding="utf-8"))

    # Map image -> chapter (from the quarantine list — all images here belong
    # to the chapter they were quarantined from).
    image_chapter: dict = {}
    for c in quarantined["cards"]:
        image_chapter[c["sourceImage"]] = c["chapter"]

    # Build the existing duplicate-key set and a chapter lookup table.
    existing_keys = set()
    chapter_map: dict = {c["chapter"]: c for c in vocab["chapters"]}
    for c in vocab["chapters"]:
        for card in c["cards"]:
            existing_keys.add((c["chapter"], norm(card["front"]), norm(card["back"])))

    added_per_image: dict = {}
    dropped_sanity = 0
    dropped_dup = 0

    for image_name, data in repaired["byImage"].items():
        ch_num = image_chapter.get(image_name)
        if ch_num is None:
            # Image not in quarantine list (shouldn't happen, but bail).
            continue
        deck = chapter_map.setdefault(ch_num, {"chapter": ch_num, "cards": []})
        added = 0
        for p in data.get("pairs", []):
            es = p["es"].strip()
            en = p["en"].strip()
            if looks_swapped(es, en):
                # Orientation wrong — flip rather than drop.
                es, en = en, es
            pair = {"es": es, "en": en}
            if not looks_good(pair):
                dropped_sanity += 1
                continue
            key = (ch_num, norm(pair["es"]), norm(pair["en"]))
            if key in existing_keys:
                dropped_dup += 1
                continue
            existing_keys.add(key)
            card = {
                "front": pair["es"],
                "back": pair["en"],
                "chapter": ch_num,
                "chapterTitle": "",
                "section": "",
                "sourceImage": image_name,
            }
            deck["cards"].append(card)
            added += 1
        if added:
            added_per_image[image_name] = added

    # If any new chapter was created, ensure ordered insertion.
    vocab["chapters"] = sorted(chapter_map.values(), key=lambda c: c["chapter"])
    # encoding="utf-8" is required: ensure_ascii=False emits raw accented
    # characters, which a non-UTF-8 locale default encoding would mangle.
    OUT.write_text(json.dumps(vocab, ensure_ascii=False, indent=2), encoding="utf-8")

    total_added = sum(added_per_image.values())
    report = {
        "totalRepairedInput": repaired["totalPairs"],
        "added": total_added,
        "dropped_duplicate": dropped_dup,
        "dropped_sanity": dropped_sanity,
        "addedPerImage": added_per_image,
    }
    REPORT.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")

    print(f"Repaired pairs in: {repaired['totalPairs']}")
    print(f"Added to deck: {total_added}")
    print(f"Dropped as duplicate: {dropped_dup}")
    print(f"Dropped as swapped/bad: {dropped_sanity}")
    print(f"Wrote {OUT}")


if __name__ == "__main__":
    main()