Files
Spanish/Conjuga/Scripts/textbook/integrate_repaired.py
Trey T 63dfc5e41a Add textbook reader, exercise grading, stem-change toggle, extraction pipeline
Major changes:
- Textbook UI: chapter list, reader, and interactive exercise view (keyboard
  + Apple Pencil) surfaced under the Course tab. 30 chapters, 251 exercises.
- Stem-change conjugation toggle on Week 4 flashcard decks (E-IE, E-I, O-UE).
  Uses existing VerbForm + IrregularSpan data to render highlighted present
  tense conjugations inline.
- Deterministic on-device answer grader with partial credit (correct / close
  for accent-stripped or single-char-typo / wrong). 11 unit tests cover it.
- SharedModels: TextbookChapter (local), TextbookExerciseAttempt (cloud-
  synced), AnswerGrader helpers. Bumped schema.
- DataLoader: textbook seeder (version 8) + refresh helpers that preserve
  LanGo course decks when textbook data is re-seeded.
- Local extraction pipeline in Conjuga/Scripts/textbook/ — XHTML chapter
  parser, answer-key parser, macOS Vision image OCR + PDF page OCR, merger,
  NSSpellChecker validator, language-aware auto-fixer, and repair pass that
  re-pairs quarantined vocab rows using bounding-box coordinates.
- UI test target (ConjugaUITests) with three tests: end-to-end textbook
  flow, all-chapters screenshot audit, and stem-change toggle verification.

Generated textbook content (textbook_data.json, textbook_vocab.json) and
third-party source files are gitignored — re-run Scripts/textbook/run_pipeline.sh
locally to regenerate.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 15:12:55 -05:00

148 lines
5.1 KiB
Python

#!/usr/bin/env python3
"""Merge repaired_cards.json into vocab_cards.json.

Rules:
1. New pairs are added to their chapter's deck if they don't duplicate an existing pair.
2. Duplicate detection uses normalize(front)+normalize(back).
3. Pairs whose front side looks English and back side looks Spanish are
   swapped back into es/en orientation (the pairer got orientation wrong);
   pairs that still fail the sanity filter after that are dropped.
4. Emits integrate_report.json with counts.
"""
import json
import re
import unicodedata
from pathlib import Path
HERE = Path(__file__).resolve().parent
VOCAB = HERE / "vocab_cards.json"  # existing deck (input)
REPAIRED = HERE / "repaired_cards.json"  # repair-pass output to merge in
QUARANTINED = HERE / "quarantined_cards.json"  # provides the image -> chapter mapping
OUT = HERE / "vocab_cards.json"  # NOTE: same path as VOCAB — the deck is merged in place
REPORT = HERE / "integrate_report.json"  # integration counts (output)
def _strip_accents(s: str) -> str:
return "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")
def norm(s: str) -> str:
    """Normalize *s* for duplicate-key comparison: lowercase, accent-strip, trim."""
    lowered = s.lower()
    return _strip_accents(lowered).strip()
# Characters that only occur in Spanish text: accented vowels, ñ/ü (both
# cases), and inverted punctuation.
SPANISH_ACCENT_RE = re.compile(r"[áéíóúñüÁÉÍÓÚÑÜ¿¡]")
# Definite/indefinite articles used to spot a Spanish-looking first word.
SPANISH_ARTICLES = {"el", "la", "los", "las", "un", "una", "unos", "unas"}
# Common English first words used to spot an English-looking front side.
ENGLISH_STARTERS = {"the", "a", "an", "to", "my", "his", "her", "our", "their"}
def looks_swapped(front: str, back: str) -> bool:
    """True if front looks English and back looks Spanish (pair should be swapped)."""
    front_words = front.lower().split()
    back_words = back.lower().split()
    if not front_words or not back_words:
        return False
    # Compare the first word of each side, ignoring trailing punctuation.
    first_front = front_words[0].strip(",.;:")
    first_back = back_words[0].strip(",.;:")
    if first_front not in ENGLISH_STARTERS:
        return False
    # Back side counts as Spanish if it carries Spanish-only characters
    # anywhere, or opens with a Spanish article.
    back_is_spanish = (
        bool(SPANISH_ACCENT_RE.search(back)) or first_back in SPANISH_ARTICLES
    )
    return back_is_spanish
def looks_good(pair: dict) -> bool:
    """Basic sanity filter on a repaired pair before it enters the deck."""
    es_side = pair["es"].strip()
    en_side = pair["en"].strip()
    # Reject empty or single-character sides outright.
    if len(es_side) < 2 or len(en_side) < 2:
        return False
    # If only the "en" side carries Spanish-accented characters the pair is
    # likely still swapped — drop it rather than guess the orientation.
    en_accented = SPANISH_ACCENT_RE.search(en_side) is not None
    es_accented = SPANISH_ACCENT_RE.search(es_side) is not None
    return not (en_accented and not es_accented)
def main() -> None:
    """Merge repaired pairs into the vocab deck and write an integration report.

    Reads VOCAB, REPAIRED, and QUARANTINED; swaps mis-oriented pairs, drops
    bad/duplicate ones, appends the rest to their chapter's deck; rewrites
    VOCAB (in place) and REPORT.
    """
    vocab = json.loads(VOCAB.read_text(encoding="utf-8"))
    repaired = json.loads(REPAIRED.read_text(encoding="utf-8"))
    quarantined = json.loads(QUARANTINED.read_text(encoding="utf-8"))

    # Map image -> chapter (from the quarantine list — all images here belong
    # to the chapter they were quarantined from).
    image_chapter: dict = {
        c["sourceImage"]: c["chapter"] for c in quarantined["cards"]
    }

    # Set of (chapter, norm(front), norm(back)) keys already in the deck,
    # for O(1) duplicate detection.
    chapter_map: dict = {c["chapter"]: c for c in vocab["chapters"]}
    existing_keys = {
        (c["chapter"], norm(card["front"]), norm(card["back"]))
        for c in vocab["chapters"]
        for card in c["cards"]
    }

    added_per_image: dict = {}
    # BUGFIX: this counter existed before but was never incremented; swapped
    # pairs are *corrected* (not dropped), so count the fixes explicitly.
    swapped_fixed = 0
    dropped_sanity = 0
    dropped_dup = 0
    for image_name, data in repaired["byImage"].items():
        ch_num = image_chapter.get(image_name)
        if ch_num is None:
            # Image not in quarantine list (shouldn't happen, but bail).
            continue
        deck = chapter_map.setdefault(ch_num, {"chapter": ch_num, "cards": []})
        added = 0
        for p in data.get("pairs", []):
            es = p["es"].strip()
            en = p["en"].strip()
            if looks_swapped(es, en):
                es, en = en, es
                swapped_fixed += 1
            pair = {"es": es, "en": en}
            if not looks_good(pair):
                dropped_sanity += 1
                continue
            key = (ch_num, norm(pair["es"]), norm(pair["en"]))
            if key in existing_keys:
                dropped_dup += 1
                continue
            existing_keys.add(key)
            deck["cards"].append({
                "front": pair["es"],
                "back": pair["en"],
                "chapter": ch_num,
                "chapterTitle": "",
                "section": "",
                "sourceImage": image_name,
            })
            added += 1
        if added:
            added_per_image[image_name] = added

    # If any new chapter was created, keep the chapter list ordered.
    vocab["chapters"] = sorted(chapter_map.values(), key=lambda c: c["chapter"])
    # BUGFIX: write_text without encoding= uses the locale's preferred
    # encoding; with ensure_ascii=False output that can raise on non-UTF-8
    # systems. Force UTF-8 to match the reads above.
    OUT.write_text(json.dumps(vocab, ensure_ascii=False, indent=2), encoding="utf-8")

    total_added = sum(added_per_image.values())
    report = {
        "totalRepairedInput": repaired["totalPairs"],
        "added": total_added,
        "swappedFixed": swapped_fixed,  # new key; existing keys unchanged
        "dropped_duplicate": dropped_dup,
        "dropped_sanity": dropped_sanity,
        "addedPerImage": added_per_image,
    }
    REPORT.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")

    print(f"Repaired pairs in: {repaired['totalPairs']}")
    print(f"Added to deck: {total_added}")
    print(f"Orientation-swapped (fixed): {swapped_fixed}")
    print(f"Dropped as duplicate: {dropped_dup}")
    print(f"Dropped as swapped/bad: {dropped_sanity}")
    print(f"Wrote {OUT}")


if __name__ == "__main__":
    main()