Files
Spanish/Conjuga/Scripts/textbook/integrate_repaired.py
Trey T 63dfc5e41a Add textbook reader, exercise grading, stem-change toggle, extraction pipeline
Major changes:
- Textbook UI: chapter list, reader, and interactive exercise view (keyboard
  + Apple Pencil) surfaced under the Course tab. 30 chapters, 251 exercises.
- Stem-change conjugation toggle on Week 4 flashcard decks (E-IE, E-I, O-UE).
  Uses existing VerbForm + IrregularSpan data to render highlighted present
  tense conjugations inline.
- Deterministic on-device answer grader with partial credit (correct / close
  for accent-stripped or single-char-typo / wrong). 11 unit tests cover it.
- SharedModels: TextbookChapter (local), TextbookExerciseAttempt (cloud-
  synced), AnswerGrader helpers. Bumped schema.
- DataLoader: textbook seeder (version 8) + refresh helpers that preserve
  LanGo course decks when textbook data is re-seeded.
- Local extraction pipeline in Conjuga/Scripts/textbook/ — XHTML chapter
  parser, answer-key parser, macOS Vision image OCR + PDF page OCR, merger,
  NSSpellChecker validator, language-aware auto-fixer, and repair pass that
  re-pairs quarantined vocab rows using bounding-box coordinates.
- UI test target (ConjugaUITests) with three tests: end-to-end textbook
  flow, all-chapters screenshot audit, and stem-change toggle verification.

Generated textbook content (textbook_data.json, textbook_vocab.json) and
third-party source files are gitignored — re-run Scripts/textbook/run_pipeline.sh
locally to regenerate.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 15:12:55 -05:00

148 lines
5.1 KiB
Python

#!/usr/bin/env python3
"""Merge repaired_cards.json into vocab_cards.json.

Rules:
1. New pairs are added to their chapter's deck if they don't duplicate an existing pair.
2. Duplicate detection uses normalize(front)+normalize(back).
3. Pairs whose front side looks English and back side looks Spanish are
   swapped back into es/en orientation (the pairer got orientation wrong);
   pairs that still fail the sanity filter after that are dropped.
4. Emits integrate_report.json with counts.
"""
import json
import re
import unicodedata
from pathlib import Path
HERE = Path(__file__).resolve().parent
VOCAB = HERE / "vocab_cards.json"  # existing deck (input)
REPAIRED = HERE / "repaired_cards.json"  # repair-pass output to merge in
QUARANTINED = HERE / "quarantined_cards.json"  # provides the image -> chapter mapping
OUT = HERE / "vocab_cards.json"  # NOTE: same path as VOCAB — the deck is merged in place
REPORT = HERE / "integrate_report.json"  # integration counts (output)
def _strip_accents(s: str) -> str:
return "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")
def norm(s: str) -> str:
    """Normalize *s* for duplicate-key comparison: lowercase, accent-strip, trim."""
    lowered = s.lower()
    return _strip_accents(lowered).strip()
# Characters that only occur in Spanish text: accented vowels, ñ/ü (both
# cases), and inverted punctuation.
SPANISH_ACCENT_RE = re.compile(r"[áéíóúñüÁÉÍÓÚÑÜ¿¡]")
# Definite/indefinite articles used to spot a Spanish-looking first word.
SPANISH_ARTICLES = {"el", "la", "los", "las", "un", "una", "unos", "unas"}
# Common English first words used to spot an English-looking front side.
ENGLISH_STARTERS = {"the", "a", "an", "to", "my", "his", "her", "our", "their"}
def looks_swapped(front: str, back: str) -> bool:
    """True if front looks English and back looks Spanish (pair should be swapped)."""
    front_words = front.lower().split()
    back_words = back.lower().split()
    if not front_words or not back_words:
        return False
    # Compare the first word of each side, ignoring trailing punctuation.
    first_front = front_words[0].strip(",.;:")
    first_back = back_words[0].strip(",.;:")
    if first_front not in ENGLISH_STARTERS:
        return False
    # Back side counts as Spanish if it carries Spanish-only characters
    # anywhere, or opens with a Spanish article.
    back_is_spanish = (
        bool(SPANISH_ACCENT_RE.search(back)) or first_back in SPANISH_ARTICLES
    )
    return back_is_spanish
def looks_good(pair: dict) -> bool:
    """Basic sanity filter on a repaired pair before it enters the deck."""
    es_side = pair["es"].strip()
    en_side = pair["en"].strip()
    # Reject empty or single-character sides outright.
    if len(es_side) < 2 or len(en_side) < 2:
        return False
    # If only the "en" side carries Spanish-accented characters the pair is
    # likely still swapped — drop it rather than guess the orientation.
    en_accented = SPANISH_ACCENT_RE.search(en_side) is not None
    es_accented = SPANISH_ACCENT_RE.search(es_side) is not None
    return not (en_accented and not es_accented)
def main() -> None:
    """Merge repaired pairs into the vocab deck and write an integration report.

    Reads VOCAB, REPAIRED, and QUARANTINED; swaps mis-oriented pairs, drops
    bad/duplicate ones, appends the rest to their chapter's deck; rewrites
    VOCAB (in place) and REPORT.
    """
    vocab = json.loads(VOCAB.read_text(encoding="utf-8"))
    repaired = json.loads(REPAIRED.read_text(encoding="utf-8"))
    quarantined = json.loads(QUARANTINED.read_text(encoding="utf-8"))

    # Map image -> chapter (from the quarantine list — all images here belong
    # to the chapter they were quarantined from).
    image_chapter: dict = {
        c["sourceImage"]: c["chapter"] for c in quarantined["cards"]
    }

    # Set of (chapter, norm(front), norm(back)) keys already in the deck,
    # for O(1) duplicate detection.
    chapter_map: dict = {c["chapter"]: c for c in vocab["chapters"]}
    existing_keys = {
        (c["chapter"], norm(card["front"]), norm(card["back"]))
        for c in vocab["chapters"]
        for card in c["cards"]
    }

    added_per_image: dict = {}
    # BUGFIX: this counter existed before but was never incremented; swapped
    # pairs are *corrected* (not dropped), so count the fixes explicitly.
    swapped_fixed = 0
    dropped_sanity = 0
    dropped_dup = 0
    for image_name, data in repaired["byImage"].items():
        ch_num = image_chapter.get(image_name)
        if ch_num is None:
            # Image not in quarantine list (shouldn't happen, but bail).
            continue
        deck = chapter_map.setdefault(ch_num, {"chapter": ch_num, "cards": []})
        added = 0
        for p in data.get("pairs", []):
            es = p["es"].strip()
            en = p["en"].strip()
            if looks_swapped(es, en):
                es, en = en, es
                swapped_fixed += 1
            pair = {"es": es, "en": en}
            if not looks_good(pair):
                dropped_sanity += 1
                continue
            key = (ch_num, norm(pair["es"]), norm(pair["en"]))
            if key in existing_keys:
                dropped_dup += 1
                continue
            existing_keys.add(key)
            deck["cards"].append({
                "front": pair["es"],
                "back": pair["en"],
                "chapter": ch_num,
                "chapterTitle": "",
                "section": "",
                "sourceImage": image_name,
            })
            added += 1
        if added:
            added_per_image[image_name] = added

    # If any new chapter was created, keep the chapter list ordered.
    vocab["chapters"] = sorted(chapter_map.values(), key=lambda c: c["chapter"])
    # BUGFIX: write_text without encoding= uses the locale's preferred
    # encoding; with ensure_ascii=False output that can raise on non-UTF-8
    # systems. Force UTF-8 to match the reads above.
    OUT.write_text(json.dumps(vocab, ensure_ascii=False, indent=2), encoding="utf-8")

    total_added = sum(added_per_image.values())
    report = {
        "totalRepairedInput": repaired["totalPairs"],
        "added": total_added,
        "swappedFixed": swapped_fixed,  # new key; existing keys unchanged
        "dropped_duplicate": dropped_dup,
        "dropped_sanity": dropped_sanity,
        "addedPerImage": added_per_image,
    }
    REPORT.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")

    print(f"Repaired pairs in: {repaired['totalPairs']}")
    print(f"Added to deck: {total_added}")
    print(f"Orientation-swapped (fixed): {swapped_fixed}")
    print(f"Dropped as duplicate: {dropped_dup}")
    print(f"Dropped as swapped/bad: {dropped_sanity}")
    print(f"Wrote {OUT}")


if __name__ == "__main__":
    main()