Major changes: - Textbook UI: chapter list, reader, and interactive exercise view (keyboard + Apple Pencil) surfaced under the Course tab. 30 chapters, 251 exercises. - Stem-change conjugation toggle on Week 4 flashcard decks (E-IE, E-I, O-UE). Uses existing VerbForm + IrregularSpan data to render highlighted present tense conjugations inline. - Deterministic on-device answer grader with partial credit (correct / close for accent-stripped or single-char-typo / wrong). 11 unit tests cover it. - SharedModels: TextbookChapter (local), TextbookExerciseAttempt (cloud-synced), AnswerGrader helpers. Bumped schema. - DataLoader: textbook seeder (version 8) + refresh helpers that preserve LanGo course decks when textbook data is re-seeded. - Local extraction pipeline in Conjuga/Scripts/textbook/ — XHTML chapter parser, answer-key parser, macOS Vision image OCR + PDF page OCR, merger, NSSpellChecker validator, language-aware auto-fixer, and repair pass that re-pairs quarantined vocab rows using bounding-box coordinates. - UI test target (ConjugaUITests) with three tests: end-to-end textbook flow, all-chapters screenshot audit, and stem-change toggle verification. Generated textbook content (textbook_data.json, textbook_vocab.json) and third-party source files are gitignored — re-run Scripts/textbook/run_pipeline.sh locally to regenerate. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
148 lines
5.1 KiB
Python
148 lines
5.1 KiB
Python
#!/usr/bin/env python3
"""Merge repaired_cards.json into vocab_cards.json.

Rules:
1. New pairs are added to their chapter's deck if they don't duplicate an existing pair.
2. Duplicate detection uses normalize(front)+normalize(back).
3. Pairs whose front side starts with an English article while the back side
   looks Spanish are flipped back into es/en order (pairer got orientation
   wrong); pairs that still look mis-oriented afterwards are dropped.
4. Emits integrate_report.json with counts.
"""

import json
import re
import unicodedata
from pathlib import Path
|
|
|
|
# All paths are resolved relative to this script's own directory.
HERE = Path(__file__).resolve().parent

VOCAB = HERE / "vocab_cards.json"  # input deck
REPAIRED = HERE / "repaired_cards.json"  # pairs recovered by the repair pass
QUARANTINED = HERE / "quarantined_cards.json"  # maps source images -> chapters
# NOTE: OUT is the same path as VOCAB — the merge rewrites the deck in place.
OUT = HERE / "vocab_cards.json"
REPORT = HERE / "integrate_report.json"  # summary counts written by main()
|
|
|
|
|
|
def _strip_accents(s: str) -> str:
|
|
return "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")
|
|
|
|
|
|
def norm(s: str) -> str:
    """Normalization key for duplicate detection: lowercased, accent-stripped, trimmed."""
    # Lowercase first, then drop combining marks, then trim whitespace —
    # same pipeline as _strip_accents(s.lower()).strip().
    decomposed = unicodedata.normalize("NFD", s.lower())
    without_marks = "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")
    return without_marks.strip()
|
|
|
|
|
|
# Characters that only occur in Spanish text (accented vowels, ñ/ü, inverted
# punctuation) — a cheap "this side is Spanish" signal for orientation checks.
SPANISH_ACCENT_RE = re.compile(r"[áéíóúñüÁÉÍÓÚÑÜ¿¡]")
# Spanish definite/indefinite articles that commonly open a vocab entry.
SPANISH_ARTICLES = {"el", "la", "los", "las", "un", "una", "unos", "unas"}
# English words that commonly open the English side of a vocab entry.
ENGLISH_STARTERS = {"the", "a", "an", "to", "my", "his", "her", "our", "their"}
|
|
|
|
|
|
def looks_swapped(front: str, back: str) -> bool:
    """True if front looks English and back looks Spanish (pair should be swapped)."""
    front_words = front.lower().split()
    back_words = back.lower().split()
    # Either side empty: no evidence to judge orientation.
    if not (front_words and back_words):
        return False
    first_front = front_words[0].strip(",.;:")
    first_back = back_words[0].strip(",.;:")
    # Front must open with a common English word to count as English.
    if first_front not in ENGLISH_STARTERS:
        return False
    # Back counts as Spanish if it carries a Spanish-only character anywhere
    # or opens with a Spanish article.
    return SPANISH_ACCENT_RE.search(back) is not None or first_back in SPANISH_ARTICLES
|
|
|
|
|
|
def looks_good(pair: dict) -> bool:
    """Basic sanity filter on a repaired pair before it enters the deck."""
    spanish_side = pair["es"].strip()
    english_side = pair["en"].strip()
    # Reject empty or single-character sides outright.
    if not (spanish_side and english_side):
        return False
    if min(len(spanish_side), len(english_side)) < 2:
        return False
    # If only the "en" side carries Spanish-only characters, the orientation
    # is likely still wrong — reject rather than add a backwards card.
    spanish_has_accent = SPANISH_ACCENT_RE.search(spanish_side) is not None
    english_has_accent = SPANISH_ACCENT_RE.search(english_side) is not None
    if english_has_accent and not spanish_has_accent:
        return False
    return True
|
|
|
|
|
|
def main() -> None:
    """Merge repaired pairs into the vocab deck and emit an integration report.

    Reads VOCAB, REPAIRED and QUARANTINED; appends non-duplicate repaired
    pairs to their chapter decks (fixing obviously swapped orientation on the
    way), rewrites VOCAB in place, and writes REPORT with summary counts.
    """
    vocab = json.loads(VOCAB.read_text(encoding="utf-8"))
    repaired = json.loads(REPAIRED.read_text(encoding="utf-8"))
    quarantined = json.loads(QUARANTINED.read_text(encoding="utf-8"))

    # Map image → chapter (from the quarantine list — all images here belong to the
    # chapter they were quarantined from).
    image_chapter: dict = {c["sourceImage"]: c["chapter"] for c in quarantined["cards"]}

    # Build the set of (chapter, front, back) keys already in the deck so
    # repaired pairs never duplicate an existing card.
    existing_keys: set = set()
    chapter_map: dict = {c["chapter"]: c for c in vocab["chapters"]}
    for c in vocab["chapters"]:
        for card in c["cards"]:
            existing_keys.add((c["chapter"], norm(card["front"]), norm(card["back"])))

    added_per_image: dict = {}
    # BUGFIX: the old `dropped_swapped` counter was never incremented and the
    # summary mislabeled `dropped_sanity` as "swapped" — track actual swaps.
    swapped_orientation = 0
    dropped_sanity = 0
    dropped_dup = 0

    for image_name, data in repaired["byImage"].items():
        ch_num = image_chapter.get(image_name)
        if ch_num is None:
            # Image not in quarantine list (shouldn't happen, but bail)
            continue
        deck = chapter_map.setdefault(ch_num, {"chapter": ch_num, "cards": []})
        added = 0
        for p in data.get("pairs", []):
            es = p["es"].strip()
            en = p["en"].strip()
            if looks_swapped(es, en):
                # Pairer got orientation wrong — flip back into es/en order.
                es, en = en, es
                swapped_orientation += 1
            pair = {"es": es, "en": en}
            if not looks_good(pair):
                dropped_sanity += 1
                continue
            key = (ch_num, norm(pair["es"]), norm(pair["en"]))
            if key in existing_keys:
                dropped_dup += 1
                continue
            existing_keys.add(key)
            deck["cards"].append(
                {
                    "front": pair["es"],
                    "back": pair["en"],
                    "chapter": ch_num,
                    "chapterTitle": "",
                    "section": "",
                    "sourceImage": image_name,
                }
            )
            added += 1
        if added:
            added_per_image[image_name] = added

    # If any new chapter was created, ensure ordered insertion
    vocab["chapters"] = sorted(chapter_map.values(), key=lambda c: c["chapter"])
    # BUGFIX: write explicitly as UTF-8 — write_text() defaults to the locale
    # encoding, which breaks with ensure_ascii=False output on non-UTF-8 systems.
    OUT.write_text(json.dumps(vocab, ensure_ascii=False, indent=2), encoding="utf-8")

    total_added = sum(added_per_image.values())
    report = {
        "totalRepairedInput": repaired["totalPairs"],
        "added": total_added,
        "swappedOrientation": swapped_orientation,  # additive key, new
        "dropped_duplicate": dropped_dup,
        "dropped_sanity": dropped_sanity,
        "addedPerImage": added_per_image,
    }
    REPORT.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"Repaired pairs in: {repaired['totalPairs']}")
    print(f"Added to deck: {total_added}")
    print(f"Orientation swapped: {swapped_orientation}")
    print(f"Dropped as duplicate: {dropped_dup}")
    print(f"Dropped as bad: {dropped_sanity}")
    print(f"Wrote {OUT}")


if __name__ == "__main__":
    main()
|