Add textbook reader, exercise grading, stem-change toggle, extraction pipeline

Major changes: - Textbook UI: chapter list, reader, and interactive exercise view (keyboard + Apple Pencil) surfaced under the Course tab. 30 chapters, 251 exercises. - Stem-change conjugation toggle on Week 4 flashcard decks (E-IE, E-I, O-UE). Uses existing VerbForm + IrregularSpan data to render highlighted present tense conjugations inline. - Deterministic on-device answer grader with partial credit (correct / close for accent-stripped or single-char-typo / wrong). 11 unit tests cover it. - SharedModels: TextbookChapter (local), TextbookExerciseAttempt (cloud-synced), AnswerGrader helpers. Bumped schema. - DataLoader: textbook seeder (version 8) + refresh helpers that preserve LanGo course decks when textbook data is re-seeded. - Local extraction pipeline in Conjuga/Scripts/textbook/ — XHTML chapter parser, answer-key parser, macOS Vision image OCR + PDF page OCR, merger, NSSpellChecker validator, language-aware auto-fixer, and repair pass that re-pairs quarantined vocab rows using bounding-box coordinates. - UI test target (ConjugaUITests) with three tests: end-to-end textbook flow, all-chapters screenshot audit, and stem-change toggle verification. Generated textbook content (textbook_data.json, textbook_vocab.json) and third-party source files are gitignored — re-run Scripts/textbook/run_pipeline.sh locally to regenerate. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 15:12:55 -05:00
parent 5ba76a947b
commit 63dfc5e41a
34 changed files with 4516 additions and 61 deletions
@@ -0,0 +1,374 @@
+#!/usr/bin/env python3
+"""Merge chapters.json + answers.json + ocr.json → book.json (single source).
+
+Also emits vocab_cards.json: flashcards derived from vocab_image blocks where
+OCR text parses as a clean two-column (Spanish ↔ English) table.
+"""
+
+import json
+import re
+import sys
+from pathlib import Path
+
+HERE = Path(__file__).resolve().parent
+CHAPTERS_JSON = HERE / "chapters.json"
+ANSWERS_JSON = HERE / "answers.json"
+OCR_JSON = HERE / "ocr.json"
+OUT_BOOK = HERE / "book.json"
+OUT_VOCAB = HERE / "vocab_cards.json"
+
+COURSE_NAME = "Complete Spanish Step-by-Step"
+
+# Heuristic: parseable "Spanish | English" vocab rows.
+# OCR usually produces "word  —  translation" or "word translation" separated
+# by 2+ spaces. We detect rows that contain both Spanish and English words.
+SPANISH_ACCENT_RE = re.compile(r"[áéíóúñüÁÉÍÓÚÑÜ¿¡]")
+SPANISH_ARTICLES = {"el", "la", "los", "las", "un", "una", "unos", "unas"}
+ENGLISH_STARTERS = {"the", "a", "an", "to", "my", "his", "her", "our", "their", "your", "some"}
+# English-only words that would never appear as Spanish
+ENGLISH_ONLY_WORDS = {"the", "he", "she", "it", "we", "they", "I", "is", "are", "was", "were",
+                     "been", "have", "has", "had", "will", "would", "should", "could"}
+SEP_RE = re.compile(r"[ \t]{2,}|\s[—–−-]\s")
+
+
+def classify_line(line: str) -> str:
+    """Return 'es', 'en', or 'unknown' for the dominant language of a vocab line."""
+    line = line.strip()
+    if not line:
+        return "unknown"
+    # Accent = definitely Spanish
+    if SPANISH_ACCENT_RE.search(line):
+        return "es"
+    first = line.split()[0].lower().strip(",.;:")
+    if first in SPANISH_ARTICLES:
+        return "es"
+    if first in ENGLISH_STARTERS:
+        return "en"
+    # Check if the leading word is an English-only function word
+    if first in ENGLISH_ONLY_WORDS:
+        return "en"
+    return "unknown"
+
+
+def looks_english(word: str) -> bool:
+    """Legacy helper — kept for try_split_row below."""
+    w = word.lower().strip()
+    if not w:
+        return False
+    if SPANISH_ACCENT_RE.search(w):
+        return False
+    if w in SPANISH_ARTICLES:
+        return False
+    if w in ENGLISH_STARTERS or w in ENGLISH_ONLY_WORDS:
+        return True
+    return bool(re.match(r"^[a-z][a-z\s'/()\-,.]*$", w))
+
+
+def try_split_row(line: str) -> "tuple[str, str] | None":
+    """Split a line into (spanish, english) if it looks like a vocab entry."""
+    line = line.strip()
+    if not line or len(line) < 3:
+        return None
+    # Try explicit separators first
+    parts = SEP_RE.split(line)
+    parts = [p.strip() for p in parts if p.strip()]
+    if len(parts) == 2:
+        spanish, english = parts
+        if looks_english(english) and not looks_english(spanish.split()[0]):
+            return (spanish, english)
+    return None
+
+
+def load(p: Path) -> dict:
+    return json.loads(p.read_text(encoding="utf-8"))
+
+
+def build_vocab_cards_for_block(block: dict, ocr_entry: dict, chapter: dict, context_title: str, idx: int) -> list:
+    """Given a vocab_image block + its OCR lines, derive flashcards.
+
+    Two strategies are tried in order:
+
+    1. Inline rows: if *every* non-blank OCR line splits via try_split_row
+       (e.g. "la casa — the house"), each line becomes one card.
+    2. Column runs: each line is classified as 'es'/'en'/'unknown', unknowns
+       inherit the nearest known neighbour (previous line first, else the
+       next known one), consecutive same-language lines are grouped into
+       runs, and adjacent ES/EN (or EN/ES) runs of equal length are zipped
+       into cards. Runs that cannot be paired are skipped.
+
+    Args:
+        block: the vocab_image block; only block["src"] is read.
+        ocr_entry: OCR record for the image (its "lines" list is used); a
+            falsy value yields no cards.
+        chapter: chapter dict; "number" and "title" are copied onto each card.
+        context_title: section title stored on each card.
+        idx: position of the block in its chapter; not used by this function.
+
+    Returns:
+        A list of card dicts with keys front, back, chapter, chapterTitle,
+        section and sourceImage (possibly empty).
+    """
+    cards = []
+    if not ocr_entry:
+        return cards
+    # Blank OCR lines carry no information; drop them up front.
+    lines = [l.strip() for l in ocr_entry.get("lines", []) if l.strip()]
+    if not lines:
+        return cards
+
+    def card(front: str, back: str) -> dict:
+        # front is the Spanish side, back the English side (see call sites below).
+        return {
+            "front": front,
+            "back": back,
+            "chapter": chapter["number"],
+            "chapterTitle": chapter["title"],
+            "section": context_title,
+            "sourceImage": block["src"],
+        }
+
+    # Attempt 1: explicit inline separator (e.g. "la casa — the house")
+    # All-or-nothing: one line that fails to split abandons this strategy.
+    inline = []
+    all_inline = True
+    for line in lines:
+        pair = try_split_row(line)
+        if pair:
+            inline.append(pair)
+        else:
+            all_inline = False
+            break
+    if all_inline and inline:
+        for es, en in inline:
+            cards.append(card(es, en))
+        return cards
+
+    # Attempt 2: block-alternating layout.
+    # Vision OCR reads columns top-to-bottom, so a 2-col table rendered across
+    # 2 visual columns produces runs like: [ES...ES][EN...EN][ES...ES][EN...EN]
+    # We classify each line, smooth "unknown" using neighbors, then pair
+    # same-sized consecutive ES/EN blocks.
+    classes = [classify_line(l) for l in lines]
+
+    # Pass 1: fill unknowns using nearest non-unknown neighbor (forward)
+    last_known = "unknown"
+    forward = []
+    for c in classes:
+        if c != "unknown":
+            last_known = c
+        forward.append(last_known)
+    # Pass 2: backfill leading unknowns (backward)
+    last_known = "unknown"
+    backward = [""] * len(classes)
+    for i in range(len(classes) - 1, -1, -1):
+        if classes[i] != "unknown":
+            last_known = classes[i]
+        backward[i] = last_known
+    # Merge: prefer forward unless still unknown
+    # (a line stays "unknown" only when no line in the image was classified).
+    resolved = []
+    for f, b in zip(forward, backward):
+        if f != "unknown":
+            resolved.append(f)
+        elif b != "unknown":
+            resolved.append(b)
+        else:
+            resolved.append("unknown")
+
+    # Group consecutive same-lang lines
+    # into (lang, [lines...]) runs, preserving OCR order.
+    blocks: list = []
+    cur_lang: "str | None" = None
+    cur_block: list = []
+    for line, lang in zip(lines, resolved):
+        if lang != cur_lang:
+            if cur_block and cur_lang is not None:
+                blocks.append((cur_lang, cur_block))
+            cur_block = [line]
+            cur_lang = lang
+        else:
+            cur_block.append(line)
+    # Flush the final run.
+    if cur_block and cur_lang is not None:
+        blocks.append((cur_lang, cur_block))
+
+    # Walk blocks pairing ES then EN of equal length
+    # A successful pairing consumes both runs (i += 2); otherwise advance by
+    # one so the second run may still pair with the run that follows it.
+    i = 0
+    while i < len(blocks) - 1:
+        lang_a, lines_a = blocks[i]
+        lang_b, lines_b = blocks[i + 1]
+        if lang_a == "es" and lang_b == "en" and len(lines_a) == len(lines_b):
+            for es, en in zip(lines_a, lines_b):
+                cards.append(card(es, en))
+            i += 2
+            continue
+        # If reversed order (some pages have EN column on left), try that too
+        if lang_a == "en" and lang_b == "es" and len(lines_a) == len(lines_b):
+            for es, en in zip(lines_b, lines_a):
+                cards.append(card(es, en))
+            i += 2
+            continue
+        i += 1
+
+    return cards
+
+
+def clean_instruction(text: str) -> str:
+    """Strip leading/trailing emphasis markers from a parsed instruction."""
+    # Our XHTML parser emitted * and ** for emphasis; flatten them
+    t = re.sub(r"\*+", "", text)
+    return t.strip()
+
+
+def merge() -> None:
+    chapters_data = load(CHAPTERS_JSON)
+    answers_data = load(ANSWERS_JSON)
+    try:
+        ocr_data = load(OCR_JSON)
+    except FileNotFoundError:
+        print("ocr.json not found — proceeding with empty OCR data")
+        ocr_data = {}
+
+    answers = answers_data["answers"]
+    chapters = chapters_data["chapters"]
+    parts = chapters_data.get("part_memberships", {})
+
+    book_chapters = []
+    all_vocab_cards = []
+    missing_ocr = set()
+    current_section_title = ""
+
+    for ch in chapters:
+        out_blocks = []
+        current_section_title = ch["title"]
+
+        for bi, block in enumerate(ch["blocks"]):
+            k = block["kind"]
+
+            if k == "heading":
+                current_section_title = block["text"]
+                out_blocks.append(block)
+                continue
+
+            if k == "paragraph":
+                out_blocks.append(block)
+                continue
+
+            if k == "key_vocab_header":
+                out_blocks.append(block)
+                continue
+
+            if k == "vocab_image":
+                ocr_entry = ocr_data.get(block["src"])
+                if ocr_entry is None:
+                    missing_ocr.add(block["src"])
+                derived = build_vocab_cards_for_block(
+                    block, ocr_entry, ch, current_section_title, bi
+                )
+                all_vocab_cards.extend(derived)
+                out_blocks.append({
+                    "kind": "vocab_table",
+                    "sourceImage": block["src"],
+                    "ocrLines": ocr_entry.get("lines", []) if ocr_entry else [],
+                    "ocrConfidence": ocr_entry.get("confidence", 0.0) if ocr_entry else 0.0,
+                    "cardCount": len(derived),
+                })
+                continue
+
+            if k == "exercise":
+                ans = answers.get(block["id"])
+                image_ocr_lines = []
+                for src in block.get("image_refs", []):
+                    e = ocr_data.get(src)
+                    if e is None:
+                        missing_ocr.add(src)
+                        continue
+                    image_ocr_lines.extend(e.get("lines", []))
+
+                # Build the final prompt list. If we have text prompts from
+                # XHTML, prefer them. Otherwise, attempt to use OCR lines.
+                prompts = [p for p in block.get("prompts", []) if p.strip()]
+                extras = [e for e in block.get("extra", []) if e.strip()]
+                if not prompts and image_ocr_lines:
+                    # Extract numbered lines from OCR (look for "1.  ..." pattern)
+                    for line in image_ocr_lines:
+                        m = re.match(r"^(\d+)[.)]\s*(.+)", line.strip())
+                        if m:
+                            prompts.append(f"{m.group(1)}. {m.group(2)}")
+
+                # Cross-reference prompts with answers
+                sub = ans["subparts"] if ans else []
+                answer_items = []
+                for sp in sub:
+                    for it in sp["items"]:
+                        answer_items.append({
+                            "label": sp["label"],
+                            "number": it["number"],
+                            "answer": it["answer"],
+                            "alternates": it["alternates"],
+                        })
+
+                out_blocks.append({
+                    "kind": "exercise",
+                    "id": block["id"],
+                    "ansAnchor": block.get("ans_anchor", ""),
+                    "instruction": clean_instruction(block.get("instruction", "")),
+                    "extra": extras,
+                    "prompts": prompts,
+                    "ocrLines": image_ocr_lines,
+                    "freeform": ans["freeform"] if ans else False,
+                    "answerItems": answer_items,
+                    "answerRaw": ans["raw"] if ans else "",
+                    "answerSubparts": sub,
+                })
+                continue
+
+            out_blocks.append(block)
+
+        book_chapters.append({
+            "id": ch["id"],
+            "number": ch["number"],
+            "title": ch["title"],
+            "part": ch.get("part"),
+            "blocks": out_blocks,
+        })
+
+    book = {
+        "courseName": COURSE_NAME,
+        "totalChapters": len(book_chapters),
+        "totalExercises": sum(
+            1 for ch in book_chapters for b in ch["blocks"] if b["kind"] == "exercise"
+        ),
+        "totalVocabTables": sum(
+            1 for ch in book_chapters for b in ch["blocks"] if b["kind"] == "vocab_table"
+        ),
+        "totalVocabCards": len(all_vocab_cards),
+        "parts": parts,
+        "chapters": book_chapters,
+    }
+    OUT_BOOK.write_text(json.dumps(book, ensure_ascii=False))
+
+    # Vocab cards as a separate file (grouped per chapter so they can be seeded
+    # as CourseDecks in the existing schema).
+    vocab_by_chapter: dict = {}
+    for card in all_vocab_cards:
+        vocab_by_chapter.setdefault(card["chapter"], []).append(card)
+    OUT_VOCAB.write_text(json.dumps({
+        "courseName": COURSE_NAME,
+        "chapters": [
+            {
+                "chapter": ch_num,
+                "cards": cards,
+            }
+            for ch_num, cards in sorted(vocab_by_chapter.items())
+        ],
+    }, ensure_ascii=False, indent=2))
+
+    # Summary
+    print(f"Wrote {OUT_BOOK}")
+    print(f"Wrote {OUT_VOCAB}")
+    print(f"Chapters:           {book['totalChapters']}")
+    print(f"Exercises:          {book['totalExercises']}")
+    print(f"Vocab tables:       {book['totalVocabTables']}")
+    print(f"Vocab cards (auto): {book['totalVocabCards']}")
+    if missing_ocr:
+        print(f"Missing OCR for {len(missing_ocr)} images (first 5): {sorted(list(missing_ocr))[:5]}")
+
+    # Validation
+    total_exercises = book["totalExercises"]
+    exercises_with_prompts = sum(
+        1 for ch in book_chapters for b in ch["blocks"]
+        if b["kind"] == "exercise" and (b["prompts"] or b["extra"])
+    )
+    exercises_with_answers = sum(
+        1 for ch in book_chapters for b in ch["blocks"]
+        if b["kind"] == "exercise" and b["answerItems"]
+    )
+    exercises_freeform = sum(
+        1 for ch in book_chapters for b in ch["blocks"]
+        if b["kind"] == "exercise" and b["freeform"]
+    )
+    print(f"Exercises with prompts: {exercises_with_prompts}/{total_exercises}")
+    print(f"Exercises with answers: {exercises_with_answers}/{total_exercises}")
+    print(f"Freeform exercises:     {exercises_freeform}")
+
+
+if __name__ == "__main__":
+    merge()
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+"""Render book.json + ocr.json into a static HTML review page.
+
+The HTML surfaces low-confidence OCR results in red, and shows the parsed
+exercise prompts/answers next to the original image. Designed for rapid
+visual diffing against the source book.
+"""
+
+import html
+import json
+from pathlib import Path
+
+HERE = Path(__file__).resolve().parent
+BOOK = HERE / "book.json"
+OCR = HERE / "ocr.json"
+OUT_HTML = HERE / "review.html"
+EPUB_IMAGES = Path(HERE).parents[2] / "epub_extract" / "OEBPS"
+IMAGE_REL = EPUB_IMAGES.relative_to(HERE.parent) if False else EPUB_IMAGES
+
+
+def load(p: Path) -> dict:
+    return json.loads(p.read_text(encoding="utf-8"))
+
+
+def esc(s: str) -> str:
+    return html.escape(s or "")
+
+
+def img_tag(src: str) -> str:
+    full = (EPUB_IMAGES / src).resolve()
+    return f'<img src="file://{full}" alt="{esc(src)}" class="src"/>'
+
+
+def render() -> None:
+    """Render book.json into the static review page at OUT_HTML.
+
+    Emits one section per chapter and one element per block: headings,
+    paragraphs, key-vocabulary markers, vocab tables (source image plus OCR
+    lines, flagged red when the table's OCR confidence is below 0.85) and
+    exercises (instruction, extra text, OCR lines, parsed prompts, answer
+    key). Block kinds not listed here are silently skipped.
+    """
+    book = load(BOOK)
+    # NOTE(review): ocr.json is loaded when present but `ocr` is not read again below.
+    ocr = load(OCR) if OCR.exists() else {}
+
+    # HTML fragments, joined with newlines at the end.
+    out: list = []
+    out.append("""<!DOCTYPE html>
<html><head><meta charset='utf-8'><title>Book review</title>
<style>
body { font-family: -apple-system, system-ui, sans-serif; margin: 2em; max-width: 1000px; color: #222; }
h1 { color: #c44; }
h2.chapter { background: #eee; padding: 0.5em; border-left: 4px solid #c44; }
h3.heading { color: #555; }
.para { margin: 0.5em 0; }
.vocab-table { background: #fafff0; padding: 0.5em; margin: 0.5em 0; border: 1px solid #bda; border-radius: 6px; }
.ocr-line { font-family: ui-monospace, monospace; font-size: 12px; }
.lowconf { color: #c44; background: #fee; }
.exercise { background: #fff8e8; padding: 0.5em; margin: 0.75em 0; border: 1px solid #cb9; border-radius: 6px; }
.prompt { font-family: ui-monospace, monospace; font-size: 13px; margin: 2px 0; }
.answer { color: #080; font-family: ui-monospace, monospace; font-size: 13px; }
img.src { max-width: 520px; border: 1px solid #ccc; margin: 4px 0; }
.kv { color: #04a; font-weight: bold; }
summary { cursor: pointer; font-weight: bold; color: #666; }
.card-pair { font-family: ui-monospace, monospace; font-size: 12px; }
.card-es { color: #04a; }
.card-en { color: #555; }
.counts { color: #888; font-size: 12px; }
</style></head><body>""")
+    out.append(f"<h1>{esc(book['courseName'])} — review</h1>")
+    out.append(f"<p>{book['totalChapters']} chapters · {book['totalExercises']} exercises · {book['totalVocabTables']} vocab tables · {book['totalVocabCards']} auto-derived cards</p>")
+
+    for ch in book["chapters"]:
+        part = ch.get("part")
+        part_str = f" (Part {part})" if part else ""
+        out.append(f"<h2 class='chapter'>Chapter {ch['number']}: {esc(ch['title'])}{esc(part_str)}</h2>")
+
+        for b in ch["blocks"]:
+            kind = b["kind"]
+            if kind == "heading":
+                # The block's own level selects the tag (<h3>, <h4>, ...).
+                level = b["level"]
+                out.append(f"<h{level} class='heading'>{esc(b['text'])}</h{level}>")
+            elif kind == "paragraph":
+                out.append(f"<p class='para'>{esc(b['text'])}</p>")
+            elif kind == "key_vocab_header":
+                out.append(f"<p class='kv'>★ Key Vocabulary</p>")
+            elif kind == "vocab_table":
+                src = b["sourceImage"]
+                conf = b["ocrConfidence"]
+                # One confidence value per table: every line is flagged or none is.
+                conf_class = "lowconf" if conf < 0.85 else ""
+                out.append(f"<div class='vocab-table'>")
+                out.append(f"<details><summary>vocab {esc(src)} · confidence {conf:.2f} · {b['cardCount']} card(s)</summary>")
+                out.append(img_tag(src))
+                out.append("<div>")
+                for line in b.get("ocrLines", []):
+                    out.append(f"<div class='ocr-line {conf_class}'>{esc(line)}</div>")
+                out.append("</div>")
+                # NOTE(review): derived card pairs are not rendered here; only the
+                # card count appears in the summary line above.
+                out.append("</details></div>")
+            elif kind == "exercise":
+                out.append(f"<div class='exercise'>")
+                out.append(f"<b>Exercise {esc(b['id'])}</b> — <i>{esc(b['instruction'])}</i>")
+                if b.get("extra"):
+                    for e in b["extra"]:
+                        out.append(f"<div class='para'>{esc(e)}</div>")
+                if b.get("ocrLines"):
+                    out.append(f"<details><summary>OCR lines from image</summary>")
+                    for line in b["ocrLines"]:
+                        out.append(f"<div class='ocr-line'>{esc(line)}</div>")
+                    out.append("</details>")
+                if b.get("prompts"):
+                    out.append("<div><b>Parsed prompts:</b></div>")
+                    for p in b["prompts"]:
+                        out.append(f"<div class='prompt'>• {esc(p)}</div>")
+                if b.get("answerItems"):
+                    out.append("<div><b>Answer key:</b></div>")
+                    for a in b["answerItems"]:
+                        # Subpart label (e.g. "A. ") is shown only when the item has one.
+                        label_str = f"{a['label']}. " if a.get("label") else ""
+                        alts = ", ".join(a["alternates"])
+                        alt_str = f"  <span style='color:#999'>(also: {esc(alts)})</span>" if alts else ""
+                        out.append(f"<div class='answer'>{esc(label_str)}{a['number']}. {esc(a['answer'])}{alt_str}</div>")
+                if b.get("freeform"):
+                    out.append("<div style='color:#c44'>(Freeform — answers will vary)</div>")
+                # NOTE(review): confirm that exercise blocks in book.json carry an
+                # "image_refs" key; when it is absent no exercise image is shown.
+                for img_src in b.get("image_refs", []):
+                    out.append(img_tag(img_src))
+                out.append("</div>")
+
+    out.append("</body></html>")
+    OUT_HTML.write_text("\n".join(out), encoding="utf-8")
+    print(f"Wrote {OUT_HTML}")
+
+
+if __name__ == "__main__":
+    render()
@@ -0,0 +1,205 @@
+#!/usr/bin/env python3
+"""Parse ans.xhtml into structured answers.json.
+
+Output schema:
+{
+  "answers": {
+    "1.1": {
+      "id": "1.1",
+      "anchor": "ch1ans1",
+      "chapter": 1,
+      "subparts": [
+        {"label": null, "items": [
+          {"number": 1, "answer": "el", "alternates": []},
+          {"number": 2, "answer": "el", "alternates": []},
+          ...
+        ]}
+      ],
+      "freeform": false,        # true if "Answers will vary"
+      "raw": "..."              # raw text for fallback
+    },
+    "2.4": {                     # multi-part exercise
+      "subparts": [
+        {"label": "A", "items": [...]},
+        {"label": "B", "items": [...]},
+        {"label": "C", "items": [...]}
+      ]
+    }
+  }
+}
+"""
+
+import json
+import re
+from pathlib import Path
+from bs4 import BeautifulSoup, NavigableString
+
+ROOT = Path(__file__).resolve().parents[3] / "epub_extract" / "OEBPS"
+OUT = Path(__file__).resolve().parent / "answers.json"
+
+ANSWER_CLASSES = {"answerq", "answerq1", "answerq2", "answerqa"}
+EXERCISE_ID_RE = re.compile(r"^([0-9]+)\.([0-9]+)$")
+SUBPART_LABEL_RE = re.compile(r"^([A-Z])\b")
+NUMBERED_ITEM_RE = re.compile(r"(?:^|\s)(\d+)\.\s+")
+FREEFORM_PATTERNS = [
+    re.compile(r"answers? will vary", re.IGNORECASE),
+    re.compile(r"answer will vary", re.IGNORECASE),
+]
+OR_TOKEN = "{{OR}}"
+
+
+def render_with_or(p) -> str:
+    """Convert <p> to plain text, replacing 'OR' span markers with sentinel."""
+    soup = BeautifulSoup(str(p), "lxml")
+    # Replace <span class="small">OR</span> with sentinel
+    for span in soup.find_all("span"):
+        cls = span.get("class") or []
+        if "small" in cls and span.get_text(strip=True).upper() == "OR":
+            span.replace_with(f" {OR_TOKEN} ")
+    # Drop pagebreak spans
+    for span in soup.find_all("span", attrs={"epub:type": "pagebreak"}):
+        span.decompose()
+    # Drop emphasis but keep text
+    for tag in soup.find_all(["em", "i", "strong", "b"]):
+        tag.unwrap()
+    text = soup.get_text(separator=" ", strip=False)
+    text = re.sub(r"\s+", " ", text).strip()
+    return text
+
+
+def split_numbered_items(text: str) -> "list[dict]":
+    """Given '1. el 2. la 3. el ...' return [{'number':1,'answer':'el'}, ...]."""
+    # Find positions of N. tokens
+    matches = list(NUMBERED_ITEM_RE.finditer(text))
+    items = []
+    for i, m in enumerate(matches):
+        num = int(m.group(1))
+        start = m.end()
+        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
+        body = text[start:end].strip().rstrip(".,;")
+        # Split alternates on the OR token
+        parts = [p.strip() for p in body.split(OR_TOKEN) if p.strip()]
+        if not parts:
+            continue
+        items.append({
+            "number": num,
+            "answer": parts[0],
+            "alternates": parts[1:],
+        })
+    return items
+
+
+def parse_subpart_label(text: str) -> "tuple[str | None, str]":
+    """Try to peel a leading subpart label (A, B, C) from the text.
+    Returns (label_or_None, remaining_text)."""
+    # Pattern at start: "A " or "A      " (lots of whitespace from <em>A</em><tab>)
+    m = re.match(r"^([A-Z])\s+(?=\d)", text)
+    if m:
+        return m.group(1), text[m.end():]
+    return None, text
+
+
+def parse_answer_paragraph(p, exercise_id: str) -> "list[dict]":
+    """Convert one <p> into a list of subparts.
+    For p.answerq, the text typically starts with the exercise id, then items.
+    For p.answerqa, the text starts with a subpart label letter."""
+    raw = render_with_or(p)
+    # Strip the leading exercise id if present
+    raw = re.sub(rf"^{re.escape(exercise_id)}\s*", "", raw)
+
+    label, body = parse_subpart_label(raw)
+
+    # Detect freeform
+    freeform = any(pat.search(body) for pat in FREEFORM_PATTERNS)
+    if freeform:
+        return [{"label": label, "items": [], "freeform": True, "raw": body}]
+
+    items = split_numbered_items(body)
+    return [{"label": label, "items": items, "freeform": False, "raw": body}]
+
+
+def main() -> None:
+    src = ROOT / "ans.xhtml"
+    soup = BeautifulSoup(src.read_text(encoding="utf-8"), "lxml")
+    body = soup.find("body")
+
+    answers: dict = {}
+    current_chapter = None
+    current_exercise_id: "str | None" = None
+
+    for el in body.find_all(["h3", "p"]):
+        classes = set(el.get("class") or [])
+
+        # Chapter boundary
+        if el.name == "h3" and "h3b" in classes:
+            text = el.get_text(strip=True)
+            m = re.search(r"Chapter\s+(\d+)", text)
+            if m:
+                current_chapter = int(m.group(1))
+                current_exercise_id = None
+            continue
+
+        if el.name != "p" or not (classes & ANSWER_CLASSES):
+            continue
+
+        # Find the exercise-id anchor (only present on p.answerq, not on continuation)
+        a = el.find("a", href=True)
+        ex_link = None
+        if a:
+            link_text = a.get_text(strip=True)
+            if EXERCISE_ID_RE.match(link_text):
+                ex_link = link_text
+
+        if ex_link:
+            current_exercise_id = ex_link
+            anchor = ""
+            href = a.get("href", "")
+            anchor_m = re.search(r"#(ch\d+ans\d+)", href + " " + (a.get("id") or ""))
+            anchor = anchor_m.group(1) if anchor_m else (a.get("id") or "")
+            # Use the anchor's `id` attr if it's the entry id (e.g. "ch1ans1")
+            entry_id = a.get("id") or anchor
+
+            answers[ex_link] = {
+                "id": ex_link,
+                "anchor": entry_id,
+                "chapter": current_chapter,
+                "subparts": [],
+                "freeform": False,
+                "raw": "",
+            }
+            new_subparts = parse_answer_paragraph(el, ex_link)
+            answers[ex_link]["subparts"].extend(new_subparts)
+            answers[ex_link]["raw"] = render_with_or(el)
+            answers[ex_link]["freeform"] = any(sp["freeform"] for sp in new_subparts)
+        else:
+            # Continuation paragraph for current exercise
+            if current_exercise_id and current_exercise_id in answers:
+                more = parse_answer_paragraph(el, current_exercise_id)
+                answers[current_exercise_id]["subparts"].extend(more)
+                if any(sp["freeform"] for sp in more):
+                    answers[current_exercise_id]["freeform"] = True
+
+    out = {"answers": answers}
+    OUT.write_text(json.dumps(out, ensure_ascii=False, indent=2))
+
+    total = len(answers)
+    freeform = sum(1 for v in answers.values() if v["freeform"])
+    multipart = sum(1 for v in answers.values() if len(v["subparts"]) > 1)
+    total_items = sum(
+        len(sp["items"]) for v in answers.values() for sp in v["subparts"]
+    )
+    with_alternates = sum(
+        1 for v in answers.values()
+        for sp in v["subparts"] for it in sp["items"]
+        if it["alternates"]
+    )
+    print(f"Exercises with answers: {total}")
+    print(f"  freeform:             {freeform}")
+    print(f"  multi-part (A/B/C):   {multipart}")
+    print(f"  total numbered items: {total_items}")
+    print(f"  items with alternates:{with_alternates}")
+    print(f"Wrote {OUT}")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,369 @@
+#!/usr/bin/env python3
+"""Parse all chapter XHTMLs (ch*.xhtml) into structured chapters.json.
+
+Output schema:
+{
+  "chapters": [
+    {
+      "id": "ch1",                        # file stem
+      "number": 1,
+      "title": "Nouns, Articles, and Adjectives",
+      "part": 1,                          # part number from the OPF spine, or null
+      "blocks": [                         # ordered content
+        {"kind": "heading", "level": 3, "text": "..."},
+        {"kind": "paragraph", "text": "..."},
+        {"kind": "key_vocab_header", "title": "Key Vocabulary"},
+        {"kind": "vocab_image", "src": "f0010-03.jpg"},
+        {
+          "kind": "exercise",
+          "id": "1.1",
+          "ans_anchor": "ch1ans1",
+          "instruction": "Write the appropriate...",
+          "image_refs": ["f0005-02.jpg"],
+          "prompts": ["1. ..."],          # numbered prompt paragraphs
+          "extra": ["..."]                # optional: word banks / context lines
+        }
+      ]
+    }
+  ],
+  "part_memberships": {"1": [1, 2, 3]}    # part number -> chapter numbers
+}
+"""
+
+import json
+import re
+from pathlib import Path
+from bs4 import BeautifulSoup
+
+ROOT = Path(__file__).resolve().parents[3] / "epub_extract" / "OEBPS"
+OUT = Path(__file__).resolve().parent / "chapters.json"
+
+# Common icon images embedded in headings — ignore when collecting content images
+ICON_IMAGES = {"Common01.jpg", "Common02.jpg", "Common03.jpg", "Common04.jpg", "Common05.jpg"}
+
+EXERCISE_ID_RE = re.compile(r"Exercise\s+([0-9]+\.[0-9]+)")
+ANS_REF_RE = re.compile(r"ch(\d+)ans(\d+)")
+
+
+def clean_text(el) -> str:
+    """Return the plain text of *el* with markup removed and spacing tidied.
+
+    Inline emphasis (<em>/<i>/<strong>/<b>) is dropped, not converted to
+    Markdown: the source has nested/sibling em/strong patterns that CommonMark
+    can't reliably parse, causing markers to leak into the UI. Plain text
+    renders cleanly everywhere.
+
+    Returns "" when *el* is None.
+    """
+    if el is None:
+        return ""
+    # Work on a re-parsed copy; the unwrap/decompose calls below act on it.
+    soup = BeautifulSoup(str(el), "lxml")
+    # Unwrapping every emphasis tag in one pass also covers nested cases such
+    # as <strong><em>X</em></strong>, so no separate flattening is needed.
+    for tag in soup.find_all(["em", "i", "strong", "b"]):
+        tag.unwrap()
+    # Drop pagebreak spans
+    for tag in soup.find_all("span", attrs={"epub:type": "pagebreak"}):
+        tag.decompose()
+    # Replace <br/> with newline (collapsed to a single space further down)
+    for br in soup.find_all("br"):
+        br.replace_with("\n")
+    # Use a separator so adjacent inline tags don't concatenate without spaces
+    # (e.g. "<strong><em>Ir</em></strong> and" would otherwise become "Irand").
+    text = soup.get_text(separator=" ", strip=False)
+    # Collapse runs of whitespace first.
+    text = re.sub(r"\s+", " ", text).strip()
+    # Strip any stray asterisks that sneak through (e.g. author's literal *).
+    text = text.replace("*", "")
+    # De-space punctuation
+    text = re.sub(r"\s+([,.;:!?])", r"\1", text)
+    # Tighten brackets that picked up separator-spaces: "( foo )" -> "(foo)"
+    text = re.sub(r"([(\[])\s+", r"\1", text)
+    text = re.sub(r"\s+([)\]])", r"\1", text)
+    # Collapse any double-spaces
+    text = re.sub(r"  +", " ", text).strip()
+    return text
+
+
+def is_exercise_header(h) -> bool:
+    """Tell whether *h* is an exercise heading.
+
+    An exercise heading is an h3/h4 whose first <a href=...> points into
+    ans.xhtml (the "Exercise N.N" link to the answer key).
+    Chapters 1-16 use h3.h3k; chapters 17+ use h4.h4.
+    """
+    if h.name in ("h3", "h4"):
+        link = h.find("a", href=True)
+        return link is not None and "ans.xhtml" in link["href"]
+    return False
+
+
+def is_key_vocab_header(h) -> bool:
+    """Tell whether *h* is a 'Key Vocabulary' heading.
+
+    True for an h3/h4 whose text contains 'Key Vocabulary' and which holds no
+    link into ans.xhtml (that link would make it an exercise heading).
+    """
+    if h.name not in ("h3", "h4"):
+        return False
+    if "Key Vocabulary" not in h.get_text(strip=True):
+        return False
+    answer_link = h.find("a", href=lambda v: v and "ans.xhtml" in v)
+    return answer_link is None
+
+
+def extract_image_srcs(parent) -> list:
+    """Collect the `src` of every <img> under *parent*, in document order.
+
+    Images with an empty/missing src, or whose file name is one of the
+    ICON_IMAGES, are left out.
+    """
+    candidates = (img.get("src", "") for img in parent.find_all("img"))
+    return [
+        src for src in candidates
+        if src and Path(src).name not in ICON_IMAGES
+    ]
+
+
+def parse_chapter(path: Path) -> "dict | None":
+    """Parse one chapter file into structured blocks.
+
+    Reads *path* as UTF-8 XHTML and walks its h2-h5 and <p> elements in
+    document order, emitting blocks of kind "exercise", "key_vocab_header",
+    "heading", "vocab_image" and "paragraph".
+
+    Returns a dict with "id" (the file stem), "number", "title" and "blocks",
+    or None when the document has no <body>.
+    """
+    html = path.read_text(encoding="utf-8")
+    soup = BeautifulSoup(html, "lxml")
+    body = soup.find("body")
+    if body is None:
+        return None
+
+    # Chapter number + title
+    number = None
+    title = ""
+    h2s = body.find_all("h2")
+    for h2 in h2s:
+        classes = h2.get("class") or []
+        # Use a separator so consecutive inline tags don't concatenate
+        # (e.g. "<strong><em>Ir</em></strong> and the Future" → "Ir and the Future")
+        text_with_sep = re.sub(r"\s+", " ", h2.get_text(" ", strip=True))
+        # Strip spaces that were inserted before punctuation
+        text_with_sep = re.sub(r"\s+([,.;:!?])", r"\1", text_with_sep).strip()
+        if "h2c" in classes and text_with_sep.isdigit():
+            number = int(text_with_sep)
+        # Chapters 1–16 use h2c1; chapters 17+ use h2-c
+        elif ("h2c1" in classes or "h2-c" in classes) and not title:
+            title = text_with_sep
+    if number is None:
+        # Try id on chapter header (ch1 → 1)
+        for h2 in h2s:
+            id_ = h2.get("id", "")
+            m = re.match(r"ch(\d+)", id_)
+            if m:
+                number = int(m.group(1))
+                break
+
+    chapter_id = path.stem  # ch1, ch2, ...
+
+    # Walk section content in document order
+    section = body.find("section") or body
+    blocks: list = []
+    # The most recent exercise block while its paragraphs are still being
+    # collected; reset to None by the next non-exercise heading.
+    pending_instruction = None  # holds italic paragraph following an exercise header
+
+    for el in section.descendants:
+        # Text nodes (and other non-tag nodes) have no tag name.
+        if el.name is None:
+            continue
+
+        classes = el.get("class") or []
+
+        # Skip nested tags already captured via parent processing
+        # We operate only on direct h2/h3/h4/h5/p elements
+        if el.name not in ("h2", "h3", "h4", "h5", "p"):
+            continue
+
+        # Exercise header detection (h3 in ch1-16, h4 in ch17+)
+        if is_exercise_header(el):
+            a = el.find("a", href=True)
+            href = a["href"] if a else ""
+            m = EXERCISE_ID_RE.search(el.get_text())
+            ex_id = m.group(1) if m else ""
+            anchor_m = ANS_REF_RE.search(href)
+            ans_anchor = anchor_m.group(0) if anchor_m else ""
+            blocks.append({
+                "kind": "exercise",
+                "id": ex_id,
+                "ans_anchor": ans_anchor,
+                "instruction": "",
+                "image_refs": [],
+                "prompts": []
+            })
+            # Following paragraphs are attached to this block (same dict object).
+            pending_instruction = blocks[-1]
+            continue
+
+        # Key Vocabulary header
+        if is_key_vocab_header(el):
+            # The title is a fixed string; the heading's own text is not used.
+            blocks.append({"kind": "key_vocab_header", "title": "Key Vocabulary"})
+            pending_instruction = None
+            continue
+
+        # Other headings
+        if el.name in ("h2", "h3", "h4", "h5"):
+            if el.name == "h2":
+                # Skip the chapter-number/chapter-title h2s we already captured
+                continue
+            txt = clean_text(el)
+            if txt:
+                blocks.append({
+                    "kind": "heading",
+                    "level": int(el.name[1]),
+                    "text": txt,
+                })
+            # Any h3-h5 heading ends the current exercise, even an empty one.
+            pending_instruction = None
+            continue
+
+        # Paragraphs
+        if el.name == "p":
+            imgs = extract_image_srcs(el)
+            text = clean_text(el)
+            p_classes = set(classes)
+
+            # Skip pure blank-line class ("nump" = underscore lines under number prompts)
+            if p_classes & {"nump", "numpa"} and not text:
+                continue
+
+            # Exercise prompt: <p class="number">1.  Prompt text</p>
+            # Also number1, number2 (continuation numbering), numbera, numbert
+            if pending_instruction is not None and p_classes & {"number", "number1", "number2", "numbera", "numbert"}:
+                if text:
+                    pending_instruction["prompts"].append(text)
+                continue
+
+            # Image container for a pending exercise
+            if pending_instruction is not None and imgs and not text:
+                pending_instruction["image_refs"].extend(imgs)
+                continue
+
+            # Instruction line right after the exercise header
+            if pending_instruction is not None and text and not imgs and not pending_instruction["instruction"]:
+                pending_instruction["instruction"] = text
+                continue
+
+            # While in pending-exercise state, extra text paragraphs are word
+            # banks / context ("from the following list:" etc) — keep pending alive.
+            if pending_instruction is not None and text and not imgs:
+                pending_instruction.setdefault("extra", []).append(text)
+                continue
+
+            # From here on: either no exercise is pending, or the paragraph
+            # mixes images and text (which none of the branches above accept).
+
+            # Paragraphs that contain an image belong to vocab/key-vocab callouts
+            if imgs and not text:
+                for src in imgs:
+                    blocks.append({"kind": "vocab_image", "src": src})
+                continue
+
+            # Mixed paragraph: image with caption
+            if imgs and text:
+                for src in imgs:
+                    blocks.append({"kind": "vocab_image", "src": src})
+                blocks.append({"kind": "paragraph", "text": text})
+                continue
+
+            # Plain paragraph — outside any exercise
+            if text:
+                blocks.append({"kind": "paragraph", "text": text})
+
+    return {
+        "id": chapter_id,
+        "number": number,
+        "title": title,
+        "blocks": blocks,
+    }
+
+
+def assign_parts(chapters: list, part_files: "dict[int, list[int]]") -> None:
+    """Set each chapter's "part" key from the part -> chapter-numbers mapping.
+
+    Mutates the chapter dicts in place. When a chapter number is listed under
+    several parts, the part that comes last in *part_files* wins. Chapters
+    listed under no part get "part": None unless they already carry a "part".
+    """
+    for chapter in chapters:
+        for part_num, chapter_nums in part_files.items():
+            if chapter["number"] in chapter_nums:
+                chapter["part"] = part_num
+        chapter.setdefault("part", None)
+
+
+def _group_chapters_by_part(hrefs) -> "dict[int, list[int]]":
+    """Group chapter numbers under the most recent partN.xhtml seen in *hrefs*.
+
+    Chapters that appear before the first part file are not recorded.
+    """
+    memberships: dict = {}
+    current_part: "int | None" = None
+    for href in hrefs:
+        m_part = re.match(r"part(\d+)\.xhtml", href)
+        m_ch = re.match(r"ch(\d+)\.xhtml", href)
+        if m_part:
+            current_part = int(m_part.group(1))
+            memberships.setdefault(current_part, [])
+        elif m_ch and current_part is not None:
+            memberships[current_part].append(int(m_ch.group(1)))
+    return memberships
+
+
+def read_part_memberships() -> "dict[int, list[int]]":
+    """Derive part→chapter grouping from the OPF spine order.
+
+    Uses the first *.opf file under ROOT. When the package has a <spine>, the
+    reading order comes from its itemrefs (resolved through the manifest);
+    otherwise the manifest <item> order is used. Returns {} when no OPF file
+    exists.
+    """
+    opf = next(ROOT.glob("*.opf"), None)
+    if opf is None:
+        return {}
+    soup = BeautifulSoup(opf.read_text(encoding="utf-8"), "xml")
+    items = soup.find_all("item")
+
+    spine = soup.find("spine")
+    if spine is None:
+        # No spine: fall back to manifest order.
+        return _group_chapters_by_part(item.get("href", "") for item in items)
+
+    # Index the manifest once instead of searching it for every itemref.
+    # setdefault keeps the first item when an id is (incorrectly) repeated.
+    href_by_id: dict = {}
+    for item in items:
+        item_id = item.get("id")
+        if item_id is not None:
+            href_by_id.setdefault(item_id, item.get("href", ""))
+
+    order = []
+    for ref in spine.find_all("itemref"):
+        idref = ref.get("idref")
+        if idref in href_by_id:
+            order.append(href_by_id[idref])
+    return _group_chapters_by_part(order)
+
+
+def main() -> None:
+    """Parse every chN.xhtml under ROOT, write chapters.json, print a summary."""
+    # Keep only files whose stem starts with ch<digits>; a stray match of the
+    # glob (e.g. "chapter.xhtml") is ignored rather than crashing the sort key.
+    numbered = []
+    for path in ROOT.glob("ch*.xhtml"):
+        m = re.match(r"ch(\d+)", path.stem)
+        if m:
+            numbered.append((int(m.group(1)), path))
+    numbered.sort(key=lambda pair: (pair[0], pair[1].name))
+
+    chapters = []
+    for _, path in numbered:
+        ch = parse_chapter(path)
+        if ch:
+            chapters.append(ch)
+
+    part_memberships = read_part_memberships()
+    assign_parts(chapters, part_memberships)
+
+    out = {
+        "chapters": chapters,
+        "part_memberships": part_memberships,
+    }
+    # ensure_ascii=False emits raw accented characters, so pin the encoding
+    # instead of relying on the platform default.
+    OUT.write_text(json.dumps(out, ensure_ascii=False, indent=2), encoding="utf-8")
+
+    # Summary
+    all_blocks = [b for ch in chapters for b in ch["blocks"]]
+    exercises = [b for b in all_blocks if b["kind"] == "exercise"]
+    ex_total = len(exercises)
+    ex_with_prompts = sum(1 for b in exercises if b["prompts"])
+    ex_with_images = sum(1 for b in exercises if b["image_refs"])
+    ex_empty = sum(
+        1 for b in exercises if not b["prompts"] and not b["image_refs"]
+    )
+    para_total = sum(1 for b in all_blocks if b["kind"] == "paragraph")
+    vocab_img_total = sum(1 for b in all_blocks if b["kind"] == "vocab_image")
+    print(f"Chapters:             {len(chapters)}")
+    print(f"Exercises total:      {ex_total}")
+    print(f"  with text prompts:  {ex_with_prompts}")
+    print(f"  with image prompts: {ex_with_images}")
+    print(f"  empty:              {ex_empty}")
+    print(f"Paragraphs:           {para_total}")
+    print(f"Vocab images:         {vocab_img_total}")
+    print(f"Parts: {part_memberships}")
+    print(f"Wrote {OUT}")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+"""Extract clean text from the PDF source and map each PDF page to the
+book's printed page number.
+
+Output: pdf_text.json
+{
+  "pdfPageCount": 806,
+  "bookPages": {
+    "3": { "text": "...", "pdfIndex": 29 },
+    "4": { ... },
+    ...
+  },
+  "unmapped": [pdfIndex values seen before any book page number was detected],
+  "inferredPages": <count of pages whose number was inferred as previous + 1>
+}
+"""
+
+import json
+import re
+from pathlib import Path
+import pypdf
+
+HERE = Path(__file__).resolve().parent
+PDF = next(
+    Path(__file__).resolve().parents[3].glob("Complete Spanish Step-By-Step*.pdf"),
+    None,
+)
+OUT = HERE / "pdf_text.json"
+
+ROMAN_RE = re.compile(r"^[ivxlcdmIVXLCDM]+$")
+# Match a page number on its own line at top/bottom of the page.
+# The book uses Arabic numerals for main chapters (e.g., "3") and Roman for front matter.
+PAGE_NUM_LINE_RE = re.compile(r"^\s*(\d{1,4})\s*$", re.MULTILINE)
+
+
+def detect_book_page(text: str) -> "int | None":
+    """Return the printed page number found at the edge of a page, if any.
+
+    Only the first two and last two non-blank lines are inspected; the first
+    one consisting solely of 1-4 digits is returned as an int. Returns None
+    when none of those lines is a bare number.
+    """
+    nonblank = [ln.strip() for ln in text.splitlines() if ln.strip()]
+    edge_lines = nonblank[:2] + nonblank[-2:]
+    hits = (re.match(r"^(\d{1,4})$", line) for line in edge_lines)
+    return next((int(hit.group(1)) for hit in hits if hit), None)
+
+
+def main() -> None:
+    """Extract text per PDF page, key it by printed page number, write pdf_text.json.
+
+    Pages without a detectable number are assumed to follow the previous page
+    (N + 1) and counted as inferred; pages seen before any number has been
+    detected are listed under "unmapped". If two PDF pages resolve to the same
+    book page, the later one overwrites the earlier.
+    """
+    if PDF is None:
+        print("No PDF found in project root")
+        return
+
+    print(f"Reading {PDF.name}")
+    reader = pypdf.PdfReader(str(PDF))
+    pages = reader.pages
+    print(f"PDF has {len(pages)} pages")
+
+    by_book_page: dict = {}
+    unmapped: list = []
+    last_seen: "int | None" = None
+    missed_count = 0
+
+    for i, page in enumerate(pages):
+        text = page.extract_text() or ""
+        detected = detect_book_page(text)
+
+        if detected is not None:
+            book_page = detected
+            # Strip only the standalone line(s) carrying the detected page
+            # number. Removing every digits-only line would also delete real
+            # content such as numeric answers.
+            cleaned = re.sub(rf"(?m)^\s*0*{detected}\s*$", "", text).strip()
+        elif last_seen is not None:
+            # Carry forward sequence: if we saw page N last, assume N+1.
+            book_page = last_seen + 1
+            missed_count += 1
+            # No page number was found on this page, so there is nothing to strip.
+            cleaned = text.strip()
+        else:
+            unmapped.append(i)
+            continue
+
+        last_seen = book_page
+        by_book_page[str(book_page)] = {
+            "text": cleaned,
+            "pdfIndex": i,
+        }
+
+    out = {
+        "pdfPageCount": len(pages),
+        "bookPages": by_book_page,
+        "unmapped": unmapped,
+        "inferredPages": missed_count,
+    }
+    # ensure_ascii=False emits raw accented characters; pin the encoding.
+    OUT.write_text(json.dumps(out, ensure_ascii=False), encoding="utf-8")
+    print(f"Mapped {len(by_book_page)} book pages; {missed_count} inferred; {len(unmapped)} unmapped")
+    print(f"Wrote {OUT}")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,249 @@
+#!/usr/bin/env python3
+"""Apply high-confidence auto-fixes from vocab_validation.json to vocab_cards.json.
+
+Auto-fix rules (conservative):
+  1. A flagged word is replaced by its closest suggestion when that suggestion
+     is within an edit distance of 2; suggestions sharing the word's first
+     letter are preferred (high-confidence character swap).
+  2. If a card is detected as reversed (Spanish on EN side, English on ES side),
+     swap front/back.
+
+Flagged cards whose sides do not look like front=Spanish / back=English are
+removed from the deck and written to quarantined_cards.json.
+Cards that still have flagged words with no confident replacement are listed
+in manual_review.json.
+"""
+
+import json
+import re
+import unicodedata
+from pathlib import Path
+
+HERE = Path(__file__).resolve().parent
+VOCAB = HERE / "vocab_cards.json"
+VALIDATION = HERE / "vocab_validation.json"
+OUT_VOCAB = HERE / "vocab_cards.json"
+OUT_REVIEW = HERE / "manual_review.json"
+OUT_QUARANTINE = HERE / "quarantined_cards.json"
+
+
+def _strip_accents(s: str) -> str:
+    """Return *s* NFD-decomposed with all combining marks (category Mn) removed."""
+    decomposed = unicodedata.normalize("NFD", s)
+    kept = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
+    return "".join(kept)
+
+
+def _levenshtein(a: str, b: str) -> int:
+    """Return the edit distance (insert / delete / substitute) between *a* and *b*."""
+    if a == b:
+        return 0
+    if not a or not b:
+        # One side is empty: the distance is the length of the other.
+        return max(len(a), len(b))
+    # Single-row dynamic programme: row[j] holds the distance between the
+    # prefix of `a` processed so far and b[:j].
+    row = list(range(len(b) + 1))
+    for i, ch_a in enumerate(a, start=1):
+        diagonal = row[0]
+        row[0] = i
+        for j, ch_b in enumerate(b, start=1):
+            above = row[j]
+            substitution = diagonal + (0 if ch_a == ch_b else 1)
+            row[j] = min(above + 1, row[j - 1] + 1, substitution)
+            diagonal = above
+    return row[-1]
+
+
+SPANISH_ACCENT_RE = re.compile(r"[áéíóúñüÁÉÍÓÚÑÜ¿¡]")
+SPANISH_ARTICLES = {"el", "la", "los", "las", "un", "una", "unos", "unas"}
+ENGLISH_STARTERS = {"the", "a", "an", "to", "my", "his", "her", "our", "their"}
+
+
+def language_score(s: str) -> "tuple[int, int]":
+    """Score how Spanish and how English *s* looks; returns (es_score, en_score).
+
+    Spanish: +3 for any accented/ñ/ü/¿/¡ character, +2 for a leading Spanish
+    article, +1 per word with a Spanish-looking ending.
+    English: +2 for a leading English starter word, +1 per word with an
+    English-looking ending.
+    """
+    es = 3 if SPANISH_ACCENT_RE.search(s) else 0
+    en = 0
+    tokens = [w.strip(",.;:") for w in s.lower().split()]
+    if not tokens:
+        return (es, en)
+
+    lead = tokens[0]
+    if lead in SPANISH_ARTICLES:
+        es += 2
+    if lead in ENGLISH_STARTERS:
+        en += 2
+
+    # Ending heuristics run over every word, the first one included. A token
+    # that was only punctuation is now "" and matches neither tuple.
+    es += sum(1 for tok in tokens if tok.endswith(("ción", "sión", "dad", "tud")))
+    en += sum(
+        1 for tok in tokens
+        if tok.endswith(("ing", "tion", "ness", "ment", "able", "ly"))
+    )
+    return (es, en)
+
+
+def is_reversed(front: str, back: str) -> bool:
+    """Report a swapped card: front scores as English while back scores as Spanish."""
+    front_es, front_en = language_score(front)
+    back_es, back_en = language_score(back)
+    front_leans_english = front_en > front_es
+    back_leans_spanish = back_es > back_en
+    return front_leans_english and back_leans_spanish
+
+
+def best_replacement(word: str, suggestions: list) -> "str | None":
+    """Choose the closest safe suggestion for *word*, or None to leave it alone.
+
+    Suggestions starting with the same letter as *word* (case-insensitive) are
+    considered on their own when any exist; otherwise all suggestions are.
+    Only candidates at a case-insensitive edit distance of 1 or 2 qualify; the
+    smallest distance wins and ties go to the earliest suggestion.
+    """
+    if not suggestions:
+        return None
+    same_initial = [
+        s for s in suggestions
+        if s and word and s[0].lower() == word[0].lower()
+    ]
+    pool = same_initial or suggestions
+
+    target = word.lower()
+    scored = [(_levenshtein(target, s.lower()), s) for s in pool]
+    # Distance 0 is not a fix; distance > 2 changes too much.
+    viable = [(dist, s) for dist, s in scored if 1 <= dist <= 2]
+    if not viable:
+        return None
+    # min() returns the first minimal element, i.e. the earliest suggestion.
+    return min(viable, key=lambda pair: pair[0])[1]
+
+
+def side_language_match(text: str, expected_side: str) -> bool:
+    """Check that *text* looks like the language named by *expected_side*.
+
+    "es" demands a strictly higher Spanish score (a clear Spanish signal);
+    "en" also accepts a tie, since plain English often has no strong signal.
+    Any other *expected_side* yields False. Used to avoid running a spell-fix
+    for one language over text in the other on a mis-paired card.
+    """
+    es_score, en_score = language_score(text)
+    verdicts = {
+        "es": es_score > en_score,
+        "en": en_score >= es_score,
+    }
+    return verdicts.get(expected_side, False)
+
+
+def apply_word_fixes(text: str, bad_words: list, expected_side: str) -> "tuple[str, list]":
+    """Apply word-level corrections inside a string.
+
+    Skips fixes entirely when the side's actual language doesn't match the
+    dictionary used, to avoid corrupting mis-paired cards.
+
+    Each entry of *bad_words* is a dict with "word" and "suggestions". At most
+    the first standalone occurrence of each word is replaced.
+
+    Returns (new_text, applied) where *applied* lists {"from", "to"} dicts for
+    the replacements actually made.
+    """
+    if not side_language_match(text, expected_side):
+        return (text, [])
+
+    # "Any Unicode letter": word characters minus digits and underscore. A
+    # hand-written range such as Á-ú misses ü and includes × and ÷.
+    letter = r"[^\W\d_]"
+
+    new_text = text
+    applied = []
+    for bw in bad_words:
+        word = bw["word"]
+        replacement = best_replacement(word, bw["suggestions"])
+        if replacement is None:
+            continue
+        # Match standalone word including the (possibly-omitted) trailing period:
+        # `Uds` in the text should be replaced with `Uds.` even when adjacent to `.`.
+        # Only swallow that period when the replacement brings its own one;
+        # otherwise a sentence-final "." after the word would be deleted.
+        optional_period = r"\.?" if replacement.endswith(".") else ""
+        pattern = re.compile(
+            rf"(?<!{letter}){re.escape(word)}{optional_period}(?!{letter})"
+        )
+        if pattern.search(new_text):
+            # A function replacement inserts the suggestion literally, so
+            # backslashes in it are not read as group references.
+            new_text = pattern.sub(lambda _m: replacement, new_text, count=1)
+            applied.append({"from": word, "to": replacement})
+    return (new_text, applied)
+
+
+def _unresolved_words(bad_words: list, applied: list) -> list:
+    """Return flagged words that were not replaced and have no confident suggestion."""
+    fixed = {a["from"] for a in applied}
+    return [
+        bw for bw in bad_words
+        if bw["word"] not in fixed
+        and best_replacement(bw["word"], bw["suggestions"]) is None
+    ]
+
+
+def main() -> None:
+    """Auto-fix vocab_cards.json in place using vocab_validation.json.
+
+    For every card: swap front/back when the card looks reversed; then, for
+    flagged cards only, quarantine cards whose sides are not front=Spanish /
+    back=English, apply word-level fixes to the rest, and record cards with
+    words that still have no confident replacement for manual review.
+
+    Writes vocab_cards.json, manual_review.json and quarantined_cards.json.
+    """
+    vocab_data = json.loads(VOCAB.read_text(encoding="utf-8"))
+    val_data = json.loads(VALIDATION.read_text(encoding="utf-8"))
+
+    # Index validation by (chapter, front, back, sourceImage) for lookup
+    val_index: dict = {
+        (f["chapter"], f["front"], f["back"], f["sourceImage"]): f
+        for f in val_data["flags"]
+    }
+
+    # Walk the cards in place
+    auto_fixed_word = 0
+    auto_swapped = 0
+    manual_review_cards = []
+    quarantined_cards = []
+
+    for ch in vocab_data["chapters"]:
+        kept_cards = []
+        for card in ch["cards"]:
+            # The key uses the card as stored, i.e. before any swap below.
+            key = (ch["chapter"], card["front"], card["back"], card.get("sourceImage", ""))
+            flag = val_index.get(key)
+
+            # 1) Reversal swap (apply even when not flagged)
+            if is_reversed(card["front"], card["back"]):
+                card["front"], card["back"] = card["back"], card["front"]
+                auto_swapped += 1
+                # NOTE(review): flag["badFront"] / flag["badBack"] were keyed on
+                # the pre-swap sides and are still applied to the post-swap
+                # front/back below — confirm that is intended.
+
+            if flag is None:
+                kept_cards.append(card)
+                continue
+
+            # Quarantine obvious mis-pairs: both sides same language OR language mismatch
+            fes, fen = language_score(card["front"])
+            bes, ben = language_score(card["back"])
+            front_lang = "es" if fes > fen else ("en" if fen > fes else "unknown")
+            back_lang = "es" if bes > ben else ("en" if ben > bes else "unknown")
+            # A good card has front=es, back=en. Anything else when the card is
+            # flagged is almost always a column-pairing error.
+            if front_lang != "es" or back_lang != "en":
+                quarantined_cards.append({
+                    "chapter": ch["chapter"],
+                    "front": card["front"],
+                    "back": card["back"],
+                    "sourceImage": card.get("sourceImage", ""),
+                    "reason": f"language-mismatch front={front_lang} back={back_lang}",
+                })
+                continue
+
+            # 2) Word-level fixes (language-aware)
+            new_front, applied_front = apply_word_fixes(card["front"], flag["badFront"], "es")
+            new_back, applied_back = apply_word_fixes(card["back"], flag["badBack"], "en")
+            card["front"] = new_front
+            card["back"] = new_back
+            auto_fixed_word += len(applied_front) + len(applied_back)
+
+            # If after auto-fix there are STILL flagged words with no
+            # confident replacement, flag for manual review.
+            unresolved_front = _unresolved_words(flag["badFront"], applied_front)
+            unresolved_back = _unresolved_words(flag["badBack"], applied_back)
+            if unresolved_front or unresolved_back:
+                manual_review_cards.append({
+                    "chapter": ch["chapter"],
+                    "front": card["front"],
+                    "back": card["back"],
+                    "sourceImage": card.get("sourceImage", ""),
+                    "unresolvedFront": unresolved_front,
+                    "unresolvedBack": unresolved_back,
+                })
+            kept_cards.append(card)
+
+        ch["cards"] = kept_cards
+
+    # The inputs are read as UTF-8 and ensure_ascii=False emits raw accented
+    # characters, so write UTF-8 explicitly rather than the platform default.
+    OUT_VOCAB.write_text(
+        json.dumps(vocab_data, ensure_ascii=False, indent=2), encoding="utf-8"
+    )
+    OUT_REVIEW.write_text(json.dumps({
+        "totalManualReview": len(manual_review_cards),
+        "cards": manual_review_cards,
+    }, ensure_ascii=False, indent=2), encoding="utf-8")
+
+    OUT_QUARANTINE.write_text(json.dumps({
+        "totalQuarantined": len(quarantined_cards),
+        "cards": quarantined_cards,
+    }, ensure_ascii=False, indent=2), encoding="utf-8")
+
+    total_cards = sum(len(c["cards"]) for c in vocab_data["chapters"])
+    print(f"Active cards (after quarantine): {total_cards}")
+    print(f"Auto-swapped (reversed):         {auto_swapped}")
+    print(f"Auto-fixed words:                {auto_fixed_word}")
+    print(f"Quarantined (mis-paired):        {len(quarantined_cards)}")
+    print(f"Cards needing manual review:     {len(manual_review_cards)}")
+    print(f"Wrote {OUT_VOCAB}")
+    print(f"Wrote {OUT_REVIEW}")
+    print(f"Wrote {OUT_QUARANTINE}")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,147 @@
+#!/usr/bin/env python3
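+"""Merge repaired_cards.json into vocab_cards.json.
+
+Rules:
+  1. New pairs are added to their chapter's deck if they don't duplicate an existing pair.
+  2. Duplicate detection uses (chapter, norm(front), norm(back)), where norm
+     lowercases, strips accents and trims whitespace.
+  3. Pairs whose front starts with an English starter word while the back looks
+     Spanish (accented characters or a leading Spanish article) are swapped
+     back (pairer got orientation wrong). Pairs with a side shorter than two
+     characters, or whose English side alone carries Spanish accents, are dropped.
+  4. Emits integrate_report.json with counts.
+"""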
+
+import json
+import re
+import unicodedata
+from pathlib import Path
+
+HERE = Path(__file__).resolve().parent
+VOCAB = HERE / "vocab_cards.json"
+REPAIRED = HERE / "repaired_cards.json"
+QUARANTINED = HERE / "quarantined_cards.json"
+OUT = HERE / "vocab_cards.json"
+REPORT = HERE / "integrate_report.json"
+
+
+def _strip_accents(s: str) -> str:
+    """Decompose *s* (NFD) and drop every combining mark (category Mn)."""
+    pieces = []
+    for ch in unicodedata.normalize("NFD", s):
+        if unicodedata.category(ch) == "Mn":
+            continue
+        pieces.append(ch)
+    return "".join(pieces)
+
+
+def norm(s: str) -> str:
+    """Return the comparison form of *s*: lowercased, accents stripped, trimmed."""
+    lowered = s.lower()
+    unaccented = _strip_accents(lowered)
+    return unaccented.strip()
+
+
+SPANISH_ACCENT_RE = re.compile(r"[áéíóúñüÁÉÍÓÚÑÜ¿¡]")
+SPANISH_ARTICLES = {"el", "la", "los", "las", "un", "una", "unos", "unas"}
+ENGLISH_STARTERS = {"the", "a", "an", "to", "my", "his", "her", "our", "their"}
+
+
+def looks_swapped(front: str, back: str) -> bool:
+    """Report whether the pair is reversed (English front, Spanish back).
+
+    The front counts as English when its first word is an English starter; the
+    back counts as Spanish when it contains a Spanish accent character or its
+    first word is a Spanish article. Either side being blank yields False.
+    """
+    front_words = front.lower().split()
+    back_words = back.lower().split()
+    if not (front_words and back_words):
+        return False
+    if front_words[0].strip(",.;:") not in ENGLISH_STARTERS:
+        return False
+    if SPANISH_ACCENT_RE.search(back) is not None:
+        return True
+    return back_words[0].strip(",.;:") in SPANISH_ARTICLES
+
+
+def looks_good(pair: dict) -> bool:
+    """Sanity-check a repaired {"es", "en"} pair before it enters the deck.
+
+    Rejects pairs where either side, once trimmed, is shorter than two
+    characters, and pairs where only the "en" side contains Spanish accent
+    characters (the pair is likely swapped).
+    """
+    es = pair["es"].strip()
+    en = pair["en"].strip()
+    # A length check below 2 also covers the empty string.
+    if len(es) < 2 or len(en) < 2:
+        return False
+    accent_only_on_en = (
+        SPANISH_ACCENT_RE.search(en) is not None
+        and SPANISH_ACCENT_RE.search(es) is None
+    )
+    return not accent_only_on_en
+
+
+def main() -> None:
+    """Add repaired pairs to vocab_cards.json and write integrate_report.json.
+
+    Each repaired image is mapped to a chapter through quarantined_cards.json.
+    Its pairs are re-oriented when they look swapped, filtered by looks_good,
+    de-duplicated against the chapter's existing cards, and appended to that
+    chapter's deck (the deck is created when the chapter has none).
+    """
+    vocab = json.loads(VOCAB.read_text(encoding="utf-8"))
+    repaired = json.loads(REPAIRED.read_text(encoding="utf-8"))
+    quarantined = json.loads(QUARANTINED.read_text(encoding="utf-8"))
+
+    # Map image → chapter (from the quarantine list — all images here belong to the
+    # chapter they were quarantined from).
+    image_chapter: dict = {
+        c["sourceImage"]: c["chapter"] for c in quarantined["cards"]
+    }
+
+    # Build existing key set
+    chapter_map: dict = {c["chapter"]: c for c in vocab["chapters"]}
+    existing_keys = {
+        (c["chapter"], norm(card["front"]), norm(card["back"]))
+        for c in vocab["chapters"]
+        for card in c["cards"]
+    }
+
+    added_per_image: dict = {}
+    dropped_sanity = 0
+    dropped_dup = 0
+    skipped_images = []
+
+    for image_name, data in repaired["byImage"].items():
+        ch_num = image_chapter.get(image_name)
+        if ch_num is None:
+            # Image not in quarantine list (shouldn't happen): skip it, but
+            # remember it so the summary can say so.
+            skipped_images.append(image_name)
+            continue
+        deck = chapter_map.setdefault(ch_num, {"chapter": ch_num, "cards": []})
+        added = 0
+        for p in data.get("pairs", []):
+            es = p["es"].strip()
+            en = p["en"].strip()
+            if looks_swapped(es, en):
+                es, en = en, es
+            pair = {"es": es, "en": en}
+            if not looks_good(pair):
+                dropped_sanity += 1
+                continue
+            key = (ch_num, norm(pair["es"]), norm(pair["en"]))
+            if key in existing_keys:
+                dropped_dup += 1
+                continue
+            existing_keys.add(key)
+            deck["cards"].append({
+                "front": pair["es"],
+                "back": pair["en"],
+                "chapter": ch_num,
+                "chapterTitle": "",
+                "section": "",
+                "sourceImage": image_name,
+            })
+            added += 1
+        if added:
+            added_per_image[image_name] = added
+
+    # If any new chapter was created, ensure ordered insertion
+    vocab["chapters"] = sorted(chapter_map.values(), key=lambda c: c["chapter"])
+    # Inputs are read as UTF-8 and ensure_ascii=False emits raw accented
+    # characters, so write UTF-8 explicitly rather than the platform default.
+    OUT.write_text(json.dumps(vocab, ensure_ascii=False, indent=2), encoding="utf-8")
+
+    total_added = sum(added_per_image.values())
+    report = {
+        "totalRepairedInput": repaired["totalPairs"],
+        "added": total_added,
+        "dropped_duplicate": dropped_dup,
+        "dropped_sanity": dropped_sanity,
+        "addedPerImage": added_per_image,
+    }
+    REPORT.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")
+    print(f"Repaired pairs in:          {repaired['totalPairs']}")
+    print(f"Added to deck:              {total_added}")
+    print(f"Dropped as duplicate:       {dropped_dup}")
+    print(f"Dropped as swapped/bad:     {dropped_sanity}")
+    if skipped_images:
+        print(f"Skipped images (no chapter): {len(skipped_images)}")
+    print(f"Wrote {OUT}")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,403 @@
+#!/usr/bin/env python3
+"""Second-pass extractor: use PDF OCR (from ocr_pdf.swift) as a supplementary
+source of clean text, then re-build book.json with PDF-derived content where it
+improves on the EPUB's image-based extraction.
+
+Inputs:
+  chapters.json  — EPUB structural extraction (narrative text + exercise prompts + image refs)
+  answers.json   — EPUB answer key
+  ocr.json       — EPUB image OCR (first pass)
+  pdf_ocr.json   — PDF page-level OCR (this pass, higher DPI + cleaner)
+
+Outputs:
+  book.json         — merged book used by the app
+  vocab_cards.json  — derived vocabulary flashcards
+"""
+
+import json
+import re
+import sys
+from pathlib import Path
+
+HERE = Path(__file__).resolve().parent
+sys.path.insert(0, str(HERE))
+from build_book import (  # reuse the helpers defined in build_book.py
+    COURSE_NAME,
+    build_vocab_cards_for_block,
+    clean_instruction,
+    classify_line,
+    load,
+)
+
+# Pipeline inputs and outputs; every file lives next to this script (HERE).
+CHAPTERS_JSON = HERE / "chapters.json"
+ANSWERS_JSON = HERE / "answers.json"
+OCR_JSON = HERE / "ocr.json"
+PDF_OCR_JSON = HERE / "pdf_ocr.json"
+OUT_BOOK = HERE / "book.json"
+OUT_VOCAB = HERE / "vocab_cards.json"
+
+# Image names of the form "f<4 digits>-<2 digits>.jpg" (e.g. "f0142-02.jpg").
+# The first digit group is what extract_book_page() returns as the page number.
+IMAGE_NAME_RE = re.compile(r"^f(\d{4})-(\d{2})\.jpg$")
+
+
+def extract_book_page(image_src: str) -> "int | None":
+    m = IMAGE_NAME_RE.match(image_src)
+    return int(m.group(1)) if m else None
+
+
+def build_pdf_page_index(pdf_ocr: dict) -> "dict[int, dict]":
+    """Map bookPage → {lines, confidence, pdfIndex}.
+
+    Strategy: use chapter-start alignments as anchors. For each chapter N,
+    anchor[N] = (pdf_idx_where_chapter_starts, book_page_where_chapter_starts).
+    Between anchors we interpolate page-by-page (pages run sequentially within
+    a chapter in this textbook's layout).
+    """
+    pages: "dict[int, dict]" = {}
+    sorted_keys = sorted(pdf_ocr.keys(), key=lambda k: int(k))
+
+    # --- Detect chapter starts in the PDF OCR ---
+    pdf_ch_start: "dict[int, int]" = {}
+    for k in sorted_keys:
+        entry = pdf_ocr[k]
+        lines = entry.get("lines", [])
+        if len(lines) < 2:
+            continue
+        first = lines[0].strip()
+        second = lines[1].strip()
+        if first.isdigit() and 1 <= int(first) <= 30 and len(second) > 5 and second[0:1].isupper():
+            ch = int(first)
+            if ch not in pdf_ch_start:
+                pdf_ch_start[ch] = int(k)
+
+    # --- Load EPUB's authoritative book-page starts ---
+    import re as _re
+    from bs4 import BeautifulSoup as _BS
+    epub_root = HERE.parents[2] / "epub_extract" / "OEBPS"
+    book_ch_start: "dict[int, int]" = {}
+    for ch in sorted(pdf_ch_start.keys()):
+        p = epub_root / f"ch{ch}.xhtml"
+        if not p.exists():
+            continue
+        soup = _BS(p.read_text(encoding="utf-8"), "lxml")
+        for span in soup.find_all(True):
+            id_ = span.get("id", "") or ""
+            m = _re.match(r"page_(\d+)$", id_)
+            if m:
+                book_ch_start[ch] = int(m.group(1))
+                break
+
+    # Build per-chapter (pdf_anchor, book_anchor, next_pdf_anchor) intervals
+    anchors = []  # list of (ch, pdf_start, book_start)
+    for ch in sorted(pdf_ch_start.keys()):
+        if ch in book_ch_start:
+            anchors.append((ch, pdf_ch_start[ch], book_ch_start[ch]))
+
+    for i, (ch, pdf_s, book_s) in enumerate(anchors):
+        next_pdf = anchors[i + 1][1] if i + 1 < len(anchors) else pdf_s + 50
+        # Interpolate book page for each pdf index in [pdf_s, next_pdf)
+        for pdf_idx in range(pdf_s, next_pdf):
+            book_page = book_s + (pdf_idx - pdf_s)
+            entry = pdf_ocr.get(str(pdf_idx))
+            if entry is None:
+                continue
+            if book_page in pages:
+                continue
+            pages[book_page] = {
+                "lines": entry["lines"],
+                "confidence": entry.get("confidence", 0),
+                "pdfIndex": pdf_idx,
+            }
+    return pages
+
+
+def merge_ocr(epub_lines: list, pdf_lines: list) -> list:
+    """EPUB per-image OCR is our primary (targeted, no prose bleed). PDF
+    page-level OCR is only used when EPUB is missing. Per-line accent repair
+    is handled separately via `repair_accents_from_pdf`.
+    """
+    if epub_lines:
+        return epub_lines
+    return pdf_lines
+
+
+import unicodedata as _u
+
+def _strip_accents(s: str) -> str:
+    return "".join(c for c in _u.normalize("NFD", s) if _u.category(c) != "Mn")
+
+
+def _levenshtein(a: str, b: str) -> int:
+    if a == b: return 0
+    if not a: return len(b)
+    if not b: return len(a)
+    prev = list(range(len(b) + 1))
+    for i, ca in enumerate(a, 1):
+        curr = [i]
+        for j, cb in enumerate(b, 1):
+            cost = 0 if ca == cb else 1
+            curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + cost))
+        prev = curr
+    return prev[-1]
+
+
+def repair_accents_from_pdf(epub_lines: list, pdf_page_lines: list) -> "tuple[list, int]":
+    """For each EPUB OCR line, find a near-match in the PDF page OCR and
+    prefer the PDF version. Repairs include:
+      1. exact accent/case differences (e.g. 'iglesia' vs 'Iglesia')
+      2. single-character OCR errors (e.g. 'the hrother' -> 'the brother')
+      3. two-character OCR errors when the target is long enough
+    """
+    if not epub_lines or not pdf_page_lines:
+        return (epub_lines, 0)
+    # Pre-normalize PDF lines for matching
+    pdf_cleaned = [p.strip() for p in pdf_page_lines if p.strip()]
+    pdf_by_stripped: dict = {}
+    for p in pdf_cleaned:
+        key = _strip_accents(p.lower())
+        pdf_by_stripped.setdefault(key, p)
+
+    out: list = []
+    repairs = 0
+    for e in epub_lines:
+        e_stripped = e.strip()
+        e_key = _strip_accents(e_stripped.lower())
+        # Pass 1: exact accent-only difference
+        if e_key and e_key in pdf_by_stripped and pdf_by_stripped[e_key] != e_stripped:
+            out.append(pdf_by_stripped[e_key])
+            repairs += 1
+            continue
+        # Pass 2: fuzzy — find best PDF line within edit distance 1 or 2
+        if len(e_key) >= 4:
+            max_distance = 1 if len(e_key) < 10 else 2
+            best_match = None
+            best_d = max_distance + 1
+            for p in pdf_cleaned:
+                p_key = _strip_accents(p.lower())
+                # Only match lines of similar length
+                if abs(len(p_key) - len(e_key)) > max_distance:
+                    continue
+                d = _levenshtein(e_key, p_key)
+                if d < best_d:
+                    best_d = d
+                    best_match = p
+                    if d == 0:
+                        break
+            if best_match and best_match != e_stripped and best_d <= max_distance:
+                out.append(best_match)
+                repairs += 1
+                continue
+        out.append(e)
+    return (out, repairs)
+
+
+def vocab_lines_from_pdf_page(
+    pdf_page_entry: dict,
+    epub_narrative_lines: set
+) -> list:
+    """Extract likely vocab-table lines from a PDF page's OCR by filtering out
+    narrative-looking lines (long sentences) and already-known EPUB content."""
+    lines = pdf_page_entry.get("lines", [])
+    out: list = []
+    for raw in lines:
+        line = raw.strip()
+        if not line:
+            continue
+        # Skip lines that look like body prose (too long)
+        if len(line) > 80:
+            continue
+        # Skip narrative we already captured in the EPUB
+        if line in epub_narrative_lines:
+            continue
+        # Skip page-number-only lines
+        if re.fullmatch(r"\d{1,4}", line):
+            continue
+        # Skip standalone chapter headers (e.g. "Nouns, Articles, and Adjectives")
+        out.append(line)
+    return out
+
+
+def main() -> None:
+    chapters_data = load(CHAPTERS_JSON)
+    answers = load(ANSWERS_JSON)["answers"]
+    epub_ocr = load(OCR_JSON)
+    pdf_ocr_raw = load(PDF_OCR_JSON) if PDF_OCR_JSON.exists() else {}
+    pdf_pages = build_pdf_page_index(pdf_ocr_raw) if pdf_ocr_raw else {}
+    print(f"Mapped {len(pdf_pages)} PDF pages to book page numbers")
+
+    # Build a global set of EPUB narrative lines (for subtraction when pulling vocab)
+    narrative_set = set()
+    for ch in chapters_data["chapters"]:
+        for b in ch["blocks"]:
+            if b["kind"] == "paragraph" and b.get("text"):
+                narrative_set.add(b["text"].strip())
+
+    book_chapters = []
+    all_vocab_cards = []
+    pdf_hits = 0
+    pdf_misses = 0
+    merged_pages = 0
+
+    for ch in chapters_data["chapters"]:
+        out_blocks = []
+        current_section_title = ch["title"]
+
+        for bi, block in enumerate(ch["blocks"]):
+            k = block["kind"]
+
+            if k == "heading":
+                current_section_title = block["text"]
+                out_blocks.append(block)
+                continue
+
+            if k == "paragraph":
+                out_blocks.append(block)
+                continue
+
+            if k == "key_vocab_header":
+                out_blocks.append(block)
+                continue
+
+            if k == "vocab_image":
+                src = block["src"]
+                epub_entry = epub_ocr.get(src)
+                epub_lines = epub_entry.get("lines", []) if epub_entry else []
+                epub_conf = epub_entry.get("confidence", 0.0) if epub_entry else 0.0
+
+                book_page = extract_book_page(src)
+                pdf_entry = pdf_pages.get(book_page) if book_page else None
+                pdf_lines = pdf_entry["lines"] if pdf_entry else []
+
+                # Primary: EPUB per-image OCR. Supplementary: PDF page OCR
+                # used only for accent/diacritic repair where keys match.
+                if pdf_lines:
+                    pdf_hits += 1
+                else:
+                    pdf_misses += 1
+                repaired_lines, repairs = repair_accents_from_pdf(epub_lines, pdf_lines)
+                merged_lines = repaired_lines if repaired_lines else pdf_lines
+                merged_conf = max(epub_conf, pdf_entry.get("confidence", 0) if pdf_entry else 0.0)
+                if repairs > 0:
+                    merged_pages += 1
+
+                derived = build_vocab_cards_for_block(
+                    {"src": src},
+                    {"lines": merged_lines, "confidence": merged_conf},
+                    ch, current_section_title, bi
+                )
+                all_vocab_cards.extend(derived)
+                out_blocks.append({
+                    "kind": "vocab_table",
+                    "sourceImage": src,
+                    "ocrLines": merged_lines,
+                    "ocrConfidence": merged_conf,
+                    "cardCount": len(derived),
+                    "source": "pdf-repaired" if repairs > 0 else ("epub" if epub_lines else "pdf"),
+                    "bookPage": book_page,
+                    "repairs": repairs,
+                })
+                continue
+
+            if k == "exercise":
+                ans = answers.get(block["id"])
+                # EPUB image OCR (if any image refs)
+                image_ocr_lines: list = []
+                for src in block.get("image_refs", []):
+                    ee = epub_ocr.get(src)
+                    if ee:
+                        image_ocr_lines.extend(ee.get("lines", []))
+                    # Add PDF-page OCR for that page if available
+                    bp = extract_book_page(src)
+                    if bp and pdf_pages.get(bp):
+                        # Only add lines not already present from EPUB OCR
+                        pdf_lines = pdf_pages[bp]["lines"]
+                        for line in pdf_lines:
+                            line = line.strip()
+                            if not line or line in image_ocr_lines:
+                                continue
+                            if line in narrative_set:
+                                continue
+                            image_ocr_lines.append(line)
+
+                prompts = [p for p in block.get("prompts", []) if p.strip()]
+                extras = [e for e in block.get("extra", []) if e.strip()]
+                if not prompts and image_ocr_lines:
+                    # Extract numbered lines from OCR
+                    for line in image_ocr_lines:
+                        m = re.match(r"^(\d+)[.)]\s*(.+)", line.strip())
+                        if m:
+                            prompts.append(f"{m.group(1)}. {m.group(2)}")
+
+                sub = ans["subparts"] if ans else []
+                answer_items = []
+                for sp in sub:
+                    for it in sp["items"]:
+                        answer_items.append({
+                            "label": sp["label"],
+                            "number": it["number"],
+                            "answer": it["answer"],
+                            "alternates": it["alternates"],
+                        })
+
+                out_blocks.append({
+                    "kind": "exercise",
+                    "id": block["id"],
+                    "ansAnchor": block.get("ans_anchor", ""),
+                    "instruction": clean_instruction(block.get("instruction", "")),
+                    "extra": extras,
+                    "prompts": prompts,
+                    "ocrLines": image_ocr_lines,
+                    "freeform": ans["freeform"] if ans else False,
+                    "answerItems": answer_items,
+                    "answerRaw": ans["raw"] if ans else "",
+                    "answerSubparts": sub,
+                })
+                continue
+
+            out_blocks.append(block)
+
+        book_chapters.append({
+            "id": ch["id"],
+            "number": ch["number"],
+            "title": ch["title"],
+            "part": ch.get("part"),
+            "blocks": out_blocks,
+        })
+
+    book = {
+        "courseName": COURSE_NAME,
+        "totalChapters": len(book_chapters),
+        "totalExercises": sum(1 for ch in book_chapters for b in ch["blocks"] if b["kind"] == "exercise"),
+        "totalVocabTables": sum(1 for ch in book_chapters for b in ch["blocks"] if b["kind"] == "vocab_table"),
+        "totalVocabCards": len(all_vocab_cards),
+        "parts": chapters_data.get("part_memberships", {}),
+        "chapters": book_chapters,
+        "sources": {
+            "epub_images_ocr": bool(epub_ocr),
+            "pdf_pages_ocr": bool(pdf_ocr_raw),
+            "pdf_pages_mapped": len(pdf_pages),
+        },
+    }
+    OUT_BOOK.write_text(json.dumps(book, ensure_ascii=False))
+
+    vocab_by_chapter: dict = {}
+    for card in all_vocab_cards:
+        vocab_by_chapter.setdefault(card["chapter"], []).append(card)
+    OUT_VOCAB.write_text(json.dumps({
+        "courseName": COURSE_NAME,
+        "chapters": [
+            {"chapter": n, "cards": cs}
+            for n, cs in sorted(vocab_by_chapter.items())
+        ],
+    }, ensure_ascii=False, indent=2))
+
+    print(f"Wrote {OUT_BOOK}")
+    print(f"Wrote {OUT_VOCAB}")
+    print(f"Chapters:               {book['totalChapters']}")
+    print(f"Exercises:              {book['totalExercises']}")
+    print(f"Vocab tables:           {book['totalVocabTables']}")
+    print(f"Vocab cards (derived):  {book['totalVocabCards']}")
+    print(f"PDF hits vs misses:     {pdf_hits} / {pdf_misses}")
+
+
+# Run the merge only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,110 @@
+#!/usr/bin/env swift
+// OCR every .jpg/.jpeg/.png image found under the given input directory using the macOS Vision framework.
+// Output: JSON map of { "<filename>": { "lines": [...], "confidence": Double } }
+//
+// Usage: swift ocr_images.swift <input_dir> <output_json>
+// Example: swift ocr_images.swift ../../../epub_extract/OEBPS ocr.json
+
+import Foundation
+import Vision
+import AppKit
+
+// Both positional arguments are required: the image directory and the output path.
+guard CommandLine.arguments.count >= 3 else {
+    print("Usage: swift ocr_images.swift <input_dir> <output_json>")
+    exit(1)
+}
+
+let inputDir = URL(fileURLWithPath: CommandLine.arguments[1])
+let outputURL = URL(fileURLWithPath: CommandLine.arguments[2])
+
+// Skip images that are icons/inline markers — not real content
+let skipSubstrings = ["Common", "cover", "title"]
+
+let fileManager = FileManager.default
+guard let enumerator = fileManager.enumerator(at: inputDir, includingPropertiesForKeys: nil) else {
+    print("Could not enumerate \(inputDir.path)")
+    exit(1)
+}
+
+// Collect candidate images. The suffix check is case-sensitive, and any file
+// whose name contains one of `skipSubstrings` is ignored.
+var jpgs: [URL] = []
+for case let url as URL in enumerator {
+    let name = url.lastPathComponent
+    guard name.hasSuffix(".jpg") || name.hasSuffix(".jpeg") || name.hasSuffix(".png") else { continue }
+    if skipSubstrings.contains(where: { name.contains($0) }) { continue }
+    jpgs.append(url)
+}
+// Process in file-name order so progress output is reproducible.
+jpgs.sort { $0.lastPathComponent < $1.lastPathComponent }
+print("Found \(jpgs.count) images to OCR")
+
+/// OCR output for one image: the recognized text lines and the mean
+/// confidence of the candidates that produced them.
+struct OCRResult: Encodable {
+    var lines: [String]
+    var confidence: Double
+}
+
+// Results are keyed by file name only (not by path); a later image with the
+// same name overwrites an earlier one.
+var results: [String: OCRResult] = [:]
+let total = jpgs.count
+var processed = 0
+let startTime = Date()
+
+for url in jpgs {
+    processed += 1
+    let name = url.lastPathComponent
+
+    // Decode the file into a CGImage via NSImage -> TIFF -> bitmap rep.
+    // Images that cannot be decoded are reported and left out of the output.
+    guard let nsImage = NSImage(contentsOf: url),
+          let tiffData = nsImage.tiffRepresentation,
+          let bitmap = NSBitmapImageRep(data: tiffData),
+          let cgImage = bitmap.cgImage else {
+        print("\(processed)/\(total) \(name) — could not load")
+        continue
+    }
+
+    let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
+    let request = VNRecognizeTextRequest()
+    request.recognitionLevel = .accurate
+    request.recognitionLanguages = ["es-ES", "es", "en-US"]
+    request.usesLanguageCorrection = true
+    // For the 2020 book, automaticallyDetectsLanguage helps with mixed content
+    if #available(macOS 13.0, *) {
+        request.automaticallyDetectsLanguage = true
+    }
+
+    do {
+        try handler.perform([request])
+        let observations = request.results ?? []
+        var lines: [String] = []
+        var totalConfidence: Float = 0
+        var count = 0
+        // Keep the top candidate of each observation; empty strings are
+        // dropped and do not count towards the average confidence.
+        for obs in observations {
+            if let top = obs.topCandidates(1).first {
+                let s = top.string.trimmingCharacters(in: .whitespaces)
+                if !s.isEmpty {
+                    lines.append(s)
+                    totalConfidence += top.confidence
+                    count += 1
+                }
+            }
+        }
+        let avg = count > 0 ? Double(totalConfidence) / Double(count) : 0.0
+        results[name] = OCRResult(lines: lines, confidence: avg)
+    } catch {
+        // A failed request is logged; the image gets no entry in the output.
+        print("\(processed)/\(total) \(name) — error: \(error)")
+    }
+
+    // Progress line every 50 images and once more at the end.
+    if processed % 50 == 0 || processed == total {
+        let elapsed = Date().timeIntervalSince(startTime)
+        let rate = Double(processed) / max(elapsed, 0.001)
+        let remaining = Double(total - processed) / max(rate, 0.001)
+        print(String(format: "%d/%d  %.1f img/s  eta %.0fs", processed, total, rate, remaining))
+    }
+}
+
+// Write all results as pretty-printed JSON with sorted keys.
+let encoder = JSONEncoder()
+encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
+do {
+    let data = try encoder.encode(results)
+    try data.write(to: outputURL)
+    print("Wrote \(results.count) OCR entries to \(outputURL.path)")
+} catch {
+    print("Error writing output: \(error)")
+    exit(1)
+}
@@ -0,0 +1,133 @@
+#!/usr/bin/env swift
+// Rasterize each page of a PDF at high DPI and OCR it with Vision.
+// Output: { "<pdfIndex>": { "lines": [...], "confidence": Double, "bookPage": Int? } }
+//
+// Usage: swift ocr_pdf.swift <pdf_path> <output_json> [dpi]
+// Example: swift ocr_pdf.swift "book.pdf" pdf_ocr.json 240
+
+import Foundation
+import Vision
+import AppKit
+import Quartz
+
+// The PDF path and the output path are required; the DPI is optional.
+guard CommandLine.arguments.count >= 3 else {
+    print("Usage: swift ocr_pdf.swift <pdf_path> <output_json> [dpi]")
+    exit(1)
+}
+
+let pdfURL = URL(fileURLWithPath: CommandLine.arguments[1])
+let outputURL = URL(fileURLWithPath: CommandLine.arguments[2])
+// A missing or non-numeric third argument falls back to 240 DPI. The value is
+// not range-checked.
+let dpi: CGFloat = CommandLine.arguments.count >= 4 ? CGFloat(Double(CommandLine.arguments[3]) ?? 240.0) : 240.0
+
+guard let pdfDoc = PDFDocument(url: pdfURL) else {
+    print("Could not open PDF at \(pdfURL.path)")
+    exit(1)
+}
+
+let pageCount = pdfDoc.pageCount
+print("PDF has \(pageCount) pages. Rendering at \(dpi) DPI.")
+
+/// OCR output for one PDF page. `bookPage` is a best-effort guess at the
+/// printed page number and is nil when no candidate line was found.
+struct PageResult: Encodable {
+    var lines: [String]
+    var confidence: Double
+    var bookPage: Int?
+}
+
+// Keyed by the zero-based PDF page index rendered as a string.
+var results: [String: PageResult] = [:]
+let startTime = Date()
+
+// Render at scale = dpi / 72 (72 is default PDF DPI)
+let scale: CGFloat = dpi / 72.0
+
+for i in 0..<pageCount {
+    guard let page = pdfDoc.page(at: i) else { continue }
+    let pageBounds = page.bounds(for: .mediaBox)
+    let scaledSize = CGSize(width: pageBounds.width * scale, height: pageBounds.height * scale)
+
+    // Render the page into a CGImage
+    let colorSpace = CGColorSpaceCreateDeviceRGB()
+    let bitmapInfo = CGImageAlphaInfo.noneSkipLast.rawValue
+    guard let context = CGContext(
+        data: nil,
+        width: Int(scaledSize.width),
+        height: Int(scaledSize.height),
+        bitsPerComponent: 8,
+        bytesPerRow: 0,
+        space: colorSpace,
+        bitmapInfo: bitmapInfo
+    ) else {
+        print("\(i): could not create CGContext")
+        continue
+    }
+    // Paint a white background first, then draw the page scaled up.
+    context.setFillColor(CGColor(gray: 1.0, alpha: 1.0))
+    context.fill(CGRect(origin: .zero, size: scaledSize))
+    context.scaleBy(x: scale, y: scale)
+    page.draw(with: .mediaBox, to: context)
+
+    guard let cgImage = context.makeImage() else {
+        print("\(i): could not create CGImage")
+        continue
+    }
+
+    let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
+    let request = VNRecognizeTextRequest()
+    request.recognitionLevel = .accurate
+    request.recognitionLanguages = ["es-ES", "es", "en-US"]
+    request.usesLanguageCorrection = true
+    if #available(macOS 13.0, *) {
+        request.automaticallyDetectsLanguage = true
+    }
+
+    do {
+        try handler.perform([request])
+        let observations = request.results ?? []
+        var lines: [String] = []
+        var totalConfidence: Float = 0
+        var count = 0
+        // Keep the top candidate of each observation; empty strings are
+        // dropped and do not count towards the average confidence.
+        for obs in observations {
+            if let top = obs.topCandidates(1).first {
+                let s = top.string.trimmingCharacters(in: .whitespaces)
+                if !s.isEmpty {
+                    lines.append(s)
+                    totalConfidence += top.confidence
+                    count += 1
+                }
+            }
+        }
+        let avg = count > 0 ? Double(totalConfidence) / Double(count) : 0.0
+
+        // Try to detect book page number: a short numeric line in the first
+        // 3 or last 3 entries (typical page-number placement).
+        // The first candidate that parses as an integer in 1...1000 wins.
+        var bookPage: Int? = nil
+        let candidates = Array(lines.prefix(3)) + Array(lines.suffix(3))
+        for c in candidates {
+            let trimmed = c.trimmingCharacters(in: .whitespaces)
+            if let n = Int(trimmed), n >= 1 && n <= 1000 {
+                bookPage = n
+                break
+            }
+        }
+
+        results[String(i)] = PageResult(lines: lines, confidence: avg, bookPage: bookPage)
+    } catch {
+        // A failed request is logged; the page gets no entry in the output.
+        print("\(i): \(error)")
+    }
+
+    // Progress line every 25 pages and once more at the end.
+    if (i + 1) % 25 == 0 || (i + 1) == pageCount {
+        let elapsed = Date().timeIntervalSince(startTime)
+        let rate = Double(i + 1) / max(elapsed, 0.001)
+        let remaining = Double(pageCount - (i + 1)) / max(rate, 0.001)
+        print(String(format: "%d/%d  %.1f pg/s  eta %.0fs", i + 1, pageCount, rate, remaining))
+    }
+}
+
+// Write compact JSON with sorted keys.
+let encoder = JSONEncoder()
+encoder.outputFormatting = [.sortedKeys]
+do {
+    let data = try encoder.encode(results)
+    try data.write(to: outputURL)
+    print("Wrote \(results.count) pages to \(outputURL.path)")
+} catch {
+    print("Error writing output: \(error)")
+    exit(1)
+}
@@ -0,0 +1,177 @@
+#!/usr/bin/env swift
+// Re-OCR the images referenced in quarantined_cards.json using Vision with
+// bounding-box info, then pair lines by column position (left = Spanish,
+// right = English) instead of by document read order.
+//
+// Output: repaired_cards.json — {"byImage": {"f0142-02.jpg": [{"es":..., "en":...}, ...]}}
+
+import Foundation
+import Vision
+import AppKit
+
+// Three positional arguments are required: the quarantine list, the image
+// directory, and the output path.
+guard CommandLine.arguments.count >= 4 else {
+    print("Usage: swift repair_quarantined.swift <quarantined.json> <epub_oebps_dir> <output.json>")
+    exit(1)
+}
+
+let quarantinedURL = URL(fileURLWithPath: CommandLine.arguments[1])
+let imageDir = URL(fileURLWithPath: CommandLine.arguments[2])
+let outputURL = URL(fileURLWithPath: CommandLine.arguments[3])
+
+// The input must be a JSON object with a "cards" array of objects; an
+// unreadable file or any other shape aborts the run.
+guard let data = try? Data(contentsOf: quarantinedURL),
+      let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
+      let cards = json["cards"] as? [[String: Any]] else {
+    print("Could not load \(quarantinedURL.path)")
+    exit(1)
+}
+
+// Each image is re-OCR'd once, however many quarantined cards reference it.
+// Cards without a string "sourceImage" are ignored.
+var uniqueImages = Set<String>()
+for card in cards {
+    if let src = card["sourceImage"] as? String { uniqueImages.insert(src) }
+}
+print("Unique images to re-OCR: \(uniqueImages.count)")
+
+/// One recognized text line together with the center of its bounding box.
+struct RecognizedLine {
+    let text: String
+    let cx: CGFloat   // center X (normalized 0..1)
+    let cy: CGFloat   // center Y (normalized 0..1 from top)
+    let confidence: Float
+}
+
+/// A Spanish/English pair built from one table row; `confidence` is the mean
+/// confidence of the lines that were joined into it.
+struct Pair: Encodable {
+    var es: String
+    var en: String
+    var confidence: Double
+}
+
+/// Per-image result: the pairs found, how many lines were recognized, and
+/// the strategy label returned by `pairByPosition`.
+struct ImageResult: Encodable {
+    var pairs: [Pair]
+    var lineCount: Int
+    var strategy: String
+}
+
+/// Coarse language guess for one piece of text.
+///
+/// Returns "es" when the text contains a Spanish-only character or starts
+/// with a Spanish article, "en" when it starts with one of a few English
+/// starter words, and "?" otherwise.
+func classify(_ s: String) -> String {
+    let spanishMarks: Set<Character> = ["á", "é", "í", "ó", "ú", "ñ", "ü", "¿", "¡"]
+    let spanishArticles: Set<String> = ["el", "la", "los", "las", "un", "una", "unos", "unas"]
+    let englishStarters: Set<String> = ["the", "a", "an", "to", "my", "his", "her", "our", "their"]
+
+    let lowered = s.lowercased()
+    // Accents and inverted punctuation take precedence over the first word.
+    for ch in lowered where spanishMarks.contains(ch) {
+        return "es"
+    }
+    // Only the first space-separated word is inspected.
+    guard let head = lowered.split(separator: " ").first else { return "?" }
+    let firstWord = String(head)
+    if spanishArticles.contains(firstWord) {
+        return "es"
+    }
+    return englishStarters.contains(firstWord) ? "en" : "?"
+}
+
+/// Run Vision text recognition on `cgImage` and return every non-empty line
+/// with the center of its bounding box (Y flipped to a top-left origin).
+///
+/// Returns an empty array when the Vision request fails; the failure is
+/// reported on standard output instead of being dropped silently.
+func recognizeLines(cgImage: CGImage) -> [RecognizedLine] {
+    let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
+    let request = VNRecognizeTextRequest()
+    request.recognitionLevel = .accurate
+    request.recognitionLanguages = ["es-ES", "es", "en-US"]
+    request.usesLanguageCorrection = true
+    if #available(macOS 13.0, *) {
+        request.automaticallyDetectsLanguage = true
+    }
+    do {
+        try handler.perform([request])
+    } catch {
+        // Still best-effort (the caller sees zero lines), but say why.
+        print("Vision text recognition failed: \(error)")
+        return []
+    }
+    var out: [RecognizedLine] = []
+    for obs in request.results ?? [] {
+        guard let top = obs.topCandidates(1).first else { continue }
+        let s = top.string.trimmingCharacters(in: .whitespaces)
+        if s.isEmpty { continue }
+        // Vision's boundingBox is normalized with origin at lower-left
+        let bb = obs.boundingBox
+        let cx = bb.origin.x + bb.width / 2
+        let cyTop = 1.0 - (bb.origin.y + bb.height / 2)  // flip to top-origin
+        out.append(RecognizedLine(text: s, cx: cx, cy: cyTop, confidence: top.confidence))
+    }
+    return out
+}
+
+/// Pair lines by column position: left column = Spanish, right column = English.
+/// Groups lines into rows by Y proximity, then within each row pairs left-right.
+///
+/// Returns the pairs plus a strategy label: "empty" when there are no input
+/// lines, "no-rows" when no row produced a pair, otherwise "row-pair".
+func pairByPosition(_ lines: [RecognizedLine]) -> ([Pair], String) {
+    guard !lines.isEmpty else { return ([], "empty") }
+
+    // Cluster by Y into rows using a fixed tolerance. Each line is compared
+    // with the previous line in Y order, so a row keeps growing as long as
+    // consecutive lines stay within the tolerance of each other.
+    let sortedByY = lines.sorted { $0.cy < $1.cy }
+    var rows: [[RecognizedLine]] = []
+    var current: [RecognizedLine] = []
+    let rowTol: CGFloat = 0.015   // 1.5% of the image height (cy is normalized)
+    for l in sortedByY {
+        if let last = current.last, abs(l.cy - last.cy) > rowTol {
+            rows.append(current)
+            current = [l]
+        } else {
+            current.append(l)
+        }
+    }
+    if !current.isEmpty { rows.append(current) }
+
+    var pairs: [Pair] = []
+    var strategy = "row-pair"
+    for row in rows {
+        // A row with a single line cannot form a pair and is skipped.
+        guard row.count >= 2 else { continue }
+        // Sort row by X; left side = Spanish, right side = English
+        let sortedX = row.sorted { $0.cx < $1.cx }
+        // Find gap: pick the biggest x-gap in the row to split
+        var maxGap: CGFloat = 0
+        var splitIdx = 1
+        for i in 1..<sortedX.count {
+            let gap = sortedX[i].cx - sortedX[i - 1].cx
+            if gap > maxGap {
+                maxGap = gap
+                splitIdx = i
+            }
+        }
+        // Everything left of the gap is joined into one string, likewise the right.
+        let leftLines = Array(sortedX[0..<splitIdx])
+        let rightLines = Array(sortedX[splitIdx..<sortedX.count])
+        let leftText = leftLines.map(\.text).joined(separator: " ").trimmingCharacters(in: .whitespaces)
+        let rightText = rightLines.map(\.text).joined(separator: " ").trimmingCharacters(in: .whitespaces)
+        if leftText.isEmpty || rightText.isEmpty { continue }
+        // Verify language orientation — swap if we got it backwards
+        // (only when both sides are classified and both disagree with the layout).
+        var es = leftText
+        var en = rightText
+        let lc = classify(es)
+        let rc = classify(en)
+        if lc == "en" && rc == "es" {
+            es = rightText
+            en = leftText
+        }
+        let avgConf = (leftLines + rightLines).reduce(Float(0)) { $0 + $1.confidence } / Float(leftLines.count + rightLines.count)
+        pairs.append(Pair(es: es, en: en, confidence: Double(avgConf)))
+    }
+
+    if pairs.isEmpty { strategy = "no-rows" }
+    return (pairs, strategy)
+}
+
+// Re-OCR each referenced image (in sorted name order) and pair its lines.
+var results: [String: ImageResult] = [:]
+
+for name in uniqueImages.sorted() {
+    let url = imageDir.appendingPathComponent(name)
+    // Images that cannot be decoded are reported and get no entry in `results`.
+    guard let img = NSImage(contentsOf: url),
+          let tiff = img.tiffRepresentation,
+          let rep = NSBitmapImageRep(data: tiff),
+          let cg = rep.cgImage else {
+        print("\(name): could not load")
+        continue
+    }
+    let lines = recognizeLines(cgImage: cg)
+    let (pairs, strategy) = pairByPosition(lines)
+    results[name] = ImageResult(pairs: pairs, lineCount: lines.count, strategy: strategy)
+    print("\(name): \(lines.count) lines -> \(pairs.count) pairs via \(strategy)")
+}
+
+/// Top-level shape of the output file: per-image results plus the total
+/// number of pairs across all images.
+struct Output: Encodable {
+    var byImage: [String: ImageResult]
+    var totalPairs: Int
+}
+let output = Output(
+    byImage: results,
+    totalPairs: results.values.reduce(0) { $0 + $1.pairs.count }
+)
+
+let enc = JSONEncoder()
+enc.outputFormatting = [.prettyPrinted, .sortedKeys]
+// Report encode/write failures and exit non-zero instead of trapping on an
+// uncaught top-level `try`.
+do {
+    try enc.encode(output).write(to: outputURL)
+    print("Wrote \(output.totalPairs) repaired pairs to \(outputURL.path)")
+} catch {
+    print("Error writing output: \(error)")
+    exit(1)
+}
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+# End-to-end textbook extraction pipeline.
+#
+# Requires: Python 3 + lxml/beautifulsoup4/pypdf installed.
+#           macOS for Vision + NSSpellChecker (Swift).
+#
+# Inputs: EPUB extracted to epub_extract/OEBPS/ and the PDF at project root.
+# Outputs: book.json, vocab_cards.json, manual_review.json, quarantined_cards.json
+#
+# The two OCR steps are slow, so their outputs (ocr.json, pdf_ocr.json) are
+# cached next to this script; delete a file to force that step to run again.
+
+# Stop on the first failing command, on unset variables, and on failures
+# inside pipelines.
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
+cd "$ROOT"
+
+echo "=== Phase 1a: parse XHTML chapters ==="
+python3 "$SCRIPT_DIR/extract_chapters.py"
+
+echo "=== Phase 1b: parse answer key ==="
+python3 "$SCRIPT_DIR/extract_answers.py"
+
+if [ ! -f "$SCRIPT_DIR/ocr.json" ]; then
+    echo "=== Phase 1c: OCR EPUB images (first-time only) ==="
+    swift "$SCRIPT_DIR/ocr_images.swift" "$ROOT/epub_extract/OEBPS" "$SCRIPT_DIR/ocr.json"
+else
+    echo "=== Phase 1c: EPUB OCR already cached ==="
+fi
+
+# Locate the PDF with a glob rather than by parsing `ls` output; the first
+# match in sorted order is used, and no match leaves PDF_FILE empty.
+shopt -s nullglob
+pdf_candidates=("$ROOT"/Complete\ Spanish\ Step-By-Step*.pdf)
+shopt -u nullglob
+PDF_FILE="${pdf_candidates[0]:-}"
+
+if [ -f "$SCRIPT_DIR/pdf_ocr.json" ]; then
+    echo "=== Phase 1d: PDF OCR already cached ==="
+elif [ -n "$PDF_FILE" ]; then
+    echo "=== Phase 1d: OCR PDF pages (first-time only) ==="
+    swift "$SCRIPT_DIR/ocr_pdf.swift" "$PDF_FILE" "$SCRIPT_DIR/pdf_ocr.json" 240
+else
+    echo "=== Phase 1d: no PDF found at project root; skipping PDF OCR ==="
+fi
+
+echo "=== Phase 1e: merge into book.json ==="
+python3 "$SCRIPT_DIR/merge_pdf_into_book.py"
+
+echo "=== Phase 2: spell-check validation ==="
+swift "$SCRIPT_DIR/validate_vocab.swift" "$SCRIPT_DIR/vocab_cards.json" "$SCRIPT_DIR/vocab_validation.json"
+
+echo "=== Phase 3: auto-fix + quarantine pass 1 ==="
+python3 "$SCRIPT_DIR/fix_vocab.py"
+
+# The second pass re-validates the fixed cards before fixing again.
+echo "=== Phase 3: auto-fix + quarantine pass 2 (convergence) ==="
+swift "$SCRIPT_DIR/validate_vocab.swift" "$SCRIPT_DIR/vocab_cards.json" "$SCRIPT_DIR/vocab_validation.json"
+python3 "$SCRIPT_DIR/fix_vocab.py"
+
+echo ""
+echo "=== Copy to app bundle ==="
+cp "$SCRIPT_DIR/book.json" "$ROOT/Conjuga/Conjuga/textbook_data.json"
+cp "$SCRIPT_DIR/vocab_cards.json" "$ROOT/Conjuga/Conjuga/textbook_vocab.json"
+ls -lh "$ROOT/Conjuga/Conjuga/textbook_"*.json
+echo ""
+echo "Done. Bump textbookDataVersion in DataLoader.swift to trigger re-seed."
@@ -0,0 +1,156 @@
+#!/usr/bin/env swift
+// Validate every Spanish/English word in vocab_cards.json using NSSpellChecker.
+// For each flagged word, produce up to 3 candidate corrections.
+//
+// Usage: swift validate_vocab.swift <vocab_cards.json> <output_report.json>
+
+import Foundation
+import AppKit
+
+// Both the input deck and the report destination are required arguments.
+guard CommandLine.arguments.count >= 3 else {
+    print("Usage: swift validate_vocab.swift <vocab_cards.json> <output_report.json>")
+    exit(1)
+}
+
+let inputURL = URL(fileURLWithPath: CommandLine.arguments[1])
+let outputURL = URL(fileURLWithPath: CommandLine.arguments[2])
+
+/// Reads the vocab deck at `url` and returns its top-level "chapters" array.
+///
+/// Terminates the process with exit status 1 when the file cannot be read,
+/// is not valid JSON, or is not an object holding a "chapters" array of
+/// objects. The message names the actual cause so a failed pipeline run can
+/// be diagnosed from its log.
+func loadChapters(from url: URL) -> [[String: Any]] {
+    do {
+        let raw = try Data(contentsOf: url)
+        let root = try JSONSerialization.jsonObject(with: raw)
+        guard let object = root as? [String: Any],
+              let parsed = object["chapters"] as? [[String: Any]] else {
+            print("Could not load \(url.path): expected a JSON object with a \"chapters\" array")
+            exit(1)
+        }
+        return parsed
+    } catch {
+        // Covers both the file read and the JSON parse.
+        print("Could not load \(url.path): \(error.localizedDescription)")
+        exit(1)
+    }
+}
+
+let chapters = loadChapters(from: inputURL)
+
+// Shared spell checker used by checkWord for both languages.
+let checker = NSSpellChecker.shared
+
+/// Splits `s` into word tokens: maximal runs of Unicode scalars that belong
+/// to `CharacterSet.letters`. Every other scalar (digits, punctuation,
+/// whitespace, apostrophes) acts as a separator and is dropped. Returns an
+/// empty array when `s` contains no letters.
+func tokens(_ s: String) -> [String] {
+    var words: [String] = []
+    var current = String.UnicodeScalarView()
+    for scalar in s.unicodeScalars {
+        if CharacterSet.letters.contains(scalar) {
+            current.append(scalar)
+        } else if !current.isEmpty {
+            // A separator ends the run collected so far.
+            words.append(String(current))
+            current = String.UnicodeScalarView()
+        }
+    }
+    // Flush a run that reaches the end of the string.
+    if !current.isEmpty {
+        words.append(String(current))
+    }
+    return words
+}
+
+// Minimal stopword sets: short function words that checkWord accepts without
+// consulting the spell checker. Entries are lowercase because checkWord
+// lowercases the candidate before the lookup.
+// NOTE(review): an earlier comment said names, proper nouns and numeric tokens
+// are "already filtered". In this file only digit-containing tokens are
+// visibly skipped (and tokens() keeps letter runs only); confirm whether
+// name filtering happens upstream.
+let stopES: Set<String> = [
+    "el", "la", "los", "las", "un", "una", "unos", "unas", "del", "al", "de",
+    "a", "en", "y", "o", "que", "no", "se", "con", "por", "para", "lo", "le",
+    "su", "mi", "tu", "yo", "te", "me", "es", "son", "está", "están",
+]
+let stopEN: Set<String> = [
+    "the", "a", "an", "to", "of", "in", "and", "or", "is", "are", "was", "were",
+    "be", "been", "my", "his", "her", "our", "their", "your",
+]
+
+/// Spell-checks a single token `w` in language `lang`.
+///
+/// Returns `nil` when the token is accepted or deliberately skipped (shorter
+/// than two characters, present in `stop` after lowercasing, or containing a
+/// decimal digit). Otherwise returns up to three suggested corrections, which
+/// may be an empty array when the checker offers none.
+func checkWord(_ w: String, lang: String, stop: Set<String>) -> [String]? {
+    // Cheap exclusions first, so the spell checker is only asked about real candidates.
+    guard w.count >= 2,
+          !stop.contains(w.lowercased()),
+          w.rangeOfCharacter(from: .decimalDigits) == nil else {
+        return nil
+    }
+
+    let misspelled = checker.checkSpelling(
+        of: w,
+        startingAt: 0,
+        language: lang,
+        wrap: false,
+        inSpellDocumentWithTag: 0,
+        wordCount: nil
+    )
+    // Either a not-found location or an empty range is treated as "no misspelling".
+    guard misspelled.location != NSNotFound, misspelled.length != 0 else {
+        return nil
+    }
+
+    // Ask for corrections covering the whole token, measured in UTF-16 units.
+    let wholeWord = NSRange(location: 0, length: w.utf16.count)
+    let candidates = checker.guesses(
+        forWordRange: wholeWord,
+        in: w,
+        language: lang,
+        inSpellDocumentWithTag: 0
+    ) ?? []
+    return Array(candidates.prefix(3))
+}
+
+/// One card that has at least one flagged word, as written to the report.
+struct Flag: Encodable {
+    /// Chapter number taken from the chapter's "chapter" key.
+    var chapter: Int
+    /// The card's "front" text (checked as Spanish), or "" when absent.
+    var front: String
+    /// The card's "back" text (checked as English), or "" when absent.
+    var back: String
+    /// Flagged tokens found in `front`.
+    var badFront: [BadWord]
+    /// Flagged tokens found in `back`.
+    var badBack: [BadWord]
+    /// The card's "sourceImage" value, or "" when absent.
+    var sourceImage: String
+}
+/// A single flagged token together with the spell checker's suggestions.
+struct BadWord: Encodable {
+    /// The token exactly as it was extracted from the card text.
+    var word: String
+    /// Candidate corrections returned by checkWord (at most three, possibly none).
+    var suggestions: [String]
+    var side: String  // "es" or "en"
+}
+
+// Running results of the scan: flagged cards plus per-language word totals.
+var flags: [Flag] = []
+var totalCards = 0
+var totalBadES = 0
+var totalBadEN = 0
+
+for ch in chapters {
+    // Chapters without a numeric "chapter" or a "cards" array are skipped.
+    guard let chNum = ch["chapter"] as? Int,
+          let cards = ch["cards"] as? [[String: Any]] else { continue }
+
+    for card in cards {
+        totalCards += 1
+        let front = card["front"] as? String ?? ""
+        let back = card["back"] as? String ?? ""
+        let img = card["sourceImage"] as? String ?? ""
+
+        // Front side is checked as Spanish.
+        let badFront = tokens(front).compactMap { word -> BadWord? in
+            guard let sugg = checkWord(word, lang: "es", stop: stopES) else { return nil }
+            return BadWord(word: word, suggestions: sugg, side: "es")
+        }
+        totalBadES += badFront.count
+
+        // Back side is checked as English.
+        let badBack = tokens(back).compactMap { word -> BadWord? in
+            guard let sugg = checkWord(word, lang: "en", stop: stopEN) else { return nil }
+            return BadWord(word: word, suggestions: sugg, side: "en")
+        }
+        totalBadEN += badBack.count
+
+        // Only cards with at least one flagged word go into the report.
+        if badFront.isEmpty && badBack.isEmpty { continue }
+        flags.append(Flag(
+            chapter: chNum,
+            front: front,
+            back: back,
+            badFront: badFront,
+            badBack: badBack,
+            sourceImage: img
+        ))
+    }
+}
+
+/// Top-level shape of the JSON report written to the output path.
+struct Report: Encodable {
+    /// Number of cards visited across all chapters.
+    var totalCards: Int
+    /// Number of cards with at least one flagged word (equals `flags.count`).
+    var flaggedCards: Int
+    /// Total flagged tokens on card fronts (checked as "es").
+    var flaggedSpanishWords: Int
+    /// Total flagged tokens on card backs (checked as "en").
+    var flaggedEnglishWords: Int
+    /// Per-card detail for every flagged card.
+    var flags: [Flag]
+}
+// Snapshot of the counters accumulated by the scan.
+let report = Report(
+    totalCards: totalCards,
+    flaggedCards: flags.count,
+    flaggedSpanishWords: totalBadES,
+    flaggedEnglishWords: totalBadEN,
+    flags: flags
+)
+
+// Serialize the report with stable key order so successive runs diff cleanly,
+// then print a short summary to stdout.
+let encoder = JSONEncoder()
+encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
+do {
+    let data = try encoder.encode(report)
+    // Atomic write: a failure never leaves a truncated report on disk.
+    try data.write(to: outputURL, options: .atomic)
+
+    // Guard the ratio: an empty deck would otherwise print "nan%".
+    let flaggedPercent = totalCards > 0
+        ? Double(flags.count) / Double(totalCards) * 100.0
+        : 0.0
+    let percentText = String(format: "%.1f", flaggedPercent)
+
+    print("Cards:              \(totalCards)")
+    print("Flagged cards:      \(flags.count)  (\(percentText)%)")
+    print("Flagged ES words:   \(totalBadES)")
+    print("Flagged EN words:   \(totalBadEN)")
+    print("Wrote \(outputURL.path)")
+} catch {
+    print("Error writing output: \(error)")
+    exit(1)
+}