Fixes #32 — LLM vision pass for vocab pairs, fixes scrambled English/Spanish

The bbox-OCR pipeline mis-paired ~114 vocab tables across the book — the chapter 7 "Other Idioms" image (issue #32) being the most visible.

Three failure modes were collapsing the data:
  1) classifier blind to subject pronouns ("yo", "I", etc.)
  2) right-then-left OCR reads on 2-col tables
  3) Y-cluster drift on multi-line cells in 4-col layouts

Replaced the entire vocab-extraction tier with a Claude vision pass over all 931 vocab images. Output is keyed by image with three classifications:
  - pair_table (extract all Spanish↔English pairs)
  - reference_only (Spanish-only conjugation tables — no pairs, UI shows the flat OCR lines as a reference list instead)
  - hybrid (some header pairs + reference content beneath; only the genuine pairs become cards)

merge_pdf_into_book.py now picks pair source by priority: llm-vision → bounding-box OCR → block-alternation heuristic.

Numbers (across the whole book):
  - mis-oriented tables: 114 → 5
  - quarantined cards: 250 → 2
  - extracted pairs: 2832 → 4569

textbookDataVersion bumped to 13.

Per-batch agent outputs are gitignored under Conjuga/Scripts/textbook/paired_vocab_llm/ — only the merged paired_vocab_llm.json (also gitignored) is needed to rebuild.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 18:48:04 -05:00
parent 90aea92fba
commit f368c24ad6
5 changed files with 21072 additions and 9446 deletions
@@ -6,7 +6,7 @@ actor DataLoader {
    static let courseDataVersion = 7
    static let courseDataKey = "courseDataVersion"

-    static let textbookDataVersion = 12
+    static let textbookDataVersion = 13
    static let textbookDataKey = "textbookDataVersion"

    /// Quick check: does the DB need seeding or course data refresh?
@@ -33,7 +33,8 @@ CHAPTERS_JSON = HERE / "chapters.json"
 ANSWERS_JSON = HERE / "answers.json"
 OCR_JSON = HERE / "ocr.json"
 PDF_OCR_JSON = HERE / "pdf_ocr.json"
-PAIRED_VOCAB_JSON = HERE / "paired_vocab.json"  # bounding-box pairs (preferred)
+PAIRED_VOCAB_JSON = HERE / "paired_vocab.json"        # bounding-box pairs (fallback)
+PAIRED_VOCAB_LLM_JSON = HERE / "paired_vocab_llm.json"  # LLM vision pairs (preferred)
 OUT_BOOK = HERE / "book.json"
 OUT_VOCAB = HERE / "vocab_cards.json"

@@ -224,8 +225,10 @@ def main() -> None:
    pdf_ocr_raw = load(PDF_OCR_JSON) if PDF_OCR_JSON.exists() else {}
    pdf_pages = build_pdf_page_index(pdf_ocr_raw) if pdf_ocr_raw else {}
    paired_vocab = load(PAIRED_VOCAB_JSON) if PAIRED_VOCAB_JSON.exists() else {}
+    paired_llm = load(PAIRED_VOCAB_LLM_JSON) if PAIRED_VOCAB_LLM_JSON.exists() else {}
    print(f"Mapped {len(pdf_pages)} PDF pages to book page numbers")
    print(f"Loaded bounding-box pairs for {len(paired_vocab)} vocab images")
+    print(f"Loaded LLM-vision pairs for {len(paired_llm)} vocab images")

    # Build a global set of EPUB narrative lines (for subtraction when pulling vocab)
    narrative_set = set()
@@ -282,28 +285,57 @@ def main() -> None:
                if repairs > 0:
                    merged_pages += 1

-                # Prefer bounding-box pairs (from paired_vocab.json) when
-                # present. Fall back to the block-alternation heuristic.
+                # Source priority:
+                #   1) LLM-vision pairs (paired_vocab_llm.json) — semantic
+                #      classification (pair_table / reference_only / hybrid)
+                #      with correct orientation.
+                #   2) Bounding-box pairs (paired_vocab.json) — Vision OCR
+                #      with X-gap row splitting.
+                #   3) Block-alternation heuristic — flat OCR fallback.
+                llm_entry = paired_llm.get(src, {}) if isinstance(paired_llm.get(src), dict) else {}
+                llm_kind = llm_entry.get("kind")
+                llm_pairs = llm_entry.get("pairs", []) if llm_entry else []
+
                bbox = paired_vocab.get(src, {})
                bbox_pairs = bbox.get("pairs", []) if isinstance(bbox, dict) else []
+
                heuristic = build_vocab_cards_for_block(
                    {"src": src},
                    {"lines": merged_lines, "confidence": merged_conf},
                    ch, current_section_title, bi
                )

-                if bbox_pairs:
+                # Choose pair source. For reference_only (Spanish-only tables)
+                # we deliberately produce no cards — the UI will fall back to
+                # rendering the flat OCR lines as a reference list.
+                if llm_kind == "reference_only":
+                    cards_for_block = []
+                    pair_source = "llm-reference"
+                elif llm_pairs:
+                    cards_for_block = [
+                        {"front": p["es"], "back": p["en"]}
+                        for p in llm_pairs
+                        if p.get("es") and p.get("en")
+                    ]
+                    for c in cards_for_block:
+                        all_vocab_cards.append({
+                            "front": c["front"], "back": c["back"],
+                            "chapter": ch["number"],
+                            "chapterTitle": ch["title"],
+                            "section": current_section_title,
+                            "sourceImage": src,
+                        })
+                    pair_source = "llm-" + (llm_kind or "pairs")
+                elif bbox_pairs:
                    cards_for_block = [
                        {"front": p["es"], "back": p["en"]}
                        for p in bbox_pairs
                        if p.get("es") and p.get("en")
                    ]
-                    # Also feed the flashcard deck
                    for p in bbox_pairs:
                        if p.get("es") and p.get("en"):
                            all_vocab_cards.append({
-                                "front": p["es"],
-                                "back": p["en"],
+                                "front": p["es"], "back": p["en"],
                                "chapter": ch["number"],
                                "chapterTitle": ch["title"],
                                "section": current_section_title,
@@ -326,6 +358,7 @@ def main() -> None:
                    "source": pair_source,
                    "bookPage": book_page,
                    "repairs": repairs,
+                    "tableKind": llm_kind,
                })
                continue