Fixes #32 — LLM vision pass for vocab pairs, fixes scrambled English/Spanish

The bbox-OCR pipeline mis-paired ~114 vocab tables across the book — the
chapter 7 "Other Idioms" image (issue #32) being the most visible.
Three failure modes were collapsing the data:
  1) classifier blind to subject pronouns ("yo", "I", etc.)
  2) right-then-left OCR reads on 2-col tables
  3) Y-cluster drift on multi-line cells in 4-col layouts

Put a Claude vision pass over all 931 vocab images in front of the old
vocab-extraction tiers, which stay as fallbacks (priority order below).
Output is keyed by image with three classifications:
  - pair_table       (extract all Spanish↔English pairs)
  - reference_only   (Spanish-only conjugation tables — no pairs, UI shows
                      the flat OCR lines as a reference list instead)
  - hybrid           (some header pairs + reference content beneath; only
                      the genuine pairs become cards)

merge_pdf_into_book.py now picks pair source by priority:
  llm-vision → bounding-box OCR → block-alternation heuristic.

Numbers (across the whole book):
  - mis-oriented tables: 114 → 5
  - quarantined cards:   250 → 2
  - extracted pairs:     2832 → 4569

textbookDataVersion bumped to 13. Per-batch agent outputs live under
Conjuga/Scripts/textbook/paired_vocab_llm/ and are gitignored; only the
merged paired_vocab_llm.json (also gitignored) is needed to rebuild.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Trey T
2026-05-03 18:48:04 -05:00
parent 90aea92fba
commit f368c24ad6
5 changed files with 21072 additions and 9446 deletions
+1 -1
View File
@@ -6,7 +6,7 @@ actor DataLoader {
static let courseDataVersion = 7
static let courseDataKey = "courseDataVersion"
static let textbookDataVersion = 12
static let textbookDataVersion = 13
static let textbookDataKey = "textbookDataVersion"
/// Quick check: does the DB need seeding or course data refresh?
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large Load Diff
@@ -33,7 +33,8 @@ CHAPTERS_JSON = HERE / "chapters.json"
ANSWERS_JSON = HERE / "answers.json"
OCR_JSON = HERE / "ocr.json"
PDF_OCR_JSON = HERE / "pdf_ocr.json"
PAIRED_VOCAB_JSON = HERE / "paired_vocab.json" # bounding-box pairs (preferred)
PAIRED_VOCAB_JSON = HERE / "paired_vocab.json" # bounding-box pairs (fallback)
PAIRED_VOCAB_LLM_JSON = HERE / "paired_vocab_llm.json" # LLM vision pairs (preferred)
OUT_BOOK = HERE / "book.json"
OUT_VOCAB = HERE / "vocab_cards.json"
@@ -224,8 +225,10 @@ def main() -> None:
pdf_ocr_raw = load(PDF_OCR_JSON) if PDF_OCR_JSON.exists() else {}
pdf_pages = build_pdf_page_index(pdf_ocr_raw) if pdf_ocr_raw else {}
paired_vocab = load(PAIRED_VOCAB_JSON) if PAIRED_VOCAB_JSON.exists() else {}
paired_llm = load(PAIRED_VOCAB_LLM_JSON) if PAIRED_VOCAB_LLM_JSON.exists() else {}
print(f"Mapped {len(pdf_pages)} PDF pages to book page numbers")
print(f"Loaded bounding-box pairs for {len(paired_vocab)} vocab images")
print(f"Loaded LLM-vision pairs for {len(paired_llm)} vocab images")
# Build a global set of EPUB narrative lines (for subtraction when pulling vocab)
narrative_set = set()
@@ -282,28 +285,57 @@ def main() -> None:
if repairs > 0:
merged_pages += 1
# Prefer bounding-box pairs (from paired_vocab.json) when
# present. Fall back to the block-alternation heuristic.
# Source priority:
# 1) LLM-vision pairs (paired_vocab_llm.json) — semantic
# classification (pair_table / reference_only / hybrid)
# with correct orientation.
# 2) Bounding-box pairs (paired_vocab.json) — Vision OCR
# with X-gap row splitting.
# 3) Block-alternation heuristic — flat OCR fallback.
llm_entry = paired_llm.get(src, {}) if isinstance(paired_llm.get(src), dict) else {}
llm_kind = llm_entry.get("kind")
llm_pairs = llm_entry.get("pairs", []) if llm_entry else []
bbox = paired_vocab.get(src, {})
bbox_pairs = bbox.get("pairs", []) if isinstance(bbox, dict) else []
heuristic = build_vocab_cards_for_block(
{"src": src},
{"lines": merged_lines, "confidence": merged_conf},
ch, current_section_title, bi
)
if bbox_pairs:
# Choose pair source. For reference_only (Spanish-only tables)
# we deliberately produce no cards — the UI will fall back to
# rendering the flat OCR lines as a reference list.
if llm_kind == "reference_only":
cards_for_block = []
pair_source = "llm-reference"
elif llm_pairs:
cards_for_block = [
{"front": p["es"], "back": p["en"]}
for p in llm_pairs
if p.get("es") and p.get("en")
]
for c in cards_for_block:
all_vocab_cards.append({
"front": c["front"], "back": c["back"],
"chapter": ch["number"],
"chapterTitle": ch["title"],
"section": current_section_title,
"sourceImage": src,
})
pair_source = "llm-" + (llm_kind or "pairs")
elif bbox_pairs:
cards_for_block = [
{"front": p["es"], "back": p["en"]}
for p in bbox_pairs
if p.get("es") and p.get("en")
]
# Also feed the flashcard deck
for p in bbox_pairs:
if p.get("es") and p.get("en"):
all_vocab_cards.append({
"front": p["es"],
"back": p["en"],
"front": p["es"], "back": p["en"],
"chapter": ch["number"],
"chapterTitle": ch["title"],
"section": current_section_title,
@@ -326,6 +358,7 @@ def main() -> None:
"source": pair_source,
"bookPage": book_page,
"repairs": repairs,
"tableKind": llm_kind,
})
continue