Fixes #32 — LLM vision pass for vocab pairs, fixes scrambled English/Spanish
The bbox-OCR pipeline mis-paired ~114 vocab tables across the book — the chapter 7 "Other Idioms" image (issue #32) being the most visible. Three failure modes were collapsing the data:

1) classifier blind to subject pronouns ("yo", "I", etc.)
2) right-then-left OCR reads on 2-col tables
3) Y-cluster drift on multi-line cells in 4-col layouts

Replaced the entire vocab-extraction tier with a Claude vision pass over all 931 vocab images. Output is keyed by image with three classifications:

- pair_table (extract all Spanish↔English pairs)
- reference_only (Spanish-only conjugation tables — no pairs, UI shows the flat OCR lines as a reference list instead)
- hybrid (some header pairs + reference content beneath; only the genuine pairs become cards)

merge_pdf_into_book.py now picks pair source by priority: llm-vision → bounding-box OCR → block-alternation heuristic.

Numbers (across the whole book):

- mis-oriented tables: 114 → 5
- quarantined cards: 250 → 2
- extracted pairs: 2832 → 4569

textbookDataVersion bumped to 13. Per-batch agent outputs are gitignored under Conjuga/Scripts/textbook/paired_vocab_llm/ — only the merged paired_vocab_llm.json (also gitignored) is needed to rebuild.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -33,7 +33,8 @@ CHAPTERS_JSON = HERE / "chapters.json"
|
||||
ANSWERS_JSON = HERE / "answers.json"
|
||||
OCR_JSON = HERE / "ocr.json"
|
||||
PDF_OCR_JSON = HERE / "pdf_ocr.json"
|
||||
PAIRED_VOCAB_JSON = HERE / "paired_vocab.json" # bounding-box pairs (preferred)
|
||||
PAIRED_VOCAB_JSON = HERE / "paired_vocab.json" # bounding-box pairs (fallback)
|
||||
PAIRED_VOCAB_LLM_JSON = HERE / "paired_vocab_llm.json" # LLM vision pairs (preferred)
|
||||
OUT_BOOK = HERE / "book.json"
|
||||
OUT_VOCAB = HERE / "vocab_cards.json"
|
||||
|
||||
@@ -224,8 +225,10 @@ def main() -> None:
|
||||
pdf_ocr_raw = load(PDF_OCR_JSON) if PDF_OCR_JSON.exists() else {}
|
||||
pdf_pages = build_pdf_page_index(pdf_ocr_raw) if pdf_ocr_raw else {}
|
||||
paired_vocab = load(PAIRED_VOCAB_JSON) if PAIRED_VOCAB_JSON.exists() else {}
|
||||
paired_llm = load(PAIRED_VOCAB_LLM_JSON) if PAIRED_VOCAB_LLM_JSON.exists() else {}
|
||||
print(f"Mapped {len(pdf_pages)} PDF pages to book page numbers")
|
||||
print(f"Loaded bounding-box pairs for {len(paired_vocab)} vocab images")
|
||||
print(f"Loaded LLM-vision pairs for {len(paired_llm)} vocab images")
|
||||
|
||||
# Build a global set of EPUB narrative lines (for subtraction when pulling vocab)
|
||||
narrative_set = set()
|
||||
@@ -282,28 +285,57 @@ def main() -> None:
|
||||
if repairs > 0:
|
||||
merged_pages += 1
|
||||
|
||||
# Prefer bounding-box pairs (from paired_vocab.json) when
|
||||
# present. Fall back to the block-alternation heuristic.
|
||||
# Source priority:
|
||||
# 1) LLM-vision pairs (paired_vocab_llm.json) — semantic
|
||||
# classification (pair_table / reference_only / hybrid)
|
||||
# with correct orientation.
|
||||
# 2) Bounding-box pairs (paired_vocab.json) — Vision OCR
|
||||
# with X-gap row splitting.
|
||||
# 3) Block-alternation heuristic — flat OCR fallback.
|
||||
llm_entry = paired_llm.get(src, {}) if isinstance(paired_llm.get(src), dict) else {}
|
||||
llm_kind = llm_entry.get("kind")
|
||||
llm_pairs = llm_entry.get("pairs", []) if llm_entry else []
|
||||
|
||||
bbox = paired_vocab.get(src, {})
|
||||
bbox_pairs = bbox.get("pairs", []) if isinstance(bbox, dict) else []
|
||||
|
||||
heuristic = build_vocab_cards_for_block(
|
||||
{"src": src},
|
||||
{"lines": merged_lines, "confidence": merged_conf},
|
||||
ch, current_section_title, bi
|
||||
)
|
||||
|
||||
if bbox_pairs:
|
||||
# Choose pair source. For reference_only (Spanish-only tables)
|
||||
# we deliberately produce no cards — the UI will fall back to
|
||||
# rendering the flat OCR lines as a reference list.
|
||||
if llm_kind == "reference_only":
|
||||
cards_for_block = []
|
||||
pair_source = "llm-reference"
|
||||
elif llm_pairs:
|
||||
cards_for_block = [
|
||||
{"front": p["es"], "back": p["en"]}
|
||||
for p in llm_pairs
|
||||
if p.get("es") and p.get("en")
|
||||
]
|
||||
for c in cards_for_block:
|
||||
all_vocab_cards.append({
|
||||
"front": c["front"], "back": c["back"],
|
||||
"chapter": ch["number"],
|
||||
"chapterTitle": ch["title"],
|
||||
"section": current_section_title,
|
||||
"sourceImage": src,
|
||||
})
|
||||
pair_source = "llm-" + (llm_kind or "pairs")
|
||||
elif bbox_pairs:
|
||||
cards_for_block = [
|
||||
{"front": p["es"], "back": p["en"]}
|
||||
for p in bbox_pairs
|
||||
if p.get("es") and p.get("en")
|
||||
]
|
||||
# Also feed the flashcard deck
|
||||
for p in bbox_pairs:
|
||||
if p.get("es") and p.get("en"):
|
||||
all_vocab_cards.append({
|
||||
"front": p["es"],
|
||||
"back": p["en"],
|
||||
"front": p["es"], "back": p["en"],
|
||||
"chapter": ch["number"],
|
||||
"chapterTitle": ch["title"],
|
||||
"section": current_section_title,
|
||||
@@ -326,6 +358,7 @@ def main() -> None:
|
||||
"source": pair_source,
|
||||
"bookPage": book_page,
|
||||
"repairs": repairs,
|
||||
"tableKind": llm_kind,
|
||||
})
|
||||
continue
|
||||
|
||||
|
||||
Reference in New Issue
Block a user