Fixes #32 — LLM vision pass for vocab pairs, fixes scrambled English/Spanish
The bbox-OCR pipeline mis-paired ~114 vocab tables across the book — the
chapter 7 "Other Idioms" image (issue #32) being the most visible. Three
failure modes were collapsing the data:

  1) classifier blind to subject pronouns ("yo", "I", etc.)
  2) right-then-left OCR reads on 2-col tables
  3) Y-cluster drift on multi-line cells in 4-col layouts

Replaced the entire vocab-extraction tier with a Claude vision pass over
all 931 vocab images. Output is keyed by image with three classifications:

  - pair_table (extract all Spanish↔English pairs)
  - reference_only (Spanish-only conjugation tables — no pairs, UI shows
    the flat OCR lines as a reference list instead)
  - hybrid (some header pairs + reference content beneath; only the
    genuine pairs become cards)

merge_pdf_into_book.py now picks pair source by priority:
llm-vision → bounding-box OCR → block-alternation heuristic.

Numbers (across the whole book):

  - mis-oriented tables: 114 → 5
  - quarantined cards: 250 → 2
  - extracted pairs: 2832 → 4569

textbookDataVersion bumped to 13. Per-batch agent outputs gitignored under
Conjuga/Scripts/textbook/paired_vocab_llm/ — only the merged
paired_vocab_llm.json (also gitignored) is needed to rebuild.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -50,6 +50,7 @@ epub_extract/
|
|||||||
# Scripts are committed; their generated outputs are not.
|
# Scripts are committed; their generated outputs are not.
|
||||||
Conjuga/Scripts/textbook/*.json
|
Conjuga/Scripts/textbook/*.json
|
||||||
Conjuga/Scripts/textbook/review.html
|
Conjuga/Scripts/textbook/review.html
|
||||||
|
Conjuga/Scripts/textbook/paired_vocab_llm/
|
||||||
# Note: the app-bundle copies (Conjuga/Conjuga/textbook_{data,vocab}.json)
|
# Note: the app-bundle copies (Conjuga/Conjuga/textbook_{data,vocab}.json)
|
||||||
# ARE committed so `xcodebuild` works on a fresh clone without first running
|
# ARE committed so `xcodebuild` works on a fresh clone without first running
|
||||||
# the pipeline. They're regenerated from the scripts when content changes.
|
# the pipeline. They're regenerated from the scripts when content changes.
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ actor DataLoader {
|
|||||||
static let courseDataVersion = 7
|
static let courseDataVersion = 7
|
||||||
static let courseDataKey = "courseDataVersion"
|
static let courseDataKey = "courseDataVersion"
|
||||||
|
|
||||||
static let textbookDataVersion = 12
|
static let textbookDataVersion = 13
|
||||||
static let textbookDataKey = "textbookDataVersion"
|
static let textbookDataKey = "textbookDataVersion"
|
||||||
|
|
||||||
/// Quick check: does the DB need seeding or course data refresh?
|
/// Quick check: does the DB need seeding or course data refresh?
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
+21029
-9437
File diff suppressed because it is too large
Load Diff
@@ -33,7 +33,8 @@ CHAPTERS_JSON = HERE / "chapters.json"
|
|||||||
ANSWERS_JSON = HERE / "answers.json"
|
ANSWERS_JSON = HERE / "answers.json"
|
||||||
OCR_JSON = HERE / "ocr.json"
|
OCR_JSON = HERE / "ocr.json"
|
||||||
PDF_OCR_JSON = HERE / "pdf_ocr.json"
|
PDF_OCR_JSON = HERE / "pdf_ocr.json"
|
||||||
PAIRED_VOCAB_JSON = HERE / "paired_vocab.json" # bounding-box pairs (preferred)
|
PAIRED_VOCAB_JSON = HERE / "paired_vocab.json" # bounding-box pairs (fallback)
|
||||||
|
PAIRED_VOCAB_LLM_JSON = HERE / "paired_vocab_llm.json" # LLM vision pairs (preferred)
|
||||||
OUT_BOOK = HERE / "book.json"
|
OUT_BOOK = HERE / "book.json"
|
||||||
OUT_VOCAB = HERE / "vocab_cards.json"
|
OUT_VOCAB = HERE / "vocab_cards.json"
|
||||||
|
|
||||||
@@ -224,8 +225,10 @@ def main() -> None:
|
|||||||
pdf_ocr_raw = load(PDF_OCR_JSON) if PDF_OCR_JSON.exists() else {}
|
pdf_ocr_raw = load(PDF_OCR_JSON) if PDF_OCR_JSON.exists() else {}
|
||||||
pdf_pages = build_pdf_page_index(pdf_ocr_raw) if pdf_ocr_raw else {}
|
pdf_pages = build_pdf_page_index(pdf_ocr_raw) if pdf_ocr_raw else {}
|
||||||
paired_vocab = load(PAIRED_VOCAB_JSON) if PAIRED_VOCAB_JSON.exists() else {}
|
paired_vocab = load(PAIRED_VOCAB_JSON) if PAIRED_VOCAB_JSON.exists() else {}
|
||||||
|
paired_llm = load(PAIRED_VOCAB_LLM_JSON) if PAIRED_VOCAB_LLM_JSON.exists() else {}
|
||||||
print(f"Mapped {len(pdf_pages)} PDF pages to book page numbers")
|
print(f"Mapped {len(pdf_pages)} PDF pages to book page numbers")
|
||||||
print(f"Loaded bounding-box pairs for {len(paired_vocab)} vocab images")
|
print(f"Loaded bounding-box pairs for {len(paired_vocab)} vocab images")
|
||||||
|
print(f"Loaded LLM-vision pairs for {len(paired_llm)} vocab images")
|
||||||
|
|
||||||
# Build a global set of EPUB narrative lines (for subtraction when pulling vocab)
|
# Build a global set of EPUB narrative lines (for subtraction when pulling vocab)
|
||||||
narrative_set = set()
|
narrative_set = set()
|
||||||
@@ -282,28 +285,57 @@ def main() -> None:
|
|||||||
if repairs > 0:
|
if repairs > 0:
|
||||||
merged_pages += 1
|
merged_pages += 1
|
||||||
|
|
||||||
# Prefer bounding-box pairs (from paired_vocab.json) when
|
# Source priority:
|
||||||
# present. Fall back to the block-alternation heuristic.
|
# 1) LLM-vision pairs (paired_vocab_llm.json) — semantic
|
||||||
|
# classification (pair_table / reference_only / hybrid)
|
||||||
|
# with correct orientation.
|
||||||
|
# 2) Bounding-box pairs (paired_vocab.json) — Vision OCR
|
||||||
|
# with X-gap row splitting.
|
||||||
|
# 3) Block-alternation heuristic — flat OCR fallback.
|
||||||
|
llm_entry = paired_llm.get(src, {}) if isinstance(paired_llm.get(src), dict) else {}
|
||||||
|
llm_kind = llm_entry.get("kind")
|
||||||
|
llm_pairs = llm_entry.get("pairs", []) if llm_entry else []
|
||||||
|
|
||||||
bbox = paired_vocab.get(src, {})
|
bbox = paired_vocab.get(src, {})
|
||||||
bbox_pairs = bbox.get("pairs", []) if isinstance(bbox, dict) else []
|
bbox_pairs = bbox.get("pairs", []) if isinstance(bbox, dict) else []
|
||||||
|
|
||||||
heuristic = build_vocab_cards_for_block(
|
heuristic = build_vocab_cards_for_block(
|
||||||
{"src": src},
|
{"src": src},
|
||||||
{"lines": merged_lines, "confidence": merged_conf},
|
{"lines": merged_lines, "confidence": merged_conf},
|
||||||
ch, current_section_title, bi
|
ch, current_section_title, bi
|
||||||
)
|
)
|
||||||
|
|
||||||
if bbox_pairs:
|
# Choose pair source. For reference_only (Spanish-only tables)
|
||||||
|
# we deliberately produce no cards — the UI will fall back to
|
||||||
|
# rendering the flat OCR lines as a reference list.
|
||||||
|
if llm_kind == "reference_only":
|
||||||
|
cards_for_block = []
|
||||||
|
pair_source = "llm-reference"
|
||||||
|
elif llm_pairs:
|
||||||
|
cards_for_block = [
|
||||||
|
{"front": p["es"], "back": p["en"]}
|
||||||
|
for p in llm_pairs
|
||||||
|
if p.get("es") and p.get("en")
|
||||||
|
]
|
||||||
|
for c in cards_for_block:
|
||||||
|
all_vocab_cards.append({
|
||||||
|
"front": c["front"], "back": c["back"],
|
||||||
|
"chapter": ch["number"],
|
||||||
|
"chapterTitle": ch["title"],
|
||||||
|
"section": current_section_title,
|
||||||
|
"sourceImage": src,
|
||||||
|
})
|
||||||
|
pair_source = "llm-" + (llm_kind or "pairs")
|
||||||
|
elif bbox_pairs:
|
||||||
cards_for_block = [
|
cards_for_block = [
|
||||||
{"front": p["es"], "back": p["en"]}
|
{"front": p["es"], "back": p["en"]}
|
||||||
for p in bbox_pairs
|
for p in bbox_pairs
|
||||||
if p.get("es") and p.get("en")
|
if p.get("es") and p.get("en")
|
||||||
]
|
]
|
||||||
# Also feed the flashcard deck
|
|
||||||
for p in bbox_pairs:
|
for p in bbox_pairs:
|
||||||
if p.get("es") and p.get("en"):
|
if p.get("es") and p.get("en"):
|
||||||
all_vocab_cards.append({
|
all_vocab_cards.append({
|
||||||
"front": p["es"],
|
"front": p["es"], "back": p["en"],
|
||||||
"back": p["en"],
|
|
||||||
"chapter": ch["number"],
|
"chapter": ch["number"],
|
||||||
"chapterTitle": ch["title"],
|
"chapterTitle": ch["title"],
|
||||||
"section": current_section_title,
|
"section": current_section_title,
|
||||||
@@ -326,6 +358,7 @@ def main() -> None:
|
|||||||
"source": pair_source,
|
"source": pair_source,
|
||||||
"bookPage": book_page,
|
"bookPage": book_page,
|
||||||
"repairs": repairs,
|
"repairs": repairs,
|
||||||
|
"tableKind": llm_kind,
|
||||||
})
|
})
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user