Render textbook vocab as paired Spanish→English grid
Previously the chapter reader showed vocab tables as a flat list of OCR lines. Because Vision reads columns top-to-bottom, the Spanish column appeared as one block followed by the English column, making the pairings illegible. Now every vocab table renders as a 2-column grid with Spanish on the left and English on the right.

Supporting changes:
- New ocr_all_vocab.swift: runs bounding-box OCR over all 931 vocab images, clusters lines into rows by Y-coordinate, splits rows at the largest X-gap, and detects 2- / 3- / 4-column layouts automatically. ~2800 pairs extracted this pass vs ~1100 from the old block-alternation heuristic.
- merge_pdf_into_book.py now prefers bounding-box pairs when present, falls back to the heuristic otherwise, and embeds the resulting pairs as vocab_table.cards in book.json.
- DataLoader passes cards through to TextbookBlock on seed.
- TextbookChapterView renders cards via SwiftUI Grid (2 cols).
- fix_vocab.py quarantine rule relaxed: only mis-pairs where both sides are clearly the same language are removed. "unknown" sides stay (the bbox pipeline already oriented them correctly).

Textbook card count jumps from 1044 → 3118 active pairs. textbookDataVersion bumped to 9.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -33,6 +33,7 @@ CHAPTERS_JSON = HERE / "chapters.json"
|
||||
ANSWERS_JSON = HERE / "answers.json"
|
||||
OCR_JSON = HERE / "ocr.json"
|
||||
PDF_OCR_JSON = HERE / "pdf_ocr.json"
|
||||
PAIRED_VOCAB_JSON = HERE / "paired_vocab.json" # bounding-box pairs (preferred)
|
||||
OUT_BOOK = HERE / "book.json"
|
||||
OUT_VOCAB = HERE / "vocab_cards.json"
|
||||
|
||||
@@ -222,7 +223,9 @@ def main() -> None:
|
||||
epub_ocr = load(OCR_JSON)
|
||||
pdf_ocr_raw = load(PDF_OCR_JSON) if PDF_OCR_JSON.exists() else {}
|
||||
pdf_pages = build_pdf_page_index(pdf_ocr_raw) if pdf_ocr_raw else {}
|
||||
paired_vocab = load(PAIRED_VOCAB_JSON) if PAIRED_VOCAB_JSON.exists() else {}
|
||||
print(f"Mapped {len(pdf_pages)} PDF pages to book page numbers")
|
||||
print(f"Loaded bounding-box pairs for {len(paired_vocab)} vocab images")
|
||||
|
||||
# Build a global set of EPUB narrative lines (for subtraction when pulling vocab)
|
||||
narrative_set = set()
|
||||
@@ -279,19 +282,48 @@ def main() -> None:
|
||||
if repairs > 0:
|
||||
merged_pages += 1
|
||||
|
||||
derived = build_vocab_cards_for_block(
|
||||
# Prefer bounding-box pairs (from paired_vocab.json) when
|
||||
# present. Fall back to the block-alternation heuristic.
|
||||
bbox = paired_vocab.get(src, {})
|
||||
bbox_pairs = bbox.get("pairs", []) if isinstance(bbox, dict) else []
|
||||
heuristic = build_vocab_cards_for_block(
|
||||
{"src": src},
|
||||
{"lines": merged_lines, "confidence": merged_conf},
|
||||
ch, current_section_title, bi
|
||||
)
|
||||
all_vocab_cards.extend(derived)
|
||||
|
||||
if bbox_pairs:
|
||||
cards_for_block = [
|
||||
{"front": p["es"], "back": p["en"]}
|
||||
for p in bbox_pairs
|
||||
if p.get("es") and p.get("en")
|
||||
]
|
||||
# Also feed the flashcard deck
|
||||
for p in bbox_pairs:
|
||||
if p.get("es") and p.get("en"):
|
||||
all_vocab_cards.append({
|
||||
"front": p["es"],
|
||||
"back": p["en"],
|
||||
"chapter": ch["number"],
|
||||
"chapterTitle": ch["title"],
|
||||
"section": current_section_title,
|
||||
"sourceImage": src,
|
||||
})
|
||||
pair_source = "bbox"
|
||||
else:
|
||||
cards_for_block = [{"front": c["front"], "back": c["back"]} for c in heuristic]
|
||||
all_vocab_cards.extend(heuristic)
|
||||
pair_source = "heuristic"
|
||||
|
||||
out_blocks.append({
|
||||
"kind": "vocab_table",
|
||||
"sourceImage": src,
|
||||
"ocrLines": merged_lines,
|
||||
"ocrConfidence": merged_conf,
|
||||
"cardCount": len(derived),
|
||||
"source": "pdf-repaired" if repairs > 0 else ("epub" if epub_lines else "pdf"),
|
||||
"cardCount": len(cards_for_block),
|
||||
"cards": cards_for_block,
|
||||
"columnCount": bbox.get("columnCount", 2) if isinstance(bbox, dict) else 2,
|
||||
"source": pair_source,
|
||||
"bookPage": book_page,
|
||||
"repairs": repairs,
|
||||
})
|
||||
|
||||
Reference in New Issue
Block a user