Previously the chapter reader showed vocab tables as a flat list of OCR lines — because Vision reads columns top-to-bottom, the Spanish column appeared as one block followed by the English column, making pairings illegible. Now every vocab table renders as a 2-column grid with Spanish on the left and English on the right. Supporting changes: - New ocr_all_vocab.swift: bounding-box OCR over all 931 vocab images, cluster lines into rows by Y-coordinate, split rows by largest X-gap, detect 2- / 3- / 4-column layouts automatically. ~2800 pairs extracted this pass vs ~1100 from the old block-alternation heuristic. - merge_pdf_into_book.py now prefers bounding-box pairs when present, falls back to the heuristic, embeds the resulting pairs as vocab_table.cards in book.json. - DataLoader passes cards through to TextbookBlock on seed. - TextbookChapterView renders cards via SwiftUI Grid (2 cols). - fix_vocab.py quarantine rule relaxed — only mis-pairs where both sides are clearly the same language are removed. "unknown" sides stay (bbox pipeline already oriented them correctly). Textbook card count jumps from 1044 → 3118 active pairs. textbookDataVersion bumped to 9. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
436 lines
16 KiB
Python
436 lines
16 KiB
Python
#!/usr/bin/env python3
|
|
"""Second-pass extractor: use PDF OCR (from ocr_pdf.swift) as a supplementary
|
|
source of clean text, then re-build book.json with PDF-derived content where it
|
|
improves on the EPUB's image-based extraction.
|
|
|
|
Inputs:
|
|
chapters.json — EPUB structural extraction (narrative text + exercise prompts + image refs)
|
|
answers.json — EPUB answer key
|
|
ocr.json — EPUB image OCR (first pass)
|
|
pdf_ocr.json — PDF page-level OCR (this pass, higher DPI + cleaner)
|
|
|
|
Outputs:
|
|
book.json — merged book used by the app
|
|
vocab_cards.json — derived vocabulary flashcards
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
HERE = Path(__file__).resolve().parent
|
|
sys.path.insert(0, str(HERE))
|
|
from build_book import ( # reuse the helpers defined in build_book.py
|
|
COURSE_NAME,
|
|
build_vocab_cards_for_block,
|
|
clean_instruction,
|
|
classify_line,
|
|
load,
|
|
)
|
|
|
|
# --- Input artifacts (all live next to this script) ---
CHAPTERS_JSON = HERE / "chapters.json"    # EPUB structural extraction
ANSWERS_JSON = HERE / "answers.json"      # EPUB answer key
OCR_JSON = HERE / "ocr.json"              # EPUB per-image OCR (first pass)
PDF_OCR_JSON = HERE / "pdf_ocr.json"      # PDF page-level OCR (this pass)
PAIRED_VOCAB_JSON = HERE / "paired_vocab.json"  # bounding-box pairs (preferred)
# --- Output artifacts ---
OUT_BOOK = HERE / "book.json"             # merged book consumed by the app
OUT_VOCAB = HERE / "vocab_cards.json"     # derived vocabulary flashcards
|
|
|
|
# Vocab/exercise image filenames encode the printed book page: f<page>-<index>.jpg
IMAGE_NAME_RE = re.compile(r"^f(\d{4})-(\d{2})\.jpg$")


def extract_book_page(image_src: str) -> "int | None":
    """Parse the 4-digit printed-book page number out of an image filename.

    Returns None when the filename does not follow the fNNNN-NN.jpg scheme.
    """
    match = IMAGE_NAME_RE.match(image_src)
    if match is None:
        return None
    return int(match.group(1))
|
|
|
|
|
|
def build_pdf_page_index(pdf_ocr: dict) -> "dict[int, dict]":
    """Map bookPage → {lines, confidence, pdfIndex}.

    Strategy: use chapter-start alignments as anchors. For each chapter N,
    anchor[N] = (pdf_idx_where_chapter_starts, book_page_where_chapter_starts).
    Between anchors we interpolate page-by-page (pages run sequentially within
    a chapter in this textbook's layout).

    NOTE(review): assumes pdf_ocr keys are decimal-string page indices and that
    each value is {"lines": [...], "confidence": ...} — confirm against ocr_pdf.swift.
    """
    pages: "dict[int, dict]" = {}
    # Keys are string indices; sort numerically, not lexicographically.
    sorted_keys = sorted(pdf_ocr.keys(), key=lambda k: int(k))

    # --- Detect chapter starts in the PDF OCR ---
    # Heuristic: a chapter-opening page starts with a bare chapter number
    # (1..30) followed by a capitalized title line. First hit per chapter wins.
    pdf_ch_start: "dict[int, int]" = {}
    for k in sorted_keys:
        entry = pdf_ocr[k]
        lines = entry.get("lines", [])
        if len(lines) < 2:
            continue
        first = lines[0].strip()
        second = lines[1].strip()
        if first.isdigit() and 1 <= int(first) <= 30 and len(second) > 5 and second[0:1].isupper():
            ch = int(first)
            if ch not in pdf_ch_start:
                pdf_ch_start[ch] = int(k)

    # --- Load EPUB's authoritative book-page starts ---
    # The first id="page_N" anchor in each chapter's XHTML gives the printed
    # page on which that chapter begins.
    import re as _re
    from bs4 import BeautifulSoup as _BS
    epub_root = HERE.parents[2] / "epub_extract" / "OEBPS"
    book_ch_start: "dict[int, int]" = {}
    for ch in sorted(pdf_ch_start.keys()):
        p = epub_root / f"ch{ch}.xhtml"
        if not p.exists():
            continue
        soup = _BS(p.read_text(encoding="utf-8"), "lxml")
        for span in soup.find_all(True):
            id_ = span.get("id", "") or ""
            m = _re.match(r"page_(\d+)$", id_)
            if m:
                book_ch_start[ch] = int(m.group(1))
                break

    # Build per-chapter (pdf_anchor, book_anchor, next_pdf_anchor) intervals.
    # Only chapters detected on BOTH sides become anchors.
    anchors = []  # list of (ch, pdf_start, book_start)
    for ch in sorted(pdf_ch_start.keys()):
        if ch in book_ch_start:
            anchors.append((ch, pdf_ch_start[ch], book_ch_start[ch]))

    for i, (ch, pdf_s, book_s) in enumerate(anchors):
        # Last chapter has no following anchor; cap the interval at 50 pages.
        next_pdf = anchors[i + 1][1] if i + 1 < len(anchors) else pdf_s + 50
        # Interpolate book page for each pdf index in [pdf_s, next_pdf)
        for pdf_idx in range(pdf_s, next_pdf):
            book_page = book_s + (pdf_idx - pdf_s)
            entry = pdf_ocr.get(str(pdf_idx))
            if entry is None:
                continue
            if book_page in pages:
                # First mapping wins — earlier anchors take precedence.
                continue
            pages[book_page] = {
                "lines": entry["lines"],
                "confidence": entry.get("confidence", 0),
                "pdfIndex": pdf_idx,
            }
    return pages
|
|
|
|
|
|
def merge_ocr(epub_lines: list, pdf_lines: list) -> list:
    """Choose between the two OCR sources for one vocab image.

    The EPUB per-image OCR is targeted (no surrounding prose bleeds in), so it
    always wins when non-empty; the page-level PDF OCR is only a fallback.
    Accent-level repair of EPUB lines happens elsewhere
    (`repair_accents_from_pdf`), not here.
    """
    return epub_lines if epub_lines else pdf_lines
|
|
|
|
|
|
import unicodedata as _u
|
|
|
|
def _strip_accents(s: str) -> str:
|
|
return "".join(c for c in _u.normalize("NFD", s) if _u.category(c) != "Mn")
|
|
|
|
|
|
def _levenshtein(a: str, b: str) -> int:
|
|
if a == b: return 0
|
|
if not a: return len(b)
|
|
if not b: return len(a)
|
|
prev = list(range(len(b) + 1))
|
|
for i, ca in enumerate(a, 1):
|
|
curr = [i]
|
|
for j, cb in enumerate(b, 1):
|
|
cost = 0 if ca == cb else 1
|
|
curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + cost))
|
|
prev = curr
|
|
return prev[-1]
|
|
|
|
|
|
def repair_accents_from_pdf(epub_lines: list, pdf_page_lines: list) -> "tuple[list, int]":
    """For each EPUB OCR line, find a near-match in the PDF page OCR and
    prefer the PDF version (the PDF OCR is higher-DPI and keeps diacritics).

    Repairs include:
      1. exact accent/case differences (e.g. 'iglesia' vs 'Iglesia')
      2. single-character OCR errors (e.g. 'the hrother' -> 'the brother')
      3. two-character OCR errors when the target is long enough (>= 10 chars)

    Returns (repaired_lines, repair_count).
    """
    if not epub_lines or not pdf_page_lines:
        return (epub_lines, 0)

    # Pre-normalize PDF lines ONCE. The fuzzy pass below previously re-ran
    # _strip_accents(p.lower()) for every (epub line x pdf line) pair, which
    # is pure loop-invariant work — hoist it into a (line, key) list.
    pdf_cleaned = [p.strip() for p in pdf_page_lines if p.strip()]
    pdf_keyed = [(p, _strip_accents(p.lower())) for p in pdf_cleaned]
    # First occurrence wins on key collisions (setdefault).
    pdf_by_stripped: dict = {}
    for p, p_key in pdf_keyed:
        pdf_by_stripped.setdefault(p_key, p)

    out: list = []
    repairs = 0
    for e in epub_lines:
        e_stripped = e.strip()
        e_key = _strip_accents(e_stripped.lower())

        # Pass 1: exact accent-only (or case-only) difference.
        if e_key and e_key in pdf_by_stripped and pdf_by_stripped[e_key] != e_stripped:
            out.append(pdf_by_stripped[e_key])
            repairs += 1
            continue

        # Pass 2: fuzzy — find best PDF line within edit distance 1 or 2.
        # Very short keys (< 4 chars) are skipped: too easy to mis-match.
        if len(e_key) >= 4:
            max_distance = 1 if len(e_key) < 10 else 2
            best_match = None
            best_d = max_distance + 1
            for p, p_key in pdf_keyed:
                # Length gap already exceeds the budget — edit distance
                # can't be within max_distance, skip the O(n*m) DP.
                if abs(len(p_key) - len(e_key)) > max_distance:
                    continue
                d = _levenshtein(e_key, p_key)
                if d < best_d:
                    best_d = d
                    best_match = p
                    if d == 0:
                        break
            # Only repair when we found something different from the EPUB
            # line; identical lines (best_d == 0) fall through untouched.
            if best_match and best_match != e_stripped and best_d <= max_distance:
                out.append(best_match)
                repairs += 1
                continue

        out.append(e)
    return (out, repairs)
|
|
|
|
|
|
def vocab_lines_from_pdf_page(
    pdf_page_entry: dict,
    epub_narrative_lines: set
) -> list:
    """Pick out probable vocab-table lines from one PDF page's OCR.

    Drops blank lines, long prose-like lines (> 80 chars), lines already
    captured as EPUB narrative, and bare page-number lines (1-4 digits).
    """
    kept: list = []
    for raw_line in pdf_page_entry.get("lines", []):
        candidate = raw_line.strip()
        if not candidate:
            continue
        if len(candidate) > 80:
            # Body prose, not a table cell
            continue
        if candidate in epub_narrative_lines:
            # Narrative we already captured from the EPUB
            continue
        if re.fullmatch(r"\d{1,4}", candidate):
            # Page-number-only line
            continue
        kept.append(candidate)
    return kept
|
|
|
|
|
|
def main() -> None:
    """Re-build book.json + vocab_cards.json, merging EPUB and PDF OCR.

    Pipeline: load all inputs -> walk every chapter block -> for vocab images,
    repair EPUB OCR with PDF OCR and attach vocab cards (bounding-box pairs
    preferred, heuristic fallback) -> for exercises, merge image/PDF OCR and
    join answer-key items -> write the merged book and a per-chapter flashcard
    file.
    """
    chapters_data = load(CHAPTERS_JSON)
    answers = load(ANSWERS_JSON)["answers"]
    epub_ocr = load(OCR_JSON)
    # PDF-derived inputs are optional; the script degrades to EPUB-only.
    pdf_ocr_raw = load(PDF_OCR_JSON) if PDF_OCR_JSON.exists() else {}
    pdf_pages = build_pdf_page_index(pdf_ocr_raw) if pdf_ocr_raw else {}
    paired_vocab = load(PAIRED_VOCAB_JSON) if PAIRED_VOCAB_JSON.exists() else {}
    print(f"Mapped {len(pdf_pages)} PDF pages to book page numbers")
    print(f"Loaded bounding-box pairs for {len(paired_vocab)} vocab images")

    # Build a global set of EPUB narrative lines (for subtraction when pulling
    # OCR lines into exercises — avoids duplicating body prose).
    narrative_set = set()
    for ch in chapters_data["chapters"]:
        for b in ch["blocks"]:
            if b["kind"] == "paragraph" and b.get("text"):
                narrative_set.add(b["text"].strip())

    book_chapters = []
    all_vocab_cards = []
    pdf_hits = 0      # vocab images whose book page had mapped PDF OCR
    pdf_misses = 0    # vocab images with no PDF OCR available
    merged_pages = 0  # pages where at least one accent repair happened (stat only; not printed)

    for ch in chapters_data["chapters"]:
        out_blocks = []
        # Section title tracks the most recent heading; used to tag vocab cards.
        current_section_title = ch["title"]

        for bi, block in enumerate(ch["blocks"]):
            k = block["kind"]

            if k == "heading":
                current_section_title = block["text"]
                out_blocks.append(block)
                continue

            if k == "paragraph":
                out_blocks.append(block)
                continue

            if k == "key_vocab_header":
                out_blocks.append(block)
                continue

            if k == "vocab_image":
                src = block["src"]
                epub_entry = epub_ocr.get(src)
                epub_lines = epub_entry.get("lines", []) if epub_entry else []
                epub_conf = epub_entry.get("confidence", 0.0) if epub_entry else 0.0

                book_page = extract_book_page(src)
                pdf_entry = pdf_pages.get(book_page) if book_page else None
                pdf_lines = pdf_entry["lines"] if pdf_entry else []

                # Primary: EPUB per-image OCR. Supplementary: PDF page OCR
                # used only for accent/diacritic repair where keys match.
                if pdf_lines:
                    pdf_hits += 1
                else:
                    pdf_misses += 1
                repaired_lines, repairs = repair_accents_from_pdf(epub_lines, pdf_lines)
                # If EPUB OCR was empty entirely, fall back to the PDF lines.
                merged_lines = repaired_lines if repaired_lines else pdf_lines
                merged_conf = max(epub_conf, pdf_entry.get("confidence", 0) if pdf_entry else 0.0)
                if repairs > 0:
                    merged_pages += 1

                # Prefer bounding-box pairs (from paired_vocab.json) when
                # present. Fall back to the block-alternation heuristic.
                # NOTE(review): the heuristic is computed even when bbox pairs
                # exist — only its fallback use is intended; confirm no side
                # effects in build_vocab_cards_for_block.
                bbox = paired_vocab.get(src, {})
                bbox_pairs = bbox.get("pairs", []) if isinstance(bbox, dict) else []
                heuristic = build_vocab_cards_for_block(
                    {"src": src},
                    {"lines": merged_lines, "confidence": merged_conf},
                    ch, current_section_title, bi
                )

                if bbox_pairs:
                    # Pairs missing either side are dropped.
                    cards_for_block = [
                        {"front": p["es"], "back": p["en"]}
                        for p in bbox_pairs
                        if p.get("es") and p.get("en")
                    ]
                    # Also feed the flashcard deck
                    for p in bbox_pairs:
                        if p.get("es") and p.get("en"):
                            all_vocab_cards.append({
                                "front": p["es"],
                                "back": p["en"],
                                "chapter": ch["number"],
                                "chapterTitle": ch["title"],
                                "section": current_section_title,
                                "sourceImage": src,
                            })
                    pair_source = "bbox"
                else:
                    cards_for_block = [{"front": c["front"], "back": c["back"]} for c in heuristic]
                    all_vocab_cards.extend(heuristic)
                    pair_source = "heuristic"

                out_blocks.append({
                    "kind": "vocab_table",
                    "sourceImage": src,
                    "ocrLines": merged_lines,
                    "ocrConfidence": merged_conf,
                    "cardCount": len(cards_for_block),
                    "cards": cards_for_block,
                    "columnCount": bbox.get("columnCount", 2) if isinstance(bbox, dict) else 2,
                    "source": pair_source,
                    "bookPage": book_page,
                    "repairs": repairs,
                })
                continue

            if k == "exercise":
                ans = answers.get(block["id"])
                # EPUB image OCR (if any image refs)
                image_ocr_lines: list = []
                # NOTE(review): the PDF-page supplement below is nested inside
                # the image_refs loop, so it runs once per referenced image —
                # confirm this matches the original layout's intent.
                for src in block.get("image_refs", []):
                    ee = epub_ocr.get(src)
                    if ee:
                        image_ocr_lines.extend(ee.get("lines", []))
                    # Add PDF-page OCR for that page if available
                    bp = extract_book_page(src)
                    if bp and pdf_pages.get(bp):
                        # Only add lines not already present from EPUB OCR
                        pdf_lines = pdf_pages[bp]["lines"]
                        for line in pdf_lines:
                            line = line.strip()
                            if not line or line in image_ocr_lines:
                                continue
                            if line in narrative_set:
                                continue
                            image_ocr_lines.append(line)

                prompts = [p for p in block.get("prompts", []) if p.strip()]
                extras = [e for e in block.get("extra", []) if e.strip()]
                if not prompts and image_ocr_lines:
                    # Extract numbered lines from OCR ("1. ..." / "1) ...")
                    for line in image_ocr_lines:
                        m = re.match(r"^(\d+)[.)]\s*(.+)", line.strip())
                        if m:
                            prompts.append(f"{m.group(1)}. {m.group(2)}")

                # Flatten the answer key's subparts into one labeled list.
                sub = ans["subparts"] if ans else []
                answer_items = []
                for sp in sub:
                    for it in sp["items"]:
                        answer_items.append({
                            "label": sp["label"],
                            "number": it["number"],
                            "answer": it["answer"],
                            "alternates": it["alternates"],
                        })

                out_blocks.append({
                    "kind": "exercise",
                    "id": block["id"],
                    "ansAnchor": block.get("ans_anchor", ""),
                    "instruction": clean_instruction(block.get("instruction", "")),
                    "extra": extras,
                    "prompts": prompts,
                    "ocrLines": image_ocr_lines,
                    "freeform": ans["freeform"] if ans else False,
                    "answerItems": answer_items,
                    "answerRaw": ans["raw"] if ans else "",
                    "answerSubparts": sub,
                })
                continue

            # Unknown block kinds pass through unchanged.
            out_blocks.append(block)

        book_chapters.append({
            "id": ch["id"],
            "number": ch["number"],
            "title": ch["title"],
            "part": ch.get("part"),
            "blocks": out_blocks,
        })

    # Assemble the top-level book document with summary counts.
    book = {
        "courseName": COURSE_NAME,
        "totalChapters": len(book_chapters),
        "totalExercises": sum(1 for ch in book_chapters for b in ch["blocks"] if b["kind"] == "exercise"),
        "totalVocabTables": sum(1 for ch in book_chapters for b in ch["blocks"] if b["kind"] == "vocab_table"),
        "totalVocabCards": len(all_vocab_cards),
        "parts": chapters_data.get("part_memberships", {}),
        "chapters": book_chapters,
        "sources": {
            "epub_images_ocr": bool(epub_ocr),
            "pdf_pages_ocr": bool(pdf_ocr_raw),
            "pdf_pages_mapped": len(pdf_pages),
        },
    }
    OUT_BOOK.write_text(json.dumps(book, ensure_ascii=False))

    # Group flashcards by chapter for the companion vocab file.
    vocab_by_chapter: dict = {}
    for card in all_vocab_cards:
        vocab_by_chapter.setdefault(card["chapter"], []).append(card)
    OUT_VOCAB.write_text(json.dumps({
        "courseName": COURSE_NAME,
        "chapters": [
            {"chapter": n, "cards": cs}
            for n, cs in sorted(vocab_by_chapter.items())
        ],
    }, ensure_ascii=False, indent=2))

    print(f"Wrote {OUT_BOOK}")
    print(f"Wrote {OUT_VOCAB}")
    print(f"Chapters: {book['totalChapters']}")
    print(f"Exercises: {book['totalExercises']}")
    print(f"Vocab tables: {book['totalVocabTables']}")
    print(f"Vocab cards (derived): {book['totalVocabCards']}")
    print(f"PDF hits vs misses: {pdf_hits} / {pdf_misses}")
|
|
|
|
|
|
# Script entry point — run the full merge when invoked directly.
if __name__ == "__main__":
    main()
|