#!/usr/bin/env python3
"""Second-pass extractor: use PDF OCR (from ocr_pdf.swift) as a supplementary
source of clean text, then re-build book.json with PDF-derived content where
it improves on the EPUB's image-based extraction.

Inputs:
    chapters.json — EPUB structural extraction (narrative text + exercise
                    prompts + image refs)
    answers.json  — EPUB answer key
    ocr.json      — EPUB image OCR (first pass)
    pdf_ocr.json  — PDF page-level OCR (this pass, higher DPI + cleaner)

Outputs:
    book.json        — merged book used by the app
    vocab_cards.json — derived vocabulary flashcards
"""

import json
import re
import sys
import unicodedata as _u
from pathlib import Path

HERE = Path(__file__).resolve().parent
sys.path.insert(0, str(HERE))

from build_book import (  # reuse the helpers defined in build_book.py
    COURSE_NAME,
    build_vocab_cards_for_block,
    clean_instruction,
    classify_line,
    load,
)

CHAPTERS_JSON = HERE / "chapters.json"
ANSWERS_JSON = HERE / "answers.json"
OCR_JSON = HERE / "ocr.json"
PDF_OCR_JSON = HERE / "pdf_ocr.json"
OUT_BOOK = HERE / "book.json"
OUT_VOCAB = HERE / "vocab_cards.json"

# Image filenames look like f0123-04.jpg: group 1 is the book page number.
IMAGE_NAME_RE = re.compile(r"^f(\d{4})-(\d{2})\.jpg$")


def extract_book_page(image_src: str) -> "int | None":
    """Return the book page number encoded in an image filename, or None."""
    m = IMAGE_NAME_RE.match(image_src)
    return int(m.group(1)) if m else None


def build_pdf_page_index(pdf_ocr: dict) -> "dict[int, dict]":
    """Map bookPage → {lines, confidence, pdfIndex}.

    Strategy: use chapter-start alignments as anchors. For each chapter N,
    anchor[N] = (pdf_idx_where_chapter_starts, book_page_where_chapter_starts).
    Between anchors we interpolate page-by-page (pages run sequentially within
    a chapter in this textbook's layout).
    """
    pages: "dict[int, dict]" = {}
    sorted_keys = sorted(pdf_ocr.keys(), key=lambda k: int(k))

    # --- Detect chapter starts in the PDF OCR ---
    # Heuristic: a page whose first OCR line is a bare chapter number (1-30)
    # followed by a capitalized title line marks a chapter opening.
    pdf_ch_start: "dict[int, int]" = {}
    for k in sorted_keys:
        entry = pdf_ocr[k]
        lines = entry.get("lines", [])
        if len(lines) < 2:
            continue
        first = lines[0].strip()
        second = lines[1].strip()
        if (
            first.isdigit()
            and 1 <= int(first) <= 30
            and len(second) > 5
            and second[0:1].isupper()
        ):
            ch = int(first)
            if ch not in pdf_ch_start:
                pdf_ch_start[ch] = int(k)

    # --- Load EPUB's authoritative book-page starts ---
    # Local imports: bs4 is only needed when a PDF OCR file is present.
    import re as _re
    from bs4 import BeautifulSoup as _BS

    epub_root = HERE.parents[2] / "epub_extract" / "OEBPS"
    book_ch_start: "dict[int, int]" = {}
    for ch in sorted(pdf_ch_start.keys()):
        p = epub_root / f"ch{ch}.xhtml"
        if not p.exists():
            continue
        soup = _BS(p.read_text(encoding="utf-8"), "lxml")
        # The first element with an id of the form page_N gives the chapter's
        # starting book page.
        for span in soup.find_all(True):
            id_ = span.get("id", "") or ""
            m = _re.match(r"page_(\d+)$", id_)
            if m:
                book_ch_start[ch] = int(m.group(1))
                break

    # Build per-chapter (pdf_anchor, book_anchor, next_pdf_anchor) intervals.
    anchors = []  # list of (ch, pdf_start, book_start)
    for ch in sorted(pdf_ch_start.keys()):
        if ch in book_ch_start:
            anchors.append((ch, pdf_ch_start[ch], book_ch_start[ch]))

    for i, (ch, pdf_s, book_s) in enumerate(anchors):
        # The last chapter has no following anchor; cap its span at 50 pages.
        next_pdf = anchors[i + 1][1] if i + 1 < len(anchors) else pdf_s + 50
        # Interpolate book page for each pdf index in [pdf_s, next_pdf)
        for pdf_idx in range(pdf_s, next_pdf):
            book_page = book_s + (pdf_idx - pdf_s)
            entry = pdf_ocr.get(str(pdf_idx))
            if entry is None:
                continue
            if book_page in pages:
                # First anchor wins; never overwrite an earlier mapping.
                continue
            pages[book_page] = {
                "lines": entry["lines"],
                "confidence": entry.get("confidence", 0),
                "pdfIndex": pdf_idx,
            }
    return pages


def merge_ocr(epub_lines: list, pdf_lines: list) -> list:
    """EPUB per-image OCR is our primary (targeted, no prose bleed).

    PDF page-level OCR is only used when EPUB is missing. Per-line accent
    repair is handled separately via `repair_accents_from_pdf`.
    """
    if epub_lines:
        return epub_lines
    return pdf_lines


def _strip_accents(s: str) -> str:
    """Remove combining marks (accents) via NFD decomposition."""
    return "".join(c for c in _u.normalize("NFD", s) if _u.category(c) != "Mn")


def _levenshtein(a: str, b: str) -> int:
    """Classic O(len(a)*len(b)) edit distance with a two-row table."""
    if a == b:
        return 0
    if not a:
        return len(b)
    if not b:
        return len(a)
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            cost = 0 if ca == cb else 1
            curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + cost))
        prev = curr
    return prev[-1]


def repair_accents_from_pdf(epub_lines: list, pdf_page_lines: list) -> "tuple[list, int]":
    """For each EPUB OCR line, find a near-match in the PDF page OCR and
    prefer the PDF version.

    Repairs include:
        1. exact accent/case differences (e.g. 'iglesia' vs 'Iglesia')
        2. single-character OCR errors (e.g. 'the hrother' -> 'the brother')
        3. two-character OCR errors when the target is long enough

    Returns (repaired_lines, repair_count).
    """
    if not epub_lines or not pdf_page_lines:
        return (epub_lines, 0)

    # Pre-normalize PDF lines for matching
    pdf_cleaned = [p.strip() for p in pdf_page_lines if p.strip()]
    pdf_by_stripped: dict = {}
    for p in pdf_cleaned:
        key = _strip_accents(p.lower())
        pdf_by_stripped.setdefault(key, p)

    out: list = []
    repairs = 0
    for e in epub_lines:
        e_stripped = e.strip()
        e_key = _strip_accents(e_stripped.lower())
        # Pass 1: exact accent-only difference
        if e_key and e_key in pdf_by_stripped and pdf_by_stripped[e_key] != e_stripped:
            out.append(pdf_by_stripped[e_key])
            repairs += 1
            continue
        # Pass 2: fuzzy — find best PDF line within edit distance 1 or 2
        if len(e_key) >= 4:
            max_distance = 1 if len(e_key) < 10 else 2
            best_match = None
            best_d = max_distance + 1
            for p in pdf_cleaned:
                p_key = _strip_accents(p.lower())
                # Only match lines of similar length
                if abs(len(p_key) - len(e_key)) > max_distance:
                    continue
                d = _levenshtein(e_key, p_key)
                if d < best_d:
                    best_d = d
                    best_match = p
                    if d == 0:
                        break
            if best_match and best_match != e_stripped and best_d <= max_distance:
                out.append(best_match)
                repairs += 1
                continue
        out.append(e)
    return (out, repairs)


def vocab_lines_from_pdf_page(
    pdf_page_entry: dict, epub_narrative_lines: set
) -> list:
    """Extract likely vocab-table lines from a PDF page's OCR by filtering out
    narrative-looking lines (long sentences) and already-known EPUB content."""
    lines = pdf_page_entry.get("lines", [])
    out: list = []
    for raw in lines:
        line = raw.strip()
        if not line:
            continue
        # Skip lines that look like body prose (too long)
        if len(line) > 80:
            continue
        # Skip narrative we already captured in the EPUB
        if line in epub_narrative_lines:
            continue
        # Skip page-number-only lines
        if re.fullmatch(r"\d{1,4}", line):
            continue
        # Skip standalone chapter headers (e.g. "Nouns, Articles, and Adjectives")
        # NOTE(review): no header filter is implemented yet — the comment above
        # describes an intended filter; confirm whether it is still wanted.
        out.append(line)
    return out


def main() -> None:
    chapters_data = load(CHAPTERS_JSON)
    answers = load(ANSWERS_JSON)["answers"]
    epub_ocr = load(OCR_JSON)
    pdf_ocr_raw = load(PDF_OCR_JSON) if PDF_OCR_JSON.exists() else {}
    pdf_pages = build_pdf_page_index(pdf_ocr_raw) if pdf_ocr_raw else {}
    print(f"Mapped {len(pdf_pages)} PDF pages to book page numbers")

    # Build a global set of EPUB narrative lines (for subtraction when pulling vocab)
    narrative_set = set()
    for ch in chapters_data["chapters"]:
        for b in ch["blocks"]:
            if b["kind"] == "paragraph" and b.get("text"):
                narrative_set.add(b["text"].strip())

    book_chapters = []
    all_vocab_cards = []
    pdf_hits = 0
    pdf_misses = 0
    merged_pages = 0

    for ch in chapters_data["chapters"]:
        out_blocks = []
        current_section_title = ch["title"]
        for bi, block in enumerate(ch["blocks"]):
            k = block["kind"]
            if k == "heading":
                current_section_title = block["text"]
                out_blocks.append(block)
                continue
            if k == "paragraph":
                out_blocks.append(block)
                continue
            if k == "key_vocab_header":
                out_blocks.append(block)
                continue

            if k == "vocab_image":
                src = block["src"]
                epub_entry = epub_ocr.get(src)
                epub_lines = epub_entry.get("lines", []) if epub_entry else []
                epub_conf = epub_entry.get("confidence", 0.0) if epub_entry else 0.0
                book_page = extract_book_page(src)
                # `is not None` rather than truthiness: page 0 is a valid key.
                pdf_entry = pdf_pages.get(book_page) if book_page is not None else None
                pdf_lines = pdf_entry["lines"] if pdf_entry else []
                # Primary: EPUB per-image OCR. Supplementary: PDF page OCR
                # used only for accent/diacritic repair where keys match.
                if pdf_lines:
                    pdf_hits += 1
                else:
                    pdf_misses += 1
                repaired_lines, repairs = repair_accents_from_pdf(epub_lines, pdf_lines)
                merged_lines = repaired_lines if repaired_lines else pdf_lines
                merged_conf = max(
                    epub_conf,
                    pdf_entry.get("confidence", 0) if pdf_entry else 0.0,
                )
                if repairs > 0:
                    merged_pages += 1
                derived = build_vocab_cards_for_block(
                    {"src": src},
                    {"lines": merged_lines, "confidence": merged_conf},
                    ch,
                    current_section_title,
                    bi,
                )
                all_vocab_cards.extend(derived)
                out_blocks.append({
                    "kind": "vocab_table",
                    "sourceImage": src,
                    "ocrLines": merged_lines,
                    "ocrConfidence": merged_conf,
                    "cardCount": len(derived),
                    "source": "pdf-repaired" if repairs > 0 else ("epub" if epub_lines else "pdf"),
                    "bookPage": book_page,
                    "repairs": repairs,
                })
                continue

            if k == "exercise":
                ans = answers.get(block["id"])
                # EPUB image OCR (if any image refs), supplemented with the
                # matching PDF page's OCR for each referenced image.
                # BUG FIX: the PDF lookup previously ran once AFTER this loop
                # using the leftover `src`, so only the last image ref (or a
                # stale `src` from an earlier vocab_image block) contributed
                # PDF lines. It now runs per image ref, inside the loop.
                image_ocr_lines: list = []
                for img_src in block.get("image_refs", []):
                    ee = epub_ocr.get(img_src)
                    if ee:
                        image_ocr_lines.extend(ee.get("lines", []))
                    # Add PDF-page OCR for that page if available
                    bp = extract_book_page(img_src)
                    if bp and pdf_pages.get(bp):
                        # Only add lines not already present from EPUB OCR
                        pdf_lines = pdf_pages[bp]["lines"]
                        for line in pdf_lines:
                            line = line.strip()
                            if not line or line in image_ocr_lines:
                                continue
                            if line in narrative_set:
                                continue
                            image_ocr_lines.append(line)

                prompts = [p for p in block.get("prompts", []) if p.strip()]
                extras = [e for e in block.get("extra", []) if e.strip()]
                if not prompts and image_ocr_lines:
                    # Extract numbered lines from OCR
                    for line in image_ocr_lines:
                        m = re.match(r"^(\d+)[.)]\s*(.+)", line.strip())
                        if m:
                            prompts.append(f"{m.group(1)}. {m.group(2)}")

                sub = ans["subparts"] if ans else []
                answer_items = []
                for sp in sub:
                    for it in sp["items"]:
                        answer_items.append({
                            "label": sp["label"],
                            "number": it["number"],
                            "answer": it["answer"],
                            "alternates": it["alternates"],
                        })
                out_blocks.append({
                    "kind": "exercise",
                    "id": block["id"],
                    "ansAnchor": block.get("ans_anchor", ""),
                    "instruction": clean_instruction(block.get("instruction", "")),
                    "extra": extras,
                    "prompts": prompts,
                    "ocrLines": image_ocr_lines,
                    "freeform": ans["freeform"] if ans else False,
                    "answerItems": answer_items,
                    "answerRaw": ans["raw"] if ans else "",
                    "answerSubparts": sub,
                })
                continue

            out_blocks.append(block)

        book_chapters.append({
            "id": ch["id"],
            "number": ch["number"],
            "title": ch["title"],
            "part": ch.get("part"),
            "blocks": out_blocks,
        })

    book = {
        "courseName": COURSE_NAME,
        "totalChapters": len(book_chapters),
        "totalExercises": sum(
            1 for ch in book_chapters for b in ch["blocks"] if b["kind"] == "exercise"
        ),
        "totalVocabTables": sum(
            1 for ch in book_chapters for b in ch["blocks"] if b["kind"] == "vocab_table"
        ),
        "totalVocabCards": len(all_vocab_cards),
        "parts": chapters_data.get("part_memberships", {}),
        "chapters": book_chapters,
        "sources": {
            "epub_images_ocr": bool(epub_ocr),
            "pdf_pages_ocr": bool(pdf_ocr_raw),
            "pdf_pages_mapped": len(pdf_pages),
        },
    }
    OUT_BOOK.write_text(json.dumps(book, ensure_ascii=False))

    vocab_by_chapter: dict = {}
    for card in all_vocab_cards:
        vocab_by_chapter.setdefault(card["chapter"], []).append(card)
    OUT_VOCAB.write_text(json.dumps({
        "courseName": COURSE_NAME,
        "chapters": [
            {"chapter": n, "cards": cs} for n, cs in sorted(vocab_by_chapter.items())
        ],
    }, ensure_ascii=False, indent=2))

    print(f"Wrote {OUT_BOOK}")
    print(f"Wrote {OUT_VOCAB}")
    print(f"Chapters: {book['totalChapters']}")
    print(f"Exercises: {book['totalExercises']}")
    print(f"Vocab tables: {book['totalVocabTables']}")
    print(f"Vocab cards (derived): {book['totalVocabCards']}")
    print(f"PDF hits vs misses: {pdf_hits} / {pdf_misses}")


if __name__ == "__main__":
    main()