Spanish/Conjuga/Scripts/textbook/merge_pdf_into_book.py

#!/usr/bin/env python3
"""Second-pass extractor: use PDF OCR (from ocr_pdf.swift) as a supplementary
source of clean text, then re-build book.json with PDF-derived content where it
improves on the EPUB's image-based extraction.

Inputs:
  chapters.json  — EPUB structural extraction (narrative text + exercise prompts + image refs)
  answers.json   — EPUB answer key
  ocr.json       — EPUB image OCR (first pass)
  pdf_ocr.json   — PDF page-level OCR (this pass, higher DPI + cleaner)

Outputs:
  book.json         — merged book used by the app
  vocab_cards.json  — derived vocabulary flashcards
"""

import json
import re
import sys
from pathlib import Path

HERE = Path(__file__).resolve().parent
sys.path.insert(0, str(HERE))
from build_book import (  # reuse the helpers defined in build_book.py
    COURSE_NAME,
    build_vocab_cards_for_block,
    clean_instruction,
    classify_line,
    load,
)

CHAPTERS_JSON = HERE / "chapters.json"
ANSWERS_JSON = HERE / "answers.json"
OCR_JSON = HERE / "ocr.json"
PDF_OCR_JSON = HERE / "pdf_ocr.json"
OUT_BOOK = HERE / "book.json"
OUT_VOCAB = HERE / "vocab_cards.json"

IMAGE_NAME_RE = re.compile(r"^f(\d{4})-(\d{2})\.jpg$")


def extract_book_page(image_src: str) -> "int | None":
    m = IMAGE_NAME_RE.match(image_src)
    return int(m.group(1)) if m else None


def build_pdf_page_index(pdf_ocr: dict) -> "dict[int, dict]":
    """Map bookPage → {lines, confidence, pdfIndex}.

    Strategy: use chapter-start alignments as anchors. For each chapter N,
    anchor[N] = (pdf_idx_where_chapter_starts, book_page_where_chapter_starts).
    Between anchors we interpolate page-by-page (pages run sequentially within
    a chapter in this textbook's layout).
    """
    pages: "dict[int, dict]" = {}
    sorted_keys = sorted(pdf_ocr.keys(), key=lambda k: int(k))

    # --- Detect chapter starts in the PDF OCR ---
    pdf_ch_start: "dict[int, int]" = {}
    for k in sorted_keys:
        entry = pdf_ocr[k]
        lines = entry.get("lines", [])
        if len(lines) < 2:
            continue
        first = lines[0].strip()
        second = lines[1].strip()
        if first.isdigit() and 1 <= int(first) <= 30 and len(second) > 5 and second[0:1].isupper():
            ch = int(first)
            if ch not in pdf_ch_start:
                pdf_ch_start[ch] = int(k)

    # --- Load EPUB's authoritative book-page starts ---
    import re as _re
    from bs4 import BeautifulSoup as _BS
    epub_root = HERE.parents[2] / "epub_extract" / "OEBPS"
    book_ch_start: "dict[int, int]" = {}
    for ch in sorted(pdf_ch_start.keys()):
        p = epub_root / f"ch{ch}.xhtml"
        if not p.exists():
            continue
        soup = _BS(p.read_text(encoding="utf-8"), "lxml")
        for span in soup.find_all(True):
            id_ = span.get("id", "") or ""
            m = _re.match(r"page_(\d+)$", id_)
            if m:
                book_ch_start[ch] = int(m.group(1))
                break

    # Build per-chapter (pdf_anchor, book_anchor, next_pdf_anchor) intervals
    anchors = []  # list of (ch, pdf_start, book_start)
    for ch in sorted(pdf_ch_start.keys()):
        if ch in book_ch_start:
            anchors.append((ch, pdf_ch_start[ch], book_ch_start[ch]))

    for i, (ch, pdf_s, book_s) in enumerate(anchors):
        next_pdf = anchors[i + 1][1] if i + 1 < len(anchors) else pdf_s + 50
        # Interpolate book page for each pdf index in [pdf_s, next_pdf)
        for pdf_idx in range(pdf_s, next_pdf):
            book_page = book_s + (pdf_idx - pdf_s)
            entry = pdf_ocr.get(str(pdf_idx))
            if entry is None:
                continue
            if book_page in pages:
                continue
            pages[book_page] = {
                "lines": entry["lines"],
                "confidence": entry.get("confidence", 0),
                "pdfIndex": pdf_idx,
            }
    return pages


def merge_ocr(epub_lines: list, pdf_lines: list) -> list:
    """EPUB per-image OCR is our primary (targeted, no prose bleed). PDF
    page-level OCR is only used when EPUB is missing. Per-line accent repair
    is handled separately via `repair_accents_from_pdf`.
    """
    if epub_lines:
        return epub_lines
    return pdf_lines


import unicodedata as _u

def _strip_accents(s: str) -> str:
    return "".join(c for c in _u.normalize("NFD", s) if _u.category(c) != "Mn")


def _levenshtein(a: str, b: str) -> int:
    if a == b: return 0
    if not a: return len(b)
    if not b: return len(a)
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            cost = 0 if ca == cb else 1
            curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + cost))
        prev = curr
    return prev[-1]


def repair_accents_from_pdf(epub_lines: list, pdf_page_lines: list) -> "tuple[list, int]":
    """For each EPUB OCR line, find a near-match in the PDF page OCR and
    prefer the PDF version. Repairs include:
      1. exact accent/case differences (e.g. 'iglesia' vs 'Iglesia')
      2. single-character OCR errors (e.g. 'the hrother' -> 'the brother')
      3. two-character OCR errors when the target is long enough
    """
    if not epub_lines or not pdf_page_lines:
        return (epub_lines, 0)
    # Pre-normalize PDF lines for matching
    pdf_cleaned = [p.strip() for p in pdf_page_lines if p.strip()]
    pdf_by_stripped: dict = {}
    for p in pdf_cleaned:
        key = _strip_accents(p.lower())
        pdf_by_stripped.setdefault(key, p)

    out: list = []
    repairs = 0
    for e in epub_lines:
        e_stripped = e.strip()
        e_key = _strip_accents(e_stripped.lower())
        # Pass 1: exact accent-only difference
        if e_key and e_key in pdf_by_stripped and pdf_by_stripped[e_key] != e_stripped:
            out.append(pdf_by_stripped[e_key])
            repairs += 1
            continue
        # Pass 2: fuzzy — find best PDF line within edit distance 1 or 2
        if len(e_key) >= 4:
            max_distance = 1 if len(e_key) < 10 else 2
            best_match = None
            best_d = max_distance + 1
            for p in pdf_cleaned:
                p_key = _strip_accents(p.lower())
                # Only match lines of similar length
                if abs(len(p_key) - len(e_key)) > max_distance:
                    continue
                d = _levenshtein(e_key, p_key)
                if d < best_d:
                    best_d = d
                    best_match = p
                    if d == 0:
                        break
            if best_match and best_match != e_stripped and best_d <= max_distance:
                out.append(best_match)
                repairs += 1
                continue
        out.append(e)
    return (out, repairs)


def vocab_lines_from_pdf_page(
    pdf_page_entry: dict,
    epub_narrative_lines: set
) -> list:
    """Extract likely vocab-table lines from a PDF page's OCR by filtering out
    narrative-looking lines (long sentences) and already-known EPUB content."""
    lines = pdf_page_entry.get("lines", [])
    out: list = []
    for raw in lines:
        line = raw.strip()
        if not line:
            continue
        # Skip lines that look like body prose (too long)
        if len(line) > 80:
            continue
        # Skip narrative we already captured in the EPUB
        if line in epub_narrative_lines:
            continue
        # Skip page-number-only lines
        if re.fullmatch(r"\d{1,4}", line):
            continue
        # Skip standalone chapter headers (e.g. "Nouns, Articles, and Adjectives")
        out.append(line)
    return out


def main() -> None:
    chapters_data = load(CHAPTERS_JSON)
    answers = load(ANSWERS_JSON)["answers"]
    epub_ocr = load(OCR_JSON)
    pdf_ocr_raw = load(PDF_OCR_JSON) if PDF_OCR_JSON.exists() else {}
    pdf_pages = build_pdf_page_index(pdf_ocr_raw) if pdf_ocr_raw else {}
    print(f"Mapped {len(pdf_pages)} PDF pages to book page numbers")

    # Build a global set of EPUB narrative lines (for subtraction when pulling vocab)
    narrative_set = set()
    for ch in chapters_data["chapters"]:
        for b in ch["blocks"]:
            if b["kind"] == "paragraph" and b.get("text"):
                narrative_set.add(b["text"].strip())

    book_chapters = []
    all_vocab_cards = []
    pdf_hits = 0
    pdf_misses = 0
    merged_pages = 0

    for ch in chapters_data["chapters"]:
        out_blocks = []
        current_section_title = ch["title"]

        for bi, block in enumerate(ch["blocks"]):
            k = block["kind"]

            if k == "heading":
                current_section_title = block["text"]
                out_blocks.append(block)
                continue

            if k == "paragraph":
                out_blocks.append(block)
                continue

            if k == "key_vocab_header":
                out_blocks.append(block)
                continue

            if k == "vocab_image":
                src = block["src"]
                epub_entry = epub_ocr.get(src)
                epub_lines = epub_entry.get("lines", []) if epub_entry else []
                epub_conf = epub_entry.get("confidence", 0.0) if epub_entry else 0.0

                book_page = extract_book_page(src)
                pdf_entry = pdf_pages.get(book_page) if book_page else None
                pdf_lines = pdf_entry["lines"] if pdf_entry else []

                # Primary: EPUB per-image OCR. Supplementary: PDF page OCR
                # used only for accent/diacritic repair where keys match.
                if pdf_lines:
                    pdf_hits += 1
                else:
                    pdf_misses += 1
                repaired_lines, repairs = repair_accents_from_pdf(epub_lines, pdf_lines)
                merged_lines = repaired_lines if repaired_lines else pdf_lines
                merged_conf = max(epub_conf, pdf_entry.get("confidence", 0) if pdf_entry else 0.0)
                if repairs > 0:
                    merged_pages += 1

                derived = build_vocab_cards_for_block(
                    {"src": src},
                    {"lines": merged_lines, "confidence": merged_conf},
                    ch, current_section_title, bi
                )
                all_vocab_cards.extend(derived)
                out_blocks.append({
                    "kind": "vocab_table",
                    "sourceImage": src,
                    "ocrLines": merged_lines,
                    "ocrConfidence": merged_conf,
                    "cardCount": len(derived),
                    "source": "pdf-repaired" if repairs > 0 else ("epub" if epub_lines else "pdf"),
                    "bookPage": book_page,
                    "repairs": repairs,
                })
                continue

            if k == "exercise":
                ans = answers.get(block["id"])
                # EPUB image OCR (if any image refs)
                image_ocr_lines: list = []
                for src in block.get("image_refs", []):
                    ee = epub_ocr.get(src)
                    if ee:
                        image_ocr_lines.extend(ee.get("lines", []))
                    # Add PDF-page OCR for that page if available
                    bp = extract_book_page(src)
                    if bp and pdf_pages.get(bp):
                        # Only add lines not already present from EPUB OCR
                        pdf_lines = pdf_pages[bp]["lines"]
                        for line in pdf_lines:
                            line = line.strip()
                            if not line or line in image_ocr_lines:
                                continue
                            if line in narrative_set:
                                continue
                            image_ocr_lines.append(line)

                prompts = [p for p in block.get("prompts", []) if p.strip()]
                extras = [e for e in block.get("extra", []) if e.strip()]
                if not prompts and image_ocr_lines:
                    # Extract numbered lines from OCR
                    for line in image_ocr_lines:
                        m = re.match(r"^(\d+)[.)]\s*(.+)", line.strip())
                        if m:
                            prompts.append(f"{m.group(1)}. {m.group(2)}")

                sub = ans["subparts"] if ans else []
                answer_items = []
                for sp in sub:
                    for it in sp["items"]:
                        answer_items.append({
                            "label": sp["label"],
                            "number": it["number"],
                            "answer": it["answer"],
                            "alternates": it["alternates"],
                        })

                out_blocks.append({
                    "kind": "exercise",
                    "id": block["id"],
                    "ansAnchor": block.get("ans_anchor", ""),
                    "instruction": clean_instruction(block.get("instruction", "")),
                    "extra": extras,
                    "prompts": prompts,
                    "ocrLines": image_ocr_lines,
                    "freeform": ans["freeform"] if ans else False,
                    "answerItems": answer_items,
                    "answerRaw": ans["raw"] if ans else "",
                    "answerSubparts": sub,
                })
                continue

            out_blocks.append(block)

        book_chapters.append({
            "id": ch["id"],
            "number": ch["number"],
            "title": ch["title"],
            "part": ch.get("part"),
            "blocks": out_blocks,
        })

    book = {
        "courseName": COURSE_NAME,
        "totalChapters": len(book_chapters),
        "totalExercises": sum(1 for ch in book_chapters for b in ch["blocks"] if b["kind"] == "exercise"),
        "totalVocabTables": sum(1 for ch in book_chapters for b in ch["blocks"] if b["kind"] == "vocab_table"),
        "totalVocabCards": len(all_vocab_cards),
        "parts": chapters_data.get("part_memberships", {}),
        "chapters": book_chapters,
        "sources": {
            "epub_images_ocr": bool(epub_ocr),
            "pdf_pages_ocr": bool(pdf_ocr_raw),
            "pdf_pages_mapped": len(pdf_pages),
        },
    }
    OUT_BOOK.write_text(json.dumps(book, ensure_ascii=False))

    vocab_by_chapter: dict = {}
    for card in all_vocab_cards:
        vocab_by_chapter.setdefault(card["chapter"], []).append(card)
    OUT_VOCAB.write_text(json.dumps({
        "courseName": COURSE_NAME,
        "chapters": [
            {"chapter": n, "cards": cs}
            for n, cs in sorted(vocab_by_chapter.items())
        ],
    }, ensure_ascii=False, indent=2))

    print(f"Wrote {OUT_BOOK}")
    print(f"Wrote {OUT_VOCAB}")
    print(f"Chapters:               {book['totalChapters']}")
    print(f"Exercises:              {book['totalExercises']}")
    print(f"Vocab tables:           {book['totalVocabTables']}")
    print(f"Vocab cards (derived):  {book['totalVocabCards']}")
    print(f"PDF hits vs misses:     {pdf_hits} / {pdf_misses}")


if __name__ == "__main__":
    main()