Spanish/Conjuga/Scripts/textbook/extract_pdf_text.py

#!/usr/bin/env python3
"""Extract clean text from the PDF source and map each PDF page to the
book's printed page number.

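Output: pdf_text.json
{
  "pdfPageCount": 806,
  "bookPages": {
    "3": { "text": "...", "pdfIndex": 29 },
    "4": { ... },
    ...
  },
  "unmapped": [pdfIndex values seen before the first detectable book page
               number; later undetected pages are inferred as previous + 1],
  "inferredPages": number of pages whose book page number was inferred
}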
"""

import json
import re
from pathlib import Path
import pypdf

# Directory that contains this script; the JSON output is written next to it.
HERE = Path(__file__).resolve().parent
# First file matching the glob in the directory at parents[3] of this script's
# resolved path, or None when nothing matches. Evaluated at import time.
PDF = next(
    Path(__file__).resolve().parents[3].glob("Complete Spanish Step-By-Step*.pdf"),
    None,
)
# Destination of the extracted page mapping.
OUT = HERE / "pdf_text.json"

# Matches a string made up solely of Roman-numeral letters (either case).
# NOTE(review): not referenced in the visible part of this script.
ROMAN_RE = re.compile(r"^[ivxlcdmIVXLCDM]+$")
# Matches any line (MULTILINE) that holds only a 1-4 digit Arabic number,
# optionally surrounded by whitespace, capturing the digits.
# The book uses Arabic numerals for main chapters (e.g., "3") and Roman for
# front matter; this pattern covers the Arabic form only.
# NOTE(review): not referenced in the visible part of this script.
PAGE_NUM_LINE_RE = re.compile(r"^\s*(\d{1,4})\s*$", re.MULTILINE)


def detect_book_page(text: str) -> "int | None":
    """Return the printed page number found at the edge of a page, if any.

    Blank lines are ignored. Only the first two and the last two remaining
    lines are inspected, in that order; the first one consisting solely of
    one to four digits wins. Returns ``None`` when no such line exists.
    """
    trimmed = (raw.strip() for raw in text.splitlines())
    content = [line for line in trimmed if line]
    # Page numbers sit in the header or footer, so only the edges are checked.
    edge_lines = content[:2] + content[-2:]
    hits = (re.match(r"^(\d{1,4})$", line) for line in edge_lines)
    first_hit = next((hit for hit in hits if hit is not None), None)
    if first_hit is None:
        return None
    return int(first_hit.group(1))


def main() -> None:
    """Extract the text of every PDF page and write the page mapping to OUT.

    Pages are processed in PDF order. For each page the printed book page
    number is detected with ``detect_book_page``. When detection fails, the
    page is assumed to follow the previous one (``last_seen + 1``) and is
    counted as inferred; if no number has been seen yet, the PDF index is
    recorded in ``unmapped`` and the page is skipped. When two PDF pages
    resolve to the same book page, the later one replaces the earlier.

    The result is written to ``OUT`` as UTF-8 JSON with the keys
    ``pdfPageCount``, ``bookPages``, ``unmapped`` and ``inferredPages``.
    Progress and a summary are printed to stdout. If no PDF was located
    (``PDF`` is ``None``), a message is printed and nothing is written.
    """
    if PDF is None:
        print("No PDF found in project root")
        return

    print(f"Reading {PDF.name}")
    reader = pypdf.PdfReader(str(PDF))
    pages = reader.pages
    print(f"PDF has {len(pages)} pages")

    # Compiled once up front rather than re-resolved for every page.
    standalone_number_line = re.compile(r"(?m)^\s*\d{1,4}\s*$")

    by_book_page: dict = {}
    unmapped: list = []
    last_seen: "int | None" = None
    missed_count = 0

    for i, page in enumerate(pages):
        # extract_text() may yield nothing; normalise to an empty string.
        text = page.extract_text() or ""
        book_page = detect_book_page(text)

        if book_page is None:
            if last_seen is None:
                # Nothing to anchor the sequence on yet.
                unmapped.append(i)
                continue
            # Carry forward sequence: if we saw page N last, assume N+1.
            book_page = last_seen + 1
            missed_count += 1
        last_seen = book_page

        # Removes every line holding only a 1-4 digit number (not just the
        # detected page number), then trims surrounding whitespace.
        cleaned = standalone_number_line.sub("", text).strip()
        by_book_page[str(book_page)] = {
            "text": cleaned,
            "pdfIndex": i,
        }

    out = {
        "pdfPageCount": len(pages),
        "bookPages": by_book_page,
        "unmapped": unmapped,
        "inferredPages": missed_count,
    }
    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # must be pinned to UTF-8; the platform default may be a legacy code page
    # that cannot encode them or produces non-UTF-8 JSON.
    OUT.write_text(json.dumps(out, ensure_ascii=False), encoding="utf-8")
    print(f"Mapped {len(by_book_page)} book pages; {missed_count} inferred; {len(unmapped)} unmapped")
    print(f"Wrote {OUT}")


# Run the extraction only when this file is executed as a script, so that
# importing the module does not trigger it.
if __name__ == "__main__":
    main()