#!/usr/bin/env python3
"""Extract clean text from the PDF source and map each PDF page to the
book's printed page number.

Output: pdf_text.json
  {
    "pdfPageCount": 806,
    "bookPages": {
      "3": { "text": "...", "pdfIndex": 29 },
      "4": { ... },
      ...
    },
    "unmapped": [list of pdfIndex values with no detectable book page number],
    "inferredPages": <count of pages whose number was inferred as previous + 1>
  }
"""

import json
import re
from pathlib import Path
from typing import Iterable

try:
    import pypdf
except ImportError:
    # Keep the pure text helpers importable without pypdf; main() reports the
    # missing dependency before it tries to open the PDF.
    pypdf = None

HERE = Path(__file__).resolve().parent


def _locate_pdf() -> "Path | None":
    """Return the source PDF three directories above this file, or None."""
    try:
        project_root = Path(__file__).resolve().parents[3]
    except IndexError:
        # The script sits fewer than four levels deep: nowhere to look.
        return None
    # Sort so the choice is deterministic when several files match.
    matches = sorted(project_root.glob("Complete Spanish Step-By-Step*.pdf"))
    return matches[0] if matches else None


PDF = _locate_pdf()
OUT = HERE / "pdf_text.json"

# Matches a line made only of Roman-numeral letters. It is not referenced by
# the detection below, so pages numbered in Roman are not recognised.
ROMAN_RE = re.compile(r"^[ivxlcdmIVXLCDM]+$")

# Match a page number on its own line at top/bottom of the page.
# The book uses Arabic numerals for main chapters (e.g., "3") and Roman for
# front matter.
PAGE_NUM_LINE_RE = re.compile(r"^\s*(\d{1,4})\s*$", re.MULTILINE)


def _split_page_number(text: str) -> "tuple[int | None, str]":
    """Separate the printed page number from the rest of a page's text.

    Only the first two and the last two non-blank lines are considered.

    Returns:
        ``(page_number, cleaned_text)``. When a number is found, only the line
        that carried it is removed; other numeric lines on the page are kept.
        When none is found, ``page_number`` is None and the text is returned
        stripped but otherwise untouched.
    """
    raw_lines = text.splitlines(keepends=True)
    non_blank = [
        (pos, line.strip()) for pos, line in enumerate(raw_lines) if line.strip()
    ]
    for pos, candidate in non_blank[:2] + non_blank[-2:]:
        match = PAGE_NUM_LINE_RE.fullmatch(candidate)
        if match:
            del raw_lines[pos]
            return int(match.group(1)), "".join(raw_lines).strip()
    return None, text.strip()


def detect_book_page(text: str) -> "int | None":
    """Find the printed page number from standalone page-number lines at the
    top or bottom of a page.

    Returns the number as an int, or None when none of the first two / last
    two non-blank lines consists solely of 1-4 digits.
    """
    return _split_page_number(text)[0]


def map_book_pages(page_texts: "Iterable[str]") -> "tuple[dict, list, int]":
    """Map extracted page texts (in PDF order) to printed book page numbers.

    Args:
        page_texts: The extracted text of each PDF page, in order.

    Returns:
        ``(by_book_page, unmapped, inferred_count)`` where ``by_book_page``
        maps ``str(book_page)`` to ``{"text": ..., "pdfIndex": ...}``,
        ``unmapped`` lists PDF indices seen before any page number was known,
        and ``inferred_count`` counts pages numbered as "previous + 1".
    """
    by_book_page: dict = {}
    unmapped: list = []
    inferred_count = 0
    last_seen = None

    for pdf_index, text in enumerate(page_texts):
        book_page, cleaned = _split_page_number(text)
        if book_page is None:
            if last_seen is None:
                # No anchor yet, so there is nothing to count forward from.
                unmapped.append(pdf_index)
                continue
            # Carry forward sequence: if we saw page N last, assume N+1.
            book_page = last_seen + 1
            inferred_count += 1
        last_seen = book_page
        # A later PDF page resolving to an already-used number overwrites the
        # earlier entry.
        by_book_page[str(book_page)] = {"text": cleaned, "pdfIndex": pdf_index}

    return by_book_page, unmapped, inferred_count


def main() -> None:
    """Read the PDF, map its pages to book page numbers, and write OUT."""
    if pypdf is None:
        raise SystemExit("pypdf is required to read the PDF (pip install pypdf)")
    if PDF is None:
        print("No PDF found in project root")
        return

    print(f"Reading {PDF.name}")
    reader = pypdf.PdfReader(str(PDF))
    pages = reader.pages
    print(f"PDF has {len(pages)} pages")

    by_book_page, unmapped, missed_count = map_book_pages(
        page.extract_text() or "" for page in pages
    )

    out = {
        "pdfPageCount": len(pages),
        "bookPages": by_book_page,
        "unmapped": unmapped,
        "inferredPages": missed_count,
    }
    # ensure_ascii=False emits raw non-ASCII characters, so pin the file
    # encoding instead of relying on the platform default.
    OUT.write_text(json.dumps(out, ensure_ascii=False), encoding="utf-8")
    print(
        f"Mapped {len(by_book_page)} book pages; {missed_count} inferred; "
        f"{len(unmapped)} unmapped"
    )
    print(f"Wrote {OUT}")


if __name__ == "__main__":
    main()