Spanish/Conjuga/Scripts/books/extract_epub.py

#!/usr/bin/env python3
"""Parse an EPUB into chapters.json for the in-app Books feature.

Usage:
    python3 extract_epub.py <epub_path> [--slug SLUG] [--out OUT_DIR]

Defaults:
    SLUG    derived from the book title in the EPUB metadata (ASCII-folded, lowercased, dashed)
    OUT_DIR ./build/<slug>

Output:
    OUT_DIR/chapters.json
        {
          "title": "...",
          "author": "...",
          "language": "...",
          "slug": "...",
          "chapters": [
            {"id": "ch1", "number": 1, "title": "Preface",
             "paragraphsES": ["...", "..."]},
            ...
          ]
        }

How chapter grouping works:
    1. Read content.opf manifest (id -> href) and spine (ordered idrefs).
    2. Read toc.ncx navMap to get the ordered list of chapter (title, first-href).
    3. For each chapter, claim every spine file from its first href up to (but
       not including) the next chapter's first href.
    4. For each file in the chapter's range, parse <p> elements, strip tags,
       normalise whitespace + smart quotes, drop empties.
"""

from __future__ import annotations

import argparse
import json
import posixpath
import re
import sys
import unicodedata
import warnings
import zipfile
from pathlib import Path
from typing import Iterable
from urllib.parse import unquote
from xml.etree import ElementTree as ET

from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning

# Suppress bs4's XMLParsedAsHTMLWarning process-wide. _extract_paragraphs feeds
# the EPUB content files to BeautifulSoup's "lxml" parser; presumably their
# XHTML markup is what would otherwise trigger this warning on every file.
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)


# Prefix -> namespace-URI map handed to ElementTree's find/findall/findtext so
# the queries in this file can be written as "opf:...", "dc:...", "ncx:...".
NS = {
    "opf": "http://www.idpf.org/2007/opf",
    "dc": "http://purl.org/dc/elements/1.1/",
    "ncx": "http://www.daisy.org/z3986/2005/ncx/",
    "xhtml": "http://www.w3.org/1999/xhtml",
}


def _slugify(s: str) -> str:
    s = unicodedata.normalize("NFKD", s).encode("ascii", "ignore").decode("ascii")
    s = re.sub(r"[^a-zA-Z0-9]+", "-", s).strip("-").lower()
    return s or "book"


def _normalise(text: str) -> str:
    # Collapse runs of whitespace, normalise smart quotes to plain ones.
    text = text.replace(" ", " ")
    text = re.sub(r"\s+", " ", text).strip()
    text = re.sub(r"\s+([.,;:!?…])", r"\1", text)
    text = re.sub(r"([¡¿])\s+", r"\1", text)
    return text


def _read_zip_text(zf: zipfile.ZipFile, path: str) -> str:
    return zf.read(path).decode("utf-8")


def _container_root(zf: zipfile.ZipFile) -> str:
    """Return the "full-path" attribute of the rootfile in META-INF/container.xml.

    main() treats the returned value as the archive path of the OPF document.

    Raises:
        RuntimeError: container.xml contains no <rootfile> element.
        KeyError: container.xml is missing from the archive, or the
            <rootfile> element has no "full-path" attribute.
    """
    xml_text = _read_zip_text(zf, "META-INF/container.xml")
    tree = ET.fromstring(xml_text)
    entry = tree.find(".//{urn:oasis:names:tc:opendocument:xmlns:container}rootfile")
    if entry is not None:
        return entry.attrib["full-path"]
    raise RuntimeError("Missing rootfile entry in META-INF/container.xml")


def _parse_opf(zf: zipfile.ZipFile, opf_path: str):
    """Parse the OPF package document and return what main() needs from it.

    Args:
        zf: The open EPUB archive.
        opf_path: Archive path of the OPF file (forward-slash separated).

    Returns:
        A dict with these keys:
          "title", "author", "language": text of the first dc:title /
              dc:creator / dc:language element, stripped; "" when absent.
          "manifest": dict mapping each manifest item id to its href, exactly
              as written in the OPF.
          "spine": list of itemref idrefs in document order.
          "ncx_href": manifest href of the item named by the spine's "toc"
              attribute, or None when there is no spine, no "toc" attribute,
              or no manifest item with that id.
          "opf_dir": directory part of *opf_path* ("" when the OPF sits at
              the archive root).

    Raises:
        KeyError: *opf_path* is not in the archive, or a manifest item /
            spine itemref lacks its "id" / "href" / "idref" attribute.
        xml.etree.ElementTree.ParseError: the OPF is not well-formed XML.
    """
    root = ET.fromstring(_read_zip_text(zf, opf_path))

    def _metadata(tag: str) -> str:
        # findtext yields None for an element that exists but has no text.
        found = root.findtext(f".//dc:{tag}", default="", namespaces=NS)
        return (found or "").strip()

    manifest: dict[str, str] = {
        item.attrib["id"]: item.attrib["href"]
        for item in root.findall("opf:manifest/opf:item", NS)
    }
    spine: list[str] = [
        itemref.attrib["idref"]
        for itemref in root.findall("opf:spine/opf:itemref", NS)
    ]

    spine_el = root.find("opf:spine", NS)
    ncx_id = spine_el.attrib.get("toc") if spine_el is not None else None
    ncx_href = manifest.get(ncx_id) if ncx_id else None

    # Zip member names always use "/" regardless of platform, so take the
    # directory part with string operations. Going through pathlib.Path would
    # produce backslashes on Windows for nested directories and the resulting
    # member names would never be found in the archive.
    opf_dir = opf_path.rpartition("/")[0].rstrip("/")

    return {
        "title": _metadata("title"),
        "author": _metadata("creator"),
        "language": _metadata("language"),
        "manifest": manifest,
        "spine": spine,
        "ncx_href": ncx_href,
        "opf_dir": opf_dir,
    }


def _parse_ncx(zf: zipfile.ZipFile, ncx_path: str) -> list[dict]:
    """Read the NCX at *ncx_path* and list its top-level navPoints in order.

    Only navPoints that are direct children of navMap are returned; nested
    navPoints are not visited.

    Returns:
        One dict per navPoint with:
          "title": the navLabel text, stripped ("" when absent).
          "href": the content "src" with any "#fragment" removed ("" when
              the navPoint has no content element or no src attribute).
    """
    ncx_root = ET.fromstring(_read_zip_text(zf, ncx_path))

    def _entry(point) -> dict:
        label = point.findtext("ncx:navLabel/ncx:text", default="", namespaces=NS) or ""
        target = point.find("ncx:content", NS)
        raw_src = "" if target is None else target.attrib.get("src", "")
        # Chapter grouping works on whole files, so the anchor is discarded.
        return {"title": label.strip(), "href": raw_src.partition("#")[0]}

    return [_entry(point) for point in ncx_root.findall("ncx:navMap/ncx:navPoint", NS)]


def _resolve_zip_path(base_dir: str, href: str) -> str:
    if not base_dir:
        return href
    return f"{base_dir}/{href}".lstrip("/")


def _extract_paragraphs(zf: zipfile.ZipFile, zip_path: str) -> list[str]:
    """Return the normalised text of every non-empty <p> in one archive file.

    The file is parsed with BeautifulSoup's "lxml" parser. For each <p>, its
    text nodes are joined with single spaces and passed through _normalise;
    paragraphs that end up empty are skipped. No title/heading filtering
    happens here.

    Returns:
        Paragraph strings in document order, or [] when *zip_path* is not a
        member of the archive.
    """
    try:
        markup = _read_zip_text(zf, zip_path)
    except KeyError:
        # The archive has no member with this name: contribute nothing.
        return []
    document = BeautifulSoup(markup, "lxml")
    cleaned = (
        _normalise(tag.get_text(" ", strip=True)) for tag in document.find_all("p")
    )
    # Empty results come from <p> elements that carry no visible text.
    return [text for text in cleaned if text]


def _chapter_files(
    spine_files: list[str], chapter_hrefs: list[str]
) -> list[list[str]]:
    """Slice the spine into one list of files per chapter, using the chapter's
    first href as the chapter boundary. Files before the first chapter (e.g.
    cover, titlepage) are dropped."""
    boundaries: list[int] = []
    for href in chapter_hrefs:
        try:
            idx = spine_files.index(href)
        except ValueError:
            boundaries.append(-1)
            continue
        boundaries.append(idx)

    ranges: list[list[str]] = []
    for i, start in enumerate(boundaries):
        if start < 0:
            ranges.append([])
            continue
        end = len(spine_files)
        for next_start in boundaries[i + 1:]:
            if next_start >= 0:
                end = next_start
                break
        ranges.append(spine_files[start:end])
    return ranges


def main() -> None:
    """Command-line entry point: convert one EPUB into <out>/chapters.json.

    Reads the OPF and NCX from the archive, groups the spine files into
    chapters by NCX entry, extracts each chapter's paragraphs, writes the
    JSON payload and prints a per-chapter summary to stdout.

    Exits with status 2 when the EPUB path does not exist and with status 3
    when the OPF spine does not name an NCX document.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("epub", type=Path)
    parser.add_argument("--slug", default=None)
    parser.add_argument("--out", type=Path, default=None)
    args = parser.parse_args()

    if not args.epub.exists():
        print(f"EPUB not found: {args.epub}", file=sys.stderr)
        sys.exit(2)

    with zipfile.ZipFile(args.epub) as zf:
        opf_path = _container_root(zf)
        opf = _parse_opf(zf, opf_path)

        if not opf["ncx_href"]:
            print("No NCX found in spine; cannot derive chapter structure.", file=sys.stderr)
            sys.exit(3)

        ncx_path = _resolve_zip_path(opf["opf_dir"], opf["ncx_href"])
        toc = _parse_ncx(zf, ncx_path)

        # Manifest hrefs are relative to the OPF document.
        spine_files = [
            _resolve_zip_path(opf["opf_dir"], opf["manifest"].get(idref, ""))
            for idref in opf["spine"]
        ]
        # NCX content hrefs are relative to the NCX document itself, which is
        # not required to sit in the same directory as the OPF.
        ncx_dir = ncx_path.rpartition("/")[0]
        chapter_hrefs = [_resolve_zip_path(ncx_dir, c["href"]) for c in toc]
        chapter_file_ranges = _chapter_files(spine_files, chapter_hrefs)

        chapters_out: list[dict] = []
        for i, (meta, files) in enumerate(zip(toc, chapter_file_ranges), start=1):
            paragraphs: list[str] = []
            for f in files:
                paragraphs.extend(_extract_paragraphs(zf, f))
            # Drop leading paragraph(s) that just echo the chapter title — the
            # title is already stored separately.
            title_norm = _normalise(meta["title"]).lower()
            while paragraphs and _normalise(paragraphs[0]).lower() == title_norm:
                paragraphs.pop(0)
            chapters_out.append(
                {
                    "id": f"ch{i}",
                    "number": i,
                    "title": meta["title"],
                    "paragraphsES": paragraphs,
                }
            )

    # _slugify never returns an empty string (it falls back to "book"), so the
    # filename fallback has to be chosen before slugifying, not after.
    slug = args.slug or _slugify(opf["title"] or args.epub.stem)
    out_dir = args.out or (Path("build") / slug)
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / "chapters.json"

    payload = {
        "title": opf["title"],
        "author": opf["author"],
        "language": opf["language"],
        "slug": slug,
        "chapters": chapters_out,
    }
    out_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")

    total_paragraphs = sum(len(c["paragraphsES"]) for c in chapters_out)
    print(f"Wrote {out_path}")
    print(f"  Title:      {opf['title']}")
    print(f"  Author:     {opf['author']}")
    print(f"  Chapters:   {len(chapters_out)}")
    print(f"  Paragraphs: {total_paragraphs}")
    for ch in chapters_out:
        print(f"    ch{ch['number']:02d}  {len(ch['paragraphsES']):4d} ¶  {ch['title']}")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()