#!/usr/bin/env python3
"""Parse an EPUB into chapters.json for the in-app Books feature.

Usage:
    python3 extract_epub.py <path-to-epub> [--slug SLUG] [--out OUT_DIR]

Defaults:
    SLUG     derived from the book title (ASCII, lowercased, dashed); falls
             back to the EPUB filename when the OPF carries no title
    OUT_DIR  ./build/<slug>/

Output:
    OUT_DIR/chapters.json
    {
      "title": "...", "author": "...", "language": "...", "slug": "...",
      "chapters": [
        {"id": "ch1", "number": 1, "title": "Preface",
         "paragraphsES": ["...", "..."]},
        ...
      ]
    }

How chapter grouping works:
    1. Read content.opf manifest (id -> href) and spine (ordered idrefs).
    2. Read toc.ncx navMap to get the ordered list of chapter
       (title, first-href).
    3. For each chapter, claim every spine file from its first href up to
       (but not including) the next chapter's first href.
    4. For each file in the chapter's range, parse <p> elements, strip tags,
       normalise whitespace and spacing around punctuation, drop empties.

Exit codes:
    2  EPUB file not found
    3  no NCX table of contents could be located
    4  the BeautifulSoup dependency is not installed
"""

from __future__ import annotations

import argparse
import json
import posixpath
import re
import sys
import unicodedata
import warnings
import zipfile
from pathlib import Path
from typing import Iterable
from urllib.parse import unquote
from xml.etree import ElementTree as ET

try:
    from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning
except ImportError:
    # Reported with an actionable message in main(); keeping the module
    # importable lets the pure-stdlib helpers be used on their own.
    BeautifulSoup = None
else:
    warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

NS = {
    "opf": "http://www.idpf.org/2007/opf",
    "dc": "http://purl.org/dc/elements/1.1/",
    "ncx": "http://www.daisy.org/z3986/2005/ncx/",
    "xhtml": "http://www.w3.org/1999/xhtml",
}

# Manifest media type that identifies the NCX table of contents.
_NCX_MEDIA_TYPE = "application/x-dtbncx+xml"


def _slugify(s: str) -> str:
    """Return an ASCII, lowercase, dash-separated slug for *s*.

    Accented letters are reduced to their base letter; any run of other
    characters becomes a single dash. Returns "book" when nothing is left.
    """
    s = unicodedata.normalize("NFKD", s).encode("ascii", "ignore").decode("ascii")
    s = re.sub(r"[^a-zA-Z0-9]+", "-", s).strip("-").lower()
    return s or "book"


def _normalise(text: str) -> str:
    """Collapse whitespace and tidy the spacing around punctuation.

    Non-breaking spaces become plain spaces, whitespace runs collapse to one
    space, whitespace before closing punctuation and after an opening
    ``¡``/``¿`` is removed. Quote characters are left untouched.
    """
    text = text.replace("\u00a0", " ")
    text = re.sub(r"\s+", " ", text).strip()
    text = re.sub(r"\s+([.,;:!?…])", r"\1", text)
    text = re.sub(r"([¡¿])\s+", r"\1", text)
    return text


def _read_zip_text(zf: zipfile.ZipFile, path: str) -> str:
    """Return the archive member *path* decoded as UTF-8.

    Raises ``KeyError`` when the member does not exist.
    """
    # "utf-8-sig" drops a leading byte-order mark if one is present.
    return zf.read(path).decode("utf-8-sig")


def _container_root(zf: zipfile.ZipFile) -> str:
    """Return the OPF path declared in META-INF/container.xml.

    Raises ``RuntimeError`` when no usable rootfile entry is present.
    """
    container = ET.fromstring(_read_zip_text(zf, "META-INF/container.xml"))
    rootfile = container.find(
        ".//{urn:oasis:names:tc:opendocument:xmlns:container}rootfile"
    )
    full_path = rootfile.attrib.get("full-path") if rootfile is not None else None
    if not full_path:
        raise RuntimeError("Missing rootfile entry in META-INF/container.xml")
    return full_path


def _parse_opf(zf: zipfile.ZipFile, opf_path: str) -> dict:
    """Parse the OPF package document.

    Returns a dict with ``title``, ``author``, ``language`` (stripped
    strings, "" when absent), ``manifest`` (id -> href), ``spine`` (ordered
    idrefs), ``ncx_href`` (manifest href of the NCX, or ``None``) and
    ``opf_dir`` (archive directory of the OPF, "" when at the root).
    """
    root = ET.fromstring(_read_zip_text(zf, opf_path))

    def _meta(tag: str) -> str:
        found = root.findtext(f".//dc:{tag}", default="", namespaces=NS)
        return (found or "").strip()

    manifest: dict[str, str] = {}
    ncx_by_type: str | None = None
    for item in root.findall("opf:manifest/opf:item", NS):
        item_id = item.attrib.get("id")
        href = item.attrib.get("href")
        if not item_id or not href:
            continue  # Malformed entry: nothing can reference it.
        manifest[item_id] = href
        if ncx_by_type is None and item.attrib.get("media-type") == _NCX_MEDIA_TYPE:
            ncx_by_type = href

    spine_el = root.find("opf:spine", NS)
    spine: list[str] = []
    ncx_id = None
    if spine_el is not None:
        spine = [
            ref.attrib["idref"]
            for ref in spine_el.findall("opf:itemref", NS)
            if "idref" in ref.attrib
        ]
        ncx_id = spine_el.attrib.get("toc")

    # Prefer the NCX named by <spine toc="...">; otherwise fall back to the
    # manifest item carrying the NCX media type.
    ncx_href = manifest.get(ncx_id) if ncx_id else None
    if ncx_href is None:
        ncx_href = ncx_by_type

    return {
        "title": _meta("title"),
        "author": _meta("creator"),
        "language": _meta("language"),
        "manifest": manifest,
        "spine": spine,
        "ncx_href": ncx_href,
        # Archive paths always use "/", so use posixpath rather than the
        # platform-dependent pathlib.
        "opf_dir": posixpath.dirname(opf_path),
    }


def _parse_ncx(zf: zipfile.ZipFile, ncx_path: str) -> list[dict]:
    """Return the top-level NCX navPoints as ``{"title", "href"}`` dicts.

    Only direct children of navMap are read (nested navPoints are ignored).
    ``href`` is the content ``src`` with any ``#fragment`` removed, still
    relative to the NCX file.
    """
    root = ET.fromstring(_read_zip_text(zf, ncx_path))
    chapters: list[dict] = []
    for nav in root.findall("ncx:navMap/ncx:navPoint", NS):
        title = (
            nav.findtext("ncx:navLabel/ncx:text", default="", namespaces=NS) or ""
        ).strip()
        content = nav.find("ncx:content", NS)
        src = content.attrib.get("src", "") if content is not None else ""
        # Strip the anchor — we want the file path only.
        href = src.split("#", 1)[0]
        chapters.append({"title": title, "href": href})
    return chapters


def _resolve_zip_path(base_dir: str, href: str) -> str:
    """Resolve *href* (relative to *base_dir*) to an archive member name.

    The href is percent-decoded and ``.``/``..`` segments are collapsed so
    the result can be compared with ZIP member names. Returns "" for an
    empty href.
    """
    href = unquote(href)
    if not href:
        return ""
    joined = f"{base_dir}/{href}" if base_dir else href
    return posixpath.normpath(joined).lstrip("/")


def _extract_paragraphs(zf: zipfile.ZipFile, zip_path: str) -> list[str]:
    """Return the normalised, non-empty text of every <p> in *zip_path*.

    A member missing from the archive yields an empty list (with a warning
    on stderr) rather than aborting the whole extraction.
    """
    try:
        html = _read_zip_text(zf, zip_path)
    except KeyError:
        print(f"Warning: {zip_path} not found in the archive; skipped.", file=sys.stderr)
        return []
    soup = BeautifulSoup(html, "lxml")
    paragraphs: list[str] = []
    for p in soup.find_all("p"):
        text = _normalise(p.get_text(" ", strip=True))
        # Paragraphs with no real text (e.g. anchor-only wrappers) are dropped.
        if text:
            paragraphs.append(text)
    return paragraphs


def _chapter_files(
    spine_files: list[str], chapter_hrefs: list[str]
) -> list[list[str]]:
    """Slice the spine into one list of files per chapter.

    Each chapter starts at the spine position of its first href and runs up
    to the start of the next chapter that was found in the spine. Files
    before the first chapter (e.g. cover, titlepage) are dropped. A chapter
    whose href is not in the spine gets an empty list, as does one whose
    successor starts at or before its own position.
    """
    first_index: dict[str, int] = {}
    for idx, name in enumerate(spine_files):
        first_index.setdefault(name, idx)

    boundaries = [first_index.get(href, -1) for href in chapter_hrefs]
    ranges: list[list[str]] = []
    for i, start in enumerate(boundaries):
        if start < 0:
            ranges.append([])
            continue
        end = next((b for b in boundaries[i + 1:] if b >= 0), len(spine_files))
        ranges.append(spine_files[start:end])
    return ranges


def main() -> None:
    """Command-line entry point: read the EPUB and write chapters.json."""
    parser = argparse.ArgumentParser()
    parser.add_argument("epub", type=Path)
    parser.add_argument("--slug", default=None)
    parser.add_argument("--out", type=Path, default=None)
    args = parser.parse_args()

    if not args.epub.exists():
        print(f"EPUB not found: {args.epub}", file=sys.stderr)
        sys.exit(2)
    if BeautifulSoup is None:
        print(
            "Missing dependency: install it with 'pip install beautifulsoup4 lxml'.",
            file=sys.stderr,
        )
        sys.exit(4)

    with zipfile.ZipFile(args.epub) as zf:
        opf_path = _container_root(zf)
        opf = _parse_opf(zf, opf_path)
        if not opf["ncx_href"]:
            print("No NCX found in spine; cannot derive chapter structure.", file=sys.stderr)
            sys.exit(3)
        ncx_path = _resolve_zip_path(opf["opf_dir"], opf["ncx_href"])
        toc = _parse_ncx(zf, ncx_path)

        # Spine idrefs without a manifest entry cannot be read; skip them.
        spine_files = [
            _resolve_zip_path(opf["opf_dir"], opf["manifest"][idref])
            for idref in opf["spine"]
            if idref in opf["manifest"]
        ]
        # NCX src values are relative to the NCX file, which need not sit
        # next to the OPF.
        ncx_dir = posixpath.dirname(ncx_path)
        chapter_hrefs = [_resolve_zip_path(ncx_dir, c["href"]) for c in toc]
        chapter_file_ranges = _chapter_files(spine_files, chapter_hrefs)

        chapters_out: list[dict] = []
        for i, (meta, files) in enumerate(zip(toc, chapter_file_ranges), start=1):
            paragraphs: list[str] = []
            for f in files:
                paragraphs.extend(_extract_paragraphs(zf, f))
            # Drop leading paragraph(s) that just echo the chapter title — the
            # title is already stored separately.
            title_norm = _normalise(meta["title"]).lower()
            skip = 0
            while (
                skip < len(paragraphs)
                and _normalise(paragraphs[skip]).lower() == title_norm
            ):
                skip += 1
            chapters_out.append(
                {
                    "id": f"ch{i}",
                    "number": i,
                    "title": meta["title"],
                    "paragraphsES": paragraphs[skip:],
                }
            )

    # _slugify never returns "", so the filename fallback has to be chosen
    # before slugifying.
    slug = args.slug or _slugify(opf["title"] or args.epub.stem)
    out_dir = args.out or (Path("build") / slug)
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / "chapters.json"
    payload = {
        "title": opf["title"],
        "author": opf["author"],
        "language": opf["language"],
        "slug": slug,
        "chapters": chapters_out,
    }
    out_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")

    total_paragraphs = sum(len(c["paragraphsES"]) for c in chapters_out)
    print(f"Wrote {out_path}")
    print(f" Title: {opf['title']}")
    print(f" Author: {opf['author']}")
    print(f" Chapters: {len(chapters_out)}")
    print(f" Paragraphs: {total_paragraphs}")
    for ch in chapters_out:
        print(f" ch{ch['number']:02d} {len(ch['paragraphsES']):4d} ¶ {ch['title']}")


if __name__ == "__main__":
    main()