Files
Spanish/Conjuga/Scripts/books/extract_epub.py
T
Trey T 05a367fdbe Books — capture <li> vocab bullets the extractor was silently dropping
extract_epub.py was walking <p> only, but every "Vocabulario" section in
the Olly Richards EPUB lives inside <ul><li>...</li></ul>. That meant
the heading made it through but the entries didn't — 680 vocab lines
across 24 sections in this book were missing from the bundled JSON.

Audit (text-node owner by closest block ancestor) confirmed <li> is the
only silent drop: 5,260 nodes in <p>, 1,960 in <li>, 0 anywhere else.
No <h1>-<h6>, tables, or blockquotes in this EPUB at all.

Fix: walk find_all(["p", "li"]) in document order so bullet entries
slot in right after their "Vocabulario" / list heading. Re-extracted
(2,646 → 3,326 paragraphs), re-translated all 118 jobs using parallel
Claude Code subagents. The translate_chapters.py prompt template now tells
subagents to keep bilingual `palabra = meaning` lines verbatim — both
sides already coexist on the line.

Bumped bookDataVersion to 2 so refreshBooksDataIfNeeded re-seeds.
Verified in simulator: all 13 chapter row sizes grew (e.g. ch6
18,295→20,951 chars).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-11 10:10:34 -05:00

258 lines
8.5 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""Parse an EPUB into chapters.json for the in-app Books feature.
Usage:
python3 extract_epub.py <epub_path> [--slug SLUG] [--out OUT_DIR]
Defaults:
SLUG derived from the book title in the EPUB metadata (ASCII-folded, lowercased, dashed)
OUT_DIR ./build/<slug>
Output:
OUT_DIR/chapters.json
{
"title": "...",
"author": "...",
"language": "...",
"slug": "...",
"chapters": [
{"id": "ch1", "number": 1, "title": "Preface",
"paragraphsES": ["...", "..."]},
...
]
}
How chapter grouping works:
1. Read content.opf manifest (id -> href) and spine (ordered idrefs).
2. Read toc.ncx navMap to get the ordered list of chapter (title, first-href).
3. For each chapter, claim every spine file from its first href up to (but
not including) the next chapter's first href.
4. For each file in the chapter's range, parse <p> and <li> elements in
document order, strip tags, normalise whitespace and the spacing around
punctuation, drop empties.
"""
from __future__ import annotations
import argparse
import json
import re
import sys
import unicodedata
import warnings
import zipfile
from pathlib import Path
from typing import Iterable
from xml.etree import ElementTree as ET
from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning
# The EPUB content documents are parsed below with BeautifulSoup's "lxml"
# HTML parser (see _extract_paragraphs); bs4's XMLParsedAsHTMLWarning is
# silenced for the whole run so it does not clutter the script's output.
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)
# Namespace prefix -> URI map handed to ElementTree's find / findall /
# findtext when reading the OPF package document and the NCX table of
# contents.
NS = {
"opf": "http://www.idpf.org/2007/opf",
"dc": "http://purl.org/dc/elements/1.1/",
"ncx": "http://www.daisy.org/z3986/2005/ncx/",
"xhtml": "http://www.w3.org/1999/xhtml",
}
def _slugify(s: str) -> str:
    """Turn arbitrary text into a lowercase, dash-separated ASCII slug.

    Accented characters are decomposed (NFKD) and whatever cannot be encoded
    as ASCII is discarded; each run of non-alphanumeric characters becomes a
    single dash, and leading/trailing dashes are trimmed. Returns ``"book"``
    when nothing usable is left.
    """
    decomposed = unicodedata.normalize("NFKD", s)
    ascii_only = decomposed.encode("ascii", "ignore").decode("ascii")
    dashed = re.sub(r"[^a-zA-Z0-9]+", "-", ascii_only)
    slug = dashed.strip("-").lower()
    if not slug:
        return "book"
    return slug
def _normalise(text: str) -> str:
    """Tidy whitespace and punctuation spacing in extracted text.

    * Non-breaking spaces become ordinary spaces.
    * Every run of whitespace collapses to one space; both ends are trimmed.
    * Whitespace directly before ``. , ; : ! ? …`` is removed.
    * Whitespace directly after an opening ``¡`` or ``¿`` is removed.

    Quote characters are left exactly as they appear in the input.
    """
    # Spelled as an escape so the non-breaking space is visible in the
    # source instead of being an invisible literal. The `\s+` pass below
    # matches Unicode spaces as well, so the result is the same either way.
    text = text.replace("\u00a0", " ")
    text = re.sub(r"\s+", " ", text).strip()
    # Joining inline elements with " " can leave a gap before punctuation
    # ("palabra ," -> "palabra,").
    text = re.sub(r"\s+([.,;:!?…])", r"\1", text)
    # Spanish opening marks hug the word that follows ("¿ Qué" -> "¿Qué").
    text = re.sub(r"([¡¿])\s+", r"\1", text)
    return text
def _read_zip_text(zf: zipfile.ZipFile, path: str) -> str:
    """Return the archive member ``path`` decoded as UTF-8 text."""
    raw = zf.read(path)
    return str(raw, "utf-8")
def _container_root(zf: zipfile.ZipFile) -> str:
    """Return the archive path of the package (OPF) document.

    Reads ``META-INF/container.xml`` and returns the ``full-path`` attribute
    of the first ``rootfile`` element found anywhere in it.

    Raises:
        RuntimeError: if the container has no ``rootfile`` element.
    """
    rootfile_tag = "{urn:oasis:names:tc:opendocument:xmlns:container}rootfile"
    xml_text = _read_zip_text(zf, "META-INF/container.xml")
    match = ET.fromstring(xml_text).find(f".//{rootfile_tag}")
    if match is not None:
        return match.attrib["full-path"]
    raise RuntimeError("Missing rootfile entry in META-INF/container.xml")
def _parse_opf(zf: zipfile.ZipFile, opf_path: str):
    """Read the OPF package document and return its metadata and structure.

    Args:
        zf: the open EPUB archive.
        opf_path: archive path of the OPF file.

    Returns:
        A dict with the keys

        * ``title`` / ``author`` / ``language`` - stripped text of the first
          ``dc:title`` / ``dc:creator`` / ``dc:language`` element ("" when
          absent),
        * ``manifest`` - item id -> href exactly as written in the OPF,
        * ``spine`` - itemref idrefs in document order,
        * ``ncx_href`` - manifest href of the item named by the spine's
          ``toc`` attribute, or None when there is no spine, no ``toc``
          attribute, or no such manifest item,
        * ``opf_dir`` - directory of the OPF inside the archive ("" when the
          OPF sits at the archive root).
    """
    # Archive member names always use "/" regardless of platform, so the
    # directory is taken with posixpath rather than the OS-specific Path
    # (which would produce backslashes on Windows for nested directories).
    import posixpath

    root = ET.fromstring(_read_zip_text(zf, opf_path))

    def _meta(tag: str) -> str:
        return (root.findtext(f".//{tag}", default="", namespaces=NS) or "").strip()

    manifest: dict[str, str] = {
        item.attrib["id"]: item.attrib["href"]
        for item in root.findall("opf:manifest/opf:item", NS)
    }
    spine: list[str] = [
        itemref.attrib["idref"]
        for itemref in root.findall("opf:spine/opf:itemref", NS)
    ]

    # Look the spine element up once; its "toc" attribute names the NCX item.
    spine_el = root.find("opf:spine", NS)
    ncx_id = spine_el.attrib.get("toc") if spine_el is not None else None
    ncx_href = manifest.get(ncx_id) if ncx_id else None

    return {
        "title": _meta("dc:title"),
        "author": _meta("dc:creator"),
        "language": _meta("dc:language"),
        "manifest": manifest,
        "spine": spine,
        "ncx_href": ncx_href,
        "opf_dir": posixpath.dirname(opf_path),
    }
def _parse_ncx(zf: zipfile.ZipFile, ncx_path: str) -> list[dict]:
    """List the top-level table-of-contents entries of an NCX document.

    Only ``navPoint`` elements that are direct children of ``navMap`` are
    read. Each one yields ``{"title": ..., "href": ...}``, where ``title`` is
    the stripped label text and ``href`` is the ``content`` element's ``src``
    with any ``#fragment`` removed ("" when there is no ``content`` element
    or no ``src`` attribute).
    """
    nav_root = ET.fromstring(_read_zip_text(zf, ncx_path))

    def _entry(nav_point) -> dict:
        label = nav_point.findtext(
            "ncx:navLabel/ncx:text", default="", namespaces=NS
        )
        target = nav_point.find("ncx:content", NS)
        raw_src = "" if target is None else target.attrib.get("src", "")
        # Keep only the file part; the in-file anchor is not needed.
        file_part, _, _ = raw_src.partition("#")
        return {"title": (label or "").strip(), "href": file_part}

    return [_entry(point) for point in nav_root.findall("ncx:navMap/ncx:navPoint", NS)]
def _resolve_zip_path(base_dir: str, href: str) -> str:
    """Join a relative ``href`` onto ``base_dir`` as an archive member path.

    Args:
        base_dir: directory inside the archive ("" for the archive root).
        href: path relative to ``base_dir``.

    Returns:
        The joined path with ``./`` and ``dir/../`` segments collapsed and no
        leading slash. An empty ``href`` is joined as-is without collapsing
        (e.g. ``"OEBPS/"``).
    """
    import posixpath  # archive member names always use "/" separators

    joined = f"{base_dir}/{href}" if base_dir else href
    if href:
        # Hrefs such as "../Text/ch1.xhtml" or "./ch1.xhtml" must be
        # collapsed, otherwise they never equal the member name stored in
        # the archive (nor the same file reached through another href).
        joined = posixpath.normpath(joined)
    return joined.lstrip("/")
def _extract_paragraphs(zf: zipfile.ZipFile, zip_path: str) -> list[str]:
    """Return the cleaned text of every ``<p>`` and ``<li>`` in one file.

    The archive member is parsed with BeautifulSoup's "lxml" parser. Each
    ``<p>`` / ``<li>`` contributes, in document order, its text with inline
    children joined by single spaces and passed through ``_normalise``;
    elements whose text ends up empty are skipped.

    A member that is missing from the archive yields an empty list and a
    warning on stderr rather than an exception.

    NOTE(review): ``find_all`` also returns matches nested inside other
    matches, so a ``<p>`` inside an ``<li>`` (or a nested list) would be
    emitted once as part of the outer element and once on its own. Confirm
    the source EPUBs never nest these elements.
    """
    try:
        html = _read_zip_text(zf, zip_path)
    except KeyError:
        # Stay best-effort, but make the gap visible instead of dropping a
        # whole spine file without a trace.
        print(f"Warning: {zip_path!r} not found in EPUB; skipping.", file=sys.stderr)
        return []
    soup = BeautifulSoup(html, "lxml")
    # Walk <p> and <li> together so vocab bullets (rendered as
    # <ul><li>...</li></ul>) stay in place between narrative paragraphs.
    cleaned = (
        _normalise(el.get_text(" ", strip=True))
        for el in soup.find_all(["p", "li"])
    )
    return [text for text in cleaned if text]
def _chapter_files(
    spine_files: list[str], chapter_hrefs: list[str]
) -> list[list[str]]:
    """Partition the spine into per-chapter file lists.

    A chapter starts at the first spine position of its href and runs up to
    (not including) the start of the next chapter whose href was found in the
    spine; the last such chapter runs to the end of the spine. A chapter
    whose href is not in the spine gets an empty list. Spine files ahead of
    the first chapter (cover, title page, ...) belong to no chapter.
    """
    # First position of every spine entry, mirroring list.index semantics.
    first_pos: dict[str, int] = {}
    for pos, name in enumerate(spine_files):
        first_pos.setdefault(name, pos)

    starts = [first_pos.get(href, -1) for href in chapter_hrefs]
    spine_len = len(spine_files)

    sliced: list[list[str]] = []
    for n, begin in enumerate(starts):
        if begin < 0:
            sliced.append([])
            continue
        # The next chapter that actually resolved marks where this one stops.
        stop = next((s for s in starts[n + 1:] if s >= 0), spine_len)
        sliced.append(spine_files[begin:stop])
    return sliced
def main() -> None:
    """Command-line entry point: convert one EPUB into ``chapters.json``.

    Parses the arguments, reads the package (OPF) and table of contents (NCX)
    from the EPUB, groups the spine files into chapters, extracts each
    chapter's paragraphs, writes ``<out_dir>/chapters.json`` and prints a
    short summary.

    Exit codes: 2 when the EPUB path does not exist, 3 when the package names
    no NCX to derive the chapter structure from.
    """
    import posixpath  # archive member names always use "/" separators

    parser = argparse.ArgumentParser()
    parser.add_argument("epub", type=Path)
    parser.add_argument("--slug", default=None)
    parser.add_argument("--out", type=Path, default=None)
    args = parser.parse_args()

    if not args.epub.exists():
        print(f"EPUB not found: {args.epub}", file=sys.stderr)
        sys.exit(2)

    with zipfile.ZipFile(args.epub) as zf:
        opf_path = _container_root(zf)
        opf = _parse_opf(zf, opf_path)
        if not opf["ncx_href"]:
            print("No NCX found in spine; cannot derive chapter structure.", file=sys.stderr)
            sys.exit(3)
        ncx_path = _resolve_zip_path(opf["opf_dir"], opf["ncx_href"])
        toc = _parse_ncx(zf, ncx_path)

        spine_files = [
            _resolve_zip_path(opf["opf_dir"], opf["manifest"].get(idref, ""))
            for idref in opf["spine"]
        ]
        # NCX hrefs are relative to the NCX document itself, which need not
        # live next to the OPF, so resolve them against the NCX's directory.
        ncx_dir = posixpath.dirname(posixpath.normpath(ncx_path))
        chapter_hrefs = [_resolve_zip_path(ncx_dir, c["href"]) for c in toc]
        chapter_file_ranges = _chapter_files(spine_files, chapter_hrefs)

        chapters_out: list[dict] = []
        for i, (meta, files) in enumerate(zip(toc, chapter_file_ranges), start=1):
            paragraphs: list[str] = []
            for f in files:
                paragraphs.extend(_extract_paragraphs(zf, f))
            # Drop leading paragraph(s) that just echo the chapter title —
            # the title is already stored separately.
            title_norm = _normalise(meta["title"]).lower()
            while paragraphs and _normalise(paragraphs[0]).lower() == title_norm:
                paragraphs.pop(0)
            chapters_out.append(
                {
                    "id": f"ch{i}",
                    "number": i,
                    "title": meta["title"],
                    "paragraphsES": paragraphs,
                }
            )

    # _slugify never returns an empty string (it falls back to "book"), so
    # the file-name fallback has to be chosen before slugifying, not after.
    slug = args.slug or _slugify(opf["title"] or args.epub.stem)
    out_dir = args.out or (Path("build") / slug)
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / "chapters.json"
    payload = {
        "title": opf["title"],
        "author": opf["author"],
        "language": opf["language"],
        "slug": slug,
        "chapters": chapters_out,
    }
    out_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")

    total_paragraphs = sum(len(c["paragraphsES"]) for c in chapters_out)
    print(f"Wrote {out_path}")
    print(f" Title: {opf['title']}")
    print(f" Author: {opf['author']}")
    print(f" Chapters: {len(chapters_out)}")
    print(f" Paragraphs: {total_paragraphs}")
    for ch in chapters_out:
        # Two spaces keep the paragraph count from running into the title.
        print(f" ch{ch['number']:02d} {len(ch['paragraphsES']):4d}  {ch['title']}")


if __name__ == "__main__":
    main()