09e49bda2c
New "Books" row in the Practice tab opens a library of bundled bilingual books. Each chapter renders Spanish paragraph-by-paragraph; tap any word for a definition sheet (DictionaryService with on-device AI fallback), or toggle the toolbar button to swap to the pre-computed English translation inline. Local-only Book + BookChapter SwiftData models added to the local container schema (reset version bumped to 5). DataLoader.seedBooks walks the bundle for `book_*.json` resources, so future books drop in without touching app code — just bundle a new JSON and bump bookDataVersion. First book: Olly Richards' "Spanish Short Stories For Beginners Vol 2" — 13 chapters, 2,646 paragraphs, bilingual. Scripts/books/ is the repeatable pipeline for future EPUBs: extract_epub.py → translate_chapters.py (per-chapter resumable jobs) → bundle_book.py. Translation is done by parallel Claude Code subagents reading per-job input files and writing output files — no API key required, matching the pattern used for the textbook vocab vision pass. See Scripts/books/README.md for the full how-to. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
259 lines
8.6 KiB
Python
259 lines
8.6 KiB
Python
#!/usr/bin/env python3
|
||
"""Parse an EPUB into chapters.json for the in-app Books feature.
|
||
|
||
Usage:
|
||
python3 extract_epub.py <epub_path> [--slug SLUG] [--out OUT_DIR]
|
||
|
||
Defaults:
|
||
SLUG derived from the EPUB's title metadata (ASCII-folded, lowercased, dashed)
|
||
OUT_DIR ./build/<slug>
|
||
|
||
Output:
|
||
OUT_DIR/chapters.json
|
||
{
|
||
"title": "...",
|
||
"author": "...",
|
||
"language": "...",
|
||
"slug": "...",
|
||
"chapters": [
|
||
{"id": "ch1", "number": 1, "title": "Preface",
|
||
"paragraphsES": ["...", "..."]},
|
||
...
|
||
]
|
||
}
|
||
|
||
How chapter grouping works:
|
||
1. Read content.opf manifest (id -> href) and spine (ordered idrefs).
|
||
2. Read toc.ncx navMap to get the ordered list of chapter (title, first-href).
|
||
3. For each chapter, claim every spine file from its first href up to (but
|
||
not including) the next chapter's first href.
|
||
4. For each file in the chapter's range, parse <p> elements, strip tags,
|
||
normalise whitespace and spacing around punctuation, drop empties.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import argparse
import json
import posixpath
import re
import sys
import unicodedata
import warnings
import zipfile
from pathlib import Path
from typing import Iterable
from urllib.parse import unquote
from xml.etree import ElementTree as ET
|
||
|
||
from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning
|
||
|
||
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)
|
||
|
||
|
||
# Prefix -> namespace URI map handed to ElementTree's find/findall/findtext
# so XPath expressions in this module can use short prefixes.
NS: dict[str, str] = {
    # OPF package document (manifest + spine).
    "opf": "http://www.idpf.org/2007/opf",
    # Dublin Core metadata (title, creator, language).
    "dc": "http://purl.org/dc/elements/1.1/",
    # NCX table of contents.
    "ncx": "http://www.daisy.org/z3986/2005/ncx/",
    # XHTML content documents.
    "xhtml": "http://www.w3.org/1999/xhtml",
}
|
||
|
||
|
||
def _slugify(s: str) -> str:
|
||
s = unicodedata.normalize("NFKD", s).encode("ascii", "ignore").decode("ascii")
|
||
s = re.sub(r"[^a-zA-Z0-9]+", "-", s).strip("-").lower()
|
||
return s or "book"
|
||
|
||
|
||
def _normalise(text: str) -> str:
|
||
# Collapse runs of whitespace, normalise smart quotes to plain ones.
|
||
text = text.replace(" ", " ")
|
||
text = re.sub(r"\s+", " ", text).strip()
|
||
text = re.sub(r"\s+([.,;:!?…])", r"\1", text)
|
||
text = re.sub(r"([¡¿])\s+", r"\1", text)
|
||
return text
|
||
|
||
|
||
def _read_zip_text(zf: zipfile.ZipFile, path: str) -> str:
|
||
return zf.read(path).decode("utf-8")
|
||
|
||
|
||
def _container_root(zf: zipfile.ZipFile) -> str:
    """Return the zip path of the package (OPF) document.

    Parses ``META-INF/container.xml`` and returns the ``full-path`` attribute
    of the first ``<rootfile>`` element found anywhere in it.

    Raises:
        RuntimeError: container.xml contains no ``<rootfile>`` element.
        KeyError: the ``<rootfile>`` has no ``full-path`` attribute, or
            container.xml itself is missing from the archive.
    """
    xml_text = _read_zip_text(zf, "META-INF/container.xml")
    tree = ET.fromstring(xml_text)
    node = tree.find(".//{urn:oasis:names:tc:opendocument:xmlns:container}rootfile")
    if node is not None:
        return node.attrib["full-path"]
    raise RuntimeError("Missing rootfile entry in META-INF/container.xml")
|
||
|
||
|
||
def _parse_opf(zf: zipfile.ZipFile, opf_path: str) -> dict:
    """Parse the OPF package document stored at *opf_path* in the archive.

    Returns a dict with these keys:
        title, author, language: stripped text of the first ``dc:title`` /
            ``dc:creator`` / ``dc:language`` element ("" when absent).
        manifest: ``{item id: item href}`` for every manifest item.
        spine: the spine's ``idref`` values in reading order.
        ncx_href: href of the NCX table of contents, or None. Taken from the
            manifest item named by ``<spine toc="...">``; if that attribute
            is missing or dangling, the first manifest item whose media type
            is ``application/x-dtbncx+xml`` is used instead.
        opf_dir: directory of the OPF inside the zip ("" when it sits at the
            archive root). Manifest hrefs are relative to this directory.

    Raises:
        KeyError: a manifest item lacks ``id``/``href`` or a spine itemref
            lacks ``idref``.
    """
    root = ET.fromstring(_read_zip_text(zf, opf_path))

    def _meta(tag: str) -> str:
        # First matching Dublin Core element anywhere in the document.
        return (root.findtext(f".//dc:{tag}", default="", namespaces=NS) or "").strip()

    manifest: dict[str, str] = {}
    ncx_by_media_type = None
    for item in root.findall("opf:manifest/opf:item", NS):
        manifest[item.attrib["id"]] = item.attrib["href"]
        if (
            ncx_by_media_type is None
            and item.attrib.get("media-type") == "application/x-dtbncx+xml"
        ):
            ncx_by_media_type = item.attrib["href"]

    spine = [
        itemref.attrib["idref"]
        for itemref in root.findall("opf:spine/opf:itemref", NS)
    ]

    # Look the <spine> element up once; its "toc" attribute names the NCX.
    spine_el = root.find("opf:spine", NS)
    toc_id = spine_el.attrib.get("toc") if spine_el is not None else None
    ncx_href = (manifest.get(toc_id) if toc_id else None) or ncx_by_media_type

    return {
        "title": _meta("title"),
        "author": _meta("creator"),
        "language": _meta("language"),
        "manifest": manifest,
        "spine": spine,
        "ncx_href": ncx_href,
        # Zip member names always use "/", so split on it directly rather
        # than going through pathlib, which would yield "\\" on Windows.
        "opf_dir": opf_path.rpartition("/")[0],
    }
|
||
|
||
|
||
def _parse_ncx(zf: zipfile.ZipFile, ncx_path: str) -> list[dict]:
    """Read the top-level table-of-contents entries from an NCX file.

    Only ``<navPoint>`` elements that are direct children of ``<navMap>`` are
    considered; nested navPoints are not visited.

    Returns:
        One ``{"title": str, "href": str}`` dict per navPoint, in document
        order. ``title`` is the stripped navLabel text ("" when absent) and
        ``href`` is the ``content@src`` value with any ``#fragment`` removed
        ("" when there is no ``<content>`` element or no ``src``).
    """
    nav_root = ET.fromstring(_read_zip_text(zf, ncx_path))
    entries: list[dict] = []
    for point in nav_root.findall("ncx:navMap/ncx:navPoint", NS):
        label = point.findtext("ncx:navLabel/ncx:text", default="", namespaces=NS) or ""
        target = point.find("ncx:content", NS)
        src = "" if target is None else target.attrib.get("src", "")
        # Keep only the file part; the anchor is irrelevant for grouping
        # spine files into chapters.
        file_part, _, _ = src.partition("#")
        entries.append({"title": label.strip(), "href": file_part})
    return entries
|
||
|
||
|
||
def _resolve_zip_path(base_dir: str, href: str) -> str:
|
||
if not base_dir:
|
||
return href
|
||
return f"{base_dir}/{href}".lstrip("/")
|
||
|
||
|
||
def _extract_paragraphs(zf: zipfile.ZipFile, zip_path: str) -> list[str]:
    """Return the normalised text of every non-empty ``<p>`` in one file.

    The member at *zip_path* is parsed with BeautifulSoup's "lxml" parser.
    For each ``<p>`` the text is extracted with a space separator, passed
    through ``_normalise``, and kept only if something remains (this drops
    paragraphs that merely wrap an anchor or image).

    Extraction is best-effort: when *zip_path* is not in the archive a
    warning is written to stderr and an empty list is returned, so one
    dangling spine entry does not abort the whole run.

    Paragraphs that merely repeat the chapter title are NOT removed here.
    """
    try:
        html = _read_zip_text(zf, zip_path)
    except KeyError:
        print(
            f"warning: spine file not found in EPUB, skipping: {zip_path!r}",
            file=sys.stderr,
        )
        return []

    soup = BeautifulSoup(html, "lxml")
    cleaned = (_normalise(p.get_text(" ", strip=True)) for p in soup.find_all("p"))
    return [text for text in cleaned if text]
|
||
|
||
|
||
def _chapter_files(
|
||
spine_files: list[str], chapter_hrefs: list[str]
|
||
) -> list[list[str]]:
|
||
"""Slice the spine into one list of files per chapter, using the chapter's
|
||
first href as the chapter boundary. Files before the first chapter (e.g.
|
||
cover, titlepage) are dropped."""
|
||
boundaries: list[int] = []
|
||
for href in chapter_hrefs:
|
||
try:
|
||
idx = spine_files.index(href)
|
||
except ValueError:
|
||
boundaries.append(-1)
|
||
continue
|
||
boundaries.append(idx)
|
||
|
||
ranges: list[list[str]] = []
|
||
for i, start in enumerate(boundaries):
|
||
if start < 0:
|
||
ranges.append([])
|
||
continue
|
||
end = len(spine_files)
|
||
for next_start in boundaries[i + 1:]:
|
||
if next_start >= 0:
|
||
end = next_start
|
||
break
|
||
ranges.append(spine_files[start:end])
|
||
return ranges
|
||
|
||
|
||
def main() -> None:
    """Command-line entry point: extract an EPUB into ``chapters.json``.

    Parses the arguments (``epub`` path, optional ``--slug`` and ``--out``),
    reads the EPUB's OPF and NCX, groups spine files into chapters, pulls
    the paragraphs out of each chapter, writes ``OUT_DIR/chapters.json`` and
    prints a per-chapter summary.

    Exit codes:
        2: the EPUB path does not exist.
        3: the package document does not identify an NCX table of contents.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("epub", type=Path)
    parser.add_argument("--slug", default=None)
    parser.add_argument("--out", type=Path, default=None)
    args = parser.parse_args()

    if not args.epub.exists():
        print(f"EPUB not found: {args.epub}", file=sys.stderr)
        sys.exit(2)

    with zipfile.ZipFile(args.epub) as zf:
        opf_path = _container_root(zf)
        opf = _parse_opf(zf, opf_path)

        if not opf["ncx_href"]:
            print("No NCX found in spine; cannot derive chapter structure.", file=sys.stderr)
            sys.exit(3)

        # Manifest hrefs (including the NCX's) are relative to the OPF.
        ncx_path = _resolve_zip_path(opf["opf_dir"], opf["ncx_href"])
        toc = _parse_ncx(zf, ncx_path)

        spine_files = [
            _resolve_zip_path(opf["opf_dir"], opf["manifest"].get(idref, ""))
            for idref in opf["spine"]
        ]
        # NCX content@src values are relative to the NCX file itself, which
        # is not necessarily in the same directory as the OPF.
        ncx_dir = ncx_path.rpartition("/")[0]
        chapter_hrefs = [_resolve_zip_path(ncx_dir, c["href"]) for c in toc]
        chapter_file_ranges = _chapter_files(spine_files, chapter_hrefs)

        chapters_out: list[dict] = []
        for i, (meta, files) in enumerate(zip(toc, chapter_file_ranges), start=1):
            paragraphs: list[str] = []
            for f in files:
                paragraphs.extend(_extract_paragraphs(zf, f))

            # Drop leading paragraph(s) that just echo the chapter title —
            # the title is already stored separately.
            title_norm = _normalise(meta["title"]).lower()
            echoed = 0
            while (
                echoed < len(paragraphs)
                and _normalise(paragraphs[echoed]).lower() == title_norm
            ):
                echoed += 1
            del paragraphs[:echoed]

            chapters_out.append(
                {
                    "id": f"ch{i}",
                    "number": i,
                    "title": meta["title"],
                    "paragraphsES": paragraphs,
                }
            )

    # _slugify never returns an empty string, so the filename fallback has to
    # be chosen before slugifying rather than chained after it with `or`.
    slug = args.slug or _slugify(opf["title"] or args.epub.stem)
    out_dir = args.out or (Path("build") / slug)
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / "chapters.json"

    payload = {
        "title": opf["title"],
        "author": opf["author"],
        "language": opf["language"],
        "slug": slug,
        "chapters": chapters_out,
    }
    out_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")

    total_paragraphs = sum(len(c["paragraphsES"]) for c in chapters_out)
    print(f"Wrote {out_path}")
    print(f" Title: {opf['title']}")
    print(f" Author: {opf['author']}")
    print(f" Chapters: {len(chapters_out)}")
    print(f" Paragraphs: {total_paragraphs}")
    for ch in chapters_out:
        print(f" ch{ch['number']:02d} {len(ch['paragraphsES']):4d} ¶ {ch['title']}")
|
||
|
||
|
||
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|