Add Books — read EPUB-imported books in Practice with tap-to-define

New "Books" row in the Practice tab opens a library of bundled bilingual
books. Each chapter renders Spanish paragraph-by-paragraph; tap any
word for a definition sheet (DictionaryService with on-device AI
fallback), or toggle the toolbar button to swap to the pre-computed
English translation inline.

Local-only Book + BookChapter SwiftData models added to the local
container schema (reset version bumped to 5). DataLoader.seedBooks
walks the bundle for `book_*.json` resources, so future books drop in
without touching app code — just bundle a new JSON and bump
bookDataVersion.

First book: Olly Richards' "Spanish Short Stories For Beginners
Vol 2" — 13 chapters, 2,646 paragraphs, bilingual.

Scripts/books/ is the repeatable pipeline for future EPUBs:
extract_epub.py → translate_chapters.py (per-chapter resumable jobs) →
bundle_book.py. Translation is done by parallel Claude Code subagents
reading per-job input files and writing output files — no API key
required, matching the pattern used for the textbook vocab vision
pass. See Scripts/books/README.md for the full how-to.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Trey T
2026-05-11 09:21:44 -05:00
parent ade091f108
commit 09e49bda2c
17 changed files with 6782 additions and 1 deletions
+258
View File
@@ -0,0 +1,258 @@
#!/usr/bin/env python3
"""Parse an EPUB into chapters.json for the in-app Books feature.
Usage:
python3 extract_epub.py <epub_path> [--slug SLUG] [--out OUT_DIR]
Defaults:
SLUG derived from the EPUB's title metadata (ASCII-folded, lowercased, dashed)
OUT_DIR ./build/<slug>
Output:
OUT_DIR/chapters.json
{
"title": "...",
"author": "...",
"language": "...",
"slug": "...",
"chapters": [
{"id": "ch1", "number": 1, "title": "Preface",
"paragraphsES": ["...", "..."]},
...
]
}
How chapter grouping works:
1. Read content.opf manifest (id -> href) and spine (ordered idrefs).
2. Read toc.ncx navMap to get the ordered list of chapter (title, first-href).
3. For each chapter, claim every spine file from its first href up to (but
not including) the next chapter's first href.
4. For each file in the chapter's range, parse <p> elements, strip tags,
normalise whitespace and the spacing around punctuation, drop empties.
"""
from __future__ import annotations

import argparse
import json
import posixpath
import re
import sys
import unicodedata
import warnings
import zipfile
from pathlib import Path
from typing import Iterable
from xml.etree import ElementTree as ET

from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning

# The chapter files are XHTML but are handed to BeautifulSoup's "lxml" HTML
# parser further down, which makes bs4 emit XMLParsedAsHTMLWarning for each
# file; silence that one category for the whole run.
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

# Prefix -> namespace-URI map passed to the ElementTree find/findall/findtext
# calls that query the OPF package document and the NCX table of contents.
NS = {
    "opf": "http://www.idpf.org/2007/opf",
    "dc": "http://purl.org/dc/elements/1.1/",
    "ncx": "http://www.daisy.org/z3986/2005/ncx/",
    # NOTE(review): no lookup in this script appears to use the "xhtml" prefix.
    "xhtml": "http://www.w3.org/1999/xhtml",
}
def _slugify(s: str) -> str:
    """Turn arbitrary text into a lowercase, dash-separated ASCII slug.

    Accented characters are decomposed (NFKD) and their non-ASCII parts are
    discarded, so "Niño" becomes "nino". Every run of characters outside
    ``[a-zA-Z0-9]`` acts as a separator. When nothing usable is left the
    placeholder ``"book"`` is returned, so the result is never empty.
    """
    folded = unicodedata.normalize("NFKD", s).encode("ascii", "ignore").decode("ascii")
    # Joining the alphanumeric runs with "-" is the same as replacing each
    # separator run with "-" and trimming the dashes at both ends.
    words = re.findall(r"[a-zA-Z0-9]+", folded)
    if not words:
        return "book"
    return "-".join(words).lower()
def _normalise(text: str) -> str:
    """Tidy the whitespace of one extracted paragraph.

    Steps, in order:
      1. Non-breaking spaces become ordinary spaces.
      2. Every run of whitespace collapses to a single space and the ends
         are trimmed.
      3. Whitespace before closing punctuation (``. , ; : ! ? …``) is removed.
      4. Whitespace after Spanish opening marks (``¡ ¿``) is removed.

    Quote characters are left exactly as they appear in the source; no
    smart-quote conversion is performed.

    Args:
        text: Raw text, typically the concatenated text nodes of a ``<p>``.

    Returns:
        The cleaned string, which may be empty.
    """
    # Written as an escape so the non-breaking space is visible in the source
    # and cannot be silently converted to a plain space by an editor or a
    # copy/paste, which would leave a no-op replace behind.
    text = text.replace("\u00a0", " ")
    text = re.sub(r"\s+", " ", text).strip()
    # Joining inline text nodes with " " leaves gaps such as "hola ," / "¿ qué".
    text = re.sub(r"\s+([.,;:!?…])", r"\1", text)
    text = re.sub(r"([¡¿])\s+", r"\1", text)
    return text
def _read_zip_text(zf: zipfile.ZipFile, path: str) -> str:
    """Return the archive member ``path`` decoded as UTF-8 text.

    Raises:
        KeyError: if the archive has no member with that name.
        UnicodeDecodeError: if the member's bytes are not valid UTF-8.
    """
    raw = zf.read(path)
    return str(raw, "utf-8")
def _container_root(zf: zipfile.ZipFile) -> str:
    """Return the archive path of the package document named by the container.

    Reads ``META-INF/container.xml`` and returns the ``full-path`` attribute
    of the first ``rootfile`` element found anywhere in it.

    Raises:
        RuntimeError: if the container has no ``rootfile`` element.
        KeyError: if ``META-INF/container.xml`` is missing from the archive or
            the ``rootfile`` element has no ``full-path`` attribute.
    """
    container_ns = "urn:oasis:names:tc:opendocument:xmlns:container"
    document = ET.fromstring(_read_zip_text(zf, "META-INF/container.xml"))
    rootfile = document.find(f".//{{{container_ns}}}rootfile")
    if rootfile is not None:
        return rootfile.attrib["full-path"]
    raise RuntimeError("Missing rootfile entry in META-INF/container.xml")
def _parse_opf(zf: zipfile.ZipFile, opf_path: str) -> dict:
    """Read the OPF package document and return the parts this script needs.

    Args:
        zf: The opened EPUB archive.
        opf_path: Archive path of the package document.

    Returns:
        A dict with these keys:
          ``title`` / ``author`` / ``language`` - stripped text of the first
              ``dc:title`` / ``dc:creator`` / ``dc:language`` element ("" when
              absent).
          ``manifest`` - mapping of manifest item id -> href, as written.
          ``spine`` - list of spine idrefs in reading order.
          ``ncx_href`` - href of the NCX table of contents, or ``None`` when
              neither the spine nor the manifest identifies one.
          ``opf_dir`` - archive directory containing the package document
              ("" when it sits at the archive root).

    Raises:
        KeyError: if the package document is missing from the archive, or a
            manifest item / spine itemref lacks its ``id``/``href``/``idref``.
    """
    # Media type under which an NCX document is declared in the manifest.
    ncx_media_type = "application/x-dtbncx+xml"

    root = ET.fromstring(_read_zip_text(zf, opf_path))

    def _dc_text(tag: str) -> str:
        return (root.findtext(f".//dc:{tag}", default="", namespaces=NS) or "").strip()

    manifest: dict[str, str] = {}
    ncx_by_media_type = None
    for item in root.findall("opf:manifest/opf:item", NS):
        manifest[item.attrib["id"]] = item.attrib["href"]
        if ncx_by_media_type is None and item.attrib.get("media-type") == ncx_media_type:
            ncx_by_media_type = item.attrib["href"]

    spine = [ref.attrib["idref"] for ref in root.findall("opf:spine/opf:itemref", NS)]

    # Preferred: the spine's ``toc`` attribute names the NCX manifest item.
    spine_el = root.find("opf:spine", NS)
    ncx_id = spine_el.attrib.get("toc") if spine_el is not None else None
    ncx_href = manifest.get(ncx_id) if ncx_id else None
    # Fallback: some packages omit ``toc`` (or point it at an unknown id) but
    # still list an NCX in the manifest; locate it by its media type.
    if ncx_href is None:
        ncx_href = ncx_by_media_type

    return {
        "title": _dc_text("title"),
        "author": _dc_text("creator"),
        "language": _dc_text("language"),
        "manifest": manifest,
        "spine": spine,
        "ncx_href": ncx_href,
        # Zip member names always use "/", so take the directory with
        # posixpath rather than the platform's path flavour.
        "opf_dir": posixpath.dirname(opf_path),
    }
def _parse_ncx(zf: zipfile.ZipFile, ncx_path: str) -> list[dict]:
    """List the top-level table-of-contents entries of an NCX document.

    Only ``navPoint`` elements that are direct children of ``navMap`` are
    considered; nested navPoints are not visited.

    Returns:
        One ``{"title": ..., "href": ...}`` dict per entry, in document
        order. ``title`` is the stripped label text ("" if missing) and
        ``href`` is the entry's ``src`` with any ``#fragment`` removed
        ("" if the entry has no ``content`` element or no ``src``).
    """
    root = ET.fromstring(_read_zip_text(zf, ncx_path))
    entries: list[dict] = []
    for point in root.findall("ncx:navMap/ncx:navPoint", NS):
        label = point.findtext("ncx:navLabel/ncx:text", default="", namespaces=NS) or ""
        target = point.find("ncx:content", NS)
        src = "" if target is None else target.attrib.get("src", "")
        # Only the file part matters here; the in-file anchor is discarded.
        file_part, _, _ = src.partition("#")
        entries.append({"title": label.strip(), "href": file_part})
    return entries
def _resolve_zip_path(base_dir: str, href: str) -> str:
    """Turn an href relative to ``base_dir`` into an archive member path.

    Args:
        base_dir: Archive directory the href is relative to ("" for the
            archive root).
        href: Relative reference taken from the manifest or the NCX.

    Returns:
        ``base_dir`` joined with ``href`` using "/", with leading slashes
        removed when a base directory was given and with ``.``/``..``
        segments and duplicate slashes collapsed, so that
        ``("OEBPS/toc", "../Text/ch1.xhtml")`` yields
        ``"OEBPS/Text/ch1.xhtml"``. An empty ``href`` is returned joined but
        un-normalised (``"OEBPS/"`` or ``""``), as before.
    """
    joined = f"{base_dir}/{href}".lstrip("/") if base_dir else href
    if not href:
        # Nothing to normalise; normpath("") would also invent a ".".
        return joined
    # Zip member names are stored without dot segments, so a reference such
    # as "../Text/ch1.xhtml" has to be collapsed before it can match one.
    return posixpath.normpath(joined)
def _extract_paragraphs(zf: zipfile.ZipFile, zip_path: str) -> list[str]:
    """Return the cleaned text of every non-empty ``<p>`` in one archive file.

    The file is parsed with BeautifulSoup's "lxml" parser. For each ``<p>``
    the text nodes are stripped, joined with single spaces and passed through
    ``_normalise``; paragraphs that end up empty are skipped. No other
    filtering happens here.

    Returns:
        The paragraph strings in document order, or an empty list when
        ``zip_path`` is not a member of the archive.
    """
    try:
        markup = _read_zip_text(zf, zip_path)
    except KeyError:
        # The member does not exist in the archive: contribute no paragraphs.
        return []

    soup = BeautifulSoup(markup, "lxml")
    cleaned = (_normalise(tag.get_text(" ", strip=True)) for tag in soup.find_all("p"))
    # Paragraphs with no real text (e.g. wrappers around bare anchors) vanish here.
    return [text for text in cleaned if text]
def _chapter_files(
    spine_files: list[str], chapter_hrefs: list[str]
) -> list[list[str]]:
    """Partition the spine into per-chapter file lists.

    Each chapter owns the spine files from its own first file up to, but not
    including, the first file of the next chapter that could be located in
    the spine; the last located chapter runs to the end of the spine. Spine
    files that precede the first chapter (cover, title page, ...) belong to
    no chapter. A chapter whose href is not in the spine gets an empty list.

    Returns:
        A list parallel to ``chapter_hrefs``.
    """
    # Earliest spine position of every file, matching what list.index reports.
    first_position: dict[str, int] = {}
    for position, name in enumerate(spine_files):
        first_position.setdefault(name, position)

    # -1 marks a chapter whose file is absent from the spine.
    starts = [first_position.get(href, -1) for href in chapter_hrefs]
    spine_length = len(spine_files)

    per_chapter: list[list[str]] = []
    for i, begin in enumerate(starts):
        if begin < 0:
            per_chapter.append([])
            continue
        # The chapter stops where the next locatable chapter starts.
        stop = next((s for s in starts[i + 1:] if s >= 0), spine_length)
        per_chapter.append(spine_files[begin:stop])
    return per_chapter
def main() -> None:
    """Command-line entry point: convert one EPUB into ``chapters.json``.

    Parses the arguments (``epub``, ``--slug``, ``--out``), reads the package
    document and NCX table of contents from the archive, collects the
    paragraphs of every chapter, writes ``<out_dir>/chapters.json`` and prints
    a per-chapter summary to stdout.

    The slug is ``--slug`` if given, otherwise the slugified book title, and
    only when the EPUB carries no title the slugified file name. The output
    directory defaults to ``build/<slug>``.

    Exits with status 2 when the EPUB path does not exist and with status 3
    when the package document identifies no NCX.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("epub", type=Path)
    parser.add_argument("--slug", default=None)
    parser.add_argument("--out", type=Path, default=None)
    args = parser.parse_args()

    if not args.epub.exists():
        print(f"EPUB not found: {args.epub}", file=sys.stderr)
        sys.exit(2)

    with zipfile.ZipFile(args.epub) as zf:
        opf = _parse_opf(zf, _container_root(zf))
        if not opf["ncx_href"]:
            print("No NCX found in spine; cannot derive chapter structure.", file=sys.stderr)
            sys.exit(3)

        opf_dir = opf["opf_dir"]
        toc = _parse_ncx(zf, _resolve_zip_path(opf_dir, opf["ncx_href"]))
        spine_files = [
            _resolve_zip_path(opf_dir, opf["manifest"].get(idref, ""))
            for idref in opf["spine"]
        ]
        # NOTE(review): TOC hrefs are resolved against the OPF directory,
        # which is only right when the NCX sits next to the OPF - confirm
        # for EPUBs that keep the NCX in a sub-directory.
        chapter_hrefs = [_resolve_zip_path(opf_dir, entry["href"]) for entry in toc]
        chapter_file_ranges = _chapter_files(spine_files, chapter_hrefs)

        chapters_out: list[dict] = []
        for number, (meta, files) in enumerate(zip(toc, chapter_file_ranges), start=1):
            paragraphs = [
                text for name in files for text in _extract_paragraphs(zf, name)
            ]
            # Leading paragraph(s) that merely repeat the chapter title are
            # dropped; the title is stored in its own field.
            title_norm = _normalise(meta["title"]).lower()
            echoed = 0
            while (
                echoed < len(paragraphs)
                and _normalise(paragraphs[echoed]).lower() == title_norm
            ):
                echoed += 1
            chapters_out.append(
                {
                    "id": f"ch{number}",
                    "number": number,
                    "title": meta["title"],
                    "paragraphsES": paragraphs[echoed:],
                }
            )

    # _slugify never returns an empty string (it falls back to "book"), so an
    # `or` chain could never reach the file name; branch on the title instead.
    if args.slug:
        slug = args.slug
    elif opf["title"]:
        slug = _slugify(opf["title"])
    else:
        slug = _slugify(args.epub.stem)

    out_dir = args.out or (Path("build") / slug)
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / "chapters.json"
    payload = {
        "title": opf["title"],
        "author": opf["author"],
        "language": opf["language"],
        "slug": slug,
        "chapters": chapters_out,
    }
    out_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")

    total_paragraphs = sum(len(c["paragraphsES"]) for c in chapters_out)
    print(f"Wrote {out_path}")
    print(f" Title: {opf['title']}")
    print(f" Author: {opf['author']}")
    print(f" Chapters: {len(chapters_out)}")
    print(f" Paragraphs: {total_paragraphs}")
    for ch in chapters_out:
        # A separator after the count keeps it from running into the title.
        print(f" ch{ch['number']:02d} {len(ch['paragraphsES']):4d} {ch['title']}")
# Run the extractor only when this file is executed as a script, so importing
# the module (e.g. to reuse the helpers) has no side effects beyond the
# definitions above.
if __name__ == "__main__":
    main()