Add Books — read EPUB-imported books in Practice with tap-to-define
New "Books" row in the Practice tab opens a library of bundled bilingual books. Each chapter renders Spanish paragraph-by-paragraph; tap any word for a definition sheet (DictionaryService with on-device AI fallback), or toggle the toolbar button to swap to the pre-computed English translation inline. Local-only Book + BookChapter SwiftData models added to the local container schema (reset version bumped to 5). DataLoader.seedBooks walks the bundle for `book_*.json` resources, so future books drop in without touching app code — just bundle a new JSON and bump bookDataVersion. First book: Olly Richards' "Spanish Short Stories For Beginners Vol 2" — 13 chapters, 2,646 paragraphs, bilingual. Scripts/books/ is the repeatable pipeline for future EPUBs: extract_epub.py → translate_chapters.py (per-chapter resumable jobs) → bundle_book.py. Translation is done by parallel Claude Code subagents reading per-job input files and writing output files — no API key required, matching the pattern used for the textbook vocab vision pass. See Scripts/books/README.md for the full how-to. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,258 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Parse an EPUB into chapters.json for the in-app Books feature.
|
||||
|
||||
Usage:
|
||||
python3 extract_epub.py <epub_path> [--slug SLUG] [--out OUT_DIR]
|
||||
|
||||
Defaults:
|
||||
SLUG derived from the EPUB's title metadata (ASCII-folded, lowercased, dashed)
|
||||
OUT_DIR ./build/<slug>
|
||||
|
||||
Output:
|
||||
OUT_DIR/chapters.json
|
||||
{
|
||||
"title": "...",
|
||||
"author": "...",
|
||||
"language": "...",
|
||||
"slug": "...",
|
||||
"chapters": [
|
||||
{"id": "ch1", "number": 1, "title": "Preface",
|
||||
"paragraphsES": ["...", "..."]},
|
||||
...
|
||||
]
|
||||
}
|
||||
|
||||
How chapter grouping works:
|
||||
1. Read content.opf manifest (id -> href) and spine (ordered idrefs).
|
||||
2. Read toc.ncx navMap to get the ordered list of chapter (title, first-href).
|
||||
3. For each chapter, claim every spine file from its first href up to (but
|
||||
not including) the next chapter's first href.
|
||||
4. For each file in the chapter's range, parse <p> elements, strip tags,
|
||||
normalise whitespace and the spacing around punctuation, drop empties.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import unicodedata
|
||||
import warnings
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
from xml.etree import ElementTree as ET
|
||||
|
||||
from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning
|
||||
|
||||
# EPUB chapter files are XHTML but are parsed below with BeautifulSoup's "lxml"
# (HTML) parser; silence the warning bs4 emits for that combination.
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)
|
||||
|
||||
|
||||
# Prefix -> namespace URI map passed to ElementTree find/findall/findtext so
# queries can be written as "opf:manifest", "dc:title", "ncx:navMap", ...
NS = dict(
    opf="http://www.idpf.org/2007/opf",
    dc="http://purl.org/dc/elements/1.1/",
    ncx="http://www.daisy.org/z3986/2005/ncx/",
    xhtml="http://www.w3.org/1999/xhtml",
)
|
||||
|
||||
|
||||
def _slugify(s: str) -> str:
|
||||
s = unicodedata.normalize("NFKD", s).encode("ascii", "ignore").decode("ascii")
|
||||
s = re.sub(r"[^a-zA-Z0-9]+", "-", s).strip("-").lower()
|
||||
return s or "book"
|
||||
|
||||
|
||||
def _normalise(text: str) -> str:
|
||||
# Collapse runs of whitespace, normalise smart quotes to plain ones.
|
||||
text = text.replace(" ", " ")
|
||||
text = re.sub(r"\s+", " ", text).strip()
|
||||
text = re.sub(r"\s+([.,;:!?…])", r"\1", text)
|
||||
text = re.sub(r"([¡¿])\s+", r"\1", text)
|
||||
return text
|
||||
|
||||
|
||||
def _read_zip_text(zf: zipfile.ZipFile, path: str) -> str:
|
||||
return zf.read(path).decode("utf-8")
|
||||
|
||||
|
||||
def _container_root(zf: zipfile.ZipFile) -> str:
    """Return the archive path of the package (OPF) document.

    The path is taken from the ``full-path`` attribute of the first
    ``<rootfile>`` element in ``META-INF/container.xml``.

    Raises:
        RuntimeError: If the container file has no ``<rootfile>`` element.
        KeyError: If the container file is missing from the archive or the
            element has no ``full-path`` attribute.
    """
    container_xml = _read_zip_text(zf, "META-INF/container.xml")
    tree = ET.fromstring(container_xml)
    entry = tree.find(".//{urn:oasis:names:tc:opendocument:xmlns:container}rootfile")
    if entry is not None:
        return entry.attrib["full-path"]
    raise RuntimeError("Missing rootfile entry in META-INF/container.xml")
|
||||
|
||||
|
||||
def _parse_opf(zf: zipfile.ZipFile, opf_path: str) -> dict:
    """Parse the package (OPF) document into the pieces the extractor needs.

    Args:
        zf: Open EPUB archive.
        opf_path: Archive path of the OPF file (as returned by
            ``_container_root``).

    Returns:
        A dict with these keys:
          * ``title`` / ``author`` / ``language``: text of the first
            ``dc:title`` / ``dc:creator`` / ``dc:language`` element, stripped;
            ``""`` when absent.
          * ``manifest``: manifest item ``id`` -> ``href`` exactly as written
            in the OPF (not yet resolved against ``opf_dir``).
          * ``spine``: manifest ids in reading order.
          * ``ncx_href``: manifest href of the NCX table of contents, or
            ``None`` if the package declares none.
          * ``opf_dir``: directory of the OPF inside the archive, using ``/``
            separators; ``""`` when the OPF sits at the archive root.

    Raises:
        KeyError: If a manifest ``<item>`` lacks ``id``/``href`` or a spine
            ``<itemref>`` lacks ``idref``.
    """
    root = ET.fromstring(_read_zip_text(zf, opf_path))

    def _metadata(tag: str) -> str:
        # First matching Dublin Core element anywhere in the document.
        return (root.findtext(f".//dc:{tag}", default="", namespaces=NS) or "").strip()

    manifest_items = root.findall("opf:manifest/opf:item", NS)
    manifest: dict[str, str] = {
        item.attrib["id"]: item.attrib["href"] for item in manifest_items
    }

    spine: list[str] = [
        itemref.attrib["idref"]
        for itemref in root.findall("opf:spine/opf:itemref", NS)
    ]

    # Preferred: <spine toc="ncx-id"> names the NCX's manifest id.
    spine_el = root.find("opf:spine", NS)
    ncx_id = spine_el.attrib.get("toc") if spine_el is not None else None
    ncx_href = manifest.get(ncx_id) if ncx_id else None
    if ncx_href is None:
        # Fallback: some packages ship an NCX but omit (or mistype) the toc
        # attribute; locate it by its registered media type instead.
        for item in manifest_items:
            if item.attrib.get("media-type") == "application/x-dtbncx+xml":
                ncx_href = item.attrib["href"]
                break

    return {
        "title": _metadata("title"),
        "author": _metadata("creator"),
        "language": _metadata("language"),
        "manifest": manifest,
        "spine": spine,
        "ncx_href": ncx_href,
        # Zip member names always use "/", so split the string directly; going
        # through pathlib.Path would yield backslashes on Windows.
        "opf_dir": opf_path.rpartition("/")[0],
    }
|
||||
|
||||
|
||||
def _parse_ncx(zf: zipfile.ZipFile, ncx_path: str) -> list[dict]:
    """Read the top-level table-of-contents entries from an NCX file.

    Only ``navPoint`` elements that are direct children of ``navMap`` are
    considered; nested navPoints are not visited.

    Returns:
        One ``{"title": ..., "href": ...}`` dict per entry, in document order.
        ``title`` is the stripped label text (``""`` if absent). ``href`` is
        the ``<content src>`` value with any ``#fragment`` removed (``""`` if
        the entry has no content element or no ``src``).
    """
    nav_root = ET.fromstring(_read_zip_text(zf, ncx_path))
    entries: list[dict] = []
    for point in nav_root.findall("ncx:navMap/ncx:navPoint", NS):
        label = point.findtext("ncx:navLabel/ncx:text", default="", namespaces=NS)
        target = point.find("ncx:content", NS)
        raw_src = "" if target is None else target.attrib.get("src", "")
        # Keep only the file part; the anchor is irrelevant for file grouping.
        file_part = raw_src.partition("#")[0]
        entries.append({"title": (label or "").strip(), "href": file_part})
    return entries
|
||||
|
||||
|
||||
def _resolve_zip_path(base_dir: str, href: str) -> str:
|
||||
if not base_dir:
|
||||
return href
|
||||
return f"{base_dir}/{href}".lstrip("/")
|
||||
|
||||
|
||||
def _extract_paragraphs(zf: zipfile.ZipFile, zip_path: str) -> list[str]:
    """Return the cleaned text of every non-empty ``<p>`` in one content file.

    Args:
        zf: Open EPUB archive.
        zip_path: Archive member name of an (X)HTML content document.

    Returns:
        Paragraph texts in document order, each passed through ``_normalise``.
        Paragraphs that are empty after normalisation are dropped. If
        ``zip_path`` is not in the archive, a warning is printed to stderr and
        an empty list is returned so the rest of the book can still be
        extracted.
    """
    try:
        html = _read_zip_text(zf, zip_path)
    except KeyError:
        # Best-effort: a dangling reference should not abort the whole run,
        # but it should not pass unnoticed either.
        print(f"Warning: {zip_path!r} not found in EPUB archive; skipping.", file=sys.stderr)
        return []

    soup = BeautifulSoup(html, "lxml")
    paragraphs: list[str] = []
    for p in soup.find_all("p"):
        # Inline children are joined with a space; _normalise then repairs the
        # stray spaces this leaves around punctuation.
        text = _normalise(p.get_text(" ", strip=True))
        if text:
            # Paragraphs that merely echo the chapter title are NOT filtered
            # here; this function has no access to the title.
            paragraphs.append(text)
    return paragraphs
|
||||
|
||||
|
||||
def _chapter_files(
|
||||
spine_files: list[str], chapter_hrefs: list[str]
|
||||
) -> list[list[str]]:
|
||||
"""Slice the spine into one list of files per chapter, using the chapter's
|
||||
first href as the chapter boundary. Files before the first chapter (e.g.
|
||||
cover, titlepage) are dropped."""
|
||||
boundaries: list[int] = []
|
||||
for href in chapter_hrefs:
|
||||
try:
|
||||
idx = spine_files.index(href)
|
||||
except ValueError:
|
||||
boundaries.append(-1)
|
||||
continue
|
||||
boundaries.append(idx)
|
||||
|
||||
ranges: list[list[str]] = []
|
||||
for i, start in enumerate(boundaries):
|
||||
if start < 0:
|
||||
ranges.append([])
|
||||
continue
|
||||
end = len(spine_files)
|
||||
for next_start in boundaries[i + 1:]:
|
||||
if next_start >= 0:
|
||||
end = next_start
|
||||
break
|
||||
ranges.append(spine_files[start:end])
|
||||
return ranges
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: read an EPUB, write ``chapters.json``.

    Parses the arguments (``epub``, ``--slug``, ``--out``), groups the EPUB's
    spine files into chapters using the NCX table of contents, extracts the
    paragraphs of each chapter, writes the result as UTF-8 JSON to
    ``<out>/chapters.json`` and prints a per-chapter summary.

    Exits with status 2 if the EPUB path does not exist and with status 3 if
    the package declares no NCX.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("epub", type=Path)
    parser.add_argument("--slug", default=None)
    parser.add_argument("--out", type=Path, default=None)
    args = parser.parse_args()

    if not args.epub.exists():
        print(f"EPUB not found: {args.epub}", file=sys.stderr)
        sys.exit(2)

    with zipfile.ZipFile(args.epub) as zf:
        opf_path = _container_root(zf)
        opf = _parse_opf(zf, opf_path)

        if not opf["ncx_href"]:
            print("No NCX found in spine; cannot derive chapter structure.", file=sys.stderr)
            sys.exit(3)

        ncx_path = _resolve_zip_path(opf["opf_dir"], opf["ncx_href"])
        toc = _parse_ncx(zf, ncx_path)

        # Manifest hrefs are relative to the OPF; NCX <content src> values are
        # relative to the NCX file itself, which may live in another folder.
        ncx_dir = ncx_path.rpartition("/")[0]
        spine_files = [
            _resolve_zip_path(opf["opf_dir"], opf["manifest"].get(idref, ""))
            for idref in opf["spine"]
        ]
        chapter_hrefs = [_resolve_zip_path(ncx_dir, c["href"]) for c in toc]
        chapter_file_ranges = _chapter_files(spine_files, chapter_hrefs)

        chapters_out: list[dict] = []
        for i, (meta, files) in enumerate(zip(toc, chapter_file_ranges), start=1):
            paragraphs: list[str] = []
            for f in files:
                paragraphs.extend(_extract_paragraphs(zf, f))

            # Drop leading paragraph(s) that just echo the chapter title; the
            # title is already stored separately.
            title_norm = _normalise(meta["title"]).lower()
            skip = 0
            while skip < len(paragraphs) and _normalise(paragraphs[skip]).lower() == title_norm:
                skip += 1

            chapters_out.append(
                {
                    "id": f"ch{i}",
                    "number": i,
                    "title": meta["title"],
                    "paragraphsES": paragraphs[skip:],
                }
            )

    # _slugify never returns an empty string (it falls back to "book"), so an
    # `or` chain after it can never reach the filename. Choose the text to
    # slugify up front: the title when present, otherwise the file's stem.
    slug = args.slug or _slugify(opf["title"] or args.epub.stem)
    out_dir = args.out or (Path("build") / slug)
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / "chapters.json"

    payload = {
        "title": opf["title"],
        "author": opf["author"],
        "language": opf["language"],
        "slug": slug,
        "chapters": chapters_out,
    }
    out_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")

    total_paragraphs = sum(len(c["paragraphsES"]) for c in chapters_out)
    print(f"Wrote {out_path}")
    print(f" Title: {opf['title']}")
    print(f" Author: {opf['author']}")
    print(f" Chapters: {len(chapters_out)}")
    print(f" Paragraphs: {total_paragraphs}")
    for ch in chapters_out:
        print(f" ch{ch['number']:02d} {len(ch['paragraphsES']):4d} ¶ {ch['title']}")
|
||||
|
||||
|
||||
# Run the extractor only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user