Add Books — read EPUB-imported books in Practice with tap-to-define

New "Books" row in the Practice tab opens a library of bundled bilingual books. Each chapter renders Spanish paragraph-by-paragraph; tap any word for a definition sheet (DictionaryService with on-device AI fallback), or toggle the toolbar button to swap to the pre-computed English translation inline. Local-only Book + BookChapter SwiftData models added to the local container schema (reset version bumped to 5). DataLoader.seedBooks walks the bundle for `book_*.json` resources, so future books drop in without touching app code — just bundle a new JSON and bump bookDataVersion. First book: Olly Richards' "Spanish Short Stories For Beginners Vol 2" — 13 chapters, 2,646 paragraphs, bilingual. Scripts/books/ is the repeatable pipeline for future EPUBs: extract_epub.py → translate_chapters.py (per-chapter resumable jobs) → bundle_book.py. Translation is done by parallel Claude Code subagents reading per-job input files and writing output files — no API key required, matching the pattern used for the textbook vocab vision pass. See Scripts/books/README.md for the full how-to. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-11 09:21:44 -05:00
parent ade091f108
commit 09e49bda2c
17 changed files with 6782 additions and 1 deletions
@@ -0,0 +1,258 @@
+#!/usr/bin/env python3
+"""Parse an EPUB into chapters.json for the in-app Books feature.
+
+Usage:
+    python3 extract_epub.py <epub_path> [--slug SLUG] [--out OUT_DIR]
+
+Defaults:
+    SLUG    derived from the title in the EPUB's metadata (lowercased, dashed)
+    OUT_DIR ./build/<slug>
+
+Output:
+    OUT_DIR/chapters.json
+        {
+          "title": "...",
+          "author": "...",
+          "language": "...",
+          "slug": "...",
+          "chapters": [
+            {"id": "ch1", "number": 1, "title": "Preface",
+             "paragraphsES": ["...", "..."]},
+            ...
+          ]
+        }
+
+How chapter grouping works:
+    1. Read content.opf manifest (id -> href) and spine (ordered idrefs).
+    2. Read toc.ncx navMap to get the ordered list of chapter (title, first-href).
+    3. For each chapter, claim every spine file from its first href up to (but
+       not including) the next chapter's first href.
+    4. For each file in the chapter's range, parse <p> elements, strip tags,
+       collapse whitespace, tidy spacing around punctuation, drop empties.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import sys
+import unicodedata
+import warnings
+import zipfile
+from pathlib import Path
+from typing import Iterable
+from xml.etree import ElementTree as ET
+
+from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning
+
# Chapter files are parsed with BeautifulSoup's "lxml" parser (see
# _extract_paragraphs). Silence bs4's XMLParsedAsHTMLWarning for the whole
# run -- presumably it fires because those files are XHTML; TODO confirm.
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)


# Prefix -> namespace URI map handed to ElementTree's find()/findall()/
# findtext() when querying the OPF package document ("opf", "dc") and the
# NCX table of contents ("ncx").
# NOTE(review): the "xhtml" prefix is not used by any query visible in this
# script.
NS = {
    "opf": "http://www.idpf.org/2007/opf",
    "dc": "http://purl.org/dc/elements/1.1/",
    "ncx": "http://www.daisy.org/z3986/2005/ncx/",
    "xhtml": "http://www.w3.org/1999/xhtml",
}
+
+
def _slugify(s: str) -> str:
    """Turn arbitrary text into a lowercase, dash-separated ASCII slug.

    The text is NFKD-decomposed and every character that cannot be encoded
    as ASCII is discarded, so accented letters keep only their base letter.
    Each run of characters outside [a-zA-Z0-9] becomes a single dash, and
    dashes at either end are removed.

    Returns:
        The slug, or "book" when nothing usable is left.
    """
    decomposed = unicodedata.normalize("NFKD", s)
    ascii_only = decomposed.encode("ascii", "ignore").decode("ascii")
    dashed = re.sub(r"[^a-zA-Z0-9]+", "-", ascii_only)
    slug = dashed.strip("-").lower()
    if not slug:
        return "book"
    return slug
+
+
def _normalise(text: str) -> str:
    """Tidy the whitespace of one piece of extracted text.

    Steps, in order:
      1. non-breaking spaces become ordinary spaces;
      2. every run of whitespace collapses to one space and the ends are
         trimmed;
      3. whitespace directly before . , ; : ! ? or an ellipsis is removed;
      4. whitespace directly after an inverted ¡ or ¿ is removed.

    Quotation marks -- including typographic ("smart") quotes -- are left
    exactly as they are.

    Args:
        text: raw text, e.g. the result of BeautifulSoup's get_text().

    Returns:
        The tidied text; "" when the input holds only whitespace.
    """
    # Spelled as an escape so the non-breaking space is visible in the source
    # and cannot be silently turned into a plain space by an editor.
    text = text.replace("\u00a0", " ")
    text = re.sub(r"\s+", " ", text).strip()
    # Steps 3 and 4 undo stray gaps around punctuation, such as those left
    # when tag boundaries are joined with a space separator.
    text = re.sub(r"\s+([.,;:!?…])", r"\1", text)
    text = re.sub(r"([¡¿])\s+", r"\1", text)
    return text
+
+
def _read_zip_text(zf: zipfile.ZipFile, path: str) -> str:
    """Return the archive member *path* decoded as UTF-8 text.

    A leading UTF-8 byte-order mark is dropped. Left in place, the BOM would
    end up as U+FEFF at the start of the string, and ElementTree refuses to
    parse a string whose XML declaration is not at the very beginning.

    Args:
        zf: the opened archive.
        path: name of the member inside the archive.

    Raises:
        KeyError: *path* is not a member of the archive.
        UnicodeDecodeError: the member is not valid UTF-8.
    """
    # "utf-8-sig" is plain UTF-8 that additionally skips one leading BOM.
    return zf.read(path).decode("utf-8-sig")
+
+
def _container_root(zf: zipfile.ZipFile) -> str:
    """Return the archive path of the package (OPF) document.

    Reads META-INF/container.xml and returns the "full-path" attribute of the
    first <rootfile> element found anywhere in it.

    Raises:
        RuntimeError: container.xml has no <rootfile> element.
        KeyError: container.xml is missing from the archive, or the
            <rootfile> element has no "full-path" attribute.
    """
    container_xml = _read_zip_text(zf, "META-INF/container.xml")
    tree = ET.fromstring(container_xml)
    node = tree.find(".//{urn:oasis:names:tc:opendocument:xmlns:container}rootfile")
    if node is not None:
        return node.attrib["full-path"]
    raise RuntimeError("Missing rootfile entry in META-INF/container.xml")
+
+
def _parse_opf(zf: zipfile.ZipFile, opf_path: str) -> dict:
    """Read the OPF package document and collect what the extractor needs.

    Args:
        zf: the opened EPUB archive.
        opf_path: archive path of the OPF file.

    Returns:
        A dict with these keys:
          "title", "author", "language": stripped text of the first
              dc:title / dc:creator / dc:language element ("" when absent);
          "manifest": item id -> href, hrefs exactly as written in the OPF;
          "spine": the itemref idrefs, in document order;
          "ncx_href": manifest href of the NCX table of contents, or None
              when the package does not reference one;
          "opf_dir": archive directory holding the OPF ("" at the root).

    Raises:
        KeyError: *opf_path* is not in the archive, a manifest <item> lacks
            "id" or "href", or a spine <itemref> lacks "idref".
        xml.etree.ElementTree.ParseError: the OPF is not well-formed XML.
    """
    root = ET.fromstring(_read_zip_text(zf, opf_path))

    def _meta(tag: str) -> str:
        # Stripped text of the first matching Dublin Core element, or "".
        return (root.findtext(f".//dc:{tag}", default="", namespaces=NS) or "").strip()

    manifest: dict[str, str] = {}
    ncx_by_media_type = None
    for item in root.findall("opf:manifest/opf:item", NS):
        manifest[item.attrib["id"]] = item.attrib["href"]
        # Remember the first item declared with the NCX media type; it is
        # used below when the spine does not name the NCX itself.
        if (
            ncx_by_media_type is None
            and item.attrib.get("media-type") == "application/x-dtbncx+xml"
        ):
            ncx_by_media_type = item.attrib["href"]

    spine_el = root.find("opf:spine", NS)
    spine: list[str] = [
        itemref.attrib["idref"]
        for itemref in root.findall("opf:spine/opf:itemref", NS)
    ]

    # Preferred: <spine toc="..."> names the manifest id of the NCX.
    ncx_id = spine_el.attrib.get("toc") if spine_el is not None else None
    ncx_href = manifest.get(ncx_id) if ncx_id else None
    if not ncx_href:
        # The toc attribute is missing or dangling: fall back to the item
        # whose media type marks it as an NCX.
        ncx_href = ncx_by_media_type

    return {
        "title": _meta("title"),
        "author": _meta("creator"),
        "language": _meta("language"),
        "manifest": manifest,
        "spine": spine,
        "ncx_href": ncx_href,
        # Archive member names always use "/", so split on that rather than
        # going through the OS-specific Path class (which would produce
        # backslashes on Windows for a nested OPF location).
        "opf_dir": opf_path.rpartition("/")[0],
    }
+
+
def _parse_ncx(zf: zipfile.ZipFile, ncx_path: str) -> list[dict]:
    """List the top-level entries of the NCX table of contents.

    Only navPoint elements that are direct children of navMap are read;
    nested navPoints are not visited.

    Returns:
        One {"title": ..., "href": ...} dict per entry, in document order.
        "title" is the stripped label text ("" when absent). "href" is the
        content src with any "#fragment" removed ("" when the entry has no
        content element or no src).
    """
    root = ET.fromstring(_read_zip_text(zf, ncx_path))

    def _entry(nav_point) -> dict:
        label = nav_point.findtext("ncx:navLabel/ncx:text", default="", namespaces=NS)
        target = nav_point.find("ncx:content", NS)
        src = "" if target is None else target.attrib.get("src", "")
        # Only the file part is wanted; drop the in-file anchor.
        return {"title": (label or "").strip(), "href": src.partition("#")[0]}

    return [_entry(nav_point) for nav_point in root.findall("ncx:navMap/ncx:navPoint", NS)]
+
+
def _resolve_zip_path(base_dir: str, href: str) -> str:
    """Turn an href found in the OPF/NCX into an archive member name.

    Hrefs are URL references relative to *base_dir*, whereas archive member
    names are literal paths. To make the two comparable the href is
    percent-decoded ("chapter%201.xhtml" -> "chapter 1.xhtml"), joined onto
    *base_dir*, and "." / ".." segments are collapsed.

    Args:
        base_dir: archive directory the href is relative to ("" for the
            archive root).
        href: the reference as written in the document, without a fragment.

    Returns:
        The "/"-separated member name with no leading slash, or "" when
        *href* is empty (so a missing reference never resolves to a
        directory-like path).
    """
    # Function-scope imports keep this helper self-contained.
    import posixpath
    from urllib.parse import unquote

    if not href:
        return ""
    decoded = unquote(href)
    joined = posixpath.join(base_dir, decoded) if base_dir else decoded
    # Member names never start with "/"; strip any the join left behind.
    return posixpath.normpath(joined).lstrip("/")
+
+
def _extract_paragraphs(zf: zipfile.ZipFile, zip_path: str) -> list[str]:
    """Return the normalised text of every non-empty <p> in one archive file.

    Each <p> element's text is gathered with a single space between its text
    fragments, passed through _normalise, and kept only if something remains.
    No title or heading filtering happens here.

    Args:
        zf: the opened EPUB archive.
        zip_path: archive member name of the content file.

    Returns:
        The paragraphs in document order. An empty list when the file has no
        usable <p> text or is not in the archive; the latter is reported on
        stderr and otherwise ignored.
    """
    try:
        html = _read_zip_text(zf, zip_path)
    except KeyError:
        # Best effort: one dangling spine/TOC reference should not abort the
        # whole extraction, but the gap should not go unnoticed either.
        print(
            f"Warning: {zip_path!r} is not in the EPUB archive; skipping.",
            file=sys.stderr,
        )
        return []
    soup = BeautifulSoup(html, "lxml")
    paragraphs: list[str] = []
    for p in soup.find_all("p"):
        text = _normalise(p.get_text(" ", strip=True))
        # Paragraphs that contain no text after normalisation are dropped.
        if text:
            paragraphs.append(text)
    return paragraphs
+
+
def _chapter_files(
    spine_files: list[str], chapter_hrefs: list[str]
) -> list[list[str]]:
    """Partition the spine into one run of files per chapter.

    A chapter starts at the first spine position of its href and runs up to,
    but not including, the start of the next chapter that could be located
    in the spine (or to the end of the spine when there is none).

    Consequences worth knowing:
      * spine files ahead of the first chapter belong to no chapter;
      * a chapter whose href is not in the spine gets an empty list;
      * when the next located chapter starts at the same or an earlier spine
        position, this chapter's list is empty.

    Returns:
        A list parallel to *chapter_hrefs*.
    """
    # Spine position of each chapter's first file; None when it is absent.
    starts = [
        spine_files.index(href) if href in spine_files else None
        for href in chapter_hrefs
    ]
    spine_len = len(spine_files)

    sliced: list[list[str]] = []
    for pos, begin in enumerate(starts):
        if begin is None:
            sliced.append([])
            continue
        # The next chapter that was actually located bounds this one.
        stop = next((s for s in starts[pos + 1:] if s is not None), spine_len)
        sliced.append(spine_files[begin:stop])
    return sliced
+
+
def main() -> None:
    """Command-line entry point: convert one EPUB into chapters.json.

    Parses the arguments (epub path, optional --slug and --out), reads the
    package document and the NCX table of contents, groups the spine files
    into chapters, extracts each chapter's paragraphs, and writes
    <out>/chapters.json. A per-chapter summary is printed to stdout.

    The slug is --slug if given, otherwise the slugified book title, or the
    slugified EPUB filename stem when the book has no title. The output
    directory is --out if given, otherwise build/<slug>.

    Exits with status 2 when the EPUB path does not exist and with status 3
    when the package references no NCX.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("epub", type=Path)
    parser.add_argument("--slug", default=None)
    parser.add_argument("--out", type=Path, default=None)
    args = parser.parse_args()

    if not args.epub.exists():
        print(f"EPUB not found: {args.epub}", file=sys.stderr)
        sys.exit(2)

    with zipfile.ZipFile(args.epub) as zf:
        opf_path = _container_root(zf)
        opf = _parse_opf(zf, opf_path)

        if not opf["ncx_href"]:
            print("No NCX found in spine; cannot derive chapter structure.", file=sys.stderr)
            sys.exit(3)

        ncx_path = _resolve_zip_path(opf["opf_dir"], opf["ncx_href"])
        toc = _parse_ncx(zf, ncx_path)

        # Both lists hold archive member names so they can be compared
        # directly when locating chapter boundaries in the spine.
        spine_files = [
            _resolve_zip_path(opf["opf_dir"], opf["manifest"].get(idref, ""))
            for idref in opf["spine"]
        ]
        chapter_hrefs = [_resolve_zip_path(opf["opf_dir"], c["href"]) for c in toc]
        chapter_file_ranges = _chapter_files(spine_files, chapter_hrefs)

        chapters_out: list[dict] = []
        for i, (meta, files) in enumerate(zip(toc, chapter_file_ranges), start=1):
            paragraphs: list[str] = []
            for f in files:
                paragraphs.extend(_extract_paragraphs(zf, f))
            # Drop leading paragraph(s) that just echo the chapter title -- the
            # title is already stored separately.
            title_norm = _normalise(meta["title"]).lower()
            while paragraphs and _normalise(paragraphs[0]).lower() == title_norm:
                paragraphs.pop(0)
            chapters_out.append(
                {
                    "id": f"ch{i}",
                    "number": i,
                    "title": meta["title"],
                    "paragraphsES": paragraphs,
                }
            )

    # _slugify never returns an empty string (it substitutes "book"), so the
    # filename fallback has to be chosen before slugifying, not after.
    slug = args.slug or _slugify(opf["title"] or args.epub.stem)
    out_dir = args.out or (Path("build") / slug)
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / "chapters.json"

    payload = {
        "title": opf["title"],
        "author": opf["author"],
        "language": opf["language"],
        "slug": slug,
        "chapters": chapters_out,
    }
    # ensure_ascii=False keeps accented characters readable in the JSON file.
    out_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")

    total_paragraphs = sum(len(c["paragraphsES"]) for c in chapters_out)
    print(f"Wrote {out_path}")
    print(f"  Title:      {opf['title']}")
    print(f"  Author:     {opf['author']}")
    print(f"  Chapters:   {len(chapters_out)}")
    print(f"  Paragraphs: {total_paragraphs}")
    for ch in chapters_out:
        print(f"    ch{ch['number']:02d}  {len(ch['paragraphsES']):4d} ¶  {ch['title']}")
+
+
# Run the extractor only when this file is executed as a script, so that
# importing the module has no side effects beyond its definitions.
if __name__ == "__main__":
    main()