Files
Spanish/Conjuga/Scripts/textbook/extract_chapters.py
Trey T 63dfc5e41a Add textbook reader, exercise grading, stem-change toggle, extraction pipeline
Major changes:
- Textbook UI: chapter list, reader, and interactive exercise view (keyboard
  + Apple Pencil) surfaced under the Course tab. 30 chapters, 251 exercises.
- Stem-change conjugation toggle on Week 4 flashcard decks (E-IE, E-I, O-UE).
  Uses existing VerbForm + IrregularSpan data to render highlighted present
  tense conjugations inline.
- Deterministic on-device answer grader with partial credit (correct / close
  for accent-stripped or single-char-typo / wrong). 11 unit tests cover it.
- SharedModels: TextbookChapter (local), TextbookExerciseAttempt (cloud-
  synced), AnswerGrader helpers. Bumped schema.
- DataLoader: textbook seeder (version 8) + refresh helpers that preserve
  LanGo course decks when textbook data is re-seeded.
- Local extraction pipeline in Conjuga/Scripts/textbook/ — XHTML chapter
  parser, answer-key parser, macOS Vision image OCR + PDF page OCR, merger,
  NSSpellChecker validator, language-aware auto-fixer, and repair pass that
  re-pairs quarantined vocab rows using bounding-box coordinates.
- UI test target (ConjugaUITests) with three tests: end-to-end textbook
  flow, all-chapters screenshot audit, and stem-change toggle verification.

Generated textbook content (textbook_data.json, textbook_vocab.json) and
third-party source files are gitignored — re-run Scripts/textbook/run_pipeline.sh
locally to regenerate.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 15:12:55 -05:00

370 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""Parse all chapter XHTMLs + appendix into structured chapters.json.
Output schema:
{
"chapters": [
{
"id": "ch1",
"number": 1,
"title": "Nouns, Articles, and Adjectives",
"part": 1, # part 1/2/3 or null
"blocks": [ # ordered content
{"kind": "heading", "level": 3, "text": "..."},
{"kind": "paragraph", "text": "...", "hasItalic": false},
{"kind": "key_vocab_header", "title": "Los colores (The colors)"},
{"kind": "vocab_image", "src": "f0010-03.jpg"},
{
"kind": "exercise",
"id": "1.1",
"ans_anchor": "ch1ans1",
"instruction": "Write the appropriate...",
"image_refs": ["f0005-02.jpg"]
},
{"kind": "image", "src": "...", "alt": "..."}
]
}
]
}
"""
import json
import re
from pathlib import Path
from bs4 import BeautifulSoup
# Content root of the unpacked EPUB. parents[3] climbs textbook/ -> Scripts/
# -> Conjuga/ -> the directory that is expected to contain epub_extract/
# (per the repo path Spanish/Conjuga/Scripts/textbook/, that is Spanish/) —
# TODO confirm the epub_extract/ folder actually lives at that level.
ROOT = Path(__file__).resolve().parents[3] / "epub_extract" / "OEBPS"
# Output JSON lands next to this script.
OUT = Path(__file__).resolve().parent / "chapters.json"
# Common icon images embedded in headings — ignore when collecting content images
ICON_IMAGES = {"Common01.jpg", "Common02.jpg", "Common03.jpg", "Common04.jpg", "Common05.jpg"}
# Matches "Exercise 1.2" in a heading and captures the dotted id ("1.2").
EXERCISE_ID_RE = re.compile(r"Exercise\s+([0-9]+\.[0-9]+)")
# Matches answer-key anchors like "ch1ans3" (captures chapter no., answer no.).
ANS_REF_RE = re.compile(r"ch(\d+)ans(\d+)")
def clean_text(el) -> str:
    """Return the plain text of *el* with whitespace/punctuation normalized.

    All inline emphasis is dropped: the source nests and adjoins em/strong
    in patterns CommonMark can't reliably parse, which leaked markers into
    the UI. Plain text renders cleanly everywhere.

    Returns "" when el is None.
    """
    if el is None:
        return ""
    # Re-parse the element in isolation so unwrap/decompose below can't
    # mutate the caller's tree.
    soup = BeautifulSoup(str(el), "lxml")
    # Drop ALL inline emphasis outright. (An earlier revision pre-flattened
    # nested <strong><em> pairs first; that was dead work once everything is
    # unwrapped anyway, so it has been removed.)
    for tag in soup.find_all(["em", "i", "strong", "b"]):
        tag.unwrap()
    # Drop pagebreak spans
    for tag in soup.find_all("span", attrs={"epub:type": "pagebreak"}):
        tag.decompose()
    # Replace <br/> with newline
    for br in soup.find_all("br"):
        br.replace_with("\n")
    # Use a separator so adjacent inline tags don't concatenate without spaces
    # (e.g. "<strong><em>Ir</em></strong> and" would otherwise become "Irand").
    text = soup.get_text(separator=" ", strip=False)
    # Collapse runs of whitespace first.
    text = re.sub(r"\s+", " ", text).strip()
    # Strip any stray asterisks that sneak through (e.g. author's literal *).
    text = text.replace("*", "")
    # De-space punctuation
    text = re.sub(r"\s+([,.;:!?])", r"\1", text)
    # Tighten brackets that picked up separator-spaces: "( foo )" -> "(foo)"
    text = re.sub(r"([(\[])\s+", r"\1", text)
    text = re.sub(r"\s+([)\]])", r"\1", text)
    # Collapse any double-spaces
    text = re.sub(r" +", " ", text).strip()
    return text
def is_exercise_header(h) -> bool:
    """True for a heading whose <a> link points into ans.xhtml.

    Exercise headers are h3.h3k in chapters 1-16 and h4.h4 in chapters 17+.
    """
    if h.name not in ("h3", "h4"):
        return False
    anchor = h.find("a", href=True)
    return bool(anchor and "ans.xhtml" in anchor["href"])
def is_key_vocab_header(h) -> bool:
    """True for a 'Key Vocabulary' heading that carries no answer-key link."""
    if h.name not in ("h3", "h4"):
        return False
    links_to_answers = h.find("a", href=lambda v: v and "ans.xhtml" in v)
    return "Key Vocabulary" in h.get_text(strip=True) and not links_to_answers
def extract_image_srcs(parent) -> list:
    """Return the src attributes of images under *parent*, minus icon images."""
    return [
        src
        for img in parent.find_all("img")
        if (src := img.get("src", "")) and Path(src).name not in ICON_IMAGES
    ]
def parse_chapter(path: Path) -> "dict | None":
    """Parse one chapter XHTML file into structured blocks.

    Returns {"id", "number", "title", "blocks"} where blocks is an ordered
    list of dicts with kinds: heading, paragraph, exercise,
    key_vocab_header, vocab_image. Returns None when the file has no <body>.
    """
    html = path.read_text(encoding="utf-8")
    soup = BeautifulSoup(html, "lxml")
    body = soup.find("body")
    if body is None:
        return None
    # Chapter number + title
    number = None
    title = ""
    h2s = body.find_all("h2")
    for h2 in h2s:
        classes = h2.get("class") or []
        # Use a separator so consecutive inline tags don't concatenate
        # (e.g. "<strong><em>Ir</em></strong> and the Future" → "Ir and the Future")
        text_with_sep = re.sub(r"\s+", " ", h2.get_text(" ", strip=True))
        # Strip spaces that were inserted before punctuation
        text_with_sep = re.sub(r"\s+([,.;:!?])", r"\1", text_with_sep).strip()
        if "h2c" in classes and text_with_sep.isdigit():
            number = int(text_with_sep)
        # Chapters 1-16 use h2c1; chapters 17+ use h2-c
        elif ("h2c1" in classes or "h2-c" in classes) and not title:
            title = text_with_sep
    if number is None:
        # Fallback: derive the number from the header's id attribute (ch1 → 1)
        for h2 in h2s:
            id_ = h2.get("id", "")
            m = re.match(r"ch(\d+)", id_)
            if m:
                number = int(m.group(1))
                break
    chapter_id = path.stem  # ch1, ch2, ...
    # Walk section content in document order
    section = body.find("section") or body
    blocks: list = []
    # Exercise block currently collecting content (instruction, prompts,
    # images). Any heading resets it to None.
    pending_instruction = None
    for el in section.descendants:
        if el.name is None:
            continue
        classes = el.get("class") or []
        # We operate only on h2/h3/h4/h5/p elements; nested inline tags are
        # captured when their parent heading/paragraph is processed.
        if el.name not in ("h2", "h3", "h4", "h5", "p"):
            continue
        # Exercise header detection (h3 in ch1-16, h4 in ch17+)
        if is_exercise_header(el):
            a = el.find("a", href=True)
            href = a["href"] if a else ""
            m = EXERCISE_ID_RE.search(el.get_text())
            ex_id = m.group(1) if m else ""
            anchor_m = ANS_REF_RE.search(href)
            ans_anchor = anchor_m.group(0) if anchor_m else ""
            blocks.append({
                "kind": "exercise",
                "id": ex_id,
                "ans_anchor": ans_anchor,
                "instruction": "",
                "image_refs": [],
                "prompts": []
            })
            # Subsequent paragraphs feed this exercise until the next heading.
            pending_instruction = blocks[-1]
            continue
        # Key Vocabulary header
        if is_key_vocab_header(el):
            blocks.append({"kind": "key_vocab_header", "title": "Key Vocabulary"})
            pending_instruction = None
            continue
        # Other headings
        if el.name in ("h2", "h3", "h4", "h5"):
            if el.name == "h2":
                # Skip the chapter-number/chapter-title h2s we already captured
                continue
            txt = clean_text(el)
            if txt:
                blocks.append({
                    "kind": "heading",
                    "level": int(el.name[1]),
                    "text": txt,
                })
            # A heading terminates any exercise that was collecting content.
            pending_instruction = None
            continue
        # Paragraphs — note the branch ORDER below matters: prompt class,
        # then image-only, then first instruction, then extra text.
        if el.name == "p":
            imgs = extract_image_srcs(el)
            text = clean_text(el)
            p_classes = set(classes)
            # Skip pure blank-line class ("nump" = underscore lines under number prompts)
            if p_classes & {"nump", "numpa"} and not text:
                continue
            # Exercise prompt: <p class="number">1. Prompt text</p>
            # Also number1, number2 (continuation numbering), numbera, numbert
            if pending_instruction is not None and p_classes & {"number", "number1", "number2", "numbera", "numbert"}:
                if text:
                    pending_instruction["prompts"].append(text)
                continue
            # Image container for a pending exercise
            if pending_instruction is not None and imgs and not text:
                pending_instruction["image_refs"].extend(imgs)
                continue
            # Instruction line right after the exercise header (first text
            # paragraph wins; later ones fall through to "extra" below)
            if pending_instruction is not None and text and not imgs and not pending_instruction["instruction"]:
                pending_instruction["instruction"] = text
                continue
            # While in pending-exercise state, extra text paragraphs are word
            # banks / context ("from the following list:" etc) — keep pending alive.
            if pending_instruction is not None and text and not imgs:
                pending_instruction.setdefault("extra", []).append(text)
                continue
            # Paragraphs that contain an image belong to vocab/key-vocab callouts
            if imgs and not text:
                for src in imgs:
                    blocks.append({"kind": "vocab_image", "src": src})
                continue
            # Mixed paragraph: image with caption
            if imgs and text:
                for src in imgs:
                    blocks.append({"kind": "vocab_image", "src": src})
                blocks.append({"kind": "paragraph", "text": text})
                continue
            # Plain paragraph — outside any exercise
            if text:
                blocks.append({"kind": "paragraph", "text": text})
    return {
        "id": chapter_id,
        "number": number,
        "title": title,
        "blocks": blocks,
    }
def assign_parts(chapters: list, part_files: "dict[int, list[int]]") -> None:
    """Annotate each chapter dict in place with its part number.

    Chapters not listed in *part_files* get part=None (unless they already
    carry a "part" key).
    """
    # Invert part -> [chapter numbers] into chapter number -> part for O(1)
    # lookups; a chapter listed under several parts keeps the last one, the
    # same outcome as iterating the mapping in order.
    part_of = {num: part for part, nums in part_files.items() for num in nums}
    for ch in chapters:
        part = part_of.get(ch["number"])
        if part is not None:
            ch["part"] = part
        else:
            ch.setdefault("part", None)
def _group_chapters_by_part(hrefs) -> "dict[int, list[int]]":
    """Group ch<N>.xhtml hrefs under the most recently seen part<N>.xhtml."""
    memberships: "dict[int, list[int]]" = {}
    current_part: "int | None" = None
    for href in hrefs:
        m_part = re.match(r"part(\d+)\.xhtml", href)
        m_ch = re.match(r"ch(\d+)\.xhtml", href)
        if m_part:
            current_part = int(m_part.group(1))
            memberships.setdefault(current_part, [])
        elif m_ch and current_part is not None:
            memberships[current_part].append(int(m_ch.group(1)))
    return memberships


def read_part_memberships() -> "dict[int, list[int]]":
    """Derive part→chapter grouping from the OPF, preferring spine order.

    Uses the <spine> reading order when present; falls back to manifest
    order otherwise (they tend to match for this book). Returns {} when
    no .opf file is found under ROOT.
    """
    opf = next(ROOT.glob("*.opf"), None)
    if opf is None:
        return {}
    soup = BeautifulSoup(opf.read_text(encoding="utf-8"), "xml")
    items = soup.find_all("item")
    spine = soup.find("spine")
    if spine is not None:
        # Map manifest ids to hrefs once, then resolve the spine's itemrefs.
        # (A per-itemref soup.find() search here would be quadratic.)
        href_by_id = {item.get("id"): item.get("href", "") for item in items}
        hrefs = [
            href_by_id[ref.get("idref")]
            for ref in spine.find_all("itemref")
            if ref.get("idref") in href_by_id
        ]
    else:
        # No spine: fall back to manifest order.
        hrefs = [item.get("href", "") for item in items]
    return _group_chapters_by_part(hrefs)
def main() -> None:
    """Parse every chapter file, attach part numbers, write chapters.json."""
    # Pair each file with its parsed chapter number, skipping any glob hit
    # whose stem doesn't start with "ch<digits>" — the previous sort key
    # called .group(1) on a possibly-None match and could crash on a stray
    # file like "chaos.xhtml".
    numbered = []
    for path in ROOT.glob("ch*.xhtml"):
        m = re.match(r"ch(\d+)", path.stem)
        if m:
            numbered.append((int(m.group(1)), path))
    chapter_files = [path for _, path in sorted(numbered)]
    chapters = []
    for path in chapter_files:
        ch = parse_chapter(path)
        if ch:
            chapters.append(ch)
    part_memberships = read_part_memberships()
    assign_parts(chapters, part_memberships)
    out = {
        "chapters": chapters,
        "part_memberships": part_memberships,
    }
    OUT.write_text(json.dumps(out, ensure_ascii=False, indent=2))
    _print_summary(chapters, part_memberships)


def _print_summary(chapters: list, part_memberships: dict) -> None:
    """Print block-count statistics as a quick sanity check of the output."""

    def count(pred) -> int:
        # Count blocks across all chapters matching *pred*.
        return sum(1 for ch in chapters for b in ch["blocks"] if pred(b))

    ex_total = count(lambda b: b["kind"] == "exercise")
    ex_with_prompts = count(lambda b: b["kind"] == "exercise" and b["prompts"])
    ex_with_images = count(lambda b: b["kind"] == "exercise" and b["image_refs"])
    ex_empty = count(
        lambda b: b["kind"] == "exercise" and not b["prompts"] and not b["image_refs"]
    )
    para_total = count(lambda b: b["kind"] == "paragraph")
    vocab_img_total = count(lambda b: b["kind"] == "vocab_image")
    print(f"Chapters: {len(chapters)}")
    print(f"Exercises total: {ex_total}")
    print(f" with text prompts: {ex_with_prompts}")
    print(f" with image prompts: {ex_with_images}")
    print(f" empty: {ex_empty}")
    print(f"Paragraphs: {para_total}")
    print(f"Vocab images: {vocab_img_total}")
    print(f"Parts: {part_memberships}")
    print(f"Wrote {OUT}")


if __name__ == "__main__":
    main()