Add textbook reader, exercise grading, stem-change toggle, extraction pipeline

Major changes:
- Textbook UI: chapter list, reader, and interactive exercise view (keyboard
  + Apple Pencil) surfaced under the Course tab. 30 chapters, 251 exercises.
- Stem-change conjugation toggle on Week 4 flashcard decks (E-IE, E-I, O-UE).
  Uses existing VerbForm + IrregularSpan data to render highlighted present
  tense conjugations inline.
- Deterministic on-device answer grader with partial credit (correct / close
  for accent-stripped or single-char-typo / wrong). 11 unit tests cover it.
- SharedModels: TextbookChapter (local), TextbookExerciseAttempt (cloud-
  synced), AnswerGrader helpers. Bumped schema.
- DataLoader: textbook seeder (version 8) + refresh helpers that preserve
  LanGo course decks when textbook data is re-seeded.
- Local extraction pipeline in Conjuga/Scripts/textbook/ — XHTML chapter
  parser, answer-key parser, macOS Vision image OCR + PDF page OCR, merger,
  NSSpellChecker validator, language-aware auto-fixer, and repair pass that
  re-pairs quarantined vocab rows using bounding-box coordinates.
- UI test target (ConjugaUITests) with three tests: end-to-end textbook
  flow, all-chapters screenshot audit, and stem-change toggle verification.

Generated textbook content (textbook_data.json, textbook_vocab.json) and
third-party source files are gitignored — re-run Scripts/textbook/run_pipeline.sh
locally to regenerate.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Trey T
2026-04-19 15:12:55 -05:00
parent 5ba76a947b
commit 63dfc5e41a
34 changed files with 4516 additions and 61 deletions

View File

@@ -0,0 +1,369 @@
#!/usr/bin/env python3
"""Parse all chapter XHTMLs + appendix into structured chapters.json.
Output schema:
{
"chapters": [
{
"id": "ch1",
"number": 1,
"title": "Nouns, Articles, and Adjectives",
"part": 1, # part 1/2/3 or null
"blocks": [ # ordered content
{"kind": "heading", "level": 3, "text": "..."},
{"kind": "paragraph", "text": "...", "hasItalic": false},
{"kind": "key_vocab_header", "title": "Los colores (The colors)"},
{"kind": "vocab_image", "src": "f0010-03.jpg"},
{
"kind": "exercise",
"id": "1.1",
"ans_anchor": "ch1ans1",
"instruction": "Write the appropriate...",
"image_refs": ["f0005-02.jpg"]
},
{"kind": "image", "src": "...", "alt": "..."}
]
}
]
}
"""
import json
import re
from pathlib import Path
from bs4 import BeautifulSoup
# Root of the unpacked EPUB content; output lands next to this script.
ROOT = Path(__file__).resolve().parents[3] / "epub_extract" / "OEBPS"
OUT = Path(__file__).resolve().parent / "chapters.json"
# Common icon images embedded in headings — ignore when collecting content images
ICON_IMAGES = {"Common01.jpg", "Common02.jpg", "Common03.jpg", "Common04.jpg", "Common05.jpg"}
# Matches heading text like "Exercise 1.2", capturing the dotted exercise id.
EXERCISE_ID_RE = re.compile(r"Exercise\s+([0-9]+\.[0-9]+)")
# Matches answer-key anchors like "ch1ans3" (chapter number, answer number).
ANS_REF_RE = re.compile(r"ch(\d+)ans(\d+)")
def clean_text(el) -> str:
    """Return the plain text of *el*, whitespace- and punctuation-normalized.

    All inline emphasis (<em>/<i>/<strong>/<b>) is dropped entirely: the
    source has nested/sibling emphasis patterns that CommonMark can't
    reliably parse, causing markers to leak into the UI. Plain text renders
    cleanly everywhere.

    Returns "" when *el* is None.
    """
    if el is None:
        return ""
    html = str(el)
    soup = BeautifulSoup(html, "lxml")
    # Unwrap every inline emphasis tag in one pass. (An earlier revision
    # first "flattened" nested <strong><em> pairs, but that was redundant:
    # unwrap preserves the child text nodes, so unconditionally unwrapping
    # all four tag names yields the same text regardless of nesting.)
    for tag in soup.find_all(["em", "i", "strong", "b"]):
        tag.unwrap()
    # Drop pagebreak spans entirely — they carry no reader-visible text.
    for tag in soup.find_all("span", attrs={"epub:type": "pagebreak"}):
        tag.decompose()
    # Replace <br/> with newline; the whitespace collapse below turns it
    # into a single space, keeping the adjacent words separated.
    for br in soup.find_all("br"):
        br.replace_with("\n")
    # Use a separator so adjacent inline tags don't concatenate without spaces
    # (e.g. "<strong><em>Ir</em></strong> and" would otherwise become "Irand").
    text = soup.get_text(separator=" ", strip=False)
    # Collapse runs of whitespace first.
    text = re.sub(r"\s+", " ", text).strip()
    # Strip any stray asterisks that sneak through (e.g. author's literal *).
    text = text.replace("*", "")
    # De-space punctuation
    text = re.sub(r"\s+([,.;:!?])", r"\1", text)
    # Tighten brackets that picked up separator-spaces: "( foo )" -> "(foo)"
    text = re.sub(r"([(\[])\s+", r"\1", text)
    text = re.sub(r"\s+([)\]])", r"\1", text)
    # Collapse any double-spaces
    text = re.sub(r" +", " ", text).strip()
    return text
def is_exercise_header(h) -> bool:
    """True for a heading that links into the answer key (an exercise).

    Such headings contain an <a href='ans.xhtml#...'>Exercise N.N</a> link.
    Chapters 1-16 use h3.h3k; chapters 17+ use h4.h4.
    """
    if h.name not in ("h3", "h4"):
        return False
    anchor = h.find("a", href=True)
    return anchor is not None and "ans.xhtml" in anchor["href"]
def is_key_vocab_header(h) -> bool:
    """True for a 'Key Vocabulary' heading with no answer-key anchor link."""
    if h.name not in ("h3", "h4"):
        return False
    heading_text = h.get_text(strip=True)
    answer_link = h.find("a", href=lambda v: v and "ans.xhtml" in v)
    # Keep bs4 truthiness on the found tag (an empty anchor is falsy).
    return "Key Vocabulary" in heading_text and not answer_link
def extract_image_srcs(parent) -> list:
    """Collect <img> src attributes under *parent*, skipping icon images."""
    return [
        src
        for img in parent.find_all("img")
        if (src := img.get("src", "")) and Path(src).name not in ICON_IMAGES
    ]
def parse_chapter(path: Path) -> "dict | None":
    """Parse one chapter file into structured blocks.

    Returns {"id", "number", "title", "blocks"} per the module-docstring
    schema, or None when the file has no <body>. Blocks are emitted in
    document order; paragraphs following an exercise header are folded into
    that exercise until a non-exercise heading resets the state.
    """
    html = path.read_text(encoding="utf-8")
    soup = BeautifulSoup(html, "lxml")
    body = soup.find("body")
    if body is None:
        return None
    # Chapter number + title — both come from h2 elements near the top.
    number = None
    title = ""
    h2s = body.find_all("h2")
    for h2 in h2s:
        classes = h2.get("class") or []
        # Use a separator so consecutive inline tags don't concatenate
        # (e.g. "<strong><em>Ir</em></strong> and the Future" → "Ir and the Future")
        text_with_sep = re.sub(r"\s+", " ", h2.get_text(" ", strip=True))
        # Strip spaces that were inserted before punctuation
        text_with_sep = re.sub(r"\s+([,.;:!?])", r"\1", text_with_sep).strip()
        if "h2c" in classes and text_with_sep.isdigit():
            number = int(text_with_sep)
        # Chapters 1-16 use h2c1; chapters 17+ use h2-c
        elif ("h2c1" in classes or "h2-c" in classes) and not title:
            title = text_with_sep
    if number is None:
        # Fallback: try id on chapter header (ch1 → 1)
        for h2 in h2s:
            id_ = h2.get("id", "")
            m = re.match(r"ch(\d+)", id_)
            if m:
                number = int(m.group(1))
                break
    chapter_id = path.stem  # ch1, ch2, ...
    # Walk section content in document order
    section = body.find("section") or body
    blocks: list = []
    # State: the most recently opened exercise block. While set, subsequent
    # paragraphs are routed into it (instruction, prompts, images, word
    # banks) instead of becoming standalone blocks.
    pending_instruction = None  # holds italic paragraph following an exercise header
    for el in section.descendants:
        # descendants yields text nodes too; only Tags have a name.
        if el.name is None:
            continue
        classes = el.get("class") or []
        # Skip nested tags already captured via parent processing
        # We operate only on direct h2/h3/h4/h5/p elements
        if el.name not in ("h2", "h3", "h4", "h5", "p"):
            continue
        # Exercise header detection (h3 in ch1-16, h4 in ch17+)
        if is_exercise_header(el):
            a = el.find("a", href=True)
            href = a["href"] if a else ""
            m = EXERCISE_ID_RE.search(el.get_text())
            ex_id = m.group(1) if m else ""
            anchor_m = ANS_REF_RE.search(href)
            ans_anchor = anchor_m.group(0) if anchor_m else ""
            blocks.append({
                "kind": "exercise",
                "id": ex_id,
                "ans_anchor": ans_anchor,
                "instruction": "",
                "image_refs": [],
                "prompts": []
            })
            # Open the pending-exercise state; following <p>s attach here.
            pending_instruction = blocks[-1]
            continue
        # Key Vocabulary header — also closes any pending exercise.
        if is_key_vocab_header(el):
            blocks.append({"kind": "key_vocab_header", "title": "Key Vocabulary"})
            pending_instruction = None
            continue
        # Other headings
        if el.name in ("h2", "h3", "h4", "h5"):
            if el.name == "h2":
                # Skip the chapter-number/chapter-title h2s we already captured
                continue
            txt = clean_text(el)
            if txt:
                blocks.append({
                    "kind": "heading",
                    "level": int(el.name[1]),
                    "text": txt,
                })
            # Any heading ends the pending-exercise state.
            pending_instruction = None
            continue
        # Paragraphs
        if el.name == "p":
            imgs = extract_image_srcs(el)
            text = clean_text(el)
            p_classes = set(classes)
            # Skip pure blank-line class ("nump" = underscore lines under number prompts)
            if p_classes & {"nump", "numpa"} and not text:
                continue
            # Exercise prompt: <p class="number">1. Prompt text</p>
            # Also number1, number2 (continuation numbering), numbera, numbert
            if pending_instruction is not None and p_classes & {"number", "number1", "number2", "numbera", "numbert"}:
                if text:
                    pending_instruction["prompts"].append(text)
                continue
            # Image container for a pending exercise
            if pending_instruction is not None and imgs and not text:
                pending_instruction["image_refs"].extend(imgs)
                continue
            # Instruction line right after the exercise header
            if pending_instruction is not None and text and not imgs and not pending_instruction["instruction"]:
                pending_instruction["instruction"] = text
                continue
            # While in pending-exercise state, extra text paragraphs are word
            # banks / context ("from the following list:" etc) — keep pending alive.
            if pending_instruction is not None and text and not imgs:
                pending_instruction.setdefault("extra", []).append(text)
                continue
            # Paragraphs that contain an image belong to vocab/key-vocab callouts
            if imgs and not text:
                for src in imgs:
                    blocks.append({"kind": "vocab_image", "src": src})
                continue
            # Mixed paragraph: image with caption
            if imgs and text:
                for src in imgs:
                    blocks.append({"kind": "vocab_image", "src": src})
                blocks.append({"kind": "paragraph", "text": text})
                continue
            # Plain paragraph — outside any exercise
            if text:
                blocks.append({"kind": "paragraph", "text": text})
    return {
        "id": chapter_id,
        "number": number,
        "title": title,
        "blocks": blocks,
    }
def assign_parts(chapters: list, part_files: "dict[int, list[int]]") -> None:
    """Annotate each chapter dict with its part number (in place).

    Chapters whose number appears in *part_files* get that part; the rest
    get "part": None unless they already carry a "part" key.
    """
    # Invert part -> [chapter numbers] into chapter number -> part.
    part_of: dict = {}
    for part_num, chapter_nums in part_files.items():
        for num in chapter_nums:
            part_of[num] = part_num
    for ch in chapters:
        if ch["number"] in part_of:
            ch["part"] = part_of[ch["number"]]
        else:
            ch.setdefault("part", None)
def _memberships_from_hrefs(hrefs: "list[str]") -> "dict[int, list[int]]":
    """Group chN.xhtml numbers under the most recent partN.xhtml in *hrefs*."""
    memberships: dict = {}
    current_part: "int | None" = None
    for href in hrefs:
        m_part = re.match(r"part(\d+)\.xhtml", href)
        m_ch = re.match(r"ch(\d+)\.xhtml", href)
        if m_part:
            current_part = int(m_part.group(1))
            memberships.setdefault(current_part, [])
        elif m_ch and current_part is not None:
            memberships[current_part].append(int(m_ch.group(1)))
    return memberships


def read_part_memberships() -> "dict[int, list[int]]":
    """Derive part→chapter grouping from the OPF package document.

    Prefers <spine> itemref order (the authoritative reading order); falls
    back to manifest <item> order only when no spine is present. Returns {}
    when no .opf file exists under ROOT.

    NOTE(review): the original computed the manifest-order grouping first
    and then discarded it whenever a spine existed; the grouping logic is
    now shared and run once on whichever href sequence applies.
    """
    opf = next(ROOT.glob("*.opf"), None)
    if opf is None:
        return {}
    soup = BeautifulSoup(opf.read_text(encoding="utf-8"), "xml")
    spine = soup.find("spine")
    if spine is not None:
        # Resolve each itemref's idref to its manifest item's href so the
        # sequence reflects true reading order.
        hrefs = []
        for ref in spine.find_all("itemref"):
            idref = ref.get("idref")
            item = soup.find("item", attrs={"id": idref})
            if item is not None:
                hrefs.append(item.get("href", ""))
    else:
        # Manifest order tends to match spine order for this book.
        hrefs = [item.get("href", "") for item in soup.find_all("item")]
    return _memberships_from_hrefs(hrefs)
def main() -> None:
    """Parse every chapter file, write chapters.json, and print a summary."""
    # Keep only files whose stem is actually "ch<digits>" — a stray
    # ch*-prefixed file (e.g. "chapters.xhtml") would previously crash the
    # sort key with AttributeError on the failed re.match.
    chapter_files = sorted(
        (p for p in ROOT.glob("ch*.xhtml") if re.match(r"ch(\d+)", p.stem)),
        key=lambda p: int(re.match(r"ch(\d+)", p.stem).group(1))
    )
    chapters = []
    for path in chapter_files:
        ch = parse_chapter(path)
        if ch:
            chapters.append(ch)
    part_memberships = read_part_memberships()
    assign_parts(chapters, part_memberships)
    out = {
        "chapters": chapters,
        "part_memberships": part_memberships,
    }
    OUT.write_text(json.dumps(out, ensure_ascii=False, indent=2))

    # Summary
    def count(pred) -> int:
        # Number of blocks across all chapters satisfying *pred*.
        return sum(1 for ch in chapters for b in ch["blocks"] if pred(b))

    ex_total = count(lambda b: b["kind"] == "exercise")
    ex_with_prompts = count(lambda b: b["kind"] == "exercise" and b["prompts"])
    ex_with_images = count(lambda b: b["kind"] == "exercise" and b["image_refs"])
    ex_empty = count(
        lambda b: b["kind"] == "exercise" and not b["prompts"] and not b["image_refs"]
    )
    para_total = count(lambda b: b["kind"] == "paragraph")
    vocab_img_total = count(lambda b: b["kind"] == "vocab_image")
    print(f"Chapters: {len(chapters)}")
    print(f"Exercises total: {ex_total}")
    print(f" with text prompts: {ex_with_prompts}")
    print(f" with image prompts: {ex_with_images}")
    print(f" empty: {ex_empty}")
    print(f"Paragraphs: {para_total}")
    print(f"Vocab images: {vocab_img_total}")
    print(f"Parts: {part_memberships}")
    print(f"Wrote {OUT}")


if __name__ == "__main__":
    main()