Major changes: - Textbook UI: chapter list, reader, and interactive exercise view (keyboard + Apple Pencil) surfaced under the Course tab. 30 chapters, 251 exercises. - Stem-change conjugation toggle on Week 4 flashcard decks (E-IE, E-I, O-UE). Uses existing VerbForm + IrregularSpan data to render highlighted present tense conjugations inline. - Deterministic on-device answer grader with partial credit (correct / close for accent-stripped or single-char-typo / wrong). 11 unit tests cover it. - SharedModels: TextbookChapter (local), TextbookExerciseAttempt (cloud-synced), AnswerGrader helpers. Bumped schema. - DataLoader: textbook seeder (version 8) + refresh helpers that preserve LanGo course decks when textbook data is re-seeded. - Local extraction pipeline in Conjuga/Scripts/textbook/ — XHTML chapter parser, answer-key parser, macOS Vision image OCR + PDF page OCR, merger, NSSpellChecker validator, language-aware auto-fixer, and repair pass that re-pairs quarantined vocab rows using bounding-box coordinates. - UI test target (ConjugaUITests) with three tests: end-to-end textbook flow, all-chapters screenshot audit, and stem-change toggle verification. Generated textbook content (textbook_data.json, textbook_vocab.json) and third-party source files are gitignored — re-run Scripts/textbook/run_pipeline.sh locally to regenerate. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
370 lines · 13 KiB · Python
#!/usr/bin/env python3
"""Parse all chapter XHTMLs + appendix into structured chapters.json.

Output schema:

{
  "chapters": [
    {
      "id": "ch1",
      "number": 1,
      "title": "Nouns, Articles, and Adjectives",
      "part": 1,                  # part 1/2/3 or null
      "blocks": [                 # ordered content
        {"kind": "heading", "level": 3, "text": "..."},
        {"kind": "paragraph", "text": "...", "hasItalic": false},
        {"kind": "key_vocab_header", "title": "Los colores (The colors)"},
        {"kind": "vocab_image", "src": "f0010-03.jpg"},
        {
          "kind": "exercise",
          "id": "1.1",
          "ans_anchor": "ch1ans1",
          "instruction": "Write the appropriate...",
          "image_refs": ["f0005-02.jpg"]
        },
        {"kind": "image", "src": "...", "alt": "..."}
      ]
    }
  ]
}
"""
|
||
|
||
import json
import re
from pathlib import Path

from bs4 import BeautifulSoup


# Extracted EPUB content root: this script lives three levels below the
# project directory that contains epub_extract/OEBPS.
ROOT = Path(__file__).resolve().parents[3] / "epub_extract" / "OEBPS"
# Output file, written next to this script.
OUT = Path(__file__).resolve().parent / "chapters.json"

# Common icon images embedded in headings — ignore when collecting content images
ICON_IMAGES = {"Common01.jpg", "Common02.jpg", "Common03.jpg", "Common04.jpg", "Common05.jpg"}

# "Exercise 1.2" in a heading → captures the dotted exercise id ("1.2").
EXERCISE_ID_RE = re.compile(r"Exercise\s+([0-9]+\.[0-9]+)")
# Answer anchor inside an ans.xhtml href, e.g. "ch1ans3".
ANS_REF_RE = re.compile(r"ch(\d+)ans(\d+)")
|
||
|
||
|
||
def clean_text(el) -> str:
    """Return the plain text of *el* with normalized whitespace/punctuation.

    All inline emphasis (<em>/<i>/<strong>/<b>) is dropped entirely: the
    source has nested/sibling emphasis patterns that CommonMark can't
    reliably parse, causing markers to leak into the UI. Plain text renders
    cleanly everywhere.

    Returns "" for a None element.
    """
    if el is None:
        return ""
    soup = BeautifulSoup(str(el), "lxml")
    # Unwrap ALL inline emphasis in a single pass. (No pre-flattening of
    # nested <strong><em>…</em></strong> combinations is needed: unwrapping
    # every emphasis tag flattens them regardless of nesting order.)
    for tag in soup.find_all(["em", "i", "strong", "b"]):
        tag.unwrap()
    # Drop pagebreak spans
    for tag in soup.find_all("span", attrs={"epub:type": "pagebreak"}):
        tag.decompose()
    # Replace <br/> with newline
    for br in soup.find_all("br"):
        br.replace_with("\n")
    # Use a separator so adjacent inline tags don't concatenate without spaces
    # (e.g. "<strong><em>Ir</em></strong> and" would otherwise become "Irand").
    text = soup.get_text(separator=" ", strip=False)
    # Collapse runs of whitespace first.
    text = re.sub(r"\s+", " ", text).strip()
    # Strip any stray asterisks that sneak through (e.g. author's literal *).
    text = text.replace("*", "")
    # De-space punctuation
    text = re.sub(r"\s+([,.;:!?])", r"\1", text)
    # Tighten brackets that picked up separator-spaces: "( foo )" -> "(foo)"
    text = re.sub(r"([(\[])\s+", r"\1", text)
    text = re.sub(r"\s+([)\]])", r"\1", text)
    # Collapse any double-spaces
    text = re.sub(r" +", " ", text).strip()
    return text
|
||
|
||
|
||
def is_exercise_header(h) -> bool:
    """Heading with an <a href='ans.xhtml#...'>Exercise N.N</a> link.

    Chapters 1-16 use h3.h3k; chapters 17+ use h4.h4.
    """
    if h.name not in ("h3", "h4"):
        return False
    anchor = h.find("a", href=True)
    return anchor is not None and "ans.xhtml" in anchor["href"]
|
||
|
||
|
||
def is_key_vocab_header(h) -> bool:
    """Heading with 'Key Vocabulary' text (no anchor link to answers)."""
    if h.name not in ("h3", "h4"):
        return False
    if "Key Vocabulary" not in h.get_text(strip=True):
        return False
    # Exercise headings also mention vocabulary sometimes; the presence of
    # an answer-key link is what distinguishes them.
    return h.find("a", href=lambda v: v and "ans.xhtml" in v) is None
|
||
|
||
|
||
def extract_image_srcs(parent) -> list:
    """Return list of image src attributes, skipping icon images."""
    candidates = (img.get("src", "") for img in parent.find_all("img"))
    return [s for s in candidates if s and Path(s).name not in ICON_IMAGES]
|
||
|
||
|
||
def parse_chapter(path: Path) -> "dict | None":
    """Parse one chapter file into structured blocks.

    Returns a dict with "id" (file stem, e.g. "ch1"), "number", "title",
    and an ordered "blocks" list (see module docstring for block kinds),
    or None when the file has no <body>.
    """
    html = path.read_text(encoding="utf-8")
    soup = BeautifulSoup(html, "lxml")
    body = soup.find("body")
    if body is None:
        return None

    # Chapter number + title
    number = None
    title = ""
    h2s = body.find_all("h2")
    for h2 in h2s:
        classes = h2.get("class") or []
        # Use a separator so consecutive inline tags don't concatenate
        # (e.g. "<strong><em>Ir</em></strong> and the Future" → "Ir and the Future")
        text_with_sep = re.sub(r"\s+", " ", h2.get_text(" ", strip=True))
        # Strip spaces that were inserted before punctuation
        text_with_sep = re.sub(r"\s+([,.;:!?])", r"\1", text_with_sep).strip()
        if "h2c" in classes and text_with_sep.isdigit():
            number = int(text_with_sep)
        # Chapters 1–16 use h2c1; chapters 17+ use h2-c
        elif ("h2c1" in classes or "h2-c" in classes) and not title:
            title = text_with_sep
    if number is None:
        # Try id on chapter header (ch1 → 1)
        for h2 in h2s:
            id_ = h2.get("id", "")
            m = re.match(r"ch(\d+)", id_)
            if m:
                number = int(m.group(1))
                break

    chapter_id = path.stem  # ch1, ch2, ...

    # Walk section content in document order
    section = body.find("section") or body
    blocks: list = []
    pending_instruction = None  # holds italic paragraph following an exercise header

    for el in section.descendants:
        if el.name is None:
            # Text node (NavigableString) — only tags carry structure.
            continue

        classes = el.get("class") or []

        # Skip nested tags already captured via parent processing
        # We operate only on direct h2/h3/h4/h5/p elements
        if el.name not in ("h2", "h3", "h4", "h5", "p"):
            continue

        # Exercise header detection (h3 in ch1-16, h4 in ch17+)
        if is_exercise_header(el):
            a = el.find("a", href=True)
            href = a["href"] if a else ""
            m = EXERCISE_ID_RE.search(el.get_text())
            ex_id = m.group(1) if m else ""
            anchor_m = ANS_REF_RE.search(href)
            ans_anchor = anchor_m.group(0) if anchor_m else ""
            blocks.append({
                "kind": "exercise",
                "id": ex_id,
                "ans_anchor": ans_anchor,
                "instruction": "",
                "image_refs": [],
                "prompts": []
            })
            # Following paragraphs attach to this exercise until the next
            # heading resets the pending state.
            pending_instruction = blocks[-1]
            continue

        # Key Vocabulary header
        if is_key_vocab_header(el):
            blocks.append({"kind": "key_vocab_header", "title": "Key Vocabulary"})
            pending_instruction = None
            continue

        # Other headings
        if el.name in ("h2", "h3", "h4", "h5"):
            if el.name == "h2":
                # Skip the chapter-number/chapter-title h2s we already captured
                continue
            txt = clean_text(el)
            if txt:
                blocks.append({
                    "kind": "heading",
                    "level": int(el.name[1]),
                    "text": txt,
                })
            pending_instruction = None
            continue

        # Paragraphs — classified in priority order; branch order matters.
        if el.name == "p":
            imgs = extract_image_srcs(el)
            text = clean_text(el)
            p_classes = set(classes)

            # Skip pure blank-line class ("nump" = underscore lines under number prompts)
            if p_classes & {"nump", "numpa"} and not text:
                continue

            # Exercise prompt: <p class="number">1. Prompt text</p>
            # Also number1, number2 (continuation numbering), numbera, numbert
            if pending_instruction is not None and p_classes & {"number", "number1", "number2", "numbera", "numbert"}:
                if text:
                    pending_instruction["prompts"].append(text)
                continue

            # Image container for a pending exercise
            if pending_instruction is not None and imgs and not text:
                pending_instruction["image_refs"].extend(imgs)
                continue

            # Instruction line right after the exercise header
            if pending_instruction is not None and text and not imgs and not pending_instruction["instruction"]:
                pending_instruction["instruction"] = text
                continue

            # While in pending-exercise state, extra text paragraphs are word
            # banks / context ("from the following list:" etc) — keep pending alive.
            if pending_instruction is not None and text and not imgs:
                pending_instruction.setdefault("extra", []).append(text)
                continue

            # Paragraphs that contain an image belong to vocab/key-vocab callouts
            if imgs and not text:
                for src in imgs:
                    blocks.append({"kind": "vocab_image", "src": src})
                continue

            # Mixed paragraph: image with caption
            if imgs and text:
                for src in imgs:
                    blocks.append({"kind": "vocab_image", "src": src})
                blocks.append({"kind": "paragraph", "text": text})
                continue

            # Plain paragraph — outside any exercise
            if text:
                blocks.append({"kind": "paragraph", "text": text})

    return {
        "id": chapter_id,
        "number": number,
        "title": title,
        "blocks": blocks,
    }
|
||
|
||
|
||
def assign_parts(chapters: list, part_files: "dict[int, list[int]]") -> None:
    """Annotate chapters in place with a "part" number based on TOC membership.

    Chapters whose number appears in *part_files* get that part (when a
    number is listed under several parts, the last part wins, matching dict
    iteration order); all other chapters get part None unless already set.
    """
    # Invert part→[chapter numbers] into a chapter-number→part lookup once,
    # instead of rescanning every chapter per part (O(parts × chapters)).
    part_by_number = {
        num: part_num
        for part_num, chapter_nums in part_files.items()
        for num in chapter_nums
    }
    for ch in chapters:
        if ch["number"] in part_by_number:
            ch["part"] = part_by_number[ch["number"]]
        else:
            ch.setdefault("part", None)
|
||
|
||
|
||
def read_part_memberships() -> "dict[int, list[int]]":
    """Derive part→chapter grouping from the OPF.

    Prefers the <spine> reading order when present; falls back to manifest
    <item> order otherwise. Returns {} when no .opf file is found under ROOT.
    """
    opf = next(ROOT.glob("*.opf"), None)
    if opf is None:
        return {}
    soup = BeautifulSoup(opf.read_text(encoding="utf-8"), "xml")

    # Manifest order tends to match spine order for this book, but the spine
    # is authoritative — use it when it exists.
    hrefs = [item.get("href", "") for item in soup.find_all("item")]
    spine = soup.find("spine")
    if spine is not None:
        order = []
        for ref in spine.find_all("itemref"):
            idref = ref.get("idref")
            item = soup.find("item", attrs={"id": idref})
            if item is not None:
                order.append(item.get("href", ""))
        hrefs = order

    return _group_chapters_by_part(hrefs)


def _group_chapters_by_part(hrefs: "list[str]") -> "dict[int, list[int]]":
    """Group chN.xhtml chapter numbers under the most recent partN.xhtml."""
    memberships: dict = {}
    current_part: "int | None" = None
    for href in hrefs:
        m_part = re.match(r"part(\d+)\.xhtml", href)
        m_ch = re.match(r"ch(\d+)\.xhtml", href)
        if m_part:
            current_part = int(m_part.group(1))
            memberships.setdefault(current_part, [])
        elif m_ch and current_part is not None:
            # Chapters before the first part divider are ignored.
            memberships[current_part].append(int(m_ch.group(1)))
    return memberships
|
||
|
||
|
||
def _count_blocks(chapters: list, pred) -> int:
    """Count blocks across all chapters for which *pred* returns true."""
    return sum(1 for ch in chapters for b in ch["blocks"] if pred(b))


def main() -> None:
    """Parse all chapter files, assign parts, write chapters.json, and print a summary."""
    # Only accept stems that are literally "ch<digits>": the glob pattern
    # "ch*.xhtml" can also match unrelated files, and the old sort key
    # (re.match(...).group(1)) would crash with AttributeError on them.
    numbered = []
    for p in ROOT.glob("ch*.xhtml"):
        m = re.fullmatch(r"ch(\d+)", p.stem)
        if m:
            numbered.append((int(m.group(1)), p))
    chapter_files = [p for _, p in sorted(numbered)]

    chapters = []
    for path in chapter_files:
        ch = parse_chapter(path)
        if ch:
            chapters.append(ch)

    part_memberships = read_part_memberships()
    assign_parts(chapters, part_memberships)

    out = {
        "chapters": chapters,
        "part_memberships": part_memberships,
    }
    OUT.write_text(json.dumps(out, ensure_ascii=False, indent=2))

    # Summary — quick sanity check on the extraction run.
    ex_total = _count_blocks(chapters, lambda b: b["kind"] == "exercise")
    ex_with_prompts = _count_blocks(
        chapters, lambda b: b["kind"] == "exercise" and b["prompts"]
    )
    ex_with_images = _count_blocks(
        chapters, lambda b: b["kind"] == "exercise" and b["image_refs"]
    )
    ex_empty = _count_blocks(
        chapters,
        lambda b: b["kind"] == "exercise" and not b["prompts"] and not b["image_refs"],
    )
    para_total = _count_blocks(chapters, lambda b: b["kind"] == "paragraph")
    vocab_img_total = _count_blocks(chapters, lambda b: b["kind"] == "vocab_image")
    print(f"Chapters: {len(chapters)}")
    print(f"Exercises total: {ex_total}")
    print(f" with text prompts: {ex_with_prompts}")
    print(f" with image prompts: {ex_with_images}")
    print(f" empty: {ex_empty}")
    print(f"Paragraphs: {para_total}")
    print(f"Vocab images: {vocab_img_total}")
    print(f"Parts: {part_memberships}")
    print(f"Wrote {OUT}")


if __name__ == "__main__":
    main()
|